Core: Stream::GetUtf8 fixes

This commit is contained in:
Mirek Fidler 2022-10-03 11:46:27 +02:00
parent 5e4dc5ae9f
commit 6cd5e1aa28
2 changed files with 66 additions and 21 deletions

View file

@ -6,7 +6,7 @@ void ValidateUtf8(const String& src, int begin, int end, int pos)
{ {
// Checks for malformed, incomplete and overlong UTF-8 sequences. // Checks for malformed, incomplete and overlong UTF-8 sequences.
// Replaces each malformed/illegal byte with the replacement character. (Recommended method) // Replaces each malformed/illegal byte with the replacement character. (Recommended method)
// The length of the original line and decoded/replated line MUST be equal for these sequences. // The length of the original line and decoded/replaced line MUST be equal for these sequences.
String dest; String dest;
StringStream ss(src); StringStream ss(src);
@ -15,13 +15,67 @@ void ValidateUtf8(const String& src, int begin, int end, int pos)
int c = check_utf8 ? ss.GetUtf8() : ss.Get(); int c = check_utf8 ? ss.GetUtf8() : ss.Get();
dest.Cat(c < 0 ? 0xFFFD : c); dest.Cat(c < 0 ? 0xFFFD : c);
} }
DLOG(dest);
if(check_utf8) if(check_utf8)
ASSERT(dest.GetLength() == src.GetLength()); ASSERT(dest.GetLength() == src.GetLength());
} }
// Cross-checks Stream::GetUtf8 against CheckUtf8 for a single line of text.
//
// The line `l` is decoded through a StringStream, one GetUtf8() call at a time,
// until the stream reports EOF. Two properties are then asserted:
//   1. the stream's final IsOK() state agrees with CheckUtf8(l);
//   2. if the stream is still OK, re-encoding the decoded text with ToUtf8()
//      reproduces `l` exactly (round trip).
void CheckLine(const String& l)
{
StringStream ss(l);
WString q;
// Every GetUtf8() result is appended as-is, including any error return value;
// q is only compared below when ss.IsOK() holds.
while(!ss.IsEof())
q.Cat(ss.GetUtf8());
// On disagreement, dump the offending line and both verdicts so the log shows
// which input triggers the ASSERT that follows.
if(ss.IsOK() != CheckUtf8(l)) {
DDUMP(AsCString(l));
DDUMP(ss.IsOK());
DDUMP(CheckUtf8(l));
}
ASSERT(ss.IsOK() == CheckUtf8(l));
// Round-trip check only makes sense when decoding reported no error.
// NOTE(review): presumably a failed GetUtf8() is what clears IsOK() (via LoadError) - confirm.
if(ss.IsOK())
ASSERT(ToUtf8(q) == l);
}
CONSOLE_APP_MAIN CONSOLE_APP_MAIN
{ {
StdLogSetup(LOG_COUT|LOG_FILE);
FileIn in(GetDataFile("utf8_stress_test.txt"));
/* in case something does not work
char h[] = "2.1.5 5 bytes (U-00200000): \"øˆ€€€\" |";
String q(h, sizeof(h));
DDUMPHEX(q);
CheckUtf8(q);
CheckLine(q);
*/
while(!in.IsEof()) {
String l = in.GetLine();
CheckLine(l);
}
for(int pass = 0; pass < 2; pass++)
for(int i = 0; i < 0x110000; i++) {
if(i >= 0xee00 && i <= 0xeeff) // skip error escapes
continue;
WString q = "Test ";
q.Cat(i);
if(pass)
q.Cat(" ..");
String s = ToUtf8(q);
StringStream ss(s);
WString qq;
while(!ss.IsEof())
qq.Cat(ss.GetUtf8());
if(qq != q) {
DDUMPHEX(i);
DDUMPHEX(s);
DDUMPC(qq);
DDUMPC(q);
}
ASSERT(qq == q);
}
// This autotest uses Markus Kuhn's UTF-8 stress test text. // This autotest uses Markus Kuhn's UTF-8 stress test text.
// StdLogSetup(LOG_COUT); // StdLogSetup(LOG_COUT);
String text = LoadDataFile("utf8_stress_test.txt"); String text = LoadDataFile("utf8_stress_test.txt");
@ -35,4 +89,6 @@ CONSOLE_APP_MAIN
String line = ss.GetLine(); String line = ss.GetLine();
ValidateUtf8(line, begin, end, pos++); ValidateUtf8(line, begin, end, pos++);
} }
LOG("============= OK");
} }

View file

@ -225,7 +225,7 @@ int Stream::GetUtf8()
{ {
int code = Get(); int code = Get();
if(code <= 0) { if(code < 0) {
LoadError(); LoadError();
return -1; return -1;
} }
@ -238,11 +238,8 @@ int Stream::GetUtf8()
if(code < 0xE0) { if(code < 0xE0) {
int c0 = Get(); int c0 = Get();
if(c0 >= 0x80 && c0 < 0xC0 && if(c0 >= 0x80 && c0 < 0xC0 &&
(c = ((code - 0xC0) << 6) + c0 - 0x80) >= 0x80 && c < 0x800) { (c = ((code - 0xC0) << 6) + c0 - 0x80) >= 0x80 && c < 0x800)
return c; return c;
}
if(c0 < 0)
LoadError();
} }
else else
if(code < 0xF0) { if(code < 0xF0) {
@ -250,12 +247,8 @@ int Stream::GetUtf8()
int c1 = Get(); int c1 = Get();
if(c1 >= 0x80 && c1 < 0xC0 && if(c1 >= 0x80 && c1 < 0xC0 &&
c0 >= 0x80 && c0 < 0xC0 && c0 >= 0x80 && c0 < 0xC0 &&
(c = ((code - 0xE0) << 12) + ((c0 - 0x80) << 6) + c1 - 0x80) >= 0x800 && c < 0x10000) { (c = ((code - 0xE0) << 12) + ((c0 - 0x80) << 6) + c1 - 0x80) >= 0x800 && c < 0x10000)
return c; return c;
}
if(c1 < 0)
LoadError();
} }
else else
if(code < 0xF8) { if(code < 0xF8) {
@ -265,16 +258,12 @@ int Stream::GetUtf8()
if(c2 >= 0x80 && c2 < 0xC0 && if(c2 >= 0x80 && c2 < 0xC0 &&
c1 >= 0x80 && c1 < 0xC0 && c1 >= 0x80 && c1 < 0xC0 &&
c0 >= 0x80 && c0 < 0xC0 && c0 >= 0x80 && c0 < 0xC0 &&
(c = ((code - 0xF0) << 18) + ((c0 - 0x80) << 12) + ((c1 - 0x80) << 6) + c2 - 0x80) >= 0x10000 && c < 0x110000) { (c = ((code - 0xF0) << 18) + ((c0 - 0x80) << 12) + ((c1 - 0x80) << 6) + c2 - 0x80) >= 0x10000 && c < 0x110000)
return c; return c;
}
if(c2 < 0)
LoadError();
} }
if(!IsError())
Seek(pos); // Rewind (to represent each invalid byte).
} }
LoadError();
return -1; return -1;
} }