diff --git a/autotest/StreamUTF8Test/StreamUTF8Test.cpp b/autotest/StreamUTF8Test/StreamUTF8Test.cpp
index c7612e0de..edb12e673 100644
--- a/autotest/StreamUTF8Test/StreamUTF8Test.cpp
+++ b/autotest/StreamUTF8Test/StreamUTF8Test.cpp
@@ -6,7 +6,7 @@ void ValidateUtf8(const String& src, int begin, int end, int pos)
 {
 	// Checks for malformed, imcomplete and overlong UTF-8 sequences.
 	// Replaces each malformed/illegal byte with the replacement character. (Recommended method)
-	// The length of the original line and decoded/replated line MUST be equal for these sequences.
+	// The length of the original line and decoded/replaced line MUST be equal for these sequences.
 
 	String dest;
 	StringStream ss(src);
@@ -15,13 +15,67 @@ void ValidateUtf8(const String& src, int begin, int end, int pos)
 		int c = check_utf8 ? ss.GetUtf8() : ss.Get();
 		dest.Cat(c < 0 ? 0xFFFD : c);
 	}
-	DLOG(dest);
 	if(check_utf8)
 		ASSERT(dest.GetLength() == src.GetLength());
 }
 
+
+void CheckLine(const String& l)
+{
+	StringStream ss(l);
+	WString q;
+	while(!ss.IsEof())
+		q.Cat(ss.GetUtf8());
+	if(ss.IsOK() != CheckUtf8(l)) {
+		DDUMP(AsCString(l));
+		DDUMP(ss.IsOK());
+		DDUMP(CheckUtf8(l));
+	}
+	ASSERT(ss.IsOK() == CheckUtf8(l));
+	if(ss.IsOK())
+		ASSERT(ToUtf8(q) == l);
+}
+
 CONSOLE_APP_MAIN
 {
+	StdLogSetup(LOG_COUT|LOG_FILE);
+	FileIn in(GetDataFile("utf8_stress_test.txt"));
+
+/* in case something does not work
+	char h[] = "2.1.5 5 bytes (U-00200000): \"øˆ€€€\" |";
+	String q(h, sizeof(h));
+	DDUMPHEX(q);
+	CheckUtf8(q);
+	CheckLine(q);
+*/
+
+	while(!in.IsEof()) {
+		String l = in.GetLine();
+		CheckLine(l);
+	}
+
+	for(int pass = 0; pass < 2; pass++)
+		for(int i = 0; i < 0x110000; i++) {
+			if(i >= 0xee00 && i <= 0xeeff) // skip error escapes
+				continue;
+			WString q = "Test ";
+			q.Cat(i);
+			if(pass)
+				q.Cat(" ..");
+			String s = ToUtf8(q);
+			StringStream ss(s);
+			WString qq;
+			while(!ss.IsEof())
+				qq.Cat(ss.GetUtf8());
+			if(qq != q) {
+				DDUMPHEX(i);
+				DDUMPHEX(s);
+				DDUMPC(qq);
+				DDUMPC(q);
+			}
+			ASSERT(qq == q);
+		}
+
 	// This autotest uses Marcus Kuhn's UTf-8 stress test text.
 	// StdLogSetup(LOG_COUT);
 	String text = LoadDataFile("utf8_stress_test.txt");
@@ -35,4 +89,6 @@ CONSOLE_APP_MAIN
 		String line = ss.GetLine();
 		ValidateUtf8(line, begin, end, pos++);
 	}
+
+	LOG("============= OK");
 }
diff --git a/uppsrc/Core/Stream.cpp b/uppsrc/Core/Stream.cpp
index 51f0274bf..81501ac9d 100644
--- a/uppsrc/Core/Stream.cpp
+++ b/uppsrc/Core/Stream.cpp
@@ -225,7 +225,7 @@ int Stream::GetUtf8()
 {
 	int code = Get();
 
-	if(code <= 0) {
+	if(code < 0) {
 		LoadError();
 		return -1;
 	}
@@ -238,11 +238,8 @@ int Stream::GetUtf8()
 		if(code < 0xE0) {
 			int c0 = Get();
 			if(c0 >= 0x80 && c0 < 0xC0 &&
-			   (c = ((code - 0xC0) << 6) + c0 - 0x80) >= 0x80 && c < 0x800) {
-				return c;
-			}
-			if(c0 < 0)
-				LoadError();
+			   (c = ((code - 0xC0) << 6) + c0 - 0x80) >= 0x80 && c < 0x800)
+				return c;
 		}
 		else
 		if(code < 0xF0) {
@@ -250,12 +247,8 @@ int Stream::GetUtf8()
 			int c1 = Get();
 			if(c1 >= 0x80 && c1 < 0xC0 &&
 			   c0 >= 0x80 && c0 < 0xC0 &&
-			   (c = ((code - 0xE0) << 12) + ((c0 - 0x80) << 6) + c1 - 0x80) >= 0x800 && c < 0x10000) {
-				return c;
-			}
-			if(c1 < 0)
-				LoadError();
-
+			   (c = ((code - 0xE0) << 12) + ((c0 - 0x80) << 6) + c1 - 0x80) >= 0x800 && c < 0x10000)
+				return c;
 		}
 		else
 		if(code < 0xF8) {
@@ -265,16 +258,12 @@ int Stream::GetUtf8()
 			if(c2 >= 0x80 && c2 < 0xC0 &&
 			   c1 >= 0x80 && c1 < 0xC0 &&
 			   c0 >= 0x80 && c0 < 0xC0 &&
-			   (c = ((code - 0xF0) << 18) + ((c0 - 0x80) << 12) + ((c1 - 0x80) << 6) + c2 - 0x80) >= 0x10000 && c < 0x110000) {
-				return c;
-			}
-			if(c2 < 0)
-				LoadError();
+			   (c = ((code - 0xF0) << 18) + ((c0 - 0x80) << 12) + ((c1 - 0x80) << 6) + c2 - 0x80) >= 0x10000 && c < 0x110000)
+				return c;
 		}
-		if(!IsError())
-			Seek(pos); // Rewind (to represent each invalid byte).
 	}
 
+	LoadError();
 	return -1;
 }
 