diff --git a/autotest/StreamUTF8Test/StreamUTF8Test.cpp b/autotest/StreamUTF8Test/StreamUTF8Test.cpp new file mode 100644 index 000000000..c7612e0de --- /dev/null +++ b/autotest/StreamUTF8Test/StreamUTF8Test.cpp @@ -0,0 +1,38 @@ +#include + +using namespace Upp; + +void ValidateUtf8(const String& src, int begin, int end, int pos) +{ + // Checks for malformed, imcomplete and overlong UTF-8 sequences. + // Replaces each malformed/illegal byte with the replacement character. (Recommended method) + // The length of the original line and decoded/replated line MUST be equal for these sequences. + + String dest; + StringStream ss(src); + bool check_utf8 = pos >= begin && pos < end; + while(!ss.IsEof()) { + int c = check_utf8 ? ss.GetUtf8() : ss.Get(); + dest.Cat(c < 0 ? 0xFFFD : c); + } + DLOG(dest); + if(check_utf8) + ASSERT(dest.GetLength() == src.GetLength()); +} + +CONSOLE_APP_MAIN +{ + // This autotest uses Marcus Kuhn's UTf-8 stress test text. + // StdLogSetup(LOG_COUT); + String text = LoadDataFile("utf8_stress_test.txt"); + ASSERT(!IsNull(text)); + StringStream ss(text); + int pos = 0; + int begin = text.Find("3 Malformed sequences"); + int end = text.Find("5.3 Noncharacter code positions"); + ASSERT(begin >= 0 && begin < end); + while(!ss.IsEof()) { + String line = ss.GetLine(); + ValidateUtf8(line, begin, end, pos++); + } +} diff --git a/autotest/StreamUTF8Test/StreamUTF8Test.upp b/autotest/StreamUTF8Test/StreamUTF8Test.upp new file mode 100644 index 000000000..ec803701a --- /dev/null +++ b/autotest/StreamUTF8Test/StreamUTF8Test.upp @@ -0,0 +1,11 @@ +description "Check's Stream's Utf8 decoder for incomplete, invalid and overlong sequences\377"; + +uses + Core; + +file + StreamUTF8Test.cpp; + +mainconfig + "" = ""; + diff --git a/autotest/StreamUTF8Test/utf8_stress_test.txt b/autotest/StreamUTF8Test/utf8_stress_test.txt new file mode 100644 index 000000000..a5b5d50e6 Binary files /dev/null and b/autotest/StreamUTF8Test/utf8_stress_test.txt differ diff --git a/uppsrc/Core/Stream.cpp b/uppsrc/Core/Stream.cpp index 53c8642e7..51f0274bf 100644 --- a/uppsrc/Core/Stream.cpp +++ b/uppsrc/Core/Stream.cpp @@ -224,76 +224,58 @@ int64 Stream::_Get64() { int Stream::GetUtf8() { int code = Get(); + if(code <= 0) { LoadError(); return -1; } + if(code < 0x80) return code; - else - if(code < 0xC2) - return -1; - else - if(code < 0xE0) { - if(IsEof()) { - LoadError(); - return -1; + + if(code >= 0xC2) { + int c = 0, pos = GetPos(); + if(code < 0xE0) { + int c0 = Get(); + if(c0 >= 0x80 && c0 < 0xC0 && + (c = ((code - 0xC0) << 6) + c0 - 0x80) >= 0x80 && c < 0x800) { + return c; + } + if(c0 < 0) + LoadError(); } - return ((code - 0xC0) << 6) + Get() - 0x80; + else + if(code < 0xF0) { + int c0 = Get(); + int c1 = Get(); + if(c1 >= 0x80 && c1 < 0xC0 && + c0 >= 0x80 && c0 < 0xC0 && + (c = ((code - 0xE0) << 12) + ((c0 - 0x80) << 6) + c1 - 0x80) >= 0x800 && c < 0x10000) { + return c; + } + if(c1 < 0) + LoadError(); + + } + else + if(code < 0xF8) { + int c0 = Get(); + int c1 = Get(); + int c2 = Get(); + if(c2 >= 0x80 && c2 < 0xC0 && + c1 >= 0x80 && c1 < 0xC0 && + c0 >= 0x80 && c0 < 0xC0 && + (c = ((code - 0xF0) << 18) + ((c0 - 0x80) << 12) + ((c1 - 0x80) << 6) + c2 - 0x80) >= 0x10000 && c < 0x110000) { + return c; + } + if(c2 < 0) + LoadError(); + } + if(!IsError()) + Seek(pos); // Rewind (to represent each invalid byte). } - else - if(code < 0xF0) { - int c0 = Get(); - int c1 = Get(); - if(c1 < 0) { - LoadError(); - return -1; - } - return ((code - 0xE0) << 12) + ((c0 - 0x80) << 6) + c1 - 0x80; - } - else - if(code < 0xF8) { - int c0 = Get(); - int c1 = Get(); - int c2 = Get(); - if(c2 < 0) { - LoadError(); - return -1; - } - return ((code - 0xf0) << 18) + ((c0 - 0x80) << 12) + ((c1 - 0x80) << 6) + c2 - 0x80; - } - else - if(code < 0xFC) { - int c0 = Get(); - int c1 = Get(); - int c2 = Get(); - int c3 = Get(); - if(c3 < 0) { - LoadError(); - return -1; - } - return ((code - 0xF8) << 24) + ((c0 - 0x80) << 18) + ((c1 - 0x80) << 12) + - ((c2 - 0x80) << 6) + c3 - 0x80; - } - else - if(code < 0xFE) { - int c0 = Get(); - int c1 = Get(); - int c2 = Get(); - int c3 = Get(); - int c4 = Get(); - if(c4 < 0) { - LoadError(); - return -1; - } - return ((code - 0xFC) << 30) + ((c0 - 0x80) << 24) + ((c1 - 0x80) << 18) + - ((c2 - 0x80) << 12) + ((c3 - 0x80) << 6) + c4 - 0x80; - } - else { - LoadError(); - return -1; - } + return -1; } String Stream::GetLine() {