Core/Stream: GetUtf8() method now returns failure on overlong and invalid sequences. (#99)

Core: Stream: GetUtf8() method now returns failure on overlong sequences.
This commit is contained in:
İsmail Yılmaz 2022-10-03 11:57:12 +03:00 committed by GitHub
parent d2d036540b
commit 5e4dc5ae9f
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 92 additions and 61 deletions

View file

@ -0,0 +1,38 @@
#include <Core/Core.h>
using namespace Upp;
void ValidateUtf8(const String& src, int begin, int end, int pos)
{
// Checks for malformed, imcomplete and overlong UTF-8 sequences.
// Replaces each malformed/illegal byte with the replacement character. (Recommended method)
// The length of the original line and decoded/replated line MUST be equal for these sequences.
String dest;
StringStream ss(src);
bool check_utf8 = pos >= begin && pos < end;
while(!ss.IsEof()) {
int c = check_utf8 ? ss.GetUtf8() : ss.Get();
dest.Cat(c < 0 ? 0xFFFD : c);
}
DLOG(dest);
if(check_utf8)
ASSERT(dest.GetLength() == src.GetLength());
}
// Autotest entry point: feeds Markus Kuhn's UTF-8 stress test text to ValidateUtf8()
// line by line. Lines that start between the "3 Malformed sequences" heading and the
// "5.3 Noncharacter code positions" heading are the ones whose decoding is verified.
CONSOLE_APP_MAIN
{
	// StdLogSetup(LOG_COUT);

	String text = LoadDataFile("utf8_stress_test.txt");
	ASSERT(!IsNull(text));

	// Both bounds are BYTE offsets into the text, as returned by String::Find().
	int begin = text.Find("3 Malformed sequences");
	int end = text.Find("5.3 Noncharacter code positions");
	ASSERT(begin >= 0 && begin < end);

	StringStream ss(text);
	while(!ss.IsEof()) {
		// Capture the byte offset of the line start BEFORE reading the line, so that
		// it is comparable with begin/end. (A running line counter would be compared
		// against byte offsets and would practically never fall inside the range,
		// leaving the test without any effective check.)
		int pos = (int)ss.GetPos();
		String line = ss.GetLine();
		ValidateUtf8(line, begin, end, pos);
	}
}

View file

@ -0,0 +1,11 @@
description "Checks Stream's Utf8 decoder for incomplete, invalid and overlong sequences\377";
uses
Core;
file
StreamUTF8Test.cpp;
mainconfig
"" = "";

Binary file not shown.

View file

@ -224,76 +224,58 @@ int64 Stream::_Get64() {
int Stream::GetUtf8()
{
int code = Get();
if(code <= 0) {
LoadError();
return -1;
}
if(code < 0x80)
return code;
else
if(code < 0xC2)
return -1;
else
if(code < 0xE0) {
if(IsEof()) {
LoadError();
return -1;
if(code >= 0xC2) {
int c = 0, pos = GetPos();
if(code < 0xE0) {
int c0 = Get();
if(c0 >= 0x80 && c0 < 0xC0 &&
(c = ((code - 0xC0) << 6) + c0 - 0x80) >= 0x80 && c < 0x800) {
return c;
}
if(c0 < 0)
LoadError();
}
return ((code - 0xC0) << 6) + Get() - 0x80;
else
if(code < 0xF0) {
int c0 = Get();
int c1 = Get();
if(c1 >= 0x80 && c1 < 0xC0 &&
c0 >= 0x80 && c0 < 0xC0 &&
(c = ((code - 0xE0) << 12) + ((c0 - 0x80) << 6) + c1 - 0x80) >= 0x800 && c < 0x10000) {
return c;
}
if(c1 < 0)
LoadError();
}
else
if(code < 0xF8) {
int c0 = Get();
int c1 = Get();
int c2 = Get();
if(c2 >= 0x80 && c2 < 0xC0 &&
c1 >= 0x80 && c1 < 0xC0 &&
c0 >= 0x80 && c0 < 0xC0 &&
(c = ((code - 0xF0) << 18) + ((c0 - 0x80) << 12) + ((c1 - 0x80) << 6) + c2 - 0x80) >= 0x10000 && c < 0x110000) {
return c;
}
if(c2 < 0)
LoadError();
}
if(!IsError())
Seek(pos); // Rewind (to represent each invalid byte).
}
else
if(code < 0xF0) {
int c0 = Get();
int c1 = Get();
if(c1 < 0) {
LoadError();
return -1;
}
return ((code - 0xE0) << 12) + ((c0 - 0x80) << 6) + c1 - 0x80;
}
else
if(code < 0xF8) {
int c0 = Get();
int c1 = Get();
int c2 = Get();
if(c2 < 0) {
LoadError();
return -1;
}
return ((code - 0xf0) << 18) + ((c0 - 0x80) << 12) + ((c1 - 0x80) << 6) + c2 - 0x80;
}
else
if(code < 0xFC) {
int c0 = Get();
int c1 = Get();
int c2 = Get();
int c3 = Get();
if(c3 < 0) {
LoadError();
return -1;
}
return ((code - 0xF8) << 24) + ((c0 - 0x80) << 18) + ((c1 - 0x80) << 12) +
((c2 - 0x80) << 6) + c3 - 0x80;
}
else
if(code < 0xFE) {
int c0 = Get();
int c1 = Get();
int c2 = Get();
int c3 = Get();
int c4 = Get();
if(c4 < 0) {
LoadError();
return -1;
}
return ((code - 0xFC) << 30) + ((c0 - 0x80) << 24) + ((c1 - 0x80) << 18) +
((c2 - 0x80) << 12) + ((c3 - 0x80) << 6) + c4 - 0x80;
}
else {
LoadError();
return -1;
}
return -1;
}
String Stream::GetLine() {