mirror of
https://github.com/ultimatepp/ultimatepp.git
synced 2026-05-15 14:16:07 -06:00
Core/Stream: GetUtf8() method now returns failure on overlong and invalid sequences. (#99)
Core: Stream: GetUtf8() method now returns failure on overlong sequences.
This commit is contained in:
parent
d2d036540b
commit
5e4dc5ae9f
4 changed files with 92 additions and 61 deletions
38
autotest/StreamUTF8Test/StreamUTF8Test.cpp
Normal file
38
autotest/StreamUTF8Test/StreamUTF8Test.cpp
Normal file
|
|
@ -0,0 +1,38 @@
|
|||
#include <Core/Core.h>
|
||||
|
||||
using namespace Upp;
|
||||
|
||||
void ValidateUtf8(const String& src, int begin, int end, int pos)
|
||||
{
|
||||
// Checks for malformed, imcomplete and overlong UTF-8 sequences.
|
||||
// Replaces each malformed/illegal byte with the replacement character. (Recommended method)
|
||||
// The length of the original line and decoded/replated line MUST be equal for these sequences.
|
||||
|
||||
String dest;
|
||||
StringStream ss(src);
|
||||
bool check_utf8 = pos >= begin && pos < end;
|
||||
while(!ss.IsEof()) {
|
||||
int c = check_utf8 ? ss.GetUtf8() : ss.Get();
|
||||
dest.Cat(c < 0 ? 0xFFFD : c);
|
||||
}
|
||||
DLOG(dest);
|
||||
if(check_utf8)
|
||||
ASSERT(dest.GetLength() == src.GetLength());
|
||||
}
|
||||
|
||||
// Autotest entry point: feeds Markus Kuhn's UTF-8 decoder stress test through
// ValidateUtf8() one line at a time.
//
// begin and end are character offsets into the loaded text (results of
// text.Find()), and ValidateUtf8() compares the position it receives directly
// against them. The position passed for each line is therefore the stream
// offset at which that line starts, not a running line count: a line counter
// is in a different unit and would not select the intended sections.
//
// NOTE(review): with the window now actually applied, confirm that GetUtf8()
// rejects every sequence between the two headings (for example the UTF-16
// surrogates in section 5.1); otherwise the length assertion in
// ValidateUtf8() will fire.
CONSOLE_APP_MAIN
{
	// StdLogSetup(LOG_COUT);
	String text = LoadDataFile("utf8_stress_test.txt");
	ASSERT(!IsNull(text));

	// Offsets of the section headings that delimit the lines to be checked.
	int begin = text.Find("3 Malformed sequences");
	int end = text.Find("5.3 Noncharacter code positions");
	ASSERT(begin >= 0 && begin < end);

	StringStream ss(text);
	while(!ss.IsEof()) {
		// Capture the offset before GetLine() advances the stream, so that it
		// is in the same unit as begin/end.
		int pos = (int)ss.GetPos();
		String line = ss.GetLine();
		ValidateUtf8(line, begin, end, pos);
	}
}
|
||||
11
autotest/StreamUTF8Test/StreamUTF8Test.upp
Normal file
11
autotest/StreamUTF8Test/StreamUTF8Test.upp
Normal file
|
|
@ -0,0 +1,11 @@
|
|||
description "Checks Stream's Utf8 decoder for incomplete, invalid and overlong sequences\377";
|
||||
|
||||
uses
|
||||
Core;
|
||||
|
||||
file
|
||||
StreamUTF8Test.cpp;
|
||||
|
||||
mainconfig
|
||||
"" = "";
|
||||
|
||||
BIN
autotest/StreamUTF8Test/utf8_stress_test.txt
Normal file
BIN
autotest/StreamUTF8Test/utf8_stress_test.txt
Normal file
Binary file not shown.
|
|
@ -224,76 +224,58 @@ int64 Stream::_Get64() {
|
|||
int Stream::GetUtf8()
|
||||
{
|
||||
int code = Get();
|
||||
|
||||
if(code <= 0) {
|
||||
LoadError();
|
||||
return -1;
|
||||
}
|
||||
|
||||
if(code < 0x80)
|
||||
return code;
|
||||
else
|
||||
if(code < 0xC2)
|
||||
return -1;
|
||||
else
|
||||
if(code < 0xE0) {
|
||||
if(IsEof()) {
|
||||
LoadError();
|
||||
return -1;
|
||||
|
||||
if(code >= 0xC2) {
|
||||
int c = 0, pos = GetPos();
|
||||
if(code < 0xE0) {
|
||||
int c0 = Get();
|
||||
if(c0 >= 0x80 && c0 < 0xC0 &&
|
||||
(c = ((code - 0xC0) << 6) + c0 - 0x80) >= 0x80 && c < 0x800) {
|
||||
return c;
|
||||
}
|
||||
if(c0 < 0)
|
||||
LoadError();
|
||||
}
|
||||
return ((code - 0xC0) << 6) + Get() - 0x80;
|
||||
else
|
||||
if(code < 0xF0) {
|
||||
int c0 = Get();
|
||||
int c1 = Get();
|
||||
if(c1 >= 0x80 && c1 < 0xC0 &&
|
||||
c0 >= 0x80 && c0 < 0xC0 &&
|
||||
(c = ((code - 0xE0) << 12) + ((c0 - 0x80) << 6) + c1 - 0x80) >= 0x800 && c < 0x10000) {
|
||||
return c;
|
||||
}
|
||||
if(c1 < 0)
|
||||
LoadError();
|
||||
|
||||
}
|
||||
else
|
||||
if(code < 0xF8) {
|
||||
int c0 = Get();
|
||||
int c1 = Get();
|
||||
int c2 = Get();
|
||||
if(c2 >= 0x80 && c2 < 0xC0 &&
|
||||
c1 >= 0x80 && c1 < 0xC0 &&
|
||||
c0 >= 0x80 && c0 < 0xC0 &&
|
||||
(c = ((code - 0xF0) << 18) + ((c0 - 0x80) << 12) + ((c1 - 0x80) << 6) + c2 - 0x80) >= 0x10000 && c < 0x110000) {
|
||||
return c;
|
||||
}
|
||||
if(c2 < 0)
|
||||
LoadError();
|
||||
}
|
||||
if(!IsError())
|
||||
Seek(pos); // Rewind (to represent each invalid byte).
|
||||
}
|
||||
else
|
||||
if(code < 0xF0) {
|
||||
int c0 = Get();
|
||||
int c1 = Get();
|
||||
if(c1 < 0) {
|
||||
LoadError();
|
||||
return -1;
|
||||
}
|
||||
return ((code - 0xE0) << 12) + ((c0 - 0x80) << 6) + c1 - 0x80;
|
||||
}
|
||||
else
|
||||
if(code < 0xF8) {
|
||||
int c0 = Get();
|
||||
int c1 = Get();
|
||||
int c2 = Get();
|
||||
if(c2 < 0) {
|
||||
LoadError();
|
||||
return -1;
|
||||
}
|
||||
return ((code - 0xf0) << 18) + ((c0 - 0x80) << 12) + ((c1 - 0x80) << 6) + c2 - 0x80;
|
||||
}
|
||||
else
|
||||
if(code < 0xFC) {
|
||||
int c0 = Get();
|
||||
int c1 = Get();
|
||||
int c2 = Get();
|
||||
int c3 = Get();
|
||||
if(c3 < 0) {
|
||||
LoadError();
|
||||
return -1;
|
||||
}
|
||||
return ((code - 0xF8) << 24) + ((c0 - 0x80) << 18) + ((c1 - 0x80) << 12) +
|
||||
((c2 - 0x80) << 6) + c3 - 0x80;
|
||||
}
|
||||
else
|
||||
if(code < 0xFE) {
|
||||
int c0 = Get();
|
||||
int c1 = Get();
|
||||
int c2 = Get();
|
||||
int c3 = Get();
|
||||
int c4 = Get();
|
||||
if(c4 < 0) {
|
||||
LoadError();
|
||||
return -1;
|
||||
}
|
||||
return ((code - 0xFC) << 30) + ((c0 - 0x80) << 24) + ((c1 - 0x80) << 18) +
|
||||
((c2 - 0x80) << 12) + ((c3 - 0x80) << 6) + c4 - 0x80;
|
||||
|
||||
}
|
||||
else {
|
||||
LoadError();
|
||||
return -1;
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
String Stream::GetLine() {
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue