From 5e4dc5ae9f51b49a87be6a1d6fdc497ea1db514d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C4=B0smail=20Y=C4=B1lmaz?= <32938453+ismail-yilmaz@users.noreply.github.com> Date: Mon, 3 Oct 2022 11:57:12 +0300 Subject: [PATCH] Core/Stream: GetUtf8() method now returns failure on overlong and invalid sequences. (#99) Core: Stream: GetUtf8() method now returns failure on overlong sequences. --- autotest/StreamUTF8Test/StreamUTF8Test.cpp | 38 +++++++ autotest/StreamUTF8Test/StreamUTF8Test.upp | 11 ++ autotest/StreamUTF8Test/utf8_stress_test.txt | Bin 0 -> 22781 bytes uppsrc/Core/Stream.cpp | 104 ++++++++----------- 4 files changed, 92 insertions(+), 61 deletions(-) create mode 100644 autotest/StreamUTF8Test/StreamUTF8Test.cpp create mode 100644 autotest/StreamUTF8Test/StreamUTF8Test.upp create mode 100644 autotest/StreamUTF8Test/utf8_stress_test.txt diff --git a/autotest/StreamUTF8Test/StreamUTF8Test.cpp b/autotest/StreamUTF8Test/StreamUTF8Test.cpp new file mode 100644 index 000000000..c7612e0de --- /dev/null +++ b/autotest/StreamUTF8Test/StreamUTF8Test.cpp @@ -0,0 +1,38 @@ +#include <Core/Core.h> + +using namespace Upp; + +void ValidateUtf8(const String& src, int begin, int end, int pos) +{ + // Checks for malformed, incomplete and overlong UTF-8 sequences. + // Replaces each malformed/illegal byte with the replacement character. (Recommended method) + // The length of the original line and decoded/replaced line MUST be equal for these sequences. + + String dest; + StringStream ss(src); + bool check_utf8 = pos >= begin && pos < end; + while(!ss.IsEof()) { + int c = check_utf8 ? ss.GetUtf8() : ss.Get(); + dest.Cat(c < 0 ? 0xFFFD : c); + } + DLOG(dest); + if(check_utf8) + ASSERT(dest.GetLength() == src.GetLength()); +} + +CONSOLE_APP_MAIN +{ + // This autotest uses Markus Kuhn's UTF-8 stress test text. 
+ // StdLogSetup(LOG_COUT); + String text = LoadDataFile("utf8_stress_test.txt"); + ASSERT(!IsNull(text)); + StringStream ss(text); + int pos = 0; + int begin = text.Find("3 Malformed sequences"); + int end = text.Find("5.3 Noncharacter code positions"); + ASSERT(begin >= 0 && begin < end); + while(!ss.IsEof()) { + String line = ss.GetLine(); + ValidateUtf8(line, begin, end, pos++); + } +} diff --git a/autotest/StreamUTF8Test/StreamUTF8Test.upp b/autotest/StreamUTF8Test/StreamUTF8Test.upp new file mode 100644 index 000000000..ec803701a --- /dev/null +++ b/autotest/StreamUTF8Test/StreamUTF8Test.upp @@ -0,0 +1,11 @@ +description "Check's Stream's Utf8 decoder for incomplete, invalid and overlong sequences\377"; + +uses + Core; + +file + StreamUTF8Test.cpp; + +mainconfig + "" = ""; + diff --git a/autotest/StreamUTF8Test/utf8_stress_test.txt b/autotest/StreamUTF8Test/utf8_stress_test.txt new file mode 100644 index 0000000000000000000000000000000000000000..a5b5d50e6b61eb9a3b751b3954f83e61bb59db9b GIT binary patch literal 22781 zcmdU1X_Fh*b>CYjiIrHEbIyIeVd1bb#8j*y4MbNJE8ngr{T%> zH=?M0*NGF|Zns=*maBHFY*)*j-4j3B+Sy$_dEy6TNmiFvPA)BPEUj+fmviUj>}zDb zylhzyHeB;;sk==Fw0Y8Snr+$lJK|ijTdwCUO2hB+4}n}98At=QQVnO2tbk<@(9!SA*KTMb-()7zJ}9Yk)m)3ovMPE_?< z4&m`=AYfdi2gL9x(zW;T2&>e#!>~kZOLg0AmhhcCgBHcvb3FXf@9Z>qKS;P_9xq{M zX9r^v+X4}FC$KAEXd?8A)3Eozr9kXRLJ%VDrme$0ABAGOEs(JYzJ!rugIk~^6$2>n zcEzq>1dFOMR23y23`tPI^(hkaJx~+w1GHs>5#U|33BkDdJ8tNBov>p!@zi!F0^_$X zwVlAF6hNWW!L)ctp%pbicFV`;>TVE5OOcBa*d6d8P>GKyMu;y#v2jP!v2ly zk#^vNEf_7C=(s`3piW3-P;OK9WvZqTur#5BA;#K-8dm*1m=Jj3)$CxO%20DT9xyLe zwsDmL*Fxz!%_e`wy4l5ZQiI8()+YdS1n&OdNLR+dk#pITbEYjt^fc~L?miBLAnr&X@2<+Unk z1xxnas$;=~9Ct4@5j@+*5Y%*_V9+xuWkXKLj~OeW4FN%?jpW=YyARqX)rK9|RoI{z z6*L^m6}lKPRobrk?FftsJC)FhN~FyyrMJYKuwgkgxUNh6oN`}GFDwzt;8txQTCib0 zpaX3vNd^8-vO$_rm*s*-zPDPw2N~g;P^31r#;!I1WQ{2rZPN#c8k3rQ#uD?@lrel9 zc&guQIdCZHh-Bg7@$K#H&0~wwZMb+&)Ax7DrV9Ahba&}X@N9^jV}PH6M+Px?o*Hkbs)2c49sb4l;8pIYPlARu 
zoGPT(Z8+q>K>~n^Evy`)jv=&wc~r>Af(RF7VRiKHP%*^JY6M2Y=RqsH9fW|J=p5L9 znCI@cwPR{0b+~K4%VEB*<45q|vDS%Qr|sIF1&&Z+V?K-_EmasQ z8kByYLD(%L0K%By=V1!rzE^DozUTgkfsVl1(8MKiTZq~&WQ?&SPutJLna1mUxDM_8 zP)ZNpy(L@sZI)m$I*2Ai09`mdo-eM(DM*@X6gpHP8)DPgmBh+wV22I*AB0J!9+fy; z8{>>!u-$}rV^pxJHiRsQ=#+ES2`J?=&=eCklnU@e6AC)5L_(+U%sfB{lLwa<+D)c0 z@KC(hsyj~Ym{ISVi82sR3hY2obbeCmm0&TKs!kix5mfO0s0&XAKVcanR?92p6&3Aj z8SfK619~GH1bV)=)P}u8YO7}+ICuU$#}Wm8Gqf%qN3>YhoB8(iHYkdjXatJ~CgW3j zY>f4iQ(Nk~HBd)WUqlpu?vm2{m{U#~UP-4-nIT8$v^)WO76SEQ@N+P=V`Y;of&qRv z)cla%oCy!KxVqci&#Y-mCc=zBS!*Ss((AM;P-3Qwi3bb$e1=vyB2^fTxfonxrVgS2 zoKKuoB7(+ABNaY2vzm!32iYF<<33-#7gY~A0sMvCghhF zq-e|{8C-$Ugm&8k%c>);(`u^J2$kppjT%GKyf1X4j@nzOxnwR`JcchcJNi(NkUAXU zsBwD@$v}mutT zh$~}5e&&(OGeVveIhlv#Txx{K7vFe!R?Jf(r*fF<3Yq8RORop!41|tK>vn&mbRg_| zm`Ke*yX7d<8wxo^y-#oF)cePZYNK$x)71O)b{_BPK~X?uV0lu#G3m<#4hJ(Lj;r?y z0rQC89Lx$irrv9cn1}opAZH|VRK3AOy>gQIHZf-ayJ7t0~3rOy}M?jxyqUWGiivnIv=ZUUoW)yJ@%RT>A zZ|6_n7AGs61L#Y#T@(gG2L*alV=(oPCW|Yqyd_s}%50DMoQs02-}TDL_lo|U|9IgU zC=ru;a?(B`3|YRkv3!@5islZ`Ka<&Mu%8qz@G5oS74LhKL_mwUeB`g-0 z885ud+RgBaa?Su6Jh+(ko8rwxp}|7{J$ZAyStvAkr{x|eJ@}@Vx<*jdf`#awGu6MuZz3+SfqlZB<_ZYm#KJdX0efT3E z{n*Dp@ySnp`tc{O+%Dc{KKqNm^vj?7mHw}O{tLhM#V4=MWD%p!p1(ZSbgF~$IZ9|@ zg!S8ed2saM1%5jc`snXjU|w4_tD$k&MOa#0(%M{7dBDVc`BIg7T&ttBKjlj4(;A;C z%OM8p(pcEW0&zC_v`#PuCjC>2m;UAU-Y8z_ntVyVEMJkY%CF0B$ZyJT$#2W=$nVPU z$?wZk^0gTrVd(n<`9t|5`D6JL`BV8b`E&UT`AhjL`D^(b`CIurd0L7ms`DQ&^p6@y z`L3KAm+#sX`8t%Z^NN=5b@{sdz5Ijxqx_Tnv;2$vtNfe%yZndzr`(Wl$ngJN9>_N+ zkBxD0tWOa~o#LpUo^Ebg{P?c6w`b&8c}|{}7v#U>zvY`V@PFJCkH$WbuTdWB&64GgT zysDZJJZ*k%+6;6}KSAQkmiF_oRc4|oB7=iG(#2q13uZ9^c@>u(QBsJ|L8XgMXgXOq zm#rG3WWd$cxU&sS8R<2D{;i>eh5TVo7);F63rof4=zI7naP@a<-sQ;!47FNLX26 z$t>m86H2;}Fo~I(<6PcQ(on~9X!b$vkUblwOT$G!=a?r8i#%}tK7u}$E(?52e5$%z z*gDHJG1+X$Lc>MgN8*O8*Ce66mo|5>-H8DsEtVFV6DByx&-srR$fax7FT;7z+SNRt z9uA?!e{cTuJqZ2D`v2HZW++5+qqcUo+YTB| z5%$vF(4+}4($33K3g=HB_{TU{ceuFp^0>{9KK<6e(WXyw-d)#qrSHrK4`C4W@n1a# ze);9^9145>{DTL(fKE5hGUG_Lmbq~a<|`Q&!6_2p=}QBWMm2ea0_}P(FGrd?Dc08U2^rTakV#X 
z(Xa@!XT)ZkA}VQUq(i@Jt;5YC4X0I>Gg1m|sD$YQ%kbjn(^BG&1Lw8KC-o-N)2xl) z*{Cy)XwN6?Gk^8aUTcTgmT;Fg4`YW4w{vK-y7n%!r@hIhc*HIwuXW+JhA~|rCz7kD zn_I9`2oX=0$j~ z30Q8Z3fn^)s82;V(zUh!)zHLRu!j}ZoYp3d(oJZC#sC`E$OSyN?QmdVhS@=Y=}{-( zrZe5oLhIafo$g}7J)};MR=h6_ACB8crtjR2W%^xdvJl5O|J8_AObq4NWVKM1)X&S53})42gHvR~Io zCU5;QAF6wAcnkNvJ|GL31u)a|b4=={0}XCp4!onNK0W z_(((4S{kxYI#EJ5wb`r%y)(juZ6z1&m-~8h>Gr6A7gfNC(|{@KZm0;o+q!9y_RIk$ z7vs^vE!Ql8fE`MnZe=scf6dnoFWAP%kJAe|{{-W{H~n<>H}jV-^bZ=y^2RC~gR0n} zkc;=A5NYAHD~0p8o<+db4A9Y@pgbl3uV0=OaVhN4#N5@hu%i?v24T zJ(F?mj%30|pjAJ?#2fmEhk#j-o4F^<;Vp>-Cod#(nAA=*Rwo7g`!3#-%}x${>ipI5 z!O2`BF>Lq(lPBm7t^@2UBe@rEL(U*LH;rLfodHTRS5xE6G8Hmm;>xXhvJ0b73K;#4*}HwR>dIZnepaCUAE6hTQO%}vGzUetHYUmh$4KVTMi zNfR2UFij8q9n>fC1Idl)ptnatpK)KC*w6+8W0gf`1F*Flj}by43 zC48);eEiH|X~d!}__&>n8g^^E;Ek(?MZ~f#5tC7yM2Vv=mLH8tX5`n~{)LiWmHTZs zfIPlSt;OW-$JJWE$W++IXh3%+^~WyXxV7MQ%qHF^n5l3w%v3jld22x#%-CXJgClgR zDUQ(j_@FSIvCSCFjLsA@r4yJ%=`>bD%O0UqReglcCyMja*iDR?(V1eVbON&|oyKzL zFGlE8s1Z7^6z3;;9sm|&W^|^QDV@M9Lg#Ip;0OTqXie_JvV5|DfU-W8I1Zch%s+aK zW;-RE7Lt;R(lwWfN<5k-*n@yrRL~=$aGDH@-g?Z^*m(16PpHNsaT!^{^P4=S4s$u4 zn4_g-PG6ET$^-GbU5eSWO3BPePsM`!RsujH>i4x$Ilc89&MIe9-DdQnA3T!WrWLlMIw zp{lN#KWL=_YYXWDsV)UqntpW`df>;3Z59c&tui(!b9f~fie^igAJ#+iN8yc(*PM%o zgQ+lQ#JPjq$ur$Tjq@=p!HGQLh0Y9+wa(;}L%?JP$Ax_^nZjg87@pla_1*mCaesQs zBt{CNc)%f4raCGP@*pJw5#o&C&kib4X2*-!d*E^m64IZa^19wsZbc|NX=Mqr?7V z!~O?`{SOZN9~$;QJnVmD*#GFT|FL2J|ZGz&1f`h z_Eg0#9_P<(SztEeVN5IK^2&0qE&E{3&!4{dd8KC_9bCn0@XQtbyn>$t{6x|G*@p*L z@ftjf&+!^Oi_h^IJd4lq8a#*3@fsXVxA`=q{yCuGHFyqacnzKd8eW6vfri)Md3=u7 p;CXzG*Wh`4j@JOEDqJmkvO@N>BDCH5J3T8tzwg#xE%c3^^?x0SNE-kE literal 0 HcmV?d00001 diff --git a/uppsrc/Core/Stream.cpp b/uppsrc/Core/Stream.cpp index 53c8642e7..51f0274bf 100644 --- a/uppsrc/Core/Stream.cpp +++ b/uppsrc/Core/Stream.cpp @@ -224,76 +224,58 @@ int64 Stream::_Get64() { int Stream::GetUtf8() { int code = Get(); + if(code <= 0) { LoadError(); return -1; } + if(code < 0x80) return code; - else - if(code < 0xC2) - return -1; - else - if(code < 0xE0) { - 
if(IsEof()) { - LoadError(); - return -1; + + if(code >= 0xC2) { + int c = 0, pos = GetPos(); + if(code < 0xE0) { + int c0 = Get(); + if(c0 >= 0x80 && c0 < 0xC0 && + (c = ((code - 0xC0) << 6) + c0 - 0x80) >= 0x80 && c < 0x800) { + return c; + } + if(c0 < 0) + LoadError(); } - return ((code - 0xC0) << 6) + Get() - 0x80; + else + if(code < 0xF0) { + int c0 = Get(); + int c1 = Get(); + if(c1 >= 0x80 && c1 < 0xC0 && + c0 >= 0x80 && c0 < 0xC0 && + (c = ((code - 0xE0) << 12) + ((c0 - 0x80) << 6) + c1 - 0x80) >= 0x800 && c < 0x10000) { + return c; + } + if(c1 < 0) + LoadError(); + + } + else + if(code < 0xF8) { + int c0 = Get(); + int c1 = Get(); + int c2 = Get(); + if(c2 >= 0x80 && c2 < 0xC0 && + c1 >= 0x80 && c1 < 0xC0 && + c0 >= 0x80 && c0 < 0xC0 && + (c = ((code - 0xF0) << 18) + ((c0 - 0x80) << 12) + ((c1 - 0x80) << 6) + c2 - 0x80) >= 0x10000 && c < 0x110000) { + return c; + } + if(c2 < 0) + LoadError(); + } + if(!IsError()) + Seek(pos); // Rewind (to represent each invalid byte). } - else - if(code < 0xF0) { - int c0 = Get(); - int c1 = Get(); - if(c1 < 0) { - LoadError(); - return -1; - } - return ((code - 0xE0) << 12) + ((c0 - 0x80) << 6) + c1 - 0x80; - } - else - if(code < 0xF8) { - int c0 = Get(); - int c1 = Get(); - int c2 = Get(); - if(c2 < 0) { - LoadError(); - return -1; - } - return ((code - 0xf0) << 18) + ((c0 - 0x80) << 12) + ((c1 - 0x80) << 6) + c2 - 0x80; - } - else - if(code < 0xFC) { - int c0 = Get(); - int c1 = Get(); - int c2 = Get(); - int c3 = Get(); - if(c3 < 0) { - LoadError(); - return -1; - } - return ((code - 0xF8) << 24) + ((c0 - 0x80) << 18) + ((c1 - 0x80) << 12) + - ((c2 - 0x80) << 6) + c3 - 0x80; - } - else - if(code < 0xFE) { - int c0 = Get(); - int c1 = Get(); - int c2 = Get(); - int c3 = Get(); - int c4 = Get(); - if(c4 < 0) { - LoadError(); - return -1; - } - return ((code - 0xFC) << 30) + ((c0 - 0x80) << 24) + ((c1 - 0x80) << 18) + - ((c2 - 0x80) << 12) + ((c3 - 0x80) << 6) + c4 - 0x80; - } - else { - LoadError(); - return -1; - } + 
return -1; } String Stream::GetLine() {