ultimatepp/uppsrc/Core/Utf.hpp
Mirek Fidler 34ff691308 sizeof(wchar) is changed to 4 (32 bits) to support non BMP unicode characters
This might bring some incompatibilities in code that expects wchar to be 16 bits wide, which
  especially involves dealing with Win32 (and, to a lesser extent, MacOS) APIs, so if your application
  is doing that, please check all instances of WCHAR (UniChar on MacOS) or even wchar,
  especially type casts.

  To support host APIs, char16 is introduced (but there is no 16-bit String variant).

  Use ToSystemCharsetW, FromSystemCharsetW to convert texts to Win32 API.

- Support of drawing non-BMP characters in GUI
- Vastly improved character font replacement code (when drawing characters missing from the requested font, a replacement font is used)
- Last instances of Win32 ANSI calls (those ending with A) are removed
- UTF handling routines are refactored and their naming is unified
- RTF is now able to handle non-BMP characters (RTF is used as clipboard format for RichText)

Other minor changes:

- fixed TryRealloc issue
- improved MemoryCheck
- Removed MemoryAlloc48/MemoryFree48
- In TheIDE, background parsing should now cause delays in the main thread less often
2021-12-02 12:03:19 +01:00

129 lines
3 KiB
C++

// Encodes a single code point as UTF-8 and hands each output unit to `t`,
// leading unit first.
//
//   t          callable invoked once per produced unit
//   codepoint  value to encode
//
// Values in 0xEE00..0xEEFF (the "error escape" range) are not encoded as a
// multi-unit sequence; only their low 8 bits are emitted, as one unit.
// Values in 0xD800..0xDFFF receive no special treatment and are written in
// the three-unit form.
//
// Returns true when something was emitted, false (with `t` never called)
// when codepoint is 0x110000 or above.
template <class Target>
force_inline bool ToUtf8_(Target t, wchar codepoint)
{
    // One unit: the value is passed through unchanged.
    if(codepoint < 0x80) {
        t((char)codepoint);
        return true;
    }

    // Two units: 110xxxxx 10xxxxxx
    if(codepoint < 0x800) {
        t(0xc0 | byte(codepoint >> 6));
        t(0x80 | byte(codepoint & 0x3f));
        return true;
    }

    // ERROR ESCAPE: must be tested before the generic three-unit form,
    // because the range lies below 0x10000.
    if((codepoint & 0xFFFFFF00) == 0xEE00) {
        t((char) codepoint);
        return true;
    }

    // Three units: 1110xxxx 10xxxxxx 10xxxxxx
    if(codepoint < 0x10000) {
        t(0xe0 | byte(codepoint >> 12));
        t(0x80 | byte((codepoint >> 6) & 0x3f));
        t(0x80 | byte(codepoint & 0x3f));
        return true;
    }

    // Four units: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    if(codepoint < 0x110000) {
        t(0xf0 | byte(codepoint >> 18));
        t(0x80 | byte((codepoint >> 12) & 0x3f));
        t(0x80 | byte((codepoint >> 6) & 0x3f));
        t(0x80 | byte(codepoint & 0x3f));
        return true;
    }

    // Too large to encode.
    return false;
}
// Decodes one UTF-8 sequence starting at `_s` and advances `_s` past it.
//
//   _s     read position; moved forward by the number of bytes consumed (1..4)
//   nolim  when true, `_lim` is ignored and no bounds test is performed before
//          continuation bytes are inspected
//   _lim   one-past-the-end pointer; consulted only when `nolim` is false
//   ok     set to false when the input is rejected; never set to true here,
//          so the caller has to initialise it
//
// Returns the decoded code point. When the bytes at `_s` are not accepted
// (bad lead byte, missing or invalid continuation byte, overlong form, value
// of 0x110000 or more, or a three-byte sequence that decodes into
// 0xEE00..0xEEFF), exactly one byte is consumed and 0xEE00 + that byte is
// returned (ERROR ESCAPE).
//
// The byte at `_s` itself is always read, regardless of `_lim`.
force_inline dword FetchUtf8(const char *&_s, bool nolim, const char *_lim, bool& ok)
{
    const byte *p = (const byte *)_s;
    const byte *end = (const byte *)_lim;
    const dword lead = *p;

    // Single byte.
    if(lead < 0x80) {
        _s++;
        return lead;
    }

    // Lead bytes 0x80..0xC1 are never accepted and go straight to the error path.
    if(lead >= 0xC2) {
        dword c;

        // Two-byte form; the result must lie in 0x80..0x7FF.
        if(lead < 0xE0 && (nolim || p + 1 < end) &&
           p[1] >= 0x80 && p[1] < 0xc0) {
            c = ((lead - 0xC0) << 6) + p[1] - 0x80;
            if(c >= 0x80 && c < 0x800) {
                _s += 2;
                return c;
            }
        }

        // Three-byte form; the result must be at least 0x800 and must not
        // collide with the error escape range.
        if(lead < 0xF0 && (nolim || p + 2 < end) &&
           p[1] >= 0x80 && p[1] < 0xc0 && p[2] >= 0x80 && p[2] < 0xc0) {
            c = ((lead - 0xE0) << 12) + ((p[1] - 0x80) << 6) + p[2] - 0x80;
            if(c >= 0x800 && (c < 0xEE00 || c > 0xEEFF)) {
                _s += 3;
                return c;
            }
        }

        // Four-byte form; the result must lie in 0x10000..0x10FFFF.
        // NOTE(review): lead bytes below 0xF0 can still reach this test after
        // an earlier form was rejected; presumably dword is unsigned so that
        // (lead - 0xF0) wraps and the range check below rejects them - confirm.
        if(lead < 0xF8 && (nolim || p + 3 < end) &&
           p[1] >= 0x80 && p[1] < 0xc0 && p[2] >= 0x80 && p[2] < 0xc0 &&
           p[3] >= 0x80 && p[3] < 0xc0) {
            c = ((lead - 0xF0) << 18) + ((p[1] - 0x80) << 12) + ((p[2] - 0x80) << 6) + p[3] - 0x80;
            if(c >= 0x10000 && c < 0x110000) {
                _s += 4;
                return c;
            }
        }
    }

    // Rejected: skip one byte and report it through the error escape range.
    _s++;
    ok = false;
    return 0xEE00 + lead; // ERROR ESCAPE
}
// Bounded variant: decodes one UTF-8 sequence at `s`, using `lim` as the
// one-past-the-end pointer for continuation bytes. `s` is advanced past the
// consumed bytes; `ok` is set to false if the input is rejected (it is never
// set to true). Forwards to the four-argument overload with bounds checking
// enabled.
force_inline dword FetchUtf8(const char *&s, const char *lim, bool& ok)
{
    const bool unbounded = false; // honour `lim`
    return FetchUtf8(s, unbounded, lim, ok);
}
// Unbounded variant: decodes one UTF-8 sequence at `s` without any end
// pointer. `s` is advanced past the consumed bytes; `ok` is set to false if
// the input is rejected (it is never set to true). Forwards to the
// four-argument overload with bounds checking disabled and a null limit.
// NOTE(review): presumably intended for zero-terminated input, where the
// terminator fails the continuation-byte test - confirm against callers.
force_inline dword FetchUtf8(const char *&s, bool& ok)
{
    const bool unbounded = true; // no limit pointer is available
    return FetchUtf8(s, unbounded, nullptr, ok);
}
// Decodes the `len` bytes starting at `s` as UTF-8 and passes every decoded
// value to `t`, in input order.
//
// Rejected bytes do not stop the conversion: each one is delivered to `t` as
// the value FetchUtf8 returns for it, and decoding continues with the next
// byte.
//
// Returns true if every sequence was accepted, false if at least one was
// rejected.
template <class Target>
force_inline bool FromUtf8_(Target t, const char *s, size_t len)
{
    bool valid = true; // FetchUtf8 only ever clears this flag
    for(const char *end = s + len; s < end;)
        t(FetchUtf8(s, end, valid)); // advances s by 1..4 bytes
    return valid;
}
// Encodes one code point as UTF-16 and passes each resulting char16 unit to
// `t`.
//
// Values below 0x10000 are emitted as a single unit (including values in the
// surrogate range, which are not filtered). Values in 0x10000..0x10FFFF are
// emitted as a high surrogate followed by a low surrogate.
//
// Returns false, without calling `t`, when codepoint is 0x110000 or above;
// true otherwise.
template <class Target>
force_inline bool ToUtf16_(Target t, size_t codepoint)
{
    if(codepoint < 0x10000) {
        t((char16)codepoint);
        return true;
    }

    if(codepoint >= 0x110000)
        return false;

    // Split the 20-bit offset above 0x10000 into two 10-bit halves.
    const size_t offset = codepoint - 0x10000;
    t(char16(0xD800 + (0x3ff & (offset >> 10))));
    t(char16(0xDC00 + (0x3ff & offset)));
    return true;
}
// Combines the UTF-16 surrogate pair at s[0], s[1] into a single code point.
//
//   s    position to inspect; s[0] is read unconditionally
//   lim  one-past-the-end pointer; s[1] is read only when s + 1 < lim
//
// Returns the combined value (0x10000 or above) when s[0] is a high
// surrogate (0xD800..0xDBFF) directly followed, within the limit, by a low
// surrogate (0xDC00..0xDFFF). Returns 0 in every other case. `s` is not
// advanced.
force_inline wchar ReadSurrogatePair(const char16 *s, const char16 *lim)
{
    if((*s & 0XFC00) != 0xD800) // not a high surrogate
        return 0;
    if(s + 1 >= lim) // no room left for the second unit
        return 0;
    if((s[1] & 0xFC00) != 0xDC00) // not followed by a low surrogate
        return 0;
    return ((wchar(s[0] & 0x3ff) << 10) | (s[1] & 0x3ff)) + 0x10000;
}
// Walks the `len` UTF-16 units starting at `s` and passes the decoded values
// to `t`, in input order.
//
// A valid surrogate pair is merged into one value and consumes two units.
// Any other unit, including an unpaired surrogate, is passed to `t`
// unchanged and consumes one unit.
template <class Target>
force_inline void FromUtf16_(Target t, const char16 *s, size_t len)
{
    for(const char16 *end = s + len; s < end;) {
        wchar merged = ReadSurrogatePair(s, end);
        if(!merged) { // 0 means there is no valid pair at this position
            t(*s);
            s++;
            continue;
        }
        t(merged);
        s += 2;
    }
}