#include "RichText.h"

#define LLOG(x) // DLOG(x)

namespace Upp {

// Compressed table of named colors; it is expanded with ZDecompress in the
// TrivialHtmlParser constructor and read as pairs of <identifier> <hex number>.
const byte html_colors_z[] = {
	120,156,93,86,107,186,171,42,12,157,10,67,176,62,113,56,129,132,202,173,130,27,117,247,235,25,253,13,143,218,186,229,143,89,96,12,201,90,1,53,131,126,136,42,61,98,179,243,47,5,161,171,56,196,61,192,75,200,42,14,241,156,236,78,194,164,71,44,16,188,119,60,149,62,10,132,60,145,94,215,35,172,51,165,9,254,198,28,122,218,44,164,73,254,234,30,136,156,136,51,188,114,182,11,137,8,243,187,159,237,47,165,31,177,241,162,121,246,207,244,39,182,28,252,190,82,112,236,78,205,7,165,119,246,181,19,204,201,21,227,240,115,64,114,197,56,204,86,83,90,104,42,35,35,224,118,251,115,80,9,31,72,225,112,253,128,223,121,59,214,145,24,216,198,86,192,191,35,164,207,227,180,34,123,103,163,51,29,106,161,236,246,19,61,27,106,117,203,225,124,37,142,13,167,39,66,152,23,239,98,58,72,105,188,68,28,223,127,173,159,105,23,18,106,69,181,80,193,63,157,128,174,134,26,132,58,194,252,122,122,143,2,73,73,57,8,13,72,123,114,208,153,145,160,18,122,
	130,176,7,58,182,28,105,21,17,175,253,12,188,49,172,251,241,70,66,251,192,105,49,102,48,93,21,13,103,56,149,20,146,151,190,29,59,46,84,68,185,202,143,152,96,201,123,210,193,46,27,215,18,245,173,109,216,124,129,123,167,6,33,60,206,29,72,149,236,50,47,85,177,239,126,70,114,129,195,230,160,251,170,128,145,55,48,198,81,236,92,248,190,229,160,11,112,89,240,152,224,97,133,66,53,244,217,195,2,119,114,59,8,169,206,63,39,146,100,79,93,215,171,58,199,199,27,118,177,64,70,234,226,219,7,61,89,20,227,216,212,90,39,36,242,51,58,42,11,182,84,35,65,227,216,15,144,17,130,236,88,26,165,101,118,188,197,188,166,221,183,178,193,18,67,2,211,238,106,211,242,248,6,233,2,238,71,248,57,188,221,98,242,52,225,45,129,165,252,35,167,1,27,174,51,173,171,117,177,18,183,118,204,246,246,120,149,140,171,84,1,187,164,191,245,99,28,217,164,143,233,241,94,138,123,163,49,114,204,216,64,42,88,166,165,170,227,35,184,254,204,136,83,187,96,88,
	148,62,208,182,231,253,214,181,84,113,213,31,157,130,117,155,242,193,51,41,226,16,247,201,111,123,113,34,147,170,98,217,121,57,14,156,211,15,5,16,152,204,215,182,113,209,124,50,138,190,1,141,169,83,79,56,215,78,222,209,11,233,153,181,23,237,189,228,167,31,85,43,172,67,11,46,22,83,99,167,59,157,128,187,23,109,228,72,45,236,175,15,175,220,160,42,145,9,101,42,234,165,22,51,252,146,67,110,107,212,83,111,224,180,57,115,219,20,191,168,76,199,224,211,229,96,7,109,34,151,102,98,150,48,149,140,97,178,196,220,177,158,103,123,159,178,38,1,81,82,159,129,162,186,188,137,140,68,149,80,86,81,2,206,28,189,187,27,24,192,186,204,197,116,97,19,199,27,136,97,140,21,113,81,79,228,186,164,228,69,245,250,150,129,66,106,14,179,98,82,103,232,205,234,186,82,53,188,193,66,48,57,104,50,111,236,100,245,48,72,57,142,23,148,254,160,59,209,156,60,168,74,183,72,25,253,234,217,116,109,237,241,61,71,193,114,196,38,110,217,177,17,153,200,233,123,
	203,188,16,239,122,172,44,132,246,88,190,250,115,223,107,102,88,193,207,198,196,101,201,72,17,190,130,174,227,52,101,172,156,71,99,51,84,168,10,118,230,165,209,170,25,110,111,244,212,251,160,122,73,244,134,87,254,245,253,205,98,195,29,171,76,124,4,222,74,188,113,167,201,112,150,120,34,233,112,235,100,39,22,139,238,100,205,109,188,141,3,239,204,186,93,7,130,37,30,44,92,50,6,182,253,21,252,150,207,22,226,152,188,214,176,89,151,108,213,197,99,16,254,243,111,29,35,1,94,78,70,102,23,31,70,60,133,166,227,180,94,206,211,100,96,0,37,122,37,169,110,196,217,50,161,139,211,201,202,71,120,155,129,148,70,4,78,89,47,86,152,233,35,112,34,146,92,129,4,102,146,114,199,28,101,2,62,9,1,195,235,40,129,159,116,112,111,175,184,201,173,176,194,11,120,35,107,220,154,193,78,172,4,122,90,143,216,189,12,130,26,25,8,7,75,92,118,141,17,133,231,124,27,81,98,157,143,69,32,66,133,40,86,255,196,210,250,84,69,145,74,215,155,7,183,65,
	226,12,22,176,239,155,134,9,252,117,79,225,92,191,242,241,27,27,62,183,252,224,95,144,121,221,222,250,145,43,176,177,194,103,202,107,164,106,187,91,35,222,26,3,89,13,53,91,14,139,15,211,66,219,243,229,233,20,28,73,213,13,209,222,38,86,70,212,69,199,9,217,44,57,7,2,170,174,174,241,207,85,235,91,151,164,196,135,141,61,116,177,243,124,105,148,247,199,125,225,75,158,5,112,89,129,220,89,196,31,214,242,125,64,124,132,219,246,178,230,118,186,115,139,66,126,225,6,249,125,153,218,39,230,34,231,140,143,60,131,82,236,126,129,221,199,22,220,180,131,248,34,61,103,29,43,81,78,52,166,69,205,27,124,78,4,59,115,154,111,48,205,245,194,152,140,109,241,143,124,151,226,126,123,189,230,101,171,112,10,98,163,248,31,84,48,144,29,
};

// Minimal HTML -> RichText converter. Walks the input with XmlParser (in
// relaxed mode) and appends paragraphs to `richtext`, applying a small subset
// of CSS taken from built-in per-tag defaults and from inline `style`
// attributes.
struct TrivialHtmlParser {
	XmlParser p;
	Font      base_font;
	RichPara  para;     // paragraph currently being assembled
	RichText  richtext; // accumulated output
	int       prev_margin = 0; // for margin collapsing
	int       parent_tag_margin = 0; // for margin (de)collapsing
	bool      wastext = false; // true once `para` received visible text

	// Paragraph/character format extended with the CSS box properties
	// that ReadStyle fills and AddPara folds into the RichPara format.
	struct Style : RichPara::Format {
		int  border = 0;
		Rect padding = Rect(0, 0, 0, 0);
		Rect margin = Rect(0, 0, 0, 0);
		int  bullet_style = RichPara::BULLET_ROUND;
		bool pre = false; // white-space: pre
	};

	static VectorMap<String, Color>  named_color; // color name -> color
	static VectorMap<String, String> div;         // block tag -> default CSS
	static VectorMap<String, String> span;        // inline tag -> default CSS

	void ReadStyle(Style& fmt, const String& style);
	void AddPara(const Style& fmt);
	void Parse(Style fmt);

	TrivialHtmlParser(const char *html);
};

VectorMap<String, Color>  TrivialHtmlParser::named_color;
VectorMap<String, String> TrivialHtmlParser::div;
VectorMap<String, String> TrivialHtmlParser::span;

// Sets up the XML parser over `html` and, inside ONCELOCK, fills the shared
// static tables (default tag styles and named colors).
TrivialHtmlParser::TrivialHtmlParser(const char *html)
:	p(html)
{
	ONCELOCK {
		div
			("div", "")
			("h1", "font-size: 2em; margin-top: 0.67em; margin-bottom: 0.67em; margin-left: 0; margin-right: 0; font-weight: bold;")
			("h2", "font-size: 1.5em; margin-top: 0.83em; margin-bottom: 0.83em; margin-left: 0; margin-right: 0; font-weight: bold;")
			("h3", "font-size: 1.17em; margin-top: 1em; margin-bottom: 1em; margin-left: 0; margin-right: 0; font-weight: bold;")
			("h4", "margin-top: 1.33em; margin-bottom: 1.33em; margin-left: 0; margin-right: 0; font-weight: bold;")
			("h5", "font-size: .83em; margin-top: 1.67em; margin-bottom: 1.67em; margin-left: 0; margin-right: 0; font-weight: bold;")
			("h6", "font-size: .67em; margin-top: 2.33em; margin-bottom: 2.33em; margin-left: 0; margin-right: 0; font-weight: bold;")
			("p", "margin-top: 1em; margin-bottom: 1em;")
			("dt", "margin-top: 1em; margin-bottom: 1em;")
			("dd", "margin-left: 40px")
			("dl", "")
			("hr", "margin-top: 0.5em; margin-bottom: 0.5em; border-width: 1px;")
			("li", "display: list-item; margin-top: 0; margin-bottom: 0;")
			("ol", "list-style-type: decimal; margin-top: 1em; margin-bottom: 1em; margin-left: 0; margin-right: 0; padding-left: 40px;")
			("ul", "list-style-type: disc; margin-top: 1em; margin-bottom: 1em; margin-left: 0; margin-right: 0; padding-left: 40px;")
			("pre", "font-family: monospace; white-space: pre; margin: 1em 0;")
			("figure", "margin-top: 1em; margin-bottom: 1em; margin-left: 40px; margin-right: 40px;")
			("blockquote", "margin-top: 1em; margin-bottom: 1em; margin-left: 40px; margin-right: 40px;")
			("legend", "padding-left: 2px; padding-right: 2px;")
		;
		span
			("code", "font-family:monospace")
			("b", "font-weight:bold")
			("strong", "font-weight:bold")
			("i", "font-style:italic")
			("cite", "font-style:italic")
			("em", "font-style:italic")
			("u", "text-decoration:underline")
			("strike", "text-decoration:line-through")
			("s", "text-decoration:line-through")
			("sub", "vertical-align:sub")
			("sup", "vertical-align:super")
		;
		named_color.Add("transparent", Null);
		String html_colors = ZDecompress(html_colors_z, sizeof(html_colors_z));
		CParser colors(html_colors); // named distinctly so it does not shadow member `p`
		while(!colors.IsEof()) {
			String id = colors.ReadId();
			dword n = colors.ReadNumber(16);
			named_color.Add(id, Color((byte)(n >> 16), (byte)(n >> 8), (byte)(n)));
		}
	}

	p.PreserveAllWhiteSpaces();
	p.Relaxed();
	p.RegisterEntity("nbsp", WString(0xa0, 1).ToString());
}

// Applies the CSS declarations in `style` (a `name: value; ...` list) to `fmt`.
// Relative units are resolved against a copy of `fmt` taken on entry
// (`parent_fmt`) or against `base_font`. Parse errors in one declaration are
// swallowed and parsing resumes after the next ';'.
void TrivialHtmlParser::ReadStyle(Style& fmt, const String& style)
{
	Style parent_fmt = fmt; // TODO: BETTER!
	try {
		CParser p(style);

		// Repeats x() until the end of the declaration, skipping tokens x() rejects.
		auto Do = [&](auto x) {
			while(!p.IsChar(';') && !p.IsEof()) {
				if(!x())
					p.Skip();
			}
		};

		// Reads a CSS length into `l`, converted to the internal unit in which
		// one inch is 600; `psize` is the reference for '%' and for unitless
		// numbers. The result is made non-negative and capped at 2000.
		// Returns false (consuming nothing) if no number follows.
		auto ReadLength = [&](double& l, double psize = 2000) {
			if(p.IsDouble()) {
				l = p.ReadDoubleNoE(); // NoE so that the 'e' of "em"/"ex" is not read as an exponent
				if(p.Char('%'))
					l *= psize / 100; // percentage of the reference size
				else
				if(p.Id("em"))
					l *= parent_fmt.GetHeight();
				else
				if(p.Id("rem"))
					l *= base_font.GetHeight();
				else
				if(p.Id("rlh"))
					l *= base_font.GetLineHeight();
				else
				if(p.Id("lh"))
					l *= fmt.GetLineHeight();
				else
				if(p.Id("ch"))
					l *= parent_fmt['0'];
				else
				if(p.Id("ex"))
					l *= parent_fmt.GetHeight() * 0.5;
				else
				if(p.Id("ic"))
					l *= parent_fmt[0x6C34];
				else
				if(p.Id("cm"))
					l *= 600 / 2.54;
				else
				if(p.Id("mm"))
					l *= 60 / 2.54;
				else
				if(p.Id("qm"))
					l *= 60 / 2.54 / 4;
				else
				if(p.Id("in"))
					l *= 600;
				else
				if(p.Id("pt"))
					l *= 600 / 72.0;
				else
				if(p.Id("px"))
					l *= 600 / 96.0;
				else
					l *= psize; // unitless number is a multiple of the reference size
				l = clamp(abs(l), 0., 2000.);
				return true;
			}
			return false;
		};

		// Stores a parsed length into `tsize`; leaves it untouched if none is present.
		auto ReadLengthPar = [&](int& tsize, double psize = 2000) {
			double l;
			if(ReadLength(l, psize))
				tsize = (int)l;
		};

		// Adds a parsed length to `tsize` (used for horizontal margin/padding).
		auto ReadLengthParAdd = [&](int& tsize, double psize = 2000) {
			int n = 0;
			ReadLengthPar(n, psize);
			tsize += n;
		};

		// Reads a named color or a '#' followed by a hexadecimal number.
		auto ReadColor = [&](Color& c) {
			if(p.IsId()) {
				int q = named_color.Find(p.ReadId());
				if(q >= 0) {
					c = named_color[q];
					return true;
				}
			}
			else
			if(p.Char('#')) {
				dword n = p.ReadNumber(16);
				c = Color((byte)(n >> 16), (byte)(n >> 8), (byte)(n));
				return true;
			}
			return false;
		};

		auto TextAlign = [&] {
			if(p.Id("start") || p.Id("left")) // TODO RTL
				fmt.align = ALIGN_LEFT;
			else
			if(p.Id("end") || p.Id("right")) // TODO RTL
				fmt.align = ALIGN_RIGHT;
			else
			if(p.Id("center"))
				fmt.align = ALIGN_CENTER;
			else
			if(p.Id("justify") || p.Id("justify-all"))
				fmt.align = ALIGN_JUSTIFY;
			else
				return false;
			return true;
		};

		auto TextDecoration = [&] {
			if(p.Id("underline"))
				fmt.Underline();
			else
			if(p.Id("line-through"))
				fmt.Strikeout();
			else
			if(p.Id("none")) {
				fmt.Underline(false);
				fmt.Strikeout(false);
			}
			else
				return false;
			return true;
		};

		// Absolute keywords scale base_font's height, relative ones the parent's.
		auto FontSize = [&] {
			double l;
			if(ReadLength(l, parent_fmt.GetHeight()))
				fmt.Height((int)l);
			else {
				l = base_font.GetHeight();
				if(p.Id("xx-small"))
					l *= 0.512;
				else
				if(p.Id("x-small"))
					l *= 0.64;
				else
				if(p.Id("small"))
					l *= 0.8;
				else
				if(p.Id("medium"))
					;
				else
				if(p.Id("large"))
					l *= 1.25;
				else
				if(p.Id("x-large"))
					l *= 1.5;
				else
				if(p.Id("xx-large"))
					l *= 1.8;
				else
				if(p.Id("xxx-large"))
					l *= 2.16;
				else {
					l = parent_fmt.GetHeight();
					if(p.Id("smaller"))
						l *= 0.8;
					else
					if(p.Id("larger"))
						l *= 1.25;
					else
						return false;
				}
				fmt.Height((int)l);
			}
			return true;
		};

		auto FontStyle = [&] {
			if(p.Id("normal"))
				fmt.Italic(false);
			else
			if(p.Id("italic") || p.Id("oblique"))
				fmt.Italic();
			else
				return false;
			return true;
		};

		// Numeric weights of 500 and above are treated as bold.
		auto FontWeight = [&] {
			if(p.Id("normal") || p.Id("lighter"))
				fmt.Bold(false);
			else
			if(p.Id("bold") || p.Id("bolder"))
				fmt.Bold();
			else
			if(p.IsDouble())
				fmt.Bold(p.ReadDouble() >= 500);
			else
				return false;
			return true;
		};

		auto FontFamily = [&] {
			if(p.Id("serif"))
				fmt.Face(Font::SERIF);
			else
			if(p.Id("sans-serif"))
				fmt.Face(Font::SANSSERIF);
			else
			if(p.Id("monospace"))
				fmt.Face(Font::MONOSPACE);
			else
				return false;
			return true;
		};

		auto FontVariant = [&] {
			if(!p.Id("small-caps"))
				return false;
			fmt.capitals = true;
			return true;
		};

		// Maps the line-height to the nearest of the available line spacings.
		auto LineHeight = [&] {
			double l;
			if(p.Id("normal")) {
				fmt.linespacing = RichPara::LSP115;
				return true; // keyword was consumed and applied
			}
			if(ReadLength(l, fmt.GetHeight())) {
				l /= fmt.GetHeight();
				fmt.linespacing = l < 1.1 ? RichPara::LSP10 :
				                  l < 1.4 ? RichPara::LSP115 :
				                  l < 1.8 ? RichPara::LSP15 :
				                            RichPara::LSP20;
				return true;
			}
			return false;
		};

		// Reads 1 to 4 lengths in CSS shorthand order (top, right, bottom, left);
		// leaves `r` untouched if none is present.
		auto ReadLengthPar4 = [&](Rect& r) {
			Vector<double> m;
			for(int i = 0; i < 4; i++) {
				double x;
				if(ReadLength(x, 2000))
					m.Add(x);
				else
					break;
			}
			switch(m.GetCount()) {
			case 1:
				r.left = r.right = r.top = r.bottom = (int)m[0];
				break;
			case 2:
				r.left = r.right = (int)m[1];
				r.top = r.bottom = (int)m[0];
				break;
			case 3:
				r.left = r.right = (int)m[1];
				r.top = (int)m[0];
				r.bottom = (int)m[2];
				break;
			case 4:
				r.left = (int)m[3];
				r.right = (int)m[1];
				r.top = (int)m[0];
				r.bottom = (int)m[2];
				break;
			default:;
			}
		};

		while(!p.IsEof()) {
			try {
				// Matches identifier `s` and an optional following ':'.
				auto Attr = [&](const char *s) {
					if(p.Id(s)) {
						p.Char(':');
						return true;
					}
					return false;
				};

				// Longer property names are tested before names that are their
				// prefix (e.g. "margin-left" before "margin", "font-size" before "font").
				if(Attr("text-decoration") || Attr("text-decoration-line"))
					Do([&] { return TextDecoration(); });
				else
				if(Attr("font-variant")) {
					if(!FontVariant())
						fmt.capitals = false;
				}
				else
				if(Attr("font-family"))
					Do([&] { return FontFamily(); });
				else
				if(Attr("line-height"))
					LineHeight();
				else
				if(Attr("text-align"))
					TextAlign();
				else
				if(Attr("border-width"))
					ReadLengthPar(fmt.border);
				else
				if(Attr("text-indent"))
					ReadLengthPar(fmt.indent);
				else
				if(Attr("margin-left"))
					ReadLengthParAdd(fmt.margin.left);
				else
				if(Attr("margin-right"))
					ReadLengthParAdd(fmt.margin.right);
				else
				if(Attr("margin-top"))
					ReadLengthPar(fmt.margin.top);
				else
				if(Attr("margin-bottom"))
					ReadLengthPar(fmt.margin.bottom);
				else
				if(Attr("margin"))
					ReadLengthPar4(fmt.margin);
				else
				if(Attr("padding-left"))
					ReadLengthParAdd(fmt.padding.left);
				else
				if(Attr("padding-right"))
					ReadLengthParAdd(fmt.padding.right);
				else
				if(Attr("padding-top"))
					ReadLengthPar(fmt.padding.top);
				else
				if(Attr("padding-bottom"))
					ReadLengthPar(fmt.padding.bottom);
				else
				if(Attr("padding"))
					ReadLengthPar4(fmt.padding);
				else
				if(Attr("font-weight"))
					FontWeight();
				else
				if(Attr("font-size"))
					FontSize();
				else
				if(Attr("font-style"))
					FontStyle();
				else
				if(Attr("font"))
					while(!p.IsChar(';') && !p.IsEof()) {
						if(!FontVariant() && !FontSize() && !FontStyle() && !FontFamily() && !FontWeight()) { // order matters!
							if(p.Char('/'))
								LineHeight();
							else
								p.Skip();
						}
					}
				else
				if(Attr("color"))
					ReadColor(fmt.ink);
				else
				if(Attr("background-color"))
					ReadColor(fmt.paper);
				else
				if(Attr("list-style-type") || Attr("list-style")) {
					if(Attr("disc"))
						fmt.bullet_style = RichPara::BULLET_ROUND;
					else
					if(Attr("circle"))
						fmt.bullet_style = RichPara::BULLET_ROUNDWHITE;
					else
					if(Attr("square"))
						fmt.bullet_style = RichPara::BULLET_BOX;
				}
				else
				if(Attr("display")) {
					if(Attr("list-item")) // TODO: Text, decimal...
						fmt.bullet = fmt.bullet_style;
					else
						fmt.bullet = RichPara::BULLET_NONE;
				}
				else
				if(Attr("vertical-align")) {
					if(Attr("sub"))
						fmt.sscript = 2;
					else
					if(Attr("super"))
						fmt.sscript = 1;
					else
						fmt.sscript = 0;
				}
				else
				if(Attr("white-space"))
					fmt.pre = Attr("pre");
			}
			catch(CParser::Error) {}
			// Skip whatever remains of this declaration, including the ';'.
			while(!p.Char(';') && !p.IsEof())
				p.Skip();
		}
	}
	catch(CParser::Error) {}
}

// Finishes the current paragraph: converts the box properties of `fmt` into
// RichPara before/after/lm/rm, appends the paragraph to `richtext` and resets
// the per-paragraph state.
void TrivialHtmlParser::AddPara(const Style& fmt)
{
//	para.Dump();
	para.format = fmt;
	para.format.before = max(fmt.margin.top - prev_margin, parent_tag_margin) + fmt.padding.top;
	para.format.after = fmt.margin.bottom + fmt.padding.bottom;
	para.format.lm = fmt.margin.left + fmt.padding.left;
	para.format.rm = fmt.margin.right + fmt.padding.right;
	prev_margin = fmt.margin.bottom;
	parent_tag_margin = 0;
	if(fmt.bullet)
		para.format.indent = fmt.GetHeight();
	LLOG("AddPara " << AsCString(para.GetText().ToString()) << ", before: " << para.format.before << ", after: " << para.format.after);
	richtext.Cat(para);
	para.part.Clear();
	wastext = false;
}

// Consumes content up to the end of the current element (or end of input),
// using `tfmt` as the inherited style, and recurses into child tags.
void TrivialHtmlParser::Parse(Style tfmt)
{
	// Line break: emit the current paragraph with no vertical margin or padding.
	auto Br = [&] {
		Style fmt = tfmt;
		fmt.margin.top = fmt.margin.bottom = fmt.padding.top = fmt.padding.bottom = 0;
		AddPara(fmt);
	};

	while(!p.IsEnd() && !p.IsEof()) {
		if(p.IsText()) {
			String text = p.ReadText();
			if(tfmt.pre) {
				// Preformatted: keep spaces and tabs, start a new paragraph at each '\n'.
				String h;
				for(const char *s = text; *s; s++) {
					if((byte)*s >= ' ' || *s == '\t')
						h.Cat(*s);
					if(*s == '\n') {
						para.Cat(h.ToWString(), tfmt);
						Br();
						h.Clear();
						wastext = true;
					}
				}
				para.Cat(h.ToWString(), tfmt);
			}
			else {
				WString t;
				if(text.GetCount()) {
					WString h = text.ToWString();
					for(const wchar *s = h; *s;) // ignore more than 1 space between words
						if(*s == ' ') {
							s++;
							while(*s == ' ')
								s++;
							t.Cat(' ');
						}
						else
						if(*s >= ' ') { // ignore other whitespaces
							t.Cat(*s++);
							wastext = true;
						}
						else
							s++;
				}
				if(t.GetCount()) {
					LLOG("Text: " << AsCString(t.ToString()));
					para.Cat(t, tfmt);
				}
			}
		}
		else
		if(p.IsTag()) {
			String tag = ToLower(p.ReadTag());
			if(tag == "br") {
				Br();
				p.End();
			}
			else
			if(tag == "hr") {
				if(wastext)
					AddPara(tfmt);
				para.part.Clear();
				Style fmt = tfmt;
				ReadStyle(fmt, div.Get("hr", ""));
				ReadStyle(fmt, p["style"]);
				// Emitted as two empty paragraphs; the second one carries the ruler.
				int after = fmt.after;
				fmt.Face(Font::SERIF);
				fmt.Height(max(fmt.before, 1));
				fmt.before = fmt.after = 0;
				AddPara(fmt);
				fmt.ruler = fmt.border;
				fmt.Height(max(after, 1));
				AddPara(fmt);
				p.End();
			}
			else {
				LLOG(tag << LOG_BEGIN);
				Style fmt = tfmt;
				int divi = div.Find(tag);
				int parai = -1; // only meaningful for block tags (divi >= 0)
				if(divi >= 0) {
					if(wastext)
						AddPara(tfmt);
					else
						para.part.Clear();
					ReadStyle(fmt, div[divi]);
					parai = richtext.GetPartCount();
					parent_tag_margin = max(parent_tag_margin, fmt.margin.bottom); // margin (de)collapsing
				}
				if(tag == "a")
					fmt.link = p["href"];
				int spani = span.Find(tag);
				if(spani >= 0)
					ReadStyle(fmt, span[spani]);
				ReadStyle(fmt, p["style"]);
				Parse(fmt);
				p.End();
				// Close a block tag if it has pending text or produced no paragraph at all.
				if(divi >= 0 && (wastext || parai == richtext.GetPartCount()))
					AddPara(fmt);
				LLOG("/" << tag << ", wastext: " << wastext << ", waspara: " << (parai == richtext.GetPartCount()) << LOG_END);
			}
		}
		else
			p.Skip();
	}
}

// Converts `html` to RichText, using `base_font` as the initial character
// format. An XmlError raised while parsing is swallowed; whatever was
// appended to the result before the error is returned.
RichText ParseTrivialHtml(const char *html, Font base_font)
{
	TrivialHtmlParser p(html);
	TrivialHtmlParser::Style fmt;
	(Font&)fmt = base_font;
	p.base_font = base_font;
	try {
		p.Parse(fmt);
		p.AddPara(fmt);
	}
	catch(XmlError) {}
	return pick(p.richtext);
}

}