web engine - experimental web browser

Merge branch 'html5-tokenizer': HTML5 tokenizer state machine

+4229 -4
+2162
crates/html/src/entities.rs
··· 1 + //! HTML named character reference table. 2 + //! 3 + //! Complete list of 2125 named character references from the WHATWG HTML spec. 4 + //! Uses a sorted array with binary search for O(log n) lookup. 5 + 6 + /// All 2125 named character references, sorted for binary search. 7 + static ENTITIES: &[(&str, &str)] = &[ 8 + ("AElig", "\u{00C6}"), 9 + ("AMP", "&"), 10 + ("Aacute", "\u{00C1}"), 11 + ("Abreve", "\u{0102}"), 12 + ("Acirc", "\u{00C2}"), 13 + ("Acy", "\u{0410}"), 14 + ("Afr", "\u{1D504}"), 15 + ("Agrave", "\u{00C0}"), 16 + ("Alpha", "\u{0391}"), 17 + ("Amacr", "\u{0100}"), 18 + ("And", "\u{2A53}"), 19 + ("Aogon", "\u{0104}"), 20 + ("Aopf", "\u{1D538}"), 21 + ("ApplyFunction", "\u{2061}"), 22 + ("Aring", "\u{00C5}"), 23 + ("Ascr", "\u{1D49C}"), 24 + ("Assign", "\u{2254}"), 25 + ("Atilde", "\u{00C3}"), 26 + ("Auml", "\u{00C4}"), 27 + ("Backslash", "\u{2216}"), 28 + ("Barv", "\u{2AE7}"), 29 + ("Barwed", "\u{2306}"), 30 + ("Bcy", "\u{0411}"), 31 + ("Because", "\u{2235}"), 32 + ("Bernoullis", "\u{212C}"), 33 + ("Beta", "\u{0392}"), 34 + ("Bfr", "\u{1D505}"), 35 + ("Bopf", "\u{1D539}"), 36 + ("Breve", "\u{02D8}"), 37 + ("Bscr", "\u{212C}"), 38 + ("Bumpeq", "\u{224E}"), 39 + ("CHcy", "\u{0427}"), 40 + ("COPY", "\u{00A9}"), 41 + ("Cacute", "\u{0106}"), 42 + ("Cap", "\u{22D2}"), 43 + ("CapitalDifferentialD", "\u{2145}"), 44 + ("Cayleys", "\u{212D}"), 45 + ("Ccaron", "\u{010C}"), 46 + ("Ccedil", "\u{00C7}"), 47 + ("Ccirc", "\u{0108}"), 48 + ("Cconint", "\u{2230}"), 49 + ("Cdot", "\u{010A}"), 50 + ("Cedilla", "\u{00B8}"), 51 + ("CenterDot", "\u{00B7}"), 52 + ("Cfr", "\u{212D}"), 53 + ("Chi", "\u{03A7}"), 54 + ("CircleDot", "\u{2299}"), 55 + ("CircleMinus", "\u{2296}"), 56 + ("CirclePlus", "\u{2295}"), 57 + ("CircleTimes", "\u{2297}"), 58 + ("ClockwiseContourIntegral", "\u{2232}"), 59 + ("CloseCurlyDoubleQuote", "\u{201D}"), 60 + ("CloseCurlyQuote", "\u{2019}"), 61 + ("Colon", "\u{2237}"), 62 + ("Colone", "\u{2A74}"), 63 + ("Congruent", "\u{2261}"), 64 + 
("Conint", "\u{222F}"), 65 + ("ContourIntegral", "\u{222E}"), 66 + ("Copf", "\u{2102}"), 67 + ("Coproduct", "\u{2210}"), 68 + ("CounterClockwiseContourIntegral", "\u{2233}"), 69 + ("Cross", "\u{2A2F}"), 70 + ("Cscr", "\u{1D49E}"), 71 + ("Cup", "\u{22D3}"), 72 + ("CupCap", "\u{224D}"), 73 + ("DD", "\u{2145}"), 74 + ("DDotrahd", "\u{2911}"), 75 + ("DJcy", "\u{0402}"), 76 + ("DScy", "\u{0405}"), 77 + ("DZcy", "\u{040F}"), 78 + ("Dagger", "\u{2021}"), 79 + ("Darr", "\u{21A1}"), 80 + ("Dashv", "\u{2AE4}"), 81 + ("Dcaron", "\u{010E}"), 82 + ("Dcy", "\u{0414}"), 83 + ("Del", "\u{2207}"), 84 + ("Delta", "\u{0394}"), 85 + ("Dfr", "\u{1D507}"), 86 + ("DiacriticalAcute", "\u{00B4}"), 87 + ("DiacriticalDot", "\u{02D9}"), 88 + ("DiacriticalDoubleAcute", "\u{02DD}"), 89 + ("DiacriticalGrave", "`"), 90 + ("DiacriticalTilde", "\u{02DC}"), 91 + ("Diamond", "\u{22C4}"), 92 + ("DifferentialD", "\u{2146}"), 93 + ("Dopf", "\u{1D53B}"), 94 + ("Dot", "\u{00A8}"), 95 + ("DotDot", "\u{20DC}"), 96 + ("DotEqual", "\u{2250}"), 97 + ("DoubleContourIntegral", "\u{222F}"), 98 + ("DoubleDot", "\u{00A8}"), 99 + ("DoubleDownArrow", "\u{21D3}"), 100 + ("DoubleLeftArrow", "\u{21D0}"), 101 + ("DoubleLeftRightArrow", "\u{21D4}"), 102 + ("DoubleLeftTee", "\u{2AE4}"), 103 + ("DoubleLongLeftArrow", "\u{27F8}"), 104 + ("DoubleLongLeftRightArrow", "\u{27FA}"), 105 + ("DoubleLongRightArrow", "\u{27F9}"), 106 + ("DoubleRightArrow", "\u{21D2}"), 107 + ("DoubleRightTee", "\u{22A8}"), 108 + ("DoubleUpArrow", "\u{21D1}"), 109 + ("DoubleUpDownArrow", "\u{21D5}"), 110 + ("DoubleVerticalBar", "\u{2225}"), 111 + ("DownArrow", "\u{2193}"), 112 + ("DownArrowBar", "\u{2913}"), 113 + ("DownArrowUpArrow", "\u{21F5}"), 114 + ("DownBreve", "\u{0311}"), 115 + ("DownLeftRightVector", "\u{2950}"), 116 + ("DownLeftTeeVector", "\u{295E}"), 117 + ("DownLeftVector", "\u{21BD}"), 118 + ("DownLeftVectorBar", "\u{2956}"), 119 + ("DownRightTeeVector", "\u{295F}"), 120 + ("DownRightVector", "\u{21C1}"), 121 + ("DownRightVectorBar", 
"\u{2957}"), 122 + ("DownTee", "\u{22A4}"), 123 + ("DownTeeArrow", "\u{21A7}"), 124 + ("Downarrow", "\u{21D3}"), 125 + ("Dscr", "\u{1D49F}"), 126 + ("Dstrok", "\u{0110}"), 127 + ("ENG", "\u{014A}"), 128 + ("ETH", "\u{00D0}"), 129 + ("Eacute", "\u{00C9}"), 130 + ("Ecaron", "\u{011A}"), 131 + ("Ecirc", "\u{00CA}"), 132 + ("Ecy", "\u{042D}"), 133 + ("Edot", "\u{0116}"), 134 + ("Efr", "\u{1D508}"), 135 + ("Egrave", "\u{00C8}"), 136 + ("Element", "\u{2208}"), 137 + ("Emacr", "\u{0112}"), 138 + ("EmptySmallSquare", "\u{25FB}"), 139 + ("EmptyVerySmallSquare", "\u{25AB}"), 140 + ("Eogon", "\u{0118}"), 141 + ("Eopf", "\u{1D53C}"), 142 + ("Epsilon", "\u{0395}"), 143 + ("Equal", "\u{2A75}"), 144 + ("EqualTilde", "\u{2242}"), 145 + ("Equilibrium", "\u{21CC}"), 146 + ("Escr", "\u{2130}"), 147 + ("Esim", "\u{2A73}"), 148 + ("Eta", "\u{0397}"), 149 + ("Euml", "\u{00CB}"), 150 + ("Exists", "\u{2203}"), 151 + ("ExponentialE", "\u{2147}"), 152 + ("Fcy", "\u{0424}"), 153 + ("Ffr", "\u{1D509}"), 154 + ("FilledSmallSquare", "\u{25FC}"), 155 + ("FilledVerySmallSquare", "\u{25AA}"), 156 + ("Fopf", "\u{1D53D}"), 157 + ("ForAll", "\u{2200}"), 158 + ("Fouriertrf", "\u{2131}"), 159 + ("Fscr", "\u{2131}"), 160 + ("GJcy", "\u{0403}"), 161 + ("GT", ">"), 162 + ("Gamma", "\u{0393}"), 163 + ("Gammad", "\u{03DC}"), 164 + ("Gbreve", "\u{011E}"), 165 + ("Gcedil", "\u{0122}"), 166 + ("Gcirc", "\u{011C}"), 167 + ("Gcy", "\u{0413}"), 168 + ("Gdot", "\u{0120}"), 169 + ("Gfr", "\u{1D50A}"), 170 + ("Gg", "\u{22D9}"), 171 + ("Gopf", "\u{1D53E}"), 172 + ("GreaterEqual", "\u{2265}"), 173 + ("GreaterEqualLess", "\u{22DB}"), 174 + ("GreaterFullEqual", "\u{2267}"), 175 + ("GreaterGreater", "\u{2AA2}"), 176 + ("GreaterLess", "\u{2277}"), 177 + ("GreaterSlantEqual", "\u{2A7E}"), 178 + ("GreaterTilde", "\u{2273}"), 179 + ("Gscr", "\u{1D4A2}"), 180 + ("Gt", "\u{226B}"), 181 + ("HARDcy", "\u{042A}"), 182 + ("Hacek", "\u{02C7}"), 183 + ("Hat", "^"), 184 + ("Hcirc", "\u{0124}"), 185 + ("Hfr", "\u{210C}"), 186 + 
("HilbertSpace", "\u{210B}"), 187 + ("Hopf", "\u{210D}"), 188 + ("HorizontalLine", "\u{2500}"), 189 + ("Hscr", "\u{210B}"), 190 + ("Hstrok", "\u{0126}"), 191 + ("HumpDownHump", "\u{224E}"), 192 + ("HumpEqual", "\u{224F}"), 193 + ("IEcy", "\u{0415}"), 194 + ("IJlig", "\u{0132}"), 195 + ("IOcy", "\u{0401}"), 196 + ("Iacute", "\u{00CD}"), 197 + ("Icirc", "\u{00CE}"), 198 + ("Icy", "\u{0418}"), 199 + ("Idot", "\u{0130}"), 200 + ("Ifr", "\u{2111}"), 201 + ("Igrave", "\u{00CC}"), 202 + ("Im", "\u{2111}"), 203 + ("Imacr", "\u{012A}"), 204 + ("ImaginaryI", "\u{2148}"), 205 + ("Implies", "\u{21D2}"), 206 + ("Int", "\u{222C}"), 207 + ("Integral", "\u{222B}"), 208 + ("Intersection", "\u{22C2}"), 209 + ("InvisibleComma", "\u{2063}"), 210 + ("InvisibleTimes", "\u{2062}"), 211 + ("Iogon", "\u{012E}"), 212 + ("Iopf", "\u{1D540}"), 213 + ("Iota", "\u{0399}"), 214 + ("Iscr", "\u{2110}"), 215 + ("Itilde", "\u{0128}"), 216 + ("Iukcy", "\u{0406}"), 217 + ("Iuml", "\u{00CF}"), 218 + ("Jcirc", "\u{0134}"), 219 + ("Jcy", "\u{0419}"), 220 + ("Jfr", "\u{1D50D}"), 221 + ("Jopf", "\u{1D541}"), 222 + ("Jscr", "\u{1D4A5}"), 223 + ("Jsercy", "\u{0408}"), 224 + ("Jukcy", "\u{0404}"), 225 + ("KHcy", "\u{0425}"), 226 + ("KJcy", "\u{040C}"), 227 + ("Kappa", "\u{039A}"), 228 + ("Kcedil", "\u{0136}"), 229 + ("Kcy", "\u{041A}"), 230 + ("Kfr", "\u{1D50E}"), 231 + ("Kopf", "\u{1D542}"), 232 + ("Kscr", "\u{1D4A6}"), 233 + ("LJcy", "\u{0409}"), 234 + ("LT", "<"), 235 + ("Lacute", "\u{0139}"), 236 + ("Lambda", "\u{039B}"), 237 + ("Lang", "\u{27EA}"), 238 + ("Laplacetrf", "\u{2112}"), 239 + ("Larr", "\u{219E}"), 240 + ("Lcaron", "\u{013D}"), 241 + ("Lcedil", "\u{013B}"), 242 + ("Lcy", "\u{041B}"), 243 + ("LeftAngleBracket", "\u{27E8}"), 244 + ("LeftArrow", "\u{2190}"), 245 + ("LeftArrowBar", "\u{21E4}"), 246 + ("LeftArrowRightArrow", "\u{21C6}"), 247 + ("LeftCeiling", "\u{2308}"), 248 + ("LeftDoubleBracket", "\u{27E6}"), 249 + ("LeftDownTeeVector", "\u{2961}"), 250 + ("LeftDownVector", "\u{21C3}"), 251 + 
("LeftDownVectorBar", "\u{2959}"), 252 + ("LeftFloor", "\u{230A}"), 253 + ("LeftRightArrow", "\u{2194}"), 254 + ("LeftRightVector", "\u{294E}"), 255 + ("LeftTee", "\u{22A3}"), 256 + ("LeftTeeArrow", "\u{21A4}"), 257 + ("LeftTeeVector", "\u{295A}"), 258 + ("LeftTriangle", "\u{22B2}"), 259 + ("LeftTriangleBar", "\u{29CF}"), 260 + ("LeftTriangleEqual", "\u{22B4}"), 261 + ("LeftUpDownVector", "\u{2951}"), 262 + ("LeftUpTeeVector", "\u{2960}"), 263 + ("LeftUpVector", "\u{21BF}"), 264 + ("LeftUpVectorBar", "\u{2958}"), 265 + ("LeftVector", "\u{21BC}"), 266 + ("LeftVectorBar", "\u{2952}"), 267 + ("Leftarrow", "\u{21D0}"), 268 + ("Leftrightarrow", "\u{21D4}"), 269 + ("LessEqualGreater", "\u{22DA}"), 270 + ("LessFullEqual", "\u{2266}"), 271 + ("LessGreater", "\u{2276}"), 272 + ("LessLess", "\u{2AA1}"), 273 + ("LessSlantEqual", "\u{2A7D}"), 274 + ("LessTilde", "\u{2272}"), 275 + ("Lfr", "\u{1D50F}"), 276 + ("Ll", "\u{22D8}"), 277 + ("Lleftarrow", "\u{21DA}"), 278 + ("Lmidot", "\u{013F}"), 279 + ("LongLeftArrow", "\u{27F5}"), 280 + ("LongLeftRightArrow", "\u{27F7}"), 281 + ("LongRightArrow", "\u{27F6}"), 282 + ("Longleftarrow", "\u{27F8}"), 283 + ("Longleftrightarrow", "\u{27FA}"), 284 + ("Longrightarrow", "\u{27F9}"), 285 + ("Lopf", "\u{1D543}"), 286 + ("LowerLeftArrow", "\u{2199}"), 287 + ("LowerRightArrow", "\u{2198}"), 288 + ("Lscr", "\u{2112}"), 289 + ("Lsh", "\u{21B0}"), 290 + ("Lstrok", "\u{0141}"), 291 + ("Lt", "\u{226A}"), 292 + ("Map", "\u{2905}"), 293 + ("Mcy", "\u{041C}"), 294 + ("MediumSpace", "\u{205F}"), 295 + ("Mellintrf", "\u{2133}"), 296 + ("Mfr", "\u{1D510}"), 297 + ("MinusPlus", "\u{2213}"), 298 + ("Mopf", "\u{1D544}"), 299 + ("Mscr", "\u{2133}"), 300 + ("Mu", "\u{039C}"), 301 + ("NJcy", "\u{040A}"), 302 + ("Nacute", "\u{0143}"), 303 + ("Ncaron", "\u{0147}"), 304 + ("Ncedil", "\u{0145}"), 305 + ("Ncy", "\u{041D}"), 306 + ("NegativeMediumSpace", "\u{200B}"), 307 + ("NegativeThickSpace", "\u{200B}"), 308 + ("NegativeThinSpace", "\u{200B}"), 309 + 
("NegativeVeryThinSpace", "\u{200B}"), 310 + ("NestedGreaterGreater", "\u{226B}"), 311 + ("NestedLessLess", "\u{226A}"), 312 + ("NewLine", "\u{000A}"), 313 + ("Nfr", "\u{1D511}"), 314 + ("NoBreak", "\u{2060}"), 315 + ("NonBreakingSpace", "\u{00A0}"), 316 + ("Nopf", "\u{2115}"), 317 + ("Not", "\u{2AEC}"), 318 + ("NotCongruent", "\u{2262}"), 319 + ("NotCupCap", "\u{226D}"), 320 + ("NotDoubleVerticalBar", "\u{2226}"), 321 + ("NotElement", "\u{2209}"), 322 + ("NotEqual", "\u{2260}"), 323 + ("NotEqualTilde", "\u{2242}\u{0338}"), 324 + ("NotExists", "\u{2204}"), 325 + ("NotGreater", "\u{226F}"), 326 + ("NotGreaterEqual", "\u{2271}"), 327 + ("NotGreaterFullEqual", "\u{2267}\u{0338}"), 328 + ("NotGreaterGreater", "\u{226B}\u{0338}"), 329 + ("NotGreaterLess", "\u{2279}"), 330 + ("NotGreaterSlantEqual", "\u{2A7E}\u{0338}"), 331 + ("NotGreaterTilde", "\u{2275}"), 332 + ("NotHumpDownHump", "\u{224E}\u{0338}"), 333 + ("NotHumpEqual", "\u{224F}\u{0338}"), 334 + ("NotLeftTriangle", "\u{22EA}"), 335 + ("NotLeftTriangleBar", "\u{29CF}\u{0338}"), 336 + ("NotLeftTriangleEqual", "\u{22EC}"), 337 + ("NotLess", "\u{226E}"), 338 + ("NotLessEqual", "\u{2270}"), 339 + ("NotLessGreater", "\u{2278}"), 340 + ("NotLessLess", "\u{226A}\u{0338}"), 341 + ("NotLessSlantEqual", "\u{2A7D}\u{0338}"), 342 + ("NotLessTilde", "\u{2274}"), 343 + ("NotNestedGreaterGreater", "\u{2AA2}\u{0338}"), 344 + ("NotNestedLessLess", "\u{2AA1}\u{0338}"), 345 + ("NotPrecedes", "\u{2280}"), 346 + ("NotPrecedesEqual", "\u{2AAF}\u{0338}"), 347 + ("NotPrecedesSlantEqual", "\u{22E0}"), 348 + ("NotReverseElement", "\u{220C}"), 349 + ("NotRightTriangle", "\u{22EB}"), 350 + ("NotRightTriangleBar", "\u{29D0}\u{0338}"), 351 + ("NotRightTriangleEqual", "\u{22ED}"), 352 + ("NotSquareSubset", "\u{228F}\u{0338}"), 353 + ("NotSquareSubsetEqual", "\u{22E2}"), 354 + ("NotSquareSuperset", "\u{2290}\u{0338}"), 355 + ("NotSquareSupersetEqual", "\u{22E3}"), 356 + ("NotSubset", "\u{2282}\u{20D2}"), 357 + ("NotSubsetEqual", "\u{2288}"), 358 
+ ("NotSucceeds", "\u{2281}"), 359 + ("NotSucceedsEqual", "\u{2AB0}\u{0338}"), 360 + ("NotSucceedsSlantEqual", "\u{22E1}"), 361 + ("NotSucceedsTilde", "\u{227F}\u{0338}"), 362 + ("NotSuperset", "\u{2283}\u{20D2}"), 363 + ("NotSupersetEqual", "\u{2289}"), 364 + ("NotTilde", "\u{2241}"), 365 + ("NotTildeEqual", "\u{2244}"), 366 + ("NotTildeFullEqual", "\u{2247}"), 367 + ("NotTildeTilde", "\u{2249}"), 368 + ("NotVerticalBar", "\u{2224}"), 369 + ("Nscr", "\u{1D4A9}"), 370 + ("Ntilde", "\u{00D1}"), 371 + ("Nu", "\u{039D}"), 372 + ("OElig", "\u{0152}"), 373 + ("Oacute", "\u{00D3}"), 374 + ("Ocirc", "\u{00D4}"), 375 + ("Ocy", "\u{041E}"), 376 + ("Odblac", "\u{0150}"), 377 + ("Ofr", "\u{1D512}"), 378 + ("Ograve", "\u{00D2}"), 379 + ("Omacr", "\u{014C}"), 380 + ("Omega", "\u{03A9}"), 381 + ("Omicron", "\u{039F}"), 382 + ("Oopf", "\u{1D546}"), 383 + ("OpenCurlyDoubleQuote", "\u{201C}"), 384 + ("OpenCurlyQuote", "\u{2018}"), 385 + ("Or", "\u{2A54}"), 386 + ("Oscr", "\u{1D4AA}"), 387 + ("Oslash", "\u{00D8}"), 388 + ("Otilde", "\u{00D5}"), 389 + ("Otimes", "\u{2A37}"), 390 + ("Ouml", "\u{00D6}"), 391 + ("OverBar", "\u{203E}"), 392 + ("OverBrace", "\u{23DE}"), 393 + ("OverBracket", "\u{23B4}"), 394 + ("OverParenthesis", "\u{23DC}"), 395 + ("PartialD", "\u{2202}"), 396 + ("Pcy", "\u{041F}"), 397 + ("Pfr", "\u{1D513}"), 398 + ("Phi", "\u{03A6}"), 399 + ("Pi", "\u{03A0}"), 400 + ("PlusMinus", "\u{00B1}"), 401 + ("Poincareplane", "\u{210C}"), 402 + ("Popf", "\u{2119}"), 403 + ("Pr", "\u{2ABB}"), 404 + ("Precedes", "\u{227A}"), 405 + ("PrecedesEqual", "\u{2AAF}"), 406 + ("PrecedesSlantEqual", "\u{227C}"), 407 + ("PrecedesTilde", "\u{227E}"), 408 + ("Prime", "\u{2033}"), 409 + ("Product", "\u{220F}"), 410 + ("Proportion", "\u{2237}"), 411 + ("Proportional", "\u{221D}"), 412 + ("Pscr", "\u{1D4AB}"), 413 + ("Psi", "\u{03A8}"), 414 + ("QUOT", "\u{0022}"), 415 + ("Qfr", "\u{1D514}"), 416 + ("Qopf", "\u{211A}"), 417 + ("Qscr", "\u{1D4AC}"), 418 + ("RBarr", "\u{2910}"), 419 + ("REG", 
"\u{00AE}"), 420 + ("Racute", "\u{0154}"), 421 + ("Rang", "\u{27EB}"), 422 + ("Rarr", "\u{21A0}"), 423 + ("Rarrtl", "\u{2916}"), 424 + ("Rcaron", "\u{0158}"), 425 + ("Rcedil", "\u{0156}"), 426 + ("Rcy", "\u{0420}"), 427 + ("Re", "\u{211C}"), 428 + ("ReverseElement", "\u{220B}"), 429 + ("ReverseEquilibrium", "\u{21CB}"), 430 + ("ReverseUpEquilibrium", "\u{296F}"), 431 + ("Rfr", "\u{211C}"), 432 + ("Rho", "\u{03A1}"), 433 + ("RightAngleBracket", "\u{27E9}"), 434 + ("RightArrow", "\u{2192}"), 435 + ("RightArrowBar", "\u{21E5}"), 436 + ("RightArrowLeftArrow", "\u{21C4}"), 437 + ("RightCeiling", "\u{2309}"), 438 + ("RightDoubleBracket", "\u{27E7}"), 439 + ("RightDownTeeVector", "\u{295D}"), 440 + ("RightDownVector", "\u{21C2}"), 441 + ("RightDownVectorBar", "\u{2955}"), 442 + ("RightFloor", "\u{230B}"), 443 + ("RightTee", "\u{22A2}"), 444 + ("RightTeeArrow", "\u{21A6}"), 445 + ("RightTeeVector", "\u{295B}"), 446 + ("RightTriangle", "\u{22B3}"), 447 + ("RightTriangleBar", "\u{29D0}"), 448 + ("RightTriangleEqual", "\u{22B5}"), 449 + ("RightUpDownVector", "\u{294F}"), 450 + ("RightUpTeeVector", "\u{295C}"), 451 + ("RightUpVector", "\u{21BE}"), 452 + ("RightUpVectorBar", "\u{2954}"), 453 + ("RightVector", "\u{21C0}"), 454 + ("RightVectorBar", "\u{2953}"), 455 + ("Rightarrow", "\u{21D2}"), 456 + ("Ropf", "\u{211D}"), 457 + ("RoundImplies", "\u{2970}"), 458 + ("Rrightarrow", "\u{21DB}"), 459 + ("Rscr", "\u{211B}"), 460 + ("Rsh", "\u{21B1}"), 461 + ("RuleDelayed", "\u{29F4}"), 462 + ("SHCHcy", "\u{0429}"), 463 + ("SHcy", "\u{0428}"), 464 + ("SOFTcy", "\u{042C}"), 465 + ("Sacute", "\u{015A}"), 466 + ("Sc", "\u{2ABC}"), 467 + ("Scaron", "\u{0160}"), 468 + ("Scedil", "\u{015E}"), 469 + ("Scirc", "\u{015C}"), 470 + ("Scy", "\u{0421}"), 471 + ("Sfr", "\u{1D516}"), 472 + ("ShortDownArrow", "\u{2193}"), 473 + ("ShortLeftArrow", "\u{2190}"), 474 + ("ShortRightArrow", "\u{2192}"), 475 + ("ShortUpArrow", "\u{2191}"), 476 + ("Sigma", "\u{03A3}"), 477 + ("SmallCircle", "\u{2218}"), 478 + 
("Sopf", "\u{1D54A}"), 479 + ("Sqrt", "\u{221A}"), 480 + ("Square", "\u{25A1}"), 481 + ("SquareIntersection", "\u{2293}"), 482 + ("SquareSubset", "\u{228F}"), 483 + ("SquareSubsetEqual", "\u{2291}"), 484 + ("SquareSuperset", "\u{2290}"), 485 + ("SquareSupersetEqual", "\u{2292}"), 486 + ("SquareUnion", "\u{2294}"), 487 + ("Sscr", "\u{1D4AE}"), 488 + ("Star", "\u{22C6}"), 489 + ("Sub", "\u{22D0}"), 490 + ("Subset", "\u{22D0}"), 491 + ("SubsetEqual", "\u{2286}"), 492 + ("Succeeds", "\u{227B}"), 493 + ("SucceedsEqual", "\u{2AB0}"), 494 + ("SucceedsSlantEqual", "\u{227D}"), 495 + ("SucceedsTilde", "\u{227F}"), 496 + ("SuchThat", "\u{220B}"), 497 + ("Sum", "\u{2211}"), 498 + ("Sup", "\u{22D1}"), 499 + ("Superset", "\u{2283}"), 500 + ("SupersetEqual", "\u{2287}"), 501 + ("Supset", "\u{22D1}"), 502 + ("THORN", "\u{00DE}"), 503 + ("TRADE", "\u{2122}"), 504 + ("TSHcy", "\u{040B}"), 505 + ("TScy", "\u{0426}"), 506 + ("Tab", "\u{0009}"), 507 + ("Tau", "\u{03A4}"), 508 + ("Tcaron", "\u{0164}"), 509 + ("Tcedil", "\u{0162}"), 510 + ("Tcy", "\u{0422}"), 511 + ("Tfr", "\u{1D517}"), 512 + ("Therefore", "\u{2234}"), 513 + ("Theta", "\u{0398}"), 514 + ("ThickSpace", "\u{205F}\u{200A}"), 515 + ("ThinSpace", "\u{2009}"), 516 + ("Tilde", "\u{223C}"), 517 + ("TildeEqual", "\u{2243}"), 518 + ("TildeFullEqual", "\u{2245}"), 519 + ("TildeTilde", "\u{2248}"), 520 + ("Topf", "\u{1D54B}"), 521 + ("TripleDot", "\u{20DB}"), 522 + ("Tscr", "\u{1D4AF}"), 523 + ("Tstrok", "\u{0166}"), 524 + ("Uacute", "\u{00DA}"), 525 + ("Uarr", "\u{219F}"), 526 + ("Uarrocir", "\u{2949}"), 527 + ("Ubrcy", "\u{040E}"), 528 + ("Ubreve", "\u{016C}"), 529 + ("Ucirc", "\u{00DB}"), 530 + ("Ucy", "\u{0423}"), 531 + ("Udblac", "\u{0170}"), 532 + ("Ufr", "\u{1D518}"), 533 + ("Ugrave", "\u{00D9}"), 534 + ("Umacr", "\u{016A}"), 535 + ("UnderBar", "_"), 536 + ("UnderBrace", "\u{23DF}"), 537 + ("UnderBracket", "\u{23B5}"), 538 + ("UnderParenthesis", "\u{23DD}"), 539 + ("Union", "\u{22C3}"), 540 + ("UnionPlus", "\u{228E}"), 541 + 
("Uogon", "\u{0172}"), 542 + ("Uopf", "\u{1D54C}"), 543 + ("UpArrow", "\u{2191}"), 544 + ("UpArrowBar", "\u{2912}"), 545 + ("UpArrowDownArrow", "\u{21C5}"), 546 + ("UpDownArrow", "\u{2195}"), 547 + ("UpEquilibrium", "\u{296E}"), 548 + ("UpTee", "\u{22A5}"), 549 + ("UpTeeArrow", "\u{21A5}"), 550 + ("Uparrow", "\u{21D1}"), 551 + ("Updownarrow", "\u{21D5}"), 552 + ("UpperLeftArrow", "\u{2196}"), 553 + ("UpperRightArrow", "\u{2197}"), 554 + ("Upsi", "\u{03D2}"), 555 + ("Upsilon", "\u{03A5}"), 556 + ("Uring", "\u{016E}"), 557 + ("Uscr", "\u{1D4B0}"), 558 + ("Utilde", "\u{0168}"), 559 + ("Uuml", "\u{00DC}"), 560 + ("VDash", "\u{22AB}"), 561 + ("Vbar", "\u{2AEB}"), 562 + ("Vcy", "\u{0412}"), 563 + ("Vdash", "\u{22A9}"), 564 + ("Vdashl", "\u{2AE6}"), 565 + ("Vee", "\u{22C1}"), 566 + ("Verbar", "\u{2016}"), 567 + ("Vert", "\u{2016}"), 568 + ("VerticalBar", "\u{2223}"), 569 + ("VerticalLine", "|"), 570 + ("VerticalSeparator", "\u{2758}"), 571 + ("VerticalTilde", "\u{2240}"), 572 + ("VeryThinSpace", "\u{200A}"), 573 + ("Vfr", "\u{1D519}"), 574 + ("Vopf", "\u{1D54D}"), 575 + ("Vscr", "\u{1D4B1}"), 576 + ("Vvdash", "\u{22AA}"), 577 + ("Wcirc", "\u{0174}"), 578 + ("Wedge", "\u{22C0}"), 579 + ("Wfr", "\u{1D51A}"), 580 + ("Wopf", "\u{1D54E}"), 581 + ("Wscr", "\u{1D4B2}"), 582 + ("Xfr", "\u{1D51B}"), 583 + ("Xi", "\u{039E}"), 584 + ("Xopf", "\u{1D54F}"), 585 + ("Xscr", "\u{1D4B3}"), 586 + ("YAcy", "\u{042F}"), 587 + ("YIcy", "\u{0407}"), 588 + ("YUcy", "\u{042E}"), 589 + ("Yacute", "\u{00DD}"), 590 + ("Ycirc", "\u{0176}"), 591 + ("Ycy", "\u{042B}"), 592 + ("Yfr", "\u{1D51C}"), 593 + ("Yopf", "\u{1D550}"), 594 + ("Yscr", "\u{1D4B4}"), 595 + ("Yuml", "\u{0178}"), 596 + ("ZHcy", "\u{0416}"), 597 + ("Zacute", "\u{0179}"), 598 + ("Zcaron", "\u{017D}"), 599 + ("Zcy", "\u{0417}"), 600 + ("Zdot", "\u{017B}"), 601 + ("ZeroWidthSpace", "\u{200B}"), 602 + ("Zeta", "\u{0396}"), 603 + ("Zfr", "\u{2128}"), 604 + ("Zopf", "\u{2124}"), 605 + ("Zscr", "\u{1D4B5}"), 606 + ("aacute", "\u{00E1}"), 607 
+ ("abreve", "\u{0103}"), 608 + ("ac", "\u{223E}"), 609 + ("acE", "\u{223E}\u{0333}"), 610 + ("acd", "\u{223F}"), 611 + ("acirc", "\u{00E2}"), 612 + ("acute", "\u{00B4}"), 613 + ("acy", "\u{0430}"), 614 + ("aelig", "\u{00E6}"), 615 + ("af", "\u{2061}"), 616 + ("afr", "\u{1D51E}"), 617 + ("agrave", "\u{00E0}"), 618 + ("alefsym", "\u{2135}"), 619 + ("aleph", "\u{2135}"), 620 + ("alpha", "\u{03B1}"), 621 + ("amacr", "\u{0101}"), 622 + ("amalg", "\u{2A3F}"), 623 + ("amp", "&"), 624 + ("and", "\u{2227}"), 625 + ("andand", "\u{2A55}"), 626 + ("andd", "\u{2A5C}"), 627 + ("andslope", "\u{2A58}"), 628 + ("andv", "\u{2A5A}"), 629 + ("ang", "\u{2220}"), 630 + ("ange", "\u{29A4}"), 631 + ("angle", "\u{2220}"), 632 + ("angmsd", "\u{2221}"), 633 + ("angmsdaa", "\u{29A8}"), 634 + ("angmsdab", "\u{29A9}"), 635 + ("angmsdac", "\u{29AA}"), 636 + ("angmsdad", "\u{29AB}"), 637 + ("angmsdae", "\u{29AC}"), 638 + ("angmsdaf", "\u{29AD}"), 639 + ("angmsdag", "\u{29AE}"), 640 + ("angmsdah", "\u{29AF}"), 641 + ("angrt", "\u{221F}"), 642 + ("angrtvb", "\u{22BE}"), 643 + ("angrtvbd", "\u{299D}"), 644 + ("angsph", "\u{2222}"), 645 + ("angst", "\u{00C5}"), 646 + ("angzarr", "\u{237C}"), 647 + ("aogon", "\u{0105}"), 648 + ("aopf", "\u{1D552}"), 649 + ("ap", "\u{2248}"), 650 + ("apE", "\u{2A70}"), 651 + ("apacir", "\u{2A6F}"), 652 + ("ape", "\u{224A}"), 653 + ("apid", "\u{224B}"), 654 + ("apos", "'"), 655 + ("approx", "\u{2248}"), 656 + ("approxeq", "\u{224A}"), 657 + ("aring", "\u{00E5}"), 658 + ("ascr", "\u{1D4B6}"), 659 + ("ast", "*"), 660 + ("asymp", "\u{2248}"), 661 + ("asympeq", "\u{224D}"), 662 + ("atilde", "\u{00E3}"), 663 + ("auml", "\u{00E4}"), 664 + ("awconint", "\u{2233}"), 665 + ("awint", "\u{2A11}"), 666 + ("bNot", "\u{2AED}"), 667 + ("backcong", "\u{224C}"), 668 + ("backepsilon", "\u{03F6}"), 669 + ("backprime", "\u{2035}"), 670 + ("backsim", "\u{223D}"), 671 + ("backsimeq", "\u{22CD}"), 672 + ("barvee", "\u{22BD}"), 673 + ("barwed", "\u{2305}"), 674 + ("barwedge", "\u{2305}"), 675 
+ ("bbrk", "\u{23B5}"), 676 + ("bbrktbrk", "\u{23B6}"), 677 + ("bcong", "\u{224C}"), 678 + ("bcy", "\u{0431}"), 679 + ("bdquo", "\u{201E}"), 680 + ("becaus", "\u{2235}"), 681 + ("because", "\u{2235}"), 682 + ("bemptyv", "\u{29B0}"), 683 + ("bepsi", "\u{03F6}"), 684 + ("bernou", "\u{212C}"), 685 + ("beta", "\u{03B2}"), 686 + ("beth", "\u{2136}"), 687 + ("between", "\u{226C}"), 688 + ("bfr", "\u{1D51F}"), 689 + ("bigcap", "\u{22C2}"), 690 + ("bigcirc", "\u{25EF}"), 691 + ("bigcup", "\u{22C3}"), 692 + ("bigodot", "\u{2A00}"), 693 + ("bigoplus", "\u{2A01}"), 694 + ("bigotimes", "\u{2A02}"), 695 + ("bigsqcup", "\u{2A06}"), 696 + ("bigstar", "\u{2605}"), 697 + ("bigtriangledown", "\u{25BD}"), 698 + ("bigtriangleup", "\u{25B3}"), 699 + ("biguplus", "\u{2A04}"), 700 + ("bigvee", "\u{22C1}"), 701 + ("bigwedge", "\u{22C0}"), 702 + ("bkarow", "\u{290D}"), 703 + ("blacklozenge", "\u{29EB}"), 704 + ("blacksquare", "\u{25AA}"), 705 + ("blacktriangle", "\u{25B4}"), 706 + ("blacktriangledown", "\u{25BE}"), 707 + ("blacktriangleleft", "\u{25C2}"), 708 + ("blacktriangleright", "\u{25B8}"), 709 + ("blank", "\u{2423}"), 710 + ("blk12", "\u{2592}"), 711 + ("blk14", "\u{2591}"), 712 + ("blk34", "\u{2593}"), 713 + ("block", "\u{2588}"), 714 + ("bne", "=\u{20E5}"), 715 + ("bnequiv", "\u{2261}\u{20E5}"), 716 + ("bnot", "\u{2310}"), 717 + ("bopf", "\u{1D553}"), 718 + ("bot", "\u{22A5}"), 719 + ("bottom", "\u{22A5}"), 720 + ("bowtie", "\u{22C8}"), 721 + ("boxDL", "\u{2557}"), 722 + ("boxDR", "\u{2554}"), 723 + ("boxDl", "\u{2556}"), 724 + ("boxDr", "\u{2553}"), 725 + ("boxH", "\u{2550}"), 726 + ("boxHD", "\u{2566}"), 727 + ("boxHU", "\u{2569}"), 728 + ("boxHd", "\u{2564}"), 729 + ("boxHu", "\u{2567}"), 730 + ("boxUL", "\u{255D}"), 731 + ("boxUR", "\u{255A}"), 732 + ("boxUl", "\u{255C}"), 733 + ("boxUr", "\u{2559}"), 734 + ("boxV", "\u{2551}"), 735 + ("boxVH", "\u{256C}"), 736 + ("boxVL", "\u{2563}"), 737 + ("boxVR", "\u{2560}"), 738 + ("boxVh", "\u{256B}"), 739 + ("boxVl", "\u{2562}"), 740 + 
("boxVr", "\u{255F}"), 741 + ("boxbox", "\u{29C9}"), 742 + ("boxdL", "\u{2555}"), 743 + ("boxdR", "\u{2552}"), 744 + ("boxdl", "\u{2510}"), 745 + ("boxdr", "\u{250C}"), 746 + ("boxh", "\u{2500}"), 747 + ("boxhD", "\u{2565}"), 748 + ("boxhU", "\u{2568}"), 749 + ("boxhd", "\u{252C}"), 750 + ("boxhu", "\u{2534}"), 751 + ("boxminus", "\u{229F}"), 752 + ("boxplus", "\u{229E}"), 753 + ("boxtimes", "\u{22A0}"), 754 + ("boxuL", "\u{255B}"), 755 + ("boxuR", "\u{2558}"), 756 + ("boxul", "\u{2518}"), 757 + ("boxur", "\u{2514}"), 758 + ("boxv", "\u{2502}"), 759 + ("boxvH", "\u{256A}"), 760 + ("boxvL", "\u{2561}"), 761 + ("boxvR", "\u{255E}"), 762 + ("boxvh", "\u{253C}"), 763 + ("boxvl", "\u{2524}"), 764 + ("boxvr", "\u{251C}"), 765 + ("bprime", "\u{2035}"), 766 + ("breve", "\u{02D8}"), 767 + ("brvbar", "\u{00A6}"), 768 + ("bscr", "\u{1D4B7}"), 769 + ("bsemi", "\u{204F}"), 770 + ("bsim", "\u{223D}"), 771 + ("bsime", "\u{22CD}"), 772 + ("bsol", "\u{005C}"), 773 + ("bsolb", "\u{29C5}"), 774 + ("bsolhsub", "\u{27C8}"), 775 + ("bull", "\u{2022}"), 776 + ("bullet", "\u{2022}"), 777 + ("bump", "\u{224E}"), 778 + ("bumpE", "\u{2AAE}"), 779 + ("bumpe", "\u{224F}"), 780 + ("bumpeq", "\u{224F}"), 781 + ("cacute", "\u{0107}"), 782 + ("cap", "\u{2229}"), 783 + ("capand", "\u{2A44}"), 784 + ("capbrcup", "\u{2A49}"), 785 + ("capcap", "\u{2A4B}"), 786 + ("capcup", "\u{2A47}"), 787 + ("capdot", "\u{2A40}"), 788 + ("caps", "\u{2229}\u{FE00}"), 789 + ("caret", "\u{2041}"), 790 + ("caron", "\u{02C7}"), 791 + ("ccaps", "\u{2A4D}"), 792 + ("ccaron", "\u{010D}"), 793 + ("ccedil", "\u{00E7}"), 794 + ("ccirc", "\u{0109}"), 795 + ("ccups", "\u{2A4C}"), 796 + ("ccupssm", "\u{2A50}"), 797 + ("cdot", "\u{010B}"), 798 + ("cedil", "\u{00B8}"), 799 + ("cemptyv", "\u{29B2}"), 800 + ("cent", "\u{00A2}"), 801 + ("centerdot", "\u{00B7}"), 802 + ("cfr", "\u{1D520}"), 803 + ("chcy", "\u{0447}"), 804 + ("check", "\u{2713}"), 805 + ("checkmark", "\u{2713}"), 806 + ("chi", "\u{03C7}"), 807 + ("cir", "\u{25CB}"), 808 
+ ("cirE", "\u{29C3}"), 809 + ("circ", "\u{02C6}"), 810 + ("circeq", "\u{2257}"), 811 + ("circlearrowleft", "\u{21BA}"), 812 + ("circlearrowright", "\u{21BB}"), 813 + ("circledR", "\u{00AE}"), 814 + ("circledS", "\u{24C8}"), 815 + ("circledast", "\u{229B}"), 816 + ("circledcirc", "\u{229A}"), 817 + ("circleddash", "\u{229D}"), 818 + ("cire", "\u{2257}"), 819 + ("cirfnint", "\u{2A10}"), 820 + ("cirmid", "\u{2AEF}"), 821 + ("cirscir", "\u{29C2}"), 822 + ("clubs", "\u{2663}"), 823 + ("clubsuit", "\u{2663}"), 824 + ("colon", ":"), 825 + ("colone", "\u{2254}"), 826 + ("coloneq", "\u{2254}"), 827 + ("comma", ","), 828 + ("commat", "@"), 829 + ("comp", "\u{2201}"), 830 + ("compfn", "\u{2218}"), 831 + ("complement", "\u{2201}"), 832 + ("complexes", "\u{2102}"), 833 + ("cong", "\u{2245}"), 834 + ("congdot", "\u{2A6D}"), 835 + ("conint", "\u{222E}"), 836 + ("copf", "\u{1D554}"), 837 + ("coprod", "\u{2210}"), 838 + ("copy", "\u{00A9}"), 839 + ("copysr", "\u{2117}"), 840 + ("crarr", "\u{21B5}"), 841 + ("cross", "\u{2717}"), 842 + ("cscr", "\u{1D4B8}"), 843 + ("csub", "\u{2ACF}"), 844 + ("csube", "\u{2AD1}"), 845 + ("csup", "\u{2AD0}"), 846 + ("csupe", "\u{2AD2}"), 847 + ("ctdot", "\u{22EF}"), 848 + ("cudarrl", "\u{2938}"), 849 + ("cudarrr", "\u{2935}"), 850 + ("cuepr", "\u{22DE}"), 851 + ("cuesc", "\u{22DF}"), 852 + ("cularr", "\u{21B6}"), 853 + ("cularrp", "\u{293D}"), 854 + ("cup", "\u{222A}"), 855 + ("cupbrcap", "\u{2A48}"), 856 + ("cupcap", "\u{2A46}"), 857 + ("cupcup", "\u{2A4A}"), 858 + ("cupdot", "\u{228D}"), 859 + ("cupor", "\u{2A45}"), 860 + ("cups", "\u{222A}\u{FE00}"), 861 + ("curarr", "\u{21B7}"), 862 + ("curarrm", "\u{293C}"), 863 + ("curlyeqprec", "\u{22DE}"), 864 + ("curlyeqsucc", "\u{22DF}"), 865 + ("curlyvee", "\u{22CE}"), 866 + ("curlywedge", "\u{22CF}"), 867 + ("curren", "\u{00A4}"), 868 + ("curvearrowleft", "\u{21B6}"), 869 + ("curvearrowright", "\u{21B7}"), 870 + ("cuvee", "\u{22CE}"), 871 + ("cuwed", "\u{22CF}"), 872 + ("cwconint", "\u{2232}"), 873 + 
("cwint", "\u{2231}"), 874 + ("cylcty", "\u{232D}"), 875 + ("dArr", "\u{21D3}"), 876 + ("dHar", "\u{2965}"), 877 + ("dagger", "\u{2020}"), 878 + ("daleth", "\u{2138}"), 879 + ("darr", "\u{2193}"), 880 + ("dash", "\u{2010}"), 881 + ("dashv", "\u{22A3}"), 882 + ("dbkarow", "\u{290F}"), 883 + ("dblac", "\u{02DD}"), 884 + ("dcaron", "\u{010F}"), 885 + ("dcy", "\u{0434}"), 886 + ("dd", "\u{2146}"), 887 + ("ddagger", "\u{2021}"), 888 + ("ddarr", "\u{21CA}"), 889 + ("ddotseq", "\u{2A77}"), 890 + ("deg", "\u{00B0}"), 891 + ("delta", "\u{03B4}"), 892 + ("demptyv", "\u{29B1}"), 893 + ("dfisht", "\u{297F}"), 894 + ("dfr", "\u{1D521}"), 895 + ("dharl", "\u{21C3}"), 896 + ("dharr", "\u{21C2}"), 897 + ("diam", "\u{22C4}"), 898 + ("diamond", "\u{22C4}"), 899 + ("diamondsuit", "\u{2666}"), 900 + ("diams", "\u{2666}"), 901 + ("die", "\u{00A8}"), 902 + ("digamma", "\u{03DD}"), 903 + ("disin", "\u{22F2}"), 904 + ("div", "\u{00F7}"), 905 + ("divide", "\u{00F7}"), 906 + ("divideontimes", "\u{22C7}"), 907 + ("divonx", "\u{22C7}"), 908 + ("djcy", "\u{0452}"), 909 + ("dlcorn", "\u{231E}"), 910 + ("dlcrop", "\u{230D}"), 911 + ("dollar", "$"), 912 + ("dopf", "\u{1D555}"), 913 + ("dot", "\u{02D9}"), 914 + ("doteq", "\u{2250}"), 915 + ("doteqdot", "\u{2251}"), 916 + ("dotminus", "\u{2238}"), 917 + ("dotplus", "\u{2214}"), 918 + ("dotsquare", "\u{22A1}"), 919 + ("doublebarwedge", "\u{2306}"), 920 + ("downarrow", "\u{2193}"), 921 + ("downdownarrows", "\u{21CA}"), 922 + ("downharpoonleft", "\u{21C3}"), 923 + ("downharpoonright", "\u{21C2}"), 924 + ("drbkarow", "\u{2910}"), 925 + ("drcorn", "\u{231F}"), 926 + ("drcrop", "\u{230C}"), 927 + ("dscr", "\u{1D4B9}"), 928 + ("dscy", "\u{0455}"), 929 + ("dsol", "\u{29F6}"), 930 + ("dstrok", "\u{0111}"), 931 + ("dtdot", "\u{22F1}"), 932 + ("dtri", "\u{25BF}"), 933 + ("dtrif", "\u{25BE}"), 934 + ("duarr", "\u{21F5}"), 935 + ("duhar", "\u{296F}"), 936 + ("dwangle", "\u{29A6}"), 937 + ("dzcy", "\u{045F}"), 938 + ("dzigrarr", "\u{27FF}"), 939 + ("eDDot", 
"\u{2A77}"), 940 + ("eDot", "\u{2251}"), 941 + ("eacute", "\u{00E9}"), 942 + ("easter", "\u{2A6E}"), 943 + ("ecaron", "\u{011B}"), 944 + ("ecir", "\u{2256}"), 945 + ("ecirc", "\u{00EA}"), 946 + ("ecolon", "\u{2255}"), 947 + ("ecy", "\u{044D}"), 948 + ("edot", "\u{0117}"), 949 + ("ee", "\u{2147}"), 950 + ("efDot", "\u{2252}"), 951 + ("efr", "\u{1D522}"), 952 + ("eg", "\u{2A9A}"), 953 + ("egrave", "\u{00E8}"), 954 + ("egs", "\u{2A96}"), 955 + ("egsdot", "\u{2A98}"), 956 + ("el", "\u{2A99}"), 957 + ("elinters", "\u{23E7}"), 958 + ("ell", "\u{2113}"), 959 + ("els", "\u{2A95}"), 960 + ("elsdot", "\u{2A97}"), 961 + ("emacr", "\u{0113}"), 962 + ("empty", "\u{2205}"), 963 + ("emptyset", "\u{2205}"), 964 + ("emptyv", "\u{2205}"), 965 + ("emsp", "\u{2003}"), 966 + ("emsp13", "\u{2004}"), 967 + ("emsp14", "\u{2005}"), 968 + ("eng", "\u{014B}"), 969 + ("ensp", "\u{2002}"), 970 + ("eogon", "\u{0119}"), 971 + ("eopf", "\u{1D556}"), 972 + ("epar", "\u{22D5}"), 973 + ("eparsl", "\u{29E3}"), 974 + ("eplus", "\u{2A71}"), 975 + ("epsi", "\u{03B5}"), 976 + ("epsilon", "\u{03B5}"), 977 + ("epsiv", "\u{03F5}"), 978 + ("eqcirc", "\u{2256}"), 979 + ("eqcolon", "\u{2255}"), 980 + ("eqsim", "\u{2242}"), 981 + ("eqslantgtr", "\u{2A96}"), 982 + ("eqslantless", "\u{2A95}"), 983 + ("equals", "="), 984 + ("equest", "\u{225F}"), 985 + ("equiv", "\u{2261}"), 986 + ("equivDD", "\u{2A78}"), 987 + ("eqvparsl", "\u{29E5}"), 988 + ("erDot", "\u{2253}"), 989 + ("erarr", "\u{2971}"), 990 + ("escr", "\u{212F}"), 991 + ("esdot", "\u{2250}"), 992 + ("esim", "\u{2242}"), 993 + ("eta", "\u{03B7}"), 994 + ("eth", "\u{00F0}"), 995 + ("euml", "\u{00EB}"), 996 + ("euro", "\u{20AC}"), 997 + ("excl", "!"), 998 + ("exist", "\u{2203}"), 999 + ("expectation", "\u{2130}"), 1000 + ("exponentiale", "\u{2147}"), 1001 + ("fallingdotseq", "\u{2252}"), 1002 + ("fcy", "\u{0444}"), 1003 + ("female", "\u{2640}"), 1004 + ("ffilig", "\u{FB03}"), 1005 + ("fflig", "\u{FB00}"), 1006 + ("ffllig", "\u{FB04}"), 1007 + ("ffr", 
"\u{1D523}"), 1008 + ("filig", "\u{FB01}"), 1009 + ("fjlig", "fj"), 1010 + ("flat", "\u{266D}"), 1011 + ("fllig", "\u{FB02}"), 1012 + ("fltns", "\u{25B1}"), 1013 + ("fnof", "\u{0192}"), 1014 + ("fopf", "\u{1D557}"), 1015 + ("forall", "\u{2200}"), 1016 + ("fork", "\u{22D4}"), 1017 + ("forkv", "\u{2AD9}"), 1018 + ("fpartint", "\u{2A0D}"), 1019 + ("frac12", "\u{00BD}"), 1020 + ("frac13", "\u{2153}"), 1021 + ("frac14", "\u{00BC}"), 1022 + ("frac15", "\u{2155}"), 1023 + ("frac16", "\u{2159}"), 1024 + ("frac18", "\u{215B}"), 1025 + ("frac23", "\u{2154}"), 1026 + ("frac25", "\u{2156}"), 1027 + ("frac34", "\u{00BE}"), 1028 + ("frac35", "\u{2157}"), 1029 + ("frac38", "\u{215C}"), 1030 + ("frac45", "\u{2158}"), 1031 + ("frac56", "\u{215A}"), 1032 + ("frac58", "\u{215D}"), 1033 + ("frac78", "\u{215E}"), 1034 + ("frasl", "\u{2044}"), 1035 + ("frown", "\u{2322}"), 1036 + ("fscr", "\u{1D4BB}"), 1037 + ("gE", "\u{2267}"), 1038 + ("gEl", "\u{2A8C}"), 1039 + ("gacute", "\u{01F5}"), 1040 + ("gamma", "\u{03B3}"), 1041 + ("gammad", "\u{03DD}"), 1042 + ("gap", "\u{2A86}"), 1043 + ("gbreve", "\u{011F}"), 1044 + ("gcirc", "\u{011D}"), 1045 + ("gcy", "\u{0433}"), 1046 + ("gdot", "\u{0121}"), 1047 + ("ge", "\u{2265}"), 1048 + ("gel", "\u{22DB}"), 1049 + ("geq", "\u{2265}"), 1050 + ("geqq", "\u{2267}"), 1051 + ("geqslant", "\u{2A7E}"), 1052 + ("ges", "\u{2A7E}"), 1053 + ("gescc", "\u{2AA9}"), 1054 + ("gesdot", "\u{2A80}"), 1055 + ("gesdoto", "\u{2A82}"), 1056 + ("gesdotol", "\u{2A84}"), 1057 + ("gesl", "\u{22DB}\u{FE00}"), 1058 + ("gesles", "\u{2A94}"), 1059 + ("gfr", "\u{1D524}"), 1060 + ("gg", "\u{226B}"), 1061 + ("ggg", "\u{22D9}"), 1062 + ("gimel", "\u{2137}"), 1063 + ("gjcy", "\u{0453}"), 1064 + ("gl", "\u{2277}"), 1065 + ("glE", "\u{2A92}"), 1066 + ("gla", "\u{2AA5}"), 1067 + ("glj", "\u{2AA4}"), 1068 + ("gnE", "\u{2269}"), 1069 + ("gnap", "\u{2A8A}"), 1070 + ("gnapprox", "\u{2A8A}"), 1071 + ("gne", "\u{2A88}"), 1072 + ("gneq", "\u{2A88}"), 1073 + ("gneqq", "\u{2269}"), 1074 + 
("gnsim", "\u{22E7}"), 1075 + ("gopf", "\u{1D558}"), 1076 + ("grave", "`"), 1077 + ("gscr", "\u{210A}"), 1078 + ("gsim", "\u{2273}"), 1079 + ("gsime", "\u{2A8E}"), 1080 + ("gsiml", "\u{2A90}"), 1081 + ("gt", ">"), 1082 + ("gtcc", "\u{2AA7}"), 1083 + ("gtcir", "\u{2A7A}"), 1084 + ("gtdot", "\u{22D7}"), 1085 + ("gtlPar", "\u{2995}"), 1086 + ("gtquest", "\u{2A7C}"), 1087 + ("gtrapprox", "\u{2A86}"), 1088 + ("gtrarr", "\u{2978}"), 1089 + ("gtrdot", "\u{22D7}"), 1090 + ("gtreqless", "\u{22DB}"), 1091 + ("gtreqqless", "\u{2A8C}"), 1092 + ("gtrless", "\u{2277}"), 1093 + ("gtrsim", "\u{2273}"), 1094 + ("gvertneqq", "\u{2269}\u{FE00}"), 1095 + ("gvnE", "\u{2269}\u{FE00}"), 1096 + ("hArr", "\u{21D4}"), 1097 + ("hairsp", "\u{200A}"), 1098 + ("half", "\u{00BD}"), 1099 + ("hamilt", "\u{210B}"), 1100 + ("hardcy", "\u{044A}"), 1101 + ("harr", "\u{2194}"), 1102 + ("harrcir", "\u{2948}"), 1103 + ("harrw", "\u{21AD}"), 1104 + ("hbar", "\u{210F}"), 1105 + ("hcirc", "\u{0125}"), 1106 + ("hearts", "\u{2665}"), 1107 + ("heartsuit", "\u{2665}"), 1108 + ("hellip", "\u{2026}"), 1109 + ("hercon", "\u{22B9}"), 1110 + ("hfr", "\u{1D525}"), 1111 + ("hksearow", "\u{2925}"), 1112 + ("hkswarow", "\u{2926}"), 1113 + ("hoarr", "\u{21FF}"), 1114 + ("homtht", "\u{223B}"), 1115 + ("hookleftarrow", "\u{21A9}"), 1116 + ("hookrightarrow", "\u{21AA}"), 1117 + ("hopf", "\u{1D559}"), 1118 + ("horbar", "\u{2015}"), 1119 + ("hscr", "\u{1D4BD}"), 1120 + ("hslash", "\u{210F}"), 1121 + ("hstrok", "\u{0127}"), 1122 + ("hybull", "\u{2043}"), 1123 + ("hyphen", "\u{2010}"), 1124 + ("iacute", "\u{00ED}"), 1125 + ("ic", "\u{2063}"), 1126 + ("icirc", "\u{00EE}"), 1127 + ("icy", "\u{0438}"), 1128 + ("iecy", "\u{0435}"), 1129 + ("iexcl", "\u{00A1}"), 1130 + ("iff", "\u{21D4}"), 1131 + ("ifr", "\u{1D526}"), 1132 + ("igrave", "\u{00EC}"), 1133 + ("ii", "\u{2148}"), 1134 + ("iiiint", "\u{2A0C}"), 1135 + ("iiint", "\u{222D}"), 1136 + ("iinfin", "\u{29DC}"), 1137 + ("iiota", "\u{2129}"), 1138 + ("ijlig", "\u{0133}"), 1139 + 
("imacr", "\u{012B}"), 1140 + ("image", "\u{2111}"), 1141 + ("imagline", "\u{2110}"), 1142 + ("imagpart", "\u{2111}"), 1143 + ("imath", "\u{0131}"), 1144 + ("imof", "\u{22B7}"), 1145 + ("imped", "\u{01B5}"), 1146 + ("in", "\u{2208}"), 1147 + ("incare", "\u{2105}"), 1148 + ("infin", "\u{221E}"), 1149 + ("infintie", "\u{29DD}"), 1150 + ("inodot", "\u{0131}"), 1151 + ("int", "\u{222B}"), 1152 + ("intcal", "\u{22BA}"), 1153 + ("integers", "\u{2124}"), 1154 + ("intercal", "\u{22BA}"), 1155 + ("intlarhk", "\u{2A17}"), 1156 + ("intprod", "\u{2A3C}"), 1157 + ("iocy", "\u{0451}"), 1158 + ("iogon", "\u{012F}"), 1159 + ("iopf", "\u{1D55A}"), 1160 + ("iota", "\u{03B9}"), 1161 + ("iprod", "\u{2A3C}"), 1162 + ("iquest", "\u{00BF}"), 1163 + ("iscr", "\u{1D4BE}"), 1164 + ("isin", "\u{2208}"), 1165 + ("isinE", "\u{22F9}"), 1166 + ("isindot", "\u{22F5}"), 1167 + ("isins", "\u{22F4}"), 1168 + ("isinsv", "\u{22F3}"), 1169 + ("isinv", "\u{2208}"), 1170 + ("it", "\u{2062}"), 1171 + ("itilde", "\u{0129}"), 1172 + ("iukcy", "\u{0456}"), 1173 + ("iuml", "\u{00EF}"), 1174 + ("jcirc", "\u{0135}"), 1175 + ("jcy", "\u{0439}"), 1176 + ("jfr", "\u{1D527}"), 1177 + ("jmath", "\u{0237}"), 1178 + ("jopf", "\u{1D55B}"), 1179 + ("jscr", "\u{1D4BF}"), 1180 + ("jsercy", "\u{0458}"), 1181 + ("jukcy", "\u{0454}"), 1182 + ("kappa", "\u{03BA}"), 1183 + ("kappav", "\u{03F0}"), 1184 + ("kcedil", "\u{0137}"), 1185 + ("kcy", "\u{043A}"), 1186 + ("kfr", "\u{1D528}"), 1187 + ("kgreen", "\u{0138}"), 1188 + ("khcy", "\u{0445}"), 1189 + ("kjcy", "\u{045C}"), 1190 + ("kopf", "\u{1D55C}"), 1191 + ("kscr", "\u{1D4C0}"), 1192 + ("lAarr", "\u{21DA}"), 1193 + ("lArr", "\u{21D0}"), 1194 + ("lAtail", "\u{291B}"), 1195 + ("lBarr", "\u{290E}"), 1196 + ("lE", "\u{2266}"), 1197 + ("lEg", "\u{2A8B}"), 1198 + ("lHar", "\u{2962}"), 1199 + ("lacute", "\u{013A}"), 1200 + ("laemptyv", "\u{29B4}"), 1201 + ("lagran", "\u{2112}"), 1202 + ("lambda", "\u{03BB}"), 1203 + ("lang", "\u{27E8}"), 1204 + ("langd", "\u{2991}"), 1205 + 
("langle", "\u{27E8}"), 1206 + ("lap", "\u{2A85}"), 1207 + ("laquo", "\u{00AB}"), 1208 + ("larr", "\u{2190}"), 1209 + ("larrb", "\u{21E4}"), 1210 + ("larrbfs", "\u{291F}"), 1211 + ("larrfs", "\u{291D}"), 1212 + ("larrhk", "\u{21A9}"), 1213 + ("larrlp", "\u{21AB}"), 1214 + ("larrpl", "\u{2939}"), 1215 + ("larrsim", "\u{2973}"), 1216 + ("larrtl", "\u{21A2}"), 1217 + ("lat", "\u{2AAB}"), 1218 + ("latail", "\u{2919}"), 1219 + ("late", "\u{2AAD}"), 1220 + ("lates", "\u{2AAD}\u{FE00}"), 1221 + ("lbarr", "\u{290C}"), 1222 + ("lbbrk", "\u{2772}"), 1223 + ("lbrace", "{"), 1224 + ("lbrack", "["), 1225 + ("lbrke", "\u{298B}"), 1226 + ("lbrksld", "\u{298F}"), 1227 + ("lbrkslu", "\u{298D}"), 1228 + ("lcaron", "\u{013E}"), 1229 + ("lcedil", "\u{013C}"), 1230 + ("lceil", "\u{2308}"), 1231 + ("lcub", "{"), 1232 + ("lcy", "\u{043B}"), 1233 + ("ldca", "\u{2936}"), 1234 + ("ldquo", "\u{201C}"), 1235 + ("ldquor", "\u{201E}"), 1236 + ("ldrdhar", "\u{2967}"), 1237 + ("ldrushar", "\u{294B}"), 1238 + ("ldsh", "\u{21B2}"), 1239 + ("le", "\u{2264}"), 1240 + ("leftarrow", "\u{2190}"), 1241 + ("leftarrowtail", "\u{21A2}"), 1242 + ("leftharpoondown", "\u{21BD}"), 1243 + ("leftharpoonup", "\u{21BC}"), 1244 + ("leftleftarrows", "\u{21C7}"), 1245 + ("leftrightarrow", "\u{2194}"), 1246 + ("leftrightarrows", "\u{21C6}"), 1247 + ("leftrightharpoons", "\u{21CB}"), 1248 + ("leftrightsquigarrow", "\u{21AD}"), 1249 + ("leftthreetimes", "\u{22CB}"), 1250 + ("leg", "\u{22DA}"), 1251 + ("leq", "\u{2264}"), 1252 + ("leqq", "\u{2266}"), 1253 + ("leqslant", "\u{2A7D}"), 1254 + ("les", "\u{2A7D}"), 1255 + ("lescc", "\u{2AA8}"), 1256 + ("lesdot", "\u{2A7F}"), 1257 + ("lesdoto", "\u{2A81}"), 1258 + ("lesdotor", "\u{2A83}"), 1259 + ("lesg", "\u{22DA}\u{FE00}"), 1260 + ("lesges", "\u{2A93}"), 1261 + ("lessapprox", "\u{2A85}"), 1262 + ("lessdot", "\u{22D6}"), 1263 + ("lesseqgtr", "\u{22DA}"), 1264 + ("lesseqqgtr", "\u{2A8B}"), 1265 + ("lessgtr", "\u{2276}"), 1266 + ("lesssim", "\u{2272}"), 1267 + ("lfisht", 
"\u{297C}"), 1268 + ("lfloor", "\u{230A}"), 1269 + ("lfr", "\u{1D529}"), 1270 + ("lg", "\u{2276}"), 1271 + ("lgE", "\u{2A91}"), 1272 + ("lhard", "\u{21BD}"), 1273 + ("lharu", "\u{21BC}"), 1274 + ("lharul", "\u{296A}"), 1275 + ("lhblk", "\u{2584}"), 1276 + ("ljcy", "\u{0459}"), 1277 + ("ll", "\u{226A}"), 1278 + ("llarr", "\u{21C7}"), 1279 + ("llcorner", "\u{231E}"), 1280 + ("llhard", "\u{296B}"), 1281 + ("lltri", "\u{25FA}"), 1282 + ("lmidot", "\u{0140}"), 1283 + ("lmoust", "\u{23B0}"), 1284 + ("lmoustache", "\u{23B0}"), 1285 + ("lnE", "\u{2268}"), 1286 + ("lnap", "\u{2A89}"), 1287 + ("lnapprox", "\u{2A89}"), 1288 + ("lne", "\u{2A87}"), 1289 + ("lneq", "\u{2A87}"), 1290 + ("lneqq", "\u{2268}"), 1291 + ("lnsim", "\u{22E6}"), 1292 + ("loang", "\u{27EC}"), 1293 + ("loarr", "\u{21FD}"), 1294 + ("lobrk", "\u{27E6}"), 1295 + ("longleftarrow", "\u{27F5}"), 1296 + ("longleftrightarrow", "\u{27F7}"), 1297 + ("longmapsto", "\u{27FC}"), 1298 + ("longrightarrow", "\u{27F6}"), 1299 + ("looparrowleft", "\u{21AB}"), 1300 + ("looparrowright", "\u{21AC}"), 1301 + ("lopar", "\u{2985}"), 1302 + ("lopf", "\u{1D55D}"), 1303 + ("loplus", "\u{2A2D}"), 1304 + ("lotimes", "\u{2A34}"), 1305 + ("lowast", "\u{2217}"), 1306 + ("lowbar", "_"), 1307 + ("loz", "\u{25CA}"), 1308 + ("lozenge", "\u{25CA}"), 1309 + ("lozf", "\u{29EB}"), 1310 + ("lpar", "("), 1311 + ("lparlt", "\u{2993}"), 1312 + ("lrarr", "\u{21C6}"), 1313 + ("lrcorner", "\u{231F}"), 1314 + ("lrhar", "\u{21CB}"), 1315 + ("lrhard", "\u{296D}"), 1316 + ("lrm", "\u{200E}"), 1317 + ("lrtri", "\u{22BF}"), 1318 + ("lsaquo", "\u{2039}"), 1319 + ("lscr", "\u{1D4C1}"), 1320 + ("lsh", "\u{21B0}"), 1321 + ("lsim", "\u{2272}"), 1322 + ("lsime", "\u{2A8D}"), 1323 + ("lsimg", "\u{2A8F}"), 1324 + ("lsqb", "["), 1325 + ("lsquo", "\u{2018}"), 1326 + ("lsquor", "\u{201A}"), 1327 + ("lstrok", "\u{0142}"), 1328 + ("lt", "<"), 1329 + ("ltcc", "\u{2AA6}"), 1330 + ("ltcir", "\u{2A79}"), 1331 + ("ltdot", "\u{22D6}"), 1332 + ("lthree", "\u{22CB}"), 1333 + 
("ltimes", "\u{22C9}"), 1334 + ("ltlarr", "\u{2976}"), 1335 + ("ltquest", "\u{2A7B}"), 1336 + ("ltrPar", "\u{2996}"), 1337 + ("ltri", "\u{25C3}"), 1338 + ("ltrie", "\u{22B4}"), 1339 + ("ltrif", "\u{25C2}"), 1340 + ("lurdshar", "\u{294A}"), 1341 + ("luruhar", "\u{2966}"), 1342 + ("lvertneqq", "\u{2268}\u{FE00}"), 1343 + ("lvnE", "\u{2268}\u{FE00}"), 1344 + ("mDDot", "\u{223A}"), 1345 + ("macr", "\u{00AF}"), 1346 + ("male", "\u{2642}"), 1347 + ("malt", "\u{2720}"), 1348 + ("maltese", "\u{2720}"), 1349 + ("map", "\u{21A6}"), 1350 + ("mapsto", "\u{21A6}"), 1351 + ("mapstodown", "\u{21A7}"), 1352 + ("mapstoleft", "\u{21A4}"), 1353 + ("mapstoup", "\u{21A5}"), 1354 + ("marker", "\u{25AE}"), 1355 + ("mcomma", "\u{2A29}"), 1356 + ("mcy", "\u{043C}"), 1357 + ("mdash", "\u{2014}"), 1358 + ("measuredangle", "\u{2221}"), 1359 + ("mfr", "\u{1D52A}"), 1360 + ("mho", "\u{2127}"), 1361 + ("micro", "\u{00B5}"), 1362 + ("mid", "\u{2223}"), 1363 + ("midast", "*"), 1364 + ("midcir", "\u{2AF0}"), 1365 + ("middot", "\u{00B7}"), 1366 + ("minus", "\u{2212}"), 1367 + ("minusb", "\u{229F}"), 1368 + ("minusd", "\u{2238}"), 1369 + ("minusdu", "\u{2A2A}"), 1370 + ("mlcp", "\u{2ADB}"), 1371 + ("mldr", "\u{2026}"), 1372 + ("mnplus", "\u{2213}"), 1373 + ("models", "\u{22A7}"), 1374 + ("mopf", "\u{1D55E}"), 1375 + ("mp", "\u{2213}"), 1376 + ("mscr", "\u{1D4C2}"), 1377 + ("mstpos", "\u{223E}"), 1378 + ("mu", "\u{03BC}"), 1379 + ("multimap", "\u{22B8}"), 1380 + ("mumap", "\u{22B8}"), 1381 + ("nGg", "\u{22D9}\u{0338}"), 1382 + ("nGt", "\u{226B}\u{20D2}"), 1383 + ("nGtv", "\u{226B}\u{0338}"), 1384 + ("nLeftarrow", "\u{21CD}"), 1385 + ("nLeftrightarrow", "\u{21CE}"), 1386 + ("nLl", "\u{22D8}\u{0338}"), 1387 + ("nLt", "\u{226A}\u{20D2}"), 1388 + ("nLtv", "\u{226A}\u{0338}"), 1389 + ("nRightarrow", "\u{21CF}"), 1390 + ("nVDash", "\u{22AF}"), 1391 + ("nVdash", "\u{22AE}"), 1392 + ("nabla", "\u{2207}"), 1393 + ("nacute", "\u{0144}"), 1394 + ("nang", "\u{2220}\u{20D2}"), 1395 + ("nap", "\u{2249}"), 1396 + 
("napE", "\u{2A70}\u{0338}"), 1397 + ("napid", "\u{224B}\u{0338}"), 1398 + ("napos", "\u{0149}"), 1399 + ("napprox", "\u{2249}"), 1400 + ("natur", "\u{266E}"), 1401 + ("natural", "\u{266E}"), 1402 + ("naturals", "\u{2115}"), 1403 + ("nbsp", "\u{00A0}"), 1404 + ("nbump", "\u{224E}\u{0338}"), 1405 + ("nbumpe", "\u{224F}\u{0338}"), 1406 + ("ncap", "\u{2A43}"), 1407 + ("ncaron", "\u{0148}"), 1408 + ("ncedil", "\u{0146}"), 1409 + ("ncong", "\u{2247}"), 1410 + ("ncongdot", "\u{2A6D}\u{0338}"), 1411 + ("ncup", "\u{2A42}"), 1412 + ("ncy", "\u{043D}"), 1413 + ("ndash", "\u{2013}"), 1414 + ("ne", "\u{2260}"), 1415 + ("neArr", "\u{21D7}"), 1416 + ("nearhk", "\u{2924}"), 1417 + ("nearr", "\u{2197}"), 1418 + ("nearrow", "\u{2197}"), 1419 + ("nedot", "\u{2250}\u{0338}"), 1420 + ("nequiv", "\u{2262}"), 1421 + ("nesear", "\u{2928}"), 1422 + ("nesim", "\u{2242}\u{0338}"), 1423 + ("nexist", "\u{2204}"), 1424 + ("nexists", "\u{2204}"), 1425 + ("nfr", "\u{1D52B}"), 1426 + ("ngE", "\u{2267}\u{0338}"), 1427 + ("nge", "\u{2271}"), 1428 + ("ngeq", "\u{2271}"), 1429 + ("ngeqq", "\u{2267}\u{0338}"), 1430 + ("ngeqslant", "\u{2A7E}\u{0338}"), 1431 + ("nges", "\u{2A7E}\u{0338}"), 1432 + ("ngsim", "\u{2275}"), 1433 + ("ngt", "\u{226F}"), 1434 + ("ngtr", "\u{226F}"), 1435 + ("nhArr", "\u{21CE}"), 1436 + ("nharr", "\u{21AE}"), 1437 + ("nhpar", "\u{2AF2}"), 1438 + ("ni", "\u{220B}"), 1439 + ("nis", "\u{22FC}"), 1440 + ("nisd", "\u{22FA}"), 1441 + ("niv", "\u{220B}"), 1442 + ("njcy", "\u{045A}"), 1443 + ("nlArr", "\u{21CD}"), 1444 + ("nlE", "\u{2266}\u{0338}"), 1445 + ("nlarr", "\u{219A}"), 1446 + ("nldr", "\u{2025}"), 1447 + ("nle", "\u{2270}"), 1448 + ("nleftarrow", "\u{219A}"), 1449 + ("nleftrightarrow", "\u{21AE}"), 1450 + ("nleq", "\u{2270}"), 1451 + ("nleqq", "\u{2266}\u{0338}"), 1452 + ("nleqslant", "\u{2A7D}\u{0338}"), 1453 + ("nles", "\u{2A7D}\u{0338}"), 1454 + ("nless", "\u{226E}"), 1455 + ("nlsim", "\u{2274}"), 1456 + ("nlt", "\u{226E}"), 1457 + ("nltri", "\u{22EA}"), 1458 + ("nltrie", 
"\u{22EC}"), 1459 + ("nmid", "\u{2224}"), 1460 + ("nopf", "\u{1D55F}"), 1461 + ("not", "\u{00AC}"), 1462 + ("notin", "\u{2209}"), 1463 + ("notinE", "\u{22F9}\u{0338}"), 1464 + ("notindot", "\u{22F5}\u{0338}"), 1465 + ("notinva", "\u{2209}"), 1466 + ("notinvb", "\u{22F7}"), 1467 + ("notinvc", "\u{22F6}"), 1468 + ("notni", "\u{220C}"), 1469 + ("notniva", "\u{220C}"), 1470 + ("notnivb", "\u{22FE}"), 1471 + ("notnivc", "\u{22FD}"), 1472 + ("npar", "\u{2226}"), 1473 + ("nparallel", "\u{2226}"), 1474 + ("nparsl", "\u{2AFD}\u{20E5}"), 1475 + ("npart", "\u{2202}\u{0338}"), 1476 + ("npolint", "\u{2A14}"), 1477 + ("npr", "\u{2280}"), 1478 + ("nprcue", "\u{22E0}"), 1479 + ("npre", "\u{2AAF}\u{0338}"), 1480 + ("nprec", "\u{2280}"), 1481 + ("npreceq", "\u{2AAF}\u{0338}"), 1482 + ("nrArr", "\u{21CF}"), 1483 + ("nrarr", "\u{219B}"), 1484 + ("nrarrc", "\u{2933}\u{0338}"), 1485 + ("nrarrw", "\u{219D}\u{0338}"), 1486 + ("nrightarrow", "\u{219B}"), 1487 + ("nrtri", "\u{22EB}"), 1488 + ("nrtrie", "\u{22ED}"), 1489 + ("nsc", "\u{2281}"), 1490 + ("nsccue", "\u{22E1}"), 1491 + ("nsce", "\u{2AB0}\u{0338}"), 1492 + ("nscr", "\u{1D4C3}"), 1493 + ("nshortmid", "\u{2224}"), 1494 + ("nshortparallel", "\u{2226}"), 1495 + ("nsim", "\u{2241}"), 1496 + ("nsime", "\u{2244}"), 1497 + ("nsimeq", "\u{2244}"), 1498 + ("nsmid", "\u{2224}"), 1499 + ("nspar", "\u{2226}"), 1500 + ("nsqsube", "\u{22E2}"), 1501 + ("nsqsupe", "\u{22E3}"), 1502 + ("nsub", "\u{2284}"), 1503 + ("nsubE", "\u{2AC5}\u{0338}"), 1504 + ("nsube", "\u{2288}"), 1505 + ("nsubset", "\u{2282}\u{20D2}"), 1506 + ("nsubseteq", "\u{2288}"), 1507 + ("nsubseteqq", "\u{2AC5}\u{0338}"), 1508 + ("nsucc", "\u{2281}"), 1509 + ("nsucceq", "\u{2AB0}\u{0338}"), 1510 + ("nsup", "\u{2285}"), 1511 + ("nsupE", "\u{2AC6}\u{0338}"), 1512 + ("nsupe", "\u{2289}"), 1513 + ("nsupset", "\u{2283}\u{20D2}"), 1514 + ("nsupseteq", "\u{2289}"), 1515 + ("nsupseteqq", "\u{2AC6}\u{0338}"), 1516 + ("ntgl", "\u{2279}"), 1517 + ("ntilde", "\u{00F1}"), 1518 + ("ntlg", 
"\u{2278}"), 1519 + ("ntriangleleft", "\u{22EA}"), 1520 + ("ntrianglelefteq", "\u{22EC}"), 1521 + ("ntriangleright", "\u{22EB}"), 1522 + ("ntrianglerighteq", "\u{22ED}"), 1523 + ("nu", "\u{03BD}"), 1524 + ("num", "#"), 1525 + ("numero", "\u{2116}"), 1526 + ("numsp", "\u{2007}"), 1527 + ("nvDash", "\u{22AD}"), 1528 + ("nvHarr", "\u{2904}"), 1529 + ("nvap", "\u{224D}\u{20D2}"), 1530 + ("nvdash", "\u{22AC}"), 1531 + ("nvge", "\u{2265}\u{20D2}"), 1532 + ("nvgt", ">\u{20D2}"), 1533 + ("nvinfin", "\u{29DE}"), 1534 + ("nvlArr", "\u{2902}"), 1535 + ("nvle", "\u{2264}\u{20D2}"), 1536 + ("nvlt", "<\u{20D2}"), 1537 + ("nvltrie", "\u{22B4}\u{20D2}"), 1538 + ("nvrArr", "\u{2903}"), 1539 + ("nvrtrie", "\u{22B5}\u{20D2}"), 1540 + ("nvsim", "\u{223C}\u{20D2}"), 1541 + ("nwArr", "\u{21D6}"), 1542 + ("nwarhk", "\u{2923}"), 1543 + ("nwarr", "\u{2196}"), 1544 + ("nwarrow", "\u{2196}"), 1545 + ("nwnear", "\u{2927}"), 1546 + ("oS", "\u{24C8}"), 1547 + ("oacute", "\u{00F3}"), 1548 + ("oast", "\u{229B}"), 1549 + ("ocir", "\u{229A}"), 1550 + ("ocirc", "\u{00F4}"), 1551 + ("ocy", "\u{043E}"), 1552 + ("odash", "\u{229D}"), 1553 + ("odblac", "\u{0151}"), 1554 + ("odiv", "\u{2A38}"), 1555 + ("odot", "\u{2299}"), 1556 + ("odsold", "\u{29BC}"), 1557 + ("oelig", "\u{0153}"), 1558 + ("ofcir", "\u{29BF}"), 1559 + ("ofr", "\u{1D52C}"), 1560 + ("ogon", "\u{02DB}"), 1561 + ("ograve", "\u{00F2}"), 1562 + ("ogt", "\u{29C1}"), 1563 + ("ohbar", "\u{29B5}"), 1564 + ("ohm", "\u{03A9}"), 1565 + ("oint", "\u{222E}"), 1566 + ("olarr", "\u{21BA}"), 1567 + ("olcir", "\u{29BE}"), 1568 + ("olcross", "\u{29BB}"), 1569 + ("oline", "\u{203E}"), 1570 + ("olt", "\u{29C0}"), 1571 + ("omacr", "\u{014D}"), 1572 + ("omega", "\u{03C9}"), 1573 + ("omicron", "\u{03BF}"), 1574 + ("omid", "\u{29B6}"), 1575 + ("ominus", "\u{2296}"), 1576 + ("oopf", "\u{1D560}"), 1577 + ("opar", "\u{29B7}"), 1578 + ("operp", "\u{29B9}"), 1579 + ("oplus", "\u{2295}"), 1580 + ("or", "\u{2228}"), 1581 + ("orarr", "\u{21BB}"), 1582 + ("ord", 
"\u{2A5D}"), 1583 + ("order", "\u{2134}"), 1584 + ("orderof", "\u{2134}"), 1585 + ("ordf", "\u{00AA}"), 1586 + ("ordm", "\u{00BA}"), 1587 + ("origof", "\u{22B6}"), 1588 + ("oror", "\u{2A56}"), 1589 + ("orslope", "\u{2A57}"), 1590 + ("orv", "\u{2A5B}"), 1591 + ("oscr", "\u{2134}"), 1592 + ("oslash", "\u{00F8}"), 1593 + ("osol", "\u{2298}"), 1594 + ("otilde", "\u{00F5}"), 1595 + ("otimes", "\u{2297}"), 1596 + ("otimesas", "\u{2A36}"), 1597 + ("ouml", "\u{00F6}"), 1598 + ("ovbar", "\u{233D}"), 1599 + ("par", "\u{2225}"), 1600 + ("para", "\u{00B6}"), 1601 + ("parallel", "\u{2225}"), 1602 + ("parsim", "\u{2AF3}"), 1603 + ("parsl", "\u{2AFD}"), 1604 + ("part", "\u{2202}"), 1605 + ("pcy", "\u{043F}"), 1606 + ("percnt", "%"), 1607 + ("period", "."), 1608 + ("permil", "\u{2030}"), 1609 + ("perp", "\u{22A5}"), 1610 + ("pertenk", "\u{2031}"), 1611 + ("pfr", "\u{1D52D}"), 1612 + ("phi", "\u{03C6}"), 1613 + ("phiv", "\u{03D5}"), 1614 + ("phmmat", "\u{2133}"), 1615 + ("phone", "\u{260E}"), 1616 + ("pi", "\u{03C0}"), 1617 + ("pitchfork", "\u{22D4}"), 1618 + ("piv", "\u{03D6}"), 1619 + ("planck", "\u{210F}"), 1620 + ("planckh", "\u{210E}"), 1621 + ("plankv", "\u{210F}"), 1622 + ("plus", "+"), 1623 + ("plusacir", "\u{2A23}"), 1624 + ("plusb", "\u{229E}"), 1625 + ("pluscir", "\u{2A22}"), 1626 + ("plusdo", "\u{2214}"), 1627 + ("plusdu", "\u{2A25}"), 1628 + ("pluse", "\u{2A72}"), 1629 + ("plusmn", "\u{00B1}"), 1630 + ("plussim", "\u{2A26}"), 1631 + ("plustwo", "\u{2A27}"), 1632 + ("pm", "\u{00B1}"), 1633 + ("pointint", "\u{2A15}"), 1634 + ("popf", "\u{1D561}"), 1635 + ("pound", "\u{00A3}"), 1636 + ("pr", "\u{227A}"), 1637 + ("prE", "\u{2AB3}"), 1638 + ("prap", "\u{2AB7}"), 1639 + ("prcue", "\u{227C}"), 1640 + ("pre", "\u{2AAF}"), 1641 + ("prec", "\u{227A}"), 1642 + ("precapprox", "\u{2AB7}"), 1643 + ("preccurlyeq", "\u{227C}"), 1644 + ("preceq", "\u{2AAF}"), 1645 + ("precnapprox", "\u{2AB9}"), 1646 + ("precneqq", "\u{2AB5}"), 1647 + ("precnsim", "\u{22E8}"), 1648 + ("precsim", 
"\u{227E}"), 1649 + ("prime", "\u{2032}"), 1650 + ("primes", "\u{2119}"), 1651 + ("prnE", "\u{2AB5}"), 1652 + ("prnap", "\u{2AB9}"), 1653 + ("prnsim", "\u{22E8}"), 1654 + ("prod", "\u{220F}"), 1655 + ("profalar", "\u{232E}"), 1656 + ("profline", "\u{2312}"), 1657 + ("profsurf", "\u{2313}"), 1658 + ("prop", "\u{221D}"), 1659 + ("propto", "\u{221D}"), 1660 + ("prsim", "\u{227E}"), 1661 + ("prurel", "\u{22B0}"), 1662 + ("pscr", "\u{1D4C5}"), 1663 + ("psi", "\u{03C8}"), 1664 + ("puncsp", "\u{2008}"), 1665 + ("qfr", "\u{1D52E}"), 1666 + ("qint", "\u{2A0C}"), 1667 + ("qopf", "\u{1D562}"), 1668 + ("qprime", "\u{2057}"), 1669 + ("qscr", "\u{1D4C6}"), 1670 + ("quaternions", "\u{210D}"), 1671 + ("quatint", "\u{2A16}"), 1672 + ("quest", "?"), 1673 + ("questeq", "\u{225F}"), 1674 + ("quot", "\u{0022}"), 1675 + ("rAarr", "\u{21DB}"), 1676 + ("rArr", "\u{21D2}"), 1677 + ("rAtail", "\u{291C}"), 1678 + ("rBarr", "\u{290F}"), 1679 + ("rHar", "\u{2964}"), 1680 + ("race", "\u{223D}\u{0331}"), 1681 + ("racute", "\u{0155}"), 1682 + ("radic", "\u{221A}"), 1683 + ("raemptyv", "\u{29B3}"), 1684 + ("rang", "\u{27E9}"), 1685 + ("rangd", "\u{2992}"), 1686 + ("range", "\u{29A5}"), 1687 + ("rangle", "\u{27E9}"), 1688 + ("raquo", "\u{00BB}"), 1689 + ("rarr", "\u{2192}"), 1690 + ("rarrap", "\u{2975}"), 1691 + ("rarrb", "\u{21E5}"), 1692 + ("rarrbfs", "\u{2920}"), 1693 + ("rarrc", "\u{2933}"), 1694 + ("rarrfs", "\u{291E}"), 1695 + ("rarrhk", "\u{21AA}"), 1696 + ("rarrlp", "\u{21AC}"), 1697 + ("rarrpl", "\u{2945}"), 1698 + ("rarrsim", "\u{2974}"), 1699 + ("rarrtl", "\u{21A3}"), 1700 + ("rarrw", "\u{219D}"), 1701 + ("ratail", "\u{291A}"), 1702 + ("ratio", "\u{2236}"), 1703 + ("rationals", "\u{211A}"), 1704 + ("rbarr", "\u{290D}"), 1705 + ("rbbrk", "\u{2773}"), 1706 + ("rbrace", "}"), 1707 + ("rbrack", "]"), 1708 + ("rbrke", "\u{298C}"), 1709 + ("rbrksld", "\u{298E}"), 1710 + ("rbrkslu", "\u{2990}"), 1711 + ("rcaron", "\u{0159}"), 1712 + ("rcedil", "\u{0157}"), 1713 + ("rceil", "\u{2309}"), 1714 + 
("rcub", "}"), 1715 + ("rcy", "\u{0440}"), 1716 + ("rdca", "\u{2937}"), 1717 + ("rdldhar", "\u{2969}"), 1718 + ("rdquo", "\u{201D}"), 1719 + ("rdquor", "\u{201D}"), 1720 + ("rdsh", "\u{21B3}"), 1721 + ("real", "\u{211C}"), 1722 + ("realine", "\u{211B}"), 1723 + ("realpart", "\u{211C}"), 1724 + ("reals", "\u{211D}"), 1725 + ("rect", "\u{25AD}"), 1726 + ("reg", "\u{00AE}"), 1727 + ("rfisht", "\u{297D}"), 1728 + ("rfloor", "\u{230B}"), 1729 + ("rfr", "\u{1D52F}"), 1730 + ("rhard", "\u{21C1}"), 1731 + ("rharu", "\u{21C0}"), 1732 + ("rharul", "\u{296C}"), 1733 + ("rho", "\u{03C1}"), 1734 + ("rhov", "\u{03F1}"), 1735 + ("rightarrow", "\u{2192}"), 1736 + ("rightarrowtail", "\u{21A3}"), 1737 + ("rightharpoondown", "\u{21C1}"), 1738 + ("rightharpoonup", "\u{21C0}"), 1739 + ("rightleftarrows", "\u{21C4}"), 1740 + ("rightleftharpoons", "\u{21CC}"), 1741 + ("rightrightarrows", "\u{21C9}"), 1742 + ("rightsquigarrow", "\u{219D}"), 1743 + ("rightthreetimes", "\u{22CC}"), 1744 + ("ring", "\u{02DA}"), 1745 + ("risingdotseq", "\u{2253}"), 1746 + ("rlarr", "\u{21C4}"), 1747 + ("rlhar", "\u{21CC}"), 1748 + ("rlm", "\u{200F}"), 1749 + ("rmoust", "\u{23B1}"), 1750 + ("rmoustache", "\u{23B1}"), 1751 + ("rnmid", "\u{2AEE}"), 1752 + ("roang", "\u{27ED}"), 1753 + ("roarr", "\u{21FE}"), 1754 + ("robrk", "\u{27E7}"), 1755 + ("ropar", "\u{2986}"), 1756 + ("ropf", "\u{1D563}"), 1757 + ("roplus", "\u{2A2E}"), 1758 + ("rotimes", "\u{2A35}"), 1759 + ("rpar", ")"), 1760 + ("rpargt", "\u{2994}"), 1761 + ("rppolint", "\u{2A12}"), 1762 + ("rrarr", "\u{21C9}"), 1763 + ("rsaquo", "\u{203A}"), 1764 + ("rscr", "\u{1D4C7}"), 1765 + ("rsh", "\u{21B1}"), 1766 + ("rsqb", "]"), 1767 + ("rsquo", "\u{2019}"), 1768 + ("rsquor", "\u{2019}"), 1769 + ("rthree", "\u{22CC}"), 1770 + ("rtimes", "\u{22CA}"), 1771 + ("rtri", "\u{25B9}"), 1772 + ("rtrie", "\u{22B5}"), 1773 + ("rtrif", "\u{25B8}"), 1774 + ("rtriltri", "\u{29CE}"), 1775 + ("ruluhar", "\u{2968}"), 1776 + ("rx", "\u{211E}"), 1777 + ("sacute", "\u{015B}"), 
1778 + ("sbquo", "\u{201A}"), 1779 + ("sc", "\u{227B}"), 1780 + ("scE", "\u{2AB4}"), 1781 + ("scap", "\u{2AB8}"), 1782 + ("scaron", "\u{0161}"), 1783 + ("sccue", "\u{227D}"), 1784 + ("sce", "\u{2AB0}"), 1785 + ("scedil", "\u{015F}"), 1786 + ("scirc", "\u{015D}"), 1787 + ("scnE", "\u{2AB6}"), 1788 + ("scnap", "\u{2ABA}"), 1789 + ("scnsim", "\u{22E9}"), 1790 + ("scpolint", "\u{2A13}"), 1791 + ("scsim", "\u{227F}"), 1792 + ("scy", "\u{0441}"), 1793 + ("sdot", "\u{22C5}"), 1794 + ("sdotb", "\u{22A1}"), 1795 + ("sdote", "\u{2A66}"), 1796 + ("seArr", "\u{21D8}"), 1797 + ("searhk", "\u{2925}"), 1798 + ("searr", "\u{2198}"), 1799 + ("searrow", "\u{2198}"), 1800 + ("sect", "\u{00A7}"), 1801 + ("semi", ";"), 1802 + ("seswar", "\u{2929}"), 1803 + ("setminus", "\u{2216}"), 1804 + ("setmn", "\u{2216}"), 1805 + ("sext", "\u{2736}"), 1806 + ("sfr", "\u{1D530}"), 1807 + ("sfrown", "\u{2322}"), 1808 + ("sharp", "\u{266F}"), 1809 + ("shchcy", "\u{0449}"), 1810 + ("shcy", "\u{0448}"), 1811 + ("shortmid", "\u{2223}"), 1812 + ("shortparallel", "\u{2225}"), 1813 + ("shy", "\u{00AD}"), 1814 + ("sigma", "\u{03C3}"), 1815 + ("sigmaf", "\u{03C2}"), 1816 + ("sigmav", "\u{03C2}"), 1817 + ("sim", "\u{223C}"), 1818 + ("simdot", "\u{2A6A}"), 1819 + ("sime", "\u{2243}"), 1820 + ("simeq", "\u{2243}"), 1821 + ("simg", "\u{2A9E}"), 1822 + ("simgE", "\u{2AA0}"), 1823 + ("siml", "\u{2A9D}"), 1824 + ("simlE", "\u{2A9F}"), 1825 + ("simne", "\u{2246}"), 1826 + ("simplus", "\u{2A24}"), 1827 + ("simrarr", "\u{2972}"), 1828 + ("slarr", "\u{2190}"), 1829 + ("smallsetminus", "\u{2216}"), 1830 + ("smashp", "\u{2A33}"), 1831 + ("smeparsl", "\u{29E4}"), 1832 + ("smid", "\u{2223}"), 1833 + ("smile", "\u{2323}"), 1834 + ("smt", "\u{2AAA}"), 1835 + ("smte", "\u{2AAC}"), 1836 + ("smtes", "\u{2AAC}\u{FE00}"), 1837 + ("softcy", "\u{044C}"), 1838 + ("sol", "/"), 1839 + ("solb", "\u{29C4}"), 1840 + ("solbar", "\u{233F}"), 1841 + ("sopf", "\u{1D564}"), 1842 + ("spades", "\u{2660}"), 1843 + ("spadesuit", "\u{2660}"), 1844 
+ ("spar", "\u{2225}"), 1845 + ("sqcap", "\u{2293}"), 1846 + ("sqcaps", "\u{2293}\u{FE00}"), 1847 + ("sqcup", "\u{2294}"), 1848 + ("sqcups", "\u{2294}\u{FE00}"), 1849 + ("sqsub", "\u{228F}"), 1850 + ("sqsube", "\u{2291}"), 1851 + ("sqsubset", "\u{228F}"), 1852 + ("sqsubseteq", "\u{2291}"), 1853 + ("sqsup", "\u{2290}"), 1854 + ("sqsupe", "\u{2292}"), 1855 + ("sqsupset", "\u{2290}"), 1856 + ("sqsupseteq", "\u{2292}"), 1857 + ("squ", "\u{25A1}"), 1858 + ("square", "\u{25A1}"), 1859 + ("squarf", "\u{25AA}"), 1860 + ("squf", "\u{25AA}"), 1861 + ("srarr", "\u{2192}"), 1862 + ("sscr", "\u{1D4C8}"), 1863 + ("ssetmn", "\u{2216}"), 1864 + ("ssmile", "\u{2323}"), 1865 + ("sstarf", "\u{22C6}"), 1866 + ("star", "\u{2606}"), 1867 + ("starf", "\u{2605}"), 1868 + ("straightepsilon", "\u{03F5}"), 1869 + ("straightphi", "\u{03D5}"), 1870 + ("strns", "\u{00AF}"), 1871 + ("sub", "\u{2282}"), 1872 + ("subE", "\u{2AC5}"), 1873 + ("subdot", "\u{2ABD}"), 1874 + ("sube", "\u{2286}"), 1875 + ("subedot", "\u{2AC3}"), 1876 + ("submult", "\u{2AC1}"), 1877 + ("subnE", "\u{2ACB}"), 1878 + ("subne", "\u{228A}"), 1879 + ("subplus", "\u{2ABF}"), 1880 + ("subrarr", "\u{2979}"), 1881 + ("subset", "\u{2282}"), 1882 + ("subseteq", "\u{2286}"), 1883 + ("subseteqq", "\u{2AC5}"), 1884 + ("subsetneq", "\u{228A}"), 1885 + ("subsetneqq", "\u{2ACB}"), 1886 + ("subsim", "\u{2AC7}"), 1887 + ("subsub", "\u{2AD5}"), 1888 + ("subsup", "\u{2AD3}"), 1889 + ("succ", "\u{227B}"), 1890 + ("succapprox", "\u{2AB8}"), 1891 + ("succcurlyeq", "\u{227D}"), 1892 + ("succeq", "\u{2AB0}"), 1893 + ("succnapprox", "\u{2ABA}"), 1894 + ("succneqq", "\u{2AB6}"), 1895 + ("succnsim", "\u{22E9}"), 1896 + ("succsim", "\u{227F}"), 1897 + ("sum", "\u{2211}"), 1898 + ("sung", "\u{266A}"), 1899 + ("sup", "\u{2283}"), 1900 + ("sup1", "\u{00B9}"), 1901 + ("sup2", "\u{00B2}"), 1902 + ("sup3", "\u{00B3}"), 1903 + ("supE", "\u{2AC6}"), 1904 + ("supdot", "\u{2ABE}"), 1905 + ("supdsub", "\u{2AD8}"), 1906 + ("supe", "\u{2287}"), 1907 + ("supedot", 
"\u{2AC4}"), 1908 + ("suphsol", "\u{27C9}"), 1909 + ("suphsub", "\u{2AD7}"), 1910 + ("suplarr", "\u{297B}"), 1911 + ("supmult", "\u{2AC2}"), 1912 + ("supnE", "\u{2ACC}"), 1913 + ("supne", "\u{228B}"), 1914 + ("supplus", "\u{2AC0}"), 1915 + ("supset", "\u{2283}"), 1916 + ("supseteq", "\u{2287}"), 1917 + ("supseteqq", "\u{2AC6}"), 1918 + ("supsetneq", "\u{228B}"), 1919 + ("supsetneqq", "\u{2ACC}"), 1920 + ("supsim", "\u{2AC8}"), 1921 + ("supsub", "\u{2AD4}"), 1922 + ("supsup", "\u{2AD6}"), 1923 + ("swArr", "\u{21D9}"), 1924 + ("swarhk", "\u{2926}"), 1925 + ("swarr", "\u{2199}"), 1926 + ("swarrow", "\u{2199}"), 1927 + ("swnwar", "\u{292A}"), 1928 + ("szlig", "\u{00DF}"), 1929 + ("target", "\u{2316}"), 1930 + ("tau", "\u{03C4}"), 1931 + ("tbrk", "\u{23B4}"), 1932 + ("tcaron", "\u{0165}"), 1933 + ("tcedil", "\u{0163}"), 1934 + ("tcy", "\u{0442}"), 1935 + ("tdot", "\u{20DB}"), 1936 + ("telrec", "\u{2315}"), 1937 + ("tfr", "\u{1D531}"), 1938 + ("there4", "\u{2234}"), 1939 + ("therefore", "\u{2234}"), 1940 + ("theta", "\u{03B8}"), 1941 + ("thetasym", "\u{03D1}"), 1942 + ("thetav", "\u{03D1}"), 1943 + ("thickapprox", "\u{2248}"), 1944 + ("thicksim", "\u{223C}"), 1945 + ("thinsp", "\u{2009}"), 1946 + ("thkap", "\u{2248}"), 1947 + ("thksim", "\u{223C}"), 1948 + ("thorn", "\u{00FE}"), 1949 + ("tilde", "\u{02DC}"), 1950 + ("times", "\u{00D7}"), 1951 + ("timesb", "\u{22A0}"), 1952 + ("timesbar", "\u{2A31}"), 1953 + ("timesd", "\u{2A30}"), 1954 + ("tint", "\u{222D}"), 1955 + ("toea", "\u{2928}"), 1956 + ("top", "\u{22A4}"), 1957 + ("topbot", "\u{2336}"), 1958 + ("topcir", "\u{2AF1}"), 1959 + ("topf", "\u{1D565}"), 1960 + ("topfork", "\u{2ADA}"), 1961 + ("tosa", "\u{2929}"), 1962 + ("tprime", "\u{2034}"), 1963 + ("trade", "\u{2122}"), 1964 + ("triangle", "\u{25B5}"), 1965 + ("triangledown", "\u{25BF}"), 1966 + ("triangleleft", "\u{25C3}"), 1967 + ("trianglelefteq", "\u{22B4}"), 1968 + ("triangleq", "\u{225C}"), 1969 + ("triangleright", "\u{25B9}"), 1970 + ("trianglerighteq", 
"\u{22B5}"), 1971 + ("tridot", "\u{25EC}"), 1972 + ("trie", "\u{225C}"), 1973 + ("triminus", "\u{2A3A}"), 1974 + ("triplus", "\u{2A39}"), 1975 + ("trisb", "\u{29CD}"), 1976 + ("tritime", "\u{2A3B}"), 1977 + ("trpezium", "\u{23E2}"), 1978 + ("tscr", "\u{1D4C9}"), 1979 + ("tscy", "\u{0446}"), 1980 + ("tshcy", "\u{045B}"), 1981 + ("tstrok", "\u{0167}"), 1982 + ("twixt", "\u{226C}"), 1983 + ("twoheadleftarrow", "\u{219E}"), 1984 + ("twoheadrightarrow", "\u{21A0}"), 1985 + ("uArr", "\u{21D1}"), 1986 + ("uHar", "\u{2963}"), 1987 + ("uacute", "\u{00FA}"), 1988 + ("uarr", "\u{2191}"), 1989 + ("ubrcy", "\u{045E}"), 1990 + ("ubreve", "\u{016D}"), 1991 + ("ucirc", "\u{00FB}"), 1992 + ("ucy", "\u{0443}"), 1993 + ("udarr", "\u{21C5}"), 1994 + ("udblac", "\u{0171}"), 1995 + ("udhar", "\u{296E}"), 1996 + ("ufisht", "\u{297E}"), 1997 + ("ufr", "\u{1D532}"), 1998 + ("ugrave", "\u{00F9}"), 1999 + ("uharl", "\u{21BF}"), 2000 + ("uharr", "\u{21BE}"), 2001 + ("uhblk", "\u{2580}"), 2002 + ("ulcorn", "\u{231C}"), 2003 + ("ulcorner", "\u{231C}"), 2004 + ("ulcrop", "\u{230F}"), 2005 + ("ultri", "\u{25F8}"), 2006 + ("umacr", "\u{016B}"), 2007 + ("uml", "\u{00A8}"), 2008 + ("uogon", "\u{0173}"), 2009 + ("uopf", "\u{1D566}"), 2010 + ("uparrow", "\u{2191}"), 2011 + ("updownarrow", "\u{2195}"), 2012 + ("upharpoonleft", "\u{21BF}"), 2013 + ("upharpoonright", "\u{21BE}"), 2014 + ("uplus", "\u{228E}"), 2015 + ("upsi", "\u{03C5}"), 2016 + ("upsih", "\u{03D2}"), 2017 + ("upsilon", "\u{03C5}"), 2018 + ("upuparrows", "\u{21C8}"), 2019 + ("urcorn", "\u{231D}"), 2020 + ("urcorner", "\u{231D}"), 2021 + ("urcrop", "\u{230E}"), 2022 + ("uring", "\u{016F}"), 2023 + ("urtri", "\u{25F9}"), 2024 + ("uscr", "\u{1D4CA}"), 2025 + ("utdot", "\u{22F0}"), 2026 + ("utilde", "\u{0169}"), 2027 + ("utri", "\u{25B5}"), 2028 + ("utrif", "\u{25B4}"), 2029 + ("uuarr", "\u{21C8}"), 2030 + ("uuml", "\u{00FC}"), 2031 + ("uwangle", "\u{29A7}"), 2032 + ("vArr", "\u{21D5}"), 2033 + ("vBar", "\u{2AE8}"), 2034 + ("vBarv", 
"\u{2AE9}"), 2035 + ("vDash", "\u{22A8}"), 2036 + ("vangrt", "\u{299C}"), 2037 + ("varepsilon", "\u{03F5}"), 2038 + ("varkappa", "\u{03F0}"), 2039 + ("varnothing", "\u{2205}"), 2040 + ("varphi", "\u{03D5}"), 2041 + ("varpi", "\u{03D6}"), 2042 + ("varpropto", "\u{221D}"), 2043 + ("varr", "\u{2195}"), 2044 + ("varrho", "\u{03F1}"), 2045 + ("varsigma", "\u{03C2}"), 2046 + ("varsubsetneq", "\u{228A}\u{FE00}"), 2047 + ("varsubsetneqq", "\u{2ACB}\u{FE00}"), 2048 + ("varsupsetneq", "\u{228B}\u{FE00}"), 2049 + ("varsupsetneqq", "\u{2ACC}\u{FE00}"), 2050 + ("vartheta", "\u{03D1}"), 2051 + ("vartriangleleft", "\u{22B2}"), 2052 + ("vartriangleright", "\u{22B3}"), 2053 + ("vcy", "\u{0432}"), 2054 + ("vdash", "\u{22A2}"), 2055 + ("vee", "\u{2228}"), 2056 + ("veebar", "\u{22BB}"), 2057 + ("veeeq", "\u{225A}"), 2058 + ("vellip", "\u{22EE}"), 2059 + ("verbar", "|"), 2060 + ("vert", "|"), 2061 + ("vfr", "\u{1D533}"), 2062 + ("vltri", "\u{22B2}"), 2063 + ("vnsub", "\u{2282}\u{20D2}"), 2064 + ("vnsup", "\u{2283}\u{20D2}"), 2065 + ("vopf", "\u{1D567}"), 2066 + ("vprop", "\u{221D}"), 2067 + ("vrtri", "\u{22B3}"), 2068 + ("vscr", "\u{1D4CB}"), 2069 + ("vsubnE", "\u{2ACB}\u{FE00}"), 2070 + ("vsubne", "\u{228A}\u{FE00}"), 2071 + ("vsupnE", "\u{2ACC}\u{FE00}"), 2072 + ("vsupne", "\u{228B}\u{FE00}"), 2073 + ("vzigzag", "\u{299A}"), 2074 + ("wcirc", "\u{0175}"), 2075 + ("wedbar", "\u{2A5F}"), 2076 + ("wedge", "\u{2227}"), 2077 + ("wedgeq", "\u{2259}"), 2078 + ("weierp", "\u{2118}"), 2079 + ("wfr", "\u{1D534}"), 2080 + ("wopf", "\u{1D568}"), 2081 + ("wp", "\u{2118}"), 2082 + ("wr", "\u{2240}"), 2083 + ("wreath", "\u{2240}"), 2084 + ("wscr", "\u{1D4CC}"), 2085 + ("xcap", "\u{22C2}"), 2086 + ("xcirc", "\u{25EF}"), 2087 + ("xcup", "\u{22C3}"), 2088 + ("xdtri", "\u{25BD}"), 2089 + ("xfr", "\u{1D535}"), 2090 + ("xhArr", "\u{27FA}"), 2091 + ("xharr", "\u{27F7}"), 2092 + ("xi", "\u{03BE}"), 2093 + ("xlArr", "\u{27F8}"), 2094 + ("xlarr", "\u{27F5}"), 2095 + ("xmap", "\u{27FC}"), 2096 + ("xnis", 
"\u{22FB}"), 2097 + ("xodot", "\u{2A00}"), 2098 + ("xopf", "\u{1D569}"), 2099 + ("xoplus", "\u{2A01}"), 2100 + ("xotime", "\u{2A02}"), 2101 + ("xrArr", "\u{27F9}"), 2102 + ("xrarr", "\u{27F6}"), 2103 + ("xscr", "\u{1D4CD}"), 2104 + ("xsqcup", "\u{2A06}"), 2105 + ("xuplus", "\u{2A04}"), 2106 + ("xutri", "\u{25B3}"), 2107 + ("xvee", "\u{22C1}"), 2108 + ("xwedge", "\u{22C0}"), 2109 + ("yacute", "\u{00FD}"), 2110 + ("yacy", "\u{044F}"), 2111 + ("ycirc", "\u{0177}"), 2112 + ("ycy", "\u{044B}"), 2113 + ("yen", "\u{00A5}"), 2114 + ("yfr", "\u{1D536}"), 2115 + ("yicy", "\u{0457}"), 2116 + ("yopf", "\u{1D56A}"), 2117 + ("yscr", "\u{1D4CE}"), 2118 + ("yucy", "\u{044E}"), 2119 + ("yuml", "\u{00FF}"), 2120 + ("zacute", "\u{017A}"), 2121 + ("zcaron", "\u{017E}"), 2122 + ("zcy", "\u{0437}"), 2123 + ("zdot", "\u{017C}"), 2124 + ("zeetrf", "\u{2128}"), 2125 + ("zeta", "\u{03B6}"), 2126 + ("zfr", "\u{1D537}"), 2127 + ("zhcy", "\u{0436}"), 2128 + ("zigrarr", "\u{21DD}"), 2129 + ("zopf", "\u{1D56B}"), 2130 + ("zscr", "\u{1D4CF}"), 2131 + ("zwj", "\u{200D}"), 2132 + ("zwnj", "\u{200C}"), 2133 + ]; 2134 + 2135 + /// The 106 legacy named entities recognized without a trailing semicolon. 
2136 + static LEGACY_ENTITIES: &[&str] = &[ 2137 + "AElig", "AMP", "Aacute", "Acirc", "Agrave", "Aring", "Atilde", "Auml", "COPY", "Ccedil", 2138 + "ETH", "Eacute", "Ecirc", "Egrave", "Euml", "GT", "Iacute", "Icirc", "Igrave", "Iuml", "LT", 2139 + "Ntilde", "Oacute", "Ocirc", "Ograve", "Oslash", "Otilde", "Ouml", "QUOT", "REG", "THORN", 2140 + "Uacute", "Ucirc", "Ugrave", "Uuml", "Yacute", "aacute", "acirc", "acute", "aelig", "agrave", 2141 + "amp", "aring", "atilde", "auml", "brvbar", "ccedil", "cedil", "cent", "copy", "curren", "deg", 2142 + "divide", "eacute", "ecirc", "egrave", "eth", "euml", "frac12", "frac14", "frac34", "gt", 2143 + "iacute", "icirc", "iexcl", "igrave", "iquest", "iuml", "laquo", "lt", "macr", "micro", 2144 + "middot", "nbsp", "not", "ntilde", "oacute", "ocirc", "ograve", "ordf", "ordm", "oslash", 2145 + "otilde", "ouml", "para", "plusmn", "pound", "quot", "raquo", "reg", "sect", "shy", "sup1", 2146 + "sup2", "sup3", "szlig", "thorn", "times", "uacute", "ucirc", "ugrave", "uml", "uuml", 2147 + "yacute", "yen", "yuml", 2148 + ]; 2149 + 2150 + /// Look up a named character reference by name (without & prefix or ; suffix). 2151 + /// Returns the replacement string if found. 2152 + pub fn lookup_entity(name: &str) -> Option<&'static str> { 2153 + ENTITIES 2154 + .binary_search_by_key(&name, |&(n, _)| n) 2155 + .ok() 2156 + .map(|i| ENTITIES[i].1) 2157 + } 2158 + 2159 + /// Check if an entity name is a "legacy" entity recognized without semicolon. 2160 + pub fn is_legacy_entity(name: &str) -> bool { 2161 + LEGACY_ENTITIES.binary_search(&name).is_ok() 2162 + }
+28 -4
crates/html/src/lib.rs
··· 1 1 //! HTML5 tokenizer and tree builder. 2 + //! 3 + //! Implements the WHATWG HTML5 tokenizer state machine (§13.2.5). 4 + 5 + mod entities; 6 + mod tokenizer; 7 + 8 + pub use tokenizer::Tokenizer; 2 9 3 10 /// A token emitted by the HTML tokenizer. 4 11 #[derive(Debug, Clone, PartialEq)] ··· 28 35 29 36 /// Tokenize an HTML input string into a sequence of tokens. 30 37 /// 31 - /// This is a stub that returns an empty `Vec`. The real implementation 32 - /// will be a spec-compliant HTML5 tokenizer state machine. 33 - pub fn tokenize(_input: &str) -> Vec<Token> { 34 - Vec::new() 38 + /// Runs the HTML5 tokenizer state machine and returns all emitted tokens 39 + /// (excluding Eof). Adjacent Character tokens are coalesced. 40 + pub fn tokenize(input: &str) -> Vec<Token> { 41 + let mut tok = Tokenizer::new(input); 42 + let mut tokens = Vec::new(); 43 + loop { 44 + let token = tok.next_token(); 45 + match token { 46 + Token::Eof => break, 47 + Token::Character(ref s) => { 48 + // Coalesce adjacent character tokens. 49 + if let Some(Token::Character(ref mut prev)) = tokens.last_mut() { 50 + prev.push_str(s); 51 + } else { 52 + tokens.push(token); 53 + } 54 + } 55 + _ => tokens.push(token), 56 + } 57 + } 58 + tokens 35 59 }
+2039
crates/html/src/tokenizer.rs
··· 1 + //! HTML5 tokenizer state machine per WHATWG spec §13.2.5. 2 + 3 + use crate::entities; 4 + use crate::Token; 5 + 6 + #[derive(Debug, Clone, Copy, PartialEq)] 7 + enum State { 8 + Data, 9 + TagOpen, 10 + EndTagOpen, 11 + TagName, 12 + BeforeAttributeName, 13 + AttributeName, 14 + AfterAttributeName, 15 + BeforeAttributeValue, 16 + AttributeValueDoubleQuoted, 17 + AttributeValueSingleQuoted, 18 + AttributeValueUnquoted, 19 + AfterAttributeValueQuoted, 20 + SelfClosingStartTag, 21 + BogusComment, 22 + MarkupDeclarationOpen, 23 + CommentStart, 24 + CommentStartDash, 25 + Comment, 26 + CommentLessThanSign, 27 + CommentLessThanSignBang, 28 + CommentLessThanSignBangDash, 29 + CommentLessThanSignBangDashDash, 30 + CommentEndDash, 31 + CommentEnd, 32 + CommentEndBang, 33 + Doctype, 34 + BeforeDoctypeName, 35 + DoctypeName, 36 + AfterDoctypeName, 37 + AfterDoctypePublicKeyword, 38 + BeforeDoctypePublicIdentifier, 39 + DoctypePublicIdentifierDoubleQuoted, 40 + DoctypePublicIdentifierSingleQuoted, 41 + AfterDoctypePublicIdentifier, 42 + BetweenDoctypePublicAndSystemIdentifiers, 43 + AfterDoctypeSystemKeyword, 44 + BeforeDoctypeSystemIdentifier, 45 + DoctypeSystemIdentifierDoubleQuoted, 46 + DoctypeSystemIdentifierSingleQuoted, 47 + AfterDoctypeSystemIdentifier, 48 + BogusDoctype, 49 + CharacterReference, 50 + NumericCharacterReference, 51 + HexCharacterReferenceStart, 52 + DecCharacterReferenceStart, 53 + HexCharacterReference, 54 + DecCharacterReference, 55 + NumericCharacterReferenceEnd, 56 + NamedCharacterReference, 57 + } 58 + 59 + /// HTML5 tokenizer state machine. 60 + pub struct Tokenizer { 61 + input: Vec<char>, 62 + pos: usize, 63 + state: State, 64 + return_state: State, 65 + pending: Vec<Token>, 66 + /// Current tag being built. 67 + tag_name: String, 68 + tag_self_closing: bool, 69 + tag_is_end: bool, 70 + tag_attributes: Vec<(String, String)>, 71 + current_attr_name: String, 72 + current_attr_value: String, 73 + /// Current comment or doctype being built. 
74 + comment_data: String, 75 + doctype_name: Option<String>, 76 + doctype_public_id: Option<String>, 77 + doctype_system_id: Option<String>, 78 + doctype_force_quirks: bool, 79 + /// Character reference accumulator. 80 + char_ref_code: u32, 81 + temp_buf: String, 82 + } 83 + 84 + impl Tokenizer { 85 + /// Create a new tokenizer for the given input. 86 + pub fn new(input: &str) -> Self { 87 + Tokenizer { 88 + input: input.chars().collect(), 89 + pos: 0, 90 + state: State::Data, 91 + return_state: State::Data, 92 + pending: Vec::new(), 93 + tag_name: String::new(), 94 + tag_self_closing: false, 95 + tag_is_end: false, 96 + tag_attributes: Vec::new(), 97 + current_attr_name: String::new(), 98 + current_attr_value: String::new(), 99 + comment_data: String::new(), 100 + doctype_name: None, 101 + doctype_public_id: None, 102 + doctype_system_id: None, 103 + doctype_force_quirks: false, 104 + char_ref_code: 0, 105 + temp_buf: String::new(), 106 + } 107 + } 108 + 109 + /// Return the next token from the input. 110 + pub fn next_token(&mut self) -> Token { 111 + loop { 112 + if let Some(token) = self.pending.pop() { 113 + return token; 114 + } 115 + self.step(); 116 + } 117 + } 118 + 119 + fn next_char(&mut self) -> Option<char> { 120 + if self.pos < self.input.len() { 121 + let ch = self.input[self.pos]; 122 + self.pos += 1; 123 + Some(ch) 124 + } else { 125 + None 126 + } 127 + } 128 + 129 + fn peek_char(&self) -> Option<char> { 130 + if self.pos < self.input.len() { 131 + Some(self.input[self.pos]) 132 + } else { 133 + None 134 + } 135 + } 136 + 137 + fn reconsume(&mut self) { 138 + if self.pos > 0 { 139 + self.pos -= 1; 140 + } 141 + } 142 + 143 + fn emit(&mut self, token: Token) { 144 + // We use a Vec as a stack, so push to front by inserting at 0. 145 + self.pending.insert(0, token); 146 + } 147 + 148 + fn emit_current_tag(&mut self) { 149 + // Finalize the current attribute if there is one. 
150 + self.finish_attribute(); 151 + 152 + if self.tag_is_end { 153 + self.emit(Token::EndTag { 154 + name: self.tag_name.clone(), 155 + }); 156 + } else { 157 + self.emit(Token::StartTag { 158 + name: self.tag_name.clone(), 159 + attributes: self.tag_attributes.clone(), 160 + self_closing: self.tag_self_closing, 161 + }); 162 + } 163 + } 164 + 165 + fn emit_current_comment(&mut self) { 166 + self.emit(Token::Comment(self.comment_data.clone())); 167 + } 168 + 169 + fn emit_current_doctype(&mut self) { 170 + self.emit(Token::Doctype { 171 + name: self.doctype_name.clone(), 172 + public_id: self.doctype_public_id.clone(), 173 + system_id: self.doctype_system_id.clone(), 174 + force_quirks: self.doctype_force_quirks, 175 + }); 176 + } 177 + 178 + fn emit_char(&mut self, ch: char) { 179 + self.emit(Token::Character(ch.to_string())); 180 + } 181 + 182 + fn emit_eof(&mut self) { 183 + self.emit(Token::Eof); 184 + } 185 + 186 + fn start_new_tag(&mut self, is_end: bool) { 187 + self.tag_name.clear(); 188 + self.tag_self_closing = false; 189 + self.tag_is_end = is_end; 190 + self.tag_attributes.clear(); 191 + self.current_attr_name.clear(); 192 + self.current_attr_value.clear(); 193 + } 194 + 195 + fn start_new_attribute(&mut self) { 196 + self.finish_attribute(); 197 + self.current_attr_name.clear(); 198 + self.current_attr_value.clear(); 199 + } 200 + 201 + fn finish_attribute(&mut self) { 202 + if !self.current_attr_name.is_empty() { 203 + // Per spec: if duplicate attribute name, ignore the later one. 204 + let name = self.current_attr_name.clone(); 205 + if !self.tag_attributes.iter().any(|(n, _)| n == &name) { 206 + self.tag_attributes 207 + .push((name, self.current_attr_value.clone())); 208 + } 209 + self.current_attr_name.clear(); 210 + self.current_attr_value.clear(); 211 + } 212 + } 213 + 214 + /// Flush character reference code to the return state. 
215 + fn flush_char_ref(&mut self, s: &str) { 216 + match self.return_state { 217 + State::AttributeValueDoubleQuoted 218 + | State::AttributeValueSingleQuoted 219 + | State::AttributeValueUnquoted => { 220 + self.current_attr_value.push_str(s); 221 + } 222 + _ => { 223 + for ch in s.chars() { 224 + self.emit_char(ch); 225 + } 226 + } 227 + } 228 + } 229 + 230 + fn step(&mut self) { 231 + match self.state { 232 + State::Data => self.state_data(), 233 + State::TagOpen => self.state_tag_open(), 234 + State::EndTagOpen => self.state_end_tag_open(), 235 + State::TagName => self.state_tag_name(), 236 + State::BeforeAttributeName => self.state_before_attribute_name(), 237 + State::AttributeName => self.state_attribute_name(), 238 + State::AfterAttributeName => self.state_after_attribute_name(), 239 + State::BeforeAttributeValue => self.state_before_attribute_value(), 240 + State::AttributeValueDoubleQuoted => self.state_attribute_value_double_quoted(), 241 + State::AttributeValueSingleQuoted => self.state_attribute_value_single_quoted(), 242 + State::AttributeValueUnquoted => self.state_attribute_value_unquoted(), 243 + State::AfterAttributeValueQuoted => self.state_after_attribute_value_quoted(), 244 + State::SelfClosingStartTag => self.state_self_closing_start_tag(), 245 + State::BogusComment => self.state_bogus_comment(), 246 + State::MarkupDeclarationOpen => self.state_markup_declaration_open(), 247 + State::CommentStart => self.state_comment_start(), 248 + State::CommentStartDash => self.state_comment_start_dash(), 249 + State::Comment => self.state_comment(), 250 + State::CommentLessThanSign => self.state_comment_less_than_sign(), 251 + State::CommentLessThanSignBang => self.state_comment_less_than_sign_bang(), 252 + State::CommentLessThanSignBangDash => self.state_comment_less_than_sign_bang_dash(), 253 + State::CommentLessThanSignBangDashDash => { 254 + self.state_comment_less_than_sign_bang_dash_dash() 255 + } 256 + State::CommentEndDash => 
self.state_comment_end_dash(), 257 + State::CommentEnd => self.state_comment_end(), 258 + State::CommentEndBang => self.state_comment_end_bang(), 259 + State::Doctype => self.state_doctype(), 260 + State::BeforeDoctypeName => self.state_before_doctype_name(), 261 + State::DoctypeName => self.state_doctype_name(), 262 + State::AfterDoctypeName => self.state_after_doctype_name(), 263 + State::AfterDoctypePublicKeyword => self.state_after_doctype_public_keyword(), 264 + State::BeforeDoctypePublicIdentifier => self.state_before_doctype_public_identifier(), 265 + State::DoctypePublicIdentifierDoubleQuoted => { 266 + self.state_doctype_public_identifier_double_quoted() 267 + } 268 + State::DoctypePublicIdentifierSingleQuoted => { 269 + self.state_doctype_public_identifier_single_quoted() 270 + } 271 + State::AfterDoctypePublicIdentifier => self.state_after_doctype_public_identifier(), 272 + State::BetweenDoctypePublicAndSystemIdentifiers => { 273 + self.state_between_doctype_public_and_system_identifiers() 274 + } 275 + State::AfterDoctypeSystemKeyword => self.state_after_doctype_system_keyword(), 276 + State::BeforeDoctypeSystemIdentifier => self.state_before_doctype_system_identifier(), 277 + State::DoctypeSystemIdentifierDoubleQuoted => { 278 + self.state_doctype_system_identifier_double_quoted() 279 + } 280 + State::DoctypeSystemIdentifierSingleQuoted => { 281 + self.state_doctype_system_identifier_single_quoted() 282 + } 283 + State::AfterDoctypeSystemIdentifier => self.state_after_doctype_system_identifier(), 284 + State::BogusDoctype => self.state_bogus_doctype(), 285 + State::CharacterReference => self.state_character_reference(), 286 + State::NumericCharacterReference => self.state_numeric_character_reference(), 287 + State::HexCharacterReferenceStart => self.state_hex_character_reference_start(), 288 + State::DecCharacterReferenceStart => self.state_dec_character_reference_start(), 289 + State::HexCharacterReference => self.state_hex_character_reference(), 290 
+ State::DecCharacterReference => self.state_dec_character_reference(), 291 + State::NumericCharacterReferenceEnd => self.state_numeric_character_reference_end(), 292 + State::NamedCharacterReference => self.state_named_character_reference(), 293 + } 294 + } 295 + 296 + // --- State implementations --- 297 + 298 + fn state_data(&mut self) { 299 + match self.next_char() { 300 + Some('&') => { 301 + self.return_state = State::Data; 302 + self.state = State::CharacterReference; 303 + } 304 + Some('<') => { 305 + self.state = State::TagOpen; 306 + } 307 + Some('\0') => { 308 + // Parse error. Emit replacement character. 309 + self.emit_char('\u{FFFD}'); 310 + } 311 + None => { 312 + self.emit_eof(); 313 + } 314 + Some(c) => { 315 + self.emit_char(c); 316 + } 317 + } 318 + } 319 + 320 + fn state_tag_open(&mut self) { 321 + match self.next_char() { 322 + Some('!') => { 323 + self.state = State::MarkupDeclarationOpen; 324 + } 325 + Some('/') => { 326 + self.state = State::EndTagOpen; 327 + } 328 + Some(c) if c.is_ascii_alphabetic() => { 329 + self.start_new_tag(false); 330 + self.reconsume(); 331 + self.state = State::TagName; 332 + } 333 + Some('?') => { 334 + // Parse error. Create a comment token. 335 + self.comment_data.clear(); 336 + self.reconsume(); 337 + self.state = State::BogusComment; 338 + } 339 + None => { 340 + // Parse error. Emit '<' and EOF. 341 + self.emit_char('<'); 342 + self.emit_eof(); 343 + } 344 + Some(_) => { 345 + // Parse error. Emit '<' and reconsume. 346 + self.emit_char('<'); 347 + self.reconsume(); 348 + self.state = State::Data; 349 + } 350 + } 351 + } 352 + 353 + fn state_end_tag_open(&mut self) { 354 + match self.next_char() { 355 + Some(c) if c.is_ascii_alphabetic() => { 356 + self.start_new_tag(true); 357 + self.reconsume(); 358 + self.state = State::TagName; 359 + } 360 + Some('>') => { 361 + // Parse error. Switch to data state. 
362 + self.state = State::Data; 363 + } 364 + None => { 365 + self.emit_char('<'); 366 + self.emit_char('/'); 367 + self.emit_eof(); 368 + } 369 + Some(_) => { 370 + // Parse error. Create a comment. 371 + self.comment_data.clear(); 372 + self.reconsume(); 373 + self.state = State::BogusComment; 374 + } 375 + } 376 + } 377 + 378 + fn state_tag_name(&mut self) { 379 + match self.next_char() { 380 + Some('\t') | Some('\n') | Some('\x0C') | Some(' ') => { 381 + self.state = State::BeforeAttributeName; 382 + } 383 + Some('/') => { 384 + self.state = State::SelfClosingStartTag; 385 + } 386 + Some('>') => { 387 + self.state = State::Data; 388 + self.emit_current_tag(); 389 + } 390 + Some(c) if c.is_ascii_uppercase() => { 391 + self.tag_name.push(c.to_ascii_lowercase()); 392 + } 393 + Some('\0') => { 394 + self.tag_name.push('\u{FFFD}'); 395 + } 396 + None => { 397 + self.emit_eof(); 398 + } 399 + Some(c) => { 400 + self.tag_name.push(c); 401 + } 402 + } 403 + } 404 + 405 + fn state_before_attribute_name(&mut self) { 406 + match self.next_char() { 407 + Some('\t') | Some('\n') | Some('\x0C') | Some(' ') => { 408 + // Ignore whitespace. 409 + } 410 + Some('/') | Some('>') => { 411 + self.reconsume(); 412 + self.state = State::AfterAttributeName; 413 + } 414 + None => { 415 + // EOF: go to AfterAttributeName without reconsuming. 416 + self.state = State::AfterAttributeName; 417 + } 418 + Some('=') => { 419 + // Parse error. Start a new attribute with '=' as name. 
420 + self.start_new_attribute(); 421 + self.current_attr_name.push('='); 422 + self.state = State::AttributeName; 423 + } 424 + Some(_) => { 425 + self.start_new_attribute(); 426 + self.reconsume(); 427 + self.state = State::AttributeName; 428 + } 429 + } 430 + } 431 + 432 + fn state_attribute_name(&mut self) { 433 + match self.next_char() { 434 + Some('\t') | Some('\n') | Some('\x0C') | Some(' ') | Some('/') | Some('>') => { 435 + self.reconsume(); 436 + self.state = State::AfterAttributeName; 437 + } 438 + None => { 439 + self.state = State::AfterAttributeName; 440 + } 441 + Some('=') => { 442 + self.state = State::BeforeAttributeValue; 443 + } 444 + Some(c) if c.is_ascii_uppercase() => { 445 + self.current_attr_name.push(c.to_ascii_lowercase()); 446 + } 447 + Some('\0') => { 448 + self.current_attr_name.push('\u{FFFD}'); 449 + } 450 + Some(c) => { 451 + self.current_attr_name.push(c); 452 + } 453 + } 454 + } 455 + 456 + fn state_after_attribute_name(&mut self) { 457 + match self.next_char() { 458 + Some('\t') | Some('\n') | Some('\x0C') | Some(' ') => { 459 + // Ignore. 460 + } 461 + Some('/') => { 462 + self.state = State::SelfClosingStartTag; 463 + } 464 + Some('=') => { 465 + self.state = State::BeforeAttributeValue; 466 + } 467 + Some('>') => { 468 + self.state = State::Data; 469 + self.emit_current_tag(); 470 + } 471 + None => { 472 + self.emit_eof(); 473 + } 474 + Some(_) => { 475 + self.start_new_attribute(); 476 + self.reconsume(); 477 + self.state = State::AttributeName; 478 + } 479 + } 480 + } 481 + 482 + fn state_before_attribute_value(&mut self) { 483 + match self.next_char() { 484 + Some('\t') | Some('\n') | Some('\x0C') | Some(' ') => { 485 + // Ignore. 486 + } 487 + Some('"') => { 488 + self.state = State::AttributeValueDoubleQuoted; 489 + } 490 + Some('\'') => { 491 + self.state = State::AttributeValueSingleQuoted; 492 + } 493 + Some('>') => { 494 + // Parse error. Emit tag with missing value. 
495 + self.state = State::Data; 496 + self.emit_current_tag(); 497 + } 498 + _ => { 499 + self.reconsume(); 500 + self.state = State::AttributeValueUnquoted; 501 + } 502 + } 503 + } 504 + 505 + fn state_attribute_value_double_quoted(&mut self) { 506 + match self.next_char() { 507 + Some('"') => { 508 + self.state = State::AfterAttributeValueQuoted; 509 + } 510 + Some('&') => { 511 + self.return_state = State::AttributeValueDoubleQuoted; 512 + self.state = State::CharacterReference; 513 + } 514 + Some('\0') => { 515 + self.current_attr_value.push('\u{FFFD}'); 516 + } 517 + None => { 518 + self.emit_eof(); 519 + } 520 + Some(c) => { 521 + self.current_attr_value.push(c); 522 + } 523 + } 524 + } 525 + 526 + fn state_attribute_value_single_quoted(&mut self) { 527 + match self.next_char() { 528 + Some('\'') => { 529 + self.state = State::AfterAttributeValueQuoted; 530 + } 531 + Some('&') => { 532 + self.return_state = State::AttributeValueSingleQuoted; 533 + self.state = State::CharacterReference; 534 + } 535 + Some('\0') => { 536 + self.current_attr_value.push('\u{FFFD}'); 537 + } 538 + None => { 539 + self.emit_eof(); 540 + } 541 + Some(c) => { 542 + self.current_attr_value.push(c); 543 + } 544 + } 545 + } 546 + 547 + fn state_attribute_value_unquoted(&mut self) { 548 + match self.next_char() { 549 + Some('\t') | Some('\n') | Some('\x0C') | Some(' ') => { 550 + self.state = State::BeforeAttributeName; 551 + } 552 + Some('&') => { 553 + self.return_state = State::AttributeValueUnquoted; 554 + self.state = State::CharacterReference; 555 + } 556 + Some('>') => { 557 + self.state = State::Data; 558 + self.emit_current_tag(); 559 + } 560 + Some('\0') => { 561 + self.current_attr_value.push('\u{FFFD}'); 562 + } 563 + None => { 564 + self.emit_eof(); 565 + } 566 + Some(c) => { 567 + self.current_attr_value.push(c); 568 + } 569 + } 570 + } 571 + 572 + fn state_after_attribute_value_quoted(&mut self) { 573 + match self.next_char() { 574 + Some('\t') | Some('\n') | Some('\x0C') 
| Some(' ') => { 575 + self.state = State::BeforeAttributeName; 576 + } 577 + Some('/') => { 578 + self.state = State::SelfClosingStartTag; 579 + } 580 + Some('>') => { 581 + self.state = State::Data; 582 + self.emit_current_tag(); 583 + } 584 + None => { 585 + self.emit_eof(); 586 + } 587 + Some(_) => { 588 + // Parse error. Reconsume in before attribute name. 589 + self.reconsume(); 590 + self.state = State::BeforeAttributeName; 591 + } 592 + } 593 + } 594 + 595 + fn state_self_closing_start_tag(&mut self) { 596 + match self.next_char() { 597 + Some('>') => { 598 + self.tag_self_closing = true; 599 + self.state = State::Data; 600 + self.emit_current_tag(); 601 + } 602 + None => { 603 + self.emit_eof(); 604 + } 605 + Some(_) => { 606 + // Parse error. Reconsume in before attribute name. 607 + self.reconsume(); 608 + self.state = State::BeforeAttributeName; 609 + } 610 + } 611 + } 612 + 613 + fn state_bogus_comment(&mut self) { 614 + match self.next_char() { 615 + Some('>') => { 616 + self.state = State::Data; 617 + self.emit_current_comment(); 618 + } 619 + None => { 620 + self.emit_current_comment(); 621 + self.emit_eof(); 622 + } 623 + Some('\0') => { 624 + self.comment_data.push('\u{FFFD}'); 625 + } 626 + Some(c) => { 627 + self.comment_data.push(c); 628 + } 629 + } 630 + } 631 + 632 + fn state_markup_declaration_open(&mut self) { 633 + // Check for `--`, `DOCTYPE`, or `[CDATA[` 634 + if self.starts_with("--") { 635 + self.pos += 2; 636 + self.comment_data.clear(); 637 + self.state = State::CommentStart; 638 + } else if self.starts_with_case_insensitive("DOCTYPE") { 639 + self.pos += 7; 640 + self.state = State::Doctype; 641 + } else if self.starts_with("[CDATA[") { 642 + // Per spec, if not in foreign content, parse error → bogus comment. 643 + self.pos += 7; 644 + self.comment_data.clear(); 645 + self.comment_data.push_str("[CDATA["); 646 + self.state = State::BogusComment; 647 + } else { 648 + // Parse error. Bogus comment. 
649 + self.comment_data.clear(); 650 + self.state = State::BogusComment; 651 + } 652 + } 653 + 654 + fn state_comment_start(&mut self) { 655 + match self.next_char() { 656 + Some('-') => { 657 + self.state = State::CommentStartDash; 658 + } 659 + Some('>') => { 660 + // Parse error. Emit empty comment. 661 + self.state = State::Data; 662 + self.emit_current_comment(); 663 + } 664 + _ => { 665 + self.reconsume(); 666 + self.state = State::Comment; 667 + } 668 + } 669 + } 670 + 671 + fn state_comment_start_dash(&mut self) { 672 + match self.next_char() { 673 + Some('-') => { 674 + self.state = State::CommentEnd; 675 + } 676 + Some('>') => { 677 + // Parse error. 678 + self.state = State::Data; 679 + self.emit_current_comment(); 680 + } 681 + None => { 682 + self.emit_current_comment(); 683 + self.emit_eof(); 684 + } 685 + Some(_) => { 686 + self.comment_data.push('-'); 687 + self.reconsume(); 688 + self.state = State::Comment; 689 + } 690 + } 691 + } 692 + 693 + fn state_comment(&mut self) { 694 + match self.next_char() { 695 + Some('<') => { 696 + self.comment_data.push('<'); 697 + self.state = State::CommentLessThanSign; 698 + } 699 + Some('-') => { 700 + self.state = State::CommentEndDash; 701 + } 702 + Some('\0') => { 703 + self.comment_data.push('\u{FFFD}'); 704 + } 705 + None => { 706 + self.emit_current_comment(); 707 + self.emit_eof(); 708 + } 709 + Some(c) => { 710 + self.comment_data.push(c); 711 + } 712 + } 713 + } 714 + 715 + fn state_comment_less_than_sign(&mut self) { 716 + match self.next_char() { 717 + Some('!') => { 718 + self.comment_data.push('!'); 719 + self.state = State::CommentLessThanSignBang; 720 + } 721 + Some('<') => { 722 + self.comment_data.push('<'); 723 + } 724 + None => { 725 + // Don't reconsume on EOF — pos didn't advance, so reconsuming 726 + // would back up to '<' and loop forever between here and Comment. 
727 + self.state = State::Comment; 728 + } 729 + Some(_) => { 730 + self.reconsume(); 731 + self.state = State::Comment; 732 + } 733 + } 734 + } 735 + 736 + fn state_comment_less_than_sign_bang(&mut self) { 737 + match self.next_char() { 738 + Some('-') => { 739 + self.state = State::CommentLessThanSignBangDash; 740 + } 741 + _ => { 742 + self.reconsume(); 743 + self.state = State::Comment; 744 + } 745 + } 746 + } 747 + 748 + fn state_comment_less_than_sign_bang_dash(&mut self) { 749 + match self.next_char() { 750 + Some('-') => { 751 + self.state = State::CommentLessThanSignBangDashDash; 752 + } 753 + _ => { 754 + self.reconsume(); 755 + self.state = State::CommentEndDash; 756 + } 757 + } 758 + } 759 + 760 + fn state_comment_less_than_sign_bang_dash_dash(&mut self) { 761 + match self.next_char() { 762 + Some('>') | None => { 763 + self.reconsume(); 764 + self.state = State::CommentEnd; 765 + } 766 + Some(_) => { 767 + // Parse error. 768 + self.reconsume(); 769 + self.state = State::CommentEnd; 770 + } 771 + } 772 + } 773 + 774 + fn state_comment_end_dash(&mut self) { 775 + match self.next_char() { 776 + Some('-') => { 777 + self.state = State::CommentEnd; 778 + } 779 + None => { 780 + self.emit_current_comment(); 781 + self.emit_eof(); 782 + } 783 + Some(_) => { 784 + self.comment_data.push('-'); 785 + self.reconsume(); 786 + self.state = State::Comment; 787 + } 788 + } 789 + } 790 + 791 + fn state_comment_end(&mut self) { 792 + match self.next_char() { 793 + Some('>') => { 794 + self.state = State::Data; 795 + self.emit_current_comment(); 796 + } 797 + Some('!') => { 798 + self.state = State::CommentEndBang; 799 + } 800 + Some('-') => { 801 + self.comment_data.push('-'); 802 + } 803 + None => { 804 + self.emit_current_comment(); 805 + self.emit_eof(); 806 + } 807 + Some(_) => { 808 + self.comment_data.push('-'); 809 + self.comment_data.push('-'); 810 + self.reconsume(); 811 + self.state = State::Comment; 812 + } 813 + } 814 + } 815 + 816 + fn 
state_comment_end_bang(&mut self) { 817 + match self.next_char() { 818 + Some('-') => { 819 + self.comment_data.push('-'); 820 + self.comment_data.push('-'); 821 + self.comment_data.push('!'); 822 + self.state = State::CommentEndDash; 823 + } 824 + Some('>') => { 825 + self.state = State::Data; 826 + self.emit_current_comment(); 827 + } 828 + None => { 829 + self.emit_current_comment(); 830 + self.emit_eof(); 831 + } 832 + Some(_) => { 833 + self.comment_data.push('-'); 834 + self.comment_data.push('-'); 835 + self.comment_data.push('!'); 836 + self.reconsume(); 837 + self.state = State::Comment; 838 + } 839 + } 840 + } 841 + 842 + fn state_doctype(&mut self) { 843 + match self.next_char() { 844 + Some('\t') | Some('\n') | Some('\x0C') | Some(' ') => { 845 + self.state = State::BeforeDoctypeName; 846 + } 847 + Some('>') => { 848 + self.reconsume(); 849 + self.state = State::BeforeDoctypeName; 850 + } 851 + None => { 852 + self.doctype_name = None; 853 + self.doctype_public_id = None; 854 + self.doctype_system_id = None; 855 + self.doctype_force_quirks = true; 856 + self.emit_current_doctype(); 857 + self.emit_eof(); 858 + } 859 + Some(_) => { 860 + // Parse error. Missing whitespace before DOCTYPE name. 861 + self.reconsume(); 862 + self.state = State::BeforeDoctypeName; 863 + } 864 + } 865 + } 866 + 867 + fn state_before_doctype_name(&mut self) { 868 + match self.next_char() { 869 + Some('\t') | Some('\n') | Some('\x0C') | Some(' ') => { 870 + // Ignore whitespace. 
871 + } 872 + Some(c) if c.is_ascii_uppercase() => { 873 + self.doctype_name = Some(c.to_ascii_lowercase().to_string()); 874 + self.doctype_public_id = None; 875 + self.doctype_system_id = None; 876 + self.doctype_force_quirks = false; 877 + self.state = State::DoctypeName; 878 + } 879 + Some('\0') => { 880 + self.doctype_name = Some("\u{FFFD}".to_string()); 881 + self.doctype_public_id = None; 882 + self.doctype_system_id = None; 883 + self.doctype_force_quirks = false; 884 + self.state = State::DoctypeName; 885 + } 886 + Some('>') => { 887 + // Parse error. Force quirks. 888 + self.doctype_name = None; 889 + self.doctype_public_id = None; 890 + self.doctype_system_id = None; 891 + self.doctype_force_quirks = true; 892 + self.state = State::Data; 893 + self.emit_current_doctype(); 894 + } 895 + None => { 896 + self.doctype_name = None; 897 + self.doctype_public_id = None; 898 + self.doctype_system_id = None; 899 + self.doctype_force_quirks = true; 900 + self.emit_current_doctype(); 901 + self.emit_eof(); 902 + } 903 + Some(c) => { 904 + self.doctype_name = Some(c.to_string()); 905 + self.doctype_public_id = None; 906 + self.doctype_system_id = None; 907 + self.doctype_force_quirks = false; 908 + self.state = State::DoctypeName; 909 + } 910 + } 911 + } 912 + 913 + fn state_doctype_name(&mut self) { 914 + match self.next_char() { 915 + Some('\t') | Some('\n') | Some('\x0C') | Some(' ') => { 916 + self.state = State::AfterDoctypeName; 917 + } 918 + Some('>') => { 919 + self.state = State::Data; 920 + self.emit_current_doctype(); 921 + } 922 + Some(c) if c.is_ascii_uppercase() => { 923 + if let Some(ref mut name) = self.doctype_name { 924 + name.push(c.to_ascii_lowercase()); 925 + } 926 + } 927 + Some('\0') => { 928 + if let Some(ref mut name) = self.doctype_name { 929 + name.push('\u{FFFD}'); 930 + } 931 + } 932 + None => { 933 + self.doctype_force_quirks = true; 934 + self.emit_current_doctype(); 935 + self.emit_eof(); 936 + } 937 + Some(c) => { 938 + if let 
Some(ref mut name) = self.doctype_name {
                    name.push(c);
                }
            }
        }
    }

    /// After-DOCTYPE-name state.
    ///
    /// Skips whitespace, emits the DOCTYPE on `>`, and otherwise looks for a
    /// case-insensitive `PUBLIC` / `SYSTEM` keyword at the current position.
    /// Anything else sets force-quirks and switches to the bogus DOCTYPE state.
    fn state_after_doctype_name(&mut self) {
        match self.next_char() {
            Some('\t') | Some('\n') | Some('\x0C') | Some(' ') => {
                // Ignore.
            }
            Some('>') => {
                self.state = State::Data;
                self.emit_current_doctype();
            }
            None => {
                self.doctype_force_quirks = true;
                self.emit_current_doctype();
                self.emit_eof();
            }
            Some(_) => {
                // Check for PUBLIC or SYSTEM keyword.
                self.reconsume();
                if self.starts_with_case_insensitive("PUBLIC") {
                    // Skip the six keyword characters that were just matched.
                    self.pos += 6;
                    self.state = State::AfterDoctypePublicKeyword;
                } else if self.starts_with_case_insensitive("SYSTEM") {
                    self.pos += 6;
                    self.state = State::AfterDoctypeSystemKeyword;
                } else {
                    // Parse error.
                    self.doctype_force_quirks = true;
                    self.next_char(); // consume the reconsumed char
                    self.state = State::BogusDoctype;
                }
            }
        }
    }

    /// After-DOCTYPE-public-keyword state.
    ///
    /// Whitespace moves on to the "before public identifier" state; a quote
    /// directly after the keyword starts an (empty) public identifier. `>`,
    /// EOF and any other character set force-quirks.
    fn state_after_doctype_public_keyword(&mut self) {
        match self.next_char() {
            Some('\t') | Some('\n') | Some('\x0C') | Some(' ') => {
                self.state = State::BeforeDoctypePublicIdentifier;
            }
            Some('"') => {
                // Parse error. Missing whitespace.
                self.doctype_public_id = Some(String::new());
                self.state = State::DoctypePublicIdentifierDoubleQuoted;
            }
            Some('\'') => {
                self.doctype_public_id = Some(String::new());
                self.state = State::DoctypePublicIdentifierSingleQuoted;
            }
            Some('>') => {
                self.doctype_force_quirks = true;
                self.state = State::Data;
                self.emit_current_doctype();
            }
            None => {
                self.doctype_force_quirks = true;
                self.emit_current_doctype();
                self.emit_eof();
            }
            Some(_) => {
                self.doctype_force_quirks = true;
                self.reconsume();
                self.state = State::BogusDoctype;
            }
        }
    }

    /// Before-DOCTYPE-public-identifier state.
    ///
    /// Skips whitespace and waits for the opening quote of the public
    /// identifier. `>`, EOF and any other character set force-quirks.
    fn state_before_doctype_public_identifier(&mut self) {
        match self.next_char() {
            Some('\t') | Some('\n') | Some('\x0C') | Some(' ') => {
                // Ignore.
            }
            Some('"') => {
                self.doctype_public_id = Some(String::new());
                self.state = State::DoctypePublicIdentifierDoubleQuoted;
            }
            Some('\'') => {
                self.doctype_public_id = Some(String::new());
                self.state = State::DoctypePublicIdentifierSingleQuoted;
            }
            Some('>') => {
                self.doctype_force_quirks = true;
                self.state = State::Data;
                self.emit_current_doctype();
            }
            None => {
                self.doctype_force_quirks = true;
                self.emit_current_doctype();
                self.emit_eof();
            }
            Some(_) => {
                self.doctype_force_quirks = true;
                self.reconsume();
                self.state = State::BogusDoctype;
            }
        }
    }

    /// DOCTYPE public identifier (double-quoted) state.
    ///
    /// Appends characters to the public identifier until the closing `"`.
    /// NUL is replaced with U+FFFD; `>` or EOF ends the DOCTYPE early with
    /// force-quirks set.
    fn state_doctype_public_identifier_double_quoted(&mut self) {
        match self.next_char() {
            Some('"') => {
                self.state = State::AfterDoctypePublicIdentifier;
            }
            Some('\0') => {
                if let Some(ref mut id) = self.doctype_public_id {
                    id.push('\u{FFFD}');
                }
            }
            Some('>') => {
                self.doctype_force_quirks = true;
                self.state = State::Data;
                self.emit_current_doctype();
            }
            None => {
                self.doctype_force_quirks = true;
                self.emit_current_doctype();
                self.emit_eof();
            }
            Some(c) => {
                if let Some(ref mut id) = self.doctype_public_id {
                    id.push(c);
                }
            }
        }
    }

    /// DOCTYPE public identifier (single-quoted) state.
    ///
    /// Same as the double-quoted variant, but terminated by `'`.
    fn state_doctype_public_identifier_single_quoted(&mut self) {
        match self.next_char() {
            Some('\'') => {
                self.state = State::AfterDoctypePublicIdentifier;
            }
            Some('\0') => {
                if let Some(ref mut id) = self.doctype_public_id {
                    id.push('\u{FFFD}');
                }
            }
            Some('>') => {
                self.doctype_force_quirks = true;
                self.state = State::Data;
                self.emit_current_doctype();
            }
            None => {
                self.doctype_force_quirks = true;
                self.emit_current_doctype();
                self.emit_eof();
            }
            Some(c) => {
                if let Some(ref mut id) = self.doctype_public_id {
                    id.push(c);
                }
            }
        }
    }

    /// After-DOCTYPE-public-identifier state.
    ///
    /// `>` emits the DOCTYPE without force-quirks. A quote directly after the
    /// public identifier starts the system identifier; EOF and any other
    /// non-whitespace character set force-quirks.
    fn state_after_doctype_public_identifier(&mut self) {
        match self.next_char() {
            Some('\t') | Some('\n') | Some('\x0C') | Some(' ') => {
                self.state = State::BetweenDoctypePublicAndSystemIdentifiers;
            }
            Some('>') => {
                self.state = State::Data;
                self.emit_current_doctype();
            }
            Some('"') => {
                // Parse error. Missing whitespace.
                self.doctype_system_id = Some(String::new());
                self.state = State::DoctypeSystemIdentifierDoubleQuoted;
            }
            Some('\'') => {
                self.doctype_system_id = Some(String::new());
                self.state = State::DoctypeSystemIdentifierSingleQuoted;
            }
            None => {
                self.doctype_force_quirks = true;
                self.emit_current_doctype();
                self.emit_eof();
            }
            Some(_) => {
                self.doctype_force_quirks = true;
                self.reconsume();
                self.state = State::BogusDoctype;
            }
        }
    }

    /// Between-DOCTYPE-public-and-system-identifiers state.
    ///
    /// Skips whitespace; `>` emits the DOCTYPE, a quote starts the system
    /// identifier, and EOF or any other character sets force-quirks.
    fn state_between_doctype_public_and_system_identifiers(&mut self) {
        match self.next_char() {
            Some('\t') | Some('\n') | Some('\x0C') | Some(' ') => {
                // Ignore.
            }
            Some('>') => {
                self.state = State::Data;
                self.emit_current_doctype();
            }
            Some('"') => {
                self.doctype_system_id = Some(String::new());
                self.state = State::DoctypeSystemIdentifierDoubleQuoted;
            }
            Some('\'') => {
                self.doctype_system_id = Some(String::new());
                self.state = State::DoctypeSystemIdentifierSingleQuoted;
            }
            None => {
                self.doctype_force_quirks = true;
                self.emit_current_doctype();
                self.emit_eof();
            }
            Some(_) => {
                self.doctype_force_quirks = true;
                self.reconsume();
                self.state = State::BogusDoctype;
            }
        }
    }

    /// After-DOCTYPE-system-keyword state.
    ///
    /// Mirrors the public-keyword state, but for the system identifier.
    fn state_after_doctype_system_keyword(&mut self) {
        match self.next_char() {
            Some('\t') | Some('\n') | Some('\x0C') | Some(' ') => {
                self.state = State::BeforeDoctypeSystemIdentifier;
            }
            Some('"') => {
                self.doctype_system_id = Some(String::new());
                self.state = State::DoctypeSystemIdentifierDoubleQuoted;
            }
            Some('\'') => {
                self.doctype_system_id = Some(String::new());
                self.state = State::DoctypeSystemIdentifierSingleQuoted;
            }
            Some('>') => {
                self.doctype_force_quirks = true;
                self.state = State::Data;
                self.emit_current_doctype();
            }
            None => {
                self.doctype_force_quirks = true;
                self.emit_current_doctype();
                self.emit_eof();
            }
            Some(_) => {
                self.doctype_force_quirks = true;
                self.reconsume();
                self.state = State::BogusDoctype;
            }
        }
    }

    /// Before-DOCTYPE-system-identifier state.
    ///
    /// Skips whitespace and waits for the opening quote of the system
    /// identifier. `>`, EOF and any other character set force-quirks.
    fn state_before_doctype_system_identifier(&mut self) {
        match self.next_char() {
            Some('\t') | Some('\n') | Some('\x0C') | Some(' ') => {
                // Ignore.
            }
            Some('"') => {
                self.doctype_system_id = Some(String::new());
                self.state = State::DoctypeSystemIdentifierDoubleQuoted;
            }
            Some('\'') => {
                self.doctype_system_id = Some(String::new());
                self.state = State::DoctypeSystemIdentifierSingleQuoted;
            }
            Some('>') => {
                self.doctype_force_quirks = true;
                self.state = State::Data;
                self.emit_current_doctype();
            }
            None => {
                self.doctype_force_quirks = true;
                self.emit_current_doctype();
                self.emit_eof();
            }
            Some(_) => {
                self.doctype_force_quirks = true;
                self.reconsume();
                self.state = State::BogusDoctype;
            }
        }
    }

    /// DOCTYPE system identifier (double-quoted) state.
    ///
    /// Appends characters to the system identifier until the closing `"`.
    /// NUL is replaced with U+FFFD; `>` or EOF ends the DOCTYPE early with
    /// force-quirks set.
    fn state_doctype_system_identifier_double_quoted(&mut self) {
        match self.next_char() {
            Some('"') => {
                self.state = State::AfterDoctypeSystemIdentifier;
            }
            Some('\0') => {
                if let Some(ref mut id) = self.doctype_system_id {
                    id.push('\u{FFFD}');
                }
            }
            Some('>') => {
                self.doctype_force_quirks = true;
                self.state = State::Data;
                self.emit_current_doctype();
            }
            None => {
                self.doctype_force_quirks = true;
                self.emit_current_doctype();
                self.emit_eof();
            }
            Some(c) => {
                if let Some(ref mut id) = self.doctype_system_id {
                    id.push(c);
                }
            }
        }
    }

    /// DOCTYPE system identifier (single-quoted) state.
    ///
    /// Same as the double-quoted variant, but terminated by `'`.
    fn state_doctype_system_identifier_single_quoted(&mut self) {
        match self.next_char() {
            Some('\'') => {
                self.state = State::AfterDoctypeSystemIdentifier;
            }
            Some('\0') => {
                if let Some(ref mut id) = self.doctype_system_id {
                    id.push('\u{FFFD}');
                }
            }
            Some('>') => {
                self.doctype_force_quirks = true;
                self.state = State::Data;
                self.emit_current_doctype();
            }
            None => {
                self.doctype_force_quirks = true;
                self.emit_current_doctype();
                self.emit_eof();
            }
            Some(c) => {
                if let Some(ref mut id) = self.doctype_system_id {
                    id.push(c);
                }
            }
        }
    }

    /// After-DOCTYPE-system-identifier state.
    ///
    /// Skips whitespace and emits the DOCTYPE on `>`. Trailing garbage goes
    /// to the bogus DOCTYPE state without touching force-quirks.
    fn state_after_doctype_system_identifier(&mut self) {
        match self.next_char() {
            Some('\t') | Some('\n') | Some('\x0C') | Some(' ') => {
                // Ignore.
            }
            Some('>') => {
                self.state = State::Data;
                self.emit_current_doctype();
            }
            None => {
                self.doctype_force_quirks = true;
                self.emit_current_doctype();
                self.emit_eof();
            }
            Some(_) => {
                // Parse error, but do NOT set force_quirks.
                self.reconsume();
                self.state = State::BogusDoctype;
            }
        }
    }

    /// Bogus DOCTYPE state.
    ///
    /// Discards everything up to the next `>` (or EOF) and then emits the
    /// DOCTYPE as it stands; force-quirks is left as the caller set it.
    fn state_bogus_doctype(&mut self) {
        match self.next_char() {
            Some('>') => {
                self.state = State::Data;
                self.emit_current_doctype();
            }
            Some('\0') => {
                // Parse error. Ignore.
            }
            None => {
                self.emit_current_doctype();
                self.emit_eof();
            }
            Some(_) => {
                // Ignore.
            }
        }
    }

    // --- Character reference states ---

    /// Character reference state, entered after a `&` has been consumed.
    ///
    /// Resets `temp_buf` to `"&"` and dispatches on the next character
    /// without consuming it (except for `#`, which is consumed and recorded).
    fn state_character_reference(&mut self) {
        self.temp_buf.clear();
        self.temp_buf.push('&');

        match self.peek_char() {
            Some(c) if c.is_ascii_alphanumeric() => {
                self.state = State::NamedCharacterReference;
            }
            Some('#') => {
                self.temp_buf.push('#');
                self.next_char();
                self.state = State::NumericCharacterReference;
            }
            _ => {
                // Not a character reference. Flush '&' to return state.
                self.flush_char_ref("&");
                self.state = self.return_state;
            }
        }
    }

    /// Named character reference state.
    ///
    /// Collects the run of ASCII alphanumerics following `&`, then resolves
    /// it against the entity table:
    ///
    /// 1. If the run is followed by `;`, the whole run is looked up.
    /// 2. Otherwise (or if that fails) progressively shorter prefixes are
    ///    tried, longest first, accepting only entities that
    ///    `entities::is_legacy_entity` allows without a semicolon.
    ///
    /// On a match the input position is rewound to just past the matched
    /// name (plus `;` if present) and the replacement text is flushed. Inside
    /// an attribute value, a semicolon-less match followed by `=` or an
    /// alphanumeric is flushed as literal text instead. With no match the
    /// `&` and the collected characters are flushed unchanged.
    fn state_named_character_reference(&mut self) {
        // Collect alphanumeric characters to form the entity name.
        // Per spec, entity names can also contain digits after the first char.
        let mut name = String::new();
        let start_pos = self.pos;

        while let Some(c) = self.peek_char() {
            if c.is_ascii_alphanumeric() {
                name.push(c);
                self.pos += 1;
            } else {
                break;
            }
        }

        // Try to find a match, trying longest match first.
        // First check if the full name + semicolon matches.
        let has_trailing_semi = self.peek_char() == Some(';');

        let mut matched_value: Option<&str> = None;
        let mut matched_len = 0;

        // Try the full name first (with semicolon if present).
        if has_trailing_semi {
            if let Some(val) = entities::lookup_entity(&name) {
                matched_value = Some(val);
                matched_len = name.len();
            }
        }

        // If no match with full name, try progressively shorter prefixes.
        // The first iteration (`i == name.len()`) is the full name without a
        // semicolon, so no separate lookup is needed for that case.
        if matched_value.is_none() {
            for i in (1..=name.len()).rev() {
                // `name` holds only ASCII, so byte offsets are char offsets.
                let candidate = &name[..i];
                if let Some(val) = entities::lookup_entity(candidate) {
                    // Without semicolon, only legacy entities are recognized.
                    if entities::is_legacy_entity(candidate) {
                        matched_value = Some(val);
                        matched_len = i;
                        break;
                    }
                }
            }
        }

        if let Some(value) = matched_value {
            // Rewind to just after the matched portion.
            self.pos = start_pos + matched_len;

            // Check for semicolon after the matched portion.
            let has_semi = self.peek_char() == Some(';');
            if has_semi {
                self.pos += 1;
            }

            // Per spec: if consumed as part of an attribute and the character
            // after the match is `=` or alphanumeric, and no semicolon,
            // flush the original text instead.
            let in_attribute = matches!(
                self.return_state,
                State::AttributeValueDoubleQuoted
                    | State::AttributeValueSingleQuoted
                    | State::AttributeValueUnquoted
            );

            if !has_semi && in_attribute {
                if let Some(next) = self.peek_char() {
                    if next == '=' || next.is_ascii_alphanumeric() {
                        // Not a reference. Flush original text.
                        let mut original = "&".to_string();
                        original.push_str(&name[..matched_len]);
                        self.flush_char_ref(&original);
                        self.state = self.return_state;
                        return;
                    }
                }
            }

            self.flush_char_ref(value);
            self.state = self.return_state;
        } else {
            // No match. Rewind and flush '&' + all collected chars.
            self.pos = start_pos;
            self.flush_char_ref("&");
            // Stack buffer reused for every character; avoids a heap
            // allocation per flushed char.
            let mut utf8 = [0u8; 4];
            for _ in 0..name.len() {
                let c = self
                    .next_char()
                    .expect("rewound over characters that were just peeked");
                let text: &str = c.encode_utf8(&mut utf8);
                self.flush_char_ref(text);
            }
            self.state = self.return_state;
        }
    }

    /// Numeric character reference state (after `&#`).
    ///
    /// Resets the accumulated code point and selects hexadecimal parsing when
    /// the next character is `x` / `X` (which is consumed and recorded in
    /// `temp_buf`), decimal parsing otherwise.
    fn state_numeric_character_reference(&mut self) {
        self.char_ref_code = 0;
        match self.peek_char() {
            Some(marker @ 'x') | Some(marker @ 'X') => {
                self.temp_buf.push(marker);
                self.next_char();
                self.state = State::HexCharacterReferenceStart;
            }
            _ => {
                self.state = State::DecCharacterReferenceStart;
            }
        }
    }

    /// Hexadecimal character reference start state.
    ///
    /// Requires at least one hex digit; otherwise the text collected so far
    /// (`temp_buf`) is flushed literally.
    fn state_hex_character_reference_start(&mut self) {
        match self.peek_char() {
            Some(c) if c.is_ascii_hexdigit() => {
                self.state = State::HexCharacterReference;
            }
            _ => {
                // Parse error. Flush temp_buf.
                let buf = self.temp_buf.clone();
                self.flush_char_ref(&buf);
                self.state = self.return_state;
            }
        }
    }

    /// Decimal character reference start state.
    ///
    /// Requires at least one decimal digit; otherwise the text collected so
    /// far (`temp_buf`) is flushed literally.
    fn state_dec_character_reference_start(&mut self) {
        match self.peek_char() {
            Some(c) if c.is_ascii_digit() => {
                self.state = State::DecCharacterReference;
            }
            _ => {
                let buf = self.temp_buf.clone();
                self.flush_char_ref(&buf);
                self.state = self.return_state;
            }
        }
    }

    /// Hexadecimal character reference state.
    ///
    /// Accumulates hex digits into `char_ref_code`, clamping at `0x110000`
    /// so that arbitrarily long digit runs cannot overflow. `;` ends the
    /// reference; any other character ends it too and is reconsumed.
    fn state_hex_character_reference(&mut self) {
        match self.next_char() {
            Some(c) if c.is_ascii_hexdigit() => {
                // Cap at a value that's clearly out of range but won't overflow.
                self.char_ref_code = self
                    .char_ref_code
                    .saturating_mul(16)
                    .saturating_add(c.to_digit(16).unwrap());
                if self.char_ref_code > 0x10FFFF {
                    self.char_ref_code = 0x110000;
                }
            }
            Some(';') => {
                self.state = State::NumericCharacterReferenceEnd;
            }
            None => {
                // EOF: missing semicolon parse error. Don't reconsume.
                self.state = State::NumericCharacterReferenceEnd;
            }
            Some(_) => {
                // Parse error: missing semicolon.
                self.reconsume();
                self.state = State::NumericCharacterReferenceEnd;
            }
        }
    }

    /// Decimal character reference state.
    ///
    /// Decimal counterpart of the hexadecimal state, with the same clamping
    /// and termination rules.
    fn state_dec_character_reference(&mut self) {
        match self.next_char() {
            Some(c) if c.is_ascii_digit() => {
                self.char_ref_code = self
                    .char_ref_code
                    .saturating_mul(10)
                    .saturating_add(c.to_digit(10).unwrap());
                if self.char_ref_code > 0x10FFFF {
                    self.char_ref_code = 0x110000;
                }
            }
            Some(';') => {
                self.state = State::NumericCharacterReferenceEnd;
            }
            None => {
                // EOF: missing semicolon parse error. Don't reconsume.
                self.state = State::NumericCharacterReferenceEnd;
            }
            Some(_) => {
                self.reconsume();
                self.state = State::NumericCharacterReferenceEnd;
            }
        }
    }

    /// Numeric character reference end state.
    ///
    /// Maps the accumulated code to the character to emit: zero, surrogates
    /// and out-of-range values become U+FFFD, the listed codes in
    /// `0x80..=0x9F` are remapped through the Windows-1252 table, and
    /// everything else is used as-is. The result is flushed and the
    /// tokenizer returns to `return_state`.
    fn state_numeric_character_reference_end(&mut self) {
        let code = self.char_ref_code;
        let ch = match code {
            0 => '\u{FFFD}',
            // Surrogate range.
            0xD800..=0xDFFF => '\u{FFFD}',
            // Out of Unicode range.
            c if c > 0x10FFFF => '\u{FFFD}',
            // Windows-1252 replacement table for 0x80..0x9F.
            0x80 => '\u{20AC}',
            0x82 => '\u{201A}',
            0x83 => '\u{0192}',
            0x84 => '\u{201E}',
            0x85 => '\u{2026}',
            0x86 => '\u{2020}',
            0x87 => '\u{2021}',
            0x88 => '\u{02C6}',
            0x89 => '\u{2030}',
            0x8A => '\u{0160}',
            0x8B => '\u{2039}',
            0x8C => '\u{0152}',
            0x8E => '\u{017D}',
            0x91 => '\u{2018}',
            0x92 => '\u{2019}',
            0x93 => '\u{201C}',
            0x94 => '\u{201D}',
            0x95 => '\u{2022}',
            0x96 => '\u{2013}',
            0x97 => '\u{2014}',
            0x98 => '\u{02DC}',
            0x99 => '\u{2122}',
            0x9A => '\u{0161}',
            0x9B => '\u{203A}',
            0x9C => '\u{0153}',
            0x9E => '\u{017E}',
            0x9F => '\u{0178}',
            c => char::from_u32(c).unwrap_or('\u{FFFD}'),
        };

        // Encode onto the stack instead of allocating a one-character String.
        let mut utf8 = [0u8; 4];
        let text: &str = ch.encode_utf8(&mut utf8);
        self.flush_char_ref(text);
        self.state = self.return_state;
    }

    // --- Helpers ---

    /// Returns `true` if the input at the current position begins with `s`
    /// (exact, case-sensitive match). Returns `false` if fewer characters
    /// than `s` remain.
    fn starts_with(&self, s: &str) -> bool {
        let len = self.input.len();
        // Compare lazily, character by character, without collecting `s`.
        s.chars().enumerate().all(|(i, expected)| {
            let idx = self.pos + i;
            idx < len && self.input[idx] == expected
        })
    }

    /// Returns `true` if the input at the current position begins with `s`,
    /// ignoring ASCII case. Returns `false` if fewer characters than `s`
    /// remain.
    fn starts_with_case_insensitive(&self, s: &str) -> bool {
        let len = self.input.len();
        s.chars().enumerate().all(|(i, expected)| {
            let idx = self.pos + i;
            idx < len && self.input[idx].eq_ignore_ascii_case(&expected)
        })
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::tokenize;

    #[test]
    fn empty_input() {
        let tokens = tokenize("");
        assert!(tokens.is_empty());
    }

    #[test]
    fn plain_text() {
        let tokens = tokenize("Hello, world!");
        assert_eq!(tokens, vec![Token::Character("Hello, world!".to_string())]);
    }

    #[test]
    fn simple_element() {
        let tokens = tokenize("<p>Hello</p>");
        assert_eq!(
            tokens,
            vec![
                Token::StartTag {
                    name: "p".to_string(),
                    attributes: vec![],
                    self_closing: false,
                },
                Token::Character("Hello".to_string()),
                Token::EndTag {
                    name: "p".to_string(),
                },
            ]
        );
    }

    #[test]
    fn self_closing_tag() {
        let tokens = tokenize("<br/>");
        assert_eq!(
            tokens,
            vec![Token::StartTag {
                name: "br".to_string(),
                attributes: vec![],
                self_closing: true,
            }]
        );
    }

    #[test]
    fn self_closing_img() {
        let tokens = tokenize("<img/>");
        assert_eq!(
            tokens,
            vec![Token::StartTag {
                name: "img".to_string(),
                attributes: vec![],
                self_closing: true,
            }]
        );
    }

    #[test]
    fn tag_with_attributes() {
        let tokens = tokenize(r#"<a href="url" class="link">"#);
        assert_eq!(
            tokens,
            vec![Token::StartTag {
                name: "a".to_string(),
                attributes: vec![
                    ("href".to_string(), "url".to_string()),
                    ("class".to_string(), "link".to_string()),
                ],
                self_closing: false,
            }]
        );
    }

    #[test]
    fn tag_with_single_quoted_attributes() {
        let tokens = tokenize("<div id='main'>");
        assert_eq!(
            tokens,
            vec![Token::StartTag {
                name: "div".to_string(),
                attributes: vec![("id".to_string(), "main".to_string())],
                self_closing: false,
            }]
        );
    }

    #[test]
    fn tag_with_unquoted_attribute() {
        let tokens = tokenize("<input type=text>");
        assert_eq!(
            tokens,
            vec![Token::StartTag {
                name: "input".to_string(),
                attributes: vec![("type".to_string(), "text".to_string())],
                self_closing: false,
            }]
        );
    }

    #[test]
    fn comment() {
        let tokens = tokenize("<!-- comment -->");
        assert_eq!(tokens, vec![Token::Comment(" comment ".to_string())]);
    }

    #[test]
    fn empty_comment() {
        let tokens = tokenize("<!---->");
        assert_eq!(tokens, vec![Token::Comment("".to_string())]);
    }

    #[test]
    fn doctype_html() {
        let tokens = tokenize("<!DOCTYPE html>");
        assert_eq!(
            tokens,
            vec![Token::Doctype {
                name: Some("html".to_string()),
                public_id: None,
                system_id: None,
                force_quirks: false,
            }]
        );
    }

    #[test]
    fn doctype_case_insensitive() {
        let tokens = tokenize("<!doctype html>");
        assert_eq!(
            tokens,
            vec![Token::Doctype {
                name: Some("html".to_string()),
                public_id: None,
                system_id: None,
                force_quirks: false,
            }]
        );
    }

    #[test]
    fn char_ref_named() {
        let tokens = tokenize("&amp;&lt;&gt;&quot;");
        assert_eq!(tokens, vec![Token::Character("&<>\"".to_string())]);
    }

    #[test]
    fn char_ref_numeric_decimal() {
        let tokens = tokenize("&#65;");
        assert_eq!(tokens, vec![Token::Character("A".to_string())]);
    }

    #[test]
    fn char_ref_numeric_hex() {
        let tokens = tokenize("&#x41;");
        assert_eq!(tokens, vec![Token::Character("A".to_string())]);
    }

    #[test]
    fn char_ref_numeric_hex_uppercase() {
        let tokens = tokenize("&#X41;");
        assert_eq!(tokens, vec![Token::Character("A".to_string())]);
    }

    #[test]
    fn full_html_document() {
        let tokens =
            tokenize("<html><head><title>Test</title></head><body><p>Hello</p></body></html>");
        assert_eq!(
            tokens,
            vec![
                Token::StartTag {
                    name: "html".to_string(),
                    attributes: vec![],
                    self_closing: false,
                },
                Token::StartTag {
                    name: "head".to_string(),
                    attributes: vec![],
                    self_closing: false,
                },
                Token::StartTag {
                    name: "title".to_string(),
                    attributes: vec![],
                    self_closing: false,
                },
                Token::Character("Test".to_string()),
                Token::EndTag {
                    name: "title".to_string(),
                },
                Token::EndTag {
                    name: "head".to_string(),
                },
                Token::StartTag {
                    name: "body".to_string(),
                    attributes: vec![],
                    self_closing: false,
                },
                Token::StartTag {
                    name: "p".to_string(),
                    attributes: vec![],
                    self_closing: false,
                },
                Token::Character("Hello".to_string()),
                Token::EndTag {
                    name: "p".to_string(),
                },
                Token::EndTag {
                    name: "body".to_string(),
                },
                Token::EndTag {
                    name: "html".to_string(),
                },
            ]
        );
    }

    #[test]
    fn uppercase_tag_names_lowercased() {
        let tokens = tokenize("<DIV></DIV>");
        assert_eq!(
            tokens,
            vec![
                Token::StartTag {
                    name: "div".to_string(),
                    attributes: vec![],
                    self_closing: false,
                },
                Token::EndTag {
                    name: "div".to_string(),
                },
            ]
        );
    }

    #[test]
    fn uppercase_attribute_names_lowercased() {
        let tokens = tokenize(r#"<div CLASS="x">"#);
        assert_eq!(
            tokens,
            vec![Token::StartTag {
                name: "div".to_string(),
                attributes: vec![("class".to_string(), "x".to_string())],
                self_closing: false,
            }]
        );
    }

    #[test]
    fn duplicate_attributes_first_wins() {
        let tokens = tokenize(r#"<div class="a" class="b">"#);
        assert_eq!(
            tokens,
            vec![Token::StartTag {
                name: "div".to_string(),
                attributes: vec![("class".to_string(), "a".to_string())],
                self_closing: false,
            }]
        );
    }

    #[test]
    fn char_ref_in_attribute() {
        let tokens = tokenize(r#"<a href="?a=1&amp;b=2">"#);
        assert_eq!(
            tokens,
            vec![Token::StartTag {
                name: "a".to_string(),
                attributes: vec![("href".to_string(), "?a=1&b=2".to_string())],
                self_closing: false,
            }]
        );
    }

    #[test]
    fn multiple_attributes() {
        let tokens = tokenize(r#"<input type="text" name="foo" value="bar">"#);
        assert_eq!(
            tokens,
            vec![Token::StartTag {
                name: "input".to_string(),
                attributes: vec![
                    ("type".to_string(), "text".to_string()),
                    ("name".to_string(), "foo".to_string()),
                    ("value".to_string(), "bar".to_string()),
                ],
                self_closing: false,
            }]
        );
    }

    #[test]
    fn boolean_attribute() {
        let tokens = tokenize("<input disabled>");
        assert_eq!(
            tokens,
            vec![Token::StartTag {
                name: "input".to_string(),
                attributes: vec![("disabled".to_string(), "".to_string())],
                self_closing: false,
            }]
        );
    }

    #[test]
    fn mixed_content() {
        let tokens = tokenize("Hello <!-- comment --> World");
        assert_eq!(
            tokens,
            vec![
                Token::Character("Hello ".to_string()),
                Token::Comment(" comment ".to_string()),
                Token::Character(" World".to_string()),
            ]
        );
    }

    #[test]
    fn doctype_with_public_id() {
        let tokens = tokenize(
            r#"<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">"#,
        );
        assert_eq!(
            tokens,
            vec![Token::Doctype {
                name: Some("html".to_string()),
                public_id: Some("-//W3C//DTD XHTML 1.0 Strict//EN".to_string()),
                system_id: Some("http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd".to_string()),
                force_quirks: false,
            }]
        );
    }

    #[test]
    fn null_in_text() {
        let tokens = tokenize("a\0b");
        assert_eq!(tokens, vec![Token::Character("a\u{FFFD}b".to_string())]);
    }

    #[test]
    fn windows_1252_numeric_refs() {
        // &#128; should map to Euro sign.
        let tokens = tokenize("&#128;");
        assert_eq!(tokens, vec![Token::Character("\u{20AC}".to_string())]);
    }

    #[test]
    fn attribute_with_empty_value() {
        let tokens = tokenize(r#"<div class="">"#);
        assert_eq!(
            tokens,
            vec![Token::StartTag {
                name: "div".to_string(),
                attributes: vec![("class".to_string(), "".to_string())],
                self_closing: false,
            }]
        );
    }

    #[test]
    fn adjacent_tags() {
        let tokens = tokenize("<b></b><i></i>");
        assert_eq!(
            tokens,
            vec![
                Token::StartTag {
                    name: "b".to_string(),
                    attributes: vec![],
                    self_closing: false,
                },
                Token::EndTag {
                    name: "b".to_string(),
                },
                Token::StartTag {
                    name: "i".to_string(),
                    attributes: vec![],
                    self_closing: false,
                },
                Token::EndTag {
                    name: "i".to_string(),
                },
            ]
        );
    }

    #[test]
    fn newlines_in_text() {
        let tokens = tokenize("line1\nline2\nline3");
        assert_eq!(
            tokens,
            vec![Token::Character("line1\nline2\nline3".to_string())]
        );
    }

    #[test]
    fn self_closing_with_attribute() {
        let tokens = tokenize(r#"<img src="test.png"/>"#);
        assert_eq!(
            tokens,
            vec![Token::StartTag {
                name: "img".to_string(),
                attributes: vec![("src".to_string(), "test.png".to_string())],
                self_closing: true,
            }]
        );
    }

    #[test]
    fn less_than_in_text_not_tag() {
        // A bare '<' not followed by a letter should be emitted as text.
        let tokens = tokenize("1 < 2");
        assert_eq!(tokens, vec![Token::Character("1 < 2".to_string())]);
    }

    #[test]
    fn ampersand_not_entity() {
        let tokens = tokenize("a & b");
        assert_eq!(tokens, vec![Token::Character("a & b".to_string())]);
    }

    #[test]
    fn cdata_in_html_becomes_comment() {
        let tokens = tokenize("<![CDATA[hello]]>");
        // In HTML (non-foreign) context, CDATA is a parse error → bogus comment.
        assert_eq!(tokens, vec![Token::Comment("[CDATA[hello]]".to_string())]);
    }
}