web engine - experimental web browser
at poly1305-h4-fix 2039 lines 66 kB view raw
1//! HTML5 tokenizer state machine per WHATWG spec §13.2.5. 2 3use crate::entities; 4use crate::Token; 5 6#[derive(Debug, Clone, Copy, PartialEq)] 7enum State { 8 Data, 9 TagOpen, 10 EndTagOpen, 11 TagName, 12 BeforeAttributeName, 13 AttributeName, 14 AfterAttributeName, 15 BeforeAttributeValue, 16 AttributeValueDoubleQuoted, 17 AttributeValueSingleQuoted, 18 AttributeValueUnquoted, 19 AfterAttributeValueQuoted, 20 SelfClosingStartTag, 21 BogusComment, 22 MarkupDeclarationOpen, 23 CommentStart, 24 CommentStartDash, 25 Comment, 26 CommentLessThanSign, 27 CommentLessThanSignBang, 28 CommentLessThanSignBangDash, 29 CommentLessThanSignBangDashDash, 30 CommentEndDash, 31 CommentEnd, 32 CommentEndBang, 33 Doctype, 34 BeforeDoctypeName, 35 DoctypeName, 36 AfterDoctypeName, 37 AfterDoctypePublicKeyword, 38 BeforeDoctypePublicIdentifier, 39 DoctypePublicIdentifierDoubleQuoted, 40 DoctypePublicIdentifierSingleQuoted, 41 AfterDoctypePublicIdentifier, 42 BetweenDoctypePublicAndSystemIdentifiers, 43 AfterDoctypeSystemKeyword, 44 BeforeDoctypeSystemIdentifier, 45 DoctypeSystemIdentifierDoubleQuoted, 46 DoctypeSystemIdentifierSingleQuoted, 47 AfterDoctypeSystemIdentifier, 48 BogusDoctype, 49 CharacterReference, 50 NumericCharacterReference, 51 HexCharacterReferenceStart, 52 DecCharacterReferenceStart, 53 HexCharacterReference, 54 DecCharacterReference, 55 NumericCharacterReferenceEnd, 56 NamedCharacterReference, 57} 58 59/// HTML5 tokenizer state machine. 60pub struct Tokenizer { 61 input: Vec<char>, 62 pos: usize, 63 state: State, 64 return_state: State, 65 pending: Vec<Token>, 66 /// Current tag being built. 67 tag_name: String, 68 tag_self_closing: bool, 69 tag_is_end: bool, 70 tag_attributes: Vec<(String, String)>, 71 current_attr_name: String, 72 current_attr_value: String, 73 /// Current comment or doctype being built. 
74 comment_data: String, 75 doctype_name: Option<String>, 76 doctype_public_id: Option<String>, 77 doctype_system_id: Option<String>, 78 doctype_force_quirks: bool, 79 /// Character reference accumulator. 80 char_ref_code: u32, 81 temp_buf: String, 82} 83 84impl Tokenizer { 85 /// Create a new tokenizer for the given input. 86 pub fn new(input: &str) -> Self { 87 Tokenizer { 88 input: input.chars().collect(), 89 pos: 0, 90 state: State::Data, 91 return_state: State::Data, 92 pending: Vec::new(), 93 tag_name: String::new(), 94 tag_self_closing: false, 95 tag_is_end: false, 96 tag_attributes: Vec::new(), 97 current_attr_name: String::new(), 98 current_attr_value: String::new(), 99 comment_data: String::new(), 100 doctype_name: None, 101 doctype_public_id: None, 102 doctype_system_id: None, 103 doctype_force_quirks: false, 104 char_ref_code: 0, 105 temp_buf: String::new(), 106 } 107 } 108 109 /// Return the next token from the input. 110 pub fn next_token(&mut self) -> Token { 111 loop { 112 if let Some(token) = self.pending.pop() { 113 return token; 114 } 115 self.step(); 116 } 117 } 118 119 fn next_char(&mut self) -> Option<char> { 120 if self.pos < self.input.len() { 121 let ch = self.input[self.pos]; 122 self.pos += 1; 123 Some(ch) 124 } else { 125 None 126 } 127 } 128 129 fn peek_char(&self) -> Option<char> { 130 if self.pos < self.input.len() { 131 Some(self.input[self.pos]) 132 } else { 133 None 134 } 135 } 136 137 fn reconsume(&mut self) { 138 if self.pos > 0 { 139 self.pos -= 1; 140 } 141 } 142 143 fn emit(&mut self, token: Token) { 144 // We use a Vec as a stack, so push to front by inserting at 0. 145 self.pending.insert(0, token); 146 } 147 148 fn emit_current_tag(&mut self) { 149 // Finalize the current attribute if there is one. 
150 self.finish_attribute(); 151 152 if self.tag_is_end { 153 self.emit(Token::EndTag { 154 name: self.tag_name.clone(), 155 }); 156 } else { 157 self.emit(Token::StartTag { 158 name: self.tag_name.clone(), 159 attributes: self.tag_attributes.clone(), 160 self_closing: self.tag_self_closing, 161 }); 162 } 163 } 164 165 fn emit_current_comment(&mut self) { 166 self.emit(Token::Comment(self.comment_data.clone())); 167 } 168 169 fn emit_current_doctype(&mut self) { 170 self.emit(Token::Doctype { 171 name: self.doctype_name.clone(), 172 public_id: self.doctype_public_id.clone(), 173 system_id: self.doctype_system_id.clone(), 174 force_quirks: self.doctype_force_quirks, 175 }); 176 } 177 178 fn emit_char(&mut self, ch: char) { 179 self.emit(Token::Character(ch.to_string())); 180 } 181 182 fn emit_eof(&mut self) { 183 self.emit(Token::Eof); 184 } 185 186 fn start_new_tag(&mut self, is_end: bool) { 187 self.tag_name.clear(); 188 self.tag_self_closing = false; 189 self.tag_is_end = is_end; 190 self.tag_attributes.clear(); 191 self.current_attr_name.clear(); 192 self.current_attr_value.clear(); 193 } 194 195 fn start_new_attribute(&mut self) { 196 self.finish_attribute(); 197 self.current_attr_name.clear(); 198 self.current_attr_value.clear(); 199 } 200 201 fn finish_attribute(&mut self) { 202 if !self.current_attr_name.is_empty() { 203 // Per spec: if duplicate attribute name, ignore the later one. 204 let name = self.current_attr_name.clone(); 205 if !self.tag_attributes.iter().any(|(n, _)| n == &name) { 206 self.tag_attributes 207 .push((name, self.current_attr_value.clone())); 208 } 209 self.current_attr_name.clear(); 210 self.current_attr_value.clear(); 211 } 212 } 213 214 /// Flush character reference code to the return state. 
215 fn flush_char_ref(&mut self, s: &str) { 216 match self.return_state { 217 State::AttributeValueDoubleQuoted 218 | State::AttributeValueSingleQuoted 219 | State::AttributeValueUnquoted => { 220 self.current_attr_value.push_str(s); 221 } 222 _ => { 223 for ch in s.chars() { 224 self.emit_char(ch); 225 } 226 } 227 } 228 } 229 230 fn step(&mut self) { 231 match self.state { 232 State::Data => self.state_data(), 233 State::TagOpen => self.state_tag_open(), 234 State::EndTagOpen => self.state_end_tag_open(), 235 State::TagName => self.state_tag_name(), 236 State::BeforeAttributeName => self.state_before_attribute_name(), 237 State::AttributeName => self.state_attribute_name(), 238 State::AfterAttributeName => self.state_after_attribute_name(), 239 State::BeforeAttributeValue => self.state_before_attribute_value(), 240 State::AttributeValueDoubleQuoted => self.state_attribute_value_double_quoted(), 241 State::AttributeValueSingleQuoted => self.state_attribute_value_single_quoted(), 242 State::AttributeValueUnquoted => self.state_attribute_value_unquoted(), 243 State::AfterAttributeValueQuoted => self.state_after_attribute_value_quoted(), 244 State::SelfClosingStartTag => self.state_self_closing_start_tag(), 245 State::BogusComment => self.state_bogus_comment(), 246 State::MarkupDeclarationOpen => self.state_markup_declaration_open(), 247 State::CommentStart => self.state_comment_start(), 248 State::CommentStartDash => self.state_comment_start_dash(), 249 State::Comment => self.state_comment(), 250 State::CommentLessThanSign => self.state_comment_less_than_sign(), 251 State::CommentLessThanSignBang => self.state_comment_less_than_sign_bang(), 252 State::CommentLessThanSignBangDash => self.state_comment_less_than_sign_bang_dash(), 253 State::CommentLessThanSignBangDashDash => { 254 self.state_comment_less_than_sign_bang_dash_dash() 255 } 256 State::CommentEndDash => self.state_comment_end_dash(), 257 State::CommentEnd => self.state_comment_end(), 258 
State::CommentEndBang => self.state_comment_end_bang(), 259 State::Doctype => self.state_doctype(), 260 State::BeforeDoctypeName => self.state_before_doctype_name(), 261 State::DoctypeName => self.state_doctype_name(), 262 State::AfterDoctypeName => self.state_after_doctype_name(), 263 State::AfterDoctypePublicKeyword => self.state_after_doctype_public_keyword(), 264 State::BeforeDoctypePublicIdentifier => self.state_before_doctype_public_identifier(), 265 State::DoctypePublicIdentifierDoubleQuoted => { 266 self.state_doctype_public_identifier_double_quoted() 267 } 268 State::DoctypePublicIdentifierSingleQuoted => { 269 self.state_doctype_public_identifier_single_quoted() 270 } 271 State::AfterDoctypePublicIdentifier => self.state_after_doctype_public_identifier(), 272 State::BetweenDoctypePublicAndSystemIdentifiers => { 273 self.state_between_doctype_public_and_system_identifiers() 274 } 275 State::AfterDoctypeSystemKeyword => self.state_after_doctype_system_keyword(), 276 State::BeforeDoctypeSystemIdentifier => self.state_before_doctype_system_identifier(), 277 State::DoctypeSystemIdentifierDoubleQuoted => { 278 self.state_doctype_system_identifier_double_quoted() 279 } 280 State::DoctypeSystemIdentifierSingleQuoted => { 281 self.state_doctype_system_identifier_single_quoted() 282 } 283 State::AfterDoctypeSystemIdentifier => self.state_after_doctype_system_identifier(), 284 State::BogusDoctype => self.state_bogus_doctype(), 285 State::CharacterReference => self.state_character_reference(), 286 State::NumericCharacterReference => self.state_numeric_character_reference(), 287 State::HexCharacterReferenceStart => self.state_hex_character_reference_start(), 288 State::DecCharacterReferenceStart => self.state_dec_character_reference_start(), 289 State::HexCharacterReference => self.state_hex_character_reference(), 290 State::DecCharacterReference => self.state_dec_character_reference(), 291 State::NumericCharacterReferenceEnd => 
self.state_numeric_character_reference_end(), 292 State::NamedCharacterReference => self.state_named_character_reference(), 293 } 294 } 295 296 // --- State implementations --- 297 298 fn state_data(&mut self) { 299 match self.next_char() { 300 Some('&') => { 301 self.return_state = State::Data; 302 self.state = State::CharacterReference; 303 } 304 Some('<') => { 305 self.state = State::TagOpen; 306 } 307 Some('\0') => { 308 // Parse error. Emit replacement character. 309 self.emit_char('\u{FFFD}'); 310 } 311 None => { 312 self.emit_eof(); 313 } 314 Some(c) => { 315 self.emit_char(c); 316 } 317 } 318 } 319 320 fn state_tag_open(&mut self) { 321 match self.next_char() { 322 Some('!') => { 323 self.state = State::MarkupDeclarationOpen; 324 } 325 Some('/') => { 326 self.state = State::EndTagOpen; 327 } 328 Some(c) if c.is_ascii_alphabetic() => { 329 self.start_new_tag(false); 330 self.reconsume(); 331 self.state = State::TagName; 332 } 333 Some('?') => { 334 // Parse error. Create a comment token. 335 self.comment_data.clear(); 336 self.reconsume(); 337 self.state = State::BogusComment; 338 } 339 None => { 340 // Parse error. Emit '<' and EOF. 341 self.emit_char('<'); 342 self.emit_eof(); 343 } 344 Some(_) => { 345 // Parse error. Emit '<' and reconsume. 346 self.emit_char('<'); 347 self.reconsume(); 348 self.state = State::Data; 349 } 350 } 351 } 352 353 fn state_end_tag_open(&mut self) { 354 match self.next_char() { 355 Some(c) if c.is_ascii_alphabetic() => { 356 self.start_new_tag(true); 357 self.reconsume(); 358 self.state = State::TagName; 359 } 360 Some('>') => { 361 // Parse error. Switch to data state. 362 self.state = State::Data; 363 } 364 None => { 365 self.emit_char('<'); 366 self.emit_char('/'); 367 self.emit_eof(); 368 } 369 Some(_) => { 370 // Parse error. Create a comment. 
371 self.comment_data.clear(); 372 self.reconsume(); 373 self.state = State::BogusComment; 374 } 375 } 376 } 377 378 fn state_tag_name(&mut self) { 379 match self.next_char() { 380 Some('\t') | Some('\n') | Some('\x0C') | Some(' ') => { 381 self.state = State::BeforeAttributeName; 382 } 383 Some('/') => { 384 self.state = State::SelfClosingStartTag; 385 } 386 Some('>') => { 387 self.state = State::Data; 388 self.emit_current_tag(); 389 } 390 Some(c) if c.is_ascii_uppercase() => { 391 self.tag_name.push(c.to_ascii_lowercase()); 392 } 393 Some('\0') => { 394 self.tag_name.push('\u{FFFD}'); 395 } 396 None => { 397 self.emit_eof(); 398 } 399 Some(c) => { 400 self.tag_name.push(c); 401 } 402 } 403 } 404 405 fn state_before_attribute_name(&mut self) { 406 match self.next_char() { 407 Some('\t') | Some('\n') | Some('\x0C') | Some(' ') => { 408 // Ignore whitespace. 409 } 410 Some('/') | Some('>') => { 411 self.reconsume(); 412 self.state = State::AfterAttributeName; 413 } 414 None => { 415 // EOF: go to AfterAttributeName without reconsuming. 416 self.state = State::AfterAttributeName; 417 } 418 Some('=') => { 419 // Parse error. Start a new attribute with '=' as name. 
420 self.start_new_attribute(); 421 self.current_attr_name.push('='); 422 self.state = State::AttributeName; 423 } 424 Some(_) => { 425 self.start_new_attribute(); 426 self.reconsume(); 427 self.state = State::AttributeName; 428 } 429 } 430 } 431 432 fn state_attribute_name(&mut self) { 433 match self.next_char() { 434 Some('\t') | Some('\n') | Some('\x0C') | Some(' ') | Some('/') | Some('>') => { 435 self.reconsume(); 436 self.state = State::AfterAttributeName; 437 } 438 None => { 439 self.state = State::AfterAttributeName; 440 } 441 Some('=') => { 442 self.state = State::BeforeAttributeValue; 443 } 444 Some(c) if c.is_ascii_uppercase() => { 445 self.current_attr_name.push(c.to_ascii_lowercase()); 446 } 447 Some('\0') => { 448 self.current_attr_name.push('\u{FFFD}'); 449 } 450 Some(c) => { 451 self.current_attr_name.push(c); 452 } 453 } 454 } 455 456 fn state_after_attribute_name(&mut self) { 457 match self.next_char() { 458 Some('\t') | Some('\n') | Some('\x0C') | Some(' ') => { 459 // Ignore. 460 } 461 Some('/') => { 462 self.state = State::SelfClosingStartTag; 463 } 464 Some('=') => { 465 self.state = State::BeforeAttributeValue; 466 } 467 Some('>') => { 468 self.state = State::Data; 469 self.emit_current_tag(); 470 } 471 None => { 472 self.emit_eof(); 473 } 474 Some(_) => { 475 self.start_new_attribute(); 476 self.reconsume(); 477 self.state = State::AttributeName; 478 } 479 } 480 } 481 482 fn state_before_attribute_value(&mut self) { 483 match self.next_char() { 484 Some('\t') | Some('\n') | Some('\x0C') | Some(' ') => { 485 // Ignore. 486 } 487 Some('"') => { 488 self.state = State::AttributeValueDoubleQuoted; 489 } 490 Some('\'') => { 491 self.state = State::AttributeValueSingleQuoted; 492 } 493 Some('>') => { 494 // Parse error. Emit tag with missing value. 
495 self.state = State::Data; 496 self.emit_current_tag(); 497 } 498 _ => { 499 self.reconsume(); 500 self.state = State::AttributeValueUnquoted; 501 } 502 } 503 } 504 505 fn state_attribute_value_double_quoted(&mut self) { 506 match self.next_char() { 507 Some('"') => { 508 self.state = State::AfterAttributeValueQuoted; 509 } 510 Some('&') => { 511 self.return_state = State::AttributeValueDoubleQuoted; 512 self.state = State::CharacterReference; 513 } 514 Some('\0') => { 515 self.current_attr_value.push('\u{FFFD}'); 516 } 517 None => { 518 self.emit_eof(); 519 } 520 Some(c) => { 521 self.current_attr_value.push(c); 522 } 523 } 524 } 525 526 fn state_attribute_value_single_quoted(&mut self) { 527 match self.next_char() { 528 Some('\'') => { 529 self.state = State::AfterAttributeValueQuoted; 530 } 531 Some('&') => { 532 self.return_state = State::AttributeValueSingleQuoted; 533 self.state = State::CharacterReference; 534 } 535 Some('\0') => { 536 self.current_attr_value.push('\u{FFFD}'); 537 } 538 None => { 539 self.emit_eof(); 540 } 541 Some(c) => { 542 self.current_attr_value.push(c); 543 } 544 } 545 } 546 547 fn state_attribute_value_unquoted(&mut self) { 548 match self.next_char() { 549 Some('\t') | Some('\n') | Some('\x0C') | Some(' ') => { 550 self.state = State::BeforeAttributeName; 551 } 552 Some('&') => { 553 self.return_state = State::AttributeValueUnquoted; 554 self.state = State::CharacterReference; 555 } 556 Some('>') => { 557 self.state = State::Data; 558 self.emit_current_tag(); 559 } 560 Some('\0') => { 561 self.current_attr_value.push('\u{FFFD}'); 562 } 563 None => { 564 self.emit_eof(); 565 } 566 Some(c) => { 567 self.current_attr_value.push(c); 568 } 569 } 570 } 571 572 fn state_after_attribute_value_quoted(&mut self) { 573 match self.next_char() { 574 Some('\t') | Some('\n') | Some('\x0C') | Some(' ') => { 575 self.state = State::BeforeAttributeName; 576 } 577 Some('/') => { 578 self.state = State::SelfClosingStartTag; 579 } 580 Some('>') => { 
581 self.state = State::Data; 582 self.emit_current_tag(); 583 } 584 None => { 585 self.emit_eof(); 586 } 587 Some(_) => { 588 // Parse error. Reconsume in before attribute name. 589 self.reconsume(); 590 self.state = State::BeforeAttributeName; 591 } 592 } 593 } 594 595 fn state_self_closing_start_tag(&mut self) { 596 match self.next_char() { 597 Some('>') => { 598 self.tag_self_closing = true; 599 self.state = State::Data; 600 self.emit_current_tag(); 601 } 602 None => { 603 self.emit_eof(); 604 } 605 Some(_) => { 606 // Parse error. Reconsume in before attribute name. 607 self.reconsume(); 608 self.state = State::BeforeAttributeName; 609 } 610 } 611 } 612 613 fn state_bogus_comment(&mut self) { 614 match self.next_char() { 615 Some('>') => { 616 self.state = State::Data; 617 self.emit_current_comment(); 618 } 619 None => { 620 self.emit_current_comment(); 621 self.emit_eof(); 622 } 623 Some('\0') => { 624 self.comment_data.push('\u{FFFD}'); 625 } 626 Some(c) => { 627 self.comment_data.push(c); 628 } 629 } 630 } 631 632 fn state_markup_declaration_open(&mut self) { 633 // Check for `--`, `DOCTYPE`, or `[CDATA[` 634 if self.starts_with("--") { 635 self.pos += 2; 636 self.comment_data.clear(); 637 self.state = State::CommentStart; 638 } else if self.starts_with_case_insensitive("DOCTYPE") { 639 self.pos += 7; 640 self.state = State::Doctype; 641 } else if self.starts_with("[CDATA[") { 642 // Per spec, if not in foreign content, parse error → bogus comment. 643 self.pos += 7; 644 self.comment_data.clear(); 645 self.comment_data.push_str("[CDATA["); 646 self.state = State::BogusComment; 647 } else { 648 // Parse error. Bogus comment. 649 self.comment_data.clear(); 650 self.state = State::BogusComment; 651 } 652 } 653 654 fn state_comment_start(&mut self) { 655 match self.next_char() { 656 Some('-') => { 657 self.state = State::CommentStartDash; 658 } 659 Some('>') => { 660 // Parse error. Emit empty comment. 
661 self.state = State::Data; 662 self.emit_current_comment(); 663 } 664 _ => { 665 self.reconsume(); 666 self.state = State::Comment; 667 } 668 } 669 } 670 671 fn state_comment_start_dash(&mut self) { 672 match self.next_char() { 673 Some('-') => { 674 self.state = State::CommentEnd; 675 } 676 Some('>') => { 677 // Parse error. 678 self.state = State::Data; 679 self.emit_current_comment(); 680 } 681 None => { 682 self.emit_current_comment(); 683 self.emit_eof(); 684 } 685 Some(_) => { 686 self.comment_data.push('-'); 687 self.reconsume(); 688 self.state = State::Comment; 689 } 690 } 691 } 692 693 fn state_comment(&mut self) { 694 match self.next_char() { 695 Some('<') => { 696 self.comment_data.push('<'); 697 self.state = State::CommentLessThanSign; 698 } 699 Some('-') => { 700 self.state = State::CommentEndDash; 701 } 702 Some('\0') => { 703 self.comment_data.push('\u{FFFD}'); 704 } 705 None => { 706 self.emit_current_comment(); 707 self.emit_eof(); 708 } 709 Some(c) => { 710 self.comment_data.push(c); 711 } 712 } 713 } 714 715 fn state_comment_less_than_sign(&mut self) { 716 match self.next_char() { 717 Some('!') => { 718 self.comment_data.push('!'); 719 self.state = State::CommentLessThanSignBang; 720 } 721 Some('<') => { 722 self.comment_data.push('<'); 723 } 724 None => { 725 // Don't reconsume on EOF — pos didn't advance, so reconsuming 726 // would back up to '<' and loop forever between here and Comment. 
727 self.state = State::Comment; 728 } 729 Some(_) => { 730 self.reconsume(); 731 self.state = State::Comment; 732 } 733 } 734 } 735 736 fn state_comment_less_than_sign_bang(&mut self) { 737 match self.next_char() { 738 Some('-') => { 739 self.state = State::CommentLessThanSignBangDash; 740 } 741 _ => { 742 self.reconsume(); 743 self.state = State::Comment; 744 } 745 } 746 } 747 748 fn state_comment_less_than_sign_bang_dash(&mut self) { 749 match self.next_char() { 750 Some('-') => { 751 self.state = State::CommentLessThanSignBangDashDash; 752 } 753 _ => { 754 self.reconsume(); 755 self.state = State::CommentEndDash; 756 } 757 } 758 } 759 760 fn state_comment_less_than_sign_bang_dash_dash(&mut self) { 761 match self.next_char() { 762 Some('>') | None => { 763 self.reconsume(); 764 self.state = State::CommentEnd; 765 } 766 Some(_) => { 767 // Parse error. 768 self.reconsume(); 769 self.state = State::CommentEnd; 770 } 771 } 772 } 773 774 fn state_comment_end_dash(&mut self) { 775 match self.next_char() { 776 Some('-') => { 777 self.state = State::CommentEnd; 778 } 779 None => { 780 self.emit_current_comment(); 781 self.emit_eof(); 782 } 783 Some(_) => { 784 self.comment_data.push('-'); 785 self.reconsume(); 786 self.state = State::Comment; 787 } 788 } 789 } 790 791 fn state_comment_end(&mut self) { 792 match self.next_char() { 793 Some('>') => { 794 self.state = State::Data; 795 self.emit_current_comment(); 796 } 797 Some('!') => { 798 self.state = State::CommentEndBang; 799 } 800 Some('-') => { 801 self.comment_data.push('-'); 802 } 803 None => { 804 self.emit_current_comment(); 805 self.emit_eof(); 806 } 807 Some(_) => { 808 self.comment_data.push('-'); 809 self.comment_data.push('-'); 810 self.reconsume(); 811 self.state = State::Comment; 812 } 813 } 814 } 815 816 fn state_comment_end_bang(&mut self) { 817 match self.next_char() { 818 Some('-') => { 819 self.comment_data.push('-'); 820 self.comment_data.push('-'); 821 self.comment_data.push('!'); 822 self.state = 
State::CommentEndDash; 823 } 824 Some('>') => { 825 self.state = State::Data; 826 self.emit_current_comment(); 827 } 828 None => { 829 self.emit_current_comment(); 830 self.emit_eof(); 831 } 832 Some(_) => { 833 self.comment_data.push('-'); 834 self.comment_data.push('-'); 835 self.comment_data.push('!'); 836 self.reconsume(); 837 self.state = State::Comment; 838 } 839 } 840 } 841 842 fn state_doctype(&mut self) { 843 match self.next_char() { 844 Some('\t') | Some('\n') | Some('\x0C') | Some(' ') => { 845 self.state = State::BeforeDoctypeName; 846 } 847 Some('>') => { 848 self.reconsume(); 849 self.state = State::BeforeDoctypeName; 850 } 851 None => { 852 self.doctype_name = None; 853 self.doctype_public_id = None; 854 self.doctype_system_id = None; 855 self.doctype_force_quirks = true; 856 self.emit_current_doctype(); 857 self.emit_eof(); 858 } 859 Some(_) => { 860 // Parse error. Missing whitespace before DOCTYPE name. 861 self.reconsume(); 862 self.state = State::BeforeDoctypeName; 863 } 864 } 865 } 866 867 fn state_before_doctype_name(&mut self) { 868 match self.next_char() { 869 Some('\t') | Some('\n') | Some('\x0C') | Some(' ') => { 870 // Ignore whitespace. 871 } 872 Some(c) if c.is_ascii_uppercase() => { 873 self.doctype_name = Some(c.to_ascii_lowercase().to_string()); 874 self.doctype_public_id = None; 875 self.doctype_system_id = None; 876 self.doctype_force_quirks = false; 877 self.state = State::DoctypeName; 878 } 879 Some('\0') => { 880 self.doctype_name = Some("\u{FFFD}".to_string()); 881 self.doctype_public_id = None; 882 self.doctype_system_id = None; 883 self.doctype_force_quirks = false; 884 self.state = State::DoctypeName; 885 } 886 Some('>') => { 887 // Parse error. Force quirks. 
888 self.doctype_name = None; 889 self.doctype_public_id = None; 890 self.doctype_system_id = None; 891 self.doctype_force_quirks = true; 892 self.state = State::Data; 893 self.emit_current_doctype(); 894 } 895 None => { 896 self.doctype_name = None; 897 self.doctype_public_id = None; 898 self.doctype_system_id = None; 899 self.doctype_force_quirks = true; 900 self.emit_current_doctype(); 901 self.emit_eof(); 902 } 903 Some(c) => { 904 self.doctype_name = Some(c.to_string()); 905 self.doctype_public_id = None; 906 self.doctype_system_id = None; 907 self.doctype_force_quirks = false; 908 self.state = State::DoctypeName; 909 } 910 } 911 } 912 913 fn state_doctype_name(&mut self) { 914 match self.next_char() { 915 Some('\t') | Some('\n') | Some('\x0C') | Some(' ') => { 916 self.state = State::AfterDoctypeName; 917 } 918 Some('>') => { 919 self.state = State::Data; 920 self.emit_current_doctype(); 921 } 922 Some(c) if c.is_ascii_uppercase() => { 923 if let Some(ref mut name) = self.doctype_name { 924 name.push(c.to_ascii_lowercase()); 925 } 926 } 927 Some('\0') => { 928 if let Some(ref mut name) = self.doctype_name { 929 name.push('\u{FFFD}'); 930 } 931 } 932 None => { 933 self.doctype_force_quirks = true; 934 self.emit_current_doctype(); 935 self.emit_eof(); 936 } 937 Some(c) => { 938 if let Some(ref mut name) = self.doctype_name { 939 name.push(c); 940 } 941 } 942 } 943 } 944 945 fn state_after_doctype_name(&mut self) { 946 match self.next_char() { 947 Some('\t') | Some('\n') | Some('\x0C') | Some(' ') => { 948 // Ignore. 949 } 950 Some('>') => { 951 self.state = State::Data; 952 self.emit_current_doctype(); 953 } 954 None => { 955 self.doctype_force_quirks = true; 956 self.emit_current_doctype(); 957 self.emit_eof(); 958 } 959 Some(_) => { 960 // Check for PUBLIC or SYSTEM keyword. 
961 self.reconsume(); 962 if self.starts_with_case_insensitive("PUBLIC") { 963 self.pos += 6; 964 self.state = State::AfterDoctypePublicKeyword; 965 } else if self.starts_with_case_insensitive("SYSTEM") { 966 self.pos += 6; 967 self.state = State::AfterDoctypeSystemKeyword; 968 } else { 969 // Parse error. 970 self.doctype_force_quirks = true; 971 self.next_char(); // consume the reconsumed char 972 self.state = State::BogusDoctype; 973 } 974 } 975 } 976 } 977 978 fn state_after_doctype_public_keyword(&mut self) { 979 match self.next_char() { 980 Some('\t') | Some('\n') | Some('\x0C') | Some(' ') => { 981 self.state = State::BeforeDoctypePublicIdentifier; 982 } 983 Some('"') => { 984 // Parse error. Missing whitespace. 985 self.doctype_public_id = Some(String::new()); 986 self.state = State::DoctypePublicIdentifierDoubleQuoted; 987 } 988 Some('\'') => { 989 self.doctype_public_id = Some(String::new()); 990 self.state = State::DoctypePublicIdentifierSingleQuoted; 991 } 992 Some('>') => { 993 self.doctype_force_quirks = true; 994 self.state = State::Data; 995 self.emit_current_doctype(); 996 } 997 None => { 998 self.doctype_force_quirks = true; 999 self.emit_current_doctype(); 1000 self.emit_eof(); 1001 } 1002 Some(_) => { 1003 self.doctype_force_quirks = true; 1004 self.reconsume(); 1005 self.state = State::BogusDoctype; 1006 } 1007 } 1008 } 1009 1010 fn state_before_doctype_public_identifier(&mut self) { 1011 match self.next_char() { 1012 Some('\t') | Some('\n') | Some('\x0C') | Some(' ') => { 1013 // Ignore. 
1014 } 1015 Some('"') => { 1016 self.doctype_public_id = Some(String::new()); 1017 self.state = State::DoctypePublicIdentifierDoubleQuoted; 1018 } 1019 Some('\'') => { 1020 self.doctype_public_id = Some(String::new()); 1021 self.state = State::DoctypePublicIdentifierSingleQuoted; 1022 } 1023 Some('>') => { 1024 self.doctype_force_quirks = true; 1025 self.state = State::Data; 1026 self.emit_current_doctype(); 1027 } 1028 None => { 1029 self.doctype_force_quirks = true; 1030 self.emit_current_doctype(); 1031 self.emit_eof(); 1032 } 1033 Some(_) => { 1034 self.doctype_force_quirks = true; 1035 self.reconsume(); 1036 self.state = State::BogusDoctype; 1037 } 1038 } 1039 } 1040 1041 fn state_doctype_public_identifier_double_quoted(&mut self) { 1042 match self.next_char() { 1043 Some('"') => { 1044 self.state = State::AfterDoctypePublicIdentifier; 1045 } 1046 Some('\0') => { 1047 if let Some(ref mut id) = self.doctype_public_id { 1048 id.push('\u{FFFD}'); 1049 } 1050 } 1051 Some('>') => { 1052 self.doctype_force_quirks = true; 1053 self.state = State::Data; 1054 self.emit_current_doctype(); 1055 } 1056 None => { 1057 self.doctype_force_quirks = true; 1058 self.emit_current_doctype(); 1059 self.emit_eof(); 1060 } 1061 Some(c) => { 1062 if let Some(ref mut id) = self.doctype_public_id { 1063 id.push(c); 1064 } 1065 } 1066 } 1067 } 1068 1069 fn state_doctype_public_identifier_single_quoted(&mut self) { 1070 match self.next_char() { 1071 Some('\'') => { 1072 self.state = State::AfterDoctypePublicIdentifier; 1073 } 1074 Some('\0') => { 1075 if let Some(ref mut id) = self.doctype_public_id { 1076 id.push('\u{FFFD}'); 1077 } 1078 } 1079 Some('>') => { 1080 self.doctype_force_quirks = true; 1081 self.state = State::Data; 1082 self.emit_current_doctype(); 1083 } 1084 None => { 1085 self.doctype_force_quirks = true; 1086 self.emit_current_doctype(); 1087 self.emit_eof(); 1088 } 1089 Some(c) => { 1090 if let Some(ref mut id) = self.doctype_public_id { 1091 id.push(c); 1092 } 1093 } 
1094 } 1095 } 1096 1097 fn state_after_doctype_public_identifier(&mut self) { 1098 match self.next_char() { 1099 Some('\t') | Some('\n') | Some('\x0C') | Some(' ') => { 1100 self.state = State::BetweenDoctypePublicAndSystemIdentifiers; 1101 } 1102 Some('>') => { 1103 self.state = State::Data; 1104 self.emit_current_doctype(); 1105 } 1106 Some('"') => { 1107 // Parse error. Missing whitespace. 1108 self.doctype_system_id = Some(String::new()); 1109 self.state = State::DoctypeSystemIdentifierDoubleQuoted; 1110 } 1111 Some('\'') => { 1112 self.doctype_system_id = Some(String::new()); 1113 self.state = State::DoctypeSystemIdentifierSingleQuoted; 1114 } 1115 None => { 1116 self.doctype_force_quirks = true; 1117 self.emit_current_doctype(); 1118 self.emit_eof(); 1119 } 1120 Some(_) => { 1121 self.doctype_force_quirks = true; 1122 self.reconsume(); 1123 self.state = State::BogusDoctype; 1124 } 1125 } 1126 } 1127 1128 fn state_between_doctype_public_and_system_identifiers(&mut self) { 1129 match self.next_char() { 1130 Some('\t') | Some('\n') | Some('\x0C') | Some(' ') => { 1131 // Ignore. 
1132 } 1133 Some('>') => { 1134 self.state = State::Data; 1135 self.emit_current_doctype(); 1136 } 1137 Some('"') => { 1138 self.doctype_system_id = Some(String::new()); 1139 self.state = State::DoctypeSystemIdentifierDoubleQuoted; 1140 } 1141 Some('\'') => { 1142 self.doctype_system_id = Some(String::new()); 1143 self.state = State::DoctypeSystemIdentifierSingleQuoted; 1144 } 1145 None => { 1146 self.doctype_force_quirks = true; 1147 self.emit_current_doctype(); 1148 self.emit_eof(); 1149 } 1150 Some(_) => { 1151 self.doctype_force_quirks = true; 1152 self.reconsume(); 1153 self.state = State::BogusDoctype; 1154 } 1155 } 1156 } 1157 1158 fn state_after_doctype_system_keyword(&mut self) { 1159 match self.next_char() { 1160 Some('\t') | Some('\n') | Some('\x0C') | Some(' ') => { 1161 self.state = State::BeforeDoctypeSystemIdentifier; 1162 } 1163 Some('"') => { 1164 self.doctype_system_id = Some(String::new()); 1165 self.state = State::DoctypeSystemIdentifierDoubleQuoted; 1166 } 1167 Some('\'') => { 1168 self.doctype_system_id = Some(String::new()); 1169 self.state = State::DoctypeSystemIdentifierSingleQuoted; 1170 } 1171 Some('>') => { 1172 self.doctype_force_quirks = true; 1173 self.state = State::Data; 1174 self.emit_current_doctype(); 1175 } 1176 None => { 1177 self.doctype_force_quirks = true; 1178 self.emit_current_doctype(); 1179 self.emit_eof(); 1180 } 1181 Some(_) => { 1182 self.doctype_force_quirks = true; 1183 self.reconsume(); 1184 self.state = State::BogusDoctype; 1185 } 1186 } 1187 } 1188 1189 fn state_before_doctype_system_identifier(&mut self) { 1190 match self.next_char() { 1191 Some('\t') | Some('\n') | Some('\x0C') | Some(' ') => { 1192 // Ignore. 
}
            Some('"') => {
                self.doctype_system_id = Some(String::new());
                self.state = State::DoctypeSystemIdentifierDoubleQuoted;
            }
            Some('\'') => {
                self.doctype_system_id = Some(String::new());
                self.state = State::DoctypeSystemIdentifierSingleQuoted;
            }
            Some('>') => {
                self.doctype_force_quirks = true;
                self.state = State::Data;
                self.emit_current_doctype();
            }
            None => {
                self.doctype_force_quirks = true;
                self.emit_current_doctype();
                self.emit_eof();
            }
            Some(_) => {
                self.doctype_force_quirks = true;
                self.reconsume();
                self.state = State::BogusDoctype;
            }
        }
    }

    /// DOCTYPE system identifier (double-quoted) state: consumes one character
    /// of a system identifier that is terminated by `"`.
    fn state_doctype_system_identifier_double_quoted(&mut self) {
        self.consume_doctype_system_identifier('"');
    }

    /// DOCTYPE system identifier (single-quoted) state: consumes one character
    /// of a system identifier that is terminated by `'`.
    fn state_doctype_system_identifier_single_quoted(&mut self) {
        self.consume_doctype_system_identifier('\'');
    }

    /// Shared step for both quoted system-identifier states.
    ///
    /// Consumes one character and either:
    /// - moves to `AfterDoctypeSystemIdentifier` on `closing_quote`,
    /// - emits the doctype with force-quirks set on `>` (back to `Data`) or on
    ///   end of input (followed by EOF),
    /// - or appends the character to the system identifier, with U+0000
    ///   replaced by U+FFFD.
    fn consume_doctype_system_identifier(&mut self, closing_quote: char) {
        match self.next_char() {
            Some(c) if c == closing_quote => {
                self.state = State::AfterDoctypeSystemIdentifier;
            }
            Some('>') => {
                // Identifier was cut short by the end of the doctype.
                self.doctype_force_quirks = true;
                self.state = State::Data;
                self.emit_current_doctype();
            }
            None => {
                self.doctype_force_quirks = true;
                self.emit_current_doctype();
                self.emit_eof();
            }
            Some(c) => {
                let c = if c == '\0' { '\u{FFFD}' } else { c };
                if let Some(id) = self.doctype_system_id.as_mut() {
                    id.push(c);
                }
            }
        }
    }

    /// After DOCTYPE system identifier state.
    ///
    /// Skips whitespace, emits the doctype on `>`, emits it with force-quirks
    /// set on end of input, and hands any other character to `BogusDoctype`.
    fn state_after_doctype_system_identifier(&mut self) {
        match self.next_char() {
            Some('\t') | Some('\n') | Some('\x0C') | Some(' ') => {
                // Ignore.
            }
            Some('>') => {
                self.state = State::Data;
                self.emit_current_doctype();
            }
            None => {
                self.doctype_force_quirks = true;
                self.emit_current_doctype();
                self.emit_eof();
            }
            Some(_) => {
                // Parse error, but do NOT set force_quirks.
                self.reconsume();
                self.state = State::BogusDoctype;
            }
        }
    }

    /// Bogus DOCTYPE state: discards input until `>` or end of input, then
    /// emits the doctype token as built so far.
    fn state_bogus_doctype(&mut self) {
        match self.next_char() {
            Some('>') => {
                self.state = State::Data;
                self.emit_current_doctype();
            }
            None => {
                self.emit_current_doctype();
                self.emit_eof();
            }
            Some(_) => {
                // Ignore. (U+0000 is a parse error here, but is skipped just
                // like every other character.)
            }
        }
    }

    // --- Character reference states ---

    /// Character reference state, entered after `&` has been consumed.
    ///
    /// Resets `temp_buf` to `"&"` and dispatches on the next character without
    /// consuming it, except for `#`, which is consumed and appended.
    fn state_character_reference(&mut self) {
        self.temp_buf.clear();
        self.temp_buf.push('&');

        match self.peek_char() {
            Some(c) if c.is_ascii_alphanumeric() => {
                self.state = State::NamedCharacterReference;
            }
            Some('#') => {
                self.temp_buf.push('#');
                self.next_char();
                self.state = State::NumericCharacterReference;
            }
            _ => {
                // Not a character reference. Flush '&' to return state.
                self.flush_char_ref("&");
                self.state = self.return_state;
            }
        }
    }

    /// Named character reference state.
    ///
    /// Reads the maximal run of ASCII alphanumerics after `&` and resolves it:
    /// 1. If the run is followed by `;`, the whole run is looked up as-is.
    /// 2. Otherwise (or if that fails) the longest prefix that is a *legacy*
    ///    entity is used; legacy entities are the only ones accepted without
    ///    a terminating semicolon.
    ///
    /// On a match the input position is rewound to just after the matched
    /// name (plus `;` if present) and the replacement text is flushed. Inside
    /// an attribute value, a semicolon-less match followed by `=` or an
    /// alphanumeric is flushed as literal text instead. With no match, `&` and
    /// the collected characters are flushed unchanged.
    fn state_named_character_reference(&mut self) {
        let start_pos = self.pos;

        // The collected name is pure ASCII, so byte offsets into `name` equal
        // character offsets into `self.input`.
        let mut name = String::new();
        while let Some(c) = self.peek_char() {
            if !c.is_ascii_alphanumeric() {
                break;
            }
            name.push(c);
            self.pos += 1;
        }

        let has_trailing_semi = self.peek_char() == Some(';');

        let mut matched_value: Option<&str> = None;
        let mut matched_len = 0;

        // Semicolon-terminated: any known entity qualifies.
        if has_trailing_semi {
            if let Some(val) = entities::lookup_entity(&name) {
                matched_value = Some(val);
                matched_len = name.len();
            }
        }

        // Longest-prefix search. The first candidate is the full name, so no
        // separate "full name without semicolon" lookup is needed afterwards.
        if matched_value.is_none() {
            for i in (1..=name.len()).rev() {
                let candidate = &name[..i];
                if let Some(val) = entities::lookup_entity(candidate) {
                    // Without semicolon, only legacy entities are recognized.
                    if entities::is_legacy_entity(candidate) {
                        matched_value = Some(val);
                        matched_len = i;
                        break;
                    }
                }
            }
        }

        let value = match matched_value {
            Some(value) => value,
            None => {
                // No match. Rewind and flush '&' + all collected chars, one
                // flush per character.
                self.pos = start_pos;
                self.flush_char_ref("&");
                for c in name.chars() {
                    self.pos += 1;
                    self.flush_char_ref(&c.to_string());
                }
                self.state = self.return_state;
                return;
            }
        };

        // Rewind to just after the matched portion.
        self.pos = start_pos + matched_len;

        // Consume the semicolon that terminates the reference, if any.
        let has_semi = self.peek_char() == Some(';');
        if has_semi {
            self.pos += 1;
        }

        let in_attribute = matches!(
            self.return_state,
            State::AttributeValueDoubleQuoted
                | State::AttributeValueSingleQuoted
                | State::AttributeValueUnquoted
        );

        if !has_semi && in_attribute {
            let followed_by_name_char = self
                .peek_char()
                .map_or(false, |next| next == '=' || next.is_ascii_alphanumeric());
            if followed_by_name_char {
                // Not a reference (e.g. `?a=1&copy=2`). Flush original text.
                let original = format!("&{}", &name[..matched_len]);
                self.flush_char_ref(&original);
                self.state = self.return_state;
                return;
            }
        }

        self.flush_char_ref(value);
        self.state = self.return_state;
    }

    /// Numeric character reference state, entered after `&#`.
    ///
    /// Resets the accumulated code point and selects hexadecimal parsing when
    /// the next character is `x` or `X` (which is consumed and recorded in
    /// `temp_buf`), decimal parsing otherwise.
    fn state_numeric_character_reference(&mut self) {
        self.char_ref_code = 0;
        match self.peek_char() {
            Some(c) if c == 'x' || c == 'X' => {
                self.temp_buf.push(c);
                self.next_char();
                self.state = State::HexCharacterReferenceStart;
            }
            _ => {
                self.state = State::DecCharacterReferenceStart;
            }
        }
    }

    /// Hexadecimal character reference start state: requires at least one hex
    /// digit, otherwise the text consumed so far (`temp_buf`) is flushed
    /// literally.
    fn state_hex_character_reference_start(&mut self) {
        match self.peek_char() {
            Some(c) if c.is_ascii_hexdigit() => {
                self.state = State::HexCharacterReference;
            }
            _ => {
                // Parse error. Flush temp_buf. The clone releases the borrow
                // of `self` before the `&mut self` call.
                let buf = self.temp_buf.clone();
                self.flush_char_ref(&buf);
                self.state = self.return_state;
            }
        }
    }

    /// Decimal character reference start state: requires at least one decimal
    /// digit, otherwise the text consumed so far (`temp_buf`) is flushed
    /// literally.
    fn state_dec_character_reference_start(&mut self) {
        match self.peek_char() {
            Some(c) if c.is_ascii_digit() => {
                self.state = State::DecCharacterReference;
            }
            _ => {
                let buf = self.temp_buf.clone();
                self.flush_char_ref(&buf);
                self.state = self.return_state;
            }
        }
    }

    /// Folds one digit into `char_ref_code`.
    ///
    /// Arithmetic saturates and the result is clamped to `0x110000`, a value
    /// that is clearly out of the Unicode range but cannot overflow on
    /// further digits.
    fn accumulate_char_ref_digit(&mut self, radix: u32, digit: u32) {
        self.char_ref_code = self
            .char_ref_code
            .saturating_mul(radix)
            .saturating_add(digit);
        if self.char_ref_code > 0x10FFFF {
            self.char_ref_code = 0x110000;
        }
    }

    /// Hexadecimal character reference state: accumulates hex digits until
    /// `;`, end of input, or any other character (which is reconsumed).
    fn state_hex_character_reference(&mut self) {
        match self.next_char() {
            // `;` ends the reference. At EOF the semicolon is missing (parse
            // error); nothing was consumed, so there is nothing to reconsume.
            Some(';') | None => {
                self.state = State::NumericCharacterReferenceEnd;
            }
            Some(c) => match c.to_digit(16) {
                Some(digit) => self.accumulate_char_ref_digit(16, digit),
                None => {
                    // Parse error: missing semicolon.
                    self.reconsume();
                    self.state = State::NumericCharacterReferenceEnd;
                }
            },
        }
    }

    /// Decimal character reference state: accumulates decimal digits until
    /// `;`, end of input, or any other character (which is reconsumed).
    fn state_dec_character_reference(&mut self) {
        match self.next_char() {
            // `;` ends the reference. At EOF the semicolon is missing (parse
            // error); nothing was consumed, so there is nothing to reconsume.
            Some(';') | None => {
                self.state = State::NumericCharacterReferenceEnd;
            }
            Some(c) => match c.to_digit(10) {
                Some(digit) => self.accumulate_char_ref_digit(10, digit),
                None => {
                    // Parse error: missing semicolon.
                    self.reconsume();
                    self.state = State::NumericCharacterReferenceEnd;
                }
            },
        }
    }

    /// Numeric character reference end state.
    ///
    /// Converts the accumulated code point into a character and flushes it:
    /// zero, surrogates and out-of-range values become U+FFFD, and the listed
    /// code points in `0x80..=0x9F` are remapped through the Windows-1252
    /// table below.
    fn state_numeric_character_reference_end(&mut self) {
        let code = self.char_ref_code;
        let ch = match code {
            0 => '\u{FFFD}',
            // Surrogate range.
            0xD800..=0xDFFF => '\u{FFFD}',
            // Out of Unicode range.
            c if c > 0x10FFFF => '\u{FFFD}',
            // Windows-1252 replacement table for 0x80..0x9F.
            0x80 => '\u{20AC}',
            0x82 => '\u{201A}',
            0x83 => '\u{0192}',
            0x84 => '\u{201E}',
            0x85 => '\u{2026}',
            0x86 => '\u{2020}',
            0x87 => '\u{2021}',
            0x88 => '\u{02C6}',
            0x89 => '\u{2030}',
            0x8A => '\u{0160}',
            0x8B => '\u{2039}',
            0x8C => '\u{0152}',
            0x8E => '\u{017D}',
            0x91 => '\u{2018}',
            0x92 => '\u{2019}',
            0x93 => '\u{201C}',
            0x94 => '\u{201D}',
            0x95 => '\u{2022}',
            0x96 => '\u{2013}',
            0x97 => '\u{2014}',
            0x98 => '\u{02DC}',
            0x99 => '\u{2122}',
            0x9A => '\u{0161}',
            0x9B => '\u{203A}',
            0x9C => '\u{0153}',
            0x9E => '\u{017E}',
            0x9F => '\u{0178}',
            c => char::from_u32(c).unwrap_or('\u{FFFD}'),
        };

        let s = ch.to_string();
        self.flush_char_ref(&s);
        self.state = self.return_state;
    }

    // --- Helpers ---

    /// Returns `true` if the unconsumed input begins with `s` (exact match).
    /// Does not consume anything.
    fn starts_with(&self, s: &str) -> bool {
        match self.input.get(self.pos..) {
            Some(rest) => {
                let mut rest = rest.iter();
                s.chars().all(|c| rest.next() == Some(&c))
            }
            None => false,
        }
    }

    /// Returns `true` if the unconsumed input begins with `s`, comparing
    /// ASCII letters case-insensitively. Does not consume anything.
    fn starts_with_case_insensitive(&self, s: &str) -> bool {
        match self.input.get(self.pos..) {
            Some(rest) => {
                let mut rest = rest.iter();
                s.chars()
                    .all(|c| rest.next().map_or(false, |x| x.eq_ignore_ascii_case(&c)))
            }
            None => false,
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::tokenize;

    #[test]
    fn empty_input() {
        let tokens = tokenize("");
        assert!(tokens.is_empty());
    }

    #[test]
    fn plain_text() {
        let tokens = tokenize("Hello, world!");
        assert_eq!(tokens, vec![Token::Character("Hello, world!".to_string())]);
    }

    #[test]
    fn simple_element() {
        let tokens = tokenize("<p>Hello</p>");
        assert_eq!(
            tokens,
            vec![
                Token::StartTag {
                    name: "p".to_string(),
                    attributes: vec![],
                    self_closing: false,
                },
                Token::Character("Hello".to_string()),
                Token::EndTag {
                    name: "p".to_string(),
                },
            ]
        );
    }

    #[test]
    fn self_closing_tag() {
        let tokens = tokenize("<br/>");
        assert_eq!(
            tokens,
            vec![Token::StartTag {
                name: "br".to_string(),
                attributes: vec![],
                self_closing: true,
            }]
        );
    }

    #[test]
    fn self_closing_img() {
        let tokens = tokenize("<img/>");
        assert_eq!(
            tokens,
            vec![Token::StartTag {
                name: "img".to_string(),
                attributes: vec![],
                self_closing: true,
            }]
        );
    }

    #[test]
    fn tag_with_attributes() {
        let tokens = tokenize(r#"<a href="url" class="link">"#);
        assert_eq!(
            tokens,
            vec![Token::StartTag {
                name: "a".to_string(),
                attributes: vec![
                    ("href".to_string(), "url".to_string()),
                    ("class".to_string(), "link".to_string()),
                ],
                self_closing: false,
            }]
        );
    }

    #[test]
    fn tag_with_single_quoted_attributes() {
        let tokens = tokenize("<div id='main'>");
        assert_eq!(
            tokens,
            vec![Token::StartTag {
                name: "div".to_string(),
                attributes: vec![("id".to_string(), "main".to_string())],
                self_closing: false,
            }]
        );
    }

    #[test]
    fn tag_with_unquoted_attribute() {
        let tokens = tokenize("<input type=text>");
        assert_eq!(
            tokens,
            vec![Token::StartTag {
                name: "input".to_string(),
                attributes: vec![("type".to_string(), "text".to_string())],
                self_closing: false,
            }]
        );
    }

    #[test]
    fn comment() {
        let tokens = tokenize("<!-- comment -->");
        assert_eq!(tokens, vec![Token::Comment(" comment ".to_string())]);
    }

    #[test]
    fn empty_comment() {
        let tokens = tokenize("<!---->");
        assert_eq!(tokens, vec![Token::Comment("".to_string())]);
    }

    #[test]
    fn doctype_html() {
        let tokens = tokenize("<!DOCTYPE html>");
        assert_eq!(
            tokens,
            vec![Token::Doctype {
                name: Some("html".to_string()),
                public_id: None,
                system_id: None,
                force_quirks: false,
            }]
        );
    }

    #[test]
    fn doctype_case_insensitive() {
        let tokens = tokenize("<!doctype html>");
        assert_eq!(
            tokens,
            vec![Token::Doctype {
                name: Some("html".to_string()),
                public_id: None,
                system_id: None,
                force_quirks: false,
            }]
        );
    }

    #[test]
    fn char_ref_named() {
        let tokens = tokenize("&amp;&lt;&gt;&quot;");
        assert_eq!(tokens, vec![Token::Character("&<>\"".to_string())]);
    }

    #[test]
    fn char_ref_numeric_decimal() {
        let tokens = tokenize("&#65;");
        assert_eq!(tokens, vec![Token::Character("A".to_string())]);
    }

    #[test]
    fn char_ref_numeric_hex() {
        let tokens = tokenize("&#x41;");
        assert_eq!(tokens, vec![Token::Character("A".to_string())]);
    }

    #[test]
    fn char_ref_numeric_hex_uppercase() {
        let tokens = tokenize("&#X41;");
        assert_eq!(tokens, vec![Token::Character("A".to_string())]);
    }

    #[test]
    fn full_html_document() {
        let tokens =
            tokenize("<html><head><title>Test</title></head><body><p>Hello</p></body></html>");
        assert_eq!(
            tokens,
            vec![
                Token::StartTag {
                    name: "html".to_string(),
                    attributes: vec![],
                    self_closing: false,
                },
                Token::StartTag {
                    name: "head".to_string(),
                    attributes: vec![],
                    self_closing: false,
                },
                Token::StartTag {
                    name: "title".to_string(),
                    attributes: vec![],
                    self_closing: false,
                },
                Token::Character("Test".to_string()),
                Token::EndTag {
                    name: "title".to_string(),
                },
                Token::EndTag {
                    name: "head".to_string(),
                },
                Token::StartTag {
                    name: "body".to_string(),
                    attributes: vec![],
                    self_closing: false,
                },
                Token::StartTag {
                    name: "p".to_string(),
                    attributes: vec![],
                    self_closing: false,
                },
                Token::Character("Hello".to_string()),
                Token::EndTag {
                    name: "p".to_string(),
                },
                Token::EndTag {
                    name: "body".to_string(),
                },
                Token::EndTag {
                    name: "html".to_string(),
                },
            ]
        );
    }

    #[test]
    fn uppercase_tag_names_lowercased() {
        let tokens = tokenize("<DIV></DIV>");
        assert_eq!(
            tokens,
            vec![
                Token::StartTag {
                    name: "div".to_string(),
                    attributes: vec![],
                    self_closing: false,
                },
                Token::EndTag {
                    name: "div".to_string(),
                },
            ]
        );
    }

    #[test]
    fn uppercase_attribute_names_lowercased() {
        let tokens = tokenize(r#"<div CLASS="x">"#);
        assert_eq!(
            tokens,
            vec![Token::StartTag {
                name: "div".to_string(),
                attributes: vec![("class".to_string(), "x".to_string())],
                self_closing: false,
            }]
        );
    }

    #[test]
    fn duplicate_attributes_first_wins() {
        let tokens = tokenize(r#"<div class="a" class="b">"#);
        assert_eq!(
            tokens,
            vec![Token::StartTag {
                name: "div".to_string(),
                attributes: vec![("class".to_string(), "a".to_string())],
                self_closing: false,
            }]
        );
    }

#[test] 1875 fn char_ref_in_attribute() { 1876 let tokens = tokenize(r#"<a href="?a=1&amp;b=2">"#); 1877 assert_eq!( 1878 tokens, 1879 vec![Token::StartTag { 1880 name: "a".to_string(), 1881 attributes: vec![("href".to_string(), "?a=1&b=2".to_string())], 1882 self_closing: false, 1883 }] 1884 ); 1885 } 1886 1887 #[test] 1888 fn multiple_attributes() { 1889 let tokens = tokenize(r#"<input type="text" name="foo" value="bar">"#); 1890 assert_eq!( 1891 tokens, 1892 vec![Token::StartTag { 1893 name: "input".to_string(), 1894 attributes: vec![ 1895 ("type".to_string(), "text".to_string()), 1896 ("name".to_string(), "foo".to_string()), 1897 ("value".to_string(), "bar".to_string()), 1898 ], 1899 self_closing: false, 1900 }] 1901 ); 1902 } 1903 1904 #[test] 1905 fn boolean_attribute() { 1906 let tokens = tokenize("<input disabled>"); 1907 assert_eq!( 1908 tokens, 1909 vec![Token::StartTag { 1910 name: "input".to_string(), 1911 attributes: vec![("disabled".to_string(), "".to_string())], 1912 self_closing: false, 1913 }] 1914 ); 1915 } 1916 1917 #[test] 1918 fn mixed_content() { 1919 let tokens = tokenize("Hello <!-- comment --> World"); 1920 assert_eq!( 1921 tokens, 1922 vec![ 1923 Token::Character("Hello ".to_string()), 1924 Token::Comment(" comment ".to_string()), 1925 Token::Character(" World".to_string()), 1926 ] 1927 ); 1928 } 1929 1930 #[test] 1931 fn doctype_with_public_id() { 1932 let tokens = tokenize( 1933 r#"<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">"#, 1934 ); 1935 assert_eq!( 1936 tokens, 1937 vec![Token::Doctype { 1938 name: Some("html".to_string()), 1939 public_id: Some("-//W3C//DTD XHTML 1.0 Strict//EN".to_string()), 1940 system_id: Some("http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd".to_string()), 1941 force_quirks: false, 1942 }] 1943 ); 1944 } 1945 1946 #[test] 1947 fn null_in_text() { 1948 let tokens = tokenize("a\0b"); 1949 assert_eq!(tokens, 
vec![Token::Character("a\u{FFFD}b".to_string())]); 1950 } 1951 1952 #[test] 1953 fn windows_1252_numeric_refs() { 1954 // &#128; should map to Euro sign. 1955 let tokens = tokenize("&#128;"); 1956 assert_eq!(tokens, vec![Token::Character("\u{20AC}".to_string())]); 1957 } 1958 1959 #[test] 1960 fn attribute_with_empty_value() { 1961 let tokens = tokenize(r#"<div class="">"#); 1962 assert_eq!( 1963 tokens, 1964 vec![Token::StartTag { 1965 name: "div".to_string(), 1966 attributes: vec![("class".to_string(), "".to_string())], 1967 self_closing: false, 1968 }] 1969 ); 1970 } 1971 1972 #[test] 1973 fn adjacent_tags() { 1974 let tokens = tokenize("<b></b><i></i>"); 1975 assert_eq!( 1976 tokens, 1977 vec![ 1978 Token::StartTag { 1979 name: "b".to_string(), 1980 attributes: vec![], 1981 self_closing: false, 1982 }, 1983 Token::EndTag { 1984 name: "b".to_string(), 1985 }, 1986 Token::StartTag { 1987 name: "i".to_string(), 1988 attributes: vec![], 1989 self_closing: false, 1990 }, 1991 Token::EndTag { 1992 name: "i".to_string(), 1993 }, 1994 ] 1995 ); 1996 } 1997 1998 #[test] 1999 fn newlines_in_text() { 2000 let tokens = tokenize("line1\nline2\nline3"); 2001 assert_eq!( 2002 tokens, 2003 vec![Token::Character("line1\nline2\nline3".to_string())] 2004 ); 2005 } 2006 2007 #[test] 2008 fn self_closing_with_attribute() { 2009 let tokens = tokenize(r#"<img src="test.png"/>"#); 2010 assert_eq!( 2011 tokens, 2012 vec![Token::StartTag { 2013 name: "img".to_string(), 2014 attributes: vec![("src".to_string(), "test.png".to_string())], 2015 self_closing: true, 2016 }] 2017 ); 2018 } 2019 2020 #[test] 2021 fn less_than_in_text_not_tag() { 2022 // A bare '<' not followed by a letter should be emitted as text. 
2023 let tokens = tokenize("1 < 2"); 2024 assert_eq!(tokens, vec![Token::Character("1 < 2".to_string())]); 2025 } 2026 2027 #[test] 2028 fn ampersand_not_entity() { 2029 let tokens = tokenize("a & b"); 2030 assert_eq!(tokens, vec![Token::Character("a & b".to_string())]); 2031 } 2032 2033 #[test] 2034 fn cdata_in_html_becomes_comment() { 2035 let tokens = tokenize("<![CDATA[hello]]>"); 2036 // In HTML (non-foreign) context, CDATA is a parse error → bogus comment. 2037 assert_eq!(tokens, vec![Token::Comment("[CDATA[hello]]".to_string())]); 2038 } 2039}