web engine - experimental web browser
1//! HTML5 tokenizer state machine per WHATWG spec §13.2.5.
2
3use crate::entities;
4use crate::Token;
5
/// Tokenizer states, named after the states of the WHATWG HTML tokenizer.
///
/// `Eq` is derived alongside `PartialEq`: the enum carries no data, so
/// equality is total and the type can be used wherever `Eq` is required.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum State {
    // --- Character data and tags ---
    Data,
    TagOpen,
    EndTagOpen,
    TagName,
    // --- Attributes ---
    BeforeAttributeName,
    AttributeName,
    AfterAttributeName,
    BeforeAttributeValue,
    AttributeValueDoubleQuoted,
    AttributeValueSingleQuoted,
    AttributeValueUnquoted,
    AfterAttributeValueQuoted,
    SelfClosingStartTag,
    // --- Comments and markup declarations ---
    BogusComment,
    MarkupDeclarationOpen,
    CommentStart,
    CommentStartDash,
    Comment,
    CommentLessThanSign,
    CommentLessThanSignBang,
    CommentLessThanSignBangDash,
    CommentLessThanSignBangDashDash,
    CommentEndDash,
    CommentEnd,
    CommentEndBang,
    // --- DOCTYPE ---
    Doctype,
    BeforeDoctypeName,
    DoctypeName,
    AfterDoctypeName,
    AfterDoctypePublicKeyword,
    BeforeDoctypePublicIdentifier,
    DoctypePublicIdentifierDoubleQuoted,
    DoctypePublicIdentifierSingleQuoted,
    AfterDoctypePublicIdentifier,
    BetweenDoctypePublicAndSystemIdentifiers,
    AfterDoctypeSystemKeyword,
    BeforeDoctypeSystemIdentifier,
    DoctypeSystemIdentifierDoubleQuoted,
    DoctypeSystemIdentifierSingleQuoted,
    AfterDoctypeSystemIdentifier,
    BogusDoctype,
    // --- Character references ---
    CharacterReference,
    NumericCharacterReference,
    HexCharacterReferenceStart,
    DecCharacterReferenceStart,
    HexCharacterReference,
    DecCharacterReference,
    NumericCharacterReferenceEnd,
    NamedCharacterReference,
}
58
59/// HTML5 tokenizer state machine.
60pub struct Tokenizer {
61 input: Vec<char>,
62 pos: usize,
63 state: State,
64 return_state: State,
65 pending: Vec<Token>,
66 /// Current tag being built.
67 tag_name: String,
68 tag_self_closing: bool,
69 tag_is_end: bool,
70 tag_attributes: Vec<(String, String)>,
71 current_attr_name: String,
72 current_attr_value: String,
73 /// Current comment or doctype being built.
74 comment_data: String,
75 doctype_name: Option<String>,
76 doctype_public_id: Option<String>,
77 doctype_system_id: Option<String>,
78 doctype_force_quirks: bool,
79 /// Character reference accumulator.
80 char_ref_code: u32,
81 temp_buf: String,
82}
83
84impl Tokenizer {
85 /// Create a new tokenizer for the given input.
86 pub fn new(input: &str) -> Self {
87 Tokenizer {
88 input: input.chars().collect(),
89 pos: 0,
90 state: State::Data,
91 return_state: State::Data,
92 pending: Vec::new(),
93 tag_name: String::new(),
94 tag_self_closing: false,
95 tag_is_end: false,
96 tag_attributes: Vec::new(),
97 current_attr_name: String::new(),
98 current_attr_value: String::new(),
99 comment_data: String::new(),
100 doctype_name: None,
101 doctype_public_id: None,
102 doctype_system_id: None,
103 doctype_force_quirks: false,
104 char_ref_code: 0,
105 temp_buf: String::new(),
106 }
107 }
108
109 /// Return the next token from the input.
110 pub fn next_token(&mut self) -> Token {
111 loop {
112 if let Some(token) = self.pending.pop() {
113 return token;
114 }
115 self.step();
116 }
117 }
118
119 fn next_char(&mut self) -> Option<char> {
120 if self.pos < self.input.len() {
121 let ch = self.input[self.pos];
122 self.pos += 1;
123 Some(ch)
124 } else {
125 None
126 }
127 }
128
129 fn peek_char(&self) -> Option<char> {
130 if self.pos < self.input.len() {
131 Some(self.input[self.pos])
132 } else {
133 None
134 }
135 }
136
137 fn reconsume(&mut self) {
138 if self.pos > 0 {
139 self.pos -= 1;
140 }
141 }
142
143 fn emit(&mut self, token: Token) {
144 // We use a Vec as a stack, so push to front by inserting at 0.
145 self.pending.insert(0, token);
146 }
147
148 fn emit_current_tag(&mut self) {
149 // Finalize the current attribute if there is one.
150 self.finish_attribute();
151
152 if self.tag_is_end {
153 self.emit(Token::EndTag {
154 name: self.tag_name.clone(),
155 });
156 } else {
157 self.emit(Token::StartTag {
158 name: self.tag_name.clone(),
159 attributes: self.tag_attributes.clone(),
160 self_closing: self.tag_self_closing,
161 });
162 }
163 }
164
165 fn emit_current_comment(&mut self) {
166 self.emit(Token::Comment(self.comment_data.clone()));
167 }
168
169 fn emit_current_doctype(&mut self) {
170 self.emit(Token::Doctype {
171 name: self.doctype_name.clone(),
172 public_id: self.doctype_public_id.clone(),
173 system_id: self.doctype_system_id.clone(),
174 force_quirks: self.doctype_force_quirks,
175 });
176 }
177
178 fn emit_char(&mut self, ch: char) {
179 self.emit(Token::Character(ch.to_string()));
180 }
181
182 fn emit_eof(&mut self) {
183 self.emit(Token::Eof);
184 }
185
186 fn start_new_tag(&mut self, is_end: bool) {
187 self.tag_name.clear();
188 self.tag_self_closing = false;
189 self.tag_is_end = is_end;
190 self.tag_attributes.clear();
191 self.current_attr_name.clear();
192 self.current_attr_value.clear();
193 }
194
195 fn start_new_attribute(&mut self) {
196 self.finish_attribute();
197 self.current_attr_name.clear();
198 self.current_attr_value.clear();
199 }
200
201 fn finish_attribute(&mut self) {
202 if !self.current_attr_name.is_empty() {
203 // Per spec: if duplicate attribute name, ignore the later one.
204 let name = self.current_attr_name.clone();
205 if !self.tag_attributes.iter().any(|(n, _)| n == &name) {
206 self.tag_attributes
207 .push((name, self.current_attr_value.clone()));
208 }
209 self.current_attr_name.clear();
210 self.current_attr_value.clear();
211 }
212 }
213
214 /// Flush character reference code to the return state.
215 fn flush_char_ref(&mut self, s: &str) {
216 match self.return_state {
217 State::AttributeValueDoubleQuoted
218 | State::AttributeValueSingleQuoted
219 | State::AttributeValueUnquoted => {
220 self.current_attr_value.push_str(s);
221 }
222 _ => {
223 for ch in s.chars() {
224 self.emit_char(ch);
225 }
226 }
227 }
228 }
229
230 fn step(&mut self) {
231 match self.state {
232 State::Data => self.state_data(),
233 State::TagOpen => self.state_tag_open(),
234 State::EndTagOpen => self.state_end_tag_open(),
235 State::TagName => self.state_tag_name(),
236 State::BeforeAttributeName => self.state_before_attribute_name(),
237 State::AttributeName => self.state_attribute_name(),
238 State::AfterAttributeName => self.state_after_attribute_name(),
239 State::BeforeAttributeValue => self.state_before_attribute_value(),
240 State::AttributeValueDoubleQuoted => self.state_attribute_value_double_quoted(),
241 State::AttributeValueSingleQuoted => self.state_attribute_value_single_quoted(),
242 State::AttributeValueUnquoted => self.state_attribute_value_unquoted(),
243 State::AfterAttributeValueQuoted => self.state_after_attribute_value_quoted(),
244 State::SelfClosingStartTag => self.state_self_closing_start_tag(),
245 State::BogusComment => self.state_bogus_comment(),
246 State::MarkupDeclarationOpen => self.state_markup_declaration_open(),
247 State::CommentStart => self.state_comment_start(),
248 State::CommentStartDash => self.state_comment_start_dash(),
249 State::Comment => self.state_comment(),
250 State::CommentLessThanSign => self.state_comment_less_than_sign(),
251 State::CommentLessThanSignBang => self.state_comment_less_than_sign_bang(),
252 State::CommentLessThanSignBangDash => self.state_comment_less_than_sign_bang_dash(),
253 State::CommentLessThanSignBangDashDash => {
254 self.state_comment_less_than_sign_bang_dash_dash()
255 }
256 State::CommentEndDash => self.state_comment_end_dash(),
257 State::CommentEnd => self.state_comment_end(),
258 State::CommentEndBang => self.state_comment_end_bang(),
259 State::Doctype => self.state_doctype(),
260 State::BeforeDoctypeName => self.state_before_doctype_name(),
261 State::DoctypeName => self.state_doctype_name(),
262 State::AfterDoctypeName => self.state_after_doctype_name(),
263 State::AfterDoctypePublicKeyword => self.state_after_doctype_public_keyword(),
264 State::BeforeDoctypePublicIdentifier => self.state_before_doctype_public_identifier(),
265 State::DoctypePublicIdentifierDoubleQuoted => {
266 self.state_doctype_public_identifier_double_quoted()
267 }
268 State::DoctypePublicIdentifierSingleQuoted => {
269 self.state_doctype_public_identifier_single_quoted()
270 }
271 State::AfterDoctypePublicIdentifier => self.state_after_doctype_public_identifier(),
272 State::BetweenDoctypePublicAndSystemIdentifiers => {
273 self.state_between_doctype_public_and_system_identifiers()
274 }
275 State::AfterDoctypeSystemKeyword => self.state_after_doctype_system_keyword(),
276 State::BeforeDoctypeSystemIdentifier => self.state_before_doctype_system_identifier(),
277 State::DoctypeSystemIdentifierDoubleQuoted => {
278 self.state_doctype_system_identifier_double_quoted()
279 }
280 State::DoctypeSystemIdentifierSingleQuoted => {
281 self.state_doctype_system_identifier_single_quoted()
282 }
283 State::AfterDoctypeSystemIdentifier => self.state_after_doctype_system_identifier(),
284 State::BogusDoctype => self.state_bogus_doctype(),
285 State::CharacterReference => self.state_character_reference(),
286 State::NumericCharacterReference => self.state_numeric_character_reference(),
287 State::HexCharacterReferenceStart => self.state_hex_character_reference_start(),
288 State::DecCharacterReferenceStart => self.state_dec_character_reference_start(),
289 State::HexCharacterReference => self.state_hex_character_reference(),
290 State::DecCharacterReference => self.state_dec_character_reference(),
291 State::NumericCharacterReferenceEnd => self.state_numeric_character_reference_end(),
292 State::NamedCharacterReference => self.state_named_character_reference(),
293 }
294 }
295
296 // --- State implementations ---
297
298 fn state_data(&mut self) {
299 match self.next_char() {
300 Some('&') => {
301 self.return_state = State::Data;
302 self.state = State::CharacterReference;
303 }
304 Some('<') => {
305 self.state = State::TagOpen;
306 }
307 Some('\0') => {
308 // Parse error. Emit replacement character.
309 self.emit_char('\u{FFFD}');
310 }
311 None => {
312 self.emit_eof();
313 }
314 Some(c) => {
315 self.emit_char(c);
316 }
317 }
318 }
319
320 fn state_tag_open(&mut self) {
321 match self.next_char() {
322 Some('!') => {
323 self.state = State::MarkupDeclarationOpen;
324 }
325 Some('/') => {
326 self.state = State::EndTagOpen;
327 }
328 Some(c) if c.is_ascii_alphabetic() => {
329 self.start_new_tag(false);
330 self.reconsume();
331 self.state = State::TagName;
332 }
333 Some('?') => {
334 // Parse error. Create a comment token.
335 self.comment_data.clear();
336 self.reconsume();
337 self.state = State::BogusComment;
338 }
339 None => {
340 // Parse error. Emit '<' and EOF.
341 self.emit_char('<');
342 self.emit_eof();
343 }
344 Some(_) => {
345 // Parse error. Emit '<' and reconsume.
346 self.emit_char('<');
347 self.reconsume();
348 self.state = State::Data;
349 }
350 }
351 }
352
353 fn state_end_tag_open(&mut self) {
354 match self.next_char() {
355 Some(c) if c.is_ascii_alphabetic() => {
356 self.start_new_tag(true);
357 self.reconsume();
358 self.state = State::TagName;
359 }
360 Some('>') => {
361 // Parse error. Switch to data state.
362 self.state = State::Data;
363 }
364 None => {
365 self.emit_char('<');
366 self.emit_char('/');
367 self.emit_eof();
368 }
369 Some(_) => {
370 // Parse error. Create a comment.
371 self.comment_data.clear();
372 self.reconsume();
373 self.state = State::BogusComment;
374 }
375 }
376 }
377
378 fn state_tag_name(&mut self) {
379 match self.next_char() {
380 Some('\t') | Some('\n') | Some('\x0C') | Some(' ') => {
381 self.state = State::BeforeAttributeName;
382 }
383 Some('/') => {
384 self.state = State::SelfClosingStartTag;
385 }
386 Some('>') => {
387 self.state = State::Data;
388 self.emit_current_tag();
389 }
390 Some(c) if c.is_ascii_uppercase() => {
391 self.tag_name.push(c.to_ascii_lowercase());
392 }
393 Some('\0') => {
394 self.tag_name.push('\u{FFFD}');
395 }
396 None => {
397 self.emit_eof();
398 }
399 Some(c) => {
400 self.tag_name.push(c);
401 }
402 }
403 }
404
405 fn state_before_attribute_name(&mut self) {
406 match self.next_char() {
407 Some('\t') | Some('\n') | Some('\x0C') | Some(' ') => {
408 // Ignore whitespace.
409 }
410 Some('/') | Some('>') => {
411 self.reconsume();
412 self.state = State::AfterAttributeName;
413 }
414 None => {
415 // EOF: go to AfterAttributeName without reconsuming.
416 self.state = State::AfterAttributeName;
417 }
418 Some('=') => {
419 // Parse error. Start a new attribute with '=' as name.
420 self.start_new_attribute();
421 self.current_attr_name.push('=');
422 self.state = State::AttributeName;
423 }
424 Some(_) => {
425 self.start_new_attribute();
426 self.reconsume();
427 self.state = State::AttributeName;
428 }
429 }
430 }
431
432 fn state_attribute_name(&mut self) {
433 match self.next_char() {
434 Some('\t') | Some('\n') | Some('\x0C') | Some(' ') | Some('/') | Some('>') => {
435 self.reconsume();
436 self.state = State::AfterAttributeName;
437 }
438 None => {
439 self.state = State::AfterAttributeName;
440 }
441 Some('=') => {
442 self.state = State::BeforeAttributeValue;
443 }
444 Some(c) if c.is_ascii_uppercase() => {
445 self.current_attr_name.push(c.to_ascii_lowercase());
446 }
447 Some('\0') => {
448 self.current_attr_name.push('\u{FFFD}');
449 }
450 Some(c) => {
451 self.current_attr_name.push(c);
452 }
453 }
454 }
455
456 fn state_after_attribute_name(&mut self) {
457 match self.next_char() {
458 Some('\t') | Some('\n') | Some('\x0C') | Some(' ') => {
459 // Ignore.
460 }
461 Some('/') => {
462 self.state = State::SelfClosingStartTag;
463 }
464 Some('=') => {
465 self.state = State::BeforeAttributeValue;
466 }
467 Some('>') => {
468 self.state = State::Data;
469 self.emit_current_tag();
470 }
471 None => {
472 self.emit_eof();
473 }
474 Some(_) => {
475 self.start_new_attribute();
476 self.reconsume();
477 self.state = State::AttributeName;
478 }
479 }
480 }
481
482 fn state_before_attribute_value(&mut self) {
483 match self.next_char() {
484 Some('\t') | Some('\n') | Some('\x0C') | Some(' ') => {
485 // Ignore.
486 }
487 Some('"') => {
488 self.state = State::AttributeValueDoubleQuoted;
489 }
490 Some('\'') => {
491 self.state = State::AttributeValueSingleQuoted;
492 }
493 Some('>') => {
494 // Parse error. Emit tag with missing value.
495 self.state = State::Data;
496 self.emit_current_tag();
497 }
498 _ => {
499 self.reconsume();
500 self.state = State::AttributeValueUnquoted;
501 }
502 }
503 }
504
505 fn state_attribute_value_double_quoted(&mut self) {
506 match self.next_char() {
507 Some('"') => {
508 self.state = State::AfterAttributeValueQuoted;
509 }
510 Some('&') => {
511 self.return_state = State::AttributeValueDoubleQuoted;
512 self.state = State::CharacterReference;
513 }
514 Some('\0') => {
515 self.current_attr_value.push('\u{FFFD}');
516 }
517 None => {
518 self.emit_eof();
519 }
520 Some(c) => {
521 self.current_attr_value.push(c);
522 }
523 }
524 }
525
526 fn state_attribute_value_single_quoted(&mut self) {
527 match self.next_char() {
528 Some('\'') => {
529 self.state = State::AfterAttributeValueQuoted;
530 }
531 Some('&') => {
532 self.return_state = State::AttributeValueSingleQuoted;
533 self.state = State::CharacterReference;
534 }
535 Some('\0') => {
536 self.current_attr_value.push('\u{FFFD}');
537 }
538 None => {
539 self.emit_eof();
540 }
541 Some(c) => {
542 self.current_attr_value.push(c);
543 }
544 }
545 }
546
547 fn state_attribute_value_unquoted(&mut self) {
548 match self.next_char() {
549 Some('\t') | Some('\n') | Some('\x0C') | Some(' ') => {
550 self.state = State::BeforeAttributeName;
551 }
552 Some('&') => {
553 self.return_state = State::AttributeValueUnquoted;
554 self.state = State::CharacterReference;
555 }
556 Some('>') => {
557 self.state = State::Data;
558 self.emit_current_tag();
559 }
560 Some('\0') => {
561 self.current_attr_value.push('\u{FFFD}');
562 }
563 None => {
564 self.emit_eof();
565 }
566 Some(c) => {
567 self.current_attr_value.push(c);
568 }
569 }
570 }
571
572 fn state_after_attribute_value_quoted(&mut self) {
573 match self.next_char() {
574 Some('\t') | Some('\n') | Some('\x0C') | Some(' ') => {
575 self.state = State::BeforeAttributeName;
576 }
577 Some('/') => {
578 self.state = State::SelfClosingStartTag;
579 }
580 Some('>') => {
581 self.state = State::Data;
582 self.emit_current_tag();
583 }
584 None => {
585 self.emit_eof();
586 }
587 Some(_) => {
588 // Parse error. Reconsume in before attribute name.
589 self.reconsume();
590 self.state = State::BeforeAttributeName;
591 }
592 }
593 }
594
595 fn state_self_closing_start_tag(&mut self) {
596 match self.next_char() {
597 Some('>') => {
598 self.tag_self_closing = true;
599 self.state = State::Data;
600 self.emit_current_tag();
601 }
602 None => {
603 self.emit_eof();
604 }
605 Some(_) => {
606 // Parse error. Reconsume in before attribute name.
607 self.reconsume();
608 self.state = State::BeforeAttributeName;
609 }
610 }
611 }
612
613 fn state_bogus_comment(&mut self) {
614 match self.next_char() {
615 Some('>') => {
616 self.state = State::Data;
617 self.emit_current_comment();
618 }
619 None => {
620 self.emit_current_comment();
621 self.emit_eof();
622 }
623 Some('\0') => {
624 self.comment_data.push('\u{FFFD}');
625 }
626 Some(c) => {
627 self.comment_data.push(c);
628 }
629 }
630 }
631
632 fn state_markup_declaration_open(&mut self) {
633 // Check for `--`, `DOCTYPE`, or `[CDATA[`
634 if self.starts_with("--") {
635 self.pos += 2;
636 self.comment_data.clear();
637 self.state = State::CommentStart;
638 } else if self.starts_with_case_insensitive("DOCTYPE") {
639 self.pos += 7;
640 self.state = State::Doctype;
641 } else if self.starts_with("[CDATA[") {
642 // Per spec, if not in foreign content, parse error → bogus comment.
643 self.pos += 7;
644 self.comment_data.clear();
645 self.comment_data.push_str("[CDATA[");
646 self.state = State::BogusComment;
647 } else {
648 // Parse error. Bogus comment.
649 self.comment_data.clear();
650 self.state = State::BogusComment;
651 }
652 }
653
654 fn state_comment_start(&mut self) {
655 match self.next_char() {
656 Some('-') => {
657 self.state = State::CommentStartDash;
658 }
659 Some('>') => {
660 // Parse error. Emit empty comment.
661 self.state = State::Data;
662 self.emit_current_comment();
663 }
664 _ => {
665 self.reconsume();
666 self.state = State::Comment;
667 }
668 }
669 }
670
671 fn state_comment_start_dash(&mut self) {
672 match self.next_char() {
673 Some('-') => {
674 self.state = State::CommentEnd;
675 }
676 Some('>') => {
677 // Parse error.
678 self.state = State::Data;
679 self.emit_current_comment();
680 }
681 None => {
682 self.emit_current_comment();
683 self.emit_eof();
684 }
685 Some(_) => {
686 self.comment_data.push('-');
687 self.reconsume();
688 self.state = State::Comment;
689 }
690 }
691 }
692
693 fn state_comment(&mut self) {
694 match self.next_char() {
695 Some('<') => {
696 self.comment_data.push('<');
697 self.state = State::CommentLessThanSign;
698 }
699 Some('-') => {
700 self.state = State::CommentEndDash;
701 }
702 Some('\0') => {
703 self.comment_data.push('\u{FFFD}');
704 }
705 None => {
706 self.emit_current_comment();
707 self.emit_eof();
708 }
709 Some(c) => {
710 self.comment_data.push(c);
711 }
712 }
713 }
714
715 fn state_comment_less_than_sign(&mut self) {
716 match self.next_char() {
717 Some('!') => {
718 self.comment_data.push('!');
719 self.state = State::CommentLessThanSignBang;
720 }
721 Some('<') => {
722 self.comment_data.push('<');
723 }
724 None => {
725 // Don't reconsume on EOF — pos didn't advance, so reconsuming
726 // would back up to '<' and loop forever between here and Comment.
727 self.state = State::Comment;
728 }
729 Some(_) => {
730 self.reconsume();
731 self.state = State::Comment;
732 }
733 }
734 }
735
736 fn state_comment_less_than_sign_bang(&mut self) {
737 match self.next_char() {
738 Some('-') => {
739 self.state = State::CommentLessThanSignBangDash;
740 }
741 _ => {
742 self.reconsume();
743 self.state = State::Comment;
744 }
745 }
746 }
747
748 fn state_comment_less_than_sign_bang_dash(&mut self) {
749 match self.next_char() {
750 Some('-') => {
751 self.state = State::CommentLessThanSignBangDashDash;
752 }
753 _ => {
754 self.reconsume();
755 self.state = State::CommentEndDash;
756 }
757 }
758 }
759
760 fn state_comment_less_than_sign_bang_dash_dash(&mut self) {
761 match self.next_char() {
762 Some('>') | None => {
763 self.reconsume();
764 self.state = State::CommentEnd;
765 }
766 Some(_) => {
767 // Parse error.
768 self.reconsume();
769 self.state = State::CommentEnd;
770 }
771 }
772 }
773
774 fn state_comment_end_dash(&mut self) {
775 match self.next_char() {
776 Some('-') => {
777 self.state = State::CommentEnd;
778 }
779 None => {
780 self.emit_current_comment();
781 self.emit_eof();
782 }
783 Some(_) => {
784 self.comment_data.push('-');
785 self.reconsume();
786 self.state = State::Comment;
787 }
788 }
789 }
790
791 fn state_comment_end(&mut self) {
792 match self.next_char() {
793 Some('>') => {
794 self.state = State::Data;
795 self.emit_current_comment();
796 }
797 Some('!') => {
798 self.state = State::CommentEndBang;
799 }
800 Some('-') => {
801 self.comment_data.push('-');
802 }
803 None => {
804 self.emit_current_comment();
805 self.emit_eof();
806 }
807 Some(_) => {
808 self.comment_data.push('-');
809 self.comment_data.push('-');
810 self.reconsume();
811 self.state = State::Comment;
812 }
813 }
814 }
815
816 fn state_comment_end_bang(&mut self) {
817 match self.next_char() {
818 Some('-') => {
819 self.comment_data.push('-');
820 self.comment_data.push('-');
821 self.comment_data.push('!');
822 self.state = State::CommentEndDash;
823 }
824 Some('>') => {
825 self.state = State::Data;
826 self.emit_current_comment();
827 }
828 None => {
829 self.emit_current_comment();
830 self.emit_eof();
831 }
832 Some(_) => {
833 self.comment_data.push('-');
834 self.comment_data.push('-');
835 self.comment_data.push('!');
836 self.reconsume();
837 self.state = State::Comment;
838 }
839 }
840 }
841
842 fn state_doctype(&mut self) {
843 match self.next_char() {
844 Some('\t') | Some('\n') | Some('\x0C') | Some(' ') => {
845 self.state = State::BeforeDoctypeName;
846 }
847 Some('>') => {
848 self.reconsume();
849 self.state = State::BeforeDoctypeName;
850 }
851 None => {
852 self.doctype_name = None;
853 self.doctype_public_id = None;
854 self.doctype_system_id = None;
855 self.doctype_force_quirks = true;
856 self.emit_current_doctype();
857 self.emit_eof();
858 }
859 Some(_) => {
860 // Parse error. Missing whitespace before DOCTYPE name.
861 self.reconsume();
862 self.state = State::BeforeDoctypeName;
863 }
864 }
865 }
866
867 fn state_before_doctype_name(&mut self) {
868 match self.next_char() {
869 Some('\t') | Some('\n') | Some('\x0C') | Some(' ') => {
870 // Ignore whitespace.
871 }
872 Some(c) if c.is_ascii_uppercase() => {
873 self.doctype_name = Some(c.to_ascii_lowercase().to_string());
874 self.doctype_public_id = None;
875 self.doctype_system_id = None;
876 self.doctype_force_quirks = false;
877 self.state = State::DoctypeName;
878 }
879 Some('\0') => {
880 self.doctype_name = Some("\u{FFFD}".to_string());
881 self.doctype_public_id = None;
882 self.doctype_system_id = None;
883 self.doctype_force_quirks = false;
884 self.state = State::DoctypeName;
885 }
886 Some('>') => {
887 // Parse error. Force quirks.
888 self.doctype_name = None;
889 self.doctype_public_id = None;
890 self.doctype_system_id = None;
891 self.doctype_force_quirks = true;
892 self.state = State::Data;
893 self.emit_current_doctype();
894 }
895 None => {
896 self.doctype_name = None;
897 self.doctype_public_id = None;
898 self.doctype_system_id = None;
899 self.doctype_force_quirks = true;
900 self.emit_current_doctype();
901 self.emit_eof();
902 }
903 Some(c) => {
904 self.doctype_name = Some(c.to_string());
905 self.doctype_public_id = None;
906 self.doctype_system_id = None;
907 self.doctype_force_quirks = false;
908 self.state = State::DoctypeName;
909 }
910 }
911 }
912
913 fn state_doctype_name(&mut self) {
914 match self.next_char() {
915 Some('\t') | Some('\n') | Some('\x0C') | Some(' ') => {
916 self.state = State::AfterDoctypeName;
917 }
918 Some('>') => {
919 self.state = State::Data;
920 self.emit_current_doctype();
921 }
922 Some(c) if c.is_ascii_uppercase() => {
923 if let Some(ref mut name) = self.doctype_name {
924 name.push(c.to_ascii_lowercase());
925 }
926 }
927 Some('\0') => {
928 if let Some(ref mut name) = self.doctype_name {
929 name.push('\u{FFFD}');
930 }
931 }
932 None => {
933 self.doctype_force_quirks = true;
934 self.emit_current_doctype();
935 self.emit_eof();
936 }
937 Some(c) => {
938 if let Some(ref mut name) = self.doctype_name {
939 name.push(c);
940 }
941 }
942 }
943 }
944
945 fn state_after_doctype_name(&mut self) {
946 match self.next_char() {
947 Some('\t') | Some('\n') | Some('\x0C') | Some(' ') => {
948 // Ignore.
949 }
950 Some('>') => {
951 self.state = State::Data;
952 self.emit_current_doctype();
953 }
954 None => {
955 self.doctype_force_quirks = true;
956 self.emit_current_doctype();
957 self.emit_eof();
958 }
959 Some(_) => {
960 // Check for PUBLIC or SYSTEM keyword.
961 self.reconsume();
962 if self.starts_with_case_insensitive("PUBLIC") {
963 self.pos += 6;
964 self.state = State::AfterDoctypePublicKeyword;
965 } else if self.starts_with_case_insensitive("SYSTEM") {
966 self.pos += 6;
967 self.state = State::AfterDoctypeSystemKeyword;
968 } else {
969 // Parse error.
970 self.doctype_force_quirks = true;
971 self.next_char(); // consume the reconsumed char
972 self.state = State::BogusDoctype;
973 }
974 }
975 }
976 }
977
978 fn state_after_doctype_public_keyword(&mut self) {
979 match self.next_char() {
980 Some('\t') | Some('\n') | Some('\x0C') | Some(' ') => {
981 self.state = State::BeforeDoctypePublicIdentifier;
982 }
983 Some('"') => {
984 // Parse error. Missing whitespace.
985 self.doctype_public_id = Some(String::new());
986 self.state = State::DoctypePublicIdentifierDoubleQuoted;
987 }
988 Some('\'') => {
989 self.doctype_public_id = Some(String::new());
990 self.state = State::DoctypePublicIdentifierSingleQuoted;
991 }
992 Some('>') => {
993 self.doctype_force_quirks = true;
994 self.state = State::Data;
995 self.emit_current_doctype();
996 }
997 None => {
998 self.doctype_force_quirks = true;
999 self.emit_current_doctype();
1000 self.emit_eof();
1001 }
1002 Some(_) => {
1003 self.doctype_force_quirks = true;
1004 self.reconsume();
1005 self.state = State::BogusDoctype;
1006 }
1007 }
1008 }
1009
1010 fn state_before_doctype_public_identifier(&mut self) {
1011 match self.next_char() {
1012 Some('\t') | Some('\n') | Some('\x0C') | Some(' ') => {
1013 // Ignore.
1014 }
1015 Some('"') => {
1016 self.doctype_public_id = Some(String::new());
1017 self.state = State::DoctypePublicIdentifierDoubleQuoted;
1018 }
1019 Some('\'') => {
1020 self.doctype_public_id = Some(String::new());
1021 self.state = State::DoctypePublicIdentifierSingleQuoted;
1022 }
1023 Some('>') => {
1024 self.doctype_force_quirks = true;
1025 self.state = State::Data;
1026 self.emit_current_doctype();
1027 }
1028 None => {
1029 self.doctype_force_quirks = true;
1030 self.emit_current_doctype();
1031 self.emit_eof();
1032 }
1033 Some(_) => {
1034 self.doctype_force_quirks = true;
1035 self.reconsume();
1036 self.state = State::BogusDoctype;
1037 }
1038 }
1039 }
1040
1041 fn state_doctype_public_identifier_double_quoted(&mut self) {
1042 match self.next_char() {
1043 Some('"') => {
1044 self.state = State::AfterDoctypePublicIdentifier;
1045 }
1046 Some('\0') => {
1047 if let Some(ref mut id) = self.doctype_public_id {
1048 id.push('\u{FFFD}');
1049 }
1050 }
1051 Some('>') => {
1052 self.doctype_force_quirks = true;
1053 self.state = State::Data;
1054 self.emit_current_doctype();
1055 }
1056 None => {
1057 self.doctype_force_quirks = true;
1058 self.emit_current_doctype();
1059 self.emit_eof();
1060 }
1061 Some(c) => {
1062 if let Some(ref mut id) = self.doctype_public_id {
1063 id.push(c);
1064 }
1065 }
1066 }
1067 }
1068
1069 fn state_doctype_public_identifier_single_quoted(&mut self) {
1070 match self.next_char() {
1071 Some('\'') => {
1072 self.state = State::AfterDoctypePublicIdentifier;
1073 }
1074 Some('\0') => {
1075 if let Some(ref mut id) = self.doctype_public_id {
1076 id.push('\u{FFFD}');
1077 }
1078 }
1079 Some('>') => {
1080 self.doctype_force_quirks = true;
1081 self.state = State::Data;
1082 self.emit_current_doctype();
1083 }
1084 None => {
1085 self.doctype_force_quirks = true;
1086 self.emit_current_doctype();
1087 self.emit_eof();
1088 }
1089 Some(c) => {
1090 if let Some(ref mut id) = self.doctype_public_id {
1091 id.push(c);
1092 }
1093 }
1094 }
1095 }
1096
1097 fn state_after_doctype_public_identifier(&mut self) {
1098 match self.next_char() {
1099 Some('\t') | Some('\n') | Some('\x0C') | Some(' ') => {
1100 self.state = State::BetweenDoctypePublicAndSystemIdentifiers;
1101 }
1102 Some('>') => {
1103 self.state = State::Data;
1104 self.emit_current_doctype();
1105 }
1106 Some('"') => {
1107 // Parse error. Missing whitespace.
1108 self.doctype_system_id = Some(String::new());
1109 self.state = State::DoctypeSystemIdentifierDoubleQuoted;
1110 }
1111 Some('\'') => {
1112 self.doctype_system_id = Some(String::new());
1113 self.state = State::DoctypeSystemIdentifierSingleQuoted;
1114 }
1115 None => {
1116 self.doctype_force_quirks = true;
1117 self.emit_current_doctype();
1118 self.emit_eof();
1119 }
1120 Some(_) => {
1121 self.doctype_force_quirks = true;
1122 self.reconsume();
1123 self.state = State::BogusDoctype;
1124 }
1125 }
1126 }
1127
1128 fn state_between_doctype_public_and_system_identifiers(&mut self) {
1129 match self.next_char() {
1130 Some('\t') | Some('\n') | Some('\x0C') | Some(' ') => {
1131 // Ignore.
1132 }
1133 Some('>') => {
1134 self.state = State::Data;
1135 self.emit_current_doctype();
1136 }
1137 Some('"') => {
1138 self.doctype_system_id = Some(String::new());
1139 self.state = State::DoctypeSystemIdentifierDoubleQuoted;
1140 }
1141 Some('\'') => {
1142 self.doctype_system_id = Some(String::new());
1143 self.state = State::DoctypeSystemIdentifierSingleQuoted;
1144 }
1145 None => {
1146 self.doctype_force_quirks = true;
1147 self.emit_current_doctype();
1148 self.emit_eof();
1149 }
1150 Some(_) => {
1151 self.doctype_force_quirks = true;
1152 self.reconsume();
1153 self.state = State::BogusDoctype;
1154 }
1155 }
1156 }
1157
1158 fn state_after_doctype_system_keyword(&mut self) {
1159 match self.next_char() {
1160 Some('\t') | Some('\n') | Some('\x0C') | Some(' ') => {
1161 self.state = State::BeforeDoctypeSystemIdentifier;
1162 }
1163 Some('"') => {
1164 self.doctype_system_id = Some(String::new());
1165 self.state = State::DoctypeSystemIdentifierDoubleQuoted;
1166 }
1167 Some('\'') => {
1168 self.doctype_system_id = Some(String::new());
1169 self.state = State::DoctypeSystemIdentifierSingleQuoted;
1170 }
1171 Some('>') => {
1172 self.doctype_force_quirks = true;
1173 self.state = State::Data;
1174 self.emit_current_doctype();
1175 }
1176 None => {
1177 self.doctype_force_quirks = true;
1178 self.emit_current_doctype();
1179 self.emit_eof();
1180 }
1181 Some(_) => {
1182 self.doctype_force_quirks = true;
1183 self.reconsume();
1184 self.state = State::BogusDoctype;
1185 }
1186 }
1187 }
1188
1189 fn state_before_doctype_system_identifier(&mut self) {
1190 match self.next_char() {
1191 Some('\t') | Some('\n') | Some('\x0C') | Some(' ') => {
1192 // Ignore.
1193 }
1194 Some('"') => {
1195 self.doctype_system_id = Some(String::new());
1196 self.state = State::DoctypeSystemIdentifierDoubleQuoted;
1197 }
1198 Some('\'') => {
1199 self.doctype_system_id = Some(String::new());
1200 self.state = State::DoctypeSystemIdentifierSingleQuoted;
1201 }
1202 Some('>') => {
1203 self.doctype_force_quirks = true;
1204 self.state = State::Data;
1205 self.emit_current_doctype();
1206 }
1207 None => {
1208 self.doctype_force_quirks = true;
1209 self.emit_current_doctype();
1210 self.emit_eof();
1211 }
1212 Some(_) => {
1213 self.doctype_force_quirks = true;
1214 self.reconsume();
1215 self.state = State::BogusDoctype;
1216 }
1217 }
1218 }
1219
1220 fn state_doctype_system_identifier_double_quoted(&mut self) {
1221 match self.next_char() {
1222 Some('"') => {
1223 self.state = State::AfterDoctypeSystemIdentifier;
1224 }
1225 Some('\0') => {
1226 if let Some(ref mut id) = self.doctype_system_id {
1227 id.push('\u{FFFD}');
1228 }
1229 }
1230 Some('>') => {
1231 self.doctype_force_quirks = true;
1232 self.state = State::Data;
1233 self.emit_current_doctype();
1234 }
1235 None => {
1236 self.doctype_force_quirks = true;
1237 self.emit_current_doctype();
1238 self.emit_eof();
1239 }
1240 Some(c) => {
1241 if let Some(ref mut id) = self.doctype_system_id {
1242 id.push(c);
1243 }
1244 }
1245 }
1246 }
1247
1248 fn state_doctype_system_identifier_single_quoted(&mut self) {
1249 match self.next_char() {
1250 Some('\'') => {
1251 self.state = State::AfterDoctypeSystemIdentifier;
1252 }
1253 Some('\0') => {
1254 if let Some(ref mut id) = self.doctype_system_id {
1255 id.push('\u{FFFD}');
1256 }
1257 }
1258 Some('>') => {
1259 self.doctype_force_quirks = true;
1260 self.state = State::Data;
1261 self.emit_current_doctype();
1262 }
1263 None => {
1264 self.doctype_force_quirks = true;
1265 self.emit_current_doctype();
1266 self.emit_eof();
1267 }
1268 Some(c) => {
1269 if let Some(ref mut id) = self.doctype_system_id {
1270 id.push(c);
1271 }
1272 }
1273 }
1274 }
1275
1276 fn state_after_doctype_system_identifier(&mut self) {
1277 match self.next_char() {
1278 Some('\t') | Some('\n') | Some('\x0C') | Some(' ') => {
1279 // Ignore.
1280 }
1281 Some('>') => {
1282 self.state = State::Data;
1283 self.emit_current_doctype();
1284 }
1285 None => {
1286 self.doctype_force_quirks = true;
1287 self.emit_current_doctype();
1288 self.emit_eof();
1289 }
1290 Some(_) => {
1291 // Parse error, but do NOT set force_quirks.
1292 self.reconsume();
1293 self.state = State::BogusDoctype;
1294 }
1295 }
1296 }
1297
1298 fn state_bogus_doctype(&mut self) {
1299 match self.next_char() {
1300 Some('>') => {
1301 self.state = State::Data;
1302 self.emit_current_doctype();
1303 }
1304 Some('\0') => {
1305 // Parse error. Ignore.
1306 }
1307 None => {
1308 self.emit_current_doctype();
1309 self.emit_eof();
1310 }
1311 Some(_) => {
1312 // Ignore.
1313 }
1314 }
1315 }
1316
1317 // --- Character reference states ---
1318
1319 fn state_character_reference(&mut self) {
1320 self.temp_buf.clear();
1321 self.temp_buf.push('&');
1322
1323 match self.peek_char() {
1324 Some(c) if c.is_ascii_alphanumeric() => {
1325 self.state = State::NamedCharacterReference;
1326 }
1327 Some('#') => {
1328 self.temp_buf.push('#');
1329 self.next_char();
1330 self.state = State::NumericCharacterReference;
1331 }
1332 _ => {
1333 // Not a character reference. Flush '&' to return state.
1334 self.flush_char_ref("&");
1335 self.state = self.return_state;
1336 }
1337 }
1338 }
1339
1340 fn state_named_character_reference(&mut self) {
1341 // Collect alphanumeric characters to form the entity name.
1342 // Per spec, entity names can also contain digits after the first char.
1343 let mut name = String::new();
1344 let start_pos = self.pos;
1345
1346 while let Some(c) = self.peek_char() {
1347 if c.is_ascii_alphanumeric() {
1348 name.push(c);
1349 self.pos += 1;
1350 } else {
1351 break;
1352 }
1353 }
1354
1355 // Try to find a match, trying longest match first.
1356 // First check if the full name + semicolon matches.
1357 let has_trailing_semi = self.peek_char() == Some(';');
1358
1359 let mut matched_value: Option<&str> = None;
1360 let mut matched_len = 0;
1361
1362 // Try the full name first (with semicolon if present).
1363 if has_trailing_semi {
1364 if let Some(val) = entities::lookup_entity(&name) {
1365 matched_value = Some(val);
1366 matched_len = name.len();
1367 }
1368 }
1369
1370 // If no match with full name, try progressively shorter prefixes.
1371 if matched_value.is_none() {
1372 for i in (1..=name.len()).rev() {
1373 let candidate = &name[..i];
1374 if let Some(val) = entities::lookup_entity(candidate) {
1375 // Without semicolon, only legacy entities are recognized.
1376 if entities::is_legacy_entity(candidate) {
1377 matched_value = Some(val);
1378 matched_len = i;
1379 break;
1380 }
1381 }
1382 }
1383 }
1384
1385 // Also try the full name without semicolon for legacy entities.
1386 if matched_value.is_none() && !has_trailing_semi {
1387 if let Some(val) = entities::lookup_entity(&name) {
1388 if entities::is_legacy_entity(&name) {
1389 matched_value = Some(val);
1390 matched_len = name.len();
1391 }
1392 }
1393 }
1394
1395 if let Some(value) = matched_value {
1396 // Rewind to just after the matched portion.
1397 self.pos = start_pos + matched_len;
1398
1399 // Check for semicolon after the matched portion.
1400 let has_semi = self.peek_char() == Some(';');
1401 if has_semi {
1402 self.pos += 1;
1403 }
1404
1405 // Per spec: if consumed as part of an attribute and the character
1406 // after the match is `=` or alphanumeric, and no semicolon,
1407 // flush the original text instead.
1408 let in_attribute = matches!(
1409 self.return_state,
1410 State::AttributeValueDoubleQuoted
1411 | State::AttributeValueSingleQuoted
1412 | State::AttributeValueUnquoted
1413 );
1414
1415 if !has_semi && in_attribute {
1416 if let Some(next) = self.peek_char() {
1417 if next == '=' || next.is_ascii_alphanumeric() {
1418 // Not a reference. Flush original text.
1419 let mut original = "&".to_string();
1420 original.push_str(&name[..matched_len]);
1421 self.flush_char_ref(&original);
1422 self.state = self.return_state;
1423 return;
1424 }
1425 }
1426 }
1427
1428 self.flush_char_ref(value);
1429 self.state = self.return_state;
1430 } else {
1431 // No match. Rewind and flush '&' + all collected chars.
1432 self.pos = start_pos;
1433 self.flush_char_ref("&");
1434 for _ in 0..name.len() {
1435 let c = self.next_char().unwrap();
1436 let s = c.to_string();
1437 self.flush_char_ref(&s);
1438 }
1439 self.state = self.return_state;
1440 }
1441 }
1442
1443 fn state_numeric_character_reference(&mut self) {
1444 self.char_ref_code = 0;
1445 match self.peek_char() {
1446 Some('x') | Some('X') => {
1447 self.temp_buf.push(self.peek_char().unwrap());
1448 self.next_char();
1449 self.state = State::HexCharacterReferenceStart;
1450 }
1451 _ => {
1452 self.state = State::DecCharacterReferenceStart;
1453 }
1454 }
1455 }
1456
1457 fn state_hex_character_reference_start(&mut self) {
1458 match self.peek_char() {
1459 Some(c) if c.is_ascii_hexdigit() => {
1460 self.state = State::HexCharacterReference;
1461 }
1462 _ => {
1463 // Parse error. Flush temp_buf.
1464 let buf = self.temp_buf.clone();
1465 self.flush_char_ref(&buf);
1466 self.state = self.return_state;
1467 }
1468 }
1469 }
1470
1471 fn state_dec_character_reference_start(&mut self) {
1472 match self.peek_char() {
1473 Some(c) if c.is_ascii_digit() => {
1474 self.state = State::DecCharacterReference;
1475 }
1476 _ => {
1477 let buf = self.temp_buf.clone();
1478 self.flush_char_ref(&buf);
1479 self.state = self.return_state;
1480 }
1481 }
1482 }
1483
1484 fn state_hex_character_reference(&mut self) {
1485 match self.next_char() {
1486 Some(c) if c.is_ascii_hexdigit() => {
1487 // Cap at a value that's clearly out of range but won't overflow.
1488 self.char_ref_code = self
1489 .char_ref_code
1490 .saturating_mul(16)
1491 .saturating_add(c.to_digit(16).unwrap());
1492 if self.char_ref_code > 0x10FFFF {
1493 self.char_ref_code = 0x110000;
1494 }
1495 }
1496 Some(';') => {
1497 self.state = State::NumericCharacterReferenceEnd;
1498 }
1499 None => {
1500 // EOF: missing semicolon parse error. Don't reconsume.
1501 self.state = State::NumericCharacterReferenceEnd;
1502 }
1503 Some(_) => {
1504 // Parse error: missing semicolon.
1505 self.reconsume();
1506 self.state = State::NumericCharacterReferenceEnd;
1507 }
1508 }
1509 }
1510
1511 fn state_dec_character_reference(&mut self) {
1512 match self.next_char() {
1513 Some(c) if c.is_ascii_digit() => {
1514 self.char_ref_code = self
1515 .char_ref_code
1516 .saturating_mul(10)
1517 .saturating_add(c.to_digit(10).unwrap());
1518 if self.char_ref_code > 0x10FFFF {
1519 self.char_ref_code = 0x110000;
1520 }
1521 }
1522 Some(';') => {
1523 self.state = State::NumericCharacterReferenceEnd;
1524 }
1525 None => {
1526 // EOF: missing semicolon parse error. Don't reconsume.
1527 self.state = State::NumericCharacterReferenceEnd;
1528 }
1529 Some(_) => {
1530 self.reconsume();
1531 self.state = State::NumericCharacterReferenceEnd;
1532 }
1533 }
1534 }
1535
1536 fn state_numeric_character_reference_end(&mut self) {
1537 let code = self.char_ref_code;
1538 let ch = match code {
1539 0 => '\u{FFFD}',
1540 // Surrogate range.
1541 0xD800..=0xDFFF => '\u{FFFD}',
1542 // Out of Unicode range.
1543 c if c > 0x10FFFF => '\u{FFFD}',
1544 // Windows-1252 replacement table for 0x80..0x9F.
1545 0x80 => '\u{20AC}',
1546 0x82 => '\u{201A}',
1547 0x83 => '\u{0192}',
1548 0x84 => '\u{201E}',
1549 0x85 => '\u{2026}',
1550 0x86 => '\u{2020}',
1551 0x87 => '\u{2021}',
1552 0x88 => '\u{02C6}',
1553 0x89 => '\u{2030}',
1554 0x8A => '\u{0160}',
1555 0x8B => '\u{2039}',
1556 0x8C => '\u{0152}',
1557 0x8E => '\u{017D}',
1558 0x91 => '\u{2018}',
1559 0x92 => '\u{2019}',
1560 0x93 => '\u{201C}',
1561 0x94 => '\u{201D}',
1562 0x95 => '\u{2022}',
1563 0x96 => '\u{2013}',
1564 0x97 => '\u{2014}',
1565 0x98 => '\u{02DC}',
1566 0x99 => '\u{2122}',
1567 0x9A => '\u{0161}',
1568 0x9B => '\u{203A}',
1569 0x9C => '\u{0153}',
1570 0x9E => '\u{017E}',
1571 0x9F => '\u{0178}',
1572 c => char::from_u32(c).unwrap_or('\u{FFFD}'),
1573 };
1574
1575 let s = ch.to_string();
1576 self.flush_char_ref(&s);
1577 self.state = self.return_state;
1578 }
1579
1580 // --- Helpers ---
1581
1582 fn starts_with(&self, s: &str) -> bool {
1583 let bytes: Vec<char> = s.chars().collect();
1584 if self.pos + bytes.len() > self.input.len() {
1585 return false;
1586 }
1587 for (i, &c) in bytes.iter().enumerate() {
1588 if self.input[self.pos + i] != c {
1589 return false;
1590 }
1591 }
1592 true
1593 }
1594
1595 fn starts_with_case_insensitive(&self, s: &str) -> bool {
1596 let bytes: Vec<char> = s.chars().collect();
1597 if self.pos + bytes.len() > self.input.len() {
1598 return false;
1599 }
1600 for (i, &c) in bytes.iter().enumerate() {
1601 if !self.input[self.pos + i].eq_ignore_ascii_case(&c) {
1602 return false;
1603 }
1604 }
1605 true
1606 }
1607}
1608
#[cfg(test)]
mod tests {
    use super::*;
    use crate::tokenize;

    // --- Expected-token builders ---

    /// Builds a `Token::StartTag` from borrowed name/value pairs.
    fn start_tag(name: &str, attributes: &[(&str, &str)], self_closing: bool) -> Token {
        Token::StartTag {
            name: name.to_string(),
            attributes: attributes
                .iter()
                .map(|&(key, value)| (key.to_string(), value.to_string()))
                .collect(),
            self_closing,
        }
    }

    /// Builds a `Token::EndTag`.
    fn end_tag(name: &str) -> Token {
        Token::EndTag {
            name: name.to_string(),
        }
    }

    /// Builds a `Token::Character`.
    fn text(data: &str) -> Token {
        Token::Character(data.to_string())
    }

    /// Builds a `Token::Comment`.
    fn comment_token(data: &str) -> Token {
        Token::Comment(data.to_string())
    }

    /// Builds a `Token::Doctype` with `force_quirks` unset.
    fn doctype(name: &str, public_id: Option<&str>, system_id: Option<&str>) -> Token {
        Token::Doctype {
            name: Some(name.to_string()),
            public_id: public_id.map(str::to_string),
            system_id: system_id.map(str::to_string),
            force_quirks: false,
        }
    }

    // --- Text ---

    #[test]
    fn empty_input() {
        let tokens = tokenize("");
        assert!(tokens.is_empty());
    }

    #[test]
    fn plain_text() {
        let tokens = tokenize("Hello, world!");
        assert_eq!(tokens, vec![text("Hello, world!")]);
    }

    #[test]
    fn newlines_in_text() {
        let tokens = tokenize("line1\nline2\nline3");
        assert_eq!(tokens, vec![text("line1\nline2\nline3")]);
    }

    #[test]
    fn null_in_text() {
        let tokens = tokenize("a\0b");
        assert_eq!(tokens, vec![text("a\u{FFFD}b")]);
    }

    #[test]
    fn less_than_in_text_not_tag() {
        // A bare '<' not followed by a letter should be emitted as text.
        let tokens = tokenize("1 < 2");
        assert_eq!(tokens, vec![text("1 < 2")]);
    }

    #[test]
    fn ampersand_not_entity() {
        let tokens = tokenize("a & b");
        assert_eq!(tokens, vec![text("a & b")]);
    }

    // --- Tags ---

    #[test]
    fn simple_element() {
        let tokens = tokenize("<p>Hello</p>");
        assert_eq!(
            tokens,
            vec![start_tag("p", &[], false), text("Hello"), end_tag("p")]
        );
    }

    #[test]
    fn self_closing_tag() {
        let tokens = tokenize("<br/>");
        assert_eq!(tokens, vec![start_tag("br", &[], true)]);
    }

    #[test]
    fn self_closing_img() {
        let tokens = tokenize("<img/>");
        assert_eq!(tokens, vec![start_tag("img", &[], true)]);
    }

    #[test]
    fn self_closing_with_attribute() {
        let tokens = tokenize(r#"<img src="test.png"/>"#);
        assert_eq!(
            tokens,
            vec![start_tag("img", &[("src", "test.png")], true)]
        );
    }

    #[test]
    fn adjacent_tags() {
        let tokens = tokenize("<b></b><i></i>");
        assert_eq!(
            tokens,
            vec![
                start_tag("b", &[], false),
                end_tag("b"),
                start_tag("i", &[], false),
                end_tag("i"),
            ]
        );
    }

    #[test]
    fn uppercase_tag_names_lowercased() {
        let tokens = tokenize("<DIV></DIV>");
        assert_eq!(tokens, vec![start_tag("div", &[], false), end_tag("div")]);
    }

    #[test]
    fn full_html_document() {
        let tokens =
            tokenize("<html><head><title>Test</title></head><body><p>Hello</p></body></html>");
        assert_eq!(
            tokens,
            vec![
                start_tag("html", &[], false),
                start_tag("head", &[], false),
                start_tag("title", &[], false),
                text("Test"),
                end_tag("title"),
                end_tag("head"),
                start_tag("body", &[], false),
                start_tag("p", &[], false),
                text("Hello"),
                end_tag("p"),
                end_tag("body"),
                end_tag("html"),
            ]
        );
    }

    // --- Attributes ---

    #[test]
    fn tag_with_attributes() {
        let tokens = tokenize(r#"<a href="url" class="link">"#);
        assert_eq!(
            tokens,
            vec![start_tag("a", &[("href", "url"), ("class", "link")], false)]
        );
    }

    #[test]
    fn tag_with_single_quoted_attributes() {
        let tokens = tokenize("<div id='main'>");
        assert_eq!(tokens, vec![start_tag("div", &[("id", "main")], false)]);
    }

    #[test]
    fn tag_with_unquoted_attribute() {
        let tokens = tokenize("<input type=text>");
        assert_eq!(tokens, vec![start_tag("input", &[("type", "text")], false)]);
    }

    #[test]
    fn multiple_attributes() {
        let tokens = tokenize(r#"<input type="text" name="foo" value="bar">"#);
        assert_eq!(
            tokens,
            vec![start_tag(
                "input",
                &[("type", "text"), ("name", "foo"), ("value", "bar")],
                false,
            )]
        );
    }

    #[test]
    fn boolean_attribute() {
        let tokens = tokenize("<input disabled>");
        assert_eq!(tokens, vec![start_tag("input", &[("disabled", "")], false)]);
    }

    #[test]
    fn attribute_with_empty_value() {
        let tokens = tokenize(r#"<div class="">"#);
        assert_eq!(tokens, vec![start_tag("div", &[("class", "")], false)]);
    }

    #[test]
    fn uppercase_attribute_names_lowercased() {
        let tokens = tokenize(r#"<div CLASS="x">"#);
        assert_eq!(tokens, vec![start_tag("div", &[("class", "x")], false)]);
    }

    #[test]
    fn duplicate_attributes_first_wins() {
        let tokens = tokenize(r#"<div class="a" class="b">"#);
        assert_eq!(tokens, vec![start_tag("div", &[("class", "a")], false)]);
    }

    // --- Comments ---

    #[test]
    fn comment() {
        let tokens = tokenize("<!-- comment -->");
        assert_eq!(tokens, vec![comment_token(" comment ")]);
    }

    #[test]
    fn empty_comment() {
        let tokens = tokenize("<!---->");
        assert_eq!(tokens, vec![comment_token("")]);
    }

    #[test]
    fn mixed_content() {
        let tokens = tokenize("Hello <!-- comment --> World");
        assert_eq!(
            tokens,
            vec![text("Hello "), comment_token(" comment "), text(" World")]
        );
    }

    #[test]
    fn cdata_in_html_becomes_comment() {
        let tokens = tokenize("<![CDATA[hello]]>");
        // In HTML (non-foreign) context, CDATA is a parse error → bogus comment.
        assert_eq!(tokens, vec![comment_token("[CDATA[hello]]")]);
    }

    // --- DOCTYPE ---

    #[test]
    fn doctype_html() {
        let tokens = tokenize("<!DOCTYPE html>");
        assert_eq!(tokens, vec![doctype("html", None, None)]);
    }

    #[test]
    fn doctype_case_insensitive() {
        let tokens = tokenize("<!doctype html>");
        assert_eq!(tokens, vec![doctype("html", None, None)]);
    }

    #[test]
    fn doctype_with_public_id() {
        let tokens = tokenize(
            r#"<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">"#,
        );
        assert_eq!(
            tokens,
            vec![doctype(
                "html",
                Some("-//W3C//DTD XHTML 1.0 Strict//EN"),
                Some("http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"),
            )]
        );
    }

    // --- Character references ---
    // The inputs below must stay escaped: they exercise the tokenizer's
    // decoding, so the literal has to contain the reference, not the
    // character it decodes to.

    #[test]
    fn char_ref_named() {
        let tokens = tokenize("&amp;&lt;&gt;&quot;");
        assert_eq!(tokens, vec![text("&<>\"")]);
    }

    #[test]
    fn char_ref_numeric_decimal() {
        let tokens = tokenize("&#65;");
        assert_eq!(tokens, vec![text("A")]);
    }

    #[test]
    fn char_ref_numeric_hex() {
        let tokens = tokenize("&#x41;");
        assert_eq!(tokens, vec![text("A")]);
    }

    #[test]
    fn char_ref_numeric_hex_uppercase() {
        let tokens = tokenize("&#X41;");
        assert_eq!(tokens, vec![text("A")]);
    }

    #[test]
    fn char_ref_in_attribute() {
        // `&amp;` inside an attribute value decodes to a plain '&'.
        let tokens = tokenize(r#"<a href="?a=1&amp;b=2">"#);
        assert_eq!(tokens, vec![start_tag("a", &[("href", "?a=1&b=2")], false)]);
    }

    #[test]
    fn windows_1252_numeric_refs() {
        // &#128; (0x80) should map to the Euro sign.
        let tokens = tokenize("&#128;");
        assert_eq!(tokens, vec![text("\u{20AC}")]);
    }
}