//! HTML5 tokenizer state machine per WHATWG spec ยง13.2.5. use crate::entities; use crate::Token; #[derive(Debug, Clone, Copy, PartialEq)] enum State { Data, TagOpen, EndTagOpen, TagName, BeforeAttributeName, AttributeName, AfterAttributeName, BeforeAttributeValue, AttributeValueDoubleQuoted, AttributeValueSingleQuoted, AttributeValueUnquoted, AfterAttributeValueQuoted, SelfClosingStartTag, BogusComment, MarkupDeclarationOpen, CommentStart, CommentStartDash, Comment, CommentLessThanSign, CommentLessThanSignBang, CommentLessThanSignBangDash, CommentLessThanSignBangDashDash, CommentEndDash, CommentEnd, CommentEndBang, Doctype, BeforeDoctypeName, DoctypeName, AfterDoctypeName, AfterDoctypePublicKeyword, BeforeDoctypePublicIdentifier, DoctypePublicIdentifierDoubleQuoted, DoctypePublicIdentifierSingleQuoted, AfterDoctypePublicIdentifier, BetweenDoctypePublicAndSystemIdentifiers, AfterDoctypeSystemKeyword, BeforeDoctypeSystemIdentifier, DoctypeSystemIdentifierDoubleQuoted, DoctypeSystemIdentifierSingleQuoted, AfterDoctypeSystemIdentifier, BogusDoctype, CharacterReference, NumericCharacterReference, HexCharacterReferenceStart, DecCharacterReferenceStart, HexCharacterReference, DecCharacterReference, NumericCharacterReferenceEnd, NamedCharacterReference, } /// HTML5 tokenizer state machine. pub struct Tokenizer { input: Vec, pos: usize, state: State, return_state: State, pending: Vec, /// Current tag being built. tag_name: String, tag_self_closing: bool, tag_is_end: bool, tag_attributes: Vec<(String, String)>, current_attr_name: String, current_attr_value: String, /// Current comment or doctype being built. comment_data: String, doctype_name: Option, doctype_public_id: Option, doctype_system_id: Option, doctype_force_quirks: bool, /// Character reference accumulator. char_ref_code: u32, temp_buf: String, } impl Tokenizer { /// Create a new tokenizer for the given input. 
pub fn new(input: &str) -> Self { Tokenizer { input: input.chars().collect(), pos: 0, state: State::Data, return_state: State::Data, pending: Vec::new(), tag_name: String::new(), tag_self_closing: false, tag_is_end: false, tag_attributes: Vec::new(), current_attr_name: String::new(), current_attr_value: String::new(), comment_data: String::new(), doctype_name: None, doctype_public_id: None, doctype_system_id: None, doctype_force_quirks: false, char_ref_code: 0, temp_buf: String::new(), } } /// Return the next token from the input. pub fn next_token(&mut self) -> Token { loop { if let Some(token) = self.pending.pop() { return token; } self.step(); } } fn next_char(&mut self) -> Option { if self.pos < self.input.len() { let ch = self.input[self.pos]; self.pos += 1; Some(ch) } else { None } } fn peek_char(&self) -> Option { if self.pos < self.input.len() { Some(self.input[self.pos]) } else { None } } fn reconsume(&mut self) { if self.pos > 0 { self.pos -= 1; } } fn emit(&mut self, token: Token) { // We use a Vec as a stack, so push to front by inserting at 0. self.pending.insert(0, token); } fn emit_current_tag(&mut self) { // Finalize the current attribute if there is one. 
self.finish_attribute(); if self.tag_is_end { self.emit(Token::EndTag { name: self.tag_name.clone(), }); } else { self.emit(Token::StartTag { name: self.tag_name.clone(), attributes: self.tag_attributes.clone(), self_closing: self.tag_self_closing, }); } } fn emit_current_comment(&mut self) { self.emit(Token::Comment(self.comment_data.clone())); } fn emit_current_doctype(&mut self) { self.emit(Token::Doctype { name: self.doctype_name.clone(), public_id: self.doctype_public_id.clone(), system_id: self.doctype_system_id.clone(), force_quirks: self.doctype_force_quirks, }); } fn emit_char(&mut self, ch: char) { self.emit(Token::Character(ch.to_string())); } fn emit_eof(&mut self) { self.emit(Token::Eof); } fn start_new_tag(&mut self, is_end: bool) { self.tag_name.clear(); self.tag_self_closing = false; self.tag_is_end = is_end; self.tag_attributes.clear(); self.current_attr_name.clear(); self.current_attr_value.clear(); } fn start_new_attribute(&mut self) { self.finish_attribute(); self.current_attr_name.clear(); self.current_attr_value.clear(); } fn finish_attribute(&mut self) { if !self.current_attr_name.is_empty() { // Per spec: if duplicate attribute name, ignore the later one. let name = self.current_attr_name.clone(); if !self.tag_attributes.iter().any(|(n, _)| n == &name) { self.tag_attributes .push((name, self.current_attr_value.clone())); } self.current_attr_name.clear(); self.current_attr_value.clear(); } } /// Flush character reference code to the return state. 
fn flush_char_ref(&mut self, s: &str) { match self.return_state { State::AttributeValueDoubleQuoted | State::AttributeValueSingleQuoted | State::AttributeValueUnquoted => { self.current_attr_value.push_str(s); } _ => { for ch in s.chars() { self.emit_char(ch); } } } } fn step(&mut self) { match self.state { State::Data => self.state_data(), State::TagOpen => self.state_tag_open(), State::EndTagOpen => self.state_end_tag_open(), State::TagName => self.state_tag_name(), State::BeforeAttributeName => self.state_before_attribute_name(), State::AttributeName => self.state_attribute_name(), State::AfterAttributeName => self.state_after_attribute_name(), State::BeforeAttributeValue => self.state_before_attribute_value(), State::AttributeValueDoubleQuoted => self.state_attribute_value_double_quoted(), State::AttributeValueSingleQuoted => self.state_attribute_value_single_quoted(), State::AttributeValueUnquoted => self.state_attribute_value_unquoted(), State::AfterAttributeValueQuoted => self.state_after_attribute_value_quoted(), State::SelfClosingStartTag => self.state_self_closing_start_tag(), State::BogusComment => self.state_bogus_comment(), State::MarkupDeclarationOpen => self.state_markup_declaration_open(), State::CommentStart => self.state_comment_start(), State::CommentStartDash => self.state_comment_start_dash(), State::Comment => self.state_comment(), State::CommentLessThanSign => self.state_comment_less_than_sign(), State::CommentLessThanSignBang => self.state_comment_less_than_sign_bang(), State::CommentLessThanSignBangDash => self.state_comment_less_than_sign_bang_dash(), State::CommentLessThanSignBangDashDash => { self.state_comment_less_than_sign_bang_dash_dash() } State::CommentEndDash => self.state_comment_end_dash(), State::CommentEnd => self.state_comment_end(), State::CommentEndBang => self.state_comment_end_bang(), State::Doctype => self.state_doctype(), State::BeforeDoctypeName => self.state_before_doctype_name(), State::DoctypeName => 
self.state_doctype_name(), State::AfterDoctypeName => self.state_after_doctype_name(), State::AfterDoctypePublicKeyword => self.state_after_doctype_public_keyword(), State::BeforeDoctypePublicIdentifier => self.state_before_doctype_public_identifier(), State::DoctypePublicIdentifierDoubleQuoted => { self.state_doctype_public_identifier_double_quoted() } State::DoctypePublicIdentifierSingleQuoted => { self.state_doctype_public_identifier_single_quoted() } State::AfterDoctypePublicIdentifier => self.state_after_doctype_public_identifier(), State::BetweenDoctypePublicAndSystemIdentifiers => { self.state_between_doctype_public_and_system_identifiers() } State::AfterDoctypeSystemKeyword => self.state_after_doctype_system_keyword(), State::BeforeDoctypeSystemIdentifier => self.state_before_doctype_system_identifier(), State::DoctypeSystemIdentifierDoubleQuoted => { self.state_doctype_system_identifier_double_quoted() } State::DoctypeSystemIdentifierSingleQuoted => { self.state_doctype_system_identifier_single_quoted() } State::AfterDoctypeSystemIdentifier => self.state_after_doctype_system_identifier(), State::BogusDoctype => self.state_bogus_doctype(), State::CharacterReference => self.state_character_reference(), State::NumericCharacterReference => self.state_numeric_character_reference(), State::HexCharacterReferenceStart => self.state_hex_character_reference_start(), State::DecCharacterReferenceStart => self.state_dec_character_reference_start(), State::HexCharacterReference => self.state_hex_character_reference(), State::DecCharacterReference => self.state_dec_character_reference(), State::NumericCharacterReferenceEnd => self.state_numeric_character_reference_end(), State::NamedCharacterReference => self.state_named_character_reference(), } } // --- State implementations --- fn state_data(&mut self) { match self.next_char() { Some('&') => { self.return_state = State::Data; self.state = State::CharacterReference; } Some('<') => { self.state = State::TagOpen; } 
Some('\0') => { // Parse error. Emit replacement character. self.emit_char('\u{FFFD}'); } None => { self.emit_eof(); } Some(c) => { self.emit_char(c); } } } fn state_tag_open(&mut self) { match self.next_char() { Some('!') => { self.state = State::MarkupDeclarationOpen; } Some('/') => { self.state = State::EndTagOpen; } Some(c) if c.is_ascii_alphabetic() => { self.start_new_tag(false); self.reconsume(); self.state = State::TagName; } Some('?') => { // Parse error. Create a comment token. self.comment_data.clear(); self.reconsume(); self.state = State::BogusComment; } None => { // Parse error. Emit '<' and EOF. self.emit_char('<'); self.emit_eof(); } Some(_) => { // Parse error. Emit '<' and reconsume. self.emit_char('<'); self.reconsume(); self.state = State::Data; } } } fn state_end_tag_open(&mut self) { match self.next_char() { Some(c) if c.is_ascii_alphabetic() => { self.start_new_tag(true); self.reconsume(); self.state = State::TagName; } Some('>') => { // Parse error. Switch to data state. self.state = State::Data; } None => { self.emit_char('<'); self.emit_char('/'); self.emit_eof(); } Some(_) => { // Parse error. Create a comment. self.comment_data.clear(); self.reconsume(); self.state = State::BogusComment; } } } fn state_tag_name(&mut self) { match self.next_char() { Some('\t') | Some('\n') | Some('\x0C') | Some(' ') => { self.state = State::BeforeAttributeName; } Some('/') => { self.state = State::SelfClosingStartTag; } Some('>') => { self.state = State::Data; self.emit_current_tag(); } Some(c) if c.is_ascii_uppercase() => { self.tag_name.push(c.to_ascii_lowercase()); } Some('\0') => { self.tag_name.push('\u{FFFD}'); } None => { self.emit_eof(); } Some(c) => { self.tag_name.push(c); } } } fn state_before_attribute_name(&mut self) { match self.next_char() { Some('\t') | Some('\n') | Some('\x0C') | Some(' ') => { // Ignore whitespace. 
} Some('/') | Some('>') => { self.reconsume(); self.state = State::AfterAttributeName; } None => { // EOF: go to AfterAttributeName without reconsuming. self.state = State::AfterAttributeName; } Some('=') => { // Parse error. Start a new attribute with '=' as name. self.start_new_attribute(); self.current_attr_name.push('='); self.state = State::AttributeName; } Some(_) => { self.start_new_attribute(); self.reconsume(); self.state = State::AttributeName; } } } fn state_attribute_name(&mut self) { match self.next_char() { Some('\t') | Some('\n') | Some('\x0C') | Some(' ') | Some('/') | Some('>') => { self.reconsume(); self.state = State::AfterAttributeName; } None => { self.state = State::AfterAttributeName; } Some('=') => { self.state = State::BeforeAttributeValue; } Some(c) if c.is_ascii_uppercase() => { self.current_attr_name.push(c.to_ascii_lowercase()); } Some('\0') => { self.current_attr_name.push('\u{FFFD}'); } Some(c) => { self.current_attr_name.push(c); } } } fn state_after_attribute_name(&mut self) { match self.next_char() { Some('\t') | Some('\n') | Some('\x0C') | Some(' ') => { // Ignore. } Some('/') => { self.state = State::SelfClosingStartTag; } Some('=') => { self.state = State::BeforeAttributeValue; } Some('>') => { self.state = State::Data; self.emit_current_tag(); } None => { self.emit_eof(); } Some(_) => { self.start_new_attribute(); self.reconsume(); self.state = State::AttributeName; } } } fn state_before_attribute_value(&mut self) { match self.next_char() { Some('\t') | Some('\n') | Some('\x0C') | Some(' ') => { // Ignore. } Some('"') => { self.state = State::AttributeValueDoubleQuoted; } Some('\'') => { self.state = State::AttributeValueSingleQuoted; } Some('>') => { // Parse error. Emit tag with missing value. 
self.state = State::Data; self.emit_current_tag(); } _ => { self.reconsume(); self.state = State::AttributeValueUnquoted; } } } fn state_attribute_value_double_quoted(&mut self) { match self.next_char() { Some('"') => { self.state = State::AfterAttributeValueQuoted; } Some('&') => { self.return_state = State::AttributeValueDoubleQuoted; self.state = State::CharacterReference; } Some('\0') => { self.current_attr_value.push('\u{FFFD}'); } None => { self.emit_eof(); } Some(c) => { self.current_attr_value.push(c); } } } fn state_attribute_value_single_quoted(&mut self) { match self.next_char() { Some('\'') => { self.state = State::AfterAttributeValueQuoted; } Some('&') => { self.return_state = State::AttributeValueSingleQuoted; self.state = State::CharacterReference; } Some('\0') => { self.current_attr_value.push('\u{FFFD}'); } None => { self.emit_eof(); } Some(c) => { self.current_attr_value.push(c); } } } fn state_attribute_value_unquoted(&mut self) { match self.next_char() { Some('\t') | Some('\n') | Some('\x0C') | Some(' ') => { self.state = State::BeforeAttributeName; } Some('&') => { self.return_state = State::AttributeValueUnquoted; self.state = State::CharacterReference; } Some('>') => { self.state = State::Data; self.emit_current_tag(); } Some('\0') => { self.current_attr_value.push('\u{FFFD}'); } None => { self.emit_eof(); } Some(c) => { self.current_attr_value.push(c); } } } fn state_after_attribute_value_quoted(&mut self) { match self.next_char() { Some('\t') | Some('\n') | Some('\x0C') | Some(' ') => { self.state = State::BeforeAttributeName; } Some('/') => { self.state = State::SelfClosingStartTag; } Some('>') => { self.state = State::Data; self.emit_current_tag(); } None => { self.emit_eof(); } Some(_) => { // Parse error. Reconsume in before attribute name. 
self.reconsume(); self.state = State::BeforeAttributeName; } } } fn state_self_closing_start_tag(&mut self) { match self.next_char() { Some('>') => { self.tag_self_closing = true; self.state = State::Data; self.emit_current_tag(); } None => { self.emit_eof(); } Some(_) => { // Parse error. Reconsume in before attribute name. self.reconsume(); self.state = State::BeforeAttributeName; } } } fn state_bogus_comment(&mut self) { match self.next_char() { Some('>') => { self.state = State::Data; self.emit_current_comment(); } None => { self.emit_current_comment(); self.emit_eof(); } Some('\0') => { self.comment_data.push('\u{FFFD}'); } Some(c) => { self.comment_data.push(c); } } } fn state_markup_declaration_open(&mut self) { // Check for `--`, `DOCTYPE`, or `[CDATA[` if self.starts_with("--") { self.pos += 2; self.comment_data.clear(); self.state = State::CommentStart; } else if self.starts_with_case_insensitive("DOCTYPE") { self.pos += 7; self.state = State::Doctype; } else if self.starts_with("[CDATA[") { // Per spec, if not in foreign content, parse error โ†’ bogus comment. self.pos += 7; self.comment_data.clear(); self.comment_data.push_str("[CDATA["); self.state = State::BogusComment; } else { // Parse error. Bogus comment. self.comment_data.clear(); self.state = State::BogusComment; } } fn state_comment_start(&mut self) { match self.next_char() { Some('-') => { self.state = State::CommentStartDash; } Some('>') => { // Parse error. Emit empty comment. self.state = State::Data; self.emit_current_comment(); } _ => { self.reconsume(); self.state = State::Comment; } } } fn state_comment_start_dash(&mut self) { match self.next_char() { Some('-') => { self.state = State::CommentEnd; } Some('>') => { // Parse error. 
self.state = State::Data; self.emit_current_comment(); } None => { self.emit_current_comment(); self.emit_eof(); } Some(_) => { self.comment_data.push('-'); self.reconsume(); self.state = State::Comment; } } } fn state_comment(&mut self) { match self.next_char() { Some('<') => { self.comment_data.push('<'); self.state = State::CommentLessThanSign; } Some('-') => { self.state = State::CommentEndDash; } Some('\0') => { self.comment_data.push('\u{FFFD}'); } None => { self.emit_current_comment(); self.emit_eof(); } Some(c) => { self.comment_data.push(c); } } } fn state_comment_less_than_sign(&mut self) { match self.next_char() { Some('!') => { self.comment_data.push('!'); self.state = State::CommentLessThanSignBang; } Some('<') => { self.comment_data.push('<'); } None => { // Don't reconsume on EOF โ€” pos didn't advance, so reconsuming // would back up to '<' and loop forever between here and Comment. self.state = State::Comment; } Some(_) => { self.reconsume(); self.state = State::Comment; } } } fn state_comment_less_than_sign_bang(&mut self) { match self.next_char() { Some('-') => { self.state = State::CommentLessThanSignBangDash; } _ => { self.reconsume(); self.state = State::Comment; } } } fn state_comment_less_than_sign_bang_dash(&mut self) { match self.next_char() { Some('-') => { self.state = State::CommentLessThanSignBangDashDash; } _ => { self.reconsume(); self.state = State::CommentEndDash; } } } fn state_comment_less_than_sign_bang_dash_dash(&mut self) { match self.next_char() { Some('>') | None => { self.reconsume(); self.state = State::CommentEnd; } Some(_) => { // Parse error. 
self.reconsume(); self.state = State::CommentEnd; } } } fn state_comment_end_dash(&mut self) { match self.next_char() { Some('-') => { self.state = State::CommentEnd; } None => { self.emit_current_comment(); self.emit_eof(); } Some(_) => { self.comment_data.push('-'); self.reconsume(); self.state = State::Comment; } } } fn state_comment_end(&mut self) { match self.next_char() { Some('>') => { self.state = State::Data; self.emit_current_comment(); } Some('!') => { self.state = State::CommentEndBang; } Some('-') => { self.comment_data.push('-'); } None => { self.emit_current_comment(); self.emit_eof(); } Some(_) => { self.comment_data.push('-'); self.comment_data.push('-'); self.reconsume(); self.state = State::Comment; } } } fn state_comment_end_bang(&mut self) { match self.next_char() { Some('-') => { self.comment_data.push('-'); self.comment_data.push('-'); self.comment_data.push('!'); self.state = State::CommentEndDash; } Some('>') => { self.state = State::Data; self.emit_current_comment(); } None => { self.emit_current_comment(); self.emit_eof(); } Some(_) => { self.comment_data.push('-'); self.comment_data.push('-'); self.comment_data.push('!'); self.reconsume(); self.state = State::Comment; } } } fn state_doctype(&mut self) { match self.next_char() { Some('\t') | Some('\n') | Some('\x0C') | Some(' ') => { self.state = State::BeforeDoctypeName; } Some('>') => { self.reconsume(); self.state = State::BeforeDoctypeName; } None => { self.doctype_name = None; self.doctype_public_id = None; self.doctype_system_id = None; self.doctype_force_quirks = true; self.emit_current_doctype(); self.emit_eof(); } Some(_) => { // Parse error. Missing whitespace before DOCTYPE name. self.reconsume(); self.state = State::BeforeDoctypeName; } } } fn state_before_doctype_name(&mut self) { match self.next_char() { Some('\t') | Some('\n') | Some('\x0C') | Some(' ') => { // Ignore whitespace. 
} Some(c) if c.is_ascii_uppercase() => { self.doctype_name = Some(c.to_ascii_lowercase().to_string()); self.doctype_public_id = None; self.doctype_system_id = None; self.doctype_force_quirks = false; self.state = State::DoctypeName; } Some('\0') => { self.doctype_name = Some("\u{FFFD}".to_string()); self.doctype_public_id = None; self.doctype_system_id = None; self.doctype_force_quirks = false; self.state = State::DoctypeName; } Some('>') => { // Parse error. Force quirks. self.doctype_name = None; self.doctype_public_id = None; self.doctype_system_id = None; self.doctype_force_quirks = true; self.state = State::Data; self.emit_current_doctype(); } None => { self.doctype_name = None; self.doctype_public_id = None; self.doctype_system_id = None; self.doctype_force_quirks = true; self.emit_current_doctype(); self.emit_eof(); } Some(c) => { self.doctype_name = Some(c.to_string()); self.doctype_public_id = None; self.doctype_system_id = None; self.doctype_force_quirks = false; self.state = State::DoctypeName; } } } fn state_doctype_name(&mut self) { match self.next_char() { Some('\t') | Some('\n') | Some('\x0C') | Some(' ') => { self.state = State::AfterDoctypeName; } Some('>') => { self.state = State::Data; self.emit_current_doctype(); } Some(c) if c.is_ascii_uppercase() => { if let Some(ref mut name) = self.doctype_name { name.push(c.to_ascii_lowercase()); } } Some('\0') => { if let Some(ref mut name) = self.doctype_name { name.push('\u{FFFD}'); } } None => { self.doctype_force_quirks = true; self.emit_current_doctype(); self.emit_eof(); } Some(c) => { if let Some(ref mut name) = self.doctype_name { name.push(c); } } } } fn state_after_doctype_name(&mut self) { match self.next_char() { Some('\t') | Some('\n') | Some('\x0C') | Some(' ') => { // Ignore. } Some('>') => { self.state = State::Data; self.emit_current_doctype(); } None => { self.doctype_force_quirks = true; self.emit_current_doctype(); self.emit_eof(); } Some(_) => { // Check for PUBLIC or SYSTEM keyword. 
self.reconsume(); if self.starts_with_case_insensitive("PUBLIC") { self.pos += 6; self.state = State::AfterDoctypePublicKeyword; } else if self.starts_with_case_insensitive("SYSTEM") { self.pos += 6; self.state = State::AfterDoctypeSystemKeyword; } else { // Parse error. self.doctype_force_quirks = true; self.next_char(); // consume the reconsumed char self.state = State::BogusDoctype; } } } } fn state_after_doctype_public_keyword(&mut self) { match self.next_char() { Some('\t') | Some('\n') | Some('\x0C') | Some(' ') => { self.state = State::BeforeDoctypePublicIdentifier; } Some('"') => { // Parse error. Missing whitespace. self.doctype_public_id = Some(String::new()); self.state = State::DoctypePublicIdentifierDoubleQuoted; } Some('\'') => { self.doctype_public_id = Some(String::new()); self.state = State::DoctypePublicIdentifierSingleQuoted; } Some('>') => { self.doctype_force_quirks = true; self.state = State::Data; self.emit_current_doctype(); } None => { self.doctype_force_quirks = true; self.emit_current_doctype(); self.emit_eof(); } Some(_) => { self.doctype_force_quirks = true; self.reconsume(); self.state = State::BogusDoctype; } } } fn state_before_doctype_public_identifier(&mut self) { match self.next_char() { Some('\t') | Some('\n') | Some('\x0C') | Some(' ') => { // Ignore. 
} Some('"') => { self.doctype_public_id = Some(String::new()); self.state = State::DoctypePublicIdentifierDoubleQuoted; } Some('\'') => { self.doctype_public_id = Some(String::new()); self.state = State::DoctypePublicIdentifierSingleQuoted; } Some('>') => { self.doctype_force_quirks = true; self.state = State::Data; self.emit_current_doctype(); } None => { self.doctype_force_quirks = true; self.emit_current_doctype(); self.emit_eof(); } Some(_) => { self.doctype_force_quirks = true; self.reconsume(); self.state = State::BogusDoctype; } } } fn state_doctype_public_identifier_double_quoted(&mut self) { match self.next_char() { Some('"') => { self.state = State::AfterDoctypePublicIdentifier; } Some('\0') => { if let Some(ref mut id) = self.doctype_public_id { id.push('\u{FFFD}'); } } Some('>') => { self.doctype_force_quirks = true; self.state = State::Data; self.emit_current_doctype(); } None => { self.doctype_force_quirks = true; self.emit_current_doctype(); self.emit_eof(); } Some(c) => { if let Some(ref mut id) = self.doctype_public_id { id.push(c); } } } } fn state_doctype_public_identifier_single_quoted(&mut self) { match self.next_char() { Some('\'') => { self.state = State::AfterDoctypePublicIdentifier; } Some('\0') => { if let Some(ref mut id) = self.doctype_public_id { id.push('\u{FFFD}'); } } Some('>') => { self.doctype_force_quirks = true; self.state = State::Data; self.emit_current_doctype(); } None => { self.doctype_force_quirks = true; self.emit_current_doctype(); self.emit_eof(); } Some(c) => { if let Some(ref mut id) = self.doctype_public_id { id.push(c); } } } } fn state_after_doctype_public_identifier(&mut self) { match self.next_char() { Some('\t') | Some('\n') | Some('\x0C') | Some(' ') => { self.state = State::BetweenDoctypePublicAndSystemIdentifiers; } Some('>') => { self.state = State::Data; self.emit_current_doctype(); } Some('"') => { // Parse error. Missing whitespace. 
self.doctype_system_id = Some(String::new()); self.state = State::DoctypeSystemIdentifierDoubleQuoted; } Some('\'') => { self.doctype_system_id = Some(String::new()); self.state = State::DoctypeSystemIdentifierSingleQuoted; } None => { self.doctype_force_quirks = true; self.emit_current_doctype(); self.emit_eof(); } Some(_) => { self.doctype_force_quirks = true; self.reconsume(); self.state = State::BogusDoctype; } } } fn state_between_doctype_public_and_system_identifiers(&mut self) { match self.next_char() { Some('\t') | Some('\n') | Some('\x0C') | Some(' ') => { // Ignore. } Some('>') => { self.state = State::Data; self.emit_current_doctype(); } Some('"') => { self.doctype_system_id = Some(String::new()); self.state = State::DoctypeSystemIdentifierDoubleQuoted; } Some('\'') => { self.doctype_system_id = Some(String::new()); self.state = State::DoctypeSystemIdentifierSingleQuoted; } None => { self.doctype_force_quirks = true; self.emit_current_doctype(); self.emit_eof(); } Some(_) => { self.doctype_force_quirks = true; self.reconsume(); self.state = State::BogusDoctype; } } } fn state_after_doctype_system_keyword(&mut self) { match self.next_char() { Some('\t') | Some('\n') | Some('\x0C') | Some(' ') => { self.state = State::BeforeDoctypeSystemIdentifier; } Some('"') => { self.doctype_system_id = Some(String::new()); self.state = State::DoctypeSystemIdentifierDoubleQuoted; } Some('\'') => { self.doctype_system_id = Some(String::new()); self.state = State::DoctypeSystemIdentifierSingleQuoted; } Some('>') => { self.doctype_force_quirks = true; self.state = State::Data; self.emit_current_doctype(); } None => { self.doctype_force_quirks = true; self.emit_current_doctype(); self.emit_eof(); } Some(_) => { self.doctype_force_quirks = true; self.reconsume(); self.state = State::BogusDoctype; } } } fn state_before_doctype_system_identifier(&mut self) { match self.next_char() { Some('\t') | Some('\n') | Some('\x0C') | Some(' ') => { // Ignore. 
} Some('"') => { self.doctype_system_id = Some(String::new()); self.state = State::DoctypeSystemIdentifierDoubleQuoted; } Some('\'') => { self.doctype_system_id = Some(String::new()); self.state = State::DoctypeSystemIdentifierSingleQuoted; } Some('>') => { self.doctype_force_quirks = true; self.state = State::Data; self.emit_current_doctype(); } None => { self.doctype_force_quirks = true; self.emit_current_doctype(); self.emit_eof(); } Some(_) => { self.doctype_force_quirks = true; self.reconsume(); self.state = State::BogusDoctype; } } } fn state_doctype_system_identifier_double_quoted(&mut self) { match self.next_char() { Some('"') => { self.state = State::AfterDoctypeSystemIdentifier; } Some('\0') => { if let Some(ref mut id) = self.doctype_system_id { id.push('\u{FFFD}'); } } Some('>') => { self.doctype_force_quirks = true; self.state = State::Data; self.emit_current_doctype(); } None => { self.doctype_force_quirks = true; self.emit_current_doctype(); self.emit_eof(); } Some(c) => { if let Some(ref mut id) = self.doctype_system_id { id.push(c); } } } } fn state_doctype_system_identifier_single_quoted(&mut self) { match self.next_char() { Some('\'') => { self.state = State::AfterDoctypeSystemIdentifier; } Some('\0') => { if let Some(ref mut id) = self.doctype_system_id { id.push('\u{FFFD}'); } } Some('>') => { self.doctype_force_quirks = true; self.state = State::Data; self.emit_current_doctype(); } None => { self.doctype_force_quirks = true; self.emit_current_doctype(); self.emit_eof(); } Some(c) => { if let Some(ref mut id) = self.doctype_system_id { id.push(c); } } } } fn state_after_doctype_system_identifier(&mut self) { match self.next_char() { Some('\t') | Some('\n') | Some('\x0C') | Some(' ') => { // Ignore. } Some('>') => { self.state = State::Data; self.emit_current_doctype(); } None => { self.doctype_force_quirks = true; self.emit_current_doctype(); self.emit_eof(); } Some(_) => { // Parse error, but do NOT set force_quirks. 
self.reconsume(); self.state = State::BogusDoctype; } } } fn state_bogus_doctype(&mut self) { match self.next_char() { Some('>') => { self.state = State::Data; self.emit_current_doctype(); } Some('\0') => { // Parse error. Ignore. } None => { self.emit_current_doctype(); self.emit_eof(); } Some(_) => { // Ignore. } } } // --- Character reference states --- fn state_character_reference(&mut self) { self.temp_buf.clear(); self.temp_buf.push('&'); match self.peek_char() { Some(c) if c.is_ascii_alphanumeric() => { self.state = State::NamedCharacterReference; } Some('#') => { self.temp_buf.push('#'); self.next_char(); self.state = State::NumericCharacterReference; } _ => { // Not a character reference. Flush '&' to return state. self.flush_char_ref("&"); self.state = self.return_state; } } } fn state_named_character_reference(&mut self) { // Collect alphanumeric characters to form the entity name. // Per spec, entity names can also contain digits after the first char. let mut name = String::new(); let start_pos = self.pos; while let Some(c) = self.peek_char() { if c.is_ascii_alphanumeric() { name.push(c); self.pos += 1; } else { break; } } // Try to find a match, trying longest match first. // First check if the full name + semicolon matches. let has_trailing_semi = self.peek_char() == Some(';'); let mut matched_value: Option<&str> = None; let mut matched_len = 0; // Try the full name first (with semicolon if present). if has_trailing_semi { if let Some(val) = entities::lookup_entity(&name) { matched_value = Some(val); matched_len = name.len(); } } // If no match with full name, try progressively shorter prefixes. if matched_value.is_none() { for i in (1..=name.len()).rev() { let candidate = &name[..i]; if let Some(val) = entities::lookup_entity(candidate) { // Without semicolon, only legacy entities are recognized. 
if entities::is_legacy_entity(candidate) { matched_value = Some(val); matched_len = i; break; } } } } // Also try the full name without semicolon for legacy entities. if matched_value.is_none() && !has_trailing_semi { if let Some(val) = entities::lookup_entity(&name) { if entities::is_legacy_entity(&name) { matched_value = Some(val); matched_len = name.len(); } } } if let Some(value) = matched_value { // Rewind to just after the matched portion. self.pos = start_pos + matched_len; // Check for semicolon after the matched portion. let has_semi = self.peek_char() == Some(';'); if has_semi { self.pos += 1; } // Per spec: if consumed as part of an attribute and the character // after the match is `=` or alphanumeric, and no semicolon, // flush the original text instead. let in_attribute = matches!( self.return_state, State::AttributeValueDoubleQuoted | State::AttributeValueSingleQuoted | State::AttributeValueUnquoted ); if !has_semi && in_attribute { if let Some(next) = self.peek_char() { if next == '=' || next.is_ascii_alphanumeric() { // Not a reference. Flush original text. let mut original = "&".to_string(); original.push_str(&name[..matched_len]); self.flush_char_ref(&original); self.state = self.return_state; return; } } } self.flush_char_ref(value); self.state = self.return_state; } else { // No match. Rewind and flush '&' + all collected chars. 
self.pos = start_pos;
            self.flush_char_ref("&");
            for _ in 0..name.len() {
                let c = self.next_char().unwrap();
                let s = c.to_string();
                self.flush_char_ref(&s);
            }
            self.state = self.return_state;
        }
    }

    /// Numeric character reference state.
    ///
    /// Resets the code accumulator, then looks at the next character without
    /// committing to it: an `x`/`X` marker is appended to `temp_buf`, consumed,
    /// and selects the hexadecimal start state; anything else is left in place
    /// for the decimal start state.
    fn state_numeric_character_reference(&mut self) {
        self.char_ref_code = 0;
        match self.peek_char() {
            Some(marker) if marker == 'x' || marker == 'X' => {
                self.temp_buf.push(marker);
                self.next_char();
                self.state = State::HexCharacterReferenceStart;
            }
            _ => {
                self.state = State::DecCharacterReferenceStart;
            }
        }
    }

    /// Hexadecimal character reference start state.
    ///
    /// Only peeks: a hex digit hands over to the hexadecimal accumulation
    /// state, which consumes it. Anything else (including end of input) is a
    /// parse error; `temp_buf` is passed to `flush_char_ref` unchanged and the
    /// tokenizer goes back to `return_state`.
    fn state_hex_character_reference_start(&mut self) {
        match self.peek_char() {
            Some(c) if c.is_ascii_hexdigit() => {
                self.state = State::HexCharacterReference;
            }
            _ => {
                // Parse error. Flush temp_buf.
                let buf = self.temp_buf.clone();
                self.flush_char_ref(&buf);
                self.state = self.return_state;
            }
        }
    }

    /// Decimal character reference start state.
    ///
    /// Only peeks: a decimal digit hands over to the decimal accumulation
    /// state, which consumes it. Anything else (including end of input) passes
    /// `temp_buf` to `flush_char_ref` unchanged and goes back to `return_state`.
    fn state_dec_character_reference_start(&mut self) {
        match self.peek_char() {
            Some(c) if c.is_ascii_digit() => {
                self.state = State::DecCharacterReference;
            }
            _ => {
                let buf = self.temp_buf.clone();
                self.flush_char_ref(&buf);
                self.state = self.return_state;
            }
        }
    }

    /// Folds one more digit into `char_ref_code` using the given radix.
    ///
    /// The running value is clamped to 0x110000 (one past the largest code
    /// point), so an arbitrarily long run of digits stays recognisably out of
    /// range without ever overflowing the `u32`.
    fn accumulate_char_ref_digit(&mut self, radix: u32, digit: u32) {
        let code = self
            .char_ref_code
            .saturating_mul(radix)
            .saturating_add(digit);
        self.char_ref_code = code.min(0x110000);
    }

    /// Hexadecimal character reference state.
    ///
    /// Consumes one character per call: hex digits are accumulated, `;` ends
    /// the reference, and any other character ends it after being handed back
    /// to the input. End of input also ends the reference.
    fn state_hex_character_reference(&mut self) {
        match self.next_char() {
            Some(';') => {
                self.state = State::NumericCharacterReferenceEnd;
            }
            Some(c) => match c.to_digit(16) {
                Some(digit) => self.accumulate_char_ref_digit(16, digit),
                None => {
                    // Parse error: missing semicolon. The character belongs to
                    // whatever follows the reference, so give it back.
                    self.reconsume();
                    self.state = State::NumericCharacterReferenceEnd;
                }
            },
            None => {
                // EOF: missing semicolon parse error. Nothing was consumed, so
                // don't reconsume.
                self.state = State::NumericCharacterReferenceEnd;
            }
        }
    }

    /// Decimal character reference state.
    ///
    /// Consumes one character per call: decimal digits are accumulated, `;`
    /// ends the reference, and any other character ends it after being handed
    /// back to the input. End of input also ends the reference.
    fn state_dec_character_reference(&mut self) {
        match self.next_char() {
            Some(';') => {
                self.state = State::NumericCharacterReferenceEnd;
            }
            Some(c) => match c.to_digit(10) {
                Some(digit) => self.accumulate_char_ref_digit(10, digit),
                None => {
                    // Parse error: missing semicolon. Give the character back.
                    self.reconsume();
                    self.state = State::NumericCharacterReferenceEnd;
                }
            },
            None => {
                // EOF: missing semicolon parse error. Nothing was consumed, so
                // don't reconsume.
                self.state = State::NumericCharacterReferenceEnd;
            }
        }
    }

    /// Numeric character reference end state.
    ///
    /// Turns the accumulated `char_ref_code` into a character, passes it to
    /// `flush_char_ref`, and goes back to `return_state`. Zero, surrogates and
    /// values above 0x10FFFF become U+FFFD; the listed codes in 0x80..=0x9F
    /// are remapped to their Windows-1252 counterparts; every other code maps
    /// to the code point with that value.
    fn state_numeric_character_reference_end(&mut self) {
        let code = self.char_ref_code;
        let ch = match code {
            0 => '\u{FFFD}',
            // Surrogate range.
            0xD800..=0xDFFF => '\u{FFFD}',
            // Out of Unicode range.
            c if c > 0x10FFFF => '\u{FFFD}',
            // Windows-1252 replacement table for 0x80..0x9F.
            0x80 => '\u{20AC}',
            0x82 => '\u{201A}',
            0x83 => '\u{0192}',
            0x84 => '\u{201E}',
            0x85 => '\u{2026}',
            0x86 => '\u{2020}',
            0x87 => '\u{2021}',
            0x88 => '\u{02C6}',
            0x89 => '\u{2030}',
            0x8A => '\u{0160}',
            0x8B => '\u{2039}',
            0x8C => '\u{0152}',
            0x8E => '\u{017D}',
            0x91 => '\u{2018}',
            0x92 => '\u{2019}',
            0x93 => '\u{201C}',
            0x94 => '\u{201D}',
            0x95 => '\u{2022}',
            0x96 => '\u{2013}',
            0x97 => '\u{2014}',
            0x98 => '\u{02DC}',
            0x99 => '\u{2122}',
            0x9A => '\u{0161}',
            0x9B => '\u{203A}',
            0x9C => '\u{0153}',
            0x9E => '\u{017E}',
            0x9F => '\u{0178}',
            c => char::from_u32(c).unwrap_or('\u{FFFD}'),
        };
        let s = ch.to_string();
        self.flush_char_ref(&s);
        self.state = self.return_state;
    }

    // --- Helpers ---

    /// Returns `true` when the unconsumed input (from `pos` onward) begins
    /// with exactly the characters of `s`. Does not consume anything.
    fn starts_with(&self, s: &str) -> bool {
        // Despite the name, this holds chars so it can be compared against
        // `input` element by element.
        let bytes: Vec<char> = s.chars().collect();
        if self.pos + bytes.len() > self.input.len() {
            return false;
        }
        for (i, &c) in bytes.iter().enumerate() {
            if self.input[self.pos + i] != c {
                return false;
            }
        }
        true
    }

    /// Returns `true` when the unconsumed input (from `pos` onward) begins
    /// with the characters of `s`, ignoring ASCII case. Does not consume
    /// anything.
    fn starts_with_case_insensitive(&self, s: &str) -> bool {
        let bytes: Vec<char> = s.chars().collect();
        if self.pos + bytes.len() > self.input.len() {
            return false;
}
        for (i, &c) in bytes.iter().enumerate() {
            if !self.input[self.pos + i].eq_ignore_ascii_case(&c) {
                return false;
            }
        }
        true
    }
}

/// Tokenizer tests, driven end to end through the crate-level `tokenize`
/// function and compared against the exact expected token stream.
#[cfg(test)]
mod tests {
    use super::*;
    use crate::tokenize;

    #[test]
    fn empty_input() {
        let tokens = tokenize("");
        assert!(tokens.is_empty());
    }

    #[test]
    fn plain_text() {
        let tokens = tokenize("Hello, world!");
        assert_eq!(tokens, vec![Token::Character("Hello, world!".to_string())]);
    }

    #[test]
    fn simple_element() {
        let tokens = tokenize("<p>Hello</p>");
        assert_eq!(
            tokens,
            vec![
                Token::StartTag {
                    name: "p".to_string(),
                    attributes: vec![],
                    self_closing: false,
                },
                Token::Character("Hello".to_string()),
                Token::EndTag {
                    name: "p".to_string(),
                },
            ]
        );
    }

    #[test]
    fn self_closing_tag() {
        let tokens = tokenize("<br/>");
        assert_eq!(
            tokens,
            vec![Token::StartTag {
                name: "br".to_string(),
                attributes: vec![],
                self_closing: true,
            }]
        );
    }

    #[test]
    fn self_closing_img() {
        let tokens = tokenize("<img/>");
        assert_eq!(
            tokens,
            vec![Token::StartTag {
                name: "img".to_string(),
                attributes: vec![],
                self_closing: true,
            }]
        );
    }

    #[test]
    fn tag_with_attributes() {
        let tokens = tokenize(r#"<a href="url" class="link">"#);
        assert_eq!(
            tokens,
            vec![Token::StartTag {
                name: "a".to_string(),
                attributes: vec![
                    ("href".to_string(), "url".to_string()),
                    ("class".to_string(), "link".to_string()),
                ],
                self_closing: false,
            }]
        );
    }

    #[test]
    fn tag_with_single_quoted_attributes() {
        let tokens = tokenize("<div id='main'>");
        assert_eq!(
            tokens,
            vec![Token::StartTag {
                name: "div".to_string(),
                attributes: vec![("id".to_string(), "main".to_string())],
                self_closing: false,
            }]
        );
    }

    #[test]
    fn tag_with_unquoted_attribute() {
        let tokens = tokenize("<input type=text>");
        assert_eq!(
            tokens,
            vec![Token::StartTag {
                name: "input".to_string(),
                attributes: vec![("type".to_string(), "text".to_string())],
                self_closing: false,
            }]
        );
    }

    #[test]
    fn comment() {
        let tokens = tokenize("<!-- comment -->");
        assert_eq!(tokens, vec![Token::Comment(" comment ".to_string())]);
    }

    #[test]
    fn empty_comment() {
        let tokens = tokenize("<!---->");
        assert_eq!(tokens, vec![Token::Comment("".to_string())]);
    }

    #[test]
    fn doctype_html() {
        let tokens = tokenize("<!DOCTYPE html>");
        assert_eq!(
            tokens,
            vec![Token::Doctype {
                name: Some("html".to_string()),
                public_id: None,
                system_id: None,
                force_quirks: false,
            }]
        );
    }

    #[test]
    fn doctype_case_insensitive() {
        let tokens = tokenize("<!doctype html>");
        assert_eq!(
            tokens,
            vec![Token::Doctype {
                name: Some("html".to_string()),
                public_id: None,
                system_id: None,
                force_quirks: false,
            }]
        );
    }

    #[test]
    fn char_ref_named() {
        let tokens = tokenize("&amp;&lt;&gt;&quot;");
        assert_eq!(tokens, vec![Token::Character("&<>\"".to_string())]);
    }

    #[test]
    fn char_ref_numeric_decimal() {
        let tokens = tokenize("&#65;");
        assert_eq!(tokens, vec![Token::Character("A".to_string())]);
    }

    #[test]
    fn char_ref_numeric_hex() {
        let tokens = tokenize("&#x41;");
        assert_eq!(tokens, vec![Token::Character("A".to_string())]);
    }

    #[test]
    fn char_ref_numeric_hex_uppercase() {
        let tokens = tokenize("&#X41;");
        assert_eq!(tokens, vec![Token::Character("A".to_string())]);
    }

    #[test]
    fn full_html_document() {
        let tokens = tokenize(
            "<html><head><title>Test</title></head><body><p>Hello</p></body></html>",
        );
        assert_eq!(
            tokens,
            vec![
                Token::StartTag {
                    name: "html".to_string(),
                    attributes: vec![],
                    self_closing: false,
                },
                Token::StartTag {
                    name: "head".to_string(),
                    attributes: vec![],
                    self_closing: false,
                },
                Token::StartTag {
                    name: "title".to_string(),
                    attributes: vec![],
                    self_closing: false,
                },
                Token::Character("Test".to_string()),
                Token::EndTag {
                    name: "title".to_string(),
                },
                Token::EndTag {
                    name: "head".to_string(),
                },
                Token::StartTag {
                    name: "body".to_string(),
                    attributes: vec![],
                    self_closing: false,
                },
                Token::StartTag {
                    name: "p".to_string(),
                    attributes: vec![],
                    self_closing: false,
                },
                Token::Character("Hello".to_string()),
                Token::EndTag {
                    name: "p".to_string(),
                },
                Token::EndTag {
                    name: "body".to_string(),
                },
                Token::EndTag {
                    name: "html".to_string(),
                },
            ]
        );
    }

    #[test]
    fn uppercase_tag_names_lowercased() {
        let tokens = tokenize("<DIV></DIV>");
        assert_eq!(
            tokens,
            vec![
                Token::StartTag {
                    name: "div".to_string(),
                    attributes: vec![],
                    self_closing: false,
                },
                Token::EndTag {
                    name: "div".to_string(),
                },
            ]
        );
    }

    #[test]
    fn uppercase_attribute_names_lowercased() {
        let tokens = tokenize(r#"<div CLASS="x">"#);
        assert_eq!(
            tokens,
            vec![Token::StartTag {
                name: "div".to_string(),
                attributes: vec![("class".to_string(), "x".to_string())],
                self_closing: false,
            }]
        );
    }

    #[test]
    fn duplicate_attributes_first_wins() {
        let tokens = tokenize(r#"<div class="a" class="b">"#);
        assert_eq!(
            tokens,
            vec![Token::StartTag {
                name: "div".to_string(),
                attributes: vec![("class".to_string(), "a".to_string())],
                self_closing: false,
            }]
        );
    }

    #[test]
    fn char_ref_in_attribute() {
        let tokens = tokenize(r#"<a href="?a=1&amp;b=2">"#);
        assert_eq!(
            tokens,
            vec![Token::StartTag {
                name: "a".to_string(),
                attributes: vec![("href".to_string(), "?a=1&b=2".to_string())],
                self_closing: false,
            }]
        );
    }

    #[test]
    fn multiple_attributes() {
        let tokens = tokenize(r#"<input type="text" name="foo" value="bar">"#);
        assert_eq!(
            tokens,
            vec![Token::StartTag {
                name: "input".to_string(),
                attributes: vec![
                    ("type".to_string(), "text".to_string()),
                    ("name".to_string(), "foo".to_string()),
                    ("value".to_string(), "bar".to_string()),
                ],
                self_closing: false,
            }]
        );
    }

    #[test]
    fn boolean_attribute() {
        let tokens = tokenize("<input disabled>");
        assert_eq!(
            tokens,
            vec![Token::StartTag {
                name: "input".to_string(),
                attributes: vec![("disabled".to_string(), "".to_string())],
                self_closing: false,
            }]
        );
    }

    #[test]
    fn mixed_content() {
        let tokens = tokenize("Hello <!-- comment --> World");
        assert_eq!(
            tokens,
            vec![
                Token::Character("Hello ".to_string()),
                Token::Comment(" comment ".to_string()),
                Token::Character(" World".to_string()),
            ]
        );
    }

    #[test]
    fn doctype_with_public_id() {
        let tokens = tokenize(
            r#"<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">"#,
        );
        assert_eq!(
            tokens,
            vec![Token::Doctype {
                name: Some("html".to_string()),
                public_id: Some("-//W3C//DTD XHTML 1.0 Strict//EN".to_string()),
                system_id: Some("http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd".to_string()),
                force_quirks: false,
            }]
        );
    }

    #[test]
    fn null_in_text() {
        let tokens = tokenize("a\0b");
        assert_eq!(tokens, vec![Token::Character("a\u{FFFD}b".to_string())]);
    }

    #[test]
    fn windows_1252_numeric_refs() {
        // &#x80; should map to Euro sign.
        let tokens = tokenize("&#x80;");
        assert_eq!(tokens, vec![Token::Character("\u{20AC}".to_string())]);
    }

    #[test]
    fn attribute_with_empty_value() {
        let tokens = tokenize(r#"<div class="">"#);
        assert_eq!(
            tokens,
            vec![Token::StartTag {
                name: "div".to_string(),
                attributes: vec![("class".to_string(), "".to_string())],
                self_closing: false,
            }]
        );
    }

    #[test]
    fn adjacent_tags() {
        let tokens = tokenize("<b></b><i></i>");
        assert_eq!(
            tokens,
            vec![
                Token::StartTag {
                    name: "b".to_string(),
                    attributes: vec![],
                    self_closing: false,
                },
                Token::EndTag {
                    name: "b".to_string(),
                },
                Token::StartTag {
                    name: "i".to_string(),
                    attributes: vec![],
                    self_closing: false,
                },
                Token::EndTag {
                    name: "i".to_string(),
                },
            ]
        );
    }

    #[test]
    fn newlines_in_text() {
        let tokens = tokenize("line1\nline2\nline3");
        assert_eq!(
            tokens,
            vec![Token::Character("line1\nline2\nline3".to_string())]
        );
    }

    #[test]
    fn self_closing_with_attribute() {
        let tokens = tokenize(r#"<img src="test.png"/>"#);
        assert_eq!(
            tokens,
            vec![Token::StartTag {
                name: "img".to_string(),
                attributes: vec![("src".to_string(), "test.png".to_string())],
                self_closing: true,
            }]
        );
    }

    #[test]
    fn less_than_in_text_not_tag() {
        // A bare '<' not followed by a letter should be emitted as text.
        let tokens = tokenize("1 < 2");
        assert_eq!(tokens, vec![Token::Character("1 < 2".to_string())]);
    }

    #[test]
    fn ampersand_not_entity() {
        let tokens = tokenize("a & b");
        assert_eq!(tokens, vec![Token::Character("a & b".to_string())]);
    }

    #[test]
    fn cdata_in_html_becomes_comment() {
        let tokens = tokenize("<![CDATA[hello]]>");
        // In HTML (non-foreign) context, CDATA is a parse error -> bogus comment.
        assert_eq!(tokens, vec![Token::Comment("[CDATA[hello]]".to_string())]);
    }
}