//! HTML5 tokenizer and tree builder. //! //! Implements the WHATWG HTML5 tokenizer state machine (ยง13.2.5) //! and a simplified tree builder for constructing DOM trees from tokens. mod entities; mod tokenizer; mod tree_builder; pub use tokenizer::Tokenizer; pub use tree_builder::{parse_html, TreeBuilder}; /// A token emitted by the HTML tokenizer. #[derive(Debug, Clone, PartialEq)] pub enum Token { /// `` Doctype { name: Option, public_id: Option, system_id: Option, force_quirks: bool, }, /// `` StartTag { name: String, attributes: Vec<(String, String)>, self_closing: bool, }, /// `` EndTag { name: String }, /// Character data (may be coalesced). Character(String), /// `` Comment(String), /// End of file. Eof, } /// Tokenize an HTML input string into a sequence of tokens. /// /// Runs the HTML5 tokenizer state machine and returns all emitted tokens /// (excluding Eof). Adjacent Character tokens are coalesced. pub fn tokenize(input: &str) -> Vec { let mut tok = Tokenizer::new(input); let mut tokens = Vec::new(); loop { let token = tok.next_token(); match token { Token::Eof => break, Token::Character(ref s) => { // Coalesce adjacent character tokens. if let Some(Token::Character(ref mut prev)) = tokens.last_mut() { prev.push_str(s); } else { tokens.push(token); } } _ => tokens.push(token), } } tokens }