web engine - experimental web browser
at poly1305-h4-fix 62 lines 1.7 kB view raw
1//! HTML5 tokenizer and tree builder. 2//! 3//! Implements the WHATWG HTML5 tokenizer state machine (§13.2.5) 4//! and a simplified tree builder for constructing DOM trees from tokens. 5 6mod entities; 7mod tokenizer; 8mod tree_builder; 9 10pub use tokenizer::Tokenizer; 11pub use tree_builder::{parse_html, TreeBuilder}; 12 13/// A token emitted by the HTML tokenizer. 14#[derive(Debug, Clone, PartialEq)] 15pub enum Token { 16 /// `<!DOCTYPE name public_id system_id>` 17 Doctype { 18 name: Option<String>, 19 public_id: Option<String>, 20 system_id: Option<String>, 21 force_quirks: bool, 22 }, 23 /// `<tag attr="val">` 24 StartTag { 25 name: String, 26 attributes: Vec<(String, String)>, 27 self_closing: bool, 28 }, 29 /// `</tag>` 30 EndTag { name: String }, 31 /// Character data (may be coalesced). 32 Character(String), 33 /// `<!-- comment -->` 34 Comment(String), 35 /// End of file. 36 Eof, 37} 38 39/// Tokenize an HTML input string into a sequence of tokens. 40/// 41/// Runs the HTML5 tokenizer state machine and returns all emitted tokens 42/// (excluding Eof). Adjacent Character tokens are coalesced. 43pub fn tokenize(input: &str) -> Vec<Token> { 44 let mut tok = Tokenizer::new(input); 45 let mut tokens = Vec::new(); 46 loop { 47 let token = tok.next_token(); 48 match token { 49 Token::Eof => break, 50 Token::Character(ref s) => { 51 // Coalesce adjacent character tokens. 52 if let Some(Token::Character(ref mut prev)) = tokens.last_mut() { 53 prev.push_str(s); 54 } else { 55 tokens.push(token); 56 } 57 } 58 _ => tokens.push(token), 59 } 60 } 61 tokens 62}