web engine - experimental web browser
1//! HTML5 tokenizer and tree builder.
2//!
3//! Implements the WHATWG HTML5 tokenizer state machine (§13.2.5)
4//! and a simplified tree builder for constructing DOM trees from tokens.
5
6mod entities;
7mod tokenizer;
8mod tree_builder;
9
10pub use tokenizer::Tokenizer;
11pub use tree_builder::{parse_html, TreeBuilder};
12
/// A token emitted by the HTML tokenizer.
///
/// Derives `Eq` in addition to `PartialEq`: every payload is built from
/// `String`, `bool`, `Option` and `Vec`, so equality is a full equivalence
/// relation and tokens can be used wherever `Eq` is required.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Token {
    /// `<!DOCTYPE name public_id system_id>`
    Doctype {
        /// DOCTYPE name; `None` when no name is recorded.
        name: Option<String>,
        /// Public identifier; `None` when not recorded.
        public_id: Option<String>,
        /// System identifier; `None` when not recorded.
        system_id: Option<String>,
        /// Force-quirks flag carried by the DOCTYPE token.
        // NOTE(review): presumably mirrors the WHATWG "force-quirks flag" — confirm in the tokenizer.
        force_quirks: bool,
    },
    /// `<tag attr="val">`
    StartTag {
        /// Tag name.
        name: String,
        /// Attributes as string pairs.
        // NOTE(review): presumably `(name, value)` in source order — confirm in the tokenizer.
        attributes: Vec<(String, String)>,
        /// Self-closing flag of the tag.
        self_closing: bool,
    },
    /// `</tag>`
    EndTag {
        /// Tag name.
        name: String,
    },
    /// Character data (may be coalesced).
    Character(String),
    /// `<!-- comment -->`
    Comment(String),
    /// End of file.
    Eof,
}
38
39/// Tokenize an HTML input string into a sequence of tokens.
40///
41/// Runs the HTML5 tokenizer state machine and returns all emitted tokens
42/// (excluding Eof). Adjacent Character tokens are coalesced.
43pub fn tokenize(input: &str) -> Vec<Token> {
44 let mut tok = Tokenizer::new(input);
45 let mut tokens = Vec::new();
46 loop {
47 let token = tok.next_token();
48 match token {
49 Token::Eof => break,
50 Token::Character(ref s) => {
51 // Coalesce adjacent character tokens.
52 if let Some(Token::Character(ref mut prev)) = tokens.last_mut() {
53 prev.push_str(s);
54 } else {
55 tokens.push(token);
56 }
57 }
58 _ => tokens.push(token),
59 }
60 }
61 tokens
62}