//! HTML5 tokenizer and tree builder.
//!
//! Implements the WHATWG HTML5 tokenizer state machine (§13.2.5)
//! and a simplified tree builder for constructing DOM trees from tokens.
mod entities;
mod tokenizer;
mod tree_builder;
pub use tokenizer::Tokenizer;
pub use tree_builder::{parse_html, TreeBuilder};
/// A token emitted by the HTML tokenizer.
///
/// Each variant corresponds to one of the token kinds produced by the
/// tokenization stage: DOCTYPE, start tag, end tag, character data, comment,
/// and end-of-file.
#[derive(Debug, Clone, PartialEq)]
pub enum Token {
    /// `<!DOCTYPE ...>`
    Doctype {
        /// DOCTYPE name, or `None` when no name is present.
        name: Option<String>,
        /// Public identifier, or `None` when it is missing.
        public_id: Option<String>,
        /// System identifier, or `None` when it is missing.
        system_id: Option<String>,
        /// The token's force-quirks flag.
        force_quirks: bool,
    },
    /// `<tag ...>`
    StartTag {
        /// Tag name.
        name: String,
        /// Attributes as string pairs (presumably `(name, value)`; confirm
        /// against the tokenizer that builds them).
        attributes: Vec<(String, String)>,
        /// Whether the tag was written in self-closing form (`<tag />`).
        self_closing: bool,
    },
    /// `</tag>`
    EndTag {
        /// Tag name.
        name: String,
    },
    /// Character data (may be coalesced).
    Character(String),
    /// `<!-- ... -->`
    Comment(String),
    /// End of file.
    Eof,
}
/// Tokenize an HTML input string into a sequence of tokens.
///
/// Runs the HTML5 tokenizer state machine and returns all emitted tokens
/// (excluding Eof). Adjacent Character tokens are coalesced.
pub fn tokenize(input: &str) -> Vec {
let mut tok = Tokenizer::new(input);
let mut tokens = Vec::new();
loop {
let token = tok.next_token();
match token {
Token::Eof => break,
Token::Character(ref s) => {
// Coalesce adjacent character tokens.
if let Some(Token::Character(ref mut prev)) = tokens.last_mut() {
prev.push_str(s);
} else {
tokens.push(token);
}
}
_ => tokens.push(token),
}
}
tokens
}