//! HTML5 tokenizer and tree builder.
//!
//! Implements the WHATWG HTML5 tokenizer state machine (§13.2.5).
44+55+mod entities;
66+mod tokenizer;
77+88+pub use tokenizer::Tokenizer;
29310/// A token emitted by the HTML tokenizer.
411#[derive(Debug, Clone, PartialEq)]
···28352936/// Tokenize an HTML input string into a sequence of tokens.
3037///
3131-/// This is a stub that returns an empty `Vec`. The real implementation
3232-/// will be a spec-compliant HTML5 tokenizer state machine.
3333-pub fn tokenize(_input: &str) -> Vec<Token> {
3434- Vec::new()
3838+/// Runs the HTML5 tokenizer state machine and returns all emitted tokens
3939+/// (excluding Eof). Adjacent Character tokens are coalesced.
4040+pub fn tokenize(input: &str) -> Vec<Token> {
4141+ let mut tok = Tokenizer::new(input);
4242+ let mut tokens = Vec::new();
4343+ loop {
4444+ let token = tok.next_token();
4545+ match token {
4646+ Token::Eof => break,
4747+ Token::Character(ref s) => {
4848+ // Coalesce adjacent character tokens.
4949+ if let Some(Token::Character(ref mut prev)) = tokens.last_mut() {
5050+ prev.push_str(s);
5151+ } else {
5252+ tokens.push(token);
5353+ }
5454+ }
5555+ _ => tokens.push(token),
5656+ }
5757+ }
5858+ tokens
3559}