//! HTML tree builder: construct a DOM tree from tokenizer output.
//!
//! Implements a simplified subset of the WHATWG HTML5 tree construction
//! algorithm for Phase 3 of the browser engine.
use we_dom::{Document, NodeId};
use crate::{Token, Tokenizer};
/// Insertion modes for the tree builder state machine.
#[derive(Debug, Clone, Copy, PartialEq)]
enum InsertionMode {
Initial,
BeforeHtml,
BeforeHead,
InHead,
Text,
AfterHead,
InBody,
AfterBody,
AfterAfterBody,
}
/// Returns true if the given tag name is a void element (self-closing, no end tag).
fn is_void_element(tag: &str) -> bool {
matches!(
tag,
"area"
| "base"
| "br"
| "col"
| "embed"
| "hr"
| "img"
| "input"
| "link"
| "meta"
| "param"
| "source"
| "track"
| "wbr"
)
}
/// HTML tree builder that processes tokens and constructs a DOM tree.
pub struct TreeBuilder {
document: Document,
/// Stack of open elements (the current nesting context).
open_elements: Vec,
head_element: Option,
body_element: Option,
insertion_mode: InsertionMode,
/// Original insertion mode, saved when switching to Text mode.
original_insertion_mode: Option,
/// Pending text for the Text insertion mode (e.g., inside ``).
pending_text: String,
}
impl TreeBuilder {
/// Create a new tree builder with an empty document.
pub fn new() -> Self {
TreeBuilder {
document: Document::new(),
open_elements: Vec::new(),
head_element: None,
body_element: None,
insertion_mode: InsertionMode::Initial,
original_insertion_mode: None,
pending_text: String::new(),
}
}
/// Process a single token, updating the DOM tree.
pub fn process_token(&mut self, token: Token) {
match self.insertion_mode {
InsertionMode::Initial => self.handle_initial(token),
InsertionMode::BeforeHtml => self.handle_before_html(token),
InsertionMode::BeforeHead => self.handle_before_head(token),
InsertionMode::InHead => self.handle_in_head(token),
InsertionMode::Text => self.handle_text(token),
InsertionMode::AfterHead => self.handle_after_head(token),
InsertionMode::InBody => self.handle_in_body(token),
InsertionMode::AfterBody => self.handle_after_body(token),
InsertionMode::AfterAfterBody => self.handle_after_after_body(token),
}
}
/// Finish building and return the constructed DOM document.
pub fn finish(self) -> Document {
self.document
}
// --- Insertion mode handlers ---
fn handle_initial(&mut self, token: Token) {
match token {
Token::Doctype { .. } => {
// For Phase 3, we just acknowledge the DOCTYPE and move on.
self.insertion_mode = InsertionMode::BeforeHtml;
}
Token::Comment(data) => {
let comment = self.document.create_comment(&data);
let root = self.document.root();
self.document.append_child(root, comment);
}
Token::Character(ref s) if s.chars().all(|c| c.is_ascii_whitespace()) => {
// Ignore whitespace in Initial mode.
}
_ => {
// Anything else: switch to BeforeHtml and reprocess.
self.insertion_mode = InsertionMode::BeforeHtml;
self.handle_before_html(token);
}
}
}
fn handle_before_html(&mut self, token: Token) {
match token {
Token::Doctype { .. } => { /* ignore */ }
Token::Comment(data) => {
let comment = self.document.create_comment(&data);
let root = self.document.root();
self.document.append_child(root, comment);
}
Token::Character(ref s) if s.chars().all(|c| c.is_ascii_whitespace()) => {
// Ignore whitespace.
}
Token::StartTag { ref name, .. } if name == "html" => {
let html = self.create_element_from_token(&token);
let root = self.document.root();
self.document.append_child(root, html);
self.open_elements.push(html);
self.insertion_mode = InsertionMode::BeforeHead;
}
Token::EndTag { ref name }
if name != "head" && name != "body" && name != "html" && name != "br" =>
{
// Parse error, ignore.
}
_ => {
// Create an implicit element.
let html = self.document.create_element("html");
let root = self.document.root();
self.document.append_child(root, html);
self.open_elements.push(html);
self.insertion_mode = InsertionMode::BeforeHead;
self.handle_before_head(token);
}
}
}
fn handle_before_head(&mut self, token: Token) {
match token {
Token::Character(ref s) if s.chars().all(|c| c.is_ascii_whitespace()) => {
// Ignore whitespace.
}
Token::Comment(data) => {
self.insert_comment(&data);
}
Token::Doctype { .. } => { /* ignore */ }
Token::StartTag { ref name, .. } if name == "html" => {
// Process as if InBody.
self.handle_in_body(token);
}
Token::StartTag { ref name, .. } if name == "head" => {
let head = self.create_element_from_token(&token);
self.insert_node(head);
self.open_elements.push(head);
self.head_element = Some(head);
self.insertion_mode = InsertionMode::InHead;
}
Token::EndTag { ref name }
if name != "head" && name != "body" && name != "html" && name != "br" =>
{
// Parse error, ignore.
}
_ => {
// Implied .
let head = self.document.create_element("head");
self.insert_node(head);
self.open_elements.push(head);
self.head_element = Some(head);
self.insertion_mode = InsertionMode::InHead;
self.handle_in_head(token);
}
}
}
fn handle_in_head(&mut self, token: Token) {
match token {
Token::Character(ref s) if s.chars().all(|c| c.is_ascii_whitespace()) => {
self.insert_text(s);
}
Token::Comment(data) => {
self.insert_comment(&data);
}
Token::Doctype { .. } => { /* ignore */ }
Token::StartTag { ref name, .. } if name == "title" => {
let elem = self.create_element_from_token(&token);
self.insert_node(elem);
self.open_elements.push(elem);
self.original_insertion_mode = Some(self.insertion_mode);
self.insertion_mode = InsertionMode::Text;
}
Token::StartTag { ref name, .. }
if name == "style" || name == "script" || name == "noscript" =>
{
let elem = self.create_element_from_token(&token);
self.insert_node(elem);
self.open_elements.push(elem);
self.original_insertion_mode = Some(self.insertion_mode);
self.insertion_mode = InsertionMode::Text;
}
Token::StartTag { ref name, .. } if name == "meta" || name == "link" => {
let elem = self.create_element_from_token(&token);
self.insert_node(elem);
// Void elements: don't push onto stack.
}
Token::StartTag { ref name, .. } if name == "head" => {
// Ignore duplicate .
}
Token::EndTag { ref name } if name == "head" => {
self.pop_until("head");
self.insertion_mode = InsertionMode::AfterHead;
}
Token::EndTag { ref name } if name != "body" && name != "html" && name != "br" => {
// Parse error, ignore.
}
_ => {
// Pop and switch to AfterHead, then reprocess.
self.pop_until("head");
self.insertion_mode = InsertionMode::AfterHead;
self.handle_after_head(token);
}
}
}
fn handle_text(&mut self, token: Token) {
match token {
Token::Character(s) => {
self.pending_text.push_str(&s);
}
Token::EndTag { .. } => {
// Flush pending text.
if !self.pending_text.is_empty() {
let text = self.pending_text.clone();
self.pending_text.clear();
self.insert_text(&text);
}
// Pop the element (e.g., ).
self.open_elements.pop();
self.insertion_mode = self
.original_insertion_mode
.unwrap_or(InsertionMode::InBody);
self.original_insertion_mode = None;
}
Token::Eof => {
// Flush pending text.
if !self.pending_text.is_empty() {
let text = self.pending_text.clone();
self.pending_text.clear();
self.insert_text(&text);
}
self.open_elements.pop();
self.insertion_mode = self
.original_insertion_mode
.unwrap_or(InsertionMode::InBody);
self.original_insertion_mode = None;
self.process_token(Token::Eof);
}
_ => {}
}
}
fn handle_after_head(&mut self, token: Token) {
match token {
Token::Character(ref s) if s.chars().all(|c| c.is_ascii_whitespace()) => {
self.insert_text(s);
}
Token::Comment(data) => {
self.insert_comment(&data);
}
Token::Doctype { .. } => { /* ignore */ }
Token::StartTag { ref name, .. } if name == "html" => {
self.handle_in_body(token);
}
Token::StartTag { ref name, .. } if name == "body" => {
let body = self.create_element_from_token(&token);
self.insert_node(body);
self.open_elements.push(body);
self.body_element = Some(body);
self.insertion_mode = InsertionMode::InBody;
}
Token::StartTag { ref name, .. } if name == "head" => {
// Ignore.
}
Token::EndTag { ref name } if name != "body" && name != "html" && name != "br" => {
// Ignore.
}
_ => {
// Implied .
let body = self.document.create_element("body");
self.insert_node(body);
self.open_elements.push(body);
self.body_element = Some(body);
self.insertion_mode = InsertionMode::InBody;
self.handle_in_body(token);
}
}
}
fn handle_in_body(&mut self, token: Token) {
match token {
Token::Character(s) => {
self.insert_text(&s);
}
Token::Comment(data) => {
self.insert_comment(&data);
}
Token::Doctype { .. } => { /* ignore */ }
Token::StartTag { ref name, .. } if name == "html" => {
// Merge attributes onto existing element.
if let Token::StartTag { attributes, .. } = &token {
if let Some(&html_id) = self.open_elements.first() {
for (attr_name, attr_value) in attributes {
if self.document.get_attribute(html_id, attr_name).is_none() {
self.document.set_attribute(html_id, attr_name, attr_value);
}
}
}
}
}
Token::StartTag { ref name, .. }
if name == "body"
|| name == "head"
|| name == "title"
|| name == "style"
|| name == "script" =>
{
match name.as_str() {
"body" => {
// Ignore duplicate .
}
"head" => {
// Ignore in body.
}
_ => {
// title/style/script: process using InHead rules
self.handle_in_head(token);
}
}
}
Token::StartTag { ref name, .. }
if name == "p"
|| name == "div"
|| name == "h1"
|| name == "h2"
|| name == "h3"
|| name == "h4"
|| name == "h5"
|| name == "h6"
|| name == "pre"
|| name == "blockquote"
|| name == "ul"
|| name == "ol"
|| name == "li" =>
{
// If there's a
in button scope, close it first.
if self.has_element_in_button_scope("p") {
self.close_p_element();
}
let elem = self.create_element_from_token(&token);
self.insert_node(elem);
self.open_elements.push(elem);
}
Token::StartTag { ref name, .. } if is_void_element(name) => {
let elem = self.create_element_from_token(&token);
self.insert_node(elem);
// Don't push void elements onto the stack.
}
Token::StartTag { .. } => {
// Generic start tag: create element and push onto stack.
let elem = self.create_element_from_token(&token);
self.insert_node(elem);
self.open_elements.push(elem);
}
Token::EndTag { ref name } if name == "body" => {
if self.has_element_in_scope("body") {
self.insertion_mode = InsertionMode::AfterBody;
}
}
Token::EndTag { ref name } if name == "html" => {
if self.has_element_in_scope("body") {
self.insertion_mode = InsertionMode::AfterBody;
self.handle_after_body(token);
}
}
Token::EndTag { ref name } if name == "p" => {
if !self.has_element_in_button_scope("p") {
// No matching
: insert an empty one, then close it.
let p = self.document.create_element("p");
self.insert_node(p);
self.open_elements.push(p);
}
self.close_p_element();
}
Token::EndTag { ref name }
if name == "div"
|| name == "pre"
|| name == "blockquote"
|| name == "ul"
|| name == "ol"
|| name == "li" =>
{
if self.has_element_in_scope(name) {
self.generate_implied_end_tags(Some(name));
self.pop_until(name);
}
}
Token::EndTag { ref name }
if name == "h1"
|| name == "h2"
|| name == "h3"
|| name == "h4"
|| name == "h5"
|| name == "h6" =>
{
if self.has_heading_in_scope() {
self.generate_implied_end_tags(None);
// Pop until we find a heading element.
while let Some(id) = self.open_elements.pop() {
if let Some(tag) = self.document.tag_name(id) {
if matches!(tag, "h1" | "h2" | "h3" | "h4" | "h5" | "h6") {
break;
}
}
}
}
}
Token::EndTag { ref name } => {
// Generic end tag: walk back through open elements.
self.handle_any_other_end_tag(name);
}
Token::Eof => {
// Stop parsing.
}
}
}
fn handle_after_body(&mut self, token: Token) {
match token {
Token::Character(ref s) if s.chars().all(|c| c.is_ascii_whitespace()) => {
// Process whitespace as in InBody.
self.handle_in_body(token);
}
Token::Comment(data) => {
// Insert as last child of the first element (html).
let comment = self.document.create_comment(&data);
if let Some(&html) = self.open_elements.first() {
self.document.append_child(html, comment);
}
}
Token::Doctype { .. } => { /* ignore */ }
Token::EndTag { ref name } if name == "html" => {
self.insertion_mode = InsertionMode::AfterAfterBody;
}
Token::Eof => {
// Stop parsing.
}
_ => {
// Anything else: switch back to InBody and reprocess.
self.insertion_mode = InsertionMode::InBody;
self.handle_in_body(token);
}
}
}
fn handle_after_after_body(&mut self, token: Token) {
match token {
Token::Comment(data) => {
let comment = self.document.create_comment(&data);
let root = self.document.root();
self.document.append_child(root, comment);
}
Token::Doctype { .. } => { /* ignore */ }
Token::Character(ref s) if s.chars().all(|c| c.is_ascii_whitespace()) => {
self.handle_in_body(token);
}
Token::Eof => {
// Stop.
}
_ => {
self.insertion_mode = InsertionMode::InBody;
self.handle_in_body(token);
}
}
}
// --- Helper methods ---
/// Create a DOM element from a StartTag token, setting attributes.
fn create_element_from_token(&mut self, token: &Token) -> NodeId {
if let Token::StartTag {
name, attributes, ..
} = token
{
let id = self.document.create_element(name);
for (attr_name, attr_value) in attributes {
self.document.set_attribute(id, attr_name, attr_value);
}
id
} else {
// Should only be called with StartTag tokens.
self.document.create_element("unknown")
}
}
/// Insert a node at the current insertion point (last open element).
fn insert_node(&mut self, node: NodeId) {
let parent = self
.open_elements
.last()
.copied()
.unwrap_or_else(|| self.document.root());
self.document.append_child(parent, node);
}
/// Insert a text node at the current insertion point.
/// If the last child is already a text node, append to it.
fn insert_text(&mut self, data: &str) {
let parent = self
.open_elements
.last()
.copied()
.unwrap_or_else(|| self.document.root());
// Try to merge with existing text node.
if let Some(last_child) = self.document.last_child(parent) {
if let we_dom::NodeData::Text { data: ref existing } =
*self.document.node_data(last_child)
{
let mut merged = existing.clone();
merged.push_str(data);
self.document.set_text_content(last_child, &merged);
return;
}
}
let text = self.document.create_text(data);
self.document.append_child(parent, text);
}
/// Insert a comment node at the current insertion point.
fn insert_comment(&mut self, data: &str) {
let comment = self.document.create_comment(data);
self.insert_node(comment);
}
/// Pop elements from the stack until we find one with the given tag name.
/// The matching element is also popped.
fn pop_until(&mut self, tag_name: &str) {
while let Some(id) = self.open_elements.pop() {
if self.document.tag_name(id) == Some(tag_name) {
return;
}
}
}
/// Check if the given tag name is "in scope" (simplified).
/// In scope means there's an element with that tag on the stack,
/// and no scope barrier element between it and the top.
fn has_element_in_scope(&self, target: &str) -> bool {
for &id in self.open_elements.iter().rev() {
if let Some(tag) = self.document.tag_name(id) {
if tag == target {
return true;
}
// Scope barrier elements.
if matches!(
tag,
"applet"
| "caption"
| "html"
| "table"
| "td"
| "th"
| "marquee"
| "object"
| "template"
) {
return false;
}
}
}
false
}
/// Check if the given tag name is "in button scope".
fn has_element_in_button_scope(&self, target: &str) -> bool {
for &id in self.open_elements.iter().rev() {
if let Some(tag) = self.document.tag_name(id) {
if tag == target {
return true;
}
// Button scope includes all regular scope barriers plus