`). pending_text: String, } impl TreeBuilder { /// Create a new tree builder with an empty document. pub fn new() -> Self { TreeBuilder { document: Document::new(), open_elements: Vec::new(), head_element: None, body_element: None, insertion_mode: InsertionMode::Initial, original_insertion_mode: None, pending_text: String::new(), } } /// Process a single token, updating the DOM tree. pub fn process_token(&mut self, token: Token) { match self.insertion_mode { InsertionMode::Initial => self.handle_initial(token), InsertionMode::BeforeHtml => self.handle_before_html(token), InsertionMode::BeforeHead => self.handle_before_head(token), InsertionMode::InHead => self.handle_in_head(token), InsertionMode::Text => self.handle_text(token), InsertionMode::AfterHead => self.handle_after_head(token), InsertionMode::InBody => self.handle_in_body(token), InsertionMode::AfterBody => self.handle_after_body(token), InsertionMode::AfterAfterBody => self.handle_after_after_body(token), } } /// Finish building and return the constructed DOM document. pub fn finish(self) -> Document { self.document } // --- Insertion mode handlers --- fn handle_initial(&mut self, token: Token) { match token { Token::Doctype { .. } => { // For Phase 3, we just acknowledge the DOCTYPE and move on. self.insertion_mode = InsertionMode::BeforeHtml; } Token::Comment(data) => { let comment = self.document.create_comment(&data); let root = self.document.root(); self.document.append_child(root, comment); } Token::Character(ref s) if s.chars().all(|c| c.is_ascii_whitespace()) => { // Ignore whitespace in Initial mode. } _ => { // Anything else: switch to BeforeHtml and reprocess. self.insertion_mode = InsertionMode::BeforeHtml; self.handle_before_html(token); } } } fn handle_before_html(&mut self, token: Token) { match token { Token::Doctype { .. 
} => { /* ignore */ } Token::Comment(data) => { let comment = self.document.create_comment(&data); let root = self.document.root(); self.document.append_child(root, comment); } Token::Character(ref s) if s.chars().all(|c| c.is_ascii_whitespace()) => { // Ignore whitespace. } Token::StartTag { ref name, .. } if name == "html" => { let html = self.create_element_from_token(&token); let root = self.document.root(); self.document.append_child(root, html); self.open_elements.push(html); self.insertion_mode = InsertionMode::BeforeHead; } Token::EndTag { ref name } if name != "head" && name != "body" && name != "html" && name != "br" => { // Parse error, ignore. } _ => { // Create an implicit <html> element. let html = self.document.create_element("html"); let root = self.document.root(); self.document.append_child(root, html); self.open_elements.push(html); self.insertion_mode = InsertionMode::BeforeHead; self.handle_before_head(token); } } } fn handle_before_head(&mut self, token: Token) { match token { Token::Character(ref s) if s.chars().all(|c| c.is_ascii_whitespace()) => { // Ignore whitespace. } Token::Comment(data) => { self.insert_comment(&data); } Token::Doctype { .. } => { /* ignore */ } Token::StartTag { ref name, .. } if name == "html" => { // Process as if InBody. self.handle_in_body(token); } Token::StartTag { ref name, .. } if name == "head" => { let head = self.create_element_from_token(&token); self.insert_node(head); self.open_elements.push(head); self.head_element = Some(head); self.insertion_mode = InsertionMode::InHead; } Token::EndTag { ref name } if name != "head" && name != "body" && name != "html" && name != "br" => { // Parse error, ignore. } _ => { // Implied <head>. 
let head = self.document.create_element("head"); self.insert_node(head); self.open_elements.push(head); self.head_element = Some(head); self.insertion_mode = InsertionMode::InHead; self.handle_in_head(token); } } } fn handle_in_head(&mut self, token: Token) { match token { Token::Character(ref s) if s.chars().all(|c| c.is_ascii_whitespace()) => { self.insert_text(s); } Token::Comment(data) => { self.insert_comment(&data); } Token::Doctype { .. } => { /* ignore */ } Token::StartTag { ref name, .. } if name == "title" => { let elem = self.create_element_from_token(&token); self.insert_node(elem); self.open_elements.push(elem); self.original_insertion_mode = Some(self.insertion_mode); self.insertion_mode = InsertionMode::Text; } Token::StartTag { ref name, .. } if name == "style" || name == "script" || name == "noscript" => { let elem = self.create_element_from_token(&token); self.insert_node(elem); self.open_elements.push(elem); self.original_insertion_mode = Some(self.insertion_mode); self.insertion_mode = InsertionMode::Text; } Token::StartTag { ref name, .. } if name == "meta" || name == "link" => { let elem = self.create_element_from_token(&token); self.insert_node(elem); // Void elements: don't push onto stack. } Token::StartTag { ref name, .. } if name == "head" => { // Ignore duplicate <head>. } Token::EndTag { ref name } if name == "head" => { self.pop_until("head"); self.insertion_mode = InsertionMode::AfterHead; } Token::EndTag { ref name } if name != "body" && name != "html" && name != "br" => { // Parse error, ignore. } _ => { // Pop <head> and switch to AfterHead, then reprocess. self.pop_until("head"); self.insertion_mode = InsertionMode::AfterHead; self.handle_after_head(token); } } } fn handle_text(&mut self, token: Token) { match token { Token::Character(s) => { self.pending_text.push_str(&s); } Token::EndTag { .. } => { // Flush pending text. 
if !self.pending_text.is_empty() { let text = self.pending_text.clone(); self.pending_text.clear(); self.insert_text(&text); } // Pop the element (e.g., <title>). self.open_elements.pop(); self.insertion_mode = self .original_insertion_mode .unwrap_or(InsertionMode::InBody); self.original_insertion_mode = None; } Token::Eof => { // Flush pending text. if !self.pending_text.is_empty() { let text = self.pending_text.clone(); self.pending_text.clear(); self.insert_text(&text); } self.open_elements.pop(); self.insertion_mode = self .original_insertion_mode .unwrap_or(InsertionMode::InBody); self.original_insertion_mode = None; self.process_token(Token::Eof); } _ => {} } } fn handle_after_head(&mut self, token: Token) { match token { Token::Character(ref s) if s.chars().all(|c| c.is_ascii_whitespace()) => { self.insert_text(s); } Token::Comment(data) => { self.insert_comment(&data); } Token::Doctype { .. } => { /* ignore */ } Token::StartTag { ref name, .. } if name == "html" => { self.handle_in_body(token); } Token::StartTag { ref name, .. } if name == "body" => { let body = self.create_element_from_token(&token); self.insert_node(body); self.open_elements.push(body); self.body_element = Some(body); self.insertion_mode = InsertionMode::InBody; } Token::StartTag { ref name, .. } if name == "head" => { // Ignore. } Token::EndTag { ref name } if name != "body" && name != "html" && name != "br" => { // Ignore. } _ => { // Implied <body>. let body = self.document.create_element("body"); self.insert_node(body); self.open_elements.push(body); self.body_element = Some(body); self.insertion_mode = InsertionMode::InBody; self.handle_in_body(token); } } } fn handle_in_body(&mut self, token: Token) { match token { Token::Character(s) => { self.insert_text(&s); } Token::Comment(data) => { self.insert_comment(&data); } Token::Doctype { .. } => { /* ignore */ } Token::StartTag { ref name, .. } if name == "html" => { // Merge attributes onto existing <html> element. 
if let Token::StartTag { attributes, .. } = &token { if let Some(&html_id) = self.open_elements.first() { for (attr_name, attr_value) in attributes { if self.document.get_attribute(html_id, attr_name).is_none() { self.document.set_attribute(html_id, attr_name, attr_value); } } } } } Token::StartTag { ref name, .. } if name == "body" || name == "head" || name == "title" || name == "style" || name == "script" => { match name.as_str() { "body" => { // Ignore duplicate <body>. } "head" => { // Ignore <head> in body. } _ => { // title/style/script: process using InHead rules self.handle_in_head(token); } } } Token::StartTag { ref name, .. } if name == "p" || name == "div" || name == "h1" || name == "h2" || name == "h3" || name == "h4" || name == "h5" || name == "h6" || name == "pre" || name == "blockquote" || name == "ul" || name == "ol" || name == "li" => { // If there's a <p> in button scope, close it first. if self.has_element_in_button_scope("p") { self.close_p_element(); } let elem = self.create_element_from_token(&token); self.insert_node(elem); self.open_elements.push(elem); } Token::StartTag { ref name, .. } if is_void_element(name) => { let elem = self.create_element_from_token(&token); self.insert_node(elem); // Don't push void elements onto the stack. } Token::StartTag { .. } => { // Generic start tag: create element and push onto stack. let elem = self.create_element_from_token(&token); self.insert_node(elem); self.open_elements.push(elem); } Token::EndTag { ref name } if name == "body" => { if self.has_element_in_scope("body") { self.insertion_mode = InsertionMode::AfterBody; } } Token::EndTag { ref name } if name == "html" => { if self.has_element_in_scope("body") { self.insertion_mode = InsertionMode::AfterBody; self.handle_after_body(token); } } Token::EndTag { ref name } if name == "p" => { if !self.has_element_in_button_scope("p") { // No matching <p>: insert an empty one, then close it. 
let p = self.document.create_element("p"); self.insert_node(p); self.open_elements.push(p); } self.close_p_element(); } Token::EndTag { ref name } if name == "div" || name == "pre" || name == "blockquote" || name == "ul" || name == "ol" || name == "li" => { if self.has_element_in_scope(name) { self.generate_implied_end_tags(Some(name)); self.pop_until(name); } } Token::EndTag { ref name } if name == "h1" || name == "h2" || name == "h3" || name == "h4" || name == "h5" || name == "h6" => { if self.has_heading_in_scope() { self.generate_implied_end_tags(None); // Pop until we find a heading element. while let Some(id) = self.open_elements.pop() { if let Some(tag) = self.document.tag_name(id) { if matches!(tag, "h1" | "h2" | "h3" | "h4" | "h5" | "h6") { break; } } } } } Token::EndTag { ref name } => { // Generic end tag: walk back through open elements. self.handle_any_other_end_tag(name); } Token::Eof => { // Stop parsing. } } } fn handle_after_body(&mut self, token: Token) { match token { Token::Character(ref s) if s.chars().all(|c| c.is_ascii_whitespace()) => { // Process whitespace as in InBody. self.handle_in_body(token); } Token::Comment(data) => { // Insert as last child of the first element (html). let comment = self.document.create_comment(&data); if let Some(&html) = self.open_elements.first() { self.document.append_child(html, comment); } } Token::Doctype { .. } => { /* ignore */ } Token::EndTag { ref name } if name == "html" => { self.insertion_mode = InsertionMode::AfterAfterBody; } Token::Eof => { // Stop parsing. } _ => { // Anything else: switch back to InBody and reprocess. self.insertion_mode = InsertionMode::InBody; self.handle_in_body(token); } } } fn handle_after_after_body(&mut self, token: Token) { match token { Token::Comment(data) => { let comment = self.document.create_comment(&data); let root = self.document.root(); self.document.append_child(root, comment); } Token::Doctype { .. 
} => { /* ignore */ } Token::Character(ref s) if s.chars().all(|c| c.is_ascii_whitespace()) => { self.handle_in_body(token); } Token::Eof => { // Stop. } _ => { self.insertion_mode = InsertionMode::InBody; self.handle_in_body(token); } } } // --- Helper methods --- /// Create a DOM element from a StartTag token, setting attributes. fn create_element_from_token(&mut self, token: &Token) -> NodeId { if let Token::StartTag { name, attributes, .. } = token { let id = self.document.create_element(name); for (attr_name, attr_value) in attributes { self.document.set_attribute(id, attr_name, attr_value); } id } else { // Should only be called with StartTag tokens. self.document.create_element("unknown") } } /// Insert a node at the current insertion point (last open element). fn insert_node(&mut self, node: NodeId) { let parent = self .open_elements .last() .copied() .unwrap_or_else(|| self.document.root()); self.document.append_child(parent, node); } /// Insert a text node at the current insertion point. /// If the last child is already a text node, append to it. fn insert_text(&mut self, data: &str) { let parent = self .open_elements .last() .copied() .unwrap_or_else(|| self.document.root()); // Try to merge with existing text node. if let Some(last_child) = self.document.last_child(parent) { if let we_dom::NodeData::Text { data: ref existing } = *self.document.node_data(last_child) { let mut merged = existing.clone(); merged.push_str(data); self.document.set_text_content(last_child, &merged); return; } } let text = self.document.create_text(data); self.document.append_child(parent, text); } /// Insert a comment node at the current insertion point. fn insert_comment(&mut self, data: &str) { let comment = self.document.create_comment(data); self.insert_node(comment); } /// Pop elements from the stack until we find one with the given tag name. /// The matching element is also popped. 
fn pop_until(&mut self, tag_name: &str) { while let Some(id) = self.open_elements.pop() { if self.document.tag_name(id) == Some(tag_name) { return; } } } /// Check if the given tag name is "in scope" (simplified). /// In scope means there's an element with that tag on the stack, /// and no scope barrier element between it and the top. fn has_element_in_scope(&self, target: &str) -> bool { for &id in self.open_elements.iter().rev() { if let Some(tag) = self.document.tag_name(id) { if tag == target { return true; } // Scope barrier elements. if matches!( tag, "applet" | "caption" | "html" | "table" | "td" | "th" | "marquee" | "object" | "template" ) { return false; } } } false } /// Check if the given tag name is "in button scope". fn has_element_in_button_scope(&self, target: &str) -> bool { for &id in self.open_elements.iter().rev() { if let Some(tag) = self.document.tag_name(id) { if tag == target { return true; } // Button scope includes all regular scope barriers plus <button>. if matches!( tag, "applet" | "button" | "caption" | "html" | "table" | "td" | "th" | "marquee" | "object" | "template" ) { return false; } } } false } /// Check if any heading element (h1-h6) is in scope. fn has_heading_in_scope(&self) -> bool { for &id in self.open_elements.iter().rev() { if let Some(tag) = self.document.tag_name(id) { if matches!(tag, "h1" | "h2" | "h3" | "h4" | "h5" | "h6") { return true; } if matches!( tag, "applet" | "caption" | "html" | "table" | "td" | "th" | "marquee" | "object" | "template" ) { return false; } } } false } /// Close a `<p>` element: generate implied end tags (excluding p), /// then pop until we find the `<p>`. fn close_p_element(&mut self) { self.generate_implied_end_tags(Some("p")); self.pop_until("p"); } /// Generate implied end tags. If `exclude` is provided, don't generate /// an end tag for that element. 
fn generate_implied_end_tags(&mut self, exclude: Option<&str>) { loop { let should_pop = self .open_elements .last() .and_then(|&id| self.document.tag_name(id)) .map(|tag| { if let Some(excl) = exclude { if tag == excl { return false; } } matches!( tag, "dd" | "dt" | "li" | "optgroup" | "option" | "p" | "rb" | "rp" | "rt" | "rtc" ) }) .unwrap_or(false); if should_pop { self.open_elements.pop(); } else { break; } } } /// Handle a generic end tag by walking back through open elements /// using the "any other end tag" algorithm. fn handle_any_other_end_tag(&mut self, name: &str) { // Walk backwards through the stack. let mut i = self.open_elements.len(); while i > 0 { i -= 1; let id = self.open_elements[i]; if self.document.tag_name(id) == Some(name) { // Pop everything above and including this element. self.open_elements.truncate(i); return; } // If this is a "special" element, stop. if let Some(tag) = self.document.tag_name(id) { if is_special_element(tag) { return; } } } } } impl Default for TreeBuilder { fn default() -> Self { Self::new() } } /// Returns true if the tag is a "special" element per the HTML spec. 
fn is_special_element(tag: &str) -> bool { matches!( tag, "address" | "applet" | "area" | "article" | "aside" | "base" | "basefont" | "bgsound" | "blockquote" | "body" | "br" | "button" | "caption" | "center" | "col" | "colgroup" | "dd" | "details" | "dir" | "div" | "dl" | "dt" | "embed" | "fieldset" | "figcaption" | "figure" | "footer" | "form" | "frame" | "frameset" | "h1" | "h2" | "h3" | "h4" | "h5" | "h6" | "head" | "header" | "hgroup" | "hr" | "html" | "iframe" | "img" | "input" | "li" | "link" | "listing" | "main" | "marquee" | "menu" | "meta" | "nav" | "noembed" | "noframes" | "noscript" | "object" | "ol" | "p" | "param" | "plaintext" | "pre" | "script" | "section" | "select" | "source" | "style" | "summary" | "table" | "tbody" | "td" | "template" | "textarea" | "tfoot" | "th" | "thead" | "title" | "tr" | "track" | "ul" | "wbr" | "xmp" ) } /// Parse an HTML string into a DOM document. /// /// This is a convenience function that tokenizes the input and builds /// a DOM tree using the tree builder. pub fn parse_html(input: &str) -> Document { let mut builder = TreeBuilder::new(); let mut tokenizer = Tokenizer::new(input); loop { let token = tokenizer.next_token(); let is_eof = token == Token::Eof; builder.process_token(token); if is_eof { break; } } builder.finish() } #[cfg(test)] mod tests { use super::*; use we_dom::NodeData; /// Helper: collect tag names of direct children of a node. fn child_tags(doc: &Document, node: NodeId) -> Vec<String> { doc.children(node) .filter_map(|id| doc.tag_name(id).map(String::from)) .collect() } /// Helper: get the text content of all text node children, concatenated. fn text_of_children(doc: &Document, node: NodeId) -> String { let mut result = String::new(); for child in doc.children(node) { if let Some(text) = doc.text_content(child) { result.push_str(text); } } result } #[test] fn parse_full_document() { let doc = parse_html( "<!DOCTYPE html><html><head><title>Test

//! HTML tree builder: construct a DOM tree from tokenizer output. //! //! Implements a simplified subset of the WHATWG HTML5 tree construction //! algorithm for Phase 3 of the browser engine. use we_dom::{Document, NodeId}; use crate::{Token, Tokenizer}; /// Insertion modes for the tree builder state machine. #[derive(Debug, Clone, Copy, PartialEq)] enum InsertionMode { Initial, BeforeHtml, BeforeHead, InHead, Text, AfterHead, InBody, AfterBody, AfterAfterBody, } /// Returns true if the given tag name is a void element (self-closing, no end tag). fn is_void_element(tag: &str) -> bool { matches!( tag, "area" | "base" | "br" | "col" | "embed" | "hr" | "img" | "input" | "link" | "meta" | "param" | "source" | "track" | "wbr" ) } /// HTML tree builder that processes tokens and constructs a DOM tree. pub struct TreeBuilder { document: Document, /// Stack of open elements (the current nesting context). open_elements: Vec, head_element: Option, body_element: Option, insertion_mode: InsertionMode, /// Original insertion mode, saved when switching to Text mode. original_insertion_mode: Option, /// Pending text for the Text insertion mode (e.g., inside ``). pending_text: String, } impl TreeBuilder { /// Create a new tree builder with an empty document. pub fn new() -> Self { TreeBuilder { document: Document::new(), open_elements: Vec::new(), head_element: None, body_element: None, insertion_mode: InsertionMode::Initial, original_insertion_mode: None, pending_text: String::new(), } } /// Process a single token, updating the DOM tree. 
pub fn process_token(&mut self, token: Token) { match self.insertion_mode { InsertionMode::Initial => self.handle_initial(token), InsertionMode::BeforeHtml => self.handle_before_html(token), InsertionMode::BeforeHead => self.handle_before_head(token), InsertionMode::InHead => self.handle_in_head(token), InsertionMode::Text => self.handle_text(token), InsertionMode::AfterHead => self.handle_after_head(token), InsertionMode::InBody => self.handle_in_body(token), InsertionMode::AfterBody => self.handle_after_body(token), InsertionMode::AfterAfterBody => self.handle_after_after_body(token), } } /// Finish building and return the constructed DOM document. pub fn finish(self) -> Document { self.document } // --- Insertion mode handlers --- fn handle_initial(&mut self, token: Token) { match token { Token::Doctype { .. } => { // For Phase 3, we just acknowledge the DOCTYPE and move on. self.insertion_mode = InsertionMode::BeforeHtml; } Token::Comment(data) => { let comment = self.document.create_comment(&data); let root = self.document.root(); self.document.append_child(root, comment); } Token::Character(ref s) if s.chars().all(|c| c.is_ascii_whitespace()) => { // Ignore whitespace in Initial mode. } _ => { // Anything else: switch to BeforeHtml and reprocess. self.insertion_mode = InsertionMode::BeforeHtml; self.handle_before_html(token); } } } fn handle_before_html(&mut self, token: Token) { match token { Token::Doctype { .. } => { /* ignore */ } Token::Comment(data) => { let comment = self.document.create_comment(&data); let root = self.document.root(); self.document.append_child(root, comment); } Token::Character(ref s) if s.chars().all(|c| c.is_ascii_whitespace()) => { // Ignore whitespace. } Token::StartTag { ref name, .. 
} if name == "html" => { let html = self.create_element_from_token(&token); let root = self.document.root(); self.document.append_child(root, html); self.open_elements.push(html); self.insertion_mode = InsertionMode::BeforeHead; } Token::EndTag { ref name } if name != "head" && name != "body" && name != "html" && name != "br" => { // Parse error, ignore. } _ => { // Create an implicit <html> element. let html = self.document.create_element("html"); let root = self.document.root(); self.document.append_child(root, html); self.open_elements.push(html); self.insertion_mode = InsertionMode::BeforeHead; self.handle_before_head(token); } } } fn handle_before_head(&mut self, token: Token) { match token { Token::Character(ref s) if s.chars().all(|c| c.is_ascii_whitespace()) => { // Ignore whitespace. } Token::Comment(data) => { self.insert_comment(&data); } Token::Doctype { .. } => { /* ignore */ } Token::StartTag { ref name, .. } if name == "html" => { // Process as if InBody. self.handle_in_body(token); } Token::StartTag { ref name, .. } if name == "head" => { let head = self.create_element_from_token(&token); self.insert_node(head); self.open_elements.push(head); self.head_element = Some(head); self.insertion_mode = InsertionMode::InHead; } Token::EndTag { ref name } if name != "head" && name != "body" && name != "html" && name != "br" => { // Parse error, ignore. } _ => { // Implied <head>. let head = self.document.create_element("head"); self.insert_node(head); self.open_elements.push(head); self.head_element = Some(head); self.insertion_mode = InsertionMode::InHead; self.handle_in_head(token); } } } fn handle_in_head(&mut self, token: Token) { match token { Token::Character(ref s) if s.chars().all(|c| c.is_ascii_whitespace()) => { self.insert_text(s); } Token::Comment(data) => { self.insert_comment(&data); } Token::Doctype { .. } => { /* ignore */ } Token::StartTag { ref name, .. 
} if name == "title" => { let elem = self.create_element_from_token(&token); self.insert_node(elem); self.open_elements.push(elem); self.original_insertion_mode = Some(self.insertion_mode); self.insertion_mode = InsertionMode::Text; } Token::StartTag { ref name, .. } if name == "style" || name == "script" || name == "noscript" => { let elem = self.create_element_from_token(&token); self.insert_node(elem); self.open_elements.push(elem); self.original_insertion_mode = Some(self.insertion_mode); self.insertion_mode = InsertionMode::Text; } Token::StartTag { ref name, .. } if name == "meta" || name == "link" => { let elem = self.create_element_from_token(&token); self.insert_node(elem); // Void elements: don't push onto stack. } Token::StartTag { ref name, .. } if name == "head" => { // Ignore duplicate <head>. } Token::EndTag { ref name } if name == "head" => { self.pop_until("head"); self.insertion_mode = InsertionMode::AfterHead; } Token::EndTag { ref name } if name != "body" && name != "html" && name != "br" => { // Parse error, ignore. } _ => { // Pop <head> and switch to AfterHead, then reprocess. self.pop_until("head"); self.insertion_mode = InsertionMode::AfterHead; self.handle_after_head(token); } } } fn handle_text(&mut self, token: Token) { match token { Token::Character(s) => { self.pending_text.push_str(&s); } Token::EndTag { .. } => { // Flush pending text. if !self.pending_text.is_empty() { let text = self.pending_text.clone(); self.pending_text.clear(); self.insert_text(&text); } // Pop the element (e.g., <title>). self.open_elements.pop(); self.insertion_mode = self .original_insertion_mode .unwrap_or(InsertionMode::InBody); self.original_insertion_mode = None; } Token::Eof => { // Flush pending text. 
if !self.pending_text.is_empty() { let text = self.pending_text.clone(); self.pending_text.clear(); self.insert_text(&text); } self.open_elements.pop(); self.insertion_mode = self .original_insertion_mode .unwrap_or(InsertionMode::InBody); self.original_insertion_mode = None; self.process_token(Token::Eof); } _ => {} } } fn handle_after_head(&mut self, token: Token) { match token { Token::Character(ref s) if s.chars().all(|c| c.is_ascii_whitespace()) => { self.insert_text(s); } Token::Comment(data) => { self.insert_comment(&data); } Token::Doctype { .. } => { /* ignore */ } Token::StartTag { ref name, .. } if name == "html" => { self.handle_in_body(token); } Token::StartTag { ref name, .. } if name == "body" => { let body = self.create_element_from_token(&token); self.insert_node(body); self.open_elements.push(body); self.body_element = Some(body); self.insertion_mode = InsertionMode::InBody; } Token::StartTag { ref name, .. } if name == "head" => { // Ignore. } Token::EndTag { ref name } if name != "body" && name != "html" && name != "br" => { // Ignore. } _ => { // Implied <body>. let body = self.document.create_element("body"); self.insert_node(body); self.open_elements.push(body); self.body_element = Some(body); self.insertion_mode = InsertionMode::InBody; self.handle_in_body(token); } } } fn handle_in_body(&mut self, token: Token) { match token { Token::Character(s) => { self.insert_text(&s); } Token::Comment(data) => { self.insert_comment(&data); } Token::Doctype { .. } => { /* ignore */ } Token::StartTag { ref name, .. } if name == "html" => { // Merge attributes onto existing <html> element. if let Token::StartTag { attributes, .. } = &token { if let Some(&html_id) = self.open_elements.first() { for (attr_name, attr_value) in attributes { if self.document.get_attribute(html_id, attr_name).is_none() { self.document.set_attribute(html_id, attr_name, attr_value); } } } } } Token::StartTag { ref name, .. 
} if name == "body" || name == "head" || name == "title" || name == "style" || name == "script" => { match name.as_str() { "body" => { // Ignore duplicate <body>. } "head" => { // Ignore <head> in body. } _ => { // title/style/script: process using InHead rules self.handle_in_head(token); } } } Token::StartTag { ref name, .. } if name == "p" || name == "div" || name == "h1" || name == "h2" || name == "h3" || name == "h4" || name == "h5" || name == "h6" || name == "pre" || name == "blockquote" || name == "ul" || name == "ol" || name == "li" => { // If there's a <p> in button scope, close it first. if self.has_element_in_button_scope("p") { self.close_p_element(); } let elem = self.create_element_from_token(&token); self.insert_node(elem); self.open_elements.push(elem); } Token::StartTag { ref name, .. } if is_void_element(name) => { let elem = self.create_element_from_token(&token); self.insert_node(elem); // Don't push void elements onto the stack. } Token::StartTag { .. } => { // Generic start tag: create element and push onto stack. let elem = self.create_element_from_token(&token); self.insert_node(elem); self.open_elements.push(elem); } Token::EndTag { ref name } if name == "body" => { if self.has_element_in_scope("body") { self.insertion_mode = InsertionMode::AfterBody; } } Token::EndTag { ref name } if name == "html" => { if self.has_element_in_scope("body") { self.insertion_mode = InsertionMode::AfterBody; self.handle_after_body(token); } } Token::EndTag { ref name } if name == "p" => { if !self.has_element_in_button_scope("p") { // No matching <p>: insert an empty one, then close it. 
let p = self.document.create_element("p"); self.insert_node(p); self.open_elements.push(p); } self.close_p_element(); } Token::EndTag { ref name } if name == "div" || name == "pre" || name == "blockquote" || name == "ul" || name == "ol" || name == "li" => { if self.has_element_in_scope(name) { self.generate_implied_end_tags(Some(name)); self.pop_until(name); } } Token::EndTag { ref name } if name == "h1" || name == "h2" || name == "h3" || name == "h4" || name == "h5" || name == "h6" => { if self.has_heading_in_scope() { self.generate_implied_end_tags(None); // Pop until we find a heading element. while let Some(id) = self.open_elements.pop() { if let Some(tag) = self.document.tag_name(id) { if matches!(tag, "h1" | "h2" | "h3" | "h4" | "h5" | "h6") { break; } } } } } Token::EndTag { ref name } => { // Generic end tag: walk back through open elements. self.handle_any_other_end_tag(name); } Token::Eof => { // Stop parsing. } } } fn handle_after_body(&mut self, token: Token) { match token { Token::Character(ref s) if s.chars().all(|c| c.is_ascii_whitespace()) => { // Process whitespace as in InBody. self.handle_in_body(token); } Token::Comment(data) => { // Insert as last child of the first element (html). let comment = self.document.create_comment(&data); if let Some(&html) = self.open_elements.first() { self.document.append_child(html, comment); } } Token::Doctype { .. } => { /* ignore */ } Token::EndTag { ref name } if name == "html" => { self.insertion_mode = InsertionMode::AfterAfterBody; } Token::Eof => { // Stop parsing. } _ => { // Anything else: switch back to InBody and reprocess. self.insertion_mode = InsertionMode::InBody; self.handle_in_body(token); } } } fn handle_after_after_body(&mut self, token: Token) { match token { Token::Comment(data) => { let comment = self.document.create_comment(&data); let root = self.document.root(); self.document.append_child(root, comment); } Token::Doctype { .. 
} => { /* ignore */ } Token::Character(ref s) if s.chars().all(|c| c.is_ascii_whitespace()) => { self.handle_in_body(token); } Token::Eof => { // Stop. } _ => { self.insertion_mode = InsertionMode::InBody; self.handle_in_body(token); } } } // --- Helper methods --- /// Create a DOM element from a StartTag token, setting attributes. fn create_element_from_token(&mut self, token: &Token) -> NodeId { if let Token::StartTag { name, attributes, .. } = token { let id = self.document.create_element(name); for (attr_name, attr_value) in attributes { self.document.set_attribute(id, attr_name, attr_value); } id } else { // Should only be called with StartTag tokens. self.document.create_element("unknown") } } /// Insert a node at the current insertion point (last open element). fn insert_node(&mut self, node: NodeId) { let parent = self .open_elements .last() .copied() .unwrap_or_else(|| self.document.root()); self.document.append_child(parent, node); } /// Insert a text node at the current insertion point. /// If the last child is already a text node, append to it. fn insert_text(&mut self, data: &str) { let parent = self .open_elements .last() .copied() .unwrap_or_else(|| self.document.root()); // Try to merge with existing text node. if let Some(last_child) = self.document.last_child(parent) { if let we_dom::NodeData::Text { data: ref existing } = *self.document.node_data(last_child) { let mut merged = existing.clone(); merged.push_str(data); self.document.set_text_content(last_child, &merged); return; } } let text = self.document.create_text(data); self.document.append_child(parent, text); } /// Insert a comment node at the current insertion point. fn insert_comment(&mut self, data: &str) { let comment = self.document.create_comment(data); self.insert_node(comment); } /// Pop elements from the stack until we find one with the given tag name. /// The matching element is also popped. 
fn pop_until(&mut self, tag_name: &str) {
    // Find the element closest to the top of the stack that carries this tag.
    // When there is none the stack is left untouched: popping blindly until a
    // match would drain every open element (including html/body), and later
    // insertions would then be appended directly to the document root.
    let found = self
        .open_elements
        .iter()
        .rposition(|&id| self.document.tag_name(id) == Some(tag_name));
    if let Some(index) = found {
        // Drop the matching element together with everything above it.
        self.open_elements.truncate(index);
    }
}

/// Returns true for the tags that end a default "in scope" search.
fn is_default_scope_barrier(tag: &str) -> bool {
    matches!(
        tag,
        "applet"
            | "caption"
            | "html"
            | "table"
            | "td"
            | "th"
            | "marquee"
            | "object"
            | "template"
    )
}

/// Shared stack walk behind the `has_*_in_scope` checks.
///
/// Visits the open elements from the top of the stack downwards, skipping
/// nodes without a tag name. Returns `true` at the first tag accepted by
/// `is_target` and `false` at the first tag accepted by `is_barrier`; the
/// target test runs first, so a tag that is both counts as found. An
/// exhausted stack yields `false`.
fn stack_has_in_scope(
    &self,
    is_target: impl Fn(&str) -> bool,
    is_barrier: impl Fn(&str) -> bool,
) -> bool {
    self.open_elements
        .iter()
        .rev()
        .filter_map(|&id| self.document.tag_name(id))
        .find_map(|tag| {
            if is_target(tag) {
                Some(true)
            } else if is_barrier(tag) {
                Some(false)
            } else {
                None
            }
        })
        .unwrap_or(false)
}

/// Check if the given tag name is "in scope" (simplified).
/// In scope means there's an element with that tag on the stack,
/// and no scope barrier element between it and the top.
fn has_element_in_scope(&self, target: &str) -> bool {
    self.stack_has_in_scope(|tag| tag == target, Self::is_default_scope_barrier)
}

/// Check if the given tag name is "in button scope".
///
/// Button scope uses all regular scope barriers plus `<button>`.
fn has_element_in_button_scope(&self, target: &str) -> bool {
    self.stack_has_in_scope(
        |tag| tag == target,
        |tag| tag == "button" || Self::is_default_scope_barrier(tag),
    )
}

/// Check if any heading element (h1-h6) is in scope.
fn has_heading_in_scope(&self) -> bool {
    self.stack_has_in_scope(
        |tag| matches!(tag, "h1" | "h2" | "h3" | "h4" | "h5" | "h6"),
        Self::is_default_scope_barrier,
    )
}

/// Close a `<p>` element: generate implied end tags (excluding p),
/// then pop until we find the `<p>`.
///
/// If no `<p>` is on the stack, only the implied end tags are popped.
fn close_p_element(&mut self) {
    self.generate_implied_end_tags(Some("p"));
    self.pop_until("p");
}

/// Generate implied end tags. If `exclude` is provided, don't generate
/// an end tag for that element.
fn generate_implied_end_tags(&mut self, exclude: Option<&str>) { loop { let should_pop = self .open_elements .last() .and_then(|&id| self.document.tag_name(id)) .map(|tag| { if let Some(excl) = exclude { if tag == excl { return false; } } matches!( tag, "dd" | "dt" | "li" | "optgroup" | "option" | "p" | "rb" | "rp" | "rt" | "rtc" ) }) .unwrap_or(false); if should_pop { self.open_elements.pop(); } else { break; } } } /// Handle a generic end tag by walking back through open elements /// using the "any other end tag" algorithm. fn handle_any_other_end_tag(&mut self, name: &str) { // Walk backwards through the stack. let mut i = self.open_elements.len(); while i > 0 { i -= 1; let id = self.open_elements[i]; if self.document.tag_name(id) == Some(name) { // Pop everything above and including this element. self.open_elements.truncate(i); return; } // If this is a "special" element, stop. if let Some(tag) = self.document.tag_name(id) { if is_special_element(tag) { return; } } } } } impl Default for TreeBuilder { fn default() -> Self { Self::new() } } /// Returns true if the tag is a "special" element per the HTML spec. 
fn is_special_element(tag: &str) -> bool { matches!( tag, "address" | "applet" | "area" | "article" | "aside" | "base" | "basefont" | "bgsound" | "blockquote" | "body" | "br" | "button" | "caption" | "center" | "col" | "colgroup" | "dd" | "details" | "dir" | "div" | "dl" | "dt" | "embed" | "fieldset" | "figcaption" | "figure" | "footer" | "form" | "frame" | "frameset" | "h1" | "h2" | "h3" | "h4" | "h5" | "h6" | "head" | "header" | "hgroup" | "hr" | "html" | "iframe" | "img" | "input" | "li" | "link" | "listing" | "main" | "marquee" | "menu" | "meta" | "nav" | "noembed" | "noframes" | "noscript" | "object" | "ol" | "p" | "param" | "plaintext" | "pre" | "script" | "section" | "select" | "source" | "style" | "summary" | "table" | "tbody" | "td" | "template" | "textarea" | "tfoot" | "th" | "thead" | "title" | "tr" | "track" | "ul" | "wbr" | "xmp" ) } /// Parse an HTML string into a DOM document. /// /// This is a convenience function that tokenizes the input and builds /// a DOM tree using the tree builder. pub fn parse_html(input: &str) -> Document { let mut builder = TreeBuilder::new(); let mut tokenizer = Tokenizer::new(input); loop { let token = tokenizer.next_token(); let is_eof = token == Token::Eof; builder.process_token(token); if is_eof { break; } } builder.finish() } #[cfg(test)] mod tests { use super::*; use we_dom::NodeData; /// Helper: collect tag names of direct children of a node. fn child_tags(doc: &Document, node: NodeId) -> Vec<String> { doc.children(node) .filter_map(|id| doc.tag_name(id).map(String::from)) .collect() } /// Helper: get the text content of all text node children, concatenated. fn text_of_children(doc: &Document, node: NodeId) -> String { let mut result = String::new(); for child in doc.children(node) { if let Some(text) = doc.text_content(child) { result.push_str(text); } } result } #[test] fn parse_full_document() { let doc = parse_html( "<!DOCTYPE html><html><head><title>Test

Hello

", ); let root = doc.root(); // Root should have one child: let html_children: Vec = doc.children(root).collect(); assert_eq!(html_children.len(), 1); let html = html_children[0]; assert_eq!(doc.tag_name(html), Some("html")); // should have and let tags = child_tags(&doc, html); assert_eq!(tags, vec!["head", "body"]); // should have let head = doc.children(html).next().unwrap(); let head_tags = child_tags(&doc, head); assert_eq!(head_tags, vec!["title"]); // <title> should contain "Test" let title = doc.children(head).next().unwrap(); assert_eq!(text_of_children(&doc, title), "Test"); // <body> should have <p> let body = doc.children(html).nth(1).unwrap(); let body_tags = child_tags(&doc, body); assert_eq!(body_tags, vec!["p"]); // <p> should contain "Hello" let p = doc.children(body).next().unwrap(); assert_eq!(text_of_children(&doc, p), "Hello"); } #[test] fn implicit_html_head_body() { // Minimal document: just <p>Hello let doc = parse_html("<p>Hello"); let root = doc.root(); let html: Vec<NodeId> = doc.children(root).collect(); assert_eq!(html.len(), 1); assert_eq!(doc.tag_name(html[0]), Some("html")); let html_tags = child_tags(&doc, html[0]); assert_eq!(html_tags, vec!["head", "body"]); let body = doc.children(html[0]).nth(1).unwrap(); let body_tags = child_tags(&doc, body); assert_eq!(body_tags, vec!["p"]); let p = doc.children(body).next().unwrap(); assert_eq!(text_of_children(&doc, p), "Hello"); } #[test] fn void_element_br() { let doc = parse_html("<p>Line 1<br>Line 2</p>"); let root = doc.root(); let html = doc.children(root).next().unwrap(); let body = doc.children(html).nth(1).unwrap(); let p = doc.children(body).next().unwrap(); // <p> should have: text("Line 1"), <br>, text("Line 2") let children: Vec<NodeId> = doc.children(p).collect(); assert_eq!(children.len(), 3); assert_eq!(doc.text_content(children[0]), Some("Line 1")); assert_eq!(doc.tag_name(children[1]), Some("br")); assert_eq!(doc.text_content(children[2]), Some("Line 2")); } #[test] fn 
p_inside_p_closes_outer() { let doc = parse_html("<p>First<p>Second"); let root = doc.root(); let html = doc.children(root).next().unwrap(); let body = doc.children(html).nth(1).unwrap(); // Should have two sibling <p> elements, not nested. let body_tags = child_tags(&doc, body); assert_eq!(body_tags, vec!["p", "p"]); let children: Vec<NodeId> = doc.children(body).collect(); assert_eq!(text_of_children(&doc, children[0]), "First"); assert_eq!(text_of_children(&doc, children[1]), "Second"); } #[test] fn nested_div_elements() { let doc = parse_html("<div><div>inner</div></div>"); let root = doc.root(); let html = doc.children(root).next().unwrap(); let body = doc.children(html).nth(1).unwrap(); let outer_div = doc.children(body).next().unwrap(); assert_eq!(doc.tag_name(outer_div), Some("div")); let inner_div = doc.children(outer_div).next().unwrap(); assert_eq!(doc.tag_name(inner_div), Some("div")); assert_eq!(text_of_children(&doc, inner_div), "inner"); } #[test] fn inline_elements_nest_properly() { let doc = parse_html("<p><span><a href=\"#\">link</a></span></p>"); let root = doc.root(); let html = doc.children(root).next().unwrap(); let body = doc.children(html).nth(1).unwrap(); let p = doc.children(body).next().unwrap(); let span = doc.children(p).next().unwrap(); assert_eq!(doc.tag_name(span), Some("span")); let a = doc.children(span).next().unwrap(); assert_eq!(doc.tag_name(a), Some("a")); assert_eq!(doc.get_attribute(a, "href"), Some("#")); assert_eq!(text_of_children(&doc, a), "link"); } #[test] fn headings() { let doc = parse_html("<h1>Title</h1><h2>Subtitle</h2><p>Body text</p>"); let root = doc.root(); let html = doc.children(root).next().unwrap(); let body = doc.children(html).nth(1).unwrap(); let tags = child_tags(&doc, body); assert_eq!(tags, vec!["h1", "h2", "p"]); } #[test] fn comment_nodes() { let doc = parse_html("<body><p>text</p></body>"); let root = doc.root(); let html = doc.children(root).next().unwrap(); let body = 
doc.children(html).nth(1).unwrap(); let children: Vec<NodeId> = doc.children(body).collect(); assert!(children.len() >= 2); // First child should be a comment. match doc.node_data(children[0]) { NodeData::Comment { data } => assert_eq!(data, " a comment "), other => panic!("expected comment, got {:?}", other), } } #[test] fn pre_element() { let doc = parse_html("<pre>line 1\nline 2</pre>"); let root = doc.root(); let html = doc.children(root).next().unwrap(); let body = doc.children(html).nth(1).unwrap(); let pre = doc.children(body).next().unwrap(); assert_eq!(doc.tag_name(pre), Some("pre")); assert_eq!(text_of_children(&doc, pre), "line 1\nline 2"); } #[test] fn attributes_preserved() { let doc = parse_html("<div id=\"main\" class=\"container\"><a href=\"/page\">link</a></div>"); let root = doc.root(); let html = doc.children(root).next().unwrap(); let body = doc.children(html).nth(1).unwrap(); let div = doc.children(body).next().unwrap(); assert_eq!(doc.get_attribute(div, "id"), Some("main")); assert_eq!(doc.get_attribute(div, "class"), Some("container")); let a = doc.children(div).next().unwrap(); assert_eq!(doc.get_attribute(a, "href"), Some("/page")); } #[test] fn empty_document() { let doc = parse_html(""); let root = doc.root(); // Even an empty doc should get html/head/body from EOF handling. // The tree builder creates implicit elements. assert!(doc.children(root).next().is_some()); } #[test] fn just_text() { let doc = parse_html("Hello, world!"); let root = doc.root(); let html = doc.children(root).next().unwrap(); let body = doc.children(html).nth(1).unwrap(); assert_eq!(text_of_children(&doc, body), "Hello, world!"); } #[test] fn heading_closes_open_p() { let doc = parse_html("<p>text<h1>heading</h1>"); let root = doc.root(); let html = doc.children(root).next().unwrap(); let body = doc.children(html).nth(1).unwrap(); // <p> should be closed by <h1>, so they're siblings. 
let tags = child_tags(&doc, body); assert_eq!(tags, vec!["p", "h1"]); } #[test] fn self_closing_void_elements() { let doc = parse_html("<p>before<br/>after</p>"); let root = doc.root(); let html = doc.children(root).next().unwrap(); let body = doc.children(html).nth(1).unwrap(); let p = doc.children(body).next().unwrap(); let children: Vec<NodeId> = doc.children(p).collect(); assert_eq!(children.len(), 3); assert_eq!(doc.tag_name(children[1]), Some("br")); } #[test] fn doctype_is_handled() { let doc = parse_html("<!DOCTYPE html><html><body></body></html>"); let root = doc.root(); let html = doc.children(root).next().unwrap(); assert_eq!(doc.tag_name(html), Some("html")); } #[test] fn tree_builder_step_by_step() { let mut builder = TreeBuilder::new(); builder.process_token(Token::Doctype { name: Some("html".into()), public_id: None, system_id: None, force_quirks: false, }); builder.process_token(Token::StartTag { name: "html".into(), attributes: vec![], self_closing: false, }); builder.process_token(Token::StartTag { name: "head".into(), attributes: vec![], self_closing: false, }); builder.process_token(Token::EndTag { name: "head".into(), }); builder.process_token(Token::StartTag { name: "body".into(), attributes: vec![], self_closing: false, }); builder.process_token(Token::StartTag { name: "p".into(), attributes: vec![], self_closing: false, }); builder.process_token(Token::Character("Hello".into())); builder.process_token(Token::EndTag { name: "p".into() }); builder.process_token(Token::EndTag { name: "body".into(), }); builder.process_token(Token::EndTag { name: "html".into(), }); builder.process_token(Token::Eof); let doc = builder.finish(); let root = doc.root(); let html = doc.children(root).next().unwrap(); assert_eq!(doc.tag_name(html), Some("html")); let body = doc.children(html).nth(1).unwrap(); let p = doc.children(body).next().unwrap(); assert_eq!(text_of_children(&doc, p), "Hello"); } #[test] fn multiple_text_children_merge() { // When consecutive 
character tokens arrive, they should merge. let mut builder = TreeBuilder::new(); builder.process_token(Token::StartTag { name: "p".into(), attributes: vec![], self_closing: false, }); builder.process_token(Token::Character("Hello ".into())); builder.process_token(Token::Character("world".into())); builder.process_token(Token::EndTag { name: "p".into() }); builder.process_token(Token::Eof); let doc = builder.finish(); let root = doc.root(); let html = doc.children(root).next().unwrap(); let body = doc.children(html).nth(1).unwrap(); let p = doc.children(body).next().unwrap(); // Should be a single text node. let children: Vec<NodeId> = doc.children(p).collect(); assert_eq!(children.len(), 1); assert_eq!(doc.text_content(children[0]), Some("Hello world")); } }