···11//! HTML5 tokenizer and tree builder.
22//!
33-//! Implements the WHATWG HTML5 tokenizer state machine (§13.2.5).
33+//! Implements the WHATWG HTML5 tokenizer state machine (§13.2.5)
44+//! and a simplified tree builder for constructing DOM trees from tokens.
4556mod entities;
67mod tokenizer;
88+mod tree_builder;
79810pub use tokenizer::Tokenizer;
1111+pub use tree_builder::{parse_html, TreeBuilder};
9121013/// A token emitted by the HTML tokenizer.
1114#[derive(Debug, Clone, PartialEq)]
+1159
crates/html/src/tree_builder.rs
···11+//! HTML tree builder: construct a DOM tree from tokenizer output.
22+//!
33+//! Implements a simplified subset of the WHATWG HTML5 tree construction
44+//! algorithm for Phase 3 of the browser engine.
55+66+use we_dom::{Document, NodeId};
77+88+use crate::{Token, Tokenizer};
/// Insertion modes for the tree builder state machine.
///
/// The current mode decides which `handle_*` method of `TreeBuilder`
/// receives the next token (see `TreeBuilder::process_token`).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum InsertionMode {
    /// Starting mode, before any DOCTYPE or element has been seen.
    Initial,
    /// Waiting for the `<html>` element (created implicitly if missing).
    BeforeHtml,
    /// Waiting for the `<head>` element (created implicitly if missing).
    BeforeHead,
    /// Inside `<head>`.
    InHead,
    /// Collecting character data for an element such as `<title>`; the
    /// previous mode is restored once that element ends.
    Text,
    /// After `<head>` has been closed, waiting for `<body>`.
    AfterHead,
    /// Inside `<body>`.
    InBody,
    /// After the `</body>` end tag.
    AfterBody,
    /// After the `</html>` end tag that followed `</body>`.
    AfterAfterBody,
}
/// Reports whether `tag` names a void element, i.e. an element that never
/// has content or an end tag.
///
/// The comparison is exact (case-sensitive) against the lowercase names.
fn is_void_element(tag: &str) -> bool {
    const VOID_ELEMENTS: &[&str] = &[
        "area", "base", "br", "col", "embed", "hr", "img", "input", "link", "meta", "param",
        "source", "track", "wbr",
    ];
    VOID_ELEMENTS.contains(&tag)
}
4444+4545+/// HTML tree builder that processes tokens and constructs a DOM tree.
4646+pub struct TreeBuilder {
4747+ document: Document,
4848+ /// Stack of open elements (the current nesting context).
4949+ open_elements: Vec<NodeId>,
5050+ head_element: Option<NodeId>,
5151+ body_element: Option<NodeId>,
5252+ insertion_mode: InsertionMode,
5353+ /// Original insertion mode, saved when switching to Text mode.
5454+ original_insertion_mode: Option<InsertionMode>,
5555+ /// Pending text for the Text insertion mode (e.g., inside `<title>`).
5656+ pending_text: String,
5757+}
5858+5959+impl TreeBuilder {
6060+ /// Create a new tree builder with an empty document.
6161+ pub fn new() -> Self {
6262+ TreeBuilder {
6363+ document: Document::new(),
6464+ open_elements: Vec::new(),
6565+ head_element: None,
6666+ body_element: None,
6767+ insertion_mode: InsertionMode::Initial,
6868+ original_insertion_mode: None,
6969+ pending_text: String::new(),
7070+ }
7171+ }
7272+7373+ /// Process a single token, updating the DOM tree.
7474+ pub fn process_token(&mut self, token: Token) {
7575+ match self.insertion_mode {
7676+ InsertionMode::Initial => self.handle_initial(token),
7777+ InsertionMode::BeforeHtml => self.handle_before_html(token),
7878+ InsertionMode::BeforeHead => self.handle_before_head(token),
7979+ InsertionMode::InHead => self.handle_in_head(token),
8080+ InsertionMode::Text => self.handle_text(token),
8181+ InsertionMode::AfterHead => self.handle_after_head(token),
8282+ InsertionMode::InBody => self.handle_in_body(token),
8383+ InsertionMode::AfterBody => self.handle_after_body(token),
8484+ InsertionMode::AfterAfterBody => self.handle_after_after_body(token),
8585+ }
8686+ }
8787+8888+ /// Finish building and return the constructed DOM document.
8989+ pub fn finish(self) -> Document {
9090+ self.document
9191+ }
9292+9393+ // --- Insertion mode handlers ---
9494+9595+ fn handle_initial(&mut self, token: Token) {
9696+ match token {
9797+ Token::Doctype { .. } => {
9898+ // For Phase 3, we just acknowledge the DOCTYPE and move on.
9999+ self.insertion_mode = InsertionMode::BeforeHtml;
100100+ }
101101+ Token::Comment(data) => {
102102+ let comment = self.document.create_comment(&data);
103103+ let root = self.document.root();
104104+ self.document.append_child(root, comment);
105105+ }
106106+ Token::Character(ref s) if s.chars().all(|c| c.is_ascii_whitespace()) => {
107107+ // Ignore whitespace in Initial mode.
108108+ }
109109+ _ => {
110110+ // Anything else: switch to BeforeHtml and reprocess.
111111+ self.insertion_mode = InsertionMode::BeforeHtml;
112112+ self.handle_before_html(token);
113113+ }
114114+ }
115115+ }
116116+117117+ fn handle_before_html(&mut self, token: Token) {
118118+ match token {
119119+ Token::Doctype { .. } => { /* ignore */ }
120120+ Token::Comment(data) => {
121121+ let comment = self.document.create_comment(&data);
122122+ let root = self.document.root();
123123+ self.document.append_child(root, comment);
124124+ }
125125+ Token::Character(ref s) if s.chars().all(|c| c.is_ascii_whitespace()) => {
126126+ // Ignore whitespace.
127127+ }
128128+ Token::StartTag { ref name, .. } if name == "html" => {
129129+ let html = self.create_element_from_token(&token);
130130+ let root = self.document.root();
131131+ self.document.append_child(root, html);
132132+ self.open_elements.push(html);
133133+ self.insertion_mode = InsertionMode::BeforeHead;
134134+ }
135135+ Token::EndTag { ref name }
136136+ if name != "head" && name != "body" && name != "html" && name != "br" =>
137137+ {
138138+ // Parse error, ignore.
139139+ }
140140+ _ => {
141141+ // Create an implicit <html> element.
142142+ let html = self.document.create_element("html");
143143+ let root = self.document.root();
144144+ self.document.append_child(root, html);
145145+ self.open_elements.push(html);
146146+ self.insertion_mode = InsertionMode::BeforeHead;
147147+ self.handle_before_head(token);
148148+ }
149149+ }
150150+ }
151151+152152+ fn handle_before_head(&mut self, token: Token) {
153153+ match token {
154154+ Token::Character(ref s) if s.chars().all(|c| c.is_ascii_whitespace()) => {
155155+ // Ignore whitespace.
156156+ }
157157+ Token::Comment(data) => {
158158+ self.insert_comment(&data);
159159+ }
160160+ Token::Doctype { .. } => { /* ignore */ }
161161+ Token::StartTag { ref name, .. } if name == "html" => {
162162+ // Process as if InBody.
163163+ self.handle_in_body(token);
164164+ }
165165+ Token::StartTag { ref name, .. } if name == "head" => {
166166+ let head = self.create_element_from_token(&token);
167167+ self.insert_node(head);
168168+ self.open_elements.push(head);
169169+ self.head_element = Some(head);
170170+ self.insertion_mode = InsertionMode::InHead;
171171+ }
172172+ Token::EndTag { ref name }
173173+ if name != "head" && name != "body" && name != "html" && name != "br" =>
174174+ {
175175+ // Parse error, ignore.
176176+ }
177177+ _ => {
178178+ // Implied <head>.
179179+ let head = self.document.create_element("head");
180180+ self.insert_node(head);
181181+ self.open_elements.push(head);
182182+ self.head_element = Some(head);
183183+ self.insertion_mode = InsertionMode::InHead;
184184+ self.handle_in_head(token);
185185+ }
186186+ }
187187+ }
188188+189189+ fn handle_in_head(&mut self, token: Token) {
190190+ match token {
191191+ Token::Character(ref s) if s.chars().all(|c| c.is_ascii_whitespace()) => {
192192+ self.insert_text(s);
193193+ }
194194+ Token::Comment(data) => {
195195+ self.insert_comment(&data);
196196+ }
197197+ Token::Doctype { .. } => { /* ignore */ }
198198+ Token::StartTag { ref name, .. } if name == "title" => {
199199+ let elem = self.create_element_from_token(&token);
200200+ self.insert_node(elem);
201201+ self.open_elements.push(elem);
202202+ self.original_insertion_mode = Some(self.insertion_mode);
203203+ self.insertion_mode = InsertionMode::Text;
204204+ }
205205+ Token::StartTag { ref name, .. }
206206+ if name == "style" || name == "script" || name == "noscript" =>
207207+ {
208208+ let elem = self.create_element_from_token(&token);
209209+ self.insert_node(elem);
210210+ self.open_elements.push(elem);
211211+ self.original_insertion_mode = Some(self.insertion_mode);
212212+ self.insertion_mode = InsertionMode::Text;
213213+ }
214214+ Token::StartTag { ref name, .. } if name == "meta" || name == "link" => {
215215+ let elem = self.create_element_from_token(&token);
216216+ self.insert_node(elem);
217217+ // Void elements: don't push onto stack.
218218+ }
219219+ Token::StartTag { ref name, .. } if name == "head" => {
220220+ // Ignore duplicate <head>.
221221+ }
222222+ Token::EndTag { ref name } if name == "head" => {
223223+ self.pop_until("head");
224224+ self.insertion_mode = InsertionMode::AfterHead;
225225+ }
226226+ Token::EndTag { ref name } if name != "body" && name != "html" && name != "br" => {
227227+ // Parse error, ignore.
228228+ }
229229+ _ => {
230230+ // Pop <head> and switch to AfterHead, then reprocess.
231231+ self.pop_until("head");
232232+ self.insertion_mode = InsertionMode::AfterHead;
233233+ self.handle_after_head(token);
234234+ }
235235+ }
236236+ }
237237+238238+ fn handle_text(&mut self, token: Token) {
239239+ match token {
240240+ Token::Character(s) => {
241241+ self.pending_text.push_str(&s);
242242+ }
243243+ Token::EndTag { .. } => {
244244+ // Flush pending text.
245245+ if !self.pending_text.is_empty() {
246246+ let text = self.pending_text.clone();
247247+ self.pending_text.clear();
248248+ self.insert_text(&text);
249249+ }
250250+ // Pop the element (e.g., <title>).
251251+ self.open_elements.pop();
252252+ self.insertion_mode = self
253253+ .original_insertion_mode
254254+ .unwrap_or(InsertionMode::InBody);
255255+ self.original_insertion_mode = None;
256256+ }
257257+ Token::Eof => {
258258+ // Flush pending text.
259259+ if !self.pending_text.is_empty() {
260260+ let text = self.pending_text.clone();
261261+ self.pending_text.clear();
262262+ self.insert_text(&text);
263263+ }
264264+ self.open_elements.pop();
265265+ self.insertion_mode = self
266266+ .original_insertion_mode
267267+ .unwrap_or(InsertionMode::InBody);
268268+ self.original_insertion_mode = None;
269269+ self.process_token(Token::Eof);
270270+ }
271271+ _ => {}
272272+ }
273273+ }
274274+275275+ fn handle_after_head(&mut self, token: Token) {
276276+ match token {
277277+ Token::Character(ref s) if s.chars().all(|c| c.is_ascii_whitespace()) => {
278278+ self.insert_text(s);
279279+ }
280280+ Token::Comment(data) => {
281281+ self.insert_comment(&data);
282282+ }
283283+ Token::Doctype { .. } => { /* ignore */ }
284284+ Token::StartTag { ref name, .. } if name == "html" => {
285285+ self.handle_in_body(token);
286286+ }
287287+ Token::StartTag { ref name, .. } if name == "body" => {
288288+ let body = self.create_element_from_token(&token);
289289+ self.insert_node(body);
290290+ self.open_elements.push(body);
291291+ self.body_element = Some(body);
292292+ self.insertion_mode = InsertionMode::InBody;
293293+ }
294294+ Token::StartTag { ref name, .. } if name == "head" => {
295295+ // Ignore.
296296+ }
297297+ Token::EndTag { ref name } if name != "body" && name != "html" && name != "br" => {
298298+ // Ignore.
299299+ }
300300+ _ => {
301301+ // Implied <body>.
302302+ let body = self.document.create_element("body");
303303+ self.insert_node(body);
304304+ self.open_elements.push(body);
305305+ self.body_element = Some(body);
306306+ self.insertion_mode = InsertionMode::InBody;
307307+ self.handle_in_body(token);
308308+ }
309309+ }
310310+ }
311311+312312+ fn handle_in_body(&mut self, token: Token) {
313313+ match token {
314314+ Token::Character(s) => {
315315+ self.insert_text(&s);
316316+ }
317317+ Token::Comment(data) => {
318318+ self.insert_comment(&data);
319319+ }
320320+ Token::Doctype { .. } => { /* ignore */ }
321321+ Token::StartTag { ref name, .. } if name == "html" => {
322322+ // Merge attributes onto existing <html> element.
323323+ if let Token::StartTag { attributes, .. } = &token {
324324+ if let Some(&html_id) = self.open_elements.first() {
325325+ for (attr_name, attr_value) in attributes {
326326+ if self.document.get_attribute(html_id, attr_name).is_none() {
327327+ self.document.set_attribute(html_id, attr_name, attr_value);
328328+ }
329329+ }
330330+ }
331331+ }
332332+ }
333333+ Token::StartTag { ref name, .. }
334334+ if name == "body"
335335+ || name == "head"
336336+ || name == "title"
337337+ || name == "style"
338338+ || name == "script" =>
339339+ {
340340+ match name.as_str() {
341341+ "body" => {
342342+ // Ignore duplicate <body>.
343343+ }
344344+ "head" => {
345345+ // Ignore <head> in body.
346346+ }
347347+ _ => {
348348+ // title/style/script: process using InHead rules
349349+ self.handle_in_head(token);
350350+ }
351351+ }
352352+ }
353353+ Token::StartTag { ref name, .. }
354354+ if name == "p"
355355+ || name == "div"
356356+ || name == "h1"
357357+ || name == "h2"
358358+ || name == "h3"
359359+ || name == "h4"
360360+ || name == "h5"
361361+ || name == "h6"
362362+ || name == "pre"
363363+ || name == "blockquote"
364364+ || name == "ul"
365365+ || name == "ol"
366366+ || name == "li" =>
367367+ {
368368+ // If there's a <p> in button scope, close it first.
369369+ if self.has_element_in_button_scope("p") {
370370+ self.close_p_element();
371371+ }
372372+ let elem = self.create_element_from_token(&token);
373373+ self.insert_node(elem);
374374+ self.open_elements.push(elem);
375375+ }
376376+ Token::StartTag { ref name, .. } if is_void_element(name) => {
377377+ let elem = self.create_element_from_token(&token);
378378+ self.insert_node(elem);
379379+ // Don't push void elements onto the stack.
380380+ }
381381+ Token::StartTag { .. } => {
382382+ // Generic start tag: create element and push onto stack.
383383+ let elem = self.create_element_from_token(&token);
384384+ self.insert_node(elem);
385385+ self.open_elements.push(elem);
386386+ }
387387+ Token::EndTag { ref name } if name == "body" => {
388388+ if self.has_element_in_scope("body") {
389389+ self.insertion_mode = InsertionMode::AfterBody;
390390+ }
391391+ }
392392+ Token::EndTag { ref name } if name == "html" => {
393393+ if self.has_element_in_scope("body") {
394394+ self.insertion_mode = InsertionMode::AfterBody;
395395+ self.handle_after_body(token);
396396+ }
397397+ }
398398+ Token::EndTag { ref name } if name == "p" => {
399399+ if !self.has_element_in_button_scope("p") {
400400+ // No matching <p>: insert an empty one, then close it.
401401+ let p = self.document.create_element("p");
402402+ self.insert_node(p);
403403+ self.open_elements.push(p);
404404+ }
405405+ self.close_p_element();
406406+ }
407407+ Token::EndTag { ref name }
408408+ if name == "div"
409409+ || name == "pre"
410410+ || name == "blockquote"
411411+ || name == "ul"
412412+ || name == "ol"
413413+ || name == "li" =>
414414+ {
415415+ if self.has_element_in_scope(name) {
416416+ self.generate_implied_end_tags(Some(name));
417417+ self.pop_until(name);
418418+ }
419419+ }
420420+ Token::EndTag { ref name }
421421+ if name == "h1"
422422+ || name == "h2"
423423+ || name == "h3"
424424+ || name == "h4"
425425+ || name == "h5"
426426+ || name == "h6" =>
427427+ {
428428+ if self.has_heading_in_scope() {
429429+ self.generate_implied_end_tags(None);
430430+ // Pop until we find a heading element.
431431+ while let Some(id) = self.open_elements.pop() {
432432+ if let Some(tag) = self.document.tag_name(id) {
433433+ if matches!(tag, "h1" | "h2" | "h3" | "h4" | "h5" | "h6") {
434434+ break;
435435+ }
436436+ }
437437+ }
438438+ }
439439+ }
440440+ Token::EndTag { ref name } => {
441441+ // Generic end tag: walk back through open elements.
442442+ self.handle_any_other_end_tag(name);
443443+ }
444444+ Token::Eof => {
445445+ // Stop parsing.
446446+ }
447447+ }
448448+ }
449449+450450+ fn handle_after_body(&mut self, token: Token) {
451451+ match token {
452452+ Token::Character(ref s) if s.chars().all(|c| c.is_ascii_whitespace()) => {
453453+ // Process whitespace as in InBody.
454454+ self.handle_in_body(token);
455455+ }
456456+ Token::Comment(data) => {
457457+ // Insert as last child of the first element (html).
458458+ let comment = self.document.create_comment(&data);
459459+ if let Some(&html) = self.open_elements.first() {
460460+ self.document.append_child(html, comment);
461461+ }
462462+ }
463463+ Token::Doctype { .. } => { /* ignore */ }
464464+ Token::EndTag { ref name } if name == "html" => {
465465+ self.insertion_mode = InsertionMode::AfterAfterBody;
466466+ }
467467+ Token::Eof => {
468468+ // Stop parsing.
469469+ }
470470+ _ => {
471471+ // Anything else: switch back to InBody and reprocess.
472472+ self.insertion_mode = InsertionMode::InBody;
473473+ self.handle_in_body(token);
474474+ }
475475+ }
476476+ }
477477+478478+ fn handle_after_after_body(&mut self, token: Token) {
479479+ match token {
480480+ Token::Comment(data) => {
481481+ let comment = self.document.create_comment(&data);
482482+ let root = self.document.root();
483483+ self.document.append_child(root, comment);
484484+ }
485485+ Token::Doctype { .. } => { /* ignore */ }
486486+ Token::Character(ref s) if s.chars().all(|c| c.is_ascii_whitespace()) => {
487487+ self.handle_in_body(token);
488488+ }
489489+ Token::Eof => {
490490+ // Stop.
491491+ }
492492+ _ => {
493493+ self.insertion_mode = InsertionMode::InBody;
494494+ self.handle_in_body(token);
495495+ }
496496+ }
497497+ }
498498+499499+ // --- Helper methods ---
500500+501501+ /// Create a DOM element from a StartTag token, setting attributes.
502502+ fn create_element_from_token(&mut self, token: &Token) -> NodeId {
503503+ if let Token::StartTag {
504504+ name, attributes, ..
505505+ } = token
506506+ {
507507+ let id = self.document.create_element(name);
508508+ for (attr_name, attr_value) in attributes {
509509+ self.document.set_attribute(id, attr_name, attr_value);
510510+ }
511511+ id
512512+ } else {
513513+ // Should only be called with StartTag tokens.
514514+ self.document.create_element("unknown")
515515+ }
516516+ }
517517+518518+ /// Insert a node at the current insertion point (last open element).
519519+ fn insert_node(&mut self, node: NodeId) {
520520+ let parent = self
521521+ .open_elements
522522+ .last()
523523+ .copied()
524524+ .unwrap_or_else(|| self.document.root());
525525+ self.document.append_child(parent, node);
526526+ }
527527+528528+ /// Insert a text node at the current insertion point.
529529+ /// If the last child is already a text node, append to it.
530530+ fn insert_text(&mut self, data: &str) {
531531+ let parent = self
532532+ .open_elements
533533+ .last()
534534+ .copied()
535535+ .unwrap_or_else(|| self.document.root());
536536+537537+ // Try to merge with existing text node.
538538+ if let Some(last_child) = self.document.last_child(parent) {
539539+ if let we_dom::NodeData::Text { data: ref existing } =
540540+ *self.document.node_data(last_child)
541541+ {
542542+ let mut merged = existing.clone();
543543+ merged.push_str(data);
544544+ self.document.set_text_content(last_child, &merged);
545545+ return;
546546+ }
547547+ }
548548+549549+ let text = self.document.create_text(data);
550550+ self.document.append_child(parent, text);
551551+ }
552552+553553+ /// Insert a comment node at the current insertion point.
554554+ fn insert_comment(&mut self, data: &str) {
555555+ let comment = self.document.create_comment(data);
556556+ self.insert_node(comment);
557557+ }
558558+559559+ /// Pop elements from the stack until we find one with the given tag name.
560560+ /// The matching element is also popped.
561561+ fn pop_until(&mut self, tag_name: &str) {
562562+ while let Some(id) = self.open_elements.pop() {
563563+ if self.document.tag_name(id) == Some(tag_name) {
564564+ return;
565565+ }
566566+ }
567567+ }
568568+569569+ /// Check if the given tag name is "in scope" (simplified).
570570+ /// In scope means there's an element with that tag on the stack,
571571+ /// and no scope barrier element between it and the top.
572572+ fn has_element_in_scope(&self, target: &str) -> bool {
573573+ for &id in self.open_elements.iter().rev() {
574574+ if let Some(tag) = self.document.tag_name(id) {
575575+ if tag == target {
576576+ return true;
577577+ }
578578+ // Scope barrier elements.
579579+ if matches!(
580580+ tag,
581581+ "applet"
582582+ | "caption"
583583+ | "html"
584584+ | "table"
585585+ | "td"
586586+ | "th"
587587+ | "marquee"
588588+ | "object"
589589+ | "template"
590590+ ) {
591591+ return false;
592592+ }
593593+ }
594594+ }
595595+ false
596596+ }
597597+598598+ /// Check if the given tag name is "in button scope".
599599+ fn has_element_in_button_scope(&self, target: &str) -> bool {
600600+ for &id in self.open_elements.iter().rev() {
601601+ if let Some(tag) = self.document.tag_name(id) {
602602+ if tag == target {
603603+ return true;
604604+ }
605605+ // Button scope includes all regular scope barriers plus <button>.
606606+ if matches!(
607607+ tag,
608608+ "applet"
609609+ | "button"
610610+ | "caption"
611611+ | "html"
612612+ | "table"
613613+ | "td"
614614+ | "th"
615615+ | "marquee"
616616+ | "object"
617617+ | "template"
618618+ ) {
619619+ return false;
620620+ }
621621+ }
622622+ }
623623+ false
624624+ }
625625+626626+ /// Check if any heading element (h1-h6) is in scope.
627627+ fn has_heading_in_scope(&self) -> bool {
628628+ for &id in self.open_elements.iter().rev() {
629629+ if let Some(tag) = self.document.tag_name(id) {
630630+ if matches!(tag, "h1" | "h2" | "h3" | "h4" | "h5" | "h6") {
631631+ return true;
632632+ }
633633+ if matches!(
634634+ tag,
635635+ "applet"
636636+ | "caption"
637637+ | "html"
638638+ | "table"
639639+ | "td"
640640+ | "th"
641641+ | "marquee"
642642+ | "object"
643643+ | "template"
644644+ ) {
645645+ return false;
646646+ }
647647+ }
648648+ }
649649+ false
650650+ }
651651+652652+ /// Close a `<p>` element: generate implied end tags (excluding p),
653653+ /// then pop until we find the `<p>`.
654654+ fn close_p_element(&mut self) {
655655+ self.generate_implied_end_tags(Some("p"));
656656+ self.pop_until("p");
657657+ }
658658+659659+ /// Generate implied end tags. If `exclude` is provided, don't generate
660660+ /// an end tag for that element.
661661+ fn generate_implied_end_tags(&mut self, exclude: Option<&str>) {
662662+ loop {
663663+ let should_pop = self
664664+ .open_elements
665665+ .last()
666666+ .and_then(|&id| self.document.tag_name(id))
667667+ .map(|tag| {
668668+ if let Some(excl) = exclude {
669669+ if tag == excl {
670670+ return false;
671671+ }
672672+ }
673673+ matches!(
674674+ tag,
675675+ "dd" | "dt"
676676+ | "li"
677677+ | "optgroup"
678678+ | "option"
679679+ | "p"
680680+ | "rb"
681681+ | "rp"
682682+ | "rt"
683683+ | "rtc"
684684+ )
685685+ })
686686+ .unwrap_or(false);
687687+ if should_pop {
688688+ self.open_elements.pop();
689689+ } else {
690690+ break;
691691+ }
692692+ }
693693+ }
694694+695695+ /// Handle a generic end tag by walking back through open elements
696696+ /// using the "any other end tag" algorithm.
697697+ fn handle_any_other_end_tag(&mut self, name: &str) {
698698+ // Walk backwards through the stack.
699699+ let mut i = self.open_elements.len();
700700+ while i > 0 {
701701+ i -= 1;
702702+ let id = self.open_elements[i];
703703+ if self.document.tag_name(id) == Some(name) {
704704+ // Pop everything above and including this element.
705705+ self.open_elements.truncate(i);
706706+ return;
707707+ }
708708+ // If this is a "special" element, stop.
709709+ if let Some(tag) = self.document.tag_name(id) {
710710+ if is_special_element(tag) {
711711+ return;
712712+ }
713713+ }
714714+ }
715715+ }
716716+}
717717+718718+impl Default for TreeBuilder {
719719+ fn default() -> Self {
720720+ Self::new()
721721+ }
722722+}
723723+724724+/// Returns true if the tag is a "special" element per the HTML spec.
725725+fn is_special_element(tag: &str) -> bool {
726726+ matches!(
727727+ tag,
728728+ "address"
729729+ | "applet"
730730+ | "area"
731731+ | "article"
732732+ | "aside"
733733+ | "base"
734734+ | "basefont"
735735+ | "bgsound"
736736+ | "blockquote"
737737+ | "body"
738738+ | "br"
739739+ | "button"
740740+ | "caption"
741741+ | "center"
742742+ | "col"
743743+ | "colgroup"
744744+ | "dd"
745745+ | "details"
746746+ | "dir"
747747+ | "div"
748748+ | "dl"
749749+ | "dt"
750750+ | "embed"
751751+ | "fieldset"
752752+ | "figcaption"
753753+ | "figure"
754754+ | "footer"
755755+ | "form"
756756+ | "frame"
757757+ | "frameset"
758758+ | "h1"
759759+ | "h2"
760760+ | "h3"
761761+ | "h4"
762762+ | "h5"
763763+ | "h6"
764764+ | "head"
765765+ | "header"
766766+ | "hgroup"
767767+ | "hr"
768768+ | "html"
769769+ | "iframe"
770770+ | "img"
771771+ | "input"
772772+ | "li"
773773+ | "link"
774774+ | "listing"
775775+ | "main"
776776+ | "marquee"
777777+ | "menu"
778778+ | "meta"
779779+ | "nav"
780780+ | "noembed"
781781+ | "noframes"
782782+ | "noscript"
783783+ | "object"
784784+ | "ol"
785785+ | "p"
786786+ | "param"
787787+ | "plaintext"
788788+ | "pre"
789789+ | "script"
790790+ | "section"
791791+ | "select"
792792+ | "source"
793793+ | "style"
794794+ | "summary"
795795+ | "table"
796796+ | "tbody"
797797+ | "td"
798798+ | "template"
799799+ | "textarea"
800800+ | "tfoot"
801801+ | "th"
802802+ | "thead"
803803+ | "title"
804804+ | "tr"
805805+ | "track"
806806+ | "ul"
807807+ | "wbr"
808808+ | "xmp"
809809+ )
810810+}
811811+812812+/// Parse an HTML string into a DOM document.
813813+///
814814+/// This is a convenience function that tokenizes the input and builds
815815+/// a DOM tree using the tree builder.
816816+pub fn parse_html(input: &str) -> Document {
817817+ let mut builder = TreeBuilder::new();
818818+ let mut tokenizer = Tokenizer::new(input);
819819+ loop {
820820+ let token = tokenizer.next_token();
821821+ let is_eof = token == Token::Eof;
822822+ builder.process_token(token);
823823+ if is_eof {
824824+ break;
825825+ }
826826+ }
827827+ builder.finish()
828828+}
829829+830830+#[cfg(test)]
831831+mod tests {
832832+ use super::*;
833833+ use we_dom::NodeData;
834834+835835+ /// Helper: collect tag names of direct children of a node.
836836+ fn child_tags(doc: &Document, node: NodeId) -> Vec<String> {
837837+ doc.children(node)
838838+ .filter_map(|id| doc.tag_name(id).map(String::from))
839839+ .collect()
840840+ }
841841+842842+ /// Helper: get the text content of all text node children, concatenated.
843843+ fn text_of_children(doc: &Document, node: NodeId) -> String {
844844+ let mut result = String::new();
845845+ for child in doc.children(node) {
846846+ if let Some(text) = doc.text_content(child) {
847847+ result.push_str(text);
848848+ }
849849+ }
850850+ result
851851+ }
852852+853853+ #[test]
854854+ fn parse_full_document() {
855855+ let doc = parse_html(
856856+ "<!DOCTYPE html><html><head><title>Test</title></head><body><p>Hello</p></body></html>",
857857+ );
858858+ let root = doc.root();
859859+860860+ // Root should have one child: <html>
861861+ let html_children: Vec<NodeId> = doc.children(root).collect();
862862+ assert_eq!(html_children.len(), 1);
863863+ let html = html_children[0];
864864+ assert_eq!(doc.tag_name(html), Some("html"));
865865+866866+ // <html> should have <head> and <body>
867867+ let tags = child_tags(&doc, html);
868868+ assert_eq!(tags, vec!["head", "body"]);
869869+870870+ // <head> should have <title>
871871+ let head = doc.children(html).next().unwrap();
872872+ let head_tags = child_tags(&doc, head);
873873+ assert_eq!(head_tags, vec!["title"]);
874874+875875+ // <title> should contain "Test"
876876+ let title = doc.children(head).next().unwrap();
877877+ assert_eq!(text_of_children(&doc, title), "Test");
878878+879879+ // <body> should have <p>
880880+ let body = doc.children(html).nth(1).unwrap();
881881+ let body_tags = child_tags(&doc, body);
882882+ assert_eq!(body_tags, vec!["p"]);
883883+884884+ // <p> should contain "Hello"
885885+ let p = doc.children(body).next().unwrap();
886886+ assert_eq!(text_of_children(&doc, p), "Hello");
887887+ }
888888+889889+ #[test]
890890+ fn implicit_html_head_body() {
891891+ // Minimal document: just <p>Hello
892892+ let doc = parse_html("<p>Hello");
893893+ let root = doc.root();
894894+895895+ let html: Vec<NodeId> = doc.children(root).collect();
896896+ assert_eq!(html.len(), 1);
897897+ assert_eq!(doc.tag_name(html[0]), Some("html"));
898898+899899+ let html_tags = child_tags(&doc, html[0]);
900900+ assert_eq!(html_tags, vec!["head", "body"]);
901901+902902+ let body = doc.children(html[0]).nth(1).unwrap();
903903+ let body_tags = child_tags(&doc, body);
904904+ assert_eq!(body_tags, vec!["p"]);
905905+906906+ let p = doc.children(body).next().unwrap();
907907+ assert_eq!(text_of_children(&doc, p), "Hello");
908908+ }
909909+910910+ #[test]
911911+ fn void_element_br() {
912912+ let doc = parse_html("<p>Line 1<br>Line 2</p>");
913913+ let root = doc.root();
914914+ let html = doc.children(root).next().unwrap();
915915+ let body = doc.children(html).nth(1).unwrap();
916916+ let p = doc.children(body).next().unwrap();
917917+918918+ // <p> should have: text("Line 1"), <br>, text("Line 2")
919919+ let children: Vec<NodeId> = doc.children(p).collect();
920920+ assert_eq!(children.len(), 3);
921921+ assert_eq!(doc.text_content(children[0]), Some("Line 1"));
922922+ assert_eq!(doc.tag_name(children[1]), Some("br"));
923923+ assert_eq!(doc.text_content(children[2]), Some("Line 2"));
924924+ }
925925+926926+ #[test]
927927+ fn p_inside_p_closes_outer() {
928928+ let doc = parse_html("<p>First<p>Second");
929929+ let root = doc.root();
930930+ let html = doc.children(root).next().unwrap();
931931+ let body = doc.children(html).nth(1).unwrap();
932932+933933+ // Should have two sibling <p> elements, not nested.
934934+ let body_tags = child_tags(&doc, body);
935935+ assert_eq!(body_tags, vec!["p", "p"]);
936936+937937+ let children: Vec<NodeId> = doc.children(body).collect();
938938+ assert_eq!(text_of_children(&doc, children[0]), "First");
939939+ assert_eq!(text_of_children(&doc, children[1]), "Second");
940940+ }
941941+942942+ #[test]
943943+ fn nested_div_elements() {
944944+ let doc = parse_html("<div><div>inner</div></div>");
945945+ let root = doc.root();
946946+ let html = doc.children(root).next().unwrap();
947947+ let body = doc.children(html).nth(1).unwrap();
948948+949949+ let outer_div = doc.children(body).next().unwrap();
950950+ assert_eq!(doc.tag_name(outer_div), Some("div"));
951951+952952+ let inner_div = doc.children(outer_div).next().unwrap();
953953+ assert_eq!(doc.tag_name(inner_div), Some("div"));
954954+ assert_eq!(text_of_children(&doc, inner_div), "inner");
955955+ }
956956+957957+ #[test]
958958+ fn inline_elements_nest_properly() {
959959+ let doc = parse_html("<p><span><a href=\"#\">link</a></span></p>");
960960+ let root = doc.root();
961961+ let html = doc.children(root).next().unwrap();
962962+ let body = doc.children(html).nth(1).unwrap();
963963+964964+ let p = doc.children(body).next().unwrap();
965965+ let span = doc.children(p).next().unwrap();
966966+ assert_eq!(doc.tag_name(span), Some("span"));
967967+968968+ let a = doc.children(span).next().unwrap();
969969+ assert_eq!(doc.tag_name(a), Some("a"));
970970+ assert_eq!(doc.get_attribute(a, "href"), Some("#"));
971971+ assert_eq!(text_of_children(&doc, a), "link");
972972+ }
973973+974974+ #[test]
975975+ fn headings() {
976976+ let doc = parse_html("<h1>Title</h1><h2>Subtitle</h2><p>Body text</p>");
977977+ let root = doc.root();
978978+ let html = doc.children(root).next().unwrap();
979979+ let body = doc.children(html).nth(1).unwrap();
980980+981981+ let tags = child_tags(&doc, body);
982982+ assert_eq!(tags, vec!["h1", "h2", "p"]);
983983+ }
984984+985985+ #[test]
986986+ fn comment_nodes() {
987987+ let doc = parse_html("<body><!-- a comment --><p>text</p></body>");
988988+ let root = doc.root();
989989+ let html = doc.children(root).next().unwrap();
990990+ let body = doc.children(html).nth(1).unwrap();
991991+992992+ let children: Vec<NodeId> = doc.children(body).collect();
993993+ assert!(children.len() >= 2);
994994+995995+ // First child should be a comment.
996996+ match doc.node_data(children[0]) {
997997+ NodeData::Comment { data } => assert_eq!(data, " a comment "),
998998+ other => panic!("expected comment, got {:?}", other),
999999+ }
10001000+ }
/// Text inside `<pre>` keeps its embedded newline.
#[test]
fn pre_element() {
    let doc = parse_html("<pre>line 1\nline 2</pre>");
    let html = doc.children(doc.root()).next().unwrap();
    // The second child of `<html>` is taken to be `<body>`.
    let body = doc.children(html).nth(1).unwrap();
    let pre = doc.children(body).next().unwrap();

    assert_eq!(doc.tag_name(pre), Some("pre"));
    assert_eq!(text_of_children(&doc, pre), "line 1\nline 2");
}
/// Attributes on both an outer element and a nested element are readable
/// from the resulting tree with their original values.
#[test]
fn attributes_preserved() {
    let markup = "<div id=\"main\" class=\"container\"><a href=\"/page\">link</a></div>";
    let doc = parse_html(markup);
    let html = doc.children(doc.root()).next().unwrap();
    // The second child of `<html>` is taken to be `<body>`.
    let body = doc.children(html).nth(1).unwrap();

    let container = doc.children(body).next().unwrap();
    let link = doc.children(container).next().unwrap();

    assert_eq!(doc.get_attribute(container, "id"), Some("main"));
    assert_eq!(doc.get_attribute(container, "class"), Some("container"));
    assert_eq!(doc.get_attribute(link, "href"), Some("/page"));
}
/// Parsing an empty string still yields a document whose root has at
/// least one child.
#[test]
fn empty_document() {
    let doc = parse_html("");
    // The intent is that EOF handling creates the implicit html/head/body
    // elements; this test only checks that the root is not left childless.
    let first_child = doc.children(doc.root()).next();
    assert!(first_child.is_some());
}
/// Bare text with no markup ends up as the text content of the body.
#[test]
fn just_text() {
    let doc = parse_html("Hello, world!");
    let html = doc.children(doc.root()).next().unwrap();
    // The second child of `<html>` is taken to be `<body>`.
    let body = doc.children(html).nth(1).unwrap();

    assert_eq!(text_of_children(&doc, body), "Hello, world!");
}
/// An `<h1>` start tag arriving while a `<p>` is open closes that `<p>`.
#[test]
fn heading_closes_open_p() {
    let doc = parse_html("<p>text<h1>heading</h1>");
    let html = doc.children(doc.root()).next().unwrap();
    // The second child of `<html>` is taken to be `<body>`.
    let body = doc.children(html).nth(1).unwrap();

    // With the paragraph closed, the heading is its sibling rather than
    // its child.
    assert_eq!(child_tags(&doc, body), vec!["p", "h1"]);
}
/// A self-closed `<br/>` sits between the text before and after it, and
/// does not swallow the trailing text as a child.
#[test]
fn self_closing_void_elements() {
    let doc = parse_html("<p>before<br/>after</p>");
    let html = doc.children(doc.root()).next().unwrap();
    // The second child of `<html>` is taken to be `<body>`.
    let body = doc.children(html).nth(1).unwrap();
    let paragraph = doc.children(body).next().unwrap();

    // Expected layout: text, <br>, text.
    let parts: Vec<NodeId> = doc.children(paragraph).collect();
    assert_eq!(parts.len(), 3);
    assert_eq!(doc.tag_name(parts[1]), Some("br"));
}
/// A leading DOCTYPE does not stop `<html>` from being the first child of
/// the document root.
#[test]
fn doctype_is_handled() {
    let doc = parse_html("<!DOCTYPE html><html><body></body></html>");
    let first_child = doc.children(doc.root()).next().unwrap();
    assert_eq!(doc.tag_name(first_child), Some("html"));
}
/// Drives `TreeBuilder` directly with a hand-written token stream for a
/// complete document and checks the resulting html/body/p structure.
#[test]
fn tree_builder_step_by_step() {
    // Shorthand for attribute-less, non-self-closing start tags and for end tags.
    let open = |tag: &'static str| Token::StartTag {
        name: tag.into(),
        attributes: vec![],
        self_closing: false,
    };
    let close = |tag: &'static str| Token::EndTag { name: tag.into() };

    // Token stream for:
    // <!DOCTYPE html><html><head></head><body><p>Hello</p></body></html>
    let tokens = vec![
        Token::Doctype {
            name: Some("html".into()),
            public_id: None,
            system_id: None,
            force_quirks: false,
        },
        open("html"),
        open("head"),
        close("head"),
        open("body"),
        open("p"),
        Token::Character("Hello".into()),
        close("p"),
        close("body"),
        close("html"),
        Token::Eof,
    ];

    let mut builder = TreeBuilder::new();
    for token in tokens {
        builder.process_token(token);
    }
    let doc = builder.finish();

    let html = doc.children(doc.root()).next().unwrap();
    assert_eq!(doc.tag_name(html), Some("html"));

    // The second child of `<html>` is taken to be `<body>`.
    let body = doc.children(html).nth(1).unwrap();
    let paragraph = doc.children(body).next().unwrap();
    assert_eq!(text_of_children(&doc, paragraph), "Hello");
}
/// Two character tokens delivered back to back inside a `<p>` are merged
/// into a single text node.
#[test]
fn multiple_text_children_merge() {
    // No html/head/body tokens are supplied; the stream starts at `<p>`.
    let tokens = vec![
        Token::StartTag {
            name: "p".into(),
            attributes: vec![],
            self_closing: false,
        },
        Token::Character("Hello ".into()),
        Token::Character("world".into()),
        Token::EndTag { name: "p".into() },
        Token::Eof,
    ];

    let mut builder = TreeBuilder::new();
    for token in tokens {
        builder.process_token(token);
    }
    let doc = builder.finish();

    let html = doc.children(doc.root()).next().unwrap();
    // The second child of `<html>` is taken to be `<body>`.
    let body = doc.children(html).nth(1).unwrap();
    let paragraph = doc.children(body).next().unwrap();

    // Exactly one text node should hold the concatenated text.
    let text_nodes: Vec<NodeId> = doc.children(paragraph).collect();
    assert_eq!(text_nodes.len(), 1);
    assert_eq!(doc.text_content(text_nodes[0]), Some("Hello world"));
}
11591159+}