web engine - experimental web browser
1//! HTML tree builder: construct a DOM tree from tokenizer output.
2//!
3//! Implements a simplified subset of the WHATWG HTML5 tree construction
4//! algorithm for Phase 3 of the browser engine.
5
6use we_dom::{Document, NodeId};
7
8use crate::{Token, Tokenizer};
9
/// The states of the tree builder's state machine.
///
/// `TreeBuilder::process_token` dispatches every token to the handler that
/// corresponds to the mode currently stored in the builder.
#[derive(Debug, Clone, Copy, PartialEq)]
enum InsertionMode {
    /// Starting mode of a fresh builder.
    Initial,
    /// Waiting for the root `<html>` element.
    BeforeHtml,
    /// `<html>` is open; waiting for `<head>`.
    BeforeHead,
    /// Inside `<head>`.
    InHead,
    /// Collecting character data for an element such as `<title>`.
    Text,
    /// `<head>` has been closed; waiting for `<body>`.
    AfterHead,
    /// Inside `<body>`.
    InBody,
    /// `</body>` has been seen.
    AfterBody,
    /// `</html>` has been seen after the body.
    AfterAfterBody,
}
23
/// Reports whether `tag` names a void element, i.e. one that never has
/// content or an end tag. The comparison is exact (case-sensitive).
fn is_void_element(tag: &str) -> bool {
    const VOID_ELEMENTS: &[&str] = &[
        "area", "base", "br", "col", "embed", "hr", "img", "input", "link", "meta", "param",
        "source", "track", "wbr",
    ];
    VOID_ELEMENTS.contains(&tag)
}
44
45/// HTML tree builder that processes tokens and constructs a DOM tree.
46pub struct TreeBuilder {
47 document: Document,
48 /// Stack of open elements (the current nesting context).
49 open_elements: Vec<NodeId>,
50 head_element: Option<NodeId>,
51 body_element: Option<NodeId>,
52 insertion_mode: InsertionMode,
53 /// Original insertion mode, saved when switching to Text mode.
54 original_insertion_mode: Option<InsertionMode>,
55 /// Pending text for the Text insertion mode (e.g., inside `<title>`).
56 pending_text: String,
57}
58
59impl TreeBuilder {
60 /// Create a new tree builder with an empty document.
61 pub fn new() -> Self {
62 TreeBuilder {
63 document: Document::new(),
64 open_elements: Vec::new(),
65 head_element: None,
66 body_element: None,
67 insertion_mode: InsertionMode::Initial,
68 original_insertion_mode: None,
69 pending_text: String::new(),
70 }
71 }
72
73 /// Process a single token, updating the DOM tree.
74 pub fn process_token(&mut self, token: Token) {
75 match self.insertion_mode {
76 InsertionMode::Initial => self.handle_initial(token),
77 InsertionMode::BeforeHtml => self.handle_before_html(token),
78 InsertionMode::BeforeHead => self.handle_before_head(token),
79 InsertionMode::InHead => self.handle_in_head(token),
80 InsertionMode::Text => self.handle_text(token),
81 InsertionMode::AfterHead => self.handle_after_head(token),
82 InsertionMode::InBody => self.handle_in_body(token),
83 InsertionMode::AfterBody => self.handle_after_body(token),
84 InsertionMode::AfterAfterBody => self.handle_after_after_body(token),
85 }
86 }
87
88 /// Finish building and return the constructed DOM document.
89 pub fn finish(self) -> Document {
90 self.document
91 }
92
93 // --- Insertion mode handlers ---
94
95 fn handle_initial(&mut self, token: Token) {
96 match token {
97 Token::Doctype { .. } => {
98 // For Phase 3, we just acknowledge the DOCTYPE and move on.
99 self.insertion_mode = InsertionMode::BeforeHtml;
100 }
101 Token::Comment(data) => {
102 let comment = self.document.create_comment(&data);
103 let root = self.document.root();
104 self.document.append_child(root, comment);
105 }
106 Token::Character(ref s) if s.chars().all(|c| c.is_ascii_whitespace()) => {
107 // Ignore whitespace in Initial mode.
108 }
109 _ => {
110 // Anything else: switch to BeforeHtml and reprocess.
111 self.insertion_mode = InsertionMode::BeforeHtml;
112 self.handle_before_html(token);
113 }
114 }
115 }
116
117 fn handle_before_html(&mut self, token: Token) {
118 match token {
119 Token::Doctype { .. } => { /* ignore */ }
120 Token::Comment(data) => {
121 let comment = self.document.create_comment(&data);
122 let root = self.document.root();
123 self.document.append_child(root, comment);
124 }
125 Token::Character(ref s) if s.chars().all(|c| c.is_ascii_whitespace()) => {
126 // Ignore whitespace.
127 }
128 Token::StartTag { ref name, .. } if name == "html" => {
129 let html = self.create_element_from_token(&token);
130 let root = self.document.root();
131 self.document.append_child(root, html);
132 self.open_elements.push(html);
133 self.insertion_mode = InsertionMode::BeforeHead;
134 }
135 Token::EndTag { ref name }
136 if name != "head" && name != "body" && name != "html" && name != "br" =>
137 {
138 // Parse error, ignore.
139 }
140 _ => {
141 // Create an implicit <html> element.
142 let html = self.document.create_element("html");
143 let root = self.document.root();
144 self.document.append_child(root, html);
145 self.open_elements.push(html);
146 self.insertion_mode = InsertionMode::BeforeHead;
147 self.handle_before_head(token);
148 }
149 }
150 }
151
152 fn handle_before_head(&mut self, token: Token) {
153 match token {
154 Token::Character(ref s) if s.chars().all(|c| c.is_ascii_whitespace()) => {
155 // Ignore whitespace.
156 }
157 Token::Comment(data) => {
158 self.insert_comment(&data);
159 }
160 Token::Doctype { .. } => { /* ignore */ }
161 Token::StartTag { ref name, .. } if name == "html" => {
162 // Process as if InBody.
163 self.handle_in_body(token);
164 }
165 Token::StartTag { ref name, .. } if name == "head" => {
166 let head = self.create_element_from_token(&token);
167 self.insert_node(head);
168 self.open_elements.push(head);
169 self.head_element = Some(head);
170 self.insertion_mode = InsertionMode::InHead;
171 }
172 Token::EndTag { ref name }
173 if name != "head" && name != "body" && name != "html" && name != "br" =>
174 {
175 // Parse error, ignore.
176 }
177 _ => {
178 // Implied <head>.
179 let head = self.document.create_element("head");
180 self.insert_node(head);
181 self.open_elements.push(head);
182 self.head_element = Some(head);
183 self.insertion_mode = InsertionMode::InHead;
184 self.handle_in_head(token);
185 }
186 }
187 }
188
189 fn handle_in_head(&mut self, token: Token) {
190 match token {
191 Token::Character(ref s) if s.chars().all(|c| c.is_ascii_whitespace()) => {
192 self.insert_text(s);
193 }
194 Token::Comment(data) => {
195 self.insert_comment(&data);
196 }
197 Token::Doctype { .. } => { /* ignore */ }
198 Token::StartTag { ref name, .. } if name == "title" => {
199 let elem = self.create_element_from_token(&token);
200 self.insert_node(elem);
201 self.open_elements.push(elem);
202 self.original_insertion_mode = Some(self.insertion_mode);
203 self.insertion_mode = InsertionMode::Text;
204 }
205 Token::StartTag { ref name, .. }
206 if name == "style" || name == "script" || name == "noscript" =>
207 {
208 let elem = self.create_element_from_token(&token);
209 self.insert_node(elem);
210 self.open_elements.push(elem);
211 self.original_insertion_mode = Some(self.insertion_mode);
212 self.insertion_mode = InsertionMode::Text;
213 }
214 Token::StartTag { ref name, .. } if name == "meta" || name == "link" => {
215 let elem = self.create_element_from_token(&token);
216 self.insert_node(elem);
217 // Void elements: don't push onto stack.
218 }
219 Token::StartTag { ref name, .. } if name == "head" => {
220 // Ignore duplicate <head>.
221 }
222 Token::EndTag { ref name } if name == "head" => {
223 self.pop_until("head");
224 self.insertion_mode = InsertionMode::AfterHead;
225 }
226 Token::EndTag { ref name } if name != "body" && name != "html" && name != "br" => {
227 // Parse error, ignore.
228 }
229 _ => {
230 // Pop <head> and switch to AfterHead, then reprocess.
231 self.pop_until("head");
232 self.insertion_mode = InsertionMode::AfterHead;
233 self.handle_after_head(token);
234 }
235 }
236 }
237
238 fn handle_text(&mut self, token: Token) {
239 match token {
240 Token::Character(s) => {
241 self.pending_text.push_str(&s);
242 }
243 Token::EndTag { .. } => {
244 // Flush pending text.
245 if !self.pending_text.is_empty() {
246 let text = self.pending_text.clone();
247 self.pending_text.clear();
248 self.insert_text(&text);
249 }
250 // Pop the element (e.g., <title>).
251 self.open_elements.pop();
252 self.insertion_mode = self
253 .original_insertion_mode
254 .unwrap_or(InsertionMode::InBody);
255 self.original_insertion_mode = None;
256 }
257 Token::Eof => {
258 // Flush pending text.
259 if !self.pending_text.is_empty() {
260 let text = self.pending_text.clone();
261 self.pending_text.clear();
262 self.insert_text(&text);
263 }
264 self.open_elements.pop();
265 self.insertion_mode = self
266 .original_insertion_mode
267 .unwrap_or(InsertionMode::InBody);
268 self.original_insertion_mode = None;
269 self.process_token(Token::Eof);
270 }
271 _ => {}
272 }
273 }
274
275 fn handle_after_head(&mut self, token: Token) {
276 match token {
277 Token::Character(ref s) if s.chars().all(|c| c.is_ascii_whitespace()) => {
278 self.insert_text(s);
279 }
280 Token::Comment(data) => {
281 self.insert_comment(&data);
282 }
283 Token::Doctype { .. } => { /* ignore */ }
284 Token::StartTag { ref name, .. } if name == "html" => {
285 self.handle_in_body(token);
286 }
287 Token::StartTag { ref name, .. } if name == "body" => {
288 let body = self.create_element_from_token(&token);
289 self.insert_node(body);
290 self.open_elements.push(body);
291 self.body_element = Some(body);
292 self.insertion_mode = InsertionMode::InBody;
293 }
294 Token::StartTag { ref name, .. } if name == "head" => {
295 // Ignore.
296 }
297 Token::EndTag { ref name } if name != "body" && name != "html" && name != "br" => {
298 // Ignore.
299 }
300 _ => {
301 // Implied <body>.
302 let body = self.document.create_element("body");
303 self.insert_node(body);
304 self.open_elements.push(body);
305 self.body_element = Some(body);
306 self.insertion_mode = InsertionMode::InBody;
307 self.handle_in_body(token);
308 }
309 }
310 }
311
312 fn handle_in_body(&mut self, token: Token) {
313 match token {
314 Token::Character(s) => {
315 self.insert_text(&s);
316 }
317 Token::Comment(data) => {
318 self.insert_comment(&data);
319 }
320 Token::Doctype { .. } => { /* ignore */ }
321 Token::StartTag { ref name, .. } if name == "html" => {
322 // Merge attributes onto existing <html> element.
323 if let Token::StartTag { attributes, .. } = &token {
324 if let Some(&html_id) = self.open_elements.first() {
325 for (attr_name, attr_value) in attributes {
326 if self.document.get_attribute(html_id, attr_name).is_none() {
327 self.document.set_attribute(html_id, attr_name, attr_value);
328 }
329 }
330 }
331 }
332 }
333 Token::StartTag { ref name, .. }
334 if name == "body"
335 || name == "head"
336 || name == "title"
337 || name == "style"
338 || name == "script" =>
339 {
340 match name.as_str() {
341 "body" => {
342 // Ignore duplicate <body>.
343 }
344 "head" => {
345 // Ignore <head> in body.
346 }
347 _ => {
348 // title/style/script: process using InHead rules
349 self.handle_in_head(token);
350 }
351 }
352 }
353 Token::StartTag { ref name, .. }
354 if name == "p"
355 || name == "div"
356 || name == "h1"
357 || name == "h2"
358 || name == "h3"
359 || name == "h4"
360 || name == "h5"
361 || name == "h6"
362 || name == "pre"
363 || name == "blockquote"
364 || name == "ul"
365 || name == "ol"
366 || name == "li" =>
367 {
368 // If there's a <p> in button scope, close it first.
369 if self.has_element_in_button_scope("p") {
370 self.close_p_element();
371 }
372 let elem = self.create_element_from_token(&token);
373 self.insert_node(elem);
374 self.open_elements.push(elem);
375 }
376 Token::StartTag { ref name, .. } if is_void_element(name) => {
377 let elem = self.create_element_from_token(&token);
378 self.insert_node(elem);
379 // Don't push void elements onto the stack.
380 }
381 Token::StartTag { .. } => {
382 // Generic start tag: create element and push onto stack.
383 let elem = self.create_element_from_token(&token);
384 self.insert_node(elem);
385 self.open_elements.push(elem);
386 }
387 Token::EndTag { ref name } if name == "body" => {
388 if self.has_element_in_scope("body") {
389 self.insertion_mode = InsertionMode::AfterBody;
390 }
391 }
392 Token::EndTag { ref name } if name == "html" => {
393 if self.has_element_in_scope("body") {
394 self.insertion_mode = InsertionMode::AfterBody;
395 self.handle_after_body(token);
396 }
397 }
398 Token::EndTag { ref name } if name == "p" => {
399 if !self.has_element_in_button_scope("p") {
400 // No matching <p>: insert an empty one, then close it.
401 let p = self.document.create_element("p");
402 self.insert_node(p);
403 self.open_elements.push(p);
404 }
405 self.close_p_element();
406 }
407 Token::EndTag { ref name }
408 if name == "div"
409 || name == "pre"
410 || name == "blockquote"
411 || name == "ul"
412 || name == "ol"
413 || name == "li" =>
414 {
415 if self.has_element_in_scope(name) {
416 self.generate_implied_end_tags(Some(name));
417 self.pop_until(name);
418 }
419 }
420 Token::EndTag { ref name }
421 if name == "h1"
422 || name == "h2"
423 || name == "h3"
424 || name == "h4"
425 || name == "h5"
426 || name == "h6" =>
427 {
428 if self.has_heading_in_scope() {
429 self.generate_implied_end_tags(None);
430 // Pop until we find a heading element.
431 while let Some(id) = self.open_elements.pop() {
432 if let Some(tag) = self.document.tag_name(id) {
433 if matches!(tag, "h1" | "h2" | "h3" | "h4" | "h5" | "h6") {
434 break;
435 }
436 }
437 }
438 }
439 }
440 Token::EndTag { ref name } => {
441 // Generic end tag: walk back through open elements.
442 self.handle_any_other_end_tag(name);
443 }
444 Token::Eof => {
445 // Stop parsing.
446 }
447 }
448 }
449
450 fn handle_after_body(&mut self, token: Token) {
451 match token {
452 Token::Character(ref s) if s.chars().all(|c| c.is_ascii_whitespace()) => {
453 // Process whitespace as in InBody.
454 self.handle_in_body(token);
455 }
456 Token::Comment(data) => {
457 // Insert as last child of the first element (html).
458 let comment = self.document.create_comment(&data);
459 if let Some(&html) = self.open_elements.first() {
460 self.document.append_child(html, comment);
461 }
462 }
463 Token::Doctype { .. } => { /* ignore */ }
464 Token::EndTag { ref name } if name == "html" => {
465 self.insertion_mode = InsertionMode::AfterAfterBody;
466 }
467 Token::Eof => {
468 // Stop parsing.
469 }
470 _ => {
471 // Anything else: switch back to InBody and reprocess.
472 self.insertion_mode = InsertionMode::InBody;
473 self.handle_in_body(token);
474 }
475 }
476 }
477
478 fn handle_after_after_body(&mut self, token: Token) {
479 match token {
480 Token::Comment(data) => {
481 let comment = self.document.create_comment(&data);
482 let root = self.document.root();
483 self.document.append_child(root, comment);
484 }
485 Token::Doctype { .. } => { /* ignore */ }
486 Token::Character(ref s) if s.chars().all(|c| c.is_ascii_whitespace()) => {
487 self.handle_in_body(token);
488 }
489 Token::Eof => {
490 // Stop.
491 }
492 _ => {
493 self.insertion_mode = InsertionMode::InBody;
494 self.handle_in_body(token);
495 }
496 }
497 }
498
499 // --- Helper methods ---
500
501 /// Create a DOM element from a StartTag token, setting attributes.
502 fn create_element_from_token(&mut self, token: &Token) -> NodeId {
503 if let Token::StartTag {
504 name, attributes, ..
505 } = token
506 {
507 let id = self.document.create_element(name);
508 for (attr_name, attr_value) in attributes {
509 self.document.set_attribute(id, attr_name, attr_value);
510 }
511 id
512 } else {
513 // Should only be called with StartTag tokens.
514 self.document.create_element("unknown")
515 }
516 }
517
518 /// Insert a node at the current insertion point (last open element).
519 fn insert_node(&mut self, node: NodeId) {
520 let parent = self
521 .open_elements
522 .last()
523 .copied()
524 .unwrap_or_else(|| self.document.root());
525 self.document.append_child(parent, node);
526 }
527
528 /// Insert a text node at the current insertion point.
529 /// If the last child is already a text node, append to it.
530 fn insert_text(&mut self, data: &str) {
531 let parent = self
532 .open_elements
533 .last()
534 .copied()
535 .unwrap_or_else(|| self.document.root());
536
537 // Try to merge with existing text node.
538 if let Some(last_child) = self.document.last_child(parent) {
539 if let we_dom::NodeData::Text { data: ref existing } =
540 *self.document.node_data(last_child)
541 {
542 let mut merged = existing.clone();
543 merged.push_str(data);
544 self.document.set_text_content(last_child, &merged);
545 return;
546 }
547 }
548
549 let text = self.document.create_text(data);
550 self.document.append_child(parent, text);
551 }
552
553 /// Insert a comment node at the current insertion point.
554 fn insert_comment(&mut self, data: &str) {
555 let comment = self.document.create_comment(data);
556 self.insert_node(comment);
557 }
558
559 /// Pop elements from the stack until we find one with the given tag name.
560 /// The matching element is also popped.
561 fn pop_until(&mut self, tag_name: &str) {
562 while let Some(id) = self.open_elements.pop() {
563 if self.document.tag_name(id) == Some(tag_name) {
564 return;
565 }
566 }
567 }
568
569 /// Check if the given tag name is "in scope" (simplified).
570 /// In scope means there's an element with that tag on the stack,
571 /// and no scope barrier element between it and the top.
572 fn has_element_in_scope(&self, target: &str) -> bool {
573 for &id in self.open_elements.iter().rev() {
574 if let Some(tag) = self.document.tag_name(id) {
575 if tag == target {
576 return true;
577 }
578 // Scope barrier elements.
579 if matches!(
580 tag,
581 "applet"
582 | "caption"
583 | "html"
584 | "table"
585 | "td"
586 | "th"
587 | "marquee"
588 | "object"
589 | "template"
590 ) {
591 return false;
592 }
593 }
594 }
595 false
596 }
597
598 /// Check if the given tag name is "in button scope".
599 fn has_element_in_button_scope(&self, target: &str) -> bool {
600 for &id in self.open_elements.iter().rev() {
601 if let Some(tag) = self.document.tag_name(id) {
602 if tag == target {
603 return true;
604 }
605 // Button scope includes all regular scope barriers plus <button>.
606 if matches!(
607 tag,
608 "applet"
609 | "button"
610 | "caption"
611 | "html"
612 | "table"
613 | "td"
614 | "th"
615 | "marquee"
616 | "object"
617 | "template"
618 ) {
619 return false;
620 }
621 }
622 }
623 false
624 }
625
626 /// Check if any heading element (h1-h6) is in scope.
627 fn has_heading_in_scope(&self) -> bool {
628 for &id in self.open_elements.iter().rev() {
629 if let Some(tag) = self.document.tag_name(id) {
630 if matches!(tag, "h1" | "h2" | "h3" | "h4" | "h5" | "h6") {
631 return true;
632 }
633 if matches!(
634 tag,
635 "applet"
636 | "caption"
637 | "html"
638 | "table"
639 | "td"
640 | "th"
641 | "marquee"
642 | "object"
643 | "template"
644 ) {
645 return false;
646 }
647 }
648 }
649 false
650 }
651
652 /// Close a `<p>` element: generate implied end tags (excluding p),
653 /// then pop until we find the `<p>`.
654 fn close_p_element(&mut self) {
655 self.generate_implied_end_tags(Some("p"));
656 self.pop_until("p");
657 }
658
659 /// Generate implied end tags. If `exclude` is provided, don't generate
660 /// an end tag for that element.
661 fn generate_implied_end_tags(&mut self, exclude: Option<&str>) {
662 loop {
663 let should_pop = self
664 .open_elements
665 .last()
666 .and_then(|&id| self.document.tag_name(id))
667 .map(|tag| {
668 if let Some(excl) = exclude {
669 if tag == excl {
670 return false;
671 }
672 }
673 matches!(
674 tag,
675 "dd" | "dt"
676 | "li"
677 | "optgroup"
678 | "option"
679 | "p"
680 | "rb"
681 | "rp"
682 | "rt"
683 | "rtc"
684 )
685 })
686 .unwrap_or(false);
687 if should_pop {
688 self.open_elements.pop();
689 } else {
690 break;
691 }
692 }
693 }
694
695 /// Handle a generic end tag by walking back through open elements
696 /// using the "any other end tag" algorithm.
697 fn handle_any_other_end_tag(&mut self, name: &str) {
698 // Walk backwards through the stack.
699 let mut i = self.open_elements.len();
700 while i > 0 {
701 i -= 1;
702 let id = self.open_elements[i];
703 if self.document.tag_name(id) == Some(name) {
704 // Pop everything above and including this element.
705 self.open_elements.truncate(i);
706 return;
707 }
708 // If this is a "special" element, stop.
709 if let Some(tag) = self.document.tag_name(id) {
710 if is_special_element(tag) {
711 return;
712 }
713 }
714 }
715 }
716}
717
718impl Default for TreeBuilder {
719 fn default() -> Self {
720 Self::new()
721 }
722}
723
/// Returns true if the tag is in the "special" category of the HTML
/// parsing specification.
///
/// The list covers the HTML-namespace members of that category, including
/// `keygen` and `search`. The comparison is exact (case-sensitive).
fn is_special_element(tag: &str) -> bool {
    matches!(
        tag,
        "address"
            | "applet"
            | "area"
            | "article"
            | "aside"
            | "base"
            | "basefont"
            | "bgsound"
            | "blockquote"
            | "body"
            | "br"
            | "button"
            | "caption"
            | "center"
            | "col"
            | "colgroup"
            | "dd"
            | "details"
            | "dir"
            | "div"
            | "dl"
            | "dt"
            | "embed"
            | "fieldset"
            | "figcaption"
            | "figure"
            | "footer"
            | "form"
            | "frame"
            | "frameset"
            | "h1"
            | "h2"
            | "h3"
            | "h4"
            | "h5"
            | "h6"
            | "head"
            | "header"
            | "hgroup"
            | "hr"
            | "html"
            | "iframe"
            | "img"
            | "input"
            | "keygen"
            | "li"
            | "link"
            | "listing"
            | "main"
            | "marquee"
            | "menu"
            | "meta"
            | "nav"
            | "noembed"
            | "noframes"
            | "noscript"
            | "object"
            | "ol"
            | "p"
            | "param"
            | "plaintext"
            | "pre"
            | "script"
            | "search"
            | "section"
            | "select"
            | "source"
            | "style"
            | "summary"
            | "table"
            | "tbody"
            | "td"
            | "template"
            | "textarea"
            | "tfoot"
            | "th"
            | "thead"
            | "title"
            | "tr"
            | "track"
            | "ul"
            | "wbr"
            | "xmp"
    )
}
811
812/// Parse an HTML string into a DOM document.
813///
814/// This is a convenience function that tokenizes the input and builds
815/// a DOM tree using the tree builder.
816pub fn parse_html(input: &str) -> Document {
817 let mut builder = TreeBuilder::new();
818 let mut tokenizer = Tokenizer::new(input);
819 loop {
820 let token = tokenizer.next_token();
821 let is_eof = token == Token::Eof;
822 builder.process_token(token);
823 if is_eof {
824 break;
825 }
826 }
827 builder.finish()
828}
829
#[cfg(test)]
mod tests {
    use super::*;
    use we_dom::NodeData;

    /// Tag names of the element children of `node`, in document order.
    fn child_tags(doc: &Document, node: NodeId) -> Vec<String> {
        let mut tags = Vec::new();
        for id in doc.children(node) {
            if let Some(tag) = doc.tag_name(id) {
                tags.push(tag.to_string());
            }
        }
        tags
    }

    /// Concatenated text content of the direct children of `node`.
    fn text_of_children(doc: &Document, node: NodeId) -> String {
        doc.children(node)
            .filter_map(|child| doc.text_content(child))
            .collect()
    }

    /// First child of `node`; panics if there is none.
    fn first_child(doc: &Document, node: NodeId) -> NodeId {
        doc.children(node).next().unwrap()
    }

    /// The first child of the document root (expected to be `<html>`).
    fn html_of(doc: &Document) -> NodeId {
        first_child(doc, doc.root())
    }

    /// The second child of `<html>` (expected to be `<body>`).
    fn body_of(doc: &Document) -> NodeId {
        doc.children(html_of(doc)).nth(1).unwrap()
    }

    /// A start-tag token with no attributes.
    fn start(name: &str) -> Token {
        Token::StartTag {
            name: name.into(),
            attributes: vec![],
            self_closing: false,
        }
    }

    /// An end-tag token.
    fn end(name: &str) -> Token {
        Token::EndTag { name: name.into() }
    }

    #[test]
    fn parse_full_document() {
        let doc = parse_html(
            "<!DOCTYPE html><html><head><title>Test</title></head><body><p>Hello</p></body></html>",
        );

        // The root has exactly one child: <html>.
        let root_children: Vec<NodeId> = doc.children(doc.root()).collect();
        assert_eq!(root_children.len(), 1);
        let html = root_children[0];
        assert_eq!(doc.tag_name(html), Some("html"));

        // <html> holds <head> and <body>.
        assert_eq!(child_tags(&doc, html), vec!["head", "body"]);

        // <head> holds <title>, which contains "Test".
        let head = first_child(&doc, html);
        assert_eq!(child_tags(&doc, head), vec!["title"]);
        let title = first_child(&doc, head);
        assert_eq!(text_of_children(&doc, title), "Test");

        // <body> holds <p>, which contains "Hello".
        let body = doc.children(html).nth(1).unwrap();
        assert_eq!(child_tags(&doc, body), vec!["p"]);
        let p = first_child(&doc, body);
        assert_eq!(text_of_children(&doc, p), "Hello");
    }

    #[test]
    fn implicit_html_head_body() {
        // Minimal document: just <p>Hello
        let doc = parse_html("<p>Hello");

        let root_children: Vec<NodeId> = doc.children(doc.root()).collect();
        assert_eq!(root_children.len(), 1);
        assert_eq!(doc.tag_name(root_children[0]), Some("html"));
        assert_eq!(child_tags(&doc, root_children[0]), vec!["head", "body"]);

        let body = doc.children(root_children[0]).nth(1).unwrap();
        assert_eq!(child_tags(&doc, body), vec!["p"]);

        let p = first_child(&doc, body);
        assert_eq!(text_of_children(&doc, p), "Hello");
    }

    #[test]
    fn void_element_br() {
        let doc = parse_html("<p>Line 1<br>Line 2</p>");
        let p = first_child(&doc, body_of(&doc));

        // <p> should have: text("Line 1"), <br>, text("Line 2")
        let children: Vec<NodeId> = doc.children(p).collect();
        assert_eq!(children.len(), 3);
        assert_eq!(doc.text_content(children[0]), Some("Line 1"));
        assert_eq!(doc.tag_name(children[1]), Some("br"));
        assert_eq!(doc.text_content(children[2]), Some("Line 2"));
    }

    #[test]
    fn p_inside_p_closes_outer() {
        let doc = parse_html("<p>First<p>Second");
        let body = body_of(&doc);

        // Two sibling <p> elements, not nested.
        assert_eq!(child_tags(&doc, body), vec!["p", "p"]);

        let children: Vec<NodeId> = doc.children(body).collect();
        assert_eq!(text_of_children(&doc, children[0]), "First");
        assert_eq!(text_of_children(&doc, children[1]), "Second");
    }

    #[test]
    fn nested_div_elements() {
        let doc = parse_html("<div><div>inner</div></div>");

        let outer_div = first_child(&doc, body_of(&doc));
        assert_eq!(doc.tag_name(outer_div), Some("div"));

        let inner_div = first_child(&doc, outer_div);
        assert_eq!(doc.tag_name(inner_div), Some("div"));
        assert_eq!(text_of_children(&doc, inner_div), "inner");
    }

    #[test]
    fn inline_elements_nest_properly() {
        let doc = parse_html("<p><span><a href=\"#\">link</a></span></p>");

        let p = first_child(&doc, body_of(&doc));
        let span = first_child(&doc, p);
        assert_eq!(doc.tag_name(span), Some("span"));

        let a = first_child(&doc, span);
        assert_eq!(doc.tag_name(a), Some("a"));
        assert_eq!(doc.get_attribute(a, "href"), Some("#"));
        assert_eq!(text_of_children(&doc, a), "link");
    }

    #[test]
    fn headings() {
        let doc = parse_html("<h1>Title</h1><h2>Subtitle</h2><p>Body text</p>");
        assert_eq!(child_tags(&doc, body_of(&doc)), vec!["h1", "h2", "p"]);
    }

    #[test]
    fn comment_nodes() {
        let doc = parse_html("<body><!-- a comment --><p>text</p></body>");

        let children: Vec<NodeId> = doc.children(body_of(&doc)).collect();
        assert!(children.len() >= 2);

        // First child should be a comment.
        match doc.node_data(children[0]) {
            NodeData::Comment { data } => assert_eq!(data, " a comment "),
            other => panic!("expected comment, got {:?}", other),
        }
    }

    #[test]
    fn pre_element() {
        let doc = parse_html("<pre>line 1\nline 2</pre>");

        let pre = first_child(&doc, body_of(&doc));
        assert_eq!(doc.tag_name(pre), Some("pre"));
        assert_eq!(text_of_children(&doc, pre), "line 1\nline 2");
    }

    #[test]
    fn attributes_preserved() {
        let doc =
            parse_html("<div id=\"main\" class=\"container\"><a href=\"/page\">link</a></div>");

        let div = first_child(&doc, body_of(&doc));
        assert_eq!(doc.get_attribute(div, "id"), Some("main"));
        assert_eq!(doc.get_attribute(div, "class"), Some("container"));

        let a = first_child(&doc, div);
        assert_eq!(doc.get_attribute(a, "href"), Some("/page"));
    }

    #[test]
    fn empty_document() {
        let doc = parse_html("");
        // Even an empty document gets implicit elements from EOF handling.
        assert!(doc.children(doc.root()).next().is_some());
    }

    #[test]
    fn just_text() {
        let doc = parse_html("Hello, world!");
        assert_eq!(text_of_children(&doc, body_of(&doc)), "Hello, world!");
    }

    #[test]
    fn heading_closes_open_p() {
        let doc = parse_html("<p>text<h1>heading</h1>");

        // <p> is closed by <h1>, so they are siblings.
        assert_eq!(child_tags(&doc, body_of(&doc)), vec!["p", "h1"]);
    }

    #[test]
    fn self_closing_void_elements() {
        let doc = parse_html("<p>before<br/>after</p>");
        let p = first_child(&doc, body_of(&doc));

        let children: Vec<NodeId> = doc.children(p).collect();
        assert_eq!(children.len(), 3);
        assert_eq!(doc.tag_name(children[1]), Some("br"));
    }

    #[test]
    fn doctype_is_handled() {
        let doc = parse_html("<!DOCTYPE html><html><body></body></html>");
        assert_eq!(doc.tag_name(html_of(&doc)), Some("html"));
    }

    #[test]
    fn tree_builder_step_by_step() {
        let mut builder = TreeBuilder::new();
        builder.process_token(Token::Doctype {
            name: Some("html".into()),
            public_id: None,
            system_id: None,
            force_quirks: false,
        });
        let tokens = vec![
            start("html"),
            start("head"),
            end("head"),
            start("body"),
            start("p"),
            Token::Character("Hello".into()),
            end("p"),
            end("body"),
            end("html"),
            Token::Eof,
        ];
        for token in tokens {
            builder.process_token(token);
        }

        let doc = builder.finish();
        let html = html_of(&doc);
        assert_eq!(doc.tag_name(html), Some("html"));

        let body = doc.children(html).nth(1).unwrap();
        let p = first_child(&doc, body);
        assert_eq!(text_of_children(&doc, p), "Hello");
    }

    #[test]
    fn multiple_text_children_merge() {
        // Consecutive character tokens should merge into one text node.
        let mut builder = TreeBuilder::new();
        builder.process_token(start("p"));
        builder.process_token(Token::Character("Hello ".into()));
        builder.process_token(Token::Character("world".into()));
        builder.process_token(end("p"));
        builder.process_token(Token::Eof);

        let doc = builder.finish();
        let p = first_child(&doc, body_of(&doc));

        // Should be a single text node.
        let children: Vec<NodeId> = doc.children(p).collect();
        assert_eq!(children.len(), 1);
        assert_eq!(doc.text_content(children[0]), Some("Hello world"));
    }
}