// web engine - experimental web browser
// at poly1305-h4-fix 1159 lines 40 kB view raw
1//! HTML tree builder: construct a DOM tree from tokenizer output. 2//! 3//! Implements a simplified subset of the WHATWG HTML5 tree construction 4//! algorithm for Phase 3 of the browser engine. 5 6use we_dom::{Document, NodeId}; 7 8use crate::{Token, Tokenizer}; 9 10/// Insertion modes for the tree builder state machine. 11#[derive(Debug, Clone, Copy, PartialEq)] 12enum InsertionMode { 13 Initial, 14 BeforeHtml, 15 BeforeHead, 16 InHead, 17 Text, 18 AfterHead, 19 InBody, 20 AfterBody, 21 AfterAfterBody, 22} 23 24/// Returns true if the given tag name is a void element (self-closing, no end tag). 25fn is_void_element(tag: &str) -> bool { 26 matches!( 27 tag, 28 "area" 29 | "base" 30 | "br" 31 | "col" 32 | "embed" 33 | "hr" 34 | "img" 35 | "input" 36 | "link" 37 | "meta" 38 | "param" 39 | "source" 40 | "track" 41 | "wbr" 42 ) 43} 44 45/// HTML tree builder that processes tokens and constructs a DOM tree. 46pub struct TreeBuilder { 47 document: Document, 48 /// Stack of open elements (the current nesting context). 49 open_elements: Vec<NodeId>, 50 head_element: Option<NodeId>, 51 body_element: Option<NodeId>, 52 insertion_mode: InsertionMode, 53 /// Original insertion mode, saved when switching to Text mode. 54 original_insertion_mode: Option<InsertionMode>, 55 /// Pending text for the Text insertion mode (e.g., inside `<title>`). 56 pending_text: String, 57} 58 59impl TreeBuilder { 60 /// Create a new tree builder with an empty document. 61 pub fn new() -> Self { 62 TreeBuilder { 63 document: Document::new(), 64 open_elements: Vec::new(), 65 head_element: None, 66 body_element: None, 67 insertion_mode: InsertionMode::Initial, 68 original_insertion_mode: None, 69 pending_text: String::new(), 70 } 71 } 72 73 /// Process a single token, updating the DOM tree. 
74 pub fn process_token(&mut self, token: Token) { 75 match self.insertion_mode { 76 InsertionMode::Initial => self.handle_initial(token), 77 InsertionMode::BeforeHtml => self.handle_before_html(token), 78 InsertionMode::BeforeHead => self.handle_before_head(token), 79 InsertionMode::InHead => self.handle_in_head(token), 80 InsertionMode::Text => self.handle_text(token), 81 InsertionMode::AfterHead => self.handle_after_head(token), 82 InsertionMode::InBody => self.handle_in_body(token), 83 InsertionMode::AfterBody => self.handle_after_body(token), 84 InsertionMode::AfterAfterBody => self.handle_after_after_body(token), 85 } 86 } 87 88 /// Finish building and return the constructed DOM document. 89 pub fn finish(self) -> Document { 90 self.document 91 } 92 93 // --- Insertion mode handlers --- 94 95 fn handle_initial(&mut self, token: Token) { 96 match token { 97 Token::Doctype { .. } => { 98 // For Phase 3, we just acknowledge the DOCTYPE and move on. 99 self.insertion_mode = InsertionMode::BeforeHtml; 100 } 101 Token::Comment(data) => { 102 let comment = self.document.create_comment(&data); 103 let root = self.document.root(); 104 self.document.append_child(root, comment); 105 } 106 Token::Character(ref s) if s.chars().all(|c| c.is_ascii_whitespace()) => { 107 // Ignore whitespace in Initial mode. 108 } 109 _ => { 110 // Anything else: switch to BeforeHtml and reprocess. 111 self.insertion_mode = InsertionMode::BeforeHtml; 112 self.handle_before_html(token); 113 } 114 } 115 } 116 117 fn handle_before_html(&mut self, token: Token) { 118 match token { 119 Token::Doctype { .. } => { /* ignore */ } 120 Token::Comment(data) => { 121 let comment = self.document.create_comment(&data); 122 let root = self.document.root(); 123 self.document.append_child(root, comment); 124 } 125 Token::Character(ref s) if s.chars().all(|c| c.is_ascii_whitespace()) => { 126 // Ignore whitespace. 127 } 128 Token::StartTag { ref name, .. 
} if name == "html" => { 129 let html = self.create_element_from_token(&token); 130 let root = self.document.root(); 131 self.document.append_child(root, html); 132 self.open_elements.push(html); 133 self.insertion_mode = InsertionMode::BeforeHead; 134 } 135 Token::EndTag { ref name } 136 if name != "head" && name != "body" && name != "html" && name != "br" => 137 { 138 // Parse error, ignore. 139 } 140 _ => { 141 // Create an implicit <html> element. 142 let html = self.document.create_element("html"); 143 let root = self.document.root(); 144 self.document.append_child(root, html); 145 self.open_elements.push(html); 146 self.insertion_mode = InsertionMode::BeforeHead; 147 self.handle_before_head(token); 148 } 149 } 150 } 151 152 fn handle_before_head(&mut self, token: Token) { 153 match token { 154 Token::Character(ref s) if s.chars().all(|c| c.is_ascii_whitespace()) => { 155 // Ignore whitespace. 156 } 157 Token::Comment(data) => { 158 self.insert_comment(&data); 159 } 160 Token::Doctype { .. } => { /* ignore */ } 161 Token::StartTag { ref name, .. } if name == "html" => { 162 // Process as if InBody. 163 self.handle_in_body(token); 164 } 165 Token::StartTag { ref name, .. } if name == "head" => { 166 let head = self.create_element_from_token(&token); 167 self.insert_node(head); 168 self.open_elements.push(head); 169 self.head_element = Some(head); 170 self.insertion_mode = InsertionMode::InHead; 171 } 172 Token::EndTag { ref name } 173 if name != "head" && name != "body" && name != "html" && name != "br" => 174 { 175 // Parse error, ignore. 176 } 177 _ => { 178 // Implied <head>. 
179 let head = self.document.create_element("head"); 180 self.insert_node(head); 181 self.open_elements.push(head); 182 self.head_element = Some(head); 183 self.insertion_mode = InsertionMode::InHead; 184 self.handle_in_head(token); 185 } 186 } 187 } 188 189 fn handle_in_head(&mut self, token: Token) { 190 match token { 191 Token::Character(ref s) if s.chars().all(|c| c.is_ascii_whitespace()) => { 192 self.insert_text(s); 193 } 194 Token::Comment(data) => { 195 self.insert_comment(&data); 196 } 197 Token::Doctype { .. } => { /* ignore */ } 198 Token::StartTag { ref name, .. } if name == "title" => { 199 let elem = self.create_element_from_token(&token); 200 self.insert_node(elem); 201 self.open_elements.push(elem); 202 self.original_insertion_mode = Some(self.insertion_mode); 203 self.insertion_mode = InsertionMode::Text; 204 } 205 Token::StartTag { ref name, .. } 206 if name == "style" || name == "script" || name == "noscript" => 207 { 208 let elem = self.create_element_from_token(&token); 209 self.insert_node(elem); 210 self.open_elements.push(elem); 211 self.original_insertion_mode = Some(self.insertion_mode); 212 self.insertion_mode = InsertionMode::Text; 213 } 214 Token::StartTag { ref name, .. } if name == "meta" || name == "link" => { 215 let elem = self.create_element_from_token(&token); 216 self.insert_node(elem); 217 // Void elements: don't push onto stack. 218 } 219 Token::StartTag { ref name, .. } if name == "head" => { 220 // Ignore duplicate <head>. 221 } 222 Token::EndTag { ref name } if name == "head" => { 223 self.pop_until("head"); 224 self.insertion_mode = InsertionMode::AfterHead; 225 } 226 Token::EndTag { ref name } if name != "body" && name != "html" && name != "br" => { 227 // Parse error, ignore. 228 } 229 _ => { 230 // Pop <head> and switch to AfterHead, then reprocess. 
231 self.pop_until("head"); 232 self.insertion_mode = InsertionMode::AfterHead; 233 self.handle_after_head(token); 234 } 235 } 236 } 237 238 fn handle_text(&mut self, token: Token) { 239 match token { 240 Token::Character(s) => { 241 self.pending_text.push_str(&s); 242 } 243 Token::EndTag { .. } => { 244 // Flush pending text. 245 if !self.pending_text.is_empty() { 246 let text = self.pending_text.clone(); 247 self.pending_text.clear(); 248 self.insert_text(&text); 249 } 250 // Pop the element (e.g., <title>). 251 self.open_elements.pop(); 252 self.insertion_mode = self 253 .original_insertion_mode 254 .unwrap_or(InsertionMode::InBody); 255 self.original_insertion_mode = None; 256 } 257 Token::Eof => { 258 // Flush pending text. 259 if !self.pending_text.is_empty() { 260 let text = self.pending_text.clone(); 261 self.pending_text.clear(); 262 self.insert_text(&text); 263 } 264 self.open_elements.pop(); 265 self.insertion_mode = self 266 .original_insertion_mode 267 .unwrap_or(InsertionMode::InBody); 268 self.original_insertion_mode = None; 269 self.process_token(Token::Eof); 270 } 271 _ => {} 272 } 273 } 274 275 fn handle_after_head(&mut self, token: Token) { 276 match token { 277 Token::Character(ref s) if s.chars().all(|c| c.is_ascii_whitespace()) => { 278 self.insert_text(s); 279 } 280 Token::Comment(data) => { 281 self.insert_comment(&data); 282 } 283 Token::Doctype { .. } => { /* ignore */ } 284 Token::StartTag { ref name, .. } if name == "html" => { 285 self.handle_in_body(token); 286 } 287 Token::StartTag { ref name, .. } if name == "body" => { 288 let body = self.create_element_from_token(&token); 289 self.insert_node(body); 290 self.open_elements.push(body); 291 self.body_element = Some(body); 292 self.insertion_mode = InsertionMode::InBody; 293 } 294 Token::StartTag { ref name, .. } if name == "head" => { 295 // Ignore. 296 } 297 Token::EndTag { ref name } if name != "body" && name != "html" && name != "br" => { 298 // Ignore. 
299 } 300 _ => { 301 // Implied <body>. 302 let body = self.document.create_element("body"); 303 self.insert_node(body); 304 self.open_elements.push(body); 305 self.body_element = Some(body); 306 self.insertion_mode = InsertionMode::InBody; 307 self.handle_in_body(token); 308 } 309 } 310 } 311 312 fn handle_in_body(&mut self, token: Token) { 313 match token { 314 Token::Character(s) => { 315 self.insert_text(&s); 316 } 317 Token::Comment(data) => { 318 self.insert_comment(&data); 319 } 320 Token::Doctype { .. } => { /* ignore */ } 321 Token::StartTag { ref name, .. } if name == "html" => { 322 // Merge attributes onto existing <html> element. 323 if let Token::StartTag { attributes, .. } = &token { 324 if let Some(&html_id) = self.open_elements.first() { 325 for (attr_name, attr_value) in attributes { 326 if self.document.get_attribute(html_id, attr_name).is_none() { 327 self.document.set_attribute(html_id, attr_name, attr_value); 328 } 329 } 330 } 331 } 332 } 333 Token::StartTag { ref name, .. } 334 if name == "body" 335 || name == "head" 336 || name == "title" 337 || name == "style" 338 || name == "script" => 339 { 340 match name.as_str() { 341 "body" => { 342 // Ignore duplicate <body>. 343 } 344 "head" => { 345 // Ignore <head> in body. 346 } 347 _ => { 348 // title/style/script: process using InHead rules 349 self.handle_in_head(token); 350 } 351 } 352 } 353 Token::StartTag { ref name, .. } 354 if name == "p" 355 || name == "div" 356 || name == "h1" 357 || name == "h2" 358 || name == "h3" 359 || name == "h4" 360 || name == "h5" 361 || name == "h6" 362 || name == "pre" 363 || name == "blockquote" 364 || name == "ul" 365 || name == "ol" 366 || name == "li" => 367 { 368 // If there's a <p> in button scope, close it first. 369 if self.has_element_in_button_scope("p") { 370 self.close_p_element(); 371 } 372 let elem = self.create_element_from_token(&token); 373 self.insert_node(elem); 374 self.open_elements.push(elem); 375 } 376 Token::StartTag { ref name, .. 
} if is_void_element(name) => { 377 let elem = self.create_element_from_token(&token); 378 self.insert_node(elem); 379 // Don't push void elements onto the stack. 380 } 381 Token::StartTag { .. } => { 382 // Generic start tag: create element and push onto stack. 383 let elem = self.create_element_from_token(&token); 384 self.insert_node(elem); 385 self.open_elements.push(elem); 386 } 387 Token::EndTag { ref name } if name == "body" => { 388 if self.has_element_in_scope("body") { 389 self.insertion_mode = InsertionMode::AfterBody; 390 } 391 } 392 Token::EndTag { ref name } if name == "html" => { 393 if self.has_element_in_scope("body") { 394 self.insertion_mode = InsertionMode::AfterBody; 395 self.handle_after_body(token); 396 } 397 } 398 Token::EndTag { ref name } if name == "p" => { 399 if !self.has_element_in_button_scope("p") { 400 // No matching <p>: insert an empty one, then close it. 401 let p = self.document.create_element("p"); 402 self.insert_node(p); 403 self.open_elements.push(p); 404 } 405 self.close_p_element(); 406 } 407 Token::EndTag { ref name } 408 if name == "div" 409 || name == "pre" 410 || name == "blockquote" 411 || name == "ul" 412 || name == "ol" 413 || name == "li" => 414 { 415 if self.has_element_in_scope(name) { 416 self.generate_implied_end_tags(Some(name)); 417 self.pop_until(name); 418 } 419 } 420 Token::EndTag { ref name } 421 if name == "h1" 422 || name == "h2" 423 || name == "h3" 424 || name == "h4" 425 || name == "h5" 426 || name == "h6" => 427 { 428 if self.has_heading_in_scope() { 429 self.generate_implied_end_tags(None); 430 // Pop until we find a heading element. 431 while let Some(id) = self.open_elements.pop() { 432 if let Some(tag) = self.document.tag_name(id) { 433 if matches!(tag, "h1" | "h2" | "h3" | "h4" | "h5" | "h6") { 434 break; 435 } 436 } 437 } 438 } 439 } 440 Token::EndTag { ref name } => { 441 // Generic end tag: walk back through open elements. 
442 self.handle_any_other_end_tag(name); 443 } 444 Token::Eof => { 445 // Stop parsing. 446 } 447 } 448 } 449 450 fn handle_after_body(&mut self, token: Token) { 451 match token { 452 Token::Character(ref s) if s.chars().all(|c| c.is_ascii_whitespace()) => { 453 // Process whitespace as in InBody. 454 self.handle_in_body(token); 455 } 456 Token::Comment(data) => { 457 // Insert as last child of the first element (html). 458 let comment = self.document.create_comment(&data); 459 if let Some(&html) = self.open_elements.first() { 460 self.document.append_child(html, comment); 461 } 462 } 463 Token::Doctype { .. } => { /* ignore */ } 464 Token::EndTag { ref name } if name == "html" => { 465 self.insertion_mode = InsertionMode::AfterAfterBody; 466 } 467 Token::Eof => { 468 // Stop parsing. 469 } 470 _ => { 471 // Anything else: switch back to InBody and reprocess. 472 self.insertion_mode = InsertionMode::InBody; 473 self.handle_in_body(token); 474 } 475 } 476 } 477 478 fn handle_after_after_body(&mut self, token: Token) { 479 match token { 480 Token::Comment(data) => { 481 let comment = self.document.create_comment(&data); 482 let root = self.document.root(); 483 self.document.append_child(root, comment); 484 } 485 Token::Doctype { .. } => { /* ignore */ } 486 Token::Character(ref s) if s.chars().all(|c| c.is_ascii_whitespace()) => { 487 self.handle_in_body(token); 488 } 489 Token::Eof => { 490 // Stop. 491 } 492 _ => { 493 self.insertion_mode = InsertionMode::InBody; 494 self.handle_in_body(token); 495 } 496 } 497 } 498 499 // --- Helper methods --- 500 501 /// Create a DOM element from a StartTag token, setting attributes. 502 fn create_element_from_token(&mut self, token: &Token) -> NodeId { 503 if let Token::StartTag { 504 name, attributes, .. 
505 } = token 506 { 507 let id = self.document.create_element(name); 508 for (attr_name, attr_value) in attributes { 509 self.document.set_attribute(id, attr_name, attr_value); 510 } 511 id 512 } else { 513 // Should only be called with StartTag tokens. 514 self.document.create_element("unknown") 515 } 516 } 517 518 /// Insert a node at the current insertion point (last open element). 519 fn insert_node(&mut self, node: NodeId) { 520 let parent = self 521 .open_elements 522 .last() 523 .copied() 524 .unwrap_or_else(|| self.document.root()); 525 self.document.append_child(parent, node); 526 } 527 528 /// Insert a text node at the current insertion point. 529 /// If the last child is already a text node, append to it. 530 fn insert_text(&mut self, data: &str) { 531 let parent = self 532 .open_elements 533 .last() 534 .copied() 535 .unwrap_or_else(|| self.document.root()); 536 537 // Try to merge with existing text node. 538 if let Some(last_child) = self.document.last_child(parent) { 539 if let we_dom::NodeData::Text { data: ref existing } = 540 *self.document.node_data(last_child) 541 { 542 let mut merged = existing.clone(); 543 merged.push_str(data); 544 self.document.set_text_content(last_child, &merged); 545 return; 546 } 547 } 548 549 let text = self.document.create_text(data); 550 self.document.append_child(parent, text); 551 } 552 553 /// Insert a comment node at the current insertion point. 554 fn insert_comment(&mut self, data: &str) { 555 let comment = self.document.create_comment(data); 556 self.insert_node(comment); 557 } 558 559 /// Pop elements from the stack until we find one with the given tag name. 560 /// The matching element is also popped. 561 fn pop_until(&mut self, tag_name: &str) { 562 while let Some(id) = self.open_elements.pop() { 563 if self.document.tag_name(id) == Some(tag_name) { 564 return; 565 } 566 } 567 } 568 569 /// Check if the given tag name is "in scope" (simplified). 
570 /// In scope means there's an element with that tag on the stack, 571 /// and no scope barrier element between it and the top. 572 fn has_element_in_scope(&self, target: &str) -> bool { 573 for &id in self.open_elements.iter().rev() { 574 if let Some(tag) = self.document.tag_name(id) { 575 if tag == target { 576 return true; 577 } 578 // Scope barrier elements. 579 if matches!( 580 tag, 581 "applet" 582 | "caption" 583 | "html" 584 | "table" 585 | "td" 586 | "th" 587 | "marquee" 588 | "object" 589 | "template" 590 ) { 591 return false; 592 } 593 } 594 } 595 false 596 } 597 598 /// Check if the given tag name is "in button scope". 599 fn has_element_in_button_scope(&self, target: &str) -> bool { 600 for &id in self.open_elements.iter().rev() { 601 if let Some(tag) = self.document.tag_name(id) { 602 if tag == target { 603 return true; 604 } 605 // Button scope includes all regular scope barriers plus <button>. 606 if matches!( 607 tag, 608 "applet" 609 | "button" 610 | "caption" 611 | "html" 612 | "table" 613 | "td" 614 | "th" 615 | "marquee" 616 | "object" 617 | "template" 618 ) { 619 return false; 620 } 621 } 622 } 623 false 624 } 625 626 /// Check if any heading element (h1-h6) is in scope. 627 fn has_heading_in_scope(&self) -> bool { 628 for &id in self.open_elements.iter().rev() { 629 if let Some(tag) = self.document.tag_name(id) { 630 if matches!(tag, "h1" | "h2" | "h3" | "h4" | "h5" | "h6") { 631 return true; 632 } 633 if matches!( 634 tag, 635 "applet" 636 | "caption" 637 | "html" 638 | "table" 639 | "td" 640 | "th" 641 | "marquee" 642 | "object" 643 | "template" 644 ) { 645 return false; 646 } 647 } 648 } 649 false 650 } 651 652 /// Close a `<p>` element: generate implied end tags (excluding p), 653 /// then pop until we find the `<p>`. 654 fn close_p_element(&mut self) { 655 self.generate_implied_end_tags(Some("p")); 656 self.pop_until("p"); 657 } 658 659 /// Generate implied end tags. 
If `exclude` is provided, don't generate 660 /// an end tag for that element. 661 fn generate_implied_end_tags(&mut self, exclude: Option<&str>) { 662 loop { 663 let should_pop = self 664 .open_elements 665 .last() 666 .and_then(|&id| self.document.tag_name(id)) 667 .map(|tag| { 668 if let Some(excl) = exclude { 669 if tag == excl { 670 return false; 671 } 672 } 673 matches!( 674 tag, 675 "dd" | "dt" 676 | "li" 677 | "optgroup" 678 | "option" 679 | "p" 680 | "rb" 681 | "rp" 682 | "rt" 683 | "rtc" 684 ) 685 }) 686 .unwrap_or(false); 687 if should_pop { 688 self.open_elements.pop(); 689 } else { 690 break; 691 } 692 } 693 } 694 695 /// Handle a generic end tag by walking back through open elements 696 /// using the "any other end tag" algorithm. 697 fn handle_any_other_end_tag(&mut self, name: &str) { 698 // Walk backwards through the stack. 699 let mut i = self.open_elements.len(); 700 while i > 0 { 701 i -= 1; 702 let id = self.open_elements[i]; 703 if self.document.tag_name(id) == Some(name) { 704 // Pop everything above and including this element. 705 self.open_elements.truncate(i); 706 return; 707 } 708 // If this is a "special" element, stop. 709 if let Some(tag) = self.document.tag_name(id) { 710 if is_special_element(tag) { 711 return; 712 } 713 } 714 } 715 } 716} 717 718impl Default for TreeBuilder { 719 fn default() -> Self { 720 Self::new() 721 } 722} 723 724/// Returns true if the tag is a "special" element per the HTML spec. 
725fn is_special_element(tag: &str) -> bool { 726 matches!( 727 tag, 728 "address" 729 | "applet" 730 | "area" 731 | "article" 732 | "aside" 733 | "base" 734 | "basefont" 735 | "bgsound" 736 | "blockquote" 737 | "body" 738 | "br" 739 | "button" 740 | "caption" 741 | "center" 742 | "col" 743 | "colgroup" 744 | "dd" 745 | "details" 746 | "dir" 747 | "div" 748 | "dl" 749 | "dt" 750 | "embed" 751 | "fieldset" 752 | "figcaption" 753 | "figure" 754 | "footer" 755 | "form" 756 | "frame" 757 | "frameset" 758 | "h1" 759 | "h2" 760 | "h3" 761 | "h4" 762 | "h5" 763 | "h6" 764 | "head" 765 | "header" 766 | "hgroup" 767 | "hr" 768 | "html" 769 | "iframe" 770 | "img" 771 | "input" 772 | "li" 773 | "link" 774 | "listing" 775 | "main" 776 | "marquee" 777 | "menu" 778 | "meta" 779 | "nav" 780 | "noembed" 781 | "noframes" 782 | "noscript" 783 | "object" 784 | "ol" 785 | "p" 786 | "param" 787 | "plaintext" 788 | "pre" 789 | "script" 790 | "section" 791 | "select" 792 | "source" 793 | "style" 794 | "summary" 795 | "table" 796 | "tbody" 797 | "td" 798 | "template" 799 | "textarea" 800 | "tfoot" 801 | "th" 802 | "thead" 803 | "title" 804 | "tr" 805 | "track" 806 | "ul" 807 | "wbr" 808 | "xmp" 809 ) 810} 811 812/// Parse an HTML string into a DOM document. 813/// 814/// This is a convenience function that tokenizes the input and builds 815/// a DOM tree using the tree builder. 816pub fn parse_html(input: &str) -> Document { 817 let mut builder = TreeBuilder::new(); 818 let mut tokenizer = Tokenizer::new(input); 819 loop { 820 let token = tokenizer.next_token(); 821 let is_eof = token == Token::Eof; 822 builder.process_token(token); 823 if is_eof { 824 break; 825 } 826 } 827 builder.finish() 828} 829 830#[cfg(test)] 831mod tests { 832 use super::*; 833 use we_dom::NodeData; 834 835 /// Helper: collect tag names of direct children of a node. 
836 fn child_tags(doc: &Document, node: NodeId) -> Vec<String> { 837 doc.children(node) 838 .filter_map(|id| doc.tag_name(id).map(String::from)) 839 .collect() 840 } 841 842 /// Helper: get the text content of all text node children, concatenated. 843 fn text_of_children(doc: &Document, node: NodeId) -> String { 844 let mut result = String::new(); 845 for child in doc.children(node) { 846 if let Some(text) = doc.text_content(child) { 847 result.push_str(text); 848 } 849 } 850 result 851 } 852 853 #[test] 854 fn parse_full_document() { 855 let doc = parse_html( 856 "<!DOCTYPE html><html><head><title>Test</title></head><body><p>Hello</p></body></html>", 857 ); 858 let root = doc.root(); 859 860 // Root should have one child: <html> 861 let html_children: Vec<NodeId> = doc.children(root).collect(); 862 assert_eq!(html_children.len(), 1); 863 let html = html_children[0]; 864 assert_eq!(doc.tag_name(html), Some("html")); 865 866 // <html> should have <head> and <body> 867 let tags = child_tags(&doc, html); 868 assert_eq!(tags, vec!["head", "body"]); 869 870 // <head> should have <title> 871 let head = doc.children(html).next().unwrap(); 872 let head_tags = child_tags(&doc, head); 873 assert_eq!(head_tags, vec!["title"]); 874 875 // <title> should contain "Test" 876 let title = doc.children(head).next().unwrap(); 877 assert_eq!(text_of_children(&doc, title), "Test"); 878 879 // <body> should have <p> 880 let body = doc.children(html).nth(1).unwrap(); 881 let body_tags = child_tags(&doc, body); 882 assert_eq!(body_tags, vec!["p"]); 883 884 // <p> should contain "Hello" 885 let p = doc.children(body).next().unwrap(); 886 assert_eq!(text_of_children(&doc, p), "Hello"); 887 } 888 889 #[test] 890 fn implicit_html_head_body() { 891 // Minimal document: just <p>Hello 892 let doc = parse_html("<p>Hello"); 893 let root = doc.root(); 894 895 let html: Vec<NodeId> = doc.children(root).collect(); 896 assert_eq!(html.len(), 1); 897 assert_eq!(doc.tag_name(html[0]), Some("html")); 898 
899 let html_tags = child_tags(&doc, html[0]); 900 assert_eq!(html_tags, vec!["head", "body"]); 901 902 let body = doc.children(html[0]).nth(1).unwrap(); 903 let body_tags = child_tags(&doc, body); 904 assert_eq!(body_tags, vec!["p"]); 905 906 let p = doc.children(body).next().unwrap(); 907 assert_eq!(text_of_children(&doc, p), "Hello"); 908 } 909 910 #[test] 911 fn void_element_br() { 912 let doc = parse_html("<p>Line 1<br>Line 2</p>"); 913 let root = doc.root(); 914 let html = doc.children(root).next().unwrap(); 915 let body = doc.children(html).nth(1).unwrap(); 916 let p = doc.children(body).next().unwrap(); 917 918 // <p> should have: text("Line 1"), <br>, text("Line 2") 919 let children: Vec<NodeId> = doc.children(p).collect(); 920 assert_eq!(children.len(), 3); 921 assert_eq!(doc.text_content(children[0]), Some("Line 1")); 922 assert_eq!(doc.tag_name(children[1]), Some("br")); 923 assert_eq!(doc.text_content(children[2]), Some("Line 2")); 924 } 925 926 #[test] 927 fn p_inside_p_closes_outer() { 928 let doc = parse_html("<p>First<p>Second"); 929 let root = doc.root(); 930 let html = doc.children(root).next().unwrap(); 931 let body = doc.children(html).nth(1).unwrap(); 932 933 // Should have two sibling <p> elements, not nested. 
934 let body_tags = child_tags(&doc, body); 935 assert_eq!(body_tags, vec!["p", "p"]); 936 937 let children: Vec<NodeId> = doc.children(body).collect(); 938 assert_eq!(text_of_children(&doc, children[0]), "First"); 939 assert_eq!(text_of_children(&doc, children[1]), "Second"); 940 } 941 942 #[test] 943 fn nested_div_elements() { 944 let doc = parse_html("<div><div>inner</div></div>"); 945 let root = doc.root(); 946 let html = doc.children(root).next().unwrap(); 947 let body = doc.children(html).nth(1).unwrap(); 948 949 let outer_div = doc.children(body).next().unwrap(); 950 assert_eq!(doc.tag_name(outer_div), Some("div")); 951 952 let inner_div = doc.children(outer_div).next().unwrap(); 953 assert_eq!(doc.tag_name(inner_div), Some("div")); 954 assert_eq!(text_of_children(&doc, inner_div), "inner"); 955 } 956 957 #[test] 958 fn inline_elements_nest_properly() { 959 let doc = parse_html("<p><span><a href=\"#\">link</a></span></p>"); 960 let root = doc.root(); 961 let html = doc.children(root).next().unwrap(); 962 let body = doc.children(html).nth(1).unwrap(); 963 964 let p = doc.children(body).next().unwrap(); 965 let span = doc.children(p).next().unwrap(); 966 assert_eq!(doc.tag_name(span), Some("span")); 967 968 let a = doc.children(span).next().unwrap(); 969 assert_eq!(doc.tag_name(a), Some("a")); 970 assert_eq!(doc.get_attribute(a, "href"), Some("#")); 971 assert_eq!(text_of_children(&doc, a), "link"); 972 } 973 974 #[test] 975 fn headings() { 976 let doc = parse_html("<h1>Title</h1><h2>Subtitle</h2><p>Body text</p>"); 977 let root = doc.root(); 978 let html = doc.children(root).next().unwrap(); 979 let body = doc.children(html).nth(1).unwrap(); 980 981 let tags = child_tags(&doc, body); 982 assert_eq!(tags, vec!["h1", "h2", "p"]); 983 } 984 985 #[test] 986 fn comment_nodes() { 987 let doc = parse_html("<body><!-- a comment --><p>text</p></body>"); 988 let root = doc.root(); 989 let html = doc.children(root).next().unwrap(); 990 let body = 
doc.children(html).nth(1).unwrap(); 991 992 let children: Vec<NodeId> = doc.children(body).collect(); 993 assert!(children.len() >= 2); 994 995 // First child should be a comment. 996 match doc.node_data(children[0]) { 997 NodeData::Comment { data } => assert_eq!(data, " a comment "), 998 other => panic!("expected comment, got {:?}", other), 999 } 1000 } 1001 1002 #[test] 1003 fn pre_element() { 1004 let doc = parse_html("<pre>line 1\nline 2</pre>"); 1005 let root = doc.root(); 1006 let html = doc.children(root).next().unwrap(); 1007 let body = doc.children(html).nth(1).unwrap(); 1008 1009 let pre = doc.children(body).next().unwrap(); 1010 assert_eq!(doc.tag_name(pre), Some("pre")); 1011 assert_eq!(text_of_children(&doc, pre), "line 1\nline 2"); 1012 } 1013 1014 #[test] 1015 fn attributes_preserved() { 1016 let doc = 1017 parse_html("<div id=\"main\" class=\"container\"><a href=\"/page\">link</a></div>"); 1018 let root = doc.root(); 1019 let html = doc.children(root).next().unwrap(); 1020 let body = doc.children(html).nth(1).unwrap(); 1021 1022 let div = doc.children(body).next().unwrap(); 1023 assert_eq!(doc.get_attribute(div, "id"), Some("main")); 1024 assert_eq!(doc.get_attribute(div, "class"), Some("container")); 1025 1026 let a = doc.children(div).next().unwrap(); 1027 assert_eq!(doc.get_attribute(a, "href"), Some("/page")); 1028 } 1029 1030 #[test] 1031 fn empty_document() { 1032 let doc = parse_html(""); 1033 let root = doc.root(); 1034 // Even an empty doc should get html/head/body from EOF handling. 1035 // The tree builder creates implicit elements. 
1036 assert!(doc.children(root).next().is_some()); 1037 } 1038 1039 #[test] 1040 fn just_text() { 1041 let doc = parse_html("Hello, world!"); 1042 let root = doc.root(); 1043 let html = doc.children(root).next().unwrap(); 1044 let body = doc.children(html).nth(1).unwrap(); 1045 1046 assert_eq!(text_of_children(&doc, body), "Hello, world!"); 1047 } 1048 1049 #[test] 1050 fn heading_closes_open_p() { 1051 let doc = parse_html("<p>text<h1>heading</h1>"); 1052 let root = doc.root(); 1053 let html = doc.children(root).next().unwrap(); 1054 let body = doc.children(html).nth(1).unwrap(); 1055 1056 // <p> should be closed by <h1>, so they're siblings. 1057 let tags = child_tags(&doc, body); 1058 assert_eq!(tags, vec!["p", "h1"]); 1059 } 1060 1061 #[test] 1062 fn self_closing_void_elements() { 1063 let doc = parse_html("<p>before<br/>after</p>"); 1064 let root = doc.root(); 1065 let html = doc.children(root).next().unwrap(); 1066 let body = doc.children(html).nth(1).unwrap(); 1067 let p = doc.children(body).next().unwrap(); 1068 1069 let children: Vec<NodeId> = doc.children(p).collect(); 1070 assert_eq!(children.len(), 3); 1071 assert_eq!(doc.tag_name(children[1]), Some("br")); 1072 } 1073 1074 #[test] 1075 fn doctype_is_handled() { 1076 let doc = parse_html("<!DOCTYPE html><html><body></body></html>"); 1077 let root = doc.root(); 1078 let html = doc.children(root).next().unwrap(); 1079 assert_eq!(doc.tag_name(html), Some("html")); 1080 } 1081 1082 #[test] 1083 fn tree_builder_step_by_step() { 1084 let mut builder = TreeBuilder::new(); 1085 builder.process_token(Token::Doctype { 1086 name: Some("html".into()), 1087 public_id: None, 1088 system_id: None, 1089 force_quirks: false, 1090 }); 1091 builder.process_token(Token::StartTag { 1092 name: "html".into(), 1093 attributes: vec![], 1094 self_closing: false, 1095 }); 1096 builder.process_token(Token::StartTag { 1097 name: "head".into(), 1098 attributes: vec![], 1099 self_closing: false, 1100 }); 1101 
builder.process_token(Token::EndTag { 1102 name: "head".into(), 1103 }); 1104 builder.process_token(Token::StartTag { 1105 name: "body".into(), 1106 attributes: vec![], 1107 self_closing: false, 1108 }); 1109 builder.process_token(Token::StartTag { 1110 name: "p".into(), 1111 attributes: vec![], 1112 self_closing: false, 1113 }); 1114 builder.process_token(Token::Character("Hello".into())); 1115 builder.process_token(Token::EndTag { name: "p".into() }); 1116 builder.process_token(Token::EndTag { 1117 name: "body".into(), 1118 }); 1119 builder.process_token(Token::EndTag { 1120 name: "html".into(), 1121 }); 1122 builder.process_token(Token::Eof); 1123 1124 let doc = builder.finish(); 1125 let root = doc.root(); 1126 let html = doc.children(root).next().unwrap(); 1127 assert_eq!(doc.tag_name(html), Some("html")); 1128 1129 let body = doc.children(html).nth(1).unwrap(); 1130 let p = doc.children(body).next().unwrap(); 1131 assert_eq!(text_of_children(&doc, p), "Hello"); 1132 } 1133 1134 #[test] 1135 fn multiple_text_children_merge() { 1136 // When consecutive character tokens arrive, they should merge. 1137 let mut builder = TreeBuilder::new(); 1138 builder.process_token(Token::StartTag { 1139 name: "p".into(), 1140 attributes: vec![], 1141 self_closing: false, 1142 }); 1143 builder.process_token(Token::Character("Hello ".into())); 1144 builder.process_token(Token::Character("world".into())); 1145 builder.process_token(Token::EndTag { name: "p".into() }); 1146 builder.process_token(Token::Eof); 1147 1148 let doc = builder.finish(); 1149 let root = doc.root(); 1150 let html = doc.children(root).next().unwrap(); 1151 let body = doc.children(html).nth(1).unwrap(); 1152 let p = doc.children(body).next().unwrap(); 1153 1154 // Should be a single text node. 1155 let children: Vec<NodeId> = doc.children(p).collect(); 1156 assert_eq!(children.len(), 1); 1157 assert_eq!(doc.text_content(children[0]), Some("Hello world")); 1158 } 1159}