web engine - experimental web browser

Merge branch 'tree-builder': HTML tree builder

+1163 -1
+4 -1
crates/html/src/lib.rs
··· 1 1 //! HTML5 tokenizer and tree builder. 2 2 //! 3 - //! Implements the WHATWG HTML5 tokenizer state machine (§13.2.5). 3 + //! Implements the WHATWG HTML5 tokenizer state machine (§13.2.5) 4 + //! and a simplified tree builder for constructing DOM trees from tokens. 4 5 5 6 mod entities; 6 7 mod tokenizer; 8 + mod tree_builder; 7 9 8 10 pub use tokenizer::Tokenizer; 11 + pub use tree_builder::{parse_html, TreeBuilder}; 9 12 10 13 /// A token emitted by the HTML tokenizer. 11 14 #[derive(Debug, Clone, PartialEq)]
+1159
crates/html/src/tree_builder.rs
··· 1 + //! HTML tree builder: construct a DOM tree from tokenizer output. 2 + //! 3 + //! Implements a simplified subset of the WHATWG HTML5 tree construction 4 + //! algorithm for Phase 3 of the browser engine. 5 + 6 + use we_dom::{Document, NodeId}; 7 + 8 + use crate::{Token, Tokenizer}; 9 + 10 + /// Insertion modes for the tree builder state machine. 11 + #[derive(Debug, Clone, Copy, PartialEq)] 12 + enum InsertionMode { 13 + Initial, 14 + BeforeHtml, 15 + BeforeHead, 16 + InHead, 17 + Text, 18 + AfterHead, 19 + InBody, 20 + AfterBody, 21 + AfterAfterBody, 22 + } 23 + 24 + /// Returns true if the given tag name is a void element (self-closing, no end tag). 25 + fn is_void_element(tag: &str) -> bool { 26 + matches!( 27 + tag, 28 + "area" 29 + | "base" 30 + | "br" 31 + | "col" 32 + | "embed" 33 + | "hr" 34 + | "img" 35 + | "input" 36 + | "link" 37 + | "meta" 38 + | "param" 39 + | "source" 40 + | "track" 41 + | "wbr" 42 + ) 43 + } 44 + 45 + /// HTML tree builder that processes tokens and constructs a DOM tree. 46 + pub struct TreeBuilder { 47 + document: Document, 48 + /// Stack of open elements (the current nesting context). 49 + open_elements: Vec<NodeId>, 50 + head_element: Option<NodeId>, 51 + body_element: Option<NodeId>, 52 + insertion_mode: InsertionMode, 53 + /// Original insertion mode, saved when switching to Text mode. 54 + original_insertion_mode: Option<InsertionMode>, 55 + /// Pending text for the Text insertion mode (e.g., inside `<title>`). 56 + pending_text: String, 57 + } 58 + 59 + impl TreeBuilder { 60 + /// Create a new tree builder with an empty document. 61 + pub fn new() -> Self { 62 + TreeBuilder { 63 + document: Document::new(), 64 + open_elements: Vec::new(), 65 + head_element: None, 66 + body_element: None, 67 + insertion_mode: InsertionMode::Initial, 68 + original_insertion_mode: None, 69 + pending_text: String::new(), 70 + } 71 + } 72 + 73 + /// Process a single token, updating the DOM tree. 74 + pub fn process_token(&mut self, token: Token) { 75 + match self.insertion_mode { 76 + InsertionMode::Initial => self.handle_initial(token), 77 + InsertionMode::BeforeHtml => self.handle_before_html(token), 78 + InsertionMode::BeforeHead => self.handle_before_head(token), 79 + InsertionMode::InHead => self.handle_in_head(token), 80 + InsertionMode::Text => self.handle_text(token), 81 + InsertionMode::AfterHead => self.handle_after_head(token), 82 + InsertionMode::InBody => self.handle_in_body(token), 83 + InsertionMode::AfterBody => self.handle_after_body(token), 84 + InsertionMode::AfterAfterBody => self.handle_after_after_body(token), 85 + } 86 + } 87 + 88 + /// Finish building and return the constructed DOM document. 89 + pub fn finish(self) -> Document { 90 + self.document 91 + } 92 + 93 + // --- Insertion mode handlers --- 94 + 95 + fn handle_initial(&mut self, token: Token) { 96 + match token { 97 + Token::Doctype { .. } => { 98 + // For Phase 3, we just acknowledge the DOCTYPE and move on. 99 + self.insertion_mode = InsertionMode::BeforeHtml; 100 + } 101 + Token::Comment(data) => { 102 + let comment = self.document.create_comment(&data); 103 + let root = self.document.root(); 104 + self.document.append_child(root, comment); 105 + } 106 + Token::Character(ref s) if s.chars().all(|c| c.is_ascii_whitespace()) => { 107 + // Ignore whitespace in Initial mode. 108 + } 109 + _ => { 110 + // Anything else: switch to BeforeHtml and reprocess. 111 + self.insertion_mode = InsertionMode::BeforeHtml; 112 + self.handle_before_html(token); 113 + } 114 + } 115 + } 116 + 117 + fn handle_before_html(&mut self, token: Token) { 118 + match token { 119 + Token::Doctype { .. } => { /* ignore */ } 120 + Token::Comment(data) => { 121 + let comment = self.document.create_comment(&data); 122 + let root = self.document.root(); 123 + self.document.append_child(root, comment); 124 + } 125 + Token::Character(ref s) if s.chars().all(|c| c.is_ascii_whitespace()) => { 126 + // Ignore whitespace. 127 + } 128 + Token::StartTag { ref name, .. } if name == "html" => { 129 + let html = self.create_element_from_token(&token); 130 + let root = self.document.root(); 131 + self.document.append_child(root, html); 132 + self.open_elements.push(html); 133 + self.insertion_mode = InsertionMode::BeforeHead; 134 + } 135 + Token::EndTag { ref name } 136 + if name != "head" && name != "body" && name != "html" && name != "br" => 137 + { 138 + // Parse error, ignore. 139 + } 140 + _ => { 141 + // Create an implicit <html> element. 142 + let html = self.document.create_element("html"); 143 + let root = self.document.root(); 144 + self.document.append_child(root, html); 145 + self.open_elements.push(html); 146 + self.insertion_mode = InsertionMode::BeforeHead; 147 + self.handle_before_head(token); 148 + } 149 + } 150 + } 151 + 152 + fn handle_before_head(&mut self, token: Token) { 153 + match token { 154 + Token::Character(ref s) if s.chars().all(|c| c.is_ascii_whitespace()) => { 155 + // Ignore whitespace. 156 + } 157 + Token::Comment(data) => { 158 + self.insert_comment(&data); 159 + } 160 + Token::Doctype { .. } => { /* ignore */ } 161 + Token::StartTag { ref name, .. } if name == "html" => { 162 + // Process as if InBody. 163 + self.handle_in_body(token); 164 + } 165 + Token::StartTag { ref name, .. } if name == "head" => { 166 + let head = self.create_element_from_token(&token); 167 + self.insert_node(head); 168 + self.open_elements.push(head); 169 + self.head_element = Some(head); 170 + self.insertion_mode = InsertionMode::InHead; 171 + } 172 + Token::EndTag { ref name } 173 + if name != "head" && name != "body" && name != "html" && name != "br" => 174 + { 175 + // Parse error, ignore. 176 + } 177 + _ => { 178 + // Implied <head>. 179 + let head = self.document.create_element("head"); 180 + self.insert_node(head); 181 + self.open_elements.push(head); 182 + self.head_element = Some(head); 183 + self.insertion_mode = InsertionMode::InHead; 184 + self.handle_in_head(token); 185 + } 186 + } 187 + } 188 + 189 + fn handle_in_head(&mut self, token: Token) { 190 + match token { 191 + Token::Character(ref s) if s.chars().all(|c| c.is_ascii_whitespace()) => { 192 + self.insert_text(s); 193 + } 194 + Token::Comment(data) => { 195 + self.insert_comment(&data); 196 + } 197 + Token::Doctype { .. } => { /* ignore */ } 198 + Token::StartTag { ref name, .. } if name == "title" => { 199 + let elem = self.create_element_from_token(&token); 200 + self.insert_node(elem); 201 + self.open_elements.push(elem); 202 + self.original_insertion_mode = Some(self.insertion_mode); 203 + self.insertion_mode = InsertionMode::Text; 204 + } 205 + Token::StartTag { ref name, .. } 206 + if name == "style" || name == "script" || name == "noscript" => 207 + { 208 + let elem = self.create_element_from_token(&token); 209 + self.insert_node(elem); 210 + self.open_elements.push(elem); 211 + self.original_insertion_mode = Some(self.insertion_mode); 212 + self.insertion_mode = InsertionMode::Text; 213 + } 214 + Token::StartTag { ref name, .. } if name == "meta" || name == "link" => { 215 + let elem = self.create_element_from_token(&token); 216 + self.insert_node(elem); 217 + // Void elements: don't push onto stack. 218 + } 219 + Token::StartTag { ref name, .. } if name == "head" => { 220 + // Ignore duplicate <head>. 221 + } 222 + Token::EndTag { ref name } if name == "head" => { 223 + self.pop_until("head"); 224 + self.insertion_mode = InsertionMode::AfterHead; 225 + } 226 + Token::EndTag { ref name } if name != "body" && name != "html" && name != "br" => { 227 + // Parse error, ignore. 228 + } 229 + _ => { 230 + // Pop <head> and switch to AfterHead, then reprocess. 231 + self.pop_until("head"); 232 + self.insertion_mode = InsertionMode::AfterHead; 233 + self.handle_after_head(token); 234 + } 235 + } 236 + } 237 + 238 + fn handle_text(&mut self, token: Token) { 239 + match token { 240 + Token::Character(s) => { 241 + self.pending_text.push_str(&s); 242 + } 243 + Token::EndTag { .. } => { 244 + // Flush pending text. 245 + if !self.pending_text.is_empty() { 246 + let text = self.pending_text.clone(); 247 + self.pending_text.clear(); 248 + self.insert_text(&text); 249 + } 250 + // Pop the element (e.g., <title>). 251 + self.open_elements.pop(); 252 + self.insertion_mode = self 253 + .original_insertion_mode 254 + .unwrap_or(InsertionMode::InBody); 255 + self.original_insertion_mode = None; 256 + } 257 + Token::Eof => { 258 + // Flush pending text. 259 + if !self.pending_text.is_empty() { 260 + let text = self.pending_text.clone(); 261 + self.pending_text.clear(); 262 + self.insert_text(&text); 263 + } 264 + self.open_elements.pop(); 265 + self.insertion_mode = self 266 + .original_insertion_mode 267 + .unwrap_or(InsertionMode::InBody); 268 + self.original_insertion_mode = None; 269 + self.process_token(Token::Eof); 270 + } 271 + _ => {} 272 + } 273 + } 274 + 275 + fn handle_after_head(&mut self, token: Token) { 276 + match token { 277 + Token::Character(ref s) if s.chars().all(|c| c.is_ascii_whitespace()) => { 278 + self.insert_text(s); 279 + } 280 + Token::Comment(data) => { 281 + self.insert_comment(&data); 282 + } 283 + Token::Doctype { .. } => { /* ignore */ } 284 + Token::StartTag { ref name, .. } if name == "html" => { 285 + self.handle_in_body(token); 286 + } 287 + Token::StartTag { ref name, .. } if name == "body" => { 288 + let body = self.create_element_from_token(&token); 289 + self.insert_node(body); 290 + self.open_elements.push(body); 291 + self.body_element = Some(body); 292 + self.insertion_mode = InsertionMode::InBody; 293 + } 294 + Token::StartTag { ref name, .. } if name == "head" => { 295 + // Ignore. 296 + } 297 + Token::EndTag { ref name } if name != "body" && name != "html" && name != "br" => { 298 + // Ignore. 299 + } 300 + _ => { 301 + // Implied <body>. 302 + let body = self.document.create_element("body"); 303 + self.insert_node(body); 304 + self.open_elements.push(body); 305 + self.body_element = Some(body); 306 + self.insertion_mode = InsertionMode::InBody; 307 + self.handle_in_body(token); 308 + } 309 + } 310 + } 311 + 312 + fn handle_in_body(&mut self, token: Token) { 313 + match token { 314 + Token::Character(s) => { 315 + self.insert_text(&s); 316 + } 317 + Token::Comment(data) => { 318 + self.insert_comment(&data); 319 + } 320 + Token::Doctype { .. } => { /* ignore */ } 321 + Token::StartTag { ref name, .. } if name == "html" => { 322 + // Merge attributes onto existing <html> element. 323 + if let Token::StartTag { attributes, .. } = &token { 324 + if let Some(&html_id) = self.open_elements.first() { 325 + for (attr_name, attr_value) in attributes { 326 + if self.document.get_attribute(html_id, attr_name).is_none() { 327 + self.document.set_attribute(html_id, attr_name, attr_value); 328 + } 329 + } 330 + } 331 + } 332 + } 333 + Token::StartTag { ref name, .. } 334 + if name == "body" 335 + || name == "head" 336 + || name == "title" 337 + || name == "style" 338 + || name == "script" => 339 + { 340 + match name.as_str() { 341 + "body" => { 342 + // Ignore duplicate <body>. 343 + } 344 + "head" => { 345 + // Ignore <head> in body. 346 + } 347 + _ => { 348 + // title/style/script: process using InHead rules 349 + self.handle_in_head(token); 350 + } 351 + } 352 + } 353 + Token::StartTag { ref name, .. } 354 + if name == "p" 355 + || name == "div" 356 + || name == "h1" 357 + || name == "h2" 358 + || name == "h3" 359 + || name == "h4" 360 + || name == "h5" 361 + || name == "h6" 362 + || name == "pre" 363 + || name == "blockquote" 364 + || name == "ul" 365 + || name == "ol" 366 + || name == "li" => 367 + { 368 + // If there's a <p> in button scope, close it first. 369 + if self.has_element_in_button_scope("p") { 370 + self.close_p_element(); 371 + } 372 + let elem = self.create_element_from_token(&token); 373 + self.insert_node(elem); 374 + self.open_elements.push(elem); 375 + } 376 + Token::StartTag { ref name, .. } if is_void_element(name) => { 377 + let elem = self.create_element_from_token(&token); 378 + self.insert_node(elem); 379 + // Don't push void elements onto the stack. 380 + } 381 + Token::StartTag { .. } => { 382 + // Generic start tag: create element and push onto stack. 383 + let elem = self.create_element_from_token(&token); 384 + self.insert_node(elem); 385 + self.open_elements.push(elem); 386 + } 387 + Token::EndTag { ref name } if name == "body" => { 388 + if self.has_element_in_scope("body") { 389 + self.insertion_mode = InsertionMode::AfterBody; 390 + } 391 + } 392 + Token::EndTag { ref name } if name == "html" => { 393 + if self.has_element_in_scope("body") { 394 + self.insertion_mode = InsertionMode::AfterBody; 395 + self.handle_after_body(token); 396 + } 397 + } 398 + Token::EndTag { ref name } if name == "p" => { 399 + if !self.has_element_in_button_scope("p") { 400 + // No matching <p>: insert an empty one, then close it. 401 + let p = self.document.create_element("p"); 402 + self.insert_node(p); 403 + self.open_elements.push(p); 404 + } 405 + self.close_p_element(); 406 + } 407 + Token::EndTag { ref name } 408 + if name == "div" 409 + || name == "pre" 410 + || name == "blockquote" 411 + || name == "ul" 412 + || name == "ol" 413 + || name == "li" => 414 + { 415 + if self.has_element_in_scope(name) { 416 + self.generate_implied_end_tags(Some(name)); 417 + self.pop_until(name); 418 + } 419 + } 420 + Token::EndTag { ref name } 421 + if name == "h1" 422 + || name == "h2" 423 + || name == "h3" 424 + || name == "h4" 425 + || name == "h5" 426 + || name == "h6" => 427 + { 428 + if self.has_heading_in_scope() { 429 + self.generate_implied_end_tags(None); 430 + // Pop until we find a heading element. 431 + while let Some(id) = self.open_elements.pop() { 432 + if let Some(tag) = self.document.tag_name(id) { 433 + if matches!(tag, "h1" | "h2" | "h3" | "h4" | "h5" | "h6") { 434 + break; 435 + } 436 + } 437 + } 438 + } 439 + } 440 + Token::EndTag { ref name } => { 441 + // Generic end tag: walk back through open elements. 442 + self.handle_any_other_end_tag(name); 443 + } 444 + Token::Eof => { 445 + // Stop parsing. 446 + } 447 + } 448 + } 449 + 450 + fn handle_after_body(&mut self, token: Token) { 451 + match token { 452 + Token::Character(ref s) if s.chars().all(|c| c.is_ascii_whitespace()) => { 453 + // Process whitespace as in InBody. 454 + self.handle_in_body(token); 455 + } 456 + Token::Comment(data) => { 457 + // Insert as last child of the first element (html). 458 + let comment = self.document.create_comment(&data); 459 + if let Some(&html) = self.open_elements.first() { 460 + self.document.append_child(html, comment); 461 + } 462 + } 463 + Token::Doctype { .. } => { /* ignore */ } 464 + Token::EndTag { ref name } if name == "html" => { 465 + self.insertion_mode = InsertionMode::AfterAfterBody; 466 + } 467 + Token::Eof => { 468 + // Stop parsing. 469 + } 470 + _ => { 471 + // Anything else: switch back to InBody and reprocess. 472 + self.insertion_mode = InsertionMode::InBody; 473 + self.handle_in_body(token); 474 + } 475 + } 476 + } 477 + 478 + fn handle_after_after_body(&mut self, token: Token) { 479 + match token { 480 + Token::Comment(data) => { 481 + let comment = self.document.create_comment(&data); 482 + let root = self.document.root(); 483 + self.document.append_child(root, comment); 484 + } 485 + Token::Doctype { .. } => { /* ignore */ } 486 + Token::Character(ref s) if s.chars().all(|c| c.is_ascii_whitespace()) => { 487 + self.handle_in_body(token); 488 + } 489 + Token::Eof => { 490 + // Stop. 491 + } 492 + _ => { 493 + self.insertion_mode = InsertionMode::InBody; 494 + self.handle_in_body(token); 495 + } 496 + } 497 + } 498 + 499 + // --- Helper methods --- 500 + 501 + /// Create a DOM element from a StartTag token, setting attributes. 502 + fn create_element_from_token(&mut self, token: &Token) -> NodeId { 503 + if let Token::StartTag { 504 + name, attributes, .. 505 + } = token 506 + { 507 + let id = self.document.create_element(name); 508 + for (attr_name, attr_value) in attributes { 509 + self.document.set_attribute(id, attr_name, attr_value); 510 + } 511 + id 512 + } else { 513 + // Should only be called with StartTag tokens. 514 + self.document.create_element("unknown") 515 + } 516 + } 517 + 518 + /// Insert a node at the current insertion point (last open element). 519 + fn insert_node(&mut self, node: NodeId) { 520 + let parent = self 521 + .open_elements 522 + .last() 523 + .copied() 524 + .unwrap_or_else(|| self.document.root()); 525 + self.document.append_child(parent, node); 526 + } 527 + 528 + /// Insert a text node at the current insertion point. 529 + /// If the last child is already a text node, append to it. 530 + fn insert_text(&mut self, data: &str) { 531 + let parent = self 532 + .open_elements 533 + .last() 534 + .copied() 535 + .unwrap_or_else(|| self.document.root()); 536 + 537 + // Try to merge with existing text node. 538 + if let Some(last_child) = self.document.last_child(parent) { 539 + if let we_dom::NodeData::Text { data: ref existing } = 540 + *self.document.node_data(last_child) 541 + { 542 + let mut merged = existing.clone(); 543 + merged.push_str(data); 544 + self.document.set_text_content(last_child, &merged); 545 + return; 546 + } 547 + } 548 + 549 + let text = self.document.create_text(data); 550 + self.document.append_child(parent, text); 551 + } 552 + 553 + /// Insert a comment node at the current insertion point. 554 + fn insert_comment(&mut self, data: &str) { 555 + let comment = self.document.create_comment(data); 556 + self.insert_node(comment); 557 + } 558 + 559 + /// Pop elements from the stack until we find one with the given tag name. 560 + /// The matching element is also popped. 561 + fn pop_until(&mut self, tag_name: &str) { 562 + while let Some(id) = self.open_elements.pop() { 563 + if self.document.tag_name(id) == Some(tag_name) { 564 + return; 565 + } 566 + } 567 + } 568 + 569 + /// Check if the given tag name is "in scope" (simplified). 570 + /// In scope means there's an element with that tag on the stack, 571 + /// and no scope barrier element between it and the top. 572 + fn has_element_in_scope(&self, target: &str) -> bool { 573 + for &id in self.open_elements.iter().rev() { 574 + if let Some(tag) = self.document.tag_name(id) { 575 + if tag == target { 576 + return true; 577 + } 578 + // Scope barrier elements. 579 + if matches!( 580 + tag, 581 + "applet" 582 + | "caption" 583 + | "html" 584 + | "table" 585 + | "td" 586 + | "th" 587 + | "marquee" 588 + | "object" 589 + | "template" 590 + ) { 591 + return false; 592 + } 593 + } 594 + } 595 + false 596 + } 597 + 598 + /// Check if the given tag name is "in button scope". 599 + fn has_element_in_button_scope(&self, target: &str) -> bool { 600 + for &id in self.open_elements.iter().rev() { 601 + if let Some(tag) = self.document.tag_name(id) { 602 + if tag == target { 603 + return true; 604 + } 605 + // Button scope includes all regular scope barriers plus <button>. 606 + if matches!( 607 + tag, 608 + "applet" 609 + | "button" 610 + | "caption" 611 + | "html" 612 + | "table" 613 + | "td" 614 + | "th" 615 + | "marquee" 616 + | "object" 617 + | "template" 618 + ) { 619 + return false; 620 + } 621 + } 622 + } 623 + false 624 + } 625 + 626 + /// Check if any heading element (h1-h6) is in scope. 627 + fn has_heading_in_scope(&self) -> bool { 628 + for &id in self.open_elements.iter().rev() { 629 + if let Some(tag) = self.document.tag_name(id) { 630 + if matches!(tag, "h1" | "h2" | "h3" | "h4" | "h5" | "h6") { 631 + return true; 632 + } 633 + if matches!( 634 + tag, 635 + "applet" 636 + | "caption" 637 + | "html" 638 + | "table" 639 + | "td" 640 + | "th" 641 + | "marquee" 642 + | "object" 643 + | "template" 644 + ) { 645 + return false; 646 + } 647 + } 648 + } 649 + false 650 + } 651 + 652 + /// Close a `<p>` element: generate implied end tags (excluding p), 653 + /// then pop until we find the `<p>`. 654 + fn close_p_element(&mut self) { 655 + self.generate_implied_end_tags(Some("p")); 656 + self.pop_until("p"); 657 + } 658 + 659 + /// Generate implied end tags. If `exclude` is provided, don't generate 660 + /// an end tag for that element. 661 + fn generate_implied_end_tags(&mut self, exclude: Option<&str>) { 662 + loop { 663 + let should_pop = self 664 + .open_elements 665 + .last() 666 + .and_then(|&id| self.document.tag_name(id)) 667 + .map(|tag| { 668 + if let Some(excl) = exclude { 669 + if tag == excl { 670 + return false; 671 + } 672 + } 673 + matches!( 674 + tag, 675 + "dd" | "dt" 676 + | "li" 677 + | "optgroup" 678 + | "option" 679 + | "p" 680 + | "rb" 681 + | "rp" 682 + | "rt" 683 + | "rtc" 684 + ) 685 + }) 686 + .unwrap_or(false); 687 + if should_pop { 688 + self.open_elements.pop(); 689 + } else { 690 + break; 691 + } 692 + } 693 + } 694 + 695 + /// Handle a generic end tag by walking back through open elements 696 + /// using the "any other end tag" algorithm. 697 + fn handle_any_other_end_tag(&mut self, name: &str) { 698 + // Walk backwards through the stack. 699 + let mut i = self.open_elements.len(); 700 + while i > 0 { 701 + i -= 1; 702 + let id = self.open_elements[i]; 703 + if self.document.tag_name(id) == Some(name) { 704 + // Pop everything above and including this element. 705 + self.open_elements.truncate(i); 706 + return; 707 + } 708 + // If this is a "special" element, stop. 709 + if let Some(tag) = self.document.tag_name(id) { 710 + if is_special_element(tag) { 711 + return; 712 + } 713 + } 714 + } 715 + } 716 + } 717 + 718 + impl Default for TreeBuilder { 719 + fn default() -> Self { 720 + Self::new() 721 + } 722 + } 723 + 724 + /// Returns true if the tag is a "special" element per the HTML spec. 725 + fn is_special_element(tag: &str) -> bool { 726 + matches!( 727 + tag, 728 + "address" 729 + | "applet" 730 + | "area" 731 + | "article" 732 + | "aside" 733 + | "base" 734 + | "basefont" 735 + | "bgsound" 736 + | "blockquote" 737 + | "body" 738 + | "br" 739 + | "button" 740 + | "caption" 741 + | "center" 742 + | "col" 743 + | "colgroup" 744 + | "dd" 745 + | "details" 746 + | "dir" 747 + | "div" 748 + | "dl" 749 + | "dt" 750 + | "embed" 751 + | "fieldset" 752 + | "figcaption" 753 + | "figure" 754 + | "footer" 755 + | "form" 756 + | "frame" 757 + | "frameset" 758 + | "h1" 759 + | "h2" 760 + | "h3" 761 + | "h4" 762 + | "h5" 763 + | "h6" 764 + | "head" 765 + | "header" 766 + | "hgroup" 767 + | "hr" 768 + | "html" 769 + | "iframe" 770 + | "img" 771 + | "input" 772 + | "li" 773 + | "link" 774 + | "listing" 775 + | "main" 776 + | "marquee" 777 + | "menu" 778 + | "meta" 779 + | "nav" 780 + | "noembed" 781 + | "noframes" 782 + | "noscript" 783 + | "object" 784 + | "ol" 785 + | "p" 786 + | "param" 787 + | "plaintext" 788 + | "pre" 789 + | "script" 790 + | "section" 791 + | "select" 792 + | "source" 793 + | "style" 794 + | "summary" 795 + | "table" 796 + | "tbody" 797 + | "td" 798 + | "template" 799 + | "textarea" 800 + | "tfoot" 801 + | "th" 802 + | "thead" 803 + | "title" 804 + | "tr" 805 + | "track" 806 + | "ul" 807 + | "wbr" 808 + | "xmp" 809 + ) 810 + } 811 + 812 + /// Parse an HTML string into a DOM document. 813 + /// 814 + /// This is a convenience function that tokenizes the input and builds 815 + /// a DOM tree using the tree builder. 816 + pub fn parse_html(input: &str) -> Document { 817 + let mut builder = TreeBuilder::new(); 818 + let mut tokenizer = Tokenizer::new(input); 819 + loop { 820 + let token = tokenizer.next_token(); 821 + let is_eof = token == Token::Eof; 822 + builder.process_token(token); 823 + if is_eof { 824 + break; 825 + } 826 + } 827 + builder.finish() 828 + } 829 + 830 + #[cfg(test)] 831 + mod tests { 832 + use super::*; 833 + use we_dom::NodeData; 834 + 835 + /// Helper: collect tag names of direct children of a node. 836 + fn child_tags(doc: &Document, node: NodeId) -> Vec<String> { 837 + doc.children(node) 838 + .filter_map(|id| doc.tag_name(id).map(String::from)) 839 + .collect() 840 + } 841 + 842 + /// Helper: get the text content of all text node children, concatenated. 843 + fn text_of_children(doc: &Document, node: NodeId) -> String { 844 + let mut result = String::new(); 845 + for child in doc.children(node) { 846 + if let Some(text) = doc.text_content(child) { 847 + result.push_str(text); 848 + } 849 + } 850 + result 851 + } 852 + 853 + #[test] 854 + fn parse_full_document() { 855 + let doc = parse_html( 856 + "<!DOCTYPE html><html><head><title>Test</title></head><body><p>Hello</p></body></html>", 857 + ); 858 + let root = doc.root(); 859 + 860 + // Root should have one child: <html> 861 + let html_children: Vec<NodeId> = doc.children(root).collect(); 862 + assert_eq!(html_children.len(), 1); 863 + let html = html_children[0]; 864 + assert_eq!(doc.tag_name(html), Some("html")); 865 + 866 + // <html> should have <head> and <body> 867 + let tags = child_tags(&doc, html); 868 + assert_eq!(tags, vec!["head", "body"]); 869 + 870 + // <head> should have <title> 871 + let head = doc.children(html).next().unwrap(); 872 + let head_tags = child_tags(&doc, head); 873 + assert_eq!(head_tags, vec!["title"]); 874 + 875 + // <title> should contain "Test" 876 + let title = doc.children(head).next().unwrap(); 877 + assert_eq!(text_of_children(&doc, title), "Test"); 878 + 879 + // <body> should have <p> 880 + let body = doc.children(html).nth(1).unwrap(); 881 + let body_tags = child_tags(&doc, body); 882 + assert_eq!(body_tags, vec!["p"]); 883 + 884 + // <p> should contain "Hello" 885 + let p = doc.children(body).next().unwrap(); 886 + assert_eq!(text_of_children(&doc, p), "Hello"); 887 + } 888 + 889 + #[test] 890 + fn implicit_html_head_body() { 891 + // Minimal document: just <p>Hello 892 + let doc = parse_html("<p>Hello"); 893 + let root = doc.root(); 894 + 895 + let html: Vec<NodeId> = doc.children(root).collect(); 896 + assert_eq!(html.len(), 1); 897 + assert_eq!(doc.tag_name(html[0]), Some("html")); 898 + 899 + let html_tags = child_tags(&doc, html[0]); 900 + assert_eq!(html_tags, vec!["head", "body"]); 901 + 902 + let body = doc.children(html[0]).nth(1).unwrap(); 903 + let body_tags = child_tags(&doc, body); 904 + assert_eq!(body_tags, vec!["p"]); 905 + 906 + let p = doc.children(body).next().unwrap(); 907 + assert_eq!(text_of_children(&doc, p), "Hello"); 908 + } 909 + 910 + #[test] 911 + fn void_element_br() { 912 + let doc = parse_html("<p>Line 1<br>Line 2</p>"); 913 + let root = doc.root(); 914 + let html = doc.children(root).next().unwrap(); 915 + let body = doc.children(html).nth(1).unwrap(); 916 + let p = doc.children(body).next().unwrap(); 917 + 918 + // <p> should have: text("Line 1"), <br>, text("Line 2") 919 + let children: Vec<NodeId> = doc.children(p).collect(); 920 + assert_eq!(children.len(), 3); 921 + assert_eq!(doc.text_content(children[0]), Some("Line 1")); 922 + assert_eq!(doc.tag_name(children[1]), Some("br")); 923 + assert_eq!(doc.text_content(children[2]), Some("Line 2")); 924 + } 925 + 926 + #[test] 927 + fn p_inside_p_closes_outer() { 928 + let doc = parse_html("<p>First<p>Second"); 929 + let root = doc.root(); 930 + let html = doc.children(root).next().unwrap(); 931 + let body = doc.children(html).nth(1).unwrap(); 932 + 933 + // Should have two sibling <p> elements, not nested. 934 + let body_tags = child_tags(&doc, body); 935 + assert_eq!(body_tags, vec!["p", "p"]); 936 + 937 + let children: Vec<NodeId> = doc.children(body).collect(); 938 + assert_eq!(text_of_children(&doc, children[0]), "First"); 939 + assert_eq!(text_of_children(&doc, children[1]), "Second"); 940 + } 941 + 942 + #[test] 943 + fn nested_div_elements() { 944 + let doc = parse_html("<div><div>inner</div></div>"); 945 + let root = doc.root(); 946 + let html = doc.children(root).next().unwrap(); 947 + let body = doc.children(html).nth(1).unwrap(); 948 + 949 + let outer_div = doc.children(body).next().unwrap(); 950 + assert_eq!(doc.tag_name(outer_div), Some("div")); 951 + 952 + let inner_div = doc.children(outer_div).next().unwrap(); 953 + assert_eq!(doc.tag_name(inner_div), Some("div")); 954 + assert_eq!(text_of_children(&doc, inner_div), "inner"); 955 + } 956 + 957 + #[test] 958 + fn inline_elements_nest_properly() { 959 + let doc = parse_html("<p><span><a href=\"#\">link</a></span></p>"); 960 + let root = doc.root(); 961 + let html = doc.children(root).next().unwrap(); 962 + let body = doc.children(html).nth(1).unwrap(); 963 + 964 + let p = doc.children(body).next().unwrap(); 965 + let span = doc.children(p).next().unwrap(); 966 + assert_eq!(doc.tag_name(span), Some("span")); 967 + 968 + let a = doc.children(span).next().unwrap(); 969 + assert_eq!(doc.tag_name(a), Some("a")); 970 + assert_eq!(doc.get_attribute(a, "href"), Some("#")); 971 + assert_eq!(text_of_children(&doc, a), "link"); 972 + } 973 + 974 + #[test] 975 + fn headings() { 976 + let doc = parse_html("<h1>Title</h1><h2>Subtitle</h2><p>Body text</p>"); 977 + let root = doc.root(); 978 + let html = doc.children(root).next().unwrap(); 979 + let body = doc.children(html).nth(1).unwrap(); 980 + 981 + let tags = child_tags(&doc, body); 982 + assert_eq!(tags, vec!["h1", "h2", "p"]); 983 + } 984 + 985 + #[test] 986 + fn comment_nodes() { 987 + let doc = parse_html("<body><!-- a comment --><p>text</p></body>"); 988 + let root = doc.root(); 989 + let html = doc.children(root).next().unwrap(); 990 + let body = doc.children(html).nth(1).unwrap(); 991 + 992 + let children: Vec<NodeId> = doc.children(body).collect(); 993 + assert!(children.len() >= 2); 994 + 995 + // First child should be a comment. 996 + match doc.node_data(children[0]) { 997 + NodeData::Comment { data } => assert_eq!(data, " a comment "), 998 + other => panic!("expected comment, got {:?}", other), 999 + } 1000 + } 1001 + 1002 + #[test] 1003 + fn pre_element() { 1004 + let doc = parse_html("<pre>line 1\nline 2</pre>"); 1005 + let root = doc.root(); 1006 + let html = doc.children(root).next().unwrap(); 1007 + let body = doc.children(html).nth(1).unwrap(); 1008 + 1009 + let pre = doc.children(body).next().unwrap(); 1010 + assert_eq!(doc.tag_name(pre), Some("pre")); 1011 + assert_eq!(text_of_children(&doc, pre), "line 1\nline 2"); 1012 + } 1013 + 1014 + #[test] 1015 + fn attributes_preserved() { 1016 + let doc = 1017 + parse_html("<div id=\"main\" class=\"container\"><a href=\"/page\">link</a></div>"); 1018 + let root = doc.root(); 1019 + let html = doc.children(root).next().unwrap(); 1020 + let body = doc.children(html).nth(1).unwrap(); 1021 + 1022 + let div = doc.children(body).next().unwrap(); 1023 + assert_eq!(doc.get_attribute(div, "id"), Some("main")); 1024 + assert_eq!(doc.get_attribute(div, "class"), Some("container")); 1025 + 1026 + let a = doc.children(div).next().unwrap(); 1027 + assert_eq!(doc.get_attribute(a, "href"), Some("/page")); 1028 + } 1029 + 1030 + #[test] 1031 + fn empty_document() { 1032 + let doc = parse_html(""); 1033 + let root = doc.root(); 1034 + // Even an empty doc should get html/head/body from EOF handling. 1035 + // The tree builder creates implicit elements. 1036 + assert!(doc.children(root).next().is_some()); 1037 + } 1038 + 1039 + #[test] 1040 + fn just_text() { 1041 + let doc = parse_html("Hello, world!"); 1042 + let root = doc.root(); 1043 + let html = doc.children(root).next().unwrap(); 1044 + let body = doc.children(html).nth(1).unwrap(); 1045 + 1046 + assert_eq!(text_of_children(&doc, body), "Hello, world!"); 1047 + } 1048 + 1049 + #[test] 1050 + fn heading_closes_open_p() { 1051 + let doc = parse_html("<p>text<h1>heading</h1>"); 1052 + let root = doc.root(); 1053 + let html = doc.children(root).next().unwrap(); 1054 + let body = doc.children(html).nth(1).unwrap(); 1055 + 1056 + // <p> should be closed by <h1>, so they're siblings. 1057 + let tags = child_tags(&doc, body); 1058 + assert_eq!(tags, vec!["p", "h1"]); 1059 + } 1060 + 1061 + #[test] 1062 + fn self_closing_void_elements() { 1063 + let doc = parse_html("<p>before<br/>after</p>"); 1064 + let root = doc.root(); 1065 + let html = doc.children(root).next().unwrap(); 1066 + let body = doc.children(html).nth(1).unwrap(); 1067 + let p = doc.children(body).next().unwrap(); 1068 + 1069 + let children: Vec<NodeId> = doc.children(p).collect(); 1070 + assert_eq!(children.len(), 3); 1071 + assert_eq!(doc.tag_name(children[1]), Some("br")); 1072 + } 1073 + 1074 + #[test] 1075 + fn doctype_is_handled() { 1076 + let doc = parse_html("<!DOCTYPE html><html><body></body></html>"); 1077 + let root = doc.root(); 1078 + let html = doc.children(root).next().unwrap(); 1079 + assert_eq!(doc.tag_name(html), Some("html")); 1080 + } 1081 + 1082 + #[test] 1083 + fn tree_builder_step_by_step() { 1084 + let mut builder = TreeBuilder::new(); 1085 + builder.process_token(Token::Doctype { 1086 + name: Some("html".into()), 1087 + public_id: None, 1088 + system_id: None, 1089 + force_quirks: false, 1090 + }); 1091 + builder.process_token(Token::StartTag { 1092 + name: "html".into(), 1093 + attributes: vec![], 1094 + self_closing: false, 1095 + }); 1096 + builder.process_token(Token::StartTag { 1097 + name: "head".into(), 1098 + attributes: vec![], 1099 + self_closing: false, 1100 + }); 1101 + builder.process_token(Token::EndTag { 1102 + name: "head".into(), 1103 + }); 1104 + builder.process_token(Token::StartTag { 1105 + name: "body".into(), 1106 + attributes: vec![], 1107 + self_closing: false, 1108 + }); 1109 + builder.process_token(Token::StartTag { 1110 + name: "p".into(), 1111 + attributes: vec![], 1112 + self_closing: false, 1113 + }); 1114 + builder.process_token(Token::Character("Hello".into())); 1115 + builder.process_token(Token::EndTag { name: "p".into() }); 1116 + builder.process_token(Token::EndTag { 1117 + name: "body".into(), 1118 + }); 1119 + builder.process_token(Token::EndTag { 1120 + name: "html".into(), 1121 + }); 1122 + builder.process_token(Token::Eof); 1123 + 1124 + let doc = builder.finish(); 1125 + let root = doc.root(); 1126 + let html = doc.children(root).next().unwrap(); 1127 + assert_eq!(doc.tag_name(html), Some("html")); 1128 + 1129 + let body = doc.children(html).nth(1).unwrap(); 1130 + let p = doc.children(body).next().unwrap(); 1131 + assert_eq!(text_of_children(&doc, p), "Hello"); 1132 + } 1133 + 1134 + #[test] 1135 + fn multiple_text_children_merge() { 1136 + // When consecutive character tokens arrive, they should merge. 1137 + let mut builder = TreeBuilder::new(); 1138 + builder.process_token(Token::StartTag { 1139 + name: "p".into(), 1140 + attributes: vec![], 1141 + self_closing: false, 1142 + }); 1143 + builder.process_token(Token::Character("Hello ".into())); 1144 + builder.process_token(Token::Character("world".into())); 1145 + builder.process_token(Token::EndTag { name: "p".into() }); 1146 + builder.process_token(Token::Eof); 1147 + 1148 + let doc = builder.finish(); 1149 + let root = doc.root(); 1150 + let html = doc.children(root).next().unwrap(); 1151 + let body = doc.children(html).nth(1).unwrap(); 1152 + let p = doc.children(body).next().unwrap(); 1153 + 1154 + // Should be a single text node. 1155 + let children: Vec<NodeId> = doc.children(p).collect(); 1156 + assert_eq!(children.len(), 1); 1157 + assert_eq!(doc.text_content(children[0]), Some("Hello world")); 1158 + } 1159 + }