web engine - experimental web browser

Implement CSS tokenizer per CSS Syntax Module Level 3

Full tokenizer state machine producing all CSS token types: ident, function,
at-keyword, hash, string, url, number, percentage, dimension, whitespace,
delimiters, CDO/CDC. Handles escape sequences, comments, number consumption
(integer/float/exponent), url tokens, and input preprocessing (CRLF
normalization, null replacement). 43 unit tests covering all token types,
edge cases, and real CSS patterns.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

+983
+2
crates/css/src/lib.rs
··· 1 1 //! CSS tokenizer, parser, and CSSOM. 2 + 3 + pub mod tokenizer;
+981
crates/css/src/tokenizer.rs
//! CSS tokenizer per CSS Syntax Module Level 3 §4.
//!
//! Consumes a stream of code points and produces CSS tokens.

/// A CSS token produced by the tokenizer.
#[derive(Debug, Clone, PartialEq)]
pub enum Token {
    /// An identifier such as `color` or `--custom`.
    Ident(String),
    /// A name immediately followed by `(`; the paren is consumed and not part of the name.
    Function(String),
    /// `@` followed by a name; the `@` is not part of the name.
    AtKeyword(String),
    /// `#` followed by a name; the `#` is not part of the name.
    Hash(String, HashType),
    /// A quoted string with escapes already resolved.
    String(String),
    /// A string cut short by an unescaped newline.
    BadString,
    /// An unquoted `url(...)` with escapes already resolved.
    Url(String),
    /// An unquoted `url(...)` containing a disallowed character.
    BadUrl,
    /// A bare number.
    Number(f64, NumericType),
    /// A number immediately followed by `%`.
    Percentage(f64),
    /// A number immediately followed by a unit name.
    Dimension(f64, NumericType, String),
    /// One or more consecutive whitespace characters.
    Whitespace,
    Colon,
    Semicolon,
    Comma,
    LeftBracket,
    RightBracket,
    LeftParen,
    RightParen,
    LeftBrace,
    RightBrace,
    /// Any single character that forms no other token.
    Delim(char),
    /// `<!--`
    Cdo,
    /// `-->`
    Cdc,
    /// End of input.
    Eof,
}

/// Whether a `<hash-token>` is "id" (valid identifier) or "unrestricted".
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum HashType {
    Id,
    Unrestricted,
}

/// Whether a number is integer or number (float).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum NumericType {
    Integer,
    Number,
}

/// CSS tokenizer state machine.
///
/// Holds the preprocessed input and a cursor into it. Tokens are pulled either
/// with [`Tokenizer::next_token`] (which yields [`Token::Eof`] at the end) or
/// through the [`Iterator`] implementation (which stops before `Eof`).
#[derive(Debug, Clone)]
pub struct Tokenizer {
    /// Preprocessed input. It never contains `'\0'`, `'\r'` or `'\x0C'`, which
    /// is what lets `peek` use `'\0'` as an end-of-input sentinel.
    input: Vec<char>,
    /// Index of the next unconsumed character in `input`.
    pos: usize,
}

impl Tokenizer {
    /// Create a tokenizer over `input`, applying the preprocessing of §3.3:
    /// `\r\n`, `\r` and form feed become `\n`, and NUL becomes U+FFFD.
    pub fn new(input: &str) -> Self {
        // `input.len()` is a byte count, which is an upper bound on the char count.
        let mut chars = Vec::with_capacity(input.len());
        let mut raw = input.chars().peekable();
        while let Some(c) = raw.next() {
            match c {
                '\r' => {
                    // A CRLF pair collapses to a single newline.
                    if raw.peek() == Some(&'\n') {
                        raw.next();
                    }
                    chars.push('\n');
                }
                '\x0C' => chars.push('\n'),
                '\0' => chars.push('\u{FFFD}'),
                other => chars.push(other),
            }
        }
        Self {
            input: chars,
            pos: 0,
        }
    }

    /// Tokenize the entire input into a list of tokens (excluding EOF).
    pub fn tokenize(input: &str) -> Vec<Token> {
        Self::new(input).collect()
    }

    /// Consume and return the next token.
    ///
    /// Comments are skipped first. Once the input is exhausted every call
    /// returns [`Token::Eof`].
    pub fn next_token(&mut self) -> Token {
        self.consume_comments();
        self.consume_token()
    }

    /// The next unconsumed character, or `'\0'` at end of input.
    fn peek(&self) -> char {
        self.peek_at(0)
    }

    /// The character `offset` positions ahead, or `'\0'` past the end of input.
    fn peek_at(&self, offset: usize) -> char {
        self.input.get(self.pos + offset).copied().unwrap_or('\0')
    }

    /// Consume and return the next character; at end of input returns `'\0'`
    /// and leaves the cursor where it is.
    fn advance(&mut self) -> char {
        let c = self.peek();
        if !self.is_eof() {
            self.pos += 1;
        }
        c
    }

    fn is_eof(&self) -> bool {
        self.pos >= self.input.len()
    }

    /// Consume one character and return `token`.
    fn consume_simple(&mut self, token: Token) -> Token {
        self.advance();
        token
    }

    /// Skip any run of back-to-back `/* ... */` comments. An unterminated
    /// comment swallows the rest of the input.
    fn consume_comments(&mut self) {
        while self.peek() == '/' && self.peek_at(1) == '*' {
            self.pos += 2;
            loop {
                if self.is_eof() {
                    return;
                }
                if self.peek() == '*' && self.peek_at(1) == '/' {
                    self.pos += 2;
                    break;
                }
                self.pos += 1;
            }
        }
    }

    /// Dispatch on the next character and consume exactly one token (§4.3.1).
    fn consume_token(&mut self) -> Token {
        if self.is_eof() {
            return Token::Eof;
        }

        let c = self.peek();
        match c {
            c if is_whitespace(c) => {
                self.consume_whitespace();
                Token::Whitespace
            }
            '"' | '\'' => self.consume_string(c),
            '#' => {
                self.advance();
                if is_name_char(self.peek()) || self.starts_valid_escape() {
                    // The type flag must be decided before the name is consumed.
                    let hash_type = if self.would_start_ident() {
                        HashType::Id
                    } else {
                        HashType::Unrestricted
                    };
                    Token::Hash(self.consume_name(), hash_type)
                } else {
                    Token::Delim('#')
                }
            }
            '(' => self.consume_simple(Token::LeftParen),
            ')' => self.consume_simple(Token::RightParen),
            ',' => self.consume_simple(Token::Comma),
            ':' => self.consume_simple(Token::Colon),
            ';' => self.consume_simple(Token::Semicolon),
            '[' => self.consume_simple(Token::LeftBracket),
            ']' => self.consume_simple(Token::RightBracket),
            '{' => self.consume_simple(Token::LeftBrace),
            '}' => self.consume_simple(Token::RightBrace),
            '+' | '.' => {
                if self.starts_number() {
                    self.consume_numeric()
                } else {
                    self.consume_simple(Token::Delim(c))
                }
            }
            '-' => {
                // Order matters: number, then `-->`, then identifier.
                if self.starts_number() {
                    self.consume_numeric()
                } else if self.peek_at(1) == '-' && self.peek_at(2) == '>' {
                    self.pos += 3;
                    Token::Cdc
                } else if self.would_start_ident() {
                    self.consume_ident_like()
                } else {
                    self.consume_simple(Token::Delim('-'))
                }
            }
            '<' => {
                if self.peek_at(1) == '!' && self.peek_at(2) == '-' && self.peek_at(3) == '-' {
                    self.pos += 4;
                    Token::Cdo
                } else {
                    self.consume_simple(Token::Delim('<'))
                }
            }
            '@' => {
                self.advance();
                if self.would_start_ident() {
                    Token::AtKeyword(self.consume_name())
                } else {
                    Token::Delim('@')
                }
            }
            '\\' => {
                if self.starts_valid_escape() {
                    self.consume_ident_like()
                } else {
                    self.consume_simple(Token::Delim('\\'))
                }
            }
            c if c.is_ascii_digit() => self.consume_numeric(),
            c if is_name_start_char(c) => self.consume_ident_like(),
            _ => self.consume_simple(Token::Delim(c)),
        }
    }

    /// Consume as much whitespace as possible.
    fn consume_whitespace(&mut self) {
        // `peek` yields '\0' at end of input, which is not whitespace.
        while is_whitespace(self.peek()) {
            self.advance();
        }
    }

    /// Consume a string delimited by `ending`; the cursor is on the opening quote.
    ///
    /// Reaching end of input returns what was collected so far. An unescaped
    /// newline yields [`Token::BadString`] and is left in the input.
    fn consume_string(&mut self, ending: char) -> Token {
        self.advance(); // consume opening quote
        let mut value = String::new();
        while !self.is_eof() {
            match self.advance() {
                c if c == ending => return Token::String(value),
                '\n' => {
                    // Unescaped newline in string → bad string
                    self.pos -= 1; // reconsume
                    return Token::BadString;
                }
                '\\' => {
                    if self.is_eof() {
                        // Backslash at end of input: do nothing
                    } else if self.peek() == '\n' {
                        self.advance(); // consume newline (line continuation)
                    } else {
                        value.push(self.consume_escaped_char());
                    }
                }
                c => value.push(c),
            }
        }
        Token::String(value)
    }

    /// Consume an escaped code point; the backslash has already been consumed.
    ///
    /// Accepts one to six hex digits plus one optional trailing whitespace
    /// character, or any single non-hex character taken literally. Returns
    /// U+FFFD at end of input, for zero, for surrogates and for values above
    /// U+10FFFF.
    fn consume_escaped_char(&mut self) -> char {
        if self.is_eof() {
            return '\u{FFFD}';
        }
        let c = self.advance();
        let mut code_point = match c.to_digit(16) {
            Some(digit) => digit,
            None => return c,
        };
        // At most six hex digits, so the value cannot overflow a u32.
        let mut digits = 1;
        while digits < 6 {
            match self.peek().to_digit(16) {
                Some(digit) => {
                    code_point = code_point * 16 + digit;
                    self.advance();
                    digits += 1;
                }
                None => break,
            }
        }
        // Consume optional trailing whitespace
        if is_whitespace(self.peek()) {
            self.advance();
        }
        if code_point == 0 {
            '\u{FFFD}'
        } else {
            // `from_u32` rejects surrogates and values above U+10FFFF.
            char::from_u32(code_point).unwrap_or('\u{FFFD}')
        }
    }

    /// Whether the next two characters are a backslash not followed by a newline.
    fn starts_valid_escape(&self) -> bool {
        self.starts_valid_escape_at(0)
    }

    fn starts_valid_escape_at(&self, offset: usize) -> bool {
        self.peek_at(offset) == '\\' && self.peek_at(offset + 1) != '\n'
    }

    /// Check if the next chars would start an identifier (§4.3.9).
    fn would_start_ident(&self) -> bool {
        self.would_start_ident_at(0)
    }

    fn would_start_ident_at(&self, offset: usize) -> bool {
        match self.peek_at(offset) {
            c if is_name_start_char(c) => true,
            '-' => {
                let next = self.peek_at(offset + 1);
                is_name_start_char(next)
                    || next == '-'
                    || self.starts_valid_escape_at(offset + 1)
            }
            '\\' => self.starts_valid_escape_at(offset),
            _ => false,
        }
    }

    /// Check if the next chars would start a number (§4.3.10).
    fn starts_number(&self) -> bool {
        match self.peek() {
            '+' | '-' => {
                let next = self.peek_at(1);
                next.is_ascii_digit() || (next == '.' && self.peek_at(2).is_ascii_digit())
            }
            '.' => self.peek_at(1).is_ascii_digit(),
            c => c.is_ascii_digit(),
        }
    }

    /// Consume name characters and escapes for as long as they continue.
    fn consume_name(&mut self) -> String {
        let mut name = String::new();
        while !self.is_eof() {
            let c = self.peek();
            if is_name_char(c) {
                name.push(c);
                self.advance();
            } else if self.starts_valid_escape() {
                self.advance(); // consume backslash
                name.push(self.consume_escaped_char());
            } else {
                break;
            }
        }
        name
    }

    /// Consume a number and classify it as dimension, percentage or plain number.
    fn consume_numeric(&mut self) -> Token {
        let (value, num_type) = self.consume_number();

        if self.would_start_ident() {
            let unit = self.consume_name();
            Token::Dimension(value, num_type, unit)
        } else if self.peek() == '%' {
            self.advance();
            Token::Percentage(value)
        } else {
            Token::Number(value, num_type)
        }
    }

    /// Append the run of ASCII digits at the cursor to `repr`.
    fn consume_digits(&mut self, repr: &mut String) {
        while self.peek().is_ascii_digit() {
            repr.push(self.advance());
        }
    }

    /// Consume sign, integer part, fraction and exponent, returning the value
    /// and whether it is an integer. A fraction or exponent makes it a `Number`.
    fn consume_number(&mut self) -> (f64, NumericType) {
        let mut repr = String::new();
        let mut num_type = NumericType::Integer;

        // Sign
        if matches!(self.peek(), '+' | '-') {
            repr.push(self.advance());
        }

        // Integer part
        self.consume_digits(&mut repr);

        // Fractional part: the '.' only belongs to the number if a digit follows.
        if self.peek() == '.' && self.peek_at(1).is_ascii_digit() {
            repr.push(self.advance()); // '.'
            num_type = NumericType::Number;
            self.consume_digits(&mut repr);
        }

        // Exponent: `e`/`E`, an optional sign, then at least one digit.
        if matches!(self.peek(), 'e' | 'E') {
            let next = self.peek_at(1);
            let has_exponent = next.is_ascii_digit()
                || (matches!(next, '+' | '-') && self.peek_at(2).is_ascii_digit());
            if has_exponent {
                repr.push(self.advance()); // 'e'/'E'
                num_type = NumericType::Number;
                if matches!(self.peek(), '+' | '-') {
                    repr.push(self.advance());
                }
                self.consume_digits(&mut repr);
            }
        }

        (repr.parse::<f64>().unwrap_or(0.0), num_type)
    }

    /// Consume a name and decide between ident, function and url token.
    fn consume_ident_like(&mut self) -> Token {
        let name = self.consume_name();

        if self.peek() != '(' {
            return Token::Ident(name);
        }
        self.advance(); // consume '('

        if !name.eq_ignore_ascii_case("url") {
            return Token::Function(name);
        }

        // Look past whitespace to see whether the argument is quoted.
        let saved = self.pos;
        self.consume_whitespace();
        if matches!(self.peek(), '"' | '\'') {
            // url("...") → treat as function token, parser handles the rest
            self.pos = saved;
            return Token::Function(name);
        }
        self.consume_url()
    }

    /// Consume an unquoted url; the cursor is just past `url(`.
    fn consume_url(&mut self) -> Token {
        let mut value = String::new();
        self.consume_whitespace();

        while !self.is_eof() {
            match self.peek() {
                ')' => {
                    self.advance();
                    return Token::Url(value);
                }
                c if is_whitespace(c) => {
                    // Whitespace is only allowed right before the closing paren or EOF.
                    self.consume_whitespace();
                    if self.is_eof() {
                        return Token::Url(value);
                    }
                    if self.peek() == ')' {
                        self.advance();
                        return Token::Url(value);
                    }
                    return self.consume_bad_url();
                }
                '"' | '\'' | '(' => return self.consume_bad_url(),
                '\\' if self.starts_valid_escape() => {
                    self.advance();
                    value.push(self.consume_escaped_char());
                }
                '\\' => return self.consume_bad_url(),
                c if is_non_printable(c) => return self.consume_bad_url(),
                _ => value.push(self.advance()),
            }
        }
        Token::Url(value)
    }

    /// Skip the rest of a malformed url and return [`Token::BadUrl`].
    fn consume_bad_url(&mut self) -> Token {
        self.consume_bad_url_remnants();
        Token::BadUrl
    }

    /// Skip to just past the closing `)` or to end of input.
    fn consume_bad_url_remnants(&mut self) {
        while !self.is_eof() {
            match self.advance() {
                ')' => return,
                // Skip the escaped char so that `\)` does not end the url.
                '\\' if self.peek() != '\n' => {
                    self.advance();
                }
                _ => {}
            }
        }
    }
}

/// Yields tokens until the input is exhausted; [`Token::Eof`] is never yielded.
impl Iterator for Tokenizer {
    type Item = Token;

    fn next(&mut self) -> Option<Token> {
        match self.next_token() {
            Token::Eof => None,
            token => Some(token),
        }
    }
}

/// Whitespace after preprocessing: space, tab or newline.
fn is_whitespace(c: char) -> bool {
    matches!(c, ' ' | '\t' | '\n')
}

/// ASCII letter, underscore, or any non-ASCII character.
fn is_name_start_char(c: char) -> bool {
    c.is_ascii_alphabetic() || !c.is_ascii() || c == '_'
}

/// A name-start character, an ASCII digit, or `-`.
fn is_name_char(c: char) -> bool {
    is_name_start_char(c) || c.is_ascii_digit() || c == '-'
}

/// Control characters that are not allowed inside an unquoted url.
fn is_non_printable(c: char) -> bool {
    matches!(c, '\x00'..='\x08' | '\x0B' | '\x0E'..='\x1F' | '\x7F')
}

#[cfg(test)]
mod tests {
    use super::*;

    fn tokenize(input: &str) -> Vec<Token> {
        Tokenizer::tokenize(input)
    }

    #[test]
    fn test_empty() {
        assert_eq!(tokenize(""), vec![]);
    }

    #[test]
    fn test_whitespace() {
        assert_eq!(tokenize(" \t\n "), vec![Token::Whitespace]);
    }

    #[test]
    fn test_ident() {
        assert_eq!(tokenize("color"), vec![Token::Ident("color".into())]);
        assert_eq!(tokenize("div"), vec![Token::Ident("div".into())]);
        assert_eq!(tokenize("--custom"), vec![Token::Ident("--custom".into())]);
        assert_eq!(tokenize("_foo"), vec![Token::Ident("_foo".into())]);
        assert_eq!(
            tokenize("-webkit-foo"),
            vec![Token::Ident("-webkit-foo".into())]
        );
    }

    #[test]
    fn test_function() {
        assert_eq!(tokenize("rgb("), vec![Token::Function("rgb".into())]);
        let tokens = tokenize("rgb(255, 0, 0)");
        assert_eq!(tokens[0], Token::Function("rgb".into()));
        assert_eq!(tokenize("calc("), vec![Token::Function("calc".into())]);
    }

    #[test]
    fn test_at_keyword() {
        assert_eq!(tokenize("@media"), vec![Token::AtKeyword("media".into())]);
        assert_eq!(tokenize("@import"), vec![Token::AtKeyword("import".into())]);
    }

    #[test]
    fn test_hash() {
        assert_eq!(
            tokenize("#id"),
            vec![Token::Hash("id".into(), HashType::Id)]
        );
        assert_eq!(
            tokenize("#fff"),
            vec![Token::Hash("fff".into(), HashType::Id)]
        );
        assert_eq!(
            tokenize("#123"),
            vec![Token::Hash("123".into(), HashType::Unrestricted)]
        );
    }

    #[test]
    fn test_string_double_quote() {
        assert_eq!(tokenize(r#""hello""#), vec![Token::String("hello".into())]);
    }

    #[test]
    fn test_string_single_quote() {
        assert_eq!(tokenize("'world'"), vec![Token::String("world".into())]);
    }

    #[test]
    fn test_string_escape() {
        assert_eq!(tokenize(r#""he\6Co""#), vec![Token::String("helo".into())]);
    }

    #[test]
    fn test_string_newline_escape() {
        assert_eq!(
            tokenize("\"line\\\ncontinued\""),
            vec![Token::String("linecontinued".into())]
        );
    }

    #[test]
    fn test_bad_string() {
        let tokens = tokenize("\"unterminated\n");
        assert_eq!(tokens[0], Token::BadString);
    }

    #[test]
    fn test_number_integer() {
        assert_eq!(
            tokenize("42"),
            vec![Token::Number(42.0, NumericType::Integer)]
        );
    }

    #[test]
    fn test_number_float() {
        assert_eq!(
            tokenize("3.14"),
            vec![Token::Number(3.14, NumericType::Number)]
        );
    }

    #[test]
    fn test_number_signed() {
        assert_eq!(
            tokenize("+10"),
            vec![Token::Number(10.0, NumericType::Integer)]
        );
        assert_eq!(
            tokenize("-5"),
            vec![Token::Number(-5.0, NumericType::Integer)]
        );
    }

    #[test]
    fn test_number_exponent() {
        assert_eq!(
            tokenize("1e2"),
            vec![Token::Number(100.0, NumericType::Number)]
        );
        assert_eq!(
            tokenize("2E+3"),
            vec![Token::Number(2000.0, NumericType::Number)]
        );
    }

    #[test]
    fn test_percentage() {
        assert_eq!(tokenize("50%"), vec![Token::Percentage(50.0)]);
    }

    #[test]
    fn test_dimension() {
        assert_eq!(
            tokenize("10px"),
            vec![Token::Dimension(10.0, NumericType::Integer, "px".into())]
        );
        assert_eq!(
            tokenize("2em"),
            vec![Token::Dimension(2.0, NumericType::Integer, "em".into())]
        );
        assert_eq!(
            tokenize("1.5rem"),
            vec![Token::Dimension(1.5, NumericType::Number, "rem".into())]
        );
    }

    #[test]
    fn test_delimiters() {
        assert_eq!(tokenize(":"), vec![Token::Colon]);
        assert_eq!(tokenize(";"), vec![Token::Semicolon]);
        assert_eq!(tokenize(","), vec![Token::Comma]);
        assert_eq!(tokenize("("), vec![Token::LeftParen]);
        assert_eq!(tokenize(")"), vec![Token::RightParen]);
        assert_eq!(tokenize("["), vec![Token::LeftBracket]);
        assert_eq!(tokenize("]"), vec![Token::RightBracket]);
        assert_eq!(tokenize("{"), vec![Token::LeftBrace]);
        assert_eq!(tokenize("}"), vec![Token::RightBrace]);
    }

    #[test]
    fn test_delim_tokens() {
        assert_eq!(tokenize("."), vec![Token::Delim('.')]);
        assert_eq!(tokenize(">"), vec![Token::Delim('>')]);
        assert_eq!(tokenize("+"), vec![Token::Delim('+')]);
        assert_eq!(tokenize("~"), vec![Token::Delim('~')]);
        assert_eq!(tokenize("*"), vec![Token::Delim('*')]);
    }

    #[test]
    fn test_cdo_cdc() {
        assert_eq!(tokenize("<!--"), vec![Token::Cdo]);
        assert_eq!(tokenize("-->"), vec![Token::Cdc]);
    }

    #[test]
    fn test_comments() {
        assert_eq!(
            tokenize("/* comment */color"),
            vec![Token::Ident("color".into())]
        );
        assert_eq!(
            tokenize("a/* x */b"),
            vec![Token::Ident("a".into()), Token::Ident("b".into())]
        );
    }

    #[test]
    fn test_unclosed_comment() {
        assert_eq!(tokenize("/* unclosed"), vec![]);
    }

    #[test]
    fn test_url_token() {
        assert_eq!(
            tokenize("url(https://example.com)"),
            vec![Token::Url("https://example.com".into())]
        );
    }

    #[test]
    fn test_url_with_whitespace() {
        assert_eq!(
            tokenize("url( foo.png )"),
            vec![Token::Url("foo.png".into())]
        );
    }

    #[test]
    fn test_url_function_with_quotes() {
        let tokens = tokenize("url(\"foo.png\")");
        assert_eq!(tokens[0], Token::Function("url".into()));
    }

    #[test]
    fn test_bad_url() {
        let tokens = tokenize("url(foo bar)");
        assert_eq!(tokens[0], Token::BadUrl);
    }

    #[test]
    fn test_escape_in_ident() {
        assert_eq!(tokenize(r"c\6Flor"), vec![Token::Ident("color".into())]);
    }

    #[test]
    fn test_css_rule() {
        let tokens = tokenize("div { color: red; }");
        assert_eq!(
            tokens,
            vec![
                Token::Ident("div".into()),
                Token::Whitespace,
                Token::LeftBrace,
                Token::Whitespace,
                Token::Ident("color".into()),
                Token::Colon,
                Token::Whitespace,
                Token::Ident("red".into()),
                Token::Semicolon,
                Token::Whitespace,
                Token::RightBrace,
            ]
        );
    }

    #[test]
    fn test_selector_with_class() {
        let tokens = tokenize("div.foo");
        assert_eq!(
            tokens,
            vec![
                Token::Ident("div".into()),
                Token::Delim('.'),
                Token::Ident("foo".into()),
            ]
        );
    }

    #[test]
    fn test_selector_with_id() {
        let tokens = tokenize("#main");
        assert_eq!(tokens, vec![Token::Hash("main".into(), HashType::Id)]);
    }

    #[test]
    fn test_dimension_with_float() {
        assert_eq!(
            tokenize("0.5em"),
            vec![Token::Dimension(0.5, NumericType::Number, "em".into())]
        );
    }

    #[test]
    fn test_multiple_numbers() {
        let tokens = tokenize("10px 20px");
        assert_eq!(
            tokens,
            vec![
                Token::Dimension(10.0, NumericType::Integer, "px".into()),
                Token::Whitespace,
                Token::Dimension(20.0, NumericType::Integer, "px".into()),
            ]
        );
    }

    #[test]
    fn test_at_rule() {
        let tokens = tokenize("@media screen");
        assert_eq!(
            tokens,
            vec![
                Token::AtKeyword("media".into()),
                Token::Whitespace,
                Token::Ident("screen".into()),
            ]
        );
    }

    #[test]
    fn test_function_with_args() {
        let tokens = tokenize("calc(100% - 20px)");
        assert_eq!(
            tokens,
            vec![
                Token::Function("calc".into()),
                Token::Percentage(100.0),
                Token::Whitespace,
                Token::Delim('-'),
                Token::Whitespace,
                Token::Dimension(20.0, NumericType::Integer, "px".into()),
                Token::RightParen,
            ]
        );
    }

    #[test]
    fn test_color_hex() {
        let tokens = tokenize("#ff0000");
        assert_eq!(tokens, vec![Token::Hash("ff0000".into(), HashType::Id)]);
    }

    #[test]
    fn test_negative_dimension() {
        assert_eq!(
            tokenize("-10px"),
            vec![Token::Dimension(-10.0, NumericType::Integer, "px".into())]
        );
    }

    #[test]
    fn test_unicode_ident() {
        assert_eq!(tokenize("côté"), vec![Token::Ident("côté".into())]);
    }

    #[test]
    fn test_null_replacement() {
        let tokens = tokenize("a\0b");
        assert_eq!(tokens, vec![Token::Ident("a\u{FFFD}b".into())]);
    }

    #[test]
    fn test_crlf_normalization() {
        let tokens = tokenize("a\r\nb");
        assert_eq!(
            tokens,
            vec![
                Token::Ident("a".into()),
                Token::Whitespace,
                Token::Ident("b".into()),
            ]
        );
    }

    #[test]
    fn test_escape_hex_with_trailing_space() {
        // \41 followed by space should produce 'A'
        assert_eq!(tokenize(r"\41 B"), vec![Token::Ident("AB".into())]);
    }

    #[test]
    fn test_at_sign_alone() {
        assert_eq!(tokenize("@"), vec![Token::Delim('@')]);
    }

    #[test]
    fn test_hash_alone() {
        // # followed by non-name char
        assert_eq!(tokenize("# "), vec![Token::Delim('#'), Token::Whitespace]);
    }

    #[test]
    fn test_nested_comments() {
        // CSS comments don't nest, so "/* /* */" closes at first */
        let tokens = tokenize("/* /* */ a");
        assert_eq!(tokens, vec![Token::Whitespace, Token::Ident("a".into())]);
    }
}