//! CSS tokenizer per CSS Syntax Module Level 3 §4.
//!
//! Consumes a stream of code points and produces CSS tokens.

/// A CSS token produced by the tokenizer.
#[derive(Debug, Clone, PartialEq)]
pub enum Token {
    Ident(String),
    Function(String),
    AtKeyword(String),
    Hash(String, HashType),
    String(String),
    BadString,
    Url(String),
    BadUrl,
    Number(f64, NumericType),
    Percentage(f64),
    Dimension(f64, NumericType, String),
    Whitespace,
    Colon,
    Semicolon,
    Comma,
    LeftBracket,
    RightBracket,
    LeftParen,
    RightParen,
    LeftBrace,
    RightBrace,
    Delim(char),
    Cdo,
    Cdc,
    Eof,
}

/// Whether a `<hash-token>` is "id" (valid identifier) or "unrestricted".
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum HashType {
    Id,
    Unrestricted,
}

/// Whether a number is integer or number (float).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum NumericType {
    Integer,
    Number,
}

/// CSS tokenizer state machine.
///
/// Holds the preprocessed input as a vector of code points together with the
/// index of the next code point to consume.
#[derive(Debug, Clone)]
pub struct Tokenizer {
    input: Vec<char>,
    pos: usize,
}

impl Tokenizer {
    /// Create a tokenizer over `input`, applying the input preprocessing
    /// step (§3.3): `\r\n`, a lone `\r` and `\f` each become a single `\n`,
    /// and NUL becomes U+FFFD.
    pub fn new(input: &str) -> Self {
        // The byte length is an upper bound on the number of code points.
        let mut chars = Vec::with_capacity(input.len());
        let mut raw = input.chars().peekable();
        while let Some(c) = raw.next() {
            match c {
                '\r' => {
                    // Fold a CRLF pair into one newline.
                    if raw.peek() == Some(&'\n') {
                        raw.next();
                    }
                    chars.push('\n');
                }
                '\x0C' => chars.push('\n'),
                '\0' => chars.push('\u{FFFD}'),
                other => chars.push(other),
            }
        }
        Self {
            input: chars,
            pos: 0,
        }
    }

    /// Tokenize the entire input into a list of tokens (excluding EOF).
    pub fn tokenize(input: &str) -> Vec<Token> {
        let mut tokenizer = Self::new(input);
        std::iter::from_fn(|| match tokenizer.next_token() {
            Token::Eof => None,
            token => Some(token),
        })
        .collect()
    }

    /// Consume and return the next token.
    ///
    /// Comments are skipped first. Once the input is exhausted every call
    /// returns [`Token::Eof`].
    pub fn next_token(&mut self) -> Token {
        self.consume_comments();
        self.consume_token()
    }

    /// Next code point without consuming it, or `'\0'` at end of input.
    ///
    /// `'\0'` is a safe sentinel because `new` replaces every NUL in the
    /// input with U+FFFD.
    fn peek(&self) -> char {
        self.peek_at(0)
    }

    /// Code point `offset` positions ahead, or `'\0'` past end of input.
    fn peek_at(&self, offset: usize) -> char {
        self.input.get(self.pos + offset).copied().unwrap_or('\0')
    }

    /// Consume and return the next code point; at end of input returns
    /// `'\0'` and leaves the position unchanged.
    fn advance(&mut self) -> char {
        let c = self.peek();
        if self.pos < self.input.len() {
            self.pos += 1;
        }
        c
    }

    /// Whether all input has been consumed.
    fn is_eof(&self) -> bool {
        self.pos >= self.input.len()
    }

    /// Consume one code point and return `token`.
    fn consume_as(&mut self, token: Token) -> Token {
        self.advance();
        token
    }

    /// Skip any run of `/* ... */` comments. An unterminated comment
    /// swallows the rest of the input.
    fn consume_comments(&mut self) {
        while self.peek() == '/' && self.peek_at(1) == '*' {
            self.pos += 2;
            loop {
                if self.is_eof() {
                    return;
                }
                if self.peek() == '*' && self.peek_at(1) == '/' {
                    self.pos += 2;
                    break;
                }
                self.pos += 1;
            }
        }
    }

    /// Consume a single token, dispatching on the next code point.
    fn consume_token(&mut self) -> Token {
        if self.is_eof() {
            return Token::Eof;
        }

        let c = self.peek();
        match c {
            _ if is_whitespace(c) => {
                self.consume_whitespace();
                Token::Whitespace
            }
            '"' | '\'' => self.consume_string(c),
            '#' => {
                self.advance();
                if is_name_char(self.peek()) || self.starts_valid_escape() {
                    // The type must be decided before the name is consumed.
                    let hash_type = if self.would_start_ident() {
                        HashType::Id
                    } else {
                        HashType::Unrestricted
                    };
                    let name = self.consume_name();
                    Token::Hash(name, hash_type)
                } else {
                    Token::Delim('#')
                }
            }
            '(' => self.consume_as(Token::LeftParen),
            ')' => self.consume_as(Token::RightParen),
            '+' | '.' => {
                if self.starts_number() {
                    self.consume_numeric()
                } else {
                    self.consume_as(Token::Delim(c))
                }
            }
            ',' => self.consume_as(Token::Comma),
            '-' => {
                if self.starts_number() {
                    self.consume_numeric()
                } else if self.peek_at(1) == '-' && self.peek_at(2) == '>' {
                    self.pos += 3;
                    Token::Cdc
                } else if self.would_start_ident() {
                    self.consume_ident_like()
                } else {
                    self.consume_as(Token::Delim('-'))
                }
            }
            ':' => self.consume_as(Token::Colon),
            ';' => self.consume_as(Token::Semicolon),
            '<' => {
                if self.peek_at(1) == '!' && self.peek_at(2) == '-' && self.peek_at(3) == '-' {
                    self.pos += 4;
                    Token::Cdo
                } else {
                    self.consume_as(Token::Delim('<'))
                }
            }
            '@' => {
                self.advance();
                if self.would_start_ident() {
                    Token::AtKeyword(self.consume_name())
                } else {
                    Token::Delim('@')
                }
            }
            '[' => self.consume_as(Token::LeftBracket),
            '\\' => {
                if self.starts_valid_escape() {
                    self.consume_ident_like()
                } else {
                    self.consume_as(Token::Delim('\\'))
                }
            }
            ']' => self.consume_as(Token::RightBracket),
            '{' => self.consume_as(Token::LeftBrace),
            '}' => self.consume_as(Token::RightBrace),
            _ if c.is_ascii_digit() => self.consume_numeric(),
            _ if is_name_start_char(c) => self.consume_ident_like(),
            _ => self.consume_as(Token::Delim(c)),
        }
    }

    /// Consume as much whitespace as possible.
    fn consume_whitespace(&mut self) {
        while !self.is_eof() && is_whitespace(self.peek()) {
            self.advance();
        }
    }

    /// Consume a string token delimited by `ending`; the next code point
    /// must be the opening quote.
    ///
    /// Returns [`Token::BadString`] on an unescaped newline (which is left
    /// in the input) and [`Token::String`] otherwise, including when the
    /// input ends before the closing quote.
    fn consume_string(&mut self, ending: char) -> Token {
        self.advance(); // consume opening quote
        let mut value = String::new();

        loop {
            if self.is_eof() {
                return Token::String(value);
            }
            match self.advance() {
                c if c == ending => return Token::String(value),
                '\n' => {
                    // Unescaped newline in string → bad string
                    self.pos -= 1; // reconsume
                    return Token::BadString;
                }
                '\\' => {
                    // A backslash at end of input contributes nothing.
                    if !self.is_eof() {
                        if self.peek() == '\n' {
                            self.advance(); // line continuation
                        } else {
                            value.push(self.consume_escaped_char());
                        }
                    }
                }
                c => value.push(c),
            }
        }
    }

    /// Consume an escaped code point; the backslash must already have been
    /// consumed.
    ///
    /// Reads up to six hex digits plus one optional trailing whitespace
    /// code point, or otherwise a single literal code point. Yields U+FFFD
    /// at end of input and for zero, surrogate or out-of-range values.
    fn consume_escaped_char(&mut self) -> char {
        if self.is_eof() {
            return '\u{FFFD}';
        }

        let first = self.advance();
        if !first.is_ascii_hexdigit() {
            return first;
        }

        let mut hex = String::from(first);
        // `peek` yields '\0' at end of input, which is not a hex digit.
        while hex.len() < 6 && self.peek().is_ascii_hexdigit() {
            hex.push(self.advance());
        }
        // Consume optional trailing whitespace
        if !self.is_eof() && is_whitespace(self.peek()) {
            self.advance();
        }

        match u32::from_str_radix(&hex, 16) {
            Ok(0) | Err(_) => '\u{FFFD}',
            // `char::from_u32` rejects surrogates and values above U+10FFFF.
            Ok(code_point) => char::from_u32(code_point).unwrap_or('\u{FFFD}'),
        }
    }

    /// Check if the next two code points are a valid escape.
    fn starts_valid_escape(&self) -> bool {
        self.starts_valid_escape_at(0)
    }

    /// Check if the two code points starting at `offset` are a valid
    /// escape: a backslash not followed by a newline.
    fn starts_valid_escape_at(&self, offset: usize) -> bool {
        self.peek_at(offset) == '\\' && self.peek_at(offset + 1) != '\n'
    }

    /// Check if the next chars would start an identifier (§4.3.9).
    fn would_start_ident(&self) -> bool {
        self.would_start_ident_at(0)
    }

    /// Check if the code points starting at `offset` would start an
    /// identifier.
    fn would_start_ident_at(&self, offset: usize) -> bool {
        match self.peek_at(offset) {
            c if is_name_start_char(c) => true,
            '-' => {
                let next = self.peek_at(offset + 1);
                is_name_start_char(next)
                    || next == '-'
                    || self.starts_valid_escape_at(offset + 1)
            }
            '\\' => self.starts_valid_escape_at(offset),
            _ => false,
        }
    }

    /// Check if the next chars would start a number (§4.3.10).
    fn starts_number(&self) -> bool {
        match self.peek() {
            '+' | '-' => {
                let next = self.peek_at(1);
                next.is_ascii_digit() || (next == '.' && self.peek_at(2).is_ascii_digit())
            }
            '.' => self.peek_at(1).is_ascii_digit(),
            c => c.is_ascii_digit(),
        }
    }

    /// Consume a name: a run of name code points and escapes. Does not
    /// check that the result is a valid identifier.
    fn consume_name(&mut self) -> String {
        let mut name = String::new();
        while !self.is_eof() {
            let c = self.peek();
            if is_name_char(c) {
                name.push(c);
                self.advance();
            } else if self.starts_valid_escape() {
                self.advance(); // consume backslash
                name.push(self.consume_escaped_char());
            } else {
                break;
            }
        }
        name
    }

    /// Consume a numeric token: a dimension, a percentage or a plain
    /// number, depending on what follows the digits.
    fn consume_numeric(&mut self) -> Token {
        let (value, num_type) = self.consume_number();

        if self.would_start_ident() {
            let unit = self.consume_name();
            return Token::Dimension(value, num_type, unit);
        }
        if self.peek() == '%' {
            self.advance();
            return Token::Percentage(value);
        }
        Token::Number(value, num_type)
    }

    /// Append a (possibly empty) run of ASCII digits to `repr`.
    fn push_digits(&mut self, repr: &mut String) {
        // `peek` yields '\0' at end of input, which is not a digit.
        while self.peek().is_ascii_digit() {
            repr.push(self.advance());
        }
    }

    /// Consume a number and return its value and type.
    ///
    /// The type is `Number` when a fractional part or an exponent was
    /// consumed and `Integer` otherwise. If the collected text cannot be
    /// parsed as `f64` the value is `0.0`.
    fn consume_number(&mut self) -> (f64, NumericType) {
        let mut repr = String::new();
        let mut num_type = NumericType::Integer;

        // Sign
        if matches!(self.peek(), '+' | '-') {
            repr.push(self.advance());
        }

        // Integer part
        self.push_digits(&mut repr);

        // Fractional part
        if self.peek() == '.' && self.peek_at(1).is_ascii_digit() {
            repr.push(self.advance()); // '.'
            num_type = NumericType::Number;
            self.push_digits(&mut repr);
        }

        // Exponent: only when the `e` is really followed by digits, so the
        // `e` in a unit such as `em` is left alone.
        if matches!(self.peek(), 'e' | 'E') {
            let next = self.peek_at(1);
            let has_exponent = next.is_ascii_digit()
                || (matches!(next, '+' | '-') && self.peek_at(2).is_ascii_digit());
            if has_exponent {
                repr.push(self.advance()); // 'e'/'E'
                num_type = NumericType::Number;
                if matches!(self.peek(), '+' | '-') {
                    repr.push(self.advance());
                }
                self.push_digits(&mut repr);
            }
        }

        let value = repr.parse::<f64>().unwrap_or(0.0);
        (value, num_type)
    }

    /// Consume an ident-like token: an identifier, a function, or a
    /// url / bad-url token.
    fn consume_ident_like(&mut self) -> Token {
        let name = self.consume_name();

        // Check for url( function
        if name.eq_ignore_ascii_case("url") && self.peek() == '(' {
            self.advance(); // consume '('
            let saved = self.pos;
            self.consume_whitespace();
            if self.peek() == '"' || self.peek() == '\'' {
                // url("...") → treat as function token, parser handles the
                // rest. Rewind so the skipped whitespace is tokenized.
                self.pos = saved;
                return Token::Function(name);
            }
            return self.consume_url();
        }

        if self.peek() == '(' {
            self.advance();
            return Token::Function(name);
        }

        Token::Ident(name)
    }

    /// Consume the body of an unquoted `url(`; the opening parenthesis must
    /// already have been consumed.
    ///
    /// Returns [`Token::Url`] when the closing `)` or end of input is
    /// reached, and [`Token::BadUrl`] (after skipping to the closing `)`)
    /// when the body is malformed.
    fn consume_url(&mut self) -> Token {
        let mut value = String::new();
        self.consume_whitespace();

        loop {
            if self.is_eof() {
                return Token::Url(value);
            }
            match self.peek() {
                ')' => {
                    self.advance();
                    return Token::Url(value);
                }
                c if is_whitespace(c) => {
                    // Whitespace may only be followed by `)` or end of input.
                    self.consume_whitespace();
                    if self.is_eof() {
                        return Token::Url(value);
                    }
                    if self.peek() == ')' {
                        self.advance();
                        return Token::Url(value);
                    }
                    self.consume_bad_url_remnants();
                    return Token::BadUrl;
                }
                '"' | '\'' | '(' => {
                    self.consume_bad_url_remnants();
                    return Token::BadUrl;
                }
                '\\' => {
                    if self.starts_valid_escape() {
                        self.advance();
                        value.push(self.consume_escaped_char());
                    } else {
                        self.consume_bad_url_remnants();
                        return Token::BadUrl;
                    }
                }
                c if is_non_printable(c) => {
                    self.consume_bad_url_remnants();
                    return Token::BadUrl;
                }
                _ => value.push(self.advance()),
            }
        }
    }

    /// Skip the rest of a malformed url up to and including the closing
    /// `)`, or to end of input. An escaped `)` does not end the url.
    fn consume_bad_url_remnants(&mut self) {
        loop {
            if self.is_eof() {
                return;
            }
            let c = self.advance();
            if c == ')' {
                return;
            }
            if c == '\\' && self.peek() != '\n' {
                self.advance(); // consume escaped char
            }
        }
    }
}

/// Whitespace after preprocessing: space, tab or newline.
fn is_whitespace(c: char) -> bool {
    matches!(c, ' ' | '\t' | '\n')
}

/// A code point that can start a name: ASCII letter, non-ASCII, or `_`.
fn is_name_start_char(c: char) -> bool {
    c.is_ascii_alphabetic() || !c.is_ascii() || c == '_'
}

/// A code point that can continue a name: name-start, digit or `-`.
fn is_name_char(c: char) -> bool {
    is_name_start_char(c) || c.is_ascii_digit() || c == '-'
}

/// A non-printable code point, which is not allowed in an unquoted url.
fn is_non_printable(c: char) -> bool {
    matches!(c, '\x00'..='\x08' | '\x0B' | '\x0E'..='\x1F' | '\x7F')
}

#[cfg(test)]
mod tests {
    use super::*;

    fn tokenize(input: &str) -> Vec<Token> {
        Tokenizer::tokenize(input)
    }

    #[test]
    fn test_empty() {
        assert_eq!(tokenize(""), vec![]);
    }

    #[test]
    fn test_whitespace() {
        assert_eq!(tokenize("  \t\n  "), vec![Token::Whitespace]);
    }

    #[test]
    fn test_ident() {
        assert_eq!(tokenize("color"), vec![Token::Ident("color".into())]);
        assert_eq!(tokenize("div"), vec![Token::Ident("div".into())]);
        assert_eq!(tokenize("--custom"), vec![Token::Ident("--custom".into())]);
        assert_eq!(tokenize("_foo"), vec![Token::Ident("_foo".into())]);
        assert_eq!(
            tokenize("-webkit-foo"),
            vec![Token::Ident("-webkit-foo".into())]
        );
    }

    #[test]
    fn test_function() {
        assert_eq!(tokenize("rgb("), vec![Token::Function("rgb".into())]);
        let tokens = tokenize("rgb(255, 0, 0)");
        assert_eq!(tokens[0], Token::Function("rgb".into()));
        assert_eq!(tokenize("calc("), vec![Token::Function("calc".into())]);
    }

    #[test]
    fn test_at_keyword() {
        assert_eq!(tokenize("@media"), vec![Token::AtKeyword("media".into())]);
        assert_eq!(tokenize("@import"), vec![Token::AtKeyword("import".into())]);
    }

    #[test]
    fn test_hash() {
        assert_eq!(
            tokenize("#id"),
            vec![Token::Hash("id".into(), HashType::Id)]
        );
        assert_eq!(
            tokenize("#fff"),
            vec![Token::Hash("fff".into(), HashType::Id)]
        );
        assert_eq!(
            tokenize("#123"),
            vec![Token::Hash("123".into(), HashType::Unrestricted)]
        );
    }

    #[test]
    fn test_string_double_quote() {
        assert_eq!(tokenize(r#""hello""#), vec![Token::String("hello".into())]);
    }

    #[test]
    fn test_string_single_quote() {
        assert_eq!(tokenize("'world'"), vec![Token::String("world".into())]);
    }

    #[test]
    fn test_string_escape() {
        assert_eq!(tokenize(r#""he\6Co""#), vec![Token::String("helo".into())]);
    }

    #[test]
    fn test_string_newline_escape() {
        assert_eq!(
            tokenize("\"line\\\ncontinued\""),
            vec![Token::String("linecontinued".into())]
        );
    }

    #[test]
    fn test_bad_string() {
        let tokens = tokenize("\"unterminated\n");
        assert_eq!(tokens[0], Token::BadString);
    }

    #[test]
    fn test_number_integer() {
        assert_eq!(
            tokenize("42"),
            vec![Token::Number(42.0, NumericType::Integer)]
        );
    }

    #[test]
    fn test_number_float() {
        assert_eq!(
            tokenize("3.14"),
            vec![Token::Number(3.14, NumericType::Number)]
        );
    }

    #[test]
    fn test_number_signed() {
        assert_eq!(
            tokenize("+10"),
            vec![Token::Number(10.0, NumericType::Integer)]
        );
        assert_eq!(
            tokenize("-5"),
            vec![Token::Number(-5.0, NumericType::Integer)]
        );
    }

    #[test]
    fn test_number_exponent() {
        assert_eq!(
            tokenize("1e2"),
            vec![Token::Number(100.0, NumericType::Number)]
        );
        assert_eq!(
            tokenize("2E+3"),
            vec![Token::Number(2000.0, NumericType::Number)]
        );
    }

    #[test]
    fn test_percentage() {
        assert_eq!(tokenize("50%"), vec![Token::Percentage(50.0)]);
    }

    #[test]
    fn test_dimension() {
        assert_eq!(
            tokenize("10px"),
            vec![Token::Dimension(10.0, NumericType::Integer, "px".into())]
        );
        assert_eq!(
            tokenize("2em"),
            vec![Token::Dimension(2.0, NumericType::Integer, "em".into())]
        );
        assert_eq!(
            tokenize("1.5rem"),
            vec![Token::Dimension(1.5, NumericType::Number, "rem".into())]
        );
    }

    #[test]
    fn test_delimiters() {
        assert_eq!(tokenize(":"), vec![Token::Colon]);
        assert_eq!(tokenize(";"), vec![Token::Semicolon]);
        assert_eq!(tokenize(","), vec![Token::Comma]);
        assert_eq!(tokenize("("), vec![Token::LeftParen]);
        assert_eq!(tokenize(")"), vec![Token::RightParen]);
        assert_eq!(tokenize("["), vec![Token::LeftBracket]);
        assert_eq!(tokenize("]"), vec![Token::RightBracket]);
        assert_eq!(tokenize("{"), vec![Token::LeftBrace]);
        assert_eq!(tokenize("}"), vec![Token::RightBrace]);
    }

    #[test]
    fn test_delim_tokens() {
        assert_eq!(tokenize("."), vec![Token::Delim('.')]);
        assert_eq!(tokenize(">"), vec![Token::Delim('>')]);
        assert_eq!(tokenize("+"), vec![Token::Delim('+')]);
        assert_eq!(tokenize("~"), vec![Token::Delim('~')]);
        assert_eq!(tokenize("*"), vec![Token::Delim('*')]);
    }

    #[test]
    fn test_cdo_cdc() {
        assert_eq!(tokenize("<!--"), vec![Token::Cdo]);
        assert_eq!(tokenize("-->"), vec![Token::Cdc]);
    }

    #[test]
    fn test_comments() {
        assert_eq!(
            tokenize("/* comment */color"),
            vec![Token::Ident("color".into())]
        );
        assert_eq!(
            tokenize("a/* x */b"),
            vec![Token::Ident("a".into()), Token::Ident("b".into())]
        );
    }

    #[test]
    fn test_unclosed_comment() {
        assert_eq!(tokenize("/* unclosed"), vec![]);
    }

    #[test]
    fn test_url_token() {
        assert_eq!(
            tokenize("url(https://example.com)"),
            vec![Token::Url("https://example.com".into())]
        );
    }

    #[test]
    fn test_url_with_whitespace() {
        assert_eq!(
            tokenize("url(  foo.png  )"),
            vec![Token::Url("foo.png".into())]
        );
    }

    #[test]
    fn test_url_function_with_quotes() {
        let tokens = tokenize("url(\"foo.png\")");
        assert_eq!(tokens[0], Token::Function("url".into()));
    }

    #[test]
    fn test_bad_url() {
        let tokens = tokenize("url(foo bar)");
        assert_eq!(tokens[0], Token::BadUrl);
    }

    #[test]
    fn test_escape_in_ident() {
        assert_eq!(tokenize(r"c\6Flor"), vec![Token::Ident("color".into())]);
    }

    #[test]
    fn test_css_rule() {
        let tokens = tokenize("div { color: red; }");
        assert_eq!(
            tokens,
            vec![
                Token::Ident("div".into()),
                Token::Whitespace,
                Token::LeftBrace,
                Token::Whitespace,
                Token::Ident("color".into()),
                Token::Colon,
                Token::Whitespace,
                Token::Ident("red".into()),
                Token::Semicolon,
                Token::Whitespace,
                Token::RightBrace,
            ]
        );
    }

    #[test]
    fn test_selector_with_class() {
        let tokens = tokenize("div.foo");
        assert_eq!(
            tokens,
            vec![
                Token::Ident("div".into()),
                Token::Delim('.'),
                Token::Ident("foo".into()),
            ]
        );
    }

    #[test]
    fn test_selector_with_id() {
        let tokens = tokenize("#main");
        assert_eq!(tokens, vec![Token::Hash("main".into(), HashType::Id)]);
    }

    #[test]
    fn test_dimension_with_float() {
        assert_eq!(
            tokenize("0.5em"),
            vec![Token::Dimension(0.5, NumericType::Number, "em".into())]
        );
    }

    #[test]
    fn test_multiple_numbers() {
        let tokens = tokenize("10px 20px");
        assert_eq!(
            tokens,
            vec![
                Token::Dimension(10.0, NumericType::Integer, "px".into()),
                Token::Whitespace,
                Token::Dimension(20.0, NumericType::Integer, "px".into()),
            ]
        );
    }

    #[test]
    fn test_at_rule() {
        let tokens = tokenize("@media screen");
        assert_eq!(
            tokens,
            vec![
                Token::AtKeyword("media".into()),
                Token::Whitespace,
                Token::Ident("screen".into()),
            ]
        );
    }

    #[test]
    fn test_function_with_args() {
        let tokens = tokenize("calc(100% - 20px)");
        assert_eq!(
            tokens,
            vec![
                Token::Function("calc".into()),
                Token::Percentage(100.0),
                Token::Whitespace,
                Token::Delim('-'),
                Token::Whitespace,
                Token::Dimension(20.0, NumericType::Integer, "px".into()),
                Token::RightParen,
            ]
        );
    }

    #[test]
    fn test_color_hex() {
        let tokens = tokenize("#ff0000");
        assert_eq!(tokens, vec![Token::Hash("ff0000".into(), HashType::Id)]);
    }

    #[test]
    fn test_negative_dimension() {
        assert_eq!(
            tokenize("-10px"),
            vec![Token::Dimension(-10.0, NumericType::Integer, "px".into())]
        );
    }

    #[test]
    fn test_unicode_ident() {
        assert_eq!(tokenize("côté"), vec![Token::Ident("côté".into())]);
    }

    #[test]
    fn test_null_replacement() {
        let tokens = tokenize("a\0b");
        assert_eq!(tokens, vec![Token::Ident("a\u{FFFD}b".into())]);
    }

    #[test]
    fn test_crlf_normalization() {
        let tokens = tokenize("a\r\nb");
        assert_eq!(
            tokens,
            vec![
                Token::Ident("a".into()),
                Token::Whitespace,
                Token::Ident("b".into()),
            ]
        );
    }

    #[test]
    fn test_escape_hex_with_trailing_space() {
        // \41 followed by space should produce 'A'
        assert_eq!(tokenize(r"\41 B"), vec![Token::Ident("AB".into())]);
    }

    #[test]
    fn test_at_sign_alone() {
        assert_eq!(tokenize("@"), vec![Token::Delim('@')]);
    }

    #[test]
    fn test_hash_alone() {
        // # followed by non-name char
        assert_eq!(tokenize("# "), vec![Token::Delim('#'), Token::Whitespace]);
    }

    #[test]
    fn test_nested_comments() {
        // CSS comments don't nest, so "/* /* */" closes at first */
        let tokens = tokenize("/* /* */ a");
        assert_eq!(tokens, vec![Token::Whitespace, Token::Ident("a".into())]);
    }
}