//! html5lib tokenizer test harness. //! //! Reads JSON test files from `tests/html5lib-tests/tokenizer/` and runs each //! test case against our HTML tokenizer. Reports pass/fail/skip counts. //! //! Run with: `cargo test -p we-html --test html5lib_tokenizer` mod json; use json::JsonValue; use we_html::Token; /// Workspace root relative to the crate directory. const WORKSPACE_ROOT: &str = concat!(env!("CARGO_MANIFEST_DIR"), "/../../"); /// Convert a JSON output token (array) into our `Token` type for comparison. fn json_to_token(val: &JsonValue) -> Option { let arr = val.as_array()?; let kind = arr.first()?.as_str()?; match kind { "DOCTYPE" => { let name = arr.get(1).and_then(|v| v.as_str()).map(String::from); let public_id = match arr.get(2) { Some(JsonValue::Null) => None, Some(v) => v.as_str().map(String::from), None => None, }; let system_id = match arr.get(3) { Some(JsonValue::Null) => None, Some(v) => v.as_str().map(String::from), None => None, }; let correctness = arr.get(4).and_then(|v| v.as_bool()).unwrap_or(true); Some(Token::Doctype { name, public_id, system_id, force_quirks: !correctness, }) } "StartTag" => { let name = arr.get(1)?.as_str()?.to_string(); let mut attributes = Vec::new(); if let Some(attrs_obj) = arr.get(2).and_then(|v| v.as_object()) { for (k, v) in attrs_obj { let val_str = v.as_str().unwrap_or("").to_string(); attributes.push((k.clone(), val_str)); } } let self_closing = arr.get(3).and_then(|v| v.as_bool()).unwrap_or(false); Some(Token::StartTag { name, attributes, self_closing, }) } "EndTag" => { let name = arr.get(1)?.as_str()?.to_string(); Some(Token::EndTag { name }) } "Character" => { let data = arr.get(1)?.as_str()?.to_string(); Some(Token::Character(data)) } "Comment" => { let data = arr.get(1)?.as_str()?.to_string(); Some(Token::Comment(data)) } _ => None, } } /// Apply double-escaping as described in the html5lib test format. 
/// When `doubleEscaped` is true, the input and expected strings contain
/// literal `\uXXXX` sequences that should be decoded.
///
/// A high-surrogate escape immediately followed by a low-surrogate escape is
/// combined into the corresponding astral character, as the html5lib format
/// uses UTF-16 pairs to encode code points above U+FFFF. Malformed escapes
/// and lone surrogates are left as literal text.
fn unescape_double_escaped(s: &str) -> String {
    let mut result = String::new();
    let mut chars = s.chars();
    while let Some(ch) = chars.next() {
        if ch != '\\' {
            result.push(ch);
            continue;
        }
        match chars.next() {
            Some('u') => {
                let hex: String = chars.by_ref().take(4).collect();
                let unit = if hex.len() == 4 {
                    u32::from_str_radix(&hex, 16).ok()
                } else {
                    None
                };
                match unit {
                    // High surrogate: try to pair it with a following
                    // `\uDC00`..`\uDFFF` escape to form an astral character.
                    Some(hi) if (0xD800..=0xDBFF).contains(&hi) => {
                        let mut ahead = chars.clone();
                        let astral = if ahead.next() == Some('\\') && ahead.next() == Some('u') {
                            let lo_hex: String = ahead.by_ref().take(4).collect();
                            u32::from_str_radix(&lo_hex, 16)
                                .ok()
                                .filter(|lo| (0xDC00..=0xDFFF).contains(lo))
                                .map(|lo| 0x1_0000 + ((hi - 0xD800) << 10) + (lo - 0xDC00))
                                .and_then(char::from_u32)
                        } else {
                            None
                        };
                        if let Some(c) = astral {
                            result.push(c);
                            // Commit the lookahead: consume the low-surrogate
                            // escape as well.
                            chars = ahead;
                        } else {
                            // Lone surrogate: keep the literal text verbatim.
                            result.push('\\');
                            result.push('u');
                            result.push_str(&hex);
                        }
                    }
                    Some(cp) => match char::from_u32(cp) {
                        Some(c) => result.push(c),
                        None => {
                            // Invalid code point: keep the literal text.
                            result.push('\\');
                            result.push('u');
                            result.push_str(&hex);
                        }
                    },
                    None => {
                        // Truncated or non-hex escape: keep the literal text.
                        result.push('\\');
                        result.push('u');
                        result.push_str(&hex);
                    }
                }
            }
            Some(other) => {
                // `\x` for any other x passes through unchanged.
                result.push('\\');
                result.push(other);
            }
            None => {
                // Trailing lone backslash.
                result.push('\\');
            }
        }
    }
    result
}

/// Run a single test case and return whether it passed.
///
/// Decodes the input (honoring `double_escaped`), converts the expected
/// `output` array into `Token`s, runs our tokenizer, and compares.
/// Returns `false` for tests missing the required `input`/`output` fields.
fn run_test_case(test: &JsonValue, double_escaped: bool) -> bool {
    let input = match test.get("input").and_then(|v| v.as_str()) {
        Some(s) => {
            if double_escaped {
                unescape_double_escaped(s)
            } else {
                s.to_string()
            }
        }
        None => return false,
    };
    let expected_output = match test.get("output").and_then(|v| v.as_array()) {
        Some(arr) => arr,
        None => return false,
    };
    // Convert expected output tokens. When double-escaped, the text payloads
    // of Character and Comment tokens need decoding too.
    let expected_tokens: Vec<Token> = expected_output
        .iter()
        .filter_map(|tok_json| {
            let mut tok = json_to_token(tok_json)?;
            if double_escaped {
                match &mut tok {
                    Token::Character(ref mut s) => *s = unescape_double_escaped(s),
                    Token::Comment(ref mut s) => *s = unescape_double_escaped(s),
                    _ => {}
                }
            }
            Some(tok)
        })
        .collect();
    // Run our tokenizer.
    let actual_tokens = we_html::tokenize(&input);
    actual_tokens == expected_tokens
}

/// Load and run all test cases from a single html5lib tokenizer test file.
fn run_test_file(path: &std::path::Path) -> (usize, usize, usize) { let content = match std::fs::read_to_string(path) { Ok(c) => c, Err(e) => { eprintln!(" failed to read {}: {}", path.display(), e); return (0, 0, 1); } }; let root = match json::parse(&content) { Ok(v) => v, Err(e) => { eprintln!(" failed to parse {}: {}", path.display(), e); return (0, 0, 1); } }; let tests = match root.get("tests").and_then(|v| v.as_array()) { Some(t) => t, None => { eprintln!(" no 'tests' array in {}", path.display()); return (0, 0, 1); } }; let mut pass = 0; let mut fail = 0; let mut skip = 0; for test in tests { let desc = test .get("description") .and_then(|v| v.as_str()) .unwrap_or(""); let double_escaped = test .get("doubleEscaped") .and_then(|v| v.as_bool()) .unwrap_or(false); // If the test specifies initialStates, we run once per state. // For now we only support the default "Data state" so skip others. if let Some(states) = test.get("initialStates").and_then(|v| v.as_array()) { let has_data_state = states.iter().any(|s| s.as_str() == Some("Data state")); if !has_data_state { skip += 1; continue; } } if run_test_case(test, double_escaped) { pass += 1; } else { fail += 1; // Only print first few failures to avoid noise. if fail <= 5 { eprintln!(" FAIL: {}", desc); } } } (pass, fail, skip) } #[test] fn html5lib_tokenizer_tests() { let test_dir = std::path::PathBuf::from(WORKSPACE_ROOT).join("tests/html5lib-tests/tokenizer"); if !test_dir.exists() { eprintln!( "html5lib-tests submodule not checked out at {}", test_dir.display() ); eprintln!("Run: git submodule update --init tests/html5lib-tests"); // Don't fail the test — the submodule might not be initialized. 
return; } let mut total_pass = 0; let mut total_fail = 0; let mut total_skip = 0; let mut entries: Vec<_> = std::fs::read_dir(&test_dir) .expect("failed to read tokenizer test dir") .filter_map(|e| e.ok()) .filter(|e| e.path().extension().map_or(false, |ext| ext == "test")) .collect(); entries.sort_by_key(|e| e.file_name()); for entry in &entries { let path = entry.path(); let name = path.file_name().unwrap().to_string_lossy(); let (pass, fail, skip) = run_test_file(&path); eprintln!("{}: {} pass, {} fail, {} skip", name, pass, fail, skip); total_pass += pass; total_fail += fail; total_skip += skip; } eprintln!(); eprintln!( "html5lib tokenizer totals: {} pass, {} fail, {} skip ({} total)", total_pass, total_fail, total_skip, total_pass + total_fail + total_skip ); // The test "passes" as a harness — it reports results but doesn't fail // the test suite until we have an implementation to measure against. // This lets CI always run and report progress. }