web engine - experimental web browser
at poly1305-h4-fix 263 lines 8.5 kB view raw
1//! html5lib tokenizer test harness. 2//! 3//! Reads JSON test files from `tests/html5lib-tests/tokenizer/` and runs each 4//! test case against our HTML tokenizer. Reports pass/fail/skip counts. 5//! 6//! Run with: `cargo test -p we-html --test html5lib_tokenizer` 7 8mod json; 9 10use json::JsonValue; 11use we_html::Token; 12 13/// Workspace root relative to the crate directory. 14const WORKSPACE_ROOT: &str = concat!(env!("CARGO_MANIFEST_DIR"), "/../../"); 15 16/// Convert a JSON output token (array) into our `Token` type for comparison. 17fn json_to_token(val: &JsonValue) -> Option<Token> { 18 let arr = val.as_array()?; 19 let kind = arr.first()?.as_str()?; 20 match kind { 21 "DOCTYPE" => { 22 let name = arr.get(1).and_then(|v| v.as_str()).map(String::from); 23 let public_id = match arr.get(2) { 24 Some(JsonValue::Null) => None, 25 Some(v) => v.as_str().map(String::from), 26 None => None, 27 }; 28 let system_id = match arr.get(3) { 29 Some(JsonValue::Null) => None, 30 Some(v) => v.as_str().map(String::from), 31 None => None, 32 }; 33 let correctness = arr.get(4).and_then(|v| v.as_bool()).unwrap_or(true); 34 Some(Token::Doctype { 35 name, 36 public_id, 37 system_id, 38 force_quirks: !correctness, 39 }) 40 } 41 "StartTag" => { 42 let name = arr.get(1)?.as_str()?.to_string(); 43 let mut attributes = Vec::new(); 44 if let Some(attrs_obj) = arr.get(2).and_then(|v| v.as_object()) { 45 for (k, v) in attrs_obj { 46 let val_str = v.as_str().unwrap_or("").to_string(); 47 attributes.push((k.clone(), val_str)); 48 } 49 } 50 let self_closing = arr.get(3).and_then(|v| v.as_bool()).unwrap_or(false); 51 Some(Token::StartTag { 52 name, 53 attributes, 54 self_closing, 55 }) 56 } 57 "EndTag" => { 58 let name = arr.get(1)?.as_str()?.to_string(); 59 Some(Token::EndTag { name }) 60 } 61 "Character" => { 62 let data = arr.get(1)?.as_str()?.to_string(); 63 Some(Token::Character(data)) 64 } 65 "Comment" => { 66 let data = arr.get(1)?.as_str()?.to_string(); 67 Some(Token::Comment(data)) 68 } 69 _ => None, 70 } 71} 72 73/// Apply double-escaping as described in the html5lib test format. 74/// When `doubleEscaped` is true, the input and expected strings contain 75/// literal `\uXXXX` sequences that should be decoded. 76fn unescape_double_escaped(s: &str) -> String { 77 let mut result = String::new(); 78 let mut chars = s.chars(); 79 while let Some(ch) = chars.next() { 80 if ch == '\\' { 81 match chars.next() { 82 Some('u') => { 83 let hex: String = chars.by_ref().take(4).collect(); 84 if hex.len() == 4 { 85 if let Ok(cp) = u32::from_str_radix(&hex, 16) { 86 if let Some(c) = char::from_u32(cp) { 87 result.push(c); 88 continue; 89 } 90 } 91 } 92 result.push('\\'); 93 result.push('u'); 94 result.push_str(&hex); 95 } 96 Some(other) => { 97 result.push('\\'); 98 result.push(other); 99 } 100 None => { 101 result.push('\\'); 102 } 103 } 104 } else { 105 result.push(ch); 106 } 107 } 108 result 109} 110 111/// Run a single test case and return whether it passed. 112fn run_test_case(test: &JsonValue, double_escaped: bool) -> bool { 113 let input = match test.get("input").and_then(|v| v.as_str()) { 114 Some(s) => { 115 if double_escaped { 116 unescape_double_escaped(s) 117 } else { 118 s.to_string() 119 } 120 } 121 None => return false, 122 }; 123 124 let expected_output = match test.get("output").and_then(|v| v.as_array()) { 125 Some(arr) => arr, 126 None => return false, 127 }; 128 129 // Convert expected output tokens. 130 let expected_tokens: Vec<Token> = expected_output 131 .iter() 132 .filter_map(|tok_json| { 133 let mut tok = json_to_token(tok_json)?; 134 if double_escaped { 135 match &mut tok { 136 Token::Character(ref mut s) => *s = unescape_double_escaped(s), 137 Token::Comment(ref mut s) => *s = unescape_double_escaped(s), 138 _ => {} 139 } 140 } 141 Some(tok) 142 }) 143 .collect(); 144 145 // Run our tokenizer. 146 let actual_tokens = we_html::tokenize(&input); 147 148 actual_tokens == expected_tokens 149} 150 151/// Load and run all test cases from a single html5lib tokenizer test file. 152fn run_test_file(path: &std::path::Path) -> (usize, usize, usize) { 153 let content = match std::fs::read_to_string(path) { 154 Ok(c) => c, 155 Err(e) => { 156 eprintln!(" failed to read {}: {}", path.display(), e); 157 return (0, 0, 1); 158 } 159 }; 160 161 let root = match json::parse(&content) { 162 Ok(v) => v, 163 Err(e) => { 164 eprintln!(" failed to parse {}: {}", path.display(), e); 165 return (0, 0, 1); 166 } 167 }; 168 169 let tests = match root.get("tests").and_then(|v| v.as_array()) { 170 Some(t) => t, 171 None => { 172 eprintln!(" no 'tests' array in {}", path.display()); 173 return (0, 0, 1); 174 } 175 }; 176 177 let mut pass = 0; 178 let mut fail = 0; 179 let mut skip = 0; 180 181 for test in tests { 182 let desc = test 183 .get("description") 184 .and_then(|v| v.as_str()) 185 .unwrap_or("<no description>"); 186 187 let double_escaped = test 188 .get("doubleEscaped") 189 .and_then(|v| v.as_bool()) 190 .unwrap_or(false); 191 192 // If the test specifies initialStates, we run once per state. 193 // For now we only support the default "Data state" so skip others. 194 if let Some(states) = test.get("initialStates").and_then(|v| v.as_array()) { 195 let has_data_state = states.iter().any(|s| s.as_str() == Some("Data state")); 196 if !has_data_state { 197 skip += 1; 198 continue; 199 } 200 } 201 202 if run_test_case(test, double_escaped) { 203 pass += 1; 204 } else { 205 fail += 1; 206 // Only print first few failures to avoid noise. 207 if fail <= 5 { 208 eprintln!(" FAIL: {}", desc); 209 } 210 } 211 } 212 213 (pass, fail, skip) 214} 215 216#[test] 217fn html5lib_tokenizer_tests() { 218 let test_dir = std::path::PathBuf::from(WORKSPACE_ROOT).join("tests/html5lib-tests/tokenizer"); 219 220 if !test_dir.exists() { 221 eprintln!( 222 "html5lib-tests submodule not checked out at {}", 223 test_dir.display() 224 ); 225 eprintln!("Run: git submodule update --init tests/html5lib-tests"); 226 // Don't fail the test — the submodule might not be initialized. 227 return; 228 } 229 230 let mut total_pass = 0; 231 let mut total_fail = 0; 232 let mut total_skip = 0; 233 234 let mut entries: Vec<_> = std::fs::read_dir(&test_dir) 235 .expect("failed to read tokenizer test dir") 236 .filter_map(|e| e.ok()) 237 .filter(|e| e.path().extension().map_or(false, |ext| ext == "test")) 238 .collect(); 239 entries.sort_by_key(|e| e.file_name()); 240 241 for entry in &entries { 242 let path = entry.path(); 243 let name = path.file_name().unwrap().to_string_lossy(); 244 let (pass, fail, skip) = run_test_file(&path); 245 eprintln!("{}: {} pass, {} fail, {} skip", name, pass, fail, skip); 246 total_pass += pass; 247 total_fail += fail; 248 total_skip += skip; 249 } 250 251 eprintln!(); 252 eprintln!( 253 "html5lib tokenizer totals: {} pass, {} fail, {} skip ({} total)", 254 total_pass, 255 total_fail, 256 total_skip, 257 total_pass + total_fail + total_skip 258 ); 259 260 // The test "passes" as a harness — it reports results but doesn't fail 261 // the test suite until we have an implementation to measure against. 262 // This lets CI always run and report progress. 263}