//! web engine - experimental web browser
1//! html5lib tokenizer test harness.
2//!
3//! Reads JSON test files from `tests/html5lib-tests/tokenizer/` and runs each
4//! test case against our HTML tokenizer. Reports pass/fail/skip counts.
5//!
6//! Run with: `cargo test -p we-html --test html5lib_tokenizer`
7
8mod json;
9
10use json::JsonValue;
11use we_html::Token;
12
/// Workspace root relative to the crate directory.
///
/// `CARGO_MANIFEST_DIR` is this crate's directory at compile time; the
/// trailing `"/../../"` climbs two levels to reach the repository root,
/// where `tests/html5lib-tests` lives (assumes the crate sits two levels
/// below the workspace root, e.g. `crates/we-html` — TODO confirm).
const WORKSPACE_ROOT: &str = concat!(env!("CARGO_MANIFEST_DIR"), "/../../");
15
16/// Convert a JSON output token (array) into our `Token` type for comparison.
17fn json_to_token(val: &JsonValue) -> Option<Token> {
18 let arr = val.as_array()?;
19 let kind = arr.first()?.as_str()?;
20 match kind {
21 "DOCTYPE" => {
22 let name = arr.get(1).and_then(|v| v.as_str()).map(String::from);
23 let public_id = match arr.get(2) {
24 Some(JsonValue::Null) => None,
25 Some(v) => v.as_str().map(String::from),
26 None => None,
27 };
28 let system_id = match arr.get(3) {
29 Some(JsonValue::Null) => None,
30 Some(v) => v.as_str().map(String::from),
31 None => None,
32 };
33 let correctness = arr.get(4).and_then(|v| v.as_bool()).unwrap_or(true);
34 Some(Token::Doctype {
35 name,
36 public_id,
37 system_id,
38 force_quirks: !correctness,
39 })
40 }
41 "StartTag" => {
42 let name = arr.get(1)?.as_str()?.to_string();
43 let mut attributes = Vec::new();
44 if let Some(attrs_obj) = arr.get(2).and_then(|v| v.as_object()) {
45 for (k, v) in attrs_obj {
46 let val_str = v.as_str().unwrap_or("").to_string();
47 attributes.push((k.clone(), val_str));
48 }
49 }
50 let self_closing = arr.get(3).and_then(|v| v.as_bool()).unwrap_or(false);
51 Some(Token::StartTag {
52 name,
53 attributes,
54 self_closing,
55 })
56 }
57 "EndTag" => {
58 let name = arr.get(1)?.as_str()?.to_string();
59 Some(Token::EndTag { name })
60 }
61 "Character" => {
62 let data = arr.get(1)?.as_str()?.to_string();
63 Some(Token::Character(data))
64 }
65 "Comment" => {
66 let data = arr.get(1)?.as_str()?.to_string();
67 Some(Token::Comment(data))
68 }
69 _ => None,
70 }
71}
72
/// Decode the "double-escaped" form used by the html5lib test format.
///
/// When a test sets `"doubleEscaped": true`, its input and expected strings
/// contain literal `\uXXXX` sequences (six characters of text) that must be
/// decoded before use. The format encodes astral-plane characters as UTF-16
/// surrogate pairs (`\uD800-\uDBFF` followed by `\uDC00-\uDFFF`), which this
/// function combines into a single scalar value; an unpaired surrogate —
/// which a Rust `String` cannot store — becomes U+FFFD REPLACEMENT
/// CHARACTER. Any other `\x` sequence, or a malformed `\u` escape, is
/// copied through verbatim.
fn unescape_double_escaped(s: &str) -> String {
    let chars: Vec<char> = s.chars().collect();
    let mut result = String::with_capacity(s.len());
    let mut i = 0;
    while i < chars.len() {
        if chars[i] == '\\' && chars.get(i + 1) == Some(&'u') {
            if let Some(hi) = parse_hex4(&chars, i + 2) {
                i += 6; // consume `\uXXXX`
                if (0xD800..=0xDBFF).contains(&hi) {
                    // High surrogate: try to combine with an immediately
                    // following low-surrogate escape into one code point.
                    let low = if chars.get(i) == Some(&'\\') && chars.get(i + 1) == Some(&'u') {
                        parse_hex4(&chars, i + 2).filter(|lo| (0xDC00..=0xDFFF).contains(lo))
                    } else {
                        None
                    };
                    match low {
                        Some(lo) => {
                            // 0x10000..=0x10FFFF by construction, so always valid.
                            let cp = 0x10000 + ((hi - 0xD800) << 10) + (lo - 0xDC00);
                            result.push(char::from_u32(cp).unwrap_or('\u{FFFD}'));
                            i += 6; // consume the low-surrogate escape too
                        }
                        // Unpaired surrogate cannot live in a Rust String.
                        None => result.push('\u{FFFD}'),
                    }
                } else {
                    // BMP code point; `from_u32` only fails on surrogates,
                    // and a lone low surrogate also becomes U+FFFD here.
                    result.push(char::from_u32(hi).unwrap_or('\u{FFFD}'));
                }
                continue;
            }
            // `\u` not followed by 4 hex digits: fall through, copy verbatim.
        }
        if chars[i] == '\\' {
            // `\` followed by anything else (or end of input): copy as-is.
            result.push('\\');
            i += 1;
            if let Some(&c) = chars.get(i) {
                result.push(c);
                i += 1;
            }
        } else {
            result.push(chars[i]);
            i += 1;
        }
    }
    result
}

/// Parse exactly four hex digits starting at `chars[start]`, if present.
fn parse_hex4(chars: &[char], start: usize) -> Option<u32> {
    let digits = chars.get(start..start + 4)?;
    u32::from_str_radix(&digits.iter().collect::<String>(), 16).ok()
}
110
111/// Run a single test case and return whether it passed.
112fn run_test_case(test: &JsonValue, double_escaped: bool) -> bool {
113 let input = match test.get("input").and_then(|v| v.as_str()) {
114 Some(s) => {
115 if double_escaped {
116 unescape_double_escaped(s)
117 } else {
118 s.to_string()
119 }
120 }
121 None => return false,
122 };
123
124 let expected_output = match test.get("output").and_then(|v| v.as_array()) {
125 Some(arr) => arr,
126 None => return false,
127 };
128
129 // Convert expected output tokens.
130 let expected_tokens: Vec<Token> = expected_output
131 .iter()
132 .filter_map(|tok_json| {
133 let mut tok = json_to_token(tok_json)?;
134 if double_escaped {
135 match &mut tok {
136 Token::Character(ref mut s) => *s = unescape_double_escaped(s),
137 Token::Comment(ref mut s) => *s = unescape_double_escaped(s),
138 _ => {}
139 }
140 }
141 Some(tok)
142 })
143 .collect();
144
145 // Run our tokenizer.
146 let actual_tokens = we_html::tokenize(&input);
147
148 actual_tokens == expected_tokens
149}
150
151/// Load and run all test cases from a single html5lib tokenizer test file.
152fn run_test_file(path: &std::path::Path) -> (usize, usize, usize) {
153 let content = match std::fs::read_to_string(path) {
154 Ok(c) => c,
155 Err(e) => {
156 eprintln!(" failed to read {}: {}", path.display(), e);
157 return (0, 0, 1);
158 }
159 };
160
161 let root = match json::parse(&content) {
162 Ok(v) => v,
163 Err(e) => {
164 eprintln!(" failed to parse {}: {}", path.display(), e);
165 return (0, 0, 1);
166 }
167 };
168
169 let tests = match root.get("tests").and_then(|v| v.as_array()) {
170 Some(t) => t,
171 None => {
172 eprintln!(" no 'tests' array in {}", path.display());
173 return (0, 0, 1);
174 }
175 };
176
177 let mut pass = 0;
178 let mut fail = 0;
179 let mut skip = 0;
180
181 for test in tests {
182 let desc = test
183 .get("description")
184 .and_then(|v| v.as_str())
185 .unwrap_or("<no description>");
186
187 let double_escaped = test
188 .get("doubleEscaped")
189 .and_then(|v| v.as_bool())
190 .unwrap_or(false);
191
192 // If the test specifies initialStates, we run once per state.
193 // For now we only support the default "Data state" so skip others.
194 if let Some(states) = test.get("initialStates").and_then(|v| v.as_array()) {
195 let has_data_state = states.iter().any(|s| s.as_str() == Some("Data state"));
196 if !has_data_state {
197 skip += 1;
198 continue;
199 }
200 }
201
202 if run_test_case(test, double_escaped) {
203 pass += 1;
204 } else {
205 fail += 1;
206 // Only print first few failures to avoid noise.
207 if fail <= 5 {
208 eprintln!(" FAIL: {}", desc);
209 }
210 }
211 }
212
213 (pass, fail, skip)
214}
215
#[test]
fn html5lib_tokenizer_tests() {
    let test_dir = std::path::PathBuf::from(WORKSPACE_ROOT).join("tests/html5lib-tests/tokenizer");

    // The submodule may simply not be checked out; report how to get it
    // rather than failing the whole suite.
    if !test_dir.exists() {
        eprintln!(
            "html5lib-tests submodule not checked out at {}",
            test_dir.display()
        );
        eprintln!("Run: git submodule update --init tests/html5lib-tests");
        return;
    }

    // Collect every `*.test` file, sorted for deterministic output order.
    let mut files: Vec<std::path::PathBuf> = std::fs::read_dir(&test_dir)
        .expect("failed to read tokenizer test dir")
        .filter_map(Result::ok)
        .map(|entry| entry.path())
        .filter(|p| p.extension().map_or(false, |ext| ext == "test"))
        .collect();
    files.sort();

    let (mut total_pass, mut total_fail, mut total_skip) = (0, 0, 0);

    for path in &files {
        let name = path.file_name().unwrap().to_string_lossy();
        let (pass, fail, skip) = run_test_file(path);
        eprintln!("{}: {} pass, {} fail, {} skip", name, pass, fail, skip);
        total_pass += pass;
        total_fail += fail;
        total_skip += skip;
    }

    eprintln!();
    eprintln!(
        "html5lib tokenizer totals: {} pass, {} fail, {} skip ({} total)",
        total_pass,
        total_fail,
        total_skip,
        total_pass + total_fail + total_skip
    );

    // The test "passes" as a harness — it reports results but doesn't fail
    // the test suite until we have an implementation to measure against.
    // This lets CI always run and report progress.
}
263}