//! html5lib tokenizer test harness.
//!
//! Reads JSON test files from `tests/html5lib-tests/tokenizer/` and runs each
//! test case against our HTML tokenizer. Reports pass/fail/skip counts.
//!
//! Run with: `cargo test -p we-html --test html5lib_tokenizer`
mod json;
use json::JsonValue;
use we_html::Token;
/// Path from this crate's manifest directory up to the workspace root.
/// The html5lib test data is expected under `<root>/tests/html5lib-tests/`.
const WORKSPACE_ROOT: &str = concat!(env!("CARGO_MANIFEST_DIR"), "/../../");
/// Convert a JSON output token (array) into our `Token` type for comparison.
fn json_to_token(val: &JsonValue) -> Option {
let arr = val.as_array()?;
let kind = arr.first()?.as_str()?;
match kind {
"DOCTYPE" => {
let name = arr.get(1).and_then(|v| v.as_str()).map(String::from);
let public_id = match arr.get(2) {
Some(JsonValue::Null) => None,
Some(v) => v.as_str().map(String::from),
None => None,
};
let system_id = match arr.get(3) {
Some(JsonValue::Null) => None,
Some(v) => v.as_str().map(String::from),
None => None,
};
let correctness = arr.get(4).and_then(|v| v.as_bool()).unwrap_or(true);
Some(Token::Doctype {
name,
public_id,
system_id,
force_quirks: !correctness,
})
}
"StartTag" => {
let name = arr.get(1)?.as_str()?.to_string();
let mut attributes = Vec::new();
if let Some(attrs_obj) = arr.get(2).and_then(|v| v.as_object()) {
for (k, v) in attrs_obj {
let val_str = v.as_str().unwrap_or("").to_string();
attributes.push((k.clone(), val_str));
}
}
let self_closing = arr.get(3).and_then(|v| v.as_bool()).unwrap_or(false);
Some(Token::StartTag {
name,
attributes,
self_closing,
})
}
"EndTag" => {
let name = arr.get(1)?.as_str()?.to_string();
Some(Token::EndTag { name })
}
"Character" => {
let data = arr.get(1)?.as_str()?.to_string();
Some(Token::Character(data))
}
"Comment" => {
let data = arr.get(1)?.as_str()?.to_string();
Some(Token::Comment(data))
}
_ => None,
}
}
/// Decode html5lib "double-escaped" strings.
///
/// When a test sets `doubleEscaped: true`, its strings contain literal
/// `\uXXXX` sequences (a second layer of JSON-style string escaping) that
/// must be decoded before use. As in JSON, astral-plane characters are
/// written as a surrogate pair (e.g. `\uD83D\uDE00`), which is combined
/// here into a single `char`. Malformed escapes and lone surrogates (not
/// representable in a Rust `String`) are left as literal text.
fn unescape_double_escaped(s: &str) -> String {
    let mut result = String::with_capacity(s.len());
    let mut chars = s.chars();
    while let Some(ch) = chars.next() {
        if ch != '\\' {
            result.push(ch);
            continue;
        }
        match chars.next() {
            Some('u') => {
                let hex: String = chars.by_ref().take(4).collect();
                let cp = if hex.len() == 4 {
                    u32::from_str_radix(&hex, 16).ok()
                } else {
                    None
                };
                let mut decoded = false;
                if let Some(cp) = cp {
                    if (0xD800..=0xDBFF).contains(&cp) {
                        // High surrogate: look ahead for a matching low
                        // surrogate escape and combine the pair into one
                        // scalar value. The clone lets us back out if the
                        // lookahead doesn't match.
                        let mut look = chars.clone();
                        if look.next() == Some('\\') && look.next() == Some('u') {
                            let lo_hex: String = look.by_ref().take(4).collect();
                            if let Ok(lo) = u32::from_str_radix(&lo_hex, 16) {
                                if lo_hex.len() == 4 && (0xDC00..=0xDFFF).contains(&lo) {
                                    let scalar = 0x10000 + ((cp - 0xD800) << 10) + (lo - 0xDC00);
                                    if let Some(c) = char::from_u32(scalar) {
                                        result.push(c);
                                        chars = look; // consume the low-surrogate escape too
                                        decoded = true;
                                    }
                                }
                            }
                        }
                    } else if let Some(c) = char::from_u32(cp) {
                        result.push(c);
                        decoded = true;
                    }
                }
                if !decoded {
                    // Truncated/invalid hex or a lone surrogate: keep the
                    // escape verbatim, exactly as the raw test data had it.
                    result.push('\\');
                    result.push('u');
                    result.push_str(&hex);
                }
            }
            Some(other) => {
                // Any other backslash sequence passes through untouched.
                result.push('\\');
                result.push(other);
            }
            None => {
                // Trailing lone backslash.
                result.push('\\');
            }
        }
    }
    result
}
/// Run a single test case and return whether it passed.
fn run_test_case(test: &JsonValue, double_escaped: bool) -> bool {
let input = match test.get("input").and_then(|v| v.as_str()) {
Some(s) => {
if double_escaped {
unescape_double_escaped(s)
} else {
s.to_string()
}
}
None => return false,
};
let expected_output = match test.get("output").and_then(|v| v.as_array()) {
Some(arr) => arr,
None => return false,
};
// Convert expected output tokens.
let expected_tokens: Vec = expected_output
.iter()
.filter_map(|tok_json| {
let mut tok = json_to_token(tok_json)?;
if double_escaped {
match &mut tok {
Token::Character(ref mut s) => *s = unescape_double_escaped(s),
Token::Comment(ref mut s) => *s = unescape_double_escaped(s),
_ => {}
}
}
Some(tok)
})
.collect();
// Run our tokenizer.
let actual_tokens = we_html::tokenize(&input);
actual_tokens == expected_tokens
}
/// Load one html5lib tokenizer test file and run every case in it.
///
/// Returns `(pass, fail, skip)` counts. A file-level problem (unreadable
/// file, bad JSON, missing `tests` array) is reported as a single skip.
fn run_test_file(path: &std::path::Path) -> (usize, usize, usize) {
    let content = match std::fs::read_to_string(path) {
        Ok(c) => c,
        Err(e) => {
            eprintln!(" failed to read {}: {}", path.display(), e);
            return (0, 0, 1);
        }
    };
    let root = match json::parse(&content) {
        Ok(v) => v,
        Err(e) => {
            eprintln!(" failed to parse {}: {}", path.display(), e);
            return (0, 0, 1);
        }
    };
    let cases = match root.get("tests").and_then(|v| v.as_array()) {
        Some(t) => t,
        None => {
            eprintln!(" no 'tests' array in {}", path.display());
            return (0, 0, 1);
        }
    };
    let (mut pass, mut fail, mut skip) = (0usize, 0usize, 0usize);
    for case in cases {
        let description = case
            .get("description")
            .and_then(|v| v.as_str())
            .unwrap_or("");
        let needs_unescape = case
            .get("doubleEscaped")
            .and_then(|v| v.as_bool())
            .unwrap_or(false);
        // A case may list initialStates to run under; we only support the
        // default "Data state", so skip cases that never start there.
        let runs_in_data_state = match case.get("initialStates").and_then(|v| v.as_array()) {
            Some(states) => states.iter().any(|s| s.as_str() == Some("Data state")),
            None => true,
        };
        if !runs_in_data_state {
            skip += 1;
            continue;
        }
        if run_test_case(case, needs_unescape) {
            pass += 1;
        } else {
            fail += 1;
            // Cap failure output at five per file to keep logs readable.
            if fail <= 5 {
                eprintln!(" FAIL: {}", description);
            }
        }
    }
    (pass, fail, skip)
}
/// Harness entry point: discover every `.test` file in the html5lib
/// tokenizer directory, run each, and print per-file plus total counts.
///
/// This never fails the suite — it reports progress so CI can always run it,
/// and it tolerates the submodule not being checked out.
#[test]
fn html5lib_tokenizer_tests() {
    let test_dir = std::path::PathBuf::from(WORKSPACE_ROOT).join("tests/html5lib-tests/tokenizer");
    if !test_dir.exists() {
        eprintln!(
            "html5lib-tests submodule not checked out at {}",
            test_dir.display()
        );
        eprintln!("Run: git submodule update --init tests/html5lib-tests");
        // Don't fail the test — the submodule might not be initialized.
        return;
    }
    // Collect the `.test` files and sort so output order is stable.
    let mut files: Vec<std::path::PathBuf> = std::fs::read_dir(&test_dir)
        .expect("failed to read tokenizer test dir")
        .filter_map(|e| e.ok())
        .map(|e| e.path())
        .filter(|p| p.extension().map_or(false, |ext| ext == "test"))
        .collect();
    files.sort();
    let (mut total_pass, mut total_fail, mut total_skip) = (0usize, 0usize, 0usize);
    for path in &files {
        let name = path.file_name().unwrap().to_string_lossy();
        let (pass, fail, skip) = run_test_file(path);
        eprintln!("{}: {} pass, {} fail, {} skip", name, pass, fail, skip);
        total_pass += pass;
        total_fail += fail;
        total_skip += skip;
    }
    eprintln!();
    eprintln!(
        "html5lib tokenizer totals: {} pass, {} fail, {} skip ({} total)",
        total_pass,
        total_fail,
        total_skip,
        total_pass + total_fail + total_skip
    );
}