//! html5lib tokenizer test harness.
//!
//! Reads JSON test files from `tests/html5lib-tests/tokenizer/` and runs each
//! test case against our HTML tokenizer. Reports pass/fail/skip counts.
//!
//! Run with: `cargo test -p we-html --test html5lib_tokenizer`
mod json;
use json::JsonValue;
use we_html::Token;
/// Path from this crate's manifest directory up to the workspace root.
/// The html5lib test data is expected under `<root>/tests/html5lib-tests/`.
const WORKSPACE_ROOT: &str = concat!(env!("CARGO_MANIFEST_DIR"), "/../../");
/// Convert a JSON output token (array) into our `Token` type for comparison.
fn json_to_token(val: &JsonValue) -> Option {
let arr = val.as_array()?;
let kind = arr.first()?.as_str()?;
match kind {
"DOCTYPE" => {
let name = arr.get(1).and_then(|v| v.as_str()).map(String::from);
let public_id = match arr.get(2) {
Some(JsonValue::Null) => None,
Some(v) => v.as_str().map(String::from),
None => None,
};
let system_id = match arr.get(3) {
Some(JsonValue::Null) => None,
Some(v) => v.as_str().map(String::from),
None => None,
};
let correctness = arr.get(4).and_then(|v| v.as_bool()).unwrap_or(true);
Some(Token::Doctype {
name,
public_id,
system_id,
force_quirks: !correctness,
})
}
"StartTag" => {
let name = arr.get(1)?.as_str()?.to_string();
let mut attributes = Vec::new();
if let Some(attrs_obj) = arr.get(2).and_then(|v| v.as_object()) {
for (k, v) in attrs_obj {
let val_str = v.as_str().unwrap_or("").to_string();
attributes.push((k.clone(), val_str));
}
}
let self_closing = arr.get(3).and_then(|v| v.as_bool()).unwrap_or(false);
Some(Token::StartTag {
name,
attributes,
self_closing,
})
}
"EndTag" => {
let name = arr.get(1)?.as_str()?.to_string();
Some(Token::EndTag { name })
}
"Character" => {
let data = arr.get(1)?.as_str()?.to_string();
Some(Token::Character(data))
}
"Comment" => {
let data = arr.get(1)?.as_str()?.to_string();
Some(Token::Comment(data))
}
_ => None,
}
}
/// Decode html5lib "double-escaped" strings.
///
/// When a test sets `doubleEscaped: true`, its strings contain literal
/// `\uXXXX` sequences (a second layer of JSON-style string escaping) that
/// must be decoded before use. As in JSON, astral-plane characters are
/// written as a surrogate pair (e.g. `\uD83D\uDE00`), which is combined
/// here into a single `char`. Malformed escapes and lone surrogates (not
/// representable in a Rust `String`) are left as literal text.
fn unescape_double_escaped(s: &str) -> String {
    let mut result = String::with_capacity(s.len());
    let mut chars = s.chars();
    while let Some(ch) = chars.next() {
        if ch != '\\' {
            result.push(ch);
            continue;
        }
        match chars.next() {
            Some('u') => {
                let hex: String = chars.by_ref().take(4).collect();
                let cp = if hex.len() == 4 {
                    u32::from_str_radix(&hex, 16).ok()
                } else {
                    None
                };
                let mut decoded = false;
                if let Some(cp) = cp {
                    if (0xD800..=0xDBFF).contains(&cp) {
                        // High surrogate: look ahead for a matching low
                        // surrogate escape and combine the pair into one
                        // scalar value. The clone lets us back out if the
                        // lookahead doesn't match.
                        let mut look = chars.clone();
                        if look.next() == Some('\\') && look.next() == Some('u') {
                            let lo_hex: String = look.by_ref().take(4).collect();
                            if let Ok(lo) = u32::from_str_radix(&lo_hex, 16) {
                                if lo_hex.len() == 4 && (0xDC00..=0xDFFF).contains(&lo) {
                                    let scalar = 0x10000 + ((cp - 0xD800) << 10) + (lo - 0xDC00);
                                    if let Some(c) = char::from_u32(scalar) {
                                        result.push(c);
                                        chars = look; // consume the low-surrogate escape too
                                        decoded = true;
                                    }
                                }
                            }
                        }
                    } else if let Some(c) = char::from_u32(cp) {
                        result.push(c);
                        decoded = true;
                    }
                }
                if !decoded {
                    // Truncated/invalid hex or a lone surrogate: keep the
                    // escape verbatim, exactly as the raw test data had it.
                    result.push('\\');
                    result.push('u');
                    result.push_str(&hex);
                }
            }
            Some(other) => {
                // Any other backslash sequence passes through untouched.
                result.push('\\');
                result.push(other);
            }
            None => {
                // Trailing lone backslash.
                result.push('\\');
            }
        }
    }
    result
}
/// Run a single test case and return whether it passed.
fn run_test_case(test: &JsonValue, double_escaped: bool) -> bool {
let input = match test.get("input").and_then(|v| v.as_str()) {
Some(s) => {
if double_escaped {
unescape_double_escaped(s)
} else {
s.to_string()
}
}
None => return false,
};
let expected_output = match test.get("output").and_then(|v| v.as_array()) {
Some(arr) => arr,
None => return false,
};
// Convert expected output tokens.
let expected_tokens: Vec = expected_output
.iter()
.filter_map(|tok_json| {
let mut tok = json_to_token(tok_json)?;
if double_escaped {
match &mut tok {
Token::Character(ref mut s) => *s = unescape_double_escaped(s),
Token::Comment(ref mut s) => *s = unescape_double_escaped(s),
_ => {}
}
}
Some(tok)
})
.collect();
// Run our tokenizer.
let actual_tokens = we_html::tokenize(&input);
actual_tokens == expected_tokens
}
/// Load one html5lib tokenizer test file and run every case in it.
///
/// Returns `(pass, fail, skip)` counts. A file-level problem (unreadable
/// file, bad JSON, missing `tests` array) is reported as a single skip.
fn run_test_file(path: &std::path::Path) -> (usize, usize, usize) {
    let content = match std::fs::read_to_string(path) {
        Ok(c) => c,
        Err(e) => {
            eprintln!(" failed to read {}: {}", path.display(), e);
            return (0, 0, 1);
        }
    };
    let root = match json::parse(&content) {
        Ok(v) => v,
        Err(e) => {
            eprintln!(" failed to parse {}: {}", path.display(), e);
            return (0, 0, 1);
        }
    };
    let cases = match root.get("tests").and_then(|v| v.as_array()) {
        Some(t) => t,
        None => {
            eprintln!(" no 'tests' array in {}", path.display());
            return (0, 0, 1);
        }
    };
    let (mut pass, mut fail, mut skip) = (0usize, 0usize, 0usize);
    for case in cases {
        let description = case
            .get("description")
            .and_then(|v| v.as_str())
            .unwrap_or("");
        let needs_unescape = case
            .get("doubleEscaped")
            .and_then(|v| v.as_bool())
            .unwrap_or(false);
        // A case may list initialStates to run under; we only support the
        // default "Data state", so skip cases that never start there.
        let runs_in_data_state = match case.get("initialStates").and_then(|v| v.as_array()) {
            Some(states) => states.iter().any(|s| s.as_str() == Some("Data state")),
            None => true,
        };
        if !runs_in_data_state {
            skip += 1;
            continue;
        }
        if run_test_case(case, needs_unescape) {
            pass += 1;
        } else {
            fail += 1;
            // Cap failure output at five per file to keep logs readable.
            if fail <= 5 {
                eprintln!(" FAIL: {}", description);
            }
        }
    }
    (pass, fail, skip)
}
/// Harness entry point: discover every `.test` file in the html5lib
/// tokenizer directory, run each, and print per-file plus total counts.
///
/// This never fails the suite — it reports progress so CI can always run it,
/// and it tolerates the submodule not being checked out.
#[test]
fn html5lib_tokenizer_tests() {
    let test_dir = std::path::PathBuf::from(WORKSPACE_ROOT).join("tests/html5lib-tests/tokenizer");
    if !test_dir.exists() {
        eprintln!(
            "html5lib-tests submodule not checked out at {}",
            test_dir.display()
        );
        eprintln!("Run: git submodule update --init tests/html5lib-tests");
        // Don't fail the test — the submodule might not be initialized.
        return;
    }
    // Collect the `.test` files and sort so output order is stable.
    let mut files: Vec<std::path::PathBuf> = std::fs::read_dir(&test_dir)
        .expect("failed to read tokenizer test dir")
        .filter_map(|e| e.ok())
        .map(|e| e.path())
        .filter(|p| p.extension().map_or(false, |ext| ext == "test"))
        .collect();
    files.sort();
    let (mut total_pass, mut total_fail, mut total_skip) = (0usize, 0usize, 0usize);
    for path in &files {
        let name = path.file_name().unwrap().to_string_lossy();
        let (pass, fail, skip) = run_test_file(path);
        eprintln!("{}: {} pass, {} fail, {} skip", name, pass, fail, skip);
        total_pass += pass;
        total_fail += fail;
        total_skip += skip;
    }
    eprintln!();
    eprintln!(
        "html5lib tokenizer totals: {} pass, {} fail, {} skip ({} total)",
        total_pass,
        total_fail,
        total_skip,
        total_pass + total_fail + total_skip
    );
}