//! Minimal JSON parser for reading html5lib test fixtures.
//!
//! Supports the subset of JSON used by html5lib-tests: objects, arrays,
//! strings (with escape sequences including `\uXXXX`), numbers, booleans,
//! and null.
/// A JSON value.
///
/// Objects are kept as a list of `(key, value)` pairs rather than a map, so
/// member order is whatever order the pairs were pushed in and duplicate keys
/// are not merged.
#[derive(Debug, Clone, PartialEq)]
pub enum JsonValue {
    /// The `null` literal.
    Null,
    /// `true` or `false`.
    Bool(bool),
    /// A number, stored as `f64`.
    Number(f64),
    /// A string value.
    Str(String),
    /// An array of values.
    Array(Vec<JsonValue>),
    /// An object, as `(key, value)` pairs.
    Object(Vec<(String, JsonValue)>),
}

impl JsonValue {
    /// Returns the contained string slice, or `None` if this is not a `Str`.
    pub fn as_str(&self) -> Option<&str> {
        match self {
            JsonValue::Str(s) => Some(s.as_str()),
            _ => None,
        }
    }

    /// Returns the contained elements, or `None` if this is not an `Array`.
    pub fn as_array(&self) -> Option<&[JsonValue]> {
        match self {
            JsonValue::Array(a) => Some(a.as_slice()),
            _ => None,
        }
    }

    /// Returns the contained `(key, value)` pairs, or `None` if this is not an `Object`.
    pub fn as_object(&self) -> Option<&[(String, JsonValue)]> {
        match self {
            JsonValue::Object(o) => Some(o.as_slice()),
            _ => None,
        }
    }

    /// Returns the contained boolean, or `None` if this is not a `Bool`.
    pub fn as_bool(&self) -> Option<bool> {
        match self {
            JsonValue::Bool(b) => Some(*b),
            _ => None,
        }
    }

    /// Look up a key in a JSON object.
    ///
    /// Returns the value of the first pair whose key equals `key`. Returns
    /// `None` when no pair matches or when `self` is not an `Object`.
    pub fn get(&self, key: &str) -> Option<&JsonValue> {
        match self {
            JsonValue::Object(pairs) => pairs
                .iter()
                .find(|(k, _)| k.as_str() == key)
                .map(|(_, v)| v),
            _ => None,
        }
    }
}
struct Parser<'a> {
bytes: &'a [u8],
pos: usize,
}
impl<'a> Parser<'a> {
fn new(input: &'a str) -> Self {
Self {
bytes: input.as_bytes(),
pos: 0,
}
}
fn skip_ws(&mut self) {
while self.pos < self.bytes.len() {
match self.bytes[self.pos] {
b' ' | b'\t' | b'\n' | b'\r' => self.pos += 1,
_ => break,
}
}
}
fn peek(&self) -> Option {
self.bytes.get(self.pos).copied()
}
fn advance(&mut self) -> Option {
let b = self.bytes.get(self.pos).copied()?;
self.pos += 1;
Some(b)
}
fn expect(&mut self, ch: u8) -> Result<(), String> {
match self.advance() {
Some(b) if b == ch => Ok(()),
Some(b) => Err(format!(
"expected '{}' at pos {}, got '{}'",
ch as char, self.pos, b as char
)),
None => Err(format!(
"expected '{}' at pos {}, got EOF",
ch as char, self.pos
)),
}
}
fn parse_value(&mut self) -> Result {
self.skip_ws();
match self.peek() {
Some(b'"') => self.parse_string().map(JsonValue::Str),
Some(b'{') => self.parse_object(),
Some(b'[') => self.parse_array(),
Some(b't') => self.parse_literal("true", JsonValue::Bool(true)),
Some(b'f') => self.parse_literal("false", JsonValue::Bool(false)),
Some(b'n') => self.parse_literal("null", JsonValue::Null),
Some(b'-') | Some(b'0'..=b'9') => self.parse_number(),
Some(b) => Err(format!(
"unexpected byte '{}' at pos {}",
b as char, self.pos
)),
None => Err("unexpected EOF".into()),
}
}
fn parse_string(&mut self) -> Result {
self.expect(b'"')?;
let mut s = String::new();
loop {
match self.advance() {
Some(b'"') => return Ok(s),
Some(b'\\') => match self.advance() {
Some(b'"') => s.push('"'),
Some(b'\\') => s.push('\\'),
Some(b'/') => s.push('/'),
Some(b'n') => s.push('\n'),
Some(b'r') => s.push('\r'),
Some(b't') => s.push('\t'),
Some(b'b') => s.push('\u{0008}'),
Some(b'f') => s.push('\u{000C}'),
Some(b'u') => {
let cp = self.parse_hex4()?;
// Handle surrogate pairs.
if (0xD800..=0xDBFF).contains(&cp) {
// High surrogate — expect \uXXXX low surrogate.
if self.advance() == Some(b'\\') && self.advance() == Some(b'u') {
let lo = self.parse_hex4()?;
if (0xDC00..=0xDFFF).contains(&lo) {
let combined = 0x10000
+ ((cp as u32 - 0xD800) << 10)
+ (lo as u32 - 0xDC00);
if let Some(ch) = char::from_u32(combined) {
s.push(ch);
}
}
}
} else if let Some(ch) = char::from_u32(cp as u32) {
s.push(ch);
}
}
Some(b) => {
s.push('\\');
s.push(b as char);
}
None => return Err("unexpected EOF in string escape".into()),
},
Some(_) => {
// We need to handle multi-byte UTF-8 properly.
// Since we're working on bytes, back up and grab the char.
self.pos -= 1;
let rest = std::str::from_utf8(&self.bytes[self.pos..])
.map_err(|e| format!("invalid UTF-8: {}", e))?;
let ch = rest.chars().next().unwrap();
self.pos += ch.len_utf8();
s.push(ch);
}
None => return Err("unexpected EOF in string".into()),
}
}
}
fn parse_hex4(&mut self) -> Result {
let mut val: u16 = 0;
for _ in 0..4 {
let b = self.advance().ok_or("unexpected EOF in \\u escape")?;
let digit = match b {
b'0'..=b'9' => b - b'0',
b'a'..=b'f' => b - b'a' + 10,
b'A'..=b'F' => b - b'A' + 10,
_ => return Err(format!("invalid hex digit '{}'", b as char)),
};
val = val * 16 + digit as u16;
}
Ok(val)
}
fn parse_number(&mut self) -> Result {
let start = self.pos;
if self.peek() == Some(b'-') {
self.pos += 1;
}
while self.pos < self.bytes.len() && self.bytes[self.pos].is_ascii_digit() {
self.pos += 1;
}
if self.pos < self.bytes.len() && self.bytes[self.pos] == b'.' {
self.pos += 1;
while self.pos < self.bytes.len() && self.bytes[self.pos].is_ascii_digit() {
self.pos += 1;
}
}
if self.pos < self.bytes.len()
&& (self.bytes[self.pos] == b'e' || self.bytes[self.pos] == b'E')
{
self.pos += 1;
if self.pos < self.bytes.len()
&& (self.bytes[self.pos] == b'+' || self.bytes[self.pos] == b'-')
{
self.pos += 1;
}
while self.pos < self.bytes.len() && self.bytes[self.pos].is_ascii_digit() {
self.pos += 1;
}
}
let s = std::str::from_utf8(&self.bytes[start..self.pos])
.map_err(|e| format!("invalid UTF-8 in number: {}", e))?;
let n: f64 = s
.parse()
.map_err(|e| format!("invalid number '{}': {}", s, e))?;
Ok(JsonValue::Number(n))
}
fn parse_object(&mut self) -> Result {
self.expect(b'{')?;
self.skip_ws();
let mut pairs = Vec::new();
if self.peek() == Some(b'}') {
self.pos += 1;
return Ok(JsonValue::Object(pairs));
}
loop {
self.skip_ws();
let key = self.parse_string()?;
self.skip_ws();
self.expect(b':')?;
let val = self.parse_value()?;
pairs.push((key, val));
self.skip_ws();
match self.peek() {
Some(b',') => {
self.pos += 1;
}
Some(b'}') => {
self.pos += 1;
return Ok(JsonValue::Object(pairs));
}
_ => return Err(format!("expected ',' or '}}' at pos {}", self.pos)),
}
}
}
fn parse_array(&mut self) -> Result {
self.expect(b'[')?;
self.skip_ws();
let mut elems = Vec::new();
if self.peek() == Some(b']') {
self.pos += 1;
return Ok(JsonValue::Array(elems));
}
loop {
let val = self.parse_value()?;
elems.push(val);
self.skip_ws();
match self.peek() {
Some(b',') => {
self.pos += 1;
}
Some(b']') => {
self.pos += 1;
return Ok(JsonValue::Array(elems));
}
_ => return Err(format!("expected ',' or ']' at pos {}", self.pos)),
}
}
}
fn parse_literal(&mut self, expected: &str, value: JsonValue) -> Result {
for b in expected.bytes() {
match self.advance() {
Some(got) if got == b => {}
_ => return Err(format!("expected literal '{}'", expected)),
}
}
Ok(value)
}
}
/// Parse a JSON string into a `JsonValue`.
pub fn parse(input: &str) -> Result {
let mut parser = Parser::new(input);
let val = parser.parse_value()?;
parser.skip_ws();
if parser.pos != parser.bytes.len() {
return Err(format!("trailing data at pos {}", parser.pos));
}
Ok(val)
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Object members can be fetched by key and keep their value types.
    #[test]
    fn parse_simple_object() {
        let parsed = parse(r#"{"a": 1, "b": "hello"}"#).unwrap();
        let expected_a = JsonValue::Number(1.0);
        let expected_b = JsonValue::Str(String::from("hello"));
        assert_eq!(parsed.get("a"), Some(&expected_a));
        assert_eq!(parsed.get("b"), Some(&expected_b));
    }

    /// A mixed array keeps its length and the literal elements.
    #[test]
    fn parse_array() {
        let parsed = parse(r#"[1, "two", true, null]"#).unwrap();
        let items = parsed.as_array().unwrap();
        assert_eq!(items.len(), 4);
        assert_eq!(items[2], JsonValue::Bool(true));
        assert_eq!(items[3], JsonValue::Null);
    }

    /// An array nested inside an object is reachable through `get`.
    #[test]
    fn parse_nested() {
        let parsed = parse(r#"{"tests": [{"desc": "a"}]}"#).unwrap();
        let inner = parsed.get("tests").unwrap();
        let cases = inner.as_array().unwrap();
        assert_eq!(cases.len(), 1);
    }

    /// `\n` inside a string decodes to a line feed.
    #[test]
    fn parse_string_escapes() {
        let parsed = parse(r#""hello\nworld""#).unwrap();
        let text = parsed.as_str().unwrap();
        assert_eq!(text, "hello\nworld");
    }

    /// A `\uXXXX` escape decodes to the corresponding character.
    #[test]
    fn parse_unicode_escape() {
        let parsed = parse(r#""\u0041""#).unwrap();
        let text = parsed.as_str().unwrap();
        assert_eq!(text, "A");
    }
}