//! Minimal JSON parser for reading html5lib test fixtures.
//!
//! Supports the subset of JSON used by html5lib-tests: objects, arrays,
//! strings (with escape sequences including `\uXXXX`), numbers, booleans,
//! and null.

/// A parsed JSON value.
///
/// Objects are stored as a list of `(key, value)` pairs in the order they
/// appear in the input; duplicate keys are kept as separate entries.
#[derive(Debug, Clone, PartialEq)]
pub enum JsonValue {
    /// The literal `null`.
    Null,
    /// The literals `true` / `false`.
    Bool(bool),
    /// Any JSON number, stored as an `f64`.
    Number(f64),
    /// A string with all escape sequences already decoded.
    Str(String),
    /// An array of values.
    Array(Vec<JsonValue>),
    /// An object as ordered key/value pairs.
    Object(Vec<(String, JsonValue)>),
}

impl JsonValue {
    /// Returns the string contents if this value is a [`JsonValue::Str`].
    pub fn as_str(&self) -> Option<&str> {
        match self {
            JsonValue::Str(s) => Some(s),
            _ => None,
        }
    }

    /// Returns the elements if this value is a [`JsonValue::Array`].
    pub fn as_array(&self) -> Option<&[JsonValue]> {
        match self {
            JsonValue::Array(a) => Some(a),
            _ => None,
        }
    }

    /// Returns the key/value pairs if this value is a [`JsonValue::Object`].
    pub fn as_object(&self) -> Option<&[(String, JsonValue)]> {
        match self {
            JsonValue::Object(o) => Some(o),
            _ => None,
        }
    }

    /// Returns the boolean if this value is a [`JsonValue::Bool`].
    pub fn as_bool(&self) -> Option<bool> {
        match self {
            JsonValue::Bool(b) => Some(*b),
            _ => None,
        }
    }

    /// Look up a key in a JSON object.
    ///
    /// Returns the value of the first pair whose key equals `key`, or `None`
    /// if there is no such pair or this value is not an object.
    pub fn get(&self, key: &str) -> Option<&JsonValue> {
        match self {
            JsonValue::Object(pairs) => pairs.iter().find(|(k, _)| k == key).map(|(_, v)| v),
            _ => None,
        }
    }
}

/// Recursive-descent parser over the raw bytes of the input string.
struct Parser<'a> {
    /// The input, viewed as bytes.
    bytes: &'a [u8],
    /// Index of the next unread byte; never exceeds `bytes.len()`.
    pos: usize,
}

impl<'a> Parser<'a> {
    /// Creates a parser positioned at the start of `input`.
    fn new(input: &'a str) -> Self {
        Self {
            bytes: input.as_bytes(),
            pos: 0,
        }
    }

    /// Advances past any run of space, tab, line feed or carriage return.
    fn skip_ws(&mut self) {
        while matches!(self.peek(), Some(b' ' | b'\t' | b'\n' | b'\r')) {
            self.pos += 1;
        }
    }

    /// Advances past any run of ASCII digits.
    fn skip_digits(&mut self) {
        while matches!(self.peek(), Some(b) if b.is_ascii_digit()) {
            self.pos += 1;
        }
    }

    /// Returns the next byte without consuming it, or `None` at end of input.
    fn peek(&self) -> Option<u8> {
        self.bytes.get(self.pos).copied()
    }

    /// Consumes and returns the next byte, or `None` at end of input.
    fn advance(&mut self) -> Option<u8> {
        let b = self.bytes.get(self.pos).copied()?;
        self.pos += 1;
        Some(b)
    }

    /// Consumes the next byte and checks that it equals `ch`.
    ///
    /// # Errors
    ///
    /// Returns a message naming the expected byte, the position at which it
    /// was expected, and what was found instead (a byte or EOF).
    fn expect(&mut self, ch: u8) -> Result<(), String> {
        // Remember where the byte was expected: `advance` moves `pos` past it.
        let at = self.pos;
        match self.advance() {
            Some(b) if b == ch => Ok(()),
            Some(b) => Err(format!(
                "expected '{}' at pos {}, got '{}'",
                ch as char, at, b as char
            )),
            None => Err(format!("expected '{}' at pos {}, got EOF", ch as char, at)),
        }
    }

    /// Parses any JSON value, skipping leading whitespace first.
    ///
    /// Dispatches on the first significant byte.
    ///
    /// # Errors
    ///
    /// Fails on end of input, on a byte that cannot start a value, or when
    /// the selected sub-parser fails.
    fn parse_value(&mut self) -> Result<JsonValue, String> {
        self.skip_ws();
        match self.peek() {
            Some(b'"') => self.parse_string().map(JsonValue::Str),
            Some(b'{') => self.parse_object(),
            Some(b'[') => self.parse_array(),
            Some(b't') => self.parse_literal("true", JsonValue::Bool(true)),
            Some(b'f') => self.parse_literal("false", JsonValue::Bool(false)),
            Some(b'n') => self.parse_literal("null", JsonValue::Null),
            Some(b'-') | Some(b'0'..=b'9') => self.parse_number(),
            Some(b) => Err(format!(
                "unexpected byte '{}' at pos {}",
                b as char, self.pos
            )),
            None => Err("unexpected EOF".into()),
        }
    }

    /// Parses a double-quoted string and returns its decoded contents.
    ///
    /// # Errors
    ///
    /// Fails if the opening quote is missing, if the input ends before the
    /// closing quote, or if an escape sequence is malformed.
    fn parse_string(&mut self) -> Result<String, String> {
        self.expect(b'"')?;
        let mut s = String::new();
        loop {
            // Copy everything up to the next quote or backslash in one go.
            // Neither byte can occur inside a multi-byte UTF-8 sequence, so
            // the run always ends on a character boundary, and only the run
            // itself is validated rather than the whole remaining input.
            let start = self.pos;
            while let Some(b) = self.peek() {
                if b == b'"' || b == b'\\' {
                    break;
                }
                self.pos += 1;
            }
            if self.pos > start {
                let run = std::str::from_utf8(&self.bytes[start..self.pos])
                    .map_err(|e| format!("invalid UTF-8: {}", e))?;
                s.push_str(run);
            }

            match self.advance() {
                Some(b'"') => return Ok(s),
                Some(b'\\') => self.parse_escape(&mut s)?,
                // The scan above stops only at a quote, a backslash or EOF.
                _ => return Err("unexpected EOF in string".into()),
            }
        }
    }

    /// Decodes one escape sequence (the backslash has already been consumed)
    /// and appends the result to `out`.
    ///
    /// An unrecognised escape such as `\q` is kept verbatim: the backslash is
    /// appended and the following character is processed as ordinary content.
    ///
    /// # Errors
    ///
    /// Fails if the input ends right after the backslash or inside a `\u`
    /// escape, or if a `\u` escape contains a non-hex digit.
    fn parse_escape(&mut self, out: &mut String) -> Result<(), String> {
        match self.advance() {
            Some(b'"') => out.push('"'),
            Some(b'\\') => out.push('\\'),
            Some(b'/') => out.push('/'),
            Some(b'n') => out.push('\n'),
            Some(b'r') => out.push('\r'),
            Some(b't') => out.push('\t'),
            Some(b'b') => out.push('\u{0008}'),
            Some(b'f') => out.push('\u{000C}'),
            Some(b'u') => {
                let unit = self.parse_hex4()?;
                let ch = self.finish_unicode_escape(unit)?;
                out.push(ch);
            }
            Some(_) => {
                out.push('\\');
                // Un-read the byte so the caller's run scan decodes it as
                // part of a (possibly multi-byte) character.
                self.pos -= 1;
            }
            None => return Err("unexpected EOF in string escape".into()),
        }
        Ok(())
    }

    /// Turns the UTF-16 code unit of a just-read `\uXXXX` escape into a char.
    ///
    /// A high surrogate immediately followed by a `\uXXXX` low surrogate is
    /// combined into one supplementary-plane character. Any unpaired
    /// surrogate becomes U+FFFD, and no input beyond the escape itself is
    /// consumed in that case.
    ///
    /// # Errors
    ///
    /// Fails only if a following `\u` escape is itself malformed.
    fn finish_unicode_escape(&mut self, unit: u16) -> Result<char, String> {
        const REPLACEMENT: char = '\u{FFFD}';

        if !(0xD800..=0xDBFF).contains(&unit) {
            // `from_u32` rejects a lone low surrogate; substitute for it.
            return Ok(char::from_u32(u32::from(unit)).unwrap_or(REPLACEMENT));
        }

        // Look ahead without consuming: only commit when a `\u` follows.
        let followed_by_escape = self.bytes.get(self.pos) == Some(&b'\\')
            && self.bytes.get(self.pos + 1) == Some(&b'u');
        if followed_by_escape {
            let resume = self.pos;
            self.pos += 2;
            let lo = self.parse_hex4()?;
            if (0xDC00..=0xDFFF).contains(&lo) {
                let combined =
                    0x10000 + ((u32::from(unit) - 0xD800) << 10) + (u32::from(lo) - 0xDC00);
                return Ok(char::from_u32(combined).unwrap_or(REPLACEMENT));
            }
            // Not a low surrogate: rewind so that escape is decoded on its own.
            self.pos = resume;
        }
        Ok(REPLACEMENT)
    }

    /// Reads exactly four hex digits (either case) and returns their value.
    ///
    /// # Errors
    ///
    /// Fails on end of input or on a byte that is not a hex digit.
    fn parse_hex4(&mut self) -> Result<u16, String> {
        let mut val: u16 = 0;
        for _ in 0..4 {
            let b = self.advance().ok_or("unexpected EOF in \\u escape")?;
            let digit = match b {
                b'0'..=b'9' => b - b'0',
                b'a'..=b'f' => b - b'a' + 10,
                b'A'..=b'F' => b - b'A' + 10,
                _ => return Err(format!("invalid hex digit '{}'", b as char)),
            };
            // Four hex digits are at most 0xFFFF, so this cannot overflow.
            val = val * 16 + u16::from(digit);
        }
        Ok(val)
    }

    /// Parses a number: optional `-`, digits, optional fraction, optional
    /// exponent.
    ///
    /// The span is delimited here, but validation and conversion are left to
    /// `f64`'s `FromStr` implementation.
    ///
    /// # Errors
    ///
    /// Fails if the delimited text is not accepted by `f64::from_str`.
    fn parse_number(&mut self) -> Result<JsonValue, String> {
        let start = self.pos;
        if self.peek() == Some(b'-') {
            self.pos += 1;
        }
        self.skip_digits();
        if self.peek() == Some(b'.') {
            self.pos += 1;
            self.skip_digits();
        }
        if matches!(self.peek(), Some(b'e' | b'E')) {
            self.pos += 1;
            if matches!(self.peek(), Some(b'+' | b'-')) {
                self.pos += 1;
            }
            self.skip_digits();
        }
        let s = std::str::from_utf8(&self.bytes[start..self.pos])
            .map_err(|e| format!("invalid UTF-8 in number: {}", e))?;
        let n = s
            .parse::<f64>()
            .map_err(|e| format!("invalid number '{}': {}", s, e))?;
        Ok(JsonValue::Number(n))
    }

    /// Parses an object: `{`, zero or more `"key": value` pairs separated by
    /// commas, then `}`.
    ///
    /// # Errors
    ///
    /// Fails on a missing brace, key, colon or separator, or when a key or
    /// value fails to parse.
    fn parse_object(&mut self) -> Result<JsonValue, String> {
        self.expect(b'{')?;
        self.skip_ws();
        let mut pairs = Vec::new();
        if self.peek() == Some(b'}') {
            self.pos += 1;
            return Ok(JsonValue::Object(pairs));
        }
        loop {
            self.skip_ws();
            let key = self.parse_string()?;
            self.skip_ws();
            self.expect(b':')?;
            let val = self.parse_value()?;
            pairs.push((key, val));
            self.skip_ws();
            match self.peek() {
                Some(b',') => {
                    self.pos += 1;
                }
                Some(b'}') => {
                    self.pos += 1;
                    return Ok(JsonValue::Object(pairs));
                }
                _ => return Err(format!("expected ',' or '}}' at pos {}", self.pos)),
            }
        }
    }

    /// Parses an array: `[`, zero or more values separated by commas, then
    /// `]`.
    ///
    /// # Errors
    ///
    /// Fails on a missing bracket or separator, or when an element fails to
    /// parse.
    fn parse_array(&mut self) -> Result<JsonValue, String> {
        self.expect(b'[')?;
        self.skip_ws();
        let mut elems = Vec::new();
        if self.peek() == Some(b']') {
            self.pos += 1;
            return Ok(JsonValue::Array(elems));
        }
        loop {
            let val = self.parse_value()?;
            elems.push(val);
            self.skip_ws();
            match self.peek() {
                Some(b',') => {
                    self.pos += 1;
                }
                Some(b']') => {
                    self.pos += 1;
                    return Ok(JsonValue::Array(elems));
                }
                _ => return Err(format!("expected ',' or ']' at pos {}", self.pos)),
            }
        }
    }

    /// Consumes the exact text `expected` and returns `value`.
    ///
    /// # Errors
    ///
    /// Fails if the upcoming input does not start with `expected`.
    fn parse_literal(&mut self, expected: &str, value: JsonValue) -> Result<JsonValue, String> {
        if self.bytes[self.pos..].starts_with(expected.as_bytes()) {
            self.pos += expected.len();
            Ok(value)
        } else {
            Err(format!("expected literal '{}'", expected))
        }
    }
}

/// Parse a JSON string into a `JsonValue`.
///
/// The input must contain exactly one value, optionally surrounded by
/// whitespace.
///
/// # Errors
///
/// Returns a human-readable message if the input is malformed or if anything
/// other than whitespace follows the value.
pub fn parse(input: &str) -> Result<JsonValue, String> {
    let mut parser = Parser::new(input);
    let val = parser.parse_value()?;
    parser.skip_ws();
    if parser.pos != parser.bytes.len() {
        return Err(format!("trailing data at pos {}", parser.pos));
    }
    Ok(val)
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn parse_simple_object() {
        let val = parse(r#"{"a": 1, "b": "hello"}"#).unwrap();
        assert_eq!(val.get("a"), Some(&JsonValue::Number(1.0)));
        assert_eq!(val.get("b"), Some(&JsonValue::Str("hello".into())));
    }

    #[test]
    fn parse_array() {
        let val = parse(r#"[1, "two", true, null]"#).unwrap();
        let arr = val.as_array().unwrap();
        assert_eq!(arr.len(), 4);
        assert_eq!(arr[2], JsonValue::Bool(true));
        assert_eq!(arr[3], JsonValue::Null);
    }

    #[test]
    fn parse_nested() {
        let val = parse(r#"{"tests": [{"desc": "a"}]}"#).unwrap();
        let tests = val.get("tests").unwrap().as_array().unwrap();
        assert_eq!(tests.len(), 1);
    }

    #[test]
    fn parse_string_escapes() {
        let val = parse(r#""hello\nworld""#).unwrap();
        assert_eq!(val.as_str().unwrap(), "hello\nworld");
    }

    #[test]
    fn parse_unicode_escape() {
        let val = parse(r#""\u0041""#).unwrap();
        assert_eq!(val.as_str().unwrap(), "A");
    }

    #[test]
    fn parse_surrogate_pair() {
        let val = parse(r#""\uD83D\uDE00""#).unwrap();
        assert_eq!(val.as_str().unwrap(), "\u{1F600}");
    }

    #[test]
    fn parse_unpaired_surrogate_keeps_following_text() {
        let val = parse(r#""\uD800ab""#).unwrap();
        assert_eq!(val.as_str().unwrap(), "\u{FFFD}ab");
    }
}