this repo has no description
at main 384 lines 8.6 kB view raw
1{0 Building a JSON Parser} 2 3@scrolly Building a JSON Parser in OCaml 4{ol 5{li 6 {b Defining the Value Type} 7 8 Every parser starts with a type. JSON has six kinds of values: 9 null, booleans, numbers, strings, arrays, and objects. 10 We encode this directly as an OCaml variant. 11 12 {[ 13(* >type json = 14(* > | Null 15(* > | Bool of bool 16(* > | Number of float 17(* > | String of string 18(* > | Array of json list 19(* > | Object of (string * json) list 20 ]} 21} 22{li 23 {b A Simple Scanner} 24 25 Before parsing structure, we need to skip whitespace and 26 peek at the next meaningful character. Our scanner works 27 on a string with a mutable position index. 28 29 {[ 30type json = 31 | Null 32 | Bool of bool 33 | Number of float 34 | String of string 35 | Array of json list 36 | Object of (string * json) list 37 38(* >type scanner = { 39(* > input : string; 40(* > mutable pos : int; 41(* >} 42(* > 43(* >let peek s = 44(* > while s.pos < String.length s.input 45(* > && s.input.[s.pos] = ' ' do 46(* > s.pos <- s.pos + 1 47(* > done; 48(* > if s.pos < String.length s.input 49(* > then Some s.input.[s.pos] 50(* > else None 51(* > 52(* >let advance s = s.pos <- s.pos + 1 53 ]} 54} 55{li 56 {b Parsing Strings} 57 58 JSON strings are delimited by double quotes. We scan character 59 by character, collecting into a buffer. This handles the simple 60 case without escape sequences. 61 62 {[ 63type json = 64 | Null 65 | Bool of bool 66 | Number of float 67 | String of string 68 | Array of json list 69 | Object of (string * json) list 70 71type scanner = { 72 input : string; 73 mutable pos : int; 74} 75 76let peek s = 77 while s.pos < String.length s.input 78 && s.input.[s.pos] = ' ' do 79 s.pos <- s.pos + 1 80 done; 81 if s.pos < String.length s.input 82 then Some s.input.[s.pos] 83 else None 84 85let advance s = s.pos <- s.pos + 1 86 87(* >let parse_string s = 88(* > advance s; 89(* > let buf = Buffer.create 64 in 90(* > while s.pos < String.length s.input 91(* > && s.input.[s.pos] <> '"' do 92(* > Buffer.add_char buf s.input.[s.pos]; 93(* > advance s 94(* > done; 95(* > advance s; 96(* > Buffer.contents buf 97 ]} 98} 99{li 100 {b Parsing Numbers} 101 102 Numbers in JSON can be integers or floats. We collect consecutive 103 digit and dot characters, then use float_of_string to parse them. 104 A production parser would handle exponents too. 105 106 {[ 107type json = 108 | Null 109 | Bool of bool 110 | Number of float 111 | String of string 112 | Array of json list 113 | Object of (string * json) list 114 115type scanner = { 116 input : string; 117 mutable pos : int; 118} 119 120let peek s = 121 while s.pos < String.length s.input 122 && s.input.[s.pos] = ' ' do 123 s.pos <- s.pos + 1 124 done; 125 if s.pos < String.length s.input 126 then Some s.input.[s.pos] 127 else None 128 129let advance s = s.pos <- s.pos + 1 130 131let parse_string s = 132 advance s; 133 let buf = Buffer.create 64 in 134 while s.pos < String.length s.input 135 && s.input.[s.pos] <> '"' do 136 Buffer.add_char buf s.input.[s.pos]; 137 advance s 138 done; 139 advance s; 140 Buffer.contents buf 141 142(* >let is_digit c = c >= '0' && c <= '9' 143(* > 144(* >let parse_number s = 145(* > let start = s.pos in 146(* > while s.pos < String.length s.input 147(* > && (is_digit s.input.[s.pos] 148(* > || s.input.[s.pos] = '.' 149(* > || s.input.[s.pos] = '-') do 150(* > advance s 151(* > done; 152(* > float_of_string 153(* > (String.sub s.input start (s.pos - start)) 154 ]} 155} 156{li 157 {b The Recursive Core} 158 159 Now the magic: parse_value dispatches on the next character 160 to decide what kind of JSON value to parse. For atoms like 161 null, true, false we match literal strings. For compound 162 structures, we recurse. 163 164 {[ 165type json = 166 | Null 167 | Bool of bool 168 | Number of float 169 | String of string 170 | Array of json list 171 | Object of (string * json) list 172 173type scanner = { 174 input : string; 175 mutable pos : int; 176} 177 178let peek s = 179 while s.pos < String.length s.input 180 && s.input.[s.pos] = ' ' do 181 s.pos <- s.pos + 1 182 done; 183 if s.pos < String.length s.input 184 then Some s.input.[s.pos] 185 else None 186 187let advance s = s.pos <- s.pos + 1 188 189let parse_string s = 190 advance s; 191 let buf = Buffer.create 64 in 192 while s.pos < String.length s.input 193 && s.input.[s.pos] <> '"' do 194 Buffer.add_char buf s.input.[s.pos]; 195 advance s 196 done; 197 advance s; 198 Buffer.contents buf 199 200let is_digit c = c >= '0' && c <= '9' 201 202let parse_number s = 203 let start = s.pos in 204 while s.pos < String.length s.input 205 && (is_digit s.input.[s.pos] 206 || s.input.[s.pos] = '.' 207 || s.input.[s.pos] = '-') do 208 advance s 209 done; 210 float_of_string 211 (String.sub s.input start (s.pos - start)) 212 213(* >let expect s c = 214(* > match peek s with 215(* > | Some c' when c' = c -> advance s 216(* > | _ -> failwith "unexpected character" 217(* > 218(* >let rec parse_value s = 219(* > match peek s with 220(* > | Some '"' -> String (parse_string s) 221(* > | Some c when is_digit c || c = '-' -> 222(* > Number (parse_number s) 223(* > | Some 't' -> 224(* > s.pos <- s.pos + 4; Bool true 225(* > | Some 'f' -> 226(* > s.pos <- s.pos + 5; Bool false 227(* > | Some 'n' -> 228(* > s.pos <- s.pos + 4; Null 229(* > | Some '[' -> parse_array s 230(* > | Some '{' -> parse_object s 231(* > | _ -> failwith "unexpected token" 232(* > 233(* >and parse_array s = 234(* > advance s; 235(* > let items = ref [] in 236(* > (match peek s with 237(* > | Some ']' -> advance s 238(* > | _ -> 239(* > items := [parse_value s]; 240(* > while peek s = Some ',' do 241(* > advance s; 242(* > items := parse_value s :: !items 243(* > done; 244(* > expect s ']'); 245(* > Array (List.rev !items) 246(* > 247(* >and parse_object s = 248(* > advance s; 249(* > let pairs = ref [] in 250(* > (match peek s with 251(* > | Some '}' -> advance s 252(* > | _ -> 253(* > let key = parse_string s in 254(* > expect s ':'; 255(* > let value = parse_value s in 256(* > pairs := [(key, value)]; 257(* > while peek s = Some ',' do 258(* > advance s; 259(* > let k = parse_string s in 260(* > expect s ':'; 261(* > let v = parse_value s in 262(* > pairs := (k, v) :: !pairs 263(* > done; 264(* > expect s '}'); 265(* > Object (List.rev !pairs) 266 ]} 267} 268{li 269 {b The Public API} 270 271 Finally we wrap the scanner in a clean top-level function. 272 Pass a string in, get a JSON value out. The entire parser 273 is about 80 lines of OCaml — no dependencies, no magic. 274 275 {[ 276type json = 277 | Null 278 | Bool of bool 279 | Number of float 280 | String of string 281 | Array of json list 282 | Object of (string * json) list 283 284type scanner = { 285 input : string; 286 mutable pos : int; 287} 288 289let peek s = 290 while s.pos < String.length s.input 291 && s.input.[s.pos] = ' ' do 292 s.pos <- s.pos + 1 293 done; 294 if s.pos < String.length s.input 295 then Some s.input.[s.pos] 296 else None 297 298let advance s = s.pos <- s.pos + 1 299 300let parse_string s = 301 advance s; 302 let buf = Buffer.create 64 in 303 while s.pos < String.length s.input 304 && s.input.[s.pos] <> '"' do 305 Buffer.add_char buf s.input.[s.pos]; 306 advance s 307 done; 308 advance s; 309 Buffer.contents buf 310 311let is_digit c = c >= '0' && c <= '9' 312 313let parse_number s = 314 let start = s.pos in 315 while s.pos < String.length s.input 316 && (is_digit s.input.[s.pos] 317 || s.input.[s.pos] = '.' 318 || s.input.[s.pos] = '-') do 319 advance s 320 done; 321 float_of_string 322 (String.sub s.input start (s.pos - start)) 323 324let expect s c = 325 match peek s with 326 | Some c' when c' = c -> advance s 327 | _ -> failwith "unexpected character" 328 329let rec parse_value s = 330 match peek s with 331 | Some '"' -> String (parse_string s) 332 | Some c when is_digit c || c = '-' -> 333 Number (parse_number s) 334 | Some 't' -> 335 s.pos <- s.pos + 4; Bool true 336 | Some 'f' -> 337 s.pos <- s.pos + 5; Bool false 338 | Some 'n' -> 339 s.pos <- s.pos + 4; Null 340 | Some '[' -> parse_array s 341 | Some '{' -> parse_object s 342 | _ -> failwith "unexpected token" 343 344and parse_array s = 345 advance s; 346 let items = ref [] in 347 (match peek s with 348 | Some ']' -> advance s 349 | _ -> 350 items := [parse_value s]; 351 while peek s = Some ',' do 352 advance s; 353 items := parse_value s :: !items 354 done; 355 expect s ']'); 356 Array (List.rev !items) 357 358and parse_object s = 359 advance s; 360 let pairs = ref [] in 361 (match peek s with 362 | Some '}' -> advance s 363 | _ -> 364 let key = parse_string s in 365 expect s ':'; 366 let value = parse_value s in 367 pairs := [(key, value)]; 368 while peek s = Some ',' do 369 advance s; 370 let k = parse_string s in 371 expect s ':'; 372 let v = parse_value s in 373 pairs := (k, v) :: !pairs 374 done; 375 expect s '}'); 376 Object (List.rev !pairs) 377 378(* >let parse input = 379(* > let s = { input; pos = 0 } in 380(* > let v = parse_value s in 381(* > v 382 ]} 383} 384}