this repo has no description
1{0 Building a JSON Parser}
2
3@scrolly Building a JSON Parser in OCaml
4{ol
5{li
6 {b Defining the Value Type}
7
8 Every parser starts with a type. JSON has six kinds of values:
9 null, booleans, numbers, strings, arrays, and objects.
10 We encode this directly as an OCaml variant.
11
12 {[
13(* >type json =
14(* > | Null
15(* > | Bool of bool
16(* > | Number of float
17(* > | String of string
18(* > | Array of json list
19(* > | Object of (string * json) list
20 ]}
21}
22{li
23 {b A Simple Scanner}
24
25 Before parsing structure, we need to skip whitespace and
26 peek at the next meaningful character. Our scanner works
27 on a string with a mutable position index.
28
29 {[
30type json =
31 | Null
32 | Bool of bool
33 | Number of float
34 | String of string
35 | Array of json list
36 | Object of (string * json) list
37
38(* >type scanner = {
39(* > input : string;
40(* > mutable pos : int;
41(* >}
42(* >
43(* >let peek s =
44(* > while s.pos < String.length s.input
45(* > && s.input.[s.pos] = ' ' do
46(* > s.pos <- s.pos + 1
47(* > done;
48(* > if s.pos < String.length s.input
49(* > then Some s.input.[s.pos]
50(* > else None
51(* >
52(* >let advance s = s.pos <- s.pos + 1
53 ]}
54}
55{li
56 {b Parsing Strings}
57
58 JSON strings are delimited by double quotes. We scan character
59 by character, collecting into a buffer. This handles the simple
60 case without escape sequences.
61
62 {[
63type json =
64 | Null
65 | Bool of bool
66 | Number of float
67 | String of string
68 | Array of json list
69 | Object of (string * json) list
70
71type scanner = {
72 input : string;
73 mutable pos : int;
74}
75
76let peek s =
77 while s.pos < String.length s.input
78 && s.input.[s.pos] = ' ' do
79 s.pos <- s.pos + 1
80 done;
81 if s.pos < String.length s.input
82 then Some s.input.[s.pos]
83 else None
84
85let advance s = s.pos <- s.pos + 1
86
87(* >let parse_string s =
88(* > advance s;
89(* > let buf = Buffer.create 64 in
90(* > while s.pos < String.length s.input
91(* > && s.input.[s.pos] <> '"' do
92(* > Buffer.add_char buf s.input.[s.pos];
93(* > advance s
94(* > done;
95(* > advance s;
96(* > Buffer.contents buf
97 ]}
98}
99{li
100 {b Parsing Numbers}
101
102 Numbers in JSON can be integers or floats. We collect consecutive
103 digit and dot characters, then use float_of_string to parse them.
104 A production parser would handle exponents too.
105
106 {[
107type json =
108 | Null
109 | Bool of bool
110 | Number of float
111 | String of string
112 | Array of json list
113 | Object of (string * json) list
114
115type scanner = {
116 input : string;
117 mutable pos : int;
118}
119
120let peek s =
121 while s.pos < String.length s.input
122 && s.input.[s.pos] = ' ' do
123 s.pos <- s.pos + 1
124 done;
125 if s.pos < String.length s.input
126 then Some s.input.[s.pos]
127 else None
128
129let advance s = s.pos <- s.pos + 1
130
131let parse_string s =
132 advance s;
133 let buf = Buffer.create 64 in
134 while s.pos < String.length s.input
135 && s.input.[s.pos] <> '"' do
136 Buffer.add_char buf s.input.[s.pos];
137 advance s
138 done;
139 advance s;
140 Buffer.contents buf
141
142(* >let is_digit c = c >= '0' && c <= '9'
143(* >
144(* >let parse_number s =
145(* > let start = s.pos in
146(* > while s.pos < String.length s.input
147(* > && (is_digit s.input.[s.pos]
148(* > || s.input.[s.pos] = '.'
149(* > || s.input.[s.pos] = '-') do
150(* > advance s
151(* > done;
152(* > float_of_string
153(* > (String.sub s.input start (s.pos - start))
154 ]}
155}
156{li
157 {b The Recursive Core}
158
159 Now the magic: parse_value dispatches on the next character
160 to decide what kind of JSON value to parse. For atoms like
161 null, true, false we match literal strings. For compound
162 structures, we recurse.
163
164 {[
165type json =
166 | Null
167 | Bool of bool
168 | Number of float
169 | String of string
170 | Array of json list
171 | Object of (string * json) list
172
173type scanner = {
174 input : string;
175 mutable pos : int;
176}
177
178let peek s =
179 while s.pos < String.length s.input
180 && s.input.[s.pos] = ' ' do
181 s.pos <- s.pos + 1
182 done;
183 if s.pos < String.length s.input
184 then Some s.input.[s.pos]
185 else None
186
187let advance s = s.pos <- s.pos + 1
188
189let parse_string s =
190 advance s;
191 let buf = Buffer.create 64 in
192 while s.pos < String.length s.input
193 && s.input.[s.pos] <> '"' do
194 Buffer.add_char buf s.input.[s.pos];
195 advance s
196 done;
197 advance s;
198 Buffer.contents buf
199
200let is_digit c = c >= '0' && c <= '9'
201
202let parse_number s =
203 let start = s.pos in
204 while s.pos < String.length s.input
205 && (is_digit s.input.[s.pos]
206 || s.input.[s.pos] = '.'
207 || s.input.[s.pos] = '-') do
208 advance s
209 done;
210 float_of_string
211 (String.sub s.input start (s.pos - start))
212
213(* >let expect s c =
214(* > match peek s with
215(* > | Some c' when c' = c -> advance s
216(* > | _ -> failwith "unexpected character"
217(* >
218(* >let rec parse_value s =
219(* > match peek s with
220(* > | Some '"' -> String (parse_string s)
221(* > | Some c when is_digit c || c = '-' ->
222(* > Number (parse_number s)
223(* > | Some 't' ->
224(* > s.pos <- s.pos + 4; Bool true
225(* > | Some 'f' ->
226(* > s.pos <- s.pos + 5; Bool false
227(* > | Some 'n' ->
228(* > s.pos <- s.pos + 4; Null
229(* > | Some '[' -> parse_array s
230(* > | Some '{' -> parse_object s
231(* > | _ -> failwith "unexpected token"
232(* >
233(* >and parse_array s =
234(* > advance s;
235(* > let items = ref [] in
236(* > (match peek s with
237(* > | Some ']' -> advance s
238(* > | _ ->
239(* > items := [parse_value s];
240(* > while peek s = Some ',' do
241(* > advance s;
242(* > items := parse_value s :: !items
243(* > done;
244(* > expect s ']');
245(* > Array (List.rev !items)
246(* >
247(* >and parse_object s =
248(* > advance s;
249(* > let pairs = ref [] in
250(* > (match peek s with
251(* > | Some '}' -> advance s
252(* > | _ ->
253(* > let key = parse_string s in
254(* > expect s ':';
255(* > let value = parse_value s in
256(* > pairs := [(key, value)];
257(* > while peek s = Some ',' do
258(* > advance s;
259(* > let k = parse_string s in
260(* > expect s ':';
261(* > let v = parse_value s in
262(* > pairs := (k, v) :: !pairs
263(* > done;
264(* > expect s '}');
265(* > Object (List.rev !pairs)
266 ]}
267}
268{li
269 {b The Public API}
270
271 Finally we wrap the scanner in a clean top-level function.
272 Pass a string in, get a JSON value out. The entire parser
273 is about 80 lines of OCaml — no dependencies, no magic.
274
275 {[
276type json =
277 | Null
278 | Bool of bool
279 | Number of float
280 | String of string
281 | Array of json list
282 | Object of (string * json) list
283
284type scanner = {
285 input : string;
286 mutable pos : int;
287}
288
289let peek s =
290 while s.pos < String.length s.input
291 && s.input.[s.pos] = ' ' do
292 s.pos <- s.pos + 1
293 done;
294 if s.pos < String.length s.input
295 then Some s.input.[s.pos]
296 else None
297
298let advance s = s.pos <- s.pos + 1
299
300let parse_string s =
301 advance s;
302 let buf = Buffer.create 64 in
303 while s.pos < String.length s.input
304 && s.input.[s.pos] <> '"' do
305 Buffer.add_char buf s.input.[s.pos];
306 advance s
307 done;
308 advance s;
309 Buffer.contents buf
310
311let is_digit c = c >= '0' && c <= '9'
312
313let parse_number s =
314 let start = s.pos in
315 while s.pos < String.length s.input
316 && (is_digit s.input.[s.pos]
317 || s.input.[s.pos] = '.'
318 || s.input.[s.pos] = '-') do
319 advance s
320 done;
321 float_of_string
322 (String.sub s.input start (s.pos - start))
323
324let expect s c =
325 match peek s with
326 | Some c' when c' = c -> advance s
327 | _ -> failwith "unexpected character"
328
329let rec parse_value s =
330 match peek s with
331 | Some '"' -> String (parse_string s)
332 | Some c when is_digit c || c = '-' ->
333 Number (parse_number s)
334 | Some 't' ->
335 s.pos <- s.pos + 4; Bool true
336 | Some 'f' ->
337 s.pos <- s.pos + 5; Bool false
338 | Some 'n' ->
339 s.pos <- s.pos + 4; Null
340 | Some '[' -> parse_array s
341 | Some '{' -> parse_object s
342 | _ -> failwith "unexpected token"
343
344and parse_array s =
345 advance s;
346 let items = ref [] in
347 (match peek s with
348 | Some ']' -> advance s
349 | _ ->
350 items := [parse_value s];
351 while peek s = Some ',' do
352 advance s;
353 items := parse_value s :: !items
354 done;
355 expect s ']');
356 Array (List.rev !items)
357
358and parse_object s =
359 advance s;
360 let pairs = ref [] in
361 (match peek s with
362 | Some '}' -> advance s
363 | _ ->
364 let key = parse_string s in
365 expect s ':';
366 let value = parse_value s in
367 pairs := [(key, value)];
368 while peek s = Some ',' do
369 advance s;
370 let k = parse_string s in
371 expect s ':';
372 let v = parse_value s in
373 pairs := (k, v) :: !pairs
374 done;
375 expect s '}');
376 Object (List.rev !pairs)
377
378(* >let parse input =
379(* > let s = { input; pos = 0 } in
380(* > let v = parse_value s in
381(* > v
382 ]}
383}
384}