forked from
anil.recoil.org/ocaml-tomlt
TOML 1.1 codecs for OCaml
1(*---------------------------------------------------------------------------
2 Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
3 SPDX-License-Identifier: ISC
4 ---------------------------------------------------------------------------*)
5
6open Bytesrw
7
8(* Aliases for cleaner code *)
9module Toml = Tomlt.Toml
10module Toml_error = Toml.Error
11
12(* Lexer - works directly on bytes buffer filled from Bytes.Reader *)
13
14type token =
15 | Tok_lbracket
16 | Tok_rbracket
17 | Tok_lbrace
18 | Tok_rbrace
19 | Tok_equals
20 | Tok_comma
21 | Tok_dot
22 | Tok_newline
23 | Tok_eof
24 | Tok_bare_key of string
25 | Tok_basic_string of string
26 | Tok_literal_string of string
27 | Tok_ml_basic_string of
28 string (* Multiline basic string - not valid as key *)
29 | Tok_ml_literal_string of
30 string (* Multiline literal string - not valid as key *)
31 | Tok_integer of
32 int64 * string (* value, original string for key reconstruction *)
33 | Tok_float of
34 float * string (* value, original string for key reconstruction *)
35 | Tok_datetime of string
36 | Tok_datetime_local of string
37 | Tok_date_local of string
38 | Tok_time_local of string
39
40type lexer = {
41 input : bytes; (* Buffer containing input data *)
42 input_len : int; (* Length of valid data in input *)
43 mutable pos : int;
44 mutable line : int;
45 mutable col : int;
46 file : string;
47}
48
49(* Create lexer from string (copies to bytes) *)
50let lexer ?(file = "-") s =
51 let input = Bytes.of_string s in
52 { input; input_len = Bytes.length input; pos = 0; line = 1; col = 1; file }
53
54(* Create lexer directly from Bytes.Reader - reads all data into buffer *)
55let lexer_from_reader ?(file = "-") r =
56 (* Read all slices into a buffer *)
57 let buf = Buffer.create 4096 in
58 let rec read_all () =
59 let slice = Bytes.Reader.read r in
60 if Bytes.Slice.is_eod slice then ()
61 else begin
62 Bytes.Slice.add_to_buffer buf slice;
63 read_all ()
64 end
65 in
66 read_all ();
67 let input = Buffer.to_bytes buf in
68 { input; input_len = Bytes.length input; pos = 0; line = 1; col = 1; file }
69
70let is_eof l = l.pos >= l.input_len
71let peek l = if is_eof l then None else Some (Bytes.get l.input l.pos)
72
73let peek2 l =
74 if l.pos + 1 >= l.input_len then None
75 else Some (Bytes.get l.input (l.pos + 1))
76
77let peek_n l n =
78 if l.pos + n - 1 >= l.input_len then None
79 else Some (Bytes.sub_string l.input l.pos n)
80
81let advance l =
82 if not (is_eof l) then begin
83 if Bytes.get l.input l.pos = '\n' then begin
84 l.line <- l.line + 1;
85 l.col <- 1
86 end
87 else l.col <- l.col + 1;
88 l.pos <- l.pos + 1
89 end
90
91let advance_n l n =
92 for _ = 1 to n do
93 advance l
94 done
95
96let skip_whitespace l =
97 while
98 (not (is_eof l))
99 && (Bytes.get l.input l.pos = ' ' || Bytes.get l.input l.pos = '\t')
100 do
101 advance l
102 done
103
104(* Helper functions for bytes access *)
105let[@inline] current_char l pos = Bytes.unsafe_get l.input pos
106let[@inline] current l = Bytes.unsafe_get l.input l.pos
107let sub_string l pos len = Bytes.sub_string l.input pos len
108
109(* Helper to create error location from lexer state *)
110let lexer_loc l = Toml.Error.loc ~file:l.file ~line:l.line ~column:l.col ()
111
112(* Get expected byte length of UTF-8 char from first byte *)
113let utf8_first_byte_len c =
114 let code = Char.code c in
115 if code < 0x80 then 1
116 else if code < 0xC0 then 0 (* Invalid: continuation byte as start *)
117 else if code < 0xE0 then 2
118 else if code < 0xF0 then 3
119 else if code < 0xF8 then 4
120 else 0 (* Invalid: 5+ byte sequence *)
121
122(* Validate UTF-8 at position in lexer's bytes buffer, returns byte length *)
123let validate_utf8_at_pos_bytes l =
124 if l.pos >= l.input_len then
125 Toml.Error.raise_lexer ~location:(lexer_loc l) Unexpected_eof;
126 let byte_len = utf8_first_byte_len (Bytes.unsafe_get l.input l.pos) in
127 if byte_len = 0 then
128 Toml.Error.raise_lexer ~location:(lexer_loc l) Invalid_utf8;
129 if l.pos + byte_len > l.input_len then
130 Toml.Error.raise_lexer ~location:(lexer_loc l) Incomplete_utf8;
131 (* Validate using uutf - it checks overlong encodings, surrogates, etc. *)
132 let sub = Bytes.sub_string l.input l.pos byte_len in
133 let valid = ref false in
134 Uutf.String.fold_utf_8
135 (fun () _ -> function `Uchar _ -> valid := true | `Malformed _ -> ())
136 () sub;
137 if not !valid then Toml.Error.raise_lexer ~location:(lexer_loc l) Invalid_utf8;
138 byte_len
139
140(* UTF-8 validation - validates and advances over a single UTF-8 character *)
141let validate_utf8_char l =
142 let byte_len = validate_utf8_at_pos_bytes l in
143 for _ = 1 to byte_len do
144 advance l
145 done
146
147let skip_comment l =
148 if (not (is_eof l)) && current l = '#' then begin
149 (* Validate comment characters *)
150 advance l;
151 let continue = ref true in
152 while !continue && (not (is_eof l)) && current l <> '\n' do
153 let c = current l in
154 let code = Char.code c in
155 (* CR is only valid if followed by LF (CRLF at end of comment) *)
156 if c = '\r' then begin
157 (* Check if this CR is followed by LF - if so, it ends the comment *)
158 if l.pos + 1 < l.input_len && current_char l (l.pos + 1) = '\n' then
159 (* This is CRLF - stop the loop, let the main lexer handle it *)
160 continue := false
161 else Toml.Error.raise_lexer ~location:(lexer_loc l) Bare_carriage_return
162 end
163 else if code >= 0x80 then begin
164 (* Multi-byte UTF-8 character - validate it *)
165 validate_utf8_char l
166 end
167 else begin
168 (* ASCII control characters other than tab are not allowed in comments *)
169 if code < 0x09 || (code > 0x09 && code < 0x20) || code = 0x7F then
170 Toml.Error.raise_lexer ~location:(lexer_loc l)
171 (Control_character code);
172 advance l
173 end
174 done
175 end
176
177let skip_ws_and_comments l =
178 let rec loop () =
179 skip_whitespace l;
180 if (not (is_eof l)) && current l = '#' then begin
181 skip_comment l;
182 loop ()
183 end
184 in
185 loop ()
186
187let is_bare_key_char c =
188 (c >= 'A' && c <= 'Z')
189 || (c >= 'a' && c <= 'z')
190 || (c >= '0' && c <= '9')
191 || c = '_' || c = '-'
192
193let is_digit c = c >= '0' && c <= '9'
194
195let is_hex_digit c =
196 is_digit c || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F')
197
198let is_oct_digit c = c >= '0' && c <= '7'
199let is_bin_digit c = c = '0' || c = '1'
200
201let hex_value c =
202 if c >= '0' && c <= '9' then Char.code c - Char.code '0'
203 else if c >= 'a' && c <= 'f' then Char.code c - Char.code 'a' + 10
204 else if c >= 'A' && c <= 'F' then Char.code c - Char.code 'A' + 10
205 else Toml.Error.raise_number Invalid_hex_digit
206
207(* Convert Unicode codepoint to UTF-8 using uutf *)
208let codepoint_to_utf8 codepoint =
209 if codepoint < 0 || codepoint > 0x10FFFF then
210 Fmt.failwith "Invalid Unicode codepoint: U+%X" codepoint;
211 if codepoint >= 0xD800 && codepoint <= 0xDFFF then
212 Fmt.failwith "Surrogate codepoint not allowed: U+%04X" codepoint;
213 let buf = Buffer.create 4 in
214 Uutf.Buffer.add_utf_8 buf (Uchar.of_int codepoint);
215 Buffer.contents buf
216
217(* Parse Unicode escape with error location from lexer *)
218let unicode_to_utf8 l codepoint =
219 if codepoint < 0 || codepoint > 0x10FFFF then
220 Toml.Error.raise_lexer ~location:(lexer_loc l)
221 (Invalid_unicode_codepoint codepoint);
222 if codepoint >= 0xD800 && codepoint <= 0xDFFF then
223 Toml.Error.raise_lexer ~location:(lexer_loc l)
224 (Surrogate_codepoint codepoint);
225 let buf = Buffer.create 4 in
226 Uutf.Buffer.add_utf_8 buf (Uchar.of_int codepoint);
227 Buffer.contents buf
228
229let parse_escape l =
230 advance l;
231 (* skip backslash *)
232 if is_eof l then Toml.Error.raise_lexer ~location:(lexer_loc l) Unexpected_eof;
233 let c = current l in
234 advance l;
235 match c with
236 | 'b' -> "\b"
237 | 't' -> "\t"
238 | 'n' -> "\n"
239 | 'f' -> "\x0C"
240 | 'r' -> "\r"
241 | 'e' -> "\x1B" (* TOML 1.1 escape *)
242 | '"' -> "\""
243 | '\\' -> "\\"
244 | 'x' ->
245 (* \xHH - 2 hex digits *)
246 if l.pos + 1 >= l.input_len then
247 Toml.Error.raise_lexer ~location:(lexer_loc l) (Incomplete_escape "\\x");
248 let c1 = current_char l l.pos in
249 let c2 = current_char l (l.pos + 1) in
250 if not (is_hex_digit c1 && is_hex_digit c2) then
251 Toml.Error.raise_lexer ~location:(lexer_loc l)
252 (Invalid_unicode_escape "\\x");
253 let cp = (hex_value c1 * 16) + hex_value c2 in
254 advance l;
255 advance l;
256 unicode_to_utf8 l cp
257 | 'u' ->
258 (* \uHHHH - 4 hex digits *)
259 if l.pos + 3 >= l.input_len then
260 Toml.Error.raise_lexer ~location:(lexer_loc l) (Incomplete_escape "\\u");
261 let s = sub_string l l.pos 4 in
262 for i = 0 to 3 do
263 if not (is_hex_digit s.[i]) then
264 Toml.Error.raise_lexer ~location:(lexer_loc l)
265 (Invalid_unicode_escape "\\u")
266 done;
267 let cp = int_of_string ("0x" ^ s) in
268 advance_n l 4;
269 unicode_to_utf8 l cp
270 | 'U' ->
271 (* \UHHHHHHHH - 8 hex digits *)
272 if l.pos + 7 >= l.input_len then
273 Toml.Error.raise_lexer ~location:(lexer_loc l) (Incomplete_escape "\\U");
274 let s = sub_string l l.pos 8 in
275 for i = 0 to 7 do
276 if not (is_hex_digit s.[i]) then
277 Toml.Error.raise_lexer ~location:(lexer_loc l)
278 (Invalid_unicode_escape "\\U")
279 done;
280 let cp = int_of_string ("0x" ^ s) in
281 advance_n l 8;
282 unicode_to_utf8 l cp
283 | _ -> Toml.Error.raise_lexer ~location:(lexer_loc l) (Invalid_escape c)
284
285let validate_string_char l c is_multiline =
286 let code = Char.code c in
287 (* Control characters other than tab (and LF/CR for multiline) are not allowed *)
288 if code < 0x09 then
289 Toml.Error.raise_lexer ~location:(lexer_loc l) (Control_character code);
290 if
291 code > 0x09 && code < 0x20
292 && not (is_multiline && (code = 0x0A || code = 0x0D))
293 then Toml.Error.raise_lexer ~location:(lexer_loc l) (Control_character code);
294 if code = 0x7F then
295 Toml.Error.raise_lexer ~location:(lexer_loc l) (Control_character code)
296
297(* Validate UTF-8 in string context and add bytes to buffer *)
298let utf8_add_validated l buf =
299 let byte_len = validate_utf8_at_pos_bytes l in
300 Buffer.add_string buf (sub_string l l.pos byte_len);
301 for _ = 1 to byte_len do
302 advance l
303 done
304
305let skip_opening_newline l errmsg =
306 match peek l with
307 | Some '\n' -> advance l
308 | Some '\r' ->
309 advance l;
310 if peek l = Some '\n' then advance l else failwith errmsg
311 | _ -> ()
312
313let skip_all_ws_newlines l =
314 let rec loop () =
315 match peek l with
316 | Some ' ' | Some '\t' | Some '\n' ->
317 advance l;
318 loop ()
319 | Some '\r' ->
320 advance l;
321 if peek l = Some '\n' then advance l;
322 loop ()
323 | _ -> ()
324 in
325 loop ()
326
327let handle_multiline_quotes l buf quote_char cont =
328 let quote_count = ref 0 in
329 let p = ref l.pos in
330 while !p < l.input_len && current_char l !p = quote_char do
331 incr quote_count;
332 incr p
333 done;
334 if !quote_count >= 3 then begin
335 let extra = min (!quote_count - 3) 2 in
336 for _ = 1 to extra do
337 Buffer.add_char buf quote_char
338 done;
339 advance_n l !quote_count;
340 if !quote_count > 5 then failwith "Too many quotes in multiline string"
341 end
342 else begin
343 for _ = 1 to !quote_count do
344 Buffer.add_char buf quote_char;
345 advance l
346 done;
347 cont ()
348 end
349
350let handle_multiline_basic_backslash l buf loop =
351 let saved_pos = l.pos and saved_line = l.line and saved_col = l.col in
352 advance l;
353 let rec skip_ws () =
354 match peek l with
355 | Some ' ' | Some '\t' ->
356 advance l;
357 skip_ws ()
358 | _ -> ()
359 in
360 skip_ws ();
361 match peek l with
362 | Some '\n' ->
363 advance l;
364 skip_all_ws_newlines l;
365 loop ()
366 | Some '\r' ->
367 advance l;
368 if peek l = Some '\n' then advance l;
369 skip_all_ws_newlines l;
370 loop ()
371 | _ ->
372 l.pos <- saved_pos;
373 l.line <- saved_line;
374 l.col <- saved_col;
375 Buffer.add_string buf (parse_escape l);
376 loop ()
377
378let rec parse_multiline_basic_string l buf () =
379 if is_eof l then failwith "Unterminated string";
380 let c = current l in
381 if c = '"' then
382 handle_multiline_quotes l buf '"' (parse_multiline_basic_string l buf)
383 else if c = '\\' then
384 handle_multiline_basic_backslash l buf (parse_multiline_basic_string l buf)
385 else begin
386 if c = '\r' then begin
387 advance l;
388 if peek l = Some '\n' then (
389 Buffer.add_char buf '\n';
390 advance l)
391 else failwith "Bare carriage return not allowed in string"
392 end
393 else if Char.code c >= 0x80 then utf8_add_validated l buf
394 else begin
395 validate_string_char l c true;
396 Buffer.add_char buf c;
397 advance l
398 end;
399 parse_multiline_basic_string l buf ()
400 end
401
402let rec parse_single_basic_string l buf () =
403 if is_eof l then failwith "Unterminated string";
404 let c = current l in
405 if c = '"' then advance l
406 else if c = '\\' then (
407 Buffer.add_string buf (parse_escape l);
408 parse_single_basic_string l buf ())
409 else if c = '\n' || c = '\r' then
410 failwith "Newline not allowed in basic string"
411 else begin
412 if Char.code c >= 0x80 then utf8_add_validated l buf
413 else begin
414 validate_string_char l c false;
415 Buffer.add_char buf c;
416 advance l
417 end;
418 parse_single_basic_string l buf ()
419 end
420
421let parse_basic_string l =
422 advance l;
423 let buf = Buffer.create 64 in
424 let multiline =
425 match peek_n l 2 with
426 | Some "\"\"" ->
427 advance l;
428 advance l;
429 skip_opening_newline l "Bare carriage return not allowed in string";
430 true
431 | _ -> false
432 in
433 if multiline then parse_multiline_basic_string l buf ()
434 else parse_single_basic_string l buf ();
435 (Buffer.contents buf, multiline)
436
437let validate_literal_ctrl l c code ~multiline =
438 let is_ctrl =
439 if multiline then
440 code < 0x09
441 || (code > 0x09 && code < 0x0A)
442 || (code > 0x0D && code < 0x20)
443 || code = 0x7F
444 else code < 0x09 || (code > 0x09 && code < 0x20) || code = 0x7F
445 in
446 if is_ctrl && code <> 0x0A && code <> 0x0D then
447 failwith
448 (Fmt.str
449 "Control character U+%04X not allowed in literal string at line %d"
450 (Char.code c) l.line)
451
452let rec parse_multiline_literal_string l buf () =
453 if is_eof l then failwith "Unterminated literal string";
454 let c = current l in
455 if c = '\'' then
456 handle_multiline_quotes l buf '\'' (parse_multiline_literal_string l buf)
457 else begin
458 if c = '\r' then begin
459 advance l;
460 if peek l = Some '\n' then (
461 Buffer.add_char buf '\n';
462 advance l)
463 else failwith "Bare carriage return not allowed in literal string"
464 end
465 else if Char.code c >= 0x80 then utf8_add_validated l buf
466 else begin
467 validate_literal_ctrl l c (Char.code c) ~multiline:true;
468 Buffer.add_char buf c;
469 advance l
470 end;
471 parse_multiline_literal_string l buf ()
472 end
473
474let rec parse_single_literal_string l buf () =
475 if is_eof l then failwith "Unterminated literal string";
476 let c = current l in
477 if c = '\'' then advance l
478 else if c = '\n' || c = '\r' then
479 failwith "Newline not allowed in literal string"
480 else begin
481 let code = Char.code c in
482 if code >= 0x80 then utf8_add_validated l buf
483 else begin
484 validate_literal_ctrl l c code ~multiline:false;
485 Buffer.add_char buf c;
486 advance l
487 end;
488 parse_single_literal_string l buf ()
489 end
490
491let parse_literal_string l =
492 advance l;
493 let buf = Buffer.create 64 in
494 let multiline =
495 match peek_n l 2 with
496 | Some "''" ->
497 advance l;
498 advance l;
499 skip_opening_newline l
500 "Bare carriage return not allowed in literal string";
501 true
502 | _ -> false
503 in
504 if multiline then parse_multiline_literal_string l buf ()
505 else parse_single_literal_string l buf ();
506 (Buffer.contents buf, multiline)
507
508let read_prefixed_int l is_valid_digit prefix fail_prefix fail_trailing =
509 advance l;
510 advance l;
511 (* skip 0x/0o/0b *)
512 let num_start = l.pos in
513 if peek l = Some '_' then
514 Fmt.failwith "Leading underscore not allowed after %s" prefix;
515 let rec read first =
516 match peek l with
517 | Some c when is_valid_digit c ->
518 advance l;
519 read false
520 | Some '_' ->
521 if first then
522 Fmt.failwith "Underscore must follow a %s digit" fail_prefix;
523 advance l;
524 if peek l |> Option.map is_valid_digit |> Option.value ~default:false
525 then read false
526 else Fmt.failwith "Trailing underscore in %s number" fail_trailing
527 | _ ->
528 if first then
529 Fmt.failwith "Expected %s digit after %s" fail_prefix prefix
530 in
531 read true;
532 (num_start, l.pos)
533
534let read_decimal_int l =
535 (* Read digit sequence with _ separators, returns whether any were read *)
536 let rec read_int first =
537 match peek l with
538 | Some c when is_digit c ->
539 advance l;
540 read_int false
541 | Some '_' ->
542 if first then failwith "Underscore must follow a digit";
543 advance l;
544 if peek l |> Option.map is_digit |> Option.value ~default:false then
545 read_int false
546 else failwith "Trailing underscore in number"
547 | _ -> if first then failwith "Expected digit"
548 in
549 read_int
550
551let parse_decimal_number l start =
552 let first_digit = peek l in
553 if first_digit = Some '0' then begin
554 match peek2 l with
555 | Some c when is_digit c -> failwith "Leading zeros not allowed"
556 | Some '_' -> failwith "Leading zeros not allowed"
557 | _ -> ()
558 end;
559 let read_int = read_decimal_int l in
560 (match peek l with
561 | Some c when is_digit c -> read_int false
562 | _ -> failwith "Expected digit after sign");
563 let is_float = ref false in
564 (match (peek l, peek2 l) with
565 | Some '.', Some c when is_digit c ->
566 is_float := true;
567 advance l;
568 read_int false
569 | Some '.', _ -> failwith "Decimal point must be followed by digit"
570 | _ -> ());
571 (match peek l with
572 | Some 'e' | Some 'E' ->
573 is_float := true;
574 advance l;
575 (match peek l with Some '+' | Some '-' -> advance l | _ -> ());
576 (match peek l with
577 | Some '_' -> failwith "Underscore cannot follow exponent"
578 | _ -> ());
579 read_int true
580 | _ -> ());
581 let s = sub_string l start (l.pos - start) in
582 let s' = String.concat "" (String.split_on_char '_' s) in
583 if !is_float then Tok_float (float_of_string s', s)
584 else Tok_integer (Int64.of_string s', s)
585
586let parse_number l =
587 let start = l.pos in
588 let neg =
589 match peek l with
590 | Some '-' ->
591 advance l;
592 true
593 | Some '+' ->
594 advance l;
595 false
596 | _ -> false
597 in
598 match peek_n l 3 with
599 | Some "inf" ->
600 advance_n l 3;
601 let s = sub_string l start (l.pos - start) in
602 Tok_float ((if neg then Float.neg_infinity else Float.infinity), s)
603 | Some "nan" ->
604 advance_n l 3;
605 let s = sub_string l start (l.pos - start) in
606 Tok_float (Float.nan, s)
607 | _ -> (
608 match (peek l, peek2 l) with
609 | Some '0', Some 'x' when not neg ->
610 let num_start, num_end =
611 read_prefixed_int l is_hex_digit "0x" "hex" "hex"
612 in
613 let s = sub_string l num_start (num_end - num_start) in
614 let s = String.concat "" (String.split_on_char '_' s) in
615 let orig = sub_string l start (l.pos - start) in
616 Tok_integer (Int64.of_string ("0x" ^ s), orig)
617 | Some '0', Some 'o' when not neg ->
618 let num_start, num_end =
619 read_prefixed_int l is_oct_digit "0o" "octal" "octal"
620 in
621 let s = sub_string l num_start (num_end - num_start) in
622 let s = String.concat "" (String.split_on_char '_' s) in
623 let orig = sub_string l start (l.pos - start) in
624 Tok_integer (Int64.of_string ("0o" ^ s), orig)
625 | Some '0', Some 'b' when not neg ->
626 let num_start, num_end =
627 read_prefixed_int l is_bin_digit "0b" "binary" "binary"
628 in
629 let s = sub_string l num_start (num_end - num_start) in
630 let s = String.concat "" (String.split_on_char '_' s) in
631 let orig = sub_string l start (l.pos - start) in
632 Tok_integer (Int64.of_string ("0b" ^ s), orig)
633 | _ -> parse_decimal_number l start)
634
635(* Check if we're looking at a datetime/date/time *)
636let is_time_prefix l pos =
637 pos + 5 <= l.input_len
638 && is_digit (current_char l pos)
639 && is_digit (current_char l (pos + 1))
640 && current_char l (pos + 2) = ':'
641 && is_digit (current_char l (pos + 3))
642 && is_digit (current_char l (pos + 4))
643
644let is_date_prefix l pos =
645 pos + 10 <= l.input_len
646 && is_digit (current_char l pos)
647 && is_digit (current_char l (pos + 1))
648 && is_digit (current_char l (pos + 2))
649 && is_digit (current_char l (pos + 3))
650 && current_char l (pos + 4) = '-'
651 && is_digit (current_char l (pos + 5))
652 && is_digit (current_char l (pos + 6))
653 && current_char l (pos + 7) = '-'
654 && is_digit (current_char l (pos + 8))
655 && is_digit (current_char l (pos + 9))
656
657let check_date_suffix l pos =
658 let len = l.input_len in
659 if pos + 10 >= len then `Date
660 else
661 let next = current_char l (pos + 10) in
662 if
663 next = 'T' || next = 't' || next = '\n' || next = '\r' || next = '#'
664 || next = ',' || next = ']' || next = '}'
665 then `Date
666 else if next = ' ' || next = '\t' then begin
667 let rec skip_ws p =
668 if p >= len then p
669 else
670 match current_char l p with ' ' | '\t' -> skip_ws (p + 1) | _ -> p
671 in
672 if current_char l (skip_ws (pos + 11)) = '=' then `Other else `Date
673 end
674 else if is_bare_key_char next then `Other
675 else `Date
676
677let looks_like_datetime l =
678 let pos = l.pos in
679 if is_date_prefix l pos then check_date_suffix l pos
680 else if is_time_prefix l pos then `Time
681 else `Other
682
683(* Date/time validation *)
684let validate_date year month day =
685 if month < 1 || month > 12 then Fmt.failwith "Invalid month: %d" month;
686 if day < 1 then Fmt.failwith "Invalid day: %d" day;
687 let days_in_month = [| 0; 31; 28; 31; 30; 31; 30; 31; 31; 30; 31; 30; 31 |] in
688 let is_leap = (year mod 4 = 0 && year mod 100 <> 0) || year mod 400 = 0 in
689 let max_days = if month = 2 && is_leap then 29 else days_in_month.(month) in
690 if day > max_days then Fmt.failwith "Invalid day %d for month %d" day month
691
692let validate_time hour minute second =
693 if hour < 0 || hour > 23 then Fmt.failwith "Invalid hour: %d" hour;
694 if minute < 0 || minute > 59 then Fmt.failwith "Invalid minute: %d" minute;
695 if second < 0 || second > 60 then (* 60 for leap second *)
696 Fmt.failwith "Invalid second: %d" second
697
698let validate_offset hour minute =
699 if hour < 0 || hour > 23 then
700 Fmt.failwith "Invalid timezone offset hour: %d" hour;
701 if minute < 0 || minute > 59 then
702 Fmt.failwith "Invalid timezone offset minute: %d" minute
703
704let read_2_digits l buf secondary_buf fail_msg =
705 for _ = 1 to 2 do
706 match peek l with
707 | Some c when is_digit c ->
708 Buffer.add_char buf c;
709 Buffer.add_char secondary_buf c;
710 advance l
711 | _ -> failwith fail_msg
712 done
713
714let read_optional_seconds_frac l buf second_buf =
715 match peek l with
716 | Some ':' -> (
717 Buffer.add_char buf ':';
718 advance l;
719 read_2_digits l buf second_buf "Invalid time format";
720 (* Optional fractional seconds *)
721 match peek l with
722 | Some '.' ->
723 Buffer.add_char buf '.';
724 advance l;
725 if not (peek l |> Option.map is_digit |> Option.value ~default:false)
726 then failwith "Expected digit after decimal point";
727 while peek l |> Option.map is_digit |> Option.value ~default:false do
728 Buffer.add_char buf (Option.get (peek l));
729 advance l
730 done
731 | _ -> ())
732 | _ ->
733 (* No seconds - add :00 for normalization *)
734 Buffer.add_string buf ":00";
735 Buffer.add_string second_buf "00"
736
737let read_date_part l buf =
738 (* Read YYYY-MM-DD into buf, validate date components *)
739 let year_buf = Buffer.create 4 in
740 let month_buf = Buffer.create 2 in
741 let day_buf = Buffer.create 2 in
742 for _ = 1 to 4 do
743 match peek l with
744 | Some c when is_digit c ->
745 Buffer.add_char buf c;
746 Buffer.add_char year_buf c;
747 advance l
748 | _ -> failwith "Invalid date format"
749 done;
750 if peek l <> Some '-' then failwith "Invalid date format";
751 Buffer.add_char buf '-';
752 advance l;
753 read_2_digits l buf month_buf "Invalid date format";
754 if peek l <> Some '-' then failwith "Invalid date format";
755 Buffer.add_char buf '-';
756 advance l;
757 read_2_digits l buf day_buf "Invalid date format";
758 let year = int_of_string (Buffer.contents year_buf) in
759 let month = int_of_string (Buffer.contents month_buf) in
760 let day = int_of_string (Buffer.contents day_buf) in
761 validate_date year month day
762
763let read_tz_offset l buf =
764 (* Parse +HH:MM or -HH:MM offset, validate and append to buf *)
765 let sign = current l in
766 let off_hour_buf = Buffer.create 2 in
767 let off_min_buf = Buffer.create 2 in
768 Buffer.add_char buf sign;
769 advance l;
770 read_2_digits l buf off_hour_buf "Invalid timezone offset";
771 if peek l <> Some ':' then failwith "Invalid timezone offset";
772 Buffer.add_char buf ':';
773 advance l;
774 read_2_digits l buf off_min_buf "Invalid timezone offset";
775 let off_hour = int_of_string (Buffer.contents off_hour_buf) in
776 let off_min = int_of_string (Buffer.contents off_min_buf) in
777 validate_offset off_hour off_min
778
779let read_time_and_offset l buf =
780 (* Read HH:MM[:SS[.frac]] and optional offset, return datetime token *)
781 let hour_buf = Buffer.create 2 in
782 let minute_buf = Buffer.create 2 in
783 let second_buf = Buffer.create 2 in
784 Buffer.add_char buf 'T';
785 (* normalize to uppercase T *)
786 read_2_digits l buf hour_buf "Invalid time format";
787 if peek l <> Some ':' then failwith "Invalid time format";
788 Buffer.add_char buf ':';
789 advance l;
790 read_2_digits l buf minute_buf "Invalid time format";
791 read_optional_seconds_frac l buf second_buf;
792 let hour = int_of_string (Buffer.contents hour_buf) in
793 let minute = int_of_string (Buffer.contents minute_buf) in
794 let second =
795 if Buffer.length second_buf > 0 then
796 int_of_string (Buffer.contents second_buf)
797 else 0
798 in
799 validate_time hour minute second;
800 match peek l with
801 | Some 'Z' | Some 'z' ->
802 Buffer.add_char buf 'Z';
803 advance l;
804 Tok_datetime (Buffer.contents buf)
805 | Some '+' | Some '-' ->
806 read_tz_offset l buf;
807 Tok_datetime (Buffer.contents buf)
808 | _ -> Tok_datetime_local (Buffer.contents buf)
809
810let parse_datetime l =
811 let buf = Buffer.create 32 in
812 read_date_part l buf;
813 match peek l with
814 | Some 'T' | Some 't' ->
815 advance l;
816 read_time_and_offset l buf
817 | Some ' ' -> (
818 (* Space could be followed by time (datetime with space separator)
819 or could be end of date (local date followed by comment/value) *)
820 advance l;
821 match peek l with
822 | Some c when is_digit c -> read_time_and_offset l buf
823 | _ ->
824 (* Not followed by time - this is just a local date *)
825 l.pos <- l.pos - 1;
826 (* Go back to before the space *)
827 Tok_date_local (Buffer.contents buf))
828 | _ -> Tok_date_local (Buffer.contents buf)
829
830let parse_time l =
831 let buf = Buffer.create 16 in
832 let hour_buf = Buffer.create 2 in
833 let minute_buf = Buffer.create 2 in
834 let second_buf = Buffer.create 2 in
835 (* Read HH:MM *)
836 read_2_digits l buf hour_buf "Invalid time format";
837 if peek l <> Some ':' then failwith "Invalid time format";
838 Buffer.add_char buf ':';
839 advance l;
840 read_2_digits l buf minute_buf "Invalid time format";
841 read_optional_seconds_frac l buf second_buf;
842 (* Validate time *)
843 let hour = int_of_string (Buffer.contents hour_buf) in
844 let minute = int_of_string (Buffer.contents minute_buf) in
845 let second =
846 if Buffer.length second_buf > 0 then
847 int_of_string (Buffer.contents second_buf)
848 else 0
849 in
850 validate_time hour minute second;
851 Tok_time_local (Buffer.contents buf)
852
853let ws_followed_by_eq l p =
854 let rec skip pp =
855 if pp >= l.input_len then false
856 else
857 match current_char l pp with
858 | ' ' | '\t' -> skip (pp + 1)
859 | '=' -> true
860 | _ -> false
861 in
862 skip p
863
864let signed_digit_is_key_context l start =
865 (* Determine if a signed-digit sequence like -01 is a key rather than a number *)
866 let rec scan_ahead p =
867 if p >= l.input_len then false
868 else
869 let c = current_char l p in
870 if is_digit c || c = '_' then scan_ahead (p + 1)
871 else if c = ' ' || c = '\t' then ws_followed_by_eq l (p + 1)
872 else if c = '=' then true
873 else if c = '.' then
874 p + 1 < l.input_len
875 &&
876 let next = current_char l (p + 1) in
877 (not (is_digit next)) && is_bare_key_char next
878 else if c = 'e' || c = 'E' then false
879 else is_bare_key_char c
880 in
881 scan_ahead (start + 1)
882
883let digit_is_bare_key l start =
884 (* Scan from start to determine if a digit-led token is a bare key vs number *)
885 let rec scan pos has_dash =
886 if pos >= l.input_len then has_dash
887 else
888 match current_char l pos with
889 | c when is_digit c || c = '_' || c = '.' -> scan (pos + 1) has_dash
890 | '-' when pos + 1 < l.input_len ->
891 let next = current_char l (pos + 1) in
892 if is_digit next then scan (pos + 1) true
893 else is_bare_key_char next || has_dash
894 | ('e' | 'E') when pos + 1 >= l.input_len -> true
895 | 'e' | 'E' ->
896 let next = current_char l (pos + 1) in
897 if is_digit next then has_dash
898 else if
899 (next = '+' || next = '-')
900 && pos + 2 < l.input_len
901 && is_digit (current_char l (pos + 2))
902 then has_dash
903 else true
904 | c -> is_bare_key_char c || has_dash
905 in
906 let has_leading_zero =
907 current_char l start = '0'
908 && start + 1 < l.input_len
909 && is_digit (current_char l (start + 1))
910 in
911 has_leading_zero || scan start false
912
913let lex_signed_token l =
914 let sign = current l in
915 let start = l.pos in
916 match peek2 l with
917 | Some d when is_digit d ->
918 if signed_digit_is_key_context l start then begin
919 while (not (is_eof l)) && is_bare_key_char (current l) do
920 advance l
921 done;
922 Tok_bare_key (sub_string l start (l.pos - start))
923 end
924 else parse_number l
925 | Some 'i' ->
926 if
927 l.pos + 3 < l.input_len
928 && current_char l (l.pos + 1) = 'i'
929 && current_char l (l.pos + 2) = 'n'
930 && current_char l (l.pos + 3) = 'f'
931 then begin
932 advance_n l 4;
933 let s = sub_string l start (l.pos - start) in
934 if sign = '-' then Tok_float (Float.neg_infinity, s)
935 else Tok_float (Float.infinity, s)
936 end
937 else if sign = '-' then begin
938 while (not (is_eof l)) && is_bare_key_char (current l) do
939 advance l
940 done;
941 Tok_bare_key (sub_string l start (l.pos - start))
942 end
943 else Fmt.failwith "Unexpected character after %c" sign
944 | Some 'n' ->
945 if
946 l.pos + 3 < l.input_len
947 && current_char l (l.pos + 1) = 'n'
948 && current_char l (l.pos + 2) = 'a'
949 && current_char l (l.pos + 3) = 'n'
950 then begin
951 advance_n l 4;
952 let s = sub_string l start (l.pos - start) in
953 Tok_float (Float.nan, s)
954 end
955 else if sign = '-' then begin
956 while (not (is_eof l)) && is_bare_key_char (current l) do
957 advance l
958 done;
959 Tok_bare_key (sub_string l start (l.pos - start))
960 end
961 else Fmt.failwith "Unexpected character after %c" sign
962 | _ when sign = '-' ->
963 while (not (is_eof l)) && is_bare_key_char (current l) do
964 advance l
965 done;
966 Tok_bare_key (sub_string l start (l.pos - start))
967 | _ -> Fmt.failwith "Unexpected character after %c" sign
968
969let lex_digit_token l =
970 match looks_like_datetime l with
971 | `Date -> parse_datetime l
972 | `Time -> parse_time l
973 | `Other ->
974 let start = l.pos in
975 let is_prefixed_number =
976 start + 1 < l.input_len
977 && current_char l start = '0'
978 &&
979 let c1 = current_char l (start + 1) in
980 c1 = 'x' || c1 = 'X' || c1 = 'o' || c1 = 'O' || c1 = 'b' || c1 = 'B'
981 in
982 if is_prefixed_number then parse_number l
983 else if digit_is_bare_key l start then begin
984 while (not (is_eof l)) && is_bare_key_char (current l) do
985 advance l
986 done;
987 Tok_bare_key (sub_string l start (l.pos - start))
988 end
989 else parse_number l
990
991let next_token l =
992 skip_ws_and_comments l;
993 if is_eof l then Tok_eof
994 else begin
995 let c = current l in
996 match c with
997 | '[' ->
998 advance l;
999 Tok_lbracket
1000 | ']' ->
1001 advance l;
1002 Tok_rbracket
1003 | '{' ->
1004 advance l;
1005 Tok_lbrace
1006 | '}' ->
1007 advance l;
1008 Tok_rbrace
1009 | '=' ->
1010 advance l;
1011 Tok_equals
1012 | ',' ->
1013 advance l;
1014 Tok_comma
1015 | '.' ->
1016 advance l;
1017 Tok_dot
1018 | '\n' ->
1019 advance l;
1020 Tok_newline
1021 | '\r' ->
1022 advance l;
1023 if peek l = Some '\n' then begin
1024 advance l;
1025 Tok_newline
1026 end
1027 else Fmt.failwith "Bare carriage return not allowed at line %d" l.line
1028 | '"' ->
1029 let s, multiline = parse_basic_string l in
1030 if multiline then Tok_ml_basic_string s else Tok_basic_string s
1031 | '\'' ->
1032 let s, multiline = parse_literal_string l in
1033 if multiline then Tok_ml_literal_string s else Tok_literal_string s
1034 | '+' | '-' -> lex_signed_token l
1035 | c when is_digit c -> lex_digit_token l
1036 | c when c = 't' || c = 'f' || c = 'i' || c = 'n' ->
1037 let start = l.pos in
1038 while (not (is_eof l)) && is_bare_key_char (current l) do
1039 advance l
1040 done;
1041 Tok_bare_key (sub_string l start (l.pos - start))
1042 | c when is_bare_key_char c ->
1043 let start = l.pos in
1044 while (not (is_eof l)) && is_bare_key_char (current l) do
1045 advance l
1046 done;
1047 Tok_bare_key (sub_string l start (l.pos - start))
1048 | c ->
1049 let code = Char.code c in
1050 if code < 0x20 || code = 0x7F then
1051 Fmt.failwith "Control character U+%04X not allowed at line %d" code
1052 l.line
1053 else
1054 Fmt.failwith "Unexpected character '%c' at line %d, column %d" c
1055 l.line l.col
1056 end
1057
1058(* Parser *)
1059
1060type parser = { lexer : lexer; mutable current : token; mutable peeked : bool }
1061
1062let parser lexer = { lexer; current = Tok_eof; peeked = false }
1063
1064let peek_token p =
1065 if not p.peeked then begin
1066 p.current <- next_token p.lexer;
1067 p.peeked <- true
1068 end;
1069 p.current
1070
1071let consume_token p =
1072 let tok = peek_token p in
1073 p.peeked <- false;
1074 tok
1075
1076(* Check if next raw character (without skipping whitespace) matches *)
1077let next_raw_char_is p c =
1078 p.lexer.pos < p.lexer.input_len && current_char p.lexer p.lexer.pos = c
1079
1080let expect_token p expected =
1081 let tok = consume_token p in
1082 if tok <> expected then
1083 failwith
1084 (Fmt.str "Expected %s"
1085 (match expected with
1086 | Tok_equals -> "="
1087 | Tok_rbracket -> "]"
1088 | Tok_rbrace -> "}"
1089 | Tok_newline -> "newline"
1090 | _ -> "token"))
1091
1092let skip_newlines p =
1093 while peek_token p = Tok_newline do
1094 ignore (consume_token p)
1095 done
1096
1097(* Parse a single key segment (bare, basic string, literal string, or integer) *)
1098(* Note: Tok_float is handled specially in parse_dotted_key *)
1099let parse_key_segment p =
1100 match peek_token p with
1101 | Tok_bare_key s ->
1102 ignore (consume_token p);
1103 [ s ]
1104 | Tok_basic_string s ->
1105 ignore (consume_token p);
1106 [ s ]
1107 | Tok_literal_string s ->
1108 ignore (consume_token p);
1109 [ s ]
1110 | Tok_integer (_i, orig_str) ->
1111 ignore (consume_token p);
1112 [ orig_str ]
1113 | Tok_float (f, orig_str) ->
1114 (* Float in key context - use original string to preserve exact key parts *)
1115 ignore (consume_token p);
1116 if Float.is_nan f then [ "nan" ]
1117 else if f = Float.infinity then [ "inf" ]
1118 else if f = Float.neg_infinity then [ "-inf" ]
1119 else begin
1120 (* Remove underscores from original string and split on dot *)
1121 let s = String.concat "" (String.split_on_char '_' orig_str) in
1122 if String.contains s 'e' || String.contains s 'E' then
1123 (* Has exponent, treat as single key *)
1124 [ s ]
1125 else if String.contains s '.' then
1126 (* Split on decimal point for dotted key *)
1127 String.split_on_char '.' s
1128 else
1129 (* No decimal point, single integer key *)
1130 [ s ]
1131 end
1132 | Tok_date_local s ->
1133 ignore (consume_token p);
1134 [ s ]
1135 | Tok_datetime s ->
1136 ignore (consume_token p);
1137 [ s ]
1138 | Tok_datetime_local s ->
1139 ignore (consume_token p);
1140 [ s ]
1141 | Tok_time_local s ->
1142 ignore (consume_token p);
1143 [ s ]
1144 | Tok_ml_basic_string _ ->
1145 failwith "Multiline strings are not allowed as keys"
1146 | Tok_ml_literal_string _ ->
1147 failwith "Multiline strings are not allowed as keys"
1148 | _ -> failwith "Expected key"
1149
1150(* Parse a dotted key - returns list of key strings *)
1151let parse_dotted_key p =
1152 let first_keys = parse_key_segment p in
1153 let rec loop acc =
1154 match peek_token p with
1155 | Tok_dot ->
1156 ignore (consume_token p);
1157 let keys = parse_key_segment p in
1158 loop (List.rev_append keys acc)
1159 | _ -> List.rev acc
1160 in
1161 let rest = loop [] in
1162 first_keys @ rest
1163
1164let validate_number_underscores str =
1165 let len = String.length str in
1166 if len > 0 && str.[0] = '_' then
1167 failwith "Leading underscore not allowed in number";
1168 if len > 0 && str.[len - 1] = '_' then
1169 failwith "Trailing underscore not allowed in number";
1170 let has_hex_prefix =
1171 len > 2 && str.[0] = '0' && (str.[1] = 'x' || str.[1] = 'X')
1172 in
1173 let is_digit_char c = c >= '0' && c <= '9' in
1174 let is_hex_char c =
1175 (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F')
1176 in
1177 for i = 0 to len - 2 do
1178 if str.[i] = '_' && str.[i + 1] = '_' then
1179 failwith "Double underscore not allowed in number";
1180 if str.[i] = '_' then begin
1181 let prev = if i > 0 then Some str.[i - 1] else None in
1182 let next = Some str.[i + 1] in
1183 match (prev, next) with
1184 | Some p, Some n when has_hex_prefix && is_hex_char p && is_hex_char n ->
1185 ()
1186 | Some p, Some n when is_digit_char p && is_digit_char n -> ()
1187 | _ -> failwith "Underscore must be between digits"
1188 end
1189 done
1190
1191let bare_key_as_number s =
1192 validate_number_underscores s;
1193 let s_no_underscore = String.concat "" (String.split_on_char '_' s) in
1194 let len = String.length s_no_underscore in
1195 if len = 0 then Fmt.failwith "Unexpected bare key '%s' as value" s;
1196 let c0 = s_no_underscore.[0] in
1197 if c0 < '0' || c0 > '9' then
1198 Fmt.failwith "Unexpected bare key '%s' as value" s;
1199 if
1200 len > 1 && c0 = '0'
1201 && s_no_underscore.[1] >= '0'
1202 && s_no_underscore.[1] <= '9'
1203 then failwith "Leading zeros not allowed";
1204 try
1205 if
1206 String.contains s_no_underscore '.'
1207 || String.contains s_no_underscore 'e'
1208 || String.contains s_no_underscore 'E'
1209 then Toml.Float (float_of_string s_no_underscore)
1210 else Toml.Int (Int64.of_string s_no_underscore)
1211 with Failure _ -> Fmt.failwith "Unexpected bare key '%s' as value" s
1212
1213let rec parse_value p =
1214 match peek_token p with
1215 | Tok_basic_string s ->
1216 ignore (consume_token p);
1217 Toml.String s
1218 | Tok_literal_string s ->
1219 ignore (consume_token p);
1220 Toml.String s
1221 | Tok_ml_basic_string s ->
1222 ignore (consume_token p);
1223 Toml.String s
1224 | Tok_ml_literal_string s ->
1225 ignore (consume_token p);
1226 Toml.String s
1227 | Tok_integer (i, _) ->
1228 ignore (consume_token p);
1229 Toml.Int i
1230 | Tok_float (f, _) ->
1231 ignore (consume_token p);
1232 Toml.Float f
1233 | Tok_datetime s ->
1234 ignore (consume_token p);
1235 Toml.Datetime s
1236 | Tok_datetime_local s ->
1237 ignore (consume_token p);
1238 Toml.Datetime_local s
1239 | Tok_date_local s ->
1240 ignore (consume_token p);
1241 Toml.Date_local s
1242 | Tok_time_local s ->
1243 ignore (consume_token p);
1244 Toml.Time_local s
1245 | Tok_lbracket -> parse_array p
1246 | Tok_lbrace -> parse_inline_table p
1247 | Tok_bare_key s -> (
1248 ignore (consume_token p);
1249 match s with
1250 | "true" -> Bool true
1251 | "false" -> Bool false
1252 | "inf" -> Float Float.infinity
1253 | "nan" -> Float Float.nan
1254 | _ -> bare_key_as_number s)
1255 | _ -> failwith "Expected value"
1256
1257and parse_array p =
1258 ignore (consume_token p);
1259 (* [ *)
1260 skip_newlines p;
1261 let rec loop acc =
1262 match peek_token p with
1263 | Tok_rbracket ->
1264 ignore (consume_token p);
1265 Toml.Array (List.rev acc)
1266 | _ -> (
1267 let v = parse_value p in
1268 skip_newlines p;
1269 match peek_token p with
1270 | Tok_comma ->
1271 ignore (consume_token p);
1272 skip_newlines p;
1273 loop (v :: acc)
1274 | Tok_rbracket ->
1275 ignore (consume_token p);
1276 Toml.Array (List.rev (v :: acc))
1277 | _ -> failwith "Expected ',' or ']' in array")
1278 in
1279 loop []
1280
1281and parse_inline_table p =
1282 ignore (consume_token p);
1283 (* { *)
1284 skip_newlines p;
1285 (* Track explicitly defined keys - can't be extended with dotted keys *)
1286 let defined_inline = ref [] in
1287 let rec loop acc =
1288 match peek_token p with
1289 | Tok_rbrace ->
1290 ignore (consume_token p);
1291 Toml.Table (List.rev acc)
1292 | _ -> (
1293 let keys = parse_dotted_key p in
1294 skip_ws p;
1295 expect_token p Tok_equals;
1296 skip_ws p;
1297 let v = parse_value p in
1298 (* Check if trying to extend a previously-defined inline table *)
1299 (match keys with
1300 | first_key :: _ :: _ ->
1301 (* Multi-key dotted path - check if first key is already defined *)
1302 if List.mem first_key !defined_inline then
1303 failwith
1304 (Fmt.str "Cannot extend inline table '%s' with dotted key"
1305 first_key)
1306 | _ -> ());
1307 (* If this is a direct assignment to a key, track it *)
1308 (match keys with
1309 | [ k ] ->
1310 if List.mem k !defined_inline then
1311 Fmt.failwith "Duplicate key '%s' in inline table" k;
1312 defined_inline := k :: !defined_inline
1313 | _ -> ());
1314 let entry = build_nested_table keys v in
1315 (* Merge the entry with existing entries (for dotted keys with common prefix) *)
1316 let acc = merge_entry_into_table acc entry in
1317 skip_newlines p;
1318 match peek_token p with
1319 | Tok_comma ->
1320 ignore (consume_token p);
1321 skip_newlines p;
1322 loop acc
1323 | Tok_rbrace ->
1324 ignore (consume_token p);
1325 Toml.Table (List.rev acc)
1326 | _ -> failwith "Expected ',' or '}' in inline table")
1327 in
1328 loop []
1329
1330and skip_ws _p =
1331 (* Skip whitespace in token stream - handled by lexer but needed for lookahead *)
1332 ()
1333
1334and build_nested_table keys value =
1335 match keys with
1336 | [] -> failwith "Empty key"
1337 | [ k ] -> (k, value)
1338 | k :: rest -> (k, Toml.Table [ build_nested_table rest value ])
1339
1340(* Merge two TOML values - used for combining dotted keys in inline tables *)
1341and merge_toml_values v1 v2 =
1342 match (v1, v2) with
1343 | Toml.Table entries1, Toml.Table entries2 ->
1344 (* Merge the entries *)
1345 let merged =
1346 List.fold_left
1347 (fun acc (k, v) ->
1348 match List.assoc_opt k acc with
1349 | Some existing ->
1350 (* Key exists - try to merge if both are tables *)
1351 let merged_v = merge_toml_values existing v in
1352 (k, merged_v) :: List.remove_assoc k acc
1353 | None -> (k, v) :: acc)
1354 entries1 entries2
1355 in
1356 Toml.Table (List.rev merged)
1357 | _, _ ->
1358 (* Can't merge non-table values with same key *)
1359 failwith "Conflicting keys in inline table"
1360
1361(* Merge a single entry into an existing table *)
1362and merge_entry_into_table entries (k, v) =
1363 match List.assoc_opt k entries with
1364 | Some existing ->
1365 let merged_v = merge_toml_values existing v in
1366 (k, merged_v) :: List.remove_assoc k entries
1367 | None -> (k, v) :: entries
1368
1369let validate_datetime_string s =
1370 (* Parse and validate date portion *)
1371 if String.length s >= 10 then begin
1372 let year = int_of_string (String.sub s 0 4) in
1373 let month = int_of_string (String.sub s 5 2) in
1374 let day = int_of_string (String.sub s 8 2) in
1375 validate_date year month day;
1376 (* Parse and validate time portion if present *)
1377 if String.length s >= 16 then begin
1378 let time_start =
1379 if s.[10] = 'T' || s.[10] = 't' || s.[10] = ' ' then 11 else 10
1380 in
1381 let hour = int_of_string (String.sub s time_start 2) in
1382 let minute = int_of_string (String.sub s (time_start + 3) 2) in
1383 let second =
1384 if String.length s >= time_start + 8 && s.[time_start + 5] = ':' then
1385 int_of_string (String.sub s (time_start + 6) 2)
1386 else 0
1387 in
1388 validate_time hour minute second
1389 end
1390 end
1391
1392let validate_date_string s =
1393 if String.length s >= 10 then begin
1394 let year = int_of_string (String.sub s 0 4) in
1395 let month = int_of_string (String.sub s 5 2) in
1396 let day = int_of_string (String.sub s 8 2) in
1397 validate_date year month day
1398 end
1399
1400let validate_time_string s =
1401 if String.length s >= 5 then begin
1402 let hour = int_of_string (String.sub s 0 2) in
1403 let minute = int_of_string (String.sub s 3 2) in
1404 let second =
1405 if String.length s >= 8 && s.[5] = ':' then
1406 int_of_string (String.sub s 6 2)
1407 else 0
1408 in
1409 validate_time hour minute second
1410 end
1411
1412(* Table management for the parser *)
1413type table_state = {
1414 mutable values : (string * Toml.t) list;
1415 subtables : (string, table_state) Hashtbl.t;
1416 mutable is_array : bool;
1417 mutable is_inline : bool;
1418 mutable defined : bool;
1419 (* Has this table been explicitly defined with [table]? *)
1420 mutable closed : bool; (* Closed to extension via dotted keys from parent *)
1421 mutable array_elements : table_state list; (* For arrays of tables *)
1422}
1423
1424let table_state () =
1425 {
1426 values = [];
1427 subtables = Hashtbl.create 16;
1428 is_array = false;
1429 is_inline = false;
1430 defined = false;
1431 closed = false;
1432 array_elements = [];
1433 }
1434
1435let rec ensure_table state keys create_intermediate =
1436 match keys with
1437 | [] -> state
1438 | [ k ] -> (
1439 (* Check if key exists as a value *)
1440 if List.mem_assoc k state.values then
1441 Fmt.failwith "Cannot use value '%s' as a table" k;
1442 match Hashtbl.find_opt state.subtables k with
1443 | Some sub -> sub
1444 | None ->
1445 let sub = table_state () in
1446 Hashtbl.add state.subtables k sub;
1447 sub)
1448 | k :: rest ->
1449 (* Check if key exists as a value *)
1450 if List.mem_assoc k state.values then
1451 Fmt.failwith "Cannot use value '%s' as a table" k;
1452 let sub =
1453 match Hashtbl.find_opt state.subtables k with
1454 | Some sub -> sub
1455 | None ->
1456 let sub = table_state () in
1457 Hashtbl.add state.subtables k sub;
1458 sub
1459 in
1460 if create_intermediate && not sub.defined then sub.defined <- false;
1461 (* Mark as implicitly defined *)
1462 ensure_table sub rest create_intermediate
1463
1464(* Like ensure_table but marks tables as defined (for dotted keys) *)
1465(* Dotted keys mark tables as "defined" (can't re-define with [table]) but not "closed" *)
1466let rec table_for_dotted_key state keys =
1467 match keys with
1468 | [] -> state
1469 | [ k ] -> (
1470 (* Check if key exists as a value *)
1471 if List.mem_assoc k state.values then
1472 Fmt.failwith "Cannot use value '%s' as a table" k;
1473 match Hashtbl.find_opt state.subtables k with
1474 | Some sub ->
1475 (* Check if it's an array of tables (can't extend with dotted keys) *)
1476 if sub.is_array then
1477 failwith
1478 (Fmt.str "Cannot extend array of tables '%s' using dotted keys" k);
1479 (* Check if it's closed (explicitly defined with [table] header) *)
1480 if sub.closed then
1481 Fmt.failwith "Cannot extend table '%s' using dotted keys" k;
1482 if sub.is_inline then Fmt.failwith "Cannot extend inline table '%s'" k;
1483 (* Mark as defined by dotted key *)
1484 sub.defined <- true;
1485 sub
1486 | None ->
1487 let sub = table_state () in
1488 sub.defined <- true;
1489 (* Mark as defined by dotted key *)
1490 Hashtbl.add state.subtables k sub;
1491 sub)
1492 | k :: rest ->
1493 (* Check if key exists as a value *)
1494 if List.mem_assoc k state.values then
1495 Fmt.failwith "Cannot use value '%s' as a table" k;
1496 let sub =
1497 match Hashtbl.find_opt state.subtables k with
1498 | Some sub ->
1499 (* Check if it's an array of tables (can't extend with dotted keys) *)
1500 if sub.is_array then
1501 failwith
1502 (Fmt.str "Cannot extend array of tables '%s' using dotted keys"
1503 k);
1504 if sub.closed then
1505 Fmt.failwith "Cannot extend table '%s' using dotted keys" k;
1506 if sub.is_inline then
1507 Fmt.failwith "Cannot extend inline table '%s'" k;
1508 (* Mark as defined by dotted key *)
1509 sub.defined <- true;
1510 sub
1511 | None ->
1512 let sub = table_state () in
1513 sub.defined <- true;
1514 (* Mark as defined by dotted key *)
1515 Hashtbl.add state.subtables k sub;
1516 sub
1517 in
1518 table_for_dotted_key sub rest
1519
1520let rec table_state_to_toml state =
1521 let subtable_values =
1522 Hashtbl.fold
1523 (fun k sub acc ->
1524 let v =
1525 if sub.is_array then
1526 Toml.Array (List.map table_state_to_toml (array_elements sub))
1527 else table_state_to_toml sub
1528 in
1529 (k, v) :: acc)
1530 state.subtables []
1531 in
1532 Toml.Table (List.rev state.values @ subtable_values)
1533
1534and array_elements state = List.rev state.array_elements
1535
1536let rec key_path_has_prefix keys prefix =
1537 match (keys, prefix) with
1538 | _, [] -> true
1539 | [], _ -> false
1540 | k :: krest, p :: prest -> k = p && key_path_has_prefix krest prest
1541
1542let rec remove_key_prefix keys prefix =
1543 match (keys, prefix) with
1544 | ks, [] -> ks
1545 | [], _ -> []
1546 | _ :: krest, _ :: prest -> remove_key_prefix krest prest
1547
1548let check_array_table_conflict array_table keys =
1549 if array_table.defined && not array_table.is_array then
1550 Fmt.failwith
1551 "Cannot define '%s' as array of tables; already defined as table"
1552 (String.concat "." keys);
1553 if
1554 (array_table.values <> [] || Hashtbl.length array_table.subtables > 0)
1555 && not array_table.is_array
1556 then
1557 Fmt.failwith "Cannot define '%s' as array of tables; already has content"
1558 (String.concat "." keys)
1559
1560let define_table_header table keys =
1561 if table.is_array then
1562 Fmt.failwith
1563 "Cannot define '%s' as table; already defined as array of tables"
1564 (String.concat "." keys);
1565 if table.defined then
1566 Fmt.failwith "Table '%s' already defined" (String.concat "." keys);
1567 table.defined <- true;
1568 table.closed <- true
1569
1570let add_value_to_table tbl key v =
1571 if List.mem_assoc key tbl.values then Fmt.failwith "Duplicate key: %s" key;
1572 (match Hashtbl.find_opt tbl.subtables key with
1573 | Some sub ->
1574 if sub.is_array then
1575 Fmt.failwith "Cannot redefine array of tables '%s' as a value" key
1576 else Fmt.failwith "Cannot redefine table '%s' as a value" key
1577 | None -> ());
1578 tbl.values <- (key, v) :: tbl.values
1579
1580type doc_state = {
1581 root : table_state;
1582 mutable current_table : table_state;
1583 array_context_stack : (string list * table_state * table_state) list ref;
1584}
1585
1586let doc_state_of root =
1587 { root; current_table = root; array_context_stack = ref [] }
1588
1589let array_context ds keys =
1590 let rec find stack =
1591 match stack with
1592 | [] -> None
1593 | (path, parent, container) :: rest ->
1594 if keys = path then Some (`Sibling (path, parent, container))
1595 else if
1596 key_path_has_prefix keys path && List.length keys > List.length path
1597 then
1598 let current_entry = List.hd container.array_elements in
1599 Some (`Nested (path, current_entry))
1600 else find rest
1601 in
1602 find !(ds.array_context_stack)
1603
1604let rec pop_invalid_contexts ds keys =
1605 match !(ds.array_context_stack) with
1606 | [] -> ()
1607 | (path, _, _) :: rest ->
1608 if not (key_path_has_prefix keys path) then begin
1609 ds.array_context_stack := rest;
1610 pop_invalid_contexts ds keys
1611 end
1612
1613let handle_array_of_tables ds keys =
1614 pop_invalid_contexts ds keys;
1615 match array_context ds keys with
1616 | Some (`Sibling (_path, _parent, container)) ->
1617 let new_entry = table_state () in
1618 container.array_elements <- new_entry :: container.array_elements;
1619 ds.current_table <- new_entry
1620 | Some (`Nested (parent_path, parent_entry)) ->
1621 let relative_keys = remove_key_prefix keys parent_path in
1622 let array_table = ensure_table parent_entry relative_keys true in
1623 check_array_table_conflict array_table keys;
1624 array_table.is_array <- true;
1625 let new_entry = table_state () in
1626 array_table.array_elements <- new_entry :: array_table.array_elements;
1627 ds.current_table <- new_entry;
1628 ds.array_context_stack :=
1629 (keys, parent_entry, array_table) :: !(ds.array_context_stack)
1630 | None ->
1631 let array_table = ensure_table ds.root keys true in
1632 check_array_table_conflict array_table keys;
1633 array_table.is_array <- true;
1634 let entry = table_state () in
1635 array_table.array_elements <- entry :: array_table.array_elements;
1636 ds.current_table <- entry;
1637 ds.array_context_stack :=
1638 (keys, ds.root, array_table) :: !(ds.array_context_stack)
1639
1640let handle_table_header ds keys =
1641 pop_invalid_contexts ds keys;
1642 match array_context ds keys with
1643 | Some (`Nested (parent_path, parent_entry)) ->
1644 let relative_keys = remove_key_prefix keys parent_path in
1645 let table =
1646 if relative_keys <> [] then ensure_table parent_entry relative_keys true
1647 else ensure_table ds.root keys true
1648 in
1649 define_table_header table keys;
1650 ds.current_table <- table
1651 | Some (`Sibling (_, _, container)) ->
1652 if container.is_array then
1653 Fmt.failwith
1654 "Cannot define '%s' as table; already defined as array of tables"
1655 (String.concat "." keys);
1656 let table = ensure_table ds.root keys true in
1657 if table.defined then
1658 Fmt.failwith "Table '%s' already defined" (String.concat "." keys);
1659 table.defined <- true;
1660 table.closed <- true;
1661 ds.current_table <- table
1662 | None ->
1663 let table = ensure_table ds.root keys true in
1664 define_table_header table keys;
1665 ds.current_table <- table;
1666 if
1667 not
1668 (List.exists
1669 (fun (p, _, _) -> key_path_has_prefix keys p)
1670 !(ds.array_context_stack))
1671 then ds.array_context_stack := []
1672
1673(* Main parser function *)
1674let parse_toml_from_lexer lexer =
1675 let parser = parser lexer in
1676 let ds = doc_state_of (table_state ()) in
1677
1678 let rec parse_document () =
1679 skip_newlines parser;
1680 match peek_token parser with
1681 | Tok_eof -> ()
1682 | Tok_lbracket -> (
1683 ignore (consume_token parser);
1684 let is_adjacent_bracket = next_raw_char_is parser '[' in
1685 match peek_token parser with
1686 | Tok_lbracket when not is_adjacent_bracket ->
1687 failwith "Invalid table header syntax"
1688 | Tok_lbracket ->
1689 ignore (consume_token parser);
1690 let keys = parse_dotted_key parser in
1691 expect_token parser Tok_rbracket;
1692 if not (next_raw_char_is parser ']') then
1693 failwith "Invalid array of tables syntax (space in ]])";
1694 expect_token parser Tok_rbracket;
1695 skip_to_newline parser;
1696 handle_array_of_tables ds keys;
1697 parse_document ()
1698 | _ ->
1699 let keys = parse_dotted_key parser in
1700 expect_token parser Tok_rbracket;
1701 skip_to_newline parser;
1702 handle_table_header ds keys;
1703 parse_document ())
1704 | Tok_bare_key _ | Tok_basic_string _ | Tok_literal_string _ | Tok_integer _
1705 | Tok_float _ | Tok_date_local _ | Tok_datetime _ | Tok_datetime_local _
1706 | Tok_time_local _ ->
1707 let keys = parse_dotted_key parser in
1708 expect_token parser Tok_equals;
1709 let value = parse_value parser in
1710 skip_to_newline parser;
1711 (match keys with
1712 | [] -> failwith "Empty key"
1713 | [ k ] -> add_value_to_table ds.current_table k value
1714 | _ ->
1715 let parent_keys = List.rev (List.tl (List.rev keys)) in
1716 let final_key = List.hd (List.rev keys) in
1717 let parent = table_for_dotted_key ds.current_table parent_keys in
1718 add_value_to_table parent final_key value);
1719 parse_document ()
1720 | _tok -> Fmt.failwith "Unexpected token at line %d" parser.lexer.line
1721 and skip_to_newline parser =
1722 skip_ws_and_comments parser.lexer;
1723 match peek_token parser with
1724 | Tok_newline -> ignore (consume_token parser)
1725 | Tok_eof -> ()
1726 | _ -> failwith "Expected newline after value"
1727 in
1728
1729 parse_document ();
1730 table_state_to_toml ds.root
1731
1732(* Parse TOML from string - creates lexer internally *)
1733let parse_toml input =
1734 let lexer = lexer input in
1735 parse_toml_from_lexer lexer
1736
1737(* Parse TOML directly from Bytes.Reader - no intermediate string *)
1738let parse_toml_from_reader ?file r =
1739 let lexer = lexer_from_reader ?file r in
1740 parse_toml_from_lexer lexer
1741
1742let normalize_exp_notation s =
1743 let buf = Buffer.create (String.length s + 1) in
1744 let i = ref 0 in
1745 while !i < String.length s do
1746 let c = s.[!i] in
1747 if c = 'E' || c = 'e' then begin
1748 Buffer.add_char buf 'e';
1749 if !i + 1 < String.length s then begin
1750 let next = s.[!i + 1] in
1751 if next >= '0' && next <= '9' then Buffer.add_char buf '+'
1752 end
1753 end
1754 else Buffer.add_char buf c;
1755 incr i
1756 done;
1757 Buffer.contents buf
1758
1759let shortest_exp_notation f init_prec =
1760 let rec try_exp prec =
1761 if prec > 17 then normalize_exp_notation (Fmt.str "%.17e" f)
1762 else
1763 let s = normalize_exp_notation (Fmt.str "%.*e" prec f) in
1764 if float_of_string s = f then s else try_exp (prec + 1)
1765 in
1766 try_exp init_prec
1767
1768let shortest_decimal_notation f =
1769 let rec try_decimal prec =
1770 if prec > 17 then None
1771 else
1772 let s = Fmt.str "%.*f" prec f in
1773 let s =
1774 let len = String.length s in
1775 let dot_pos = try String.index s '.' with Not_found -> len in
1776 let rec last_nonzero i =
1777 if i <= dot_pos then dot_pos + 2
1778 else if s.[i] <> '0' then i + 1
1779 else last_nonzero (i - 1)
1780 in
1781 String.sub s 0 (min len (last_nonzero (len - 1)))
1782 in
1783 let s =
1784 if not (String.contains s '.') then s ^ ".0"
1785 else if s.[String.length s - 1] = '.' then s ^ "0"
1786 else s
1787 in
1788 if float_of_string s = f then Some s else try_decimal (prec + 1)
1789 in
1790 try_decimal 1
1791
1792let shortest_g_notation f =
1793 let rec try_g prec =
1794 if prec > 17 then Fmt.str "%.17g" f
1795 else
1796 let s = Fmt.str "%.*g" prec f in
1797 if float_of_string s = f then s else try_g (prec + 1)
1798 in
1799 try_g 1
1800
1801let integer_float_str f =
1802 let abs_f = Float.abs f in
1803 if abs_f = 9007199254740991.0 then Fmt.str "%.0f" f
1804 else if abs_f >= 1e6 then shortest_exp_notation f 0
1805 else if abs_f >= 2.0 then Fmt.str "%.1f" f
1806 else Fmt.str "%.0f" f
1807
1808let fractional_float_str f =
1809 let abs_f = Float.abs f in
1810 if abs_f >= 1e10 || (abs_f < 1e-4 && abs_f > 0.0) then
1811 shortest_exp_notation f 1
1812 else
1813 match shortest_decimal_notation f with
1814 | Some d -> d
1815 | None -> shortest_g_notation f
1816
1817let float_to_tagged_json_str f =
1818 if Float.is_nan f then "nan"
1819 else if f = Float.infinity then "inf"
1820 else if f = Float.neg_infinity then "-inf"
1821 else if f = 0.0 then if 1.0 /. f = Float.neg_infinity then "-0" else "0"
1822 else if Float.is_integer f then integer_float_str f
1823 else fractional_float_str f
1824
1825(* Convert TOML to tagged JSON for toml-test compatibility *)
1826let rec toml_to_tagged_json value =
1827 match value with
1828 | Toml.String s ->
1829 Fmt.str "{\"type\":\"string\",\"value\":%s}" (json_encode_string s)
1830 | Toml.Int i -> Fmt.str "{\"type\":\"integer\",\"value\":\"%Ld\"}" i
1831 | Toml.Float f ->
1832 Fmt.str "{\"type\":\"float\",\"value\":\"%s\"}"
1833 (float_to_tagged_json_str f)
1834 | Toml.Bool b ->
1835 Fmt.str "{\"type\":\"bool\",\"value\":\"%s\"}"
1836 (if b then "true" else "false")
1837 | Toml.Datetime s ->
1838 validate_datetime_string s;
1839 Fmt.str "{\"type\":\"datetime\",\"value\":\"%s\"}" s
1840 | Toml.Datetime_local s ->
1841 validate_datetime_string s;
1842 Fmt.str "{\"type\":\"datetime-local\",\"value\":\"%s\"}" s
1843 | Toml.Date_local s ->
1844 validate_date_string s;
1845 Fmt.str "{\"type\":\"date-local\",\"value\":\"%s\"}" s
1846 | Toml.Time_local s ->
1847 validate_time_string s;
1848 Fmt.str "{\"type\":\"time-local\",\"value\":\"%s\"}" s
1849 | Toml.Array items ->
1850 let json_items = List.map toml_to_tagged_json items in
1851 Fmt.str "[%s]" (String.concat "," json_items)
1852 | Toml.Table pairs ->
1853 let json_pairs =
1854 List.map
1855 (fun (k, v) ->
1856 Fmt.str "%s:%s" (json_encode_string k) (toml_to_tagged_json v))
1857 pairs
1858 in
1859 Fmt.str "{%s}" (String.concat "," json_pairs)
1860
1861and json_encode_string s =
1862 let buf = Buffer.create (String.length s + 2) in
1863 Buffer.add_char buf '"';
1864 String.iter
1865 (fun c ->
1866 match c with
1867 | '"' -> Buffer.add_string buf "\\\""
1868 | '\\' -> Buffer.add_string buf "\\\\"
1869 | '\n' -> Buffer.add_string buf "\\n"
1870 | '\r' -> Buffer.add_string buf "\\r"
1871 | '\t' -> Buffer.add_string buf "\\t"
1872 | '\b' -> Buffer.add_string buf "\\b" (* backspace *)
1873 | c when Char.code c = 0x0C -> Buffer.add_string buf "\\f" (* formfeed *)
1874 | c when Char.code c < 0x20 ->
1875 Buffer.add_string buf (Fmt.str "\\u%04x" (Char.code c))
1876 | c -> Buffer.add_char buf c)
1877 s;
1878 Buffer.add_char buf '"';
1879 Buffer.contents buf
1880
1881let tagged_value_to_toml value =
1882 match value with
1883 | Toml.Table [ ("type", Toml.String typ); ("value", Toml.String v) ]
1884 | Toml.Table [ ("value", Toml.String v); ("type", Toml.String typ) ] -> (
1885 match typ with
1886 | "string" -> Toml.String v
1887 | "integer" -> Toml.Int (Int64.of_string v)
1888 | "float" -> (
1889 match v with
1890 | "inf" -> Toml.Float Float.infinity
1891 | "-inf" -> Toml.Float Float.neg_infinity
1892 | "nan" -> Toml.Float Float.nan
1893 | _ -> Toml.Float (float_of_string v))
1894 | "bool" -> Toml.Bool (v = "true")
1895 | "datetime" -> Toml.Datetime v
1896 | "datetime-local" -> Toml.Datetime_local v
1897 | "date-local" -> Toml.Date_local v
1898 | "time-local" -> Toml.Time_local v
1899 | _ -> Fmt.failwith "Unknown type: %s" typ)
1900 | _ -> value
1901
1902type json_reader = { s : string; len : int; pos : int ref }
1903
1904let json_reader_of s = { s; len = String.length s; pos = ref 0 }
1905
1906let jr_skip_ws r =
1907 while
1908 !(r.pos) < r.len
1909 && (r.s.[!(r.pos)] = ' '
1910 || r.s.[!(r.pos)] = '\t'
1911 || r.s.[!(r.pos)] = '\n'
1912 || r.s.[!(r.pos)] = '\r')
1913 do
1914 incr r.pos
1915 done
1916
1917let jr_expect r c =
1918 jr_skip_ws r;
1919 if !(r.pos) >= r.len || r.s.[!(r.pos)] <> c then
1920 Fmt.failwith "Expected '%c' at position %d" c !(r.pos);
1921 incr r.pos
1922
1923let jr_peek r =
1924 jr_skip_ws r;
1925 if !(r.pos) >= r.len then None else Some r.s.[!(r.pos)]
1926
1927let jr_read_escape r buf =
1928 incr r.pos;
1929 if !(r.pos) >= r.len then failwith "Unexpected end in string escape";
1930 match r.s.[!(r.pos)] with
1931 | '"' ->
1932 Buffer.add_char buf '"';
1933 incr r.pos
1934 | '\\' ->
1935 Buffer.add_char buf '\\';
1936 incr r.pos
1937 | '/' ->
1938 Buffer.add_char buf '/';
1939 incr r.pos
1940 | 'n' ->
1941 Buffer.add_char buf '\n';
1942 incr r.pos
1943 | 'r' ->
1944 Buffer.add_char buf '\r';
1945 incr r.pos
1946 | 't' ->
1947 Buffer.add_char buf '\t';
1948 incr r.pos
1949 | 'b' ->
1950 Buffer.add_char buf '\b';
1951 incr r.pos
1952 | 'f' ->
1953 Buffer.add_char buf (Char.chr 0x0C);
1954 incr r.pos
1955 | 'u' ->
1956 incr r.pos;
1957 if !(r.pos) + 3 >= r.len then failwith "Invalid unicode escape";
1958 let hex = String.sub r.s !(r.pos) 4 in
1959 let cp = int_of_string ("0x" ^ hex) in
1960 Buffer.add_string buf (codepoint_to_utf8 cp);
1961 r.pos := !(r.pos) + 4
1962 | c -> Fmt.failwith "Invalid escape: \\%c" c
1963
1964let jr_read_string r =
1965 jr_skip_ws r;
1966 jr_expect r '"';
1967 let buf = Buffer.create 64 in
1968 while !(r.pos) < r.len && r.s.[!(r.pos)] <> '"' do
1969 if r.s.[!(r.pos)] = '\\' then jr_read_escape r buf
1970 else begin
1971 Buffer.add_char buf r.s.[!(r.pos)];
1972 incr r.pos
1973 end
1974 done;
1975 jr_expect r '"';
1976 Buffer.contents buf
1977
1978let rec jr_read_value r =
1979 jr_skip_ws r;
1980 match jr_peek r with
1981 | Some '{' -> jr_read_object r
1982 | Some '[' -> jr_read_array r
1983 | Some '"' -> Toml.String (jr_read_string r)
1984 | _ -> failwith "Expected value"
1985
1986and jr_read_object r =
1987 jr_expect r '{';
1988 jr_skip_ws r;
1989 if jr_peek r = Some '}' then begin
1990 incr r.pos;
1991 Toml.Table []
1992 end
1993 else begin
1994 let pairs = ref [] in
1995 let first = ref true in
1996 while jr_peek r <> Some '}' do
1997 if not !first then jr_expect r ',';
1998 first := false;
1999 jr_skip_ws r;
2000 let key = jr_read_string r in
2001 jr_expect r ':';
2002 let value = jr_read_value r in
2003 pairs := (key, tagged_value_to_toml value) :: !pairs
2004 done;
2005 jr_expect r '}';
2006 Toml.Table (List.rev !pairs)
2007 end
2008
2009and jr_read_array r =
2010 jr_expect r '[';
2011 jr_skip_ws r;
2012 if jr_peek r = Some ']' then begin
2013 incr r.pos;
2014 Toml.Array []
2015 end
2016 else begin
2017 let items = ref [] in
2018 let first = ref true in
2019 while jr_peek r <> Some ']' do
2020 if not !first then jr_expect r ',';
2021 first := false;
2022 items := tagged_value_to_toml (jr_read_value r) :: !items
2023 done;
2024 jr_expect r ']';
2025 Toml.Array (List.rev !items)
2026 end
2027
2028(* Tagged JSON to TOML for encoder *)
2029let decode_tagged_json_string s = jr_read_value (json_reader_of s)
2030
2031(* ============================================
2032 Streaming TOML Encoder
2033 ============================================ *)
2034
2035let is_bare_key_char c =
2036 (c >= 'A' && c <= 'Z')
2037 || (c >= 'a' && c <= 'z')
2038 || (c >= '0' && c <= '9')
2039 || c = '_' || c = '-'
2040
2041let rec write_toml_string w s =
2042 (* Check if we need to escape *)
2043 let needs_escape =
2044 String.exists
2045 (fun c ->
2046 let code = Char.code c in
2047 c = '"' || c = '\\' || c = '\n' || c = '\r' || c = '\t' || code < 0x20
2048 || code = 0x7F)
2049 s
2050 in
2051 if needs_escape then begin
2052 Bytes.Writer.write_string w "\"";
2053 String.iter
2054 (fun c ->
2055 match c with
2056 | '"' -> Bytes.Writer.write_string w "\\\""
2057 | '\\' -> Bytes.Writer.write_string w "\\\\"
2058 | '\n' -> Bytes.Writer.write_string w "\\n"
2059 | '\r' -> Bytes.Writer.write_string w "\\r"
2060 | '\t' -> Bytes.Writer.write_string w "\\t"
2061 | '\b' -> Bytes.Writer.write_string w "\\b"
2062 | c when Char.code c = 0x0C -> Bytes.Writer.write_string w "\\f"
2063 | c when Char.code c < 0x20 || Char.code c = 0x7F ->
2064 Bytes.Writer.write_string w (Fmt.str "\\u%04X" (Char.code c))
2065 | c ->
2066 let b = Bytes.create 1 in
2067 Bytes.set b 0 c;
2068 Bytes.Writer.write_bytes w b)
2069 s;
2070 Bytes.Writer.write_string w "\""
2071 end
2072 else begin
2073 Bytes.Writer.write_string w "\"";
2074 Bytes.Writer.write_string w s;
2075 Bytes.Writer.write_string w "\""
2076 end
2077
2078and write_toml_key w k =
2079 (* Check if it can be a bare key *)
2080 let is_bare = String.length k > 0 && String.for_all is_bare_key_char k in
2081 if is_bare then Bytes.Writer.write_string w k else write_toml_string w k
2082
2083and write_toml_value w ?(inline = false) value =
2084 match value with
2085 | Toml.String s -> write_toml_string w s
2086 | Toml.Int i -> Bytes.Writer.write_string w (Int64.to_string i)
2087 | Toml.Float f ->
2088 if Float.is_nan f then Bytes.Writer.write_string w "nan"
2089 else if f = Float.infinity then Bytes.Writer.write_string w "inf"
2090 else if f = Float.neg_infinity then Bytes.Writer.write_string w "-inf"
2091 else begin
2092 let s = Fmt.str "%.17g" f in
2093 (* Ensure it looks like a float *)
2094 let s =
2095 if
2096 String.contains s '.' || String.contains s 'e'
2097 || String.contains s 'E'
2098 then s
2099 else s ^ ".0"
2100 in
2101 Bytes.Writer.write_string w s
2102 end
2103 | Toml.Bool b -> Bytes.Writer.write_string w (if b then "true" else "false")
2104 | Toml.Datetime s -> Bytes.Writer.write_string w s
2105 | Toml.Datetime_local s -> Bytes.Writer.write_string w s
2106 | Toml.Date_local s -> Bytes.Writer.write_string w s
2107 | Toml.Time_local s -> Bytes.Writer.write_string w s
2108 | Toml.Array items ->
2109 Bytes.Writer.write_string w "[";
2110 List.iteri
2111 (fun i item ->
2112 if i > 0 then Bytes.Writer.write_string w ", ";
2113 write_toml_value w ~inline:true item)
2114 items;
2115 Bytes.Writer.write_string w "]"
2116 | Toml.Table pairs when inline ->
2117 Bytes.Writer.write_string w "{";
2118 List.iteri
2119 (fun i (k, v) ->
2120 if i > 0 then Bytes.Writer.write_string w ", ";
2121 write_toml_key w k;
2122 Bytes.Writer.write_string w " = ";
2123 write_toml_value w ~inline:true v)
2124 pairs;
2125 Bytes.Writer.write_string w "}"
2126 | Toml.Table _ -> failwith "Cannot encode table inline without inline flag"
2127
2128let is_pure_table_array items =
2129 items <> []
2130 && List.for_all (function Toml.Table _ -> true | _ -> false) items
2131
2132let write_path w path =
2133 List.iteri
2134 (fun i k ->
2135 if i > 0 then Bytes.Writer.write_string w ".";
2136 write_toml_key w k)
2137 path
2138
2139let rec encode_at_path w has_content path value =
2140 match value with
2141 | Toml.Table pairs ->
2142 let simple, nested =
2143 List.partition
2144 (fun (_, v) ->
2145 match v with
2146 | Toml.Table _ -> false
2147 | Toml.Array items -> not (is_pure_table_array items)
2148 | _ -> true)
2149 pairs
2150 in
2151 List.iter
2152 (fun (k, v) ->
2153 write_toml_key w k;
2154 Bytes.Writer.write_string w " = ";
2155 write_toml_value w ~inline:true v;
2156 Bytes.Writer.write_string w "\n";
2157 has_content := true)
2158 simple;
2159 encode_nested_pairs w has_content path nested
2160 | _ -> failwith "Top-level TOML must be a table"
2161
2162and encode_nested_pairs w has_content path nested =
2163 List.iter
2164 (fun (k, v) ->
2165 let new_path = path @ [ k ] in
2166 match v with
2167 | Toml.Table _ ->
2168 if !has_content then Bytes.Writer.write_string w "\n";
2169 Bytes.Writer.write_string w "[";
2170 write_path w new_path;
2171 Bytes.Writer.write_string w "]\n";
2172 has_content := true;
2173 encode_at_path w has_content new_path v
2174 | Toml.Array items when is_pure_table_array items ->
2175 encode_table_array w has_content new_path items
2176 | _ ->
2177 write_toml_key w k;
2178 Bytes.Writer.write_string w " = ";
2179 write_toml_value w ~inline:true v;
2180 Bytes.Writer.write_string w "\n";
2181 has_content := true)
2182 nested
2183
2184and encode_table_array w has_content new_path items =
2185 List.iter
2186 (fun item ->
2187 match item with
2188 | Toml.Table _ ->
2189 if !has_content then Bytes.Writer.write_string w "\n";
2190 Bytes.Writer.write_string w "[[";
2191 write_path w new_path;
2192 Bytes.Writer.write_string w "]]\n";
2193 has_content := true;
2194 encode_at_path w has_content new_path item
2195 | _ -> assert false)
2196 items
2197
2198(* True streaming TOML encoder - writes directly to Bytes.Writer *)
2199let encode_to_writer w value =
2200 let has_content = ref false in
2201 encode_at_path w has_content [] value
2202
2203(* ============================================
2204 Public Interface - Parsing
2205 ============================================ *)
2206
2207let of_string input =
2208 try Ok (parse_toml input) with
2209 | Failure msg ->
2210 Error (Toml.Error.v (Toml.Error.Syntax (Toml.Error.Expected msg)))
2211 | Toml.Error.Error e -> Error e
2212 | e ->
2213 Error
2214 (Toml.Error.v
2215 (Toml.Error.Syntax (Toml.Error.Expected (Printexc.to_string e))))
2216
2217let of_reader ?file r =
2218 try Ok (parse_toml_from_reader ?file r) with
2219 | Failure msg ->
2220 Error (Toml.Error.v (Toml.Error.Syntax (Toml.Error.Expected msg)))
2221 | Toml.Error.Error e -> Error e
2222 | e ->
2223 Error
2224 (Toml.Error.v
2225 (Toml.Error.Syntax (Toml.Error.Expected (Printexc.to_string e))))
2226
2227let parse = parse_toml
2228let parse_reader ?file r = parse_toml_from_reader ?file r
2229
2230(* ============================================
2231 Public Interface - Encoding
2232 ============================================ *)
2233
2234let to_writer w value = encode_to_writer w value
2235
2236let to_string value =
2237 let buf = Buffer.create 256 in
2238 let w = Bytes.Writer.of_buffer buf in
2239 encode_to_writer w value;
2240 Buffer.contents buf
2241
2242(* ============================================
2243 Codec I/O Operations
2244 ============================================ *)
2245
2246let decode_string c s = Result.bind (of_string s) (Tomlt.decode c)
2247
2248let decode_string_exn c s =
2249 let toml = parse s in
2250 Tomlt.decode_exn c toml
2251
2252let encode_string c v =
2253 let toml = Tomlt.encode c v in
2254 to_string toml
2255
2256let decode_reader ?file c r = Result.bind (of_reader ?file r) (Tomlt.decode c)
2257
2258let encode_writer c v w =
2259 let toml = Tomlt.encode c v in
2260 to_writer w toml
2261
2262(* ============================================
2263 Tagged JSON Module
2264 ============================================ *)
2265
2266module Tagged_json = struct
2267 let encode = toml_to_tagged_json
2268 let decode = decode_tagged_json_string
2269
2270 let decode_and_encode_toml json_str =
2271 try
2272 let toml = decode_tagged_json_string json_str in
2273 Ok (to_string toml)
2274 with
2275 | Failure msg -> Error msg
2276 | e -> Error (Printexc.to_string e)
2277end