TOML 1.1 codecs for OCaml
at main 2355 lines 93 kB view raw
1(*--------------------------------------------------------------------------- 2 Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved. 3 SPDX-License-Identifier: ISC 4 ---------------------------------------------------------------------------*) 5 6open Bytesrw 7 8(* Aliases for cleaner code *) 9module Toml = Tomlt.Toml 10module Toml_error = Toml.Error 11 12(* Lexer - works directly on bytes buffer filled from Bytes.Reader *) 13 14type token = 15 | Tok_lbracket 16 | Tok_rbracket 17 | Tok_lbrace 18 | Tok_rbrace 19 | Tok_equals 20 | Tok_comma 21 | Tok_dot 22 | Tok_newline 23 | Tok_eof 24 | Tok_bare_key of string 25 | Tok_basic_string of string 26 | Tok_literal_string of string 27 | Tok_ml_basic_string of string (* Multiline basic string - not valid as key *) 28 | Tok_ml_literal_string of string (* Multiline literal string - not valid as key *) 29 | Tok_integer of int64 * string (* value, original string for key reconstruction *) 30 | Tok_float of float * string (* value, original string for key reconstruction *) 31 | Tok_datetime of string 32 | Tok_datetime_local of string 33 | Tok_date_local of string 34 | Tok_time_local of string 35 36type lexer = { 37 input : bytes; (* Buffer containing input data *) 38 input_len : int; (* Length of valid data in input *) 39 mutable pos : int; 40 mutable line : int; 41 mutable col : int; 42 file : string; 43} 44 45(* Create lexer from string (copies to bytes) *) 46let make_lexer ?(file = "-") s = 47 let input = Bytes.of_string s in 48 { input; input_len = Bytes.length input; pos = 0; line = 1; col = 1; file } 49 50(* Create lexer directly from Bytes.Reader - reads all data into buffer *) 51let make_lexer_from_reader ?(file = "-") r = 52 (* Read all slices into a buffer *) 53 let buf = Buffer.create 4096 in 54 let rec read_all () = 55 let slice = Bytes.Reader.read r in 56 if Bytes.Slice.is_eod slice then () 57 else begin 58 Bytes.Slice.add_to_buffer buf slice; 59 read_all () 60 end 61 in 62 read_all (); 63 let input = Buffer.to_bytes buf in 64 { input; input_len = Bytes.length input; pos = 0; line = 1; col = 1; file } 65 66let is_eof l = l.pos >= l.input_len 67 68let peek l = if is_eof l then None else Some (Bytes.get l.input l.pos) 69 70let peek2 l = 71 if l.pos + 1 >= l.input_len then None 72 else Some (Bytes.get l.input (l.pos + 1)) 73 74let peek_n l n = 75 if l.pos + n - 1 >= l.input_len then None 76 else Some (Bytes.sub_string l.input l.pos n) 77 78let advance l = 79 if not (is_eof l) then begin 80 if Bytes.get l.input l.pos = '\n' then begin 81 l.line <- l.line + 1; 82 l.col <- 1 83 end else 84 l.col <- l.col + 1; 85 l.pos <- l.pos + 1 86 end 87 88let advance_n l n = 89 for _ = 1 to n do advance l done 90 91let skip_whitespace l = 92 while not (is_eof l) && (Bytes.get l.input l.pos = ' ' || Bytes.get l.input l.pos = '\t') do 93 advance l 94 done 95 96(* Helper functions for bytes access *) 97let[@inline] get_char l pos = Bytes.unsafe_get l.input pos 98let[@inline] get_current l = Bytes.unsafe_get l.input l.pos 99let sub_string l pos len = Bytes.sub_string l.input pos len 100 101(* Helper to create error location from lexer state *) 102let lexer_loc l = Toml.Error.loc ~file:l.file ~line:l.line ~column:l.col () 103 104(* Get expected byte length of UTF-8 char from first byte *) 105let utf8_byte_length_from_first_byte c = 106 let code = Char.code c in 107 if code < 0x80 then 1 108 else if code < 0xC0 then 0 (* Invalid: continuation byte as start *) 109 else if code < 0xE0 then 2 110 else if code < 0xF0 then 3 111 else if code < 0xF8 then 4 112 else 0 (* Invalid: 5+ byte sequence *) 113 114(* Validate UTF-8 at position in lexer's bytes buffer, returns byte length *) 115let validate_utf8_at_pos_bytes l = 116 if l.pos >= l.input_len then 117 Toml.Error.raise_lexer ~location:(lexer_loc l) Unexpected_eof; 118 let byte_len = utf8_byte_length_from_first_byte (Bytes.unsafe_get l.input l.pos) in 119 if byte_len = 0 then 120 Toml.Error.raise_lexer ~location:(lexer_loc l) Invalid_utf8; 121 if l.pos + byte_len > l.input_len then 122 Toml.Error.raise_lexer ~location:(lexer_loc l) Incomplete_utf8; 123 (* Validate using uutf - it checks overlong encodings, surrogates, etc. *) 124 let sub = Bytes.sub_string l.input l.pos byte_len in 125 let valid = ref false in 126 Uutf.String.fold_utf_8 (fun () _ -> function 127 | `Uchar _ -> valid := true 128 | `Malformed _ -> () 129 ) () sub; 130 if not !valid then 131 Toml.Error.raise_lexer ~location:(lexer_loc l) Invalid_utf8; 132 byte_len 133 134(* UTF-8 validation - validates and advances over a single UTF-8 character *) 135let validate_utf8_char l = 136 let byte_len = validate_utf8_at_pos_bytes l in 137 for _ = 1 to byte_len do advance l done 138 139let skip_comment l = 140 if not (is_eof l) && get_current l = '#' then begin 141 (* Validate comment characters *) 142 advance l; 143 let continue = ref true in 144 while !continue && not (is_eof l) && get_current l <> '\n' do 145 let c = get_current l in 146 let code = Char.code c in 147 (* CR is only valid if followed by LF (CRLF at end of comment) *) 148 if c = '\r' then begin 149 (* Check if this CR is followed by LF - if so, it ends the comment *) 150 if l.pos + 1 < l.input_len && get_char l (l.pos + 1) = '\n' then 151 (* This is CRLF - stop the loop, let the main lexer handle it *) 152 continue := false 153 else 154 Toml.Error.raise_lexer ~location:(lexer_loc l) Bare_carriage_return 155 end else if code >= 0x80 then begin 156 (* Multi-byte UTF-8 character - validate it *) 157 validate_utf8_char l 158 end else begin 159 (* ASCII control characters other than tab are not allowed in comments *) 160 if code < 0x09 || (code > 0x09 && code < 0x20) || code = 0x7F then 161 Toml.Error.raise_lexer ~location:(lexer_loc l) (Control_character code); 162 advance l 163 end 164 done 165 end 166 167let skip_ws_and_comments l = 168 let rec loop () = 169 skip_whitespace l; 170 if not (is_eof l) && get_current l = '#' then begin 171 skip_comment l; 172 loop () 173 end 174 in 175 loop () 176 177let is_bare_key_char c = 178 (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || 179 (c >= '0' && c <= '9') || c = '_' || c = '-' 180 181let is_digit c = c >= '0' && c <= '9' 182let is_hex_digit c = is_digit c || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F') 183let is_oct_digit c = c >= '0' && c <= '7' 184let is_bin_digit c = c = '0' || c = '1' 185 186let hex_value c = 187 if c >= '0' && c <= '9' then Char.code c - Char.code '0' 188 else if c >= 'a' && c <= 'f' then Char.code c - Char.code 'a' + 10 189 else if c >= 'A' && c <= 'F' then Char.code c - Char.code 'A' + 10 190 else Toml.Error.raise_number Invalid_hex_digit 191 192(* Convert Unicode codepoint to UTF-8 using uutf *) 193let codepoint_to_utf8 codepoint = 194 if codepoint < 0 || codepoint > 0x10FFFF then 195 failwith (Printf.sprintf "Invalid Unicode codepoint: U+%X" codepoint); 196 if codepoint >= 0xD800 && codepoint <= 0xDFFF then 197 failwith (Printf.sprintf "Surrogate codepoint not allowed: U+%04X" codepoint); 198 let buf = Buffer.create 4 in 199 Uutf.Buffer.add_utf_8 buf (Uchar.of_int codepoint); 200 Buffer.contents buf 201 202(* Parse Unicode escape with error location from lexer *) 203let unicode_to_utf8 l codepoint = 204 if codepoint < 0 || codepoint > 0x10FFFF then 205 Toml.Error.raise_lexer ~location:(lexer_loc l) (Invalid_unicode_codepoint codepoint); 206 if codepoint >= 0xD800 && codepoint <= 0xDFFF then 207 Toml.Error.raise_lexer ~location:(lexer_loc l) (Surrogate_codepoint codepoint); 208 let buf = Buffer.create 4 in 209 Uutf.Buffer.add_utf_8 buf (Uchar.of_int codepoint); 210 Buffer.contents buf 211 212let parse_escape l = 213 advance l; (* skip backslash *) 214 if is_eof l then 215 Toml.Error.raise_lexer ~location:(lexer_loc l) Unexpected_eof; 216 let c = get_current l in 217 advance l; 218 match c with 219 | 'b' -> "\b" 220 | 't' -> "\t" 221 | 'n' -> "\n" 222 | 'f' -> "\x0C" 223 | 'r' -> "\r" 224 | 'e' -> "\x1B" (* TOML 1.1 escape *) 225 | '"' -> "\"" 226 | '\\' -> "\\" 227 | 'x' -> 228 (* \xHH - 2 hex digits *) 229 if l.pos + 1 >= l.input_len then 230 Toml.Error.raise_lexer ~location:(lexer_loc l) (Incomplete_escape "\\x"); 231 let c1 = get_char l l.pos in 232 let c2 = get_char l (l.pos + 1) in 233 if not (is_hex_digit c1 && is_hex_digit c2) then 234 Toml.Error.raise_lexer ~location:(lexer_loc l) (Invalid_unicode_escape "\\x"); 235 let cp = (hex_value c1 * 16) + hex_value c2 in 236 advance l; advance l; 237 unicode_to_utf8 l cp 238 | 'u' -> 239 (* \uHHHH - 4 hex digits *) 240 if l.pos + 3 >= l.input_len then 241 Toml.Error.raise_lexer ~location:(lexer_loc l) (Incomplete_escape "\\u"); 242 let s = sub_string l l.pos 4 in 243 for i = 0 to 3 do 244 if not (is_hex_digit s.[i]) then 245 Toml.Error.raise_lexer ~location:(lexer_loc l) (Invalid_unicode_escape "\\u") 246 done; 247 let cp = int_of_string ("0x" ^ s) in 248 advance_n l 4; 249 unicode_to_utf8 l cp 250 | 'U' -> 251 (* \UHHHHHHHH - 8 hex digits *) 252 if l.pos + 7 >= l.input_len then 253 Toml.Error.raise_lexer ~location:(lexer_loc l) (Incomplete_escape "\\U"); 254 let s = sub_string l l.pos 8 in 255 for i = 0 to 7 do 256 if not (is_hex_digit s.[i]) then 257 Toml.Error.raise_lexer ~location:(lexer_loc l) (Invalid_unicode_escape "\\U") 258 done; 259 let cp = int_of_string ("0x" ^ s) in 260 advance_n l 8; 261 unicode_to_utf8 l cp 262 | _ -> 263 Toml.Error.raise_lexer ~location:(lexer_loc l) (Invalid_escape c) 264 265let validate_string_char l c is_multiline = 266 let code = Char.code c in 267 (* Control characters other than tab (and LF/CR for multiline) are not allowed *) 268 if code < 0x09 then 269 Toml.Error.raise_lexer ~location:(lexer_loc l) (Control_character code); 270 if code > 0x09 && code < 0x20 && not (is_multiline && (code = 0x0A || code = 0x0D)) then 271 Toml.Error.raise_lexer ~location:(lexer_loc l) (Control_character code); 272 if code = 0x7F then 273 Toml.Error.raise_lexer ~location:(lexer_loc l) (Control_character code) 274 275(* Validate UTF-8 in string context and add bytes to buffer *) 276let validate_and_add_utf8_to_buffer l buf = 277 let byte_len = validate_utf8_at_pos_bytes l in 278 Buffer.add_string buf (sub_string l l.pos byte_len); 279 for _ = 1 to byte_len do advance l done 280 281let parse_basic_string l = 282 advance l; (* skip opening quote *) 283 let buf = Buffer.create 64 in 284 let multiline = 285 match peek_n l 2 with 286 | Some "\"\"" -> 287 advance l; advance l; (* skip two more quotes *) 288 (* Skip newline immediately after opening delimiter *) 289 (match peek l with 290 | Some '\n' -> advance l 291 | Some '\r' -> 292 advance l; 293 if peek l = Some '\n' then advance l 294 else failwith "Bare carriage return not allowed in string" 295 | _ -> ()); 296 true 297 | _ -> false 298 in 299 let rec loop () = 300 if is_eof l then 301 failwith "Unterminated string"; 302 let c = get_current l in 303 if multiline then begin 304 if c = '"' then begin 305 (* Count consecutive quotes *) 306 let quote_count = ref 0 in 307 let p = ref l.pos in 308 while !p < l.input_len && get_char l !p = '"' do 309 incr quote_count; 310 incr p 311 done; 312 if !quote_count >= 3 then begin 313 (* 3+ quotes - this is a closing delimiter *) 314 (* Add extra quotes (up to 2) to content before closing delimiter *) 315 let extra = min (!quote_count - 3) 2 in 316 for _ = 1 to extra do 317 Buffer.add_char buf '"' 318 done; 319 advance_n l (!quote_count); 320 if !quote_count > 5 then 321 failwith "Too many quotes in multiline string" 322 end else begin 323 (* Less than 3 quotes - add them to content *) 324 for _ = 1 to !quote_count do 325 Buffer.add_char buf '"'; 326 advance l 327 done; 328 loop () 329 end 330 end else if c = '\\' then begin 331 (* Check for line-ending backslash *) 332 let saved_pos = l.pos in 333 let saved_line = l.line in 334 let saved_col = l.col in 335 advance l; 336 let rec skip_ws () = 337 match peek l with 338 | Some ' ' | Some '\t' -> advance l; skip_ws () 339 | _ -> () 340 in 341 skip_ws (); 342 match peek l with 343 | Some '\n' -> 344 advance l; 345 (* Skip all whitespace and newlines after *) 346 let rec skip_all () = 347 match peek l with 348 | Some ' ' | Some '\t' | Some '\n' -> advance l; skip_all () 349 | Some '\r' -> 350 advance l; 351 if peek l = Some '\n' then advance l; 352 skip_all () 353 | _ -> () 354 in 355 skip_all (); 356 loop () 357 | Some '\r' -> 358 advance l; 359 if peek l = Some '\n' then advance l; 360 let rec skip_all () = 361 match peek l with 362 | Some ' ' | Some '\t' | Some '\n' -> advance l; skip_all () 363 | Some '\r' -> 364 advance l; 365 if peek l = Some '\n' then advance l; 366 skip_all () 367 | _ -> () 368 in 369 skip_all (); 370 loop () 371 | _ -> 372 (* Not a line-ending backslash, restore position and parse escape *) 373 l.pos <- saved_pos; 374 l.line <- saved_line; 375 l.col <- saved_col; 376 Buffer.add_string buf (parse_escape l); 377 loop () 378 end else begin 379 let code = Char.code c in 380 if c = '\r' then begin 381 advance l; 382 if peek l = Some '\n' then begin 383 Buffer.add_char buf '\n'; 384 advance l 385 end else 386 failwith "Bare carriage return not allowed in string" 387 end else if code >= 0x80 then begin 388 (* Multi-byte UTF-8 - validate and add *) 389 validate_and_add_utf8_to_buffer l buf 390 end else begin 391 (* ASCII - validate control chars *) 392 validate_string_char l c true; 393 Buffer.add_char buf c; 394 advance l 395 end; 396 loop () 397 end 398 end else begin 399 (* Single-line basic string *) 400 if c = '"' then begin 401 advance l; 402 () 403 end else if c = '\\' then begin 404 Buffer.add_string buf (parse_escape l); 405 loop () 406 end else if c = '\n' || c = '\r' then 407 failwith "Newline not allowed in basic string" 408 else begin 409 let code = Char.code c in 410 if code >= 0x80 then begin 411 (* Multi-byte UTF-8 - validate and add *) 412 validate_and_add_utf8_to_buffer l buf 413 end else begin 414 (* ASCII - validate control chars *) 415 validate_string_char l c false; 416 Buffer.add_char buf c; 417 advance l 418 end; 419 loop () 420 end 421 end 422 in 423 loop (); 424 (Buffer.contents buf, multiline) 425 426let parse_literal_string l = 427 advance l; (* skip opening quote *) 428 let buf = Buffer.create 64 in 429 let multiline = 430 match peek_n l 2 with 431 | Some "''" -> 432 advance l; advance l; (* skip two more quotes *) 433 (* Skip newline immediately after opening delimiter *) 434 (match peek l with 435 | Some '\n' -> advance l 436 | Some '\r' -> 437 advance l; 438 if peek l = Some '\n' then advance l 439 else failwith "Bare carriage return not allowed in literal string" 440 | _ -> ()); 441 true 442 | _ -> false 443 in 444 let rec loop () = 445 if is_eof l then 446 failwith "Unterminated literal string"; 447 let c = get_current l in 448 if multiline then begin 449 if c = '\'' then begin 450 (* Count consecutive quotes *) 451 let quote_count = ref 0 in 452 let p = ref l.pos in 453 while !p < l.input_len && get_char l !p = '\'' do 454 incr quote_count; 455 incr p 456 done; 457 if !quote_count >= 3 then begin 458 (* 3+ quotes - this is a closing delimiter *) 459 (* Add extra quotes (up to 2) to content before closing delimiter *) 460 let extra = min (!quote_count - 3) 2 in 461 for _ = 1 to extra do 462 Buffer.add_char buf '\'' 463 done; 464 advance_n l (!quote_count); 465 if !quote_count > 5 then 466 failwith "Too many quotes in multiline literal string" 467 end else begin 468 (* Less than 3 quotes - add them to content *) 469 for _ = 1 to !quote_count do 470 Buffer.add_char buf '\''; 471 advance l 472 done; 473 loop () 474 end 475 end else begin 476 let code = Char.code c in 477 if c = '\r' then begin 478 advance l; 479 if peek l = Some '\n' then begin 480 Buffer.add_char buf '\n'; 481 advance l 482 end else 483 failwith "Bare carriage return not allowed in literal string" 484 end else if code >= 0x80 then begin 485 (* Multi-byte UTF-8 - validate and add *) 486 validate_and_add_utf8_to_buffer l buf 487 end else begin 488 (* ASCII control char validation for literal strings *) 489 if code < 0x09 || (code > 0x09 && code < 0x0A) || (code > 0x0D && code < 0x20) || code = 0x7F then 490 if code <> 0x0A && code <> 0x0D then 491 failwith (Printf.sprintf "Control character U+%04X not allowed in literal string at line %d" code l.line); 492 Buffer.add_char buf c; 493 advance l 494 end; 495 loop () 496 end 497 end else begin 498 if c = '\'' then begin 499 advance l; 500 () 501 end else if c = '\n' || c = '\r' then 502 failwith "Newline not allowed in literal string" 503 else begin 504 let code = Char.code c in 505 if code >= 0x80 then begin 506 (* Multi-byte UTF-8 - validate and add *) 507 validate_and_add_utf8_to_buffer l buf 508 end else begin 509 (* ASCII control char validation *) 510 if code < 0x09 || (code > 0x09 && code < 0x20) || code = 0x7F then 511 failwith (Printf.sprintf "Control character U+%04X not allowed in literal string at line %d" code l.line); 512 Buffer.add_char buf c; 513 advance l 514 end; 515 loop () 516 end 517 end 518 in 519 loop (); 520 (Buffer.contents buf, multiline) 521 522let parse_number l = 523 let start = l.pos in 524 let neg = 525 match peek l with 526 | Some '-' -> advance l; true 527 | Some '+' -> advance l; false 528 | _ -> false 529 in 530 (* Check for special floats: inf and nan *) 531 match peek_n l 3 with 532 | Some "inf" -> 533 advance_n l 3; 534 let s = sub_string l start (l.pos - start) in 535 Tok_float ((if neg then Float.neg_infinity else Float.infinity), s) 536 | Some "nan" -> 537 advance_n l 3; 538 let s = sub_string l start (l.pos - start) in 539 Tok_float (Float.nan, s) 540 | _ -> 541 (* Check for hex, octal, or binary *) 542 match peek l, peek2 l with 543 | Some '0', Some 'x' when not neg -> 544 advance l; advance l; 545 let num_start = l.pos in 546 (* Check for leading underscore *) 547 if peek l = Some '_' then failwith "Leading underscore not allowed after 0x"; 548 let rec read_hex first = 549 match peek l with 550 | Some c when is_hex_digit c -> advance l; read_hex false 551 | Some '_' -> 552 if first then failwith "Underscore must follow a hex digit"; 553 advance l; 554 if peek l |> Option.map is_hex_digit |> Option.value ~default:false then 555 read_hex false 556 else 557 failwith "Trailing underscore in hex number" 558 | _ -> 559 if first then failwith "Expected hex digit after 0x" 560 in 561 read_hex true; 562 let s = sub_string l num_start (l.pos - num_start) in 563 let s = String.concat "" (String.split_on_char '_' s) in 564 let orig = sub_string l start (l.pos - start) in 565 Tok_integer (Int64.of_string ("0x" ^ s), orig) 566 | Some '0', Some 'o' when not neg -> 567 advance l; advance l; 568 let num_start = l.pos in 569 (* Check for leading underscore *) 570 if peek l = Some '_' then failwith "Leading underscore not allowed after 0o"; 571 let rec read_oct first = 572 match peek l with 573 | Some c when is_oct_digit c -> advance l; read_oct false 574 | Some '_' -> 575 if first then failwith "Underscore must follow an octal digit"; 576 advance l; 577 if peek l |> Option.map is_oct_digit |> Option.value ~default:false then 578 read_oct false 579 else 580 failwith "Trailing underscore in octal number" 581 | _ -> 582 if first then failwith "Expected octal digit after 0o" 583 in 584 read_oct true; 585 let s = sub_string l num_start (l.pos - num_start) in 586 let s = String.concat "" (String.split_on_char '_' s) in 587 let orig = sub_string l start (l.pos - start) in 588 Tok_integer (Int64.of_string ("0o" ^ s), orig) 589 | Some '0', Some 'b' when not neg -> 590 advance l; advance l; 591 let num_start = l.pos in 592 (* Check for leading underscore *) 593 if peek l = Some '_' then failwith "Leading underscore not allowed after 0b"; 594 let rec read_bin first = 595 match peek l with 596 | Some c when is_bin_digit c -> advance l; read_bin false 597 | Some '_' -> 598 if first then failwith "Underscore must follow a binary digit"; 599 advance l; 600 if peek l |> Option.map is_bin_digit |> Option.value ~default:false then 601 read_bin false 602 else 603 failwith "Trailing underscore in binary number" 604 | _ -> 605 if first then failwith "Expected binary digit after 0b" 606 in 607 read_bin true; 608 let s = sub_string l num_start (l.pos - num_start) in 609 let s = String.concat "" (String.split_on_char '_' s) in 610 let orig = sub_string l start (l.pos - start) in 611 Tok_integer (Int64.of_string ("0b" ^ s), orig) 612 | _ -> 613 (* Regular decimal number *) 614 let first_digit = peek l in 615 (* Check for leading zeros - also reject 0_ followed by digits *) 616 if first_digit = Some '0' then begin 617 match peek2 l with 618 | Some c when is_digit c -> failwith "Leading zeros not allowed" 619 | Some '_' -> failwith "Leading zeros not allowed" 620 | _ -> () 621 end; 622 let rec read_int first = 623 match peek l with 624 | Some c when is_digit c -> advance l; read_int false 625 | Some '_' -> 626 if first then failwith "Underscore must follow a digit"; 627 advance l; 628 if peek l |> Option.map is_digit |> Option.value ~default:false then 629 read_int false 630 else 631 failwith "Trailing underscore in number" 632 | _ -> 633 if first then failwith "Expected digit" 634 in 635 (match peek l with 636 | Some c when is_digit c -> read_int false 637 | _ -> failwith "Expected digit after sign"); 638 (* Check for float *) 639 let is_float = ref false in 640 (match peek l, peek2 l with 641 | Some '.', Some c when is_digit c -> 642 is_float := true; 643 advance l; 644 read_int false 645 | Some '.', _ -> 646 failwith "Decimal point must be followed by digit" 647 | _ -> ()); 648 (* Check for exponent *) 649 (match peek l with 650 | Some 'e' | Some 'E' -> 651 is_float := true; 652 advance l; 653 (match peek l with 654 | Some '+' | Some '-' -> advance l 655 | _ -> ()); 656 (* After exponent/sign, first char must be a digit, not underscore *) 657 (match peek l with 658 | Some '_' -> failwith "Underscore cannot follow exponent" 659 | _ -> ()); 660 read_int true 661 | _ -> ()); 662 let s = sub_string l start (l.pos - start) in 663 let s' = String.concat "" (String.split_on_char '_' s) in 664 if !is_float then 665 Tok_float (float_of_string s', s) 666 else 667 Tok_integer (Int64.of_string s', s) 668 669(* Check if we're looking at a datetime/date/time *) 670let looks_like_datetime l = 671 (* YYYY-MM-DD or HH:MM - need to ensure it's not a bare key that starts with numbers *) 672 let check_datetime () = 673 let pos = l.pos in 674 let len = l.input_len in 675 (* Check for YYYY-MM-DD pattern - must have exactly this structure *) 676 if pos + 10 <= len then begin 677 let c0 = get_char l pos in 678 let c1 = get_char l (pos + 1) in 679 let c2 = get_char l (pos + 2) in 680 let c3 = get_char l (pos + 3) in 681 let c4 = get_char l (pos + 4) in 682 let c5 = get_char l (pos + 5) in 683 let c6 = get_char l (pos + 6) in 684 let c7 = get_char l (pos + 7) in 685 let c8 = get_char l (pos + 8) in 686 let c9 = get_char l (pos + 9) in 687 (* Must match YYYY-MM-DD pattern AND not be followed by bare key chars (except T or space for time) *) 688 if is_digit c0 && is_digit c1 && is_digit c2 && is_digit c3 && c4 = '-' && 689 is_digit c5 && is_digit c6 && c7 = '-' && is_digit c8 && is_digit c9 then begin 690 (* Check what follows - if it's a bare key char other than T/t/space, it's not a date *) 691 if pos + 10 < len then begin 692 let next = get_char l (pos + 10) in 693 if next = 'T' || next = 't' then 694 `Date (* Datetime continues with time part *) 695 else if next = ' ' || next = '\t' then begin 696 (* Check if followed by = (key context) or time part *) 697 let rec skip_ws p = 698 if p >= len then p 699 else match get_char l p with 700 | ' ' | '\t' -> skip_ws (p + 1) 701 | _ -> p 702 in 703 let after_ws = skip_ws (pos + 11) in 704 if after_ws < len && get_char l after_ws = '=' then 705 `Other (* It's a key followed by = *) 706 else if after_ws < len && is_digit (get_char l after_ws) then 707 `Date (* Could be "2001-02-03 12:34:56" format *) 708 else 709 `Date 710 end else if next = '\n' || next = '\r' || 711 next = '#' || next = ',' || next = ']' || next = '}' then 712 `Date 713 else if is_bare_key_char next then 714 `Other (* It's a bare key like "2000-02-29abc" *) 715 else 716 `Date 717 end else 718 `Date 719 end else if pos + 5 <= len && 720 is_digit c0 && is_digit c1 && c2 = ':' && is_digit c3 && is_digit c4 then 721 `Time 722 else 723 `Other 724 end else if pos + 5 <= len then begin 725 let c0 = get_char l pos in 726 let c1 = get_char l (pos + 1) in 727 let c2 = get_char l (pos + 2) in 728 let c3 = get_char l (pos + 3) in 729 let c4 = get_char l (pos + 4) in 730 if is_digit c0 && is_digit c1 && c2 = ':' && is_digit c3 && is_digit c4 then 731 `Time 732 else 733 `Other 734 end else 735 `Other 736 in 737 check_datetime () 738 739(* Date/time validation *) 740let validate_date year month day = 741 if month < 1 || month > 12 then 742 failwith (Printf.sprintf "Invalid month: %d" month); 743 if day < 1 then 744 failwith (Printf.sprintf "Invalid day: %d" day); 745 let days_in_month = [| 0; 31; 28; 31; 30; 31; 30; 31; 31; 30; 31; 30; 31 |] in 746 let is_leap = (year mod 4 = 0 && year mod 100 <> 0) || year mod 400 = 0 in 747 let max_days = 748 if month = 2 && is_leap then 29 749 else days_in_month.(month) 750 in 751 if day > max_days then 752 failwith (Printf.sprintf "Invalid day %d for month %d" day month) 753 754let validate_time hour minute second = 755 if hour < 0 || hour > 23 then 756 failwith (Printf.sprintf "Invalid hour: %d" hour); 757 if minute < 0 || minute > 59 then 758 failwith (Printf.sprintf "Invalid minute: %d" minute); 759 if second < 0 || second > 60 then (* 60 for leap second *) 760 failwith (Printf.sprintf "Invalid second: %d" second) 761 762let validate_offset hour minute = 763 if hour < 0 || hour > 23 then 764 failwith (Printf.sprintf "Invalid timezone offset hour: %d" hour); 765 if minute < 0 || minute > 59 then 766 failwith (Printf.sprintf "Invalid timezone offset minute: %d" minute) 767 768let parse_datetime l = 769 let buf = Buffer.create 32 in 770 let year_buf = Buffer.create 4 in 771 let month_buf = Buffer.create 2 in 772 let day_buf = Buffer.create 2 in 773 (* Read date part YYYY-MM-DD *) 774 for _ = 1 to 4 do 775 match peek l with 776 | Some c when is_digit c -> Buffer.add_char buf c; Buffer.add_char year_buf c; advance l 777 | _ -> failwith "Invalid date format" 778 done; 779 if peek l <> Some '-' then failwith "Invalid date format"; 780 Buffer.add_char buf '-'; advance l; 781 for _ = 1 to 2 do 782 match peek l with 783 | Some c when is_digit c -> Buffer.add_char buf c; Buffer.add_char month_buf c; advance l 784 | _ -> failwith "Invalid date format" 785 done; 786 if peek l <> Some '-' then failwith "Invalid date format"; 787 Buffer.add_char buf '-'; advance l; 788 for _ = 1 to 2 do 789 match peek l with 790 | Some c when is_digit c -> Buffer.add_char buf c; Buffer.add_char day_buf c; advance l 791 | _ -> failwith "Invalid date format" 792 done; 793 (* Validate date immediately *) 794 let year = int_of_string (Buffer.contents year_buf) in 795 let month = int_of_string (Buffer.contents month_buf) in 796 let day = int_of_string (Buffer.contents day_buf) in 797 validate_date year month day; 798 (* Helper to parse time part (after T or space) *) 799 let parse_time_part () = 800 let hour_buf = Buffer.create 2 in 801 let minute_buf = Buffer.create 2 in 802 let second_buf = Buffer.create 2 in 803 Buffer.add_char buf 'T'; (* Always normalize to uppercase T *) 804 for _ = 1 to 2 do 805 match peek l with 806 | Some c when is_digit c -> Buffer.add_char buf c; Buffer.add_char hour_buf c; advance l 807 | _ -> failwith "Invalid time format" 808 done; 809 if peek l <> Some ':' then failwith "Invalid time format"; 810 Buffer.add_char buf ':'; advance l; 811 for _ = 1 to 2 do 812 match peek l with 813 | Some c when is_digit c -> Buffer.add_char buf c; Buffer.add_char minute_buf c; advance l 814 | _ -> failwith "Invalid time format" 815 done; 816 (* Optional seconds *) 817 (match peek l with 818 | Some ':' -> 819 Buffer.add_char buf ':'; advance l; 820 for _ = 1 to 2 do 821 match peek l with 822 | Some c when is_digit c -> Buffer.add_char buf c; Buffer.add_char second_buf c; advance l 823 | _ -> failwith "Invalid time format" 824 done; 825 (* Optional fractional seconds *) 826 (match peek l with 827 | Some '.' -> 828 Buffer.add_char buf '.'; advance l; 829 if not (peek l |> Option.map is_digit |> Option.value ~default:false) then 830 failwith "Expected digit after decimal point"; 831 while peek l |> Option.map is_digit |> Option.value ~default:false do 832 Buffer.add_char buf (Option.get (peek l)); 833 advance l 834 done 835 | _ -> ()) 836 | _ -> 837 (* No seconds - add :00 for normalization per toml-test *) 838 Buffer.add_string buf ":00"; 839 Buffer.add_string second_buf "00"); 840 (* Validate time *) 841 let hour = int_of_string (Buffer.contents hour_buf) in 842 let minute = int_of_string (Buffer.contents minute_buf) in 843 let second = if Buffer.length second_buf > 0 then int_of_string (Buffer.contents second_buf) else 0 in 844 validate_time hour minute second; 845 (* Check for offset *) 846 match peek l with 847 | Some 'Z' | Some 'z' -> 848 Buffer.add_char buf 'Z'; 849 advance l; 850 Tok_datetime (Buffer.contents buf) 851 | Some '+' | Some '-' as sign_opt -> 852 let sign = Option.get sign_opt in 853 let off_hour_buf = Buffer.create 2 in 854 let off_min_buf = Buffer.create 2 in 855 Buffer.add_char buf sign; 856 advance l; 857 for _ = 1 to 2 do 858 match peek l with 859 | Some c when is_digit c -> Buffer.add_char buf c; Buffer.add_char off_hour_buf c; advance l 860 | _ -> failwith "Invalid timezone offset" 861 done; 862 if peek l <> Some ':' then failwith "Invalid timezone offset"; 863 Buffer.add_char buf ':'; advance l; 864 for _ = 1 to 2 do 865 match peek l with 866 | Some c when is_digit c -> Buffer.add_char buf c; Buffer.add_char off_min_buf c; advance l 867 | _ -> failwith "Invalid timezone offset" 868 done; 869 (* Validate offset *) 870 let off_hour = int_of_string (Buffer.contents off_hour_buf) in 871 let off_min = int_of_string (Buffer.contents off_min_buf) in 872 validate_offset off_hour off_min; 873 Tok_datetime (Buffer.contents buf) 874 | _ -> 875 Tok_datetime_local (Buffer.contents buf) 876 in 877 (* Check if there's a time part *) 878 match peek l with 879 | Some 'T' | Some 't' -> 880 advance l; 881 parse_time_part () 882 | Some ' ' -> 883 (* Space could be followed by time (datetime with space separator) 884 or could be end of date (local date followed by comment/value) *) 885 advance l; (* Skip the space *) 886 (* Check if followed by digit (time) *) 887 (match peek l with 888 | Some c when is_digit c -> 889 parse_time_part () 890 | _ -> 891 (* Not followed by time - this is just a local date *) 892 (* Put the space back by not consuming anything further *) 893 l.pos <- l.pos - 1; (* Go back to before the space *) 894 Tok_date_local (Buffer.contents buf)) 895 | _ -> 896 (* Just a date *) 897 Tok_date_local (Buffer.contents buf) 898 899let parse_time l = 900 let buf = Buffer.create 16 in 901 let hour_buf = Buffer.create 2 in 902 let minute_buf = Buffer.create 2 in 903 let second_buf = Buffer.create 2 in 904 (* Read HH:MM *) 905 for _ = 1 to 2 do 906 match peek l with 907 | Some c when is_digit c -> Buffer.add_char buf c; Buffer.add_char hour_buf c; advance l 908 | _ -> failwith "Invalid time format" 909 done; 910 if peek l <> Some ':' then failwith "Invalid time format"; 911 Buffer.add_char buf ':'; advance l; 912 for _ = 1 to 2 do 913 match peek l with 914 | Some c when is_digit c -> Buffer.add_char buf c; Buffer.add_char minute_buf c; advance l 915 | _ -> failwith "Invalid time format" 916 done; 917 (* Optional seconds *) 918 (match peek l with 919 | Some ':' -> 920 Buffer.add_char buf ':'; advance l; 921 for _ = 1 to 2 do 922 match peek l with 923 | Some c when is_digit c -> Buffer.add_char buf c; Buffer.add_char second_buf c; advance l 924 | _ -> failwith "Invalid time format" 925 done; 926 (* Optional fractional seconds *) 927 (match peek l with 928 | Some '.' -> 929 Buffer.add_char buf '.'; advance l; 930 if not (peek l |> Option.map is_digit |> Option.value ~default:false) then 931 failwith "Expected digit after decimal point"; 932 while peek l |> Option.map is_digit |> Option.value ~default:false do 933 Buffer.add_char buf (Option.get (peek l)); 934 advance l 935 done 936 | _ -> ()) 937 | _ -> 938 (* No seconds - add :00 for normalization *) 939 Buffer.add_string buf ":00"; 940 Buffer.add_string second_buf "00"); 941 (* Validate time *) 942 let hour = int_of_string (Buffer.contents hour_buf) in 943 let minute = int_of_string (Buffer.contents minute_buf) in 944 let second = if Buffer.length second_buf > 0 then int_of_string (Buffer.contents second_buf) else 0 in 945 validate_time hour minute second; 946 Tok_time_local (Buffer.contents buf) 947 948let next_token l = 949 skip_ws_and_comments l; 950 if is_eof l then Tok_eof 951 else begin 952 let c = get_current l in 953 match c with 954 | '[' -> advance l; Tok_lbracket 955 | ']' -> advance l; Tok_rbracket 956 | '{' -> advance l; Tok_lbrace 957 | '}' -> advance l; Tok_rbrace 958 | '=' -> advance l; Tok_equals 959 | ',' -> advance l; Tok_comma 960 | '.' -> advance l; Tok_dot 961 | '\n' -> advance l; Tok_newline 962 | '\r' -> 963 advance l; 964 if peek l = Some '\n' then begin 965 advance l; 966 Tok_newline 967 end else 968 failwith (Printf.sprintf "Bare carriage return not allowed at line %d" l.line) 969 | '"' -> 970 let (s, multiline) = parse_basic_string l in 971 if multiline then Tok_ml_basic_string s else Tok_basic_string s 972 | '\'' -> 973 let (s, multiline) = parse_literal_string l in 974 if multiline then Tok_ml_literal_string s else Tok_literal_string s 975 | '+' | '-' -> 976 (* Could be number, special float (+inf, -inf, +nan, -nan), or bare key starting with - *) 977 let sign = c in 978 let start = l.pos in 979 (match peek2 l with 980 | Some d when is_digit d -> 981 (* Check if this looks like a key (followed by = after whitespace/key chars) *) 982 (* A key like -01 should be followed by whitespace then =, not by . or e (number syntax) *) 983 let is_key_context = 984 let rec scan_ahead p = 985 if p >= l.input_len then false 986 else 987 let c = get_char l p in 988 if is_digit c || c = '_' then scan_ahead (p + 1) 989 else if c = ' ' || c = '\t' then 990 (* Skip whitespace and check for = *) 991 let rec skip_ws pp = 992 if pp >= l.input_len then false 993 else match get_char l pp with 994 | ' ' | '\t' -> skip_ws (pp + 1) 995 | '=' -> true 996 | _ -> false 997 in 998 skip_ws (p + 1) 999 else if c = '=' then true 1000 else if c = '.' then 1001 (* Check if . is followed by digit (number) vs letter/underscore (dotted key) *) 1002 if p + 1 < l.input_len then 1003 let next = get_char l (p + 1) in 1004 if is_digit next then false (* It's a decimal number like -3.14 *) 1005 else if is_bare_key_char next then true (* Dotted key *) 1006 else false 1007 else false 1008 else if c = 'e' || c = 'E' then false (* Scientific notation *) 1009 else if is_bare_key_char c then 1010 (* Contains non-digit bare key char - it's a key *) 1011 true 1012 else false 1013 in 1014 scan_ahead (start + 1) 1015 in 1016 if is_key_context then begin 1017 (* Treat as bare key *) 1018 while not (is_eof l) && is_bare_key_char (get_current l) do 1019 advance l 1020 done; 1021 Tok_bare_key (sub_string l start (l.pos - start)) 1022 end else 1023 parse_number l 1024 | Some 'i' -> 1025 (* Check for inf *) 1026 if l.pos + 3 < l.input_len && 1027 get_char l (l.pos + 1) = 'i' && get_char l (l.pos + 2) = 'n' && get_char l (l.pos + 3) = 'f' then begin 1028 advance_n l 4; 1029 let s = sub_string l start (l.pos - start) in 1030 if sign = '-' then Tok_float (Float.neg_infinity, s) 1031 else Tok_float (Float.infinity, s) 1032 end else if sign = '-' then begin 1033 (* Could be bare key like -inf-key *) 1034 while not (is_eof l) && is_bare_key_char (get_current l) do 1035 advance l 1036 done; 1037 Tok_bare_key (sub_string l start (l.pos - start)) 1038 end else 1039 failwith (Printf.sprintf "Unexpected character after %c" sign) 1040 | Some 'n' -> 1041 (* Check for nan *) 1042 if l.pos + 3 < l.input_len && 1043 get_char l (l.pos + 1) = 'n' && get_char l (l.pos + 2) = 'a' && get_char l (l.pos + 3) = 'n' then begin 1044 advance_n l 4; 1045 let s = sub_string l start (l.pos - start) in 1046 Tok_float (Float.nan, s) (* Sign on NaN doesn't change the value *) 1047 end else if sign = '-' then begin 1048 (* Could be bare key like -name *) 1049 while not (is_eof l) && is_bare_key_char (get_current l) do 1050 advance l 1051 done; 1052 Tok_bare_key (sub_string l start (l.pos - start)) 1053 end else 1054 failwith (Printf.sprintf "Unexpected character after %c" sign) 1055 | _ when sign = '-' -> 1056 (* Bare key starting with - like -key or --- *) 1057 while not (is_eof l) && is_bare_key_char (get_current l) do 1058 advance l 1059 done; 1060 Tok_bare_key (sub_string l start (l.pos - start)) 1061 | _ -> failwith (Printf.sprintf "Unexpected character after %c" sign)) 1062 | c when is_digit c -> 1063 (* Could be number, datetime, or bare key starting with digits *) 1064 (match looks_like_datetime l with 1065 | `Date -> parse_datetime l 1066 | `Time -> parse_time l 1067 | `Other -> 1068 (* Check for hex/octal/binary prefix first - these are always numbers *) 1069 let start = l.pos in 1070 let is_prefixed_number = 1071 start + 1 < l.input_len && get_char l start = '0' && 1072 (let c1 = get_char l (start + 1) in 1073 c1 = 'x' || c1 = 'X' || c1 = 'o' || c1 = 'O' || c1 = 'b' || c1 = 'B') 1074 in 1075 if is_prefixed_number then 1076 parse_number l 1077 else begin 1078 (* Check if this is a bare key: 1079 - Contains letters (like "123abc") 1080 - Has leading zeros (like "0123") which would be invalid as a number *) 1081 let has_leading_zero = 1082 get_char l start = '0' && start + 1 < l.input_len && 1083 let c1 = get_char l (start + 1) in 1084 is_digit c1 1085 in 1086 (* Scan to see if this is a bare key or a number 1087 - If it looks like scientific notation (digits + e/E + optional sign + digits), it's a number 1088 - If it contains letters OR dashes between digits, it's a bare key *) 1089 let rec scan_for_bare_key pos has_dash_between_digits = 1090 if pos >= l.input_len then has_dash_between_digits 1091 else 1092 let c = get_char l pos in 1093 if is_digit c || c = '_' then scan_for_bare_key (pos + 1) has_dash_between_digits 1094 else if c = '.' then scan_for_bare_key (pos + 1) has_dash_between_digits 1095 else if c = '-' then 1096 (* Dash in key - check what follows *) 1097 let next_pos = pos + 1 in 1098 if next_pos < l.input_len then 1099 let next = get_char l next_pos in 1100 if is_digit next then 1101 scan_for_bare_key (next_pos) true (* Dash between digits - bare key *) 1102 else if is_bare_key_char next then 1103 true (* Dash followed by letter - definitely bare key like 2000-datetime *) 1104 else 1105 has_dash_between_digits (* End of sequence *) 1106 else 1107 has_dash_between_digits (* End of input *) 1108 else if c = 'e' || c = 'E' then 1109 (* Check if this looks like scientific notation *) 1110 let next_pos = pos + 1 in 1111 if next_pos >= l.input_len then true (* Just 'e' at end, bare key *) 1112 else 1113 let next = get_char l next_pos in 1114 if next = '+' || next = '-' then 1115 (* Has exponent sign - check if followed by digit *) 1116 let after_sign = next_pos + 1 in 1117 if after_sign < l.input_len && is_digit (get_char l after_sign) then 1118 has_dash_between_digits (* Scientific notation, but might have dash earlier *) 1119 else 1120 true (* e.g., "3e-abc" - bare key *) 1121 else if is_digit next then 1122 has_dash_between_digits (* Scientific notation like 3e2, but check if had dash earlier *) 1123 else 1124 true (* e.g., "3eabc" - bare key *) 1125 else if is_bare_key_char c then 1126 (* It's a letter - this is a bare key *) 1127 true 1128 else has_dash_between_digits 1129 in 1130 if has_leading_zero || scan_for_bare_key start false then begin 1131 (* It's a bare key *) 1132 while not (is_eof l) && is_bare_key_char (get_current l) do 1133 advance l 1134 done; 1135 Tok_bare_key (sub_string l start (l.pos - start)) 1136 end else 1137 (* It's a number - use parse_number *) 1138 parse_number l 1139 end) 1140 | c when c = 't' || c = 'f' || c = 'i' || c = 'n' -> 1141 (* These could be keywords (true, false, inf, nan) or bare keys 1142 Always read as bare key and let parser interpret *) 1143 let start = l.pos in 1144 while not (is_eof l) && is_bare_key_char (get_current l) do 1145 advance l 1146 done; 1147 Tok_bare_key (sub_string l start (l.pos - start)) 1148 | c when is_bare_key_char c -> 1149 let start = l.pos in 1150 while not (is_eof l) && is_bare_key_char (get_current l) do 1151 advance l 1152 done; 1153 Tok_bare_key (sub_string l start (l.pos - start)) 1154 | c -> 1155 let code = Char.code c in 1156 if code < 0x20 || code = 0x7F then 1157 failwith (Printf.sprintf "Control character U+%04X not allowed at line %d" code l.line) 1158 else 1159 failwith (Printf.sprintf "Unexpected character '%c' at line %d, column %d" c l.line l.col) 1160 end 1161 1162(* Parser *) 1163 1164type parser = { 1165 lexer : lexer; 1166 mutable current : token; 1167 mutable peeked : bool; 1168} 1169 1170let make_parser lexer = 1171 { lexer; current = Tok_eof; peeked = false } 1172 1173let peek_token p = 1174 if not p.peeked then begin 1175 p.current <- next_token p.lexer; 1176 p.peeked <- true 1177 end; 1178 p.current 1179 1180let consume_token p = 1181 let tok = peek_token p in 1182 p.peeked <- false; 1183 tok 1184 1185(* Check if next raw character (without skipping whitespace) matches *) 1186let next_raw_char_is p c = 1187 p.lexer.pos < p.lexer.input_len && get_char p.lexer p.lexer.pos = c 1188 1189let expect_token p expected = 1190 let tok = consume_token p in 1191 if tok <> expected then 1192 failwith (Printf.sprintf "Expected %s" (match expected with 1193 | Tok_equals -> "=" 1194 | Tok_rbracket -> "]" 1195 | Tok_rbrace -> "}" 1196 | Tok_newline -> "newline" 1197 | _ -> "token")) 1198 1199let skip_newlines p = 1200 while peek_token p = Tok_newline do 1201 ignore (consume_token p) 1202 done 1203 1204(* Parse a single key segment (bare, basic string, literal string, or integer) *) 1205(* Note: Tok_float is handled specially in parse_dotted_key *) 1206let parse_key_segment p = 1207 match peek_token p with 1208 | Tok_bare_key s -> ignore (consume_token p); [s] 1209 | Tok_basic_string s -> ignore (consume_token p); [s] 1210 | Tok_literal_string s -> ignore (consume_token p); [s] 1211 | Tok_integer (_i, orig_str) -> ignore (consume_token p); [orig_str] 1212 | Tok_float (f, orig_str) -> 1213 (* Float in key context - use original string to preserve exact key parts *) 1214 ignore (consume_token p); 1215 if Float.is_nan f then ["nan"] 1216 else if f = Float.infinity then ["inf"] 1217 else if f = Float.neg_infinity then ["-inf"] 1218 else begin 1219 (* Remove underscores from original string and split on dot *) 1220 let s = String.concat "" (String.split_on_char '_' orig_str) in 1221 if String.contains s 'e' || String.contains s 'E' then 1222 (* Has exponent, treat as single key *) 1223 [s] 1224 else if String.contains s '.' then 1225 (* Split on decimal point for dotted key *) 1226 String.split_on_char '.' s 1227 else 1228 (* No decimal point, single integer key *) 1229 [s] 1230 end 1231 | Tok_date_local s -> ignore (consume_token p); [s] 1232 | Tok_datetime s -> ignore (consume_token p); [s] 1233 | Tok_datetime_local s -> ignore (consume_token p); [s] 1234 | Tok_time_local s -> ignore (consume_token p); [s] 1235 | Tok_ml_basic_string _ -> failwith "Multiline strings are not allowed as keys" 1236 | Tok_ml_literal_string _ -> failwith "Multiline strings are not allowed as keys" 1237 | _ -> failwith "Expected key" 1238 1239(* Parse a dotted key - returns list of key strings *) 1240let parse_dotted_key p = 1241 let first_keys = parse_key_segment p in 1242 let rec loop acc = 1243 match peek_token p with 1244 | Tok_dot -> 1245 ignore (consume_token p); 1246 let keys = parse_key_segment p in 1247 loop (List.rev_append keys acc) 1248 | _ -> List.rev acc 1249 in 1250 let rest = loop [] in 1251 first_keys @ rest 1252 1253let rec parse_value p = 1254 match peek_token p with 1255 | Tok_basic_string s -> ignore (consume_token p); Toml.String s 1256 | Tok_literal_string s -> ignore (consume_token p); Toml.String s 1257 | Tok_ml_basic_string s -> ignore (consume_token p); Toml.String s 1258 | Tok_ml_literal_string s -> ignore (consume_token p); Toml.String s 1259 | Tok_integer (i, _) -> ignore (consume_token p); Toml.Int i 1260 | Tok_float (f, _) -> ignore (consume_token p); Toml.Float f 1261 | Tok_datetime s -> ignore (consume_token p); Toml.Datetime s 1262 | Tok_datetime_local s -> ignore (consume_token p); Toml.Datetime_local s 1263 | Tok_date_local s -> ignore (consume_token p); Toml.Date_local s 1264 | Tok_time_local s -> ignore (consume_token p); Toml.Time_local s 1265 | Tok_lbracket -> parse_array p 1266 | Tok_lbrace -> parse_inline_table p 1267 | Tok_bare_key s -> 1268 (* Interpret bare keys as boolean, float keywords, or numbers in value context *) 1269 ignore (consume_token p); 1270 (match s with 1271 | "true" -> Bool true 1272 | "false" -> Bool false 1273 | "inf" -> Float Float.infinity 1274 | "nan" -> Float Float.nan 1275 | _ -> 1276 (* Validate underscore placement in the original string *) 1277 let validate_underscores str = 1278 let len = String.length str in 1279 if len > 0 && str.[0] = '_' then 1280 failwith "Leading underscore not allowed in number"; 1281 if len > 0 && str.[len - 1] = '_' then 1282 failwith "Trailing underscore not allowed in number"; 1283 for i = 0 to len - 2 do 1284 if str.[i] = '_' && str.[i + 1] = '_' then 1285 failwith "Double underscore not allowed in number"; 1286 (* Underscore must be between digits (not next to 'e', 'E', '.', 'x', 'o', 'b', etc.) *) 1287 if str.[i] = '_' then begin 1288 let prev = if i > 0 then Some str.[i - 1] else None in 1289 let next = Some str.[i + 1] in 1290 let is_digit_char c = c >= '0' && c <= '9' in 1291 let is_hex_char c = (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F') in 1292 (* For hex numbers, underscore can be between hex digits *) 1293 let has_hex_prefix = len > 2 && str.[0] = '0' && (str.[1] = 'x' || str.[1] = 'X') in 1294 match prev, next with 1295 | Some p, Some n when has_hex_prefix && is_hex_char p && is_hex_char n -> () 1296 | Some p, Some n when is_digit_char p && is_digit_char n -> () 1297 | _ -> failwith "Underscore must be between digits" 1298 end 1299 done 1300 in 1301 validate_underscores s; 1302 (* Try to parse as a number - bare keys like "10e3" should be floats *) 1303 let s_no_underscore = String.concat "" (String.split_on_char '_' s) in 1304 let len = String.length s_no_underscore in 1305 if len > 0 then 1306 let c0 = s_no_underscore.[0] in 1307 (* Must start with digit for it to be a number in value context *) 1308 if c0 >= '0' && c0 <= '9' then begin 1309 (* Check for leading zeros *) 1310 if len > 1 && c0 = '0' && s_no_underscore.[1] >= '0' && s_no_underscore.[1] <= '9' then 1311 failwith "Leading zeros not allowed" 1312 else 1313 try 1314 (* Try to parse as float (handles scientific notation) *) 1315 if String.contains s_no_underscore '.' || 1316 String.contains s_no_underscore 'e' || 1317 String.contains s_no_underscore 'E' then 1318 Toml.Float (float_of_string s_no_underscore) 1319 else 1320 Toml.Int (Int64.of_string s_no_underscore) 1321 with _ -> 1322 failwith (Printf.sprintf "Unexpected bare key '%s' as value" s) 1323 end else 1324 failwith (Printf.sprintf "Unexpected bare key '%s' as value" s) 1325 else 1326 failwith (Printf.sprintf "Unexpected bare key '%s' as value" s)) 1327 | _ -> failwith "Expected value" 1328 1329and parse_array p = 1330 ignore (consume_token p); (* [ *) 1331 skip_newlines p; 1332 let rec loop acc = 1333 match peek_token p with 1334 | Tok_rbracket -> 1335 ignore (consume_token p); 1336 Toml.Array (List.rev acc) 1337 | _ -> 1338 let v = parse_value p in 1339 skip_newlines p; 1340 match peek_token p with 1341 | Tok_comma -> 1342 ignore (consume_token p); 1343 skip_newlines p; 1344 loop (v :: acc) 1345 | Tok_rbracket -> 1346 ignore (consume_token p); 1347 Toml.Array (List.rev (v :: acc)) 1348 | _ -> failwith "Expected ',' or ']' in array" 1349 in 1350 loop [] 1351 1352and parse_inline_table p = 1353 ignore (consume_token p); (* { *) 1354 skip_newlines p; 1355 (* Track explicitly defined keys - can't be extended with dotted keys *) 1356 let defined_inline = ref [] in 1357 let rec loop acc = 1358 match peek_token p with 1359 | Tok_rbrace -> 1360 ignore (consume_token p); 1361 Toml.Table (List.rev acc) 1362 | _ -> 1363 let keys = parse_dotted_key p in 1364 skip_ws p; 1365 expect_token p Tok_equals; 1366 skip_ws p; 1367 let v = parse_value p in 1368 (* Check if trying to extend a previously-defined inline table *) 1369 (match keys with 1370 | first_key :: _ :: _ -> 1371 (* Multi-key dotted path - check if first key is already defined *) 1372 if List.mem first_key !defined_inline then 1373 failwith (Printf.sprintf "Cannot extend inline table '%s' with dotted key" first_key) 1374 | _ -> ()); 1375 (* If this is a direct assignment to a key, track it *) 1376 (match keys with 1377 | [k] -> 1378 if List.mem k !defined_inline then 1379 failwith (Printf.sprintf "Duplicate key '%s' in inline table" k); 1380 defined_inline := k :: !defined_inline 1381 | _ -> ()); 1382 let entry = build_nested_table keys v in 1383 (* Merge the entry with existing entries (for dotted keys with common prefix) *) 1384 let acc = merge_entry_into_table acc entry in 1385 skip_newlines p; 1386 match peek_token p with 1387 | Tok_comma -> 1388 ignore (consume_token p); 1389 skip_newlines p; 1390 loop acc 1391 | Tok_rbrace -> 1392 ignore (consume_token p); 1393 Toml.Table (List.rev acc) 1394 | _ -> failwith "Expected ',' or '}' in inline table" 1395 in 1396 loop [] 1397 1398and skip_ws _p = 1399 (* Skip whitespace in token stream - handled by lexer but needed for lookahead *) 1400 () 1401 1402and build_nested_table keys value = 1403 match keys with 1404 | [] -> failwith "Empty key" 1405 | [k] -> (k, value) 1406 | k :: rest -> 1407 (k, Toml.Table [build_nested_table rest value]) 1408 1409(* Merge two TOML values - used for combining dotted keys in inline tables *) 1410and merge_toml_values v1 v2 = 1411 match v1, v2 with 1412 | Toml.Table entries1, Toml.Table entries2 -> 1413 (* Merge the entries *) 1414 let merged = List.fold_left (fun acc (k, v) -> 1415 match List.assoc_opt k acc with 1416 | Some existing -> 1417 (* Key exists - try to merge if both are tables *) 1418 let merged_v = merge_toml_values existing v in 1419 (k, merged_v) :: List.remove_assoc k acc 1420 | None -> 1421 (k, v) :: acc 1422 ) entries1 entries2 in 1423 Toml.Table (List.rev merged) 1424 | _, _ -> 1425 (* Can't merge non-table values with same key *) 1426 failwith "Conflicting keys in inline table" 1427 1428(* Merge a single entry into an existing table *) 1429and merge_entry_into_table entries (k, v) = 1430 match List.assoc_opt k entries with 1431 | Some existing -> 1432 let merged_v = merge_toml_values existing v in 1433 (k, merged_v) :: List.remove_assoc k entries 1434 | None -> 1435 (k, v) :: entries 1436 1437let validate_datetime_string s = 1438 (* Parse and validate date portion *) 1439 if String.length s >= 10 then begin 1440 let year = int_of_string (String.sub s 0 4) in 1441 let month = int_of_string (String.sub s 5 2) in 1442 let day = int_of_string (String.sub s 8 2) in 1443 validate_date year month day; 1444 (* Parse and validate time portion if present *) 1445 if String.length s >= 16 then begin 1446 let time_start = if s.[10] = 'T' || s.[10] = 't' || s.[10] = ' ' then 11 else 10 in 1447 let hour = int_of_string (String.sub s time_start 2) in 1448 let minute = int_of_string (String.sub s (time_start + 3) 2) in 1449 let second = 1450 if String.length s >= time_start + 8 && s.[time_start + 5] = ':' then 1451 int_of_string (String.sub s (time_start + 6) 2) 1452 else 0 1453 in 1454 validate_time hour minute second 1455 end 1456 end 1457 1458let validate_date_string s = 1459 if String.length s >= 10 then begin 1460 let year = int_of_string (String.sub s 0 4) in 1461 let month = int_of_string (String.sub s 5 2) in 1462 let day = int_of_string (String.sub s 8 2) in 1463 validate_date year month day 1464 end 1465 1466let validate_time_string s = 1467 if String.length s >= 5 then begin 1468 let hour = int_of_string (String.sub s 0 2) in 1469 let minute = int_of_string (String.sub s 3 2) in 1470 let second = 1471 if String.length s >= 8 && s.[5] = ':' then 1472 int_of_string (String.sub s 6 2) 1473 else 0 1474 in 1475 validate_time hour minute second 1476 end 1477 1478(* Table management for the parser *) 1479type table_state = { 1480 mutable values : (string * Toml.t) list; 1481 subtables : (string, table_state) Hashtbl.t; 1482 mutable is_array : bool; 1483 mutable is_inline : bool; 1484 mutable defined : bool; (* Has this table been explicitly defined with [table]? *) 1485 mutable closed : bool; (* Closed to extension via dotted keys from parent *) 1486 mutable array_elements : table_state list; (* For arrays of tables *) 1487} 1488 1489let create_table_state () = { 1490 values = []; 1491 subtables = Hashtbl.create 16; 1492 is_array = false; 1493 is_inline = false; 1494 defined = false; 1495 closed = false; 1496 array_elements = []; 1497} 1498 1499let rec get_or_create_table state keys create_intermediate = 1500 match keys with 1501 | [] -> state 1502 | [k] -> 1503 (* Check if key exists as a value *) 1504 if List.mem_assoc k state.values then 1505 failwith (Printf.sprintf "Cannot use value '%s' as a table" k); 1506 (match Hashtbl.find_opt state.subtables k with 1507 | Some sub -> sub 1508 | None -> 1509 let sub = create_table_state () in 1510 Hashtbl.add state.subtables k sub; 1511 sub) 1512 | k :: rest -> 1513 (* Check if key exists as a value *) 1514 if List.mem_assoc k state.values then 1515 failwith (Printf.sprintf "Cannot use value '%s' as a table" k); 1516 let sub = match Hashtbl.find_opt state.subtables k with 1517 | Some sub -> sub 1518 | None -> 1519 let sub = create_table_state () in 1520 Hashtbl.add state.subtables k sub; 1521 sub 1522 in 1523 if create_intermediate && not sub.defined then 1524 sub.defined <- false; (* Mark as implicitly defined *) 1525 get_or_create_table sub rest create_intermediate 1526 1527(* Like get_or_create_table but marks tables as defined (for dotted keys) *) 1528(* Dotted keys mark tables as "defined" (can't re-define with [table]) but not "closed" *) 1529let rec get_or_create_table_for_dotted_key state keys = 1530 match keys with 1531 | [] -> state 1532 | [k] -> 1533 (* Check if key exists as a value *) 1534 if List.mem_assoc k state.values then 1535 failwith (Printf.sprintf "Cannot use value '%s' as a table" k); 1536 (match Hashtbl.find_opt state.subtables k with 1537 | Some sub -> 1538 (* Check if it's an array of tables (can't extend with dotted keys) *) 1539 if sub.is_array then 1540 failwith (Printf.sprintf "Cannot extend array of tables '%s' using dotted keys" k); 1541 (* Check if it's closed (explicitly defined with [table] header) *) 1542 if sub.closed then 1543 failwith (Printf.sprintf "Cannot extend table '%s' using dotted keys" k); 1544 if sub.is_inline then 1545 failwith (Printf.sprintf "Cannot extend inline table '%s'" k); 1546 (* Mark as defined by dotted key *) 1547 sub.defined <- true; 1548 sub 1549 | None -> 1550 let sub = create_table_state () in 1551 sub.defined <- true; (* Mark as defined by dotted key *) 1552 Hashtbl.add state.subtables k sub; 1553 sub) 1554 | k :: rest -> 1555 (* Check if key exists as a value *) 1556 if List.mem_assoc k state.values then 1557 failwith (Printf.sprintf "Cannot use value '%s' as a table" k); 1558 let sub = match Hashtbl.find_opt state.subtables k with 1559 | Some sub -> 1560 (* Check if it's an array of tables (can't extend with dotted keys) *) 1561 if sub.is_array then 1562 failwith (Printf.sprintf "Cannot extend array of tables '%s' using dotted keys" k); 1563 if sub.closed then 1564 failwith (Printf.sprintf "Cannot extend table '%s' using dotted keys" k); 1565 if sub.is_inline then 1566 failwith (Printf.sprintf "Cannot extend inline table '%s'" k); 1567 (* Mark as defined by dotted key *) 1568 sub.defined <- true; 1569 sub 1570 | None -> 1571 let sub = create_table_state () in 1572 sub.defined <- true; (* Mark as defined by dotted key *) 1573 Hashtbl.add state.subtables k sub; 1574 sub 1575 in 1576 get_or_create_table_for_dotted_key sub rest 1577 1578let rec table_state_to_toml state = 1579 let subtable_values = Hashtbl.fold (fun k sub acc -> 1580 let v = 1581 if sub.is_array then 1582 Toml.Array (List.map table_state_to_toml (get_array_elements sub)) 1583 else 1584 table_state_to_toml sub 1585 in 1586 (k, v) :: acc 1587 ) state.subtables [] in 1588 Toml.Table (List.rev state.values @ subtable_values) 1589 1590and get_array_elements state = 1591 List.rev state.array_elements 1592 1593(* Main parser function *) 1594let parse_toml_from_lexer lexer = 1595 let parser = make_parser lexer in 1596 let root = create_table_state () in 1597 let current_table = ref root in 1598 (* Stack of array contexts: (full_path, parent_state, array_container) *) 1599 (* parent_state is where the array lives, array_container is the array table itself *) 1600 let array_context_stack = ref ([] : (string list * table_state * table_state) list) in 1601 1602 (* Check if keys has a prefix matching the given path *) 1603 let rec has_prefix keys prefix = 1604 match keys, prefix with 1605 | _, [] -> true 1606 | [], _ -> false 1607 | k :: krest, p :: prest -> k = p && has_prefix krest prest 1608 in 1609 1610 (* Remove prefix from keys *) 1611 let rec remove_prefix keys prefix = 1612 match keys, prefix with 1613 | ks, [] -> ks 1614 | [], _ -> [] 1615 | _ :: krest, _ :: prest -> remove_prefix krest prest 1616 in 1617 1618 (* Find matching array context for the given keys *) 1619 let find_array_context keys = 1620 (* Stack is newest-first, so first match is the innermost (longest) prefix *) 1621 let rec find stack = 1622 match stack with 1623 | [] -> None 1624 | (path, parent, container) :: rest -> 1625 if keys = path then 1626 (* Exact match - adding sibling element *) 1627 Some (`Sibling (path, parent, container)) 1628 else if has_prefix keys path && List.length keys > List.length path then 1629 (* Proper prefix - nested table/array within current element *) 1630 let current_entry = List.hd container.array_elements in 1631 Some (`Nested (path, current_entry)) 1632 else 1633 find rest 1634 in 1635 find !array_context_stack 1636 in 1637 1638 (* Pop array contexts that are no longer valid for the given keys *) 1639 let rec pop_invalid_contexts keys = 1640 match !array_context_stack with 1641 | [] -> () 1642 | (path, _, _) :: rest -> 1643 if not (has_prefix keys path) then begin 1644 array_context_stack := rest; 1645 pop_invalid_contexts keys 1646 end 1647 in 1648 1649 let rec parse_document () = 1650 skip_newlines parser; 1651 match peek_token parser with 1652 | Tok_eof -> () 1653 | Tok_lbracket -> 1654 (* Check for array of tables [[...]] vs table [...] *) 1655 ignore (consume_token parser); 1656 (* For [[, the two brackets must be adjacent (no whitespace) *) 1657 let is_adjacent_bracket = next_raw_char_is parser '[' in 1658 (match peek_token parser with 1659 | Tok_lbracket when not is_adjacent_bracket -> 1660 (* The next [ was found after whitespace - this is invalid syntax like [ [table]] *) 1661 failwith "Invalid table header syntax" 1662 | Tok_lbracket -> 1663 (* Array of tables - brackets are adjacent *) 1664 ignore (consume_token parser); 1665 let keys = parse_dotted_key parser in 1666 expect_token parser Tok_rbracket; 1667 (* Check that closing ]] are adjacent (no whitespace) *) 1668 if not (next_raw_char_is parser ']') then 1669 failwith "Invalid array of tables syntax (space in ]])"; 1670 expect_token parser Tok_rbracket; 1671 skip_to_newline parser; 1672 (* Pop contexts that are no longer valid for these keys *) 1673 pop_invalid_contexts keys; 1674 (* Check array context for this path *) 1675 (match find_array_context keys with 1676 | Some (`Sibling (path, _parent, container)) -> 1677 (* Adding another element to an existing array *) 1678 let new_entry = create_table_state () in 1679 container.array_elements <- new_entry :: container.array_elements; 1680 current_table := new_entry; 1681 (* Update the stack entry with new current element (by re-adding) *) 1682 array_context_stack := List.map (fun (p, par, cont) -> 1683 if p = path then (p, par, cont) else (p, par, cont) 1684 ) !array_context_stack 1685 | Some (`Nested (parent_path, parent_entry)) -> 1686 (* Sub-array within current array element *) 1687 let relative_keys = remove_prefix keys parent_path in 1688 let array_table = get_or_create_table parent_entry relative_keys true in 1689 (* Check if trying to convert a non-array table to array *) 1690 if array_table.defined && not array_table.is_array then 1691 failwith (Printf.sprintf "Cannot define '%s' as array of tables; already defined as table" (String.concat "." keys)); 1692 if (array_table.values <> [] || Hashtbl.length array_table.subtables > 0) && not array_table.is_array then 1693 failwith (Printf.sprintf "Cannot define '%s' as array of tables; already has content" (String.concat "." keys)); 1694 array_table.is_array <- true; 1695 let new_entry = create_table_state () in 1696 array_table.array_elements <- new_entry :: array_table.array_elements; 1697 current_table := new_entry; 1698 (* Push new context for the nested array *) 1699 array_context_stack := (keys, parent_entry, array_table) :: !array_context_stack 1700 | None -> 1701 (* Top-level array *) 1702 let array_table = get_or_create_table root keys true in 1703 (* Check if trying to convert a non-array table to array *) 1704 if array_table.defined && not array_table.is_array then 1705 failwith (Printf.sprintf "Cannot define '%s' as array of tables; already defined as table" (String.concat "." keys)); 1706 if (array_table.values <> [] || Hashtbl.length array_table.subtables > 0) && not array_table.is_array then 1707 failwith (Printf.sprintf "Cannot define '%s' as array of tables; already has content" (String.concat "." keys)); 1708 array_table.is_array <- true; 1709 let entry = create_table_state () in 1710 array_table.array_elements <- entry :: array_table.array_elements; 1711 current_table := entry; 1712 (* Push context for this array *) 1713 array_context_stack := (keys, root, array_table) :: !array_context_stack); 1714 parse_document () 1715 | _ -> 1716 (* Regular table *) 1717 let keys = parse_dotted_key parser in 1718 expect_token parser Tok_rbracket; 1719 skip_to_newline parser; 1720 (* Pop contexts that are no longer valid for these keys *) 1721 pop_invalid_contexts keys; 1722 (* Check if this table is relative to a current array element *) 1723 (match find_array_context keys with 1724 | Some (`Nested (parent_path, parent_entry)) -> 1725 let relative_keys = remove_prefix keys parent_path in 1726 if relative_keys <> [] then begin 1727 let table = get_or_create_table parent_entry relative_keys true in 1728 if table.is_array then 1729 failwith (Printf.sprintf "Cannot define '%s' as table; already defined as array of tables" (String.concat "." keys)); 1730 if table.defined then 1731 failwith (Printf.sprintf "Table '%s' already defined" (String.concat "." keys)); 1732 table.defined <- true; 1733 table.closed <- true; (* Can't extend via dotted keys from parent *) 1734 current_table := table 1735 end else begin 1736 (* Keys equal parent_path - shouldn't happen for regular tables *) 1737 let table = get_or_create_table root keys true in 1738 if table.is_array then 1739 failwith (Printf.sprintf "Cannot define '%s' as table; already defined as array of tables" (String.concat "." keys)); 1740 if table.defined then 1741 failwith (Printf.sprintf "Table '%s' already defined" (String.concat "." keys)); 1742 table.defined <- true; 1743 table.closed <- true; (* Can't extend via dotted keys from parent *) 1744 current_table := table 1745 end 1746 | Some (`Sibling (_, _, container)) -> 1747 (* Exact match to an array of tables path - can't define as regular table *) 1748 if container.is_array then 1749 failwith (Printf.sprintf "Cannot define '%s' as table; already defined as array of tables" (String.concat "." keys)); 1750 (* Shouldn't reach here normally *) 1751 let table = get_or_create_table root keys true in 1752 if table.defined then 1753 failwith (Printf.sprintf "Table '%s' already defined" (String.concat "." keys)); 1754 table.defined <- true; 1755 table.closed <- true; 1756 current_table := table 1757 | None -> 1758 (* Not in an array context *) 1759 let table = get_or_create_table root keys true in 1760 if table.is_array then 1761 failwith (Printf.sprintf "Cannot define '%s' as table; already defined as array of tables" (String.concat "." keys)); 1762 if table.defined then 1763 failwith (Printf.sprintf "Table '%s' already defined" (String.concat "." keys)); 1764 table.defined <- true; 1765 table.closed <- true; (* Can't extend via dotted keys from parent *) 1766 current_table := table; 1767 (* Clear array context stack if we left all array contexts *) 1768 if not (List.exists (fun (p, _, _) -> has_prefix keys p) !array_context_stack) then 1769 array_context_stack := []); 1770 parse_document ()) 1771 | Tok_bare_key _ | Tok_basic_string _ | Tok_literal_string _ 1772 | Tok_integer _ | Tok_float _ | Tok_date_local _ | Tok_datetime _ 1773 | Tok_datetime_local _ | Tok_time_local _ -> 1774 (* Key-value pair - key can be bare, quoted, or numeric *) 1775 let keys = parse_dotted_key parser in 1776 expect_token parser Tok_equals; 1777 let value = parse_value parser in 1778 skip_to_newline parser; 1779 (* Add value to current table - check for duplicates first *) 1780 let add_value_to_table tbl key v = 1781 if List.mem_assoc key tbl.values then 1782 failwith (Printf.sprintf "Duplicate key: %s" key); 1783 (match Hashtbl.find_opt tbl.subtables key with 1784 | Some sub -> 1785 if sub.is_array then 1786 failwith (Printf.sprintf "Cannot redefine array of tables '%s' as a value" key) 1787 else 1788 failwith (Printf.sprintf "Cannot redefine table '%s' as a value" key) 1789 | None -> ()); 1790 tbl.values <- (key, v) :: tbl.values 1791 in 1792 (match keys with 1793 | [] -> failwith "Empty key" 1794 | [k] -> 1795 add_value_to_table !current_table k value 1796 | _ -> 1797 let parent_keys = List.rev (List.tl (List.rev keys)) in 1798 let final_key = List.hd (List.rev keys) in 1799 (* Use get_or_create_table_for_dotted_key to check for closed tables *) 1800 let parent = get_or_create_table_for_dotted_key !current_table parent_keys in 1801 add_value_to_table parent final_key value); 1802 parse_document () 1803 | _tok -> 1804 failwith (Printf.sprintf "Unexpected token at line %d" parser.lexer.line) 1805 1806 and skip_to_newline parser = 1807 skip_ws_and_comments parser.lexer; 1808 match peek_token parser with 1809 | Tok_newline -> ignore (consume_token parser) 1810 | Tok_eof -> () 1811 | _ -> failwith "Expected newline after value" 1812 in 1813 1814 parse_document (); 1815 table_state_to_toml root 1816 1817(* Parse TOML from string - creates lexer internally *) 1818let parse_toml input = 1819 let lexer = make_lexer input in 1820 parse_toml_from_lexer lexer 1821 1822(* Parse TOML directly from Bytes.Reader - no intermediate string *) 1823let parse_toml_from_reader ?file r = 1824 let lexer = make_lexer_from_reader ?file r in 1825 parse_toml_from_lexer lexer 1826 1827(* Convert TOML to tagged JSON for toml-test compatibility *) 1828let rec toml_to_tagged_json value = 1829 match value with 1830 | Toml.String s -> 1831 Printf.sprintf "{\"type\":\"string\",\"value\":%s}" (json_encode_string s) 1832 | Toml.Int i -> 1833 Printf.sprintf "{\"type\":\"integer\",\"value\":\"%Ld\"}" i 1834 | Toml.Float f -> 1835 let value_str = 1836 (* Normalize exponent format - lowercase e, keep + for positive exponents *) 1837 let format_exp s = 1838 let buf = Buffer.create (String.length s + 1) in 1839 let i = ref 0 in 1840 while !i < String.length s do 1841 let c = s.[!i] in 1842 if c = 'E' then begin 1843 Buffer.add_char buf 'e'; 1844 (* Add + if next char is a digit (no sign present) *) 1845 if !i + 1 < String.length s then begin 1846 let next = s.[!i + 1] in 1847 if next >= '0' && next <= '9' then 1848 Buffer.add_char buf '+' 1849 end 1850 end else if c = 'e' then begin 1851 Buffer.add_char buf 'e'; 1852 (* Add + if next char is a digit (no sign present) *) 1853 if !i + 1 < String.length s then begin 1854 let next = s.[!i + 1] in 1855 if next >= '0' && next <= '9' then 1856 Buffer.add_char buf '+' 1857 end 1858 end else 1859 Buffer.add_char buf c; 1860 incr i 1861 done; 1862 Buffer.contents buf 1863 in 1864 if Float.is_nan f then "nan" 1865 else if f = Float.infinity then "inf" 1866 else if f = Float.neg_infinity then "-inf" 1867 else if f = 0.0 then 1868 (* Special case for zero - output "0" or "-0" *) 1869 if 1.0 /. f = Float.neg_infinity then "-0" else "0" 1870 else if Float.is_integer f then 1871 (* Integer floats - decide on representation *) 1872 let abs_f = Float.abs f in 1873 if abs_f = 9007199254740991.0 then 1874 (* Exact max safe integer - output without .0 per toml-test expectation *) 1875 Printf.sprintf "%.0f" f 1876 else if abs_f >= 1e6 then 1877 (* Use scientific notation for numbers >= 1e6 *) 1878 (* Start with precision 0 to get XeN format (integer mantissa) *) 1879 let rec try_exp_precision prec = 1880 if prec > 17 then format_exp (Printf.sprintf "%.17e" f) 1881 else 1882 let s = format_exp (Printf.sprintf "%.*e" prec f) in 1883 if float_of_string s = f then s 1884 else try_exp_precision (prec + 1) 1885 in 1886 try_exp_precision 0 1887 else if abs_f >= 2.0 then 1888 (* Integer floats >= 2 - output with .0 suffix *) 1889 Printf.sprintf "%.1f" f 1890 else 1891 (* Integer floats 0, 1, -1 - output without .0 suffix *) 1892 Printf.sprintf "%.0f" f 1893 else 1894 (* Non-integer float *) 1895 let abs_f = Float.abs f in 1896 let use_scientific = abs_f >= 1e10 || (abs_f < 1e-4 && abs_f > 0.0) in 1897 if use_scientific then 1898 let rec try_exp_precision prec = 1899 if prec > 17 then format_exp (Printf.sprintf "%.17e" f) 1900 else 1901 let s = format_exp (Printf.sprintf "%.*e" prec f) in 1902 if float_of_string s = f then s 1903 else try_exp_precision (prec + 1) 1904 in 1905 try_exp_precision 1 1906 else 1907 (* Prefer decimal notation for reasonable range *) 1908 (* Try shortest decimal first *) 1909 let rec try_decimal_precision prec = 1910 if prec > 17 then None 1911 else 1912 let s = Printf.sprintf "%.*f" prec f in 1913 (* Remove trailing zeros but keep at least one decimal place *) 1914 let s = 1915 let len = String.length s in 1916 let dot_pos = try String.index s '.' with Not_found -> len in 1917 let rec find_last_nonzero i = 1918 if i <= dot_pos then dot_pos + 2 (* Keep at least X.0 *) 1919 else if s.[i] <> '0' then i + 1 1920 else find_last_nonzero (i - 1) 1921 in 1922 let end_pos = min len (find_last_nonzero (len - 1)) in 1923 String.sub s 0 end_pos 1924 in 1925 (* Ensure there's a decimal point with at least one digit after *) 1926 let s = 1927 if not (String.contains s '.') then s ^ ".0" 1928 else if s.[String.length s - 1] = '.' then s ^ "0" 1929 else s 1930 in 1931 if float_of_string s = f then Some s 1932 else try_decimal_precision (prec + 1) 1933 in 1934 let decimal = try_decimal_precision 1 in 1935 (* Always prefer decimal notation if it works *) 1936 match decimal with 1937 | Some d -> d 1938 | None -> 1939 (* Fall back to shortest representation *) 1940 let rec try_precision prec = 1941 if prec > 17 then Printf.sprintf "%.17g" f 1942 else 1943 let s = Printf.sprintf "%.*g" prec f in 1944 if float_of_string s = f then s 1945 else try_precision (prec + 1) 1946 in 1947 try_precision 1 1948 in 1949 Printf.sprintf "{\"type\":\"float\",\"value\":\"%s\"}" value_str 1950 | Toml.Bool b -> 1951 Printf.sprintf "{\"type\":\"bool\",\"value\":\"%s\"}" (if b then "true" else "false") 1952 | Toml.Datetime s -> 1953 validate_datetime_string s; 1954 Printf.sprintf "{\"type\":\"datetime\",\"value\":\"%s\"}" s 1955 | Toml.Datetime_local s -> 1956 validate_datetime_string s; 1957 Printf.sprintf "{\"type\":\"datetime-local\",\"value\":\"%s\"}" s 1958 | Toml.Date_local s -> 1959 validate_date_string s; 1960 Printf.sprintf "{\"type\":\"date-local\",\"value\":\"%s\"}" s 1961 | Toml.Time_local s -> 1962 validate_time_string s; 1963 Printf.sprintf "{\"type\":\"time-local\",\"value\":\"%s\"}" s 1964 | Toml.Array items -> 1965 let json_items = List.map toml_to_tagged_json items in 1966 Printf.sprintf "[%s]" (String.concat "," json_items) 1967 | Toml.Table pairs -> 1968 let json_pairs = List.map (fun (k, v) -> 1969 Printf.sprintf "%s:%s" (json_encode_string k) (toml_to_tagged_json v) 1970 ) pairs in 1971 Printf.sprintf "{%s}" (String.concat "," json_pairs) 1972 1973and json_encode_string s = 1974 let buf = Buffer.create (String.length s + 2) in 1975 Buffer.add_char buf '"'; 1976 String.iter (fun c -> 1977 match c with 1978 | '"' -> Buffer.add_string buf "\\\"" 1979 | '\\' -> Buffer.add_string buf "\\\\" 1980 | '\n' -> Buffer.add_string buf "\\n" 1981 | '\r' -> Buffer.add_string buf "\\r" 1982 | '\t' -> Buffer.add_string buf "\\t" 1983 | '\b' -> Buffer.add_string buf "\\b" (* backspace *) 1984 | c when Char.code c = 0x0C -> Buffer.add_string buf "\\f" (* formfeed *) 1985 | c when Char.code c < 0x20 -> 1986 Buffer.add_string buf (Printf.sprintf "\\u%04x" (Char.code c)) 1987 | c -> Buffer.add_char buf c 1988 ) s; 1989 Buffer.add_char buf '"'; 1990 Buffer.contents buf 1991 1992(* Tagged JSON to TOML for encoder *) 1993let decode_tagged_json_string s = 1994 (* Simple JSON parser for tagged format *) 1995 let pos = ref 0 in 1996 let len = String.length s in 1997 1998 let skip_ws () = 1999 while !pos < len && (s.[!pos] = ' ' || s.[!pos] = '\t' || s.[!pos] = '\n' || s.[!pos] = '\r') do 2000 incr pos 2001 done 2002 in 2003 2004 let expect c = 2005 skip_ws (); 2006 if !pos >= len || s.[!pos] <> c then 2007 failwith (Printf.sprintf "Expected '%c' at position %d" c !pos); 2008 incr pos 2009 in 2010 2011 let peek () = 2012 skip_ws (); 2013 if !pos >= len then None else Some s.[!pos] 2014 in 2015 2016 let parse_json_string () = 2017 skip_ws (); 2018 expect '"'; 2019 let buf = Buffer.create 64 in 2020 while !pos < len && s.[!pos] <> '"' do 2021 if s.[!pos] = '\\' then begin 2022 incr pos; 2023 if !pos >= len then failwith "Unexpected end in string escape"; 2024 match s.[!pos] with 2025 | '"' -> Buffer.add_char buf '"'; incr pos 2026 | '\\' -> Buffer.add_char buf '\\'; incr pos 2027 | '/' -> Buffer.add_char buf '/'; incr pos 2028 | 'n' -> Buffer.add_char buf '\n'; incr pos 2029 | 'r' -> Buffer.add_char buf '\r'; incr pos 2030 | 't' -> Buffer.add_char buf '\t'; incr pos 2031 | 'b' -> Buffer.add_char buf '\b'; incr pos 2032 | 'f' -> Buffer.add_char buf (Char.chr 0x0C); incr pos 2033 | 'u' -> 2034 incr pos; 2035 if !pos + 3 >= len then failwith "Invalid unicode escape"; 2036 let hex = String.sub s !pos 4 in 2037 let cp = int_of_string ("0x" ^ hex) in 2038 Buffer.add_string buf (codepoint_to_utf8 cp); 2039 pos := !pos + 4 2040 | c -> failwith (Printf.sprintf "Invalid escape: \\%c" c) 2041 end else begin 2042 Buffer.add_char buf s.[!pos]; 2043 incr pos 2044 end 2045 done; 2046 expect '"'; 2047 Buffer.contents buf 2048 in 2049 2050 (* Convert a tagged JSON object to a TOML primitive if applicable *) 2051 let convert_tagged_value value = 2052 match value with 2053 | Toml.Table [("type", Toml.String typ); ("value", Toml.String v)] 2054 | Toml.Table [("value", Toml.String v); ("type", Toml.String typ)] -> 2055 (match typ with 2056 | "string" -> Toml.String v 2057 | "integer" -> Toml.Int (Int64.of_string v) 2058 | "float" -> 2059 (match v with 2060 | "inf" -> Toml.Float Float.infinity 2061 | "-inf" -> Toml.Float Float.neg_infinity 2062 | "nan" -> Toml.Float Float.nan 2063 | _ -> Toml.Float (float_of_string v)) 2064 | "bool" -> Toml.Bool (v = "true") 2065 | "datetime" -> Toml.Datetime v 2066 | "datetime-local" -> Toml.Datetime_local v 2067 | "date-local" -> Toml.Date_local v 2068 | "time-local" -> Toml.Time_local v 2069 | _ -> failwith (Printf.sprintf "Unknown type: %s" typ)) 2070 | _ -> value 2071 in 2072 2073 let rec parse_value () = 2074 skip_ws (); 2075 match peek () with 2076 | Some '{' -> parse_object () 2077 | Some '[' -> parse_array () 2078 | Some '"' -> Toml.String (parse_json_string ()) 2079 | _ -> failwith "Expected value" 2080 2081 and parse_object () = 2082 expect '{'; 2083 skip_ws (); 2084 if peek () = Some '}' then begin 2085 incr pos; 2086 Toml.Table [] 2087 end else begin 2088 let pairs = ref [] in 2089 let first = ref true in 2090 while peek () <> Some '}' do 2091 if not !first then expect ','; 2092 first := false; 2093 skip_ws (); 2094 let key = parse_json_string () in 2095 expect ':'; 2096 let value = parse_value () in 2097 pairs := (key, convert_tagged_value value) :: !pairs 2098 done; 2099 expect '}'; 2100 Toml.Table (List.rev !pairs) 2101 end 2102 2103 and parse_array () = 2104 expect '['; 2105 skip_ws (); 2106 if peek () = Some ']' then begin 2107 incr pos; 2108 Toml.Array [] 2109 end else begin 2110 let items = ref [] in 2111 let first = ref true in 2112 while peek () <> Some ']' do 2113 if not !first then expect ','; 2114 first := false; 2115 items := convert_tagged_value (parse_value ()) :: !items 2116 done; 2117 expect ']'; 2118 Toml.Array (List.rev !items) 2119 end 2120 in 2121 2122 parse_value () 2123 2124 2125(* ============================================ 2126 Streaming TOML Encoder 2127 ============================================ *) 2128 2129let is_bare_key_char c = 2130 (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || 2131 (c >= '0' && c <= '9') || c = '_' || c = '-' 2132 2133let rec write_toml_string w s = 2134 (* Check if we need to escape *) 2135 let needs_escape = String.exists (fun c -> 2136 let code = Char.code c in 2137 c = '"' || c = '\\' || c = '\n' || c = '\r' || c = '\t' || 2138 code < 0x20 || code = 0x7F 2139 ) s in 2140 if needs_escape then begin 2141 Bytes.Writer.write_string w "\""; 2142 String.iter (fun c -> 2143 match c with 2144 | '"' -> Bytes.Writer.write_string w "\\\"" 2145 | '\\' -> Bytes.Writer.write_string w "\\\\" 2146 | '\n' -> Bytes.Writer.write_string w "\\n" 2147 | '\r' -> Bytes.Writer.write_string w "\\r" 2148 | '\t' -> Bytes.Writer.write_string w "\\t" 2149 | '\b' -> Bytes.Writer.write_string w "\\b" 2150 | c when Char.code c = 0x0C -> Bytes.Writer.write_string w "\\f" 2151 | c when Char.code c < 0x20 || Char.code c = 0x7F -> 2152 Bytes.Writer.write_string w (Printf.sprintf "\\u%04X" (Char.code c)) 2153 | c -> 2154 let b = Bytes.create 1 in 2155 Bytes.set b 0 c; 2156 Bytes.Writer.write_bytes w b 2157 ) s; 2158 Bytes.Writer.write_string w "\"" 2159 end else begin 2160 Bytes.Writer.write_string w "\""; 2161 Bytes.Writer.write_string w s; 2162 Bytes.Writer.write_string w "\"" 2163 end 2164 2165and write_toml_key w k = 2166 (* Check if it can be a bare key *) 2167 let is_bare = String.length k > 0 && String.for_all is_bare_key_char k in 2168 if is_bare then Bytes.Writer.write_string w k 2169 else write_toml_string w k 2170 2171and write_toml_value w ?(inline=false) value = 2172 match value with 2173 | Toml.String s -> write_toml_string w s 2174 | Toml.Int i -> Bytes.Writer.write_string w (Int64.to_string i) 2175 | Toml.Float f -> 2176 if Float.is_nan f then Bytes.Writer.write_string w "nan" 2177 else if f = Float.infinity then Bytes.Writer.write_string w "inf" 2178 else if f = Float.neg_infinity then Bytes.Writer.write_string w "-inf" 2179 else begin 2180 let s = Printf.sprintf "%.17g" f in 2181 (* Ensure it looks like a float *) 2182 let s = if String.contains s '.' || String.contains s 'e' || String.contains s 'E' 2183 then s else s ^ ".0" in 2184 Bytes.Writer.write_string w s 2185 end 2186 | Toml.Bool b -> Bytes.Writer.write_string w (if b then "true" else "false") 2187 | Toml.Datetime s -> Bytes.Writer.write_string w s 2188 | Toml.Datetime_local s -> Bytes.Writer.write_string w s 2189 | Toml.Date_local s -> Bytes.Writer.write_string w s 2190 | Toml.Time_local s -> Bytes.Writer.write_string w s 2191 | Toml.Array items -> 2192 Bytes.Writer.write_string w "["; 2193 List.iteri (fun i item -> 2194 if i > 0 then Bytes.Writer.write_string w ", "; 2195 write_toml_value w ~inline:true item 2196 ) items; 2197 Bytes.Writer.write_string w "]" 2198 | Toml.Table pairs when inline -> 2199 Bytes.Writer.write_string w "{"; 2200 List.iteri (fun i (k, v) -> 2201 if i > 0 then Bytes.Writer.write_string w ", "; 2202 write_toml_key w k; 2203 Bytes.Writer.write_string w " = "; 2204 write_toml_value w ~inline:true v 2205 ) pairs; 2206 Bytes.Writer.write_string w "}" 2207 | Toml.Table _ -> failwith "Cannot encode table inline without inline flag" 2208 2209(* True streaming TOML encoder - writes directly to Bytes.Writer *) 2210let encode_to_writer w value = 2211 let has_content = ref false in 2212 2213 let write_path path = 2214 List.iteri (fun i k -> 2215 if i > 0 then Bytes.Writer.write_string w "."; 2216 write_toml_key w k 2217 ) path 2218 in 2219 2220 let rec encode_at_path path value = 2221 match value with 2222 | Toml.Table pairs -> 2223 (* Separate simple values from nested tables *) 2224 (* Only PURE table arrays (all items are tables) use [[array]] syntax. 2225 Mixed arrays (primitives + tables) must be encoded inline. *) 2226 let is_pure_table_array items = 2227 items <> [] && List.for_all (function Toml.Table _ -> true | _ -> false) items 2228 in 2229 let simple, nested = List.partition (fun (_, v) -> 2230 match v with 2231 | Toml.Table _ -> false 2232 | Toml.Array items -> not (is_pure_table_array items) 2233 | _ -> true 2234 ) pairs in 2235 2236 (* Emit simple values first *) 2237 List.iter (fun (k, v) -> 2238 write_toml_key w k; 2239 Bytes.Writer.write_string w " = "; 2240 write_toml_value w ~inline:true v; 2241 Bytes.Writer.write_string w "\n"; 2242 has_content := true 2243 ) simple; 2244 2245 (* Then nested tables *) 2246 List.iter (fun (k, v) -> 2247 let new_path = path @ [k] in 2248 match v with 2249 | Toml.Table _ -> 2250 if !has_content then Bytes.Writer.write_string w "\n"; 2251 Bytes.Writer.write_string w "["; 2252 write_path new_path; 2253 Bytes.Writer.write_string w "]\n"; 2254 has_content := true; 2255 encode_at_path new_path v 2256 | Toml.Array items when items <> [] && List.for_all (function Toml.Table _ -> true | _ -> false) items -> 2257 (* Pure table array - use [[array]] syntax *) 2258 List.iter (fun item -> 2259 match item with 2260 | Toml.Table _ -> 2261 if !has_content then Bytes.Writer.write_string w "\n"; 2262 Bytes.Writer.write_string w "[["; 2263 write_path new_path; 2264 Bytes.Writer.write_string w "]]\n"; 2265 has_content := true; 2266 encode_at_path new_path item 2267 | _ -> assert false (* Impossible - we checked for_all above *) 2268 ) items 2269 | _ -> 2270 write_toml_key w k; 2271 Bytes.Writer.write_string w " = "; 2272 write_toml_value w ~inline:true v; 2273 Bytes.Writer.write_string w "\n"; 2274 has_content := true 2275 ) nested 2276 | _ -> 2277 failwith "Top-level TOML must be a table" 2278 in 2279 2280 encode_at_path [] value 2281 2282(* ============================================ 2283 Public Interface - Parsing 2284 ============================================ *) 2285 2286let of_string input = 2287 try 2288 Ok (parse_toml input) 2289 with 2290 | Failure msg -> Error (Toml.Error.make (Toml.Error.Syntax (Toml.Error.Expected msg))) 2291 | Toml.Error.Error e -> Error e 2292 | e -> Error (Toml.Error.make (Toml.Error.Syntax (Toml.Error.Expected (Printexc.to_string e)))) 2293 2294let of_reader ?file r = 2295 try 2296 Ok (parse_toml_from_reader ?file r) 2297 with 2298 | Failure msg -> Error (Toml.Error.make (Toml.Error.Syntax (Toml.Error.Expected msg))) 2299 | Toml.Error.Error e -> Error e 2300 | e -> Error (Toml.Error.make (Toml.Error.Syntax (Toml.Error.Expected (Printexc.to_string e)))) 2301 2302let parse = parse_toml 2303 2304let parse_reader ?file r = parse_toml_from_reader ?file r 2305 2306(* ============================================ 2307 Public Interface - Encoding 2308 ============================================ *) 2309 2310let to_writer w value = encode_to_writer w value 2311 2312let to_string value = 2313 let buf = Buffer.create 256 in 2314 let w = Bytes.Writer.of_buffer buf in 2315 encode_to_writer w value; 2316 Buffer.contents buf 2317 2318(* ============================================ 2319 Codec I/O Operations 2320 ============================================ *) 2321 2322let decode_string c s = 2323 Result.bind (of_string s) (Tomlt.decode c) 2324 2325let decode_string_exn c s = 2326 let toml = parse s in 2327 Tomlt.decode_exn c toml 2328 2329let encode_string c v = 2330 let toml = Tomlt.encode c v in 2331 to_string toml 2332 2333let decode_reader ?file c r = 2334 Result.bind (of_reader ?file r) (Tomlt.decode c) 2335 2336let encode_writer c v w = 2337 let toml = Tomlt.encode c v in 2338 to_writer w toml 2339 2340(* ============================================ 2341 Tagged JSON Module 2342 ============================================ *) 2343 2344module Tagged_json = struct 2345 let encode = toml_to_tagged_json 2346 let decode = decode_tagged_json_string 2347 2348 let decode_and_encode_toml json_str = 2349 try 2350 let toml = decode_tagged_json_string json_str in 2351 Ok (to_string toml) 2352 with 2353 | Failure msg -> Error msg 2354 | e -> Error (Printexc.to_string e) 2355end