The unpac monorepo manager self-hosting as a monorepo using unpac

feature: add streaming interface (#456)

This allows us to partially match a string and then resume a match from where
we've ended.

authored by

Rudi Grinberg and committed by
GitHub
f339904a 73e4c4e7

+533 -1
+2
CHANGES.md
··· 7 7 * Introduce parsing functions in `Re.{Perl,Pcre,Emacs,Glob}` that return a 8 8 result instead of raising. (#542) 9 9 10 + * Introduce experimental streaming API `Re.Stream`. (#456) 11 + 10 12 1.13.1 (30-Sep-2024) 11 13 -------------------- 12 14
+176
lib/compile.ml
(* ··· diff context: tail of the preceding definition, whose beginning is not
   visible in this hunk. Reproduced unchanged. *)
  else final_boundary_check re positions ~last ~slen s state_info ~groups
;;

(* Incremental ("streaming") matching: the input is handed over chunk by chunk
   and the automaton state reached so far is carried from one call to the
   next. *)
module Stream = struct
  (* A compiled regexp together with the automaton state reached so far. *)
  type nonrec t =
    { state : State.t
    ; re : re
    }

  (* Outcome of feeding a chunk: [Ok] carries the updated value, [No_match]
     reports that the automaton reached a failed state. *)
  type 'a feed =
    | Ok of 'a
    | No_match

  (* [create re] starts a stream in the initial state of [re] for the
     [search_boundary ++ inexistant] category. *)
  let create re =
    let category = Category.(search_boundary ++ inexistant) in
    let state = find_initial_state re category in
    { state; re }
  ;;

  (* [check_bounds s ~pos ~len] raises [Invalid_argument "index out of bounds"]
     unless [pos] and [len] designate a valid substring of [s]. The comparison
     is written as [pos > String.length s - len] so that [pos + len] cannot
     overflow. Shared by every entry point below. *)
  let check_bounds s ~pos ~len =
    if pos < 0 || len < 0 || pos > String.length s - len
    then invalid_arg "index out of bounds"
  ;;

  (* [feed t s ~pos ~len] runs the automaton over the [len] bytes of [s]
     starting at [pos].

     Returns [No_match] when the state reached is a break state whose status is
     [Failed]; otherwise [Ok] with the stream updated to the new state.
     @raise Invalid_argument if [pos] and [len] do not designate a valid
     substring of [s]. *)
  let feed t s ~pos ~len =
    check_bounds s ~pos ~len;
    let last = pos + len in
    let state = loop_no_mark t.re ~colors:t.re.colors s ~last ~pos t.state t.state in
    let info = State.get_info state in
    if Idx.is_break info.idx
       &&
       match Automata.State.status info.desc with
       | Failed -> true
       | Match _ | Running -> false
    then No_match
    else Ok { t with state }
  ;;

  (* [finalize t s ~pos ~len] feeds a last chunk and returns [true] iff the
     status computed by [final] for the [search_boundary ++ inexistant]
     category is a match.
     @raise Invalid_argument if [pos] and [len] do not designate a valid
     substring of [s]. *)
  let finalize t s ~pos ~len =
    check_bounds s ~pos ~len;
    let last = pos + len in
    let state = scan_str t.re Positions.empty s t.state ~last ~pos ~groups:false in
    let info = State.get_info state in
    match
      let _idx, res =
        let final_cat = Category.(search_boundary ++ inexistant) in
        final t.re Positions.empty info final_cat
      in
      res
    with
    | Running | Failed -> false
    | Match _ -> true
  ;;

  (* Streaming matcher that additionally records mark positions and keeps the
     fed slices, so that groups can be extracted from the final match. *)
  module Group = struct
    type nonrec t =
      { t : t (* underlying stream *)
      ; positions : Positions.t (* mark positions, updated in place by [loop] *)
      ; slices : Slice.L.t (* retained chunks, most recently fed first *)
      ; abs_pos : int (* total number of bytes fed so far *)
      ; first_match_pos : int (* value of [Positions.first] after the last feed *)
      }

    let no_match_starts_before t = t.first_match_pos

    let create t =
      { t
      ; positions = Positions.make ~groups:true t.re
      ; slices = []
      ; abs_pos = 0
      ; first_match_pos = 0
      }
    ;;

    (* The result of a successful [finalize]. *)
    module Match = struct
      type t =
        { pmarks : Pmark.Set.t
        ; slices : Slice.L.t (* retained chunks, in feeding order *)
        ; marks : Mark_infos.t
        ; positions : int array (* absolute stream offsets *)
        ; start_pos : int (* stream offset of the first byte of [slices] *)
        }

      let test_mark t mark = Pmark.Set.mem mark t.pmarks

      (* [get t i] is the substring of group [i], or [None] when
         [Mark_infos.offset] has no entry for it. Recorded positions are
         absolute, hence the translation by [start_pos] before indexing into
         the retained slices. *)
      let get t i =
        Mark_infos.offset t.marks i
        |> Option.map (fun (start, stop) ->
          let start = t.positions.(start) - t.start_pos in
          let stop = t.positions.(stop) - t.start_pos in
          Slice.L.get_substring t.slices ~start ~stop)
      ;;

      let make ~start_pos ~pmarks ~slices ~marks ~positions =
        let positions = Positions.all positions in
        { pmarks; slices; positions; marks; start_pos }
      ;;
    end

    (* Runs the automaton over [s] from index [pos] up to (excluding) [last],
       storing [abs_pos + i] in [positions] for every transition taken at
       string index [i]. Stops early on a break state. When the next state is
       not known yet, [validate] is called and the step is retried from [st0]. *)
    let rec loop re ~abs_pos ~colors ~positions s ~pos ~last st0 st =
      if pos < last
      then (
        let st' = next colors st s pos in
        let idx = (State.get_info st').idx in
        if Idx.is_idx idx
        then (
          Positions.set positions (Idx.idx idx) (abs_pos + pos);
          loop re ~abs_pos ~colors ~positions s ~pos:(pos + 1) ~last st' st')
        else if Idx.is_break idx
        then (
          Positions.set positions (Idx.break_idx idx) (abs_pos + pos);
          st')
        else (
          (* Unknown *)
          validate re positions s ~pos st0;
          loop re ~abs_pos ~colors ~positions s ~pos ~last st0 st0))
      else st
    ;;

    (* [feed tt s ~pos ~len] feeds one chunk, retains it as a slice, and drops
       from the oldest retained slices the bytes located before
       [Positions.first] (presumably the earliest offset a match can still
       start at; its definition is not visible here).
       @raise Invalid_argument if [pos] and [len] do not designate a valid
       substring of [s]. *)
    let feed ({ t; positions; slices; abs_pos; first_match_pos = _ } as tt) s ~pos ~len =
      check_bounds s ~pos ~len;
      let state =
        let last = pos + len in
        (* [loop] stores [base + i] for string index [i]. The chunk starts at
           string index [pos] but at stream offset [abs_pos], so the base has
           to be shifted by [pos] for recorded offsets to agree with the slice
           list. For [pos = 0] this is [abs_pos], as before. *)
        let base = abs_pos - pos in
        loop t.re ~abs_pos:base ~colors:t.re.colors s ~positions ~last ~pos t.state t.state
      in
      let info = State.get_info state in
      if Idx.is_break info.idx
         &&
         match Automata.State.status info.desc with
         | Failed -> true
         | Match _ | Running -> false
      then No_match
      else (
        let t = { t with state } in
        let slices = { Slice.s; pos; len } :: slices in
        let first_match_pos = Positions.first positions in
        let slices = Slice.L.drop_rev slices (first_match_pos - tt.first_match_pos) in
        let abs_pos = abs_pos + len in
        Ok { tt with t; slices; abs_pos; first_match_pos })
    ;;

    (* [finalize tt s ~pos ~len] feeds a last chunk and, when the regexp
       matched, returns the match with its slices put back in feeding order.
       @raise Invalid_argument if [pos] and [len] do not designate a valid
       substring of [s]. *)
    let finalize
          ({ t; positions; slices; abs_pos; first_match_pos = _ } as tt)
          s
          ~pos
          ~len
      : Match.t feed
      =
      check_bounds s ~pos ~len;
      let last = pos + len in
      (* Same translation as in [feed]: stream offset = [base] + string index. *)
      let base = abs_pos - pos in
      let info =
        let state =
          loop t.re ~abs_pos:base ~colors:t.re.colors s ~positions ~last ~pos t.state t.state
        in
        State.get_info state
      in
      match
        match Automata.State.status info.desc with
        | (Match _ | Failed) as status -> status
        | Running ->
          (* Still running at the end of input: ask [final] for the status. *)
          let idx, res =
            let final_cat = Category.(search_boundary ++ inexistant) in
            final t.re positions info final_cat
          in
          (match res with
           | Running | Failed -> ()
           | Match _ ->
             (* [base + last] is the stream offset just past the last byte. *)
             Positions.set positions (Automata.Idx.to_int idx) (base + last));
          res
      with
      | Running | Failed -> No_match
      | Match (marks, pmarks) ->
        let first_match_position = Positions.first positions in
        let slices =
          let slices =
            let slices = { Slice.s; pos; len } :: slices in
            Slice.L.drop_rev slices (first_match_position - tt.first_match_pos)
          in
          List.rev slices
        in
        Ok (Match.make ~start_pos:first_match_position ~pmarks ~marks ~slices ~positions)
    ;;
  end
end

(* diff context: head of the following definition, which continues past the
   visible part of this hunk. Reproduced unchanged. *)
let match_str_no_bounds ~groups ~partial re s ~pos ~len =
  let positions = Positions.make ~groups re in
  match make_match_str re positions ~len ~groups ~partial s ~pos with
+29
lib/compile.mli
··· 1 1 type re 2 2 3 + (** Incremental matching: the input is supplied chunk by chunk through [feed] and [finalize]. *) module Stream : sig 4 + type t 5 + 6 + (** Result of feeding a chunk: [Ok] carries the updated value, [No_match] reports that the match failed. *) type 'a feed = 7 + | Ok of 'a 8 + | No_match 9 + 10 + val create : re -> t 11 + (** [feed t s ~pos ~len] feeds the [len] bytes of [s] starting at [pos]. *) val feed : t -> string -> pos:int -> len:int -> t feed 12 + (** [finalize t s ~pos ~len] feeds a last chunk and returns [true] when the regular expression matched. *) val finalize : t -> string -> pos:int -> len:int -> bool 13 + 14 + (** Streaming matcher that also keeps the fed chunks and mark positions so that groups can be read from the final match. *) module Group : sig 15 + type stream := t 16 + type t 17 + 18 + module Match : sig 19 + type t 20 + 21 + (** [get m i] is the substring of group [i], if that group has an offset. *) val get : t -> int -> string option 22 + (** [test_mark m mark] tells whether [mark] belongs to the marks of [m]. *) val test_mark : t -> Pmark.t -> bool 23 + end 24 + 25 + val create : stream -> t 26 + val feed : t -> string -> pos:int -> len:int -> t feed 27 + val finalize : t -> string -> pos:int -> len:int -> Match.t feed 28 + (** Returns the matcher's [first_match_pos]; presumably no match can begin before that stream offset (TODO confirm). *) val no_match_starts_before : t -> int 29 + end 30 + end 31 + 3 32 type match_info = 4 33 | Match of Group.t 5 34 | Failed
+1
lib/core.ml
··· 170 170 end 171 171 172 172 module Seq = Search (* Re-export of the streaming matcher implemented in [Compile]. *) 173 + module Stream = Compile.Stream
+39 -1
lib/core.mli
··· 215 215 (** Marks *) 216 216 module Mark : sig 217 217 (** Mark id *) 218 - type t 218 + type t = Pmark.t 219 219 220 220 (** Tell if a mark was matched. *) 221 221 val test : Group.t -> t -> bool ··· 773 773 (** Same as {!Mark.all}. Deprecated *) 774 774 val mark_set : Group.t -> Mark.Set.t 775 775 [@@ocaml.deprecated "Use Mark.all"] 776 + 777 + module Stream : sig 778 + (** An experimental API for matching a regular expression by feeding individual 779 + string chunks. 780 + 781 + This module is not covered by semver's stability guarantee. *) 782 + 783 + type t 784 + 785 + type 'a feed = 786 + | Ok of 'a 787 + | No_match 788 + 789 + val create : re -> t 790 + val feed : t -> string -> pos:int -> len:int -> t feed 791 + 792 + (** [finalize t s ~pos ~len] feeds the [len] bytes of [s] starting at [pos] and 793 + returns whether the regular expression matched. *) 794 + val finalize : t -> string -> pos:int -> len:int -> bool 795 + 796 + module Group : sig 797 + (** Match a string against a regular expression with capture groups. *) 798 + 799 + type stream := t 800 + type t 801 + 802 + module Match : sig 803 + type t 804 + 805 + val get : t -> int -> string option 806 + val test_mark : t -> Pmark.t -> bool 807 + end 808 + 809 + val create : stream -> t 810 + val feed : t -> string -> pos:int -> len:int -> t feed 811 + val finalize : t -> string -> pos:int -> len:int -> Match.t feed 812 + end 813 + end
+70
lib/slice.ml
open Import

(* A view of [len] bytes of [s] starting at offset [pos]. *)
type t =
  { s : string
  ; pos : int
  ; len : int
  }

(* Lists of slices, read as the concatenation of their views. *)
module L = struct
  type nonrec t = t list

  (* [get_substring slices ~start ~stop] returns the bytes located between the
     offsets [start] (inclusive) and [stop] (exclusive) of the concatenation of
     [slices], taken in list order.

     Fails with an assertion when the slices hold fewer bytes than requested. *)
  let get_substring slices ~start ~stop =
    if stop = start
    then ""
    else (
      let slices =
        (* Skip the first [remains] bytes, trimming the slice in which the
           requested range begins. *)
        let rec drop slices remains =
          if remains = 0
          then slices
          else (
            match slices with
            | [] -> assert false
            | ({ s = _; pos; len } as slice) :: xs ->
              let remains' = remains - len in
              if remains' >= 0
              then drop xs remains'
              else (
                let pos = pos + remains in
                let len = len - remains in
                { slice with pos; len } :: xs))
        in
        drop slices start
      in
      let buf = Buffer.create (stop - start) in
      (* Copy [remains] bytes, slice after slice. *)
      let rec take slices remains =
        if remains > 0
        then (
          match slices with
          | [] -> assert false
          | { s; pos; len } :: xs ->
            let remains' = remains - len in
            if remains' > 0
            then (
              Buffer.add_substring buf s pos len;
              take xs remains')
            else Buffer.add_substring buf s pos remains)
      in
      take slices (stop - start);
      Buffer.contents buf)
  ;;

  (* [drop t remains] discards the first [remains] bytes of [t]: slices that
     are entirely covered are removed, and the slice in which the cut falls
     keeps only the bytes after it. Returns [[]] when [t] holds no more than
     [remains] bytes. *)
  let rec drop t remains =
    if remains = 0
    then t
    else (
      match t with
      | [] -> []
      | ({ s = _; pos; len } as slice) :: t ->
        if remains >= len
        then drop t (remains - len)
        else
          (* Skip [remains] bytes at the front of this slice and keep the
             [len - remains] that follow. *)
          { slice with pos = pos + remains; len = len - remains } :: t)
  ;;

  (* [drop_rev t remains] is [drop] applied to a list stored in reverse order
     (most recent slice first): bytes are discarded starting from the last
     element of [t], and the order of [t] is preserved. *)
  let drop_rev t remains =
    (* TODO Use a proper functional queue *)
    if remains = 0 then t else List.rev (drop (List.rev t) remains)
  ;;
end
+12
lib/slice.mli
··· 1 + (** A view of [len] bytes of [s] starting at offset [pos]. *) type t = 2 + { s : string 3 + ; pos : int 4 + ; len : int 5 + } 6 + 7 + (** Lists of slices, read as the concatenation of their views. *) module L : sig 8 + type nonrec t = t list 9 + 10 + (** [get_substring slices ~start ~stop] returns the bytes between the offsets [start] (inclusive) and [stop] (exclusive) of the concatenation of [slices], taken in list order. *) val get_substring : t -> start:int -> stop:int -> string 11 + (** [drop_rev slices n] trims [slices] starting from its last element: slices entirely covered by [n] bytes are removed and the next one is shortened. The list order is preserved. *) val drop_rev : t -> int -> t 12 + end
+1
lib_test/expect/dune
··· 1 1 (library 2 2 (name re_tests) 3 + (modules import test_stream) 3 4 (libraries 4 5 re_private 5 6 ;; This is because of the (implicit_transitive_deps false)
+203
lib_test/expect/test_stream.ml
open Import
module Stream = Re.Stream

(* Feeds the whole of [str] to [t] and prints the outcome. On [Ok], also prints
   what finalizing with an empty chunk would report at that point. Returns the
   result of the feed so that callers can keep streaming. *)
let feed t str =
  let res = Stream.feed t str ~pos:0 ~len:(String.length str) in
  let () =
    match res with
    | No_match -> Printf.printf "%S did not match\n" str
    | Ok s ->
      let status =
        match Stream.finalize s "" ~pos:0 ~len:0 with
        | true -> "matched"
        | false -> "unmatched"
      in
      Printf.printf "%S not matched (status = %s)\n" str status
  in
  res
;;

(* Every entry point must reject a [pos]/[len] pair that leaves the string. *)
let%expect_test "out of bounds" =
  let stream = Re.any |> Re.compile |> Stream.create in
  invalid_argument (fun () -> ignore (Stream.feed stream "foo" ~pos:2 ~len:3));
  [%expect {| Invalid_argument "index out of bounds" |}];
  invalid_argument (fun () -> ignore (Stream.finalize stream "foo" ~pos:2 ~len:3));
  [%expect {| Invalid_argument "index out of bounds" |}];
  let stream = Stream.Group.create stream in
  invalid_argument (fun () -> ignore (Stream.Group.feed stream "foo" ~pos:2 ~len:3));
  [%expect {| Invalid_argument "index out of bounds" |}];
  invalid_argument (fun () -> ignore (Stream.Group.finalize stream "foo" ~pos:2 ~len:3));
  [%expect {| Invalid_argument "index out of bounds" |}]
;;

let%expect_test "basic" =
  let s = [ Re.bos; Re.str "abab" ] |> Re.seq |> Re.compile |> Stream.create in
  ignore (feed s "x");
  [%expect {| "x" did not match |}];
  let suffix = "ab" in
  let s =
    match feed s suffix with
    | Ok s -> s
    | No_match -> assert false
  in
  [%expect {|
    "ab" not matched (status = unmatched) |}];
  (let (_ : _ Stream.feed) = feed s "ab" in
   [%expect {|
     "ab" not matched (status = matched) |}]);
  let (_ : _ Stream.feed) = feed s "xy" in
  [%expect {|
    "xy" did not match |}]
;;

let%expect_test "eos" =
  let s = [ Re.str "zzz"; Re.eos ] |> Re.seq |> Re.compile |> Stream.create in
  ignore (feed s "zzz");
  [%expect {| "zzz" not matched (status = matched) |}];
  let s =
    match feed s "z" with
    | Ok s -> s
    | No_match -> assert false
  in
  [%expect {| "z" not matched (status = unmatched) |}];
  (let str = "zz" in
   match Stream.finalize s str ~pos:0 ~len:(String.length str) with
   | true -> ()
   | false -> assert false);
  [%expect {||}]
;;

let%expect_test "finalize empty" =
  let s = "abde" in
  let stream =
    let stream = Re.str s |> Re.whole_string |> Re.compile |> Stream.create in
    match feed stream s with
    | Ok s -> s
    | No_match -> assert false
  in
  assert (Stream.finalize stream "" ~pos:0 ~len:0);
  [%expect {| "abde" not matched (status = matched) |}]
;;

let%expect_test "group - basic" =
  let s =
    let open Re in
    str "foo" |> whole_string |> group |> compile |> Stream.create
  in
  let g = Stream.Group.create s in
  let g =
    match Stream.Group.feed g "f" ~pos:0 ~len:1 with
    | No_match -> assert false
    | Ok s -> s
  in
  (match Stream.Group.finalize g "oo" ~pos:0 ~len:2 with
   | Ok _ -> ()
   | No_match -> assert false);
  [%expect {| |}]
;;

(* Prints whether mark [m] is present in the match [set]. *)
let pmarks set m =
  Printf.printf "mark present %b\n" (Re.Stream.Group.Match.test_mark set m)
;;

let%expect_test "group - mark entire string must match" =
  let m1, f = Re.(mark (char 'f')) in
  let m2, oo = Re.(mark (str "oo")) in
  let re =
    let open Re in
    [ f; oo ] |> seq |> compile
  in
  let s = Stream.create re in
  let g = Stream.Group.create s in
  let g =
    match Stream.Group.feed g "f" ~pos:0 ~len:1 with
    | No_match -> assert false
    | Ok s -> s
  in
  let g =
    match Stream.Group.finalize g "oo" ~pos:0 ~len:2 with
    | Ok g -> g
    | No_match -> assert false
  in
  pmarks g m1;
  [%expect {| mark present true |}];
  pmarks g m2;
  [%expect {| mark present true |}]
;;

let%expect_test "group - partial mark match" =
  let m, foo = Re.(mark (str "foo")) in
  let re = Re.compile foo in
  let s = Stream.create re in
  let g = Stream.Group.create s in
  let g =
    match Stream.Group.feed g "xx" ~pos:0 ~len:2 with
    | No_match -> assert false
    | Ok g -> g
  in
  let g =
    match Stream.Group.feed g "foo" ~pos:0 ~len:3 with
    | Ok g -> g
    | No_match -> assert false
  in
  let g =
    match Stream.Group.finalize g "garb" ~pos:0 ~len:4 with
    | Ok g -> g
    | No_match -> assert false
  in
  pmarks g m;
  [%expect {| mark present true |}]
;;

(* Prints group [n] of [match_], or a placeholder when it is absent. *)
let print_match match_ n =
  match Stream.Group.Match.get match_ n with
  | None -> Printf.printf "match %d: <not found>\n" n
  | Some s -> Printf.printf "match %d: %s\n" n s
;;

let%expect_test "group - match group" =
  let stream =
    let re = Re.Pcre.re "_([a-z]+)_" |> Re.whole_string |> Re.compile in
    Stream.Group.create (Stream.create re)
  in
  let s = "_abc_" in
  let () =
    match Stream.Group.finalize stream s ~pos:0 ~len:(String.length s) with
    | No_match -> assert false
    | Ok m ->
      for i = 0 to 1 do
        print_match m i
      done
  in
  [%expect {|
    match 0: _abc_
    match 1: abc
    |}]
;;

(* Same as above, with the input split over a [feed] and a [finalize]. *)
let%expect_test "group - match group across chunks" =
  let stream =
    let re = Re.Pcre.re "_([a-z]+)__([a-z]+)_" |> Re.whole_string |> Re.compile in
    Stream.Group.create (Stream.create re)
  in
  let s = "_abc_" in
  let stream =
    match Stream.Group.feed stream s ~pos:0 ~len:(String.length s) with
    | No_match -> assert false
    | Ok m -> m
  in
  let s = "_de_" in
  let () =
    match Stream.Group.finalize stream s ~pos:0 ~len:(String.length s) with
    | No_match -> assert false
    | Ok m ->
      for i = 0 to 2 do
        print_match m i
      done
  in
  [%expect {|
    match 0: _abc__de_
    match 1: abc
    match 2: de
    |}]
;;