···77* Introduce parsing functions in `Re.{Perl,Pcre,Emacs,Glob}` that return a
88 result instead of raising. (#542)
991010+* Introduce experimental streaming API `Re.Stream`. (#456)
1111+10121.13.1 (30-Sep-2024)
1113--------------------
1214
+176
lib/compile.ml
···371371 else final_boundary_check re positions ~last ~slen s state_info ~groups
372372;;
373373374374+module Stream = struct
375375+ type nonrec t =
376376+ { state : State.t
377377+ ; re : re
378378+ }
379379+380380+ type 'a feed =
381381+ | Ok of 'a
382382+ | No_match
383383+384384+ let create re =
385385+ let category = Category.(search_boundary ++ inexistant) in
386386+ let state = find_initial_state re category in
387387+ { state; re }
388388+ ;;
389389+390390+ let feed t s ~pos ~len =
391391+ (* TODO bound checks? *)
392392+ let last = pos + len in
393393+ let state = loop_no_mark t.re ~colors:t.re.colors s ~last ~pos t.state t.state in
394394+ let info = State.get_info state in
395395+ if Idx.is_break info.idx
396396+ &&
397397+ match Automata.State.status info.desc with
398398+ | Failed -> true
399399+ | Match _ | Running -> false
400400+ then No_match
401401+ else Ok { t with state }
402402+ ;;
403403+404404+ let finalize t s ~pos ~len =
405405+ (* TODO bound checks? *)
406406+ let last = pos + len in
407407+ let state = scan_str t.re Positions.empty s t.state ~last ~pos ~groups:false in
408408+ let info = State.get_info state in
409409+ match
410410+ let _idx, res =
411411+ let final_cat = Category.(search_boundary ++ inexistant) in
412412+ final t.re Positions.empty info final_cat
413413+ in
414414+ res
415415+ with
416416+ | Running | Failed -> false
417417+ | Match _ -> true
418418+ ;;
419419+420420+ module Group = struct
421421+ type nonrec t =
422422+ { t : t
423423+ ; positions : Positions.t
424424+ ; slices : Slice.L.t
425425+ ; abs_pos : int
426426+ ; first_match_pos : int
427427+ }
428428+429429+ let no_match_starts_before t = t.first_match_pos
430430+431431+ let create t =
432432+ { t
433433+ ; positions = Positions.make ~groups:true t.re
434434+ ; slices = []
435435+ ; abs_pos = 0
436436+ ; first_match_pos = 0
437437+ }
438438+ ;;
439439+440440+ module Match = struct
441441+ type t =
442442+ { pmarks : Pmark.Set.t
443443+ ; slices : Slice.L.t
444444+ ; marks : Mark_infos.t
445445+ ; positions : int array
446446+ ; start_pos : int
447447+ }
448448+449449+ let test_mark t mark = Pmark.Set.mem mark t.pmarks
450450+451451+ let get t i =
452452+ Mark_infos.offset t.marks i
453453+ |> Option.map (fun (start, stop) ->
454454+ let start = t.positions.(start) - t.start_pos in
455455+ let stop = t.positions.(stop) - t.start_pos in
456456+ Slice.L.get_substring t.slices ~start ~stop)
457457+ ;;
458458+459459+ let make ~start_pos ~pmarks ~slices ~marks ~positions =
460460+ let positions = Positions.all positions in
461461+ { pmarks; slices; positions; marks; start_pos }
462462+ ;;
463463+ end
464464+465465+ let rec loop re ~abs_pos ~colors ~positions s ~pos ~last st0 st =
466466+ if pos < last
467467+ then (
468468+ let st' = next colors st s pos in
469469+ let idx = (State.get_info st').idx in
470470+ if Idx.is_idx idx
471471+ then (
472472+ Positions.set positions (Idx.idx idx) (abs_pos + pos);
473473+ loop re ~abs_pos ~colors ~positions s ~pos:(pos + 1) ~last st' st')
474474+ else if Idx.is_break idx
475475+ then (
476476+ Positions.set positions (Idx.break_idx idx) (abs_pos + pos);
477477+ st')
478478+ else (
479479+ (* Unknown *)
480480+ validate re positions s ~pos st0;
481481+ loop re ~abs_pos ~colors ~positions s ~pos ~last st0 st0))
482482+ else st
483483+ ;;
484484+485485+ let feed ({ t; positions; slices; abs_pos; first_match_pos = _ } as tt) s ~pos ~len =
486486+ let state =
487487+ (* TODO bound checks? *)
488488+ let last = pos + len in
489489+ loop t.re ~abs_pos ~colors:t.re.colors s ~positions ~last ~pos t.state t.state
490490+ in
491491+ let info = State.get_info state in
492492+ if Idx.is_break info.idx
493493+ &&
494494+ match Automata.State.status info.desc with
495495+ | Failed -> true
496496+ | Match _ | Running -> false
497497+ then No_match
498498+ else (
499499+ let t = { t with state } in
500500+ let slices = { Slice.s; pos; len } :: slices in
501501+ let first_match_pos = Positions.first positions in
502502+ let slices = Slice.L.drop_rev slices (first_match_pos - tt.first_match_pos) in
503503+ let abs_pos = abs_pos + len in
504504+ Ok { tt with t; slices; abs_pos; first_match_pos })
505505+ ;;
506506+507507+ let finalize
508508+ ({ t; positions; slices; abs_pos; first_match_pos = _ } as tt)
509509+ s
510510+ ~pos
511511+ ~len
512512+ : Match.t feed
513513+ =
514514+ (* TODO bound checks? *)
515515+ let last = pos + len in
516516+ let info =
517517+ let state =
518518+ loop t.re ~abs_pos ~colors:t.re.colors s ~positions ~last ~pos t.state t.state
519519+ in
520520+ State.get_info state
521521+ in
522522+ match
523523+ match Automata.State.status info.desc with
524524+ | (Match _ | Failed) as s -> s
525525+ | Running ->
526526+ let idx, res =
527527+ let final_cat = Category.(search_boundary ++ inexistant) in
528528+ final t.re positions info final_cat
529529+ in
530530+ (match res with
531531+ | Running | Failed -> ()
532532+ | Match _ -> Positions.set positions (Automata.Idx.to_int idx) (abs_pos + last));
533533+ res
534534+ with
535535+ | Running | Failed -> No_match
536536+ | Match (marks, pmarks) ->
537537+ let first_match_position = Positions.first positions in
538538+ let slices =
539539+ let slices =
540540+ let slices = { Slice.s; pos; len } :: slices in
541541+ Slice.L.drop_rev slices (first_match_position - tt.first_match_pos)
542542+ in
543543+ List.rev slices
544544+ in
545545+ Ok (Match.make ~start_pos:first_match_position ~pmarks ~marks ~slices ~positions)
546546+ ;;
547547+ end
548548+end
549549+374550let match_str_no_bounds ~groups ~partial re s ~pos ~len =
375551 let positions = Positions.make ~groups re in
376552 match make_match_str re positions ~len ~groups ~partial s ~pos with
+29
lib/compile.mli
···11type re
2233+module Stream : sig
44+ type t
55+66+ type 'a feed =
77+ | Ok of 'a
88+ | No_match
99+1010+ val create : re -> t
1111+ val feed : t -> string -> pos:int -> len:int -> t feed
1212+ val finalize : t -> string -> pos:int -> len:int -> bool
1313+1414+ module Group : sig
1515+ type stream := t
1616+ type t
1717+1818+ module Match : sig
1919+ type t
2020+2121+ val get : t -> int -> string option
2222+ val test_mark : t -> Pmark.t -> bool
2323+ end
2424+2525+ val create : stream -> t
2626+ val feed : t -> string -> pos:int -> len:int -> t feed
2727+ val finalize : t -> string -> pos:int -> len:int -> Match.t feed
2828+ val no_match_starts_before : t -> int
2929+ end
3030+end
3131+332type match_info =
433 | Match of Group.t
534 | Failed
···215215(** Marks *)
216216module Mark : sig
217217 (** Mark id *)
218218- type t
218218+ type t = Pmark.t
219219220220 (** Tell if a mark was matched. *)
221221 val test : Group.t -> t -> bool
···773773(** Same as {!Mark.all}. Deprecated *)
774774val mark_set : Group.t -> Mark.Set.t
775775[@@ocaml.deprecated "Use Mark.all"]
776776+777777+module Stream : sig
778778+ (** An experimental for matching a regular expression by feeding individual
779779+ string chunks.
780780+781781+ This module is not covered by semver's stability guarantee. *)
782782+783783+ type t
784784+785785+ type 'a feed =
786786+ | Ok of 'a
787787+ | No_match
788788+789789+ val create : re -> t
790790+ val feed : t -> string -> pos:int -> len:int -> t feed
791791+792792+ (** [finalize s ~pos ~len] feed [s] from [pos] to [len] and return whether
793793+ the regular expression matched. *)
794794+ val finalize : t -> string -> pos:int -> len:int -> bool
795795+796796+ module Group : sig
797797+ (** Match a string against a regular expression with capture groups *)
798798+799799+ type stream := t
800800+ type t
801801+802802+ module Match : sig
803803+ type t
804804+805805+ val get : t -> int -> string option
806806+ val test_mark : t -> Pmark.t -> bool
807807+ end
808808+809809+ val create : stream -> t
810810+ val feed : t -> string -> pos:int -> len:int -> t feed
811811+ val finalize : t -> string -> pos:int -> len:int -> Match.t feed
812812+ end
813813+end
+70
lib/slice.ml
···11+open Import
22+33+type t =
44+ { s : string
55+ ; pos : int
66+ ; len : int
77+ }
88+99+module L = struct
1010+ type nonrec t = t list
1111+1212+ let get_substring slices ~start ~stop =
1313+ if stop = start
1414+ then ""
1515+ else (
1616+ let slices =
1717+ let rec drop slices remains =
1818+ if remains = 0
1919+ then slices
2020+ else (
2121+ match slices with
2222+ | [] -> assert false
2323+ | ({ s = _; pos; len } as slice) :: xs ->
2424+ let remains' = remains - len in
2525+ if remains' >= 0
2626+ then drop xs remains'
2727+ else (
2828+ let pos = pos + remains in
2929+ let len = len - remains in
3030+ { slice with pos; len } :: xs))
3131+ in
3232+ drop slices start
3333+ in
3434+ let buf = Buffer.create (stop - start) in
3535+ let rec take slices remains =
3636+ if remains > 0
3737+ then (
3838+ match slices with
3939+ | [] -> assert false
4040+ | { s; pos; len } :: xs ->
4141+ let remains' = remains - len in
4242+ if remains' > 0
4343+ then (
4444+ Buffer.add_substring buf s pos len;
4545+ take xs remains')
4646+ else Buffer.add_substring buf s pos remains)
4747+ in
4848+ take slices (stop - start);
4949+ Buffer.contents buf)
5050+ ;;
5151+5252+ let rec drop t remains =
5353+ if remains = 0
5454+ then t
5555+ else (
5656+ match t with
5757+ | [] -> []
5858+ | ({ s = _; pos; len } as slice) :: t ->
5959+ if remains >= len
6060+ then drop t (remains - len)
6161+ else (
6262+ let delta = len - remains in
6363+ { slice with pos = pos + delta; len = len - delta } :: t))
6464+ ;;
6565+6666+ let drop_rev t remains =
6767+ (* TODO Use a proper functional queue *)
6868+ if remains = 0 then t else List.rev (drop (List.rev t) remains)
6969+ ;;
7070+end
+12
lib/slice.mli
···11+type t =
22+ { s : string
33+ ; pos : int
44+ ; len : int
55+ }
66+77+module L : sig
88+ type nonrec t = t list
99+1010+ val get_substring : t -> start:int -> stop:int -> string
1111+ val drop_rev : t -> int -> t
1212+end
+1
lib_test/expect/dune
···11(library
22 (name re_tests)
33+ (modules import test_stream)
34 (libraries
45 re_private
56 ;; This is because of the (implicit_transitive_deps false)
+203
lib_test/expect/test_stream.ml
···11+open Import
22+module Stream = Re.Stream
33+44+let feed t str =
55+ let res = Stream.feed t str ~pos:0 ~len:(String.length str) in
66+ let () =
77+ match res with
88+ | No_match -> Printf.printf "%S did not match\n" str
99+ | Ok s ->
1010+ let status =
1111+ match Stream.finalize s "" ~pos:0 ~len:0 with
1212+ | true -> "matched"
1313+ | false -> "unmatched"
1414+ in
1515+ Printf.printf "%S not matched (status = %s)\n" str status
1616+ in
1717+ res
1818+;;
1919+2020+let%expect_test "out out of bounds" =
2121+ let stream = Re.any |> Re.compile |> Stream.create in
2222+ invalid_argument (fun () -> ignore (Stream.feed stream "foo" ~pos:2 ~len:3));
2323+ [%expect {| Invalid_argument "index out of bounds" |}];
2424+ invalid_argument (fun () -> ignore (Stream.finalize stream "foo" ~pos:2 ~len:3));
2525+ [%expect {| Invalid_argument "index out of bounds" |}];
2626+ let stream = Stream.Group.create stream in
2727+ invalid_argument (fun () -> ignore (Stream.Group.feed stream "foo" ~pos:2 ~len:3));
2828+ [%expect {| Invalid_argument "index out of bounds" |}];
2929+ invalid_argument (fun () -> ignore (Stream.Group.finalize stream "foo" ~pos:2 ~len:3));
3030+ [%expect {| Invalid_argument "index out of bounds" |}]
3131+;;
3232+3333+let%expect_test "basic" =
3434+ let s = [ Re.bos; Re.str "abab" ] |> Re.seq |> Re.compile |> Stream.create in
3535+ ignore (feed s "x");
3636+ [%expect {| "x" did not match |}];
3737+ let suffix = "ab" in
3838+ let s =
3939+ match feed s suffix with
4040+ | Ok s -> s
4141+ | No_match -> assert false
4242+ in
4343+ [%expect {|
4444+ "ab" not matched (status = unmatched) |}];
4545+ (let (_ : _ Stream.feed) = feed s "ab" in
4646+ [%expect {|
4747+ "ab" not matched (status = matched) |}]);
4848+ let (_ : _ Stream.feed) = feed s "xy" in
4949+ [%expect {|
5050+ "xy" did not match |}]
5151+;;
5252+5353+let%expect_test "eos" =
5454+ let s = [ Re.str "zzz"; Re.eos ] |> Re.seq |> Re.compile |> Stream.create in
5555+ ignore (feed s "zzz");
5656+ [%expect {| "zzz" not matched (status = matched) |}];
5757+ let s =
5858+ match feed s "z" with
5959+ | Ok s -> s
6060+ | No_match -> assert false
6161+ in
6262+ [%expect {| "z" not matched (status = unmatched) |}];
6363+ (let str = "zz" in
6464+ match Stream.finalize s str ~pos:0 ~len:(String.length str) with
6565+ | true -> ()
6666+ | false -> assert false);
6767+ [%expect {||}]
6868+;;
6969+7070+let%expect_test "finalize empty" =
7171+ let s = "abde" in
7272+ let stream =
7373+ let stream = Re.str s |> Re.whole_string |> Re.compile |> Stream.create in
7474+ match feed stream s with
7575+ | Ok s -> s
7676+ | No_match -> assert false
7777+ in
7878+ assert (Stream.finalize stream "" ~pos:0 ~len:0);
7979+ [%expect {| "abde" not matched (status = matched) |}]
8080+;;
8181+8282+let%expect_test "group - basic" =
8383+ let s =
8484+ let open Re in
8585+ str "foo" |> whole_string |> group |> compile |> Stream.create
8686+ in
8787+ let g = Stream.Group.create s in
8888+ let g =
8989+ match Stream.Group.feed g "f" ~pos:0 ~len:1 with
9090+ | No_match -> assert false
9191+ | Ok s -> s
9292+ in
9393+ (match Stream.Group.finalize g "oo" ~pos:0 ~len:2 with
9494+ | Ok _ -> ()
9595+ | No_match -> assert false);
9696+ [%expect {| |}]
9797+;;
9898+9999+let pmarks set m =
100100+ Printf.printf "mark present %b\n" (Re.Stream.Group.Match.test_mark set m)
101101+;;
102102+103103+let%expect_test "group - mark entire string must match" =
104104+ let m1, f = Re.(mark (char 'f')) in
105105+ let m2, oo = Re.(mark (str "oo")) in
106106+ let re =
107107+ let open Re in
108108+ [ f; oo ] |> seq |> compile
109109+ in
110110+ let s = Stream.create re in
111111+ let g = Stream.Group.create s in
112112+ let g =
113113+ match Stream.Group.feed g "f" ~pos:0 ~len:1 with
114114+ | No_match -> assert false
115115+ | Ok s -> s
116116+ in
117117+ let g =
118118+ match Stream.Group.finalize g "oo" ~pos:0 ~len:2 with
119119+ | Ok g -> g
120120+ | No_match -> assert false
121121+ in
122122+ pmarks g m1;
123123+ [%expect {| mark present true |}];
124124+ pmarks g m2;
125125+ [%expect {| mark present true |}]
126126+;;
127127+128128+let%expect_test "group - partial mark match" =
129129+ let m, foo = Re.(mark (str "foo")) in
130130+ let re = Re.compile foo in
131131+ let s = Stream.create re in
132132+ let g = Stream.Group.create s in
133133+ let g =
134134+ match Stream.Group.feed g "xx" ~pos:0 ~len:2 with
135135+ | No_match -> assert false
136136+ | Ok g -> g
137137+ in
138138+ let g =
139139+ match Stream.Group.feed g "foo" ~pos:0 ~len:3 with
140140+ | Ok g -> g
141141+ | No_match -> assert false
142142+ in
143143+ let g =
144144+ match Stream.Group.finalize g "garb" ~pos:0 ~len:4 with
145145+ | Ok g -> g
146146+ | No_match -> assert false
147147+ in
148148+ pmarks g m;
149149+ [%expect {| mark present true |}]
150150+;;
151151+152152+let print_match match_ n =
153153+ match Stream.Group.Match.get match_ n with
154154+ | None -> Printf.printf "match %d: <not found>\n" n
155155+ | Some s -> Printf.printf "match %d: %s\n" n s
156156+;;
157157+158158+let%expect_test "group - match group" =
159159+ let stream =
160160+ let re = Re.Pcre.re "_([a-z]+)_" |> Re.whole_string |> Re.compile in
161161+ Stream.Group.create (Stream.create re)
162162+ in
163163+ let s = "_abc_" in
164164+ let () =
165165+ match Stream.Group.finalize stream s ~pos:0 ~len:(String.length s) with
166166+ | No_match -> assert false
167167+ | Ok m ->
168168+ for i = 0 to 1 do
169169+ print_match m i
170170+ done
171171+ in
172172+ [%expect {|
173173+ match 0: _abc_
174174+ match 1: abc
175175+ |}]
176176+;;
177177+178178+let%expect_test "group - match group" =
179179+ let stream =
180180+ let re = Re.Pcre.re "_([a-z]+)__([a-z]+)_" |> Re.whole_string |> Re.compile in
181181+ Stream.Group.create (Stream.create re)
182182+ in
183183+ let s = "_abc_" in
184184+ let stream =
185185+ match Stream.Group.feed stream s ~pos:0 ~len:(String.length s) with
186186+ | No_match -> assert false
187187+ | Ok m -> m
188188+ in
189189+ let s = "_de_" in
190190+ let () =
191191+ match Stream.Group.finalize stream s ~pos:0 ~len:(String.length s) with
192192+ | No_match -> assert false
193193+ | Ok m ->
194194+ for i = 0 to 2 do
195195+ print_match m i
196196+ done
197197+ in
198198+ [%expect {|
199199+ match 0: _abc__de_
200200+ match 1: abc
201201+ match 2: de
202202+ |}]
203203+;;