My working unpac repository
1(**************************************************************************)
2(* *)
3(* OCaml *)
4(* *)
5(* Damien Doligez, projet Gallium, INRIA Rocquencourt *)
6(* *)
7(* Copyright 2014 Institut National de Recherche en Informatique et *)
8(* en Automatique. *)
9(* *)
10(* All rights reserved. This file is distributed under the terms of *)
11(* the GNU Lesser General Public License version 2.1, with the *)
12(* special exception on linking described in the file LICENSE. *)
13(* *)
14(**************************************************************************)
15
16(* String operations, based on byte sequence operations *)
17
18(* WARNING: Some functions in this file are duplicated in bytes.ml for
19 efficiency reasons. When you modify the one in this file you need to
20 modify its duplicate in bytes.ml.
21 These functions have a "duplicated" comment above their definition.
22*)
23
24external length : string -> int = "%string_length"
25external get : string -> int -> char = "%string_safe_get"
26external unsafe_get : string -> int -> char = "%string_unsafe_get"
27external unsafe_blit : string -> int -> bytes -> int -> int -> unit
28 = "caml_blit_string" [@@noalloc]
29
30module B = Bytes
31
32let bts = B.unsafe_to_string
33let bos = B.unsafe_of_string
34
35let make n c =
36 B.make n c |> bts
37let init n f =
38 B.init n f |> bts
39let empty = ""
40let of_bytes = B.to_string
41let to_bytes = B.of_string
42let sub s ofs len =
43 if ofs = 0 && length s = len then s else
44 B.sub (bos s) ofs len |> bts
45let blit =
46 B.blit_string
47
48let ensure_ge (x:int) y = if x >= y then x else invalid_arg "String.concat"
49
50let rec sum_lengths acc seplen = function
51 | [] -> acc
52 | hd :: [] -> length hd + acc
53 | hd :: tl -> sum_lengths (ensure_ge (length hd + seplen + acc) acc) seplen tl
54
55let rec unsafe_blits dst pos sep seplen = function
56 [] -> dst
57 | hd :: [] ->
58 unsafe_blit hd 0 dst pos (length hd); dst
59 | hd :: tl ->
60 unsafe_blit hd 0 dst pos (length hd);
61 unsafe_blit sep 0 dst (pos + length hd) seplen;
62 unsafe_blits dst (pos + length hd + seplen) sep seplen tl
63
64let concat sep = function
65 [] -> ""
66 | [s] -> s
67 | l -> let seplen = length sep in bts @@
68 unsafe_blits
69 (B.create (sum_lengths 0 seplen l))
70 0 sep seplen l
71
72let cat = ( ^ )
73
74(* duplicated in bytes.ml *)
75let iter f s =
76 for i = 0 to length s - 1 do f (unsafe_get s i) done
77
78(* duplicated in bytes.ml *)
79let iteri f s =
80 for i = 0 to length s - 1 do f i (unsafe_get s i) done
81
82let map f s =
83 B.map f (bos s) |> bts
84let mapi f s =
85 B.mapi f (bos s) |> bts
86let fold_right f x a =
87 B.fold_right f (bos x) a
88let fold_left f a x =
89 B.fold_left f a (bos x)
90let exists f s =
91 B.exists f (bos s)
92let for_all f s =
93 B.for_all f (bos s)
94
95(* Beware: we cannot use B.trim or B.escape because they always make a
96 copy, but String.mli spells out some cases where we are not allowed
97 to make a copy. *)
98
99let is_space = function
100 | ' ' | '\012' | '\n' | '\r' | '\t' -> true
101 | _ -> false
102
103let trim s =
104 if s = "" then s
105 else if is_space (unsafe_get s 0) || is_space (unsafe_get s (length s - 1))
106 then bts (B.trim (bos s))
107 else s
108
109let escaped s =
110 let b = bos s in
111 (* We satisfy [unsafe_escape]'s precondition by passing an
112 immutable byte sequence [b]. *)
113 let b' = B.unsafe_escape b in
114 (* With js_of_ocaml, [bos] and [bts] are not the identity.
115 We can avoid a [bts] conversion if [unsafe_escape] returned
116 its argument. *)
117 if b == b' then s else bts b'
118
119(* duplicated in bytes.ml *)
120let rec index_rec s lim i c =
121 if i >= lim then raise Not_found else
122 if unsafe_get s i = c then i else index_rec s lim (i + 1) c
123
124(* duplicated in bytes.ml *)
125let index s c = index_rec s (length s) 0 c
126
127(* duplicated in bytes.ml *)
128let rec index_rec_opt s lim i c =
129 if i >= lim then None else
130 if unsafe_get s i = c then Some i else index_rec_opt s lim (i + 1) c
131
132(* duplicated in bytes.ml *)
133let index_opt s c = index_rec_opt s (length s) 0 c
134
135(* duplicated in bytes.ml *)
136let index_from s i c =
137 let l = length s in
138 if i < 0 || i > l then invalid_arg "String.index_from / Bytes.index_from" else
139 index_rec s l i c
140
141(* duplicated in bytes.ml *)
142let index_from_opt s i c =
143 let l = length s in
144 if i < 0 || i > l then
145 invalid_arg "String.index_from_opt / Bytes.index_from_opt"
146 else
147 index_rec_opt s l i c
148
149(* duplicated in bytes.ml *)
150let rec rindex_rec s i c =
151 if i < 0 then raise Not_found else
152 if unsafe_get s i = c then i else rindex_rec s (i - 1) c
153
154(* duplicated in bytes.ml *)
155let rindex s c = rindex_rec s (length s - 1) c
156
157(* duplicated in bytes.ml *)
158let rindex_from s i c =
159 if i < -1 || i >= length s then
160 invalid_arg "String.rindex_from / Bytes.rindex_from"
161 else
162 rindex_rec s i c
163
164(* duplicated in bytes.ml *)
165let rec rindex_rec_opt s i c =
166 if i < 0 then None else
167 if unsafe_get s i = c then Some i else rindex_rec_opt s (i - 1) c
168
169(* duplicated in bytes.ml *)
170let rindex_opt s c = rindex_rec_opt s (length s - 1) c
171
172(* duplicated in bytes.ml *)
173let rindex_from_opt s i c =
174 if i < -1 || i >= length s then
175 invalid_arg "String.rindex_from_opt / Bytes.rindex_from_opt"
176 else
177 rindex_rec_opt s i c
178
179(* duplicated in bytes.ml *)
180let contains_from s i c =
181 let l = length s in
182 if i < 0 || i > l then
183 invalid_arg "String.contains_from / Bytes.contains_from"
184 else
185 try ignore (index_rec s l i c); true with Not_found -> false
186
187(* duplicated in bytes.ml *)
188let contains s c = contains_from s 0 c
189
190(* duplicated in bytes.ml *)
191let rcontains_from s i c =
192 if i < 0 || i >= length s then
193 invalid_arg "String.rcontains_from / Bytes.rcontains_from"
194 else
195 try ignore (rindex_rec s i c); true with Not_found -> false
196
197let uppercase_ascii s =
198 B.uppercase_ascii (bos s) |> bts
199let lowercase_ascii s =
200 B.lowercase_ascii (bos s) |> bts
201let capitalize_ascii s =
202 B.capitalize_ascii (bos s) |> bts
203let uncapitalize_ascii s =
204 B.uncapitalize_ascii (bos s) |> bts
205
206(* duplicated in bytes.ml *)
207let starts_with ~prefix s =
208 let len_s = length s
209 and len_pre = length prefix in
210 let rec aux i =
211 if i = len_pre then true
212 else if unsafe_get s i <> unsafe_get prefix i then false
213 else aux (i + 1)
214 in len_s >= len_pre && aux 0
215
216(* duplicated in bytes.ml *)
217let ends_with ~suffix s =
218 let len_s = length s
219 and len_suf = length suffix in
220 let diff = len_s - len_suf in
221 let rec aux i =
222 if i = len_suf then true
223 else if unsafe_get s (diff + i) <> unsafe_get suffix i then false
224 else aux (i + 1)
225 in diff >= 0 && aux 0
226
227external seeded_hash : int -> string -> int = "caml_string_hash" [@@noalloc]
228let hash x = seeded_hash 0 x
229
230(* Splitting with magnitudes *)
231
232let[@inline] subrange ?(first = 0) ?(last = max_int) s =
233 (* assert (Sys.max_string_length - 1 < max_int) *)
234 let max = length s - 1 in
235 let first = if first < 0 then 0 else first in
236 let last = if last > max then max else last in
237 if first > last then "" else sub s first (last - first + 1)
238
239let take_first n s = subrange ~last:(n - 1) s
240let drop_first n s = subrange ~first:n s
241let cut_first n s = (take_first n s, drop_first n s)
242let take_last n s = subrange ~first:(length s - n) s
243let drop_last n s = subrange ~last:(length s - n - 1) s
244let cut_last n s = (drop_last n s, take_last n s)
245
246(* Splitting with predicates *)
247
248let take_first_while sat s =
249 let len = length s and i = ref 0 in
250 while !i < len && sat (unsafe_get s !i) do incr i done;
251 if !i = len then s else sub s 0 !i
252
253let drop_first_while sat s =
254 let len = length s and i = ref 0 in
255 while !i < len && sat (unsafe_get s !i) do incr i done;
256 if !i = 0 then s else sub s !i (len - !i)
257
258let cut_first_while sat s =
259 let len = length s and i = ref 0 in
260 while !i < len && sat (unsafe_get s !i) do incr i done;
261 if !i = len then s, "" else
262 if !i = 0 then "", s else
263 sub s 0 !i, sub s !i (len - !i)
264
265let take_last_while sat s =
266 let len = length s in
267 let i = ref (len - 1) in
268 while !i >= 0 && sat (unsafe_get s !i) do decr i done;
269 if !i < 0 then s else
270 let j = !i + 1 in
271 sub s j (len - j)
272
273let drop_last_while sat s =
274 let len = length s in
275 let i = ref (len - 1) in
276 while !i >= 0 && sat (unsafe_get s !i) do decr i done;
277 if !i < 0 then "" else sub s 0 (!i + 1)
278
279let cut_last_while sat s =
280 let len = length s in
281 let i = ref (len - 1) in
282 while !i >= 0 && sat (unsafe_get s !i) do decr i done;
283 if !i < 0 then "", s else
284 if !i = len - 1 then s, "" else
285 let j = !i + 1 in
286 sub s 0 j, sub s j (len - j)
287
288(* Splitting with separators *)
289
290(* duplicated in bytes.ml *)
291let split_on_char sep s =
292 let r = ref [] in
293 let j = ref (length s) in
294 for i = length s - 1 downto 0 do
295 if unsafe_get s i = sep then begin
296 r := sub s (i + 1) (!j - i - 1) :: !r;
297 j := i
298 end
299 done;
300 sub s 0 !j :: !r
301
302type t = string
303
304let compare (x: t) (y: t) = Stdlib.compare x y
305external equal : string -> string -> bool = "caml_string_equal" [@@noalloc]
306
307(** {1 Iterators} *)
308
309let to_seq s = bos s |> B.to_seq
310
311let to_seqi s = bos s |> B.to_seqi
312
313let of_seq g = B.of_seq g |> bts
314
315(* UTF decoders and validators *)
316
317let get_utf_8_uchar s i = B.get_utf_8_uchar (bos s) i
318let is_valid_utf_8 s = B.is_valid_utf_8 (bos s)
319
320let get_utf_16be_uchar s i = B.get_utf_16be_uchar (bos s) i
321let is_valid_utf_16be s = B.is_valid_utf_16be (bos s)
322
323let get_utf_16le_uchar s i = B.get_utf_16le_uchar (bos s) i
324let is_valid_utf_16le s = B.is_valid_utf_16le (bos s)
325
326(** {6 Binary encoding/decoding of integers} *)
327
328external get_uint8 : string -> int -> int = "%string_safe_get"
329external get_uint16_ne : string -> int -> int = "%caml_string_get16"
330external get_int32_ne : string -> int -> int32 = "%caml_string_get32"
331external get_int64_ne : string -> int -> int64 = "%caml_string_get64"
332
333let get_int8 s i = B.get_int8 (bos s) i
334let get_uint16_le s i = B.get_uint16_le (bos s) i
335let get_uint16_be s i = B.get_uint16_be (bos s) i
336let get_int16_ne s i = B.get_int16_ne (bos s) i
337let get_int16_le s i = B.get_int16_le (bos s) i
338let get_int16_be s i = B.get_int16_be (bos s) i
339let get_int32_le s i = B.get_int32_le (bos s) i
340let get_int32_be s i = B.get_int32_be (bos s) i
341let get_int64_le s i = B.get_int64_le (bos s) i
342let get_int64_be s i = B.get_int64_be (bos s) i
343
344(* Spellchecking *)
345
346let utf_8_uchar_length s =
347 let slen = length s in
348 let i = ref 0 and ulen = ref 0 in
349 while (!i < slen) do
350 let dec_len = Uchar.utf_8_decode_length_of_byte (unsafe_get s !i) in
351 i := (!i + if dec_len = 0 then 1 (* count one Uchar.rep *) else dec_len);
352 incr ulen;
353 done;
354 !ulen
355
356let uchar_array_of_utf_8_string s =
357 let slen = length s in (* is an upper bound on Uchar.t count *)
358 let uchars = Array.make slen Uchar.max in
359 let k = ref 0 and i = ref 0 in
360 while (!i < slen) do
361 let dec = get_utf_8_uchar s !i in
362 i := !i + Uchar.utf_decode_length dec;
363 uchars.(!k) <- Uchar.utf_decode_uchar dec;
364 incr k;
365 done;
366 uchars, !k
367
368let edit_distance' ?(limit = Int.max_int) s (s0, len0) s1 =
369 if limit <= 1 then (if equal s s1 then 0 else limit) else
370 let[@inline] minimum a b c = Int.min a (Int.min b c) in
371 let s1, len1 = uchar_array_of_utf_8_string s1 in
372 let limit = Int.min (Int.max len0 len1) limit in
373 if Int.abs (len1 - len0) >= limit then limit else
374 let s0, s1 = if len0 > len1 then s0, s1 else s1, s0 in
375 let len0, len1 = if len0 > len1 then len0, len1 else len1, len0 in
376 let rec loop row_minus2 row_minus1 row i len0 limit s0 s1 =
377 if i > len0 then row_minus1.(Array.length row_minus1 - 1) else
378 let len1 = Array.length row - 1 in
379 let row_min = ref Int.max_int in
380 row.(0) <- i;
381 let jmax =
382 let jmax = Int.min len1 (i + limit - 1) in
383 if jmax < 0 then (* overflow *) len1 else jmax
384 in
385 for j = Int.max 1 (i - limit) to jmax do
386 let cost = if Uchar.equal s0.(i-1) s1.(j-1) then 0 else 1 in
387 let min = minimum
388 (row_minus1.(j-1) + cost) (* substitute *)
389 (row_minus1.(j) + 1) (* delete *)
390 (row.(j-1) + 1) (* insert *)
391 (* Note when j = i - limit, the latter [row] read makes a bogus read
392 on the value that was in the matrix at d.(i-2).(i - limit - 1).
393 Since by induction for all i,j, d.(i).(j) >= abs (i - j),
394 (row.(j-1) + 1) is greater or equal to [limit] and thus does
395 not affect adversely the minimum computation. *)
396 in
397 let min =
398 if (i > 1 && j > 1 &&
399 Uchar.equal s0.(i-1) s1.(j-2) &&
400 Uchar.equal s0.(i-2) s1.(j-1))
401 then Int.min min (row_minus2.(j-2) + cost) (* transpose *)
402 else min
403 in
404 row.(j) <- min;
405 row_min := Int.min !row_min min;
406 done;
407 if !row_min >= limit then (* can no longer decrease *) limit else
408 loop row_minus1 row row_minus2 (i + 1) len0 limit s0 s1
409 in
410 let ignore =
411 (* Value used to make the values around the diagonal stripe ignored
412 by the min computations when we have a limit. *)
413 limit + 1
414 in
415 let row_minus2 = Array.make (len1 + 1) ignore in
416 let row_minus1 = Array.init (len1 + 1) (fun x -> x) in
417 let row = Array.make (len1 + 1) ignore in
418 let d = loop row_minus2 row_minus1 row 1 len0 limit s0 s1 in
419 if d > limit then limit else d
420
421let edit_distance ?limit s0 s1 =
422 let us0 = uchar_array_of_utf_8_string s0 in
423 edit_distance' ?limit s0 us0 s1
424
425let default_max_dist s = match utf_8_uchar_length s with
426 | 0 | 1 | 2 -> 0
427 | 3 | 4 -> 1
428 | _ -> 2
429
430let spellcheck ?(max_dist = default_max_dist) iter_dict s =
431 let min = ref (max_dist s) in
432 let acc = ref [] in
433 let select_words s us word =
434 let d = edit_distance' ~limit:(!min + 1) s us word in
435 if d = !min then (acc := word :: !acc) else
436 if d < !min then (min := d; acc := [word]) else ()
437 in
438 let us = uchar_array_of_utf_8_string s in
439 iter_dict (select_words s us);
440 List.rev !acc