My working unpac repository
at opam/upstream/seq 440 lines 14 kB view raw
1(**************************************************************************) 2(* *) 3(* OCaml *) 4(* *) 5(* Damien Doligez, projet Gallium, INRIA Rocquencourt *) 6(* *) 7(* Copyright 2014 Institut National de Recherche en Informatique et *) 8(* en Automatique. *) 9(* *) 10(* All rights reserved. This file is distributed under the terms of *) 11(* the GNU Lesser General Public License version 2.1, with the *) 12(* special exception on linking described in the file LICENSE. *) 13(* *) 14(**************************************************************************) 15 16(* String operations, based on byte sequence operations *) 17 18(* WARNING: Some functions in this file are duplicated in bytes.ml for 19 efficiency reasons. When you modify the one in this file you need to 20 modify its duplicate in bytes.ml. 21 These functions have a "duplicated" comment above their definition. 22*) 23 24external length : string -> int = "%string_length" 25external get : string -> int -> char = "%string_safe_get" 26external unsafe_get : string -> int -> char = "%string_unsafe_get" 27external unsafe_blit : string -> int -> bytes -> int -> int -> unit 28 = "caml_blit_string" [@@noalloc] 29 30module B = Bytes 31 32let bts = B.unsafe_to_string 33let bos = B.unsafe_of_string 34 35let make n c = 36 B.make n c |> bts 37let init n f = 38 B.init n f |> bts 39let empty = "" 40let of_bytes = B.to_string 41let to_bytes = B.of_string 42let sub s ofs len = 43 if ofs = 0 && length s = len then s else 44 B.sub (bos s) ofs len |> bts 45let blit = 46 B.blit_string 47 48let ensure_ge (x:int) y = if x >= y then x else invalid_arg "String.concat" 49 50let rec sum_lengths acc seplen = function 51 | [] -> acc 52 | hd :: [] -> length hd + acc 53 | hd :: tl -> sum_lengths (ensure_ge (length hd + seplen + acc) acc) seplen tl 54 55let rec unsafe_blits dst pos sep seplen = function 56 [] -> dst 57 | hd :: [] -> 58 unsafe_blit hd 0 dst pos (length hd); dst 59 | hd :: tl -> 60 unsafe_blit hd 0 dst pos (length hd); 61 unsafe_blit sep 0 dst (pos + length hd) seplen; 62 unsafe_blits dst (pos + length hd + seplen) sep seplen tl 63 64let concat sep = function 65 [] -> "" 66 | [s] -> s 67 | l -> let seplen = length sep in bts @@ 68 unsafe_blits 69 (B.create (sum_lengths 0 seplen l)) 70 0 sep seplen l 71 72let cat = ( ^ ) 73 74(* duplicated in bytes.ml *) 75let iter f s = 76 for i = 0 to length s - 1 do f (unsafe_get s i) done 77 78(* duplicated in bytes.ml *) 79let iteri f s = 80 for i = 0 to length s - 1 do f i (unsafe_get s i) done 81 82let map f s = 83 B.map f (bos s) |> bts 84let mapi f s = 85 B.mapi f (bos s) |> bts 86let fold_right f x a = 87 B.fold_right f (bos x) a 88let fold_left f a x = 89 B.fold_left f a (bos x) 90let exists f s = 91 B.exists f (bos s) 92let for_all f s = 93 B.for_all f (bos s) 94 95(* Beware: we cannot use B.trim or B.escape because they always make a 96 copy, but String.mli spells out some cases where we are not allowed 97 to make a copy. *) 98 99let is_space = function 100 | ' ' | '\012' | '\n' | '\r' | '\t' -> true 101 | _ -> false 102 103let trim s = 104 if s = "" then s 105 else if is_space (unsafe_get s 0) || is_space (unsafe_get s (length s - 1)) 106 then bts (B.trim (bos s)) 107 else s 108 109let escaped s = 110 let b = bos s in 111 (* We satisfy [unsafe_escape]'s precondition by passing an 112 immutable byte sequence [b]. *) 113 let b' = B.unsafe_escape b in 114 (* With js_of_ocaml, [bos] and [bts] are not the identity. 115 We can avoid a [bts] conversion if [unsafe_escape] returned 116 its argument. *) 117 if b == b' then s else bts b' 118 119(* duplicated in bytes.ml *) 120let rec index_rec s lim i c = 121 if i >= lim then raise Not_found else 122 if unsafe_get s i = c then i else index_rec s lim (i + 1) c 123 124(* duplicated in bytes.ml *) 125let index s c = index_rec s (length s) 0 c 126 127(* duplicated in bytes.ml *) 128let rec index_rec_opt s lim i c = 129 if i >= lim then None else 130 if unsafe_get s i = c then Some i else index_rec_opt s lim (i + 1) c 131 132(* duplicated in bytes.ml *) 133let index_opt s c = index_rec_opt s (length s) 0 c 134 135(* duplicated in bytes.ml *) 136let index_from s i c = 137 let l = length s in 138 if i < 0 || i > l then invalid_arg "String.index_from / Bytes.index_from" else 139 index_rec s l i c 140 141(* duplicated in bytes.ml *) 142let index_from_opt s i c = 143 let l = length s in 144 if i < 0 || i > l then 145 invalid_arg "String.index_from_opt / Bytes.index_from_opt" 146 else 147 index_rec_opt s l i c 148 149(* duplicated in bytes.ml *) 150let rec rindex_rec s i c = 151 if i < 0 then raise Not_found else 152 if unsafe_get s i = c then i else rindex_rec s (i - 1) c 153 154(* duplicated in bytes.ml *) 155let rindex s c = rindex_rec s (length s - 1) c 156 157(* duplicated in bytes.ml *) 158let rindex_from s i c = 159 if i < -1 || i >= length s then 160 invalid_arg "String.rindex_from / Bytes.rindex_from" 161 else 162 rindex_rec s i c 163 164(* duplicated in bytes.ml *) 165let rec rindex_rec_opt s i c = 166 if i < 0 then None else 167 if unsafe_get s i = c then Some i else rindex_rec_opt s (i - 1) c 168 169(* duplicated in bytes.ml *) 170let rindex_opt s c = rindex_rec_opt s (length s - 1) c 171 172(* duplicated in bytes.ml *) 173let rindex_from_opt s i c = 174 if i < -1 || i >= length s then 175 invalid_arg "String.rindex_from_opt / Bytes.rindex_from_opt" 176 else 177 rindex_rec_opt s i c 178 179(* duplicated in bytes.ml *) 180let contains_from s i c = 181 let l = length s in 182 if i < 0 || i > l then 183 invalid_arg "String.contains_from / Bytes.contains_from" 184 else 185 try ignore (index_rec s l i c); true with Not_found -> false 186 187(* duplicated in bytes.ml *) 188let contains s c = contains_from s 0 c 189 190(* duplicated in bytes.ml *) 191let rcontains_from s i c = 192 if i < 0 || i >= length s then 193 invalid_arg "String.rcontains_from / Bytes.rcontains_from" 194 else 195 try ignore (rindex_rec s i c); true with Not_found -> false 196 197let uppercase_ascii s = 198 B.uppercase_ascii (bos s) |> bts 199let lowercase_ascii s = 200 B.lowercase_ascii (bos s) |> bts 201let capitalize_ascii s = 202 B.capitalize_ascii (bos s) |> bts 203let uncapitalize_ascii s = 204 B.uncapitalize_ascii (bos s) |> bts 205 206(* duplicated in bytes.ml *) 207let starts_with ~prefix s = 208 let len_s = length s 209 and len_pre = length prefix in 210 let rec aux i = 211 if i = len_pre then true 212 else if unsafe_get s i <> unsafe_get prefix i then false 213 else aux (i + 1) 214 in len_s >= len_pre && aux 0 215 216(* duplicated in bytes.ml *) 217let ends_with ~suffix s = 218 let len_s = length s 219 and len_suf = length suffix in 220 let diff = len_s - len_suf in 221 let rec aux i = 222 if i = len_suf then true 223 else if unsafe_get s (diff + i) <> unsafe_get suffix i then false 224 else aux (i + 1) 225 in diff >= 0 && aux 0 226 227external seeded_hash : int -> string -> int = "caml_string_hash" [@@noalloc] 228let hash x = seeded_hash 0 x 229 230(* Splitting with magnitudes *) 231 232let[@inline] subrange ?(first = 0) ?(last = max_int) s = 233 (* assert (Sys.max_string_length - 1 < max_int) *) 234 let max = length s - 1 in 235 let first = if first < 0 then 0 else first in 236 let last = if last > max then max else last in 237 if first > last then "" else sub s first (last - first + 1) 238 239let take_first n s = subrange ~last:(n - 1) s 240let drop_first n s = subrange ~first:n s 241let cut_first n s = (take_first n s, drop_first n s) 242let take_last n s = subrange ~first:(length s - n) s 243let drop_last n s = subrange ~last:(length s - n - 1) s 244let cut_last n s = (drop_last n s, take_last n s) 245 246(* Splitting with predicates *) 247 248let take_first_while sat s = 249 let len = length s and i = ref 0 in 250 while !i < len && sat (unsafe_get s !i) do incr i done; 251 if !i = len then s else sub s 0 !i 252 253let drop_first_while sat s = 254 let len = length s and i = ref 0 in 255 while !i < len && sat (unsafe_get s !i) do incr i done; 256 if !i = 0 then s else sub s !i (len - !i) 257 258let cut_first_while sat s = 259 let len = length s and i = ref 0 in 260 while !i < len && sat (unsafe_get s !i) do incr i done; 261 if !i = len then s, "" else 262 if !i = 0 then "", s else 263 sub s 0 !i, sub s !i (len - !i) 264 265let take_last_while sat s = 266 let len = length s in 267 let i = ref (len - 1) in 268 while !i >= 0 && sat (unsafe_get s !i) do decr i done; 269 if !i < 0 then s else 270 let j = !i + 1 in 271 sub s j (len - j) 272 273let drop_last_while sat s = 274 let len = length s in 275 let i = ref (len - 1) in 276 while !i >= 0 && sat (unsafe_get s !i) do decr i done; 277 if !i < 0 then "" else sub s 0 (!i + 1) 278 279let cut_last_while sat s = 280 let len = length s in 281 let i = ref (len - 1) in 282 while !i >= 0 && sat (unsafe_get s !i) do decr i done; 283 if !i < 0 then "", s else 284 if !i = len - 1 then s, "" else 285 let j = !i + 1 in 286 sub s 0 j, sub s j (len - j) 287 288(* Splitting with separators *) 289 290(* duplicated in bytes.ml *) 291let split_on_char sep s = 292 let r = ref [] in 293 let j = ref (length s) in 294 for i = length s - 1 downto 0 do 295 if unsafe_get s i = sep then begin 296 r := sub s (i + 1) (!j - i - 1) :: !r; 297 j := i 298 end 299 done; 300 sub s 0 !j :: !r 301 302type t = string 303 304let compare (x: t) (y: t) = Stdlib.compare x y 305external equal : string -> string -> bool = "caml_string_equal" [@@noalloc] 306 307(** {1 Iterators} *) 308 309let to_seq s = bos s |> B.to_seq 310 311let to_seqi s = bos s |> B.to_seqi 312 313let of_seq g = B.of_seq g |> bts 314 315(* UTF decoders and validators *) 316 317let get_utf_8_uchar s i = B.get_utf_8_uchar (bos s) i 318let is_valid_utf_8 s = B.is_valid_utf_8 (bos s) 319 320let get_utf_16be_uchar s i = B.get_utf_16be_uchar (bos s) i 321let is_valid_utf_16be s = B.is_valid_utf_16be (bos s) 322 323let get_utf_16le_uchar s i = B.get_utf_16le_uchar (bos s) i 324let is_valid_utf_16le s = B.is_valid_utf_16le (bos s) 325 326(** {6 Binary encoding/decoding of integers} *) 327 328external get_uint8 : string -> int -> int = "%string_safe_get" 329external get_uint16_ne : string -> int -> int = "%caml_string_get16" 330external get_int32_ne : string -> int -> int32 = "%caml_string_get32" 331external get_int64_ne : string -> int -> int64 = "%caml_string_get64" 332 333let get_int8 s i = B.get_int8 (bos s) i 334let get_uint16_le s i = B.get_uint16_le (bos s) i 335let get_uint16_be s i = B.get_uint16_be (bos s) i 336let get_int16_ne s i = B.get_int16_ne (bos s) i 337let get_int16_le s i = B.get_int16_le (bos s) i 338let get_int16_be s i = B.get_int16_be (bos s) i 339let get_int32_le s i = B.get_int32_le (bos s) i 340let get_int32_be s i = B.get_int32_be (bos s) i 341let get_int64_le s i = B.get_int64_le (bos s) i 342let get_int64_be s i = B.get_int64_be (bos s) i 343 344(* Spellchecking *) 345 346let utf_8_uchar_length s = 347 let slen = length s in 348 let i = ref 0 and ulen = ref 0 in 349 while (!i < slen) do 350 let dec_len = Uchar.utf_8_decode_length_of_byte (unsafe_get s !i) in 351 i := (!i + if dec_len = 0 then 1 (* count one Uchar.rep *) else dec_len); 352 incr ulen; 353 done; 354 !ulen 355 356let uchar_array_of_utf_8_string s = 357 let slen = length s in (* is an upper bound on Uchar.t count *) 358 let uchars = Array.make slen Uchar.max in 359 let k = ref 0 and i = ref 0 in 360 while (!i < slen) do 361 let dec = get_utf_8_uchar s !i in 362 i := !i + Uchar.utf_decode_length dec; 363 uchars.(!k) <- Uchar.utf_decode_uchar dec; 364 incr k; 365 done; 366 uchars, !k 367 368let edit_distance' ?(limit = Int.max_int) s (s0, len0) s1 = 369 if limit <= 1 then (if equal s s1 then 0 else limit) else 370 let[@inline] minimum a b c = Int.min a (Int.min b c) in 371 let s1, len1 = uchar_array_of_utf_8_string s1 in 372 let limit = Int.min (Int.max len0 len1) limit in 373 if Int.abs (len1 - len0) >= limit then limit else 374 let s0, s1 = if len0 > len1 then s0, s1 else s1, s0 in 375 let len0, len1 = if len0 > len1 then len0, len1 else len1, len0 in 376 let rec loop row_minus2 row_minus1 row i len0 limit s0 s1 = 377 if i > len0 then row_minus1.(Array.length row_minus1 - 1) else 378 let len1 = Array.length row - 1 in 379 let row_min = ref Int.max_int in 380 row.(0) <- i; 381 let jmax = 382 let jmax = Int.min len1 (i + limit - 1) in 383 if jmax < 0 then (* overflow *) len1 else jmax 384 in 385 for j = Int.max 1 (i - limit) to jmax do 386 let cost = if Uchar.equal s0.(i-1) s1.(j-1) then 0 else 1 in 387 let min = minimum 388 (row_minus1.(j-1) + cost) (* substitute *) 389 (row_minus1.(j) + 1) (* delete *) 390 (row.(j-1) + 1) (* insert *) 391 (* Note when j = i - limit, the latter [row] read makes a bogus read 392 on the value that was in the matrix at d.(i-2).(i - limit - 1). 393 Since by induction for all i,j, d.(i).(j) >= abs (i - j), 394 (row.(j-1) + 1) is greater or equal to [limit] and thus does 395 not affect adversely the minimum computation. *) 396 in 397 let min = 398 if (i > 1 && j > 1 && 399 Uchar.equal s0.(i-1) s1.(j-2) && 400 Uchar.equal s0.(i-2) s1.(j-1)) 401 then Int.min min (row_minus2.(j-2) + cost) (* transpose *) 402 else min 403 in 404 row.(j) <- min; 405 row_min := Int.min !row_min min; 406 done; 407 if !row_min >= limit then (* can no longer decrease *) limit else 408 loop row_minus1 row row_minus2 (i + 1) len0 limit s0 s1 409 in 410 let ignore = 411 (* Value used to make the values around the diagonal stripe ignored 412 by the min computations when we have a limit. *) 413 limit + 1 414 in 415 let row_minus2 = Array.make (len1 + 1) ignore in 416 let row_minus1 = Array.init (len1 + 1) (fun x -> x) in 417 let row = Array.make (len1 + 1) ignore in 418 let d = loop row_minus2 row_minus1 row 1 len0 limit s0 s1 in 419 if d > limit then limit else d 420 421let edit_distance ?limit s0 s1 = 422 let us0 = uchar_array_of_utf_8_string s0 in 423 edit_distance' ?limit s0 us0 s1 424 425let default_max_dist s = match utf_8_uchar_length s with 426 | 0 | 1 | 2 -> 0 427 | 3 | 4 -> 1 428 | _ -> 2 429 430let spellcheck ?(max_dist = default_max_dist) iter_dict s = 431 let min = ref (max_dist s) in 432 let acc = ref [] in 433 let select_words s us word = 434 let d = edit_distance' ~limit:(!min + 1) s us word in 435 if d = !min then (acc := word :: !acc) else 436 if d < !min then (min := d; acc := [word]) else () 437 in 438 let us = uchar_array_of_utf_8_string s in 439 iter_dict (select_words s us); 440 List.rev !acc