My working unpac repository
at opam/upstream/seq 848 lines 27 kB view raw
1(**************************************************************************) 2(* *) 3(* OCaml *) 4(* *) 5(* Xavier Leroy, projet Cristal, INRIA Rocquencourt *) 6(* *) 7(* Copyright 1996 Institut National de Recherche en Informatique et *) 8(* en Automatique. *) 9(* *) 10(* All rights reserved. This file is distributed under the terms of *) 11(* the GNU Lesser General Public License version 2.1, with the *) 12(* special exception on linking described in the file LICENSE. *) 13(* *) 14(**************************************************************************) 15 16(* Byte sequence operations *) 17 18(* WARNING: Some functions in this file are duplicated in string.ml for 19 efficiency reasons. When you modify the one in this file you need to 20 modify its duplicate in string.ml. 21 These functions have a "duplicated" comment above their definition. 22*) 23 24external length : bytes -> int = "%bytes_length" 25external string_length : string -> int = "%string_length" 26external get : bytes -> int -> char = "%bytes_safe_get" 27external set : bytes -> int -> char -> unit = "%bytes_safe_set" 28external create : int -> bytes = "caml_create_bytes" 29external unsafe_get : bytes -> int -> char = "%bytes_unsafe_get" 30external unsafe_set : bytes -> int -> char -> unit = "%bytes_unsafe_set" 31external unsafe_fill : bytes -> int -> int -> char -> unit 32 = "caml_fill_bytes" [@@noalloc] 33external unsafe_to_string : bytes -> string = "%bytes_to_string" 34external unsafe_of_string : string -> bytes = "%bytes_of_string" 35 36external unsafe_blit : bytes -> int -> bytes -> int -> int -> unit 37 = "caml_blit_bytes" [@@noalloc] 38external unsafe_blit_string : string -> int -> bytes -> int -> int -> unit 39 = "caml_blit_string" [@@noalloc] 40 41let make n c = 42 let s = create n in 43 unsafe_fill s 0 n c; 44 s 45 46let init n f = 47 let s = create n in 48 for i = 0 to n - 1 do 49 unsafe_set s i (f i) 50 done; 51 s 52 53let empty = create 0 54 55let copy s = 56 let len = length s in 57 let r = create len in 58 unsafe_blit s 0 r 0 len; 59 r 60 61let to_string b = unsafe_to_string (copy b) 62let of_string s = copy (unsafe_of_string s) 63 64let sub s ofs len = 65 if ofs < 0 || len < 0 || ofs > length s - len 66 then invalid_arg "String.sub / Bytes.sub" 67 else begin 68 let r = create len in 69 unsafe_blit s ofs r 0 len; 70 r 71 end 72 73let sub_string b ofs len = unsafe_to_string (sub b ofs len) 74 75(* addition with an overflow check *) 76let (++) a b = 77 let c = a + b in 78 match a < 0, b < 0, c < 0 with 79 | true , true , false 80 | false, false, true -> invalid_arg "Bytes.extend" (* overflow *) 81 | _ -> c 82 83let extend s left right = 84 let len = length s ++ left ++ right in 85 let r = create len in 86 let (srcoff, dstoff) = if left < 0 then -left, 0 else 0, left in 87 let cpylen = Int.min (length s - srcoff) (len - dstoff) in 88 if cpylen > 0 then unsafe_blit s srcoff r dstoff cpylen; 89 r 90 91let fill s ofs len c = 92 if ofs < 0 || len < 0 || ofs > length s - len 93 then invalid_arg "String.fill / Bytes.fill" 94 else unsafe_fill s ofs len c 95 96let blit s1 ofs1 s2 ofs2 len = 97 if len < 0 || ofs1 < 0 || ofs1 > length s1 - len 98 || ofs2 < 0 || ofs2 > length s2 - len 99 then invalid_arg "Bytes.blit" 100 else unsafe_blit s1 ofs1 s2 ofs2 len 101 102let blit_string s1 ofs1 s2 ofs2 len = 103 if len < 0 || ofs1 < 0 || ofs1 > string_length s1 - len 104 || ofs2 < 0 || ofs2 > length s2 - len 105 then invalid_arg "String.blit / Bytes.blit_string" 106 else unsafe_blit_string s1 ofs1 s2 ofs2 len 107 108(* duplicated in string.ml *) 109let iter f a = 110 for i = 0 to length a - 1 do f(unsafe_get a i) done 111 112(* duplicated in string.ml *) 113let iteri f a = 114 for i = 0 to length a - 1 do f i (unsafe_get a i) done 115 116let ensure_ge (x:int) y = if x >= y then x else invalid_arg "Bytes.concat" 117 118let rec sum_lengths acc seplen = function 119 | [] -> acc 120 | hd :: [] -> length hd + acc 121 | hd :: tl -> sum_lengths (ensure_ge (length hd + seplen + acc) acc) seplen tl 122 123let rec unsafe_blits dst pos sep seplen = function 124 [] -> dst 125 | hd :: [] -> 126 unsafe_blit hd 0 dst pos (length hd); dst 127 | hd :: tl -> 128 unsafe_blit hd 0 dst pos (length hd); 129 unsafe_blit sep 0 dst (pos + length hd) seplen; 130 unsafe_blits dst (pos + length hd + seplen) sep seplen tl 131 132let concat sep = function 133 [] -> empty 134 | l -> let seplen = length sep in 135 unsafe_blits 136 (create (sum_lengths 0 seplen l)) 137 0 sep seplen l 138 139let cat s1 s2 = 140 let l1 = length s1 in 141 let l2 = length s2 in 142 let r = create (l1 + l2) in 143 unsafe_blit s1 0 r 0 l1; 144 unsafe_blit s2 0 r l1 l2; 145 r 146 147 148external char_code: char -> int = "%identity" 149external char_chr: int -> char = "%identity" 150 151let is_space = function 152 | ' ' | '\012' | '\n' | '\r' | '\t' -> true 153 | _ -> false 154 155let trim s = 156 let len = length s in 157 let i = ref 0 in 158 while !i < len && is_space (unsafe_get s !i) do 159 incr i 160 done; 161 let j = ref (len - 1) in 162 while !j >= !i && is_space (unsafe_get s !j) do 163 decr j 164 done; 165 if !j >= !i then 166 sub s !i (!j - !i + 1) 167 else 168 empty 169 170let unsafe_escape s = 171 (* We perform two passes on the input sequence, one to compute the 172 result size and one to write the result. 173 174 #11508, #11509: This logic would be incorrect in presence of 175 concurrent modification to the input, making the use of 176 [unsafe_set] below memory-unsafe. 177 178 Precondition: This function may be safely called on: 179 - an immutable byte sequence 180 - a uniquely-owned byte sequence (the function takes ownership) 181 182 In either case we return a uniquely-owned byte sequence. 183 *) 184 let n = ref 0 in 185 for i = 0 to length s - 1 do 186 n := !n + 187 (match unsafe_get s i with 188 | '\"' | '\\' | '\n' | '\t' | '\r' | '\b' -> 2 189 | ' ' .. '~' -> 1 190 | _ -> 4) 191 done; 192 if !n = length s then s 193 else begin 194 let s' = create !n in 195 n := 0; 196 for i = 0 to length s - 1 do 197 begin match unsafe_get s i with 198 | ('\"' | '\\') as c -> 199 unsafe_set s' !n '\\'; incr n; unsafe_set s' !n c 200 | '\n' -> 201 unsafe_set s' !n '\\'; incr n; unsafe_set s' !n 'n' 202 | '\t' -> 203 unsafe_set s' !n '\\'; incr n; unsafe_set s' !n 't' 204 | '\r' -> 205 unsafe_set s' !n '\\'; incr n; unsafe_set s' !n 'r' 206 | '\b' -> 207 unsafe_set s' !n '\\'; incr n; unsafe_set s' !n 'b' 208 | (' ' .. '~') as c -> unsafe_set s' !n c 209 | c -> 210 let a = char_code c in 211 unsafe_set s' !n '\\'; 212 incr n; 213 unsafe_set s' !n (char_chr (48 + a / 100)); 214 incr n; 215 unsafe_set s' !n (char_chr (48 + (a / 10) mod 10)); 216 incr n; 217 unsafe_set s' !n (char_chr (48 + a mod 10)); 218 end; 219 incr n 220 done; 221 s' 222 end 223 224let escaped b = 225 let b = copy b in 226 (* We copy our input to obtain a uniquely-owned byte sequence [b] 227 to satisfy [unsafe_escape]'s precondition *) 228 unsafe_escape b 229 230let map f s = 231 let l = length s in 232 if l = 0 then s else begin 233 let r = create l in 234 for i = 0 to l - 1 do unsafe_set r i (f (unsafe_get s i)) done; 235 r 236 end 237 238let mapi f s = 239 let l = length s in 240 if l = 0 then s else begin 241 let r = create l in 242 for i = 0 to l - 1 do unsafe_set r i (f i (unsafe_get s i)) done; 243 r 244 end 245 246let fold_left f x a = 247 let r = ref x in 248 for i = 0 to length a - 1 do 249 r := f !r (unsafe_get a i) 250 done; 251 !r 252 253let fold_right f a x = 254 let r = ref x in 255 for i = length a - 1 downto 0 do 256 r := f (unsafe_get a i) !r 257 done; 258 !r 259 260let exists p s = 261 let n = length s in 262 let rec loop i = 263 if i = n then false 264 else if p (unsafe_get s i) then true 265 else loop (succ i) in 266 loop 0 267 268let for_all p s = 269 let n = length s in 270 let rec loop i = 271 if i = n then true 272 else if p (unsafe_get s i) then loop (succ i) 273 else false in 274 loop 0 275 276let uppercase_ascii s = map Char.uppercase_ascii s 277let lowercase_ascii s = map Char.lowercase_ascii s 278 279let apply1 f s = 280 if length s = 0 then s else begin 281 let r = copy s in 282 unsafe_set r 0 (f(unsafe_get s 0)); 283 r 284 end 285 286let capitalize_ascii s = apply1 Char.uppercase_ascii s 287let uncapitalize_ascii s = apply1 Char.lowercase_ascii s 288 289(* duplicated in string.ml *) 290let starts_with ~prefix s = 291 let len_s = length s 292 and len_pre = length prefix in 293 let rec aux i = 294 if i = len_pre then true 295 else if unsafe_get s i <> unsafe_get prefix i then false 296 else aux (i + 1) 297 in len_s >= len_pre && aux 0 298 299(* duplicated in string.ml *) 300let ends_with ~suffix s = 301 let len_s = length s 302 and len_suf = length suffix in 303 let diff = len_s - len_suf in 304 let rec aux i = 305 if i = len_suf then true 306 else if unsafe_get s (diff + i) <> unsafe_get suffix i then false 307 else aux (i + 1) 308 in diff >= 0 && aux 0 309 310(* duplicated in string.ml *) 311let rec index_rec s lim i c = 312 if i >= lim then raise Not_found else 313 if unsafe_get s i = c then i else index_rec s lim (i + 1) c 314 315(* duplicated in string.ml *) 316let index s c = index_rec s (length s) 0 c 317 318(* duplicated in string.ml *) 319let rec index_rec_opt s lim i c = 320 if i >= lim then None else 321 if unsafe_get s i = c then Some i else index_rec_opt s lim (i + 1) c 322 323(* duplicated in string.ml *) 324let index_opt s c = index_rec_opt s (length s) 0 c 325 326(* duplicated in string.ml *) 327let index_from s i c = 328 let l = length s in 329 if i < 0 || i > l then invalid_arg "String.index_from / Bytes.index_from" else 330 index_rec s l i c 331 332(* duplicated in string.ml *) 333let index_from_opt s i c = 334 let l = length s in 335 if i < 0 || i > l then 336 invalid_arg "String.index_from_opt / Bytes.index_from_opt" 337 else 338 index_rec_opt s l i c 339 340(* duplicated in string.ml *) 341let rec rindex_rec s i c = 342 if i < 0 then raise Not_found else 343 if unsafe_get s i = c then i else rindex_rec s (i - 1) c 344 345(* duplicated in string.ml *) 346let rindex s c = rindex_rec s (length s - 1) c 347 348(* duplicated in string.ml *) 349let rindex_from s i c = 350 if i < -1 || i >= length s then 351 invalid_arg "String.rindex_from / Bytes.rindex_from" 352 else 353 rindex_rec s i c 354 355(* duplicated in string.ml *) 356let rec rindex_rec_opt s i c = 357 if i < 0 then None else 358 if unsafe_get s i = c then Some i else rindex_rec_opt s (i - 1) c 359 360(* duplicated in string.ml *) 361let rindex_opt s c = rindex_rec_opt s (length s - 1) c 362 363(* duplicated in string.ml *) 364let rindex_from_opt s i c = 365 if i < -1 || i >= length s then 366 invalid_arg "String.rindex_from_opt / Bytes.rindex_from_opt" 367 else 368 rindex_rec_opt s i c 369 370 371(* duplicated in string.ml *) 372let contains_from s i c = 373 let l = length s in 374 if i < 0 || i > l then 375 invalid_arg "String.contains_from / Bytes.contains_from" 376 else 377 try ignore (index_rec s l i c); true with Not_found -> false 378 379 380(* duplicated in string.ml *) 381let contains s c = contains_from s 0 c 382 383(* duplicated in string.ml *) 384let rcontains_from s i c = 385 if i < 0 || i >= length s then 386 invalid_arg "String.rcontains_from / Bytes.rcontains_from" 387 else 388 try ignore (rindex_rec s i c); true with Not_found -> false 389 390 391type t = bytes 392 393let compare (x: t) (y: t) = Stdlib.compare x y 394external equal : t -> t -> bool = "caml_bytes_equal" [@@noalloc] 395 396(* duplicated in string.ml *) 397let split_on_char sep s = 398 let r = ref [] in 399 let j = ref (length s) in 400 for i = length s - 1 downto 0 do 401 if unsafe_get s i = sep then begin 402 r := sub s (i + 1) (!j - i - 1) :: !r; 403 j := i 404 end 405 done; 406 sub s 0 !j :: !r 407 408(** {1 Iterators} *) 409 410let to_seq s = 411 let rec aux i () = 412 if i = length s then Seq.Nil 413 else 414 let x = get s i in 415 Seq.Cons (x, aux (i+1)) 416 in 417 aux 0 418 419let to_seqi s = 420 let rec aux i () = 421 if i = length s then Seq.Nil 422 else 423 let x = get s i in 424 Seq.Cons ((i,x), aux (i+1)) 425 in 426 aux 0 427 428let of_seq i = 429 let n = ref 0 in 430 let buf = ref (make 256 '\000') in 431 let resize () = 432 (* resize *) 433 let new_len = Int.min (2 * length !buf) Sys.max_string_length in 434 if length !buf = new_len then failwith "Bytes.of_seq: cannot grow bytes"; 435 let new_buf = make new_len '\000' in 436 blit !buf 0 new_buf 0 !n; 437 buf := new_buf 438 in 439 Seq.iter 440 (fun c -> 441 if !n = length !buf then resize(); 442 set !buf !n c; 443 incr n) 444 i; 445 sub !buf 0 !n 446 447(** {6 Binary encoding/decoding of integers} *) 448 449(* The get_ functions are all duplicated in string.ml *) 450 451external unsafe_get_uint8 : bytes -> int -> int = "%bytes_unsafe_get" 452external unsafe_get_uint16_ne : bytes -> int -> int = "%caml_bytes_get16u" 453external get_uint8 : bytes -> int -> int = "%bytes_safe_get" 454external get_uint16_ne : bytes -> int -> int = "%caml_bytes_get16" 455external get_int32_ne : bytes -> int -> int32 = "%caml_bytes_get32" 456external get_int64_ne : bytes -> int -> int64 = "%caml_bytes_get64" 457 458external unsafe_set_uint8 : bytes -> int -> int -> unit = "%bytes_unsafe_set" 459external unsafe_set_uint16_ne : bytes -> int -> int -> unit 460 = "%caml_bytes_set16u" 461external set_int8 : bytes -> int -> int -> unit = "%bytes_safe_set" 462external set_int16_ne : bytes -> int -> int -> unit = "%caml_bytes_set16" 463external set_int32_ne : bytes -> int -> int32 -> unit = "%caml_bytes_set32" 464external set_int64_ne : bytes -> int -> int64 -> unit = "%caml_bytes_set64" 465external swap16 : int -> int = "%bswap16" 466external swap32 : int32 -> int32 = "%bswap_int32" 467external swap64 : int64 -> int64 = "%bswap_int64" 468 469let unsafe_get_uint16_le b i = 470 if Sys.big_endian 471 then swap16 (unsafe_get_uint16_ne b i) 472 else unsafe_get_uint16_ne b i 473 474let unsafe_get_uint16_be b i = 475 if Sys.big_endian 476 then unsafe_get_uint16_ne b i 477 else swap16 (unsafe_get_uint16_ne b i) 478 479let get_int8 b i = 480 ((get_uint8 b i) lsl (Sys.int_size - 8)) asr (Sys.int_size - 8) 481 482let get_uint16_le b i = 483 if Sys.big_endian then swap16 (get_uint16_ne b i) 484 else get_uint16_ne b i 485 486let get_uint16_be b i = 487 if not Sys.big_endian then swap16 (get_uint16_ne b i) 488 else get_uint16_ne b i 489 490let get_int16_ne b i = 491 ((get_uint16_ne b i) lsl (Sys.int_size - 16)) asr (Sys.int_size - 16) 492 493let get_int16_le b i = 494 ((get_uint16_le b i) lsl (Sys.int_size - 16)) asr (Sys.int_size - 16) 495 496let get_int16_be b i = 497 ((get_uint16_be b i) lsl (Sys.int_size - 16)) asr (Sys.int_size - 16) 498 499let get_int32_le b i = 500 if Sys.big_endian then swap32 (get_int32_ne b i) 501 else get_int32_ne b i 502 503let get_int32_be b i = 504 if not Sys.big_endian then swap32 (get_int32_ne b i) 505 else get_int32_ne b i 506 507let get_int64_le b i = 508 if Sys.big_endian then swap64 (get_int64_ne b i) 509 else get_int64_ne b i 510 511let get_int64_be b i = 512 if not Sys.big_endian then swap64 (get_int64_ne b i) 513 else get_int64_ne b i 514 515let unsafe_set_uint16_le b i x = 516 if Sys.big_endian 517 then unsafe_set_uint16_ne b i (swap16 x) 518 else unsafe_set_uint16_ne b i x 519 520let unsafe_set_uint16_be b i x = 521 if Sys.big_endian 522 then unsafe_set_uint16_ne b i x else 523 unsafe_set_uint16_ne b i (swap16 x) 524 525let set_int16_le b i x = 526 if Sys.big_endian then set_int16_ne b i (swap16 x) 527 else set_int16_ne b i x 528 529let set_int16_be b i x = 530 if not Sys.big_endian then set_int16_ne b i (swap16 x) 531 else set_int16_ne b i x 532 533let set_int32_le b i x = 534 if Sys.big_endian then set_int32_ne b i (swap32 x) 535 else set_int32_ne b i x 536 537let set_int32_be b i x = 538 if not Sys.big_endian then set_int32_ne b i (swap32 x) 539 else set_int32_ne b i x 540 541let set_int64_le b i x = 542 if Sys.big_endian then set_int64_ne b i (swap64 x) 543 else set_int64_ne b i x 544 545let set_int64_be b i x = 546 if not Sys.big_endian then set_int64_ne b i (swap64 x) 547 else set_int64_ne b i x 548 549let set_uint8 = set_int8 550let set_uint16_ne = set_int16_ne 551let set_uint16_be = set_int16_be 552let set_uint16_le = set_int16_le 553 554(* UTF codecs and validations *) 555 556let dec_invalid = Uchar.utf_decode_invalid 557let[@inline] dec_ret n u = Uchar.utf_decode n (Uchar.unsafe_of_int u) 558 559(* In case of decoding error, if we error on the first byte, we 560 consume the byte, otherwise we consume the [n] bytes preceding 561 the erroring byte. 562 563 This means that if a client uses decodes without caring about 564 validity it naturally replace bogus data with Uchar.rep according 565 to the WHATWG Encoding standard. Other schemes are possible by 566 consulting the number of used bytes on invalid decodes. For more 567 details see https://hsivonen.fi/broken-utf-8/ 568 569 For this reason in [get_utf_8_uchar] we gradually check the next 570 byte is available rather than doing it immediately after the 571 first byte. Contrast with [is_valid_utf_8]. *) 572 573(* UTF-8 *) 574 575let[@inline] not_in_x80_to_xBF b = b lsr 6 <> 0b10 576let[@inline] not_in_xA0_to_xBF b = b lsr 5 <> 0b101 577let[@inline] not_in_x80_to_x9F b = b lsr 5 <> 0b100 578let[@inline] not_in_x90_to_xBF b = b < 0x90 || 0xBF < b 579let[@inline] not_in_x80_to_x8F b = b lsr 4 <> 0x8 580 581let[@inline] utf_8_uchar_2 b0 b1 = 582 ((b0 land 0x1F) lsl 6) lor 583 ((b1 land 0x3F)) 584 585let[@inline] utf_8_uchar_3 b0 b1 b2 = 586 ((b0 land 0x0F) lsl 12) lor 587 ((b1 land 0x3F) lsl 6) lor 588 ((b2 land 0x3F)) 589 590let[@inline] utf_8_uchar_4 b0 b1 b2 b3 = 591 ((b0 land 0x07) lsl 18) lor 592 ((b1 land 0x3F) lsl 12) lor 593 ((b2 land 0x3F) lsl 6) lor 594 ((b3 land 0x3F)) 595 596let get_utf_8_uchar b i = 597 let b0 = get_uint8 b i in (* raises if [i] is not a valid index. *) 598 let get = unsafe_get_uint8 in 599 let max = length b - 1 in 600 match Char.unsafe_chr b0 with (* See The Unicode Standard, Table 3.7 *) 601 | '\x00' .. '\x7F' -> dec_ret 1 b0 602 | '\xC2' .. '\xDF' -> 603 let i = i + 1 in if i > max then dec_invalid 1 else 604 let b1 = get b i in if not_in_x80_to_xBF b1 then dec_invalid 1 else 605 dec_ret 2 (utf_8_uchar_2 b0 b1) 606 | '\xE0' -> 607 let i = i + 1 in if i > max then dec_invalid 1 else 608 let b1 = get b i in if not_in_xA0_to_xBF b1 then dec_invalid 1 else 609 let i = i + 1 in if i > max then dec_invalid 2 else 610 let b2 = get b i in if not_in_x80_to_xBF b2 then dec_invalid 2 else 611 dec_ret 3 (utf_8_uchar_3 b0 b1 b2) 612 | '\xE1' .. '\xEC' | '\xEE' .. '\xEF' -> 613 let i = i + 1 in if i > max then dec_invalid 1 else 614 let b1 = get b i in if not_in_x80_to_xBF b1 then dec_invalid 1 else 615 let i = i + 1 in if i > max then dec_invalid 2 else 616 let b2 = get b i in if not_in_x80_to_xBF b2 then dec_invalid 2 else 617 dec_ret 3 (utf_8_uchar_3 b0 b1 b2) 618 | '\xED' -> 619 let i = i + 1 in if i > max then dec_invalid 1 else 620 let b1 = get b i in if not_in_x80_to_x9F b1 then dec_invalid 1 else 621 let i = i + 1 in if i > max then dec_invalid 2 else 622 let b2 = get b i in if not_in_x80_to_xBF b2 then dec_invalid 2 else 623 dec_ret 3 (utf_8_uchar_3 b0 b1 b2) 624 | '\xF0' -> 625 let i = i + 1 in if i > max then dec_invalid 1 else 626 let b1 = get b i in if not_in_x90_to_xBF b1 then dec_invalid 1 else 627 let i = i + 1 in if i > max then dec_invalid 2 else 628 let b2 = get b i in if not_in_x80_to_xBF b2 then dec_invalid 2 else 629 let i = i + 1 in if i > max then dec_invalid 3 else 630 let b3 = get b i in if not_in_x80_to_xBF b3 then dec_invalid 3 else 631 dec_ret 4 (utf_8_uchar_4 b0 b1 b2 b3) 632 | '\xF1' .. '\xF3' -> 633 let i = i + 1 in if i > max then dec_invalid 1 else 634 let b1 = get b i in if not_in_x80_to_xBF b1 then dec_invalid 1 else 635 let i = i + 1 in if i > max then dec_invalid 2 else 636 let b2 = get b i in if not_in_x80_to_xBF b2 then dec_invalid 2 else 637 let i = i + 1 in if i > max then dec_invalid 3 else 638 let b3 = get b i in if not_in_x80_to_xBF b3 then dec_invalid 3 else 639 dec_ret 4 (utf_8_uchar_4 b0 b1 b2 b3) 640 | '\xF4' -> 641 let i = i + 1 in if i > max then dec_invalid 1 else 642 let b1 = get b i in if not_in_x80_to_x8F b1 then dec_invalid 1 else 643 let i = i + 1 in if i > max then dec_invalid 2 else 644 let b2 = get b i in if not_in_x80_to_xBF b2 then dec_invalid 2 else 645 let i = i + 1 in if i > max then dec_invalid 3 else 646 let b3 = get b i in if not_in_x80_to_xBF b3 then dec_invalid 3 else 647 dec_ret 4 (utf_8_uchar_4 b0 b1 b2 b3) 648 | _ -> dec_invalid 1 649 650let set_utf_8_uchar b i u = 651 let set = unsafe_set_uint8 in 652 let max = length b - 1 in 653 match Uchar.to_int u with 654 | u when u < 0 -> assert false 655 | u when u <= 0x007F -> 656 set_uint8 b i u; 657 1 658 | u when u <= 0x07FF -> 659 let last = i + 1 in 660 if last > max then 0 else 661 (set_uint8 b i (0xC0 lor (u lsr 6)); 662 set b last (0x80 lor (u land 0x3F)); 663 2) 664 | u when u <= 0xFFFF -> 665 let last = i + 2 in 666 if last > max then 0 else 667 (set_uint8 b i (0xE0 lor (u lsr 12)); 668 set b (i + 1) (0x80 lor ((u lsr 6) land 0x3F)); 669 set b last (0x80 lor (u land 0x3F)); 670 3) 671 | u when u <= 0x10FFFF -> 672 let last = i + 3 in 673 if last > max then 0 else 674 (set_uint8 b i (0xF0 lor (u lsr 18)); 675 set b (i + 1) (0x80 lor ((u lsr 12) land 0x3F)); 676 set b (i + 2) (0x80 lor ((u lsr 6) land 0x3F)); 677 set b last (0x80 lor (u land 0x3F)); 678 4) 679 | _ -> assert false 680 681let is_valid_utf_8 b = 682 let rec loop max b i = 683 if i > max then true else 684 let get = unsafe_get_uint8 in 685 match Char.unsafe_chr (get b i) with 686 | '\x00' .. '\x7F' -> loop max b (i + 1) 687 | '\xC2' .. '\xDF' -> 688 let last = i + 1 in 689 if last > max 690 || not_in_x80_to_xBF (get b last) 691 then false 692 else loop max b (last + 1) 693 | '\xE0' -> 694 let last = i + 2 in 695 if last > max 696 || not_in_xA0_to_xBF (get b (i + 1)) 697 || not_in_x80_to_xBF (get b last) 698 then false 699 else loop max b (last + 1) 700 | '\xE1' .. '\xEC' | '\xEE' .. '\xEF' -> 701 let last = i + 2 in 702 if last > max 703 || not_in_x80_to_xBF (get b (i + 1)) 704 || not_in_x80_to_xBF (get b last) 705 then false 706 else loop max b (last + 1) 707 | '\xED' -> 708 let last = i + 2 in 709 if last > max 710 || not_in_x80_to_x9F (get b (i + 1)) 711 || not_in_x80_to_xBF (get b last) 712 then false 713 else loop max b (last + 1) 714 | '\xF0' -> 715 let last = i + 3 in 716 if last > max 717 || not_in_x90_to_xBF (get b (i + 1)) 718 || not_in_x80_to_xBF (get b (i + 2)) 719 || not_in_x80_to_xBF (get b last) 720 then false 721 else loop max b (last + 1) 722 | '\xF1' .. '\xF3' -> 723 let last = i + 3 in 724 if last > max 725 || not_in_x80_to_xBF (get b (i + 1)) 726 || not_in_x80_to_xBF (get b (i + 2)) 727 || not_in_x80_to_xBF (get b last) 728 then false 729 else loop max b (last + 1) 730 | '\xF4' -> 731 let last = i + 3 in 732 if last > max 733 || not_in_x80_to_x8F (get b (i + 1)) 734 || not_in_x80_to_xBF (get b (i + 2)) 735 || not_in_x80_to_xBF (get b last) 736 then false 737 else loop max b (last + 1) 738 | _ -> false 739 in 740 loop (length b - 1) b 0 741 742(* UTF-16BE *) 743 744let get_utf_16be_uchar b i = 745 let get = unsafe_get_uint16_be in 746 let max = length b - 1 in 747 if i < 0 || i > max then invalid_arg "index out of bounds" else 748 if i = max then dec_invalid 1 else 749 match get b i with 750 | u when u < 0xD800 || u > 0xDFFF -> dec_ret 2 u 751 | u when u > 0xDBFF -> dec_invalid 2 752 | hi -> (* combine [hi] with a low surrogate *) 753 let last = i + 3 in 754 if last > max then dec_invalid (max - i + 1) else 755 match get b (i + 2) with 756 | u when u < 0xDC00 || u > 0xDFFF -> dec_invalid 2 (* retry here *) 757 | lo -> 758 let u = (((hi land 0x3FF) lsl 10) lor (lo land 0x3FF)) + 0x10000 in 759 dec_ret 4 u 760 761let set_utf_16be_uchar b i u = 762 let set = unsafe_set_uint16_be in 763 let max = length b - 1 in 764 if i < 0 || i > max then invalid_arg "index out of bounds" else 765 match Uchar.to_int u with 766 | u when u < 0 -> assert false 767 | u when u <= 0xFFFF -> 768 let last = i + 1 in 769 if last > max then 0 else (set b i u; 2) 770 | u when u <= 0x10FFFF -> 771 let last = i + 3 in 772 if last > max then 0 else 773 let u' = u - 0x10000 in 774 let hi = (0xD800 lor (u' lsr 10)) in 775 let lo = (0xDC00 lor (u' land 0x3FF)) in 776 set b i hi; set b (i + 2) lo; 4 777 | _ -> assert false 778 779let is_valid_utf_16be b = 780 let rec loop max b i = 781 let get = unsafe_get_uint16_be in 782 if i > max then true else 783 if i = max then false else 784 match get b i with 785 | u when u < 0xD800 || u > 0xDFFF -> loop max b (i + 2) 786 | u when u > 0xDBFF -> false 787 | _hi -> 788 let last = i + 3 in 789 if last > max then false else 790 match get b (i + 2) with 791 | u when u < 0xDC00 || u > 0xDFFF -> false 792 | _lo -> loop max b (i + 4) 793 in 794 loop (length b - 1) b 0 795 796(* UTF-16LE *) 797 798let get_utf_16le_uchar b i = 799 let get = unsafe_get_uint16_le in 800 let max = length b - 1 in 801 if i < 0 || i > max then invalid_arg "index out of bounds" else 802 if i = max then dec_invalid 1 else 803 match get b i with 804 | u when u < 0xD800 || u > 0xDFFF -> dec_ret 2 u 805 | u when u > 0xDBFF -> dec_invalid 2 806 | hi -> (* combine [hi] with a low surrogate *) 807 let last = i + 3 in 808 if last > max then dec_invalid (max - i + 1) else 809 match get b (i + 2) with 810 | u when u < 0xDC00 || u > 0xDFFF -> dec_invalid 2 (* retry here *) 811 | lo -> 812 let u = (((hi land 0x3FF) lsl 10) lor (lo land 0x3FF)) + 0x10000 in 813 dec_ret 4 u 814 815let set_utf_16le_uchar b i u = 816 let set = unsafe_set_uint16_le in 817 let max = length b - 1 in 818 if i < 0 || i > max then invalid_arg "index out of bounds" else 819 match Uchar.to_int u with 820 | u when u < 0 -> assert false 821 | u when u <= 0xFFFF -> 822 let last = i + 1 in 823 if last > max then 0 else (set b i u; 2) 824 | u when u <= 0x10FFFF -> 825 let last = i + 3 in 826 if last > max then 0 else 827 let u' = u - 0x10000 in 828 let hi = (0xD800 lor (u' lsr 10)) in 829 let lo = (0xDC00 lor (u' land 0x3FF)) in 830 set b i hi; set b (i + 2) lo; 4 831 | _ -> assert false 832 833let is_valid_utf_16le b = 834 let rec loop max b i = 835 let get = unsafe_get_uint16_le in 836 if i > max then true else 837 if i = max then false else 838 match get b i with 839 | u when u < 0xD800 || u > 0xDFFF -> loop max b (i + 2) 840 | u when u > 0xDBFF -> false 841 | _hi -> 842 let last = i + 3 in 843 if last > max then false else 844 match get b (i + 2) with 845 | u when u < 0xDC00 || u > 0xDFFF -> false 846 | _lo -> loop max b (i + 4) 847 in 848 loop (length b - 1) b 0