My working unpac repository
1(**************************************************************************)
2(* *)
3(* OCaml *)
4(* *)
5(* Xavier Leroy, projet Cristal, INRIA Rocquencourt *)
6(* *)
7(* Copyright 1996 Institut National de Recherche en Informatique et *)
8(* en Automatique. *)
9(* *)
10(* All rights reserved. This file is distributed under the terms of *)
11(* the GNU Lesser General Public License version 2.1, with the *)
12(* special exception on linking described in the file LICENSE. *)
13(* *)
14(**************************************************************************)
15
16(* Byte sequence operations *)
17
18(* WARNING: Some functions in this file are duplicated in string.ml for
19 efficiency reasons. When you modify the one in this file you need to
20 modify its duplicate in string.ml.
21 These functions have a "duplicated" comment above their definition.
22*)
23
24external length : bytes -> int = "%bytes_length"
25external string_length : string -> int = "%string_length"
26external get : bytes -> int -> char = "%bytes_safe_get"
27external set : bytes -> int -> char -> unit = "%bytes_safe_set"
28external create : int -> bytes = "caml_create_bytes"
29external unsafe_get : bytes -> int -> char = "%bytes_unsafe_get"
30external unsafe_set : bytes -> int -> char -> unit = "%bytes_unsafe_set"
31external unsafe_fill : bytes -> int -> int -> char -> unit
32 = "caml_fill_bytes" [@@noalloc]
33external unsafe_to_string : bytes -> string = "%bytes_to_string"
34external unsafe_of_string : string -> bytes = "%bytes_of_string"
35
36external unsafe_blit : bytes -> int -> bytes -> int -> int -> unit
37 = "caml_blit_bytes" [@@noalloc]
38external unsafe_blit_string : string -> int -> bytes -> int -> int -> unit
39 = "caml_blit_string" [@@noalloc]
40
41let make n c =
42 let s = create n in
43 unsafe_fill s 0 n c;
44 s
45
46let init n f =
47 let s = create n in
48 for i = 0 to n - 1 do
49 unsafe_set s i (f i)
50 done;
51 s
52
53let empty = create 0
54
55let copy s =
56 let len = length s in
57 let r = create len in
58 unsafe_blit s 0 r 0 len;
59 r
60
61let to_string b = unsafe_to_string (copy b)
62let of_string s = copy (unsafe_of_string s)
63
64let sub s ofs len =
65 if ofs < 0 || len < 0 || ofs > length s - len
66 then invalid_arg "String.sub / Bytes.sub"
67 else begin
68 let r = create len in
69 unsafe_blit s ofs r 0 len;
70 r
71 end
72
73let sub_string b ofs len = unsafe_to_string (sub b ofs len)
74
75(* addition with an overflow check *)
76let (++) a b =
77 let c = a + b in
78 match a < 0, b < 0, c < 0 with
79 | true , true , false
80 | false, false, true -> invalid_arg "Bytes.extend" (* overflow *)
81 | _ -> c
82
83let extend s left right =
84 let len = length s ++ left ++ right in
85 let r = create len in
86 let (srcoff, dstoff) = if left < 0 then -left, 0 else 0, left in
87 let cpylen = Int.min (length s - srcoff) (len - dstoff) in
88 if cpylen > 0 then unsafe_blit s srcoff r dstoff cpylen;
89 r
90
91let fill s ofs len c =
92 if ofs < 0 || len < 0 || ofs > length s - len
93 then invalid_arg "String.fill / Bytes.fill"
94 else unsafe_fill s ofs len c
95
96let blit s1 ofs1 s2 ofs2 len =
97 if len < 0 || ofs1 < 0 || ofs1 > length s1 - len
98 || ofs2 < 0 || ofs2 > length s2 - len
99 then invalid_arg "Bytes.blit"
100 else unsafe_blit s1 ofs1 s2 ofs2 len
101
102let blit_string s1 ofs1 s2 ofs2 len =
103 if len < 0 || ofs1 < 0 || ofs1 > string_length s1 - len
104 || ofs2 < 0 || ofs2 > length s2 - len
105 then invalid_arg "String.blit / Bytes.blit_string"
106 else unsafe_blit_string s1 ofs1 s2 ofs2 len
107
108(* duplicated in string.ml *)
109let iter f a =
110 for i = 0 to length a - 1 do f(unsafe_get a i) done
111
112(* duplicated in string.ml *)
113let iteri f a =
114 for i = 0 to length a - 1 do f i (unsafe_get a i) done
115
116let ensure_ge (x:int) y = if x >= y then x else invalid_arg "Bytes.concat"
117
118let rec sum_lengths acc seplen = function
119 | [] -> acc
120 | hd :: [] -> length hd + acc
121 | hd :: tl -> sum_lengths (ensure_ge (length hd + seplen + acc) acc) seplen tl
122
123let rec unsafe_blits dst pos sep seplen = function
124 [] -> dst
125 | hd :: [] ->
126 unsafe_blit hd 0 dst pos (length hd); dst
127 | hd :: tl ->
128 unsafe_blit hd 0 dst pos (length hd);
129 unsafe_blit sep 0 dst (pos + length hd) seplen;
130 unsafe_blits dst (pos + length hd + seplen) sep seplen tl
131
132let concat sep = function
133 [] -> empty
134 | l -> let seplen = length sep in
135 unsafe_blits
136 (create (sum_lengths 0 seplen l))
137 0 sep seplen l
138
139let cat s1 s2 =
140 let l1 = length s1 in
141 let l2 = length s2 in
142 let r = create (l1 + l2) in
143 unsafe_blit s1 0 r 0 l1;
144 unsafe_blit s2 0 r l1 l2;
145 r
146
147
148external char_code: char -> int = "%identity"
149external char_chr: int -> char = "%identity"
150
151let is_space = function
152 | ' ' | '\012' | '\n' | '\r' | '\t' -> true
153 | _ -> false
154
155let trim s =
156 let len = length s in
157 let i = ref 0 in
158 while !i < len && is_space (unsafe_get s !i) do
159 incr i
160 done;
161 let j = ref (len - 1) in
162 while !j >= !i && is_space (unsafe_get s !j) do
163 decr j
164 done;
165 if !j >= !i then
166 sub s !i (!j - !i + 1)
167 else
168 empty
169
170let unsafe_escape s =
171 (* We perform two passes on the input sequence, one to compute the
172 result size and one to write the result.
173
174 #11508, #11509: This logic would be incorrect in presence of
175 concurrent modification to the input, making the use of
176 [unsafe_set] below memory-unsafe.
177
178 Precondition: This function may be safely called on:
179 - an immutable byte sequence
180 - a uniquely-owned byte sequence (the function takes ownership)
181
182 In either case we return a uniquely-owned byte sequence.
183 *)
184 let n = ref 0 in
185 for i = 0 to length s - 1 do
186 n := !n +
187 (match unsafe_get s i with
188 | '\"' | '\\' | '\n' | '\t' | '\r' | '\b' -> 2
189 | ' ' .. '~' -> 1
190 | _ -> 4)
191 done;
192 if !n = length s then s
193 else begin
194 let s' = create !n in
195 n := 0;
196 for i = 0 to length s - 1 do
197 begin match unsafe_get s i with
198 | ('\"' | '\\') as c ->
199 unsafe_set s' !n '\\'; incr n; unsafe_set s' !n c
200 | '\n' ->
201 unsafe_set s' !n '\\'; incr n; unsafe_set s' !n 'n'
202 | '\t' ->
203 unsafe_set s' !n '\\'; incr n; unsafe_set s' !n 't'
204 | '\r' ->
205 unsafe_set s' !n '\\'; incr n; unsafe_set s' !n 'r'
206 | '\b' ->
207 unsafe_set s' !n '\\'; incr n; unsafe_set s' !n 'b'
208 | (' ' .. '~') as c -> unsafe_set s' !n c
209 | c ->
210 let a = char_code c in
211 unsafe_set s' !n '\\';
212 incr n;
213 unsafe_set s' !n (char_chr (48 + a / 100));
214 incr n;
215 unsafe_set s' !n (char_chr (48 + (a / 10) mod 10));
216 incr n;
217 unsafe_set s' !n (char_chr (48 + a mod 10));
218 end;
219 incr n
220 done;
221 s'
222 end
223
224let escaped b =
225 let b = copy b in
226 (* We copy our input to obtain a uniquely-owned byte sequence [b]
227 to satisfy [unsafe_escape]'s precondition *)
228 unsafe_escape b
229
230let map f s =
231 let l = length s in
232 if l = 0 then s else begin
233 let r = create l in
234 for i = 0 to l - 1 do unsafe_set r i (f (unsafe_get s i)) done;
235 r
236 end
237
238let mapi f s =
239 let l = length s in
240 if l = 0 then s else begin
241 let r = create l in
242 for i = 0 to l - 1 do unsafe_set r i (f i (unsafe_get s i)) done;
243 r
244 end
245
246let fold_left f x a =
247 let r = ref x in
248 for i = 0 to length a - 1 do
249 r := f !r (unsafe_get a i)
250 done;
251 !r
252
253let fold_right f a x =
254 let r = ref x in
255 for i = length a - 1 downto 0 do
256 r := f (unsafe_get a i) !r
257 done;
258 !r
259
260let exists p s =
261 let n = length s in
262 let rec loop i =
263 if i = n then false
264 else if p (unsafe_get s i) then true
265 else loop (succ i) in
266 loop 0
267
268let for_all p s =
269 let n = length s in
270 let rec loop i =
271 if i = n then true
272 else if p (unsafe_get s i) then loop (succ i)
273 else false in
274 loop 0
275
276let uppercase_ascii s = map Char.uppercase_ascii s
277let lowercase_ascii s = map Char.lowercase_ascii s
278
279let apply1 f s =
280 if length s = 0 then s else begin
281 let r = copy s in
282 unsafe_set r 0 (f(unsafe_get s 0));
283 r
284 end
285
286let capitalize_ascii s = apply1 Char.uppercase_ascii s
287let uncapitalize_ascii s = apply1 Char.lowercase_ascii s
288
289(* duplicated in string.ml *)
290let starts_with ~prefix s =
291 let len_s = length s
292 and len_pre = length prefix in
293 let rec aux i =
294 if i = len_pre then true
295 else if unsafe_get s i <> unsafe_get prefix i then false
296 else aux (i + 1)
297 in len_s >= len_pre && aux 0
298
299(* duplicated in string.ml *)
300let ends_with ~suffix s =
301 let len_s = length s
302 and len_suf = length suffix in
303 let diff = len_s - len_suf in
304 let rec aux i =
305 if i = len_suf then true
306 else if unsafe_get s (diff + i) <> unsafe_get suffix i then false
307 else aux (i + 1)
308 in diff >= 0 && aux 0
309
310(* duplicated in string.ml *)
311let rec index_rec s lim i c =
312 if i >= lim then raise Not_found else
313 if unsafe_get s i = c then i else index_rec s lim (i + 1) c
314
315(* duplicated in string.ml *)
316let index s c = index_rec s (length s) 0 c
317
318(* duplicated in string.ml *)
319let rec index_rec_opt s lim i c =
320 if i >= lim then None else
321 if unsafe_get s i = c then Some i else index_rec_opt s lim (i + 1) c
322
323(* duplicated in string.ml *)
324let index_opt s c = index_rec_opt s (length s) 0 c
325
326(* duplicated in string.ml *)
327let index_from s i c =
328 let l = length s in
329 if i < 0 || i > l then invalid_arg "String.index_from / Bytes.index_from" else
330 index_rec s l i c
331
332(* duplicated in string.ml *)
333let index_from_opt s i c =
334 let l = length s in
335 if i < 0 || i > l then
336 invalid_arg "String.index_from_opt / Bytes.index_from_opt"
337 else
338 index_rec_opt s l i c
339
340(* duplicated in string.ml *)
341let rec rindex_rec s i c =
342 if i < 0 then raise Not_found else
343 if unsafe_get s i = c then i else rindex_rec s (i - 1) c
344
345(* duplicated in string.ml *)
346let rindex s c = rindex_rec s (length s - 1) c
347
348(* duplicated in string.ml *)
349let rindex_from s i c =
350 if i < -1 || i >= length s then
351 invalid_arg "String.rindex_from / Bytes.rindex_from"
352 else
353 rindex_rec s i c
354
355(* duplicated in string.ml *)
356let rec rindex_rec_opt s i c =
357 if i < 0 then None else
358 if unsafe_get s i = c then Some i else rindex_rec_opt s (i - 1) c
359
360(* duplicated in string.ml *)
361let rindex_opt s c = rindex_rec_opt s (length s - 1) c
362
363(* duplicated in string.ml *)
364let rindex_from_opt s i c =
365 if i < -1 || i >= length s then
366 invalid_arg "String.rindex_from_opt / Bytes.rindex_from_opt"
367 else
368 rindex_rec_opt s i c
369
370
371(* duplicated in string.ml *)
372let contains_from s i c =
373 let l = length s in
374 if i < 0 || i > l then
375 invalid_arg "String.contains_from / Bytes.contains_from"
376 else
377 try ignore (index_rec s l i c); true with Not_found -> false
378
379
380(* duplicated in string.ml *)
381let contains s c = contains_from s 0 c
382
383(* duplicated in string.ml *)
384let rcontains_from s i c =
385 if i < 0 || i >= length s then
386 invalid_arg "String.rcontains_from / Bytes.rcontains_from"
387 else
388 try ignore (rindex_rec s i c); true with Not_found -> false
389
390
391type t = bytes
392
393let compare (x: t) (y: t) = Stdlib.compare x y
394external equal : t -> t -> bool = "caml_bytes_equal" [@@noalloc]
395
396(* duplicated in string.ml *)
397let split_on_char sep s =
398 let r = ref [] in
399 let j = ref (length s) in
400 for i = length s - 1 downto 0 do
401 if unsafe_get s i = sep then begin
402 r := sub s (i + 1) (!j - i - 1) :: !r;
403 j := i
404 end
405 done;
406 sub s 0 !j :: !r
407
408(** {1 Iterators} *)
409
410let to_seq s =
411 let rec aux i () =
412 if i = length s then Seq.Nil
413 else
414 let x = get s i in
415 Seq.Cons (x, aux (i+1))
416 in
417 aux 0
418
419let to_seqi s =
420 let rec aux i () =
421 if i = length s then Seq.Nil
422 else
423 let x = get s i in
424 Seq.Cons ((i,x), aux (i+1))
425 in
426 aux 0
427
428let of_seq i =
429 let n = ref 0 in
430 let buf = ref (make 256 '\000') in
431 let resize () =
432 (* resize *)
433 let new_len = Int.min (2 * length !buf) Sys.max_string_length in
434 if length !buf = new_len then failwith "Bytes.of_seq: cannot grow bytes";
435 let new_buf = make new_len '\000' in
436 blit !buf 0 new_buf 0 !n;
437 buf := new_buf
438 in
439 Seq.iter
440 (fun c ->
441 if !n = length !buf then resize();
442 set !buf !n c;
443 incr n)
444 i;
445 sub !buf 0 !n
446
447(** {6 Binary encoding/decoding of integers} *)
448
449(* The get_ functions are all duplicated in string.ml *)
450
451external unsafe_get_uint8 : bytes -> int -> int = "%bytes_unsafe_get"
452external unsafe_get_uint16_ne : bytes -> int -> int = "%caml_bytes_get16u"
453external get_uint8 : bytes -> int -> int = "%bytes_safe_get"
454external get_uint16_ne : bytes -> int -> int = "%caml_bytes_get16"
455external get_int32_ne : bytes -> int -> int32 = "%caml_bytes_get32"
456external get_int64_ne : bytes -> int -> int64 = "%caml_bytes_get64"
457
458external unsafe_set_uint8 : bytes -> int -> int -> unit = "%bytes_unsafe_set"
459external unsafe_set_uint16_ne : bytes -> int -> int -> unit
460 = "%caml_bytes_set16u"
461external set_int8 : bytes -> int -> int -> unit = "%bytes_safe_set"
462external set_int16_ne : bytes -> int -> int -> unit = "%caml_bytes_set16"
463external set_int32_ne : bytes -> int -> int32 -> unit = "%caml_bytes_set32"
464external set_int64_ne : bytes -> int -> int64 -> unit = "%caml_bytes_set64"
465external swap16 : int -> int = "%bswap16"
466external swap32 : int32 -> int32 = "%bswap_int32"
467external swap64 : int64 -> int64 = "%bswap_int64"
468
469let unsafe_get_uint16_le b i =
470 if Sys.big_endian
471 then swap16 (unsafe_get_uint16_ne b i)
472 else unsafe_get_uint16_ne b i
473
474let unsafe_get_uint16_be b i =
475 if Sys.big_endian
476 then unsafe_get_uint16_ne b i
477 else swap16 (unsafe_get_uint16_ne b i)
478
479let get_int8 b i =
480 ((get_uint8 b i) lsl (Sys.int_size - 8)) asr (Sys.int_size - 8)
481
482let get_uint16_le b i =
483 if Sys.big_endian then swap16 (get_uint16_ne b i)
484 else get_uint16_ne b i
485
486let get_uint16_be b i =
487 if not Sys.big_endian then swap16 (get_uint16_ne b i)
488 else get_uint16_ne b i
489
490let get_int16_ne b i =
491 ((get_uint16_ne b i) lsl (Sys.int_size - 16)) asr (Sys.int_size - 16)
492
493let get_int16_le b i =
494 ((get_uint16_le b i) lsl (Sys.int_size - 16)) asr (Sys.int_size - 16)
495
496let get_int16_be b i =
497 ((get_uint16_be b i) lsl (Sys.int_size - 16)) asr (Sys.int_size - 16)
498
499let get_int32_le b i =
500 if Sys.big_endian then swap32 (get_int32_ne b i)
501 else get_int32_ne b i
502
503let get_int32_be b i =
504 if not Sys.big_endian then swap32 (get_int32_ne b i)
505 else get_int32_ne b i
506
507let get_int64_le b i =
508 if Sys.big_endian then swap64 (get_int64_ne b i)
509 else get_int64_ne b i
510
511let get_int64_be b i =
512 if not Sys.big_endian then swap64 (get_int64_ne b i)
513 else get_int64_ne b i
514
515let unsafe_set_uint16_le b i x =
516 if Sys.big_endian
517 then unsafe_set_uint16_ne b i (swap16 x)
518 else unsafe_set_uint16_ne b i x
519
520let unsafe_set_uint16_be b i x =
521 if Sys.big_endian
522 then unsafe_set_uint16_ne b i x else
523 unsafe_set_uint16_ne b i (swap16 x)
524
525let set_int16_le b i x =
526 if Sys.big_endian then set_int16_ne b i (swap16 x)
527 else set_int16_ne b i x
528
529let set_int16_be b i x =
530 if not Sys.big_endian then set_int16_ne b i (swap16 x)
531 else set_int16_ne b i x
532
533let set_int32_le b i x =
534 if Sys.big_endian then set_int32_ne b i (swap32 x)
535 else set_int32_ne b i x
536
537let set_int32_be b i x =
538 if not Sys.big_endian then set_int32_ne b i (swap32 x)
539 else set_int32_ne b i x
540
541let set_int64_le b i x =
542 if Sys.big_endian then set_int64_ne b i (swap64 x)
543 else set_int64_ne b i x
544
545let set_int64_be b i x =
546 if not Sys.big_endian then set_int64_ne b i (swap64 x)
547 else set_int64_ne b i x
548
549let set_uint8 = set_int8
550let set_uint16_ne = set_int16_ne
551let set_uint16_be = set_int16_be
552let set_uint16_le = set_int16_le
553
554(* UTF codecs and validations *)
555
556let dec_invalid = Uchar.utf_decode_invalid
557let[@inline] dec_ret n u = Uchar.utf_decode n (Uchar.unsafe_of_int u)
558
559(* In case of decoding error, if we error on the first byte, we
560 consume the byte, otherwise we consume the [n] bytes preceding
561 the erroring byte.
562
563 This means that if a client uses decodes without caring about
564 validity it naturally replace bogus data with Uchar.rep according
565 to the WHATWG Encoding standard. Other schemes are possible by
566 consulting the number of used bytes on invalid decodes. For more
567 details see https://hsivonen.fi/broken-utf-8/
568
569 For this reason in [get_utf_8_uchar] we gradually check the next
570 byte is available rather than doing it immediately after the
571 first byte. Contrast with [is_valid_utf_8]. *)
572
573(* UTF-8 *)
574
575let[@inline] not_in_x80_to_xBF b = b lsr 6 <> 0b10
576let[@inline] not_in_xA0_to_xBF b = b lsr 5 <> 0b101
577let[@inline] not_in_x80_to_x9F b = b lsr 5 <> 0b100
578let[@inline] not_in_x90_to_xBF b = b < 0x90 || 0xBF < b
579let[@inline] not_in_x80_to_x8F b = b lsr 4 <> 0x8
580
581let[@inline] utf_8_uchar_2 b0 b1 =
582 ((b0 land 0x1F) lsl 6) lor
583 ((b1 land 0x3F))
584
585let[@inline] utf_8_uchar_3 b0 b1 b2 =
586 ((b0 land 0x0F) lsl 12) lor
587 ((b1 land 0x3F) lsl 6) lor
588 ((b2 land 0x3F))
589
590let[@inline] utf_8_uchar_4 b0 b1 b2 b3 =
591 ((b0 land 0x07) lsl 18) lor
592 ((b1 land 0x3F) lsl 12) lor
593 ((b2 land 0x3F) lsl 6) lor
594 ((b3 land 0x3F))
595
596let get_utf_8_uchar b i =
597 let b0 = get_uint8 b i in (* raises if [i] is not a valid index. *)
598 let get = unsafe_get_uint8 in
599 let max = length b - 1 in
600 match Char.unsafe_chr b0 with (* See The Unicode Standard, Table 3.7 *)
601 | '\x00' .. '\x7F' -> dec_ret 1 b0
602 | '\xC2' .. '\xDF' ->
603 let i = i + 1 in if i > max then dec_invalid 1 else
604 let b1 = get b i in if not_in_x80_to_xBF b1 then dec_invalid 1 else
605 dec_ret 2 (utf_8_uchar_2 b0 b1)
606 | '\xE0' ->
607 let i = i + 1 in if i > max then dec_invalid 1 else
608 let b1 = get b i in if not_in_xA0_to_xBF b1 then dec_invalid 1 else
609 let i = i + 1 in if i > max then dec_invalid 2 else
610 let b2 = get b i in if not_in_x80_to_xBF b2 then dec_invalid 2 else
611 dec_ret 3 (utf_8_uchar_3 b0 b1 b2)
612 | '\xE1' .. '\xEC' | '\xEE' .. '\xEF' ->
613 let i = i + 1 in if i > max then dec_invalid 1 else
614 let b1 = get b i in if not_in_x80_to_xBF b1 then dec_invalid 1 else
615 let i = i + 1 in if i > max then dec_invalid 2 else
616 let b2 = get b i in if not_in_x80_to_xBF b2 then dec_invalid 2 else
617 dec_ret 3 (utf_8_uchar_3 b0 b1 b2)
618 | '\xED' ->
619 let i = i + 1 in if i > max then dec_invalid 1 else
620 let b1 = get b i in if not_in_x80_to_x9F b1 then dec_invalid 1 else
621 let i = i + 1 in if i > max then dec_invalid 2 else
622 let b2 = get b i in if not_in_x80_to_xBF b2 then dec_invalid 2 else
623 dec_ret 3 (utf_8_uchar_3 b0 b1 b2)
624 | '\xF0' ->
625 let i = i + 1 in if i > max then dec_invalid 1 else
626 let b1 = get b i in if not_in_x90_to_xBF b1 then dec_invalid 1 else
627 let i = i + 1 in if i > max then dec_invalid 2 else
628 let b2 = get b i in if not_in_x80_to_xBF b2 then dec_invalid 2 else
629 let i = i + 1 in if i > max then dec_invalid 3 else
630 let b3 = get b i in if not_in_x80_to_xBF b3 then dec_invalid 3 else
631 dec_ret 4 (utf_8_uchar_4 b0 b1 b2 b3)
632 | '\xF1' .. '\xF3' ->
633 let i = i + 1 in if i > max then dec_invalid 1 else
634 let b1 = get b i in if not_in_x80_to_xBF b1 then dec_invalid 1 else
635 let i = i + 1 in if i > max then dec_invalid 2 else
636 let b2 = get b i in if not_in_x80_to_xBF b2 then dec_invalid 2 else
637 let i = i + 1 in if i > max then dec_invalid 3 else
638 let b3 = get b i in if not_in_x80_to_xBF b3 then dec_invalid 3 else
639 dec_ret 4 (utf_8_uchar_4 b0 b1 b2 b3)
640 | '\xF4' ->
641 let i = i + 1 in if i > max then dec_invalid 1 else
642 let b1 = get b i in if not_in_x80_to_x8F b1 then dec_invalid 1 else
643 let i = i + 1 in if i > max then dec_invalid 2 else
644 let b2 = get b i in if not_in_x80_to_xBF b2 then dec_invalid 2 else
645 let i = i + 1 in if i > max then dec_invalid 3 else
646 let b3 = get b i in if not_in_x80_to_xBF b3 then dec_invalid 3 else
647 dec_ret 4 (utf_8_uchar_4 b0 b1 b2 b3)
648 | _ -> dec_invalid 1
649
650let set_utf_8_uchar b i u =
651 let set = unsafe_set_uint8 in
652 let max = length b - 1 in
653 match Uchar.to_int u with
654 | u when u < 0 -> assert false
655 | u when u <= 0x007F ->
656 set_uint8 b i u;
657 1
658 | u when u <= 0x07FF ->
659 let last = i + 1 in
660 if last > max then 0 else
661 (set_uint8 b i (0xC0 lor (u lsr 6));
662 set b last (0x80 lor (u land 0x3F));
663 2)
664 | u when u <= 0xFFFF ->
665 let last = i + 2 in
666 if last > max then 0 else
667 (set_uint8 b i (0xE0 lor (u lsr 12));
668 set b (i + 1) (0x80 lor ((u lsr 6) land 0x3F));
669 set b last (0x80 lor (u land 0x3F));
670 3)
671 | u when u <= 0x10FFFF ->
672 let last = i + 3 in
673 if last > max then 0 else
674 (set_uint8 b i (0xF0 lor (u lsr 18));
675 set b (i + 1) (0x80 lor ((u lsr 12) land 0x3F));
676 set b (i + 2) (0x80 lor ((u lsr 6) land 0x3F));
677 set b last (0x80 lor (u land 0x3F));
678 4)
679 | _ -> assert false
680
681let is_valid_utf_8 b =
682 let rec loop max b i =
683 if i > max then true else
684 let get = unsafe_get_uint8 in
685 match Char.unsafe_chr (get b i) with
686 | '\x00' .. '\x7F' -> loop max b (i + 1)
687 | '\xC2' .. '\xDF' ->
688 let last = i + 1 in
689 if last > max
690 || not_in_x80_to_xBF (get b last)
691 then false
692 else loop max b (last + 1)
693 | '\xE0' ->
694 let last = i + 2 in
695 if last > max
696 || not_in_xA0_to_xBF (get b (i + 1))
697 || not_in_x80_to_xBF (get b last)
698 then false
699 else loop max b (last + 1)
700 | '\xE1' .. '\xEC' | '\xEE' .. '\xEF' ->
701 let last = i + 2 in
702 if last > max
703 || not_in_x80_to_xBF (get b (i + 1))
704 || not_in_x80_to_xBF (get b last)
705 then false
706 else loop max b (last + 1)
707 | '\xED' ->
708 let last = i + 2 in
709 if last > max
710 || not_in_x80_to_x9F (get b (i + 1))
711 || not_in_x80_to_xBF (get b last)
712 then false
713 else loop max b (last + 1)
714 | '\xF0' ->
715 let last = i + 3 in
716 if last > max
717 || not_in_x90_to_xBF (get b (i + 1))
718 || not_in_x80_to_xBF (get b (i + 2))
719 || not_in_x80_to_xBF (get b last)
720 then false
721 else loop max b (last + 1)
722 | '\xF1' .. '\xF3' ->
723 let last = i + 3 in
724 if last > max
725 || not_in_x80_to_xBF (get b (i + 1))
726 || not_in_x80_to_xBF (get b (i + 2))
727 || not_in_x80_to_xBF (get b last)
728 then false
729 else loop max b (last + 1)
730 | '\xF4' ->
731 let last = i + 3 in
732 if last > max
733 || not_in_x80_to_x8F (get b (i + 1))
734 || not_in_x80_to_xBF (get b (i + 2))
735 || not_in_x80_to_xBF (get b last)
736 then false
737 else loop max b (last + 1)
738 | _ -> false
739 in
740 loop (length b - 1) b 0
741
742(* UTF-16BE *)
743
744let get_utf_16be_uchar b i =
745 let get = unsafe_get_uint16_be in
746 let max = length b - 1 in
747 if i < 0 || i > max then invalid_arg "index out of bounds" else
748 if i = max then dec_invalid 1 else
749 match get b i with
750 | u when u < 0xD800 || u > 0xDFFF -> dec_ret 2 u
751 | u when u > 0xDBFF -> dec_invalid 2
752 | hi -> (* combine [hi] with a low surrogate *)
753 let last = i + 3 in
754 if last > max then dec_invalid (max - i + 1) else
755 match get b (i + 2) with
756 | u when u < 0xDC00 || u > 0xDFFF -> dec_invalid 2 (* retry here *)
757 | lo ->
758 let u = (((hi land 0x3FF) lsl 10) lor (lo land 0x3FF)) + 0x10000 in
759 dec_ret 4 u
760
761let set_utf_16be_uchar b i u =
762 let set = unsafe_set_uint16_be in
763 let max = length b - 1 in
764 if i < 0 || i > max then invalid_arg "index out of bounds" else
765 match Uchar.to_int u with
766 | u when u < 0 -> assert false
767 | u when u <= 0xFFFF ->
768 let last = i + 1 in
769 if last > max then 0 else (set b i u; 2)
770 | u when u <= 0x10FFFF ->
771 let last = i + 3 in
772 if last > max then 0 else
773 let u' = u - 0x10000 in
774 let hi = (0xD800 lor (u' lsr 10)) in
775 let lo = (0xDC00 lor (u' land 0x3FF)) in
776 set b i hi; set b (i + 2) lo; 4
777 | _ -> assert false
778
779let is_valid_utf_16be b =
780 let rec loop max b i =
781 let get = unsafe_get_uint16_be in
782 if i > max then true else
783 if i = max then false else
784 match get b i with
785 | u when u < 0xD800 || u > 0xDFFF -> loop max b (i + 2)
786 | u when u > 0xDBFF -> false
787 | _hi ->
788 let last = i + 3 in
789 if last > max then false else
790 match get b (i + 2) with
791 | u when u < 0xDC00 || u > 0xDFFF -> false
792 | _lo -> loop max b (i + 4)
793 in
794 loop (length b - 1) b 0
795
796(* UTF-16LE *)
797
798let get_utf_16le_uchar b i =
799 let get = unsafe_get_uint16_le in
800 let max = length b - 1 in
801 if i < 0 || i > max then invalid_arg "index out of bounds" else
802 if i = max then dec_invalid 1 else
803 match get b i with
804 | u when u < 0xD800 || u > 0xDFFF -> dec_ret 2 u
805 | u when u > 0xDBFF -> dec_invalid 2
806 | hi -> (* combine [hi] with a low surrogate *)
807 let last = i + 3 in
808 if last > max then dec_invalid (max - i + 1) else
809 match get b (i + 2) with
810 | u when u < 0xDC00 || u > 0xDFFF -> dec_invalid 2 (* retry here *)
811 | lo ->
812 let u = (((hi land 0x3FF) lsl 10) lor (lo land 0x3FF)) + 0x10000 in
813 dec_ret 4 u
814
815let set_utf_16le_uchar b i u =
816 let set = unsafe_set_uint16_le in
817 let max = length b - 1 in
818 if i < 0 || i > max then invalid_arg "index out of bounds" else
819 match Uchar.to_int u with
820 | u when u < 0 -> assert false
821 | u when u <= 0xFFFF ->
822 let last = i + 1 in
823 if last > max then 0 else (set b i u; 2)
824 | u when u <= 0x10FFFF ->
825 let last = i + 3 in
826 if last > max then 0 else
827 let u' = u - 0x10000 in
828 let hi = (0xD800 lor (u' lsr 10)) in
829 let lo = (0xDC00 lor (u' land 0x3FF)) in
830 set b i hi; set b (i + 2) lo; 4
831 | _ -> assert false
832
833let is_valid_utf_16le b =
834 let rec loop max b i =
835 let get = unsafe_get_uint16_le in
836 if i > max then true else
837 if i = max then false else
838 match get b i with
839 | u when u < 0xD800 || u > 0xDFFF -> loop max b (i + 2)
840 | u when u > 0xDBFF -> false
841 | _hi ->
842 let last = i + 3 in
843 if last > max then false else
844 match get b (i + 2) with
845 | u when u < 0xDC00 || u > 0xDFFF -> false
846 | _lo -> loop max b (i + 4)
847 in
848 loop (length b - 1) b 0