My working unpac repository
1(**************************************************************************)
2(* *)
3(* OCaml *)
4(* *)
5(* Xavier Leroy, projet Cristal, INRIA Rocquencourt *)
6(* *)
7(* Copyright 1996 Institut National de Recherche en Informatique et *)
8(* en Automatique. *)
9(* *)
10(* All rights reserved. This file is distributed under the terms of *)
11(* the GNU Lesser General Public License version 2.1, with the *)
12(* special exception on linking described in the file LICENSE. *)
13(* *)
14(**************************************************************************)
15
16(* NOTE:
17 If this file is stringLabels.mli, run tools/sync_stdlib_docs after editing
18 it to generate string.mli.
19
20 If this file is string.mli, do not edit it directly -- edit
21 stringLabels.mli instead.
22 *)
23
24(** Strings.
25
26 A string [s] of length [n] is an indexable and immutable sequence
27 of [n] bytes. For historical reasons these bytes are referred to
28 as characters.
29
30 The semantics of string functions is defined in terms of
31 indices and positions. These are depicted and described
32 as follows.
33
34{v
35positions 0 1 2 3 4 n-1 n
36 +---+---+---+---+ +-----+
37 indices | 0 | 1 | 2 | 3 | ... | n-1 |
38 +---+---+---+---+ +-----+
39v}
40 {ul
41 {- An {e index} [i] of [s] is an integer in the range \[[0];[n-1]\].
42 It represents the [i]th byte (character) of [s] which can be
43 accessed using the constant time string indexing operator
44 [s.[i]].}
45 {- A {e position} [i] of [s] is an integer in the range
46 \[[0];[n]\]. It represents either the point at the beginning of
47 the string, or the point between two indices, or the point at
48 the end of the string. The [i]th byte index is between position
49 [i] and [i+1].}}
50
51 Two integers [start] and [len] are said to define a {e valid
52 substring} of [s] if [len >= 0] and [start], [start+len] are
53 positions of [s].
54
55 {b Unicode text.} Strings being arbitrary sequences of bytes, they
56 can hold any kind of textual encoding. However the recommended
57 encoding for storing Unicode text in OCaml strings is UTF-8. This
58 is the encoding used by Unicode escapes in string literals. For
59 example the string ["\u{1F42B}"] is the UTF-8 encoding of the
60 Unicode character U+1F42B.
61
62 {b Past mutability.} Before OCaml 4.02, strings used to be modifiable in
63 place like {!Bytes.t} mutable sequences of bytes.
64 OCaml 4 had various compiler flags and configuration options to support the
65 transition period from mutable to immutable strings.
66 Those options are no longer available, and strings are now always
67 immutable.
68
69 The labeled version of this module can be used as described in the
70 {!StdLabels} module.
71*)
72
73(** {1:strings Strings} *)
74
75type t = string
76(** The type for strings. *)
77
78val make : int -> char -> string
79(** [make n c] is a string of length [n] with each index holding the
80 character [c].
81
82 @raise Invalid_argument if [n < 0] or [n > ]{!Sys.max_string_length}. *)
83
84val init : int -> f:(int -> char) -> string
85(** [init n ~f] is a string of length [n] with index
86 [i] holding the character [f i] (called in increasing index order).
87
88 @raise Invalid_argument if [n < 0] or [n > ]{!Sys.max_string_length}.
89 @since 4.02 *)
90
91val empty : string
92(** The empty string.
93
94 @since 4.13
95*)
96
97external length : string -> int = "%string_length"
98(** [length s] is the length (number of bytes/characters) of [s]. *)
99
100external get : string -> int -> char = "%string_safe_get"
101(** [get s i] is the character at index [i] in [s]. This is the same
102 as writing [s.[i]].
103
104 @raise Invalid_argument if [i] is not an index of [s]. *)
105
106val of_bytes : bytes -> string
107(** Return a new string that contains the same bytes as the given byte
108 sequence.
109
110 @since 4.13
111*)
112
113val to_bytes : string -> bytes
114(** Return a new byte sequence that contains the same bytes as the given
115 string.
116
117 @since 4.13
118*)
119
120val blit :
121 src:string -> src_pos:int -> dst:bytes -> dst_pos:int -> len:int -> unit
122(** Same as {!Bytes.blit_string} which should be preferred. *)
123
124(** {1:concat Concatenating}
125
126 {b Note.} The {!Stdlib.( ^ )} binary operator concatenates two
127 strings. *)
128
129val concat : sep:string -> string list -> string
130(** [concat ~sep ss] concatenates the list of strings [ss], inserting
131 the separator string [sep] between each.
132
133 @raise Invalid_argument if the result is longer than
134 {!Sys.max_string_length} bytes. *)
135
136val cat : string -> string -> string
137(** [cat s1 s2] concatenates [s1] and [s2] ([s1 ^ s2]).
138
139 @raise Invalid_argument if the result is longer than
140 {!Sys.max_string_length} bytes.
141
142 @since 4.13
143*)
144
145(** {1:predicates Predicates and comparisons} *)
146
147val equal : t -> t -> bool
148(** [equal s0 s1] is [true] if and only if [s0] and [s1] are character-wise
149 equal.
150 @since 4.03 (4.05 in StringLabels) *)
151
152val compare : t -> t -> int
153(** [compare s0 s1] sorts [s0] and [s1] in lexicographical order. [compare]
154 behaves like {!Stdlib.compare} on strings but may be more efficient. *)
155
156val starts_with :
157 prefix (* comment thwarts tools/sync_stdlib_docs *) :string -> string -> bool
158(** [starts_with ][~prefix s] is [true] if and only if [s] starts with
159 [prefix].
160
161 @since 4.13 *)
162
163val ends_with :
164 suffix (* comment thwarts tools/sync_stdlib_docs *) :string -> string -> bool
165(** [ends_with ][~suffix s] is [true] if and only if [s] ends with [suffix].
166
167 @since 4.13 *)
168
169val contains_from : string -> int -> char -> bool
170(** [contains_from s start c] is [true] if and only if [c] appears in [s]
171 after position [start].
172
173 @raise Invalid_argument if [start] is not a valid position in [s]. *)
174
175val rcontains_from : string -> int -> char -> bool
176(** [rcontains_from s stop c] is [true] if and only if [c] appears in [s]
177 before position [stop+1].
178
179 @raise Invalid_argument if [stop < 0] or [stop+1] is not a valid
180 position in [s]. *)
181
182val contains : string -> char -> bool
183(** [contains s c] is {!String.contains_from}[ s 0 c]. *)
184
185(** {1:extract Extracting substrings} *)
186
187val sub : string -> pos:int -> len:int -> string
188(** [sub s ~pos ~len] is a string of length [len], containing the
189 substring of [s] that starts at position [pos] and has length
190 [len].
191
192 @raise Invalid_argument if [pos] and [len] do not designate a valid
193 substring of [s]. *)
194
195(** {1:splitting Splitting strings} *)
196
197(** {2:splitting_mag Splitting with magnitudes} *)
198
199val take_first : int -> string -> string
200(** [take_first n s] are the first [n] bytes of [s]. This is [s] if
201 [n >= length s] and [""] if [n <= 0].
202
203 @since 5.5 *)
204
205val take_last : int -> string -> string
206(** [take_last n s] are the last [n] bytes of [s]. This is [s] if
207 [n >= length s] and [""] if [n <= 0].
208
209 @since 5.5 *)
210
211val drop_first : int -> string -> string
212(** [drop_first n s] is [s] without the first [n] bytes of [s]. This is [""]
213 if [n >= length s] and [s] if [n <= 0].
214
215 @since 5.5 *)
216
217val drop_last : int -> string -> string
218(** [drop_last n s] is [s] without the last [n] bytes of [s]. This is [""]
219 if [n >= length s] and [s] if [n <= 0].
220
221 @since 5.5 *)
222
223val cut_first : int -> string -> string * string
224(** [cut_first n s] is [(take_first n s, drop_first n s)].
225
226 @since 5.5 *)
227
228val cut_last : int -> string -> string * string
229(** [cut_last n s] is [(drop_last n s, take_last n s)].
230
231 @since 5.5 *)
232
233(** {2:splitting_preds Splitting with predicates} *)
234
235val take_first_while : (char -> bool) -> string -> string
236(** [take_first_while p s] is the first consecutive bytes of [s]
237 satisfying the predicate [p].
238
239 @since 5.5 *)
240
241val take_last_while : (char -> bool) -> string -> string
242(** [take_last_while p s] is the last consecutive bytes of [s]
243 satisfying the predicate [p].
244
245 @since 5.5 *)
246
247val drop_first_while : (char -> bool) -> string -> string
248(** [drop_first_while p s] is [s] without the first consecutive bytes of [s]
249 satisfying the predicate [p].
250
251 @since 5.5 *)
252
253val drop_last_while : (char -> bool) -> string -> string
254(** [drop_last_while p s] is [s] without the last consecutive bytes of [s]
255 satisfying the predicate [p].
256
257 @since 5.5 *)
258
259val cut_first_while : (char -> bool) -> string -> string * string
260(** [cut_first_while p s] is
261 [(take_first_while p s, drop_first_while p s)].
262
263 @since 5.5 *)
264
265val cut_last_while : (char -> bool) -> string -> string * string
266(** [cut_last_while p s] is
267 [(drop_last_while p s, take_last_while p s)].
268
269 @since 5.5 *)
270
271(** {2:splitting_sep Splitting with separators} *)
272
273val split_on_char : sep:char -> string -> string list
274(** [split_on_char ~sep s] is the list of all (possibly empty)
275 substrings of [s] that are delimited by the character [sep].
276 If [s] is empty, the result is the singleton list [[""]].
277
278 The function's result is specified by the following invariants:
279 {ul
280 {- The list is not empty.}
281 {- Concatenating its elements using [sep] as a separator returns a
282 string equal to the input ([concat ~sep:(make 1 sep)
283 (split_on_char ~sep s) = s]).}
284 {- No string in the result contains the [sep] character.}}
285
286 @since 4.04 (4.05 in StringLabels) *)
287
288(** {1:transforming Transforming} *)
289
290val map : f:(char -> char) -> string -> string
291(** [map ~f s] is the string resulting from applying [f] to all the
292 characters of [s] in increasing order.
293
294 @since 4.00 *)
295
296val mapi : f:(int -> char -> char) -> string -> string
297(** [mapi ~f s] is like {!map} but the index of the character is also
298 passed to [f].
299
300 @since 4.02 *)
301
302val fold_left : f:('acc -> char -> 'acc) -> init:'acc -> string -> 'acc
303(** [fold_left ~f ~init s] is [f (... (f (f init s.[0]) s.[1]) ...) s.[n-1]],
304 where [n] is the length of the string [s].
305 @since 4.13 *)
306
307val fold_right : f:(char -> 'acc -> 'acc) -> string -> init:'acc -> 'acc
308(** [fold_right ~f s ~init] is [f s.[0] (f s.[1] ( ... (f s.[n-1] init) ...))],
309 where [n] is the length of the string [s].
310 @since 4.13 *)
311
312val for_all : f:(char -> bool) -> string -> bool
313(** [for_all ~f s] checks if all characters in [s] satisfy the predicate [f].
314 @since 4.13 *)
315
316val exists : f:(char -> bool) -> string -> bool
317(** [exists ~f s] checks if at least one character of [s] satisfies the
318 predicate [f].
319 @since 4.13 *)
320
321val trim : string -> string
322(** [trim s] is [s] without leading and trailing whitespace. Whitespace
323 characters are: [' '], ['\x0C'] (form feed), ['\n'], ['\r'], and ['\t'].
324
325 @since 4.00 *)
326
327val escaped : string -> string
328(** [escaped s] is [s] with special characters represented by escape
329 sequences, following the lexical conventions of OCaml.
330
331 All characters outside the US-ASCII printable range \[0x20;0x7E\] are
332 escaped, as well as backslash (0x5C) and double-quote (0x22).
333
334 The function {!Scanf.unescaped} is a left inverse of [escaped],
335 i.e. [Scanf.unescaped (escaped s) = s] for any string [s] (unless
336 [escaped s] fails).
337
338 @raise Invalid_argument if the result is longer than
339 {!Sys.max_string_length} bytes. *)
340
341val uppercase_ascii : string -> string
342(** [uppercase_ascii s] is [s] with all lowercase letters
343 translated to uppercase, using the US-ASCII character set.
344
345 @since 4.03 (4.05 in StringLabels) *)
346
347val lowercase_ascii : string -> string
348(** [lowercase_ascii s] is [s] with all uppercase letters translated
349 to lowercase, using the US-ASCII character set.
350
351 @since 4.03 (4.05 in StringLabels) *)
352
353val capitalize_ascii : string -> string
354(** [capitalize_ascii s] is [s] with the first character set to
355 uppercase, using the US-ASCII character set.
356
357 @since 4.03 (4.05 in StringLabels) *)
358
359val uncapitalize_ascii : string -> string
360(** [uncapitalize_ascii s] is [s] with the first character set to lowercase,
361 using the US-ASCII character set.
362
363 @since 4.03 (4.05 in StringLabels) *)
364
365(** {1:traversing Traversing} *)
366
367val iter : f:(char -> unit) -> string -> unit
368(** [iter ~f s] applies function [f] in turn to all the characters of [s].
369 It is equivalent to [f s.[0]; f s.[1]; ...; f s.[length s - 1]; ()]. *)
370
371val iteri : f:(int -> char -> unit) -> string -> unit
372(** [iteri] is like {!iter}, but the function is also given the
373 corresponding character index.
374
375 @since 4.00 *)
376
377(** {1:searching Searching} *)
378
379val index_from : string -> int -> char -> int
380(** [index_from s i c] is the index of the first occurrence of [c] in
381 [s] after position [i].
382
383 @raise Not_found if [c] does not occur in [s] after position [i].
384 @raise Invalid_argument if [i] is not a valid position in [s]. *)
385
386
387val index_from_opt : string -> int -> char -> int option
388(** [index_from_opt s i c] is the index of the first occurrence of [c]
389 in [s] after position [i] (if any).
390
391 @raise Invalid_argument if [i] is not a valid position in [s].
392 @since 4.05 *)
393
394val rindex_from : string -> int -> char -> int
395(** [rindex_from s i c] is the index of the last occurrence of [c] in
396 [s] before position [i+1].
397
398 @raise Not_found if [c] does not occur in [s] before position [i+1].
399 @raise Invalid_argument if [i+1] is not a valid position in [s]. *)
400
401val rindex_from_opt : string -> int -> char -> int option
402(** [rindex_from_opt s i c] is the index of the last occurrence of [c]
403 in [s] before position [i+1] (if any).
404
405 @raise Invalid_argument if [i+1] is not a valid position in [s].
406 @since 4.05 *)
407
408val index : string -> char -> int
409(** [index s c] is {!String.index_from}[ s 0 c]. *)
410
411val index_opt : string -> char -> int option
412(** [index_opt s c] is {!String.index_from_opt}[ s 0 c].
413
414 @since 4.05 *)
415
416val rindex : string -> char -> int
417(** [rindex s c] is {!String.rindex_from}[ s (length s - 1) c]. *)
418
419val rindex_opt : string -> char -> int option
420(** [rindex_opt s c] is {!String.rindex_from_opt}[ s (length s - 1) c].
421
422 @since 4.05 *)
423
424(** {1 Strings and Sequences} *)
425
426val to_seq : t -> char Seq.t
427(** [to_seq s] is a sequence made of the string's characters in
428 increasing order.
429
430 @since 4.07 *)
431
432val to_seqi : t -> (int * char) Seq.t
433(** [to_seqi s] is like {!to_seq} but also tuples the corresponding index.
434
435 @since 4.07 *)
436
437val of_seq : char Seq.t -> t
438(** [of_seq s] is a string made of the sequence's characters.
439
440 @since 4.07 *)
441
442(** {1:utf UTF decoding and validations}
443
444 @since 4.14 *)
445
446(** {2:utf_8 UTF-8} *)
447
448val get_utf_8_uchar : t -> int -> Uchar.utf_decode
449(** [get_utf_8_uchar b i] decodes a UTF-8 character at index [i] in
450 [b]. *)
451
452val is_valid_utf_8 : t -> bool
453(** [is_valid_utf_8 b] is [true] if and only if [b] contains valid
454 UTF-8 data. *)
455
456(** {2:utf_16be UTF-16BE} *)
457
458val get_utf_16be_uchar : t -> int -> Uchar.utf_decode
459(** [get_utf_16be_uchar b i] decodes a UTF-16BE character at index
460 [i] in [b]. *)
461
462val is_valid_utf_16be : t -> bool
463(** [is_valid_utf_16be b] is [true] if and only if [b] contains valid
464 UTF-16BE data. *)
465
466(** {2:utf_16le UTF-16LE} *)
467
468val get_utf_16le_uchar : t -> int -> Uchar.utf_decode
469(** [get_utf_16le_uchar b i] decodes a UTF-16LE character at index
470 [i] in [b]. *)
471
472val is_valid_utf_16le : t -> bool
473(** [is_valid_utf_16le b] is [true] if and only if [b] contains valid
474 UTF-16LE data. *)
475
476(** {1:spellchecking Spellchecking} *)
477
478val edit_distance : ?limit:int -> t -> t -> int
479(** [edit_distance s0 s1] is the number of single character edits
480 (understood as insertion, deletion, substitution, transposition)
481 that are needed to change [s0] into [s1].
482
483 If [limit] is provided the function returns with [limit] as soon
484 as it was determined that [s0] and [s1] have distance of at least
485 [limit]. This is faster if you have a fixed limit, for example for
486 spellchecking.
487
488 The function assumes the strings are UTF-8 encoded and uses {!Uchar.t}
489 for the notion of character. Decoding errors are replaced by
490 {!Uchar.rep}. Normalizing the strings to
491 {{:https://unicode.org/glossary/#normalization_form_c}NFC} gives
492 better results.
493
494 {b Note.} This implements the simpler Optimal String Alignment (OSA)
495 distance, not the Damerau-Levenshtein distance. With this function
496 ["ca"] and ["abc"] have a distance of 3 not 2.
497
498 @since 5.4
499*)
500
501val spellcheck :
502 ?max_dist:(string -> int) -> ((string -> unit) -> unit) -> string ->
503 string list
504(** [spellcheck iter_dict s] are the strings enumerated by the
505 iterator [iter_dict] whose {{!edit_distance}edit distance} to [s]
506 is the smallest and at most [max_dist s]. If multiple corrections
507 are returned their order is as found in [iter_dict]. The default
508 [max_dist s] is:
509
510 {ul
511 {- [0] if [s] has 0 to 2 Unicode characters.}
512 {- [1] if [s] has 3 to 4 Unicode characters.}
513 {- [2] otherwise.}}
514
515 If your dictionary is a list [l], a suitable [iter_dict] is given
516 by [(fun yield -> List.iter yield l)].
517
518 All strings are assumed to be UTF-8 encoded, decoding
519 errors are replaced by {!Uchar.rep} characters.
520
521 @since 5.4 *)
522
523(** {1 Binary decoding of integers} *)
524
525(** The functions in this section binary decode integers from strings.
526
527 All following functions raise [Invalid_argument] if the characters
528 needed at index [i] to decode the integer are not available.
529
530 Little-endian (resp. big-endian) encoding means that least
531 (resp. most) significant bytes are stored first. Big-endian is
532 also known as network byte order. Native-endian encoding is
533 either little-endian or big-endian depending on {!Sys.big_endian}.
534
535 32-bit and 64-bit integers are represented by the [int32] and
536 [int64] types, which can be interpreted either as signed or
537 unsigned numbers.
538
539 8-bit and 16-bit integers are represented by the [int] type,
540 which has more bits than the binary encoding. These extra bits
541 are sign-extended (or zero-extended) for functions which decode 8-bit
542 or 16-bit integers and represent them with [int] values.
543*)
544
545val get_uint8 : string -> int -> int
546(** [get_uint8 b i] is [b]'s unsigned 8-bit integer starting at character
547 index [i].
548
549 @since 4.13
550*)
551
552val get_int8 : string -> int -> int
553(** [get_int8 b i] is [b]'s signed 8-bit integer starting at character
554 index [i].
555
556 @since 4.13
557*)
558
559val get_uint16_ne : string -> int -> int
560(** [get_uint16_ne b i] is [b]'s native-endian unsigned 16-bit integer
561 starting at character index [i].
562
563 @since 4.13
564*)
565
566val get_uint16_be : string -> int -> int
567(** [get_uint16_be b i] is [b]'s big-endian unsigned 16-bit integer
568 starting at character index [i].
569
570 @since 4.13
571*)
572
573val get_uint16_le : string -> int -> int
574(** [get_uint16_le b i] is [b]'s little-endian unsigned 16-bit integer
575 starting at character index [i].
576
577 @since 4.13
578*)
579
580val get_int16_ne : string -> int -> int
581(** [get_int16_ne b i] is [b]'s native-endian signed 16-bit integer
582 starting at character index [i].
583
584 @since 4.13
585*)
586
587val get_int16_be : string -> int -> int
588(** [get_int16_be b i] is [b]'s big-endian signed 16-bit integer
589 starting at character index [i].
590
591 @since 4.13
592*)
593
594val get_int16_le : string -> int -> int
595(** [get_int16_le b i] is [b]'s little-endian signed 16-bit integer
596 starting at character index [i].
597
598 @since 4.13
599*)
600
601val get_int32_ne : string -> int -> int32
602(** [get_int32_ne b i] is [b]'s native-endian 32-bit integer
603 starting at character index [i].
604
605 @since 4.13
606*)
607
608val hash : t -> int
609(** An unseeded hash function for strings, with the same output value as
610 {!Hashtbl.hash}. This function allows this module to be passed as argument
611 to the functor {!Hashtbl.Make}.
612
613 @since 5.0 *)
614
615val seeded_hash : int -> t -> int
616(** A seeded hash function for strings, with the same output value as
617 {!Hashtbl.seeded_hash}. This function allows this module to be passed as
618 argument to the functor {!Hashtbl.MakeSeeded}.
619
620 @since 5.0 *)
621
622val get_int32_be : string -> int -> int32
623(** [get_int32_be b i] is [b]'s big-endian 32-bit integer
624 starting at character index [i].
625
626 @since 4.13
627*)
628
629val get_int32_le : string -> int -> int32
630(** [get_int32_le b i] is [b]'s little-endian 32-bit integer
631 starting at character index [i].
632
633 @since 4.13
634*)
635
636val get_int64_ne : string -> int -> int64
637(** [get_int64_ne b i] is [b]'s native-endian 64-bit integer
638 starting at character index [i].
639
640 @since 4.13
641*)
642
643val get_int64_be : string -> int -> int64
644(** [get_int64_be b i] is [b]'s big-endian 64-bit integer
645 starting at character index [i].
646
647 @since 4.13
648*)
649
650val get_int64_le : string -> int -> int64
651(** [get_int64_le b i] is [b]'s little-endian 64-bit integer
652 starting at character index [i].
653
654 @since 4.13
655*)
656
657(**/**)
658
659(* The following is for system use only. Do not call directly. *)
660
661external unsafe_get : string -> int -> char = "%string_unsafe_get"
662external unsafe_blit :
663 src:string -> src_pos:int -> dst:bytes -> dst_pos:int -> len:int ->
664 unit = "caml_blit_string" [@@noalloc]