stdlib/stringLabels.mli at opam/upstream/seq · anil.recoil.org/unpac-work

My working unpac repository
unpac-work / stdlib / stringLabels.mli
at opam/upstream/seq 664 lines 22 kB view raw
wrap content
  1(**************************************************************************)
  2(*                                                                        *)
  3(*                                 OCaml                                  *)
  4(*                                                                        *)
  5(*             Xavier Leroy, projet Cristal, INRIA Rocquencourt           *)
  6(*                                                                        *)
  7(*   Copyright 1996 Institut National de Recherche en Informatique et     *)
  8(*     en Automatique.                                                    *)
  9(*                                                                        *)
 10(*   All rights reserved.  This file is distributed under the terms of    *)
 11(*   the GNU Lesser General Public License version 2.1, with the          *)
 12(*   special exception on linking described in the file LICENSE.          *)
 13(*                                                                        *)
 14(**************************************************************************)
 15
 16(* NOTE:
 17   If this file is stringLabels.mli, run tools/sync_stdlib_docs after editing
 18   it to generate string.mli.
 19
 20   If this file is string.mli, do not edit it directly -- edit
 21   stringLabels.mli instead.
 22 *)
 23
 24(** Strings.
 25
 26    A string [s] of length [n] is an indexable and immutable sequence
 27    of [n] bytes. For historical reasons these bytes are referred to
 28    as characters.
 29
 30    The semantics of string functions is defined in terms of
 31    indices and positions. These are depicted and described
 32    as follows.
 33
 34{v
 35positions  0   1   2   3   4    n-1    n
 36           +---+---+---+---+     +-----+
 37  indices  | 0 | 1 | 2 | 3 | ... | n-1 |
 38           +---+---+---+---+     +-----+
 39v}
 40    {ul
 41    {- An {e index} [i] of [s] is an integer in the range \[[0];[n-1]\].
 42       It represents the [i]th byte (character) of [s] which can be
 43       accessed using the constant time string indexing operator
 44       [s.[i]].}
 45    {- A {e position} [i] of [s] is an integer in the range
 46       \[[0];[n]\]. It represents either the point at the beginning of
 47       the string, or the point between two indices, or the point at
 48       the end of the string. The [i]th byte index is between position
 49       [i] and [i+1].}}
 50
 51    Two integers [start] and [len] are said to define a {e valid
 52    substring} of [s] if [len >= 0] and [start], [start+len] are
 53    positions of [s].
 54
 55    {b Unicode text.} Strings being arbitrary sequences of bytes, they
 56    can hold any kind of textual encoding. However the recommended
 57    encoding for storing Unicode text in OCaml strings is UTF-8. This
 58    is the encoding used by Unicode escapes in string literals. For
 59    example the string ["\u{1F42B}"] is the UTF-8 encoding of the
 60    Unicode character U+1F42B.
 61
 62    {b Past mutability.} Before OCaml 4.02, strings used to be modifiable in
 63    place like {!Bytes.t} mutable sequences of bytes.
 64    OCaml 4 had various compiler flags and configuration options to support the
 65    transition period from mutable to immutable strings.
 66    Those options are no longer available, and strings are now always
 67    immutable.
 68
 69    The labeled version of this module can be used as described in the
 70    {!StdLabels} module.
 71*)
 72
 73(** {1:strings Strings} *)
 74
 75type t = string
 76(** The type for strings. *)
 77
 78val make : int -> char -> string
 79(** [make n c] is a string of length [n] with each index holding the
 80    character [c].
 81
 82    @raise Invalid_argument if [n < 0] or [n > ]{!Sys.max_string_length}. *)
 83
 84val init : int -> f:(int -> char) -> string
 85(** [init n ~f] is a string of length [n] with index
 86    [i] holding the character [f i] (called in increasing index order).
 87
 88    @raise Invalid_argument if [n < 0] or [n > ]{!Sys.max_string_length}.
 89    @since 4.02 *)
 90
 91val empty : string
 92(** The empty string.
 93
 94    @since 4.13
 95*)
 96
 97external length : string -> int = "%string_length"
 98(** [length s] is the length (number of bytes/characters) of [s]. *)
 99
100external get : string -> int -> char = "%string_safe_get"
101(** [get s i] is the character at index [i] in [s]. This is the same
102    as writing [s.[i]].
103
104    @raise Invalid_argument if [i] is not an index of [s]. *)
105
106val of_bytes : bytes -> string
107(** Return a new string that contains the same bytes as the given byte
108    sequence.
109
110    @since 4.13
111*)
112
113val to_bytes : string -> bytes
114(** Return a new byte sequence that contains the same bytes as the given
115    string.
116
117    @since 4.13
118*)
119
120val blit :
121  src:string -> src_pos:int -> dst:bytes -> dst_pos:int -> len:int -> unit
122(** Same as {!Bytes.blit_string} which should be preferred. *)
123
124(** {1:concat Concatenating}
125
126    {b Note.} The {!Stdlib.( ^ )} binary operator concatenates two
127    strings. *)
128
129val concat : sep:string -> string list -> string
130(** [concat ~sep ss] concatenates the list of strings [ss], inserting
131    the separator string [sep] between each.
132
133    @raise Invalid_argument if the result is longer than
134    {!Sys.max_string_length} bytes. *)
135
136val cat : string -> string -> string
137(** [cat s1 s2] concatenates [s1] and [s2] ([s1 ^ s2]).
138
139    @raise Invalid_argument if the result is longer than
140    {!Sys.max_string_length} bytes.
141
142    @since 4.13
143*)
144
145(** {1:predicates Predicates and comparisons} *)
146
147val equal : t -> t -> bool
148(** [equal s0 s1] is [true] if and only if [s0] and [s1] are character-wise
149    equal.
150    @since 4.03 (4.05 in StringLabels) *)
151
152val compare : t -> t -> int
153(** [compare s0 s1] sorts [s0] and [s1] in lexicographical order. [compare]
154    behaves like {!Stdlib.compare} on strings but may be more efficient. *)
155
156val starts_with :
157  prefix (* comment thwarts tools/sync_stdlib_docs *) :string -> string -> bool
158(** [starts_with ][~prefix s] is [true] if and only if [s] starts with
159    [prefix].
160
161    @since 4.13 *)
162
163val ends_with :
164  suffix (* comment thwarts tools/sync_stdlib_docs *) :string -> string -> bool
165(** [ends_with ][~suffix s] is [true] if and only if [s] ends with [suffix].
166
167    @since 4.13 *)
168
169val contains_from : string -> int -> char -> bool
170(** [contains_from s start c] is [true] if and only if [c] appears in [s]
171    after position [start].
172
173    @raise Invalid_argument if [start] is not a valid position in [s]. *)
174
175val rcontains_from : string -> int -> char -> bool
176(** [rcontains_from s stop c] is [true] if and only if [c] appears in [s]
177    before position [stop+1].
178
179    @raise Invalid_argument if [stop < 0] or [stop+1] is not a valid
180    position in [s]. *)
181
182val contains : string -> char -> bool
183(** [contains s c] is {!String.contains_from}[ s 0 c]. *)
184
185(** {1:extract Extracting substrings} *)
186
187val sub : string -> pos:int -> len:int -> string
188(** [sub s ~pos ~len] is a string of length [len], containing the
189    substring of [s] that starts at position [pos] and has length
190    [len].
191
192    @raise Invalid_argument if [pos] and [len] do not designate a valid
193    substring of [s]. *)
194
195(** {1:splitting Splitting strings} *)
196
197(** {2:splitting_mag Splitting with magnitudes} *)
198
199val take_first : int -> string -> string
200(** [take_first n s] are the first [n] bytes of [s]. This is [s] if
201    [n >= length s] and [""] if [n <= 0].
202
203    @since 5.5 *)
204
205val take_last : int -> string -> string
206(** [take_last n s] are the last [n] bytes of [s].  This is [s] if
207    [n >= length s] and [""] if [n <= 0].
208
209    @since 5.5 *)
210
211val drop_first : int -> string -> string
212(** [drop_first n s] is [s] without the first [n] bytes of [s]. This is [""]
213    if [n >= length s] and [s] if [n <= 0].
214
215    @since 5.5 *)
216
217val drop_last : int -> string -> string
218(** [drop_last n s] is [s] without the last [n] bytes of [s]. This is [""]
219    if [n >= length s] and [s] if [n <= 0].
220
221    @since 5.5 *)
222
223val cut_first : int -> string -> string * string
224(** [cut_first n s] is [(take_first n s, drop_first n s)].
225
226    @since 5.5 *)
227
228val cut_last : int -> string -> string * string
229(** [cut_last n s] is [(drop_last n s, take_last n s)].
230
231    @since 5.5 *)
232
233(** {2:splitting_preds Splitting with predicates} *)
234
235val take_first_while : (char -> bool) -> string -> string
236(** [take_first_while p s] is the first consecutive bytes of [s]
237    satisfying the predicate [p].
238
239    @since 5.5 *)
240
241val take_last_while : (char -> bool) -> string -> string
242(** [take_last_while p s] is the last consecutive bytes of [s]
243    satisfying the predicate [p].
244
245    @since 5.5 *)
246
247val drop_first_while : (char -> bool) -> string -> string
248(** [drop_first_while p s] is [s] without the first consecutive bytes of [s]
249    satisfying the predicate [p].
250
251    @since 5.5 *)
252
253val drop_last_while : (char -> bool) -> string -> string
254(** [drop_last_while p s] is [s] without the last consecutive bytes of [s]
255    satisfying the predicate [p].
256
257    @since 5.5 *)
258
259val cut_first_while : (char -> bool) -> string -> string * string
260(** [cut_first_while p s] is
261    [(take_first_while p s, drop_first_while p s)].
262
263    @since 5.5 *)
264
265val cut_last_while : (char -> bool) -> string -> string * string
266(** [cut_last_while p s] is
267    [(drop_last_while p s, take_last_while p s)].
268
269    @since 5.5 *)
270
271(** {2:splitting_sep Splitting with separators} *)
272
273val split_on_char : sep:char -> string -> string list
274(** [split_on_char ~sep s] is the list of all (possibly empty)
275    substrings of [s] that are delimited by the character [sep].
276    If [s] is empty, the result is the singleton list [[""]].
277
278    The function's result is specified by the following invariants:
279    {ul
280    {- The list is not empty.}
281    {- Concatenating its elements using [sep] as a separator returns a
282      string equal to the input ([concat (make 1 sep)
283      (split_on_char sep s) = s]).}
284    {- No string in the result contains the [sep] character.}}
285
286    @since 4.04 (4.05 in StringLabels) *)
287
288(** {1:transforming Transforming} *)
289
290val map : f:(char -> char) -> string -> string
291(** [map ~f s] is the string resulting from applying [f] to all the
292    characters of [s] in increasing order.
293
294    @since 4.00 *)
295
296val mapi : f:(int -> char -> char) -> string -> string
297(** [mapi ~f s] is like {!map} but the index of the character is also
298    passed to [f].
299
300    @since 4.02 *)
301
302val fold_left : f:('acc -> char -> 'acc) -> init:'acc -> string -> 'acc
303(** [fold_left f x s] computes [f (... (f (f x s.[0]) s.[1]) ...) s.[n-1]],
304    where [n] is the length of the string [s].
305    @since 4.13 *)
306
307val fold_right : f:(char -> 'acc -> 'acc) -> string -> init:'acc -> 'acc
308(** [fold_right f s x] computes [f s.[0] (f s.[1] ( ... (f s.[n-1] x) ...))],
309    where [n] is the length of the string [s].
310    @since 4.13 *)
311
312val for_all : f:(char -> bool) -> string -> bool
313(** [for_all p s] checks if all characters in [s] satisfy the predicate [p].
314    @since 4.13 *)
315
316val exists : f:(char -> bool) -> string -> bool
317(** [exists p s] checks if at least one character of [s] satisfies the predicate
318    [p].
319    @since 4.13 *)
320
321val trim : string -> string
322(** [trim s] is [s] without leading and trailing whitespace. Whitespace
323    characters are: [' '], ['\x0C'] (form feed), ['\n'], ['\r'], and ['\t'].
324
325    @since 4.00 *)
326
327val escaped : string -> string
328(** [escaped s] is [s] with special characters represented by escape
329    sequences, following the lexical conventions of OCaml.
330
331    All characters outside the US-ASCII printable range \[0x20;0x7E\] are
332    escaped, as well as backslash (0x5C) and double-quote (0x22).
333
334    The function {!Scanf.unescaped} is a left inverse of [escaped],
335    i.e. [Scanf.unescaped (escaped s) = s] for any string [s] (unless
336    [escaped s] fails).
337
338    @raise Invalid_argument if the result is longer than
339    {!Sys.max_string_length} bytes. *)
340
341val uppercase_ascii : string -> string
342(** [uppercase_ascii s] is [s] with all lowercase letters
343    translated to uppercase, using the US-ASCII character set.
344
345    @since 4.03 (4.05 in StringLabels) *)
346
347val lowercase_ascii : string -> string
348(** [lowercase_ascii s] is [s] with all uppercase letters translated
349    to lowercase, using the US-ASCII character set.
350
351    @since 4.03 (4.05 in StringLabels) *)
352
353val capitalize_ascii : string -> string
354(** [capitalize_ascii s] is [s] with the first character set to
355    uppercase, using the US-ASCII character set.
356
357    @since 4.03 (4.05 in StringLabels) *)
358
359val uncapitalize_ascii : string -> string
360(** [uncapitalize_ascii s] is [s] with the first character set to lowercase,
361    using the US-ASCII character set.
362
363    @since 4.03 (4.05 in StringLabels) *)
364
365(** {1:traversing Traversing} *)
366
367val iter : f:(char -> unit) -> string -> unit
368(** [iter ~f s] applies function [f] in turn to all the characters of [s].
369    It is equivalent to [f s.[0]; f s.[1]; ...; f s.[length s - 1]; ()]. *)
370
371val iteri : f:(int -> char -> unit) -> string -> unit
372(** [iteri] is like {!iter}, but the function is also given the
373    corresponding character index.
374
375    @since 4.00 *)
376
377(** {1:searching Searching} *)
378
379val index_from : string -> int -> char -> int
380(** [index_from s i c] is the index of the first occurrence of [c] in
381    [s] after position [i].
382
383    @raise Not_found if [c] does not occur in [s] after position [i].
384    @raise Invalid_argument if [i] is not a valid position in [s]. *)
385
386
387val index_from_opt : string -> int -> char -> int option
388(** [index_from_opt s i c] is the index of the first occurrence of [c]
389    in [s] after position [i] (if any).
390
391    @raise Invalid_argument if [i] is not a valid position in [s].
392    @since 4.05 *)
393
394val rindex_from : string -> int -> char -> int
395(** [rindex_from s i c] is the index of the last occurrence of [c] in
396    [s] before position [i+1].
397
398    @raise Not_found if [c] does not occur in [s] before position [i+1].
399    @raise Invalid_argument if [i+1] is not a valid position in [s]. *)
400
401val rindex_from_opt : string -> int -> char -> int option
402(** [rindex_from_opt s i c] is the index of the last occurrence of [c]
403    in [s] before position [i+1] (if any).
404
405    @raise Invalid_argument if [i+1] is not a valid position in [s].
406    @since 4.05 *)
407
408val index : string -> char -> int
409(** [index s c] is {!String.index_from}[ s 0 c]. *)
410
411val index_opt : string -> char -> int option
412(** [index_opt s c] is {!String.index_from_opt}[ s 0 c].
413
414    @since 4.05 *)
415
416val rindex : string -> char -> int
417(** [rindex s c] is {!String.rindex_from}[ s (length s - 1) c]. *)
418
419val rindex_opt : string -> char -> int option
420(** [rindex_opt s c] is {!String.rindex_from_opt}[ s (length s - 1) c].
421
422    @since 4.05 *)
423
424(** {1 Strings and Sequences} *)
425
426val to_seq : t -> char Seq.t
427(** [to_seq s] is a sequence made of the string's characters in
428    increasing order.
429
430    @since 4.07 *)
431
432val to_seqi : t -> (int * char) Seq.t
433(** [to_seqi s] is like {!to_seq} but also tuples the corresponding index.
434
435    @since 4.07 *)
436
437val of_seq : char Seq.t -> t
438(** [of_seq s] is a string made of the sequence's characters.
439
440    @since 4.07 *)
441
442(** {1:utf UTF decoding and validations}
443
444    @since 4.14 *)
445
446(** {2:utf_8 UTF-8} *)
447
448val get_utf_8_uchar : t -> int -> Uchar.utf_decode
449(** [get_utf_8_uchar b i] decodes an UTF-8 character at index [i] in
450    [b]. *)
451
452val is_valid_utf_8 : t -> bool
453(** [is_valid_utf_8 b] is [true] if and only if [b] contains valid
454    UTF-8 data. *)
455
456(** {2:utf_16be UTF-16BE} *)
457
458val get_utf_16be_uchar : t -> int -> Uchar.utf_decode
459(** [get_utf_16be_uchar b i] decodes an UTF-16BE character at index
460    [i] in [b]. *)
461
462val is_valid_utf_16be : t -> bool
463(** [is_valid_utf_16be b] is [true] if and only if [b] contains valid
464    UTF-16BE data. *)
465
466(** {2:utf_16le UTF-16LE} *)
467
468val get_utf_16le_uchar : t -> int -> Uchar.utf_decode
469(** [get_utf_16le_uchar b i] decodes an UTF-16LE character at index
470    [i] in [b]. *)
471
472val is_valid_utf_16le : t -> bool
473(** [is_valid_utf_16le b] is [true] if and only if [b] contains valid
474    UTF-16LE data. *)
475
476(** {1:spellchecking Spellchecking} *)
477
478val edit_distance : ?limit:int -> t -> t -> int
479(** [edit_distance s0 s1] is the number of single character edits
480    (understood as insertion, deletion, substitution, transposition)
481    that are needed to change [s0] into [s1].
482
483    If [limit] is provided, the function returns [limit] as soon
484    as it is determined that [s0] and [s1] have a distance of at least
485    [limit]. This is faster if you have a fixed limit, for example for
486    spellchecking.
487
488    The function assumes the strings are UTF-8 encoded and uses {!Uchar.t}
489    for the notion of character. Decoding errors are replaced by
490    {!Uchar.rep}. Normalizing the strings to
491    {{:https://unicode.org/glossary/#normalization_form_c}NFC} gives
492    better results.
493
494    {b Note.} This implements the simpler Optimal String Alignment (OSA)
495    distance, not the Damerau-Levenshtein distance. With this function
496    ["ca"] and ["abc"] have a distance of 3 not 2.
497
498    @since 5.4
499*)
500
501val spellcheck :
502  ?max_dist:(string -> int) -> ((string -> unit) -> unit) -> string ->
503  string list
504(** [spellcheck iter_dict s] are the strings enumerated by the
505    iterator [iter_dict] whose {{!edit_distance}edit distance} to [s]
506    is the smallest and at most [max_dist s]. If multiple corrections
507    are returned their order is as found in [iter_dict]. The default
508    [max_dist s] is:
509
510    {ul
511    {- [0] if [s] has 0 to 2 Unicode characters.}
512    {- [1] if [s] has 3 to 4 Unicode characters.}
513    {- [2] otherwise.}}
514
515    If your dictionary is a list [l], a suitable [iter_dict] is given
516    by [(fun yield -> List.iter yield l)].
517
518    All strings are assumed to be UTF-8 encoded, decoding
519    errors are replaced by {!Uchar.rep} characters.
520
521    @since 5.4 *)
522
523(** {1 Binary decoding of integers} *)
524
525(** The functions in this section binary decode integers from strings.
526
527    All following functions raise [Invalid_argument] if the characters
528    needed at index [i] to decode the integer are not available.
529
530    Little-endian (resp. big-endian) encoding means that least
531    (resp. most) significant bytes are stored first.  Big-endian is
532    also known as network byte order.  Native-endian encoding is
533    either little-endian or big-endian depending on {!Sys.big_endian}.
534
535    32-bit and 64-bit integers are represented by the [int32] and
536    [int64] types, which can be interpreted either as signed or
537    unsigned numbers.
538
539    8-bit and 16-bit integers are represented by the [int] type,
540    which has more bits than the binary encoding.  These extra bits
541    are sign-extended (or zero-extended) for functions which decode 8-bit
542    or 16-bit integers and represented them with [int] values.
543*)
544
545val get_uint8 : string -> int -> int
546(** [get_uint8 b i] is [b]'s unsigned 8-bit integer starting at character
547    index [i].
548
549    @since 4.13
550*)
551
552val get_int8 : string -> int -> int
553(** [get_int8 b i] is [b]'s signed 8-bit integer starting at character
554    index [i].
555
556    @since 4.13
557*)
558
559val get_uint16_ne : string -> int -> int
560(** [get_uint16_ne b i] is [b]'s native-endian unsigned 16-bit integer
561    starting at character index [i].
562
563    @since 4.13
564*)
565
566val get_uint16_be : string -> int -> int
567(** [get_uint16_be b i] is [b]'s big-endian unsigned 16-bit integer
568    starting at character index [i].
569
570    @since 4.13
571*)
572
573val get_uint16_le : string -> int -> int
574(** [get_uint16_le b i] is [b]'s little-endian unsigned 16-bit integer
575    starting at character index [i].
576
577    @since 4.13
578*)
579
580val get_int16_ne : string -> int -> int
581(** [get_int16_ne b i] is [b]'s native-endian signed 16-bit integer
582    starting at character index [i].
583
584    @since 4.13
585*)
586
587val get_int16_be : string -> int -> int
588(** [get_int16_be b i] is [b]'s big-endian signed 16-bit integer
589    starting at character index [i].
590
591    @since 4.13
592*)
593
594val get_int16_le : string -> int -> int
595(** [get_int16_le b i] is [b]'s little-endian signed 16-bit integer
596    starting at character index [i].
597
598    @since 4.13
599*)
600
601val get_int32_ne : string -> int -> int32
602(** [get_int32_ne b i] is [b]'s native-endian 32-bit integer
603    starting at character index [i].
604
605    @since 4.13
606*)
607
608val hash : t -> int
609(** An unseeded hash function for strings, with the same output value as
610    {!Hashtbl.hash}. This function allows this module to be passed as argument
611    to the functor {!Hashtbl.Make}.
612
613    @since 5.0 *)
614
615val seeded_hash : int -> t -> int
616(** A seeded hash function for strings, with the same output value as
617    {!Hashtbl.seeded_hash}. This function allows this module to be passed as
618    argument to the functor {!Hashtbl.MakeSeeded}.
619
620    @since 5.0 *)
621
622val get_int32_be : string -> int -> int32
623(** [get_int32_be b i] is [b]'s big-endian 32-bit integer
624    starting at character index [i].
625
626    @since 4.13
627*)
628
629val get_int32_le : string -> int -> int32
630(** [get_int32_le b i] is [b]'s little-endian 32-bit integer
631    starting at character index [i].
632
633    @since 4.13
634*)
635
636val get_int64_ne : string -> int -> int64
637(** [get_int64_ne b i] is [b]'s native-endian 64-bit integer
638    starting at character index [i].
639
640    @since 4.13
641*)
642
643val get_int64_be : string -> int -> int64
644(** [get_int64_be b i] is [b]'s big-endian 64-bit integer
645    starting at character index [i].
646
647    @since 4.13
648*)
649
650val get_int64_le : string -> int -> int64
651(** [get_int64_le b i] is [b]'s little-endian 64-bit integer
652    starting at character index [i].
653
654    @since 4.13
655*)
656
657(**/**)
658
659(* The following is for system use only. Do not call directly. *)
660
661external unsafe_get : string -> int -> char = "%string_unsafe_get"
662external unsafe_blit :
663  src:string -> src_pos:int -> dst:bytes -> dst_pos:int -> len:int ->
664    unit = "caml_blit_string" [@@noalloc]