My working unpac repository
at opam/upstream/seq 664 lines 22 kB view raw
1(**************************************************************************) 2(* *) 3(* OCaml *) 4(* *) 5(* Xavier Leroy, projet Cristal, INRIA Rocquencourt *) 6(* *) 7(* Copyright 1996 Institut National de Recherche en Informatique et *) 8(* en Automatique. *) 9(* *) 10(* All rights reserved. This file is distributed under the terms of *) 11(* the GNU Lesser General Public License version 2.1, with the *) 12(* special exception on linking described in the file LICENSE. *) 13(* *) 14(**************************************************************************) 15 16(* NOTE: 17 If this file is stringLabels.mli, run tools/sync_stdlib_docs after editing 18 it to generate string.mli. 19 20 If this file is string.mli, do not edit it directly -- edit 21 stringLabels.mli instead. 22 *) 23 24(** Strings. 25 26 A string [s] of length [n] is an indexable and immutable sequence 27 of [n] bytes. For historical reasons these bytes are referred to 28 as characters. 29 30 The semantics of string functions is defined in terms of 31 indices and positions. These are depicted and described 32 as follows. 33 34{v 35positions 0 1 2 3 4 n-1 n 36 +---+---+---+---+ +-----+ 37 indices | 0 | 1 | 2 | 3 | ... | n-1 | 38 +---+---+---+---+ +-----+ 39v} 40 {ul 41 {- An {e index} [i] of [s] is an integer in the range \[[0];[n-1]\]. 42 It represents the [i]th byte (character) of [s] which can be 43 accessed using the constant time string indexing operator 44 [s.[i]].} 45 {- A {e position} [i] of [s] is an integer in the range 46 \[[0];[n]\]. It represents either the point at the beginning of 47 the string, or the point between two indices, or the point at 48 the end of the string. The [i]th byte index is between position 49 [i] and [i+1].}} 50 51 Two integers [start] and [len] are said to define a {e valid 52 substring} of [s] if [len >= 0] and [start], [start+len] are 53 positions of [s]. 
54 55 {b Unicode text.} Strings being arbitrary sequences of bytes, they 56 can hold any kind of textual encoding. However the recommended 57 encoding for storing Unicode text in OCaml strings is UTF-8. This 58 is the encoding used by Unicode escapes in string literals. For 59 example the string ["\u{1F42B}"] is the UTF-8 encoding of the 60 Unicode character U+1F42B. 61 62 {b Past mutability.} Before OCaml 4.02, strings used to be modifiable in 63 place like {!Bytes.t} mutable sequences of bytes. 64 OCaml 4 had various compiler flags and configuration options to support the 65 transition period from mutable to immutable strings. 66 Those options are no longer available, and strings are now always 67 immutable. 68 69 The labeled version of this module can be used as described in the 70 {!StdLabels} module. 71*) 72 73(** {1:strings Strings} *) 74 75type t = string 76(** The type for strings. *) 77 78val make : int -> char -> string 79(** [make n c] is a string of length [n] with each index holding the 80 character [c]. 81 82 @raise Invalid_argument if [n < 0] or [n > ]{!Sys.max_string_length}. *) 83 84val init : int -> f:(int -> char) -> string 85(** [init n ~f] is a string of length [n] with index 86 [i] holding the character [f i] (called in increasing index order). 87 88 @raise Invalid_argument if [n < 0] or [n > ]{!Sys.max_string_length}. 89 @since 4.02 *) 90 91val empty : string 92(** The empty string. 93 94 @since 4.13 95*) 96 97external length : string -> int = "%string_length" 98(** [length s] is the length (number of bytes/characters) of [s]. *) 99 100external get : string -> int -> char = "%string_safe_get" 101(** [get s i] is the character at index [i] in [s]. This is the same 102 as writing [s.[i]]. 103 104 @raise Invalid_argument if [i] is not an index of [s]. *) 105 106val of_bytes : bytes -> string 107(** Return a new string that contains the same bytes as the given byte 108 sequence. 
109 110 @since 4.13 111*) 112 113val to_bytes : string -> bytes 114(** Return a new byte sequence that contains the same bytes as the given 115 string. 116 117 @since 4.13 118*) 119 120val blit : 121 src:string -> src_pos:int -> dst:bytes -> dst_pos:int -> len:int -> unit 122(** Same as {!Bytes.blit_string} which should be preferred. *) 123 124(** {1:concat Concatenating} 125 126 {b Note.} The {!Stdlib.( ^ )} binary operator concatenates two 127 strings. *) 128 129val concat : sep:string -> string list -> string 130(** [concat ~sep ss] concatenates the list of strings [ss], inserting 131 the separator string [sep] between each. 132 133 @raise Invalid_argument if the result is longer than 134 {!Sys.max_string_length} bytes. *) 135 136val cat : string -> string -> string 137(** [cat s1 s2] concatenates s1 and s2 ([s1 ^ s2]). 138 139 @raise Invalid_argument if the result is longer than 140 {!Sys.max_string_length} bytes. 141 142 @since 4.13 143*) 144 145(** {1:predicates Predicates and comparisons} *) 146 147val equal : t -> t -> bool 148(** [equal s0 s1] is [true] if and only if [s0] and [s1] are character-wise 149 equal. 150 @since 4.03 (4.05 in StringLabels) *) 151 152val compare : t -> t -> int 153(** [compare s0 s1] sorts [s0] and [s1] in lexicographical order. [compare] 154 behaves like {!Stdlib.compare} on strings but may be more efficient. *) 155 156val starts_with : 157 prefix (* comment thwarts tools/sync_stdlib_docs *) :string -> string -> bool 158(** [starts_with ][~prefix s] is [true] if and only if [s] starts with 159 [prefix]. 160 161 @since 4.13 *) 162 163val ends_with : 164 suffix (* comment thwarts tools/sync_stdlib_docs *) :string -> string -> bool 165(** [ends_with ][~suffix s] is [true] if and only if [s] ends with [suffix]. 166 167 @since 4.13 *) 168 169val contains_from : string -> int -> char -> bool 170(** [contains_from s start c] is [true] if and only if [c] appears in [s] 171 after position [start]. 
172 173 @raise Invalid_argument if [start] is not a valid position in [s]. *) 174 175val rcontains_from : string -> int -> char -> bool 176(** [rcontains_from s stop c] is [true] if and only if [c] appears in [s] 177 before position [stop+1]. 178 179 @raise Invalid_argument if [stop < 0] or [stop+1] is not a valid 180 position in [s]. *) 181 182val contains : string -> char -> bool 183(** [contains s c] is {!String.contains_from}[ s 0 c]. *) 184 185(** {1:extract Extracting substrings} *) 186 187val sub : string -> pos:int -> len:int -> string 188(** [sub s ~pos ~len] is a string of length [len], containing the 189 substring of [s] that starts at position [pos] and has length 190 [len]. 191 192 @raise Invalid_argument if [pos] and [len] do not designate a valid 193 substring of [s]. *) 194 195(** {1:splitting Splitting strings} *) 196 197(** {2:splitting_mag Splitting with magnitudes} *) 198 199val take_first : int -> string -> string 200(** [take_first n s] are the first [n] bytes of [s]. This is [s] if 201 [n >= length s] and [""] if [n <= 0]. 202 203 @since 5.5 *) 204 205val take_last : int -> string -> string 206(** [take_last n s] are the last [n] bytes of [s]. This is [s] if 207 [n >= length s] and [""] if [n <= 0]. 208 209 @since 5.5 *) 210 211val drop_first : int -> string -> string 212(** [drop_first n s] is [s] without the first [n] bytes of [s]. This is [""] 213 if [n >= length s] and [s] if [n <= 0]. 214 215 @since 5.5 *) 216 217val drop_last : int -> string -> string 218(** [drop_last n s] is [s] without the last [n] bytes of [s]. This is [""] 219 if [n >= length s] and [s] if [n <= 0]. 220 221 @since 5.5 *) 222 223val cut_first : int -> string -> string * string 224(** [cut_first n v] is [(take_first n v, drop_first n v)]. 225 226 @since 5.5 *) 227 228val cut_last : int -> string -> string * string 229(** [cut_last n v] is [(drop_last n v, take_last n v)]. 
230 231 @since 5.5 *) 232 233(** {2:splitting_preds Splitting with predicates} *) 234 235val take_first_while : (char -> bool) -> string -> string 236(** [take_first_while p s] is the first consecutive bytes of [s] 237 satisfying the predicate [p]. 238 239 @since 5.5 *) 240 241val take_last_while : (char -> bool) -> string -> string 242(** [take_last_while p s] is the last consecutive bytes of [s] 243 satisfying the predicate [p]. 244 245 @since 5.5 *) 246 247val drop_first_while : (char -> bool) -> string -> string 248(** [drop_first_while p s] is [s] without the first consecutive bytes of [s] 249 satisfying the predicate [p]. 250 251 @since 5.5 *) 252 253val drop_last_while : (char -> bool) -> string -> string 254(** [drop_last_while p s] is [s] without the last consecutive bytes of [s] 255 satisfying the predicate [p]. 256 257 @since 5.5 *) 258 259val cut_first_while : (char -> bool) -> string -> string * string 260(** [cut_first_while p s] is 261 [(take_first_while p s, drop_first_while p s)]. 262 263 @since 5.5 *) 264 265val cut_last_while : (char -> bool) -> string -> string * string 266(** [cut_last_while p s] is 267 [(drop_last_while p s, take_last_while p s)]. 268 269 @since 5.5 *) 270 271(** {2:splitting_sep Splitting with separators} *) 272 273val split_on_char : sep:char -> string -> string list 274(** [split_on_char ~sep s] is the list of all (possibly empty) 275 substrings of [s] that are delimited by the character [sep]. 276 If [s] is empty, the result is the singleton list [[""]]. 
277 278 The function's result is specified by the following invariants: 279 {ul 280 {- The list is not empty.} 281 {- Concatenating its elements using [sep] as a separator returns a 282 string equal to the input ([concat ~sep:(make 1 sep) 283 (split_on_char ~sep s) = s]).} 284 {- No string in the result contains the [sep] character.}} 285 286 @since 4.04 (4.05 in StringLabels) *) 287 288(** {1:transforming Transforming} *) 289 290val map : f:(char -> char) -> string -> string 291(** [map ~f s] is the string resulting from applying [f] to all the 292 characters of [s] in increasing order. 293 294 @since 4.00 *) 295 296val mapi : f:(int -> char -> char) -> string -> string 297(** [mapi ~f s] is like {!map} but the index of the character is also 298 passed to [f]. 299 300 @since 4.02 *) 301 302val fold_left : f:('acc -> char -> 'acc) -> init:'acc -> string -> 'acc 303(** [fold_left ~f ~init s] computes [f (... (f (f init s.[0]) s.[1]) ...) s.[n-1]], 304 where [n] is the length of the string [s]. 305 @since 4.13 *) 306 307val fold_right : f:(char -> 'acc -> 'acc) -> string -> init:'acc -> 'acc 308(** [fold_right ~f s ~init] computes [f s.[0] (f s.[1] ( ... (f s.[n-1] init) ...))], 309 where [n] is the length of the string [s]. 310 @since 4.13 *) 311 312val for_all : f:(char -> bool) -> string -> bool 313(** [for_all ~f s] checks if all characters in [s] satisfy the predicate [f]. 314 @since 4.13 *) 315 316val exists : f:(char -> bool) -> string -> bool 317(** [exists ~f s] checks if at least one character of [s] satisfies the predicate 318 [f]. 319 @since 4.13 *) 320 321val trim : string -> string 322(** [trim s] is [s] without leading and trailing whitespace. Whitespace 323 characters are: [' '], ['\x0C'] (form feed), ['\n'], ['\r'], and ['\t']. 324 325 @since 4.00 *) 326 327val escaped : string -> string 328(** [escaped s] is [s] with special characters represented by escape 329 sequences, following the lexical conventions of OCaml. 
330 331 All characters outside the US-ASCII printable range \[0x20;0x7E\] are 332 escaped, as well as backslash (0x5C) and double-quote (0x22). 333 334 The function {!Scanf.unescaped} is a left inverse of [escaped], 335 i.e. [Scanf.unescaped (escaped s) = s] for any string [s] (unless 336 [escaped s] fails). 337 338 @raise Invalid_argument if the result is longer than 339 {!Sys.max_string_length} bytes. *) 340 341val uppercase_ascii : string -> string 342(** [uppercase_ascii s] is [s] with all lowercase letters 343 translated to uppercase, using the US-ASCII character set. 344 345 @since 4.03 (4.05 in StringLabels) *) 346 347val lowercase_ascii : string -> string 348(** [lowercase_ascii s] is [s] with all uppercase letters translated 349 to lowercase, using the US-ASCII character set. 350 351 @since 4.03 (4.05 in StringLabels) *) 352 353val capitalize_ascii : string -> string 354(** [capitalize_ascii s] is [s] with the first character set to 355 uppercase, using the US-ASCII character set. 356 357 @since 4.03 (4.05 in StringLabels) *) 358 359val uncapitalize_ascii : string -> string 360(** [uncapitalize_ascii s] is [s] with the first character set to lowercase, 361 using the US-ASCII character set. 362 363 @since 4.03 (4.05 in StringLabels) *) 364 365(** {1:traversing Traversing} *) 366 367val iter : f:(char -> unit) -> string -> unit 368(** [iter ~f s] applies function [f] in turn to all the characters of [s]. 369 It is equivalent to [f s.[0]; f s.[1]; ...; f s.[length s - 1]; ()]. *) 370 371val iteri : f:(int -> char -> unit) -> string -> unit 372(** [iteri] is like {!iter}, but the function is also given the 373 corresponding character index. 374 375 @since 4.00 *) 376 377(** {1:searching Searching} *) 378 379val index_from : string -> int -> char -> int 380(** [index_from s i c] is the index of the first occurrence of [c] in 381 [s] after position [i]. 382 383 @raise Not_found if [c] does not occur in [s] after position [i]. 
384 @raise Invalid_argument if [i] is not a valid position in [s]. *) 385 386 387val index_from_opt : string -> int -> char -> int option 388(** [index_from_opt s i c] is the index of the first occurrence of [c] 389 in [s] after position [i] (if any). 390 391 @raise Invalid_argument if [i] is not a valid position in [s]. 392 @since 4.05 *) 393 394val rindex_from : string -> int -> char -> int 395(** [rindex_from s i c] is the index of the last occurrence of [c] in 396 [s] before position [i+1]. 397 398 @raise Not_found if [c] does not occur in [s] before position [i+1]. 399 @raise Invalid_argument if [i+1] is not a valid position in [s]. *) 400 401val rindex_from_opt : string -> int -> char -> int option 402(** [rindex_from_opt s i c] is the index of the last occurrence of [c] 403 in [s] before position [i+1] (if any). 404 405 @raise Invalid_argument if [i+1] is not a valid position in [s]. 406 @since 4.05 *) 407 408val index : string -> char -> int 409(** [index s c] is {!String.index_from}[ s 0 c]. *) 410 411val index_opt : string -> char -> int option 412(** [index_opt s c] is {!String.index_from_opt}[ s 0 c]. 413 414 @since 4.05 *) 415 416val rindex : string -> char -> int 417(** [rindex s c] is {!String.rindex_from}[ s (length s - 1) c]. *) 418 419val rindex_opt : string -> char -> int option 420(** [rindex_opt s c] is {!String.rindex_from_opt}[ s (length s - 1) c]. 421 422 @since 4.05 *) 423 424(** {1 Strings and Sequences} *) 425 426val to_seq : t -> char Seq.t 427(** [to_seq s] is a sequence made of the string's characters in 428 increasing order. 429 430 @since 4.07 *) 431 432val to_seqi : t -> (int * char) Seq.t 433(** [to_seqi s] is like {!to_seq} but also tuples the corresponding index. 434 435 @since 4.07 *) 436 437val of_seq : char Seq.t -> t 438(** [of_seq s] is a string made of the sequence's characters. 
439 440 @since 4.07 *) 441 442(** {1:utf UTF decoding and validations} 443 444 @since 4.14 *) 445 446(** {2:utf_8 UTF-8} *) 447 448val get_utf_8_uchar : t -> int -> Uchar.utf_decode 449(** [get_utf_8_uchar b i] decodes an UTF-8 character at index [i] in 450 [b]. *) 451 452val is_valid_utf_8 : t -> bool 453(** [is_valid_utf_8 b] is [true] if and only if [b] contains valid 454 UTF-8 data. *) 455 456(** {2:utf_16be UTF-16BE} *) 457 458val get_utf_16be_uchar : t -> int -> Uchar.utf_decode 459(** [get_utf_16be_uchar b i] decodes an UTF-16BE character at index 460 [i] in [b]. *) 461 462val is_valid_utf_16be : t -> bool 463(** [is_valid_utf_16be b] is [true] if and only if [b] contains valid 464 UTF-16BE data. *) 465 466(** {2:utf_16le UTF-16LE} *) 467 468val get_utf_16le_uchar : t -> int -> Uchar.utf_decode 469(** [get_utf_16le_uchar b i] decodes an UTF-16LE character at index 470 [i] in [b]. *) 471 472val is_valid_utf_16le : t -> bool 473(** [is_valid_utf_16le b] is [true] if and only if [b] contains valid 474 UTF-16LE data. *) 475 476(** {1:spellchecking Spellchecking} *) 477 478val edit_distance : ?limit:int -> t -> t -> int 479(** [edit_distance s0 s1] is the number of single character edits 480 (understood as insertion, deletion, substitution, transposition) 481 that are needed to change [s0] into [s1]. 482 483 If [limit] is provided the function returns with [limit] as soon 484 as it was determined that [s0] and [s1] have distance of at least 485 [limit]. This is faster if you have a fixed limit, for example for 486 spellchecking. 487 488 The function assumes the strings are UTF-8 encoded and uses {!Uchar.t} 489 for the notion of character. Decoding errors are replaced by 490 {!Uchar.rep}. Normalizing the strings to 491 {{:https://unicode.org/glossary/#normalization_form_c}NFC} gives 492 better results. 493 494 {b Note.} This implements the simpler Optimal String Alignment (OSA) 495 distance, not the Damerau-Levenshtein distance. 
With this function 496 ["ca"] and ["abc"] have a distance of 3 not 2. 497 498 @since 5.4 499*) 500 501val spellcheck : 502 ?max_dist:(string -> int) -> ((string -> unit) -> unit) -> string -> 503 string list 504(** [spellcheck iter_dict s] are the strings enumerated by the 505 iterator [iter_dict] whose {{!edit_distance}edit distance} to [s] 506 is the smallest and at most [max_dist s]. If multiple corrections 507 are returned their order is as found in [iter_dict]. The default 508 [max_dist s] is: 509 510 {ul 511 {- [0] if [s] has 0 to 2 Unicode characters.} 512 {- [1] if [s] has 3 to 4 Unicode characters.} 513 {- [2] otherwise.}} 514 515 If your dictionary is a list [l], a suitable [iter_dict] is given 516 by [(fun yield -> List.iter yield l)]. 517 518 All strings are assumed to be UTF-8 encoded; decoding 519 errors are replaced by {!Uchar.rep} characters. 520 521 @since 5.4 *) 522 523(** {1 Binary decoding of integers} *) 524 525(** The functions in this section binary decode integers from strings. 526 527 All following functions raise [Invalid_argument] if the characters 528 needed at index [i] to decode the integer are not available. 529 530 Little-endian (resp. big-endian) encoding means that least 531 (resp. most) significant bytes are stored first. Big-endian is 532 also known as network byte order. Native-endian encoding is 533 either little-endian or big-endian depending on {!Sys.big_endian}. 534 535 32-bit and 64-bit integers are represented by the [int32] and 536 [int64] types, which can be interpreted either as signed or 537 unsigned numbers. 538 539 8-bit and 16-bit integers are represented by the [int] type, 540 which has more bits than the binary encoding. These extra bits 541 are sign-extended (or zero-extended) for functions which decode 8-bit 542 or 16-bit integers and represent them with [int] values. 543*) 544 545val get_uint8 : string -> int -> int 546(** [get_uint8 b i] is [b]'s unsigned 8-bit integer starting at character 547 index [i]. 
548 549 @since 4.13 550*) 551 552val get_int8 : string -> int -> int 553(** [get_int8 b i] is [b]'s signed 8-bit integer starting at character 554 index [i]. 555 556 @since 4.13 557*) 558 559val get_uint16_ne : string -> int -> int 560(** [get_uint16_ne b i] is [b]'s native-endian unsigned 16-bit integer 561 starting at character index [i]. 562 563 @since 4.13 564*) 565 566val get_uint16_be : string -> int -> int 567(** [get_uint16_be b i] is [b]'s big-endian unsigned 16-bit integer 568 starting at character index [i]. 569 570 @since 4.13 571*) 572 573val get_uint16_le : string -> int -> int 574(** [get_uint16_le b i] is [b]'s little-endian unsigned 16-bit integer 575 starting at character index [i]. 576 577 @since 4.13 578*) 579 580val get_int16_ne : string -> int -> int 581(** [get_int16_ne b i] is [b]'s native-endian signed 16-bit integer 582 starting at character index [i]. 583 584 @since 4.13 585*) 586 587val get_int16_be : string -> int -> int 588(** [get_int16_be b i] is [b]'s big-endian signed 16-bit integer 589 starting at character index [i]. 590 591 @since 4.13 592*) 593 594val get_int16_le : string -> int -> int 595(** [get_int16_le b i] is [b]'s little-endian signed 16-bit integer 596 starting at character index [i]. 597 598 @since 4.13 599*) 600 601val get_int32_ne : string -> int -> int32 602(** [get_int32_ne b i] is [b]'s native-endian 32-bit integer 603 starting at character index [i]. 604 605 @since 4.13 606*) 607 608val hash : t -> int 609(** An unseeded hash function for strings, with the same output value as 610 {!Hashtbl.hash}. This function allows this module to be passed as argument 611 to the functor {!Hashtbl.Make}. 612 613 @since 5.0 *) 614 615val seeded_hash : int -> t -> int 616(** A seeded hash function for strings, with the same output value as 617 {!Hashtbl.seeded_hash}. This function allows this module to be passed as 618 argument to the functor {!Hashtbl.MakeSeeded}. 
619 620 @since 5.0 *) 621 622val get_int32_be : string -> int -> int32 623(** [get_int32_be b i] is [b]'s big-endian 32-bit integer 624 starting at character index [i]. 625 626 @since 4.13 627*) 628 629val get_int32_le : string -> int -> int32 630(** [get_int32_le b i] is [b]'s little-endian 32-bit integer 631 starting at character index [i]. 632 633 @since 4.13 634*) 635 636val get_int64_ne : string -> int -> int64 637(** [get_int64_ne b i] is [b]'s native-endian 64-bit integer 638 starting at character index [i]. 639 640 @since 4.13 641*) 642 643val get_int64_be : string -> int -> int64 644(** [get_int64_be b i] is [b]'s big-endian 64-bit integer 645 starting at character index [i]. 646 647 @since 4.13 648*) 649 650val get_int64_le : string -> int -> int64 651(** [get_int64_le b i] is [b]'s little-endian 64-bit integer 652 starting at character index [i]. 653 654 @since 4.13 655*) 656 657(**/**) 658 659(* The following is for system use only. Do not call directly. *) 660 661external unsafe_get : string -> int -> char = "%string_unsafe_get" 662external unsafe_blit : 663 src:string -> src_pos:int -> dst:bytes -> dst_pos:int -> len:int -> 664 unit = "caml_blit_string" [@@noalloc]