lib/punycode.mli at 4825af1dd20dce1f31dcdc8b5c6ea7cf720fc994

Punycode (RFC3492) in OCaml
ocaml-punycode / lib / punycode.mli
at 4825af1dd20dce1f31dcdc8b5c6ea7cf720fc994 267 lines 10 kB view raw
wrap content
  1(*---------------------------------------------------------------------------
  2  Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
  3  SPDX-License-Identifier: ISC
  4 ---------------------------------------------------------------------------*)
  5
  6(** RFC 3492 Punycode: A Bootstring encoding of Unicode for IDNA.
  7
  8    This module implements the Punycode algorithm as specified in
  9    {{:https://datatracker.ietf.org/doc/html/rfc3492}RFC 3492}, providing
 10    encoding and decoding of Unicode strings to/from ASCII-compatible encoding
 11    suitable for use in internationalized domain names.
 12
 13    Punycode is an instance of Bootstring that uses particular parameter values
 14    appropriate for IDNA. See
 15    {{:https://datatracker.ietf.org/doc/html/rfc3492#section-5}RFC 3492 Section
 16     5} for the specific parameter values.
 17
 18    {2 References}
 19    - {{:https://datatracker.ietf.org/doc/html/rfc3492}RFC 3492} - Punycode: A
 20      Bootstring encoding of Unicode for IDNA
 21    - {{:https://datatracker.ietf.org/doc/html/rfc5891}RFC 5891} - IDNA Protocol
 22*)
 23
 24(** {1 Position Tracking} *)
 25
 26type position
 27(** Abstract location in the input, carried by [error] values for reporting.
 28    A position records both a byte offset and a Unicode character index. *)
 29
 30val position_byte_offset : position -> int
 31(** [position_byte_offset pos] is the byte offset of [pos] in the input. *)
 32
 33val position_char_index : position -> int
 34(** [position_char_index pos] is the 0-based Unicode character index of [pos]. *)
 35
 36val pp_position : Format.formatter -> position -> unit
 37(** [pp_position fmt pos] prints [pos] on [fmt] as "byte N, char M". *)
 38
 39(** {1 Error Types} *)
 40
 41type error =
 42  | Overflow of position
 43      (** Arithmetic overflow during encoding or decoding. This can occur with
 44          very long strings or extreme Unicode code point values. See
 45          {{:https://datatracker.ietf.org/doc/html/rfc3492#section-6.4} RFC 3492
 46           Section 6.4} for overflow handling requirements. *)
 47  | Invalid_character of position * Uchar.t
 48      (** A non-basic code point appeared where only basic code points (ASCII,
 49          below 128) are allowed. Per
 50          {{:https://datatracker.ietf.org/doc/html/rfc3492#section-3.1} RFC 3492
 51           Section 3.1}, basic code points must be segregated at the beginning
 52          of the encoded string. *)
 53  | Invalid_digit of position * char
 54      (** An invalid Punycode digit was encountered during decoding. Valid
 55          digits are [a-z] and [A-Z] (values 0-25) and [0-9] (values 26-35). See
 56          {{:https://datatracker.ietf.org/doc/html/rfc3492#section-5} RFC 3492
 57           Section 5} for digit-value mappings. *)
 58  | Unexpected_end of position
 59      (** The input ended in the middle of a delta value during decoding. See
 60          {{:https://datatracker.ietf.org/doc/html/rfc3492#section-6.2} RFC 3492
 61           Section 6.2} for the decoding procedure. *)
 62  | Invalid_utf8 of position  (** Malformed UTF-8 sequence in the input string. *)
 63  | Label_too_long of int
 64      (** The encoded label exceeds 63 bytes (the DNS limit per
 65          {{:https://datatracker.ietf.org/doc/html/rfc1035}RFC 1035}). The
 66          [int] is the actual length in bytes. *)
 67  | Empty_label  (** An empty label was given; it is not valid for encoding. *)
 68
 69val pp_error : Format.formatter -> error -> unit
 70(** [pp_error fmt e] prints [e] on [fmt], with any position information. *)
 71
 72val error_to_string : error -> string
 73(** [error_to_string e] is a human-readable description of [e]. *)
 74
 75(** {1 Constants}
 76
 77    Punycode parameters as specified in
 78    {{:https://datatracker.ietf.org/doc/html/rfc3492#section-5}RFC 3492 Section
 79     5}. *)
 80
 81val ace_prefix : string
 82(** The ACE prefix ["xn--"] used for Punycode-encoded domain labels. See
 83    {{:https://datatracker.ietf.org/doc/html/rfc3492#section-5} RFC 3492 Section
 84     5}, which notes that IDNA prepends this prefix. *)
 85
 86val max_label_length : int
 87(** Maximum length of a domain label in bytes ([63]), as set by
 88    {{:https://datatracker.ietf.org/doc/html/rfc1035}RFC 1035}. *)
 89
 90(** {1 Case Flags for Mixed-Case Annotation}
 91
 92    {{:https://datatracker.ietf.org/doc/html/rfc3492#appendix-A}RFC 3492
 93     Appendix A} describes an optional mechanism for preserving case information
 94    through the encoding/decoding round-trip. This is useful when the original
 95    string's case should be recoverable.
 96
 97    Note: Mixed-case annotation is not used by the ToASCII and ToUnicode
 98    operations of IDNA. *)
 99
100type case_flag =
101  | Uppercase  (** Case annotation: the character is suggested uppercase. *)
102  | Lowercase  (** Case annotation: the character is suggested lowercase. *)
103
104(** {1 Core Punycode Operations}
105
106    These functions implement the Bootstring algorithms from
107    {{:https://datatracker.ietf.org/doc/html/rfc3492#section-6}RFC 3492 Section
108     6}. They operate on arrays of Unicode code points ([Uchar.t array]). The
109    encoded output is a plain ASCII string without the ACE prefix. *)
110
111val encode : Uchar.t array -> (string, error) result
112(** [encode codepoints] encodes an array of Unicode code points to a Punycode
113    ASCII string.
114
115    Implements the encoding procedure from
116    {{:https://datatracker.ietf.org/doc/html/rfc3492#section-6.3}RFC 3492
117     Section 6.3}:
118
119    + Basic code points (ASCII < 128) are copied literally to the start of the
120      output, per {{:https://datatracker.ietf.org/doc/html/rfc3492#section-3.1}
121      Section 3.1 (Basic code point segregation)}.
122    + A delimiter (['-']) is appended if there are any basic code points.
123    + Non-basic code points are encoded as deltas using the generalized
124      variable-length integer representation from
125      {{:https://datatracker.ietf.org/doc/html/rfc3492#section-3.3}Section 3.3}.
126
127    Example:
128    {[
129      encode [| Uchar.of_int 0x4ED6; Uchar.of_int 0x4EEC; ... |]
130      (* = Ok "ihqwcrb4cv8a8dqg056pqjye" *)
131    ]} *)
132
133val decode : string -> (Uchar.t array, error) result
134(** [decode punycode] decodes the Punycode ASCII string [punycode] into an
135    array of Unicode code points.
136
137    Implements the decoding procedure from
138    {{:https://datatracker.ietf.org/doc/html/rfc3492#section-6.2}RFC 3492
139     Section 6.2}.
140
141    The input should be the Punycode portion only, without the ACE prefix. The
142    decoder is case-insensitive for the encoded portion, as required by
143    {{:https://datatracker.ietf.org/doc/html/rfc3492#section-5}RFC 3492 Section
144     5}: "A decoder MUST recognize the letters in both uppercase and lowercase
145    forms".
146
147    Example:
148    {[
149      decode "ihqwcrb4cv8a8dqg056pqjye"
150      (* = Ok [| U+4ED6; U+4EEC; U+4E3A; ... |] (Chinese simplified) *)
151    ]} *)
152
153(** {1 Mixed-Case Annotation}
154
155    These functions support round-trip case preservation as described in
156    {{:https://datatracker.ietf.org/doc/html/rfc3492#appendix-A}RFC 3492
157     Appendix A}. *)
158
159val encode_with_case :
160  Uchar.t array -> case_flag array -> (string, error) result
161(** [encode_with_case codepoints case_flags] encodes with case annotation.
162
163    Per
164    {{:https://datatracker.ietf.org/doc/html/rfc3492#appendix-A}RFC 3492
165     Appendix A}:
166    - For basic (ASCII) letters, the output preserves the case flag directly
167    - For non-ASCII characters, the case of the final digit in each delta
168      encoding indicates the flag (uppercase = suggested uppercase)
169
170    [case_flags] must have exactly one entry per element of [codepoints].
171
172    @raise Invalid_argument if the two arrays differ in length. *)
173
174val decode_with_case : string -> (Uchar.t array * case_flag array, error) result
175(** [decode_with_case punycode] decodes [punycode] and its case annotations.
176
177    Per
178    {{:https://datatracker.ietf.org/doc/html/rfc3492#appendix-A}RFC 3492
179     Appendix A}, returns both the decoded code points and an array of case
180    flags giving the suggested case of each character, taken from the
181    uppercase/lowercase form of the encoding digits. *)
182
183(** {1 UTF-8 String Operations}
184
185    Convenience functions that work directly with UTF-8 encoded OCaml strings.
186    These combine UTF-8 decoding/encoding with the core Punycode operations. *)
187
188val encode_utf8 : string -> (string, error) result
189(** [encode_utf8 s] encodes the UTF-8 string [s] to Punycode (no ACE prefix).
190
191    This is equivalent to decoding [s] from UTF-8 to code points, then calling
192    {!encode} on them.
193
194    Example:
195    {[
196      encode_utf8 "münchen"
197      (* = Ok "mnchen-3ya" *)
198    ]} *)
199
200val decode_utf8 : string -> (string, error) result
201(** [decode_utf8 punycode] decodes Punycode (no ACE prefix) to a UTF-8 string.
202
203    This is equivalent to calling {!decode} then encoding the result as UTF-8.
204
205    Example:
206    {[
207      decode_utf8 "mnchen-3ya"
208      (* = Ok "münchen" *)
209    ]} *)
210
211(** {1 Domain Label Operations}
212
213    These functions handle the ACE prefix automatically and enforce DNS label
214    length limits per
215    {{:https://datatracker.ietf.org/doc/html/rfc1035}RFC 1035}. *)
216
217val encode_label : string -> (string, error) result
218(** [encode_label label] encodes a domain label for use in DNS.
219
220    If the label contains only ASCII characters, it is returned unchanged.
221    Otherwise, it is Punycode-encoded with the ACE prefix ("xn--") prepended, as
222    specified in
223    {{:https://datatracker.ietf.org/doc/html/rfc3492#section-5} RFC 3492 Section
224     5}.
225
226    Returns [Error (Label_too_long n)] if the result is [n] > 63 bytes long.
227
228    Example:
229    {[
230      encode_label "münchen";;
231      (* = Ok "xn--mnchen-3ya" *)
232      encode_label "example";;
233      (* = Ok "example" *)
234    ]} *)
235
236val decode_label : string -> (string, error) result
237(** [decode_label label] decodes a domain label.
238
239    If the label starts with the ACE prefix ("xn--", case-insensitive), the
240    part after the prefix is Punycode-decoded. Otherwise, it is returned unchanged.
241
242    Example:
243    {[
244      decode_label "xn--mnchen-3ya";;
245      (* = Ok "münchen" *)
246      decode_label "example";;
247      (* = Ok "example" *)
248    ]} *)
249
250(** {1 Validation}
251
252    Predicate functions for checking code point and string properties. *)
253
254val is_basic : Uchar.t -> bool
255(** [is_basic u] is [true] iff [u] is a basic code point, i.e. ASCII (< 128).
256
257    Per
258    {{:https://datatracker.ietf.org/doc/html/rfc3492#section-5}RFC 3492 Section
259     5}, basic code points for Punycode are the ASCII code points (0..7F). *)
260
261val is_ascii_string : string -> bool
262(** [is_ascii_string s] is [true] iff [s] contains only ASCII characters, i.e.
263    every byte of [s] is < 128. *)
264
265val has_ace_prefix : string -> bool
266(** [has_ace_prefix s] is [true] iff [s] starts with {!ace_prefix} ("xn--"),
267    compared case-insensitively. *)