(* Punycode (RFC3492) in OCaml *)
(*---------------------------------------------------------------------------
   Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
   SPDX-License-Identifier: ISC
  ---------------------------------------------------------------------------*)

(** IDNA (Internationalized Domain Names in Applications) support.

    This module provides ToASCII and ToUnicode operations as specified in
    {{:https://datatracker.ietf.org/doc/html/rfc5891}RFC 5891} (IDNA 2008),
    using Punycode ({{:https://datatracker.ietf.org/doc/html/rfc3492}RFC 3492})
    for encoding.

    IDNA allows domain names to contain non-ASCII Unicode characters by encoding
    them using Punycode with an ACE prefix. This module handles the conversion
    between Unicode domain names and their ASCII-compatible encoding (ACE) form.

    {2 References}
    - {{:https://datatracker.ietf.org/doc/html/rfc5891}RFC 5891} -
      Internationalized Domain Names in Applications (IDNA): Protocol
    - {{:https://datatracker.ietf.org/doc/html/rfc5892}RFC 5892} - The Unicode
      Code Points and Internationalized Domain Names for Applications (IDNA)
    - {{:https://datatracker.ietf.org/doc/html/rfc5893}RFC 5893} - Right-to-Left
      Scripts for Internationalized Domain Names for Applications (IDNA)
    - {{:https://datatracker.ietf.org/doc/html/rfc3492}RFC 3492} - Punycode: A
      Bootstring encoding of Unicode for IDNA *)

(** {1 Error Types} *)

(** Reason carried by {!Error} when an IDNA operation fails. *)
type error_reason =
  | Punycode_error of Punycode.error_reason
      (** Error during Punycode encoding/decoding. See
          {!Punycode.error_reason} for details. *)
  | Invalid_label of string
      (** Label violates IDNA constraints. The string describes the violation.
          See
          {{:https://datatracker.ietf.org/doc/html/rfc5891#section-4} RFC 5891
           Section 4} for label validation requirements. *)
  | Domain_too_long of int
      (** Domain name exceeds {!max_domain_length} (253) bytes, per
          {{:https://datatracker.ietf.org/doc/html/rfc1035}RFC 1035}. The int
          is the actual length. *)
  | Normalization_failed
      (** Unicode normalization (NFC) failed. Per
          {{:https://datatracker.ietf.org/doc/html/rfc5891#section-4.2.1} RFC
           5891 Section 4.2.1}, labels must be in NFC form. *)
  | Verification_failed
      (** ToASCII/ToUnicode verification step failed (round-trip check). Per
          {{:https://datatracker.ietf.org/doc/html/rfc5891#section-4.2} RFC 5891
           Section 4.2}, the result of encoding must decode back to the
          original input. *)

exception Error of error_reason
(** Exception raised for all IDNA processing errors. The payload gives the
    reason; render it with {!pp_error_reason} or {!error_reason_to_string}. *)

val pp_error_reason : Format.formatter -> error_reason -> unit
(** [pp_error_reason fmt e] pretty-prints the error reason [e] on the formatter
    [fmt]. Its type makes it usable with the [%a] conversion of the [Format]
    printing functions. *)

val error_reason_to_string : error_reason -> string
(** [error_reason_to_string e] converts the error reason [e] to a
    human-readable string. See also {!pp_error_reason}. *)

(** {1 Constants} *)

val max_domain_length : int
(** Maximum length of a domain name in bytes (253), per
    {{:https://datatracker.ietf.org/doc/html/rfc1035}RFC 1035}. See also the
    [Domain_too_long] case of {!error_reason}. *)

(** {1 ToASCII Operation}

    Converts an internationalized domain name to its ASCII-compatible encoding
    (ACE) form suitable for DNS lookup.

    See
    {{:https://datatracker.ietf.org/doc/html/rfc5891#section-4} RFC 5891 Section
     4} for the complete ToASCII specification. *)

val to_ascii :
  ?check_hyphens:bool ->
  ?check_bidi:bool ->
  ?check_joiners:bool ->
  ?use_std3_rules:bool ->
  ?transitional:bool ->
  string ->
  string
(** [to_ascii domain] converts an internationalized domain name to ASCII.

    Implements the ToASCII operation from
    {{:https://datatracker.ietf.org/doc/html/rfc5891#section-4.1}RFC 5891
     Section 4.1}.

    For each label in the domain:
    {ol
     {- If all ASCII, pass through (with optional STD3 validation).}
     {- Otherwise, normalize to NFC per
        {{:https://datatracker.ietf.org/doc/html/rfc5891#section-4.2.1}Section
         4.2.1} and Punycode-encode with ACE prefix.}}

    Optional parameters (per
    {{:https://datatracker.ietf.org/doc/html/rfc5891#section-4} RFC 5891 Section
     4} processing options):
    - [check_hyphens]: Validate hyphen placement per
      {{:https://datatracker.ietf.org/doc/html/rfc5891#section-4.2.3.1}Section
       4.2.3.1} (default: true)
    - [check_bidi]: Check bidirectional text rules per
      {{:https://datatracker.ietf.org/doc/html/rfc5893}RFC 5893} (default:
      false, not implemented)
    - [check_joiners]: Check contextual joiner rules per
      {{:https://datatracker.ietf.org/doc/html/rfc5892#appendix-A.1}RFC 5892
       Appendix A.1} (default: false, not implemented)
    - [use_std3_rules]: Apply STD3 hostname rules per
      {{:https://datatracker.ietf.org/doc/html/rfc5891#section-4.2.3.2}Section
       4.2.3.2} (default: false)
    - [transitional]: Use IDNA 2003 transitional processing (default: false)

    @raise Error on conversion failure.

    Example:
    {[
      to_ascii "münchen.example.com"
      (* = "xn--mnchen-3ya.example.com" *)
    ]} *)

val label_to_ascii :
  ?check_hyphens:bool -> ?use_std3_rules:bool -> string -> string
(** [label_to_ascii label] converts a single label to ASCII.

    This implements the core ToASCII operation for one label, as described in
    {{:https://datatracker.ietf.org/doc/html/rfc5891#section-4.1}RFC 5891
     Section 4.1}.

    The optional arguments [check_hyphens] and [use_std3_rules] are named after
    the corresponding options of {!to_ascii}; see its documentation for their
    description.

    @raise Error on conversion failure. *)

(** {1 ToUnicode Operation}

    Converts an ASCII-compatible encoded domain name back to Unicode.

    See
    {{:https://datatracker.ietf.org/doc/html/rfc5891#section-4.2} RFC 5891
     Section 4.2} for the complete ToUnicode specification. *)

val to_unicode : string -> string
(** [to_unicode domain] converts an ACE domain name to Unicode.

    Implements the ToUnicode operation from
    {{:https://datatracker.ietf.org/doc/html/rfc5891#section-4.2}RFC 5891
     Section 4.2}.

    For each label in the domain:
    {ol
     {- If it has the ACE prefix ("xn--"), Punycode-decode it per
        {{:https://datatracker.ietf.org/doc/html/rfc3492#section-6.2}RFC 3492
         Section 6.2}.}
     {- Otherwise, pass through unchanged.}}

    @raise Error on decoding failure.

    Example:
    {[
      to_unicode "xn--mnchen-3ya.example.com"
      (* = "münchen.example.com" *)
    ]} *)

val label_to_unicode : string -> string
(** [label_to_unicode label] converts a single ACE label to Unicode.

    This implements the core ToUnicode operation for one label, as described in
    {{:https://datatracker.ietf.org/doc/html/rfc5891#section-4.2}RFC 5891
     Section 4.2}. Use {!to_unicode} to convert a whole dot-separated domain
    name.

    @raise Error on decoding failure. *)

(** {1 Domain Name Integration}

    Functions that work with the
    {{:https://github.com/hannesm/domain-name}domain-name} library types.

    These provide integration with the [Domain_name] module for applications
    that use that library for domain name handling. *)

val domain_to_ascii :
  ?check_hyphens:bool ->
  ?use_std3_rules:bool ->
  [ `raw ] Domain_name.t ->
  [ `raw ] Domain_name.t
(** [domain_to_ascii domain] converts a domain name to ASCII form.

    Applies {!to_ascii} to the string representation and returns the result as a
    [Domain_name.t].

    The optional arguments [check_hyphens] and [use_std3_rules] are named after
    the corresponding options of {!to_ascii}. The remaining {!to_ascii} options
    ([check_bidi], [check_joiners], [transitional]) are not exposed by this
    function.

    @raise Error on conversion failure.

    Example:
    {[
      let d = Domain_name.of_string_exn "münchen.example.com" in
      domain_to_ascii d
      (* = Domain_name.of_string_exn "xn--mnchen-3ya.example.com" *)
    ]} *)

val domain_to_unicode : [ `raw ] Domain_name.t -> [ `raw ] Domain_name.t
(** [domain_to_unicode domain] converts a domain name to Unicode form.

    Applies {!to_unicode} to the string representation and returns the result as
    a [Domain_name.t]. This is the counterpart of {!domain_to_ascii}.

    @raise Error on decoding failure. *)

(** {1 Validation} *)

val is_idna_valid : string -> bool
(** [is_idna_valid domain] checks if a domain name is valid for IDNA processing.

    Returns [true] if {!to_ascii} would succeed on the domain. Unlike
    {!to_ascii}, this function takes no optional arguments. *)

val is_ace_label : string -> bool
(** [is_ace_label label] is [true] if the label has the ACE prefix "xn--"
    (case-insensitive). This indicates the label is Punycode-encoded per
    {{:https://datatracker.ietf.org/doc/html/rfc3492#section-5}RFC 3492 Section
     5}. *)

(** {1 Normalization} *)

val normalize_nfc : string -> string
(** [normalize_nfc s] returns the NFC-normalized form of UTF-8 string [s].

    Per
    {{:https://datatracker.ietf.org/doc/html/rfc5891#section-4.2.1} RFC 5891
     Section 4.2.1}, domain labels must be normalized to NFC (Unicode
    Normalization Form C) before encoding.

    See {{:https://www.unicode.org/reports/tr15/}Unicode Standard Annex #15} for
    details on Unicode normalization forms. *)