···1+(* RFC 3492 Punycode Implementation *)
2+3+(* {1 Bootstring Parameters for Punycode (RFC 3492 Section 5)} *)
4+5+let base = 36
6+let tmin = 1
7+let tmax = 26
8+let skew = 38
9+let damp = 700
10+let initial_bias = 72
11+let initial_n = 0x80 (* 128 *)
12+let delimiter = '-'
13+let ace_prefix = "xn--"
14+let max_label_length = 63
15+16+(* {1 Position Tracking} *)
17+18+type position = {
19+ byte_offset : int;
20+ char_index : int;
21+}
22+23+let position_byte_offset pos = pos.byte_offset
24+let position_char_index pos = pos.char_index
25+26+let pp_position fmt pos =
27+ Format.fprintf fmt "byte %d, char %d" pos.byte_offset pos.char_index
28+29+30+(* {1 Error Types} *)
31+32+type error =
33+ | Overflow of position
34+ | Invalid_character of position * Uchar.t
35+ | Invalid_digit of position * char
36+ | Unexpected_end of position
37+ | Invalid_utf8 of position
38+ | Label_too_long of int
39+ | Empty_label
40+41+let pp_error fmt = function
42+ | Overflow pos ->
43+ Format.fprintf fmt "arithmetic overflow at %a" pp_position pos
44+ | Invalid_character (pos, u) ->
45+ Format.fprintf fmt "invalid character U+%04X at %a"
46+ (Uchar.to_int u) pp_position pos
47+ | Invalid_digit (pos, c) ->
48+ Format.fprintf fmt "invalid Punycode digit '%c' (0x%02X) at %a"
49+ c (Char.code c) pp_position pos
50+ | Unexpected_end pos ->
51+ Format.fprintf fmt "unexpected end of input at %a" pp_position pos
52+ | Invalid_utf8 pos ->
53+ Format.fprintf fmt "invalid UTF-8 sequence at %a" pp_position pos
54+ | Label_too_long len ->
55+ Format.fprintf fmt "label too long: %d bytes (max %d)" len max_label_length
56+ | Empty_label ->
57+ Format.fprintf fmt "empty label"
58+59+60+(* {1 Case Flags} *)
61+62+type case_flag = Uppercase | Lowercase
63+64+(* {1 Basic Predicates} *)
65+66+let is_basic u =
67+ Uchar.to_int u < 0x80
68+69+70+let is_delimiter c = c = delimiter
71+72+let is_ascii_string s =
73+ let rec loop i =
74+ if i >= String.length s then true
75+ else if Char.code s.[i] >= 0x80 then false
76+ else loop (i + 1)
77+ in
78+ loop 0
79+80+let has_ace_prefix s =
81+ let len = String.length s in
82+ len >= 4 &&
83+ (s.[0] = 'x' || s.[0] = 'X') &&
84+ (s.[1] = 'n' || s.[1] = 'N') &&
85+ s.[2] = '-' && s.[3] = '-'
86+87+(* {1 Digit Encoding/Decoding (RFC 3492 Section 5)}
88+89+ Digit values:
90+ - 0-25: a-z (or A-Z)
91+ - 26-35: 0-9
92+*)
93+94+let encode_digit d case_flag =
95+ if d < 26 then
96+ Char.chr (d + (if case_flag = Uppercase then 0x41 else 0x61))
97+ else
98+ Char.chr (d - 26 + 0x30)
99+100+let decode_digit c =
101+ let code = Char.code c in
102+ if code >= 0x30 && code <= 0x39 then
103+ Some (code - 0x30 + 26) (* '0'-'9' -> 26-35 *)
104+ else if code >= 0x41 && code <= 0x5A then
105+ Some (code - 0x41) (* 'A'-'Z' -> 0-25 *)
106+ else if code >= 0x61 && code <= 0x7A then
107+ Some (code - 0x61) (* 'a'-'z' -> 0-25 *)
108+ else
109+ None
110+111+(* Check if a character is "flagged" (uppercase) for case annotation *)
112+let is_flagged c =
113+ let code = Char.code c in
114+ code >= 0x41 && code <= 0x5A (* 'A'-'Z' *)
115+116+(* {1 Bias Adaptation (RFC 3492 Section 6.1)} *)
117+118+let adapt ~delta ~numpoints ~firsttime =
119+ let delta = if firsttime then delta / damp else delta / 2 in
120+ let delta = delta + (delta / numpoints) in
121+ let threshold = ((base - tmin) * tmax) / 2 in
122+ let rec loop delta k =
123+ if delta > threshold then
124+ loop (delta / (base - tmin)) (k + base)
125+ else
126+ k + (((base - tmin + 1) * delta) / (delta + skew))
127+ in
128+ loop delta 0
129+130+(* {1 Overflow-Safe Arithmetic}
131+132+ RFC 3492 Section 6.4: Use detection to avoid overflow.
133+ A + B overflows iff B > maxint - A
134+ A + B*C overflows iff B > (maxint - A) / C
135+*)
136+137+let max_int_value = max_int
138+139+let safe_mul_add a b c pos =
140+ if c = 0 then Ok a
141+ else if b > (max_int_value - a) / c then
142+ Error (Overflow pos)
143+ else
144+ Ok (a + b * c)
145+146+(* {1 UTF-8 to Code Points Conversion} *)
147+148+let utf8_to_codepoints s =
149+ let len = String.length s in
150+ let acc = ref [] in
151+ let byte_offset = ref 0 in
152+ let char_index = ref 0 in
153+ let error = ref None in
154+ while !byte_offset < len && !error = None do
155+ let pos = { byte_offset = !byte_offset; char_index = !char_index } in
156+ let dec = String.get_utf_8_uchar s !byte_offset in
157+ if Uchar.utf_decode_is_valid dec then begin
158+ acc := Uchar.utf_decode_uchar dec :: !acc;
159+ byte_offset := !byte_offset + Uchar.utf_decode_length dec;
160+ incr char_index
161+ end else begin
162+ error := Some (Invalid_utf8 pos)
163+ end
164+ done;
165+ match !error with
166+ | Some e -> Error e
167+ | None -> Ok (Array.of_list (List.rev !acc))
168+169+(* {1 Code Points to UTF-8 Conversion} *)
170+171+let codepoints_to_utf8 codepoints =
172+ let buf = Buffer.create (Array.length codepoints * 2) in
173+ Array.iter (Buffer.add_utf_8_uchar buf) codepoints;
174+ Buffer.contents buf
175+176+(* {1 Punycode Encoding (RFC 3492 Section 6.3)} *)
177+178+let encode_impl codepoints case_flags =
179+ let input_length = Array.length codepoints in
180+ if input_length = 0 then
181+ Ok ""
182+ else begin
183+ let output = Buffer.create (input_length * 2) in
184+185+ (* Copy basic code points to output *)
186+ let basic_count = ref 0 in
187+ for j = 0 to input_length - 1 do
188+ let cp = codepoints.(j) in
189+ if is_basic cp then begin
190+ let c = Uchar.to_int cp in
191+ let case =
192+ match case_flags with
193+ | Some flags -> flags.(j)
194+ | None -> Lowercase
195+ in
196+ (* Preserve or apply case for ASCII letters *)
197+ let c' =
198+ if c >= 0x41 && c <= 0x5A then (* 'A'-'Z' *)
199+ if case = Lowercase then c + 0x20 else c
200+ else if c >= 0x61 && c <= 0x7A then (* 'a'-'z' *)
201+ if case = Uppercase then c - 0x20 else c
202+ else
203+ c
204+ in
205+ Buffer.add_char output (Char.chr c');
206+ incr basic_count
207+ end
208+ done;
209+210+ let b = !basic_count in
211+ let h = ref b in
212+213+ (* Add delimiter if there were basic code points *)
214+ if b > 0 then
215+ Buffer.add_char output delimiter;
216+217+ (* Main encoding loop *)
218+ let n = ref initial_n in
219+ let delta = ref 0 in
220+ let bias = ref initial_bias in
221+222+ let result = ref (Ok ()) in
223+224+ while !h < input_length && !result = Ok () do
225+ (* Find minimum code point >= n *)
226+ let m = ref max_int_value in
227+ for j = 0 to input_length - 1 do
228+ let cp = Uchar.to_int codepoints.(j) in
229+ if cp >= !n && cp < !m then
230+ m := cp
231+ done;
232+233+ (* Increase delta to advance state to <m, 0> *)
234+ let pos = { byte_offset = 0; char_index = !h } in
235+ (match safe_mul_add !delta (!m - !n) (!h + 1) pos with
236+ | Error e -> result := Error e
237+ | Ok new_delta ->
238+ delta := new_delta;
239+ n := !m;
240+241+ (* Process each code point *)
242+ let j = ref 0 in
243+ while !j < input_length && !result = Ok () do
244+ let cp = Uchar.to_int codepoints.(!j) in
245+ let pos = { byte_offset = 0; char_index = !j } in
246+247+ if cp < !n then begin
248+ incr delta;
249+ if !delta = 0 then (* Overflow *)
250+ result := Error (Overflow pos)
251+ end
252+ else if cp = !n then begin
253+ (* Encode delta as variable-length integer *)
254+ let q = ref !delta in
255+ let k = ref base in
256+ let done_encoding = ref false in
257+258+ while not !done_encoding do
259+ let t =
260+ if !k <= !bias then tmin
261+ else if !k >= !bias + tmax then tmax
262+ else !k - !bias
263+ in
264+ if !q < t then begin
265+ (* Output final digit *)
266+ let case =
267+ match case_flags with
268+ | Some flags -> flags.(!j)
269+ | None -> Lowercase
270+ in
271+ Buffer.add_char output (encode_digit !q case);
272+ done_encoding := true
273+ end else begin
274+ (* Output intermediate digit and continue *)
275+ let digit = t + ((!q - t) mod (base - t)) in
276+ Buffer.add_char output (encode_digit digit Lowercase);
277+ q := (!q - t) / (base - t);
278+ k := !k + base
279+ end
280+ done;
281+282+ bias := adapt ~delta:!delta ~numpoints:(!h + 1) ~firsttime:(!h = b);
283+ delta := 0;
284+ incr h
285+ end;
286+ incr j
287+ done;
288+289+ incr delta;
290+ incr n)
291+ done;
292+293+ match !result with
294+ | Error e -> Error e
295+ | Ok () -> Ok (Buffer.contents output)
296+ end
297+298+let encode codepoints =
299+ encode_impl codepoints None
300+301+let encode_with_case codepoints case_flags =
302+ if Array.length codepoints <> Array.length case_flags then
303+ invalid_arg "encode_with_case: array lengths must match";
304+ encode_impl codepoints (Some case_flags)
305+306+(* {1 Punycode Decoding (RFC 3492 Section 6.2)} *)
307+308+let decode_impl input =
309+ let input_length = String.length input in
310+ if input_length = 0 then
311+ Ok ([||], [||])
312+ else begin
313+ (* Find last delimiter *)
314+ let last_delim = ref (-1) in
315+ for j = 0 to input_length - 1 do
316+ if is_delimiter input.[j] then
317+ last_delim := j
318+ done;
319+ let b = if !last_delim < 0 then 0 else !last_delim in
320+321+ (* Copy basic code points and extract case flags *)
322+ let output = ref [] in
323+ let case_output = ref [] in
324+ let error = ref None in
325+326+ for j = 0 to b - 1 do
327+ if !error = None then begin
328+ let c = input.[j] in
329+ let pos = { byte_offset = j; char_index = j } in
330+ let code = Char.code c in
331+ if code >= 0x80 then
332+ error := Some (Invalid_character (pos, Uchar.of_int code))
333+ else begin
334+ output := Uchar.of_int code :: !output;
335+ case_output := (if is_flagged c then Uppercase else Lowercase) :: !case_output
336+ end
337+ end
338+ done;
339+340+ match !error with
341+ | Some e -> Error e
342+ | None ->
343+ let output = ref (Array.of_list (List.rev !output)) in
344+ let case_output = ref (Array.of_list (List.rev !case_output)) in
345+346+ (* Main decoding loop *)
347+ let n = ref initial_n in
348+ let i = ref 0 in
349+ let bias = ref initial_bias in
350+ let in_pos = ref (if b > 0 then b + 1 else 0) in
351+ let result = ref (Ok ()) in
352+353+ while !in_pos < input_length && !result = Ok () do
354+ let oldi = !i in
355+ let w = ref 1 in
356+ let k = ref base in
357+ let done_decoding = ref false in
358+359+ while not !done_decoding && !result = Ok () do
360+ let pos = { byte_offset = !in_pos; char_index = Array.length !output } in
361+362+ if !in_pos >= input_length then begin
363+ result := Error (Unexpected_end pos);
364+ done_decoding := true
365+ end else begin
366+ let c = input.[!in_pos] in
367+ incr in_pos;
368+369+ match decode_digit c with
370+ | None ->
371+ result := Error (Invalid_digit (pos, c));
372+ done_decoding := true
373+ | Some digit ->
374+ (* i = i + digit * w, with overflow check *)
375+ (match safe_mul_add !i digit !w pos with
376+ | Error e ->
377+ result := Error e;
378+ done_decoding := true
379+ | Ok new_i ->
380+ i := new_i;
381+382+ let t =
383+ if !k <= !bias then tmin
384+ else if !k >= !bias + tmax then tmax
385+ else !k - !bias
386+ in
387+388+ if digit < t then begin
389+ (* Record case flag from this final digit *)
390+ done_decoding := true
391+ end else begin
392+ (* w = w * (base - t), with overflow check *)
393+ let base_minus_t = base - t in
394+ if !w > max_int_value / base_minus_t then begin
395+ result := Error (Overflow pos);
396+ done_decoding := true
397+ end else begin
398+ w := !w * base_minus_t;
399+ k := !k + base
400+ end
401+ end)
402+ end
403+ done;
404+405+ if !result = Ok () then begin
406+ let out_len = Array.length !output in
407+ bias := adapt ~delta:(!i - oldi) ~numpoints:(out_len + 1) ~firsttime:(oldi = 0);
408+409+ let pos = { byte_offset = !in_pos - 1; char_index = out_len } in
410+411+ (* n = n + i / (out_len + 1), with overflow check *)
412+ let increment = !i / (out_len + 1) in
413+ if increment > max_int_value - !n then
414+ result := Error (Overflow pos)
415+ else begin
416+ n := !n + increment;
417+ i := !i mod (out_len + 1);
418+419+ (* Validate that n is a valid Unicode scalar value *)
420+ if not (Uchar.is_valid !n) then
421+ result := Error (Invalid_character (pos, Uchar.rep))
422+ else begin
423+ (* Insert n at position i *)
424+ let new_output = Array.make (out_len + 1) (Uchar.of_int 0) in
425+ let new_case = Array.make (out_len + 1) Lowercase in
426+427+ for j = 0 to !i - 1 do
428+ new_output.(j) <- !output.(j);
429+ new_case.(j) <- !case_output.(j)
430+ done;
431+ new_output.(!i) <- Uchar.of_int !n;
432+ (* Case flag from final digit of this delta *)
433+ new_case.(!i) <- (if !in_pos > 0 && is_flagged input.[!in_pos - 1]
434+ then Uppercase else Lowercase);
435+ for j = !i to out_len - 1 do
436+ new_output.(j + 1) <- !output.(j);
437+ new_case.(j + 1) <- !case_output.(j)
438+ done;
439+440+ output := new_output;
441+ case_output := new_case;
442+ incr i
443+ end
444+ end
445+ end
446+ done;
447+448+ match !result with
449+ | Error e -> Error e
450+ | Ok () -> Ok (!output, !case_output)
451+ end
452+453+let decode input =
454+ match decode_impl input with
455+ | Error e -> Error e
456+ | Ok (codepoints, _) -> Ok codepoints
457+458+let decode_with_case input =
459+ decode_impl input
460+461+(* {1 UTF-8 String Operations} *)
462+463+let encode_utf8 s =
464+ match utf8_to_codepoints s with
465+ | Error e -> Error e
466+ | Ok codepoints -> encode codepoints
467+468+let decode_utf8 punycode =
469+ match decode punycode with
470+ | Error e -> Error e
471+ | Ok codepoints -> Ok (codepoints_to_utf8 codepoints)
472+473+(* {1 Domain Label Operations} *)
474+475+let encode_label label =
476+ if String.length label = 0 then
477+ Error Empty_label
478+ else if is_ascii_string label then begin
479+ (* All ASCII - return as-is, but check length *)
480+ let len = String.length label in
481+ if len > max_label_length then
482+ Error (Label_too_long len)
483+ else
484+ Ok label
485+ end else begin
486+ (* Has non-ASCII - encode with Punycode *)
487+ match encode_utf8 label with
488+ | Error e -> Error e
489+ | Ok encoded ->
490+ let result = ace_prefix ^ encoded in
491+ let len = String.length result in
492+ if len > max_label_length then
493+ Error (Label_too_long len)
494+ else
495+ Ok result
496+ end
497+498+let decode_label label =
499+ if String.length label = 0 then
500+ Error Empty_label
501+ else if has_ace_prefix label then begin
502+ (* Remove ACE prefix and decode *)
503+ let punycode = String.sub label 4 (String.length label - 4) in
504+ decode_utf8 punycode
505+ end else begin
506+ (* No ACE prefix - validate and return *)
507+ if is_ascii_string label then
508+ Ok label
509+ else
510+ (* Has non-ASCII but no ACE prefix - return as-is *)
511+ Ok label
512+ end
···1+(** RFC 3492 Punycode: A Bootstring encoding of Unicode for IDNA.
2+3+ This module implements the Punycode algorithm as specified in
4+ {{:https://datatracker.ietf.org/doc/html/rfc3492}RFC 3492},
5+ providing encoding and decoding of Unicode strings to/from ASCII-compatible
6+ encoding suitable for use in internationalized domain names.
7+8+ Punycode is an instance of Bootstring that uses particular parameter
9+ values appropriate for IDNA. See
10+ {{:https://datatracker.ietf.org/doc/html/rfc3492#section-5}RFC 3492 Section 5}
11+ for the specific parameter values.
12+13+ {2 References}
14+ {ul
15+ {- {{:https://datatracker.ietf.org/doc/html/rfc3492}RFC 3492} - Punycode: A Bootstring encoding of Unicode for IDNA}
16+ {- {{:https://datatracker.ietf.org/doc/html/rfc5891}RFC 5891} - IDNA Protocol}} *)
17+18+(** {1 Position Tracking} *)
19+20+type position
21+(** Abstract type representing a position in input for error reporting.
22+ Positions track both byte offset and Unicode character index. *)
23+24+val position_byte_offset : position -> int
25+(** [position_byte_offset pos] returns the byte offset in the input. *)
26+27+val position_char_index : position -> int
28+(** [position_char_index pos] returns the Unicode character index (0-based). *)
29+30+val pp_position : Format.formatter -> position -> unit
31+(** [pp_position fmt pos] pretty-prints a position as "byte N, char M". *)
32+33+(** {1 Error Types} *)
34+35+type error =
36+ | Overflow of position
37+ (** Arithmetic overflow during encode/decode. This can occur with
38+ very long strings or extreme Unicode code point values.
39+ See {{:https://datatracker.ietf.org/doc/html/rfc3492#section-6.4}
40+ RFC 3492 Section 6.4} for overflow handling requirements. *)
41+ | Invalid_character of position * Uchar.t
42+ (** A non-basic code point appeared where only basic code points
43+ (ASCII < 128) are allowed. Per
44+ {{:https://datatracker.ietf.org/doc/html/rfc3492#section-3.1}
45+ RFC 3492 Section 3.1}, basic code points must be segregated
46+ at the beginning of the encoded string. *)
47+ | Invalid_digit of position * char
48+ (** An invalid Punycode digit was encountered during decoding.
49+ Valid digits are a-z, A-Z (values 0-25) and 0-9 (values 26-35).
50+ See {{:https://datatracker.ietf.org/doc/html/rfc3492#section-5}
51+ RFC 3492 Section 5} for digit-value mappings. *)
52+ | Unexpected_end of position
53+ (** The input ended prematurely during decoding of a delta value.
54+ See {{:https://datatracker.ietf.org/doc/html/rfc3492#section-6.2}
55+ RFC 3492 Section 6.2} decoding procedure. *)
56+ | Invalid_utf8 of position
57+ (** Malformed UTF-8 sequence in input string. *)
58+ | Label_too_long of int
59+ (** Encoded label exceeds 63 bytes (DNS limit per
60+ {{:https://datatracker.ietf.org/doc/html/rfc1035}RFC 1035}).
61+ The int is the actual length. *)
62+ | Empty_label
63+ (** Empty label is not valid for encoding. *)
64+65+val pp_error : Format.formatter -> error -> unit
66+(** [pp_error fmt e] pretty-prints an error with position information. *)
67+68+(** {1 Constants}
69+70+ Punycode parameters as specified in
71+ {{:https://datatracker.ietf.org/doc/html/rfc3492#section-5}RFC 3492 Section 5}. *)
72+73+val ace_prefix : string
74+(** The ACE prefix ["xn--"] used for Punycode-encoded domain labels.
75+ See {{:https://datatracker.ietf.org/doc/html/rfc3492#section-5}
76+ RFC 3492 Section 5} which notes that IDNA prepends this prefix. *)
77+78+val max_label_length : int
79+(** Maximum length of a domain label in bytes (63), per
80+ {{:https://datatracker.ietf.org/doc/html/rfc1035}RFC 1035}. *)
81+82+(** {1 Case Flags for Mixed-Case Annotation}
83+84+ {{:https://datatracker.ietf.org/doc/html/rfc3492#appendix-A}RFC 3492 Appendix A}
85+ describes an optional mechanism for preserving case information through
86+ the encoding/decoding round-trip. This is useful when the original
87+ string's case should be recoverable.
88+89+ Note: Mixed-case annotation is not used by the ToASCII and ToUnicode
90+ operations of IDNA. *)
91+92+type case_flag = Uppercase | Lowercase
93+(** Case annotation for a character. *)
94+95+(** {1 Core Punycode Operations}
96+97+ These functions implement the Bootstring algorithms from
98+ {{:https://datatracker.ietf.org/doc/html/rfc3492#section-6}RFC 3492 Section 6}.
99+ They operate on arrays of Unicode code points ([Uchar.t array]).
100+ The encoded output is a plain ASCII string without the ACE prefix. *)
101+102+val encode : Uchar.t array -> (string, error) result
103+(** [encode codepoints] encodes an array of Unicode code points to a
104+ Punycode ASCII string.
105+106+ Implements the encoding procedure from
107+ {{:https://datatracker.ietf.org/doc/html/rfc3492#section-6.3}RFC 3492 Section 6.3}:
108+109+ 1. Basic code points (ASCII < 128) are copied literally to the beginning
110+ of the output per {{:https://datatracker.ietf.org/doc/html/rfc3492#section-3.1}
111+ Section 3.1 (Basic code point segregation)}
112+ 2. A delimiter ('-') is appended if there are any basic code points
113+ 3. Non-basic code points are encoded as deltas using the generalized
114+ variable-length integer representation from
115+ {{:https://datatracker.ietf.org/doc/html/rfc3492#section-3.3}Section 3.3}
116+117+ Example:
118+ {[
119+ encode [| Uchar.of_int 0x4ED6; Uchar.of_int 0x4EEC; ... |]
120+ (* = Ok "ihqwcrb4cv8a8dqg056pqjye" *)
121+ ]} *)
122+123+val decode : string -> (Uchar.t array, error) result
124+(** [decode punycode] decodes a Punycode ASCII string to an array of
125+ Unicode code points.
126+127+ Implements the decoding procedure from
128+ {{:https://datatracker.ietf.org/doc/html/rfc3492#section-6.2}RFC 3492 Section 6.2}.
129+130+ The input should be the Punycode portion only, without the ACE prefix.
131+ The decoder is case-insensitive for the encoded portion, as required by
132+ {{:https://datatracker.ietf.org/doc/html/rfc3492#section-5}RFC 3492 Section 5}:
133+ "A decoder MUST recognize the letters in both uppercase and lowercase forms".
134+135+ Example:
136+ {[
137+ decode "ihqwcrb4cv8a8dqg056pqjye"
138+ (* = Ok [| U+4ED6; U+4EEC; U+4E3A; ... |] (Chinese simplified) *)
139+ ]} *)
140+141+(** {1 Mixed-Case Annotation}
142+143+ These functions support round-trip case preservation as described
144+ in {{:https://datatracker.ietf.org/doc/html/rfc3492#appendix-A}RFC 3492 Appendix A}. *)
145+146+val encode_with_case : Uchar.t array -> case_flag array -> (string, error) result
147+(** [encode_with_case codepoints case_flags] encodes with case annotation.
148+149+ Per {{:https://datatracker.ietf.org/doc/html/rfc3492#appendix-A}RFC 3492 Appendix A}:
150+ - For basic (ASCII) letters, the output preserves the case flag directly
151+ - For non-ASCII characters, the case of the final digit in each delta
152+ encoding indicates the flag (uppercase = suggested uppercase)
153+154+ The [case_flags] array must have the same length as [codepoints].
155+156+ @raise Invalid_argument if array lengths don't match. *)
157+158+val decode_with_case : string -> (Uchar.t array * case_flag array, error) result
159+(** [decode_with_case punycode] decodes and extracts case annotations.
160+161+ Per {{:https://datatracker.ietf.org/doc/html/rfc3492#appendix-A}RFC 3492 Appendix A},
162+ returns both the decoded code points and an array of case flags
163+ indicating the suggested case for each character based on the
164+ uppercase/lowercase form of the encoding digits. *)
165+166+(** {1 UTF-8 String Operations}
167+168+ Convenience functions that work directly with UTF-8 encoded OCaml strings.
169+ These combine UTF-8 decoding/encoding with the core Punycode operations. *)
170+171+val encode_utf8 : string -> (string, error) result
172+(** [encode_utf8 s] encodes a UTF-8 string to Punycode (no ACE prefix).
173+174+ This is equivalent to decoding [s] from UTF-8 to code points, then
175+ calling {!encode}.
176+177+ Example:
178+ {[
179+ encode_utf8 "münchen"
180+ (* = Ok "mnchen-3ya" *)
181+ ]} *)
182+183+val decode_utf8 : string -> (string, error) result
184+(** [decode_utf8 punycode] decodes Punycode to a UTF-8 string (no ACE prefix).
185+186+ This is equivalent to calling {!decode} then encoding the result as UTF-8.
187+188+ Example:
189+ {[
190+ decode_utf8 "mnchen-3ya"
191+ (* = Ok "münchen" *)
192+ ]} *)
193+194+(** {1 Domain Label Operations}
195+196+ These functions handle the ACE prefix automatically and enforce
197+ DNS label length limits per {{:https://datatracker.ietf.org/doc/html/rfc1035}RFC 1035}. *)
198+199+val encode_label : string -> (string, error) result
200+(** [encode_label label] encodes a domain label for use in DNS.
201+202+ If the label contains only ASCII characters, it is returned unchanged.
203+ Otherwise, it is Punycode-encoded with the ACE prefix ("xn--") prepended,
204+ as specified in {{:https://datatracker.ietf.org/doc/html/rfc3492#section-5}
205+ RFC 3492 Section 5}.
206+207+ Returns {!Error} {!Label_too_long} if the result exceeds 63 bytes.
208+209+ Example:
210+ {[
211+ encode_label "münchen"
212+ (* = Ok "xn--mnchen-3ya" *)
213+ encode_label "example"
214+ (* = Ok "example" *)
215+ ]} *)
216+217+val decode_label : string -> (string, error) result
218+(** [decode_label label] decodes a domain label.
219+220+ If the label starts with the ACE prefix ("xn--", case-insensitive),
221+ it is Punycode-decoded. Otherwise, it is returned unchanged.
222+223+ Example:
224+ {[
225+ decode_label "xn--mnchen-3ya"
226+ (* = Ok "münchen" *)
227+ decode_label "example"
228+ (* = Ok "example" *)
229+ ]} *)
230+231+(** {1 Validation}
232+233+ Predicate functions for checking code point and string properties. *)
234+235+val is_basic : Uchar.t -> bool
236+(** [is_basic u] is [true] if [u] is a basic code point (ASCII, < 128).
237+238+ Per {{:https://datatracker.ietf.org/doc/html/rfc3492#section-5}RFC 3492 Section 5},
239+ basic code points for Punycode are the ASCII code points (0..7F). *)
240+241+val is_ascii_string : string -> bool
242+(** [is_ascii_string s] is [true] if [s] contains only ASCII characters
243+ (all bytes < 128). *)
244+245+val has_ace_prefix : string -> bool
246+(** [has_ace_prefix s] is [true] if [s] starts with the ACE prefix "xn--"
247+ (case-insensitive comparison). *)
···1+(* IDNA (Internationalized Domain Names in Applications) Implementation *)
2+3+let max_domain_length = 253
4+5+(* {1 Error Types} *)
6+7+type error =
8+ | Punycode_error of Punycode.error
9+ | Invalid_label of string
10+ | Domain_too_long of int
11+ | Normalization_failed
12+ | Verification_failed
13+14+let pp_error fmt = function
15+ | Punycode_error e ->
16+ Format.fprintf fmt "Punycode error: %a" Punycode.pp_error e
17+ | Invalid_label msg ->
18+ Format.fprintf fmt "invalid label: %s" msg
19+ | Domain_too_long len ->
20+ Format.fprintf fmt "domain too long: %d bytes (max %d)" len max_domain_length
21+ | Normalization_failed ->
22+ Format.fprintf fmt "Unicode normalization failed"
23+ | Verification_failed ->
24+ Format.fprintf fmt "IDNA verification failed (round-trip mismatch)"
25+26+27+(* {1 Unicode Normalization} *)
28+29+let normalize_nfc s =
30+ Uunf_string.normalize_utf_8 `NFC s
31+32+(* {1 Validation Helpers} *)
33+34+let is_ace_label label =
35+ Punycode.has_ace_prefix label
36+37+(* Check if a label follows STD3 rules (hostname restrictions):
38+ - Only LDH (letters, digits, hyphens)
39+ - Cannot start or end with hyphen *)
40+let is_std3_valid label =
41+ let len = String.length label in
42+ if len = 0 then false
43+ else if label.[0] = '-' || label.[len - 1] = '-' then false
44+ else
45+ let rec check i =
46+ if i >= len then true
47+ else
48+ let c = label.[i] in
49+ let valid =
50+ (c >= 'a' && c <= 'z') ||
51+ (c >= 'A' && c <= 'Z') ||
52+ (c >= '0' && c <= '9') ||
53+ c = '-'
54+ in
55+ if valid then check (i + 1) else false
56+ in
57+ check 0
58+59+(* Check hyphen placement: hyphens not in positions 3 and 4 (except for ACE) *)
60+let check_hyphen_rules label =
61+ let len = String.length label in
62+ if len >= 4 && label.[2] = '-' && label.[3] = '-' then
63+ (* Hyphens in positions 3 and 4 - only valid for ACE prefix *)
64+ is_ace_label label
65+ else
66+ true
67+68+(* {1 Label Operations} *)
69+70+let label_to_ascii_impl ~check_hyphens ~use_std3_rules label =
71+ let len = String.length label in
72+ if len = 0 then
73+ Error (Invalid_label "empty label")
74+ else if len > Punycode.max_label_length then
75+ Error (Punycode_error (Punycode.Label_too_long len))
76+ else if Punycode.is_ascii_string label then begin
77+ (* All ASCII - validate and pass through *)
78+ if use_std3_rules && not (is_std3_valid label) then
79+ Error (Invalid_label "STD3 rules violation")
80+ else if check_hyphens && not (check_hyphen_rules label) then
81+ Error (Invalid_label "invalid hyphen placement")
82+ else
83+ Ok label
84+ end else begin
85+ (* Has non-ASCII - normalize and encode *)
86+ let normalized = normalize_nfc label in
87+88+ (* Encode to Punycode *)
89+ match Punycode.encode_utf8 normalized with
90+ | Error e -> Error (Punycode_error e)
91+ | Ok encoded ->
92+ let result = Punycode.ace_prefix ^ encoded in
93+ let result_len = String.length result in
94+ if result_len > Punycode.max_label_length then
95+ Error (Punycode_error (Punycode.Label_too_long result_len))
96+ else if check_hyphens && not (check_hyphen_rules result) then
97+ Error (Invalid_label "invalid hyphen placement in encoded label")
98+ else
99+ (* Verification: decode and compare to original normalized form *)
100+ match Punycode.decode_utf8 encoded with
101+ | Error _ -> Error Verification_failed
102+ | Ok decoded ->
103+ if decoded <> normalized then
104+ Error Verification_failed
105+ else
106+ Ok result
107+ end
108+109+let label_to_ascii ?(check_hyphens = true) ?(use_std3_rules = false) label =
110+ label_to_ascii_impl ~check_hyphens ~use_std3_rules label
111+112+let label_to_unicode label =
113+ if is_ace_label label then begin
114+ let encoded = String.sub label 4 (String.length label - 4) in
115+ match Punycode.decode_utf8 encoded with
116+ | Error e -> Error (Punycode_error e)
117+ | Ok decoded -> Ok decoded
118+ end else
119+ Ok label
120+121+(* {1 Domain Operations} *)
122+123+(* Split domain into labels *)
124+let split_domain domain =
125+ String.split_on_char '.' domain
126+127+(* Join labels into domain *)
128+let join_labels labels =
129+ String.concat "." labels
130+131+let to_ascii ?(check_hyphens = true) ?(check_bidi = false)
132+ ?(check_joiners = false) ?(use_std3_rules = false)
133+ ?(transitional = false) domain =
134+ (* Note: check_bidi, check_joiners, and transitional are accepted but
135+ not fully implemented - they would require additional Unicode data *)
136+ let _ = check_bidi in
137+ let _ = check_joiners in
138+ let _ = transitional in
139+140+ let labels = split_domain domain in
141+ let rec process acc = function
142+ | [] ->
143+ let result = join_labels (List.rev acc) in
144+ let len = String.length result in
145+ if len > max_domain_length then
146+ Error (Domain_too_long len)
147+ else
148+ Ok result
149+ | label :: rest ->
150+ match label_to_ascii_impl ~check_hyphens ~use_std3_rules label with
151+ | Error e -> Error e
152+ | Ok encoded -> process (encoded :: acc) rest
153+ in
154+ process [] labels
155+156+let to_unicode domain =
157+ let labels = split_domain domain in
158+ let rec process acc = function
159+ | [] -> Ok (join_labels (List.rev acc))
160+ | label :: rest ->
161+ match label_to_unicode label with
162+ | Error e -> Error e
163+ | Ok decoded -> process (decoded :: acc) rest
164+ in
165+ process [] labels
166+167+(* {1 Domain Name Library Integration} *)
168+169+let domain_to_ascii ?(check_hyphens = true) ?(use_std3_rules = false) domain =
170+ let s = Domain_name.to_string domain in
171+ match to_ascii ~check_hyphens ~use_std3_rules s with
172+ | Error e -> Error e
173+ | Ok ascii ->
174+ match Domain_name.of_string ascii with
175+ | Error (`Msg msg) -> Error (Invalid_label msg)
176+ | Ok d -> Ok d
177+178+let domain_to_unicode domain =
179+ let s = Domain_name.to_string domain in
180+ match to_unicode s with
181+ | Error e -> Error e
182+ | Ok unicode ->
183+ match Domain_name.of_string unicode with
184+ | Error (`Msg msg) -> Error (Invalid_label msg)
185+ | Ok d -> Ok d
186+187+(* {1 Validation} *)
188+189+let is_idna_valid domain =
190+ match to_ascii domain with
191+ | Ok _ -> true
192+ | Error _ -> false
···1+(** IDNA (Internationalized Domain Names in Applications) support.
2+3+ This module provides ToASCII and ToUnicode operations as specified
4+ in {{:https://datatracker.ietf.org/doc/html/rfc5891}RFC 5891} (IDNA 2008),
5+ using Punycode ({{:https://datatracker.ietf.org/doc/html/rfc3492}RFC 3492})
6+ for encoding.
7+8+ IDNA allows domain names to contain non-ASCII Unicode characters by
9+ encoding them using Punycode with an ACE prefix. This module handles
10+ the conversion between Unicode domain names and their ASCII-compatible
11+ encoding (ACE) form.
12+13+ {2 References}
14+ {ul
15+ {- {{:https://datatracker.ietf.org/doc/html/rfc5891}RFC 5891} -
16+ Internationalized Domain Names in Applications (IDNA): Protocol}
17+ {- {{:https://datatracker.ietf.org/doc/html/rfc5892}RFC 5892} -
18+ The Unicode Code Points and Internationalized Domain Names for Applications (IDNA)}
19+ {- {{:https://datatracker.ietf.org/doc/html/rfc5893}RFC 5893} -
20+ Right-to-Left Scripts for Internationalized Domain Names for Applications (IDNA)}
21+ {- {{:https://datatracker.ietf.org/doc/html/rfc3492}RFC 3492} -
22+ Punycode: A Bootstring encoding of Unicode for IDNA}} *)
23+24+(** {1 Error Types} *)
25+26+type error =
27+ | Punycode_error of Punycode.error
28+ (** Error during Punycode encoding/decoding.
29+ See {!Punycode.error} for details. *)
30+ | Invalid_label of string
31+ (** Label violates IDNA constraints. The string describes the violation.
32+ See {{:https://datatracker.ietf.org/doc/html/rfc5891#section-4}
33+ RFC 5891 Section 4} for label validation requirements. *)
34+ | Domain_too_long of int
35+ (** Domain name exceeds 253 bytes, per
36+ {{:https://datatracker.ietf.org/doc/html/rfc1035}RFC 1035}.
37+ The int is the actual length. *)
38+ | Normalization_failed
39+ (** Unicode normalization (NFC) failed.
40+ Per {{:https://datatracker.ietf.org/doc/html/rfc5891#section-4.2.1}
41+ RFC 5891 Section 4.2.1}, labels must be in NFC form. *)
42+ | Verification_failed
43+ (** ToASCII/ToUnicode verification step failed (round-trip check).
44+ Per {{:https://datatracker.ietf.org/doc/html/rfc5891#section-4.2}
45+ RFC 5891 Section 4.2}, the result of encoding must decode back
46+ to the original input. *)
47+48+val pp_error : Format.formatter -> error -> unit
49+(** [pp_error fmt e] pretty-prints an error. *)
50+51+(** {1 Constants} *)
52+53+val max_domain_length : int
54+(** Maximum length of a domain name in bytes (253), per
55+ {{:https://datatracker.ietf.org/doc/html/rfc1035}RFC 1035}. *)
56+57+(** {1 ToASCII Operation}
58+59+ Converts an internationalized domain name to its ASCII-compatible
60+ encoding (ACE) form suitable for DNS lookup.
61+62+ See {{:https://datatracker.ietf.org/doc/html/rfc5891#section-4}
63+ RFC 5891 Section 4} for the complete ToASCII specification. *)
64+65+val to_ascii : ?check_hyphens:bool -> ?check_bidi:bool ->
66+ ?check_joiners:bool -> ?use_std3_rules:bool ->
67+ ?transitional:bool -> string -> (string, error) result
68+(** [to_ascii domain] converts an internationalized domain name to ASCII.
69+70+ Implements the ToASCII operation from
71+ {{:https://datatracker.ietf.org/doc/html/rfc5891#section-4.1}RFC 5891 Section 4.1}.
72+73+ For each label in the domain:
74+ 1. If all ASCII, pass through (with optional STD3 validation)
75+ 2. Otherwise, normalize to NFC per
76+ {{:https://datatracker.ietf.org/doc/html/rfc5891#section-4.2.1}Section 4.2.1}
77+ and Punycode-encode with ACE prefix
78+79+ Optional parameters (per {{:https://datatracker.ietf.org/doc/html/rfc5891#section-4}
80+ RFC 5891 Section 4} processing options):
81+ - [check_hyphens]: Validate hyphen placement per
82+ {{:https://datatracker.ietf.org/doc/html/rfc5891#section-4.2.3.1}Section 4.2.3.1}
83+ (default: true)
84+ - [check_bidi]: Check bidirectional text rules per
85+ {{:https://datatracker.ietf.org/doc/html/rfc5893}RFC 5893}
86+ (default: false, not implemented)
87+ - [check_joiners]: Check contextual joiner rules per
88+ {{:https://datatracker.ietf.org/doc/html/rfc5892#appendix-A.1}RFC 5892 Appendix A.1}
89+ (default: false, not implemented)
90+ - [use_std3_rules]: Apply STD3 hostname rules per
91+ {{:https://datatracker.ietf.org/doc/html/rfc5891#section-4.2.3.2}Section 4.2.3.2}
92+ (default: false)
93+ - [transitional]: Use IDNA 2003 transitional processing
94+ (default: false)
95+96+ Example:
97+ {[
98+ to_ascii "münchen.example.com"
99+ (* = Ok "xn--mnchen-3ya.example.com" *)
100+ ]} *)
101+102+val label_to_ascii : ?check_hyphens:bool -> ?use_std3_rules:bool ->
103+ string -> (string, error) result
104+(** [label_to_ascii label] converts a single label to ASCII.
105+106+ This implements the core ToASCII operation for one label, as described in
107+ {{:https://datatracker.ietf.org/doc/html/rfc5891#section-4.1}RFC 5891 Section 4.1}. *)
108+109+(** {1 ToUnicode Operation}
110+111+ Converts an ASCII-compatible encoded domain name back to Unicode.
112+113+ See {{:https://datatracker.ietf.org/doc/html/rfc5891#section-4.2}
114+ RFC 5891 Section 4.2} for the complete ToUnicode specification. *)
115+116+val to_unicode : string -> (string, error) result
117+(** [to_unicode domain] converts an ACE domain name to Unicode.
118+119+ Implements the ToUnicode operation from
120+ {{:https://datatracker.ietf.org/doc/html/rfc5891#section-4.2}RFC 5891 Section 4.2}.
121+122+ For each label in the domain:
123+ 1. If it has the ACE prefix ("xn--"), Punycode-decode it per
124+ {{:https://datatracker.ietf.org/doc/html/rfc3492#section-6.2}RFC 3492 Section 6.2}
125+ 2. Otherwise, pass through unchanged
126+127+ Example:
128+ {[
129+ to_unicode "xn--mnchen-3ya.example.com"
130+ (* = Ok "münchen.example.com" *)
131+ ]} *)
132+133+val label_to_unicode : string -> (string, error) result
134+(** [label_to_unicode label] converts a single ACE label to Unicode.
135+136+ This implements the core ToUnicode operation for one label, as described in
137+ {{:https://datatracker.ietf.org/doc/html/rfc5891#section-4.2}RFC 5891 Section 4.2}. *)
138+139+(** {1 Domain Name Integration}
140+141+ Functions that work with the
142+ {{:https://github.com/hannesm/domain-name}domain-name} library types.
143+144+ These provide integration with the [Domain_name] module for applications
145+ that use that library for domain name handling. *)
146+147+val domain_to_ascii : ?check_hyphens:bool -> ?use_std3_rules:bool ->
148+ [`raw] Domain_name.t -> ([`raw] Domain_name.t, error) result
149+(** [domain_to_ascii domain] converts a domain name to ASCII form.
150+151+ Applies {!to_ascii} to the string representation and returns the
152+ result as a [Domain_name.t].
153+154+ Example:
155+ {[
156+ let d = Domain_name.of_string_exn "münchen.example.com" in
157+ domain_to_ascii d
158+ (* = Ok (Domain_name.of_string_exn "xn--mnchen-3ya.example.com") *)
159+ ]} *)
160+161+val domain_to_unicode : [`raw] Domain_name.t -> ([`raw] Domain_name.t, error) result
162+(** [domain_to_unicode domain] converts a domain name to Unicode form.
163+164+ Applies {!to_unicode} to the string representation and returns the
165+ result as a [Domain_name.t]. *)
166+167+(** {1 Validation} *)
168+169+val is_idna_valid : string -> bool
170+(** [is_idna_valid domain] checks if a domain name is valid for IDNA processing.
171+172+ Returns [true] if {!to_ascii} would succeed on the domain. *)
173+174+val is_ace_label : string -> bool
175+(** [is_ace_label label] is [true] if the label has the ACE prefix "xn--"
176+ (case-insensitive). This indicates the label is Punycode-encoded per
177+ {{:https://datatracker.ietf.org/doc/html/rfc3492#section-5}RFC 3492 Section 5}. *)
178+179+(** {1 Normalization} *)
180+181+val normalize_nfc : string -> string
182+(** [normalize_nfc s] returns the NFC-normalized form of UTF-8 string [s].
183+184+ Per {{:https://datatracker.ietf.org/doc/html/rfc5891#section-4.2.1}
185+ RFC 5891 Section 4.2.1}, domain labels must be normalized to NFC
186+ (Unicode Normalization Form C) before encoding.
187+188+ See {{:http://www.unicode.org/reports/tr15/}Unicode Standard Annex #15}
189+ for details on Unicode normalization forms. *)
+37
puny.opam
···0000000000000000000000000000000000000
···1+# This file is generated by dune, edit dune-project instead
2+opam-version: "2.0"
3+version: "0.1.0"
4+synopsis: "RFC 3492 Punycode and IDNA implementation for OCaml"
5+description: """
6+A high-quality implementation of RFC 3492 (Punycode) with IDNA support.
7+ Provides encoding and decoding of internationalized domain names,
8+ with proper Unicode normalization and mixed-case annotation support."""
9+maintainer: ["maintainer@example.com"]
10+authors: ["Author Name"]
11+license: "ISC"
12+homepage: "https://github.com/username/puny"
13+bug-reports: "https://github.com/username/puny/issues"
14+depends: [
15+ "ocaml" {>= "4.14.0"}
16+ "dune" {>= "3.0" & >= "3.0"}
17+ "uutf" {>= "1.0.0"}
18+ "uunf" {>= "15.0.0"}
19+ "domain-name" {>= "0.4.0"}
20+ "alcotest" {with-test}
21+ "odoc" {with-doc}
22+]
23+build: [
24+ ["dune" "subst"] {dev}
25+ [
26+ "dune"
27+ "build"
28+ "-p"
29+ name
30+ "-j"
31+ jobs
32+ "@install"
33+ "@runtest" {with-test}
34+ "@doc" {with-doc}
35+ ]
36+]
37+dev-repo: "git+https://github.com/username/puny.git"
···1+2+3+4+5+6+7+Network Working Group A. Costello
8+Request for Comments: 3492 Univ. of California, Berkeley
9+Category: Standards Track March 2003
10+11+12+ Punycode: A Bootstring encoding of Unicode
13+ for Internationalized Domain Names in Applications (IDNA)
14+15+Status of this Memo
16+17+ This document specifies an Internet standards track protocol for the
18+ Internet community, and requests discussion and suggestions for
19+ improvements. Please refer to the current edition of the "Internet
20+ Official Protocol Standards" (STD 1) for the standardization state
21+ and status of this protocol. Distribution of this memo is unlimited.
22+23+Copyright Notice
24+25+ Copyright (C) The Internet Society (2003). All Rights Reserved.
26+27+Abstract
28+29+ Punycode is a simple and efficient transfer encoding syntax designed
30+ for use with Internationalized Domain Names in Applications (IDNA).
31+ It uniquely and reversibly transforms a Unicode string into an ASCII
32+ string. ASCII characters in the Unicode string are represented
33+ literally, and non-ASCII characters are represented by ASCII
34+ characters that are allowed in host name labels (letters, digits, and
35+ hyphens). This document defines a general algorithm called
36+ Bootstring that allows a string of basic code points to uniquely
37+ represent any string of code points drawn from a larger set.
38+ Punycode is an instance of Bootstring that uses particular parameter
39+ values specified by this document, appropriate for IDNA.
40+41+Table of Contents
42+43+ 1. Introduction...............................................2
44+ 1.1 Features..............................................2
45+ 1.2 Interaction of protocol parts.........................3
46+ 2. Terminology................................................3
47+ 3. Bootstring description.....................................4
48+ 3.1 Basic code point segregation..........................4
49+ 3.2 Insertion unsort coding...............................4
50+ 3.3 Generalized variable-length integers..................5
51+ 3.4 Bias adaptation.......................................7
52+ 4. Bootstring parameters......................................8
53+ 5. Parameter values for Punycode..............................8
54+ 6. Bootstring algorithms......................................9
55+56+57+58+Costello Standards Track [Page 1]
59+60+RFC 3492 IDNA Punycode March 2003
61+62+63+ 6.1 Bias adaptation function.............................10
64+ 6.2 Decoding procedure...................................11
65+ 6.3 Encoding procedure...................................12
66+ 6.4 Overflow handling....................................13
67+ 7. Punycode examples.........................................14
68+ 7.1 Sample strings.......................................14
69+ 7.2 Decoding traces......................................17
70+ 7.3 Encoding traces......................................19
71+ 8. Security Considerations...................................20
72+ 9. References................................................21
73+ 9.1 Normative References.................................21
74+ 9.2 Informative References...............................21
75+ A. Mixed-case annotation.....................................22
76+ B. Disclaimer and license....................................22
77+ C. Punycode sample implementation............................23
78+ Author's Address.............................................34
79+ Full Copyright Statement.....................................35
80+81+1. Introduction
82+83+ [IDNA] describes an architecture for supporting internationalized
84+ domain names. Labels containing non-ASCII characters can be
85+ represented by ACE labels, which begin with a special ACE prefix and
86+ contain only ASCII characters. The remainder of the label after the
87+ prefix is a Punycode encoding of a Unicode string satisfying certain
88+ constraints. For the details of the prefix and constraints, see
89+ [IDNA] and [NAMEPREP].
90+91+ Punycode is an instance of a more general algorithm called
92+ Bootstring, which allows strings composed from a small set of "basic"
93+ code points to uniquely represent any string of code points drawn
94+ from a larger set. Punycode is Bootstring with particular parameter
95+ values appropriate for IDNA.
96+97+1.1 Features
98+99+ Bootstring has been designed to have the following features:
100+101+ * Completeness: Every extended string (sequence of arbitrary code
102+ points) can be represented by a basic string (sequence of basic
103+ code points). Restrictions on what strings are allowed, and on
104+ length, can be imposed by higher layers.
105+106+ * Uniqueness: There is at most one basic string that represents a
107+ given extended string.
108+109+ * Reversibility: Any extended string mapped to a basic string can
110+ be recovered from that basic string.
111+112+113+114+Costello Standards Track [Page 2]
115+116+RFC 3492 IDNA Punycode March 2003
117+118+119+ * Efficient encoding: The ratio of basic string length to extended
120+ string length is small. This is important in the context of
121+ domain names because RFC 1034 [RFC1034] restricts the length of a
122+ domain label to 63 characters.
123+124+ * Simplicity: The encoding and decoding algorithms are reasonably
125+ simple to implement. The goals of efficiency and simplicity are
126+ at odds; Bootstring aims at a good balance between them.
127+128+ * Readability: Basic code points appearing in the extended string
129+ are represented as themselves in the basic string (although the
130+ main purpose is to improve efficiency, not readability).
131+132+ Punycode can also support an additional feature that is not used by
133+ the ToASCII and ToUnicode operations of [IDNA]. When extended
134+ strings are case-folded prior to encoding, the basic string can use
135+ mixed case to tell how to convert the folded string into a mixed-case
136+ string. See appendix A "Mixed-case annotation".
137+138+1.2 Interaction of protocol parts
139+140+ Punycode is used by the IDNA protocol [IDNA] for converting domain
141+ labels into ASCII; it is not designed for any other purpose. It is
142+ explicitly not designed for processing arbitrary free text.
143+144+2. Terminology
145+146+ The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT",
147+ "SHOULD", "SHOULD NOT", "RECOMMENDED", "MAY", and "OPTIONAL" in this
148+ document are to be interpreted as described in BCP 14, RFC 2119
149+ [RFC2119].
150+151+ A code point is an integral value associated with a character in a
152+ coded character set.
153+154+ As in the Unicode Standard [UNICODE], Unicode code points are denoted
155+ by "U+" followed by four to six hexadecimal digits, while a range of
156+ code points is denoted by two hexadecimal numbers separated by "..",
157+ with no prefixes.
158+159+ The operators div and mod perform integer division; (x div y) is the
160+ quotient of x divided by y, discarding the remainder, and (x mod y)
161+ is the remainder, so (x div y) * y + (x mod y) == x. Bootstring uses
162+ these operators only with nonnegative operands, so the quotient and
163+ remainder are always nonnegative.
164+165+ The break statement jumps out of the innermost loop (as in C).
166+167+168+169+170+Costello Standards Track [Page 3]
171+172+RFC 3492 IDNA Punycode March 2003
173+174+175+ An overflow is an attempt to compute a value that exceeds the maximum
176+ value of an integer variable.
177+178+3. Bootstring description
179+180+ Bootstring represents an arbitrary sequence of code points (the
181+ "extended string") as a sequence of basic code points (the "basic
182+ string"). This section describes the representation. Section 6
183+ "Bootstring algorithms" presents the algorithms as pseudocode.
184+ Sections 7.1 "Decoding traces" and 7.2 "Encoding traces" trace the
185+ algorithms for sample inputs.
186+187+ The following sections describe the four techniques used in
188+ Bootstring. "Basic code point segregation" is a very simple and
189+ efficient encoding for basic code points occurring in the extended
190+ string: they are simply copied all at once. "Insertion unsort
191+ coding" encodes the non-basic code points as deltas, and processes
192+ the code points in numerical order rather than in order of
193+ appearance, which typically results in smaller deltas. The deltas
194+ are represented as "generalized variable-length integers", which use
195+ basic code points to represent nonnegative integers. The parameters
196+ of this integer representation are dynamically adjusted using "bias
197+ adaptation", to improve efficiency when consecutive deltas have
198+ similar magnitudes.
199+200+3.1 Basic code point segregation
201+202+ All basic code points appearing in the extended string are
203+ represented literally at the beginning of the basic string, in their
204+ original order, followed by a delimiter if (and only if) the number
205+ of basic code points is nonzero. The delimiter is a particular basic
206+ code point, which never appears in the remainder of the basic string.
207+ The decoder can therefore find the end of the literal portion (if
208+ there is one) by scanning for the last delimiter.
209+210+3.2 Insertion unsort coding
211+212+ The remainder of the basic string (after the last delimiter if there
213+ is one) represents a sequence of nonnegative integral deltas as
214+ generalized variable-length integers, described in section 3.3. The
215+ meaning of the deltas is best understood in terms of the decoder.
216+217+ The decoder builds the extended string incrementally. Initially, the
218+ extended string is a copy of the literal portion of the basic string
219+ (excluding the last delimiter). The decoder inserts non-basic code
220+ points, one for each delta, into the extended string, ultimately
221+ arriving at the final decoded string.
222+223+224+225+226+Costello Standards Track [Page 4]
227+228+RFC 3492 IDNA Punycode March 2003
229+230+231+ At the heart of this process is a state machine with two state
232+ variables: an index i and a counter n. The index i refers to a
233+ position in the extended string; it ranges from 0 (the first
234+ position) to the current length of the extended string (which refers
235+ to a potential position beyond the current end). If the current
236+ state is <n,i>, the next state is <n,i+1> if i is less than the
237+ length of the extended string, or <n+1,0> if i equals the length of
238+ the extended string. In other words, each state change causes i to
239+ increment, wrapping around to zero if necessary, and n counts the
240+ number of wrap-arounds.
241+242+ Notice that the state always advances monotonically (there is no way
243+ for the decoder to return to an earlier state). At each state, an
244+ insertion is either performed or not performed. At most one
245+ insertion is performed in a given state. An insertion inserts the
246+ value of n at position i in the extended string. The deltas are a
247+ run-length encoding of this sequence of events: they are the lengths
248+ of the runs of non-insertion states preceeding the insertion states.
249+ Hence, for each delta, the decoder performs delta state changes, then
250+ an insertion, and then one more state change. (An implementation
251+ need not perform each state change individually, but can instead use
252+ division and remainder calculations to compute the next insertion
253+ state directly.) It is an error if the inserted code point is a
254+ basic code point (because basic code points were supposed to be
255+ segregated as described in section 3.1).
256+257+ The encoder's main task is to derive the sequence of deltas that will
258+ cause the decoder to construct the desired string. It can do this by
259+ repeatedly scanning the extended string for the next code point that
260+ the decoder would need to insert, and counting the number of state
261+ changes the decoder would need to perform, mindful of the fact that
262+ the decoder's extended string will include only those code points
263+ that have already been inserted. Section 6.3 "Encoding procedure"
264+ gives a precise algorithm.
265+266+3.3 Generalized variable-length integers
267+268+ In a conventional integer representation the base is the number of
269+ distinct symbols for digits, whose values are 0 through base-1. Let
270+ digit_0 denote the least significant digit, digit_1 the next least
271+ significant, and so on. The value represented is the sum over j of
272+ digit_j * w(j), where w(j) = base^j is the weight (scale factor) for
273+ position j. For example, in the base 8 integer 437, the digits are
274+ 7, 3, and 4, and the weights are 1, 8, and 64, so the value is 7 +
275+ 3*8 + 4*64 = 287. This representation has two disadvantages: First,
276+ there are multiple encodings of each value (because there can be
277+ extra zeros in the most significant positions), which is inconvenient
278+279+280+281+282+Costello Standards Track [Page 5]
283+284+RFC 3492 IDNA Punycode March 2003
285+286+287+ when unique encodings are needed. Second, the integer is not self-
288+ delimiting, so if multiple integers are concatenated the boundaries
289+ between them are lost.
290+291+ The generalized variable-length representation solves these two
292+ problems. The digit values are still 0 through base-1, but now the
293+ integer is self-delimiting by means of thresholds t(j), each of which
294+ is in the range 0 through base-1. Exactly one digit, the most
295+ significant, satisfies digit_j < t(j). Therefore, if several
296+ integers are concatenated, it is easy to separate them, starting with
297+ the first if they are little-endian (least significant digit first),
298+ or starting with the last if they are big-endian (most significant
299+ digit first). As before, the value is the sum over j of digit_j *
300+ w(j), but the weights are different:
301+302+ w(0) = 1
303+ w(j) = w(j-1) * (base - t(j-1)) for j > 0
304+305+ For example, consider the little-endian sequence of base 8 digits
306+ 734251... Suppose the thresholds are 2, 3, 5, 5, 5, 5... This
307+ implies that the weights are 1, 1*(8-2) = 6, 6*(8-3) = 30, 30*(8-5) =
308+ 90, 90*(8-5) = 270, and so on. 7 is not less than 2, and 3 is not
309+ less than 3, but 4 is less than 5, so 4 is the last digit. The value
310+ of 734 is 7*1 + 3*6 + 4*30 = 145. The next integer is 251, with
311+ value 2*1 + 5*6 + 1*30 = 62. Decoding this representation is very
312+ similar to decoding a conventional integer: Start with a current
313+ value of N = 0 and a weight w = 1. Fetch the next digit d and
314+ increase N by d * w. If d is less than the current threshold (t)
315+ then stop, otherwise increase w by a factor of (base - t), update t
316+ for the next position, and repeat.
317+318+ Encoding this representation is similar to encoding a conventional
319+ integer: If N < t then output one digit for N and stop, otherwise
320+ output the digit for t + ((N - t) mod (base - t)), then replace N
321+ with (N - t) div (base - t), update t for the next position, and
322+ repeat.
323+324+ For any particular set of values of t(j), there is exactly one
325+ generalized variable-length representation of each nonnegative
326+ integral value.
327+328+ Bootstring uses little-endian ordering so that the deltas can be
329+ separated starting with the first. The t(j) values are defined in
330+ terms of the constants base, tmin, and tmax, and a state variable
331+ called bias:
332+333+ t(j) = base * (j + 1) - bias,
334+ clamped to the range tmin through tmax
335+336+337+338+Costello Standards Track [Page 6]
339+340+RFC 3492 IDNA Punycode March 2003
341+342+343+ The clamping means that if the formula yields a value less than tmin
344+ or greater than tmax, then t(j) = tmin or tmax, respectively. (In
345+ the pseudocode in section 6 "Bootstring algorithms", the expression
346+ base * (j + 1) is denoted by k for performance reasons.) These t(j)
347+ values cause the representation to favor integers within a particular
348+ range determined by the bias.
349+350+3.4 Bias adaptation
351+352+ After each delta is encoded or decoded, bias is set for the next
353+ delta as follows:
354+355+ 1. Delta is scaled in order to avoid overflow in the next step:
356+357+ let delta = delta div 2
358+359+ But when this is the very first delta, the divisor is not 2, but
360+ instead a constant called damp. This compensates for the fact
361+ that the second delta is usually much smaller than the first.
362+363+ 2. Delta is increased to compensate for the fact that the next delta
364+ will be inserting into a longer string:
365+366+ let delta = delta + (delta div numpoints)
367+368+ numpoints is the total number of code points encoded/decoded so
369+ far (including the one corresponding to this delta itself, and
370+ including the basic code points).
371+372+ 3. Delta is repeatedly divided until it falls within a threshold, to
373+ predict the minimum number of digits needed to represent the next
374+ delta:
375+376+ while delta > ((base - tmin) * tmax) div 2
377+ do let delta = delta div (base - tmin)
378+379+ 4. The bias is set:
380+381+ let bias =
382+ (base * the number of divisions performed in step 3) +
383+ (((base - tmin + 1) * delta) div (delta + skew))
384+385+ The motivation for this procedure is that the current delta
386+ provides a hint about the likely size of the next delta, and so
387+ t(j) is set to tmax for the more significant digits starting with
388+ the one expected to be last, tmin for the less significant digits
389+ up through the one expected to be third-last, and somewhere
390+ between tmin and tmax for the digit expected to be second-last
391+392+393+394+Costello Standards Track [Page 7]
395+396+RFC 3492 IDNA Punycode March 2003
397+398+399+ (balancing the hope of the expected-last digit being unnecessary
400+ against the danger of it being insufficient).
401+402+4. Bootstring parameters
403+404+ Given a set of basic code points, one needs to be designated as the
405+ delimiter. The base cannot be greater than the number of
406+ distinguishable basic code points remaining. The digit-values in the
407+ range 0 through base-1 need to be associated with distinct non-
408+ delimiter basic code points. In some cases multiple code points need
409+ to have the same digit-value; for example, uppercase and lowercase
410+ versions of the same letter need to be equivalent if basic strings
411+ are case-insensitive.
412+413+ The initial value of n cannot be greater than the minimum non-basic
414+ code point that could appear in extended strings.
415+416+ The remaining five parameters (tmin, tmax, skew, damp, and the
417+ initial value of bias) need to satisfy the following constraints:
418+419+ 0 <= tmin <= tmax <= base-1
420+ skew >= 1
421+ damp >= 2
422+ initial_bias mod base <= base - tmin
423+424+ Provided the constraints are satisfied, these five parameters affect
425+ efficiency but not correctness. They are best chosen empirically.
426+427+ If support for mixed-case annotation is desired (see appendix A),
428+ make sure that the code points corresponding to 0 through tmax-1 all
429+ have both uppercase and lowercase forms.
430+431+5. Parameter values for Punycode
432+433+ Punycode uses the following Bootstring parameter values:
434+435+ base = 36
436+ tmin = 1
437+ tmax = 26
438+ skew = 38
439+ damp = 700
440+ initial_bias = 72
441+ initial_n = 128 = 0x80
442+443+ Although the only restriction Punycode imposes on the input integers
444+ is that they be nonnegative, these parameters are especially designed
445+ to work well with Unicode [UNICODE] code points, which are integers
446+ in the range 0..10FFFF (but not D800..DFFF, which are reserved for
447+448+449+450+Costello Standards Track [Page 8]
451+452+RFC 3492 IDNA Punycode March 2003
453+454+455+ use by the UTF-16 encoding of Unicode). The basic code points are
456+ the ASCII [ASCII] code points (0..7F), of which U+002D (-) is the
457+ delimiter, and some of the others have digit-values as follows:
458+459+ code points digit-values
460+ ------------ ----------------------
461+ 41..5A (A-Z) = 0 to 25, respectively
462+ 61..7A (a-z) = 0 to 25, respectively
463+ 30..39 (0-9) = 26 to 35, respectively
464+465+ Using hyphen-minus as the delimiter implies that the encoded string
466+ can end with a hyphen-minus only if the Unicode string consists
467+ entirely of basic code points, but IDNA forbids such strings from
468+ being encoded. The encoded string can begin with a hyphen-minus, but
469+ IDNA prepends a prefix. Therefore IDNA using Punycode conforms to
470+ the RFC 952 rule that host name labels neither begin nor end with a
471+ hyphen-minus [RFC952].
472+473+ A decoder MUST recognize the letters in both uppercase and lowercase
474+ forms (including mixtures of both forms). An encoder SHOULD output
475+ only uppercase forms or only lowercase forms, unless it uses mixed-
476+ case annotation (see appendix A).
477+478+ Presumably most users will not manually write or type encoded strings
479+ (as opposed to cutting and pasting them), but those who do will need
480+ to be alert to the potential visual ambiguity between the following
481+ sets of characters:
482+483+ G 6
484+ I l 1
485+ O 0
486+ S 5
487+ U V
488+ Z 2
489+490+ Such ambiguities are usually resolved by context, but in a Punycode
491+ encoded string there is no context apparent to humans.
492+493+6. Bootstring algorithms
494+495+ Some parts of the pseudocode can be omitted if the parameters satisfy
496+ certain conditions (for which Punycode qualifies). These parts are
497+ enclosed in {braces}, and notes immediately following the pseudocode
498+ explain the conditions under which they can be omitted.
499+500+501+502+503+504+505+506+Costello Standards Track [Page 9]
507+508+RFC 3492 IDNA Punycode March 2003
509+510+511+ Formally, code points are integers, and hence the pseudocode assumes
512+ that arithmetic operations can be performed directly on code points.
513+ In some programming languages, explicit conversion between code
514+ points and integers might be necessary.
515+516+6.1 Bias adaptation function
517+518+ function adapt(delta,numpoints,firsttime):
519+ if firsttime then let delta = delta div damp
520+ else let delta = delta div 2
521+ let delta = delta + (delta div numpoints)
522+ let k = 0
523+ while delta > ((base - tmin) * tmax) div 2 do begin
524+ let delta = delta div (base - tmin)
525+ let k = k + base
526+ end
527+ return k + (((base - tmin + 1) * delta) div (delta + skew))
528+529+ It does not matter whether the modifications to delta and k inside
530+ adapt() affect variables of the same name inside the
531+ encoding/decoding procedures, because after calling adapt() the
532+ caller does not read those variables before overwriting them.
533+534+535+536+537+538+539+540+541+542+543+544+545+546+547+548+549+550+551+552+553+554+555+556+557+558+559+560+561+562+Costello Standards Track [Page 10]
563+564+RFC 3492 IDNA Punycode March 2003
565+566+567+6.2 Decoding procedure
568+569+ let n = initial_n
570+ let i = 0
571+ let bias = initial_bias
572+ let output = an empty string indexed from 0
573+ consume all code points before the last delimiter (if there is one)
574+ and copy them to output, fail on any non-basic code point
575+ if more than zero code points were consumed then consume one more
576+ (which will be the last delimiter)
577+ while the input is not exhausted do begin
578+ let oldi = i
579+ let w = 1
580+ for k = base to infinity in steps of base do begin
581+ consume a code point, or fail if there was none to consume
582+ let digit = the code point's digit-value, fail if it has none
583+ let i = i + digit * w, fail on overflow
584+ let t = tmin if k <= bias {+ tmin}, or
585+ tmax if k >= bias + tmax, or k - bias otherwise
586+ if digit < t then break
587+ let w = w * (base - t), fail on overflow
588+ end
589+ let bias = adapt(i - oldi, length(output) + 1, test oldi is 0?)
590+ let n = n + i div (length(output) + 1), fail on overflow
591+ let i = i mod (length(output) + 1)
592+ {if n is a basic code point then fail}
593+ insert n into output at position i
594+ increment i
595+ end
596+597+ The full statement enclosed in braces (checking whether n is a basic
598+ code point) can be omitted if initial_n exceeds all basic code points
599+ (which is true for Punycode), because n is never less than initial_n.
600+601+ In the assignment of t, where t is clamped to the range tmin through
602+ tmax, "+ tmin" can always be omitted. This makes the clamping
603+ calculation incorrect when bias < k < bias + tmin, but that cannot
604+ happen because of the way bias is computed and because of the
605+ constraints on the parameters.
606+607+ Because the decoder state can only advance monotonically, and there
608+ is only one representation of any delta, there is therefore only one
609+ encoded string that can represent a given sequence of integers. The
610+ only error conditions are invalid code points, unexpected end-of-
611+ input, overflow, and basic code points encoded using deltas instead
612+ of appearing literally. If the decoder fails on these errors as
613+ shown above, then it cannot produce the same output for two distinct
614+ inputs. Without this property it would have been necessary to re-
615+616+617+618+Costello Standards Track [Page 11]
619+620+RFC 3492 IDNA Punycode March 2003
621+622+623+ encode the output and verify that it matches the input in order to
624+ guarantee the uniqueness of the encoding.
625+626+6.3 Encoding procedure
627+628+ let n = initial_n
629+ let delta = 0
630+ let bias = initial_bias
631+ let h = b = the number of basic code points in the input
632+ copy them to the output in order, followed by a delimiter if b > 0
633+ {if the input contains a non-basic code point < n then fail}
634+ while h < length(input) do begin
635+ let m = the minimum {non-basic} code point >= n in the input
636+ let delta = delta + (m - n) * (h + 1), fail on overflow
637+ let n = m
638+ for each code point c in the input (in order) do begin
639+ if c < n {or c is basic} then increment delta, fail on overflow
640+ if c == n then begin
641+ let q = delta
642+ for k = base to infinity in steps of base do begin
643+ let t = tmin if k <= bias {+ tmin}, or
644+ tmax if k >= bias + tmax, or k - bias otherwise
645+ if q < t then break
646+ output the code point for digit t + ((q - t) mod (base - t))
647+ let q = (q - t) div (base - t)
648+ end
649+ output the code point for digit q
650+ let bias = adapt(delta, h + 1, test h equals b?)
651+ let delta = 0
652+ increment h
653+ end
654+ end
655+ increment delta and n
656+ end
657+658+ The full statement enclosed in braces (checking whether the input
659+ contains a non-basic code point less than n) can be omitted if all
660+ code points less than initial_n are basic code points (which is true
661+ for Punycode if code points are unsigned).
662+663+ The brace-enclosed conditions "non-basic" and "or c is basic" can be
664+ omitted if initial_n exceeds all basic code points (which is true for
665+ Punycode), because the code point being tested is never less than
666+ initial_n.
667+668+ In the assignment of t, where t is clamped to the range tmin through
669+ tmax, "+ tmin" can always be omitted. This makes the clamping
670+ calculation incorrect when bias < k < bias + tmin, but that cannot
671+672+673+674+Costello Standards Track [Page 12]
675+676+RFC 3492 IDNA Punycode March 2003
677+678+679+ happen because of the way bias is computed and because of the
680+ constraints on the parameters.
681+682+ The checks for overflow are necessary to avoid producing invalid
683+ output when the input contains very large values or is very long.
684+685+ The increment of delta at the bottom of the outer loop cannot
686+ overflow because delta < length(input) before the increment, and
687+ length(input) is already assumed to be representable. The increment
688+ of n could overflow, but only if h == length(input), in which case
689+ the procedure is finished anyway.
690+691+6.4 Overflow handling
692+693+ For IDNA, 26-bit unsigned integers are sufficient to handle all valid
694+ IDNA labels without overflow, because any string that needed a 27-bit
695+ delta would have to exceed either the code point limit (0..10FFFF) or
696+ the label length limit (63 characters). However, overflow handling
697+ is necessary because the inputs are not necessarily valid IDNA
698+ labels.
699+700+ If the programming language does not provide overflow detection, the
701+ following technique can be used. Suppose A, B, and C are
702+ representable nonnegative integers and C is nonzero. Then A + B
703+ overflows if and only if B > maxint - A, and A + (B * C) overflows if
704+ and only if B > (maxint - A) div C, where maxint is the greatest
705+ integer for which maxint + 1 cannot be represented. Refer to
706+ appendix C "Punycode sample implementation" for demonstrations of
707+ this technique in the C language.
708+709+ The decoding and encoding algorithms shown in sections 6.2 and 6.3
710+ handle overflow by detecting it whenever it happens. Another
711+ approach is to enforce limits on the inputs that prevent overflow
712+ from happening. For example, if the encoder were to verify that no
713+ input code points exceed M and that the input length does not exceed
714+ L, then no delta could ever exceed (M - initial_n) * (L + 1), and
715+ hence no overflow could occur if integer variables were capable of
716+ representing values that large. This prevention approach would
717+ impose more restrictions on the input than the detection approach
718+ does, but might be considered simpler in some programming languages.
719+720+ In theory, the decoder could use an analogous approach, limiting the
721+ number of digits in a variable-length integer (that is, limiting the
722+ number of iterations in the innermost loop). However, the number of
723+ digits that suffice to represent a given delta can sometimes
724+ represent much larger deltas (because of the adaptation), and hence
725+ this approach would probably need integers wider than 32 bits.
726+727+728+729+730+Costello Standards Track [Page 13]
731+732+RFC 3492 IDNA Punycode March 2003
733+734+735+ Yet another approach for the decoder is to allow overflow to occur,
736+ but to check the final output string by re-encoding it and comparing
737+ to the decoder input. If and only if they do not match (using a
738+ case-insensitive ASCII comparison) overflow has occurred. This
739+ delayed-detection approach would not impose any more restrictions on
740+ the input than the immediate-detection approach does, and might be
741+ considered simpler in some programming languages.
742+743+ In fact, if the decoder is used only inside the IDNA ToUnicode
744+ operation [IDNA], then it need not check for overflow at all, because
745+ ToUnicode performs a higher level re-encoding and comparison, and a
746+ mismatch has the same consequence as if the Punycode decoder had
747+ failed.
748+749+7. Punycode examples
750+751+7.1 Sample strings
752+753+ In the Punycode encodings below, the ACE prefix is not shown.
754+ Backslashes show where line breaks have been inserted in strings too
755+ long for one line.
756+757+ The first several examples are all translations of the sentence "Why
758+ can't they just speak in <language>?" (courtesy of Michael Kaplan's
759+ "provincial" page [PROVINCIAL]). Word breaks and punctuation have
760+ been removed, as is often done in domain names.
761+762+ (A) Arabic (Egyptian):
763+ u+0644 u+064A u+0647 u+0645 u+0627 u+0628 u+062A u+0643 u+0644
764+ u+0645 u+0648 u+0634 u+0639 u+0631 u+0628 u+064A u+061F
765+ Punycode: egbpdaj6bu4bxfgehfvwxn
766+767+ (B) Chinese (simplified):
768+ u+4ED6 u+4EEC u+4E3A u+4EC0 u+4E48 u+4E0D u+8BF4 u+4E2D u+6587
769+ Punycode: ihqwcrb4cv8a8dqg056pqjye
770+771+ (C) Chinese (traditional):
772+ u+4ED6 u+5011 u+7232 u+4EC0 u+9EBD u+4E0D u+8AAA u+4E2D u+6587
773+ Punycode: ihqwctvzc91f659drss3x8bo0yb
774+775+ (D) Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
776+ U+0050 u+0072 u+006F u+010D u+0070 u+0072 u+006F u+0073 u+0074
777+ u+011B u+006E u+0065 u+006D u+006C u+0075 u+0076 u+00ED u+010D
778+ u+0065 u+0073 u+006B u+0079
779+ Punycode: Proprostnemluvesky-uyb24dma41a
780+781+782+783+784+785+786+Costello Standards Track [Page 14]
787+788+RFC 3492 IDNA Punycode March 2003
789+790+791+ (E) Hebrew:
792+ u+05DC u+05DE u+05D4 u+05D4 u+05DD u+05E4 u+05E9 u+05D5 u+05D8
793+ u+05DC u+05D0 u+05DE u+05D3 u+05D1 u+05E8 u+05D9 u+05DD u+05E2
794+ u+05D1 u+05E8 u+05D9 u+05EA
795+ Punycode: 4dbcagdahymbxekheh6e0a7fei0b
796+797+ (F) Hindi (Devanagari):
798+ u+092F u+0939 u+0932 u+094B u+0917 u+0939 u+093F u+0928 u+094D
799+ u+0926 u+0940 u+0915 u+094D u+092F u+094B u+0902 u+0928 u+0939
800+ u+0940 u+0902 u+092C u+094B u+0932 u+0938 u+0915 u+0924 u+0947
801+ u+0939 u+0948 u+0902
802+ Punycode: i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd
803+804+ (G) Japanese (kanji and hiragana):
805+ u+306A u+305C u+307F u+3093 u+306A u+65E5 u+672C u+8A9E u+3092
806+ u+8A71 u+3057 u+3066 u+304F u+308C u+306A u+3044 u+306E u+304B
807+ Punycode: n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa
808+809+ (H) Korean (Hangul syllables):
810+ u+C138 u+ACC4 u+C758 u+BAA8 u+B4E0 u+C0AC u+B78C u+B4E4 u+C774
811+ u+D55C u+AD6D u+C5B4 u+B97C u+C774 u+D574 u+D55C u+B2E4 u+BA74
812+ u+C5BC u+B9C8 u+B098 u+C88B u+C744 u+AE4C
813+ Punycode: 989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j\
814+ psd879ccm6fea98c
815+816+ (I) Russian (Cyrillic):
817+ U+043F u+043E u+0447 u+0435 u+043C u+0443 u+0436 u+0435 u+043E
818+ u+043D u+0438 u+043D u+0435 u+0433 u+043E u+0432 u+043E u+0440
819+ u+044F u+0442 u+043F u+043E u+0440 u+0443 u+0441 u+0441 u+043A
820+ u+0438
821+ Punycode: b1abfaaepdrnnbgefbaDotcwatmq2g4l
822+823+ (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
824+ U+0050 u+006F u+0072 u+0071 u+0075 u+00E9 u+006E u+006F u+0070
825+ u+0075 u+0065 u+0064 u+0065 u+006E u+0073 u+0069 u+006D u+0070
826+ u+006C u+0065 u+006D u+0065 u+006E u+0074 u+0065 u+0068 u+0061
827+ u+0062 u+006C u+0061 u+0072 u+0065 u+006E U+0045 u+0073 u+0070
828+ u+0061 u+00F1 u+006F u+006C
829+ Punycode: PorqunopuedensimplementehablarenEspaol-fmd56a
830+831+ (K) Vietnamese:
832+ T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch\
833+ <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
834+ U+0054 u+1EA1 u+0069 u+0073 u+0061 u+006F u+0068 u+1ECD u+006B
835+ u+0068 u+00F4 u+006E u+0067 u+0074 u+0068 u+1EC3 u+0063 u+0068
836+ u+1EC9 u+006E u+00F3 u+0069 u+0074 u+0069 u+1EBF u+006E u+0067
837+ U+0056 u+0069 u+1EC7 u+0074
838+ Punycode: TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g
839+840+841+842+Costello Standards Track [Page 15]
843+844+RFC 3492 IDNA Punycode March 2003
845+846+847+ The next several examples are all names of Japanese music artists,
848+ song titles, and TV programs, just because the author happens to have
849+ them handy (but Japanese is useful for providing examples of single-
850+ row text, two-row text, ideographic text, and various mixtures
851+ thereof).
852+853+ (L) 3<nen>B<gumi><kinpachi><sensei>
854+ u+0033 u+5E74 U+0042 u+7D44 u+91D1 u+516B u+5148 u+751F
855+ Punycode: 3B-ww4c5e180e575a65lsy2b
856+857+ (M) <amuro><namie>-with-SUPER-MONKEYS
858+ u+5B89 u+5BA4 u+5948 u+7F8E u+6075 u+002D u+0077 u+0069 u+0074
859+ u+0068 u+002D U+0053 U+0055 U+0050 U+0045 U+0052 u+002D U+004D
860+ U+004F U+004E U+004B U+0045 U+0059 U+0053
861+ Punycode: -with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n
862+863+ (N) Hello-Another-Way-<sorezore><no><basho>
864+ U+0048 u+0065 u+006C u+006C u+006F u+002D U+0041 u+006E u+006F
865+ u+0074 u+0068 u+0065 u+0072 u+002D U+0057 u+0061 u+0079 u+002D
866+ u+305D u+308C u+305E u+308C u+306E u+5834 u+6240
867+ Punycode: Hello-Another-Way--fc4qua05auwb3674vfr0b
868+869+ (O) <hitotsu><yane><no><shita>2
870+ u+3072 u+3068 u+3064 u+5C4B u+6839 u+306E u+4E0B u+0032
871+ Punycode: 2-u9tlzr9756bt3uc0v
872+873+ (P) Maji<de>Koi<suru>5<byou><mae>
874+ U+004D u+0061 u+006A u+0069 u+3067 U+004B u+006F u+0069 u+3059
875+ u+308B u+0035 u+79D2 u+524D
876+ Punycode: MajiKoi5-783gue6qz075azm5e
877+878+ (Q) <pafii>de<runba>
879+ u+30D1 u+30D5 u+30A3 u+30FC u+0064 u+0065 u+30EB u+30F3 u+30D0
880+ Punycode: de-jg4avhby1noc0d
881+882+ (R) <sono><supiido><de>
883+ u+305D u+306E u+30B9 u+30D4 u+30FC u+30C9 u+3067
884+ Punycode: d9juau41awczczp
885+886+ The last example is an ASCII string that breaks the existing rules
887+ for host name labels. (It is not a realistic example for IDNA,
888+ because IDNA never encodes pure ASCII labels.)
889+890+ (S) -> $1.00 <-
891+ u+002D u+003E u+0020 u+0024 u+0031 u+002E u+0030 u+0030 u+0020
892+ u+003C u+002D
893+ Punycode: -> $1.00 <--
894+895+896+897+898+Costello Standards Track [Page 16]
899+900+RFC 3492 IDNA Punycode March 2003
901+902+903+7.2 Decoding traces
904+905+ In the following traces, the evolving state of the decoder is shown
906+ as a sequence of hexadecimal values, representing the code points in
907+ the extended string. An asterisk appears just after the most
908+ recently inserted code point, indicating both n (the value preceeding
909+ the asterisk) and i (the position of the value just after the
910+ asterisk). Other numerical values are decimal.
911+912+ Decoding trace of example B from section 7.1:
913+914+ n is 128, i is 0, bias is 72
915+ input is "ihqwcrb4cv8a8dqg056pqjye"
916+ there is no delimiter, so extended string starts empty
917+ delta "ihq" decodes to 19853
918+ bias becomes 21
919+ 4E0D *
920+ delta "wc" decodes to 64
921+ bias becomes 20
922+ 4E0D 4E2D *
923+ delta "rb" decodes to 37
924+ bias becomes 13
925+ 4E3A * 4E0D 4E2D
926+ delta "4c" decodes to 56
927+ bias becomes 17
928+ 4E3A 4E48 * 4E0D 4E2D
929+ delta "v8a" decodes to 599
930+ bias becomes 32
931+ 4E3A 4EC0 * 4E48 4E0D 4E2D
932+ delta "8d" decodes to 130
933+ bias becomes 23
934+ 4ED6 * 4E3A 4EC0 4E48 4E0D 4E2D
935+ delta "qg" decodes to 154
936+ bias becomes 25
937+ 4ED6 4EEC * 4E3A 4EC0 4E48 4E0D 4E2D
938+ delta "056p" decodes to 46301
939+ bias becomes 84
940+ 4ED6 4EEC 4E3A 4EC0 4E48 4E0D 4E2D 6587 *
941+ delta "qjye" decodes to 88531
942+ bias becomes 90
943+ 4ED6 4EEC 4E3A 4EC0 4E48 4E0D 8BF4 * 4E2D 6587
944+945+946+947+948+949+950+951+952+953+954+Costello Standards Track [Page 17]
955+956+RFC 3492 IDNA Punycode March 2003
957+958+959+ Decoding trace of example L from section 7.1:
960+961+ n is 128, i is 0, bias is 72
962+ input is "3B-ww4c5e180e575a65lsy2b"
963+ literal portion is "3B-", so extended string starts as:
964+ 0033 0042
965+ delta "ww4c" decodes to 62042
966+ bias becomes 27
967+ 0033 0042 5148 *
968+ delta "5e" decodes to 139
969+ bias becomes 24
970+ 0033 0042 516B * 5148
971+ delta "180e" decodes to 16683
972+ bias becomes 67
973+ 0033 5E74 * 0042 516B 5148
974+ delta "575a" decodes to 34821
975+ bias becomes 82
976+ 0033 5E74 0042 516B 5148 751F *
977+ delta "65l" decodes to 14592
978+ bias becomes 67
979+ 0033 5E74 0042 7D44 * 516B 5148 751F
980+ delta "sy2b" decodes to 42088
981+ bias becomes 84
982+ 0033 5E74 0042 7D44 91D1 * 516B 5148 751F
983+984+985+986+987+988+989+990+991+992+993+994+995+996+997+998+999+1000+1001+1002+1003+1004+1005+1006+1007+1008+1009+1010+Costello Standards Track [Page 18]
1011+1012+RFC 3492 IDNA Punycode March 2003
1013+1014+1015+7.3 Encoding traces
1016+1017+ In the following traces, code point values are hexadecimal, while
1018+ other numerical values are decimal.
1019+1020+ Encoding trace of example B from section 7.1:
1021+1022+ bias is 72
1023+ input is:
1024+ 4ED6 4EEC 4E3A 4EC0 4E48 4E0D 8BF4 4E2D 6587
1025+ there are no basic code points, so no literal portion
1026+ next code point to insert is 4E0D
1027+ needed delta is 19853, encodes as "ihq"
1028+ bias becomes 21
1029+ next code point to insert is 4E2D
1030+ needed delta is 64, encodes as "wc"
1031+ bias becomes 20
1032+ next code point to insert is 4E3A
1033+ needed delta is 37, encodes as "rb"
1034+ bias becomes 13
1035+ next code point to insert is 4E48
1036+ needed delta is 56, encodes as "4c"
1037+ bias becomes 17
1038+ next code point to insert is 4EC0
1039+ needed delta is 599, encodes as "v8a"
1040+ bias becomes 32
1041+ next code point to insert is 4ED6
1042+ needed delta is 130, encodes as "8d"
1043+ bias becomes 23
1044+ next code point to insert is 4EEC
1045+ needed delta is 154, encodes as "qg"
1046+ bias becomes 25
1047+ next code point to insert is 6587
1048+ needed delta is 46301, encodes as "056p"
1049+ bias becomes 84
1050+ next code point to insert is 8BF4
1051+ needed delta is 88531, encodes as "qjye"
1052+ bias becomes 90
1053+ output is "ihqwcrb4cv8a8dqg056pqjye"
1054+1055+1056+1057+1058+1059+1060+1061+1062+1063+1064+1065+1066+Costello Standards Track [Page 19]
1067+1068+RFC 3492 IDNA Punycode March 2003
1069+1070+1071+ Encoding trace of example L from section 7.1:
1072+1073+ bias is 72
1074+ input is:
1075+ 0033 5E74 0042 7D44 91D1 516B 5148 751F
1076+ basic code points (0033, 0042) are copied to literal portion: "3B-"
1077+ next code point to insert is 5148
1078+ needed delta is 62042, encodes as "ww4c"
1079+ bias becomes 27
1080+ next code point to insert is 516B
1081+ needed delta is 139, encodes as "5e"
1082+ bias becomes 24
1083+ next code point to insert is 5E74
1084+ needed delta is 16683, encodes as "180e"
1085+ bias becomes 67
1086+ next code point to insert is 751F
1087+ needed delta is 34821, encodes as "575a"
1088+ bias becomes 82
1089+ next code point to insert is 7D44
1090+ needed delta is 14592, encodes as "65l"
1091+ bias becomes 67
1092+ next code point to insert is 91D1
1093+ needed delta is 42088, encodes as "sy2b"
1094+ bias becomes 84
1095+ output is "3B-ww4c5e180e575a65lsy2b"
1096+1097+8. Security Considerations
1098+1099+ Users expect each domain name in DNS to be controlled by a single
1100+ authority. If a Unicode string intended for use as a domain label
1101+ could map to multiple ACE labels, then an internationalized domain
1102+ name could map to multiple ASCII domain names, each controlled by a
1103+ different authority, some of which could be spoofs that hijack
1104+ service requests intended for another. Therefore Punycode is
1105+ designed so that each Unicode string has a unique encoding.
1106+1107+ However, there can still be multiple Unicode representations of the
1108+ "same" text, for various definitions of "same". This problem is
1109+ addressed to some extent by the Unicode standard under the topic of
1110+ canonicalization, and this work is leveraged for domain names by
1111+ Nameprep [NAMEPREP].
1112+1113+1114+1115+1116+1117+1118+1119+1120+1121+1122+Costello Standards Track [Page 20]
1123+1124+RFC 3492 IDNA Punycode March 2003
1125+1126+1127+9. References
1128+1129+9.1 Normative References
1130+1131+ [RFC2119] Bradner, S., "Key words for use in RFCs to Indicate
1132+ Requirement Levels", BCP 14, RFC 2119, March 1997.
1133+1134+9.2 Informative References
1135+1136+ [RFC952] Harrenstien, K., Stahl, M. and E. Feinler, "DOD Internet
1137+ Host Table Specification", RFC 952, October 1985.
1138+1139+ [RFC1034] Mockapetris, P., "Domain Names - Concepts and
1140+ Facilities", STD 13, RFC 1034, November 1987.
1141+1142+ [IDNA] Faltstrom, P., Hoffman, P. and A. Costello,
1143+ "Internationalizing Domain Names in Applications
1144+ (IDNA)", RFC 3490, March 2003.
1145+1146+ [NAMEPREP] Hoffman, P. and M. Blanchet, "Nameprep: A Stringprep
1147+ Profile for Internationalized Domain Names (IDN)", RFC
1148+ 3491, March 2003.
1149+1150+ [ASCII] Cerf, V., "ASCII format for Network Interchange", RFC
1151+ 20, October 1969.
1152+1153+ [PROVINCIAL] Kaplan, M., "The 'anyone can be provincial!' page",
1154+ http://www.trigeminal.com/samples/provincial.html.
1155+1156+ [UNICODE] The Unicode Consortium, "The Unicode Standard",
1157+ http://www.unicode.org/unicode/standard/standard.html.
1158+1159+1160+1161+1162+1163+1164+1165+1166+1167+1168+1169+1170+1171+1172+1173+1174+1175+1176+1177+1178+Costello Standards Track [Page 21]
1179+1180+RFC 3492 IDNA Punycode March 2003
1181+1182+1183+A. Mixed-case annotation
1184+1185+ In order to use Punycode to represent case-insensitive strings,
1186+ higher layers need to case-fold the strings prior to Punycode
1187+ encoding. The encoded string can use mixed case as an annotation
1188+ telling how to convert the folded string into a mixed-case string for
1189+ display purposes. Note, however, that mixed-case annotation is not
1190+ used by the ToASCII and ToUnicode operations specified in [IDNA], and
1191+ therefore implementors of IDNA can disregard this appendix.
1192+1193+ Basic code points can use mixed case directly, because the decoder
1194+ copies them verbatim, leaving lowercase code points lowercase, and
1195+ leaving uppercase code points uppercase. Each non-basic code point
1196+ is represented by a delta, which is represented by a sequence of
1197+ basic code points, the last of which provides the annotation. If it
1198+ is uppercase, it is a suggestion to map the non-basic code point to
1199+ uppercase (if possible); if it is lowercase, it is a suggestion to
1200+ map the non-basic code point to lowercase (if possible).
1201+1202+ These annotations do not alter the code points returned by decoders;
1203+ the annotations are returned separately, for the caller to use or
1204+ ignore. Encoders can accept annotations in addition to code points,
1205+ but the annotations do not alter the output, except to influence the
1206+ uppercase/lowercase form of ASCII letters.
1207+1208+ Punycode encoders and decoders need not support these annotations,
1209+ and higher layers need not use them.
1210+1211+B. Disclaimer and license
1212+1213+ Regarding this entire document or any portion of it (including the
1214+ pseudocode and C code), the author makes no guarantees and is not
1215+ responsible for any damage resulting from its use. The author grants
1216+ irrevocable permission to anyone to use, modify, and distribute it in
1217+ any way that does not diminish the rights of anyone else to use,
1218+ modify, and distribute it, provided that redistributed derivative
1219+ works do not contain misleading author or version information.
1220+ Derivative works need not be licensed under similar terms.
1221+1222+1223+1224+1225+1226+1227+1228+1229+1230+1231+1232+1233+1234+Costello Standards Track [Page 22]
1235+1236+RFC 3492 IDNA Punycode March 2003
1237+1238+1239+C. Punycode sample implementation
1240+1241+/*
1242+punycode.c from RFC 3492
1243+http://www.nicemice.net/idn/
1244+Adam M. Costello
1245+http://www.nicemice.net/amc/
1246+1247+This is ANSI C code (C89) implementing Punycode (RFC 3492).
1248+1249+*/
1250+1251+1252+/************************************************************/
1253+/* Public interface (would normally go in its own .h file): */
1254+1255+#include <limits.h>
1256+1257+enum punycode_status {
1258+ punycode_success,
1259+ punycode_bad_input, /* Input is invalid. */
1260+ punycode_big_output, /* Output would exceed the space provided. */
1261+ punycode_overflow /* Input needs wider integers to process. */
1262+};
1263+1264+#if UINT_MAX >= (1 << 26) - 1
1265+typedef unsigned int punycode_uint;
1266+#else
1267+typedef unsigned long punycode_uint;
1268+#endif
1269+1270+enum punycode_status punycode_encode(
1271+ punycode_uint input_length,
1272+ const punycode_uint input[],
1273+ const unsigned char case_flags[],
1274+ punycode_uint *output_length,
1275+ char output[] );
1276+1277+ /* punycode_encode() converts Unicode to Punycode. The input */
1278+ /* is represented as an array of Unicode code points (not code */
1279+ /* units; surrogate pairs are not allowed), and the output */
1280+ /* will be represented as an array of ASCII code points. The */
1281+ /* output string is *not* null-terminated; it will contain */
1282+ /* zeros if and only if the input contains zeros. (Of course */
1283+ /* the caller can leave room for a terminator and add one if */
1284+ /* needed.) The input_length is the number of code points in */
1285+ /* the input. The output_length is an in/out argument: the */
1286+ /* caller passes in the maximum number of code points that it */
1287+1288+1289+1290+Costello Standards Track [Page 23]
1291+1292+RFC 3492 IDNA Punycode March 2003
1293+1294+1295+ /* can receive, and on successful return it will contain the */
1296+ /* number of code points actually output. The case_flags array */
1297+ /* holds input_length boolean values, where nonzero suggests that */
1298+ /* the corresponding Unicode character be forced to uppercase */
1299+ /* after being decoded (if possible), and zero suggests that */
1300+ /* it be forced to lowercase (if possible). ASCII code points */
1301+ /* are encoded literally, except that ASCII letters are forced */
1302+ /* to uppercase or lowercase according to the corresponding */
1303+ /* uppercase flags. If case_flags is a null pointer then ASCII */
1304+ /* letters are left as they are, and other code points are */
1305+ /* treated as if their uppercase flags were zero. The return */
1306+ /* value can be any of the punycode_status values defined above */
1307+ /* except punycode_bad_input; if not punycode_success, then */
1308+ /* output_size and output might contain garbage. */
1309+1310+enum punycode_status punycode_decode(
1311+ punycode_uint input_length,
1312+ const char input[],
1313+ punycode_uint *output_length,
1314+ punycode_uint output[],
1315+ unsigned char case_flags[] );
1316+1317+ /* punycode_decode() converts Punycode to Unicode. The input is */
1318+ /* represented as an array of ASCII code points, and the output */
1319+ /* will be represented as an array of Unicode code points. The */
1320+ /* input_length is the number of code points in the input. The */
1321+ /* output_length is an in/out argument: the caller passes in */
1322+ /* the maximum number of code points that it can receive, and */
1323+ /* on successful return it will contain the actual number of */
1324+ /* code points output. The case_flags array needs room for at */
1325+ /* least output_length values, or it can be a null pointer if the */
1326+ /* case information is not needed. A nonzero flag suggests that */
1327+ /* the corresponding Unicode character be forced to uppercase */
1328+ /* by the caller (if possible), while zero suggests that it be */
1329+ /* forced to lowercase (if possible). ASCII code points are */
1330+ /* output already in the proper case, but their flags will be set */
1331+ /* appropriately so that applying the flags would be harmless. */
1332+ /* The return value can be any of the punycode_status values */
1333+ /* defined above; if not punycode_success, then output_length, */
1334+ /* output, and case_flags might contain garbage. On success, the */
1335+ /* decoder will never need to write an output_length greater than */
1336+ /* input_length, because of how the encoding is defined. */
1337+1338+/**********************************************************/
1339+/* Implementation (would normally go in its own .c file): */
1340+1341+#include <string.h>
1342+1343+1344+1345+1346+Costello Standards Track [Page 24]
1347+1348+RFC 3492 IDNA Punycode March 2003
1349+1350+1351+/*** Bootstring parameters for Punycode ***/
1352+1353+enum { base = 36, tmin = 1, tmax = 26, skew = 38, damp = 700,
1354+ initial_bias = 72, initial_n = 0x80, delimiter = 0x2D };
1355+1356+/* basic(cp) tests whether cp is a basic code point: */
1357+#define basic(cp) ((punycode_uint)(cp) < 0x80)
1358+1359+/* delim(cp) tests whether cp is a delimiter: */
1360+#define delim(cp) ((cp) == delimiter)
1361+1362+/* decode_digit(cp) returns the numeric value of a basic code */
1363+/* point (for use in representing integers) in the range 0 to */
1364+/* base-1, or base if cp is does not represent a value. */
1365+1366+static punycode_uint decode_digit(punycode_uint cp)
1367+{
1368+ return cp - 48 < 10 ? cp - 22 : cp - 65 < 26 ? cp - 65 :
1369+ cp - 97 < 26 ? cp - 97 : base;
1370+}
1371+1372+/* encode_digit(d,flag) returns the basic code point whose value */
1373+/* (when used for representing integers) is d, which needs to be in */
1374+/* the range 0 to base-1. The lowercase form is used unless flag is */
1375+/* nonzero, in which case the uppercase form is used. The behavior */
1376+/* is undefined if flag is nonzero and digit d has no uppercase form. */
1377+1378+static char encode_digit(punycode_uint d, int flag)
1379+{
1380+ return d + 22 + 75 * (d < 26) - ((flag != 0) << 5);
1381+ /* 0..25 map to ASCII a..z or A..Z */
1382+ /* 26..35 map to ASCII 0..9 */
1383+}
1384+1385+/* flagged(bcp) tests whether a basic code point is flagged */
1386+/* (uppercase). The behavior is undefined if bcp is not a */
1387+/* basic code point. */
1388+1389+#define flagged(bcp) ((punycode_uint)(bcp) - 65 < 26)
1390+1391+/* encode_basic(bcp,flag) forces a basic code point to lowercase */
1392+/* if flag is zero, uppercase if flag is nonzero, and returns */
1393+/* the resulting code point. The code point is unchanged if it */
1394+/* is caseless. The behavior is undefined if bcp is not a basic */
1395+/* code point. */
1396+1397+static char encode_basic(punycode_uint bcp, int flag)
1398+{
1399+1400+1401+1402+Costello Standards Track [Page 25]
1403+1404+RFC 3492 IDNA Punycode March 2003
1405+1406+1407+ bcp -= (bcp - 97 < 26) << 5;
1408+ return bcp + ((!flag && (bcp - 65 < 26)) << 5);
1409+}
1410+1411+/*** Platform-specific constants ***/
1412+1413+/* maxint is the maximum value of a punycode_uint variable: */
1414+static const punycode_uint maxint = -1;
1415+/* Because maxint is unsigned, -1 becomes the maximum value. */
1416+1417+/*** Bias adaptation function ***/
1418+1419+static punycode_uint adapt(
1420+ punycode_uint delta, punycode_uint numpoints, int firsttime )
1421+{
1422+ punycode_uint k;
1423+1424+ delta = firsttime ? delta / damp : delta >> 1;
1425+ /* delta >> 1 is a faster way of doing delta / 2 */
1426+ delta += delta / numpoints;
1427+1428+ for (k = 0; delta > ((base - tmin) * tmax) / 2; k += base) {
1429+ delta /= base - tmin;
1430+ }
1431+1432+ return k + (base - tmin + 1) * delta / (delta + skew);
1433+}
1434+1435+/*** Main encode function ***/
1436+1437+enum punycode_status punycode_encode(
1438+ punycode_uint input_length,
1439+ const punycode_uint input[],
1440+ const unsigned char case_flags[],
1441+ punycode_uint *output_length,
1442+ char output[] )
1443+{
1444+ punycode_uint n, delta, h, b, out, max_out, bias, j, m, q, k, t;
1445+1446+ /* Initialize the state: */
1447+1448+ n = initial_n;
1449+ delta = out = 0;
1450+ max_out = *output_length;
1451+ bias = initial_bias;
1452+1453+ /* Handle the basic code points: */
1454+1455+1456+1457+1458+Costello Standards Track [Page 26]
1459+1460+RFC 3492 IDNA Punycode March 2003
1461+1462+1463+ for (j = 0; j < input_length; ++j) {
1464+ if (basic(input[j])) {
1465+ if (max_out - out < 2) return punycode_big_output;
1466+ output[out++] =
1467+ case_flags ? encode_basic(input[j], case_flags[j]) : input[j];
1468+ }
1469+ /* else if (input[j] < n) return punycode_bad_input; */
1470+ /* (not needed for Punycode with unsigned code points) */
1471+ }
1472+1473+ h = b = out;
1474+1475+ /* h is the number of code points that have been handled, b is the */
1476+ /* number of basic code points, and out is the number of characters */
1477+ /* that have been output. */
1478+1479+ if (b > 0) output[out++] = delimiter;
1480+1481+ /* Main encoding loop: */
1482+1483+ while (h < input_length) {
1484+ /* All non-basic code points < n have been */
1485+ /* handled already. Find the next larger one: */
1486+1487+ for (m = maxint, j = 0; j < input_length; ++j) {
1488+ /* if (basic(input[j])) continue; */
1489+ /* (not needed for Punycode) */
1490+ if (input[j] >= n && input[j] < m) m = input[j];
1491+ }
1492+1493+ /* Increase delta enough to advance the decoder's */
1494+ /* <n,i> state to <m,0>, but guard against overflow: */
1495+1496+ if (m - n > (maxint - delta) / (h + 1)) return punycode_overflow;
1497+ delta += (m - n) * (h + 1);
1498+ n = m;
1499+1500+ for (j = 0; j < input_length; ++j) {
1501+ /* Punycode does not need to check whether input[j] is basic: */
1502+ if (input[j] < n /* || basic(input[j]) */ ) {
1503+ if (++delta == 0) return punycode_overflow;
1504+ }
1505+1506+ if (input[j] == n) {
1507+ /* Represent delta as a generalized variable-length integer: */
1508+1509+ for (q = delta, k = base; ; k += base) {
1510+ if (out >= max_out) return punycode_big_output;
1511+1512+1513+1514+Costello Standards Track [Page 27]
1515+1516+RFC 3492 IDNA Punycode March 2003
1517+1518+1519+ t = k <= bias /* + tmin */ ? tmin : /* +tmin not needed */
1520+ k >= bias + tmax ? tmax : k - bias;
1521+ if (q < t) break;
1522+ output[out++] = encode_digit(t + (q - t) % (base - t), 0);
1523+ q = (q - t) / (base - t);
1524+ }
1525+1526+ output[out++] = encode_digit(q, case_flags && case_flags[j]);
1527+ bias = adapt(delta, h + 1, h == b);
1528+ delta = 0;
1529+ ++h;
1530+ }
1531+ }
1532+1533+ ++delta, ++n;
1534+ }
1535+1536+ *output_length = out;
1537+ return punycode_success;
1538+}
1539+1540+/*** Main decode function ***/
1541+1542+enum punycode_status punycode_decode(
1543+ punycode_uint input_length,
1544+ const char input[],
1545+ punycode_uint *output_length,
1546+ punycode_uint output[],
1547+ unsigned char case_flags[] )
1548+{
1549+ punycode_uint n, out, i, max_out, bias,
1550+ b, j, in, oldi, w, k, digit, t;
1551+1552+ /* Initialize the state: */
1553+1554+ n = initial_n;
1555+ out = i = 0;
1556+ max_out = *output_length;
1557+ bias = initial_bias;
1558+1559+ /* Handle the basic code points: Let b be the number of input code */
1560+ /* points before the last delimiter, or 0 if there is none, then */
1561+ /* copy the first b code points to the output. */
1562+1563+ for (b = j = 0; j < input_length; ++j) if (delim(input[j])) b = j;
1564+ if (b > max_out) return punycode_big_output;
1565+1566+ for (j = 0; j < b; ++j) {
1567+1568+1569+1570+Costello Standards Track [Page 28]
1571+1572+RFC 3492 IDNA Punycode March 2003
1573+1574+1575+ if (case_flags) case_flags[out] = flagged(input[j]);
1576+ if (!basic(input[j])) return punycode_bad_input;
1577+ output[out++] = input[j];
1578+ }
1579+1580+ /* Main decoding loop: Start just after the last delimiter if any */
1581+ /* basic code points were copied; start at the beginning otherwise. */
1582+1583+ for (in = b > 0 ? b + 1 : 0; in < input_length; ++out) {
1584+1585+ /* in is the index of the next character to be consumed, and */
1586+ /* out is the number of code points in the output array. */
1587+1588+ /* Decode a generalized variable-length integer into delta, */
1589+ /* which gets added to i. The overflow checking is easier */
1590+ /* if we increase i as we go, then subtract off its starting */
1591+ /* value at the end to obtain delta. */
1592+1593+ for (oldi = i, w = 1, k = base; ; k += base) {
1594+ if (in >= input_length) return punycode_bad_input;
1595+ digit = decode_digit(input[in++]);
1596+ if (digit >= base) return punycode_bad_input;
1597+ if (digit > (maxint - i) / w) return punycode_overflow;
1598+ i += digit * w;
1599+ t = k <= bias /* + tmin */ ? tmin : /* +tmin not needed */
1600+ k >= bias + tmax ? tmax : k - bias;
1601+ if (digit < t) break;
1602+ if (w > maxint / (base - t)) return punycode_overflow;
1603+ w *= (base - t);
1604+ }
1605+1606+ bias = adapt(i - oldi, out + 1, oldi == 0);
1607+1608+ /* i was supposed to wrap around from out+1 to 0, */
1609+ /* incrementing n each time, so we'll fix that now: */
1610+1611+ if (i / (out + 1) > maxint - n) return punycode_overflow;
1612+ n += i / (out + 1);
1613+ i %= (out + 1);
1614+1615+ /* Insert n at position i of the output: */
1616+1617+ /* not needed for Punycode: */
1618+ /* if (decode_digit(n) <= base) return punycode_invalid_input; */
1619+ if (out >= max_out) return punycode_big_output;
1620+1621+ if (case_flags) {
1622+ memmove(case_flags + i + 1, case_flags + i, out - i);
1623+1624+1625+1626+Costello Standards Track [Page 29]
1627+1628+RFC 3492 IDNA Punycode March 2003
1629+1630+1631+ /* Case of last character determines uppercase flag: */
1632+ case_flags[i] = flagged(input[in - 1]);
1633+ }
1634+1635+ memmove(output + i + 1, output + i, (out - i) * sizeof *output);
1636+ output[i++] = n;
1637+ }
1638+1639+ *output_length = out;
1640+ return punycode_success;
1641+}
1642+1643+/******************************************************************/
1644+/* Wrapper for testing (would normally go in a separate .c file): */
1645+1646+#include <assert.h>
1647+#include <stdio.h>
1648+#include <stdlib.h>
1649+#include <string.h>
1650+1651+/* For testing, we'll just set some compile-time limits rather than */
1652+/* use malloc(), and set a compile-time option rather than using a */
1653+/* command-line option. */
1654+1655+enum {
1656+ unicode_max_length = 256,
1657+ ace_max_length = 256
1658+};
1659+1660+static void usage(char **argv)
1661+{
1662+ fprintf(stderr,
1663+ "\n"
1664+ "%s -e reads code points and writes a Punycode string.\n"
1665+ "%s -d reads a Punycode string and writes code points.\n"
1666+ "\n"
1667+ "Input and output are plain text in the native character set.\n"
1668+ "Code points are in the form u+hex separated by whitespace.\n"
1669+ "Although the specification allows Punycode strings to contain\n"
1670+ "any characters from the ASCII repertoire, this test code\n"
1671+ "supports only the printable characters, and needs the Punycode\n"
1672+ "string to be followed by a newline.\n"
1673+ "The case of the u in u+hex is the force-to-uppercase flag.\n"
1674+ , argv[0], argv[0]);
1675+ exit(EXIT_FAILURE);
1676+}
1677+1678+static void fail(const char *msg)
1679+1680+1681+1682+Costello Standards Track [Page 30]
1683+1684+RFC 3492 IDNA Punycode March 2003
1685+1686+1687+{
1688+ fputs(msg,stderr);
1689+ exit(EXIT_FAILURE);
1690+}
1691+1692+static const char too_big[] =
1693+ "input or output is too large, recompile with larger limits\n";
1694+static const char invalid_input[] = "invalid input\n";
1695+static const char overflow[] = "arithmetic overflow\n";
1696+static const char io_error[] = "I/O error\n";
1697+1698+/* The following string is used to convert printable */
1699+/* characters between ASCII and the native charset: */
1700+1701+static const char print_ascii[] =
1702+ "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n"
1703+ "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n"
1704+ " !\"#$%&'()*+,-./"
1705+ "0123456789:;<=>?"
1706+ "@ABCDEFGHIJKLMNO"
1707+ "PQRSTUVWXYZ[\\]^_"
1708+ "`abcdefghijklmno"
1709+ "pqrstuvwxyz{|}~\n";
1710+1711+int main(int argc, char **argv)
1712+{
1713+ enum punycode_status status;
1714+ int r;
1715+ unsigned int input_length, output_length, j;
1716+ unsigned char case_flags[unicode_max_length];
1717+1718+ if (argc != 2) usage(argv);
1719+ if (argv[1][0] != '-') usage(argv);
1720+ if (argv[1][2] != 0) usage(argv);
1721+1722+ if (argv[1][1] == 'e') {
1723+ punycode_uint input[unicode_max_length];
1724+ unsigned long codept;
1725+ char output[ace_max_length+1], uplus[3];
1726+ int c;
1727+1728+ /* Read the input code points: */
1729+1730+ input_length = 0;
1731+1732+ for (;;) {
1733+ r = scanf("%2s%lx", uplus, &codept);
1734+ if (ferror(stdin)) fail(io_error);
1735+1736+1737+1738+Costello Standards Track [Page 31]
1739+1740+RFC 3492 IDNA Punycode March 2003
1741+1742+1743+ if (r == EOF || r == 0) break;
1744+1745+ if (r != 2 || uplus[1] != '+' || codept > (punycode_uint)-1) {
1746+ fail(invalid_input);
1747+ }
1748+1749+ if (input_length == unicode_max_length) fail(too_big);
1750+1751+ if (uplus[0] == 'u') case_flags[input_length] = 0;
1752+ else if (uplus[0] == 'U') case_flags[input_length] = 1;
1753+ else fail(invalid_input);
1754+1755+ input[input_length++] = codept;
1756+ }
1757+1758+ /* Encode: */
1759+1760+ output_length = ace_max_length;
1761+ status = punycode_encode(input_length, input, case_flags,
1762+ &output_length, output);
1763+ if (status == punycode_bad_input) fail(invalid_input);
1764+ if (status == punycode_big_output) fail(too_big);
1765+ if (status == punycode_overflow) fail(overflow);
1766+ assert(status == punycode_success);
1767+1768+ /* Convert to native charset and output: */
1769+1770+ for (j = 0; j < output_length; ++j) {
1771+ c = output[j];
1772+ assert(c >= 0 && c <= 127);
1773+ if (print_ascii[c] == 0) fail(invalid_input);
1774+ output[j] = print_ascii[c];
1775+ }
1776+1777+ output[j] = 0;
1778+ r = puts(output);
1779+ if (r == EOF) fail(io_error);
1780+ return EXIT_SUCCESS;
1781+ }
1782+1783+ if (argv[1][1] == 'd') {
1784+ char input[ace_max_length+2], *p, *pp;
1785+ punycode_uint output[unicode_max_length];
1786+1787+ /* Read the Punycode input string and convert to ASCII: */
1788+1789+ fgets(input, ace_max_length+2, stdin);
1790+ if (ferror(stdin)) fail(io_error);
1791+1792+1793+1794+Costello Standards Track [Page 32]
1795+1796+RFC 3492 IDNA Punycode March 2003
1797+1798+1799+ if (feof(stdin)) fail(invalid_input);
1800+ input_length = strlen(input) - 1;
1801+ if (input[input_length] != '\n') fail(too_big);
1802+ input[input_length] = 0;
1803+1804+ for (p = input; *p != 0; ++p) {
1805+ pp = strchr(print_ascii, *p);
1806+ if (pp == 0) fail(invalid_input);
1807+ *p = pp - print_ascii;
1808+ }
1809+1810+ /* Decode: */
1811+1812+ output_length = unicode_max_length;
1813+ status = punycode_decode(input_length, input, &output_length,
1814+ output, case_flags);
1815+ if (status == punycode_bad_input) fail(invalid_input);
1816+ if (status == punycode_big_output) fail(too_big);
1817+ if (status == punycode_overflow) fail(overflow);
1818+ assert(status == punycode_success);
1819+1820+ /* Output the result: */
1821+1822+ for (j = 0; j < output_length; ++j) {
1823+ r = printf("%s+%04lX\n",
1824+ case_flags[j] ? "U" : "u",
1825+ (unsigned long) output[j] );
1826+ if (r < 0) fail(io_error);
1827+ }
1828+1829+ return EXIT_SUCCESS;
1830+ }
1831+1832+ usage(argv);
1833+ return EXIT_SUCCESS; /* not reached, but quiets compiler warning */
1834+}
1835+1836+1837+1838+1839+1840+1841+1842+1843+1844+1845+1846+1847+1848+1849+1850+Costello Standards Track [Page 33]
1851+1852+RFC 3492 IDNA Punycode March 2003
1853+1854+1855+Author's Address
1856+1857+ Adam M. Costello
1858+ University of California, Berkeley
1859+ http://www.nicemice.net/amc/
1860+1861+1862+1863+1864+1865+1866+1867+1868+1869+1870+1871+1872+1873+1874+1875+1876+1877+1878+1879+1880+1881+1882+1883+1884+1885+1886+1887+1888+1889+1890+1891+1892+1893+1894+1895+1896+1897+1898+1899+1900+1901+1902+1903+1904+1905+1906+Costello Standards Track [Page 34]
1907+1908+RFC 3492 IDNA Punycode March 2003
1909+1910+1911+Full Copyright Statement
1912+1913+ Copyright (C) The Internet Society (2003). All Rights Reserved.
1914+1915+ This document and translations of it may be copied and furnished to
1916+ others, and derivative works that comment on or otherwise explain it
1917+ or assist in its implementation may be prepared, copied, published
1918+ and distributed, in whole or in part, without restriction of any
1919+ kind, provided that the above copyright notice and this paragraph are
1920+ included on all such copies and derivative works. However, this
1921+ document itself may not be modified in any way, such as by removing
1922+ the copyright notice or references to the Internet Society or other
1923+ Internet organizations, except as needed for the purpose of
1924+ developing Internet standards in which case the procedures for
1925+ copyrights defined in the Internet Standards process must be
1926+ followed, or as required to translate it into languages other than
1927+ English.
1928+1929+ The limited permissions granted above are perpetual and will not be
1930+ revoked by the Internet Society or its successors or assigns.
1931+1932+ This document and the information contained herein is provided on an
1933+ "AS IS" basis and THE INTERNET SOCIETY AND THE INTERNET ENGINEERING
1934+ TASK FORCE DISCLAIMS ALL WARRANTIES, EXPRESS OR IMPLIED, INCLUDING
1935+ BUT NOT LIMITED TO ANY WARRANTY THAT THE USE OF THE INFORMATION
1936+ HEREIN WILL NOT INFRINGE ANY RIGHTS OR ANY IMPLIED WARRANTIES OF
1937+ MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.
1938+1939+Acknowledgement
1940+1941+ Funding for the RFC Editor function is currently provided by the
1942+ Internet Society.
1943+1944+1945+1946+1947+1948+1949+1950+1951+1952+1953+1954+1955+1956+1957+1958+1959+1960+1961+1962+Costello Standards Track [Page 35]
1963+