···11+(* RFC 3492 Punycode Implementation *)
22+33+(* {1 Bootstring Parameters for Punycode (RFC 3492 Section 5)} *)
(* Number of distinct digit values: the letters a-z plus the digits 0-9. *)
let base = 36

(* Lower and upper clamp applied to the per-digit threshold. *)
let tmin = 1
let tmax = 26

(* Constants used by the bias adaptation function. *)
let skew = 38
let damp = 700

(* Starting bias and starting code point of the encoder/decoder state. *)
let initial_bias = 72
let initial_n = 128 (* 0x80 *)

(* Character separating the literal basic code points from the deltas. *)
let delimiter = '-'

(* Prefix marking a Punycode-encoded domain label. *)
let ace_prefix = "xn--"

(* Largest label length, in bytes, accepted by the label helpers. *)
let max_label_length = 63
1515+1616+(* {1 Position Tracking} *)
(* Location attached to errors: an offset in bytes and an index in
   characters. *)
type position = {
  byte_offset : int;
  char_index : int;
}

(* Byte-offset component of a position. *)
let position_byte_offset { byte_offset; _ } = byte_offset

(* Character-index component of a position. *)
let position_char_index { char_index; _ } = char_index

(* Print a position as "byte N, char M". *)
let pp_position fmt { byte_offset; char_index } =
  Format.fprintf fmt "byte %d, char %d" byte_offset char_index
2828+2929+3030+(* {1 Error Types} *)
(* Failures reported by the encoder, the decoder and the label helpers.
   Constructors carrying a [position] say where the problem was found;
   [Label_too_long] carries the offending length. *)
type error =
  | Overflow of position
  | Invalid_character of position * Uchar.t
  | Invalid_digit of position * char
  | Unexpected_end of position
  | Invalid_utf8 of position
  | Label_too_long of int
  | Empty_label

(* Print a human-readable description of an error. *)
let pp_error fmt err =
  let open Format in
  match err with
  | Empty_label ->
    fprintf fmt "empty label"
  | Label_too_long actual ->
    fprintf fmt "label too long: %d bytes (max %d)" actual max_label_length
  | Invalid_utf8 where ->
    fprintf fmt "invalid UTF-8 sequence at %a" pp_position where
  | Unexpected_end where ->
    fprintf fmt "unexpected end of input at %a" pp_position where
  | Invalid_digit (where, ch) ->
    fprintf fmt "invalid Punycode digit '%c' (0x%02X) at %a"
      ch (Char.code ch) pp_position where
  | Invalid_character (where, uchar) ->
    fprintf fmt "invalid character U+%04X at %a"
      (Uchar.to_int uchar) pp_position where
  | Overflow where ->
    fprintf fmt "arithmetic overflow at %a" pp_position where
5858+5959+6060+(* {1 Case Flags} *)
(* Case annotation attached to a single code point. *)
type case_flag =
  | Uppercase
  | Lowercase
6363+6464+(* {1 Basic Predicates} *)
(* A code point is "basic" when it lies in the ASCII range 0..0x7F. *)
let is_basic u =
  let code = Uchar.to_int u in
  code <= 0x7F
(* True exactly for the delimiter character. *)
let is_delimiter c = Char.equal c delimiter
(* True when every byte of [s] is below 0x80; the empty string qualifies. *)
let is_ascii_string s =
  String.for_all (fun c -> Char.code c < 0x80) s
(* True when [s] begins with "xn--", the two letters being accepted in
   either case. *)
let has_ace_prefix s =
  String.length s >= 4
  && String.equal (String.lowercase_ascii (String.sub s 0 4)) "xn--"
8686+8787+(* {1 Digit Encoding/Decoding (RFC 3492 Section 5)}
8888+8989+ Digit values:
9090+ - 0-25: a-z (or A-Z)
9191+ - 26-35: 0-9
9292+*)
(* Map a digit value to its character: values below 26 become a letter
   whose case follows [case_flag]; the remaining values become '0'-'9'. *)
let encode_digit d case_flag =
  if d >= 26 then Char.chr (d - 26 + 0x30)
  else
    let first_letter =
      match case_flag with
      | Uppercase -> 0x41
      | Lowercase -> 0x61
    in
    Char.chr (d + first_letter)
(* Inverse of the digit table: letters of either case map to 0-25,
   decimal digits to 26-35, anything else to [None]. *)
let decode_digit c =
  match c with
  | 'a' .. 'z' -> Some (Char.code c - Char.code 'a')
  | 'A' .. 'Z' -> Some (Char.code c - Char.code 'A')
  | '0' .. '9' -> Some (Char.code c - Char.code '0' + 26)
  | _ -> None
(* A character counts as "flagged" for case annotation when it is an
   ASCII uppercase letter. *)
let is_flagged c =
  match c with
  | 'A' .. 'Z' -> true
  | _ -> false
115115+116116+(* {1 Bias Adaptation (RFC 3492 Section 6.1)} *)
(* Compute the next bias from the delta that was just processed.
   [delta] is first damped (by [damp] on the first call, by 2 afterwards)
   and increased by its share per code point; it is then repeatedly
   divided by [base - tmin] while it exceeds the threshold, adding [base]
   to the result for every division. *)
let adapt ~delta ~numpoints ~firsttime =
  let span = base - tmin in
  let limit = (span * tmax) / 2 in
  let damped = delta / (if firsttime then damp else 2) in
  let rec shrink d k =
    if d <= limit then k + (((span + 1) * d) / (d + skew))
    else shrink (d / span) (k + base)
  in
  shrink (damped + (damped / numpoints)) 0
129129+130130+(* {1 Overflow-Safe Arithmetic}
131131+132132+ RFC 3492 Section 6.4: Use detection to avoid overflow.
133133+ A + B overflows iff B > maxint - A
134134+ A + B*C overflows iff B > (maxint - A) / C
135135+*)
(* Largest value the arithmetic below is allowed to reach. *)
let max_int_value = max_int

(* [safe_mul_add a b c pos] is [Ok (a + b * c)], or [Error (Overflow pos)]
   when [b > (max_int_value - a) / c]. A zero [c] yields [Ok a] without
   performing the division. *)
let safe_mul_add a b c pos =
  if c <> 0 && b > (max_int_value - a) / c then Error (Overflow pos)
  else Ok (a + (b * c))
145145+146146+(* {1 UTF-8 to Code Points Conversion} *)
(* Decode a UTF-8 string into an array of code points. Decoding stops at
   the first invalid sequence, which is reported as [Invalid_utf8] with
   the byte offset and character index where it starts. *)
let utf8_to_codepoints s =
  let len = String.length s in
  let rec scan byte_offset char_index rev_points =
    if byte_offset >= len then Ok (Array.of_list (List.rev rev_points))
    else
      let dec = String.get_utf_8_uchar s byte_offset in
      if Uchar.utf_decode_is_valid dec then
        scan
          (byte_offset + Uchar.utf_decode_length dec)
          (char_index + 1)
          (Uchar.utf_decode_uchar dec :: rev_points)
      else Error (Invalid_utf8 { byte_offset; char_index })
  in
  scan 0 0 []
168168+169169+(* {1 Code Points to UTF-8 Conversion} *)
(* Serialize an array of code points as a UTF-8 string. *)
let codepoints_to_utf8 codepoints =
  let count = Array.length codepoints in
  let buf = Buffer.create (count * 2) in
  for idx = 0 to count - 1 do
    Buffer.add_utf_8_uchar buf codepoints.(idx)
  done;
  Buffer.contents buf
175175+176176+(* {1 Punycode Encoding (RFC 3492 Section 6.3)} *)
(* Punycode encoder (RFC 3492 Section 6.3).

   [codepoints] is the input; [case_flags], when given, is indexed like
   [codepoints] and requests mixed-case annotation: ASCII letters are
   forced to the flagged case and the final digit of each delta carries
   the flag of the code point it encodes. When [case_flags] is [None]
   the basic code points are copied to the output unchanged and every
   digit is lowercase.

   Returns [Ok ""] for empty input, [Ok encoded] on success, and
   [Error (Overflow _)] when a delta cannot be represented. Error
   positions carry a code point index; the byte offset is always 0. *)
let encode_impl codepoints case_flags =
  let input_length = Array.length codepoints in
  if input_length = 0 then Ok ""
  else begin
    let output = Buffer.create (input_length * 2) in

    (* Copy the basic code points first, in input order. *)
    let basic_count = ref 0 in
    Array.iteri
      (fun j cp ->
        if is_basic cp then begin
          let c = Uchar.to_int cp in
          let c' =
            match case_flags with
            | None ->
              (* No annotation requested: copy the code point literally
                 so that decoding gives back the original input. *)
              c
            | Some flags ->
              let is_upper = c >= 0x41 && c <= 0x5A in
              let is_lower = c >= 0x61 && c <= 0x7A in
              (match flags.(j) with
               | Lowercase -> if is_upper then c + 0x20 else c
               | Uppercase -> if is_lower then c - 0x20 else c)
          in
          Buffer.add_char output (Char.chr c');
          incr basic_count
        end)
      codepoints;

    let b = !basic_count in
    let h = ref b in

    (* The delimiter is only written when basic code points precede it. *)
    if b > 0 then Buffer.add_char output delimiter;

    let n = ref initial_n in
    let delta = ref 0 in
    let bias = ref initial_bias in
    let result = ref (Ok ()) in

    (* OCaml integers wrap from [max_int] to [min_int], so the overflow
       test has to be made before the increment, not by comparing the
       incremented value with 0. *)
    let bump_delta pos =
      if !delta = max_int_value then result := Error (Overflow pos)
      else incr delta
    in

    while !h < input_length && Result.is_ok !result do
      (* Smallest code point >= n that is still to be encoded. *)
      let m = ref max_int_value in
      Array.iter
        (fun u ->
          let cp = Uchar.to_int u in
          if cp >= !n && cp < !m then m := cp)
        codepoints;

      (* Advance the state to <m, 0>: delta += (m - n) * (h + 1). *)
      let pos = { byte_offset = 0; char_index = !h } in
      (match safe_mul_add !delta (!m - !n) (!h + 1) pos with
       | Error e -> result := Error e
       | Ok new_delta ->
         delta := new_delta;
         n := !m;

         let j = ref 0 in
         while !j < input_length && Result.is_ok !result do
           let cp = Uchar.to_int codepoints.(!j) in
           if cp < !n then
             bump_delta { byte_offset = 0; char_index = !j }
           else if cp = !n then begin
             let case =
               match case_flags with
               | Some flags -> flags.(!j)
               | None -> Lowercase
             in
             (* Write delta as a generalized variable-length integer;
                only the last digit carries the case annotation. *)
             let rec emit q k =
               let t =
                 if k <= !bias then tmin
                 else if k >= !bias + tmax then tmax
                 else k - !bias
               in
               if q < t then Buffer.add_char output (encode_digit q case)
               else begin
                 let digit = t + ((q - t) mod (base - t)) in
                 Buffer.add_char output (encode_digit digit Lowercase);
                 emit ((q - t) / (base - t)) (k + base)
               end
             in
             emit !delta base;

             bias :=
               adapt ~delta:!delta ~numpoints:(!h + 1) ~firsttime:(!h = b);
             delta := 0;
             incr h
           end;
           incr j
         done;

         (* Every pass encodes at least one code point equal to n, which
            resets delta to 0; afterwards it grows by at most one per
            remaining input element, so this increment cannot wrap. *)
         incr delta;
         incr n)
    done;

    match !result with
    | Error e -> Error e
    | Ok () -> Ok (Buffer.contents output)
  end
(* Encode code points to Punycode without case annotation. *)
let encode codepoints =
  let no_case_flags = None in
  encode_impl codepoints no_case_flags
(* Encode code points to Punycode with one case flag per code point.
   Raises [Invalid_argument] when the two arrays differ in length. *)
let encode_with_case codepoints case_flags =
  match Array.length codepoints = Array.length case_flags with
  | false -> invalid_arg "encode_with_case: array lengths must match"
  | true -> encode_impl codepoints (Some case_flags)
305305+306306+(* {1 Punycode Decoding (RFC 3492 Section 6.2)} *)
(* Punycode decoder (RFC 3492 Section 6.2).

   Returns the decoded code points together with one case flag per code
   point: for the literal prefix the flag reflects the letter's own case,
   for every inserted code point it reflects the case of the last digit
   of its delta.

   Errors: [Invalid_character] for a byte >= 0x80 before the last
   delimiter or for a decoded value that is not a Unicode scalar value,
   [Invalid_digit] for a character outside the digit alphabet,
   [Unexpected_end] when the input stops inside a delta, and [Overflow]
   when the arithmetic would exceed [max_int_value]. *)
let decode_impl input =
  let input_length = String.length input in
  if input_length = 0 then Ok ([||], [||])
  else
    (* Everything before the last delimiter is copied literally; with no
       delimiter (or one at index 0) there is no literal part. *)
    let b =
      match String.rindex_opt input delimiter with
      | Some idx -> idx
      | None -> 0
    in
    let rec first_non_basic j =
      if j >= b then None
      else if Char.code input.[j] >= 0x80 then Some j
      else first_non_basic (j + 1)
    in
    match first_non_basic 0 with
    | Some j ->
      let pos = { byte_offset = j; char_index = j } in
      Error (Invalid_character (pos, Uchar.of_int (Char.code input.[j])))
    | None ->
      let basic = Array.init b (fun j -> Uchar.of_char input.[j]) in
      let basic_case =
        Array.init b (fun j ->
            if is_flagged input.[j] then Uppercase else Lowercase)
      in

      (* One iteration per encoded code point. [i] is the running
         insertion state carried over from the previous iteration. *)
      let rec decode_points n i bias in_pos output case_output =
        if in_pos >= input_length then Ok (output, case_output)
        else begin
          let out_len = Array.length output in

          (* Read one variable-length integer starting at [at], adding
             it to [acc]; yields the new total and the position just
             past the final digit. *)
          let rec read_delta acc w k at =
            let pos = { byte_offset = at; char_index = out_len } in
            if at >= input_length then Error (Unexpected_end pos)
            else
              let c = input.[at] in
              match decode_digit c with
              | None -> Error (Invalid_digit (pos, c))
              | Some digit ->
                (match safe_mul_add acc digit w pos with
                 | Error e -> Error e
                 | Ok acc ->
                   let t =
                     if k <= bias then tmin
                     else if k >= bias + tmax then tmax
                     else k - bias
                   in
                   if digit < t then Ok (acc, at + 1)
                   else
                     let base_minus_t = base - t in
                     if w > max_int_value / base_minus_t then
                       Error (Overflow pos)
                     else
                       read_delta acc (w * base_minus_t) (k + base) (at + 1))
          in

          match read_delta i 1 base in_pos with
          | Error e -> Error e
          | Ok (total, next_pos) ->
            let bias =
              adapt ~delta:(total - i) ~numpoints:(out_len + 1)
                ~firsttime:(i = 0)
            in
            let pos = { byte_offset = next_pos - 1; char_index = out_len } in
            (* total wraps around the output length: the quotient
               advances n, the remainder is the insertion index. *)
            let increment = total / (out_len + 1) in
            if increment > max_int_value - n then Error (Overflow pos)
            else
              let n = n + increment in
              let at = total mod (out_len + 1) in
              if not (Uchar.is_valid n) then
                Error (Invalid_character (pos, Uchar.rep))
              else begin
                (* Copy of [arr] with [x] inserted at index [at]. *)
                let insert arr x =
                  let grown = Array.make (out_len + 1) x in
                  Array.blit arr 0 grown 0 at;
                  Array.blit arr at grown (at + 1) (out_len - at);
                  grown
                in
                let flag =
                  if is_flagged input.[next_pos - 1] then Uppercase
                  else Lowercase
                in
                decode_points n (at + 1) bias next_pos
                  (insert output (Uchar.of_int n))
                  (insert case_output flag)
              end
        end
      in
      decode_points initial_n 0 initial_bias
        (if b > 0 then b + 1 else 0)
        basic basic_case
(* Decode Punycode to code points, discarding the case flags. *)
let decode input =
  Result.map (fun (codepoints, _) -> codepoints) (decode_impl input)
(* Decode Punycode to code points together with their case flags. *)
let decode_with_case input =
  input |> decode_impl
460460+461461+(* {1 UTF-8 String Operations} *)
(* Decode [s] from UTF-8 and Punycode-encode the resulting code points.
   A UTF-8 error is returned as is, before any encoding is attempted. *)
let encode_utf8 s =
  Result.bind (utf8_to_codepoints s) encode
(* Punycode-decode [punycode] and return the result as a UTF-8 string. *)
let decode_utf8 punycode =
  Result.map codepoints_to_utf8 (decode punycode)
472472+473473+(* {1 Domain Label Operations} *)
(* Encode one domain label.
   - An empty label is rejected with [Empty_label].
   - An all-ASCII label is returned unchanged.
   - Any other label is Punycode-encoded and prefixed with [ace_prefix].
   In both successful cases a result longer than [max_label_length]
   bytes is rejected with [Label_too_long]. *)
let encode_label label =
  let within_limit candidate =
    let len = String.length candidate in
    if len > max_label_length then Error (Label_too_long len)
    else Ok candidate
  in
  if String.length label = 0 then Error Empty_label
  else if is_ascii_string label then within_limit label
  else
    Result.bind (encode_utf8 label) (fun encoded ->
        within_limit (ace_prefix ^ encoded))
(* Decode one domain label.
   - An empty label is rejected with [Empty_label].
   - A label carrying the ACE prefix has its first four bytes removed
     and the remainder Punycode-decoded to UTF-8.
   - Any other label, ASCII or not, is returned unchanged. *)
let decode_label label =
  let len = String.length label in
  if len = 0 then Error Empty_label
  else if not (has_ace_prefix label) then Ok label
  else decode_utf8 (String.sub label 4 (len - 4))
+247
lib/punycode.mli
···11+(** RFC 3492 Punycode: A Bootstring encoding of Unicode for IDNA.
22+33+ This module implements the Punycode algorithm as specified in
44+ {{:https://datatracker.ietf.org/doc/html/rfc3492}RFC 3492},
55+ providing encoding and decoding of Unicode strings to/from ASCII-compatible
66+ encoding suitable for use in internationalized domain names.
77+88+ Punycode is an instance of Bootstring that uses particular parameter
99+ values appropriate for IDNA. See
1010+ {{:https://datatracker.ietf.org/doc/html/rfc3492#section-5}RFC 3492 Section 5}
1111+ for the specific parameter values.
1212+1313+ {2 References}
1414+ {ul
1515+ {- {{:https://datatracker.ietf.org/doc/html/rfc3492}RFC 3492} - Punycode: A Bootstring encoding of Unicode for IDNA}
1616+ {- {{:https://datatracker.ietf.org/doc/html/rfc5891}RFC 5891} - IDNA Protocol}} *)
1717+1818+(** {1 Position Tracking} *)
1919+2020+type position
2121+(** Abstract type representing a position in input for error reporting.
2222+ Positions track both byte offset and Unicode character index. *)
2323+2424+val position_byte_offset : position -> int
2525+(** [position_byte_offset pos] returns the byte offset in the input. *)
2626+2727+val position_char_index : position -> int
2828+(** [position_char_index pos] returns the Unicode character index (0-based). *)
2929+3030+val pp_position : Format.formatter -> position -> unit
3131+(** [pp_position fmt pos] pretty-prints a position as "byte N, char M". *)
3232+3333+(** {1 Error Types} *)
3434+3535+type error =
3636+ | Overflow of position
3737+ (** Arithmetic overflow during encode/decode. This can occur with
3838+ very long strings or extreme Unicode code point values.
3939+ See {{:https://datatracker.ietf.org/doc/html/rfc3492#section-6.4}
4040+ RFC 3492 Section 6.4} for overflow handling requirements. *)
4141+ | Invalid_character of position * Uchar.t
      (** A code point that is not allowed where it was found. The decoder
          reports this when a non-basic code point (>= 128) appears before
          the last delimiter, where only basic code points are allowed per
          {{:https://datatracker.ietf.org/doc/html/rfc3492#section-3.1}
          RFC 3492 Section 3.1}, and also when a decoded value is not a
          valid Unicode scalar value (reported with [Uchar.rep]). *)
4747+ | Invalid_digit of position * char
4848+ (** An invalid Punycode digit was encountered during decoding.
4949+ Valid digits are a-z, A-Z (values 0-25) and 0-9 (values 26-35).
5050+ See {{:https://datatracker.ietf.org/doc/html/rfc3492#section-5}
5151+ RFC 3492 Section 5} for digit-value mappings. *)
5252+ | Unexpected_end of position
5353+ (** The input ended prematurely during decoding of a delta value.
5454+ See {{:https://datatracker.ietf.org/doc/html/rfc3492#section-6.2}
5555+ RFC 3492 Section 6.2} decoding procedure. *)
5656+ | Invalid_utf8 of position
5757+ (** Malformed UTF-8 sequence in input string. *)
5858+ | Label_too_long of int
5959+ (** Encoded label exceeds 63 bytes (DNS limit per
6060+ {{:https://datatracker.ietf.org/doc/html/rfc1035}RFC 1035}).
6161+ The int is the actual length. *)
6262+ | Empty_label
6363+ (** Empty label is not valid for encoding. *)
6464+6565+val pp_error : Format.formatter -> error -> unit
6666+(** [pp_error fmt e] pretty-prints an error with position information. *)
6767+6868+(** {1 Constants}
6969+7070+ Punycode parameters as specified in
7171+ {{:https://datatracker.ietf.org/doc/html/rfc3492#section-5}RFC 3492 Section 5}. *)
7272+7373+val ace_prefix : string
7474+(** The ACE prefix ["xn--"] used for Punycode-encoded domain labels.
7575+ See {{:https://datatracker.ietf.org/doc/html/rfc3492#section-5}
7676+ RFC 3492 Section 5} which notes that IDNA prepends this prefix. *)
7777+7878+val max_label_length : int
7979+(** Maximum length of a domain label in bytes (63), per
8080+ {{:https://datatracker.ietf.org/doc/html/rfc1035}RFC 1035}. *)
8181+8282+(** {1 Case Flags for Mixed-Case Annotation}
8383+8484+ {{:https://datatracker.ietf.org/doc/html/rfc3492#appendix-A}RFC 3492 Appendix A}
8585+ describes an optional mechanism for preserving case information through
8686+ the encoding/decoding round-trip. This is useful when the original
8787+ string's case should be recoverable.
8888+8989+ Note: Mixed-case annotation is not used by the ToASCII and ToUnicode
9090+ operations of IDNA. *)
9191+9292+type case_flag = Uppercase | Lowercase
9393+(** Case annotation for a character. *)
9494+9595+(** {1 Core Punycode Operations}
9696+9797+ These functions implement the Bootstring algorithms from
9898+ {{:https://datatracker.ietf.org/doc/html/rfc3492#section-6}RFC 3492 Section 6}.
9999+ They operate on arrays of Unicode code points ([Uchar.t array]).
100100+ The encoded output is a plain ASCII string without the ACE prefix. *)
101101+102102+val encode : Uchar.t array -> (string, error) result
103103+(** [encode codepoints] encodes an array of Unicode code points to a
104104+ Punycode ASCII string.
105105+106106+ Implements the encoding procedure from
107107+ {{:https://datatracker.ietf.org/doc/html/rfc3492#section-6.3}RFC 3492 Section 6.3}:
108108+109109+ 1. Basic code points (ASCII < 128) are copied literally to the beginning
110110+ of the output per {{:https://datatracker.ietf.org/doc/html/rfc3492#section-3.1}
111111+ Section 3.1 (Basic code point segregation)}
112112+ 2. A delimiter ('-') is appended if there are any basic code points
113113+ 3. Non-basic code points are encoded as deltas using the generalized
114114+ variable-length integer representation from
115115+ {{:https://datatracker.ietf.org/doc/html/rfc3492#section-3.3}Section 3.3}
116116+117117+ Example:
118118+ {[
119119+ encode [| Uchar.of_int 0x4ED6; Uchar.of_int 0x4EEC; ... |]
120120+ (* = Ok "ihqwcrb4cv8a8dqg056pqjye" *)
121121+ ]} *)
122122+123123+val decode : string -> (Uchar.t array, error) result
124124+(** [decode punycode] decodes a Punycode ASCII string to an array of
125125+ Unicode code points.
126126+127127+ Implements the decoding procedure from
128128+ {{:https://datatracker.ietf.org/doc/html/rfc3492#section-6.2}RFC 3492 Section 6.2}.
129129+130130+ The input should be the Punycode portion only, without the ACE prefix.
131131+ The decoder is case-insensitive for the encoded portion, as required by
132132+ {{:https://datatracker.ietf.org/doc/html/rfc3492#section-5}RFC 3492 Section 5}:
133133+ "A decoder MUST recognize the letters in both uppercase and lowercase forms".
134134+135135+ Example:
136136+ {[
137137+ decode "ihqwcrb4cv8a8dqg056pqjye"
138138+ (* = Ok [| U+4ED6; U+4EEC; U+4E3A; ... |] (Chinese simplified) *)
139139+ ]} *)
140140+141141+(** {1 Mixed-Case Annotation}
142142+143143+ These functions support round-trip case preservation as described
144144+ in {{:https://datatracker.ietf.org/doc/html/rfc3492#appendix-A}RFC 3492 Appendix A}. *)
145145+146146+val encode_with_case : Uchar.t array -> case_flag array -> (string, error) result
147147+(** [encode_with_case codepoints case_flags] encodes with case annotation.
148148+149149+ Per {{:https://datatracker.ietf.org/doc/html/rfc3492#appendix-A}RFC 3492 Appendix A}:
150150+ - For basic (ASCII) letters, the output preserves the case flag directly
151151+ - For non-ASCII characters, the case of the final digit in each delta
152152+ encoding indicates the flag (uppercase = suggested uppercase)
153153+154154+ The [case_flags] array must have the same length as [codepoints].
155155+156156+ @raise Invalid_argument if array lengths don't match. *)
157157+158158+val decode_with_case : string -> (Uchar.t array * case_flag array, error) result
159159+(** [decode_with_case punycode] decodes and extracts case annotations.
160160+161161+ Per {{:https://datatracker.ietf.org/doc/html/rfc3492#appendix-A}RFC 3492 Appendix A},
162162+ returns both the decoded code points and an array of case flags
163163+ indicating the suggested case for each character based on the
164164+ uppercase/lowercase form of the encoding digits. *)
165165+166166+(** {1 UTF-8 String Operations}
167167+168168+ Convenience functions that work directly with UTF-8 encoded OCaml strings.
169169+ These combine UTF-8 decoding/encoding with the core Punycode operations. *)
170170+171171+val encode_utf8 : string -> (string, error) result
172172+(** [encode_utf8 s] encodes a UTF-8 string to Punycode (no ACE prefix).
173173+174174+ This is equivalent to decoding [s] from UTF-8 to code points, then
175175+ calling {!encode}.
176176+177177+ Example:
178178+ {[
179179+ encode_utf8 "münchen"
180180+ (* = Ok "mnchen-3ya" *)
181181+ ]} *)
182182+183183+val decode_utf8 : string -> (string, error) result
184184+(** [decode_utf8 punycode] decodes Punycode to a UTF-8 string (no ACE prefix).
185185+186186+ This is equivalent to calling {!decode} then encoding the result as UTF-8.
187187+188188+ Example:
189189+ {[
190190+ decode_utf8 "mnchen-3ya"
191191+ (* = Ok "münchen" *)
192192+ ]} *)
193193+194194+(** {1 Domain Label Operations}
195195+196196+ These functions handle the ACE prefix automatically and enforce
197197+ DNS label length limits per {{:https://datatracker.ietf.org/doc/html/rfc1035}RFC 1035}. *)
198198+199199+val encode_label : string -> (string, error) result
200200+(** [encode_label label] encodes a domain label for use in DNS.
201201+202202+ If the label contains only ASCII characters, it is returned unchanged.
203203+ Otherwise, it is Punycode-encoded with the ACE prefix ("xn--") prepended,
204204+ as specified in {{:https://datatracker.ietf.org/doc/html/rfc3492#section-5}
205205+ RFC 3492 Section 5}.
206206+207207+ Returns {!Error} {!Label_too_long} if the result exceeds 63 bytes.
208208+209209+ Example:
210210+ {[
211211+ encode_label "münchen"
212212+ (* = Ok "xn--mnchen-3ya" *)
213213+ encode_label "example"
214214+ (* = Ok "example" *)
215215+ ]} *)
216216+217217+val decode_label : string -> (string, error) result
218218+(** [decode_label label] decodes a domain label.
219219+220220+ If the label starts with the ACE prefix ("xn--", case-insensitive),
221221+ it is Punycode-decoded. Otherwise, it is returned unchanged.
222222+223223+ Example:
224224+ {[
225225+ decode_label "xn--mnchen-3ya"
226226+ (* = Ok "münchen" *)
227227+ decode_label "example"
228228+ (* = Ok "example" *)
229229+ ]} *)
230230+231231+(** {1 Validation}
232232+233233+ Predicate functions for checking code point and string properties. *)
234234+235235+val is_basic : Uchar.t -> bool
236236+(** [is_basic u] is [true] if [u] is a basic code point (ASCII, < 128).
237237+238238+ Per {{:https://datatracker.ietf.org/doc/html/rfc3492#section-5}RFC 3492 Section 5},
239239+ basic code points for Punycode are the ASCII code points (0..7F). *)
240240+241241+val is_ascii_string : string -> bool
242242+(** [is_ascii_string s] is [true] if [s] contains only ASCII characters
243243+ (all bytes < 128). *)
244244+245245+val has_ace_prefix : string -> bool
246246+(** [has_ace_prefix s] is [true] if [s] starts with the ACE prefix "xn--"
247247+ (case-insensitive comparison). *)
+192
lib/punycode_idna.ml
···11+(* IDNA (Internationalized Domain Names in Applications) Implementation *)
(* Largest length, in bytes, accepted for a complete converted domain. *)
let max_domain_length = 253
44+55+(* {1 Error Types} *)
(* Failures reported by the IDNA layer; [Punycode_error] wraps an error
   coming from the Punycode module. *)
type error =
  | Punycode_error of Punycode.error
  | Invalid_label of string
  | Domain_too_long of int
  | Normalization_failed
  | Verification_failed

(* Print a human-readable description of an error. *)
let pp_error fmt err =
  let open Format in
  match err with
  | Verification_failed ->
    fprintf fmt "IDNA verification failed (round-trip mismatch)"
  | Normalization_failed ->
    fprintf fmt "Unicode normalization failed"
  | Domain_too_long actual ->
    fprintf fmt "domain too long: %d bytes (max %d)" actual max_domain_length
  | Invalid_label reason ->
    fprintf fmt "invalid label: %s" reason
  | Punycode_error inner ->
    fprintf fmt "Punycode error: %a" Punycode.pp_error inner
2525+2626+2727+(* {1 Unicode Normalization} *)
(* Normalize a UTF-8 string to NFC via [Uunf_string.normalize_utf_8]. *)
let normalize_nfc s =
  s |> Uunf_string.normalize_utf_8 `NFC
3131+3232+(* {1 Validation Helpers} *)
(* A label is an ACE label when it carries the ACE prefix. *)
let is_ace_label label =
  label |> Punycode.has_ace_prefix
3636+3737+(* Check if a label follows STD3 rules (hostname restrictions):
3838+ - Only LDH (letters, digits, hyphens)
3939+ - Cannot start or end with hyphen *)
4040+let is_std3_valid label =
4141+ let len = String.length label in
4242+ if len = 0 then false
4343+ else if label.[0] = '-' || label.[len - 1] = '-' then false
4444+ else
4545+ let rec check i =
4646+ if i >= len then true
4747+ else
4848+ let c = label.[i] in
4949+ let valid =
5050+ (c >= 'a' && c <= 'z') ||
5151+ (c >= 'A' && c <= 'Z') ||
5252+ (c >= '0' && c <= '9') ||
5353+ c = '-'
5454+ in
5555+ if valid then check (i + 1) else false
5656+ in
5757+ check 0
(* Hyphen placement check: a label with hyphens in both the third and
   fourth positions is accepted only when it is an ACE label; every
   other label passes. *)
let check_hyphen_rules label =
  let has_reserved_hyphens =
    String.length label >= 4 && label.[2] = '-' && label.[3] = '-'
  in
  (not has_reserved_hyphens) || is_ace_label label
6767+6868+(* {1 Label Operations} *)
(* Convert one label to its ASCII form.

   - An empty label is rejected with [Invalid_label].
   - An all-ASCII label is passed through after the length check, the
     optional STD3 check ([use_std3_rules]) and the optional hyphen
     check ([check_hyphens]).
   - Any other label is NFC-normalized, Punycode-encoded and prefixed
     with the ACE prefix. The encoded result is then length-checked,
     optionally hyphen-checked, and finally decoded again to verify that
     it round-trips to the normalized input.

   The 63-byte limit is applied to the label that is actually returned.
   For a non-ASCII label that is the encoded form, whose length is
   unrelated to the UTF-8 byte length of the input, so the input length
   is deliberately not checked in that branch. *)
let label_to_ascii_impl ~check_hyphens ~use_std3_rules label =
  let len = String.length label in
  if len = 0 then
    Error (Invalid_label "empty label")
  else if Punycode.is_ascii_string label then begin
    (* All ASCII - the input is the output, so its own length counts. *)
    if len > Punycode.max_label_length then
      Error (Punycode_error (Punycode.Label_too_long len))
    else if use_std3_rules && not (is_std3_valid label) then
      Error (Invalid_label "STD3 rules violation")
    else if check_hyphens && not (check_hyphen_rules label) then
      Error (Invalid_label "invalid hyphen placement")
    else
      Ok label
  end else begin
    (* Has non-ASCII - normalize and encode. *)
    let normalized = normalize_nfc label in
    match Punycode.encode_utf8 normalized with
    | Error e -> Error (Punycode_error e)
    | Ok encoded ->
      let result = Punycode.ace_prefix ^ encoded in
      let result_len = String.length result in
      if result_len > Punycode.max_label_length then
        Error (Punycode_error (Punycode.Label_too_long result_len))
      else if check_hyphens && not (check_hyphen_rules result) then
        Error (Invalid_label "invalid hyphen placement in encoded label")
      else begin
        (* Verification: decode and compare to the normalized form. *)
        match Punycode.decode_utf8 encoded with
        | Error _ -> Error Verification_failed
        | Ok decoded ->
          if String.equal decoded normalized then Ok result
          else Error Verification_failed
      end
  end
(* Public entry point for converting a single label to ASCII. Hyphen
   checking is on by default, STD3 checking is off by default. *)
let label_to_ascii ?(check_hyphens = true) ?(use_std3_rules = false) label =
  label |> label_to_ascii_impl ~check_hyphens ~use_std3_rules
(* Convert one label to Unicode: an ACE label has its four-byte prefix
   removed and the rest Punycode-decoded to UTF-8 (failures are wrapped
   in [Punycode_error]); any other label is returned unchanged. *)
let label_to_unicode label =
  if not (is_ace_label label) then Ok label
  else
    String.sub label 4 (String.length label - 4)
    |> Punycode.decode_utf8
    |> Result.map_error (fun e -> Punycode_error e)
120120+121121+(* {1 Domain Operations} *)
(* Cut a domain into its labels at every '.'. *)
let split_domain = String.split_on_char '.'

(* Reassemble labels into a domain, separated by '.'. *)
let join_labels = String.concat "."
(* Convert every label of [domain] to ASCII and rejoin them with '.'.
   The first failing label aborts the conversion with its error; a
   joined result longer than [max_domain_length] bytes is rejected with
   [Domain_too_long].

   [check_bidi], [check_joiners] and [transitional] are accepted but have
   no effect: they would require additional Unicode data. *)
let to_ascii ?(check_hyphens = true) ?(check_bidi = false)
    ?(check_joiners = false) ?(use_std3_rules = false)
    ?(transitional = false) domain =
  ignore (check_bidi : bool);
  ignore (check_joiners : bool);
  ignore (transitional : bool);
  let convert = label_to_ascii_impl ~check_hyphens ~use_std3_rules in
  let rec go rev_done = function
    | label :: rest ->
      (match convert label with
       | Ok encoded -> go (encoded :: rev_done) rest
       | Error e -> Error e)
    | [] ->
      let joined = join_labels (List.rev rev_done) in
      let total = String.length joined in
      if total > max_domain_length then Error (Domain_too_long total)
      else Ok joined
  in
  go [] (split_domain domain)
(* [to_unicode domain] converts a dotted domain name label by label.
   The domain is split with [split_domain], each label goes through
   [label_to_unicode], and the results are re-joined with [join_labels].
   Returns the first label error encountered (later labels are then not
   processed). No length check is applied to the joined result here. *)
155155+156156+let to_unicode domain =
157157+ let labels = split_domain domain in
(* [acc] collects the decoded labels in reverse order; it is reversed once
   at the end. *)
158158+ let rec process acc = function
159159+ | [] -> Ok (join_labels (List.rev acc))
160160+ | label :: rest ->
161161+ match label_to_unicode label with
162162+ | Error e -> Error e
163163+ | Ok decoded -> process (decoded :: acc) rest
164164+ in
165165+ process [] labels
166166+167167+(* {1 Domain Name Library Integration} *)
(* [domain_to_ascii ?check_hyphens ?use_std3_rules domain] is [to_ascii]
   for [Domain_name.t] values. The domain is rendered with
   [Domain_name.to_string], converted with [to_ascii] (only [check_hyphens]
   and [use_std3_rules] are forwarded; the other [to_ascii] options keep
   their defaults), and the result is parsed back with
   [Domain_name.of_string].
   Errors from [to_ascii] are returned unchanged; a parse failure
   [`Msg msg] is reported as [Error (Invalid_label msg)]. *)
168168+169169+let domain_to_ascii ?(check_hyphens = true) ?(use_std3_rules = false) domain =
170170+ let s = Domain_name.to_string domain in
171171+ match to_ascii ~check_hyphens ~use_std3_rules s with
172172+ | Error e -> Error e
173173+ | Ok ascii ->
174174+ match Domain_name.of_string ascii with
175175+ | Error (`Msg msg) -> Error (Invalid_label msg)
176176+ | Ok d -> Ok d
(* [domain_to_unicode domain] is [to_unicode] for [Domain_name.t] values.
   The domain is rendered with [Domain_name.to_string], converted with
   [to_unicode], and the result is parsed back with
   [Domain_name.of_string].
   Errors from [to_unicode] are returned unchanged; a parse failure
   [`Msg msg] is reported as [Error (Invalid_label msg)]. *)
177177+178178+let domain_to_unicode domain =
179179+ let s = Domain_name.to_string domain in
180180+ match to_unicode s with
181181+ | Error e -> Error e
182182+ | Ok unicode ->
183183+ match Domain_name.of_string unicode with
184184+ | Error (`Msg msg) -> Error (Invalid_label msg)
185185+ | Ok d -> Ok d
186186+187187+(* {1 Validation} *)
(* [is_idna_valid domain] is [true] exactly when [to_ascii domain], called
   with all optional arguments at their defaults, returns [Ok _]. The
   converted string and any error detail are discarded. *)
188188+189189+let is_idna_valid domain =
190190+ match to_ascii domain with
191191+ | Ok _ -> true
192192+ | Error _ -> false
+189
lib/punycode_idna.mli
···11+(** IDNA (Internationalized Domain Names in Applications) support.
22+33+ This module provides ToASCII and ToUnicode operations as specified
44+ in {{:https://datatracker.ietf.org/doc/html/rfc5891}RFC 5891} (IDNA 2008),
55+ using Punycode ({{:https://datatracker.ietf.org/doc/html/rfc3492}RFC 3492})
66+ for encoding.
77+88+ IDNA allows domain names to contain non-ASCII Unicode characters by
99+ encoding them using Punycode with an ACE prefix. This module handles
1010+ the conversion between Unicode domain names and their ASCII-compatible
1111+ encoding (ACE) form.
1212+1313+ {2 References}
1414+ {ul
1515+ {- {{:https://datatracker.ietf.org/doc/html/rfc5891}RFC 5891} -
1616+ Internationalized Domain Names in Applications (IDNA): Protocol}
1717+ {- {{:https://datatracker.ietf.org/doc/html/rfc5892}RFC 5892} -
1818+ The Unicode Code Points and Internationalized Domain Names for Applications (IDNA)}
1919+ {- {{:https://datatracker.ietf.org/doc/html/rfc5893}RFC 5893} -
2020+ Right-to-Left Scripts for Internationalized Domain Names for Applications (IDNA)}
2121+ {- {{:https://datatracker.ietf.org/doc/html/rfc3492}RFC 3492} -
2222+ Punycode: A Bootstring encoding of Unicode for IDNA}} *)
2323+2424+(** {1 Error Types} *)
2525+2626+type error =
2727+ | Punycode_error of Punycode.error
2828+ (** Error during Punycode encoding/decoding.
2929+ See {!Punycode.error} for details. *)
3030+ | Invalid_label of string
3131+ (** Label violates IDNA constraints. The string describes the violation.
3232+ See {{:https://datatracker.ietf.org/doc/html/rfc5891#section-4}
3333+ RFC 5891 Section 4} for label validation requirements. *)
3434+ | Domain_too_long of int
3535+ (** Domain name exceeds 253 bytes, per
3636+ {{:https://datatracker.ietf.org/doc/html/rfc1035}RFC 1035}.
3737+ The int is the actual length. *)
3838+ | Normalization_failed
3939+ (** Unicode normalization (NFC) failed.
4040+ Per {{:https://datatracker.ietf.org/doc/html/rfc5891#section-4.2.1}
4141+ RFC 5891 Section 4.2.1}, labels must be in NFC form. *)
4242+ | Verification_failed
4343+ (** ToASCII/ToUnicode verification step failed (round-trip check).
4444+ Per {{:https://datatracker.ietf.org/doc/html/rfc5891#section-4.2}
4545+ RFC 5891 Section 4.2}, the result of encoding must decode back
4646+ to the original input. *)
4747+4848+val pp_error : Format.formatter -> error -> unit
4949+(** [pp_error fmt e] pretty-prints an error. *)
5050+5151+(** {1 Constants} *)
5252+5353+val max_domain_length : int
5454+(** Maximum length of a domain name in bytes (253), per
5555+ {{:https://datatracker.ietf.org/doc/html/rfc1035}RFC 1035}. *)
5656+5757+(** {1 ToASCII Operation}
5858+5959+ Converts an internationalized domain name to its ASCII-compatible
6060+ encoding (ACE) form suitable for DNS lookup.
6161+6262+ See {{:https://datatracker.ietf.org/doc/html/rfc5891#section-4}
6363+ RFC 5891 Section 4} for the complete ToASCII specification. *)
6464+6565+val to_ascii : ?check_hyphens:bool -> ?check_bidi:bool ->
6666+ ?check_joiners:bool -> ?use_std3_rules:bool ->
6767+ ?transitional:bool -> string -> (string, error) result
6868+(** [to_ascii domain] converts an internationalized domain name to ASCII.
6969+7070+ Implements the ToASCII operation from
7171+ {{:https://datatracker.ietf.org/doc/html/rfc5891#section-4.1}RFC 5891 Section 4.1}.
7272+7373+ For each label in the domain:
7474+ 1. If all ASCII, pass through (with optional STD3 validation)
7575+ 2. Otherwise, normalize to NFC per
7676+ {{:https://datatracker.ietf.org/doc/html/rfc5891#section-4.2.1}Section 4.2.1}
7777+ and Punycode-encode with ACE prefix
7878+7979+ Optional parameters (per {{:https://datatracker.ietf.org/doc/html/rfc5891#section-4}
8080+ RFC 5891 Section 4} processing options):
8181+ - [check_hyphens]: Validate hyphen placement per
8282+ {{:https://datatracker.ietf.org/doc/html/rfc5891#section-4.2.3.1}Section 4.2.3.1}
8383+ (default: true)
8484+ - [check_bidi]: Check bidirectional text rules per
8585+ {{:https://datatracker.ietf.org/doc/html/rfc5893}RFC 5893}
8686+ (default: false, not implemented)
8787+ - [check_joiners]: Check contextual joiner rules per
8888+ {{:https://datatracker.ietf.org/doc/html/rfc5892#appendix-A.1}RFC 5892 Appendix A.1}
8989+ (default: false, not implemented)
9090+ - [use_std3_rules]: Apply STD3 hostname rules per
9191+ {{:https://datatracker.ietf.org/doc/html/rfc5891#section-4.2.3.2}Section 4.2.3.2}
9292+ (default: false)
9393+ - [transitional]: Use IDNA 2003 transitional processing
9494+ (default: false, not implemented)
9595+9696+ Example:
9797+ {[
9898+ to_ascii "münchen.example.com"
9999+ (* = Ok "xn--mnchen-3ya.example.com" *)
100100+ ]} *)
101101+102102+val label_to_ascii : ?check_hyphens:bool -> ?use_std3_rules:bool ->
103103+ string -> (string, error) result
104104+(** [label_to_ascii label] converts a single label to ASCII.
105105+106106+ This implements the core ToASCII operation for one label, as described in
107107+ {{:https://datatracker.ietf.org/doc/html/rfc5891#section-4.1}RFC 5891 Section 4.1}. *)
108108+109109+(** {1 ToUnicode Operation}
110110+111111+ Converts an ASCII-compatible encoded domain name back to Unicode.
112112+113113+ See {{:https://datatracker.ietf.org/doc/html/rfc5891#section-4.2}
114114+ RFC 5891 Section 4.2} for the complete ToUnicode specification. *)
115115+116116+val to_unicode : string -> (string, error) result
117117+(** [to_unicode domain] converts an ACE domain name to Unicode.
118118+119119+ Implements the ToUnicode operation from
120120+ {{:https://datatracker.ietf.org/doc/html/rfc5891#section-4.2}RFC 5891 Section 4.2}.
121121+122122+ For each label in the domain:
123123+ 1. If it has the ACE prefix ("xn--"), Punycode-decode it per
124124+ {{:https://datatracker.ietf.org/doc/html/rfc3492#section-6.2}RFC 3492 Section 6.2}
125125+ 2. Otherwise, pass through unchanged
126126+127127+ Example:
128128+ {[
129129+ to_unicode "xn--mnchen-3ya.example.com"
130130+ (* = Ok "münchen.example.com" *)
131131+ ]} *)
132132+133133+val label_to_unicode : string -> (string, error) result
134134+(** [label_to_unicode label] converts a single ACE label to Unicode.
135135+136136+ This implements the core ToUnicode operation for one label, as described in
137137+ {{:https://datatracker.ietf.org/doc/html/rfc5891#section-4.2}RFC 5891 Section 4.2}. *)
138138+139139+(** {1 Domain Name Integration}
140140+141141+ Functions that work with the
142142+ {{:https://github.com/hannesm/domain-name}domain-name} library types.
143143+144144+ These provide integration with the [Domain_name] module for applications
145145+ that use that library for domain name handling. *)
146146+147147+val domain_to_ascii : ?check_hyphens:bool -> ?use_std3_rules:bool ->
148148+ [`raw] Domain_name.t -> ([`raw] Domain_name.t, error) result
149149+(** [domain_to_ascii domain] converts a domain name to ASCII form.
150150+151151+ Applies {!to_ascii} to the string representation and returns the
152152+ result as a [Domain_name.t].
153153+154154+ Example:
155155+ {[
156156+ let d = Domain_name.of_string_exn "münchen.example.com" in
157157+ domain_to_ascii d
158158+ (* = Ok (Domain_name.of_string_exn "xn--mnchen-3ya.example.com") *)
159159+ ]} *)
160160+161161+val domain_to_unicode : [`raw] Domain_name.t -> ([`raw] Domain_name.t, error) result
162162+(** [domain_to_unicode domain] converts a domain name to Unicode form.
163163+164164+ Applies {!to_unicode} to the string representation and returns the
165165+ result as a [Domain_name.t]. *)
166166+167167+(** {1 Validation} *)
168168+169169+val is_idna_valid : string -> bool
170170+(** [is_idna_valid domain] checks if a domain name is valid for IDNA processing.
171171+172172+ Returns [true] if {!to_ascii} would succeed on the domain. *)
173173+174174+val is_ace_label : string -> bool
175175+(** [is_ace_label label] is [true] if the label has the ACE prefix "xn--"
176176+ (case-insensitive). This indicates the label is Punycode-encoded per
177177+ {{:https://datatracker.ietf.org/doc/html/rfc3492#section-5}RFC 3492 Section 5}. *)
178178+179179+(** {1 Normalization} *)
180180+181181+val normalize_nfc : string -> string
182182+(** [normalize_nfc s] returns the NFC-normalized form of UTF-8 string [s].
183183+184184+ Per {{:https://datatracker.ietf.org/doc/html/rfc5891#section-4.2.1}
185185+ RFC 5891 Section 4.2.1}, domain labels must be normalized to NFC
186186+ (Unicode Normalization Form C) before encoding.
187187+188188+ See {{:http://www.unicode.org/reports/tr15/}Unicode Standard Annex #15}
189189+ for details on Unicode normalization forms. *)
+37
puny.opam
···11+# This file is generated by dune, edit dune-project instead
22+opam-version: "2.0"
33+version: "0.1.0"
44+synopsis: "RFC 3492 Punycode and IDNA implementation for OCaml"
55+description: """
66+A high-quality implementation of RFC 3492 (Punycode) with IDNA support.
77+ Provides encoding and decoding of internationalized domain names,
88+ with proper Unicode normalization and mixed-case annotation support."""
99+maintainer: ["maintainer@example.com"]
1010+authors: ["Author Name"]
1111+license: "ISC"
1212+homepage: "https://github.com/username/puny"
1313+bug-reports: "https://github.com/username/puny/issues"
1414+depends: [
1515+ "ocaml" {>= "4.14.0"}
1616+ "dune" {>= "3.0" & >= "3.0"}
1717+ "uutf" {>= "1.0.0"}
1818+ "uunf" {>= "15.0.0"}
1919+ "domain-name" {>= "0.4.0"}
2020+ "alcotest" {with-test}
2121+ "odoc" {with-doc}
2222+]
2323+build: [
2424+ ["dune" "subst"] {dev}
2525+ [
2626+ "dune"
2727+ "build"
2828+ "-p"
2929+ name
3030+ "-j"
3131+ jobs
3232+ "@install"
3333+ "@runtest" {with-test}
3434+ "@doc" {with-doc}
3535+ ]
3636+]
3737+dev-repo: "git+https://github.com/username/puny.git"
+1963
spec/rfc3492.txt
···11+22+33+44+55+66+77+Network Working Group A. Costello
88+Request for Comments: 3492 Univ. of California, Berkeley
99+Category: Standards Track March 2003
1010+1111+1212+ Punycode: A Bootstring encoding of Unicode
1313+ for Internationalized Domain Names in Applications (IDNA)
1414+1515+Status of this Memo
1616+1717+ This document specifies an Internet standards track protocol for the
1818+ Internet community, and requests discussion and suggestions for
1919+ improvements. Please refer to the current edition of the "Internet
2020+ Official Protocol Standards" (STD 1) for the standardization state
2121+ and status of this protocol. Distribution of this memo is unlimited.
2222+2323+Copyright Notice
2424+2525+ Copyright (C) The Internet Society (2003). All Rights Reserved.
2626+2727+Abstract
2828+2929+ Punycode is a simple and efficient transfer encoding syntax designed
3030+ for use with Internationalized Domain Names in Applications (IDNA).
3131+ It uniquely and reversibly transforms a Unicode string into an ASCII
3232+ string. ASCII characters in the Unicode string are represented
3333+ literally, and non-ASCII characters are represented by ASCII
3434+ characters that are allowed in host name labels (letters, digits, and
3535+ hyphens). This document defines a general algorithm called
3636+ Bootstring that allows a string of basic code points to uniquely
3737+ represent any string of code points drawn from a larger set.
3838+ Punycode is an instance of Bootstring that uses particular parameter
3939+ values specified by this document, appropriate for IDNA.
4040+4141+Table of Contents
4242+4343+ 1. Introduction...............................................2
4444+ 1.1 Features..............................................2
4545+ 1.2 Interaction of protocol parts.........................3
4646+ 2. Terminology................................................3
4747+ 3. Bootstring description.....................................4
4848+ 3.1 Basic code point segregation..........................4
4949+ 3.2 Insertion unsort coding...............................4
5050+ 3.3 Generalized variable-length integers..................5
5151+ 3.4 Bias adaptation.......................................7
5252+ 4. Bootstring parameters......................................8
5353+ 5. Parameter values for Punycode..............................8
5454+ 6. Bootstring algorithms......................................9
5555+5656+5757+5858+Costello Standards Track [Page 1]
5959+6060+RFC 3492 IDNA Punycode March 2003
6161+6262+6363+ 6.1 Bias adaptation function.............................10
6464+ 6.2 Decoding procedure...................................11
6565+ 6.3 Encoding procedure...................................12
6666+ 6.4 Overflow handling....................................13
6767+ 7. Punycode examples.........................................14
6868+ 7.1 Sample strings.......................................14
6969+ 7.2 Decoding traces......................................17
7070+ 7.3 Encoding traces......................................19
7171+ 8. Security Considerations...................................20
7272+ 9. References................................................21
7373+ 9.1 Normative References.................................21
7474+ 9.2 Informative References...............................21
7575+ A. Mixed-case annotation.....................................22
7676+ B. Disclaimer and license....................................22
7777+ C. Punycode sample implementation............................23
7878+ Author's Address.............................................34
7979+ Full Copyright Statement.....................................35
8080+8181+1. Introduction
8282+8383+ [IDNA] describes an architecture for supporting internationalized
8484+ domain names. Labels containing non-ASCII characters can be
8585+ represented by ACE labels, which begin with a special ACE prefix and
8686+ contain only ASCII characters. The remainder of the label after the
8787+ prefix is a Punycode encoding of a Unicode string satisfying certain
8888+ constraints. For the details of the prefix and constraints, see
8989+ [IDNA] and [NAMEPREP].
9090+9191+ Punycode is an instance of a more general algorithm called
9292+ Bootstring, which allows strings composed from a small set of "basic"
9393+ code points to uniquely represent any string of code points drawn
9494+ from a larger set. Punycode is Bootstring with particular parameter
9595+ values appropriate for IDNA.
9696+9797+1.1 Features
9898+9999+ Bootstring has been designed to have the following features:
100100+101101+ * Completeness: Every extended string (sequence of arbitrary code
102102+ points) can be represented by a basic string (sequence of basic
103103+ code points). Restrictions on what strings are allowed, and on
104104+ length, can be imposed by higher layers.
105105+106106+ * Uniqueness: There is at most one basic string that represents a
107107+ given extended string.
108108+109109+ * Reversibility: Any extended string mapped to a basic string can
110110+ be recovered from that basic string.
111111+112112+113113+114114+Costello Standards Track [Page 2]
115115+116116+RFC 3492 IDNA Punycode March 2003
117117+118118+119119+ * Efficient encoding: The ratio of basic string length to extended
120120+ string length is small. This is important in the context of
121121+ domain names because RFC 1034 [RFC1034] restricts the length of a
122122+ domain label to 63 characters.
123123+124124+ * Simplicity: The encoding and decoding algorithms are reasonably
125125+ simple to implement. The goals of efficiency and simplicity are
126126+ at odds; Bootstring aims at a good balance between them.
127127+128128+ * Readability: Basic code points appearing in the extended string
129129+ are represented as themselves in the basic string (although the
130130+ main purpose is to improve efficiency, not readability).
131131+132132+ Punycode can also support an additional feature that is not used by
133133+ the ToASCII and ToUnicode operations of [IDNA]. When extended
134134+ strings are case-folded prior to encoding, the basic string can use
135135+ mixed case to tell how to convert the folded string into a mixed-case
136136+ string. See appendix A "Mixed-case annotation".
137137+138138+1.2 Interaction of protocol parts
139139+140140+ Punycode is used by the IDNA protocol [IDNA] for converting domain
141141+ labels into ASCII; it is not designed for any other purpose. It is
142142+ explicitly not designed for processing arbitrary free text.
143143+144144+2. Terminology
145145+146146+ The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT",
147147+ "SHOULD", "SHOULD NOT", "RECOMMENDED", "MAY", and "OPTIONAL" in this
148148+ document are to be interpreted as described in BCP 14, RFC 2119
149149+ [RFC2119].
150150+151151+ A code point is an integral value associated with a character in a
152152+ coded character set.
153153+154154+ As in the Unicode Standard [UNICODE], Unicode code points are denoted
155155+ by "U+" followed by four to six hexadecimal digits, while a range of
156156+ code points is denoted by two hexadecimal numbers separated by "..",
157157+ with no prefixes.
158158+159159+ The operators div and mod perform integer division; (x div y) is the
160160+ quotient of x divided by y, discarding the remainder, and (x mod y)
161161+ is the remainder, so (x div y) * y + (x mod y) == x. Bootstring uses
162162+ these operators only with nonnegative operands, so the quotient and
163163+ remainder are always nonnegative.
164164+165165+ The break statement jumps out of the innermost loop (as in C).
166166+167167+168168+169169+170170+Costello Standards Track [Page 3]
171171+172172+RFC 3492 IDNA Punycode March 2003
173173+174174+175175+ An overflow is an attempt to compute a value that exceeds the maximum
176176+ value of an integer variable.
177177+178178+3. Bootstring description
179179+180180+ Bootstring represents an arbitrary sequence of code points (the
181181+ "extended string") as a sequence of basic code points (the "basic
182182+ string"). This section describes the representation. Section 6
183183+ "Bootstring algorithms" presents the algorithms as pseudocode.
184184+ Sections 7.2 "Decoding traces" and 7.3 "Encoding traces" trace the
185185+ algorithms for sample inputs.
186186+187187+ The following sections describe the four techniques used in
188188+ Bootstring. "Basic code point segregation" is a very simple and
189189+ efficient encoding for basic code points occurring in the extended
190190+ string: they are simply copied all at once. "Insertion unsort
191191+ coding" encodes the non-basic code points as deltas, and processes
192192+ the code points in numerical order rather than in order of
193193+ appearance, which typically results in smaller deltas. The deltas
194194+ are represented as "generalized variable-length integers", which use
195195+ basic code points to represent nonnegative integers. The parameters
196196+ of this integer representation are dynamically adjusted using "bias
197197+ adaptation", to improve efficiency when consecutive deltas have
198198+ similar magnitudes.
199199+200200+3.1 Basic code point segregation
201201+202202+ All basic code points appearing in the extended string are
203203+ represented literally at the beginning of the basic string, in their
204204+ original order, followed by a delimiter if (and only if) the number
205205+ of basic code points is nonzero. The delimiter is a particular basic
206206+ code point, which never appears in the remainder of the basic string.
207207+ The decoder can therefore find the end of the literal portion (if
208208+ there is one) by scanning for the last delimiter.
209209+210210+3.2 Insertion unsort coding
211211+212212+ The remainder of the basic string (after the last delimiter if there
213213+ is one) represents a sequence of nonnegative integral deltas as
214214+ generalized variable-length integers, described in section 3.3. The
215215+ meaning of the deltas is best understood in terms of the decoder.
216216+217217+ The decoder builds the extended string incrementally. Initially, the
218218+ extended string is a copy of the literal portion of the basic string
219219+ (excluding the last delimiter). The decoder inserts non-basic code
220220+ points, one for each delta, into the extended string, ultimately
221221+ arriving at the final decoded string.
222222+223223+224224+225225+226226+Costello Standards Track [Page 4]
227227+228228+RFC 3492 IDNA Punycode March 2003
229229+230230+231231+ At the heart of this process is a state machine with two state
232232+ variables: an index i and a counter n. The index i refers to a
233233+ position in the extended string; it ranges from 0 (the first
234234+ position) to the current length of the extended string (which refers
235235+ to a potential position beyond the current end). If the current
236236+ state is <n,i>, the next state is <n,i+1> if i is less than the
237237+ length of the extended string, or <n+1,0> if i equals the length of
238238+ the extended string. In other words, each state change causes i to
239239+ increment, wrapping around to zero if necessary, and n counts the
240240+ number of wrap-arounds.
241241+242242+ Notice that the state always advances monotonically (there is no way
243243+ for the decoder to return to an earlier state). At each state, an
244244+ insertion is either performed or not performed. At most one
245245+ insertion is performed in a given state. An insertion inserts the
246246+ value of n at position i in the extended string. The deltas are a
247247+ run-length encoding of this sequence of events: they are the lengths
248248+ of the runs of non-insertion states preceding the insertion states.
249249+ Hence, for each delta, the decoder performs delta state changes, then
250250+ an insertion, and then one more state change. (An implementation
251251+ need not perform each state change individually, but can instead use
252252+ division and remainder calculations to compute the next insertion
253253+ state directly.) It is an error if the inserted code point is a
254254+ basic code point (because basic code points were supposed to be
255255+ segregated as described in section 3.1).
256256+257257+ The encoder's main task is to derive the sequence of deltas that will
258258+ cause the decoder to construct the desired string. It can do this by
259259+ repeatedly scanning the extended string for the next code point that
260260+ the decoder would need to insert, and counting the number of state
261261+ changes the decoder would need to perform, mindful of the fact that
262262+ the decoder's extended string will include only those code points
263263+ that have already been inserted. Section 6.3 "Encoding procedure"
264264+ gives a precise algorithm.
265265+266266+3.3 Generalized variable-length integers
267267+268268+ In a conventional integer representation the base is the number of
269269+ distinct symbols for digits, whose values are 0 through base-1. Let
270270+ digit_0 denote the least significant digit, digit_1 the next least
271271+ significant, and so on. The value represented is the sum over j of
272272+ digit_j * w(j), where w(j) = base^j is the weight (scale factor) for
273273+ position j. For example, in the base 8 integer 437, the digits are
274274+ 7, 3, and 4, and the weights are 1, 8, and 64, so the value is 7 +
275275+ 3*8 + 4*64 = 287. This representation has two disadvantages: First,
276276+ there are multiple encodings of each value (because there can be
277277+ extra zeros in the most significant positions), which is inconvenient
278278+279279+280280+281281+282282+Costello Standards Track [Page 5]
283283+284284+RFC 3492 IDNA Punycode March 2003
285285+286286+287287+ when unique encodings are needed. Second, the integer is not self-
288288+ delimiting, so if multiple integers are concatenated the boundaries
289289+ between them are lost.
290290+291291+ The generalized variable-length representation solves these two
292292+ problems. The digit values are still 0 through base-1, but now the
293293+ integer is self-delimiting by means of thresholds t(j), each of which
294294+ is in the range 0 through base-1. Exactly one digit, the most
295295+ significant, satisfies digit_j < t(j). Therefore, if several
296296+ integers are concatenated, it is easy to separate them, starting with
297297+ the first if they are little-endian (least significant digit first),
298298+ or starting with the last if they are big-endian (most significant
299299+ digit first). As before, the value is the sum over j of digit_j *
300300+ w(j), but the weights are different:
301301+302302+ w(0) = 1
303303+ w(j) = w(j-1) * (base - t(j-1)) for j > 0
304304+305305+ For example, consider the little-endian sequence of base 8 digits
306306+ 734251... Suppose the thresholds are 2, 3, 5, 5, 5, 5... This
307307+ implies that the weights are 1, 1*(8-2) = 6, 6*(8-3) = 30, 30*(8-5) =
308308+ 90, 90*(8-5) = 270, and so on. 7 is not less than 2, and 3 is not
309309+ less than 3, but 4 is less than 5, so 4 is the last digit. The value
310310+ of 734 is 7*1 + 3*6 + 4*30 = 145. The next integer is 251, with
311311+ value 2*1 + 5*6 + 1*30 = 62. Decoding this representation is very
312312+ similar to decoding a conventional integer: Start with a current
313313+ value of N = 0 and a weight w = 1. Fetch the next digit d and
314314+ increase N by d * w. If d is less than the current threshold (t)
315315+ then stop, otherwise increase w by a factor of (base - t), update t
316316+ for the next position, and repeat.
317317+318318+ Encoding this representation is similar to encoding a conventional
319319+ integer: If N < t then output one digit for N and stop, otherwise
320320+ output the digit for t + ((N - t) mod (base - t)), then replace N
321321+ with (N - t) div (base - t), update t for the next position, and
322322+ repeat.
323323+324324+ For any particular set of values of t(j), there is exactly one
325325+ generalized variable-length representation of each nonnegative
326326+ integral value.
327327+328328+ Bootstring uses little-endian ordering so that the deltas can be
329329+ separated starting with the first. The t(j) values are defined in
330330+ terms of the constants base, tmin, and tmax, and a state variable
331331+ called bias:
332332+333333+ t(j) = base * (j + 1) - bias,
334334+ clamped to the range tmin through tmax
335335+336336+337337+338338+Costello Standards Track [Page 6]
339339+340340+RFC 3492 IDNA Punycode March 2003
341341+342342+343343+ The clamping means that if the formula yields a value less than tmin
344344+ or greater than tmax, then t(j) = tmin or tmax, respectively. (In
345345+ the pseudocode in section 6 "Bootstring algorithms", the expression
346346+ base * (j + 1) is denoted by k for performance reasons.) These t(j)
347347+ values cause the representation to favor integers within a particular
348348+ range determined by the bias.
349349+350350+3.4 Bias adaptation
351351+352352+ After each delta is encoded or decoded, bias is set for the next
353353+ delta as follows:
354354+355355+ 1. Delta is scaled in order to avoid overflow in the next step:
356356+357357+ let delta = delta div 2
358358+359359+ But when this is the very first delta, the divisor is not 2, but
360360+ instead a constant called damp. This compensates for the fact
361361+ that the second delta is usually much smaller than the first.
362362+363363+ 2. Delta is increased to compensate for the fact that the next delta
364364+ will be inserting into a longer string:
365365+366366+ let delta = delta + (delta div numpoints)
367367+368368+ numpoints is the total number of code points encoded/decoded so
369369+ far (including the one corresponding to this delta itself, and
370370+ including the basic code points).
371371+372372+ 3. Delta is repeatedly divided until it falls within a threshold, to
373373+ predict the minimum number of digits needed to represent the next
374374+ delta:
375375+376376+ while delta > ((base - tmin) * tmax) div 2
377377+ do let delta = delta div (base - tmin)
378378+379379+ 4. The bias is set:
380380+381381+ let bias =
382382+ (base * the number of divisions performed in step 3) +
383383+ (((base - tmin + 1) * delta) div (delta + skew))
384384+385385+ The motivation for this procedure is that the current delta
386386+ provides a hint about the likely size of the next delta, and so
387387+ t(j) is set to tmax for the more significant digits starting with
388388+ the one expected to be last, tmin for the less significant digits
389389+ up through the one expected to be third-last, and somewhere
390390+ between tmin and tmax for the digit expected to be second-last
391391+392392+393393+394394+Costello Standards Track [Page 7]
395395+396396+RFC 3492 IDNA Punycode March 2003
397397+398398+399399+ (balancing the hope of the expected-last digit being unnecessary
400400+ against the danger of it being insufficient).
401401+402402+4. Bootstring parameters
403403+404404+ Given a set of basic code points, one needs to be designated as the
405405+ delimiter. The base cannot be greater than the number of
406406+ distinguishable basic code points remaining. The digit-values in the
407407+ range 0 through base-1 need to be associated with distinct non-
408408+ delimiter basic code points. In some cases multiple code points need
409409+ to have the same digit-value; for example, uppercase and lowercase
410410+ versions of the same letter need to be equivalent if basic strings
411411+ are case-insensitive.
412412+413413+ The initial value of n cannot be greater than the minimum non-basic
414414+ code point that could appear in extended strings.
415415+416416+ The remaining five parameters (tmin, tmax, skew, damp, and the
417417+ initial value of bias) need to satisfy the following constraints:
418418+419419+ 0 <= tmin <= tmax <= base-1
420420+ skew >= 1
421421+ damp >= 2
422422+ initial_bias mod base <= base - tmin
423423+424424+ Provided the constraints are satisfied, these five parameters affect
425425+ efficiency but not correctness. They are best chosen empirically.
426426+427427+ If support for mixed-case annotation is desired (see appendix A),
428428+ make sure that the code points corresponding to 0 through tmax-1 all
429429+ have both uppercase and lowercase forms.
430430+431431+5. Parameter values for Punycode
432432+433433+ Punycode uses the following Bootstring parameter values:
434434+435435+ base = 36
436436+ tmin = 1
437437+ tmax = 26
438438+ skew = 38
439439+ damp = 700
440440+ initial_bias = 72
441441+ initial_n = 128 = 0x80
442442+443443+ Although the only restriction Punycode imposes on the input integers
444444+ is that they be nonnegative, these parameters are especially designed
445445+ to work well with Unicode [UNICODE] code points, which are integers
446446+ in the range 0..10FFFF (but not D800..DFFF, which are reserved for
447447+448448+449449+450450+Costello Standards Track [Page 8]
451451+452452+RFC 3492 IDNA Punycode March 2003
453453+454454+455455+ use by the UTF-16 encoding of Unicode). The basic code points are
456456+ the ASCII [ASCII] code points (0..7F), of which U+002D (-) is the
457457+ delimiter, and some of the others have digit-values as follows:
458458+459459+ code points digit-values
460460+ ------------ ----------------------
461461+ 41..5A (A-Z) = 0 to 25, respectively
462462+ 61..7A (a-z) = 0 to 25, respectively
463463+ 30..39 (0-9) = 26 to 35, respectively
464464+465465+ Using hyphen-minus as the delimiter implies that the encoded string
466466+ can end with a hyphen-minus only if the Unicode string consists
467467+ entirely of basic code points, but IDNA forbids such strings from
468468+ being encoded. The encoded string can begin with a hyphen-minus, but
469469+ IDNA prepends a prefix. Therefore IDNA using Punycode conforms to
470470+ the RFC 952 rule that host name labels neither begin nor end with a
471471+ hyphen-minus [RFC952].
472472+473473+ A decoder MUST recognize the letters in both uppercase and lowercase
474474+ forms (including mixtures of both forms). An encoder SHOULD output
475475+ only uppercase forms or only lowercase forms, unless it uses mixed-
476476+ case annotation (see appendix A).
477477+478478+ Presumably most users will not manually write or type encoded strings
479479+ (as opposed to cutting and pasting them), but those who do will need
480480+ to be alert to the potential visual ambiguity between the following
481481+ sets of characters:
482482+483483+ G 6
484484+ I l 1
485485+ O 0
486486+ S 5
487487+ U V
488488+ Z 2
489489+490490+ Such ambiguities are usually resolved by context, but in a Punycode
491491+ encoded string there is no context apparent to humans.
492492+493493+6. Bootstring algorithms
494494+495495+ Some parts of the pseudocode can be omitted if the parameters satisfy
496496+ certain conditions (for which Punycode qualifies). These parts are
497497+ enclosed in {braces}, and notes immediately following the pseudocode
498498+ explain the conditions under which they can be omitted.
499499+500500+501501+502502+503503+504504+505505+506506+Costello Standards Track [Page 9]
507507+508508+RFC 3492 IDNA Punycode March 2003
509509+510510+511511+ Formally, code points are integers, and hence the pseudocode assumes
512512+ that arithmetic operations can be performed directly on code points.
513513+ In some programming languages, explicit conversion between code
514514+ points and integers might be necessary.
515515+516516+6.1 Bias adaptation function
517517+518518+ function adapt(delta,numpoints,firsttime):
519519+ if firsttime then let delta = delta div damp
520520+ else let delta = delta div 2
521521+ let delta = delta + (delta div numpoints)
522522+ let k = 0
523523+ while delta > ((base - tmin) * tmax) div 2 do begin
524524+ let delta = delta div (base - tmin)
525525+ let k = k + base
526526+ end
527527+ return k + (((base - tmin + 1) * delta) div (delta + skew))
528528+529529+ It does not matter whether the modifications to delta and k inside
530530+ adapt() affect variables of the same name inside the
531531+ encoding/decoding procedures, because after calling adapt() the
532532+ caller does not read those variables before overwriting them.
533533+534534+535535+536536+537537+538538+539539+540540+541541+542542+543543+544544+545545+546546+547547+548548+549549+550550+551551+552552+553553+554554+555555+556556+557557+558558+559559+560560+561561+562562+Costello Standards Track [Page 10]
563563+564564+RFC 3492 IDNA Punycode March 2003
565565+566566+567567+6.2 Decoding procedure
568568+569569+ let n = initial_n
570570+ let i = 0
571571+ let bias = initial_bias
572572+ let output = an empty string indexed from 0
573573+ consume all code points before the last delimiter (if there is one)
574574+ and copy them to output, fail on any non-basic code point
575575+ if more than zero code points were consumed then consume one more
576576+ (which will be the last delimiter)
577577+ while the input is not exhausted do begin
578578+ let oldi = i
579579+ let w = 1
580580+ for k = base to infinity in steps of base do begin
581581+ consume a code point, or fail if there was none to consume
582582+ let digit = the code point's digit-value, fail if it has none
583583+ let i = i + digit * w, fail on overflow
584584+ let t = tmin if k <= bias {+ tmin}, or
585585+ tmax if k >= bias + tmax, or k - bias otherwise
586586+ if digit < t then break
587587+ let w = w * (base - t), fail on overflow
588588+ end
589589+ let bias = adapt(i - oldi, length(output) + 1, test oldi is 0?)
590590+ let n = n + i div (length(output) + 1), fail on overflow
591591+ let i = i mod (length(output) + 1)
592592+ {if n is a basic code point then fail}
593593+ insert n into output at position i
594594+ increment i
595595+ end
596596+597597+ The full statement enclosed in braces (checking whether n is a basic
598598+ code point) can be omitted if initial_n exceeds all basic code points
599599+ (which is true for Punycode), because n is never less than initial_n.
600600+601601+ In the assignment of t, where t is clamped to the range tmin through
602602+ tmax, "+ tmin" can always be omitted. This makes the clamping
603603+ calculation incorrect when bias < k < bias + tmin, but that cannot
604604+ happen because of the way bias is computed and because of the
605605+ constraints on the parameters.
606606+607607+ Because the decoder state can only advance monotonically, and there
608608+ is only one representation of any delta, there is therefore only one
609609+ encoded string that can represent a given sequence of integers. The
610610+ only error conditions are invalid code points, unexpected end-of-
611611+ input, overflow, and basic code points encoded using deltas instead
612612+ of appearing literally. If the decoder fails on these errors as
613613+ shown above, then it cannot produce the same output for two distinct
614614+ inputs. Without this property it would have been necessary to re-
615615+616616+617617+618618+Costello Standards Track [Page 11]
619619+620620+RFC 3492 IDNA Punycode March 2003
621621+622622+623623+ encode the output and verify that it matches the input in order to
624624+ guarantee the uniqueness of the encoding.
625625+626626+6.3 Encoding procedure
627627+628628+ let n = initial_n
629629+ let delta = 0
630630+ let bias = initial_bias
631631+ let h = b = the number of basic code points in the input
632632+ copy them to the output in order, followed by a delimiter if b > 0
633633+ {if the input contains a non-basic code point < n then fail}
634634+ while h < length(input) do begin
635635+ let m = the minimum {non-basic} code point >= n in the input
636636+ let delta = delta + (m - n) * (h + 1), fail on overflow
637637+ let n = m
638638+ for each code point c in the input (in order) do begin
639639+ if c < n {or c is basic} then increment delta, fail on overflow
640640+ if c == n then begin
641641+ let q = delta
642642+ for k = base to infinity in steps of base do begin
643643+ let t = tmin if k <= bias {+ tmin}, or
644644+ tmax if k >= bias + tmax, or k - bias otherwise
645645+ if q < t then break
646646+ output the code point for digit t + ((q - t) mod (base - t))
647647+ let q = (q - t) div (base - t)
648648+ end
649649+ output the code point for digit q
650650+ let bias = adapt(delta, h + 1, test h equals b?)
651651+ let delta = 0
652652+ increment h
653653+ end
654654+ end
655655+ increment delta and n
656656+ end
657657+658658+ The full statement enclosed in braces (checking whether the input
659659+ contains a non-basic code point less than n) can be omitted if all
660660+ code points less than initial_n are basic code points (which is true
661661+ for Punycode if code points are unsigned).
662662+663663+ The brace-enclosed conditions "non-basic" and "or c is basic" can be
664664+ omitted if initial_n exceeds all basic code points (which is true for
665665+ Punycode), because the code point being tested is never less than
666666+ initial_n.
667667+668668+ In the assignment of t, where t is clamped to the range tmin through
669669+ tmax, "+ tmin" can always be omitted. This makes the clamping
670670+ calculation incorrect when bias < k < bias + tmin, but that cannot
671671+672672+673673+674674+Costello Standards Track [Page 12]
675675+676676+RFC 3492 IDNA Punycode March 2003
677677+678678+679679+ happen because of the way bias is computed and because of the
680680+ constraints on the parameters.
681681+682682+ The checks for overflow are necessary to avoid producing invalid
683683+ output when the input contains very large values or is very long.
684684+685685+ The increment of delta at the bottom of the outer loop cannot
686686+ overflow because delta < length(input) before the increment, and
687687+ length(input) is already assumed to be representable. The increment
688688+ of n could overflow, but only if h == length(input), in which case
689689+ the procedure is finished anyway.
690690+691691+6.4 Overflow handling
692692+693693+ For IDNA, 26-bit unsigned integers are sufficient to handle all valid
694694+ IDNA labels without overflow, because any string that needed a 27-bit
695695+ delta would have to exceed either the code point limit (0..10FFFF) or
696696+ the label length limit (63 characters). However, overflow handling
697697+ is necessary because the inputs are not necessarily valid IDNA
698698+ labels.
699699+700700+ If the programming language does not provide overflow detection, the
701701+ following technique can be used. Suppose A, B, and C are
702702+ representable nonnegative integers and C is nonzero. Then A + B
703703+ overflows if and only if B > maxint - A, and A + (B * C) overflows if
704704+ and only if B > (maxint - A) div C, where maxint is the greatest
705705+ integer for which maxint + 1 cannot be represented. Refer to
706706+ appendix C "Punycode sample implementation" for demonstrations of
707707+ this technique in the C language.
708708+709709+ The decoding and encoding algorithms shown in sections 6.2 and 6.3
710710+ handle overflow by detecting it whenever it happens. Another
711711+ approach is to enforce limits on the inputs that prevent overflow
712712+ from happening. For example, if the encoder were to verify that no
713713+ input code points exceed M and that the input length does not exceed
714714+ L, then no delta could ever exceed (M - initial_n) * (L + 1), and
715715+ hence no overflow could occur if integer variables were capable of
716716+ representing values that large. This prevention approach would
717717+ impose more restrictions on the input than the detection approach
718718+ does, but might be considered simpler in some programming languages.
719719+720720+ In theory, the decoder could use an analogous approach, limiting the
721721+ number of digits in a variable-length integer (that is, limiting the
722722+ number of iterations in the innermost loop). However, the number of
723723+ digits that suffice to represent a given delta can sometimes
724724+ represent much larger deltas (because of the adaptation), and hence
725725+ this approach would probably need integers wider than 32 bits.
726726+727727+728728+729729+730730+Costello Standards Track [Page 13]
731731+732732+RFC 3492 IDNA Punycode March 2003
733733+734734+735735+ Yet another approach for the decoder is to allow overflow to occur,
736736+ but to check the final output string by re-encoding it and comparing
737737+ to the decoder input. If and only if they do not match (using a
738738+ case-insensitive ASCII comparison) overflow has occurred. This
739739+ delayed-detection approach would not impose any more restrictions on
740740+ the input than the immediate-detection approach does, and might be
741741+ considered simpler in some programming languages.
742742+743743+ In fact, if the decoder is used only inside the IDNA ToUnicode
744744+ operation [IDNA], then it need not check for overflow at all, because
745745+ ToUnicode performs a higher level re-encoding and comparison, and a
746746+ mismatch has the same consequence as if the Punycode decoder had
747747+ failed.
748748+749749+7. Punycode examples
750750+751751+7.1 Sample strings
752752+753753+ In the Punycode encodings below, the ACE prefix is not shown.
754754+ Backslashes show where line breaks have been inserted in strings too
755755+ long for one line.
756756+757757+ The first several examples are all translations of the sentence "Why
758758+ can't they just speak in <language>?" (courtesy of Michael Kaplan's
759759+ "provincial" page [PROVINCIAL]). Word breaks and punctuation have
760760+ been removed, as is often done in domain names.
761761+762762+ (A) Arabic (Egyptian):
763763+ u+0644 u+064A u+0647 u+0645 u+0627 u+0628 u+062A u+0643 u+0644
764764+ u+0645 u+0648 u+0634 u+0639 u+0631 u+0628 u+064A u+061F
765765+ Punycode: egbpdaj6bu4bxfgehfvwxn
766766+767767+ (B) Chinese (simplified):
768768+ u+4ED6 u+4EEC u+4E3A u+4EC0 u+4E48 u+4E0D u+8BF4 u+4E2D u+6587
769769+ Punycode: ihqwcrb4cv8a8dqg056pqjye
770770+771771+ (C) Chinese (traditional):
772772+ u+4ED6 u+5011 u+7232 u+4EC0 u+9EBD u+4E0D u+8AAA u+4E2D u+6587
773773+ Punycode: ihqwctvzc91f659drss3x8bo0yb
774774+775775+ (D) Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
776776+ U+0050 u+0072 u+006F u+010D u+0070 u+0072 u+006F u+0073 u+0074
777777+ u+011B u+006E u+0065 u+006D u+006C u+0075 u+0076 u+00ED u+010D
778778+ u+0065 u+0073 u+006B u+0079
779779+ Punycode: Proprostnemluvesky-uyb24dma41a
780780+781781+782782+783783+784784+785785+786786+Costello Standards Track [Page 14]
787787+788788+RFC 3492 IDNA Punycode March 2003
789789+790790+791791+ (E) Hebrew:
792792+ u+05DC u+05DE u+05D4 u+05D4 u+05DD u+05E4 u+05E9 u+05D5 u+05D8
793793+ u+05DC u+05D0 u+05DE u+05D3 u+05D1 u+05E8 u+05D9 u+05DD u+05E2
794794+ u+05D1 u+05E8 u+05D9 u+05EA
795795+ Punycode: 4dbcagdahymbxekheh6e0a7fei0b
796796+797797+ (F) Hindi (Devanagari):
798798+ u+092F u+0939 u+0932 u+094B u+0917 u+0939 u+093F u+0928 u+094D
799799+ u+0926 u+0940 u+0915 u+094D u+092F u+094B u+0902 u+0928 u+0939
800800+ u+0940 u+0902 u+092C u+094B u+0932 u+0938 u+0915 u+0924 u+0947
801801+ u+0939 u+0948 u+0902
802802+ Punycode: i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd
803803+804804+ (G) Japanese (kanji and hiragana):
805805+ u+306A u+305C u+307F u+3093 u+306A u+65E5 u+672C u+8A9E u+3092
806806+ u+8A71 u+3057 u+3066 u+304F u+308C u+306A u+3044 u+306E u+304B
807807+ Punycode: n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa
808808+809809+ (H) Korean (Hangul syllables):
810810+ u+C138 u+ACC4 u+C758 u+BAA8 u+B4E0 u+C0AC u+B78C u+B4E4 u+C774
811811+ u+D55C u+AD6D u+C5B4 u+B97C u+C774 u+D574 u+D55C u+B2E4 u+BA74
812812+ u+C5BC u+B9C8 u+B098 u+C88B u+C744 u+AE4C
813813+ Punycode: 989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j\
814814+ psd879ccm6fea98c
815815+816816+ (I) Russian (Cyrillic):
817817+ U+043F u+043E u+0447 u+0435 u+043C u+0443 u+0436 u+0435 u+043E
818818+ u+043D u+0438 u+043D u+0435 u+0433 u+043E u+0432 u+043E u+0440
819819+ u+044F u+0442 u+043F u+043E u+0440 u+0443 u+0441 u+0441 u+043A
820820+ u+0438
821821+ Punycode: b1abfaaepdrnnbgefbaDotcwatmq2g4l
822822+823823+ (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
824824+ U+0050 u+006F u+0072 u+0071 u+0075 u+00E9 u+006E u+006F u+0070
825825+ u+0075 u+0065 u+0064 u+0065 u+006E u+0073 u+0069 u+006D u+0070
826826+ u+006C u+0065 u+006D u+0065 u+006E u+0074 u+0065 u+0068 u+0061
827827+ u+0062 u+006C u+0061 u+0072 u+0065 u+006E U+0045 u+0073 u+0070
828828+ u+0061 u+00F1 u+006F u+006C
829829+ Punycode: PorqunopuedensimplementehablarenEspaol-fmd56a
830830+831831+ (K) Vietnamese:
832832+ T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch\
833833+ <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
834834+ U+0054 u+1EA1 u+0069 u+0073 u+0061 u+006F u+0068 u+1ECD u+006B
835835+ u+0068 u+00F4 u+006E u+0067 u+0074 u+0068 u+1EC3 u+0063 u+0068
836836+ u+1EC9 u+006E u+00F3 u+0069 u+0074 u+0069 u+1EBF u+006E u+0067
837837+ U+0056 u+0069 u+1EC7 u+0074
838838+ Punycode: TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g
839839+840840+841841+842842+Costello Standards Track [Page 15]
843843+844844+RFC 3492 IDNA Punycode March 2003
845845+846846+847847+ The next several examples are all names of Japanese music artists,
848848+ song titles, and TV programs, just because the author happens to have
849849+ them handy (but Japanese is useful for providing examples of single-
850850+ row text, two-row text, ideographic text, and various mixtures
851851+ thereof).
852852+853853+ (L) 3<nen>B<gumi><kinpachi><sensei>
854854+ u+0033 u+5E74 U+0042 u+7D44 u+91D1 u+516B u+5148 u+751F
855855+ Punycode: 3B-ww4c5e180e575a65lsy2b
856856+857857+ (M) <amuro><namie>-with-SUPER-MONKEYS
858858+ u+5B89 u+5BA4 u+5948 u+7F8E u+6075 u+002D u+0077 u+0069 u+0074
859859+ u+0068 u+002D U+0053 U+0055 U+0050 U+0045 U+0052 u+002D U+004D
860860+ U+004F U+004E U+004B U+0045 U+0059 U+0053
861861+ Punycode: -with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n
862862+863863+ (N) Hello-Another-Way-<sorezore><no><basho>
864864+ U+0048 u+0065 u+006C u+006C u+006F u+002D U+0041 u+006E u+006F
865865+ u+0074 u+0068 u+0065 u+0072 u+002D U+0057 u+0061 u+0079 u+002D
866866+ u+305D u+308C u+305E u+308C u+306E u+5834 u+6240
867867+ Punycode: Hello-Another-Way--fc4qua05auwb3674vfr0b
868868+869869+ (O) <hitotsu><yane><no><shita>2
870870+ u+3072 u+3068 u+3064 u+5C4B u+6839 u+306E u+4E0B u+0032
871871+ Punycode: 2-u9tlzr9756bt3uc0v
872872+873873+ (P) Maji<de>Koi<suru>5<byou><mae>
874874+ U+004D u+0061 u+006A u+0069 u+3067 U+004B u+006F u+0069 u+3059
875875+ u+308B u+0035 u+79D2 u+524D
876876+ Punycode: MajiKoi5-783gue6qz075azm5e
877877+878878+ (Q) <pafii>de<runba>
879879+ u+30D1 u+30D5 u+30A3 u+30FC u+0064 u+0065 u+30EB u+30F3 u+30D0
880880+ Punycode: de-jg4avhby1noc0d
881881+882882+ (R) <sono><supiido><de>
883883+ u+305D u+306E u+30B9 u+30D4 u+30FC u+30C9 u+3067
884884+ Punycode: d9juau41awczczp
885885+886886+ The last example is an ASCII string that breaks the existing rules
887887+ for host name labels. (It is not a realistic example for IDNA,
888888+ because IDNA never encodes pure ASCII labels.)
889889+890890+ (S) -> $1.00 <-
891891+ u+002D u+003E u+0020 u+0024 u+0031 u+002E u+0030 u+0030 u+0020
892892+ u+003C u+002D
893893+ Punycode: -> $1.00 <--
894894+895895+896896+897897+898898+Costello Standards Track [Page 16]
899899+900900+RFC 3492 IDNA Punycode March 2003
901901+902902+903903+7.2 Decoding traces
904904+905905+ In the following traces, the evolving state of the decoder is shown
906906+ as a sequence of hexadecimal values, representing the code points in
907907+ the extended string. An asterisk appears just after the most
908908+ recently inserted code point, indicating both n (the value preceding
909909+ the asterisk) and i (the position of the value just after the
910910+ asterisk). Other numerical values are decimal.
911911+912912+ Decoding trace of example B from section 7.1:
913913+914914+ n is 128, i is 0, bias is 72
915915+ input is "ihqwcrb4cv8a8dqg056pqjye"
916916+ there is no delimiter, so extended string starts empty
917917+ delta "ihq" decodes to 19853
918918+ bias becomes 21
919919+ 4E0D *
920920+ delta "wc" decodes to 64
921921+ bias becomes 20
922922+ 4E0D 4E2D *
923923+ delta "rb" decodes to 37
924924+ bias becomes 13
925925+ 4E3A * 4E0D 4E2D
926926+ delta "4c" decodes to 56
927927+ bias becomes 17
928928+ 4E3A 4E48 * 4E0D 4E2D
929929+ delta "v8a" decodes to 599
930930+ bias becomes 32
931931+ 4E3A 4EC0 * 4E48 4E0D 4E2D
932932+ delta "8d" decodes to 130
933933+ bias becomes 23
934934+ 4ED6 * 4E3A 4EC0 4E48 4E0D 4E2D
935935+ delta "qg" decodes to 154
936936+ bias becomes 25
937937+ 4ED6 4EEC * 4E3A 4EC0 4E48 4E0D 4E2D
938938+ delta "056p" decodes to 46301
939939+ bias becomes 84
940940+ 4ED6 4EEC 4E3A 4EC0 4E48 4E0D 4E2D 6587 *
941941+ delta "qjye" decodes to 88531
942942+ bias becomes 90
943943+ 4ED6 4EEC 4E3A 4EC0 4E48 4E0D 8BF4 * 4E2D 6587
944944+945945+946946+947947+948948+949949+950950+951951+952952+953953+954954+Costello Standards Track [Page 17]
955955+956956+RFC 3492 IDNA Punycode March 2003
957957+958958+959959+ Decoding trace of example L from section 7.1:
960960+961961+ n is 128, i is 0, bias is 72
962962+ input is "3B-ww4c5e180e575a65lsy2b"
963963+ literal portion is "3B-", so extended string starts as:
964964+ 0033 0042
965965+ delta "ww4c" decodes to 62042
966966+ bias becomes 27
967967+ 0033 0042 5148 *
968968+ delta "5e" decodes to 139
969969+ bias becomes 24
970970+ 0033 0042 516B * 5148
971971+ delta "180e" decodes to 16683
972972+ bias becomes 67
973973+ 0033 5E74 * 0042 516B 5148
974974+ delta "575a" decodes to 34821
975975+ bias becomes 82
976976+ 0033 5E74 0042 516B 5148 751F *
977977+ delta "65l" decodes to 14592
978978+ bias becomes 67
979979+ 0033 5E74 0042 7D44 * 516B 5148 751F
980980+ delta "sy2b" decodes to 42088
981981+ bias becomes 84
982982+ 0033 5E74 0042 7D44 91D1 * 516B 5148 751F
983983+984984+985985+986986+987987+988988+989989+990990+991991+992992+993993+994994+995995+996996+997997+998998+999999+10001000+10011001+10021002+10031003+10041004+10051005+10061006+10071007+10081008+10091009+10101010+Costello Standards Track [Page 18]
10111011+10121012+RFC 3492 IDNA Punycode March 2003
10131013+10141014+10151015+7.3 Encoding traces
10161016+10171017+ In the following traces, code point values are hexadecimal, while
10181018+ other numerical values are decimal.
10191019+10201020+ Encoding trace of example B from section 7.1:
10211021+10221022+ bias is 72
10231023+ input is:
10241024+ 4ED6 4EEC 4E3A 4EC0 4E48 4E0D 8BF4 4E2D 6587
10251025+ there are no basic code points, so no literal portion
10261026+ next code point to insert is 4E0D
10271027+ needed delta is 19853, encodes as "ihq"
10281028+ bias becomes 21
10291029+ next code point to insert is 4E2D
10301030+ needed delta is 64, encodes as "wc"
10311031+ bias becomes 20
10321032+ next code point to insert is 4E3A
10331033+ needed delta is 37, encodes as "rb"
10341034+ bias becomes 13
10351035+ next code point to insert is 4E48
10361036+ needed delta is 56, encodes as "4c"
10371037+ bias becomes 17
10381038+ next code point to insert is 4EC0
10391039+ needed delta is 599, encodes as "v8a"
10401040+ bias becomes 32
10411041+ next code point to insert is 4ED6
10421042+ needed delta is 130, encodes as "8d"
10431043+ bias becomes 23
10441044+ next code point to insert is 4EEC
10451045+ needed delta is 154, encodes as "qg"
10461046+ bias becomes 25
10471047+ next code point to insert is 6587
10481048+ needed delta is 46301, encodes as "056p"
10491049+ bias becomes 84
10501050+ next code point to insert is 8BF4
10511051+ needed delta is 88531, encodes as "qjye"
10521052+ bias becomes 90
10531053+ output is "ihqwcrb4cv8a8dqg056pqjye"
10541054+10551055+10561056+10571057+10581058+10591059+10601060+10611061+10621062+10631063+10641064+10651065+10661066+Costello Standards Track [Page 19]
10671067+10681068+RFC 3492 IDNA Punycode March 2003
10691069+10701070+10711071+ Encoding trace of example L from section 7.1:
10721072+10731073+ bias is 72
10741074+ input is:
10751075+ 0033 5E74 0042 7D44 91D1 516B 5148 751F
10761076+ basic code points (0033, 0042) are copied to literal portion: "3B-"
10771077+ next code point to insert is 5148
10781078+ needed delta is 62042, encodes as "ww4c"
10791079+ bias becomes 27
10801080+ next code point to insert is 516B
10811081+ needed delta is 139, encodes as "5e"
10821082+ bias becomes 24
10831083+ next code point to insert is 5E74
10841084+ needed delta is 16683, encodes as "180e"
10851085+ bias becomes 67
10861086+ next code point to insert is 751F
10871087+ needed delta is 34821, encodes as "575a"
10881088+ bias becomes 82
10891089+ next code point to insert is 7D44
10901090+ needed delta is 14592, encodes as "65l"
10911091+ bias becomes 67
10921092+ next code point to insert is 91D1
10931093+ needed delta is 42088, encodes as "sy2b"
10941094+ bias becomes 84
10951095+ output is "3B-ww4c5e180e575a65lsy2b"
10961096+10971097+8. Security Considerations
10981098+10991099+ Users expect each domain name in DNS to be controlled by a single
11001100+ authority. If a Unicode string intended for use as a domain label
11011101+ could map to multiple ACE labels, then an internationalized domain
11021102+ name could map to multiple ASCII domain names, each controlled by a
11031103+ different authority, some of which could be spoofs that hijack
11041104+ service requests intended for another. Therefore Punycode is
11051105+ designed so that each Unicode string has a unique encoding.
11061106+11071107+ However, there can still be multiple Unicode representations of the
11081108+ "same" text, for various definitions of "same". This problem is
11091109+ addressed to some extent by the Unicode standard under the topic of
11101110+ canonicalization, and this work is leveraged for domain names by
11111111+ Nameprep [NAMEPREP].
11121112+11131113+11141114+11151115+11161116+11171117+11181118+11191119+11201120+11211121+11221122+Costello Standards Track [Page 20]
11231123+11241124+RFC 3492 IDNA Punycode March 2003
11251125+11261126+11271127+9. References
11281128+11291129+9.1 Normative References
11301130+11311131+ [RFC2119] Bradner, S., "Key words for use in RFCs to Indicate
11321132+ Requirement Levels", BCP 14, RFC 2119, March 1997.
11331133+11341134+9.2 Informative References
11351135+11361136+ [RFC952] Harrenstien, K., Stahl, M. and E. Feinler, "DOD Internet
11371137+ Host Table Specification", RFC 952, October 1985.
11381138+11391139+ [RFC1034] Mockapetris, P., "Domain Names - Concepts and
11401140+ Facilities", STD 13, RFC 1034, November 1987.
11411141+11421142+ [IDNA] Faltstrom, P., Hoffman, P. and A. Costello,
11431143+ "Internationalizing Domain Names in Applications
11441144+ (IDNA)", RFC 3490, March 2003.
11451145+11461146+ [NAMEPREP] Hoffman, P. and M. Blanchet, "Nameprep: A Stringprep
11471147+ Profile for Internationalized Domain Names (IDN)", RFC
11481148+ 3491, March 2003.
11491149+11501150+ [ASCII] Cerf, V., "ASCII format for Network Interchange", RFC
11511151+ 20, October 1969.
11521152+11531153+ [PROVINCIAL] Kaplan, M., "The 'anyone can be provincial!' page",
11541154+ http://www.trigeminal.com/samples/provincial.html.
11551155+11561156+ [UNICODE] The Unicode Consortium, "The Unicode Standard",
11571157+ http://www.unicode.org/unicode/standard/standard.html.
11581158+11591159+11601160+11611161+11621162+11631163+11641164+11651165+11661166+11671167+11681168+11691169+11701170+11711171+11721172+11731173+11741174+11751175+11761176+11771177+11781178+Costello Standards Track [Page 21]
11791179+11801180+RFC 3492 IDNA Punycode March 2003
11811181+11821182+11831183+A. Mixed-case annotation
11841184+11851185+ In order to use Punycode to represent case-insensitive strings,
11861186+ higher layers need to case-fold the strings prior to Punycode
11871187+ encoding. The encoded string can use mixed case as an annotation
11881188+ telling how to convert the folded string into a mixed-case string for
11891189+ display purposes. Note, however, that mixed-case annotation is not
11901190+ used by the ToASCII and ToUnicode operations specified in [IDNA], and
11911191+ therefore implementors of IDNA can disregard this appendix.
11921192+11931193+ Basic code points can use mixed case directly, because the decoder
11941194+ copies them verbatim, leaving lowercase code points lowercase, and
11951195+ leaving uppercase code points uppercase. Each non-basic code point
11961196+ is represented by a delta, which is represented by a sequence of
11971197+ basic code points, the last of which provides the annotation. If it
11981198+ is uppercase, it is a suggestion to map the non-basic code point to
11991199+ uppercase (if possible); if it is lowercase, it is a suggestion to
12001200+ map the non-basic code point to lowercase (if possible).
12011201+12021202+ These annotations do not alter the code points returned by decoders;
12031203+ the annotations are returned separately, for the caller to use or
12041204+ ignore. Encoders can accept annotations in addition to code points,
12051205+ but the annotations do not alter the output, except to influence the
12061206+ uppercase/lowercase form of ASCII letters.
12071207+12081208+ Punycode encoders and decoders need not support these annotations,
12091209+ and higher layers need not use them.
12101210+12111211+B. Disclaimer and license
12121212+12131213+ Regarding this entire document or any portion of it (including the
12141214+ pseudocode and C code), the author makes no guarantees and is not
12151215+ responsible for any damage resulting from its use. The author grants
12161216+ irrevocable permission to anyone to use, modify, and distribute it in
12171217+ any way that does not diminish the rights of anyone else to use,
12181218+ modify, and distribute it, provided that redistributed derivative
12191219+ works do not contain misleading author or version information.
12201220+ Derivative works need not be licensed under similar terms.
12211221+12221222+12231223+12241224+12251225+12261226+12271227+12281228+12291229+12301230+12311231+12321232+12331233+12341234+Costello Standards Track [Page 22]
12351235+12361236+RFC 3492 IDNA Punycode March 2003
12371237+12381238+12391239+C. Punycode sample implementation
12401240+12411241+/*
12421242+punycode.c from RFC 3492
12431243+http://www.nicemice.net/idn/
12441244+Adam M. Costello
12451245+http://www.nicemice.net/amc/
12461246+12471247+This is ANSI C code (C89) implementing Punycode (RFC 3492).
12481248+12491249+*/
12501250+12511251+12521252+/************************************************************/
12531253+/* Public interface (would normally go in its own .h file): */
12541254+12551255+#include <limits.h>
12561256+12571257+enum punycode_status {
12581258+ punycode_success,
12591259+ punycode_bad_input, /* Input is invalid. */
12601260+ punycode_big_output, /* Output would exceed the space provided. */
12611261+ punycode_overflow /* Input needs wider integers to process. */
12621262+};
12631263+12641264+#if UINT_MAX >= (1 << 26) - 1
12651265+typedef unsigned int punycode_uint;
12661266+#else
12671267+typedef unsigned long punycode_uint;
12681268+#endif
12691269+12701270+enum punycode_status punycode_encode(
12711271+ punycode_uint input_length,
12721272+ const punycode_uint input[],
12731273+ const unsigned char case_flags[],
12741274+ punycode_uint *output_length,
12751275+ char output[] );
12761276+12771277+ /* punycode_encode() converts Unicode to Punycode. The input */
12781278+ /* is represented as an array of Unicode code points (not code */
12791279+ /* units; surrogate pairs are not allowed), and the output */
12801280+ /* will be represented as an array of ASCII code points. The */
12811281+ /* output string is *not* null-terminated; it will contain */
12821282+ /* zeros if and only if the input contains zeros. (Of course */
12831283+ /* the caller can leave room for a terminator and add one if */
12841284+ /* needed.) The input_length is the number of code points in */
12851285+ /* the input. The output_length is an in/out argument: the */
12861286+ /* caller passes in the maximum number of code points that it */
12871287+12881288+12891289+12901290+Costello Standards Track [Page 23]
12911291+12921292+RFC 3492 IDNA Punycode March 2003
12931293+12941294+12951295+ /* can receive, and on successful return it will contain the */
12961296+ /* number of code points actually output. The case_flags array */
12971297+ /* holds input_length boolean values, where nonzero suggests that */
12981298+ /* the corresponding Unicode character be forced to uppercase */
12991299+ /* after being decoded (if possible), and zero suggests that */
13001300+ /* it be forced to lowercase (if possible). ASCII code points */
13011301+ /* are encoded literally, except that ASCII letters are forced */
13021302+ /* to uppercase or lowercase according to the corresponding */
13031303+ /* uppercase flags. If case_flags is a null pointer then ASCII */
13041304+ /* letters are left as they are, and other code points are */
13051305+ /* treated as if their uppercase flags were zero. The return */
13061306+ /* value can be any of the punycode_status values defined above */
13071307+ /* except punycode_bad_input; if not punycode_success, then */
13081308+ /* output_length and output might contain garbage. */
13091309+13101310+enum punycode_status punycode_decode(
13111311+ punycode_uint input_length,
13121312+ const char input[],
13131313+ punycode_uint *output_length,
13141314+ punycode_uint output[],
13151315+ unsigned char case_flags[] );
13161316+13171317+ /* punycode_decode() converts Punycode to Unicode. The input is */
13181318+ /* represented as an array of ASCII code points, and the output */
13191319+ /* will be represented as an array of Unicode code points. The */
13201320+ /* input_length is the number of code points in the input. The */
13211321+ /* output_length is an in/out argument: the caller passes in */
13221322+ /* the maximum number of code points that it can receive, and */
13231323+ /* on successful return it will contain the actual number of */
13241324+ /* code points output. The case_flags array needs room for at */
13251325+ /* least output_length values, or it can be a null pointer if the */
13261326+ /* case information is not needed. A nonzero flag suggests that */
13271327+ /* the corresponding Unicode character be forced to uppercase */
13281328+ /* by the caller (if possible), while zero suggests that it be */
13291329+ /* forced to lowercase (if possible). ASCII code points are */
13301330+ /* output already in the proper case, but their flags will be set */
13311331+ /* appropriately so that applying the flags would be harmless. */
13321332+ /* The return value can be any of the punycode_status values */
13331333+ /* defined above; if not punycode_success, then output_length, */
13341334+ /* output, and case_flags might contain garbage. On success, the */
13351335+ /* decoder will never need to write an output_length greater than */
13361336+ /* input_length, because of how the encoding is defined. */
13371337+13381338+/**********************************************************/
13391339+/* Implementation (would normally go in its own .c file): */
13401340+13411341+#include <string.h>
13421342+13431343+13441344+13451345+13461346+Costello Standards Track [Page 24]
13471347+13481348+RFC 3492 IDNA Punycode March 2003
13491349+13501350+13511351+/*** Bootstring parameters for Punycode ***/
13521352+13531353+enum { base = 36, tmin = 1, tmax = 26, skew = 38, damp = 700,
13541354+ initial_bias = 72, initial_n = 0x80, delimiter = 0x2D };
13551355+13561356+/* basic(cp) tests whether cp is a basic code point: */
13571357+#define basic(cp) ((punycode_uint)(cp) < 0x80)
13581358+13591359+/* delim(cp) tests whether cp is a delimiter: */
13601360+#define delim(cp) ((cp) == delimiter)
13611361+13621362+/* decode_digit(cp) returns the numeric value of a basic code */
13631363+/* point (for use in representing integers) in the range 0 to */
13641364+/* base-1, or base if cp does not represent a value. */
13651365+13661366+static punycode_uint decode_digit(punycode_uint cp)
13671367+{
13681368+ return cp - 48 < 10 ? cp - 22 : cp - 65 < 26 ? cp - 65 :
13691369+ cp - 97 < 26 ? cp - 97 : base;
13701370+}
13711371+13721372+/* encode_digit(d,flag) returns the basic code point whose value */
13731373+/* (when used for representing integers) is d, which needs to be in */
13741374+/* the range 0 to base-1. The lowercase form is used unless flag is */
13751375+/* nonzero, in which case the uppercase form is used. The behavior */
13761376+/* is undefined if flag is nonzero and digit d has no uppercase form. */
13771377+13781378+static char encode_digit(punycode_uint d, int flag)
13791379+{
13801380+ return d + 22 + 75 * (d < 26) - ((flag != 0) << 5);
13811381+ /* 0..25 map to ASCII a..z or A..Z */
13821382+ /* 26..35 map to ASCII 0..9 */
13831383+}
13841384+13851385+/* flagged(bcp) tests whether a basic code point is flagged */
13861386+/* (uppercase). The behavior is undefined if bcp is not a */
13871387+/* basic code point. */
13881388+13891389+#define flagged(bcp) ((punycode_uint)(bcp) - 65 < 26)
13901390+13911391+/* encode_basic(bcp,flag) forces a basic code point to lowercase */
13921392+/* if flag is zero, uppercase if flag is nonzero, and returns */
13931393+/* the resulting code point. The code point is unchanged if it */
13941394+/* is caseless. The behavior is undefined if bcp is not a basic */
13951395+/* code point. */
13961396+13971397+static char encode_basic(punycode_uint bcp, int flag)
13981398+{
13991399+14001400+14011401+14021402+Costello Standards Track [Page 25]
14031403+14041404+RFC 3492 IDNA Punycode March 2003
14051405+14061406+14071407+ bcp -= (bcp - 97 < 26) << 5;
14081408+ return bcp + ((!flag && (bcp - 65 < 26)) << 5);
14091409+}
14101410+14111411+/*** Platform-specific constants ***/
14121412+14131413+/* maxint is the maximum value of a punycode_uint variable: */
14141414+static const punycode_uint maxint = -1;
14151415+/* Because maxint is unsigned, -1 becomes the maximum value. */
14161416+14171417+/*** Bias adaptation function ***/
14181418+14191419+static punycode_uint adapt(
14201420+ punycode_uint delta, punycode_uint numpoints, int firsttime )
14211421+{
14221422+ punycode_uint k;
14231423+14241424+ delta = firsttime ? delta / damp : delta >> 1;
14251425+ /* delta >> 1 is a faster way of doing delta / 2 */
14261426+ delta += delta / numpoints;
14271427+14281428+ for (k = 0; delta > ((base - tmin) * tmax) / 2; k += base) {
14291429+ delta /= base - tmin;
14301430+ }
14311431+14321432+ return k + (base - tmin + 1) * delta / (delta + skew);
14331433+}
14341434+14351435+/*** Main encode function ***/
14361436+14371437+enum punycode_status punycode_encode(
14381438+ punycode_uint input_length,
14391439+ const punycode_uint input[],
14401440+ const unsigned char case_flags[],
14411441+ punycode_uint *output_length,
14421442+ char output[] )
14431443+{
14441444+ punycode_uint n, delta, h, b, out, max_out, bias, j, m, q, k, t;
14451445+14461446+ /* Initialize the state: */
14471447+14481448+ n = initial_n;
14491449+ delta = out = 0;
14501450+ max_out = *output_length;
14511451+ bias = initial_bias;
14521452+14531453+ /* Handle the basic code points: */
14541454+14551455+14561456+14571457+14581458+Costello Standards Track [Page 26]
14591459+14601460+RFC 3492 IDNA Punycode March 2003
14611461+14621462+14631463+ for (j = 0; j < input_length; ++j) {
14641464+ if (basic(input[j])) {
14651465+ if (max_out - out < 2) return punycode_big_output;
14661466+ output[out++] =
14671467+ case_flags ? encode_basic(input[j], case_flags[j]) : input[j];
14681468+ }
14691469+ /* else if (input[j] < n) return punycode_bad_input; */
14701470+ /* (not needed for Punycode with unsigned code points) */
14711471+ }
14721472+14731473+ h = b = out;
14741474+14751475+ /* h is the number of code points that have been handled, b is the */
14761476+ /* number of basic code points, and out is the number of characters */
14771477+ /* that have been output. */
14781478+14791479+ if (b > 0) output[out++] = delimiter;
14801480+14811481+ /* Main encoding loop: */
14821482+14831483+ while (h < input_length) {
14841484+ /* All non-basic code points < n have been */
14851485+ /* handled already. Find the next larger one: */
14861486+14871487+ for (m = maxint, j = 0; j < input_length; ++j) {
14881488+ /* if (basic(input[j])) continue; */
14891489+ /* (not needed for Punycode) */
14901490+ if (input[j] >= n && input[j] < m) m = input[j];
14911491+ }
14921492+14931493+ /* Increase delta enough to advance the decoder's */
14941494+ /* <n,i> state to <m,0>, but guard against overflow: */
14951495+14961496+ if (m - n > (maxint - delta) / (h + 1)) return punycode_overflow;
14971497+ delta += (m - n) * (h + 1);
14981498+ n = m;
14991499+15001500+ for (j = 0; j < input_length; ++j) {
15011501+ /* Punycode does not need to check whether input[j] is basic: */
15021502+ if (input[j] < n /* || basic(input[j]) */ ) {
15031503+ if (++delta == 0) return punycode_overflow;
15041504+ }
15051505+15061506+ if (input[j] == n) {
15071507+ /* Represent delta as a generalized variable-length integer: */
15081508+15091509+ for (q = delta, k = base; ; k += base) {
15101510+ if (out >= max_out) return punycode_big_output;
15111511+15121512+15131513+15141514+Costello Standards Track [Page 27]
15151515+15161516+RFC 3492 IDNA Punycode March 2003
15171517+15181518+15191519+ t = k <= bias /* + tmin */ ? tmin : /* +tmin not needed */
15201520+ k >= bias + tmax ? tmax : k - bias;
15211521+ if (q < t) break;
15221522+ output[out++] = encode_digit(t + (q - t) % (base - t), 0);
15231523+ q = (q - t) / (base - t);
15241524+ }
15251525+15261526+ output[out++] = encode_digit(q, case_flags && case_flags[j]);
15271527+ bias = adapt(delta, h + 1, h == b);
15281528+ delta = 0;
15291529+ ++h;
15301530+ }
15311531+ }
15321532+15331533+ ++delta, ++n;
15341534+ }
15351535+15361536+ *output_length = out;
15371537+ return punycode_success;
15381538+}
15391539+15401540+/*** Main decode function ***/
15411541+15421542+enum punycode_status punycode_decode(
15431543+ punycode_uint input_length,
15441544+ const char input[],
15451545+ punycode_uint *output_length,
15461546+ punycode_uint output[],
15471547+ unsigned char case_flags[] )
15481548+{
15491549+ punycode_uint n, out, i, max_out, bias,
15501550+ b, j, in, oldi, w, k, digit, t;
15511551+15521552+ /* Initialize the state: */
15531553+15541554+ n = initial_n;
15551555+ out = i = 0;
15561556+ max_out = *output_length;
15571557+ bias = initial_bias;
15581558+15591559+ /* Handle the basic code points: Let b be the number of input code */
15601560+ /* points before the last delimiter, or 0 if there is none, then */
15611561+ /* copy the first b code points to the output. */
15621562+15631563+ for (b = j = 0; j < input_length; ++j) if (delim(input[j])) b = j;
15641564+ if (b > max_out) return punycode_big_output;
15651565+15661566+ for (j = 0; j < b; ++j) {
15671567+15681568+15691569+15701570+Costello Standards Track [Page 28]
15711571+15721572+RFC 3492 IDNA Punycode March 2003
15731573+15741574+15751575+ if (case_flags) case_flags[out] = flagged(input[j]);
15761576+ if (!basic(input[j])) return punycode_bad_input;
15771577+ output[out++] = input[j];
15781578+ }
15791579+15801580+ /* Main decoding loop: Start just after the last delimiter if any */
15811581+ /* basic code points were copied; start at the beginning otherwise. */
15821582+15831583+ for (in = b > 0 ? b + 1 : 0; in < input_length; ++out) {
15841584+15851585+ /* in is the index of the next character to be consumed, and */
15861586+ /* out is the number of code points in the output array. */
15871587+15881588+ /* Decode a generalized variable-length integer into delta, */
15891589+ /* which gets added to i. The overflow checking is easier */
15901590+ /* if we increase i as we go, then subtract off its starting */
15911591+ /* value at the end to obtain delta. */
15921592+15931593+ for (oldi = i, w = 1, k = base; ; k += base) {
15941594+ if (in >= input_length) return punycode_bad_input;
15951595+ digit = decode_digit(input[in++]);
15961596+ if (digit >= base) return punycode_bad_input;
15971597+ if (digit > (maxint - i) / w) return punycode_overflow;
15981598+ i += digit * w;
15991599+ t = k <= bias /* + tmin */ ? tmin : /* +tmin not needed */
16001600+ k >= bias + tmax ? tmax : k - bias;
16011601+ if (digit < t) break;
16021602+ if (w > maxint / (base - t)) return punycode_overflow;
16031603+ w *= (base - t);
16041604+ }
16051605+16061606+ bias = adapt(i - oldi, out + 1, oldi == 0);
16071607+16081608+ /* i was supposed to wrap around from out+1 to 0, */
16091609+ /* incrementing n each time, so we'll fix that now: */
16101610+16111611+ if (i / (out + 1) > maxint - n) return punycode_overflow;
16121612+ n += i / (out + 1);
16131613+ i %= (out + 1);
16141614+16151615+ /* Insert n at position i of the output: */
16161616+16171617+ /* not needed for Punycode: */
16181618+ /* if (basic(n)) return punycode_bad_input; */
16191619+ if (out >= max_out) return punycode_big_output;
16201620+16211621+ if (case_flags) {
16221622+ memmove(case_flags + i + 1, case_flags + i, out - i);
16231623+16241624+16251625+16261626+Costello Standards Track [Page 29]
16271627+16281628+RFC 3492 IDNA Punycode March 2003
16291629+16301630+16311631+ /* Case of last character determines uppercase flag: */
16321632+ case_flags[i] = flagged(input[in - 1]);
16331633+ }
16341634+16351635+ memmove(output + i + 1, output + i, (out - i) * sizeof *output);
16361636+ output[i++] = n;
16371637+ }
16381638+16391639+ *output_length = out;
16401640+ return punycode_success;
16411641+}
16421642+16431643+/******************************************************************/
16441644+/* Wrapper for testing (would normally go in a separate .c file): */
16451645+16461646+#include <assert.h>
16471647+#include <stdio.h>
16481648+#include <stdlib.h>
16491649+#include <string.h>
16501650+16511651+/* For testing, we'll just set some compile-time limits rather than */
16521652+/* use malloc(), and set a compile-time option rather than using a */
16531653+/* command-line option. */
16541654+16551655+enum {
16561656+ unicode_max_length = 256,
16571657+ ace_max_length = 256
16581658+};
16591659+16601660+static void usage(char **argv)
16611661+{
16621662+ fprintf(stderr,
16631663+ "\n"
16641664+ "%s -e reads code points and writes a Punycode string.\n"
16651665+ "%s -d reads a Punycode string and writes code points.\n"
16661666+ "\n"
16671667+ "Input and output are plain text in the native character set.\n"
16681668+ "Code points are in the form u+hex separated by whitespace.\n"
16691669+ "Although the specification allows Punycode strings to contain\n"
16701670+ "any characters from the ASCII repertoire, this test code\n"
16711671+ "supports only the printable characters, and needs the Punycode\n"
16721672+ "string to be followed by a newline.\n"
16731673+ "The case of the u in u+hex is the force-to-uppercase flag.\n"
16741674+ , argv[0], argv[0]);
16751675+ exit(EXIT_FAILURE);
16761676+}
16771677+16781678+static void fail(const char *msg)
16791679+16801680+16811681+16821682+Costello Standards Track [Page 30]
16831683+16841684+RFC 3492 IDNA Punycode March 2003
16851685+16861686+16871687+{
16881688+ fputs(msg,stderr);
16891689+ exit(EXIT_FAILURE);
16901690+}
16911691+16921692+static const char too_big[] =
16931693+ "input or output is too large, recompile with larger limits\n";
16941694+static const char invalid_input[] = "invalid input\n";
16951695+static const char overflow[] = "arithmetic overflow\n";
16961696+static const char io_error[] = "I/O error\n";
16971697+16981698+/* The following string is used to convert printable */
16991699+/* characters between ASCII and the native charset: */
17001700+17011701+static const char print_ascii[] =
17021702+ "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n"
17031703+ "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n"
17041704+ " !\"#$%&'()*+,-./"
17051705+ "0123456789:;<=>?"
17061706+ "@ABCDEFGHIJKLMNO"
17071707+ "PQRSTUVWXYZ[\\]^_"
17081708+ "`abcdefghijklmno"
17091709+ "pqrstuvwxyz{|}~\n";
17101710+17111711+int main(int argc, char **argv)
17121712+{
17131713+ enum punycode_status status;
17141714+ int r;
17151715+ unsigned int input_length, output_length, j;
17161716+ unsigned char case_flags[unicode_max_length];
17171717+17181718+ if (argc != 2) usage(argv);
17191719+ if (argv[1][0] != '-') usage(argv);
17201720+ if (argv[1][2] != 0) usage(argv);
17211721+17221722+ if (argv[1][1] == 'e') {
17231723+ punycode_uint input[unicode_max_length];
17241724+ unsigned long codept;
17251725+ char output[ace_max_length+1], uplus[3];
17261726+ int c;
17271727+17281728+ /* Read the input code points: */
17291729+17301730+ input_length = 0;
17311731+17321732+ for (;;) {
17331733+ r = scanf("%2s%lx", uplus, &codept);
17341734+ if (ferror(stdin)) fail(io_error);
17351735+17361736+17371737+17381738+Costello Standards Track [Page 31]
17391739+17401740+RFC 3492 IDNA Punycode March 2003
17411741+17421742+17431743+ if (r == EOF || r == 0) break;
17441744+17451745+ if (r != 2 || uplus[1] != '+' || codept > (punycode_uint)-1) {
17461746+ fail(invalid_input);
17471747+ }
17481748+17491749+ if (input_length == unicode_max_length) fail(too_big);
17501750+17511751+ if (uplus[0] == 'u') case_flags[input_length] = 0;
17521752+ else if (uplus[0] == 'U') case_flags[input_length] = 1;
17531753+ else fail(invalid_input);
17541754+17551755+ input[input_length++] = codept;
17561756+ }
17571757+17581758+ /* Encode: */
17591759+17601760+ output_length = ace_max_length;
17611761+ status = punycode_encode(input_length, input, case_flags,
17621762+ &output_length, output);
17631763+ if (status == punycode_bad_input) fail(invalid_input);
17641764+ if (status == punycode_big_output) fail(too_big);
17651765+ if (status == punycode_overflow) fail(overflow);
17661766+ assert(status == punycode_success);
17671767+17681768+ /* Convert to native charset and output: */
17691769+17701770+ for (j = 0; j < output_length; ++j) {
17711771+ c = output[j];
17721772+ assert(c >= 0 && c <= 127);
17731773+ if (print_ascii[c] == 0) fail(invalid_input);
17741774+ output[j] = print_ascii[c];
17751775+ }
17761776+17771777+ output[j] = 0;
17781778+ r = puts(output);
17791779+ if (r == EOF) fail(io_error);
17801780+ return EXIT_SUCCESS;
17811781+ }
17821782+17831783+ if (argv[1][1] == 'd') {
17841784+ char input[ace_max_length+2], *p, *pp;
17851785+ punycode_uint output[unicode_max_length];
17861786+17871787+ /* Read the Punycode input string and convert to ASCII: */
17881788+17891789+ fgets(input, ace_max_length+2, stdin);
17901790+ if (ferror(stdin)) fail(io_error);
17911791+17921792+17931793+17941794+Costello Standards Track [Page 32]
17951795+17961796+RFC 3492 IDNA Punycode March 2003
17971797+17981798+17991799+ if (feof(stdin)) fail(invalid_input);
18001800+ input_length = strlen(input) - 1;
18011801+ if (input[input_length] != '\n') fail(too_big);
18021802+ input[input_length] = 0;
18031803+18041804+ for (p = input; *p != 0; ++p) {
18051805+ pp = strchr(print_ascii, *p);
18061806+ if (pp == 0) fail(invalid_input);
18071807+ *p = pp - print_ascii;
18081808+ }
18091809+18101810+ /* Decode: */
18111811+18121812+ output_length = unicode_max_length;
18131813+ status = punycode_decode(input_length, input, &output_length,
18141814+ output, case_flags);
18151815+ if (status == punycode_bad_input) fail(invalid_input);
18161816+ if (status == punycode_big_output) fail(too_big);
18171817+ if (status == punycode_overflow) fail(overflow);
18181818+ assert(status == punycode_success);
18191819+18201820+ /* Output the result: */
18211821+18221822+ for (j = 0; j < output_length; ++j) {
18231823+ r = printf("%s+%04lX\n",
18241824+ case_flags[j] ? "U" : "u",
18251825+ (unsigned long) output[j] );
18261826+ if (r < 0) fail(io_error);
18271827+ }
18281828+18291829+ return EXIT_SUCCESS;
18301830+ }
18311831+18321832+ usage(argv);
18331833+ return EXIT_SUCCESS; /* not reached, but quiets compiler warning */
18341834+}
18351835+18361836+18371837+18381838+18391839+18401840+18411841+18421842+18431843+18441844+18451845+18461846+18471847+18481848+18491849+18501850+Costello Standards Track [Page 33]
18511851+18521852+RFC 3492 IDNA Punycode March 2003
18531853+18541854+18551855+Author's Address
18561856+18571857+ Adam M. Costello
18581858+ University of California, Berkeley
18591859+ http://www.nicemice.net/amc/
18601860+18611861+18621862+18631863+18641864+18651865+18661866+18671867+18681868+18691869+18701870+18711871+18721872+18731873+18741874+18751875+18761876+18771877+18781878+18791879+18801880+18811881+18821882+18831883+18841884+18851885+18861886+18871887+18881888+18891889+18901890+18911891+18921892+18931893+18941894+18951895+18961896+18971897+18981898+18991899+19001900+19011901+19021902+19031903+19041904+19051905+19061906+Costello Standards Track [Page 34]
19071907+19081908+RFC 3492 IDNA Punycode March 2003
19091909+19101910+19111911+Full Copyright Statement
19121912+19131913+ Copyright (C) The Internet Society (2003). All Rights Reserved.
19141914+19151915+ This document and translations of it may be copied and furnished to
19161916+ others, and derivative works that comment on or otherwise explain it
19171917+ or assist in its implementation may be prepared, copied, published
19181918+ and distributed, in whole or in part, without restriction of any
19191919+ kind, provided that the above copyright notice and this paragraph are
19201920+ included on all such copies and derivative works. However, this
19211921+ document itself may not be modified in any way, such as by removing
19221922+ the copyright notice or references to the Internet Society or other
19231923+ Internet organizations, except as needed for the purpose of
19241924+ developing Internet standards in which case the procedures for
19251925+ copyrights defined in the Internet Standards process must be
19261926+ followed, or as required to translate it into languages other than
19271927+ English.
19281928+19291929+ The limited permissions granted above are perpetual and will not be
19301930+ revoked by the Internet Society or its successors or assigns.
19311931+19321932+ This document and the information contained herein is provided on an
19331933+ "AS IS" basis and THE INTERNET SOCIETY AND THE INTERNET ENGINEERING
19341934+ TASK FORCE DISCLAIMS ALL WARRANTIES, EXPRESS OR IMPLIED, INCLUDING
19351935+ BUT NOT LIMITED TO ANY WARRANTY THAT THE USE OF THE INFORMATION
19361936+ HEREIN WILL NOT INFRINGE ANY RIGHTS OR ANY IMPLIED WARRANTIES OF
19371937+ MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.
19381938+19391939+Acknowledgement
19401940+19411941+ Funding for the RFC Editor function is currently provided by the
19421942+ Internet Society.
19431943+19441944+19451945+19461946+19471947+19481948+19491949+19501950+19511951+19521952+19531953+19541954+19551955+19561956+19571957+19581958+19591959+19601960+19611961+19621962+Costello Standards Track [Page 35]
19631963+