Punycode (RFC3492) in OCaml

init

+3721
+1
.gitignore
··· 1 + _build
+25
dune-project
··· 1 + (lang dune 3.0) 2 + (name puny) 3 + (version 0.1.0) 4 + 5 + (generate_opam_files true) 6 + 7 + (source (github username/puny)) 8 + (license ISC) 9 + (authors "Author Name") 10 + (maintainers "maintainer@example.com") 11 + 12 + (package 13 + (name puny) 14 + (synopsis "RFC 3492 Punycode and IDNA implementation for OCaml") 15 + (description 16 + "A high-quality implementation of RFC 3492 (Punycode) with IDNA support. 17 + Provides encoding and decoding of internationalized domain names, 18 + with proper Unicode normalization and mixed-case annotation support.") 19 + (depends 20 + (ocaml (>= 4.14.0)) 21 + (dune (>= 3.0)) 22 + (uutf (>= 1.0.0)) 23 + (uunf (>= 15.0.0)) 24 + (domain-name (>= 0.4.0)) 25 + (alcotest :with-test)))
+5
lib/dune
··· 1 + (library 2 + (name puny) 3 + (public_name puny) 4 + (modules punycode punycode_idna) 5 + (libraries uutf uunf uunf.string domain-name))
+512
lib/punycode.ml
(* RFC 3492 Punycode Implementation *)

(* {1 Bootstring Parameters for Punycode (RFC 3492 Section 5)} *)

let base = 36
let tmin = 1
let tmax = 26
let skew = 38
let damp = 700
let initial_bias = 72
let initial_n = 0x80 (* 128 *)
let delimiter = '-'
let ace_prefix = "xn--"
let max_label_length = 63

(* {1 Position Tracking} *)

type position = {
  byte_offset : int;
  char_index : int;
}

let position_byte_offset pos = pos.byte_offset
let position_char_index pos = pos.char_index

let pp_position fmt pos =
  Format.fprintf fmt "byte %d, char %d" pos.byte_offset pos.char_index

(* {1 Error Types} *)

type error =
  | Overflow of position
  | Invalid_character of position * Uchar.t
  | Invalid_digit of position * char
  | Unexpected_end of position
  | Invalid_utf8 of position
  | Label_too_long of int
  | Empty_label

let pp_error fmt = function
  | Overflow pos ->
      Format.fprintf fmt "arithmetic overflow at %a" pp_position pos
  | Invalid_character (pos, u) ->
      Format.fprintf fmt "invalid character U+%04X at %a"
        (Uchar.to_int u) pp_position pos
  | Invalid_digit (pos, c) ->
      Format.fprintf fmt "invalid Punycode digit '%c' (0x%02X) at %a"
        c (Char.code c) pp_position pos
  | Unexpected_end pos ->
      Format.fprintf fmt "unexpected end of input at %a" pp_position pos
  | Invalid_utf8 pos ->
      Format.fprintf fmt "invalid UTF-8 sequence at %a" pp_position pos
  | Label_too_long len ->
      Format.fprintf fmt "label too long: %d bytes (max %d)" len max_label_length
  | Empty_label ->
      Format.fprintf fmt "empty label"

(* Internal control flow only: raised by the helpers below and always
   converted back into [Error _] by [encode_impl] / [decode_impl], so it
   never escapes this module. *)
exception Failed of error

(* {1 Case Flags} *)

type case_flag = Uppercase | Lowercase

(* {1 Basic Predicates} *)

(* A basic code point is an ASCII code point (below 0x80). *)
let is_basic u = Uchar.to_int u < 0x80

(* [true] iff every byte of [s] is below 0x80 (also true for ""). *)
let is_ascii_string s = String.for_all (fun c -> Char.code c < 0x80) s

(* [true] iff [s] starts with "xn--", compared case-insensitively. *)
let has_ace_prefix s =
  let len = String.length s in
  len >= 4 &&
  (s.[0] = 'x' || s.[0] = 'X') &&
  (s.[1] = 'n' || s.[1] = 'N') &&
  s.[2] = '-' && s.[3] = '-'

(* {1 Digit Encoding/Decoding (RFC 3492 Section 5)}

   Digit values:
   - 0-25: a-z (or A-Z)
   - 26-35: 0-9
*)

(* Map a digit value (0..35) to its character. [case_flag] only affects
   letter digits (values below 26). *)
let encode_digit d case_flag =
  if d < 26 then
    Char.chr (d + (if case_flag = Uppercase then 0x41 else 0x61))
  else
    Char.chr (d - 26 + 0x30)

(* Map a character to its digit value, accepting letters in either case;
   [None] for anything that is not an ASCII letter or digit. *)
let decode_digit c =
  let code = Char.code c in
  if code >= 0x30 && code <= 0x39 then
    Some (code - 0x30 + 26)  (* '0'-'9' -> 26-35 *)
  else if code >= 0x41 && code <= 0x5A then
    Some (code - 0x41)       (* 'A'-'Z' -> 0-25 *)
  else if code >= 0x61 && code <= 0x7A then
    Some (code - 0x61)       (* 'a'-'z' -> 0-25 *)
  else
    None

(* Check if a character is "flagged" (uppercase) for case annotation *)
let is_flagged c =
  let code = Char.code c in
  code >= 0x41 && code <= 0x5A  (* 'A'-'Z' *)

(* Force an ASCII letter code [c] to the case requested by [flag];
   non-letter codes are returned unchanged. *)
let apply_case c flag =
  if c >= 0x41 && c <= 0x5A then
    (if flag = Lowercase then c + 0x20 else c)
  else if c >= 0x61 && c <= 0x7A then
    (if flag = Uppercase then c - 0x20 else c)
  else
    c

(* {1 Bias Adaptation (RFC 3492 Section 6.1)} *)

let adapt ~delta ~numpoints ~firsttime =
  let delta = if firsttime then delta / damp else delta / 2 in
  let delta = delta + (delta / numpoints) in
  let threshold = ((base - tmin) * tmax) / 2 in
  let rec loop delta k =
    if delta > threshold then
      loop (delta / (base - tmin)) (k + base)
    else
      k + (((base - tmin + 1) * delta) / (delta + skew))
  in
  loop delta 0

(* Threshold t(k) of the variable-length integer at digit position [k]
   for the current [bias], clamped to [tmin, tmax]. Shared by the encoder
   and the decoder. *)
let digit_threshold k bias =
  if k <= bias then tmin
  else if k >= bias + tmax then tmax
  else k - bias

(* {1 Overflow-Safe Arithmetic}

   RFC 3492 Section 6.4: Use detection to avoid overflow.
   A + B overflows iff B > maxint - A
   A + B*C overflows iff B > (maxint - A) / C

   OCaml integers wrap around silently, so every check is performed
   before the operation rather than by inspecting the result.
*)

(* [a + b * c] for non-negative operands, or [Failed (Overflow pos)]. *)
let mul_add_exn a b c pos =
  if c <> 0 && b > (max_int - a) / c then raise (Failed (Overflow pos))
  else a + b * c

(* [a + 1], or [Failed (Overflow pos)] when [a] is already [max_int]. *)
let succ_exn a pos =
  if a = max_int then raise (Failed (Overflow pos)) else a + 1

(* {1 UTF-8 to Code Points Conversion} *)

(* Decode [s] as UTF-8. The first malformed sequence is reported as
   [Invalid_utf8] with its byte offset and character index. *)
let utf8_to_codepoints s =
  let len = String.length s in
  let rec go byte_offset char_index acc =
    if byte_offset >= len then
      Ok (Array.of_list (List.rev acc))
    else
      let dec = String.get_utf_8_uchar s byte_offset in
      if Uchar.utf_decode_is_valid dec then
        go (byte_offset + Uchar.utf_decode_length dec) (char_index + 1)
          (Uchar.utf_decode_uchar dec :: acc)
      else
        Error (Invalid_utf8 { byte_offset; char_index })
  in
  go 0 0 []

(* {1 Code Points to UTF-8 Conversion} *)

let codepoints_to_utf8 codepoints =
  let buf = Buffer.create (Array.length codepoints * 2) in
  Array.iter (Buffer.add_utf_8_uchar buf) codepoints;
  Buffer.contents buf

(* {1 Punycode Encoding (RFC 3492 Section 6.3)} *)

(* Append [delta] to [output] as a generalized variable-length integer.
   Intermediate digits are always lowercase; only the final digit carries
   the case annotation [final_case]. *)
let emit_delta output ~bias ~final_case delta =
  let rec go q k =
    let t = digit_threshold k bias in
    if q < t then
      Buffer.add_char output (encode_digit q final_case)
    else begin
      Buffer.add_char output
        (encode_digit (t + ((q - t) mod (base - t))) Lowercase);
      go ((q - t) / (base - t)) (k + base)
    end
  in
  go delta base

(* Shared encoder. With [case_flags = None] basic code points are copied
   verbatim, as the RFC 3492 encoding procedure specifies. With
   [Some flags], ASCII letters are forced to the flagged case and the
   final digit of each delta carries the flag of the code point it
   encodes (RFC 3492 Appendix A). *)
let encode_impl codepoints case_flags =
  let input_length = Array.length codepoints in
  if input_length = 0 then Ok ""
  else
    try
      let case_of j =
        match case_flags with
        | Some flags -> flags.(j)
        | None -> Lowercase
      in
      let output = Buffer.create (input_length * 2) in

      (* Copy basic code points to output *)
      let basic_count = ref 0 in
      Array.iteri
        (fun j cp ->
          if is_basic cp then begin
            let c = Uchar.to_int cp in
            let c' =
              match case_flags with
              | None -> c
              | Some flags -> apply_case c flags.(j)
            in
            Buffer.add_char output (Char.chr c');
            incr basic_count
          end)
        codepoints;

      let b = !basic_count in
      (* Add delimiter if there were basic code points *)
      if b > 0 then Buffer.add_char output delimiter;

      (* Main encoding loop: [h] counts the code points handled so far *)
      let h = ref b in
      let n = ref initial_n in
      let delta = ref 0 in
      let bias = ref initial_bias in

      while !h < input_length do
        (* Find minimum code point >= n *)
        let m =
          Array.fold_left
            (fun m u ->
              let cp = Uchar.to_int u in
              if cp >= !n && cp < m then cp else m)
            max_int codepoints
        in

        (* Increase delta to advance state to <m, 0> *)
        delta :=
          mul_add_exn !delta (m - !n) (!h + 1)
            { byte_offset = 0; char_index = !h };
        n := m;

        Array.iteri
          (fun j u ->
            let cp = Uchar.to_int u in
            if cp < !n then
              delta := succ_exn !delta { byte_offset = 0; char_index = j }
            else if cp = !n then begin
              emit_delta output ~bias:!bias ~final_case:(case_of j) !delta;
              bias :=
                adapt ~delta:!delta ~numpoints:(!h + 1) ~firsttime:(!h = b);
              delta := 0;
              incr h
            end)
          codepoints;

        (* Cannot overflow: this pass contained a code point equal to [n],
           which reset [delta] to 0, so [delta] is below [input_length]. *)
        incr delta;
        incr n
      done;

      Ok (Buffer.contents output)
    with Failed e -> Error e

let encode codepoints =
  encode_impl codepoints None

let encode_with_case codepoints case_flags =
  if Array.length codepoints <> Array.length case_flags then
    invalid_arg "encode_with_case: array lengths must match";
  encode_impl codepoints (Some case_flags)

(* {1 Punycode Decoding (RFC 3492 Section 6.2)} *)

(* Copy of [arr] with [x] inserted at index [idx]
   (requires [0 <= idx <= Array.length arr]). *)
let insert_at arr idx x =
  let len = Array.length arr in
  let out = Array.make (len + 1) x in
  Array.blit arr 0 out 0 idx;
  Array.blit arr idx out (idx + 1) (len - idx);
  out

(* Shared decoder returning the code points together with one case flag
   per code point. The flag of a basic code point is its own case; the
   flag of a decoded code point is the case of the final digit of its
   delta. *)
let decode_impl input =
  let input_length = String.length input in
  if input_length = 0 then Ok ([||], [||])
  else
    try
      (* Everything before the last delimiter is the literal basic part.
         A delimiter at index 0 counts as "no basic part", as in the RFC
         reference implementation. *)
      let b =
        match String.rindex_opt input delimiter with
        | Some j -> j
        | None -> 0
      in

      (* Copy basic code points and extract case flags *)
      let basic =
        Array.init b (fun j ->
          let code = Char.code input.[j] in
          if code >= 0x80 then
            raise
              (Failed
                 (Invalid_character
                    ({ byte_offset = j; char_index = j }, Uchar.of_int code)));
          Uchar.of_int code)
      in
      let basic_case =
        Array.init b (fun j ->
          if is_flagged input.[j] then Uppercase else Lowercase)
      in

      (* Main decoding loop *)
      let output = ref basic in
      let case_output = ref basic_case in
      let n = ref initial_n in
      let i = ref 0 in
      let bias = ref initial_bias in
      let in_pos = ref (if b > 0 then b + 1 else 0) in

      while !in_pos < input_length do
        let oldi = !i in
        let out_len = Array.length !output in

        (* Decode one variable-length integer into [i]; [w] is the weight
           of the current digit and [k] its position. *)
        let rec read_digits w k =
          let pos = { byte_offset = !in_pos; char_index = out_len } in
          if !in_pos >= input_length then raise (Failed (Unexpected_end pos));
          let c = input.[!in_pos] in
          incr in_pos;
          match decode_digit c with
          | None -> raise (Failed (Invalid_digit (pos, c)))
          | Some digit ->
              (* i = i + digit * w, with overflow check *)
              i := mul_add_exn !i digit w pos;
              let t = digit_threshold k !bias in
              (* digit < t marks the final digit of this delta *)
              if digit >= t then begin
                (* w = w * (base - t), with overflow check *)
                let radix = base - t in
                if w > max_int / radix then raise (Failed (Overflow pos));
                read_digits (w * radix) (k + base)
              end
        in
        read_digits 1 base;

        bias :=
          adapt ~delta:(!i - oldi) ~numpoints:(out_len + 1)
            ~firsttime:(oldi = 0);

        let pos = { byte_offset = !in_pos - 1; char_index = out_len } in

        (* n = n + i / (out_len + 1), with overflow check *)
        let increment = !i / (out_len + 1) in
        if increment > max_int - !n then raise (Failed (Overflow pos));
        n := !n + increment;
        i := !i mod (out_len + 1);

        (* [n] must be a Unicode scalar value. It cannot be wrapped in a
           [Uchar.t] when it is not, so U+FFFD is reported in its place. *)
        if not (Uchar.is_valid !n) then
          raise (Failed (Invalid_character (pos, Uchar.rep)));

        (* Case flag from the final digit of this delta *)
        let flag =
          if is_flagged input.[!in_pos - 1] then Uppercase else Lowercase
        in

        (* Insert n at position i *)
        output := insert_at !output !i (Uchar.of_int !n);
        case_output := insert_at !case_output !i flag;
        incr i
      done;

      Ok (!output, !case_output)
    with Failed e -> Error e

let decode input =
  Result.map fst (decode_impl input)

let decode_with_case input =
  decode_impl input

(* {1 UTF-8 String Operations} *)

let encode_utf8 s =
  Result.bind (utf8_to_codepoints s) encode

let decode_utf8 punycode =
  Result.map codepoints_to_utf8 (decode punycode)

(* {1 Domain Label Operations} *)

(* ASCII labels are returned unchanged; other labels are Punycode-encoded
   and prefixed with [ace_prefix]. Either way the result must fit in
   [max_label_length] bytes. *)
let encode_label label =
  if String.length label = 0 then
    Error Empty_label
  else if is_ascii_string label then begin
    (* All ASCII - return as-is, but check length *)
    let len = String.length label in
    if len > max_label_length then
      Error (Label_too_long len)
    else
      Ok label
  end else begin
    (* Has non-ASCII - encode with Punycode *)
    match encode_utf8 label with
    | Error e -> Error e
    | Ok encoded ->
        let result = ace_prefix ^ encoded in
        let len = String.length result in
        if len > max_label_length then
          Error (Label_too_long len)
        else
          Ok result
  end

(* Labels carrying the ACE prefix are decoded; any other non-empty label,
   ASCII or not, is returned unchanged. *)
let decode_label label =
  if String.length label = 0 then
    Error Empty_label
  else if has_ace_prefix label then
    (* Remove ACE prefix and decode *)
    decode_utf8 (String.sub label 4 (String.length label - 4))
  else
    Ok label
+247
lib/punycode.mli
(** Punycode ({{:https://datatracker.ietf.org/doc/html/rfc3492}RFC 3492}):
    the Bootstring encoding of Unicode used by IDNA.

    The module converts between sequences of Unicode code points and the
    ASCII-only Punycode form used inside internationalized domain names.
    Punycode is the Bootstring algorithm instantiated with the parameters of
    {{:https://datatracker.ietf.org/doc/html/rfc3492#section-5}RFC 3492 Section 5}.

    {2 References}
    {ul
    {- {{:https://datatracker.ietf.org/doc/html/rfc3492}RFC 3492} - Punycode: A Bootstring encoding of Unicode for IDNA}
    {- {{:https://datatracker.ietf.org/doc/html/rfc5891}RFC 5891} - IDNA Protocol}} *)

(** {1 Position Tracking} *)

type position
(** A location in the input, attached to errors. It carries a byte offset
    and a character index. *)

val position_byte_offset : position -> int
(** [position_byte_offset pos] is the byte offset recorded in [pos]. *)

val position_char_index : position -> int
(** [position_char_index pos] is the 0-based character index recorded in
    [pos]. *)

val pp_position : Format.formatter -> position -> unit
(** [pp_position fmt pos] prints [pos] in the form "byte N, char M". *)

(** {1 Error Types} *)

type error =
  | Overflow of position
      (** An intermediate value would not fit in a native integer. See
          {{:https://datatracker.ietf.org/doc/html/rfc3492#section-6.4}
          RFC 3492 Section 6.4} on overflow handling. *)
  | Invalid_character of position * Uchar.t
      (** The decoder found a byte of value 128 or more in the literal
          (basic) part of the input, or a decoded value was not a Unicode
          scalar value. See
          {{:https://datatracker.ietf.org/doc/html/rfc3492#section-3.1}
          RFC 3492 Section 3.1} on basic code point segregation. *)
  | Invalid_digit of position * char
      (** The decoder met a character that is not a Punycode digit.
          Digits are a-z / A-Z (values 0-25) and 0-9 (values 26-35); see
          {{:https://datatracker.ietf.org/doc/html/rfc3492#section-5}
          RFC 3492 Section 5}. *)
  | Unexpected_end of position
      (** The input stopped in the middle of a delta. See the decoding
          procedure in
          {{:https://datatracker.ietf.org/doc/html/rfc3492#section-6.2}
          RFC 3492 Section 6.2}. *)
  | Invalid_utf8 of position
      (** The input string is not well-formed UTF-8. *)
  | Label_too_long of int
      (** A label is longer than 63 bytes, the DNS limit of
          {{:https://datatracker.ietf.org/doc/html/rfc1035}RFC 1035}.
          The payload is the offending length. *)
  | Empty_label
      (** A label operation was given the empty string. *)

val pp_error : Format.formatter -> error -> unit
(** [pp_error fmt e] prints a human-readable description of [e],
    including its position when it has one. *)

(** {1 Constants}

    Values fixed by
    {{:https://datatracker.ietf.org/doc/html/rfc3492#section-5}RFC 3492 Section 5}
    and by the DNS. *)

val ace_prefix : string
(** The ACE prefix ["xn--"] that IDNA puts in front of a Punycode-encoded
    label (see {{:https://datatracker.ietf.org/doc/html/rfc3492#section-5}
    RFC 3492 Section 5}). *)

val max_label_length : int
(** The largest size of a domain label in bytes (63), from
    {{:https://datatracker.ietf.org/doc/html/rfc1035}RFC 1035}. *)

(** {1 Case Flags for Mixed-Case Annotation}

    {{:https://datatracker.ietf.org/doc/html/rfc3492#appendix-A}RFC 3492 Appendix A}
    defines an optional way to carry a per-character case hint through an
    encode/decode round trip.

    Note: the ToASCII and ToUnicode operations of IDNA do not use
    mixed-case annotation. *)

type case_flag = Uppercase | Lowercase
(** The case hint attached to one character. *)

(** {1 Core Punycode Operations}

    The Bootstring procedures of
    {{:https://datatracker.ietf.org/doc/html/rfc3492#section-6}RFC 3492 Section 6},
    working on arrays of code points ([Uchar.t array]). Encoded strings
    are plain ASCII and carry no ACE prefix. *)

val encode : Uchar.t array -> (string, error) result
(** [encode codepoints] is the Punycode form of [codepoints].

    This is the procedure of
    {{:https://datatracker.ietf.org/doc/html/rfc3492#section-6.3}RFC 3492 Section 6.3}:

    1. The basic code points (ASCII, below 128) are written first, per
       {{:https://datatracker.ietf.org/doc/html/rfc3492#section-3.1}
       Section 3.1 (Basic code point segregation)}
    2. A delimiter ('-') follows when at least one basic code point exists
    3. Each remaining code point is written as a delta in the generalized
       variable-length integer format of
       {{:https://datatracker.ietf.org/doc/html/rfc3492#section-3.3}Section 3.3}

    The empty array encodes to the empty string.

    Example:
    {[
      encode [| Uchar.of_int 0x4ED6; Uchar.of_int 0x4EEC; ... |]
      (* = Ok "ihqwcrb4cv8a8dqg056pqjye" *)
    ]} *)

val decode : string -> (Uchar.t array, error) result
(** [decode punycode] is the array of code points that [punycode] encodes.

    This is the procedure of
    {{:https://datatracker.ietf.org/doc/html/rfc3492#section-6.2}RFC 3492 Section 6.2}.

    Pass only the Punycode part, with the ACE prefix removed. Digits are
    accepted in either case, following
    {{:https://datatracker.ietf.org/doc/html/rfc3492#section-5}RFC 3492 Section 5}:
    "A decoder MUST recognize the letters in both uppercase and lowercase forms".

    Example:
    {[
      decode "ihqwcrb4cv8a8dqg056pqjye"
      (* = Ok [| U+4ED6; U+4EEC; U+4E3A; ... |] (Chinese simplified) *)
    ]} *)

(** {1 Mixed-Case Annotation}

    Round-trip case preservation, as described in
    {{:https://datatracker.ietf.org/doc/html/rfc3492#appendix-A}RFC 3492 Appendix A}. *)

val encode_with_case : Uchar.t array -> case_flag array -> (string, error) result
(** [encode_with_case codepoints case_flags] encodes [codepoints] and
    records [case_flags] in the output.

    Following {{:https://datatracker.ietf.org/doc/html/rfc3492#appendix-A}RFC 3492 Appendix A}:
    - an ASCII letter is written in the case given by its flag
    - for any other character, the flag is stored in the case of the last
      digit of its delta (an uppercase digit means "suggested uppercase")

    [case_flags] and [codepoints] must have equal lengths.

    @raise Invalid_argument if the two lengths differ. *)

val decode_with_case : string -> (Uchar.t array * case_flag array, error) result
(** [decode_with_case punycode] decodes [punycode] and also returns one
    case flag per decoded code point.

    Following {{:https://datatracker.ietf.org/doc/html/rfc3492#appendix-A}RFC 3492 Appendix A},
    a flag is [Uppercase] when the corresponding literal character, or the
    final digit of the corresponding delta, is an uppercase letter, and
    [Lowercase] otherwise. *)

(** {1 UTF-8 String Operations}

    Shortcuts that take and return UTF-8 encoded OCaml strings, wrapping
    the core operations above. *)

val encode_utf8 : string -> (string, error) result
(** [encode_utf8 s] is the Punycode form (without ACE prefix) of the
    UTF-8 string [s].

    It decodes [s] into code points and then applies {!encode}.

    Example:
    {[
      encode_utf8 "münchen"
      (* = Ok "mnchen-3ya" *)
    ]} *)

val decode_utf8 : string -> (string, error) result
(** [decode_utf8 punycode] is the UTF-8 string encoded by [punycode],
    which must not carry the ACE prefix.

    It applies {!decode} and then serializes the code points as UTF-8.

    Example:
    {[
      decode_utf8 "mnchen-3ya"
      (* = Ok "münchen" *)
    ]} *)

(** {1 Domain Label Operations}

    Label-level helpers that add or strip the ACE prefix and apply the
    label length limit of {{:https://datatracker.ietf.org/doc/html/rfc1035}RFC 1035}. *)

val encode_label : string -> (string, error) result
(** [encode_label label] is the form of [label] suitable for the DNS.

    A label made only of ASCII bytes is returned as is. Any other label
    is Punycode-encoded and given the ACE prefix ("xn--"), as described in
    {{:https://datatracker.ietf.org/doc/html/rfc3492#section-5}
    RFC 3492 Section 5}.

    The result is [Error (Label_too_long n)] when the output would be
    longer than 63 bytes, and [Error Empty_label] when [label] is empty.

    Example:
    {[
      encode_label "münchen"
      (* = Ok "xn--mnchen-3ya" *)
      encode_label "example"
      (* = Ok "example" *)
    ]} *)

val decode_label : string -> (string, error) result
(** [decode_label label] is the Unicode form of [label].

    A label that begins with the ACE prefix ("xn--", in any letter case)
    is Punycode-decoded; any other non-empty label is returned as is.
    The result is [Error Empty_label] when [label] is empty.

    Example:
    {[
      decode_label "xn--mnchen-3ya"
      (* = Ok "münchen" *)
      decode_label "example"
      (* = Ok "example" *)
    ]} *)

(** {1 Validation}

    Predicates on code points and strings. *)

val is_basic : Uchar.t -> bool
(** [is_basic u] is [true] iff [u] is a basic code point, i.e. ASCII
    (below 128).

    In {{:https://datatracker.ietf.org/doc/html/rfc3492#section-5}RFC 3492 Section 5}
    the basic code points of Punycode are the ASCII code points (0..7F). *)

val is_ascii_string : string -> bool
(** [is_ascii_string s] is [true] iff every byte of [s] is below 128. *)

val has_ace_prefix : string -> bool
(** [has_ace_prefix s] is [true] iff [s] begins with the ACE prefix
    "xn--", compared without regard to letter case. *)
+192
lib/punycode_idna.ml
(* IDNA (Internationalized Domain Names in Applications) Implementation *)

let max_domain_length = 253

(* {1 Error Types} *)

type error =
  | Punycode_error of Punycode.error
  | Invalid_label of string
  | Domain_too_long of int
  | Normalization_failed
  | Verification_failed

let pp_error fmt = function
  | Punycode_error e ->
      Format.fprintf fmt "Punycode error: %a" Punycode.pp_error e
  | Invalid_label msg ->
      Format.fprintf fmt "invalid label: %s" msg
  | Domain_too_long len ->
      Format.fprintf fmt "domain too long: %d bytes (max %d)" len max_domain_length
  | Normalization_failed ->
      Format.fprintf fmt "Unicode normalization failed"
  | Verification_failed ->
      Format.fprintf fmt "IDNA verification failed (round-trip mismatch)"

(* {1 Unicode Normalization} *)

let normalize_nfc s =
  Uunf_string.normalize_utf_8 `NFC s

(* {1 Validation Helpers} *)

let is_ace_label label =
  Punycode.has_ace_prefix label

(* Check if a label follows STD3 rules (hostname restrictions):
   - Only LDH (letters, digits, hyphens)
   - Cannot start or end with hyphen *)
let is_std3_valid label =
  let len = String.length label in
  if len = 0 then false
  else if label.[0] = '-' || label.[len - 1] = '-' then false
  else
    let rec check i =
      if i >= len then true
      else
        let c = label.[i] in
        let valid =
          (c >= 'a' && c <= 'z') ||
          (c >= 'A' && c <= 'Z') ||
          (c >= '0' && c <= '9') ||
          c = '-'
        in
        if valid then check (i + 1) else false
    in
    check 0

(* Check hyphen placement: hyphens not in positions 3 and 4 (except for ACE) *)
let check_hyphen_rules label =
  let len = String.length label in
  if len >= 4 && label.[2] = '-' && label.[3] = '-' then
    (* Hyphens in positions 3 and 4 - only valid for ACE prefix *)
    is_ace_label label
  else
    true

(* {1 Label Operations} *)

(* Convert one label to its ASCII form.

   - An empty label is rejected with [Invalid_label].
   - An all-ASCII label is length-checked, optionally checked against the
     STD3 and hyphen rules, and returned unchanged.
   - Any other label is NFC-normalized, Punycode-encoded and prefixed with
     the ACE prefix. The 63-byte limit is applied to that ASCII result,
     not to the UTF-8 input, because a label whose UTF-8 form is longer
     than 63 bytes can still have a short enough ACE form. The result is
     decoded again and compared with the normalized input as a
     consistency check. *)
let label_to_ascii_impl ~check_hyphens ~use_std3_rules label =
  let len = String.length label in
  if len = 0 then
    Error (Invalid_label "empty label")
  else if Punycode.is_ascii_string label then begin
    (* All ASCII - validate and pass through *)
    if len > Punycode.max_label_length then
      Error (Punycode_error (Punycode.Label_too_long len))
    else if use_std3_rules && not (is_std3_valid label) then
      Error (Invalid_label "STD3 rules violation")
    else if check_hyphens && not (check_hyphen_rules label) then
      Error (Invalid_label "invalid hyphen placement")
    else
      Ok label
  end else begin
    (* Has non-ASCII - normalize and encode *)
    let normalized = normalize_nfc label in

    (* Encode to Punycode *)
    match Punycode.encode_utf8 normalized with
    | Error e -> Error (Punycode_error e)
    | Ok encoded ->
        let result = Punycode.ace_prefix ^ encoded in
        let result_len = String.length result in
        if result_len > Punycode.max_label_length then
          Error (Punycode_error (Punycode.Label_too_long result_len))
        else if check_hyphens && not (check_hyphen_rules result) then
          Error (Invalid_label "invalid hyphen placement in encoded label")
        else
          (* Verification: decode and compare to original normalized form *)
          match Punycode.decode_utf8 encoded with
          | Error _ -> Error Verification_failed
          | Ok decoded ->
              if decoded <> normalized then
                Error Verification_failed
              else
                Ok result
  end

let label_to_ascii ?(check_hyphens = true) ?(use_std3_rules = false) label =
  label_to_ascii_impl ~check_hyphens ~use_std3_rules label

(* Labels with the ACE prefix are Punycode-decoded; all other labels are
   returned unchanged. *)
let label_to_unicode label =
  if is_ace_label label then begin
    let encoded = String.sub label 4 (String.length label - 4) in
    match Punycode.decode_utf8 encoded with
    | Error e -> Error (Punycode_error e)
    | Ok decoded -> Ok decoded
  end else
    Ok label

(* {1 Domain Operations} *)

(* Split domain into labels *)
let split_domain domain =
  String.split_on_char '.' domain

(* Join labels into domain *)
let join_labels labels =
  String.concat "." labels

(* Convert every label of [domain] with [label_to_ascii_impl], stopping at
   the first failing label, then check the joined result against
   [max_domain_length]. *)
let to_ascii ?(check_hyphens = true) ?(check_bidi = false)
    ?(check_joiners = false) ?(use_std3_rules = false)
    ?(transitional = false) domain =
  (* Note: check_bidi, check_joiners, and transitional are accepted but
     not fully implemented - they would require additional Unicode data *)
  let _ = check_bidi in
  let _ = check_joiners in
  let _ = transitional in

  let labels = split_domain domain in
  let rec process acc = function
    | [] ->
        let result = join_labels (List.rev acc) in
        let len = String.length result in
        if len > max_domain_length then
          Error (Domain_too_long len)
        else
          Ok result
    | label :: rest ->
        match label_to_ascii_impl ~check_hyphens ~use_std3_rules label with
        | Error e -> Error e
        | Ok encoded -> process (encoded :: acc) rest
  in
  process [] labels

(* Convert every label of [domain] with [label_to_unicode], stopping at
   the first failing label. *)
let to_unicode domain =
  let labels = split_domain domain in
  let rec process acc = function
    | [] -> Ok (join_labels (List.rev acc))
    | label :: rest ->
        match label_to_unicode label with
        | Error e -> Error e
        | Ok decoded -> process (decoded :: acc) rest
  in
  process [] labels

(* {1 Domain Name Library Integration} *)

let domain_to_ascii ?(check_hyphens = true) ?(use_std3_rules = false) domain =
  let s = Domain_name.to_string domain in
  match to_ascii ~check_hyphens ~use_std3_rules s with
  | Error e -> Error e
  | Ok ascii ->
      match Domain_name.of_string ascii with
      | Error (`Msg msg) -> Error (Invalid_label msg)
      | Ok d -> Ok d

let domain_to_unicode domain =
  let s = Domain_name.to_string domain in
  match to_unicode s with
  | Error e -> Error e
  | Ok unicode ->
      match Domain_name.of_string unicode with
      | Error (`Msg msg) -> Error (Invalid_label msg)
      | Ok d -> Ok d

(* {1 Validation} *)

(* [true] iff [to_ascii] with its default options accepts [domain]. *)
let is_idna_valid domain =
  match to_ascii domain with
  | Ok _ -> true
  | Error _ -> false
+189
lib/punycode_idna.mli
··· 1 + (** IDNA (Internationalized Domain Names in Applications) support. 2 + 3 + This module provides ToASCII and ToUnicode operations as specified 4 + in {{:https://datatracker.ietf.org/doc/html/rfc5891}RFC 5891} (IDNA 2008), 5 + using Punycode ({{:https://datatracker.ietf.org/doc/html/rfc3492}RFC 3492}) 6 + for encoding. 7 + 8 + IDNA allows domain names to contain non-ASCII Unicode characters by 9 + encoding them using Punycode with an ACE prefix. This module handles 10 + the conversion between Unicode domain names and their ASCII-compatible 11 + encoding (ACE) form. 12 + 13 + {2 References} 14 + {ul 15 + {- {{:https://datatracker.ietf.org/doc/html/rfc5891}RFC 5891} - 16 + Internationalized Domain Names in Applications (IDNA): Protocol} 17 + {- {{:https://datatracker.ietf.org/doc/html/rfc5892}RFC 5892} - 18 + The Unicode Code Points and Internationalized Domain Names for Applications (IDNA)} 19 + {- {{:https://datatracker.ietf.org/doc/html/rfc5893}RFC 5893} - 20 + Right-to-Left Scripts for Internationalized Domain Names for Applications (IDNA)} 21 + {- {{:https://datatracker.ietf.org/doc/html/rfc3492}RFC 3492} - 22 + Punycode: A Bootstring encoding of Unicode for IDNA}} *) 23 + 24 + (** {1 Error Types} *) 25 + 26 + type error = 27 + | Punycode_error of Punycode.error 28 + (** Error during Punycode encoding/decoding. 29 + See {!Punycode.error} for details. *) 30 + | Invalid_label of string 31 + (** Label violates IDNA constraints. The string describes the violation. 32 + See {{:https://datatracker.ietf.org/doc/html/rfc5891#section-4} 33 + RFC 5891 Section 4} for label validation requirements. Also reported for an empty label and when [Domain_name.of_string] rejects a converted name. *) 34 + | Domain_too_long of int 35 + (** Domain name exceeds 253 bytes, per 36 + {{:https://datatracker.ietf.org/doc/html/rfc1035}RFC 1035}. 37 + The int is the actual length. *) 38 + | Normalization_failed 39 + (** Unicode normalization (NFC) failed.
40 + Per {{:https://datatracker.ietf.org/doc/html/rfc5891#section-4.2.1} 41 + RFC 5891 Section 4.2.1}, labels must be in NFC form. *) 42 + | Verification_failed 43 + (** ToASCII/ToUnicode verification step failed (round-trip check). 44 + Per {{:https://datatracker.ietf.org/doc/html/rfc5891#section-4.2} 45 + RFC 5891 Section 4.2}, the result of encoding must decode back 46 + to the NFC-normalized input. *) 47 + 48 + val pp_error : Format.formatter -> error -> unit 49 + (** [pp_error fmt e] pretty-prints an error. *) 50 + 51 + (** {1 Constants} *) 52 + 53 + val max_domain_length : int 54 + (** Maximum length of a domain name in bytes (253), per 55 + {{:https://datatracker.ietf.org/doc/html/rfc1035}RFC 1035}. *) 56 + 57 + (** {1 ToASCII Operation} 58 + 59 + Converts an internationalized domain name to its ASCII-compatible 60 + encoding (ACE) form suitable for DNS lookup. 61 + 62 + See {{:https://datatracker.ietf.org/doc/html/rfc5891#section-4} 63 + RFC 5891 Section 4} for the complete ToASCII specification. *) 64 + 65 + val to_ascii : ?check_hyphens:bool -> ?check_bidi:bool -> 66 + ?check_joiners:bool -> ?use_std3_rules:bool -> 67 + ?transitional:bool -> string -> (string, error) result 68 + (** [to_ascii domain] converts an internationalized domain name to ASCII. 69 + 70 + Implements the ToASCII operation from 71 + {{:https://datatracker.ietf.org/doc/html/rfc5891#section-4.1}RFC 5891 Section 4.1}. 72 + 73 + For each label in the domain: 74 + 1. If all ASCII, pass through (with optional STD3 and hyphen validation) 75 + 2.
Otherwise, normalize to NFC per 76 + {{:https://datatracker.ietf.org/doc/html/rfc5891#section-4.2.1}Section 4.2.1} 77 + and Punycode-encode with ACE prefix 78 + 79 + Optional parameters (per {{:https://datatracker.ietf.org/doc/html/rfc5891#section-4} 80 + RFC 5891 Section 4} processing options): 81 + - [check_hyphens]: Validate hyphen placement per 82 + {{:https://datatracker.ietf.org/doc/html/rfc5891#section-4.2.3.1}Section 4.2.3.1} 83 + (default: true) 84 + - [check_bidi]: Check bidirectional text rules per 85 + {{:https://datatracker.ietf.org/doc/html/rfc5893}RFC 5893} 86 + (default: false, not implemented) 87 + - [check_joiners]: Check contextual joiner rules per 88 + {{:https://datatracker.ietf.org/doc/html/rfc5892#appendix-A.1}RFC 5892 Appendix A.1} 89 + (default: false, not implemented) 90 + - [use_std3_rules]: Apply STD3 hostname rules per 91 + {{:https://datatracker.ietf.org/doc/html/rfc5891#section-4.2.3.2}Section 4.2.3.2} 92 + (default: false) 93 + - [transitional]: Use IDNA 2003 transitional processing 94 + (default: false, not implemented) 95 + Labels are split on ['.']. An empty label, including the one produced by a trailing dot, is rejected with [Invalid_label]; a result longer than {!max_domain_length} bytes is rejected with [Domain_too_long]. 96 + Example: 97 + {[ 98 + to_ascii "münchen.example.com" 99 + (* = Ok "xn--mnchen-3ya.example.com" *) 100 + ]} *) 101 + 102 + val label_to_ascii : ?check_hyphens:bool -> ?use_std3_rules:bool -> 103 + string -> (string, error) result 104 + (** [label_to_ascii label] converts a single label to ASCII. 105 + 106 + This implements the core ToASCII operation for one label, as described in 107 + {{:https://datatracker.ietf.org/doc/html/rfc5891#section-4.1}RFC 5891 Section 4.1}. [check_hyphens] defaults to [true] and [use_std3_rules] to [false]. *) 108 + 109 + (** {1 ToUnicode Operation} 110 + 111 + Converts an ASCII-compatible encoded domain name back to Unicode. 112 + 113 + See {{:https://datatracker.ietf.org/doc/html/rfc5891#section-4.2} 114 + RFC 5891 Section 4.2} for the complete ToUnicode specification. *) 115 + 116 + val to_unicode : string -> (string, error) result 117 + (** [to_unicode domain] converts an ACE domain name to Unicode.
118 + 119 + Implements the ToUnicode operation from 120 + {{:https://datatracker.ietf.org/doc/html/rfc5891#section-4.2}RFC 5891 Section 4.2}. 121 + 122 + For each label in the domain: 123 + 1. If it has the ACE prefix ("xn--"), Punycode-decode it per 124 + {{:https://datatracker.ietf.org/doc/html/rfc3492#section-6.2}RFC 3492 Section 6.2} 125 + 2. Otherwise, pass through unchanged 126 + 127 + Example: 128 + {[ 129 + to_unicode "xn--mnchen-3ya.example.com" 130 + (* = Ok "münchen.example.com" *) 131 + ]} *) 132 + 133 + val label_to_unicode : string -> (string, error) result 134 + (** [label_to_unicode label] converts a single ACE label to Unicode. 135 + 136 + This implements the core ToUnicode operation for one label, as described in 137 + {{:https://datatracker.ietf.org/doc/html/rfc5891#section-4.2}RFC 5891 Section 4.2}. A label without the ACE prefix is returned unchanged. *) 138 + 139 + (** {1 Domain Name Integration} 140 + 141 + Functions that work with the 142 + {{:https://github.com/hannesm/domain-name}domain-name} library types. 143 + 144 + These provide integration with the [Domain_name] module for applications 145 + that use that library for domain name handling. *) 146 + 147 + val domain_to_ascii : ?check_hyphens:bool -> ?use_std3_rules:bool -> 148 + [`raw] Domain_name.t -> ([`raw] Domain_name.t, error) result 149 + (** [domain_to_ascii domain] converts a domain name to ASCII form. 150 + 151 + Applies {!to_ascii} to the string representation and returns the 152 + result as a [Domain_name.t]. 153 + Returns [Invalid_label] if [Domain_name.of_string] rejects the converted name. 154 + Example: 155 + {[ 156 + let d = Domain_name.of_string_exn "münchen.example.com" in 157 + domain_to_ascii d 158 + (* = Ok (Domain_name.of_string_exn "xn--mnchen-3ya.example.com") *) 159 + ]} *) 160 + 161 + val domain_to_unicode : [`raw] Domain_name.t -> ([`raw] Domain_name.t, error) result 162 + (** [domain_to_unicode domain] converts a domain name to Unicode form. 163 + 164 + Applies {!to_unicode} to the string representation and returns the 165 + result as a [Domain_name.t]. Returns [Invalid_label] if [Domain_name.of_string] rejects the converted name.
*) 166 + 167 + (** {1 Validation} *) 168 + 169 + val is_idna_valid : string -> bool 170 + (** [is_idna_valid domain] checks if a domain name is valid for IDNA processing. 171 + 172 + Returns [true] if {!to_ascii} would succeed on the domain with its default options. *) 173 + 174 + val is_ace_label : string -> bool 175 + (** [is_ace_label label] is [true] if the label has the ACE prefix "xn--" 176 + (case-insensitive). This indicates the label is Punycode-encoded per 177 + {{:https://datatracker.ietf.org/doc/html/rfc3492#section-5}RFC 3492 Section 5}. *) 178 + 179 + (** {1 Normalization} *) 180 + 181 + val normalize_nfc : string -> string 182 + (** [normalize_nfc s] returns the NFC-normalized form of UTF-8 string [s]. 183 + 184 + Per {{:https://datatracker.ietf.org/doc/html/rfc5891#section-4.2.1} 185 + RFC 5891 Section 4.2.1}, domain labels must be normalized to NFC 186 + (Unicode Normalization Form C) before encoding. 187 + 188 + See {{:http://www.unicode.org/reports/tr15/}Unicode Standard Annex #15} 189 + for details on Unicode normalization forms. *)
+37
puny.opam
··· 1 + # This file is generated by dune, edit dune-project instead 2 + opam-version: "2.0" 3 + version: "0.1.0" 4 + synopsis: "RFC 3492 Punycode and IDNA implementation for OCaml" 5 + description: """ 6 + A high-quality implementation of RFC 3492 (Punycode) with IDNA support. 7 + Provides encoding and decoding of internationalized domain names, 8 + with proper Unicode normalization and mixed-case annotation support.""" 9 + maintainer: ["maintainer@example.com"] 10 + authors: ["Author Name"] 11 + license: "ISC" 12 + homepage: "https://github.com/username/puny" 13 + bug-reports: "https://github.com/username/puny/issues" 14 + depends: [ 15 + "ocaml" {>= "4.14.0"} 16 + "dune" {>= "3.0" & >= "3.0"} 17 + "uutf" {>= "1.0.0"} 18 + "uunf" {>= "15.0.0"} 19 + "domain-name" {>= "0.4.0"} 20 + "alcotest" {with-test} 21 + "odoc" {with-doc} 22 + ] 23 + build: [ 24 + ["dune" "subst"] {dev} 25 + [ 26 + "dune" 27 + "build" 28 + "-p" 29 + name 30 + "-j" 31 + jobs 32 + "@install" 33 + "@runtest" {with-test} 34 + "@doc" {with-doc} 35 + ] 36 + ] 37 + dev-repo: "git+https://github.com/username/puny.git"
+1963
spec/rfc3492.txt
··· 1 + 2 + 3 + 4 + 5 + 6 + 7 + Network Working Group A. Costello 8 + Request for Comments: 3492 Univ. of California, Berkeley 9 + Category: Standards Track March 2003 10 + 11 + 12 + Punycode: A Bootstring encoding of Unicode 13 + for Internationalized Domain Names in Applications (IDNA) 14 + 15 + Status of this Memo 16 + 17 + This document specifies an Internet standards track protocol for the 18 + Internet community, and requests discussion and suggestions for 19 + improvements. Please refer to the current edition of the "Internet 20 + Official Protocol Standards" (STD 1) for the standardization state 21 + and status of this protocol. Distribution of this memo is unlimited. 22 + 23 + Copyright Notice 24 + 25 + Copyright (C) The Internet Society (2003). All Rights Reserved. 26 + 27 + Abstract 28 + 29 + Punycode is a simple and efficient transfer encoding syntax designed 30 + for use with Internationalized Domain Names in Applications (IDNA). 31 + It uniquely and reversibly transforms a Unicode string into an ASCII 32 + string. ASCII characters in the Unicode string are represented 33 + literally, and non-ASCII characters are represented by ASCII 34 + characters that are allowed in host name labels (letters, digits, and 35 + hyphens). This document defines a general algorithm called 36 + Bootstring that allows a string of basic code points to uniquely 37 + represent any string of code points drawn from a larger set. 38 + Punycode is an instance of Bootstring that uses particular parameter 39 + values specified by this document, appropriate for IDNA. 40 + 41 + Table of Contents 42 + 43 + 1. Introduction...............................................2 44 + 1.1 Features..............................................2 45 + 1.2 Interaction of protocol parts.........................3 46 + 2. Terminology................................................3 47 + 3. 
Bootstring description.....................................4 48 + 3.1 Basic code point segregation..........................4 49 + 3.2 Insertion unsort coding...............................4 50 + 3.3 Generalized variable-length integers..................5 51 + 3.4 Bias adaptation.......................................7 52 + 4. Bootstring parameters......................................8 53 + 5. Parameter values for Punycode..............................8 54 + 6. Bootstring algorithms......................................9 55 + 56 + 57 + 58 + Costello Standards Track [Page 1] 59 + 60 + RFC 3492 IDNA Punycode March 2003 61 + 62 + 63 + 6.1 Bias adaptation function.............................10 64 + 6.2 Decoding procedure...................................11 65 + 6.3 Encoding procedure...................................12 66 + 6.4 Overflow handling....................................13 67 + 7. Punycode examples.........................................14 68 + 7.1 Sample strings.......................................14 69 + 7.2 Decoding traces......................................17 70 + 7.3 Encoding traces......................................19 71 + 8. Security Considerations...................................20 72 + 9. References................................................21 73 + 9.1 Normative References.................................21 74 + 9.2 Informative References...............................21 75 + A. Mixed-case annotation.....................................22 76 + B. Disclaimer and license....................................22 77 + C. Punycode sample implementation............................23 78 + Author's Address.............................................34 79 + Full Copyright Statement.....................................35 80 + 81 + 1. Introduction 82 + 83 + [IDNA] describes an architecture for supporting internationalized 84 + domain names. 
Labels containing non-ASCII characters can be 85 + represented by ACE labels, which begin with a special ACE prefix and 86 + contain only ASCII characters. The remainder of the label after the 87 + prefix is a Punycode encoding of a Unicode string satisfying certain 88 + constraints. For the details of the prefix and constraints, see 89 + [IDNA] and [NAMEPREP]. 90 + 91 + Punycode is an instance of a more general algorithm called 92 + Bootstring, which allows strings composed from a small set of "basic" 93 + code points to uniquely represent any string of code points drawn 94 + from a larger set. Punycode is Bootstring with particular parameter 95 + values appropriate for IDNA. 96 + 97 + 1.1 Features 98 + 99 + Bootstring has been designed to have the following features: 100 + 101 + * Completeness: Every extended string (sequence of arbitrary code 102 + points) can be represented by a basic string (sequence of basic 103 + code points). Restrictions on what strings are allowed, and on 104 + length, can be imposed by higher layers. 105 + 106 + * Uniqueness: There is at most one basic string that represents a 107 + given extended string. 108 + 109 + * Reversibility: Any extended string mapped to a basic string can 110 + be recovered from that basic string. 111 + 112 + 113 + 114 + Costello Standards Track [Page 2] 115 + 116 + RFC 3492 IDNA Punycode March 2003 117 + 118 + 119 + * Efficient encoding: The ratio of basic string length to extended 120 + string length is small. This is important in the context of 121 + domain names because RFC 1034 [RFC1034] restricts the length of a 122 + domain label to 63 characters. 123 + 124 + * Simplicity: The encoding and decoding algorithms are reasonably 125 + simple to implement. The goals of efficiency and simplicity are 126 + at odds; Bootstring aims at a good balance between them. 
127 + 128 + * Readability: Basic code points appearing in the extended string 129 + are represented as themselves in the basic string (although the 130 + main purpose is to improve efficiency, not readability). 131 + 132 + Punycode can also support an additional feature that is not used by 133 + the ToASCII and ToUnicode operations of [IDNA]. When extended 134 + strings are case-folded prior to encoding, the basic string can use 135 + mixed case to tell how to convert the folded string into a mixed-case 136 + string. See appendix A "Mixed-case annotation". 137 + 138 + 1.2 Interaction of protocol parts 139 + 140 + Punycode is used by the IDNA protocol [IDNA] for converting domain 141 + labels into ASCII; it is not designed for any other purpose. It is 142 + explicitly not designed for processing arbitrary free text. 143 + 144 + 2. Terminology 145 + 146 + The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT", 147 + "SHOULD", "SHOULD NOT", "RECOMMENDED", "MAY", and "OPTIONAL" in this 148 + document are to be interpreted as described in BCP 14, RFC 2119 149 + [RFC2119]. 150 + 151 + A code point is an integral value associated with a character in a 152 + coded character set. 153 + 154 + As in the Unicode Standard [UNICODE], Unicode code points are denoted 155 + by "U+" followed by four to six hexadecimal digits, while a range of 156 + code points is denoted by two hexadecimal numbers separated by "..", 157 + with no prefixes. 158 + 159 + The operators div and mod perform integer division; (x div y) is the 160 + quotient of x divided by y, discarding the remainder, and (x mod y) 161 + is the remainder, so (x div y) * y + (x mod y) == x. Bootstring uses 162 + these operators only with nonnegative operands, so the quotient and 163 + remainder are always nonnegative. 164 + 165 + The break statement jumps out of the innermost loop (as in C). 
166 + 167 + 168 + 169 + 170 + Costello Standards Track [Page 3] 171 + 172 + RFC 3492 IDNA Punycode March 2003 173 + 174 + 175 + An overflow is an attempt to compute a value that exceeds the maximum 176 + value of an integer variable. 177 + 178 + 3. Bootstring description 179 + 180 + Bootstring represents an arbitrary sequence of code points (the 181 + "extended string") as a sequence of basic code points (the "basic 182 + string"). This section describes the representation. Section 6 183 + "Bootstring algorithms" presents the algorithms as pseudocode. 184 + Sections 7.2 "Decoding traces" and 7.3 "Encoding traces" trace the 185 + algorithms for sample inputs. 186 + 187 + The following sections describe the four techniques used in 188 + Bootstring. "Basic code point segregation" is a very simple and 189 + efficient encoding for basic code points occurring in the extended 190 + string: they are simply copied all at once. "Insertion unsort 191 + coding" encodes the non-basic code points as deltas, and processes 192 + the code points in numerical order rather than in order of 193 + appearance, which typically results in smaller deltas. The deltas 194 + are represented as "generalized variable-length integers", which use 195 + basic code points to represent nonnegative integers. The parameters 196 + of this integer representation are dynamically adjusted using "bias 197 + adaptation", to improve efficiency when consecutive deltas have 198 + similar magnitudes. 199 + 200 + 3.1 Basic code point segregation 201 + 202 + All basic code points appearing in the extended string are 203 + represented literally at the beginning of the basic string, in their 204 + original order, followed by a delimiter if (and only if) the number 205 + of basic code points is nonzero. The delimiter is a particular basic 206 + code point, which never appears in the remainder of the basic string.
207 + The decoder can therefore find the end of the literal portion (if 208 + there is one) by scanning for the last delimiter. 209 + 210 + 3.2 Insertion unsort coding 211 + 212 + The remainder of the basic string (after the last delimiter if there 213 + is one) represents a sequence of nonnegative integral deltas as 214 + generalized variable-length integers, described in section 3.3. The 215 + meaning of the deltas is best understood in terms of the decoder. 216 + 217 + The decoder builds the extended string incrementally. Initially, the 218 + extended string is a copy of the literal portion of the basic string 219 + (excluding the last delimiter). The decoder inserts non-basic code 220 + points, one for each delta, into the extended string, ultimately 221 + arriving at the final decoded string. 222 + 223 + 224 + 225 + 226 + Costello Standards Track [Page 4] 227 + 228 + RFC 3492 IDNA Punycode March 2003 229 + 230 + 231 + At the heart of this process is a state machine with two state 232 + variables: an index i and a counter n. The index i refers to a 233 + position in the extended string; it ranges from 0 (the first 234 + position) to the current length of the extended string (which refers 235 + to a potential position beyond the current end). If the current 236 + state is <n,i>, the next state is <n,i+1> if i is less than the 237 + length of the extended string, or <n+1,0> if i equals the length of 238 + the extended string. In other words, each state change causes i to 239 + increment, wrapping around to zero if necessary, and n counts the 240 + number of wrap-arounds. 241 + 242 + Notice that the state always advances monotonically (there is no way 243 + for the decoder to return to an earlier state). At each state, an 244 + insertion is either performed or not performed. At most one 245 + insertion is performed in a given state. An insertion inserts the 246 + value of n at position i in the extended string. 
The deltas are a 247 + run-length encoding of this sequence of events: they are the lengths 248 + of the runs of non-insertion states preceding the insertion states. 249 + Hence, for each delta, the decoder performs delta state changes, then 250 + an insertion, and then one more state change. (An implementation 251 + need not perform each state change individually, but can instead use 252 + division and remainder calculations to compute the next insertion 253 + state directly.) It is an error if the inserted code point is a 254 + basic code point (because basic code points were supposed to be 255 + segregated as described in section 3.1). 256 + 257 + The encoder's main task is to derive the sequence of deltas that will 258 + cause the decoder to construct the desired string. It can do this by 259 + repeatedly scanning the extended string for the next code point that 260 + the decoder would need to insert, and counting the number of state 261 + changes the decoder would need to perform, mindful of the fact that 262 + the decoder's extended string will include only those code points 263 + that have already been inserted. Section 6.3 "Encoding procedure" 264 + gives a precise algorithm. 265 + 266 + 3.3 Generalized variable-length integers 267 + 268 + In a conventional integer representation the base is the number of 269 + distinct symbols for digits, whose values are 0 through base-1. Let 270 + digit_0 denote the least significant digit, digit_1 the next least 271 + significant, and so on. The value represented is the sum over j of 272 + digit_j * w(j), where w(j) = base^j is the weight (scale factor) for 273 + position j. For example, in the base 8 integer 437, the digits are 274 + 7, 3, and 4, and the weights are 1, 8, and 64, so the value is 7 + 275 + 3*8 + 4*64 = 287.
This representation has two disadvantages: First, 276 + there are multiple encodings of each value (because there can be 277 + extra zeros in the most significant positions), which is inconvenient 278 + 279 + 280 + 281 + 282 + Costello Standards Track [Page 5] 283 + 284 + RFC 3492 IDNA Punycode March 2003 285 + 286 + 287 + when unique encodings are needed. Second, the integer is not self- 288 + delimiting, so if multiple integers are concatenated the boundaries 289 + between them are lost. 290 + 291 + The generalized variable-length representation solves these two 292 + problems. The digit values are still 0 through base-1, but now the 293 + integer is self-delimiting by means of thresholds t(j), each of which 294 + is in the range 0 through base-1. Exactly one digit, the most 295 + significant, satisfies digit_j < t(j). Therefore, if several 296 + integers are concatenated, it is easy to separate them, starting with 297 + the first if they are little-endian (least significant digit first), 298 + or starting with the last if they are big-endian (most significant 299 + digit first). As before, the value is the sum over j of digit_j * 300 + w(j), but the weights are different: 301 + 302 + w(0) = 1 303 + w(j) = w(j-1) * (base - t(j-1)) for j > 0 304 + 305 + For example, consider the little-endian sequence of base 8 digits 306 + 734251... Suppose the thresholds are 2, 3, 5, 5, 5, 5... This 307 + implies that the weights are 1, 1*(8-2) = 6, 6*(8-3) = 30, 30*(8-5) = 308 + 90, 90*(8-5) = 270, and so on. 7 is not less than 2, and 3 is not 309 + less than 3, but 4 is less than 5, so 4 is the last digit. The value 310 + of 734 is 7*1 + 3*6 + 4*30 = 145. The next integer is 251, with 311 + value 2*1 + 5*6 + 1*30 = 62. Decoding this representation is very 312 + similar to decoding a conventional integer: Start with a current 313 + value of N = 0 and a weight w = 1. Fetch the next digit d and 314 + increase N by d * w. 
If d is less than the current threshold (t) 315 + then stop, otherwise increase w by a factor of (base - t), update t 316 + for the next position, and repeat. 317 + 318 + Encoding this representation is similar to encoding a conventional 319 + integer: If N < t then output one digit for N and stop, otherwise 320 + output the digit for t + ((N - t) mod (base - t)), then replace N 321 + with (N - t) div (base - t), update t for the next position, and 322 + repeat. 323 + 324 + For any particular set of values of t(j), there is exactly one 325 + generalized variable-length representation of each nonnegative 326 + integral value. 327 + 328 + Bootstring uses little-endian ordering so that the deltas can be 329 + separated starting with the first. The t(j) values are defined in 330 + terms of the constants base, tmin, and tmax, and a state variable 331 + called bias: 332 + 333 + t(j) = base * (j + 1) - bias, 334 + clamped to the range tmin through tmax 335 + 336 + 337 + 338 + Costello Standards Track [Page 6] 339 + 340 + RFC 3492 IDNA Punycode March 2003 341 + 342 + 343 + The clamping means that if the formula yields a value less than tmin 344 + or greater than tmax, then t(j) = tmin or tmax, respectively. (In 345 + the pseudocode in section 6 "Bootstring algorithms", the expression 346 + base * (j + 1) is denoted by k for performance reasons.) These t(j) 347 + values cause the representation to favor integers within a particular 348 + range determined by the bias. 349 + 350 + 3.4 Bias adaptation 351 + 352 + After each delta is encoded or decoded, bias is set for the next 353 + delta as follows: 354 + 355 + 1. Delta is scaled in order to avoid overflow in the next step: 356 + 357 + let delta = delta div 2 358 + 359 + But when this is the very first delta, the divisor is not 2, but 360 + instead a constant called damp. This compensates for the fact 361 + that the second delta is usually much smaller than the first. 362 + 363 + 2. 
Delta is increased to compensate for the fact that the next delta 364 + will be inserting into a longer string: 365 + 366 + let delta = delta + (delta div numpoints) 367 + 368 + numpoints is the total number of code points encoded/decoded so 369 + far (including the one corresponding to this delta itself, and 370 + including the basic code points). 371 + 372 + 3. Delta is repeatedly divided until it falls within a threshold, to 373 + predict the minimum number of digits needed to represent the next 374 + delta: 375 + 376 + while delta > ((base - tmin) * tmax) div 2 377 + do let delta = delta div (base - tmin) 378 + 379 + 4. The bias is set: 380 + 381 + let bias = 382 + (base * the number of divisions performed in step 3) + 383 + (((base - tmin + 1) * delta) div (delta + skew)) 384 + 385 + The motivation for this procedure is that the current delta 386 + provides a hint about the likely size of the next delta, and so 387 + t(j) is set to tmax for the more significant digits starting with 388 + the one expected to be last, tmin for the less significant digits 389 + up through the one expected to be third-last, and somewhere 390 + between tmin and tmax for the digit expected to be second-last 391 + 392 + 393 + 394 + Costello Standards Track [Page 7] 395 + 396 + RFC 3492 IDNA Punycode March 2003 397 + 398 + 399 + (balancing the hope of the expected-last digit being unnecessary 400 + against the danger of it being insufficient). 401 + 402 + 4. Bootstring parameters 403 + 404 + Given a set of basic code points, one needs to be designated as the 405 + delimiter. The base cannot be greater than the number of 406 + distinguishable basic code points remaining. The digit-values in the 407 + range 0 through base-1 need to be associated with distinct non- 408 + delimiter basic code points. 
In some cases multiple code points need 409 + to have the same digit-value; for example, uppercase and lowercase 410 + versions of the same letter need to be equivalent if basic strings 411 + are case-insensitive. 412 + 413 + The initial value of n cannot be greater than the minimum non-basic 414 + code point that could appear in extended strings. 415 + 416 + The remaining five parameters (tmin, tmax, skew, damp, and the 417 + initial value of bias) need to satisfy the following constraints: 418 + 419 + 0 <= tmin <= tmax <= base-1 420 + skew >= 1 421 + damp >= 2 422 + initial_bias mod base <= base - tmin 423 + 424 + Provided the constraints are satisfied, these five parameters affect 425 + efficiency but not correctness. They are best chosen empirically. 426 + 427 + If support for mixed-case annotation is desired (see appendix A), 428 + make sure that the code points corresponding to 0 through tmax-1 all 429 + have both uppercase and lowercase forms. 430 + 431 + 5. Parameter values for Punycode 432 + 433 + Punycode uses the following Bootstring parameter values: 434 + 435 + base = 36 436 + tmin = 1 437 + tmax = 26 438 + skew = 38 439 + damp = 700 440 + initial_bias = 72 441 + initial_n = 128 = 0x80 442 + 443 + Although the only restriction Punycode imposes on the input integers 444 + is that they be nonnegative, these parameters are especially designed 445 + to work well with Unicode [UNICODE] code points, which are integers 446 + in the range 0..10FFFF (but not D800..DFFF, which are reserved for 447 + 448 + 449 + 450 + Costello Standards Track [Page 8] 451 + 452 + RFC 3492 IDNA Punycode March 2003 453 + 454 + 455 + use by the UTF-16 encoding of Unicode). 
The basic code points are 456 + the ASCII [ASCII] code points (0..7F), of which U+002D (-) is the 457 + delimiter, and some of the others have digit-values as follows: 458 + 459 + code points digit-values 460 + ------------ ---------------------- 461 + 41..5A (A-Z) = 0 to 25, respectively 462 + 61..7A (a-z) = 0 to 25, respectively 463 + 30..39 (0-9) = 26 to 35, respectively 464 + 465 + Using hyphen-minus as the delimiter implies that the encoded string 466 + can end with a hyphen-minus only if the Unicode string consists 467 + entirely of basic code points, but IDNA forbids such strings from 468 + being encoded. The encoded string can begin with a hyphen-minus, but 469 + IDNA prepends a prefix. Therefore IDNA using Punycode conforms to 470 + the RFC 952 rule that host name labels neither begin nor end with a 471 + hyphen-minus [RFC952]. 472 + 473 + A decoder MUST recognize the letters in both uppercase and lowercase 474 + forms (including mixtures of both forms). An encoder SHOULD output 475 + only uppercase forms or only lowercase forms, unless it uses mixed- 476 + case annotation (see appendix A). 477 + 478 + Presumably most users will not manually write or type encoded strings 479 + (as opposed to cutting and pasting them), but those who do will need 480 + to be alert to the potential visual ambiguity between the following 481 + sets of characters: 482 + 483 + G 6 484 + I l 1 485 + O 0 486 + S 5 487 + U V 488 + Z 2 489 + 490 + Such ambiguities are usually resolved by context, but in a Punycode 491 + encoded string there is no context apparent to humans. 492 + 493 + 6. Bootstring algorithms 494 + 495 + Some parts of the pseudocode can be omitted if the parameters satisfy 496 + certain conditions (for which Punycode qualifies). These parts are 497 + enclosed in {braces}, and notes immediately following the pseudocode 498 + explain the conditions under which they can be omitted. 
499 + 500 + 501 + 502 + 503 + 504 + 505 + 506 + Costello Standards Track [Page 9] 507 + 508 + RFC 3492 IDNA Punycode March 2003 509 + 510 + 511 + Formally, code points are integers, and hence the pseudocode assumes 512 + that arithmetic operations can be performed directly on code points. 513 + In some programming languages, explicit conversion between code 514 + points and integers might be necessary. 515 + 516 + 6.1 Bias adaptation function 517 + 518 + function adapt(delta,numpoints,firsttime): 519 + if firsttime then let delta = delta div damp 520 + else let delta = delta div 2 521 + let delta = delta + (delta div numpoints) 522 + let k = 0 523 + while delta > ((base - tmin) * tmax) div 2 do begin 524 + let delta = delta div (base - tmin) 525 + let k = k + base 526 + end 527 + return k + (((base - tmin + 1) * delta) div (delta + skew)) 528 + 529 + It does not matter whether the modifications to delta and k inside 530 + adapt() affect variables of the same name inside the 531 + encoding/decoding procedures, because after calling adapt() the 532 + caller does not read those variables before overwriting them. 
533 + 534 + 535 + 536 + 537 + 538 + 539 + 540 + 541 + 542 + 543 + 544 + 545 + 546 + 547 + 548 + 549 + 550 + 551 + 552 + 553 + 554 + 555 + 556 + 557 + 558 + 559 + 560 + 561 + 562 + Costello Standards Track [Page 10] 563 + 564 + RFC 3492 IDNA Punycode March 2003 565 + 566 + 567 + 6.2 Decoding procedure 568 + 569 + let n = initial_n 570 + let i = 0 571 + let bias = initial_bias 572 + let output = an empty string indexed from 0 573 + consume all code points before the last delimiter (if there is one) 574 + and copy them to output, fail on any non-basic code point 575 + if more than zero code points were consumed then consume one more 576 + (which will be the last delimiter) 577 + while the input is not exhausted do begin 578 + let oldi = i 579 + let w = 1 580 + for k = base to infinity in steps of base do begin 581 + consume a code point, or fail if there was none to consume 582 + let digit = the code point's digit-value, fail if it has none 583 + let i = i + digit * w, fail on overflow 584 + let t = tmin if k <= bias {+ tmin}, or 585 + tmax if k >= bias + tmax, or k - bias otherwise 586 + if digit < t then break 587 + let w = w * (base - t), fail on overflow 588 + end 589 + let bias = adapt(i - oldi, length(output) + 1, test oldi is 0?) 590 + let n = n + i div (length(output) + 1), fail on overflow 591 + let i = i mod (length(output) + 1) 592 + {if n is a basic code point then fail} 593 + insert n into output at position i 594 + increment i 595 + end 596 + 597 + The full statement enclosed in braces (checking whether n is a basic 598 + code point) can be omitted if initial_n exceeds all basic code points 599 + (which is true for Punycode), because n is never less than initial_n. 600 + 601 + In the assignment of t, where t is clamped to the range tmin through 602 + tmax, "+ tmin" can always be omitted. 
This makes the clamping 603 + calculation incorrect when bias < k < bias + tmin, but that cannot 604 + happen because of the way bias is computed and because of the 605 + constraints on the parameters. 606 + 607 + Because the decoder state can only advance monotonically, and there 608 + is only one representation of any delta, there is therefore only one 609 + encoded string that can represent a given sequence of integers. The 610 + only error conditions are invalid code points, unexpected end-of- 611 + input, overflow, and basic code points encoded using deltas instead 612 + of appearing literally. If the decoder fails on these errors as 613 + shown above, then it cannot produce the same output for two distinct 614 + inputs. Without this property it would have been necessary to re- 615 + 616 + 617 + 618 + Costello Standards Track [Page 11] 619 + 620 + RFC 3492 IDNA Punycode March 2003 621 + 622 + 623 + encode the output and verify that it matches the input in order to 624 + guarantee the uniqueness of the encoding. 
625 + 626 + 6.3 Encoding procedure 627 + 628 + let n = initial_n 629 + let delta = 0 630 + let bias = initial_bias 631 + let h = b = the number of basic code points in the input 632 + copy them to the output in order, followed by a delimiter if b > 0 633 + {if the input contains a non-basic code point < n then fail} 634 + while h < length(input) do begin 635 + let m = the minimum {non-basic} code point >= n in the input 636 + let delta = delta + (m - n) * (h + 1), fail on overflow 637 + let n = m 638 + for each code point c in the input (in order) do begin 639 + if c < n {or c is basic} then increment delta, fail on overflow 640 + if c == n then begin 641 + let q = delta 642 + for k = base to infinity in steps of base do begin 643 + let t = tmin if k <= bias {+ tmin}, or 644 + tmax if k >= bias + tmax, or k - bias otherwise 645 + if q < t then break 646 + output the code point for digit t + ((q - t) mod (base - t)) 647 + let q = (q - t) div (base - t) 648 + end 649 + output the code point for digit q 650 + let bias = adapt(delta, h + 1, test h equals b?) 651 + let delta = 0 652 + increment h 653 + end 654 + end 655 + increment delta and n 656 + end 657 + 658 + The full statement enclosed in braces (checking whether the input 659 + contains a non-basic code point less than n) can be omitted if all 660 + code points less than initial_n are basic code points (which is true 661 + for Punycode if code points are unsigned). 662 + 663 + The brace-enclosed conditions "non-basic" and "or c is basic" can be 664 + omitted if initial_n exceeds all basic code points (which is true for 665 + Punycode), because the code point being tested is never less than 666 + initial_n. 667 + 668 + In the assignment of t, where t is clamped to the range tmin through 669 + tmax, "+ tmin" can always be omitted. 
This makes the clamping 670 + calculation incorrect when bias < k < bias + tmin, but that cannot 671 + 672 + 673 + 674 + Costello Standards Track [Page 12] 675 + 676 + RFC 3492 IDNA Punycode March 2003 677 + 678 + 679 + happen because of the way bias is computed and because of the 680 + constraints on the parameters. 681 + 682 + The checks for overflow are necessary to avoid producing invalid 683 + output when the input contains very large values or is very long. 684 + 685 + The increment of delta at the bottom of the outer loop cannot 686 + overflow because delta < length(input) before the increment, and 687 + length(input) is already assumed to be representable. The increment 688 + of n could overflow, but only if h == length(input), in which case 689 + the procedure is finished anyway. 690 + 691 + 6.4 Overflow handling 692 + 693 + For IDNA, 26-bit unsigned integers are sufficient to handle all valid 694 + IDNA labels without overflow, because any string that needed a 27-bit 695 + delta would have to exceed either the code point limit (0..10FFFF) or 696 + the label length limit (63 characters). However, overflow handling 697 + is necessary because the inputs are not necessarily valid IDNA 698 + labels. 699 + 700 + If the programming language does not provide overflow detection, the 701 + following technique can be used. Suppose A, B, and C are 702 + representable nonnegative integers and C is nonzero. Then A + B 703 + overflows if and only if B > maxint - A, and A + (B * C) overflows if 704 + and only if B > (maxint - A) div C, where maxint is the greatest 705 + integer for which maxint + 1 cannot be represented. Refer to 706 + appendix C "Punycode sample implementation" for demonstrations of 707 + this technique in the C language. 708 + 709 + The decoding and encoding algorithms shown in sections 6.2 and 6.3 710 + handle overflow by detecting it whenever it happens. 
Another 711 + approach is to enforce limits on the inputs that prevent overflow 712 + from happening. For example, if the encoder were to verify that no 713 + input code points exceed M and that the input length does not exceed 714 + L, then no delta could ever exceed (M - initial_n) * (L + 1), and 715 + hence no overflow could occur if integer variables were capable of 716 + representing values that large. This prevention approach would 717 + impose more restrictions on the input than the detection approach 718 + does, but might be considered simpler in some programming languages. 719 + 720 + In theory, the decoder could use an analogous approach, limiting the 721 + number of digits in a variable-length integer (that is, limiting the 722 + number of iterations in the innermost loop). However, the number of 723 + digits that suffice to represent a given delta can sometimes 724 + represent much larger deltas (because of the adaptation), and hence 725 + this approach would probably need integers wider than 32 bits. 726 + 727 + 728 + 729 + 730 + Costello Standards Track [Page 13] 731 + 732 + RFC 3492 IDNA Punycode March 2003 733 + 734 + 735 + Yet another approach for the decoder is to allow overflow to occur, 736 + but to check the final output string by re-encoding it and comparing 737 + to the decoder input. If and only if they do not match (using a 738 + case-insensitive ASCII comparison) overflow has occurred. This 739 + delayed-detection approach would not impose any more restrictions on 740 + the input than the immediate-detection approach does, and might be 741 + considered simpler in some programming languages. 742 + 743 + In fact, if the decoder is used only inside the IDNA ToUnicode 744 + operation [IDNA], then it need not check for overflow at all, because 745 + ToUnicode performs a higher level re-encoding and comparison, and a 746 + mismatch has the same consequence as if the Punycode decoder had 747 + failed. 748 + 749 + 7. 
Punycode examples 750 + 751 + 7.1 Sample strings 752 + 753 + In the Punycode encodings below, the ACE prefix is not shown. 754 + Backslashes show where line breaks have been inserted in strings too 755 + long for one line. 756 + 757 + The first several examples are all translations of the sentence "Why 758 + can't they just speak in <language>?" (courtesy of Michael Kaplan's 759 + "provincial" page [PROVINCIAL]). Word breaks and punctuation have 760 + been removed, as is often done in domain names. 761 + 762 + (A) Arabic (Egyptian): 763 + u+0644 u+064A u+0647 u+0645 u+0627 u+0628 u+062A u+0643 u+0644 764 + u+0645 u+0648 u+0634 u+0639 u+0631 u+0628 u+064A u+061F 765 + Punycode: egbpdaj6bu4bxfgehfvwxn 766 + 767 + (B) Chinese (simplified): 768 + u+4ED6 u+4EEC u+4E3A u+4EC0 u+4E48 u+4E0D u+8BF4 u+4E2D u+6587 769 + Punycode: ihqwcrb4cv8a8dqg056pqjye 770 + 771 + (C) Chinese (traditional): 772 + u+4ED6 u+5011 u+7232 u+4EC0 u+9EBD u+4E0D u+8AAA u+4E2D u+6587 773 + Punycode: ihqwctvzc91f659drss3x8bo0yb 774 + 775 + (D) Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky 776 + U+0050 u+0072 u+006F u+010D u+0070 u+0072 u+006F u+0073 u+0074 777 + u+011B u+006E u+0065 u+006D u+006C u+0075 u+0076 u+00ED u+010D 778 + u+0065 u+0073 u+006B u+0079 779 + Punycode: Proprostnemluvesky-uyb24dma41a 780 + 781 + 782 + 783 + 784 + 785 + 786 + Costello Standards Track [Page 14] 787 + 788 + RFC 3492 IDNA Punycode March 2003 789 + 790 + 791 + (E) Hebrew: 792 + u+05DC u+05DE u+05D4 u+05D4 u+05DD u+05E4 u+05E9 u+05D5 u+05D8 793 + u+05DC u+05D0 u+05DE u+05D3 u+05D1 u+05E8 u+05D9 u+05DD u+05E2 794 + u+05D1 u+05E8 u+05D9 u+05EA 795 + Punycode: 4dbcagdahymbxekheh6e0a7fei0b 796 + 797 + (F) Hindi (Devanagari): 798 + u+092F u+0939 u+0932 u+094B u+0917 u+0939 u+093F u+0928 u+094D 799 + u+0926 u+0940 u+0915 u+094D u+092F u+094B u+0902 u+0928 u+0939 800 + u+0940 u+0902 u+092C u+094B u+0932 u+0938 u+0915 u+0924 u+0947 801 + u+0939 u+0948 u+0902 802 + Punycode: 
i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd 803 + 804 + (G) Japanese (kanji and hiragana): 805 + u+306A u+305C u+307F u+3093 u+306A u+65E5 u+672C u+8A9E u+3092 806 + u+8A71 u+3057 u+3066 u+304F u+308C u+306A u+3044 u+306E u+304B 807 + Punycode: n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa 808 + 809 + (H) Korean (Hangul syllables): 810 + u+C138 u+ACC4 u+C758 u+BAA8 u+B4E0 u+C0AC u+B78C u+B4E4 u+C774 811 + u+D55C u+AD6D u+C5B4 u+B97C u+C774 u+D574 u+D55C u+B2E4 u+BA74 812 + u+C5BC u+B9C8 u+B098 u+C88B u+C744 u+AE4C 813 + Punycode: 989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j\ 814 + psd879ccm6fea98c 815 + 816 + (I) Russian (Cyrillic): 817 + U+043F u+043E u+0447 u+0435 u+043C u+0443 u+0436 u+0435 u+043E 818 + u+043D u+0438 u+043D u+0435 u+0433 u+043E u+0432 u+043E u+0440 819 + u+044F u+0442 u+043F u+043E u+0440 u+0443 u+0441 u+0441 u+043A 820 + u+0438 821 + Punycode: b1abfaaepdrnnbgefbaDotcwatmq2g4l 822 + 823 + (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol 824 + U+0050 u+006F u+0072 u+0071 u+0075 u+00E9 u+006E u+006F u+0070 825 + u+0075 u+0065 u+0064 u+0065 u+006E u+0073 u+0069 u+006D u+0070 826 + u+006C u+0065 u+006D u+0065 u+006E u+0074 u+0065 u+0068 u+0061 827 + u+0062 u+006C u+0061 u+0072 u+0065 u+006E U+0045 u+0073 u+0070 828 + u+0061 u+00F1 u+006F u+006C 829 + Punycode: PorqunopuedensimplementehablarenEspaol-fmd56a 830 + 831 + (K) Vietnamese: 832 + T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch\ 833 + <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t 834 + U+0054 u+1EA1 u+0069 u+0073 u+0061 u+006F u+0068 u+1ECD u+006B 835 + u+0068 u+00F4 u+006E u+0067 u+0074 u+0068 u+1EC3 u+0063 u+0068 836 + u+1EC9 u+006E u+00F3 u+0069 u+0074 u+0069 u+1EBF u+006E u+0067 837 + U+0056 u+0069 u+1EC7 u+0074 838 + Punycode: TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g 839 + 840 + 841 + 842 + Costello Standards Track [Page 15] 843 + 844 + RFC 3492 IDNA Punycode March 2003 845 + 846 + 847 + The next several examples are all names of Japanese 
music artists, 848 + song titles, and TV programs, just because the author happens to have 849 + them handy (but Japanese is useful for providing examples of single- 850 + row text, two-row text, ideographic text, and various mixtures 851 + thereof). 852 + 853 + (L) 3<nen>B<gumi><kinpachi><sensei> 854 + u+0033 u+5E74 U+0042 u+7D44 u+91D1 u+516B u+5148 u+751F 855 + Punycode: 3B-ww4c5e180e575a65lsy2b 856 + 857 + (M) <amuro><namie>-with-SUPER-MONKEYS 858 + u+5B89 u+5BA4 u+5948 u+7F8E u+6075 u+002D u+0077 u+0069 u+0074 859 + u+0068 u+002D U+0053 U+0055 U+0050 U+0045 U+0052 u+002D U+004D 860 + U+004F U+004E U+004B U+0045 U+0059 U+0053 861 + Punycode: -with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n 862 + 863 + (N) Hello-Another-Way-<sorezore><no><basho> 864 + U+0048 u+0065 u+006C u+006C u+006F u+002D U+0041 u+006E u+006F 865 + u+0074 u+0068 u+0065 u+0072 u+002D U+0057 u+0061 u+0079 u+002D 866 + u+305D u+308C u+305E u+308C u+306E u+5834 u+6240 867 + Punycode: Hello-Another-Way--fc4qua05auwb3674vfr0b 868 + 869 + (O) <hitotsu><yane><no><shita>2 870 + u+3072 u+3068 u+3064 u+5C4B u+6839 u+306E u+4E0B u+0032 871 + Punycode: 2-u9tlzr9756bt3uc0v 872 + 873 + (P) Maji<de>Koi<suru>5<byou><mae> 874 + U+004D u+0061 u+006A u+0069 u+3067 U+004B u+006F u+0069 u+3059 875 + u+308B u+0035 u+79D2 u+524D 876 + Punycode: MajiKoi5-783gue6qz075azm5e 877 + 878 + (Q) <pafii>de<runba> 879 + u+30D1 u+30D5 u+30A3 u+30FC u+0064 u+0065 u+30EB u+30F3 u+30D0 880 + Punycode: de-jg4avhby1noc0d 881 + 882 + (R) <sono><supiido><de> 883 + u+305D u+306E u+30B9 u+30D4 u+30FC u+30C9 u+3067 884 + Punycode: d9juau41awczczp 885 + 886 + The last example is an ASCII string that breaks the existing rules 887 + for host name labels. (It is not a realistic example for IDNA, 888 + because IDNA never encodes pure ASCII labels.) 
889 + 890 + (S) -> $1.00 <- 891 + u+002D u+003E u+0020 u+0024 u+0031 u+002E u+0030 u+0030 u+0020 892 + u+003C u+002D 893 + Punycode: -> $1.00 <-- 894 + 895 + 896 + 897 + 898 + Costello Standards Track [Page 16] 899 + 900 + RFC 3492 IDNA Punycode March 2003 901 + 902 + 903 + 7.2 Decoding traces 904 + 905 + In the following traces, the evolving state of the decoder is shown 906 + as a sequence of hexadecimal values, representing the code points in 907 + the extended string. An asterisk appears just after the most 908 + recently inserted code point, indicating both n (the value preceding 909 + the asterisk) and i (the position of the value just after the 910 + asterisk). Other numerical values are decimal. 911 + 912 + Decoding trace of example B from section 7.1: 913 + 914 + n is 128, i is 0, bias is 72 915 + input is "ihqwcrb4cv8a8dqg056pqjye" 916 + there is no delimiter, so extended string starts empty 917 + delta "ihq" decodes to 19853 918 + bias becomes 21 919 + 4E0D * 920 + delta "wc" decodes to 64 921 + bias becomes 20 922 + 4E0D 4E2D * 923 + delta "rb" decodes to 37 924 + bias becomes 13 925 + 4E3A * 4E0D 4E2D 926 + delta "4c" decodes to 56 927 + bias becomes 17 928 + 4E3A 4E48 * 4E0D 4E2D 929 + delta "v8a" decodes to 599 930 + bias becomes 32 931 + 4E3A 4EC0 * 4E48 4E0D 4E2D 932 + delta "8d" decodes to 130 933 + bias becomes 23 934 + 4ED6 * 4E3A 4EC0 4E48 4E0D 4E2D 935 + delta "qg" decodes to 154 936 + bias becomes 25 937 + 4ED6 4EEC * 4E3A 4EC0 4E48 4E0D 4E2D 938 + delta "056p" decodes to 46301 939 + bias becomes 84 940 + 4ED6 4EEC 4E3A 4EC0 4E48 4E0D 4E2D 6587 * 941 + delta "qjye" decodes to 88531 942 + bias becomes 90 943 + 4ED6 4EEC 4E3A 4EC0 4E48 4E0D 8BF4 * 4E2D 6587 944 + 945 + 946 + 947 + 948 + 949 + 950 + 951 + 952 + 953 + 954 + Costello Standards Track [Page 17] 955 + 956 + RFC 3492 IDNA Punycode March 2003 957 + 958 + 959 + Decoding trace of example L from section 7.1: 960 + 961 + n is 128, i is 0, bias is 72 962 + input is
"3B-ww4c5e180e575a65lsy2b" 963 + literal portion is "3B-", so extended string starts as: 964 + 0033 0042 965 + delta "ww4c" decodes to 62042 966 + bias becomes 27 967 + 0033 0042 5148 * 968 + delta "5e" decodes to 139 969 + bias becomes 24 970 + 0033 0042 516B * 5148 971 + delta "180e" decodes to 16683 972 + bias becomes 67 973 + 0033 5E74 * 0042 516B 5148 974 + delta "575a" decodes to 34821 975 + bias becomes 82 976 + 0033 5E74 0042 516B 5148 751F * 977 + delta "65l" decodes to 14592 978 + bias becomes 67 979 + 0033 5E74 0042 7D44 * 516B 5148 751F 980 + delta "sy2b" decodes to 42088 981 + bias becomes 84 982 + 0033 5E74 0042 7D44 91D1 * 516B 5148 751F 983 + 984 + 985 + 986 + 987 + 988 + 989 + 990 + 991 + 992 + 993 + 994 + 995 + 996 + 997 + 998 + 999 + 1000 + 1001 + 1002 + 1003 + 1004 + 1005 + 1006 + 1007 + 1008 + 1009 + 1010 + Costello Standards Track [Page 18] 1011 + 1012 + RFC 3492 IDNA Punycode March 2003 1013 + 1014 + 1015 + 7.3 Encoding traces 1016 + 1017 + In the following traces, code point values are hexadecimal, while 1018 + other numerical values are decimal. 
1019 + 1020 + Encoding trace of example B from section 7.1: 1021 + 1022 + bias is 72 1023 + input is: 1024 + 4ED6 4EEC 4E3A 4EC0 4E48 4E0D 8BF4 4E2D 6587 1025 + there are no basic code points, so no literal portion 1026 + next code point to insert is 4E0D 1027 + needed delta is 19853, encodes as "ihq" 1028 + bias becomes 21 1029 + next code point to insert is 4E2D 1030 + needed delta is 64, encodes as "wc" 1031 + bias becomes 20 1032 + next code point to insert is 4E3A 1033 + needed delta is 37, encodes as "rb" 1034 + bias becomes 13 1035 + next code point to insert is 4E48 1036 + needed delta is 56, encodes as "4c" 1037 + bias becomes 17 1038 + next code point to insert is 4EC0 1039 + needed delta is 599, encodes as "v8a" 1040 + bias becomes 32 1041 + next code point to insert is 4ED6 1042 + needed delta is 130, encodes as "8d" 1043 + bias becomes 23 1044 + next code point to insert is 4EEC 1045 + needed delta is 154, encodes as "qg" 1046 + bias becomes 25 1047 + next code point to insert is 6587 1048 + needed delta is 46301, encodes as "056p" 1049 + bias becomes 84 1050 + next code point to insert is 8BF4 1051 + needed delta is 88531, encodes as "qjye" 1052 + bias becomes 90 1053 + output is "ihqwcrb4cv8a8dqg056pqjye" 1054 + 1055 + 1056 + 1057 + 1058 + 1059 + 1060 + 1061 + 1062 + 1063 + 1064 + 1065 + 1066 + Costello Standards Track [Page 19] 1067 + 1068 + RFC 3492 IDNA Punycode March 2003 1069 + 1070 + 1071 + Encoding trace of example L from section 7.1: 1072 + 1073 + bias is 72 1074 + input is: 1075 + 0033 5E74 0042 7D44 91D1 516B 5148 751F 1076 + basic code points (0033, 0042) are copied to literal portion: "3B-" 1077 + next code point to insert is 5148 1078 + needed delta is 62042, encodes as "ww4c" 1079 + bias becomes 27 1080 + next code point to insert is 516B 1081 + needed delta is 139, encodes as "5e" 1082 + bias becomes 24 1083 + next code point to insert is 5E74 1084 + needed delta is 16683, encodes as "180e" 1085 + bias becomes 67 1086 + next code point 
to insert is 751F 1087 + needed delta is 34821, encodes as "575a" 1088 + bias becomes 82 1089 + next code point to insert is 7D44 1090 + needed delta is 14592, encodes as "65l" 1091 + bias becomes 67 1092 + next code point to insert is 91D1 1093 + needed delta is 42088, encodes as "sy2b" 1094 + bias becomes 84 1095 + output is "3B-ww4c5e180e575a65lsy2b" 1096 + 1097 + 8. Security Considerations 1098 + 1099 + Users expect each domain name in DNS to be controlled by a single 1100 + authority. If a Unicode string intended for use as a domain label 1101 + could map to multiple ACE labels, then an internationalized domain 1102 + name could map to multiple ASCII domain names, each controlled by a 1103 + different authority, some of which could be spoofs that hijack 1104 + service requests intended for another. Therefore Punycode is 1105 + designed so that each Unicode string has a unique encoding. 1106 + 1107 + However, there can still be multiple Unicode representations of the 1108 + "same" text, for various definitions of "same". This problem is 1109 + addressed to some extent by the Unicode standard under the topic of 1110 + canonicalization, and this work is leveraged for domain names by 1111 + Nameprep [NAMEPREP]. 1112 + 1113 + 1114 + 1115 + 1116 + 1117 + 1118 + 1119 + 1120 + 1121 + 1122 + Costello Standards Track [Page 20] 1123 + 1124 + RFC 3492 IDNA Punycode March 2003 1125 + 1126 + 1127 + 9. References 1128 + 1129 + 9.1 Normative References 1130 + 1131 + [RFC2119] Bradner, S., "Key words for use in RFCs to Indicate 1132 + Requirement Levels", BCP 14, RFC 2119, March 1997. 1133 + 1134 + 9.2 Informative References 1135 + 1136 + [RFC952] Harrenstien, K., Stahl, M. and E. Feinler, "DOD Internet 1137 + Host Table Specification", RFC 952, October 1985. 1138 + 1139 + [RFC1034] Mockapetris, P., "Domain Names - Concepts and 1140 + Facilities", STD 13, RFC 1034, November 1987. 1141 + 1142 + [IDNA] Faltstrom, P., Hoffman, P. and A. 
Costello, 1143 + "Internationalizing Domain Names in Applications 1144 + (IDNA)", RFC 3490, March 2003. 1145 + 1146 + [NAMEPREP] Hoffman, P. and M. Blanchet, "Nameprep: A Stringprep 1147 + Profile for Internationalized Domain Names (IDN)", RFC 1148 + 3491, March 2003. 1149 + 1150 + [ASCII] Cerf, V., "ASCII format for Network Interchange", RFC 1151 + 20, October 1969. 1152 + 1153 + [PROVINCIAL] Kaplan, M., "The 'anyone can be provincial!' page", 1154 + http://www.trigeminal.com/samples/provincial.html. 1155 + 1156 + [UNICODE] The Unicode Consortium, "The Unicode Standard", 1157 + http://www.unicode.org/unicode/standard/standard.html. 1158 + 1159 + 1160 + 1161 + 1162 + 1163 + 1164 + 1165 + 1166 + 1167 + 1168 + 1169 + 1170 + 1171 + 1172 + 1173 + 1174 + 1175 + 1176 + 1177 + 1178 + Costello Standards Track [Page 21] 1179 + 1180 + RFC 3492 IDNA Punycode March 2003 1181 + 1182 + 1183 + A. Mixed-case annotation 1184 + 1185 + In order to use Punycode to represent case-insensitive strings, 1186 + higher layers need to case-fold the strings prior to Punycode 1187 + encoding. The encoded string can use mixed case as an annotation 1188 + telling how to convert the folded string into a mixed-case string for 1189 + display purposes. Note, however, that mixed-case annotation is not 1190 + used by the ToASCII and ToUnicode operations specified in [IDNA], and 1191 + therefore implementors of IDNA can disregard this appendix. 1192 + 1193 + Basic code points can use mixed case directly, because the decoder 1194 + copies them verbatim, leaving lowercase code points lowercase, and 1195 + leaving uppercase code points uppercase. Each non-basic code point 1196 + is represented by a delta, which is represented by a sequence of 1197 + basic code points, the last of which provides the annotation. 
If it 1198 + is uppercase, it is a suggestion to map the non-basic code point to 1199 + uppercase (if possible); if it is lowercase, it is a suggestion to 1200 + map the non-basic code point to lowercase (if possible). 1201 + 1202 + These annotations do not alter the code points returned by decoders; 1203 + the annotations are returned separately, for the caller to use or 1204 + ignore. Encoders can accept annotations in addition to code points, 1205 + but the annotations do not alter the output, except to influence the 1206 + uppercase/lowercase form of ASCII letters. 1207 + 1208 + Punycode encoders and decoders need not support these annotations, 1209 + and higher layers need not use them. 1210 + 1211 + B. Disclaimer and license 1212 + 1213 + Regarding this entire document or any portion of it (including the 1214 + pseudocode and C code), the author makes no guarantees and is not 1215 + responsible for any damage resulting from its use. The author grants 1216 + irrevocable permission to anyone to use, modify, and distribute it in 1217 + any way that does not diminish the rights of anyone else to use, 1218 + modify, and distribute it, provided that redistributed derivative 1219 + works do not contain misleading author or version information. 1220 + Derivative works need not be licensed under similar terms. 1221 + 1222 + 1223 + 1224 + 1225 + 1226 + 1227 + 1228 + 1229 + 1230 + 1231 + 1232 + 1233 + 1234 + Costello Standards Track [Page 22] 1235 + 1236 + RFC 3492 IDNA Punycode March 2003 1237 + 1238 + 1239 + C. Punycode sample implementation 1240 + 1241 + /* 1242 + punycode.c from RFC 3492 1243 + http://www.nicemice.net/idn/ 1244 + Adam M. Costello 1245 + http://www.nicemice.net/amc/ 1246 + 1247 + This is ANSI C code (C89) implementing Punycode (RFC 3492). 
1248 + 1249 + */ 1250 + 1251 + 1252 + /************************************************************/ 1253 + /* Public interface (would normally go in its own .h file): */ 1254 + 1255 + #include <limits.h> 1256 + 1257 + enum punycode_status { 1258 + punycode_success, 1259 + punycode_bad_input, /* Input is invalid. */ 1260 + punycode_big_output, /* Output would exceed the space provided. */ 1261 + punycode_overflow /* Input needs wider integers to process. */ 1262 + }; 1263 + 1264 + #if UINT_MAX >= (1 << 26) - 1 1265 + typedef unsigned int punycode_uint; 1266 + #else 1267 + typedef unsigned long punycode_uint; 1268 + #endif 1269 + 1270 + enum punycode_status punycode_encode( 1271 + punycode_uint input_length, 1272 + const punycode_uint input[], 1273 + const unsigned char case_flags[], 1274 + punycode_uint *output_length, 1275 + char output[] ); 1276 + 1277 + /* punycode_encode() converts Unicode to Punycode. The input */ 1278 + /* is represented as an array of Unicode code points (not code */ 1279 + /* units; surrogate pairs are not allowed), and the output */ 1280 + /* will be represented as an array of ASCII code points. The */ 1281 + /* output string is *not* null-terminated; it will contain */ 1282 + /* zeros if and only if the input contains zeros. (Of course */ 1283 + /* the caller can leave room for a terminator and add one if */ 1284 + /* needed.) The input_length is the number of code points in */ 1285 + /* the input. The output_length is an in/out argument: the */ 1286 + /* caller passes in the maximum number of code points that it */ 1287 + 1288 + 1289 + 1290 + Costello Standards Track [Page 23] 1291 + 1292 + RFC 3492 IDNA Punycode March 2003 1293 + 1294 + 1295 + /* can receive, and on successful return it will contain the */ 1296 + /* number of code points actually output. 
The case_flags array */ 1297 + /* holds input_length boolean values, where nonzero suggests that */ 1298 + /* the corresponding Unicode character be forced to uppercase */ 1299 + /* after being decoded (if possible), and zero suggests that */ 1300 + /* it be forced to lowercase (if possible). ASCII code points */ 1301 + /* are encoded literally, except that ASCII letters are forced */ 1302 + /* to uppercase or lowercase according to the corresponding */ 1303 + /* uppercase flags. If case_flags is a null pointer then ASCII */ 1304 + /* letters are left as they are, and other code points are */ 1305 + /* treated as if their uppercase flags were zero. The return */ 1306 + /* value can be any of the punycode_status values defined above */ 1307 + /* except punycode_bad_input; if not punycode_success, then */ 1308 + /* output_length and output might contain garbage. */ 1309 + 1310 + enum punycode_status punycode_decode( 1311 + punycode_uint input_length, 1312 + const char input[], 1313 + punycode_uint *output_length, 1314 + punycode_uint output[], 1315 + unsigned char case_flags[] ); 1316 + 1317 + /* punycode_decode() converts Punycode to Unicode. The input is */ 1318 + /* represented as an array of ASCII code points, and the output */ 1319 + /* will be represented as an array of Unicode code points. The */ 1320 + /* input_length is the number of code points in the input. The */ 1321 + /* output_length is an in/out argument: the caller passes in */ 1322 + /* the maximum number of code points that it can receive, and */ 1323 + /* on successful return it will contain the actual number of */ 1324 + /* code points output. The case_flags array needs room for at */ 1325 + /* least output_length values, or it can be a null pointer if the */ 1326 + /* case information is not needed.
A nonzero flag suggests that */ 1327 + /* the corresponding Unicode character be forced to uppercase */ 1328 + /* by the caller (if possible), while zero suggests that it be */ 1329 + /* forced to lowercase (if possible). ASCII code points are */ 1330 + /* output already in the proper case, but their flags will be set */ 1331 + /* appropriately so that applying the flags would be harmless. */ 1332 + /* The return value can be any of the punycode_status values */ 1333 + /* defined above; if not punycode_success, then output_length, */ 1334 + /* output, and case_flags might contain garbage. On success, the */ 1335 + /* decoder will never need to write an output_length greater than */ 1336 + /* input_length, because of how the encoding is defined. */ 1337 + 1338 + /**********************************************************/ 1339 + /* Implementation (would normally go in its own .c file): */ 1340 + 1341 + #include <string.h> 1342 + 1343 + 1344 + 1345 + 1346 + Costello Standards Track [Page 24] 1347 + 1348 + RFC 3492 IDNA Punycode March 2003 1349 + 1350 + 1351 + /*** Bootstring parameters for Punycode ***/ 1352 + 1353 + enum { base = 36, tmin = 1, tmax = 26, skew = 38, damp = 700, 1354 + initial_bias = 72, initial_n = 0x80, delimiter = 0x2D }; 1355 + 1356 + /* basic(cp) tests whether cp is a basic code point: */ 1357 + #define basic(cp) ((punycode_uint)(cp) < 0x80) 1358 + 1359 + /* delim(cp) tests whether cp is a delimiter: */ 1360 + #define delim(cp) ((cp) == delimiter) 1361 + 1362 + /* decode_digit(cp) returns the numeric value of a basic code */ 1363 + /* point (for use in representing integers) in the range 0 to */ 1364 + /* base-1, or base if cp does not represent a value. */ 1365 + 1366 + static punycode_uint decode_digit(punycode_uint cp) 1367 + { 1368 + return cp - 48 < 10 ? cp - 22 : cp - 65 < 26 ? cp - 65 : 1369 + cp - 97 < 26 ?
cp - 97 : base; 1370 + } 1371 + 1372 + /* encode_digit(d,flag) returns the basic code point whose value */ 1373 + /* (when used for representing integers) is d, which needs to be in */ 1374 + /* the range 0 to base-1. The lowercase form is used unless flag is */ 1375 + /* nonzero, in which case the uppercase form is used. The behavior */ 1376 + /* is undefined if flag is nonzero and digit d has no uppercase form. */ 1377 + 1378 + static char encode_digit(punycode_uint d, int flag) 1379 + { 1380 + return d + 22 + 75 * (d < 26) - ((flag != 0) << 5); 1381 + /* 0..25 map to ASCII a..z or A..Z */ 1382 + /* 26..35 map to ASCII 0..9 */ 1383 + } 1384 + 1385 + /* flagged(bcp) tests whether a basic code point is flagged */ 1386 + /* (uppercase). The behavior is undefined if bcp is not a */ 1387 + /* basic code point. */ 1388 + 1389 + #define flagged(bcp) ((punycode_uint)(bcp) - 65 < 26) 1390 + 1391 + /* encode_basic(bcp,flag) forces a basic code point to lowercase */ 1392 + /* if flag is zero, uppercase if flag is nonzero, and returns */ 1393 + /* the resulting code point. The code point is unchanged if it */ 1394 + /* is caseless. The behavior is undefined if bcp is not a basic */ 1395 + /* code point. */ 1396 + 1397 + static char encode_basic(punycode_uint bcp, int flag) 1398 + { 1399 + 1400 + 1401 + 1402 + Costello Standards Track [Page 25] 1403 + 1404 + RFC 3492 IDNA Punycode March 2003 1405 + 1406 + 1407 + bcp -= (bcp - 97 < 26) << 5; 1408 + return bcp + ((!flag && (bcp - 65 < 26)) << 5); 1409 + } 1410 + 1411 + /*** Platform-specific constants ***/ 1412 + 1413 + /* maxint is the maximum value of a punycode_uint variable: */ 1414 + static const punycode_uint maxint = -1; 1415 + /* Because maxint is unsigned, -1 becomes the maximum value. */ 1416 + 1417 + /*** Bias adaptation function ***/ 1418 + 1419 + static punycode_uint adapt( 1420 + punycode_uint delta, punycode_uint numpoints, int firsttime ) 1421 + { 1422 + punycode_uint k; 1423 + 1424 + delta = firsttime ? 
delta / damp : delta >> 1; 1425 + /* delta >> 1 is a faster way of doing delta / 2 */ 1426 + delta += delta / numpoints; 1427 + 1428 + for (k = 0; delta > ((base - tmin) * tmax) / 2; k += base) { 1429 + delta /= base - tmin; 1430 + } 1431 + 1432 + return k + (base - tmin + 1) * delta / (delta + skew); 1433 + } 1434 + 1435 + /*** Main encode function ***/ 1436 + 1437 + enum punycode_status punycode_encode( 1438 + punycode_uint input_length, 1439 + const punycode_uint input[], 1440 + const unsigned char case_flags[], 1441 + punycode_uint *output_length, 1442 + char output[] ) 1443 + { 1444 + punycode_uint n, delta, h, b, out, max_out, bias, j, m, q, k, t; 1445 + 1446 + /* Initialize the state: */ 1447 + 1448 + n = initial_n; 1449 + delta = out = 0; 1450 + max_out = *output_length; 1451 + bias = initial_bias; 1452 + 1453 + /* Handle the basic code points: */ 1454 + 1455 + 1456 + 1457 + 1458 + Costello Standards Track [Page 26] 1459 + 1460 + RFC 3492 IDNA Punycode March 2003 1461 + 1462 + 1463 + for (j = 0; j < input_length; ++j) { 1464 + if (basic(input[j])) { 1465 + if (max_out - out < 2) return punycode_big_output; 1466 + output[out++] = 1467 + case_flags ? encode_basic(input[j], case_flags[j]) : input[j]; 1468 + } 1469 + /* else if (input[j] < n) return punycode_bad_input; */ 1470 + /* (not needed for Punycode with unsigned code points) */ 1471 + } 1472 + 1473 + h = b = out; 1474 + 1475 + /* h is the number of code points that have been handled, b is the */ 1476 + /* number of basic code points, and out is the number of characters */ 1477 + /* that have been output. */ 1478 + 1479 + if (b > 0) output[out++] = delimiter; 1480 + 1481 + /* Main encoding loop: */ 1482 + 1483 + while (h < input_length) { 1484 + /* All non-basic code points < n have been */ 1485 + /* handled already. 
Find the next larger one: */ 1486 + 1487 + for (m = maxint, j = 0; j < input_length; ++j) { 1488 + /* if (basic(input[j])) continue; */ 1489 + /* (not needed for Punycode) */ 1490 + if (input[j] >= n && input[j] < m) m = input[j]; 1491 + } 1492 + 1493 + /* Increase delta enough to advance the decoder's */ 1494 + /* <n,i> state to <m,0>, but guard against overflow: */ 1495 + 1496 + if (m - n > (maxint - delta) / (h + 1)) return punycode_overflow; 1497 + delta += (m - n) * (h + 1); 1498 + n = m; 1499 + 1500 + for (j = 0; j < input_length; ++j) { 1501 + /* Punycode does not need to check whether input[j] is basic: */ 1502 + if (input[j] < n /* || basic(input[j]) */ ) { 1503 + if (++delta == 0) return punycode_overflow; 1504 + } 1505 + 1506 + if (input[j] == n) { 1507 + /* Represent delta as a generalized variable-length integer: */ 1508 + 1509 + for (q = delta, k = base; ; k += base) { 1510 + if (out >= max_out) return punycode_big_output; 1511 + 1512 + 1513 + 1514 + Costello Standards Track [Page 27] 1515 + 1516 + RFC 3492 IDNA Punycode March 2003 1517 + 1518 + 1519 + t = k <= bias /* + tmin */ ? tmin : /* +tmin not needed */ 1520 + k >= bias + tmax ? 
tmax : k - bias; 1521 + if (q < t) break; 1522 + output[out++] = encode_digit(t + (q - t) % (base - t), 0); 1523 + q = (q - t) / (base - t); 1524 + } 1525 + 1526 + output[out++] = encode_digit(q, case_flags && case_flags[j]); 1527 + bias = adapt(delta, h + 1, h == b); 1528 + delta = 0; 1529 + ++h; 1530 + } 1531 + } 1532 + 1533 + ++delta, ++n; 1534 + } 1535 + 1536 + *output_length = out; 1537 + return punycode_success; 1538 + } 1539 + 1540 + /*** Main decode function ***/ 1541 + 1542 + enum punycode_status punycode_decode( 1543 + punycode_uint input_length, 1544 + const char input[], 1545 + punycode_uint *output_length, 1546 + punycode_uint output[], 1547 + unsigned char case_flags[] ) 1548 + { 1549 + punycode_uint n, out, i, max_out, bias, 1550 + b, j, in, oldi, w, k, digit, t; 1551 + 1552 + /* Initialize the state: */ 1553 + 1554 + n = initial_n; 1555 + out = i = 0; 1556 + max_out = *output_length; 1557 + bias = initial_bias; 1558 + 1559 + /* Handle the basic code points: Let b be the number of input code */ 1560 + /* points before the last delimiter, or 0 if there is none, then */ 1561 + /* copy the first b code points to the output. */ 1562 + 1563 + for (b = j = 0; j < input_length; ++j) if (delim(input[j])) b = j; 1564 + if (b > max_out) return punycode_big_output; 1565 + 1566 + for (j = 0; j < b; ++j) { 1567 + 1568 + 1569 + 1570 + Costello Standards Track [Page 28] 1571 + 1572 + RFC 3492 IDNA Punycode March 2003 1573 + 1574 + 1575 + if (case_flags) case_flags[out] = flagged(input[j]); 1576 + if (!basic(input[j])) return punycode_bad_input; 1577 + output[out++] = input[j]; 1578 + } 1579 + 1580 + /* Main decoding loop: Start just after the last delimiter if any */ 1581 + /* basic code points were copied; start at the beginning otherwise. */ 1582 + 1583 + for (in = b > 0 ? b + 1 : 0; in < input_length; ++out) { 1584 + 1585 + /* in is the index of the next character to be consumed, and */ 1586 + /* out is the number of code points in the output array. 
*/ 1587 + 1588 + /* Decode a generalized variable-length integer into delta, */ 1589 + /* which gets added to i. The overflow checking is easier */ 1590 + /* if we increase i as we go, then subtract off its starting */ 1591 + /* value at the end to obtain delta. */ 1592 + 1593 + for (oldi = i, w = 1, k = base; ; k += base) { 1594 + if (in >= input_length) return punycode_bad_input; 1595 + digit = decode_digit(input[in++]); 1596 + if (digit >= base) return punycode_bad_input; 1597 + if (digit > (maxint - i) / w) return punycode_overflow; 1598 + i += digit * w; 1599 + t = k <= bias /* + tmin */ ? tmin : /* +tmin not needed */ 1600 + k >= bias + tmax ? tmax : k - bias; 1601 + if (digit < t) break; 1602 + if (w > maxint / (base - t)) return punycode_overflow; 1603 + w *= (base - t); 1604 + } 1605 + 1606 + bias = adapt(i - oldi, out + 1, oldi == 0); 1607 + 1608 + /* i was supposed to wrap around from out+1 to 0, */ 1609 + /* incrementing n each time, so we'll fix that now: */ 1610 + 1611 + if (i / (out + 1) > maxint - n) return punycode_overflow; 1612 + n += i / (out + 1); 1613 + i %= (out + 1); 1614 + 1615 + /* Insert n at position i of the output: */ 1616 + 1617 + /* not needed for Punycode: */ 1618 + /* if (decode_digit(n) <= base) return punycode_invalid_input; */ 1619 + if (out >= max_out) return punycode_big_output; 1620 + 1621 + if (case_flags) { 1622 + memmove(case_flags + i + 1, case_flags + i, out - i); 1623 + 1624 + 1625 + 1626 + Costello Standards Track [Page 29] 1627 + 1628 + RFC 3492 IDNA Punycode March 2003 1629 + 1630 + 1631 + /* Case of last character determines uppercase flag: */ 1632 + case_flags[i] = flagged(input[in - 1]); 1633 + } 1634 + 1635 + memmove(output + i + 1, output + i, (out - i) * sizeof *output); 1636 + output[i++] = n; 1637 + } 1638 + 1639 + *output_length = out; 1640 + return punycode_success; 1641 + } 1642 + 1643 + /******************************************************************/ 1644 + /* Wrapper for testing (would normally go in 
a separate .c file): */ 1645 + 1646 + #include <assert.h> 1647 + #include <stdio.h> 1648 + #include <stdlib.h> 1649 + #include <string.h> 1650 + 1651 + /* For testing, we'll just set some compile-time limits rather than */ 1652 + /* use malloc(), and set a compile-time option rather than using a */ 1653 + /* command-line option. */ 1654 + 1655 + enum { 1656 + unicode_max_length = 256, 1657 + ace_max_length = 256 1658 + }; 1659 + 1660 + static void usage(char **argv) 1661 + { 1662 + fprintf(stderr, 1663 + "\n" 1664 + "%s -e reads code points and writes a Punycode string.\n" 1665 + "%s -d reads a Punycode string and writes code points.\n" 1666 + "\n" 1667 + "Input and output are plain text in the native character set.\n" 1668 + "Code points are in the form u+hex separated by whitespace.\n" 1669 + "Although the specification allows Punycode strings to contain\n" 1670 + "any characters from the ASCII repertoire, this test code\n" 1671 + "supports only the printable characters, and needs the Punycode\n" 1672 + "string to be followed by a newline.\n" 1673 + "The case of the u in u+hex is the force-to-uppercase flag.\n" 1674 + , argv[0], argv[0]); 1675 + exit(EXIT_FAILURE); 1676 + } 1677 + 1678 + static void fail(const char *msg) 1679 + 1680 + 1681 + 1682 + Costello Standards Track [Page 30] 1683 + 1684 + RFC 3492 IDNA Punycode March 2003 1685 + 1686 + 1687 + { 1688 + fputs(msg,stderr); 1689 + exit(EXIT_FAILURE); 1690 + } 1691 + 1692 + static const char too_big[] = 1693 + "input or output is too large, recompile with larger limits\n"; 1694 + static const char invalid_input[] = "invalid input\n"; 1695 + static const char overflow[] = "arithmetic overflow\n"; 1696 + static const char io_error[] = "I/O error\n"; 1697 + 1698 + /* The following string is used to convert printable */ 1699 + /* characters between ASCII and the native charset: */ 1700 + 1701 + static const char print_ascii[] = 1702 + "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n" 1703 + "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n" 
1704 + " !\"#$%&'()*+,-./" 1705 + "0123456789:;<=>?" 1706 + "@ABCDEFGHIJKLMNO" 1707 + "PQRSTUVWXYZ[\\]^_" 1708 + "`abcdefghijklmno" 1709 + "pqrstuvwxyz{|}~\n"; 1710 + 1711 + int main(int argc, char **argv) 1712 + { 1713 + enum punycode_status status; 1714 + int r; 1715 + unsigned int input_length, output_length, j; 1716 + unsigned char case_flags[unicode_max_length]; 1717 + 1718 + if (argc != 2) usage(argv); 1719 + if (argv[1][0] != '-') usage(argv); 1720 + if (argv[1][2] != 0) usage(argv); 1721 + 1722 + if (argv[1][1] == 'e') { 1723 + punycode_uint input[unicode_max_length]; 1724 + unsigned long codept; 1725 + char output[ace_max_length+1], uplus[3]; 1726 + int c; 1727 + 1728 + /* Read the input code points: */ 1729 + 1730 + input_length = 0; 1731 + 1732 + for (;;) { 1733 + r = scanf("%2s%lx", uplus, &codept); 1734 + if (ferror(stdin)) fail(io_error); 1735 + 1736 + 1737 + 1738 + Costello Standards Track [Page 31] 1739 + 1740 + RFC 3492 IDNA Punycode March 2003 1741 + 1742 + 1743 + if (r == EOF || r == 0) break; 1744 + 1745 + if (r != 2 || uplus[1] != '+' || codept > (punycode_uint)-1) { 1746 + fail(invalid_input); 1747 + } 1748 + 1749 + if (input_length == unicode_max_length) fail(too_big); 1750 + 1751 + if (uplus[0] == 'u') case_flags[input_length] = 0; 1752 + else if (uplus[0] == 'U') case_flags[input_length] = 1; 1753 + else fail(invalid_input); 1754 + 1755 + input[input_length++] = codept; 1756 + } 1757 + 1758 + /* Encode: */ 1759 + 1760 + output_length = ace_max_length; 1761 + status = punycode_encode(input_length, input, case_flags, 1762 + &output_length, output); 1763 + if (status == punycode_bad_input) fail(invalid_input); 1764 + if (status == punycode_big_output) fail(too_big); 1765 + if (status == punycode_overflow) fail(overflow); 1766 + assert(status == punycode_success); 1767 + 1768 + /* Convert to native charset and output: */ 1769 + 1770 + for (j = 0; j < output_length; ++j) { 1771 + c = output[j]; 1772 + assert(c >= 0 && c <= 127); 1773 + if 
(print_ascii[c] == 0) fail(invalid_input); 1774 + output[j] = print_ascii[c]; 1775 + } 1776 + 1777 + output[j] = 0; 1778 + r = puts(output); 1779 + if (r == EOF) fail(io_error); 1780 + return EXIT_SUCCESS; 1781 + } 1782 + 1783 + if (argv[1][1] == 'd') { 1784 + char input[ace_max_length+2], *p, *pp; 1785 + punycode_uint output[unicode_max_length]; 1786 + 1787 + /* Read the Punycode input string and convert to ASCII: */ 1788 + 1789 + fgets(input, ace_max_length+2, stdin); 1790 + if (ferror(stdin)) fail(io_error); 1791 + 1792 + 1793 + 1794 + Costello Standards Track [Page 32] 1795 + 1796 + RFC 3492 IDNA Punycode March 2003 1797 + 1798 + 1799 + if (feof(stdin)) fail(invalid_input); 1800 + input_length = strlen(input) - 1; 1801 + if (input[input_length] != '\n') fail(too_big); 1802 + input[input_length] = 0; 1803 + 1804 + for (p = input; *p != 0; ++p) { 1805 + pp = strchr(print_ascii, *p); 1806 + if (pp == 0) fail(invalid_input); 1807 + *p = pp - print_ascii; 1808 + } 1809 + 1810 + /* Decode: */ 1811 + 1812 + output_length = unicode_max_length; 1813 + status = punycode_decode(input_length, input, &output_length, 1814 + output, case_flags); 1815 + if (status == punycode_bad_input) fail(invalid_input); 1816 + if (status == punycode_big_output) fail(too_big); 1817 + if (status == punycode_overflow) fail(overflow); 1818 + assert(status == punycode_success); 1819 + 1820 + /* Output the result: */ 1821 + 1822 + for (j = 0; j < output_length; ++j) { 1823 + r = printf("%s+%04lX\n", 1824 + case_flags[j] ? 
"U" : "u", 1825 + (unsigned long) output[j] ); 1826 + if (r < 0) fail(io_error); 1827 + } 1828 + 1829 + return EXIT_SUCCESS; 1830 + } 1831 + 1832 + usage(argv); 1833 + return EXIT_SUCCESS; /* not reached, but quiets compiler warning */ 1834 + } 1835 + 1836 + 1837 + 1838 + 1839 + 1840 + 1841 + 1842 + 1843 + 1844 + 1845 + 1846 + 1847 + 1848 + 1849 + 1850 + Costello Standards Track [Page 33] 1851 + 1852 + RFC 3492 IDNA Punycode March 2003 1853 + 1854 + 1855 + Author's Address 1856 + 1857 + Adam M. Costello 1858 + University of California, Berkeley 1859 + http://www.nicemice.net/amc/ 1860 + 1861 + 1862 + 1863 + 1864 + 1865 + 1866 + 1867 + 1868 + 1869 + 1870 + 1871 + 1872 + 1873 + 1874 + 1875 + 1876 + 1877 + 1878 + 1879 + 1880 + 1881 + 1882 + 1883 + 1884 + 1885 + 1886 + 1887 + 1888 + 1889 + 1890 + 1891 + 1892 + 1893 + 1894 + 1895 + 1896 + 1897 + 1898 + 1899 + 1900 + 1901 + 1902 + 1903 + 1904 + 1905 + 1906 + Costello Standards Track [Page 34] 1907 + 1908 + RFC 3492 IDNA Punycode March 2003 1909 + 1910 + 1911 + Full Copyright Statement 1912 + 1913 + Copyright (C) The Internet Society (2003). All Rights Reserved. 1914 + 1915 + This document and translations of it may be copied and furnished to 1916 + others, and derivative works that comment on or otherwise explain it 1917 + or assist in its implementation may be prepared, copied, published 1918 + and distributed, in whole or in part, without restriction of any 1919 + kind, provided that the above copyright notice and this paragraph are 1920 + included on all such copies and derivative works. 
However, this 1921 + document itself may not be modified in any way, such as by removing 1922 + the copyright notice or references to the Internet Society or other 1923 + Internet organizations, except as needed for the purpose of 1924 + developing Internet standards in which case the procedures for 1925 + copyrights defined in the Internet Standards process must be 1926 + followed, or as required to translate it into languages other than 1927 + English. 1928 + 1929 + The limited permissions granted above are perpetual and will not be 1930 + revoked by the Internet Society or its successors or assigns. 1931 + 1932 + This document and the information contained herein is provided on an 1933 + "AS IS" basis and THE INTERNET SOCIETY AND THE INTERNET ENGINEERING 1934 + TASK FORCE DISCLAIMS ALL WARRANTIES, EXPRESS OR IMPLIED, INCLUDING 1935 + BUT NOT LIMITED TO ANY WARRANTY THAT THE USE OF THE INFORMATION 1936 + HEREIN WILL NOT INFRINGE ANY RIGHTS OR ANY IMPLIED WARRANTIES OF 1937 + MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE. 1938 + 1939 + Acknowledgement 1940 + 1941 + Funding for the RFC Editor function is currently provided by the 1942 + Internet Society. 1943 + 1944 + 1945 + 1946 + 1947 + 1948 + 1949 + 1950 + 1951 + 1952 + 1953 + 1954 + 1955 + 1956 + 1957 + 1958 + 1959 + 1960 + 1961 + 1962 + Costello Standards Track [Page 35] 1963 +
+4
test/dune
··· 1 + (test 2 + (name test_punycode) 3 + (libraries puny alcotest) 4 + (modules test_punycode))
+546
test/test_punycode.ml
··· 1 + (* Comprehensive tests for Punycode (RFC 3492) implementation *) 2 + 3 + open Alcotest 4 + module Punycode = Puny.Punycode 5 + module Punycode_idna = Puny.Punycode_idna 6 + 7 + (* Helper to convert hex code points to Uchar array *) 8 + let codepoints_of_hex_list hex_list = 9 + Array.of_list (List.map Uchar.of_int hex_list) 10 + 11 + (* Helper to convert string to code points *) 12 + let codepoints_of_string s = 13 + let acc = ref [] in 14 + let i = ref 0 in 15 + while !i < String.length s do 16 + let dec = String.get_utf_8_uchar s !i in 17 + acc := Uchar.utf_decode_uchar dec :: !acc; 18 + i := !i + Uchar.utf_decode_length dec 19 + done; 20 + Array.of_list (List.rev !acc) 21 + 22 + (* Test result helper *) 23 + let check_encode_ok expected input = 24 + match Punycode.encode input with 25 + | Ok result -> check string "encode" expected result 26 + | Error e -> 27 + fail (Format.asprintf "encode failed: %a" Punycode.pp_error e) 28 + 29 + let check_decode_ok expected input = 30 + match Punycode.decode input with 31 + | Ok result -> 32 + let expected_arr = codepoints_of_hex_list expected in 33 + check int "length" (Array.length expected_arr) (Array.length result); 34 + Array.iteri (fun i u -> 35 + check int (Printf.sprintf "char %d" i) 36 + (Uchar.to_int expected_arr.(i)) (Uchar.to_int u) 37 + ) result 38 + | Error e -> 39 + fail (Format.asprintf "decode failed: %a" Punycode.pp_error e) 40 + 41 + let check_utf8_roundtrip s = 42 + match Punycode.encode_utf8 s with 43 + | Error e -> 44 + fail (Format.asprintf "encode_utf8 failed: %a" Punycode.pp_error e) 45 + | Ok encoded -> 46 + match Punycode.decode_utf8 encoded with 47 + | Error e -> 48 + fail (Format.asprintf "decode_utf8 failed: %a" Punycode.pp_error e) 49 + | Ok decoded -> 50 + check string "roundtrip" s decoded 51 + 52 + (* RFC 3492 Section 7.1 Test Vectors *) 53 + 54 + (* (A) Arabic (Egyptian) *) 55 + let arabic_codepoints = [ 56 + 0x0644; 0x064A; 0x0647; 0x0645; 0x0627; 0x0628; 0x062A; 0x0643; 57 + 
0x0644; 0x0645; 0x0648; 0x0634; 0x0639; 0x0631; 0x0628; 0x064A; 0x061F 58 + ] 59 + let arabic_punycode = "egbpdaj6bu4bxfgehfvwxn" 60 + 61 + (* (B) Chinese (simplified) *) 62 + let chinese_simplified_codepoints = [ 63 + 0x4ED6; 0x4EEC; 0x4E3A; 0x4EC0; 0x4E48; 0x4E0D; 0x8BF4; 0x4E2D; 0x6587 64 + ] 65 + let chinese_simplified_punycode = "ihqwcrb4cv8a8dqg056pqjye" 66 + 67 + (* (C) Chinese (traditional) *) 68 + let chinese_traditional_codepoints = [ 69 + 0x4ED6; 0x5011; 0x7232; 0x4EC0; 0x9EBD; 0x4E0D; 0x8AAA; 0x4E2D; 0x6587 70 + ] 71 + let chinese_traditional_punycode = "ihqwctvzc91f659drss3x8bo0yb" 72 + 73 + (* (D) Czech *) 74 + let czech_codepoints = [ 75 + 0x0050; 0x0072; 0x006F; 0x010D; 0x0070; 0x0072; 0x006F; 0x0073; 0x0074; 76 + 0x011B; 0x006E; 0x0065; 0x006D; 0x006C; 0x0075; 0x0076; 0x00ED; 0x010D; 77 + 0x0065; 0x0073; 0x006B; 0x0079 78 + ] 79 + let czech_punycode = "Proprostnemluvesky-uyb24dma41a" 80 + 81 + (* (E) Hebrew *) 82 + let hebrew_codepoints = [ 83 + 0x05DC; 0x05DE; 0x05D4; 0x05D4; 0x05DD; 0x05E4; 0x05E9; 0x05D5; 0x05D8; 84 + 0x05DC; 0x05D0; 0x05DE; 0x05D3; 0x05D1; 0x05E8; 0x05D9; 0x05DD; 0x05E2; 85 + 0x05D1; 0x05E8; 0x05D9; 0x05EA 86 + ] 87 + let hebrew_punycode = "4dbcagdahymbxekheh6e0a7fei0b" 88 + 89 + (* (F) Hindi (Devanagari) *) 90 + let hindi_codepoints = [ 91 + 0x092F; 0x0939; 0x0932; 0x094B; 0x0917; 0x0939; 0x093F; 0x0928; 0x094D; 92 + 0x0926; 0x0940; 0x0915; 0x094D; 0x092F; 0x094B; 0x0902; 0x0928; 0x0939; 93 + 0x0940; 0x0902; 0x092C; 0x094B; 0x0932; 0x0938; 0x0915; 0x0924; 0x0947; 94 + 0x0939; 0x0948; 0x0902 95 + ] 96 + let hindi_punycode = "i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd" 97 + 98 + (* (G) Japanese (kanji and hiragana) *) 99 + let japanese_codepoints = [ 100 + 0x306A; 0x305C; 0x307F; 0x3093; 0x306A; 0x65E5; 0x672C; 0x8A9E; 0x3092; 101 + 0x8A71; 0x3057; 0x3066; 0x304F; 0x308C; 0x306A; 0x3044; 0x306E; 0x304B 102 + ] 103 + let japanese_punycode = "n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa" 104 + 105 + (* (H) Korean (Hangul syllables) *) 
106 + let korean_codepoints = [ 107 + 0xC138; 0xACC4; 0xC758; 0xBAA8; 0xB4E0; 0xC0AC; 0xB78C; 0xB4E4; 0xC774; 108 + 0xD55C; 0xAD6D; 0xC5B4; 0xB97C; 0xC774; 0xD574; 0xD55C; 0xB2E4; 0xBA74; 109 + 0xC5BC; 0xB9C8; 0xB098; 0xC88B; 0xC744; 0xAE4C 110 + ] 111 + let korean_punycode = "989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5jpsd879ccm6fea98c" 112 + 113 + (* (I) Russian (Cyrillic) *) 114 + let russian_codepoints = [ 115 + 0x043F; 0x043E; 0x0447; 0x0435; 0x043C; 0x0443; 0x0436; 0x0435; 0x043E; 116 + 0x043D; 0x0438; 0x043D; 0x0435; 0x0433; 0x043E; 0x0432; 0x043E; 0x0440; 117 + 0x044F; 0x0442; 0x043F; 0x043E; 0x0440; 0x0443; 0x0441; 0x0441; 0x043A; 118 + 0x0438 119 + ] 120 + let russian_punycode = "b1abfaaepdrnnbgefbadotcwatmq2g4l" 121 + 122 + (* (J) Spanish *) 123 + let spanish_codepoints = [ 124 + 0x0050; 0x006F; 0x0072; 0x0071; 0x0075; 0x00E9; 0x006E; 0x006F; 0x0070; 125 + 0x0075; 0x0065; 0x0064; 0x0065; 0x006E; 0x0073; 0x0069; 0x006D; 0x0070; 126 + 0x006C; 0x0065; 0x006D; 0x0065; 0x006E; 0x0074; 0x0065; 0x0068; 0x0061; 127 + 0x0062; 0x006C; 0x0061; 0x0072; 0x0065; 0x006E; 0x0045; 0x0073; 0x0070; 128 + 0x0061; 0x00F1; 0x006F; 0x006C 129 + ] 130 + let spanish_punycode = "PorqunopuedensimplementehablarenEspaol-fmd56a" 131 + 132 + (* (K) Vietnamese *) 133 + let vietnamese_codepoints = [ 134 + 0x0054; 0x1EA1; 0x0069; 0x0073; 0x0061; 0x006F; 0x0068; 0x1ECD; 0x006B; 135 + 0x0068; 0x00F4; 0x006E; 0x0067; 0x0074; 0x0068; 0x1EC3; 0x0063; 0x0068; 136 + 0x1EC9; 0x006E; 0x00F3; 0x0069; 0x0074; 0x0069; 0x1EBF; 0x006E; 0x0067; 137 + 0x0056; 0x0069; 0x1EC7; 0x0074 138 + ] 139 + let vietnamese_punycode = "TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g" 140 + 141 + (* (L) 3年B組金八先生 - Japanese with ASCII *) 142 + let example_l_codepoints = [ 143 + 0x0033; 0x5E74; 0x0042; 0x7D44; 0x91D1; 0x516B; 0x5148; 0x751F 144 + ] 145 + let example_l_punycode = "3B-ww4c5e180e575a65lsy2b" 146 + 147 + (* (M) 安室奈美恵-with-SUPER-MONKEYS *) 148 + let example_m_codepoints = [ 149 + 0x5B89; 0x5BA4; 
0x5948; 0x7F8E; 0x6075; 0x002D; 0x0077; 0x0069; 0x0074; 150 + 0x0068; 0x002D; 0x0053; 0x0055; 0x0050; 0x0045; 0x0052; 0x002D; 0x004D; 151 + 0x004F; 0x004E; 0x004B; 0x0045; 0x0059; 0x0053 152 + ] 153 + let example_m_punycode = "-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n" 154 + 155 + (* (N) Hello-Another-Way-それぞれの場所 *) 156 + let example_n_codepoints = [ 157 + 0x0048; 0x0065; 0x006C; 0x006C; 0x006F; 0x002D; 0x0041; 0x006E; 0x006F; 158 + 0x0074; 0x0068; 0x0065; 0x0072; 0x002D; 0x0057; 0x0061; 0x0079; 0x002D; 159 + 0x305D; 0x308C; 0x305E; 0x308C; 0x306E; 0x5834; 0x6240 160 + ] 161 + let example_n_punycode = "Hello-Another-Way--fc4qua05auwb3674vfr0b" 162 + 163 + (* (O) ひとつ屋根の下2 *) 164 + let example_o_codepoints = [ 165 + 0x3072; 0x3068; 0x3064; 0x5C4B; 0x6839; 0x306E; 0x4E0B; 0x0032 166 + ] 167 + let example_o_punycode = "2-u9tlzr9756bt3uc0v" 168 + 169 + (* (P) MajiでKoiする5秒前 *) 170 + let example_p_codepoints = [ 171 + 0x004D; 0x0061; 0x006A; 0x0069; 0x3067; 0x004B; 0x006F; 0x0069; 0x3059; 172 + 0x308B; 0x0035; 0x79D2; 0x524D 173 + ] 174 + let example_p_punycode = "MajiKoi5-783gue6qz075azm5e" 175 + 176 + (* (Q) パフィーdeルンバ *) 177 + let example_q_codepoints = [ 178 + 0x30D1; 0x30D5; 0x30A3; 0x30FC; 0x0064; 0x0065; 0x30EB; 0x30F3; 0x30D0 179 + ] 180 + let example_q_punycode = "de-jg4avhby1noc0d" 181 + 182 + (* (R) そのスピードで *) 183 + let example_r_codepoints = [ 184 + 0x305D; 0x306E; 0x30B9; 0x30D4; 0x30FC; 0x30C9; 0x3067 185 + ] 186 + let example_r_punycode = "d9juau41awczczp" 187 + 188 + (* (S) -> $1.00 <- (pure ASCII) *) 189 + let example_s_codepoints = [ 190 + 0x002D; 0x003E; 0x0020; 0x0024; 0x0031; 0x002E; 0x0030; 0x0030; 0x0020; 191 + 0x003C; 0x002D 192 + ] 193 + let example_s_punycode = "-> $1.00 <--" 194 + 195 + (* Test functions *) 196 + 197 + let test_decode_arabic () = 198 + check_decode_ok arabic_codepoints arabic_punycode 199 + 200 + let test_decode_chinese_simplified () = 201 + check_decode_ok chinese_simplified_codepoints chinese_simplified_punycode 202 + 203 + let
test_decode_chinese_traditional () = 204 + check_decode_ok chinese_traditional_codepoints chinese_traditional_punycode 205 + 206 + let test_decode_hebrew () = 207 + check_decode_ok hebrew_codepoints hebrew_punycode 208 + 209 + let test_decode_hindi () = 210 + check_decode_ok hindi_codepoints hindi_punycode 211 + 212 + let test_decode_japanese () = 213 + check_decode_ok japanese_codepoints japanese_punycode 214 + 215 + let test_decode_korean () = 216 + check_decode_ok korean_codepoints korean_punycode 217 + 218 + let test_decode_example_l () = 219 + check_decode_ok example_l_codepoints example_l_punycode 220 + 221 + let test_decode_example_m () = 222 + check_decode_ok example_m_codepoints example_m_punycode 223 + 224 + let test_decode_example_n () = 225 + check_decode_ok example_n_codepoints example_n_punycode 226 + 227 + let test_decode_example_o () = 228 + check_decode_ok example_o_codepoints example_o_punycode 229 + 230 + let test_decode_example_q () = 231 + check_decode_ok example_q_codepoints example_q_punycode 232 + 233 + let test_decode_example_r () = 234 + check_decode_ok example_r_codepoints example_r_punycode 235 + 236 + let test_decode_czech () = 237 + check_decode_ok czech_codepoints czech_punycode 238 + 239 + let test_decode_russian () = 240 + check_decode_ok russian_codepoints (String.lowercase_ascii russian_punycode) 241 + 242 + let test_decode_spanish () = 243 + check_decode_ok spanish_codepoints spanish_punycode 244 + 245 + let test_decode_vietnamese () = 246 + check_decode_ok vietnamese_codepoints vietnamese_punycode 247 + 248 + let test_decode_example_p () = 249 + check_decode_ok example_p_codepoints example_p_punycode 250 + 251 + let test_decode_example_s () = 252 + check_decode_ok example_s_codepoints example_s_punycode 253 + 254 + let test_encode_arabic () = 255 + check_encode_ok arabic_punycode (codepoints_of_hex_list arabic_codepoints) 256 + 257 + let test_encode_chinese_simplified () = 258 + check_encode_ok chinese_simplified_punycode 259 + 
(codepoints_of_hex_list chinese_simplified_codepoints) 260 + 261 + let test_encode_chinese_traditional () = 262 + check_encode_ok chinese_traditional_punycode 263 + (codepoints_of_hex_list chinese_traditional_codepoints) 264 + 265 + let test_encode_hebrew () = 266 + check_encode_ok hebrew_punycode (codepoints_of_hex_list hebrew_codepoints) 267 + 268 + let test_encode_hindi () = 269 + check_encode_ok hindi_punycode (codepoints_of_hex_list hindi_codepoints) 270 + 271 + let test_encode_japanese () = 272 + check_encode_ok japanese_punycode (codepoints_of_hex_list japanese_codepoints) 273 + 274 + let test_encode_korean () = 275 + check_encode_ok korean_punycode (codepoints_of_hex_list korean_codepoints) 276 + 277 + let test_encode_example_l () = 278 + check_encode_ok (String.lowercase_ascii example_l_punycode) 279 + (codepoints_of_hex_list example_l_codepoints) 280 + 281 + let test_encode_example_m () = 282 + check_encode_ok (String.lowercase_ascii example_m_punycode) 283 + (codepoints_of_hex_list example_m_codepoints) 284 + 285 + let test_encode_example_n () = 286 + check_encode_ok (String.lowercase_ascii example_n_punycode) 287 + (codepoints_of_hex_list example_n_codepoints) 288 + 289 + let test_encode_example_o () = 290 + check_encode_ok (String.lowercase_ascii example_o_punycode) 291 + (codepoints_of_hex_list example_o_codepoints) 292 + 293 + let test_encode_example_q () = 294 + check_encode_ok example_q_punycode (codepoints_of_hex_list example_q_codepoints) 295 + 296 + let test_encode_example_r () = 297 + check_encode_ok example_r_punycode (codepoints_of_hex_list example_r_codepoints) 298 + 299 + (* UTF-8 roundtrip tests *) 300 + let test_utf8_roundtrip_german () = 301 + check_utf8_roundtrip "münchen" 302 + 303 + let test_utf8_roundtrip_chinese () = 304 + check_utf8_roundtrip "中文" 305 + 306 + let test_utf8_roundtrip_japanese () = 307 + check_utf8_roundtrip "日本語" 308 + 309 + let test_utf8_roundtrip_arabic () = 310 + check_utf8_roundtrip "العربية" 311 + 312 + let 
test_utf8_roundtrip_russian () = 313 + check_utf8_roundtrip "русский" 314 + 315 + let test_utf8_roundtrip_greek () = 316 + check_utf8_roundtrip "ελληνικά" 317 + 318 + let test_utf8_roundtrip_korean () = 319 + check_utf8_roundtrip "한국어" 320 + 321 + let test_utf8_roundtrip_emoji () = 322 + check_utf8_roundtrip "hello👋world" 323 + 324 + (* Label encoding tests *) 325 + let test_label_encode_ascii () = 326 + match Punycode.encode_label "example" with 327 + | Ok result -> check string "ascii passthrough" "example" result 328 + | Error e -> fail (Format.asprintf "encode_label failed: %a" Punycode.pp_error e) 329 + 330 + let test_label_encode_german () = 331 + match Punycode.encode_label "münchen" with 332 + | Ok result -> check string "german label" "xn--mnchen-3ya" result 333 + | Error e -> fail (Format.asprintf "encode_label failed: %a" Punycode.pp_error e) 334 + 335 + let test_label_decode_german () = 336 + match Punycode.decode_label "xn--mnchen-3ya" with 337 + | Ok result -> check string "german decode" "münchen" result 338 + | Error e -> fail (Format.asprintf "decode_label failed: %a" Punycode.pp_error e) 339 + 340 + (* IDNA tests *) 341 + let test_idna_to_ascii_simple () = 342 + match Punycode_idna.to_ascii "münchen.example.com" with 343 + | Ok result -> check string "idna to_ascii" "xn--mnchen-3ya.example.com" result 344 + | Error e -> fail (Format.asprintf "to_ascii failed: %a" Punycode_idna.pp_error e) 345 + 346 + let test_idna_to_unicode_simple () = 347 + match Punycode_idna.to_unicode "xn--mnchen-3ya.example.com" with 348 + | Ok result -> check string "idna to_unicode" "münchen.example.com" result 349 + | Error e -> fail (Format.asprintf "to_unicode failed: %a" Punycode_idna.pp_error e) 350 + 351 + let test_idna_roundtrip () = 352 + let original = "münchen.example.com" in 353 + match Punycode_idna.to_ascii original with 354 + | Error e -> fail (Format.asprintf "to_ascii failed: %a" Punycode_idna.pp_error e) 355 + | Ok ascii -> 356 + match 
Punycode_idna.to_unicode ascii with
    | Error e -> fail (Format.asprintf "to_unicode failed: %a" Punycode_idna.pp_error e)
    | Ok unicode -> check string "idna roundtrip" original unicode

(* A domain made only of ASCII labels is expected back from to_ascii
   exactly as it was given. *)
let test_idna_all_ascii () =
  match Punycode_idna.to_ascii "www.example.com" with
  | Error e -> fail (Format.asprintf "to_ascii failed: %a" Punycode_idna.pp_error e)
  | Ok encoded -> check string "all ascii passthrough" "www.example.com" encoded

(* A domain whose first label is non-ASCII: the output must carry the ACE
   prefix and must still end with the untouched ASCII labels. *)
let test_idna_mixed_labels () =
  match Punycode_idna.to_ascii "日本語.example.com" with
  | Error e -> fail (Format.asprintf "to_ascii failed: %a" Punycode_idna.pp_error e)
  | Ok encoded ->
    check bool "has ace prefix" true (Punycode.has_ace_prefix encoded);
    (* 12 is the byte length of the expected trailing labels; the result has
       to be strictly longer than that before the tail is compared. *)
    let tail_len = 12 in
    let total_len = String.length encoded in
    let keeps_ascii_tail =
      total_len > tail_len
      && String.sub encoded (total_len - tail_len) tail_len = ".example.com"
    in
    check bool "ends with example.com" true keeps_ascii_tail

(* ---- Case annotation ---- *)

(* Decodes a mixed-case sample string and inspects the returned case flags:
   one flag per code point, Uppercase for the leading M and Lowercase for
   the a that follows it. *)
let test_case_annotation_decode () =
  match Punycode.decode_with_case "MajiKoi5-783gue6qz075azm5e" with
  | Error e -> fail (Format.asprintf "decode_with_case failed: %a" Punycode.pp_error e)
  | Ok (decoded, flags) ->
    let decoded_len = Array.length decoded in
    check int "codepoints length" (List.length example_p_codepoints) decoded_len;
    check int "case_flags length" decoded_len (Array.length flags);
    let flag_at idx expected = flags.(idx) = expected in
    check bool "M uppercase" true (flag_at 0 Punycode.Uppercase);
    check bool "a lowercase" true (flag_at 1 Punycode.Lowercase)

(* Encodes the three basic code points a, b, c with flags Upper/Lower/Upper;
   the expected output spells them with that casing, followed by the
   delimiter. *)
let test_case_annotation_encode () =
  let abc = codepoints_of_hex_list [0x0061; 0x0062; 0x0063] in
  let flags = [| Punycode.Uppercase; Punycode.Lowercase; Punycode.Uppercase |] in
  match Punycode.encode_with_case abc flags with
  | Error e -> fail (Format.asprintf "encode_with_case failed: %a" Punycode.pp_error e)
  | Ok encoded -> check string "case encoded" "AbC-" encoded

(* ---- Edge cases ---- *)

(* Encoding an empty code point array must succeed and give the empty string. *)
let test_empty_input () =
  match Punycode.encode [||] with
  | Error _ -> fail "empty encode should succeed"
  | Ok encoded -> check string "empty encode" "" encoded

(* Decoding the empty string must succeed and give an empty array. *)
let test_empty_decode () =
  match Punycode.decode "" with
  | Error _ -> fail "empty decode should succeed"
  | Ok decoded -> check int "empty decode length" 0 (Array.length decoded)

(* Input made only of basic code points is expected to be copied through
   with a trailing delimiter. *)
let test_pure_ascii () =
  match Punycode.encode (codepoints_of_string "hello") with
  | Error e -> fail (Format.asprintf "encode failed: %a" Punycode.pp_error e)
  | Ok encoded -> check string "pure ascii" "hello-" encoded

(* Decoding input that ends in an exclamation mark must fail, and
   specifically with Invalid_digit; any other error is reported as a
   wrong error type. *)
let test_invalid_digit () =
  match Punycode.decode "hello!" with
  | Error (Punycode.Invalid_digit _) -> ()
  | Error e -> fail (Format.asprintf "wrong error type: %a" Punycode.pp_error e)
  | Ok _ -> fail "should fail on invalid digit"

(* A label of 100 ASCII characters must be rejected with Label_too_long. *)
let test_label_too_long () =
  let oversized = String.make 100 'a' in
  match Punycode.encode_label oversized with
  | Error (Punycode.Label_too_long _) -> ()
  | Error e -> fail (Format.asprintf "wrong error type: %a" Punycode.pp_error e)
  | Ok _ -> fail "should fail on long label"

(* The empty label must be rejected with Empty_label. *)
let test_empty_label () =
  match Punycode.encode_label "" with
  | Error Punycode.Empty_label -> ()
  | Error e -> fail (Format.asprintf "wrong error type: %a" Punycode.pp_error e)
  | Ok _ -> fail "should fail on empty label"

(* ---- Validation predicates ---- *)

(* is_basic is probed on both sides of the 0x7F / 0x80 boundary.
   Rows are (check label, expected result, code point). *)
let test_is_basic () =
  let cases = [
    ("space is basic", true, 0x20);
    ("A is basic", true, 0x41);
    ("DEL is basic", true, 0x7F);
    ("0x80 not basic", false, 0x80);
    ("ü not basic", false, 0xFC);
  ] in
  List.iter
    (fun (label, expected, code) ->
       check bool label expected (Punycode.is_basic (Uchar.of_int code)))
    cases

(* is_ascii_string on an ASCII string, a string containing a non-ASCII
   letter, and the empty string (which is expected to count as ASCII). *)
let test_is_ascii_string () =
  let cases = [
    ("ascii string", true, "hello");
    ("non-ascii string", false, "héllo");
    ("empty string", true, "");
  ] in
  List.iter
    (fun (label, expected, input) ->
       check bool label expected (Punycode.is_ascii_string input))
    cases

(* has_ace_prefix: the lower-case and upper-case spellings of the prefix are
   both expected to be accepted; a label without the prefix and an input
   shorter than the prefix are expected to be rejected. *)
let test_has_ace_prefix () =
  let cases = [
    ("has xn--", true, "xn--mnchen-3ya");
    ("has XN--", true, "XN--mnchen-3ya");
    ("no prefix", false, "example");
    ("too short", false, "xn-");
  ] in
  List.iter
    (fun (label, expected, input) ->
       check bool label expected (Punycode.has_ace_prefix input))
    cases

(* ---- Suites ----
   Each entry is (test name, speed, test function). *)

let decode_tests = [
  ("Arabic", `Quick, test_decode_arabic);
  ("Chinese simplified", `Quick, test_decode_chinese_simplified);
  ("Chinese traditional", `Quick, test_decode_chinese_traditional);
  ("Czech", `Quick, test_decode_czech);
  ("Hebrew", `Quick, test_decode_hebrew);
  ("Hindi", `Quick, test_decode_hindi);
  ("Japanese", `Quick, test_decode_japanese);
  ("Korean", `Quick, test_decode_korean);
  ("Russian", `Quick, test_decode_russian);
  ("Spanish", `Quick, test_decode_spanish);
  ("Vietnamese", `Quick, test_decode_vietnamese);
  ("Example L (mixed)", `Quick, test_decode_example_l);
  ("Example M (mixed)", `Quick, test_decode_example_m);
  ("Example N (mixed)", `Quick, test_decode_example_n);
  ("Example O (mixed)", `Quick, test_decode_example_o);
  ("Example P (mixed)", `Quick, test_decode_example_p);
  ("Example Q (mixed)", `Quick, test_decode_example_q);
  ("Example R", `Quick, test_decode_example_r);
  ("Example S (ASCII)", `Quick, test_decode_example_s);
]

let encode_tests = [
  ("Arabic", `Quick, test_encode_arabic);
  ("Chinese simplified", `Quick, test_encode_chinese_simplified);
  ("Chinese traditional", `Quick, test_encode_chinese_traditional);
  ("Hebrew", `Quick, test_encode_hebrew);
  ("Hindi", `Quick, test_encode_hindi);
  ("Japanese", `Quick, test_encode_japanese);
  ("Korean", `Quick, test_encode_korean);
  ("Example L (mixed)", `Quick, test_encode_example_l);
  ("Example M (mixed)", `Quick, test_encode_example_m);
  ("Example N (mixed)", `Quick, test_encode_example_n);
  ("Example O (mixed)", `Quick, test_encode_example_o);
  ("Example Q (mixed)", `Quick, test_encode_example_q);
  ("Example R", `Quick, test_encode_example_r);
]

let utf8_tests = [
  ("German roundtrip", `Quick, test_utf8_roundtrip_german);
  ("Chinese roundtrip", `Quick, test_utf8_roundtrip_chinese);
  ("Japanese roundtrip", `Quick, test_utf8_roundtrip_japanese);
  ("Arabic roundtrip", `Quick, test_utf8_roundtrip_arabic);
  ("Russian roundtrip", `Quick, test_utf8_roundtrip_russian);
  ("Greek roundtrip", `Quick, test_utf8_roundtrip_greek);
  ("Korean roundtrip", `Quick, test_utf8_roundtrip_korean);
  ("Emoji roundtrip", `Quick, test_utf8_roundtrip_emoji);
]

let label_tests = [
  ("ASCII passthrough", `Quick, test_label_encode_ascii);
  ("German encode", `Quick, test_label_encode_german);
  ("German decode", `Quick, test_label_decode_german);
]

let idna_tests = [
  ("to_ascii simple", `Quick, test_idna_to_ascii_simple);
  ("to_unicode simple", `Quick, test_idna_to_unicode_simple);
  ("roundtrip", `Quick, test_idna_roundtrip);
  ("all ASCII", `Quick, test_idna_all_ascii);
  ("mixed labels", `Quick, test_idna_mixed_labels);
]

let case_tests = [
  ("decode with case", `Quick, test_case_annotation_decode);
  ("encode with case", `Quick, test_case_annotation_encode);
]

let edge_case_tests = [
  ("empty encode", `Quick, test_empty_input);
  ("empty decode", `Quick, test_empty_decode);
  ("pure ASCII", `Quick, test_pure_ascii);
  ("invalid digit", `Quick, test_invalid_digit);
  ("label too long", `Quick, test_label_too_long);
  ("empty label", `Quick, test_empty_label);
]

let validation_tests = [
  ("is_basic", `Quick, test_is_basic);
  ("is_ascii_string", `Quick, test_is_ascii_string);
  ("has_ace_prefix", `Quick, test_has_ace_prefix);
]

(* Entry point: hands every suite to the runner under the Punycode heading,
   in the order listed. *)
let () =
  run "Punycode" [
    ("decode RFC vectors", decode_tests);
    ("encode RFC vectors", encode_tests);
    ("UTF-8 roundtrip", utf8_tests);
    ("label operations", label_tests);
    ("IDNA operations", idna_tests);
    ("case annotation", case_tests);
    ("edge cases", edge_case_tests);
    ("validation", validation_tests);
  ]