Punycode (RFC3492) in OCaml

Merge remote-tracking branch 'verse/gazagnaire.org/main'

+1101 -717
+1
dune
··· 1 1 ; Root dune file 2 2 3 3 ; Ignore third_party directory (for fetched dependency sources) 4 + 4 5 (data_only_dirs third_party)
+15
fuzz/dune
··· 1 + ; Crowbar fuzz testing for punycode 2 + ; 3 + ; To run: dune exec ocaml-punycode/fuzz/fuzz_punycode.exe 4 + ; With AFL: afl-fuzz -i fuzz/corpus -o fuzz/findings -- ./_build/default/ocaml-punycode/fuzz/fuzz_punycode.exe @@ 5 + 6 + (executable 7 + (name fuzz_punycode) 8 + (modules fuzz_punycode) 9 + (libraries punycode crowbar)) 10 + 11 + (rule 12 + (alias fuzz) 13 + (deps fuzz_punycode.exe) 14 + (action 15 + (run %{exe:fuzz_punycode.exe})))
+63
fuzz/fuzz_punycode.ml
··· 1 + (*--------------------------------------------------------------------------- 2 + Copyright (c) 2025 Thomas Gazagnaire. All rights reserved. 3 + SPDX-License-Identifier: MIT 4 + ---------------------------------------------------------------------------*) 5 + 6 + (* Crowbar-based fuzz testing for Punycode encoding/decoding *) 7 + 8 + open Crowbar 9 + 10 + (* Test that encode_utf8 never crashes on arbitrary input *) 11 + let test_encode_no_crash input = 12 + ignore (Punycode.encode_utf8 input); 13 + check true 14 + 15 + (* Test that decode_utf8 never crashes on arbitrary input *) 16 + let test_decode_no_crash input = 17 + ignore (Punycode.decode_utf8 input); 18 + check true 19 + 20 + (* Test roundtrip: encode then decode should give back original (case-insensitive) 21 + IDNA/Punycode lowercases ASCII characters during encoding per RFC 5891 *) 22 + let test_roundtrip input = 23 + match Punycode.encode_utf8 input with 24 + | Ok encoded -> ( 25 + match Punycode.decode_utf8 encoded with 26 + | Ok decoded -> 27 + (* Compare lowercase versions since IDNA lowercases ASCII *) 28 + check_eq ~pp:Format.pp_print_string 29 + (String.lowercase_ascii input) 30 + (String.lowercase_ascii decoded) 31 + | Error _ -> 32 + (* Some encoded values might not decode, that's ok for fuzz testing *) 33 + check true) 34 + | Error _ -> 35 + (* Some inputs might not encode, that's ok *) 36 + check true 37 + 38 + (* Test ASCII-only strings (should pass through mostly unchanged) *) 39 + let test_ascii_string input = 40 + if String.length input > 0 then begin 41 + let ascii_only = 42 + String.init 43 + (String.length input mod 64) 44 + (fun i -> 45 + Char.chr (Char.code input.[i mod String.length input] mod 128)) 46 + in 47 + if String.length ascii_only > 0 then 48 + ignore (Punycode.encode_utf8 ascii_only) 49 + end; 50 + check true 51 + 52 + (* Test inputs starting with ACE prefix "xn--" *) 53 + let test_ace_prefix input = 54 + let ace_input = "xn--" ^ input in 55 + ignore (Punycode.decode_utf8 ace_input); 56 + check true 57 + 58 + let () = 59 + add_test ~name:"punycode: encode no crash" [ bytes ] test_encode_no_crash; 60 + add_test ~name:"punycode: decode no crash" [ bytes ] test_decode_no_crash; 61 + add_test ~name:"punycode: roundtrip" [ bytes ] test_roundtrip; 62 + add_test ~name:"punycode: ascii string" [ bytes ] test_ascii_string; 63 + add_test ~name:"punycode: ace prefix" [ bytes ] test_ace_prefix
+214 -231
lib/punycode.ml
··· 20 20 21 21 (* {1 Position Tracking} *) 22 22 23 - type position = { 24 - byte_offset : int; 25 - char_index : int; 26 - } 23 + type position = { byte_offset : int; char_index : int } 27 24 28 25 let position_byte_offset pos = pos.byte_offset 29 26 let position_char_index pos = pos.char_index 30 27 31 28 let pp_position fmt pos = 32 29 Format.fprintf fmt "byte %d, char %d" pos.byte_offset pos.char_index 33 - 34 30 35 31 (* {1 Error Types} *) 36 32 ··· 45 41 46 42 let pp_error fmt = function 47 43 | Overflow pos -> 48 - Format.fprintf fmt "arithmetic overflow at %a" pp_position pos 44 + Format.fprintf fmt "arithmetic overflow at %a" pp_position pos 49 45 | Invalid_character (pos, u) -> 50 - Format.fprintf fmt "invalid character U+%04X at %a" 51 - (Uchar.to_int u) pp_position pos 46 + Format.fprintf fmt "invalid character U+%04X at %a" (Uchar.to_int u) 47 + pp_position pos 52 48 | Invalid_digit (pos, c) -> 53 - Format.fprintf fmt "invalid Punycode digit '%c' (0x%02X) at %a" 54 - c (Char.code c) pp_position pos 49 + Format.fprintf fmt "invalid Punycode digit '%c' (0x%02X) at %a" c 50 + (Char.code c) pp_position pos 55 51 | Unexpected_end pos -> 56 - Format.fprintf fmt "unexpected end of input at %a" pp_position pos 52 + Format.fprintf fmt "unexpected end of input at %a" pp_position pos 57 53 | Invalid_utf8 pos -> 58 - Format.fprintf fmt "invalid UTF-8 sequence at %a" pp_position pos 54 + Format.fprintf fmt "invalid UTF-8 sequence at %a" pp_position pos 59 55 | Label_too_long len -> 60 - Format.fprintf fmt "label too long: %d bytes (max %d)" len max_label_length 61 - | Empty_label -> 62 - Format.fprintf fmt "empty label" 56 + Format.fprintf fmt "label too long: %d bytes (max %d)" len 57 + max_label_length 58 + | Empty_label -> Format.fprintf fmt "empty label" 63 59 64 - let error_to_string err = 65 - Format.asprintf "%a" pp_error err 60 + let error_to_string err = Format.asprintf "%a" pp_error err 66 61 67 62 (* {1 Error Constructors} *) 68 63 ··· 80 75 81 76 (* {1 Basic Predicates} *) 82 77 83 - let is_basic u = 84 - Uchar.to_int u < 0x80 85 - 86 - let is_ascii_string s = 87 - String.for_all (fun c -> Char.code c < 0x80) s 78 + let is_basic u = Uchar.to_int u < 0x80 79 + let is_ascii_string s = String.for_all (fun c -> Char.code c < 0x80) s 88 80 89 81 let has_ace_prefix s = 90 82 let len = String.length s in 91 - len >= 4 && 92 - (s.[0] = 'x' || s.[0] = 'X') && 93 - (s.[1] = 'n' || s.[1] = 'N') && 94 - s.[2] = '-' && s.[3] = '-' 83 + len >= 4 84 + && (s.[0] = 'x' || s.[0] = 'X') 85 + && (s.[1] = 'n' || s.[1] = 'N') 86 + && s.[2] = '-' 87 + && s.[3] = '-' 95 88 96 89 (* {1 Digit Encoding/Decoding (RFC 3492 Section 5)} 97 90 ··· 101 94 *) 102 95 103 96 let encode_digit d case_flag = 104 - if d < 26 then 105 - Char.chr (d + (if case_flag = Uppercase then 0x41 else 0x61)) 106 - else 107 - Char.chr (d - 26 + 0x30) 97 + if d < 26 then Char.chr (d + if case_flag = Uppercase then 0x41 else 0x61) 98 + else Char.chr (d - 26 + 0x30) 108 99 109 100 let decode_digit c = 110 101 let code = Char.code c in 111 - if code >= 0x30 && code <= 0x39 then 112 - Some (code - 0x30 + 26) (* '0'-'9' -> 26-35 *) 113 - else if code >= 0x41 && code <= 0x5A then 114 - Some (code - 0x41) (* 'A'-'Z' -> 0-25 *) 115 - else if code >= 0x61 && code <= 0x7A then 116 - Some (code - 0x61) (* 'a'-'z' -> 0-25 *) 117 - else 118 - None 102 + if code >= 0x30 && code <= 0x39 then Some (code - 0x30 + 26) 103 + (* '0'-'9' -> 26-35 *) 104 + else if code >= 0x41 && code <= 0x5A then Some (code - 0x41) 105 + (* 'A'-'Z' -> 0-25 *) 106 + else if code >= 0x61 && code <= 0x7A then Some (code - 0x61) 107 + (* 'a'-'z' -> 0-25 *) 108 + else None 119 109 120 110 (* Check if a character is "flagged" (uppercase) for case annotation *) 121 111 let is_flagged c = 122 112 let code = Char.code c in 123 - code >= 0x41 && code <= 0x5A (* 'A'-'Z' *) 113 + code >= 0x41 && code <= 0x5A (* 'A'-'Z' *) 124 114 125 115 (* {1 Bias Adaptation (RFC 3492 Section 6.1)} *) 126 116 127 117 let adapt ~delta ~numpoints ~firsttime = 128 118 let delta = if firsttime then delta / damp else delta / 2 in 129 119 let delta = delta + (delta / numpoints) in 130 - let threshold = ((base - tmin) * tmax) / 2 in 120 + let threshold = (base - tmin) * tmax / 2 in 131 121 let rec loop delta k = 132 - if delta > threshold then 133 - loop (delta / (base - tmin)) (k + base) 134 - else 135 - k + (((base - tmin + 1) * delta) / (delta + skew)) 122 + if delta > threshold then loop (delta / (base - tmin)) (k + base) 123 + else k + ((base - tmin + 1) * delta / (delta + skew)) 136 124 in 137 125 loop delta 0 138 126 ··· 147 135 148 136 let safe_mul_add a b c pos = 149 137 if c = 0 then Ok a 150 - else if b > (max_int_value - a) / c then 151 - overflow pos 152 - else 153 - Ok (a + b * c) 138 + else if b > (max_int_value - a) / c then overflow pos 139 + else Ok (a + (b * c)) 154 140 155 141 (* {1 UTF-8 to Code Points Conversion} *) 156 142 ··· 167 153 acc := Uchar.utf_decode_uchar dec :: !acc; 168 154 byte_offset := !byte_offset + Uchar.utf_decode_length dec; 169 155 incr char_index 170 - end else begin 156 + end 157 + else begin 171 158 error := Some (Invalid_utf8 pos) 172 159 end 173 160 done; ··· 186 173 187 174 let encode_impl codepoints case_flags = 188 175 let input_length = Array.length codepoints in 189 - if input_length = 0 then 190 - Ok "" 176 + if input_length = 0 then Ok "" 191 177 else begin 192 178 let output = Buffer.create (input_length * 2) in 193 179 ··· 198 184 if is_basic cp then begin 199 185 let c = Uchar.to_int cp in 200 186 let case = 201 - match case_flags with 202 - | Some flags -> flags.(j) 203 - | None -> Lowercase 187 + match case_flags with Some flags -> flags.(j) | None -> Lowercase 204 188 in 205 189 (* Preserve or apply case for ASCII letters *) 206 190 let c' = ··· 208 192 if case = Lowercase then c + 0x20 else c 209 193 else if c >= 0x61 && c <= 0x7A then (* 'a'-'z' *) 210 194 if case = Uppercase then c - 0x20 else c 211 - else 212 - c 195 + else c 213 196 in 214 197 Buffer.add_char output (Char.chr c'); 215 198 incr basic_count ··· 220 203 let h = ref b in 221 204 222 205 (* Add delimiter if there were basic code points *) 223 - if b > 0 then 224 - Buffer.add_char output delimiter; 206 + if b > 0 then Buffer.add_char output delimiter; 225 207 226 208 (* Main encoding loop *) 227 209 let n = ref initial_n in ··· 232 214 233 215 while !h < input_length && !result = Ok () do 234 216 (* Find minimum code point >= n *) 235 - let m = Array.fold_left (fun acc cp -> 236 - let cp_val = Uchar.to_int cp in 237 - if cp_val >= !n && cp_val < acc then cp_val else acc 238 - ) max_int_value codepoints in 217 + let m = 218 + Array.fold_left 219 + (fun acc cp -> 220 + let cp_val = Uchar.to_int cp in 221 + if cp_val >= !n && cp_val < acc then cp_val else acc) 222 + max_int_value codepoints 223 + in 239 224 240 225 (* Increase delta to advance state to <m, 0> *) 241 226 let pos = { byte_offset = 0; char_index = !h } in 242 - (match safe_mul_add !delta (m - !n) (!h + 1) pos with 243 - | Error e -> result := Error e 244 - | Ok new_delta -> 245 - delta := new_delta; 246 - n := m; 227 + match safe_mul_add !delta (m - !n) (!h + 1) pos with 228 + | Error e -> result := Error e 229 + | Ok new_delta -> 230 + delta := new_delta; 231 + n := m; 247 232 248 - (* Process each code point *) 249 - let j = ref 0 in 250 - while !j < input_length && !result = Ok () do 251 - let cp = Uchar.to_int codepoints.(!j) in 252 - let pos = { byte_offset = 0; char_index = !j } in 233 + (* Process each code point *) 234 + let j = ref 0 in 235 + while !j < input_length && !result = Ok () do 236 + let cp = Uchar.to_int codepoints.(!j) in 237 + let pos = { byte_offset = 0; char_index = !j } in 253 238 254 - if cp < !n then begin 255 - incr delta; 256 - if !delta = 0 then (* Overflow *) 257 - result := overflow pos 258 - end 259 - else if cp = !n then begin 260 - (* Encode delta as variable-length integer *) 261 - let q = ref !delta in 262 - let k = ref base in 263 - let done_encoding = ref false in 239 + if cp < !n then begin 240 + incr delta; 241 + if !delta = 0 then (* Overflow *) 242 + result := overflow pos 243 + end 244 + else if cp = !n then begin 245 + (* Encode delta as variable-length integer *) 246 + let q = ref !delta in 247 + let k = ref base in 248 + let done_encoding = ref false in 264 249 265 - while not !done_encoding do 266 - let t = 267 - if !k <= !bias then tmin 268 - else if !k >= !bias + tmax then tmax 269 - else !k - !bias 270 - in 271 - if !q < t then begin 272 - (* Output final digit *) 273 - let case = 274 - match case_flags with 275 - | Some flags -> flags.(!j) 276 - | None -> Lowercase 277 - in 278 - Buffer.add_char output (encode_digit !q case); 279 - done_encoding := true 280 - end else begin 281 - (* Output intermediate digit and continue *) 282 - let digit = t + ((!q - t) mod (base - t)) in 283 - Buffer.add_char output (encode_digit digit Lowercase); 284 - q := (!q - t) / (base - t); 285 - k := !k + base 286 - end 287 - done; 250 + while not !done_encoding do 251 + let t = 252 + if !k <= !bias then tmin 253 + else if !k >= !bias + tmax then tmax 254 + else !k - !bias 255 + in 256 + if !q < t then begin 257 + (* Output final digit *) 258 + let case = 259 + match case_flags with 260 + | Some flags -> flags.(!j) 261 + | None -> Lowercase 262 + in 263 + Buffer.add_char output (encode_digit !q case); 264 + done_encoding := true 265 + end 266 + else begin 267 + (* Output intermediate digit and continue *) 268 + let digit = t + ((!q - t) mod (base - t)) in 269 + Buffer.add_char output (encode_digit digit Lowercase); 270 + q := (!q - t) / (base - t); 271 + k := !k + base 272 + end 273 + done; 288 274 289 - bias := adapt ~delta:!delta ~numpoints:(!h + 1) ~firsttime:(!h = b); 290 - delta := 0; 291 - incr h 292 - end; 293 - incr j 294 - done; 275 + bias := adapt ~delta:!delta ~numpoints:(!h + 1) ~firsttime:(!h = b); 276 + delta := 0; 277 + incr h 278 + end; 279 + incr j 280 + done; 295 281 296 - incr delta; 297 - incr n) 282 + incr delta; 283 + incr n 298 284 done; 299 285 300 286 match !result with ··· 302 288 | Ok () -> Ok (Buffer.contents output) 303 289 end 304 290 305 - let encode codepoints = 306 - encode_impl codepoints None 291 + let encode codepoints = encode_impl codepoints None 307 292 308 293 let encode_with_case codepoints case_flags = 309 294 if Array.length codepoints <> Array.length case_flags then ··· 314 299 315 300 let decode_impl input = 316 301 let input_length = String.length input in 317 - if input_length = 0 then 318 - Ok ([||], [||]) 302 + if input_length = 0 then Ok ([||], [||]) 319 303 else begin 320 304 (* Find last delimiter *) 321 305 let b = Option.value ~default:0 (String.rindex_opt input delimiter) in ··· 334 318 error := Some (Invalid_character (pos, Uchar.of_int code)) 335 319 else begin 336 320 output := Uchar.of_int code :: !output; 337 - case_output := (if is_flagged c then Uppercase else Lowercase) :: !case_output 321 + case_output := 322 + (if is_flagged c then Uppercase else Lowercase) :: !case_output 338 323 end 339 324 end 340 325 done; 341 326 342 327 match !error with 343 328 | Some e -> Error e 344 - | None -> 345 - let output = ref (Array.of_list (List.rev !output)) in 346 - let case_output = ref (Array.of_list (List.rev !case_output)) in 347 - 348 - (* Main decoding loop *) 349 - let n = ref initial_n in 350 - let i = ref 0 in 351 - let bias = ref initial_bias in 352 - let in_pos = ref (if b > 0 then b + 1 else 0) in 353 - let result = ref (Ok ()) in 329 + | None -> ( 330 + let output = ref (Array.of_list (List.rev !output)) in 331 + let case_output = ref (Array.of_list (List.rev !case_output)) in 354 332 355 - while !in_pos < input_length && !result = Ok () do 356 - let oldi = !i in 357 - let w = ref 1 in 358 - let k = ref base in 359 - let done_decoding = ref false in 333 + (* Main decoding loop *) 334 + let n = ref initial_n in 335 + let i = ref 0 in 336 + let bias = ref initial_bias in 337 + let in_pos = ref (if b > 0 then b + 1 else 0) in 338 + let result = ref (Ok ()) in 360 339 361 - while not !done_decoding && !result = Ok () do 362 - let pos = { byte_offset = !in_pos; char_index = Array.length !output } in 340 + while !in_pos < input_length && !result = Ok () do 341 + let oldi = !i in 342 + let w = ref 1 in 343 + let k = ref base in 344 + let done_decoding = ref false in 363 345 364 - if !in_pos >= input_length then begin 365 - result := unexpected_end pos; 366 - done_decoding := true 367 - end else begin 368 - let c = input.[!in_pos] in 369 - incr in_pos; 346 + while (not !done_decoding) && !result = Ok () do 347 + let pos = 348 + { byte_offset = !in_pos; char_index = Array.length !output } 349 + in 370 350 371 - match decode_digit c with 372 - | None -> 373 - result := invalid_digit pos c; 351 + if !in_pos >= input_length then begin 352 + result := unexpected_end pos; 374 353 done_decoding := true 375 - | Some digit -> 376 - (* i = i + digit * w, with overflow check *) 377 - (match safe_mul_add !i digit !w pos with 378 - | Error e -> 379 - result := Error e; 380 - done_decoding := true 381 - | Ok new_i -> 382 - i := new_i; 354 + end 355 + else begin 356 + let c = input.[!in_pos] in 357 + incr in_pos; 383 358 384 - let t = 385 - if !k <= !bias then tmin 386 - else if !k >= !bias + tmax then tmax 387 - else !k - !bias 388 - in 359 + match decode_digit c with 360 + | None -> 361 + result := invalid_digit pos c; 362 + done_decoding := true 363 + | Some digit -> ( 364 + (* i = i + digit * w, with overflow check *) 365 + match safe_mul_add !i digit !w pos with 366 + | Error e -> 367 + result := Error e; 368 + done_decoding := true 369 + | Ok new_i -> 370 + i := new_i; 389 371 390 - if digit < t then begin 391 - (* Record case flag from this final digit *) 392 - done_decoding := true 393 - end else begin 394 - (* w = w * (base - t), with overflow check *) 395 - let base_minus_t = base - t in 396 - if !w > max_int_value / base_minus_t then begin 397 - result := overflow pos; 398 - done_decoding := true 399 - end else begin 400 - w := !w * base_minus_t; 401 - k := !k + base 402 - end 403 - end) 404 - end 405 - done; 372 + let t = 373 + if !k <= !bias then tmin 374 + else if !k >= !bias + tmax then tmax 375 + else !k - !bias 376 + in 406 377 407 - if !result = Ok () then begin 408 - let out_len = Array.length !output in 409 - bias := adapt ~delta:(!i - oldi) ~numpoints:(out_len + 1) ~firsttime:(oldi = 0); 378 + if digit < t then begin 379 + (* Record case flag from this final digit *) 380 + done_decoding := true 381 + end 382 + else begin 383 + (* w = w * (base - t), with overflow check *) 384 + let base_minus_t = base - t in 385 + if !w > max_int_value / base_minus_t then begin 386 + result := overflow pos; 387 + done_decoding := true 388 + end 389 + else begin 390 + w := !w * base_minus_t; 391 + k := !k + base 392 + end 393 + end) 394 + end 395 + done; 410 396 411 - let pos = { byte_offset = !in_pos - 1; char_index = out_len } in 397 + if !result = Ok () then begin 398 + let out_len = Array.length !output in 399 + bias := 400 + adapt ~delta:(!i - oldi) ~numpoints:(out_len + 1) 401 + ~firsttime:(oldi = 0); 412 402 413 - (* n = n + i / (out_len + 1), with overflow check *) 414 - let increment = !i / (out_len + 1) in 415 - if increment > max_int_value - !n then 416 - result := overflow pos 417 - else begin 418 - n := !n + increment; 419 - i := !i mod (out_len + 1); 403 + let pos = { byte_offset = !in_pos - 1; char_index = out_len } in 420 404 421 - (* Validate that n is a valid Unicode scalar value *) 422 - if not (Uchar.is_valid !n) then 423 - result := invalid_character pos Uchar.rep 405 + (* n = n + i / (out_len + 1), with overflow check *) 406 + let increment = !i / (out_len + 1) in 407 + if increment > max_int_value - !n then result := overflow pos 424 408 else begin 425 - (* Insert n at position i *) 426 - let new_output = Array.make (out_len + 1) (Uchar.of_int 0) in 427 - let new_case = Array.make (out_len + 1) Lowercase in 409 + n := !n + increment; 410 + i := !i mod (out_len + 1); 428 411 429 - for j = 0 to !i - 1 do 430 - new_output.(j) <- !output.(j); 431 - new_case.(j) <- !case_output.(j) 432 - done; 433 - new_output.(!i) <- Uchar.of_int !n; 434 - (* Case flag from final digit of this delta *) 435 - new_case.(!i) <- (if !in_pos > 0 && is_flagged input.[!in_pos - 1] 436 - then Uppercase else Lowercase); 437 - for j = !i to out_len - 1 do 438 - new_output.(j + 1) <- !output.(j); 439 - new_case.(j + 1) <- !case_output.(j) 440 - done; 412 + (* Validate that n is a valid Unicode scalar value *) 413 + if not (Uchar.is_valid !n) then 414 + result := invalid_character pos Uchar.rep 415 + else begin 416 + (* Insert n at position i *) 417 + let new_output = Array.make (out_len + 1) (Uchar.of_int 0) in 418 + let new_case = Array.make (out_len + 1) Lowercase in 419 + 420 + for j = 0 to !i - 1 do 421 + new_output.(j) <- !output.(j); 422 + new_case.(j) <- !case_output.(j) 423 + done; 424 + new_output.(!i) <- Uchar.of_int !n; 425 + (* Case flag from final digit of this delta *) 426 + new_case.(!i) <- 427 + (if !in_pos > 0 && is_flagged input.[!in_pos - 1] then 428 + Uppercase 429 + else Lowercase); 430 + for j = !i to out_len - 1 do 431 + new_output.(j + 1) <- !output.(j); 432 + new_case.(j + 1) <- !case_output.(j) 433 + done; 441 434 442 - output := new_output; 443 - case_output := new_case; 444 - incr i 435 + output := new_output; 436 + case_output := new_case; 437 + incr i 438 + end 445 439 end 446 440 end 447 - end 448 - done; 441 + done; 449 442 450 - match !result with 451 - | Error e -> Error e 452 - | Ok () -> Ok (!output, !case_output) 443 + match !result with 444 + | Error e -> Error e 445 + | Ok () -> Ok (!output, !case_output)) 453 446 end 454 447 455 - let decode input = 456 - Result.map fst (decode_impl input) 457 - 458 - let decode_with_case input = 459 - decode_impl input 448 + let decode input = Result.map fst (decode_impl input) 449 + let decode_with_case input = decode_impl input 460 450 461 451 (* {1 UTF-8 String Operations} *) 462 452 ··· 473 463 (* {1 Domain Label Operations} *) 474 464 475 465 let encode_label label = 476 - if String.length label = 0 then 477 - empty_label 466 + if String.length label = 0 then empty_label 478 467 else if is_ascii_string label then begin 479 468 (* All ASCII - return as-is, but check length *) 480 469 let len = String.length label in 481 - if len > max_label_length then 482 - label_too_long len 483 - else 484 - Ok label 485 - end else 470 + if len > max_label_length then label_too_long len else Ok label 471 + end 472 + else 486 473 (* Has non-ASCII - encode with Punycode *) 487 474 let open Result.Syntax in 488 475 let* encoded = encode_utf8 label in 489 476 let result = ace_prefix ^ encoded in 490 477 let len = String.length result in 491 - if len > max_label_length then 492 - label_too_long len 493 - else 494 - Ok result 478 + if len > max_label_length then label_too_long len else Ok result 495 479 496 480 let decode_label label = 497 - if String.length label = 0 then 498 - empty_label 481 + if String.length label = 0 then empty_label 499 482 else if has_ace_prefix label then begin 500 483 (* Remove ACE prefix and decode *) 501 484 let punycode = String.sub label 4 (String.length label - 4) in 502 485 decode_utf8 punycode 503 - end else begin 486 + end 487 + else begin 504 488 (* No ACE prefix - validate and return *) 505 - if is_ascii_string label then 506 - Ok label 489 + if is_ascii_string label then Ok label 507 490 else 508 491 (* Has non-ASCII but no ACE prefix - return as-is *) 509 492 Ok label
+97 -85
lib/punycode.mli
··· 6 6 (** RFC 3492 Punycode: A Bootstring encoding of Unicode for IDNA. 7 7 8 8 This module implements the Punycode algorithm as specified in 9 - {{:https://datatracker.ietf.org/doc/html/rfc3492}RFC 3492}, 10 - providing encoding and decoding of Unicode strings to/from ASCII-compatible 11 - encoding suitable for use in internationalized domain names. 9 + {{:https://datatracker.ietf.org/doc/html/rfc3492}RFC 3492}, providing 10 + encoding and decoding of Unicode strings to/from ASCII-compatible encoding 11 + suitable for use in internationalized domain names. 12 12 13 - Punycode is an instance of Bootstring that uses particular parameter 14 - values appropriate for IDNA. See 15 - {{:https://datatracker.ietf.org/doc/html/rfc3492#section-5}RFC 3492 Section 5} 16 - for the specific parameter values. 13 + Punycode is an instance of Bootstring that uses particular parameter values 14 + appropriate for IDNA. See 15 + {{:https://datatracker.ietf.org/doc/html/rfc3492#section-5}RFC 3492 Section 16 + 5} for the specific parameter values. 17 17 18 18 {2 References} 19 - {ul 20 - {- {{:https://datatracker.ietf.org/doc/html/rfc3492}RFC 3492} - Punycode: A Bootstring encoding of Unicode for IDNA} 21 - {- {{:https://datatracker.ietf.org/doc/html/rfc5891}RFC 5891} - IDNA Protocol}} *) 19 + - {{:https://datatracker.ietf.org/doc/html/rfc3492}RFC 3492} - Punycode: A 20 + Bootstring encoding of Unicode for IDNA 21 + - {{:https://datatracker.ietf.org/doc/html/rfc5891}RFC 5891} - IDNA Protocol 22 + *) 22 23 23 24 (** {1 Position Tracking} *) 24 25 ··· 39 40 40 41 type error = 41 42 | Overflow of position 42 - (** Arithmetic overflow during encode/decode. This can occur with 43 - very long strings or extreme Unicode code point values. 44 - See {{:https://datatracker.ietf.org/doc/html/rfc3492#section-6.4} 45 - RFC 3492 Section 6.4} for overflow handling requirements. *) 43 + (** Arithmetic overflow during encode/decode. This can occur with very 44 + long strings or extreme Unicode code point values. See 45 + {{:https://datatracker.ietf.org/doc/html/rfc3492#section-6.4} RFC 3492 46 + Section 6.4} for overflow handling requirements. *) 46 47 | Invalid_character of position * Uchar.t 47 - (** A non-basic code point appeared where only basic code points 48 - (ASCII < 128) are allowed. Per 49 - {{:https://datatracker.ietf.org/doc/html/rfc3492#section-3.1} 50 - RFC 3492 Section 3.1}, basic code points must be segregated 51 - at the beginning of the encoded string. *) 48 + (** A non-basic code point appeared where only basic code points (ASCII < 49 + 128) are allowed. Per 50 + {{:https://datatracker.ietf.org/doc/html/rfc3492#section-3.1} RFC 3492 51 + Section 3.1}, basic code points must be segregated at the beginning 52 + of the encoded string. *) 52 53 | Invalid_digit of position * char 53 - (** An invalid Punycode digit was encountered during decoding. 54 - Valid digits are a-z, A-Z (values 0-25) and 0-9 (values 26-35). 55 - See {{:https://datatracker.ietf.org/doc/html/rfc3492#section-5} 56 - RFC 3492 Section 5} for digit-value mappings. *) 54 + (** An invalid Punycode digit was encountered during decoding. Valid 55 + digits are a-z, A-Z (values 0-25) and 0-9 (values 26-35). See 56 + {{:https://datatracker.ietf.org/doc/html/rfc3492#section-5} RFC 3492 57 + Section 5} for digit-value mappings. *) 57 58 | Unexpected_end of position 58 - (** The input ended prematurely during decoding of a delta value. 59 - See {{:https://datatracker.ietf.org/doc/html/rfc3492#section-6.2} 60 - RFC 3492 Section 6.2} decoding procedure. *) 61 - | Invalid_utf8 of position 62 - (** Malformed UTF-8 sequence in input string. *) 59 + (** The input ended prematurely during decoding of a delta value. See 60 + {{:https://datatracker.ietf.org/doc/html/rfc3492#section-6.2} RFC 3492 61 + Section 6.2} decoding procedure. *) 62 + | Invalid_utf8 of position (** Malformed UTF-8 sequence in input string. *) 63 63 | Label_too_long of int 64 64 (** Encoded label exceeds 63 bytes (DNS limit per 65 - {{:https://datatracker.ietf.org/doc/html/rfc1035}RFC 1035}). 66 - The int is the actual length. *) 67 - | Empty_label 68 - (** Empty label is not valid for encoding. *) 65 + {{:https://datatracker.ietf.org/doc/html/rfc1035}RFC 1035}). The int 66 + is the actual length. *) 67 + | Empty_label (** Empty label is not valid for encoding. *) 69 68 70 69 val pp_error : Format.formatter -> error -> unit 71 70 (** [pp_error fmt e] pretty-prints an error with position information. *) ··· 76 75 (** {1 Constants} 77 76 78 77 Punycode parameters as specified in 79 - {{:https://datatracker.ietf.org/doc/html/rfc3492#section-5}RFC 3492 Section 5}. *) 78 + {{:https://datatracker.ietf.org/doc/html/rfc3492#section-5}RFC 3492 Section 79 + 5}. *) 80 80 81 81 val ace_prefix : string 82 - (** The ACE prefix ["xn--"] used for Punycode-encoded domain labels. 83 - See {{:https://datatracker.ietf.org/doc/html/rfc3492#section-5} 84 - RFC 3492 Section 5} which notes that IDNA prepends this prefix. *) 82 + (** The ACE prefix ["xn--"] used for Punycode-encoded domain labels. See 83 + {{:https://datatracker.ietf.org/doc/html/rfc3492#section-5} RFC 3492 Section 84 + 5} which notes that IDNA prepends this prefix. *) 85 85 86 86 val max_label_length : int 87 87 (** Maximum length of a domain label in bytes (63), per ··· 89 89 90 90 (** {1 Case Flags for Mixed-Case Annotation} 91 91 92 - {{:https://datatracker.ietf.org/doc/html/rfc3492#appendix-A}RFC 3492 Appendix A} 93 - describes an optional mechanism for preserving case information through 94 - the encoding/decoding round-trip. This is useful when the original 92 + {{:https://datatracker.ietf.org/doc/html/rfc3492#appendix-A}RFC 3492 93 + Appendix A} describes an optional mechanism for preserving case information 94 + through the encoding/decoding round-trip. This is useful when the original 95 95 string's case should be recoverable. 96 96 97 97 Note: Mixed-case annotation is not used by the ToASCII and ToUnicode 98 98 operations of IDNA. *) 99 99 100 - type case_flag = Uppercase | Lowercase 101 - (** Case annotation for a character. *) 100 + type case_flag = 101 + | Uppercase 102 + | Lowercase (** Case annotation for a character. *) 102 103 103 104 (** {1 Core Punycode Operations} 104 105 105 106 These functions implement the Bootstring algorithms from 106 - {{:https://datatracker.ietf.org/doc/html/rfc3492#section-6}RFC 3492 Section 6}. 107 - They operate on arrays of Unicode code points ([Uchar.t array]). 108 - The encoded output is a plain ASCII string without the ACE prefix. *) 107 + {{:https://datatracker.ietf.org/doc/html/rfc3492#section-6}RFC 3492 Section 108 + 6}. They operate on arrays of Unicode code points ([Uchar.t array]). The 109 + encoded output is a plain ASCII string without the ACE prefix. *) 109 110 110 111 val encode : Uchar.t array -> (string, error) result 111 - (** [encode codepoints] encodes an array of Unicode code points to a 112 - Punycode ASCII string. 112 + (** [encode codepoints] encodes an array of Unicode code points to a Punycode 113 + ASCII string. 113 114 114 115 Implements the encoding procedure from 115 - {{:https://datatracker.ietf.org/doc/html/rfc3492#section-6.3}RFC 3492 Section 6.3}: 116 + {{:https://datatracker.ietf.org/doc/html/rfc3492#section-6.3}RFC 3492 117 + Section 6.3}: 116 118 117 - 1. Basic code points (ASCII < 128) are copied literally to the beginning 118 - of the output per {{:https://datatracker.ietf.org/doc/html/rfc3492#section-3.1} 119 - Section 3.1 (Basic code point segregation)} 120 - 2. A delimiter ('-') is appended if there are any basic code points 121 - 3. Non-basic code points are encoded as deltas using the generalized 122 - variable-length integer representation from 123 - {{:https://datatracker.ietf.org/doc/html/rfc3492#section-3.3}Section 3.3} 119 + 1. Basic code points (ASCII < 128) are copied literally to the beginning of 120 + the output per 121 + {{:https://datatracker.ietf.org/doc/html/rfc3492#section-3.1} Section 3.1 122 + (Basic code point segregation)} 2. A delimiter ('-') is appended if there 123 + are any basic code points 3. Non-basic code points are encoded as deltas 124 + using the generalized variable-length integer representation from 125 + {{:https://datatracker.ietf.org/doc/html/rfc3492#section-3.3}Section 3.3} 124 126 125 127 Example: 126 128 {[ ··· 129 131 ]} *) 130 132 131 133 val decode : string -> (Uchar.t array, error) result 132 - (** [decode punycode] decodes a Punycode ASCII string to an array of 133 - Unicode code points. 134 + (** [decode punycode] decodes a Punycode ASCII string to an array of Unicode 135 + code points. 134 136 135 137 Implements the decoding procedure from 136 - {{:https://datatracker.ietf.org/doc/html/rfc3492#section-6.2}RFC 3492 Section 6.2}. 138 + {{:https://datatracker.ietf.org/doc/html/rfc3492#section-6.2}RFC 3492 139 + Section 6.2}. 137 140 138 - The input should be the Punycode portion only, without the ACE prefix. 139 - The decoder is case-insensitive for the encoded portion, as required by 140 - {{:https://datatracker.ietf.org/doc/html/rfc3492#section-5}RFC 3492 Section 5}: 141 - "A decoder MUST recognize the letters in both uppercase and lowercase forms". 141 + The input should be the Punycode portion only, without the ACE prefix. The 142 + decoder is case-insensitive for the encoded portion, as required by 143 + {{:https://datatracker.ietf.org/doc/html/rfc3492#section-5}RFC 3492 Section 144 + 5}: "A decoder MUST recognize the letters in both uppercase and lowercase 145 + forms". 142 146 143 147 Example: 144 148 {[ ··· 148 152 149 153 (** {1 Mixed-Case Annotation} 150 154 151 - These functions support round-trip case preservation as described 152 - in {{:https://datatracker.ietf.org/doc/html/rfc3492#appendix-A}RFC 3492 Appendix A}. *) 155 + These functions support round-trip case preservation as described in 156 + {{:https://datatracker.ietf.org/doc/html/rfc3492#appendix-A}RFC 3492 157 + Appendix A}. *) 153 158 154 - val encode_with_case : Uchar.t array -> case_flag array -> (string, error) result 159 + val encode_with_case : 160 + Uchar.t array -> case_flag array -> (string, error) result 155 161 (** [encode_with_case codepoints case_flags] encodes with case annotation. 156 162 157 - Per {{:https://datatracker.ietf.org/doc/html/rfc3492#appendix-A}RFC 3492 Appendix A}: 163 + Per 164 + {{:https://datatracker.ietf.org/doc/html/rfc3492#appendix-A}RFC 3492 165 + Appendix A}: 158 166 - For basic (ASCII) letters, the output preserves the case flag directly 159 167 - For non-ASCII characters, the case of the final digit in each delta 160 168 encoding indicates the flag (uppercase = suggested uppercase) ··· 166 174 val decode_with_case : string -> (Uchar.t array * case_flag array, error) result 167 175 (** [decode_with_case punycode] decodes and extracts case annotations. 168 176 169 - Per {{:https://datatracker.ietf.org/doc/html/rfc3492#appendix-A}RFC 3492 Appendix A}, 170 - returns both the decoded code points and an array of case flags 171 - indicating the suggested case for each character based on the 177 + Per 178 + {{:https://datatracker.ietf.org/doc/html/rfc3492#appendix-A}RFC 3492 179 + Appendix A}, returns both the decoded code points and an array of case 180 + flags indicating the suggested case for each character based on the 172 181 uppercase/lowercase form of the encoding digits. *) 173 182 174 183 (** {1 UTF-8 String Operations} ··· 179 188 val encode_utf8 : string -> (string, error) result 180 189 (** [encode_utf8 s] encodes a UTF-8 string to Punycode (no ACE prefix). 181 190 182 - This is equivalent to decoding [s] from UTF-8 to code points, then 183 - calling {!encode}. 191 + This is equivalent to decoding [s] from UTF-8 to code points, then calling 192 + {!encode}. 184 193 185 194 Example: 186 195 {[ ··· 201 210 202 211 (** {1 Domain Label Operations} 203 212 204 - These functions handle the ACE prefix automatically and enforce 205 - DNS label length limits per {{:https://datatracker.ietf.org/doc/html/rfc1035}RFC 1035}. *) 213 + These functions handle the ACE prefix automatically and enforce DNS label 214 + length limits per 215 + {{:https://datatracker.ietf.org/doc/html/rfc1035}RFC 1035}. *) 206 216 207 217 val encode_label : string -> (string, error) result 208 218 (** [encode_label label] encodes a domain label for use in DNS. 209 219 210 220 If the label contains only ASCII characters, it is returned unchanged. 211 - Otherwise, it is Punycode-encoded with the ACE prefix ("xn--") prepended, 212 - as specified in {{:https://datatracker.ietf.org/doc/html/rfc3492#section-5} 213 - RFC 3492 Section 5}. 221 + Otherwise, it is Punycode-encoded with the ACE prefix ("xn--") prepended, as 222 + specified in 223 + {{:https://datatracker.ietf.org/doc/html/rfc3492#section-5} RFC 3492 Section 224 + 5}. 214 225 215 226 Returns {!Error} {!Label_too_long} if the result exceeds 63 bytes. 216 227 217 228 Example: 218 229 {[ 219 230 encode_label "münchen" 220 - (* = Ok "xn--mnchen-3ya" *) 221 - encode_label "example" 231 + (* = Ok "xn--mnchen-3ya" *) 232 + encode_label "example" 222 233 (* = Ok "example" *) 223 234 ]} *) 224 235 225 236 val decode_label : string -> (string, error) result 226 237 (** [decode_label label] decodes a domain label. 227 238 228 - If the label starts with the ACE prefix ("xn--", case-insensitive), 229 - it is Punycode-decoded. Otherwise, it is returned unchanged. 239 + If the label starts with the ACE prefix ("xn--", case-insensitive), it is 240 + Punycode-decoded. Otherwise, it is returned unchanged. 230 241 231 242 Example: 232 243 {[ 233 244 decode_label "xn--mnchen-3ya" 234 - (* = Ok "münchen" *) 235 - decode_label "example" 245 + (* = Ok "münchen" *) 246 + decode_label "example" 236 247 (* = Ok "example" *) 237 248 ]} *) 238 249 ··· 243 254 val is_basic : Uchar.t -> bool 244 255 (** [is_basic u] is [true] if [u] is a basic code point (ASCII, < 128). 245 256 246 - Per {{:https://datatracker.ietf.org/doc/html/rfc3492#section-5}RFC 3492 Section 5}, 247 - basic code points for Punycode are the ASCII code points (0..7F). *) 257 + Per 258 + {{:https://datatracker.ietf.org/doc/html/rfc3492#section-5}RFC 3492 Section 259 + 5}, basic code points for Punycode are the ASCII code points (0..7F). *) 248 260 249 261 val is_ascii_string : string -> bool 250 - (** [is_ascii_string s] is [true] if [s] contains only ASCII characters 251 - (all bytes < 128). *) 262 + (** [is_ascii_string s] is [true] if [s] contains only ASCII characters (all 263 + bytes < 128). *) 252 264 253 265 val has_ace_prefix : string -> bool 254 266 (** [has_ace_prefix s] is [true] if [s] starts with the ACE prefix "xn--"
+53 -65
lib/punycode_idna.ml
··· 18 18 19 19 let pp_error fmt = function 20 20 | Punycode_error e -> 21 - Format.fprintf fmt "Punycode error: %a" Punycode.pp_error e 22 - | Invalid_label msg -> 23 - Format.fprintf fmt "invalid label: %s" msg 21 + Format.fprintf fmt "Punycode error: %a" Punycode.pp_error e 22 + | Invalid_label msg -> Format.fprintf fmt "invalid label: %s" msg 24 23 | Domain_too_long len -> 25 - Format.fprintf fmt "domain too long: %d bytes (max %d)" len max_domain_length 26 - | Normalization_failed -> 27 - Format.fprintf fmt "Unicode normalization failed" 24 + Format.fprintf fmt "domain too long: %d bytes (max %d)" len 25 + max_domain_length 26 + | Normalization_failed -> Format.fprintf fmt "Unicode normalization failed" 28 27 | Verification_failed -> 29 - Format.fprintf fmt "IDNA verification failed (round-trip mismatch)" 28 + Format.fprintf fmt "IDNA verification failed (round-trip mismatch)" 30 29 31 - let error_to_string err = 32 - Format.asprintf "%a" pp_error err 30 + let error_to_string err = Format.asprintf "%a" pp_error err 33 31 34 32 (* {1 Error Constructors} *) 35 33 ··· 41 39 42 40 (* {1 Unicode Normalization} *) 43 41 44 - let normalize_nfc s = 45 - Uunf_string.normalize_utf_8 `NFC s 42 + let normalize_nfc s = Uunf_string.normalize_utf_8 `NFC s 46 43 47 44 (* {1 Validation Helpers} *) 48 45 49 - let is_ace_label label = 50 - Punycode.has_ace_prefix label 46 + let is_ace_label label = Punycode.has_ace_prefix label 51 47 52 48 (* Check if a label follows STD3 rules (hostname restrictions): 53 49 - Only LDH (letters, digits, hyphens) ··· 55 51 let is_std3_valid label = 56 52 let len = String.length label in 57 53 let is_ldh c = 58 - (c >= 'a' && c <= 'z') || 59 - (c >= 'A' && c <= 'Z') || 60 - (c >= '0' && c <= '9') || 61 - c = '-' 54 + (c >= 'a' && c <= 'z') 55 + || (c >= 'A' && c <= 'Z') 56 + || (c >= '0' && c <= '9') 57 + || c = '-' 62 58 in 63 - len > 0 && 64 - label.[0] <> '-' && 65 - label.[len - 1] <> '-' && 66 - String.for_all is_ldh label 59 + len > 0 60 + && label.[0] <> '-' 61 + && label.[len - 1] <> '-' 62 + && String.for_all is_ldh label 67 63 68 64 (* Check hyphen placement: hyphens not in positions 3 and 4 (except for ACE) *) 69 65 let check_hyphen_rules label = ··· 71 67 if len >= 4 && label.[2] = '-' && label.[3] = '-' then 72 68 (* Hyphens in positions 3 and 4 - only valid for ACE prefix *) 73 69 is_ace_label label 74 - else 75 - true 70 + else true 76 71 77 72 (* {1 Label Operations} *) 78 73 79 74 let label_to_ascii_impl ~check_hyphens ~use_std3_rules label = 80 75 let len = String.length label in 81 - if len = 0 then 82 - invalid_label "empty label" 76 + if len = 0 then invalid_label "empty label" 83 77 else if len > Punycode.max_label_length then 84 78 punycode_error (Punycode.Label_too_long len) 85 79 else if Punycode.is_ascii_string label then begin ··· 88 82 invalid_label "STD3 rules violation" 89 83 else if check_hyphens && not (check_hyphen_rules label) then 90 84 invalid_label "invalid hyphen placement" 91 - else 92 - Ok label 93 - end else begin 85 + else Ok label 86 + end 87 + else begin 94 88 (* Has non-ASCII - normalize and encode *) 95 89 let normalized = normalize_nfc label in 96 90 97 91 (* Encode to Punycode *) 98 92 match Punycode.encode_utf8 normalized with 99 93 | Error e -> punycode_error e 100 - | Ok encoded -> 101 - let result = Punycode.ace_prefix ^ encoded in 102 - let result_len = String.length result in 103 - if result_len > Punycode.max_label_length then 104 - punycode_error (Punycode.Label_too_long result_len) 105 - else if check_hyphens && not (check_hyphen_rules result) then 106 - invalid_label "invalid hyphen placement in encoded label" 107 - else 108 - (* Verification: decode and compare to original normalized form *) 109 - match Punycode.decode_utf8 encoded with 110 - | Error _ -> verification_failed 111 - | Ok decoded -> 112 - if decoded <> normalized then 113 - verification_failed 114 - else 115 - Ok result 94 + | Ok encoded -> ( 95 + let result = Punycode.ace_prefix ^ encoded in 96 + let result_len = String.length result in 97 + if result_len > Punycode.max_label_length then 98 + punycode_error (Punycode.Label_too_long result_len) 99 + else if check_hyphens && not (check_hyphen_rules result) then 100 + invalid_label "invalid hyphen placement in encoded label" 101 + else 102 + (* Verification: decode and compare to original normalized form *) 103 + match Punycode.decode_utf8 encoded with 104 + | Error _ -> verification_failed 105 + | Ok decoded -> 106 + if decoded <> normalized then verification_failed else Ok result) 116 107 end 117 108 118 109 let label_to_ascii ?(check_hyphens = true) ?(use_std3_rules = false) label = ··· 124 115 match Punycode.decode_utf8 encoded with 125 116 | Error e -> punycode_error e 126 117 | Ok decoded -> Ok decoded 127 - end else 128 - Ok label 118 + end 119 + else Ok label 129 120 130 121 (* {1 Domain Operations} *) 131 122 132 123 (* Split domain into labels *) 133 - let split_domain domain = 134 - String.split_on_char '.' domain 124 + let split_domain domain = String.split_on_char '.' domain 135 125 136 126 (* Join labels into domain *) 137 - let join_labels labels = 138 - String.concat "." labels 127 + let join_labels labels = String.concat "." labels 139 128 140 129 (* Map a function returning Result over a list, short-circuiting on first Error *) 141 130 let map_result f lst = 142 - List.fold_right (fun x acc -> 143 - let open Result.Syntax in 144 - let* y = f x in 145 - let+ ys = acc in 146 - y :: ys 147 - ) lst (Ok []) 131 + List.fold_right 132 + (fun x acc -> 133 + let open Result.Syntax in 134 + let* y = f x in 135 + let+ ys = acc in 136 + y :: ys) 137 + lst (Ok []) 148 138 149 139 let to_ascii ?(check_hyphens = true) ?(check_bidi = false) 150 - ?(check_joiners = false) ?(use_std3_rules = false) 151 - ?(transitional = false) domain = 140 + ?(check_joiners = false) ?(use_std3_rules = false) ?(transitional = false) 141 + domain = 152 142 (* Note: check_bidi, check_joiners, and transitional are accepted but 153 143 not fully implemented - they would require additional Unicode data *) 154 144 let _ = check_bidi in ··· 157 147 158 148 let open Result.Syntax in 159 149 let labels = split_domain domain in 160 - let* encoded_labels = map_result (label_to_ascii_impl ~check_hyphens ~use_std3_rules) labels in 150 + let* encoded_labels = 151 + map_result (label_to_ascii_impl ~check_hyphens ~use_std3_rules) labels 152 + in 161 153 let result = join_labels encoded_labels in 162 154 let len = String.length result in 163 - if len > max_domain_length then 164 - domain_too_long len 165 - else 166 - Ok result 155 + if len > max_domain_length then domain_too_long len else Ok result 167 156 168 157 let to_unicode domain = 169 158 let open Result.Syntax in ··· 191 180 192 181 (* {1 Validation} *) 193 182 194 - let is_idna_valid domain = 195 - Result.is_ok (to_ascii domain) 183 + let is_idna_valid domain = Result.is_ok (to_ascii domain)
+95 -77
lib/punycode_idna.mli
··· 5 5 6 6 (** IDNA (Internationalized Domain Names in Applications) support. 7 7 8 - This module provides ToASCII and ToUnicode operations as specified 9 - in {{:https://datatracker.ietf.org/doc/html/rfc5891}RFC 5891} (IDNA 2008), 8 + This module provides ToASCII and ToUnicode operations as specified in 9 + {{:https://datatracker.ietf.org/doc/html/rfc5891}RFC 5891} (IDNA 2008), 10 10 using Punycode ({{:https://datatracker.ietf.org/doc/html/rfc3492}RFC 3492}) 11 11 for encoding. 12 12 13 - IDNA allows domain names to contain non-ASCII Unicode characters by 14 - encoding them using Punycode with an ACE prefix. This module handles 15 - the conversion between Unicode domain names and their ASCII-compatible 16 - encoding (ACE) form. 13 + IDNA allows domain names to contain non-ASCII Unicode characters by encoding 14 + them using Punycode with an ACE prefix. This module handles the conversion 15 + between Unicode domain names and their ASCII-compatible encoding (ACE) form. 17 16 18 17 {2 References} 19 - {ul 20 - {- {{:https://datatracker.ietf.org/doc/html/rfc5891}RFC 5891} - 21 - Internationalized Domain Names in Applications (IDNA): Protocol} 22 - {- {{:https://datatracker.ietf.org/doc/html/rfc5892}RFC 5892} - 23 - The Unicode Code Points and Internationalized Domain Names for Applications (IDNA)} 24 - {- {{:https://datatracker.ietf.org/doc/html/rfc5893}RFC 5893} - 25 - Right-to-Left Scripts for Internationalized Domain Names for Applications (IDNA)} 26 - {- {{:https://datatracker.ietf.org/doc/html/rfc3492}RFC 3492} - 27 - Punycode: A Bootstring encoding of Unicode for IDNA}} *) 18 + - {{:https://datatracker.ietf.org/doc/html/rfc5891}RFC 5891} - 19 + Internationalized Domain Names in Applications (IDNA): Protocol 20 + - {{:https://datatracker.ietf.org/doc/html/rfc5892}RFC 5892} - The Unicode 21 + Code Points and Internationalized Domain Names for Applications (IDNA) 22 + - {{:https://datatracker.ietf.org/doc/html/rfc5893}RFC 5893} - Right-to-Left 23 + Scripts for Internationalized Domain Names for Applications (IDNA) 24 + - {{:https://datatracker.ietf.org/doc/html/rfc3492}RFC 3492} - Punycode: A 25 + Bootstring encoding of Unicode for IDNA *) 28 26 29 27 (** {1 Error Types} *) 30 28 31 29 type error = 32 30 | Punycode_error of Punycode.error 33 - (** Error during Punycode encoding/decoding. 34 - See {!Punycode.error} for details. *) 31 + (** Error during Punycode encoding/decoding. See {!Punycode.error} for 32 + details. *) 35 33 | Invalid_label of string 36 34 (** Label violates IDNA constraints. The string describes the violation. 37 - See {{:https://datatracker.ietf.org/doc/html/rfc5891#section-4} 38 - RFC 5891 Section 4} for label validation requirements. *) 35 + See 36 + {{:https://datatracker.ietf.org/doc/html/rfc5891#section-4} RFC 5891 37 + Section 4} for label validation requirements. *) 39 38 | Domain_too_long of int 40 39 (** Domain name exceeds 253 bytes, per 41 - {{:https://datatracker.ietf.org/doc/html/rfc1035}RFC 1035}. 42 - The int is the actual length. *) 40 + {{:https://datatracker.ietf.org/doc/html/rfc1035}RFC 1035}. The int is 41 + the actual length. *) 43 42 | Normalization_failed 44 - (** Unicode normalization (NFC) failed. 45 - Per {{:https://datatracker.ietf.org/doc/html/rfc5891#section-4.2.1} 46 - RFC 5891 Section 4.2.1}, labels must be in NFC form. *) 43 + (** Unicode normalization (NFC) failed. Per 44 + {{:https://datatracker.ietf.org/doc/html/rfc5891#section-4.2.1} RFC 45 + 5891 Section 4.2.1}, labels must be in NFC form. *) 47 46 | Verification_failed 48 - (** ToASCII/ToUnicode verification step failed (round-trip check). 49 - Per {{:https://datatracker.ietf.org/doc/html/rfc5891#section-4.2} 50 - RFC 5891 Section 4.2}, the result of encoding must decode back 51 - to the original input. *) 47 + (** ToASCII/ToUnicode verification step failed (round-trip check). Per 48 + {{:https://datatracker.ietf.org/doc/html/rfc5891#section-4.2} RFC 5891 49 + Section 4.2}, the result of encoding must decode back to the original 50 + input. *) 52 51 53 52 val pp_error : Format.formatter -> error -> unit 54 53 (** [pp_error fmt e] pretty-prints an error. *) ··· 64 63 65 64 (** {1 ToASCII Operation} 66 65 67 - Converts an internationalized domain name to its ASCII-compatible 68 - encoding (ACE) form suitable for DNS lookup. 66 + Converts an internationalized domain name to its ASCII-compatible encoding 67 + (ACE) form suitable for DNS lookup. 69 68 70 - See {{:https://datatracker.ietf.org/doc/html/rfc5891#section-4} 71 - RFC 5891 Section 4} for the complete ToASCII specification. *) 69 + See 70 + {{:https://datatracker.ietf.org/doc/html/rfc5891#section-4} RFC 5891 Section 71 + 4} for the complete ToASCII specification. *) 72 72 73 - val to_ascii : ?check_hyphens:bool -> ?check_bidi:bool -> 74 - ?check_joiners:bool -> ?use_std3_rules:bool -> 75 - ?transitional:bool -> string -> (string, error) result 73 + val to_ascii : 74 + ?check_hyphens:bool -> 75 + ?check_bidi:bool -> 76 + ?check_joiners:bool -> 77 + ?use_std3_rules:bool -> 78 + ?transitional:bool -> 79 + string -> 80 + (string, error) result 76 81 (** [to_ascii domain] converts an internationalized domain name to ASCII. 77 82 78 83 Implements the ToASCII operation from 79 - {{:https://datatracker.ietf.org/doc/html/rfc5891#section-4.1}RFC 5891 Section 4.1}. 84 + {{:https://datatracker.ietf.org/doc/html/rfc5891#section-4.1}RFC 5891 85 + Section 4.1}. 80 86 81 - For each label in the domain: 82 - 1. If all ASCII, pass through (with optional STD3 validation) 83 - 2. Otherwise, normalize to NFC per 84 - {{:https://datatracker.ietf.org/doc/html/rfc5891#section-4.2.1}Section 4.2.1} 85 - and Punycode-encode with ACE prefix 87 + For each label in the domain: 1. If all ASCII, pass through (with optional 88 + STD3 validation) 2. Otherwise, normalize to NFC per 89 + {{:https://datatracker.ietf.org/doc/html/rfc5891#section-4.2.1}Section 90 + 4.2.1} and Punycode-encode with ACE prefix 86 91 87 - Optional parameters (per {{:https://datatracker.ietf.org/doc/html/rfc5891#section-4} 88 - RFC 5891 Section 4} processing options): 92 + Optional parameters (per 93 + {{:https://datatracker.ietf.org/doc/html/rfc5891#section-4} RFC 5891 Section 94 + 4} processing options): 89 95 - [check_hyphens]: Validate hyphen placement per 90 - {{:https://datatracker.ietf.org/doc/html/rfc5891#section-4.2.3.1}Section 4.2.3.1} 91 - (default: true) 96 + {{:https://datatracker.ietf.org/doc/html/rfc5891#section-4.2.3.1}Section 97 + 4.2.3.1} (default: true) 92 98 - [check_bidi]: Check bidirectional text rules per 93 - {{:https://datatracker.ietf.org/doc/html/rfc5893}RFC 5893} 94 - (default: false, not implemented) 99 + {{:https://datatracker.ietf.org/doc/html/rfc5893}RFC 5893} (default: 100 + false, not implemented) 95 101 - [check_joiners]: Check contextual joiner rules per 96 - {{:https://datatracker.ietf.org/doc/html/rfc5892#appendix-A.1}RFC 5892 Appendix A.1} 97 - (default: false, not implemented) 102 + {{:https://datatracker.ietf.org/doc/html/rfc5892#appendix-A.1}RFC 5892 103 + Appendix A.1} (default: false, not implemented) 98 104 - [use_std3_rules]: Apply STD3 hostname rules per 99 - {{:https://datatracker.ietf.org/doc/html/rfc5891#section-4.2.3.2}Section 4.2.3.2} 100 - (default: false) 101 - - [transitional]: Use IDNA 2003 transitional processing 102 - (default: false) 105 + {{:https://datatracker.ietf.org/doc/html/rfc5891#section-4.2.3.2}Section 106 + 4.2.3.2} (default: false) 107 + - [transitional]: Use IDNA 2003 transitional processing (default: false) 103 108 104 109 Example: 105 110 {[ ··· 107 112 (* = Ok "xn--mnchen-3ya.example.com" *) 108 113 ]} *) 109 114 110 - val label_to_ascii : ?check_hyphens:bool -> ?use_std3_rules:bool -> 111 - string -> (string, error) result 115 + val label_to_ascii : 116 + ?check_hyphens:bool -> 117 + ?use_std3_rules:bool -> 118 + string -> 119 + (string, error) result 112 120 (** [label_to_ascii label] converts a single label to ASCII. 113 121 114 122 This implements the core ToASCII operation for one label, as described in 115 - {{:https://datatracker.ietf.org/doc/html/rfc5891#section-4.1}RFC 5891 Section 4.1}. *) 123 + {{:https://datatracker.ietf.org/doc/html/rfc5891#section-4.1}RFC 5891 124 + Section 4.1}. *) 116 125 117 126 (** {1 ToUnicode Operation} 118 127 119 128 Converts an ASCII-compatible encoded domain name back to Unicode. 120 129 121 - See {{:https://datatracker.ietf.org/doc/html/rfc5891#section-4.2} 122 - RFC 5891 Section 4.2} for the complete ToUnicode specification. *) 130 + See 131 + {{:https://datatracker.ietf.org/doc/html/rfc5891#section-4.2} RFC 5891 132 + Section 4.2} for the complete ToUnicode specification. *) 123 133 124 134 val to_unicode : string -> (string, error) result 125 135 (** [to_unicode domain] converts an ACE domain name to Unicode. 126 136 127 137 Implements the ToUnicode operation from 128 - {{:https://datatracker.ietf.org/doc/html/rfc5891#section-4.2}RFC 5891 Section 4.2}. 138 + {{:https://datatracker.ietf.org/doc/html/rfc5891#section-4.2}RFC 5891 139 + Section 4.2}. 129 140 130 - For each label in the domain: 131 - 1. If it has the ACE prefix ("xn--"), Punycode-decode it per 132 - {{:https://datatracker.ietf.org/doc/html/rfc3492#section-6.2}RFC 3492 Section 6.2} 133 - 2. Otherwise, pass through unchanged 141 + For each label in the domain: 1. If it has the ACE prefix ("xn--"), 142 + Punycode-decode it per 143 + {{:https://datatracker.ietf.org/doc/html/rfc3492#section-6.2}RFC 3492 144 + Section 6.2} 2. Otherwise, pass through unchanged 134 145 135 146 Example: 136 147 {[ ··· 142 153 (** [label_to_unicode label] converts a single ACE label to Unicode. 143 154 144 155 This implements the core ToUnicode operation for one label, as described in 145 - {{:https://datatracker.ietf.org/doc/html/rfc5891#section-4.2}RFC 5891 Section 4.2}. *) 156 + {{:https://datatracker.ietf.org/doc/html/rfc5891#section-4.2}RFC 5891 157 + Section 4.2}. *) 146 158 147 159 (** {1 Domain Name Integration} 148 160 ··· 152 164 These provide integration with the [Domain_name] module for applications 153 165 that use that library for domain name handling. *) 154 166 155 - val domain_to_ascii : ?check_hyphens:bool -> ?use_std3_rules:bool -> 156 - [`raw] Domain_name.t -> ([`raw] Domain_name.t, error) result 167 + val domain_to_ascii : 168 + ?check_hyphens:bool -> 169 + ?use_std3_rules:bool -> 170 + [ `raw ] Domain_name.t -> 171 + ([ `raw ] Domain_name.t, error) result 157 172 (** [domain_to_ascii domain] converts a domain name to ASCII form. 158 173 159 - Applies {!to_ascii} to the string representation and returns the 160 - result as a [Domain_name.t]. 174 + Applies {!to_ascii} to the string representation and returns the result as a 175 + [Domain_name.t]. 161 176 162 177 Example: 163 178 {[ ··· 166 181 (* = Ok (Domain_name.of_string_exn "xn--mnchen-3ya.example.com") *) 167 182 ]} *) 168 183 169 - val domain_to_unicode : [`raw] Domain_name.t -> ([`raw] Domain_name.t, error) result 184 + val domain_to_unicode : 185 + [ `raw ] Domain_name.t -> ([ `raw ] Domain_name.t, error) result 170 186 (** [domain_to_unicode domain] converts a domain name to Unicode form. 171 187 172 - Applies {!to_unicode} to the string representation and returns the 173 - result as a [Domain_name.t]. *) 188 + Applies {!to_unicode} to the string representation and returns the result as 189 + a [Domain_name.t]. *) 174 190 175 191 (** {1 Validation} *) 176 192 ··· 182 198 val is_ace_label : string -> bool 183 199 (** [is_ace_label label] is [true] if the label has the ACE prefix "xn--" 184 200 (case-insensitive). This indicates the label is Punycode-encoded per 185 - {{:https://datatracker.ietf.org/doc/html/rfc3492#section-5}RFC 3492 Section 5}. *) 201 + {{:https://datatracker.ietf.org/doc/html/rfc3492#section-5}RFC 3492 Section 202 + 5}. *) 186 203 187 204 (** {1 Normalization} *) 188 205 189 206 val normalize_nfc : string -> string 190 207 (** [normalize_nfc s] returns the NFC-normalized form of UTF-8 string [s]. 191 208 192 - Per {{:https://datatracker.ietf.org/doc/html/rfc5891#section-4.2.1} 193 - RFC 5891 Section 4.2.1}, domain labels must be normalized to NFC 194 - (Unicode Normalization Form C) before encoding. 209 + Per 210 + {{:https://datatracker.ietf.org/doc/html/rfc5891#section-4.2.1} RFC 5891 211 + Section 4.2.1}, domain labels must be normalized to NFC (Unicode 212 + Normalization Form C) before encoding. 195 213 196 - See {{:http://www.unicode.org/reports/tr15/}Unicode Standard Annex #15} 197 - for details on Unicode normalization forms. *) 214 + See {{:http://www.unicode.org/reports/tr15/}Unicode Standard Annex #15} for 215 + details on Unicode normalization forms. *)
+563 -259
test/test_punycode.ml
··· 28 28 let check_encode_ok expected input = 29 29 match Punycode.encode input with 30 30 | Ok result -> check string "encode" expected result 31 - | Error e -> 32 - fail (Format.asprintf "encode failed: %a" Punycode.pp_error e) 31 + | Error e -> fail (Format.asprintf "encode failed: %a" Punycode.pp_error e) 33 32 34 33 let check_decode_ok expected input = 35 34 match Punycode.decode input with 36 35 | Ok result -> 37 - let expected_arr = codepoints_of_hex_list expected in 38 - check int "length" (Array.length expected_arr) (Array.length result); 39 - Array.iteri (fun i u -> 40 - check int (Printf.sprintf "char %d" i) 41 - (Uchar.to_int expected_arr.(i)) (Uchar.to_int u) 42 - ) result 43 - | Error e -> 44 - fail (Format.asprintf "decode failed: %a" Punycode.pp_error e) 36 + let expected_arr = codepoints_of_hex_list expected in 37 + check int "length" (Array.length expected_arr) (Array.length result); 38 + Array.iteri 39 + (fun i u -> 40 + check int 41 + (Printf.sprintf "char %d" i) 42 + (Uchar.to_int expected_arr.(i)) 43 + (Uchar.to_int u)) 44 + result 45 + | Error e -> fail (Format.asprintf "decode failed: %a" Punycode.pp_error e) 45 46 46 47 let check_utf8_roundtrip s = 47 48 match Punycode.encode_utf8 s with 48 49 | Error e -> 49 - fail (Format.asprintf "encode_utf8 failed: %a" Punycode.pp_error e) 50 - | Ok encoded -> 51 - match Punycode.decode_utf8 encoded with 52 - | Error e -> 53 - fail (Format.asprintf "decode_utf8 failed: %a" Punycode.pp_error e) 54 - | Ok decoded -> 55 - check string "roundtrip" s decoded 50 + fail (Format.asprintf "encode_utf8 failed: %a" Punycode.pp_error e) 51 + | Ok encoded -> ( 52 + match Punycode.decode_utf8 encoded with 53 + | Error e -> 54 + fail (Format.asprintf "decode_utf8 failed: %a" Punycode.pp_error e) 55 + | Ok decoded -> check string "roundtrip" s decoded) 56 56 57 57 (* RFC 3492 Section 7.1 Test Vectors *) 58 58 59 59 (* (A) Arabic (Egyptian) *) 60 - let arabic_codepoints = [ 61 - 0x0644; 0x064A; 0x0647; 0x0645; 0x0627; 0x0628; 0x062A; 0x0643; 62 - 0x0644; 0x0645; 0x0648; 0x0634; 0x0639; 0x0631; 0x0628; 0x064A; 0x061F 63 - ] 60 + let arabic_codepoints = 61 + [ 62 + 0x0644; 63 + 0x064A; 64 + 0x0647; 65 + 0x0645; 66 + 0x0627; 67 + 0x0628; 68 + 0x062A; 69 + 0x0643; 70 + 0x0644; 71 + 0x0645; 72 + 0x0648; 73 + 0x0634; 74 + 0x0639; 75 + 0x0631; 76 + 0x0628; 77 + 0x064A; 78 + 0x061F; 79 + ] 80 + 64 81 let arabic_punycode = "egbpdaj6bu4bxfgehfvwxn" 65 82 66 83 (* (B) Chinese (simplified) *) 67 - let chinese_simplified_codepoints = [ 68 - 0x4ED6; 0x4EEC; 0x4E3A; 0x4EC0; 0x4E48; 0x4E0D; 0x8BF4; 0x4E2D; 0x6587 69 - ] 84 + let chinese_simplified_codepoints = 85 + [ 0x4ED6; 0x4EEC; 0x4E3A; 0x4EC0; 0x4E48; 0x4E0D; 0x8BF4; 0x4E2D; 0x6587 ] 86 + 70 87 let chinese_simplified_punycode = "ihqwcrb4cv8a8dqg056pqjye" 71 88 72 89 (* (C) Chinese (traditional) *) 73 - let chinese_traditional_codepoints = [ 74 - 0x4ED6; 0x5011; 0x7232; 0x4EC0; 0x9EBD; 0x4E0D; 0x8AAA; 0x4E2D; 0x6587 75 - ] 90 + let chinese_traditional_codepoints = 91 + [ 0x4ED6; 0x5011; 0x7232; 0x4EC0; 0x9EBD; 0x4E0D; 0x8AAA; 0x4E2D; 0x6587 ] 92 + 76 93 let chinese_traditional_punycode = "ihqwctvzc91f659drss3x8bo0yb" 77 94 78 95 (* (D) Czech *) 79 - let czech_codepoints = [ 80 - 0x0050; 0x0072; 0x006F; 0x010D; 0x0070; 0x0072; 0x006F; 0x0073; 0x0074; 81 - 0x011B; 0x006E; 0x0065; 0x006D; 0x006C; 0x0075; 0x0076; 0x00ED; 0x010D; 82 - 0x0065; 0x0073; 0x006B; 0x0079 83 - ] 96 + let czech_codepoints = 97 + [ 98 + 0x0050; 99 + 0x0072; 100 + 0x006F; 101 + 0x010D; 102 + 0x0070; 103 + 0x0072; 104 + 0x006F; 105 + 0x0073; 106 + 0x0074; 107 + 0x011B; 108 + 0x006E; 109 + 0x0065; 110 + 0x006D; 111 + 0x006C; 112 + 0x0075; 113 + 0x0076; 114 + 0x00ED; 115 + 0x010D; 116 + 0x0065; 117 + 0x0073; 118 + 0x006B; 119 + 0x0079; 120 + ] 121 + 84 122 let czech_punycode = "Proprostnemluvesky-uyb24dma41a" 85 123 86 124 (* (E) Hebrew *) 87 - let hebrew_codepoints = [ 88 - 0x05DC; 0x05DE; 0x05D4; 0x05D4; 0x05DD; 0x05E4; 0x05E9; 0x05D5; 0x05D8; 89 - 0x05DC; 0x05D0; 0x05DE; 0x05D3; 0x05D1; 0x05E8; 0x05D9; 0x05DD; 0x05E2; 90 - 0x05D1; 0x05E8; 0x05D9; 0x05EA 91 - ] 125 + let hebrew_codepoints = 126 + [ 127 + 0x05DC; 128 + 0x05DE; 129 + 0x05D4; 130 + 0x05D4; 131 + 0x05DD; 132 + 0x05E4; 133 + 0x05E9; 134 + 0x05D5; 135 + 0x05D8; 136 + 0x05DC; 137 + 0x05D0; 138 + 0x05DE; 139 + 0x05D3; 140 + 0x05D1; 141 + 0x05E8; 142 + 0x05D9; 143 + 0x05DD; 144 + 0x05E2; 145 + 0x05D1; 146 + 0x05E8; 147 + 0x05D9; 148 + 0x05EA; 149 + ] 150 + 92 151 let hebrew_punycode = "4dbcagdahymbxekheh6e0a7fei0b" 93 152 94 153 (* (F) Hindi (Devanagari) *) 95 - let hindi_codepoints = [ 96 - 0x092F; 0x0939; 0x0932; 0x094B; 0x0917; 0x0939; 0x093F; 0x0928; 0x094D; 97 - 0x0926; 0x0940; 0x0915; 0x094D; 0x092F; 0x094B; 0x0902; 0x0928; 0x0939; 98 - 0x0940; 0x0902; 0x092C; 0x094B; 0x0932; 0x0938; 0x0915; 0x0924; 0x0947; 99 - 0x0939; 0x0948; 0x0902 100 - ] 154 + let hindi_codepoints = 155 + [ 156 + 0x092F; 157 + 0x0939; 158 + 0x0932; 159 + 0x094B; 160 + 0x0917; 161 + 0x0939; 162 + 0x093F; 163 + 0x0928; 164 + 0x094D; 165 + 0x0926; 166 + 0x0940; 167 + 0x0915; 168 + 0x094D; 169 + 0x092F; 170 + 0x094B; 171 + 0x0902; 172 + 0x0928; 173 + 0x0939; 174 + 0x0940; 175 + 0x0902; 176 + 0x092C; 177 + 0x094B; 178 + 0x0932; 179 + 0x0938; 180 + 0x0915; 181 + 0x0924; 182 + 0x0947; 183 + 0x0939; 184 + 0x0948; 185 + 0x0902; 186 + ] 187 + 101 188 let hindi_punycode = "i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd" 102 189 103 190 (* (G) Japanese (kanji and hiragana) *) 104 - let japanese_codepoints = [ 105 - 0x306A; 0x305C; 0x307F; 0x3093; 0x306A; 0x65E5; 0x672C; 0x8A9E; 0x3092; 106 - 0x8A71; 0x3057; 0x3066; 0x304F; 0x308C; 0x306A; 0x3044; 0x306E; 0x304B 107 - ] 191 + let japanese_codepoints = 192 + [ 193 + 0x306A; 194 + 0x305C; 195 + 0x307F; 196 + 0x3093; 197 + 0x306A; 198 + 0x65E5; 199 + 0x672C; 200 + 0x8A9E; 201 + 0x3092; 202 + 0x8A71; 203 + 0x3057; 204 + 0x3066; 205 + 0x304F; 206 + 0x308C; 207 + 0x306A; 208 + 0x3044; 209 + 0x306E; 210 + 0x304B; 211 + ] 212 + 108 213 let japanese_punycode = "n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa" 109 214 110 215 (* (H) Korean (Hangul syllables) *) 111 - let korean_codepoints = [ 112 - 0xC138; 0xACC4; 0xC758; 0xBAA8; 0xB4E0; 0xC0AC; 0xB78C; 0xB4E4; 0xC774; 113 - 0xD55C; 0xAD6D; 0xC5B4; 0xB97C; 0xC774; 0xD574; 0xD55C; 0xB2E4; 0xBA74; 114 - 0xC5BC; 0xB9C8; 0xB098; 0xC88B; 0xC744; 0xAE4C 115 - ] 116 - let korean_punycode = "989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5jpsd879ccm6fea98c" 216 + let korean_codepoints = 217 + [ 218 + 0xC138; 219 + 0xACC4; 220 + 0xC758; 221 + 0xBAA8; 222 + 0xB4E0; 223 + 0xC0AC; 224 + 0xB78C; 225 + 0xB4E4; 226 + 0xC774; 227 + 0xD55C; 228 + 0xAD6D; 229 + 0xC5B4; 230 + 0xB97C; 231 + 0xC774; 232 + 0xD574; 233 + 0xD55C; 234 + 0xB2E4; 235 + 0xBA74; 236 + 0xC5BC; 237 + 0xB9C8; 238 + 0xB098; 239 + 0xC88B; 240 + 0xC744; 241 + 0xAE4C; 242 + ] 243 + 244 + let korean_punycode = 245 + "989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5jpsd879ccm6fea98c" 117 246 118 247 (* (I) Russian (Cyrillic) *) 119 - let russian_codepoints = [ 120 - 0x043F; 0x043E; 0x0447; 0x0435; 0x043C; 0x0443; 0x0436; 0x0435; 0x043E; 121 - 0x043D; 0x0438; 0x043D; 0x0435; 0x0433; 0x043E; 0x0432; 0x043E; 0x0440; 122 - 0x044F; 0x0442; 0x043F; 0x043E; 0x0440; 0x0443; 0x0441; 0x0441; 0x043A; 123 - 0x0438 124 - ] 248 + let russian_codepoints = 249 + [ 250 + 0x043F; 251 + 0x043E; 252 + 0x0447; 253 + 0x0435; 254 + 0x043C; 255 + 0x0443; 256 + 0x0436; 257 + 0x0435; 258 + 0x043E; 259 + 0x043D; 260 + 0x0438; 261 + 0x043D; 262 + 0x0435; 263 + 0x0433; 264 + 0x043E; 265 + 0x0432; 266 + 0x043E; 267 + 0x0440; 268 + 0x044F; 269 + 0x0442; 270 + 0x043F; 271 + 0x043E; 272 + 0x0440; 273 + 0x0443; 274 + 0x0441; 275 + 0x0441; 276 + 0x043A; 277 + 0x0438; 278 + ] 279 + 125 280 let russian_punycode = "b1abfaaepdrnnbgefbadotcwatmq2g4l" 126 281 127 282 (* (J) Spanish *) 128 - let spanish_codepoints = [ 129 - 0x0050; 0x006F; 0x0072; 0x0071; 0x0075; 0x00E9; 0x006E; 0x006F; 0x0070; 130 - 0x0075; 0x0065; 0x0064; 0x0065; 0x006E; 0x0073; 0x0069; 0x006D; 0x0070; 131 - 0x006C; 0x0065; 0x006D; 0x0065; 0x006E; 0x0074; 0x0065; 0x0068; 0x0061; 132 - 0x0062; 0x006C; 0x0061; 0x0072; 0x0065; 0x006E; 0x0045; 0x0073; 0x0070; 133 - 0x0061; 0x00F1; 0x006F; 0x006C 134 - ] 283 + let spanish_codepoints = 284 + [ 285 + 0x0050; 286 + 0x006F; 287 + 0x0072; 288 + 0x0071; 289 + 0x0075; 290 + 0x00E9; 291 + 0x006E; 292 + 0x006F; 293 + 0x0070; 294 + 0x0075; 295 + 0x0065; 296 + 0x0064; 297 + 0x0065; 298 + 0x006E; 299 + 0x0073; 300 + 0x0069; 301 + 0x006D; 302 + 0x0070; 303 + 0x006C; 304 + 0x0065; 305 + 0x006D; 306 + 0x0065; 307 + 0x006E; 308 + 0x0074; 309 + 0x0065; 310 + 0x0068; 311 + 0x0061; 312 + 0x0062; 313 + 0x006C; 314 + 0x0061; 315 + 0x0072; 316 + 0x0065; 317 + 0x006E; 318 + 0x0045; 319 + 0x0073; 320 + 0x0070; 321 + 0x0061; 322 + 0x00F1; 323 + 0x006F; 324 + 0x006C; 325 + ] 326 + 135 327 let spanish_punycode = "PorqunopuedensimplementehablarenEspaol-fmd56a" 136 328 137 329 (* (K) Vietnamese *) 138 - let vietnamese_codepoints = [ 139 - 0x0054; 0x1EA1; 0x0069; 0x0073; 0x0061; 0x006F; 0x0068; 0x1ECD; 0x006B; 140 - 0x0068; 0x00F4; 0x006E; 0x0067; 0x0074; 0x0068; 0x1EC3; 0x0063; 0x0068; 141 - 0x1EC9; 0x006E; 0x00F3; 0x0069; 0x0074; 0x0069; 0x1EBF; 0x006E; 0x0067; 142 - 0x0056; 0x0069; 0x1EC7; 0x0074 143 - ] 330 + let vietnamese_codepoints = 331 + [ 332 + 0x0054; 333 + 0x1EA1; 334 + 0x0069; 335 + 0x0073; 336 + 0x0061; 337 + 0x006F; 338 + 0x0068; 339 + 0x1ECD; 340 + 0x006B; 341 + 0x0068; 342 + 0x00F4; 343 + 0x006E; 344 + 0x0067; 345 + 0x0074; 346 + 0x0068; 347 + 0x1EC3; 348 + 0x0063; 349 + 0x0068; 350 + 0x1EC9; 351 + 0x006E; 352 + 0x00F3; 353 + 0x0069; 354 + 0x0074; 355 + 0x0069; 356 + 0x1EBF; 357 + 0x006E; 358 + 0x0067; 359 + 0x0056; 360 + 0x0069; 361 + 0x1EC7; 362 + 0x0074; 363 + ] 364 + 144 365 let vietnamese_punycode = "TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g" 145 366 146 367 (* (L) 3年B組金八先生 - Japanese with ASCII *) 147 - let example_l_codepoints = [ 148 - 0x0033; 0x5E74; 0x0042; 0x7D44; 0x91D1; 0x516B; 0x5148; 0x751F 149 - ] 368 + let example_l_codepoints = 369 + [ 0x0033; 0x5E74; 0x0042; 0x7D44; 0x91D1; 0x516B; 0x5148; 0x751F ] 370 + 150 371 let example_l_punycode = "3B-ww4c5e180e575a65lsy2b" 151 372 152 373 (* (M) 安室奈美恵-with-SUPER-MONKEYS *) 153 - let example_m_codepoints = [ 154 - 0x5B89; 0x5BA4; 0x5948; 0x7F8E; 0x6075; 0x002D; 0x0077; 0x0069; 0x0074; 155 - 0x0068; 0x002D; 0x0053; 0x0055; 0x0050; 0x0045; 0x0052; 0x002D; 0x004D; 156 - 0x004F; 0x004E; 0x004B; 0x0045; 0x0059; 0x0053 157 - ] 374 + let example_m_codepoints = 375 + [ 376 + 0x5B89; 377 + 0x5BA4; 378 + 0x5948; 379 + 0x7F8E; 380 + 0x6075; 381 + 0x002D; 382 + 0x0077; 383 + 0x0069; 384 + 0x0074; 385 + 0x0068; 386 + 0x002D; 387 + 0x0053; 388 + 0x0055; 389 + 0x0050; 390 + 0x0045; 391 + 0x0052; 392 + 0x002D; 393 + 0x004D; 394 + 0x004F; 395 + 0x004E; 396 + 0x004B; 397 + 0x0045; 398 + 0x0059; 399 + 0x0053; 400 + ] 401 + 158 402 let example_m_punycode = "-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n" 159 403 160 404 (* (N) Hello-Another-Way-それぞれの場所 *) 161 - let example_n_codepoints = [ 162 - 0x0048; 0x0065; 0x006C; 0x006C; 0x006F; 0x002D; 0x0041; 0x006E; 0x006F; 163 - 0x0074; 0x0068; 0x0065; 0x0072; 0x002D; 0x0057; 0x0061; 0x0079; 0x002D; 164 - 0x305D; 0x308C; 0x305E; 0x308C; 0x306E; 0x5834; 0x6240 165 - ] 405 + let example_n_codepoints = 406 + [ 407 + 0x0048; 408 + 0x0065; 409 + 0x006C; 410 + 0x006C; 411 + 0x006F; 412 + 0x002D; 413 + 0x0041; 414 + 0x006E; 415 + 0x006F; 416 + 0x0074; 417 + 0x0068; 418 + 0x0065; 419 + 0x0072; 420 + 0x002D; 421 + 0x0057; 422 + 0x0061; 423 + 0x0079; 424 + 0x002D; 425 + 0x305D; 426 + 0x308C; 427 + 0x305E; 428 + 0x308C; 429 + 0x306E; 430 + 0x5834; 431 + 0x6240; 432 + ] 433 + 166 434 let example_n_punycode = "Hello-Another-Way--fc4qua05auwb3674vfr0b" 167 435 168 436 (* (O) ひとつ屋根の下2 *) 169 - let example_o_codepoints = [ 170 - 0x3072; 0x3068; 0x3064; 0x5C4B; 0x6839; 0x306E; 0x4E0B; 0x0032 171 - ] 437 + let example_o_codepoints = 438 + [ 0x3072; 0x3068; 0x3064; 0x5C4B; 0x6839; 0x306E; 0x4E0B; 0x0032 ] 439 + 172 440 let example_o_punycode = "2-u9tlzr9756bt3uc0v" 173 441 174 442 (* (P) MaijでKoiする5秒前 *) 175 - let example_p_codepoints = [ 176 - 0x004D; 0x0061; 0x006A; 0x0069; 0x3067; 0x004B; 0x006F; 0x0069; 0x3059; 177 - 0x308B; 0x0035; 0x79D2; 0x524D 178 - ] 443 + let example_p_codepoints = 444 + [ 445 + 0x004D; 446 + 0x0061; 447 + 0x006A; 448 + 0x0069; 449 + 0x3067; 450 + 0x004B; 451 + 0x006F; 452 + 0x0069; 453 + 0x3059; 454 + 0x308B; 455 + 0x0035; 456 + 0x79D2; 457 + 0x524D; 458 + ] 459 + 179 460 let example_p_punycode = "MajiKoi5-783gue6qz075azm5e" 180 461 181 462 (* (Q) パフィーdeルンバ *) 182 - let example_q_codepoints = [ 183 - 0x30D1; 0x30D5; 0x30A3; 0x30FC; 0x0064; 0x0065; 0x30EB; 0x30F3; 0x30D0 184 - ] 463 + let example_q_codepoints = 464 + [ 0x30D1; 0x30D5; 0x30A3; 0x30FC; 0x0064; 0x0065; 0x30EB; 0x30F3; 0x30D0 ] 465 + 185 466 let example_q_punycode = "de-jg4avhby1noc0d" 186 467 187 468 (* (R) そのスピードで *) 188 - let example_r_codepoints = [ 189 - 0x305D; 0x306E; 0x30B9; 0x30D4; 0x30FC; 0x30C9; 0x3067 190 - ] 469 + let example_r_codepoints = 470 + [ 0x305D; 0x306E; 0x30B9; 0x30D4; 0x30FC; 0x30C9; 0x3067 ] 471 + 191 472 let example_r_punycode = "d9juau41awczczp" 192 473 193 474 (* (S) -> $1.00 <- (pure ASCII) *) 194 - let example_s_codepoints = [ 195 - 0x002D; 0x003E; 0x0020; 0x0024; 0x0031; 0x002E; 0x0030; 0x0030; 0x0020; 196 - 0x003C; 0x002D 197 - ] 475 + let example_s_codepoints = 476 + [ 477 + 0x002D; 478 + 0x003E; 479 + 0x0020; 480 + 0x0024; 481 + 0x0031; 482 + 0x002E; 483 + 0x0030; 484 + 0x0030; 485 + 0x0020; 486 + 0x003C; 487 + 0x002D; 488 + ] 489 + 198 490 let example_s_punycode = "-> $1.00 <--" 199 491 200 492 (* Test functions *) 201 493 202 - let test_decode_arabic () = 203 - check_decode_ok arabic_codepoints arabic_punycode 494 + let test_decode_arabic () = check_decode_ok arabic_codepoints arabic_punycode 204 495 205 496 let test_decode_chinese_simplified () = 206 497 check_decode_ok chinese_simplified_codepoints chinese_simplified_punycode ··· 208 499 let test_decode_chinese_traditional () = 209 500 check_decode_ok chinese_traditional_codepoints chinese_traditional_punycode 210 501 211 - let test_decode_hebrew () = 212 - check_decode_ok hebrew_codepoints hebrew_punycode 213 - 214 - let test_decode_hindi () = 215 - check_decode_ok hindi_codepoints hindi_punycode 502 + let test_decode_hebrew () = check_decode_ok hebrew_codepoints hebrew_punycode 503 + let test_decode_hindi () = check_decode_ok hindi_codepoints hindi_punycode 216 504 217 505 let test_decode_japanese () = 218 506 check_decode_ok japanese_codepoints japanese_punycode 219 507 220 - let test_decode_korean () = 221 - check_decode_ok korean_codepoints korean_punycode 508 + let test_decode_korean () = check_decode_ok korean_codepoints korean_punycode 222 509 223 510 let test_decode_example_l () = 224 511 check_decode_ok example_l_codepoints example_l_punycode ··· 238 525 let test_decode_example_r () = 239 526 check_decode_ok example_r_codepoints example_r_punycode 240 527 241 - let test_decode_czech () = 242 - check_decode_ok czech_codepoints czech_punycode 528 + let test_decode_czech () = check_decode_ok czech_codepoints czech_punycode 243 529 244 530 let test_decode_russian () = 245 531 check_decode_ok russian_codepoints (String.lowercase_ascii russian_punycode) 246 532 247 - let test_decode_spanish () = 248 - check_decode_ok spanish_codepoints spanish_punycode 533 + let test_decode_spanish () = check_decode_ok spanish_codepoints spanish_punycode 249 534 250 535 let test_decode_vietnamese () = 251 536 check_decode_ok vietnamese_codepoints vietnamese_punycode ··· 280 565 check_encode_ok korean_punycode (codepoints_of_hex_list korean_codepoints) 281 566 282 567 let test_encode_example_l () = 283 - check_encode_ok (String.lowercase_ascii example_l_punycode) 568 + check_encode_ok 569 + (String.lowercase_ascii example_l_punycode) 284 570 (codepoints_of_hex_list example_l_codepoints) 285 571 286 572 let test_encode_example_m () = 287 - check_encode_ok (String.lowercase_ascii example_m_punycode) 573 + check_encode_ok 574 + (String.lowercase_ascii example_m_punycode) 288 575 (codepoints_of_hex_list example_m_codepoints) 289 576 290 577 let test_encode_example_n () = 291 - check_encode_ok (String.lowercase_ascii example_n_punycode) 578 + check_encode_ok 579 + (String.lowercase_ascii example_n_punycode) 292 580 (codepoints_of_hex_list example_n_codepoints) 293 581 294 582 let test_encode_example_o () = 295 - check_encode_ok (String.lowercase_ascii example_o_punycode) 583 + check_encode_ok 584 + (String.lowercase_ascii example_o_punycode) 296 585 (codepoints_of_hex_list example_o_codepoints) 297 586 298 587 let test_encode_example_q () = 299 - check_encode_ok example_q_punycode (codepoints_of_hex_list example_q_codepoints) 588 + check_encode_ok example_q_punycode 589 + (codepoints_of_hex_list example_q_codepoints) 300 590 301 591 let test_encode_example_r () = 302 - check_encode_ok example_r_punycode (codepoints_of_hex_list example_r_codepoints) 592 + check_encode_ok example_r_punycode 593 + (codepoints_of_hex_list example_r_codepoints) 303 594 304 595 (* UTF-8 roundtrip tests *) 305 - let test_utf8_roundtrip_german () = 306 - check_utf8_roundtrip "münchen" 307 - 308 - let test_utf8_roundtrip_chinese () = 309 - check_utf8_roundtrip "中文" 310 - 311 - let test_utf8_roundtrip_japanese () = 312 - check_utf8_roundtrip "日本語" 313 - 314 - let test_utf8_roundtrip_arabic () = 315 - check_utf8_roundtrip "العربية" 316 - 317 - let test_utf8_roundtrip_russian () = 318 - check_utf8_roundtrip "русский" 319 - 320 - let test_utf8_roundtrip_greek () = 321 - check_utf8_roundtrip "ελληνικά" 322 - 323 - let test_utf8_roundtrip_korean () = 324 - check_utf8_roundtrip "한국어" 325 - 326 - let test_utf8_roundtrip_emoji () = 327 - check_utf8_roundtrip "hello👋world" 596 + let test_utf8_roundtrip_german () = check_utf8_roundtrip "münchen" 597 + let test_utf8_roundtrip_chinese () = check_utf8_roundtrip "中文" 598 + let test_utf8_roundtrip_japanese () = check_utf8_roundtrip "日本語" 599 + let test_utf8_roundtrip_arabic () = check_utf8_roundtrip "العربية" 600 + let test_utf8_roundtrip_russian () = check_utf8_roundtrip "русский" 601 + let test_utf8_roundtrip_greek () = check_utf8_roundtrip "ελληνικά" 602 + let test_utf8_roundtrip_korean () = check_utf8_roundtrip "한국어" 603 + let test_utf8_roundtrip_emoji () = check_utf8_roundtrip "hello👋world" 328 604 329 605 (* Label encoding tests *) 330 606 let test_label_encode_ascii () = 331 607 match Punycode.encode_label "example" with 332 608 | Ok result -> check string "ascii passthrough" "example" result 333 - | Error e -> fail (Format.asprintf "encode_label failed: %a" Punycode.pp_error e) 609 + | Error e -> 610 + fail (Format.asprintf "encode_label failed: %a" Punycode.pp_error e) 334 611 335 612 let test_label_encode_german () = 336 613 match Punycode.encode_label "münchen" with 337 614 | Ok result -> check string "german label" "xn--mnchen-3ya" result 338 - | Error e -> fail (Format.asprintf "encode_label failed: %a" Punycode.pp_error e) 615 + | Error e -> 616 + fail (Format.asprintf "encode_label failed: %a" Punycode.pp_error e) 339 617 340 618 let test_label_decode_german () = 341 619 match Punycode.decode_label "xn--mnchen-3ya" with 342 620 | Ok result -> check string "german decode" "münchen" result 343 - | Error e -> fail (Format.asprintf "decode_label failed: %a" Punycode.pp_error e) 621 + | Error e -> 622 + fail (Format.asprintf "decode_label failed: %a" Punycode.pp_error e) 344 623 345 624 (* IDNA tests *) 346 625 let test_idna_to_ascii_simple () = 347 626 match Punycode_idna.to_ascii "münchen.example.com" with 348 - | Ok result -> check string "idna to_ascii" "xn--mnchen-3ya.example.com" result 349 - | Error e -> fail (Format.asprintf "to_ascii failed: %a" Punycode_idna.pp_error e) 627 + | Ok result -> 628 + check string "idna to_ascii" "xn--mnchen-3ya.example.com" result 629 + | Error e -> 630 + fail (Format.asprintf "to_ascii failed: %a" Punycode_idna.pp_error e) 350 631 351 632 let test_idna_to_unicode_simple () = 352 633 match Punycode_idna.to_unicode "xn--mnchen-3ya.example.com" with 353 634 | Ok result -> check string "idna to_unicode" "münchen.example.com" result 354 - | Error e -> fail (Format.asprintf "to_unicode failed: %a" Punycode_idna.pp_error e) 635 + | Error e -> 636 + fail (Format.asprintf "to_unicode failed: %a" Punycode_idna.pp_error e) 355 637 356 638 let test_idna_roundtrip () = 357 639 let original = "münchen.example.com" in 358 640 match Punycode_idna.to_ascii original with 359 - | Error e -> fail (Format.asprintf "to_ascii failed: %a" Punycode_idna.pp_error e) 360 - | Ok ascii -> 361 - match Punycode_idna.to_unicode ascii with 362 - | Error e -> fail (Format.asprintf "to_unicode failed: %a" Punycode_idna.pp_error e) 363 - | Ok unicode -> check string "idna roundtrip" original unicode 641 + | Error e -> 642 + fail (Format.asprintf "to_ascii failed: %a" Punycode_idna.pp_error e) 643 + | Ok ascii -> ( 644 + match Punycode_idna.to_unicode ascii with 645 + | Error e -> 646 + fail 647 + (Format.asprintf "to_unicode failed: %a" Punycode_idna.pp_error e) 648 + | Ok unicode -> check string "idna roundtrip" original unicode) 364 649 365 650 let test_idna_all_ascii () = 366 651 match Punycode_idna.to_ascii "www.example.com" with 367 652 | Ok result -> check string "all ascii passthrough" "www.example.com" result 368 - | Error e -> fail (Format.asprintf "to_ascii failed: %a" Punycode_idna.pp_error e) 653 + | Error e -> 654 + fail (Format.asprintf "to_ascii failed: %a" Punycode_idna.pp_error e) 369 655 370 656 let test_idna_mixed_labels () = 371 657 match Punycode_idna.to_ascii "日本語.example.com" with 372 658 | Ok result -> 373 - (* Check that result starts with xn-- and ends with .example.com *) 374 - check bool "has ace prefix" true (Punycode.has_ace_prefix result); 375 - check bool "ends with example.com" true 376 - (String.length result > 12 && 377 - String.sub result (String.length result - 12) 12 = ".example.com") 378 - | Error e -> fail (Format.asprintf "to_ascii failed: %a" Punycode_idna.pp_error e) 659 + (* Check that result starts with xn-- and ends with .example.com *) 660 + check bool "has ace prefix" true (Punycode.has_ace_prefix result); 661 + check bool "ends with example.com" true 662 + (String.length result > 12 663 + && String.sub result (String.length result - 12) 12 = ".example.com") 664 + | Error e -> 665 + fail (Format.asprintf "to_ascii failed: %a" Punycode_idna.pp_error e) 379 666 380 667 (* Case annotation tests *) 381 668 let test_case_annotation_decode () = 382 669 (* RFC example: uppercase letters indicate case flags *) 383 670 match Punycode.decode_with_case "MajiKoi5-783gue6qz075azm5e" with 384 671 | Ok (codepoints, case_flags) -> 385 - check int "codepoints length" (List.length example_p_codepoints) (Array.length codepoints); 386 - check int "case_flags length" (Array.length codepoints) (Array.length case_flags); 387 - (* M should be uppercase *) 388 - check bool "M uppercase" true (case_flags.(0) = Punycode.Uppercase); 389 - (* a should be lowercase *) 390 - check bool "a lowercase" true (case_flags.(1) = Punycode.Lowercase) 391 - | Error e -> fail (Format.asprintf "decode_with_case failed: %a" Punycode.pp_error e) 672 + check int "codepoints length" 673 + (List.length example_p_codepoints) 674 + (Array.length codepoints); 675 + check int "case_flags length" (Array.length codepoints) 676 + (Array.length case_flags); 677 + (* M should be uppercase *) 678 + check bool "M uppercase" true (case_flags.(0) = Punycode.Uppercase); 679 + (* a should be lowercase *) 680 + check bool "a lowercase" true (case_flags.(1) = Punycode.Lowercase) 681 + | Error e -> 682 + fail (Format.asprintf "decode_with_case failed: %a" Punycode.pp_error e) 392 683 393 684 let test_case_annotation_encode () = 394 - let codepoints = codepoints_of_hex_list [0x0061; 0x0062; 0x0063] in (* "abc" *) 395 - let case_flags = [| Punycode.Uppercase; Punycode.Lowercase; Punycode.Uppercase |] in 685 + let codepoints = codepoints_of_hex_list [ 0x0061; 0x0062; 0x0063 ] in 686 + (* "abc" *) 687 + let case_flags = 688 + [| Punycode.Uppercase; Punycode.Lowercase; Punycode.Uppercase |] 689 + in 396 690 match Punycode.encode_with_case codepoints case_flags with 397 691 | Ok result -> 398 - (* Should encode as "AbC-" (basic code points with case annotation) *) 399 - check string "case encoded" "AbC-" result 400 - | Error e -> fail (Format.asprintf "encode_with_case failed: %a" Punycode.pp_error e) 692 + (* Should encode as "AbC-" (basic code points with case annotation) *) 693 + check string "case encoded" "AbC-" result 694 + | Error e -> 695 + fail (Format.asprintf "encode_with_case failed: %a" Punycode.pp_error e) 401 696 402 697 (* Edge case tests *) 403 698 let test_empty_input () = ··· 455 750 check bool "too short" false (Punycode.has_ace_prefix "xn-") 456 751 457 752 (* Test suites *) 458 - let decode_tests = [ 459 - "Arabic", `Quick, test_decode_arabic; 460 - "Chinese simplified", `Quick, test_decode_chinese_simplified; 461 - "Chinese traditional", `Quick, test_decode_chinese_traditional; 462 - "Czech", `Quick, test_decode_czech; 463 - "Hebrew", `Quick, test_decode_hebrew; 464 - "Hindi", `Quick, test_decode_hindi; 465 - "Japanese", `Quick, test_decode_japanese; 466 - "Korean", `Quick, test_decode_korean; 467 - "Russian", `Quick, test_decode_russian; 468 - "Spanish", `Quick, test_decode_spanish; 469 - "Vietnamese", `Quick, test_decode_vietnamese; 470 - "Example L (mixed)", `Quick, test_decode_example_l; 471 - "Example M (mixed)", `Quick, test_decode_example_m; 472 - "Example N (mixed)", `Quick, test_decode_example_n; 473 - "Example O (mixed)", `Quick, test_decode_example_o; 474 - "Example P (mixed)", `Quick, test_decode_example_p; 475 - "Example Q (mixed)", `Quick, test_decode_example_q; 476 - "Example R", `Quick, test_decode_example_r; 477 - "Example S (ASCII)", `Quick, test_decode_example_s; 478 - ] 753 + let decode_tests = 754 + [ 755 + ("Arabic", `Quick, test_decode_arabic); 756 + ("Chinese simplified", `Quick, test_decode_chinese_simplified); 757 + ("Chinese traditional", `Quick, test_decode_chinese_traditional); 758 + ("Czech", `Quick, test_decode_czech); 759 + ("Hebrew", `Quick, test_decode_hebrew); 760 + ("Hindi", `Quick, test_decode_hindi); 761 + ("Japanese", `Quick, test_decode_japanese); 762 + ("Korean", `Quick, test_decode_korean); 763 + ("Russian", `Quick, test_decode_russian); 764 + ("Spanish", `Quick, test_decode_spanish); 765 + ("Vietnamese", `Quick, test_decode_vietnamese); 766 + ("Example L (mixed)", `Quick, test_decode_example_l); 767 + ("Example M (mixed)", `Quick, test_decode_example_m); 768 + ("Example N (mixed)", `Quick, test_decode_example_n); 769 + ("Example O (mixed)", `Quick, test_decode_example_o); 770 + ("Example P (mixed)", `Quick, test_decode_example_p); 771 + ("Example Q (mixed)", `Quick, test_decode_example_q); 772 + ("Example R", `Quick, test_decode_example_r); 773 + ("Example S (ASCII)", `Quick, test_decode_example_s); 774 + ] 479 775 480 - let encode_tests = [ 481 - "Arabic", `Quick, test_encode_arabic; 482 - "Chinese simplified", `Quick, test_encode_chinese_simplified; 483 - "Chinese traditional", `Quick, test_encode_chinese_traditional; 484 - "Hebrew", `Quick, test_encode_hebrew; 485 - "Hindi", `Quick, test_encode_hindi; 486 - "Japanese", `Quick, test_encode_japanese; 487 - "Korean", `Quick, test_encode_korean; 488 - "Example L (mixed)", `Quick, test_encode_example_l; 489 - "Example M (mixed)", `Quick, test_encode_example_m; 490 - "Example N (mixed)", `Quick, test_encode_example_n; 491 - "Example O (mixed)", `Quick, test_encode_example_o; 492 - "Example Q (mixed)", `Quick, test_encode_example_q; 493 - "Example R", `Quick, test_encode_example_r; 494 - ] 776 + let encode_tests = 777 + [ 778 + ("Arabic", `Quick, test_encode_arabic); 779 + ("Chinese simplified", `Quick, test_encode_chinese_simplified); 780 + ("Chinese traditional", `Quick, test_encode_chinese_traditional); 781 + ("Hebrew", `Quick, test_encode_hebrew); 782 + ("Hindi", `Quick, test_encode_hindi); 783 + ("Japanese", `Quick, test_encode_japanese); 784 + ("Korean", `Quick, test_encode_korean); 785 + ("Example L (mixed)", `Quick, test_encode_example_l); 786 + ("Example M (mixed)", `Quick, test_encode_example_m); 787 + ("Example N (mixed)", `Quick, test_encode_example_n); 788 + ("Example O (mixed)", `Quick, test_encode_example_o); 789 + ("Example Q (mixed)", `Quick, test_encode_example_q); 790 + ("Example R", `Quick, test_encode_example_r); 791 + ] 495 792 496 - let utf8_tests = [ 497 - "German roundtrip", `Quick, test_utf8_roundtrip_german; 498 - "Chinese roundtrip", `Quick, test_utf8_roundtrip_chinese; 499 - "Japanese roundtrip", `Quick, test_utf8_roundtrip_japanese; 500 - "Arabic roundtrip", `Quick, test_utf8_roundtrip_arabic; 501 - "Russian roundtrip", `Quick, test_utf8_roundtrip_russian; 502 - "Greek roundtrip", `Quick, test_utf8_roundtrip_greek; 503 - "Korean roundtrip", `Quick, test_utf8_roundtrip_korean; 504 - "Emoji roundtrip", `Quick, test_utf8_roundtrip_emoji; 505 - ] 793 + let utf8_tests = 794 + [ 795 + ("German roundtrip", `Quick, test_utf8_roundtrip_german); 796 + ("Chinese roundtrip", `Quick, test_utf8_roundtrip_chinese); 797 + ("Japanese roundtrip", `Quick, test_utf8_roundtrip_japanese); 798 + ("Arabic roundtrip", `Quick, test_utf8_roundtrip_arabic); 799 + ("Russian roundtrip", `Quick, test_utf8_roundtrip_russian); 800 + ("Greek roundtrip", `Quick, test_utf8_roundtrip_greek); 801 + ("Korean roundtrip", `Quick, test_utf8_roundtrip_korean); 802 + ("Emoji roundtrip", `Quick, test_utf8_roundtrip_emoji); 803 + ] 506 804 507 - let label_tests = [ 508 - "ASCII passthrough", `Quick, test_label_encode_ascii; 509 - "German encode", `Quick, test_label_encode_german; 510 - "German decode", `Quick, test_label_decode_german; 511 - ] 805 + let label_tests = 806 + [ 807 + ("ASCII passthrough", `Quick, test_label_encode_ascii); 808 + ("German encode", `Quick, test_label_encode_german); 809 + ("German decode", `Quick, test_label_decode_german); 810 + ] 512 811 513 - let idna_tests = [ 514 - "to_ascii simple", `Quick, test_idna_to_ascii_simple; 515 - "to_unicode simple", `Quick, test_idna_to_unicode_simple; 516 - "roundtrip", `Quick, test_idna_roundtrip; 517 - "all ASCII", `Quick, test_idna_all_ascii; 518 - "mixed labels", `Quick, test_idna_mixed_labels; 519 - ] 812 + let idna_tests = 813 + [ 814 + ("to_ascii simple", `Quick, test_idna_to_ascii_simple); 815 + ("to_unicode simple", `Quick, test_idna_to_unicode_simple); 816 + ("roundtrip", `Quick, test_idna_roundtrip); 817 + ("all ASCII", `Quick, test_idna_all_ascii); 818 + ("mixed labels", `Quick, test_idna_mixed_labels); 819 + ] 520 820 521 - let case_tests = [ 522 - "decode with case", `Quick, test_case_annotation_decode; 523 - "encode with case", `Quick, test_case_annotation_encode; 524 - ] 821 + let case_tests = 822 + [ 823 + ("decode with case", `Quick, test_case_annotation_decode); 824 + ("encode with case", `Quick, test_case_annotation_encode); 825 + ] 525 826 526 - let edge_case_tests = [ 527 - "empty encode", `Quick, test_empty_input; 528 - "empty decode", `Quick, test_empty_decode; 529 - "pure ASCII", `Quick, test_pure_ascii; 530 - "invalid digit", `Quick, test_invalid_digit; 531 - "label too long", `Quick, test_label_too_long; 532 - "empty label", `Quick, test_empty_label; 533 - ] 827 + let edge_case_tests = 828 + [ 829 + ("empty encode", `Quick, test_empty_input); 830 + ("empty decode", `Quick, test_empty_decode); 831 + ("pure ASCII", `Quick, test_pure_ascii); 832 + ("invalid digit", `Quick, test_invalid_digit); 833 + ("label too long", `Quick, test_label_too_long); 834 + ("empty label", `Quick, test_empty_label); 835 + ] 534 836 535 - let validation_tests = [ 536 - "is_basic", `Quick, test_is_basic; 537 - "is_ascii_string", `Quick, test_is_ascii_string; 538 - "has_ace_prefix", `Quick, test_has_ace_prefix; 539 - ] 837 + let validation_tests = 838 + [ 839 + ("is_basic", `Quick, test_is_basic); 840 + ("is_ascii_string", `Quick, test_is_ascii_string); 841 + ("has_ace_prefix", `Quick, test_has_ace_prefix); 842 + ] 540 843 541 844 let () = 542 - run "Punycode" [ 543 - "decode RFC vectors", decode_tests; 544 - "encode RFC vectors", encode_tests; 545 - "UTF-8 roundtrip", utf8_tests; 546 - "label operations", label_tests; 547 - "IDNA operations", idna_tests; 548 - "case annotation", case_tests; 549 - "edge cases", edge_case_tests; 550 - "validation", validation_tests; 551 - ] 845 + run "Punycode" 846 + [ 847 + ("decode RFC vectors", decode_tests); 848 + ("encode RFC vectors", encode_tests); 849 + ("UTF-8 roundtrip", utf8_tests); 850 + ("label operations", label_tests); 851 + ("IDNA operations", idna_tests); 852 + ("case annotation", case_tests); 853 + ("edge cases", edge_case_tests); 854 + ("validation", validation_tests); 855 + ]