Punycode (RFC3492) in OCaml
at main 407 lines 14 kB view raw
1(*--------------------------------------------------------------------------- 2 Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved. 3 SPDX-License-Identifier: ISC 4 ---------------------------------------------------------------------------*) 5 6(* Comprehensive tests for Punycode_idna (RFC 5891 IDNA) implementation *) 7 8open Alcotest 9 10(* {1 Test Helpers} *) 11 12let check_to_ascii ~msg expected input = 13 try 14 let result = Punycode_idna.to_ascii input in 15 check string msg expected result 16 with Punycode_idna.Error e -> 17 failf "%s: to_ascii failed: %a" msg Punycode_idna.pp_error_reason e 18 19let check_to_unicode ~msg expected input = 20 try 21 let result = Punycode_idna.to_unicode input in 22 check string msg expected result 23 with Punycode_idna.Error e -> 24 fail 25 (Fmt.str "%s: to_unicode failed: %a" msg Punycode_idna.pp_error_reason e) 26 27let check_label_to_ascii ~msg expected input = 28 try 29 let result = Punycode_idna.label_to_ascii input in 30 check string msg expected result 31 with Punycode_idna.Error e -> 32 fail 33 (Fmt.str "%s: label_to_ascii failed: %a" msg Punycode_idna.pp_error_reason 34 e) 35 36let check_label_to_unicode ~msg expected input = 37 try 38 let result = Punycode_idna.label_to_unicode input in 39 check string msg expected result 40 with Punycode_idna.Error e -> 41 fail 42 (Fmt.str "%s: label_to_unicode failed: %a" msg 43 Punycode_idna.pp_error_reason e) 44 45let check_roundtrip ~msg input = 46 try 47 let ascii = Punycode_idna.to_ascii input in 48 let unicode = Punycode_idna.to_unicode ascii in 49 check string msg input unicode 50 with Punycode_idna.Error e -> 51 fail 52 (Fmt.str "%s: roundtrip failed: %a" msg Punycode_idna.pp_error_reason e) 53 54let check_raises_error ~msg f = 55 try 56 ignore (f ()); 57 failf "%s: expected Error but succeeded" msg 58 with Punycode_idna.Error _ -> () 59 60(* {1 to_ascii Test Vectors} *) 61 62let test_to_ascii_german () = 63 check_to_ascii ~msg:"German domain" "xn--mnchen-3ya.de" "m\xc3\xbcnchen.de" 64 65let test_to_ascii_japanese () = 66 check_to_ascii ~msg:"Japanese domain" "xn--r8jz45g.jp" 67 "\xe4\xbe\x8b\xe3\x81\x88.jp" 68 69let test_to_ascii_ascii_passthrough () = 70 check_to_ascii ~msg:"ASCII passthrough" "example.com" "example.com" 71 72let test_to_ascii_mixed_labels () = 73 check_to_ascii ~msg:"mixed labels" "www.xn--mnchen-3ya.de" 74 "www.m\xc3\xbcnchen.de" 75 76let test_to_ascii_multi_idn () = 77 (* Both labels are non-ASCII *) 78 let input = "\xe4\xbe\x8b\xe3\x81\x88.\xe3\x83\x86\xe3\x82\xb9\xe3\x83\x88" in 79 try 80 let result = Punycode_idna.to_ascii input in 81 let labels = String.split_on_char '.' result in 82 List.iter 83 (fun l -> 84 check bool "each label has ACE prefix or is ASCII" true 85 (Punycode_idna.is_ace_label l || Punycode.is_ascii_string l)) 86 labels 87 with Punycode_idna.Error e -> 88 fail 89 (Fmt.str "multiple IDN labels: to_ascii failed: %a" 90 Punycode_idna.pp_error_reason e) 91 92let test_to_ascii_chinese () = 93 check_to_ascii ~msg:"Chinese domain" "xn--fiq228c.cn" 94 "\xe4\xb8\xad\xe6\x96\x87.cn" 95 96let test_to_ascii_russian () = 97 (* "example" in Russian Cyrillic + .ru *) 98 let input = "\xd0\xbf\xd1\x80\xd0\xb8\xd0\xbc\xd0\xb5\xd1\x80.ru" in 99 try 100 let result = Punycode_idna.to_ascii input in 101 let labels = String.split_on_char '.' result in 102 check bool "first label has ACE prefix" true 103 (Punycode_idna.is_ace_label (List.hd labels)); 104 check string "TLD preserved" "ru" (List.nth labels 1) 105 with Punycode_idna.Error e -> 106 fail 107 (Fmt.str "Russian domain: to_ascii failed: %a" 108 Punycode_idna.pp_error_reason e) 109 110(* {1 to_unicode Test Vectors} *) 111 112let test_to_unicode_german () = 113 check_to_unicode ~msg:"German domain" "m\xc3\xbcnchen.de" "xn--mnchen-3ya.de" 114 115let test_to_unicode_japanese () = 116 check_to_unicode ~msg:"Japanese domain" "\xe4\xbe\x8b\xe3\x81\x88.jp" 117 "xn--r8jz45g.jp" 118 119let test_to_unicode_ascii_passthrough () = 120 check_to_unicode ~msg:"ASCII passthrough" "example.com" "example.com" 121 122let test_to_unicode_mixed () = 123 check_to_unicode ~msg:"mixed domain" "www.m\xc3\xbcnchen.de" 124 "www.xn--mnchen-3ya.de" 125 126let test_to_unicode_chinese () = 127 check_to_unicode ~msg:"Chinese domain" "\xe4\xb8\xad\xe6\x96\x87.cn" 128 "xn--fiq228c.cn" 129 130(* {1 Roundtrip Tests} *) 131 132let test_roundtrip_german () = 133 check_roundtrip ~msg:"German roundtrip" "m\xc3\xbcnchen.de" 134 135let test_roundtrip_japanese () = 136 check_roundtrip ~msg:"Japanese roundtrip" "\xe4\xbe\x8b\xe3\x81\x88.jp" 137 138let test_roundtrip_chinese () = 139 check_roundtrip ~msg:"Chinese roundtrip" "\xe4\xb8\xad\xe6\x96\x87.cn" 140 141let test_roundtrip_mixed () = 142 check_roundtrip ~msg:"mixed roundtrip" "www.m\xc3\xbcnchen.de" 143 144let test_roundtrip_russian () = 145 check_roundtrip ~msg:"Russian roundtrip" 146 "\xd0\xbf\xd1\x80\xd0\xb8\xd0\xbc\xd0\xb5\xd1\x80.ru" 147 148let test_roundtrip_multi_idn () = 149 check_roundtrip ~msg:"multi-IDN roundtrip" 150 "\xe4\xbe\x8b\xe3\x81\x88.\xe3\x83\x86\xe3\x82\xb9\xe3\x83\x88" 151 152let test_roundtrip_ascii () = 153 check_roundtrip ~msg:"ASCII roundtrip" "www.example.com" 154 155(* {1 label_to_ascii / label_to_unicode Tests} *) 156 157let test_label_to_ascii_german () = 158 check_label_to_ascii ~msg:"German label" "xn--mnchen-3ya" "m\xc3\xbcnchen" 159 160let test_label_to_ascii_ascii () = 161 check_label_to_ascii ~msg:"ASCII label passthrough" "example" "example" 162 163let test_label_to_ascii_japanese () = 164 check_label_to_ascii ~msg:"Japanese label" "xn--r8jz45g" 165 "\xe4\xbe\x8b\xe3\x81\x88" 166 167let test_label_to_unicode_german () = 168 check_label_to_unicode ~msg:"German label" "m\xc3\xbcnchen" "xn--mnchen-3ya" 169 170let test_label_to_unicode_ascii () = 171 check_label_to_unicode ~msg:"ASCII label passthrough" "example" "example" 172 173let test_label_to_unicode_japanese () = 174 check_label_to_unicode ~msg:"Japanese label" "\xe4\xbe\x8b\xe3\x81\x88" 175 "xn--r8jz45g" 176 177let test_label_roundtrip () = 178 let label = "m\xc3\xbcnchen" in 179 let ascii = Punycode_idna.label_to_ascii label in 180 let unicode = Punycode_idna.label_to_unicode ascii in 181 check string "label roundtrip" label unicode 182 183(* {1 is_ace_label Tests} *) 184 185let test_is_ace_label_valid () = 186 check bool "xn-- prefix" true (Punycode_idna.is_ace_label "xn--mnchen-3ya") 187 188let test_is_ace_label_uppercase () = 189 check bool "XN-- prefix (case insensitive)" true 190 (Punycode_idna.is_ace_label "XN--mnchen-3ya") 191 192let test_ace_label_mixed_case () = 193 check bool "Xn-- prefix (mixed case)" true 194 (Punycode_idna.is_ace_label "Xn--mnchen-3ya") 195 196let test_is_ace_label_plain () = 197 check bool "plain label" false (Punycode_idna.is_ace_label "example") 198 199let test_is_ace_label_short () = 200 check bool "too short (xn-)" false (Punycode_idna.is_ace_label "xn-") 201 202let test_ace_label_single_dash () = 203 check bool "single dash (xn-notvalid)" false 204 (Punycode_idna.is_ace_label "xn-notvalid") 205 206let test_is_ace_label_empty () = 207 check bool "empty string" false (Punycode_idna.is_ace_label "") 208 209(* {1 is_idna_valid Tests} *) 210 211let test_is_idna_valid_ascii () = 212 check bool "ASCII domain valid" true 213 (Punycode_idna.is_idna_valid "example.com") 214 215let test_is_idna_valid_idn () = 216 check bool "IDN domain valid" true 217 (Punycode_idna.is_idna_valid "m\xc3\xbcnchen.de") 218 219let test_is_idna_valid_ace () = 220 check bool "ACE domain valid" true 221 (Punycode_idna.is_idna_valid "xn--mnchen-3ya.de") 222 223let test_idna_valid_empty_label () = 224 (* Empty label (double dot) should be invalid *) 225 check bool "empty label invalid" false 226 (Punycode_idna.is_idna_valid "example..com") 227 228(* {1 normalize_nfc Tests} *) 229 230let test_normalize_nfc_composed () = 231 (* e followed by combining acute accent (U+0065 U+0301) should become 232 precomposed e-acute (U+00E9) *) 233 let decomposed = "\x65\xcc\x81" in 234 let expected = "\xc3\xa9" in 235 let result = Punycode_idna.normalize_nfc decomposed in 236 check string "NFC normalization: decomposed to composed" expected result 237 238let test_normalize_nfc_already_composed () = 239 let composed = "\xc3\xa9" in 240 let result = Punycode_idna.normalize_nfc composed in 241 check string "NFC normalization: already composed" composed result 242 243let test_normalize_nfc_ascii () = 244 let ascii = "hello" in 245 let result = Punycode_idna.normalize_nfc ascii in 246 check string "NFC normalization: ASCII unchanged" ascii result 247 248let test_normalize_nfc_hangul () = 249 (* Hangul syllable composition: U+1100 U+1161 -> U+AC00 *) 250 let decomposed = "\xe1\x84\x80\xe1\x85\xa1" in 251 let composed = "\xea\xb0\x80" in 252 let result = Punycode_idna.normalize_nfc decomposed in 253 check string "NFC normalization: Hangul composition" composed result 254 255(* {1 max_domain_length Tests} *) 256 257let test_max_domain_length_value () = 258 check int "max_domain_length is 253" 253 Punycode_idna.max_domain_length 259 260let test_domain_too_long () = 261 (* Create a domain that exceeds 253 bytes *) 262 let long_label = String.make 60 'a' in 263 let domain = 264 String.concat "." 265 [ long_label; long_label; long_label; long_label; long_label ] 266 in 267 (* 60*5 + 4 dots = 304 bytes > 253 *) 268 check_raises_error ~msg:"domain too long" (fun () -> 269 Punycode_idna.to_ascii domain) 270 271(* {1 Edge Case Tests} *) 272 273let test_empty_label_error () = 274 check_raises_error ~msg:"empty label" (fun () -> 275 Punycode_idna.label_to_ascii "") 276 277let test_single_label_domain () = 278 check_to_ascii ~msg:"single label domain" "example" "example" 279 280let test_trailing_dot () = 281 (* A trailing dot produces an empty final label after splitting on '.'. 282 The implementation raises Error on empty labels. *) 283 check_raises_error ~msg:"trailing dot" (fun () -> 284 Punycode_idna.to_ascii "example.com.") 285 286let test_leading_hyphen_std3 () = 287 (* With use_std3_rules, leading hyphens should be rejected *) 288 try 289 ignore (Punycode_idna.to_ascii ~use_std3_rules:true "-example.com"); 290 fail "leading hyphen with STD3 rules: expected Error" 291 with Punycode_idna.Error _ -> () 292 293let test_trailing_hyphen_std3 () = 294 try 295 ignore (Punycode_idna.to_ascii ~use_std3_rules:true "example-.com"); 296 fail "trailing hyphen with STD3 rules: expected Error" 297 with Punycode_idna.Error _ -> () 298 299let test_ascii_no_ace_prefix () = 300 (* ASCII labels should not get an xn-- prefix *) 301 let result = Punycode_idna.label_to_ascii "example" in 302 check bool "no ACE prefix for ASCII" false (Punycode_idna.is_ace_label result) 303 304let test_error_reason_to_string () = 305 let s = 306 Punycode_idna.error_reason_to_string (Punycode_idna.Invalid_label "test") 307 in 308 check bool "error string is non-empty" true (String.length s > 0) 309 310let test_pp_error_reason () = 311 let buf = Buffer.create 64 in 312 let fmt = Format.formatter_of_buffer buf in 313 Punycode_idna.pp_error_reason fmt (Punycode_idna.Domain_too_long 300); 314 Format.pp_print_flush fmt (); 315 let s = Buffer.contents buf in 316 check bool "pp_error_reason produces output" true (String.length s > 0) 317 318(* {1 Test Suites} *) 319 320let to_ascii_tests = 321 [ 322 ("German domain", `Quick, test_to_ascii_german); 323 ("Japanese domain", `Quick, test_to_ascii_japanese); 324 ("ASCII passthrough", `Quick, test_to_ascii_ascii_passthrough); 325 ("mixed labels", `Quick, test_to_ascii_mixed_labels); 326 ("multiple IDN labels", `Quick, test_to_ascii_multi_idn); 327 ("Chinese domain", `Quick, test_to_ascii_chinese); 328 ("Russian domain", `Quick, test_to_ascii_russian); 329 ] 330 331let to_unicode_tests = 332 [ 333 ("German domain", `Quick, test_to_unicode_german); 334 ("Japanese domain", `Quick, test_to_unicode_japanese); 335 ("ASCII passthrough", `Quick, test_to_unicode_ascii_passthrough); 336 ("mixed domain", `Quick, test_to_unicode_mixed); 337 ("Chinese domain", `Quick, test_to_unicode_chinese); 338 ] 339 340let roundtrip_tests = 341 [ 342 ("German", `Quick, test_roundtrip_german); 343 ("Japanese", `Quick, test_roundtrip_japanese); 344 ("Chinese", `Quick, test_roundtrip_chinese); 345 ("mixed", `Quick, test_roundtrip_mixed); 346 ("Russian", `Quick, test_roundtrip_russian); 347 ("multi-IDN", `Quick, test_roundtrip_multi_idn); 348 ("ASCII", `Quick, test_roundtrip_ascii); 349 ] 350 351let label_tests = 352 [ 353 ("label_to_ascii German", `Quick, test_label_to_ascii_german); 354 ("label_to_ascii ASCII", `Quick, test_label_to_ascii_ascii); 355 ("label_to_ascii Japanese", `Quick, test_label_to_ascii_japanese); 356 ("label_to_unicode German", `Quick, test_label_to_unicode_german); 357 ("label_to_unicode ASCII", `Quick, test_label_to_unicode_ascii); 358 ("label_to_unicode Japanese", `Quick, test_label_to_unicode_japanese); 359 ("label roundtrip", `Quick, test_label_roundtrip); 360 ] 361 362let is_ace_label_tests = 363 [ 364 ("valid ACE prefix", `Quick, test_is_ace_label_valid); 365 ("uppercase prefix", `Quick, test_is_ace_label_uppercase); 366 ("mixed case prefix", `Quick, test_ace_label_mixed_case); 367 ("plain label", `Quick, test_is_ace_label_plain); 368 ("too short", `Quick, test_is_ace_label_short); 369 ("single dash", `Quick, test_ace_label_single_dash); 370 ("empty string", `Quick, test_is_ace_label_empty); 371 ] 372 373let is_idna_valid_tests = 374 [ 375 ("ASCII domain", `Quick, test_is_idna_valid_ascii); 376 ("IDN domain", `Quick, test_is_idna_valid_idn); 377 ("ACE domain", `Quick, test_is_idna_valid_ace); 378 ("empty label", `Quick, test_idna_valid_empty_label); 379 ] 380 381let normalize_nfc_tests = 382 [ 383 ("decomposed to composed", `Quick, test_normalize_nfc_composed); 384 ("already composed", `Quick, test_normalize_nfc_already_composed); 385 ("ASCII unchanged", `Quick, test_normalize_nfc_ascii); 386 ("Hangul composition", `Quick, test_normalize_nfc_hangul); 387 ] 388 389let edge_case_tests = 390 [ 391 ("max_domain_length value", `Quick, test_max_domain_length_value); 392 ("domain too long", `Quick, test_domain_too_long); 393 ("empty label error", `Quick, test_empty_label_error); 394 ("single label domain", `Quick, test_single_label_domain); 395 ("trailing dot", `Quick, test_trailing_dot); 396 ("leading hyphen STD3", `Quick, test_leading_hyphen_std3); 397 ("trailing hyphen STD3", `Quick, test_trailing_hyphen_std3); 398 ("ASCII no ACE prefix", `Quick, test_ascii_no_ace_prefix); 399 ("error_reason_to_string", `Quick, test_error_reason_to_string); 400 ("pp_error_reason", `Quick, test_pp_error_reason); 401 ] 402 403let suite = 404 ( "punycode_idna", 405 to_ascii_tests @ to_unicode_tests @ roundtrip_tests @ label_tests 406 @ is_ace_label_tests @ is_idna_valid_tests @ normalize_nfc_tests 407 @ edge_case_tests )