forked from
anil.recoil.org/ocaml-punycode
Punycode (RFC3492) in OCaml
1(*---------------------------------------------------------------------------
2 Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
3 SPDX-License-Identifier: ISC
4 ---------------------------------------------------------------------------*)
5
6(* Comprehensive tests for Punycode_idna (RFC 5891 IDNA) implementation *)
7
8open Alcotest
9
10(* {1 Test Helpers} *)
11
12let check_to_ascii ~msg expected input =
13 try
14 let result = Punycode_idna.to_ascii input in
15 check string msg expected result
16 with Punycode_idna.Error e ->
17 failf "%s: to_ascii failed: %a" msg Punycode_idna.pp_error_reason e
18
19let check_to_unicode ~msg expected input =
20 try
21 let result = Punycode_idna.to_unicode input in
22 check string msg expected result
23 with Punycode_idna.Error e ->
24 fail
25 (Fmt.str "%s: to_unicode failed: %a" msg Punycode_idna.pp_error_reason e)
26
27let check_label_to_ascii ~msg expected input =
28 try
29 let result = Punycode_idna.label_to_ascii input in
30 check string msg expected result
31 with Punycode_idna.Error e ->
32 fail
33 (Fmt.str "%s: label_to_ascii failed: %a" msg Punycode_idna.pp_error_reason
34 e)
35
36let check_label_to_unicode ~msg expected input =
37 try
38 let result = Punycode_idna.label_to_unicode input in
39 check string msg expected result
40 with Punycode_idna.Error e ->
41 fail
42 (Fmt.str "%s: label_to_unicode failed: %a" msg
43 Punycode_idna.pp_error_reason e)
44
45let check_roundtrip ~msg input =
46 try
47 let ascii = Punycode_idna.to_ascii input in
48 let unicode = Punycode_idna.to_unicode ascii in
49 check string msg input unicode
50 with Punycode_idna.Error e ->
51 fail
52 (Fmt.str "%s: roundtrip failed: %a" msg Punycode_idna.pp_error_reason e)
53
54let check_raises_error ~msg f =
55 try
56 ignore (f ());
57 failf "%s: expected Error but succeeded" msg
58 with Punycode_idna.Error _ -> ()
59
60(* {1 to_ascii Test Vectors} *)
61
62let test_to_ascii_german () =
63 check_to_ascii ~msg:"German domain" "xn--mnchen-3ya.de" "m\xc3\xbcnchen.de"
64
65let test_to_ascii_japanese () =
66 check_to_ascii ~msg:"Japanese domain" "xn--r8jz45g.jp"
67 "\xe4\xbe\x8b\xe3\x81\x88.jp"
68
69let test_to_ascii_ascii_passthrough () =
70 check_to_ascii ~msg:"ASCII passthrough" "example.com" "example.com"
71
72let test_to_ascii_mixed_labels () =
73 check_to_ascii ~msg:"mixed labels" "www.xn--mnchen-3ya.de"
74 "www.m\xc3\xbcnchen.de"
75
76let test_to_ascii_multi_idn () =
77 (* Both labels are non-ASCII *)
78 let input = "\xe4\xbe\x8b\xe3\x81\x88.\xe3\x83\x86\xe3\x82\xb9\xe3\x83\x88" in
79 try
80 let result = Punycode_idna.to_ascii input in
81 let labels = String.split_on_char '.' result in
82 List.iter
83 (fun l ->
84 check bool "each label has ACE prefix or is ASCII" true
85 (Punycode_idna.is_ace_label l || Punycode.is_ascii_string l))
86 labels
87 with Punycode_idna.Error e ->
88 fail
89 (Fmt.str "multiple IDN labels: to_ascii failed: %a"
90 Punycode_idna.pp_error_reason e)
91
92let test_to_ascii_chinese () =
93 check_to_ascii ~msg:"Chinese domain" "xn--fiq228c.cn"
94 "\xe4\xb8\xad\xe6\x96\x87.cn"
95
96let test_to_ascii_russian () =
97 (* "example" in Russian Cyrillic + .ru *)
98 let input = "\xd0\xbf\xd1\x80\xd0\xb8\xd0\xbc\xd0\xb5\xd1\x80.ru" in
99 try
100 let result = Punycode_idna.to_ascii input in
101 let labels = String.split_on_char '.' result in
102 check bool "first label has ACE prefix" true
103 (Punycode_idna.is_ace_label (List.hd labels));
104 check string "TLD preserved" "ru" (List.nth labels 1)
105 with Punycode_idna.Error e ->
106 fail
107 (Fmt.str "Russian domain: to_ascii failed: %a"
108 Punycode_idna.pp_error_reason e)
109
110(* {1 to_unicode Test Vectors} *)
111
112let test_to_unicode_german () =
113 check_to_unicode ~msg:"German domain" "m\xc3\xbcnchen.de" "xn--mnchen-3ya.de"
114
115let test_to_unicode_japanese () =
116 check_to_unicode ~msg:"Japanese domain" "\xe4\xbe\x8b\xe3\x81\x88.jp"
117 "xn--r8jz45g.jp"
118
119let test_to_unicode_ascii_passthrough () =
120 check_to_unicode ~msg:"ASCII passthrough" "example.com" "example.com"
121
122let test_to_unicode_mixed () =
123 check_to_unicode ~msg:"mixed domain" "www.m\xc3\xbcnchen.de"
124 "www.xn--mnchen-3ya.de"
125
126let test_to_unicode_chinese () =
127 check_to_unicode ~msg:"Chinese domain" "\xe4\xb8\xad\xe6\x96\x87.cn"
128 "xn--fiq228c.cn"
129
130(* {1 Roundtrip Tests} *)
131
132let test_roundtrip_german () =
133 check_roundtrip ~msg:"German roundtrip" "m\xc3\xbcnchen.de"
134
135let test_roundtrip_japanese () =
136 check_roundtrip ~msg:"Japanese roundtrip" "\xe4\xbe\x8b\xe3\x81\x88.jp"
137
138let test_roundtrip_chinese () =
139 check_roundtrip ~msg:"Chinese roundtrip" "\xe4\xb8\xad\xe6\x96\x87.cn"
140
141let test_roundtrip_mixed () =
142 check_roundtrip ~msg:"mixed roundtrip" "www.m\xc3\xbcnchen.de"
143
144let test_roundtrip_russian () =
145 check_roundtrip ~msg:"Russian roundtrip"
146 "\xd0\xbf\xd1\x80\xd0\xb8\xd0\xbc\xd0\xb5\xd1\x80.ru"
147
148let test_roundtrip_multi_idn () =
149 check_roundtrip ~msg:"multi-IDN roundtrip"
150 "\xe4\xbe\x8b\xe3\x81\x88.\xe3\x83\x86\xe3\x82\xb9\xe3\x83\x88"
151
152let test_roundtrip_ascii () =
153 check_roundtrip ~msg:"ASCII roundtrip" "www.example.com"
154
155(* {1 label_to_ascii / label_to_unicode Tests} *)
156
157let test_label_to_ascii_german () =
158 check_label_to_ascii ~msg:"German label" "xn--mnchen-3ya" "m\xc3\xbcnchen"
159
160let test_label_to_ascii_ascii () =
161 check_label_to_ascii ~msg:"ASCII label passthrough" "example" "example"
162
163let test_label_to_ascii_japanese () =
164 check_label_to_ascii ~msg:"Japanese label" "xn--r8jz45g"
165 "\xe4\xbe\x8b\xe3\x81\x88"
166
167let test_label_to_unicode_german () =
168 check_label_to_unicode ~msg:"German label" "m\xc3\xbcnchen" "xn--mnchen-3ya"
169
170let test_label_to_unicode_ascii () =
171 check_label_to_unicode ~msg:"ASCII label passthrough" "example" "example"
172
173let test_label_to_unicode_japanese () =
174 check_label_to_unicode ~msg:"Japanese label" "\xe4\xbe\x8b\xe3\x81\x88"
175 "xn--r8jz45g"
176
177let test_label_roundtrip () =
178 let label = "m\xc3\xbcnchen" in
179 let ascii = Punycode_idna.label_to_ascii label in
180 let unicode = Punycode_idna.label_to_unicode ascii in
181 check string "label roundtrip" label unicode
182
183(* {1 is_ace_label Tests} *)
184
185let test_is_ace_label_valid () =
186 check bool "xn-- prefix" true (Punycode_idna.is_ace_label "xn--mnchen-3ya")
187
188let test_is_ace_label_uppercase () =
189 check bool "XN-- prefix (case insensitive)" true
190 (Punycode_idna.is_ace_label "XN--mnchen-3ya")
191
192let test_ace_label_mixed_case () =
193 check bool "Xn-- prefix (mixed case)" true
194 (Punycode_idna.is_ace_label "Xn--mnchen-3ya")
195
196let test_is_ace_label_plain () =
197 check bool "plain label" false (Punycode_idna.is_ace_label "example")
198
199let test_is_ace_label_short () =
200 check bool "too short (xn-)" false (Punycode_idna.is_ace_label "xn-")
201
202let test_ace_label_single_dash () =
203 check bool "single dash (xn-notvalid)" false
204 (Punycode_idna.is_ace_label "xn-notvalid")
205
206let test_is_ace_label_empty () =
207 check bool "empty string" false (Punycode_idna.is_ace_label "")
208
209(* {1 is_idna_valid Tests} *)
210
211let test_is_idna_valid_ascii () =
212 check bool "ASCII domain valid" true
213 (Punycode_idna.is_idna_valid "example.com")
214
215let test_is_idna_valid_idn () =
216 check bool "IDN domain valid" true
217 (Punycode_idna.is_idna_valid "m\xc3\xbcnchen.de")
218
219let test_is_idna_valid_ace () =
220 check bool "ACE domain valid" true
221 (Punycode_idna.is_idna_valid "xn--mnchen-3ya.de")
222
223let test_idna_valid_empty_label () =
224 (* Empty label (double dot) should be invalid *)
225 check bool "empty label invalid" false
226 (Punycode_idna.is_idna_valid "example..com")
227
228(* {1 normalize_nfc Tests} *)
229
230let test_normalize_nfc_composed () =
231 (* e followed by combining acute accent (U+0065 U+0301) should become
232 precomposed e-acute (U+00E9) *)
233 let decomposed = "\x65\xcc\x81" in
234 let expected = "\xc3\xa9" in
235 let result = Punycode_idna.normalize_nfc decomposed in
236 check string "NFC normalization: decomposed to composed" expected result
237
238let test_normalize_nfc_already_composed () =
239 let composed = "\xc3\xa9" in
240 let result = Punycode_idna.normalize_nfc composed in
241 check string "NFC normalization: already composed" composed result
242
243let test_normalize_nfc_ascii () =
244 let ascii = "hello" in
245 let result = Punycode_idna.normalize_nfc ascii in
246 check string "NFC normalization: ASCII unchanged" ascii result
247
248let test_normalize_nfc_hangul () =
249 (* Hangul syllable composition: U+1100 U+1161 -> U+AC00 *)
250 let decomposed = "\xe1\x84\x80\xe1\x85\xa1" in
251 let composed = "\xea\xb0\x80" in
252 let result = Punycode_idna.normalize_nfc decomposed in
253 check string "NFC normalization: Hangul composition" composed result
254
255(* {1 max_domain_length Tests} *)
256
257let test_max_domain_length_value () =
258 check int "max_domain_length is 253" 253 Punycode_idna.max_domain_length
259
260let test_domain_too_long () =
261 (* Create a domain that exceeds 253 bytes *)
262 let long_label = String.make 60 'a' in
263 let domain =
264 String.concat "."
265 [ long_label; long_label; long_label; long_label; long_label ]
266 in
267 (* 60*5 + 4 dots = 304 bytes > 253 *)
268 check_raises_error ~msg:"domain too long" (fun () ->
269 Punycode_idna.to_ascii domain)
270
271(* {1 Edge Case Tests} *)
272
273let test_empty_label_error () =
274 check_raises_error ~msg:"empty label" (fun () ->
275 Punycode_idna.label_to_ascii "")
276
277let test_single_label_domain () =
278 check_to_ascii ~msg:"single label domain" "example" "example"
279
280let test_trailing_dot () =
281 (* A trailing dot produces an empty final label after splitting on '.'.
282 The implementation raises Error on empty labels. *)
283 check_raises_error ~msg:"trailing dot" (fun () ->
284 Punycode_idna.to_ascii "example.com.")
285
286let test_leading_hyphen_std3 () =
287 (* With use_std3_rules, leading hyphens should be rejected *)
288 try
289 ignore (Punycode_idna.to_ascii ~use_std3_rules:true "-example.com");
290 fail "leading hyphen with STD3 rules: expected Error"
291 with Punycode_idna.Error _ -> ()
292
293let test_trailing_hyphen_std3 () =
294 try
295 ignore (Punycode_idna.to_ascii ~use_std3_rules:true "example-.com");
296 fail "trailing hyphen with STD3 rules: expected Error"
297 with Punycode_idna.Error _ -> ()
298
299let test_ascii_no_ace_prefix () =
300 (* ASCII labels should not get an xn-- prefix *)
301 let result = Punycode_idna.label_to_ascii "example" in
302 check bool "no ACE prefix for ASCII" false (Punycode_idna.is_ace_label result)
303
304let test_error_reason_to_string () =
305 let s =
306 Punycode_idna.error_reason_to_string (Punycode_idna.Invalid_label "test")
307 in
308 check bool "error string is non-empty" true (String.length s > 0)
309
310let test_pp_error_reason () =
311 let buf = Buffer.create 64 in
312 let fmt = Format.formatter_of_buffer buf in
313 Punycode_idna.pp_error_reason fmt (Punycode_idna.Domain_too_long 300);
314 Format.pp_print_flush fmt ();
315 let s = Buffer.contents buf in
316 check bool "pp_error_reason produces output" true (String.length s > 0)
317
318(* {1 Test Suites} *)
319
320let to_ascii_tests =
321 [
322 ("German domain", `Quick, test_to_ascii_german);
323 ("Japanese domain", `Quick, test_to_ascii_japanese);
324 ("ASCII passthrough", `Quick, test_to_ascii_ascii_passthrough);
325 ("mixed labels", `Quick, test_to_ascii_mixed_labels);
326 ("multiple IDN labels", `Quick, test_to_ascii_multi_idn);
327 ("Chinese domain", `Quick, test_to_ascii_chinese);
328 ("Russian domain", `Quick, test_to_ascii_russian);
329 ]
330
331let to_unicode_tests =
332 [
333 ("German domain", `Quick, test_to_unicode_german);
334 ("Japanese domain", `Quick, test_to_unicode_japanese);
335 ("ASCII passthrough", `Quick, test_to_unicode_ascii_passthrough);
336 ("mixed domain", `Quick, test_to_unicode_mixed);
337 ("Chinese domain", `Quick, test_to_unicode_chinese);
338 ]
339
340let roundtrip_tests =
341 [
342 ("German", `Quick, test_roundtrip_german);
343 ("Japanese", `Quick, test_roundtrip_japanese);
344 ("Chinese", `Quick, test_roundtrip_chinese);
345 ("mixed", `Quick, test_roundtrip_mixed);
346 ("Russian", `Quick, test_roundtrip_russian);
347 ("multi-IDN", `Quick, test_roundtrip_multi_idn);
348 ("ASCII", `Quick, test_roundtrip_ascii);
349 ]
350
351let label_tests =
352 [
353 ("label_to_ascii German", `Quick, test_label_to_ascii_german);
354 ("label_to_ascii ASCII", `Quick, test_label_to_ascii_ascii);
355 ("label_to_ascii Japanese", `Quick, test_label_to_ascii_japanese);
356 ("label_to_unicode German", `Quick, test_label_to_unicode_german);
357 ("label_to_unicode ASCII", `Quick, test_label_to_unicode_ascii);
358 ("label_to_unicode Japanese", `Quick, test_label_to_unicode_japanese);
359 ("label roundtrip", `Quick, test_label_roundtrip);
360 ]
361
362let is_ace_label_tests =
363 [
364 ("valid ACE prefix", `Quick, test_is_ace_label_valid);
365 ("uppercase prefix", `Quick, test_is_ace_label_uppercase);
366 ("mixed case prefix", `Quick, test_ace_label_mixed_case);
367 ("plain label", `Quick, test_is_ace_label_plain);
368 ("too short", `Quick, test_is_ace_label_short);
369 ("single dash", `Quick, test_ace_label_single_dash);
370 ("empty string", `Quick, test_is_ace_label_empty);
371 ]
372
373let is_idna_valid_tests =
374 [
375 ("ASCII domain", `Quick, test_is_idna_valid_ascii);
376 ("IDN domain", `Quick, test_is_idna_valid_idn);
377 ("ACE domain", `Quick, test_is_idna_valid_ace);
378 ("empty label", `Quick, test_idna_valid_empty_label);
379 ]
380
381let normalize_nfc_tests =
382 [
383 ("decomposed to composed", `Quick, test_normalize_nfc_composed);
384 ("already composed", `Quick, test_normalize_nfc_already_composed);
385 ("ASCII unchanged", `Quick, test_normalize_nfc_ascii);
386 ("Hangul composition", `Quick, test_normalize_nfc_hangul);
387 ]
388
389let edge_case_tests =
390 [
391 ("max_domain_length value", `Quick, test_max_domain_length_value);
392 ("domain too long", `Quick, test_domain_too_long);
393 ("empty label error", `Quick, test_empty_label_error);
394 ("single label domain", `Quick, test_single_label_domain);
395 ("trailing dot", `Quick, test_trailing_dot);
396 ("leading hyphen STD3", `Quick, test_leading_hyphen_std3);
397 ("trailing hyphen STD3", `Quick, test_trailing_hyphen_std3);
398 ("ASCII no ACE prefix", `Quick, test_ascii_no_ace_prefix);
399 ("error_reason_to_string", `Quick, test_error_reason_to_string);
400 ("pp_error_reason", `Quick, test_pp_error_reason);
401 ]
402
403let suite =
404 ( "punycode_idna",
405 to_ascii_tests @ to_unicode_tests @ roundtrip_tests @ label_tests
406 @ is_ace_label_tests @ is_idna_valid_tests @ normalize_nfc_tests
407 @ edge_case_tests )