···910(* Test that encode_utf8 never crashes on arbitrary input *)
11let test_encode_no_crash input =
12- ignore (Punycode.encode_utf8 input);
13 check true
1415(* Test that decode_utf8 never crashes on arbitrary input *)
16let test_decode_no_crash input =
17- ignore (Punycode.decode_utf8 input);
18 check true
1920(* Test roundtrip: encode then decode should give back original (case-insensitive)
21 IDNA/Punycode lowercases ASCII characters during encoding per RFC 5891 *)
22let test_roundtrip input =
23- match Punycode.encode_utf8 input with
24- | Ok encoded -> (
25- match Punycode.decode_utf8 encoded with
26- | Ok decoded ->
27- (* Compare lowercase versions since IDNA lowercases ASCII *)
28- check_eq ~pp:Format.pp_print_string
29- (String.lowercase_ascii input)
30- (String.lowercase_ascii decoded)
31- | Error _ ->
32- (* Some encoded values might not decode, that's ok for fuzz testing *)
33- check true)
34- | Error _ ->
35- (* Some inputs might not encode, that's ok *)
36- check true
3738(* Test ASCII-only strings (should pass through mostly unchanged) *)
39let test_ascii_string input =
···45 Char.chr (Char.code input.[i mod String.length input] mod 128))
46 in
47 if String.length ascii_only > 0 then
48- ignore (Punycode.encode_utf8 ascii_only)
49 end;
50 check true
5152(* Test inputs starting with ACE prefix "xn--" *)
53let test_ace_prefix input =
54 let ace_input = "xn--" ^ input in
55- ignore (Punycode.decode_utf8 ace_input);
56 check true
5758let () =
···910(* Test that encode_utf8 never crashes on arbitrary input *)
11let test_encode_no_crash input =
12+ (try ignore (Punycode.encode_utf8 input) with Punycode.Error _ -> ());
13 check true
1415(* Test that decode_utf8 never crashes on arbitrary input *)
16let test_decode_no_crash input =
17+ (try ignore (Punycode.decode_utf8 input) with Punycode.Error _ -> ());
18 check true
1920(* Test roundtrip: encode then decode should give back original (case-insensitive)
21 IDNA/Punycode lowercases ASCII characters during encoding per RFC 5891 *)
22let test_roundtrip input =
23+ (try
24+ let encoded = Punycode.encode_utf8 input in
25+ let decoded = Punycode.decode_utf8 encoded in
26+ (* Compare lowercase versions since IDNA lowercases ASCII *)
27+ check_eq ~pp:Format.pp_print_string
28+ (String.lowercase_ascii input)
29+ (String.lowercase_ascii decoded)
30+ with Punycode.Error _ ->
31+ (* Some inputs might not encode/decode, that's ok *)
32+ check true)
00003334(* Test ASCII-only strings (should pass through mostly unchanged) *)
35let test_ascii_string input =
···41 Char.chr (Char.code input.[i mod String.length input] mod 128))
42 in
43 if String.length ascii_only > 0 then
44+ (try ignore (Punycode.encode_utf8 ascii_only) with Punycode.Error _ -> ())
45 end;
46 check true
4748(* Test inputs starting with ACE prefix "xn--" *)
49let test_ace_prefix input =
50 let ace_input = "xn--" ^ input in
51+ (try ignore (Punycode.decode_utf8 ace_input) with Punycode.Error _ -> ());
52 check true
5354let () =
+178-211
lib/punycode.ml
···3031(* {1 Error Types} *)
3233-type error =
34 | Overflow of position
35 | Invalid_character of position * Uchar.t
36 | Invalid_digit of position * char
···39 | Label_too_long of int
40 | Empty_label
4142-let pp_error fmt = function
43 | Overflow pos ->
44 Format.fprintf fmt "arithmetic overflow at %a" pp_position pos
45 | Invalid_character (pos, u) ->
···57 max_label_length
58 | Empty_label -> Format.fprintf fmt "empty label"
5960-let error_to_string err = Format.asprintf "%a" pp_error err
0000006162(* {1 Error Constructors} *)
6364-let overflow pos = Error (Overflow pos)
65-let invalid_character pos u = Error (Invalid_character (pos, u))
66-let invalid_digit pos c = Error (Invalid_digit (pos, c))
67-let unexpected_end pos = Error (Unexpected_end pos)
68-let _invalid_utf8 pos = Error (Invalid_utf8 pos)
69-let label_too_long len = Error (Label_too_long len)
70-let empty_label = Error Empty_label
7172(* {1 Case Flags} *)
73···134let max_int_value = max_int
135136let safe_mul_add a b c pos =
137- if c = 0 then Ok a
138 else if b > (max_int_value - a) / c then overflow pos
139- else Ok (a + (b * c))
140141(* {1 UTF-8 to Code Points Conversion} *)
142···145 let acc = ref [] in
146 let byte_offset = ref 0 in
147 let char_index = ref 0 in
148- let error = ref None in
149- while !byte_offset < len && !error = None do
150 let pos = { byte_offset = !byte_offset; char_index = !char_index } in
151 let dec = String.get_utf_8_uchar s !byte_offset in
152 if Uchar.utf_decode_is_valid dec then begin
···154 byte_offset := !byte_offset + Uchar.utf_decode_length dec;
155 incr char_index
156 end
157- else begin
158- error := Some (Invalid_utf8 pos)
159- end
160 done;
161- match !error with
162- | Some e -> Error e
163- | None -> Ok (Array.of_list (List.rev !acc))
164165(* {1 Code Points to UTF-8 Conversion} *)
166···173174let encode_impl codepoints case_flags =
175 let input_length = Array.length codepoints in
176- if input_length = 0 then Ok ""
177 else begin
178 let output = Buffer.create (input_length * 2) in
179···210 let delta = ref 0 in
211 let bias = ref initial_bias in
212213- let result = ref (Ok ()) in
214-215- while !h < input_length && !result = Ok () do
216 (* Find minimum code point >= n *)
217 let m =
218 Array.fold_left
···224225 (* Increase delta to advance state to <m, 0> *)
226 let pos = { byte_offset = 0; char_index = !h } in
227- match safe_mul_add !delta (m - !n) (!h + 1) pos with
228- | Error e -> result := Error e
229- | Ok new_delta ->
230- delta := new_delta;
231- n := m;
00232233- (* Process each code point *)
234- let j = ref 0 in
235- while !j < input_length && !result = Ok () do
236- let cp = Uchar.to_int codepoints.(!j) in
237- let pos = { byte_offset = 0; char_index = !j } in
00000238239- if cp < !n then begin
240- incr delta;
241- if !delta = 0 then (* Overflow *)
242- result := overflow pos
000000000000000000243 end
244- else if cp = !n then begin
245- (* Encode delta as variable-length integer *)
246- let q = ref !delta in
247- let k = ref base in
248- let done_encoding = ref false in
249250- while not !done_encoding do
251- let t =
252- if !k <= !bias then tmin
253- else if !k >= !bias + tmax then tmax
254- else !k - !bias
255- in
256- if !q < t then begin
257- (* Output final digit *)
258- let case =
259- match case_flags with
260- | Some flags -> flags.(!j)
261- | None -> Lowercase
262- in
263- Buffer.add_char output (encode_digit !q case);
264- done_encoding := true
265- end
266- else begin
267- (* Output intermediate digit and continue *)
268- let digit = t + ((!q - t) mod (base - t)) in
269- Buffer.add_char output (encode_digit digit Lowercase);
270- q := (!q - t) / (base - t);
271- k := !k + base
272- end
273- done;
274275- bias := adapt ~delta:!delta ~numpoints:(!h + 1) ~firsttime:(!h = b);
276- delta := 0;
277- incr h
278- end;
279- incr j
280- done;
281-282- incr delta;
283- incr n
284 done;
285286- match !result with
287- | Error e -> Error e
288- | Ok () -> Ok (Buffer.contents output)
289 end
290291let encode codepoints = encode_impl codepoints None
···299300let decode_impl input =
301 let input_length = String.length input in
302- if input_length = 0 then Ok ([||], [||])
303 else begin
304 (* Find last delimiter *)
305 let b = Option.value ~default:0 (String.rindex_opt input delimiter) in
···307 (* Copy basic code points and extract case flags *)
308 let output = ref [] in
309 let case_output = ref [] in
310- let error = ref None in
311312 for j = 0 to b - 1 do
313- if !error = None then begin
314- let c = input.[j] in
315- let pos = { byte_offset = j; char_index = j } in
316- let code = Char.code c in
317- if code >= 0x80 then
318- error := Some (Invalid_character (pos, Uchar.of_int code))
319- else begin
320- output := Uchar.of_int code :: !output;
321- case_output :=
322- (if is_flagged c then Uppercase else Lowercase) :: !case_output
323- end
324 end
325 done;
326327- match !error with
328- | Some e -> Error e
329- | None -> (
330- let output = ref (Array.of_list (List.rev !output)) in
331- let case_output = ref (Array.of_list (List.rev !case_output)) in
332333- (* Main decoding loop *)
334- let n = ref initial_n in
335- let i = ref 0 in
336- let bias = ref initial_bias in
337- let in_pos = ref (if b > 0 then b + 1 else 0) in
338- let result = ref (Ok ()) in
339340- while !in_pos < input_length && !result = Ok () do
341- let oldi = !i in
342- let w = ref 1 in
343- let k = ref base in
344- let done_decoding = ref false in
345346- while (not !done_decoding) && !result = Ok () do
347- let pos =
348- { byte_offset = !in_pos; char_index = Array.length !output }
349- in
350351- if !in_pos >= input_length then begin
352- result := unexpected_end pos;
353- done_decoding := true
354- end
355- else begin
356- let c = input.[!in_pos] in
357- incr in_pos;
358359- match decode_digit c with
360- | None ->
361- result := invalid_digit pos c;
362- done_decoding := true
363- | Some digit -> (
364- (* i = i + digit * w, with overflow check *)
365- match safe_mul_add !i digit !w pos with
366- | Error e ->
367- result := Error e;
368- done_decoding := true
369- | Ok new_i ->
370- i := new_i;
371372- let t =
373- if !k <= !bias then tmin
374- else if !k >= !bias + tmax then tmax
375- else !k - !bias
376- in
377378- if digit < t then begin
379- (* Record case flag from this final digit *)
380- done_decoding := true
381- end
382- else begin
383- (* w = w * (base - t), with overflow check *)
384- let base_minus_t = base - t in
385- if !w > max_int_value / base_minus_t then begin
386- result := overflow pos;
387- done_decoding := true
388- end
389- else begin
390- w := !w * base_minus_t;
391- k := !k + base
392- end
393- end)
394- end
395- done;
396397- if !result = Ok () then begin
398- let out_len = Array.length !output in
399- bias :=
400- adapt ~delta:(!i - oldi) ~numpoints:(out_len + 1)
401- ~firsttime:(oldi = 0);
402403- let pos = { byte_offset = !in_pos - 1; char_index = out_len } in
404405- (* n = n + i / (out_len + 1), with overflow check *)
406- let increment = !i / (out_len + 1) in
407- if increment > max_int_value - !n then result := overflow pos
408- else begin
409- n := !n + increment;
410- i := !i mod (out_len + 1);
411412- (* Validate that n is a valid Unicode scalar value *)
413- if not (Uchar.is_valid !n) then
414- result := invalid_character pos Uchar.rep
415- else begin
416- (* Insert n at position i *)
417- let new_output = Array.make (out_len + 1) (Uchar.of_int 0) in
418- let new_case = Array.make (out_len + 1) Lowercase in
419420- for j = 0 to !i - 1 do
421- new_output.(j) <- !output.(j);
422- new_case.(j) <- !case_output.(j)
423- done;
424- new_output.(!i) <- Uchar.of_int !n;
425- (* Case flag from final digit of this delta *)
426- new_case.(!i) <-
427- (if !in_pos > 0 && is_flagged input.[!in_pos - 1] then
428- Uppercase
429- else Lowercase);
430- for j = !i to out_len - 1 do
431- new_output.(j + 1) <- !output.(j);
432- new_case.(j + 1) <- !case_output.(j)
433- done;
434435- output := new_output;
436- case_output := new_case;
437- incr i
438- end
439- end
440- end
441- done;
442443- match !result with
444- | Error e -> Error e
445- | Ok () -> Ok (!output, !case_output))
446 end
447448-let decode input = Result.map fst (decode_impl input)
449let decode_with_case input = decode_impl input
450451(* {1 UTF-8 String Operations} *)
452453let encode_utf8 s =
454- let open Result.Syntax in
455- let* codepoints = utf8_to_codepoints s in
456 encode codepoints
457458let decode_utf8 punycode =
459- let open Result.Syntax in
460- let+ codepoints = decode punycode in
461 codepoints_to_utf8 codepoints
462463(* {1 Domain Label Operations} *)
464465let encode_label label =
466- if String.length label = 0 then empty_label
467 else if is_ascii_string label then begin
468 (* All ASCII - return as-is, but check length *)
469 let len = String.length label in
470- if len > max_label_length then label_too_long len else Ok label
471 end
472- else
473 (* Has non-ASCII - encode with Punycode *)
474- let open Result.Syntax in
475- let* encoded = encode_utf8 label in
476 let result = ace_prefix ^ encoded in
477 let len = String.length result in
478- if len > max_label_length then label_too_long len else Ok result
0479480let decode_label label =
481- if String.length label = 0 then empty_label
482 else if has_ace_prefix label then begin
483 (* Remove ACE prefix and decode *)
484 let punycode = String.sub label 4 (String.length label - 4) in
···486 end
487 else begin
488 (* No ACE prefix - validate and return *)
489- if is_ascii_string label then Ok label
490 else
491 (* Has non-ASCII but no ACE prefix - return as-is *)
492- Ok label
493 end
···3031(* {1 Error Types} *)
3233+type error_reason =
34 | Overflow of position
35 | Invalid_character of position * Uchar.t
36 | Invalid_digit of position * char
···39 | Label_too_long of int
40 | Empty_label
4142+let pp_error_reason fmt = function
43 | Overflow pos ->
44 Format.fprintf fmt "arithmetic overflow at %a" pp_position pos
45 | Invalid_character (pos, u) ->
···57 max_label_length
58 | Empty_label -> Format.fprintf fmt "empty label"
5960+exception Error of error_reason
61+62+let () = Printexc.register_printer (function
63+ | Error reason -> Some (Format.asprintf "Punycode.Error: %a" pp_error_reason reason)
64+ | _ -> None)
65+66+let error_reason_to_string reason = Format.asprintf "%a" pp_error_reason reason
6768(* {1 Error Constructors} *)
6970+let overflow pos = raise (Error (Overflow pos))
71+let invalid_character pos u = raise (Error (Invalid_character (pos, u)))
72+let invalid_digit pos c = raise (Error (Invalid_digit (pos, c)))
73+let unexpected_end pos = raise (Error (Unexpected_end pos))
74+let invalid_utf8 pos = raise (Error (Invalid_utf8 pos))
75+let label_too_long len = raise (Error (Label_too_long len))
76+let empty_label () = raise (Error Empty_label)
7778(* {1 Case Flags} *)
79···140let max_int_value = max_int
141142let safe_mul_add a b c pos =
143+ if c = 0 then a
144 else if b > (max_int_value - a) / c then overflow pos
145+ else a + (b * c)
146147(* {1 UTF-8 to Code Points Conversion} *)
148···151 let acc = ref [] in
152 let byte_offset = ref 0 in
153 let char_index = ref 0 in
154+ while !byte_offset < len do
0155 let pos = { byte_offset = !byte_offset; char_index = !char_index } in
156 let dec = String.get_utf_8_uchar s !byte_offset in
157 if Uchar.utf_decode_is_valid dec then begin
···159 byte_offset := !byte_offset + Uchar.utf_decode_length dec;
160 incr char_index
161 end
162+ else invalid_utf8 pos
00163 done;
164+ Array.of_list (List.rev !acc)
00165166(* {1 Code Points to UTF-8 Conversion} *)
167···174175let encode_impl codepoints case_flags =
176 let input_length = Array.length codepoints in
177+ if input_length = 0 then ""
178 else begin
179 let output = Buffer.create (input_length * 2) in
180···211 let delta = ref 0 in
212 let bias = ref initial_bias in
213214+ while !h < input_length do
00215 (* Find minimum code point >= n *)
216 let m =
217 Array.fold_left
···223224 (* Increase delta to advance state to <m, 0> *)
225 let pos = { byte_offset = 0; char_index = !h } in
226+ delta := safe_mul_add !delta (m - !n) (!h + 1) pos;
227+ n := m;
228+229+ (* Process each code point *)
230+ for j = 0 to input_length - 1 do
231+ let cp = Uchar.to_int codepoints.(j) in
232+ let pos = { byte_offset = 0; char_index = j } in
233234+ if cp < !n then begin
235+ incr delta;
236+ if !delta = 0 then (* Overflow *)
237+ overflow pos
238+ end
239+ else if cp = !n then begin
240+ (* Encode delta as variable-length integer *)
241+ let q = ref !delta in
242+ let k = ref base in
243+ let done_encoding = ref false in
244245+ while not !done_encoding do
246+ let t =
247+ if !k <= !bias then tmin
248+ else if !k >= !bias + tmax then tmax
249+ else !k - !bias
250+ in
251+ if !q < t then begin
252+ (* Output final digit *)
253+ let case =
254+ match case_flags with
255+ | Some flags -> flags.(j)
256+ | None -> Lowercase
257+ in
258+ Buffer.add_char output (encode_digit !q case);
259+ done_encoding := true
260+ end
261+ else begin
262+ (* Output intermediate digit and continue *)
263+ let digit = t + ((!q - t) mod (base - t)) in
264+ Buffer.add_char output (encode_digit digit Lowercase);
265+ q := (!q - t) / (base - t);
266+ k := !k + base
267 end
268+ done;
0000269270+ bias := adapt ~delta:!delta ~numpoints:(!h + 1) ~firsttime:(!h = b);
271+ delta := 0;
272+ incr h
273+ end
274+ done;
0000000000000000000275276+ incr delta;
277+ incr n
0000000278 done;
279280+ Buffer.contents output
00281 end
282283let encode codepoints = encode_impl codepoints None
···291292let decode_impl input =
293 let input_length = String.length input in
294+ if input_length = 0 then ([||], [||])
295 else begin
296 (* Find last delimiter *)
297 let b = Option.value ~default:0 (String.rindex_opt input delimiter) in
···299 (* Copy basic code points and extract case flags *)
300 let output = ref [] in
301 let case_output = ref [] in
0302303 for j = 0 to b - 1 do
304+ let c = input.[j] in
305+ let pos = { byte_offset = j; char_index = j } in
306+ let code = Char.code c in
307+ if code >= 0x80 then
308+ invalid_character pos (Uchar.of_int code)
309+ else begin
310+ output := Uchar.of_int code :: !output;
311+ case_output :=
312+ (if is_flagged c then Uppercase else Lowercase) :: !case_output
00313 end
314 done;
315316+ let output = ref (Array.of_list (List.rev !output)) in
317+ let case_output = ref (Array.of_list (List.rev !case_output)) in
000318319+ (* Main decoding loop *)
320+ let n = ref initial_n in
321+ let i = ref 0 in
322+ let bias = ref initial_bias in
323+ let in_pos = ref (if b > 0 then b + 1 else 0) in
0324325+ while !in_pos < input_length do
326+ let oldi = !i in
327+ let w = ref 1 in
328+ let k = ref base in
329+ let done_decoding = ref false in
330331+ while not !done_decoding do
332+ let pos =
333+ { byte_offset = !in_pos; char_index = Array.length !output }
334+ in
335336+ if !in_pos >= input_length then
337+ unexpected_end pos
338+ else begin
339+ let c = input.[!in_pos] in
340+ incr in_pos;
00341342+ match decode_digit c with
343+ | None -> invalid_digit pos c
344+ | Some digit ->
345+ (* i = i + digit * w, with overflow check *)
346+ i := safe_mul_add !i digit !w pos;
0000000347348+ let t =
349+ if !k <= !bias then tmin
350+ else if !k >= !bias + tmax then tmax
351+ else !k - !bias
352+ in
353354+ if digit < t then
355+ (* Record case flag from this final digit *)
356+ done_decoding := true
357+ else begin
358+ (* w = w * (base - t), with overflow check *)
359+ let base_minus_t = base - t in
360+ if !w > max_int_value / base_minus_t then
361+ overflow pos
362+ else begin
363+ w := !w * base_minus_t;
364+ k := !k + base
365+ end
366+ end
367+ end
368+ done;
000369370+ let out_len = Array.length !output in
371+ bias :=
372+ adapt ~delta:(!i - oldi) ~numpoints:(out_len + 1)
373+ ~firsttime:(oldi = 0);
0374375+ let pos = { byte_offset = !in_pos - 1; char_index = out_len } in
376377+ (* n = n + i / (out_len + 1), with overflow check *)
378+ let increment = !i / (out_len + 1) in
379+ if increment > max_int_value - !n then overflow pos
380+ else begin
381+ n := !n + increment;
382+ i := !i mod (out_len + 1);
383384+ (* Validate that n is a valid Unicode scalar value *)
385+ if not (Uchar.is_valid !n) then
386+ invalid_character pos Uchar.rep
387+ else begin
388+ (* Insert n at position i *)
389+ let new_output = Array.make (out_len + 1) (Uchar.of_int 0) in
390+ let new_case = Array.make (out_len + 1) Lowercase in
391392+ for j = 0 to !i - 1 do
393+ new_output.(j) <- !output.(j);
394+ new_case.(j) <- !case_output.(j)
395+ done;
396+ new_output.(!i) <- Uchar.of_int !n;
397+ (* Case flag from final digit of this delta *)
398+ new_case.(!i) <-
399+ (if !in_pos > 0 && is_flagged input.[!in_pos - 1] then
400+ Uppercase
401+ else Lowercase);
402+ for j = !i to out_len - 1 do
403+ new_output.(j + 1) <- !output.(j);
404+ new_case.(j + 1) <- !case_output.(j)
405+ done;
406407+ output := new_output;
408+ case_output := new_case;
409+ incr i
410+ end
411+ end
412+ done;
0413414+ (!output, !case_output)
00415 end
416417+let decode input = fst (decode_impl input)
418let decode_with_case input = decode_impl input
419420(* {1 UTF-8 String Operations} *)
421422let encode_utf8 s =
423+ let codepoints = utf8_to_codepoints s in
0424 encode codepoints
425426let decode_utf8 punycode =
427+ let codepoints = decode punycode in
0428 codepoints_to_utf8 codepoints
429430(* {1 Domain Label Operations} *)
431432let encode_label label =
433+ if String.length label = 0 then empty_label ()
434 else if is_ascii_string label then begin
435 (* All ASCII - return as-is, but check length *)
436 let len = String.length label in
437+ if len > max_label_length then label_too_long len else label
438 end
439+ else begin
440 (* Has non-ASCII - encode with Punycode *)
441+ let encoded = encode_utf8 label in
0442 let result = ace_prefix ^ encoded in
443 let len = String.length result in
444+ if len > max_label_length then label_too_long len else result
445+ end
446447let decode_label label =
448+ if String.length label = 0 then empty_label ()
449 else if has_ace_prefix label then begin
450 (* Remove ACE prefix and decode *)
451 let punycode = String.sub label 4 (String.length label - 4) in
···453 end
454 else begin
455 (* No ACE prefix - validate and return *)
456+ if is_ascii_string label then label
457 else
458 (* Has non-ASCII but no ACE prefix - return as-is *)
459+ label
460 end
+41-25
lib/punycode.mli
···3839(** {1 Error Types} *)
4041-type error =
42 | Overflow of position
43 (** Arithmetic overflow during encode/decode. This can occur with very
44 long strings or extreme Unicode code point values. See
···66 is the actual length. *)
67 | Empty_label (** Empty label is not valid for encoding. *)
6869-val pp_error : Format.formatter -> error -> unit
70-(** [pp_error fmt e] pretty-prints an error with position information. *)
0007172-val error_to_string : error -> string
73-(** [error_to_string e] converts an error to a human-readable string. *)
7475(** {1 Constants}
76···108 6}. They operate on arrays of Unicode code points ([Uchar.t array]). The
109 encoded output is a plain ASCII string without the ACE prefix. *)
110111-val encode : Uchar.t array -> (string, error) result
112(** [encode codepoints] encodes an array of Unicode code points to a Punycode
113 ASCII string.
114···123 are any basic code points 3. Non-basic code points are encoded as deltas
124 using the generalized variable-length integer representation from
125 {{:https://datatracker.ietf.org/doc/html/rfc3492#section-3.3}Section 3.3}
00126127 Example:
128 {[
129 encode [| Uchar.of_int 0x4ED6; Uchar.of_int 0x4EEC; ... |]
130- (* = Ok "ihqwcrb4cv8a8dqg056pqjye" *)
131 ]} *)
132133-val decode : string -> (Uchar.t array, error) result
134(** [decode punycode] decodes a Punycode ASCII string to an array of Unicode
135 code points.
136···144 5}: "A decoder MUST recognize the letters in both uppercase and lowercase
145 forms".
14600147 Example:
148 {[
149 decode "ihqwcrb4cv8a8dqg056pqjye"
150- (* = Ok [| U+4ED6; U+4EEC; U+4E3A; ... |] (Chinese simplified) *)
151 ]} *)
152153(** {1 Mixed-Case Annotation}
···156 {{:https://datatracker.ietf.org/doc/html/rfc3492#appendix-A}RFC 3492
157 Appendix A}. *)
158159-val encode_with_case :
160- Uchar.t array -> case_flag array -> (string, error) result
161(** [encode_with_case codepoints case_flags] encodes with case annotation.
162163 Per
···169170 The [case_flags] array must have the same length as [codepoints].
171172- @raise Invalid_argument if array lengths don't match. *)
0173174-val decode_with_case : string -> (Uchar.t array * case_flag array, error) result
175(** [decode_with_case punycode] decodes and extracts case annotations.
176177 Per
178 {{:https://datatracker.ietf.org/doc/html/rfc3492#appendix-A}RFC 3492
179 Appendix A}, returns both the decoded code points and an array of case
180 flags indicating the suggested case for each character based on the
181- uppercase/lowercase form of the encoding digits. *)
00182183(** {1 UTF-8 String Operations}
184185 Convenience functions that work directly with UTF-8 encoded OCaml strings.
186 These combine UTF-8 decoding/encoding with the core Punycode operations. *)
187188-val encode_utf8 : string -> (string, error) result
189(** [encode_utf8 s] encodes a UTF-8 string to Punycode (no ACE prefix).
190191 This is equivalent to decoding [s] from UTF-8 to code points, then calling
192 {!encode}.
19300194 Example:
195 {[
196 encode_utf8 "münchen"
197- (* = Ok "mnchen-3ya" *)
198 ]} *)
199200-val decode_utf8 : string -> (string, error) result
201(** [decode_utf8 punycode] decodes Punycode to a UTF-8 string (no ACE prefix).
202203 This is equivalent to calling {!decode} then encoding the result as UTF-8.
20400205 Example:
206 {[
207 decode_utf8 "mnchen-3ya"
208- (* = Ok "münchen" *)
209 ]} *)
210211(** {1 Domain Label Operations}
···214 length limits per
215 {{:https://datatracker.ietf.org/doc/html/rfc1035}RFC 1035}. *)
216217-val encode_label : string -> (string, error) result
218(** [encode_label label] encodes a domain label for use in DNS.
219220 If the label contains only ASCII characters, it is returned unchanged.
···223 {{:https://datatracker.ietf.org/doc/html/rfc3492#section-5} RFC 3492 Section
224 5}.
225226- Returns {!Error} {!Label_too_long} if the result exceeds 63 bytes.
0227228 Example:
229 {[
230 encode_label "münchen"
231- (* = Ok "xn--mnchen-3ya" *)
232 encode_label "example"
233- (* = Ok "example" *)
234 ]} *)
235236-val decode_label : string -> (string, error) result
237(** [decode_label label] decodes a domain label.
238239 If the label starts with the ACE prefix ("xn--", case-insensitive), it is
240 Punycode-decoded. Otherwise, it is returned unchanged.
00241242 Example:
243 {[
244 decode_label "xn--mnchen-3ya"
245- (* = Ok "münchen" *)
246 decode_label "example"
247- (* = Ok "example" *)
248 ]} *)
249250(** {1 Validation}
···3839(** {1 Error Types} *)
4041+type error_reason =
42 | Overflow of position
43 (** Arithmetic overflow during encode/decode. This can occur with very
44 long strings or extreme Unicode code point values. See
···66 is the actual length. *)
67 | Empty_label (** Empty label is not valid for encoding. *)
6869+exception Error of error_reason
70+(** Exception raised for all Punycode encoding/decoding errors. *)
71+72+val pp_error_reason : Format.formatter -> error_reason -> unit
73+(** [pp_error_reason fmt e] pretty-prints an error with position information. *)
7475+val error_reason_to_string : error_reason -> string
76+(** [error_reason_to_string e] converts an error to a human-readable string. *)
7778(** {1 Constants}
79···111 6}. They operate on arrays of Unicode code points ([Uchar.t array]). The
112 encoded output is a plain ASCII string without the ACE prefix. *)
113114+val encode : Uchar.t array -> string
115(** [encode codepoints] encodes an array of Unicode code points to a Punycode
116 ASCII string.
117···126 are any basic code points 3. Non-basic code points are encoded as deltas
127 using the generalized variable-length integer representation from
128 {{:https://datatracker.ietf.org/doc/html/rfc3492#section-3.3}Section 3.3}
129+130+ @raise Error on encoding failure (overflow, etc.)
131132 Example:
133 {[
134 encode [| Uchar.of_int 0x4ED6; Uchar.of_int 0x4EEC; ... |]
135+ (* = "ihqwcrb4cv8a8dqg056pqjye" *)
136 ]} *)
137138+val decode : string -> Uchar.t array
139(** [decode punycode] decodes a Punycode ASCII string to an array of Unicode
140 code points.
141···149 5}: "A decoder MUST recognize the letters in both uppercase and lowercase
150 forms".
151152+ @raise Error on decoding failure (invalid digit, unexpected end, etc.)
153+154 Example:
155 {[
156 decode "ihqwcrb4cv8a8dqg056pqjye"
157+ (* = [| U+4ED6; U+4EEC; U+4E3A; ... |] (Chinese simplified) *)
158 ]} *)
159160(** {1 Mixed-Case Annotation}
···163 {{:https://datatracker.ietf.org/doc/html/rfc3492#appendix-A}RFC 3492
164 Appendix A}. *)
165166+val encode_with_case : Uchar.t array -> case_flag array -> string
0167(** [encode_with_case codepoints case_flags] encodes with case annotation.
168169 Per
···175176 The [case_flags] array must have the same length as [codepoints].
177178+ @raise Invalid_argument if array lengths don't match.
179+ @raise Error on encoding failure. *)
180181+val decode_with_case : string -> Uchar.t array * case_flag array
182(** [decode_with_case punycode] decodes and extracts case annotations.
183184 Per
185 {{:https://datatracker.ietf.org/doc/html/rfc3492#appendix-A}RFC 3492
186 Appendix A}, returns both the decoded code points and an array of case
187 flags indicating the suggested case for each character based on the
188+ uppercase/lowercase form of the encoding digits.
189+190+ @raise Error on decoding failure. *)
191192(** {1 UTF-8 String Operations}
193194 Convenience functions that work directly with UTF-8 encoded OCaml strings.
195 These combine UTF-8 decoding/encoding with the core Punycode operations. *)
196197+val encode_utf8 : string -> string
198(** [encode_utf8 s] encodes a UTF-8 string to Punycode (no ACE prefix).
199200 This is equivalent to decoding [s] from UTF-8 to code points, then calling
201 {!encode}.
202203+ @raise Error on encoding failure.
204+205 Example:
206 {[
207 encode_utf8 "münchen"
208+ (* = "mnchen-3ya" *)
209 ]} *)
210211+val decode_utf8 : string -> string
212(** [decode_utf8 punycode] decodes Punycode to a UTF-8 string (no ACE prefix).
213214 This is equivalent to calling {!decode} then encoding the result as UTF-8.
215216+ @raise Error on decoding failure.
217+218 Example:
219 {[
220 decode_utf8 "mnchen-3ya"
221+ (* = "münchen" *)
222 ]} *)
223224(** {1 Domain Label Operations}
···227 length limits per
228 {{:https://datatracker.ietf.org/doc/html/rfc1035}RFC 1035}. *)
229230+val encode_label : string -> string
231(** [encode_label label] encodes a domain label for use in DNS.
232233 If the label contains only ASCII characters, it is returned unchanged.
···236 {{:https://datatracker.ietf.org/doc/html/rfc3492#section-5} RFC 3492 Section
237 5}.
238239+ @raise Error with {!Label_too_long} if the result exceeds 63 bytes.
240+ @raise Error with {!Empty_label} if the label is empty.
241242 Example:
243 {[
244 encode_label "münchen"
245+ (* = "xn--mnchen-3ya" *)
246 encode_label "example"
247+ (* = "example" *)
248 ]} *)
249250+val decode_label : string -> string
251(** [decode_label label] decodes a domain label.
252253 If the label starts with the ACE prefix ("xn--", case-insensitive), it is
254 Punycode-decoded. Otherwise, it is returned unchanged.
255+256+ @raise Error on decoding failure.
257258 Example:
259 {[
260 decode_label "xn--mnchen-3ya"
261+ (* = "münchen" *)
262 decode_label "example"
263+ (* = "example" *)
264 ]} *)
265266(** {1 Validation}
+47-53
lib/punycode_idna.ml
···910(* {1 Error Types} *)
1112-type error =
13- | Punycode_error of Punycode.error
14 | Invalid_label of string
15 | Domain_too_long of int
16 | Normalization_failed
17 | Verification_failed
1819-let pp_error fmt = function
20 | Punycode_error e ->
21- Format.fprintf fmt "Punycode error: %a" Punycode.pp_error e
22 | Invalid_label msg -> Format.fprintf fmt "invalid label: %s" msg
23 | Domain_too_long len ->
24 Format.fprintf fmt "domain too long: %d bytes (max %d)" len
···27 | Verification_failed ->
28 Format.fprintf fmt "IDNA verification failed (round-trip mismatch)"
2930-let error_to_string err = Format.asprintf "%a" pp_error err
0000003132(* {1 Error Constructors} *)
3334-let punycode_error e = Error (Punycode_error e)
35-let invalid_label msg = Error (Invalid_label msg)
36-let domain_too_long len = Error (Domain_too_long len)
37-let _normalization_failed = Error Normalization_failed
38-let verification_failed = Error Verification_failed
3940(* {1 Unicode Normalization} *)
41···82 invalid_label "STD3 rules violation"
83 else if check_hyphens && not (check_hyphen_rules label) then
84 invalid_label "invalid hyphen placement"
85- else Ok label
86 end
87 else begin
88 (* Has non-ASCII - normalize and encode *)
89 let normalized = normalize_nfc label in
9091 (* Encode to Punycode *)
92- match Punycode.encode_utf8 normalized with
93- | Error e -> punycode_error e
94- | Ok encoded -> (
95- let result = Punycode.ace_prefix ^ encoded in
96- let result_len = String.length result in
97- if result_len > Punycode.max_label_length then
98- punycode_error (Punycode.Label_too_long result_len)
99- else if check_hyphens && not (check_hyphen_rules result) then
100- invalid_label "invalid hyphen placement in encoded label"
101- else
102- (* Verification: decode and compare to original normalized form *)
103- match Punycode.decode_utf8 encoded with
104- | Error _ -> verification_failed
105- | Ok decoded ->
106- if decoded <> normalized then verification_failed else Ok result)
00107 end
108109let label_to_ascii ?(check_hyphens = true) ?(use_std3_rules = false) label =
···112let label_to_unicode label =
113 if is_ace_label label then begin
114 let encoded = String.sub label 4 (String.length label - 4) in
115- match Punycode.decode_utf8 encoded with
116- | Error e -> punycode_error e
117- | Ok decoded -> Ok decoded
118 end
119- else Ok label
120121(* {1 Domain Operations} *)
122···126(* Join labels into domain *)
127let join_labels labels = String.concat "." labels
128129-(* Map a function returning Result over a list, short-circuiting on first Error *)
130-let map_result f lst =
131- List.fold_right
132- (fun x acc ->
133- let open Result.Syntax in
134- let* y = f x in
135- let+ ys = acc in
136- y :: ys)
137- lst (Ok [])
138-139let to_ascii ?(check_hyphens = true) ?(check_bidi = false)
140 ?(check_joiners = false) ?(use_std3_rules = false) ?(transitional = false)
141 domain =
···145 let _ = check_joiners in
146 let _ = transitional in
147148- let open Result.Syntax in
149 let labels = split_domain domain in
150- let* encoded_labels =
151- map_result (label_to_ascii_impl ~check_hyphens ~use_std3_rules) labels
152 in
153 let result = join_labels encoded_labels in
154 let len = String.length result in
155- if len > max_domain_length then domain_too_long len else Ok result
156157let to_unicode domain =
158- let open Result.Syntax in
159 let labels = split_domain domain in
160- let+ decoded_labels = map_result label_to_unicode labels in
161 join_labels decoded_labels
162163(* {1 Domain Name Library Integration} *)
164165let domain_to_ascii ?(check_hyphens = true) ?(use_std3_rules = false) domain =
166- let open Result.Syntax in
167 let s = Domain_name.to_string domain in
168- let* ascii = to_ascii ~check_hyphens ~use_std3_rules s in
169 match Domain_name.of_string ascii with
170 | Error (`Msg msg) -> invalid_label msg
171- | Ok d -> Ok d
172173let domain_to_unicode domain =
174- let open Result.Syntax in
175 let s = Domain_name.to_string domain in
176- let* unicode = to_unicode s in
177 match Domain_name.of_string unicode with
178 | Error (`Msg msg) -> invalid_label msg
179- | Ok d -> Ok d
180181(* {1 Validation} *)
182183-let is_idna_valid domain = Result.is_ok (to_ascii domain)
00
···910(* {1 Error Types} *)
1112+type error_reason =
13+ | Punycode_error of Punycode.error_reason
14 | Invalid_label of string
15 | Domain_too_long of int
16 | Normalization_failed
17 | Verification_failed
1819+let pp_error_reason fmt = function
20 | Punycode_error e ->
21+ Format.fprintf fmt "Punycode error: %a" Punycode.pp_error_reason e
22 | Invalid_label msg -> Format.fprintf fmt "invalid label: %s" msg
23 | Domain_too_long len ->
24 Format.fprintf fmt "domain too long: %d bytes (max %d)" len
···27 | Verification_failed ->
28 Format.fprintf fmt "IDNA verification failed (round-trip mismatch)"
2930+exception Error of error_reason
31+32+let () = Printexc.register_printer (function
33+ | Error reason -> Some (Format.asprintf "Punycode_idna.Error: %a" pp_error_reason reason)
34+ | _ -> None)
35+36+let error_reason_to_string reason = Format.asprintf "%a" pp_error_reason reason
3738(* {1 Error Constructors} *)
3940+let punycode_error e = raise (Error (Punycode_error e))
41+let invalid_label msg = raise (Error (Invalid_label msg))
42+let domain_too_long len = raise (Error (Domain_too_long len))
43+let verification_failed () = raise (Error Verification_failed)
04445(* {1 Unicode Normalization} *)
46···87 invalid_label "STD3 rules violation"
88 else if check_hyphens && not (check_hyphen_rules label) then
89 invalid_label "invalid hyphen placement"
90+ else label
91 end
92 else begin
93 (* Has non-ASCII - normalize and encode *)
94 let normalized = normalize_nfc label in
9596 (* Encode to Punycode *)
97+ let encoded =
98+ try Punycode.encode_utf8 normalized
99+ with Punycode.Error e -> punycode_error e
100+ in
101+ let result = Punycode.ace_prefix ^ encoded in
102+ let result_len = String.length result in
103+ if result_len > Punycode.max_label_length then
104+ punycode_error (Punycode.Label_too_long result_len)
105+ else if check_hyphens && not (check_hyphen_rules result) then
106+ invalid_label "invalid hyphen placement in encoded label"
107+ else
108+ (* Verification: decode and compare to original normalized form *)
109+ let decoded =
110+ try Punycode.decode_utf8 encoded
111+ with Punycode.Error _ -> verification_failed ()
112+ in
113+ if decoded <> normalized then verification_failed () else result
114 end
115116let label_to_ascii ?(check_hyphens = true) ?(use_std3_rules = false) label =
···119let label_to_unicode label =
120 if is_ace_label label then begin
121 let encoded = String.sub label 4 (String.length label - 4) in
122+ try Punycode.decode_utf8 encoded
123+ with Punycode.Error e -> punycode_error e
0124 end
125+ else label
126127(* {1 Domain Operations} *)
128···132(* Join labels into domain *)
133let join_labels labels = String.concat "." labels
1340000000000135let to_ascii ?(check_hyphens = true) ?(check_bidi = false)
136 ?(check_joiners = false) ?(use_std3_rules = false) ?(transitional = false)
137 domain =
···141 let _ = check_joiners in
142 let _ = transitional in
1430144 let labels = split_domain domain in
145+ let encoded_labels =
146+ List.map (label_to_ascii_impl ~check_hyphens ~use_std3_rules) labels
147 in
148 let result = join_labels encoded_labels in
149 let len = String.length result in
150+ if len > max_domain_length then domain_too_long len else result
151152let to_unicode domain =
0153 let labels = split_domain domain in
154+ let decoded_labels = List.map label_to_unicode labels in
155 join_labels decoded_labels
156157(* {1 Domain Name Library Integration} *)
158159let domain_to_ascii ?(check_hyphens = true) ?(use_std3_rules = false) domain =
0160 let s = Domain_name.to_string domain in
161+ let ascii = to_ascii ~check_hyphens ~use_std3_rules s in
162 match Domain_name.of_string ascii with
163 | Error (`Msg msg) -> invalid_label msg
164+ | Ok d -> d
165166let domain_to_unicode domain =
0167 let s = Domain_name.to_string domain in
168+ let unicode = to_unicode s in
169 match Domain_name.of_string unicode with
170 | Error (`Msg msg) -> invalid_label msg
171+ | Ok d -> d
172173(* {1 Validation} *)
174175+let is_idna_valid domain =
176+ try ignore (to_ascii domain); true
177+ with Error _ -> false
+35-25
lib/punycode_idna.mli
···2627(** {1 Error Types} *)
2829-type error =
30- | Punycode_error of Punycode.error
31- (** Error during Punycode encoding/decoding. See {!Punycode.error} for
32- details. *)
33 | Invalid_label of string
34 (** Label violates IDNA constraints. The string describes the violation.
35 See
···49 Section 4.2}, the result of encoding must decode back to the original
50 input. *)
5152-val pp_error : Format.formatter -> error -> unit
53-(** [pp_error fmt e] pretty-prints an error. *)
5455-val error_to_string : error -> string
56-(** [error_to_string e] converts an error to a human-readable string. *)
0005758(** {1 Constants} *)
59···77 ?use_std3_rules:bool ->
78 ?transitional:bool ->
79 string ->
80- (string, error) result
81(** [to_ascii domain] converts an internationalized domain name to ASCII.
8283 Implements the ToASCII operation from
···105 {{:https://datatracker.ietf.org/doc/html/rfc5891#section-4.2.3.2}Section
106 4.2.3.2} (default: false)
107 - [transitional]: Use IDNA 2003 transitional processing (default: false)
00108109 Example:
110 {[
111 to_ascii "münchen.example.com"
112- (* = Ok "xn--mnchen-3ya.example.com" *)
113 ]} *)
114115-val label_to_ascii :
116- ?check_hyphens:bool ->
117- ?use_std3_rules:bool ->
118- string ->
119- (string, error) result
120(** [label_to_ascii label] converts a single label to ASCII.
121122 This implements the core ToASCII operation for one label, as described in
123 {{:https://datatracker.ietf.org/doc/html/rfc5891#section-4.1}RFC 5891
124- Section 4.1}. *)
00125126(** {1 ToUnicode Operation}
127···131 {{:https://datatracker.ietf.org/doc/html/rfc5891#section-4.2} RFC 5891
132 Section 4.2} for the complete ToUnicode specification. *)
133134-val to_unicode : string -> (string, error) result
135(** [to_unicode domain] converts an ACE domain name to Unicode.
136137 Implements the ToUnicode operation from
···143 {{:https://datatracker.ietf.org/doc/html/rfc3492#section-6.2}RFC 3492
144 Section 6.2} 2. Otherwise, pass through unchanged
14500146 Example:
147 {[
148 to_unicode "xn--mnchen-3ya.example.com"
149- (* = Ok "münchen.example.com" *)
150 ]} *)
151152-val label_to_unicode : string -> (string, error) result
153(** [label_to_unicode label] converts a single ACE label to Unicode.
154155 This implements the core ToUnicode operation for one label, as described in
156 {{:https://datatracker.ietf.org/doc/html/rfc5891#section-4.2}RFC 5891
157- Section 4.2}. *)
00158159(** {1 Domain Name Integration}
160···168 ?check_hyphens:bool ->
169 ?use_std3_rules:bool ->
170 [ `raw ] Domain_name.t ->
171- ([ `raw ] Domain_name.t, error) result
172(** [domain_to_ascii domain] converts a domain name to ASCII form.
173174 Applies {!to_ascii} to the string representation and returns the result as a
175 [Domain_name.t].
17600177 Example:
178 {[
179 let d = Domain_name.of_string_exn "münchen.example.com" in
180 domain_to_ascii d
181- (* = Ok (Domain_name.of_string_exn "xn--mnchen-3ya.example.com") *)
182 ]} *)
183184-val domain_to_unicode :
185- [ `raw ] Domain_name.t -> ([ `raw ] Domain_name.t, error) result
186(** [domain_to_unicode domain] converts a domain name to Unicode form.
187188 Applies {!to_unicode} to the string representation and returns the result as
189- a [Domain_name.t]. *)
00190191(** {1 Validation} *)
192
···2627(** {1 Error Types} *)
2829+type error_reason =
30+ | Punycode_error of Punycode.error_reason
31+ (** Error during Punycode encoding/decoding. See {!Punycode.error_reason}
32+ for details. *)
33 | Invalid_label of string
34 (** Label violates IDNA constraints. The string describes the violation.
35 See
···49 Section 4.2}, the result of encoding must decode back to the original
50 input. *)
5152+exception Error of error_reason
53+(** Exception raised for all IDNA processing errors. *)
5455+val pp_error_reason : Format.formatter -> error_reason -> unit
56+(** [pp_error_reason fmt e] pretty-prints an error. *)
57+58+val error_reason_to_string : error_reason -> string
59+(** [error_reason_to_string e] converts an error to a human-readable string. *)
6061(** {1 Constants} *)
62···80 ?use_std3_rules:bool ->
81 ?transitional:bool ->
82 string ->
83+ string
84(** [to_ascii domain] converts an internationalized domain name to ASCII.
8586 Implements the ToASCII operation from
···108 {{:https://datatracker.ietf.org/doc/html/rfc5891#section-4.2.3.2}Section
109 4.2.3.2} (default: false)
110 - [transitional]: Use IDNA 2003 transitional processing (default: false)
111+112+ @raise Error on conversion failure.
113114 Example:
115 {[
116 to_ascii "münchen.example.com"
117+ (* = "xn--mnchen-3ya.example.com" *)
118 ]} *)
119120+val label_to_ascii : ?check_hyphens:bool -> ?use_std3_rules:bool -> string -> string
0000121(** [label_to_ascii label] converts a single label to ASCII.
122123 This implements the core ToASCII operation for one label, as described in
124 {{:https://datatracker.ietf.org/doc/html/rfc5891#section-4.1}RFC 5891
125+ Section 4.1}.
126+127+ @raise Error on conversion failure. *)
128129(** {1 ToUnicode Operation}
130···134 {{:https://datatracker.ietf.org/doc/html/rfc5891#section-4.2} RFC 5891
135 Section 4.2} for the complete ToUnicode specification. *)
136137+val to_unicode : string -> string
138(** [to_unicode domain] converts an ACE domain name to Unicode.
139140 Implements the ToUnicode operation from
···146 {{:https://datatracker.ietf.org/doc/html/rfc3492#section-6.2}RFC 3492
147 Section 6.2} 2. Otherwise, pass through unchanged
148149+ @raise Error on decoding failure.
150+151 Example:
152 {[
153 to_unicode "xn--mnchen-3ya.example.com"
154+ (* = "münchen.example.com" *)
155 ]} *)
156157+val label_to_unicode : string -> string
158(** [label_to_unicode label] converts a single ACE label to Unicode.
159160 This implements the core ToUnicode operation for one label, as described in
161 {{:https://datatracker.ietf.org/doc/html/rfc5891#section-4.2}RFC 5891
162+ Section 4.2}.
163+164+ @raise Error on decoding failure. *)
165166(** {1 Domain Name Integration}
167···175 ?check_hyphens:bool ->
176 ?use_std3_rules:bool ->
177 [ `raw ] Domain_name.t ->
178+ [ `raw ] Domain_name.t
179(** [domain_to_ascii domain] converts a domain name to ASCII form.
180181 Applies {!to_ascii} to the string representation and returns the result as a
182 [Domain_name.t].
183184+ @raise Error on conversion failure.
185+186 Example:
187 {[
188 let d = Domain_name.of_string_exn "münchen.example.com" in
189 domain_to_ascii d
190+ (* = Domain_name.of_string_exn "xn--mnchen-3ya.example.com" *)
191 ]} *)
192193+val domain_to_unicode : [ `raw ] Domain_name.t -> [ `raw ] Domain_name.t
0194(** [domain_to_unicode domain] converts a domain name to Unicode form.
195196 Applies {!to_unicode} to the string representation and returns the result as
197+ a [Domain_name.t].
198+199+ @raise Error on decoding failure. *)
200201(** {1 Validation} *)
202
+124-106
test/test_punycode.ml
···2627(* Test result helper *)
28let check_encode_ok expected input =
29- match Punycode.encode input with
30- | Ok result -> check string "encode" expected result
31- | Error e -> fail (Format.asprintf "encode failed: %a" Punycode.pp_error e)
003233let check_decode_ok expected input =
34- match Punycode.decode input with
35- | Ok result ->
36- let expected_arr = codepoints_of_hex_list expected in
37- check int "length" (Array.length expected_arr) (Array.length result);
38- Array.iteri
39- (fun i u ->
40- check int
41- (Printf.sprintf "char %d" i)
42- (Uchar.to_int expected_arr.(i))
43- (Uchar.to_int u))
44- result
45- | Error e -> fail (Format.asprintf "decode failed: %a" Punycode.pp_error e)
04647let check_utf8_roundtrip s =
48- match Punycode.encode_utf8 s with
49- | Error e ->
50- fail (Format.asprintf "encode_utf8 failed: %a" Punycode.pp_error e)
51- | Ok encoded -> (
52- match Punycode.decode_utf8 encoded with
53- | Error e ->
54- fail (Format.asprintf "decode_utf8 failed: %a" Punycode.pp_error e)
55- | Ok decoded -> check string "roundtrip" s decoded)
5657(* RFC 3492 Section 7.1 Test Vectors *)
58···604605(* Label encoding tests *)
606let test_label_encode_ascii () =
607- match Punycode.encode_label "example" with
608- | Ok result -> check string "ascii passthrough" "example" result
609- | Error e ->
610- fail (Format.asprintf "encode_label failed: %a" Punycode.pp_error e)
0611612let test_label_encode_german () =
613- match Punycode.encode_label "münchen" with
614- | Ok result -> check string "german label" "xn--mnchen-3ya" result
615- | Error e ->
616- fail (Format.asprintf "encode_label failed: %a" Punycode.pp_error e)
0617618let test_label_decode_german () =
619- match Punycode.decode_label "xn--mnchen-3ya" with
620- | Ok result -> check string "german decode" "münchen" result
621- | Error e ->
622- fail (Format.asprintf "decode_label failed: %a" Punycode.pp_error e)
0623624(* IDNA tests *)
625let test_idna_to_ascii_simple () =
626- match Punycode_idna.to_ascii "münchen.example.com" with
627- | Ok result ->
628- check string "idna to_ascii" "xn--mnchen-3ya.example.com" result
629- | Error e ->
630- fail (Format.asprintf "to_ascii failed: %a" Punycode_idna.pp_error e)
631632let test_idna_to_unicode_simple () =
633- match Punycode_idna.to_unicode "xn--mnchen-3ya.example.com" with
634- | Ok result -> check string "idna to_unicode" "münchen.example.com" result
635- | Error e ->
636- fail (Format.asprintf "to_unicode failed: %a" Punycode_idna.pp_error e)
0637638let test_idna_roundtrip () =
639 let original = "münchen.example.com" in
640- match Punycode_idna.to_ascii original with
641- | Error e ->
642- fail (Format.asprintf "to_ascii failed: %a" Punycode_idna.pp_error e)
643- | Ok ascii -> (
644- match Punycode_idna.to_unicode ascii with
645- | Error e ->
646- fail
647- (Format.asprintf "to_unicode failed: %a" Punycode_idna.pp_error e)
648- | Ok unicode -> check string "idna roundtrip" original unicode)
649650let test_idna_all_ascii () =
651- match Punycode_idna.to_ascii "www.example.com" with
652- | Ok result -> check string "all ascii passthrough" "www.example.com" result
653- | Error e ->
654- fail (Format.asprintf "to_ascii failed: %a" Punycode_idna.pp_error e)
0655656let test_idna_mixed_labels () =
657- match Punycode_idna.to_ascii "日本語.example.com" with
658- | Ok result ->
659- (* Check that result starts with xn-- and ends with .example.com *)
660- check bool "has ace prefix" true (Punycode.has_ace_prefix result);
661- check bool "ends with example.com" true
662- (String.length result > 12
663- && String.sub result (String.length result - 12) 12 = ".example.com")
664- | Error e ->
665- fail (Format.asprintf "to_ascii failed: %a" Punycode_idna.pp_error e)
666667(* Case annotation tests *)
668let test_case_annotation_decode () =
669 (* RFC example: uppercase letters indicate case flags *)
670- match Punycode.decode_with_case "MajiKoi5-783gue6qz075azm5e" with
671- | Ok (codepoints, case_flags) ->
672- check int "codepoints length"
673- (List.length example_p_codepoints)
674- (Array.length codepoints);
675- check int "case_flags length" (Array.length codepoints)
676- (Array.length case_flags);
677- (* M should be uppercase *)
678- check bool "M uppercase" true (case_flags.(0) = Punycode.Uppercase);
679- (* a should be lowercase *)
680- check bool "a lowercase" true (case_flags.(1) = Punycode.Lowercase)
681- | Error e ->
682- fail (Format.asprintf "decode_with_case failed: %a" Punycode.pp_error e)
00683684let test_case_annotation_encode () =
685 let codepoints = codepoints_of_hex_list [ 0x0061; 0x0062; 0x0063 ] in
···687 let case_flags =
688 [| Punycode.Uppercase; Punycode.Lowercase; Punycode.Uppercase |]
689 in
690- match Punycode.encode_with_case codepoints case_flags with
691- | Ok result ->
692- (* Should encode as "AbC-" (basic code points with case annotation) *)
693- check string "case encoded" "AbC-" result
694- | Error e ->
695- fail (Format.asprintf "encode_with_case failed: %a" Punycode.pp_error e)
696697(* Edge case tests *)
698let test_empty_input () =
699- match Punycode.encode [||] with
700- | Ok result -> check string "empty encode" "" result
701- | Error _ -> fail "empty encode should succeed"
0702703let test_empty_decode () =
704- match Punycode.decode "" with
705- | Ok result -> check int "empty decode length" 0 (Array.length result)
706- | Error _ -> fail "empty decode should succeed"
0707708let test_pure_ascii () =
709 let input = codepoints_of_string "hello" in
710- match Punycode.encode input with
711- | Ok result -> check string "pure ascii" "hello-" result
712- | Error e -> fail (Format.asprintf "encode failed: %a" Punycode.pp_error e)
00713714let test_invalid_digit () =
715- match Punycode.decode "hello!" with
716- | Ok _ -> fail "should fail on invalid digit"
717- | Error (Punycode.Invalid_digit _) -> ()
718- | Error e -> fail (Format.asprintf "wrong error type: %a" Punycode.pp_error e)
000719720let test_label_too_long () =
721 let long_label = String.make 100 'a' in
722- match Punycode.encode_label long_label with
723- | Ok _ -> fail "should fail on long label"
724- | Error (Punycode.Label_too_long _) -> ()
725- | Error e -> fail (Format.asprintf "wrong error type: %a" Punycode.pp_error e)
000726727let test_empty_label () =
728- match Punycode.encode_label "" with
729- | Ok _ -> fail "should fail on empty label"
730- | Error Punycode.Empty_label -> ()
731- | Error e -> fail (Format.asprintf "wrong error type: %a" Punycode.pp_error e)
000732733(* Validation tests *)
734let test_is_basic () =
···2627(* Test result helper *)
28let check_encode_ok expected input =
29+ try
30+ let result = Punycode.encode input in
31+ check string "encode" expected result
32+ with Punycode.Error e ->
33+ fail (Format.asprintf "encode failed: %a" Punycode.pp_error_reason e)
3435let check_decode_ok expected input =
36+ try
37+ let result = Punycode.decode input in
38+ let expected_arr = codepoints_of_hex_list expected in
39+ check int "length" (Array.length expected_arr) (Array.length result);
40+ Array.iteri
41+ (fun i u ->
42+ check int
43+ (Printf.sprintf "char %d" i)
44+ (Uchar.to_int expected_arr.(i))
45+ (Uchar.to_int u))
46+ result
47+ with Punycode.Error e ->
48+ fail (Format.asprintf "decode failed: %a" Punycode.pp_error_reason e)
4950let check_utf8_roundtrip s =
51+ try
52+ let encoded = Punycode.encode_utf8 s in
53+ let decoded = Punycode.decode_utf8 encoded in
54+ check string "roundtrip" s decoded
55+ with Punycode.Error e ->
56+ fail (Format.asprintf "roundtrip failed: %a" Punycode.pp_error_reason e)
005758(* RFC 3492 Section 7.1 Test Vectors *)
59···605606(* Label encoding tests *)
607let test_label_encode_ascii () =
608+ try
609+ let result = Punycode.encode_label "example" in
610+ check string "ascii passthrough" "example" result
611+ with Punycode.Error e ->
612+ fail (Format.asprintf "encode_label failed: %a" Punycode.pp_error_reason e)
613614let test_label_encode_german () =
615+ try
616+ let result = Punycode.encode_label "münchen" in
617+ check string "german label" "xn--mnchen-3ya" result
618+ with Punycode.Error e ->
619+ fail (Format.asprintf "encode_label failed: %a" Punycode.pp_error_reason e)
620621let test_label_decode_german () =
622+ try
623+ let result = Punycode.decode_label "xn--mnchen-3ya" in
624+ check string "german decode" "münchen" result
625+ with Punycode.Error e ->
626+ fail (Format.asprintf "decode_label failed: %a" Punycode.pp_error_reason e)
627628(* IDNA tests *)
629let test_idna_to_ascii_simple () =
630+ try
631+ let result = Punycode_idna.to_ascii "münchen.example.com" in
632+ check string "idna to_ascii" "xn--mnchen-3ya.example.com" result
633+ with Punycode_idna.Error e ->
634+ fail (Format.asprintf "to_ascii failed: %a" Punycode_idna.pp_error_reason e)
635636let test_idna_to_unicode_simple () =
637+ try
638+ let result = Punycode_idna.to_unicode "xn--mnchen-3ya.example.com" in
639+ check string "idna to_unicode" "münchen.example.com" result
640+ with Punycode_idna.Error e ->
641+ fail (Format.asprintf "to_unicode failed: %a" Punycode_idna.pp_error_reason e)
642643let test_idna_roundtrip () =
644 let original = "münchen.example.com" in
645+ try
646+ let ascii = Punycode_idna.to_ascii original in
647+ let unicode = Punycode_idna.to_unicode ascii in
648+ check string "idna roundtrip" original unicode
649+ with Punycode_idna.Error e ->
650+ fail (Format.asprintf "roundtrip failed: %a" Punycode_idna.pp_error_reason e)
000651652let test_idna_all_ascii () =
653+ try
654+ let result = Punycode_idna.to_ascii "www.example.com" in
655+ check string "all ascii passthrough" "www.example.com" result
656+ with Punycode_idna.Error e ->
657+ fail (Format.asprintf "to_ascii failed: %a" Punycode_idna.pp_error_reason e)
658659let test_idna_mixed_labels () =
660+ try
661+ let result = Punycode_idna.to_ascii "日本語.example.com" in
662+ (* Check that result starts with xn-- and ends with .example.com *)
663+ check bool "has ace prefix" true (Punycode.has_ace_prefix result);
664+ check bool "ends with example.com" true
665+ (String.length result > 12
666+ && String.sub result (String.length result - 12) 12 = ".example.com")
667+ with Punycode_idna.Error e ->
668+ fail (Format.asprintf "to_ascii failed: %a" Punycode_idna.pp_error_reason e)
669670(* Case annotation tests *)
671let test_case_annotation_decode () =
672 (* RFC example: uppercase letters indicate case flags *)
673+ try
674+ let codepoints, case_flags =
675+ Punycode.decode_with_case "MajiKoi5-783gue6qz075azm5e"
676+ in
677+ check int "codepoints length"
678+ (List.length example_p_codepoints)
679+ (Array.length codepoints);
680+ check int "case_flags length" (Array.length codepoints)
681+ (Array.length case_flags);
682+ (* M should be uppercase *)
683+ check bool "M uppercase" true (case_flags.(0) = Punycode.Uppercase);
684+ (* a should be lowercase *)
685+ check bool "a lowercase" true (case_flags.(1) = Punycode.Lowercase)
686+ with Punycode.Error e ->
687+ fail (Format.asprintf "decode_with_case failed: %a" Punycode.pp_error_reason e)
688689let test_case_annotation_encode () =
690 let codepoints = codepoints_of_hex_list [ 0x0061; 0x0062; 0x0063 ] in
···692 let case_flags =
693 [| Punycode.Uppercase; Punycode.Lowercase; Punycode.Uppercase |]
694 in
695+ try
696+ let result = Punycode.encode_with_case codepoints case_flags in
697+ (* Should encode as "AbC-" (basic code points with case annotation) *)
698+ check string "case encoded" "AbC-" result
699+ with Punycode.Error e ->
700+ fail (Format.asprintf "encode_with_case failed: %a" Punycode.pp_error_reason e)
701702(* Edge case tests *)
703let test_empty_input () =
704+ try
705+ let result = Punycode.encode [||] in
706+ check string "empty encode" "" result
707+ with Punycode.Error _ -> fail "empty encode should succeed"
708709let test_empty_decode () =
710+ try
711+ let result = Punycode.decode "" in
712+ check int "empty decode length" 0 (Array.length result)
713+ with Punycode.Error _ -> fail "empty decode should succeed"
714715let test_pure_ascii () =
716 let input = codepoints_of_string "hello" in
717+ try
718+ let result = Punycode.encode input in
719+ check string "pure ascii" "hello-" result
720+ with Punycode.Error e ->
721+ fail (Format.asprintf "encode failed: %a" Punycode.pp_error_reason e)
722723let test_invalid_digit () =
724+ try
725+ ignore (Punycode.decode "hello!");
726+ fail "should fail on invalid digit"
727+ with
728+ | Punycode.Error (Punycode.Invalid_digit _) -> ()
729+ | Punycode.Error e ->
730+ fail (Format.asprintf "wrong error type: %a" Punycode.pp_error_reason e)
731732let test_label_too_long () =
733 let long_label = String.make 100 'a' in
734+ try
735+ ignore (Punycode.encode_label long_label);
736+ fail "should fail on long label"
737+ with
738+ | Punycode.Error (Punycode.Label_too_long _) -> ()
739+ | Punycode.Error e ->
740+ fail (Format.asprintf "wrong error type: %a" Punycode.pp_error_reason e)
741742let test_empty_label () =
743+ try
744+ ignore (Punycode.encode_label "");
745+ fail "should fail on empty label"
746+ with
747+ | Punycode.Error Punycode.Empty_label -> ()
748+ | Punycode.Error e ->
749+ fail (Format.asprintf "wrong error type: %a" Punycode.pp_error_reason e)
750751(* Validation tests *)
752let test_is_basic () =