···11+(*---------------------------------------------------------------------------
22+ Copyright (c) 2025 Thomas Gazagnaire. All rights reserved.
33+ SPDX-License-Identifier: MIT
44+ ---------------------------------------------------------------------------*)
55+66+(* Crowbar-based fuzz testing for Punycode encoding/decoding *)
77+88+open Crowbar
99+1010+(* Test that encode_utf8 never crashes on arbitrary input *)
1111+let test_encode_no_crash input =
1212+ ignore (Punycode.encode_utf8 input);
1313+ check true
1414+1515+(* Test that decode_utf8 never crashes on arbitrary input *)
1616+let test_decode_no_crash input =
1717+ ignore (Punycode.decode_utf8 input);
1818+ check true
1919+2020+(* Test roundtrip: encode then decode should give back original (case-insensitive)
2121+ IDNA/Punycode lowercases ASCII characters during encoding per RFC 5891 *)
2222+let test_roundtrip input =
2323+ match Punycode.encode_utf8 input with
2424+ | Ok encoded -> (
2525+ match Punycode.decode_utf8 encoded with
2626+ | Ok decoded ->
2727+ (* Compare lowercase versions since IDNA lowercases ASCII *)
2828+ check_eq ~pp:Format.pp_print_string
2929+ (String.lowercase_ascii input)
3030+ (String.lowercase_ascii decoded)
3131+ | Error _ ->
3232+ (* Some encoded values might not decode, that's ok for fuzz testing *)
3333+ check true)
3434+ | Error _ ->
3535+ (* Some inputs might not encode, that's ok *)
3636+ check true
3737+3838+(* Test ASCII-only strings (should pass through mostly unchanged) *)
3939+let test_ascii_string input =
4040+ if String.length input > 0 then begin
4141+ let ascii_only =
4242+ String.init
4343+ (String.length input mod 64)
4444+ (fun i ->
4545+ Char.chr (Char.code input.[i mod String.length input] mod 128))
4646+ in
4747+ if String.length ascii_only > 0 then
4848+ ignore (Punycode.encode_utf8 ascii_only)
4949+ end;
5050+ check true
5151+5252+(* Test inputs starting with ACE prefix "xn--" *)
5353+let test_ace_prefix input =
5454+ let ace_input = "xn--" ^ input in
5555+ ignore (Punycode.decode_utf8 ace_input);
5656+ check true
5757+5858+let () =
5959+ add_test ~name:"punycode: encode no crash" [ bytes ] test_encode_no_crash;
6060+ add_test ~name:"punycode: decode no crash" [ bytes ] test_decode_no_crash;
6161+ add_test ~name:"punycode: roundtrip" [ bytes ] test_roundtrip;
6262+ add_test ~name:"punycode: ascii string" [ bytes ] test_ascii_string;
6363+ add_test ~name:"punycode: ace prefix" [ bytes ] test_ace_prefix
+214-231
lib/punycode.ml
···20202121(* {1 Position Tracking} *)
22222323-type position = {
2424- byte_offset : int;
2525- char_index : int;
2626-}
2323+type position = { byte_offset : int; char_index : int }
27242825let position_byte_offset pos = pos.byte_offset
2926let position_char_index pos = pos.char_index
30273128let pp_position fmt pos =
3229 Format.fprintf fmt "byte %d, char %d" pos.byte_offset pos.char_index
3333-34303531(* {1 Error Types} *)
3632···45414642let pp_error fmt = function
4743 | Overflow pos ->
4848- Format.fprintf fmt "arithmetic overflow at %a" pp_position pos
4444+ Format.fprintf fmt "arithmetic overflow at %a" pp_position pos
4945 | Invalid_character (pos, u) ->
5050- Format.fprintf fmt "invalid character U+%04X at %a"
5151- (Uchar.to_int u) pp_position pos
4646+ Format.fprintf fmt "invalid character U+%04X at %a" (Uchar.to_int u)
4747+ pp_position pos
5248 | Invalid_digit (pos, c) ->
5353- Format.fprintf fmt "invalid Punycode digit '%c' (0x%02X) at %a"
5454- c (Char.code c) pp_position pos
4949+ Format.fprintf fmt "invalid Punycode digit '%c' (0x%02X) at %a" c
5050+ (Char.code c) pp_position pos
5551 | Unexpected_end pos ->
5656- Format.fprintf fmt "unexpected end of input at %a" pp_position pos
5252+ Format.fprintf fmt "unexpected end of input at %a" pp_position pos
5753 | Invalid_utf8 pos ->
5858- Format.fprintf fmt "invalid UTF-8 sequence at %a" pp_position pos
5454+ Format.fprintf fmt "invalid UTF-8 sequence at %a" pp_position pos
5955 | Label_too_long len ->
6060- Format.fprintf fmt "label too long: %d bytes (max %d)" len max_label_length
6161- | Empty_label ->
6262- Format.fprintf fmt "empty label"
5656+ Format.fprintf fmt "label too long: %d bytes (max %d)" len
5757+ max_label_length
5858+ | Empty_label -> Format.fprintf fmt "empty label"
63596464-let error_to_string err =
6565- Format.asprintf "%a" pp_error err
6060+let error_to_string err = Format.asprintf "%a" pp_error err
66616762(* {1 Error Constructors} *)
6863···80758176(* {1 Basic Predicates} *)
82778383-let is_basic u =
8484- Uchar.to_int u < 0x80
8585-8686-let is_ascii_string s =
8787- String.for_all (fun c -> Char.code c < 0x80) s
7878+let is_basic u = Uchar.to_int u < 0x80
7979+let is_ascii_string s = String.for_all (fun c -> Char.code c < 0x80) s
88808981let has_ace_prefix s =
9082 let len = String.length s in
9191- len >= 4 &&
9292- (s.[0] = 'x' || s.[0] = 'X') &&
9393- (s.[1] = 'n' || s.[1] = 'N') &&
9494- s.[2] = '-' && s.[3] = '-'
8383+ len >= 4
8484+ && (s.[0] = 'x' || s.[0] = 'X')
8585+ && (s.[1] = 'n' || s.[1] = 'N')
8686+ && s.[2] = '-'
8787+ && s.[3] = '-'
95889689(* {1 Digit Encoding/Decoding (RFC 3492 Section 5)}
9790···10194*)
1029510396let encode_digit d case_flag =
104104- if d < 26 then
105105- Char.chr (d + (if case_flag = Uppercase then 0x41 else 0x61))
106106- else
107107- Char.chr (d - 26 + 0x30)
9797+ if d < 26 then Char.chr (d + if case_flag = Uppercase then 0x41 else 0x61)
9898+ else Char.chr (d - 26 + 0x30)
10899109100let decode_digit c =
110101 let code = Char.code c in
111111- if code >= 0x30 && code <= 0x39 then
112112- Some (code - 0x30 + 26) (* '0'-'9' -> 26-35 *)
113113- else if code >= 0x41 && code <= 0x5A then
114114- Some (code - 0x41) (* 'A'-'Z' -> 0-25 *)
115115- else if code >= 0x61 && code <= 0x7A then
116116- Some (code - 0x61) (* 'a'-'z' -> 0-25 *)
117117- else
118118- None
102102+ if code >= 0x30 && code <= 0x39 then Some (code - 0x30 + 26)
103103+ (* '0'-'9' -> 26-35 *)
104104+ else if code >= 0x41 && code <= 0x5A then Some (code - 0x41)
105105+ (* 'A'-'Z' -> 0-25 *)
106106+ else if code >= 0x61 && code <= 0x7A then Some (code - 0x61)
107107+ (* 'a'-'z' -> 0-25 *)
108108+ else None
119109120110(* Check if a character is "flagged" (uppercase) for case annotation *)
121111let is_flagged c =
122112 let code = Char.code c in
123123- code >= 0x41 && code <= 0x5A (* 'A'-'Z' *)
113113+ code >= 0x41 && code <= 0x5A (* 'A'-'Z' *)
124114125115(* {1 Bias Adaptation (RFC 3492 Section 6.1)} *)
126116127117let adapt ~delta ~numpoints ~firsttime =
128118 let delta = if firsttime then delta / damp else delta / 2 in
129119 let delta = delta + (delta / numpoints) in
130130- let threshold = ((base - tmin) * tmax) / 2 in
120120+ let threshold = (base - tmin) * tmax / 2 in
131121 let rec loop delta k =
132132- if delta > threshold then
133133- loop (delta / (base - tmin)) (k + base)
134134- else
135135- k + (((base - tmin + 1) * delta) / (delta + skew))
122122+ if delta > threshold then loop (delta / (base - tmin)) (k + base)
123123+ else k + ((base - tmin + 1) * delta / (delta + skew))
136124 in
137125 loop delta 0
138126···147135148136let safe_mul_add a b c pos =
149137 if c = 0 then Ok a
150150- else if b > (max_int_value - a) / c then
151151- overflow pos
152152- else
153153- Ok (a + b * c)
138138+ else if b > (max_int_value - a) / c then overflow pos
139139+ else Ok (a + (b * c))
154140155141(* {1 UTF-8 to Code Points Conversion} *)
156142···167153 acc := Uchar.utf_decode_uchar dec :: !acc;
168154 byte_offset := !byte_offset + Uchar.utf_decode_length dec;
169155 incr char_index
170170- end else begin
156156+ end
157157+ else begin
171158 error := Some (Invalid_utf8 pos)
172159 end
173160 done;
···186173187174let encode_impl codepoints case_flags =
188175 let input_length = Array.length codepoints in
189189- if input_length = 0 then
190190- Ok ""
176176+ if input_length = 0 then Ok ""
191177 else begin
192178 let output = Buffer.create (input_length * 2) in
193179···198184 if is_basic cp then begin
199185 let c = Uchar.to_int cp in
200186 let case =
201201- match case_flags with
202202- | Some flags -> flags.(j)
203203- | None -> Lowercase
187187+ match case_flags with Some flags -> flags.(j) | None -> Lowercase
204188 in
205189 (* Preserve or apply case for ASCII letters *)
206190 let c' =
···208192 if case = Lowercase then c + 0x20 else c
209193 else if c >= 0x61 && c <= 0x7A then (* 'a'-'z' *)
210194 if case = Uppercase then c - 0x20 else c
211211- else
212212- c
195195+ else c
213196 in
214197 Buffer.add_char output (Char.chr c');
215198 incr basic_count
···220203 let h = ref b in
221204222205 (* Add delimiter if there were basic code points *)
223223- if b > 0 then
224224- Buffer.add_char output delimiter;
206206+ if b > 0 then Buffer.add_char output delimiter;
225207226208 (* Main encoding loop *)
227209 let n = ref initial_n in
···232214233215 while !h < input_length && !result = Ok () do
234216 (* Find minimum code point >= n *)
235235- let m = Array.fold_left (fun acc cp ->
236236- let cp_val = Uchar.to_int cp in
237237- if cp_val >= !n && cp_val < acc then cp_val else acc
238238- ) max_int_value codepoints in
217217+ let m =
218218+ Array.fold_left
219219+ (fun acc cp ->
220220+ let cp_val = Uchar.to_int cp in
221221+ if cp_val >= !n && cp_val < acc then cp_val else acc)
222222+ max_int_value codepoints
223223+ in
239224240225 (* Increase delta to advance state to <m, 0> *)
241226 let pos = { byte_offset = 0; char_index = !h } in
242242- (match safe_mul_add !delta (m - !n) (!h + 1) pos with
243243- | Error e -> result := Error e
244244- | Ok new_delta ->
245245- delta := new_delta;
246246- n := m;
227227+ match safe_mul_add !delta (m - !n) (!h + 1) pos with
228228+ | Error e -> result := Error e
229229+ | Ok new_delta ->
230230+ delta := new_delta;
231231+ n := m;
247232248248- (* Process each code point *)
249249- let j = ref 0 in
250250- while !j < input_length && !result = Ok () do
251251- let cp = Uchar.to_int codepoints.(!j) in
252252- let pos = { byte_offset = 0; char_index = !j } in
233233+ (* Process each code point *)
234234+ let j = ref 0 in
235235+ while !j < input_length && !result = Ok () do
236236+ let cp = Uchar.to_int codepoints.(!j) in
237237+ let pos = { byte_offset = 0; char_index = !j } in
253238254254- if cp < !n then begin
255255- incr delta;
256256- if !delta = 0 then (* Overflow *)
257257- result := overflow pos
258258- end
259259- else if cp = !n then begin
260260- (* Encode delta as variable-length integer *)
261261- let q = ref !delta in
262262- let k = ref base in
263263- let done_encoding = ref false in
239239+ if cp < !n then begin
240240+ incr delta;
241241+ if !delta = 0 then (* Overflow *)
242242+ result := overflow pos
243243+ end
244244+ else if cp = !n then begin
245245+ (* Encode delta as variable-length integer *)
246246+ let q = ref !delta in
247247+ let k = ref base in
248248+ let done_encoding = ref false in
264249265265- while not !done_encoding do
266266- let t =
267267- if !k <= !bias then tmin
268268- else if !k >= !bias + tmax then tmax
269269- else !k - !bias
270270- in
271271- if !q < t then begin
272272- (* Output final digit *)
273273- let case =
274274- match case_flags with
275275- | Some flags -> flags.(!j)
276276- | None -> Lowercase
277277- in
278278- Buffer.add_char output (encode_digit !q case);
279279- done_encoding := true
280280- end else begin
281281- (* Output intermediate digit and continue *)
282282- let digit = t + ((!q - t) mod (base - t)) in
283283- Buffer.add_char output (encode_digit digit Lowercase);
284284- q := (!q - t) / (base - t);
285285- k := !k + base
286286- end
287287- done;
250250+ while not !done_encoding do
251251+ let t =
252252+ if !k <= !bias then tmin
253253+ else if !k >= !bias + tmax then tmax
254254+ else !k - !bias
255255+ in
256256+ if !q < t then begin
257257+ (* Output final digit *)
258258+ let case =
259259+ match case_flags with
260260+ | Some flags -> flags.(!j)
261261+ | None -> Lowercase
262262+ in
263263+ Buffer.add_char output (encode_digit !q case);
264264+ done_encoding := true
265265+ end
266266+ else begin
267267+ (* Output intermediate digit and continue *)
268268+ let digit = t + ((!q - t) mod (base - t)) in
269269+ Buffer.add_char output (encode_digit digit Lowercase);
270270+ q := (!q - t) / (base - t);
271271+ k := !k + base
272272+ end
273273+ done;
288274289289- bias := adapt ~delta:!delta ~numpoints:(!h + 1) ~firsttime:(!h = b);
290290- delta := 0;
291291- incr h
292292- end;
293293- incr j
294294- done;
275275+ bias := adapt ~delta:!delta ~numpoints:(!h + 1) ~firsttime:(!h = b);
276276+ delta := 0;
277277+ incr h
278278+ end;
279279+ incr j
280280+ done;
295281296296- incr delta;
297297- incr n)
282282+ incr delta;
283283+ incr n
298284 done;
299285300286 match !result with
···302288 | Ok () -> Ok (Buffer.contents output)
303289 end
304290305305-let encode codepoints =
306306- encode_impl codepoints None
291291+let encode codepoints = encode_impl codepoints None
307292308293let encode_with_case codepoints case_flags =
309294 if Array.length codepoints <> Array.length case_flags then
···314299315300let decode_impl input =
316301 let input_length = String.length input in
317317- if input_length = 0 then
318318- Ok ([||], [||])
302302+ if input_length = 0 then Ok ([||], [||])
319303 else begin
320304 (* Find last delimiter *)
321305 let b = Option.value ~default:0 (String.rindex_opt input delimiter) in
···334318 error := Some (Invalid_character (pos, Uchar.of_int code))
335319 else begin
336320 output := Uchar.of_int code :: !output;
337337- case_output := (if is_flagged c then Uppercase else Lowercase) :: !case_output
321321+ case_output :=
322322+ (if is_flagged c then Uppercase else Lowercase) :: !case_output
338323 end
339324 end
340325 done;
341326342327 match !error with
343328 | Some e -> Error e
344344- | None ->
345345- let output = ref (Array.of_list (List.rev !output)) in
346346- let case_output = ref (Array.of_list (List.rev !case_output)) in
347347-348348- (* Main decoding loop *)
349349- let n = ref initial_n in
350350- let i = ref 0 in
351351- let bias = ref initial_bias in
352352- let in_pos = ref (if b > 0 then b + 1 else 0) in
353353- let result = ref (Ok ()) in
329329+ | None -> (
330330+ let output = ref (Array.of_list (List.rev !output)) in
331331+ let case_output = ref (Array.of_list (List.rev !case_output)) in
354332355355- while !in_pos < input_length && !result = Ok () do
356356- let oldi = !i in
357357- let w = ref 1 in
358358- let k = ref base in
359359- let done_decoding = ref false in
333333+ (* Main decoding loop *)
334334+ let n = ref initial_n in
335335+ let i = ref 0 in
336336+ let bias = ref initial_bias in
337337+ let in_pos = ref (if b > 0 then b + 1 else 0) in
338338+ let result = ref (Ok ()) in
360339361361- while not !done_decoding && !result = Ok () do
362362- let pos = { byte_offset = !in_pos; char_index = Array.length !output } in
340340+ while !in_pos < input_length && !result = Ok () do
341341+ let oldi = !i in
342342+ let w = ref 1 in
343343+ let k = ref base in
344344+ let done_decoding = ref false in
363345364364- if !in_pos >= input_length then begin
365365- result := unexpected_end pos;
366366- done_decoding := true
367367- end else begin
368368- let c = input.[!in_pos] in
369369- incr in_pos;
346346+ while (not !done_decoding) && !result = Ok () do
347347+ let pos =
348348+ { byte_offset = !in_pos; char_index = Array.length !output }
349349+ in
370350371371- match decode_digit c with
372372- | None ->
373373- result := invalid_digit pos c;
351351+ if !in_pos >= input_length then begin
352352+ result := unexpected_end pos;
374353 done_decoding := true
375375- | Some digit ->
376376- (* i = i + digit * w, with overflow check *)
377377- (match safe_mul_add !i digit !w pos with
378378- | Error e ->
379379- result := Error e;
380380- done_decoding := true
381381- | Ok new_i ->
382382- i := new_i;
354354+ end
355355+ else begin
356356+ let c = input.[!in_pos] in
357357+ incr in_pos;
383358384384- let t =
385385- if !k <= !bias then tmin
386386- else if !k >= !bias + tmax then tmax
387387- else !k - !bias
388388- in
359359+ match decode_digit c with
360360+ | None ->
361361+ result := invalid_digit pos c;
362362+ done_decoding := true
363363+ | Some digit -> (
364364+ (* i = i + digit * w, with overflow check *)
365365+ match safe_mul_add !i digit !w pos with
366366+ | Error e ->
367367+ result := Error e;
368368+ done_decoding := true
369369+ | Ok new_i ->
370370+ i := new_i;
389371390390- if digit < t then begin
391391- (* Record case flag from this final digit *)
392392- done_decoding := true
393393- end else begin
394394- (* w = w * (base - t), with overflow check *)
395395- let base_minus_t = base - t in
396396- if !w > max_int_value / base_minus_t then begin
397397- result := overflow pos;
398398- done_decoding := true
399399- end else begin
400400- w := !w * base_minus_t;
401401- k := !k + base
402402- end
403403- end)
404404- end
405405- done;
372372+ let t =
373373+ if !k <= !bias then tmin
374374+ else if !k >= !bias + tmax then tmax
375375+ else !k - !bias
376376+ in
406377407407- if !result = Ok () then begin
408408- let out_len = Array.length !output in
409409- bias := adapt ~delta:(!i - oldi) ~numpoints:(out_len + 1) ~firsttime:(oldi = 0);
378378+ if digit < t then begin
379379+ (* Record case flag from this final digit *)
380380+ done_decoding := true
381381+ end
382382+ else begin
383383+ (* w = w * (base - t), with overflow check *)
384384+ let base_minus_t = base - t in
385385+ if !w > max_int_value / base_minus_t then begin
386386+ result := overflow pos;
387387+ done_decoding := true
388388+ end
389389+ else begin
390390+ w := !w * base_minus_t;
391391+ k := !k + base
392392+ end
393393+ end)
394394+ end
395395+ done;
410396411411- let pos = { byte_offset = !in_pos - 1; char_index = out_len } in
397397+ if !result = Ok () then begin
398398+ let out_len = Array.length !output in
399399+ bias :=
400400+ adapt ~delta:(!i - oldi) ~numpoints:(out_len + 1)
401401+ ~firsttime:(oldi = 0);
412402413413- (* n = n + i / (out_len + 1), with overflow check *)
414414- let increment = !i / (out_len + 1) in
415415- if increment > max_int_value - !n then
416416- result := overflow pos
417417- else begin
418418- n := !n + increment;
419419- i := !i mod (out_len + 1);
403403+ let pos = { byte_offset = !in_pos - 1; char_index = out_len } in
420404421421- (* Validate that n is a valid Unicode scalar value *)
422422- if not (Uchar.is_valid !n) then
423423- result := invalid_character pos Uchar.rep
405405+ (* n = n + i / (out_len + 1), with overflow check *)
406406+ let increment = !i / (out_len + 1) in
407407+ if increment > max_int_value - !n then result := overflow pos
424408 else begin
425425- (* Insert n at position i *)
426426- let new_output = Array.make (out_len + 1) (Uchar.of_int 0) in
427427- let new_case = Array.make (out_len + 1) Lowercase in
409409+ n := !n + increment;
410410+ i := !i mod (out_len + 1);
428411429429- for j = 0 to !i - 1 do
430430- new_output.(j) <- !output.(j);
431431- new_case.(j) <- !case_output.(j)
432432- done;
433433- new_output.(!i) <- Uchar.of_int !n;
434434- (* Case flag from final digit of this delta *)
435435- new_case.(!i) <- (if !in_pos > 0 && is_flagged input.[!in_pos - 1]
436436- then Uppercase else Lowercase);
437437- for j = !i to out_len - 1 do
438438- new_output.(j + 1) <- !output.(j);
439439- new_case.(j + 1) <- !case_output.(j)
440440- done;
412412+ (* Validate that n is a valid Unicode scalar value *)
413413+ if not (Uchar.is_valid !n) then
414414+ result := invalid_character pos Uchar.rep
415415+ else begin
416416+ (* Insert n at position i *)
417417+ let new_output = Array.make (out_len + 1) (Uchar.of_int 0) in
418418+ let new_case = Array.make (out_len + 1) Lowercase in
419419+420420+ for j = 0 to !i - 1 do
421421+ new_output.(j) <- !output.(j);
422422+ new_case.(j) <- !case_output.(j)
423423+ done;
424424+ new_output.(!i) <- Uchar.of_int !n;
425425+ (* Case flag from final digit of this delta *)
426426+ new_case.(!i) <-
427427+ (if !in_pos > 0 && is_flagged input.[!in_pos - 1] then
428428+ Uppercase
429429+ else Lowercase);
430430+ for j = !i to out_len - 1 do
431431+ new_output.(j + 1) <- !output.(j);
432432+ new_case.(j + 1) <- !case_output.(j)
433433+ done;
441434442442- output := new_output;
443443- case_output := new_case;
444444- incr i
435435+ output := new_output;
436436+ case_output := new_case;
437437+ incr i
438438+ end
445439 end
446440 end
447447- end
448448- done;
441441+ done;
449442450450- match !result with
451451- | Error e -> Error e
452452- | Ok () -> Ok (!output, !case_output)
443443+ match !result with
444444+ | Error e -> Error e
445445+ | Ok () -> Ok (!output, !case_output))
453446 end
454447455455-let decode input =
456456- Result.map fst (decode_impl input)
457457-458458-let decode_with_case input =
459459- decode_impl input
448448+let decode input = Result.map fst (decode_impl input)
449449+let decode_with_case input = decode_impl input
460450461451(* {1 UTF-8 String Operations} *)
462452···473463(* {1 Domain Label Operations} *)
474464475465let encode_label label =
476476- if String.length label = 0 then
477477- empty_label
466466+ if String.length label = 0 then empty_label
478467 else if is_ascii_string label then begin
479468 (* All ASCII - return as-is, but check length *)
480469 let len = String.length label in
481481- if len > max_label_length then
482482- label_too_long len
483483- else
484484- Ok label
485485- end else
470470+ if len > max_label_length then label_too_long len else Ok label
471471+ end
472472+ else
486473 (* Has non-ASCII - encode with Punycode *)
487474 let open Result.Syntax in
488475 let* encoded = encode_utf8 label in
489476 let result = ace_prefix ^ encoded in
490477 let len = String.length result in
491491- if len > max_label_length then
492492- label_too_long len
493493- else
494494- Ok result
478478+ if len > max_label_length then label_too_long len else Ok result
495479496480let decode_label label =
497497- if String.length label = 0 then
498498- empty_label
481481+ if String.length label = 0 then empty_label
499482 else if has_ace_prefix label then begin
500483 (* Remove ACE prefix and decode *)
501484 let punycode = String.sub label 4 (String.length label - 4) in
502485 decode_utf8 punycode
503503- end else begin
486486+ end
487487+ else begin
504488 (* No ACE prefix - validate and return *)
505505- if is_ascii_string label then
506506- Ok label
489489+ if is_ascii_string label then Ok label
507490 else
508491 (* Has non-ASCII but no ACE prefix - return as-is *)
509492 Ok label
+97-85
lib/punycode.mli
···66(** RFC 3492 Punycode: A Bootstring encoding of Unicode for IDNA.
7788 This module implements the Punycode algorithm as specified in
99- {{:https://datatracker.ietf.org/doc/html/rfc3492}RFC 3492},
1010- providing encoding and decoding of Unicode strings to/from ASCII-compatible
1111- encoding suitable for use in internationalized domain names.
99+ {{:https://datatracker.ietf.org/doc/html/rfc3492}RFC 3492}, providing
1010+ encoding and decoding of Unicode strings to/from ASCII-compatible encoding
1111+ suitable for use in internationalized domain names.
12121313- Punycode is an instance of Bootstring that uses particular parameter
1414- values appropriate for IDNA. See
1515- {{:https://datatracker.ietf.org/doc/html/rfc3492#section-5}RFC 3492 Section 5}
1616- for the specific parameter values.
1313+ Punycode is an instance of Bootstring that uses particular parameter values
1414+ appropriate for IDNA. See
1515+ {{:https://datatracker.ietf.org/doc/html/rfc3492#section-5}RFC 3492 Section
1616+ 5} for the specific parameter values.
17171818 {2 References}
1919- {ul
2020- {- {{:https://datatracker.ietf.org/doc/html/rfc3492}RFC 3492} - Punycode: A Bootstring encoding of Unicode for IDNA}
2121- {- {{:https://datatracker.ietf.org/doc/html/rfc5891}RFC 5891} - IDNA Protocol}} *)
1919+ - {{:https://datatracker.ietf.org/doc/html/rfc3492}RFC 3492} - Punycode: A
2020+ Bootstring encoding of Unicode for IDNA
2121+ - {{:https://datatracker.ietf.org/doc/html/rfc5891}RFC 5891} - IDNA Protocol
2222+*)
22232324(** {1 Position Tracking} *)
2425···39404041type error =
4142 | Overflow of position
4242- (** Arithmetic overflow during encode/decode. This can occur with
4343- very long strings or extreme Unicode code point values.
4444- See {{:https://datatracker.ietf.org/doc/html/rfc3492#section-6.4}
4545- RFC 3492 Section 6.4} for overflow handling requirements. *)
4343+ (** Arithmetic overflow during encode/decode. This can occur with very
4444+ long strings or extreme Unicode code point values. See
4545+ {{:https://datatracker.ietf.org/doc/html/rfc3492#section-6.4} RFC 3492
4646+ Section 6.4} for overflow handling requirements. *)
4647 | Invalid_character of position * Uchar.t
4747- (** A non-basic code point appeared where only basic code points
4848- (ASCII < 128) are allowed. Per
4949- {{:https://datatracker.ietf.org/doc/html/rfc3492#section-3.1}
5050- RFC 3492 Section 3.1}, basic code points must be segregated
5151- at the beginning of the encoded string. *)
4848+ (** A non-basic code point appeared where only basic code points (ASCII <
4949+ 128) are allowed. Per
5050+ {{:https://datatracker.ietf.org/doc/html/rfc3492#section-3.1} RFC 3492
5151+ Section 3.1}, basic code points must be segregated at the beginning
5252+ of the encoded string. *)
5253 | Invalid_digit of position * char
5353- (** An invalid Punycode digit was encountered during decoding.
5454- Valid digits are a-z, A-Z (values 0-25) and 0-9 (values 26-35).
5555- See {{:https://datatracker.ietf.org/doc/html/rfc3492#section-5}
5656- RFC 3492 Section 5} for digit-value mappings. *)
5454+ (** An invalid Punycode digit was encountered during decoding. Valid
5555+ digits are a-z, A-Z (values 0-25) and 0-9 (values 26-35). See
5656+ {{:https://datatracker.ietf.org/doc/html/rfc3492#section-5} RFC 3492
5757+ Section 5} for digit-value mappings. *)
5758 | Unexpected_end of position
5858- (** The input ended prematurely during decoding of a delta value.
5959- See {{:https://datatracker.ietf.org/doc/html/rfc3492#section-6.2}
6060- RFC 3492 Section 6.2} decoding procedure. *)
6161- | Invalid_utf8 of position
6262- (** Malformed UTF-8 sequence in input string. *)
5959+ (** The input ended prematurely during decoding of a delta value. See
6060+ {{:https://datatracker.ietf.org/doc/html/rfc3492#section-6.2} RFC 3492
6161+ Section 6.2} decoding procedure. *)
6262+ | Invalid_utf8 of position (** Malformed UTF-8 sequence in input string. *)
6363 | Label_too_long of int
6464 (** Encoded label exceeds 63 bytes (DNS limit per
6565- {{:https://datatracker.ietf.org/doc/html/rfc1035}RFC 1035}).
6666- The int is the actual length. *)
6767- | Empty_label
6868- (** Empty label is not valid for encoding. *)
6565+ {{:https://datatracker.ietf.org/doc/html/rfc1035}RFC 1035}). The int
6666+ is the actual length. *)
6767+ | Empty_label (** Empty label is not valid for encoding. *)
69687069val pp_error : Format.formatter -> error -> unit
7170(** [pp_error fmt e] pretty-prints an error with position information. *)
···7675(** {1 Constants}
77767877 Punycode parameters as specified in
7979- {{:https://datatracker.ietf.org/doc/html/rfc3492#section-5}RFC 3492 Section 5}. *)
7878+ {{:https://datatracker.ietf.org/doc/html/rfc3492#section-5}RFC 3492 Section
7979+ 5}. *)
80808181val ace_prefix : string
8282-(** The ACE prefix ["xn--"] used for Punycode-encoded domain labels.
8383- See {{:https://datatracker.ietf.org/doc/html/rfc3492#section-5}
8484- RFC 3492 Section 5} which notes that IDNA prepends this prefix. *)
8282+(** The ACE prefix ["xn--"] used for Punycode-encoded domain labels. See
8383+ {{:https://datatracker.ietf.org/doc/html/rfc3492#section-5} RFC 3492 Section
8484+ 5} which notes that IDNA prepends this prefix. *)
85858686val max_label_length : int
8787(** Maximum length of a domain label in bytes (63), per
···89899090(** {1 Case Flags for Mixed-Case Annotation}
91919292- {{:https://datatracker.ietf.org/doc/html/rfc3492#appendix-A}RFC 3492 Appendix A}
9393- describes an optional mechanism for preserving case information through
9494- the encoding/decoding round-trip. This is useful when the original
9292+ {{:https://datatracker.ietf.org/doc/html/rfc3492#appendix-A}RFC 3492
9393+ Appendix A} describes an optional mechanism for preserving case information
9494+ through the encoding/decoding round-trip. This is useful when the original
9595 string's case should be recoverable.
96969797 Note: Mixed-case annotation is not used by the ToASCII and ToUnicode
9898 operations of IDNA. *)
9999100100-type case_flag = Uppercase | Lowercase
101101-(** Case annotation for a character. *)
100100+type case_flag =
101101+ | Uppercase
102102+ | Lowercase (** Case annotation for a character. *)
102103103104(** {1 Core Punycode Operations}
104105105106 These functions implement the Bootstring algorithms from
106106- {{:https://datatracker.ietf.org/doc/html/rfc3492#section-6}RFC 3492 Section 6}.
107107- They operate on arrays of Unicode code points ([Uchar.t array]).
108108- The encoded output is a plain ASCII string without the ACE prefix. *)
107107+ {{:https://datatracker.ietf.org/doc/html/rfc3492#section-6}RFC 3492 Section
108108+ 6}. They operate on arrays of Unicode code points ([Uchar.t array]). The
109109+ encoded output is a plain ASCII string without the ACE prefix. *)
109110110111val encode : Uchar.t array -> (string, error) result
111111-(** [encode codepoints] encodes an array of Unicode code points to a
112112- Punycode ASCII string.
112112+(** [encode codepoints] encodes an array of Unicode code points to a Punycode
113113+ ASCII string.
113114114115 Implements the encoding procedure from
115115- {{:https://datatracker.ietf.org/doc/html/rfc3492#section-6.3}RFC 3492 Section 6.3}:
116116+ {{:https://datatracker.ietf.org/doc/html/rfc3492#section-6.3}RFC 3492
117117+ Section 6.3}:
116118117117- 1. Basic code points (ASCII < 128) are copied literally to the beginning
118118- of the output per {{:https://datatracker.ietf.org/doc/html/rfc3492#section-3.1}
119119- Section 3.1 (Basic code point segregation)}
120120- 2. A delimiter ('-') is appended if there are any basic code points
121121- 3. Non-basic code points are encoded as deltas using the generalized
122122- variable-length integer representation from
123123- {{:https://datatracker.ietf.org/doc/html/rfc3492#section-3.3}Section 3.3}
119119+ 1. Basic code points (ASCII < 128) are copied literally to the beginning of
120120+ the output per
121121+ {{:https://datatracker.ietf.org/doc/html/rfc3492#section-3.1} Section 3.1
122122+ (Basic code point segregation)} 2. A delimiter ('-') is appended if there
123123+ are any basic code points 3. Non-basic code points are encoded as deltas
124124+ using the generalized variable-length integer representation from
125125+ {{:https://datatracker.ietf.org/doc/html/rfc3492#section-3.3}Section 3.3}
124126125127 Example:
126128 {[
···129131 ]} *)
130132131133val decode : string -> (Uchar.t array, error) result
132132-(** [decode punycode] decodes a Punycode ASCII string to an array of
133133- Unicode code points.
134134+(** [decode punycode] decodes a Punycode ASCII string to an array of Unicode
135135+ code points.
134136135137 Implements the decoding procedure from
136136- {{:https://datatracker.ietf.org/doc/html/rfc3492#section-6.2}RFC 3492 Section 6.2}.
138138+ {{:https://datatracker.ietf.org/doc/html/rfc3492#section-6.2}RFC 3492
139139+ Section 6.2}.
137140138138- The input should be the Punycode portion only, without the ACE prefix.
139139- The decoder is case-insensitive for the encoded portion, as required by
140140- {{:https://datatracker.ietf.org/doc/html/rfc3492#section-5}RFC 3492 Section 5}:
141141- "A decoder MUST recognize the letters in both uppercase and lowercase forms".
141141+ The input should be the Punycode portion only, without the ACE prefix. The
142142+ decoder is case-insensitive for the encoded portion, as required by
143143+ {{:https://datatracker.ietf.org/doc/html/rfc3492#section-5}RFC 3492 Section
144144+ 5}: "A decoder MUST recognize the letters in both uppercase and lowercase
145145+ forms".
142146143147 Example:
144148 {[
···148152149153(** {1 Mixed-Case Annotation}
150154151151- These functions support round-trip case preservation as described
152152- in {{:https://datatracker.ietf.org/doc/html/rfc3492#appendix-A}RFC 3492 Appendix A}. *)
155155+ These functions support round-trip case preservation as described in
156156+ {{:https://datatracker.ietf.org/doc/html/rfc3492#appendix-A}RFC 3492
157157+ Appendix A}. *)
153158154154-val encode_with_case : Uchar.t array -> case_flag array -> (string, error) result
159159+val encode_with_case :
160160+ Uchar.t array -> case_flag array -> (string, error) result
155161(** [encode_with_case codepoints case_flags] encodes with case annotation.
156162157157- Per {{:https://datatracker.ietf.org/doc/html/rfc3492#appendix-A}RFC 3492 Appendix A}:
163163+ Per
164164+ {{:https://datatracker.ietf.org/doc/html/rfc3492#appendix-A}RFC 3492
165165+ Appendix A}:
158166 - For basic (ASCII) letters, the output preserves the case flag directly
159167 - For non-ASCII characters, the case of the final digit in each delta
160168 encoding indicates the flag (uppercase = suggested uppercase)
···166174val decode_with_case : string -> (Uchar.t array * case_flag array, error) result
167175(** [decode_with_case punycode] decodes and extracts case annotations.
168176169169- Per {{:https://datatracker.ietf.org/doc/html/rfc3492#appendix-A}RFC 3492 Appendix A},
170170- returns both the decoded code points and an array of case flags
171171- indicating the suggested case for each character based on the
177177+ Per
178178+ {{:https://datatracker.ietf.org/doc/html/rfc3492#appendix-A}RFC 3492
179179+ Appendix A}, returns both the decoded code points and an array of case
180180+ flags indicating the suggested case for each character based on the
172181 uppercase/lowercase form of the encoding digits. *)
173182174183(** {1 UTF-8 String Operations}
···179188val encode_utf8 : string -> (string, error) result
180189(** [encode_utf8 s] encodes a UTF-8 string to Punycode (no ACE prefix).
181190182182- This is equivalent to decoding [s] from UTF-8 to code points, then
183183- calling {!encode}.
191191+ This is equivalent to decoding [s] from UTF-8 to code points, then calling
192192+ {!encode}.
184193185194 Example:
186195 {[
···201210202211(** {1 Domain Label Operations}
203212204204- These functions handle the ACE prefix automatically and enforce
205205- DNS label length limits per {{:https://datatracker.ietf.org/doc/html/rfc1035}RFC 1035}. *)
213213+ These functions handle the ACE prefix automatically and enforce DNS label
214214+ length limits per
215215+ {{:https://datatracker.ietf.org/doc/html/rfc1035}RFC 1035}. *)
206216207217val encode_label : string -> (string, error) result
208218(** [encode_label label] encodes a domain label for use in DNS.
209219210220 If the label contains only ASCII characters, it is returned unchanged.
211211- Otherwise, it is Punycode-encoded with the ACE prefix ("xn--") prepended,
212212- as specified in {{:https://datatracker.ietf.org/doc/html/rfc3492#section-5}
213213- RFC 3492 Section 5}.
221221+ Otherwise, it is Punycode-encoded with the ACE prefix ("xn--") prepended, as
222222+ specified in
223223+ {{:https://datatracker.ietf.org/doc/html/rfc3492#section-5} RFC 3492 Section
224224+ 5}.
214225215226 Returns {!Error} {!Label_too_long} if the result exceeds 63 bytes.
216227217228 Example:
218229 {[
219230 encode_label "münchen"
220220- (* = Ok "xn--mnchen-3ya" *)
221221- encode_label "example"
231231+ (* = Ok "xn--mnchen-3ya" *)
232232+ encode_label "example"
222233 (* = Ok "example" *)
223234 ]} *)
224235225236val decode_label : string -> (string, error) result
226237(** [decode_label label] decodes a domain label.
227238228228- If the label starts with the ACE prefix ("xn--", case-insensitive),
229229- it is Punycode-decoded. Otherwise, it is returned unchanged.
239239+ If the label starts with the ACE prefix ("xn--", case-insensitive), it is
240240+ Punycode-decoded. Otherwise, it is returned unchanged.
230241231242 Example:
232243 {[
233244 decode_label "xn--mnchen-3ya"
234234- (* = Ok "münchen" *)
235235- decode_label "example"
245245+ (* = Ok "münchen" *)
246246+ decode_label "example"
236247 (* = Ok "example" *)
237248 ]} *)
238249···243254val is_basic : Uchar.t -> bool
244255(** [is_basic u] is [true] if [u] is a basic code point (ASCII, < 128).
245256246246- Per {{:https://datatracker.ietf.org/doc/html/rfc3492#section-5}RFC 3492 Section 5},
247247- basic code points for Punycode are the ASCII code points (0..7F). *)
257257+ Per
258258+ {{:https://datatracker.ietf.org/doc/html/rfc3492#section-5}RFC 3492 Section
259259+ 5}, basic code points for Punycode are the ASCII code points (0..7F). *)
248260249261val is_ascii_string : string -> bool
250250-(** [is_ascii_string s] is [true] if [s] contains only ASCII characters
251251- (all bytes < 128). *)
262262+(** [is_ascii_string s] is [true] if [s] contains only ASCII characters (all
263263+ bytes < 128). *)
252264253265val has_ace_prefix : string -> bool
254266(** [has_ace_prefix s] is [true] if [s] starts with the ACE prefix "xn--"
+53-65
lib/punycode_idna.ml
···18181919let pp_error fmt = function
2020 | Punycode_error e ->
2121- Format.fprintf fmt "Punycode error: %a" Punycode.pp_error e
2222- | Invalid_label msg ->
2323- Format.fprintf fmt "invalid label: %s" msg
2121+ Format.fprintf fmt "Punycode error: %a" Punycode.pp_error e
2222+ | Invalid_label msg -> Format.fprintf fmt "invalid label: %s" msg
2423 | Domain_too_long len ->
2525- Format.fprintf fmt "domain too long: %d bytes (max %d)" len max_domain_length
2626- | Normalization_failed ->
2727- Format.fprintf fmt "Unicode normalization failed"
2424+ Format.fprintf fmt "domain too long: %d bytes (max %d)" len
2525+ max_domain_length
2626+ | Normalization_failed -> Format.fprintf fmt "Unicode normalization failed"
2827 | Verification_failed ->
2929- Format.fprintf fmt "IDNA verification failed (round-trip mismatch)"
2828+ Format.fprintf fmt "IDNA verification failed (round-trip mismatch)"
30293131-let error_to_string err =
3232- Format.asprintf "%a" pp_error err
3030+let error_to_string err = Format.asprintf "%a" pp_error err
33313432(* {1 Error Constructors} *)
3533···41394240(* {1 Unicode Normalization} *)
43414444-let normalize_nfc s =
4545- Uunf_string.normalize_utf_8 `NFC s
4242+let normalize_nfc s = Uunf_string.normalize_utf_8 `NFC s
46434744(* {1 Validation Helpers} *)
48454949-let is_ace_label label =
5050- Punycode.has_ace_prefix label
4646+let is_ace_label label = Punycode.has_ace_prefix label
51475248(* Check if a label follows STD3 rules (hostname restrictions):
5349 - Only LDH (letters, digits, hyphens)
···5551let is_std3_valid label =
5652 let len = String.length label in
5753 let is_ldh c =
5858- (c >= 'a' && c <= 'z') ||
5959- (c >= 'A' && c <= 'Z') ||
6060- (c >= '0' && c <= '9') ||
6161- c = '-'
5454+ (c >= 'a' && c <= 'z')
5555+ || (c >= 'A' && c <= 'Z')
5656+ || (c >= '0' && c <= '9')
5757+ || c = '-'
6258 in
6363- len > 0 &&
6464- label.[0] <> '-' &&
6565- label.[len - 1] <> '-' &&
6666- String.for_all is_ldh label
5959+ len > 0
6060+ && label.[0] <> '-'
6161+ && label.[len - 1] <> '-'
6262+ && String.for_all is_ldh label
67636864(* Check hyphen placement: hyphens not in positions 3 and 4 (except for ACE) *)
6965let check_hyphen_rules label =
···7167 if len >= 4 && label.[2] = '-' && label.[3] = '-' then
7268 (* Hyphens in positions 3 and 4 - only valid for ACE prefix *)
7369 is_ace_label label
7474- else
7575- true
7070+ else true
76717772(* {1 Label Operations} *)
78737974let label_to_ascii_impl ~check_hyphens ~use_std3_rules label =
8075 let len = String.length label in
8181- if len = 0 then
8282- invalid_label "empty label"
7676+ if len = 0 then invalid_label "empty label"
8377 else if len > Punycode.max_label_length then
8478 punycode_error (Punycode.Label_too_long len)
8579 else if Punycode.is_ascii_string label then begin
···8882 invalid_label "STD3 rules violation"
8983 else if check_hyphens && not (check_hyphen_rules label) then
9084 invalid_label "invalid hyphen placement"
9191- else
9292- Ok label
9393- end else begin
8585+ else Ok label
8686+ end
8787+ else begin
9488 (* Has non-ASCII - normalize and encode *)
9589 let normalized = normalize_nfc label in
96909791 (* Encode to Punycode *)
9892 match Punycode.encode_utf8 normalized with
9993 | Error e -> punycode_error e
100100- | Ok encoded ->
101101- let result = Punycode.ace_prefix ^ encoded in
102102- let result_len = String.length result in
103103- if result_len > Punycode.max_label_length then
104104- punycode_error (Punycode.Label_too_long result_len)
105105- else if check_hyphens && not (check_hyphen_rules result) then
106106- invalid_label "invalid hyphen placement in encoded label"
107107- else
108108- (* Verification: decode and compare to original normalized form *)
109109- match Punycode.decode_utf8 encoded with
110110- | Error _ -> verification_failed
111111- | Ok decoded ->
112112- if decoded <> normalized then
113113- verification_failed
114114- else
115115- Ok result
9494+ | Ok encoded -> (
9595+ let result = Punycode.ace_prefix ^ encoded in
9696+ let result_len = String.length result in
9797+ if result_len > Punycode.max_label_length then
9898+ punycode_error (Punycode.Label_too_long result_len)
9999+ else if check_hyphens && not (check_hyphen_rules result) then
100100+ invalid_label "invalid hyphen placement in encoded label"
101101+ else
102102+ (* Verification: decode and compare to original normalized form *)
103103+ match Punycode.decode_utf8 encoded with
104104+ | Error _ -> verification_failed
105105+ | Ok decoded ->
106106+ if decoded <> normalized then verification_failed else Ok result)
116107 end
117108118109let label_to_ascii ?(check_hyphens = true) ?(use_std3_rules = false) label =
···124115 match Punycode.decode_utf8 encoded with
125116 | Error e -> punycode_error e
126117 | Ok decoded -> Ok decoded
127127- end else
128128- Ok label
118118+ end
119119+ else Ok label
129120130121(* {1 Domain Operations} *)
131122132123(* Split domain into labels *)
133133-let split_domain domain =
134134- String.split_on_char '.' domain
124124+let split_domain domain = String.split_on_char '.' domain
135125136126(* Join labels into domain *)
137137-let join_labels labels =
138138- String.concat "." labels
127127+let join_labels labels = String.concat "." labels
139128140129(* Map a function returning Result over a list, short-circuiting on first Error *)
141130let map_result f lst =
142142- List.fold_right (fun x acc ->
143143- let open Result.Syntax in
144144- let* y = f x in
145145- let+ ys = acc in
146146- y :: ys
147147- ) lst (Ok [])
131131+ List.fold_right
132132+ (fun x acc ->
133133+ let open Result.Syntax in
134134+ let* y = f x in
135135+ let+ ys = acc in
136136+ y :: ys)
137137+ lst (Ok [])
148138149139let to_ascii ?(check_hyphens = true) ?(check_bidi = false)
150150- ?(check_joiners = false) ?(use_std3_rules = false)
151151- ?(transitional = false) domain =
140140+ ?(check_joiners = false) ?(use_std3_rules = false) ?(transitional = false)
141141+ domain =
152142 (* Note: check_bidi, check_joiners, and transitional are accepted but
153143 not fully implemented - they would require additional Unicode data *)
154144 let _ = check_bidi in
···157147158148 let open Result.Syntax in
159149 let labels = split_domain domain in
160160- let* encoded_labels = map_result (label_to_ascii_impl ~check_hyphens ~use_std3_rules) labels in
150150+ let* encoded_labels =
151151+ map_result (label_to_ascii_impl ~check_hyphens ~use_std3_rules) labels
152152+ in
161153 let result = join_labels encoded_labels in
162154 let len = String.length result in
163163- if len > max_domain_length then
164164- domain_too_long len
165165- else
166166- Ok result
155155+ if len > max_domain_length then domain_too_long len else Ok result
167156168157let to_unicode domain =
169158 let open Result.Syntax in
···191180192181(* {1 Validation} *)
193182194194-let is_idna_valid domain =
195195- Result.is_ok (to_ascii domain)
183183+let is_idna_valid domain = Result.is_ok (to_ascii domain)
+95-77
lib/punycode_idna.mli
···5566(** IDNA (Internationalized Domain Names in Applications) support.
7788- This module provides ToASCII and ToUnicode operations as specified
99- in {{:https://datatracker.ietf.org/doc/html/rfc5891}RFC 5891} (IDNA 2008),
88+ This module provides ToASCII and ToUnicode operations as specified in
99+ {{:https://datatracker.ietf.org/doc/html/rfc5891}RFC 5891} (IDNA 2008),
1010 using Punycode ({{:https://datatracker.ietf.org/doc/html/rfc3492}RFC 3492})
1111 for encoding.
12121313- IDNA allows domain names to contain non-ASCII Unicode characters by
1414- encoding them using Punycode with an ACE prefix. This module handles
1515- the conversion between Unicode domain names and their ASCII-compatible
1616- encoding (ACE) form.
1313+ IDNA allows domain names to contain non-ASCII Unicode characters by encoding
1414+ them using Punycode with an ACE prefix. This module handles the conversion
1515+ between Unicode domain names and their ASCII-compatible encoding (ACE) form.
17161817 {2 References}
1919- {ul
2020- {- {{:https://datatracker.ietf.org/doc/html/rfc5891}RFC 5891} -
2121- Internationalized Domain Names in Applications (IDNA): Protocol}
2222- {- {{:https://datatracker.ietf.org/doc/html/rfc5892}RFC 5892} -
2323- The Unicode Code Points and Internationalized Domain Names for Applications (IDNA)}
2424- {- {{:https://datatracker.ietf.org/doc/html/rfc5893}RFC 5893} -
2525- Right-to-Left Scripts for Internationalized Domain Names for Applications (IDNA)}
2626- {- {{:https://datatracker.ietf.org/doc/html/rfc3492}RFC 3492} -
2727- Punycode: A Bootstring encoding of Unicode for IDNA}} *)
1818+ - {{:https://datatracker.ietf.org/doc/html/rfc5891}RFC 5891} -
1919+ Internationalized Domain Names in Applications (IDNA): Protocol
2020+ - {{:https://datatracker.ietf.org/doc/html/rfc5892}RFC 5892} - The Unicode
2121+ Code Points and Internationalized Domain Names for Applications (IDNA)
2222+ - {{:https://datatracker.ietf.org/doc/html/rfc5893}RFC 5893} - Right-to-Left
2323+ Scripts for Internationalized Domain Names for Applications (IDNA)
2424+ - {{:https://datatracker.ietf.org/doc/html/rfc3492}RFC 3492} - Punycode: A
2525+ Bootstring encoding of Unicode for IDNA *)
28262927(** {1 Error Types} *)
30283129type error =
3230 | Punycode_error of Punycode.error
3333- (** Error during Punycode encoding/decoding.
3434- See {!Punycode.error} for details. *)
3131+ (** Error during Punycode encoding/decoding. See {!Punycode.error} for
3232+ details. *)
3533 | Invalid_label of string
3634 (** Label violates IDNA constraints. The string describes the violation.
3737- See {{:https://datatracker.ietf.org/doc/html/rfc5891#section-4}
3838- RFC 5891 Section 4} for label validation requirements. *)
3535+ See
3636+ {{:https://datatracker.ietf.org/doc/html/rfc5891#section-4} RFC 5891
3737+ Section 4} for label validation requirements. *)
3938 | Domain_too_long of int
4039 (** Domain name exceeds 253 bytes, per
4141- {{:https://datatracker.ietf.org/doc/html/rfc1035}RFC 1035}.
4242- The int is the actual length. *)
4040+ {{:https://datatracker.ietf.org/doc/html/rfc1035}RFC 1035}. The int is
4141+ the actual length. *)
4342 | Normalization_failed
4444- (** Unicode normalization (NFC) failed.
4545- Per {{:https://datatracker.ietf.org/doc/html/rfc5891#section-4.2.1}
4646- RFC 5891 Section 4.2.1}, labels must be in NFC form. *)
4343+ (** Unicode normalization (NFC) failed. Per
4444+ {{:https://datatracker.ietf.org/doc/html/rfc5891#section-4.2.1} RFC
4545+ 5891 Section 4.2.1}, labels must be in NFC form. *)
4746 | Verification_failed
4848- (** ToASCII/ToUnicode verification step failed (round-trip check).
4949- Per {{:https://datatracker.ietf.org/doc/html/rfc5891#section-4.2}
5050- RFC 5891 Section 4.2}, the result of encoding must decode back
5151- to the original input. *)
4747+ (** ToASCII/ToUnicode verification step failed (round-trip check). Per
4848+ {{:https://datatracker.ietf.org/doc/html/rfc5891#section-4.2} RFC 5891
4949+ Section 4.2}, the result of encoding must decode back to the original
5050+ input. *)
52515352val pp_error : Format.formatter -> error -> unit
5453(** [pp_error fmt e] pretty-prints an error. *)
···64636564(** {1 ToASCII Operation}
66656767- Converts an internationalized domain name to its ASCII-compatible
6868- encoding (ACE) form suitable for DNS lookup.
6666+ Converts an internationalized domain name to its ASCII-compatible encoding
6767+ (ACE) form suitable for DNS lookup.
69687070- See {{:https://datatracker.ietf.org/doc/html/rfc5891#section-4}
7171- RFC 5891 Section 4} for the complete ToASCII specification. *)
6969+ See
7070+ {{:https://datatracker.ietf.org/doc/html/rfc5891#section-4} RFC 5891 Section
7171+ 4} for the complete ToASCII specification. *)
72727373-val to_ascii : ?check_hyphens:bool -> ?check_bidi:bool ->
7474- ?check_joiners:bool -> ?use_std3_rules:bool ->
7575- ?transitional:bool -> string -> (string, error) result
7373+val to_ascii :
7474+ ?check_hyphens:bool ->
7575+ ?check_bidi:bool ->
7676+ ?check_joiners:bool ->
7777+ ?use_std3_rules:bool ->
7878+ ?transitional:bool ->
7979+ string ->
8080+ (string, error) result
7681(** [to_ascii domain] converts an internationalized domain name to ASCII.
77827883 Implements the ToASCII operation from
7979- {{:https://datatracker.ietf.org/doc/html/rfc5891#section-4.1}RFC 5891 Section 4.1}.
8484+ {{:https://datatracker.ietf.org/doc/html/rfc5891#section-4.1}RFC 5891
8585+ Section 4.1}.
80868181- For each label in the domain:
8282- 1. If all ASCII, pass through (with optional STD3 validation)
8383- 2. Otherwise, normalize to NFC per
8484- {{:https://datatracker.ietf.org/doc/html/rfc5891#section-4.2.1}Section 4.2.1}
8585- and Punycode-encode with ACE prefix
8787+ For each label in the domain: 1. If all ASCII, pass through (with optional
8888+ STD3 validation) 2. Otherwise, normalize to NFC per
8989+ {{:https://datatracker.ietf.org/doc/html/rfc5891#section-4.2.1}Section
9090+ 4.2.1} and Punycode-encode with ACE prefix
86918787- Optional parameters (per {{:https://datatracker.ietf.org/doc/html/rfc5891#section-4}
8888- RFC 5891 Section 4} processing options):
9292+ Optional parameters (per
9393+ {{:https://datatracker.ietf.org/doc/html/rfc5891#section-4} RFC 5891 Section
9494+ 4} processing options):
8995 - [check_hyphens]: Validate hyphen placement per
9090- {{:https://datatracker.ietf.org/doc/html/rfc5891#section-4.2.3.1}Section 4.2.3.1}
9191- (default: true)
9696+ {{:https://datatracker.ietf.org/doc/html/rfc5891#section-4.2.3.1}Section
9797+ 4.2.3.1} (default: true)
9298 - [check_bidi]: Check bidirectional text rules per
9393- {{:https://datatracker.ietf.org/doc/html/rfc5893}RFC 5893}
9494- (default: false, not implemented)
9999+ {{:https://datatracker.ietf.org/doc/html/rfc5893}RFC 5893} (default:
100100+ false, not implemented)
95101 - [check_joiners]: Check contextual joiner rules per
9696- {{:https://datatracker.ietf.org/doc/html/rfc5892#appendix-A.1}RFC 5892 Appendix A.1}
9797- (default: false, not implemented)
102102+ {{:https://datatracker.ietf.org/doc/html/rfc5892#appendix-A.1}RFC 5892
103103+ Appendix A.1} (default: false, not implemented)
98104 - [use_std3_rules]: Apply STD3 hostname rules per
9999- {{:https://datatracker.ietf.org/doc/html/rfc5891#section-4.2.3.2}Section 4.2.3.2}
100100- (default: false)
101101- - [transitional]: Use IDNA 2003 transitional processing
102102- (default: false)
105105+ {{:https://datatracker.ietf.org/doc/html/rfc5891#section-4.2.3.2}Section
106106+ 4.2.3.2} (default: false)
107107+ - [transitional]: Use IDNA 2003 transitional processing (default: false)
103108104109 Example:
105110 {[
···107112 (* = Ok "xn--mnchen-3ya.example.com" *)
108113 ]} *)
109114110110-val label_to_ascii : ?check_hyphens:bool -> ?use_std3_rules:bool ->
111111- string -> (string, error) result
115115+val label_to_ascii :
116116+ ?check_hyphens:bool ->
117117+ ?use_std3_rules:bool ->
118118+ string ->
119119+ (string, error) result
112120(** [label_to_ascii label] converts a single label to ASCII.
113121114122 This implements the core ToASCII operation for one label, as described in
115115- {{:https://datatracker.ietf.org/doc/html/rfc5891#section-4.1}RFC 5891 Section 4.1}. *)
123123+ {{:https://datatracker.ietf.org/doc/html/rfc5891#section-4.1}RFC 5891
124124+ Section 4.1}. *)
116125117126(** {1 ToUnicode Operation}
118127119128 Converts an ASCII-compatible encoded domain name back to Unicode.
120129121121- See {{:https://datatracker.ietf.org/doc/html/rfc5891#section-4.2}
122122- RFC 5891 Section 4.2} for the complete ToUnicode specification. *)
130130+ See
131131+ {{:https://datatracker.ietf.org/doc/html/rfc5891#section-4.2} RFC 5891
132132+ Section 4.2} for the complete ToUnicode specification. *)
123133124134val to_unicode : string -> (string, error) result
125135(** [to_unicode domain] converts an ACE domain name to Unicode.
126136127137 Implements the ToUnicode operation from
128128- {{:https://datatracker.ietf.org/doc/html/rfc5891#section-4.2}RFC 5891 Section 4.2}.
138138+ {{:https://datatracker.ietf.org/doc/html/rfc5891#section-4.2}RFC 5891
139139+ Section 4.2}.
129140130130- For each label in the domain:
131131- 1. If it has the ACE prefix ("xn--"), Punycode-decode it per
132132- {{:https://datatracker.ietf.org/doc/html/rfc3492#section-6.2}RFC 3492 Section 6.2}
133133- 2. Otherwise, pass through unchanged
141141+ For each label in the domain: 1. If it has the ACE prefix ("xn--"),
142142+ Punycode-decode it per
143143+ {{:https://datatracker.ietf.org/doc/html/rfc3492#section-6.2}RFC 3492
144144+ Section 6.2} 2. Otherwise, pass through unchanged
134145135146 Example:
136147 {[
···142153(** [label_to_unicode label] converts a single ACE label to Unicode.
143154144155 This implements the core ToUnicode operation for one label, as described in
145145- {{:https://datatracker.ietf.org/doc/html/rfc5891#section-4.2}RFC 5891 Section 4.2}. *)
156156+ {{:https://datatracker.ietf.org/doc/html/rfc5891#section-4.2}RFC 5891
157157+ Section 4.2}. *)
146158147159(** {1 Domain Name Integration}
148160···152164 These provide integration with the [Domain_name] module for applications
153165 that use that library for domain name handling. *)
154166155155-val domain_to_ascii : ?check_hyphens:bool -> ?use_std3_rules:bool ->
156156- [`raw] Domain_name.t -> ([`raw] Domain_name.t, error) result
167167+val domain_to_ascii :
168168+ ?check_hyphens:bool ->
169169+ ?use_std3_rules:bool ->
170170+ [ `raw ] Domain_name.t ->
171171+ ([ `raw ] Domain_name.t, error) result
157172(** [domain_to_ascii domain] converts a domain name to ASCII form.
158173159159- Applies {!to_ascii} to the string representation and returns the
160160- result as a [Domain_name.t].
174174+ Applies {!to_ascii} to the string representation and returns the result as a
175175+ [Domain_name.t].
161176162177 Example:
163178 {[
···166181 (* = Ok (Domain_name.of_string_exn "xn--mnchen-3ya.example.com") *)
167182 ]} *)
168183169169-val domain_to_unicode : [`raw] Domain_name.t -> ([`raw] Domain_name.t, error) result
184184+val domain_to_unicode :
185185+ [ `raw ] Domain_name.t -> ([ `raw ] Domain_name.t, error) result
170186(** [domain_to_unicode domain] converts a domain name to Unicode form.
171187172172- Applies {!to_unicode} to the string representation and returns the
173173- result as a [Domain_name.t]. *)
188188+ Applies {!to_unicode} to the string representation and returns the result as
189189+ a [Domain_name.t]. *)
174190175191(** {1 Validation} *)
176192···182198val is_ace_label : string -> bool
183199(** [is_ace_label label] is [true] if the label has the ACE prefix "xn--"
184200 (case-insensitive). This indicates the label is Punycode-encoded per
185185- {{:https://datatracker.ietf.org/doc/html/rfc3492#section-5}RFC 3492 Section 5}. *)
201201+ {{:https://datatracker.ietf.org/doc/html/rfc3492#section-5}RFC 3492 Section
202202+ 5}. *)
186203187204(** {1 Normalization} *)
188205189206val normalize_nfc : string -> string
190207(** [normalize_nfc s] returns the NFC-normalized form of UTF-8 string [s].
191208192192- Per {{:https://datatracker.ietf.org/doc/html/rfc5891#section-4.2.1}
193193- RFC 5891 Section 4.2.1}, domain labels must be normalized to NFC
194194- (Unicode Normalization Form C) before encoding.
209209+ Per
210210+ {{:https://datatracker.ietf.org/doc/html/rfc5891#section-4.2.1} RFC 5891
211211+ Section 4.2.1}, domain labels must be normalized to NFC (Unicode
212212+ Normalization Form C) before encoding.
195213196196- See {{:http://www.unicode.org/reports/tr15/}Unicode Standard Annex #15}
197197- for details on Unicode normalization forms. *)
214214+ See {{:http://www.unicode.org/reports/tr15/}Unicode Standard Annex #15} for
215215+ details on Unicode normalization forms. *)