Punycode (RFC3492) in OCaml

more idiomatic ocaml

+101 -116
+43 -53
lib/punycode.ml
··· 61 | Empty_label -> 62 Format.fprintf fmt "empty label" 63 64 65 (* {1 Case Flags} *) 66 ··· 70 71 let is_basic u = 72 Uchar.to_int u < 0x80 73 - 74 - 75 - let is_delimiter c = c = delimiter 76 77 let is_ascii_string s = 78 - let rec loop i = 79 - if i >= String.length s then true 80 - else if Char.code s.[i] >= 0x80 then false 81 - else loop (i + 1) 82 - in 83 - loop 0 84 85 let has_ace_prefix s = 86 let len = String.length s in ··· 144 let safe_mul_add a b c pos = 145 if c = 0 then Ok a 146 else if b > (max_int_value - a) / c then 147 - Error (Overflow pos) 148 else 149 Ok (a + b * c) 150 ··· 228 229 while !h < input_length && !result = Ok () do 230 (* Find minimum code point >= n *) 231 - let m = ref max_int_value in 232 - for j = 0 to input_length - 1 do 233 - let cp = Uchar.to_int codepoints.(j) in 234 - if cp >= !n && cp < !m then 235 - m := cp 236 - done; 237 238 (* Increase delta to advance state to <m, 0> *) 239 let pos = { byte_offset = 0; char_index = !h } in 240 - (match safe_mul_add !delta (!m - !n) (!h + 1) pos with 241 | Error e -> result := Error e 242 | Ok new_delta -> 243 delta := new_delta; 244 - n := !m; 245 246 (* Process each code point *) 247 let j = ref 0 in ··· 252 if cp < !n then begin 253 incr delta; 254 if !delta = 0 then (* Overflow *) 255 - result := Error (Overflow pos) 256 end 257 else if cp = !n then begin 258 (* Encode delta as variable-length integer *) ··· 316 Ok ([||], [||]) 317 else begin 318 (* Find last delimiter *) 319 - let last_delim = ref (-1) in 320 - for j = 0 to input_length - 1 do 321 - if is_delimiter input.[j] then 322 - last_delim := j 323 - done; 324 - let b = if !last_delim < 0 then 0 else !last_delim in 325 326 (* Copy basic code points and extract case flags *) 327 let output = ref [] in ··· 365 let pos = { byte_offset = !in_pos; char_index = Array.length !output } in 366 367 if !in_pos >= input_length then begin 368 - result := Error (Unexpected_end pos); 369 done_decoding := true 370 end else begin 371 let c = input.[!in_pos] in ··· 373 374 match decode_digit c with 375 | None -> 376 - result := Error (Invalid_digit (pos, c)); 377 done_decoding := true 378 | Some digit -> 379 (* i = i + digit * w, with overflow check *) ··· 397 (* w = w * (base - t), with overflow check *) 398 let base_minus_t = base - t in 399 if !w > max_int_value / base_minus_t then begin 400 - result := Error (Overflow pos); 401 done_decoding := true 402 end else begin 403 w := !w * base_minus_t; ··· 416 (* n = n + i / (out_len + 1), with overflow check *) 417 let increment = !i / (out_len + 1) in 418 if increment > max_int_value - !n then 419 - result := Error (Overflow pos) 420 else begin 421 n := !n + increment; 422 i := !i mod (out_len + 1); 423 424 (* Validate that n is a valid Unicode scalar value *) 425 if not (Uchar.is_valid !n) then 426 - result := Error (Invalid_character (pos, Uchar.rep)) 427 else begin 428 (* Insert n at position i *) 429 let new_output = Array.make (out_len + 1) (Uchar.of_int 0) in ··· 456 end 457 458 let decode input = 459 - match decode_impl input with 460 - | Error e -> Error e 461 - | Ok (codepoints, _) -> Ok codepoints 462 463 let decode_with_case input = 464 decode_impl input ··· 466 (* {1 UTF-8 String Operations} *) 467 468 let encode_utf8 s = 469 - match utf8_to_codepoints s with 470 - | Error e -> Error e 471 - | Ok codepoints -> encode codepoints 472 473 let decode_utf8 punycode = 474 - match decode punycode with 475 - | Error e -> Error e 476 - | Ok codepoints -> Ok (codepoints_to_utf8 codepoints) 477 478 (* {1 Domain Label Operations} *) 479 480 let encode_label label = 481 if String.length label = 0 then 482 - Error Empty_label 483 else if is_ascii_string label then begin 484 (* All ASCII - return as-is, but check length *) 485 let len = String.length label in 486 if len > max_label_length then 487 - Error (Label_too_long len) 488 else 489 Ok label 490 - end else begin 491 (* Has non-ASCII - encode with Punycode *) 492 - match encode_utf8 label with 493 - | Error e -> Error e 494 - | Ok encoded -> 495 - let result = ace_prefix ^ encoded in 496 - let len = String.length result in 497 - if len > max_label_length then 498 - Error (Label_too_long len) 499 - else 500 - Ok result 501 - end 502 503 let decode_label label = 504 if String.length label = 0 then 505 - Error Empty_label 506 else if has_ace_prefix label then begin 507 (* Remove ACE prefix and decode *) 508 let punycode = String.sub label 4 (String.length label - 4) in
··· 61 | Empty_label -> 62 Format.fprintf fmt "empty label" 63 64 + (* {1 Error Constructors} *) 65 + 66 + let overflow pos = Error (Overflow pos) 67 + let invalid_character pos u = Error (Invalid_character (pos, u)) 68 + let invalid_digit pos c = Error (Invalid_digit (pos, c)) 69 + let unexpected_end pos = Error (Unexpected_end pos) 70 + let _invalid_utf8 pos = Error (Invalid_utf8 pos) 71 + let label_too_long len = Error (Label_too_long len) 72 + let empty_label = Error Empty_label 73 74 (* {1 Case Flags} *) 75 ··· 79 80 let is_basic u = 81 Uchar.to_int u < 0x80 82 83 let is_ascii_string s = 84 + String.for_all (fun c -> Char.code c < 0x80) s 85 86 let has_ace_prefix s = 87 let len = String.length s in ··· 145 let safe_mul_add a b c pos = 146 if c = 0 then Ok a 147 else if b > (max_int_value - a) / c then 148 + overflow pos 149 else 150 Ok (a + b * c) 151 ··· 229 230 while !h < input_length && !result = Ok () do 231 (* Find minimum code point >= n *) 232 + let m = Array.fold_left (fun acc cp -> 233 + let cp_val = Uchar.to_int cp in 234 + if cp_val >= !n && cp_val < acc then cp_val else acc 235 + ) max_int_value codepoints in 236 237 (* Increase delta to advance state to <m, 0> *) 238 let pos = { byte_offset = 0; char_index = !h } in 239 + (match safe_mul_add !delta (m - !n) (!h + 1) pos with 240 | Error e -> result := Error e 241 | Ok new_delta -> 242 delta := new_delta; 243 + n := m; 244 245 (* Process each code point *) 246 let j = ref 0 in ··· 251 if cp < !n then begin 252 incr delta; 253 if !delta = 0 then (* Overflow *) 254 + result := overflow pos 255 end 256 else if cp = !n then begin 257 (* Encode delta as variable-length integer *) ··· 315 Ok ([||], [||]) 316 else begin 317 (* Find last delimiter *) 318 + let b = Option.value ~default:0 (String.rindex_opt input delimiter) in 319 320 (* Copy basic code points and extract case flags *) 321 let output = ref [] in ··· 359 let pos = { byte_offset = !in_pos; char_index = Array.length !output } in 360 361 if !in_pos >= input_length then begin 362 + result := unexpected_end pos; 363 done_decoding := true 364 end else begin 365 let c = input.[!in_pos] in ··· 367 368 match decode_digit c with 369 | None -> 370 + result := invalid_digit pos c; 371 done_decoding := true 372 | Some digit -> 373 (* i = i + digit * w, with overflow check *) ··· 391 (* w = w * (base - t), with overflow check *) 392 let base_minus_t = base - t in 393 if !w > max_int_value / base_minus_t then begin 394 + result := overflow pos; 395 done_decoding := true 396 end else begin 397 w := !w * base_minus_t; ··· 410 (* n = n + i / (out_len + 1), with overflow check *) 411 let increment = !i / (out_len + 1) in 412 if increment > max_int_value - !n then 413 + result := overflow pos 414 else begin 415 n := !n + increment; 416 i := !i mod (out_len + 1); 417 418 (* Validate that n is a valid Unicode scalar value *) 419 if not (Uchar.is_valid !n) then 420 + result := invalid_character pos Uchar.rep 421 else begin 422 (* Insert n at position i *) 423 let new_output = Array.make (out_len + 1) (Uchar.of_int 0) in ··· 450 end 451 452 let decode input = 453 + Result.map fst (decode_impl input) 454 455 let decode_with_case input = 456 decode_impl input ··· 458 (* {1 UTF-8 String Operations} *) 459 460 let encode_utf8 s = 461 + let open Result.Syntax in 462 + let* codepoints = utf8_to_codepoints s in 463 + encode codepoints 464 465 let decode_utf8 punycode = 466 + let open Result.Syntax in 467 + let+ codepoints = decode punycode in 468 + codepoints_to_utf8 codepoints 469 470 (* {1 Domain Label Operations} *) 471 472 let encode_label label = 473 if String.length label = 0 then 474 + empty_label 475 else if is_ascii_string label then begin 476 (* All ASCII - return as-is, but check length *) 477 let len = String.length label in 478 if len > max_label_length then 479 + label_too_long len 480 else 481 Ok label 482 + end else 483 (* Has non-ASCII - encode with Punycode *) 484 + let open Result.Syntax in 485 + let* encoded = encode_utf8 label in 486 + let result = ace_prefix ^ encoded in 487 + let len = String.length result in 488 + if len > max_label_length then 489 + label_too_long len 490 + else 491 + Ok result 492 493 let decode_label label = 494 if String.length label = 0 then 495 + empty_label 496 else if has_ace_prefix label then begin 497 (* Remove ACE prefix and decode *) 498 let punycode = String.sub label 4 (String.length label - 4) in
+58 -63
lib/punycode_idna.ml
··· 28 | Verification_failed -> 29 Format.fprintf fmt "IDNA verification failed (round-trip mismatch)" 30 31 32 (* {1 Unicode Normalization} *) 33 ··· 44 - Cannot start or end with hyphen *) 45 let is_std3_valid label = 46 let len = String.length label in 47 - if len = 0 then false 48 - else if label.[0] = '-' || label.[len - 1] = '-' then false 49 - else 50 - let rec check i = 51 - if i >= len then true 52 - else 53 - let c = label.[i] in 54 - let valid = 55 - (c >= 'a' && c <= 'z') || 56 - (c >= 'A' && c <= 'Z') || 57 - (c >= '0' && c <= '9') || 58 - c = '-' 59 - in 60 - if valid then check (i + 1) else false 61 - in 62 - check 0 63 64 (* Check hyphen placement: hyphens not in positions 3 and 4 (except for ACE) *) 65 let check_hyphen_rules label = ··· 75 let label_to_ascii_impl ~check_hyphens ~use_std3_rules label = 76 let len = String.length label in 77 if len = 0 then 78 - Error (Invalid_label "empty label") 79 else if len > Punycode.max_label_length then 80 - Error (Punycode_error (Punycode.Label_too_long len)) 81 else if Punycode.is_ascii_string label then begin 82 (* All ASCII - validate and pass through *) 83 if use_std3_rules && not (is_std3_valid label) then 84 - Error (Invalid_label "STD3 rules violation") 85 else if check_hyphens && not (check_hyphen_rules label) then 86 - Error (Invalid_label "invalid hyphen placement") 87 else 88 Ok label 89 end else begin ··· 92 93 (* Encode to Punycode *) 94 match Punycode.encode_utf8 normalized with 95 - | Error e -> Error (Punycode_error e) 96 | Ok encoded -> 97 let result = Punycode.ace_prefix ^ encoded in 98 let result_len = String.length result in 99 if result_len > Punycode.max_label_length then 100 - Error (Punycode_error (Punycode.Label_too_long result_len)) 101 else if check_hyphens && not (check_hyphen_rules result) then 102 - Error (Invalid_label "invalid hyphen placement in encoded label") 103 else 104 (* Verification: decode and compare to original normalized form *) 105 match Punycode.decode_utf8 encoded with 106 - | Error _ -> Error Verification_failed 107 | Ok decoded -> 108 if decoded <> normalized then 109 - Error Verification_failed 110 else 111 Ok result 112 end ··· 118 if is_ace_label label then begin 119 let encoded = String.sub label 4 (String.length label - 4) in 120 match Punycode.decode_utf8 encoded with 121 - | Error e -> Error (Punycode_error e) 122 | Ok decoded -> Ok decoded 123 end else 124 Ok label ··· 133 let join_labels labels = 134 String.concat "." labels 135 136 let to_ascii ?(check_hyphens = true) ?(check_bidi = false) 137 ?(check_joiners = false) ?(use_std3_rules = false) 138 ?(transitional = false) domain = ··· 142 let _ = check_joiners in 143 let _ = transitional in 144 145 let labels = split_domain domain in 146 - let rec process acc = function 147 - | [] -> 148 - let result = join_labels (List.rev acc) in 149 - let len = String.length result in 150 - if len > max_domain_length then 151 - Error (Domain_too_long len) 152 - else 153 - Ok result 154 - | label :: rest -> 155 - match label_to_ascii_impl ~check_hyphens ~use_std3_rules label with 156 - | Error e -> Error e 157 - | Ok encoded -> process (encoded :: acc) rest 158 - in 159 - process [] labels 160 161 let to_unicode domain = 162 let labels = split_domain domain in 163 - let rec process acc = function 164 - | [] -> Ok (join_labels (List.rev acc)) 165 - | label :: rest -> 166 - match label_to_unicode label with 167 - | Error e -> Error e 168 - | Ok decoded -> process (decoded :: acc) rest 169 - in 170 - process [] labels 171 172 (* {1 Domain Name Library Integration} *) 173 174 let domain_to_ascii ?(check_hyphens = true) ?(use_std3_rules = false) domain = 175 let s = Domain_name.to_string domain in 176 - match to_ascii ~check_hyphens ~use_std3_rules s with 177 - | Error e -> Error e 178 - | Ok ascii -> 179 - match Domain_name.of_string ascii with 180 - | Error (`Msg msg) -> Error (Invalid_label msg) 181 - | Ok d -> Ok d 182 183 let domain_to_unicode domain = 184 let s = Domain_name.to_string domain in 185 - match to_unicode s with 186 - | Error e -> Error e 187 - | Ok unicode -> 188 - match Domain_name.of_string unicode with 189 - | Error (`Msg msg) -> Error (Invalid_label msg) 190 - | Ok d -> Ok d 191 192 (* {1 Validation} *) 193 194 let is_idna_valid domain = 195 - match to_ascii domain with 196 - | Ok _ -> true 197 - | Error _ -> false
··· 28 | Verification_failed -> 29 Format.fprintf fmt "IDNA verification failed (round-trip mismatch)" 30 31 + (* {1 Error Constructors} *) 32 + 33 + let punycode_error e = Error (Punycode_error e) 34 + let invalid_label msg = Error (Invalid_label msg) 35 + let domain_too_long len = Error (Domain_too_long len) 36 + let _normalization_failed = Error Normalization_failed 37 + let verification_failed = Error Verification_failed 38 39 (* {1 Unicode Normalization} *) 40 ··· 51 - Cannot start or end with hyphen *) 52 let is_std3_valid label = 53 let len = String.length label in 54 + let is_ldh c = 55 + (c >= 'a' && c <= 'z') || 56 + (c >= 'A' && c <= 'Z') || 57 + (c >= '0' && c <= '9') || 58 + c = '-' 59 + in 60 + len > 0 && 61 + label.[0] <> '-' && 62 + label.[len - 1] <> '-' && 63 + String.for_all is_ldh label 64 65 (* Check hyphen placement: hyphens not in positions 3 and 4 (except for ACE) *) 66 let check_hyphen_rules label = ··· 76 let label_to_ascii_impl ~check_hyphens ~use_std3_rules label = 77 let len = String.length label in 78 if len = 0 then 79 + invalid_label "empty label" 80 else if len > Punycode.max_label_length then 81 + punycode_error (Punycode.Label_too_long len) 82 else if Punycode.is_ascii_string label then begin 83 (* All ASCII - validate and pass through *) 84 if use_std3_rules && not (is_std3_valid label) then 85 + invalid_label "STD3 rules violation" 86 else if check_hyphens && not (check_hyphen_rules label) then 87 + invalid_label "invalid hyphen placement" 88 else 89 Ok label 90 end else begin ··· 93 94 (* Encode to Punycode *) 95 match Punycode.encode_utf8 normalized with 96 + | Error e -> punycode_error e 97 | Ok encoded -> 98 let result = Punycode.ace_prefix ^ encoded in 99 let result_len = String.length result in 100 if result_len > Punycode.max_label_length then 101 + punycode_error (Punycode.Label_too_long result_len) 102 else if check_hyphens && not (check_hyphen_rules result) then 103 + invalid_label "invalid hyphen placement in encoded label" 104 else 105 (* Verification: decode and compare to original normalized form *) 106 match Punycode.decode_utf8 encoded with 107 + | Error _ -> verification_failed 108 | Ok decoded -> 109 if decoded <> normalized then 110 + verification_failed 111 else 112 Ok result 113 end ··· 119 if is_ace_label label then begin 120 let encoded = String.sub label 4 (String.length label - 4) in 121 match Punycode.decode_utf8 encoded with 122 + | Error e -> punycode_error e 123 | Ok decoded -> Ok decoded 124 end else 125 Ok label ··· 134 let join_labels labels = 135 String.concat "." labels 136 137 + (* Map a function returning Result over a list, short-circuiting on first Error *) 138 + let map_result f lst = 139 + List.fold_right (fun x acc -> 140 + let open Result.Syntax in 141 + let* y = f x in 142 + let+ ys = acc in 143 + y :: ys 144 + ) lst (Ok []) 145 + 146 let to_ascii ?(check_hyphens = true) ?(check_bidi = false) 147 ?(check_joiners = false) ?(use_std3_rules = false) 148 ?(transitional = false) domain = ··· 152 let _ = check_joiners in 153 let _ = transitional in 154 155 + let open Result.Syntax in 156 let labels = split_domain domain in 157 + let* encoded_labels = map_result (label_to_ascii_impl ~check_hyphens ~use_std3_rules) labels in 158 + let result = join_labels encoded_labels in 159 + let len = String.length result in 160 + if len > max_domain_length then 161 + domain_too_long len 162 + else 163 + Ok result 164 165 let to_unicode domain = 166 + let open Result.Syntax in 167 let labels = split_domain domain in 168 + let+ decoded_labels = map_result label_to_unicode labels in 169 + join_labels decoded_labels 170 171 (* {1 Domain Name Library Integration} *) 172 173 let domain_to_ascii ?(check_hyphens = true) ?(use_std3_rules = false) domain = 174 + let open Result.Syntax in 175 let s = Domain_name.to_string domain in 176 + let* ascii = to_ascii ~check_hyphens ~use_std3_rules s in 177 + match Domain_name.of_string ascii with 178 + | Error (`Msg msg) -> invalid_label msg 179 + | Ok d -> Ok d 180 181 let domain_to_unicode domain = 182 + let open Result.Syntax in 183 let s = Domain_name.to_string domain in 184 + let* unicode = to_unicode s in 185 + match Domain_name.of_string unicode with 186 + | Error (`Msg msg) -> invalid_label msg 187 + | Ok d -> Ok d 188 189 (* {1 Validation} *) 190 191 let is_idna_valid domain = 192 + Result.is_ok (to_ascii domain)