Detect which human language a document is written in, using OCaml; derived from the Nu Html validator
languages unicode ocaml

fix

+42 -5
+6 -5
lib/langdetect.ml
··· 146 146 let prob = init_prob n_langs in 147 147 let alpha = t.config.alpha +. (random_gaussian () *. alpha_width) in 148 148 let converged = ref false in 149 - let i = ref 0 in 150 - while (not !converged) && !i < iteration_limit do 149 + let iter_count = ref 0 in 150 + while (not !converged) && !iter_count < iteration_limit do 151 151 let r = random_int (Array.length ngrams) in 152 152 let (_ : bool) = update_lang_prob prob ngrams.(r) t.word_lang_prob alpha in 153 - if !i mod 5 = 0 then 153 + if !iter_count mod 5 = 0 then begin 154 154 let max_p = normalize_prob prob in 155 - if max_p > t.config.conv_threshold then converged := true; 156 - incr i 155 + if max_p > t.config.conv_threshold then converged := true 156 + end; 157 + incr iter_count 157 158 done; 158 159 for j = 0 to n_langs - 1 do 159 160 lang_prob.(j) <- lang_prob.(j) +. (prob.(j) /. float_of_int t.config.n_trial)
+36
test/test_langdetect.ml
··· 538 538 with exn -> 539 539 Alcotest.fail (Printf.sprintf "Exception testing profiles: %s" (Printexc.to_string exn)) 540 540 541 + (* Regression test: ensure detection loop completes in reasonable time. 542 + This catches bugs like the loop counter increment being captured by a [let ... in] body inside the [if], which left the counter stuck and caused infinite loops. *) 543 + let test_detection_completes_quickly () = 544 + let d = get_detector () in 545 + let start_time = Unix.gettimeofday () in 546 + (* Run detection on several texts to ensure it completes *) 547 + List.iter (fun (_, _, text) -> 548 + let _ = Langdetect.detect_best d text in () 549 + ) all_test_corpus; 550 + let elapsed = Unix.gettimeofday () -. start_time in 551 + (* All detections should complete within 5 seconds total *) 552 + if elapsed > 5.0 then 553 + Alcotest.fail (Printf.sprintf "Detection took too long: %.2f seconds (expected < 5s)" elapsed) 554 + else 555 + Printf.printf "Detection completed in %.2f seconds\n" elapsed 556 + 557 + (* Regression test: verify iteration_limit is respected in detect_block *) 558 + let test_iteration_limit_respected () = 559 + let d = get_detector () in 560 + (* Use a text that might not converge quickly *) 561 + let mixed_text = String.concat " " (List.init 100 (fun i -> 562 + if i mod 3 = 0 then "hello" 563 + else if i mod 3 = 1 then "bonjour" 564 + else "hallo" 565 + )) in 566 + let start_time = Unix.gettimeofday () in 567 + let _ = Langdetect.detect d mixed_text in 568 + let elapsed = Unix.gettimeofday () -. 
start_time in 569 + (* Single detection should complete within 1 second *) 570 + if elapsed > 1.0 then 571 + Alcotest.fail (Printf.sprintf "Single detection took too long: %.2f seconds (expected < 1s)" elapsed) 572 + 541 573 (* Main test suite *) 542 574 let () = 543 575 Alcotest.run "Langdetect" [ ··· 583 615 Alcotest.test_case "very long text" `Quick test_very_long_text; 584 616 Alcotest.test_case "repeated detection consistency" `Quick test_repeated_detection_consistency; 585 617 Alcotest.test_case "all profiles functional" `Quick test_all_profiles_functional; 618 + ]); 619 + ("Regression tests", [ 620 + Alcotest.test_case "detection completes quickly" `Quick test_detection_completes_quickly; 621 + Alcotest.test_case "iteration limit respected" `Quick test_iteration_limit_respected; 586 622 ]); 587 623 ]