···99 mutable html_locator : (int * int) option; (* line, column *)
1010 mutable in_body : bool;
1111 mutable skip_depth : int; (* depth in elements to skip *)
1212+ mutable foreign_depth : int; (* depth in SVG/MathML content to skip *)
1213 mutable text_buffer : Buffer.t;
1314 mutable char_count : int;
1415}
···3233 html_locator = None;
3334 in_body = false;
3435 skip_depth = 0;
3636+ foreign_depth = 0;
3537 text_buffer = Buffer.create 4096;
3638 char_count = 0;
3739}
···4244 state.html_locator <- None;
4345 state.in_body <- false;
4446 state.skip_depth <- 0;
4747+ state.foreign_depth <- 0;
4548 Buffer.clear state.text_buffer;
4649 state.char_count <- 0
(* Namespace URIs whose content is left out of language detection. *)
let svg_namespace = "http://www.w3.org/2000/svg"
let mathml_namespace = "http://www.w3.org/1998/Math/MathML"

(* [is_foreign_namespace ns] is [true] exactly when [ns] equals the SVG or
   the MathML namespace URI. The comparison is an exact, case-sensitive
   string match. *)
let is_foreign_namespace ns =
  List.exists (String.equal ns) [ svg_namespace; mathml_namespace ]
(* Element names that start foreign content, used as a fallback for when the
   namespace isn't set. [is_foreign_element name] is [true] when [name],
   lowercased with ASCII rules, is "svg" or "math". *)
let is_foreign_element name =
  match String.lowercase_ascii name with
  | "svg" | "math" -> true
  | _ -> false
6262+4863let get_attr name attrs =
4964 List.find_map (fun (n, v) ->
5065 if String.lowercase_ascii n = name then Some v else None
···126141 | "zh-tw" -> "zh-hant"
127142 | _ -> code
128143129129-let start_element state ~name ~namespace:_ ~attrs _collector =
144144+let start_element state ~name ~namespace ~attrs _collector =
130145 let name_lower = String.lowercase_ascii name in
146146+ let ns = Option.value namespace ~default:"" in
131147132148 if name_lower = "html" then begin
133149 state.html_lang <- get_attr "lang" attrs;
···138154 else if name_lower = "body" then
139155 state.in_body <- true
140156 else if state.in_body then begin
157157+ (* Track foreign namespace depth (SVG/MathML) *)
158158+ if is_foreign_namespace ns || is_foreign_element name then
159159+ state.foreign_depth <- state.foreign_depth + 1
160160+ else if state.foreign_depth > 0 then
161161+ state.foreign_depth <- state.foreign_depth + 1
141162 (* Check if we should skip this element's text *)
142142- if List.mem name_lower skip_elements then
163163+ else if List.mem name_lower skip_elements then
143164 state.skip_depth <- state.skip_depth + 1
144165 else begin
145166 (* Check for different lang attribute *)
···154175 let name_lower = String.lowercase_ascii name in
155176 if name_lower = "body" then
156177 state.in_body <- false
157157- else if state.in_body && state.skip_depth > 0 then begin
158158- if List.mem name_lower skip_elements then
159159- state.skip_depth <- state.skip_depth - 1
160160- else begin
161161- (* TODO: properly track nested elements with different lang *)
162162- state.skip_depth <- max 0 (state.skip_depth - 1)
178178+ else if state.in_body then begin
179179+ (* Track foreign namespace depth *)
180180+ if state.foreign_depth > 0 then
181181+ state.foreign_depth <- state.foreign_depth - 1
182182+ else if state.skip_depth > 0 then begin
183183+ if List.mem name_lower skip_elements then
184184+ state.skip_depth <- state.skip_depth - 1
185185+ else
186186+ (* TODO: properly track nested elements with different lang *)
187187+ state.skip_depth <- max 0 (state.skip_depth - 1)
163188 end
164189 end
165190166191let characters state text _collector =
167167- if state.in_body && state.skip_depth = 0 && state.char_count < max_chars then begin
192192+ if state.in_body && state.skip_depth = 0 && state.foreign_depth = 0 && state.char_count < max_chars then begin
168193 (* Count Unicode code points, not bytes *)
169194 let decoder = Uutf.decoder ~encoding:`UTF_8 (`String text) in
170195 let rec process () =
···197222 match Langdetect.detect_with_prob detector text with
198223 | None -> ()
199224 | Some (detected_lang, prob) when prob > 0.90 ->
200200- let declared_lang = match state.html_lang with
201201- | Some l -> get_lang_code l
225225+ (* Get the original declared lang value (preserve exactly as written) *)
226226+ let original_declared = match state.html_lang with
227227+ | Some l -> l
202228 | None -> ""
203229 in
204230 let detected_code = detected_lang in (* Keep full code like zh-tw *)
205231 let detected_name = get_language_name detected_lang in
206232 let suggested_code = get_bcp47_code detected_lang in
207233208208- (* Check for language mismatch *)
209209- let base_declared = get_lang_code declared_lang in
234234+ (* Check for language mismatch using base codes *)
235235+ let base_declared = get_lang_code original_declared in
210236 let base_detected = get_lang_code detected_code in
211211- if declared_lang = "" then begin
237237+ if original_declared = "" then begin
212238 (* No lang attribute - suggest adding one *)
213239 Message_collector.add_warning collector
214240 ~message:(Printf.sprintf
···224250 Message_collector.add_warning collector
225251 ~message:(Printf.sprintf
226252 "This document appears to be written in %s but the \xe2\x80\x9chtml\xe2\x80\x9d start tag has \xe2\x80\x9clang=\"%s\"\xe2\x80\x9d. Consider using \xe2\x80\x9clang=\"%s\"\xe2\x80\x9d (or variant) instead."
227227- detected_name declared_lang suggested_code)
253253+ detected_name original_declared suggested_code)
228254 ~code:"wrong-lang"
229255 ~element:"html"
230256 ()
+22-13
lib/html5_checker/specialized/h1_checker.ml
(* Mutable state carried by the h1 checker between parser events. *)
type state = {
  (* Number of h1 start tags counted so far. *)
  mutable h1_count : int;
  (* Number of svg elements currently open; positive means "inside SVG". *)
  mutable svg_depth : int;
}
(* Build a fresh checker state: no h1 counted and no svg element open. *)
let create () = { svg_depth = 0; h1_count = 0 }
(* Set both counters of [state] back to zero, the same values [create]
   starts with. *)
let reset state =
  state.svg_depth <- 0;
  state.h1_count <- 0
(* Start-tag handler for the h1 checker.

   - An element named "svg" (ASCII case-insensitive) opens one more SVG
     level, whatever its namespace.
   - An "h1" is counted only when it carries no namespace and no svg element
     is currently open; from the second counted h1 onwards an informational
     "multiple-h1" message is added to [collector].
   - Every other element is ignored.

   [attrs] is accepted for interface compatibility and not inspected. *)
let start_element state ~name ~namespace ~attrs collector =
  ignore attrs;
  let inside_svg = state.svg_depth > 0 in
  match String.lowercase_ascii name, namespace with
  | "svg", _ ->
    state.svg_depth <- state.svg_depth + 1
  | "h1", None when not inside_svg ->
    let seen = state.h1_count + 1 in
    state.h1_count <- seen;
    (* The first h1 is fine; only repeats are reported. *)
    if seen > 1 then
      Message_collector.add_info collector
        ~message:"Consider using only one \xe2\x80\x9ch1\xe2\x80\x9d element per document (or, if using \xe2\x80\x9ch1\xe2\x80\x9d elements multiple times is required, consider using the \xe2\x80\x9cheadingoffset\xe2\x80\x9d attribute to indicate that these \xe2\x80\x9ch1\xe2\x80\x9d elements are not all top-level headings)."
        ~code:"multiple-h1"
        ~element:name ()
  | _ ->
    (* Non-HTML namespace, nested inside SVG, or simply not an h1. *)
    ()
(* End-tag handler: closing an "svg" element (ASCII case-insensitive) leaves
   one SVG level. The depth is never driven below zero, so an unmatched
   closing tag is ignored. *)
let end_element state ~name ~namespace:_ _collector =
  let closes_svg = String.lowercase_ascii name = "svg" in
  if closes_svg && state.svg_depth > 0 then
    state.svg_depth <- state.svg_depth - 1

(* Character data is not inspected by this checker; deliberate no-op. *)
let characters _state _text _collector : unit = ()
(* Nothing is reported at end of document; deliberate no-op. *)
let end_document _state _collector : unit = ()
3241