···1919}
20202121let check ?(collect_parse_errors = true) ?system_id reader =
2222- let doc = Html5rw.parse ~collect_errors:collect_parse_errors reader in
2322 let collector = Message_collector.create () in
24232525- (* Add parse errors if collected *)
2626- if collect_parse_errors then begin
2727- let parse_errors = Parse_error_bridge.collect_parse_errors ?system_id doc in
2828- List.iter (Message_collector.add collector) parse_errors
2929- end;
2424+ (* Check if this is an XHTML file - use XML parser if so *)
2525+ if Xhtml_parser.is_xhtml_file system_id then begin
2626+ (* Read all content for XHTML parsing *)
2727+ let content = Bytesrw.Bytes.Reader.to_string reader in
2828+2929+ match Xhtml_parser.parse_xhtml content with
3030+ | Ok root ->
3131+ (* Run all registered checkers via DOM traversal *)
3232+ let registry = Checker_registry.default () in
3333+ Dom_walker.walk_registry registry collector root;
3434+ let dummy_doc = Html5rw.parse (Bytesrw.Bytes.Reader.of_string "") in
3535+ { doc = dummy_doc; msgs = Message_collector.messages collector; system_id }
3636+ | Error msg ->
3737+ Message_collector.add_error collector ~message:msg ~code:"xml-parse-error" ();
3838+ let dummy_doc = Html5rw.parse (Bytesrw.Bytes.Reader.of_string "") in
3939+ { doc = dummy_doc; msgs = Message_collector.messages collector; system_id }
4040+ end
4141+ else begin
4242+ (* Standard HTML5 parsing *)
4343+ let doc = Html5rw.parse ~collect_errors:collect_parse_errors reader in
30443131- (* Run all registered checkers via DOM traversal *)
3232- let registry = Checker_registry.default () in
3333- Dom_walker.walk_registry registry collector (Html5rw.root doc);
4545+ (* Add parse errors if collected *)
4646+ if collect_parse_errors then begin
4747+ let parse_errors = Parse_error_bridge.collect_parse_errors ?system_id doc in
4848+ List.iter (Message_collector.add collector) parse_errors
4949+ end;
34503535- { doc; msgs = Message_collector.messages collector; system_id }
5151+ (* Run all registered checkers via DOM traversal *)
5252+ let registry = Checker_registry.default () in
5353+ Dom_walker.walk_registry registry collector (Html5rw.root doc);
5454+5555+ { doc; msgs = Message_collector.messages collector; system_id }
5656+ end
36573758let check_dom ?(collect_parse_errors = true) ?system_id doc =
3859 let collector = Message_collector.create () in
···33type state = {
44 mutable _in_figure : bool;
55 (** Track if we're inside a <figure> element (alt is more critical there) *)
66+ mutable in_a_with_href : bool;
77+ (** Track if we're inside an <a> element with href attribute *)
68}
7988-let create () = { _in_figure = false }
1010+let create () = { _in_figure = false; in_a_with_href = false }
9111010-let reset state = state._in_figure <- false
1212+let reset state =
1313+ state._in_figure <- false;
1414+ state.in_a_with_href <- false
11151216(** Check if an attribute list contains a specific attribute. *)
1317let has_attr name attrs =
···2024 if String.equal attr_name name then Some value else None)
2125 attrs
22262323-let check_img_element attrs collector =
2727+let check_img_element state attrs collector =
2428 (* Check for required src OR srcset attribute *)
2529 if not (has_attr "src" attrs) && not (has_attr "srcset" attrs) then
2630 Message_collector.add_error collector
···3135 if not (has_attr "alt" attrs) then
3236 Message_collector.add_error collector
3337 ~message:"img element requires alt attribute for accessibility"
3434- ~code:"missing-required-attribute" ~element:"img" ~attribute:"alt" ()
3838+ ~code:"missing-required-attribute" ~element:"img" ~attribute:"alt" ();
3939+4040+ (* Check ismap requires 'a' ancestor with href *)
4141+ if has_attr "ismap" attrs && not state.in_a_with_href then
4242+ Message_collector.add_error collector
4343+ ~message:"The \xe2\x80\x9cimg\xe2\x80\x9d element with the \xe2\x80\x9cismap\xe2\x80\x9d attribute set must have an \xe2\x80\x9ca\xe2\x80\x9d ancestor with the \xe2\x80\x9chref\xe2\x80\x9d attribute."
4444+ ~code:"missing-required-ancestor" ~element:"img" ~attribute:"ismap" ()
35453646let check_area_element attrs collector =
3747 (* area with href requires alt *)
···143153 ~code:"bad-attribute-value" ~element:"div" ~attribute:"popover" ()
144154 | None -> ()
145155156156+let check_meter_element attrs collector =
157157+ (* meter requires value attribute *)
158158+ if not (has_attr "value" attrs) then
159159+ Message_collector.add_error collector
160160+ ~message:"Element \xe2\x80\x9cmeter\xe2\x80\x9d is missing required attribute \xe2\x80\x9cvalue\xe2\x80\x9d."
161161+ ~code:"missing-required-attribute" ~element:"meter" ~attribute:"value" ()
162162+ else begin
163163+ (* Validate min <= value constraint *)
164164+ match get_attr "value" attrs, get_attr "min" attrs with
165165+ | Some value_str, Some min_str ->
166166+ (try
167167+ let value = float_of_string value_str in
168168+ let min_val = float_of_string min_str in
169169+ if min_val > value then
170170+ Message_collector.add_error collector
171171+ ~message:"The value of the \xe2\x80\x9cmin\xe2\x80\x9d attribute must be less than or equal to the value of the \xe2\x80\x9cvalue\xe2\x80\x9d attribute."
172172+ ~code:"bad-attribute-value" ~element:"meter" ~attribute:"min" ()
173173+ with _ -> ())
174174+ | _ -> ()
175175+ end
176176+177177+let check_progress_element attrs collector =
178178+ (* Validate progress value constraints *)
179179+ match get_attr "value" attrs with
180180+ | None -> () (* value is optional *)
181181+ | Some value_str ->
182182+ (try
183183+ let value = float_of_string value_str in
184184+ let max_val = match get_attr "max" attrs with
185185+ | None -> 1.0 (* default max is 1 *)
186186+ | Some max_str -> (try float_of_string max_str with _ -> 1.0)
187187+ in
188188+ if value > max_val then
189189+ (* Check which message to use based on whether max is present *)
190190+ if has_attr "max" attrs then
191191+ Message_collector.add_error collector
192192+ ~message:"The value of the \xe2\x80\x9cvalue\xe2\x80\x9d attribute must be less than or equal to the value of the \xe2\x80\x9cmax\xe2\x80\x9d attribute."
193193+ ~code:"bad-attribute-value" ~element:"progress" ~attribute:"value" ()
194194+ else
195195+ Message_collector.add_error collector
196196+ ~message:"The value of the \xe2\x80\x9cvalue\xe2\x80\x9d attribute must be less than or equal to one when the \xe2\x80\x9cmax\xe2\x80\x9d attribute is absent."
197197+ ~code:"bad-attribute-value" ~element:"progress" ~attribute:"value" ()
198198+ with _ -> ())
199199+146200let start_element state ~name ~namespace:_ ~attrs collector =
147201 match name with
148148- | "img" -> check_img_element attrs collector
202202+ | "img" -> check_img_element state attrs collector
149203 | "area" -> check_area_element attrs collector
150204 | "input" -> check_input_element attrs collector
151205 | "script" -> check_script_element attrs collector
152206 | "meta" -> check_meta_element attrs collector
153207 | "link" -> check_link_element attrs collector
154154- | "a" -> check_a_element attrs collector
208208+ | "a" ->
209209+ check_a_element attrs collector;
210210+ if has_attr "href" attrs then state.in_a_with_href <- true
155211 | "map" -> check_map_element attrs collector
156212 | "object" -> check_object_element attrs collector
213213+ | "meter" -> check_meter_element attrs collector
214214+ | "progress" -> check_progress_element attrs collector
157215 | "figure" -> state._in_figure <- true
158216 | _ ->
159217 (* Check popover attribute on any element *)
160218 if has_attr "popover" attrs then check_popover_element attrs collector
161219162220let end_element state ~name ~namespace:_ _collector =
163163- match name with "figure" -> state._in_figure <- false | _ -> ()
221221+ match name with
222222+ | "figure" -> state._in_figure <- false
223223+ | "a" -> state.in_a_with_href <- false
224224+ | _ -> ()
164225165226let characters _state _text _collector = ()
166227
···11+(** XHTML content model checker.
22+33+ Validates specific content model rules that the Nu validator checks,
44+ particularly for elements that don't allow text content or specific children. *)
55+66+type state = {
77+ mutable element_stack : string list;
88+}
99+1010+let create () = { element_stack = [] }
1111+1212+let reset state = state.element_stack <- []
1313+1414+(* Elements that don't allow direct text content (only specific child elements) *)
1515+let no_text_elements = [
1616+ "menu"; (* Only li elements *)
1717+ "iframe"; (* In XHTML mode, no content allowed *)
1818+ "figure"; (* Only figcaption and flow content, not bare text *)
1919+]
2020+2121+2222+(* Check if an element is allowed as child of parent *)
2323+let is_child_allowed ~parent ~child =
2424+ match parent with
2525+ | "menu" ->
2626+ (* menu only allows li, script, template *)
2727+ List.mem child ["li"; "script"; "template"]
2828+ | _ -> true
2929+3030+(* Check if text is allowed in element *)
3131+let is_text_allowed element =
3232+ not (List.mem element no_text_elements)
3333+3434+(* Check if data-* attribute has uppercase characters *)
3535+let check_data_attr_case attrs collector =
3636+ List.iter (fun (attr_name, _) ->
3737+ if String.length attr_name > 5 &&
3838+ String.sub attr_name 0 5 = "data-" then
3939+ let suffix = String.sub attr_name 5 (String.length attr_name - 5) in
4040+ if String.exists (fun c -> c >= 'A' && c <= 'Z') suffix then
4141+ Message_collector.add_error collector
4242+ ~message:"\xe2\x80\x9cdata-*\xe2\x80\x9d attributes must not have characters from the range \xe2\x80\x9cA\xe2\x80\x9d\xe2\x80\xa6\xe2\x80\x9cZ\xe2\x80\x9d in the name."
4343+ ~attribute:attr_name
4444+ ()
4545+ ) attrs
4646+4747+let start_element state ~name ~namespace ~attrs collector =
4848+ ignore namespace;
4949+ let name_lower = String.lowercase_ascii name in
5050+5151+ (* Check data-* attributes for uppercase *)
5252+ check_data_attr_case attrs collector;
5353+5454+ (* Check if this element is allowed as child of parent *)
5555+ (match state.element_stack with
5656+ | parent :: _ ->
5757+ let parent_lower = String.lowercase_ascii parent in
5858+ if not (is_child_allowed ~parent:parent_lower ~child:name_lower) then
5959+ Message_collector.add_error collector
6060+ ~message:(Printf.sprintf
6161+ "Element \xe2\x80\x9c%s\xe2\x80\x9d not allowed as child of element \xe2\x80\x9c%s\xe2\x80\x9d in this context. (Suppressing further errors from this subtree.)"
6262+ name_lower parent_lower)
6363+ ~element:name_lower
6464+ ()
6565+ | [] -> ());
6666+6767+ (* Push onto stack *)
6868+ state.element_stack <- name :: state.element_stack
6969+7070+let end_element state ~name:_ ~namespace:_ _collector =
7171+ (* Pop from stack *)
7272+ match state.element_stack with
7373+ | _ :: rest -> state.element_stack <- rest
7474+ | [] -> ()
7575+7676+let characters state text collector =
7777+ (* Check if text is allowed in current element *)
7878+ match state.element_stack with
7979+ | [] -> () (* Root level - ignore *)
8080+ | parent :: _ ->
8181+ let parent_lower = String.lowercase_ascii parent in
8282+ (* Only report non-whitespace text *)
8383+ let trimmed = String.trim text in
8484+ if trimmed <> "" && not (is_text_allowed parent_lower) then
8585+ Message_collector.add_error collector
8686+ ~message:(Printf.sprintf
8787+ "Text not allowed in element \xe2\x80\x9c%s\xe2\x80\x9d in this context."
8888+ parent_lower)
8989+ ~element:parent_lower
9090+ ()
9191+9292+let end_document _state _collector = ()
9393+9494+let checker =
9595+ (module struct
9696+ type nonrec state = state
9797+ let create = create
9898+ let reset = reset
9999+ let start_element = start_element
100100+ let end_element = end_element
101101+ let characters = characters
102102+ let end_document = end_document
103103+ end : Checker.S)
···11+(** XHTML content model checker.
22+33+ Validates specific content model rules for XHTML. *)
44+55+val checker : Checker.t
+85
lib/html5_checker/xhtml_parser.ml
···11+(** XHTML parser using xmlm for proper XML parsing.
22+33+ This module provides XML parsing for XHTML files, which the HTML5 parser
44+ cannot handle correctly (especially self-closing tags on non-void elements). *)
55+66+(** Parse XHTML content using xmlm and return a DOM tree. *)
77+let parse_xhtml content =
88+ let input = Xmlm.make_input (`String (0, content)) in
99+1010+ (* Stack of nodes during parsing *)
1111+ let stack = ref [] in
1212+ let root = Html5rw.Dom.create_document () in
1313+ stack := [root];
1414+1515+ (* Helper to get namespace shorthand *)
1616+ let ns_shorthand ns =
1717+ if ns = "http://www.w3.org/2000/svg" then Some "svg"
1818+ else if ns = "http://www.w3.org/1998/Math/MathML" then Some "mathml"
1919+ else if ns = "http://www.w3.org/1999/xhtml" then None (* HTML namespace *)
2020+ else if ns = "" then None (* No namespace = HTML *)
2121+ else Some ns (* Keep other namespaces as-is *)
2222+ in
2323+2424+ (* Process xmlm signals *)
2525+ let rec process () =
2626+ if Xmlm.eoi input then ()
2727+ else begin
2828+ match Xmlm.input input with
2929+ | `Dtd _ ->
3030+ (* Skip DTD for now *)
3131+ process ()
3232+ | `El_start ((ns, local), attrs) ->
3333+ (* Create element node *)
3434+ let namespace = ns_shorthand ns in
3535+ let attr_list = List.map (fun ((_, aname), aval) -> (aname, aval)) attrs in
3636+ let node = Html5rw.Dom.create_element local ~namespace ~attrs:attr_list () in
3737+ (* Append to current parent *)
3838+ (match !stack with
3939+ | parent :: _ -> Html5rw.Dom.append_child parent node
4040+ | [] -> ());
4141+ (* Push onto stack *)
4242+ stack := node :: !stack;
4343+ process ()
4444+ | `El_end ->
4545+ (* Pop from stack *)
4646+ (match !stack with
4747+ | _ :: rest -> stack := rest
4848+ | [] -> ());
4949+ process ()
5050+ | `Data text ->
5151+ (* Create text node and append to current parent *)
5252+ let trimmed = String.trim text in
5353+ if trimmed <> "" || String.length text > 0 then begin
5454+ let text_node = Html5rw.Dom.create_text text in
5555+ (match !stack with
5656+ | parent :: _ -> Html5rw.Dom.append_child parent text_node
5757+ | [] -> ())
5858+ end;
5959+ process ()
6060+ end
6161+ in
6262+6363+ try
6464+ process ();
6565+ Ok root
6666+ with
6767+ | Xmlm.Error ((line, col), err) ->
6868+ Error (Printf.sprintf "XML parse error at %d:%d: %s" line col (Xmlm.error_message err))
6969+7070+(** Check if a system_id indicates an XHTML file. *)
7171+let is_xhtml_file system_id =
7272+ match system_id with
7373+ | Some path ->
7474+ String.length path > 6 &&
7575+ String.sub path (String.length path - 6) 6 = ".xhtml"
7676+ | None -> false
7777+7878+(** Wrap DOM in an Html5rw.t-compatible structure for the checker. *)
7979+type xhtml_doc = {
8080+ root : Html5rw.Dom.node;
8181+ errors : Html5rw.Error.t list;
8282+}
8383+8484+let xhtml_root doc = doc.root
8585+let xhtml_errors _doc = [] (* XML parser handles errors differently *)
+2-2
test/debug_check.ml
···11let () =
22- let test_file = "validator/tests/xhtml/elements/progress/002-isvalid.xhtml" in
22+ let test_file = "validator/tests/xhtml/elements/menu/menu-containing-text-novalid.xhtml" in
33 let ic = open_in test_file in
44 let html = really_input_string ic (in_channel_length ic) in
55 close_in ic;
···3232 print_endline "=== Errors ===";
3333 List.iter (fun e -> print_endline e.Html5_checker.Message.message) errors;
3434 print_endline "\n=== Expected ===";
3535- print_endline "Element \xe2\x80\x9crect\xe2\x80\x9d is missing required attribute \xe2\x80\x9cheight\xe2\x80\x9d."
3535+ print_endline "Text not allowed in element \xe2\x80\x9cmenu\xe2\x80\x9d in this context."