···737val has_attr : node -> string -> bool
738(** [has_attr node name] returns [true] if the node has attribute [name]. *)
739740+(** {1 Space-Separated Attribute Values}
741+742+ Many HTML attributes contain space-separated lists of values. For example,
743+ the [class] attribute contains CSS class names: [class="header main active"].
744+ These functions parse such attributes into OCaml lists.
745+746+ Per the HTML5 spec, "ASCII whitespace" (space, tab, newline, carriage return,
747+ form feed) is used as the separator.
748+*)
749+750+val split_on_whitespace : string -> string list
751+(** [split_on_whitespace s] splits a string on ASCII whitespace.
752+753+ This implements the HTML5 "split on ASCII whitespace" algorithm.
754+755+ {b Example:}
756+ {[
757+ split_on_whitespace "foo bar\tbaz"
758+ (* Returns: ["foo"; "bar"; "baz"] *)
759+ ]}
760+*)
761+762+val get_attr_list : node -> string -> string list
763+(** [get_attr_list node name] returns a space-separated attribute as a list.
764+765+ Returns an empty list if the attribute doesn't exist.
766+*)
767+768+val get_class_list : node -> string list
769+(** [get_class_list node] returns the class attribute as a list of class names. *)
770+771+val get_rel_list : node -> string list
772+(** [get_rel_list node] returns the rel attribute as a list of link types
773+ (lowercased since they are case-insensitive). *)
774+775+val get_headers_list : node -> string list
776+(** [get_headers_list node] returns the headers attribute as a list of IDs. *)
777+778+val get_itemref_list : node -> string list
779+(** [get_itemref_list node] returns the itemref attribute as a list of IDs. *)
780+781+val get_itemprop_list : node -> string list
782+(** [get_itemprop_list node] returns the itemprop attribute as a list. *)
783+784+val get_itemtype_list : node -> string list
785+(** [get_itemtype_list node] returns the itemtype attribute as a list of URLs. *)
786+787(** {1 Location Helpers} *)
788789val make_location : line:int -> column:int -> ?end_line:int -> ?end_column:int ->
+33
lib/html5rw/dom/dom_node.ml
···147148let has_attr node name = List.mem_assoc name node.attrs
149000000000000000000000000000000000150(* Tree traversal *)
151let rec descendants node =
152 List.concat_map (fun n -> n :: descendants n) node.children
···147148let has_attr node name = List.mem_assoc name node.attrs
149150+(* Whitespace splitting for space-separated attribute values per HTML5 spec.
151+ Handles ASCII whitespace: space, tab, newline, carriage return, form feed *)
152+let split_on_whitespace s =
153+ let is_whitespace = function
154+ | ' ' | '\t' | '\n' | '\r' | '\x0c' -> true
155+ | _ -> false
156+ in
157+ let len = String.length s in
158+ let rec find_start acc i =
159+ if i >= len then List.rev acc
160+ else if is_whitespace s.[i] then find_start acc (i + 1)
161+ else find_end acc i (i + 1)
162+ and find_end acc start i =
163+ if i >= len then List.rev (String.sub s start (i - start) :: acc)
164+ else if is_whitespace s.[i] then find_start (String.sub s start (i - start) :: acc) (i + 1)
165+ else find_end acc start (i + 1)
166+ in
167+ find_start [] 0
168+169+(* Get space-separated attribute as list *)
170+let get_attr_list node name =
171+ match get_attr node name with
172+ | Some s -> split_on_whitespace s
173+ | None -> []
174+175+(* Common space-separated attribute accessors *)
176+let get_class_list node = get_attr_list node "class"
177+let get_rel_list node = List.map String.lowercase_ascii (get_attr_list node "rel")
178+let get_headers_list node = get_attr_list node "headers"
179+let get_itemref_list node = get_attr_list node "itemref"
180+let get_itemprop_list node = get_attr_list node "itemprop"
181+let get_itemtype_list node = get_attr_list node "itemtype"
182+183(* Tree traversal *)
184let rec descendants node =
185 List.concat_map (fun n -> n :: descendants n) node.children
+86
lib/html5rw/dom/dom_node.mli
···740val has_attr : node -> string -> bool
741(** [has_attr node name] returns [true] if the node has attribute [name]. *)
74200000000000000000000000000000000000000000000000000000000000000000000000000000000000000743(** {1 Location Helpers}
744745 Functions to manage source location information for nodes.
···740val has_attr : node -> string -> bool
741(** [has_attr node name] returns [true] if the node has attribute [name]. *)
742743+(** {1 Space-Separated Attribute Values}
744+745+ Many HTML attributes contain space-separated lists of values. For example,
746+ the [class] attribute contains CSS class names: [class="header main active"].
747+ These functions parse such attributes into OCaml lists.
748+749+ Per the HTML5 spec, "ASCII whitespace" (space, tab, newline, carriage return,
750+ form feed) is used as the separator.
751+*)
752+753+val split_on_whitespace : string -> string list
754+(** [split_on_whitespace s] splits a string on ASCII whitespace.
755+756+ This implements the HTML5 "split on ASCII whitespace" algorithm used
757+ for parsing space-separated attribute values.
758+759+ {b Example:}
760+ {[
761+ split_on_whitespace "foo bar\tbaz"
762+ (* Returns: ["foo"; "bar"; "baz"] *)
763+ ]}
764+*)
765+766+val get_attr_list : node -> string -> string list
767+(** [get_attr_list node name] returns a space-separated attribute as a list.
768+769+ Returns an empty list if the attribute doesn't exist.
770+771+ {b Example:}
772+ {[
773+ (* For <div class="foo bar baz"> *)
774+ get_attr_list div "class"
775+ (* Returns: ["foo"; "bar"; "baz"] *)
776+ ]}
777+*)
778+779+val get_class_list : node -> string list
780+(** [get_class_list node] returns the class attribute as a list of class names.
781+782+ Equivalent to [get_attr_list node "class"].
783+784+ {b Example:}
785+ {[
786+ (* For <div class="container main"> *)
787+ get_class_list div
788+ (* Returns: ["container"; "main"] *)
789+ ]}
790+*)
791+792+val get_rel_list : node -> string list
793+(** [get_rel_list node] returns the rel attribute as a list of link types.
794+795+ Link types are lowercased since they are case-insensitive per HTML5 spec.
796+797+ {b Example:}
798+ {[
799+ (* For <link rel="stylesheet preload"> *)
800+ get_rel_list link
801+ (* Returns: ["stylesheet"; "preload"] *)
802+ ]}
803+*)
804+805+val get_headers_list : node -> string list
806+(** [get_headers_list node] returns the headers attribute as a list of IDs.
807+808+ Used on [<td>] and [<th>] elements to associate cells with headers.
809+*)
810+811+val get_itemref_list : node -> string list
812+(** [get_itemref_list node] returns the itemref attribute as a list of IDs.
813+814+ Used for microdata to reference elements by ID.
815+*)
816+817+val get_itemprop_list : node -> string list
818+(** [get_itemprop_list node] returns the itemprop attribute as a list.
819+820+ Used for microdata property names.
821+*)
822+823+val get_itemtype_list : node -> string list
824+(** [get_itemtype_list node] returns the itemtype attribute as a list of URLs.
825+826+ Used for microdata type URLs.
827+*)
828+829(** {1 Location Helpers}
830831 Functions to manage source location information for nodes.
+10-6
lib/html5rw/dom/dom_serialize.ml
···8open Bytesrw
9open Dom_node
1011-(* Void elements that don't have end tags *)
12-let void_elements = [
13- "area"; "base"; "br"; "col"; "embed"; "hr"; "img"; "input";
14- "link"; "meta"; "source"; "track"; "wbr"
15-]
00001617-let is_void name = List.mem name void_elements
1819(* Foreign attribute adjustments for test output *)
20let foreign_attr_adjustments = [
···8open Bytesrw
9open Dom_node
1011+(* Void elements that don't have end tags - O(1) hashtable lookup *)
12+let void_elements_tbl =
13+ let elements = [
14+ "area"; "base"; "br"; "col"; "embed"; "hr"; "img"; "input";
15+ "link"; "meta"; "source"; "track"; "wbr"
16+ ] in
17+ let tbl = Hashtbl.create (List.length elements) in
18+ List.iter (fun e -> Hashtbl.add tbl e ()) elements;
19+ tbl
2021+let is_void name = Hashtbl.mem void_elements_tbl name
2223(* Foreign attribute adjustments for test output *)
24let foreign_attr_adjustments = [
+32-8
lib/html5rw/parser/parser_constants.ml
···80let mathml_text_integration = ["mi"; "mo"; "mn"; "ms"; "mtext"]
81let mathml_text_integration_tbl = make_set mathml_text_integration
8283-(* MathML attribute adjustments *)
84-let mathml_attr_adjustments = [
85- ("definitionurl", "definitionURL")
86-]
008788let adjust_mathml_attrs attrs =
89 List.map (fun (k, v) ->
90- match List.assoc_opt (lowercase k) mathml_attr_adjustments with
91 | Some adjusted_k -> (adjusted_k, v)
92 | None -> (k, v)
93 ) attrs
···95(* SVG HTML integration points *)
96let svg_html_integration = ["foreignObject"; "desc"; "title"]
97let svg_html_integration_tbl = make_set (List.map lowercase svg_html_integration)
0000009899(* SVG tag name adjustments *)
100let svg_tag_adjustments = [
···136 ("radialgradient", "radialGradient");
137 ("textpath", "textPath");
138]
0139140(* SVG attribute adjustments *)
141let svg_attr_adjustments = [
···198 ("ychannelselector", "yChannelSelector");
199 ("zoomandpan", "zoomAndPan");
200]
0201202(* Foreign attribute adjustments *)
203let foreign_attr_adjustments = [
···213 ("xmlns", ("", "xmlns", "http://www.w3.org/2000/xmlns/"));
214 ("xmlns:xlink", ("xmlns", "xlink", "http://www.w3.org/2000/xmlns/"));
215]
0216217(* Quirks mode detection *)
218let quirky_public_matches = [
···293 "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd"
294]
2950000000000296(* Helper functions - O(1) hashtable lookups *)
297let is_void_element name = Hashtbl.mem void_elements_tbl name
298let is_formatting_element name = Hashtbl.mem formatting_elements_tbl name
···303let is_mathml_text_integration name = Hashtbl.mem mathml_text_integration_tbl name
304let is_svg_html_integration name = Hashtbl.mem svg_html_integration_tbl (lowercase name)
305let is_select_scope_exclude name = Hashtbl.mem select_scope_exclude_tbl name
000306307(* Backwards compatibility aliases *)
308let is_void = List.mem
···311let is_heading = List.mem
312313let adjust_svg_tag_name name =
314- match List.assoc_opt (lowercase name) svg_tag_adjustments with
315 | Some adjusted -> adjusted
316 | None -> name
317318let adjust_svg_attrs attrs =
319 List.map (fun (name, value) ->
320 let adjusted_name =
321- match List.assoc_opt (lowercase name) svg_attr_adjustments with
322 | Some n -> n
323 | None -> name
324 in
···327328let adjust_foreign_attrs attrs =
329 List.map (fun (name, value) ->
330- match List.assoc_opt (lowercase name) foreign_attr_adjustments with
331 | Some (prefix, local, _ns) ->
332 if prefix = "" then (local, value)
333 else (prefix ^ ":" ^ local, value)
···80let mathml_text_integration = ["mi"; "mo"; "mn"; "ms"; "mtext"]
81let mathml_text_integration_tbl = make_set mathml_text_integration
8283+(* MathML attribute adjustments - O(1) hashtable lookup *)
84+let mathml_attr_adjustments_tbl =
85+ let adjustments = [("definitionurl", "definitionURL")] in
86+ let tbl = Hashtbl.create 4 in
87+ List.iter (fun (k, v) -> Hashtbl.add tbl k v) adjustments;
88+ tbl
8990let adjust_mathml_attrs attrs =
91 List.map (fun (k, v) ->
92+ match Hashtbl.find_opt mathml_attr_adjustments_tbl (lowercase k) with
93 | Some adjusted_k -> (adjusted_k, v)
94 | None -> (k, v)
95 ) attrs
···97(* SVG HTML integration points *)
98let svg_html_integration = ["foreignObject"; "desc"; "title"]
99let svg_html_integration_tbl = make_set (List.map lowercase svg_html_integration)
100+101+(* Helper to create hashtable from association list for O(1) lookup *)
102+let make_assoc_tbl pairs =
103+ let tbl = Hashtbl.create (List.length pairs) in
104+ List.iter (fun (k, v) -> Hashtbl.add tbl k v) pairs;
105+ tbl
106107(* SVG tag name adjustments *)
108let svg_tag_adjustments = [
···144 ("radialgradient", "radialGradient");
145 ("textpath", "textPath");
146]
147+let svg_tag_adjustments_tbl = make_assoc_tbl svg_tag_adjustments
148149(* SVG attribute adjustments *)
150let svg_attr_adjustments = [
···207 ("ychannelselector", "yChannelSelector");
208 ("zoomandpan", "zoomAndPan");
209]
210+let svg_attr_adjustments_tbl = make_assoc_tbl svg_attr_adjustments
211212(* Foreign attribute adjustments *)
213let foreign_attr_adjustments = [
···223 ("xmlns", ("", "xmlns", "http://www.w3.org/2000/xmlns/"));
224 ("xmlns:xlink", ("xmlns", "xlink", "http://www.w3.org/2000/xmlns/"));
225]
226+let foreign_attr_adjustments_tbl = make_assoc_tbl foreign_attr_adjustments
227228(* Quirks mode detection *)
229let quirky_public_matches = [
···304 "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd"
305]
306307+(* Table-related element sets for tree builder O(1) lookups *)
308+let table_section_elements = ["tbody"; "thead"; "tfoot"]
309+let table_section_elements_tbl = make_set table_section_elements
310+311+let table_cell_elements = ["td"; "th"]
312+let table_cell_elements_tbl = make_set table_cell_elements
313+314+let foster_parenting_elements = ["table"; "tbody"; "tfoot"; "thead"; "tr"]
315+let foster_parenting_elements_tbl = make_set foster_parenting_elements
316+317(* Helper functions - O(1) hashtable lookups *)
318let is_void_element name = Hashtbl.mem void_elements_tbl name
319let is_formatting_element name = Hashtbl.mem formatting_elements_tbl name
···324let is_mathml_text_integration name = Hashtbl.mem mathml_text_integration_tbl name
325let is_svg_html_integration name = Hashtbl.mem svg_html_integration_tbl (lowercase name)
326let is_select_scope_exclude name = Hashtbl.mem select_scope_exclude_tbl name
327+let is_table_section_element name = Hashtbl.mem table_section_elements_tbl name
328+let is_table_cell_element name = Hashtbl.mem table_cell_elements_tbl name
329+let is_foster_parenting_element name = Hashtbl.mem foster_parenting_elements_tbl name
330331(* Backwards compatibility aliases *)
332let is_void = List.mem
···335let is_heading = List.mem
336337let adjust_svg_tag_name name =
338+ match Hashtbl.find_opt svg_tag_adjustments_tbl (lowercase name) with
339 | Some adjusted -> adjusted
340 | None -> name
341342let adjust_svg_attrs attrs =
343 List.map (fun (name, value) ->
344 let adjusted_name =
345+ match Hashtbl.find_opt svg_attr_adjustments_tbl (lowercase name) with
346 | Some n -> n
347 | None -> name
348 in
···351352let adjust_foreign_attrs attrs =
353 List.map (fun (name, value) ->
354+ match Hashtbl.find_opt foreign_attr_adjustments_tbl (lowercase name) with
355 | Some (prefix, local, _ns) ->
356 if prefix = "" then (local, value)
357 else (prefix ^ ":" ^ local, value)
+11-11
lib/html5rw/parser/parser_tree_builder.ml
···91 (* Set initial mode based on context *)
92 t.mode <- (
93 if name = "html" then Parser_insertion_mode.Before_head
94- else if List.mem name ["tbody"; "thead"; "tfoot"] && (ns = None || ns = Some "html") then
95 Parser_insertion_mode.In_table_body
96 else if name = "tr" && (ns = None || ns = Some "html") then
97 Parser_insertion_mode.In_row
98- else if List.mem name ["td"; "th"] && (ns = None || ns = Some "html") then
99 Parser_insertion_mode.In_cell
100 else if name = "caption" && (ns = None || ns = Some "html") then
101 Parser_insertion_mode.In_caption
···160 match current_node t with
161 | None -> (t.document, None)
162 | Some target ->
163- if t.foster_parenting && List.mem target.Dom.name ["table"; "tbody"; "tfoot"; "thead"; "tr"] then begin
164 (* Foster parenting per WHATWG spec *)
165 (* Step 1: Find last (most recent) template and table in stack *)
166 (* Note: index 0 = top of stack = most recently added *)
···599 | Some p -> Dom.remove_child p !last_node
600 | None -> ());
601 (* Check if we need foster parenting *)
602- if t.foster_parenting && List.mem ca.Dom.name ["table"; "tbody"; "tfoot"; "thead"; "tr"] then begin
603 (* Find table and insert before it *)
604 let rec find_table = function
605 | [] -> None
···698 end;
699 if t.mode <> Parser_insertion_mode.In_select_in_table then
700 t.mode <- Parser_insertion_mode.In_select
701- end else if List.mem name ["td"; "th"] && not is_last then
702 t.mode <- Parser_insertion_mode.In_cell
703 else if name = "tr" then
704 t.mode <- Parser_insertion_mode.In_row
705- else if List.mem name ["tbody"; "thead"; "tfoot"] then
706 t.mode <- Parser_insertion_mode.In_table_body
707 else if name = "caption" then
708 t.mode <- Parser_insertion_mode.In_caption
···14731474and process_in_table t token =
1475 match token with
1476- | Token.Character _ when (match current_node t with Some n -> List.mem n.Dom.name ["table"; "tbody"; "tfoot"; "thead"; "tr"] | None -> false) ->
1477 t.pending_table_chars <- [];
1478 t.original_mode <- Some t.mode;
1479 t.mode <- Parser_insertion_mode.In_table_text;
···17981799and process_in_cell t token =
1800 match token with
1801- | Token.Tag { kind = Token.End; name; _ } when List.mem name ["td"; "th"] ->
1802 if not (has_element_in_table_scope t name) then
1803 parse_error t "unexpected-end-tag"
1804 else begin
···1822 when List.mem name ["body"; "caption"; "col"; "colgroup"; "html"] ->
1823 parse_error t "unexpected-end-tag"
1824 | Token.Tag { kind = Token.End; name; _ }
1825- when List.mem name ["table"; "tbody"; "tfoot"; "thead"; "tr"] ->
1826 if not (has_element_in_table_scope t name) then
1827 parse_error t "unexpected-end-tag"
1828 else begin
···1835and close_cell t =
1836 generate_implied_end_tags t ();
1837 (match current_node t with
1838- | Some n when not (List.mem n.Dom.name ["td"; "th"] && is_in_html_namespace n) -> parse_error t "end-tag-too-early"
1839 | _ -> ());
1840 pop_until_html_one_of t ["td"; "th"];
1841 clear_active_formatting_to_marker t;
···2050 t.template_modes <- Parser_insertion_mode.In_table_body :: t.template_modes;
2051 t.mode <- Parser_insertion_mode.In_table_body;
2052 process_token t token
2053- | Token.Tag { kind = Token.Start; name; _ } when List.mem name ["td"; "th"] ->
2054 t.template_modes <- (match t.template_modes with _ :: rest -> rest | [] -> []);
2055 t.template_modes <- Parser_insertion_mode.In_row :: t.template_modes;
2056 t.mode <- Parser_insertion_mode.In_row;
···91 (* Set initial mode based on context *)
92 t.mode <- (
93 if name = "html" then Parser_insertion_mode.Before_head
94+ else if Parser_constants.is_table_section_element name && (ns = None || ns = Some "html") then
95 Parser_insertion_mode.In_table_body
96 else if name = "tr" && (ns = None || ns = Some "html") then
97 Parser_insertion_mode.In_row
98+ else if Parser_constants.is_table_cell_element name && (ns = None || ns = Some "html") then
99 Parser_insertion_mode.In_cell
100 else if name = "caption" && (ns = None || ns = Some "html") then
101 Parser_insertion_mode.In_caption
···160 match current_node t with
161 | None -> (t.document, None)
162 | Some target ->
163+ if t.foster_parenting && Parser_constants.is_foster_parenting_element target.Dom.name then begin
164 (* Foster parenting per WHATWG spec *)
165 (* Step 1: Find last (most recent) template and table in stack *)
166 (* Note: index 0 = top of stack = most recently added *)
···599 | Some p -> Dom.remove_child p !last_node
600 | None -> ());
601 (* Check if we need foster parenting *)
602+ if t.foster_parenting && Parser_constants.is_foster_parenting_element ca.Dom.name then begin
603 (* Find table and insert before it *)
604 let rec find_table = function
605 | [] -> None
···698 end;
699 if t.mode <> Parser_insertion_mode.In_select_in_table then
700 t.mode <- Parser_insertion_mode.In_select
701+ end else if Parser_constants.is_table_cell_element name && not is_last then
702 t.mode <- Parser_insertion_mode.In_cell
703 else if name = "tr" then
704 t.mode <- Parser_insertion_mode.In_row
705+ else if Parser_constants.is_table_section_element name then
706 t.mode <- Parser_insertion_mode.In_table_body
707 else if name = "caption" then
708 t.mode <- Parser_insertion_mode.In_caption
···14731474and process_in_table t token =
1475 match token with
1476+ | Token.Character _ when (match current_node t with Some n -> Parser_constants.is_foster_parenting_element n.Dom.name | None -> false) ->
1477 t.pending_table_chars <- [];
1478 t.original_mode <- Some t.mode;
1479 t.mode <- Parser_insertion_mode.In_table_text;
···17981799and process_in_cell t token =
1800 match token with
1801+ | Token.Tag { kind = Token.End; name; _ } when Parser_constants.is_table_cell_element name ->
1802 if not (has_element_in_table_scope t name) then
1803 parse_error t "unexpected-end-tag"
1804 else begin
···1822 when List.mem name ["body"; "caption"; "col"; "colgroup"; "html"] ->
1823 parse_error t "unexpected-end-tag"
1824 | Token.Tag { kind = Token.End; name; _ }
1825+ when Parser_constants.is_foster_parenting_element name ->
1826 if not (has_element_in_table_scope t name) then
1827 parse_error t "unexpected-end-tag"
1828 else begin
···1835and close_cell t =
1836 generate_implied_end_tags t ();
1837 (match current_node t with
1838+ | Some n when not (Parser_constants.is_table_cell_element n.Dom.name && is_in_html_namespace n) -> parse_error t "end-tag-too-early"
1839 | _ -> ());
1840 pop_until_html_one_of t ["td"; "th"];
1841 clear_active_formatting_to_marker t;
···2050 t.template_modes <- Parser_insertion_mode.In_table_body :: t.template_modes;
2051 t.mode <- Parser_insertion_mode.In_table_body;
2052 process_token t token
2053+ | Token.Tag { kind = Token.Start; name; _ } when Parser_constants.is_table_cell_element name ->
2054 t.template_modes <- (match t.template_modes with _ :: rest -> rest | [] -> []);
2055 t.template_modes <- Parser_insertion_mode.In_row :: t.template_modes;
2056 t.mode <- Parser_insertion_mode.In_row;
···1943 error t (Printf.sprintf "surrogate-character-reference:%04x" code);
1944 replacement_char
1945 end else if (code >= 0xFDD0 && code <= 0xFDEF) ||
1946+ (* Noncharacters end in 0xFFFE or 0xFFFF in each plane (0-16).
1947+ O(1) bitwise check instead of O(n) list membership. *)
1948+ (let low16 = code land 0xFFFF in low16 = 0xFFFE || low16 = 0xFFFF) then begin
0001949 error t (Printf.sprintf "noncharacter-character-reference:%05x" code);
1950 Entities.Numeric_ref.codepoint_to_utf8 code
1951 end else if (code >= 0x01 && code <= 0x08) || code = 0x0B ||
+3-21
lib/htmlrw_check/datatype/datatype.ml
···42 else String.sub s start (end_pos - start + 1)
4344(** Split string on HTML whitespace characters (space, tab, LF, FF, CR).
45- Filters out empty tokens. Used for space-separated attribute values. *)
46-let split_on_whitespace s =
47- let len = String.length s in
48- let rec split acc start i =
49- if i >= len then
50- if i > start then
51- List.rev ((String.sub s start (i - start)) :: acc)
52- else
53- List.rev acc
54- else if is_whitespace s.[i] then
55- let acc' =
56- if i > start then
57- (String.sub s start (i - start)) :: acc
58- else
59- acc
60- in
61- split acc' (i + 1) (i + 1)
62- else
63- split acc start (i + 1)
64- in
65- split [] 0 0
6667(** Factory for creating enum-based validators.
68 Many HTML attributes accept a fixed set of keyword values.
···42 else String.sub s start (end_pos - start + 1)
4344(** Split string on HTML whitespace characters (space, tab, LF, FF, CR).
45+ Filters out empty tokens. Used for space-separated attribute values.
46+ Delegates to the core library implementation. *)
47+let split_on_whitespace = Html5rw.Dom.split_on_whitespace
0000000000000000004849(** Factory for creating enum-based validators.
50 Many HTML attributes accept a fixed set of keyword values.
+2-2
lib/htmlrw_check/element/attr.ml
···872let get_rel attrs =
873 List.find_map (function `Rel s -> Some s | _ -> None) attrs
874875-(** Get rel attribute as list of link types (space-separated) *)
876let get_rel_list attrs =
877 match get_rel attrs with
878- | Some s -> Datatype.split_on_whitespace s
879 | None -> []
880881(** Get headers attribute as raw string *)
···872let get_rel attrs =
873 List.find_map (function `Rel s -> Some s | _ -> None) attrs
874875+(** Get rel attribute as list of link types (space-separated, lowercased per HTML5 spec) *)
876let get_rel_list attrs =
877 match get_rel attrs with
878+ | Some s -> List.map String.lowercase_ascii (Datatype.split_on_whitespace s)
879 | None -> []
880881(** Get headers attribute as raw string *)
+9
lib/htmlrw_check/element/element.ml
···104let get_all_aria elem = Attr.get_all_aria elem.attrs
105let get_all_data elem = Attr.get_all_data elem.attrs
106000000000107(** {1 Category Checks} *)
108109(** Check if this is a void element *)
···104let get_all_aria elem = Attr.get_all_aria elem.attrs
105let get_all_data elem = Attr.get_all_data elem.attrs
106107+(** Space-separated list getters *)
108+let get_class_list elem = Attr.get_class_list elem.attrs
109+let get_rel_list elem = Attr.get_rel_list elem.attrs
110+let get_headers_list elem = Attr.get_headers_list elem.attrs
111+let get_itemref_list elem = Attr.get_itemref_list elem.attrs
112+let get_itemprop_list elem = Attr.get_itemprop_list elem.attrs
113+let get_itemtype_list elem = Attr.get_itemtype_list elem.attrs
114+let get_aria_list name elem = Attr.get_aria_list name elem.attrs
115+116(** {1 Category Checks} *)
117118(** Check if this is a void element *)
+26
lib/htmlrw_check/element/element.mli
···162val get_all_data : t -> (string * string) list
163(** [get_all_data elem] extracts all data-* attributes. *)
16400000000000000000000000000165(** {1 Raw Attribute Fallback} *)
166167val get_raw_attr : string -> t -> string option
···162val get_all_data : t -> (string * string) list
163(** [get_all_data elem] extracts all data-* attributes. *)
164165+(** {1 Space-Separated List Accessors}
166+167+ These functions return attribute values as parsed lists, splitting on
168+ whitespace per HTML5 spec. *)
169+170+val get_class_list : t -> string list
171+(** [get_class_list elem] returns class names as a list. *)
172+173+val get_rel_list : t -> string list
174+(** [get_rel_list elem] returns link types as a list. *)
175+176+val get_headers_list : t -> string list
177+(** [get_headers_list elem] returns header IDs as a list (for td/th). *)
178+179+val get_itemref_list : t -> string list
180+(** [get_itemref_list elem] returns itemref IDs as a list. *)
181+182+val get_itemprop_list : t -> string list
183+(** [get_itemprop_list elem] returns itemprop names as a list. *)
184+185+val get_itemtype_list : t -> string list
186+(** [get_itemtype_list elem] returns itemtype URLs as a list. *)
187+188+val get_aria_list : string -> t -> string list
189+(** [get_aria_list name elem] returns space-separated ARIA values as a list. *)
190+191(** {1 Raw Attribute Fallback} *)
192193val get_raw_attr : string -> t -> string option
+15-4
lib/htmlrw_check/message_collector.ml
···3type t = {
4 mutable messages : Message.t list;
5 mutable current_location : Message.location option;
06}
78-let create () = { messages = []; current_location = None }
910let set_current_location t location = t.current_location <- location
11let clear_current_location t = t.current_location <- None
12let get_current_location t = t.current_location
1314-let add t msg = t.messages <- msg :: t.messages
001516(** Add a message from a typed conformance error code *)
17let add_typed t ?location ?element ?attribute ?extract error_code =
···23 let msg = Message.of_conformance_error ?location:loc ?element ?attribute ?extract error_code in
24 add t msg
2526-let messages t = List.rev t.messages
0000002728let errors t =
29 List.filter (fun msg -> msg.Message.severity = Message.Error) (messages t)
···45 if msg.Message.severity = Message.Error then acc + 1 else acc)
46 0 t.messages
4748-let clear t = t.messages <- []
00
···3type t = {
4 mutable messages : Message.t list;
5 mutable current_location : Message.location option;
6+ mutable cached_reversed : Message.t list option; (* Cache for O(1) repeated access *)
7}
89+let create () = { messages = []; current_location = None; cached_reversed = None }
1011let set_current_location t location = t.current_location <- location
12let clear_current_location t = t.current_location <- None
13let get_current_location t = t.current_location
1415+let add t msg =
16+ t.messages <- msg :: t.messages;
17+ t.cached_reversed <- None (* Invalidate cache *)
1819(** Add a message from a typed conformance error code *)
20let add_typed t ?location ?element ?attribute ?extract error_code =
···26 let msg = Message.of_conformance_error ?location:loc ?element ?attribute ?extract error_code in
27 add t msg
2829+let messages t =
30+ match t.cached_reversed with
31+ | Some cached -> cached
32+ | None ->
33+ let reversed = List.rev t.messages in
34+ t.cached_reversed <- Some reversed;
35+ reversed
3637let errors t =
38 List.filter (fun msg -> msg.Message.severity = Message.Error) (messages t)
···54 if msg.Message.severity = Message.Error then acc + 1 else acc)
55 0 t.messages
5657+let clear t =
58+ t.messages <- [];
59+ t.cached_reversed <- None
+4-8
lib/htmlrw_check/semantic/id_checker.ml
···5556(** Attributes that reference a single ID - O(1) lookup. *)
57let single_id_ref_attrs =
58- let tbl = Hashtbl.create 8 in
59- List.iter (fun a -> Hashtbl.add tbl a ()) [
60 "for"; (* label *)
61 "form"; (* form-associated elements *)
62 "list"; (* input *)
···64 "popovertarget"; (* button - references popover element *)
65 "commandfor"; (* button - references element to control *)
66 "anchor"; (* popover - references anchor element *)
67- ];
68- tbl
6970let is_single_id_ref_attr name = Hashtbl.mem single_id_ref_attrs name
7172(** Attributes that reference multiple IDs (space-separated) - O(1) lookup. *)
73let multi_id_ref_attrs =
74- let tbl = Hashtbl.create 8 in
75- List.iter (fun a -> Hashtbl.add tbl a ()) [
76 "headers"; (* td, th *)
77 "aria-labelledby";
78 "aria-describedby";
···80 "aria-flowto";
81 "aria-owns";
82 "itemref";
83- ];
84- tbl
8586let is_multi_id_ref_attr name = Hashtbl.mem multi_id_ref_attrs name
87
···5556(** Attributes that reference a single ID - O(1) lookup. *)
57let single_id_ref_attrs =
58+ Attr_utils.hashtbl_of_list [
059 "for"; (* label *)
60 "form"; (* form-associated elements *)
61 "list"; (* input *)
···63 "popovertarget"; (* button - references popover element *)
64 "commandfor"; (* button - references element to control *)
65 "anchor"; (* popover - references anchor element *)
66+ ]
06768let is_single_id_ref_attr name = Hashtbl.mem single_id_ref_attrs name
6970(** Attributes that reference multiple IDs (space-separated) - O(1) lookup. *)
71let multi_id_ref_attrs =
72+ Attr_utils.hashtbl_of_list [
073 "headers"; (* td, th *)
74 "aria-labelledby";
75 "aria-describedby";
···77 "aria-flowto";
78 "aria-owns";
79 "itemref";
80+ ]
08182let is_multi_id_ref_attr name = Hashtbl.mem multi_id_ref_attrs name
83
···1819(* Elements whose text content we skip for language detection - O(1) lookup *)
20let skip_elements =
21- let tbl = Hashtbl.create 20 in
22- List.iter (fun e -> Hashtbl.add tbl e ()) [
23 "a"; "button"; "details"; "figcaption"; "form"; "li"; "nav";
24 "pre"; "script"; "select"; "span"; "style"; "summary";
25 "td"; "textarea"; "th"; "tr"
26- ];
27- tbl
2829let is_skip_element name = Hashtbl.mem skip_elements name
3031(* RTL languages - O(1) lookup *)
32let rtl_langs =
33- let tbl = Hashtbl.create 16 in
34- List.iter (fun l -> Hashtbl.add tbl l ()) ["ar"; "azb"; "ckb"; "dv"; "fa"; "he"; "pnb"; "ps"; "sd"; "ug"; "ur"; "iw"];
35- tbl
3637let is_rtl_lang lang = Hashtbl.mem rtl_langs lang
38
···1819(* Elements whose text content we skip for language detection - O(1) lookup *)
20let skip_elements =
21+ Attr_utils.hashtbl_of_list [
022 "a"; "button"; "details"; "figcaption"; "form"; "li"; "nav";
23 "pre"; "script"; "select"; "span"; "style"; "summary";
24 "td"; "textarea"; "th"; "tr"
25+ ]
02627let is_skip_element name = Hashtbl.mem skip_elements name
2829(* RTL languages - O(1) lookup *)
30let rtl_langs =
31+ Attr_utils.hashtbl_of_list ["ar"; "azb"; "ckb"; "dv"; "fa"; "he"; "pnb"; "ps"; "sd"; "ug"; "ur"; "iw"]
003233let is_rtl_lang lang = Hashtbl.mem rtl_langs lang
34
+6-3
lib/htmlrw_check/semantic/nesting_checker.ml
···155 let map_num = special_ancestor_number "map" in
156 1 lsl map_num
157158-(** Transparent elements - inherit content model from parent *)
159-let transparent_elements = ["a"; "canvas"; "video"; "audio"; "object"; "ins"; "del"; "map"]
000160161(** Stack node representing an element's context. *)
162type stack_node = {
···334 in
335336 (* Push onto stack *)
337- let is_transparent = List.mem name transparent_elements in
338 let node = { ancestor_mask = state.ancestor_mask; name; is_transparent } in
339 state.stack <- node :: state.stack;
340 state.ancestor_mask <- new_mask
···155 let map_num = special_ancestor_number "map" in
156 1 lsl map_num
157158+(** Transparent elements - inherit content model from parent. O(1) hashtable lookup. *)
159+let transparent_elements_tbl =
160+ Attr_utils.hashtbl_of_list ["a"; "canvas"; "video"; "audio"; "object"; "ins"; "del"; "map"]
161+162+let is_transparent_element name = Hashtbl.mem transparent_elements_tbl name
163164(** Stack node representing an element's context. *)
165type stack_node = {
···337 in
338339 (* Push onto stack *)
340+ let is_transparent = is_transparent_element name in
341 let node = { ancestor_mask = state.ancestor_mask; name; is_transparent } in
342 state.stack <- node :: state.stack;
343 state.ancestor_mask <- new_mask
+14-19
lib/htmlrw_check/specialized/aria_checker.ml
···9 specification. Abstract roles are included but should not be used
10 in HTML content. *)
11let valid_aria_roles =
12- let roles = [
13 (* Document structure roles *)
14 (* Note: "directory" is deprecated in WAI-ARIA 1.2, use "list" instead *)
15 "article"; "associationlist"; "associationlistitemkey";
···4344 (* Additional roles *)
45 "application"; "columnheader"; "rowheader";
46- ] in
47- let tbl = Hashtbl.create (List.length roles) in
48- List.iter (fun role -> Hashtbl.add tbl role ()) roles;
49- tbl
5051(** Roles that cannot have accessible names.
5253 These roles must not have aria-label or aria-labelledby attributes. *)
54let roles_which_cannot_be_named =
55- let roles = [
56 "caption"; "code"; "deletion"; "emphasis"; "generic"; "insertion";
57 "mark"; "none"; "paragraph"; "presentation"; "strong"; "subscript";
58 "suggestion"; "superscript"
59- ] in
60- let tbl = Hashtbl.create (List.length roles) in
61- List.iter (fun role -> Hashtbl.add tbl role ()) roles;
62- tbl
6364(** Elements whose implicit role is 'generic' and cannot have aria-label unless
65- they have an explicit role that allows naming. *)
66-let elements_with_generic_role = [
67- "a"; "abbr"; "address"; "b"; "bdi"; "bdo"; "br"; "caption"; "cite"; "code";
68- "colgroup"; "data"; "del"; "dfn"; "div"; "em"; "figcaption"; "hgroup"; "i";
69- "ins"; "kbd"; "legend"; "mark"; "p"; "pre"; "q"; "rp"; "rt"; "ruby"; "s";
70- "samp"; "small"; "span"; "strong"; "sub"; "sup"; "time"; "title"; "u"; "var";
71- "wbr"
72-]
07374(** Check if element name is a custom element (contains hyphen). *)
75let is_custom_element name =
···90 if is_custom_element element_name then false
91 else
92 (* No implicit role - element has generic role unless it's interactive *)
93- not (List.mem element_name elements_with_generic_role)
9495(** Map from descendant role to set of required ancestor roles. *)
96let required_role_ancestor_by_descendant : (string, string list) Hashtbl.t =
···9 specification. Abstract roles are included but should not be used
10 in HTML content. *)
11let valid_aria_roles =
12+ Attr_utils.hashtbl_of_list [
13 (* Document structure roles *)
14 (* Note: "directory" is deprecated in WAI-ARIA 1.2, use "list" instead *)
15 "article"; "associationlist"; "associationlistitemkey";
···4344 (* Additional roles *)
45 "application"; "columnheader"; "rowheader";
46+ ]
0004748(** Roles that cannot have accessible names.
4950 These roles must not have aria-label or aria-labelledby attributes. *)
51let roles_which_cannot_be_named =
52+ Attr_utils.hashtbl_of_list [
53 "caption"; "code"; "deletion"; "emphasis"; "generic"; "insertion";
54 "mark"; "none"; "paragraph"; "presentation"; "strong"; "subscript";
55 "suggestion"; "superscript"
56+ ]
0005758(** Elements whose implicit role is 'generic' and cannot have aria-label unless
59+ they have an explicit role that allows naming. O(1) lookup. *)
60+let elements_with_generic_role =
61+ Attr_utils.hashtbl_of_list [
62+ "a"; "abbr"; "address"; "b"; "bdi"; "bdo"; "br"; "caption"; "cite"; "code";
63+ "colgroup"; "data"; "del"; "dfn"; "div"; "em"; "figcaption"; "hgroup"; "i";
64+ "ins"; "kbd"; "legend"; "mark"; "p"; "pre"; "q"; "rp"; "rt"; "ruby"; "s";
65+ "samp"; "small"; "span"; "strong"; "sub"; "sup"; "time"; "title"; "u"; "var";
66+ "wbr"
67+ ]
6869(** Check if element name is a custom element (contains hyphen). *)
70let is_custom_element name =
···85 if is_custom_element element_name then false
86 else
87 (* No implicit role - element has generic role unless it's interactive *)
88+ not (Hashtbl.mem elements_with_generic_role element_name)
8990(** Map from descendant role to set of required ancestor roles. *)
91let required_role_ancestor_by_descendant : (string, string list) Hashtbl.t =
···135 if name_lower = "link" then begin
136 let has_imagesizes = Attr_utils.has_attr "imagesizes" attrs in
137 let has_imagesrcset = Attr_utils.has_attr "imagesrcset" attrs in
138- let rel_value = Attr_utils.get_attr "rel" attrs in
139 let as_value = Attr_utils.get_attr "as" attrs in
140141 (* imagesizes requires imagesrcset *)
···155 (* as attribute requires rel="preload" or rel="modulepreload" *)
156 (match as_value with
157 | Some _ ->
158- let rel_is_preload = match rel_value with
159- | Some v ->
160- let rel_lower = String.lowercase_ascii (String.trim v) in
161- String.length rel_lower > 0 &&
162- (List.mem "preload" (String.split_on_char ' ' rel_lower) ||
163- List.mem "modulepreload" (String.split_on_char ' ' rel_lower))
164- | None -> false
165- in
166 if not rel_is_preload then
167 Message_collector.add_typed collector (`Link `As_requires_preload)
168 | None -> ())
···135 if name_lower = "link" then begin
136 let has_imagesizes = Attr_utils.has_attr "imagesizes" attrs in
137 let has_imagesrcset = Attr_utils.has_attr "imagesrcset" attrs in
0138 let as_value = Attr_utils.get_attr "as" attrs in
139140 (* imagesizes requires imagesrcset *)
···154 (* as attribute requires rel="preload" or rel="modulepreload" *)
155 (match as_value with
156 | Some _ ->
157+ let rel_types = Element.get_rel_list element in
158+ let rel_is_preload = List.mem "preload" rel_types || List.mem "modulepreload" rel_types in
000000159 if not rel_is_preload then
160 Message_collector.add_typed collector (`Link `As_requires_preload)
161 | None -> ())
+1-3
lib/htmlrw_check/specialized/label_checker.ml
···45(** Labelable elements that label can reference - O(1) hashtable lookup *)
6let labelable_elements =
7- let tbl = Hashtbl.create 8 in
8- List.iter (fun e -> Hashtbl.add tbl e ()) ["button"; "input"; "meter"; "output"; "progress"; "select"; "textarea"];
9- tbl
1011let is_labelable name = Hashtbl.mem labelable_elements name
12
···45(** Labelable elements that label can reference - O(1) hashtable lookup *)
6let labelable_elements =
7+ Attr_utils.hashtbl_of_list ["button"; "input"; "meter"; "output"; "progress"; "select"; "textarea"]
0089let is_labelable name = Hashtbl.mem labelable_elements name
10