···33(* Use Astring for string operations *)
44let lowercase = Astring.String.Ascii.lowercase
(** [make_set elements] builds a hashtable whose keys are the members of
    [elements] (values are [()]), so membership can be tested with
    [Hashtbl.mem].

    [Hashtbl.replace] is used instead of [Hashtbl.add] so that an element
    occurring several times in [elements] produces a single binding rather
    than a stack of shadowed ones; [Hashtbl.length] therefore equals the
    number of distinct elements. *)
let make_set elements =
  let tbl = Hashtbl.create (List.length elements) in
  List.iter (fun e -> Hashtbl.replace tbl e ()) elements;
  tbl
1111+612(* Void elements - no end tag allowed *)
713let void_elements = [
814 "area"; "base"; "br"; "col"; "embed"; "hr"; "img"; "input";
915 "link"; "meta"; "source"; "track"; "wbr"
1016]
1717+let void_elements_tbl = make_set void_elements
11181219(* Raw text elements - content is raw text *)
1320let raw_text_elements = ["script"; "style"]
···2027 "a"; "b"; "big"; "code"; "em"; "font"; "i"; "nobr"; "s"; "small";
2128 "strike"; "strong"; "tt"; "u"
2229]
3030+let formatting_elements_tbl = make_set formatting_elements
23312432(* Special elements *)
2533let special_elements = [
···3543 "tbody"; "td"; "template"; "textarea"; "tfoot"; "th"; "thead"; "title";
3644 "tr"; "track"; "ul"; "wbr"; "xmp"
3745]
4646+let special_elements_tbl = make_set special_elements
38473948(* Heading elements *)
4049let heading_elements = ["h1"; "h2"; "h3"; "h4"; "h5"; "h6"]
5050+let heading_elements_tbl = make_set heading_elements
41514252(* Implied end tag elements *)
4353let implied_end_tags = [
4454 "dd"; "dt"; "li"; "optgroup"; "option"; "p"; "rb"; "rp"; "rt"; "rtc"
4555]
5656+let implied_end_tags_tbl = make_set implied_end_tags
46574758(* Thoroughly implied end tags *)
4859let thoroughly_implied_end_tags = [
4960 "caption"; "colgroup"; "dd"; "dt"; "li"; "optgroup"; "option"; "p";
5061 "rb"; "rp"; "rt"; "rtc"; "tbody"; "td"; "tfoot"; "th"; "thead"; "tr"
5162]
6363+let thoroughly_implied_end_tags_tbl = make_set thoroughly_implied_end_tags
52645365(* Scope elements for various scope checks *)
5466let default_scope = [
···6274let table_scope = ["html"; "table"; "template"]
63756476let select_scope_exclude = ["optgroup"; "option"]
7777+let select_scope_exclude_tbl = make_set select_scope_exclude
65786679(* MathML text integration points *)
6780let mathml_text_integration = ["mi"; "mo"; "mn"; "ms"; "mtext"]
8181+let mathml_text_integration_tbl = make_set mathml_text_integration
68826983(* MathML attribute adjustments *)
7084let mathml_attr_adjustments = [
···80948195(* SVG HTML integration points *)
8296let svg_html_integration = ["foreignObject"; "desc"; "title"]
9797+let svg_html_integration_tbl = make_set (List.map lowercase svg_html_integration)
83988499(* SVG tag name adjustments *)
85100let svg_tag_adjustments = [
···278293 "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd"
279294]
280295281281-(* Helper functions *)
(* Membership predicates. Each one tests a name against the corresponding
   precomputed [_tbl] hashtable with [Hashtbl.mem]. *)
let is_void_element = Hashtbl.mem void_elements_tbl
let is_formatting_element = Hashtbl.mem formatting_elements_tbl
let is_special_element = Hashtbl.mem special_elements_tbl
let is_heading_element = Hashtbl.mem heading_elements_tbl
let is_implied_end_tag = Hashtbl.mem implied_end_tags_tbl
let is_thoroughly_implied_end_tag = Hashtbl.mem thoroughly_implied_end_tags_tbl
let is_mathml_text_integration = Hashtbl.mem mathml_text_integration_tbl

(* The keys of [svg_html_integration_tbl] are the lowercased entries of
   [svg_html_integration], so the queried name is lowercased as well; the
   match therefore ignores ASCII case (e.g. both "foreignObject" and
   "foreignobject" are accepted). *)
let is_svg_html_integration name =
  name |> lowercase |> Hashtbl.mem svg_html_integration_tbl

let is_select_scope_exclude = Hashtbl.mem select_scope_exclude_tbl
306306+307307+(* Backwards compatibility aliases *)
282308let is_void = List.mem
283309let is_formatting = List.mem
(* Backwards-compatible name for [is_special_element]. Looks the name up in
   [special_elements_tbl], the hashtable built from [special_elements],
   instead of scanning the list, so both predicates share one lookup path. *)
let is_special name = Hashtbl.mem special_elements_tbl name
+14-14
lib/html5rw/parser/parser_tree_builder.ml
···294294let is_html_integration_point node =
295295 (* SVG foreignObject, desc, and title are always HTML integration points *)
296296 if node.Dom.namespace = Some "svg" &&
297297- List.mem node.Dom.name Parser_constants.svg_html_integration then true
297297+ Parser_constants.is_svg_html_integration node.Dom.name then true
298298 (* annotation-xml is an HTML integration point only with specific encoding values *)
299299 else if node.Dom.namespace = Some "mathml" && node.Dom.name = "annotation-xml" then
300300 match List.assoc_opt "encoding" node.Dom.attrs with
···307307(* Check if element is a MathML text integration point *)
308308let is_mathml_text_integration_point node =
309309 node.Dom.namespace = Some "mathml" &&
310310- List.mem node.Dom.name ["mi"; "mo"; "mn"; "ms"; "mtext"]
310310+ Parser_constants.is_mathml_text_integration node.Dom.name
311311312312(* Scope checks - integration points also terminate scope (except for table scope) *)
313313(* Per WHATWG spec, scope checks only consider HTML namespace elements for the target names *)
···341341 | [] -> false
342342 | n :: rest ->
343343 if n.Dom.name = name then true
344344- else if not (List.mem n.Dom.name Parser_constants.select_scope_exclude) then false
344344+ else if not (Parser_constants.is_select_scope_exclude n.Dom.name) then false
345345 else check rest
346346 in
347347 check t.open_elements
···350350let generate_implied_end_tags t ?except () =
351351 let rec loop () =
352352 match current_node t with
353353- | Some n when List.mem n.Dom.name Parser_constants.implied_end_tags ->
353353+ | Some n when Parser_constants.is_implied_end_tag n.Dom.name ->
354354 (match except with
355355 | Some ex when n.Dom.name = ex -> ()
356356 | _ -> pop_current t; loop ())
···361361let generate_all_implied_end_tags t =
362362 let rec loop () =
363363 match current_node t with
364364- | Some n when List.mem n.Dom.name Parser_constants.thoroughly_implied_end_tags ->
364364+ | Some n when Parser_constants.is_thoroughly_implied_end_tag n.Dom.name ->
365365 pop_current t; loop ()
366366 | _ -> ()
367367 in
···11051105 when List.mem name ["address"; "article"; "aside"; "blockquote"; "center"; "details"; "dialog"; "dir"; "div"; "dl"; "fieldset"; "figcaption"; "figure"; "footer"; "header"; "hgroup"; "main"; "menu"; "nav"; "ol"; "p"; "search"; "section"; "summary"; "ul"] ->
11061106 if has_element_in_button_scope t "p" then close_p_element t;
11071107 ignore (insert_element t name ~push:true attrs)
11081108- | Token.Tag { kind = Token.Start; name; attrs; _ } when List.mem name Parser_constants.heading_elements ->
11081108+ | Token.Tag { kind = Token.Start; name; attrs; _ } when Parser_constants.is_heading_element name ->
11091109 if has_element_in_button_scope t "p" then close_p_element t;
11101110 (match current_node t with
11111111- | Some n when List.mem n.Dom.name Parser_constants.heading_elements ->
11111111+ | Some n when Parser_constants.is_heading_element n.Dom.name ->
11121112 parse_error t "unexpected-start-tag";
11131113 pop_current t
11141114 | _ -> ());
···12431243 | _ -> ());
12441244 pop_until_tag t name
12451245 end
12461246- | Token.Tag { kind = Token.End; name; _ } when List.mem name Parser_constants.heading_elements ->
12461246+ | Token.Tag { kind = Token.End; name; _ } when Parser_constants.is_heading_element name ->
12471247 if not (has_element_in_scope_impl t Parser_constants.heading_elements Parser_constants.default_scope ~check_integration_points:true) then
12481248 parse_error t "unexpected-end-tag"
12491249 else begin
···14371437 reconstruct_active_formatting t;
14381438 ignore (insert_element t name ~push:true attrs);
14391439 (* Check for self-closing on non-void HTML element *)
14401440- if self_closing && not (List.mem name Parser_constants.void_elements) then
14401440+ if self_closing && not (Parser_constants.is_void_element name) then
14411441 parse_error t "non-void-html-element-start-tag-with-trailing-solidus"
14421442 | Token.Tag { kind = Token.End; name; _ } ->
14431443 (* Any other end tag *)
···19431943 ignore (insert_element t name attrs)
19441944 (* Don't push to stack - void elements *)
19451945 (* Handle formatting elements in select *)
19461946- | Token.Tag { kind = Token.Start; name; attrs; _ } when List.mem name Parser_constants.formatting_elements ->
19461946+ | Token.Tag { kind = Token.Start; name; attrs; _ } when Parser_constants.is_formatting_element name ->
19471947 reconstruct_active_formatting t;
19481948 let node = insert_element t name ~push:true attrs in
19491949 push_formatting_element t node name attrs
19501950- | Token.Tag { kind = Token.End; name; _ } when List.mem name Parser_constants.formatting_elements ->
19501950+ | Token.Tag { kind = Token.End; name; _ } when Parser_constants.is_formatting_element name ->
19511951 (* Find select element and check if formatting element is inside select *)
19521952 let select_idx = ref None in
19531953 let fmt_idx = ref None in
···22112211 let is_html_integration_point node =
22122212 (* SVG foreignObject, desc, and title are always HTML integration points *)
22132213 if node.Dom.namespace = Some "svg" &&
22142214- List.mem node.Dom.name Parser_constants.svg_html_integration then true
22142214+ Parser_constants.is_svg_html_integration node.Dom.name then true
22152215 (* annotation-xml is an HTML integration point only with specific encoding values *)
22162216 else if node.Dom.namespace = Some "mathml" && node.Dom.name = "annotation-xml" then
22172217 match List.assoc_opt "encoding" node.Dom.attrs with
···22242224 (* Check for MathML text integration points *)
22252225 let is_mathml_text_integration_point node =
22262226 node.Dom.namespace = Some "mathml" &&
22272227- List.mem node.Dom.name ["mi"; "mo"; "mn"; "ms"; "mtext"]
22272227+ Parser_constants.is_mathml_text_integration node.Dom.name
22282228 in
22292229 (* Foreign content handling *)
22302230 let in_foreign =
···22932293 let is_html_integration_point node =
22942294 (* SVG foreignObject, desc, and title are always HTML integration points *)
22952295 if node.Dom.namespace = Some "svg" &&
22962296- List.mem node.Dom.name Parser_constants.svg_html_integration then true
22962296+ Parser_constants.is_svg_html_integration node.Dom.name then true
22972297 (* annotation-xml is an HTML integration point only with specific encoding values *)
22982298 else if node.Dom.namespace = Some "mathml" && node.Dom.name = "annotation-xml" then
22992299 match List.assoc_opt "encoding" node.Dom.attrs with
+23
lib/htmlrw_check/datatype/datatype.ml
···4141 if start > end_pos then ""
4242 else String.sub s start (end_pos - start + 1)
(** [split_on_whitespace s] returns, in order, the maximal runs of characters
    of [s] that are not whitespace according to [is_whitespace].
    Consecutive, leading and trailing separators never produce empty tokens.
    Used for space-separated attribute values. *)
let split_on_whitespace s =
  let len = String.length s in
  (* Prepend the token s.[first .. stop - 1] to [tokens] unless it is empty. *)
  let flush tokens first stop =
    if stop > first then String.sub s first (stop - first) :: tokens
    else tokens
  in
  (* [first] is where the current token began; [pos] is the scan cursor. *)
  let rec scan tokens first pos =
    if pos >= len then List.rev (flush tokens first pos)
    else if is_whitespace s.[pos] then
      scan (flush tokens first pos) (pos + 1) (pos + 1)
    else scan tokens first (pos + 1)
  in
  scan [] 0 0
6666+4467(** Factory for creating enum-based validators.
4568 Many HTML attributes accept a fixed set of keyword values.
4669 Uses Hashtbl for O(1) membership check. *)
+4
lib/htmlrw_check/datatype/datatype.mli
···4444(** Trim HTML5 whitespace from both ends of a string. *)
4545val trim_html_spaces : string -> string
46464747+(** Split string on HTML5 whitespace characters (space, tab, LF, FF, CR).
4848+ Filters out empty tokens. Used for space-separated attribute values. *)
4949+val split_on_whitespace : string -> string list
5050+4751(** {2 Datatype Factories} *)
48524953(** Create an enum-based validator for attributes with fixed keyword values.
+6-20
lib/htmlrw_check/datatype/dt_autocomplete.ml
···11(** Autocomplete attribute validation based on HTML5 spec *)
2233-(** Check if character is whitespace *)
44-let is_whitespace c = c = ' ' || c = '\t' || c = '\n' || c = '\r'
33+(* Use shared utilities from Datatype *)
44+let is_whitespace = Datatype.is_whitespace
55+let to_ascii_lowercase = Datatype.to_ascii_lowercase
5666-(** Convert character to ASCII lowercase *)
77-let to_ascii_lowercase c =
88- if c >= 'A' && c <= 'Z' then Char.chr (Char.code c + 32) else c
99-1010-(** Trim whitespace from string *)
77+(** Trim whitespace from string and collapse internal whitespace *)
118let trim_whitespace s =
129 let s = String.trim s in
1310 (* Also collapse internal whitespace *)
···104101 "impp";
105102 ]
106103107107-(** Split string on whitespace *)
108108-let split_on_whitespace s =
109109- let rec split acc start i =
110110- if i >= String.length s then
111111- if start < i then List.rev (String.sub s start (i - start) :: acc)
112112- else List.rev acc
113113- else if is_whitespace s.[i] then
114114- if start < i then
115115- split (String.sub s start (i - start) :: acc) (i + 1) (i + 1)
116116- else split acc (i + 1) (i + 1)
117117- else split acc start (i + 1)
118118- in
119119- split [] 0 0
104104+(** Split string on whitespace - uses shared utility *)
105105+let split_on_whitespace = Datatype.split_on_whitespace
120106121107(** Check if string starts with prefix *)
122108let starts_with s prefix =
+34-44
lib/htmlrw_check/semantic/id_checker.ml
···5050 else
5151 None
52525353-(** Split whitespace-separated ID references. *)
5454-let split_ids value =
5555- let rec split acc start i =
5656- if i >= String.length value then
5757- if i > start then
5858- (String.sub value start (i - start)) :: acc
5959- else
6060- acc
6161- else
6262- match value.[i] with
6363- | ' ' | '\t' | '\n' | '\r' ->
6464- let acc' =
6565- if i > start then
6666- (String.sub value start (i - start)) :: acc
6767- else
6868- acc
6969- in
7070- split acc' (i + 1) (i + 1)
7171- | _ ->
7272- split acc start (i + 1)
7373- in
7474- List.rev (split [] 0 0)
5353+(** Split whitespace-separated ID references - uses shared utility. *)
5454+let split_ids = Datatype.split_on_whitespace
75557676-(** Attributes that reference a single ID. *)
7777-let single_id_ref_attrs = [
7878- "for"; (* label *)
7979- "form"; (* form-associated elements *)
8080- "list"; (* input *)
8181- "aria-activedescendant";
8282- "popovertarget"; (* button - references popover element *)
8383- "commandfor"; (* button - references element to control *)
8484- "anchor"; (* popover - references anchor element *)
8585-]
(** Names of attributes whose value references a single ID, stored as the
    keys of a hashtable so membership is tested with [Hashtbl.mem]. *)
let single_id_ref_attrs =
  let names = [
    "for";                    (* label *)
    "form";                   (* form-associated elements *)
    "list";                   (* input *)
    "aria-activedescendant";
    "popovertarget";          (* button - references popover element *)
    "commandfor";             (* button - references element to control *)
    "anchor";                 (* popover - references anchor element *)
  ] in
  let table = Hashtbl.create 8 in
  List.iter (fun attr -> Hashtbl.add table attr ()) names;
  table
86698787-(** Attributes that reference multiple IDs (space-separated). *)
8888-let multi_id_ref_attrs = [
8989- "headers"; (* td, th *)
9090- "aria-labelledby";
9191- "aria-describedby";
9292- "aria-controls";
9393- "aria-flowto";
9494- "aria-owns";
9595- "itemref";
9696-]
7070+let is_single_id_ref_attr name = Hashtbl.mem single_id_ref_attrs name
(** Names of attributes whose value is a space-separated list of ID
    references, stored as the keys of a hashtable so membership is tested
    with [Hashtbl.mem]. *)
let multi_id_ref_attrs =
  let names = [
    "headers";                (* td, th *)
    "aria-labelledby";
    "aria-describedby";
    "aria-controls";
    "aria-flowto";
    "aria-owns";
    "itemref";
  ] in
  let table = Hashtbl.create 8 in
  List.iter (fun attr -> Hashtbl.add table attr ()) names;
  table

(** [is_multi_id_ref_attr name] is [true] iff [name] is a key of
    [multi_id_ref_attrs]. *)
let is_multi_id_ref_attr name =
  match Hashtbl.find_opt multi_id_ref_attrs name with
  | Some () -> true
  | None -> false
97879888(** Check and store an ID attribute. *)
9989let check_id state ~element:_ ~id ~location:_ collector =
···161151 if String.length value > 0 then
162152 Hashtbl.add state.map_names value ()
163153164164- | attr when List.mem attr single_id_ref_attrs ->
154154+ | attr when is_single_id_ref_attr attr ->
165155 add_reference state ~referring_element:element
166156 ~attribute:attr ~referenced_id:value ~location
167157168168- | attr when List.mem attr multi_id_ref_attrs ->
158158+ | attr when is_multi_id_ref_attr attr ->
169159 (* Split space-separated IDs and add each as a reference *)
170160 let ids = split_ids value in
171161 List.iter (fun id ->
+2-22
lib/htmlrw_check/specialized/microdata_checker.ml
···4343 Hashtbl.clear state.all_ids;
4444 state.html_element_seen <- false
45454646-(** Split whitespace-separated values. *)
4747-let split_whitespace value =
4848- let rec split acc start i =
4949- if i >= String.length value then
5050- if i > start then
5151- (String.sub value start (i - start)) :: acc
5252- else
5353- acc
5454- else
5555- match value.[i] with
5656- | ' ' | '\t' | '\n' | '\r' ->
5757- let acc' =
5858- if i > start then
5959- (String.sub value start (i - start)) :: acc
6060- else
6161- acc
6262- in
6363- split acc' (i + 1) (i + 1)
6464- | _ ->
6565- split acc start (i + 1)
6666- in
6767- List.rev (split [] 0 0)
4646+(** Split whitespace-separated values - uses shared utility. *)
4747+let split_whitespace = Datatype.split_on_whitespace
68486949(** Check if a string is a valid URL (contains a colon). *)
7050let is_url s =