···4141 (** Iterate over all contexts (top to bottom). *)
4242 val iter : 'a t -> ('a -> unit) -> unit
4343end = struct
4444- type 'a t = { mutable stack : 'a list }
4444+ type 'a t = { mutable stack : 'a list; mutable len : int }
45454646- let create () = { stack = [] }
4747- let reset t = t.stack <- []
4848- let push t x = t.stack <- x :: t.stack
4646+ let create () = { stack = []; len = 0 }
4747+ let reset t = t.stack <- []; t.len <- 0
4848+ let push t x = t.stack <- x :: t.stack; t.len <- t.len + 1
4949 let pop t = match t.stack with
5050 | [] -> None
5151- | x :: rest -> t.stack <- rest; Some x
5151+ | x :: rest -> t.stack <- rest; t.len <- t.len - 1; Some x
5252 let current t = match t.stack with
5353 | [] -> None
5454 | x :: _ -> Some x
5555- let depth t = List.length t.stack
5656- let is_empty t = t.stack = []
5555+ let depth t = t.len (* O(1) instead of O(n) *)
5656+ let is_empty t = t.len = 0
5757 let to_list t = List.rev t.stack
5858 let exists t f = List.exists f t.stack
5959 let find t f = List.find_opt f t.stack
···124124 (** Get all ancestor names (outermost first). *)
125125 val to_list : t -> string list
126126end = struct
127127- type t = { mutable stack : string list }
127127+ type t = { mutable stack : string list; mutable len : int }
128128129129- let create () = { stack = [] }
130130- let reset t = t.stack <- []
131131- let push t name = t.stack <- name :: t.stack
129129+ let create () = { stack = []; len = 0 }
130130+ let reset t = t.stack <- []; t.len <- 0
131131+ let push t name = t.stack <- name :: t.stack; t.len <- t.len + 1
132132 let pop t = match t.stack with
133133- | _ :: rest -> t.stack <- rest
133133+ | _ :: rest -> t.stack <- rest; t.len <- t.len - 1
134134 | [] -> ()
135135 let parent t = match t.stack with
136136 | x :: _ -> Some x
137137 | [] -> None
138138 let has_ancestor t name = List.mem name t.stack
139139- let depth t = List.length t.stack
139139+ let depth t = t.len (* O(1) instead of O(n) *)
140140 let to_list t = List.rev t.stack
141141end
+6-3
lib/htmlrw_check/datatype/datatype.ml
···4242 else String.sub s start (end_pos - start + 1)
43434444(** Factory for creating enum-based validators.
4545- Many HTML attributes accept a fixed set of keyword values. *)
4545+ Many HTML attributes accept a fixed set of keyword values.
4646+ Uses Hashtbl for O(1) membership check. *)
4647let make_enum ~name ~values ?(allow_empty = true) () : t =
4747- let values_set = List.map String.lowercase_ascii values in
4848+ (* Pre-compute hashtable for O(1) membership *)
4949+ let values_tbl = Hashtbl.create (List.length values) in
5050+ List.iter (fun v -> Hashtbl.add values_tbl (String.lowercase_ascii v) ()) values;
4851 let values_str = String.concat ", " (List.map (Printf.sprintf "'%s'") values) in
4952 (module struct
5053 let name = name
5154 let validate s =
5255 let s_lower = string_to_ascii_lowercase s in
5353- if (allow_empty && s = "") || List.mem s_lower values_set then Ok ()
5656+ if (allow_empty && s = "") || Hashtbl.mem values_tbl s_lower then Ok ()
5457 else Error (Printf.sprintf "The value '%s' is not a valid %s value. Expected %s%s."
5558 s name (if allow_empty then "empty string, " else "") values_str)
5659 let is_valid s = Result.is_ok (validate s)
+73-72
lib/htmlrw_check/element/tag.ml
···157157158158(** {1 Conversion Functions} *)
159159160160-(** Convert a lowercase tag name string to html_tag option *)
161161-let html_tag_of_string_opt name =
162162- match name with
163163- (* Document metadata *)
164164- | "html" -> Some `Html | "head" -> Some `Head | "title" -> Some `Title
165165- | "base" -> Some `Base | "link" -> Some `Link | "meta" -> Some `Meta
166166- | "style" -> Some `Style
167167- (* Sectioning root *)
168168- | "body" -> Some `Body
169169- (* Content sectioning *)
170170- | "address" -> Some `Address | "article" -> Some `Article | "aside" -> Some `Aside
171171- | "footer" -> Some `Footer | "header" -> Some `Header | "hgroup" -> Some `Hgroup
172172- | "main" -> Some `Main | "nav" -> Some `Nav | "search" -> Some `Search
173173- | "section" -> Some `Section
174174- (* Headings *)
175175- | "h1" -> Some `H1 | "h2" -> Some `H2 | "h3" -> Some `H3
176176- | "h4" -> Some `H4 | "h5" -> Some `H5 | "h6" -> Some `H6
177177- (* Grouping content *)
178178- | "blockquote" -> Some `Blockquote | "dd" -> Some `Dd | "div" -> Some `Div
179179- | "dl" -> Some `Dl | "dt" -> Some `Dt | "figcaption" -> Some `Figcaption
180180- | "figure" -> Some `Figure | "hr" -> Some `Hr | "li" -> Some `Li
181181- | "menu" -> Some `Menu | "ol" -> Some `Ol | "p" -> Some `P
182182- | "pre" -> Some `Pre | "ul" -> Some `Ul
183183- (* Text-level semantics *)
184184- | "a" -> Some `A | "abbr" -> Some `Abbr | "b" -> Some `B
185185- | "bdi" -> Some `Bdi | "bdo" -> Some `Bdo | "br" -> Some `Br
186186- | "cite" -> Some `Cite | "code" -> Some `Code | "data" -> Some `Data
187187- | "dfn" -> Some `Dfn | "em" -> Some `Em | "i" -> Some `I
188188- | "kbd" -> Some `Kbd | "mark" -> Some `Mark | "q" -> Some `Q
189189- | "rp" -> Some `Rp | "rt" -> Some `Rt | "ruby" -> Some `Ruby
190190- | "s" -> Some `S | "samp" -> Some `Samp | "small" -> Some `Small
191191- | "span" -> Some `Span | "strong" -> Some `Strong | "sub" -> Some `Sub
192192- | "sup" -> Some `Sup | "time" -> Some `Time | "u" -> Some `U
193193- | "var" -> Some `Var | "wbr" -> Some `Wbr
194194- (* Edits *)
195195- | "del" -> Some `Del | "ins" -> Some `Ins
196196- (* Embedded content *)
197197- | "area" -> Some `Area | "audio" -> Some `Audio | "canvas" -> Some `Canvas
198198- | "embed" -> Some `Embed | "iframe" -> Some `Iframe | "img" -> Some `Img
199199- | "map" -> Some `Map | "object" -> Some `Object | "picture" -> Some `Picture
200200- | "source" -> Some `Source | "track" -> Some `Track | "video" -> Some `Video
201201- (* Tabular data *)
202202- | "caption" -> Some `Caption | "col" -> Some `Col | "colgroup" -> Some `Colgroup
203203- | "table" -> Some `Table | "tbody" -> Some `Tbody | "td" -> Some `Td
204204- | "tfoot" -> Some `Tfoot | "th" -> Some `Th | "thead" -> Some `Thead
205205- | "tr" -> Some `Tr
206206- (* Forms *)
207207- | "button" -> Some `Button | "datalist" -> Some `Datalist
208208- | "fieldset" -> Some `Fieldset | "form" -> Some `Form | "input" -> Some `Input
209209- | "label" -> Some `Label | "legend" -> Some `Legend | "meter" -> Some `Meter
210210- | "optgroup" -> Some `Optgroup | "option" -> Some `Option
211211- | "output" -> Some `Output | "progress" -> Some `Progress
212212- | "select" -> Some `Select | "textarea" -> Some `Textarea
213213- (* Interactive *)
214214- | "details" -> Some `Details | "dialog" -> Some `Dialog | "summary" -> Some `Summary
215215- (* Scripting *)
216216- | "noscript" -> Some `Noscript | "script" -> Some `Script
217217- | "slot" -> Some `Slot | "template" -> Some `Template
218218- (* Web Components / Misc *)
219219- | "portal" -> Some `Portal | "param" -> Some `Param
220220- (* Deprecated/obsolete elements *)
221221- | "applet" -> Some `Applet | "acronym" -> Some `Acronym | "bgsound" -> Some `Bgsound
222222- | "dir" -> Some `Dir | "frame" -> Some `Frame | "frameset" -> Some `Frameset
223223- | "noframes" -> Some `Noframes | "isindex" -> Some `Isindex | "keygen" -> Some `Keygen
224224- | "listing" -> Some `Listing | "menuitem" -> Some `Menuitem | "nextid" -> Some `Nextid
225225- | "noembed" -> Some `Noembed | "plaintext" -> Some `Plaintext
226226- | "rb" -> Some `Rb | "rtc" -> Some `Rtc | "strike" -> Some `Strike | "xmp" -> Some `Xmp
227227- | "basefont" -> Some `Basefont | "big" -> Some `Big | "blink" -> Some `Blink
228228- | "center" -> Some `Center | "font" -> Some `Font | "marquee" -> Some `Marquee
229229- | "multicol" -> Some `Multicol | "nobr" -> Some `Nobr | "spacer" -> Some `Spacer
230230- | "tt" -> Some `Tt | "image" -> Some `Image
231231- | _ -> None
160160+(** Hashtable for O(1) tag name lookup - initialized once at module load *)
161161+let html_tag_table : (string, html_tag) Hashtbl.t =
162162+ let tbl = Hashtbl.create 128 in
163163+ List.iter (fun (name, tag) -> Hashtbl.add tbl name tag) [
164164+ (* Document metadata *)
165165+ ("html", `Html); ("head", `Head); ("title", `Title);
166166+ ("base", `Base); ("link", `Link); ("meta", `Meta); ("style", `Style);
167167+ (* Sectioning root *)
168168+ ("body", `Body);
169169+ (* Content sectioning *)
170170+ ("address", `Address); ("article", `Article); ("aside", `Aside);
171171+ ("footer", `Footer); ("header", `Header); ("hgroup", `Hgroup);
172172+ ("main", `Main); ("nav", `Nav); ("search", `Search); ("section", `Section);
173173+ (* Headings *)
174174+ ("h1", `H1); ("h2", `H2); ("h3", `H3);
175175+ ("h4", `H4); ("h5", `H5); ("h6", `H6);
176176+ (* Grouping content *)
177177+ ("blockquote", `Blockquote); ("dd", `Dd); ("div", `Div);
178178+ ("dl", `Dl); ("dt", `Dt); ("figcaption", `Figcaption);
179179+ ("figure", `Figure); ("hr", `Hr); ("li", `Li);
180180+ ("menu", `Menu); ("ol", `Ol); ("p", `P); ("pre", `Pre); ("ul", `Ul);
181181+ (* Text-level semantics *)
182182+ ("a", `A); ("abbr", `Abbr); ("b", `B);
183183+ ("bdi", `Bdi); ("bdo", `Bdo); ("br", `Br);
184184+ ("cite", `Cite); ("code", `Code); ("data", `Data);
185185+ ("dfn", `Dfn); ("em", `Em); ("i", `I);
186186+ ("kbd", `Kbd); ("mark", `Mark); ("q", `Q);
187187+ ("rp", `Rp); ("rt", `Rt); ("ruby", `Ruby);
188188+ ("s", `S); ("samp", `Samp); ("small", `Small);
189189+ ("span", `Span); ("strong", `Strong); ("sub", `Sub);
190190+ ("sup", `Sup); ("time", `Time); ("u", `U);
191191+ ("var", `Var); ("wbr", `Wbr);
192192+ (* Edits *)
193193+ ("del", `Del); ("ins", `Ins);
194194+ (* Embedded content *)
195195+ ("area", `Area); ("audio", `Audio); ("canvas", `Canvas);
196196+ ("embed", `Embed); ("iframe", `Iframe); ("img", `Img);
197197+ ("map", `Map); ("object", `Object); ("picture", `Picture);
198198+ ("source", `Source); ("track", `Track); ("video", `Video);
199199+ (* Tabular data *)
200200+ ("caption", `Caption); ("col", `Col); ("colgroup", `Colgroup);
201201+ ("table", `Table); ("tbody", `Tbody); ("td", `Td);
202202+ ("tfoot", `Tfoot); ("th", `Th); ("thead", `Thead); ("tr", `Tr);
203203+ (* Forms *)
204204+ ("button", `Button); ("datalist", `Datalist);
205205+ ("fieldset", `Fieldset); ("form", `Form); ("input", `Input);
206206+ ("label", `Label); ("legend", `Legend); ("meter", `Meter);
207207+ ("optgroup", `Optgroup); ("option", `Option);
208208+ ("output", `Output); ("progress", `Progress);
209209+ ("select", `Select); ("textarea", `Textarea);
210210+ (* Interactive *)
211211+ ("details", `Details); ("dialog", `Dialog); ("summary", `Summary);
212212+ (* Scripting *)
213213+ ("noscript", `Noscript); ("script", `Script);
214214+ ("slot", `Slot); ("template", `Template);
215215+ (* Web Components / Misc *)
216216+ ("portal", `Portal); ("param", `Param);
217217+ (* Deprecated/obsolete elements *)
218218+ ("applet", `Applet); ("acronym", `Acronym); ("bgsound", `Bgsound);
219219+ ("dir", `Dir); ("frame", `Frame); ("frameset", `Frameset);
220220+ ("noframes", `Noframes); ("isindex", `Isindex); ("keygen", `Keygen);
221221+ ("listing", `Listing); ("menuitem", `Menuitem); ("nextid", `Nextid);
222222+ ("noembed", `Noembed); ("plaintext", `Plaintext);
223223+ ("rb", `Rb); ("rtc", `Rtc); ("strike", `Strike); ("xmp", `Xmp);
224224+ ("basefont", `Basefont); ("big", `Big); ("blink", `Blink);
225225+ ("center", `Center); ("font", `Font); ("marquee", `Marquee);
226226+ ("multicol", `Multicol); ("nobr", `Nobr); ("spacer", `Spacer);
227227+ ("tt", `Tt); ("image", `Image);
228228+ ];
229229+ tbl
230230+231231+(** Convert a lowercase tag name string to html_tag option - O(1) lookup *)
232232+let html_tag_of_string_opt name = Hashtbl.find_opt html_tag_table name
232233233234(** Check if a name is a valid custom element name (contains hyphen, not reserved) *)
234235let is_custom_element_name name =
···1616let max_chars = 30720
1717let min_chars = 1024
18181919-(* Elements whose text content we skip for language detection *)
2020-let skip_elements = [
2121- "a"; "button"; "details"; "figcaption"; "form"; "li"; "nav";
2222- "pre"; "script"; "select"; "span"; "style"; "summary";
2323- "td"; "textarea"; "th"; "tr"
2424-]
1919+(* Elements whose text content we skip for language detection - O(1) lookup *)
2020+let skip_elements =
2121+ let tbl = Hashtbl.create 20 in
2222+ List.iter (fun e -> Hashtbl.add tbl e ()) [
2323+ "a"; "button"; "details"; "figcaption"; "form"; "li"; "nav";
2424+ "pre"; "script"; "select"; "span"; "style"; "summary";
2525+ "td"; "textarea"; "th"; "tr"
2626+ ];
2727+ tbl
25282626-(* RTL languages *)
2727-let rtl_langs = ["ar"; "azb"; "ckb"; "dv"; "fa"; "he"; "pnb"; "ps"; "sd"; "ug"; "ur"; "iw"]
2929+let is_skip_element name = Hashtbl.mem skip_elements name
3030+3131+(* RTL languages - O(1) lookup *)
3232+let rtl_langs =
3333+ let tbl = Hashtbl.create 16 in
3434+ List.iter (fun l -> Hashtbl.add tbl l ()) ["ar"; "azb"; "ckb"; "dv"; "fa"; "he"; "pnb"; "ps"; "sd"; "ug"; "ur"; "iw"];
3535+ tbl
3636+3737+let is_rtl_lang lang = Hashtbl.mem rtl_langs lang
28382939let create () = {
3040 html_lang = None;
···217227 if state.foreign_depth > 0 then
218228 state.foreign_depth <- state.foreign_depth + 1
219229 (* Check if we should skip this element's text *)
220220- else if List.mem name_lower skip_elements then
230230+ else if is_skip_element name_lower then
221231 state.skip_depth <- state.skip_depth + 1
222232 else begin
223233 (* Check for different lang attribute *)
···241251 if state.foreign_depth > 0 then
242252 state.foreign_depth <- state.foreign_depth - 1
243253 else if state.skip_depth > 0 then begin
244244- if List.mem name_lower skip_elements then
254254+ if is_skip_element name_lower then
245255 state.skip_depth <- state.skip_depth - 1
246256 else
247257 (* TODO: properly track nested elements with different lang *)
···313323 end;
314324315325 (* Check dir attribute for RTL languages *)
316316- if List.mem base_detected rtl_langs then begin
326326+ if is_rtl_lang base_detected then begin
317327 match state.html_dir with
318328 | None ->
319329 Message_collector.add_typed collector
+10-7
lib/htmlrw_check/semantic/nesting_checker.ml
···1313 "s"; "small"; "mark"; "abbr"; "cite"; "code"; "q"; "sub"; "sup"; "samp";
1414 "kbd"; "var" |]
15151616+(** Hashtable for O(1) lookup of special ancestor bit positions *)
1717+let special_ancestor_table : (string, int) Hashtbl.t =
1818+ let tbl = Hashtbl.create 64 in
1919+ Array.iteri (fun i name -> Hashtbl.add tbl name i) special_ancestors;
2020+ tbl
2121+1622(** Get the bit position for a special ancestor element.
1717- Returns [-1] if the element is not a special ancestor. *)
2323+ Returns [-1] if the element is not a special ancestor. O(1) lookup. *)
1824let special_ancestor_number name =
1919- let rec find i =
2020- if i >= Array.length special_ancestors then -1
2121- else if special_ancestors.(i) = name then i
2222- else find (i + 1)
2323- in
2424- find 0
2525+ match Hashtbl.find_opt special_ancestor_table name with
2626+ | Some i -> i
2727+ | None -> -1
25282629(** Interactive elements that cannot be nested inside [a] or [button]. *)
2730let interactive_elements =
+9-6
lib/htmlrw_check/semantic/obsolete_checker.ml
···188188189189 tbl
190190191191-(** Obsolete style attributes map: attr_name -> element_name list *)
192192-let obsolete_style_attrs =
191191+(** Obsolete style attributes map: attr_name -> element_name -> unit hashtable
192192+ Uses nested hashtables for O(1) lookup instead of List.mem O(n) *)
193193+let obsolete_style_attrs : (string, (string, unit) Hashtbl.t) Hashtbl.t =
193194 let tbl = Hashtbl.create 64 in
194195195196 let register attr_name elements =
196196- Hashtbl.add tbl attr_name elements
197197+ let elem_tbl = Hashtbl.create (List.length elements) in
198198+ List.iter (fun e -> Hashtbl.add elem_tbl e ()) elements;
199199+ Hashtbl.add tbl attr_name elem_tbl
197200 in
198201199202 register "align" ["caption"; "iframe"; "img"; "input"; "object"; "embed"; "legend"; "table"; "hr"; "div"; "h1"; "h2"; "h3"; "h4"; "h5"; "h6"; "p"; "col"; "colgroup"; "tbody"; "td"; "tfoot"; "th"; "thead"; "tr"];
···292295 Message_collector.add_typed collector
293296 (`Element (`Obsolete_attr (`Elem name, `Attr attr_name, `Suggestion (Some suggestion))))));
294297295295- (* Check obsolete style attributes *)
298298+ (* Check obsolete style attributes - O(1) nested hashtable lookup *)
296299 (match Hashtbl.find_opt obsolete_style_attrs attr_lower with
297300 | None -> ()
298298- | Some elements ->
299299- if List.mem name_lower elements then
301301+ | Some elem_tbl ->
302302+ if Hashtbl.mem elem_tbl name_lower then
300303 Message_collector.add_typed collector
301304 (`Element (`Obsolete_attr (`Elem name, `Attr attr_name, `Suggestion (Some "Use CSS instead.")))));
302305
+9-4
lib/htmlrw_check/specialized/label_checker.ml
···22 Validates that label element contains at most one labelable element
33 and that descendants with for attribute have matching ids. *)
4455-(** Labelable elements that label can reference *)
66-let labelable_elements = ["button"; "input"; "meter"; "output"; "progress"; "select"; "textarea"]
55+(** Labelable elements that label can reference - O(1) hashtable lookup *)
66+let labelable_elements =
77+ let tbl = Hashtbl.create 8 in
88+ List.iter (fun e -> Hashtbl.add tbl e ()) ["button"; "input"; "meter"; "output"; "progress"; "select"; "textarea"];
99+ tbl
1010+1111+let is_labelable name = Hashtbl.mem labelable_elements name
712813type label_for_info = {
914 for_target : string;
···6570 let name_lower = String.lowercase_ascii (Tag.tag_to_string (Tag.Html tag)) in
66716772 (* Track labelable element IDs *)
6868- (if List.mem name_lower labelable_elements then
7373+ (if is_labelable name_lower then
6974 match Attr_utils.get_attr "id" element.raw_attrs with
7075 | Some id -> state.labelable_ids <- id :: state.labelable_ids
7176 | None -> ());
···7479 state.label_depth <- state.label_depth + 1;
75807681 (* Check for labelable elements inside label *)
7777- if List.mem name_lower labelable_elements then begin
8282+ if is_labelable name_lower then begin
7883 state.labelable_count <- state.labelable_count + 1;
7984 if state.labelable_count > 1 then
8085 Message_collector.add_typed collector (`Label `Too_many_labelable);