···1+(*---------------------------------------------------------------------------
2+ Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
3+ SPDX-License-Identifier: MIT
4+ ---------------------------------------------------------------------------*)
5+6+(** Common attribute utilities used across checkers.
7+8+ This module provides simple helper functions for working with raw
9+ attribute lists (name-value pairs). These utilities are used by
10+ checkers that need to inspect attributes without full typed parsing.
11+12+ For typed attribute access, see the {!Attr} module.
13+*)
14+15+(** {1 Types} *)
16+17+type attrs = (string * string) list
18+(** Raw attribute list as name-value pairs. *)
19+20+(** {1 Attribute Lookup} *)
21+22+val has_attr : string -> attrs -> bool
23+(** [has_attr name attrs] checks if an attribute exists.
24+25+ The comparison is case-insensitive.
26+27+ @param name The attribute name to look for (lowercase)
28+ @param attrs The attribute list
29+ @return [true] if the attribute is present *)
30+31+val get_attr : string -> attrs -> string option
32+(** [get_attr name attrs] gets an attribute value.
33+34+ The comparison is case-insensitive.
35+36+ @param name The attribute name to look for (lowercase)
37+ @param attrs The attribute list
38+ @return [Some value] if found, [None] otherwise *)
39+40+val get_attr_or : string -> default:string -> attrs -> string
41+(** [get_attr_or name ~default attrs] gets an attribute value with a default.
42+43+ @param name The attribute name to look for (lowercase)
44+ @param default The default value if not found
45+ @param attrs The attribute list
46+ @return The attribute value or the default *)
47+48+val is_non_empty_attr : string -> attrs -> bool
49+ (** [is_non_empty_attr name attrs] checks if an attribute exists with a non-empty value.
50+51+ The value is considered non-empty if it contains non-whitespace characters.
52+53+ @param name The attribute name to look for (lowercase)
54+ @param attrs The attribute list
55+ @return [true] if the attribute exists and has a non-empty value *)
···1+(*---------------------------------------------------------------------------
2+ Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
3+ SPDX-License-Identifier: MIT
4+ ---------------------------------------------------------------------------*)
5+6+(** Typed HTML5 element representation.
7+8+ This module combines tags and attributes into a complete typed element
9+ representation. Elements are created from raw input (tag name, namespace,
10+ attributes) and provide typed accessors for validation and manipulation.
11+12+ {2 Design Philosophy}
13+14+ An element in this module represents a complete typed view of an HTML
15+ element, including:
16+17+ - The element's tag (typed via {!Tag.element_tag})
18+ - Typed attributes (via {!Attr.t} list)
19+ - Raw attributes (for fallback access)
20+21+ This dual representation allows checkers to use typed pattern matching
22+ for common cases while falling back to raw strings when needed.
23+24+ {2 Usage Example}
25+26+ {[
27+ let elem = Element.create
28+ ~name:"input"
29+ ~namespace:None
30+ ~attrs:[("type", "email"); ("required", ""); ("class", "form-input")]
31+ in
32+ match elem.tag with
33+ | Tag.Html `Input ->
34+ if Element.has_required elem then
35+ (* Validate required input *)
36+ ()
37+ | _ -> ()
38+ ]}
39+40+ See {!Tag} for element tag types.
41+ See {!Attr} for attribute types.
42+*)
43+44+(** {1 Element Type} *)
45+46+(** A typed HTML element.
47+48+ @field tag The element's tag classification
49+ @field attrs Typed attributes parsed from raw input
50+ @field raw_attrs Original attribute name-value pairs for fallback *)
51+type t = {
52+ tag : Tag.element_tag;
53+ attrs : Attr.t list;
54+ raw_attrs : (string * string) list;
55+}
56+57+(** {1 Construction} *)
58+59+val create : name:string -> namespace:string option -> attrs:(string * string) list -> t
60+(** [create ~name ~namespace ~attrs] creates a typed element.
61+62+ @param name The element's tag name
63+ @param namespace Optional namespace URI (for SVG/MathML)
64+ @param attrs Raw attribute name-value pairs
65+ @return A typed element
66+67+ {b Example:}
68+ {[
69+ let div = Element.create ~name:"div" ~namespace:None
70+ ~attrs:[("class", "container"); ("id", "main")]
71+ ]} *)
72+73+(** {1 Tag Accessors} *)
74+75+val tag : t -> Tag.element_tag
76+(** [tag elem] returns the element's tag. *)
77+78+val tag_name : t -> string
79+(** [tag_name elem] returns the element's tag name as a string. *)
80+81+val is_html_tag : Tag.html_tag -> t -> bool
82+(** [is_html_tag expected elem] checks if the element is a specific HTML tag.
83+84+ @param expected The expected HTML tag variant
85+ @param elem The element to check
86+ @return [true] if the element matches *)
87+88+val as_html_tag : t -> Tag.html_tag option
89+(** [as_html_tag elem] extracts the HTML tag if this is an HTML element.
90+91+ @return [Some tag] for HTML elements, [None] for SVG/MathML/Custom/Unknown *)
92+93+(** {1 Attribute Accessors} *)
94+95+val attrs : t -> Attr.t list
96+(** [attrs elem] returns the typed attributes. *)
97+98+val raw_attrs : t -> (string * string) list
99+(** [raw_attrs elem] returns the original raw attributes. *)
100+101+val get_id : t -> string option
102+(** [get_id elem] extracts the id attribute value. *)
103+104+val get_class : t -> string option
105+(** [get_class elem] extracts the class attribute value. *)
106+107+val get_href : t -> string option
108+(** [get_href elem] extracts the href attribute value. *)
109+110+val get_src : t -> string option
111+(** [get_src elem] extracts the src attribute value. *)
112+113+val get_alt : t -> string option
114+(** [get_alt elem] extracts the alt attribute value. *)
115+116+val get_name : t -> string option
117+(** [get_name elem] extracts the name attribute value. *)
118+119+val get_value : t -> string option
120+(** [get_value elem] extracts the value attribute value. *)
121+122+val get_role : t -> string option
123+(** [get_role elem] extracts the role attribute value. *)
124+125+val get_aria : string -> t -> string option
126+(** [get_aria name elem] extracts a specific aria-* attribute value.
127+128+ @param name The aria attribute name without the "aria-" prefix *)
129+130+val get_data : string -> t -> string option
131+(** [get_data name elem] extracts a specific data-* attribute value.
132+133+ @param name The data attribute name without the "data-" prefix *)
134+135+val has_disabled : t -> bool
136+(** [has_disabled elem] checks if the disabled attribute is present. *)
137+138+val has_required : t -> bool
139+(** [has_required elem] checks if the required attribute is present. *)
140+141+val has_readonly : t -> bool
142+(** [has_readonly elem] checks if the readonly attribute is present. *)
143+144+val has_checked : t -> bool
145+(** [has_checked elem] checks if the checked attribute is present. *)
146+147+val has_autofocus : t -> bool
148+(** [has_autofocus elem] checks if the autofocus attribute is present. *)
149+150+val has_hidden : t -> bool
151+(** [has_hidden elem] checks if the hidden attribute is present. *)
152+153+val has_inert : t -> bool
154+(** [has_inert elem] checks if the inert attribute is present. *)
155+156+val has_open : t -> bool
157+(** [has_open elem] checks if the open attribute is present. *)
158+159+val get_all_aria : t -> (string * string) list
160+(** [get_all_aria elem] extracts all aria-* attributes. *)
161+162+val get_all_data : t -> (string * string) list
163+(** [get_all_data elem] extracts all data-* attributes. *)
164+165+(** {1 Raw Attribute Fallback} *)
166+167+val get_raw_attr : string -> t -> string option
168+(** [get_raw_attr name elem] gets a raw attribute value by name.
169+170+ This is useful when the typed representation doesn't capture a specific
171+ attribute or when you need the exact original value.
172+173+ @param name The attribute name (case-insensitive)
174+ @param elem The element
175+ @return [Some value] if the attribute exists *)
176+177+val has_raw_attr : string -> t -> bool
178+(** [has_raw_attr name elem] checks if a raw attribute exists.
179+180+ @param name The attribute name (case-insensitive)
181+ @param elem The element
182+ @return [true] if the attribute is present *)
183+184+(** {1 Category Checks}
185+186+ These predicates check element categories based on the HTML5 content model. *)
187+188+val is_void : t -> bool
189+(** [is_void elem] checks if this is a void element (cannot have children).
190+191+ @return [true] for br, hr, img, input, etc. *)
192+193+val is_heading : t -> bool
194+(** [is_heading elem] checks if this is a heading element.
195+196+ @return [true] for h1-h6 *)
197+198+val heading_level : t -> int option
199+(** [heading_level elem] gets the heading level (1-6) if applicable.
200+201+ @return [Some level] for h1-h6, [None] otherwise *)
202+203+val is_sectioning : t -> bool
204+(** [is_sectioning elem] checks if this is sectioning content.
205+206+ @return [true] for article, aside, nav, section *)
207+208+val is_sectioning_root : t -> bool
209+(** [is_sectioning_root elem] checks if this is a sectioning root.
210+211+ @return [true] for blockquote, body, details, dialog, fieldset, figure, td *)
212+213+val is_embedded : t -> bool
214+(** [is_embedded elem] checks if this is embedded content.
215+216+ @return [true] for audio, canvas, embed, iframe, img, object, picture, video *)
217+218+val is_interactive : t -> bool
219+(** [is_interactive elem] checks if this is interactive content.
220+221+ @return [true] for focusable/activatable elements *)
222+223+val is_form_associated : t -> bool
224+(** [is_form_associated elem] checks if this is form-associated.
225+226+ @return [true] for elements that can belong to a form *)
227+228+val is_labelable : t -> bool
229+(** [is_labelable elem] checks if this can be associated with a label.
230+231+ @return [true] for button, input, meter, output, progress, select, textarea *)
232+233+val is_submittable : t -> bool
234+(** [is_submittable elem] checks if this is a submittable form element.
235+236+ @return [true] for button, input, select, textarea *)
237+238+val is_table_element : t -> bool
239+(** [is_table_element elem] checks if this is a table-related element.
240+241+ @return [true] for table, tr, td, th, etc. *)
242+243+val is_media : t -> bool
244+(** [is_media elem] checks if this is a media element.
245+246+ @return [true] for audio, video *)
247+248+val is_list_container : t -> bool
249+(** [is_list_container elem] checks if this is a list container.
250+251+ @return [true] for ul, ol, menu, dl *)
252+253+val is_transparent : t -> bool
254+(** [is_transparent elem] checks if this has a transparent content model.
255+256+ @return [true] for a, abbr, audio, canvas, del, ins, map, noscript, etc. *)
257+258+val is_phrasing : t -> bool
259+(** [is_phrasing elem] checks if this is phrasing content.
260+261+ @return [true] for inline-level elements *)
262+263+val is_flow : t -> bool
264+(** [is_flow elem] checks if this is flow content.
265+266+ @return [true] for most body-level elements *)
267+268+val is_obsolete : t -> bool
269+ (** [is_obsolete elem] checks if this is a deprecated/obsolete element.
270+271+ @return [true] for applet, font, marquee, etc. *)
272+273+val is_svg : t -> bool
274+(** [is_svg elem] checks if this is an SVG element.
275+276+ @return [true] if the element is in the SVG namespace *)
277+278+val is_mathml : t -> bool
279+(** [is_mathml elem] checks if this is a MathML element.
280+281+ @return [true] if the element is in the MathML namespace *)
282+283+val is_custom : t -> bool
284+(** [is_custom elem] checks if this is a custom element.
285+286+ @return [true] if the element name contains a hyphen *)
287+288+val is_unknown : t -> bool
289+(** [is_unknown elem] checks if this is an unknown element.
290+291+ @return [true] if the element is not recognized *)
292+293+(** {1 Input Type Utilities} *)
294+295+val get_input_type : t -> Attr.input_type option
296+(** [get_input_type elem] gets the input type for input elements.
297+298+ @return [Some type] for input elements with a type, [None] otherwise *)
299+300+val get_button_type : t -> Attr.button_type option
301+(** [get_button_type elem] gets the button type for button elements.
302+303+ @return [Some type] for button elements with a type, [None] otherwise *)
304+305+val is_input_type : Attr.input_type -> t -> bool
306+(** [is_input_type expected elem] checks if an input has a specific type.
307+308+ @param expected The expected input type
309+ @param elem The element to check
310+ @return [true] if this is an input with the specified type *)
311+312+(** {1 Pattern Matching Helpers} *)
313+314+val match_html : t -> (Tag.html_tag -> 'a) -> 'a option
315+(** [match_html elem f] applies [f] to the HTML tag if present.
316+317+ @param elem The element
318+ @param f Function to apply to the HTML tag
319+ @return [Some (f tag)] for HTML elements, [None] otherwise *)
320+321+val when_html_tag : Tag.html_tag -> t -> (unit -> 'a) -> 'a option
322+(** [when_html_tag expected elem f] applies [f] if the element matches.
323+324+ @param expected The expected HTML tag
325+ @param elem The element to check
326+ @param f Function to call if the element matches
327+ @return [Some (f ())] if matched, [None] otherwise *)
328+329+(** {1 Internal} *)
330+331+val parse_type_attr : Tag.html_tag -> string -> Attr.t
332+(** [parse_type_attr tag value] parses a type attribute for an element.
333+334+ Different elements have different valid type values. This function
335+ handles context-dependent parsing.
336+337+ @param tag The element's HTML tag
338+ @param value The type attribute value
339+ @return The parsed attribute variant *)
340+341+val parse_attrs_for_tag : Tag.element_tag -> (string * string) list -> Attr.t list
342+(** [parse_attrs_for_tag tag raw_attrs] parses attributes with element context.
343+344+ The type attribute is parsed differently depending on the element tag.
345+346+ @param tag The element's tag
347+ @param raw_attrs Raw attribute name-value pairs
348+ @return List of typed attributes *)
···1+(*---------------------------------------------------------------------------
2+ Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
3+ SPDX-License-Identifier: MIT
4+ ---------------------------------------------------------------------------*)
5+6+(** Typed HTML5 tag representations using polymorphic variants.
7+8+ This module provides compile-time type safety for HTML elements while
9+ maintaining escape hatches for unknown/custom elements. Tags are
10+ represented using polymorphic variants, enabling pattern matching with
11+ exhaustiveness checking while avoiding the overhead of explicit
12+ constructors.
13+14+ {2 Design Philosophy}
15+16+ HTML5 defines over 100 standard elements with specific categories and
17+ content models. This module:
18+19+ - Provides typed representations for all standard elements
20+ - Supports SVG and MathML namespaced elements
21+ - Recognizes custom elements (containing hyphens)
22+ - Falls back to [Unknown] for unrecognized elements
23+24+ {2 Element Categories}
25+26+ HTML5 categorizes elements into content categories that define where
27+ elements can appear and what they can contain. This module provides
28+ predicates for common categories:
29+30+ - {!is_void} - Elements that cannot have children
31+ - {!is_heading} - Heading elements (h1-h6)
32+ - {!is_sectioning} - Elements that create document sections
33+ - {!is_phrasing} - Inline/phrasing content elements
34+ - {!is_flow} - Block/flow content elements
35+36+ @see <https://html.spec.whatwg.org/multipage/dom.html#content-models>
37+ HTML Standard: Content models
38+*)
39+40+(** {1 HTML Tag Types} *)
41+42+(** All standard HTML5 elements plus deprecated elements needed by the validator.
43+44+ This type covers:
45+ - Document metadata elements (html, head, title, etc.)
46+ - Sectioning elements (article, section, nav, etc.)
47+ - Heading elements (h1-h6)
48+ - Grouping content (div, p, ul, ol, etc.)
49+ - Text-level semantics (a, em, strong, span, etc.)
50+ - Embedded content (img, video, audio, etc.)
51+ - Table elements (table, tr, td, th, etc.)
52+ - Form elements (form, input, button, etc.)
53+ - Interactive elements (details, dialog, summary)
54+ - Scripting elements (script, noscript, template)
55+ - Deprecated/obsolete elements (font, center, marquee, etc.) *)
56+type html_tag = [
57+ (* Document metadata *)
58+ | `Html | `Head | `Title | `Base | `Link | `Meta | `Style
59+60+ (* Sectioning root *)
61+ | `Body
62+63+ (* Content sectioning *)
64+ | `Address | `Article | `Aside | `Footer | `Header | `Hgroup
65+ | `Main | `Nav | `Search | `Section
66+67+ (* Heading content *)
68+ | `H1 | `H2 | `H3 | `H4 | `H5 | `H6
69+70+ (* Grouping content *)
71+ | `Blockquote | `Dd | `Div | `Dl | `Dt | `Figcaption | `Figure
72+ | `Hr | `Li | `Menu | `Ol | `P | `Pre | `Ul
73+74+ (* Text-level semantics *)
75+ | `A | `Abbr | `B | `Bdi | `Bdo | `Br | `Cite | `Code | `Data
76+ | `Dfn | `Em | `I | `Kbd | `Mark | `Q | `Rp | `Rt | `Ruby
77+ | `S | `Samp | `Small | `Span | `Strong | `Sub | `Sup | `Time
78+ | `U | `Var | `Wbr
79+80+ (* Edits *)
81+ | `Del | `Ins
82+83+ (* Embedded content *)
84+ | `Area | `Audio | `Canvas | `Embed | `Iframe | `Img | `Map | `Object
85+ | `Picture | `Source | `Track | `Video
86+87+ (* Tabular data *)
88+ | `Caption | `Col | `Colgroup | `Table | `Tbody | `Td | `Tfoot
89+ | `Th | `Thead | `Tr
90+91+ (* Forms *)
92+ | `Button | `Datalist | `Fieldset | `Form | `Input | `Label
93+ | `Legend | `Meter | `Optgroup | `Option | `Output | `Progress
94+ | `Select | `Textarea
95+96+ (* Interactive elements *)
97+ | `Details | `Dialog | `Summary
98+99+ (* Scripting *)
100+ | `Noscript | `Script | `Slot | `Template
101+102+ (* Web Components / Misc *)
103+ | `Portal | `Param
104+105+ (* Deprecated/obsolete elements *)
106+ | `Applet | `Acronym | `Bgsound | `Dir | `Frame | `Frameset
107+ | `Noframes | `Isindex | `Keygen | `Listing | `Menuitem | `Nextid
108+ | `Noembed | `Plaintext | `Rb | `Rtc | `Strike | `Xmp
109+ | `Basefont | `Big | `Blink | `Center | `Font | `Marquee
110+ | `Multicol | `Nobr | `Spacer | `Tt | `Image
111+]
112+113+(** {1 Category Types}
114+115+ Type aliases for element subsets, enabling functions that only accept
116+ specific categories with compile-time checking. *)
117+118+(** Void elements - cannot have children (e.g., br, hr, img, input). *)
119+type void_tag = [
120+ | `Area | `Base | `Br | `Col | `Embed | `Hr | `Img | `Input
121+ | `Link | `Meta | `Source | `Track | `Wbr
122+ | `Basefont | `Frame | `Isindex | `Keygen | `Param
123+]
124+125+(** Heading elements (h1-h6). *)
126+type heading_tag = [ `H1 | `H2 | `H3 | `H4 | `H5 | `H6 ]
127+128+(** Sectioning content elements that establish document sections. *)
129+type sectioning_tag = [ `Article | `Aside | `Nav | `Section ]
130+131+(** Sectioning roots that establish their own outline context. *)
132+type sectioning_root_tag = [
133+ | `Blockquote | `Body | `Details | `Dialog | `Fieldset | `Figure | `Td
134+]
135+136+(** Embedded content elements. *)
137+type embedded_tag = [
138+ | `Audio | `Canvas | `Embed | `Iframe | `Img | `Object | `Picture | `Video
139+]
140+141+(** Interactive content elements (focusable/activatable). *)
142+type interactive_tag = [
143+ | `A | `Audio | `Button | `Details | `Embed | `Iframe | `Img
144+ | `Input | `Label | `Select | `Textarea | `Video
145+]
146+147+(** Form-associated elements that can belong to a form. *)
148+type form_associated_tag = [
149+ | `Button | `Fieldset | `Input | `Label | `Object | `Output
150+ | `Select | `Textarea | `Meter | `Progress
151+]
152+153+(** Labelable elements that can be associated with a label. *)
154+type labelable_tag = [
155+ | `Button | `Input | `Meter | `Output | `Progress | `Select | `Textarea
156+]
157+158+(** Submittable form elements. *)
159+type submittable_tag = [
160+ | `Button | `Input | `Select | `Textarea
161+]
162+163+(** Resettable form elements. *)
164+type resettable_tag = [
165+ | `Input | `Output | `Select | `Textarea
166+]
167+168+(** Table-related elements. *)
169+type table_tag = [
170+ | `Caption | `Col | `Colgroup | `Table | `Tbody | `Td | `Tfoot
171+ | `Th | `Thead | `Tr
172+]
173+174+(** Media elements (audio and video). *)
175+type media_tag = [ `Audio | `Video ]
176+177+(** List container elements. *)
178+type list_container_tag = [ `Ul | `Ol | `Menu | `Dl ]
179+180+(** List item elements. *)
181+type list_item_tag = [ `Li | `Dd | `Dt ]
182+183+(** Script-supporting elements. *)
184+type script_supporting_tag = [ `Script | `Template ]
185+186+(** Metadata content elements. *)
187+type metadata_tag = [ `Base | `Link | `Meta | `Noscript | `Script | `Style | `Template | `Title ]
188+189+(** {1 Top-Level Element Type} *)
190+191+(** Top-level element classification.
192+193+ Elements are classified by namespace and recognition status:
194+ - [Html tag] - A known HTML5 element
195+ - [Svg name] - An SVG element (preserves original case)
196+ - [MathML name] - A MathML element (preserves original case)
197+ - [Custom name] - A custom element (contains hyphen)
198+ - [Unknown name] - An unrecognized element *)
199+type element_tag =
200+ | Html of html_tag
201+ | Svg of string
202+ | MathML of string
203+ | Custom of string
204+ | Unknown of string
205+206+(** {1 Namespace Constants} *)
207+208+val svg_namespace : string
209+(** The SVG namespace URI: ["http://www.w3.org/2000/svg"]. *)
210+211+val mathml_namespace : string
212+(** The MathML namespace URI: ["http://www.w3.org/1998/Math/MathML"]. *)
213+214+(** {1 Conversion Functions} *)
215+216+val html_tag_of_string_opt : string -> html_tag option
217+(** [html_tag_of_string_opt name] converts a lowercase tag name to an [html_tag].
218+219+ @param name The lowercase tag name (e.g., ["div"], ["span"])
220+ @return [Some tag] if recognized, [None] otherwise
221+222+ {b Example:}
223+ {[
224+ html_tag_of_string_opt "div" (* Some `Div *)
225+ html_tag_of_string_opt "xyz" (* None *)
226+ ]} *)
227+228+val is_custom_element_name : string -> bool
229+(** [is_custom_element_name name] checks if a name is a valid custom element name.
230+231+ A valid custom element name must contain a hyphen and not be reserved
232+ (e.g., not start with "xml" or be "annotation-xml").
233+234+ @param name The element name to check
235+ @return [true] if the name is a valid custom element name *)
236+237+val is_svg_namespace : string -> bool
238+(** [is_svg_namespace ns] checks if a namespace string represents SVG.
239+240+ Accepts both the short form ["svg"] and the full URI. *)
241+242+val is_mathml_namespace : string -> bool
243+(** [is_mathml_namespace ns] checks if a namespace string represents MathML.
244+245+ Accepts both the short form ["mathml"] and the full URI. *)
246+247+val tag_of_string : ?namespace:string -> string -> element_tag
248+(** [tag_of_string ?namespace name] converts a tag name to an [element_tag].
249+250+ @param namespace Optional namespace URI or short form
251+ @param name The element name
252+ @return The classified element tag
253+254+ {b Example:}
255+ {[
256+ tag_of_string "div" (* Html `Div *)
257+ tag_of_string ~namespace:"svg" "circle" (* Svg "circle" *)
258+ tag_of_string "my-component" (* Custom "my-component" *)
259+ tag_of_string "xyz" (* Unknown "xyz" *)
260+ ]} *)
261+262+val html_tag_to_string : html_tag -> string
263+(** [html_tag_to_string tag] converts an [html_tag] to its lowercase string name.
264+265+ @param tag The HTML tag variant
266+ @return The lowercase tag name (e.g., ["div"], ["span"]) *)
267+268+val tag_to_string : element_tag -> string
269+(** [tag_to_string tag] converts any [element_tag] to its string name.
270+271+ @param tag The element tag
272+ @return The tag name (lowercase for HTML, original case for SVG/MathML) *)
273+274+(** {1 Category Predicates} *)
275+276+val is_void : html_tag -> bool
277+(** [is_void tag] checks if an element is a void element (cannot have children).
278+279+ @param tag The HTML tag to check
280+ @return [true] if the element is void (br, hr, img, input, etc.) *)
281+282+val is_heading : html_tag -> bool
283+(** [is_heading tag] checks if an element is a heading element.
284+285+ @param tag The HTML tag to check
286+ @return [true] if the element is h1-h6 *)
287+288+val heading_level : html_tag -> int option
289+(** [heading_level tag] gets the heading level (1-6) if applicable.
290+291+ @param tag The HTML tag to check
292+ @return [Some level] for h1-h6, [None] for other elements *)
293+294+val is_sectioning : html_tag -> bool
295+(** [is_sectioning tag] checks if an element is sectioning content.
296+297+ @param tag The HTML tag to check
298+ @return [true] if the element is article, aside, nav, or section *)
299+300+val is_sectioning_root : html_tag -> bool
301+(** [is_sectioning_root tag] checks if an element is a sectioning root.
302+303+ Sectioning roots establish their own outline context.
304+305+ @param tag The HTML tag to check
306+ @return [true] if the element is blockquote, body, details, dialog,
307+ fieldset, figure, or td *)
308+309+val is_embedded : html_tag -> bool
310+(** [is_embedded tag] checks if an element is embedded content.
311+312+ @param tag The HTML tag to check
313+ @return [true] if the element is audio, canvas, embed, iframe, img,
314+ object, picture, or video *)
315+316+val is_interactive : html_tag -> bool
317+(** [is_interactive tag] checks if an element is interactive content.
318+319+ @param tag The HTML tag to check
320+ @return [true] if the element is focusable or activatable *)
321+322+val is_form_associated : html_tag -> bool
323+(** [is_form_associated tag] checks if an element is form-associated.
324+325+ @param tag The HTML tag to check
326+ @return [true] if the element can belong to a form *)
327+328+val is_labelable : html_tag -> bool
329+(** [is_labelable tag] checks if an element can be associated with a label.
330+331+ @param tag The HTML tag to check
332+ @return [true] if the element is labelable *)
333+334+val is_submittable : html_tag -> bool
335+(** [is_submittable tag] checks if an element is a submittable form element.
336+337+ @param tag The HTML tag to check
338+ @return [true] if the element is button, input, select, or textarea *)
339+340+val is_resettable : html_tag -> bool
341+(** [is_resettable tag] checks if an element is a resettable form element.
342+343+ @param tag The HTML tag to check
344+ @return [true] if the element is input, output, select, or textarea *)
345+346+val is_transparent : html_tag -> bool
347+(** [is_transparent tag] checks if an element has a transparent content model.
348+349+ Transparent elements inherit their content model from their parent.
350+351+ @param tag The HTML tag to check
352+ @return [true] if the element is transparent (a, abbr, audio, canvas, etc.) *)
353+354+val is_script_supporting : html_tag -> bool
355+(** [is_script_supporting tag] checks if an element is script-supporting.
356+357+ @param tag The HTML tag to check
358+ @return [true] if the element is script or template *)
359+360+val is_table_element : html_tag -> bool
361+(** [is_table_element tag] checks if an element is a table-related element.
362+363+ @param tag The HTML tag to check
364+ @return [true] if the element is table, tr, td, th, etc. *)
365+366+val is_media : html_tag -> bool
367+(** [is_media tag] checks if an element is a media element.
368+369+ @param tag The HTML tag to check
370+ @return [true] if the element is audio or video *)
371+372+val is_list_container : html_tag -> bool
373+(** [is_list_container tag] checks if an element is a list container.
374+375+ @param tag The HTML tag to check
376+ @return [true] if the element is ul, ol, menu, or dl *)
377+378+val is_list_item : html_tag -> bool
379+(** [is_list_item tag] checks if an element is a list item.
380+381+ @param tag The HTML tag to check
382+ @return [true] if the element is li, dd, or dt *)
383+384+val is_metadata : html_tag -> bool
385+(** [is_metadata tag] checks if an element is metadata content.
386+387+ @param tag The HTML tag to check
388+ @return [true] if the element is base, link, meta, etc. *)
389+390+val is_obsolete : html_tag -> bool
391+(** [is_obsolete tag] checks if an element is deprecated/obsolete.
392+393+ @param tag The HTML tag to check
394+ @return [true] if the element is applet, font, marquee, etc. *)
395+396+val is_raw_text : html_tag -> bool
397+(** [is_raw_text tag] checks if an element is a raw text element.
398+399+ Raw text elements contain unparsed text content.
400+401+ @param tag The HTML tag to check
402+ @return [true] if the element is script or style *)
403+404+val is_escapable_raw_text : html_tag -> bool
405+(** [is_escapable_raw_text tag] checks if an element is escapable raw text.
406+407+ @param tag The HTML tag to check
408+ @return [true] if the element is textarea or title *)
409+410+val is_phrasing : html_tag -> bool
411+(** [is_phrasing tag] checks if an element is phrasing content.
412+413+ Phrasing content is inline-level content that forms paragraphs.
414+415+ @param tag The HTML tag to check
416+ @return [true] if the element is phrasing content *)
417+418+val is_flow : html_tag -> bool
419+(** [is_flow tag] checks if an element is flow content.
420+421+ Flow content is most elements that can appear in the body.
422+423+ @param tag The HTML tag to check
424+ @return [true] if the element is flow content *)
425+426+(** {1 Pattern Matching Helpers} *)
427+428+val as_html_tag : element_tag -> html_tag option
429+(** [as_html_tag tag] extracts the HTML tag if present.
430+431+ @param tag The element tag
432+ @return [Some html_tag] if [tag] is [Html html_tag], [None] otherwise *)
433+434+val is_html_tag : html_tag -> element_tag -> bool
435+(** [is_html_tag expected tag] checks if [tag] matches the expected HTML tag.
436+437+ @param expected The expected HTML tag variant
438+ @param tag The element tag to check
439+ @return [true] if [tag] is [Html expected] *)
+5
lib/htmlrw_check/error_code.ml
···119 | `For_id_mismatch
120 | `Role_on_ancestor
121 | `Role_on_for
0122 | `Aria_label_on_for
123]
124···309 | `Label `For_id_mismatch -> "label-for-mismatch"
310 | `Label `Role_on_ancestor -> "role-on-label"
311 | `Label `Role_on_for -> "role-on-label"
0312 | `Label `Aria_label_on_for -> "aria-label-on-label"
313314 (* Input errors *)
···624 | `Label `Role_on_for ->
625 Printf.sprintf "The %s attribute must not be used on any %s element that is associated with a labelable element."
626 (q "role") (q "label")
000627 | `Label `Aria_label_on_for ->
628 Printf.sprintf "The %s attribute must not be used on any %s element that is associated with a labelable element."
629 (q "aria-label") (q "label")
···119 | `For_id_mismatch
120 | `Role_on_ancestor
121 | `Role_on_for
122+ | `Aria_label_on_ancestor
123 | `Aria_label_on_for
124]
125···310 | `Label `For_id_mismatch -> "label-for-mismatch"
311 | `Label `Role_on_ancestor -> "role-on-label"
312 | `Label `Role_on_for -> "role-on-label"
313+ | `Label `Aria_label_on_ancestor -> "aria-label-on-label"
314 | `Label `Aria_label_on_for -> "aria-label-on-label"
315316 (* Input errors *)
···626 | `Label `Role_on_for ->
627 Printf.sprintf "The %s attribute must not be used on any %s element that is associated with a labelable element."
628 (q "role") (q "label")
629+ | `Label `Aria_label_on_ancestor ->
630+ Printf.sprintf "The %s attribute must not be used on any %s element that is an ancestor of a labelable element."
631+ (q "aria-label") (q "label")
632 | `Label `Aria_label_on_for ->
633 Printf.sprintf "The %s attribute must not be used on any %s element that is associated with a labelable element."
634 (q "aria-label") (q "label")
+5
lib/htmlrw_check/error_code.mli
···527 Adding [role] to a label that wraps a form control
528 breaks the implicit label association. *)
52900000530 | `Role_on_for
531 (** [<label>] with role uses [for] association.
532 Labels with explicit [for] association must not have [role]. *)
···527 Adding [role] to a label that wraps a form control
528 breaks the implicit label association. *)
529530+ | `Aria_label_on_ancestor
531+ (** [<label>] with [aria-label] is ancestor of labelable element.
532+ [aria-label] on a label that wraps a form control creates
533+ conflicting accessible names. *)
534+535 | `Role_on_for
536 (** [<label>] with role uses [for] association.
537 Labels with explicit [for] association must not have [role]. *)
+31
lib/htmlrw_check/semantic/autofocus_checker.mli
···0000000000000000000000000000000
···1+(*---------------------------------------------------------------------------
2+ Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
3+ SPDX-License-Identifier: MIT
4+ ---------------------------------------------------------------------------*)
5+6+(** Autofocus attribute validation checker.
7+8+ This checker validates that only one element with the [autofocus] attribute
9+ exists within each dialog or popover context. HTML5 specifies that there
10+ should be at most one autofocused element per autofocus scope.
11+12+ {2 Validation Rules}
13+14+ - Within each dialog element, only one descendant may have [autofocus]
15+ - Within each popover element, only one descendant may have [autofocus]
16+ - Nested dialogs and popovers create separate scopes
17+18+ {2 Error Messages}
19+20+ Reports [Multiple_autofocus] when more than one autofocus attribute is
21+ found within the same scope.
22+23+ @see <https://html.spec.whatwg.org/multipage/interaction.html#the-autofocus-attribute>
24+ HTML Standard: The autofocus attribute
25+*)
26+27+val checker : Checker.t
28+(** The autofocus checker instance.
29+30+ This checker can be registered with the checker registry and will be
31+ invoked during DOM traversal to validate autofocus attribute usage. *)
···1+(*---------------------------------------------------------------------------
2+ Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
3+ SPDX-License-Identifier: MIT
4+ ---------------------------------------------------------------------------*)
5+6+(** Language detection and validation checker.
7+8+ This checker validates that the document's [lang] attribute matches the
9+ detected language of the content, and that the [dir] attribute is correct
10+ for right-to-left (RTL) languages.
11+12+ {2 Detection Algorithm}
13+14+ The checker:
15+ 1. Collects text content from the document body (up to 30720 characters)
16+ 2. Skips text from certain elements (scripts, navigation, form controls)
17+ 3. Skips foreign namespace content (SVG, MathML)
18+ 4. Uses statistical language detection with >90% confidence threshold
19+ 5. Handles Traditional vs Simplified Chinese detection
20+21+ {2 Validation Rules}
22+23+ - Documents should have a [lang] attribute on the [<html>] element
24+ - The declared language should match the detected content language
25+ - RTL languages (Arabic, Hebrew, Persian, Urdu, etc.) should have [dir="rtl"]
26+27+ {2 Error Messages}
28+29+ - [Wrong_lang]: The declared language doesn't match detected content
30+ - [Missing_dir_rtl]: An RTL language is detected but no [dir] attribute is present
31+ - [Wrong_dir]: The [dir] attribute doesn't match the detected RTL language
32+33+ @see <https://html.spec.whatwg.org/multipage/dom.html#the-lang-and-xml:lang-attributes>
34+ HTML Standard: The lang attribute
35+*)
36+37+val checker : Checker.t
38+(** The language detection checker instance.
39+40+ This checker collects text during DOM traversal and performs language
41+ detection at document end. *)
+2-1
lib/htmlrw_check/semantic/option_checker.ml
···49 (match state.option_stack with
50 | ctx :: rest ->
51 state.option_stack <- rest;
52- if not ctx.has_text && not ctx.has_label then
053 Message_collector.add_typed collector (`Misc `Option_empty_without_label)
54 | [] -> ())
55 | _ -> ()
···49 (match state.option_stack with
50 | ctx :: rest ->
51 state.option_stack <- rest;
52+ (* Empty label attribute doesn't count as a valid label *)
53+ if not ctx.has_text && (not ctx.has_label || ctx.label_empty) then
54 Message_collector.add_typed collector (`Misc `Option_empty_without_label)
55 | [] -> ())
56 | _ -> ()
+32
lib/htmlrw_check/semantic/option_checker.mli
···00000000000000000000000000000000
···1+(*---------------------------------------------------------------------------
2+ Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
3+ SPDX-License-Identifier: MIT
4+ ---------------------------------------------------------------------------*)
5+6+(** Option element validation checker.
7+8+ This checker validates that [<option>] elements have proper content or
9+ a [label] attribute. Empty options without labels can be confusing for
10+ users, especially those using assistive technologies.
11+12+ {2 Validation Rules}
13+14+ - An [<option>] element must have either:
15+ - Non-whitespace text content, OR
16+ - A non-empty [label] attribute
17+ - Empty [label] attribute values are reported as errors
18+ - Options inside [<template>] elements are not checked
19+20+ {2 Error Messages}
22+23+ - [Option_empty_without_label]: Option has no text and no non-empty label attribute
23+ - [Bad_value] for label: The label attribute value is empty
24+25+ @see <https://html.spec.whatwg.org/multipage/form-elements.html#the-option-element>
26+ HTML Standard: The option element
27+*)
28+29+val checker : Checker.t
30+(** The option element checker instance.
31+32+ This checker validates option elements during DOM traversal. *)
···1+(*---------------------------------------------------------------------------
2+ Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
3+ SPDX-License-Identifier: MIT
4+ ---------------------------------------------------------------------------*)
5+6+(** Attribute restrictions checker.
7+8+ This checker validates that certain attributes are not used on elements
9+ where they are not allowed. It catches common misuses such as:
10+11+ - RDFa-style [href] on elements like [<img>], [<p>], [<div>]
12+ - [src] or [media] on [<a>] elements
13+ - [srcset] on media elements ([<audio>], [<video>], [<object>])
14+15+ {2 Validation Rules}
16+17+ The checker maintains a list of (element, disallowed_attributes) pairs
18+ for both HTML and SVG elements. When an element is encountered with
19+ a disallowed attribute, an error is reported.
20+21+ {2 Error Messages}
22+23+ Reports [Not_allowed] when an attribute is used on an element where
24+ it is not permitted.
25+26+ @see <https://html.spec.whatwg.org/multipage/dom.html#element-definitions>
27+ HTML Standard: Element definitions
28+*)
29+30+val checker : Checker.t
31+(** The attribute restrictions checker instance. *)
+28
lib/htmlrw_check/specialized/base_checker.mli
···0000000000000000000000000000
···1+(*---------------------------------------------------------------------------
2+ Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
3+ SPDX-License-Identifier: MIT
4+ ---------------------------------------------------------------------------*)
5+6+(** Base element ordering checker.
7+8+ This checker validates that the [<base>] element appears before any
9+ elements that may use URLs resolved against the base URL. Specifically,
10+ [<base>] should appear before [<link>] and [<script>] elements.
11+12+ {2 Validation Rules}
13+14+ - [<base>] must appear before any [<link>] elements
15+ - [<base>] must appear before any [<script>] elements
16+ - The order is significant for URL resolution in the document
17+18+ {2 Error Messages}
19+20+ Reports [Base_after_link_script] when a [<base>] element is found
21+ after [<link>] or [<script>] elements.
22+23+ @see <https://html.spec.whatwg.org/multipage/semantics.html#the-base-element>
24+ HTML Standard: The base element
25+*)
26+27+val checker : Checker.t
28+(** The base element ordering checker instance. *)
-3
lib/htmlrw_check/specialized/datetime_checker.ml
···5(** Elements that have datetime attribute *)
6let datetime_elements = ["del"; "ins"; "time"]
78-(** Helper: check if char is digit *)
9-let is_digit c = c >= '0' && c <= '9'
10-11(** Parse int safely *)
12let parse_int s =
13 try Some (int_of_string s) with _ -> None
···5(** Elements that have datetime attribute *)
6let datetime_elements = ["del"; "ins"; "time"]
70008(** Parse int safely *)
9let parse_int s =
10 try Some (int_of_string s) with _ -> None
+43
lib/htmlrw_check/specialized/datetime_checker.mli
···0000000000000000000000000000000000000000000
···1+(*---------------------------------------------------------------------------
2+ Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
3+ SPDX-License-Identifier: MIT
4+ ---------------------------------------------------------------------------*)
5+6+(** Datetime attribute validation checker.
7+8+ This checker validates the [datetime] attribute on [<del>], [<ins>],
9+ and [<time>] elements. The datetime value must conform to a valid
10+ date, time, or datetime format as specified by HTML5.
11+12+ {2 Supported Formats}
13+14+ The checker validates these datetime formats:
15+ - Date: [YYYY-MM-DD] (e.g., "2025-12-19")
16+ - Month: [YYYY-MM] (e.g., "2025-12")
17+ - Year: [YYYY] (e.g., "2025")
18+ - Week: [YYYY-Www] (e.g., "2025-W51")
19+ - Time: [HH:MM] or [HH:MM:SS] (e.g., "14:30:00")
20+ - Datetime: Date followed by time with separator (e.g., "2025-12-19T14:30")
21+ - Timezone offsets: [+HH:MM] or [-HH:MM] or [Z]
22+ - Duration: [P] prefix followed by duration components
23+24+ {2 Validation Rules}
25+26+ - Month values must be 01-12
27+ - Day values must be valid for the given month
28+ - Leap years are correctly handled for February 29th
29+ - Hour values must be 00-23
30+ - Minute and second values must be 00-59
31+ - Week numbers must be 01-53
32+33+ {2 Error Messages}
34+35+ Reports [Bad_value] when the datetime attribute contains an invalid
36+ format or out-of-range values.
37+38+ @see <https://html.spec.whatwg.org/multipage/common-microsyntaxes.html#dates-and-times>
39+ HTML Standard: Dates and times
40+*)
41+42+val checker : Checker.t
43+(** The datetime attribute checker instance. *)
+37
lib/htmlrw_check/specialized/dl_checker.mli
···0000000000000000000000000000000000000
···1+(*---------------------------------------------------------------------------
2+ Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
3+ SPDX-License-Identifier: MIT
4+ ---------------------------------------------------------------------------*)
5+6+(** DL element content model validation checker.
7+8+ This checker validates that [<dl>] (description list) elements follow
9+ the HTML5 content model requirements. Description lists must contain
10+ [<dt>] (term) and [<dd>] (description) elements in the correct order.
11+12+ {2 Content Model}
13+14+ A [<dl>] element may contain:
15+ - Zero or more groups of [<dt>] followed by [<dd>] elements
16+ - [<div>] elements wrapping [<dt>]/[<dd>] groups (for styling)
17+ - [<template>] and [<script>] elements (script-supporting)
18+19+ {2 Validation Rules}
20+21+ - [<dd>] should not appear before any [<dt>] (terms should come first)
22+ - [<dl>] should not be empty (should contain at least one term/description)
23+ - When using [<div>] wrappers, mixing wrapped and unwrapped content
24+ is discouraged
25+ - Each [<div>] in a [<dl>] should contain at least one [<dt>]/[<dd>] group
26+27+ {2 Error Messages}
28+29+ - [Dl_empty]: The [<dl>] element has no content
30+ - [Dd_before_dt]: A [<dd>] appears before any [<dt>] element
31+32+ @see <https://html.spec.whatwg.org/multipage/grouping-content.html#the-dl-element>
33+ HTML Standard: The dl element
34+*)
35+36+val checker : Checker.t
37+(** The description list content model checker instance. *)
+35
lib/htmlrw_check/specialized/h1_checker.mli
···00000000000000000000000000000000000
···1+(*---------------------------------------------------------------------------
2+ Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
3+ SPDX-License-Identifier: MIT
4+ ---------------------------------------------------------------------------*)
5+6+(** H1 element counter and validator.
7+8+ This checker warns about multiple [<h1>] elements in a document.
9+ While HTML5 technically allows multiple [<h1>] elements when using
10+ the document outline algorithm, this algorithm was never implemented
11+ by browsers and has been removed from the specification.
12+13+ {2 Best Practice}
14+15+ Documents should have exactly one [<h1>] element that represents the
16+ main heading of the page. Multiple [<h1>] elements can confuse users
17+ and assistive technologies about the document's structure.
18+19+ {2 Special Cases}
20+21+ - [<h1>] elements inside [<svg>] content (e.g., in [<foreignObject>])
22+ are not counted, as they may represent different content contexts
23+ - The checker reports a warning after the second [<h1>] is encountered
24+25+ {2 Error Messages}
26+27+ Reports [Multiple_h1] when more than one [<h1>] element is found
28+ in the document.
29+30+ @see <https://html.spec.whatwg.org/multipage/sections.html#the-h1,-h2,-h3,-h4,-h5,-and-h6-elements>
31+ HTML Standard: The h1-h6 elements
32+*)
33+34+val checker : Checker.t
35+(** The h1 element counter/validator instance. *)
···110 | Tag.Html `Label when state.label_depth = 0 ->
111 if state.label_has_role && state.labelable_count > 0 then
112 Message_collector.add_typed collector (`Label `Role_on_ancestor);
113+ if state.label_has_aria_label && state.labelable_count > 0 then
114+ Message_collector.add_typed collector (`Label `Aria_label_on_ancestor);
115 state.in_label <- false;
116 state.labelable_count <- 0;
117 state.label_for_value <- None;
+41
lib/htmlrw_check/specialized/label_checker.mli
···00000000000000000000000000000000000000000
···1+(*---------------------------------------------------------------------------
2+ Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
3+ SPDX-License-Identifier: MIT
4+ ---------------------------------------------------------------------------*)
5+6+(** Label element content model validation checker.
7+8+ This checker validates that [<label>] elements follow the HTML5
9+ content model requirements. Labels associate text with form controls
10+ and must be used correctly for accessibility.
11+12+ {2 Validation Rules}
13+14+ - A [<label>] element may contain at most one labelable element
15+ (button, input, meter, output, progress, select, textarea)
16+ - When using the [for] attribute, it should reference an existing
17+ element ID in the document
18+ - Nested labelable elements are not counted (only direct descendants)
19+20+ {2 Labelable Elements}
21+22+ The following elements can be labeled:
23+ - [<button>]
24+ - [<input>] (except type="hidden")
25+ - [<meter>]
26+ - [<output>]
27+ - [<progress>]
28+ - [<select>]
29+ - [<textarea>]
30+31+ {2 Error Messages}
32+33+ - Multiple labelable elements inside a single [<label>]
34+ - [for] attribute references a non-existent ID
35+36+ @see <https://html.spec.whatwg.org/multipage/forms.html#the-label-element>
37+ HTML Standard: The label element
38+*)
39+40+val checker : Checker.t
41+(** The label element content model checker instance. *)
+42
lib/htmlrw_check/specialized/picture_checker.mli
···000000000000000000000000000000000000000000
···1+(*---------------------------------------------------------------------------
2+ Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
3+ SPDX-License-Identifier: MIT
4+ ---------------------------------------------------------------------------*)
5+6+(** Picture element content model and attribute validation checker.
7+8+ This checker validates that [<picture>] elements follow the HTML5
9+ content model requirements and that attributes are used correctly.
10+11+ {2 Content Model}
12+13+ A [<picture>] element may contain:
14+ - Zero or more [<source>] elements (must come before [<img>])
15+ - Exactly one [<img>] element (required)
16+ - [<script>] and [<template>] elements (script-supporting)
17+18+ {2 Attribute Restrictions}
19+20+ The [<picture>] element should not have image-related attributes
21+ directly on it (these belong on the [<img>] child):
22+ - [src], [srcset], [sizes], [alt], [width], [height]
23+ - [crossorigin], [loading], [decoding]
24+ - Legacy attributes like [align], [border], [hspace], etc.
25+26+ {2 Source Restrictions in Picture}
27+28+ When [<source>] is a child of [<picture>]:
29+ - It must have [srcset] attribute (required)
30+ - It should not have [src] attribute
31+32+ {2 Error Messages}
33+34+ - Disallowed attributes on [<picture>] or [<source>] in picture context
35+ - Invalid parent elements for [<picture>]
36+37+ @see <https://html.spec.whatwg.org/multipage/embedded-content.html#the-picture-element>
38+ HTML Standard: The picture element
39+*)
40+41+val checker : Checker.t
42+(** The picture element checker instance. *)
+36
lib/htmlrw_check/specialized/ruby_checker.mli
···000000000000000000000000000000000000
···1+(*---------------------------------------------------------------------------
2+ Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
3+ SPDX-License-Identifier: MIT
4+ ---------------------------------------------------------------------------*)
5+6+(** Ruby element content model validation checker.
7+8+ This checker validates that [<ruby>] elements follow the HTML5
9+ content model requirements. Ruby annotations are used for East Asian
10+ typography to show pronunciation or meaning of characters.
11+12+ {2 Content Model}
13+14+ A [<ruby>] element must contain:
15+ - Phrasing content (the base text)
16+ - One or more [<rt>] elements (the ruby text/annotation)
17+ - Optional [<rp>] elements (fallback parentheses)
18+19+ {2 Validation Rules}
20+21+ - [<ruby>] must contain at least one [<rt>] element
22+ - There should be phrasing content before the first [<rt>]
23+ - [<rp>] elements should surround [<rt>] for fallback rendering
24+ - Nested [<ruby>] elements are handled correctly
25+26+ {2 Error Messages}
27+28+ - Ruby element without any [<rt>] child
29+ - Missing base text before ruby annotation
30+31+ @see <https://html.spec.whatwg.org/multipage/text-level-semantics.html#the-ruby-element>
32+ HTML Standard: The ruby element
33+*)
34+35+val checker : Checker.t
36+(** The ruby element content model checker instance. *)
+34
lib/htmlrw_check/specialized/source_checker.mli
···0000000000000000000000000000000000
···1+(*---------------------------------------------------------------------------
2+ Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
3+ SPDX-License-Identifier: MIT
4+ ---------------------------------------------------------------------------*)
5+6+(** Source element context validation checker.
7+8+ This checker validates that [<source>] element attributes are appropriate
9+ for the parent context. The allowed attributes differ based on whether
10+ the source is inside [<picture>], [<video>], or [<audio>].
11+12+ {2 Context-Dependent Rules}
13+14+ In [<picture>] context:
15+ - [srcset] is required
16+ - [src] is not allowed
17+ - [media] and [type] are allowed
18+19+ In [<video>] or [<audio>] context:
20+ - [src] is required
21+ - [srcset] and [sizes] are not allowed
22+ - [type] is allowed for MIME type hints
23+24+ {2 Error Messages}
25+26+ - Missing required attributes for the context
27+ - Attributes not allowed in the current context
28+29+ @see <https://html.spec.whatwg.org/multipage/embedded-content.html#the-source-element>
30+ HTML Standard: The source element
31+*)
32+33+val checker : Checker.t
34+(** The source element context checker instance. *)
···61let split_on_space_respecting_parens s =
62 split_respecting_parens ~sep:' ' s |> List.filter (fun s -> s <> "")
6364-(** Check if string contains only whitespace *)
65-let is_whitespace_only s =
66- String.for_all (fun c -> c = ' ' || c = '\t' || c = '\n' || c = '\r') s
67-68(** Invalid units that are not CSS lengths but might be confused for them *)
69let invalid_size_units = [
70 "deg"; "grad"; "rad"; "turn"; (* angle units *)
···154 NoCommentError
155 end
156 end
157-158-(** For backward compatibility *)
159-let has_invalid_css_comment s =
160- match check_css_comment_position s with
161- | NoCommentError -> false
162- | _ -> true
163164(** Check if scientific notation has invalid exponent (like 1e+1.5 - decimal in exponent) *)
165let has_invalid_scientific_notation s =
···280 end
281 end
282 end
283-284-let has_valid_size_unit size_value =
285- match check_size_value size_value with
286- | Valid -> true
287- | InvalidUnit (_, _) | NegativeValue | CssCommentAfterSign (_, _) | CssCommentBeforeUnit (_, _) | BadScientificNotation | BadCssNumber (_, _) -> false
288289(** Check if a sizes entry has a media condition (starts with '(') *)
290let has_media_condition entry =
···61let split_on_space_respecting_parens s =
62 split_respecting_parens ~sep:' ' s |> List.filter (fun s -> s <> "")
63000064(** Invalid units that are not CSS lengths but might be confused for them *)
65let invalid_size_units = [
66 "deg"; "grad"; "rad"; "turn"; (* angle units *)
···150 NoCommentError
151 end
152 end
000000153154(** Check if scientific notation has invalid exponent (like 1e+1.5 - decimal in exponent) *)
155let has_invalid_scientific_notation s =
···270 end
271 end
272 end
00000273274(** Check if a sizes entry has a media condition (starts with '(') *)
275let has_media_condition entry =
···1+(*---------------------------------------------------------------------------
2+ Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
3+ SPDX-License-Identifier: MIT
4+ ---------------------------------------------------------------------------*)
5+6+(** Srcset and sizes attribute validation checker.
7+8+ This checker validates the [srcset] and [sizes] attributes on [<img>]
9+ and [<source>] elements. These attributes use a specialized microsyntax
10+ for responsive images.
11+12+ {2 Srcset Syntax}
13+14+ The [srcset] attribute contains a comma-separated list of image
15+ candidates, each with:
16+ - A URL
17+ - An optional width descriptor ([Nw], e.g., "800w")
18+ - Or an optional pixel density descriptor ([Nx], e.g., "2x")
19+20+ Width and pixel density descriptors cannot be mixed in the same srcset.
21+22+ {2 Sizes Syntax}
23+24+ The [sizes] attribute contains a comma-separated list of:
25+ - Media conditions (optional)
26+ - Source sizes (CSS lengths)
27+28+ The last entry should not have a media condition (it's the default).
29+30+ {2 Validation Rules}
31+32+ - URLs in srcset must be valid
33+ - Width descriptors must be positive integers
34+ - Pixel density descriptors must be positive numbers
35+ - Sizes must use valid CSS length units
36+ - Duplicate descriptors are flagged
37+38+ {2 Error Messages}
39+40+ - Invalid srcset syntax
41+ - Invalid sizes syntax
42+ - Missing sizes when srcset uses width descriptors
43+ - Invalid CSS length units
44+45+ @see <https://html.spec.whatwg.org/multipage/images.html#srcset-attributes>
46+ HTML Standard: Srcset attributes
47+*)
48+49+val checker : Checker.t
50+(** The srcset/sizes attribute checker instance. *)
···1+(*---------------------------------------------------------------------------
2+ Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
3+ SPDX-License-Identifier: MIT
4+ ---------------------------------------------------------------------------*)
5+6+(** Title element validation checker.
7+8+ This checker validates that documents have a proper [<title>] element
9+ with meaningful content. The title is important for accessibility,
10+ SEO, and browser tab identification.
11+12+ {2 Validation Rules}
13+14+ - Documents should have exactly one [<title>] element in the [<head>]
15+ - The [<title>] element should contain non-whitespace text
16+ - Empty titles are flagged as errors
17+18+ {2 Error Messages}
19+20+ - [Empty_title]: The title element is empty or contains only whitespace
21+ - [Missing_title]: No title element found in the document head
22+23+ @see <https://html.spec.whatwg.org/multipage/semantics.html#the-title-element>
24+ HTML Standard: The title element
25+*)
26+27+val checker : Checker.t
28+(** The title element checker instance. *)
···1(** Unknown HTML element checker.
23 Detects elements that are not in the HTML5 specification and produces
4- appropriate error messages. Custom elements (with hyphens) are allowed. *)
5-6-(** Set of all known HTML5 element names. *)
7-let known_elements =
8- let elements = [
9- (* Document metadata *)
10- "html"; "head"; "title"; "base"; "link"; "meta"; "style";
11-12- (* Sections *)
13- "body"; "article"; "section"; "nav"; "aside"; "h1"; "h2"; "h3"; "h4"; "h5"; "h6";
14- "hgroup"; "header"; "footer"; "address"; "main";
15-16- (* Grouping content *)
17- "p"; "hr"; "pre"; "blockquote"; "ol"; "ul"; "menu"; "li"; "dl"; "dt"; "dd";
18- "figure"; "figcaption"; "div";
1920- (* Text-level semantics *)
21- "a"; "em"; "strong"; "small"; "s"; "cite"; "q"; "dfn"; "abbr"; "ruby"; "rt"; "rp";
22- "data"; "time"; "code"; "var"; "samp"; "kbd"; "sub"; "sup"; "i"; "b"; "u"; "mark";
23- "bdi"; "bdo"; "span"; "br"; "wbr"; "search";
24-25- (* Edits *)
26- "ins"; "del";
27-28- (* Embedded content *)
29- "picture"; "source"; "img"; "iframe"; "embed"; "object"; "video"; "audio";
30- "track"; "map"; "area"; "math"; "svg";
31-32- (* Tables *)
33- "table"; "caption"; "colgroup"; "col"; "tbody"; "thead"; "tfoot"; "tr"; "td"; "th";
34-35- (* Forms *)
36- "form"; "label"; "input"; "button"; "select"; "datalist"; "optgroup"; "option";
37- "textarea"; "output"; "progress"; "meter"; "fieldset"; "legend";
38-39- (* Interactive *)
40- "details"; "summary"; "dialog";
41-42- (* Scripting *)
43- "script"; "noscript"; "template"; "slot"; "canvas";
44-45- (* Deprecated but still recognized *)
46- "param";
47- ] in
48- let tbl = Hashtbl.create (List.length elements) in
49- List.iter (fun el -> Hashtbl.add tbl el ()) elements;
50- tbl
51-52-(** Check if an element name is a custom element (contains hyphen). *)
53-let is_custom_element name =
54- String.contains name '-'
55-56-(** Check if an element name is known. *)
57-let is_known_element name =
58- let name_lower = String.lowercase_ascii name in
59- Hashtbl.mem known_elements name_lower || is_custom_element name_lower
6061type state = {
62 mutable stack : string list; (* Parent element stack *)
···1(** Unknown HTML element checker.
23 Detects elements that are not in the HTML5 specification and produces
4+ appropriate error messages. Custom elements (with hyphens) are allowed.
0000000000000056+ Note: Unknown element detection is performed by the parser, which marks
7+ unrecognized elements as [Tag.Unknown]. This checker produces appropriate
8+ error messages for those elements. *)
0000000000000000000000000000000000000910type state = {
11 mutable stack : string list; (* Parent element stack *)
···1+(*---------------------------------------------------------------------------
2+ Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
3+ SPDX-License-Identifier: MIT
4+ ---------------------------------------------------------------------------*)
5+6+(** Unknown HTML element checker.
7+8+ This checker detects elements that are not in the HTML5 specification
9+ and produces appropriate error messages. Custom elements (names
10+ containing hyphens) are allowed per the Web Components specification.
11+12+ {2 Recognized Elements}
13+14+ All standard HTML5 elements are recognized (by the parser, which marks anything else as [Tag.Unknown]), including:
15+ - Document metadata (html, head, title, etc.)
16+ - Sections (body, article, section, nav, etc.)
17+ - Grouping content (p, div, ul, ol, etc.)
18+ - Text-level semantics (a, em, strong, span, etc.)
19+ - Embedded content (img, video, audio, iframe, etc.)
20+ - Tabular data (table, tr, td, th, etc.)
21+ - Forms (form, input, button, select, etc.)
22+ - Interactive elements (details, dialog, summary)
23+ - Scripting (script, noscript, template)
24+25+ {2 Custom Elements}
26+27+ Element names containing a hyphen are treated as custom elements
28+ and are allowed without warning (e.g., [<my-component>], [<app-header>]).
29+30+ {2 Error Messages}
31+32+ Reports [Unknown_element] for unrecognized element names that are
33+ not valid custom elements.
34+35+ @see <https://html.spec.whatwg.org/multipage/custom-elements.html>
36+ HTML Standard: Custom elements
37+*)
38+39+val checker : Checker.t
40+(** The unknown element checker instance. *)
···1+(*---------------------------------------------------------------------------
2+ Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
3+ SPDX-License-Identifier: MIT
4+ ---------------------------------------------------------------------------*)
5+6+(** URL validation checker.
7+8+ This checker validates URL attributes ([href], [src], [action], etc.)
9+ on HTML elements. It checks for common URL issues and security concerns.
10+11+ {2 Validated Attributes}
12+13+ The checker validates URLs in these attributes:
14+ - [href] on [<a>], [<area>], [<base>], [<link>]
15+ - [src] on [<audio>], [<embed>], [<iframe>], [<img>], [<input>],
16+ [<script>], [<source>], [<track>], [<video>]
17+ - [action] on [<form>], [<button>] (formaction)
18+ - [cite] on [<blockquote>], [<del>], [<ins>], [<q>]
19+ - [data] on [<object>]
20+ - [poster] on [<video>]
21+ - [value] on [<input type="url">]
22+23+ {2 Validation Rules}
24+25+ - URLs should be well-formed (parseable)
26+ - Relative URLs are allowed
27+ - Fragment-only URLs ([#anchor]) are valid
28+ - Data URLs are validated for proper structure
29+ - javascript: URLs may trigger warnings
30+ - Empty URLs are flagged on elements that require them
31+32+ {2 Error Messages}
33+34+ - [Bad_url]: Malformed URL that cannot be parsed
35+ - [Empty_url]: Required URL attribute is empty
36+ - Various URL-specific validation errors
37+38+ @see <https://url.spec.whatwg.org/>
39+ URL Standard
40+*)
41+42+(** {1 URL Parsing Utilities} *)
43+44+val extract_scheme : string -> string option
45+(** [extract_scheme url] extracts the scheme (protocol) from a URL.
46+47+ @param url The URL to parse
48+ @return [Some scheme] if a valid scheme is found (e.g., "http", "https"),
49+ [None] if no scheme is present or the URL is relative *)
50+51+val validate_url : string -> string -> string -> string option
52+(** [validate_url url element_name attr_name] validates a URL.
53+54+ Performs comprehensive validation including:
55+ - Checking for empty URLs on elements that require them
56+ - Validating scheme, host, port, path, query, and fragment
57+ - Checking for illegal characters and encoding issues
58+ - Validating special schemes (http, https, etc.)
59+60+ @param url The URL to validate
61+ @param element_name The element containing the URL attribute
62+ @param attr_name The attribute name
63+ @return [Some error_message] if the URL is invalid, [None] if valid *)
64+65+(** {1 Checker} *)
66+67+val checker : Checker.t
68+(** The URL validation checker instance. *)