···11+(*---------------------------------------------------------------------------
22+ Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
33+ SPDX-License-Identifier: MIT
44+ ---------------------------------------------------------------------------*)
55+66+(** Common attribute utilities used across checkers.
77+88+ This module provides simple helper functions for working with raw
99+ attribute lists (name-value pairs). These utilities are used by
1010+ checkers that need to inspect attributes without full typed parsing.
1111+1212+ For typed attribute access, see the {!Attr} module.
1313+*)
1414+1515+(** {1 Types} *)
1616+1717+type attrs = (string * string) list
1818+(** Raw attribute list as name-value pairs. *)
1919+2020+(** {1 Attribute Lookup} *)
2121+2222+val has_attr : string -> attrs -> bool
2323+(** [has_attr name attrs] checks if an attribute exists.
2424+2525+ The comparison is case-insensitive.
2626+2727+ @param name The attribute name to look for (lowercase)
2828+ @param attrs The attribute list
2929+ @return [true] if the attribute is present *)
3030+3131+val get_attr : string -> attrs -> string option
3232+(** [get_attr name attrs] gets an attribute value.
3333+3434+ The comparison is case-insensitive.
3535+3636+ @param name The attribute name to look for (lowercase)
3737+ @param attrs The attribute list
3838+ @return [Some value] if found, [None] otherwise *)
3939+4040+val get_attr_or : string -> default:string -> attrs -> string
4141+(** [get_attr_or name ~default attrs] gets an attribute value, falling back
 to [default] when the attribute is not found.

 NOTE(review): presumably uses the same case-insensitive comparison as
 [get_attr]; confirm against the implementation.
4242+4343+ @param name The attribute name to look for (lowercase)
4444+ @param default The value returned when the attribute is not found
4545+ @param attrs The attribute list
4646+ @return The attribute value or the default *)
4747+4848+val is_non_empty_attr : string -> attrs -> bool
4949+(** [is_non_empty_attr name attrs] checks if an attribute exists with a non-empty value.
5050+5151+ The value is considered non-empty if it contains non-whitespace characters,
 so an attribute whose value is empty or whitespace-only does not count.

 NOTE(review): presumably the name comparison is case-insensitive, as
 documented for [has_attr] and [get_attr]; confirm against the implementation.
5252+5353+ @param name The attribute name to look for (lowercase)
5454+ @param attrs The attribute list
5555+ @return [true] if the attribute exists and has a non-empty value *)
+531
lib/htmlrw_check/element/attr.mli
···11+(*---------------------------------------------------------------------------
22+ Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
33+ SPDX-License-Identifier: MIT
44+ ---------------------------------------------------------------------------*)
55+66+(** Typed HTML5 attribute representations using polymorphic variants.
77+88+ This module provides typed representations for HTML attributes with
99+ proper value types for enumerated attributes. Parsing raw attribute
1010+ name-value pairs produces typed variants that can be pattern-matched
1111+ with exhaustiveness checking.
1212+1313+ {2 Design Philosophy}
1414+1515+ HTML5 attributes have specific value constraints that this module
1616+ encodes in the type system:
1717+1818+ - Boolean attributes: Present means true (e.g., [disabled], [checked])
1919+ - Enumerated attributes: Fixed set of valid values (e.g., [dir], [method])
2020+ - Numeric attributes: Integer or float values (e.g., [tabindex], [colspan])
2121+ - URL attributes: String values representing URLs (e.g., [href], [src])
2222+ - Free-form attributes: Any string value (e.g., [class], [title])
2323+2424+ {2 Parsing Strategy}
2525+2626+ Attributes are parsed with validation:
2727+ - Known attributes are parsed into typed variants
2828+ - Invalid values for enumerated attributes fall back to [`Unknown_attr]
2929+ - Unknown attribute names are captured as [`Unknown_attr]
3030+ - [data-*] and [aria-*] prefixed attributes are captured as [`Data_attr] and [`Aria]
3131+3232+ @see <https://html.spec.whatwg.org/multipage/dom.html#global-attributes>
3333+ HTML Standard: Global attributes
3434+*)
3535+3636+(** {1 Attribute Value Types}
3737+3838+ These types represent the valid values for enumerated HTML attributes. *)
3939+4040+(** Direction attribute values for [dir]. *)
4141+type dir_value = [ `Ltr | `Rtl | `Auto ]
4242+4343+(** Hidden attribute values. *)
4444+type hidden_value = [ `Hidden | `Until_found ]
4545+4646+(** Popover attribute values. *)
4747+type popover_value = [ `Auto | `Manual | `Hint ]
4848+4949+(** Link target attribute values. *)
5050+type target_value = [ `Self | `Blank | `Parent | `Top | `Named of string ]
5151+5252+(** Image/resource loading behavior. *)
5353+type loading_value = [ `Eager | `Lazy ]
5454+5555+(** Image decoding hint. *)
5656+type decoding_value = [ `Sync | `Async | `Auto ]
5757+5858+(** Fetch priority hint. *)
5959+type fetchpriority_value = [ `High | `Low | `Auto ]
6060+6161+(** CORS settings. *)
6262+type crossorigin_value = [ `Anonymous | `Use_credentials ]
6363+6464+(** Media preload hint. *)
6565+type preload_value = [ `None | `Metadata | `Auto ]
6666+6767+(** Form method values. *)
6868+type method_value = [ `Get | `Post | `Dialog ]
6969+7070+(** Form encoding type values. *)
7171+type enctype_value = [ `Urlencoded | `Multipart | `Plain ]
7272+7373+(** Textarea wrap mode. *)
7474+type wrap_value = [ `Soft | `Hard ]
7575+7676+(** Table cell scope. *)
7777+type scope_value = [ `Row | `Col | `Rowgroup | `Colgroup ]
7878+7979+(** Input element type values. *)
8080+type input_type = [
8181+ | `Hidden | `Text | `Search | `Tel | `Url | `Email | `Password
8282+ | `Date | `Month | `Week | `Time | `Datetime_local | `Number
8383+ | `Range | `Color | `Checkbox | `Radio | `File | `Submit
8484+ | `Image | `Reset | `Button
8585+]
8686+8787+(** Button element type values. *)
8888+type button_type = [ `Submit | `Reset | `Button ]
8989+9090+(** Referrer policy values. *)
9191+type referrerpolicy_value = [
9292+ | `No_referrer | `No_referrer_when_downgrade | `Origin
9393+ | `Origin_when_cross_origin | `Same_origin | `Strict_origin
9494+ | `Strict_origin_when_cross_origin | `Unsafe_url
9595+]
9696+9797+(** Iframe sandbox flags. *)
9898+type sandbox_flag = [
9999+ | `Allow_downloads | `Allow_forms | `Allow_modals | `Allow_orientation_lock
100100+ | `Allow_pointer_lock | `Allow_popups | `Allow_popups_to_escape_sandbox
101101+ | `Allow_presentation | `Allow_same_origin | `Allow_scripts
102102+ | `Allow_top_navigation | `Allow_top_navigation_by_user_activation
103103+ | `Allow_top_navigation_to_custom_protocols
104104+]
105105+106106+(** Enter key hint values for virtual keyboards. *)
107107+type enterkeyhint_value = [
108108+ | `Enter | `Done | `Go | `Next | `Previous | `Search | `Send
109109+]
110110+111111+(** Input mode hint for virtual keyboards. *)
112112+type inputmode_value = [
113113+ | `None | `Text | `Decimal | `Numeric | `Tel | `Search | `Email | `Url
114114+]
115115+116116+(** Content editable values. *)
117117+type contenteditable_value = [ `True | `False | `Plaintext_only ]
118118+119119+(** Autocapitalize values. *)
120120+type autocapitalize_value = [
121121+ | `Off | `None | `On | `Sentences | `Words | `Characters
122122+]
123123+124124+(** Image map shape values. *)
125125+type shape_value = [ `Rect | `Circle | `Poly | `Default ]
126126+127127+(** Input capture values for file inputs. *)
128128+type capture_value = [ `User | `Environment ]
129129+130130+(** Ordered list type values. *)
131131+type list_type_value = [
132132+ | `Decimal | `Lower_alpha | `Upper_alpha | `Lower_roman | `Upper_roman
133133+]
134134+135135+(** Track element kind values. *)
136136+type kind_value = [
137137+ | `Subtitles | `Captions | `Descriptions | `Chapters | `Metadata
138138+]
139139+140140+(** {1 Typed Attribute Variant} *)
141141+142142+(** Typed attribute representation.
143143+144144+ This type covers all HTML5 attributes with appropriate value types.
145145+ Attributes are organized into logical groups. *)
146146+type t = [
147147+ (* Global attributes *)
148148+ | `Id of string
149149+ | `Class of string
150150+ | `Style of string
151151+ | `Title of string
152152+ | `Lang of string
153153+ | `Dir of dir_value
154154+ | `Hidden of hidden_value option
155155+ | `Tabindex of int
156156+ | `Accesskey of string
157157+ | `Autocapitalize of autocapitalize_value
158158+ | `Autofocus
159159+ | `Contenteditable of contenteditable_value option
160160+ | `Draggable of bool
161161+ | `Enterkeyhint of enterkeyhint_value
162162+ | `Inert
163163+ | `Inputmode of inputmode_value
164164+ | `Is of string
165165+ | `Nonce of string
166166+ | `Popover of popover_value option
167167+ | `Slot of string
168168+ | `Spellcheck of bool option
169169+ | `Translate of bool
170170+ | `Exportparts of string
171171+ | `Part of string
172172+173173+ (* Microdata *)
174174+ | `Itemscope
175175+ | `Itemtype of string
176176+ | `Itemprop of string
177177+ | `Itemid of string
178178+ | `Itemref of string
179179+180180+ (* ARIA *)
181181+ | `Role of string
182182+ | `Aria of string * string
183183+184184+ (* Event handlers *)
185185+ | `Event of string * string
186186+187187+ (* Link/navigation attributes *)
188188+ | `Href of string
189189+ | `Target of target_value
190190+ | `Rel of string
191191+ | `Download of string option
192192+ | `Hreflang of string
193193+ | `Ping of string
194194+ | `Referrerpolicy of referrerpolicy_value
195195+ | `Type_link of string
196196+197197+ (* Media/resource attributes *)
198198+ | `Src of string
199199+ | `Srcset of string
200200+ | `Sizes of string
201201+ | `Alt of string
202202+ | `Width of string
203203+ | `Height of string
204204+ | `Loading of loading_value
205205+ | `Decoding of decoding_value
206206+ | `Fetchpriority of fetchpriority_value
207207+ | `Crossorigin of crossorigin_value option
208208+ | `Ismap
209209+ | `Usemap of string
210210+ | `Media of string
211211+212212+ (* Audio/Video specific *)
213213+ | `Controls
214214+ | `Autoplay
215215+ | `Loop
216216+ | `Muted
217217+ | `Preload of preload_value
218218+ | `Poster of string
219219+ | `Playsinline
220220+221221+ (* Image map *)
222222+ | `Coords of string
223223+ | `Shape of shape_value
224224+225225+ (* iframe *)
226226+ | `Sandbox of sandbox_flag list option
227227+ | `Allow of string
228228+ | `Allowfullscreen
229229+ | `Srcdoc of string
230230+ | `Csp of string
231231+232232+ (* Form attributes *)
233233+ | `Action of string
234234+ | `Method of method_value
235235+ | `Enctype of enctype_value
236236+ | `Novalidate
237237+ | `Accept_charset of string
238238+ | `Autocomplete of string
239239+ | `Name of string
240240+ | `Form of string
241241+242242+ (* Form control attributes *)
243243+ | `Value of string
244244+ | `Type_input of input_type
245245+ | `Type_button of button_type
246246+ | `Disabled
247247+ | `Readonly
248248+ | `Required
249249+ | `Checked
250250+ | `Selected
251251+ | `Multiple
252252+ | `Placeholder of string
253253+ | `Min of string
254254+ | `Max of string
255255+ | `Step of string
256256+ | `Minlength of int
257257+ | `Maxlength of int
258258+ | `Pattern of string
259259+ | `Size of int
260260+ | `Cols of int
261261+ | `Rows of int
262262+ | `Wrap of wrap_value
263263+ | `Accept of string
264264+ | `Capture of capture_value
265265+ | `Dirname of string
266266+ | `For of string
267267+ | `List of string
268268+269269+ (* Form submission attributes *)
270270+ | `Formaction of string
271271+ | `Formmethod of method_value
272272+ | `Formenctype of enctype_value
273273+ | `Formnovalidate
274274+ | `Formtarget of target_value
275275+276276+ (* Table attributes *)
277277+ | `Colspan of int
278278+ | `Rowspan of int
279279+ | `Headers of string
280280+ | `Scope of scope_value
281281+ | `Span of int
282282+283283+ (* Details/Dialog *)
284284+ | `Open
285285+286286+ (* Script *)
287287+ | `Async
288288+ | `Defer
289289+ | `Integrity of string
290290+ | `Nomodule
291291+ | `Blocking of string
292292+ | `Type_script of string
293293+294294+ (* Meta *)
295295+ | `Charset of string
296296+ | `Content of string
297297+ | `Http_equiv of string
298298+299299+ (* Link element *)
300300+ | `As of string
301301+ | `Imagesizes of string
302302+ | `Imagesrcset of string
303303+304304+ (* Object/Embed *)
305305+ | `Data_object of string
306306+307307+ (* Output *)
308308+ | `For_output of string
309309+310310+ (* Meter/Progress *)
311311+ | `Low of float
312312+ | `High of float
313313+ | `Optimum of float
314314+315315+ (* Time *)
316316+ | `Datetime of string
317317+318318+ (* Ol *)
319319+ | `Start of int
320320+ | `Reversed
321321+ | `Type_list of list_type_value
322322+323323+ (* Track *)
324324+ | `Kind of kind_value
325325+ | `Srclang of string
326326+ | `Default
327327+328328+ (* Td/Th *)
329329+ | `Abbr of string
330330+331331+ (* Data attributes *)
332332+ | `Data_attr of string * string
333333+334334+ (* RDFa *)
335335+ | `Property of string
336336+ | `Typeof of string
337337+ | `Resource of string
338338+ | `Prefix of string
339339+ | `Vocab of string
340340+ | `About of string
341341+ | `Datatype of string
342342+ | `Inlist
343343+ | `Rev of string
344344+345345+ (* Escape hatch *)
346346+ | `Unknown_attr of string * string
347347+]
348348+349349+(** {1 Parsing Functions} *)
350350+351351+val parse_dir : string -> dir_value option
352352+(** [parse_dir value] parses a direction attribute value. *)
353353+354354+val parse_target : string -> target_value
355355+(** [parse_target value] parses a target attribute value. *)
356356+357357+val parse_loading : string -> loading_value option
358358+(** [parse_loading value] parses a loading attribute value. *)
359359+360360+val parse_decoding : string -> decoding_value option
361361+(** [parse_decoding value] parses a decoding attribute value. *)
362362+363363+val parse_fetchpriority : string -> fetchpriority_value option
364364+(** [parse_fetchpriority value] parses a fetchpriority attribute value. *)
365365+366366+val parse_crossorigin : string -> crossorigin_value option
367367+(** [parse_crossorigin value] parses a crossorigin attribute value. *)
368368+369369+val parse_preload : string -> preload_value option
370370+(** [parse_preload value] parses a preload attribute value. *)
371371+372372+val parse_method : string -> method_value option
373373+(** [parse_method value] parses a form method attribute value. *)
374374+375375+val parse_enctype : string -> enctype_value option
376376+(** [parse_enctype value] parses a form enctype attribute value. *)
377377+378378+val parse_wrap : string -> wrap_value option
379379+(** [parse_wrap value] parses a textarea wrap attribute value. *)
380380+381381+val parse_scope : string -> scope_value option
382382+(** [parse_scope value] parses a table scope attribute value. *)
383383+384384+val parse_input_type : string -> input_type option
385385+(** [parse_input_type value] parses an input type attribute value. *)
386386+387387+val parse_button_type : string -> button_type option
388388+(** [parse_button_type value] parses a button type attribute value. *)
389389+390390+val parse_shape : string -> shape_value option
391391+(** [parse_shape value] parses an area shape attribute value. *)
392392+393393+val parse_capture : string -> capture_value option
394394+(** [parse_capture value] parses an input capture attribute value. *)
395395+396396+val parse_list_type : string -> list_type_value option
397397+(** [parse_list_type value] parses an ordered list type attribute value. *)
398398+399399+val parse_kind : string -> kind_value option
400400+(** [parse_kind value] parses a track kind attribute value. *)
401401+402402+val parse_referrerpolicy : string -> referrerpolicy_value option
403403+(** [parse_referrerpolicy value] parses a referrer policy attribute value. *)
404404+405405+val parse_sandbox_flag : string -> sandbox_flag option
406406+(** [parse_sandbox_flag value] parses a single sandbox flag token. *)
407407+408408+val parse_sandbox : string -> sandbox_flag list option
409409+(** [parse_sandbox value] parses a space-separated sandbox attribute value. *)
410410+411411+val parse_enterkeyhint : string -> enterkeyhint_value option
412412+(** [parse_enterkeyhint value] parses an enterkeyhint attribute value. *)
413413+414414+val parse_inputmode : string -> inputmode_value option
415415+(** [parse_inputmode value] parses an inputmode attribute value. *)
416416+417417+val parse_contenteditable : string -> contenteditable_value option
418418+(** [parse_contenteditable value] parses a contenteditable attribute value. *)
419419+420420+val parse_autocapitalize : string -> autocapitalize_value option
421421+(** [parse_autocapitalize value] parses an autocapitalize attribute value. *)
422422+423423+val parse_hidden : string -> hidden_value option
424424+(** [parse_hidden value] parses a hidden attribute value. *)
425425+426426+val parse_popover : string -> popover_value option
427427+(** [parse_popover value] parses a popover attribute value. *)
428428+429429+val parse_int : string -> int option
430430+(** [parse_int value] attempts to parse an integer from a string. *)
431431+432432+val parse_float : string -> float option
433433+(** [parse_float value] attempts to parse a float from a string. *)
434434+435435+val parse_bool : string -> bool option
436436+(** [parse_bool value] parses a boolean attribute value. *)
437437+438438+val parse_attr : string -> string -> t
439439+(** [parse_attr name value] parses a single attribute name-value pair.
440440+441441+ @param name The attribute name
442442+ @param value The attribute value
443443+ @return A typed attribute variant
444444+445445+ {b Example:}
446446+ {[
447447+ parse_attr "class" "container" (* `Class "container" *)
448448+ parse_attr "disabled" "" (* `Disabled *)
449449+ parse_attr "data-id" "123" (* `Data_attr ("id", "123") *)
450450+ ]} *)
451451+452452+val parse_attrs : (string * string) list -> t list
453453+(** [parse_attrs attrs] parses multiple attributes.
454454+455455+ @param attrs List of (name, value) pairs
456456+ @return List of typed attributes *)
457457+458458+(** {1 Accessor Functions} *)
459459+460460+val get_id : t list -> string option
461461+(** [get_id attrs] extracts the id attribute value if present. *)
462462+463463+val get_class : t list -> string option
464464+(** [get_class attrs] extracts the class attribute value if present. *)
465465+466466+val get_href : t list -> string option
467467+(** [get_href attrs] extracts the href attribute value if present. *)
468468+469469+val get_src : t list -> string option
470470+(** [get_src attrs] extracts the src attribute value if present. *)
471471+472472+val get_alt : t list -> string option
473473+(** [get_alt attrs] extracts the alt attribute value if present. *)
474474+475475+val get_name : t list -> string option
476476+(** [get_name attrs] extracts the name attribute value if present. *)
477477+478478+val get_value : t list -> string option
479479+(** [get_value attrs] extracts the value attribute value if present. *)
480480+481481+val get_role : t list -> string option
482482+(** [get_role attrs] extracts the role attribute value if present. *)
483483+484484+val get_aria : string -> t list -> string option
485485+(** [get_aria name attrs] extracts a specific aria-* attribute value.
486486+487487+ @param name The aria attribute name without the "aria-" prefix *)
488488+489489+val get_data : string -> t list -> string option
490490+(** [get_data name attrs] extracts a specific data-* attribute value.
491491+492492+ @param name The data attribute name without the "data-" prefix *)
493493+494494+val has_disabled : t list -> bool
495495+(** [has_disabled attrs] checks if the disabled attribute is present. *)
496496+497497+val has_required : t list -> bool
498498+(** [has_required attrs] checks if the required attribute is present. *)
499499+500500+val has_readonly : t list -> bool
501501+(** [has_readonly attrs] checks if the readonly attribute is present. *)
502502+503503+val has_checked : t list -> bool
504504+(** [has_checked attrs] checks if the checked attribute is present. *)
505505+506506+val has_autofocus : t list -> bool
507507+(** [has_autofocus attrs] checks if the autofocus attribute is present. *)
508508+509509+val has_hidden : t list -> bool
510510+(** [has_hidden attrs] checks if the hidden attribute is present. *)
511511+512512+val has_inert : t list -> bool
513513+(** [has_inert attrs] checks if the inert attribute is present. *)
514514+515515+val has_open : t list -> bool
516516+(** [has_open attrs] checks if the open attribute is present. *)
517517+518518+val get_all_aria : t list -> (string * string) list
519519+(** [get_all_aria attrs] extracts all aria-* attributes. *)
520520+521521+val get_all_data : t list -> (string * string) list
522522+(** [get_all_data attrs] extracts all data-* attributes. *)
523523+524524+val find : (t -> 'a option) -> t list -> 'a option
525525+(** [find f attrs] returns the first [Some _] result produced by applying
 [f] to the attributes, or [None] if [f] yields [None] for every attribute
 (including when [attrs] is empty).

 Note that [f] is a partial mapping function rather than a boolean
 predicate: it both selects the attribute and extracts the value to return. *)
526526+527527+val exists : (t -> bool) -> t list -> bool
528528+(** [exists f attrs] checks if any attribute matches predicate [f]. *)
529529+530530+val filter : (t -> bool) -> t list -> t list
531531+(** [filter f attrs] filters attributes matching predicate [f]. *)
+348
lib/htmlrw_check/element/element.mli
···11+(*---------------------------------------------------------------------------
22+ Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
33+ SPDX-License-Identifier: MIT
44+ ---------------------------------------------------------------------------*)
55+66+(** Typed HTML5 element representation.
77+88+ This module combines tags and attributes into a complete typed element
99+ representation. Elements are created from raw input (tag name, namespace,
1010+ attributes) and provide typed accessors for validation and manipulation.
1111+1212+ {2 Design Philosophy}
1313+1414+ An element in this module represents a complete typed view of an HTML
1515+ element, including:
1616+1717+ - The element's tag (typed via {!Tag.element_tag})
1818+ - Typed attributes (via {!Attr.t} list)
1919+ - Raw attributes (for fallback access)
2020+2121+ This dual representation allows checkers to use typed pattern matching
2222+ for common cases while falling back to raw strings when needed.
2323+2424+ {2 Usage Example}
2525+2626+ {[
2727+ let elem = Element.create
2828+ ~name:"input"
2929+ ~namespace:None
3030+ ~attrs:[("type", "email"); ("required", ""); ("class", "form-input")]
3131+ in
3232+ match elem.tag with
3333+ | Tag.Html `Input ->
3434+ if Element.has_required elem then
3535+ (* Validate required input *)
3636+ ()
3737+ | _ -> ()
3838+ ]}
3939+4040+ See {!Tag} for element tag types
4141+ and {!Attr} for attribute types.
4242+*)
4343+4444+(** {1 Element Type} *)
4545+4646+(** A typed HTML element.
4747+4848+ Holds the element's tag together with both a typed and a raw view
4949+ of its attributes; the raw name-value pairs are kept so callers can
5050+ fall back to the original strings. *)
5151+type t = {
5252+ tag : Tag.element_tag; (** The element's tag classification. *)
5353+ attrs : Attr.t list; (** Typed attributes parsed from raw input. *)
5454+ raw_attrs : (string * string) list; (** Original attribute name-value pairs for fallback. *)
5555+}
5656+5757+(** {1 Construction} *)
5858+5959+val create : name:string -> namespace:string option -> attrs:(string * string) list -> t
6060+(** [create ~name ~namespace ~attrs] creates a typed element.
6161+6262+ @param name The element's tag name
6363+ @param namespace Optional namespace URI (for SVG/MathML)
6464+ @param attrs Raw attribute name-value pairs
6565+ @return A typed element
6666+6767+ {b Example:}
6868+ {[
6969+ let div = Element.create ~name:"div" ~namespace:None
7070+ ~attrs:[("class", "container"); ("id", "main")]
7171+ ]} *)
7272+7373+(** {1 Tag Accessors} *)
7474+7575+val tag : t -> Tag.element_tag
7676+(** [tag elem] returns the element's tag. *)
7777+7878+val tag_name : t -> string
7979+(** [tag_name elem] returns the element's tag name as a string. *)
8080+8181+val is_html_tag : Tag.html_tag -> t -> bool
8282+(** [is_html_tag expected elem] checks if the element is a specific HTML tag.
8383+8484+ @param expected The expected HTML tag variant
8585+ @param elem The element to check
8686+ @return [true] if the element matches *)
8787+8888+val as_html_tag : t -> Tag.html_tag option
8989+(** [as_html_tag elem] extracts the HTML tag if this is an HTML element.
9090+9191+ @return [Some tag] for HTML elements, [None] for SVG/MathML/Custom/Unknown *)
9292+9393+(** {1 Attribute Accessors} *)
9494+9595+val attrs : t -> Attr.t list
9696+(** [attrs elem] returns the typed attributes. *)
9797+9898+val raw_attrs : t -> (string * string) list
9999+(** [raw_attrs elem] returns the original raw attributes. *)
100100+101101+val get_id : t -> string option
102102+(** [get_id elem] extracts the id attribute value. *)
103103+104104+val get_class : t -> string option
105105+(** [get_class elem] extracts the class attribute value. *)
106106+107107+val get_href : t -> string option
108108+(** [get_href elem] extracts the href attribute value. *)
109109+110110+val get_src : t -> string option
111111+(** [get_src elem] extracts the src attribute value. *)
112112+113113+val get_alt : t -> string option
114114+(** [get_alt elem] extracts the alt attribute value. *)
115115+116116+val get_name : t -> string option
117117+(** [get_name elem] extracts the name attribute value. *)
118118+119119+val get_value : t -> string option
120120+(** [get_value elem] extracts the value attribute value. *)
121121+122122+val get_role : t -> string option
123123+(** [get_role elem] extracts the role attribute value. *)
124124+125125+val get_aria : string -> t -> string option
126126+(** [get_aria name elem] extracts a specific aria-* attribute value.
127127+128128+ @param name The aria attribute name without the "aria-" prefix *)
129129+130130+val get_data : string -> t -> string option
131131+(** [get_data name elem] extracts a specific data-* attribute value.
132132+133133+ @param name The data attribute name without the "data-" prefix *)
134134+135135+val has_disabled : t -> bool
136136+(** [has_disabled elem] checks if the disabled attribute is present. *)
137137+138138+val has_required : t -> bool
139139+(** [has_required elem] checks if the required attribute is present. *)
140140+141141+val has_readonly : t -> bool
142142+(** [has_readonly elem] checks if the readonly attribute is present. *)
143143+144144+val has_checked : t -> bool
145145+(** [has_checked elem] checks if the checked attribute is present. *)
146146+147147+val has_autofocus : t -> bool
148148+(** [has_autofocus elem] checks if the autofocus attribute is present. *)
149149+150150+val has_hidden : t -> bool
151151+(** [has_hidden elem] checks if the hidden attribute is present. *)
152152+153153+val has_inert : t -> bool
154154+(** [has_inert elem] checks if the inert attribute is present. *)
155155+156156+val has_open : t -> bool
157157+(** [has_open elem] checks if the open attribute is present. *)
158158+159159+val get_all_aria : t -> (string * string) list
160160+(** [get_all_aria elem] extracts all aria-* attributes. *)
161161+162162+val get_all_data : t -> (string * string) list
163163+(** [get_all_data elem] extracts all data-* attributes. *)
164164+165165+(** {1 Raw Attribute Fallback} *)
166166+167167+val get_raw_attr : string -> t -> string option
168168+(** [get_raw_attr name elem] gets a raw attribute value by name.
169169+170170+ This is useful when the typed representation doesn't capture a specific
171171+ attribute or when you need the exact original value.
172172+173173+ @param name The attribute name (case-insensitive)
174174+ @param elem The element
175175+ @return [Some value] if the attribute exists *)
176176+177177+val has_raw_attr : string -> t -> bool
178178+(** [has_raw_attr name elem] checks if a raw attribute exists.
179179+180180+ @param name The attribute name (case-insensitive)
181181+ @param elem The element
182182+ @return [true] if the attribute is present *)
183183+184184+(** {1 Category Checks}
185185+186186+ These predicates check element categories based on the HTML5 content model. *)
187187+188188+val is_void : t -> bool
189189+(** [is_void elem] checks if this is a void element (cannot have children).
190190+191191+ @return [true] for br, hr, img, input, etc. *)
192192+193193+val is_heading : t -> bool
194194+(** [is_heading elem] checks if this is a heading element.
195195+196196+ @return [true] for h1-h6 *)
197197+198198+val heading_level : t -> int option
199199+(** [heading_level elem] gets the heading level (1-6) if applicable.
200200+201201+ @return [Some level] for h1-h6, [None] otherwise *)
202202+203203+val is_sectioning : t -> bool
204204+(** [is_sectioning elem] checks if this is sectioning content.
205205+206206+ @return [true] for article, aside, nav, section *)
207207+208208+val is_sectioning_root : t -> bool
209209+(** [is_sectioning_root elem] checks if this is a sectioning root.
210210+211211+ @return [true] for blockquote, body, details, dialog, fieldset, figure, td *)
212212+213213+val is_embedded : t -> bool
214214+(** [is_embedded elem] checks if this is embedded content.
215215+216216+ @return [true] for audio, canvas, embed, iframe, img, object, picture, video *)
217217+218218+val is_interactive : t -> bool
219219+(** [is_interactive elem] checks if this is interactive content.
220220+221221+ @return [true] for focusable/activatable elements *)
222222+223223+val is_form_associated : t -> bool
224224+(** [is_form_associated elem] checks if this is form-associated.
225225+226226+ @return [true] for elements that can belong to a form *)
227227+228228+val is_labelable : t -> bool
229229+(** [is_labelable elem] checks if this can be associated with a label.
230230+231231+ @return [true] for button, input, meter, output, progress, select, textarea *)
232232+233233+val is_submittable : t -> bool
234234+(** [is_submittable elem] checks if this is a submittable form element.
235235+236236+ @return [true] for button, input, select, textarea *)
237237+238238+val is_table_element : t -> bool
239239+(** [is_table_element elem] checks if this is a table-related element.
240240+241241+ @return [true] for table, tr, td, th, etc. *)
242242+243243+val is_media : t -> bool
244244+(** [is_media elem] checks if this is a media element.
245245+246246+ @return [true] for audio, video *)
247247+248248+val is_list_container : t -> bool
249249+(** [is_list_container elem] checks if this is a list container.
250250+251251+ @return [true] for ul, ol, menu, dl *)
252252+253253+val is_transparent : t -> bool
254254+(** [is_transparent elem] checks if this has a transparent content model.

 NOTE(review): [abbr] is listed below, but the HTML Standard gives it a
 phrasing content model rather than a transparent one; confirm whether the
 implementation really includes it or whether this list contains a typo.
255255+256256+ @return [true] for a, abbr, audio, canvas, del, ins, map, noscript, etc. *)
257257+258258+val is_phrasing : t -> bool
259259+(** [is_phrasing elem] checks if this is phrasing content.
260260+261261+ @return [true] for inline-level elements *)
262262+263263+val is_flow : t -> bool
264264+(** [is_flow elem] checks if this is flow content.
265265+266266+ @return [true] for most body-level elements *)
267267+268268+val is_obsolete : t -> bool
269269+(** [is_obsolete elem] checks if this is an obsolete element.
270270+271271+ @return [true] for applet, font, marquee, etc. *)
272272+273273+val is_svg : t -> bool
274274+(** [is_svg elem] checks if this is an SVG element.
275275+276276+ @return [true] if the element is in the SVG namespace *)
277277+278278+val is_mathml : t -> bool
279279+(** [is_mathml elem] checks if this is a MathML element.
280280+281281+ @return [true] if the element is in the MathML namespace *)
282282+283283+val is_custom : t -> bool
284284+(** [is_custom elem] checks if this is a custom element.
285285+286286+ @return [true] if the element name contains a hyphen *)
287287+288288+val is_unknown : t -> bool
289289+(** [is_unknown elem] checks if this is an unknown element.
290290+291291+ @return [true] if the element is not recognized *)
292292+293293+(** {1 Input Type Utilities} *)
294294+295295+val get_input_type : t -> Attr.input_type option
296296+(** [get_input_type elem] gets the input type for input elements.
297297+298298+ @return [Some type] for input elements with a type, [None] otherwise *)
299299+300300+val get_button_type : t -> Attr.button_type option
301301+(** [get_button_type elem] gets the button type for button elements.
302302+303303+ @return [Some type] for button elements with a type, [None] otherwise *)
304304+305305+val is_input_type : Attr.input_type -> t -> bool
306306+(** [is_input_type expected elem] checks if an input has a specific type.
307307+308308+ @param expected The expected input type
309309+ @param elem The element to check
310310+ @return [true] if this is an input with the specified type *)
311311+312312+(** {1 Pattern Matching Helpers} *)
313313+314314+val match_html : t -> (Tag.html_tag -> 'a) -> 'a option
315315+(** [match_html elem f] applies [f] to the HTML tag if present.
316316+317317+ @param elem The element
318318+ @param f Function to apply to the HTML tag
319319+ @return [Some (f tag)] for HTML elements, [None] otherwise *)
320320+321321+val when_html_tag : Tag.html_tag -> t -> (unit -> 'a) -> 'a option
322322+(** [when_html_tag expected elem f] applies [f] if the element matches.
323323+324324+ @param expected The expected HTML tag
325325+ @param elem The element to check
326326+ @param f Function to call if the element matches
327327+ @return [Some (f ())] if matched, [None] otherwise *)
328328+329329+(** {1 Internal} *)
330330+331331+val parse_type_attr : Tag.html_tag -> string -> Attr.t
332332+(** [parse_type_attr tag value] parses a type attribute for an element.
333333+334334+ Different elements have different valid type values. This function
335335+ handles context-dependent parsing.
336336+337337+ @param tag The element's HTML tag
338338+ @param value The type attribute value
339339+ @return The parsed attribute variant *)
340340+341341+val parse_attrs_for_tag : Tag.element_tag -> (string * string) list -> Attr.t list
342342+(** [parse_attrs_for_tag tag raw_attrs] parses attributes with element context.
343343+344344+ The type attribute is parsed differently depending on the element tag.
345345+346346+ @param tag The element's tag
347347+ @param raw_attrs Raw attribute name-value pairs
348348+ @return List of typed attributes *)
+439
lib/htmlrw_check/element/tag.mli
···11+(*---------------------------------------------------------------------------
22+ Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
33+ SPDX-License-Identifier: MIT
44+ ---------------------------------------------------------------------------*)
55+66+(** Typed HTML5 tag representations using polymorphic variants.
77+88+ This module provides compile-time type safety for HTML elements while
99+ maintaining escape hatches for unknown/custom elements. Tags are
1010+ represented using polymorphic variants, enabling pattern matching with
1111+ exhaustiveness checking while avoiding the overhead of explicit
1212+ constructors.
1313+1414+ {2 Design Philosophy}
1515+1616+ HTML5 defines over 100 standard elements with specific categories and
1717+ content models. This module:
1818+1919+ - Provides typed representations for all standard elements
2020+ - Supports SVG and MathML namespaced elements
2121+ - Recognizes custom elements (containing hyphens)
2222+ - Falls back to [Unknown] for unrecognized elements
2323+2424+ {2 Element Categories}
2525+2626+ HTML5 categorizes elements into content categories that define where
2727+ elements can appear and what they can contain. This module provides
2828+ predicates for common categories:
2929+3030+ - {!is_void} - Elements that cannot have children
3131+ - {!is_heading} - Heading elements (h1-h6)
3232+ - {!is_sectioning} - Elements that create document sections
3333+ - {!is_phrasing} - Inline/phrasing content elements
3434+ - {!is_flow} - Block/flow content elements
3535+3636+ @see <https://html.spec.whatwg.org/multipage/dom.html#content-models>
3737+ HTML Standard: Content models
3838+*)
3939+4040+(** {1 HTML Tag Types} *)
4141+4242+(** All standard HTML5 elements plus deprecated elements needed by the validator.
4343+4444+ This type covers:
4545+ - Document metadata elements (html, head, title, etc.)
4646+ - Sectioning elements (article, section, nav, etc.)
4747+ - Heading elements (h1-h6)
4848+ - Grouping content (div, p, ul, ol, etc.)
4949+ - Text-level semantics (a, em, strong, span, etc.)
5050+ - Embedded content (img, video, audio, etc.)
5151+ - Table elements (table, tr, td, th, etc.)
5252+ - Form elements (form, input, button, etc.)
5353+ - Interactive elements (details, dialog, summary)
5454+ - Scripting elements (script, noscript, template)
5555+ - Deprecated/obsolete elements (font, center, marquee, etc.) *)
5656+type html_tag = [
5757+ (* Document metadata *)
5858+ | `Html | `Head | `Title | `Base | `Link | `Meta | `Style
5959+6060+ (* Sectioning root *)
6161+ | `Body
6262+6363+ (* Content sectioning *)
6464+ | `Address | `Article | `Aside | `Footer | `Header | `Hgroup
6565+ | `Main | `Nav | `Search | `Section
6666+6767+ (* Heading content *)
6868+ | `H1 | `H2 | `H3 | `H4 | `H5 | `H6
6969+7070+ (* Grouping content *)
7171+ | `Blockquote | `Dd | `Div | `Dl | `Dt | `Figcaption | `Figure
7272+ | `Hr | `Li | `Menu | `Ol | `P | `Pre | `Ul
7373+7474+ (* Text-level semantics *)
7575+ | `A | `Abbr | `B | `Bdi | `Bdo | `Br | `Cite | `Code | `Data
7676+ | `Dfn | `Em | `I | `Kbd | `Mark | `Q | `Rp | `Rt | `Ruby
7777+ | `S | `Samp | `Small | `Span | `Strong | `Sub | `Sup | `Time
7878+ | `U | `Var | `Wbr
7979+8080+ (* Edits *)
8181+ | `Del | `Ins
8282+8383+ (* Embedded content *)
8484+ | `Area | `Audio | `Canvas | `Embed | `Iframe | `Img | `Map | `Object
8585+ | `Picture | `Source | `Track | `Video
8686+8787+ (* Tabular data *)
8888+ | `Caption | `Col | `Colgroup | `Table | `Tbody | `Td | `Tfoot
8989+ | `Th | `Thead | `Tr
9090+9191+ (* Forms *)
9292+ | `Button | `Datalist | `Fieldset | `Form | `Input | `Label
9393+ | `Legend | `Meter | `Optgroup | `Option | `Output | `Progress
9494+ | `Select | `Textarea
9595+9696+ (* Interactive elements *)
9797+ | `Details | `Dialog | `Summary
9898+9999+ (* Scripting *)
100100+ | `Noscript | `Script | `Slot | `Template
101101+102102+ (* Web Components / Misc *)
103103+ | `Portal | `Param
104104+105105+ (* Deprecated/obsolete elements *)
106106+ | `Applet | `Acronym | `Bgsound | `Dir | `Frame | `Frameset
107107+ | `Noframes | `Isindex | `Keygen | `Listing | `Menuitem | `Nextid
108108+ | `Noembed | `Plaintext | `Rb | `Rtc | `Strike | `Xmp
109109+ | `Basefont | `Big | `Blink | `Center | `Font | `Marquee
110110+ | `Multicol | `Nobr | `Spacer | `Tt | `Image
111111+]
112112+113113+(** {1 Category Types}
114114+115115+ Type aliases for element subsets, enabling functions that only accept
116116+ specific categories with compile-time checking. *)
117117+118118+(** Void elements - cannot have children (e.g., br, hr, img, input). *)
119119+type void_tag = [
120120+ | `Area | `Base | `Br | `Col | `Embed | `Hr | `Img | `Input
121121+ | `Link | `Meta | `Source | `Track | `Wbr
122122+ | `Basefont | `Frame | `Isindex | `Keygen | `Param
123123+]
124124+125125+(** Heading elements (h1-h6). *)
126126+type heading_tag = [ `H1 | `H2 | `H3 | `H4 | `H5 | `H6 ]
127127+128128+(** Sectioning content elements that establish document sections. *)
129129+type sectioning_tag = [ `Article | `Aside | `Nav | `Section ]
130130+131131+(** Sectioning roots that establish their own outline context. *)
132132+type sectioning_root_tag = [
133133+ | `Blockquote | `Body | `Details | `Dialog | `Fieldset | `Figure | `Td
134134+]
135135+136136+(** Embedded content elements. *)
137137+type embedded_tag = [
138138+ | `Audio | `Canvas | `Embed | `Iframe | `Img | `Object | `Picture | `Video
139139+]
140140+141141+(** Interactive content elements (focusable/activatable). *)
142142+type interactive_tag = [
143143+ | `A | `Audio | `Button | `Details | `Embed | `Iframe | `Img
144144+ | `Input | `Label | `Select | `Textarea | `Video
145145+]
146146+147147+(** Form-associated elements that can belong to a form. *)
148148+type form_associated_tag = [
149149+ | `Button | `Fieldset | `Input | `Label | `Object | `Output
150150+ | `Select | `Textarea | `Meter | `Progress
151151+]
152152+153153+(** Labelable elements that can be associated with a label. *)
154154+type labelable_tag = [
155155+ | `Button | `Input | `Meter | `Output | `Progress | `Select | `Textarea
156156+]
157157+158158+(** Submittable form elements. *)
159159+type submittable_tag = [
160160+ | `Button | `Input | `Select | `Textarea
161161+]
162162+163163+(** Resettable form elements. *)
164164+type resettable_tag = [
165165+ | `Input | `Output | `Select | `Textarea
166166+]
167167+168168+(** Table-related elements. *)
169169+type table_tag = [
170170+ | `Caption | `Col | `Colgroup | `Table | `Tbody | `Td | `Tfoot
171171+ | `Th | `Thead | `Tr
172172+]
173173+174174+(** Media elements (audio and video). *)
175175+type media_tag = [ `Audio | `Video ]
176176+177177+(** List container elements. *)
178178+type list_container_tag = [ `Ul | `Ol | `Menu | `Dl ]
179179+180180+(** List item elements. *)
181181+type list_item_tag = [ `Li | `Dd | `Dt ]
182182+183183+(** Script-supporting elements. *)
184184+type script_supporting_tag = [ `Script | `Template ]
185185+186186+(** Metadata content elements. *)
187187+type metadata_tag = [ `Base | `Link | `Meta | `Noscript | `Script | `Style | `Template | `Title ]
188188+189189+(** {1 Top-Level Element Type} *)
190190+191191+(** Top-level element classification.
192192+193193+ Elements are classified by namespace and recognition status:
194194+ - [Html tag] - A known HTML5 element
195195+ - [Svg name] - An SVG element (preserves original case)
196196+ - [MathML name] - A MathML element (preserves original case)
197197+ - [Custom name] - A custom element (contains hyphen)
198198+ - [Unknown name] - An unrecognized element *)
199199+type element_tag =
200200+ | Html of html_tag
201201+ | Svg of string
202202+ | MathML of string
203203+ | Custom of string
204204+ | Unknown of string
205205+206206+(** {1 Namespace Constants} *)
207207+208208+val svg_namespace : string
209209+(** The SVG namespace URI: ["http://www.w3.org/2000/svg"]. *)
210210+211211+val mathml_namespace : string
212212+(** The MathML namespace URI: ["http://www.w3.org/1998/Math/MathML"]. *)
213213+214214+(** {1 Conversion Functions} *)
215215+216216+val html_tag_of_string_opt : string -> html_tag option
217217+(** [html_tag_of_string_opt name] converts a lowercase tag name to an [html_tag].
218218+219219+ @param name The lowercase tag name (e.g., ["div"], ["span"])
220220+ @return [Some tag] if recognized, [None] otherwise
221221+222222+ {b Example:}
223223+ {[
224224+ html_tag_of_string_opt "div" (* Some `Div *)
225225+ html_tag_of_string_opt "xyz" (* None *)
226226+ ]} *)
227227+228228+val is_custom_element_name : string -> bool
229229+(** [is_custom_element_name name] checks if a name is a valid custom element name.
230230+231231+ A valid custom element name must contain a hyphen and not be reserved
232232+ (e.g., not start with "xml" or be "annotation-xml").
233233+234234+ @param name The element name to check
235235+ @return [true] if the name is a valid custom element name *)
236236+237237+val is_svg_namespace : string -> bool
238238+(** [is_svg_namespace ns] checks if a namespace string represents SVG.
239239+240240+ Accepts both the short form ["svg"] and the full URI. *)
241241+242242+val is_mathml_namespace : string -> bool
243243+(** [is_mathml_namespace ns] checks if a namespace string represents MathML.
244244+245245+ Accepts both the short form ["mathml"] and the full URI. *)
246246+247247+val tag_of_string : ?namespace:string -> string -> element_tag
248248+(** [tag_of_string ?namespace name] converts a tag name to an [element_tag].
249249+250250+ @param namespace Optional namespace URI or short form
251251+ @param name The element name
252252+ @return The classified element tag
253253+254254+ {b Example:}
255255+ {[
256256+ tag_of_string "div" (* Html `Div *)
257257+ tag_of_string ~namespace:"svg" "circle" (* Svg "circle" *)
258258+ tag_of_string "my-component" (* Custom "my-component" *)
259259+ tag_of_string "xyz" (* Unknown "xyz" *)
260260+ ]} *)
261261+262262+val html_tag_to_string : html_tag -> string
263263+(** [html_tag_to_string tag] converts an [html_tag] to its lowercase string name.
264264+265265+ @param tag The HTML tag variant
266266+ @return The lowercase tag name (e.g., ["div"], ["span"]) *)
267267+268268+val tag_to_string : element_tag -> string
269269+(** [tag_to_string tag] converts any [element_tag] to its string name.
270270+271271+ @param tag The element tag
272272+ @return The tag name (lowercase for HTML, original case for SVG/MathML) *)
273273+274274+(** {1 Category Predicates} *)
275275+276276+val is_void : html_tag -> bool
277277+(** [is_void tag] checks if an element is a void element (cannot have children).
278278+279279+ @param tag The HTML tag to check
280280+ @return [true] if the element is void (br, hr, img, input, etc.) *)
281281+282282+val is_heading : html_tag -> bool
283283+(** [is_heading tag] checks if an element is a heading element.
284284+285285+ @param tag The HTML tag to check
286286+ @return [true] if the element is h1-h6 *)
287287+288288+val heading_level : html_tag -> int option
289289+(** [heading_level tag] gets the heading level (1-6) if applicable.
290290+291291+ @param tag The HTML tag to check
292292+ @return [Some level] for h1-h6, [None] for other elements *)
293293+294294+val is_sectioning : html_tag -> bool
295295+(** [is_sectioning tag] checks if an element is sectioning content.
296296+297297+ @param tag The HTML tag to check
298298+ @return [true] if the element is article, aside, nav, or section *)
299299+300300+val is_sectioning_root : html_tag -> bool
301301+(** [is_sectioning_root tag] checks if an element is a sectioning root.
302302+303303+ Sectioning roots establish their own outline context.
304304+305305+ @param tag The HTML tag to check
306306+ @return [true] if the element is blockquote, body, details, dialog,
307307+ fieldset, figure, or td *)
308308+309309+val is_embedded : html_tag -> bool
310310+(** [is_embedded tag] checks if an element is embedded content.
311311+312312+ @param tag The HTML tag to check
313313+ @return [true] if the element is audio, canvas, embed, iframe, img,
314314+ object, picture, or video *)
315315+316316+val is_interactive : html_tag -> bool
317317+(** [is_interactive tag] checks if an element is interactive content.
318318+319319+ @param tag The HTML tag to check
320320+ @return [true] if the element is focusable or activatable *)
321321+322322+val is_form_associated : html_tag -> bool
323323+(** [is_form_associated tag] checks if an element is form-associated.
324324+325325+ @param tag The HTML tag to check
326326+ @return [true] if the element can belong to a form *)
327327+328328+val is_labelable : html_tag -> bool
329329+(** [is_labelable tag] checks if an element can be associated with a label.
330330+331331+ @param tag The HTML tag to check
332332+ @return [true] if the element is labelable *)
333333+334334+val is_submittable : html_tag -> bool
335335+(** [is_submittable tag] checks if an element is a submittable form element.
336336+337337+ @param tag The HTML tag to check
338338+ @return [true] if the element is button, input, select, or textarea *)
339339+340340+val is_resettable : html_tag -> bool
341341+(** [is_resettable tag] checks if an element is a resettable form element.
342342+343343+ @param tag The HTML tag to check
344344+ @return [true] if the element is input, output, select, or textarea *)
345345+346346+val is_transparent : html_tag -> bool
347347+(** [is_transparent tag] checks if an element has a transparent content model.
348348+349349+ Transparent elements inherit their content model from their parent.
350350+351351+ @param tag The HTML tag to check
352352+ @return [true] if the element is transparent (a, abbr, audio, canvas, etc.) *)
353353+354354+val is_script_supporting : html_tag -> bool
355355+(** [is_script_supporting tag] checks if an element is script-supporting.
356356+357357+ @param tag The HTML tag to check
358358+ @return [true] if the element is script or template *)
359359+360360+val is_table_element : html_tag -> bool
361361+(** [is_table_element tag] checks if an element is a table-related element.
362362+363363+ @param tag The HTML tag to check
364364+ @return [true] if the element is table, tr, td, th, etc. *)
365365+366366+val is_media : html_tag -> bool
367367+(** [is_media tag] checks if an element is a media element.
368368+369369+ @param tag The HTML tag to check
370370+ @return [true] if the element is audio or video *)
371371+372372+val is_list_container : html_tag -> bool
373373+(** [is_list_container tag] checks if an element is a list container.
374374+375375+ @param tag The HTML tag to check
376376+ @return [true] if the element is ul, ol, menu, or dl *)
377377+378378+val is_list_item : html_tag -> bool
379379+(** [is_list_item tag] checks if an element is a list item.
380380+381381+ @param tag The HTML tag to check
382382+ @return [true] if the element is li, dd, or dt *)
383383+384384+val is_metadata : html_tag -> bool
385385+(** [is_metadata tag] checks if an element is metadata content.
386386+387387+ @param tag The HTML tag to check
388388+ @return [true] if the element is base, link, meta, etc. *)
389389+390390+val is_obsolete : html_tag -> bool
391391+(** [is_obsolete tag] checks if an element is deprecated/obsolete.
392392+393393+ @param tag The HTML tag to check
394394+ @return [true] if the element is applet, font, marquee, etc. *)
395395+396396+val is_raw_text : html_tag -> bool
397397+(** [is_raw_text tag] checks if an element is a raw text element.
398398+399399+ Raw text elements contain unparsed text content.
400400+401401+ @param tag The HTML tag to check
402402+ @return [true] if the element is script or style *)
403403+404404+val is_escapable_raw_text : html_tag -> bool
405405+(** [is_escapable_raw_text tag] checks if an element is escapable raw text.
406406+407407+ @param tag The HTML tag to check
408408+ @return [true] if the element is textarea or title *)
409409+410410+val is_phrasing : html_tag -> bool
411411+(** [is_phrasing tag] checks if an element is phrasing content.
412412+413413+ Phrasing content is inline-level content that forms paragraphs.
414414+415415+ @param tag The HTML tag to check
416416+ @return [true] if the element is phrasing content *)
417417+418418+val is_flow : html_tag -> bool
419419+(** [is_flow tag] checks if an element is flow content.
420420+421421+ Flow content is most elements that can appear in the body.
422422+423423+ @param tag The HTML tag to check
424424+ @return [true] if the element is flow content *)
425425+426426+(** {1 Pattern Matching Helpers} *)
427427+428428+val as_html_tag : element_tag -> html_tag option
429429+(** [as_html_tag tag] extracts the HTML tag if present.
430430+431431+ @param tag The element tag
432432+ @return [Some html_tag] if [tag] is [Html html_tag], [None] otherwise *)
433433+434434+val is_html_tag : html_tag -> element_tag -> bool
435435+(** [is_html_tag expected tag] checks if [tag] matches the expected HTML tag.
436436+437437+ @param expected The expected HTML tag variant
438438+ @param tag The element tag to check
439439+ @return [true] if [tag] is [Html expected] *)
+5
lib/htmlrw_check/error_code.ml
···119119 | `For_id_mismatch
120120 | `Role_on_ancestor
121121 | `Role_on_for
122122+ | `Aria_label_on_ancestor
122123 | `Aria_label_on_for
123124]
124125···309310 | `Label `For_id_mismatch -> "label-for-mismatch"
310311 | `Label `Role_on_ancestor -> "role-on-label"
311312 | `Label `Role_on_for -> "role-on-label"
313313+ | `Label `Aria_label_on_ancestor -> "aria-label-on-label"
312314 | `Label `Aria_label_on_for -> "aria-label-on-label"
313315314316 (* Input errors *)
···624626 | `Label `Role_on_for ->
625627 Printf.sprintf "The %s attribute must not be used on any %s element that is associated with a labelable element."
626628 (q "role") (q "label")
629629+ | `Label `Aria_label_on_ancestor ->
630630+ Printf.sprintf "The %s attribute must not be used on any %s element that is an ancestor of a labelable element."
631631+ (q "aria-label") (q "label")
627632 | `Label `Aria_label_on_for ->
628633 Printf.sprintf "The %s attribute must not be used on any %s element that is associated with a labelable element."
629634 (q "aria-label") (q "label")
+5
lib/htmlrw_check/error_code.mli
···527527 Adding [role] to a label that wraps a form control
528528 breaks the implicit label association. *)
529529530530+ | `Aria_label_on_ancestor
531531+ (** [<label>] with [aria-label] is ancestor of labelable element.
532532+ [aria-label] on a label that wraps a form control creates
533533+ conflicting accessible names. *)
534534+530535 | `Role_on_for
531536 (** [<label>] with role uses [for] association.
532537 Labels with explicit [for] association must not have [role]. *)
+31
lib/htmlrw_check/semantic/autofocus_checker.mli
···11+(*---------------------------------------------------------------------------
22+ Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
33+ SPDX-License-Identifier: MIT
44+ ---------------------------------------------------------------------------*)
55+66+(** Autofocus attribute validation checker.
77+88+ This checker validates that only one element with the [autofocus] attribute
99+ exists within each dialog or popover context. HTML5 specifies that there
1010+ should be at most one autofocused element per autofocus scope.
1111+1212+ {2 Validation Rules}
1313+1414+ - Within each dialog element, only one descendant may have [autofocus]
1515+ - Within each popover element, only one descendant may have [autofocus]
1616+ - Nested dialogs and popovers create separate scopes
1717+1818+ {2 Error Messages}
1919+2020+ Reports [Multiple_autofocus] when more than one autofocus attribute is
2121+ found within the same scope.
2222+2323+ @see <https://html.spec.whatwg.org/multipage/interaction.html#the-autofocus-attribute>
2424+ HTML Standard: The autofocus attribute
2525+*)
2626+2727+val checker : Checker.t
2828+(** The autofocus checker instance.
2929+3030+ This checker can be registered with the checker registry and will be
3131+ invoked during DOM traversal to validate autofocus attribute usage. *)
···11+(*---------------------------------------------------------------------------
22+ Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
33+ SPDX-License-Identifier: MIT
44+ ---------------------------------------------------------------------------*)
55+66+(** Language detection and validation checker.
77+88+ This checker validates that the document's [lang] attribute matches the
99+ detected language of the content, and that the [dir] attribute is correct
1010+ for right-to-left (RTL) languages.
1111+1212+ {2 Detection Algorithm}
1313+1414+ The checker:
1515+ 1. Collects text content from the document body (up to 30720 characters)
1616+ 2. Skips text from certain elements (scripts, navigation, form controls)
1717+ 3. Skips foreign namespace content (SVG, MathML)
1818+ 4. Uses statistical language detection with >90% confidence threshold
1919+ 5. Handles Traditional vs Simplified Chinese detection
2020+2121+ {2 Validation Rules}
2222+2323+ - Documents should have a [lang] attribute on the [<html>] element
2424+ - The declared language should match the detected content language
2525+ - RTL languages (Arabic, Hebrew, Persian, Urdu, etc.) should have [dir="rtl"]
2626+2727+ {2 Error Messages}
2828+2929+ - [Wrong_lang]: The declared language doesn't match detected content
3030+ - [Missing_dir_rtl]: An RTL language is detected but no [dir] attribute is present
3131+ - [Wrong_dir]: The [dir] attribute doesn't match the detected RTL language
3232+3333+ @see <https://html.spec.whatwg.org/multipage/dom.html#the-lang-and-xml:lang-attributes>
3434+ HTML Standard: The lang attribute
3535+*)
3636+3737+val checker : Checker.t
3838+(** The language detection checker instance.
3939+4040+ This checker collects text during DOM traversal and performs language
4141+ detection at document end. *)
+2-1
lib/htmlrw_check/semantic/option_checker.ml
···4949 (match state.option_stack with
5050 | ctx :: rest ->
5151 state.option_stack <- rest;
5252- if not ctx.has_text && not ctx.has_label then
5252+ (* Empty label attribute doesn't count as a valid label *)
5353+ if not ctx.has_text && (not ctx.has_label || ctx.label_empty) then
5354 Message_collector.add_typed collector (`Misc `Option_empty_without_label)
5455 | [] -> ())
5556 | _ -> ()
+32
lib/htmlrw_check/semantic/option_checker.mli
···11+(*---------------------------------------------------------------------------
22+ Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
33+ SPDX-License-Identifier: MIT
44+ ---------------------------------------------------------------------------*)
55+66+(** Option element validation checker.
77+88+ This checker validates that [<option>] elements have proper content or
99+ a [label] attribute. Empty options without labels can be confusing for
1010+ users, especially those using assistive technologies.
1111+1212+ {2 Validation Rules}
1313+1414+ - An [<option>] element must have either:
1515+ - Non-whitespace text content, OR
1616+ - A non-empty [label] attribute
1717+ - Empty [label] attribute values are reported as errors
1818+ - Options inside [<template>] elements are not checked
1919+2020+ {2 Error Messages}
2121+2222+ - [Option_empty_without_label]: Option has no text and no non-empty [label] attribute
2323+ - [Bad_value] for label: The label attribute value is empty
2424+2525+ @see <https://html.spec.whatwg.org/multipage/form-elements.html#the-option-element>
2626+ HTML Standard: The option element
2727+*)
2828+2929+val checker : Checker.t
3030+(** The option element checker instance.
3131+3232+ This checker validates option elements during DOM traversal. *)
···11+(*---------------------------------------------------------------------------
22+ Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
33+ SPDX-License-Identifier: MIT
44+ ---------------------------------------------------------------------------*)
55+66+(** Attribute restrictions checker.
77+88+ This checker validates that certain attributes are not used on elements
99+ where they are not allowed. It catches common misuses such as:
1010+1111+ - RDFa-style [href] on elements like [<img>], [<p>], [<div>]
1212+ - [src] or [media] on [<a>] elements
1313+ - [srcset] on [<audio>], [<video>], and [<object>] elements
1414+1515+ {2 Validation Rules}
1616+1717+ The checker maintains a list of (element, disallowed_attributes) pairs
1818+ for both HTML and SVG elements. When an element is encountered with
1919+ a disallowed attribute, an error is reported.
2020+2121+ {2 Error Messages}
2222+2323+ Reports [Not_allowed] when an attribute is used on an element where
2424+ it is not permitted.
2525+2626+ @see <https://html.spec.whatwg.org/multipage/dom.html#element-definitions>
2727+ HTML Standard: Element definitions
2828+*)
2929+3030+val checker : Checker.t
3131+(** The attribute restrictions checker instance. *)
+28
lib/htmlrw_check/specialized/base_checker.mli
···11+(*---------------------------------------------------------------------------
22+ Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
33+ SPDX-License-Identifier: MIT
44+ ---------------------------------------------------------------------------*)
55+66+(** Base element ordering checker.
77+88+ This checker validates that the [<base>] element appears before any
99+ elements that may use URLs resolved against the base URL. Specifically,
1010+ [<base>] should appear before [<link>] and [<script>] elements.
1111+1212+ {2 Validation Rules}
1313+1414+ - [<base>] must appear before any [<link>] elements
1515+ - [<base>] must appear before any [<script>] elements
1616+ - The order is significant for URL resolution in the document
1717+1818+ {2 Error Messages}
1919+2020+ Reports [Base_after_link_script] when a [<base>] element is found
2121+ after [<link>] or [<script>] elements.
2222+2323+ @see <https://html.spec.whatwg.org/multipage/semantics.html#the-base-element>
2424+ HTML Standard: The base element
2525+*)
2626+2727+val checker : Checker.t
2828+(** The base element ordering checker instance. *)
-3
lib/htmlrw_check/specialized/datetime_checker.ml
···55(** Elements that have datetime attribute *)
66let datetime_elements = ["del"; "ins"; "time"]
7788-(** Helper: check if char is digit *)
99-let is_digit c = c >= '0' && c <= '9'
1010-118(** Parse int safely *)
129let parse_int s =
1310 try Some (int_of_string s) with _ -> None
+43
lib/htmlrw_check/specialized/datetime_checker.mli
···11+(*---------------------------------------------------------------------------
22+ Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
33+ SPDX-License-Identifier: MIT
44+ ---------------------------------------------------------------------------*)
55+66+(** Datetime attribute validation checker.
77+88+ This checker validates the [datetime] attribute on [<del>], [<ins>],
99+ and [<time>] elements. The datetime value must conform to a valid
1010+ date, time, or datetime format as specified by HTML5.
1111+1212+ {2 Supported Formats}
1313+1414+ The checker validates these datetime formats:
1515+ - Date: [YYYY-MM-DD] (e.g., "2025-12-19")
1616+ - Month: [YYYY-MM] (e.g., "2025-12")
1717+ - Year: [YYYY] (e.g., "2025")
1818+ - Week: [YYYY-Www] (e.g., "2025-W51")
1919+ - Time: [HH:MM] or [HH:MM:SS] (e.g., "14:30:00")
2020+ - Datetime: Date followed by time with separator (e.g., "2025-12-19T14:30")
2121+ - Timezone offsets: [+HH:MM] or [-HH:MM] or [Z]
2222+ - Duration: [P] prefix followed by duration components
2323+2424+ {2 Validation Rules}
2525+2626+ - Month values must be 01-12
2727+ - Day values must be valid for the given month
2828+ - Leap years are correctly handled for February 29th
2929+ - Hour values must be 00-23
3030+ - Minute and second values must be 00-59
3131+ - Week numbers must be 01-53
3232+3333+ {2 Error Messages}
3434+3535+ Reports [Bad_value] when the datetime attribute contains an invalid
3636+ format or out-of-range values.
3737+3838+ @see <https://html.spec.whatwg.org/multipage/common-microsyntaxes.html#dates-and-times>
3939+ HTML Standard: Dates and times
4040+*)
4141+4242+val checker : Checker.t
4343+(** The datetime attribute checker instance. *)
+37
lib/htmlrw_check/specialized/dl_checker.mli
···11+(*---------------------------------------------------------------------------
22+ Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
33+ SPDX-License-Identifier: MIT
44+ ---------------------------------------------------------------------------*)
55+66+(** DL element content model validation checker.
77+88+ This checker validates that [<dl>] (description list) elements follow
99+ the HTML5 content model requirements. Description lists must contain
1010+ [<dt>] (term) and [<dd>] (description) elements in the correct order.
1111+1212+ {2 Content Model}
1313+1414+ A [<dl>] element may contain:
1515+ - Zero or more groups of [<dt>] followed by [<dd>] elements
1616+ - [<div>] elements wrapping [<dt>]/[<dd>] groups (for styling)
1717+ - [<template>] and [<script>] elements (script-supporting)
1818+1919+ {2 Validation Rules}
2020+2121+ - [<dd>] should not appear before any [<dt>] (terms should come first)
2222+ - [<dl>] should not be empty (should contain at least one term/description)
2323+ - When using [<div>] wrappers, mixing wrapped and unwrapped content
2424+ is discouraged
2525+ - Each [<div>] in a [<dl>] should contain at least one [<dt>]/[<dd>] group
2626+2727+ {2 Error Messages}
2828+2929+ - [Dl_empty]: The [<dl>] element has no content
3030+ - [Dd_before_dt]: A [<dd>] appears before any [<dt>] element
3131+3232+ @see <https://html.spec.whatwg.org/multipage/grouping-content.html#the-dl-element>
3333+ HTML Standard: The dl element
3434+*)
3535+3636+val checker : Checker.t
3737+(** The description list content model checker instance. *)
+35
lib/htmlrw_check/specialized/h1_checker.mli
···11+(*---------------------------------------------------------------------------
22+ Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
33+ SPDX-License-Identifier: MIT
44+ ---------------------------------------------------------------------------*)
55+66+(** H1 element counter and validator.
77+88+ This checker warns about multiple [<h1>] elements in a document.
99+ While HTML5 technically allows multiple [<h1>] elements when using
1010+ the document outline algorithm, this algorithm was never implemented
1111+ by browsers and has been removed from the specification.
1212+1313+ {2 Best Practice}
1414+1515+ Documents should have exactly one [<h1>] element that represents the
1616+ main heading of the page. Multiple [<h1>] elements can confuse users
1717+ and assistive technologies about the document's structure.
1818+1919+ {2 Special Cases}
2020+2121+ - [<h1>] elements inside [<svg>] content (e.g., in [<foreignObject>])
2222+ are not counted, as they may represent different content contexts
2323+ - The checker reports a warning after the second [<h1>] is encountered
2424+2525+ {2 Error Messages}
2626+2727+ Reports [Multiple_h1] when more than one [<h1>] element is found
2828+ in the document.
2929+3030+ @see <https://html.spec.whatwg.org/multipage/sections.html#the-h1,-h2,-h3,-h4,-h5,-and-h6-elements>
3131+ HTML Standard: The h1-h6 elements
3232+*)
3333+3434+val checker : Checker.t
3535+(** The h1 element counter/validator instance. *)
+2
lib/htmlrw_check/specialized/label_checker.ml
···110110 | Tag.Html `Label when state.label_depth = 0 ->
111111 if state.label_has_role && state.labelable_count > 0 then
112112 Message_collector.add_typed collector (`Label `Role_on_ancestor);
113113+ if state.label_has_aria_label && state.labelable_count > 0 then
114114+ Message_collector.add_typed collector (`Label `Aria_label_on_ancestor);
113115 state.in_label <- false;
114116 state.labelable_count <- 0;
115117 state.label_for_value <- None;
+41
lib/htmlrw_check/specialized/label_checker.mli
···11+(*---------------------------------------------------------------------------
22+ Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
33+ SPDX-License-Identifier: MIT
44+ ---------------------------------------------------------------------------*)
55+66+(** Label element content model validation checker.
77+88+ This checker validates that [<label>] elements follow the HTML5
99+ content model requirements. Labels associate text with form controls
1010+ and must be used correctly for accessibility.
1111+1212+ {2 Validation Rules}
1313+1414+ - A [<label>] element may contain at most one labelable element
1515+ (button, input, meter, output, progress, select, textarea)
1616+ - When using the [for] attribute, it should reference an existing
1717+ element ID in the document
1818+ - Nested labelable elements are not counted (only direct descendants)
1919+2020+ {2 Labelable Elements}
2121+2222+ The following elements can be labeled:
2323+ - [<button>]
2424+ - [<input>] (except type="hidden")
2525+ - [<meter>]
2626+ - [<output>]
2727+ - [<progress>]
2828+ - [<select>]
2929+ - [<textarea>]
3030+3131+ {2 Error Messages}
3232+3333+ - Multiple labelable elements inside a single [<label>]
3434+ - [for] attribute references a non-existent ID
3535+3636+ @see <https://html.spec.whatwg.org/multipage/forms.html#the-label-element>
3737+ HTML Standard: The label element
3838+*)
3939+4040+val checker : Checker.t
4141+(** The label element content model checker instance. *)
+42
lib/htmlrw_check/specialized/picture_checker.mli
···11+(*---------------------------------------------------------------------------
22+ Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
33+ SPDX-License-Identifier: MIT
44+ ---------------------------------------------------------------------------*)
55+66+(** Picture element content model and attribute validation checker.
77+88+ This checker validates that [<picture>] elements follow the HTML5
99+ content model requirements and that attributes are used correctly.
1010+1111+ {2 Content Model}
1212+1313+ A [<picture>] element may contain:
1414+ - Zero or more [<source>] elements (must come before [<img>])
1515+ - Exactly one [<img>] element (required)
1616+ - [<script>] and [<template>] elements (script-supporting)
1717+1818+ {2 Attribute Restrictions}
1919+2020+ The [<picture>] element should not have image-related attributes
2121+ directly on it (these belong on the [<img>] child):
2222+ - [src], [srcset], [sizes], [alt], [width], [height]
2323+ - [crossorigin], [loading], [decoding]
2424+ - Legacy attributes like [align], [border], [hspace], etc.
2525+2626+ {2 Source Restrictions in Picture}
2727+2828+ When [<source>] is a child of [<picture>]:
2929+ - It must have [srcset] attribute (required)
3030+ - It should not have [src] attribute
3131+3232+ {2 Error Messages}
3333+3434+ - Disallowed attributes on [<picture>] or [<source>] in picture context
3535+ - Invalid parent elements for [<picture>]
3636+3737+ @see <https://html.spec.whatwg.org/multipage/embedded-content.html#the-picture-element>
3838+ HTML Standard: The picture element
3939+*)
4040+4141+val checker : Checker.t
4242+(** The picture element checker instance. *)
+36
lib/htmlrw_check/specialized/ruby_checker.mli
···11+(*---------------------------------------------------------------------------
22+ Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
33+ SPDX-License-Identifier: MIT
44+ ---------------------------------------------------------------------------*)
55+66+(** Ruby element content model validation checker.
77+88+ This checker validates that [<ruby>] elements follow the HTML5
99+ content model requirements. Ruby annotations are used for East Asian
1010+ typography to show pronunciation or meaning of characters.
1111+1212+ {2 Content Model}
1313+1414+ A [<ruby>] element must contain:
1515+ - Phrasing content (the base text)
1616+ - One or more [<rt>] elements (the ruby text/annotation)
1717+ - Optional [<rp>] elements (fallback parentheses)
1818+1919+ {2 Validation Rules}
2020+2121+ - [<ruby>] must contain at least one [<rt>] element
2222+ - There should be phrasing content before the first [<rt>]
2323+ - [<rp>] elements should surround [<rt>] for fallback rendering
2424+ - Nested [<ruby>] elements are handled correctly
2525+2626+ {2 Error Messages}
2727+2828+ - Ruby element without any [<rt>] child
2929+ - Missing base text before ruby annotation
3030+3131+ @see <https://html.spec.whatwg.org/multipage/text-level-semantics.html#the-ruby-element>
3232+ HTML Standard: The ruby element
3333+*)
3434+3535+val checker : Checker.t
3636+(** The ruby element content model checker instance. *)
+34
lib/htmlrw_check/specialized/source_checker.mli
···11+(*---------------------------------------------------------------------------
22+ Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
33+ SPDX-License-Identifier: MIT
44+ ---------------------------------------------------------------------------*)
55+66+(** Source element context validation checker.
77+88+ This checker validates that [<source>] element attributes are appropriate
99+ for the parent context. The allowed attributes differ based on whether
1010+ the source is inside [<picture>], [<video>], or [<audio>].
1111+1212+ {2 Context-Dependent Rules}
1313+1414+ In [<picture>] context:
1515+ - [srcset] is required
1616+ - [src] is not allowed
1717+ - [media] and [type] are allowed
1818+1919+ In [<video>] or [<audio>] context:
2020+ - [src] is required
2121+ - [srcset] and [sizes] are not allowed
2222+ - [type] is allowed for MIME type hints
2323+2424+ {2 Error Messages}
2525+2626+ - Missing required attributes for the context
2727+ - Attributes not allowed in the current context
2828+2929+ @see <https://html.spec.whatwg.org/multipage/embedded-content.html#the-source-element>
3030+ HTML Standard: The source element
3131+*)
3232+3333+val checker : Checker.t
3434+(** The source element context checker instance. *)
···6161let split_on_space_respecting_parens s =
6262 split_respecting_parens ~sep:' ' s |> List.filter (fun s -> s <> "")
63636464-(** Check if string contains only whitespace *)
6565-let is_whitespace_only s =
6666- String.for_all (fun c -> c = ' ' || c = '\t' || c = '\n' || c = '\r') s
6767-6864(** Invalid units that are not CSS lengths but might be confused for them *)
6965let invalid_size_units = [
7066 "deg"; "grad"; "rad"; "turn"; (* angle units *)
···154150 NoCommentError
155151 end
156152 end
157157-158158-(** For backward compatibility *)
159159-let has_invalid_css_comment s =
160160- match check_css_comment_position s with
161161- | NoCommentError -> false
162162- | _ -> true
163153164154(** Check if scientific notation has invalid exponent (like 1e+1.5 - decimal in exponent) *)
165155let has_invalid_scientific_notation s =
···280270 end
281271 end
282272 end
283283-284284-let has_valid_size_unit size_value =
285285- match check_size_value size_value with
286286- | Valid -> true
287287- | InvalidUnit (_, _) | NegativeValue | CssCommentAfterSign (_, _) | CssCommentBeforeUnit (_, _) | BadScientificNotation | BadCssNumber (_, _) -> false
288273289274(** Check if a sizes entry has a media condition (starts with '(') *)
290275let has_media_condition entry =
···11+(*---------------------------------------------------------------------------
22+ Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
33+ SPDX-License-Identifier: MIT
44+ ---------------------------------------------------------------------------*)
55+66+(** Srcset and sizes attribute validation checker.
77+88+ This checker validates the [srcset] and [sizes] attributes on [<img>]
99+ and [<source>] elements. These attributes use a specialized microsyntax
1010+ for responsive images.
1111+1212+ {2 Srcset Syntax}
1313+1414+ The [srcset] attribute contains a comma-separated list of image
1515+ candidates, each with:
1616+ - A URL
1717+ - An optional width descriptor ([Nw], e.g., "800w")
1818+ - Or an optional pixel density descriptor ([Nx], e.g., "2x")
1919+2020+ Width and pixel density descriptors cannot be mixed in the same srcset.
2121+2222+ {2 Sizes Syntax}
2323+2424+ The [sizes] attribute contains a comma-separated list of:
2525+ - Media conditions (optional)
2626+ - Source sizes (CSS lengths)
2727+2828+ The last entry should not have a media condition (it's the default).
2929+3030+ {2 Validation Rules}
3131+3232+ - URLs in srcset must be valid
3333+ - Width descriptors must be positive integers
3434+ - Pixel density descriptors must be positive numbers
3535+ - Sizes must use valid CSS length units
3636+ - Duplicate descriptors are flagged
3737+3838+ {2 Error Messages}
3939+4040+ - Invalid srcset syntax
4141+ - Invalid sizes syntax
4242+ - Missing sizes when srcset uses width descriptors
4343+ - Invalid CSS length units
4444+4545+ @see <https://html.spec.whatwg.org/multipage/images.html#srcset-attributes>
4646+ HTML Standard: Srcset attributes
4747+*)
4848+4949+val checker : Checker.t
5050+(** The srcset/sizes attribute checker instance. *)
···11+(*---------------------------------------------------------------------------
22+ Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
33+ SPDX-License-Identifier: MIT
44+ ---------------------------------------------------------------------------*)
55+66+(** Title element validation checker.
77+88+ This checker validates that documents have a proper [<title>] element
99+ with meaningful content. The title is important for accessibility,
1010+ SEO, and browser tab identification.
1111+1212+ {2 Validation Rules}
1313+1414+ - Documents should have exactly one [<title>] element in the [<head>]
1515+ - The [<title>] element should contain non-whitespace text
1616+ - Empty titles are flagged as errors
1717+1818+ {2 Error Messages}
1919+2020+ - [Empty_title]: The title element is empty or contains only whitespace
2121+ - [Missing_title]: No title element found in the document head
2222+2323+ @see <https://html.spec.whatwg.org/multipage/semantics.html#the-title-element>
2424+ HTML Standard: The title element
2525+*)
2626+2727+val checker : Checker.t
2828+(** The title element checker instance. *)
···11+(*---------------------------------------------------------------------------
22+ Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
33+ SPDX-License-Identifier: MIT
44+ ---------------------------------------------------------------------------*)
55+66+(** Unknown HTML element checker.
77+88+ This checker detects elements that are not in the HTML5 specification
99+ and produces appropriate error messages. Custom elements (names
1010+ containing hyphens) are allowed per the Web Components specification.
1111+1212+ {2 Recognized Elements}
1313+1414+ The checker recognizes all standard HTML5 elements including:
1515+ - Document metadata (html, head, title, etc.)
1616+ - Sections (body, article, section, nav, etc.)
1717+ - Grouping content (p, div, ul, ol, etc.)
1818+ - Text-level semantics (a, em, strong, span, etc.)
1919+ - Embedded content (img, video, audio, iframe, etc.)
2020+ - Tabular data (table, tr, td, th, etc.)
2121+ - Forms (form, input, button, select, etc.)
2222+ - Interactive elements (details, dialog, summary)
2323+ - Scripting (script, noscript, template)
2424+2525+ {2 Custom Elements}
2626+2727+ Element names containing a hyphen are treated as custom elements
2828+ and are allowed without warning (e.g., [<my-component>], [<app-header>]).
2929+3030+ {2 Error Messages}
3131+3232+ Reports [Unknown_element] for unrecognized element names that are
3333+ not valid custom elements.
3434+3535+ @see <https://html.spec.whatwg.org/multipage/custom-elements.html>
3636+ HTML Standard: Custom elements
3737+*)
3838+3939+val checker : Checker.t
4040+(** The unknown element checker instance. *)
+68
lib/htmlrw_check/specialized/url_checker.mli
···11+(*---------------------------------------------------------------------------
22+ Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
33+ SPDX-License-Identifier: MIT
44+ ---------------------------------------------------------------------------*)
55+66+(** URL validation checker.
77+88+ This checker validates URL attributes ([href], [src], [action], etc.)
99+ on HTML elements. It checks for common URL issues and security concerns.
1010+1111+ {2 Validated Attributes}
1212+1313+ The checker validates URLs in these attributes:
1414+ - [href] on [<a>], [<area>], [<base>], [<link>]
1515+ - [src] on [<audio>], [<embed>], [<iframe>], [<img>], [<input>],
1616+ [<script>], [<source>], [<track>], [<video>]
1717+ - [action] on [<form>], and [formaction] on [<button>]
1818+ - [cite] on [<blockquote>], [<del>], [<ins>], [<q>]
1919+ - [data] on [<object>]
2020+ - [poster] on [<video>]
2121+ - [value] on [<input type="url">]
2222+2323+ {2 Validation Rules}
2424+2525+ - URLs should be well-formed (parseable)
2626+ - Relative URLs are allowed
2727+ - Fragment-only URLs ([#anchor]) are valid
2828+ - Data URLs are validated for proper structure
2929+ - javascript: URLs may trigger warnings
3030+ - Empty URLs are flagged on elements that require them
3131+3232+ {2 Error Messages}
3333+3434+ - [Bad_url]: Malformed URL that cannot be parsed
3535+ - [Empty_url]: Required URL attribute is empty
3636+ - Various URL-specific validation errors
3737+3838+ @see <https://url.spec.whatwg.org/>
3939+ URL Standard
4040+*)
4141+4242+(** {1 URL Parsing Utilities} *)
4343+4444+val extract_scheme : string -> string option
4545+(** [extract_scheme url] extracts the scheme (protocol) from a URL.
4646+4747+ @param url The URL to parse
4848+ @return [Some scheme] if a valid scheme is found (e.g., "http", "https"),
4949+ [None] if no scheme is present or the URL is relative *)
5050+5151+val validate_url : string -> string -> string -> string option
5252+(** [validate_url url element_name attr_name] validates a URL.
5353+5454+ Performs comprehensive validation including:
5555+ - Checking for empty URLs on elements that require them
5656+ - Validating scheme, host, port, path, query, and fragment
5757+ - Checking for illegal characters and encoding issues
5858+ - Validating special schemes (http, https, etc.)
5959+6060+ @param url The URL to validate
6161+ @param element_name The element containing the URL attribute
6262+ @param attr_name The attribute name
6363+ @return [Some error_message] if the URL is invalid, [None] if valid *)
6464+6565+(** {1 Checker} *)
6666+6767+val checker : Checker.t
6868+(** The URL validation checker instance. *)
+56
lib/htmlrw_check/xhtml_parser.mli
···11+(*---------------------------------------------------------------------------
22+ Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
33+ SPDX-License-Identifier: MIT
44+ ---------------------------------------------------------------------------*)
55+66+(** XHTML parser using xmlm for proper XML parsing.
77+88+ This module provides XML parsing for XHTML files. While the HTML5 parser
99+ handles most content, XHTML requires proper XML parsing to correctly handle:
1010+1111+ - Self-closing tags on non-void elements (e.g., [<div/>])
1212+ - XML namespaces for SVG and MathML
1313+ - Strict XML well-formedness requirements
1414+1515+ {2 Usage}
1616+1717+ {[
1818+ if Xhtml_parser.is_xhtml_file (Some "page.xhtml") then
1919+ match Xhtml_parser.parse_xhtml content with
2020+ | Ok root -> (* Process the XHTML document root node *)
2121+ | Error msg -> (* Handle parse error *)
2222+ ]}
2323+*)
2424+2525+(** {1 Types} *)
2626+2727+type xhtml_doc = {
2828+ root : Html5rw.Dom.node;
2929+ (** The document root node. *)
3030+ errors : Html5rw.Error.t list;
3131+ (** Parse errors (empty for valid XML). *)
3232+}
3333+(** An XHTML document representation. *)
3434+3535+(** {1 Parsing} *)
3636+3737+val parse_xhtml : string -> (Html5rw.Dom.node, string) result
3838+(** [parse_xhtml content] parses XHTML content using xmlm.
3939+4040+ @param content The XHTML content as a string
4141+ @return [Ok root] with the document root on success,
4242+ [Error message] with parse error details on failure *)
4343+4444+val is_xhtml_file : string option -> bool
4545+(** [is_xhtml_file system_id] checks if a system_id indicates an XHTML file.
4646+4747+ @param system_id The optional file path or identifier
4848+ @return [true] if the path ends with ".xhtml" *)
4949+5050+(** {1 Document Access} *)
5151+5252+val xhtml_root : xhtml_doc -> Html5rw.Dom.node
5353+(** [xhtml_root doc] returns the document root node. *)
5454+5555+val xhtml_errors : xhtml_doc -> Html5rw.Error.t list
5656+(** [xhtml_errors doc] returns the parse errors (always empty for XHTML). *)