···11+(*---------------------------------------------------------------------------
22+ Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
33+ SPDX-License-Identifier: MIT
44+---------------------------------------------------------------------------*)
55+66+(** html5check - HTML5 conformance checker CLI
77+88+ Command line interface for validating HTML5 documents. *)
99+1010+open Cmdliner
1111+1212+let version = "0.1.0"
1313+1414+(** Exit codes *)
1515+module Exit_code = struct
1616+ let ok = Cmd.Exit.ok
1717+ let validation_errors = 1
1818+ let io_error = 2
1919+end
2020+2121+(** Read input from file or stdin *)
2222+let read_input file =
2323+ try
2424+ let ic =
2525+ if file = "-" then stdin
2626+ else open_in file
2727+ in
2828+ let reader = Bytesrw.Bytes.Reader.of_in_channel ic in
2929+ Ok (reader, ic, file)
3030+ with
3131+ | Sys_error msg ->
3232+ Error (`Io_error (Printf.sprintf "Cannot read file '%s': %s" file msg))
3333+3434+(** Format output based on the requested format *)
3535+let format_output format result =
3636+ match format with
3737+ | `Text -> Html5_checker.format_text result
3838+ | `Json -> Html5_checker.format_json result
3939+ | `Gnu -> Html5_checker.format_gnu result
4040+4141+(** Run the validation *)
4242+let run format errors_only exit_zero quiet verbose file =
4343+ match read_input file with
4444+ | Error (`Io_error msg) ->
4545+ if not quiet then Printf.eprintf "Error: %s\n" msg;
4646+ Exit_code.io_error
4747+ | Ok (reader, ic, system_id) ->
4848+ (* Run validation *)
4949+ let result = Html5_checker.check ~system_id reader in
5050+5151+ (* Close input if it's not stdin *)
5252+ if file <> "-" then close_in ic;
5353+5454+ (* Get messages based on filtering *)
5555+ let messages =
5656+ if errors_only then Html5_checker.errors result
5757+ else Html5_checker.messages result
5858+ in
5959+6060+ (* Output based on mode *)
6161+ if quiet then begin
6262+ (* Only show counts *)
6363+ let error_count = List.length (Html5_checker.errors result) in
6464+ let warning_count = List.length (Html5_checker.warnings result) in
6565+ if errors_only then
6666+ Printf.printf "%d error%s\n" error_count (if error_count = 1 then "" else "s")
6767+ else
6868+ Printf.printf "%d error%s, %d warning%s\n"
6969+ error_count (if error_count = 1 then "" else "s")
7070+ warning_count (if warning_count = 1 then "" else "s")
7171+ end else begin
7272+ (* Format and print messages *)
7373+ let output = format_output format result in
7474+ if output <> "" then print_string output;
7575+7676+ (* Show summary if verbose *)
7777+ if verbose && messages <> [] then begin
7878+ let error_count = List.length (Html5_checker.errors result) in
7979+ let warning_count = List.length (Html5_checker.warnings result) in
8080+ Printf.eprintf "\nSummary: %d error%s, %d warning%s\n"
8181+ error_count (if error_count = 1 then "" else "s")
8282+ warning_count (if warning_count = 1 then "" else "s")
8383+ end
8484+ end;
8585+8686+ (* Determine exit code *)
8787+ if exit_zero || not (Html5_checker.has_errors result) then
8888+ Exit_code.ok
8989+ else
9090+ Exit_code.validation_errors
9191+9292+(** Command line argument definitions *)
9393+9494+let format_arg =
9595+ let formats = [("text", `Text); ("json", `Json); ("gnu", `Gnu)] in
9696+ let doc =
9797+ "Output format. $(docv) must be one of $(b,text) (human-readable, default), \
9898+ $(b,json) (Nu validator compatible JSON), or $(b,gnu) (GNU-style for IDE integration)."
9999+ in
100100+ Arg.(value & opt (enum formats) `Text & info ["format"] ~docv:"FORMAT" ~doc)
101101+102102+let errors_only_arg =
103103+ let doc = "Only show errors (suppress warnings)." in
104104+ Arg.(value & flag & info ["errors-only"] ~doc)
105105+106106+let exit_zero_arg =
107107+ let doc =
108108+ "Always exit with status code 0, even if validation errors are found. \
109109+ Useful for CI pipelines where you want to collect validation results \
110110+ but not fail the build."
111111+ in
112112+ Arg.(value & flag & info ["exit-zero"] ~doc)
113113+114114+let quiet_arg =
115115+ let doc = "Quiet mode - only show error and warning counts, no details." in
116116+ Arg.(value & flag & info ["q"; "quiet"] ~doc)
117117+118118+let verbose_arg =
119119+ let doc = "Verbose mode - show additional information including summary." in
120120+ Arg.(value & flag & info ["v"; "verbose"] ~doc)
121121+122122+let file_arg =
123123+ let doc =
124124+ "HTML file to validate. Use $(b,-) to read from standard input. \
125125+ If no file is specified, reads from stdin."
126126+ in
127127+ Arg.(value & pos 0 string "-" & info [] ~docv:"FILE" ~doc)
128128+129129+let cmd =
130130+ let doc = "validate HTML5 documents for conformance" in
131131+ let man = [
132132+ `S Manpage.s_description;
133133+ `P "$(tname) validates HTML5 documents against the WHATWG HTML5 specification. \
134134+ It reports parse errors, structural validation issues, and conformance problems.";
135135+ `P "The validator checks for:";
136136+ `I ("Parse errors", "Malformed HTML syntax according to the WHATWG specification");
137137+ `I ("Content model violations", "Elements in invalid parent/child relationships");
138138+ `I ("Attribute errors", "Invalid or missing required attributes");
139139+ `I ("Structural issues", "Other conformance problems");
140140+ `S Manpage.s_options;
141141+ `S "OUTPUT FORMATS";
142142+ `P "The validator supports three output formats:";
143143+ `I ("$(b,text)", "Human-readable format showing file:line:col: severity: message");
144144+ `I ("$(b,json)", "JSON format compatible with the Nu Html Checker (v.Nu)");
145145+ `I ("$(b,gnu)", "GNU-style format for IDE integration (file:line:column: message)");
146146+ `S "EXIT STATUS";
147147+ `P "The validator exits with one of the following status codes:";
148148+ `I ("0", "No validation errors found (or --exit-zero was specified)");
149149+ `I ("1", "Validation errors were found");
150150+ `I ("2", "File not found or I/O error");
151151+ `S Manpage.s_examples;
152152+ `P "Validate a file:";
153153+ `Pre " $(mname) index.html";
154154+ `P "Validate from stdin:";
155155+ `Pre " cat page.html | $(mname) -";
156156+ `P "Show only errors in JSON format:";
157157+ `Pre " $(mname) --format=json --errors-only page.html";
158158+ `P "Quiet mode for CI:";
159159+ `Pre " $(mname) --quiet --exit-zero index.html";
160160+ `S Manpage.s_bugs;
161161+ `P "Report bugs at https://tangled.org/@anil.recoil.org/ocaml-html5rw/issues";
162162+ ] in
163163+ let info = Cmd.info "html5check" ~version ~doc ~man in
164164+ Cmd.v info Term.(const run $ format_arg $ errors_only_arg $ exit_zero_arg
165165+ $ quiet_arg $ verbose_arg $ file_arg)
166166+167167+let main () = Cmd.eval' cmd
168168+let () = Stdlib.exit (main ())
···11+(** Base checker module for HTML5 conformance checking. *)
22+33+module type S = sig
44+ type state
55+66+ val create : unit -> state
77+ val reset : state -> unit
88+99+ val start_element :
1010+ state ->
1111+ name:string ->
1212+ namespace:string option ->
1313+ attrs:(string * string) list ->
1414+ Message_collector.t ->
1515+ unit
1616+1717+ val end_element :
1818+ state -> name:string -> namespace:string option -> Message_collector.t -> unit
1919+2020+ val characters : state -> string -> Message_collector.t -> unit
2121+ val end_document : state -> Message_collector.t -> unit
2222+end
2323+2424+type t = (module S)
2525+2626+(** No-operation checker implementation. *)
2727+module Noop = struct
2828+ type state = unit
2929+3030+ let create () = ()
3131+ let reset () = ()
3232+3333+ let start_element () ~name:_ ~namespace:_ ~attrs:_ _ = ()
3434+ let end_element () ~name:_ ~namespace:_ _ = ()
3535+ let characters () _ _ = ()
3636+ let end_document () _ = ()
3737+end
3838+3939+let noop () = (module Noop : S)
+177
lib/html5_checker/checker.mli
···11+(** Base checker module for HTML5 conformance checking.
22+33+ This module provides the core checker abstraction used throughout the
44+ html5_checker library. A checker validates HTML5 documents by observing
55+ DOM tree traversal events and emitting validation messages.
66+77+ {2 Design Overview}
88+99+ Checkers follow a SAX-like event model where they receive notifications
1010+ about elements, text, and document boundaries as a DOM tree is traversed.
1111+ This design allows for:
1212+1313+ - {b Stateful validation}: Each checker maintains its own state across
1414+ multiple events
1515+ - {b Composability}: Multiple checkers can validate the same document
1616+ simultaneously
1717+ - {b Efficiency}: DOM traversal happens once regardless of checker count
1818+1919+ {2 Checker Lifecycle}
2020+2121+ A checker progresses through these phases:
2222+2323+ 1. {b Creation}: Initialize with {!create} to set up initial state
2424+ 2. {b Traversal}: Receive {!start_element}, {!characters}, and
2525+ {!end_element} events as the DOM is walked
2626+ 3. {b Completion}: Finalize validation with {!end_document}
2727+ 4. {b Reset} (optional): Return to initial state with {!reset}
2828+2929+ {3 Event Sequence}
3030+3131+ For a document like [<p>Hello <b>world</b></p>], the event sequence is:
3232+3333+ {v
3434+ start_element "p"
3535+ characters "Hello "
3636+ start_element "b"
3737+ characters "world"
3838+ end_element "b"
3939+ end_element "p"
4040+ end_document
4141+ v}
4242+4343+ {2 First-Class Modules}
4444+4545+ Checkers are represented as first-class modules implementing the {!S}
4646+ signature. This allows:
4747+4848+ - Dynamic checker registration and discovery
4949+ - Heterogeneous collections of checkers
5050+ - Checker selection at runtime based on validation requirements
5151+5252+ @see <https://v2.ocaml.org/manual/firstclassmodules.html>
5353+ OCaml manual: First-class modules
5454+*)
5555+5656+(** {1 Module Signature} *)
5757+5858+(** The signature that all checker modules must implement.
5959+6060+ A checker module maintains validation state and receives notifications
6161+ about DOM tree traversal events. *)
6262+module type S = sig
6363+ (** The type of checker state.
6464+6565+ This is an abstract type that holds the checker's internal validation
6666+ state. Different checkers will have different state representations
6767+ depending on what they need to track during validation. *)
6868+ type state
6969+7070+ (** {1 Lifecycle Operations} *)
7171+7272+ val create : unit -> state
7373+ (** [create ()] initializes a new checker state.
7474+7575+ This function sets up the initial state needed for validation,
7676+ such as empty stacks for context tracking, counters, or lookup
7777+ tables. *)
7878+7979+ val reset : state -> unit
8080+ (** [reset state] resets the checker to its initial state.
8181+8282+ This allows reusing a checker for multiple documents without
8383+ reallocating. After reset, the checker behaves as if freshly
8484+ created with {!create}. *)
8585+8686+ (** {1 DOM Traversal Events} *)
8787+8888+ val start_element :
8989+ state ->
9090+ name:string ->
9191+ namespace:string option ->
9292+ attrs:(string * string) list ->
9393+ Message_collector.t ->
9494+ unit
9595+ (** [start_element state ~name ~namespace ~attrs collector] is called when
9696+ entering an element during DOM traversal.
9797+9898+ @param state The checker state
9999+ @param name The element tag name (e.g., "div", "p", "span")
100100+ @param namespace The element namespace ([None] for HTML, [Some "svg"]
101101+ for SVG, [Some "mathml"] for MathML)
102102+ @param attrs The element's attributes as [(name, value)] pairs
103103+ @param collector The message collector for emitting validation messages
104104+105105+ This is where checkers can validate:
106106+ - Whether the element is allowed in the current context
107107+ - Whether required attributes are present
108108+ - Whether attribute values are valid
109109+ - Whether the element opens a new validation context *)
110110+111111+ val end_element :
112112+ state -> name:string -> namespace:string option -> Message_collector.t -> unit
113113+ (** [end_element state ~name ~namespace collector] is called when exiting
114114+ an element during DOM traversal.
115115+116116+ @param state The checker state
117117+ @param name The element tag name
118118+ @param namespace The element namespace
119119+ @param collector The message collector for emitting validation messages
120120+121121+ This is where checkers can:
122122+ - Pop validation contexts from stacks
123123+ - Validate that required child elements were present
124124+ - Emit messages about element-scoped validation rules *)
125125+126126+ val characters : state -> string -> Message_collector.t -> unit
127127+ (** [characters state text collector] is called when text content is
128128+ encountered during DOM traversal.
129129+130130+ @param state The checker state
131131+ @param text The text content
132132+ @param collector The message collector for emitting validation messages
133133+134134+ This is where checkers can validate:
135135+ - Whether text is allowed in the current context
136136+ - Whether text content follows specific patterns
137137+ - Whether text matches expected formats *)
138138+139139+ val end_document : state -> Message_collector.t -> unit
140140+ (** [end_document state collector] is called after the entire DOM tree has
141141+ been traversed.
142142+143143+ @param state The checker state
144144+ @param collector The message collector for emitting validation messages
145145+146146+ This is where checkers can:
147147+ - Emit messages about missing required elements
148148+ - Validate document-level constraints
149149+ - Check that all opened contexts were properly closed
150150+ - Report any accumulated validation failures *)
151151+end
152152+153153+(** {1 Checker Values} *)
154154+155155+(** The type of a checker value.
156156+157157+ This is a packed first-class module containing both the checker
158158+ implementation and its state. It enables storing heterogeneous
159159+ checkers in collections and passing them around dynamically. *)
160160+type t = (module S)
161161+162162+(** {1 Built-in Checkers} *)
163163+164164+val noop : unit -> t
165165+(** [noop ()] creates a no-operation checker that performs no validation.
166166+167167+ This checker ignores all events and never emits messages. It is useful:
168168+ - As a placeholder in checker registries
169169+ - For testing checker infrastructure
170170+ - As a base for building new checkers
171171+172172+ {b Example:}
173173+ {[
174174+ let checker = noop () in
175175+ (* Does nothing when walked over a DOM tree *)
176176+ ]}
177177+*)
+22
lib/html5_checker/checker_registry.ml
···11+(** Registry for HTML5 conformance checkers. *)
22+33+type t = (string, Checker.t) Hashtbl.t
44+55+let create () = Hashtbl.create 16
66+77+let default () =
88+ (* In Phase 1, return an empty registry.
99+ Built-in checkers will be added in later phases. *)
1010+ create ()
1111+1212+let register registry name checker = Hashtbl.replace registry name checker
1313+1414+let unregister registry name = Hashtbl.remove registry name
1515+1616+let get registry name = Hashtbl.find_opt registry name
1717+1818+let list_names registry =
1919+ Hashtbl.to_seq_keys registry |> List.of_seq
2020+2121+let all registry =
2222+ Hashtbl.to_seq_values registry |> List.of_seq
+156
lib/html5_checker/checker_registry.mli
···11+(** Registry for HTML5 conformance checkers.
22+33+ This module provides a dynamic registry for managing collections of
44+ checkers. It enables:
55+66+ - {b Registration}: Add checkers under descriptive names
77+ - {b Discovery}: Retrieve checkers by name or list all available ones
88+ - {b Lifecycle management}: Register and unregister checkers at runtime
99+ - {b Defaults}: Access a pre-configured set of built-in checkers
1010+1111+ {2 Design Rationale}
1212+1313+ The registry pattern separates checker implementation from checker usage.
1414+ Applications can:
1515+1616+ 1. Query available checkers to present options to users
1717+ 2. Select specific checkers based on validation requirements
1818+ 3. Add custom checkers without modifying library code
1919+ 4. Share checker configurations across validation runs
2020+2121+ {2 Usage Pattern}
2222+2323+ {[
2424+ (* Start with default checkers *)
2525+ let reg = default () in
2626+2727+ (* Add a custom checker *)
2828+ let my_checker = (module MyChecker : Checker.S) in
2929+ register reg "my-custom-check" my_checker;
3030+3131+ (* List all available checkers *)
3232+ let names = list_names reg in
3333+ List.iter (Printf.printf "Available: %s\n") names;
3434+3535+ (* Retrieve a specific checker *)
3636+ match get reg "my-custom-check" with
3737+ | Some checker -> (* Use the checker *)
3838+ | None -> (* Not found *)
3939+4040+ (* Get all checkers for validation *)
4141+ let all_checkers = all reg in
4242+ (* Pass to dom_walker *)
4343+ ]}
4444+4545+ {2 Thread Safety}
4646+4747+ This registry is not thread-safe. If shared across threads, external
4848+ synchronization is required. *)
4949+5050+(** {1 Types} *)
5151+5252+(** The type of a checker registry.
5353+5454+ This is an opaque type representing a mutable collection of named
5555+ checkers. Internally implemented as a hash table for efficient lookups. *)
5656+type t
5757+5858+(** {1 Creation} *)
5959+6060+val create : unit -> t
6161+(** [create ()] creates a new empty checker registry.
6262+6363+ Use this when you want to build a custom set of checkers from scratch,
6464+ without any defaults. *)
6565+6666+val default : unit -> t
6767+(** [default ()] creates a registry with built-in checkers.
6868+6969+ The default registry is initially empty but serves as a starting point
7070+ for adding standard validation checkers in future phases.
7171+7272+ Built-in checkers will include:
7373+ - Document structure validation
7474+ - Attribute validation
7575+ - Content model checking
7676+ - Accessibility checks
7777+7878+ Note: In Phase 1, the default registry is empty. Built-in checkers
7979+ will be added in subsequent phases. *)
8080+8181+(** {1 Registration} *)
8282+8383+val register : t -> string -> Checker.t -> unit
8484+(** [register registry name checker] adds a checker to the registry.
8585+8686+ @param registry The registry to add to
8787+ @param name A unique identifier for the checker (e.g., "obsolete-elements",
8888+ "required-attributes")
8989+ @param checker The checker implementation
9090+9191+ If a checker with the same name already exists, it is replaced.
9292+9393+ {b Example:}
9494+ {[
9595+ let reg = create () in
9696+ let checker = (module MyChecker : Checker.S) in
9797+ register reg "my-check" checker
9898+ ]} *)
9999+100100+val unregister : t -> string -> unit
101101+(** [unregister registry name] removes a checker from the registry.
102102+103103+ @param registry The registry to remove from
104104+ @param name The checker name
105105+106106+ If no checker with the given name exists, this is a no-op. *)
107107+108108+(** {1 Retrieval} *)
109109+110110+val get : t -> string -> Checker.t option
111111+(** [get registry name] retrieves a checker by name.
112112+113113+ @param registry The registry to search
114114+ @param name The checker name
115115+ @return [Some checker] if found, [None] otherwise
116116+117117+ {b Example:}
118118+ {[
119119+ match get reg "obsolete-elements" with
120120+ | Some checker -> (* Use checker *)
121121+ | None -> (* Checker not registered *)
122122+ ]} *)
123123+124124+val list_names : t -> string list
125125+(** [list_names registry] returns all registered checker names.
126126+127127+ @param registry The registry to query
128128+ @return A list of all checker names in arbitrary order
129129+130130+ This is useful for:
131131+ - Displaying available checkers to users
132132+ - Debugging registry contents
133133+ - Iterating over specific subsets of checkers
134134+135135+ {b Example:}
136136+ {[
137137+ let names = list_names reg in
138138+ Printf.printf "Available checkers: %s\n"
139139+ (String.concat ", " names)
140140+ ]} *)
141141+142142+val all : t -> Checker.t list
143143+(** [all registry] returns all registered checkers.
144144+145145+ @param registry The registry to query
146146+ @return A list of all checkers in arbitrary order
147147+148148+ This is the primary way to retrieve checkers for validation.
149149+ Pass the result to {!Dom_walker.walk_all} to run all registered
150150+ checkers on a DOM tree.
151151+152152+ {b Example:}
153153+ {[
154154+ let checkers = all reg in
155155+ Dom_walker.walk_all checkers collector dom
156156+ ]} *)
···11+(** HTML5 content categories.
22+33+ This module defines the content categories used in HTML5 to classify elements
44+ based on their characteristics and allowed contexts. Elements can belong to
55+ multiple categories.
66+77+ @see <https://html.spec.whatwg.org/multipage/dom.html#content-models> WHATWG HTML Specification *)
88+99+(** Content category type. *)
1010+type t =
1111+ | Metadata
1212+ (** Metadata content sets up the presentation or behavior of the rest of
1313+ the content, or sets up the relationship of the document with other
1414+ documents, or conveys other "out of band" information. *)
1515+ | Flow
1616+ (** Most elements that are used in the body of documents and applications
1717+ are categorized as flow content. *)
1818+ | Sectioning
1919+ (** Sectioning content is content that defines the scope of headings and
2020+ footers. *)
2121+ | Heading
2222+ (** Heading content defines the heading of a section (whether explicitly
2323+ marked up using sectioning content elements, or implied by the heading
2424+ content itself). *)
2525+ | Phrasing
2626+ (** Phrasing content is the text of the document, as well as elements that
2727+ mark up that text at the intra-paragraph level. *)
2828+ | Embedded
2929+ (** Embedded content is content that imports another resource into the
3030+ document, or content from another vocabulary that is inserted into the
3131+ document. *)
3232+ | Interactive
3333+ (** Interactive content is content that is specifically intended for user
3434+ interaction. *)
3535+ | Palpable
3636+ (** As a general rule, elements whose content model allows any flow content
3737+ or phrasing content should have at least one node in its contents that
3838+ is palpable content and that does not have the hidden attribute specified. *)
3939+ | Script_supporting
4040+ (** Script-supporting elements are those that do not represent anything
4141+ themselves (i.e., they are not rendered), but are used to support scripts. *)
4242+ | Form_associated
4343+ (** Form-associated elements can have a form owner. *)
4444+ | Listed
4545+ (** Listed form-associated elements have a form attribute that can point
4646+ to a form element. *)
4747+ | Labelable
4848+ (** Labelable form-associated elements can be associated with label elements. *)
4949+ | Submittable
5050+ (** Submittable form-associated elements can be used for constructing the
5151+ entry list when a form element is submitted. *)
5252+ | Resettable
5353+ (** Resettable form-associated elements are affected when a form element
5454+ is reset. *)
5555+ | Autocapitalize_inheriting
5656+ (** Some elements inherit the autocapitalize attribute from their form owner. *)
5757+ | Transparent
5858+ (** Transparent content models adopt the content model of their parent
5959+ element. *)
6060+6161+(** {1 Predicates} *)
6262+6363+val to_string : t -> string
6464+(** [to_string category] returns a string representation of the category. *)
6565+6666+val compare : t -> t -> int
6767+(** [compare c1 c2] compares two categories for ordering. *)
6868+6969+val equal : t -> t -> bool
7070+(** [equal c1 c2] returns [true] if the categories are equal. *)
···11+(** Content model checker.
22+33+ Validates that HTML elements conform to their content model specifications.
44+55+ The content model checker performs structural validation of HTML documents
66+ by ensuring that:
77+88+ - Element children match the element's declared content model
99+ - No prohibited ancestor relationships exist (e.g., no [<a>] inside [<a>])
1010+ - Void elements contain no children
1111+ - Required children are present where mandated
1212+1313+ {2 Content Model Validation}
1414+1515+ The checker validates content models by:
1616+1717+ 1. Looking up the element specification in the registry
1818+ 2. Checking each child element or text node against the content model
1919+ 3. Tracking the ancestor stack to detect prohibited relationships
2020+ 4. Emitting appropriate errors or warnings for violations
2121+2222+ {2 Usage Example}
2323+2424+ {[
2525+ let checker = Content_checker.create (Message_collector.create ()) in
2626+ let module C = (val checker : Checker.S) in
2727+ let state = C.create () in
2828+2929+ (* Walk the DOM tree *)
3030+ C.start_element state ~name:"div" ~namespace:None ~attrs:[] collector;
3131+ C.characters state "Hello, world!" collector;
3232+ C.end_element state ~name:"div" ~namespace:None collector;
3333+ C.end_document state collector
3434+ ]}
3535+*)
3636+3737+(** Include the standard checker signature. *)
3838+include Checker.S
3939+4040+(** {1 Creation} *)
4141+4242+val create_with_registry : ?registry:Element_registry.t -> Message_collector.t -> state
4343+(** [create_with_registry ?registry collector] creates a content checker with an
4444+ optional custom element registry.
4545+4646+ If no registry is provided, uses {!Element_registry.default}.
4747+4848+ @param registry Custom element registry (defaults to standard HTML5 elements)
4949+ @param collector Message collector for validation messages *)
5050+5151+(** {1 First-Class Module} *)
5252+5353+val checker : Checker.t
5454+(** [checker] is the content checker packaged as a first-class module.
5555+5656+ This allows the content checker to be used in checker registries and
5757+ other contexts that work with heterogeneous checker collections. *)
+60
lib/html5_checker/content_model/content_model.ml
···11+type t =
22+ | Nothing
33+ | Text
44+ | Transparent
55+ | Categories of Content_category.t list
66+ | Elements of string list
77+ | Mixed of Content_category.t list
88+ | One_or_more of t
99+ | Zero_or_more of t
1010+ | Optional of t
1111+ | Sequence of t list
1212+ | Choice of t list
1313+ | Except of t * Content_category.t list
1414+1515+let rec pp fmt = function
1616+ | Nothing -> Format.fprintf fmt "Nothing"
1717+ | Text -> Format.fprintf fmt "Text"
1818+ | Transparent -> Format.fprintf fmt "Transparent"
1919+ | Categories cats ->
2020+ Format.fprintf fmt "Categories [%a]"
2121+ (Format.pp_print_list
2222+ ~pp_sep:(fun fmt () -> Format.fprintf fmt "; ")
2323+ Content_category.pp)
2424+ cats
2525+ | Elements elems ->
2626+ Format.fprintf fmt "Elements [%a]"
2727+ (Format.pp_print_list
2828+ ~pp_sep:(fun fmt () -> Format.fprintf fmt "; ")
2929+ Format.pp_print_string)
3030+ elems
3131+ | Mixed cats ->
3232+ Format.fprintf fmt "Mixed [%a]"
3333+ (Format.pp_print_list
3434+ ~pp_sep:(fun fmt () -> Format.fprintf fmt "; ")
3535+ Content_category.pp)
3636+ cats
3737+ | One_or_more t -> Format.fprintf fmt "One_or_more (%a)" pp t
3838+ | Zero_or_more t -> Format.fprintf fmt "Zero_or_more (%a)" pp t
3939+ | Optional t -> Format.fprintf fmt "Optional (%a)" pp t
4040+ | Sequence ts ->
4141+ Format.fprintf fmt "Sequence [%a]"
4242+ (Format.pp_print_list ~pp_sep:(fun fmt () -> Format.fprintf fmt "; ") pp)
4343+ ts
4444+ | Choice ts ->
4545+ Format.fprintf fmt "Choice [%a]"
4646+ (Format.pp_print_list ~pp_sep:(fun fmt () -> Format.fprintf fmt "; ") pp)
4747+ ts
4848+ | Except (t, cats) ->
4949+ Format.fprintf fmt "Except (%a, [%a])" pp t
5050+ (Format.pp_print_list
5151+ ~pp_sep:(fun fmt () -> Format.fprintf fmt "; ")
5252+ Content_category.pp)
5353+ cats
5454+5555+let to_string t =
5656+ let buf = Buffer.create 256 in
5757+ let fmt = Format.formatter_of_buffer buf in
5858+ pp fmt t;
5959+ Format.pp_print_flush fmt ();
6060+ Buffer.contents buf
+21
lib/html5_checker/content_model/content_model.mli
···11+(** HTML5 element content models.
22+33+ Defines what children an element can contain. *)
44+55+type t =
66+ | Nothing (** No children allowed (void elements) *)
77+ | Text (** Text only (no elements) *)
88+ | Transparent (** Inherits parent's content model *)
99+ | Categories of Content_category.t list (** Elements from categories *)
1010+ | Elements of string list (** Specific elements only *)
1111+ | Mixed of Content_category.t list (** Text + elements from categories *)
1212+ | One_or_more of t (** At least one child matching *)
1313+ | Zero_or_more of t (** Any number of children matching *)
1414+ | Optional of t (** Zero or one child matching *)
1515+ | Sequence of t list (** Ordered sequence *)
1616+ | Choice of t list (** Any one of *)
1717+ | Except of t * Content_category.t list (** t except categories *)
1818+1919+val pp : Format.formatter -> t -> unit
2020+2121+val to_string : t -> string
···11+(** Registry for HTML5 element specifications.
22+33+ Provides fast lookup of element specs by name. *)
44+55+(** The type of an element registry. *)
66+type t
77+88+(** {1 Creation and Modification} *)
99+1010+val create : unit -> t
1111+(** [create ()] creates a new empty element registry. *)
1212+1313+val register : t -> Element_spec.t -> unit
1414+(** [register registry spec] adds an element specification to the registry.
1515+1616+ If an element with the same name already exists, it is replaced. *)
1717+1818+(** {1 Lookup} *)
1919+2020+val get : t -> string -> Element_spec.t option
2121+(** [get registry name] looks up an element specification by tag name.
2222+2323+ Returns [None] if the element is not registered. Tag names are
2424+ case-insensitive. *)
2525+2626+val list_names : t -> string list
2727+(** [list_names registry] returns a sorted list of all registered element names. *)
2828+2929+val all : t -> Element_spec.t list
3030+(** [all registry] returns all registered element specifications. *)
3131+3232+(** {1 Default Registry} *)
3333+3434+val default : unit -> t
3535+(** [default ()] creates a registry pre-populated with all standard HTML5 elements.
3636+3737+ The registry includes elements from:
3838+ - {!Elements_document} - Document structure and sectioning
3939+ - {!Elements_text} - Text-level semantics
4040+ - {!Elements_form} - Forms and input controls
4141+ - {!Elements_embedded} - Embedded content
4242+ - {!Elements_table} - Tables
4343+ - {!Elements_interactive} - Interactive elements *)
···11+(** HTML5 structural and document element specifications.
22+33+ This module defines element specifications for HTML5 document structure,
44+ sectioning, and grouping elements according to the WHATWG HTML specification.
55+66+ @see <https://html.spec.whatwg.org/multipage/> WHATWG HTML Specification *)
77+88+(** {1 Document structure elements} *)
99+1010+val html : Element_spec.t
1111+(** The [html] element represents the root of an HTML document. *)
1212+1313+val head : Element_spec.t
1414+(** The [head] element represents a collection of metadata for the document. *)
1515+1616+val body : Element_spec.t
1717+(** The [body] element represents the contents of the document. *)
1818+1919+val title : Element_spec.t
2020+(** The [title] element represents the document's title or name. *)
2121+2222+val base : Element_spec.t
2323+(** The [base] element specifies the document base URL and/or default browsing
2424+ context for navigation. *)
2525+2626+val link : Element_spec.t
2727+(** The [link] element specifies relationships between the current document and
2828+ external resources. *)
2929+3030+val meta : Element_spec.t
3131+(** The [meta] element represents various kinds of metadata that cannot be
3232+ expressed using other metadata elements. *)
3333+3434+val style : Element_spec.t
3535+(** The [style] element allows authors to embed CSS style sheets in their documents. *)
3636+3737+(** {1 Sectioning elements} *)
3838+3939+val article : Element_spec.t
4040+(** The [article] element represents a complete, or self-contained, composition
4141+ in a document, page, application, or site. *)
4242+4343+val section : Element_spec.t
4444+(** The [section] element represents a generic section of a document or application. *)
4545+4646+val nav : Element_spec.t
4747+(** The [nav] element represents a section of a page that links to other pages
4848+ or to parts within the page. *)
4949+5050+val aside : Element_spec.t
5151+(** The [aside] element represents a section of a page that consists of content
5252+ that is tangentially related to the content around it. *)
5353+5454+val h1 : Element_spec.t
5555+(** The [h1] element represents a heading at level 1. *)
5656+5757+val h2 : Element_spec.t
5858+(** The [h2] element represents a heading at level 2. *)
5959+6060+val h3 : Element_spec.t
6161+(** The [h3] element represents a heading at level 3. *)
6262+6363+val h4 : Element_spec.t
6464+(** The [h4] element represents a heading at level 4. *)
6565+6666+val h5 : Element_spec.t
6767+(** The [h5] element represents a heading at level 5. *)
6868+6969+val h6 : Element_spec.t
7070+(** The [h6] element represents a heading at level 6. *)
7171+7272+val hgroup : Element_spec.t
7373+(** The [hgroup] element represents a heading and related content, such as
7474+ subheadings, an alternative title, or a tagline. *)
7575+7676+val header : Element_spec.t
7777+(** The [header] element represents introductory content for its nearest ancestor
7878+ sectioning content or sectioning root element. *)
7979+8080+val footer : Element_spec.t
8181+(** The [footer] element represents a footer for its nearest ancestor sectioning
8282+ content or sectioning root element. *)
8383+8484+val address : Element_spec.t
8585+(** The [address] element represents contact information for its nearest [article]
8686+ or [body] element ancestor. *)
8787+8888+val main : Element_spec.t
8989+(** The [main] element represents the dominant contents of the document. *)
9090+9191+(** {1 Grouping elements} *)
9292+9393+val p : Element_spec.t
9494+(** The [p] element represents a paragraph. *)
9595+9696+val hr : Element_spec.t
9797+(** The [hr] element represents a thematic break between paragraph-level elements. *)
9898+9999+val pre : Element_spec.t
100100+(** The [pre] element represents a block of preformatted text. *)
101101+102102+val blockquote : Element_spec.t
103103+(** The [blockquote] element represents a section that is quoted from another source. *)
104104+105105+val ol : Element_spec.t
106106+(** The [ol] element represents a list of items, where the items have been
107107+ intentionally ordered. *)
108108+109109+val ul : Element_spec.t
110110+(** The [ul] element represents a list of items, where the order of the items
111111+ is not important. *)
112112+113113+val menu : Element_spec.t
114114+(** The [menu] element represents a toolbar consisting of its contents, in the
115115+ form of an unordered list of items. *)
116116+117117+val li : Element_spec.t
118118+(** The [li] element represents a list item. *)
119119+120120+val dl : Element_spec.t
121121+(** The [dl] element represents an association list consisting of zero or more
122122+ name-value groups (a description list). *)
123123+124124+val dt : Element_spec.t
125125+(** The [dt] element represents the term, or name, part of a term-description
126126+ group in a description list. *)
127127+128128+val dd : Element_spec.t
129129+(** The [dd] element represents the description, definition, or value, part of
130130+ a term-description group in a description list. *)
131131+132132+val figure : Element_spec.t
133133+(** The [figure] element represents some flow content, optionally with a caption,
134134+ that is self-contained and is typically referenced as a single unit from
135135+ the main flow of the document. *)
136136+137137+val figcaption : Element_spec.t
138138+(** The [figcaption] element represents a caption or legend for the rest of the
139139+ contents of the parent [figure] element. *)
140140+141141+val div : Element_spec.t
142142+(** The [div] element has no special meaning at all. It represents its children. *)
143143+144144+(** {1 Element registry} *)
145145+146146+val all : Element_spec.t list
147147+(** [all] contains all element specifications defined in this module. *)
···11+(** HTML5 embedded content element specifications.
22+33+ Embedded content elements import resources into the document.
44+ See https://html.spec.whatwg.org/multipage/embedded-content.html *)
55+66+val picture : Element_spec.t
77+(** The picture element contains zero or more source elements followed by
88+ one img element to offer alternative versions of an image for different
99+ display scenarios. *)
1010+1111+val source : Element_spec.t
1212+(** The source element specifies multiple media resources for picture, audio,
1313+ and video elements. It is a void element. *)
1414+1515+val img : Element_spec.t
1616+(** The img element represents an image. It is a void element. *)
1717+1818+val iframe : Element_spec.t
1919+(** The iframe element represents a nested browsing context. *)
2020+2121+val embed : Element_spec.t
2222+(** The embed element provides an integration point for an external application
2323+ or interactive content. It is a void element. *)
2424+2525+val object_ : Element_spec.t
2626+(** The object element can represent an external resource, which is treated as
2727+ an image, a nested browsing context, or a plugin. *)
2828+2929+val param : Element_spec.t
3030+(** The param element defines parameters for plugins invoked by object elements.
3131+ Deprecated in favor of using data attributes. It is a void element. *)
3232+3333+val video : Element_spec.t
3434+(** The video element is used for playing videos or movies, and audio files
3535+ with captions. *)
3636+3737+val audio : Element_spec.t
3838+(** The audio element represents a sound or audio stream. *)
3939+4040+val track : Element_spec.t
4141+(** The track element allows authors to specify explicit external timed text
4242+ tracks for media elements. It is a void element. *)
4343+4444+val map : Element_spec.t
4545+(** The map element, in conjunction with img and area elements, defines an
4646+ image map. *)
4747+4848+val area : Element_spec.t
4949+(** The area element represents either a hyperlink with some text and a
5050+ corresponding area on an image map, or a dead area on an image map.
5151+ It is a void element. *)
5252+5353+val all : Element_spec.t list
5454+(** List of all embedded content element specifications. *)
···11+(** HTML5 form element specifications.
22+33+ Form-associated elements for user input and form submission.
44+ See https://html.spec.whatwg.org/multipage/forms.html *)
55+66+(** {1 Form Container} *)
77+88+val form : Element_spec.t
99+(** The form element represents a hyperlink that can be manipulated through a
1010+ collection of form-associated elements, some of which can represent editable
1111+ values that can be submitted to a server for processing.
1212+1313+ Content model: Flow content, but must not contain form element descendants. *)
1414+1515+(** {1 Form Controls} *)
1616+1717+val label : Element_spec.t
1818+(** The label element represents a caption in a user interface. The caption can
1919+ be associated with a specific form control, known as the label element's
2020+ labeled control.
2121+2222+ Content model: Phrasing content, but must not contain descendant label
2323+ elements, and must not contain form control descendants other than the
2424+ labeled control. *)
2525+2626+val input : Element_spec.t
2727+(** The input element represents a typed data field, usually with a form control
2828+ to allow the user to edit the data. It is a void element.
2929+3030+ Content model: Nothing (void element).
3131+3232+ The type attribute controls the data type (and associated control) of the
3333+ element. *)
3434+3535+val button : Element_spec.t
3636+(** The button element represents a button labeled by its contents.
3737+3838+ Content model: Phrasing content, but must not contain interactive content
3939+ descendants. *)
4040+4141+val select : Element_spec.t
4242+(** The select element represents a control for selecting amongst a set of
4343+ options.
4444+4545+ Content model: Zero or more option, optgroup, and script-supporting elements. *)
4646+4747+val datalist : Element_spec.t
4848+(** The datalist element represents a set of option elements that represent
4949+ predefined options for other controls. In the rendering, the datalist element
5050+ represents nothing and it, along with its children, should be hidden.
5151+5252+ Content model: Either phrasing content or zero or more option and
5353+ script-supporting elements. *)
5454+5555+val optgroup : Element_spec.t
5656+(** The optgroup element represents a group of option elements with a common
5757+ label.
5858+5959+ Content model: Zero or more option and script-supporting elements. *)
6060+6161+val option : Element_spec.t
6262+(** The option element represents an option in a select element or as part of a
6363+ list of suggestions in a datalist element.
6464+6565+ Content model: Text, or empty if label attribute is present. *)
6666+6767+val textarea : Element_spec.t
6868+(** The textarea element represents a multiline plain text edit control for the
6969+ element's raw value.
7070+7171+ Content model: Text. *)
7272+7373+val output : Element_spec.t
7474+(** The output element represents the result of a calculation performed by the
7575+ application, or the result of a user action.
7676+7777+ Content model: Phrasing content. *)
7878+7979+val progress : Element_spec.t
8080+(** The progress element represents the completion progress of a task.
8181+8282+ Content model: Phrasing content, but must not contain progress element
8383+ descendants. *)
8484+8585+val meter : Element_spec.t
8686+(** The meter element represents a scalar measurement within a known range, or a
8787+ fractional value; for example disk usage, the relevance of a query result, or
8888+ the fraction of a voting population to have selected a particular candidate.
8989+9090+ Content model: Phrasing content, but must not contain meter element
9191+ descendants. *)
9292+9393+val fieldset : Element_spec.t
9494+(** The fieldset element represents a set of form controls (or other content)
9595+ grouped together, optionally with a caption.
9696+9797+ Content model: Optionally a legend element, followed by flow content. *)
9898+9999+val legend : Element_spec.t
100100+(** The legend element represents a caption for the rest of the contents of the
101101+ legend element's parent fieldset element, if any.
102102+103103+ Content model: Phrasing content, and optionally intermixed with heading
104104+ content. *)
105105+106106+(** {1 Element List} *)
107107+108108+val all : Element_spec.t list
109109+(** List of all form element specifications. *)
···11+(** HTML5 interactive and scripting element specifications.
22+33+ Interactive elements are specifically intended for user interaction,
44+ and scripting elements support scripts in the document.
55+ See https://html.spec.whatwg.org/multipage/interactive-elements.html
66+ and https://html.spec.whatwg.org/multipage/scripting.html *)
77+88+val details : Element_spec.t
99+(** The details element represents a disclosure widget from which the user can
1010+ obtain additional information or controls. *)
1111+1212+val summary : Element_spec.t
1313+(** The summary element represents a summary, caption, or legend for the rest
1414+ of the contents of the summary element's parent details element. *)
1515+1616+val dialog : Element_spec.t
1717+(** The dialog element represents a part of an application that a user
1818+ interacts with to perform a task, such as a dialog box or modal. *)
1919+2020+val script : Element_spec.t
2121+(** The script element allows authors to include dynamic script and data blocks
2222+ in their documents. *)
2323+2424+val noscript : Element_spec.t
2525+(** The noscript element represents fallback content for when scripting is
2626+ disabled or not supported. *)
2727+2828+val template : Element_spec.t
2929+(** The template element is used to declare fragments of HTML that can be cloned
3030+ and inserted in the document by script. *)
3131+3232+val slot : Element_spec.t
3333+(** The slot element is used as a placeholder inside a web component that users
3434+ can fill with their own markup. *)
3535+3636+val canvas : Element_spec.t
3737+(** The canvas element provides scripts with a resolution-dependent bitmap
3838+ canvas for rendering graphs, game graphics, art, or other visual images. *)
3939+4040+val all : Element_spec.t list
4141+(** List of all interactive and scripting element specifications. *)
···11+(** HTML5 table element specifications.
22+33+ Table elements represent data with more than one dimension.
44+ See https://html.spec.whatwg.org/multipage/tables.html *)
55+66+val table : Element_spec.t
77+(** The table element represents data with more than one dimension in the form
88+ of a table. *)
99+1010+val caption : Element_spec.t
1111+(** The caption element represents the title of the table. *)
1212+1313+val colgroup : Element_spec.t
1414+(** The colgroup element represents a group of one or more columns in the table. *)
1515+1616+val col : Element_spec.t
1717+(** The col element represents one or more columns in the table. It is a void
1818+ element. *)
1919+2020+val tbody : Element_spec.t
2121+(** The tbody element represents a block of rows that consist of a body of data
2222+ for the table. *)
2323+2424+val thead : Element_spec.t
2525+(** The thead element represents the block of rows that consist of the column
2626+ labels (headers) for the table. *)
2727+2828+val tfoot : Element_spec.t
2929+(** The tfoot element represents the block of rows that consist of the column
3030+ summaries (footers) for the table. *)
3131+3232+val tr : Element_spec.t
3333+(** The tr element represents a row of cells in a table. *)
3434+3535+val td : Element_spec.t
3636+(** The td element represents a data cell in a table. *)
3737+3838+val th : Element_spec.t
3939+(** The th element represents a header cell in a table. *)
4040+4141+val all : Element_spec.t list
4242+(** List of all table element specifications. *)
···11+(** HTML5 text-level and edit element specifications.
22+33+ Text-level semantic elements and edit tracking elements.
44+ See https://html.spec.whatwg.org/multipage/text-level-semantics.html
55+ and https://html.spec.whatwg.org/multipage/edits.html *)
66+77+(** {1 Hyperlinks} *)
88+99+val a : Element_spec.t
1010+(** The a element represents a hyperlink. When it has an href attribute, it
1111+ represents a hyperlink (a hypertext anchor) labeled by its contents.
1212+1313+ Content model: Transparent, but must not contain interactive content
1414+ descendants. *)
1515+1616+(** {1 Text-level Semantics} *)
1717+1818+val em : Element_spec.t
1919+(** The em element represents stress emphasis of its contents. The level of
2020+ emphasis is given by its number of ancestor em elements.
2121+2222+ Content model: Phrasing content. *)
2323+2424+val strong : Element_spec.t
2525+(** The strong element represents strong importance, seriousness, or urgency
2626+ for its contents.
2727+2828+ Content model: Phrasing content. *)
2929+3030+val small : Element_spec.t
3131+(** The small element represents side comments such as small print. Small print
3232+ typically features disclaimers, caveats, legal restrictions, or copyrights.
3333+3434+ Content model: Phrasing content. *)
3535+3636+val s : Element_spec.t
3737+(** The s element represents contents that are no longer accurate or no longer
3838+ relevant.
3939+4040+ Content model: Phrasing content. *)
4141+4242+val cite : Element_spec.t
4343+(** The cite element represents the title of a work (e.g. a book, a paper,
4444+ an essay, a poem, a score, a song, a script, a film, a TV show, a game,
4545+ a sculpture, a painting, a theatre production, a play, an opera, a musical,
4646+ an exhibition, a legal case report, a computer program, etc).
4747+4848+ Content model: Phrasing content. *)
4949+5050+val q : Element_spec.t
5151+(** The q element represents some phrasing content quoted from another source.
5252+5353+ Content model: Phrasing content. *)
5454+5555+val dfn : Element_spec.t
5656+(** The dfn element represents the defining instance of a term.
5757+5858+ Content model: Phrasing content, but must not contain dfn element descendants. *)
5959+6060+val abbr : Element_spec.t
6161+(** The abbr element represents an abbreviation or acronym, optionally with its
6262+ expansion. The title attribute may be used to provide an expansion of the
6363+ abbreviation.
6464+6565+ Content model: Phrasing content. *)
6666+6767+val ruby : Element_spec.t
6868+(** The ruby element allows one or more spans of phrasing content to be marked
6969+ with ruby annotations. Ruby annotations are short runs of text presented
7070+ alongside base text, primarily used in East Asian typography.
7171+7272+ Content model: Phrasing content, but must contain at least one rt or rp
7373+ element. *)
7474+7575+val rt : Element_spec.t
7676+(** The rt element marks the ruby text component of a ruby annotation.
7777+7878+ Content model: Phrasing content. *)
7979+8080+val rp : Element_spec.t
8181+(** The rp element is used to provide fallback parentheses for browsers that
8282+ don't support ruby annotations.
8383+8484+ Content model: Text, or phrasing content that represents what can be used
8585+ as fallback in annotations. *)
8686+8787+val data : Element_spec.t
8888+(** The data element represents its contents, along with a machine-readable
8989+ form of those contents in the value attribute.
9090+9191+ Content model: Phrasing content. *)
9292+9393+val time : Element_spec.t
9494+(** The time element represents its contents, along with a machine-readable
9595+ form of those contents in the datetime attribute.
9696+9797+ Content model: Phrasing content, but must not contain time element descendants. *)
9898+9999+val code : Element_spec.t
100100+(** The code element represents a fragment of computer code.
101101+102102+ Content model: Phrasing content. *)
103103+104104+val var : Element_spec.t
105105+(** The var element represents a variable in a mathematical expression or a
106106+ programming context.
107107+108108+ Content model: Phrasing content. *)
109109+110110+val samp : Element_spec.t
111111+(** The samp element represents sample or quoted output from another program
112112+ or computing system.
113113+114114+ Content model: Phrasing content. *)
115115+116116+val kbd : Element_spec.t
117117+(** The kbd element represents user input (typically keyboard input, although
118118+ it may also be used to represent other input, such as voice commands).
119119+120120+ Content model: Phrasing content. *)
121121+122122+val sub : Element_spec.t
123123+(** The sub element represents a subscript.
124124+125125+ Content model: Phrasing content. *)
126126+127127+val sup : Element_spec.t
128128+(** The sup element represents a superscript.
129129+130130+ Content model: Phrasing content. *)
131131+132132+val i : Element_spec.t
133133+(** The i element represents a span of text in an alternate voice or mood, or
134134+ otherwise offset from the normal prose in a manner indicating a different
135135+ quality of text.
136136+137137+ Content model: Phrasing content. *)
138138+139139+val b : Element_spec.t
140140+(** The b element represents a span of text to which attention is being drawn
141141+ for utilitarian purposes without conveying any extra importance and with no
142142+ implication of an alternate voice or mood.
143143+144144+ Content model: Phrasing content. *)
145145+146146+val u : Element_spec.t
147147+(** The u element represents a span of text with an unarticulated, though
148148+ explicitly rendered, non-textual annotation, such as labeling the text as
149149+ being a proper name in Chinese text or labeling the text as being
150150+ misspelt.
151151+152152+ Content model: Phrasing content. *)
153153+154154+val mark : Element_spec.t
155155+(** The mark element represents a run of text in one document marked or
156156+ highlighted for reference purposes, due to its relevance in another context.
157157+158158+ Content model: Phrasing content. *)
159159+160160+val bdi : Element_spec.t
161161+(** The bdi element represents a span of text that is to be isolated from its
162162+ surroundings for the purposes of bidirectional text formatting.
163163+164164+ Content model: Phrasing content. *)
165165+166166+val bdo : Element_spec.t
167167+(** The bdo element represents explicit text directionality formatting control
168168+ for its children. It allows authors to override the Unicode bidirectional
169169+ algorithm.
170170+171171+ Content model: Phrasing content. *)
172172+173173+val span : Element_spec.t
174174+(** The span element doesn't mean anything on its own, but can be useful when
175175+ used together with the global attributes, e.g. class, lang, or dir.
176176+177177+ Content model: Phrasing content. *)
178178+179179+val br : Element_spec.t
180180+(** The br element represents a line break. It is a void element.
181181+182182+ Content model: Nothing (void element). *)
183183+184184+val wbr : Element_spec.t
185185+(** The wbr element represents a line break opportunity. It is a void element.
186186+187187+ Content model: Nothing (void element). *)
188188+189189+(** {1 Edits} *)
190190+191191+val ins : Element_spec.t
192192+(** The ins element represents an addition to the document.
193193+194194+ Content model: Transparent. *)
195195+196196+val del : Element_spec.t
197197+(** The del element represents a removal from the document.
198198+199199+ Content model: Transparent. *)
200200+201201+(** {1 Element List} *)
202202+203203+val all : Element_spec.t list
204204+(** List of all text-level and edit element specifications. *)
lib/html5_checker/datatype/datatype.cmi
This is a binary file and will not be displayed.
+42
lib/html5_checker/datatype/datatype.ml
···11+module type S = sig
22+ val name : string
33+ val validate : string -> (unit, string) result
44+ val is_valid : string -> bool
55+end
66+77+type t = (module S)
88+99+let name (module D : S) = D.name
1010+let validate (module D : S) s = D.validate s
1111+let is_valid (module D : S) s = D.is_valid s
1212+1313+(* Helper utilities *)
1414+1515+let is_whitespace = function
1616+ | ' ' | '\t' | '\n' | '\r' | '\012' (* FF *) -> true
1717+ | _ -> false
1818+1919+let is_ascii_digit = function '0' .. '9' -> true | _ -> false
2020+2121+let to_ascii_lowercase c =
2222+ match c with 'A' .. 'Z' -> Char.chr (Char.code c + 32) | _ -> c
2323+2424+let string_to_ascii_lowercase s =
2525+ String.map to_ascii_lowercase s
2626+2727+let trim_html_spaces s =
2828+ let len = String.length s in
2929+ let rec find_start i =
3030+ if i >= len then len
3131+ else if is_whitespace s.[i] then find_start (i + 1)
3232+ else i
3333+ in
3434+ let rec find_end i =
3535+ if i < 0 then -1
3636+ else if is_whitespace s.[i] then find_end (i - 1)
3737+ else i
3838+ in
3939+ let start = find_start 0 in
4040+ let end_pos = find_end (len - 1) in
4141+ if start > end_pos then ""
4242+ else String.sub s start (end_pos - start + 1)
+45
lib/html5_checker/datatype/datatype.mli
···11+(** HTML5 datatype validation.
22+33+ This module provides the base interface for HTML5 attribute datatype
44+ validators. Each datatype validates string values according to HTML5 spec. *)
55+66+(** A datatype validator *)
77+module type S = sig
88+ (** Name of this datatype (e.g., "integer", "url") *)
99+ val name : string
1010+1111+ (** Validate a string value. Returns Ok () if valid, Error message otherwise *)
1212+ val validate : string -> (unit, string) result
1313+1414+ (** Check if value is valid (convenience function) *)
1515+ val is_valid : string -> bool
1616+end
1717+1818+(** A datatype packed as a first-class module *)
1919+type t = (module S)
2020+2121+(** Get the name of a datatype *)
2222+val name : t -> string
2323+2424+(** Validate a value with a datatype *)
2525+val validate : t -> string -> (unit, string) result
2626+2727+(** Check if a value is valid *)
2828+val is_valid : t -> string -> bool
2929+3030+(** Helper utilities for implementing datatype validators. *)
3131+3232+(** Check if a character is HTML5 whitespace (space, tab, LF, FF, or CR). *)
3333+val is_whitespace : char -> bool
3434+3535+(** Check if a character is an ASCII digit (0-9). *)
3636+val is_ascii_digit : char -> bool
3737+3838+(** Convert an ASCII character to lowercase. *)
3939+val to_ascii_lowercase : char -> char
4040+4141+(** Convert an ASCII string to lowercase. *)
4242+val string_to_ascii_lowercase : string -> string
4343+4444+(** Trim HTML5 whitespace from both ends of a string. *)
4545+val trim_html_spaces : string -> string
+31
lib/html5_checker/datatype/datatype_registry.ml
···11+type t = { datatypes : (string, Datatype.t) Hashtbl.t }
22+33+let create () = { datatypes = Hashtbl.create 16 }
44+55+let register t dt =
66+ let name = Datatype.name dt in
77+ Hashtbl.replace t.datatypes name dt
88+99+let get t name = Hashtbl.find_opt t.datatypes name
1010+1111+let list_names t =
1212+ Hashtbl.fold (fun name _ acc -> name :: acc) t.datatypes []
1313+ |> List.sort String.compare
1414+1515+let default =
1616+ let registry = ref None in
1717+ fun () ->
1818+ match !registry with
1919+ | Some r -> r
2020+ | None ->
2121+ let r = create () in
2222+ (* Register built-in datatypes *)
2323+ register r (module Dt_integer.Integer : Datatype.S);
2424+ register r (module Dt_integer.Integer_non_negative : Datatype.S);
2525+ register r (module Dt_integer.Integer_positive : Datatype.S);
2626+ register r (module Dt_float.Float_ : Datatype.S);
2727+ register r (module Dt_float.Float_non_negative : Datatype.S);
2828+ register r (module Dt_float.Float_positive : Datatype.S);
2929+ register r (module Dt_boolean.Boolean : Datatype.S);
3030+ registry := Some r;
3131+ r
+19
lib/html5_checker/datatype/datatype_registry.mli
···11+(** Registry for HTML5 datatypes *)
22+33+(** Registry type that holds datatypes indexed by name *)
44+type t
55+66+(** Create a new empty registry *)
77+val create : unit -> t
88+99+(** Register a datatype in the registry *)
1010+val register : t -> Datatype.t -> unit
1111+1212+(** Get a datatype by name. Returns None if not found *)
1313+val get : t -> string -> Datatype.t option
1414+1515+(** List all registered datatype names *)
1616+val list_names : t -> string list
1717+1818+(** Default registry with all built-in datatypes *)
1919+val default : unit -> t
+229
lib/html5_checker/datatype/dt_autocomplete.ml
···11+(** Autocomplete attribute validation based on HTML5 spec *)
22+33+(** Check if character is whitespace *)
44+let is_whitespace c = c = ' ' || c = '\t' || c = '\n' || c = '\r'
55+66+(** Convert character to ASCII lowercase *)
77+let to_ascii_lowercase c =
88+ if c >= 'A' && c <= 'Z' then Char.chr (Char.code c + 32) else c
99+1010+(** Trim whitespace from string *)
1111+let trim_whitespace s =
1212+ let s = String.trim s in
1313+ (* Also collapse internal whitespace *)
1414+ let buf = Buffer.create (String.length s) in
1515+ let in_space = ref false in
1616+ String.iter
1717+ (fun c ->
1818+ if is_whitespace c then
1919+ if not !in_space then (
2020+ Buffer.add_char buf ' ';
2121+ in_space := true)
2222+ else (
2323+ Buffer.add_char buf (to_ascii_lowercase c);
2424+ in_space := false))
2525+ s;
2626+ Buffer.contents buf
2727+2828+(** Contact type tokens *)
2929+let contact_types = [ "home"; "work"; "mobile"; "fax"; "pager" ]
3030+3131+(** All autofill field names *)
3232+let all_field_names =
3333+ [
3434+ "name";
3535+ "honorific-prefix";
3636+ "given-name";
3737+ "additional-name";
3838+ "family-name";
3939+ "honorific-suffix";
4040+ "nickname";
4141+ "organization-title";
4242+ "username";
4343+ "new-password";
4444+ "current-password";
4545+ "one-time-code";
4646+ "organization";
4747+ "street-address";
4848+ "address-line1";
4949+ "address-line2";
5050+ "address-line3";
5151+ "address-level4";
5252+ "address-level3";
5353+ "address-level2";
5454+ "address-level1";
5555+ "country";
5656+ "country-name";
5757+ "postal-code";
5858+ "cc-name";
5959+ "cc-given-name";
6060+ "cc-additional-name";
6161+ "cc-family-name";
6262+ "cc-number";
6363+ "cc-exp";
6464+ "cc-exp-month";
6565+ "cc-exp-year";
6666+ "cc-csc";
6767+ "cc-type";
6868+ "transaction-currency";
6969+ "transaction-amount";
7070+ "language";
7171+ "bday";
7272+ "bday-day";
7373+ "bday-month";
7474+ "bday-year";
7575+ "sex";
7676+ "url";
7777+ "photo";
7878+ "tel";
7979+ "tel-country-code";
8080+ "tel-national";
8181+ "tel-area-code";
8282+ "tel-local";
8383+ "tel-local-prefix";
8484+ "tel-local-suffix";
8585+ "tel-extension";
8686+ "email";
8787+ "impp";
8888+ ]
8989+9090+(** Contact field names (subset that can be used with contact types) *)
9191+let contact_field_names =
9292+ [
9393+ "tel";
9494+ "tel-country-code";
9595+ "tel-national";
9696+ "tel-area-code";
9797+ "tel-local";
9898+ "tel-local-prefix";
9999+ "tel-local-suffix";
100100+ "tel-extension";
101101+ "email";
102102+ "impp";
103103+ ]
104104+105105+(** Split string on whitespace *)
106106+let split_on_whitespace s =
107107+ let rec split acc start i =
108108+ if i >= String.length s then
109109+ if start < i then List.rev (String.sub s start (i - start) :: acc)
110110+ else List.rev acc
111111+ else if is_whitespace s.[i] then
112112+ if start < i then
113113+ split (String.sub s start (i - start) :: acc) (i + 1) (i + 1)
114114+ else split acc (i + 1) (i + 1)
115115+ else split acc start (i + 1)
116116+ in
117117+ split [] 0 0
118118+119119+(** Check if string starts with prefix *)
120120+let starts_with s prefix =
121121+ String.length s >= String.length prefix
122122+ && String.sub s 0 (String.length prefix) = prefix
123123+124124+(** Validate detail tokens *)
125125+let check_tokens tokens =
126126+ let tokens = ref tokens in
127127+ let is_contact_details = ref false in
128128+129129+ (* Check for section-* *)
130130+ (match !tokens with
131131+ | token :: rest when starts_with token "section-" ->
132132+ tokens := rest
133133+ | _ -> ());
134134+135135+ (* Check for shipping/billing *)
136136+ (match !tokens with
137137+ | "shipping" :: rest | "billing" :: rest ->
138138+ tokens := rest
139139+ | _ -> ());
140140+141141+ (* Check for contact type *)
142142+ (match !tokens with
143143+ | token :: rest when List.mem token contact_types ->
144144+ tokens := rest;
145145+ is_contact_details := true
146146+ | _ -> ());
147147+148148+ (* Process remaining tokens *)
149149+ let process_field_tokens = function
150150+ | [] -> Error "A list of autofill details tokens must contain an autofill field name"
151151+ | [ "webauthn" ] ->
152152+ Error
153153+ "The token \"webauthn\" must not be the only token in a list of \
154154+ autofill detail tokens"
155155+ | [ field_name ] ->
156156+ if not (List.mem field_name all_field_names) then
157157+ Error
158158+ (Printf.sprintf
159159+ "The string \"%s\" is not a valid autofill field name"
160160+ field_name)
161161+ else if !is_contact_details && not (List.mem field_name contact_field_names)
162162+ then
163163+ Error
164164+ (Printf.sprintf
165165+ "The autofill field name \"%s\" is not allowed in contact \
166166+ context"
167167+ field_name)
168168+ else Ok ()
169169+ | [ field_name; "webauthn" ] ->
170170+ if not (List.mem field_name all_field_names) then
171171+ Error
172172+ (Printf.sprintf
173173+ "The string \"%s\" is not a valid autofill field name"
174174+ field_name)
175175+ else if !is_contact_details && not (List.mem field_name contact_field_names)
176176+ then
177177+ Error
178178+ (Printf.sprintf
179179+ "The autofill field name \"%s\" is not allowed in contact \
180180+ context"
181181+ field_name)
182182+ else Ok ()
183183+ | token :: _ when List.mem token contact_types ->
184184+ Error
185185+ (Printf.sprintf
186186+ "The token \"%s\" must only appear before any autofill field names"
187187+ token)
188188+ | token :: _ when starts_with token "section-" ->
189189+ Error
190190+ "A \"section-*\" indicator must only appear as the first token in a \
191191+ list of autofill detail tokens"
192192+ | "shipping" :: _ | "billing" :: _ as toks ->
193193+ Error
194194+ (Printf.sprintf
195195+ "The token \"%s\" must only appear as either the first token in a \
196196+ list of autofill detail tokens, or, if the first token is a \
197197+ \"section-*\" indicator, as the second token"
198198+ (List.hd toks))
199199+ | _ :: "webauthn" :: _ :: _ ->
200200+ Error
201201+ "The token \"webauthn\" must only appear as the very last token in a \
202202+ list of autofill detail tokens"
203203+ | _ :: _ :: _ ->
204204+ Error
205205+ "A list of autofill details tokens must not contain more than one \
206206+ autofill field name"
207207+ in
208208+ process_field_tokens !tokens
209209+210210+(** Validate autocomplete value *)
211211+let validate_autocomplete s =
212212+ let trimmed = trim_whitespace s in
213213+ if String.length trimmed = 0 then Error "Must not be empty"
214214+ else if trimmed = "on" || trimmed = "off" then Ok ()
215215+ else
216216+ let tokens = split_on_whitespace trimmed in
217217+ check_tokens tokens
218218+219219+module Autocomplete = struct
220220+ let name = "autocomplete"
221221+ let validate = validate_autocomplete
222222+223223+ let is_valid s =
224224+ match validate s with
225225+ | Ok () -> true
226226+ | Error _ -> false
227227+end
228228+229229+let datatypes = [ (module Autocomplete : Datatype.S) ]
+41
lib/html5_checker/datatype/dt_autocomplete.mli
···11+(** Autocomplete attribute datatype validator.
22+33+ This module provides a validator for the autocomplete attribute used on
44+ form fields, as defined by the HTML5 specification. *)
55+66+(** Autocomplete attribute validator.
77+88+ Validates autocomplete attribute values which can be:
99+ - "on" or "off" (simple values)
1010+ - Autofill detail tokens in the format:
1111+ [section-*] [shipping|billing] [contact-type] field-name [webauthn]
1212+1313+ Contact types: home, work, mobile, fax, pager
1414+1515+ Field names include:
1616+ - Name fields: name, honorific-prefix, given-name, additional-name,
1717+ family-name, honorific-suffix, nickname, organization-title
1818+ - Authentication: username, new-password, current-password, one-time-code
1919+ - Organization: organization
2020+ - Address: street-address, address-line1, address-line2, address-line3,
2121+ address-level1, address-level2, address-level3, address-level4,
2222+ country, country-name, postal-code
2323+ - Credit card: cc-name, cc-given-name, cc-additional-name, cc-family-name,
2424+ cc-number, cc-exp, cc-exp-month, cc-exp-year, cc-csc, cc-type
2525+ - Transaction: transaction-currency, transaction-amount
2626+ - Other: language, bday, bday-day, bday-month, bday-year, sex, url, photo
2727+ - Contact: tel, tel-country-code, tel-national, tel-area-code, tel-local,
2828+ tel-local-prefix, tel-local-suffix, tel-extension, email, impp
2929+3030+ Examples:
3131+ - "on"
3232+ - "off"
3333+ - "name"
3434+ - "email"
3535+ - "shipping street-address"
3636+ - "section-blue billing email"
3737+ - "work tel" *)
3838+module Autocomplete : Datatype.S
3939+4040+(** List of all datatypes defined in this module *)
4141+val datatypes : Datatype.t list
+38
lib/html5_checker/datatype/dt_boolean.ml
···11+(** Boolean attribute validation for HTML5 *)
22+module Boolean = struct
33+ let name = "boolean"
44+55+ let validate s =
66+ match s with
77+ | "" | "true" | "false" -> Ok ()
88+ | _ ->
99+ Error
1010+ (Printf.sprintf
1111+ "The value '%s' is not a valid boolean. Expected empty string, \
1212+ 'true', or 'false'."
1313+ s)
1414+1515+ let is_valid s = Result.is_ok (validate s)
1616+1717+ let with_name attr_name =
1818+ let module M = struct
1919+ let name = "boolean"
2020+2121+ let validate s =
2222+ match s with
2323+ | "" | "true" | "false" -> Ok ()
2424+ | _ ->
2525+ let s_lower = Datatype.string_to_ascii_lowercase s in
2626+ let attr_lower = Datatype.string_to_ascii_lowercase attr_name in
2727+ if s_lower = attr_lower then Ok ()
2828+ else
2929+ Error
3030+ (Printf.sprintf
3131+ "The value '%s' is not a valid boolean. Expected empty \
3232+ string, 'true', 'false', or '%s'."
3333+ s attr_name)
3434+3535+ let is_valid s = Result.is_ok (validate s)
3636+ end in
3737+ (module M : Datatype.S)
3838+end
+19
lib/html5_checker/datatype/dt_boolean.mli
···11+(** Boolean attribute datatype validator for HTML5 *)
22+33+(** Boolean attribute validation.
44+55+ In HTML5, boolean attributes can have the following values:
66+ - Empty string
77+ - "true"
88+ - "false"
99+ - The attribute name itself (case-insensitive)
1010+1111+ For attribute-name validation, use [Boolean.with_name]. *)
1212+module Boolean : sig
1313+ include Datatype.S
1414+1515+ (** Create a boolean validator that also accepts a specific attribute name.
1616+ For example, [with_name "disabled"] will accept "", "true", "false", or
1717+ "disabled" (case-insensitive). *)
1818+ val with_name : string -> (module Datatype.S)
1919+end
+22
lib/html5_checker/datatype/dt_button_type.ml
···11+(** Button type attribute validation based on HTML5 spec *)
22+33+(** Valid button type values *)
44+let valid_types = [ "submit"; "reset"; "button" ]
55+66+module Button_type = struct
77+ let name = "button-type"
88+99+ let validate s =
1010+ let s_lower = Datatype.string_to_ascii_lowercase s in
1111+ if List.mem s_lower valid_types then Ok ()
1212+ else
1313+ Error
1414+ (Printf.sprintf
1515+ "The value '%s' is not a valid button type. Expected one of: %s."
1616+ s
1717+ (String.concat ", " valid_types))
1818+1919+ let is_valid s = Result.is_ok (validate s)
2020+end
2121+2222+let datatypes = [ (module Button_type : Datatype.S) ]
+22
lib/html5_checker/datatype/dt_button_type.mli
···11+(** Button type attribute datatype validator.
22+33+ This module provides a validator for the type attribute used on
44+ button elements, as defined by the HTML5 specification. *)
55+66+(** Button type attribute validator.
77+88+ Validates button type attribute values which can be:
99+ - submit - Submit button (submits the form)
1010+ - reset - Reset button (resets the form)
1111+ - button - Push button (no default behavior)
1212+1313+ Values are matched case-insensitively according to HTML5 spec.
1414+1515+ Examples:
1616+ - "submit"
1717+ - "reset"
1818+ - "button" *)
1919+module Button_type : Datatype.S
2020+2121+(** List of all datatypes defined in this module *)
2222+val datatypes : Datatype.t list
+136
lib/html5_checker/datatype/dt_charset.ml
···11+(** Helper functions for charset validation *)
22+33+let is_valid_charset_char c =
44+ (c >= '0' && c <= '9') ||
55+ (c >= 'a' && c <= 'z') ||
66+ (c >= 'A' && c <= 'Z') ||
77+ c = '-' || c = '!' || c = '#' || c = '$' || c = '%' || c = '&' ||
88+ c = '\'' || c = '+' || c = '_' || c = '`' || c = '{' || c = '}' ||
99+ c = '~' || c = '^'
1010+1111+let to_lower s = String.lowercase_ascii s
1212+1313+(** Common encoding labels recognized by WHATWG Encoding Standard.
1414+ This is a subset of the full list. *)
1515+let known_charsets = [
1616+ (* UTF-8 *)
1717+ "utf-8"; "utf8"; "unicode-1-1-utf-8";
1818+ (* Legacy single-byte encodings *)
1919+ "iso-8859-1"; "iso8859-1"; "latin1"; "iso-8859-2"; "iso-8859-3";
2020+ "iso-8859-4"; "iso-8859-5"; "iso-8859-6"; "iso-8859-7"; "iso-8859-8";
2121+ "iso-8859-9"; "iso-8859-10"; "iso-8859-13"; "iso-8859-14"; "iso-8859-15";
2222+ "iso-8859-16";
2323+ (* Windows code pages *)
2424+ "windows-1250"; "windows-1251"; "windows-1252"; "windows-1253";
2525+ "windows-1254"; "windows-1255"; "windows-1256"; "windows-1257";
2626+ "windows-1258";
2727+ (* Other common encodings *)
2828+ "us-ascii"; "ascii"; "utf-16"; "utf-16le"; "utf-16be";
2929+ "gb2312"; "gbk"; "gb18030"; "big5"; "euc-jp"; "iso-2022-jp";
3030+ "shift_jis"; "euc-kr"; "koi8-r"; "koi8-u";
3131+ (* Macintosh encodings *)
3232+ "macintosh"; "x-mac-roman";
3333+]
3434+3535+(** Check if a charset name is recognized *)
3636+let is_known_charset name =
3737+ let lower = to_lower name in
3838+ List.mem lower known_charsets
3939+4040+module Charset = struct
4141+ let name = "encoding name"
4242+4343+ let validate s =
4444+ if String.length s = 0 then
4545+ Error "The empty string is not a valid character encoding name"
4646+ else
4747+ (* Check all characters are valid *)
4848+ let rec check_chars i =
4949+ if i >= String.length s then
5050+ Ok ()
5151+ else
5252+ let c = s.[i] in
5353+ if not (is_valid_charset_char c) then
5454+ Error (Printf.sprintf "Value contained '%c', which is not a valid character in an encoding name" c)
5555+ else
5656+ check_chars (i + 1)
5757+ in
5858+ match check_chars 0 with
5959+ | Error e -> Error e
6060+ | Ok () ->
6161+ let lower = to_lower s in
6262+ (* Reject "replacement" encoding *)
6363+ if lower = "replacement" then
6464+ Error (Printf.sprintf "'%s' is not a valid character encoding name" s)
6565+ (* Check if it's a known charset *)
6666+ else if not (is_known_charset lower) then
6767+ Error (Printf.sprintf "'%s' is not a valid character encoding name" s)
6868+ else
6969+ Ok ()
7070+7171+ let is_valid s = Result.is_ok (validate s)
7272+end
7373+7474+module Meta_charset = struct
7575+ let name = "legacy character encoding declaration"
7676+7777+ let is_whitespace c =
7878+ c = ' ' || c = '\t' || c = '\n' || c = '\012' || c = '\r'
7979+8080+ let validate s =
8181+ let lower = to_lower s in
8282+ if not (String.starts_with ~prefix:"text/html;" lower) then
8383+ Error "The legacy encoding declaration did not start with 'text/html;'"
8484+ else if String.length lower = 10 then
8585+ Error "The legacy encoding declaration ended prematurely"
8686+ else
8787+ (* Skip whitespace after semicolon *)
8888+ let rec skip_ws i =
8989+ if i >= String.length lower then
9090+ Error "The legacy encoding declaration did not contain 'charset=' after the semicolon"
9191+ else
9292+ let c = lower.[i] in
9393+ if is_whitespace c then
9494+ skip_ws (i + 1)
9595+ else if c = 'c' then
9696+ Ok i
9797+ else
9898+ Error (Printf.sprintf "The legacy encoding declaration did not start with space characters or 'charset=' after the semicolon. Found '%c' instead" c)
9999+ in
100100+ match skip_ws 10 with
101101+ | Error e -> Error e
102102+ | Ok offset ->
103103+ if not (String.sub lower offset (String.length lower - offset) |> String.starts_with ~prefix:"charset=") then
104104+ Error "The legacy encoding declaration did not contain 'charset=' after the semicolon"
105105+ else
106106+ let charset_offset = offset + 8 in
107107+ if charset_offset >= String.length lower then
108108+ Error "The empty string is not a valid character encoding name"
109109+ else
110110+ (* Validate remaining characters *)
111111+ let rec check_chars i =
112112+ if i >= String.length lower then
113113+ Ok ()
114114+ else
115115+ let c = lower.[i] in
116116+ if not (is_valid_charset_char c) then
117117+ Error (Printf.sprintf "The legacy encoding contained '%c', which is not a valid character in an encoding name" c)
118118+ else
119119+ check_chars (i + 1)
120120+ in
121121+ match check_chars charset_offset with
122122+ | Error e -> Error e
123123+ | Ok () ->
124124+ let encoding_name = String.sub lower charset_offset (String.length lower - charset_offset) in
125125+ if encoding_name <> "utf-8" then
126126+ Error "'charset=' must be followed by 'utf-8'"
127127+ else
128128+ Ok ()
129129+130130+ let is_valid s = Result.is_ok (validate s)
131131+end
132132+133133+let datatypes = [
134134+ (module Charset : Datatype.S);
135135+ (module Meta_charset : Datatype.S);
136136+]
+37
lib/html5_checker/datatype/dt_charset.mli
···11+(** Character encoding datatype validators for HTML5.
22+33+ This module provides validators for character encoding names as used in
44+ HTML5. Encoding names must conform to the WHATWG Encoding Standard. *)
55+66+(** Character encoding name datatype.
77+88+ Validates a character encoding name according to the WHATWG Encoding
99+ Standard. Valid encoding names include:
1010+ - UTF-8 (and variants like "utf-8", "utf8")
1111+ - Legacy encodings (ISO-8859-1, windows-1252, etc.)
1212+1313+ The validator checks:
1414+ - Non-empty string
1515+ - Valid characters (alphanumeric, hyphen, and special chars: ! # $ % & ' + _ ` \{ \} ~ ^)
1616+ - Recognizes common encoding labels
1717+1818+ Note: This is a simplified validator that recognizes common encoding
1919+ names but does not include the full WHATWG encoding label table.
2020+ It accepts labels case-insensitively. *)
2121+module Charset : Datatype.S
2222+2323+(** Meta charset datatype for legacy encoding declarations.
2424+2525+ Validates the charset attribute value in legacy meta elements of the form:
2626+ <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
2727+2828+ The validator checks that:
2929+ - String starts with "text/html;"
3030+ - After optional whitespace, contains "charset="
3131+ - The charset value is "utf-8" (the only allowed value in modern HTML5)
3232+3333+ Example valid value: "text/html; charset=utf-8" *)
3434+module Meta_charset : Datatype.S
3535+3636+(** List of all charset datatypes *)
3737+val datatypes : Datatype.t list
+243
lib/html5_checker/datatype/dt_color.ml
···11+(** Color validation *)
22+33+(** Named CSS colors *)
44+let named_colors =
55+ [
66+ "aliceblue";
77+ "antiquewhite";
88+ "aqua";
99+ "aquamarine";
1010+ "azure";
1111+ "beige";
1212+ "bisque";
1313+ "black";
1414+ "blanchedalmond";
1515+ "blue";
1616+ "blueviolet";
1717+ "brown";
1818+ "burlywood";
1919+ "cadetblue";
2020+ "chartreuse";
2121+ "chocolate";
2222+ "coral";
2323+ "cornflowerblue";
2424+ "cornsilk";
2525+ "crimson";
2626+ "cyan";
2727+ "darkblue";
2828+ "darkcyan";
2929+ "darkgoldenrod";
3030+ "darkgray";
3131+ "darkgrey";
3232+ "darkgreen";
3333+ "darkkhaki";
3434+ "darkmagenta";
3535+ "darkolivegreen";
3636+ "darkorange";
3737+ "darkorchid";
3838+ "darkred";
3939+ "darksalmon";
4040+ "darkseagreen";
4141+ "darkslateblue";
4242+ "darkslategray";
4343+ "darkslategrey";
4444+ "darkturquoise";
4545+ "darkviolet";
4646+ "deeppink";
4747+ "deepskyblue";
4848+ "dimgray";
4949+ "dimgrey";
5050+ "dodgerblue";
5151+ "firebrick";
5252+ "floralwhite";
5353+ "forestgreen";
5454+ "fuchsia";
5555+ "gainsboro";
5656+ "ghostwhite";
5757+ "gold";
5858+ "goldenrod";
5959+ "gray";
6060+ "grey";
6161+ "green";
6262+ "greenyellow";
6363+ "honeydew";
6464+ "hotpink";
6565+ "indianred";
6666+ "indigo";
6767+ "ivory";
6868+ "khaki";
6969+ "lavender";
7070+ "lavenderblush";
7171+ "lawngreen";
7272+ "lemonchiffon";
7373+ "lightblue";
7474+ "lightcoral";
7575+ "lightcyan";
7676+ "lightgoldenrodyellow";
7777+ "lightgray";
7878+ "lightgrey";
7979+ "lightgreen";
8080+ "lightpink";
8181+ "lightsalmon";
8282+ "lightseagreen";
8383+ "lightskyblue";
8484+ "lightslategray";
8585+ "lightslategrey";
8686+ "lightsteelblue";
8787+ "lightyellow";
8888+ "lime";
8989+ "limegreen";
9090+ "linen";
9191+ "magenta";
9292+ "maroon";
9393+ "mediumaquamarine";
9494+ "mediumblue";
9595+ "mediumorchid";
9696+ "mediumpurple";
9797+ "mediumseagreen";
9898+ "mediumslateblue";
9999+ "mediumspringgreen";
100100+ "mediumturquoise";
101101+ "mediumvioletred";
102102+ "midnightblue";
103103+ "mintcream";
104104+ "mistyrose";
105105+ "moccasin";
106106+ "navajowhite";
107107+ "navy";
108108+ "oldlace";
109109+ "olive";
110110+ "olivedrab";
111111+ "orange";
112112+ "orangered";
113113+ "orchid";
114114+ "palegoldenrod";
115115+ "palegreen";
116116+ "paleturquoise";
117117+ "palevioletred";
118118+ "papayawhip";
119119+ "peachpuff";
120120+ "peru";
121121+ "pink";
122122+ "plum";
123123+ "powderblue";
124124+ "purple";
125125+ "red";
126126+ "rosybrown";
127127+ "royalblue";
128128+ "saddlebrown";
129129+ "salmon";
130130+ "sandybrown";
131131+ "seagreen";
132132+ "seashell";
133133+ "sienna";
134134+ "silver";
135135+ "skyblue";
136136+ "slateblue";
137137+ "slategray";
138138+ "slategrey";
139139+ "snow";
140140+ "springgreen";
141141+ "steelblue";
142142+ "tan";
143143+ "teal";
144144+ "thistle";
145145+ "tomato";
146146+ "transparent";
147147+ "turquoise";
148148+ "violet";
149149+ "wheat";
150150+ "white";
151151+ "whitesmoke";
152152+ "yellow";
153153+ "yellowgreen";
154154+ ]
155155+156156+(** Check if character is hex digit *)
157157+let is_hex_digit c =
158158+ (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F')
159159+160160+(** Validate hex color (#RGB or #RRGGBB) *)
161161+let validate_hex_color s =
162162+ let len = String.length s in
163163+ if len <> 4 && len <> 7 then
164164+ Error "Hex color must be #RGB or #RRGGBB format"
165165+ else if s.[0] <> '#' then
166166+ Error "Hex color must start with '#'"
167167+ else
168168+ let rec check_hex i =
169169+ if i >= len then Ok ()
170170+ else if is_hex_digit s.[i] then check_hex (i + 1)
171171+ else
172172+ Error
173173+ (Printf.sprintf "Invalid hex digit '%c' at position %d" s.[i] i)
174174+ in
175175+ check_hex 1
176176+177177+(** Simple color validator - strict #RRGGBB format *)
178178+module Simple_color = struct
179179+ let name = "simple color"
180180+181181+ let validate s =
182182+ let s = String.trim s in
183183+ if String.length s <> 7 then
184184+ Error "Incorrect length for color string (must be 7 characters)"
185185+ else if s.[0] <> '#' then
186186+ Error
187187+ (Printf.sprintf
188188+ "Color starts with incorrect character '%c'. Expected the number \
189189+ sign '#'"
190190+ s.[0])
191191+ else
192192+ let rec check_hex i =
193193+ if i >= 7 then Ok ()
194194+ else if is_hex_digit s.[i] then check_hex (i + 1)
195195+ else
196196+ Error
197197+ (Printf.sprintf "'%c' is not a valid hexadecimal digit" s.[i])
198198+ in
199199+ check_hex 1
200200+201201+ let is_valid s =
202202+ match validate s with
203203+ | Ok () -> true
204204+ | Error _ -> false
205205+end
206206+207207+(** CSS color validator - supports multiple formats *)
208208+module Color = struct
209209+ let name = "color"
210210+211211+ let validate s =
212212+ let s = String.trim s |> String.lowercase_ascii in
213213+ if String.length s = 0 then Error "Color value must not be empty"
214214+ else if List.mem s named_colors then Ok ()
215215+ else if String.length s > 0 && s.[0] = '#' then validate_hex_color s
216216+ else if
217217+ String.length s > 4
218218+ && (String.sub s 0 4 = "rgb(" || String.sub s 0 5 = "rgba(")
219219+ then
220220+ (* Basic validation for rgb/rgba - just check balanced parens *)
221221+ if s.[String.length s - 1] = ')' then Ok ()
222222+ else Error "rgb/rgba function must end with ')'"
223223+ else if
224224+ String.length s > 4
225225+ && (String.sub s 0 4 = "hsl(" || String.sub s 0 5 = "hsla(")
226226+ then
227227+ (* Basic validation for hsl/hsla - just check balanced parens *)
228228+ if s.[String.length s - 1] = ')' then Ok ()
229229+ else Error "hsl/hsla function must end with ')'"
230230+ else
231231+ Error
232232+ (Printf.sprintf
233233+ "Unrecognized color format '%s' (expected named color, hex, rgb(), \
234234+ rgba(), hsl(), or hsla())"
235235+ s)
236236+237237+ let is_valid s =
238238+ match validate s with
239239+ | Ok () -> true
240240+ | Error _ -> false
241241+end
242242+243243+let datatypes = [ (module Color : Datatype.S); (module Simple_color : Datatype.S) ]
+29
lib/html5_checker/datatype/dt_color.mli
···11+(** Color value datatype validators.
22+33+ This module provides validators for CSS color values and simple colors
44+ as defined by the HTML5 specification. *)
55+66+(** CSS color value validator.
77+88+ Validates various CSS color formats:
99+ - Named colors (e.g., "red", "blue", "transparent")
1010+ - Hex colors: #RGB or #RRGGBB
1111+ - rgb() and rgba() functional notation
1212+ - hsl() and hsla() functional notation
1313+1414+ This is a simplified validator and does not validate all edge cases. *)
1515+module Color : Datatype.S
1616+1717+(** Simple color validator (for input[type=color]).
1818+1919+ Validates the simple color format:
2020+ - Must be exactly 7 characters
2121+ - Must start with '#'
2222+ - Followed by exactly 6 hexadecimal digits (0-9, a-f, A-F)
2323+ - Format: #RRGGBB
2424+2525+ This is the strict format required for input[type=color]. *)
2626+module Simple_color : Datatype.S
2727+2828+(** List of all datatypes defined in this module *)
2929+val datatypes : Datatype.t list
+20
lib/html5_checker/datatype/dt_contenteditable.ml
···11+(** Contenteditable attribute validation for HTML5 *)
22+33+module Contenteditable = struct
44+ let name = "contenteditable"
55+66+ let validate s =
77+ let s_lower = Datatype.string_to_ascii_lowercase s in
88+ match s_lower with
99+ | "" | "true" | "false" | "plaintext-only" -> Ok ()
1010+ | _ ->
1111+ Error
1212+ (Printf.sprintf
1313+ "The value '%s' is not a valid contenteditable value. Expected \
1414+ 'true', 'false', 'plaintext-only', or empty string."
1515+ s)
1616+1717+ let is_valid s = Result.is_ok (validate s)
1818+end
1919+2020+let datatypes = [ (module Contenteditable : Datatype.S) ]
+25
lib/html5_checker/datatype/dt_contenteditable.mli
···11+(** Contenteditable attribute datatype validator for HTML5.
22+33+ This module provides a validator for the contenteditable attribute, as
44+ defined by the HTML5 specification. *)
55+66+(** Contenteditable attribute validator.
77+88+ Validates contenteditable attribute values which can be:
99+ - "true" - the element is editable
1010+ - "false" - the element is not editable
1111+ - "" (empty string) - equivalent to "true"
1212+ - "plaintext-only" - the element is editable, but rich text formatting is
1313+ disabled
1414+1515+ Values are case-insensitive.
1616+1717+ Examples:
1818+ - "true"
1919+ - "false"
2020+ - ""
2121+ - "plaintext-only" *)
2222+module Contenteditable : Datatype.S
2323+2424+(** List of all datatypes defined in this module *)
2525+val datatypes : Datatype.t list
+45
lib/html5_checker/datatype/dt_coords.ml
···11+(** Coordinates attribute validation for HTML5 *)
22+33+module Coords = struct
44+ let name = "coords"
55+66+ let validate s =
77+ (* Empty string is valid for default shape *)
88+ if s = "" then Ok ()
99+ else
1010+ (* Split on comma and validate each part is an integer *)
1111+ let parts =
1212+ String.split_on_char ',' s
1313+ |> List.map (fun p ->
1414+ let trimmed = String.trim p in
1515+ (* Check if it's a valid integer *)
1616+ try
1717+ let _ = int_of_string trimmed in
1818+ Ok ()
1919+ with Failure _ ->
2020+ Error (Printf.sprintf "The value '%s' is not a valid integer" p))
2121+ in
2222+ (* Check if all parts are valid *)
2323+ let rec check = function
2424+ | [] -> Ok ()
2525+ | Ok () :: rest -> check rest
2626+ | (Error msg) :: _ -> Error msg
2727+ in
2828+ match check parts with
2929+ | Error msg ->
3030+ Error
3131+ (Printf.sprintf
3232+ "The coords value '%s' is not valid. %s. Expected a \
3333+ comma-separated list of integers."
3434+ s msg)
3535+ | Ok () -> (
3636+ (* Verify we have at least some values *)
3737+ let count = List.length parts in
3838+ match count with
3939+ | 0 -> Error "The coords value must not be empty unless for default shape"
4040+ | _ -> Ok ())
4141+4242+ let is_valid s = Result.is_ok (validate s)
4343+end
4444+4545+let datatypes = [ (module Coords : Datatype.S) ]
+25
lib/html5_checker/datatype/dt_coords.mli
···11+(** Coordinates attribute datatype validator for HTML5.
22+33+ This module provides a validator for the coords attribute used on area
44+ elements within image maps, as defined by the HTML5 specification. *)
55+66+(** Coordinates attribute validator.
77+88+ Validates coords attribute values which must be a comma-separated list of
99+ valid integers. The number of values depends on the shape:
1010+ - rect: exactly 4 values (x1,y1,x2,y2)
1111+ - circle: exactly 3 values (x,y,radius)
1212+ - poly: even number of values >= 6 (x1,y1,x2,y2,x3,y3,...)
1313+ - default: should be empty or not present
1414+1515+ Note: This validator only checks that the value is a valid comma-separated
1616+ list of integers. Shape-specific validation should be done at a higher level.
1717+1818+ Examples:
1919+ - "0,0,10,10" (rect)
2020+ - "50,50,25" (circle)
2121+ - "0,0,50,0,50,50,0,50" (poly) *)
2222+module Coords : Datatype.S
2323+2424+(** List of all datatypes defined in this module *)
2525+val datatypes : Datatype.t list
+20
lib/html5_checker/datatype/dt_crossorigin.ml
···11+(** CORS crossorigin attribute validation for HTML5 *)
22+33+module Crossorigin = struct
44+ let name = "crossorigin"
55+66+ let validate s =
77+ let s_lower = Datatype.string_to_ascii_lowercase s in
88+ match s_lower with
99+ | "" | "anonymous" | "use-credentials" -> Ok ()
1010+ | _ ->
1111+ Error
1212+ (Printf.sprintf
1313+ "The value '%s' is not a valid crossorigin value. Expected \
1414+ empty string, 'anonymous', or 'use-credentials'."
1515+ s)
1616+1717+ let is_valid s = Result.is_ok (validate s)
1818+end
1919+2020+let datatypes = [ (module Crossorigin : Datatype.S) ]
+23
lib/html5_checker/datatype/dt_crossorigin.mli
···11+(** CORS crossorigin attribute datatype validator.
22+33+ This module provides a validator for the crossorigin attribute used on
44+ script, link, img, audio, video elements, as defined by the HTML5 spec. *)
55+66+(** Crossorigin attribute validator.
77+88+ Validates crossorigin attribute values which can be:
99+ - "" (empty string, equivalent to anonymous)
1010+ - "anonymous" (requests use CORS without credentials)
1111+ - "use-credentials" (requests use CORS with credentials)
1212+1313+ Values are case-insensitive after ASCII lowercasing.
1414+1515+ Examples:
1616+ - ""
1717+ - "anonymous"
1818+ - "use-credentials"
1919+ - "Anonymous" (equivalent to "anonymous") *)
2020+module Crossorigin : Datatype.S
2121+2222+(** List of all datatypes defined in this module *)
2323+val datatypes : Datatype.t list
+235
lib/html5_checker/datatype/dt_datetime.ml
···11+(** Helper functions for datetime validation *)
22+33+let is_digit c = c >= '0' && c <= '9'
44+55+let is_all_digits s =
66+ String.for_all is_digit s
77+88+let parse_int s =
99+ try Some (int_of_string s)
1010+ with Failure _ -> None
1111+1212+(** Days in each month (non-leap year) *)
1313+let days_in_month = [| 31; 28; 31; 30; 31; 30; 31; 31; 30; 31; 30; 31 |]
1414+1515+(** Check if a year is a leap year *)
1616+let is_leap_year year =
1717+ (year mod 400 = 0) || (year mod 4 = 0 && year mod 100 <> 0)
1818+1919+(** Get maximum day for a given month/year *)
2020+let max_day_for_month year month =
2121+ if month = 2 && is_leap_year year then 29
2222+ else days_in_month.(month - 1)
2323+2424+(** Years in the 400-year cycle that have 53 weeks *)
2525+let special_years_mod_400 = [|
2626+ 4; 9; 15; 20; 26; 32; 37; 43; 48; 54; 60; 65; 71; 76; 82; 88; 93; 99;
2727+ 105; 111; 116; 122; 128; 133; 139; 144; 150; 156; 161; 167; 172; 178;
2828+ 184; 189; 195; 201; 207; 212; 218; 224; 229; 235; 240; 246; 252; 257;
2929+ 263; 268; 274; 280; 285; 291; 296; 303; 308; 314; 320; 325; 331; 336;
3030+ 342; 348; 353; 359; 364; 370; 376; 381; 387; 392; 398
3131+|]
3232+3333+(** Check if a year has 53 weeks *)
3434+let has_53_weeks year =
3535+ let year_mod = year mod 400 in
3636+ Array.exists (fun y -> y = year_mod) special_years_mod_400
3737+3838+module Year = struct
3939+ let name = "year"
4040+4141+ let validate s =
4242+ let len = String.length s in
4343+ if len < 4 then
4444+ Error "Year must be at least 4 digits"
4545+ else if not (is_all_digits s) then
4646+ Error "Year must contain only digits"
4747+ else
4848+ match parse_int s with
4949+ | None -> Error "Year value out of range"
5050+ | Some year ->
5151+ if year < 1 then
5252+ Error "Year cannot be less than 1"
5353+ else
5454+ Ok ()
5555+5656+ let is_valid s = Result.is_ok (validate s)
5757+end
5858+5959+module Month = struct
6060+ let name = "month"
6161+6262+ let validate s =
6363+ if String.length s < 7 then
6464+ Error "Month must be in YYYY-MM format"
6565+ else
6666+ let pattern = Str.regexp "^\\([0-9]+\\)-\\([0-9][0-9]\\)$" in
6767+ if not (Str.string_match pattern s 0) then
6868+ Error "Month must be in YYYY-MM format"
6969+ else
7070+ let year_s = Str.matched_group 1 s in
7171+ let month_s = Str.matched_group 2 s in
7272+ match (parse_int year_s, parse_int month_s) with
7373+ | None, _ | _, None -> Error "Year or month out of range"
7474+ | Some year, Some month ->
7575+ if year < 1 then
7676+ Error "Year cannot be less than 1"
7777+ else if month < 1 then
7878+ Error "Month cannot be less than 1"
7979+ else if month > 12 then
8080+ Error "Month cannot be greater than 12"
8181+ else
8282+ Ok ()
8383+8484+ let is_valid s = Result.is_ok (validate s)
8585+end
8686+8787+module Week = struct
8888+ let name = "week"
8989+9090+ let validate s =
9191+ let pattern = Str.regexp "^\\([0-9]+\\)-W\\([0-9][0-9]\\)$" in
9292+ if not (Str.string_match pattern s 0) then
9393+ Error "Week must be in YYYY-Www format"
9494+ else
9595+ let year_s = Str.matched_group 1 s in
9696+ let week_s = Str.matched_group 2 s in
9797+ match (parse_int year_s, parse_int week_s) with
9898+ | None, _ | _, None -> Error "Year or week out of range"
9999+ | Some year, Some week ->
100100+ if year < 1 then
101101+ Error "Year cannot be less than 1"
102102+ else if week < 1 then
103103+ Error "Week cannot be less than 1"
104104+ else if week > 53 then
105105+ Error "Week out of range"
106106+ else if week = 53 && not (has_53_weeks year) then
107107+ Error "Week out of range"
108108+ else
109109+ Ok ()
110110+111111+ let is_valid s = Result.is_ok (validate s)
112112+end
113113+114114+module Date = struct
115115+ let name = "date"
116116+117117+ let validate s =
118118+ let pattern = Str.regexp "^\\([0-9]+\\)-\\([0-9][0-9]\\)-\\([0-9][0-9]\\)$" in
119119+ if not (Str.string_match pattern s 0) then
120120+ Error "Date must be in YYYY-MM-DD format"
121121+ else
122122+ let year_s = Str.matched_group 1 s in
123123+ let month_s = Str.matched_group 2 s in
124124+ let day_s = Str.matched_group 3 s in
125125+ if String.length year_s < 4 then
126126+ Error "Year must be at least 4 digits"
127127+ else
128128+ match (parse_int year_s, parse_int month_s, parse_int day_s) with
129129+ | None, _, _ | _, None, _ | _, _, None ->
130130+ Error "Year, month, or day out of range"
131131+ | Some year, Some month, Some day ->
132132+ if year < 1 then
133133+ Error "Year cannot be less than 1"
134134+ else if month < 1 then
135135+ Error "Month cannot be less than 1"
136136+ else if month > 12 then
137137+ Error "Month cannot be greater than 12"
138138+ else if day < 1 then
139139+ Error "Day cannot be less than 1"
140140+ else
141141+ let max_day = max_day_for_month year month in
142142+ if day > max_day then
143143+ Error "Day out of range"
144144+ else
145145+ Ok ()
146146+147147+ let is_valid s = Result.is_ok (validate s)
148148+end
149149+150150+module Time = struct
151151+ let name = "time"
152152+153153+ let validate s =
154154+ let pattern = Str.regexp "^\\([0-9][0-9]\\):\\([0-9][0-9]\\)\\(:\\([0-9][0-9]\\)\\(\\.\\([0-9]+\\)\\)?\\)?$" in
155155+ if not (Str.string_match pattern s 0) then
156156+ Error "Time must be in HH:MM[:SS[.sss]] format"
157157+ else
158158+ let hour_s = Str.matched_group 1 s in
159159+ let minute_s = Str.matched_group 2 s in
160160+ let second_s = try Some (Str.matched_group 4 s) with Not_found -> None in
161161+ let millis_s = try Some (Str.matched_group 6 s) with Not_found -> None in
162162+ match (parse_int hour_s, parse_int minute_s) with
163163+ | None, _ | _, None -> Error "Hour or minute out of range"
164164+ | Some hour, Some minute ->
165165+ if hour > 23 then
166166+ Error "Hour cannot be greater than 23"
167167+ else if minute > 59 then
168168+ Error "Minute cannot be greater than 59"
169169+ else
170170+ match second_s with
171171+ | None -> Ok ()
172172+ | Some sec_s ->
173173+ match parse_int sec_s with
174174+ | None -> Error "Seconds out of range"
175175+ | Some second ->
176176+ if second > 59 then
177177+ Error "Second cannot be greater than 59"
178178+ else
179179+ match millis_s with
180180+ | None -> Ok ()
181181+ | Some ms ->
182182+ if String.length ms > 3 then
183183+ Error "A fraction of a second must be one, two, or three digits"
184184+ else if not (is_all_digits ms) then
185185+ Error "Invalid milliseconds"
186186+ else
187187+ Ok ()
188188+189189+ let is_valid s = Result.is_ok (validate s)
190190+end
191191+192192+module Datetime_local = struct
193193+ let name = "local datetime"
194194+195195+ let validate s =
196196+ let pattern = Str.regexp "^\\(.+\\)[T ]\\(.+\\)$" in
197197+ if not (Str.string_match pattern s 0) then
198198+ Error "Datetime must be in YYYY-MM-DD[T ]HH:MM[:SS[.sss]] format"
199199+ else
200200+ let date_s = Str.matched_group 1 s in
201201+ let time_s = Str.matched_group 2 s in
202202+ match Date.validate date_s with
203203+ | Error e -> Error ("Invalid date: " ^ e)
204204+ | Ok () ->
205205+ match Time.validate time_s with
206206+ | Error e -> Error ("Invalid time: " ^ e)
207207+ | Ok () -> Ok ()
208208+209209+ let is_valid s = Result.is_ok (validate s)
210210+end
211211+212212+module Datetime = struct
213213+ let name = "datetime"
214214+215215+ let validate s =
216216+ if not (String.ends_with ~suffix:"Z" s) then
217217+ Error "Global datetime must end with 'Z'"
218218+ else
219219+ let s_without_z = String.sub s 0 (String.length s - 1) in
220220+ match Datetime_local.validate s_without_z with
221221+ | Error e -> Error e
222222+ | Ok () -> Ok ()
223223+224224+ let is_valid s = Result.is_ok (validate s)
225225+end
226226+227227+let datatypes = [
228228+ (module Year : Datatype.S);
229229+ (module Month : Datatype.S);
230230+ (module Week : Datatype.S);
231231+ (module Date : Datatype.S);
232232+ (module Time : Datatype.S);
233233+ (module Datetime_local : Datatype.S);
234234+ (module Datetime : Datatype.S);
235235+]
+74
lib/html5_checker/datatype/dt_datetime.mli
···11+(** Date and time datatype validators for HTML5.
22+33+ This module provides validators for various HTML5 date and time formats
44+ as specified in the HTML5 standard. Each validator checks that strings
55+ conform to the specific format and contain valid values. *)
66+77+(** Year datatype (YYYY format, minimum 1).
88+99+ Validates a year string consisting of 4 or more ASCII digits.
1010+ The year must be at least 1. *)
1111+module Year : Datatype.S
1212+1313+(** Month datatype (YYYY-MM format).
1414+1515+ Validates a month string in the format YYYY-MM where:
1616+ - YYYY is a valid year (>= 1)
1717+ - MM is a month from 01 to 12 *)
1818+module Month : Datatype.S
1919+2020+(** Week datatype (YYYY-Www format).
2121+2222+ Validates a week string in the format YYYY-Www where:
2323+ - YYYY is a valid year (>= 1)
2424+ - W is the literal character 'W'
2525+ - ww is a week number from 01 to 53
2626+2727+ Week 53 is only valid for years that have 53 weeks in the ISO 8601
2828+ week-numbering calendar. *)
2929+module Week : Datatype.S
3030+3131+(** Date datatype (YYYY-MM-DD format).
3232+3333+ Validates a date string in the format YYYY-MM-DD where:
3434+ - YYYY is a valid year (>= 1)
3535+ - MM is a month from 01 to 12
3636+ - DD is a day valid for the given month/year
3737+3838+ Handles leap years correctly. *)
3939+module Date : Datatype.S
4040+4141+(** Time datatype (HH:MM[:SS[.sss]] format).
4242+4343+ Validates a time string where:
4444+ - HH is hours from 00 to 23
4545+ - MM is minutes from 00 to 59
4646+ - SS (optional) is seconds from 00 to 59
4747+ - sss (optional) is milliseconds (1-3 digits)
4848+4949+ Valid formats:
5050+ - HH:MM
5151+ - HH:MM:SS
5252+ - HH:MM:SS.s
5353+ - HH:MM:SS.ss
5454+ - HH:MM:SS.sss *)
5555+module Time : Datatype.S
5656+5757+(** Local datetime datatype (YYYY-MM-DD[T ]HH:MM[:SS[.sss]] format).
5858+5959+ Validates a local datetime string combining date and time with 'T' or
6060+ space separator. The date must be valid and the time must be valid.
6161+6262+ This format does not include timezone information. *)
6363+module Datetime_local : Datatype.S
6464+6565+(** Global datetime datatype (YYYY-MM-DD[T ]HH:MM[:SS[.sss]]Z format).
6666+6767+ Validates a global datetime string in UTC (ending with 'Z').
6868+ The date must be valid and the time must be valid.
6969+7070+ This is the format for datetime values that include timezone (UTC). *)
7171+module Datetime : Datatype.S
7272+7373+(** List of all datetime datatypes *)
7474+val datatypes : Datatype.t list
+20
lib/html5_checker/datatype/dt_decoding.ml
···11+(** Image decoding attribute validation for HTML5 *)
22+33+module Decoding = struct
44+ let name = "decoding"
55+66+ let validate s =
77+ let s_lower = Datatype.string_to_ascii_lowercase s in
88+ match s_lower with
99+ | "" | "sync" | "async" | "auto" -> Ok ()
1010+ | _ ->
1111+ Error
1212+ (Printf.sprintf
1313+ "The value '%s' is not a valid decoding value. Expected empty \
1414+ string, 'sync', 'async', or 'auto'."
1515+ s)
1616+1717+ let is_valid s = Result.is_ok (validate s)
1818+end
1919+2020+let datatypes = [ (module Decoding : Datatype.S) ]
+24
lib/html5_checker/datatype/dt_decoding.mli
···11+(** Image decoding attribute datatype validator.
22+33+ This module provides a validator for the decoding attribute used to provide
44+ a hint for image decoding, as defined by the HTML5 spec. *)
55+66+(** Decoding attribute validator.
77+88+ Validates decoding attribute values which can be:
99+ - "" (empty string, default decoding behavior)
1010+ - "sync" (decode synchronously for atomic presentation)
1111+ - "async" (decode asynchronously to avoid delaying other content)
1212+ - "auto" (no preference for decoding mode)
1313+1414+ Values are case-insensitive after ASCII lowercasing.
1515+1616+ Examples:
1717+ - ""
1818+ - "sync"
1919+ - "async"
2020+ - "auto" *)
2121+module Decoding : Datatype.S
2222+2323+(** List of all datatypes defined in this module *)
2424+val datatypes : Datatype.t list
+20
lib/html5_checker/datatype/dt_dir.ml
···11+(** Text direction attribute validation for HTML5 *)
22+33+module Dir = struct
44+ let name = "dir"
55+66+ let validate s =
77+ let s_lower = Datatype.string_to_ascii_lowercase s in
88+ match s_lower with
99+ | "" | "ltr" | "rtl" | "auto" -> Ok ()
1010+ | _ ->
1111+ Error
1212+ (Printf.sprintf
1313+ "The value '%s' is not a valid dir value. Expected empty \
1414+ string, 'ltr', 'rtl', or 'auto'."
1515+ s)
1616+1717+ let is_valid s = Result.is_ok (validate s)
1818+end
1919+2020+let datatypes = [ (module Dir : Datatype.S) ]
+24
lib/html5_checker/datatype/dt_dir.mli
···11+(** Text direction attribute datatype validator.
22+33+ This module provides a validator for the dir attribute used to specify
44+ the text directionality of element content, as defined by the HTML5 spec. *)
55+66+(** Dir attribute validator.
77+88+ Validates dir attribute values which can be:
99+ - "" (empty string, inherits directionality)
1010+ - "ltr" (left-to-right text direction)
1111+ - "rtl" (right-to-left text direction)
1212+ - "auto" (directionality determined from content)
1313+1414+ Values are case-insensitive after ASCII lowercasing.
1515+1616+ Examples:
1717+ - ""
1818+ - "ltr"
1919+ - "rtl"
2020+ - "auto" *)
2121+module Dir : Datatype.S
2222+2323+(** List of all datatypes defined in this module *)
2424+val datatypes : Datatype.t list
+20
lib/html5_checker/datatype/dt_draggable.ml
···11+(** Draggable attribute validation for HTML5 *)
22+33+module Draggable = struct
44+ let name = "draggable"
55+66+ let validate s =
77+ let s_lower = Datatype.string_to_ascii_lowercase s in
88+ match s_lower with
99+ | "true" | "false" -> Ok ()
1010+ | _ ->
1111+ Error
1212+ (Printf.sprintf
1313+ "The value '%s' is not a valid draggable value. Expected 'true' \
1414+ or 'false'."
1515+ s)
1616+1717+ let is_valid s = Result.is_ok (validate s)
1818+end
1919+2020+let datatypes = [ (module Draggable : Datatype.S) ]
+20
lib/html5_checker/datatype/dt_draggable.mli
···11+(** Draggable attribute datatype validator for HTML5.
22+33+ This module provides a validator for the draggable attribute, as defined by
44+ the HTML5 specification. *)
55+66+(** Draggable attribute validator.
77+88+ Validates draggable attribute values which can be:
99+ - "true" - the element is draggable
1010+ - "false" - the element is not draggable
1111+1212+ Values are case-insensitive.
1313+1414+ Examples:
1515+ - "true"
1616+ - "false" *)
1717+module Draggable : Datatype.S
1818+1919+(** List of all datatypes defined in this module *)
2020+val datatypes : Datatype.t list
+88
lib/html5_checker/datatype/dt_email.ml
···11+(** Email address validation *)
22+33+(** Helper to check if a character is valid in email local/domain parts *)
44+let is_email_char c =
55+ (c >= 'a' && c <= 'z')
66+ || (c >= 'A' && c <= 'Z')
77+ || (c >= '0' && c <= '9')
88+ || c = '.' || c = '-' || c = '_' || c = '+' || c = '='
99+1010+(** Validate a single email address using simplified rules *)
1111+let validate_email s =
1212+ let s = String.trim s in
1313+ if String.length s = 0 then Error "Email address must not be empty"
1414+ else
1515+ (* Check for exactly one @ symbol *)
1616+ let at_count = ref 0 in
1717+ let at_pos = ref (-1) in
1818+ String.iteri
1919+ (fun i c -> if c = '@' then (
2020+ incr at_count;
2121+ at_pos := i
2222+ ))
2323+ s;
2424+ if !at_count = 0 then Error "Email address must contain an '@' character"
2525+ else if !at_count > 1 then
2626+ Error "Email address must contain exactly one '@' character"
2727+ else
2828+ let local = String.sub s 0 !at_pos in
2929+ let domain = String.sub s (!at_pos + 1) (String.length s - !at_pos - 1) in
3030+3131+ (* Validate local part *)
3232+ if String.length local = 0 then
3333+ Error "Email address must have a local part before '@'"
3434+ else if local.[0] = '.' || local.[String.length local - 1] = '.' then
3535+ Error "Email local part must not start or end with '.'"
3636+ else if not (String.for_all is_email_char local) then
3737+ Error "Email local part contains invalid characters"
3838+ else (* Validate domain part *)
3939+ if String.length domain = 0 then
4040+ Error "Email address must have a domain part after '@'"
4141+ else if not (String.contains domain '.') then
4242+ Error "Email domain must contain at least one '.'"
4343+ else if domain.[0] = '.' || domain.[String.length domain - 1] = '.' then
4444+ Error "Email domain must not start or end with '.'"
4545+ else if
4646+ not
4747+ (String.for_all
4848+ (fun c -> is_email_char c || c = '.')
4949+ domain)
5050+ then Error "Email domain contains invalid characters"
5151+ else Ok ()
5252+5353+module Email = struct
5454+ let name = "email address"
5555+ let validate = validate_email
5656+5757+ let is_valid s =
5858+ match validate s with
5959+ | Ok () -> true
6060+ | Error _ -> false
6161+end
6262+6363+module Email_list = struct
6464+ let name = "email address list"
6565+6666+ let validate s =
6767+ let s = String.trim s in
6868+ if String.length s = 0 then Error "Email list must not be empty"
6969+ else
7070+ (* Split on commas and validate each email *)
7171+ let emails = String.split_on_char ',' s in
7272+ let rec check_all = function
7373+ | [] -> Ok ()
7474+ | email :: rest -> (
7575+ match validate_email email with
7676+ | Ok () -> check_all rest
7777+ | Error msg ->
7878+ Error (Printf.sprintf "Invalid email in list: %s" msg))
7979+ in
8080+ check_all emails
8181+8282+ let is_valid s =
8383+ match validate s with
8484+ | Ok () -> true
8585+ | Error _ -> false
8686+end
8787+8888+let datatypes = [ (module Email : Datatype.S); (module Email_list : Datatype.S) ]
+23
lib/html5_checker/datatype/dt_email.mli
···11+(** Email address datatype validators.
22+33+ This module provides validators for email addresses and email address lists
44+ as defined by the HTML5 specification. *)
55+66+(** Valid email address validator.
77+88+ Validates a single email address. Uses simplified validation rules:
99+ - Must contain exactly one '@' character
1010+ - Local part (before @) must be non-empty
1111+ - Domain part (after @) must be non-empty and contain at least one '.'
1212+ - Only ASCII characters allowed *)
1313+module Email : Datatype.S
1414+1515+(** Comma-separated email address list validator.
1616+1717+ Validates a comma-separated list of email addresses.
1818+ Each address in the list must be valid according to {!Email} rules.
1919+ Whitespace around commas is ignored. *)
2020+module Email_list : Datatype.S
2121+2222+(** List of all datatypes defined in this module *)
2323+val datatypes : Datatype.t list
+22
lib/html5_checker/datatype/dt_enterkeyhint.ml
···11+(** Enter key hint attribute validation for HTML5 *)
22+33+module Enterkeyhint = struct
44+ let name = "enterkeyhint"
55+66+ let validate s =
77+ let s_lower = Datatype.string_to_ascii_lowercase s in
88+ match s_lower with
99+ | "" | "enter" | "done" | "go" | "next" | "previous" | "search" | "send" ->
1010+ Ok ()
1111+ | _ ->
1212+ Error
1313+ (Printf.sprintf
1414+ "The value '%s' is not a valid enterkeyhint value. Expected \
1515+ one of: empty string, 'enter', 'done', 'go', 'next', \
1616+ 'previous', 'search', or 'send'."
1717+ s)
1818+1919+ let is_valid s = Result.is_ok (validate s)
2020+end
2121+2222+let datatypes = [ (module Enterkeyhint : Datatype.S) ]
+28
lib/html5_checker/datatype/dt_enterkeyhint.mli
···11+(** Enter key hint attribute datatype validator.
22+33+ This module provides a validator for the enterkeyhint attribute used to
44+ customize the enter key label on virtual keyboards, as defined by the HTML5 spec. *)
55+66+(** Enterkeyhint attribute validator.
77+88+ Validates enterkeyhint attribute values which can be:
99+ - "" (empty string, default enter key)
1010+ - "enter" (insert new line)
1111+ - "done" (close input method editor)
1212+ - "go" (navigate to target)
1313+ - "next" (advance to next field)
1414+ - "previous" (go back to previous field)
1515+ - "search" (perform search)
1616+ - "send" (submit or deliver)
1717+1818+ Values are case-insensitive after ASCII lowercasing.
1919+2020+ Examples:
2121+ - ""
2222+ - "done"
2323+ - "next"
2424+ - "search" *)
2525+module Enterkeyhint : Datatype.S
2626+2727+(** List of all datatypes defined in this module *)
2828+val datatypes : Datatype.t list
+20
lib/html5_checker/datatype/dt_fetchpriority.ml
···11+(** Fetch priority attribute validation for HTML5 *)
22+33+module Fetchpriority = struct
44+ let name = "fetchpriority"
55+66+ let validate s =
77+ let s_lower = Datatype.string_to_ascii_lowercase s in
88+ match s_lower with
99+ | "" | "high" | "low" | "auto" -> Ok ()
1010+ | _ ->
1111+ Error
1212+ (Printf.sprintf
1313+ "The value '%s' is not a valid fetchpriority value. Expected \
1414+ empty string, 'high', 'low', or 'auto'."
1515+ s)
1616+1717+ let is_valid s = Result.is_ok (validate s)
1818+end
1919+2020+let datatypes = [ (module Fetchpriority : Datatype.S) ]
+24
lib/html5_checker/datatype/dt_fetchpriority.mli
···11+(** Fetch priority attribute datatype validator.
22+33+ This module provides a validator for the fetchpriority attribute used to
44+ provide a hint for resource fetch priority, as defined by the HTML5 spec. *)
55+66+(** Fetchpriority attribute validator.
77+88+ Validates fetchpriority attribute values which can be:
99+ - "" (empty string, default fetch priority)
1010+ - "high" (fetch at high priority relative to other resources)
1111+ - "low" (fetch at low priority relative to other resources)
1212+ - "auto" (no preference for fetch priority)
1313+1414+ Values are case-insensitive after ASCII lowercasing.
1515+1616+ Examples:
1717+ - ""
1818+ - "high"
1919+ - "low"
2020+ - "auto" *)
2121+module Fetchpriority : Datatype.S
2222+2323+(** List of all datatypes defined in this module *)
2424+val datatypes : Datatype.t list
+254
lib/html5_checker/datatype/dt_float.ml
···11+(** Valid HTML5 floating point number *)
22+module Float_ = struct
33+ let name = "floating point number"
44+55+ type state =
66+ | At_start
77+ | At_start_minus_seen
88+ | In_integer_part_digits_seen
99+ | Dot_seen
1010+ | E_seen
1111+ | In_decimal_part_digits_seen
1212+ | In_exponent_sign_seen
1313+ | In_exponent_digits_seen
1414+1515+ let validate s =
1616+ let len = String.length s in
1717+ let rec parse i state =
1818+ if i >= len then
1919+ match state with
2020+ | In_integer_part_digits_seen | In_decimal_part_digits_seen
2121+ | In_exponent_digits_seen ->
2222+ Ok ()
2323+ | At_start -> Error "The empty string is not a valid floating point number."
2424+ | At_start_minus_seen ->
2525+ Error "The minus sign alone is not a valid floating point number."
2626+ | Dot_seen ->
2727+ Error "A floating point number must not end with the decimal point."
2828+ | E_seen ->
2929+ Error "A floating point number must not end with the exponent 'e'."
3030+ | In_exponent_sign_seen ->
3131+ Error
3232+ "A floating point number must not end with only a sign in the \
3333+ exponent."
3434+ else
3535+ let c = s.[i] in
3636+ match state with
3737+ | At_start ->
3838+ if c = '-' then parse (i + 1) At_start_minus_seen
3939+ else if c = '.' then parse (i + 1) Dot_seen
4040+ else if Datatype.is_ascii_digit c then
4141+ parse (i + 1) In_integer_part_digits_seen
4242+ else
4343+ Error
4444+ (Printf.sprintf
4545+ "Expected a minus sign or a digit but saw '%c' instead." c)
4646+ | At_start_minus_seen ->
4747+ if Datatype.is_ascii_digit c then
4848+ parse (i + 1) In_integer_part_digits_seen
4949+ else Error (Printf.sprintf "Expected a digit but saw '%c' instead." c)
5050+ | In_integer_part_digits_seen ->
5151+ if c = '.' then parse (i + 1) Dot_seen
5252+ else if c = 'e' || c = 'E' then parse (i + 1) E_seen
5353+ else if Datatype.is_ascii_digit c then
5454+ parse (i + 1) In_integer_part_digits_seen
5555+ else
5656+ Error
5757+ (Printf.sprintf
5858+ "Expected a decimal point, 'e', 'E' or a digit but saw '%c' \
5959+ instead."
6060+ c)
6161+ | Dot_seen ->
6262+ if Datatype.is_ascii_digit c then
6363+ parse (i + 1) In_decimal_part_digits_seen
6464+ else
6565+ Error
6666+ (Printf.sprintf
6767+ "Expected a digit after the decimal point but saw '%c' instead."
6868+ c)
6969+ | In_decimal_part_digits_seen ->
7070+ if Datatype.is_ascii_digit c then
7171+ parse (i + 1) In_decimal_part_digits_seen
7272+ else if c = 'e' || c = 'E' then parse (i + 1) E_seen
7373+ else
7474+ Error
7575+ (Printf.sprintf "Expected 'e', 'E' or a digit but saw '%c' instead."
7676+ c)
7777+ | E_seen ->
7878+ if c = '-' || c = '+' then parse (i + 1) In_exponent_sign_seen
7979+ else if Datatype.is_ascii_digit c then
8080+ parse (i + 1) In_exponent_digits_seen
8181+ else
8282+ Error
8383+ (Printf.sprintf
8484+ "Expected a minus sign, a plus sign or a digit but saw '%c' \
8585+ instead."
8686+ c)
8787+ | In_exponent_sign_seen ->
8888+ if Datatype.is_ascii_digit c then
8989+ parse (i + 1) In_exponent_digits_seen
9090+ else Error (Printf.sprintf "Expected a digit but saw '%c' instead." c)
9191+ | In_exponent_digits_seen ->
9292+ if Datatype.is_ascii_digit c then
9393+ parse (i + 1) In_exponent_digits_seen
9494+ else Error (Printf.sprintf "Expected a digit but saw '%c' instead." c)
9595+ in
9696+ parse 0 At_start
9797+9898+ let is_valid s = Result.is_ok (validate s)
9999+end
100100+101101+(** Non-negative floating point number (>= 0) *)
102102+module Float_non_negative = struct
103103+ let name = "non-negative floating point number"
104104+105105+ type state =
106106+ | At_start
107107+ | At_start_minus_seen
108108+ | In_integer_part_digits_seen
109109+ | In_integer_part_digits_seen_zero
110110+ | Dot_seen
111111+ | Dot_seen_zero
112112+ | E_seen
113113+ | In_decimal_part_digits_seen
114114+ | In_decimal_part_digits_seen_zero
115115+ | In_exponent_sign_seen
116116+ | In_exponent_digits_seen
117117+118118+ let validate s =
119119+ let len = String.length s in
120120+ let rec parse i state =
121121+ if i >= len then
122122+ match state with
123123+ | In_integer_part_digits_seen | In_decimal_part_digits_seen
124124+ | In_integer_part_digits_seen_zero | In_decimal_part_digits_seen_zero
125125+ | In_exponent_digits_seen ->
126126+ Ok ()
127127+ | At_start ->
128128+ Error "The empty string is not a valid non-negative floating point number."
129129+ | At_start_minus_seen ->
130130+ Error
131131+ "The minus sign alone is not a valid non-negative floating point \
132132+ number."
133133+ | Dot_seen | Dot_seen_zero ->
134134+ Error
135135+ "A non-negative floating point number must not end with the \
136136+ decimal point."
137137+ | E_seen ->
138138+ Error
139139+ "A non-negative floating point number must not end with the \
140140+ exponent 'e'."
141141+ | In_exponent_sign_seen ->
142142+ Error
143143+ "A non-negative floating point number must not end with only a \
144144+ sign in the exponent."
145145+ else
146146+ let c = s.[i] in
147147+ match state with
148148+ | At_start ->
149149+ if c = '-' then parse (i + 1) At_start_minus_seen
150150+ else if c = '.' then parse (i + 1) Dot_seen
151151+ else if Datatype.is_ascii_digit c then
152152+ parse (i + 1) In_integer_part_digits_seen
153153+ else
154154+ Error
155155+ (Printf.sprintf
156156+ "Expected a minus sign or a digit but saw '%c' instead." c)
157157+ | At_start_minus_seen ->
158158+ if c = '0' then parse (i + 1) In_integer_part_digits_seen_zero
159159+ else Error (Printf.sprintf "Expected a zero but saw '%c' instead." c)
160160+ | In_integer_part_digits_seen ->
161161+ if c = '.' then parse (i + 1) Dot_seen
162162+ else if c = 'e' || c = 'E' then parse (i + 1) E_seen
163163+ else if Datatype.is_ascii_digit c then
164164+ parse (i + 1) In_integer_part_digits_seen
165165+ else
166166+ Error
167167+ (Printf.sprintf
168168+ "Expected a decimal point, 'e', 'E' or a digit but saw '%c' \
169169+ instead."
170170+ c)
171171+ | In_integer_part_digits_seen_zero ->
172172+ if c = '.' then parse (i + 1) Dot_seen_zero
173173+ else if c = 'e' || c = 'E' then parse (i + 1) E_seen
174174+ else if c = '0' then parse (i + 1) In_integer_part_digits_seen_zero
175175+ else
176176+ Error
177177+ (Printf.sprintf
178178+ "Expected a decimal point, 'e', 'E' or a zero but saw '%c' \
179179+ instead."
180180+ c)
181181+ | Dot_seen ->
182182+ if Datatype.is_ascii_digit c then
183183+ parse (i + 1) In_decimal_part_digits_seen
184184+ else
185185+ Error
186186+ (Printf.sprintf
187187+ "Expected a digit after the decimal point but saw '%c' instead."
188188+ c)
189189+ | Dot_seen_zero ->
190190+ if c = '0' then parse (i + 1) In_decimal_part_digits_seen_zero
191191+ else
192192+ Error
193193+ (Printf.sprintf
194194+ "Expected a zero after the decimal point but saw '%c' instead."
195195+ c)
196196+ | In_decimal_part_digits_seen ->
197197+ if Datatype.is_ascii_digit c then
198198+ parse (i + 1) In_decimal_part_digits_seen
199199+ else if c = 'e' || c = 'E' then parse (i + 1) E_seen
200200+ else
201201+ Error
202202+ (Printf.sprintf "Expected 'e', 'E' or a digit but saw '%c' instead."
203203+ c)
204204+ | In_decimal_part_digits_seen_zero ->
205205+ if c = '0' then parse (i + 1) In_decimal_part_digits_seen_zero
206206+ else if c = 'e' || c = 'E' then parse (i + 1) E_seen
207207+ else
208208+ Error
209209+ (Printf.sprintf "Expected 'e', 'E' or a zero but saw '%c' instead."
210210+ c)
211211+ | E_seen ->
212212+ if c = '-' || c = '+' then parse (i + 1) In_exponent_sign_seen
213213+ else if Datatype.is_ascii_digit c then
214214+ parse (i + 1) In_exponent_digits_seen
215215+ else
216216+ Error
217217+ (Printf.sprintf
218218+ "Expected a minus sign, a plus sign or a digit but saw '%c' \
219219+ instead."
220220+ c)
221221+ | In_exponent_sign_seen ->
222222+ if Datatype.is_ascii_digit c then
223223+ parse (i + 1) In_exponent_digits_seen
224224+ else Error (Printf.sprintf "Expected a digit but saw '%c' instead." c)
225225+ | In_exponent_digits_seen ->
226226+ if Datatype.is_ascii_digit c then
227227+ parse (i + 1) In_exponent_digits_seen
228228+ else Error (Printf.sprintf "Expected a digit but saw '%c' instead." c)
229229+ in
230230+ parse 0 At_start
231231+232232+ let is_valid s = Result.is_ok (validate s)
233233+end
234234+235235+(** Positive floating point number (> 0) *)
236236+module Float_positive = struct
237237+ let name = "positive floating point number"
238238+239239+ (* For positive floats, we validate it's a valid non-negative float,
240240+ then check it's not zero *)
241241+ let validate s =
242242+ match Float_non_negative.validate s with
243243+ | Error _ as e -> e
244244+ | Ok () -> (
245245+ (* Parse as float and check if it's positive *)
246246+ try
247247+ let f = float_of_string s in
248248+ if f > 0.0 then Ok ()
249249+ else Error "The value must be a positive floating point number."
250250+ with Failure _ ->
251251+ Error "Invalid floating point number format.")
252252+253253+ let is_valid s = Result.is_ok (validate s)
254254+end
+16
lib/html5_checker/datatype/dt_float.mli
···11+(** Floating point datatype validators for HTML5 *)
22+33+(** Valid HTML5 floating point number *)
44+module Float_ : sig
55+ include Datatype.S
66+end
77+88+(** Non-negative floating point number (>= 0) *)
99+module Float_non_negative : sig
1010+ include Datatype.S
1111+end
1212+1313+(** Positive floating point number (> 0) *)
1414+module Float_positive : sig
1515+ include Datatype.S
1616+end
+28
lib/html5_checker/datatype/dt_form_enctype.ml
···11+(** Form encoding type attribute validation based on HTML5 spec *)
22+33+(** Valid form enctype values *)
44+let valid_enctypes =
55+ [
66+ "application/x-www-form-urlencoded";
77+ "multipart/form-data";
88+ "text/plain";
99+ ]
1010+1111+module Form_enctype = struct
1212+ let name = "form-enctype"
1313+1414+ let validate s =
1515+ let s_lower = Datatype.string_to_ascii_lowercase s in
1616+ if List.mem s_lower valid_enctypes then Ok ()
1717+ else
1818+ Error
1919+ (Printf.sprintf
2020+ "The value '%s' is not a valid form encoding type. Expected one of: \
2121+ %s."
2222+ s
2323+ (String.concat ", " valid_enctypes))
2424+2525+ let is_valid s = Result.is_ok (validate s)
2626+end
2727+2828+let datatypes = [ (module Form_enctype : Datatype.S) ]
+22
lib/html5_checker/datatype/dt_form_enctype.mli
···11+(** Form encoding type attribute datatype validator.
22+33+ This module provides a validator for the enctype and formenctype attributes
44+ used on form and input/button elements, as defined by the HTML5 specification. *)
55+66+(** Form encoding type attribute validator.
77+88+ Validates form enctype/formenctype attribute values which can be:
99+ - application/x-www-form-urlencoded - Default encoding (form fields as name=value pairs)
1010+ - multipart/form-data - Multipart encoding (required for file uploads)
1111+ - text/plain - Plain text encoding (mostly for debugging)
1212+1313+ Values are matched case-insensitively according to HTML5 spec.
1414+1515+ Examples:
1616+ - "application/x-www-form-urlencoded"
1717+ - "multipart/form-data"
1818+ - "text/plain" *)
1919+module Form_enctype : Datatype.S
2020+2121+(** List of all datatypes defined in this module *)
2222+val datatypes : Datatype.t list
+22
lib/html5_checker/datatype/dt_form_method.ml
···11+(** Form method attribute validation based on HTML5 spec *)
22+33+(** Valid form method values *)
44+let valid_methods = [ "get"; "post"; "dialog" ]
55+66+module Form_method = struct
77+ let name = "form-method"
88+99+ let validate s =
1010+ let s_lower = Datatype.string_to_ascii_lowercase s in
1111+ if List.mem s_lower valid_methods then Ok ()
1212+ else
1313+ Error
1414+ (Printf.sprintf
1515+ "The value '%s' is not a valid form method. Expected one of: %s."
1616+ s
1717+ (String.concat ", " valid_methods))
1818+1919+ let is_valid s = Result.is_ok (validate s)
2020+end
2121+2222+let datatypes = [ (module Form_method : Datatype.S) ]
+22
lib/html5_checker/datatype/dt_form_method.mli
···11+(** Form method attribute datatype validator.
22+33+ This module provides a validator for the method attribute used on
44+ form elements, as defined by the HTML5 specification. *)
55+66+(** Form method attribute validator.
77+88+ Validates form method attribute values which can be:
99+ - get - GET method (form data submitted in URL query string)
1010+ - post - POST method (form data submitted in request body)
1111+ - dialog - Dialog method (closes the dialog containing the form)
1212+1313+ Values are matched case-insensitively according to HTML5 spec.
1414+1515+ Examples:
1616+ - "get"
1717+ - "post"
1818+ - "dialog" *)
1919+module Form_method : Datatype.S
2020+2121+(** List of all datatypes defined in this module *)
2222+val datatypes : Datatype.t list
+29
lib/html5_checker/datatype/dt_hash.ml
···11+(** Hash-name and fragment identifier datatype validators for HTML5. *)
22+33+module Hash_name : Datatype.S = struct
44+ let name = "hash-name reference"
55+66+ let validate s =
77+ let len = String.length s in
88+ if len = 0 then Error "The empty string is not a valid hash-name reference."
99+ else if s.[0] <> '#' then
1010+ Error "A hash-name reference must start with \"#\"."
1111+ else if len = 1 then
1212+ Error "A hash-name reference must have at least one character after \"#\"."
1313+ else Ok ()
1414+1515+ let is_valid s = Result.is_ok (validate s)
1616+end
1717+1818+module Hash_or_empty : Datatype.S = struct
1919+ let name = "hash-name reference (potentially empty)"
2020+2121+ let validate s =
2222+ if String.length s = 0 then Ok ()
2323+ else Hash_name.validate s
2424+2525+ let is_valid s = Result.is_ok (validate s)
2626+end
2727+2828+let datatypes =
2929+ [ (module Hash_name : Datatype.S); (module Hash_or_empty : Datatype.S) ]
+24
lib/html5_checker/datatype/dt_hash.mli
···11+(** Hash-name and fragment identifier datatype validators for HTML5.
22+33+ This module provides validators for fragment identifiers (hash-name
44+ references) used in URLs to reference specific parts of a document. *)
55+66+(** Hash-name reference validator.
77+88+ A hash-name reference is a fragment identifier that starts with '#'
99+ followed by one or more characters.
1010+1111+ Requirements:
1212+ - Must not be empty
1313+ - Must start with '#'
1414+ - Must have at least one character after '#' *)
1515+module Hash_name : Datatype.S
1616+1717+(** Hash-name or empty validator.
1818+1919+ Same as Hash_name but allows empty strings. This is used for attributes
2020+ where the hash-name reference is optional. *)
2121+module Hash_or_empty : Datatype.S
2222+2323+(** List of all hash-related datatypes for registration. *)
2424+val datatypes : Datatype.t list
+20
lib/html5_checker/datatype/dt_hidden.ml
···11+(** Hidden attribute validation for HTML5 *)
22+33+module Hidden = struct
44+ let name = "hidden"
55+66+ let validate s =
77+ let s_lower = Datatype.string_to_ascii_lowercase s in
88+ match s_lower with
99+ | "" | "hidden" | "until-found" -> Ok ()
1010+ | _ ->
1111+ Error
1212+ (Printf.sprintf
1313+ "The value '%s' is not a valid hidden value. Expected 'hidden', \
1414+ 'until-found', or empty string."
1515+ s)
1616+1717+ let is_valid s = Result.is_ok (validate s)
1818+end
1919+2020+let datatypes = [ (module Hidden : Datatype.S) ]
+23
lib/html5_checker/datatype/dt_hidden.mli
···11+(** Hidden attribute datatype validator for HTML5.
22+33+ This module provides a validator for the hidden attribute, as defined by
44+ the HTML5 specification. *)
55+66+(** Hidden attribute validator.
77+88+ Validates hidden attribute values which can be:
99+ - "" (empty string) - the element is hidden
1010+ - "hidden" - the element is hidden
1111+ - "until-found" - the element is hidden until found by a find-in-page or
1212+ fragment navigation
1313+1414+ Values are case-insensitive.
1515+1616+ Examples:
1717+ - ""
1818+ - "hidden"
1919+ - "until-found" *)
2020+module Hidden : Datatype.S
2121+2222+(** List of all datatypes defined in this module *)
2323+val datatypes : Datatype.t list
+54
lib/html5_checker/datatype/dt_id.ml
···11+(** ID-related datatype validators for HTML5. *)
22+33+module Id : Datatype.S = struct
44+ let name = "id"
55+66+ let validate s =
77+ let len = String.length s in
88+ if len = 0 then Error "An ID must not be the empty string."
99+ else
1010+ match String.index_opt s ' ' with
1111+ | Some _ -> Error "An ID must not contain whitespace."
1212+ | None -> (
1313+ (* Check for other whitespace characters *)
1414+ let rec check_whitespace i =
1515+ if i >= len then Ok ()
1616+ else if Datatype.is_whitespace s.[i] then
1717+ Error "An ID must not contain whitespace."
1818+ else check_whitespace (i + 1)
1919+ in
2020+ check_whitespace 0)
2121+2222+ let is_valid s = Result.is_ok (validate s)
2323+end
2424+2525+module Idref : Datatype.S = struct
2626+ let name = "id reference"
2727+2828+ (* An IDREF has the same validation rules as an ID *)
2929+ let validate = Id.validate
3030+ let is_valid s = Result.is_ok (validate s)
3131+end
3232+3333+module Idrefs : Datatype.S = struct
3434+ let name = "id references"
3535+3636+ let validate s =
3737+ (* IDREFS must contain at least one non-whitespace character *)
3838+ let len = String.length s in
3939+ let rec check_non_whitespace i =
4040+ if i >= len then
4141+ Error "An IDREFS value must contain at least one non-whitespace character."
4242+ else if not (Datatype.is_whitespace s.[i]) then Ok ()
4343+ else check_non_whitespace (i + 1)
4444+ in
4545+ check_non_whitespace 0
4646+4747+ let is_valid s = Result.is_ok (validate s)
4848+end
4949+5050+let datatypes =
5151+ [ (module Id : Datatype.S)
5252+ ; (module Idref : Datatype.S)
5353+ ; (module Idrefs : Datatype.S)
5454+ ]
+35
lib/html5_checker/datatype/dt_id.mli
···11+(** ID-related datatype validators for HTML5.
22+33+ This module provides validators for HTML5 ID attributes and ID references
44+ based on the Nu HTML Checker's implementation. *)
55+66+(** ID validator.
77+88+ Accepts any string that consists of one or more characters and does not
99+ contain any whitespace characters.
1010+1111+ An ID must be:
1212+ - Non-empty
1313+ - Contain no whitespace characters (space, tab, LF, FF, CR) *)
1414+module Id : Datatype.S
1515+1616+(** ID reference validator.
1717+1818+ An IDREF has the same validation rules as an ID - it must be non-empty
1919+ and contain no whitespace. The semantic difference is that an IDREF
2020+ references an existing ID rather than defining one. *)
2121+module Idref : Datatype.S
2222+2323+(** ID references validator.
2424+2525+ Accepts a space-separated list of ID references. The value must contain
2626+ at least one non-whitespace character.
2727+2828+ IDREFS values:
2929+ - Must not be empty or contain only whitespace
3030+ - Can contain multiple space-separated ID references
3131+ - Each individual reference follows IDREF rules *)
3232+module Idrefs : Datatype.S
3333+3434+(** List of all ID-related datatypes for registration. *)
3535+val datatypes : Datatype.t list
+46
lib/html5_checker/datatype/dt_input_type.ml
···11+(** Input type attribute validation based on HTML5 spec *)
22+33+(** Valid input type values *)
44+let valid_types =
55+ [
66+ "hidden";
77+ "text";
88+ "search";
99+ "tel";
1010+ "url";
1111+ "email";
1212+ "password";
1313+ "date";
1414+ "month";
1515+ "week";
1616+ "time";
1717+ "datetime-local";
1818+ "number";
1919+ "range";
2020+ "color";
2121+ "checkbox";
2222+ "radio";
2323+ "file";
2424+ "submit";
2525+ "image";
2626+ "reset";
2727+ "button";
2828+ ]
2929+3030+module Input_type = struct
3131+ let name = "input-type"
3232+3333+ let validate s =
3434+ let s_lower = Datatype.string_to_ascii_lowercase s in
3535+ if List.mem s_lower valid_types then Ok ()
3636+ else
3737+ Error
3838+ (Printf.sprintf
3939+ "The value '%s' is not a valid input type. Expected one of: %s."
4040+ s
4141+ (String.concat ", " valid_types))
4242+4343+ let is_valid s = Result.is_ok (validate s)
4444+end
4545+4646+let datatypes = [ (module Input_type : Datatype.S) ]
+42
lib/html5_checker/datatype/dt_input_type.mli
···11+(** Input type attribute datatype validator.
22+33+ This module provides a validator for the type attribute used on
44+ input elements, as defined by the HTML5 specification. *)
55+66+(** Input type attribute validator.
77+88+ Validates input type attribute values which can be:
99+ - hidden - Hidden input field
1010+ - text - Single-line text field
1111+ - search - Search input field
1212+ - tel - Telephone number input field
1313+ - url - URL input field
1414+ - email - Email address input field
1515+ - password - Password input field
1616+ - date - Date input field (year, month, day)
1717+ - month - Month input field (year, month)
1818+ - week - Week input field (year, week)
1919+ - time - Time input field (hour, minute, seconds, fractional seconds)
2020+ - datetime-local - Local date and time input field
2121+ - number - Numeric input field
2222+ - range - Range control (slider)
2323+ - color - Color picker
2424+ - checkbox - Checkbox
2525+ - radio - Radio button
2626+ - file - File upload control
2727+ - submit - Submit button
2828+ - image - Image submit button
2929+ - reset - Reset button
3030+ - button - Push button
3131+3232+ Values are matched case-insensitively according to HTML5 spec.
3333+3434+ Examples:
3535+ - "text"
3636+ - "email"
3737+ - "datetime-local"
3838+ - "submit" *)
3939+module Input_type : Datatype.S
4040+4141+(** List of all datatypes defined in this module *)
4242+val datatypes : Datatype.t list
+23
lib/html5_checker/datatype/dt_inputmode.ml
···11+(** Input mode attribute validation for HTML5 *)
22+33+module Inputmode = struct
44+ let name = "inputmode"
55+66+ let validate s =
77+ let s_lower = Datatype.string_to_ascii_lowercase s in
88+ match s_lower with
99+ | "" | "none" | "text" | "decimal" | "numeric" | "tel" | "search"
1010+ | "email" | "url" ->
1111+ Ok ()
1212+ | _ ->
1313+ Error
1414+ (Printf.sprintf
1515+ "The value '%s' is not a valid inputmode value. Expected one \
1616+ of: empty string, 'none', 'text', 'decimal', 'numeric', 'tel', \
1717+ 'search', 'email', or 'url'."
1818+ s)
1919+2020+ let is_valid s = Result.is_ok (validate s)
2121+end
2222+2323+let datatypes = [ (module Inputmode : Datatype.S) ]
+29
lib/html5_checker/datatype/dt_inputmode.mli
···11+(** Input mode attribute datatype validator.
22+33+ This module provides a validator for the inputmode attribute used to hint
44+ at the type of data the user might enter, as defined by the HTML5 spec. *)
55+66+(** Inputmode attribute validator.
77+88+ Validates inputmode attribute values which can be:
99+ - "" (empty string, no specific input mode)
1010+ - "none" (no virtual keyboard)
1111+ - "text" (standard text input)
1212+ - "decimal" (decimal numeric input with locale-appropriate format)
1313+ - "numeric" (numeric input)
1414+ - "tel" (telephone number input)
1515+ - "search" (search input)
1616+ - "email" (email address input)
1717+ - "url" (URL input)
1818+1919+ Values are case-insensitive after ASCII lowercasing.
2020+2121+ Examples:
2222+ - ""
2323+ - "numeric"
2424+ - "email"
2525+ - "tel" *)
2626+module Inputmode : Datatype.S
2727+2828+(** List of all datatypes defined in this module *)
2929+val datatypes : Datatype.t list
+71
lib/html5_checker/datatype/dt_integer.ml
···11+(** Valid HTML5 integer (optional sign followed by digits) *)
22+module Integer = struct
33+ let name = "integer"
44+55+ let validate s =
66+ let len = String.length s in
77+ if len = 0 then Error "The empty string is not a valid integer."
88+ else
99+ let start_pos =
1010+ if s.[0] = '-' then
1111+ if len = 1 then failwith "unreachable"
1212+ else 1
1313+ else 0
1414+ in
1515+ (* First character must be minus or digit *)
1616+ if start_pos = 0 && not (Datatype.is_ascii_digit s.[0]) then
1717+ Error
1818+ (Printf.sprintf "Expected a minus sign or a digit but saw '%c' instead."
1919+ s.[0])
2020+ else
2121+ (* Rest must be digits *)
2222+ let rec check_digits i =
2323+ if i >= len then Ok ()
2424+ else if Datatype.is_ascii_digit s.[i] then check_digits (i + 1)
2525+ else
2626+ Error (Printf.sprintf "Expected a digit but saw '%c' instead." s.[i])
2727+ in
2828+ check_digits start_pos
2929+3030+ let is_valid s = Result.is_ok (validate s)
3131+end
3232+3333+(** Non-negative integer (>= 0) *)
3434+module Integer_non_negative = struct
3535+ let name = "non-negative integer"
3636+3737+ let validate s =
3838+ let len = String.length s in
3939+ if len = 0 then Error "The empty string is not a valid non-negative integer."
4040+ else
4141+ (* All characters must be digits *)
4242+ let rec check_digits i =
4343+ if i >= len then Ok ()
4444+ else if Datatype.is_ascii_digit s.[i] then check_digits (i + 1)
4545+ else Error (Printf.sprintf "Expected a digit but saw '%c' instead." s.[i])
4646+ in
4747+ check_digits 0
4848+4949+ let is_valid s = Result.is_ok (validate s)
5050+end
5151+5252+(** Positive integer (> 0) *)
5353+module Integer_positive = struct
5454+ let name = "positive integer"
5555+5656+ let validate s =
5757+ let len = String.length s in
5858+ if len = 0 then Error "The empty string is not a valid positive integer."
5959+ else
6060+ (* All characters must be digits *)
6161+ let rec check_digits i all_zeros =
6262+ if i >= len then
6363+ if all_zeros then Error "Zero is not a positive integer." else Ok ()
6464+ else if Datatype.is_ascii_digit s.[i] then
6565+ check_digits (i + 1) (all_zeros && s.[i] = '0')
6666+ else Error (Printf.sprintf "Expected a digit but saw '%c' instead." s.[i])
6767+ in
6868+ check_digits 0 true
6969+7070+ let is_valid s = Result.is_ok (validate s)
7171+end
+16
lib/html5_checker/datatype/dt_integer.mli
···11+(** Integer datatype validators for HTML5 *)
22+33+(** Valid HTML5 integer (optional sign followed by digits) *)
44+module Integer : sig
55+ include Datatype.S
66+end
77+88+(** Non-negative integer (>= 0) *)
99+module Integer_non_negative : sig
1010+ include Datatype.S
1111+end
1212+1313+(** Positive integer (> 0) *)
1414+module Integer_positive : sig
1515+ include Datatype.S
1616+end
+103
lib/html5_checker/datatype/dt_integrity.ml
···11+(** Subresource integrity attribute validation *)
22+33+(** Valid hash algorithms *)
44+let valid_algorithms = [ "sha256"; "sha384"; "sha512" ]
55+66+(** Check if character is valid base64 character *)
77+let is_base64_char c =
88+ (c >= 'A' && c <= 'Z')
99+ || (c >= 'a' && c <= 'z')
1010+ || (c >= '0' && c <= '9')
1111+ || c = '+' || c = '/' || c = '='
1212+1313+(** Validate base64 encoding *)
1414+let validate_base64 s =
1515+ if String.length s = 0 then false
1616+ else
1717+ (* Check all characters are valid base64 *)
1818+ let all_valid = ref true in
1919+ for i = 0 to String.length s - 1 do
2020+ if not (is_base64_char s.[i]) then all_valid := false
2121+ done;
2222+ if not !all_valid then false
2323+ else
2424+ (* Check padding is at the end only *)
2525+ let has_padding = String.contains s '=' in
2626+ if not has_padding then true
2727+ else
2828+ (* Find first '=' *)
2929+ let first_eq = String.index s '=' in
3030+ (* All chars after first '=' must be '=' *)
3131+ let valid_padding = ref true in
3232+ for i = first_eq to String.length s - 1 do
3333+ if s.[i] <> '=' then valid_padding := false
3434+ done;
3535+ !valid_padding
3636+ && (* At most 2 padding characters *)
3737+ String.length s - first_eq <= 2
3838+3939+(** Validate a single hash value *)
4040+let validate_hash_value s =
4141+ let trimmed = Datatype.trim_html_spaces s in
4242+ if trimmed = "" then Error "Hash value must not be empty"
4343+ else
4444+ (* Split on '-' to get algorithm and hash *)
4545+ match String.index_opt trimmed '-' with
4646+ | None ->
4747+ Error
4848+ (Printf.sprintf
4949+ "Hash value '%s' must be in format 'algorithm-base64hash'" trimmed)
5050+ | Some dash_pos ->
5151+ let algorithm = String.sub trimmed 0 dash_pos in
5252+ let algorithm_lower = Datatype.string_to_ascii_lowercase algorithm in
5353+ if not (List.mem algorithm_lower valid_algorithms) then
5454+ Error
5555+ (Printf.sprintf
5656+ "Hash algorithm '%s' is not supported. Must be one of: %s"
5757+ algorithm (String.concat ", " valid_algorithms))
5858+ else
5959+ let rest = String.sub trimmed (dash_pos + 1) (String.length trimmed - dash_pos - 1) in
6060+ (* Split on '?' to separate hash from options *)
6161+ let hash_part =
6262+ match String.index_opt rest '?' with
6363+ | None -> rest
6464+ | Some q_pos -> String.sub rest 0 q_pos
6565+ in
6666+ if String.length hash_part = 0 then
6767+ Error "Hash value after algorithm must not be empty"
6868+ else if not (validate_base64 hash_part) then
6969+ Error
7070+ (Printf.sprintf
7171+ "Hash value '%s' is not valid base64 encoding" hash_part)
7272+ else Ok ()
7373+7474+(** Validate integrity attribute value *)
7575+let validate_integrity s =
7676+ let trimmed = Datatype.trim_html_spaces s in
7777+ if trimmed = "" then Error "Integrity attribute must not be empty"
7878+ else
7979+ (* Split on whitespace *)
8080+ let hash_values = String.split_on_char ' ' trimmed in
8181+ let hash_values =
8282+ List.filter (fun h -> Datatype.trim_html_spaces h <> "") hash_values
8383+ in
8484+ if hash_values = [] then
8585+ Error "Integrity attribute must contain at least one hash value"
8686+ else
8787+ (* Validate each hash value *)
8888+ let rec check_hashes = function
8989+ | [] -> Ok ()
9090+ | h :: rest -> (
9191+ match validate_hash_value h with
9292+ | Error e -> Error e
9393+ | Ok () -> check_hashes rest)
9494+ in
9595+ check_hashes hash_values
9696+9797+module Integrity = struct
9898+ let name = "integrity"
9999+ let validate = validate_integrity
100100+ let is_valid s = Result.is_ok (validate s)
101101+end
102102+103103+let datatypes = [ (module Integrity : Datatype.S) ]
+27
lib/html5_checker/datatype/dt_integrity.mli
···11+(** Subresource integrity attribute validator.
22+33+ This module provides a validator for the integrity attribute used on
44+ script and link elements for subresource integrity checks, as defined
55+ by the W3C Subresource Integrity specification. *)
66+77+(** Integrity attribute validator.
88+99+ Validates integrity attribute values which contain space-separated hash
1010+ values. Each hash value consists of:
1111+ - An algorithm identifier (sha256, sha384, or sha512)
1212+ - A hyphen (-)
1313+ - The base64-encoded hash value
1414+ - Optional options preceded by '?'
1515+1616+ Examples:
1717+ - "sha256-abc123..."
1818+ - "sha384-xyz789..."
1919+ - "sha256-abc123... sha512-def456..."
2020+ - "sha256-abc123...?ct=application/javascript"
2121+2222+ The base64 encoding must be valid and the algorithm must be one of the
2323+ supported hash functions. *)
2424+module Integrity : Datatype.S
2525+2626+(** List of all datatypes defined in this module *)
2727+val datatypes : Datatype.t list
+20
lib/html5_checker/datatype/dt_kind.ml
···11+(** Kind attribute validation for HTML5 *)
22+33+module Kind = struct
44+ let name = "kind"
55+66+ let validate s =
77+ let s_lower = Datatype.string_to_ascii_lowercase s in
88+ match s_lower with
99+ | "subtitles" | "captions" | "descriptions" | "chapters" | "metadata" -> Ok ()
1010+ | _ ->
1111+ Error
1212+ (Printf.sprintf
1313+ "The value '%s' is not a valid kind. Expected 'subtitles', \
1414+ 'captions', 'descriptions', 'chapters', or 'metadata'."
1515+ s)
1616+1717+ let is_valid s = Result.is_ok (validate s)
1818+end
1919+2020+let datatypes = [ (module Kind : Datatype.S) ]
+29
lib/html5_checker/datatype/dt_kind.mli
···11+(** Kind attribute datatype validator for HTML5.
22+33+ This module provides a validator for the kind attribute used on track
44+ elements, as defined by the HTML5 specification. *)
55+66+(** Kind attribute validator.
77+88+ Validates kind attribute values which can be:
99+ - "subtitles" - Transcription or translation of the dialogue, suitable for
1010+ when the sound is available but not understood
1111+ - "captions" - Transcription or translation of the dialogue, sound effects,
1212+ relevant musical cues, and other relevant audio information, suitable for
1313+ when sound is unavailable or not clearly audible
1414+ - "descriptions" - Textual descriptions of the video component, suitable for
1515+ audio synthesis when the visual component is unavailable
1616+ - "chapters" - Chapter titles, for use in navigating the media resource
1717+ - "metadata" - Tracks intended for use from script, not displayed by the user
1818+ agent
1919+2020+ Values are case-insensitive.
2121+2222+ Examples:
2323+ - "subtitles"
2424+ - "captions"
2525+ - "descriptions" *)
2626+module Kind : Datatype.S
2727+2828+(** List of all datatypes defined in this module *)
2929+val datatypes : Datatype.t list
+110
lib/html5_checker/datatype/dt_language.ml
···11+(** Helper functions for language tag validation *)
22+33+let is_lower_alpha c = c >= 'a' && c <= 'z'
44+let is_upper_alpha c = c >= 'A' && c <= 'Z'
55+let is_alpha c = is_lower_alpha c || is_upper_alpha c
66+let is_digit c = c >= '0' && c <= '9'
77+let is_alphanumeric c = is_alpha c || is_digit c
88+99+let is_all_alpha s =
1010+ String.for_all is_alpha s
1111+1212+let _is_all_digits s =
1313+ String.for_all is_digit s
1414+1515+let is_all_alphanumeric s =
1616+ String.for_all is_alphanumeric s
1717+1818+let to_lower s =
1919+ String.lowercase_ascii s
2020+2121+(** Validate language tag structure according to BCP 47.
2222+ This is a simplified validator that checks structural validity
2323+ but does not validate against the IANA registry. *)
2424+let validate_language_structure s =
2525+ if String.length s = 0 then
2626+ Error "The empty string is not a valid language tag"
2727+ else if String.starts_with ~prefix:"-" s then
2828+ Error "Language tag must not start with HYPHEN-MINUS"
2929+ else if String.ends_with ~suffix:"-" s then
3030+ Error "Language tag must not end with HYPHEN-MINUS"
3131+ else
3232+ let subtags = String.split_on_char '-' s in
3333+3434+ (* Check for empty subtags and length constraints *)
3535+ let rec check_subtag_constraints = function
3636+ | [] -> Ok ()
3737+ | subtag :: rest ->
3838+ let len = String.length subtag in
3939+ if len = 0 then
4040+ Error "Zero-length subtag"
4141+ else if len > 8 then
4242+ Error "Subtags must not exceed 8 characters in length"
4343+ else
4444+ check_subtag_constraints rest
4545+ in
4646+4747+ match check_subtag_constraints subtags with
4848+ | Error e -> Error e
4949+ | Ok () ->
5050+ (* Primary language subtag validation *)
5151+ match subtags with
5252+ | [] -> Error "Language tag must have at least one subtag"
5353+ | first :: rest ->
5454+ let first_lower = to_lower first in
5555+ let len = String.length first_lower in
5656+5757+ (* Check for private use tag *)
5858+ if first_lower = "x" then
5959+ if rest = [] then
6060+ Error "No subtags in private use sequence"
6161+ else
6262+ (* Private use subtags must be 1-8 alphanumeric *)
6363+ let rec check_private_use = function
6464+ | [] -> Ok ()
6565+ | subtag :: rest ->
6666+ let subtag_lower = to_lower subtag in
6767+ if String.length subtag_lower < 1 then
6868+ Error "Private use subtag is too short"
6969+ else if not (is_all_alphanumeric subtag_lower) then
7070+ Error "Bad character in private use subtag"
7171+ else
7272+ check_private_use rest
7373+ in
7474+ check_private_use rest
7575+ (* Primary language: 2-3 letters (ISO 639) *)
7676+ else if (len = 2 || len = 3) && is_all_alpha first_lower then
7777+ Ok ()
7878+ (* Reserved: 4 letters *)
7979+ else if len = 4 && is_all_alpha first_lower then
8080+ Error "Found reserved language tag"
8181+ (* Registered: 5+ letters *)
8282+ else if len >= 5 && is_all_alpha first_lower then
8383+ Ok ()
8484+ else
8585+ Error "Invalid language subtag format"
8686+8787+module Language = struct
8888+ let name = "language tag"
8989+9090+ let validate s = validate_language_structure s
9191+9292+ let is_valid s = Result.is_ok (validate s)
9393+end
9494+9595+module Language_or_empty = struct
9696+ let name = "language tag or empty"
9797+9898+ let validate s =
9999+ if String.length s = 0 then
100100+ Ok ()
101101+ else
102102+ validate_language_structure s
103103+104104+ let is_valid s = Result.is_ok (validate s)
105105+end
106106+107107+let datatypes = [
108108+ (module Language : Datatype.S);
109109+ (module Language_or_empty : Datatype.S);
110110+]
+43
lib/html5_checker/datatype/dt_language.mli
···11+(** Language tag datatype validators for HTML5.
22+33+ This module provides validators for BCP 47 language tags as used in HTML5.
44+ Language tags identify natural languages and consist of subtags separated
55+ by hyphens, following the IETF BCP 47 standard. *)
66+77+(** Language tag datatype (BCP 47 format).
88+99+ Validates a language tag according to BCP 47. A language tag consists of:
1010+ - Primary language subtag (2-3 letters, or 5+ letters for registered languages)
1111+ - Optional extended language subtag (3 letters)
1212+ - Optional script subtag (4 letters)
1313+ - Optional region subtag (2 letters or 3 digits)
1414+ - Optional variant subtags (5-8 alphanumeric characters, or 4 starting with digit)
1515+ - Optional extension subtags (single letter + subtags)
1616+ - Optional private use subtags (starting with 'x-')
1717+1818+ Examples:
1919+ - "en" (English)
2020+ - "en-US" (US English)
2121+ - "zh-Hans" (Simplified Chinese)
2222+ - "zh-Hans-CN" (Simplified Chinese as used in China)
2323+2424+ The validator performs basic structural validation:
2525+ - Tag cannot be empty
2626+ - Tag cannot start or end with hyphen
2727+ - Subtags cannot be empty
2828+ - Subtags cannot exceed 8 characters (except for registered values)
2929+ - Primary language subtag must be 2-3 letters (ISO 639) or 5+ letters (registered)
3030+ - 4-letter primary subtags are reserved
3131+3232+ Note: This implementation does NOT validate against the IANA language
3333+ subtag registry. It only validates the structural format. *)
3434+module Language : Datatype.S
3535+3636+(** Language tag or empty string.
3737+3838+ Like Language but also accepts the empty string. This is used for cases
3939+ where lang="" is valid to indicate an unknown or unspecified language. *)
4040+module Language_or_empty : Datatype.S
4141+4242+(** List of all language datatypes *)
4343+val datatypes : Datatype.t list
+79
lib/html5_checker/datatype/dt_link_type.ml
···11+(** Link relationship type validation *)
22+33+(** Valid link relationship types *)
44+let valid_link_types =
55+ [
66+ "alternate";
77+ "author";
88+ "bookmark";
99+ "canonical";
1010+ "dns-prefetch";
1111+ "external";
1212+ "help";
1313+ "icon";
1414+ "license";
1515+ "manifest";
1616+ "modulepreload";
1717+ "next";
1818+ "nofollow";
1919+ "noopener";
2020+ "noreferrer";
2121+ "opener";
2222+ "pingback";
2323+ "preconnect";
2424+ "prefetch";
2525+ "preload";
2626+ "prerender";
2727+ "prev";
2828+ "search";
2929+ "stylesheet";
3030+ "tag";
3131+ ]
3232+3333+(** Validate a single link type *)
3434+let validate_link_type s =
3535+ let trimmed = Datatype.trim_html_spaces s in
3636+ if trimmed = "" then Error "Link type must not be empty"
3737+ else
3838+ let lower = Datatype.string_to_ascii_lowercase trimmed in
3939+ if List.mem lower valid_link_types then Ok ()
4040+ else
4141+ Error
4242+ (Printf.sprintf
4343+ "The value '%s' is not a valid link type. Valid link types are: %s"
4444+ s (String.concat ", " valid_link_types))
4545+4646+module Link_type = struct
4747+ let name = "link-type"
4848+ let validate = validate_link_type
4949+ let is_valid s = Result.is_ok (validate s)
5050+end
5151+5252+(** Validate space-separated link types *)
5353+let validate_link_types s =
5454+ let trimmed = Datatype.trim_html_spaces s in
5555+ if trimmed = "" then Error "Link types must not be empty"
5656+ else
5757+ (* Split on whitespace *)
5858+ let types = String.split_on_char ' ' trimmed in
5959+ let types = List.filter (fun t -> Datatype.trim_html_spaces t <> "") types in
6060+ if types = [] then Error "Link types must contain at least one link type"
6161+ else
6262+ (* Validate each link type *)
6363+ let rec check_types = function
6464+ | [] -> Ok ()
6565+ | t :: rest -> (
6666+ match validate_link_type t with
6767+ | Error e -> Error e
6868+ | Ok () -> check_types rest)
6969+ in
7070+ check_types types
7171+7272+module Link_types = struct
7373+ let name = "link-types"
7474+ let validate = validate_link_types
7575+ let is_valid s = Result.is_ok (validate s)
7676+end
7777+7878+let datatypes =
7979+ [ (module Link_type : Datatype.S); (module Link_types : Datatype.S) ]
+59
lib/html5_checker/datatype/dt_link_type.mli
···11+(** Link relationship type validators.
22+33+ This module provides validators for link relationship types used in rel
44+ attributes on a and link elements, as defined by the HTML5 specification. *)
55+66+(** Single link type validator.
77+88+ Validates a single link relationship type value. Valid link types include:
99+1010+ - alternate: Alternate representation of the current document
1111+ - author: Link to author information
1212+ - bookmark: Permanent URL for nearest ancestor article
1313+ - canonical: Preferred URL for the current document
1414+ - dns-prefetch: Hint that the browser should prefetch DNS for the target
1515+ - external: Link to a different website
1616+ - help: Link to context-sensitive help
1717+ - icon: Icon representing the current document
1818+ - license: Copyright license for the current document
1919+ - manifest: Web app manifest
2020+ - modulepreload: Preload a JavaScript module
2121+ - next: Next document in a sequence
2222+ - nofollow: Do not follow this link for ranking
2323+ - noopener: Do not grant window.opener access
2424+ - noreferrer: Do not send Referer header
2525+ - opener: Grant window.opener access
2626+ - pingback: Pingback server address
2727+ - preconnect: Hint to preconnect to target origin
2828+ - prefetch: Hint to prefetch the target resource
2929+ - preload: Preload a resource
3030+ - prerender: Hint to prerender the target page
3131+ - prev: Previous document in a sequence
3232+ - search: Link to search tool for the current document
3333+ - stylesheet: External stylesheet
3434+ - tag: Tag (keyword) for the current document
3535+3636+ Examples:
3737+ - "stylesheet"
3838+ - "icon"
3939+ - "preload"
4040+4141+ Link types are case-insensitive. *)
4242+module Link_type : Datatype.S
4343+4444+(** Space-separated link types validator.
4545+4646+ Validates space-separated link relationship types. Multiple link types
4747+ can be specified separated by ASCII whitespace.
4848+4949+ Examples:
5050+ - "stylesheet"
5151+ - "icon preload"
5252+ - "nofollow noopener noreferrer"
5353+5454+ Each token must be a valid link type. Duplicate link types are allowed
5555+ but not recommended. *)
5656+module Link_types : Datatype.S
5757+5858+(** List of all datatypes defined in this module *)
5959+val datatypes : Datatype.t list
+42
lib/html5_checker/datatype/dt_list_type.ml
···11+(** List type attribute validation based on HTML5 spec *)
22+33+(** Valid ol type values (case-sensitive) *)
44+let valid_ol_types = [ "1"; "a"; "A"; "i"; "I" ]
55+66+(** Valid ul type values (deprecated but still parsed) *)
77+let valid_ul_types = [ "disc"; "circle"; "square" ]
88+99+module Ol_type = struct
1010+ let name = "ol-type"
1111+1212+ let validate s =
1313+ (* Note: ol type is case-sensitive *)
1414+ if List.mem s valid_ol_types then Ok ()
1515+ else
1616+ Error
1717+ (Printf.sprintf
1818+ "The value '%s' is not a valid ol type. Expected one of: %s."
1919+ s
2020+ (String.concat ", " valid_ol_types))
2121+2222+ let is_valid s = Result.is_ok (validate s)
2323+end
2424+2525+module Ul_type = struct
2626+ let name = "ul-type"
2727+2828+ let validate s =
2929+ let s_lower = Datatype.string_to_ascii_lowercase s in
3030+ if List.mem s_lower valid_ul_types then Ok ()
3131+ else
3232+ Error
3333+ (Printf.sprintf
3434+ "The value '%s' is not a valid ul type. Expected one of: %s."
3535+ s
3636+ (String.concat ", " valid_ul_types))
3737+3838+ let is_valid s = Result.is_ok (validate s)
3939+end
4040+4141+let datatypes =
4242+ [ (module Ol_type : Datatype.S); (module Ul_type : Datatype.S) ]
+44
lib/html5_checker/datatype/dt_list_type.mli
···11+(** List type attribute datatype validators.
22+33+ This module provides validators for the type attribute used on
44+ ol and ul elements, as defined by the HTML5 specification. *)
55+66+(** Ordered list type attribute validator.
77+88+ Validates ol type attribute values which can be:
99+ - 1 - Decimal numbers (1, 2, 3, ...)
1010+ - a - Lowercase Latin letters (a, b, c, ...)
1111+ - A - Uppercase Latin letters (A, B, C, ...)
1212+ - i - Lowercase Roman numerals (i, ii, iii, ...)
1313+ - I - Uppercase Roman numerals (I, II, III, ...)
1414+1515+ Values are matched case-sensitively for ol type.
1616+1717+ Examples:
1818+ - "1"
1919+ - "a"
2020+ - "A"
2121+ - "i"
2222+ - "I" *)
2323+module Ol_type : Datatype.S
2424+2525+(** Unordered list type attribute validator.
2626+2727+ Validates ul type attribute values which can be:
2828+ - disc - Filled circle (default)
2929+ - circle - Hollow circle
3030+ - square - Filled square
3131+3232+ Note: The type attribute on ul is deprecated in HTML5 but may still
3333+ be parsed for backwards compatibility.
3434+3535+ Values are matched case-insensitively according to HTML5 spec.
3636+3737+ Examples:
3838+ - "disc"
3939+ - "circle"
4040+ - "square" *)
4141+module Ul_type : Datatype.S
4242+4343+(** List of all datatypes defined in this module *)
4444+val datatypes : Datatype.t list
+20
lib/html5_checker/datatype/dt_loading.ml
···11+(** Lazy loading attribute validation for HTML5 *)
22+33+module Loading = struct
44+ let name = "loading"
55+66+ let validate s =
77+ let s_lower = Datatype.string_to_ascii_lowercase s in
88+ match s_lower with
99+ | "" | "lazy" | "eager" -> Ok ()
1010+ | _ ->
1111+ Error
1212+ (Printf.sprintf
1313+ "The value '%s' is not a valid loading value. Expected empty \
1414+ string, 'lazy', or 'eager'."
1515+ s)
1616+1717+ let is_valid s = Result.is_ok (validate s)
1818+end
1919+2020+let datatypes = [ (module Loading : Datatype.S) ]
+22
lib/html5_checker/datatype/dt_loading.mli
···11+(** Lazy loading attribute datatype validator.
22+33+ This module provides a validator for the loading attribute used to control
44+ lazy loading behavior for images and iframes, as defined by the HTML5 spec. *)
55+66+(** Loading attribute validator.
77+88+ Validates loading attribute values which can be:
99+ - "" (empty string, default loading behavior)
1010+ - "lazy" (defer loading until needed)
1111+ - "eager" (load immediately)
1212+1313+ Values are case-insensitive after ASCII lowercasing.
1414+1515+ Examples:
1616+ - ""
1717+ - "lazy"
1818+ - "eager" *)
1919+module Loading : Datatype.S
2020+2121+(** List of all datatypes defined in this module *)
2222+val datatypes : Datatype.t list
+144
lib/html5_checker/datatype/dt_media_query.ml
···11+(** Media query validation - simplified implementation *)
22+33+(** Media types *)
44+let media_types =
55+ [
66+ "all";
77+ "screen";
88+ "print";
99+ "speech";
1010+ "aural";
1111+ "braille";
1212+ "handheld";
1313+ "projection";
1414+ "tty";
1515+ "tv";
1616+ "embossed";
1717+ ]
1818+1919+(** Media query keywords *)
2020+let media_keywords = [ "and"; "or"; "not"; "only" ]
2121+2222+(** Check if character is whitespace *)
2323+let is_whitespace c = c = ' ' || c = '\t' || c = '\n' || c = '\r'
2424+2525+(** Check if character can start an identifier *)
2626+let is_ident_start c =
2727+ (c >= 'a' && c <= 'z')
2828+ || (c >= 'A' && c <= 'Z')
2929+ || c = '_' || c = '-' || Char.code c >= 128
3030+3131+(** Check if character can be in an identifier *)
3232+let is_ident_char c =
3333+ is_ident_start c || (c >= '0' && c <= '9')
3434+3535+(** Check balanced parentheses *)
3636+let check_balanced_parens s =
3737+ let rec check depth i =
3838+ if i >= String.length s then
3939+ if depth = 0 then Ok ()
4040+ else Error "Unbalanced parentheses: unclosed '('"
4141+ else
4242+ let c = s.[i] in
4343+ match c with
4444+ | '(' -> check (depth + 1) (i + 1)
4545+ | ')' ->
4646+ if depth = 0 then Error "Unbalanced parentheses: unexpected ')'"
4747+ else check (depth - 1) (i + 1)
4848+ | _ -> check depth (i + 1)
4949+ in
5050+ check 0 0
5151+5252+(** Extract words (identifiers and keywords) from media query *)
5353+let extract_words s =
5454+ let words = ref [] in
5555+ let buf = Buffer.create 16 in
5656+ let in_parens = ref 0 in
5757+5858+ for i = 0 to String.length s - 1 do
5959+ let c = s.[i] in
6060+ match c with
6161+ | '(' ->
6262+ if Buffer.length buf > 0 then (
6363+ words := Buffer.contents buf :: !words;
6464+ Buffer.clear buf);
6565+ incr in_parens
6666+ | ')' ->
6767+ if Buffer.length buf > 0 then (
6868+ words := Buffer.contents buf :: !words;
6969+ Buffer.clear buf);
7070+ decr in_parens
7171+ | _ ->
7272+ if !in_parens = 0 then
7373+ if is_ident_char c then Buffer.add_char buf c
7474+ else if is_whitespace c then
7575+ if Buffer.length buf > 0 then (
7676+ words := Buffer.contents buf :: !words;
7777+ Buffer.clear buf)
7878+ else ()
7979+ else if Buffer.length buf > 0 then (
8080+ words := Buffer.contents buf :: !words;
8181+ Buffer.clear buf)
8282+ done;
8383+8484+ if Buffer.length buf > 0 then words := Buffer.contents buf :: !words;
8585+ List.rev !words
8686+8787+(** Validate media query structure *)
8888+let validate_media_query s =
8989+ let s = String.trim s in
9090+ if String.length s = 0 then Error "Media query must not be empty"
9191+ else
9292+ (* Check balanced parentheses *)
9393+ match check_balanced_parens s with
9494+ | Error _ as e -> e
9595+ | Ok () ->
9696+ (* Extract and validate words *)
9797+ let words = extract_words s in
9898+ let words_lower = List.map String.lowercase_ascii words in
9999+100100+ (* Basic validation: check for invalid keyword combinations *)
101101+ let rec validate_words prev = function
102102+ | [] -> Ok ()
103103+ | word :: rest -> (
104104+ let word_lower = String.lowercase_ascii word in
105105+ match (prev, word_lower) with
106106+ | None, "and" | None, "or" ->
107107+ Error
108108+ (Printf.sprintf
109109+ "Media query cannot start with keyword '%s'" word)
110110+ | Some "and", "and" | Some "or", "or" | Some "not", "not" ->
111111+ Error
112112+ (Printf.sprintf "Consecutive '%s' keywords are not allowed"
113113+ word)
114114+ | Some "only", "only" ->
115115+ Error "Consecutive 'only' keywords are not allowed"
116116+ | _, _ -> validate_words (Some word_lower) rest)
117117+ in
118118+119119+ (* Check if query contains valid media types or features *)
120120+ let has_media_type =
121121+ List.exists
122122+ (fun w -> List.mem (String.lowercase_ascii w) media_types)
123123+ words
124124+ in
125125+ let has_features = String.contains s '(' in
126126+127127+ if not (has_media_type || has_features) then
128128+ (* Only keywords, no actual media type or features *)
129129+ if List.for_all (fun w -> List.mem w media_keywords) words_lower then
130130+ Error "Media query contains only keywords without media type or features"
131131+ else Ok () (* Assume other identifiers are valid *)
132132+ else validate_words None words
133133+134134+module Media_query = struct
135135+ let name = "media query"
136136+ let validate = validate_media_query
137137+138138+ let is_valid s =
139139+ match validate s with
140140+ | Ok () -> true
141141+ | Error _ -> false
142142+end
143143+144144+let datatypes = [ (module Media_query : Datatype.S) ]
+28
lib/html5_checker/datatype/dt_media_query.mli
···11+(** Media query datatype validator.
22+33+ This module provides a validator for CSS media queries as used in HTML5. *)
44+55+(** Media query validator.
66+77+ Validates CSS media queries used in media attributes and CSS @media rules.
88+99+ Examples:
1010+ - "screen"
1111+ - "print"
1212+ - "(min-width: 600px)"
1313+ - "screen and (color)"
1414+ - "not screen and (color)"
1515+ - "(min-width: 600px) and (max-width: 800px)"
1616+1717+ This is a simplified validator that checks:
1818+ - Balanced parentheses
1919+ - Basic media type keywords (all, screen, print, etc.)
2020+ - Basic logical operators (and, or, not, only)
2121+ - Valid feature queries in parentheses
2222+2323+ Note: This does not perform full CSS media query parsing. For production
2424+ use, consider integrating with a full CSS parser. *)
2525+module Media_query : Datatype.S
2626+2727+(** List of all datatypes defined in this module *)
2828+val datatypes : Datatype.t list
+210
lib/html5_checker/datatype/dt_mime.ml
···11+(** MIME type validation based on RFC 2045 and HTML5 spec *)
22+33+(** Check if character is whitespace *)
44+let is_whitespace c = c = ' ' || c = '\t' || c = '\n' || c = '\r'
55+66+(** Check if character is a token character (RFC 2045) *)
77+let is_token_char c =
88+ (c >= '\033' && c <= '\126')
99+ && not
1010+ (c = '(' || c = ')' || c = '<' || c = '>' || c = '@' || c = ','
1111+ || c = ';' || c = ':' || c = '\\' || c = '"' || c = '/' || c = '['
1212+ || c = ']' || c = '?' || c = '=' || c = '{' || c = '}')
1313+1414+(** Check if character is valid in quoted string (qdtext) *)
1515+let is_qdtext_char c =
1616+ (c >= ' ' && c <= '\126') || c = '\n' || c = '\r' || c = '\t'
1717+1818+(** States for MIME type parser *)
1919+type parse_state =
2020+ | At_start
2121+ | In_supertype
2222+ | At_subtype_start
2323+ | In_subtype
2424+ | Semicolon_seen
2525+ | Ws_before_semicolon
2626+ | In_param_name
2727+ | Equals_seen
2828+ | In_quoted_string
2929+ | In_unquoted_string
3030+ | In_quoted_pair
3131+ | Close_quote_seen
3232+3333+(** JavaScript MIME types that should not have parameters *)
3434+let javascript_mime_types =
3535+ [
3636+ "application/ecmascript";
3737+ "application/javascript";
3838+ "application/x-ecmascript";
3939+ "application/x-javascript";
4040+ "text/ecmascript";
4141+ "text/javascript";
4242+ "text/javascript1.0";
4343+ "text/javascript1.1";
4444+ "text/javascript1.2";
4545+ "text/javascript1.3";
4646+ "text/javascript1.4";
4747+ "text/javascript1.5";
4848+ "text/jscript";
4949+ "text/livescript";
5050+ "text/x-ecmascript";
5151+ "text/x-javascript";
5252+ ]
5353+5454+(** Validate a single MIME type *)
5555+let validate_mime_type s =
5656+ let len = String.length s in
5757+ let rec parse state i =
5858+ if i >= len then
5959+ (* End of string - check final state *)
6060+ match state with
6161+ | In_subtype | In_unquoted_string | Close_quote_seen -> Ok ()
6262+ | At_start -> Error "Expected a MIME type but saw the empty string"
6363+ | In_supertype | At_subtype_start -> Error "Subtype missing"
6464+ | Equals_seen | In_param_name -> Error "Parameter value missing"
6565+ | In_quoted_pair | In_quoted_string -> Error "Unfinished quoted string"
6666+ | Semicolon_seen ->
6767+ Error "Semicolon seen but there was no parameter following it"
6868+ | Ws_before_semicolon -> Error "Extraneous trailing whitespace"
6969+ else
7070+ let c = s.[i] in
7171+ match state with
7272+ | At_start ->
7373+ if is_token_char c then parse In_supertype (i + 1)
7474+ else
7575+ Error
7676+ (Printf.sprintf
7777+ "Expected a token character but saw '%c' instead" c)
7878+ | In_supertype ->
7979+ if is_token_char c then parse In_supertype (i + 1)
8080+ else if c = '/' then parse At_subtype_start (i + 1)
8181+ else
8282+ Error
8383+ (Printf.sprintf
8484+ "Expected a token character or '/' but saw '%c' instead" c)
8585+ | At_subtype_start ->
8686+ if is_token_char c then parse In_subtype (i + 1)
8787+ else
8888+ Error
8989+ (Printf.sprintf
9090+ "Expected a token character but saw '%c' instead" c)
9191+ | In_subtype ->
9292+ if is_token_char c then parse In_subtype (i + 1)
9393+ else if c = ';' then
9494+ (* Check if this is a JavaScript MIME type *)
9595+ let mime_type = String.sub s 0 i |> String.lowercase_ascii in
9696+ if List.mem mime_type javascript_mime_types then
9797+ Error
9898+ "A JavaScript MIME type must not contain any characters after \
9999+ the subtype"
100100+ else parse Semicolon_seen (i + 1)
101101+ else if is_whitespace c then parse Ws_before_semicolon (i + 1)
102102+ else
103103+ Error
104104+ (Printf.sprintf
105105+ "Expected a token character, whitespace or a semicolon but saw \
106106+ '%c' instead"
107107+ c)
108108+ | Ws_before_semicolon ->
109109+ if is_whitespace c then parse Ws_before_semicolon (i + 1)
110110+ else if c = ';' then parse Semicolon_seen (i + 1)
111111+ else
112112+ Error
113113+ (Printf.sprintf
114114+ "Expected whitespace or a semicolon but saw '%c' instead" c)
115115+ | Semicolon_seen ->
116116+ if is_whitespace c then parse Semicolon_seen (i + 1)
117117+ else if is_token_char c then parse In_param_name (i + 1)
118118+ else
119119+ Error
120120+ (Printf.sprintf
121121+ "Expected whitespace or a token character but saw '%c' instead"
122122+ c)
123123+ | In_param_name ->
124124+ if is_token_char c then parse In_param_name (i + 1)
125125+ else if c = '=' then parse Equals_seen (i + 1)
126126+ else
127127+ Error
128128+ (Printf.sprintf "Expected a token character or '=' but saw '%c' instead"
129129+ c)
130130+ | Equals_seen ->
131131+ if c = '"' then parse In_quoted_string (i + 1)
132132+ else if is_token_char c then parse In_unquoted_string (i + 1)
133133+ else
134134+ Error
135135+ (Printf.sprintf
136136+ "Expected a double quote or a token character but saw '%c' \
137137+ instead"
138138+ c)
139139+ | In_quoted_string ->
140140+ if c = '\\' then parse In_quoted_pair (i + 1)
141141+ else if c = '"' then parse Close_quote_seen (i + 1)
142142+ else if is_qdtext_char c then parse In_quoted_string (i + 1)
143143+ else
144144+ Error
145145+ (Printf.sprintf
146146+ "Expected a non-control ASCII character but saw '%c' instead" c)
147147+ | In_quoted_pair ->
148148+ if Char.code c <= 127 then parse In_quoted_string (i + 1)
149149+ else
150150+ Error
151151+ (Printf.sprintf "Expected an ASCII character but saw '%c' instead"
152152+ c)
153153+ | Close_quote_seen ->
154154+ if c = ';' then parse Semicolon_seen (i + 1)
155155+ else if is_whitespace c then parse Ws_before_semicolon (i + 1)
156156+ else
157157+ Error
158158+ (Printf.sprintf
159159+ "Expected a semicolon or whitespace but saw '%c' instead" c)
160160+ | In_unquoted_string ->
161161+ if is_token_char c then parse In_unquoted_string (i + 1)
162162+ else if c = ';' then parse Semicolon_seen (i + 1)
163163+ else if is_whitespace c then parse Ws_before_semicolon (i + 1)
164164+ else
165165+ Error
166166+ (Printf.sprintf
167167+ "Expected a token character, whitespace or a semicolon but saw \
168168+ '%c' instead"
169169+ c)
170170+ in
171171+ parse At_start 0
172172+173173+module Mime_type = struct
174174+ let name = "MIME type"
175175+ let validate = validate_mime_type
176176+177177+ let is_valid s =
178178+ match validate s with
179179+ | Ok () -> true
180180+ | Error _ -> false
181181+end
182182+183183+module Mime_type_list = struct
184184+ let name = "MIME type list"
185185+186186+ let validate s =
187187+ let s = String.trim s in
188188+ if String.length s = 0 then Error "MIME type list must not be empty"
189189+ else
190190+ (* Split on commas and validate each MIME type *)
191191+ let mime_types = String.split_on_char ',' s in
192192+ let rec check_all = function
193193+ | [] -> Ok ()
194194+ | mime :: rest -> (
195195+ let mime = String.trim mime in
196196+ match validate_mime_type mime with
197197+ | Ok () -> check_all rest
198198+ | Error msg ->
199199+ Error (Printf.sprintf "Invalid MIME type in list: %s" msg))
200200+ in
201201+ check_all mime_types
202202+203203+ let is_valid s =
204204+ match validate s with
205205+ | Ok () -> true
206206+ | Error _ -> false
207207+end
208208+209209+let datatypes =
210210+ [ (module Mime_type : Datatype.S); (module Mime_type_list : Datatype.S) ]
+32
lib/html5_checker/datatype/dt_mime.mli
···11+(** MIME type datatype validators.
22+33+ This module provides validators for MIME types (media types) as defined
44+ by RFC 2045 and used in HTML5. *)
55+66+(** MIME type validator.
77+88+ Validates a MIME type in the format: type/subtype[; parameters]
99+1010+ Examples:
1111+ - text/html
1212+ - application/json
1313+ - image/png
1414+ - text/html; charset=utf-8
1515+1616+ Validation rules:
1717+ - Must have a supertype (before /) and subtype (after /)
1818+ - Supertype and subtype must be token characters
1919+ - Optional semicolon-separated parameters
2020+ - Parameters must be name=value pairs
2121+ - Values can be quoted strings or tokens *)
2222+module Mime_type : Datatype.S
2323+2424+(** MIME type list validator.
2525+2626+ Validates a comma-separated list of MIME types.
2727+ Each MIME type in the list must be valid according to {!Mime_type} rules.
2828+ This is used for the 'accept' attribute on input elements. *)
2929+module Mime_type_list : Datatype.S
3030+3131+(** List of all datatypes defined in this module *)
3232+val datatypes : Datatype.t list
+20
lib/html5_checker/datatype/dt_popover.ml
···11+(** Popover attribute validation for HTML5 *)
22+33+module Popover = struct
44+ let name = "popover"
55+66+ let validate s =
77+ let s_lower = Datatype.string_to_ascii_lowercase s in
88+ match s_lower with
99+ | "" | "auto" | "manual" -> Ok ()
1010+ | _ ->
1111+ Error
1212+ (Printf.sprintf
1313+ "The value '%s' is not a valid popover value. Expected 'auto', \
1414+ 'manual', or empty string."
1515+ s)
1616+1717+ let is_valid s = Result.is_ok (validate s)
1818+end
1919+2020+let datatypes = [ (module Popover : Datatype.S) ]
+22
lib/html5_checker/datatype/dt_popover.mli
···11+(** Popover attribute datatype validator for HTML5.
22+33+ This module provides a validator for the popover attribute, as defined by
44+ the HTML5 specification. *)
55+66+(** Popover attribute validator.
77+88+ Validates popover attribute values which can be:
99+ - "auto" - the popover can be light-dismissed (closed by clicking outside)
1010+ - "manual" - the popover must be explicitly closed
1111+ - "" (empty string) - equivalent to "auto"
1212+1313+ Values are case-insensitive.
1414+1515+ Examples:
1616+ - "auto"
1717+ - "manual"
1818+ - "" *)
1919+module Popover : Datatype.S
2020+2121+(** List of all datatypes defined in this module *)
2222+val datatypes : Datatype.t list
+22
lib/html5_checker/datatype/dt_preload.ml
···11+(** Media preload attribute validation based on HTML5 spec *)
22+33+(** Valid preload values *)
44+let valid_preloads = [ "none"; "metadata"; "auto"; "" ]
55+66+module Preload = struct
77+ let name = "preload"
88+99+ let validate s =
1010+ let s_lower = Datatype.string_to_ascii_lowercase s in
1111+ if List.mem s_lower valid_preloads then Ok ()
1212+ else
1313+ Error
1414+ (Printf.sprintf
1515+ "The value '%s' is not a valid preload value. Expected one of: \
1616+ 'none', 'metadata', 'auto', or empty string."
1717+ s)
1818+1919+ let is_valid s = Result.is_ok (validate s)
2020+end
2121+2222+let datatypes = [ (module Preload : Datatype.S) ]
+24
lib/html5_checker/datatype/dt_preload.mli
···11+(** Media preload attribute datatype validator.
22+33+ This module provides a validator for the preload attribute used on
44+ audio and video elements, as defined by the HTML5 specification. *)
55+66+(** Media preload attribute validator.
77+88+ Validates media preload attribute values which can be:
99+ - none - No preloading (only load metadata when user starts playback)
1010+ - metadata - Preload metadata only (dimensions, duration, etc.)
1111+ - auto - Preload the entire resource if possible
1212+ - "" (empty string) - Equivalent to auto
1313+1414+ Values are matched case-insensitively according to HTML5 spec.
1515+1616+ Examples:
1717+ - "none"
1818+ - "metadata"
1919+ - "auto"
2020+ - "" *)
2121+module Preload : Datatype.S
2222+2323+(** List of all datatypes defined in this module *)
2424+val datatypes : Datatype.t list
+32
lib/html5_checker/datatype/dt_referrer.ml
···11+(** Referrer policy attribute validation for HTML5 *)
22+33+module Referrer_policy = struct
44+ let name = "referrerpolicy"
55+66+ let validate s =
77+ let s_lower = Datatype.string_to_ascii_lowercase s in
88+ match s_lower with
99+ | ""
1010+ | "no-referrer"
1111+ | "no-referrer-when-downgrade"
1212+ | "origin"
1313+ | "origin-when-cross-origin"
1414+ | "same-origin"
1515+ | "strict-origin"
1616+ | "strict-origin-when-cross-origin"
1717+ | "unsafe-url" ->
1818+ Ok ()
1919+ | _ ->
2020+ Error
2121+ (Printf.sprintf
2222+ "The value '%s' is not a valid referrerpolicy value. Expected \
2323+ one of: empty string, 'no-referrer', \
2424+ 'no-referrer-when-downgrade', 'origin', \
2525+ 'origin-when-cross-origin', 'same-origin', 'strict-origin', \
2626+ 'strict-origin-when-cross-origin', or 'unsafe-url'."
2727+ s)
2828+2929+ let is_valid s = Result.is_ok (validate s)
3030+end
3131+3232+let datatypes = [ (module Referrer_policy : Datatype.S) ]
+29
lib/html5_checker/datatype/dt_referrer.mli
···11+(** Referrer policy attribute datatype validator.
22+33+ This module provides a validator for the referrerpolicy attribute used to
44+ control referrer information sent with requests, as defined by the HTML5 spec. *)
55+66+(** Referrer policy attribute validator.
77+88+ Validates referrerpolicy attribute values which can be:
99+ - "" (empty string, uses default policy)
1010+ - "no-referrer" (never send referrer)
1111+ - "no-referrer-when-downgrade" (send referrer to same security level)
1212+ - "origin" (send origin only)
1313+ - "origin-when-cross-origin" (full URL for same-origin, origin for cross-origin)
1414+ - "same-origin" (send referrer for same-origin only)
1515+ - "strict-origin" (send origin for same security level)
1616+ - "strict-origin-when-cross-origin" (full URL same-origin, origin cross-origin same security)
1717+ - "unsafe-url" (always send full URL)
1818+1919+ Values are case-insensitive after ASCII lowercasing.
2020+2121+ Examples:
2222+ - ""
2323+ - "no-referrer"
2424+ - "origin"
2525+ - "strict-origin-when-cross-origin" *)
2626+module Referrer_policy : Datatype.S
2727+2828+(** List of all datatypes defined in this module *)
2929+val datatypes : Datatype.t list
+53
lib/html5_checker/datatype/dt_sandbox.ml
···11+(** Sandbox tokens validation *)
22+33+(** Valid sandbox tokens (case-sensitive) *)
44+let valid_sandbox_tokens =
55+ [
66+ "allow-downloads";
77+ "allow-forms";
88+ "allow-modals";
99+ "allow-orientation-lock";
1010+ "allow-pointer-lock";
1111+ "allow-popups";
1212+ "allow-popups-to-escape-sandbox";
1313+ "allow-presentation";
1414+ "allow-same-origin";
1515+ "allow-scripts";
1616+ "allow-top-navigation";
1717+ "allow-top-navigation-by-user-activation";
1818+ "allow-top-navigation-to-custom-protocols";
1919+ ]
2020+2121+(** Validate sandbox attribute value *)
2222+let validate_sandbox s =
2323+ let trimmed = Datatype.trim_html_spaces s in
2424+ (* Empty value is valid (maximum restrictions) *)
2525+ if trimmed = "" then Ok ()
2626+ else
2727+ (* Split on whitespace *)
2828+ let tokens = String.split_on_char ' ' trimmed in
2929+ let tokens = List.filter (fun t -> Datatype.trim_html_spaces t <> "") tokens in
3030+ if tokens = [] then Ok () (* All whitespace is like empty *)
3131+ else
3232+ (* Validate each token *)
3333+ let rec check_tokens = function
3434+ | [] -> Ok ()
3535+ | token :: rest ->
3636+ (* Sandbox tokens are case-sensitive *)
3737+ if List.mem token valid_sandbox_tokens then check_tokens rest
3838+ else
3939+ Error
4040+ (Printf.sprintf
4141+ "The value '%s' is not a valid sandbox token. Valid tokens \
4242+ are: %s"
4343+ token (String.concat ", " valid_sandbox_tokens))
4444+ in
4545+ check_tokens tokens
4646+4747+module Sandbox = struct
4848+ let name = "sandbox"
4949+ let validate = validate_sandbox
5050+ let is_valid s = Result.is_ok (validate s)
5151+end
5252+5353+let datatypes = [ (module Sandbox : Datatype.S) ]
+37
lib/html5_checker/datatype/dt_sandbox.mli
···11+(** Sandbox tokens validator.
22+33+ This module provides a validator for the sandbox attribute used on iframe
44+ elements, as defined by the HTML5 specification. *)
55+66+(** Sandbox attribute validator.
77+88+ Validates sandbox attribute values which contain space-separated sandbox
99+ tokens. Each token enables a specific capability for the sandboxed iframe.
1010+1111+ Valid tokens:
1212+ - allow-downloads: Allow downloads
1313+ - allow-forms: Allow form submission
1414+ - allow-modals: Allow modal dialogs (alert, confirm, etc.)
1515+ - allow-orientation-lock: Allow orientation lock
1616+ - allow-pointer-lock: Allow pointer lock
1717+ - allow-popups: Allow popups (window.open, target="_blank", etc.)
1818+ - allow-popups-to-escape-sandbox: Allow popups that don't inherit sandboxing
1919+ - allow-presentation: Allow presentation sessions
2020+ - allow-same-origin: Allow same-origin access
2121+ - allow-scripts: Allow script execution
2222+ - allow-top-navigation: Allow navigating top-level browsing context
2323+ - allow-top-navigation-by-user-activation: Allow top navigation with user gesture
2424+ - allow-top-navigation-to-custom-protocols: Allow top navigation to custom protocols
2525+2626+ Examples:
2727+ - "" (empty = maximum restrictions)
2828+ - "allow-scripts"
2929+ - "allow-same-origin allow-scripts"
3030+ - "allow-forms allow-popups allow-scripts"
3131+3232+ Tokens are case-sensitive and must match exactly. Duplicate tokens are
3333+ allowed but redundant. An empty value means maximum sandbox restrictions. *)
3434+module Sandbox : Datatype.S
3535+3636+(** List of all datatypes defined in this module *)
3737+val datatypes : Datatype.t list
+22
lib/html5_checker/datatype/dt_scope.ml
···11+(** Table header scope attribute validation based on HTML5 spec *)
22+33+(** Valid scope values *)
44+let valid_scopes = [ "row"; "col"; "rowgroup"; "colgroup" ]
55+66+module Scope = struct
77+ let name = "scope"
88+99+ let validate s =
1010+ let s_lower = Datatype.string_to_ascii_lowercase s in
1111+ if List.mem s_lower valid_scopes then Ok ()
1212+ else
1313+ Error
1414+ (Printf.sprintf
1515+ "The value '%s' is not a valid scope value. Expected one of: %s."
1616+ s
1717+ (String.concat ", " valid_scopes))
1818+1919+ let is_valid s = Result.is_ok (validate s)
2020+end
2121+2222+let datatypes = [ (module Scope : Datatype.S) ]
+24
lib/html5_checker/datatype/dt_scope.mli
···11+(** Table header scope attribute datatype validator.
22+33+ This module provides a validator for the scope attribute used on
44+ th elements, as defined by the HTML5 specification. *)
55+66+(** Table header scope attribute validator.
77+88+ Validates th scope attribute values which can be:
99+ - row - Header cell applies to some of the subsequent cells in the same row(s)
1010+ - col - Header cell applies to some of the subsequent cells in the same column(s)
1111+ - rowgroup - Header cell applies to all remaining cells in the row group
1212+ - colgroup - Header cell applies to all remaining cells in the column group
1313+1414+ Values are matched case-insensitively according to HTML5 spec.
1515+1616+ Examples:
1717+ - "row"
1818+ - "col"
1919+ - "rowgroup"
2020+ - "colgroup" *)
2121+module Scope : Datatype.S
2222+2323+(** List of all datatypes defined in this module *)
2424+val datatypes : Datatype.t list
+20
lib/html5_checker/datatype/dt_shape.ml
···11+(** Shape attribute validation for HTML5 *)
22+33+module Shape = struct
44+ let name = "shape"
55+66+ let validate s =
77+ let s_lower = Datatype.string_to_ascii_lowercase s in
88+ match s_lower with
99+ | "default" | "rect" | "circle" | "poly" -> Ok ()
1010+ | _ ->
1111+ Error
1212+ (Printf.sprintf
1313+ "The value '%s' is not a valid shape. Expected 'default', 'rect', \
1414+ 'circle', or 'poly'."
1515+ s)
1616+1717+ let is_valid s = Result.is_ok (validate s)
1818+end
1919+2020+let datatypes = [ (module Shape : Datatype.S) ]
+24
lib/html5_checker/datatype/dt_shape.mli
···11+(** Shape attribute datatype validator for HTML5.
22+33+ This module provides a validator for the shape attribute used on area
44+ elements within image maps, as defined by the HTML5 specification. *)
55+66+(** Shape attribute validator.
77+88+ Validates shape attribute values which can be:
99+ - "default" - entire region
1010+ - "rect" - rectangular region
1111+ - "circle" - circular region
1212+ - "poly" - polygonal region
1313+1414+ Values are case-insensitive.
1515+1616+ Examples:
1717+ - "rect"
1818+ - "circle"
1919+ - "poly"
2020+ - "default" *)
2121+module Shape : Datatype.S
2222+2323+(** List of all datatypes defined in this module *)
2424+val datatypes : Datatype.t list
+20
lib/html5_checker/datatype/dt_spellcheck.ml
···11+(** Spellcheck attribute validation for HTML5 *)
22+33+module Spellcheck = struct
44+ let name = "spellcheck"
55+66+ let validate s =
77+ let s_lower = Datatype.string_to_ascii_lowercase s in
88+ match s_lower with
99+ | "" | "true" | "false" -> Ok ()
1010+ | _ ->
1111+ Error
1212+ (Printf.sprintf
1313+ "The value '%s' is not a valid spellcheck value. Expected 'true', \
1414+ 'false', or empty string."
1515+ s)
1616+1717+ let is_valid s = Result.is_ok (validate s)
1818+end
1919+2020+let datatypes = [ (module Spellcheck : Datatype.S) ]
+22
lib/html5_checker/datatype/dt_spellcheck.mli
···11+(** Spellcheck attribute datatype validator for HTML5.
22+33+ This module provides a validator for the spellcheck attribute, as defined by
44+ the HTML5 specification. *)
55+66+(** Spellcheck attribute validator.
77+88+ Validates spellcheck attribute values which can be:
99+ - "true" - spelling and grammar checking is enabled
1010+ - "false" - spelling and grammar checking is disabled
1111+ - "" (empty string) - default behavior (typically inherits from parent)
1212+1313+ Values are case-insensitive.
1414+1515+ Examples:
1616+ - "true"
1717+ - "false"
1818+ - "" *)
1919+module Spellcheck : Datatype.S
2020+2121+(** List of all datatypes defined in this module *)
2222+val datatypes : Datatype.t list
+142
lib/html5_checker/datatype/dt_srcset.ml
···11+(** Image source set and sizes attribute validation *)
22+33+(** Split string on commas, preserving parentheses groups *)
44+let split_on_commas s =
55+ let len = String.length s in
66+ let rec find_splits paren_depth start i acc =
77+ if i >= len then
88+ if start < len then List.rev (String.sub s start (len - start) :: acc)
99+ else List.rev acc
1010+ else
1111+ match s.[i] with
1212+ | '(' -> find_splits (paren_depth + 1) start (i + 1) acc
1313+ | ')' -> find_splits (max 0 (paren_depth - 1)) start (i + 1) acc
1414+ | ',' when paren_depth = 0 ->
1515+ let part = String.sub s start (i - start) in
1616+ find_splits 0 (i + 1) (i + 1) (part :: acc)
1717+ | _ -> find_splits paren_depth start (i + 1) acc
1818+ in
1919+ find_splits 0 0 0 []
2020+2121+(** Parse a descriptor (width or pixel density) *)
2222+let parse_descriptor s =
2323+ let trimmed = Datatype.trim_html_spaces s in
2424+ let len = String.length trimmed in
2525+ if len < 2 then None
2626+ else
2727+ let suffix = trimmed.[len - 1] in
2828+ let num_part = String.sub trimmed 0 (len - 1) in
2929+ match suffix with
3030+ | 'w' ->
3131+ (try
3232+ let n = int_of_string num_part in
3333+ if n > 0 then Some (`Width n) else None
3434+ with _ -> None)
3535+ | 'x' ->
3636+ (try
3737+ let f = float_of_string num_part in
3838+ if f > 0.0 then Some (`Density f) else None
3939+ with _ -> None)
4040+ | _ -> None
4141+4242+(** Validate a single image candidate *)
4343+let validate_image_candidate s =
4444+ let trimmed = Datatype.trim_html_spaces s in
4545+ if trimmed = "" then Error "Image candidate must not be empty"
4646+ else
4747+ (* Split on whitespace to get URL and optional descriptor *)
4848+ let parts = String.split_on_char ' ' trimmed in
4949+ let parts = List.filter (fun p -> Datatype.trim_html_spaces p <> "") parts in
5050+ match parts with
5151+ | [] -> Error "Image candidate must not be empty"
5252+ | [ _url ] -> Ok None (* Just URL, no descriptor *)
5353+ | [ _url; desc ] -> (
5454+ match parse_descriptor desc with
5555+ | Some d -> Ok (Some d)
5656+ | None ->
5757+ Error
5858+ (Printf.sprintf
5959+ "Invalid descriptor '%s'. Must be a positive integer followed \
6060+ by 'w' or a positive number followed by 'x'"
6161+ desc))
6262+ | _ ->
6363+ Error
6464+ "Image candidate must be a URL optionally followed by one descriptor"
6565+6666+(** Validate srcset value *)
6767+let validate_srcset s =
6868+ let trimmed = Datatype.trim_html_spaces s in
6969+ if trimmed = "" then Error "Srcset must not be empty" else
7070+ let candidates = split_on_commas trimmed in
7171+ let candidates = List.filter (fun c -> Datatype.trim_html_spaces c <> "") candidates in
7272+ if candidates = [] then Error "Srcset must contain at least one image candidate"
7373+ else
7474+ (* Validate each candidate and check for descriptor type consistency *)
7575+ let rec check_candidates has_width has_density = function
7676+ | [] -> Ok ()
7777+ | cand :: rest -> (
7878+ match validate_image_candidate cand with
7979+ | Error e -> Error e
8080+ | Ok None -> check_candidates has_width has_density rest
8181+ | Ok (Some (`Width _)) ->
8282+ if has_density then
8383+ Error
8484+ "Cannot mix width descriptors (w) and pixel density \
8585+ descriptors (x) in the same srcset"
8686+ else check_candidates true has_density rest
8787+ | Ok (Some (`Density _)) ->
8888+ if has_width then
8989+ Error
9090+ "Cannot mix width descriptors (w) and pixel density \
9191+ descriptors (x) in the same srcset"
9292+ else check_candidates has_width true rest)
9393+ in
9494+ check_candidates false false candidates
9595+9696+module Srcset = struct
9797+ let name = "srcset"
9898+ let validate = validate_srcset
9999+ let is_valid s = Result.is_ok (validate s)
100100+end
101101+102102+(** Validate sizes attribute *)
103103+let validate_sizes s =
104104+ let trimmed = Datatype.trim_html_spaces s in
105105+ if trimmed = "" then Error "Sizes attribute must not be empty"
106106+ else
107107+ (* Split on commas *)
108108+ let entries = split_on_commas trimmed in
109109+ let entries = List.filter (fun e -> Datatype.trim_html_spaces e <> "") entries in
110110+ if entries = [] then Error "Sizes attribute must contain at least one entry"
111111+ else
112112+ (* Each entry except the last should have a media condition
113113+ The last entry is just a size value
114114+ We do basic validation here *)
115115+ let rec check_entries = function
116116+ | [] -> Ok ()
117117+ | [ _last ] ->
118118+ (* Last entry - just a size value, accept anything non-empty *)
119119+ Ok ()
120120+ | entry :: rest ->
121121+ let entry_trimmed = Datatype.trim_html_spaces entry in
122122+ (* Check if it looks like it has a media condition (starts with '(') *)
123123+ if String.length entry_trimmed = 0 then
124124+ Error "Size entry must not be empty"
125125+ else if entry_trimmed.[0] <> '(' then
126126+ Error
127127+ (Printf.sprintf
128128+ "Size entry '%s' should start with a media condition in \
129129+ parentheses"
130130+ entry_trimmed)
131131+ else check_entries rest
132132+ in
133133+ check_entries entries
134134+135135+module Sizes = struct
136136+ let name = "sizes"
137137+ let validate = validate_sizes
138138+ let is_valid s = Result.is_ok (validate s)
139139+end
140140+141141+let datatypes =
142142+ [ (module Srcset : Datatype.S); (module Sizes : Datatype.S) ]
+44
lib/html5_checker/datatype/dt_srcset.mli
···11+(** Image source set and sizes attribute validators.
22+33+ This module provides validators for srcset and sizes attributes used on
44+ img and source elements, as defined by the HTML5 specification. *)
55+66+(** Srcset attribute validator.
77+88+ Validates srcset attribute values which contain comma-separated image
99+ candidates. Each image candidate consists of:
1010+ - A URL
1111+ - Optional whitespace followed by a width descriptor (e.g., "100w") or
1212+ pixel density descriptor (e.g., "2x")
1313+1414+ Examples:
1515+ - "image.jpg"
1616+ - "image.jpg 1x"
1717+ - "image-320.jpg 320w, image-640.jpg 640w"
1818+ - "image-1x.jpg 1x, image-2x.jpg 2x"
1919+2020+ Width descriptors must be positive integers followed by 'w'.
2121+ Pixel density descriptors must be positive numbers followed by 'x'.
2222+ Cannot mix width and density descriptors in the same srcset. *)
2323+module Srcset : Datatype.S
2424+2525+(** Sizes attribute validator.
2626+2727+ Validates sizes attribute values which contain comma-separated source size
2828+ entries. Each entry (except the last) consists of:
2929+ - A media condition
3030+ - Whitespace
3131+ - A source size value (length or "auto")
3232+3333+ The last entry is just a source size value (without media condition).
3434+3535+ Examples:
3636+ - "100vw"
3737+ - "(max-width: 600px) 100vw, 50vw"
3838+ - "(min-width: 800px) 800px, 100vw"
3939+4040+ This validator performs basic syntax checking. *)
4141+module Sizes : Datatype.S
4242+4343+(** List of all datatypes defined in this module *)
4444+val datatypes : Datatype.t list
+42
lib/html5_checker/datatype/dt_target.ml
···11+(** Browsing context and target attribute validation *)
22+33+(** Valid special target keywords (case-insensitive) *)
44+let special_keywords = [ "_blank"; "_self"; "_parent"; "_top" ]
55+66+(** Validate a browsing context name *)
77+let validate_browsing_context s =
88+ if String.length s = 0 then Error "Browsing context name must not be empty"
99+ else if s.[0] = '_' then
1010+ (* If starts with underscore, must be a special keyword *)
1111+ let lower = Datatype.string_to_ascii_lowercase s in
1212+ if List.mem lower special_keywords then Ok ()
1313+ else
1414+ Error
1515+ (Printf.sprintf
1616+ "Browsing context name '%s' starts with underscore but is not one \
1717+ of the special keywords: %s"
1818+ s (String.concat ", " special_keywords))
1919+ else Ok ()
2020+2121+module Browsing_context = struct
2222+ let name = "browsing-context"
2323+ let validate = validate_browsing_context
2424+ let is_valid s = Result.is_ok (validate s)
2525+end
2626+2727+(** Validate a target attribute value
2828+ (For now, this is the same as browsing context validation) *)
2929+let validate_target s =
3030+ if String.length s = 0 then Error "Target attribute must not be empty"
3131+ else validate_browsing_context s
3232+3333+module Target = struct
3434+ let name = "target"
3535+ let validate = validate_target
3636+ let is_valid s = Result.is_ok (validate s)
3737+end
3838+3939+let datatypes =
4040+ [
4141+ (module Target : Datatype.S); (module Browsing_context : Datatype.S);
4242+ ]
+50
lib/html5_checker/datatype/dt_target.mli
···11+(** Browsing context and target attribute validators.
22+33+ This module provides validators for browsing context names and target
44+ attributes used on a, area, base, and form elements, as defined by the
55+ HTML5 specification. *)
66+77+(** Target attribute validator.
88+99+ Validates target attribute values which specify where to display linked
1010+ content or form responses. Valid values include:
1111+1212+ Special keywords (case-insensitive):
1313+ - _blank: New window or tab
1414+ - _self: Same frame (default)
1515+ - _parent: Parent frame
1616+ - _top: Top-level window
1717+1818+ Or a valid browsing context name:
1919+ - Non-empty string
2020+ - If starts with underscore, must be one of the special keywords above
2121+2222+ Examples:
2323+ - "_blank"
2424+ - "_self"
2525+ - "myframe"
2626+ - "content-frame" *)
2727+module Target : Datatype.S
2828+2929+(** Browsing context name validator.
3030+3131+ Validates browsing context names used to identify frames and windows.
3232+ A valid browsing context name is:
3333+ - A non-empty string
3434+ - If it starts with an underscore (_), it must be one of the special
3535+ keywords: _blank, _self, _parent, or _top (case-insensitive)
3636+ - Otherwise, any non-empty string is valid
3737+3838+ Examples:
3939+ - "myframe"
4040+ - "content"
4141+ - "navigation-frame"
4242+ - "_blank" (special keyword)
4343+4444+ Invalid examples:
4545+ - "" (empty string)
4646+ - "_custom" (underscore prefix but not a special keyword) *)
4747+module Browsing_context : Datatype.S
4848+4949+(** List of all datatypes defined in this module *)
5050+val datatypes : Datatype.t list
+20
lib/html5_checker/datatype/dt_translate.ml
···11+(** Translate attribute validation for HTML5 *)
22+33+module Translate = struct
44+ let name = "translate"
55+66+ let validate s =
77+ let s_lower = Datatype.string_to_ascii_lowercase s in
88+ match s_lower with
99+ | "" | "yes" | "no" -> Ok ()
1010+ | _ ->
1111+ Error
1212+ (Printf.sprintf
1313+ "The value '%s' is not a valid translate value. Expected 'yes', \
1414+ 'no', or empty string."
1515+ s)
1616+1717+ let is_valid s = Result.is_ok (validate s)
1818+end
1919+2020+let datatypes = [ (module Translate : Datatype.S) ]
+22
lib/html5_checker/datatype/dt_translate.mli
···11+(** Translate attribute datatype validator for HTML5.
22+33+ This module provides a validator for the translate attribute, as defined by
44+ the HTML5 specification. *)
55+66+(** Translate attribute validator.
77+88+ Validates translate attribute values which can be:
99+ - "yes" - the element should be translated
1010+ - "no" - the element should not be translated
1111+ - "" (empty string) - equivalent to "yes"
1212+1313+ Values are case-insensitive.
1414+1515+ Examples:
1616+ - "yes"
1717+ - "no"
1818+ - "" *)
1919+module Translate : Datatype.S
2020+2121+(** List of all datatypes defined in this module *)
2222+val datatypes : Datatype.t list
+124
lib/html5_checker/datatype/dt_url.ml
···11+(** URL and IRI datatype validators for HTML5. *)
22+33+(** Check if a character is valid in a URL scheme name.
44+ Scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." ) *)
55+let is_scheme_char_initial = function 'a' .. 'z' | 'A' .. 'Z' -> true | _ -> false
66+77+let is_scheme_char_subsequent = function
88+ | 'a' .. 'z' | 'A' .. 'Z' | '0' .. '9' | '+' | '-' | '.' -> true
99+ | _ -> false
1010+1111+(** Split a URL into scheme and remainder.
1212+ Returns Some (scheme, rest) if a valid scheme is found, None otherwise.
1313+ Skips leading HTML whitespace before checking for scheme. *)
1414+let split_scheme s =
1515+ let len = String.length s in
1616+ (* Skip leading HTML whitespace *)
1717+ let rec skip_whitespace i =
1818+ if i >= len then len
1919+ else if Datatype.is_whitespace s.[i] then skip_whitespace (i + 1)
2020+ else i
2121+ in
2222+ let start = skip_whitespace 0 in
2323+ if start >= len then None
2424+ else if not (is_scheme_char_initial s.[start]) then None
2525+ else
2626+ (* Look for scheme *)
2727+ let rec find_colon i =
2828+ if i >= len then None
2929+ else
3030+ match s.[i] with
3131+ | ':' ->
3232+ let scheme =
3333+ String.sub s start (i - start) |> Datatype.string_to_ascii_lowercase
3434+ in
3535+ let rest = String.sub s (i + 1) (len - i - 1) in
3636+ Some (scheme, rest)
3737+ | c when is_scheme_char_subsequent c -> find_colon (i + 1)
3838+ | _ -> None
3939+ in
4040+ find_colon (start + 1)
4141+4242+(** Check if a scheme is well-known (http, https, ftp, mailto, file) *)
4343+let is_well_known_scheme = function
4444+ | "http" | "https" | "ftp" | "mailto" | "file" -> true
4545+ | _ -> false
4646+4747+module Url : Datatype.S = struct
4848+ let name = "URL"
4949+5050+ let validate s =
5151+ let trimmed = Datatype.trim_html_spaces s in
5252+ if trimmed = "" then Error "Must be non-empty."
5353+ else
5454+ (* Basic validation - check for control characters *)
5555+ let len = String.length s in
5656+ let rec check_chars i =
5757+ if i >= len then Ok ()
5858+ else
5959+ match s.[i] with
6060+ | '\x00' .. '\x1F' | '\x7F' ->
6161+ Error "URLs must not contain control characters."
6262+ | _ -> check_chars (i + 1)
6363+ in
6464+ check_chars 0
6565+6666+ let is_valid s = Result.is_ok (validate s)
6767+end
6868+6969+module Url_potentially_empty : Datatype.S = struct
7070+ let name = "URL (potentially empty)"
7171+7272+ let validate s =
7373+ let trimmed = Datatype.trim_html_spaces s in
7474+ if trimmed = "" then Ok ()
7575+ else
7676+ (* Use same validation as Url for non-empty values *)
7777+ Url.validate s
7878+7979+ let is_valid s = Result.is_ok (validate s)
8080+end
8181+8282+module Url_absolute : Datatype.S = struct
8383+ let name = "absolute URL"
8484+8585+ let validate s =
8686+ let trimmed = Datatype.trim_html_spaces s in
8787+ if trimmed = "" then Error "Must be non-empty."
8888+ else
8989+ match split_scheme s with
9090+ | None ->
9191+ Error (Printf.sprintf "The string \"%s\" is not an absolute URL." s)
9292+ | Some (scheme, _rest) ->
9393+ if is_well_known_scheme scheme || String.length scheme > 0 then
9494+ (* For well-known schemes, we could do more validation,
9595+ but for now we just check that it has a scheme *)
9696+ Ok ()
9797+ else Error "The string is not an absolute URL."
9898+9999+ let is_valid s = Result.is_ok (validate s)
100100+end
101101+102102+module Iri : Datatype.S = struct
103103+ let name = "absolute URL"
104104+105105+ (* IRI validation is the same as absolute URL validation *)
106106+ let validate = Url_absolute.validate
107107+ let is_valid s = Result.is_ok (validate s)
108108+end
109109+110110+module Iri_ref : Datatype.S = struct
111111+ let name = "URL"
112112+113113+ (* IRI reference validation is the same as URL validation *)
114114+ let validate = Url.validate
115115+ let is_valid s = Result.is_ok (validate s)
116116+end
117117+118118+let datatypes =
119119+ [ (module Url : Datatype.S)
120120+ ; (module Url_potentially_empty : Datatype.S)
121121+ ; (module Url_absolute : Datatype.S)
122122+ ; (module Iri : Datatype.S)
123123+ ; (module Iri_ref : Datatype.S)
124124+ ]
+48
lib/html5_checker/datatype/dt_url.mli
···11+(** URL and IRI datatype validators for HTML5.
22+33+ This module provides validators for URLs and Internationalized Resource
44+ Identifiers (IRIs) based on the Nu HTML Checker's implementation.
55+66+ The validators perform basic structural validation. Full URL parsing and
77+ validation according to WHATWG URL spec would require a complete URL parser. *)
88+99+(** URL validator (IriRef in Nu validator).
1010+1111+ Validates URL references which can be either absolute or relative URLs.
1212+ Basic validation ensures:
1313+ - Non-empty after trimming HTML whitespace
1414+ - No control characters
1515+ - Basic structural correctness
1616+1717+ This corresponds to the general URL/IRI reference type. *)
1818+module Url : Datatype.S
1919+2020+(** URL validator that allows empty values.
2121+2222+ Same as Url but permits empty strings. This is used for optional URL
2323+ attributes. *)
2424+module Url_potentially_empty : Datatype.S
2525+2626+(** Absolute URL validator (Iri in Nu validator).
2727+2828+ Validates that a URL is absolute (has a scheme). An absolute URL must:
2929+ - Start with a valid scheme (e.g., http:, https:, ftp:, mailto:)
3030+ - Not be empty
3131+ - Follow URL structure rules
3232+3333+ Scheme format: ALPHA *( ALPHA / DIGIT / "+" / "-" / "." ) *)
3434+module Url_absolute : Datatype.S
3535+3636+(** IRI (Internationalized Resource Identifier) validator.
3737+3838+ Alias for Url_absolute. IRIs are the internationalized version of URIs,
3939+ allowing Unicode characters. *)
4040+module Iri : Datatype.S
4141+4242+(** IRI reference validator.
4343+4444+ Alias for Url. IRI references can be relative or absolute. *)
4545+module Iri_ref : Datatype.S
4646+4747+(** List of all URL/IRI-related datatypes for registration. *)
4848+val datatypes : Datatype.t list
+22
lib/html5_checker/datatype/dt_wrap.ml
···11+(** Textarea wrap attribute validation based on HTML5 spec *)
22+33+(** Valid wrap values *)
44+let valid_wraps = [ "soft"; "hard" ]
55+66+module Wrap = struct
77+ let name = "wrap"
88+99+ let validate s =
1010+ let s_lower = Datatype.string_to_ascii_lowercase s in
1111+ if List.mem s_lower valid_wraps then Ok ()
1212+ else
1313+ Error
1414+ (Printf.sprintf
1515+ "The value '%s' is not a valid wrap value. Expected one of: %s."
1616+ s
1717+ (String.concat ", " valid_wraps))
1818+1919+ let is_valid s = Result.is_ok (validate s)
2020+end
2121+2222+let datatypes = [ (module Wrap : Datatype.S) ]
+20
lib/html5_checker/datatype/dt_wrap.mli
···11+(** Textarea wrap attribute datatype validator.
22+33+ This module provides a validator for the wrap attribute used on
44+ textarea elements, as defined by the HTML5 specification. *)
55+66+(** Textarea wrap attribute validator.
77+88+ Validates textarea wrap attribute values which can be:
99+ - soft - Soft wrapping (line breaks not submitted, default)
1010+ - hard - Hard wrapping (line breaks are submitted)
1111+1212+ Values are matched case-insensitively according to HTML5 spec.
1313+1414+ Examples:
1515+ - "soft"
1616+ - "hard" *)
1717+module Wrap : Datatype.S
1818+1919+(** List of all datatypes defined in this module *)
2020+val datatypes : Datatype.t list
+94
lib/html5_checker/dom_walker.ml
···11+(** DOM tree traversal for HTML5 conformance checking. *)
22+33+(** Package a checker with its state for traversal. *)
44+type checker_state = {
55+ start_element :
66+ name:string ->
77+ namespace:string option ->
88+ attrs:(string * string) list ->
99+ Message_collector.t ->
1010+ unit;
1111+ end_element :
1212+ name:string -> namespace:string option -> Message_collector.t -> unit;
1313+ characters : string -> Message_collector.t -> unit;
1414+ end_document : Message_collector.t -> unit;
1515+}
1616+1717+(** Create a checker state package from a first-class module. *)
1818+let make_checker_state (module C : Checker.S) =
1919+ let state = C.create () in
2020+ {
2121+ start_element = (fun ~name ~namespace ~attrs collector ->
2222+ C.start_element state ~name ~namespace ~attrs collector);
2323+ end_element = (fun ~name ~namespace collector ->
2424+ C.end_element state ~name ~namespace collector);
2525+ characters = (fun text collector ->
2626+ C.characters state text collector);
2727+ end_document = (fun collector ->
2828+ C.end_document state collector);
2929+ }
3030+3131+(** Walk a DOM node with a single checker state. *)
3232+let rec walk_node_single cs collector node =
3333+ let open Html5rw.Dom in
3434+ match node.name with
3535+ | "#text" ->
3636+ (* Text node: emit characters event *)
3737+ cs.characters node.data collector
3838+ | "#comment" ->
3939+ (* Comment node: emit characters event with comment text *)
4040+ cs.characters node.data collector
4141+ | "#document" | "#document-fragment" ->
4242+ (* Document/fragment nodes: just traverse children *)
4343+ List.iter (walk_node_single cs collector) node.children
4444+ | "!doctype" ->
4545+ (* Doctype node: skip (no validation events for doctype) *)
4646+ ()
4747+ | _ ->
4848+ (* Element node: emit start, traverse children, emit end *)
4949+ cs.start_element ~name:node.name ~namespace:node.namespace ~attrs:node.attrs collector;
5050+ List.iter (walk_node_single cs collector) node.children;
5151+ cs.end_element ~name:node.name ~namespace:node.namespace collector
5252+5353+let walk checker collector node =
5454+ let cs = make_checker_state checker in
5555+ walk_node_single cs collector node;
5656+ cs.end_document collector
5757+5858+(** Walk a DOM node with multiple checker states. *)
5959+let rec walk_node_all css collector node =
6060+ let open Html5rw.Dom in
6161+ match node.name with
6262+ | "#text" ->
6363+ (* Text node: emit characters event to all checkers *)
6464+ List.iter (fun cs -> cs.characters node.data collector) css
6565+ | "#comment" ->
6666+ (* Comment node: emit characters event with comment text to all checkers *)
6767+ List.iter (fun cs -> cs.characters node.data collector) css
6868+ | "#document" | "#document-fragment" ->
6969+ (* Document/fragment nodes: just traverse children *)
7070+ List.iter (walk_node_all css collector) node.children
7171+ | "!doctype" ->
7272+ (* Doctype node: skip *)
7373+ ()
7474+ | _ ->
7575+ (* Element node: emit start to all checkers, traverse children, emit end to all *)
7676+ List.iter (fun cs ->
7777+ cs.start_element ~name:node.name ~namespace:node.namespace ~attrs:node.attrs collector
7878+ ) css;
7979+ List.iter (walk_node_all css collector) node.children;
8080+ List.iter (fun cs ->
8181+ cs.end_element ~name:node.name ~namespace:node.namespace collector
8282+ ) css
8383+8484+let walk_all checkers collector node =
8585+ (* Create checker state packages *)
8686+ let css = List.map make_checker_state checkers in
8787+ (* Traverse with all checkers *)
8888+ walk_node_all css collector node;
8989+ (* Call end_document on all checkers *)
9090+ List.iter (fun cs -> cs.end_document collector) css
9191+9292+let walk_registry registry collector node =
9393+ let checkers = Checker_registry.all registry in
9494+ walk_all checkers collector node
+197
lib/html5_checker/dom_walker.mli
···11+(** DOM tree traversal for HTML5 conformance checking.
22+33+ This module provides functions to traverse DOM trees and apply checkers
44+ to validate HTML5 documents. It implements a depth-first, in-order
55+ traversal that visits every node in the tree and notifies checkers
66+ of traversal events.
77+88+ {2 Traversal Model}
99+1010+ The walker follows a SAX-like event model, emitting events as it
1111+ encounters different node types during traversal:
1212+1313+ {v
1414+ Document
1515+ └── html (start_element "html")
1616+ ├── head (start_element "head")
1717+ │ └── title (start_element "title")
1818+ │ ├── #text "Page Title" (characters)
1919+ │ └── (end_element "title")
2020+ └── body (start_element "body")
2121+ └── p (start_element "p")
2222+ ├── #text "Hello " (characters)
2323+ ├── b (start_element "b")
2424+ │ ├── #text "world" (characters)
2525+ │ └── (end_element "b")
2626+ ├── #text "!" (characters)
2727+ └── (end_element "p")
2828+ end_document
2929+ v}
3030+3131+ {2 Event Sequence}
3232+3333+ For each element node:
3434+ 1. {!Checker.S.start_element} is called when entering the element
3535+ 2. Children are recursively traversed
3636+ 3. {!Checker.S.end_element} is called when exiting the element
3737+3838+ For text and comment nodes:
3939+ - {!Checker.S.characters} is called with the text content
4040+4141+ After the entire tree is traversed:
4242+ - {!Checker.S.end_document} is called on all checkers
4343+4444+ {2 Checker Coordination}
4545+4646+ When multiple checkers are used:
4747+ - All checkers receive the same event sequence
4848+ - Events are delivered to checkers in the order they appear in the list
4949+ - Each checker maintains independent state
5050+ - Messages from all checkers are collected together
5151+5252+ This allows composing orthogonal validation rules without interference.
5353+5454+ {2 Usage Examples}
5555+5656+ {b Single checker:}
5757+ {[
5858+ let checker = Checker.noop () in
5959+ let collector = Message_collector.create () in
6060+ walk checker collector dom;
6161+ let messages = Message_collector.messages collector in
6262+ List.iter Message.pp messages
6363+ ]}
6464+6565+ {b Multiple checkers:}
6666+ {[
6767+ let checkers = [checker1; checker2; checker3] in
6868+ let collector = Message_collector.create () in
6969+ walk_all checkers collector dom;
7070+ (* Analyze messages from all checkers *)
7171+ ]}
7272+7373+ {b Registry of checkers:}
7474+ {[
7575+ let registry = Checker_registry.default () in
7676+ let collector = Message_collector.create () in
7777+ walk_registry registry collector dom;
7878+ (* All registered checkers have validated the DOM *)
7979+ ]} *)
8080+8181+(** {1 Single Checker Traversal} *)
8282+8383+val walk : Checker.t -> Message_collector.t -> Html5rw.Dom.node -> unit
8484+(** [walk checker collector node] traverses a DOM tree with a single checker.
8585+8686+ @param checker The checker to apply during traversal
8787+ @param collector The message collector for validation messages
8888+ @param node The root node to start traversal from
8989+9090+ The traversal is depth-first and in-order: for each element, the
9191+ checker receives a {!Checker.S.start_element} event, then children
9292+ are recursively traversed, then an {!Checker.S.end_element} event
9393+ is emitted.
9494+9595+ After the entire tree is traversed, {!Checker.S.end_document} is
9696+ called to allow the checker to emit any final validation messages.
9797+9898+ {b Example:}
9999+ {[
100100+ (* Validate a parsed HTML document *)
101101+ let checker = Checker.noop () in
102102+ let collector = Message_collector.create () in
103103+ walk checker collector document_node;
104104+105105+ (* Check for errors *)
106106+ let messages = Message_collector.messages collector in
107107+ let errors = List.filter
108108+ (fun msg -> msg.Message.severity = Message.Error)
109109+ messages in
110110+ if errors <> [] then
111111+ Printf.printf "Found %d errors\n" (List.length errors)
112112+ ]}
113113+114114+ {b Notes:}
115115+ - Only element nodes trigger start/end events
116116+ - Text and comment nodes trigger character events
117117+ - Document and doctype nodes are silently skipped
118118+ - The traversal follows document order (parent before children,
119119+ earlier siblings before later ones) *)
120120+121121+(** {1 Multiple Checker Traversal} *)
122122+123123+val walk_all :
124124+ Checker.t list -> Message_collector.t -> Html5rw.Dom.node -> unit
125125+(** [walk_all checkers collector node] traverses a DOM tree with multiple
126126+ checkers.
127127+128128+ @param checkers List of checkers to apply during traversal
129129+ @param collector The message collector for validation messages
130130+ @param node The root node to start traversal from
131131+132132+ This performs a single tree traversal, delivering each event to all
133133+ checkers in sequence. This is more efficient than calling {!walk}
134134+ multiple times.
135135+136136+ All checkers receive events in the order they appear in the list.
137137+ Each checker maintains independent state, so validation rules can
138138+ be composed without interference.
139139+140140+ {b Example:}
141141+ {[
142142+ (* Run multiple validation passes in one traversal *)
143143+ let structure_checker = (module StructureChecker : Checker.S) in
144144+ let attribute_checker = (module AttributeChecker : Checker.S) in
145145+ let obsolete_checker = (module ObsoleteChecker : Checker.S) in
146146+147147+ let checkers = [structure_checker; attribute_checker; obsolete_checker] in
148148+ let collector = Message_collector.create () in
149149+150150+ walk_all checkers collector document_node;
151151+152152+ (* All three checkers have validated the document *)
153153+ let messages = Message_collector.messages collector in
154154+ Message_format.print_messages messages
155155+ ]}
156156+157157+ {b Empty list behavior:}
158158+ If the checkers list is empty, the tree is traversed but no validation
159159+ is performed. This is equivalent to calling [walk (Checker.noop ()) ...]. *)
160160+161161+(** {1 Registry-Based Traversal} *)
162162+163163+val walk_registry :
164164+ Checker_registry.t -> Message_collector.t -> Html5rw.Dom.node -> unit
165165+(** [walk_registry registry collector node] traverses a DOM tree with all
166166+ checkers from a registry.
167167+168168+ @param registry The registry containing checkers to apply
169169+ @param collector The message collector for validation messages
170170+ @param node The root node to start traversal from
171171+172172+ This is equivalent to:
173173+ {[
174174+ let checkers = Checker_registry.all registry in
175175+ walk_all checkers collector node
176176+ ]}
177177+178178+ Use this when you want to run a pre-configured set of checkers
179179+ without manually extracting them from the registry.
180180+181181+ {b Example:}
182182+ {[
183183+ (* Set up registry with desired checkers *)
184184+ let registry = Checker_registry.default () in
185185+ Checker_registry.register registry "custom" my_checker;
186186+187187+ (* Validate multiple documents with same checker set *)
188188+ List.iter (fun doc ->
189189+ let collector = Message_collector.create () in
190190+ walk_registry registry collector doc;
191191+ report_results collector
192192+ ) documents
193193+ ]}
194194+195195+ {b Empty registry behavior:}
196196+ If the registry is empty, the tree is traversed but no validation
197197+ is performed. *)
···11+(*---------------------------------------------------------------------------
22+ Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
33+ SPDX-License-Identifier: MIT
44+ ---------------------------------------------------------------------------*)
55+66+module Message = Message
77+module Message_collector = Message_collector
88+module Message_format = Message_format
99+module Parse_error_bridge = Parse_error_bridge
1010+module Content_category = Content_category
1111+module Content_model = Content_model
1212+module Attr_spec = Attr_spec
1313+module Element_spec = Element_spec
1414+1515+type t = {
1616+ doc : Html5rw.t;
1717+ msgs : Message.t list;
1818+ system_id : string option;
1919+}
2020+2121+let check ?(collect_parse_errors = true) ?system_id reader =
2222+ let doc = Html5rw.parse ~collect_errors:collect_parse_errors reader in
2323+ let collector = Message_collector.create () in
2424+2525+ (* Add parse errors if collected *)
2626+ if collect_parse_errors then begin
2727+ let parse_errors = Parse_error_bridge.collect_parse_errors ?system_id doc in
2828+ List.iter (Message_collector.add collector) parse_errors
2929+ end;
3030+3131+ (* TODO: Run checkers via dom_walker when available *)
3232+ (* Dom_walker.walk_registry registry (Html5rw.root doc) collector; *)
3333+3434+ { doc; msgs = Message_collector.messages collector; system_id }
3535+3636+let check_dom ?(collect_parse_errors = true) ?system_id doc =
3737+ let collector = Message_collector.create () in
3838+3939+ (* Add parse errors if requested *)
4040+ if collect_parse_errors then begin
4141+ let parse_errors = Parse_error_bridge.collect_parse_errors ?system_id doc in
4242+ List.iter (Message_collector.add collector) parse_errors
4343+ end;
4444+4545+ (* TODO: Run checkers via dom_walker when available *)
4646+ (* Dom_walker.walk_registry registry (Html5rw.root doc) collector; *)
4747+4848+ { doc; msgs = Message_collector.messages collector; system_id }
4949+5050+let messages t = t.msgs
5151+5252+let errors t =
5353+ List.filter
5454+ (fun msg -> msg.Message.severity = Message.Error)
5555+ t.msgs
5656+5757+let warnings t =
5858+ List.filter
5959+ (fun msg -> msg.Message.severity = Message.Warning)
6060+ t.msgs
6161+6262+let has_errors t =
6363+ List.exists
6464+ (fun msg -> msg.Message.severity = Message.Error)
6565+ t.msgs
6666+6767+let document t = t.doc
6868+6969+let system_id t = t.system_id
7070+7171+let format_text t =
7272+ Message_format.format_text ?system_id:t.system_id t.msgs
7373+7474+let format_json t =
7575+ Message_format.format_json ?system_id:t.system_id t.msgs
7676+7777+let format_gnu t =
7878+ Message_format.format_gnu ?system_id:t.system_id t.msgs
+102
lib/html5_checker/html5_checker.mli
···11+(*---------------------------------------------------------------------------
22+ Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
33+ SPDX-License-Identifier: MIT
44+ ---------------------------------------------------------------------------*)
55+66+(** HTML5 conformance checker.
77+88+ This module provides HTML5 validation and conformance checking,
99+ combining parse error detection with structural validation rules. *)
1010+1111+(** {1 Re-exported modules} *)
1212+1313+(** Validation message types and constructors. *)
1414+module Message = Message
1515+1616+(** Message collection utilities. *)
1717+module Message_collector = Message_collector
1818+1919+(** Message output formatters. *)
2020+module Message_format = Message_format
2121+2222+(** Parse error bridge. *)
2323+module Parse_error_bridge = Parse_error_bridge
2424+2525+(** {2 Content Model Framework} *)
2626+2727+(** HTML5 content categories. *)
2828+module Content_category = Content_category
2929+3030+(** HTML5 element content models. *)
3131+module Content_model = Content_model
3232+3333+(** HTML5 attribute specifications. *)
3434+module Attr_spec = Attr_spec
3535+3636+(** HTML5 element specifications. *)
3737+module Element_spec = Element_spec
3838+3939+(** {1 Core Types} *)
4040+4141+(** Result of checking an HTML document. *)
4242+type t
4343+4444+(** {1 Checking Functions} *)
4545+4646+(** Parse and validate HTML from a reader.
4747+4848+ This function parses the HTML input and optionally collects parse errors.
4949+ Future versions will also run conformance checkers on the resulting DOM.
5050+5151+ @param collect_parse_errors If true, collect and include parse errors. Default: true.
5252+ @param system_id Optional file path or URL for error reporting.
5353+ @param reader Bytesrw reader containing HTML input. *)
5454+val check :
5555+ ?collect_parse_errors:bool ->
5656+ ?system_id:string ->
5757+ Bytesrw.Bytes.Reader.t ->
5858+ t
5959+6060+(** Validate an already-parsed HTML document.
6161+6262+ This function takes an existing Html5rw.t parse result and validates it.
6363+6464+ @param collect_parse_errors If true, collect and include parse errors from the result. Default: true.
6565+ @param system_id Optional file path or URL for error reporting.
6666+ @param result Already-parsed HTML document. *)
6767+val check_dom :
6868+ ?collect_parse_errors:bool ->
6969+ ?system_id:string ->
7070+ Html5rw.t ->
7171+ t
7272+7373+(** {1 Result Accessors} *)
7474+7575+(** Get all validation messages. *)
7676+val messages : t -> Message.t list
7777+7878+(** Get only error messages. *)
7979+val errors : t -> Message.t list
8080+8181+(** Get only warning messages. *)
8282+val warnings : t -> Message.t list
8383+8484+(** Check if there are any errors. *)
8585+val has_errors : t -> bool
8686+8787+(** Get the underlying parsed document. *)
8888+val document : t -> Html5rw.t
8989+9090+(** Get the system identifier if set. *)
9191+val system_id : t -> string option
9292+9393+(** {1 Formatting} *)
9494+9595+(** Format messages as human-readable text. *)
9696+val format_text : t -> string
9797+9898+(** Format messages as JSON. *)
9999+val format_json : t -> string
100100+101101+(** Format messages in GNU style. *)
102102+val format_gnu : t -> string
lib/html5_checker/message.cmi
This is a binary file and will not be displayed.
+80
lib/html5_checker/message.ml
···11+type severity = Error | Warning | Info
22+33+type location = {
44+ line : int;
55+ column : int;
66+ end_line : int option;
77+ end_column : int option;
88+ system_id : string option;
99+}
1010+1111+type t = {
1212+ severity : severity;
1313+ message : string;
1414+ code : string option;
1515+ location : location option;
1616+ element : string option;
1717+ attribute : string option;
1818+ extract : string option;
1919+}
2020+2121+let make ~severity ~message ?code ?location ?element ?attribute ?extract () =
2222+ { severity; message; code; location; element; attribute; extract }
2323+2424+let error ~message ?code ?location ?element ?attribute ?extract () =
2525+ make ~severity:Error ~message ?code ?location ?element ?attribute ?extract ()
2626+2727+let warning ~message ?code ?location ?element ?attribute ?extract () =
2828+ make ~severity:Warning ~message ?code ?location ?element ?attribute ?extract
2929+ ()
3030+3131+let info ~message ?code ?location ?element ?attribute ?extract () =
3232+ make ~severity:Info ~message ?code ?location ?element ?attribute ?extract ()
3333+3434+let make_location ~line ~column ?end_line ?end_column ?system_id () =
3535+ { line; column; end_line; end_column; system_id }
3636+3737+let severity_to_string = function
3838+ | Error -> "error"
3939+ | Warning -> "warning"
4040+ | Info -> "info"
4141+4242+let pp_severity fmt severity =
4343+ Format.pp_print_string fmt (severity_to_string severity)
4444+4545+let pp_location fmt loc =
4646+ match loc.system_id with
4747+ | Some sid -> Format.fprintf fmt "%s:" sid
4848+ | None -> ();
4949+ Format.fprintf fmt "%d:%d" loc.line loc.column;
5050+ match (loc.end_line, loc.end_column) with
5151+ | Some el, Some ec when el = loc.line && ec > loc.column ->
5252+ Format.fprintf fmt "-%d" ec
5353+ | Some el, Some ec when el > loc.line ->
5454+ Format.fprintf fmt "-%d:%d" el ec
5555+ | _ -> ()
5656+5757+let pp fmt msg =
5858+ (match msg.location with
5959+ | Some loc ->
6060+ pp_location fmt loc;
6161+ Format.fprintf fmt ": "
6262+ | None -> ());
6363+ pp_severity fmt msg.severity;
6464+ (match msg.code with
6565+ | Some code -> Format.fprintf fmt " [%s]" code
6666+ | None -> ());
6767+ Format.fprintf fmt ": %s" msg.message;
6868+ (match msg.element with
6969+ | Some elem -> Format.fprintf fmt " (element: %s)" elem
7070+ | None -> ());
7171+ match msg.attribute with
7272+ | Some attr -> Format.fprintf fmt " (attribute: %s)" attr
7373+ | None -> ()
7474+7575+let to_string msg =
7676+ let buf = Buffer.create 256 in
7777+ let fmt = Format.formatter_of_buffer buf in
7878+ pp fmt msg;
7979+ Format.pp_print_flush fmt ();
8080+ Buffer.contents buf
+104
lib/html5_checker/message.mli
···11+(** HTML5 validation messages.
22+33+ This module provides types for validation messages including errors,
44+ warnings, and informational messages with source location tracking. *)
55+66+(** Message severity levels. *)
77+type severity =
88+ | Error (** Conformance error - document is invalid *)
99+ | Warning (** Conformance warning - likely problematic *)
1010+ | Info (** Informational - suggestions for improvement *)
1111+1212+(** Source location information. *)
1313+type location = {
1414+ line : int; (** 1-indexed line number *)
1515+ column : int; (** 1-indexed column number *)
1616+ end_line : int option; (** Optional end line for ranges *)
1717+ end_column : int option; (** Optional end column *)
1818+ system_id : string option; (** File path or URL *)
1919+}
2020+2121+(** A validation message. *)
2222+type t = {
2323+ severity : severity;
2424+ message : string; (** Human-readable description *)
2525+ code : string option; (** Machine-readable error code *)
2626+ location : location option;
2727+ element : string option; (** Element name if relevant *)
2828+ attribute : string option; (** Attribute name if relevant *)
2929+ extract : string option; (** Source excerpt *)
3030+}
3131+3232+(** {1 Constructors} *)
3333+3434+(** Create a validation message with specified severity. *)
3535+val make :
3636+ severity:severity ->
3737+ message:string ->
3838+ ?code:string ->
3939+ ?location:location ->
4040+ ?element:string ->
4141+ ?attribute:string ->
4242+ ?extract:string ->
4343+ unit ->
4444+ t
4545+4646+(** Create an error message. *)
4747+val error :
4848+ message:string ->
4949+ ?code:string ->
5050+ ?location:location ->
5151+ ?element:string ->
5252+ ?attribute:string ->
5353+ ?extract:string ->
5454+ unit ->
5555+ t
5656+5757+(** Create a warning message. *)
5858+val warning :
5959+ message:string ->
6060+ ?code:string ->
6161+ ?location:location ->
6262+ ?element:string ->
6363+ ?attribute:string ->
6464+ ?extract:string ->
6565+ unit ->
6666+ t
6767+6868+(** Create an informational message. *)
6969+val info :
7070+ message:string ->
7171+ ?code:string ->
7272+ ?location:location ->
7373+ ?element:string ->
7474+ ?attribute:string ->
7575+ ?extract:string ->
7676+ unit ->
7777+ t
7878+7979+(** Create a location record. *)
8080+val make_location :
8181+ line:int ->
8282+ column:int ->
8383+ ?end_line:int ->
8484+ ?end_column:int ->
8585+ ?system_id:string ->
8686+ unit ->
8787+ location
8888+8989+(** {1 Formatting} *)
9090+9191+(** Convert severity to string representation. *)
9292+val severity_to_string : severity -> string
9393+9494+(** Pretty-print severity. *)
9595+val pp_severity : Format.formatter -> severity -> unit
9696+9797+(** Pretty-print location. *)
9898+val pp_location : Format.formatter -> location -> unit
9999+100100+(** Pretty-print a validation message. *)
101101+val pp : Format.formatter -> t -> unit
102102+103103+(** Convert a validation message to a string. *)
104104+val to_string : t -> string
+38
lib/html5_checker/message_collector.ml
···11+type t = { mutable messages : Message.t list }
22+33+let create () = { messages = [] }
44+55+let add t msg = t.messages <- msg :: t.messages
66+77+let add_error t ~message ?code ?location ?element ?attribute ?extract () =
88+ let msg =
99+ Message.error ~message ?code ?location ?element ?attribute ?extract ()
1010+ in
1111+ add t msg
1212+1313+let add_warning t ~message ?code ?location ?element ?attribute ?extract () =
1414+ let msg =
1515+ Message.warning ~message ?code ?location ?element ?attribute ?extract ()
1616+ in
1717+ add t msg
1818+1919+let messages t = List.rev t.messages
2020+2121+let errors t =
2222+ List.filter (fun msg -> msg.Message.severity = Message.Error) (messages t)
2323+2424+let warnings t =
2525+ List.filter (fun msg -> msg.Message.severity = Message.Warning) (messages t)
2626+2727+let has_errors t =
2828+ List.exists (fun msg -> msg.Message.severity = Message.Error) t.messages
2929+3030+let count t = List.length t.messages
3131+3232+let error_count t =
3333+ List.fold_left
3434+ (fun acc msg ->
3535+ if msg.Message.severity = Message.Error then acc + 1 else acc)
3636+ 0 t.messages
3737+3838+let clear t = t.messages <- []
+65
lib/html5_checker/message_collector.mli
···11+(** Message collector for accumulating validation messages. *)
22+33+(** The type of a message collector. *)
44+type t
55+66+(** {1 Creation} *)
77+88+(** Create a new empty message collector. *)
99+val create : unit -> t
1010+1111+(** {1 Adding Messages} *)
1212+1313+(** Add a message to the collector. *)
1414+val add : t -> Message.t -> unit
1515+1616+(** Add an error message to the collector. *)
1717+val add_error :
1818+ t ->
1919+ message:string ->
2020+ ?code:string ->
2121+ ?location:Message.location ->
2222+ ?element:string ->
2323+ ?attribute:string ->
2424+ ?extract:string ->
2525+ unit ->
2626+ unit
2727+2828+(** Add a warning message to the collector. *)
2929+val add_warning :
3030+ t ->
3131+ message:string ->
3232+ ?code:string ->
3333+ ?location:Message.location ->
3434+ ?element:string ->
3535+ ?attribute:string ->
3636+ ?extract:string ->
3737+ unit ->
3838+ unit
3939+4040+(** {1 Retrieving Messages} *)
4141+4242+(** Get all messages in the order they were added. *)
4343+val messages : t -> Message.t list
4444+4545+(** Get only error messages. *)
4646+val errors : t -> Message.t list
4747+4848+(** Get only warning messages. *)
4949+val warnings : t -> Message.t list
5050+5151+(** {1 Status Queries} *)
5252+5353+(** Check if the collector contains any error messages. *)
5454+val has_errors : t -> bool
5555+5656+(** Get the total number of messages. *)
5757+val count : t -> int
5858+5959+(** Get the number of error messages. *)
6060+val error_count : t -> int
6161+6262+(** {1 Modification} *)
6363+6464+(** Clear all messages from the collector. *)
6565+val clear : t -> unit
+130
lib/html5_checker/message_format.ml
···11+let format_text ?system_id messages =
22+ let buf = Buffer.create 1024 in
33+ List.iter
44+ (fun msg ->
55+ let loc_str =
66+ match msg.Message.location with
77+ | Some loc -> (
88+ let sid =
99+ match loc.Message.system_id with
1010+ | Some s -> s
1111+ | None -> (
1212+ match system_id with Some s -> s | None -> "input")
1313+ in
1414+ let col_info =
1515+ match (loc.end_line, loc.end_column) with
1616+ | Some el, Some ec when el = loc.line && ec > loc.column ->
1717+ Printf.sprintf "%d.%d-%d" loc.line loc.column ec
1818+ | Some el, Some ec when el > loc.line ->
1919+ Printf.sprintf "%d.%d-%d.%d" loc.line loc.column el ec
2020+ | _ -> Printf.sprintf "%d.%d" loc.line loc.column
2121+ in
2222+ Printf.sprintf "%s:%s" sid col_info)
2323+ | None -> (
2424+ match system_id with Some s -> s | None -> "input")
2525+ in
2626+ let severity_str = Message.severity_to_string msg.Message.severity in
2727+ let code_str =
2828+ match msg.Message.code with Some c -> " [" ^ c ^ "]" | None -> ""
2929+ in
3030+ let elem_str =
3131+ match msg.Message.element with
3232+ | Some e -> " (element: " ^ e ^ ")"
3333+ | None -> ""
3434+ in
3535+ let attr_str =
3636+ match msg.Message.attribute with
3737+ | Some a -> " (attribute: " ^ a ^ ")"
3838+ | None -> ""
3939+ in
4040+ Buffer.add_string buf
4141+ (Printf.sprintf "%s: %s%s: %s%s%s\n" loc_str severity_str code_str
4242+ msg.Message.message elem_str attr_str))
4343+ messages;
4444+ Buffer.contents buf
4545+4646+let format_gnu ?system_id messages =
4747+ let buf = Buffer.create 1024 in
4848+ List.iter
4949+ (fun msg ->
5050+ let loc_str =
5151+ match msg.Message.location with
5252+ | Some loc -> (
5353+ let sid =
5454+ match loc.Message.system_id with
5555+ | Some s -> s
5656+ | None -> (
5757+ match system_id with Some s -> s | None -> "input")
5858+ in
5959+ Printf.sprintf "%s:%d:%d" sid loc.line loc.column)
6060+ | None -> (
6161+ match system_id with Some s -> s ^ ":0:0" | None -> "input:0:0")
6262+ in
6363+ let severity_str = Message.severity_to_string msg.Message.severity in
6464+ let code_str =
6565+ match msg.Message.code with Some c -> " [" ^ c ^ "]" | None -> ""
6666+ in
6767+ Buffer.add_string buf
6868+ (Printf.sprintf "%s: %s%s: %s\n" loc_str severity_str code_str
6969+ msg.Message.message))
7070+ messages;
7171+ Buffer.contents buf
7272+7373+let message_to_json ?system_id msg =
7474+ let open Jsont in
7575+ let severity = String (Message.severity_to_string msg.Message.severity, Meta.none) in
7676+ let message_text = String (msg.Message.message, Meta.none) in
7777+ let base = [ (("type", Meta.none), severity); (("message", Meta.none), message_text) ] in
7878+ let with_code =
7979+ match msg.Message.code with
8080+ | Some c -> (("subType", Meta.none), String (c, Meta.none)) :: base
8181+ | None -> base
8282+ in
8383+ let with_location =
8484+ match msg.Message.location with
8585+ | Some loc ->
8686+ let line = Number (float_of_int loc.Message.line, Meta.none) in
8787+ let column = Number (float_of_int loc.Message.column, Meta.none) in
8888+ let loc_fields =
8989+ [ (("firstLine", Meta.none), line); (("firstColumn", Meta.none), column) ]
9090+ in
9191+ let loc_fields =
9292+ match loc.Message.end_line with
9393+ | Some el ->
9494+ (("lastLine", Meta.none), Number (float_of_int el, Meta.none)) :: loc_fields
9595+ | None -> loc_fields
9696+ in
9797+ let loc_fields =
9898+ match loc.Message.end_column with
9999+ | Some ec ->
100100+ (("lastColumn", Meta.none), Number (float_of_int ec, Meta.none))
101101+ :: loc_fields
102102+ | None -> loc_fields
103103+ in
104104+ let url =
105105+ match loc.Message.system_id with
106106+ | Some s -> s
107107+ | None -> (
108108+ match system_id with Some s -> s | None -> "input")
109109+ in
110110+ (("url", Meta.none), String (url, Meta.none)) :: loc_fields @ with_code
111111+ | None ->
112112+ let url =
113113+ match system_id with Some s -> s | None -> "input"
114114+ in
115115+ (("url", Meta.none), String (url, Meta.none)) :: with_code
116116+ in
117117+ let with_extract =
118118+ match msg.Message.extract with
119119+ | Some e -> (("extract", Meta.none), String (e, Meta.none)) :: with_location
120120+ | None -> with_location
121121+ in
122122+ Object (with_extract, Meta.none)
123123+124124+let format_json ?system_id messages =
125125+ let open Jsont in
126126+ let msg_array = Array (List.map (message_to_json ?system_id) messages, Meta.none) in
127127+ let obj = Object ([ (("messages", Meta.none), msg_array) ], Meta.none) in
128128+ match Jsont_bytesrw.encode_string ~format:Minify json obj with
129129+ | Ok s -> s
130130+ | Error e -> failwith ("JSON encoding error: " ^ e)
+28
lib/html5_checker/message_format.mli
···11+(** Message output formatters.
22+33+ This module provides various output formats for validation messages,
44+ including text, JSON, and GNU-style formats for IDE integration. *)
55+66+(** {1 Formatters} *)
77+88+(** Format messages as human-readable text.
99+1010+ Output format: [file:line:col: severity: message]
1111+1212+ @param system_id Optional default system identifier for messages without location. *)
1313+val format_text : ?system_id:string -> Message.t list -> string
1414+1515+(** Format messages as JSON.
1616+1717+ Produces output compatible with the Nu HTML Validator JSON format.
1818+1919+ @param system_id Optional default system identifier for messages without location. *)
2020+val format_json : ?system_id:string -> Message.t list -> string
2121+2222+(** Format messages in GNU style for IDE integration.
2323+2424+ Output format follows GNU conventions for error messages, compatible
2525+ with most IDEs and editors.
2626+2727+ @param system_id Optional default system identifier for messages without location. *)
2828+val format_gnu : ?system_id:string -> Message.t list -> string
+22
lib/html5_checker/parse_error_bridge.ml
···11+(*---------------------------------------------------------------------------
22+ Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
33+ SPDX-License-Identifier: MIT
44+ ---------------------------------------------------------------------------*)
55+66+let of_parse_error ?system_id err =
77+ let code = Html5rw.error_code err in
88+ let line = Html5rw.error_line err in
99+ let column = Html5rw.error_column err in
1010+ let location =
1111+ Message.make_location ~line ~column ?system_id ()
1212+ in
1313+ let code_str = Html5rw.Parse_error_code.to_string code in
1414+ Message.error
1515+ ~message:(Printf.sprintf "Parse error: %s" code_str)
1616+ ~code:code_str
1717+ ~location
1818+ ()
1919+2020+let collect_parse_errors ?system_id result =
2121+ let errors = Html5rw.errors result in
2222+ List.map (of_parse_error ?system_id) errors
+25
lib/html5_checker/parse_error_bridge.mli
···11+(*---------------------------------------------------------------------------
22+ Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
33+ SPDX-License-Identifier: MIT
44+ ---------------------------------------------------------------------------*)
55+66+(** Bridge between Html5rw parse errors and validation messages.
77+88+ This module converts parse errors from the Html5rw parser into
99+ standardized validation messages. *)
1010+1111+(** Convert a parse error to a validation message.
1212+1313+ Extracts error code, line, column, and creates a Message.t with
1414+ severity set to Error.
1515+1616+ @param system_id Optional file path or URL to include in location *)
1717+val of_parse_error : ?system_id:string -> Html5rw.parse_error -> Message.t
1818+1919+(** Collect all parse errors from a parse result.
2020+2121+ Extracts all parse errors from the Html5rw.t result and converts
2222+ them to validation messages.
2323+2424+ @param system_id Optional file path or URL to include in locations *)
2525+val collect_parse_errors : ?system_id:string -> Html5rw.t -> Message.t list
+246
lib/html5_checker/semantic/form_checker.ml
···11+(** Form-related validation checker implementation. *)
22+33+type state = {
44+ mutable in_form : bool;
55+ (** Track if we're currently inside a <form> element *)
66+ mutable form_ids : string list;
77+ (** Stack of form IDs we're currently nested in *)
88+ mutable label_for_refs : string list;
99+ (** Collect all label[for] references to validate later *)
1010+ mutable element_ids : string list;
1111+ (** Collect all element IDs to validate label references *)
1212+ mutable unlabeled_controls : (string * string option) list;
1313+ (** Controls that might need labels: (type, id) *)
1414+}
1515+1616+let create () =
1717+ {
1818+ in_form = false;
1919+ form_ids = [];
2020+ label_for_refs = [];
2121+ element_ids = [];
2222+ unlabeled_controls = [];
2323+ }
2424+2525+let reset state =
2626+ state.in_form <- false;
2727+ state.form_ids <- [];
2828+ state.label_for_refs <- [];
2929+ state.element_ids <- [];
3030+ state.unlabeled_controls <- []
3131+3232+(** Check if an attribute list contains a specific attribute. *)
3333+let has_attr name attrs =
3434+ List.exists (fun (attr_name, _) -> String.equal attr_name name) attrs
3535+3636+(** Get the value of an attribute if present. *)
3737+let get_attr name attrs =
3838+ List.find_map
3939+ (fun (attr_name, value) ->
4040+ if String.equal attr_name name then Some value else None)
4141+ attrs
4242+4343+(** Check if an element is labelable. *)
4444+let _is_labelable_element name input_type =
4545+ match name with
4646+ | "button" | "meter" | "output" | "progress" | "select" | "textarea" -> true
4747+ | "input" -> (
4848+ match input_type with Some "hidden" -> false | _ -> true)
4949+ | _ -> false
5050+5151+(** Valid autocomplete tokens for various input types. *)
5252+let valid_autocomplete_tokens =
5353+ [
5454+ "on";
5555+ "off";
5656+ "name";
5757+ "honorific-prefix";
5858+ "given-name";
5959+ "additional-name";
6060+ "family-name";
6161+ "honorific-suffix";
6262+ "nickname";
6363+ "email";
6464+ "username";
6565+ "new-password";
6666+ "current-password";
6767+ "one-time-code";
6868+ "organization-title";
6969+ "organization";
7070+ "street-address";
7171+ "address-line1";
7272+ "address-line2";
7373+ "address-line3";
7474+ "address-level4";
7575+ "address-level3";
7676+ "address-level2";
7777+ "address-level1";
7878+ "country";
7979+ "country-name";
8080+ "postal-code";
8181+ "cc-name";
8282+ "cc-given-name";
8383+ "cc-additional-name";
8484+ "cc-family-name";
8585+ "cc-number";
8686+ "cc-exp";
8787+ "cc-exp-month";
8888+ "cc-exp-year";
8989+ "cc-csc";
9090+ "cc-type";
9191+ "transaction-currency";
9292+ "transaction-amount";
9393+ "language";
9494+ "bday";
9595+ "bday-day";
9696+ "bday-month";
9797+ "bday-year";
9898+ "sex";
9999+ "tel";
100100+ "tel-country-code";
101101+ "tel-national";
102102+ "tel-area-code";
103103+ "tel-local";
104104+ "tel-extension";
105105+ "impp";
106106+ "url";
107107+ "photo";
108108+ ]
109109+110110+let check_autocomplete_value value _input_type collector =
111111+ (* Parse autocomplete value - can be space-separated tokens *)
112112+ let tokens = String.split_on_char ' ' value |> List.map String.trim in
113113+ let tokens = List.filter (fun s -> String.length s > 0) tokens in
114114+115115+ (* The last token should be a valid autocomplete token *)
116116+ match List.rev tokens with
117117+ | [] -> ()
118118+ | last_token :: _prefix_tokens ->
119119+ if not (List.mem last_token valid_autocomplete_tokens) then
120120+ Message_collector.add_warning collector
121121+ ~message:
122122+ (Printf.sprintf "Unknown autocomplete value: %s" last_token)
123123+ ~code:"invalid-autocomplete-value" ~element:"input"
124124+ ~attribute:"autocomplete" ()
125125+126126+let check_input_element state attrs collector =
127127+ let input_type = get_attr "type" attrs in
128128+ let id = get_attr "id" attrs in
129129+130130+ (* Track this input's ID if present *)
131131+ (match id with
132132+ | Some id_val -> state.element_ids <- id_val :: state.element_ids
133133+ | None -> ());
134134+135135+ (* Check various input-specific rules *)
136136+ (match input_type with
137137+ | Some "radio" | Some "checkbox" ->
138138+ (* Radio and checkbox should have labels *)
139139+ state.unlabeled_controls <-
140140+ (Option.value input_type ~default:"text", id)
141141+ :: state.unlabeled_controls
142142+ | Some "submit" | Some "button" | Some "reset" ->
143143+ (* These don't need labels *)
144144+ ()
145145+ | _ -> ());
146146+147147+ (* Check autocomplete attribute *)
148148+ (match get_attr "autocomplete" attrs with
149149+ | Some autocomplete_value ->
150150+ check_autocomplete_value autocomplete_value input_type collector
151151+ | None -> ());
152152+153153+ (* Check for select multiple with size=1 *)
154154+ ()
155155+156156+let check_select_element attrs collector =
157157+ let multiple = has_attr "multiple" attrs in
158158+ let size = get_attr "size" attrs in
159159+160160+ match (multiple, size) with
161161+ | true, Some "1" ->
162162+ Message_collector.add_warning collector
163163+ ~message:"select element with multiple should not have size=\"1\""
164164+ ~code:"contradictory-attributes" ~element:"select" ~attribute:"size"
165165+ ()
166166+ | _ -> ()
167167+168168+let check_button_element state attrs collector =
169169+ (* button[type=submit] should be in form or have form attribute *)
170170+ let button_type = get_attr "type" attrs in
171171+ let has_form_attr = has_attr "form" attrs in
172172+173173+ match button_type with
174174+ | Some "submit" | None ->
175175+ (* Default type is submit *)
176176+ if (not state.in_form) && not has_form_attr then
177177+ Message_collector.add_warning collector
178178+ ~message:
179179+ "button element with type=\"submit\" should be inside a form or \
180180+ have form attribute"
181181+ ~code:"submit-button-outside-form" ~element:"button" ()
182182+ | _ -> ()
183183+184184+let check_label_element state attrs _collector =
185185+ (* Collect label[for] references *)
186186+ match get_attr "for" attrs with
187187+ | Some for_id -> state.label_for_refs <- for_id :: state.label_for_refs
188188+ | None -> ()
189189+190190+let start_element state ~name ~namespace:_ ~attrs collector =
191191+ (* Track element IDs *)
192192+ (match get_attr "id" attrs with
193193+ | Some id_val -> state.element_ids <- id_val :: state.element_ids
194194+ | None -> ());
195195+196196+ match name with
197197+ | "form" ->
198198+ state.in_form <- true;
199199+ (match get_attr "id" attrs with
200200+ | Some id -> state.form_ids <- id :: state.form_ids
201201+ | None -> ())
202202+ | "input" -> check_input_element state attrs collector
203203+ | "select" -> check_select_element attrs collector
204204+ | "button" -> check_button_element state attrs collector
205205+ | "label" -> check_label_element state attrs collector
206206+ | _ -> ()
207207+208208+let end_element state ~name ~namespace:_ _collector =
209209+ match name with
210210+ | "form" ->
211211+ state.in_form <- false;
212212+ (match state.form_ids with
213213+ | _ :: rest -> state.form_ids <- rest
214214+ | [] -> ())
215215+ | _ -> ()
216216+217217+let characters _state _text _collector = ()
218218+219219+let end_document state collector =
220220+ (* Validate label[for] references *)
221221+ List.iter
222222+ (fun for_id ->
223223+ if not (List.mem for_id state.element_ids) then
224224+ Message_collector.add_warning collector
225225+ ~message:
226226+ (Printf.sprintf
227227+ "label element references non-existent ID: %s" for_id)
228228+ ~code:"invalid-label-reference" ~element:"label" ~attribute:"for"
229229+ ())
230230+ state.label_for_refs;
231231+232232+ (* Note: We can't reliably detect unlabeled controls without tracking
233233+ label parent-child relationships, which would require more complex
234234+ state tracking. For now, we just validate explicit label[for] references. *)
235235+ ()
236236+237237+let checker = (module struct
238238+ type nonrec state = state
239239+240240+ let create = create
241241+ let reset = reset
242242+ let start_element = start_element
243243+ let end_element = end_element
244244+ let characters = characters
245245+ let end_document = end_document
246246+end : Checker.S)
+62
lib/html5_checker/semantic/form_checker.mli
···11+(** Form-related validation checker.
22+33+ Validates form control associations, label references, and form structure
44+ according to HTML5 accessibility and semantic requirements. This checker
55+ ensures that:
66+77+ - Form controls have proper labels
88+ - Label associations are valid
99+ - Form attributes are correctly configured
1010+ - Form structure follows HTML5 constraints
1111+1212+ {2 Validation Rules}
1313+1414+ {b Label Associations}
1515+ - [label] with [for] attribute should reference a labelable element ID
1616+ - [input\[type=radio\]] should have an associated visible label
1717+ - [input\[type=checkbox\]] should have an associated visible label
1818+1919+ {b Form Control Validation}
2020+ - [button\[type=submit\]] should be inside a [form] or have [form] attribute
2121+ - [input\[type=image\]] requires [alt] attribute (validated by required_attr_checker)
2222+ - [input\[type=hidden\]] should not have [required] attribute (validated by required_attr_checker)
2323+ - [input\[type=file\]] should not have [value] attribute (validated by required_attr_checker)
2424+2525+ {b Autocomplete}
2626+ - [autocomplete] values should be appropriate for the [input] type
2727+ - Common autocomplete values include: [on], [off], [name], [email],
2828+ [username], [current-password], [new-password], [street-address], etc.
2929+3030+ {b Select Elements}
3131+ - [select\[multiple\]] should not have [size="1"] (contradictory)
3232+ - [select] should contain at least one [option] or [optgroup]
3333+3434+ {b Accessibility}
3535+ - Form controls should be reachable and operable via keyboard
3636+ - Radio buttons with the same [name] should form a logical group
3737+3838+ {3 Labelable Elements}
3939+4040+ The following elements can be associated with a [label]:
4141+ - [button]
4242+ - [input] (except [type=hidden])
4343+ - [meter]
4444+ - [output]
4545+ - [progress]
4646+ - [select]
4747+ - [textarea]
4848+4949+ @see <https://html.spec.whatwg.org/multipage/forms.html> WHATWG HTML: Forms
5050+ @see <https://www.w3.org/WAI/WCAG21/Understanding/labels-or-instructions.html>
5151+ WCAG: Labels or Instructions *)
5252+5353+include Checker.S
5454+5555+val checker : Checker.t
5656+(** A first-class module instance of this checker.
5757+5858+ {b Usage:}
5959+ {[
6060+ let checker = Form_checker.checker in
6161+ Checker_registry.register "form-validation" checker
6262+ ]} *)
+219
lib/html5_checker/semantic/id_checker.ml
···11+(** ID uniqueness and reference checker.
22+33+ This checker validates that:
44+ - ID attributes are unique within the document
55+ - ID references point to existing IDs
66+ - ID values conform to HTML5 requirements *)
77+88+(** Location information for ID occurrences. *)
99+type id_location = {
1010+ element : string;
1111+ location : Message.location option;
1212+}
1313+1414+(** Information about an ID reference. *)
1515+type id_reference = {
1616+ referring_element : string;
1717+ attribute : string;
1818+ referenced_id : string;
1919+ location : Message.location option;
2020+}
2121+2222+(** Checker state tracking IDs and references. *)
2323+type state = {
2424+ ids : (string, id_location) Hashtbl.t;
2525+ mutable references : id_reference list;
2626+}
2727+2828+let create () =
2929+ {
3030+ ids = Hashtbl.create 64;
3131+ references = [];
3232+ }
3333+3434+let reset state =
3535+ Hashtbl.clear state.ids;
3636+ state.references <- []
3737+3838+(** Check if a string contains whitespace. *)
3939+let contains_whitespace s =
4040+ String.contains s ' ' || String.contains s '\t' ||
4141+ String.contains s '\n' || String.contains s '\r'
4242+4343+(** Extract ID from a usemap value (removes leading #). *)
4444+let extract_usemap_id value =
4545+ if String.length value > 0 && value.[0] = '#' then
4646+ Some (String.sub value 1 (String.length value - 1))
4747+ else
4848+ None
4949+5050+(** Split whitespace-separated ID references. *)
5151+let split_ids value =
5252+ let rec split acc start i =
5353+ if i >= String.length value then
5454+ if i > start then
5555+ (String.sub value start (i - start)) :: acc
5656+ else
5757+ acc
5858+ else
5959+ match value.[i] with
6060+ | ' ' | '\t' | '\n' | '\r' ->
6161+ let acc' =
6262+ if i > start then
6363+ (String.sub value start (i - start)) :: acc
6464+ else
6565+ acc
6666+ in
6767+ split acc' (i + 1) (i + 1)
6868+ | _ ->
6969+ split acc start (i + 1)
7070+ in
7171+ List.rev (split [] 0 0)
7272+7373+(** Attributes that reference a single ID. *)
7474+let single_id_ref_attrs = [
7575+ "for"; (* label *)
7676+ "form"; (* form-associated elements *)
7777+ "list"; (* input *)
7878+ "aria-activedescendant";
7979+]
8080+8181+(** Attributes that reference multiple IDs (space-separated). *)
8282+let multi_id_ref_attrs = [
8383+ "headers"; (* td, th *)
8484+ "aria-labelledby";
8585+ "aria-describedby";
8686+ "aria-controls";
8787+ "aria-flowto";
8888+ "aria-owns";
8989+ "itemref";
9090+]
9191+9292+(** Check and store an ID attribute. *)
9393+let check_id state ~element ~id ~location collector =
9494+ (* Check for empty ID *)
9595+ if String.length id = 0 then
9696+ Message_collector.add_error collector
9797+ ~message:"ID attribute must not be empty"
9898+ ~code:"empty-id"
9999+ ?location
100100+ ~element
101101+ ~attribute:"id"
102102+ ()
103103+ (* Check for whitespace in ID *)
104104+ else if contains_whitespace id then
105105+ Message_collector.add_error collector
106106+ ~message:(Printf.sprintf "ID attribute value '%s' must not contain whitespace" id)
107107+ ~code:"id-whitespace"
108108+ ?location
109109+ ~element
110110+ ~attribute:"id"
111111+ ()
112112+ (* Check for duplicate ID *)
113113+ else if Hashtbl.mem state.ids id then
114114+ let first_occurrence = Hashtbl.find state.ids id in
115115+ let first_loc_str = match first_occurrence.location with
116116+ | None -> ""
117117+ | Some loc -> Printf.sprintf " at line %d, column %d" loc.line loc.column
118118+ in
119119+ Message_collector.add_error collector
120120+ ~message:(Printf.sprintf
121121+ "Duplicate ID '%s': first used on <%s>%s, now on <%s>"
122122+ id first_occurrence.element first_loc_str element)
123123+ ~code:"duplicate-id"
124124+ ?location
125125+ ~element
126126+ ~attribute:"id"
127127+ ()
128128+ else
129129+ (* Store the ID *)
130130+ Hashtbl.add state.ids id { element; location }
131131+132132+(** Record a single ID reference. *)
133133+let add_reference state ~referring_element ~attribute ~referenced_id ~location =
134134+ if String.length referenced_id > 0 then
135135+ state.references <- {
136136+ referring_element;
137137+ attribute;
138138+ referenced_id;
139139+ location;
140140+ } :: state.references
141141+142142+(** Process attributes to check IDs and collect references. *)
143143+let process_attrs state ~element ~attrs ~location collector =
144144+ List.iter (fun (name, value) ->
145145+ match name with
146146+ | "id" ->
147147+ check_id state ~element ~id:value ~location collector
148148+149149+ | "usemap" ->
150150+ (* usemap references a map name, which is like an ID reference *)
151151+ begin match extract_usemap_id value with
152152+ | Some id ->
153153+ add_reference state ~referring_element:element
154154+ ~attribute:name ~referenced_id:id ~location
155155+ | None ->
156156+ if String.length value > 0 then
157157+ Message_collector.add_error collector
158158+ ~message:(Printf.sprintf
159159+ "usemap attribute value '%s' must start with '#'" value)
160160+ ~code:"invalid-usemap"
161161+ ?location
162162+ ~element
163163+ ~attribute:name
164164+ ()
165165+ end
166166+167167+ | attr when List.mem attr single_id_ref_attrs ->
168168+ add_reference state ~referring_element:element
169169+ ~attribute:attr ~referenced_id:value ~location
170170+171171+ | attr when List.mem attr multi_id_ref_attrs ->
172172+ (* Split space-separated IDs and add each as a reference *)
173173+ let ids = split_ids value in
174174+ List.iter (fun id ->
175175+ add_reference state ~referring_element:element
176176+ ~attribute:attr ~referenced_id:id ~location
177177+ ) ids
178178+179179+ | _ -> ()
180180+ ) attrs
181181+182182+let start_element state ~name ~namespace:_ ~attrs collector =
183183+ (* For now, we don't have location information from the DOM walker,
184184+ so we pass None. In a full implementation, this would be passed
185185+ from the parser. *)
186186+ let location = None in
187187+ process_attrs state ~element:name ~attrs ~location collector
188188+189189+let end_element _state ~name:_ ~namespace:_ _collector =
190190+ ()
191191+192192+let characters _state _text _collector =
193193+ ()
194194+195195+let end_document state collector =
196196+ (* Check all references point to existing IDs *)
197197+ List.iter (fun ref ->
198198+ if not (Hashtbl.mem state.ids ref.referenced_id) then
199199+ Message_collector.add_error collector
200200+ ~message:(Printf.sprintf
201201+ "The '%s' attribute on <%s> refers to ID '%s' which does not exist"
202202+ ref.attribute ref.referring_element ref.referenced_id)
203203+ ~code:"dangling-id-reference"
204204+ ?location:ref.location
205205+ ~element:ref.referring_element
206206+ ~attribute:ref.attribute
207207+ ()
208208+ ) state.references
209209+210210+let checker = (module struct
211211+ type nonrec state = state
212212+213213+ let create = create
214214+ let reset = reset
215215+ let start_element = start_element
216216+ let end_element = end_element
217217+ let characters = characters
218218+ let end_document = end_document
219219+end : Checker.S)
+11
lib/html5_checker/semantic/id_checker.mli
···11+(** ID uniqueness and reference checker.
22+33+ Validates:
44+ - ID attributes are unique within the document
55+ - ID references (for, headers, aria-*, etc.) point to existing IDs
66+ - ID values conform to HTML5 requirements *)
77+88+include Checker.S
99+1010+val checker : Checker.t
1111+(** [checker] is a checker instance for validating ID uniqueness and references. *)
+296
lib/html5_checker/semantic/nesting_checker.ml
···11+(** Interactive element nesting checker implementation. *)
22+33+(** Special ancestors that need tracking for nesting validation.
44+55+ This array defines the elements whose presence in the ancestor chain
66+ affects validation of descendant elements. The order is significant
77+ as it determines bit positions in the ancestor bitmask. *)
88+let special_ancestors =
99+ [| "a"; "address"; "body"; "button"; "caption"; "dfn"; "dt"; "figcaption";
1010+ "figure"; "footer"; "form"; "header"; "label"; "map"; "noscript"; "th";
1111+ "time"; "progress"; "meter"; "article"; "section"; "aside"; "nav"; "h1";
1212+ "h2"; "h3"; "h4"; "h5"; "h6" |]
1313+1414+(** Get the bit position for a special ancestor element.
1515+ Returns [-1] if the element is not a special ancestor. *)
1616+let special_ancestor_number name =
1717+ let rec find i =
1818+ if i >= Array.length special_ancestors then -1
1919+ else if special_ancestors.(i) = name then i
2020+ else find (i + 1)
2121+ in
2222+ find 0
2323+2424+(** Interactive elements that cannot be nested inside [a] or [button]. *)
2525+let interactive_elements =
2626+ [| "a"; "button"; "details"; "embed"; "iframe"; "label"; "select";
2727+ "textarea" |]
2828+2929+(** Map from descendant element name to bitmask of prohibited ancestors. *)
3030+let ancestor_mask_by_descendant : (string, int) Hashtbl.t =
3131+ Hashtbl.create 64
3232+3333+(** Register that [ancestor] is prohibited for [descendant]. *)
3434+let register_prohibited_ancestor ancestor descendant =
3535+ let number = special_ancestor_number ancestor in
3636+ if number = -1 then
3737+ failwith ("Ancestor not found in array: " ^ ancestor);
3838+ let mask =
3939+ match Hashtbl.find_opt ancestor_mask_by_descendant descendant with
4040+ | None -> 0
4141+ | Some m -> m
4242+ in
4343+ let new_mask = mask lor (1 lsl number) in
4444+ Hashtbl.replace ancestor_mask_by_descendant descendant new_mask
4545+4646+(** Initialize the prohibited ancestor map. *)
4747+let () =
4848+ (* Self-nesting restrictions *)
4949+ register_prohibited_ancestor "form" "form";
5050+ register_prohibited_ancestor "progress" "progress";
5151+ register_prohibited_ancestor "meter" "meter";
5252+ register_prohibited_ancestor "dfn" "dfn";
5353+ register_prohibited_ancestor "noscript" "noscript";
5454+ register_prohibited_ancestor "label" "label";
5555+5656+ (* Address restrictions *)
5757+ register_prohibited_ancestor "address" "address";
5858+ register_prohibited_ancestor "address" "section";
5959+ register_prohibited_ancestor "address" "nav";
6060+ register_prohibited_ancestor "address" "article";
6161+ register_prohibited_ancestor "address" "header";
6262+ register_prohibited_ancestor "address" "footer";
6363+ register_prohibited_ancestor "address" "h1";
6464+ register_prohibited_ancestor "address" "h2";
6565+ register_prohibited_ancestor "address" "h3";
6666+ register_prohibited_ancestor "address" "h4";
6767+ register_prohibited_ancestor "address" "h5";
6868+ register_prohibited_ancestor "address" "h6";
6969+7070+ (* Header/footer restrictions *)
7171+ register_prohibited_ancestor "header" "header";
7272+ register_prohibited_ancestor "footer" "header";
7373+ register_prohibited_ancestor "header" "footer";
7474+ register_prohibited_ancestor "footer" "footer";
7575+7676+ (* dt restrictions *)
7777+ register_prohibited_ancestor "dt" "header";
7878+ register_prohibited_ancestor "dt" "footer";
7979+ register_prohibited_ancestor "dt" "article";
8080+ register_prohibited_ancestor "dt" "nav";
8181+ register_prohibited_ancestor "dt" "section";
8282+ register_prohibited_ancestor "dt" "h1";
8383+ register_prohibited_ancestor "dt" "h2";
8484+ register_prohibited_ancestor "dt" "h3";
8585+ register_prohibited_ancestor "dt" "h4";
8686+ register_prohibited_ancestor "dt" "h5";
8787+ register_prohibited_ancestor "dt" "h6";
8888+ register_prohibited_ancestor "dt" "hgroup";
8989+9090+ (* th restrictions *)
9191+ register_prohibited_ancestor "th" "header";
9292+ register_prohibited_ancestor "th" "footer";
9393+ register_prohibited_ancestor "th" "article";
9494+ register_prohibited_ancestor "th" "nav";
9595+ register_prohibited_ancestor "th" "section";
9696+ register_prohibited_ancestor "th" "h1";
9797+ register_prohibited_ancestor "th" "h2";
9898+ register_prohibited_ancestor "th" "h3";
9999+ register_prohibited_ancestor "th" "h4";
100100+ register_prohibited_ancestor "th" "h5";
101101+ register_prohibited_ancestor "th" "h6";
102102+ register_prohibited_ancestor "th" "hgroup";
103103+104104+ (* Caption restriction *)
105105+ register_prohibited_ancestor "caption" "table";
106106+107107+ (* Interactive element restrictions: cannot be inside a or button *)
108108+ Array.iter (fun elem ->
109109+ register_prohibited_ancestor "a" elem;
110110+ register_prohibited_ancestor "button" elem
111111+ ) interactive_elements
112112+113113+(** Bitmask constants for common checks. *)
114114+let a_button_mask =
115115+ let a_num = special_ancestor_number "a" in
116116+ let button_num = special_ancestor_number "button" in
117117+ (1 lsl a_num) lor (1 lsl button_num)
118118+119119+let map_mask =
120120+ let map_num = special_ancestor_number "map" in
121121+ 1 lsl map_num
122122+123123+(** Stack node representing an element's context. *)
124124+type stack_node = {
125125+ ancestor_mask : int;
126126+ _name : string; [@warning "-69"]
127127+}
128128+129129+(** Checker state. *)
130130+type state = {
131131+ mutable stack : stack_node list;
132132+ mutable ancestor_mask : int;
133133+}
134134+135135+let create () =
136136+ { stack = []; ancestor_mask = 0 }
137137+138138+let reset state =
139139+ state.stack <- [];
140140+ state.ancestor_mask <- 0
141141+142142+(** Get attribute value by name from attribute list. *)
143143+let get_attr attrs name =
144144+ List.assoc_opt name attrs
145145+146146+(** Check if an attribute exists. *)
147147+let has_attr attrs name =
148148+ get_attr attrs name <> None
149149+150150+(** Check if element is interactive based on its attributes. *)
151151+let is_interactive_element name attrs =
152152+ match name with
153153+ | "a" ->
154154+ has_attr attrs "href"
155155+ | "audio" | "video" ->
156156+ has_attr attrs "controls"
157157+ | "img" | "object" ->
158158+ has_attr attrs "usemap"
159159+ | "input" ->
160160+ begin match get_attr attrs "type" with
161161+ | Some "hidden" -> false
162162+ | _ -> true
163163+ end
164164+ | "button" | "details" | "embed" | "iframe" | "label" | "select"
165165+ | "textarea" ->
166166+ true
167167+ | _ ->
168168+ false
169169+170170+(** Get a human-readable description of an element for error messages. *)
171171+let element_description name attrs =
172172+ match name with
173173+ | "a" when has_attr attrs "href" ->
174174+ "The element \"a\" with the attribute \"href\""
175175+ | "audio" when has_attr attrs "controls" ->
176176+ "The element \"audio\" with the attribute \"controls\""
177177+ | "video" when has_attr attrs "controls" ->
178178+ "The element \"video\" with the attribute \"controls\""
179179+ | "img" when has_attr attrs "usemap" ->
180180+ "The element \"img\" with the attribute \"usemap\""
181181+ | "object" when has_attr attrs "usemap" ->
182182+ "The element \"object\" with the attribute \"usemap\""
183183+ | _ ->
184184+ Printf.sprintf "The element \"%s\"" name
185185+186186+(** Report nesting violations. *)
187187+let check_nesting state name attrs collector =
188188+ (* Compute the prohibited ancestor mask for this element *)
189189+ let base_mask =
190190+ match Hashtbl.find_opt ancestor_mask_by_descendant name with
191191+ | Some m -> m
192192+ | None -> 0
193193+ in
194194+195195+ (* Add interactive element restrictions if applicable *)
196196+ let mask =
197197+ if is_interactive_element name attrs then
198198+ base_mask lor a_button_mask
199199+ else
200200+ base_mask
201201+ in
202202+203203+ (* Check for violations *)
204204+ if mask <> 0 then begin
205205+ let mask_hit = state.ancestor_mask land mask in
206206+ if mask_hit <> 0 then begin
207207+ let desc = element_description name attrs in
208208+ (* Find which ancestors are violated *)
209209+ Array.iteri (fun i ancestor ->
210210+ let bit = 1 lsl i in
211211+ if (mask_hit land bit) <> 0 then
212212+ Message_collector.add_error collector
213213+ ~message:(Printf.sprintf
214214+ "%s must not appear as a descendant of the \"%s\" element."
215215+ desc ancestor)
216216+ ~element:name
217217+ ()
218218+ ) special_ancestors
219219+ end
220220+ end
221221+222222+(** Check for required ancestors. *)
223223+let check_required_ancestors state name collector =
224224+ match name with
225225+ | "area" ->
226226+ if (state.ancestor_mask land map_mask) = 0 then
227227+ Message_collector.add_error collector
228228+ ~message:"The \"area\" element must have a \"map\" ancestor."
229229+ ~element:name
230230+ ()
231231+ | _ -> ()
232232+233233+let start_element state ~name ~namespace ~attrs collector =
234234+ (* Only check HTML elements, not SVG or MathML *)
235235+ match namespace with
236236+ | Some _ -> ()
237237+ | None ->
238238+ (* Check for nesting violations *)
239239+ check_nesting state name attrs collector;
240240+ check_required_ancestors state name collector;
241241+242242+ (* Update ancestor mask if this is a special ancestor *)
243243+ let new_mask = state.ancestor_mask in
244244+ let number = special_ancestor_number name in
245245+ let new_mask =
246246+ if number >= 0 then
247247+ new_mask lor (1 lsl number)
248248+ else
249249+ new_mask
250250+ in
251251+252252+ (* Add href tracking for <a> elements *)
253253+ let new_mask =
254254+ if name = "a" && has_attr attrs "href" then
255255+ let a_num = special_ancestor_number "a" in
256256+ new_mask lor (1 lsl a_num)
257257+ else
258258+ new_mask
259259+ in
260260+261261+ (* Push onto stack *)
262262+ let node = { ancestor_mask = state.ancestor_mask; _name = name } in
263263+ state.stack <- node :: state.stack;
264264+ state.ancestor_mask <- new_mask
265265+266266+let end_element state ~name:_ ~namespace _collector =
267267+ (* Only track HTML elements *)
268268+ match namespace with
269269+ | Some _ -> ()
270270+ | None ->
271271+ (* Pop from stack and restore ancestor mask *)
272272+ begin match state.stack with
273273+ | [] -> () (* Should not happen in well-formed documents *)
274274+ | node :: rest ->
275275+ state.stack <- rest;
276276+ state.ancestor_mask <- node.ancestor_mask
277277+ end
278278+279279+let characters _state _text _collector =
280280+ () (* No text-specific nesting checks *)
281281+282282+let end_document _state _collector =
283283+ () (* No document-level checks needed *)
284284+285285+(** Create the checker as a first-class module. *)
286286+let checker =
287287+ let module M = struct
288288+ type nonrec state = state
289289+ let create = create
290290+ let reset = reset
291291+ let start_element = start_element
292292+ let end_element = end_element
293293+ let characters = characters
294294+ let end_document = end_document
295295+ end in
296296+ (module M : Checker.S)
+79
lib/html5_checker/semantic/nesting_checker.mli
···11+(** Interactive element nesting checker.
22+33+ Validates that interactive elements are not nested in ways that violate
44+ HTML5 specifications. This checker tracks ancestor elements and ensures
55+ that prohibited nesting patterns are detected and reported.
66+77+ {2 Validation Rules}
88+99+ The checker enforces the following prohibited nesting relationships:
1010+1111+ {3 Self-nesting Restrictions}
1212+1313+ These elements cannot be nested inside themselves:
1414+ - [form] cannot contain [form]
1515+ - [progress] cannot contain [progress]
1616+ - [meter] cannot contain [meter]
1717+ - [dfn] cannot contain [dfn]
1818+ - [noscript] cannot contain [noscript]
1919+ - [label] cannot contain [label]
2020+2121+ {3 Structural Element Restrictions}
2222+2323+ - [header] cannot be inside [header], [footer], or [address]
2424+ - [footer] cannot be inside [header], [footer], or [address]
2525+ - [address] cannot contain [header], [footer], [article], [section],
2626+ [nav], or heading elements ([h1]-[h6])
2727+2828+ {3 Interactive Content Restrictions}
2929+3030+ Interactive elements cannot be descendants of [a] (with [href]) or
3131+ [button]:
3232+3333+ - [a] (when it has [href]) cannot be inside [a] or [button]
3434+ - [button] cannot be inside [a] or [button]
3535+ - [details] cannot be inside [a] or [button]
3636+ - [embed] cannot be inside [a] or [button]
3737+ - [iframe] cannot be inside [a] or [button]
3838+ - [label] cannot be inside [a] or [button]
3939+ - [select] cannot be inside [a] or [button]
4040+ - [textarea] cannot be inside [a] or [button]
4141+ - [audio] (with [controls]) cannot be inside [a] or [button]
4242+ - [video] (with [controls]) cannot be inside [a] or [button]
4343+ - [input] (except [type=hidden]) cannot be inside [a] or [button]
4444+ - [img] (with [usemap]) cannot be inside [a] or [button]
4545+ - [object] (with [usemap]) cannot be inside [a] or [button]
4646+4747+ {3 Table Cell Restrictions}
4848+4949+ - [dt] and [th] cannot contain [header], [footer], [article], [section],
5050+ [nav], heading elements ([h1]-[h6]), or [hgroup]
5151+5252+ {3 Other Restrictions}
5353+5454+ - [caption] cannot contain [table]
5555+ - [area] must have a [map] ancestor
5656+5757+ {2 Implementation Details}
5858+5959+ The checker uses a bitmask-based approach to efficiently track ancestor
6060+ elements. Each "special" ancestor element has a corresponding bit in the
6161+ ancestor mask. As elements are opened and closed during traversal, the
6262+ mask is updated to reflect the current ancestor context.
6363+6464+ When an element is encountered, the checker:
6565+ 1. Computes which ancestors would be prohibited for this element
6666+ 2. Checks if any of those prohibited ancestors are present in the
6767+ current ancestor mask
6868+ 3. Reports errors for any violations found
6969+ 4. Updates the ancestor mask to include the current element (if it's
7070+ a special ancestor)
7171+7272+ @see <https://html.spec.whatwg.org/multipage/dom.html#content-models>
7373+ HTML5 specification: Content models
7474+*)
7575+7676+include Checker.S
7777+7878+val checker : Checker.t
7979+(** [checker] is a checker instance for validating element nesting rules. *)
+339
lib/html5_checker/semantic/obsolete_checker.ml
···11+(** Obsolete elements map: element name -> suggestion message *)
22+let obsolete_elements =
33+ let tbl = Hashtbl.create 32 in
44+ Hashtbl.add tbl "applet" "Use \"embed\" or \"object\" element instead.";
55+ Hashtbl.add tbl "acronym" "Use the \"abbr\" element instead.";
66+ Hashtbl.add tbl "bgsound" "Use the \"audio\" element instead.";
77+ Hashtbl.add tbl "dir" "Use the \"ul\" element instead.";
88+ Hashtbl.add tbl "frame" "Use the \"iframe\" element and CSS instead, or use server-side includes.";
99+ Hashtbl.add tbl "frameset" "Use the \"iframe\" element and CSS instead, or use server-side includes.";
1010+ Hashtbl.add tbl "noframes" "Use the \"iframe\" element and CSS instead, or use server-side includes.";
1111+ Hashtbl.add tbl "isindex" "Use the \"form\" element containing \"input\" element of type \"text\" instead.";
1212+ Hashtbl.add tbl "keygen" "";
1313+ Hashtbl.add tbl "listing" "Use \"pre\" or \"code\" element instead.";
1414+ Hashtbl.add tbl "menuitem" "Use script to handle \"contextmenu\" event instead.";
1515+ Hashtbl.add tbl "nextid" "Use GUIDs instead.";
1616+ Hashtbl.add tbl "noembed" "Use the \"object\" element instead.";
1717+ Hashtbl.add tbl "param" "Use the \"data\" attribute of the \"object\" element to set the URL of the external resource.";
1818+ Hashtbl.add tbl "plaintext" "Use the \"text/plain\" MIME type instead.";
1919+ Hashtbl.add tbl "rb" "";
2020+ Hashtbl.add tbl "rtc" "";
2121+ Hashtbl.add tbl "strike" "Use \"del\" or \"s\" element instead.";
2222+ Hashtbl.add tbl "xmp" "Use \"pre\" or \"code\" element instead.";
2323+ Hashtbl.add tbl "basefont" "Use CSS instead.";
2424+ Hashtbl.add tbl "big" "Use CSS instead.";
2525+ Hashtbl.add tbl "blink" "Use CSS instead.";
2626+ Hashtbl.add tbl "center" "Use CSS instead.";
2727+ Hashtbl.add tbl "font" "Use CSS instead.";
2828+ Hashtbl.add tbl "marquee" "Use CSS instead.";
2929+ Hashtbl.add tbl "multicol" "Use CSS instead.";
3030+ Hashtbl.add tbl "nobr" "Use CSS instead.";
3131+ Hashtbl.add tbl "spacer" "Use CSS instead.";
3232+ Hashtbl.add tbl "tt" "Use CSS instead.";
3333+ tbl
3434+3535+(** Obsolete attributes map: attr_name -> (element_name -> suggestion message) *)
3636+let obsolete_attributes =
3737+ let tbl = Hashtbl.create 64 in
3838+3939+ (* Helper to register an attribute for multiple elements *)
4040+ let register attr_name elements suggestion =
4141+ let element_map =
4242+ match Hashtbl.find_opt tbl attr_name with
4343+ | Some m -> m
4444+ | None ->
4545+ let m = Hashtbl.create 16 in
4646+ Hashtbl.add tbl attr_name m;
4747+ m
4848+ in
4949+ List.iter (fun elem -> Hashtbl.add element_map elem suggestion) elements
5050+ in
5151+5252+ register "abbr" ["td"]
5353+ "Consider instead beginning the cell contents with concise text, followed by further elaboration if needed.";
5454+5555+ register "accept" ["form"]
5656+ "Use the \"accept\" attribute directly on the \"input\" elements instead.";
5757+5858+ register "archive" ["object"]
5959+ "Use the \"data\" and \"type\" attributes to invoke plugins.";
6060+6161+ register "a" ["object"]
6262+ "Use the \"data\" and \"type\" attributes to invoke plugins.";
6363+6464+ register "axis" ["td"; "th"]
6565+ "Use the \"scope\" attribute.";
6666+6767+ register "border" ["input"; "img"; "object"; "table"]
6868+ "Consider specifying \"img { border: 0; }\" in CSS instead.";
6969+7070+ register "charset" ["a"; "link"]
7171+ "Use an HTTP Content-Type header on the linked resource instead.";
7272+7373+ register "classid" ["object"]
7474+ "Use the \"data\" and \"type\" attributes to invoke plugins.";
7575+7676+ register "code" ["object"]
7777+ "Use the \"data\" and \"type\" attributes to invoke plugins.";
7878+7979+ register "codebase" ["object"]
8080+ "Use the \"data\" and \"type\" attributes to invoke plugins.";
8181+8282+ register "codetype" ["object"]
8383+ "Use the \"data\" and \"type\" attributes to invoke plugins.";
8484+8585+ register "coords" ["a"]
8686+ "Use \"area\" instead of \"a\" for image maps.";
8787+8888+ register "datafld" ["a"; "button"; "div"; "fieldset"; "iframe"; "img"; "input"; "label"; "legend"; "object"; "select"; "span"; "textarea"]
8989+ "Use script and a mechanism such as XMLHttpRequest to populate the page dynamically";
9090+9191+ register "dataformatas" ["button"; "div"; "input"; "label"; "legend"; "object"; "option"; "select"; "span"; "table"]
9292+ "Use script and a mechanism such as XMLHttpRequest to populate the page dynamically";
9393+9494+ register "datapagesize" ["table"]
9595+ "You can safely omit it.";
9696+9797+ register "datasrc" ["a"; "button"; "div"; "iframe"; "img"; "input"; "label"; "legend"; "object"; "option"; "select"; "span"; "table"; "textarea"]
9898+ "Use script and a mechanism such as XMLHttpRequest to populate the page dynamically";
9999+100100+ register "declare" ["object"]
101101+ "Repeat the \"object\" element completely each time the resource is to be reused.";
102102+103103+ register "event" ["script"]
104104+ "Use DOM Events mechanisms to register event listeners.";
105105+106106+ register "for" ["script"]
107107+ "Use DOM Events mechanisms to register event listeners.";
108108+109109+ register "hreflang" ["area"]
110110+ "You can safely omit it.";
111111+112112+ register "ismap" ["input"]
113113+ "You can safely omit it.";
114114+115115+ register "label" ["menu"]
116116+ "Use script to handle \"contextmenu\" event instead.";
117117+118118+ register "language" ["script"]
119119+ "Use the \"type\" attribute instead.";
120120+121121+ register "longdesc" ["iframe"; "img"]
122122+ "Use a regular \"a\" element to link to the description.";
123123+124124+ register "lowsrc" ["img"]
125125+ "Use a progressive JPEG image instead.";
126126+127127+ register "manifest" ["html"]
128128+ "Use service workers instead.";
129129+130130+ register "methods" ["a"; "link"]
131131+ "Use the HTTP OPTIONS feature instead.";
132132+133133+ register "name" ["a"; "embed"; "img"; "option"]
134134+ "Use the \"id\" attribute instead.";
135135+136136+ register "nohref" ["area"]
137137+ "Omitting the \"href\" attribute is sufficient.";
138138+139139+ register "profile" ["head"]
140140+ "To declare which \"meta\" terms are used in the document, instead register the names as meta extensions. To trigger specific UA behaviors, use a \"link\" element instead.";
141141+142142+ register "scheme" ["meta"]
143143+ "Use only one scheme per field, or make the scheme declaration part of the value.";
144144+145145+ register "scope" ["td"]
146146+ "Use the \"scope\" attribute on a \"th\" element instead.";
147147+148148+ register "shape" ["a"]
149149+ "Use \"area\" instead of \"a\" for image maps.";
150150+151151+ register "standby" ["object"]
152152+ "Optimise the linked resource so that it loads quickly or, at least, incrementally.";
153153+154154+ register "summary" ["table"]
155155+ "Consider describing the structure of the \"table\" in a \"caption\" element or in a \"figure\" element containing the \"table\"; or, simplify the structure of the \"table\" so that no description is needed.";
156156+157157+ register "target" ["link"]
158158+ "You can safely omit it.";
159159+160160+ register "type" ["param"; "area"; "menu"]
161161+ "You can safely omit it.";
162162+163163+ register "typemustmatch" ["object"]
164164+ "Avoid using \"object\" elements with untrusted resources.";
165165+166166+ register "urn" ["a"; "link"]
167167+ "Specify the preferred persistent identifier using the \"href\" attribute instead.";
168168+169169+ register "usemap" ["input"; "object"]
170170+ "Use the \"img\" element instead.";
171171+172172+ register "valuetype" ["param"]
173173+ "Use the \"name\" and \"value\" attributes without declaring value types.";
174174+175175+ register "version" ["html"]
176176+ "You can safely omit it.";
177177+178178+ tbl
179179+180180+(** Obsolete style attributes map: attr_name -> element_name list *)
181181+let obsolete_style_attrs =
182182+ let tbl = Hashtbl.create 64 in
183183+184184+ let register attr_name elements =
185185+ Hashtbl.add tbl attr_name elements
186186+ in
187187+188188+ register "align" ["caption"; "iframe"; "img"; "input"; "object"; "embed"; "legend"; "table"; "hr"; "div"; "h1"; "h2"; "h3"; "h4"; "h5"; "h6"; "p"; "col"; "colgroup"; "tbody"; "td"; "tfoot"; "th"; "thead"; "tr"];
189189+ register "alink" ["body"];
190190+ register "allowtransparency" ["iframe"];
191191+ register "background" ["body"; "table"; "thead"; "tbody"; "tfoot"; "tr"; "td"; "th"];
192192+ register "bgcolor" ["table"; "tr"; "td"; "th"; "body"];
193193+ register "bordercolor" ["table"];
194194+ register "cellpadding" ["table"];
195195+ register "cellspacing" ["table"];
196196+ register "char" ["col"; "colgroup"; "tbody"; "td"; "tfoot"; "th"; "thead"; "tr"];
197197+ register "charoff" ["col"; "colgroup"; "tbody"; "td"; "tfoot"; "th"; "thead"; "tr"];
198198+ register "clear" ["br"];
199199+ register "color" ["hr"];
200200+ register "compact" ["dl"; "menu"; "ol"; "ul"];
201201+ register "frameborder" ["iframe"];
202202+ register "framespacing" ["iframe"];
203203+ register "frame" ["table"];
204204+ register "height" ["table"; "thead"; "tbody"; "tfoot"; "tr"; "td"; "th"];
205205+ register "hspace" ["embed"; "iframe"; "input"; "img"; "object"];
206206+ register "link" ["body"];
207207+ register "bottommargin" ["body"];
208208+ register "marginheight" ["iframe"; "body"];
209209+ register "leftmargin" ["body"];
210210+ register "rightmargin" ["body"];
211211+ register "topmargin" ["body"];
212212+ register "marginwidth" ["iframe"; "body"];
213213+ register "noshade" ["hr"];
214214+ register "nowrap" ["td"; "th"];
215215+ register "rules" ["table"];
216216+ register "scrolling" ["iframe"];
217217+ register "size" ["hr"];
218218+ register "text" ["body"];
219219+ register "type" ["li"; "ul"];
220220+ register "valign" ["col"; "colgroup"; "tbody"; "td"; "tfoot"; "th"; "thead"; "tr"];
221221+ register "vlink" ["body"];
222222+ register "vspace" ["embed"; "iframe"; "input"; "img"; "object"];
223223+ register "width" ["hr"; "table"; "td"; "th"; "col"; "colgroup"; "pre"];
224224+225225+ tbl
226226+227227+(** Obsolete global attributes map: attr_name -> suggestion message *)
228228+let obsolete_global_attrs =
229229+ let tbl = Hashtbl.create 8 in
230230+ Hashtbl.add tbl "contextmenu" "Use script to handle \"contextmenu\" event instead.";
231231+ Hashtbl.add tbl "dropzone" "Use script to handle the \"dragenter\" and \"dragover\" events instead.";
232232+ Hashtbl.add tbl "onshow" "Use script to handle \"contextmenu\" event instead.";
233233+ tbl
234234+235235+(** Checker state *)
236236+type state = unit
237237+238238+let create () = ()
239239+240240+let reset _state = ()
241241+242242+let start_element _state ~name ~namespace ~attrs collector =
243243+ (* Only check HTML elements (no namespace or explicit HTML namespace) *)
244244+ let is_html = match namespace with
245245+ | None -> true
246246+ | Some ns -> String.equal (String.lowercase_ascii ns) "html"
247247+ in
248248+249249+ if not is_html then ()
250250+ else begin
251251+ let name_lower = String.lowercase_ascii name in
252252+253253+ (* Check for obsolete element *)
254254+ (match Hashtbl.find_opt obsolete_elements name_lower with
255255+ | None -> ()
256256+ | Some suggestion ->
257257+ let message =
258258+ if String.length suggestion = 0 then
259259+ Printf.sprintf "The \"%s\" element is obsolete." name
260260+ else
261261+ Printf.sprintf "The \"%s\" element is obsolete. %s" name suggestion
262262+ in
263263+ Message_collector.add_error collector
264264+ ~message
265265+ ~code:"obsolete-element"
266266+ ~element:name
267267+ ());
268268+269269+ (* Check for obsolete attributes *)
270270+ List.iter (fun (attr_name, _attr_value) ->
271271+ let attr_lower = String.lowercase_ascii attr_name in
272272+273273+ (* Check specific obsolete attributes for this element *)
274274+ (match Hashtbl.find_opt obsolete_attributes attr_lower with
275275+ | None -> ()
276276+ | Some element_map ->
277277+ (match Hashtbl.find_opt element_map name_lower with
278278+ | None -> ()
279279+ | Some suggestion ->
280280+ let message =
281281+ Printf.sprintf "The \"%s\" attribute on the \"%s\" element is obsolete. %s"
282282+ attr_name name suggestion
283283+ in
284284+ Message_collector.add_error collector
285285+ ~message
286286+ ~code:"obsolete-attribute"
287287+ ~element:name
288288+ ~attribute:attr_name
289289+ ()));
290290+291291+ (* Check obsolete style attributes *)
292292+ (match Hashtbl.find_opt obsolete_style_attrs attr_lower with
293293+ | None -> ()
294294+ | Some elements ->
295295+ if List.mem name_lower elements then
296296+ let message =
297297+ Printf.sprintf "The \"%s\" attribute on the \"%s\" element is obsolete. Use CSS instead."
298298+ attr_name name
299299+ in
300300+ Message_collector.add_error collector
301301+ ~message
302302+ ~code:"obsolete-style-attribute"
303303+ ~element:name
304304+ ~attribute:attr_name
305305+ ());
306306+307307+ (* Check obsolete global attributes *)
308308+ (match Hashtbl.find_opt obsolete_global_attrs attr_lower with
309309+ | None -> ()
310310+ | Some suggestion ->
311311+ let message =
312312+ Printf.sprintf "The \"%s\" attribute is obsolete. %s" attr_name suggestion
313313+ in
314314+ Message_collector.add_error collector
315315+ ~message
316316+ ~code:"obsolete-global-attribute"
317317+ ~element:name
318318+ ~attribute:attr_name
319319+ ())
320320+ ) attrs
321321+ end
322322+323323+let end_element _state ~name:_ ~namespace:_ _collector = ()
324324+325325+let characters _state _text _collector = ()
326326+327327+let end_document _state _collector = ()
328328+329329+let checker =
330330+ let module M = struct
331331+ type nonrec state = state
332332+ let create = create
333333+ let reset = reset
334334+ let start_element = start_element
335335+ let end_element = end_element
336336+ let characters = characters
337337+ let end_document = end_document
338338+ end in
339339+ (module M : Checker.S)
+67
lib/html5_checker/semantic/obsolete_checker.mli
···11+(** Obsolete element and attribute checker.
22+33+ Reports errors for obsolete HTML elements and attributes that should
44+ not be used in modern HTML5 documents.
55+66+ This checker validates that documents do not use deprecated elements
77+ or attributes from earlier HTML versions. It reports:
88+99+ - {b Obsolete elements}: Elements like [<applet>], [<font>], [<center>]
1010+ that have been removed from HTML5
1111+ - {b Obsolete attributes}: Attributes like [align], [bgcolor], [border]
1212+ that should be replaced with CSS
1313+ - {b Obsolete global attributes}: Global attributes like [contextmenu]
1414+ that are no longer supported
1515+1616+ {2 Obsolete Elements}
1717+1818+ Elements that are flagged as obsolete include:
1919+ - Presentational elements: [<basefont>], [<big>], [<center>], [<font>],
2020+ [<strike>], [<tt>]
2121+ - Frame elements: [<frame>], [<frameset>], [<noframes>]
2222+ - Deprecated interactive elements: [<applet>], [<bgsound>], [<keygen>]
2323+ - Deprecated text elements: [<acronym>], [<dir>], [<listing>], [<xmp>]
2424+ - And many others
2525+2626+ {2 Obsolete Attributes}
2727+2828+ The checker validates against hundreds of obsolete attributes, including:
2929+ - Presentational attributes: [align], [bgcolor], [border], [color],
3030+ [height], [width] (on certain elements)
3131+ - Data binding attributes: [datafld], [dataformatas], [datasrc]
3232+ - Navigation attributes: [longdesc], [methods], [urn]
3333+ - And many element-specific obsolete attributes
3434+3535+ {2 Example}
3636+3737+ {[
3838+ let checker = Obsolete_checker.checker in
3939+ let module C = (val checker : Checker.S) in
4040+ let state = C.create () in
4141+4242+ (* This will emit an error *)
4343+ C.start_element state ~name:"center" ~namespace:None ~attrs:[] collector;
4444+ (* Error: Element "center" is obsolete. Use CSS instead. *)
4545+4646+ (* This will also emit an error *)
4747+ C.start_element state ~name:"div"
4848+ ~namespace:None
4949+ ~attrs:[("align", "center")]
5050+ collector;
5151+ (* Error: Attribute "align" on element "div" is obsolete. Use CSS instead. *)
5252+ ]}
5353+*)
5454+5555+(** Include the standard checker signature. *)
5656+include Checker.S
5757+5858+(** {1 Checker Instance} *)
5959+6060+val checker : Checker.t
6161+(** [checker] is a pre-configured obsolete checker instance that can be
6262+ registered with the checker registry.
6363+6464+ {b Example:}
6565+ {[
6666+ Checker_registry.register registry "obsolete" Obsolete_checker.checker
6767+ ]} *)
···11+(** Required attribute checker implementation. *)
22+33+type state = {
44+ mutable _in_figure : bool;
55+ (** Track if we're inside a <figure> element (alt is more critical there) *)
66+}
77+88+let create () = { _in_figure = false }
99+1010+let reset state = state._in_figure <- false
1111+1212+(** Check if an attribute list contains a specific attribute. *)
1313+let has_attr name attrs =
1414+ List.exists (fun (attr_name, _) -> String.equal attr_name name) attrs
1515+1616+(** Get the value of an attribute if present. *)
1717+let get_attr name attrs =
1818+ List.find_map
1919+ (fun (attr_name, value) ->
2020+ if String.equal attr_name name then Some value else None)
2121+ attrs
2222+2323+let check_img_element attrs collector =
2424+ (* Check for required src attribute *)
2525+ if not (has_attr "src" attrs) then
2626+ Message_collector.add_error collector ~message:"img element requires src attribute"
2727+ ~code:"missing-required-attribute" ~element:"img" ~attribute:"src" ();
2828+2929+ (* Check for alt attribute - always required *)
3030+ if not (has_attr "alt" attrs) then
3131+ Message_collector.add_error collector
3232+ ~message:"img element requires alt attribute for accessibility"
3333+ ~code:"missing-required-attribute" ~element:"img" ~attribute:"alt" ()
3434+3535+let check_area_element attrs collector =
3636+ (* area with href requires alt *)
3737+ if has_attr "href" attrs && not (has_attr "alt" attrs) then
3838+ Message_collector.add_error collector
3939+ ~message:"area element with href requires alt attribute" ~code:"missing-required-attribute"
4040+ ~element:"area" ~attribute:"alt" ()
4141+4242+let check_input_element attrs collector =
4343+ match get_attr "type" attrs with
4444+ | Some "image" ->
4545+ (* input[type=image] requires alt *)
4646+ if not (has_attr "alt" attrs) then
4747+ Message_collector.add_error collector
4848+ ~message:"input element with type=\"image\" requires alt attribute"
4949+ ~code:"missing-required-attribute" ~element:"input" ~attribute:"alt" ()
5050+ | Some "hidden" ->
5151+ (* input[type=hidden] should not have required attribute *)
5252+ if has_attr "required" attrs then
5353+ Message_collector.add_error collector
5454+ ~message:"input element with type=\"hidden\" cannot have required attribute"
5555+ ~code:"invalid-attribute-combination" ~element:"input" ~attribute:"required" ()
5656+ | Some "file" ->
5757+ (* input[type=file] should not have value attribute *)
5858+ if has_attr "value" attrs then
5959+ Message_collector.add_warning collector
6060+ ~message:"input element with type=\"file\" should not have value attribute"
6161+ ~code:"invalid-attribute-combination" ~element:"input" ~attribute:"value" ()
6262+ | _ -> ()
6363+6464+let check_script_element attrs _collector =
6565+ (* script requires src OR text content *)
6666+ if not (has_attr "src" attrs) then
6767+ (* We can't check for text content here; that would need to be done
6868+ in end_element or with state tracking *)
6969+ ()
7070+7171+let check_meta_element attrs collector =
7272+ (* meta requires charset OR (name AND content) OR (http-equiv AND content) *)
7373+ let has_charset = has_attr "charset" attrs in
7474+ let has_name = has_attr "name" attrs in
7575+ let has_content = has_attr "content" attrs in
7676+ let has_http_equiv = has_attr "http-equiv" attrs in
7777+7878+ let valid =
7979+ has_charset
8080+ || (has_name && has_content)
8181+ || (has_http_equiv && has_content)
8282+ in
8383+8484+ if not valid then
8585+ Message_collector.add_error collector
8686+ ~message:
8787+ "meta element requires either charset, or name+content, or http-equiv+content"
8888+ ~code:"missing-required-attribute" ~element:"meta" ()
8989+9090+let check_link_element attrs collector =
9191+ (* link[rel="stylesheet"] requires href *)
9292+ match get_attr "rel" attrs with
9393+ | Some rel when String.equal rel "stylesheet" ->
9494+ if not (has_attr "href" attrs) then
9595+ Message_collector.add_error collector
9696+ ~message:"link element with rel=\"stylesheet\" requires href attribute"
9797+ ~code:"missing-required-attribute" ~element:"link" ~attribute:"href" ()
9898+ | _ -> ()
9999+100100+let check_a_element attrs collector =
101101+ (* a[download] requires href *)
102102+ if has_attr "download" attrs && not (has_attr "href" attrs) then
103103+ Message_collector.add_error collector
104104+ ~message:"a element with download attribute requires href attribute"
105105+ ~code:"missing-required-attribute" ~element:"a" ~attribute:"href" ()
106106+107107+let check_map_element attrs collector =
108108+ (* map requires name *)
109109+ if not (has_attr "name" attrs) then
110110+ Message_collector.add_error collector
111111+ ~message:"map element requires name attribute" ~code:"missing-required-attribute"
112112+ ~element:"map" ~attribute:"name" ()
113113+114114+let start_element state ~name ~namespace:_ ~attrs collector =
115115+ match name with
116116+ | "img" -> check_img_element attrs collector
117117+ | "area" -> check_area_element attrs collector
118118+ | "input" -> check_input_element attrs collector
119119+ | "script" -> check_script_element attrs collector
120120+ | "meta" -> check_meta_element attrs collector
121121+ | "link" -> check_link_element attrs collector
122122+ | "a" -> check_a_element attrs collector
123123+ | "map" -> check_map_element attrs collector
124124+ | "figure" -> state._in_figure <- true
125125+ | _ -> ()
126126+127127+let end_element state ~name ~namespace:_ _collector =
128128+ match name with "figure" -> state._in_figure <- false | _ -> ()
129129+130130+let characters _state _text _collector = ()
131131+132132+let end_document _state _collector = ()
133133+134134+let checker = (module struct
135135+ type nonrec state = state
136136+137137+ let create = create
138138+ let reset = reset
139139+ let start_element = start_element
140140+ let end_element = end_element
141141+ let characters = characters
142142+ let end_document = end_document
143143+end : Checker.S)
···11+(** Required attribute checker.
22+33+ Validates that elements have their required attributes according to the
44+ HTML5 specification. This checker ensures that:
55+66+ - Elements have all mandatory attributes
77+ - Conditional attributes are present when required by context
88+ - Attributes that must appear together are all present
99+1010+ {2 Validation Rules}
1111+1212+ The checker validates these common required attributes:
1313+1414+ {b Images and Media}
1515+ - [img] requires [src] attribute
1616+ - [img] requires [alt] attribute (error in most contexts, warning otherwise)
1717+ - [area] with [href] requires [alt] attribute
1818+ - [input\[type=image\]] requires [alt] attribute
1919+2020+ {b Forms}
2121+ - [input] defaults to [type="text"] if [type] is omitted
2222+ - [map] requires [name] attribute
2323+2424+ {b Scripts and Styles}
2525+ - [script] requires either [src] attribute OR text content
2626+ - [style] with [scoped] requires appropriate positioning
2727+2828+ {b Metadata}
2929+ - [meta] requires one of:
3030+ - [charset] attribute, OR
3131+ - [name] and [content] attributes, OR
3232+ - [http-equiv] and [content] attributes
3333+ - [link\[rel="stylesheet"\]] requires [href] attribute
3434+3535+ {b Links}
3636+ - [a] with [download] attribute requires [href] attribute
3737+3838+ @see <https://html.spec.whatwg.org/multipage/indices.html#attributes-3>
3939+ WHATWG HTML: Attributes *)
4040+4141+include Checker.S
4242+4343+val checker : Checker.t
4444+(** A first-class module instance of this checker.
4545+4646+ {b Usage:}
4747+ {[
4848+ let checker = Required_attr_checker.checker in
4949+ Checker_registry.register "required-attributes" checker
5050+ ]} *)
+404
lib/html5_checker/specialized/aria_checker.ml
···11+(** ARIA validation checker implementation. *)
22+33+(** Valid WAI-ARIA 1.2 roles.
44+55+ These are all the valid role values according to the WAI-ARIA 1.2
66+ specification. Abstract roles are included but should not be used
77+ in HTML content. *)
88+let valid_aria_roles =
99+ let roles = [
1010+ (* Document structure roles *)
1111+ "article"; "associationlist"; "associationlistitemkey";
1212+ "associationlistitemvalue"; "blockquote"; "caption"; "cell"; "code";
1313+ "definition"; "deletion"; "directory"; "document"; "emphasis"; "feed";
1414+ "figure"; "generic"; "group"; "heading"; "img"; "insertion"; "list";
1515+ "listitem"; "mark"; "math"; "meter"; "none"; "note"; "paragraph";
1616+ "presentation"; "row"; "rowgroup"; "strong"; "subscript"; "suggestion";
1717+ "superscript"; "table"; "term"; "time"; "toolbar"; "tooltip";
1818+1919+ (* Widget roles *)
2020+ "button"; "checkbox"; "combobox"; "dialog"; "grid"; "gridcell"; "link";
2121+ "listbox"; "menu"; "menubar"; "menuitem"; "menuitemcheckbox";
2222+ "menuitemradio"; "option"; "progressbar"; "radio"; "radiogroup";
2323+ "scrollbar"; "searchbox"; "separator"; "slider"; "spinbutton"; "switch";
2424+ "tab"; "tablist"; "tabpanel"; "textbox"; "tree"; "treegrid"; "treeitem";
2525+2626+ (* Landmark roles *)
2727+ "banner"; "complementary"; "contentinfo"; "form"; "main"; "navigation";
2828+ "region"; "search";
2929+3030+ (* Live region roles *)
3131+ "alert"; "log"; "marquee"; "status"; "timer";
3232+3333+ (* Window roles *)
3434+ "alertdialog";
3535+3636+ (* Abstract roles - not for use in HTML content *)
3737+ "command"; "comment"; "composite"; "input"; "landmark"; "range";
3838+ "roletype"; "section"; "sectionhead"; "select"; "structure"; "widget";
3939+ "window";
4040+4141+ (* Additional roles *)
4242+ "application"; "columnheader"; "rowheader";
4343+ ] in
4444+ let tbl = Hashtbl.create (List.length roles) in
4545+ List.iter (fun role -> Hashtbl.add tbl role ()) roles;
4646+ tbl
4747+4848+(** Roles that cannot have accessible names.
4949+5050+ These roles must not have aria-label or aria-labelledby attributes. *)
5151+let roles_which_cannot_be_named =
5252+ let roles = [
5353+ "caption"; "code"; "deletion"; "emphasis"; "generic"; "insertion";
5454+ "paragraph"; "presentation"; "strong"; "subscript"; "superscript"
5555+ ] in
5656+ let tbl = Hashtbl.create (List.length roles) in
5757+ List.iter (fun role -> Hashtbl.add tbl role ()) roles;
5858+ tbl
5959+6060+(** Map from descendant role to set of required ancestor roles. *)
6161+let required_role_ancestor_by_descendant : (string, string list) Hashtbl.t =
6262+ Hashtbl.create 32
6363+6464+(** Register that a descendant role requires one of the given ancestor roles. *)
6565+let register_required_ancestor_role parents child =
6666+ Hashtbl.add required_role_ancestor_by_descendant child parents
6767+6868+(** Initialize required ancestor role relationships. *)
6969+let () =
7070+ register_required_ancestor_role ["listbox"] "option";
7171+ register_required_ancestor_role ["menu"; "menubar"] "menuitem";
7272+ register_required_ancestor_role ["menu"; "menubar"] "menuitemcheckbox";
7373+ register_required_ancestor_role ["menu"; "menubar"] "menuitemradio";
7474+ register_required_ancestor_role ["tablist"] "tab";
7575+ register_required_ancestor_role ["tree"; "group"] "treeitem";
7676+ register_required_ancestor_role ["list"; "group"] "listitem";
7777+ register_required_ancestor_role ["row"] "cell";
7878+ register_required_ancestor_role ["row"] "gridcell";
7979+ register_required_ancestor_role ["row"] "columnheader";
8080+ register_required_ancestor_role ["row"] "rowheader";
8181+ register_required_ancestor_role ["grid"; "rowgroup"; "table"; "treegrid"] "row";
8282+ register_required_ancestor_role ["grid"; "table"; "treegrid"] "rowgroup"
8383+8484+(** Map from HTML element name to implicit ARIA role. *)
8585+let elements_with_implicit_role : (string, string) Hashtbl.t =
8686+ let tbl = Hashtbl.create 64 in
8787+ Hashtbl.add tbl "article" "article";
8888+ Hashtbl.add tbl "aside" "complementary";
8989+ Hashtbl.add tbl "body" "document";
9090+ Hashtbl.add tbl "button" "button";
9191+ Hashtbl.add tbl "datalist" "listbox";
9292+ Hashtbl.add tbl "dd" "definition";
9393+ Hashtbl.add tbl "details" "group";
9494+ Hashtbl.add tbl "dialog" "dialog";
9595+ Hashtbl.add tbl "dfn" "term";
9696+ Hashtbl.add tbl "dt" "term";
9797+ Hashtbl.add tbl "fieldset" "group";
9898+ Hashtbl.add tbl "figure" "figure";
9999+ Hashtbl.add tbl "form" "form";
100100+ Hashtbl.add tbl "footer" "contentinfo";
101101+ Hashtbl.add tbl "h1" "heading";
102102+ Hashtbl.add tbl "h2" "heading";
103103+ Hashtbl.add tbl "h3" "heading";
104104+ Hashtbl.add tbl "h4" "heading";
105105+ Hashtbl.add tbl "h5" "heading";
106106+ Hashtbl.add tbl "h6" "heading";
107107+ Hashtbl.add tbl "hr" "separator";
108108+ Hashtbl.add tbl "header" "banner";
109109+ Hashtbl.add tbl "img" "img";
110110+ Hashtbl.add tbl "li" "listitem";
111111+ Hashtbl.add tbl "link" "link";
112112+ Hashtbl.add tbl "main" "main";
113113+ Hashtbl.add tbl "nav" "navigation";
114114+ Hashtbl.add tbl "ol" "list";
115115+ Hashtbl.add tbl "output" "status";
116116+ Hashtbl.add tbl "progress" "progressbar";
117117+ Hashtbl.add tbl "section" "region";
118118+ Hashtbl.add tbl "summary" "button";
119119+ Hashtbl.add tbl "s" "deletion";
120120+ Hashtbl.add tbl "table" "table";
121121+ Hashtbl.add tbl "tbody" "rowgroup";
122122+ Hashtbl.add tbl "textarea" "textbox";
123123+ Hashtbl.add tbl "tfoot" "rowgroup";
124124+ Hashtbl.add tbl "thead" "rowgroup";
125125+ Hashtbl.add tbl "td" "cell";
126126+ Hashtbl.add tbl "tr" "row";
127127+ Hashtbl.add tbl "ul" "list";
128128+ tbl
129129+130130+(** Map from HTML element name to multiple possible implicit roles.
131131+132132+ Some elements like 'th' can have different implicit roles depending
133133+ on context (columnheader or rowheader). *)
134134+let elements_with_implicit_roles : (string, string array) Hashtbl.t =
135135+ let tbl = Hashtbl.create 4 in
136136+ Hashtbl.add tbl "th" [| "columnheader"; "rowheader" |];
137137+ tbl
138138+139139+(** Map from input type to implicit ARIA role. *)
140140+let input_types_with_implicit_role : (string, string) Hashtbl.t =
141141+ let tbl = Hashtbl.create 8 in
142142+ Hashtbl.add tbl "button" "button";
143143+ Hashtbl.add tbl "checkbox" "checkbox";
144144+ Hashtbl.add tbl "image" "button";
145145+ Hashtbl.add tbl "number" "spinbutton";
146146+ Hashtbl.add tbl "radio" "radio";
147147+ Hashtbl.add tbl "range" "slider";
148148+ Hashtbl.add tbl "reset" "button";
149149+ Hashtbl.add tbl "submit" "button";
150150+ tbl
151151+152152+(** Map from ARIA attribute to array of roles for which it is deprecated.
153153+154154+ These attributes should not be used on elements with these roles. *)
155155+let aria_deprecated_attributes_by_role : (string, string array) Hashtbl.t =
156156+ let tbl = Hashtbl.create 8 in
157157+158158+ (* aria-disabled deprecated for many roles *)
159159+ Hashtbl.add tbl "aria-disabled" [|
160160+ "alert"; "alertdialog"; "article"; "associationlist";
161161+ "associationlistitemkey"; "associationlistitemvalue"; "banner";
162162+ "blockquote"; "caption"; "cell"; "code"; "command"; "comment";
163163+ "complementary"; "contentinfo"; "definition"; "deletion";
164164+ "dialog"; "directory"; "document"; "emphasis"; "feed"; "figure";
165165+ "form"; "generic"; "heading"; "img"; "insertion"; "landmark";
166166+ "list"; "listitem"; "log"; "main"; "mark"; "marquee"; "math";
167167+ "meter"; "navigation"; "note"; "paragraph"; "presentation";
168168+ "progressbar"; "range"; "region"; "rowgroup"; "search";
169169+ "section"; "sectionhead"; "status"; "strong"; "structure";
170170+ "subscript"; "suggestion"; "superscript"; "table"; "tabpanel";
171171+ "term"; "time"; "timer"; "tooltip"; "widget"; "window"
172172+ |];
173173+174174+ (* aria-errormessage deprecated for many roles *)
175175+ Hashtbl.add tbl "aria-errormessage" [|
176176+ "alert"; "alertdialog"; "article"; "associationlist";
177177+ "associationlistitemkey"; "associationlistitemvalue"; "banner";
178178+ "blockquote"; "button"; "caption"; "cell"; "code"; "command";
179179+ "comment"; "complementary"; "composite"; "contentinfo";
180180+ "definition"; "deletion"; "dialog"; "directory"; "document";
181181+ "emphasis"; "feed"; "figure"; "form"; "generic"; "grid"; "group";
182182+ "heading"; "img"; "input"; "insertion"; "landmark"; "link"; "list";
183183+ "listitem"; "log"; "main"; "mark"; "marquee"; "math"; "menu";
184184+ "menubar"; "menuitem"; "menuitemcheckbox"; "menuitemradio";
185185+ "meter"; "navigation"; "note"; "option"; "paragraph";
186186+ "presentation"; "progressbar"; "radio"; "range"; "region"; "row";
187187+ "rowgroup"; "scrollbar"; "search"; "section"; "sectionhead";
188188+ "select"; "separator"; "status"; "strong"; "structure";
189189+ "subscript"; "suggestion"; "superscript"; "tab"; "table";
190190+ "tablist"; "tabpanel"; "term"; "time"; "timer"; "toolbar";
191191+ "tooltip"; "treeitem"; "widget"; "window"
192192+ |];
193193+194194+ (* aria-haspopup deprecated for many roles *)
195195+ Hashtbl.add tbl "aria-haspopup" [|
196196+ "alert"; "alertdialog"; "article"; "associationlist";
197197+ "associationlistitemkey"; "associationlistitemvalue"; "banner";
198198+ "blockquote"; "caption"; "cell"; "checkbox"; "code"; "command";
199199+ "comment"; "complementary"; "composite"; "contentinfo";
200200+ "definition"; "deletion"; "dialog"; "directory"; "document";
201201+ "emphasis"; "feed"; "figure"; "form"; "generic"; "grid"; "group";
202202+ "heading"; "img"; "input"; "insertion"; "landmark"; "list";
203203+ "listbox"; "listitem"; "log"; "main"; "mark"; "marquee"; "math";
204204+ "menu"; "menubar"; "meter"; "navigation"; "note"; "option";
205205+ "paragraph"; "presentation"; "progressbar"; "radio"; "radiogroup";
206206+ "range"; "region"; "row"; "rowgroup"; "scrollbar"; "search";
207207+ "section"; "sectionhead"; "select"; "separator"; "spinbutton";
208208+ "status"; "strong"; "structure"; "subscript"; "suggestion";
209209+ "superscript"; "switch"; "table"; "tablist"; "tabpanel"; "term";
210210+ "time"; "timer"; "toolbar"; "tooltip"; "tree"; "treegrid";
211211+ "widget"; "window"
212212+ |];
213213+214214+ (* aria-invalid deprecated for many roles *)
215215+ Hashtbl.add tbl "aria-invalid" [|
216216+ "alert"; "alertdialog"; "article"; "associationlist";
217217+ "associationlistitemkey"; "associationlistitemvalue"; "banner";
218218+ "blockquote"; "button"; "caption"; "cell"; "code"; "command";
219219+ "comment"; "complementary"; "composite"; "contentinfo";
220220+ "definition"; "deletion"; "dialog"; "directory"; "document";
221221+ "emphasis"; "feed"; "figure"; "form"; "generic"; "grid"; "group";
222222+ "heading"; "img"; "input"; "insertion"; "landmark"; "link"; "list";
223223+ "listitem"; "log"; "main"; "mark"; "marquee"; "math"; "menu";
224224+ "menubar"; "menuitem"; "menuitemcheckbox"; "menuitemradio";
225225+ "meter"; "navigation"; "note"; "option"; "paragraph";
226226+ "presentation"; "progressbar"; "radio"; "range"; "region"; "row";
227227+ "rowgroup"; "scrollbar"; "search"; "section"; "sectionhead";
228228+ "select"; "separator"; "status"; "strong"; "structure";
229229+ "subscript"; "suggestion"; "superscript"; "tab"; "table";
230230+ "tablist"; "tabpanel"; "term"; "time"; "timer"; "toolbar";
231231+ "tooltip"; "treeitem"; "widget"; "window"
232232+ |];
233233+234234+ (* aria-level deprecated for listitem *)
235235+ Hashtbl.add tbl "aria-level" [| "listitem" |];
236236+237237+ tbl
238238+239239+(** Split a role attribute value into individual roles.
240240+241241+ The role attribute can contain multiple space-separated role tokens. *)
242242+let split_roles role_value =
243243+ let trimmed = String.trim role_value in
244244+ if trimmed = "" then []
245245+ else
246246+ String.split_on_char ' ' trimmed
247247+ |> List.filter (fun s -> String.trim s <> "")
248248+ |> List.map String.lowercase_ascii
249249+250250+(** Get the implicit role for an HTML element. *)
251251+let get_implicit_role element_name attrs =
252252+ (* Check for input element with type attribute *)
253253+ if element_name = "input" then begin
254254+ match List.assoc_opt "type" attrs with
255255+ | Some input_type ->
256256+ let input_type = String.lowercase_ascii input_type in
257257+ Hashtbl.find_opt input_types_with_implicit_role input_type
258258+ | None -> Some "textbox" (* default input type is text *)
259259+ end
260260+ else
261261+ Hashtbl.find_opt elements_with_implicit_role element_name
262262+263263+(** Get all possible implicit roles for an element (for elements like 'th').
264264+265265+ Note: This function is not currently used but is provided for completeness. *)
266266+let _get_implicit_roles element_name =
267267+ match Hashtbl.find_opt elements_with_implicit_roles element_name with
268268+ | Some roles -> Array.to_list roles
269269+ | None ->
270270+ match get_implicit_role element_name [] with
271271+ | Some role -> [role]
272272+ | None -> []
273273+274274+(** Stack node representing an element in the ancestor chain. *)
275275+type stack_node = {
276276+ explicit_roles : string list;
277277+ implicit_role : string option;
278278+}
279279+280280+(** Checker state. *)
281281+type state = {
282282+ mutable stack : stack_node list;
283283+}
284284+285285+let create () = { stack = [] }
286286+287287+let reset state = state.stack <- []
288288+289289+(** Check if any ancestor has one of the required roles. *)
290290+let has_required_ancestor_role state required_roles =
291291+ List.exists (fun ancestor ->
292292+ (* Check explicit roles *)
293293+ List.exists (fun role ->
294294+ List.mem role required_roles
295295+ ) ancestor.explicit_roles
296296+ ||
297297+ (* Check implicit role *)
298298+ match ancestor.implicit_role with
299299+ | Some implicit_role -> List.mem implicit_role required_roles
300300+ | None -> false
301301+ ) state.stack
302302+303303+(** Render a list of roles as a human-readable string. *)
304304+let render_role_set roles =
305305+ match roles with
306306+ | [] -> ""
307307+ | [role] -> "\"" ^ role ^ "\""
308308+ | _ ->
309309+ let quoted = List.map (fun r -> "\"" ^ r ^ "\"") roles in
310310+ String.concat " or " quoted
311311+312312+let start_element state ~name ~namespace ~attrs collector =
313313+ (* Only process HTML elements *)
314314+ match namespace with
315315+ | Some _ -> () (* Skip non-HTML elements *)
316316+ | None ->
317317+ let role_attr = List.assoc_opt "role" attrs in
318318+ let aria_label = List.assoc_opt "aria-label" attrs in
319319+ let aria_labelledby = List.assoc_opt "aria-labelledby" attrs in
320320+ let has_accessible_name =
321321+ (match aria_label with Some v -> String.trim v <> "" | None -> false) ||
322322+ (match aria_labelledby with Some v -> String.trim v <> "" | None -> false)
323323+ in
324324+325325+ (* Parse explicit roles from role attribute *)
326326+ let explicit_roles = match role_attr with
327327+ | Some role_value -> split_roles role_value
328328+ | None -> []
329329+ in
330330+331331+ (* Get implicit role for this element *)
332332+ let implicit_role = get_implicit_role name attrs in
333333+334334+ (* Validate explicit roles *)
335335+ List.iter (fun role ->
336336+ (* Check if role is valid *)
337337+ if not (Hashtbl.mem valid_aria_roles role) then
338338+ Message_collector.add_error collector
339339+ ~message:(Printf.sprintf "Invalid ARIA role \"%s\"." role) ();
340340+341341+ (* Check if role cannot be named *)
342342+ if Hashtbl.mem roles_which_cannot_be_named role && has_accessible_name then
343343+ Message_collector.add_error collector
344344+ ~message:(Printf.sprintf
345345+ "Elements with role=\"%s\" must not have accessible names (via aria-label or aria-labelledby)."
346346+ role) ();
347347+348348+ (* Check for required ancestor roles *)
349349+ begin match Hashtbl.find_opt required_role_ancestor_by_descendant role with
350350+ | Some required_ancestors ->
351351+ if not (has_required_ancestor_role state required_ancestors) then
352352+ Message_collector.add_error collector
353353+ ~message:(Printf.sprintf
354354+ "An element with \"role=%s\" must be contained in, or owned by, an element with the \"role\" value %s."
355355+ role
356356+ (render_role_set required_ancestors)) ()
357357+ | None -> ()
358358+ end;
359359+360360+ (* Check for deprecated ARIA attributes for this role *)
361361+ List.iter (fun (attr_name, _attr_value) ->
362362+ if String.starts_with ~prefix:"aria-" attr_name then
363363+ match Hashtbl.find_opt aria_deprecated_attributes_by_role attr_name with
364364+ | Some deprecated_for_roles ->
365365+ (* Check if current role is in the deprecated list *)
366366+ if Array.mem role deprecated_for_roles then
367367+ Message_collector.add_warning collector
368368+ ~message:(Printf.sprintf
369369+ "The \"%s\" attribute should not be used on any element which has \"role=%s\"."
370370+ attr_name role) ()
371371+ | None -> ()
372372+ ) attrs
373373+ ) explicit_roles;
374374+375375+ (* Push current element onto stack *)
376376+ let node = {
377377+ explicit_roles;
378378+ implicit_role;
379379+ } in
380380+ state.stack <- node :: state.stack
381381+382382+let end_element state ~name:_ ~namespace _collector =
383383+ (* Only process HTML elements *)
384384+ match namespace with
385385+ | Some _ -> () (* Skip non-HTML elements *)
386386+ | None ->
387387+ (* Pop from stack *)
388388+ match state.stack with
389389+ | _ :: rest -> state.stack <- rest
390390+ | [] -> () (* Stack underflow - shouldn't happen in well-formed docs *)
391391+392392+let characters _state _text _collector = ()
393393+394394+let end_document _state _collector = ()
395395+396396+let checker = (module struct
397397+ type nonrec state = state
398398+ let create = create
399399+ let reset = reset
400400+ let start_element = start_element
401401+ let end_element = end_element
402402+ let characters = characters
403403+ let end_document = end_document
404404+end : Checker.S)
+102
lib/html5_checker/specialized/aria_checker.mli
···11+(** ARIA (Accessible Rich Internet Applications) validation checker.
22+33+ Validates ARIA roles, required ancestor roles, implicit roles, and
44+ deprecated ARIA attributes according to the WAI-ARIA specification.
55+66+ {2 Validation Rules}
77+88+ The checker enforces the following ARIA validation rules:
99+1010+ {3 Valid ARIA Roles}
1111+1212+ All valid WAI-ARIA 1.2 roles are recognized:
1313+1414+ - {b Document structure roles}: article, definition, directory, document,
1515+ feed, figure, group, heading, img, list, listitem, math, none, note,
1616+ presentation, region, separator, table, term, toolbar, tooltip
1717+ - {b Widget roles}: button, checkbox, combobox, dialog, grid, gridcell,
1818+ link, listbox, menu, menubar, menuitem, menuitemcheckbox,
1919+ menuitemradio, option, progressbar, radio, radiogroup, scrollbar,
2020+ slider, spinbutton, switch, tab, tablist, tabpanel, textbox, tree,
2121+ treegrid, treeitem
2222+ - {b Landmark roles}: banner, complementary, contentinfo, form, main,
2323+ navigation, search
2424+ - {b Live region roles}: alert, log, marquee, status, timer
2525+ - {b Window roles}: alertdialog
2626+ - {b Abstract roles} (not for use in content): command, composite, input,
2727+ landmark, range, roletype, section, sectionhead, select, structure,
2828+ widget, window
2929+3030+ {3 Required Ancestor Roles}
3131+3232+ Certain ARIA roles require specific ancestor roles:
3333+3434+ - [option] requires [listbox]
3535+ - [menuitem], [menuitemcheckbox], [menuitemradio] require [menu] or
3636+ [menubar]
3737+ - [tab] requires [tablist]
3838+ - [treeitem] requires [tree] or [group]
3939+ - [listitem] requires [list] or [group]
4040+ - [cell], [gridcell], [columnheader], [rowheader] require [row]
4141+ - [row] requires [grid], [rowgroup], [table], or [treegrid]
4242+ - [rowgroup] requires [grid], [table], or [treegrid]
4343+4444+ {3 Roles That Cannot Be Named}
4545+4646+ These roles must not have accessible names (via aria-label or
4747+ aria-labelledby):
4848+4949+ - caption, code, deletion, emphasis, generic, insertion, paragraph,
5050+ presentation, strong, subscript, superscript
5151+5252+ {3 Implicit ARIA Roles}
5353+5454+ HTML elements have implicit ARIA roles:
5555+5656+ - [<article>] has implicit role [article]
5757+ - [<aside>] has implicit role [complementary]
5858+ - [<button>] has implicit role [button]
5959+ - [<dialog>] has implicit role [dialog]
6060+ - [<footer>] has implicit role [contentinfo]
6161+ - [<header>] has implicit role [banner]
6262+ - [<main>] has implicit role [main]
6363+ - [<nav>] has implicit role [navigation]
6464+ - And many more...
6565+6666+ {3 Deprecated ARIA Attributes}
6767+6868+ Certain ARIA attributes are deprecated for specific roles:
6969+7070+ - [aria-disabled] is deprecated for: alert, article, banner, cell,
7171+ document, feed, figure, heading, img, list, listitem, main, navigation,
7272+ region, and many others
7373+ - [aria-errormessage] is deprecated for: alert, article, banner, button,
7474+ cell, document, link, and many others
7575+ - [aria-haspopup] is deprecated for: alert, article, checkbox, listbox,
7676+ and many others
7777+ - [aria-invalid] is deprecated for: alert, article, button, cell,
7878+ document, link, and many others
7979+ - [aria-level] is deprecated for: listitem
8080+8181+ {2 Implementation Details}
8282+8383+ The checker maintains a stack of ancestor elements with their roles
8484+ (explicit and implicit) to validate required ancestor relationships.
8585+ When an element with a role attribute is encountered, the checker:
8686+8787+ 1. Parses the role attribute value (space-separated tokens)
8888+ 2. Validates each role against the list of valid roles
8989+ 3. Checks if the role requires an ancestor role
9090+ 4. Verifies the required ancestor is present in the ancestor stack
9191+ 5. Checks for deprecated ARIA attributes on elements with specific roles
9292+ 6. Validates that roles which cannot be named do not have aria-label or
9393+ aria-labelledby attributes
9494+9595+ @see <https://www.w3.org/TR/wai-aria-1.2/>
9696+ WAI-ARIA 1.2 specification
9797+*)
9898+9999+include Checker.S
100100+101101+val checker : Checker.t
102102+(** [checker] is a checker instance for validating ARIA roles and attributes. *)
+157
lib/html5_checker/specialized/heading_checker.ml
···11+(** Heading structure validation checker.
22+33+ This checker validates that:
44+ - Heading levels don't skip (e.g., h1 to h3)
55+ - Documents have at least one heading
66+ - Multiple h1 usage is noted
77+ - Headings are not empty *)
88+99+(** Checker state tracking heading structure. *)
1010+type state = {
1111+ mutable current_level : int option;
1212+ mutable h1_count : int;
1313+ mutable has_any_heading : bool;
1414+ mutable first_heading_checked : bool;
1515+ mutable in_heading : string option;
1616+ mutable heading_has_text : bool;
1717+}
1818+1919+let create () =
2020+ {
2121+ current_level = None;
2222+ h1_count = 0;
2323+ has_any_heading = false;
2424+ first_heading_checked = false;
2525+ in_heading = None;
2626+ heading_has_text = false;
2727+ }
2828+2929+let reset state =
3030+ state.current_level <- None;
3131+ state.h1_count <- 0;
3232+ state.has_any_heading <- false;
3333+ state.first_heading_checked <- false;
3434+ state.in_heading <- None;
3535+ state.heading_has_text <- false
3636+3737+(** Extract heading level from tag name (e.g., "h1" -> 1). *)
3838+let heading_level name =
3939+ match String.lowercase_ascii name with
4040+ | "h1" -> Some 1
4141+ | "h2" -> Some 2
4242+ | "h3" -> Some 3
4343+ | "h4" -> Some 4
4444+ | "h5" -> Some 5
4545+ | "h6" -> Some 6
4646+ | _ -> None
4747+4848+(** Check if text is effectively empty (only whitespace). *)
4949+let is_empty_text text =
5050+ let rec check i =
5151+ if i >= String.length text then
5252+ true
5353+ else
5454+ match text.[i] with
5555+ | ' ' | '\t' | '\n' | '\r' -> check (i + 1)
5656+ | _ -> false
5757+ in
5858+ check 0
5959+6060+let start_element state ~name ~namespace:_ ~attrs:_ collector =
6161+ match heading_level name with
6262+ | Some level ->
6363+ state.has_any_heading <- true;
6464+6565+ (* Check if this is the first heading *)
6666+ if not state.first_heading_checked then begin
6767+ state.first_heading_checked <- true;
6868+ if level <> 1 then
6969+ Message_collector.add_warning collector
7070+ ~message:(Printf.sprintf
7171+ "First heading in document is <%s>, should typically be <h1>"
7272+ name)
7373+ ~code:"first-heading-not-h1"
7474+ ~element:name
7575+ ()
7676+ end;
7777+7878+ (* Track h1 count *)
7979+ if level = 1 then begin
8080+ state.h1_count <- state.h1_count + 1;
8181+ if state.h1_count > 1 then
8282+ Message_collector.add_warning collector
8383+ ~message:"Multiple <h1> elements detected. While valid in HTML5 sectioning content, traditional advice suggests one <h1> per page"
8484+ ~code:"multiple-h1"
8585+ ~element:name
8686+ ()
8787+ end;
8888+8989+ (* Check for skipped levels *)
9090+ begin match state.current_level with
9191+ | None ->
9292+ state.current_level <- Some level
9393+ | Some prev_level ->
9494+ let diff = level - prev_level in
9595+ if diff > 1 then
9696+ Message_collector.add_warning collector
9797+ ~message:(Printf.sprintf
9898+ "Heading level skipped: <%s> follows <h%d>, skipping %d level%s. This can confuse screen reader users"
9999+ name prev_level (diff - 1) (if diff > 2 then "s" else ""))
100100+ ~code:"heading-level-skipped"
101101+ ~element:name
102102+ ();
103103+ state.current_level <- Some level
104104+ end;
105105+106106+ (* Track that we're in a heading to check for empty content *)
107107+ state.in_heading <- Some name;
108108+ state.heading_has_text <- false
109109+110110+ | None ->
111111+ (* Not a heading element *)
112112+ ()
113113+114114+let end_element state ~name ~namespace:_ collector =
115115+ match state.in_heading with
116116+ | Some heading when heading = name ->
117117+ (* Exiting the heading we're tracking *)
118118+ if not state.heading_has_text then
119119+ Message_collector.add_error collector
120120+ ~message:(Printf.sprintf
121121+ "Heading <%s> is empty or contains only whitespace. Empty headings are problematic for screen readers"
122122+ name)
123123+ ~code:"empty-heading"
124124+ ~element:name
125125+ ();
126126+ state.in_heading <- None;
127127+ state.heading_has_text <- false
128128+ | _ ->
129129+ ()
130130+131131+let characters state text _collector =
132132+ (* If we're inside a heading, check if this text is non-whitespace *)
133133+ match state.in_heading with
134134+ | Some _ ->
135135+ if not (is_empty_text text) then
136136+ state.heading_has_text <- true
137137+ | None ->
138138+ ()
139139+140140+let end_document state collector =
141141+ (* Check if document has any headings *)
142142+ if not state.has_any_heading then
143143+ Message_collector.add_warning collector
144144+ ~message:"Document contains no heading elements (h1-h6). Headings provide important document structure for accessibility"
145145+ ~code:"no-headings"
146146+ ()
147147+148148+let checker = (module struct
149149+ type nonrec state = state
150150+151151+ let create = create
152152+ let reset = reset
153153+ let start_element = start_element
154154+ let end_element = end_element
155155+ let characters = characters
156156+ let end_document = end_document
157157+end : Checker.S)
+12
lib/html5_checker/specialized/heading_checker.mli
···11+(** Heading structure validation checker.
22+33+ Validates:
44+ - Proper heading level hierarchy (no skipped levels)
55+ - Document should have at least one heading
66+ - Multiple h1 usage patterns
77+ - Headings should not be empty *)
88+99+include Checker.S
1010+1111+val checker : Checker.t
1212+(** [checker] is a checker instance for validating heading structure. *)
+111
lib/html5_checker/specialized/language_checker.ml
···11+(** Language attribute validation checker.
22+33+ Validates language attributes. *)
44+55+(** Checker state tracking language attributes. *)
66+type state = {
77+ mutable html_element_seen : bool;
88+ mutable html_has_lang : bool;
99+}
1010+1111+let create () =
1212+ {
1313+ html_element_seen = false;
1414+ html_has_lang = false;
1515+ }
1616+1717+let reset state =
1818+ state.html_element_seen <- false;
1919+ state.html_has_lang <- false
2020+2121+(** Get attribute value from attribute list. *)
2222+let get_attr attrs name =
2323+ try Some (List.assoc name attrs)
2424+ with Not_found -> None
2525+2626+(** Validate language attribute. *)
2727+let validate_lang_attr value ~location ~element collector =
2828+ match Dt_language.Language_or_empty.validate value with
2929+ | Ok () -> ()
3030+ | Error msg ->
3131+ Message_collector.add_error collector
3232+ ~message:(Printf.sprintf "Invalid lang attribute: %s" msg)
3333+ ~code:"invalid-lang"
3434+ ?location
3535+ ~element
3636+ ~attribute:"lang"
3737+ ()
3838+3939+(** Check if lang and xml:lang match. *)
4040+let check_lang_xmllang_match ~lang ~xmllang ~location ~element collector =
4141+ if lang <> xmllang then
4242+ Message_collector.add_warning collector
4343+ ~message:(Printf.sprintf
4444+ "lang attribute '%s' does not match xml:lang attribute '%s'" lang xmllang)
4545+ ~code:"lang-xmllang-mismatch"
4646+ ?location
4747+ ~element
4848+ ()
4949+5050+(** Process language attributes. *)
5151+let process_language_attrs state ~element ~namespace ~attrs ~location collector =
5252+ let lang_opt = get_attr attrs "lang" in
5353+ let xmllang_opt = get_attr attrs "xml:lang" in
5454+5555+ (* Check if this is the html element *)
5656+ if element = "html" && namespace = None then begin
5757+ state.html_element_seen <- true;
5858+ state.html_has_lang <- lang_opt <> None
5959+ end;
6060+6161+ (* Validate lang attribute *)
6262+ begin match lang_opt with
6363+ | Some lang ->
6464+ validate_lang_attr lang ~location ~element collector
6565+ | None -> ()
6666+ end;
6767+6868+ (* Validate xml:lang attribute *)
6969+ begin match xmllang_opt with
7070+ | Some xmllang ->
7171+ validate_lang_attr xmllang ~location ~element collector
7272+ | None -> ()
7373+ end;
7474+7575+ (* Check that lang and xml:lang match if both present *)
7676+ begin match lang_opt, xmllang_opt with
7777+ | Some lang, Some xmllang ->
7878+ check_lang_xmllang_match ~lang ~xmllang ~location ~element collector
7979+ | _ -> ()
8080+ end
8181+8282+let start_element state ~name ~namespace ~attrs collector =
8383+ let location = None in
8484+ process_language_attrs state ~element:name ~namespace ~attrs ~location collector
8585+8686+let end_element _state ~name:_ ~namespace:_ _collector =
8787+ ()
8888+8989+let characters _state _text _collector =
9090+ ()
9191+9292+let end_document state collector =
9393+ (* Warn if html element lacks lang attribute *)
9494+ if state.html_element_seen && not state.html_has_lang then
9595+ Message_collector.add_warning collector
9696+ ~message:"The <html> element should have a lang attribute to specify \
9797+ the document's primary language"
9898+ ~code:"missing-lang-on-html"
9999+ ~element:"html"
100100+ ()
101101+102102+let checker = (module struct
103103+ type nonrec state = state
104104+105105+ let create = create
106106+ let reset = reset
107107+ let start_element = start_element
108108+ let end_element = end_element
109109+ let characters = characters
110110+ let end_document = end_document
111111+end : Checker.S)
···11+(** Language attribute validation checker.
22+33+ Validates:
44+ - lang attribute values are valid BCP 47 tags
55+ - xml:lang matches lang when both present
66+ - Document has a lang attribute on root element
77+88+ This checker ensures proper language markup:
99+ - lang attribute values are validated using BCP 47 format
1010+ - When both lang and xml:lang are present, they must match
1111+ - Warning if <html> element lacks lang attribute
1212+ - Empty lang="" is valid (indicates unknown language)
1313+ - Primary language subtag should be valid *)
1414+1515+include Checker.S
1616+1717+val checker : Checker.t
1818+(** [checker] is a checker instance for validating language attributes. *)
···11+(** Microdata validation checker.
22+33+ Validates HTML5 microdata (itemscope, itemtype, itemprop, itemid, itemref).
44+55+ This checker verifies that microdata attributes are used correctly:
66+ - itemprop can only appear on elements that are properties of an item
77+ (descendant of itemscope or referenced by itemref)
88+ - itemid requires both itemscope and itemtype
99+ - itemref requires itemscope
1010+ - itemtype requires itemscope
1111+ - itemref values must reference existing IDs
1212+ - Detects itemref cycles (A references B, B references A)
1313+ - itemprop values must be valid property names (no colons unless URL) *)
1414+1515+include Checker.S
1616+1717+val checker : Checker.t
1818+(** [checker] is a checker instance for validating microdata. *)
+833
lib/html5_checker/specialized/table_checker.ml
···11+(** Table structure validation checker implementation.
22+33+ This module implements comprehensive table structure validation including
44+ cell overlap detection, span validation, and structural integrity checks. *)
55+66+(** HTML namespace constant *)
77+let html_ns = "http://www.w3.org/1999/xhtml"
88+99+(** Maximum allowed colspan value per HTML5 spec *)
1010+let max_colspan = 1000
1111+1212+(** Maximum allowed rowspan value per HTML5 spec *)
1313+let max_rowspan = 65534
1414+1515+(** Special rowspan value meaning "span to end of row group" *)
1616+let rowspan_zero_magic = max_rowspan
1717+1818+(** {1 Cell Representation} *)
1919+2020+(** A table cell with positioning information *)
2121+type cell = {
2222+ mutable left : int;
2323+ (** Column in which this cell starts (zero-indexed) *)
2424+ mutable right : int;
2525+ (** First column into which this cell does not span *)
2626+ mutable bottom : int;
2727+ (** First row onto which this cell does not span (or rowspan_zero_magic) *)
2828+ headers : string list;
2929+ (** IDs referenced by the headers attribute *)
3030+ is_header : bool;
3131+ (** Whether this is a th element *)
3232+ element_name : string;
3333+ (** "td" or "th" *)
3434+}
3535+3636+(** Create a cell from colspan and rowspan values *)
3737+let make_cell ~colspan ~rowspan ~headers ~is_header collector =
3838+ let colspan =
3939+ if colspan > max_colspan then (
4040+ Message_collector.add_error collector
4141+ ~message:
4242+ (Printf.sprintf
4343+ {|The value of the "colspan" attribute must be less than or equal to %d.|}
4444+ max_colspan)
4545+ ();
4646+ max_colspan)
4747+ else colspan
4848+ in
4949+ let rowspan =
5050+ if rowspan > max_rowspan then (
5151+ Message_collector.add_error collector
5252+ ~message:
5353+ (Printf.sprintf
5454+ {|The value of the "rowspan" attribute must be less than or equal to %d.|}
5555+ max_rowspan)
5656+ ();
5757+ max_rowspan)
5858+ else rowspan
5959+ in
6060+ {
6161+ left = 0;
6262+ right = colspan;
6363+ bottom = (if rowspan = 0 then rowspan_zero_magic else rowspan);
6464+ headers;
6565+ is_header;
6666+ element_name = (if is_header then "th" else "td");
6767+ }
6868+6969+(** Set the absolute position of a cell *)
7070+let set_cell_position cell ~row ~col =
7171+ cell.left <- col;
7272+ cell.right <- cell.right + col;
7373+ if cell.bottom <> rowspan_zero_magic then cell.bottom <- cell.bottom + row
7474+7575+(** Check if a cell should be removed from the active set *)
7676+let should_cull_cell cell ~row = row >= cell.bottom
7777+7878+(** Check if two cells overlap horizontally *)
7979+let cells_overlap_horizontally cell1 cell2 =
8080+ not (cell2.right <= cell1.left || cell1.right <= cell2.left)
8181+8282+(** Emit error for horizontal cell overlap *)
8383+let err_on_horizontal_overlap cell1 cell2 collector =
8484+ if cells_overlap_horizontally cell1 cell2 then (
8585+ Message_collector.add_error collector
8686+ ~message:"Table cell is overlapped by later table cell." ();
8787+ Message_collector.add_error collector
8888+ ~message:"Table cell overlaps an earlier table cell." ())
8989+9090+(** Check if cell spans past end of row group *)
9191+let err_if_not_rowspan_zero cell ~row_group_type collector =
9292+ if cell.bottom <> rowspan_zero_magic then
9393+ let group_desc =
9494+ match row_group_type with
9595+ | None -> "implicit row group"
9696+ | Some t -> Printf.sprintf {|row group established by a "%s" element|} t
9797+ in
9898+ Message_collector.add_error collector
9999+ ~message:
100100+ (Printf.sprintf
101101+ "Table cell spans past the end of its %s; clipped to the end of \
102102+ the row group."
103103+ group_desc)
104104+ ()
105105+106106+(** {1 Column Range Tracking} *)
107107+108108+(** A contiguous range of columns without cells *)
109109+type column_range = {
110110+ element : string;
111111+ (** Element that established this range (col/colgroup/td/th) *)
112112+ mutable left : int;
113113+ (** Leftmost column in range *)
114114+ mutable right : int;
115115+ (** First column to right not in range *)
116116+ mutable next : column_range option;
117117+ (** Next range in linked list *)
118118+}
119119+120120+(** Create a column range *)
121121+let make_column_range ~element ~left ~right =
122122+ { element; left; right; next = None }
123123+124124+(** Check if column range contains a single column *)
125125+let is_single_col range = range.left + 1 = range.right
126126+127127+(** Test if a column hits a range (-1=left, 0=in, 1=right) *)
128128+let hits_column range column =
129129+ if column < range.left then -1
130130+ else if column >= range.right then 1
131131+ else 0
132132+133133+(** Remove a column from a range, returning the new range(s) *)
134134+let remove_column range column =
135135+ if is_single_col range then None
136136+ else if column = range.left then (
137137+ range.left <- range.left + 1;
138138+ Some range)
139139+ else if column + 1 = range.right then (
140140+ range.right <- range.right - 1;
141141+ Some range)
142142+ else
143143+ (* Split into two ranges *)
144144+ let created = make_column_range ~element:range.element ~left:(column + 1) ~right:range.right in
145145+ created.next <- range.next;
146146+ range.next <- Some created;
147147+ range.right <- column;
148148+ Some created
149149+150150+(** {1 Row Group State} *)
151151+152152+(** State for a row group (explicit or implicit) *)
153153+type row_group = {
154154+ mutable current_row : int;
155155+ (** Current row index within this group *)
156156+ mutable insertion_point : int;
157157+ (** Column position for next cell insertion *)
158158+ mutable next_old_cell : int;
159159+ (** Index into cells_on_current_row *)
160160+ mutable row_had_cells : bool;
161161+ (** Whether current row has any cells *)
162162+ cells_in_effect : ((int * int), cell) Hashtbl.t;
163163+ (** Cells from previous rows still spanning down, keyed by (bottom, left) *)
164164+ mutable cells_on_current_row : cell array;
165165+ (** Cells from previous rows affecting current row, sorted by left *)
166166+ row_group_type : string option;
167167+ (** Name of row group element (thead/tbody/tfoot) or None for implicit *)
168168+}
169169+170170+(** Create a new row group *)
171171+let make_row_group ~row_group_type =
172172+ {
173173+ current_row = -1;
174174+ insertion_point = 0;
175175+ next_old_cell = 0;
176176+ row_had_cells = false;
177177+ cells_in_effect = Hashtbl.create 16;
178178+ cells_on_current_row = [||];
179179+ row_group_type;
180180+ }
181181+182182+(** Start a new row in the row group *)
183183+let start_row_in_group group =
184184+ group.current_row <- group.current_row + 1;
185185+ group.insertion_point <- 0;
186186+ group.next_old_cell <- 0;
187187+ group.row_had_cells <- false;
188188+ (* Collect cells still in effect and sort by left column *)
189189+ let active_cells : cell list =
190190+ Hashtbl.fold
191191+ (fun _ (cell : cell) acc -> if not (should_cull_cell cell ~row:group.current_row) then cell :: acc else acc)
192192+ group.cells_in_effect []
193193+ in
194194+ let sorted = List.sort (fun (c1 : cell) (c2 : cell) -> Int.compare c1.left c2.left) active_cells in
195195+ group.cells_on_current_row <- Array.of_list sorted
196196+197197+(** Find the next available insertion point *)
198198+let rec find_insertion_point group =
199199+ if group.next_old_cell < Array.length group.cells_on_current_row then
200200+ let other = group.cells_on_current_row.(group.next_old_cell) in
201201+ if group.insertion_point < other.left then ()
202202+ else (
203203+ let right = other.right in
204204+ if right > group.insertion_point then group.insertion_point <- right;
205205+ group.next_old_cell <- group.next_old_cell + 1;
206206+ find_insertion_point group)
207207+208208+(** Add a cell to the row group *)
209209+let add_cell_to_group group cell collector =
210210+ group.row_had_cells <- true;
211211+ find_insertion_point group;
212212+ set_cell_position cell ~row:group.current_row ~col:group.insertion_point;
213213+214214+ (* Check for overlaps with cells from previous rows *)
215215+ for i = group.next_old_cell to Array.length group.cells_on_current_row - 1 do
216216+ err_on_horizontal_overlap group.cells_on_current_row.(i) cell collector
217217+ done;
218218+219219+ (* Add to cells in effect if it spans beyond current row *)
220220+ if cell.bottom > group.current_row + 1 then
221221+ Hashtbl.add group.cells_in_effect (cell.bottom, cell.left) cell;
222222+223223+ group.insertion_point <- cell.right
224224+225225+(** End the current row *)
226226+let end_row_in_group group collector =
227227+ (if not group.row_had_cells then
228228+ let group_desc =
229229+ match group.row_group_type with
230230+ | None -> "an implicit row group"
231231+ | Some t -> Printf.sprintf {|a row group established by a "%s" element|} t
232232+ in
233233+ Message_collector.add_error collector
234234+ ~message:
235235+ (Printf.sprintf {|Row %d of %s has no cells beginning on it.|}
236236+ (group.current_row + 1) group_desc)
237237+ ());
238238+239239+ find_insertion_point group;
240240+ group.cells_on_current_row <- [||];
241241+242242+ (* Cull cells that don't span to next row *)
243243+ let to_remove = ref [] in
244244+ Hashtbl.iter
245245+ (fun key cell ->
246246+ if should_cull_cell cell ~row:(group.current_row + 1) then to_remove := key :: !to_remove)
247247+ group.cells_in_effect;
248248+ List.iter (Hashtbl.remove group.cells_in_effect) !to_remove;
249249+250250+ (* Return the final insertion point (row width) *)
251251+ group.insertion_point
252252+253253+(** End the row group *)
254254+let end_row_group group collector =
255255+ Hashtbl.iter
256256+ (fun _ cell -> err_if_not_rowspan_zero cell ~row_group_type:group.row_group_type collector)
257257+ group.cells_in_effect
258258+259259+(** {1 Table State} *)
260260+261261+(** Parser state within a table *)
262262+type table_state =
263263+ | InTableAtStart
264264+ | InTableAtPotentialRowGroupStart
265265+ | InColgroup
266266+ | InColInColgroup
267267+ | InColInImplicitGroup
268268+ | InRowGroup
269269+ | InRowInRowGroup
270270+ | InCellInRowGroup
271271+ | InRowInImplicitRowGroup
272272+ | InImplicitRowGroup
273273+ | InCellInImplicitRowGroup
274274+ | InTableColsSeen
275275+276276+(** State for a single table *)
277277+type table = {
278278+ mutable state : table_state;
279279+ mutable suppressed_starts : int;
280280+ (** Count of nested suppressed elements *)
281281+ mutable hard_width : bool;
282282+ (** Whether column count was set by col/colgroup *)
283283+ mutable column_count : int;
284284+ (** Established column count (-1 if not set) *)
285285+ mutable real_column_count : int;
286286+ (** Actual maximum column count seen *)
287287+ mutable pending_colgroup_span : int;
288288+ (** Span for colgroup without col children *)
289289+ header_ids : (string, unit) Hashtbl.t;
290290+ (** IDs of th elements *)
291291+ cells_with_headers : cell list ref;
292292+ (** Cells with headers attribute *)
293293+ mutable current_row_group : row_group option;
294294+ (** Current row group *)
295295+ mutable first_col_range : column_range option;
296296+ (** Head of column range list *)
297297+ mutable last_col_range : column_range option;
298298+ (** Tail of column range list *)
299299+ mutable current_col_range : column_range option;
300300+ (** Current range being inspected *)
301301+ mutable previous_col_range : column_range option;
302302+ (** Previous range inspected *)
303303+}
304304+305305+(** Create a new table *)
306306+let make_table () =
307307+ {
308308+ state = InTableAtStart;
309309+ suppressed_starts = 0;
310310+ hard_width = false;
311311+ column_count = -1;
312312+ real_column_count = 0;
313313+ pending_colgroup_span = 0;
314314+ header_ids = Hashtbl.create 16;
315315+ cells_with_headers = ref [];
316316+ current_row_group = None;
317317+ first_col_range = None;
318318+ last_col_range = None;
319319+ current_col_range = None;
320320+ previous_col_range = None;
321321+ }
322322+323323+(** Append a column range to the list *)
324324+let append_column_range table range =
325325+ match table.last_col_range with
326326+ | None ->
327327+ table.first_col_range <- Some range;
328328+ table.last_col_range <- Some range
329329+ | Some last ->
330330+ last.next <- Some range;
331331+ table.last_col_range <- Some range
332332+333333+(** Report a cell back to table for column tracking *)
334334+let report_cell_to_table table (cell : cell) =
335335+ let left = cell.left in
336336+ let right = cell.right in
337337+338338+ (* Check if cell extends past known columns *)
339339+ if right > table.real_column_count then (
340340+ if left = table.real_column_count then (
341341+ (* Entirely past existing columns *)
342342+ if left + 1 <> right then
343343+ append_column_range table
344344+ (make_column_range ~element:cell.element_name ~left:(left + 1) ~right);
345345+ table.real_column_count <- right)
346346+ else (
347347+ (* Partially past existing columns *)
348348+ append_column_range table
349349+ (make_column_range ~element:cell.element_name ~left:table.real_column_count ~right);
350350+ table.real_column_count <- right));
351351+352352+ (* Track column usage *)
353353+ let rec process_ranges () =
354354+ match table.current_col_range with
355355+ | None -> ()
356356+ | Some range ->
357357+ let hit = hits_column range left in
358358+ if hit = 0 then (
359359+ (* Column hits this range - remove it *)
360360+ match remove_column range left with
361361+ | None ->
362362+ (* Range destroyed *)
363363+ if Option.is_some table.previous_col_range then
364364+ (Option.get table.previous_col_range).next <- range.next;
365365+ if table.first_col_range = Some range then table.first_col_range <- range.next;
366366+ if table.last_col_range = Some range then table.last_col_range <- table.previous_col_range;
367367+ table.current_col_range <- range.next
368368+ | Some new_range ->
369369+ if table.last_col_range = Some range then table.last_col_range <- Some new_range;
370370+ table.current_col_range <- Some new_range)
371371+ else if hit = -1 then
372372+ ()
373373+ else (
374374+ (* hit = 1, try next range *)
375375+ table.previous_col_range <- Some range;
376376+ table.current_col_range <- range.next;
377377+ process_ranges ())
378378+ in
379379+ process_ranges ()
380380+381381+(** {1 Attribute Parsing} *)
382382+383383+(** Parse a non-negative integer attribute, returning 1 if absent or invalid *)
384384+let parse_non_negative_int attrs name =
385385+ match List.assoc_opt name attrs with
386386+ | None -> 1
387387+ | Some v -> (
388388+ try
389389+ let n = int_of_string v in
390390+ if n >= 0 then n else 1
391391+ with Failure _ -> 1)
392392+393393+(** Parse a positive integer attribute, returning 1 if absent or invalid *)
394394+let parse_positive_int attrs name =
395395+ match List.assoc_opt name attrs with
396396+ | None -> 1
397397+ | Some v -> (
398398+ try
399399+ let n = int_of_string v in
400400+ if n > 0 then n else 1
401401+ with Failure _ -> 1)
402402+403403+(** Parse the headers attribute into a list of IDs *)
404404+let parse_headers attrs =
405405+ match List.assoc_opt "headers" attrs with
406406+ | None -> []
407407+ | Some v ->
408408+ let parts = String.split_on_char ' ' v in
409409+ List.filter (fun s -> String.length s > 0) parts
410410+411411+(** Parse span attribute, clamping to max_colspan *)
412412+let parse_span attrs collector =
413413+ let span = parse_non_negative_int attrs "span" in
414414+ if span > max_colspan then (
415415+ Message_collector.add_error collector
416416+ ~message:
417417+ (Printf.sprintf {|The value of the "span" attribute must be less than or equal to %d.|}
418418+ max_colspan)
419419+ ();
420420+ max_colspan)
421421+ else span
422422+423423+(** {1 Table Event Handlers} *)
424424+425425+(** Check if we should suppress the start event *)
426426+let need_suppress_start table =
427427+ if table.suppressed_starts > 0 then (
428428+ table.suppressed_starts <- table.suppressed_starts + 1;
429429+ true)
430430+ else false
431431+432432+(** Check if we should suppress the end event *)
433433+let need_suppress_end table =
434434+ if table.suppressed_starts > 0 then (
435435+ table.suppressed_starts <- table.suppressed_starts - 1;
436436+ true)
437437+ else false
438438+439439+(** Start a row group *)
440440+let start_row_group table local_name collector =
441441+ if need_suppress_start table then ()
442442+ else
443443+ match table.state with
444444+ | InImplicitRowGroup -> (
445445+ match table.current_row_group with
446446+ | Some group ->
447447+ end_row_group group collector;
448448+ table.current_row_group <- Some (make_row_group ~row_group_type:(Some local_name));
449449+ table.state <- InRowGroup
450450+ | None -> failwith "Bug: InImplicitRowGroup but no current row group")
451451+ | InTableAtStart | InTableColsSeen | InTableAtPotentialRowGroupStart ->
452452+ table.current_row_group <- Some (make_row_group ~row_group_type:(Some local_name));
453453+ table.state <- InRowGroup
454454+ | _ -> table.suppressed_starts <- 1
455455+456456+(** End a row group *)
457457+let end_row_group_handler table collector =
458458+ if need_suppress_end table then ()
459459+ else
460460+ match table.state with
461461+ | InRowGroup -> (
462462+ match table.current_row_group with
463463+ | Some group ->
464464+ end_row_group group collector;
465465+ table.current_row_group <- None;
466466+ table.state <- InTableAtPotentialRowGroupStart
467467+ | None -> failwith "Bug: InRowGroup but no current row group")
468468+ | _ -> failwith "Bug: end_row_group in wrong state"
469469+470470+(** Start a row *)
471471+let start_row table collector =
472472+ if need_suppress_start table then ()
473473+ else
474474+ match table.state with
475475+ | InTableAtStart | InTableColsSeen | InTableAtPotentialRowGroupStart ->
476476+ table.current_row_group <- Some (make_row_group ~row_group_type:None);
477477+ table.state <- InRowInImplicitRowGroup;
478478+ table.current_col_range <- table.first_col_range;
479479+ table.previous_col_range <- None;
480480+ (match table.current_row_group with
481481+ | Some group -> start_row_in_group group
482482+ | None -> failwith "Bug: just created row group")
483483+ | InImplicitRowGroup ->
484484+ table.state <- InRowInImplicitRowGroup;
485485+ table.current_col_range <- table.first_col_range;
486486+ table.previous_col_range <- None;
487487+ (match table.current_row_group with
488488+ | Some group -> start_row_in_group group
489489+ | None -> failwith "Bug: InImplicitRowGroup but no row group")
490490+ | InRowGroup ->
491491+ table.state <- InRowInRowGroup;
492492+ table.current_col_range <- table.first_col_range;
493493+ table.previous_col_range <- None;
494494+ (match table.current_row_group with
495495+ | Some group -> start_row_in_group group
496496+ | None -> failwith "Bug: InRowGroup but no row group")
497497+ | _ -> table.suppressed_starts <- 1
498498+499499+(** End a row *)
500500+let end_row table collector =
501501+ if need_suppress_end table then ()
502502+ else
503503+ match table.state with
504504+ | InRowInRowGroup ->
505505+ table.state <- InRowGroup;
506506+ (match table.current_row_group with
507507+ | Some group ->
508508+ let row_width = end_row_in_group group collector in
509509+ (* Check row width against column count *)
510510+ if table.hard_width then (
511511+ if row_width > table.column_count then
512512+ Message_collector.add_error collector
513513+ ~message:
514514+ (Printf.sprintf
515515+ {|A table row was %d columns wide and exceeded the column count established using column markup (%d).|}
516516+ row_width table.column_count)
517517+ ()
518518+ else if row_width < table.column_count then
519519+ Message_collector.add_error collector
520520+ ~message:
521521+ (Printf.sprintf
522522+ {|A table row was %d columns wide, which is less than the column count established using column markup (%d).|}
523523+ row_width table.column_count)
524524+ ())
525525+ else if table.column_count = -1 then
526526+ table.column_count <- row_width
527527+ else (
528528+ if row_width > table.column_count then
529529+ Message_collector.add_warning collector
530530+ ~message:
531531+ (Printf.sprintf
532532+ {|A table row was %d columns wide and exceeded the column count established by the first row (%d).|}
533533+ row_width table.column_count)
534534+ ()
535535+ else if row_width < table.column_count then
536536+ Message_collector.add_warning collector
537537+ ~message:
538538+ (Printf.sprintf
539539+ {|A table row was %d columns wide, which is less than the column count established by the first row (%d).|}
540540+ row_width table.column_count)
541541+ ())
542542+ | None -> failwith "Bug: InRowInRowGroup but no row group")
543543+ | InRowInImplicitRowGroup ->
544544+ table.state <- InImplicitRowGroup;
545545+ (match table.current_row_group with
546546+ | Some group ->
547547+ let row_width = end_row_in_group group collector in
548548+ (* Same column count checking as above *)
549549+ if table.hard_width then (
550550+ if row_width > table.column_count then
551551+ Message_collector.add_error collector
552552+ ~message:
553553+ (Printf.sprintf
554554+ {|A table row was %d columns wide and exceeded the column count established using column markup (%d).|}
555555+ row_width table.column_count)
556556+ ()
557557+ else if row_width < table.column_count then
558558+ Message_collector.add_error collector
559559+ ~message:
560560+ (Printf.sprintf
561561+ {|A table row was %d columns wide, which is less than the column count established using column markup (%d).|}
562562+ row_width table.column_count)
563563+ ())
564564+ else if table.column_count = -1 then
565565+ table.column_count <- row_width
566566+ else (
567567+ if row_width > table.column_count then
568568+ Message_collector.add_warning collector
569569+ ~message:
570570+ (Printf.sprintf
571571+ {|A table row was %d columns wide and exceeded the column count established by the first row (%d).|}
572572+ row_width table.column_count)
573573+ ()
574574+ else if row_width < table.column_count then
575575+ Message_collector.add_warning collector
576576+ ~message:
577577+ (Printf.sprintf
578578+ {|A table row was %d columns wide, which is less than the column count established by the first row (%d).|}
579579+ row_width table.column_count)
580580+ ())
581581+ | None -> failwith "Bug: InRowInImplicitRowGroup but no row group")
582582+ | _ -> failwith "Bug: end_row in wrong state"
583583+584584+(** Start a cell *)
585585+let start_cell table is_header attrs collector =
586586+ if need_suppress_start table then ()
587587+ else
588588+ match table.state with
589589+ | InRowInRowGroup ->
590590+ table.state <- InCellInRowGroup;
591591+ (* Record header ID if present *)
592592+ if is_header then (
593593+ match List.assoc_opt "id" attrs with
594594+ | Some id when String.length id > 0 -> Hashtbl.replace table.header_ids id ()
595595+ | _ -> ());
596596+ (* Parse cell attributes *)
597597+ let colspan = abs (parse_positive_int attrs "colspan") in
598598+ let rowspan = abs (parse_non_negative_int attrs "rowspan") in
599599+ let headers = parse_headers attrs in
600600+ let cell = make_cell ~colspan ~rowspan ~headers ~is_header collector in
601601+ if List.length headers > 0 then table.cells_with_headers := cell :: !(table.cells_with_headers);
602602+ (match table.current_row_group with
603603+ | Some group ->
604604+ add_cell_to_group group cell collector;
605605+ report_cell_to_table table cell
606606+ | None -> failwith "Bug: InRowInRowGroup but no row group")
607607+ | InRowInImplicitRowGroup ->
608608+ table.state <- InCellInImplicitRowGroup;
609609+ (* Same logic as above *)
610610+ if is_header then (
611611+ match List.assoc_opt "id" attrs with
612612+ | Some id when String.length id > 0 -> Hashtbl.replace table.header_ids id ()
613613+ | _ -> ());
614614+ let colspan = abs (parse_positive_int attrs "colspan") in
615615+ let rowspan = abs (parse_non_negative_int attrs "rowspan") in
616616+ let headers = parse_headers attrs in
617617+ let cell = make_cell ~colspan ~rowspan ~headers ~is_header collector in
618618+ if List.length headers > 0 then table.cells_with_headers := cell :: !(table.cells_with_headers);
619619+ (match table.current_row_group with
620620+ | Some group ->
621621+ add_cell_to_group group cell collector;
622622+ report_cell_to_table table cell
623623+ | None -> failwith "Bug: InRowInImplicitRowGroup but no row group")
624624+ | _ -> table.suppressed_starts <- 1
625625+626626+(** End a cell *)
627627+let end_cell table =
628628+ if need_suppress_end table then ()
629629+ else
630630+ match table.state with
631631+ | InCellInRowGroup -> table.state <- InRowInRowGroup
632632+ | InCellInImplicitRowGroup -> table.state <- InRowInImplicitRowGroup
633633+ | _ -> failwith "Bug: end_cell in wrong state"
634634+635635+(** Start a colgroup *)
636636+let start_colgroup table attrs collector =
637637+ if need_suppress_start table then ()
638638+ else
639639+ match table.state with
640640+ | InTableAtStart ->
641641+ table.hard_width <- true;
642642+ table.column_count <- 0;
643643+ table.pending_colgroup_span <- parse_span attrs collector;
644644+ table.state <- InColgroup
645645+ | InTableColsSeen ->
646646+ table.pending_colgroup_span <- parse_span attrs collector;
647647+ table.state <- InColgroup
648648+ | _ -> table.suppressed_starts <- 1
649649+650650+(** End a colgroup *)
651651+let end_colgroup table =
652652+ if need_suppress_end table then ()
653653+ else
654654+ match table.state with
655655+ | InColgroup ->
656656+ if table.pending_colgroup_span <> 0 then (
657657+ let right = table.column_count + abs table.pending_colgroup_span in
658658+ let range = make_column_range ~element:"colgroup" ~left:table.column_count ~right in
659659+ append_column_range table range;
660660+ table.column_count <- right);
661661+ table.real_column_count <- table.column_count;
662662+ table.state <- InTableColsSeen
663663+ | _ -> failwith "Bug: end_colgroup in wrong state"
664664+665665+(** Start a col *)
666666+let start_col table attrs collector =
667667+ if need_suppress_start table then ()
668668+ else
669669+ match table.state with
670670+ | InTableAtStart ->
671671+ table.hard_width <- true;
672672+ table.column_count <- 0;
673673+ table.state <- InColInImplicitGroup;
674674+ let span = abs (parse_span attrs collector) in
675675+ let right = table.column_count + span in
676676+ let range = make_column_range ~element:"col" ~left:table.column_count ~right in
677677+ append_column_range table range;
678678+ table.column_count <- right;
679679+ table.real_column_count <- table.column_count
680680+ | InTableColsSeen ->
681681+ table.state <- InColInImplicitGroup;
682682+ let span = abs (parse_span attrs collector) in
683683+ let right = table.column_count + span in
684684+ let range = make_column_range ~element:"col" ~left:table.column_count ~right in
685685+ append_column_range table range;
686686+ table.column_count <- right;
687687+ table.real_column_count <- table.column_count
688688+ | InColgroup ->
689689+ if table.pending_colgroup_span > 0 then
690690+ Message_collector.add_warning collector
691691+ ~message:
692692+ (Printf.sprintf
693693+ "A col element causes a span attribute with value %d to be ignored on the \
694694+ parent colgroup."
695695+ table.pending_colgroup_span)
696696+ ();
697697+ table.pending_colgroup_span <- 0;
698698+ table.state <- InColInColgroup;
699699+ let span = abs (parse_span attrs collector) in
700700+ let right = table.column_count + span in
701701+ let range = make_column_range ~element:"col" ~left:table.column_count ~right in
702702+ append_column_range table range;
703703+ table.column_count <- right;
704704+ table.real_column_count <- table.column_count
705705+ | _ -> table.suppressed_starts <- 1
706706+707707+(** End a col *)
708708+let end_col table =
709709+ if need_suppress_end table then ()
710710+ else
711711+ match table.state with
712712+ | InColInImplicitGroup -> table.state <- InTableColsSeen
713713+ | InColInColgroup -> table.state <- InColgroup
714714+ | _ -> failwith "Bug: end_col in wrong state"
715715+716716+(** End a table *)
717717+let end_table table collector =
718718+ (match table.state with
719719+ | InImplicitRowGroup -> (
720720+ match table.current_row_group with
721721+ | Some group ->
722722+ end_row_group group collector;
723723+ table.current_row_group <- None
724724+ | None -> failwith "Bug: InImplicitRowGroup but no row group")
725725+ | InTableAtStart | InTableAtPotentialRowGroupStart | InTableColsSeen -> ()
726726+ | _ -> failwith "Bug: end_table in wrong state");
727727+728728+ (* Check header reference integrity *)
729729+ List.iter
730730+ (fun cell ->
731731+ List.iter
732732+ (fun heading ->
733733+ if not (Hashtbl.mem table.header_ids heading) then
734734+ Message_collector.add_error collector
735735+ ~message:
736736+ (Printf.sprintf
737737+ {|The "headers" attribute on the element "%s" refers to the ID "%s", but there is no "th" element with that ID in the same table.|}
738738+ cell.element_name heading)
739739+ ())
740740+ cell.headers)
741741+ !(table.cells_with_headers);
742742+743743+ (* Check that each column established by col/colgroup has cells *)
744744+ let rec check_ranges range =
745745+ match range with
746746+ | None -> ()
747747+ | Some r ->
748748+ if is_single_col r then
749749+ Message_collector.add_error collector
750750+ ~message:
751751+ (Printf.sprintf {|Table column %d established by element "%s" has no cells beginning in it.|}
752752+ r.right r.element)
753753+ ()
754754+ else
755755+ Message_collector.add_error collector
756756+ ~message:
757757+ (Printf.sprintf
758758+ {|Table columns in range %d…%d established by element "%s" have no cells beginning in them.|}
759759+ (r.left + 1) r.right r.element)
760760+ ();
761761+ check_ranges r.next
762762+ in
763763+ check_ranges table.first_col_range
764764+765765+(** {1 Checker State} *)
766766+767767+type state = { tables : table list ref (* Stack of nested tables *) }
768768+769769+let create () = { tables = ref [] }
770770+771771+let reset state = state.tables := []
772772+773773+let start_element state ~name ~namespace ~attrs collector =
774774+ match namespace with
775775+ | Some ns when ns = html_ns -> (
776776+ match name with
777777+ | "table" ->
778778+ (* Push a new table onto the stack *)
779779+ state.tables := make_table () :: !(state.tables)
780780+ | _ -> (
781781+ match !(state.tables) with
782782+ | [] -> ()
783783+ | table :: _ -> (
784784+ match name with
785785+ | "td" -> start_cell table false attrs collector
786786+ | "th" -> start_cell table true attrs collector
787787+ | "tr" -> start_row table collector
788788+ | "tbody" | "thead" | "tfoot" -> start_row_group table name collector
789789+ | "col" -> start_col table attrs collector
790790+ | "colgroup" -> start_colgroup table attrs collector
791791+ | _ -> ())))
792792+ | _ -> ()
793793+794794+let end_element state ~name ~namespace collector =
795795+ match namespace with
796796+ | Some ns when ns = html_ns -> (
797797+ match name with
798798+ | "table" -> (
799799+ match !(state.tables) with
800800+ | [] -> failwith "Bug: end table but no table on stack"
801801+ | table :: rest ->
802802+ end_table table collector;
803803+ state.tables := rest)
804804+ | _ -> (
805805+ match !(state.tables) with
806806+ | [] -> ()
807807+ | table :: _ -> (
808808+ match name with
809809+ | "td" | "th" -> end_cell table
810810+ | "tr" -> end_row table collector
811811+ | "tbody" | "thead" | "tfoot" -> end_row_group_handler table collector
812812+ | "col" -> end_col table
813813+ | "colgroup" -> end_colgroup table
814814+ | _ -> ())))
815815+ | _ -> ()
816816+817817+let characters _state _text _collector = ()
818818+819819+let end_document state collector =
820820+ if !(state.tables) <> [] then
821821+ Message_collector.add_error collector ~message:"Unclosed table element at end of document." ()
822822+823823+let checker =
824824+ (module struct
825825+ type nonrec state = state
826826+827827+ let create = create
828828+ let reset = reset
829829+ let start_element = start_element
830830+ let end_element = end_element
831831+ let characters = characters
832832+ let end_document = end_document
833833+ end : Checker.S)
+83
lib/html5_checker/specialized/table_checker.mli
···11+(** Table structure validation checker.
22+33+ Validates HTML table integrity including:
44+ - Cell overlap detection (rowspan/colspan causing overlap)
55+ - Spanning past table boundaries
66+ - Proper table structure (thead/tbody/tfoot ordering)
77+ - Maximum colspan limit (1000 per HTML spec)
88+ - Maximum rowspan limit (65534 per HTML spec)
99+1010+ {2 Validation Rules}
1111+1212+ {b Cell Positioning}
1313+ - Detects when two cells claim the same grid position
1414+ - Validates that rowspan/colspan don't cause cells to extend past boundaries
1515+ - Tracks cell positions accounting for rowspan/colspan from previous rows
1616+1717+ {b Span Limits}
1818+ - [colspan] must be positive and <= 1000
1919+ - [rowspan] must be non-negative and <= 65534
2020+ - [rowspan=0] is a special value meaning "span to end of row group"
2121+2222+ {b Table Structure}
2323+ - [caption] must be first child of [table] if present
2424+ - [thead] must come before [tbody] and [tfoot]
2525+ - Only one [thead] and one [tfoot] allowed per table
2626+ - [col] elements can establish explicit column count
2727+ - [colgroup] elements can group columns
2828+2929+ {b Row Validation}
3030+ - Each row must have at least one cell
3131+ - Row widths should match the established column count
3232+ - Cells cannot overlap horizontally in the same row
3333+3434+ {b Header References}
3535+ - [headers] attribute on [td]/[th] must reference valid [th] IDs
3636+ - All referenced header IDs must exist in the same table
3737+3838+ {3 Example Valid Table}
3939+4040+ {v
4141+ <table>
4242+ <thead>
4343+ <tr>
4444+ <th id="h1">Header 1</th>
4545+ <th id="h2">Header 2</th>
4646+ </tr>
4747+ </thead>
4848+ <tbody>
4949+ <tr>
5050+ <td headers="h1">Cell 1</td>
5151+ <td headers="h2">Cell 2</td>
5252+ </tr>
5353+ </tbody>
5454+ </table>
5555+ v}
5656+5757+ {3 Example Invalid Table (Overlapping Cells)}
5858+5959+ {v
6060+ <table>
6161+ <tr>
6262+ <td rowspan="2">A</td>
6363+ <td>B</td>
6464+ </tr>
6565+ <tr>
6666+ <td>C</td> <!-- Would overlap with A's rowspan -->
6767+ </tr>
6868+ </table>
6969+ v}
7070+7171+ @see <https://html.spec.whatwg.org/multipage/tables.html> WHATWG HTML: Tables
7272+ @see <https://www.w3.org/TR/html52/tabular-data.html> W3C HTML 5.2: Tables *)
7373+7474+include Checker.S
7575+7676+val checker : Checker.t
7777+(** A first-class module instance of this checker.
7878+7979+ {b Usage:}
8080+ {[
8181+ let checker = Table_checker.checker in
8282+ Checker_registry.register "table-structure" checker
8383+ ]} *)
+2-1
lib/html5rw/parser/parser_impl.ml
···2525module TreeBuilderSink = struct
2626 type t = Parser_tree_builder.t
27272828- let process tb token =
2828+ let process tb token ~line ~column =
2929+ Parser_tree_builder.set_position tb ~line ~column;
2930 Parser_tree_builder.process_token tb token;
3031 (* Check if we need to switch tokenizer state based on current element *)
3132 (* Only switch for HTML namespace elements - SVG/MathML use different rules *)
···194194*)
195195module type SINK = sig
196196 type t
197197- val process : t -> Tokenizer_token.t -> [ `Continue | `SwitchTo of Tokenizer_state.t ]
197197+ val process : t -> Tokenizer_token.t -> line:int -> column:int -> [ `Continue | `SwitchTo of Tokenizer_state.t ]
198198 val adjusted_current_node_in_html_namespace : t -> bool
199199end
200200
+41-38
lib/html5rw/tokenizer/tokenizer_impl.ml
···1111(* Token sink interface *)
1212module type SINK = sig
1313 type t
1414- val process : t -> Tokenizer_token.t -> [ `Continue | `SwitchTo of Tokenizer_state.t ]
1414+ val process : t -> Tokenizer_token.t -> line:int -> column:int -> [ `Continue | `SwitchTo of Tokenizer_state.t ]
1515 val adjusted_current_node_in_html_namespace : t -> bool
1616end
1717···184184 let data = Buffer.contents t.pending_chars in
185185 Buffer.clear t.pending_chars;
186186 let data = if t.xml_mode then transform_xml_chars data else data in
187187- ignore (S.process t.sink (Tokenizer_token.Character data))
187187+ let line, column = Tokenizer_stream.position t.stream in
188188+ ignore (S.process t.sink (Tokenizer_token.Character data) ~line ~column)
188189 end
189190 in
190191191192 let emit token =
192193 emit_pending_chars ();
193193- match S.process t.sink token with
194194+ let line, column = Tokenizer_stream.position t.stream in
195195+ match S.process t.sink token ~line ~column with
194196 | `Continue -> ()
195197 | `SwitchTo new_state -> t.state <- new_state
196198 in
···278280 handle_eof ()
279281 end else if Tokenizer_stream.is_eof t.stream then begin
280282 emit_pending_chars ();
281281- ignore (S.process t.sink Tokenizer_token.EOF)
283283+ let line, column = Tokenizer_stream.position t.stream in ignore (S.process t.sink Tokenizer_token.EOF ~line ~column)
282284 end else begin
283285 step ();
284286 process_state ()
···288290 match t.state with
289291 | Tokenizer_state.Data ->
290292 emit_pending_chars ();
291291- ignore (S.process t.sink Tokenizer_token.EOF)
293293+ let line, column = Tokenizer_stream.position t.stream in ignore (S.process t.sink Tokenizer_token.EOF ~line ~column)
292294 | Tokenizer_state.Tag_open ->
293295 error t "eof-before-tag-name";
294296 emit_char t '<';
295297 emit_pending_chars ();
296296- ignore (S.process t.sink Tokenizer_token.EOF)
298298+ let line, column = Tokenizer_stream.position t.stream in ignore (S.process t.sink Tokenizer_token.EOF ~line ~column)
297299 | Tokenizer_state.End_tag_open ->
298300 error t "eof-before-tag-name";
299301 emit_str t "</";
300302 emit_pending_chars ();
301301- ignore (S.process t.sink Tokenizer_token.EOF)
303303+ let line, column = Tokenizer_stream.position t.stream in ignore (S.process t.sink Tokenizer_token.EOF ~line ~column)
302304 | Tokenizer_state.Tag_name
303305 | Tokenizer_state.Before_attribute_name
304306 | Tokenizer_state.Attribute_name
···311313 | Tokenizer_state.Self_closing_start_tag ->
312314 error t "eof-in-tag";
313315 emit_pending_chars ();
314314- ignore (S.process t.sink Tokenizer_token.EOF)
316316+ let line, column = Tokenizer_stream.position t.stream in ignore (S.process t.sink Tokenizer_token.EOF ~line ~column)
315317 | Tokenizer_state.Rawtext ->
316318 emit_pending_chars ();
317317- ignore (S.process t.sink Tokenizer_token.EOF)
319319+ let line, column = Tokenizer_stream.position t.stream in ignore (S.process t.sink Tokenizer_token.EOF ~line ~column)
318320 | Tokenizer_state.Rawtext_less_than_sign ->
319321 emit_char t '<';
320322 emit_pending_chars ();
321321- ignore (S.process t.sink Tokenizer_token.EOF)
323323+ let line, column = Tokenizer_stream.position t.stream in ignore (S.process t.sink Tokenizer_token.EOF ~line ~column)
322324 | Tokenizer_state.Rawtext_end_tag_open ->
323325 emit_str t "</";
324326 emit_pending_chars ();
325325- ignore (S.process t.sink Tokenizer_token.EOF)
327327+ let line, column = Tokenizer_stream.position t.stream in ignore (S.process t.sink Tokenizer_token.EOF ~line ~column)
326328 | Tokenizer_state.Rawtext_end_tag_name ->
327329 emit_str t "</";
328330 emit_str t (Buffer.contents t.temp_buffer);
329331 emit_pending_chars ();
330330- ignore (S.process t.sink Tokenizer_token.EOF)
332332+ let line, column = Tokenizer_stream.position t.stream in ignore (S.process t.sink Tokenizer_token.EOF ~line ~column)
331333 | Tokenizer_state.Rcdata ->
332334 emit_pending_chars ();
333333- ignore (S.process t.sink Tokenizer_token.EOF)
335335+ let line, column = Tokenizer_stream.position t.stream in ignore (S.process t.sink Tokenizer_token.EOF ~line ~column)
334336 | Tokenizer_state.Rcdata_less_than_sign ->
335337 emit_char t '<';
336338 emit_pending_chars ();
337337- ignore (S.process t.sink Tokenizer_token.EOF)
339339+ let line, column = Tokenizer_stream.position t.stream in ignore (S.process t.sink Tokenizer_token.EOF ~line ~column)
338340 | Tokenizer_state.Rcdata_end_tag_open ->
339341 emit_str t "</";
340342 emit_pending_chars ();
341341- ignore (S.process t.sink Tokenizer_token.EOF)
343343+ let line, column = Tokenizer_stream.position t.stream in ignore (S.process t.sink Tokenizer_token.EOF ~line ~column)
342344 | Tokenizer_state.Rcdata_end_tag_name ->
343345 emit_str t "</";
344346 emit_str t (Buffer.contents t.temp_buffer);
345347 emit_pending_chars ();
346346- ignore (S.process t.sink Tokenizer_token.EOF)
348348+ let line, column = Tokenizer_stream.position t.stream in ignore (S.process t.sink Tokenizer_token.EOF ~line ~column)
347349 | Tokenizer_state.Script_data ->
348350 emit_pending_chars ();
349349- ignore (S.process t.sink Tokenizer_token.EOF)
351351+ let line, column = Tokenizer_stream.position t.stream in ignore (S.process t.sink Tokenizer_token.EOF ~line ~column)
350352 | Tokenizer_state.Script_data_less_than_sign ->
351353 emit_char t '<';
352354 emit_pending_chars ();
353353- ignore (S.process t.sink Tokenizer_token.EOF)
355355+ let line, column = Tokenizer_stream.position t.stream in ignore (S.process t.sink Tokenizer_token.EOF ~line ~column)
354356 | Tokenizer_state.Script_data_end_tag_open ->
355357 emit_str t "</";
356358 emit_pending_chars ();
357357- ignore (S.process t.sink Tokenizer_token.EOF)
359359+ let line, column = Tokenizer_stream.position t.stream in ignore (S.process t.sink Tokenizer_token.EOF ~line ~column)
358360 | Tokenizer_state.Script_data_end_tag_name ->
359361 emit_str t "</";
360362 emit_str t (Buffer.contents t.temp_buffer);
361363 emit_pending_chars ();
362362- ignore (S.process t.sink Tokenizer_token.EOF)
364364+ let line, column = Tokenizer_stream.position t.stream in ignore (S.process t.sink Tokenizer_token.EOF ~line ~column)
363365 | Tokenizer_state.Script_data_escape_start
364366 | Tokenizer_state.Script_data_escape_start_dash
365367 | Tokenizer_state.Script_data_escaped
···367369 | Tokenizer_state.Script_data_escaped_dash_dash ->
368370 error t "eof-in-script-html-comment-like-text";
369371 emit_pending_chars ();
370370- ignore (S.process t.sink Tokenizer_token.EOF)
372372+ let line, column = Tokenizer_stream.position t.stream in ignore (S.process t.sink Tokenizer_token.EOF ~line ~column)
371373 | Tokenizer_state.Script_data_escaped_less_than_sign ->
372374 emit_char t '<';
373375 emit_pending_chars ();
374374- ignore (S.process t.sink Tokenizer_token.EOF)
376376+ let line, column = Tokenizer_stream.position t.stream in ignore (S.process t.sink Tokenizer_token.EOF ~line ~column)
375377 | Tokenizer_state.Script_data_escaped_end_tag_open ->
376378 emit_str t "</";
377379 emit_pending_chars ();
378378- ignore (S.process t.sink Tokenizer_token.EOF)
380380+ let line, column = Tokenizer_stream.position t.stream in ignore (S.process t.sink Tokenizer_token.EOF ~line ~column)
379381 | Tokenizer_state.Script_data_escaped_end_tag_name ->
380382 emit_str t "</";
381383 emit_str t (Buffer.contents t.temp_buffer);
382384 emit_pending_chars ();
383383- ignore (S.process t.sink Tokenizer_token.EOF)
385385+ let line, column = Tokenizer_stream.position t.stream in ignore (S.process t.sink Tokenizer_token.EOF ~line ~column)
384386 | Tokenizer_state.Script_data_double_escape_start
385387 | Tokenizer_state.Script_data_double_escaped
386388 | Tokenizer_state.Script_data_double_escaped_dash
387389 | Tokenizer_state.Script_data_double_escaped_dash_dash ->
388390 error t "eof-in-script-html-comment-like-text";
389391 emit_pending_chars ();
390390- ignore (S.process t.sink Tokenizer_token.EOF)
392392+ let line, column = Tokenizer_stream.position t.stream in ignore (S.process t.sink Tokenizer_token.EOF ~line ~column)
391393 | Tokenizer_state.Script_data_double_escaped_less_than_sign ->
392394 (* '<' was already emitted when entering this state from Script_data_double_escaped *)
393395 emit_pending_chars ();
394394- ignore (S.process t.sink Tokenizer_token.EOF)
396396+ let line, column = Tokenizer_stream.position t.stream in ignore (S.process t.sink Tokenizer_token.EOF ~line ~column)
395397 | Tokenizer_state.Script_data_double_escape_end ->
396398 emit_pending_chars ();
397397- ignore (S.process t.sink Tokenizer_token.EOF)
399399+ let line, column = Tokenizer_stream.position t.stream in ignore (S.process t.sink Tokenizer_token.EOF ~line ~column)
398400 | Tokenizer_state.Plaintext ->
399401 emit_pending_chars ();
400400- ignore (S.process t.sink Tokenizer_token.EOF)
402402+ let line, column = Tokenizer_stream.position t.stream in ignore (S.process t.sink Tokenizer_token.EOF ~line ~column)
401403 | Tokenizer_state.Comment_start
402404 | Tokenizer_state.Comment_start_dash
403405 | Tokenizer_state.Comment
···411413 error t "eof-in-comment";
412414 emit_current_comment ();
413415 emit_pending_chars ();
414414- ignore (S.process t.sink Tokenizer_token.EOF)
416416+ let line, column = Tokenizer_stream.position t.stream in ignore (S.process t.sink Tokenizer_token.EOF ~line ~column)
415417 | Tokenizer_state.Bogus_comment ->
416418 emit_current_comment ();
417419 emit_pending_chars ();
418418- ignore (S.process t.sink Tokenizer_token.EOF)
420420+ let line, column = Tokenizer_stream.position t.stream in ignore (S.process t.sink Tokenizer_token.EOF ~line ~column)
419421 | Tokenizer_state.Markup_declaration_open ->
420422 error t "incorrectly-opened-comment";
421423 Buffer.clear t.current_comment;
422424 emit_current_comment ();
423425 emit_pending_chars ();
424424- ignore (S.process t.sink Tokenizer_token.EOF)
426426+ let line, column = Tokenizer_stream.position t.stream in ignore (S.process t.sink Tokenizer_token.EOF ~line ~column)
425427 | Tokenizer_state.Doctype
426428 | Tokenizer_state.Before_doctype_name ->
427429 error t "eof-in-doctype";
···429431 t.current_doctype_force_quirks <- true;
430432 emit_current_doctype ();
431433 emit_pending_chars ();
432432- ignore (S.process t.sink Tokenizer_token.EOF)
434434+ let line, column = Tokenizer_stream.position t.stream in ignore (S.process t.sink Tokenizer_token.EOF ~line ~column)
433435 | Tokenizer_state.Doctype_name
434436 | Tokenizer_state.After_doctype_name
435437 | Tokenizer_state.After_doctype_public_keyword
···447449 t.current_doctype_force_quirks <- true;
448450 emit_current_doctype ();
449451 emit_pending_chars ();
450450- ignore (S.process t.sink Tokenizer_token.EOF)
452452+ let line, column = Tokenizer_stream.position t.stream in ignore (S.process t.sink Tokenizer_token.EOF ~line ~column)
451453 | Tokenizer_state.Bogus_doctype ->
452454 emit_current_doctype ();
453455 emit_pending_chars ();
454454- ignore (S.process t.sink Tokenizer_token.EOF)
456456+ let line, column = Tokenizer_stream.position t.stream in ignore (S.process t.sink Tokenizer_token.EOF ~line ~column)
455457 | Tokenizer_state.Cdata_section ->
456458 error t "eof-in-cdata";
457459 emit_pending_chars ();
458458- ignore (S.process t.sink Tokenizer_token.EOF)
460460+ let line, column = Tokenizer_stream.position t.stream in ignore (S.process t.sink Tokenizer_token.EOF ~line ~column)
459461 | Tokenizer_state.Cdata_section_bracket ->
460462 error t "eof-in-cdata";
461463 emit_char t ']';
462464 emit_pending_chars ();
463463- ignore (S.process t.sink Tokenizer_token.EOF)
465465+ let line, column = Tokenizer_stream.position t.stream in ignore (S.process t.sink Tokenizer_token.EOF ~line ~column)
464466 | Tokenizer_state.Cdata_section_end ->
465467 error t "eof-in-cdata";
466468 emit_str t "]]";
467469 emit_pending_chars ();
468468- ignore (S.process t.sink Tokenizer_token.EOF)
470470+ let line, column = Tokenizer_stream.position t.stream in ignore (S.process t.sink Tokenizer_token.EOF ~line ~column)
469471 | Tokenizer_state.Character_reference ->
470472 (* state_character_reference never ran, so initialize temp_buffer with & *)
471473 Buffer.clear t.temp_buffer;
···617619 (* Emit pending chars first, then emit null separately for proper tree builder handling *)
618620 emit_pending_chars ();
619621 error t "unexpected-null-character";
620620- ignore (S.process t.sink (Tokenizer_token.Character "\x00"))
622622+ let line, column = Tokenizer_stream.position t.stream in
623623+ ignore (S.process t.sink (Tokenizer_token.Character "\x00") ~line ~column)
621624 | Some c ->
622625 emit_char_checked c
623626 | None -> ()
···11+(** Tests for the html5_checker library *)
22+33+(** Helper to create a reader from a string *)
44+let reader_of_string s = Bytesrw.Bytes.Reader.of_string s
55+66+(** Helper to check if a message contains a substring *)
77+let message_contains msg substring =
88+ String.lowercase_ascii msg.Html5_checker.Message.message
99+ |> fun s -> String.length s >= String.length substring &&
1010+ try
1111+ ignore (Str.search_forward (Str.regexp_case_fold (Str.quote substring)) s 0);
1212+ true
1313+ with Not_found -> false
1414+1515+(** Test that valid HTML5 produces no errors *)
1616+let test_valid_html5 () =
1717+ Printf.printf "Test 1: Valid HTML5 document\n";
1818+ let html = {|<!DOCTYPE html>
1919+<html lang="en">
2020+<head><title>Test</title></head>
2121+<body><p>Hello world</p></body>
2222+</html>|} in
2323+ let reader = reader_of_string html in
2424+ let result = Html5_checker.check reader in
2525+ let errors = Html5_checker.errors result in
2626+ Printf.printf " Found %d error(s)\n" (List.length errors);
2727+ if List.length errors > 0 then begin
2828+ List.iter (fun msg ->
2929+ Printf.printf " - %s\n" msg.Html5_checker.Message.message
3030+ ) errors;
3131+ end else
3232+ Printf.printf " OK: No errors as expected\n"
3333+3434+(** Test that missing DOCTYPE is detected *)
3535+let test_missing_doctype () =
3636+ Printf.printf "\nTest 2: Missing DOCTYPE\n";
3737+ let html = "<html><body>Hello</body></html>" in
3838+ let reader = reader_of_string html in
3939+ let result = Html5_checker.check reader in
4040+ let errors = Html5_checker.errors result in
4141+ Printf.printf " Found %d error(s)\n" (List.length errors);
4242+ if List.length errors = 0 then
4343+ Printf.printf " Warning: Expected parse errors for missing DOCTYPE\n"
4444+ else begin
4545+ List.iter (fun msg ->
4646+ Printf.printf " - %s\n" msg.Html5_checker.Message.message
4747+ ) errors;
4848+ end
4949+5050+(** Test that obsolete elements are detected *)
5151+let test_obsolete_element () =
5252+ Printf.printf "\nTest 3: Obsolete <center> element\n";
5353+ let html = "<!DOCTYPE html><html><body><center>Centered</center></body></html>" in
5454+ let reader = reader_of_string html in
5555+ let result = Html5_checker.check reader in
5656+ let all_msgs = Html5_checker.messages result in
5757+ Printf.printf " Found %d message(s)\n" (List.length all_msgs);
5858+ let obsolete_msgs = List.filter (fun m ->
5959+ message_contains m "obsolete" || message_contains m "center"
6060+ ) all_msgs in
6161+ if List.length obsolete_msgs > 0 then begin
6262+ Printf.printf " Found obsolete-related messages:\n";
6363+ List.iter (fun msg ->
6464+ Printf.printf " - %s\n" msg.Html5_checker.Message.message
6565+ ) obsolete_msgs;
6666+ end else
6767+ Printf.printf " Note: No obsolete element warnings found (checker may not be enabled)\n"
6868+6969+(** Test duplicate IDs *)
7070+let test_duplicate_id () =
7171+ Printf.printf "\nTest 4: Duplicate ID attributes\n";
7272+ let html = {|<!DOCTYPE html><html><body>
7373+ <div id="foo">First</div>
7474+ <div id="foo">Second</div>
7575+ </body></html>|} in
7676+ let reader = reader_of_string html in
7777+ let result = Html5_checker.check reader in
7878+ let all_msgs = Html5_checker.messages result in
7979+ Printf.printf " Found %d message(s)\n" (List.length all_msgs);
8080+ let id_msgs = List.filter (fun m ->
8181+ message_contains m "duplicate" || message_contains m "id"
8282+ ) all_msgs in
8383+ if List.length id_msgs > 0 then begin
8484+ Printf.printf " Found ID-related messages:\n";
8585+ List.iter (fun msg ->
8686+ Printf.printf " - %s\n" msg.Html5_checker.Message.message
8787+ ) id_msgs;
8888+ end else
8989+ Printf.printf " Note: No duplicate ID errors found (checker may not be enabled)\n"
9090+9191+(** Test heading structure *)
9292+let test_heading_skip () =
9393+ Printf.printf "\nTest 5: Skipped heading level\n";
9494+ let html = {|<!DOCTYPE html><html><body>
9595+ <h1>Title</h1>
9696+ <h3>Skipped h2</h3>
9797+ </body></html>|} in
9898+ let reader = reader_of_string html in
9999+ let result = Html5_checker.check reader in
100100+ let all_msgs = Html5_checker.messages result in
101101+ Printf.printf " Found %d message(s)\n" (List.length all_msgs);
102102+ let heading_msgs = List.filter (fun m ->
103103+ message_contains m "heading" || message_contains m "skip"
104104+ ) all_msgs in
105105+ if List.length heading_msgs > 0 then begin
106106+ Printf.printf " Found heading-related messages:\n";
107107+ List.iter (fun msg ->
108108+ Printf.printf " - %s\n" msg.Html5_checker.Message.message
109109+ ) heading_msgs;
110110+ end else
111111+ Printf.printf " Note: No heading structure warnings found (checker may not be enabled)\n"
112112+113113+(** Test img without alt *)
114114+let test_img_without_alt () =
115115+ Printf.printf "\nTest 6: Image without alt attribute\n";
116116+ let html = {|<!DOCTYPE html><html><body>
117117+ <img src="test.jpg">
118118+ </body></html>|} in
119119+ let reader = reader_of_string html in
120120+ let result = Html5_checker.check reader in
121121+ let all_msgs = Html5_checker.messages result in
122122+ Printf.printf " Found %d message(s)\n" (List.length all_msgs);
123123+ let img_msgs = List.filter (fun m ->
124124+ message_contains m "alt" || (message_contains m "img" && message_contains m "attribute")
125125+ ) all_msgs in
126126+ if List.length img_msgs > 0 then begin
127127+ Printf.printf " Found img/alt-related messages:\n";
128128+ List.iter (fun msg ->
129129+ Printf.printf " - %s\n" msg.Html5_checker.Message.message
130130+ ) img_msgs;
131131+ end else
132132+ Printf.printf " Note: No missing alt attribute errors found (checker may not be enabled)\n"
133133+134134+(** Test invalid nesting *)
135135+let test_invalid_nesting () =
136136+ Printf.printf "\nTest 7: Invalid nesting - <a> inside <a>\n";
137137+ let html = {|<!DOCTYPE html><html><body>
138138+ <a href="#">Link <a href="#">Nested</a></a>
139139+ </body></html>|} in
140140+ let reader = reader_of_string html in
141141+ let result = Html5_checker.check reader in
142142+ let all_msgs = Html5_checker.messages result in
143143+ Printf.printf " Found %d message(s)\n" (List.length all_msgs);
144144+ let nesting_msgs = List.filter (fun m ->
145145+ message_contains m "nesting" || message_contains m "nested" || message_contains m "ancestor"
146146+ ) all_msgs in
147147+ if List.length nesting_msgs > 0 then begin
148148+ Printf.printf " Found nesting-related messages:\n";
149149+ List.iter (fun msg ->
150150+ Printf.printf " - %s\n" msg.Html5_checker.Message.message
151151+ ) nesting_msgs;
152152+ end else
153153+ Printf.printf " Note: No nesting errors found (checker may not be enabled)\n"
154154+155155+(** Test form inside form *)
156156+let test_form_nesting () =
157157+ Printf.printf "\nTest 8: Invalid nesting - <form> inside <form>\n";
158158+ let html = {|<!DOCTYPE html><html><body>
159159+ <form><form></form></form>
160160+ </body></html>|} in
161161+ let reader = reader_of_string html in
162162+ let result = Html5_checker.check reader in
163163+ let all_msgs = Html5_checker.messages result in
164164+ Printf.printf " Found %d message(s)\n" (List.length all_msgs);
165165+ let form_msgs = List.filter (fun m ->
166166+ message_contains m "form"
167167+ ) all_msgs in
168168+ if List.length form_msgs > 0 then begin
169169+ Printf.printf " Found form-related messages:\n";
170170+ List.iter (fun msg ->
171171+ Printf.printf " - %s\n" msg.Html5_checker.Message.message
172172+ ) form_msgs;
173173+ end else
174174+ Printf.printf " Note: No form nesting errors found (checker may not be enabled)\n"
175175+176176+(** Test output formatting *)
177177+let test_output_formats () =
178178+ Printf.printf "\nTest 9: Output format testing\n";
179179+ let html = {|<!DOCTYPE html><html><body><p>Test</p></body></html>|} in
180180+ let reader = reader_of_string html in
181181+ let result = Html5_checker.check reader in
182182+183183+ Printf.printf " Testing text format:\n";
184184+ let text_output = Html5_checker.format_text result in
185185+ Printf.printf " Length: %d chars\n" (String.length text_output);
186186+187187+ Printf.printf " Testing JSON format:\n";
188188+ let json_output = Html5_checker.format_json result in
189189+ Printf.printf " Length: %d chars\n" (String.length json_output);
190190+191191+ Printf.printf " Testing GNU format:\n";
192192+ let gnu_output = Html5_checker.format_gnu result in
193193+ Printf.printf " Length: %d chars\n" (String.length gnu_output)
194194+195195+(** Test has_errors function *)
196196+let test_has_errors () =
197197+ Printf.printf "\nTest 10: has_errors function\n";
198198+199199+ (* Valid document should have no errors *)
200200+ let valid_html = "<!DOCTYPE html><html><body><p>Valid</p></body></html>" in
201201+ let result1 = Html5_checker.check (reader_of_string valid_html) in
202202+ Printf.printf " Valid document has_errors: %b\n" (Html5_checker.has_errors result1);
203203+204204+ (* Document with likely parse errors *)
205205+ let invalid_html = "<html><body><p>Unclosed" in
206206+ let result2 = Html5_checker.check (reader_of_string invalid_html) in
207207+ Printf.printf " Invalid document has_errors: %b\n" (Html5_checker.has_errors result2)
208208+209209+(** Test check_dom with pre-parsed document *)
210210+let test_check_dom () =
211211+ Printf.printf "\nTest 11: check_dom with pre-parsed document\n";
212212+ let html = "<!DOCTYPE html><html><body><p>Test</p></body></html>" in
213213+ let reader = reader_of_string html in
214214+ let parsed = Html5rw.parse reader in
215215+ let result = Html5_checker.check_dom parsed in
216216+ let all_msgs = Html5_checker.messages result in
217217+ Printf.printf " check_dom found %d message(s)\n" (List.length all_msgs);
218218+ Printf.printf " OK: check_dom completed successfully\n"
219219+220220+(** Test system_id parameter *)
221221+let test_system_id () =
222222+ Printf.printf "\nTest 12: system_id parameter\n";
223223+ let html = "<!DOCTYPE html><html><body><p>Test</p></body></html>" in
224224+ let reader = reader_of_string html in
225225+ let result = Html5_checker.check ~system_id:"test.html" reader in
226226+ match Html5_checker.system_id result with
227227+ | Some id -> Printf.printf " system_id: %s\n" id
228228+ | None -> Printf.printf " Warning: system_id not set\n"
229229+230230+(** Test collect_parse_errors flag *)
231231+let test_collect_parse_errors_flag () =
232232+ Printf.printf "\nTest 13: collect_parse_errors flag\n";
233233+ let html = "<!DOCTYPE html><html><body><p>Test</p></body></html>" in
234234+235235+ let result_with = Html5_checker.check ~collect_parse_errors:true (reader_of_string html) in
236236+ let msgs_with = Html5_checker.messages result_with in
237237+ Printf.printf " With parse errors: %d message(s)\n" (List.length msgs_with);
238238+239239+ let result_without = Html5_checker.check ~collect_parse_errors:false (reader_of_string html) in
240240+ let msgs_without = Html5_checker.messages result_without in
241241+ Printf.printf " Without parse errors: %d message(s)\n" (List.length msgs_without)
242242+243243+(** Test document accessor *)
244244+let test_document_accessor () =
245245+ Printf.printf "\nTest 14: document accessor\n";
246246+ let html = "<!DOCTYPE html><html><body><p>Test</p></body></html>" in
247247+ let reader = reader_of_string html in
248248+ let result = Html5_checker.check reader in
249249+ let _doc = Html5_checker.document result in
250250+ Printf.printf " OK: document accessor works\n"
251251+252252+(** Test message severity filtering *)
253253+let test_severity_filtering () =
254254+ Printf.printf "\nTest 15: Message severity filtering\n";
255255+ let html = "<!DOCTYPE html><html><body><p>Test</p></body></html>" in
256256+ let reader = reader_of_string html in
257257+ let result = Html5_checker.check reader in
258258+259259+ let all_msgs = Html5_checker.messages result in
260260+ let errors = Html5_checker.errors result in
261261+ let warnings = Html5_checker.warnings result in
262262+263263+ Printf.printf " Total messages: %d\n" (List.length all_msgs);
264264+ Printf.printf " Errors: %d\n" (List.length errors);
265265+ Printf.printf " Warnings: %d\n" (List.length warnings);
266266+267267+ (* Verify that errors + warnings <= all messages *)
268268+ if List.length errors + List.length warnings <= List.length all_msgs then
269269+ Printf.printf " OK: Message counts are consistent\n"
270270+ else
271271+ Printf.printf " Warning: Message counts inconsistent\n"
272272+273273+(** Run all tests *)
274274+let () =
275275+ Printf.printf "Running html5_checker tests...\n";
276276+ Printf.printf "========================================\n\n";
277277+278278+ test_valid_html5 ();
279279+ test_missing_doctype ();
280280+ test_obsolete_element ();
281281+ test_duplicate_id ();
282282+ test_heading_skip ();
283283+ test_img_without_alt ();
284284+ test_invalid_nesting ();
285285+ test_form_nesting ();
286286+ test_output_formats ();
287287+ test_has_errors ();
288288+ test_check_dom ();
289289+ test_system_id ();
290290+ test_collect_parse_errors_flag ();
291291+ test_document_accessor ();
292292+ test_severity_filtering ();
293293+294294+ Printf.printf "\n========================================\n";
295295+ Printf.printf "All tests completed!\n";
296296+ Printf.printf "\nNote: Some checkers may not be enabled yet.\n";
297297+ Printf.printf "Tests marked with 'Note:' indicate features that may be\n";
298298+ Printf.printf "implemented in future versions.\n"
+126
test/test_nesting_checker.ml
···11+(** Simple test for nesting_checker functionality *)
22+33+let () =
44+ (* Create a message collector *)
55+ let collector = Html5_checker.Message_collector.create () in
66+77+ (* Get the nesting checker *)
88+ let module C = (val Html5_checker__Nesting_checker.checker : Html5_checker__Checker.S) in
99+ let state = C.create () in
1010+1111+ (* Test 1: <a> cannot contain another <a> *)
1212+ Printf.printf "Test 1: Checking <a href> inside <a href>\n";
1313+ C.start_element state ~name:"a" ~namespace:None ~attrs:[("href", "#")] collector;
1414+ C.start_element state ~name:"a" ~namespace:None ~attrs:[("href", "#")] collector;
1515+1616+ let errors1 = Html5_checker.Message_collector.errors collector in
1717+ Printf.printf " Found %d error(s)\n" (List.length errors1);
1818+ List.iter (fun msg ->
1919+ Printf.printf " - %s\n" msg.Html5_checker.Message.message
2020+ ) errors1;
2121+2222+ C.end_element state ~name:"a" ~namespace:None collector;
2323+ C.end_element state ~name:"a" ~namespace:None collector;
2424+ Html5_checker.Message_collector.clear collector;
2525+2626+ (* Test 2: <button> inside <a> *)
2727+ Printf.printf "\nTest 2: Checking <button> inside <a href>\n";
2828+ C.start_element state ~name:"a" ~namespace:None ~attrs:[("href", "#")] collector;
2929+ C.start_element state ~name:"button" ~namespace:None ~attrs:[] collector;
3030+3131+ let errors2 = Html5_checker.Message_collector.errors collector in
3232+ Printf.printf " Found %d error(s)\n" (List.length errors2);
3333+ List.iter (fun msg ->
3434+ Printf.printf " - %s\n" msg.Html5_checker.Message.message
3535+ ) errors2;
3636+3737+ C.end_element state ~name:"button" ~namespace:None collector;
3838+ C.end_element state ~name:"a" ~namespace:None collector;
3939+ Html5_checker.Message_collector.clear collector;
4040+4141+ (* Test 3: form inside form *)
4242+ Printf.printf "\nTest 3: Checking <form> inside <form>\n";
4343+ C.start_element state ~name:"form" ~namespace:None ~attrs:[] collector;
4444+ C.start_element state ~name:"form" ~namespace:None ~attrs:[] collector;
4545+4646+ let errors3 = Html5_checker.Message_collector.errors collector in
4747+ Printf.printf " Found %d error(s)\n" (List.length errors3);
4848+ List.iter (fun msg ->
4949+ Printf.printf " - %s\n" msg.Html5_checker.Message.message
5050+ ) errors3;
5151+5252+ C.end_element state ~name:"form" ~namespace:None collector;
5353+ C.end_element state ~name:"form" ~namespace:None collector;
5454+ Html5_checker.Message_collector.clear collector;
5555+5656+ (* Test 4: header inside footer *)
5757+ Printf.printf "\nTest 4: Checking <header> inside <footer>\n";
5858+ C.start_element state ~name:"footer" ~namespace:None ~attrs:[] collector;
5959+ C.start_element state ~name:"header" ~namespace:None ~attrs:[] collector;
6060+6161+ let errors4 = Html5_checker.Message_collector.errors collector in
6262+ Printf.printf " Found %d error(s)\n" (List.length errors4);
6363+ List.iter (fun msg ->
6464+ Printf.printf " - %s\n" msg.Html5_checker.Message.message
6565+ ) errors4;
6666+6767+ C.end_element state ~name:"header" ~namespace:None collector;
6868+ C.end_element state ~name:"footer" ~namespace:None collector;
6969+ Html5_checker.Message_collector.clear collector;
7070+7171+ (* Test 5: input (not hidden) inside button *)
7272+ Printf.printf "\nTest 5: Checking <input type=text> inside <button>\n";
7373+ C.start_element state ~name:"button" ~namespace:None ~attrs:[] collector;
7474+ C.start_element state ~name:"input" ~namespace:None ~attrs:[("type", "text")] collector;
7575+7676+ let errors5 = Html5_checker.Message_collector.errors collector in
7777+ Printf.printf " Found %d error(s)\n" (List.length errors5);
7878+ List.iter (fun msg ->
7979+ Printf.printf " - %s\n" msg.Html5_checker.Message.message
8080+ ) errors5;
8181+8282+ C.end_element state ~name:"input" ~namespace:None collector;
8383+ C.end_element state ~name:"button" ~namespace:None collector;
8484+ Html5_checker.Message_collector.clear collector;
8585+8686+ (* Test 6: valid nesting - should not error *)
8787+ Printf.printf "\nTest 6: Checking valid nesting: <div> inside <div>\n";
8888+ C.start_element state ~name:"div" ~namespace:None ~attrs:[] collector;
8989+ C.start_element state ~name:"div" ~namespace:None ~attrs:[] collector;
9090+9191+ let errors6 = Html5_checker.Message_collector.errors collector in
9292+ Printf.printf " Found %d error(s)\n" (List.length errors6);
9393+ if List.length errors6 = 0 then
9494+ Printf.printf " OK: No errors as expected\n";
9595+9696+ C.end_element state ~name:"div" ~namespace:None collector;
9797+ C.end_element state ~name:"div" ~namespace:None collector;
9898+ Html5_checker.Message_collector.clear collector;
9999+100100+ (* Test 7: area without map ancestor *)
101101+ Printf.printf "\nTest 7: Checking <area> without <map> ancestor\n";
102102+ C.start_element state ~name:"area" ~namespace:None ~attrs:[] collector;
103103+104104+ let errors7 = Html5_checker.Message_collector.errors collector in
105105+ Printf.printf " Found %d error(s)\n" (List.length errors7);
106106+ List.iter (fun msg ->
107107+ Printf.printf " - %s\n" msg.Html5_checker.Message.message
108108+ ) errors7;
109109+110110+ C.end_element state ~name:"area" ~namespace:None collector;
111111+ Html5_checker.Message_collector.clear collector;
112112+113113+ (* Test 8: area with map ancestor (valid) *)
114114+ Printf.printf "\nTest 8: Checking <area> with <map> ancestor (valid)\n";
115115+ C.start_element state ~name:"map" ~namespace:None ~attrs:[] collector;
116116+ C.start_element state ~name:"area" ~namespace:None ~attrs:[] collector;
117117+118118+ let errors8 = Html5_checker.Message_collector.errors collector in
119119+ Printf.printf " Found %d error(s)\n" (List.length errors8);
120120+ if List.length errors8 = 0 then
121121+ Printf.printf " OK: No errors as expected\n";
122122+123123+ C.end_element state ~name:"area" ~namespace:None collector;
124124+ C.end_element state ~name:"map" ~namespace:None collector;
125125+126126+ Printf.printf "\nAll tests completed!\n"