···1+(*---------------------------------------------------------------------------
2+ Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
3+ SPDX-License-Identifier: MIT
4+---------------------------------------------------------------------------*)
5+6+(** html5check - HTML5 conformance checker CLI
7+8+ Command line interface for validating HTML5 documents. *)
9+10+open Cmdliner
11+12+let version = "0.1.0"
13+14+(** Exit codes *)
15+module Exit_code = struct
16+ let ok = Cmd.Exit.ok
17+ let validation_errors = 1
18+ let io_error = 2
19+end
20+21+(** Read input from file or stdin *)
22+let read_input file =
23+ try
24+ let ic =
25+ if file = "-" then stdin
26+ else open_in file
27+ in
28+ let reader = Bytesrw.Bytes.Reader.of_in_channel ic in
29+ Ok (reader, ic, file)
30+ with
31+ | Sys_error msg ->
32+ Error (`Io_error (Printf.sprintf "Cannot read file '%s': %s" file msg))
33+34+(** Format output based on the requested format *)
35+let format_output format result =
36+ match format with
37+ | `Text -> Html5_checker.format_text result
38+ | `Json -> Html5_checker.format_json result
39+ | `Gnu -> Html5_checker.format_gnu result
40+41+(** Run the validation *)
42+let run format errors_only exit_zero quiet verbose file =
43+ match read_input file with
44+ | Error (`Io_error msg) ->
45+ if not quiet then Printf.eprintf "Error: %s\n" msg;
46+ Exit_code.io_error
47+ | Ok (reader, ic, system_id) ->
48+ (* Run validation *)
49+ let result = Html5_checker.check ~system_id reader in
50+51+ (* Close input if it's not stdin *)
52+ if file <> "-" then close_in ic;
53+54+ (* Get messages based on filtering *)
55+ let messages =
56+ if errors_only then Html5_checker.errors result
57+ else Html5_checker.messages result
58+ in
59+60+ (* Output based on mode *)
61+ if quiet then begin
62+ (* Only show counts *)
63+ let error_count = List.length (Html5_checker.errors result) in
64+ let warning_count = List.length (Html5_checker.warnings result) in
65+ if errors_only then
66+ Printf.printf "%d error%s\n" error_count (if error_count = 1 then "" else "s")
67+ else
68+ Printf.printf "%d error%s, %d warning%s\n"
69+ error_count (if error_count = 1 then "" else "s")
70+ warning_count (if warning_count = 1 then "" else "s")
71+ end else begin
72+ (* Format and print messages *)
73+ let output = format_output format result in
74+ if output <> "" then print_string output;
75+76+ (* Show summary if verbose *)
77+ if verbose && messages <> [] then begin
78+ let error_count = List.length (Html5_checker.errors result) in
79+ let warning_count = List.length (Html5_checker.warnings result) in
80+ Printf.eprintf "\nSummary: %d error%s, %d warning%s\n"
81+ error_count (if error_count = 1 then "" else "s")
82+ warning_count (if warning_count = 1 then "" else "s")
83+ end
84+ end;
85+86+ (* Determine exit code *)
87+ if exit_zero || not (Html5_checker.has_errors result) then
88+ Exit_code.ok
89+ else
90+ Exit_code.validation_errors
91+92+(** Command line argument definitions *)
93+94+let format_arg =
95+ let formats = [("text", `Text); ("json", `Json); ("gnu", `Gnu)] in
96+ let doc =
97+ "Output format. $(docv) must be one of $(b,text) (human-readable, default), \
98+ $(b,json) (Nu validator compatible JSON), or $(b,gnu) (GNU-style for IDE integration)."
99+ in
100+ Arg.(value & opt (enum formats) `Text & info ["format"] ~docv:"FORMAT" ~doc)
101+102+let errors_only_arg =
103+ let doc = "Only show errors (suppress warnings)." in
104+ Arg.(value & flag & info ["errors-only"] ~doc)
105+106+let exit_zero_arg =
107+ let doc =
108+ "Always exit with status code 0, even if validation errors are found. \
109+ Useful for CI pipelines where you want to collect validation results \
110+ but not fail the build."
111+ in
112+ Arg.(value & flag & info ["exit-zero"] ~doc)
113+114+let quiet_arg =
115+ let doc = "Quiet mode - only show error and warning counts, no details." in
116+ Arg.(value & flag & info ["q"; "quiet"] ~doc)
117+118+let verbose_arg =
119+ let doc = "Verbose mode - show additional information including summary." in
120+ Arg.(value & flag & info ["v"; "verbose"] ~doc)
121+122+let file_arg =
123+ let doc =
124+ "HTML file to validate. Use $(b,-) to read from standard input. \
125+ If no file is specified, reads from stdin."
126+ in
127+ Arg.(value & pos 0 string "-" & info [] ~docv:"FILE" ~doc)
128+129+let cmd =
130+ let doc = "validate HTML5 documents for conformance" in
131+ let man = [
132+ `S Manpage.s_description;
133+ `P "$(tname) validates HTML5 documents against the WHATWG HTML5 specification. \
134+ It reports parse errors, structural validation issues, and conformance problems.";
135+ `P "The validator checks for:";
136+ `I ("Parse errors", "Malformed HTML syntax according to the WHATWG specification");
137+ `I ("Content model violations", "Elements in invalid parent/child relationships");
138+ `I ("Attribute errors", "Invalid or missing required attributes");
139+ `I ("Structural issues", "Other conformance problems");
140+ `S Manpage.s_options;
141+ `S "OUTPUT FORMATS";
142+ `P "The validator supports three output formats:";
143+ `I ("$(b,text)", "Human-readable format showing file:line:col: severity: message");
144+ `I ("$(b,json)", "JSON format compatible with the Nu Html Checker (v.Nu)");
145+ `I ("$(b,gnu)", "GNU-style format for IDE integration (file:line:column: message)");
146+ `S "EXIT STATUS";
147+ `P "The validator exits with one of the following status codes:";
148+ `I ("0", "No validation errors found (or --exit-zero was specified)");
149+ `I ("1", "Validation errors were found");
150+ `I ("2", "File not found or I/O error");
151+ `S Manpage.s_examples;
152+ `P "Validate a file:";
153+ `Pre " $(mname) index.html";
154+ `P "Validate from stdin:";
155+ `Pre " cat page.html | $(mname) -";
156+ `P "Show only errors in JSON format:";
157+ `Pre " $(mname) --format=json --errors-only page.html";
158+ `P "Quiet mode for CI:";
159+ `Pre " $(mname) --quiet --exit-zero index.html";
160+ `S Manpage.s_bugs;
161+ `P "Report bugs at https://tangled.org/@anil.recoil.org/ocaml-html5rw/issues";
162+ ] in
163+ let info = Cmd.info "html5check" ~version ~doc ~man in
164+ Cmd.v info Term.(const run $ format_arg $ errors_only_arg $ exit_zero_arg
165+ $ quiet_arg $ verbose_arg $ file_arg)
166+167+let main () = Cmd.eval' cmd
168+let () = Stdlib.exit (main ())
···1+(** Base checker module for HTML5 conformance checking. *)
2+3+module type S = sig
4+ type state
5+6+ val create : unit -> state
7+ val reset : state -> unit
8+9+ val start_element :
10+ state ->
11+ name:string ->
12+ namespace:string option ->
13+ attrs:(string * string) list ->
14+ Message_collector.t ->
15+ unit
16+17+ val end_element :
18+ state -> name:string -> namespace:string option -> Message_collector.t -> unit
19+20+ val characters : state -> string -> Message_collector.t -> unit
21+ val end_document : state -> Message_collector.t -> unit
22+end
23+24+type t = (module S)
25+26+(** No-operation checker implementation. *)
27+module Noop = struct
28+ type state = unit
29+30+ let create () = ()
31+ let reset () = ()
32+33+ let start_element () ~name:_ ~namespace:_ ~attrs:_ _ = ()
34+ let end_element () ~name:_ ~namespace:_ _ = ()
35+ let characters () _ _ = ()
36+ let end_document () _ = ()
37+end
38+39+let noop () = (module Noop : S)
···1+(** Base checker module for HTML5 conformance checking.
2+3+ This module provides the core checker abstraction used throughout the
4+ html5_checker library. A checker validates HTML5 documents by observing
5+ DOM tree traversal events and emitting validation messages.
6+7+ {2 Design Overview}
8+9+ Checkers follow a SAX-like event model where they receive notifications
10+ about elements, text, and document boundaries as a DOM tree is traversed.
11+ This design allows for:
12+13+ - {b Stateful validation}: Each checker maintains its own state across
14+ multiple events
15+ - {b Composability}: Multiple checkers can validate the same document
16+ simultaneously
17+ - {b Efficiency}: DOM traversal happens once regardless of checker count
18+19+ {2 Checker Lifecycle}
20+21+ A checker progresses through these phases:
22+23+ 1. {b Creation}: Initialize with {!create} to set up initial state
24+ 2. {b Traversal}: Receive {!start_element}, {!characters}, and
25+ {!end_element} events as the DOM is walked
26+ 3. {b Completion}: Finalize validation with {!end_document}
27+ 4. {b Reset} (optional): Return to initial state with {!reset}
28+29+ {3 Event Sequence}
30+31+ For a document like [<p>Hello <b>world</b></p>], the event sequence is:
32+33+ {v
34+ start_element "p"
35+ characters "Hello "
36+ start_element "b"
37+ characters "world"
38+ end_element "b"
39+ end_element "p"
40+ end_document
41+ v}
42+43+ {2 First-Class Modules}
44+45+ Checkers are represented as first-class modules implementing the {!S}
46+ signature. This allows:
47+48+ - Dynamic checker registration and discovery
49+ - Heterogeneous collections of checkers
50+ - Checker selection at runtime based on validation requirements
51+52+ @see <https://v2.ocaml.org/manual/firstclassmodules.html>
53+ OCaml manual: First-class modules
54+*)
55+56+(** {1 Module Signature} *)
57+58+(** The signature that all checker modules must implement.
59+60+ A checker module maintains validation state and receives notifications
61+ about DOM tree traversal events. *)
62+module type S = sig
63+ (** The type of checker state.
64+65+ This is an abstract type that holds the checker's internal validation
66+ state. Different checkers will have different state representations
67+ depending on what they need to track during validation. *)
68+ type state
69+70+ (** {1 Lifecycle Operations} *)
71+72+ val create : unit -> state
73+ (** [create ()] initializes a new checker state.
74+75+ This function sets up the initial state needed for validation,
76+ such as empty stacks for context tracking, counters, or lookup
77+ tables. *)
78+79+ val reset : state -> unit
80+ (** [reset state] resets the checker to its initial state.
81+82+ This allows reusing a checker for multiple documents without
83+ reallocating. After reset, the checker behaves as if freshly
84+ created with {!create}. *)
85+86+ (** {1 DOM Traversal Events} *)
87+88+ val start_element :
89+ state ->
90+ name:string ->
91+ namespace:string option ->
92+ attrs:(string * string) list ->
93+ Message_collector.t ->
94+ unit
95+ (** [start_element state ~name ~namespace ~attrs collector] is called when
96+ entering an element during DOM traversal.
97+98+ @param state The checker state
99+ @param name The element tag name (e.g., "div", "p", "span")
100+ @param namespace The element namespace ([None] for HTML, [Some "svg"]
101+ for SVG, [Some "mathml"] for MathML)
102+ @param attrs The element's attributes as [(name, value)] pairs
103+ @param collector The message collector for emitting validation messages
104+105+ This is where checkers can validate:
106+ - Whether the element is allowed in the current context
107+ - Whether required attributes are present
108+ - Whether attribute values are valid
109+ - Whether the element opens a new validation context *)
110+111+ val end_element :
112+ state -> name:string -> namespace:string option -> Message_collector.t -> unit
113+ (** [end_element state ~name ~namespace collector] is called when exiting
114+ an element during DOM traversal.
115+116+ @param state The checker state
117+ @param name The element tag name
118+ @param namespace The element namespace
119+ @param collector The message collector for emitting validation messages
120+121+ This is where checkers can:
122+ - Pop validation contexts from stacks
123+ - Validate that required child elements were present
124+ - Emit messages about element-scoped validation rules *)
125+126+ val characters : state -> string -> Message_collector.t -> unit
127+ (** [characters state text collector] is called when text content is
128+ encountered during DOM traversal.
129+130+ @param state The checker state
131+ @param text The text content
132+ @param collector The message collector for emitting validation messages
133+134+ This is where checkers can validate:
135+ - Whether text is allowed in the current context
136+ - Whether text content follows specific patterns
137+ - Whether text matches expected formats *)
138+139+ val end_document : state -> Message_collector.t -> unit
140+ (** [end_document state collector] is called after the entire DOM tree has
141+ been traversed.
142+143+ @param state The checker state
144+ @param collector The message collector for emitting validation messages
145+146+ This is where checkers can:
147+ - Emit messages about missing required elements
148+ - Validate document-level constraints
149+ - Check that all opened contexts were properly closed
150+ - Report any accumulated validation failures *)
151+end
152+153+(** {1 Checker Values} *)
154+155+(** The type of a checker value.
156+157+ This is a packed first-class module containing both the checker
158+ implementation and its state. It enables storing heterogeneous
159+ checkers in collections and passing them around dynamically. *)
160+type t = (module S)
161+162+(** {1 Built-in Checkers} *)
163+164+val noop : unit -> t
165+(** [noop ()] creates a no-operation checker that performs no validation.
166+167+ This checker ignores all events and never emits messages. It is useful:
168+ - As a placeholder in checker registries
169+ - For testing checker infrastructure
170+ - As a base for building new checkers
171+172+ {b Example:}
173+ {[
174+ let checker = noop () in
175+ (* Does nothing when walked over a DOM tree *)
176+ ]}
177+*)
+22
lib/html5_checker/checker_registry.ml
···0000000000000000000000
···1+(** Registry for HTML5 conformance checkers. *)
2+3+type t = (string, Checker.t) Hashtbl.t
4+5+let create () = Hashtbl.create 16
6+7+let default () =
8+ (* In Phase 1, return an empty registry.
9+ Built-in checkers will be added in later phases. *)
10+ create ()
11+12+let register registry name checker = Hashtbl.replace registry name checker
13+14+let unregister registry name = Hashtbl.remove registry name
15+16+let get registry name = Hashtbl.find_opt registry name
17+18+let list_names registry =
19+ Hashtbl.to_seq_keys registry |> List.of_seq
20+21+let all registry =
22+ Hashtbl.to_seq_values registry |> List.of_seq
···1+(** Registry for HTML5 conformance checkers.
2+3+ This module provides a dynamic registry for managing collections of
4+ checkers. It enables:
5+6+ - {b Registration}: Add checkers under descriptive names
7+ - {b Discovery}: Retrieve checkers by name or list all available ones
8+ - {b Lifecycle management}: Register and unregister checkers at runtime
9+ - {b Defaults}: Access a pre-configured set of built-in checkers
10+11+ {2 Design Rationale}
12+13+ The registry pattern separates checker implementation from checker usage.
14+ Applications can:
15+16+ 1. Query available checkers to present options to users
17+ 2. Select specific checkers based on validation requirements
18+ 3. Add custom checkers without modifying library code
19+ 4. Share checker configurations across validation runs
20+21+ {2 Usage Pattern}
22+23+ {[
24+ (* Start with default checkers *)
25+ let reg = default () in
26+27+ (* Add a custom checker *)
28+ let my_checker = (module MyChecker : Checker.S) in
29+ register reg "my-custom-check" my_checker;
30+31+ (* List all available checkers *)
32+ let names = list_names reg in
33+ List.iter (Printf.printf "Available: %s\n") names;
34+35+ (* Retrieve a specific checker *)
36+ match get reg "my-custom-check" with
37+ | Some checker -> (* Use the checker *)
38+ | None -> (* Not found *)
39+40+ (* Get all checkers for validation *)
41+ let all_checkers = all reg in
42+ (* Pass to dom_walker *)
43+ ]}
44+45+ {2 Thread Safety}
46+47+ This registry is not thread-safe. If shared across threads, external
48+ synchronization is required. *)
49+50+(** {1 Types} *)
51+52+(** The type of a checker registry.
53+54+ This is an opaque type representing a mutable collection of named
55+ checkers. Internally implemented as a hash table for efficient lookups. *)
56+type t
57+58+(** {1 Creation} *)
59+60+val create : unit -> t
61+(** [create ()] creates a new empty checker registry.
62+63+ Use this when you want to build a custom set of checkers from scratch,
64+ without any defaults. *)
65+66+val default : unit -> t
67+(** [default ()] creates a registry with built-in checkers.
68+69+ The default registry is initially empty but serves as a starting point
70+ for adding standard validation checkers in future phases.
71+72+ Built-in checkers will include:
73+ - Document structure validation
74+ - Attribute validation
75+ - Content model checking
76+ - Accessibility checks
77+78+ Note: In Phase 1, the default registry is empty. Built-in checkers
79+ will be added in subsequent phases. *)
80+81+(** {1 Registration} *)
82+83+val register : t -> string -> Checker.t -> unit
84+(** [register registry name checker] adds a checker to the registry.
85+86+ @param registry The registry to add to
87+ @param name A unique identifier for the checker (e.g., "obsolete-elements",
88+ "required-attributes")
89+ @param checker The checker implementation
90+91+ If a checker with the same name already exists, it is replaced.
92+93+ {b Example:}
94+ {[
95+ let reg = create () in
96+ let checker = (module MyChecker : Checker.S) in
97+ register reg "my-check" checker
98+ ]} *)
99+100+val unregister : t -> string -> unit
101+(** [unregister registry name] removes a checker from the registry.
102+103+ @param registry The registry to remove from
104+ @param name The checker name
105+106+ If no checker with the given name exists, this is a no-op. *)
107+108+(** {1 Retrieval} *)
109+110+val get : t -> string -> Checker.t option
111+(** [get registry name] retrieves a checker by name.
112+113+ @param registry The registry to search
114+ @param name The checker name
115+ @return [Some checker] if found, [None] otherwise
116+117+ {b Example:}
118+ {[
119+ match get reg "obsolete-elements" with
120+ | Some checker -> (* Use checker *)
121+ | None -> (* Checker not registered *)
122+ ]} *)
123+124+val list_names : t -> string list
125+(** [list_names registry] returns all registered checker names.
126+127+ @param registry The registry to query
128+ @return A list of all checker names in arbitrary order
129+130+ This is useful for:
131+ - Displaying available checkers to users
132+ - Debugging registry contents
133+ - Iterating over specific subsets of checkers
134+135+ {b Example:}
136+ {[
137+ let names = list_names reg in
138+ Printf.printf "Available checkers: %s\n"
139+ (String.concat ", " names)
140+ ]} *)
141+142+val all : t -> Checker.t list
143+(** [all registry] returns all registered checkers.
144+145+ @param registry The registry to query
146+ @return A list of all checkers in arbitrary order
147+148+ This is the primary way to retrieve checkers for validation.
149+ Pass the result to {!Dom_walker.walk_all} to run all registered
150+ checkers on a DOM tree.
151+152+ {b Example:}
153+ {[
154+ let checkers = all reg in
155+ Dom_walker.walk_all checkers collector dom
156+ ]} *)
···1+(** HTML5 content categories.
2+3+ This module defines the content categories used in HTML5 to classify elements
4+ based on their characteristics and allowed contexts. Elements can belong to
5+ multiple categories.
6+7+ @see <https://html.spec.whatwg.org/multipage/dom.html#content-models> WHATWG HTML Specification *)
8+9+(** Content category type. *)
10+type t =
11+ | Metadata
12+ (** Metadata content sets up the presentation or behavior of the rest of
13+ the content, or sets up the relationship of the document with other
14+ documents, or conveys other "out of band" information. *)
15+ | Flow
16+ (** Most elements that are used in the body of documents and applications
17+ are categorized as flow content. *)
18+ | Sectioning
19+ (** Sectioning content is content that defines the scope of headings and
20+ footers. *)
21+ | Heading
22+ (** Heading content defines the heading of a section (whether explicitly
23+ marked up using sectioning content elements, or implied by the heading
24+ content itself). *)
25+ | Phrasing
26+ (** Phrasing content is the text of the document, as well as elements that
27+ mark up that text at the intra-paragraph level. *)
28+ | Embedded
29+ (** Embedded content is content that imports another resource into the
30+ document, or content from another vocabulary that is inserted into the
31+ document. *)
32+ | Interactive
33+ (** Interactive content is content that is specifically intended for user
34+ interaction. *)
35+ | Palpable
36+ (** As a general rule, elements whose content model allows any flow content
37+ or phrasing content should have at least one node in its contents that
38+ is palpable content and that does not have the hidden attribute specified. *)
39+ | Script_supporting
40+ (** Script-supporting elements are those that do not represent anything
41+ themselves (i.e., they are not rendered), but are used to support scripts. *)
42+ | Form_associated
43+ (** Form-associated elements can have a form owner. *)
44+ | Listed
45+ (** Listed form-associated elements have a form attribute that can point
46+ to a form element. *)
47+ | Labelable
48+ (** Labelable form-associated elements can be associated with label elements. *)
49+ | Submittable
50+ (** Submittable form-associated elements can be used for constructing the
51+ entry list when a form element is submitted. *)
52+ | Resettable
53+ (** Resettable form-associated elements are affected when a form element
54+ is reset. *)
55+ | Autocapitalize_inheriting
56+ (** Some elements inherit the autocapitalize attribute from their form owner. *)
57+ | Transparent
58+ (** Transparent content models adopt the content model of their parent
59+ element. *)
60+61+(** {1 Predicates} *)
62+63+val to_string : t -> string
64+(** [to_string category] returns a string representation of the category. *)
65+66+val compare : t -> t -> int
67+(** [compare c1 c2] compares two categories for ordering. *)
68+69+val equal : t -> t -> bool
70+(** [equal c1 c2] returns [true] if the categories are equal. *)
···1+(** Content model checker.
2+3+ Validates that HTML elements conform to their content model specifications.
4+5+ The content model checker performs structural validation of HTML documents
6+ by ensuring that:
7+8+ - Element children match the element's declared content model
9+ - No prohibited ancestor relationships exist (e.g., no [<a>] inside [<a>])
10+ - Void elements contain no children
11+ - Required children are present where mandated
12+13+ {2 Content Model Validation}
14+15+ The checker validates content models by:
16+17+ 1. Looking up the element specification in the registry
18+ 2. Checking each child element or text node against the content model
19+ 3. Tracking the ancestor stack to detect prohibited relationships
20+ 4. Emitting appropriate errors or warnings for violations
21+22+ {2 Usage Example}
23+24+ {[
25+ let checker = Content_checker.create (Message_collector.create ()) in
26+ let module C = (val checker : Checker.S) in
27+ let state = C.create () in
28+29+ (* Walk the DOM tree *)
30+ C.start_element state ~name:"div" ~namespace:None ~attrs:[] collector;
31+ C.characters state "Hello, world!" collector;
32+ C.end_element state ~name:"div" ~namespace:None collector;
33+ C.end_document state collector
34+ ]}
35+*)
36+37+(** Include the standard checker signature. *)
38+include Checker.S
39+40+(** {1 Creation} *)
41+42+val create_with_registry : ?registry:Element_registry.t -> Message_collector.t -> state
43+(** [create_with_registry ?registry collector] creates a content checker with an
44+ optional custom element registry.
45+46+ If no registry is provided, uses {!Element_registry.default}.
47+48+ @param registry Custom element registry (defaults to standard HTML5 elements)
49+ @param collector Message collector for validation messages *)
50+51+(** {1 First-Class Module} *)
52+53+val checker : Checker.t
54+(** [checker] is the content checker packaged as a first-class module.
55+56+ This allows the content checker to be used in checker registries and
57+ other contexts that work with heterogeneous checker collections. *)
···1+type t =
2+ | Nothing
3+ | Text
4+ | Transparent
5+ | Categories of Content_category.t list
6+ | Elements of string list
7+ | Mixed of Content_category.t list
8+ | One_or_more of t
9+ | Zero_or_more of t
10+ | Optional of t
11+ | Sequence of t list
12+ | Choice of t list
13+ | Except of t * Content_category.t list
14+15+let rec pp fmt = function
16+ | Nothing -> Format.fprintf fmt "Nothing"
17+ | Text -> Format.fprintf fmt "Text"
18+ | Transparent -> Format.fprintf fmt "Transparent"
19+ | Categories cats ->
20+ Format.fprintf fmt "Categories [%a]"
21+ (Format.pp_print_list
22+ ~pp_sep:(fun fmt () -> Format.fprintf fmt "; ")
23+ Content_category.pp)
24+ cats
25+ | Elements elems ->
26+ Format.fprintf fmt "Elements [%a]"
27+ (Format.pp_print_list
28+ ~pp_sep:(fun fmt () -> Format.fprintf fmt "; ")
29+ Format.pp_print_string)
30+ elems
31+ | Mixed cats ->
32+ Format.fprintf fmt "Mixed [%a]"
33+ (Format.pp_print_list
34+ ~pp_sep:(fun fmt () -> Format.fprintf fmt "; ")
35+ Content_category.pp)
36+ cats
37+ | One_or_more t -> Format.fprintf fmt "One_or_more (%a)" pp t
38+ | Zero_or_more t -> Format.fprintf fmt "Zero_or_more (%a)" pp t
39+ | Optional t -> Format.fprintf fmt "Optional (%a)" pp t
40+ | Sequence ts ->
41+ Format.fprintf fmt "Sequence [%a]"
42+ (Format.pp_print_list ~pp_sep:(fun fmt () -> Format.fprintf fmt "; ") pp)
43+ ts
44+ | Choice ts ->
45+ Format.fprintf fmt "Choice [%a]"
46+ (Format.pp_print_list ~pp_sep:(fun fmt () -> Format.fprintf fmt "; ") pp)
47+ ts
48+ | Except (t, cats) ->
49+ Format.fprintf fmt "Except (%a, [%a])" pp t
50+ (Format.pp_print_list
51+ ~pp_sep:(fun fmt () -> Format.fprintf fmt "; ")
52+ Content_category.pp)
53+ cats
54+55+let to_string t =
56+ let buf = Buffer.create 256 in
57+ let fmt = Format.formatter_of_buffer buf in
58+ pp fmt t;
59+ Format.pp_print_flush fmt ();
60+ Buffer.contents buf
+21
lib/html5_checker/content_model/content_model.mli
···000000000000000000000
···1+(** HTML5 element content models.
2+3+ Defines what children an element can contain. *)
4+5+type t =
6+ | Nothing (** No children allowed (void elements) *)
7+ | Text (** Text only (no elements) *)
8+ | Transparent (** Inherits parent's content model *)
9+ | Categories of Content_category.t list (** Elements from categories *)
10+ | Elements of string list (** Specific elements only *)
11+ | Mixed of Content_category.t list (** Text + elements from categories *)
12+ | One_or_more of t (** At least one child matching *)
13+ | Zero_or_more of t (** Any number of children matching *)
14+ | Optional of t (** Zero or one child matching *)
15+ | Sequence of t list (** Ordered sequence *)
16+ | Choice of t list (** Any one of *)
17+ | Except of t * Content_category.t list (** t except categories *)
18+19+val pp : Format.formatter -> t -> unit
20+21+val to_string : t -> string
···1+(** Registry for HTML5 element specifications.
2+3+ Provides fast lookup of element specs by name. *)
4+5+(** The type of an element registry. *)
6+type t
7+8+(** {1 Creation and Modification} *)
9+10+val create : unit -> t
11+(** [create ()] creates a new empty element registry. *)
12+13+val register : t -> Element_spec.t -> unit
14+(** [register registry spec] adds an element specification to the registry.
15+16+ If an element with the same name already exists, it is replaced. *)
17+18+(** {1 Lookup} *)
19+20+val get : t -> string -> Element_spec.t option
21+(** [get registry name] looks up an element specification by tag name.
22+23+ Returns [None] if the element is not registered. Tag names are
24+ case-insensitive. *)
25+26+val list_names : t -> string list
27+(** [list_names registry] returns a sorted list of all registered element names. *)
28+29+val all : t -> Element_spec.t list
30+(** [all registry] returns all registered element specifications. *)
31+32+(** {1 Default Registry} *)
33+34+val default : unit -> t
35+(** [default ()] creates a registry pre-populated with all standard HTML5 elements.
36+37+ The registry includes elements from:
38+ - {!Elements_document} - Document structure and sectioning
39+ - {!Elements_text} - Text-level semantics
40+ - {!Elements_form} - Forms and input controls
41+ - {!Elements_embedded} - Embedded content
42+ - {!Elements_table} - Tables
43+ - {!Elements_interactive} - Interactive elements *)
···1+(** HTML5 structural and document element specifications.
2+3+ This module defines element specifications for HTML5 document structure,
4+ sectioning, and grouping elements according to the WHATWG HTML specification.
5+6+ @see <https://html.spec.whatwg.org/multipage/> WHATWG HTML Specification *)
7+8+(** {1 Document structure elements} *)
9+10+val html : Element_spec.t
11+(** The [html] element represents the root of an HTML document. *)
12+13+val head : Element_spec.t
14+(** The [head] element represents a collection of metadata for the document. *)
15+16+val body : Element_spec.t
17+(** The [body] element represents the contents of the document. *)
18+19+val title : Element_spec.t
20+(** The [title] element represents the document's title or name. *)
21+22+val base : Element_spec.t
23+(** The [base] element specifies the document base URL and/or default browsing
24+ context for navigation. *)
25+26+val link : Element_spec.t
27+(** The [link] element specifies relationships between the current document and
28+ external resources. *)
29+30+val meta : Element_spec.t
31+(** The [meta] element represents various kinds of metadata that cannot be
32+ expressed using other metadata elements. *)
33+34+val style : Element_spec.t
35+(** The [style] element allows authors to embed CSS style sheets in their documents. *)
36+37+(** {1 Sectioning elements} *)
38+39+val article : Element_spec.t
40+(** The [article] element represents a complete, or self-contained, composition
41+ in a document, page, application, or site. *)
42+43+val section : Element_spec.t
44+(** The [section] element represents a generic section of a document or application. *)
45+46+val nav : Element_spec.t
47+(** The [nav] element represents a section of a page that links to other pages
48+ or to parts within the page. *)
49+50+val aside : Element_spec.t
51+(** The [aside] element represents a section of a page that consists of content
52+ that is tangentially related to the content around it. *)
53+54+val h1 : Element_spec.t
55+(** The [h1] element represents a heading at level 1. *)
56+57+val h2 : Element_spec.t
58+(** The [h2] element represents a heading at level 2. *)
59+60+val h3 : Element_spec.t
61+(** The [h3] element represents a heading at level 3. *)
62+63+val h4 : Element_spec.t
64+(** The [h4] element represents a heading at level 4. *)
65+66+val h5 : Element_spec.t
67+(** The [h5] element represents a heading at level 5. *)
68+69+val h6 : Element_spec.t
70+(** The [h6] element represents a heading at level 6. *)
71+72+val hgroup : Element_spec.t
73+(** The [hgroup] element represents a heading and related content, such as
74+ subheadings, an alternative title, or a tagline. *)
75+76+val header : Element_spec.t
77+(** The [header] element represents introductory content for its nearest ancestor
78+ sectioning content or sectioning root element. *)
79+80+val footer : Element_spec.t
81+(** The [footer] element represents a footer for its nearest ancestor sectioning
82+ content or sectioning root element. *)
83+84+val address : Element_spec.t
85+(** The [address] element represents contact information for its nearest [article]
86+ or [body] element ancestor. *)
87+88+val main : Element_spec.t
89+(** The [main] element represents the dominant contents of the document. *)
90+91+(** {1 Grouping elements} *)
92+93+val p : Element_spec.t
94+(** The [p] element represents a paragraph. *)
95+96+val hr : Element_spec.t
97+(** The [hr] element represents a thematic break between paragraph-level elements. *)
98+99+val pre : Element_spec.t
100+(** The [pre] element represents a block of preformatted text. *)
101+102+val blockquote : Element_spec.t
103+(** The [blockquote] element represents a section that is quoted from another source. *)
104+105+val ol : Element_spec.t
106+(** The [ol] element represents a list of items, where the items have been
107+ intentionally ordered. *)
108+109+val ul : Element_spec.t
110+(** The [ul] element represents a list of items, where the order of the items
111+ is not important. *)
112+113+val menu : Element_spec.t
114+(** The [menu] element represents a toolbar consisting of its contents, in the
115+ form of an unordered list of items. *)
116+117+val li : Element_spec.t
118+(** The [li] element represents a list item. *)
119+120+val dl : Element_spec.t
121+(** The [dl] element represents an association list consisting of zero or more
122+ name-value groups (a description list). *)
123+124+val dt : Element_spec.t
125+(** The [dt] element represents the term, or name, part of a term-description
126+ group in a description list. *)
127+128+val dd : Element_spec.t
129+(** The [dd] element represents the description, definition, or value, part of
130+ a term-description group in a description list. *)
131+132+val figure : Element_spec.t
133+(** The [figure] element represents some flow content, optionally with a caption,
134+ that is self-contained and is typically referenced as a single unit from
135+ the main flow of the document. *)
136+137+val figcaption : Element_spec.t
138+(** The [figcaption] element represents a caption or legend for the rest of the
139+ contents of the parent [figure] element. *)
140+141+val div : Element_spec.t
142+(** The [div] element has no special meaning at all. It represents its children. *)
143+144+(** {1 Element registry} *)
145+146+val all : Element_spec.t list
147+(** [all] contains all element specifications defined in this module. *)
···1+(** HTML5 embedded content element specifications.
2+3+ Embedded content elements import resources into the document.
4+ See https://html.spec.whatwg.org/multipage/embedded-content.html *)
5+6+val picture : Element_spec.t
7+(** The picture element contains zero or more source elements followed by
8+ one img element to offer alternative versions of an image for different
9+ display scenarios. *)
10+11+val source : Element_spec.t
12+(** The source element specifies multiple media resources for picture, audio,
13+ and video elements. It is a void element. *)
14+15+val img : Element_spec.t
16+(** The img element represents an image. It is a void element. *)
17+18+val iframe : Element_spec.t
19+(** The iframe element represents a nested browsing context. *)
20+21+val embed : Element_spec.t
22+(** The embed element provides an integration point for an external application
23+ or interactive content. It is a void element. *)
24+25+val object_ : Element_spec.t
26+(** The object element can represent an external resource, which is treated as
27+ an image, a nested browsing context, or a plugin. *)
28+29+val param : Element_spec.t
30+(** The param element defines parameters for plugins invoked by object elements.
31+ Deprecated in favor of using data attributes. It is a void element. *)
32+33+val video : Element_spec.t
34+(** The video element is used for playing videos or movies, and audio files
35+ with captions. *)
36+37+val audio : Element_spec.t
38+(** The audio element represents a sound or audio stream. *)
39+40+val track : Element_spec.t
41+(** The track element allows authors to specify explicit external timed text
42+ tracks for media elements. It is a void element. *)
43+44+val map : Element_spec.t
45+(** The map element, in conjunction with img and area elements, defines an
46+ image map. *)
47+48+val area : Element_spec.t
49+(** The area element represents either a hyperlink with some text and a
50+ corresponding area on an image map, or a dead area on an image map.
51+ It is a void element. *)
52+53+val all : Element_spec.t list
54+(** List of all embedded content element specifications. *)
···1+(** HTML5 form element specifications.
2+3+ Form-associated elements for user input and form submission.
4+ See https://html.spec.whatwg.org/multipage/forms.html *)
5+6+(** {1 Form Container} *)
7+8+val form : Element_spec.t
9+(** The form element represents a hyperlink that can be manipulated through a
10+ collection of form-associated elements, some of which can represent editable
11+ values that can be submitted to a server for processing.
12+13+ Content model: Flow content, but must not contain form element descendants. *)
14+15+(** {1 Form Controls} *)
16+17+val label : Element_spec.t
18+(** The label element represents a caption in a user interface. The caption can
19+ be associated with a specific form control, known as the label element's
20+ labeled control.
21+22+ Content model: Phrasing content, but must not contain descendant label
23+ elements, and must not contain form control descendants other than the
24+ labeled control. *)
25+26+val input : Element_spec.t
27+(** The input element represents a typed data field, usually with a form control
28+ to allow the user to edit the data. It is a void element.
29+30+ Content model: Nothing (void element).
31+32+ The type attribute controls the data type (and associated control) of the
33+ element. *)
34+35+val button : Element_spec.t
36+(** The button element represents a button labeled by its contents.
37+38+ Content model: Phrasing content, but must not contain interactive content
39+ descendants. *)
40+41+val select : Element_spec.t
42+(** The select element represents a control for selecting amongst a set of
43+ options.
44+45+ Content model: Zero or more option, optgroup, and script-supporting elements. *)
46+47+val datalist : Element_spec.t
48+(** The datalist element represents a set of option elements that represent
49+ predefined options for other controls. In the rendering, the datalist element
50+ represents nothing and it, along with its children, should be hidden.
51+52+ Content model: Either phrasing content or zero or more option and
53+ script-supporting elements. *)
54+55+val optgroup : Element_spec.t
56+(** The optgroup element represents a group of option elements with a common
57+ label.
58+59+ Content model: Zero or more option and script-supporting elements. *)
60+61+val option : Element_spec.t
62+(** The option element represents an option in a select element or as part of a
63+ list of suggestions in a datalist element.
64+65+ Content model: Text, or empty if label attribute is present. *)
66+67+val textarea : Element_spec.t
68+(** The textarea element represents a multiline plain text edit control for the
69+ element's raw value.
70+71+ Content model: Text. *)
72+73+val output : Element_spec.t
74+(** The output element represents the result of a calculation performed by the
75+ application, or the result of a user action.
76+77+ Content model: Phrasing content. *)
78+79+val progress : Element_spec.t
80+(** The progress element represents the completion progress of a task.
81+82+ Content model: Phrasing content, but must not contain progress element
83+ descendants. *)
84+85+val meter : Element_spec.t
86+(** The meter element represents a scalar measurement within a known range, or a
87+ fractional value; for example disk usage, the relevance of a query result, or
88+ the fraction of a voting population to have selected a particular candidate.
89+90+ Content model: Phrasing content, but must not contain meter element
91+ descendants. *)
92+93+val fieldset : Element_spec.t
94+(** The fieldset element represents a set of form controls (or other content)
95+ grouped together, optionally with a caption.
96+97+ Content model: Optionally a legend element, followed by flow content. *)
98+99+val legend : Element_spec.t
100+(** The legend element represents a caption for the rest of the contents of the
101+ legend element's parent fieldset element, if any.
102+103+ Content model: Phrasing content, and optionally intermixed with heading
104+ content. *)
105+106+(** {1 Element List} *)
107+108+val all : Element_spec.t list
109+(** List of all form element specifications. *)
···1+(** HTML5 interactive and scripting element specifications.
2+3+ Interactive elements are specifically intended for user interaction,
4+ and scripting elements support scripts in the document.
5+ See https://html.spec.whatwg.org/multipage/interactive-elements.html
6+ and https://html.spec.whatwg.org/multipage/scripting.html *)
7+8+val details : Element_spec.t
9+(** The details element represents a disclosure widget from which the user can
10+ obtain additional information or controls. *)
11+12+val summary : Element_spec.t
13+(** The summary element represents a summary, caption, or legend for the rest
14+ of the contents of the summary element's parent details element. *)
15+16+val dialog : Element_spec.t
17+(** The dialog element represents a part of an application that a user
18+ interacts with to perform a task, such as a dialog box or modal. *)
19+20+val script : Element_spec.t
21+(** The script element allows authors to include dynamic script and data blocks
22+ in their documents. *)
23+24+val noscript : Element_spec.t
25+(** The noscript element represents fallback content for when scripting is
26+ disabled or not supported. *)
27+28+val template : Element_spec.t
29+(** The template element is used to declare fragments of HTML that can be cloned
30+ and inserted in the document by script. *)
31+32+val slot : Element_spec.t
33+(** The slot element is used as a placeholder inside a web component that users
34+ can fill with their own markup. *)
35+36+val canvas : Element_spec.t
37+(** The canvas element provides scripts with a resolution-dependent bitmap
38+ canvas for rendering graphs, game graphics, art, or other visual images. *)
39+40+val all : Element_spec.t list
41+(** List of all interactive and scripting element specifications. *)
···1+(** HTML5 table element specifications.
2+3+ Table elements represent data with more than one dimension.
4+ See https://html.spec.whatwg.org/multipage/tables.html *)
5+6+val table : Element_spec.t
7+(** The table element represents data with more than one dimension in the form
8+ of a table. *)
9+10+val caption : Element_spec.t
11+(** The caption element represents the title of the table. *)
12+13+val colgroup : Element_spec.t
14+(** The colgroup element represents a group of one or more columns in the table. *)
15+16+val col : Element_spec.t
17+(** The col element represents one or more columns in the table. It is a void
18+ element. *)
19+20+val tbody : Element_spec.t
21+(** The tbody element represents a block of rows that consist of a body of data
22+ for the table. *)
23+24+val thead : Element_spec.t
25+(** The thead element represents the block of rows that consist of the column
26+ labels (headers) for the table. *)
27+28+val tfoot : Element_spec.t
29+(** The tfoot element represents the block of rows that consist of the column
30+ summaries (footers) for the table. *)
31+32+val tr : Element_spec.t
33+(** The tr element represents a row of cells in a table. *)
34+35+val td : Element_spec.t
36+(** The td element represents a data cell in a table. *)
37+38+val th : Element_spec.t
39+(** The th element represents a header cell in a table. *)
40+41+val all : Element_spec.t list
42+(** List of all table element specifications. *)
···1+(** HTML5 text-level and edit element specifications.
2+3+ Text-level semantic elements and edit tracking elements.
4+ See https://html.spec.whatwg.org/multipage/text-level-semantics.html
5+ and https://html.spec.whatwg.org/multipage/edits.html *)
6+7+(** {1 Hyperlinks} *)
8+9+val a : Element_spec.t
10+(** The a element represents a hyperlink. When it has an href attribute, it
11+ represents a hyperlink (a hypertext anchor) labeled by its contents.
12+13+ Content model: Transparent, but must not contain interactive content
14+ descendants. *)
15+16+(** {1 Text-level Semantics} *)
17+18+val em : Element_spec.t
19+(** The em element represents stress emphasis of its contents. The level of
20+ emphasis is given by its number of ancestor em elements.
21+22+ Content model: Phrasing content. *)
23+24+val strong : Element_spec.t
25+(** The strong element represents strong importance, seriousness, or urgency
26+ for its contents.
27+28+ Content model: Phrasing content. *)
29+30+val small : Element_spec.t
31+(** The small element represents side comments such as small print. Small print
32+ typically features disclaimers, caveats, legal restrictions, or copyrights.
33+34+ Content model: Phrasing content. *)
35+36+val s : Element_spec.t
37+(** The s element represents contents that are no longer accurate or no longer
38+ relevant.
39+40+ Content model: Phrasing content. *)
41+42+val cite : Element_spec.t
43+(** The cite element represents the title of a work (e.g. a book, a paper,
44+ an essay, a poem, a score, a song, a script, a film, a TV show, a game,
45+ a sculpture, a painting, a theatre production, a play, an opera, a musical,
46+ an exhibition, a legal case report, a computer program, etc).
47+48+ Content model: Phrasing content. *)
49+50+val q : Element_spec.t
51+(** The q element represents some phrasing content quoted from another source.
52+53+ Content model: Phrasing content. *)
54+55+val dfn : Element_spec.t
56+(** The dfn element represents the defining instance of a term.
57+58+ Content model: Phrasing content, but must not contain dfn element descendants. *)
59+60+val abbr : Element_spec.t
61+(** The abbr element represents an abbreviation or acronym, optionally with its
62+ expansion. The title attribute may be used to provide an expansion of the
63+ abbreviation.
64+65+ Content model: Phrasing content. *)
66+67+val ruby : Element_spec.t
68+(** The ruby element allows one or more spans of phrasing content to be marked
69+ with ruby annotations. Ruby annotations are short runs of text presented
70+ alongside base text, primarily used in East Asian typography.
71+72+ Content model: Phrasing content, but must contain at least one rt or rp
73+ element. *)
74+75+val rt : Element_spec.t
76+(** The rt element marks the ruby text component of a ruby annotation.
77+78+ Content model: Phrasing content. *)
79+80+val rp : Element_spec.t
81+(** The rp element is used to provide fallback parentheses for browsers that
82+ don't support ruby annotations.
83+84+ Content model: Text, or phrasing content that represents what can be used
85+ as fallback in annotations. *)
86+87+val data : Element_spec.t
88+(** The data element represents its contents, along with a machine-readable
89+ form of those contents in the value attribute.
90+91+ Content model: Phrasing content. *)
92+93+val time : Element_spec.t
94+(** The time element represents its contents, along with a machine-readable
95+ form of those contents in the datetime attribute.
96+97+ Content model: Phrasing content, but must not contain time element descendants. *)
98+99+val code : Element_spec.t
100+(** The code element represents a fragment of computer code.
101+102+ Content model: Phrasing content. *)
103+104+val var : Element_spec.t
105+(** The var element represents a variable in a mathematical expression or a
106+ programming context.
107+108+ Content model: Phrasing content. *)
109+110+val samp : Element_spec.t
111+(** The samp element represents sample or quoted output from another program
112+ or computing system.
113+114+ Content model: Phrasing content. *)
115+116+val kbd : Element_spec.t
117+(** The kbd element represents user input (typically keyboard input, although
118+ it may also be used to represent other input, such as voice commands).
119+120+ Content model: Phrasing content. *)
121+122+val sub : Element_spec.t
123+(** The sub element represents a subscript.
124+125+ Content model: Phrasing content. *)
126+127+val sup : Element_spec.t
128+(** The sup element represents a superscript.
129+130+ Content model: Phrasing content. *)
131+132+val i : Element_spec.t
133+(** The i element represents a span of text in an alternate voice or mood, or
134+ otherwise offset from the normal prose in a manner indicating a different
135+ quality of text.
136+137+ Content model: Phrasing content. *)
138+139+val b : Element_spec.t
140+(** The b element represents a span of text to which attention is being drawn
141+ for utilitarian purposes without conveying any extra importance and with no
142+ implication of an alternate voice or mood.
143+144+ Content model: Phrasing content. *)
145+146+val u : Element_spec.t
147+(** The u element represents a span of text with an unarticulated, though
148+ explicitly rendered, non-textual annotation, such as labeling the text as
149+ being a proper name in Chinese text or labeling the text as being
150+ misspelt.
151+152+ Content model: Phrasing content. *)
153+154+val mark : Element_spec.t
155+(** The mark element represents a run of text in one document marked or
156+ highlighted for reference purposes, due to its relevance in another context.
157+158+ Content model: Phrasing content. *)
159+160+val bdi : Element_spec.t
161+(** The bdi element represents a span of text that is to be isolated from its
162+ surroundings for the purposes of bidirectional text formatting.
163+164+ Content model: Phrasing content. *)
165+166+val bdo : Element_spec.t
167+(** The bdo element represents explicit text directionality formatting control
168+ for its children. It allows authors to override the Unicode bidirectional
169+ algorithm.
170+171+ Content model: Phrasing content. *)
172+173+val span : Element_spec.t
174+(** The span element doesn't mean anything on its own, but can be useful when
175+ used together with the global attributes, e.g. class, lang, or dir.
176+177+ Content model: Phrasing content. *)
178+179+val br : Element_spec.t
180+(** The br element represents a line break. It is a void element.
181+182+ Content model: Nothing (void element). *)
183+184+val wbr : Element_spec.t
185+(** The wbr element represents a line break opportunity. It is a void element.
186+187+ Content model: Nothing (void element). *)
188+189+(** {1 Edits} *)
190+191+val ins : Element_spec.t
192+(** The ins element represents an addition to the document.
193+194+ Content model: Transparent. *)
195+196+val del : Element_spec.t
197+(** The del element represents a removal from the document.
198+199+ Content model: Transparent. *)
200+201+(** {1 Element List} *)
202+203+val all : Element_spec.t list
204+(** List of all text-level and edit element specifications. *)
lib/html5_checker/datatype/datatype.cmi
This is a binary file and will not be displayed.
+42
lib/html5_checker/datatype/datatype.ml
···000000000000000000000000000000000000000000
···1+module type S = sig
2+ val name : string
3+ val validate : string -> (unit, string) result
4+ val is_valid : string -> bool
5+end
6+7+type t = (module S)
8+9+let name (module D : S) = D.name
10+let validate (module D : S) s = D.validate s
11+let is_valid (module D : S) s = D.is_valid s
12+13+(* Helper utilities *)
14+15+let is_whitespace = function
16+ | ' ' | '\t' | '\n' | '\r' | '\012' (* FF *) -> true
17+ | _ -> false
18+19+let is_ascii_digit = function '0' .. '9' -> true | _ -> false
20+21+let to_ascii_lowercase c =
22+ match c with 'A' .. 'Z' -> Char.chr (Char.code c + 32) | _ -> c
23+24+let string_to_ascii_lowercase s =
25+ String.map to_ascii_lowercase s
26+27+let trim_html_spaces s =
28+ let len = String.length s in
29+ let rec find_start i =
30+ if i >= len then len
31+ else if is_whitespace s.[i] then find_start (i + 1)
32+ else i
33+ in
34+ let rec find_end i =
35+ if i < 0 then -1
36+ else if is_whitespace s.[i] then find_end (i - 1)
37+ else i
38+ in
39+ let start = find_start 0 in
40+ let end_pos = find_end (len - 1) in
41+ if start > end_pos then ""
42+ else String.sub s start (end_pos - start + 1)
+45
lib/html5_checker/datatype/datatype.mli
···000000000000000000000000000000000000000000000
···1+(** HTML5 datatype validation.
2+3+ This module provides the base interface for HTML5 attribute datatype
4+ validators. Each datatype validates string values according to HTML5 spec. *)
5+6+(** A datatype validator *)
7+module type S = sig
8+ (** Name of this datatype (e.g., "integer", "url") *)
9+ val name : string
10+11+ (** Validate a string value. Returns Ok () if valid, Error message otherwise *)
12+ val validate : string -> (unit, string) result
13+14+ (** Check if value is valid (convenience function) *)
15+ val is_valid : string -> bool
16+end
17+18+(** A datatype packed as a first-class module *)
19+type t = (module S)
20+21+(** Get the name of a datatype *)
22+val name : t -> string
23+24+(** Validate a value with a datatype *)
25+val validate : t -> string -> (unit, string) result
26+27+(** Check if a value is valid *)
28+val is_valid : t -> string -> bool
29+30+(** Helper utilities for implementing datatype validators. *)
31+32+(** Check if a character is HTML5 whitespace (space, tab, LF, FF, or CR). *)
33+val is_whitespace : char -> bool
34+35+(** Check if a character is an ASCII digit (0-9). *)
36+val is_ascii_digit : char -> bool
37+38+(** Convert an ASCII character to lowercase. *)
39+val to_ascii_lowercase : char -> char
40+41+(** Convert an ASCII string to lowercase. *)
42+val string_to_ascii_lowercase : string -> string
43+44+(** Trim HTML5 whitespace from both ends of a string. *)
45+val trim_html_spaces : string -> string
+31
lib/html5_checker/datatype/datatype_registry.ml
···0000000000000000000000000000000
···1+type t = { datatypes : (string, Datatype.t) Hashtbl.t }
2+3+let create () = { datatypes = Hashtbl.create 16 }
4+5+let register t dt =
6+ let name = Datatype.name dt in
7+ Hashtbl.replace t.datatypes name dt
8+9+let get t name = Hashtbl.find_opt t.datatypes name
10+11+let list_names t =
12+ Hashtbl.fold (fun name _ acc -> name :: acc) t.datatypes []
13+ |> List.sort String.compare
14+15+let default =
16+ let registry = ref None in
17+ fun () ->
18+ match !registry with
19+ | Some r -> r
20+ | None ->
21+ let r = create () in
22+ (* Register built-in datatypes *)
23+ register r (module Dt_integer.Integer : Datatype.S);
24+ register r (module Dt_integer.Integer_non_negative : Datatype.S);
25+ register r (module Dt_integer.Integer_positive : Datatype.S);
26+ register r (module Dt_float.Float_ : Datatype.S);
27+ register r (module Dt_float.Float_non_negative : Datatype.S);
28+ register r (module Dt_float.Float_positive : Datatype.S);
29+ register r (module Dt_boolean.Boolean : Datatype.S);
30+ registry := Some r;
31+ r
+19
lib/html5_checker/datatype/datatype_registry.mli
···0000000000000000000
···1+(** Registry for HTML5 datatypes *)
2+3+(** Registry type that holds datatypes indexed by name *)
4+type t
5+6+(** Create a new empty registry *)
7+val create : unit -> t
8+9+(** Register a datatype in the registry *)
10+val register : t -> Datatype.t -> unit
11+12+(** Get a datatype by name. Returns None if not found *)
13+val get : t -> string -> Datatype.t option
14+15+(** List all registered datatype names *)
16+val list_names : t -> string list
17+18+(** Default registry with all built-in datatypes *)
19+val default : unit -> t
···1+(** Autocomplete attribute validation based on HTML5 spec *)
2+3+(** Check if character is whitespace *)
4+let is_whitespace c = c = ' ' || c = '\t' || c = '\n' || c = '\r'
5+6+(** Convert character to ASCII lowercase *)
7+let to_ascii_lowercase c =
8+ if c >= 'A' && c <= 'Z' then Char.chr (Char.code c + 32) else c
9+10+(** Trim whitespace from string *)
11+let trim_whitespace s =
12+ let s = String.trim s in
13+ (* Also collapse internal whitespace *)
14+ let buf = Buffer.create (String.length s) in
15+ let in_space = ref false in
16+ String.iter
17+ (fun c ->
18+ if is_whitespace c then
19+ if not !in_space then (
20+ Buffer.add_char buf ' ';
21+ in_space := true)
22+ else (
23+ Buffer.add_char buf (to_ascii_lowercase c);
24+ in_space := false))
25+ s;
26+ Buffer.contents buf
27+28+(** Contact type tokens *)
29+let contact_types = [ "home"; "work"; "mobile"; "fax"; "pager" ]
30+31+(** All autofill field names *)
32+let all_field_names =
33+ [
34+ "name";
35+ "honorific-prefix";
36+ "given-name";
37+ "additional-name";
38+ "family-name";
39+ "honorific-suffix";
40+ "nickname";
41+ "organization-title";
42+ "username";
43+ "new-password";
44+ "current-password";
45+ "one-time-code";
46+ "organization";
47+ "street-address";
48+ "address-line1";
49+ "address-line2";
50+ "address-line3";
51+ "address-level4";
52+ "address-level3";
53+ "address-level2";
54+ "address-level1";
55+ "country";
56+ "country-name";
57+ "postal-code";
58+ "cc-name";
59+ "cc-given-name";
60+ "cc-additional-name";
61+ "cc-family-name";
62+ "cc-number";
63+ "cc-exp";
64+ "cc-exp-month";
65+ "cc-exp-year";
66+ "cc-csc";
67+ "cc-type";
68+ "transaction-currency";
69+ "transaction-amount";
70+ "language";
71+ "bday";
72+ "bday-day";
73+ "bday-month";
74+ "bday-year";
75+ "sex";
76+ "url";
77+ "photo";
78+ "tel";
79+ "tel-country-code";
80+ "tel-national";
81+ "tel-area-code";
82+ "tel-local";
83+ "tel-local-prefix";
84+ "tel-local-suffix";
85+ "tel-extension";
86+ "email";
87+ "impp";
88+ ]
89+90+(** Contact field names (subset that can be used with contact types) *)
91+let contact_field_names =
92+ [
93+ "tel";
94+ "tel-country-code";
95+ "tel-national";
96+ "tel-area-code";
97+ "tel-local";
98+ "tel-local-prefix";
99+ "tel-local-suffix";
100+ "tel-extension";
101+ "email";
102+ "impp";
103+ ]
104+105+(** Split string on whitespace *)
106+let split_on_whitespace s =
107+ let rec split acc start i =
108+ if i >= String.length s then
109+ if start < i then List.rev (String.sub s start (i - start) :: acc)
110+ else List.rev acc
111+ else if is_whitespace s.[i] then
112+ if start < i then
113+ split (String.sub s start (i - start) :: acc) (i + 1) (i + 1)
114+ else split acc (i + 1) (i + 1)
115+ else split acc start (i + 1)
116+ in
117+ split [] 0 0
118+119+(** Check if string starts with prefix *)
120+let starts_with s prefix =
121+ String.length s >= String.length prefix
122+ && String.sub s 0 (String.length prefix) = prefix
123+124+(** Validate detail tokens *)
125+let check_tokens tokens =
126+ let tokens = ref tokens in
127+ let is_contact_details = ref false in
128+129+ (* Check for section-* *)
130+ (match !tokens with
131+ | token :: rest when starts_with token "section-" ->
132+ tokens := rest
133+ | _ -> ());
134+135+ (* Check for shipping/billing *)
136+ (match !tokens with
137+ | "shipping" :: rest | "billing" :: rest ->
138+ tokens := rest
139+ | _ -> ());
140+141+ (* Check for contact type *)
142+ (match !tokens with
143+ | token :: rest when List.mem token contact_types ->
144+ tokens := rest;
145+ is_contact_details := true
146+ | _ -> ());
147+148+ (* Process remaining tokens *)
149+ let process_field_tokens = function
150+ | [] -> Error "A list of autofill details tokens must contain an autofill field name"
151+ | [ "webauthn" ] ->
152+ Error
153+ "The token \"webauthn\" must not be the only token in a list of \
154+ autofill detail tokens"
155+ | [ field_name ] ->
156+ if not (List.mem field_name all_field_names) then
157+ Error
158+ (Printf.sprintf
159+ "The string \"%s\" is not a valid autofill field name"
160+ field_name)
161+ else if !is_contact_details && not (List.mem field_name contact_field_names)
162+ then
163+ Error
164+ (Printf.sprintf
165+ "The autofill field name \"%s\" is not allowed in contact \
166+ context"
167+ field_name)
168+ else Ok ()
169+ | [ field_name; "webauthn" ] ->
170+ if not (List.mem field_name all_field_names) then
171+ Error
172+ (Printf.sprintf
173+ "The string \"%s\" is not a valid autofill field name"
174+ field_name)
175+ else if !is_contact_details && not (List.mem field_name contact_field_names)
176+ then
177+ Error
178+ (Printf.sprintf
179+ "The autofill field name \"%s\" is not allowed in contact \
180+ context"
181+ field_name)
182+ else Ok ()
183+ | token :: _ when List.mem token contact_types ->
184+ Error
185+ (Printf.sprintf
186+ "The token \"%s\" must only appear before any autofill field names"
187+ token)
188+ | token :: _ when starts_with token "section-" ->
189+ Error
190+ "A \"section-*\" indicator must only appear as the first token in a \
191+ list of autofill detail tokens"
192+ | "shipping" :: _ | "billing" :: _ as toks ->
193+ Error
194+ (Printf.sprintf
195+ "The token \"%s\" must only appear as either the first token in a \
196+ list of autofill detail tokens, or, if the first token is a \
197+ \"section-*\" indicator, as the second token"
198+ (List.hd toks))
199+ | _ :: "webauthn" :: _ :: _ ->
200+ Error
201+ "The token \"webauthn\" must only appear as the very last token in a \
202+ list of autofill detail tokens"
203+ | _ :: _ :: _ ->
204+ Error
205+ "A list of autofill details tokens must not contain more than one \
206+ autofill field name"
207+ in
208+ process_field_tokens !tokens
209+210+(** Validate autocomplete value *)
211+let validate_autocomplete s =
212+ let trimmed = trim_whitespace s in
213+ if String.length trimmed = 0 then Error "Must not be empty"
214+ else if trimmed = "on" || trimmed = "off" then Ok ()
215+ else
216+ let tokens = split_on_whitespace trimmed in
217+ check_tokens tokens
218+219+module Autocomplete = struct
220+ let name = "autocomplete"
221+ let validate = validate_autocomplete
222+223+ let is_valid s =
224+ match validate s with
225+ | Ok () -> true
226+ | Error _ -> false
227+end
228+229+let datatypes = [ (module Autocomplete : Datatype.S) ]
+41
lib/html5_checker/datatype/dt_autocomplete.mli
···00000000000000000000000000000000000000000
···1+(** Autocomplete attribute datatype validator.
2+3+ This module provides a validator for the autocomplete attribute used on
4+ form fields, as defined by the HTML5 specification. *)
5+6+(** Autocomplete attribute validator.
7+8+ Validates autocomplete attribute values which can be:
9+ - "on" or "off" (simple values)
10+ - Autofill detail tokens in the format:
11+ [section-*] [shipping|billing] [contact-type] field-name [webauthn]
12+13+ Contact types: home, work, mobile, fax, pager
14+15+ Field names include:
16+ - Name fields: name, honorific-prefix, given-name, additional-name,
17+ family-name, honorific-suffix, nickname, organization-title
18+ - Authentication: username, new-password, current-password, one-time-code
19+ - Organization: organization
20+ - Address: street-address, address-line1, address-line2, address-line3,
21+ address-level1, address-level2, address-level3, address-level4,
22+ country, country-name, postal-code
23+ - Credit card: cc-name, cc-given-name, cc-additional-name, cc-family-name,
24+ cc-number, cc-exp, cc-exp-month, cc-exp-year, cc-csc, cc-type
25+ - Transaction: transaction-currency, transaction-amount
26+ - Other: language, bday, bday-day, bday-month, bday-year, sex, url, photo
27+ - Contact: tel, tel-country-code, tel-national, tel-area-code, tel-local,
28+ tel-local-prefix, tel-local-suffix, tel-extension, email, impp
29+30+ Examples:
31+ - "on"
32+ - "off"
33+ - "name"
34+ - "email"
35+ - "shipping street-address"
36+ - "section-blue billing email"
37+ - "work tel" *)
38+module Autocomplete : Datatype.S
39+40+(** List of all datatypes defined in this module *)
41+val datatypes : Datatype.t list
+38
lib/html5_checker/datatype/dt_boolean.ml
···00000000000000000000000000000000000000
···1+(** Boolean attribute validation for HTML5 *)
2+module Boolean = struct
3+ let name = "boolean"
4+5+ let validate s =
6+ match s with
7+ | "" | "true" | "false" -> Ok ()
8+ | _ ->
9+ Error
10+ (Printf.sprintf
11+ "The value '%s' is not a valid boolean. Expected empty string, \
12+ 'true', or 'false'."
13+ s)
14+15+ let is_valid s = Result.is_ok (validate s)
16+17+ let with_name attr_name =
18+ let module M = struct
19+ let name = "boolean"
20+21+ let validate s =
22+ match s with
23+ | "" | "true" | "false" -> Ok ()
24+ | _ ->
25+ let s_lower = Datatype.string_to_ascii_lowercase s in
26+ let attr_lower = Datatype.string_to_ascii_lowercase attr_name in
27+ if s_lower = attr_lower then Ok ()
28+ else
29+ Error
30+ (Printf.sprintf
31+ "The value '%s' is not a valid boolean. Expected empty \
32+ string, 'true', 'false', or '%s'."
33+ s attr_name)
34+35+ let is_valid s = Result.is_ok (validate s)
36+ end in
37+ (module M : Datatype.S)
38+end
+19
lib/html5_checker/datatype/dt_boolean.mli
···0000000000000000000
···1+(** Boolean attribute datatype validator for HTML5 *)
2+3+(** Boolean attribute validation.
4+5+ In HTML5, boolean attributes can have the following values:
6+ - Empty string
7+ - "true"
8+ - "false"
9+ - The attribute name itself (case-insensitive)
10+11+ For attribute-name validation, use [Boolean.with_name]. *)
12+module Boolean : sig
13+ include Datatype.S
14+15+ (** Create a boolean validator that also accepts a specific attribute name.
16+ For example, [with_name "disabled"] will accept "", "true", "false", or
17+ "disabled" (case-insensitive). *)
18+ val with_name : string -> (module Datatype.S)
19+end
+22
lib/html5_checker/datatype/dt_button_type.ml
···0000000000000000000000
···1+(** Button type attribute validation based on HTML5 spec *)
2+3+(** Valid button type values *)
4+let valid_types = [ "submit"; "reset"; "button" ]
5+6+module Button_type = struct
7+ let name = "button-type"
8+9+ let validate s =
10+ let s_lower = Datatype.string_to_ascii_lowercase s in
11+ if List.mem s_lower valid_types then Ok ()
12+ else
13+ Error
14+ (Printf.sprintf
15+ "The value '%s' is not a valid button type. Expected one of: %s."
16+ s
17+ (String.concat ", " valid_types))
18+19+ let is_valid s = Result.is_ok (validate s)
20+end
21+22+let datatypes = [ (module Button_type : Datatype.S) ]
+22
lib/html5_checker/datatype/dt_button_type.mli
···0000000000000000000000
···1+(** Button type attribute datatype validator.
2+3+ This module provides a validator for the type attribute used on
4+ button elements, as defined by the HTML5 specification. *)
5+6+(** Button type attribute validator.
7+8+ Validates button type attribute values which can be:
9+ - submit - Submit button (submits the form)
10+ - reset - Reset button (resets the form)
11+ - button - Push button (no default behavior)
12+13+ Values are matched case-insensitively according to HTML5 spec.
14+15+ Examples:
16+ - "submit"
17+ - "reset"
18+ - "button" *)
19+module Button_type : Datatype.S
20+21+(** List of all datatypes defined in this module *)
22+val datatypes : Datatype.t list
···1+(** Helper functions for charset validation *)
2+3+let is_valid_charset_char c =
4+ (c >= '0' && c <= '9') ||
5+ (c >= 'a' && c <= 'z') ||
6+ (c >= 'A' && c <= 'Z') ||
7+ c = '-' || c = '!' || c = '#' || c = '$' || c = '%' || c = '&' ||
8+ c = '\'' || c = '+' || c = '_' || c = '`' || c = '{' || c = '}' ||
9+ c = '~' || c = '^'
10+11+let to_lower s = String.lowercase_ascii s
12+13+(** Common encoding labels recognized by WHATWG Encoding Standard.
14+ This is a subset of the full list. *)
15+let known_charsets = [
16+ (* UTF-8 *)
17+ "utf-8"; "utf8"; "unicode-1-1-utf-8";
18+ (* Legacy single-byte encodings *)
19+ "iso-8859-1"; "iso8859-1"; "latin1"; "iso-8859-2"; "iso-8859-3";
20+ "iso-8859-4"; "iso-8859-5"; "iso-8859-6"; "iso-8859-7"; "iso-8859-8";
21+ "iso-8859-9"; "iso-8859-10"; "iso-8859-13"; "iso-8859-14"; "iso-8859-15";
22+ "iso-8859-16";
23+ (* Windows code pages *)
24+ "windows-1250"; "windows-1251"; "windows-1252"; "windows-1253";
25+ "windows-1254"; "windows-1255"; "windows-1256"; "windows-1257";
26+ "windows-1258";
27+ (* Other common encodings *)
28+ "us-ascii"; "ascii"; "utf-16"; "utf-16le"; "utf-16be";
29+ "gb2312"; "gbk"; "gb18030"; "big5"; "euc-jp"; "iso-2022-jp";
30+ "shift_jis"; "euc-kr"; "koi8-r"; "koi8-u";
31+ (* Macintosh encodings *)
32+ "macintosh"; "x-mac-roman";
33+]
34+35+(** Check if a charset name is recognized *)
36+let is_known_charset name =
37+ let lower = to_lower name in
38+ List.mem lower known_charsets
39+40+module Charset = struct
41+ let name = "encoding name"
42+43+ let validate s =
44+ if String.length s = 0 then
45+ Error "The empty string is not a valid character encoding name"
46+ else
47+ (* Check all characters are valid *)
48+ let rec check_chars i =
49+ if i >= String.length s then
50+ Ok ()
51+ else
52+ let c = s.[i] in
53+ if not (is_valid_charset_char c) then
54+ Error (Printf.sprintf "Value contained '%c', which is not a valid character in an encoding name" c)
55+ else
56+ check_chars (i + 1)
57+ in
58+ match check_chars 0 with
59+ | Error e -> Error e
60+ | Ok () ->
61+ let lower = to_lower s in
62+ (* Reject "replacement" encoding *)
63+ if lower = "replacement" then
64+ Error (Printf.sprintf "'%s' is not a valid character encoding name" s)
65+ (* Check if it's a known charset *)
66+ else if not (is_known_charset lower) then
67+ Error (Printf.sprintf "'%s' is not a valid character encoding name" s)
68+ else
69+ Ok ()
70+71+ let is_valid s = Result.is_ok (validate s)
72+end
73+74+module Meta_charset = struct
75+ let name = "legacy character encoding declaration"
76+77+ let is_whitespace c =
78+ c = ' ' || c = '\t' || c = '\n' || c = '\012' || c = '\r'
79+80+ let validate s =
81+ let lower = to_lower s in
82+ if not (String.starts_with ~prefix:"text/html;" lower) then
83+ Error "The legacy encoding declaration did not start with 'text/html;'"
84+ else if String.length lower = 10 then
85+ Error "The legacy encoding declaration ended prematurely"
86+ else
87+ (* Skip whitespace after semicolon *)
88+ let rec skip_ws i =
89+ if i >= String.length lower then
90+ Error "The legacy encoding declaration did not contain 'charset=' after the semicolon"
91+ else
92+ let c = lower.[i] in
93+ if is_whitespace c then
94+ skip_ws (i + 1)
95+ else if c = 'c' then
96+ Ok i
97+ else
98+ Error (Printf.sprintf "The legacy encoding declaration did not start with space characters or 'charset=' after the semicolon. Found '%c' instead" c)
99+ in
100+ match skip_ws 10 with
101+ | Error e -> Error e
102+ | Ok offset ->
103+ if not (String.sub lower offset (String.length lower - offset) |> String.starts_with ~prefix:"charset=") then
104+ Error "The legacy encoding declaration did not contain 'charset=' after the semicolon"
105+ else
106+ let charset_offset = offset + 8 in
107+ if charset_offset >= String.length lower then
108+ Error "The empty string is not a valid character encoding name"
109+ else
110+ (* Validate remaining characters *)
111+ let rec check_chars i =
112+ if i >= String.length lower then
113+ Ok ()
114+ else
115+ let c = lower.[i] in
116+ if not (is_valid_charset_char c) then
117+ Error (Printf.sprintf "The legacy encoding contained '%c', which is not a valid character in an encoding name" c)
118+ else
119+ check_chars (i + 1)
120+ in
121+ match check_chars charset_offset with
122+ | Error e -> Error e
123+ | Ok () ->
124+ let encoding_name = String.sub lower charset_offset (String.length lower - charset_offset) in
125+ if encoding_name <> "utf-8" then
126+ Error "'charset=' must be followed by 'utf-8'"
127+ else
128+ Ok ()
129+130+ let is_valid s = Result.is_ok (validate s)
131+end
132+133+let datatypes = [
134+ (module Charset : Datatype.S);
135+ (module Meta_charset : Datatype.S);
136+]
+37
lib/html5_checker/datatype/dt_charset.mli
···0000000000000000000000000000000000000
···1+(** Character encoding datatype validators for HTML5.
2+3+ This module provides validators for character encoding names as used in
4+ HTML5. Encoding names must conform to the WHATWG Encoding Standard. *)
5+6+(** Character encoding name datatype.
7+8+ Validates a character encoding name according to the WHATWG Encoding
9+ Standard. Valid encoding names include:
10+ - UTF-8 (and variants like "utf-8", "utf8")
11+ - Legacy encodings (ISO-8859-1, windows-1252, etc.)
12+13+ The validator checks:
14+ - Non-empty string
15+ - Valid characters (alphanumeric, hyphen, and special chars: ! # $ % & ' + _ ` \{ \} ~ ^)
16+ - Recognizes common encoding labels
17+18+ Note: This is a simplified validator that recognizes common encoding
19+ names but does not include the full WHATWG encoding label table.
20+ It accepts labels case-insensitively. *)
21+module Charset : Datatype.S
22+23+(** Meta charset datatype for legacy encoding declarations.
24+25+ Validates the charset attribute value in legacy meta elements of the form:
26+ <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
27+28+ The validator checks that:
29+ - String starts with "text/html;"
30+ - After optional whitespace, contains "charset="
31+ - The charset value is "utf-8" (the only allowed value in modern HTML5)
32+33+ Example valid value: "text/html; charset=utf-8" *)
34+module Meta_charset : Datatype.S
35+36+(** List of all charset datatypes *)
37+val datatypes : Datatype.t list
···1+(** Color validation *)
2+3+(** Named CSS colors *)
4+let named_colors =
5+ [
6+ "aliceblue";
7+ "antiquewhite";
8+ "aqua";
9+ "aquamarine";
10+ "azure";
11+ "beige";
12+ "bisque";
13+ "black";
14+ "blanchedalmond";
15+ "blue";
16+ "blueviolet";
17+ "brown";
18+ "burlywood";
19+ "cadetblue";
20+ "chartreuse";
21+ "chocolate";
22+ "coral";
23+ "cornflowerblue";
24+ "cornsilk";
25+ "crimson";
26+ "cyan";
27+ "darkblue";
28+ "darkcyan";
29+ "darkgoldenrod";
30+ "darkgray";
31+ "darkgrey";
32+ "darkgreen";
33+ "darkkhaki";
34+ "darkmagenta";
35+ "darkolivegreen";
36+ "darkorange";
37+ "darkorchid";
38+ "darkred";
39+ "darksalmon";
40+ "darkseagreen";
41+ "darkslateblue";
42+ "darkslategray";
43+ "darkslategrey";
44+ "darkturquoise";
45+ "darkviolet";
46+ "deeppink";
47+ "deepskyblue";
48+ "dimgray";
49+ "dimgrey";
50+ "dodgerblue";
51+ "firebrick";
52+ "floralwhite";
53+ "forestgreen";
54+ "fuchsia";
55+ "gainsboro";
56+ "ghostwhite";
57+ "gold";
58+ "goldenrod";
59+ "gray";
60+ "grey";
61+ "green";
62+ "greenyellow";
63+ "honeydew";
64+ "hotpink";
65+ "indianred";
66+ "indigo";
67+ "ivory";
68+ "khaki";
69+ "lavender";
70+ "lavenderblush";
71+ "lawngreen";
72+ "lemonchiffon";
73+ "lightblue";
74+ "lightcoral";
75+ "lightcyan";
76+ "lightgoldenrodyellow";
77+ "lightgray";
78+ "lightgrey";
79+ "lightgreen";
80+ "lightpink";
81+ "lightsalmon";
82+ "lightseagreen";
83+ "lightskyblue";
84+ "lightslategray";
85+ "lightslategrey";
86+ "lightsteelblue";
87+ "lightyellow";
88+ "lime";
89+ "limegreen";
90+ "linen";
91+ "magenta";
92+ "maroon";
93+ "mediumaquamarine";
94+ "mediumblue";
95+ "mediumorchid";
96+ "mediumpurple";
97+ "mediumseagreen";
98+ "mediumslateblue";
99+ "mediumspringgreen";
100+ "mediumturquoise";
101+ "mediumvioletred";
102+ "midnightblue";
103+ "mintcream";
104+ "mistyrose";
105+ "moccasin";
106+ "navajowhite";
107+ "navy";
108+ "oldlace";
109+ "olive";
110+ "olivedrab";
111+ "orange";
112+ "orangered";
113+ "orchid";
114+ "palegoldenrod";
115+ "palegreen";
116+ "paleturquoise";
117+ "palevioletred";
118+ "papayawhip";
119+ "peachpuff";
120+ "peru";
121+ "pink";
122+ "plum";
123+ "powderblue";
124+ "purple";
125+ "red";
126+ "rosybrown";
127+ "royalblue";
128+ "saddlebrown";
129+ "salmon";
130+ "sandybrown";
131+ "seagreen";
132+ "seashell";
133+ "sienna";
134+ "silver";
135+ "skyblue";
136+ "slateblue";
137+ "slategray";
138+ "slategrey";
139+ "snow";
140+ "springgreen";
141+ "steelblue";
142+ "tan";
143+ "teal";
144+ "thistle";
145+ "tomato";
146+ "transparent";
147+ "turquoise";
148+ "violet";
149+ "wheat";
150+ "white";
151+ "whitesmoke";
152+ "yellow";
153+ "yellowgreen";
154+ ]
155+156+(** Check if character is hex digit *)
157+let is_hex_digit c =
158+ (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F')
159+160+(** Validate hex color (#RGB or #RRGGBB) *)
161+let validate_hex_color s =
162+ let len = String.length s in
163+ if len <> 4 && len <> 7 then
164+ Error "Hex color must be #RGB or #RRGGBB format"
165+ else if s.[0] <> '#' then
166+ Error "Hex color must start with '#'"
167+ else
168+ let rec check_hex i =
169+ if i >= len then Ok ()
170+ else if is_hex_digit s.[i] then check_hex (i + 1)
171+ else
172+ Error
173+ (Printf.sprintf "Invalid hex digit '%c' at position %d" s.[i] i)
174+ in
175+ check_hex 1
176+177+(** Simple color validator - strict #RRGGBB format *)
178+module Simple_color = struct
179+ let name = "simple color"
180+181+ let validate s =
182+ let s = String.trim s in
183+ if String.length s <> 7 then
184+ Error "Incorrect length for color string (must be 7 characters)"
185+ else if s.[0] <> '#' then
186+ Error
187+ (Printf.sprintf
188+ "Color starts with incorrect character '%c'. Expected the number \
189+ sign '#'"
190+ s.[0])
191+ else
192+ let rec check_hex i =
193+ if i >= 7 then Ok ()
194+ else if is_hex_digit s.[i] then check_hex (i + 1)
195+ else
196+ Error
197+ (Printf.sprintf "'%c' is not a valid hexadecimal digit" s.[i])
198+ in
199+ check_hex 1
200+201+ let is_valid s =
202+ match validate s with
203+ | Ok () -> true
204+ | Error _ -> false
205+end
206+207+(** CSS color validator - supports multiple formats *)
208+module Color = struct
209+ let name = "color"
210+211+ let validate s =
212+ let s = String.trim s |> String.lowercase_ascii in
213+ if String.length s = 0 then Error "Color value must not be empty"
214+ else if List.mem s named_colors then Ok ()
215+ else if String.length s > 0 && s.[0] = '#' then validate_hex_color s
216+ else if
217+ String.length s > 4
218+ && (String.sub s 0 4 = "rgb(" || String.sub s 0 5 = "rgba(")
219+ then
220+ (* Basic validation for rgb/rgba - just check balanced parens *)
221+ if s.[String.length s - 1] = ')' then Ok ()
222+ else Error "rgb/rgba function must end with ')'"
223+ else if
224+ String.length s > 4
225+ && (String.sub s 0 4 = "hsl(" || String.sub s 0 5 = "hsla(")
226+ then
227+ (* Basic validation for hsl/hsla - just check balanced parens *)
228+ if s.[String.length s - 1] = ')' then Ok ()
229+ else Error "hsl/hsla function must end with ')'"
230+ else
231+ Error
232+ (Printf.sprintf
233+ "Unrecognized color format '%s' (expected named color, hex, rgb(), \
234+ rgba(), hsl(), or hsla())"
235+ s)
236+237+ let is_valid s =
238+ match validate s with
239+ | Ok () -> true
240+ | Error _ -> false
241+end
242+243+let datatypes = [ (module Color : Datatype.S); (module Simple_color : Datatype.S) ]
+29
lib/html5_checker/datatype/dt_color.mli
···00000000000000000000000000000
···1+(** Color value datatype validators.
2+3+ This module provides validators for CSS color values and simple colors
4+ as defined by the HTML5 specification. *)
5+6+(** CSS color value validator.
7+8+ Validates various CSS color formats:
9+ - Named colors (e.g., "red", "blue", "transparent")
10+ - Hex colors: #RGB or #RRGGBB
11+ - rgb() and rgba() functional notation
12+ - hsl() and hsla() functional notation
13+14+ This is a simplified validator and does not validate all edge cases. *)
15+module Color : Datatype.S
16+17+(** Simple color validator (for input[type=color]).
18+19+ Validates the simple color format:
20+ - Must be exactly 7 characters
21+ - Must start with '#'
22+ - Followed by exactly 6 hexadecimal digits (0-9, a-f, A-F)
23+ - Format: #RRGGBB
24+25+ This is the strict format required for input[type=color]. *)
26+module Simple_color : Datatype.S
27+28+(** List of all datatypes defined in this module *)
29+val datatypes : Datatype.t list
+20
lib/html5_checker/datatype/dt_contenteditable.ml
···00000000000000000000
···1+(** Contenteditable attribute validation for HTML5 *)
2+3+module Contenteditable = struct
4+ let name = "contenteditable"
5+6+ let validate s =
7+ let s_lower = Datatype.string_to_ascii_lowercase s in
8+ match s_lower with
9+ | "" | "true" | "false" | "plaintext-only" -> Ok ()
10+ | _ ->
11+ Error
12+ (Printf.sprintf
13+ "The value '%s' is not a valid contenteditable value. Expected \
14+ 'true', 'false', 'plaintext-only', or empty string."
15+ s)
16+17+ let is_valid s = Result.is_ok (validate s)
18+end
19+20+let datatypes = [ (module Contenteditable : Datatype.S) ]
+25
lib/html5_checker/datatype/dt_contenteditable.mli
···0000000000000000000000000
···1+(** Contenteditable attribute datatype validator for HTML5.
2+3+ This module provides a validator for the contenteditable attribute, as
4+ defined by the HTML5 specification. *)
5+6+(** Contenteditable attribute validator.
7+8+ Validates contenteditable attribute values which can be:
9+ - "true" - the element is editable
10+ - "false" - the element is not editable
11+ - "" (empty string) - equivalent to "true"
12+ - "plaintext-only" - the element is editable, but rich text formatting is
13+ disabled
14+15+ Values are case-insensitive.
16+17+ Examples:
18+ - "true"
19+ - "false"
20+ - ""
21+ - "plaintext-only" *)
22+module Contenteditable : Datatype.S
23+24+(** List of all datatypes defined in this module *)
25+val datatypes : Datatype.t list
+45
lib/html5_checker/datatype/dt_coords.ml
···000000000000000000000000000000000000000000000
···1+(** Coordinates attribute validation for HTML5 *)
2+3+module Coords = struct
4+ let name = "coords"
5+6+ let validate s =
7+ (* Empty string is valid for default shape *)
8+ if s = "" then Ok ()
9+ else
10+ (* Split on comma and validate each part is an integer *)
11+ let parts =
12+ String.split_on_char ',' s
13+ |> List.map (fun p ->
14+ let trimmed = String.trim p in
15+ (* Check if it's a valid integer *)
16+ try
17+ let _ = int_of_string trimmed in
18+ Ok ()
19+ with Failure _ ->
20+ Error (Printf.sprintf "The value '%s' is not a valid integer" p))
21+ in
22+ (* Check if all parts are valid *)
23+ let rec check = function
24+ | [] -> Ok ()
25+ | Ok () :: rest -> check rest
26+ | (Error msg) :: _ -> Error msg
27+ in
28+ match check parts with
29+ | Error msg ->
30+ Error
31+ (Printf.sprintf
32+ "The coords value '%s' is not valid. %s. Expected a \
33+ comma-separated list of integers."
34+ s msg)
35+ | Ok () -> (
36+ (* Verify we have at least some values *)
37+ let count = List.length parts in
38+ match count with
39+ | 0 -> Error "The coords value must not be empty unless for default shape"
40+ | _ -> Ok ())
41+42+ let is_valid s = Result.is_ok (validate s)
43+end
44+45+let datatypes = [ (module Coords : Datatype.S) ]
+25
lib/html5_checker/datatype/dt_coords.mli
···0000000000000000000000000
···1+(** Coordinates attribute datatype validator for HTML5.
2+3+ This module provides a validator for the coords attribute used on area
4+ elements within image maps, as defined by the HTML5 specification. *)
5+6+(** Coordinates attribute validator.
7+8+ Validates coords attribute values which must be a comma-separated list of
9+ valid integers. The number of values depends on the shape:
10+ - rect: exactly 4 values (x1,y1,x2,y2)
11+ - circle: exactly 3 values (x,y,radius)
12+ - poly: even number of values >= 6 (x1,y1,x2,y2,x3,y3,...)
13+ - default: should be empty or not present
14+15+ Note: This validator only checks that the value is a valid comma-separated
16+ list of integers. Shape-specific validation should be done at a higher level.
17+18+ Examples:
19+ - "0,0,10,10" (rect)
20+ - "50,50,25" (circle)
21+ - "0,0,50,0,50,50,0,50" (poly) *)
22+module Coords : Datatype.S
23+24+(** List of all datatypes defined in this module *)
25+val datatypes : Datatype.t list
+20
lib/html5_checker/datatype/dt_crossorigin.ml
···00000000000000000000
···1+(** CORS crossorigin attribute validation for HTML5 *)
2+3+module Crossorigin = struct
4+ let name = "crossorigin"
5+6+ let validate s =
7+ let s_lower = Datatype.string_to_ascii_lowercase s in
8+ match s_lower with
9+ | "" | "anonymous" | "use-credentials" -> Ok ()
10+ | _ ->
11+ Error
12+ (Printf.sprintf
13+ "The value '%s' is not a valid crossorigin value. Expected \
14+ empty string, 'anonymous', or 'use-credentials'."
15+ s)
16+17+ let is_valid s = Result.is_ok (validate s)
18+end
19+20+let datatypes = [ (module Crossorigin : Datatype.S) ]
+23
lib/html5_checker/datatype/dt_crossorigin.mli
···00000000000000000000000
···1+(** CORS crossorigin attribute datatype validator.
2+3+ This module provides a validator for the crossorigin attribute used on
4+ script, link, img, audio, video elements, as defined by the HTML5 spec. *)
5+6+(** Crossorigin attribute validator.
7+8+ Validates crossorigin attribute values which can be:
9+ - "" (empty string, equivalent to anonymous)
10+ - "anonymous" (requests use CORS without credentials)
11+ - "use-credentials" (requests use CORS with credentials)
12+13+ Values are case-insensitive after ASCII lowercasing.
14+15+ Examples:
16+ - ""
17+ - "anonymous"
18+ - "use-credentials"
19+ - "Anonymous" (equivalent to "anonymous") *)
20+module Crossorigin : Datatype.S
21+22+(** List of all datatypes defined in this module *)
23+val datatypes : Datatype.t list
···1+(** Helper functions for datetime validation *)
2+3+let is_digit c = c >= '0' && c <= '9'
4+5+let is_all_digits s =
6+ String.for_all is_digit s
7+8+let parse_int s =
9+ try Some (int_of_string s)
10+ with Failure _ -> None
11+12+(** Days in each month (non-leap year) *)
13+let days_in_month = [| 31; 28; 31; 30; 31; 30; 31; 31; 30; 31; 30; 31 |]
14+15+(** Check if a year is a leap year *)
16+let is_leap_year year =
17+ (year mod 400 = 0) || (year mod 4 = 0 && year mod 100 <> 0)
18+19+(** Get maximum day for a given month/year *)
20+let max_day_for_month year month =
21+ if month = 2 && is_leap_year year then 29
22+ else days_in_month.(month - 1)
23+24+(** Years in the 400-year cycle that have 53 weeks *)
25+let special_years_mod_400 = [|
26+ 4; 9; 15; 20; 26; 32; 37; 43; 48; 54; 60; 65; 71; 76; 82; 88; 93; 99;
27+ 105; 111; 116; 122; 128; 133; 139; 144; 150; 156; 161; 167; 172; 178;
28+ 184; 189; 195; 201; 207; 212; 218; 224; 229; 235; 240; 246; 252; 257;
29+ 263; 268; 274; 280; 285; 291; 296; 303; 308; 314; 320; 325; 331; 336;
30+ 342; 348; 353; 359; 364; 370; 376; 381; 387; 392; 398
31+|]
32+33+(** Check if a year has 53 weeks *)
34+let has_53_weeks year =
35+ let year_mod = year mod 400 in
36+ Array.exists (fun y -> y = year_mod) special_years_mod_400
37+38+module Year = struct
39+ let name = "year"
40+41+ let validate s =
42+ let len = String.length s in
43+ if len < 4 then
44+ Error "Year must be at least 4 digits"
45+ else if not (is_all_digits s) then
46+ Error "Year must contain only digits"
47+ else
48+ match parse_int s with
49+ | None -> Error "Year value out of range"
50+ | Some year ->
51+ if year < 1 then
52+ Error "Year cannot be less than 1"
53+ else
54+ Ok ()
55+56+ let is_valid s = Result.is_ok (validate s)
57+end
58+59+module Month = struct
60+ let name = "month"
61+62+ let validate s =
63+ if String.length s < 7 then
64+ Error "Month must be in YYYY-MM format"
65+ else
66+ let pattern = Str.regexp "^\\([0-9]+\\)-\\([0-9][0-9]\\)$" in
67+ if not (Str.string_match pattern s 0) then
68+ Error "Month must be in YYYY-MM format"
69+ else
70+ let year_s = Str.matched_group 1 s in
71+ let month_s = Str.matched_group 2 s in
72+ match (parse_int year_s, parse_int month_s) with
73+ | None, _ | _, None -> Error "Year or month out of range"
74+ | Some year, Some month ->
75+ if year < 1 then
76+ Error "Year cannot be less than 1"
77+ else if month < 1 then
78+ Error "Month cannot be less than 1"
79+ else if month > 12 then
80+ Error "Month cannot be greater than 12"
81+ else
82+ Ok ()
83+84+ let is_valid s = Result.is_ok (validate s)
85+end
86+87+module Week = struct
88+ let name = "week"
89+90+ let validate s =
91+ let pattern = Str.regexp "^\\([0-9]+\\)-W\\([0-9][0-9]\\)$" in
92+ if not (Str.string_match pattern s 0) then
93+ Error "Week must be in YYYY-Www format"
94+ else
95+ let year_s = Str.matched_group 1 s in
96+ let week_s = Str.matched_group 2 s in
97+ match (parse_int year_s, parse_int week_s) with
98+ | None, _ | _, None -> Error "Year or week out of range"
99+ | Some year, Some week ->
100+ if year < 1 then
101+ Error "Year cannot be less than 1"
102+ else if week < 1 then
103+ Error "Week cannot be less than 1"
104+ else if week > 53 then
105+ Error "Week out of range"
106+ else if week = 53 && not (has_53_weeks year) then
107+ Error "Week out of range"
108+ else
109+ Ok ()
110+111+ let is_valid s = Result.is_ok (validate s)
112+end
113+114+module Date = struct
115+ let name = "date"
116+117+ let validate s =
118+ let pattern = Str.regexp "^\\([0-9]+\\)-\\([0-9][0-9]\\)-\\([0-9][0-9]\\)$" in
119+ if not (Str.string_match pattern s 0) then
120+ Error "Date must be in YYYY-MM-DD format"
121+ else
122+ let year_s = Str.matched_group 1 s in
123+ let month_s = Str.matched_group 2 s in
124+ let day_s = Str.matched_group 3 s in
125+ if String.length year_s < 4 then
126+ Error "Year must be at least 4 digits"
127+ else
128+ match (parse_int year_s, parse_int month_s, parse_int day_s) with
129+ | None, _, _ | _, None, _ | _, _, None ->
130+ Error "Year, month, or day out of range"
131+ | Some year, Some month, Some day ->
132+ if year < 1 then
133+ Error "Year cannot be less than 1"
134+ else if month < 1 then
135+ Error "Month cannot be less than 1"
136+ else if month > 12 then
137+ Error "Month cannot be greater than 12"
138+ else if day < 1 then
139+ Error "Day cannot be less than 1"
140+ else
141+ let max_day = max_day_for_month year month in
142+ if day > max_day then
143+ Error "Day out of range"
144+ else
145+ Ok ()
146+147+ let is_valid s = Result.is_ok (validate s)
148+end
149+150+module Time = struct
151+ let name = "time"
152+153+ let validate s =
154+ let pattern = Str.regexp "^\\([0-9][0-9]\\):\\([0-9][0-9]\\)\\(:\\([0-9][0-9]\\)\\(\\.\\([0-9]+\\)\\)?\\)?$" in
155+ if not (Str.string_match pattern s 0) then
156+ Error "Time must be in HH:MM[:SS[.sss]] format"
157+ else
158+ let hour_s = Str.matched_group 1 s in
159+ let minute_s = Str.matched_group 2 s in
160+ let second_s = try Some (Str.matched_group 4 s) with Not_found -> None in
161+ let millis_s = try Some (Str.matched_group 6 s) with Not_found -> None in
162+ match (parse_int hour_s, parse_int minute_s) with
163+ | None, _ | _, None -> Error "Hour or minute out of range"
164+ | Some hour, Some minute ->
165+ if hour > 23 then
166+ Error "Hour cannot be greater than 23"
167+ else if minute > 59 then
168+ Error "Minute cannot be greater than 59"
169+ else
170+ match second_s with
171+ | None -> Ok ()
172+ | Some sec_s ->
173+ match parse_int sec_s with
174+ | None -> Error "Seconds out of range"
175+ | Some second ->
176+ if second > 59 then
177+ Error "Second cannot be greater than 59"
178+ else
179+ match millis_s with
180+ | None -> Ok ()
181+ | Some ms ->
182+ if String.length ms > 3 then
183+ Error "A fraction of a second must be one, two, or three digits"
184+ else if not (is_all_digits ms) then
185+ Error "Invalid milliseconds"
186+ else
187+ Ok ()
188+189+ let is_valid s = Result.is_ok (validate s)
190+end
191+192+module Datetime_local = struct
193+ let name = "local datetime"
194+195+ let validate s =
196+ let pattern = Str.regexp "^\\(.+\\)[T ]\\(.+\\)$" in
197+ if not (Str.string_match pattern s 0) then
198+ Error "Datetime must be in YYYY-MM-DD[T ]HH:MM[:SS[.sss]] format"
199+ else
200+ let date_s = Str.matched_group 1 s in
201+ let time_s = Str.matched_group 2 s in
202+ match Date.validate date_s with
203+ | Error e -> Error ("Invalid date: " ^ e)
204+ | Ok () ->
205+ match Time.validate time_s with
206+ | Error e -> Error ("Invalid time: " ^ e)
207+ | Ok () -> Ok ()
208+209+ let is_valid s = Result.is_ok (validate s)
210+end
211+212+module Datetime = struct
213+ let name = "datetime"
214+215+ let validate s =
216+ if not (String.ends_with ~suffix:"Z" s) then
217+ Error "Global datetime must end with 'Z'"
218+ else
219+ let s_without_z = String.sub s 0 (String.length s - 1) in
220+ match Datetime_local.validate s_without_z with
221+ | Error e -> Error e
222+ | Ok () -> Ok ()
223+224+ let is_valid s = Result.is_ok (validate s)
225+end
226+227+let datatypes = [
228+ (module Year : Datatype.S);
229+ (module Month : Datatype.S);
230+ (module Week : Datatype.S);
231+ (module Date : Datatype.S);
232+ (module Time : Datatype.S);
233+ (module Datetime_local : Datatype.S);
234+ (module Datetime : Datatype.S);
235+]
···1+(** Date and time datatype validators for HTML5.
2+3+ This module provides validators for various HTML5 date and time formats
4+ as specified in the HTML5 standard. Each validator checks that strings
5+ conform to the specific format and contain valid values. *)
6+7+(** Year datatype (YYYY format, minimum 1).
8+9+ Validates a year string consisting of 4 or more ASCII digits.
10+ The year must be at least 1. *)
11+module Year : Datatype.S
12+13+(** Month datatype (YYYY-MM format).
14+15+ Validates a month string in the format YYYY-MM where:
16+ - YYYY is a valid year (>= 1)
17+ - MM is a month from 01 to 12 *)
18+module Month : Datatype.S
19+20+(** Week datatype (YYYY-Www format).
21+22+ Validates a week string in the format YYYY-Www where:
23+ - YYYY is a valid year (>= 1)
24+ - W is the literal character 'W'
25+ - ww is a week number from 01 to 53
26+27+ Week 53 is only valid for years that have 53 weeks in the ISO 8601
28+ week-numbering calendar. *)
29+module Week : Datatype.S
30+31+(** Date datatype (YYYY-MM-DD format).
32+33+ Validates a date string in the format YYYY-MM-DD where:
34+ - YYYY is a valid year (>= 1)
35+ - MM is a month from 01 to 12
36+ - DD is a day valid for the given month/year
37+38+ Handles leap years correctly. *)
39+module Date : Datatype.S
40+41+(** Time datatype (HH:MM[:SS[.sss]] format).
42+43+ Validates a time string where:
44+ - HH is hours from 00 to 23
45+ - MM is minutes from 00 to 59
46+ - SS (optional) is seconds from 00 to 59
47+ - sss (optional) is milliseconds (1-3 digits)
48+49+ Valid formats:
50+ - HH:MM
51+ - HH:MM:SS
52+ - HH:MM:SS.s
53+ - HH:MM:SS.ss
54+ - HH:MM:SS.sss *)
55+module Time : Datatype.S
56+57+(** Local datetime datatype (YYYY-MM-DD[T ]HH:MM[:SS[.sss]] format).
58+59+ Validates a local datetime string combining date and time with 'T' or
60+ space separator. The date must be valid and the time must be valid.
61+62+ This format does not include timezone information. *)
63+module Datetime_local : Datatype.S
64+65+(** Global datetime datatype (YYYY-MM-DD[T ]HH:MM[:SS[.sss]]Z format).
66+67+ Validates a global datetime string in UTC (ending with 'Z').
68+ The date must be valid and the time must be valid.
69+70+ This is the format for datetime values that include timezone (UTC). *)
71+module Datetime : Datatype.S
72+73+(** List of all datetime datatypes *)
74+val datatypes : Datatype.t list
+20
lib/html5_checker/datatype/dt_decoding.ml
···00000000000000000000
···1+(** Image decoding attribute validation for HTML5 *)
2+3+module Decoding = struct
4+ let name = "decoding"
5+6+ let validate s =
7+ let s_lower = Datatype.string_to_ascii_lowercase s in
8+ match s_lower with
9+ | "" | "sync" | "async" | "auto" -> Ok ()
10+ | _ ->
11+ Error
12+ (Printf.sprintf
13+ "The value '%s' is not a valid decoding value. Expected empty \
14+ string, 'sync', 'async', or 'auto'."
15+ s)
16+17+ let is_valid s = Result.is_ok (validate s)
18+end
19+20+let datatypes = [ (module Decoding : Datatype.S) ]
+24
lib/html5_checker/datatype/dt_decoding.mli
···000000000000000000000000
···1+(** Image decoding attribute datatype validator.
2+3+ This module provides a validator for the decoding attribute used to provide
4+ a hint for image decoding, as defined by the HTML5 spec. *)
5+6+(** Decoding attribute validator.
7+8+ Validates decoding attribute values which can be:
9+ - "" (empty string, default decoding behavior)
10+ - "sync" (decode synchronously for atomic presentation)
11+ - "async" (decode asynchronously to avoid delaying other content)
12+ - "auto" (no preference for decoding mode)
13+14+ Values are case-insensitive after ASCII lowercasing.
15+16+ Examples:
17+ - ""
18+ - "sync"
19+ - "async"
20+ - "auto" *)
21+module Decoding : Datatype.S
22+23+(** List of all datatypes defined in this module *)
24+val datatypes : Datatype.t list
+20
lib/html5_checker/datatype/dt_dir.ml
···00000000000000000000
···1+(** Text direction attribute validation for HTML5 *)
2+3+module Dir = struct
4+ let name = "dir"
5+6+ let validate s =
7+ let s_lower = Datatype.string_to_ascii_lowercase s in
8+ match s_lower with
9+ | "" | "ltr" | "rtl" | "auto" -> Ok ()
10+ | _ ->
11+ Error
12+ (Printf.sprintf
13+ "The value '%s' is not a valid dir value. Expected empty \
14+ string, 'ltr', 'rtl', or 'auto'."
15+ s)
16+17+ let is_valid s = Result.is_ok (validate s)
18+end
19+20+let datatypes = [ (module Dir : Datatype.S) ]
+24
lib/html5_checker/datatype/dt_dir.mli
···000000000000000000000000
···1+(** Text direction attribute datatype validator.
2+3+ This module provides a validator for the dir attribute used to specify
4+ the text directionality of element content, as defined by the HTML5 spec. *)
5+6+(** Dir attribute validator.
7+8+ Validates dir attribute values which can be:
9+ - "" (empty string, inherits directionality)
10+ - "ltr" (left-to-right text direction)
11+ - "rtl" (right-to-left text direction)
12+ - "auto" (directionality determined from content)
13+14+ Values are case-insensitive after ASCII lowercasing.
15+16+ Examples:
17+ - ""
18+ - "ltr"
19+ - "rtl"
20+ - "auto" *)
21+module Dir : Datatype.S
22+23+(** List of all datatypes defined in this module *)
24+val datatypes : Datatype.t list
+20
lib/html5_checker/datatype/dt_draggable.ml
···00000000000000000000
···1+(** Draggable attribute validation for HTML5 *)
2+3+module Draggable = struct
4+ let name = "draggable"
5+6+ let validate s =
7+ let s_lower = Datatype.string_to_ascii_lowercase s in
8+ match s_lower with
9+ | "true" | "false" -> Ok ()
10+ | _ ->
11+ Error
12+ (Printf.sprintf
13+ "The value '%s' is not a valid draggable value. Expected 'true' \
14+ or 'false'."
15+ s)
16+17+ let is_valid s = Result.is_ok (validate s)
18+end
19+20+let datatypes = [ (module Draggable : Datatype.S) ]
+20
lib/html5_checker/datatype/dt_draggable.mli
···00000000000000000000
···1+(** Draggable attribute datatype validator for HTML5.
2+3+ This module provides a validator for the draggable attribute, as defined by
4+ the HTML5 specification. *)
5+6+(** Draggable attribute validator.
7+8+ Validates draggable attribute values which can be:
9+ - "true" - the element is draggable
10+ - "false" - the element is not draggable
11+12+ Values are case-insensitive.
13+14+ Examples:
15+ - "true"
16+ - "false" *)
17+module Draggable : Datatype.S
18+19+(** List of all datatypes defined in this module *)
20+val datatypes : Datatype.t list
···1+(** Email address validation *)
2+3+(** Helper to check if a character is valid in email local/domain parts *)
4+let is_email_char c =
5+ (c >= 'a' && c <= 'z')
6+ || (c >= 'A' && c <= 'Z')
7+ || (c >= '0' && c <= '9')
8+ || c = '.' || c = '-' || c = '_' || c = '+' || c = '='
9+10+(** Validate a single email address using simplified rules *)
11+let validate_email s =
12+ let s = String.trim s in
13+ if String.length s = 0 then Error "Email address must not be empty"
14+ else
15+ (* Check for exactly one @ symbol *)
16+ let at_count = ref 0 in
17+ let at_pos = ref (-1) in
18+ String.iteri
19+ (fun i c -> if c = '@' then (
20+ incr at_count;
21+ at_pos := i
22+ ))
23+ s;
24+ if !at_count = 0 then Error "Email address must contain an '@' character"
25+ else if !at_count > 1 then
26+ Error "Email address must contain exactly one '@' character"
27+ else
28+ let local = String.sub s 0 !at_pos in
29+ let domain = String.sub s (!at_pos + 1) (String.length s - !at_pos - 1) in
30+31+ (* Validate local part *)
32+ if String.length local = 0 then
33+ Error "Email address must have a local part before '@'"
34+ else if local.[0] = '.' || local.[String.length local - 1] = '.' then
35+ Error "Email local part must not start or end with '.'"
36+ else if not (String.for_all is_email_char local) then
37+ Error "Email local part contains invalid characters"
38+ else (* Validate domain part *)
39+ if String.length domain = 0 then
40+ Error "Email address must have a domain part after '@'"
41+ else if not (String.contains domain '.') then
42+ Error "Email domain must contain at least one '.'"
43+ else if domain.[0] = '.' || domain.[String.length domain - 1] = '.' then
44+ Error "Email domain must not start or end with '.'"
45+ else if
46+ not
47+ (String.for_all
48+ (fun c -> is_email_char c || c = '.')
49+ domain)
50+ then Error "Email domain contains invalid characters"
51+ else Ok ()
52+53+module Email = struct
54+ let name = "email address"
55+ let validate = validate_email
56+57+ let is_valid s =
58+ match validate s with
59+ | Ok () -> true
60+ | Error _ -> false
61+end
62+63+module Email_list = struct
64+ let name = "email address list"
65+66+ let validate s =
67+ let s = String.trim s in
68+ if String.length s = 0 then Error "Email list must not be empty"
69+ else
70+ (* Split on commas and validate each email *)
71+ let emails = String.split_on_char ',' s in
72+ let rec check_all = function
73+ | [] -> Ok ()
74+ | email :: rest -> (
75+ match validate_email email with
76+ | Ok () -> check_all rest
77+ | Error msg ->
78+ Error (Printf.sprintf "Invalid email in list: %s" msg))
79+ in
80+ check_all emails
81+82+ let is_valid s =
83+ match validate s with
84+ | Ok () -> true
85+ | Error _ -> false
86+end
87+88+let datatypes = [ (module Email : Datatype.S); (module Email_list : Datatype.S) ]
+23
lib/html5_checker/datatype/dt_email.mli
···00000000000000000000000
···1+(** Email address datatype validators.
2+3+ This module provides validators for email addresses and email address lists
4+ as defined by the HTML5 specification. *)
5+6+(** Valid email address validator.
7+8+ Validates a single email address. Uses simplified validation rules:
9+ - Must contain exactly one '@' character
10+ - Local part (before @) must be non-empty
11+ - Domain part (after @) must be non-empty and contain at least one '.'
12+ - Only ASCII characters allowed *)
13+module Email : Datatype.S
14+15+(** Comma-separated email address list validator.
16+17+ Validates a comma-separated list of email addresses.
18+ Each address in the list must be valid according to {!Email} rules.
19+ Whitespace around commas is ignored. *)
20+module Email_list : Datatype.S
21+22+(** List of all datatypes defined in this module *)
23+val datatypes : Datatype.t list
+22
lib/html5_checker/datatype/dt_enterkeyhint.ml
···0000000000000000000000
···1+(** Enter key hint attribute validation for HTML5 *)
2+3+module Enterkeyhint = struct
4+ let name = "enterkeyhint"
5+6+ let validate s =
7+ let s_lower = Datatype.string_to_ascii_lowercase s in
8+ match s_lower with
9+ | "" | "enter" | "done" | "go" | "next" | "previous" | "search" | "send" ->
10+ Ok ()
11+ | _ ->
12+ Error
13+ (Printf.sprintf
14+ "The value '%s' is not a valid enterkeyhint value. Expected \
15+ one of: empty string, 'enter', 'done', 'go', 'next', \
16+ 'previous', 'search', or 'send'."
17+ s)
18+19+ let is_valid s = Result.is_ok (validate s)
20+end
21+22+let datatypes = [ (module Enterkeyhint : Datatype.S) ]
+28
lib/html5_checker/datatype/dt_enterkeyhint.mli
···0000000000000000000000000000
···1+(** Enter key hint attribute datatype validator.
2+3+ This module provides a validator for the enterkeyhint attribute used to
4+ customize the enter key label on virtual keyboards, as defined by the HTML5 spec. *)
5+6+(** Enterkeyhint attribute validator.
7+8+ Validates enterkeyhint attribute values which can be:
9+ - "" (empty string, default enter key)
10+ - "enter" (insert new line)
11+ - "done" (close input method editor)
12+ - "go" (navigate to target)
13+ - "next" (advance to next field)
14+ - "previous" (go back to previous field)
15+ - "search" (perform search)
16+ - "send" (submit or deliver)
17+18+ Values are case-insensitive after ASCII lowercasing.
19+20+ Examples:
21+ - ""
22+ - "done"
23+ - "next"
24+ - "search" *)
25+module Enterkeyhint : Datatype.S
26+27+(** List of all datatypes defined in this module *)
28+val datatypes : Datatype.t list
+20
lib/html5_checker/datatype/dt_fetchpriority.ml
···00000000000000000000
···1+(** Fetch priority attribute validation for HTML5 *)
2+3+module Fetchpriority = struct
4+ let name = "fetchpriority"
5+6+ let validate s =
7+ let s_lower = Datatype.string_to_ascii_lowercase s in
8+ match s_lower with
9+ | "" | "high" | "low" | "auto" -> Ok ()
10+ | _ ->
11+ Error
12+ (Printf.sprintf
13+ "The value '%s' is not a valid fetchpriority value. Expected \
14+ empty string, 'high', 'low', or 'auto'."
15+ s)
16+17+ let is_valid s = Result.is_ok (validate s)
18+end
19+20+let datatypes = [ (module Fetchpriority : Datatype.S) ]
+24
lib/html5_checker/datatype/dt_fetchpriority.mli
···000000000000000000000000
···1+(** Fetch priority attribute datatype validator.
2+3+ This module provides a validator for the fetchpriority attribute used to
4+ provide a hint for resource fetch priority, as defined by the HTML5 spec. *)
5+6+(** Fetchpriority attribute validator.
7+8+ Validates fetchpriority attribute values which can be:
9+ - "" (empty string, default fetch priority)
10+ - "high" (fetch at high priority relative to other resources)
11+ - "low" (fetch at low priority relative to other resources)
12+ - "auto" (no preference for fetch priority)
13+14+ Values are case-insensitive after ASCII lowercasing.
15+16+ Examples:
17+ - ""
18+ - "high"
19+ - "low"
20+ - "auto" *)
21+module Fetchpriority : Datatype.S
22+23+(** List of all datatypes defined in this module *)
24+val datatypes : Datatype.t list
···1+(** Valid HTML5 floating point number *)
2+module Float_ = struct
3+ let name = "floating point number"
4+5+ type state =
6+ | At_start
7+ | At_start_minus_seen
8+ | In_integer_part_digits_seen
9+ | Dot_seen
10+ | E_seen
11+ | In_decimal_part_digits_seen
12+ | In_exponent_sign_seen
13+ | In_exponent_digits_seen
14+15+ let validate s =
16+ let len = String.length s in
17+ let rec parse i state =
18+ if i >= len then
19+ match state with
20+ | In_integer_part_digits_seen | In_decimal_part_digits_seen
21+ | In_exponent_digits_seen ->
22+ Ok ()
23+ | At_start -> Error "The empty string is not a valid floating point number."
24+ | At_start_minus_seen ->
25+ Error "The minus sign alone is not a valid floating point number."
26+ | Dot_seen ->
27+ Error "A floating point number must not end with the decimal point."
28+ | E_seen ->
29+ Error "A floating point number must not end with the exponent 'e'."
30+ | In_exponent_sign_seen ->
31+ Error
32+ "A floating point number must not end with only a sign in the \
33+ exponent."
34+ else
35+ let c = s.[i] in
36+ match state with
37+ | At_start ->
38+ if c = '-' then parse (i + 1) At_start_minus_seen
39+ else if c = '.' then parse (i + 1) Dot_seen
40+ else if Datatype.is_ascii_digit c then
41+ parse (i + 1) In_integer_part_digits_seen
42+ else
43+ Error
44+ (Printf.sprintf
45+ "Expected a minus sign or a digit but saw '%c' instead." c)
46+ | At_start_minus_seen ->
47+ if Datatype.is_ascii_digit c then
48+ parse (i + 1) In_integer_part_digits_seen
49+ else Error (Printf.sprintf "Expected a digit but saw '%c' instead." c)
50+ | In_integer_part_digits_seen ->
51+ if c = '.' then parse (i + 1) Dot_seen
52+ else if c = 'e' || c = 'E' then parse (i + 1) E_seen
53+ else if Datatype.is_ascii_digit c then
54+ parse (i + 1) In_integer_part_digits_seen
55+ else
56+ Error
57+ (Printf.sprintf
58+ "Expected a decimal point, 'e', 'E' or a digit but saw '%c' \
59+ instead."
60+ c)
61+ | Dot_seen ->
62+ if Datatype.is_ascii_digit c then
63+ parse (i + 1) In_decimal_part_digits_seen
64+ else
65+ Error
66+ (Printf.sprintf
67+ "Expected a digit after the decimal point but saw '%c' instead."
68+ c)
69+ | In_decimal_part_digits_seen ->
70+ if Datatype.is_ascii_digit c then
71+ parse (i + 1) In_decimal_part_digits_seen
72+ else if c = 'e' || c = 'E' then parse (i + 1) E_seen
73+ else
74+ Error
75+ (Printf.sprintf "Expected 'e', 'E' or a digit but saw '%c' instead."
76+ c)
77+ | E_seen ->
78+ if c = '-' || c = '+' then parse (i + 1) In_exponent_sign_seen
79+ else if Datatype.is_ascii_digit c then
80+ parse (i + 1) In_exponent_digits_seen
81+ else
82+ Error
83+ (Printf.sprintf
84+ "Expected a minus sign, a plus sign or a digit but saw '%c' \
85+ instead."
86+ c)
87+ | In_exponent_sign_seen ->
88+ if Datatype.is_ascii_digit c then
89+ parse (i + 1) In_exponent_digits_seen
90+ else Error (Printf.sprintf "Expected a digit but saw '%c' instead." c)
91+ | In_exponent_digits_seen ->
92+ if Datatype.is_ascii_digit c then
93+ parse (i + 1) In_exponent_digits_seen
94+ else Error (Printf.sprintf "Expected a digit but saw '%c' instead." c)
95+ in
96+ parse 0 At_start
97+98+ let is_valid s = Result.is_ok (validate s)
99+end
100+101+(** Non-negative floating point number (>= 0) *)
102+module Float_non_negative = struct
103+ let name = "non-negative floating point number"
104+105+ type state =
106+ | At_start
107+ | At_start_minus_seen
108+ | In_integer_part_digits_seen
109+ | In_integer_part_digits_seen_zero
110+ | Dot_seen
111+ | Dot_seen_zero
112+ | E_seen
113+ | In_decimal_part_digits_seen
114+ | In_decimal_part_digits_seen_zero
115+ | In_exponent_sign_seen
116+ | In_exponent_digits_seen
117+118+ let validate s =
119+ let len = String.length s in
120+ let rec parse i state =
121+ if i >= len then
122+ match state with
123+ | In_integer_part_digits_seen | In_decimal_part_digits_seen
124+ | In_integer_part_digits_seen_zero | In_decimal_part_digits_seen_zero
125+ | In_exponent_digits_seen ->
126+ Ok ()
127+ | At_start ->
128+ Error "The empty string is not a valid non-negative floating point number."
129+ | At_start_minus_seen ->
130+ Error
131+ "The minus sign alone is not a valid non-negative floating point \
132+ number."
133+ | Dot_seen | Dot_seen_zero ->
134+ Error
135+ "A non-negative floating point number must not end with the \
136+ decimal point."
137+ | E_seen ->
138+ Error
139+ "A non-negative floating point number must not end with the \
140+ exponent 'e'."
141+ | In_exponent_sign_seen ->
142+ Error
143+ "A non-negative floating point number must not end with only a \
144+ sign in the exponent."
145+ else
146+ let c = s.[i] in
147+ match state with
148+ | At_start ->
149+ if c = '-' then parse (i + 1) At_start_minus_seen
150+ else if c = '.' then parse (i + 1) Dot_seen
151+ else if Datatype.is_ascii_digit c then
152+ parse (i + 1) In_integer_part_digits_seen
153+ else
154+ Error
155+ (Printf.sprintf
156+ "Expected a minus sign or a digit but saw '%c' instead." c)
157+ | At_start_minus_seen ->
158+ if c = '0' then parse (i + 1) In_integer_part_digits_seen_zero
159+ else Error (Printf.sprintf "Expected a zero but saw '%c' instead." c)
160+ | In_integer_part_digits_seen ->
161+ if c = '.' then parse (i + 1) Dot_seen
162+ else if c = 'e' || c = 'E' then parse (i + 1) E_seen
163+ else if Datatype.is_ascii_digit c then
164+ parse (i + 1) In_integer_part_digits_seen
165+ else
166+ Error
167+ (Printf.sprintf
168+ "Expected a decimal point, 'e', 'E' or a digit but saw '%c' \
169+ instead."
170+ c)
171+ | In_integer_part_digits_seen_zero ->
172+ if c = '.' then parse (i + 1) Dot_seen_zero
173+ else if c = 'e' || c = 'E' then parse (i + 1) E_seen
174+ else if c = '0' then parse (i + 1) In_integer_part_digits_seen_zero
175+ else
176+ Error
177+ (Printf.sprintf
178+ "Expected a decimal point, 'e', 'E' or a zero but saw '%c' \
179+ instead."
180+ c)
181+ | Dot_seen ->
182+ if Datatype.is_ascii_digit c then
183+ parse (i + 1) In_decimal_part_digits_seen
184+ else
185+ Error
186+ (Printf.sprintf
187+ "Expected a digit after the decimal point but saw '%c' instead."
188+ c)
189+ | Dot_seen_zero ->
190+ if c = '0' then parse (i + 1) In_decimal_part_digits_seen_zero
191+ else
192+ Error
193+ (Printf.sprintf
194+ "Expected a zero after the decimal point but saw '%c' instead."
195+ c)
196+ | In_decimal_part_digits_seen ->
197+ if Datatype.is_ascii_digit c then
198+ parse (i + 1) In_decimal_part_digits_seen
199+ else if c = 'e' || c = 'E' then parse (i + 1) E_seen
200+ else
201+ Error
202+ (Printf.sprintf "Expected 'e', 'E' or a digit but saw '%c' instead."
203+ c)
204+ | In_decimal_part_digits_seen_zero ->
205+ if c = '0' then parse (i + 1) In_decimal_part_digits_seen_zero
206+ else if c = 'e' || c = 'E' then parse (i + 1) E_seen
207+ else
208+ Error
209+ (Printf.sprintf "Expected 'e', 'E' or a zero but saw '%c' instead."
210+ c)
211+ | E_seen ->
212+ if c = '-' || c = '+' then parse (i + 1) In_exponent_sign_seen
213+ else if Datatype.is_ascii_digit c then
214+ parse (i + 1) In_exponent_digits_seen
215+ else
216+ Error
217+ (Printf.sprintf
218+ "Expected a minus sign, a plus sign or a digit but saw '%c' \
219+ instead."
220+ c)
221+ | In_exponent_sign_seen ->
222+ if Datatype.is_ascii_digit c then
223+ parse (i + 1) In_exponent_digits_seen
224+ else Error (Printf.sprintf "Expected a digit but saw '%c' instead." c)
225+ | In_exponent_digits_seen ->
226+ if Datatype.is_ascii_digit c then
227+ parse (i + 1) In_exponent_digits_seen
228+ else Error (Printf.sprintf "Expected a digit but saw '%c' instead." c)
229+ in
230+ parse 0 At_start
231+232+ let is_valid s = Result.is_ok (validate s)
233+end
234+235+(** Positive floating point number (> 0) *)
236+module Float_positive = struct
237+ let name = "positive floating point number"
238+239+ (* For positive floats, we validate it's a valid non-negative float,
240+ then check it's not zero *)
241+ let validate s =
242+ match Float_non_negative.validate s with
243+ | Error _ as e -> e
244+ | Ok () -> (
245+ (* Parse as float and check if it's positive *)
246+ try
247+ let f = float_of_string s in
248+ if f > 0.0 then Ok ()
249+ else Error "The value must be a positive floating point number."
250+ with Failure _ ->
251+ Error "Invalid floating point number format.")
252+253+ let is_valid s = Result.is_ok (validate s)
254+end
+16
lib/html5_checker/datatype/dt_float.mli
···0000000000000000
···1+(** Floating point datatype validators for HTML5 *)
2+3+(** Valid HTML5 floating point number *)
4+module Float_ : sig
5+ include Datatype.S
6+end
7+8+(** Non-negative floating point number (>= 0) *)
9+module Float_non_negative : sig
10+ include Datatype.S
11+end
12+13+(** Positive floating point number (> 0) *)
14+module Float_positive : sig
15+ include Datatype.S
16+end
+28
lib/html5_checker/datatype/dt_form_enctype.ml
···0000000000000000000000000000
···1+(** Form encoding type attribute validation based on HTML5 spec *)
2+3+(** Valid form enctype values *)
4+let valid_enctypes =
5+ [
6+ "application/x-www-form-urlencoded";
7+ "multipart/form-data";
8+ "text/plain";
9+ ]
10+11+module Form_enctype = struct
12+ let name = "form-enctype"
13+14+ let validate s =
15+ let s_lower = Datatype.string_to_ascii_lowercase s in
16+ if List.mem s_lower valid_enctypes then Ok ()
17+ else
18+ Error
19+ (Printf.sprintf
20+ "The value '%s' is not a valid form encoding type. Expected one of: \
21+ %s."
22+ s
23+ (String.concat ", " valid_enctypes))
24+25+ let is_valid s = Result.is_ok (validate s)
26+end
27+28+let datatypes = [ (module Form_enctype : Datatype.S) ]
+22
lib/html5_checker/datatype/dt_form_enctype.mli
···0000000000000000000000
···1+(** Form encoding type attribute datatype validator.
2+3+ This module provides a validator for the enctype and formenctype attributes
4+ used on form and input/button elements, as defined by the HTML5 specification. *)
5+6+(** Form encoding type attribute validator.
7+8+ Validates form enctype/formenctype attribute values which can be:
9+ - application/x-www-form-urlencoded - Default encoding (form fields as name=value pairs)
10+ - multipart/form-data - Multipart encoding (required for file uploads)
11+ - text/plain - Plain text encoding (mostly for debugging)
12+13+ Values are matched case-insensitively according to HTML5 spec.
14+15+ Examples:
16+ - "application/x-www-form-urlencoded"
17+ - "multipart/form-data"
18+ - "text/plain" *)
19+module Form_enctype : Datatype.S
20+21+(** List of all datatypes defined in this module *)
22+val datatypes : Datatype.t list
+22
lib/html5_checker/datatype/dt_form_method.ml
···0000000000000000000000
···1+(** Form method attribute validation based on HTML5 spec *)
2+3+(** Valid form method values *)
4+let valid_methods = [ "get"; "post"; "dialog" ]
5+6+module Form_method = struct
7+ let name = "form-method"
8+9+ let validate s =
10+ let s_lower = Datatype.string_to_ascii_lowercase s in
11+ if List.mem s_lower valid_methods then Ok ()
12+ else
13+ Error
14+ (Printf.sprintf
15+ "The value '%s' is not a valid form method. Expected one of: %s."
16+ s
17+ (String.concat ", " valid_methods))
18+19+ let is_valid s = Result.is_ok (validate s)
20+end
21+22+let datatypes = [ (module Form_method : Datatype.S) ]
+22
lib/html5_checker/datatype/dt_form_method.mli
···0000000000000000000000
···1+(** Form method attribute datatype validator.
2+3+ This module provides a validator for the method attribute used on
4+ form elements, as defined by the HTML5 specification. *)
5+6+(** Form method attribute validator.
7+8+ Validates form method attribute values which can be:
9+ - get - GET method (form data submitted in URL query string)
10+ - post - POST method (form data submitted in request body)
11+ - dialog - Dialog method (closes the dialog containing the form)
12+13+ Values are matched case-insensitively according to HTML5 spec.
14+15+ Examples:
16+ - "get"
17+ - "post"
18+ - "dialog" *)
19+module Form_method : Datatype.S
20+21+(** List of all datatypes defined in this module *)
22+val datatypes : Datatype.t list
+29
lib/html5_checker/datatype/dt_hash.ml
···00000000000000000000000000000
···1+(** Hash-name and fragment identifier datatype validators for HTML5. *)
2+3+module Hash_name : Datatype.S = struct
4+ let name = "hash-name reference"
5+6+ let validate s =
7+ let len = String.length s in
8+ if len = 0 then Error "The empty string is not a valid hash-name reference."
9+ else if s.[0] <> '#' then
10+ Error "A hash-name reference must start with \"#\"."
11+ else if len = 1 then
12+ Error "A hash-name reference must have at least one character after \"#\"."
13+ else Ok ()
14+15+ let is_valid s = Result.is_ok (validate s)
16+end
17+18+module Hash_or_empty : Datatype.S = struct
19+ let name = "hash-name reference (potentially empty)"
20+21+ let validate s =
22+ if String.length s = 0 then Ok ()
23+ else Hash_name.validate s
24+25+ let is_valid s = Result.is_ok (validate s)
26+end
27+28+let datatypes =
29+ [ (module Hash_name : Datatype.S); (module Hash_or_empty : Datatype.S) ]
+24
lib/html5_checker/datatype/dt_hash.mli
···000000000000000000000000
···1+(** Hash-name and fragment identifier datatype validators for HTML5.
2+3+ This module provides validators for fragment identifiers (hash-name
4+ references) used in URLs to reference specific parts of a document. *)
5+6+(** Hash-name reference validator.
7+8+ A hash-name reference is a fragment identifier that starts with '#'
9+ followed by one or more characters.
10+11+ Requirements:
12+ - Must not be empty
13+ - Must start with '#'
14+ - Must have at least one character after '#' *)
15+module Hash_name : Datatype.S
16+17+(** Hash-name or empty validator.
18+19+ Same as Hash_name but allows empty strings. This is used for attributes
20+ where the hash-name reference is optional. *)
21+module Hash_or_empty : Datatype.S
22+23+(** List of all hash-related datatypes for registration. *)
24+val datatypes : Datatype.t list
+20
lib/html5_checker/datatype/dt_hidden.ml
···00000000000000000000
···1+(** Hidden attribute validation for HTML5 *)
2+3+module Hidden = struct
4+ let name = "hidden"
5+6+ let validate s =
7+ let s_lower = Datatype.string_to_ascii_lowercase s in
8+ match s_lower with
9+ | "" | "hidden" | "until-found" -> Ok ()
10+ | _ ->
11+ Error
12+ (Printf.sprintf
13+ "The value '%s' is not a valid hidden value. Expected 'hidden', \
14+ 'until-found', or empty string."
15+ s)
16+17+ let is_valid s = Result.is_ok (validate s)
18+end
19+20+let datatypes = [ (module Hidden : Datatype.S) ]
+23
lib/html5_checker/datatype/dt_hidden.mli
···00000000000000000000000
···1+(** Hidden attribute datatype validator for HTML5.
2+3+ This module provides a validator for the hidden attribute, as defined by
4+ the HTML5 specification. *)
5+6+(** Hidden attribute validator.
7+8+ Validates hidden attribute values which can be:
9+ - "" (empty string) - the element is hidden
10+ - "hidden" - the element is hidden
11+ - "until-found" - the element is hidden until found by a find-in-page or
12+ fragment navigation
13+14+ Values are case-insensitive.
15+16+ Examples:
17+ - ""
18+ - "hidden"
19+ - "until-found" *)
20+module Hidden : Datatype.S
21+22+(** List of all datatypes defined in this module *)
23+val datatypes : Datatype.t list
···1+(** ID-related datatype validators for HTML5. *)
2+3+module Id : Datatype.S = struct
4+ let name = "id"
5+6+ let validate s =
7+ let len = String.length s in
8+ if len = 0 then Error "An ID must not be the empty string."
9+ else
10+ match String.index_opt s ' ' with
11+ | Some _ -> Error "An ID must not contain whitespace."
12+ | None -> (
13+ (* Check for other whitespace characters *)
14+ let rec check_whitespace i =
15+ if i >= len then Ok ()
16+ else if Datatype.is_whitespace s.[i] then
17+ Error "An ID must not contain whitespace."
18+ else check_whitespace (i + 1)
19+ in
20+ check_whitespace 0)
21+22+ let is_valid s = Result.is_ok (validate s)
23+end
24+25+module Idref : Datatype.S = struct
26+ let name = "id reference"
27+28+ (* An IDREF has the same validation rules as an ID *)
29+ let validate = Id.validate
30+ let is_valid s = Result.is_ok (validate s)
31+end
32+33+module Idrefs : Datatype.S = struct
34+ let name = "id references"
35+36+ let validate s =
37+ (* IDREFS must contain at least one non-whitespace character *)
38+ let len = String.length s in
39+ let rec check_non_whitespace i =
40+ if i >= len then
41+ Error "An IDREFS value must contain at least one non-whitespace character."
42+ else if not (Datatype.is_whitespace s.[i]) then Ok ()
43+ else check_non_whitespace (i + 1)
44+ in
45+ check_non_whitespace 0
46+47+ let is_valid s = Result.is_ok (validate s)
48+end
49+50+let datatypes =
51+ [ (module Id : Datatype.S)
52+ ; (module Idref : Datatype.S)
53+ ; (module Idrefs : Datatype.S)
54+ ]
+35
lib/html5_checker/datatype/dt_id.mli
···00000000000000000000000000000000000
···1+(** ID-related datatype validators for HTML5.
2+3+ This module provides validators for HTML5 ID attributes and ID references
4+ based on the Nu HTML Checker's implementation. *)
5+6+(** ID validator.
7+8+ Accepts any string that consists of one or more characters and does not
9+ contain any whitespace characters.
10+11+ An ID must be:
12+ - Non-empty
13+ - Contain no whitespace characters (space, tab, LF, FF, CR) *)
14+module Id : Datatype.S
15+16+(** ID reference validator.
17+18+ An IDREF has the same validation rules as an ID - it must be non-empty
19+ and contain no whitespace. The semantic difference is that an IDREF
20+ references an existing ID rather than defining one. *)
21+module Idref : Datatype.S
22+23+(** ID references validator.
24+25+ Accepts a space-separated list of ID references. The value must contain
26+ at least one non-whitespace character.
27+28+ IDREFS values:
29+ - Must not be empty or contain only whitespace
30+ - Can contain multiple space-separated ID references
31+ - Each individual reference follows IDREF rules *)
32+module Idrefs : Datatype.S
33+34+(** List of all ID-related datatypes for registration. *)
35+val datatypes : Datatype.t list
+46
lib/html5_checker/datatype/dt_input_type.ml
···0000000000000000000000000000000000000000000000
···1+(** Input type attribute validation based on HTML5 spec *)
2+3+(** Valid input type values *)
4+let valid_types =
5+ [
6+ "hidden";
7+ "text";
8+ "search";
9+ "tel";
10+ "url";
11+ "email";
12+ "password";
13+ "date";
14+ "month";
15+ "week";
16+ "time";
17+ "datetime-local";
18+ "number";
19+ "range";
20+ "color";
21+ "checkbox";
22+ "radio";
23+ "file";
24+ "submit";
25+ "image";
26+ "reset";
27+ "button";
28+ ]
29+30+module Input_type = struct
31+ let name = "input-type"
32+33+ let validate s =
34+ let s_lower = Datatype.string_to_ascii_lowercase s in
35+ if List.mem s_lower valid_types then Ok ()
36+ else
37+ Error
38+ (Printf.sprintf
39+ "The value '%s' is not a valid input type. Expected one of: %s."
40+ s
41+ (String.concat ", " valid_types))
42+43+ let is_valid s = Result.is_ok (validate s)
44+end
45+46+let datatypes = [ (module Input_type : Datatype.S) ]
+42
lib/html5_checker/datatype/dt_input_type.mli
···000000000000000000000000000000000000000000
···1+(** Input type attribute datatype validator.
2+3+ This module provides a validator for the type attribute used on
4+ input elements, as defined by the HTML5 specification. *)
5+6+(** Input type attribute validator.
7+8+ Validates input type attribute values which can be:
9+ - hidden - Hidden input field
10+ - text - Single-line text field
11+ - search - Search input field
12+ - tel - Telephone number input field
13+ - url - URL input field
14+ - email - Email address input field
15+ - password - Password input field
16+ - date - Date input field (year, month, day)
17+ - month - Month input field (year, month)
18+ - week - Week input field (year, week)
19+ - time - Time input field (hour, minute, seconds, fractional seconds)
20+ - datetime-local - Local date and time input field
21+ - number - Numeric input field
22+ - range - Range control (slider)
23+ - color - Color picker
24+ - checkbox - Checkbox
25+ - radio - Radio button
26+ - file - File upload control
27+ - submit - Submit button
28+ - image - Image submit button
29+ - reset - Reset button
30+ - button - Push button
31+32+ Values are matched case-insensitively according to HTML5 spec.
33+34+ Examples:
35+ - "text"
36+ - "email"
37+ - "datetime-local"
38+ - "submit" *)
39+module Input_type : Datatype.S
40+41+(** List of all datatypes defined in this module *)
42+val datatypes : Datatype.t list
+23
lib/html5_checker/datatype/dt_inputmode.ml
···00000000000000000000000
···1+(** Input mode attribute validation for HTML5 *)
2+3+module Inputmode = struct
4+ let name = "inputmode"
5+6+ let validate s =
7+ let s_lower = Datatype.string_to_ascii_lowercase s in
8+ match s_lower with
9+ | "" | "none" | "text" | "decimal" | "numeric" | "tel" | "search"
10+ | "email" | "url" ->
11+ Ok ()
12+ | _ ->
13+ Error
14+ (Printf.sprintf
15+ "The value '%s' is not a valid inputmode value. Expected one \
16+ of: empty string, 'none', 'text', 'decimal', 'numeric', 'tel', \
17+ 'search', 'email', or 'url'."
18+ s)
19+20+ let is_valid s = Result.is_ok (validate s)
21+end
22+23+let datatypes = [ (module Inputmode : Datatype.S) ]
+29
lib/html5_checker/datatype/dt_inputmode.mli
···00000000000000000000000000000
···1+(** Input mode attribute datatype validator.
2+3+ This module provides a validator for the inputmode attribute used to hint
4+ at the type of data the user might enter, as defined by the HTML5 spec. *)
5+6+(** Inputmode attribute validator.
7+8+ Validates inputmode attribute values which can be:
9+ - "" (empty string, no specific input mode)
10+ - "none" (no virtual keyboard)
11+ - "text" (standard text input)
12+ - "decimal" (decimal numeric input with locale-appropriate format)
13+ - "numeric" (numeric input)
14+ - "tel" (telephone number input)
15+ - "search" (search input)
16+ - "email" (email address input)
17+ - "url" (URL input)
18+19+ Values are case-insensitive after ASCII lowercasing.
20+21+ Examples:
22+ - ""
23+ - "numeric"
24+ - "email"
25+ - "tel" *)
26+module Inputmode : Datatype.S
27+28+(** List of all datatypes defined in this module *)
29+val datatypes : Datatype.t list
···1+(** Valid HTML5 integer (optional sign followed by digits) *)
2+module Integer = struct
3+ let name = "integer"
4+5+ let validate s =
6+ let len = String.length s in
7+ if len = 0 then Error "The empty string is not a valid integer."
8+ else
9+ let start_pos =
10+ if s.[0] = '-' then
11+ if len = 1 then failwith "unreachable"
12+ else 1
13+ else 0
14+ in
15+ (* First character must be minus or digit *)
16+ if start_pos = 0 && not (Datatype.is_ascii_digit s.[0]) then
17+ Error
18+ (Printf.sprintf "Expected a minus sign or a digit but saw '%c' instead."
19+ s.[0])
20+ else
21+ (* Rest must be digits *)
22+ let rec check_digits i =
23+ if i >= len then Ok ()
24+ else if Datatype.is_ascii_digit s.[i] then check_digits (i + 1)
25+ else
26+ Error (Printf.sprintf "Expected a digit but saw '%c' instead." s.[i])
27+ in
28+ check_digits start_pos
29+30+ let is_valid s = Result.is_ok (validate s)
31+end
32+33+(** Non-negative integer (>= 0) *)
34+module Integer_non_negative = struct
35+ let name = "non-negative integer"
36+37+ let validate s =
38+ let len = String.length s in
39+ if len = 0 then Error "The empty string is not a valid non-negative integer."
40+ else
41+ (* All characters must be digits *)
42+ let rec check_digits i =
43+ if i >= len then Ok ()
44+ else if Datatype.is_ascii_digit s.[i] then check_digits (i + 1)
45+ else Error (Printf.sprintf "Expected a digit but saw '%c' instead." s.[i])
46+ in
47+ check_digits 0
48+49+ let is_valid s = Result.is_ok (validate s)
50+end
51+52+(** Positive integer (> 0) *)
53+module Integer_positive = struct
54+ let name = "positive integer"
55+56+ let validate s =
57+ let len = String.length s in
58+ if len = 0 then Error "The empty string is not a valid positive integer."
59+ else
60+ (* All characters must be digits *)
61+ let rec check_digits i all_zeros =
62+ if i >= len then
63+ if all_zeros then Error "Zero is not a positive integer." else Ok ()
64+ else if Datatype.is_ascii_digit s.[i] then
65+ check_digits (i + 1) (all_zeros && s.[i] = '0')
66+ else Error (Printf.sprintf "Expected a digit but saw '%c' instead." s.[i])
67+ in
68+ check_digits 0 true
69+70+ let is_valid s = Result.is_ok (validate s)
71+end
+16
lib/html5_checker/datatype/dt_integer.mli
···0000000000000000
···1+(** Integer datatype validators for HTML5 *)
2+3+(** Valid HTML5 integer (optional sign followed by digits) *)
4+module Integer : sig
5+ include Datatype.S
6+end
7+8+(** Non-negative integer (>= 0) *)
9+module Integer_non_negative : sig
10+ include Datatype.S
11+end
12+13+(** Positive integer (> 0) *)
14+module Integer_positive : sig
15+ include Datatype.S
16+end
···1+(** Subresource integrity attribute validation *)
2+3+(** Valid hash algorithms *)
4+let valid_algorithms = [ "sha256"; "sha384"; "sha512" ]
5+6+(** Check if character is valid base64 character *)
7+let is_base64_char c =
8+ (c >= 'A' && c <= 'Z')
9+ || (c >= 'a' && c <= 'z')
10+ || (c >= '0' && c <= '9')
11+ || c = '+' || c = '/' || c = '='
12+13+(** Validate base64 encoding *)
14+let validate_base64 s =
15+ if String.length s = 0 then false
16+ else
17+ (* Check all characters are valid base64 *)
18+ let all_valid = ref true in
19+ for i = 0 to String.length s - 1 do
20+ if not (is_base64_char s.[i]) then all_valid := false
21+ done;
22+ if not !all_valid then false
23+ else
24+ (* Check padding is at the end only *)
25+ let has_padding = String.contains s '=' in
26+ if not has_padding then true
27+ else
28+ (* Find first '=' *)
29+ let first_eq = String.index s '=' in
30+ (* All chars after first '=' must be '=' *)
31+ let valid_padding = ref true in
32+ for i = first_eq to String.length s - 1 do
33+ if s.[i] <> '=' then valid_padding := false
34+ done;
35+ !valid_padding
36+ && (* At most 2 padding characters *)
37+ String.length s - first_eq <= 2
38+39+(** Validate a single hash value *)
40+let validate_hash_value s =
41+ let trimmed = Datatype.trim_html_spaces s in
42+ if trimmed = "" then Error "Hash value must not be empty"
43+ else
44+ (* Split on '-' to get algorithm and hash *)
45+ match String.index_opt trimmed '-' with
46+ | None ->
47+ Error
48+ (Printf.sprintf
49+ "Hash value '%s' must be in format 'algorithm-base64hash'" trimmed)
50+ | Some dash_pos ->
51+ let algorithm = String.sub trimmed 0 dash_pos in
52+ let algorithm_lower = Datatype.string_to_ascii_lowercase algorithm in
53+ if not (List.mem algorithm_lower valid_algorithms) then
54+ Error
55+ (Printf.sprintf
56+ "Hash algorithm '%s' is not supported. Must be one of: %s"
57+ algorithm (String.concat ", " valid_algorithms))
58+ else
59+ let rest = String.sub trimmed (dash_pos + 1) (String.length trimmed - dash_pos - 1) in
60+ (* Split on '?' to separate hash from options *)
61+ let hash_part =
62+ match String.index_opt rest '?' with
63+ | None -> rest
64+ | Some q_pos -> String.sub rest 0 q_pos
65+ in
66+ if String.length hash_part = 0 then
67+ Error "Hash value after algorithm must not be empty"
68+ else if not (validate_base64 hash_part) then
69+ Error
70+ (Printf.sprintf
71+ "Hash value '%s' is not valid base64 encoding" hash_part)
72+ else Ok ()
73+74+(** Validate integrity attribute value *)
75+let validate_integrity s =
76+ let trimmed = Datatype.trim_html_spaces s in
77+ if trimmed = "" then Error "Integrity attribute must not be empty"
78+ else
79+ (* Split on whitespace *)
80+ let hash_values = String.split_on_char ' ' trimmed in
81+ let hash_values =
82+ List.filter (fun h -> Datatype.trim_html_spaces h <> "") hash_values
83+ in
84+ if hash_values = [] then
85+ Error "Integrity attribute must contain at least one hash value"
86+ else
87+ (* Validate each hash value *)
88+ let rec check_hashes = function
89+ | [] -> Ok ()
90+ | h :: rest -> (
91+ match validate_hash_value h with
92+ | Error e -> Error e
93+ | Ok () -> check_hashes rest)
94+ in
95+ check_hashes hash_values
96+97+module Integrity = struct
98+ let name = "integrity"
99+ let validate = validate_integrity
100+ let is_valid s = Result.is_ok (validate s)
101+end
102+103+let datatypes = [ (module Integrity : Datatype.S) ]
+27
lib/html5_checker/datatype/dt_integrity.mli
···000000000000000000000000000
···1+(** Subresource integrity attribute validator.
2+3+ This module provides a validator for the integrity attribute used on
4+ script and link elements for subresource integrity checks, as defined
5+ by the W3C Subresource Integrity specification. *)
6+7+(** Integrity attribute validator.
8+9+ Validates integrity attribute values which contain space-separated hash
10+ values. Each hash value consists of:
11+ - An algorithm identifier (sha256, sha384, or sha512)
12+ - A hyphen (-)
13+ - The base64-encoded hash value
14+ - Optional options preceded by '?'
15+16+ Examples:
17+ - "sha256-abc123..."
18+ - "sha384-xyz789..."
19+ - "sha256-abc123... sha512-def456..."
20+ - "sha256-abc123...?ct=application/javascript"
21+22+ The base64 encoding must be valid and the algorithm must be one of the
23+ supported hash functions. *)
24+module Integrity : Datatype.S
25+26+(** List of all datatypes defined in this module *)
27+val datatypes : Datatype.t list
+20
lib/html5_checker/datatype/dt_kind.ml
···00000000000000000000
···1+(** Kind attribute validation for HTML5 *)
2+3+module Kind = struct
4+ let name = "kind"
5+6+ let validate s =
7+ let s_lower = Datatype.string_to_ascii_lowercase s in
8+ match s_lower with
9+ | "subtitles" | "captions" | "descriptions" | "chapters" | "metadata" -> Ok ()
10+ | _ ->
11+ Error
12+ (Printf.sprintf
13+ "The value '%s' is not a valid kind. Expected 'subtitles', \
14+ 'captions', 'descriptions', 'chapters', or 'metadata'."
15+ s)
16+17+ let is_valid s = Result.is_ok (validate s)
18+end
19+20+let datatypes = [ (module Kind : Datatype.S) ]
+29
lib/html5_checker/datatype/dt_kind.mli
···00000000000000000000000000000
···1+(** Kind attribute datatype validator for HTML5.
2+3+ This module provides a validator for the kind attribute used on track
4+ elements, as defined by the HTML5 specification. *)
5+6+(** Kind attribute validator.
7+8+ Validates kind attribute values which can be:
9+ - "subtitles" - Transcription or translation of the dialogue, suitable for
10+ when the sound is available but not understood
11+ - "captions" - Transcription or translation of the dialogue, sound effects,
12+ relevant musical cues, and other relevant audio information, suitable for
13+ when sound is unavailable or not clearly audible
14+ - "descriptions" - Textual descriptions of the video component, suitable for
15+ audio synthesis when the visual component is unavailable
16+ - "chapters" - Chapter titles, for use in navigating the media resource
17+ - "metadata" - Tracks intended for use from script, not displayed by the user
18+ agent
19+20+ Values are case-insensitive.
21+22+ Examples:
23+ - "subtitles"
24+ - "captions"
25+ - "descriptions" *)
26+module Kind : Datatype.S
27+28+(** List of all datatypes defined in this module *)
29+val datatypes : Datatype.t list
···1+(** Helper functions for language tag validation *)
2+3+let is_lower_alpha c = c >= 'a' && c <= 'z'
4+let is_upper_alpha c = c >= 'A' && c <= 'Z'
5+let is_alpha c = is_lower_alpha c || is_upper_alpha c
6+let is_digit c = c >= '0' && c <= '9'
7+let is_alphanumeric c = is_alpha c || is_digit c
8+9+let is_all_alpha s =
10+ String.for_all is_alpha s
11+12+let _is_all_digits s =
13+ String.for_all is_digit s
14+15+let is_all_alphanumeric s =
16+ String.for_all is_alphanumeric s
17+18+let to_lower s =
19+ String.lowercase_ascii s
20+21+(** Validate language tag structure according to BCP 47.
22+ This is a simplified validator that checks structural validity
23+ but does not validate against the IANA registry. *)
24+let validate_language_structure s =
25+ if String.length s = 0 then
26+ Error "The empty string is not a valid language tag"
27+ else if String.starts_with ~prefix:"-" s then
28+ Error "Language tag must not start with HYPHEN-MINUS"
29+ else if String.ends_with ~suffix:"-" s then
30+ Error "Language tag must not end with HYPHEN-MINUS"
31+ else
32+ let subtags = String.split_on_char '-' s in
33+34+ (* Check for empty subtags and length constraints *)
35+ let rec check_subtag_constraints = function
36+ | [] -> Ok ()
37+ | subtag :: rest ->
38+ let len = String.length subtag in
39+ if len = 0 then
40+ Error "Zero-length subtag"
41+ else if len > 8 then
42+ Error "Subtags must not exceed 8 characters in length"
43+ else
44+ check_subtag_constraints rest
45+ in
46+47+ match check_subtag_constraints subtags with
48+ | Error e -> Error e
49+ | Ok () ->
50+ (* Primary language subtag validation *)
51+ match subtags with
52+ | [] -> Error "Language tag must have at least one subtag"
53+ | first :: rest ->
54+ let first_lower = to_lower first in
55+ let len = String.length first_lower in
56+57+ (* Check for private use tag *)
58+ if first_lower = "x" then
59+ if rest = [] then
60+ Error "No subtags in private use sequence"
61+ else
62+ (* Private use subtags must be 1-8 alphanumeric *)
63+ let rec check_private_use = function
64+ | [] -> Ok ()
65+ | subtag :: rest ->
66+ let subtag_lower = to_lower subtag in
67+ if String.length subtag_lower < 1 then
68+ Error "Private use subtag is too short"
69+ else if not (is_all_alphanumeric subtag_lower) then
70+ Error "Bad character in private use subtag"
71+ else
72+ check_private_use rest
73+ in
74+ check_private_use rest
75+ (* Primary language: 2-3 letters (ISO 639) *)
76+ else if (len = 2 || len = 3) && is_all_alpha first_lower then
77+ Ok ()
78+ (* Reserved: 4 letters *)
79+ else if len = 4 && is_all_alpha first_lower then
80+ Error "Found reserved language tag"
81+ (* Registered: 5+ letters *)
82+ else if len >= 5 && is_all_alpha first_lower then
83+ Ok ()
84+ else
85+ Error "Invalid language subtag format"
86+87+module Language = struct
88+ let name = "language tag"
89+90+ let validate s = validate_language_structure s
91+92+ let is_valid s = Result.is_ok (validate s)
93+end
94+95+module Language_or_empty = struct
96+ let name = "language tag or empty"
97+98+ let validate s =
99+ if String.length s = 0 then
100+ Ok ()
101+ else
102+ validate_language_structure s
103+104+ let is_valid s = Result.is_ok (validate s)
105+end
106+107+let datatypes = [
108+ (module Language : Datatype.S);
109+ (module Language_or_empty : Datatype.S);
110+]
+43
lib/html5_checker/datatype/dt_language.mli
···0000000000000000000000000000000000000000000
···1+(** Language tag datatype validators for HTML5.
2+3+ This module provides validators for BCP 47 language tags as used in HTML5.
4+ Language tags identify natural languages and consist of subtags separated
5+ by hyphens, following the IETF BCP 47 standard. *)
6+7+(** Language tag datatype (BCP 47 format).
8+9+ Validates a language tag according to BCP 47. A language tag consists of:
10+ - Primary language subtag (2-3 letters, or 5+ letters for registered languages)
11+ - Optional extended language subtag (3 letters)
12+ - Optional script subtag (4 letters)
13+ - Optional region subtag (2 letters or 3 digits)
14+ - Optional variant subtags (5-8 alphanumeric characters, or 4 starting with digit)
15+ - Optional extension subtags (single letter + subtags)
16+ - Optional private use subtags (starting with 'x-')
17+18+ Examples:
19+ - "en" (English)
20+ - "en-US" (US English)
21+ - "zh-Hans" (Simplified Chinese)
22+ - "zh-Hans-CN" (Simplified Chinese as used in China)
23+24+ The validator performs basic structural validation:
25+ - Tag cannot be empty
26+ - Tag cannot start or end with hyphen
27+ - Subtags cannot be empty
28+ - Subtags cannot exceed 8 characters (except for registered values)
29+ - Primary language subtag must be 2-3 letters (ISO 639) or 5+ letters (registered)
30+ - 4-letter primary subtags are reserved
31+32+ Note: This implementation does NOT validate against the IANA language
33+ subtag registry. It only validates the structural format. *)
34+module Language : Datatype.S
35+36+(** Language tag or empty string.
37+38+ Like Language but also accepts the empty string. This is used for cases
39+ where lang="" is valid to indicate an unknown or unspecified language. *)
40+module Language_or_empty : Datatype.S
41+42+(** List of all language datatypes *)
43+val datatypes : Datatype.t list
···1+(** Link relationship type validation *)
2+3+(** Valid link relationship types *)
4+let valid_link_types =
5+ [
6+ "alternate";
7+ "author";
8+ "bookmark";
9+ "canonical";
10+ "dns-prefetch";
11+ "external";
12+ "help";
13+ "icon";
14+ "license";
15+ "manifest";
16+ "modulepreload";
17+ "next";
18+ "nofollow";
19+ "noopener";
20+ "noreferrer";
21+ "opener";
22+ "pingback";
23+ "preconnect";
24+ "prefetch";
25+ "preload";
26+ "prerender";
27+ "prev";
28+ "search";
29+ "stylesheet";
30+ "tag";
31+ ]
32+33+(** Validate a single link type *)
34+let validate_link_type s =
35+ let trimmed = Datatype.trim_html_spaces s in
36+ if trimmed = "" then Error "Link type must not be empty"
37+ else
38+ let lower = Datatype.string_to_ascii_lowercase trimmed in
39+ if List.mem lower valid_link_types then Ok ()
40+ else
41+ Error
42+ (Printf.sprintf
43+ "The value '%s' is not a valid link type. Valid link types are: %s"
44+ s (String.concat ", " valid_link_types))
45+46+module Link_type = struct
47+ let name = "link-type"
48+ let validate = validate_link_type
49+ let is_valid s = Result.is_ok (validate s)
50+end
51+52+(** Validate space-separated link types *)
53+let validate_link_types s =
54+ let trimmed = Datatype.trim_html_spaces s in
55+ if trimmed = "" then Error "Link types must not be empty"
56+ else
57+ (* Split on whitespace *)
58+ let types = String.split_on_char ' ' trimmed in
59+ let types = List.filter (fun t -> Datatype.trim_html_spaces t <> "") types in
60+ if types = [] then Error "Link types must contain at least one link type"
61+ else
62+ (* Validate each link type *)
63+ let rec check_types = function
64+ | [] -> Ok ()
65+ | t :: rest -> (
66+ match validate_link_type t with
67+ | Error e -> Error e
68+ | Ok () -> check_types rest)
69+ in
70+ check_types types
71+72+module Link_types = struct
73+ let name = "link-types"
74+ let validate = validate_link_types
75+ let is_valid s = Result.is_ok (validate s)
76+end
77+78+let datatypes =
79+ [ (module Link_type : Datatype.S); (module Link_types : Datatype.S) ]
···1+(** Link relationship type validators.
2+3+ This module provides validators for link relationship types used in rel
4+ attributes on a and link elements, as defined by the HTML5 specification. *)
5+6+(** Single link type validator.
7+8+ Validates a single link relationship type value. Valid link types include:
9+10+ - alternate: Alternate representation of the current document
11+ - author: Link to author information
12+ - bookmark: Permanent URL for nearest ancestor article
13+ - canonical: Preferred URL for the current document
14+ - dns-prefetch: Hint that the browser should prefetch DNS for the target
15+ - external: Link to a different website
16+ - help: Link to context-sensitive help
17+ - icon: Icon representing the current document
18+ - license: Copyright license for the current document
19+ - manifest: Web app manifest
20+ - modulepreload: Preload a JavaScript module
21+ - next: Next document in a sequence
22+ - nofollow: Do not follow this link for ranking
23+ - noopener: Do not grant window.opener access
24+ - noreferrer: Do not send Referer header
25+ - opener: Grant window.opener access
26+ - pingback: Pingback server address
27+ - preconnect: Hint to preconnect to target origin
28+ - prefetch: Hint to prefetch the target resource
29+ - preload: Preload a resource
30+ - prerender: Hint to prerender the target page
31+ - prev: Previous document in a sequence
32+ - search: Link to search tool for the current document
33+ - stylesheet: External stylesheet
34+ - tag: Tag (keyword) for the current document
35+36+ Examples:
37+ - "stylesheet"
38+ - "icon"
39+ - "preload"
40+41+ Link types are case-insensitive. *)
42+module Link_type : Datatype.S
43+44+(** Space-separated link types validator.
45+46+ Validates space-separated link relationship types. Multiple link types
47+ can be specified separated by ASCII whitespace.
48+49+ Examples:
50+ - "stylesheet"
51+ - "icon preload"
52+ - "nofollow noopener noreferrer"
53+54+ Each token must be a valid link type. Duplicate link types are allowed
55+ but not recommended. *)
56+module Link_types : Datatype.S
57+58+(** List of all datatypes defined in this module *)
59+val datatypes : Datatype.t list
+42
lib/html5_checker/datatype/dt_list_type.ml
···000000000000000000000000000000000000000000
···1+(** List type attribute validation based on HTML5 spec *)
2+3+(** Valid ol type values (case-sensitive) *)
4+let valid_ol_types = [ "1"; "a"; "A"; "i"; "I" ]
5+6+(** Valid ul type values (deprecated but still parsed) *)
7+let valid_ul_types = [ "disc"; "circle"; "square" ]
8+9+module Ol_type = struct
10+ let name = "ol-type"
11+12+ let validate s =
13+ (* Note: ol type is case-sensitive *)
14+ if List.mem s valid_ol_types then Ok ()
15+ else
16+ Error
17+ (Printf.sprintf
18+ "The value '%s' is not a valid ol type. Expected one of: %s."
19+ s
20+ (String.concat ", " valid_ol_types))
21+22+ let is_valid s = Result.is_ok (validate s)
23+end
24+25+module Ul_type = struct
26+ let name = "ul-type"
27+28+ let validate s =
29+ let s_lower = Datatype.string_to_ascii_lowercase s in
30+ if List.mem s_lower valid_ul_types then Ok ()
31+ else
32+ Error
33+ (Printf.sprintf
34+ "The value '%s' is not a valid ul type. Expected one of: %s."
35+ s
36+ (String.concat ", " valid_ul_types))
37+38+ let is_valid s = Result.is_ok (validate s)
39+end
40+41+let datatypes =
42+ [ (module Ol_type : Datatype.S); (module Ul_type : Datatype.S) ]
+44
lib/html5_checker/datatype/dt_list_type.mli
···00000000000000000000000000000000000000000000
···1+(** List type attribute datatype validators.
2+3+ This module provides validators for the type attribute used on
4+ ol and ul elements, as defined by the HTML5 specification. *)
5+6+(** Ordered list type attribute validator.
7+8+ Validates ol type attribute values which can be:
9+ - 1 - Decimal numbers (1, 2, 3, ...)
10+ - a - Lowercase Latin letters (a, b, c, ...)
11+ - A - Uppercase Latin letters (A, B, C, ...)
12+ - i - Lowercase Roman numerals (i, ii, iii, ...)
13+ - I - Uppercase Roman numerals (I, II, III, ...)
14+15+ Values are matched case-sensitively for ol type.
16+17+ Examples:
18+ - "1"
19+ - "a"
20+ - "A"
21+ - "i"
22+ - "I" *)
23+module Ol_type : Datatype.S
24+25+(** Unordered list type attribute validator.
26+27+ Validates ul type attribute values which can be:
28+ - disc - Filled circle (default)
29+ - circle - Hollow circle
30+ - square - Filled square
31+32+ Note: The type attribute on ul is deprecated in HTML5 but may still
33+ be parsed for backwards compatibility.
34+35+ Values are matched case-insensitively according to HTML5 spec.
36+37+ Examples:
38+ - "disc"
39+ - "circle"
40+ - "square" *)
41+module Ul_type : Datatype.S
42+43+(** List of all datatypes defined in this module *)
44+val datatypes : Datatype.t list
+20
lib/html5_checker/datatype/dt_loading.ml
···00000000000000000000
···1+(** Lazy loading attribute validation for HTML5 *)
2+3+module Loading = struct
4+ let name = "loading"
5+6+ let validate s =
7+ let s_lower = Datatype.string_to_ascii_lowercase s in
8+ match s_lower with
9+ | "" | "lazy" | "eager" -> Ok ()
10+ | _ ->
11+ Error
12+ (Printf.sprintf
13+ "The value '%s' is not a valid loading value. Expected empty \
14+ string, 'lazy', or 'eager'."
15+ s)
16+17+ let is_valid s = Result.is_ok (validate s)
18+end
19+20+let datatypes = [ (module Loading : Datatype.S) ]
+22
lib/html5_checker/datatype/dt_loading.mli
···0000000000000000000000
···1+(** Lazy loading attribute datatype validator.
2+3+ This module provides a validator for the loading attribute used to control
4+ lazy loading behavior for images and iframes, as defined by the HTML5 spec. *)
5+6+(** Loading attribute validator.
7+8+ Validates loading attribute values which can be:
9+ - "" (empty string, default loading behavior)
10+ - "lazy" (defer loading until needed)
11+ - "eager" (load immediately)
12+13+ Values are case-insensitive after ASCII lowercasing.
14+15+ Examples:
16+ - ""
17+ - "lazy"
18+ - "eager" *)
19+module Loading : Datatype.S
20+21+(** List of all datatypes defined in this module *)
22+val datatypes : Datatype.t list
···1+(** Media query validation - simplified implementation *)
2+3+(** Media types *)
4+let media_types =
5+ [
6+ "all";
7+ "screen";
8+ "print";
9+ "speech";
10+ "aural";
11+ "braille";
12+ "handheld";
13+ "projection";
14+ "tty";
15+ "tv";
16+ "embossed";
17+ ]
18+19+(** Media query keywords *)
20+let media_keywords = [ "and"; "or"; "not"; "only" ]
21+22+(** Check if character is whitespace *)
23+let is_whitespace c = c = ' ' || c = '\t' || c = '\n' || c = '\r'
24+25+(** Check if character can start an identifier *)
26+let is_ident_start c =
27+ (c >= 'a' && c <= 'z')
28+ || (c >= 'A' && c <= 'Z')
29+ || c = '_' || c = '-' || Char.code c >= 128
30+31+(** Check if character can be in an identifier *)
32+let is_ident_char c =
33+ is_ident_start c || (c >= '0' && c <= '9')
34+35+(** Check balanced parentheses *)
36+let check_balanced_parens s =
37+ let rec check depth i =
38+ if i >= String.length s then
39+ if depth = 0 then Ok ()
40+ else Error "Unbalanced parentheses: unclosed '('"
41+ else
42+ let c = s.[i] in
43+ match c with
44+ | '(' -> check (depth + 1) (i + 1)
45+ | ')' ->
46+ if depth = 0 then Error "Unbalanced parentheses: unexpected ')'"
47+ else check (depth - 1) (i + 1)
48+ | _ -> check depth (i + 1)
49+ in
50+ check 0 0
51+52+(** Extract words (identifiers and keywords) from media query *)
53+let extract_words s =
54+ let words = ref [] in
55+ let buf = Buffer.create 16 in
56+ let in_parens = ref 0 in
57+58+ for i = 0 to String.length s - 1 do
59+ let c = s.[i] in
60+ match c with
61+ | '(' ->
62+ if Buffer.length buf > 0 then (
63+ words := Buffer.contents buf :: !words;
64+ Buffer.clear buf);
65+ incr in_parens
66+ | ')' ->
67+ if Buffer.length buf > 0 then (
68+ words := Buffer.contents buf :: !words;
69+ Buffer.clear buf);
70+ decr in_parens
71+ | _ ->
72+ if !in_parens = 0 then
73+ if is_ident_char c then Buffer.add_char buf c
74+ else if is_whitespace c then
75+ if Buffer.length buf > 0 then (
76+ words := Buffer.contents buf :: !words;
77+ Buffer.clear buf)
78+ else ()
79+ else if Buffer.length buf > 0 then (
80+ words := Buffer.contents buf :: !words;
81+ Buffer.clear buf)
82+ done;
83+84+ if Buffer.length buf > 0 then words := Buffer.contents buf :: !words;
85+ List.rev !words
86+87+(** Validate media query structure *)
88+let validate_media_query s =
89+ let s = String.trim s in
90+ if String.length s = 0 then Error "Media query must not be empty"
91+ else
92+ (* Check balanced parentheses *)
93+ match check_balanced_parens s with
94+ | Error _ as e -> e
95+ | Ok () ->
96+ (* Extract and validate words *)
97+ let words = extract_words s in
98+ let words_lower = List.map String.lowercase_ascii words in
99+100+ (* Basic validation: check for invalid keyword combinations *)
101+ let rec validate_words prev = function
102+ | [] -> Ok ()
103+ | word :: rest -> (
104+ let word_lower = String.lowercase_ascii word in
105+ match (prev, word_lower) with
106+ | None, "and" | None, "or" ->
107+ Error
108+ (Printf.sprintf
109+ "Media query cannot start with keyword '%s'" word)
110+ | Some "and", "and" | Some "or", "or" | Some "not", "not" ->
111+ Error
112+ (Printf.sprintf "Consecutive '%s' keywords are not allowed"
113+ word)
114+ | Some "only", "only" ->
115+ Error "Consecutive 'only' keywords are not allowed"
116+ | _, _ -> validate_words (Some word_lower) rest)
117+ in
118+119+ (* Check if query contains valid media types or features *)
120+ let has_media_type =
121+ List.exists
122+ (fun w -> List.mem (String.lowercase_ascii w) media_types)
123+ words
124+ in
125+ let has_features = String.contains s '(' in
126+127+ if not (has_media_type || has_features) then
128+ (* Only keywords, no actual media type or features *)
129+ if List.for_all (fun w -> List.mem w media_keywords) words_lower then
130+ Error "Media query contains only keywords without media type or features"
131+ else Ok () (* Assume other identifiers are valid *)
132+ else validate_words None words
133+134+module Media_query = struct
135+ let name = "media query"
136+ let validate = validate_media_query
137+138+ let is_valid s =
139+ match validate s with
140+ | Ok () -> true
141+ | Error _ -> false
142+end
143+144+let datatypes = [ (module Media_query : Datatype.S) ]
+28
lib/html5_checker/datatype/dt_media_query.mli
···0000000000000000000000000000
···1+(** Media query datatype validator.
2+3+ This module provides a validator for CSS media queries as used in HTML5. *)
4+5+(** Media query validator.
6+7+ Validates CSS media queries used in media attributes and CSS @media rules.
8+9+ Examples:
10+ - "screen"
11+ - "print"
12+ - "(min-width: 600px)"
13+ - "screen and (color)"
14+ - "not screen and (color)"
15+ - "(min-width: 600px) and (max-width: 800px)"
16+17+ This is a simplified validator that checks:
18+ - Balanced parentheses
19+ - Basic media type keywords (all, screen, print, etc.)
20+ - Basic logical operators (and, or, not, only)
21+ - Valid feature queries in parentheses
22+23+ Note: This does not perform full CSS media query parsing. For production
24+ use, consider integrating with a full CSS parser. *)
25+module Media_query : Datatype.S
26+27+(** List of all datatypes defined in this module *)
28+val datatypes : Datatype.t list
···1+(** MIME type validation based on RFC 2045 and HTML5 spec *)
2+3+(** Check if character is whitespace *)
4+let is_whitespace c = c = ' ' || c = '\t' || c = '\n' || c = '\r'
5+6+(** Check if character is a token character (RFC 2045) *)
7+let is_token_char c =
8+ (c >= '\033' && c <= '\126')
9+ && not
10+ (c = '(' || c = ')' || c = '<' || c = '>' || c = '@' || c = ','
11+ || c = ';' || c = ':' || c = '\\' || c = '"' || c = '/' || c = '['
12+ || c = ']' || c = '?' || c = '=' || c = '{' || c = '}')
13+14+(** Check if character is valid in quoted string (qdtext) *)
15+let is_qdtext_char c =
16+ (c >= ' ' && c <= '\126') || c = '\n' || c = '\r' || c = '\t'
17+18+(** States for MIME type parser *)
19+type parse_state =
20+ | At_start
21+ | In_supertype
22+ | At_subtype_start
23+ | In_subtype
24+ | Semicolon_seen
25+ | Ws_before_semicolon
26+ | In_param_name
27+ | Equals_seen
28+ | In_quoted_string
29+ | In_unquoted_string
30+ | In_quoted_pair
31+ | Close_quote_seen
32+33+(** JavaScript MIME types that should not have parameters *)
34+let javascript_mime_types =
35+ [
36+ "application/ecmascript";
37+ "application/javascript";
38+ "application/x-ecmascript";
39+ "application/x-javascript";
40+ "text/ecmascript";
41+ "text/javascript";
42+ "text/javascript1.0";
43+ "text/javascript1.1";
44+ "text/javascript1.2";
45+ "text/javascript1.3";
46+ "text/javascript1.4";
47+ "text/javascript1.5";
48+ "text/jscript";
49+ "text/livescript";
50+ "text/x-ecmascript";
51+ "text/x-javascript";
52+ ]
53+54+(** Validate a single MIME type *)
55+let validate_mime_type s =
56+ let len = String.length s in
57+ let rec parse state i =
58+ if i >= len then
59+ (* End of string - check final state *)
60+ match state with
61+ | In_subtype | In_unquoted_string | Close_quote_seen -> Ok ()
62+ | At_start -> Error "Expected a MIME type but saw the empty string"
63+ | In_supertype | At_subtype_start -> Error "Subtype missing"
64+ | Equals_seen | In_param_name -> Error "Parameter value missing"
65+ | In_quoted_pair | In_quoted_string -> Error "Unfinished quoted string"
66+ | Semicolon_seen ->
67+ Error "Semicolon seen but there was no parameter following it"
68+ | Ws_before_semicolon -> Error "Extraneous trailing whitespace"
69+ else
70+ let c = s.[i] in
71+ match state with
72+ | At_start ->
73+ if is_token_char c then parse In_supertype (i + 1)
74+ else
75+ Error
76+ (Printf.sprintf
77+ "Expected a token character but saw '%c' instead" c)
78+ | In_supertype ->
79+ if is_token_char c then parse In_supertype (i + 1)
80+ else if c = '/' then parse At_subtype_start (i + 1)
81+ else
82+ Error
83+ (Printf.sprintf
84+ "Expected a token character or '/' but saw '%c' instead" c)
85+ | At_subtype_start ->
86+ if is_token_char c then parse In_subtype (i + 1)
87+ else
88+ Error
89+ (Printf.sprintf
90+ "Expected a token character but saw '%c' instead" c)
91+ | In_subtype ->
92+ if is_token_char c then parse In_subtype (i + 1)
93+ else if c = ';' then
94+ (* Check if this is a JavaScript MIME type *)
95+ let mime_type = String.sub s 0 i |> String.lowercase_ascii in
96+ if List.mem mime_type javascript_mime_types then
97+ Error
98+ "A JavaScript MIME type must not contain any characters after \
99+ the subtype"
100+ else parse Semicolon_seen (i + 1)
101+ else if is_whitespace c then parse Ws_before_semicolon (i + 1)
102+ else
103+ Error
104+ (Printf.sprintf
105+ "Expected a token character, whitespace or a semicolon but saw \
106+ '%c' instead"
107+ c)
108+ | Ws_before_semicolon ->
109+ if is_whitespace c then parse Ws_before_semicolon (i + 1)
110+ else if c = ';' then parse Semicolon_seen (i + 1)
111+ else
112+ Error
113+ (Printf.sprintf
114+ "Expected whitespace or a semicolon but saw '%c' instead" c)
115+ | Semicolon_seen ->
116+ if is_whitespace c then parse Semicolon_seen (i + 1)
117+ else if is_token_char c then parse In_param_name (i + 1)
118+ else
119+ Error
120+ (Printf.sprintf
121+ "Expected whitespace or a token character but saw '%c' instead"
122+ c)
123+ | In_param_name ->
124+ if is_token_char c then parse In_param_name (i + 1)
125+ else if c = '=' then parse Equals_seen (i + 1)
126+ else
127+ Error
128+ (Printf.sprintf "Expected a token character or '=' but saw '%c' instead"
129+ c)
130+ | Equals_seen ->
131+ if c = '"' then parse In_quoted_string (i + 1)
132+ else if is_token_char c then parse In_unquoted_string (i + 1)
133+ else
134+ Error
135+ (Printf.sprintf
136+ "Expected a double quote or a token character but saw '%c' \
137+ instead"
138+ c)
139+ | In_quoted_string ->
140+ if c = '\\' then parse In_quoted_pair (i + 1)
141+ else if c = '"' then parse Close_quote_seen (i + 1)
142+ else if is_qdtext_char c then parse In_quoted_string (i + 1)
143+ else
144+ Error
145+ (Printf.sprintf
146+ "Expected a non-control ASCII character but saw '%c' instead" c)
147+ | In_quoted_pair ->
148+ if Char.code c <= 127 then parse In_quoted_string (i + 1)
149+ else
150+ Error
151+ (Printf.sprintf "Expected an ASCII character but saw '%c' instead"
152+ c)
153+ | Close_quote_seen ->
154+ if c = ';' then parse Semicolon_seen (i + 1)
155+ else if is_whitespace c then parse Ws_before_semicolon (i + 1)
156+ else
157+ Error
158+ (Printf.sprintf
159+ "Expected a semicolon or whitespace but saw '%c' instead" c)
160+ | In_unquoted_string ->
161+ if is_token_char c then parse In_unquoted_string (i + 1)
162+ else if c = ';' then parse Semicolon_seen (i + 1)
163+ else if is_whitespace c then parse Ws_before_semicolon (i + 1)
164+ else
165+ Error
166+ (Printf.sprintf
167+ "Expected a token character, whitespace or a semicolon but saw \
168+ '%c' instead"
169+ c)
170+ in
171+ parse At_start 0
172+173+module Mime_type = struct
174+ let name = "MIME type"
175+ let validate = validate_mime_type
176+177+ let is_valid s =
178+ match validate s with
179+ | Ok () -> true
180+ | Error _ -> false
181+end
182+183+module Mime_type_list = struct
184+ let name = "MIME type list"
185+186+ let validate s =
187+ let s = String.trim s in
188+ if String.length s = 0 then Error "MIME type list must not be empty"
189+ else
190+ (* Split on commas and validate each MIME type *)
191+ let mime_types = String.split_on_char ',' s in
192+ let rec check_all = function
193+ | [] -> Ok ()
194+ | mime :: rest -> (
195+ let mime = String.trim mime in
196+ match validate_mime_type mime with
197+ | Ok () -> check_all rest
198+ | Error msg ->
199+ Error (Printf.sprintf "Invalid MIME type in list: %s" msg))
200+ in
201+ check_all mime_types
202+203+ let is_valid s =
204+ match validate s with
205+ | Ok () -> true
206+ | Error _ -> false
207+end
208+209+let datatypes =
210+ [ (module Mime_type : Datatype.S); (module Mime_type_list : Datatype.S) ]
+32
lib/html5_checker/datatype/dt_mime.mli
···00000000000000000000000000000000
···1+(** MIME type datatype validators.
2+3+ This module provides validators for MIME types (media types) as defined
4+ by RFC 2045 and used in HTML5. *)
5+6+(** MIME type validator.
7+8+ Validates a MIME type in the format: type/subtype[; parameters]
9+10+ Examples:
11+ - text/html
12+ - application/json
13+ - image/png
14+ - text/html; charset=utf-8
15+16+ Validation rules:
17+ - Must have a supertype (before /) and subtype (after /)
18+ - Supertype and subtype must be token characters
19+ - Optional semicolon-separated parameters
20+ - Parameters must be name=value pairs
21+ - Values can be quoted strings or tokens *)
22+module Mime_type : Datatype.S
23+24+(** MIME type list validator.
25+26+ Validates a comma-separated list of MIME types.
27+ Each MIME type in the list must be valid according to {!Mime_type} rules.
28+ This is used for the 'accept' attribute on input elements. *)
29+module Mime_type_list : Datatype.S
30+31+(** List of all datatypes defined in this module *)
32+val datatypes : Datatype.t list
+20
lib/html5_checker/datatype/dt_popover.ml
···00000000000000000000
···1+(** Popover attribute validation for HTML5 *)
2+3+module Popover = struct
4+ let name = "popover"
5+6+ let validate s =
7+ let s_lower = Datatype.string_to_ascii_lowercase s in
8+ match s_lower with
9+ | "" | "auto" | "manual" -> Ok ()
10+ | _ ->
11+ Error
12+ (Printf.sprintf
13+ "The value '%s' is not a valid popover value. Expected 'auto', \
14+ 'manual', or empty string."
15+ s)
16+17+ let is_valid s = Result.is_ok (validate s)
18+end
19+20+let datatypes = [ (module Popover : Datatype.S) ]
+22
lib/html5_checker/datatype/dt_popover.mli
···0000000000000000000000
···1+(** Popover attribute datatype validator for HTML5.
2+3+ This module provides a validator for the popover attribute, as defined by
4+ the HTML5 specification. *)
5+6+(** Popover attribute validator.
7+8+ Validates popover attribute values which can be:
9+ - "auto" - the popover can be light-dismissed (closed by clicking outside)
10+ - "manual" - the popover must be explicitly closed
11+ - "" (empty string) - equivalent to "auto"
12+13+ Values are case-insensitive.
14+15+ Examples:
16+ - "auto"
17+ - "manual"
18+ - "" *)
19+module Popover : Datatype.S
20+21+(** List of all datatypes defined in this module *)
22+val datatypes : Datatype.t list
+22
lib/html5_checker/datatype/dt_preload.ml
···0000000000000000000000
···1+(** Media preload attribute validation based on HTML5 spec *)
2+3+(** Valid preload values *)
4+let valid_preloads = [ "none"; "metadata"; "auto"; "" ]
5+6+module Preload = struct
7+ let name = "preload"
8+9+ let validate s =
10+ let s_lower = Datatype.string_to_ascii_lowercase s in
11+ if List.mem s_lower valid_preloads then Ok ()
12+ else
13+ Error
14+ (Printf.sprintf
15+ "The value '%s' is not a valid preload value. Expected one of: \
16+ 'none', 'metadata', 'auto', or empty string."
17+ s)
18+19+ let is_valid s = Result.is_ok (validate s)
20+end
21+22+let datatypes = [ (module Preload : Datatype.S) ]
+24
lib/html5_checker/datatype/dt_preload.mli
···000000000000000000000000
···1+(** Media preload attribute datatype validator.
2+3+ This module provides a validator for the preload attribute used on
4+ audio and video elements, as defined by the HTML5 specification. *)
5+6+(** Media preload attribute validator.
7+8+ Validates media preload attribute values which can be:
9+ - none - No preloading (only load metadata when user starts playback)
10+ - metadata - Preload metadata only (dimensions, duration, etc.)
11+ - auto - Preload the entire resource if possible
12+ - "" (empty string) - Equivalent to auto
13+14+ Values are matched case-insensitively according to HTML5 spec.
15+16+ Examples:
17+ - "none"
18+ - "metadata"
19+ - "auto"
20+ - "" *)
21+module Preload : Datatype.S
22+23+(** List of all datatypes defined in this module *)
24+val datatypes : Datatype.t list
+32
lib/html5_checker/datatype/dt_referrer.ml
···00000000000000000000000000000000
···1+(** Referrer policy attribute validation for HTML5 *)
2+3+module Referrer_policy = struct
4+ let name = "referrerpolicy"
5+6+ let validate s =
7+ let s_lower = Datatype.string_to_ascii_lowercase s in
8+ match s_lower with
9+ | ""
10+ | "no-referrer"
11+ | "no-referrer-when-downgrade"
12+ | "origin"
13+ | "origin-when-cross-origin"
14+ | "same-origin"
15+ | "strict-origin"
16+ | "strict-origin-when-cross-origin"
17+ | "unsafe-url" ->
18+ Ok ()
19+ | _ ->
20+ Error
21+ (Printf.sprintf
22+ "The value '%s' is not a valid referrerpolicy value. Expected \
23+ one of: empty string, 'no-referrer', \
24+ 'no-referrer-when-downgrade', 'origin', \
25+ 'origin-when-cross-origin', 'same-origin', 'strict-origin', \
26+ 'strict-origin-when-cross-origin', or 'unsafe-url'."
27+ s)
28+29+ let is_valid s = Result.is_ok (validate s)
30+end
31+32+let datatypes = [ (module Referrer_policy : Datatype.S) ]
+29
lib/html5_checker/datatype/dt_referrer.mli
···00000000000000000000000000000
···1+(** Referrer policy attribute datatype validator.
2+3+ This module provides a validator for the referrerpolicy attribute used to
4+ control referrer information sent with requests, as defined by the HTML5 spec. *)
5+6+(** Referrer policy attribute validator.
7+8+ Validates referrerpolicy attribute values which can be:
9+ - "" (empty string, uses default policy)
10+ - "no-referrer" (never send referrer)
11+ - "no-referrer-when-downgrade" (send referrer to same security level)
12+ - "origin" (send origin only)
13+ - "origin-when-cross-origin" (full URL for same-origin, origin for cross-origin)
14+ - "same-origin" (send referrer for same-origin only)
15+ - "strict-origin" (send origin for same security level)
16+ - "strict-origin-when-cross-origin" (full URL same-origin, origin cross-origin same security)
17+ - "unsafe-url" (always send full URL)
18+19+ Values are case-insensitive after ASCII lowercasing.
20+21+ Examples:
22+ - ""
23+ - "no-referrer"
24+ - "origin"
25+ - "strict-origin-when-cross-origin" *)
26+module Referrer_policy : Datatype.S
27+28+(** List of all datatypes defined in this module *)
29+val datatypes : Datatype.t list
···1+(** Sandbox tokens validation *)
2+3+(** Valid sandbox tokens (case-sensitive) *)
4+let valid_sandbox_tokens =
5+ [
6+ "allow-downloads";
7+ "allow-forms";
8+ "allow-modals";
9+ "allow-orientation-lock";
10+ "allow-pointer-lock";
11+ "allow-popups";
12+ "allow-popups-to-escape-sandbox";
13+ "allow-presentation";
14+ "allow-same-origin";
15+ "allow-scripts";
16+ "allow-top-navigation";
17+ "allow-top-navigation-by-user-activation";
18+ "allow-top-navigation-to-custom-protocols";
19+ ]
20+21+(** Validate sandbox attribute value *)
22+let validate_sandbox s =
23+ let trimmed = Datatype.trim_html_spaces s in
24+ (* Empty value is valid (maximum restrictions) *)
25+ if trimmed = "" then Ok ()
26+ else
27+ (* Split on whitespace *)
28+ let tokens = String.split_on_char ' ' trimmed in
29+ let tokens = List.filter (fun t -> Datatype.trim_html_spaces t <> "") tokens in
30+ if tokens = [] then Ok () (* All whitespace is like empty *)
31+ else
32+ (* Validate each token *)
33+ let rec check_tokens = function
34+ | [] -> Ok ()
35+ | token :: rest ->
36+ (* Sandbox tokens are case-sensitive *)
37+ if List.mem token valid_sandbox_tokens then check_tokens rest
38+ else
39+ Error
40+ (Printf.sprintf
41+ "The value '%s' is not a valid sandbox token. Valid tokens \
42+ are: %s"
43+ token (String.concat ", " valid_sandbox_tokens))
44+ in
45+ check_tokens tokens
46+47+module Sandbox = struct
48+ let name = "sandbox"
49+ let validate = validate_sandbox
50+ let is_valid s = Result.is_ok (validate s)
51+end
52+53+let datatypes = [ (module Sandbox : Datatype.S) ]
+37
lib/html5_checker/datatype/dt_sandbox.mli
···0000000000000000000000000000000000000
···1+(** Sandbox tokens validator.
2+3+ This module provides a validator for the sandbox attribute used on iframe
4+ elements, as defined by the HTML5 specification. *)
5+6+(** Sandbox attribute validator.
7+8+ Validates sandbox attribute values which contain space-separated sandbox
9+ tokens. Each token enables a specific capability for the sandboxed iframe.
10+11+ Valid tokens:
12+ - allow-downloads: Allow downloads
13+ - allow-forms: Allow form submission
14+ - allow-modals: Allow modal dialogs (alert, confirm, etc.)
15+ - allow-orientation-lock: Allow orientation lock
16+ - allow-pointer-lock: Allow pointer lock
17+ - allow-popups: Allow popups (window.open, target="_blank", etc.)
18+ - allow-popups-to-escape-sandbox: Allow popups that don't inherit sandboxing
19+ - allow-presentation: Allow presentation sessions
20+ - allow-same-origin: Allow same-origin access
21+ - allow-scripts: Allow script execution
22+ - allow-top-navigation: Allow navigating top-level browsing context
23+ - allow-top-navigation-by-user-activation: Allow top navigation with user gesture
24+ - allow-top-navigation-to-custom-protocols: Allow top navigation to custom protocols
25+26+ Examples:
27+ - "" (empty = maximum restrictions)
28+ - "allow-scripts"
29+ - "allow-same-origin allow-scripts"
30+ - "allow-forms allow-popups allow-scripts"
31+32+ Tokens are case-sensitive and must match exactly. Duplicate tokens are
33+ allowed but redundant. An empty value means maximum sandbox restrictions. *)
34+module Sandbox : Datatype.S
35+36+(** List of all datatypes defined in this module *)
37+val datatypes : Datatype.t list
+22
lib/html5_checker/datatype/dt_scope.ml
···0000000000000000000000
···1+(** Table header scope attribute validation based on HTML5 spec *)
2+3+(** Valid scope values *)
4+let valid_scopes = [ "row"; "col"; "rowgroup"; "colgroup" ]
5+6+module Scope = struct
7+ let name = "scope"
8+9+ let validate s =
10+ let s_lower = Datatype.string_to_ascii_lowercase s in
11+ if List.mem s_lower valid_scopes then Ok ()
12+ else
13+ Error
14+ (Printf.sprintf
15+ "The value '%s' is not a valid scope value. Expected one of: %s."
16+ s
17+ (String.concat ", " valid_scopes))
18+19+ let is_valid s = Result.is_ok (validate s)
20+end
21+22+let datatypes = [ (module Scope : Datatype.S) ]
+24
lib/html5_checker/datatype/dt_scope.mli
···000000000000000000000000
···1+(** Table header scope attribute datatype validator.
2+3+ This module provides a validator for the scope attribute used on
4+ th elements, as defined by the HTML5 specification. *)
5+6+(** Table header scope attribute validator.
7+8+ Validates th scope attribute values which can be:
9+ - row - Header cell applies to some of the subsequent cells in the same row(s)
10+ - col - Header cell applies to some of the subsequent cells in the same column(s)
11+ - rowgroup - Header cell applies to all remaining cells in the row group
12+ - colgroup - Header cell applies to all remaining cells in the column group
13+14+ Values are matched case-insensitively according to HTML5 spec.
15+16+ Examples:
17+ - "row"
18+ - "col"
19+ - "rowgroup"
20+ - "colgroup" *)
21+module Scope : Datatype.S
22+23+(** List of all datatypes defined in this module *)
24+val datatypes : Datatype.t list
+20
lib/html5_checker/datatype/dt_shape.ml
···00000000000000000000
···1+(** Shape attribute validation for HTML5 *)
2+3+module Shape = struct
4+ let name = "shape"
5+6+ let validate s =
7+ let s_lower = Datatype.string_to_ascii_lowercase s in
8+ match s_lower with
9+ | "default" | "rect" | "circle" | "poly" -> Ok ()
10+ | _ ->
11+ Error
12+ (Printf.sprintf
13+ "The value '%s' is not a valid shape. Expected 'default', 'rect', \
14+ 'circle', or 'poly'."
15+ s)
16+17+ let is_valid s = Result.is_ok (validate s)
18+end
19+20+let datatypes = [ (module Shape : Datatype.S) ]
+24
lib/html5_checker/datatype/dt_shape.mli
···000000000000000000000000
···1+(** Shape attribute datatype validator for HTML5.
2+3+ This module provides a validator for the shape attribute used on area
4+ elements within image maps, as defined by the HTML5 specification. *)
5+6+(** Shape attribute validator.
7+8+ Validates shape attribute values which can be:
9+ - "default" - entire region
10+ - "rect" - rectangular region
11+ - "circle" - circular region
12+ - "poly" - polygonal region
13+14+ Values are case-insensitive.
15+16+ Examples:
17+ - "rect"
18+ - "circle"
19+ - "poly"
20+ - "default" *)
21+module Shape : Datatype.S
22+23+(** List of all datatypes defined in this module *)
24+val datatypes : Datatype.t list
+20
lib/html5_checker/datatype/dt_spellcheck.ml
···00000000000000000000
···1+(** Spellcheck attribute validation for HTML5 *)
2+3+module Spellcheck = struct
4+ let name = "spellcheck"
5+6+ let validate s =
7+ let s_lower = Datatype.string_to_ascii_lowercase s in
8+ match s_lower with
9+ | "" | "true" | "false" -> Ok ()
10+ | _ ->
11+ Error
12+ (Printf.sprintf
13+ "The value '%s' is not a valid spellcheck value. Expected 'true', \
14+ 'false', or empty string."
15+ s)
16+17+ let is_valid s = Result.is_ok (validate s)
18+end
19+20+let datatypes = [ (module Spellcheck : Datatype.S) ]
+22
lib/html5_checker/datatype/dt_spellcheck.mli
···0000000000000000000000
···1+(** Spellcheck attribute datatype validator for HTML5.
2+3+ This module provides a validator for the spellcheck attribute, as defined by
4+ the HTML5 specification. *)
5+6+(** Spellcheck attribute validator.
7+8+ Validates spellcheck attribute values which can be:
9+ - "true" - spelling and grammar checking is enabled
10+ - "false" - spelling and grammar checking is disabled
11+ - "" (empty string) - default behavior (typically inherits from parent)
12+13+ Values are case-insensitive.
14+15+ Examples:
16+ - "true"
17+ - "false"
18+ - "" *)
19+module Spellcheck : Datatype.S
20+21+(** List of all datatypes defined in this module *)
22+val datatypes : Datatype.t list
···1+(** Image source set and sizes attribute validation *)
2+3+(** Split string on commas, preserving parentheses groups *)
4+let split_on_commas s =
5+ let len = String.length s in
6+ let rec find_splits paren_depth start i acc =
7+ if i >= len then
8+ if start < len then List.rev (String.sub s start (len - start) :: acc)
9+ else List.rev acc
10+ else
11+ match s.[i] with
12+ | '(' -> find_splits (paren_depth + 1) start (i + 1) acc
13+ | ')' -> find_splits (max 0 (paren_depth - 1)) start (i + 1) acc
14+ | ',' when paren_depth = 0 ->
15+ let part = String.sub s start (i - start) in
16+ find_splits 0 (i + 1) (i + 1) (part :: acc)
17+ | _ -> find_splits paren_depth start (i + 1) acc
18+ in
19+ find_splits 0 0 0 []
20+21+(** Parse a descriptor (width or pixel density) *)
22+let parse_descriptor s =
23+ let trimmed = Datatype.trim_html_spaces s in
24+ let len = String.length trimmed in
25+ if len < 2 then None
26+ else
27+ let suffix = trimmed.[len - 1] in
28+ let num_part = String.sub trimmed 0 (len - 1) in
29+ match suffix with
30+ | 'w' ->
31+ (try
32+ let n = int_of_string num_part in
33+ if n > 0 then Some (`Width n) else None
34+ with _ -> None)
35+ | 'x' ->
36+ (try
37+ let f = float_of_string num_part in
38+ if f > 0.0 then Some (`Density f) else None
39+ with _ -> None)
40+ | _ -> None
41+42+(** Validate a single image candidate *)
43+let validate_image_candidate s =
44+ let trimmed = Datatype.trim_html_spaces s in
45+ if trimmed = "" then Error "Image candidate must not be empty"
46+ else
47+ (* Split on whitespace to get URL and optional descriptor *)
48+ let parts = String.split_on_char ' ' trimmed in
49+ let parts = List.filter (fun p -> Datatype.trim_html_spaces p <> "") parts in
50+ match parts with
51+ | [] -> Error "Image candidate must not be empty"
52+ | [ _url ] -> Ok None (* Just URL, no descriptor *)
53+ | [ _url; desc ] -> (
54+ match parse_descriptor desc with
55+ | Some d -> Ok (Some d)
56+ | None ->
57+ Error
58+ (Printf.sprintf
59+ "Invalid descriptor '%s'. Must be a positive integer followed \
60+ by 'w' or a positive number followed by 'x'"
61+ desc))
62+ | _ ->
63+ Error
64+ "Image candidate must be a URL optionally followed by one descriptor"
65+66+(** Validate srcset value *)
67+let validate_srcset s =
68+ let trimmed = Datatype.trim_html_spaces s in
69+ if trimmed = "" then Error "Srcset must not be empty" else
70+ let candidates = split_on_commas trimmed in
71+ let candidates = List.filter (fun c -> Datatype.trim_html_spaces c <> "") candidates in
72+ if candidates = [] then Error "Srcset must contain at least one image candidate"
73+ else
74+ (* Validate each candidate and check for descriptor type consistency *)
75+ let rec check_candidates has_width has_density = function
76+ | [] -> Ok ()
77+ | cand :: rest -> (
78+ match validate_image_candidate cand with
79+ | Error e -> Error e
80+ | Ok None -> check_candidates has_width has_density rest
81+ | Ok (Some (`Width _)) ->
82+ if has_density then
83+ Error
84+ "Cannot mix width descriptors (w) and pixel density \
85+ descriptors (x) in the same srcset"
86+ else check_candidates true has_density rest
87+ | Ok (Some (`Density _)) ->
88+ if has_width then
89+ Error
90+ "Cannot mix width descriptors (w) and pixel density \
91+ descriptors (x) in the same srcset"
92+ else check_candidates has_width true rest)
93+ in
94+ check_candidates false false candidates
95+96+module Srcset = struct
97+ let name = "srcset"
98+ let validate = validate_srcset
99+ let is_valid s = Result.is_ok (validate s)
100+end
101+102+(** Validate sizes attribute *)
103+let validate_sizes s =
104+ let trimmed = Datatype.trim_html_spaces s in
105+ if trimmed = "" then Error "Sizes attribute must not be empty"
106+ else
107+ (* Split on commas *)
108+ let entries = split_on_commas trimmed in
109+ let entries = List.filter (fun e -> Datatype.trim_html_spaces e <> "") entries in
110+ if entries = [] then Error "Sizes attribute must contain at least one entry"
111+ else
112+ (* Each entry except the last should have a media condition
113+ The last entry is just a size value
114+ We do basic validation here *)
115+ let rec check_entries = function
116+ | [] -> Ok ()
117+ | [ _last ] ->
118+ (* Last entry - just a size value, accept anything non-empty *)
119+ Ok ()
120+ | entry :: rest ->
121+ let entry_trimmed = Datatype.trim_html_spaces entry in
122+ (* Check if it looks like it has a media condition (starts with '(') *)
123+ if String.length entry_trimmed = 0 then
124+ Error "Size entry must not be empty"
125+ else if entry_trimmed.[0] <> '(' then
126+ Error
127+ (Printf.sprintf
128+ "Size entry '%s' should start with a media condition in \
129+ parentheses"
130+ entry_trimmed)
131+ else check_entries rest
132+ in
133+ check_entries entries
134+135+module Sizes = struct
136+ let name = "sizes"
137+ let validate = validate_sizes
138+ let is_valid s = Result.is_ok (validate s)
139+end
140+141+let datatypes =
142+ [ (module Srcset : Datatype.S); (module Sizes : Datatype.S) ]
+44
lib/html5_checker/datatype/dt_srcset.mli
···00000000000000000000000000000000000000000000
···1+(** Image source set and sizes attribute validators.
2+3+ This module provides validators for srcset and sizes attributes used on
4+ img and source elements, as defined by the HTML5 specification. *)
5+6+(** Srcset attribute validator.
7+8+ Validates srcset attribute values which contain comma-separated image
9+ candidates. Each image candidate consists of:
10+ - A URL
11+ - Optional whitespace followed by a width descriptor (e.g., "100w") or
12+ pixel density descriptor (e.g., "2x")
13+14+ Examples:
15+ - "image.jpg"
16+ - "image.jpg 1x"
17+ - "image-320.jpg 320w, image-640.jpg 640w"
18+ - "image-1x.jpg 1x, image-2x.jpg 2x"
19+20+ Width descriptors must be positive integers followed by 'w'.
21+ Pixel density descriptors must be positive numbers followed by 'x'.
22+ Cannot mix width and density descriptors in the same srcset. *)
23+module Srcset : Datatype.S
24+25+(** Sizes attribute validator.
26+27+ Validates sizes attribute values which contain comma-separated source size
28+ entries. Each entry (except the last) consists of:
29+ - A media condition
30+ - Whitespace
31+ - A source size value (length or "auto")
32+33+ The last entry is just a source size value (without media condition).
34+35+ Examples:
36+ - "100vw"
37+ - "(max-width: 600px) 100vw, 50vw"
38+ - "(min-width: 800px) 800px, 100vw"
39+40+ This validator performs basic syntax checking. *)
41+module Sizes : Datatype.S
42+43+(** List of all datatypes defined in this module *)
44+val datatypes : Datatype.t list
+42
lib/html5_checker/datatype/dt_target.ml
···000000000000000000000000000000000000000000
···1+(** Browsing context and target attribute validation *)
2+3+(** Valid special target keywords (case-insensitive) *)
4+let special_keywords = [ "_blank"; "_self"; "_parent"; "_top" ]
5+6+(** Validate a browsing context name *)
7+let validate_browsing_context s =
8+ if String.length s = 0 then Error "Browsing context name must not be empty"
9+ else if s.[0] = '_' then
10+ (* If starts with underscore, must be a special keyword *)
11+ let lower = Datatype.string_to_ascii_lowercase s in
12+ if List.mem lower special_keywords then Ok ()
13+ else
14+ Error
15+ (Printf.sprintf
16+ "Browsing context name '%s' starts with underscore but is not one \
17+ of the special keywords: %s"
18+ s (String.concat ", " special_keywords))
19+ else Ok ()
20+21+module Browsing_context = struct
22+ let name = "browsing-context"
23+ let validate = validate_browsing_context
24+ let is_valid s = Result.is_ok (validate s)
25+end
26+27+(** Validate a target attribute value
28+ (For now, this is the same as browsing context validation) *)
29+let validate_target s =
30+ if String.length s = 0 then Error "Target attribute must not be empty"
31+ else validate_browsing_context s
32+33+module Target = struct
34+ let name = "target"
35+ let validate = validate_target
36+ let is_valid s = Result.is_ok (validate s)
37+end
38+39+let datatypes =
40+ [
41+ (module Target : Datatype.S); (module Browsing_context : Datatype.S);
42+ ]
···1+(** Browsing context and target attribute validators.
2+3+ This module provides validators for browsing context names and target
4+ attributes used on a, area, base, and form elements, as defined by the
5+ HTML5 specification. *)
6+7+(** Target attribute validator.
8+9+ Validates target attribute values which specify where to display linked
10+ content or form responses. Valid values include:
11+12+ Special keywords (case-insensitive):
13+ - _blank: New window or tab
14+ - _self: Same frame (default)
15+ - _parent: Parent frame
16+ - _top: Top-level window
17+18+ Or a valid browsing context name:
19+ - Non-empty string
20+ - If starts with underscore, must be one of the special keywords above
21+22+ Examples:
23+ - "_blank"
24+ - "_self"
25+ - "myframe"
26+ - "content-frame" *)
27+module Target : Datatype.S
28+29+(** Browsing context name validator.
30+31+ Validates browsing context names used to identify frames and windows.
32+ A valid browsing context name is:
33+ - A non-empty string
34+ - If it starts with an underscore (_), it must be one of the special
35+ keywords: _blank, _self, _parent, or _top (case-insensitive)
36+ - Otherwise, any non-empty string is valid
37+38+ Examples:
39+ - "myframe"
40+ - "content"
41+ - "navigation-frame"
42+ - "_blank" (special keyword)
43+44+ Invalid examples:
45+ - "" (empty string)
46+ - "_custom" (underscore prefix but not a special keyword) *)
47+module Browsing_context : Datatype.S
48+49+(** List of all datatypes defined in this module *)
50+val datatypes : Datatype.t list
+20
lib/html5_checker/datatype/dt_translate.ml
···00000000000000000000
···1+(** Translate attribute validation for HTML5 *)
2+3+module Translate = struct
4+ let name = "translate"
5+6+ let validate s =
7+ let s_lower = Datatype.string_to_ascii_lowercase s in
8+ match s_lower with
9+ | "" | "yes" | "no" -> Ok ()
10+ | _ ->
11+ Error
12+ (Printf.sprintf
13+ "The value '%s' is not a valid translate value. Expected 'yes', \
14+ 'no', or empty string."
15+ s)
16+17+ let is_valid s = Result.is_ok (validate s)
18+end
19+20+let datatypes = [ (module Translate : Datatype.S) ]
+22
lib/html5_checker/datatype/dt_translate.mli
···0000000000000000000000
···1+(** Translate attribute datatype validator for HTML5.
2+3+ This module provides a validator for the translate attribute, as defined by
4+ the HTML5 specification. *)
5+6+(** Translate attribute validator.
7+8+ Validates translate attribute values which can be:
9+ - "yes" - the element should be translated
10+ - "no" - the element should not be translated
11+ - "" (empty string) - equivalent to "yes"
12+13+ Values are case-insensitive.
14+15+ Examples:
16+ - "yes"
17+ - "no"
18+ - "" *)
19+module Translate : Datatype.S
20+21+(** List of all datatypes defined in this module *)
22+val datatypes : Datatype.t list
···1+(** URL and IRI datatype validators for HTML5. *)
2+3+(** Check if a character is valid in a URL scheme name.
4+ Scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." ) *)
5+let is_scheme_char_initial = function 'a' .. 'z' | 'A' .. 'Z' -> true | _ -> false
6+7+let is_scheme_char_subsequent = function
8+ | 'a' .. 'z' | 'A' .. 'Z' | '0' .. '9' | '+' | '-' | '.' -> true
9+ | _ -> false
10+11+(** Split a URL into scheme and remainder.
12+ Returns Some (scheme, rest) if a valid scheme is found, None otherwise.
13+ Skips leading HTML whitespace before checking for scheme. *)
14+let split_scheme s =
15+ let len = String.length s in
16+ (* Skip leading HTML whitespace *)
17+ let rec skip_whitespace i =
18+ if i >= len then len
19+ else if Datatype.is_whitespace s.[i] then skip_whitespace (i + 1)
20+ else i
21+ in
22+ let start = skip_whitespace 0 in
23+ if start >= len then None
24+ else if not (is_scheme_char_initial s.[start]) then None
25+ else
26+ (* Look for scheme *)
27+ let rec find_colon i =
28+ if i >= len then None
29+ else
30+ match s.[i] with
31+ | ':' ->
32+ let scheme =
33+ String.sub s start (i - start) |> Datatype.string_to_ascii_lowercase
34+ in
35+ let rest = String.sub s (i + 1) (len - i - 1) in
36+ Some (scheme, rest)
37+ | c when is_scheme_char_subsequent c -> find_colon (i + 1)
38+ | _ -> None
39+ in
40+ find_colon (start + 1)
41+42+(** Check if a scheme is well-known (http, https, ftp, mailto, file) *)
43+let is_well_known_scheme = function
44+ | "http" | "https" | "ftp" | "mailto" | "file" -> true
45+ | _ -> false
46+47+module Url : Datatype.S = struct
48+ let name = "URL"
49+50+ let validate s =
51+ let trimmed = Datatype.trim_html_spaces s in
52+ if trimmed = "" then Error "Must be non-empty."
53+ else
54+ (* Basic validation - check for control characters *)
55+ let len = String.length s in
56+ let rec check_chars i =
57+ if i >= len then Ok ()
58+ else
59+ match s.[i] with
60+ | '\x00' .. '\x1F' | '\x7F' ->
61+ Error "URLs must not contain control characters."
62+ | _ -> check_chars (i + 1)
63+ in
64+ check_chars 0
65+66+ let is_valid s = Result.is_ok (validate s)
67+end
68+69+module Url_potentially_empty : Datatype.S = struct
70+ let name = "URL (potentially empty)"
71+72+ let validate s =
73+ let trimmed = Datatype.trim_html_spaces s in
74+ if trimmed = "" then Ok ()
75+ else
76+ (* Use same validation as Url for non-empty values *)
77+ Url.validate s
78+79+ let is_valid s = Result.is_ok (validate s)
80+end
81+82+module Url_absolute : Datatype.S = struct
83+ let name = "absolute URL"
84+85+ let validate s =
86+ let trimmed = Datatype.trim_html_spaces s in
87+ if trimmed = "" then Error "Must be non-empty."
88+ else
89+ match split_scheme s with
90+ | None ->
91+ Error (Printf.sprintf "The string \"%s\" is not an absolute URL." s)
92+ | Some (scheme, _rest) ->
93+ if is_well_known_scheme scheme || String.length scheme > 0 then
94+ (* For well-known schemes, we could do more validation,
95+ but for now we just check that it has a scheme *)
96+ Ok ()
97+ else Error "The string is not an absolute URL."
98+99+ let is_valid s = Result.is_ok (validate s)
100+end
101+102+module Iri : Datatype.S = struct
103+ let name = "absolute URL"
104+105+ (* IRI validation is the same as absolute URL validation *)
106+ let validate = Url_absolute.validate
107+ let is_valid s = Result.is_ok (validate s)
108+end
109+110+module Iri_ref : Datatype.S = struct
111+ let name = "URL"
112+113+ (* IRI reference validation is the same as URL validation *)
114+ let validate = Url.validate
115+ let is_valid s = Result.is_ok (validate s)
116+end
117+118+let datatypes =
119+ [ (module Url : Datatype.S)
120+ ; (module Url_potentially_empty : Datatype.S)
121+ ; (module Url_absolute : Datatype.S)
122+ ; (module Iri : Datatype.S)
123+ ; (module Iri_ref : Datatype.S)
124+ ]
···1+(** URL and IRI datatype validators for HTML5.
2+3+ This module provides validators for URLs and Internationalized Resource
4+ Identifiers (IRIs) based on the Nu HTML Checker's implementation.
5+6+ The validators perform basic structural validation. Full URL parsing and
7+ validation according to WHATWG URL spec would require a complete URL parser. *)
8+9+(** URL validator (IriRef in Nu validator).
10+11+ Validates URL references which can be either absolute or relative URLs.
12+ Basic validation ensures:
13+ - Non-empty after trimming HTML whitespace
14+ - No control characters
15+ - Basic structural correctness
16+17+ This corresponds to the general URL/IRI reference type. *)
18+module Url : Datatype.S
19+20+(** URL validator that allows empty values.
21+22+ Same as Url but permits empty strings. This is used for optional URL
23+ attributes. *)
24+module Url_potentially_empty : Datatype.S
25+26+(** Absolute URL validator (Iri in Nu validator).
27+28+ Validates that a URL is absolute (has a scheme). An absolute URL must:
29+ - Start with a valid scheme (e.g., http:, https:, ftp:, mailto:)
30+ - Not be empty
31+ - Follow URL structure rules
32+33+ Scheme format: ALPHA *( ALPHA / DIGIT / "+" / "-" / "." ) *)
34+module Url_absolute : Datatype.S
35+36+(** IRI (Internationalized Resource Identifier) validator.
37+38+ Alias for Url_absolute. IRIs are the internationalized version of URIs,
39+ allowing Unicode characters. *)
40+module Iri : Datatype.S
41+42+(** IRI reference validator.
43+44+ Alias for Url. IRI references can be relative or absolute. *)
45+module Iri_ref : Datatype.S
46+47+(** List of all URL/IRI-related datatypes for registration. *)
48+val datatypes : Datatype.t list
+22
lib/html5_checker/datatype/dt_wrap.ml
···0000000000000000000000
···1+(** Textarea wrap attribute validation based on HTML5 spec *)
2+3+(** Valid wrap values *)
4+let valid_wraps = [ "soft"; "hard" ]
5+6+module Wrap = struct
7+ let name = "wrap"
8+9+ let validate s =
10+ let s_lower = Datatype.string_to_ascii_lowercase s in
11+ if List.mem s_lower valid_wraps then Ok ()
12+ else
13+ Error
14+ (Printf.sprintf
15+ "The value '%s' is not a valid wrap value. Expected one of: %s."
16+ s
17+ (String.concat ", " valid_wraps))
18+19+ let is_valid s = Result.is_ok (validate s)
20+end
21+22+let datatypes = [ (module Wrap : Datatype.S) ]
+20
lib/html5_checker/datatype/dt_wrap.mli
···00000000000000000000
···1+(** Textarea wrap attribute datatype validator.
2+3+ This module provides a validator for the wrap attribute used on
4+ textarea elements, as defined by the HTML5 specification. *)
5+6+(** Textarea wrap attribute validator.
7+8+ Validates textarea wrap attribute values which can be:
9+ - soft - Soft wrapping (line breaks not submitted, default)
10+ - hard - Hard wrapping (line breaks are submitted)
11+12+ Values are matched case-insensitively according to HTML5 spec.
13+14+ Examples:
15+ - "soft"
16+ - "hard" *)
17+module Wrap : Datatype.S
18+19+(** List of all datatypes defined in this module *)
20+val datatypes : Datatype.t list
···1+(** DOM tree traversal for HTML5 conformance checking.
2+3+ This module provides functions to traverse DOM trees and apply checkers
4+ to validate HTML5 documents. It implements a depth-first, in-order
5+ traversal that visits every node in the tree and notifies checkers
6+ of traversal events.
7+8+ {2 Traversal Model}
9+10+ The walker follows a SAX-like event model, emitting events as it
11+ encounters different node types during traversal:
12+13+ {v
14+ Document
15+ └── html (start_element "html")
16+ ├── head (start_element "head")
17+ │ └── title (start_element "title")
18+ │ ├── #text "Page Title" (characters)
19+ │ └── (end_element "title")
20+ └── body (start_element "body")
21+ └── p (start_element "p")
22+ ├── #text "Hello " (characters)
23+ ├── b (start_element "b")
24+ │ ├── #text "world" (characters)
25+ │ └── (end_element "b")
26+ ├── #text "!" (characters)
27+ └── (end_element "p")
28+ end_document
29+ v}
30+31+ {2 Event Sequence}
32+33+ For each element node:
34+ 1. {!Checker.S.start_element} is called when entering the element
35+ 2. Children are recursively traversed
36+ 3. {!Checker.S.end_element} is called when exiting the element
37+38+ For text and comment nodes:
39+ - {!Checker.S.characters} is called with the text content
40+41+ After the entire tree is traversed:
42+ - {!Checker.S.end_document} is called on all checkers
43+44+ {2 Checker Coordination}
45+46+ When multiple checkers are used:
47+ - All checkers receive the same event sequence
48+ - Events are delivered to checkers in the order they appear in the list
49+ - Each checker maintains independent state
50+ - Messages from all checkers are collected together
51+52+ This allows composing orthogonal validation rules without interference.
53+54+ {2 Usage Examples}
55+56+ {b Single checker:}
57+ {[
58+ let checker = Checker.noop () in
59+ let collector = Message_collector.create () in
60+ walk checker collector dom;
61+ let messages = Message_collector.messages collector in
62+ List.iter Message.pp messages
63+ ]}
64+65+ {b Multiple checkers:}
66+ {[
67+ let checkers = [checker1; checker2; checker3] in
68+ let collector = Message_collector.create () in
69+ walk_all checkers collector dom;
70+ (* Analyze messages from all checkers *)
71+ ]}
72+73+ {b Registry of checkers:}
74+ {[
75+ let registry = Checker_registry.default () in
76+ let collector = Message_collector.create () in
77+ walk_registry registry collector dom;
78+ (* All registered checkers have validated the DOM *)
79+ ]} *)
80+81+(** {1 Single Checker Traversal} *)
82+83+val walk : Checker.t -> Message_collector.t -> Html5rw.Dom.node -> unit
84+(** [walk checker collector node] traverses a DOM tree with a single checker.
85+86+ @param checker The checker to apply during traversal
87+ @param collector The message collector for validation messages
88+ @param node The root node to start traversal from
89+90+ The traversal is depth-first and in-order: for each element, the
91+ checker receives a {!Checker.S.start_element} event, then children
92+ are recursively traversed, then an {!Checker.S.end_element} event
93+ is emitted.
94+95+ After the entire tree is traversed, {!Checker.S.end_document} is
96+ called to allow the checker to emit any final validation messages.
97+98+ {b Example:}
99+ {[
100+ (* Validate a parsed HTML document *)
101+ let checker = Checker.noop () in
102+ let collector = Message_collector.create () in
103+ walk checker collector document_node;
104+105+ (* Check for errors *)
106+ let messages = Message_collector.messages collector in
107+ let errors = List.filter
108+ (fun msg -> msg.Message.severity = Message.Error)
109+ messages in
110+ if errors <> [] then
111+ Printf.printf "Found %d errors\n" (List.length errors)
112+ ]}
113+114+ {b Notes:}
115+ - Only element nodes trigger start/end events
116+ - Text and comment nodes trigger character events
117+ - Document and doctype nodes are silently skipped
118+ - The traversal follows document order (parent before children,
119+ earlier siblings before later ones) *)
120+121+(** {1 Multiple Checker Traversal} *)
122+123+val walk_all :
124+ Checker.t list -> Message_collector.t -> Html5rw.Dom.node -> unit
125+(** [walk_all checkers collector node] traverses a DOM tree with multiple
126+ checkers.
127+128+ @param checkers List of checkers to apply during traversal
129+ @param collector The message collector for validation messages
130+ @param node The root node to start traversal from
131+132+ This performs a single tree traversal, delivering each event to all
133+ checkers in sequence. This is more efficient than calling {!walk}
134+ multiple times.
135+136+ All checkers receive events in the order they appear in the list.
137+ Each checker maintains independent state, so validation rules can
138+ be composed without interference.
139+140+ {b Example:}
141+ {[
142+ (* Run multiple validation passes in one traversal *)
143+ let structure_checker = (module StructureChecker : Checker.S) in
144+ let attribute_checker = (module AttributeChecker : Checker.S) in
145+ let obsolete_checker = (module ObsoleteChecker : Checker.S) in
146+147+ let checkers = [structure_checker; attribute_checker; obsolete_checker] in
148+ let collector = Message_collector.create () in
149+150+ walk_all checkers collector document_node;
151+152+ (* All three checkers have validated the document *)
153+ let messages = Message_collector.messages collector in
154+ Message_format.print_messages messages
155+ ]}
156+157+ {b Empty list behavior:}
158+ If the checkers list is empty, the tree is traversed but no validation
159+ is performed. This is equivalent to calling [walk (Checker.noop ()) ...]. *)
160+161+(** {1 Registry-Based Traversal} *)
162+163+val walk_registry :
164+ Checker_registry.t -> Message_collector.t -> Html5rw.Dom.node -> unit
165+(** [walk_registry registry collector node] traverses a DOM tree with all
166+ checkers from a registry.
167+168+ @param registry The registry containing checkers to apply
169+ @param collector The message collector for validation messages
170+ @param node The root node to start traversal from
171+172+ This is equivalent to:
173+ {[
174+ let checkers = Checker_registry.all registry in
175+ walk_all checkers collector node
176+ ]}
177+178+ Use this when you want to run a pre-configured set of checkers
179+ without manually extracting them from the registry.
180+181+ {b Example:}
182+ {[
183+ (* Set up registry with desired checkers *)
184+ let registry = Checker_registry.default () in
185+ Checker_registry.register registry "custom" my_checker;
186+187+ (* Validate multiple documents with same checker set *)
188+ List.iter (fun doc ->
189+ let collector = Message_collector.create () in
190+ walk_registry registry collector doc;
191+ report_results collector
192+ ) documents
193+ ]}
194+195+ {b Empty registry behavior:}
196+ If the registry is empty, the tree is traversed but no validation
197+ is performed. *)
···1+(*---------------------------------------------------------------------------
2+ Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
3+ SPDX-License-Identifier: MIT
4+ ---------------------------------------------------------------------------*)
5+6+(** HTML5 conformance checker.
7+8+ This module provides HTML5 validation and conformance checking,
9+ combining parse error detection with structural validation rules. *)
10+11+(** {1 Re-exported modules} *)
12+13+(** Validation message types and constructors. *)
14+module Message = Message
15+16+(** Message collection utilities. *)
17+module Message_collector = Message_collector
18+19+(** Message output formatters. *)
20+module Message_format = Message_format
21+22+(** Parse error bridge. *)
23+module Parse_error_bridge = Parse_error_bridge
24+25+(** {2 Content Model Framework} *)
26+27+(** HTML5 content categories. *)
28+module Content_category = Content_category
29+30+(** HTML5 element content models. *)
31+module Content_model = Content_model
32+33+(** HTML5 attribute specifications. *)
34+module Attr_spec = Attr_spec
35+36+(** HTML5 element specifications. *)
37+module Element_spec = Element_spec
38+39+(** {1 Core Types} *)
40+41+(** Result of checking an HTML document. *)
42+type t
43+44+(** {1 Checking Functions} *)
45+46+(** Parse and validate HTML from a reader.
47+48+ This function parses the HTML input and optionally collects parse errors.
49+ Future versions will also run conformance checkers on the resulting DOM.
50+51+ @param collect_parse_errors If true, collect and include parse errors. Default: true.
52+ @param system_id Optional file path or URL for error reporting.
53+ @param reader Bytesrw reader containing HTML input. *)
54+val check :
55+ ?collect_parse_errors:bool ->
56+ ?system_id:string ->
57+ Bytesrw.Bytes.Reader.t ->
58+ t
59+60+(** Validate an already-parsed HTML document.
61+62+ This function takes an existing Html5rw.t parse result and validates it.
63+64+ @param collect_parse_errors If true, collect and include parse errors from the result. Default: true.
65+ @param system_id Optional file path or URL for error reporting.
66+ @param result Already-parsed HTML document. *)
67+val check_dom :
68+ ?collect_parse_errors:bool ->
69+ ?system_id:string ->
70+ Html5rw.t ->
71+ t
72+73+(** {1 Result Accessors} *)
74+75+(** Get all validation messages. *)
76+val messages : t -> Message.t list
77+78+(** Get only error messages. *)
79+val errors : t -> Message.t list
80+81+(** Get only warning messages. *)
82+val warnings : t -> Message.t list
83+84+(** Check if there are any errors. *)
85+val has_errors : t -> bool
86+87+(** Get the underlying parsed document. *)
88+val document : t -> Html5rw.t
89+90+(** Get the system identifier if set. *)
91+val system_id : t -> string option
92+93+(** {1 Formatting} *)
94+95+(** Format messages as human-readable text. *)
96+val format_text : t -> string
97+98+(** Format messages as JSON. *)
99+val format_json : t -> string
100+101+(** Format messages in GNU style. *)
102+val format_gnu : t -> string
···1+(** Message collector for accumulating validation messages. *)
2+3+(** The type of a message collector. *)
4+type t
5+6+(** {1 Creation} *)
7+8+(** Create a new empty message collector. *)
9+val create : unit -> t
10+11+(** {1 Adding Messages} *)
12+13+(** Add a message to the collector. *)
14+val add : t -> Message.t -> unit
15+16+(** Add an error message to the collector. *)
17+val add_error :
18+ t ->
19+ message:string ->
20+ ?code:string ->
21+ ?location:Message.location ->
22+ ?element:string ->
23+ ?attribute:string ->
24+ ?extract:string ->
25+ unit ->
26+ unit
27+28+(** Add a warning message to the collector. *)
29+val add_warning :
30+ t ->
31+ message:string ->
32+ ?code:string ->
33+ ?location:Message.location ->
34+ ?element:string ->
35+ ?attribute:string ->
36+ ?extract:string ->
37+ unit ->
38+ unit
39+40+(** {1 Retrieving Messages} *)
41+42+(** Get all messages in the order they were added. *)
43+val messages : t -> Message.t list
44+45+(** Get only error messages. *)
46+val errors : t -> Message.t list
47+48+(** Get only warning messages. *)
49+val warnings : t -> Message.t list
50+51+(** {1 Status Queries} *)
52+53+(** Check if the collector contains any error messages. *)
54+val has_errors : t -> bool
55+56+(** Get the total number of messages. *)
57+val count : t -> int
58+59+(** Get the number of error messages. *)
60+val error_count : t -> int
61+62+(** {1 Modification} *)
63+64+(** Clear all messages from the collector. *)
65+val clear : t -> unit
···1+let format_text ?system_id messages =
2+ let buf = Buffer.create 1024 in
3+ List.iter
4+ (fun msg ->
5+ let loc_str =
6+ match msg.Message.location with
7+ | Some loc -> (
8+ let sid =
9+ match loc.Message.system_id with
10+ | Some s -> s
11+ | None -> (
12+ match system_id with Some s -> s | None -> "input")
13+ in
14+ let col_info =
15+ match (loc.end_line, loc.end_column) with
16+ | Some el, Some ec when el = loc.line && ec > loc.column ->
17+ Printf.sprintf "%d.%d-%d" loc.line loc.column ec
18+ | Some el, Some ec when el > loc.line ->
19+ Printf.sprintf "%d.%d-%d.%d" loc.line loc.column el ec
20+ | _ -> Printf.sprintf "%d.%d" loc.line loc.column
21+ in
22+ Printf.sprintf "%s:%s" sid col_info)
23+ | None -> (
24+ match system_id with Some s -> s | None -> "input")
25+ in
26+ let severity_str = Message.severity_to_string msg.Message.severity in
27+ let code_str =
28+ match msg.Message.code with Some c -> " [" ^ c ^ "]" | None -> ""
29+ in
30+ let elem_str =
31+ match msg.Message.element with
32+ | Some e -> " (element: " ^ e ^ ")"
33+ | None -> ""
34+ in
35+ let attr_str =
36+ match msg.Message.attribute with
37+ | Some a -> " (attribute: " ^ a ^ ")"
38+ | None -> ""
39+ in
40+ Buffer.add_string buf
41+ (Printf.sprintf "%s: %s%s: %s%s%s\n" loc_str severity_str code_str
42+ msg.Message.message elem_str attr_str))
43+ messages;
44+ Buffer.contents buf
45+46+let format_gnu ?system_id messages =
47+ let buf = Buffer.create 1024 in
48+ List.iter
49+ (fun msg ->
50+ let loc_str =
51+ match msg.Message.location with
52+ | Some loc -> (
53+ let sid =
54+ match loc.Message.system_id with
55+ | Some s -> s
56+ | None -> (
57+ match system_id with Some s -> s | None -> "input")
58+ in
59+ Printf.sprintf "%s:%d:%d" sid loc.line loc.column)
60+ | None -> (
61+ match system_id with Some s -> s ^ ":0:0" | None -> "input:0:0")
62+ in
63+ let severity_str = Message.severity_to_string msg.Message.severity in
64+ let code_str =
65+ match msg.Message.code with Some c -> " [" ^ c ^ "]" | None -> ""
66+ in
67+ Buffer.add_string buf
68+ (Printf.sprintf "%s: %s%s: %s\n" loc_str severity_str code_str
69+ msg.Message.message))
70+ messages;
71+ Buffer.contents buf
72+73+let message_to_json ?system_id msg =
74+ let open Jsont in
75+ let severity = String (Message.severity_to_string msg.Message.severity, Meta.none) in
76+ let message_text = String (msg.Message.message, Meta.none) in
77+ let base = [ (("type", Meta.none), severity); (("message", Meta.none), message_text) ] in
78+ let with_code =
79+ match msg.Message.code with
80+ | Some c -> (("subType", Meta.none), String (c, Meta.none)) :: base
81+ | None -> base
82+ in
83+ let with_location =
84+ match msg.Message.location with
85+ | Some loc ->
86+ let line = Number (float_of_int loc.Message.line, Meta.none) in
87+ let column = Number (float_of_int loc.Message.column, Meta.none) in
88+ let loc_fields =
89+ [ (("firstLine", Meta.none), line); (("firstColumn", Meta.none), column) ]
90+ in
91+ let loc_fields =
92+ match loc.Message.end_line with
93+ | Some el ->
94+ (("lastLine", Meta.none), Number (float_of_int el, Meta.none)) :: loc_fields
95+ | None -> loc_fields
96+ in
97+ let loc_fields =
98+ match loc.Message.end_column with
99+ | Some ec ->
100+ (("lastColumn", Meta.none), Number (float_of_int ec, Meta.none))
101+ :: loc_fields
102+ | None -> loc_fields
103+ in
104+ let url =
105+ match loc.Message.system_id with
106+ | Some s -> s
107+ | None -> (
108+ match system_id with Some s -> s | None -> "input")
109+ in
110+ (("url", Meta.none), String (url, Meta.none)) :: loc_fields @ with_code
111+ | None ->
112+ let url =
113+ match system_id with Some s -> s | None -> "input"
114+ in
115+ (("url", Meta.none), String (url, Meta.none)) :: with_code
116+ in
117+ let with_extract =
118+ match msg.Message.extract with
119+ | Some e -> (("extract", Meta.none), String (e, Meta.none)) :: with_location
120+ | None -> with_location
121+ in
122+ Object (with_extract, Meta.none)
123+124+let format_json ?system_id messages =
125+ let open Jsont in
126+ let msg_array = Array (List.map (message_to_json ?system_id) messages, Meta.none) in
127+ let obj = Object ([ (("messages", Meta.none), msg_array) ], Meta.none) in
128+ match Jsont_bytesrw.encode_string ~format:Minify json obj with
129+ | Ok s -> s
130+ | Error e -> failwith ("JSON encoding error: " ^ e)
+28
lib/html5_checker/message_format.mli
···0000000000000000000000000000
···1+(** Message output formatters.
2+3+ This module provides various output formats for validation messages,
4+ including text, JSON, and GNU-style formats for IDE integration. *)
5+6+(** {1 Formatters} *)
7+8+(** Format messages as human-readable text.
9+10+ Output format: [file:line:col: severity: message]
11+12+ @param system_id Optional default system identifier for messages without location. *)
13+val format_text : ?system_id:string -> Message.t list -> string
14+15+(** Format messages as JSON.
16+17+ Produces output compatible with the Nu HTML Validator JSON format.
18+19+ @param system_id Optional default system identifier for messages without location. *)
20+val format_json : ?system_id:string -> Message.t list -> string
21+22+(** Format messages in GNU style for IDE integration.
23+24+ Output format follows GNU conventions for error messages, compatible
25+ with most IDEs and editors.
26+27+ @param system_id Optional default system identifier for messages without location. *)
28+val format_gnu : ?system_id:string -> Message.t list -> string
+22
lib/html5_checker/parse_error_bridge.ml
···0000000000000000000000
···1+(*---------------------------------------------------------------------------
2+ Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
3+ SPDX-License-Identifier: MIT
4+ ---------------------------------------------------------------------------*)
5+6+let of_parse_error ?system_id err =
7+ let code = Html5rw.error_code err in
8+ let line = Html5rw.error_line err in
9+ let column = Html5rw.error_column err in
10+ let location =
11+ Message.make_location ~line ~column ?system_id ()
12+ in
13+ let code_str = Html5rw.Parse_error_code.to_string code in
14+ Message.error
15+ ~message:(Printf.sprintf "Parse error: %s" code_str)
16+ ~code:code_str
17+ ~location
18+ ()
19+20+let collect_parse_errors ?system_id result =
21+ let errors = Html5rw.errors result in
22+ List.map (of_parse_error ?system_id) errors
+25
lib/html5_checker/parse_error_bridge.mli
···0000000000000000000000000
···1+(*---------------------------------------------------------------------------
2+ Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
3+ SPDX-License-Identifier: MIT
4+ ---------------------------------------------------------------------------*)
5+6+(** Bridge between Html5rw parse errors and validation messages.
7+8+ This module converts parse errors from the Html5rw parser into
9+ standardized validation messages. *)
10+11+(** Convert a parse error to a validation message.
12+13+ Extracts error code, line, column, and creates a Message.t with
14+ severity set to Error.
15+16+ @param system_id Optional file path or URL to include in location *)
17+val of_parse_error : ?system_id:string -> Html5rw.parse_error -> Message.t
18+19+(** Collect all parse errors from a parse result.
20+21+ Extracts all parse errors from the Html5rw.t result and converts
22+ them to validation messages.
23+24+ @param system_id Optional file path or URL to include in locations *)
25+val collect_parse_errors : ?system_id:string -> Html5rw.t -> Message.t list
···1+(** Form-related validation checker.
2+3+ Validates form control associations, label references, and form structure
4+ according to HTML5 accessibility and semantic requirements. This checker
5+ ensures that:
6+7+ - Form controls have proper labels
8+ - Label associations are valid
9+ - Form attributes are correctly configured
10+ - Form structure follows HTML5 constraints
11+12+ {2 Validation Rules}
13+14+ {b Label Associations}
15+ - [label] with [for] attribute should reference a labelable element ID
16+ - [input\[type=radio\]] should have an associated visible label
17+ - [input\[type=checkbox\]] should have an associated visible label
18+19+ {b Form Control Validation}
20+ - [button\[type=submit\]] should be inside a [form] or have [form] attribute
21+ - [input\[type=image\]] requires [alt] attribute (validated by required_attr_checker)
22+ - [input\[type=hidden\]] should not have [required] attribute (validated by required_attr_checker)
23+ - [input\[type=file\]] should not have [value] attribute (validated by required_attr_checker)
24+25+ {b Autocomplete}
26+ - [autocomplete] values should be appropriate for the [input] type
27+ - Common autocomplete values include: [on], [off], [name], [email],
28+ [username], [current-password], [new-password], [street-address], etc.
29+30+ {b Select Elements}
31+ - [select\[multiple\]] should not have [size="1"] (contradictory)
32+ - [select] should contain at least one [option] or [optgroup]
33+34+ {b Accessibility}
35+ - Form controls should be reachable and operable via keyboard
36+ - Radio buttons with the same [name] should form a logical group
37+38+ {3 Labelable Elements}
39+40+ The following elements can be associated with a [label]:
41+ - [button]
42+ - [input] (except [type=hidden])
43+ - [meter]
44+ - [output]
45+ - [progress]
46+ - [select]
47+ - [textarea]
48+49+ @see <https://html.spec.whatwg.org/multipage/forms.html> WHATWG HTML: Forms
50+ @see <https://www.w3.org/WAI/WCAG21/Understanding/labels-or-instructions.html>
51+ WCAG: Labels or Instructions *)
52+53+include Checker.S
54+55+val checker : Checker.t
56+(** A first-class module instance of this checker.
57+58+ {b Usage:}
59+ {[
60+ let checker = Form_checker.checker in
61+ Checker_registry.register "form-validation" checker
62+ ]} *)
···1+(** ID uniqueness and reference checker.
2+3+ This checker validates that:
4+ - ID attributes are unique within the document
5+ - ID references point to existing IDs
6+ - ID values conform to HTML5 requirements *)
7+8+(** Location information for ID occurrences. *)
9+type id_location = {
10+ element : string;
11+ location : Message.location option;
12+}
13+14+(** Information about an ID reference. *)
15+type id_reference = {
16+ referring_element : string;
17+ attribute : string;
18+ referenced_id : string;
19+ location : Message.location option;
20+}
21+22+(** Checker state tracking IDs and references. *)
23+type state = {
24+ ids : (string, id_location) Hashtbl.t;
25+ mutable references : id_reference list;
26+}
27+28+let create () =
29+ {
30+ ids = Hashtbl.create 64;
31+ references = [];
32+ }
33+34+let reset state =
35+ Hashtbl.clear state.ids;
36+ state.references <- []
37+38+(** Check if a string contains whitespace. *)
39+let contains_whitespace s =
40+ String.contains s ' ' || String.contains s '\t' ||
41+ String.contains s '\n' || String.contains s '\r'
42+43+(** Extract ID from a usemap value (removes leading #). *)
44+let extract_usemap_id value =
45+ if String.length value > 0 && value.[0] = '#' then
46+ Some (String.sub value 1 (String.length value - 1))
47+ else
48+ None
49+50+(** Split whitespace-separated ID references. *)
51+let split_ids value =
52+ let rec split acc start i =
53+ if i >= String.length value then
54+ if i > start then
55+ (String.sub value start (i - start)) :: acc
56+ else
57+ acc
58+ else
59+ match value.[i] with
60+ | ' ' | '\t' | '\n' | '\r' ->
61+ let acc' =
62+ if i > start then
63+ (String.sub value start (i - start)) :: acc
64+ else
65+ acc
66+ in
67+ split acc' (i + 1) (i + 1)
68+ | _ ->
69+ split acc start (i + 1)
70+ in
71+ List.rev (split [] 0 0)
72+73+(** Attributes that reference a single ID. *)
74+let single_id_ref_attrs = [
75+ "for"; (* label *)
76+ "form"; (* form-associated elements *)
77+ "list"; (* input *)
78+ "aria-activedescendant";
79+]
80+81+(** Attributes that reference multiple IDs (space-separated). *)
82+let multi_id_ref_attrs = [
83+ "headers"; (* td, th *)
84+ "aria-labelledby";
85+ "aria-describedby";
86+ "aria-controls";
87+ "aria-flowto";
88+ "aria-owns";
89+ "itemref";
90+]
91+92+(** Check and store an ID attribute. *)
93+let check_id state ~element ~id ~location collector =
94+ (* Check for empty ID *)
95+ if String.length id = 0 then
96+ Message_collector.add_error collector
97+ ~message:"ID attribute must not be empty"
98+ ~code:"empty-id"
99+ ?location
100+ ~element
101+ ~attribute:"id"
102+ ()
103+ (* Check for whitespace in ID *)
104+ else if contains_whitespace id then
105+ Message_collector.add_error collector
106+ ~message:(Printf.sprintf "ID attribute value '%s' must not contain whitespace" id)
107+ ~code:"id-whitespace"
108+ ?location
109+ ~element
110+ ~attribute:"id"
111+ ()
112+ (* Check for duplicate ID *)
113+ else if Hashtbl.mem state.ids id then
114+ let first_occurrence = Hashtbl.find state.ids id in
115+ let first_loc_str = match first_occurrence.location with
116+ | None -> ""
117+ | Some loc -> Printf.sprintf " at line %d, column %d" loc.line loc.column
118+ in
119+ Message_collector.add_error collector
120+ ~message:(Printf.sprintf
121+ "Duplicate ID '%s': first used on <%s>%s, now on <%s>"
122+ id first_occurrence.element first_loc_str element)
123+ ~code:"duplicate-id"
124+ ?location
125+ ~element
126+ ~attribute:"id"
127+ ()
128+ else
129+ (* Store the ID *)
130+ Hashtbl.add state.ids id { element; location }
131+132+(** Record a single ID reference. *)
133+let add_reference state ~referring_element ~attribute ~referenced_id ~location =
134+ if String.length referenced_id > 0 then
135+ state.references <- {
136+ referring_element;
137+ attribute;
138+ referenced_id;
139+ location;
140+ } :: state.references
141+142+(** Process attributes to check IDs and collect references. *)
143+let process_attrs state ~element ~attrs ~location collector =
144+ List.iter (fun (name, value) ->
145+ match name with
146+ | "id" ->
147+ check_id state ~element ~id:value ~location collector
148+149+ | "usemap" ->
150+ (* usemap references a map name, which is like an ID reference *)
151+ begin match extract_usemap_id value with
152+ | Some id ->
153+ add_reference state ~referring_element:element
154+ ~attribute:name ~referenced_id:id ~location
155+ | None ->
156+ if String.length value > 0 then
157+ Message_collector.add_error collector
158+ ~message:(Printf.sprintf
159+ "usemap attribute value '%s' must start with '#'" value)
160+ ~code:"invalid-usemap"
161+ ?location
162+ ~element
163+ ~attribute:name
164+ ()
165+ end
166+167+ | attr when List.mem attr single_id_ref_attrs ->
168+ add_reference state ~referring_element:element
169+ ~attribute:attr ~referenced_id:value ~location
170+171+ | attr when List.mem attr multi_id_ref_attrs ->
172+ (* Split space-separated IDs and add each as a reference *)
173+ let ids = split_ids value in
174+ List.iter (fun id ->
175+ add_reference state ~referring_element:element
176+ ~attribute:attr ~referenced_id:id ~location
177+ ) ids
178+179+ | _ -> ()
180+ ) attrs
181+182+let start_element state ~name ~namespace:_ ~attrs collector =
183+ (* For now, we don't have location information from the DOM walker,
184+ so we pass None. In a full implementation, this would be passed
185+ from the parser. *)
186+ let location = None in
187+ process_attrs state ~element:name ~attrs ~location collector
188+189+let end_element _state ~name:_ ~namespace:_ _collector =
190+ ()
191+192+let characters _state _text _collector =
193+ ()
194+195+let end_document state collector =
196+ (* Check all references point to existing IDs *)
197+ List.iter (fun ref ->
198+ if not (Hashtbl.mem state.ids ref.referenced_id) then
199+ Message_collector.add_error collector
200+ ~message:(Printf.sprintf
201+ "The '%s' attribute on <%s> refers to ID '%s' which does not exist"
202+ ref.attribute ref.referring_element ref.referenced_id)
203+ ~code:"dangling-id-reference"
204+ ?location:ref.location
205+ ~element:ref.referring_element
206+ ~attribute:ref.attribute
207+ ()
208+ ) state.references
209+210+let checker = (module struct
211+ type nonrec state = state
212+213+ let create = create
214+ let reset = reset
215+ let start_element = start_element
216+ let end_element = end_element
217+ let characters = characters
218+ let end_document = end_document
219+end : Checker.S)
+11
lib/html5_checker/semantic/id_checker.mli
···00000000000
···1+(** ID uniqueness and reference checker.
2+3+ Validates:
4+ - ID attributes are unique within the document
5+ - ID references (for, headers, aria-*, etc.) point to existing IDs
6+ - ID values conform to HTML5 requirements *)
7+8+include Checker.S
9+10+val checker : Checker.t
11+(** [checker] is a checker instance for validating ID uniqueness and references. *)
···1+(** Interactive element nesting checker implementation. *)
2+3+(** Special ancestors that need tracking for nesting validation.
4+5+ This array defines the elements whose presence in the ancestor chain
6+ affects validation of descendant elements. The order is significant
7+ as it determines bit positions in the ancestor bitmask. *)
8+let special_ancestors =
9+ [| "a"; "address"; "body"; "button"; "caption"; "dfn"; "dt"; "figcaption";
10+ "figure"; "footer"; "form"; "header"; "label"; "map"; "noscript"; "th";
11+ "time"; "progress"; "meter"; "article"; "section"; "aside"; "nav"; "h1";
12+ "h2"; "h3"; "h4"; "h5"; "h6" |]
13+14+(** Get the bit position for a special ancestor element.
15+ Returns [-1] if the element is not a special ancestor. *)
16+let special_ancestor_number name =
17+ let rec find i =
18+ if i >= Array.length special_ancestors then -1
19+ else if special_ancestors.(i) = name then i
20+ else find (i + 1)
21+ in
22+ find 0
23+24+(** Interactive elements that cannot be nested inside [a] or [button]. *)
25+let interactive_elements =
26+ [| "a"; "button"; "details"; "embed"; "iframe"; "label"; "select";
27+ "textarea" |]
28+29+(** Map from descendant element name to bitmask of prohibited ancestors. *)
30+let ancestor_mask_by_descendant : (string, int) Hashtbl.t =
31+ Hashtbl.create 64
32+33+(** Register that [ancestor] is prohibited for [descendant]. *)
34+let register_prohibited_ancestor ancestor descendant =
35+ let number = special_ancestor_number ancestor in
36+ if number = -1 then
37+ failwith ("Ancestor not found in array: " ^ ancestor);
38+ let mask =
39+ match Hashtbl.find_opt ancestor_mask_by_descendant descendant with
40+ | None -> 0
41+ | Some m -> m
42+ in
43+ let new_mask = mask lor (1 lsl number) in
44+ Hashtbl.replace ancestor_mask_by_descendant descendant new_mask
45+46+(** Initialize the prohibited ancestor map. *)
47+let () =
48+ (* Self-nesting restrictions *)
49+ register_prohibited_ancestor "form" "form";
50+ register_prohibited_ancestor "progress" "progress";
51+ register_prohibited_ancestor "meter" "meter";
52+ register_prohibited_ancestor "dfn" "dfn";
53+ register_prohibited_ancestor "noscript" "noscript";
54+ register_prohibited_ancestor "label" "label";
55+56+ (* Address restrictions *)
57+ register_prohibited_ancestor "address" "address";
58+ register_prohibited_ancestor "address" "section";
59+ register_prohibited_ancestor "address" "nav";
60+ register_prohibited_ancestor "address" "article";
61+ register_prohibited_ancestor "address" "header";
62+ register_prohibited_ancestor "address" "footer";
63+ register_prohibited_ancestor "address" "h1";
64+ register_prohibited_ancestor "address" "h2";
65+ register_prohibited_ancestor "address" "h3";
66+ register_prohibited_ancestor "address" "h4";
67+ register_prohibited_ancestor "address" "h5";
68+ register_prohibited_ancestor "address" "h6";
69+70+ (* Header/footer restrictions *)
71+ register_prohibited_ancestor "header" "header";
72+ register_prohibited_ancestor "footer" "header";
73+ register_prohibited_ancestor "header" "footer";
74+ register_prohibited_ancestor "footer" "footer";
75+76+ (* dt restrictions *)
77+ register_prohibited_ancestor "dt" "header";
78+ register_prohibited_ancestor "dt" "footer";
79+ register_prohibited_ancestor "dt" "article";
80+ register_prohibited_ancestor "dt" "nav";
81+ register_prohibited_ancestor "dt" "section";
82+ register_prohibited_ancestor "dt" "h1";
83+ register_prohibited_ancestor "dt" "h2";
84+ register_prohibited_ancestor "dt" "h3";
85+ register_prohibited_ancestor "dt" "h4";
86+ register_prohibited_ancestor "dt" "h5";
87+ register_prohibited_ancestor "dt" "h6";
88+ register_prohibited_ancestor "dt" "hgroup";
89+90+ (* th restrictions *)
91+ register_prohibited_ancestor "th" "header";
92+ register_prohibited_ancestor "th" "footer";
93+ register_prohibited_ancestor "th" "article";
94+ register_prohibited_ancestor "th" "nav";
95+ register_prohibited_ancestor "th" "section";
96+ register_prohibited_ancestor "th" "h1";
97+ register_prohibited_ancestor "th" "h2";
98+ register_prohibited_ancestor "th" "h3";
99+ register_prohibited_ancestor "th" "h4";
100+ register_prohibited_ancestor "th" "h5";
101+ register_prohibited_ancestor "th" "h6";
102+ register_prohibited_ancestor "th" "hgroup";
103+104+ (* Caption restriction *)
105+ register_prohibited_ancestor "caption" "table";
106+107+ (* Interactive element restrictions: cannot be inside a or button *)
108+ Array.iter (fun elem ->
109+ register_prohibited_ancestor "a" elem;
110+ register_prohibited_ancestor "button" elem
111+ ) interactive_elements
112+113+(** Bitmask constants for common checks. *)
114+let a_button_mask =
115+ let a_num = special_ancestor_number "a" in
116+ let button_num = special_ancestor_number "button" in
117+ (1 lsl a_num) lor (1 lsl button_num)
118+119+let map_mask =
120+ let map_num = special_ancestor_number "map" in
121+ 1 lsl map_num
122+123+(** Stack node representing an element's context. *)
124+type stack_node = {
125+ ancestor_mask : int;
126+ _name : string; [@warning "-69"]
127+}
128+129+(** Checker state. *)
130+type state = {
131+ mutable stack : stack_node list;
132+ mutable ancestor_mask : int;
133+}
134+135+let create () =
136+ { stack = []; ancestor_mask = 0 }
137+138+let reset state =
139+ state.stack <- [];
140+ state.ancestor_mask <- 0
141+142+(** Get attribute value by name from attribute list. *)
143+let get_attr attrs name =
144+ List.assoc_opt name attrs
145+146+(** Check if an attribute exists. *)
147+let has_attr attrs name =
148+ get_attr attrs name <> None
149+150+(** Check if element is interactive based on its attributes. *)
151+let is_interactive_element name attrs =
152+ match name with
153+ | "a" ->
154+ has_attr attrs "href"
155+ | "audio" | "video" ->
156+ has_attr attrs "controls"
157+ | "img" | "object" ->
158+ has_attr attrs "usemap"
159+ | "input" ->
160+ begin match get_attr attrs "type" with
161+ | Some "hidden" -> false
162+ | _ -> true
163+ end
164+ | "button" | "details" | "embed" | "iframe" | "label" | "select"
165+ | "textarea" ->
166+ true
167+ | _ ->
168+ false
169+170+(** Get a human-readable description of an element for error messages. *)
171+let element_description name attrs =
172+ match name with
173+ | "a" when has_attr attrs "href" ->
174+ "The element \"a\" with the attribute \"href\""
175+ | "audio" when has_attr attrs "controls" ->
176+ "The element \"audio\" with the attribute \"controls\""
177+ | "video" when has_attr attrs "controls" ->
178+ "The element \"video\" with the attribute \"controls\""
179+ | "img" when has_attr attrs "usemap" ->
180+ "The element \"img\" with the attribute \"usemap\""
181+ | "object" when has_attr attrs "usemap" ->
182+ "The element \"object\" with the attribute \"usemap\""
183+ | _ ->
184+ Printf.sprintf "The element \"%s\"" name
185+186+(** Report nesting violations. *)
187+let check_nesting state name attrs collector =
188+ (* Compute the prohibited ancestor mask for this element *)
189+ let base_mask =
190+ match Hashtbl.find_opt ancestor_mask_by_descendant name with
191+ | Some m -> m
192+ | None -> 0
193+ in
194+195+ (* Add interactive element restrictions if applicable *)
196+ let mask =
197+ if is_interactive_element name attrs then
198+ base_mask lor a_button_mask
199+ else
200+ base_mask
201+ in
202+203+ (* Check for violations *)
204+ if mask <> 0 then begin
205+ let mask_hit = state.ancestor_mask land mask in
206+ if mask_hit <> 0 then begin
207+ let desc = element_description name attrs in
208+ (* Find which ancestors are violated *)
209+ Array.iteri (fun i ancestor ->
210+ let bit = 1 lsl i in
211+ if (mask_hit land bit) <> 0 then
212+ Message_collector.add_error collector
213+ ~message:(Printf.sprintf
214+ "%s must not appear as a descendant of the \"%s\" element."
215+ desc ancestor)
216+ ~element:name
217+ ()
218+ ) special_ancestors
219+ end
220+ end
221+222+(** Check for required ancestors. *)
223+let check_required_ancestors state name collector =
224+ match name with
225+ | "area" ->
226+ if (state.ancestor_mask land map_mask) = 0 then
227+ Message_collector.add_error collector
228+ ~message:"The \"area\" element must have a \"map\" ancestor."
229+ ~element:name
230+ ()
231+ | _ -> ()
232+233+let start_element state ~name ~namespace ~attrs collector =
234+ (* Only check HTML elements, not SVG or MathML *)
235+ match namespace with
236+ | Some _ -> ()
237+ | None ->
238+ (* Check for nesting violations *)
239+ check_nesting state name attrs collector;
240+ check_required_ancestors state name collector;
241+242+ (* Update ancestor mask if this is a special ancestor *)
243+ let new_mask = state.ancestor_mask in
244+ let number = special_ancestor_number name in
245+ let new_mask =
246+ if number >= 0 then
247+ new_mask lor (1 lsl number)
248+ else
249+ new_mask
250+ in
251+252+ (* Add href tracking for <a> elements *)
253+ let new_mask =
254+ if name = "a" && has_attr attrs "href" then
255+ let a_num = special_ancestor_number "a" in
256+ new_mask lor (1 lsl a_num)
257+ else
258+ new_mask
259+ in
260+261+ (* Push onto stack *)
262+ let node = { ancestor_mask = state.ancestor_mask; _name = name } in
263+ state.stack <- node :: state.stack;
264+ state.ancestor_mask <- new_mask
265+266+let end_element state ~name:_ ~namespace _collector =
267+ (* Only track HTML elements *)
268+ match namespace with
269+ | Some _ -> ()
270+ | None ->
271+ (* Pop from stack and restore ancestor mask *)
272+ begin match state.stack with
273+ | [] -> () (* Should not happen in well-formed documents *)
274+ | node :: rest ->
275+ state.stack <- rest;
276+ state.ancestor_mask <- node.ancestor_mask
277+ end
278+279+let characters _state _text _collector =
280+ () (* No text-specific nesting checks *)
281+282+let end_document _state _collector =
283+ () (* No document-level checks needed *)
284+285+(** Create the checker as a first-class module. *)
286+let checker =
287+ let module M = struct
288+ type nonrec state = state
289+ let create = create
290+ let reset = reset
291+ let start_element = start_element
292+ let end_element = end_element
293+ let characters = characters
294+ let end_document = end_document
295+ end in
296+ (module M : Checker.S)
···1+(** Interactive element nesting checker.
2+3+ Validates that interactive elements are not nested in ways that violate
4+ HTML5 specifications. This checker tracks ancestor elements and ensures
5+ that prohibited nesting patterns are detected and reported.
6+7+ {2 Validation Rules}
8+9+ The checker enforces the following prohibited nesting relationships:
10+11+ {3 Self-nesting Restrictions}
12+13+ These elements cannot be nested inside themselves:
14+ - [form] cannot contain [form]
15+ - [progress] cannot contain [progress]
16+ - [meter] cannot contain [meter]
17+ - [dfn] cannot contain [dfn]
18+ - [noscript] cannot contain [noscript]
19+ - [label] cannot contain [label]
20+21+ {3 Structural Element Restrictions}
22+23+ - [header] cannot be inside [header], [footer], or [address]
24+ - [footer] cannot be inside [header], [footer], or [address]
25+ - [address] cannot contain [header], [footer], [article], [section],
26+ [nav], or heading elements ([h1]-[h6])
27+28+ {3 Interactive Content Restrictions}
29+30+ Interactive elements cannot be descendants of [a] (with [href]) or
31+ [button]:
32+33+ - [a] (when it has [href]) cannot be inside [a] or [button]
34+ - [button] cannot be inside [a] or [button]
35+ - [details] cannot be inside [a] or [button]
36+ - [embed] cannot be inside [a] or [button]
37+ - [iframe] cannot be inside [a] or [button]
38+ - [label] cannot be inside [a] or [button]
39+ - [select] cannot be inside [a] or [button]
40+ - [textarea] cannot be inside [a] or [button]
41+ - [audio] (with [controls]) cannot be inside [a] or [button]
42+ - [video] (with [controls]) cannot be inside [a] or [button]
43+ - [input] (except [type=hidden]) cannot be inside [a] or [button]
44+ - [img] (with [usemap]) cannot be inside [a] or [button]
45+ - [object] (with [usemap]) cannot be inside [a] or [button]
46+47+ {3 Table Cell Restrictions}
48+49+ - [dt] and [th] cannot contain [header], [footer], [article], [section],
50+ [nav], heading elements ([h1]-[h6]), or [hgroup]
51+52+ {3 Other Restrictions}
53+54+ - [caption] cannot contain [table]
55+ - [area] must have a [map] ancestor
56+57+ {2 Implementation Details}
58+59+ The checker uses a bitmask-based approach to efficiently track ancestor
60+ elements. Each "special" ancestor element has a corresponding bit in the
61+ ancestor mask. As elements are opened and closed during traversal, the
62+ mask is updated to reflect the current ancestor context.
63+64+ When an element is encountered, the checker:
65+ 1. Computes which ancestors would be prohibited for this element
66+ 2. Checks if any of those prohibited ancestors are present in the
67+ current ancestor mask
68+ 3. Reports errors for any violations found
69+ 4. Updates the ancestor mask to include the current element (if it's
70+ a special ancestor)
71+72+ @see <https://html.spec.whatwg.org/multipage/dom.html#content-models>
73+ HTML5 specification: Content models
74+*)
75+76+include Checker.S
77+78+val checker : Checker.t
79+(** [checker] is a checker instance for validating element nesting rules. *)
···1+(** Obsolete element and attribute checker.
2+3+ Reports errors for obsolete HTML elements and attributes that should
4+ not be used in modern HTML5 documents.
5+6+ This checker validates that documents do not use deprecated elements
7+ or attributes from earlier HTML versions. It reports:
8+9+ - {b Obsolete elements}: Elements like [<applet>], [<font>], [<center>]
10+ that have been removed from HTML5
11+ - {b Obsolete attributes}: Attributes like [align], [bgcolor], [border]
12+ that should be replaced with CSS
13+ - {b Obsolete global attributes}: Global attributes like [contextmenu]
14+ that are no longer supported
15+16+ {2 Obsolete Elements}
17+18+ Elements that are flagged as obsolete include:
19+ - Presentational elements: [<basefont>], [<big>], [<center>], [<font>],
20+ [<strike>], [<tt>]
21+ - Frame elements: [<frame>], [<frameset>], [<noframes>]
22+ - Deprecated interactive elements: [<applet>], [<bgsound>], [<keygen>]
23+ - Deprecated text elements: [<acronym>], [<dir>], [<listing>], [<xmp>]
24+ - And many others
25+26+ {2 Obsolete Attributes}
27+28+ The checker validates against hundreds of obsolete attributes, including:
29+ - Presentational attributes: [align], [bgcolor], [border], [color],
30+ [height], [width] (on certain elements)
31+ - Data binding attributes: [datafld], [dataformatas], [datasrc]
32+ - Navigation attributes: [longdesc], [methods], [urn]
33+ - And many element-specific obsolete attributes
34+35+ {2 Example}
36+37+ {[
38+ let checker = Obsolete_checker.checker in
39+ let module C = (val checker : Checker.S) in
40+ let state = C.create () in
41+42+ (* This will emit an error *)
43+ C.start_element state ~name:"center" ~namespace:None ~attrs:[] collector;
44+ (* Error: Element "center" is obsolete. Use CSS instead. *)
45+46+ (* This will also emit an error *)
47+ C.start_element state ~name:"div"
48+ ~namespace:None
49+ ~attrs:[("align", "center")]
50+ collector;
51+ (* Error: Attribute "align" on element "div" is obsolete. Use CSS instead. *)
52+ ]}
53+*)
54+55+(** Include the standard checker signature. *)
56+include Checker.S
57+58+(** {1 Checker Instance} *)
59+60+val checker : Checker.t
61+(** [checker] is a pre-configured obsolete checker instance that can be
62+ registered with the checker registry.
63+64+ {b Example:}
65+ {[
66+ Checker_registry.register registry "obsolete" Obsolete_checker.checker
67+ ]} *)
···1+(** Required attribute checker implementation. *)
2+3+type state = {
4+ mutable _in_figure : bool;
5+ (** Track if we're inside a <figure> element (alt is more critical there) *)
6+}
7+8+let create () = { _in_figure = false }
9+10+let reset state = state._in_figure <- false
11+12+(** Check if an attribute list contains a specific attribute. *)
13+let has_attr name attrs =
14+ List.exists (fun (attr_name, _) -> String.equal attr_name name) attrs
15+16+(** Get the value of an attribute if present. *)
17+let get_attr name attrs =
18+ List.find_map
19+ (fun (attr_name, value) ->
20+ if String.equal attr_name name then Some value else None)
21+ attrs
22+23+let check_img_element attrs collector =
24+ (* Check for required src attribute *)
25+ if not (has_attr "src" attrs) then
26+ Message_collector.add_error collector ~message:"img element requires src attribute"
27+ ~code:"missing-required-attribute" ~element:"img" ~attribute:"src" ();
28+29+ (* Check for alt attribute - always required *)
30+ if not (has_attr "alt" attrs) then
31+ Message_collector.add_error collector
32+ ~message:"img element requires alt attribute for accessibility"
33+ ~code:"missing-required-attribute" ~element:"img" ~attribute:"alt" ()
34+35+let check_area_element attrs collector =
36+ (* area with href requires alt *)
37+ if has_attr "href" attrs && not (has_attr "alt" attrs) then
38+ Message_collector.add_error collector
39+ ~message:"area element with href requires alt attribute" ~code:"missing-required-attribute"
40+ ~element:"area" ~attribute:"alt" ()
41+42+let check_input_element attrs collector =
43+ match get_attr "type" attrs with
44+ | Some "image" ->
45+ (* input[type=image] requires alt *)
46+ if not (has_attr "alt" attrs) then
47+ Message_collector.add_error collector
48+ ~message:"input element with type=\"image\" requires alt attribute"
49+ ~code:"missing-required-attribute" ~element:"input" ~attribute:"alt" ()
50+ | Some "hidden" ->
51+ (* input[type=hidden] should not have required attribute *)
52+ if has_attr "required" attrs then
53+ Message_collector.add_error collector
54+ ~message:"input element with type=\"hidden\" cannot have required attribute"
55+ ~code:"invalid-attribute-combination" ~element:"input" ~attribute:"required" ()
56+ | Some "file" ->
57+ (* input[type=file] should not have value attribute *)
58+ if has_attr "value" attrs then
59+ Message_collector.add_warning collector
60+ ~message:"input element with type=\"file\" should not have value attribute"
61+ ~code:"invalid-attribute-combination" ~element:"input" ~attribute:"value" ()
62+ | _ -> ()
63+64+let check_script_element attrs _collector =
65+ (* script requires src OR text content *)
66+ if not (has_attr "src" attrs) then
67+ (* We can't check for text content here; that would need to be done
68+ in end_element or with state tracking *)
69+ ()
70+71+let check_meta_element attrs collector =
72+ (* meta requires charset OR (name AND content) OR (http-equiv AND content) *)
73+ let has_charset = has_attr "charset" attrs in
74+ let has_name = has_attr "name" attrs in
75+ let has_content = has_attr "content" attrs in
76+ let has_http_equiv = has_attr "http-equiv" attrs in
77+78+ let valid =
79+ has_charset
80+ || (has_name && has_content)
81+ || (has_http_equiv && has_content)
82+ in
83+84+ if not valid then
85+ Message_collector.add_error collector
86+ ~message:
87+ "meta element requires either charset, or name+content, or http-equiv+content"
88+ ~code:"missing-required-attribute" ~element:"meta" ()
89+90+let check_link_element attrs collector =
91+ (* link[rel="stylesheet"] requires href *)
92+ match get_attr "rel" attrs with
93+ | Some rel when String.equal rel "stylesheet" ->
94+ if not (has_attr "href" attrs) then
95+ Message_collector.add_error collector
96+ ~message:"link element with rel=\"stylesheet\" requires href attribute"
97+ ~code:"missing-required-attribute" ~element:"link" ~attribute:"href" ()
98+ | _ -> ()
99+100+let check_a_element attrs collector =
101+ (* a[download] requires href *)
102+ if has_attr "download" attrs && not (has_attr "href" attrs) then
103+ Message_collector.add_error collector
104+ ~message:"a element with download attribute requires href attribute"
105+ ~code:"missing-required-attribute" ~element:"a" ~attribute:"href" ()
106+107+let check_map_element attrs collector =
108+ (* map requires name *)
109+ if not (has_attr "name" attrs) then
110+ Message_collector.add_error collector
111+ ~message:"map element requires name attribute" ~code:"missing-required-attribute"
112+ ~element:"map" ~attribute:"name" ()
113+114+let start_element state ~name ~namespace:_ ~attrs collector =
115+ match name with
116+ | "img" -> check_img_element attrs collector
117+ | "area" -> check_area_element attrs collector
118+ | "input" -> check_input_element attrs collector
119+ | "script" -> check_script_element attrs collector
120+ | "meta" -> check_meta_element attrs collector
121+ | "link" -> check_link_element attrs collector
122+ | "a" -> check_a_element attrs collector
123+ | "map" -> check_map_element attrs collector
124+ | "figure" -> state._in_figure <- true
125+ | _ -> ()
126+127+let end_element state ~name ~namespace:_ _collector =
128+ match name with "figure" -> state._in_figure <- false | _ -> ()
129+130+let characters _state _text _collector = ()
131+132+let end_document _state _collector = ()
133+134+let checker = (module struct
135+ type nonrec state = state
136+137+ let create = create
138+ let reset = reset
139+ let start_element = start_element
140+ let end_element = end_element
141+ let characters = characters
142+ let end_document = end_document
143+end : Checker.S)
···1+(** Required attribute checker.
2+3+ Validates that elements have their required attributes according to the
4+ HTML5 specification. This checker ensures that:
5+6+ - Elements have all mandatory attributes
7+ - Conditional attributes are present when required by context
8+ - Attributes that must appear together are all present
9+10+ {2 Validation Rules}
11+12+ The checker validates these common required attributes:
13+14+ {b Images and Media}
15+ - [img] requires [src] attribute
16+ - [img] requires [alt] attribute (error in most contexts, warning otherwise)
17+ - [area] with [href] requires [alt] attribute
18+ - [input\[type=image\]] requires [alt] attribute
19+20+ {b Forms}
21+ - [input] defaults to [type="text"] if [type] is omitted
22+ - [map] requires [name] attribute
23+24+ {b Scripts and Styles}
25+ - [script] requires either [src] attribute OR text content
26+ - [style] with [scoped] requires appropriate positioning
27+28+ {b Metadata}
29+ - [meta] requires one of:
30+ - [charset] attribute, OR
31+ - [name] and [content] attributes, OR
32+ - [http-equiv] and [content] attributes
33+ - [link\[rel="stylesheet"\]] requires [href] attribute
34+35+ {b Links}
36+ - [a] with [download] attribute requires [href] attribute
37+38+ @see <https://html.spec.whatwg.org/multipage/indices.html#attributes-3>
39+ WHATWG HTML: Attributes *)
40+41+include Checker.S
42+43+val checker : Checker.t
44+(** A first-class module instance of this checker.
45+46+ {b Usage:}
47+ {[
48+ let checker = Required_attr_checker.checker in
49+ Checker_registry.register "required-attributes" checker
50+ ]} *)
···1+(** ARIA (Accessible Rich Internet Applications) validation checker.
2+3+ Validates ARIA roles, required ancestor roles, implicit roles, and
4+ deprecated ARIA attributes according to the WAI-ARIA specification.
5+6+ {2 Validation Rules}
7+8+ The checker enforces the following ARIA validation rules:
9+10+ {3 Valid ARIA Roles}
11+12+ All valid WAI-ARIA 1.2 roles are recognized:
13+14+ - {b Document structure roles}: article, definition, directory, document,
15+ feed, figure, group, heading, img, list, listitem, math, none, note,
16+ presentation, region, separator, table, term, toolbar, tooltip
17+ - {b Widget roles}: button, checkbox, combobox, dialog, grid, gridcell,
18+ link, listbox, menu, menubar, menuitem, menuitemcheckbox,
19+ menuitemradio, option, progressbar, radio, radiogroup, scrollbar,
20+ slider, spinbutton, switch, tab, tablist, tabpanel, textbox, tree,
21+ treegrid, treeitem
22+ - {b Landmark roles}: banner, complementary, contentinfo, form, main,
23+ navigation, search
24+ - {b Live region roles}: alert, log, marquee, status, timer
25+ - {b Window roles}: alertdialog
26+ - {b Abstract roles} (not for use in content): command, composite, input,
27+ landmark, range, roletype, section, sectionhead, select, structure,
28+ widget, window
29+30+ {3 Required Ancestor Roles}
31+32+ Certain ARIA roles require specific ancestor roles:
33+34+ - [option] requires [listbox]
35+ - [menuitem], [menuitemcheckbox], [menuitemradio] require [menu] or
36+ [menubar]
37+ - [tab] requires [tablist]
38+ - [treeitem] requires [tree] or [group]
39+ - [listitem] requires [list] or [group]
40+ - [cell], [gridcell], [columnheader], [rowheader] require [row]
41+ - [row] requires [grid], [rowgroup], [table], or [treegrid]
42+ - [rowgroup] requires [grid], [table], or [treegrid]
43+44+ {3 Roles That Cannot Be Named}
45+46+ These roles must not have accessible names (via aria-label or
47+ aria-labelledby):
48+49+ - caption, code, deletion, emphasis, generic, insertion, paragraph,
50+ presentation, strong, subscript, superscript
51+52+ {3 Implicit ARIA Roles}
53+54+ HTML elements have implicit ARIA roles:
55+56+ - [<article>] has implicit role [article]
57+ - [<aside>] has implicit role [complementary]
58+ - [<button>] has implicit role [button]
59+ - [<dialog>] has implicit role [dialog]
60+ - [<footer>] has implicit role [contentinfo]
61+ - [<header>] has implicit role [banner]
62+ - [<main>] has implicit role [main]
63+ - [<nav>] has implicit role [navigation]
64+ - And many more...
65+66+ {3 Deprecated ARIA Attributes}
67+68+ Certain ARIA attributes are deprecated for specific roles:
69+70+ - [aria-disabled] is deprecated for: alert, article, banner, cell,
71+ document, feed, figure, heading, img, list, listitem, main, navigation,
72+ region, and many others
73+ - [aria-errormessage] is deprecated for: alert, article, banner, button,
74+ cell, document, link, and many others
75+ - [aria-haspopup] is deprecated for: alert, article, checkbox, listbox,
76+ and many others
77+ - [aria-invalid] is deprecated for: alert, article, button, cell,
78+ document, link, and many others
79+ - [aria-level] is deprecated for: listitem
80+81+ {2 Implementation Details}
82+83+ The checker maintains a stack of ancestor elements with their roles
84+ (explicit and implicit) to validate required ancestor relationships.
85+ When an element with a role attribute is encountered, the checker:
86+87+ 1. Parses the role attribute value (space-separated tokens)
88+ 2. Validates each role against the list of valid roles
89+ 3. Checks if the role requires an ancestor role
90+ 4. Verifies the required ancestor is present in the ancestor stack
91+ 5. Checks for deprecated ARIA attributes on elements with specific roles
92+ 6. Validates that roles which cannot be named do not have aria-label or
93+ aria-labelledby attributes
94+95+ @see <https://www.w3.org/TR/wai-aria-1.2/>
96+ WAI-ARIA 1.2 specification
97+*)
98+99+include Checker.S
100+101+val checker : Checker.t
102+(** [checker] is a checker instance for validating ARIA roles and attributes. *)
···1+(** Heading structure validation checker.
2+3+ This checker validates that:
4+ - Heading levels don't skip (e.g., h1 to h3)
5+ - Documents have at least one heading
6+ - Multiple h1 usage is noted
7+ - Headings are not empty *)
8+9+(** Checker state tracking heading structure. *)
10+type state = {
11+ mutable current_level : int option;
12+ mutable h1_count : int;
13+ mutable has_any_heading : bool;
14+ mutable first_heading_checked : bool;
15+ mutable in_heading : string option;
16+ mutable heading_has_text : bool;
17+}
18+19+let create () =
20+ {
21+ current_level = None;
22+ h1_count = 0;
23+ has_any_heading = false;
24+ first_heading_checked = false;
25+ in_heading = None;
26+ heading_has_text = false;
27+ }
28+29+let reset state =
30+ state.current_level <- None;
31+ state.h1_count <- 0;
32+ state.has_any_heading <- false;
33+ state.first_heading_checked <- false;
34+ state.in_heading <- None;
35+ state.heading_has_text <- false
36+37+(** Extract heading level from tag name (e.g., "h1" -> 1). *)
38+let heading_level name =
39+ match String.lowercase_ascii name with
40+ | "h1" -> Some 1
41+ | "h2" -> Some 2
42+ | "h3" -> Some 3
43+ | "h4" -> Some 4
44+ | "h5" -> Some 5
45+ | "h6" -> Some 6
46+ | _ -> None
47+48+(** Check if text is effectively empty (only whitespace). *)
49+let is_empty_text text =
50+ let rec check i =
51+ if i >= String.length text then
52+ true
53+ else
54+ match text.[i] with
55+ | ' ' | '\t' | '\n' | '\r' -> check (i + 1)
56+ | _ -> false
57+ in
58+ check 0
59+60+let start_element state ~name ~namespace:_ ~attrs:_ collector =
61+ match heading_level name with
62+ | Some level ->
63+ state.has_any_heading <- true;
64+65+ (* Check if this is the first heading *)
66+ if not state.first_heading_checked then begin
67+ state.first_heading_checked <- true;
68+ if level <> 1 then
69+ Message_collector.add_warning collector
70+ ~message:(Printf.sprintf
71+ "First heading in document is <%s>, should typically be <h1>"
72+ name)
73+ ~code:"first-heading-not-h1"
74+ ~element:name
75+ ()
76+ end;
77+78+ (* Track h1 count *)
79+ if level = 1 then begin
80+ state.h1_count <- state.h1_count + 1;
81+ if state.h1_count > 1 then
82+ Message_collector.add_warning collector
83+ ~message:"Multiple <h1> elements detected. While valid in HTML5 sectioning content, traditional advice suggests one <h1> per page"
84+ ~code:"multiple-h1"
85+ ~element:name
86+ ()
87+ end;
88+89+ (* Check for skipped levels *)
90+ begin match state.current_level with
91+ | None ->
92+ state.current_level <- Some level
93+ | Some prev_level ->
94+ let diff = level - prev_level in
95+ if diff > 1 then
96+ Message_collector.add_warning collector
97+ ~message:(Printf.sprintf
98+ "Heading level skipped: <%s> follows <h%d>, skipping %d level%s. This can confuse screen reader users"
99+ name prev_level (diff - 1) (if diff > 2 then "s" else ""))
100+ ~code:"heading-level-skipped"
101+ ~element:name
102+ ();
103+ state.current_level <- Some level
104+ end;
105+106+ (* Track that we're in a heading to check for empty content *)
107+ state.in_heading <- Some name;
108+ state.heading_has_text <- false
109+110+ | None ->
111+ (* Not a heading element *)
112+ ()
113+114+let end_element state ~name ~namespace:_ collector =
115+ match state.in_heading with
116+ | Some heading when heading = name ->
117+ (* Exiting the heading we're tracking *)
118+ if not state.heading_has_text then
119+ Message_collector.add_error collector
120+ ~message:(Printf.sprintf
121+ "Heading <%s> is empty or contains only whitespace. Empty headings are problematic for screen readers"
122+ name)
123+ ~code:"empty-heading"
124+ ~element:name
125+ ();
126+ state.in_heading <- None;
127+ state.heading_has_text <- false
128+ | _ ->
129+ ()
130+131+let characters state text _collector =
132+ (* If we're inside a heading, check if this text is non-whitespace *)
133+ match state.in_heading with
134+ | Some _ ->
135+ if not (is_empty_text text) then
136+ state.heading_has_text <- true
137+ | None ->
138+ ()
139+140+let end_document state collector =
141+ (* Check if document has any headings *)
142+ if not state.has_any_heading then
143+ Message_collector.add_warning collector
144+ ~message:"Document contains no heading elements (h1-h6). Headings provide important document structure for accessibility"
145+ ~code:"no-headings"
146+ ()
147+148+let checker = (module struct
149+ type nonrec state = state
150+151+ let create = create
152+ let reset = reset
153+ let start_element = start_element
154+ let end_element = end_element
155+ let characters = characters
156+ let end_document = end_document
157+end : Checker.S)
+12
lib/html5_checker/specialized/heading_checker.mli
···000000000000
···1+(** Heading structure validation checker.
2+3+ Validates:
4+ - Proper heading level hierarchy (no skipped levels)
5+ - Document should have at least one heading
6+ - Multiple h1 usage patterns
7+ - Headings should not be empty *)
8+9+include Checker.S
10+11+val checker : Checker.t
12+(** [checker] is a checker instance for validating heading structure. *)
···1+(** Language attribute validation checker.
2+3+ Validates language attributes. *)
4+5+(** Checker state tracking language attributes. *)
6+type state = {
7+ mutable html_element_seen : bool;
8+ mutable html_has_lang : bool;
9+}
10+11+let create () =
12+ {
13+ html_element_seen = false;
14+ html_has_lang = false;
15+ }
16+17+let reset state =
18+ state.html_element_seen <- false;
19+ state.html_has_lang <- false
20+21+(** Get attribute value from attribute list. *)
22+let get_attr attrs name =
23+ try Some (List.assoc name attrs)
24+ with Not_found -> None
25+26+(** Validate language attribute. *)
27+let validate_lang_attr value ~location ~element collector =
28+ match Dt_language.Language_or_empty.validate value with
29+ | Ok () -> ()
30+ | Error msg ->
31+ Message_collector.add_error collector
32+ ~message:(Printf.sprintf "Invalid lang attribute: %s" msg)
33+ ~code:"invalid-lang"
34+ ?location
35+ ~element
36+ ~attribute:"lang"
37+ ()
38+39+(** Check if lang and xml:lang match. *)
40+let check_lang_xmllang_match ~lang ~xmllang ~location ~element collector =
41+ if lang <> xmllang then
42+ Message_collector.add_warning collector
43+ ~message:(Printf.sprintf
44+ "lang attribute '%s' does not match xml:lang attribute '%s'" lang xmllang)
45+ ~code:"lang-xmllang-mismatch"
46+ ?location
47+ ~element
48+ ()
49+50+(** Process language attributes. *)
51+let process_language_attrs state ~element ~namespace ~attrs ~location collector =
52+ let lang_opt = get_attr attrs "lang" in
53+ let xmllang_opt = get_attr attrs "xml:lang" in
54+55+ (* Check if this is the html element *)
56+ if element = "html" && namespace = None then begin
57+ state.html_element_seen <- true;
58+ state.html_has_lang <- lang_opt <> None
59+ end;
60+61+ (* Validate lang attribute *)
62+ begin match lang_opt with
63+ | Some lang ->
64+ validate_lang_attr lang ~location ~element collector
65+ | None -> ()
66+ end;
67+68+ (* Validate xml:lang attribute *)
69+ begin match xmllang_opt with
70+ | Some xmllang ->
71+ validate_lang_attr xmllang ~location ~element collector
72+ | None -> ()
73+ end;
74+75+ (* Check that lang and xml:lang match if both present *)
76+ begin match lang_opt, xmllang_opt with
77+ | Some lang, Some xmllang ->
78+ check_lang_xmllang_match ~lang ~xmllang ~location ~element collector
79+ | _ -> ()
80+ end
81+82+let start_element state ~name ~namespace ~attrs collector =
83+ let location = None in
84+ process_language_attrs state ~element:name ~namespace ~attrs ~location collector
85+86+let end_element _state ~name:_ ~namespace:_ _collector =
87+ ()
88+89+let characters _state _text _collector =
90+ ()
91+92+let end_document state collector =
93+ (* Warn if html element lacks lang attribute *)
94+ if state.html_element_seen && not state.html_has_lang then
95+ Message_collector.add_warning collector
96+ ~message:"The <html> element should have a lang attribute to specify \
97+ the document's primary language"
98+ ~code:"missing-lang-on-html"
99+ ~element:"html"
100+ ()
101+102+let checker = (module struct
103+ type nonrec state = state
104+105+ let create = create
106+ let reset = reset
107+ let start_element = start_element
108+ let end_element = end_element
109+ let characters = characters
110+ let end_document = end_document
111+end : Checker.S)
···1+(** Language attribute validation checker.
2+3+ Validates:
4+ - lang attribute values are valid BCP 47 tags
5+ - xml:lang matches lang when both present
6+ - Document has a lang attribute on root element
7+8+ This checker ensures proper language markup:
9+ - lang attribute values are validated using BCP 47 format
10+ - When both lang and xml:lang are present, they must match
11+ - Warning if <html> element lacks lang attribute
12+ - Empty lang="" is valid (indicates unknown language)
13+ - Primary language subtag should be valid *)
14+15+include Checker.S
16+17+val checker : Checker.t
18+(** [checker] is a checker instance for validating language attributes. *)
···1+(** Microdata validation checker.
2+3+ Validates HTML5 microdata (itemscope, itemtype, itemprop, itemid, itemref).
4+5+ This checker verifies that microdata attributes are used correctly:
6+ - itemprop can only appear on elements that are properties of an item
7+ (descendant of itemscope or referenced by itemref)
8+ - itemid requires both itemscope and itemtype
9+ - itemref requires itemscope
10+ - itemtype requires itemscope
11+ - itemref values must reference existing IDs
12+ - Detects itemref cycles (A references B, B references A)
13+ - itemprop values must be valid property names (no colons unless URL) *)
14+15+include Checker.S
16+17+val checker : Checker.t
18+(** [checker] is a checker instance for validating microdata. *)
···1+(** Table structure validation checker implementation.
2+3+ This module implements comprehensive table structure validation including
4+ cell overlap detection, span validation, and structural integrity checks. *)
5+6+(** HTML namespace constant *)
7+let html_ns = "http://www.w3.org/1999/xhtml"
8+9+(** Maximum allowed colspan value per HTML5 spec *)
10+let max_colspan = 1000
11+12+(** Maximum allowed rowspan value per HTML5 spec *)
13+let max_rowspan = 65534
14+15+(** Special rowspan value meaning "span to end of row group" *)
16+let rowspan_zero_magic = max_rowspan
17+18+(** {1 Cell Representation} *)
19+20+(** A table cell with positioning information *)
21+type cell = {
22+ mutable left : int;
23+ (** Column in which this cell starts (zero-indexed) *)
24+ mutable right : int;
25+ (** First column into which this cell does not span *)
26+ mutable bottom : int;
27+ (** First row onto which this cell does not span (or rowspan_zero_magic) *)
28+ headers : string list;
29+ (** IDs referenced by the headers attribute *)
30+ is_header : bool;
31+ (** Whether this is a th element *)
32+ element_name : string;
33+ (** "td" or "th" *)
34+}
35+36+(** Create a cell from colspan and rowspan values *)
37+let make_cell ~colspan ~rowspan ~headers ~is_header collector =
38+ let colspan =
39+ if colspan > max_colspan then (
40+ Message_collector.add_error collector
41+ ~message:
42+ (Printf.sprintf
43+ {|The value of the "colspan" attribute must be less than or equal to %d.|}
44+ max_colspan)
45+ ();
46+ max_colspan)
47+ else colspan
48+ in
49+ let rowspan =
50+ if rowspan > max_rowspan then (
51+ Message_collector.add_error collector
52+ ~message:
53+ (Printf.sprintf
54+ {|The value of the "rowspan" attribute must be less than or equal to %d.|}
55+ max_rowspan)
56+ ();
57+ max_rowspan)
58+ else rowspan
59+ in
60+ {
61+ left = 0;
62+ right = colspan;
63+ bottom = (if rowspan = 0 then rowspan_zero_magic else rowspan);
64+ headers;
65+ is_header;
66+ element_name = (if is_header then "th" else "td");
67+ }
68+69+(** Set the absolute position of a cell *)
70+let set_cell_position cell ~row ~col =
71+ cell.left <- col;
72+ cell.right <- cell.right + col;
73+ if cell.bottom <> rowspan_zero_magic then cell.bottom <- cell.bottom + row
74+75+(** Check if a cell should be removed from the active set *)
76+let should_cull_cell cell ~row = row >= cell.bottom
77+78+(** Check if two cells overlap horizontally *)
79+let cells_overlap_horizontally cell1 cell2 =
80+ not (cell2.right <= cell1.left || cell1.right <= cell2.left)
81+82+(** Emit error for horizontal cell overlap *)
83+let err_on_horizontal_overlap cell1 cell2 collector =
84+ if cells_overlap_horizontally cell1 cell2 then (
85+ Message_collector.add_error collector
86+ ~message:"Table cell is overlapped by later table cell." ();
87+ Message_collector.add_error collector
88+ ~message:"Table cell overlaps an earlier table cell." ())
89+90+(** Check if cell spans past end of row group *)
91+let err_if_not_rowspan_zero cell ~row_group_type collector =
92+ if cell.bottom <> rowspan_zero_magic then
93+ let group_desc =
94+ match row_group_type with
95+ | None -> "implicit row group"
96+ | Some t -> Printf.sprintf {|row group established by a "%s" element|} t
97+ in
98+ Message_collector.add_error collector
99+ ~message:
100+ (Printf.sprintf
101+ "Table cell spans past the end of its %s; clipped to the end of \
102+ the row group."
103+ group_desc)
104+ ()
105+106+(** {1 Column Range Tracking} *)
107+108+(** A contiguous range of columns without cells *)
109+type column_range = {
110+ element : string;
111+ (** Element that established this range (col/colgroup/td/th) *)
112+ mutable left : int;
113+ (** Leftmost column in range *)
114+ mutable right : int;
115+ (** First column to right not in range *)
116+ mutable next : column_range option;
117+ (** Next range in linked list *)
118+}
119+120+(** Create a column range *)
121+let make_column_range ~element ~left ~right =
122+ { element; left; right; next = None }
123+124+(** Check if column range contains a single column *)
125+let is_single_col range = range.left + 1 = range.right
126+127+(** Test if a column hits a range (-1=left, 0=in, 1=right) *)
128+let hits_column range column =
129+ if column < range.left then -1
130+ else if column >= range.right then 1
131+ else 0
132+133+(** Remove a column from a range, returning the new range(s) *)
134+let remove_column range column =
135+ if is_single_col range then None
136+ else if column = range.left then (
137+ range.left <- range.left + 1;
138+ Some range)
139+ else if column + 1 = range.right then (
140+ range.right <- range.right - 1;
141+ Some range)
142+ else
143+ (* Split into two ranges *)
144+ let created = make_column_range ~element:range.element ~left:(column + 1) ~right:range.right in
145+ created.next <- range.next;
146+ range.next <- Some created;
147+ range.right <- column;
148+ Some created
149+150+(** {1 Row Group State} *)
151+152+(** State for a row group (explicit or implicit) *)
153+type row_group = {
154+ mutable current_row : int;
155+ (** Current row index within this group *)
156+ mutable insertion_point : int;
157+ (** Column position for next cell insertion *)
158+ mutable next_old_cell : int;
159+ (** Index into cells_on_current_row *)
160+ mutable row_had_cells : bool;
161+ (** Whether current row has any cells *)
162+ cells_in_effect : ((int * int), cell) Hashtbl.t;
163+ (** Cells from previous rows still spanning down, keyed by (bottom, left) *)
164+ mutable cells_on_current_row : cell array;
165+ (** Cells from previous rows affecting current row, sorted by left *)
166+ row_group_type : string option;
167+ (** Name of row group element (thead/tbody/tfoot) or None for implicit *)
168+}
169+170+(** Create a new row group *)
171+let make_row_group ~row_group_type =
172+ {
173+ current_row = -1;
174+ insertion_point = 0;
175+ next_old_cell = 0;
176+ row_had_cells = false;
177+ cells_in_effect = Hashtbl.create 16;
178+ cells_on_current_row = [||];
179+ row_group_type;
180+ }
181+182+(** Start a new row in the row group *)
183+let start_row_in_group group =
184+ group.current_row <- group.current_row + 1;
185+ group.insertion_point <- 0;
186+ group.next_old_cell <- 0;
187+ group.row_had_cells <- false;
188+ (* Collect cells still in effect and sort by left column *)
189+ let active_cells : cell list =
190+ Hashtbl.fold
191+ (fun _ (cell : cell) acc -> if not (should_cull_cell cell ~row:group.current_row) then cell :: acc else acc)
192+ group.cells_in_effect []
193+ in
194+ let sorted = List.sort (fun (c1 : cell) (c2 : cell) -> Int.compare c1.left c2.left) active_cells in
195+ group.cells_on_current_row <- Array.of_list sorted
196+197+(** Find the next available insertion point *)
198+let rec find_insertion_point group =
199+ if group.next_old_cell < Array.length group.cells_on_current_row then
200+ let other = group.cells_on_current_row.(group.next_old_cell) in
201+ if group.insertion_point < other.left then ()
202+ else (
203+ let right = other.right in
204+ if right > group.insertion_point then group.insertion_point <- right;
205+ group.next_old_cell <- group.next_old_cell + 1;
206+ find_insertion_point group)
207+208+(** Add a cell to the row group *)
209+let add_cell_to_group group cell collector =
210+ group.row_had_cells <- true;
211+ find_insertion_point group;
212+ set_cell_position cell ~row:group.current_row ~col:group.insertion_point;
213+214+ (* Check for overlaps with cells from previous rows *)
215+ for i = group.next_old_cell to Array.length group.cells_on_current_row - 1 do
216+ err_on_horizontal_overlap group.cells_on_current_row.(i) cell collector
217+ done;
218+219+ (* Add to cells in effect if it spans beyond current row *)
220+ if cell.bottom > group.current_row + 1 then
221+ Hashtbl.add group.cells_in_effect (cell.bottom, cell.left) cell;
222+223+ group.insertion_point <- cell.right
224+225+(** End the current row *)
226+let end_row_in_group group collector =
227+ (if not group.row_had_cells then
228+ let group_desc =
229+ match group.row_group_type with
230+ | None -> "an implicit row group"
231+ | Some t -> Printf.sprintf {|a row group established by a "%s" element|} t
232+ in
233+ Message_collector.add_error collector
234+ ~message:
235+ (Printf.sprintf {|Row %d of %s has no cells beginning on it.|}
236+ (group.current_row + 1) group_desc)
237+ ());
238+239+ find_insertion_point group;
240+ group.cells_on_current_row <- [||];
241+242+ (* Cull cells that don't span to next row *)
243+ let to_remove = ref [] in
244+ Hashtbl.iter
245+ (fun key cell ->
246+ if should_cull_cell cell ~row:(group.current_row + 1) then to_remove := key :: !to_remove)
247+ group.cells_in_effect;
248+ List.iter (Hashtbl.remove group.cells_in_effect) !to_remove;
249+250+ (* Return the final insertion point (row width) *)
251+ group.insertion_point
252+253+(** End the row group *)
254+let end_row_group group collector =
255+ Hashtbl.iter
256+ (fun _ cell -> err_if_not_rowspan_zero cell ~row_group_type:group.row_group_type collector)
257+ group.cells_in_effect
258+259+(** {1 Table State} *)
260+261+(** Parser state within a table *)
262+type table_state =
263+ | InTableAtStart
264+ | InTableAtPotentialRowGroupStart
265+ | InColgroup
266+ | InColInColgroup
267+ | InColInImplicitGroup
268+ | InRowGroup
269+ | InRowInRowGroup
270+ | InCellInRowGroup
271+ | InRowInImplicitRowGroup
272+ | InImplicitRowGroup
273+ | InCellInImplicitRowGroup
274+ | InTableColsSeen
275+276+(** State for a single table *)
277+type table = {
278+ mutable state : table_state;
279+ mutable suppressed_starts : int;
280+ (** Count of nested suppressed elements *)
281+ mutable hard_width : bool;
282+ (** Whether column count was set by col/colgroup *)
283+ mutable column_count : int;
284+ (** Established column count (-1 if not set) *)
285+ mutable real_column_count : int;
286+ (** Actual maximum column count seen *)
287+ mutable pending_colgroup_span : int;
288+ (** Span for colgroup without col children *)
289+ header_ids : (string, unit) Hashtbl.t;
290+ (** IDs of th elements *)
291+ cells_with_headers : cell list ref;
292+ (** Cells with headers attribute *)
293+ mutable current_row_group : row_group option;
294+ (** Current row group *)
295+ mutable first_col_range : column_range option;
296+ (** Head of column range list *)
297+ mutable last_col_range : column_range option;
298+ (** Tail of column range list *)
299+ mutable current_col_range : column_range option;
300+ (** Current range being inspected *)
301+ mutable previous_col_range : column_range option;
302+ (** Previous range inspected *)
303+}
304+305+(** Create a new table *)
306+let make_table () =
307+ {
308+ state = InTableAtStart;
309+ suppressed_starts = 0;
310+ hard_width = false;
311+ column_count = -1;
312+ real_column_count = 0;
313+ pending_colgroup_span = 0;
314+ header_ids = Hashtbl.create 16;
315+ cells_with_headers = ref [];
316+ current_row_group = None;
317+ first_col_range = None;
318+ last_col_range = None;
319+ current_col_range = None;
320+ previous_col_range = None;
321+ }
322+323+(** Append a column range to the list *)
324+let append_column_range table range =
325+ match table.last_col_range with
326+ | None ->
327+ table.first_col_range <- Some range;
328+ table.last_col_range <- Some range
329+ | Some last ->
330+ last.next <- Some range;
331+ table.last_col_range <- Some range
332+333+(** Report a cell back to table for column tracking *)
334+let report_cell_to_table table (cell : cell) =
335+ let left = cell.left in
336+ let right = cell.right in
337+338+ (* Check if cell extends past known columns *)
339+ if right > table.real_column_count then (
340+ if left = table.real_column_count then (
341+ (* Entirely past existing columns *)
342+ if left + 1 <> right then
343+ append_column_range table
344+ (make_column_range ~element:cell.element_name ~left:(left + 1) ~right);
345+ table.real_column_count <- right)
346+ else (
347+ (* Partially past existing columns *)
348+ append_column_range table
349+ (make_column_range ~element:cell.element_name ~left:table.real_column_count ~right);
350+ table.real_column_count <- right));
351+352+ (* Track column usage *)
353+ let rec process_ranges () =
354+ match table.current_col_range with
355+ | None -> ()
356+ | Some range ->
357+ let hit = hits_column range left in
358+ if hit = 0 then (
359+ (* Column hits this range - remove it *)
360+ match remove_column range left with
361+ | None ->
362+ (* Range destroyed *)
363+ if Option.is_some table.previous_col_range then
364+ (Option.get table.previous_col_range).next <- range.next;
365+ if table.first_col_range = Some range then table.first_col_range <- range.next;
366+ if table.last_col_range = Some range then table.last_col_range <- table.previous_col_range;
367+ table.current_col_range <- range.next
368+ | Some new_range ->
369+ if table.last_col_range = Some range then table.last_col_range <- Some new_range;
370+ table.current_col_range <- Some new_range)
371+ else if hit = -1 then
372+ ()
373+ else (
374+ (* hit = 1, try next range *)
375+ table.previous_col_range <- Some range;
376+ table.current_col_range <- range.next;
377+ process_ranges ())
378+ in
379+ process_ranges ()
380+381+(** {1 Attribute Parsing} *)
382+383+(** Parse a non-negative integer attribute, returning 1 if absent or invalid *)
384+let parse_non_negative_int attrs name =
385+ match List.assoc_opt name attrs with
386+ | None -> 1
387+ | Some v -> (
388+ try
389+ let n = int_of_string v in
390+ if n >= 0 then n else 1
391+ with Failure _ -> 1)
392+393+(** Parse a positive integer attribute, returning 1 if absent or invalid *)
394+let parse_positive_int attrs name =
395+ match List.assoc_opt name attrs with
396+ | None -> 1
397+ | Some v -> (
398+ try
399+ let n = int_of_string v in
400+ if n > 0 then n else 1
401+ with Failure _ -> 1)
402+403+(** Parse the headers attribute into a list of IDs *)
404+let parse_headers attrs =
405+ match List.assoc_opt "headers" attrs with
406+ | None -> []
407+ | Some v ->
408+ let parts = String.split_on_char ' ' v in
409+ List.filter (fun s -> String.length s > 0) parts
410+411+(** Parse span attribute, clamping to max_colspan *)
412+let parse_span attrs collector =
413+ let span = parse_non_negative_int attrs "span" in
414+ if span > max_colspan then (
415+ Message_collector.add_error collector
416+ ~message:
417+ (Printf.sprintf {|The value of the "span" attribute must be less than or equal to %d.|}
418+ max_colspan)
419+ ();
420+ max_colspan)
421+ else span
422+423+(** {1 Table Event Handlers} *)
424+425+(** Check if we should suppress the start event *)
426+let need_suppress_start table =
427+ if table.suppressed_starts > 0 then (
428+ table.suppressed_starts <- table.suppressed_starts + 1;
429+ true)
430+ else false
431+432+(** Check if we should suppress the end event *)
433+let need_suppress_end table =
434+ if table.suppressed_starts > 0 then (
435+ table.suppressed_starts <- table.suppressed_starts - 1;
436+ true)
437+ else false
438+439+(** Start a row group *)
440+let start_row_group table local_name collector =
441+ if need_suppress_start table then ()
442+ else
443+ match table.state with
444+ | InImplicitRowGroup -> (
445+ match table.current_row_group with
446+ | Some group ->
447+ end_row_group group collector;
448+ table.current_row_group <- Some (make_row_group ~row_group_type:(Some local_name));
449+ table.state <- InRowGroup
450+ | None -> failwith "Bug: InImplicitRowGroup but no current row group")
451+ | InTableAtStart | InTableColsSeen | InTableAtPotentialRowGroupStart ->
452+ table.current_row_group <- Some (make_row_group ~row_group_type:(Some local_name));
453+ table.state <- InRowGroup
454+ | _ -> table.suppressed_starts <- 1
455+456+(** End a row group *)
457+let end_row_group_handler table collector =
458+ if need_suppress_end table then ()
459+ else
460+ match table.state with
461+ | InRowGroup -> (
462+ match table.current_row_group with
463+ | Some group ->
464+ end_row_group group collector;
465+ table.current_row_group <- None;
466+ table.state <- InTableAtPotentialRowGroupStart
467+ | None -> failwith "Bug: InRowGroup but no current row group")
468+ | _ -> failwith "Bug: end_row_group in wrong state"
469+470+(** Start a row *)
471+let start_row table collector =
472+ if need_suppress_start table then ()
473+ else
474+ match table.state with
475+ | InTableAtStart | InTableColsSeen | InTableAtPotentialRowGroupStart ->
476+ table.current_row_group <- Some (make_row_group ~row_group_type:None);
477+ table.state <- InRowInImplicitRowGroup;
478+ table.current_col_range <- table.first_col_range;
479+ table.previous_col_range <- None;
480+ (match table.current_row_group with
481+ | Some group -> start_row_in_group group
482+ | None -> failwith "Bug: just created row group")
483+ | InImplicitRowGroup ->
484+ table.state <- InRowInImplicitRowGroup;
485+ table.current_col_range <- table.first_col_range;
486+ table.previous_col_range <- None;
487+ (match table.current_row_group with
488+ | Some group -> start_row_in_group group
489+ | None -> failwith "Bug: InImplicitRowGroup but no row group")
490+ | InRowGroup ->
491+ table.state <- InRowInRowGroup;
492+ table.current_col_range <- table.first_col_range;
493+ table.previous_col_range <- None;
494+ (match table.current_row_group with
495+ | Some group -> start_row_in_group group
496+ | None -> failwith "Bug: InRowGroup but no row group")
497+ | _ -> table.suppressed_starts <- 1
498+499+(** End a row *)
500+let end_row table collector =
501+ if need_suppress_end table then ()
502+ else
503+ match table.state with
504+ | InRowInRowGroup ->
505+ table.state <- InRowGroup;
506+ (match table.current_row_group with
507+ | Some group ->
508+ let row_width = end_row_in_group group collector in
509+ (* Check row width against column count *)
510+ if table.hard_width then (
511+ if row_width > table.column_count then
512+ Message_collector.add_error collector
513+ ~message:
514+ (Printf.sprintf
515+ {|A table row was %d columns wide and exceeded the column count established using column markup (%d).|}
516+ row_width table.column_count)
517+ ()
518+ else if row_width < table.column_count then
519+ Message_collector.add_error collector
520+ ~message:
521+ (Printf.sprintf
522+ {|A table row was %d columns wide, which is less than the column count established using column markup (%d).|}
523+ row_width table.column_count)
524+ ())
525+ else if table.column_count = -1 then
526+ table.column_count <- row_width
527+ else (
528+ if row_width > table.column_count then
529+ Message_collector.add_warning collector
530+ ~message:
531+ (Printf.sprintf
532+ {|A table row was %d columns wide and exceeded the column count established by the first row (%d).|}
533+ row_width table.column_count)
534+ ()
535+ else if row_width < table.column_count then
536+ Message_collector.add_warning collector
537+ ~message:
538+ (Printf.sprintf
539+ {|A table row was %d columns wide, which is less than the column count established by the first row (%d).|}
540+ row_width table.column_count)
541+ ())
542+ | None -> failwith "Bug: InRowInRowGroup but no row group")
543+ | InRowInImplicitRowGroup ->
544+ table.state <- InImplicitRowGroup;
545+ (match table.current_row_group with
546+ | Some group ->
547+ let row_width = end_row_in_group group collector in
548+ (* Same column count checking as above *)
549+ if table.hard_width then (
550+ if row_width > table.column_count then
551+ Message_collector.add_error collector
552+ ~message:
553+ (Printf.sprintf
554+ {|A table row was %d columns wide and exceeded the column count established using column markup (%d).|}
555+ row_width table.column_count)
556+ ()
557+ else if row_width < table.column_count then
558+ Message_collector.add_error collector
559+ ~message:
560+ (Printf.sprintf
561+ {|A table row was %d columns wide, which is less than the column count established using column markup (%d).|}
562+ row_width table.column_count)
563+ ())
564+ else if table.column_count = -1 then
565+ table.column_count <- row_width
566+ else (
567+ if row_width > table.column_count then
568+ Message_collector.add_warning collector
569+ ~message:
570+ (Printf.sprintf
571+ {|A table row was %d columns wide and exceeded the column count established by the first row (%d).|}
572+ row_width table.column_count)
573+ ()
574+ else if row_width < table.column_count then
575+ Message_collector.add_warning collector
576+ ~message:
577+ (Printf.sprintf
578+ {|A table row was %d columns wide, which is less than the column count established by the first row (%d).|}
579+ row_width table.column_count)
580+ ())
581+ | None -> failwith "Bug: InRowInImplicitRowGroup but no row group")
582+ | _ -> failwith "Bug: end_row in wrong state"
583+584+(** Start a cell *)
585+let start_cell table is_header attrs collector =
586+ if need_suppress_start table then ()
587+ else
588+ match table.state with
589+ | InRowInRowGroup ->
590+ table.state <- InCellInRowGroup;
591+ (* Record header ID if present *)
592+ if is_header then (
593+ match List.assoc_opt "id" attrs with
594+ | Some id when String.length id > 0 -> Hashtbl.replace table.header_ids id ()
595+ | _ -> ());
596+ (* Parse cell attributes *)
597+ let colspan = abs (parse_positive_int attrs "colspan") in
598+ let rowspan = abs (parse_non_negative_int attrs "rowspan") in
599+ let headers = parse_headers attrs in
600+ let cell = make_cell ~colspan ~rowspan ~headers ~is_header collector in
601+ if List.length headers > 0 then table.cells_with_headers := cell :: !(table.cells_with_headers);
602+ (match table.current_row_group with
603+ | Some group ->
604+ add_cell_to_group group cell collector;
605+ report_cell_to_table table cell
606+ | None -> failwith "Bug: InRowInRowGroup but no row group")
607+ | InRowInImplicitRowGroup ->
608+ table.state <- InCellInImplicitRowGroup;
609+ (* Same logic as above *)
610+ if is_header then (
611+ match List.assoc_opt "id" attrs with
612+ | Some id when String.length id > 0 -> Hashtbl.replace table.header_ids id ()
613+ | _ -> ());
614+ let colspan = abs (parse_positive_int attrs "colspan") in
615+ let rowspan = abs (parse_non_negative_int attrs "rowspan") in
616+ let headers = parse_headers attrs in
617+ let cell = make_cell ~colspan ~rowspan ~headers ~is_header collector in
618+ if List.length headers > 0 then table.cells_with_headers := cell :: !(table.cells_with_headers);
619+ (match table.current_row_group with
620+ | Some group ->
621+ add_cell_to_group group cell collector;
622+ report_cell_to_table table cell
623+ | None -> failwith "Bug: InRowInImplicitRowGroup but no row group")
624+ | _ -> table.suppressed_starts <- 1
625+626+(** End a cell *)
627+let end_cell table =
628+ if need_suppress_end table then ()
629+ else
630+ match table.state with
631+ | InCellInRowGroup -> table.state <- InRowInRowGroup
632+ | InCellInImplicitRowGroup -> table.state <- InRowInImplicitRowGroup
633+ | _ -> failwith "Bug: end_cell in wrong state"
634+635+(** Start a colgroup *)
636+let start_colgroup table attrs collector =
637+ if need_suppress_start table then ()
638+ else
639+ match table.state with
640+ | InTableAtStart ->
641+ table.hard_width <- true;
642+ table.column_count <- 0;
643+ table.pending_colgroup_span <- parse_span attrs collector;
644+ table.state <- InColgroup
645+ | InTableColsSeen ->
646+ table.pending_colgroup_span <- parse_span attrs collector;
647+ table.state <- InColgroup
648+ | _ -> table.suppressed_starts <- 1
649+650+(** End a colgroup *)
651+let end_colgroup table =
652+ if need_suppress_end table then ()
653+ else
654+ match table.state with
655+ | InColgroup ->
656+ if table.pending_colgroup_span <> 0 then (
657+ let right = table.column_count + abs table.pending_colgroup_span in
658+ let range = make_column_range ~element:"colgroup" ~left:table.column_count ~right in
659+ append_column_range table range;
660+ table.column_count <- right);
661+ table.real_column_count <- table.column_count;
662+ table.state <- InTableColsSeen
663+ | _ -> failwith "Bug: end_colgroup in wrong state"
664+665+(** Start a col *)
666+let start_col table attrs collector =
667+ if need_suppress_start table then ()
668+ else
669+ match table.state with
670+ | InTableAtStart ->
671+ table.hard_width <- true;
672+ table.column_count <- 0;
673+ table.state <- InColInImplicitGroup;
674+ let span = abs (parse_span attrs collector) in
675+ let right = table.column_count + span in
676+ let range = make_column_range ~element:"col" ~left:table.column_count ~right in
677+ append_column_range table range;
678+ table.column_count <- right;
679+ table.real_column_count <- table.column_count
680+ | InTableColsSeen ->
681+ table.state <- InColInImplicitGroup;
682+ let span = abs (parse_span attrs collector) in
683+ let right = table.column_count + span in
684+ let range = make_column_range ~element:"col" ~left:table.column_count ~right in
685+ append_column_range table range;
686+ table.column_count <- right;
687+ table.real_column_count <- table.column_count
688+ | InColgroup ->
689+ if table.pending_colgroup_span > 0 then
690+ Message_collector.add_warning collector
691+ ~message:
692+ (Printf.sprintf
693+ "A col element causes a span attribute with value %d to be ignored on the \
694+ parent colgroup."
695+ table.pending_colgroup_span)
696+ ();
697+ table.pending_colgroup_span <- 0;
698+ table.state <- InColInColgroup;
699+ let span = abs (parse_span attrs collector) in
700+ let right = table.column_count + span in
701+ let range = make_column_range ~element:"col" ~left:table.column_count ~right in
702+ append_column_range table range;
703+ table.column_count <- right;
704+ table.real_column_count <- table.column_count
705+ | _ -> table.suppressed_starts <- 1
706+707+(** End a col *)
708+let end_col table =
709+ if need_suppress_end table then ()
710+ else
711+ match table.state with
712+ | InColInImplicitGroup -> table.state <- InTableColsSeen
713+ | InColInColgroup -> table.state <- InColgroup
714+ | _ -> failwith "Bug: end_col in wrong state"
715+716+(** End a table *)
717+let end_table table collector =
718+ (match table.state with
719+ | InImplicitRowGroup -> (
720+ match table.current_row_group with
721+ | Some group ->
722+ end_row_group group collector;
723+ table.current_row_group <- None
724+ | None -> failwith "Bug: InImplicitRowGroup but no row group")
725+ | InTableAtStart | InTableAtPotentialRowGroupStart | InTableColsSeen -> ()
726+ | _ -> failwith "Bug: end_table in wrong state");
727+728+ (* Check header reference integrity *)
729+ List.iter
730+ (fun cell ->
731+ List.iter
732+ (fun heading ->
733+ if not (Hashtbl.mem table.header_ids heading) then
734+ Message_collector.add_error collector
735+ ~message:
736+ (Printf.sprintf
737+ {|The "headers" attribute on the element "%s" refers to the ID "%s", but there is no "th" element with that ID in the same table.|}
738+ cell.element_name heading)
739+ ())
740+ cell.headers)
741+ !(table.cells_with_headers);
742+743+ (* Check that each column established by col/colgroup has cells *)
744+ let rec check_ranges range =
745+ match range with
746+ | None -> ()
747+ | Some r ->
748+ if is_single_col r then
749+ Message_collector.add_error collector
750+ ~message:
751+ (Printf.sprintf {|Table column %d established by element "%s" has no cells beginning in it.|}
752+ r.right r.element)
753+ ()
754+ else
755+ Message_collector.add_error collector
756+ ~message:
757+ (Printf.sprintf
758+ {|Table columns in range %d…%d established by element "%s" have no cells beginning in them.|}
759+ (r.left + 1) r.right r.element)
760+ ();
761+ check_ranges r.next
762+ in
763+ check_ranges table.first_col_range
764+765+(** {1 Checker State} *)
766+767+type state = { tables : table list ref (* Stack of nested tables *) }
768+769+let create () = { tables = ref [] }
770+771+let reset state = state.tables := []
772+773+let start_element state ~name ~namespace ~attrs collector =
774+ match namespace with
775+ | Some ns when ns = html_ns -> (
776+ match name with
777+ | "table" ->
778+ (* Push a new table onto the stack *)
779+ state.tables := make_table () :: !(state.tables)
780+ | _ -> (
781+ match !(state.tables) with
782+ | [] -> ()
783+ | table :: _ -> (
784+ match name with
785+ | "td" -> start_cell table false attrs collector
786+ | "th" -> start_cell table true attrs collector
787+ | "tr" -> start_row table collector
788+ | "tbody" | "thead" | "tfoot" -> start_row_group table name collector
789+ | "col" -> start_col table attrs collector
790+ | "colgroup" -> start_colgroup table attrs collector
791+ | _ -> ())))
792+ | _ -> ()
793+794+let end_element state ~name ~namespace collector =
795+ match namespace with
796+ | Some ns when ns = html_ns -> (
797+ match name with
798+ | "table" -> (
799+ match !(state.tables) with
800+ | [] -> failwith "Bug: end table but no table on stack"
801+ | table :: rest ->
802+ end_table table collector;
803+ state.tables := rest)
804+ | _ -> (
805+ match !(state.tables) with
806+ | [] -> ()
807+ | table :: _ -> (
808+ match name with
809+ | "td" | "th" -> end_cell table
810+ | "tr" -> end_row table collector
811+ | "tbody" | "thead" | "tfoot" -> end_row_group_handler table collector
812+ | "col" -> end_col table
813+ | "colgroup" -> end_colgroup table
814+ | _ -> ())))
815+ | _ -> ()
816+817+let characters _state _text _collector = ()
818+819+let end_document state collector =
820+ if !(state.tables) <> [] then
821+ Message_collector.add_error collector ~message:"Unclosed table element at end of document." ()
822+823+let checker =
824+ (module struct
825+ type nonrec state = state
826+827+ let create = create
828+ let reset = reset
829+ let start_element = start_element
830+ let end_element = end_element
831+ let characters = characters
832+ let end_document = end_document
833+ end : Checker.S)
···1+(** Table structure validation checker.
2+3+ Validates HTML table integrity including:
4+ - Cell overlap detection (rowspan/colspan causing overlap)
5+ - Spanning past table boundaries
6+ - Proper table structure (thead/tbody/tfoot ordering)
7+ - Maximum colspan limit (1000 per HTML spec)
8+ - Maximum rowspan limit (65534 per HTML spec)
9+10+ {2 Validation Rules}
11+12+ {b Cell Positioning}
13+ - Detects when two cells claim the same grid position
14+ - Validates that rowspan/colspan don't cause cells to extend past boundaries
15+ - Tracks cell positions accounting for rowspan/colspan from previous rows
16+17+ {b Span Limits}
18+ - [colspan] must be positive and <= 1000
19+ - [rowspan] must be non-negative and <= 65534
20+ - [rowspan=0] is a special value meaning "span to end of row group"
21+22+ {b Table Structure}
23+ - [caption] must be first child of [table] if present
24+ - [thead] must come before [tbody] and [tfoot]
25+ - Only one [thead] and one [tfoot] allowed per table
26+ - [col] elements can establish explicit column count
27+ - [colgroup] elements can group columns
28+29+ {b Row Validation}
30+ - Each row must have at least one cell
31+ - Row widths should match the established column count
32+ - Cells cannot overlap horizontally in the same row
33+34+ {b Header References}
35+ - [headers] attribute on [td]/[th] must reference valid [th] IDs
36+ - All referenced header IDs must exist in the same table
37+38+ {3 Example Valid Table}
39+40+ {v
41+ <table>
42+ <thead>
43+ <tr>
44+ <th id="h1">Header 1</th>
45+ <th id="h2">Header 2</th>
46+ </tr>
47+ </thead>
48+ <tbody>
49+ <tr>
50+ <td headers="h1">Cell 1</td>
51+ <td headers="h2">Cell 2</td>
52+ </tr>
53+ </tbody>
54+ </table>
55+ v}
56+57+ {3 Example Invalid Table (Overlapping Cells)}
58+59+ {v
60+ <table>
61+ <tr>
62+ <td rowspan="2">A</td>
63+ <td>B</td>
64+ </tr>
65+ <tr>
66+ <td>C</td> <!-- Would overlap with A's rowspan -->
67+ </tr>
68+ </table>
69+ v}
70+71+ @see <https://html.spec.whatwg.org/multipage/tables.html> WHATWG HTML: Tables
72+ @see <https://www.w3.org/TR/html52/tabular-data.html> W3C HTML 5.2: Tables *)
73+74+include Checker.S
75+76+val checker : Checker.t
77+(** A first-class module instance of this checker.
78+79+ {b Usage:}
80+ {[
81+ let checker = Table_checker.checker in
82+ Checker_registry.register "table-structure" checker
83+ ]} *)
+2-1
lib/html5rw/parser/parser_impl.ml
···25module TreeBuilderSink = struct
26 type t = Parser_tree_builder.t
2728- let process tb token =
029 Parser_tree_builder.process_token tb token;
30 (* Check if we need to switch tokenizer state based on current element *)
31 (* Only switch for HTML namespace elements - SVG/MathML use different rules *)
···25module TreeBuilderSink = struct
26 type t = Parser_tree_builder.t
2728+ let process tb token ~line ~column =
29+ Parser_tree_builder.set_position tb ~line ~column;
30 Parser_tree_builder.process_token tb token;
31 (* Check if we need to switch tokenizer state based on current element *)
32 (* Only switch for HTML namespace elements - SVG/MathML use different rules *)
+10-1
lib/html5rw/parser/parser_tree_builder.ml
···42 fragment_context : fragment_context option;
43 mutable fragment_context_element : Dom.node option;
44 iframe_srcdoc : bool;
0045}
4647let create ?(collect_errors=false) ?fragment_context ?(iframe_srcdoc=false) () =
···66 fragment_context;
67 fragment_context_element = None;
68 iframe_srcdoc;
0069 } in
70 (* Initialize fragment parsing *)
71 (match fragment_context with
···110 | None -> ());
111 t
11200000113(* Error handling *)
114let parse_error t code =
115 if t.collect_errors then
116- t.errors <- { code = Parse_error_code.of_string code; line = 0; column = 0 } :: t.errors
117118(* Stack helpers *)
119let current_node t =
···194*)
195module type SINK = sig
196 type t
197- val process : t -> Tokenizer_token.t -> [ `Continue | `SwitchTo of Tokenizer_state.t ]
198 val adjusted_current_node_in_html_namespace : t -> bool
199end
200
···194*)
195module type SINK = sig
196 type t
197+ val process : t -> Tokenizer_token.t -> line:int -> column:int -> [ `Continue | `SwitchTo of Tokenizer_state.t ]
198 val adjusted_current_node_in_html_namespace : t -> bool
199end
200
+41-38
lib/html5rw/tokenizer/tokenizer_impl.ml
···11(* Token sink interface *)
12module type SINK = sig
13 type t
14- val process : t -> Tokenizer_token.t -> [ `Continue | `SwitchTo of Tokenizer_state.t ]
15 val adjusted_current_node_in_html_namespace : t -> bool
16end
17···184 let data = Buffer.contents t.pending_chars in
185 Buffer.clear t.pending_chars;
186 let data = if t.xml_mode then transform_xml_chars data else data in
187- ignore (S.process t.sink (Tokenizer_token.Character data))
0188 end
189 in
190191 let emit token =
192 emit_pending_chars ();
193- match S.process t.sink token with
0194 | `Continue -> ()
195 | `SwitchTo new_state -> t.state <- new_state
196 in
···278 handle_eof ()
279 end else if Tokenizer_stream.is_eof t.stream then begin
280 emit_pending_chars ();
281- ignore (S.process t.sink Tokenizer_token.EOF)
282 end else begin
283 step ();
284 process_state ()
···288 match t.state with
289 | Tokenizer_state.Data ->
290 emit_pending_chars ();
291- ignore (S.process t.sink Tokenizer_token.EOF)
292 | Tokenizer_state.Tag_open ->
293 error t "eof-before-tag-name";
294 emit_char t '<';
295 emit_pending_chars ();
296- ignore (S.process t.sink Tokenizer_token.EOF)
297 | Tokenizer_state.End_tag_open ->
298 error t "eof-before-tag-name";
299 emit_str t "</";
300 emit_pending_chars ();
301- ignore (S.process t.sink Tokenizer_token.EOF)
302 | Tokenizer_state.Tag_name
303 | Tokenizer_state.Before_attribute_name
304 | Tokenizer_state.Attribute_name
···311 | Tokenizer_state.Self_closing_start_tag ->
312 error t "eof-in-tag";
313 emit_pending_chars ();
314- ignore (S.process t.sink Tokenizer_token.EOF)
315 | Tokenizer_state.Rawtext ->
316 emit_pending_chars ();
317- ignore (S.process t.sink Tokenizer_token.EOF)
318 | Tokenizer_state.Rawtext_less_than_sign ->
319 emit_char t '<';
320 emit_pending_chars ();
321- ignore (S.process t.sink Tokenizer_token.EOF)
322 | Tokenizer_state.Rawtext_end_tag_open ->
323 emit_str t "</";
324 emit_pending_chars ();
325- ignore (S.process t.sink Tokenizer_token.EOF)
326 | Tokenizer_state.Rawtext_end_tag_name ->
327 emit_str t "</";
328 emit_str t (Buffer.contents t.temp_buffer);
329 emit_pending_chars ();
330- ignore (S.process t.sink Tokenizer_token.EOF)
331 | Tokenizer_state.Rcdata ->
332 emit_pending_chars ();
333- ignore (S.process t.sink Tokenizer_token.EOF)
334 | Tokenizer_state.Rcdata_less_than_sign ->
335 emit_char t '<';
336 emit_pending_chars ();
337- ignore (S.process t.sink Tokenizer_token.EOF)
338 | Tokenizer_state.Rcdata_end_tag_open ->
339 emit_str t "</";
340 emit_pending_chars ();
341- ignore (S.process t.sink Tokenizer_token.EOF)
342 | Tokenizer_state.Rcdata_end_tag_name ->
343 emit_str t "</";
344 emit_str t (Buffer.contents t.temp_buffer);
345 emit_pending_chars ();
346- ignore (S.process t.sink Tokenizer_token.EOF)
347 | Tokenizer_state.Script_data ->
348 emit_pending_chars ();
349- ignore (S.process t.sink Tokenizer_token.EOF)
350 | Tokenizer_state.Script_data_less_than_sign ->
351 emit_char t '<';
352 emit_pending_chars ();
353- ignore (S.process t.sink Tokenizer_token.EOF)
354 | Tokenizer_state.Script_data_end_tag_open ->
355 emit_str t "</";
356 emit_pending_chars ();
357- ignore (S.process t.sink Tokenizer_token.EOF)
358 | Tokenizer_state.Script_data_end_tag_name ->
359 emit_str t "</";
360 emit_str t (Buffer.contents t.temp_buffer);
361 emit_pending_chars ();
362- ignore (S.process t.sink Tokenizer_token.EOF)
363 | Tokenizer_state.Script_data_escape_start
364 | Tokenizer_state.Script_data_escape_start_dash
365 | Tokenizer_state.Script_data_escaped
···367 | Tokenizer_state.Script_data_escaped_dash_dash ->
368 error t "eof-in-script-html-comment-like-text";
369 emit_pending_chars ();
370- ignore (S.process t.sink Tokenizer_token.EOF)
371 | Tokenizer_state.Script_data_escaped_less_than_sign ->
372 emit_char t '<';
373 emit_pending_chars ();
374- ignore (S.process t.sink Tokenizer_token.EOF)
375 | Tokenizer_state.Script_data_escaped_end_tag_open ->
376 emit_str t "</";
377 emit_pending_chars ();
378- ignore (S.process t.sink Tokenizer_token.EOF)
379 | Tokenizer_state.Script_data_escaped_end_tag_name ->
380 emit_str t "</";
381 emit_str t (Buffer.contents t.temp_buffer);
382 emit_pending_chars ();
383- ignore (S.process t.sink Tokenizer_token.EOF)
384 | Tokenizer_state.Script_data_double_escape_start
385 | Tokenizer_state.Script_data_double_escaped
386 | Tokenizer_state.Script_data_double_escaped_dash
387 | Tokenizer_state.Script_data_double_escaped_dash_dash ->
388 error t "eof-in-script-html-comment-like-text";
389 emit_pending_chars ();
390- ignore (S.process t.sink Tokenizer_token.EOF)
391 | Tokenizer_state.Script_data_double_escaped_less_than_sign ->
392 (* '<' was already emitted when entering this state from Script_data_double_escaped *)
393 emit_pending_chars ();
394- ignore (S.process t.sink Tokenizer_token.EOF)
395 | Tokenizer_state.Script_data_double_escape_end ->
396 emit_pending_chars ();
397- ignore (S.process t.sink Tokenizer_token.EOF)
398 | Tokenizer_state.Plaintext ->
399 emit_pending_chars ();
400- ignore (S.process t.sink Tokenizer_token.EOF)
401 | Tokenizer_state.Comment_start
402 | Tokenizer_state.Comment_start_dash
403 | Tokenizer_state.Comment
···411 error t "eof-in-comment";
412 emit_current_comment ();
413 emit_pending_chars ();
414- ignore (S.process t.sink Tokenizer_token.EOF)
415 | Tokenizer_state.Bogus_comment ->
416 emit_current_comment ();
417 emit_pending_chars ();
418- ignore (S.process t.sink Tokenizer_token.EOF)
419 | Tokenizer_state.Markup_declaration_open ->
420 error t "incorrectly-opened-comment";
421 Buffer.clear t.current_comment;
422 emit_current_comment ();
423 emit_pending_chars ();
424- ignore (S.process t.sink Tokenizer_token.EOF)
425 | Tokenizer_state.Doctype
426 | Tokenizer_state.Before_doctype_name ->
427 error t "eof-in-doctype";
···429 t.current_doctype_force_quirks <- true;
430 emit_current_doctype ();
431 emit_pending_chars ();
432- ignore (S.process t.sink Tokenizer_token.EOF)
433 | Tokenizer_state.Doctype_name
434 | Tokenizer_state.After_doctype_name
435 | Tokenizer_state.After_doctype_public_keyword
···447 t.current_doctype_force_quirks <- true;
448 emit_current_doctype ();
449 emit_pending_chars ();
450- ignore (S.process t.sink Tokenizer_token.EOF)
451 | Tokenizer_state.Bogus_doctype ->
452 emit_current_doctype ();
453 emit_pending_chars ();
454- ignore (S.process t.sink Tokenizer_token.EOF)
455 | Tokenizer_state.Cdata_section ->
456 error t "eof-in-cdata";
457 emit_pending_chars ();
458- ignore (S.process t.sink Tokenizer_token.EOF)
459 | Tokenizer_state.Cdata_section_bracket ->
460 error t "eof-in-cdata";
461 emit_char t ']';
462 emit_pending_chars ();
463- ignore (S.process t.sink Tokenizer_token.EOF)
464 | Tokenizer_state.Cdata_section_end ->
465 error t "eof-in-cdata";
466 emit_str t "]]";
467 emit_pending_chars ();
468- ignore (S.process t.sink Tokenizer_token.EOF)
469 | Tokenizer_state.Character_reference ->
470 (* state_character_reference never ran, so initialize temp_buffer with & *)
471 Buffer.clear t.temp_buffer;
···617 (* Emit pending chars first, then emit null separately for proper tree builder handling *)
618 emit_pending_chars ();
619 error t "unexpected-null-character";
620- ignore (S.process t.sink (Tokenizer_token.Character "\x00"))
0621 | Some c ->
622 emit_char_checked c
623 | None -> ()
···11(* Token sink interface *)
12module type SINK = sig
13 type t
14+ val process : t -> Tokenizer_token.t -> line:int -> column:int -> [ `Continue | `SwitchTo of Tokenizer_state.t ]
15 val adjusted_current_node_in_html_namespace : t -> bool
16end
17···184 let data = Buffer.contents t.pending_chars in
185 Buffer.clear t.pending_chars;
186 let data = if t.xml_mode then transform_xml_chars data else data in
187+ let line, column = Tokenizer_stream.position t.stream in
188+ ignore (S.process t.sink (Tokenizer_token.Character data) ~line ~column)
189 end
190 in
191192 let emit token =
193 emit_pending_chars ();
194+ let line, column = Tokenizer_stream.position t.stream in
195+ match S.process t.sink token ~line ~column with
196 | `Continue -> ()
197 | `SwitchTo new_state -> t.state <- new_state
198 in
···280 handle_eof ()
281 end else if Tokenizer_stream.is_eof t.stream then begin
282 emit_pending_chars ();
283+ let line, column = Tokenizer_stream.position t.stream in ignore (S.process t.sink Tokenizer_token.EOF ~line ~column)
284 end else begin
285 step ();
286 process_state ()
···290 match t.state with
291 | Tokenizer_state.Data ->
292 emit_pending_chars ();
293+ let line, column = Tokenizer_stream.position t.stream in ignore (S.process t.sink Tokenizer_token.EOF ~line ~column)
294 | Tokenizer_state.Tag_open ->
295 error t "eof-before-tag-name";
296 emit_char t '<';
297 emit_pending_chars ();
298+ let line, column = Tokenizer_stream.position t.stream in ignore (S.process t.sink Tokenizer_token.EOF ~line ~column)
299 | Tokenizer_state.End_tag_open ->
300 error t "eof-before-tag-name";
301 emit_str t "</";
302 emit_pending_chars ();
303+ let line, column = Tokenizer_stream.position t.stream in ignore (S.process t.sink Tokenizer_token.EOF ~line ~column)
304 | Tokenizer_state.Tag_name
305 | Tokenizer_state.Before_attribute_name
306 | Tokenizer_state.Attribute_name
···313 | Tokenizer_state.Self_closing_start_tag ->
314 error t "eof-in-tag";
315 emit_pending_chars ();
316+ let line, column = Tokenizer_stream.position t.stream in ignore (S.process t.sink Tokenizer_token.EOF ~line ~column)
317 | Tokenizer_state.Rawtext ->
318 emit_pending_chars ();
319+ let line, column = Tokenizer_stream.position t.stream in ignore (S.process t.sink Tokenizer_token.EOF ~line ~column)
320 | Tokenizer_state.Rawtext_less_than_sign ->
321 emit_char t '<';
322 emit_pending_chars ();
323+ let line, column = Tokenizer_stream.position t.stream in ignore (S.process t.sink Tokenizer_token.EOF ~line ~column)
324 | Tokenizer_state.Rawtext_end_tag_open ->
325 emit_str t "</";
326 emit_pending_chars ();
327+ let line, column = Tokenizer_stream.position t.stream in ignore (S.process t.sink Tokenizer_token.EOF ~line ~column)
328 | Tokenizer_state.Rawtext_end_tag_name ->
329 emit_str t "</";
330 emit_str t (Buffer.contents t.temp_buffer);
331 emit_pending_chars ();
332+ let line, column = Tokenizer_stream.position t.stream in ignore (S.process t.sink Tokenizer_token.EOF ~line ~column)
333 | Tokenizer_state.Rcdata ->
334 emit_pending_chars ();
335+ let line, column = Tokenizer_stream.position t.stream in ignore (S.process t.sink Tokenizer_token.EOF ~line ~column)
336 | Tokenizer_state.Rcdata_less_than_sign ->
337 emit_char t '<';
338 emit_pending_chars ();
339+ let line, column = Tokenizer_stream.position t.stream in ignore (S.process t.sink Tokenizer_token.EOF ~line ~column)
340 | Tokenizer_state.Rcdata_end_tag_open ->
341 emit_str t "</";
342 emit_pending_chars ();
343+ let line, column = Tokenizer_stream.position t.stream in ignore (S.process t.sink Tokenizer_token.EOF ~line ~column)
344 | Tokenizer_state.Rcdata_end_tag_name ->
345 emit_str t "</";
346 emit_str t (Buffer.contents t.temp_buffer);
347 emit_pending_chars ();
348+ let line, column = Tokenizer_stream.position t.stream in ignore (S.process t.sink Tokenizer_token.EOF ~line ~column)
349 | Tokenizer_state.Script_data ->
350 emit_pending_chars ();
351+ let line, column = Tokenizer_stream.position t.stream in ignore (S.process t.sink Tokenizer_token.EOF ~line ~column)
352 | Tokenizer_state.Script_data_less_than_sign ->
353 emit_char t '<';
354 emit_pending_chars ();
355+ let line, column = Tokenizer_stream.position t.stream in ignore (S.process t.sink Tokenizer_token.EOF ~line ~column)
356 | Tokenizer_state.Script_data_end_tag_open ->
357 emit_str t "</";
358 emit_pending_chars ();
359+ let line, column = Tokenizer_stream.position t.stream in ignore (S.process t.sink Tokenizer_token.EOF ~line ~column)
360 | Tokenizer_state.Script_data_end_tag_name ->
361 emit_str t "</";
362 emit_str t (Buffer.contents t.temp_buffer);
363 emit_pending_chars ();
364+ let line, column = Tokenizer_stream.position t.stream in ignore (S.process t.sink Tokenizer_token.EOF ~line ~column)
365 | Tokenizer_state.Script_data_escape_start
366 | Tokenizer_state.Script_data_escape_start_dash
367 | Tokenizer_state.Script_data_escaped
···369 | Tokenizer_state.Script_data_escaped_dash_dash ->
370 error t "eof-in-script-html-comment-like-text";
371 emit_pending_chars ();
372+ let line, column = Tokenizer_stream.position t.stream in ignore (S.process t.sink Tokenizer_token.EOF ~line ~column)
373 | Tokenizer_state.Script_data_escaped_less_than_sign ->
374 emit_char t '<';
375 emit_pending_chars ();
376+ let line, column = Tokenizer_stream.position t.stream in ignore (S.process t.sink Tokenizer_token.EOF ~line ~column)
377 | Tokenizer_state.Script_data_escaped_end_tag_open ->
378 emit_str t "</";
379 emit_pending_chars ();
380+ let line, column = Tokenizer_stream.position t.stream in ignore (S.process t.sink Tokenizer_token.EOF ~line ~column)
381 | Tokenizer_state.Script_data_escaped_end_tag_name ->
382 emit_str t "</";
383 emit_str t (Buffer.contents t.temp_buffer);
384 emit_pending_chars ();
385+ let line, column = Tokenizer_stream.position t.stream in ignore (S.process t.sink Tokenizer_token.EOF ~line ~column)
386 | Tokenizer_state.Script_data_double_escape_start
387 | Tokenizer_state.Script_data_double_escaped
388 | Tokenizer_state.Script_data_double_escaped_dash
389 | Tokenizer_state.Script_data_double_escaped_dash_dash ->
390 error t "eof-in-script-html-comment-like-text";
391 emit_pending_chars ();
392+ let line, column = Tokenizer_stream.position t.stream in ignore (S.process t.sink Tokenizer_token.EOF ~line ~column)
393 | Tokenizer_state.Script_data_double_escaped_less_than_sign ->
394 (* '<' was already emitted when entering this state from Script_data_double_escaped *)
395 emit_pending_chars ();
396+ let line, column = Tokenizer_stream.position t.stream in ignore (S.process t.sink Tokenizer_token.EOF ~line ~column)
397 | Tokenizer_state.Script_data_double_escape_end ->
398 emit_pending_chars ();
399+ let line, column = Tokenizer_stream.position t.stream in ignore (S.process t.sink Tokenizer_token.EOF ~line ~column)
400 | Tokenizer_state.Plaintext ->
401 emit_pending_chars ();
402+ let line, column = Tokenizer_stream.position t.stream in ignore (S.process t.sink Tokenizer_token.EOF ~line ~column)
403 | Tokenizer_state.Comment_start
404 | Tokenizer_state.Comment_start_dash
405 | Tokenizer_state.Comment
···413 error t "eof-in-comment";
414 emit_current_comment ();
415 emit_pending_chars ();
416+ let line, column = Tokenizer_stream.position t.stream in ignore (S.process t.sink Tokenizer_token.EOF ~line ~column)
417 | Tokenizer_state.Bogus_comment ->
418 emit_current_comment ();
419 emit_pending_chars ();
420+ let line, column = Tokenizer_stream.position t.stream in ignore (S.process t.sink Tokenizer_token.EOF ~line ~column)
421 | Tokenizer_state.Markup_declaration_open ->
422 error t "incorrectly-opened-comment";
423 Buffer.clear t.current_comment;
424 emit_current_comment ();
425 emit_pending_chars ();
426+ let line, column = Tokenizer_stream.position t.stream in ignore (S.process t.sink Tokenizer_token.EOF ~line ~column)
427 | Tokenizer_state.Doctype
428 | Tokenizer_state.Before_doctype_name ->
429 error t "eof-in-doctype";
···431 t.current_doctype_force_quirks <- true;
432 emit_current_doctype ();
433 emit_pending_chars ();
434+ let line, column = Tokenizer_stream.position t.stream in ignore (S.process t.sink Tokenizer_token.EOF ~line ~column)
435 | Tokenizer_state.Doctype_name
436 | Tokenizer_state.After_doctype_name
437 | Tokenizer_state.After_doctype_public_keyword
···449 t.current_doctype_force_quirks <- true;
450 emit_current_doctype ();
451 emit_pending_chars ();
452+ let line, column = Tokenizer_stream.position t.stream in ignore (S.process t.sink Tokenizer_token.EOF ~line ~column)
453 | Tokenizer_state.Bogus_doctype ->
454 emit_current_doctype ();
455 emit_pending_chars ();
456+ let line, column = Tokenizer_stream.position t.stream in ignore (S.process t.sink Tokenizer_token.EOF ~line ~column)
457 | Tokenizer_state.Cdata_section ->
458 error t "eof-in-cdata";
459 emit_pending_chars ();
460+ let line, column = Tokenizer_stream.position t.stream in ignore (S.process t.sink Tokenizer_token.EOF ~line ~column)
461 | Tokenizer_state.Cdata_section_bracket ->
462 error t "eof-in-cdata";
463 emit_char t ']';
464 emit_pending_chars ();
465+ let line, column = Tokenizer_stream.position t.stream in ignore (S.process t.sink Tokenizer_token.EOF ~line ~column)
466 | Tokenizer_state.Cdata_section_end ->
467 error t "eof-in-cdata";
468 emit_str t "]]";
469 emit_pending_chars ();
470+ let line, column = Tokenizer_stream.position t.stream in ignore (S.process t.sink Tokenizer_token.EOF ~line ~column)
471 | Tokenizer_state.Character_reference ->
472 (* state_character_reference never ran, so initialize temp_buffer with & *)
473 Buffer.clear t.temp_buffer;
···619 (* Emit pending chars first, then emit null separately for proper tree builder handling *)
620 emit_pending_chars ();
621 error t "unexpected-null-character";
622+ let line, column = Tokenizer_stream.position t.stream in
623+ ignore (S.process t.sink (Tokenizer_token.Character "\x00") ~line ~column)
624 | Some c ->
625 emit_char_checked c
626 | None -> ()
···1+(** Tests for the html5_checker library *)
2+3+(** Helper to create a reader from a string *)
4+let reader_of_string s = Bytesrw.Bytes.Reader.of_string s
5+6+(** Helper to check if a message contains a substring *)
7+let message_contains msg substring =
8+ String.lowercase_ascii msg.Html5_checker.Message.message
9+ |> fun s -> String.length s >= String.length substring &&
10+ try
11+ ignore (Str.search_forward (Str.regexp_case_fold (Str.quote substring)) s 0);
12+ true
13+ with Not_found -> false
14+15+(** Test that valid HTML5 produces no errors *)
16+let test_valid_html5 () =
17+ Printf.printf "Test 1: Valid HTML5 document\n";
18+ let html = {|<!DOCTYPE html>
19+<html lang="en">
20+<head><title>Test</title></head>
21+<body><p>Hello world</p></body>
22+</html>|} in
23+ let reader = reader_of_string html in
24+ let result = Html5_checker.check reader in
25+ let errors = Html5_checker.errors result in
26+ Printf.printf " Found %d error(s)\n" (List.length errors);
27+ if List.length errors > 0 then begin
28+ List.iter (fun msg ->
29+ Printf.printf " - %s\n" msg.Html5_checker.Message.message
30+ ) errors;
31+ end else
32+ Printf.printf " OK: No errors as expected\n"
33+34+(** Test that missing DOCTYPE is detected *)
35+let test_missing_doctype () =
36+ Printf.printf "\nTest 2: Missing DOCTYPE\n";
37+ let html = "<html><body>Hello</body></html>" in
38+ let reader = reader_of_string html in
39+ let result = Html5_checker.check reader in
40+ let errors = Html5_checker.errors result in
41+ Printf.printf " Found %d error(s)\n" (List.length errors);
42+ if List.length errors = 0 then
43+ Printf.printf " Warning: Expected parse errors for missing DOCTYPE\n"
44+ else begin
45+ List.iter (fun msg ->
46+ Printf.printf " - %s\n" msg.Html5_checker.Message.message
47+ ) errors;
48+ end
49+50+(** Test that obsolete elements are detected *)
51+let test_obsolete_element () =
52+ Printf.printf "\nTest 3: Obsolete <center> element\n";
53+ let html = "<!DOCTYPE html><html><body><center>Centered</center></body></html>" in
54+ let reader = reader_of_string html in
55+ let result = Html5_checker.check reader in
56+ let all_msgs = Html5_checker.messages result in
57+ Printf.printf " Found %d message(s)\n" (List.length all_msgs);
58+ let obsolete_msgs = List.filter (fun m ->
59+ message_contains m "obsolete" || message_contains m "center"
60+ ) all_msgs in
61+ if List.length obsolete_msgs > 0 then begin
62+ Printf.printf " Found obsolete-related messages:\n";
63+ List.iter (fun msg ->
64+ Printf.printf " - %s\n" msg.Html5_checker.Message.message
65+ ) obsolete_msgs;
66+ end else
67+ Printf.printf " Note: No obsolete element warnings found (checker may not be enabled)\n"
68+69+(** Test duplicate IDs *)
70+let test_duplicate_id () =
71+ Printf.printf "\nTest 4: Duplicate ID attributes\n";
72+ let html = {|<!DOCTYPE html><html><body>
73+ <div id="foo">First</div>
74+ <div id="foo">Second</div>
75+ </body></html>|} in
76+ let reader = reader_of_string html in
77+ let result = Html5_checker.check reader in
78+ let all_msgs = Html5_checker.messages result in
79+ Printf.printf " Found %d message(s)\n" (List.length all_msgs);
80+ let id_msgs = List.filter (fun m ->
81+ message_contains m "duplicate" || message_contains m "id"
82+ ) all_msgs in
83+ if List.length id_msgs > 0 then begin
84+ Printf.printf " Found ID-related messages:\n";
85+ List.iter (fun msg ->
86+ Printf.printf " - %s\n" msg.Html5_checker.Message.message
87+ ) id_msgs;
88+ end else
89+ Printf.printf " Note: No duplicate ID errors found (checker may not be enabled)\n"
90+91+(** Test heading structure *)
92+let test_heading_skip () =
93+ Printf.printf "\nTest 5: Skipped heading level\n";
94+ let html = {|<!DOCTYPE html><html><body>
95+ <h1>Title</h1>
96+ <h3>Skipped h2</h3>
97+ </body></html>|} in
98+ let reader = reader_of_string html in
99+ let result = Html5_checker.check reader in
100+ let all_msgs = Html5_checker.messages result in
101+ Printf.printf " Found %d message(s)\n" (List.length all_msgs);
102+ let heading_msgs = List.filter (fun m ->
103+ message_contains m "heading" || message_contains m "skip"
104+ ) all_msgs in
105+ if List.length heading_msgs > 0 then begin
106+ Printf.printf " Found heading-related messages:\n";
107+ List.iter (fun msg ->
108+ Printf.printf " - %s\n" msg.Html5_checker.Message.message
109+ ) heading_msgs;
110+ end else
111+ Printf.printf " Note: No heading structure warnings found (checker may not be enabled)\n"
112+113+(** Test img without alt *)
114+let test_img_without_alt () =
115+ Printf.printf "\nTest 6: Image without alt attribute\n";
116+ let html = {|<!DOCTYPE html><html><body>
117+ <img src="test.jpg">
118+ </body></html>|} in
119+ let reader = reader_of_string html in
120+ let result = Html5_checker.check reader in
121+ let all_msgs = Html5_checker.messages result in
122+ Printf.printf " Found %d message(s)\n" (List.length all_msgs);
123+ let img_msgs = List.filter (fun m ->
124+ message_contains m "alt" || (message_contains m "img" && message_contains m "attribute")
125+ ) all_msgs in
126+ if List.length img_msgs > 0 then begin
127+ Printf.printf " Found img/alt-related messages:\n";
128+ List.iter (fun msg ->
129+ Printf.printf " - %s\n" msg.Html5_checker.Message.message
130+ ) img_msgs;
131+ end else
132+ Printf.printf " Note: No missing alt attribute errors found (checker may not be enabled)\n"
133+134+(** Test invalid nesting *)
135+let test_invalid_nesting () =
136+ Printf.printf "\nTest 7: Invalid nesting - <a> inside <a>\n";
137+ let html = {|<!DOCTYPE html><html><body>
138+ <a href="#">Link <a href="#">Nested</a></a>
139+ </body></html>|} in
140+ let reader = reader_of_string html in
141+ let result = Html5_checker.check reader in
142+ let all_msgs = Html5_checker.messages result in
143+ Printf.printf " Found %d message(s)\n" (List.length all_msgs);
144+ let nesting_msgs = List.filter (fun m ->
145+ message_contains m "nesting" || message_contains m "nested" || message_contains m "ancestor"
146+ ) all_msgs in
147+ if List.length nesting_msgs > 0 then begin
148+ Printf.printf " Found nesting-related messages:\n";
149+ List.iter (fun msg ->
150+ Printf.printf " - %s\n" msg.Html5_checker.Message.message
151+ ) nesting_msgs;
152+ end else
153+ Printf.printf " Note: No nesting errors found (checker may not be enabled)\n"
154+155+(** Test form inside form *)
156+let test_form_nesting () =
157+ Printf.printf "\nTest 8: Invalid nesting - <form> inside <form>\n";
158+ let html = {|<!DOCTYPE html><html><body>
159+ <form><form></form></form>
160+ </body></html>|} in
161+ let reader = reader_of_string html in
162+ let result = Html5_checker.check reader in
163+ let all_msgs = Html5_checker.messages result in
164+ Printf.printf " Found %d message(s)\n" (List.length all_msgs);
165+ let form_msgs = List.filter (fun m ->
166+ message_contains m "form"
167+ ) all_msgs in
168+ if List.length form_msgs > 0 then begin
169+ Printf.printf " Found form-related messages:\n";
170+ List.iter (fun msg ->
171+ Printf.printf " - %s\n" msg.Html5_checker.Message.message
172+ ) form_msgs;
173+ end else
174+ Printf.printf " Note: No form nesting errors found (checker may not be enabled)\n"
175+176+(** Test output formatting *)
177+let test_output_formats () =
178+ Printf.printf "\nTest 9: Output format testing\n";
179+ let html = {|<!DOCTYPE html><html><body><p>Test</p></body></html>|} in
180+ let reader = reader_of_string html in
181+ let result = Html5_checker.check reader in
182+183+ Printf.printf " Testing text format:\n";
184+ let text_output = Html5_checker.format_text result in
185+ Printf.printf " Length: %d chars\n" (String.length text_output);
186+187+ Printf.printf " Testing JSON format:\n";
188+ let json_output = Html5_checker.format_json result in
189+ Printf.printf " Length: %d chars\n" (String.length json_output);
190+191+ Printf.printf " Testing GNU format:\n";
192+ let gnu_output = Html5_checker.format_gnu result in
193+ Printf.printf " Length: %d chars\n" (String.length gnu_output)
194+195+(** Test has_errors function *)
196+let test_has_errors () =
197+ Printf.printf "\nTest 10: has_errors function\n";
198+199+ (* Valid document should have no errors *)
200+ let valid_html = "<!DOCTYPE html><html><body><p>Valid</p></body></html>" in
201+ let result1 = Html5_checker.check (reader_of_string valid_html) in
202+ Printf.printf " Valid document has_errors: %b\n" (Html5_checker.has_errors result1);
203+204+ (* Document with likely parse errors *)
205+ let invalid_html = "<html><body><p>Unclosed" in
206+ let result2 = Html5_checker.check (reader_of_string invalid_html) in
207+ Printf.printf " Invalid document has_errors: %b\n" (Html5_checker.has_errors result2)
208+209+(** Test check_dom with pre-parsed document *)
210+let test_check_dom () =
211+ Printf.printf "\nTest 11: check_dom with pre-parsed document\n";
212+ let html = "<!DOCTYPE html><html><body><p>Test</p></body></html>" in
213+ let reader = reader_of_string html in
214+ let parsed = Html5rw.parse reader in
215+ let result = Html5_checker.check_dom parsed in
216+ let all_msgs = Html5_checker.messages result in
217+ Printf.printf " check_dom found %d message(s)\n" (List.length all_msgs);
218+ Printf.printf " OK: check_dom completed successfully\n"
219+220+(** Test system_id parameter *)
221+let test_system_id () =
222+ Printf.printf "\nTest 12: system_id parameter\n";
223+ let html = "<!DOCTYPE html><html><body><p>Test</p></body></html>" in
224+ let reader = reader_of_string html in
225+ let result = Html5_checker.check ~system_id:"test.html" reader in
226+ match Html5_checker.system_id result with
227+ | Some id -> Printf.printf " system_id: %s\n" id
228+ | None -> Printf.printf " Warning: system_id not set\n"
229+230+(** Test collect_parse_errors flag *)
231+let test_collect_parse_errors_flag () =
232+ Printf.printf "\nTest 13: collect_parse_errors flag\n";
233+ let html = "<!DOCTYPE html><html><body><p>Test</p></body></html>" in
234+235+ let result_with = Html5_checker.check ~collect_parse_errors:true (reader_of_string html) in
236+ let msgs_with = Html5_checker.messages result_with in
237+ Printf.printf " With parse errors: %d message(s)\n" (List.length msgs_with);
238+239+ let result_without = Html5_checker.check ~collect_parse_errors:false (reader_of_string html) in
240+ let msgs_without = Html5_checker.messages result_without in
241+ Printf.printf " Without parse errors: %d message(s)\n" (List.length msgs_without)
242+243+(** Test document accessor *)
244+let test_document_accessor () =
245+ Printf.printf "\nTest 14: document accessor\n";
246+ let html = "<!DOCTYPE html><html><body><p>Test</p></body></html>" in
247+ let reader = reader_of_string html in
248+ let result = Html5_checker.check reader in
249+ let _doc = Html5_checker.document result in
250+ Printf.printf " OK: document accessor works\n"
251+252+(** Test message severity filtering *)
253+let test_severity_filtering () =
254+ Printf.printf "\nTest 15: Message severity filtering\n";
255+ let html = "<!DOCTYPE html><html><body><p>Test</p></body></html>" in
256+ let reader = reader_of_string html in
257+ let result = Html5_checker.check reader in
258+259+ let all_msgs = Html5_checker.messages result in
260+ let errors = Html5_checker.errors result in
261+ let warnings = Html5_checker.warnings result in
262+263+ Printf.printf " Total messages: %d\n" (List.length all_msgs);
264+ Printf.printf " Errors: %d\n" (List.length errors);
265+ Printf.printf " Warnings: %d\n" (List.length warnings);
266+267+ (* Verify that errors + warnings <= all messages *)
268+ if List.length errors + List.length warnings <= List.length all_msgs then
269+ Printf.printf " OK: Message counts are consistent\n"
270+ else
271+ Printf.printf " Warning: Message counts inconsistent\n"
272+273+(** Run all tests *)
274+let () =
275+ Printf.printf "Running html5_checker tests...\n";
276+ Printf.printf "========================================\n\n";
277+278+ test_valid_html5 ();
279+ test_missing_doctype ();
280+ test_obsolete_element ();
281+ test_duplicate_id ();
282+ test_heading_skip ();
283+ test_img_without_alt ();
284+ test_invalid_nesting ();
285+ test_form_nesting ();
286+ test_output_formats ();
287+ test_has_errors ();
288+ test_check_dom ();
289+ test_system_id ();
290+ test_collect_parse_errors_flag ();
291+ test_document_accessor ();
292+ test_severity_filtering ();
293+294+ Printf.printf "\n========================================\n";
295+ Printf.printf "All tests completed!\n";
296+ Printf.printf "\nNote: Some checkers may not be enabled yet.\n";
297+ Printf.printf "Tests marked with 'Note:' indicate features that may be\n";
298+ Printf.printf "implemented in future versions.\n"