···1-(*---------------------------------------------------------------------------
2- Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
3- SPDX-License-Identifier: MIT
4- ---------------------------------------------------------------------------*)
5-6-(** HTML5 conformance checker.
7-8- This module provides HTML5 validation and conformance checking,
9- combining parse error detection with structural validation rules. *)
10-11-(** {1 Re-exported modules} *)
12-13-(** Validation message types and constructors. *)
14-module Message = Message
15-16-(** Message collection utilities. *)
17-module Message_collector = Message_collector
18-19-(** Message output formatters. *)
20-module Message_format = Message_format
21-22-(** Parse error bridge. *)
23-module Parse_error_bridge = Parse_error_bridge
24-25-(** {2 Content Model Framework} *)
26-27-(** HTML5 content categories. *)
28-module Content_category = Content_category
29-30-(** HTML5 element content models. *)
31-module Content_model = Content_model
32-33-(** HTML5 attribute specifications. *)
34-module Attr_spec = Attr_spec
35-36-(** HTML5 element specifications. *)
37-module Element_spec = Element_spec
38-39-(** Typed error codes. *)
40-module Error_code = Error_code
41-42-(** {1 Core Types} *)
43-44-(** Result of checking an HTML document. *)
45-type t
46-47-(** {1 Checking Functions} *)
48-49-(** Parse and validate HTML from a reader.
50-51- This function parses the HTML input and optionally collects parse errors.
52- Future versions will also run conformance checkers on the resulting DOM.
53-54- @param collect_parse_errors If true, collect and include parse errors. Default: true.
55- @param system_id Optional file path or URL for error reporting.
56- @param reader Bytesrw reader containing HTML input. *)
57-val check :
58- ?collect_parse_errors:bool ->
59- ?system_id:string ->
60- Bytesrw.Bytes.Reader.t ->
61- t
62-63-(** Validate an already-parsed HTML document.
64-65- This function takes an existing Html5rw.t parse result and validates it.
66-67- @param collect_parse_errors If true, collect and include parse errors from the result. Default: true.
68- @param system_id Optional file path or URL for error reporting.
69- @param result Already-parsed HTML document. *)
70-val check_dom :
71- ?collect_parse_errors:bool ->
72- ?system_id:string ->
73- Html5rw.t ->
74- t
75-76-(** {1 Result Accessors} *)
77-78-(** Get all validation messages. *)
79-val messages : t -> Message.t list
80-81-(** Get only error messages. *)
82-val errors : t -> Message.t list
83-84-(** Get only warning messages. *)
85-val warnings : t -> Message.t list
86-87-(** Get only info messages. *)
88-val infos : t -> Message.t list
89-90-(** Check if there are any errors. *)
91-val has_errors : t -> bool
92-93-(** Get the underlying parsed document. *)
94-val document : t -> Html5rw.t
95-96-(** Get the system identifier if set. *)
97-val system_id : t -> string option
98-99-(** {1 Formatting} *)
100-101-(** Format messages as human-readable text. *)
102-val format_text : t -> string
103-104-(** Format messages as JSON. *)
105-val format_json : t -> string
106-107-(** Format messages in GNU style. *)
108-val format_gnu : t -> string
···1+(*---------------------------------------------------------------------------
2+ Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
3+ SPDX-License-Identifier: MIT
4+ ---------------------------------------------------------------------------*)
5+6+(** HTML5 Conformance Checker
7+8+ This module validates HTML5 documents against the
9+ {{:https://html.spec.whatwg.org/} WHATWG HTML Living Standard},
10+ reporting conformance errors, warnings, and suggestions.
11+12+ {2 Quick Start}
13+14+ {[
15+ (* Validate HTML from a string *)
16+ let html = "<html><body><img></body></html>" in
17+ let reader = Bytesrw.Bytes.Reader.of_string html in
18+ let result = Htmlrw_check.check reader in
19+20+ if Htmlrw_check.has_errors result then begin
21+ List.iter (fun msg ->
22+ Printf.printf "%s: %s\n"
23+ (Htmlrw_check.severity_to_string msg.Htmlrw_check.severity)
24+ msg.Htmlrw_check.text
25+ ) (Htmlrw_check.errors result)
26+ end
27+ ]}
28+29+ {2 What Gets Checked}
30+31+ The checker validates:
32+33+ - {b Parse errors}: Malformed HTML syntax (missing end tags, invalid
34+ nesting, etc.) per the WHATWG parsing specification
35+ - {b Content model}: Elements appearing in contexts where they're not
36+ allowed (e.g., [<div>] inside [<span>])
37+ - {b Attributes}: Missing required attributes, disallowed attributes,
38+ and invalid attribute values
39+ - {b Accessibility}: ARIA role/attribute misuse, missing alt text on
40+ images, form labeling issues
41+ - {b Document structure}: Missing DOCTYPE, duplicate IDs, heading
42+ hierarchy issues
43+ - {b Internationalization}: Missing or mismatched lang attributes
44+45+ {2 Output Formats}
46+47+ Results can be formatted as:
48+ - {b Text}: Human-readable messages for terminal output
49+ - {b JSON}: Machine-readable format compatible with the Nu HTML Checker
50+ - {b GNU}: Error format for IDE integration
51+52+ @see <https://html.spec.whatwg.org/>
53+ WHATWG HTML Living Standard
54+ @see <https://validator.w3.org/nu/>
55+ Nu HTML Checker (reference validator) *)
56+57+(** {1:types Types} *)
58+59+(** Message severity level.
60+61+ - [Error]: Conformance error - the document violates the HTML5 spec
62+ - [Warning]: Likely problem - should be reviewed but may be intentional
63+ - [Info]: Suggestion - best practice recommendation *)
64+type severity = Error | Warning | Info
65+66+(** Source location of a validation issue.
67+68+ Locations use 1-based line and column numbers matching typical editor
69+ conventions. The [system_id] field contains the file path or URL if one
70+ was provided to the checker. *)
71+type location = {
72+ line : int;
73+ (** Line number (1-indexed) where the issue was found. *)
74+75+ column : int;
76+ (** Column number (1-indexed) within the line. *)
77+78+ end_line : int option;
79+ (** End line for issues spanning multiple lines. *)
80+81+ end_column : int option;
82+ (** End column for range-based issues. *)
83+84+ system_id : string option;
85+ (** File path or URL, if provided to the checker. *)
86+}
87+88+(** A validation message describing a conformance issue.
89+90+ Each message contains:
91+ - The {!field-severity} indicating how serious the issue is
92+ - Human-readable {!field-text} explaining the problem
93+ - Machine-readable {!field-code} for programmatic handling
94+ - Optional {!field-error_code} for fine-grained pattern matching
95+ - Source {!field-location} when available
96+ - Context ({!field-element}, {!field-attribute}) when relevant *)
97+type message = {
98+ severity : severity;
99+ (** Severity level of this message. *)
100+101+ text : string;
102+ (** Human-readable description of the issue.
103+104+ The text follows Nu HTML Checker message conventions, using
105+ Unicode curly quotes (U+201C/U+201D, written below as UTF-8 byte escapes) around element/attribute names:
106+ ["Element \xe2\x80\x9cdiv\xe2\x80\x9d not allowed as child..."] *)
107+108+ code : string;
109+ (** Machine-readable error code in kebab-case.
110+111+ Examples: ["missing-alt"], ["duplicate-id"], ["unexpected-end-tag"].
112+ Useful for filtering or categorizing errors programmatically. *)
113+114+ error_code : Error_code.t option;
115+ (** Typed error code for pattern matching.
116+117+ When present, allows fine-grained handling of specific errors:
118+ {[
119+ match msg.error_code with
120+ | Some (`Img `Missing_alt) -> suggest_alt_text ()
121+ | Some (`Attr (`Duplicate_id (`Id id))) -> highlight_duplicate id
122+ | _ -> show_generic_error msg
123+ ]} *)
124+125+ location : location option;
126+ (** Source location where the issue was detected.
127+128+ [None] for document-level issues or when location tracking is
129+ unavailable (e.g., for some content model errors). *)
130+131+ element : string option;
132+ (** Element name relevant to this message (e.g., ["img"], ["div"]).
133+134+ Lowercase, without angle brackets. *)
135+136+ attribute : string option;
137+ (** Attribute name relevant to this message (e.g., ["alt"], ["href"]).
138+139+ Lowercase. Only present for attribute-related errors. *)
140+141+ extract : string option;
142+ (** Source excerpt showing context around the error.
143+144+ Typically a few characters before and after the problematic location.
145+ Useful for displaying the error in context. *)
146+}
147+148+(** Validation result containing all messages and the parsed document.
149+150+ Use {!messages}, {!errors}, {!warnings}, and {!infos} to access
151+ the validation messages. Use {!document} to access the parsed DOM. *)
152+type t
153+154+(** {1:validation Validation Functions} *)
155+156+(** Validate HTML from a reader.
157+158+ Parses the HTML input and runs all conformance checks, returning
159+ a result containing any validation messages.
160+161+ {b Example:}
162+ {[
163+ let ic = open_in_bin "page.html" in
164+ let reader = Bytesrw.Bytes.Reader.of_in_channel ic in
165+ let result = Htmlrw_check.check ~system_id:"page.html" reader in
166+ close_in ic;
167+168+ if Htmlrw_check.has_errors result then
169+ print_endline (Htmlrw_check.to_text result)
170+ ]}
171+172+ @param collect_parse_errors If [true] (default), include HTML parse
173+ errors in the results. Set to [false] to only get conformance
174+ checker errors (content model, attributes, etc.).
175+ @param system_id File path or URL for the document. Used in error
176+ messages and the [system_id] field of {!type-location}. Does not affect validation. *)
177+val check :
178+ ?collect_parse_errors:bool ->
179+ ?system_id:string ->
180+ Bytesrw.Bytes.Reader.t ->
181+ t
182+183+(** Validate an already-parsed HTML document.
184+185+ Runs conformance checks on an existing {!Html5rw.t} parse result.
186+ Useful when you've already parsed the document and want to validate
187+ it without re-parsing.
188+189+ {b Example:}
190+ {[
191+ let doc = Html5rw.parse reader in
192+ (* ... manipulate the DOM ... *)
193+ Htmlrw_check.has_errors (Htmlrw_check.check_parsed doc)
194+ ]}
195+196+ @param collect_parse_errors If [true] (default), include any parse
197+ errors that were collected during the original parse.
198+ @param system_id File path or URL for error reporting. *)
199+val check_parsed :
200+ ?collect_parse_errors:bool ->
201+ ?system_id:string ->
202+ Html5rw.t ->
203+ t
204+205+(** {1:results Result Accessors} *)
206+207+(** Get all validation messages.
208+209+ Returns messages in the order they were generated, which roughly
210+ corresponds to document order for element-related errors. *)
211+val messages : t -> message list
212+213+(** Get only error messages.
214+215+ Errors indicate conformance violations - the document does not
216+ comply with the HTML5 specification. *)
217+val errors : t -> message list
218+219+(** Get only warning messages.
220+221+ Warnings indicate likely problems that may be intentional in
222+ some cases (e.g., deprecated features still in use). *)
223+val warnings : t -> message list
224+225+(** Get only informational messages.
226+227+ Info messages are suggestions for best practices that don't
228+ affect conformance. *)
229+val infos : t -> message list
230+231+(** Test if any errors were found.
232+233+ Equivalent to [errors result <> []] but more efficient. *)
234+val has_errors : t -> bool
235+236+(** Test if any warnings were found.
237+238+ Equivalent to [warnings result <> []] but more efficient. *)
239+val has_warnings : t -> bool
240+241+(** Get the parsed document.
242+243+ Returns the DOM tree that was validated. For {!check}, this is the
244+ newly parsed document. For {!check_parsed}, this is the document
245+ that was passed in. *)
246+val document : t -> Html5rw.t
247+248+(** Get the system identifier.
249+250+ Returns the file path or URL that was passed to {!check} or
251+ {!check_parsed}, or [None] if not provided. *)
252+val system_id : t -> string option
253+254+(** {1:formatting Output Formatting} *)
255+256+(** Format messages as human-readable text.
257+258+ Produces multi-line output suitable for terminal display:
259+ {v
260+ Error: Element "img" is missing required attribute "alt".
261+ At line 5, column 3
262+ <img src="photo.jpg">
263+ v}
264+265+ Messages are formatted with severity, description, location,
266+ and source excerpt when available. *)
267+val to_text : t -> string
268+269+(** Format messages as JSON.
270+271+ Produces JSON output compatible with the Nu HTML Checker format:
272+ {v
273+ {
274+ "messages": [
275+ {
276+ "type": "error",
277+ "message": "Element \"img\" is missing required attribute \"alt\".",
278+ "lastLine": 5,
279+ "lastColumn": 3
280+ }
281+ ]
282+ }
283+ v}
284+285+ Useful for machine processing and integration with other tools. *)
286+val to_json : t -> string
287+288+(** Format messages in GNU error format.
289+290+ Produces one-line-per-error output for IDE integration:
291+ {v
292+ page.html:5:3: error: Element "img" is missing required attribute "alt".
293+ v}
294+295+ This format is recognized by many editors and build tools. *)
296+val to_gnu : t -> string
297+298+(** {1:utilities Utility Functions} *)
299+300+(** Convert severity to lowercase string.
301+302+ Returns ["error"], ["warning"], or ["info"]. *)
303+val severity_to_string : severity -> string
304+305+(** Pretty-print a severity value. The signature is the printer shape accepted by [%a] in {!Format} format strings. *)
306+val pp_severity : Format.formatter -> severity -> unit
307+308+(** Pretty-print a location. The signature is the printer shape accepted by [%a] in {!Format} format strings. *)
309+val pp_location : Format.formatter -> location -> unit
310+311+(** Pretty-print a message.
312+313+ Includes severity, text, and location if available. The signature is the printer shape accepted by [%a] in {!Format} format strings. *)
314+val pp_message : Format.formatter -> message -> unit
315+316+(** Convert a message to a single-line string.
317+318+ Includes severity and message text. *)
319+val message_to_string : message -> string
320+321+(** {1:error_codes Error Codes}
322+323+ The {!Error_code} module provides typed error codes for programmatic
324+ handling of validation issues. Use pattern matching to handle specific
325+ errors:
326+327+ {[
328+ let handle_message msg =
329+ match msg.Htmlrw_check.error_code with
330+ | Some (`Img `Missing_alt) ->
331+ (* Image accessibility issue *)
332+ suggest_alt_text msg
333+ | Some (`Attr (`Duplicate_id (`Id id))) ->
334+ (* Duplicate ID found *)
335+ highlight_all_with_id id
336+ | Some (`Aria _) ->
337+ (* Any ARIA-related error *)
338+ show_aria_help ()
339+ | _ ->
340+ (* Generic handling *)
341+ display_error msg
342+ ]}
343+344+ The error codes are organized into categories:
345+ - [`Attr _]: Attribute errors (missing, invalid, duplicate)
346+ - [`Element _]: Element/content model errors
347+ - [`Aria _]: ARIA accessibility errors
348+ - [`Img _]: Image-related errors
349+ - [`Table _]: Table structure errors
350+ - And more...
351+352+ See {!Error_code} for the complete type definition. *)
353+module Error_code = Error_code