···3535 Printf.printf " Line %d, Col %d: %s\n"
3636 (Html5rw.error_line err)
3737 (Html5rw.error_column err)
3838- (Html5rw.error_code err)
3838+ (Html5rw.Parse_error_code.to_string (Html5rw.error_code err))
3939 ) errs;
40404141 (* The parser still produces a valid DOM tree *)
+4-1
lib/html5rw/html5rw.ml
···36363737(** {1 Sub-modules} *)
38383939+(** Parse error code types *)
4040+module Parse_error_code = Parse_error_code
4141+3942(** DOM types and manipulation functions *)
4043module Dom = Dom
4144···9699(** Get the namespace from a fragment context *)
97100let fragment_context_namespace = Parser.fragment_context_namespace
981019999-(** Get the error code string *)
102102+(** Get the error code *)
100103let error_code = Parser.error_code
101104102105(** Get the line number of an error (1-indexed) *)
+23-5
lib/html5rw/html5rw.mli
···122122123123(** {1 Sub-modules} *)
124124125125+(** Parse error code types.
126126+127127+ This module provides the {!Parse_error_code.t} variant type that represents
128128+ all WHATWG-defined parse errors plus tree construction errors.
129129+130130+ @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-errors>
131131+ WHATWG: Parse errors *)
132132+module Parse_error_code = Parse_error_code
133133+125134(** DOM types and manipulation functions.
126135127136 This module provides the core types for representing HTML documents as
···334343 WHATWG: Complete list of parse errors *)
335344type parse_error = Parser.parse_error
336345337337-(** Get the error code string.
346346+(** Get the error code.
347347+348348+ Returns the {!Parse_error_code.t} variant representing this error.
349349+ This allows pattern matching on specific error types:
350350+351351+ {[
352352+ match Html5rw.error_code err with
353353+ | Parse_error_code.Unexpected_null_character -> (* handle *)
354354+ | Parse_error_code.Eof_in_tag -> (* handle *)
355355+ | Parse_error_code.Tree_construction_error msg -> (* handle tree error *)
356356+ | _ -> (* other *)
357357+ ]}
338358339339- Error codes are lowercase with hyphens, matching the WHATWG specification
340340- names. Examples: ["unexpected-null-character"], ["eof-in-tag"],
341341- ["missing-end-tag-name"].
359359+ Use {!Parse_error_code.to_string} to convert to a string representation.
342360343361 @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-errors>
344362 WHATWG: Parse error codes *)
345345-val error_code : parse_error -> string
363363+val error_code : parse_error -> Parse_error_code.t
346364347365(** Get the line number where the error occurred (1-indexed).
348366
···11+(*---------------------------------------------------------------------------
22+ Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
33+ SPDX-License-Identifier: MIT
44+ ---------------------------------------------------------------------------*)
55+66+(** Parse error codes as defined by the WHATWG HTML5 specification.
77+88+ The HTML5 parser never fails - it always produces a DOM tree. However,
99+ the specification defines these error codes for conformance checkers to
1010+ report issues in HTML documents.
1111+1212+ Each error code corresponds to a specific condition in the WHATWG
1313+ specification's parsing algorithm.
1414+1515+ @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-errors>
1616+ WHATWG: Parse errors *)
1717+1818+type t =
1919+ | Abrupt_closing_of_empty_comment
2020+ (** Parser encounters [<!-->] or [<!--->]; comment is treated as
2121+ correctly closed.
2222+2323+ @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-abrupt-closing-of-empty-comment> *)
2424+2525+ | Abrupt_doctype_public_identifier
2626+ (** [>] found in DOCTYPE public identifier before closing quote;
2727+ sets document to quirks mode.
2828+2929+ @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-abrupt-doctype-public-identifier> *)
3030+3131+ | Abrupt_doctype_system_identifier
3232+ (** [>] found in DOCTYPE system identifier before closing quote;
3333+ sets document to quirks mode.
3434+3535+ @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-abrupt-doctype-system-identifier> *)
3636+3737+ | Absence_of_digits_in_numeric_character_reference
3838+ (** Numeric character reference has no digits (e.g., [&#qux;]);
3939+ the reference is not resolved.
4040+4141+ @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-absence-of-digits-in-numeric-character-reference> *)
4242+4343+ | Cdata_in_html_content
4444+ (** CDATA section found outside SVG or MathML foreign content;
4545+ treated as a bogus comment.
4646+4747+ @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-cdata-in-html-content> *)
4848+4949+ | Character_reference_outside_unicode_range
5050+ (** Numeric reference exceeds U+10FFFF; resolves to U+FFFD
5151+ REPLACEMENT CHARACTER.
5252+5353+ @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-character-reference-outside-unicode-range> *)
5454+5555+ | Control_character_in_input_stream
5656+ (** Control code point (other than ASCII whitespace or NULL)
5757+ appears in the input; parsed as-is.
5858+5959+ @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-control-character-in-input-stream> *)
6060+6161+ | Control_character_reference
6262+ (** Numeric reference to a control character; handled per
6363+ specification replacement rules.
6464+6565+ @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-control-character-reference> *)
6666+6767+ | Duplicate_attribute
6868+ (** Tag contains duplicate attribute names; later duplicates
6969+ are removed.
7070+7171+ @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-duplicate-attribute> *)
7272+7373+ | End_tag_with_attributes
7474+ (** End tag includes attributes; attributes are ignored.
7575+7676+ @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-end-tag-with-attributes> *)
7777+7878+ | End_tag_with_trailing_solidus
7979+ (** End tag has [/] before [>] (like [</br/>]); treated as
8080+ regular end tag.
8181+8282+ @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-end-tag-with-trailing-solidus> *)
8383+8484+ | Eof_before_tag_name
8585+ (** End of input where tag name expected; [<] or [</] is
8686+ treated as text.
8787+8888+ @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-eof-before-tag-name> *)
8989+9090+ | Eof_in_cdata
9191+ (** End of input within CDATA section; treated as immediately closed.
9292+9393+ @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-eof-in-cdata> *)
9494+9595+ | Eof_in_comment
9696+ (** End of input within comment; comment is treated as
9797+ immediately closed.
9898+9999+ @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-eof-in-comment> *)
100100+101101+ | Eof_in_doctype
102102+ (** End of input within DOCTYPE; sets document to quirks mode.
103103+104104+ @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-eof-in-doctype> *)
105105+106106+ | Eof_in_script_html_comment_like_text
107107+ (** End of input within HTML-like comment syntax inside a script element.
108108+109109+ @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-eof-in-script-html-comment-like-text> *)
110110+111111+ | Eof_in_tag
112112+ (** End of input within a start or end tag; the tag is ignored.
113113+114114+ @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-eof-in-tag> *)
115115+116116+ | Incorrectly_closed_comment
117117+ (** Comment closed by [--!>] instead of [-->]; treated as
118118+ correctly closed.
119119+120120+ @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-incorrectly-closed-comment> *)
121121+122122+ | Incorrectly_opened_comment
123123+ (** [<!] not followed by [--], a DOCTYPE, or a CDATA section start
124124+ (e.g., [<!ELEMENT]); content is treated as a bogus comment.
125125+126126+ @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-incorrectly-opened-comment> *)
127127+128128+ | Invalid_character_sequence_after_doctype_name
129129+ (** Neither "PUBLIC" nor "SYSTEM" after DOCTYPE name; sets
130130+ document to quirks mode.
131131+132132+ @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-invalid-character-sequence-after-doctype-name> *)
133133+134134+ | Invalid_first_character_of_tag_name
135135+ (** Non-ASCII-alpha character where tag name start expected; after
136136+ [<] the [<] is treated as text, after [</] the content becomes a bogus comment.
137137+138138+ @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-invalid-first-character-of-tag-name> *)
139139+140140+ | Missing_attribute_value
141141+ (** [>] where attribute value expected (e.g., [<div id=>]);
142142+ attribute gets empty string value.
143143+144144+ @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-missing-attribute-value> *)
145145+146146+ | Missing_doctype_name
147147+ (** DOCTYPE has no name; sets document to quirks mode.
148148+149149+ @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-missing-doctype-name> *)
150150+151151+ | Missing_doctype_public_identifier
152152+ (** [>] where public identifier expected; sets quirks mode.
153153+154154+ @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-missing-doctype-public-identifier> *)
155155+156156+ | Missing_doctype_system_identifier
157157+ (** [>] where system identifier expected; sets quirks mode.
158158+159159+ @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-missing-doctype-system-identifier> *)
160160+161161+ | Missing_end_tag_name
162162+ (** [>] where end tag name expected ([</>]); sequence is ignored.
163163+164164+ @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-missing-end-tag-name> *)
165165+166166+ | Missing_quote_before_doctype_public_identifier
167167+ (** Public identifier lacks preceding quote; sets quirks mode.
168168+169169+ @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-missing-quote-before-doctype-public-identifier> *)
170170+171171+ | Missing_quote_before_doctype_system_identifier
172172+ (** System identifier lacks preceding quote; sets quirks mode.
173173+174174+ @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-missing-quote-before-doctype-system-identifier> *)
175175+176176+ | Missing_semicolon_after_character_reference
177177+ (** Character reference lacks terminating [;]; behaves as if
178178+ semicolon were present.
179179+180180+ @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-missing-semicolon-after-character-reference> *)
181181+182182+ | Missing_whitespace_after_doctype_public_keyword
183183+ (** No whitespace between "PUBLIC" and identifier; treated as
184184+ if whitespace were present.
185185+186186+ @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-missing-whitespace-after-doctype-public-keyword> *)
187187+188188+ | Missing_whitespace_after_doctype_system_keyword
189189+ (** No whitespace between "SYSTEM" and identifier; treated as
190190+ if whitespace were present.
191191+192192+ @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-missing-whitespace-after-doctype-system-keyword> *)
193193+194194+ | Missing_whitespace_before_doctype_name
195195+ (** No whitespace between "DOCTYPE" and name; treated as if
196196+ whitespace were present.
197197+198198+ @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-missing-whitespace-before-doctype-name> *)
199199+200200+ | Missing_whitespace_between_attributes
201201+ (** Adjacent attributes lack separating whitespace; treated as
202202+ if whitespace were present.
203203+204204+ @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-missing-whitespace-between-attributes> *)
205205+206206+ | Missing_whitespace_between_doctype_public_and_system_identifiers
207207+ (** Public and system identifiers not separated by whitespace;
208208+ treated as if whitespace were present.
209209+210210+ @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-missing-whitespace-between-doctype-public-and-system-identifiers> *)
211211+212212+ | Nested_comment
213213+ (** Nested [<!--] detected within comment; comment still closes
214214+ at first [-->].
215215+216216+ @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-nested-comment> *)
217217+218218+ | Noncharacter_character_reference
219219+ (** Numeric reference to a Unicode noncharacter; resolved as-is
220220+ (not replaced).
221221+222222+ @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-noncharacter-character-reference> *)
223223+224224+ | Noncharacter_in_input_stream
225225+ (** Unicode noncharacter code point in input; parsed as-is.
226226+227227+ @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-noncharacter-in-input-stream> *)
228228+229229+ | Non_void_html_element_start_tag_with_trailing_solidus
230230+ (** Non-void element start tag has [/] before [>] (like
231231+ [<div/>]); the [/] is ignored.
232232+233233+ @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-non-void-html-element-start-tag-with-trailing-solidus> *)
234234+235235+ | Null_character_reference
236236+ (** Numeric reference to U+0000 (NULL); resolves to U+FFFD
237237+ REPLACEMENT CHARACTER.
238238+239239+ @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-null-character-reference> *)
240240+241241+ | Surrogate_character_reference
242242+ (** Numeric reference to a surrogate code point (U+D800-U+DFFF);
243243+ resolves to U+FFFD REPLACEMENT CHARACTER.
244244+245245+ @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-surrogate-character-reference> *)
246246+247247+ | Surrogate_in_input_stream
248248+ (** Surrogate code point in input stream; parsed as-is.
249249+250250+ @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-surrogate-in-input-stream> *)
251251+252252+ | Unexpected_character_after_doctype_system_identifier
253253+ (** Non-whitespace/non-[>] character after system identifier;
254254+ the character is ignored.
255255+256256+ @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-unexpected-character-after-doctype-system-identifier> *)
257257+258258+ | Unexpected_character_in_attribute_name
259259+ (** Double quote, single quote, or less-than sign in attribute name;
260260+ included in the attribute name.
261261+262262+ @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-unexpected-character-in-attribute-name> *)
263263+264264+ | Unexpected_character_in_unquoted_attribute_value
265265+ (** Double quote, single quote, less-than sign, equals sign, or backtick
266266+ in unquoted attribute value; included in the value.
267267+268268+ @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-unexpected-character-in-unquoted-attribute-value> *)
269269+270270+ | Unexpected_equals_sign_before_attribute_name
271271+ (** [=] where attribute name expected; treated as first
272272+ character of attribute name.
273273+274274+ @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-unexpected-equals-sign-before-attribute-name> *)
275275+276276+ | Unexpected_null_character
277277+ (** U+0000 (NULL) in various positions; ignored or replaced
278278+ with U+FFFD depending on context.
279279+280280+ @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-unexpected-null-character> *)
281281+282282+ | Unexpected_question_mark_instead_of_tag_name
283283+ (** [?] where tag name expected (like [<?xml]); treated as
284284+ start of bogus comment.
285285+286286+ @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-unexpected-question-mark-instead-of-tag-name> *)
287287+288288+ | Unexpected_solidus_in_tag
289289+ (** [/] in tag not immediately before [>]; treated as
290290+ whitespace.
291291+292292+ @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-unexpected-solidus-in-tag> *)
293293+294294+ | Unknown_named_character_reference
295295+ (** Ambiguous ampersand: [&] followed by characters that don't
296296+ match any named reference; not resolved as reference.
297297+298298+ @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-unknown-named-character-reference> *)
299299+300300+ | Tree_construction_error of string
301301+ (** Tree construction error not defined in the WHATWG specification.
302302+303303+ These are informative errors produced during tree construction
304304+ to indicate various issues like unexpected tags, missing closing
305305+ tags, etc. The string contains a descriptive error code. *)
306306+307307+val to_string : t -> string
308308+(** Convert an error code to its WHATWG specification string representation.
309309+310310+ The returned string is lowercase with hyphens, matching the WHATWG
311311+ specification naming convention. For example:
312312+ - [Abrupt_closing_of_empty_comment] becomes ["abrupt-closing-of-empty-comment"]
313313+ - [Eof_in_tag] becomes ["eof-in-tag"] *)
314314+315315+val of_string : string -> t
316316+(** Parse an error code from its WHATWG specification string representation.
317317+318318+ If the string matches a known WHATWG error code, returns that variant.
319319+ Otherwise, returns [Tree_construction_error s]. *)
320320+321321+val of_string_opt : string -> t option
322322+(** Parse an error code from its WHATWG specification string representation.
323323+324324+ Always returns [Some code]. For unrecognized strings, returns
325325+ [Some (Tree_construction_error s)]. *)
326326+327327+val is_whatwg_standard : t -> bool
328328+(** Check if an error code is defined in the WHATWG specification.
329329+330330+ Returns [false] for [Tree_construction_error _], [true] for all others. *)
···117117(** DOM types and manipulation. *)
118118module Dom = Dom
119119120120+(** Parse error code types.
121121+122122+ This module provides the {!Parse_error_code.t} variant type that represents
123123+ all WHATWG-defined parse errors plus tree construction errors.
124124+125125+ @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-errors>
126126+ WHATWG: Parse errors *)
127127+module Parse_error_code = Parse_error_code
128128+120129(** HTML5 tokenizer.
121130122131 The tokenizer implements the first stage of HTML5 parsing, converting
···242251 WHATWG: Complete list of parse errors *)
243252type parse_error
244253245245-(** Get the error code string.
254254+(** Get the error code.
255255+256256+ Returns the {!Parse_error_code.t} variant representing this error.
257257+ This allows pattern matching on specific error types:
258258+259259+ {[
260260+ match Parser.error_code err with
261261+ | Parse_error_code.Unexpected_null_character -> (* handle *)
262262+ | Parse_error_code.Eof_in_tag -> (* handle *)
263263+ | Parse_error_code.Tree_construction_error msg -> (* handle tree error *)
264264+ | _ -> (* other *)
265265+ ]}
246266247247- Error codes are lowercase with hyphens, exactly matching the WHATWG
248248- specification naming. Examples:
249249- - ["unexpected-null-character"]
250250- - ["eof-before-tag-name"]
251251- - ["missing-end-tag-name"]
252252- - ["duplicate-attribute"]
253253- - ["missing-doctype"]
267267+ Use {!Parse_error_code.to_string} to convert to a string representation.
254268255269 @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-errors>
256270 WHATWG: Parse error codes *)
257257-val error_code : parse_error -> string
271271+val error_code : parse_error -> Parse_error_code.t
258272259273(** Get the line number where the error occurred.
260274
···140140(** Parse error types. *)
141141module Errors : sig
142142 type t = Tokenizer_errors.t = {
143143- code : string;
143143+ code : Parse_error_code.t;
144144 line : int;
145145 column : int;
146146 }
147147148148 val make : code:string -> line:int -> column:int -> t
149149+ (** Create an error from a string code. The string is converted to
150150+ {!Parse_error_code.t} using {!Parse_error_code.of_string}. *)
151151+152152+ val make_with_code : code:Parse_error_code.t -> line:int -> column:int -> t
153153+ (** Create an error with a typed error code. *)
154154+149155 val to_string : t -> string
150156end
151157