OCaml HTML5 parser/serialiser based on Python's JustHTML

switch to a typed error variant

+607 -22
+1 -1
examples/error_handling.ml
··· 35 35 Printf.printf " Line %d, Col %d: %s\n" 36 36 (Html5rw.error_line err) 37 37 (Html5rw.error_column err) 38 - (Html5rw.error_code err) 38 + (Html5rw.Parse_error_code.to_string (Html5rw.error_code err)) 39 39 ) errs; 40 40 41 41 (* The parser still produces a valid DOM tree *)
+4 -1
lib/html5rw/html5rw.ml
··· 36 36 37 37 (** {1 Sub-modules} *) 38 38 39 + (** Parse error code types *) 40 + module Parse_error_code = Parse_error_code 41 + 39 42 (** DOM types and manipulation functions *) 40 43 module Dom = Dom 41 44 ··· 96 99 (** Get the namespace from a fragment context *) 97 100 let fragment_context_namespace = Parser.fragment_context_namespace 98 101 99 - (** Get the error code string *) 102 + (** Get the error code *) 100 103 let error_code = Parser.error_code 101 104 102 105 (** Get the line number of an error (1-indexed) *)
+23 -5
lib/html5rw/html5rw.mli
··· 122 122 123 123 (** {1 Sub-modules} *) 124 124 125 + (** Parse error code types. 126 + 127 + This module provides the {!Parse_error_code.t} variant type that represents 128 + all WHATWG-defined parse errors plus tree construction errors. 129 + 130 + @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-errors> 131 + WHATWG: Parse errors *) 132 + module Parse_error_code = Parse_error_code 133 + 125 134 (** DOM types and manipulation functions. 126 135 127 136 This module provides the core types for representing HTML documents as ··· 334 343 WHATWG: Complete list of parse errors *) 335 344 type parse_error = Parser.parse_error 336 345 337 - (** Get the error code string. 346 + (** Get the error code. 347 + 348 + Returns the {!Parse_error_code.t} variant representing this error. 349 + This allows pattern matching on specific error types: 350 + 351 + {[ 352 + match Html5rw.error_code err with 353 + | Parse_error_code.Unexpected_null_character -> (* handle *) 354 + | Parse_error_code.Eof_in_tag -> (* handle *) 355 + | Parse_error_code.Tree_construction_error msg -> (* handle tree error *) 356 + | _ -> (* other *) 357 + ]} 338 358 339 - Error codes are lowercase with hyphens, matching the WHATWG specification 340 - names. Examples: ["unexpected-null-character"], ["eof-in-tag"], 341 - ["missing-end-tag-name"]. 359 + Use {!Parse_error_code.to_string} to convert to a string representation. 342 360 343 361 @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-errors> 344 362 WHATWG: Parse error codes *) 345 - val error_code : parse_error -> string 363 + val error_code : parse_error -> Parse_error_code.t 346 364 347 365 (** Get the line number where the error occurred (1-indexed). 348 366
+209
lib/html5rw/parse_error_code.ml
(*---------------------------------------------------------------------------
   Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
   SPDX-License-Identifier: MIT
  ---------------------------------------------------------------------------*)

(** Parse error codes as defined by the WHATWG HTML5 specification.

    @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-errors>
*)

type t =
  | Abrupt_closing_of_empty_comment
  | Abrupt_doctype_public_identifier
  | Abrupt_doctype_system_identifier
  | Absence_of_digits_in_numeric_character_reference
  | Cdata_in_html_content
  | Character_reference_outside_unicode_range
  | Control_character_in_input_stream
  | Control_character_reference
  | Duplicate_attribute
  | End_tag_with_attributes
  | End_tag_with_trailing_solidus
  | Eof_before_tag_name
  | Eof_in_cdata
  | Eof_in_comment
  | Eof_in_doctype
  | Eof_in_script_html_comment_like_text
  | Eof_in_tag
  | Incorrectly_closed_comment
  | Incorrectly_opened_comment
  | Invalid_character_sequence_after_doctype_name
  | Invalid_first_character_of_tag_name
  | Missing_attribute_value
  | Missing_doctype_name
  | Missing_doctype_public_identifier
  | Missing_doctype_system_identifier
  | Missing_end_tag_name
  | Missing_quote_before_doctype_public_identifier
  | Missing_quote_before_doctype_system_identifier
  | Missing_semicolon_after_character_reference
  | Missing_whitespace_after_doctype_public_keyword
  | Missing_whitespace_after_doctype_system_keyword
  | Missing_whitespace_before_doctype_name
  | Missing_whitespace_between_attributes
  | Missing_whitespace_between_doctype_public_and_system_identifiers
  | Nested_comment
  | Noncharacter_character_reference
  | Noncharacter_in_input_stream
  | Non_void_html_element_start_tag_with_trailing_solidus
  | Null_character_reference
  | Surrogate_character_reference
  | Surrogate_in_input_stream
  | Unexpected_character_after_doctype_system_identifier
  | Unexpected_character_in_attribute_name
  | Unexpected_character_in_unquoted_attribute_value
  | Unexpected_equals_sign_before_attribute_name
  | Unexpected_null_character
  | Unexpected_question_mark_instead_of_tag_name
  | Unexpected_solidus_in_tag
  | Unknown_named_character_reference
  | Tree_construction_error of string

(* [to_string code] is the lowercase, hyphenated name of [code].

   This match is the single source of truth for the spelling of every
   code: it is exhaustive, so adding a constructor to [t] without naming
   it here is a compile-time warning.  [Tree_construction_error s] has no
   fixed name and yields its payload [s] unchanged. *)
let to_string = function
  | Abrupt_closing_of_empty_comment -> "abrupt-closing-of-empty-comment"
  | Abrupt_doctype_public_identifier -> "abrupt-doctype-public-identifier"
  | Abrupt_doctype_system_identifier -> "abrupt-doctype-system-identifier"
  | Absence_of_digits_in_numeric_character_reference ->
      "absence-of-digits-in-numeric-character-reference"
  | Cdata_in_html_content -> "cdata-in-html-content"
  | Character_reference_outside_unicode_range ->
      "character-reference-outside-unicode-range"
  | Control_character_in_input_stream -> "control-character-in-input-stream"
  | Control_character_reference -> "control-character-reference"
  | Duplicate_attribute -> "duplicate-attribute"
  | End_tag_with_attributes -> "end-tag-with-attributes"
  | End_tag_with_trailing_solidus -> "end-tag-with-trailing-solidus"
  | Eof_before_tag_name -> "eof-before-tag-name"
  | Eof_in_cdata -> "eof-in-cdata"
  | Eof_in_comment -> "eof-in-comment"
  | Eof_in_doctype -> "eof-in-doctype"
  | Eof_in_script_html_comment_like_text ->
      "eof-in-script-html-comment-like-text"
  | Eof_in_tag -> "eof-in-tag"
  | Incorrectly_closed_comment -> "incorrectly-closed-comment"
  | Incorrectly_opened_comment -> "incorrectly-opened-comment"
  | Invalid_character_sequence_after_doctype_name ->
      "invalid-character-sequence-after-doctype-name"
  | Invalid_first_character_of_tag_name ->
      "invalid-first-character-of-tag-name"
  | Missing_attribute_value -> "missing-attribute-value"
  | Missing_doctype_name -> "missing-doctype-name"
  | Missing_doctype_public_identifier -> "missing-doctype-public-identifier"
  | Missing_doctype_system_identifier -> "missing-doctype-system-identifier"
  | Missing_end_tag_name -> "missing-end-tag-name"
  | Missing_quote_before_doctype_public_identifier ->
      "missing-quote-before-doctype-public-identifier"
  | Missing_quote_before_doctype_system_identifier ->
      "missing-quote-before-doctype-system-identifier"
  | Missing_semicolon_after_character_reference ->
      "missing-semicolon-after-character-reference"
  | Missing_whitespace_after_doctype_public_keyword ->
      "missing-whitespace-after-doctype-public-keyword"
  | Missing_whitespace_after_doctype_system_keyword ->
      "missing-whitespace-after-doctype-system-keyword"
  | Missing_whitespace_before_doctype_name ->
      "missing-whitespace-before-doctype-name"
  | Missing_whitespace_between_attributes ->
      "missing-whitespace-between-attributes"
  | Missing_whitespace_between_doctype_public_and_system_identifiers ->
      "missing-whitespace-between-doctype-public-and-system-identifiers"
  | Nested_comment -> "nested-comment"
  | Noncharacter_character_reference -> "noncharacter-character-reference"
  | Noncharacter_in_input_stream -> "noncharacter-in-input-stream"
  | Non_void_html_element_start_tag_with_trailing_solidus ->
      "non-void-html-element-start-tag-with-trailing-solidus"
  | Null_character_reference -> "null-character-reference"
  | Surrogate_character_reference -> "surrogate-character-reference"
  | Surrogate_in_input_stream -> "surrogate-in-input-stream"
  | Unexpected_character_after_doctype_system_identifier ->
      "unexpected-character-after-doctype-system-identifier"
  | Unexpected_character_in_attribute_name ->
      "unexpected-character-in-attribute-name"
  | Unexpected_character_in_unquoted_attribute_value ->
      "unexpected-character-in-unquoted-attribute-value"
  | Unexpected_equals_sign_before_attribute_name ->
      "unexpected-equals-sign-before-attribute-name"
  | Unexpected_null_character -> "unexpected-null-character"
  | Unexpected_question_mark_instead_of_tag_name ->
      "unexpected-question-mark-instead-of-tag-name"
  | Unexpected_solidus_in_tag -> "unexpected-solidus-in-tag"
  | Unknown_named_character_reference -> "unknown-named-character-reference"
  | Tree_construction_error s -> s

module String_map = Map.Make (String)

(* Every constant constructor of [t], i.e. everything except
   [Tree_construction_error].  A constructor missing from this list is
   still printed correctly by [to_string], but [of_string] will return it
   wrapped as [Tree_construction_error] instead of the constructor. *)
let whatwg_codes =
  [
    Abrupt_closing_of_empty_comment;
    Abrupt_doctype_public_identifier;
    Abrupt_doctype_system_identifier;
    Absence_of_digits_in_numeric_character_reference;
    Cdata_in_html_content;
    Character_reference_outside_unicode_range;
    Control_character_in_input_stream;
    Control_character_reference;
    Duplicate_attribute;
    End_tag_with_attributes;
    End_tag_with_trailing_solidus;
    Eof_before_tag_name;
    Eof_in_cdata;
    Eof_in_comment;
    Eof_in_doctype;
    Eof_in_script_html_comment_like_text;
    Eof_in_tag;
    Incorrectly_closed_comment;
    Incorrectly_opened_comment;
    Invalid_character_sequence_after_doctype_name;
    Invalid_first_character_of_tag_name;
    Missing_attribute_value;
    Missing_doctype_name;
    Missing_doctype_public_identifier;
    Missing_doctype_system_identifier;
    Missing_end_tag_name;
    Missing_quote_before_doctype_public_identifier;
    Missing_quote_before_doctype_system_identifier;
    Missing_semicolon_after_character_reference;
    Missing_whitespace_after_doctype_public_keyword;
    Missing_whitespace_after_doctype_system_keyword;
    Missing_whitespace_before_doctype_name;
    Missing_whitespace_between_attributes;
    Missing_whitespace_between_doctype_public_and_system_identifiers;
    Nested_comment;
    Noncharacter_character_reference;
    Noncharacter_in_input_stream;
    Non_void_html_element_start_tag_with_trailing_solidus;
    Null_character_reference;
    Surrogate_character_reference;
    Surrogate_in_input_stream;
    Unexpected_character_after_doctype_system_identifier;
    Unexpected_character_in_attribute_name;
    Unexpected_character_in_unquoted_attribute_value;
    Unexpected_equals_sign_before_attribute_name;
    Unexpected_null_character;
    Unexpected_question_mark_instead_of_tag_name;
    Unexpected_solidus_in_tag;
    Unknown_named_character_reference;
  ]

(* Name -> code index, built once at module initialisation from
   [to_string] so the two directions cannot disagree on a spelling. *)
let code_by_name =
  List.fold_left
    (fun acc code -> String_map.add (to_string code) code acc)
    String_map.empty whatwg_codes

(* [of_string s] is the constant constructor whose [to_string] is [s], or
   [Tree_construction_error s] when [s] names none of them.  Never raises. *)
let of_string s =
  match String_map.find_opt s code_by_name with
  | Some code -> code
  | None -> Tree_construction_error s

(* Total by construction: [of_string] has a fallback for every input, so
   the result is always [Some _]. *)
let of_string_opt s = Some (of_string s)

(* [true] for every constant constructor, [false] for
   [Tree_construction_error _]. *)
let is_whatwg_standard = function
  | Tree_construction_error _ -> false
  | _ -> true
+330
lib/html5rw/parse_error_code.mli
(*---------------------------------------------------------------------------
   Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
   SPDX-License-Identifier: MIT
  ---------------------------------------------------------------------------*)

(** Parse error codes as defined by the WHATWG HTML5 specification.

    The HTML5 parser never fails - it always produces a DOM tree. However,
    the specification defines these error codes for conformance checkers to
    report issues in HTML documents.

    Each error code corresponds to a specific condition in the WHATWG
    specification's parsing algorithm.

    @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-errors>
      WHATWG: Parse errors *)

type t =
  | Abrupt_closing_of_empty_comment
      (** Parser encounters [<!-->] or [<!--->]; comment is treated as
          correctly closed.

          @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-abrupt-closing-of-empty-comment> *)

  | Abrupt_doctype_public_identifier
      (** [>] found in DOCTYPE public identifier before closing quote;
          sets document to quirks mode.

          @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-abrupt-doctype-public-identifier> *)

  | Abrupt_doctype_system_identifier
      (** [>] found in DOCTYPE system identifier before closing quote;
          sets document to quirks mode.

          @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-abrupt-doctype-system-identifier> *)

  | Absence_of_digits_in_numeric_character_reference
      (** Numeric character reference has no digits (e.g., [&#qux;]);
          the reference is not resolved.

          @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-absence-of-digits-in-numeric-character-reference> *)

  | Cdata_in_html_content
      (** CDATA section found outside SVG or MathML foreign content;
          treated as a bogus comment.

          @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-cdata-in-html-content> *)

  | Character_reference_outside_unicode_range
      (** Numeric reference exceeds U+10FFFF; resolves to U+FFFD
          REPLACEMENT CHARACTER.

          @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-character-reference-outside-unicode-range> *)

  | Control_character_in_input_stream
      (** Control code point (other than ASCII whitespace or NULL)
          appears in the input; parsed as-is.

          @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-control-character-in-input-stream> *)

  | Control_character_reference
      (** Numeric reference to a control character; handled per
          specification replacement rules.

          @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-control-character-reference> *)

  | Duplicate_attribute
      (** Tag contains duplicate attribute names; later duplicates
          are removed.

          @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-duplicate-attribute> *)

  | End_tag_with_attributes
      (** End tag includes attributes; attributes are ignored.

          @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-end-tag-with-attributes> *)

  | End_tag_with_trailing_solidus
      (** End tag has [/] before [>] (like [</br/>]); treated as
          regular end tag.

          @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-end-tag-with-trailing-solidus> *)

  | Eof_before_tag_name
      (** End of input where tag name expected; [<] or [</] is
          treated as text.

          @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-eof-before-tag-name> *)

  | Eof_in_cdata
      (** End of input within CDATA section; treated as immediately closed.

          @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-eof-in-cdata> *)

  | Eof_in_comment
      (** End of input within comment; comment is treated as
          immediately closed.

          @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-eof-in-comment> *)

  | Eof_in_doctype
      (** End of input within DOCTYPE; sets document to quirks mode.

          @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-eof-in-doctype> *)

  | Eof_in_script_html_comment_like_text
      (** End of input within HTML-like comment syntax inside a script element.

          @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-eof-in-script-html-comment-like-text> *)

  | Eof_in_tag
      (** End of input within a start or end tag; the tag is ignored.

          @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-eof-in-tag> *)

  | Incorrectly_closed_comment
      (** Comment closed by [--!>] instead of [-->]; treated as
          correctly closed.

          @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-incorrectly-closed-comment> *)

  | Incorrectly_opened_comment
      (** [<!] not followed by [--] (e.g., [<!ELEMENT]); content is
          treated as a bogus comment.

          @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-incorrectly-opened-comment> *)

  | Invalid_character_sequence_after_doctype_name
      (** Neither "PUBLIC" nor "SYSTEM" after DOCTYPE name; sets
          document to quirks mode.

          @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-invalid-character-sequence-after-doctype-name> *)

  | Invalid_first_character_of_tag_name
      (** Non-ASCII-alpha character where tag name start expected;
          [<] is treated as text.

          @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-invalid-first-character-of-tag-name> *)

  | Missing_attribute_value
      (** [>] where attribute value expected (e.g., [<div id=>]);
          attribute gets empty string value.

          @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-missing-attribute-value> *)

  | Missing_doctype_name
      (** DOCTYPE has no name; sets document to quirks mode.

          @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-missing-doctype-name> *)

  | Missing_doctype_public_identifier
      (** [>] where public identifier expected; sets quirks mode.

          @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-missing-doctype-public-identifier> *)

  | Missing_doctype_system_identifier
      (** [>] where system identifier expected; sets quirks mode.

          @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-missing-doctype-system-identifier> *)

  | Missing_end_tag_name
      (** [>] where end tag name expected ([</>]); sequence is ignored.

          @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-missing-end-tag-name> *)

  | Missing_quote_before_doctype_public_identifier
      (** Public identifier lacks preceding quote; sets quirks mode.

          @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-missing-quote-before-doctype-public-identifier> *)

  | Missing_quote_before_doctype_system_identifier
      (** System identifier lacks preceding quote; sets quirks mode.

          @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-missing-quote-before-doctype-system-identifier> *)

  | Missing_semicolon_after_character_reference
      (** Character reference lacks terminating [;]; behaves as if
          semicolon were present.

          @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-missing-semicolon-after-character-reference> *)

  | Missing_whitespace_after_doctype_public_keyword
      (** No whitespace between "PUBLIC" and identifier; treated as
          if whitespace were present.

          @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-missing-whitespace-after-doctype-public-keyword> *)

  | Missing_whitespace_after_doctype_system_keyword
      (** No whitespace between "SYSTEM" and identifier; treated as
          if whitespace were present.

          @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-missing-whitespace-after-doctype-system-keyword> *)

  | Missing_whitespace_before_doctype_name
      (** No whitespace between "DOCTYPE" and name; treated as if
          whitespace were present.

          @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-missing-whitespace-before-doctype-name> *)

  | Missing_whitespace_between_attributes
      (** Adjacent attributes lack separating whitespace; treated as
          if whitespace were present.

          @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-missing-whitespace-between-attributes> *)

  | Missing_whitespace_between_doctype_public_and_system_identifiers
      (** Public and system identifiers not separated by whitespace;
          treated as if whitespace were present.

          @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-missing-whitespace-between-doctype-public-and-system-identifiers> *)

  | Nested_comment
      (** Nested [<!--] detected within comment; comment still closes
          at first [-->].

          @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-nested-comment> *)

  | Noncharacter_character_reference
      (** Numeric reference to a Unicode noncharacter; resolved as-is
          (not replaced).

          @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-noncharacter-character-reference> *)

  | Noncharacter_in_input_stream
      (** Unicode noncharacter code point in input; parsed as-is.

          @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-noncharacter-in-input-stream> *)

  | Non_void_html_element_start_tag_with_trailing_solidus
      (** Non-void element start tag has [/] before [>] (like
          [<div/>]); the [/] is ignored.

          @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-non-void-html-element-start-tag-with-trailing-solidus> *)

  | Null_character_reference
      (** Numeric reference to U+0000 (NULL); resolves to U+FFFD
          REPLACEMENT CHARACTER.

          @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-null-character-reference> *)

  | Surrogate_character_reference
      (** Numeric reference to a surrogate code point (U+D800-U+DFFF);
          resolves to U+FFFD REPLACEMENT CHARACTER.

          @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-surrogate-character-reference> *)

  | Surrogate_in_input_stream
      (** Surrogate code point in input stream; parsed as-is.

          @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-surrogate-in-input-stream> *)

  | Unexpected_character_after_doctype_system_identifier
      (** Non-whitespace/non-[>] character after system identifier;
          the character is ignored.

          @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-unexpected-character-after-doctype-system-identifier> *)

  | Unexpected_character_in_attribute_name
      (** Double quote, single quote, or less-than sign in attribute name;
          included in the attribute name.

          @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-unexpected-character-in-attribute-name> *)

  | Unexpected_character_in_unquoted_attribute_value
      (** Double quote, single quote, less-than sign, equals sign, or
          backtick in unquoted attribute value; included in the value.

          @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-unexpected-character-in-unquoted-attribute-value> *)

  | Unexpected_equals_sign_before_attribute_name
      (** [=] where attribute name expected; treated as first
          character of attribute name.

          @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-unexpected-equals-sign-before-attribute-name> *)

  | Unexpected_null_character
      (** U+0000 (NULL) in various positions; ignored or replaced
          with U+FFFD depending on context.

          @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-unexpected-null-character> *)

  | Unexpected_question_mark_instead_of_tag_name
      (** [?] where tag name expected (like [<?xml]); treated as
          start of bogus comment.

          @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-unexpected-question-mark-instead-of-tag-name> *)

  | Unexpected_solidus_in_tag
      (** [/] in tag not immediately before [>]; treated as
          whitespace.

          @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-unexpected-solidus-in-tag> *)

  | Unknown_named_character_reference
      (** Ambiguous ampersand: [&] followed by characters that don't
          match any named reference; not resolved as reference.

          @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-error-unknown-named-character-reference> *)

  | Tree_construction_error of string
      (** Tree construction error not defined in the WHATWG specification.

          These are informative errors produced during tree construction
          to indicate various issues like unexpected tags, missing closing
          tags, etc. The string contains a descriptive error code. *)

val to_string : t -> string
(** Convert an error code to its WHATWG specification string representation.

    The returned string is lowercase with hyphens, matching the WHATWG
    specification naming convention. For example:
    - [Abrupt_closing_of_empty_comment] becomes ["abrupt-closing-of-empty-comment"]
    - [Eof_in_tag] becomes ["eof-in-tag"]

    [Tree_construction_error s] has no fixed name: its payload [s] is
    returned unchanged, whatever it contains. *)

val of_string : string -> t
(** Parse an error code from its WHATWG specification string representation.

    If the string matches a known WHATWG error code, returns that variant.
    Otherwise, returns [Tree_construction_error s]. This function is total
    and never raises.

    Matching is exact: only the strings produced by {!to_string} for the
    constant constructors are recognised.

    [of_string (to_string c) = c] holds for every constant constructor.
    For [Tree_construction_error s] it holds unless [s] is itself the name
    of a constant constructor, in which case that constructor is returned
    instead. *)

val of_string_opt : string -> t option
(** Parse an error code from its WHATWG specification string representation.

    Always returns [Some code]. For unrecognized strings, returns
    [Some (Tree_construction_error s)]. It never returns [None], so it
    cannot be used to test whether a string is a standard code; apply
    {!is_whatwg_standard} to the result of {!of_string} for that. *)

val is_whatwg_standard : t -> bool
(** Check if an error code is defined in the WHATWG specification.

    Returns [false] for [Tree_construction_error _], [true] for all others. *)
+1
lib/html5rw/parser/parser.ml
··· 8 8 module Dom = Dom 9 9 module Tokenizer = Tokenizer 10 10 module Encoding = Encoding 11 + module Parse_error_code = Parse_error_code 11 12 module Constants = Parser_constants 12 13 module Insertion_mode = Parser_insertion_mode 13 14 module Tree_builder = Parser_tree_builder
+23 -9
lib/html5rw/parser/parser.mli
··· 117 117 (** DOM types and manipulation. *) 118 118 module Dom = Dom 119 119 120 + (** Parse error code types. 121 + 122 + This module provides the {!Parse_error_code.t} variant type that represents 123 + all WHATWG-defined parse errors plus tree construction errors. 124 + 125 + @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-errors> 126 + WHATWG: Parse errors *) 127 + module Parse_error_code = Parse_error_code 128 + 120 129 (** HTML5 tokenizer. 121 130 122 131 The tokenizer implements the first stage of HTML5 parsing, converting ··· 242 251 WHATWG: Complete list of parse errors *) 243 252 type parse_error 244 253 245 - (** Get the error code string. 254 + (** Get the error code. 255 + 256 + Returns the {!Parse_error_code.t} variant representing this error. 257 + This allows pattern matching on specific error types: 258 + 259 + {[ 260 + match Parser.error_code err with 261 + | Parse_error_code.Unexpected_null_character -> (* handle *) 262 + | Parse_error_code.Eof_in_tag -> (* handle *) 263 + | Parse_error_code.Tree_construction_error msg -> (* handle tree error *) 264 + | _ -> (* other *) 265 + ]} 246 266 247 - Error codes are lowercase with hyphens, exactly matching the WHATWG 248 - specification naming. Examples: 249 - - ["unexpected-null-character"] 250 - - ["eof-before-tag-name"] 251 - - ["missing-end-tag-name"] 252 - - ["duplicate-attribute"] 253 - - ["missing-doctype"] 267 + Use {!Parse_error_code.to_string} to convert to a string representation. 254 268 255 269 @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-errors> 256 270 WHATWG: Parse error codes *) 257 - val error_code : parse_error -> string 271 + val error_code : parse_error -> Parse_error_code.t 258 272 259 273 (** Get the line number where the error occurred. 260 274
+2 -2
lib/html5rw/parser/parser_tree_builder.ml
··· 18 18 } 19 19 20 20 type parse_error = { 21 - code : string; 21 + code : Parse_error_code.t; 22 22 line : int; 23 23 column : int; 24 24 } ··· 113 113 (* Error handling *) 114 114 let parse_error t code = 115 115 if t.collect_errors then 116 - t.errors <- { code; line = 0; column = 0 } :: t.errors 116 + t.errors <- { code = Parse_error_code.of_string code; line = 0; column = 0 } :: t.errors 117 117 118 118 (* Stack helpers *) 119 119 let current_node t =
+7 -1
lib/html5rw/tokenizer/tokenizer.mli
··· 140 140 (** Parse error types. *) 141 141 module Errors : sig 142 142 type t = Tokenizer_errors.t = { 143 - code : string; 143 + code : Parse_error_code.t; 144 144 line : int; 145 145 column : int; 146 146 } 147 147 148 148 val make : code:string -> line:int -> column:int -> t 149 + (** Create an error from a string code. The string is converted to 150 + {!Parse_error_code.t} using {!Parse_error_code.of_string}. *) 151 + 152 + val make_with_code : code:Parse_error_code.t -> line:int -> column:int -> t 153 + (** Create an error with a typed error code. *) 154 + 149 155 val to_string : t -> string 150 156 end 151 157
+7 -3
lib/html5rw/tokenizer/tokenizer_errors.ml
··· 1 1 (* HTML5 parse error types *) 2 2 3 3 type t = { 4 - code : string; 4 + code : Parse_error_code.t; 5 5 line : int; 6 6 column : int; 7 7 } 8 8 9 - let make ~code ~line ~column = { code; line; column } 9 + let make ~code ~line ~column = 10 + { code = Parse_error_code.of_string code; line; column } 11 + 12 + let make_with_code ~code ~line ~column = { code; line; column } 10 13 11 14 let to_string err = 12 - Printf.sprintf "(%d,%d): %s" err.line err.column err.code 15 + Printf.sprintf "(%d,%d): %s" err.line err.column 16 + (Parse_error_code.to_string err.code)