···147147 is_contact_details := true
148148 | _ -> ());
149149150150+ (* Check if any token in the list is shipping/billing *)
151151+ let find_shipping_billing tokens =
152152+ List.find_opt (fun t -> t = "shipping" || t = "billing") tokens
153153+ in
154154+155155+ (* Check if any token in the list is a contact type *)
156156+ let find_contact_type tokens =
157157+ List.find_opt (fun t -> List.mem t contact_types) tokens
158158+ in
159159+160160+ (* Check if any token in the list is a section-* indicator *)
161161+ let find_section tokens =
162162+ List.find_opt (fun t -> starts_with t "section-") tokens
163163+ in
164164+150165 (* Process remaining tokens *)
151151- let process_field_tokens = function
152152- | [] -> Error "A list of autofill details tokens must contain an autofill field name"
166166+ let process_field_tokens tokens =
167167+ match tokens with
168168+ | [] -> Error "A list of autofill details tokens must contain an autofill field name."
153169 | [ "webauthn" ] ->
154170 Error
155171 "The token \"webauthn\" must not be the only token in a list of \
156156- autofill detail tokens"
172172+ autofill detail tokens."
157173 | [ field_name ] ->
158174 if not (List.mem field_name all_field_names) then
159175 Error
160176 (Printf.sprintf
161161- "The string \"%s\" is not a valid autofill field name"
177177+ "The string \"%s\" is not a valid autofill field name."
162178 field_name)
163179 else if !is_contact_details && not (List.mem field_name contact_field_names)
164180 then
165181 Error
166182 (Printf.sprintf
167183 "The autofill field name \"%s\" is not allowed in contact \
168168- context"
184184+ context."
169185 field_name)
170186 else Ok ()
171187 | [ field_name; "webauthn" ] ->
172188 if not (List.mem field_name all_field_names) then
173189 Error
174190 (Printf.sprintf
175175- "The string \"%s\" is not a valid autofill field name"
191191+ "The string \"%s\" is not a valid autofill field name."
176192 field_name)
177193 else if !is_contact_details && not (List.mem field_name contact_field_names)
178194 then
179195 Error
180196 (Printf.sprintf
181197 "The autofill field name \"%s\" is not allowed in contact \
182182- context"
198198+ context."
183199 field_name)
184200 else Ok ()
185201 | token :: _ when List.mem token contact_types ->
186202 Error
187203 (Printf.sprintf
188188- "The token \"%s\" must only appear before any autofill field names"
204204+ "The token \"%s\" must only appear before any autofill field names."
189205 token)
190206 | token :: _ when starts_with token "section-" ->
191207 Error
192208 "A \"section-*\" indicator must only appear as the first token in a \
193193- list of autofill detail tokens"
209209+ list of autofill detail tokens."
194210 | "shipping" :: _ | "billing" :: _ as toks ->
195211 Error
196212 (Printf.sprintf
197213 "The token \"%s\" must only appear as either the first token in a \
198214 list of autofill detail tokens, or, if the first token is a \
199199- \"section-*\" indicator, as the second token"
215215+ \"section-*\" indicator, as the second token."
200216 (List.hd toks))
201217 | _ :: "webauthn" :: _ :: _ ->
202218 Error
203219 "The token \"webauthn\" must only appear as the very last token in a \
204204- list of autofill detail tokens"
205205- | _ :: _ :: _ ->
206206- Error
207207- "A list of autofill details tokens must not contain more than one \
208208- autofill field name"
220220+ list of autofill detail tokens."
221221+ | _ :: rest ->
222222+ (* Check if any remaining token is a section-* indicator - position error takes precedence *)
223223+ (match find_section rest with
224224+ | Some _ ->
225225+ Error
226226+ "A \"section-*\" indicator must only appear as the first token in a \
227227+ list of autofill detail tokens."
228228+ | None ->
229229+ (* Check if any remaining token is a contact type - position error takes precedence *)
230230+ match find_contact_type rest with
231231+ | Some ct_token ->
232232+ Error
233233+ (Printf.sprintf
234234+ "The token \"%s\" must only appear before any autofill field names."
235235+ ct_token)
236236+ | None ->
237237+ (* Check if any remaining token is shipping/billing - position error takes precedence *)
238238+ match find_shipping_billing rest with
239239+ | Some sb_token ->
240240+ Error
241241+ (Printf.sprintf
242242+ "The token \"%s\" must only appear as either the first token in a \
243243+ list of autofill detail tokens, or, if the first token is a \
244244+ \"section-*\" indicator, as the second token."
245245+ sb_token)
246246+ | None ->
247247+ Error
248248+ "A list of autofill details tokens must not contain more than one \
249249+ autofill field name.")
209250 in
210251 process_field_tokens !tokens
211252212253(** Validate autocomplete value *)
213254let validate_autocomplete s =
214255 let trimmed = trim_whitespace s in
215215- if String.length trimmed = 0 then Error "Must not be empty"
256256+ if String.length trimmed = 0 then Error "Must not be empty."
216257 else if trimmed = "on" || trimmed = "off" then Ok ()
217258 else
218259 let tokens = split_on_whitespace trimmed in
+60-4
lib/html5_checker/parse_error_bridge.ml
···1111 Message.make_location ~line ~column ?system_id ()
1212 in
1313 let code_str = Html5rw.Parse_error_code.to_string code in
1414- let message = match code with
1414+ let (message, final_code) = match code with
1515 | Html5rw.Parse_error_code.Non_void_html_element_start_tag_with_trailing_solidus ->
1616- "Self-closing syntax (\"/>\") used on a non-void HTML element. Ignoring the slash and treating as a start tag."
1717- | _ -> Printf.sprintf "Parse error: %s" code_str
1616+ ("Self-closing syntax (\"/>\") used on a non-void HTML element. Ignoring the slash and treating as a start tag.", code_str)
1717+ | Html5rw.Parse_error_code.Tree_construction_error s ->
1818+ (* Check for control-character/noncharacter/surrogate with codepoint info *)
1919+ (try
2020+ if String.length s > 28 && String.sub s 0 28 = "control-character-in-input-s" then
2121+ let colon_pos = String.index s ':' in
2222+ let cp_str = String.sub s (colon_pos + 1) (String.length s - colon_pos - 1) in
2323+ let cp = int_of_string ("0x" ^ cp_str) in
2424+ (Printf.sprintf "Forbidden code point U+%04x." cp, "forbidden-codepoint")
2525+ else if String.length s > 25 && String.sub s 0 25 = "noncharacter-in-input-str" then
2626+ let colon_pos = String.index s ':' in
2727+ let cp_str = String.sub s (colon_pos + 1) (String.length s - colon_pos - 1) in
2828+ let cp = int_of_string ("0x" ^ cp_str) in
2929+ (Printf.sprintf "Forbidden code point U+%04x." cp, "forbidden-codepoint")
3030+ else if String.length s > 22 && String.sub s 0 22 = "surrogate-in-input-str" then
3131+ let colon_pos = String.index s ':' in
3232+ let cp_str = String.sub s (colon_pos + 1) (String.length s - colon_pos - 1) in
3333+ let cp = int_of_string ("0x" ^ cp_str) in
3434+ (Printf.sprintf "Forbidden code point U+%04x." cp, "forbidden-codepoint")
3535+ (* Character reference errors *)
3636+ else if String.length s > 28 && String.sub s 0 28 = "control-character-reference:" then
3737+ let cp_str = String.sub s 28 (String.length s - 28) in
3838+ let cp = int_of_string ("0x" ^ cp_str) in
3939+ if cp = 0x0D then
4040+ ("A numeric character reference expanded to carriage return.", "control-character-reference")
4141+ else
4242+ (Printf.sprintf "Character reference expands to a control character (U+%04x)." cp, "control-character-reference")
4343+ else if String.length s > 31 && String.sub s 0 31 = "noncharacter-character-referenc" then
4444+ let colon_pos = String.index s ':' in
4545+ let cp_str = String.sub s (colon_pos + 1) (String.length s - colon_pos - 1) in
4646+ let cp = int_of_string ("0x" ^ cp_str) in
4747+ (* U+FDD0-U+FDEF are "permanently unassigned" *)
4848+ if cp >= 0xFDD0 && cp <= 0xFDEF then
4949+ ("Character reference expands to a permanently unassigned code point.", "noncharacter-character-reference")
5050+ (* Astral noncharacters (planes 1-16) *)
5151+ else if cp >= 0x10000 then
5252+ (Printf.sprintf "Character reference expands to an astral non-character (U+%05x)." cp, "noncharacter-character-reference")
5353+ else
5454+ (Printf.sprintf "Character reference expands to a non-character (U+%04x)." cp, "noncharacter-character-reference")
5555+ else if String.length s > 36 && String.sub s 0 36 = "character-reference-outside-unicode-" then
5656+ let colon_pos = String.index s ':' in
5757+ let _ = String.sub s (colon_pos + 1) (String.length s - colon_pos - 1) in
5858+ ("Character reference outside the permissible Unicode range.", "character-reference-outside-unicode-range")
5959+ else if String.length s > 27 && String.sub s 0 27 = "surrogate-character-referen" then
6060+ let colon_pos = String.index s ':' in
6161+ let cp_str = String.sub s (colon_pos + 1) (String.length s - colon_pos - 1) in
6262+ let cp = int_of_string ("0x" ^ cp_str) in
6363+ (Printf.sprintf "Character reference expands to a surrogate (U+%04x)." cp, "surrogate-character-reference")
6464+ else if s = "no-p-element-in-scope" then
6565+ ("No \xe2\x80\x9cp\xe2\x80\x9d element in scope but a \xe2\x80\x9cp\xe2\x80\x9d end tag seen.", "no-p-element-in-scope")
6666+ else if s = "end-tag-p-implied-but-open-elements" then
6767+ ("End tag \xe2\x80\x9cp\xe2\x80\x9d implied, but there were open elements.", "end-tag-p-implied")
6868+ else if s = "end-tag-br" then
6969+ ("End tag \xe2\x80\x9cbr\xe2\x80\x9d.", "end-tag-br")
7070+ else
7171+ (Printf.sprintf "Parse error: %s" s, s)
7272+ with _ -> (Printf.sprintf "Parse error: %s" s, s))
7373+ | _ -> (Printf.sprintf "Parse error: %s" code_str, code_str)
1874 in
1975 Message.error
2076 ~message
2121- ~code:code_str
7777+ ~code:final_code
2278 ~location
2379 ()
2480
+3-1
lib/html5_checker/semantic/form_checker.ml
···3232 match Dt_autocomplete.validate_autocomplete value with
3333 | Ok () -> ()
3434 | Error msg ->
3535+ (* Nu validator prefixes autocomplete errors with "Bad autocomplete detail tokens (any): " for select/textarea, but not for input *)
3636+ let reason = if element_name = "input" then msg else "Bad autocomplete detail tokens (any): " ^ msg in
3537 Message_collector.add_typed collector
3638 (Error_code.Bad_attr_value {
3739 element = element_name;
3840 attr = "autocomplete";
3941 value;
4040- reason = msg
4242+ reason
4143 })
4244 end
4345
+23-8
lib/html5_checker/semantic/id_checker.ml
···193193 so we pass None. In a full implementation, this would be passed
194194 from the parser. *)
195195 let location = None in
196196- process_attrs state ~element:name ~attrs ~location collector
196196+ process_attrs state ~element:name ~attrs ~location collector;
197197+198198+ (* Special check: map element must have matching id and name if both present *)
199199+ if name = "map" then begin
200200+ let id_opt = List.find_map (fun (n, v) -> if n = "id" then Some v else None) attrs in
201201+ let name_opt = List.find_map (fun (n, v) -> if n = "name" then Some v else None) attrs in
202202+ match id_opt, name_opt with
203203+ | Some id_val, Some name_val when id_val <> name_val ->
204204+ Message_collector.add_typed collector Error_code.Map_id_name_mismatch
205205+ | _ -> ()
206206+ end
197207198208let end_element _state ~name:_ ~namespace:_ _collector =
199209 ()
···204214let end_document state collector =
205215 (* Check all ID references point to existing IDs *)
206216 List.iter (fun ref ->
207207- if not (Hashtbl.mem state.ids ref.referenced_id) then
208208- (* Use generic for dangling references - format may vary *)
209209- Message_collector.add_typed collector
210210- (Error_code.Generic {
211211- message = Printf.sprintf "The %s attribute on the %s element refers to ID %s which does not exist in the document."
212212- (Error_code.q ref.attribute) (Error_code.q ref.referring_element) (Error_code.q ref.referenced_id)
213213- })
217217+ if not (Hashtbl.mem state.ids ref.referenced_id) then begin
218218+ (* Use specific error for list attribute on input *)
219219+ if ref.attribute = "list" && ref.referring_element = "input" then
220220+ Message_collector.add_typed collector Error_code.List_attr_requires_datalist
221221+ else
222222+ (* Use generic for dangling references - format may vary *)
223223+ Message_collector.add_typed collector
224224+ (Error_code.Generic {
225225+ message = Printf.sprintf "The %s attribute on the %s element refers to ID %s which does not exist in the document."
226226+ (Error_code.q ref.attribute) (Error_code.q ref.referring_element) (Error_code.q ref.referenced_id)
227227+ })
228228+ end
214229 ) state.references;
215230216231 (* Check all usemap references point to existing map names *)
···250250 Printf.sprintf "Bad value \xe2\x80\x9c\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: The empty string is not a valid non-negative integer."
251251 attr_name name
252252 else if String.contains attr_value '%' then
253253- Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Expected a digit but saw \xe2\x80\x9c%%\xe2\x80\x9d instead."
253253+ Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad non-negative integer: Expected a digit but saw \xe2\x80\x9c%%\xe2\x80\x9d instead."
254254 attr_value attr_name name
255255 else if String.length attr_value > 0 && attr_value.[0] = '-' then
256256- Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: The value must be non-negative."
256256+ Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad non-negative integer: Expected a digit but saw \xe2\x80\x9c-\xe2\x80\x9d instead."
257257 attr_value attr_name name
258258 else
259259 (* Find first non-digit character *)
···268268 in
269269 match bad_char with
270270 | Some c ->
271271- Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Expected a digit but saw \xe2\x80\x9c%c\xe2\x80\x9d instead."
271271+ Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad non-negative integer: Expected a digit but saw \xe2\x80\x9c%c\xe2\x80\x9d instead."
272272 attr_value attr_name name c
273273 | None ->
274274- Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Expected a digit."
274274+ Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad non-negative integer: Expected a digit."
275275 attr_value attr_name name
276276 in
277277 Message_collector.add_error collector
···455455 List.iter (fun key ->
456456 if count_codepoints key > 1 then
457457 Message_collector.add_error collector
458458- ~message:(Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: The space-separated list of key labels contains a value \xe2\x80\x9c%s\xe2\x80\x9d that consists of more than a single code point."
459459- attr_value attr_name name key)
458458+ ~message:(Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad key label list: Key label has multiple characters. Each key label must be a single character."
459459+ attr_value attr_name name)
460460 ~code:"bad-attribute-value"
461461 ~element:name ~attribute:attr_name ()
462462 ) keys;
···466466 | k :: rest ->
467467 if List.mem k seen then
468468 Message_collector.add_error collector
469469- ~message:(Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Duplicate key label."
469469+ ~message:(Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad key label list: Duplicate key label. Each key label must be unique."
470470 attr_value attr_name name)
471471 ~code:"bad-attribute-value"
472472 ~element:name ~attribute:attr_name ()
+9-13
lib/html5_checker/specialized/datetime_checker.ml
···241241 minute <> 0 && minute <> 30 && minute <> 45
242242 in
243243 if unusual_range then
244244- TzWarning "unusual timezone offset"
244244+ TzWarning "Hours in time zone designator should be from \"-12:00\" to \"+14:00\""
245245 else if unusual_minutes then
246246- TzWarning "unusual timezone offset minutes"
246246+ TzWarning "Minutes in time zone designator should be either \"00\", \"30\", or \"45\"."
247247 else
248248 TzOk
249249 end
···350350 match validate_datetime_with_timezone value with
351351 | DtOk -> Ok (* Valid datetime with timezone *)
352352 | DtWarning w ->
353353- (* Valid but with warning *)
354354- Warning (Printf.sprintf "Possibly mistyped value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: %s."
353353+ (* Valid but with warning - format matches Nu validator *)
354354+ Warning (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad datetime with timezone: %s Bad date: The literal did not satisfy the date format."
355355 value attr_name element_name w)
356356 | DtError tz_error ->
357357 (* Try just date - valid for all elements *)
···359359 | (true, _) ->
360360 (* Date is valid, but check for suspicious year (5+ digits or old year) *)
361361 if has_suspicious_year value || has_old_year value then begin
362362- let date_msg = "Year may be mistyped." in
362362+ let date_msg = "Bad date: Year may be mistyped." in
363363 let tz_msg = Printf.sprintf "Bad datetime with timezone: %s." tz_error in
364364- Warning (Printf.sprintf "Possibly mistyped value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: %s %s"
364364+ Warning (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: %s %s"
365365 value attr_name element_name date_msg tz_msg)
366366 end else
367367 Ok (* Valid date with normal year *)
···389389 match validate_duration value with
390390 | (true, _) -> Ok (* Valid duration P... *)
391391 | (false, _) ->
392392- let tz_msg = Printf.sprintf "Bad datetime with timezone: %s." tz_error in
393393- let date_msg = match date_error with
394394- | Some e -> Printf.sprintf "Bad date: %s." e
395395- | None -> "Bad date: The literal did not satisfy the date format."
396396- in
397397- Error (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: %s %s"
398398- value attr_name element_name tz_msg date_msg)
392392+ (* Use simplified message for time element matching Nu validator format *)
393393+ Error (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad time-datetime: The literal did not satisfy the time-datetime format."
394394+ value attr_name element_name)
399395 end
400396 else begin
401397 (* del/ins only allow date or datetime-with-timezone *)
···175175 | SlashKeyWithoutSlashValue of string (* property name where slash key doesn't have slash value *)
176176 | InvalidScopeKey (* scope key is not a valid URL *)
177177 | InvalidScopeValue of string (* scope value is not a valid URL *)
178178+ | ScopeValueNotObject (* a value inside scopes is not a JSON object *)
178179179180(** Check if a string looks like a valid URL-like specifier for importmaps *)
180181let is_valid_url_like s =
···255256 | JNull -> ()
256257 | _ -> add_error (NotString ("scopes[" ^ skey ^ "][" ^ sikey ^ "]"))
257258 ) scope_imports
258258- | _ -> add_error (NotObject ("scopes[" ^ skey ^ "]"))
259259+ | _ -> add_error ScopeValueNotObject
259260 ) scope_members
260261 | _ -> add_error (NotObject "scopes")
261262 end
···290291 Printf.sprintf "The value of the \xe2\x80\x9c%s\xe2\x80\x9d property within the content of a \xe2\x80\x9cscript\xe2\x80\x9d element with a \xe2\x80\x9ctype\xe2\x80\x9d attribute whose value is \xe2\x80\x9cimportmap\xe2\x80\x9d must be a JSON object." prop
291292 | NotString _ ->
292293 "A specifier map defined in a \xe2\x80\x9cimports\xe2\x80\x9d property within the content of a \xe2\x80\x9cscript\xe2\x80\x9d element with a \xe2\x80\x9ctype\xe2\x80\x9d attribute whose value is \xe2\x80\x9cimportmap\xe2\x80\x9d must only contain string values."
293293- | ForbiddenProperty prop ->
294294- Printf.sprintf "The \xe2\x80\x9c%s\xe2\x80\x9d property within the content of a \xe2\x80\x9cscript\xe2\x80\x9d element with a \xe2\x80\x9ctype\xe2\x80\x9d attribute whose value is \xe2\x80\x9cimportmap\xe2\x80\x9d is not an allowed property." prop
294294+ | ForbiddenProperty _ ->
295295+ "A \xe2\x80\x9cscript\xe2\x80\x9d element with a \xe2\x80\x9ctype\xe2\x80\x9d attribute whose value is \xe2\x80\x9cimportmap\xe2\x80\x9d must contain a JSON object with no properties other than \xe2\x80\x9cimports\xe2\x80\x9d, \xe2\x80\x9cscopes\xe2\x80\x9d, and \xe2\x80\x9cintegrity\xe2\x80\x9d."
295296 | SlashKeyWithoutSlashValue prop ->
296297 Printf.sprintf "A specifier map defined in a \xe2\x80\x9c%s\xe2\x80\x9d property within the content of a \xe2\x80\x9cscript\xe2\x80\x9d element with a \xe2\x80\x9ctype\xe2\x80\x9d attribute whose value is \xe2\x80\x9cimportmap\xe2\x80\x9d must have values that end with \xe2\x80\x9c/\xe2\x80\x9d when its corresponding key ends with \xe2\x80\x9c/\xe2\x80\x9d." prop
297298 | InvalidScopeKey ->
298299 "The value of the \xe2\x80\x9cscopes\xe2\x80\x9d property within the content of a \xe2\x80\x9cscript\xe2\x80\x9d element with a \xe2\x80\x9ctype\xe2\x80\x9d attribute whose value is \xe2\x80\x9cimportmap\xe2\x80\x9d must be a JSON object whose keys are valid URL strings."
299300 | InvalidScopeValue _ ->
300301 "A specifier map defined in a \xe2\x80\x9cscopes\xe2\x80\x9d property within the content of a \xe2\x80\x9cscript\xe2\x80\x9d element with a \xe2\x80\x9ctype\xe2\x80\x9d attribute whose value is \xe2\x80\x9cimportmap\xe2\x80\x9d must only contain valid URL values."
302302+ | ScopeValueNotObject ->
303303+ "The value of the \xe2\x80\x9cscopes\xe2\x80\x9d property within the content of a \xe2\x80\x9cscript\xe2\x80\x9d element with a \xe2\x80\x9ctype\xe2\x80\x9d attribute whose value is \xe2\x80\x9cimportmap\xe2\x80\x9d must be a JSON object whose values are also JSON objects."
301304302305let end_element state ~name ~namespace collector =
303306 if namespace <> None then ()
+3-3
lib/html5_checker/specialized/language_checker.ml
···5757 | Some (deprecated, replacement) ->
5858 Message_collector.add_warning collector
5959 ~message:(Printf.sprintf
6060- "The language tag \xe2\x80\x9c%s\xe2\x80\x9d is deprecated. Use \xe2\x80\x9c%s\xe2\x80\x9d instead."
6161- deprecated replacement)
6060+ "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad language tag: The language subtag \xe2\x80\x9c%s\xe2\x80\x9d is deprecated. Use \xe2\x80\x9c%s\xe2\x80\x9d instead."
6161+ value attribute element deprecated replacement)
6262 ~code:"deprecated-lang"
6363 ?location
6464 ~element
6565- ~attribute:"lang"
6565+ ~attribute
6666 ()
6767 | None -> ()
6868
+68-27
lib/html5_checker/specialized/url_checker.ml
···239239 let _ = contains_invalid_unicode decoded in
240240 None
241241 with Exit ->
242242- Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Invalid host."
242242+ Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Invalid host: A label or domain name contains disallowed characters.."
243243 url attr_name element_name)
244244245245(** Check if string contains a character (checking both ASCII and UTF-8 fullwidth variants). *)
···349349 end else
350350 None
351351352352-(** Check for data: URI with fragment - this is a warning (RFC 2397 forbids fragments). *)
353353-let check_data_uri_fragment url attr_name element_name =
352352+(** Check for data: URI with fragment - this is a warning (RFC 2397 forbids fragments).
353353+ The is_absolute_url parameter controls whether to use "Bad URL:" or "Bad absolute URL:" in the message. *)
354354+let check_data_uri_fragment ?(is_absolute_url=false) url attr_name element_name =
354355 match extract_scheme url with
355356 | None -> None
356357 | Some scheme ->
357358 if scheme = "data" && String.contains url '#' then
358358- Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Fragment is not allowed for data: URIs according to RFC 2397."
359359- url attr_name element_name)
359359+ let url_type = if is_absolute_url then "Bad absolute URL:" else "Bad URL:" in
360360+ Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: %s Fragment is not allowed for data: URIs according to RFC 2397."
361361+ url attr_name element_name url_type)
360362 else
361363 None
362364···373375 let after_colon = String.sub url (colon_pos + 1) (String.length url - colon_pos - 1) in
374376 (* data: URLs should NOT start with / - format is data:[mediatype][;base64],data *)
375377 if String.length after_colon > 0 && after_colon.[0] = '/' then
376376- Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Invalid %s: URL."
377377- url attr_name element_name scheme)
378378+ Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Expected a token character or a semicolon but saw \xe2\x80\x9c/\xe2\x80\x9d instead."
379379+ url attr_name element_name)
378380 else
379381 None
380382 end else
···389391 (* Get scheme data (after the colon) *)
390392 let colon_pos = String.index url ':' in
391393 let scheme_data = String.sub url (colon_pos + 1) (String.length url - colon_pos - 1) in
394394+ (* Check for tab in scheme data *)
395395+ if String.contains scheme_data '\t' then
396396+ Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in scheme data: tab is not allowed."
397397+ url attr_name element_name)
398398+ (* Check for newline in scheme data *)
399399+ else if String.contains scheme_data '\n' then
400400+ Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in scheme data: line break is not allowed."
401401+ url attr_name element_name)
402402+ (* Check for carriage return in scheme data *)
403403+ else if String.contains scheme_data '\r' then
404404+ Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in scheme data: line break is not allowed."
405405+ url attr_name element_name)
392406 (* Check for space in scheme data *)
393393- if String.contains scheme_data ' ' then
407407+ else if String.contains scheme_data ' ' then
394408 Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in scheme data: space is not allowed."
395409 url attr_name element_name)
396410 else
···508522 try
509523 let fragment_start = String.index url '#' in
510524 let fragment = String.sub url (fragment_start + 1) (String.length url - fragment_start - 1) in
525525+ (* Check for backslash in fragment *)
526526+ if String.contains fragment '\\' then
527527+ Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in fragment: \xe2\x80\x9c\\\xe2\x80\x9d is not allowed."
528528+ url attr_name element_name)
511529 (* Check for second hash in fragment *)
512512- if String.contains fragment '#' then
530530+ else if String.contains fragment '#' then
513531 Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in fragment: \xe2\x80\x9c#\xe2\x80\x9d is not allowed."
514532 url attr_name element_name)
515533 (* Check for space in fragment *)
···560578 else if String.contains userinfo ' ' then
561579 Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in user or password: space is not allowed."
562580 url attr_name element_name)
563563- else
564564- (* Check for non-ASCII characters (like emoji) *)
565565- let has_non_ascii = String.exists (fun c -> Char.code c > 127) userinfo in
566566- if has_non_ascii then
567567- Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in user or password."
568568- url attr_name element_name)
569569- else
581581+ else begin
582582+ (* Check for non-ASCII characters (like emoji) using UTF-8 decoding *)
583583+ let find_non_ascii_char userinfo =
584584+ let decoder = Uutf.decoder ~encoding:`UTF_8 (`String userinfo) in
585585+ let rec find () =
586586+ match Uutf.decode decoder with
587587+ | `End | `Await -> None
588588+ | `Malformed _ -> find ()
589589+ | `Uchar uchar ->
590590+ let code = Uchar.to_int uchar in
591591+ (* Check if character is not allowed in userinfo *)
592592+ (* Per URL Standard: only ASCII letters, digits, and certain symbols allowed *)
593593+ if code > 127 then begin
594594+ let buf = Buffer.create 8 in
595595+ Buffer.add_utf_8_uchar buf uchar;
596596+ Some (Buffer.contents buf)
597597+ end else find ()
598598+ in
599599+ find ()
600600+ in
601601+ match find_non_ascii_char userinfo with
602602+ | Some bad_char ->
603603+ Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in user or password: \xe2\x80\x9c%s\xe2\x80\x9d is not allowed."
604604+ url attr_name element_name bad_char)
605605+ | None ->
570606 (* Check for other invalid chars *)
571607 let invalid = List.find_opt (fun c -> String.contains userinfo c) invalid_userinfo_chars in
572608 match invalid with
···574610 Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in user or password: \xe2\x80\x9c%c\xe2\x80\x9d is not allowed."
575611 url attr_name element_name c)
576612 | None -> None
613613+ end
577614 with _ -> None
578615579616(** Attributes where empty URL is an error.
···613650 Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character: leading/trailing ASCII whitespace."
614651 original_url attr_name element_name)
615652 else None
616616- (* Check for newlines/tabs *)
617617- else if String.contains url '\n' || String.contains url '\r' || String.contains url '\t' then
618618- Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Tab, new line or carriage return found."
619619- url attr_name element_name)
653653+ (* Check scheme data for non-special schemes FIRST - handles tab/newline/CR in scheme data *)
620654 else begin
655655+ match check_scheme_data url attr_name element_name with
656656+ | Some err -> Some err
657657+ | None ->
658658+ (* Check for newlines/tabs in special scheme URLs *)
659659+ if String.contains url '\n' || String.contains url '\r' || String.contains url '\t' then
660660+ Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Tab, new line or carriage return found."
661661+ url attr_name element_name)
662662+ else begin
621663 (* Check for relative URL issues first *)
622664 match check_relative_url url attr_name element_name with
623665 | Some err -> Some err
···659701 url attr_name element_name)
660702 else
661703662662- (* Check scheme data for non-special schemes *)
663663- match check_scheme_data url attr_name element_name with
664664- | Some err -> Some err
665665- | None ->
666666-667704 (* Check path segment for illegal characters *)
668705 match check_path_segment url attr_name element_name with
669706 | Some err -> Some err
···688725 match host_opt with
689726 | Some host -> validate_host host url attr_name element_name scheme_str
690727 | None -> None
728728+ end
691729 end
692730 end
693731···761799 ()
762800 | Some _ ->
763801 (* Check for data: URI with fragment - emit warning *)
764764- (match check_data_uri_fragment url "value" name with
802802+ (* input[type=url] uses "Bad absolute URL:" format *)
803803+ (match check_data_uri_fragment ~is_absolute_url:true url "value" name with
765804 | Some warn_msg ->
766805 Message_collector.add_warning collector
767806 ~message:warn_msg
···786825 end
787826 end;
788827 (* Check microdata itemtype and itemid attributes for data: URI fragments *)
828828+ (* Microdata uses "Bad absolute URL:" format *)
789829 let itemtype_opt = get_attr_value "itemtype" attrs in
790830 (match itemtype_opt with
791831 | Some url when String.trim url <> "" ->
792792- (match check_data_uri_fragment url "itemtype" name with
832832+ (match check_data_uri_fragment ~is_absolute_url:true url "itemtype" name with
793833 | Some warn_msg ->
794834 Message_collector.add_warning collector
795835 ~message:warn_msg
···799839 ()
800840 | None -> ())
801841 | _ -> ());
842842+ (* itemid uses "Bad URL:" format (not "Bad absolute URL:") *)
802843 let itemid_opt = get_attr_value "itemid" attrs in
803844 (match itemid_opt with
804845 | Some url when String.trim url <> "" ->
+3-3
lib/html5rw/parser/parser_tree_builder.ml
···664664let close_p_element t =
665665 generate_implied_end_tags t ~except:"p" ();
666666 (match current_node t with
667667- | Some n when n.Dom.name <> "p" -> parse_error t "expected-p"
667667+ | Some n when n.Dom.name <> "p" -> parse_error t "end-tag-p-implied-but-open-elements"
668668 | _ -> ());
669669 pop_until_tag t "p"
670670···12151215 end
12161216 | Token.Tag { kind = Token.End; name = "p"; _ } ->
12171217 if not (has_element_in_button_scope t "p") then begin
12181218- parse_error t "unexpected-end-tag";
12181218+ parse_error t "no-p-element-in-scope";
12191219 ignore (insert_element t "p" ~push:true [])
12201220 end;
12211221 close_p_element t
···13211321 t.frameset_ok <- false;
13221322 t.mode <- Parser_insertion_mode.In_table
13231323 | Token.Tag { kind = Token.End; name = "br"; _ } ->
13241324- parse_error t "unexpected-end-tag";
13241324+ parse_error t "end-tag-br";
13251325 reconstruct_active_formatting t;
13261326 ignore (insert_element t "br" ~push:true []);
13271327 pop_current t;
+5-5
lib/html5rw/tokenizer/tokenizer_impl.ml
···264264 code = 0x0B ||
265265 (code >= 0x0E && code <= 0x1F) ||
266266 code = 0x7F then
267267- error t "control-character-in-input-stream"
267267+ error t (Printf.sprintf "control-character-in-input-stream:%04x" code)
268268 in
269269270270···19371937 error t "null-character-reference";
19381938 replacement_char
19391939 end else if code > 0x10FFFF then begin
19401940- error t "character-reference-outside-unicode-range";
19401940+ error t (Printf.sprintf "character-reference-outside-unicode-range:%x" code);
19411941 replacement_char
19421942 end else if code >= 0xD800 && code <= 0xDFFF then begin
19431943- error t "surrogate-character-reference";
19431943+ error t (Printf.sprintf "surrogate-character-reference:%04x" code);
19441944 replacement_char
19451945 end else if (code >= 0xFDD0 && code <= 0xFDEF) ||
19461946 List.mem code [0xFFFE; 0xFFFF; 0x1FFFE; 0x1FFFF; 0x2FFFE; 0x2FFFF;
···19491949 0x9FFFE; 0x9FFFF; 0xAFFFE; 0xAFFFF; 0xBFFFE; 0xBFFFF;
19501950 0xCFFFE; 0xCFFFF; 0xDFFFE; 0xDFFFF; 0xEFFFE; 0xEFFFF;
19511951 0xFFFFE; 0xFFFFF; 0x10FFFE; 0x10FFFF] then begin
19521952- error t "noncharacter-character-reference";
19521952+ error t (Printf.sprintf "noncharacter-character-reference:%05x" code);
19531953 Entities.Numeric_ref.codepoint_to_utf8 code
19541954 end else if (code >= 0x01 && code <= 0x08) || code = 0x0B ||
19551955 (code >= 0x0D && code <= 0x1F) ||
19561956 (code >= 0x7F && code <= 0x9F) then begin
19571957- error t "control-character-reference";
19571957+ error t (Printf.sprintf "control-character-reference:%04x" code);
19581958 (* Apply Windows-1252 replacement table for 0x80-0x9F *)
19591959 match Entities.Numeric_ref.find_replacement code with
19601960 | Some replacement -> Entities.Numeric_ref.codepoint_to_utf8 replacement
+5-5
lib/html5rw/tokenizer/tokenizer_stream.ml
···9999let check_utf8_codepoint t lead_byte =
100100 let b0 = Char.code lead_byte in
101101 if b0 < 0x80 then
102102- (* ASCII - no surrogates or noncharacters possible in this range except control chars *)
102102+ (* ASCII - control characters are handled in tokenizer_impl.ml *)
103103 ()
104104 else if b0 >= 0xC2 && b0 <= 0xDF then begin
105105 (* 2-byte sequence: 110xxxxx 10xxxxxx -> U+0080 to U+07FF *)
···112112 (* C1 controls: U+0080 to U+009F *)
113113 if cp >= 0x80 && cp <= 0x9F then
114114 (match t.error_callback with
115115- | Some cb -> cb "control-character-in-input-stream"
115115+ | Some cb -> cb (Printf.sprintf "control-character-in-input-stream:%04x" cp)
116116 | None -> ())
117117 | Some c1 ->
118118 push_back_char t c1
···132132 (* Check for surrogates and noncharacters *)
133133 (match t.error_callback with
134134 | Some cb ->
135135- if is_surrogate cp then cb "surrogate-in-input-stream"
136136- else if is_noncharacter cp then cb "noncharacter-in-input-stream"
135135+ if is_surrogate cp then cb (Printf.sprintf "surrogate-in-input-stream:%04x" cp)
136136+ else if is_noncharacter cp then cb (Printf.sprintf "noncharacter-in-input-stream:%04x" cp)
137137 | None -> ())
138138 | Some c2 ->
139139 push_back_char t c2;
···162162 (* Check for noncharacters (no surrogates in 4-byte range) *)
163163 (match t.error_callback with
164164 | Some cb ->
165165- if is_noncharacter cp then cb "noncharacter-in-input-stream"
165165+ if is_noncharacter cp then cb (Printf.sprintf "noncharacter-in-input-stream:%05x" cp)
166166 | None -> ())
167167 | Some c3 ->
168168 push_back_char t c3;