···147 is_contact_details := true
148 | _ -> ());
149000000000000000150 (* Process remaining tokens *)
151- let process_field_tokens = function
152- | [] -> Error "A list of autofill details tokens must contain an autofill field name"
0153 | [ "webauthn" ] ->
154 Error
155 "The token \"webauthn\" must not be the only token in a list of \
156- autofill detail tokens"
157 | [ field_name ] ->
158 if not (List.mem field_name all_field_names) then
159 Error
160 (Printf.sprintf
161- "The string \"%s\" is not a valid autofill field name"
162 field_name)
163 else if !is_contact_details && not (List.mem field_name contact_field_names)
164 then
165 Error
166 (Printf.sprintf
167 "The autofill field name \"%s\" is not allowed in contact \
168- context"
169 field_name)
170 else Ok ()
171 | [ field_name; "webauthn" ] ->
172 if not (List.mem field_name all_field_names) then
173 Error
174 (Printf.sprintf
175- "The string \"%s\" is not a valid autofill field name"
176 field_name)
177 else if !is_contact_details && not (List.mem field_name contact_field_names)
178 then
179 Error
180 (Printf.sprintf
181 "The autofill field name \"%s\" is not allowed in contact \
182- context"
183 field_name)
184 else Ok ()
185 | token :: _ when List.mem token contact_types ->
186 Error
187 (Printf.sprintf
188- "The token \"%s\" must only appear before any autofill field names"
189 token)
190 | token :: _ when starts_with token "section-" ->
191 Error
192 "A \"section-*\" indicator must only appear as the first token in a \
193- list of autofill detail tokens"
194 | "shipping" :: _ | "billing" :: _ as toks ->
195 Error
196 (Printf.sprintf
197 "The token \"%s\" must only appear as either the first token in a \
198 list of autofill detail tokens, or, if the first token is a \
199- \"section-*\" indicator, as the second token"
200 (List.hd toks))
201 | _ :: "webauthn" :: _ :: _ ->
202 Error
203 "The token \"webauthn\" must only appear as the very last token in a \
204- list of autofill detail tokens"
205- | _ :: _ :: _ ->
206- Error
207- "A list of autofill details tokens must not contain more than one \
208- autofill field name"
0000000000000000000000000209 in
210 process_field_tokens !tokens
211212(** Validate autocomplete value *)
213let validate_autocomplete s =
214 let trimmed = trim_whitespace s in
215- if String.length trimmed = 0 then Error "Must not be empty"
216 else if trimmed = "on" || trimmed = "off" then Ok ()
217 else
218 let tokens = split_on_whitespace trimmed in
···147 is_contact_details := true
148 | _ -> ());
149150+ (* Check if any token in the list is shipping/billing *)
151+ let find_shipping_billing tokens =
152+ List.find_opt (fun t -> t = "shipping" || t = "billing") tokens
153+ in
154+155+ (* Check if any token in the list is a contact type *)
156+ let find_contact_type tokens =
157+ List.find_opt (fun t -> List.mem t contact_types) tokens
158+ in
159+160+ (* Check if any token in the list is a section-* indicator *)
161+ let find_section tokens =
162+ List.find_opt (fun t -> starts_with t "section-") tokens
163+ in
164+165 (* Process remaining tokens *)
166+ let process_field_tokens tokens =
167+ match tokens with
168+ | [] -> Error "A list of autofill details tokens must contain an autofill field name."
169 | [ "webauthn" ] ->
170 Error
171 "The token \"webauthn\" must not be the only token in a list of \
172+ autofill detail tokens."
173 | [ field_name ] ->
174 if not (List.mem field_name all_field_names) then
175 Error
176 (Printf.sprintf
177+ "The string \"%s\" is not a valid autofill field name."
178 field_name)
179 else if !is_contact_details && not (List.mem field_name contact_field_names)
180 then
181 Error
182 (Printf.sprintf
183 "The autofill field name \"%s\" is not allowed in contact \
184+ context."
185 field_name)
186 else Ok ()
187 | [ field_name; "webauthn" ] ->
188 if not (List.mem field_name all_field_names) then
189 Error
190 (Printf.sprintf
191+ "The string \"%s\" is not a valid autofill field name."
192 field_name)
193 else if !is_contact_details && not (List.mem field_name contact_field_names)
194 then
195 Error
196 (Printf.sprintf
197 "The autofill field name \"%s\" is not allowed in contact \
198+ context."
199 field_name)
200 else Ok ()
201 | token :: _ when List.mem token contact_types ->
202 Error
203 (Printf.sprintf
204+ "The token \"%s\" must only appear before any autofill field names."
205 token)
206 | token :: _ when starts_with token "section-" ->
207 Error
208 "A \"section-*\" indicator must only appear as the first token in a \
209+ list of autofill detail tokens."
210 | "shipping" :: _ | "billing" :: _ as toks ->
211 Error
212 (Printf.sprintf
213 "The token \"%s\" must only appear as either the first token in a \
214 list of autofill detail tokens, or, if the first token is a \
215+ \"section-*\" indicator, as the second token."
216 (List.hd toks))
217 | _ :: "webauthn" :: _ :: _ ->
218 Error
219 "The token \"webauthn\" must only appear as the very last token in a \
220+ list of autofill detail tokens."
221+ | _ :: rest ->
222+ (* Check if any remaining token is a section-* indicator - position error takes precedence *)
223+ (match find_section rest with
224+ | Some _ ->
225+ Error
226+ "A \"section-*\" indicator must only appear as the first token in a \
227+ list of autofill detail tokens."
228+ | None ->
229+ (* Check if any remaining token is a contact type - position error takes precedence *)
230+ match find_contact_type rest with
231+ | Some ct_token ->
232+ Error
233+ (Printf.sprintf
234+ "The token \"%s\" must only appear before any autofill field names."
235+ ct_token)
236+ | None ->
237+ (* Check if any remaining token is shipping/billing - position error takes precedence *)
238+ match find_shipping_billing rest with
239+ | Some sb_token ->
240+ Error
241+ (Printf.sprintf
242+ "The token \"%s\" must only appear as either the first token in a \
243+ list of autofill detail tokens, or, if the first token is a \
244+ \"section-*\" indicator, as the second token."
245+ sb_token)
246+ | None ->
247+ Error
248+ "A list of autofill details tokens must not contain more than one \
249+ autofill field name.")
250 in
251 process_field_tokens !tokens
252253(** Validate autocomplete value *)
254let validate_autocomplete s =
255 let trimmed = trim_whitespace s in
256+ if String.length trimmed = 0 then Error "Must not be empty."
257 else if trimmed = "on" || trimmed = "off" then Ok ()
258 else
259 let tokens = split_on_whitespace trimmed in
+60-4
lib/html5_checker/parse_error_bridge.ml
···11 Message.make_location ~line ~column ?system_id ()
12 in
13 let code_str = Html5rw.Parse_error_code.to_string code in
14- let message = match code with
15 | Html5rw.Parse_error_code.Non_void_html_element_start_tag_with_trailing_solidus ->
16- "Self-closing syntax (\"/>\") used on a non-void HTML element. Ignoring the slash and treating as a start tag."
17- | _ -> Printf.sprintf "Parse error: %s" code_str
0000000000000000000000000000000000000000000000000000000018 in
19 Message.error
20 ~message
21- ~code:code_str
22 ~location
23 ()
24
···11 Message.make_location ~line ~column ?system_id ()
12 in
13 let code_str = Html5rw.Parse_error_code.to_string code in
14+ let (message, final_code) = match code with
15 | Html5rw.Parse_error_code.Non_void_html_element_start_tag_with_trailing_solidus ->
16+ ("Self-closing syntax (\"/>\") used on a non-void HTML element. Ignoring the slash and treating as a start tag.", code_str)
17+ | Html5rw.Parse_error_code.Tree_construction_error s ->
18+ (* Check for control-character/noncharacter/surrogate with codepoint info *)
19+ (try
20+ if String.length s > 28 && String.sub s 0 28 = "control-character-in-input-s" then
21+ let colon_pos = String.index s ':' in
22+ let cp_str = String.sub s (colon_pos + 1) (String.length s - colon_pos - 1) in
23+ let cp = int_of_string ("0x" ^ cp_str) in
24+ (Printf.sprintf "Forbidden code point U+%04x." cp, "forbidden-codepoint")
25+ else if String.length s > 25 && String.sub s 0 25 = "noncharacter-in-input-str" then
26+ let colon_pos = String.index s ':' in
27+ let cp_str = String.sub s (colon_pos + 1) (String.length s - colon_pos - 1) in
28+ let cp = int_of_string ("0x" ^ cp_str) in
29+ (Printf.sprintf "Forbidden code point U+%04x." cp, "forbidden-codepoint")
30+ else if String.length s > 22 && String.sub s 0 22 = "surrogate-in-input-str" then
31+ let colon_pos = String.index s ':' in
32+ let cp_str = String.sub s (colon_pos + 1) (String.length s - colon_pos - 1) in
33+ let cp = int_of_string ("0x" ^ cp_str) in
34+ (Printf.sprintf "Forbidden code point U+%04x." cp, "forbidden-codepoint")
35+ (* Character reference errors *)
36+ else if String.length s > 28 && String.sub s 0 28 = "control-character-reference:" then
37+ let cp_str = String.sub s 28 (String.length s - 28) in
38+ let cp = int_of_string ("0x" ^ cp_str) in
39+ if cp = 0x0D then
40+ ("A numeric character reference expanded to carriage return.", "control-character-reference")
41+ else
42+ (Printf.sprintf "Character reference expands to a control character (U+%04x)." cp, "control-character-reference")
43+ else if String.length s > 31 && String.sub s 0 31 = "noncharacter-character-referenc" then
44+ let colon_pos = String.index s ':' in
45+ let cp_str = String.sub s (colon_pos + 1) (String.length s - colon_pos - 1) in
46+ let cp = int_of_string ("0x" ^ cp_str) in
47+ (* U+FDD0-U+FDEF are "permanently unassigned" *)
48+ if cp >= 0xFDD0 && cp <= 0xFDEF then
49+ ("Character reference expands to a permanently unassigned code point.", "noncharacter-character-reference")
50+ (* Astral noncharacters (planes 1-16) *)
51+ else if cp >= 0x10000 then
52+ (Printf.sprintf "Character reference expands to an astral non-character (U+%05x)." cp, "noncharacter-character-reference")
53+ else
54+ (Printf.sprintf "Character reference expands to a non-character (U+%04x)." cp, "noncharacter-character-reference")
55+ else if String.length s > 36 && String.sub s 0 36 = "character-reference-outside-unicode-" then
56+ let colon_pos = String.index s ':' in
57+ let _ = String.sub s (colon_pos + 1) (String.length s - colon_pos - 1) in
58+ ("Character reference outside the permissible Unicode range.", "character-reference-outside-unicode-range")
59+ else if String.length s > 27 && String.sub s 0 27 = "surrogate-character-referen" then
60+ let colon_pos = String.index s ':' in
61+ let cp_str = String.sub s (colon_pos + 1) (String.length s - colon_pos - 1) in
62+ let cp = int_of_string ("0x" ^ cp_str) in
63+ (Printf.sprintf "Character reference expands to a surrogate (U+%04x)." cp, "surrogate-character-reference")
64+ else if s = "no-p-element-in-scope" then
65+ ("No \xe2\x80\x9cp\xe2\x80\x9d element in scope but a \xe2\x80\x9cp\xe2\x80\x9d end tag seen.", "no-p-element-in-scope")
66+ else if s = "end-tag-p-implied-but-open-elements" then
67+ ("End tag \xe2\x80\x9cp\xe2\x80\x9d implied, but there were open elements.", "end-tag-p-implied")
68+ else if s = "end-tag-br" then
69+ ("End tag \xe2\x80\x9cbr\xe2\x80\x9d.", "end-tag-br")
70+ else
71+ (Printf.sprintf "Parse error: %s" s, s)
72+ with _ -> (Printf.sprintf "Parse error: %s" s, s))
73+ | _ -> (Printf.sprintf "Parse error: %s" code_str, code_str)
74 in
75 Message.error
76 ~message
77+ ~code:final_code
78 ~location
79 ()
80
+3-1
lib/html5_checker/semantic/form_checker.ml
···32 match Dt_autocomplete.validate_autocomplete value with
33 | Ok () -> ()
34 | Error msg ->
0035 Message_collector.add_typed collector
36 (Error_code.Bad_attr_value {
37 element = element_name;
38 attr = "autocomplete";
39 value;
40- reason = msg
41 })
42 end
43
···32 match Dt_autocomplete.validate_autocomplete value with
33 | Ok () -> ()
34 | Error msg ->
35+ (* Nu validator prefixes autocomplete errors with "Bad autocomplete detail tokens (any): " for select/textarea, but not for input *)
36+ let reason = if element_name = "input" then msg else "Bad autocomplete detail tokens (any): " ^ msg in
37 Message_collector.add_typed collector
38 (Error_code.Bad_attr_value {
39 element = element_name;
40 attr = "autocomplete";
41 value;
42+ reason
43 })
44 end
45
+23-8
lib/html5_checker/semantic/id_checker.ml
···193 so we pass None. In a full implementation, this would be passed
194 from the parser. *)
195 let location = None in
196- process_attrs state ~element:name ~attrs ~location collector
0000000000197198let end_element _state ~name:_ ~namespace:_ _collector =
199 ()
···204let end_document state collector =
205 (* Check all ID references point to existing IDs *)
206 List.iter (fun ref ->
207- if not (Hashtbl.mem state.ids ref.referenced_id) then
208- (* Use generic for dangling references - format may vary *)
209- Message_collector.add_typed collector
210- (Error_code.Generic {
211- message = Printf.sprintf "The %s attribute on the %s element refers to ID %s which does not exist in the document."
212- (Error_code.q ref.attribute) (Error_code.q ref.referring_element) (Error_code.q ref.referenced_id)
213- })
00000214 ) state.references;
215216 (* Check all usemap references point to existing map names *)
···193 so we pass None. In a full implementation, this would be passed
194 from the parser. *)
195 let location = None in
196+ process_attrs state ~element:name ~attrs ~location collector;
197+198+ (* Special check: map element must have matching id and name if both present *)
199+ if name = "map" then begin
200+ let id_opt = List.find_map (fun (n, v) -> if n = "id" then Some v else None) attrs in
201+ let name_opt = List.find_map (fun (n, v) -> if n = "name" then Some v else None) attrs in
202+ match id_opt, name_opt with
203+ | Some id_val, Some name_val when id_val <> name_val ->
204+ Message_collector.add_typed collector Error_code.Map_id_name_mismatch
205+ | _ -> ()
206+ end
207208let end_element _state ~name:_ ~namespace:_ _collector =
209 ()
···214let end_document state collector =
215 (* Check all ID references point to existing IDs *)
216 List.iter (fun ref ->
217+ if not (Hashtbl.mem state.ids ref.referenced_id) then begin
218+ (* Use specific error for list attribute on input *)
219+ if ref.attribute = "list" && ref.referring_element = "input" then
220+ Message_collector.add_typed collector Error_code.List_attr_requires_datalist
221+ else
222+ (* Use generic for dangling references - format may vary *)
223+ Message_collector.add_typed collector
224+ (Error_code.Generic {
225+ message = Printf.sprintf "The %s attribute on the %s element refers to ID %s which does not exist in the document."
226+ (Error_code.q ref.attribute) (Error_code.q ref.referring_element) (Error_code.q ref.referenced_id)
227+ })
228+ end
229 ) state.references;
230231 (* Check all usemap references point to existing map names *)
···141 | "zh-tw" -> "zh-hant"
142 | _ -> code
14300000000000000000000000000000000000000000000000000000000000000000000000000000000144let start_element state ~name ~namespace ~attrs _collector =
145 let name_lower = String.lowercase_ascii name in
146 let ns = Option.value namespace ~default:"" in
···226 let original_declared = match state.html_lang with
227 | Some l -> l
228 | None -> ""
00000229 in
230 let detected_code = detected_lang in (* Keep full code like zh-tw *)
231 let detected_name = get_language_name detected_lang in
···141 | "zh-tw" -> "zh-hant"
142 | _ -> code
143144+(* Traditional Chinese-only characters (simplified versions don't exist) *)
145+(* These are characters that were simplified in Simplified Chinese *)
146+let traditional_chars = [|
147+ 0x570B; (* 國 -> 国 *)
148+ 0x5B78; (* 學 -> 学 *)
149+ 0x8AAA; (* 說 -> 说 *)
150+ 0x66F8; (* 書 -> 书 *)
151+ 0x8A9E; (* 語 -> 语 *)
152+ 0x6642; (* 時 -> 时 *)
153+ 0x6703; (* 會 -> 会 *)
154+ 0x7D93; (* 經 -> 经 *)
155+ 0x6A5F; (* 機 -> 机 *)
156+ 0x767C; (* 發 -> 发 *)
157+ 0x554F; (* 問 -> 问 *)
158+ 0x6578; (* 數 -> 数 *)
159+ 0x5BE6; (* 實 -> 实 *)
160+ 0x958B; (* 開 -> 开 *)
161+ 0x95DC; (* 關 -> 关 *)
162+ 0x9577; (* 長 -> 长 *)
163+ 0x9AD4; (* 體 -> 体 *)
164+ 0x9EDE; (* 點 -> 点 *)
165+ 0x96FB; (* 電 -> 电 *)
166+ 0x8CC7; (* 資 -> 资 *)
167+ 0x7FA9; (* 義 -> 义 *)
168+ 0x8B93; (* 讓 -> 让 *)
169+ 0x9054; (* 達 -> 达 *)
170+ 0x71DF; (* 營 -> 营 *)
171+ 0x8655; (* 處 -> 处 *)
172+ 0x6771; (* 東 -> 东 *)
173+ 0x8209; (* 舉 -> 举 *)
174+ 0x8A18; (* 記 -> 记 *)
175+ 0x5099; (* 備 -> 备 *)
176+ 0x5354; (* 協 -> 协 *)
177+ 0x8FA6; (* 辦 -> 办 *)
178+ 0x8457; (* 著 -> 着 *)
179+ 0x8F09; (* 載 -> 载 *)
180+ 0x52D9; (* 務 -> 务 *)
181+ 0x7121; (* 無 -> 无 *)
182+ 0x5F9E; (* 從 -> 从 *)
183+ 0x8B58; (* 識 -> 识 *)
184+ 0x8207; (* 與 -> 与 *)
185+ 0x78BA; (* 確 -> 确 *)
186+ 0x904E; (* 過 -> 过 *)
187+ 0x8A72; (* 該 -> 该 *)
188+ 0x9810; (* 預 -> 预 *)
189+ 0x7576; (* 當 -> 当 *)
190+ 0x5831; (* 報 -> 报 *)
191+ 0x9054; (* 達 -> 达 *)
192+ 0x91AB; (* 醫 -> 医 *)
193+ 0x5718; (* 團 -> 团 *)
194+ 0x8B70; (* 議 -> 议 *)
195+ 0x7D71; (* 統 -> 统 *)
196+ 0x898F; (* 規 -> 规 *)
197+|]
198+199+(* Check if text contains enough Traditional Chinese characters *)
200+let is_traditional_chinese text =
201+ let count = ref 0 in
202+ let total = ref 0 in
203+ let decoder = Uutf.decoder ~encoding:`UTF_8 (`String text) in
204+ let rec process () =
205+ if !total >= 1000 then () (* Sample first 1000 chars *)
206+ else match Uutf.decode decoder with
207+ | `Await | `End -> ()
208+ | `Malformed _ -> process ()
209+ | `Uchar uchar ->
210+ let code = Uchar.to_int uchar in
211+ (* Count CJK characters *)
212+ if code >= 0x4E00 && code <= 0x9FFF then begin
213+ incr total;
214+ (* Check if it's a Traditional-only character *)
215+ if Array.exists (fun c -> c = code) traditional_chars then
216+ incr count
217+ end;
218+ process ()
219+ in
220+ process ();
221+ (* If > 2% are Traditional-only characters, it's Traditional Chinese *)
222+ !total > 100 && (float_of_int !count /. float_of_int !total) > 0.02
223+224let start_element state ~name ~namespace ~attrs _collector =
225 let name_lower = String.lowercase_ascii name in
226 let ns = Option.value namespace ~default:"" in
···306 let original_declared = match state.html_lang with
307 | Some l -> l
308 | None -> ""
309+ in
310+ (* Correct for Traditional vs Simplified Chinese misdetection *)
311+ let detected_lang =
312+ if detected_lang = "zh-cn" && is_traditional_chinese text then "zh-tw"
313+ else detected_lang
314 in
315 let detected_code = detected_lang in (* Keep full code like zh-tw *)
316 let detected_name = get_language_name detected_lang in
+23-3
lib/html5_checker/specialized/aria_checker.ml
···368 mutable stack : stack_node list;
369 mutable has_active_tab : bool; (* Whether document has role=tab with aria-selected=true *)
370 mutable has_tabpanel : bool; (* Whether document has role=tabpanel elements *)
0371}
372373-let create () = { stack = []; has_active_tab = false; has_tabpanel = false }
374375let reset state =
376 state.stack <- [];
377 state.has_active_tab <- false;
378- state.has_tabpanel <- false
0379380(** Check if any ancestor has one of the required roles. *)
381let has_required_ancestor_role state required_roles =
···451 if aria_selected = Some "true" then state.has_active_tab <- true
452 end;
453 if List.mem "tabpanel" explicit_roles then state.has_tabpanel <- true;
00000000000000454455 (* Check br/wbr role restrictions - only none/presentation allowed *)
456 if (name_lower = "br" || name_lower = "wbr") && explicit_roles <> [] then begin
···784 Message_collector.add_error collector
785 ~message:"Every active \xe2\x80\x9crole=tab\xe2\x80\x9d element must have a corresponding \xe2\x80\x9crole=tabpanel\xe2\x80\x9d element."
786 ~code:"tab-without-tabpanel"
787- ()
0000788789let checker = (module struct
790 type nonrec state = state
···368 mutable stack : stack_node list;
369 mutable has_active_tab : bool; (* Whether document has role=tab with aria-selected=true *)
370 mutable has_tabpanel : bool; (* Whether document has role=tabpanel elements *)
371+ mutable visible_main_count : int; (* Count of visible elements with role=main *)
372}
373374+let create () = { stack = []; has_active_tab = false; has_tabpanel = false; visible_main_count = 0 }
375376let reset state =
377 state.stack <- [];
378 state.has_active_tab <- false;
379+ state.has_tabpanel <- false;
380+ state.visible_main_count <- 0
381382(** Check if any ancestor has one of the required roles. *)
383let has_required_ancestor_role state required_roles =
···453 if aria_selected = Some "true" then state.has_active_tab <- true
454 end;
455 if List.mem "tabpanel" explicit_roles then state.has_tabpanel <- true;
456+457+ (* Track visible main elements (explicit role=main or implicit main role) *)
458+ let is_hidden =
459+ let aria_hidden = List.assoc_opt "aria-hidden" attrs in
460+ aria_hidden = Some "true"
461+ in
462+ if not is_hidden then begin
463+ (* Check explicit role *)
464+ if List.mem "main" explicit_roles then
465+ state.visible_main_count <- state.visible_main_count + 1
466+ (* Check implicit role from <main> element *)
467+ else if name_lower = "main" then
468+ state.visible_main_count <- state.visible_main_count + 1
469+ end;
470471 (* Check br/wbr role restrictions - only none/presentation allowed *)
472 if (name_lower = "br" || name_lower = "wbr") && explicit_roles <> [] then begin
···800 Message_collector.add_error collector
801 ~message:"Every active \xe2\x80\x9crole=tab\xe2\x80\x9d element must have a corresponding \xe2\x80\x9crole=tabpanel\xe2\x80\x9d element."
802 ~code:"tab-without-tabpanel"
803+ ();
804+805+ (* Check for multiple visible main elements *)
806+ if state.visible_main_count > 1 then
807+ Message_collector.add_typed collector Error_code.Multiple_main_visible
808809let checker = (module struct
810 type nonrec state = state
···250 Printf.sprintf "Bad value \xe2\x80\x9c\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: The empty string is not a valid non-negative integer."
251 attr_name name
252 else if String.contains attr_value '%' then
253- Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Expected a digit but saw \xe2\x80\x9c%%\xe2\x80\x9d instead."
254 attr_value attr_name name
255 else if String.length attr_value > 0 && attr_value.[0] = '-' then
256- Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: The value must be non-negative."
257 attr_value attr_name name
258 else
259 (* Find first non-digit character *)
···268 in
269 match bad_char with
270 | Some c ->
271- Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Expected a digit but saw \xe2\x80\x9c%c\xe2\x80\x9d instead."
272 attr_value attr_name name c
273 | None ->
274- Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Expected a digit."
275 attr_value attr_name name
276 in
277 Message_collector.add_error collector
···455 List.iter (fun key ->
456 if count_codepoints key > 1 then
457 Message_collector.add_error collector
458- ~message:(Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: The space-separated list of key labels contains a value \xe2\x80\x9c%s\xe2\x80\x9d that consists of more than a single code point."
459- attr_value attr_name name key)
460 ~code:"bad-attribute-value"
461 ~element:name ~attribute:attr_name ()
462 ) keys;
···466 | k :: rest ->
467 if List.mem k seen then
468 Message_collector.add_error collector
469- ~message:(Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Duplicate key label."
470 attr_value attr_name name)
471 ~code:"bad-attribute-value"
472 ~element:name ~attribute:attr_name ()
···250 Printf.sprintf "Bad value \xe2\x80\x9c\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: The empty string is not a valid non-negative integer."
251 attr_name name
252 else if String.contains attr_value '%' then
253+ Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad non-negative integer: Expected a digit but saw \xe2\x80\x9c%%\xe2\x80\x9d instead."
254 attr_value attr_name name
255 else if String.length attr_value > 0 && attr_value.[0] = '-' then
256+ Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad non-negative integer: Expected a digit but saw \xe2\x80\x9c-\xe2\x80\x9d instead."
257 attr_value attr_name name
258 else
259 (* Find first non-digit character *)
···268 in
269 match bad_char with
270 | Some c ->
271+ Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad non-negative integer: Expected a digit but saw \xe2\x80\x9c%c\xe2\x80\x9d instead."
272 attr_value attr_name name c
273 | None ->
274+ Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad non-negative integer: Expected a digit."
275 attr_value attr_name name
276 in
277 Message_collector.add_error collector
···455 List.iter (fun key ->
456 if count_codepoints key > 1 then
457 Message_collector.add_error collector
458+ ~message:(Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad key label list: Key label has multiple characters. Each key label must be a single character."
459+ attr_value attr_name name)
460 ~code:"bad-attribute-value"
461 ~element:name ~attribute:attr_name ()
462 ) keys;
···466 | k :: rest ->
467 if List.mem k seen then
468 Message_collector.add_error collector
469+ ~message:(Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad key label list: Duplicate key label. Each key label must be unique."
470 attr_value attr_name name)
471 ~code:"bad-attribute-value"
472 ~element:name ~attribute:attr_name ()
+9-13
lib/html5_checker/specialized/datetime_checker.ml
···241 minute <> 0 && minute <> 30 && minute <> 45
242 in
243 if unusual_range then
244- TzWarning "unusual timezone offset"
245 else if unusual_minutes then
246- TzWarning "unusual timezone offset minutes"
247 else
248 TzOk
249 end
···350 match validate_datetime_with_timezone value with
351 | DtOk -> Ok (* Valid datetime with timezone *)
352 | DtWarning w ->
353- (* Valid but with warning *)
354- Warning (Printf.sprintf "Possibly mistyped value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: %s."
355 value attr_name element_name w)
356 | DtError tz_error ->
357 (* Try just date - valid for all elements *)
···359 | (true, _) ->
360 (* Date is valid, but check for suspicious year (5+ digits or old year) *)
361 if has_suspicious_year value || has_old_year value then begin
362- let date_msg = "Year may be mistyped." in
363 let tz_msg = Printf.sprintf "Bad datetime with timezone: %s." tz_error in
364- Warning (Printf.sprintf "Possibly mistyped value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: %s %s"
365 value attr_name element_name date_msg tz_msg)
366 end else
367 Ok (* Valid date with normal year *)
···389 match validate_duration value with
390 | (true, _) -> Ok (* Valid duration P... *)
391 | (false, _) ->
392- let tz_msg = Printf.sprintf "Bad datetime with timezone: %s." tz_error in
393- let date_msg = match date_error with
394- | Some e -> Printf.sprintf "Bad date: %s." e
395- | None -> "Bad date: The literal did not satisfy the date format."
396- in
397- Error (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: %s %s"
398- value attr_name element_name tz_msg date_msg)
399 end
400 else begin
401 (* del/ins only allow date or datetime-with-timezone *)
···241 minute <> 0 && minute <> 30 && minute <> 45
242 in
243 if unusual_range then
244+ TzWarning "Hours in time zone designator should be from \"-12:00\" to \"+14:00\""
245 else if unusual_minutes then
246+ TzWarning "Minutes in time zone designator should be either \"00\", \"30\", or \"45\"."
247 else
248 TzOk
249 end
···350 match validate_datetime_with_timezone value with
351 | DtOk -> Ok (* Valid datetime with timezone *)
352 | DtWarning w ->
353+ (* Valid but with warning - format matches Nu validator *)
354+ Warning (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad datetime with timezone: %s Bad date: The literal did not satisfy the date format."
355 value attr_name element_name w)
356 | DtError tz_error ->
357 (* Try just date - valid for all elements *)
···359 | (true, _) ->
360 (* Date is valid, but check for suspicious year (5+ digits or old year) *)
361 if has_suspicious_year value || has_old_year value then begin
362+ let date_msg = "Bad date: Year may be mistyped." in
363 let tz_msg = Printf.sprintf "Bad datetime with timezone: %s." tz_error in
364+ Warning (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: %s %s"
365 value attr_name element_name date_msg tz_msg)
366 end else
367 Ok (* Valid date with normal year *)
···389 match validate_duration value with
390 | (true, _) -> Ok (* Valid duration P... *)
391 | (false, _) ->
392+ (* Use simplified message for time element matching Nu validator format *)
393+ Error (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad time-datetime: The literal did not satisfy the time-datetime format."
394+ value attr_name element_name)
0000395 end
396 else begin
397 (* del/ins only allow date or datetime-with-timezone *)
···175 | SlashKeyWithoutSlashValue of string (* property name where slash key doesn't have slash value *)
176 | InvalidScopeKey (* scope key is not a valid URL *)
177 | InvalidScopeValue of string (* scope value is not a valid URL *)
0178179(** Check if a string looks like a valid URL-like specifier for importmaps *)
180let is_valid_url_like s =
···255 | JNull -> ()
256 | _ -> add_error (NotString ("scopes[" ^ skey ^ "][" ^ sikey ^ "]"))
257 ) scope_imports
258- | _ -> add_error (NotObject ("scopes[" ^ skey ^ "]"))
259 ) scope_members
260 | _ -> add_error (NotObject "scopes")
261 end
···290 Printf.sprintf "The value of the \xe2\x80\x9c%s\xe2\x80\x9d property within the content of a \xe2\x80\x9cscript\xe2\x80\x9d element with a \xe2\x80\x9ctype\xe2\x80\x9d attribute whose value is \xe2\x80\x9cimportmap\xe2\x80\x9d must be a JSON object." prop
291 | NotString _ ->
292 "A specifier map defined in a \xe2\x80\x9cimports\xe2\x80\x9d property within the content of a \xe2\x80\x9cscript\xe2\x80\x9d element with a \xe2\x80\x9ctype\xe2\x80\x9d attribute whose value is \xe2\x80\x9cimportmap\xe2\x80\x9d must only contain string values."
293- | ForbiddenProperty prop ->
294- Printf.sprintf "The \xe2\x80\x9c%s\xe2\x80\x9d property within the content of a \xe2\x80\x9cscript\xe2\x80\x9d element with a \xe2\x80\x9ctype\xe2\x80\x9d attribute whose value is \xe2\x80\x9cimportmap\xe2\x80\x9d is not an allowed property." prop
295 | SlashKeyWithoutSlashValue prop ->
296 Printf.sprintf "A specifier map defined in a \xe2\x80\x9c%s\xe2\x80\x9d property within the content of a \xe2\x80\x9cscript\xe2\x80\x9d element with a \xe2\x80\x9ctype\xe2\x80\x9d attribute whose value is \xe2\x80\x9cimportmap\xe2\x80\x9d must have values that end with \xe2\x80\x9c/\xe2\x80\x9d when its corresponding key ends with \xe2\x80\x9c/\xe2\x80\x9d." prop
297 | InvalidScopeKey ->
298 "The value of the \xe2\x80\x9cscopes\xe2\x80\x9d property within the content of a \xe2\x80\x9cscript\xe2\x80\x9d element with a \xe2\x80\x9ctype\xe2\x80\x9d attribute whose value is \xe2\x80\x9cimportmap\xe2\x80\x9d must be a JSON object whose keys are valid URL strings."
299 | InvalidScopeValue _ ->
300 "A specifier map defined in a \xe2\x80\x9cscopes\xe2\x80\x9d property within the content of a \xe2\x80\x9cscript\xe2\x80\x9d element with a \xe2\x80\x9ctype\xe2\x80\x9d attribute whose value is \xe2\x80\x9cimportmap\xe2\x80\x9d must only contain valid URL values."
00301302let end_element state ~name ~namespace collector =
303 if namespace <> None then ()
···175 | SlashKeyWithoutSlashValue of string (* property name where slash key doesn't have slash value *)
176 | InvalidScopeKey (* scope key is not a valid URL *)
177 | InvalidScopeValue of string (* scope value is not a valid URL *)
178+ | ScopeValueNotObject (* a value inside scopes is not a JSON object *)
179180(** Check if a string looks like a valid URL-like specifier for importmaps *)
181let is_valid_url_like s =
···256 | JNull -> ()
257 | _ -> add_error (NotString ("scopes[" ^ skey ^ "][" ^ sikey ^ "]"))
258 ) scope_imports
259+ | _ -> add_error ScopeValueNotObject
260 ) scope_members
261 | _ -> add_error (NotObject "scopes")
262 end
···291 Printf.sprintf "The value of the \xe2\x80\x9c%s\xe2\x80\x9d property within the content of a \xe2\x80\x9cscript\xe2\x80\x9d element with a \xe2\x80\x9ctype\xe2\x80\x9d attribute whose value is \xe2\x80\x9cimportmap\xe2\x80\x9d must be a JSON object." prop
292 | NotString _ ->
293 "A specifier map defined in a \xe2\x80\x9cimports\xe2\x80\x9d property within the content of a \xe2\x80\x9cscript\xe2\x80\x9d element with a \xe2\x80\x9ctype\xe2\x80\x9d attribute whose value is \xe2\x80\x9cimportmap\xe2\x80\x9d must only contain string values."
294+ | ForbiddenProperty _ ->
295+ "A \xe2\x80\x9cscript\xe2\x80\x9d element with a \xe2\x80\x9ctype\xe2\x80\x9d attribute whose value is \xe2\x80\x9cimportmap\xe2\x80\x9d must contain a JSON object with no properties other than \xe2\x80\x9cimports\xe2\x80\x9d, \xe2\x80\x9cscopes\xe2\x80\x9d, and \xe2\x80\x9cintegrity\xe2\x80\x9d."
296 | SlashKeyWithoutSlashValue prop ->
297 Printf.sprintf "A specifier map defined in a \xe2\x80\x9c%s\xe2\x80\x9d property within the content of a \xe2\x80\x9cscript\xe2\x80\x9d element with a \xe2\x80\x9ctype\xe2\x80\x9d attribute whose value is \xe2\x80\x9cimportmap\xe2\x80\x9d must have values that end with \xe2\x80\x9c/\xe2\x80\x9d when its corresponding key ends with \xe2\x80\x9c/\xe2\x80\x9d." prop
298 | InvalidScopeKey ->
299 "The value of the \xe2\x80\x9cscopes\xe2\x80\x9d property within the content of a \xe2\x80\x9cscript\xe2\x80\x9d element with a \xe2\x80\x9ctype\xe2\x80\x9d attribute whose value is \xe2\x80\x9cimportmap\xe2\x80\x9d must be a JSON object whose keys are valid URL strings."
300 | InvalidScopeValue _ ->
301 "A specifier map defined in a \xe2\x80\x9cscopes\xe2\x80\x9d property within the content of a \xe2\x80\x9cscript\xe2\x80\x9d element with a \xe2\x80\x9ctype\xe2\x80\x9d attribute whose value is \xe2\x80\x9cimportmap\xe2\x80\x9d must only contain valid URL values."
302+ | ScopeValueNotObject ->
303+ "The value of the \xe2\x80\x9cscopes\xe2\x80\x9d property within the content of a \xe2\x80\x9cscript\xe2\x80\x9d element with a \xe2\x80\x9ctype\xe2\x80\x9d attribute whose value is \xe2\x80\x9cimportmap\xe2\x80\x9d must be a JSON object whose values are also JSON objects."
304305let end_element state ~name ~namespace collector =
306 if namespace <> None then ()
+3-3
lib/html5_checker/specialized/language_checker.ml
···57 | Some (deprecated, replacement) ->
58 Message_collector.add_warning collector
59 ~message:(Printf.sprintf
60- "The language tag \xe2\x80\x9c%s\xe2\x80\x9d is deprecated. Use \xe2\x80\x9c%s\xe2\x80\x9d instead."
61- deprecated replacement)
62 ~code:"deprecated-lang"
63 ?location
64 ~element
65- ~attribute:"lang"
66 ()
67 | None -> ()
68
···57 | Some (deprecated, replacement) ->
58 Message_collector.add_warning collector
59 ~message:(Printf.sprintf
60+ "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad language tag: The language subtag \xe2\x80\x9c%s\xe2\x80\x9d is deprecated. Use \xe2\x80\x9c%s\xe2\x80\x9d instead."
61+ value attribute element deprecated replacement)
62 ~code:"deprecated-lang"
63 ?location
64 ~element
65+ ~attribute
66 ()
67 | None -> ()
68
+68-27
lib/html5_checker/specialized/url_checker.ml
···239 let _ = contains_invalid_unicode decoded in
240 None
241 with Exit ->
242- Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Invalid host."
243 url attr_name element_name)
244245(** Check if string contains a character (checking both ASCII and UTF-8 fullwidth variants). *)
···349 end else
350 None
351352-(** Check for data: URI with fragment - this is a warning (RFC 2397 forbids fragments). *)
353-let check_data_uri_fragment url attr_name element_name =
0354 match extract_scheme url with
355 | None -> None
356 | Some scheme ->
357 if scheme = "data" && String.contains url '#' then
358- Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Fragment is not allowed for data: URIs according to RFC 2397."
359- url attr_name element_name)
0360 else
361 None
362···373 let after_colon = String.sub url (colon_pos + 1) (String.length url - colon_pos - 1) in
374 (* data: URLs should NOT start with / - format is data:[mediatype][;base64],data *)
375 if String.length after_colon > 0 && after_colon.[0] = '/' then
376- Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Invalid %s: URL."
377- url attr_name element_name scheme)
378 else
379 None
380 end else
···389 (* Get scheme data (after the colon) *)
390 let colon_pos = String.index url ':' in
391 let scheme_data = String.sub url (colon_pos + 1) (String.length url - colon_pos - 1) in
000000000000392 (* Check for space in scheme data *)
393- if String.contains scheme_data ' ' then
394 Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in scheme data: space is not allowed."
395 url attr_name element_name)
396 else
···508 try
509 let fragment_start = String.index url '#' in
510 let fragment = String.sub url (fragment_start + 1) (String.length url - fragment_start - 1) in
0000511 (* Check for second hash in fragment *)
512- if String.contains fragment '#' then
513 Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in fragment: \xe2\x80\x9c#\xe2\x80\x9d is not allowed."
514 url attr_name element_name)
515 (* Check for space in fragment *)
···560 else if String.contains userinfo ' ' then
561 Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in user or password: space is not allowed."
562 url attr_name element_name)
563- else
564- (* Check for non-ASCII characters (like emoji) *)
565- let has_non_ascii = String.exists (fun c -> Char.code c > 127) userinfo in
566- if has_non_ascii then
567- Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in user or password."
568- url attr_name element_name)
569- else
000000000000000000570 (* Check for other invalid chars *)
571 let invalid = List.find_opt (fun c -> String.contains userinfo c) invalid_userinfo_chars in
572 match invalid with
···574 Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in user or password: \xe2\x80\x9c%c\xe2\x80\x9d is not allowed."
575 url attr_name element_name c)
576 | None -> None
0577 with _ -> None
578579(** Attributes where empty URL is an error.
···613 Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character: leading/trailing ASCII whitespace."
614 original_url attr_name element_name)
615 else None
616- (* Check for newlines/tabs *)
617- else if String.contains url '\n' || String.contains url '\r' || String.contains url '\t' then
618- Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Tab, new line or carriage return found."
619- url attr_name element_name)
620 else begin
00000000621 (* Check for relative URL issues first *)
622 match check_relative_url url attr_name element_name with
623 | Some err -> Some err
···659 url attr_name element_name)
660 else
661662- (* Check scheme data for non-special schemes *)
663- match check_scheme_data url attr_name element_name with
664- | Some err -> Some err
665- | None ->
666-667 (* Check path segment for illegal characters *)
668 match check_path_segment url attr_name element_name with
669 | Some err -> Some err
···688 match host_opt with
689 | Some host -> validate_host host url attr_name element_name scheme_str
690 | None -> None
0691 end
692 end
693···761 ()
762 | Some _ ->
763 (* Check for data: URI with fragment - emit warning *)
764- (match check_data_uri_fragment url "value" name with
0765 | Some warn_msg ->
766 Message_collector.add_warning collector
767 ~message:warn_msg
···786 end
787 end;
788 (* Check microdata itemtype and itemid attributes for data: URI fragments *)
0789 let itemtype_opt = get_attr_value "itemtype" attrs in
790 (match itemtype_opt with
791 | Some url when String.trim url <> "" ->
792- (match check_data_uri_fragment url "itemtype" name with
793 | Some warn_msg ->
794 Message_collector.add_warning collector
795 ~message:warn_msg
···799 ()
800 | None -> ())
801 | _ -> ());
0802 let itemid_opt = get_attr_value "itemid" attrs in
803 (match itemid_opt with
804 | Some url when String.trim url <> "" ->
···239 let _ = contains_invalid_unicode decoded in
240 None
241 with Exit ->
242+ Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Invalid host: A label or domain name contains disallowed characters.."
243 url attr_name element_name)
244245(** Check if string contains a character (checking both ASCII and UTF-8 fullwidth variants). *)
···349 end else
350 None
351352+(** Check for data: URI with fragment - this is a warning (RFC 2397 forbids fragments).
353+ The is_absolute_url parameter controls whether to use "Bad URL:" or "Bad absolute URL:" in the message. *)
354+let check_data_uri_fragment ?(is_absolute_url=false) url attr_name element_name =
355 match extract_scheme url with
356 | None -> None
357 | Some scheme ->
358 if scheme = "data" && String.contains url '#' then
359+ let url_type = if is_absolute_url then "Bad absolute URL:" else "Bad URL:" in
360+ Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: %s Fragment is not allowed for data: URIs according to RFC 2397."
361+ url attr_name element_name url_type)
362 else
363 None
364···375 let after_colon = String.sub url (colon_pos + 1) (String.length url - colon_pos - 1) in
376 (* data: URLs should NOT start with / - format is data:[mediatype][;base64],data *)
377 if String.length after_colon > 0 && after_colon.[0] = '/' then
378+ Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Expected a token character or a semicolon but saw \xe2\x80\x9c/\xe2\x80\x9d instead."
379+ url attr_name element_name)
380 else
381 None
382 end else
···391 (* Get scheme data (after the colon) *)
392 let colon_pos = String.index url ':' in
393 let scheme_data = String.sub url (colon_pos + 1) (String.length url - colon_pos - 1) in
394+ (* Check for tab in scheme data *)
395+ if String.contains scheme_data '\t' then
396+ Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in scheme data: tab is not allowed."
397+ url attr_name element_name)
398+ (* Check for newline in scheme data *)
399+ else if String.contains scheme_data '\n' then
400+ Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in scheme data: line break is not allowed."
401+ url attr_name element_name)
402+ (* Check for carriage return in scheme data *)
403+ else if String.contains scheme_data '\r' then
404+ Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in scheme data: line break is not allowed."
405+ url attr_name element_name)
406 (* Check for space in scheme data *)
407+ else if String.contains scheme_data ' ' then
408 Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in scheme data: space is not allowed."
409 url attr_name element_name)
410 else
···522 try
523 let fragment_start = String.index url '#' in
524 let fragment = String.sub url (fragment_start + 1) (String.length url - fragment_start - 1) in
525+ (* Check for backslash in fragment *)
526+ if String.contains fragment '\\' then
527+ Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in fragment: \xe2\x80\x9c\\\xe2\x80\x9d is not allowed."
528+ url attr_name element_name)
529 (* Check for second hash in fragment *)
530+ else if String.contains fragment '#' then
531 Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in fragment: \xe2\x80\x9c#\xe2\x80\x9d is not allowed."
532 url attr_name element_name)
533 (* Check for space in fragment *)
···578 else if String.contains userinfo ' ' then
579 Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in user or password: space is not allowed."
580 url attr_name element_name)
581+ else begin
582+ (* Check for non-ASCII characters (like emoji) using UTF-8 decoding *)
583+ let find_non_ascii_char userinfo =
584+ let decoder = Uutf.decoder ~encoding:`UTF_8 (`String userinfo) in
585+ let rec find () =
586+ match Uutf.decode decoder with
587+ | `End | `Await -> None
588+ | `Malformed _ -> find ()
589+ | `Uchar uchar ->
590+ let code = Uchar.to_int uchar in
591+ (* Check if character is not allowed in userinfo *)
592+ (* Per URL Standard: only ASCII letters, digits, and certain symbols allowed *)
593+ if code > 127 then begin
594+ let buf = Buffer.create 8 in
595+ Buffer.add_utf_8_uchar buf uchar;
596+ Some (Buffer.contents buf)
597+ end else find ()
598+ in
599+ find ()
600+ in
601+ match find_non_ascii_char userinfo with
602+ | Some bad_char ->
603+ Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in user or password: \xe2\x80\x9c%s\xe2\x80\x9d is not allowed."
604+ url attr_name element_name bad_char)
605+ | None ->
606 (* Check for other invalid chars *)
607 let invalid = List.find_opt (fun c -> String.contains userinfo c) invalid_userinfo_chars in
608 match invalid with
···610 Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in user or password: \xe2\x80\x9c%c\xe2\x80\x9d is not allowed."
611 url attr_name element_name c)
612 | None -> None
613+ end
614 with _ -> None
615616(** Attributes where empty URL is an error.
···650 Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character: leading/trailing ASCII whitespace."
651 original_url attr_name element_name)
652 else None
653+ (* Check scheme data for non-special schemes FIRST - handles tab/newline/CR in scheme data *)
000654 else begin
655+ match check_scheme_data url attr_name element_name with
656+ | Some err -> Some err
657+ | None ->
658+ (* Check for newlines/tabs in special scheme URLs *)
659+ if String.contains url '\n' || String.contains url '\r' || String.contains url '\t' then
660+ Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Tab, new line or carriage return found."
661+ url attr_name element_name)
662+ else begin
663 (* Check for relative URL issues first *)
664 match check_relative_url url attr_name element_name with
665 | Some err -> Some err
···701 url attr_name element_name)
702 else
70300000704 (* Check path segment for illegal characters *)
705 match check_path_segment url attr_name element_name with
706 | Some err -> Some err
···725 match host_opt with
726 | Some host -> validate_host host url attr_name element_name scheme_str
727 | None -> None
728+ end
729 end
730 end
731···799 ()
800 | Some _ ->
801 (* Check for data: URI with fragment - emit warning *)
802+ (* input[type=url] uses "Bad absolute URL:" format *)
803+ (match check_data_uri_fragment ~is_absolute_url:true url "value" name with
804 | Some warn_msg ->
805 Message_collector.add_warning collector
806 ~message:warn_msg
···825 end
826 end;
827 (* Check microdata itemtype and itemid attributes for data: URI fragments *)
828+ (* Microdata uses "Bad absolute URL:" format *)
829 let itemtype_opt = get_attr_value "itemtype" attrs in
830 (match itemtype_opt with
831 | Some url when String.trim url <> "" ->
832+ (match check_data_uri_fragment ~is_absolute_url:true url "itemtype" name with
833 | Some warn_msg ->
834 Message_collector.add_warning collector
835 ~message:warn_msg
···839 ()
840 | None -> ())
841 | _ -> ());
842+ (* itemid uses "Bad URL:" format (not "Bad absolute URL:") *)
843 let itemid_opt = get_attr_value "itemid" attrs in
844 (match itemid_opt with
845 | Some url when String.trim url <> "" ->
+3-3
lib/html5rw/parser/parser_tree_builder.ml
···664let close_p_element t =
665 generate_implied_end_tags t ~except:"p" ();
666 (match current_node t with
667- | Some n when n.Dom.name <> "p" -> parse_error t "expected-p"
668 | _ -> ());
669 pop_until_tag t "p"
670···1215 end
1216 | Token.Tag { kind = Token.End; name = "p"; _ } ->
1217 if not (has_element_in_button_scope t "p") then begin
1218- parse_error t "unexpected-end-tag";
1219 ignore (insert_element t "p" ~push:true [])
1220 end;
1221 close_p_element t
···1321 t.frameset_ok <- false;
1322 t.mode <- Parser_insertion_mode.In_table
1323 | Token.Tag { kind = Token.End; name = "br"; _ } ->
1324- parse_error t "unexpected-end-tag";
1325 reconstruct_active_formatting t;
1326 ignore (insert_element t "br" ~push:true []);
1327 pop_current t;
···664let close_p_element t =
665 generate_implied_end_tags t ~except:"p" ();
666 (match current_node t with
667+ | Some n when n.Dom.name <> "p" -> parse_error t "end-tag-p-implied-but-open-elements"
668 | _ -> ());
669 pop_until_tag t "p"
670···1215 end
1216 | Token.Tag { kind = Token.End; name = "p"; _ } ->
1217 if not (has_element_in_button_scope t "p") then begin
1218+ parse_error t "no-p-element-in-scope";
1219 ignore (insert_element t "p" ~push:true [])
1220 end;
1221 close_p_element t
···1321 t.frameset_ok <- false;
1322 t.mode <- Parser_insertion_mode.In_table
1323 | Token.Tag { kind = Token.End; name = "br"; _ } ->
1324+ parse_error t "end-tag-br";
1325 reconstruct_active_formatting t;
1326 ignore (insert_element t "br" ~push:true []);
1327 pop_current t;
+5-5
lib/html5rw/tokenizer/tokenizer_impl.ml
···264 code = 0x0B ||
265 (code >= 0x0E && code <= 0x1F) ||
266 code = 0x7F then
267- error t "control-character-in-input-stream"
268 in
269270···1937 error t "null-character-reference";
1938 replacement_char
1939 end else if code > 0x10FFFF then begin
1940- error t "character-reference-outside-unicode-range";
1941 replacement_char
1942 end else if code >= 0xD800 && code <= 0xDFFF then begin
1943- error t "surrogate-character-reference";
1944 replacement_char
1945 end else if (code >= 0xFDD0 && code <= 0xFDEF) ||
1946 List.mem code [0xFFFE; 0xFFFF; 0x1FFFE; 0x1FFFF; 0x2FFFE; 0x2FFFF;
···1949 0x9FFFE; 0x9FFFF; 0xAFFFE; 0xAFFFF; 0xBFFFE; 0xBFFFF;
1950 0xCFFFE; 0xCFFFF; 0xDFFFE; 0xDFFFF; 0xEFFFE; 0xEFFFF;
1951 0xFFFFE; 0xFFFFF; 0x10FFFE; 0x10FFFF] then begin
1952- error t "noncharacter-character-reference";
1953 Entities.Numeric_ref.codepoint_to_utf8 code
1954 end else if (code >= 0x01 && code <= 0x08) || code = 0x0B ||
1955 (code >= 0x0D && code <= 0x1F) ||
1956 (code >= 0x7F && code <= 0x9F) then begin
1957- error t "control-character-reference";
1958 (* Apply Windows-1252 replacement table for 0x80-0x9F *)
1959 match Entities.Numeric_ref.find_replacement code with
1960 | Some replacement -> Entities.Numeric_ref.codepoint_to_utf8 replacement
···264 code = 0x0B ||
265 (code >= 0x0E && code <= 0x1F) ||
266 code = 0x7F then
267+ error t (Printf.sprintf "control-character-in-input-stream:%04x" code)
268 in
269270···1937 error t "null-character-reference";
1938 replacement_char
1939 end else if code > 0x10FFFF then begin
1940+ error t (Printf.sprintf "character-reference-outside-unicode-range:%x" code);
1941 replacement_char
1942 end else if code >= 0xD800 && code <= 0xDFFF then begin
1943+ error t (Printf.sprintf "surrogate-character-reference:%04x" code);
1944 replacement_char
1945 end else if (code >= 0xFDD0 && code <= 0xFDEF) ||
1946 List.mem code [0xFFFE; 0xFFFF; 0x1FFFE; 0x1FFFF; 0x2FFFE; 0x2FFFF;
···1949 0x9FFFE; 0x9FFFF; 0xAFFFE; 0xAFFFF; 0xBFFFE; 0xBFFFF;
1950 0xCFFFE; 0xCFFFF; 0xDFFFE; 0xDFFFF; 0xEFFFE; 0xEFFFF;
1951 0xFFFFE; 0xFFFFF; 0x10FFFE; 0x10FFFF] then begin
1952+ error t (Printf.sprintf "noncharacter-character-reference:%05x" code);
1953 Entities.Numeric_ref.codepoint_to_utf8 code
1954 end else if (code >= 0x01 && code <= 0x08) || code = 0x0B ||
1955 (code >= 0x0D && code <= 0x1F) ||
1956 (code >= 0x7F && code <= 0x9F) then begin
1957+ error t (Printf.sprintf "control-character-reference:%04x" code);
1958 (* Apply Windows-1252 replacement table for 0x80-0x9F *)
1959 match Entities.Numeric_ref.find_replacement code with
1960 | Some replacement -> Entities.Numeric_ref.codepoint_to_utf8 replacement
+5-5
lib/html5rw/tokenizer/tokenizer_stream.ml
···99let check_utf8_codepoint t lead_byte =
100 let b0 = Char.code lead_byte in
101 if b0 < 0x80 then
102- (* ASCII - no surrogates or noncharacters possible in this range except control chars *)
103 ()
104 else if b0 >= 0xC2 && b0 <= 0xDF then begin
105 (* 2-byte sequence: 110xxxxx 10xxxxxx -> U+0080 to U+07FF *)
···112 (* C1 controls: U+0080 to U+009F *)
113 if cp >= 0x80 && cp <= 0x9F then
114 (match t.error_callback with
115- | Some cb -> cb "control-character-in-input-stream"
116 | None -> ())
117 | Some c1 ->
118 push_back_char t c1
···132 (* Check for surrogates and noncharacters *)
133 (match t.error_callback with
134 | Some cb ->
135- if is_surrogate cp then cb "surrogate-in-input-stream"
136- else if is_noncharacter cp then cb "noncharacter-in-input-stream"
137 | None -> ())
138 | Some c2 ->
139 push_back_char t c2;
···162 (* Check for noncharacters (no surrogates in 4-byte range) *)
163 (match t.error_callback with
164 | Some cb ->
165- if is_noncharacter cp then cb "noncharacter-in-input-stream"
166 | None -> ())
167 | Some c3 ->
168 push_back_char t c3;
···99let check_utf8_codepoint t lead_byte =
100 let b0 = Char.code lead_byte in
101 if b0 < 0x80 then
102+ (* ASCII - control characters are handled in tokenizer_impl.ml *)
103 ()
104 else if b0 >= 0xC2 && b0 <= 0xDF then begin
105 (* 2-byte sequence: 110xxxxx 10xxxxxx -> U+0080 to U+07FF *)
···112 (* C1 controls: U+0080 to U+009F *)
113 if cp >= 0x80 && cp <= 0x9F then
114 (match t.error_callback with
115+ | Some cb -> cb (Printf.sprintf "control-character-in-input-stream:%04x" cp)
116 | None -> ())
117 | Some c1 ->
118 push_back_char t c1
···132 (* Check for surrogates and noncharacters *)
133 (match t.error_callback with
134 | Some cb ->
135+ if is_surrogate cp then cb (Printf.sprintf "surrogate-in-input-stream:%04x" cp)
136+ else if is_noncharacter cp then cb (Printf.sprintf "noncharacter-in-input-stream:%04x" cp)
137 | None -> ())
138 | Some c2 ->
139 push_back_char t c2;
···162 (* Check for noncharacters (no surrogates in 4-byte range) *)
163 (match t.error_callback with
164 | Some cb ->
165+ if is_noncharacter cp then cb (Printf.sprintf "noncharacter-in-input-stream:%05x" cp)
166 | None -> ())
167 | Some c3 ->
168 push_back_char t c3;