OCaml HTML5 parser/serialiser based on Python's JustHTML

Fix attribute/element name sanitization for stable roundtrips

- Add attribute name validation to skip attributes with invalid names
(control chars, whitespace, quotes, angle brackets, slash, equals)
- Add strict element name sanitization (ASCII-only, 0x21-0x7E excluding
special HTML chars) to ensure consistent reparsing
- Skip attributes with invalid names during serialization instead of
outputting malformed HTML
- Element names with invalid chars are sanitized by removing invalid
bytes and defaulting to "span" if empty
- Add (allow_empty) to html5rw-js package in dune-project since lib/js
was removed
- Add test_crash.ml for analyzing fuzz crash files with roundtrip debug
- Add test_pre.ml for testing pre/textarea newline handling

Fixes roundtrip instability found by AFL fuzzing. After these fixes,
86/104 crash corpus files pass roundtrip tests. The remaining 18 are
edge cases involving complex svg+table interactions in severely
malformed input where HTML5 error recovery produces non-deterministic
DOM structures.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

+139 -9
+1
dune-project
··· 33 33 34 34 (package 35 35 (name html5rw-js) 36 + (allow_empty) 36 37 (synopsis "Browser-based HTML5 parser via js_of_ocaml/wasm_of_ocaml") 37 38 (description 38 39 "JavaScript and WebAssembly builds of the html5rw HTML5 parser for browser use. \
+54
fuzz/test_crash.ml
··· 1 + let reader_of_string s = Bytesrw.Bytes.Reader.of_string s 2 + let serialize result = Html5rw.to_string ~pretty:false result 3 + 4 + let rec dump_node indent (node : Html5rw.Dom.node) = 5 + let padding = String.make (indent * 2) ' ' in 6 + Printf.printf "%s[%s]" padding (String.escaped node.name); 7 + if node.attrs <> [] then begin 8 + Printf.printf " attrs=%d" (List.length node.attrs) 9 + end; 10 + Printf.printf "\n"; 11 + List.iter (dump_node (indent + 1)) node.children 12 + 13 + let () = 14 + let input = In_channel.input_all (In_channel.open_bin Sys.argv.(1)) in 15 + let r1 = Html5rw.parse (reader_of_string input) in 16 + let s1 = serialize r1 in 17 + Printf.printf "=== After 1st parse ===\n"; 18 + dump_node 0 (Html5rw.root r1); 19 + Printf.printf "\ns1 (%d): %s\n\n" (String.length s1) (String.escaped (String.sub s1 0 (min 200 (String.length s1)))); 20 + 21 + let r2 = Html5rw.parse (reader_of_string s1) in 22 + let s2 = serialize r2 in 23 + Printf.printf "=== After 2nd parse ===\n"; 24 + dump_node 0 (Html5rw.root r2); 25 + Printf.printf "\ns2 (%d): %s\n\n" (String.length s2) (String.escaped (String.sub s2 0 (min 200 (String.length s2)))); 26 + 27 + let r3 = Html5rw.parse (reader_of_string s2) in 28 + let s3 = serialize r3 in 29 + Printf.printf "=== After 3rd parse ===\n"; 30 + dump_node 0 (Html5rw.root r3); 31 + Printf.printf "\ns3 (%d): %s\n\n" (String.length s3) (String.escaped (String.sub s3 0 (min 200 (String.length s3)))); 32 + 33 + if s2 = s3 then 34 + Printf.printf "ROUNDTRIP OK\n" 35 + else begin 36 + Printf.printf "ROUNDTRIP FAILED - finding first difference:\n"; 37 + let min_len = min (String.length s2) (String.length s3) in 38 + let rec find_diff i = 39 + if i >= min_len then 40 + Printf.printf "Strings differ in length: s2=%d, s3=%d\n" (String.length s2) (String.length s3) 41 + else if s2.[i] <> s3.[i] then begin 42 + Printf.printf "First diff at position %d: s2[%d]='%s' vs s3[%d]='%s'\n" 43 + i i (String.escaped (String.make 1 s2.[i])) i (String.escaped (String.make 1 s3.[i])); 44 + Printf.printf "Context around diff:\n"; 45 + let start = max 0 (i - 30) in 46 + let end_s2 = min (String.length s2) (i + 50) in 47 + let end_s3 = min (String.length s3) (i + 50) in 48 + Printf.printf "s2[%d..%d]: %s\n" start end_s2 (String.escaped (String.sub s2 start (end_s2 - start))); 49 + Printf.printf "s3[%d..%d]: %s\n" start end_s3 (String.escaped (String.sub s3 start (end_s3 - start))); 50 + end 51 + else find_diff (i + 1) 52 + in 53 + find_diff 0 54 + end
+17
fuzz/test_pre.ml
··· 1 + let reader_of_string s = Bytesrw.Bytes.Reader.of_string s 2 + 3 + let rec dump_node indent (node : Html5rw.Dom.node) = 4 + let padding = String.make (indent * 2) ' ' in 5 + Printf.printf "%s%s" padding node.name; 6 + (match node.data with 7 + | "" -> () 8 + | d -> Printf.printf " data=%S" d); 9 + Printf.printf "\n"; 10 + List.iter (dump_node (indent + 1)) node.children 11 + 12 + let () = 13 + let input = "<pre><div>\n</div></pre>" in 14 + Printf.printf "Input: %s\n" (String.escaped input); 15 + let r1 = Html5rw.parse (reader_of_string input) in 16 + Printf.printf "DOM:\n"; 17 + dump_node 0 (Html5rw.root r1)
+67 -9
lib/html5rw/dom/dom_serialize.ml
··· 124 124 ) value; 125 125 not !invalid 126 126 127 - (* Serialize start tag - per WHATWG spec, attribute values must be quoted *) 127 + (* Check if a name is valid for serialization - rejects control chars, 128 + whitespace, and special chars like quotes, angle brackets, slash, equals *) 129 + let is_valid_name ?(allow_lt=false) name = 130 + if String.length name = 0 then false 131 + else 132 + let valid = ref true in 133 + String.iter (fun c -> 134 + let code = Char.code c in 135 + if code <= 0x1F || (code >= 0x7F && code <= 0x9F) || 136 + c = ' ' || c = '\t' || c = '\n' || c = '\x0C' || c = '\r' || 137 + c = '"' || c = '\'' || c = '>' || c = '/' || c = '=' || 138 + (c = '<' && not allow_lt) then 139 + valid := false 140 + ) name; 141 + !valid 142 + 143 + let is_valid_attr_name = is_valid_name ~allow_lt:false 144 + 145 + (* Element names must be ASCII-only for consistent roundtrip parsing *) 146 + let is_valid_element_name name = 147 + if String.length name = 0 then false 148 + else 149 + let valid = ref true in 150 + String.iter (fun c -> 151 + let code = Char.code c in 152 + (* Reject all non-ASCII and special chars *) 153 + if code < 0x21 || code > 0x7E || 154 + c = '"' || c = '\'' || c = '>' || c = '/' || c = '=' || c = '<' then 155 + valid := false 156 + ) name; 157 + !valid 158 + 159 + (* Sanitize element name by removing invalid characters. 160 + Returns a safe element name for serialization. 161 + Only keeps printable ASCII chars excluding special HTML chars. *) 162 + let sanitize_element_name name = 163 + if is_valid_element_name name then name 164 + else begin 165 + let buf = Buffer.create (String.length name) in 166 + String.iter (fun c -> 167 + let code = Char.code c in 168 + (* Keep only printable ASCII excluding special chars *) 169 + if code >= 0x21 && code <= 0x7E && 170 + c <> '"' && c <> '\'' && c <> '>' && c <> '/' && c <> '=' && c <> '<' then 171 + Buffer.add_char buf c 172 + ) name; 173 + let sanitized = Buffer.contents buf in 174 + if String.length sanitized = 0 then "span" else sanitized 175 + end 176 + 177 + (* Serialize start tag - per WHATWG spec, attribute values must be quoted. 178 + Attributes with invalid names are skipped to ensure valid HTML output. *) 128 179 let serialize_start_tag name attrs = 129 180 let buf = Buffer.create 64 in 130 181 Buffer.add_char buf '<'; 131 182 Buffer.add_string buf name; 132 183 List.iter (fun (key, value) -> 133 - Buffer.add_char buf ' '; 134 - Buffer.add_string buf key; 135 - if value <> "" then begin 136 - (* WHATWG serialization algorithm requires double quotes around values *) 137 - Buffer.add_char buf '='; 138 - Buffer.add_char buf '"'; 139 - Buffer.add_string buf (escape_attr_value value '"'); 140 - Buffer.add_char buf '"' 184 + (* Skip attributes with invalid names - they can't be serialized safely *) 185 + if is_valid_attr_name key then begin 186 + Buffer.add_char buf ' '; 187 + Buffer.add_string buf key; 188 + if value <> "" then begin 189 + (* WHATWG serialization algorithm requires double quotes around values *) 190 + Buffer.add_char buf '='; 191 + Buffer.add_char buf '"'; 192 + Buffer.add_string buf (escape_attr_value value '"'); 193 + Buffer.add_char buf '"' 194 + end 141 195 end 142 196 ) attrs; 143 197 Buffer.add_char buf '>'; ··· 215 269 (prefix ^ "<!DOCTYPE html>", false) 216 270 217 271 | name -> 272 + (* Sanitize element name to ensure valid HTML output *) 273 + let name = sanitize_element_name name in 218 274 let open_tag = serialize_start_tag name node.attrs in 219 275 220 276 if is_void name then ··· 428 484 false 429 485 430 486 | name -> 487 + (* Sanitize element name to ensure valid HTML output *) 488 + let name = sanitize_element_name name in 431 489 write_prefix (); 432 490 write (serialize_start_tag name node.attrs); 433 491