OCaml HTML5 parser/serialiser based on Python's JustHTML

fixes from using more third party libraries

+70 -137
+2
dune-project
··· 20 20 tree construction, encoding detection, and CSS selector queries.") 21 21 (depends 22 22 (ocaml (>= 5.1.0)) 23 + (astring (>= 0.8.5)) 23 24 (bytesrw (>= 0.3.0)) 24 25 (uutf (>= 1.0.0)) 26 + (uuuu (>= 0.3.0)) 25 27 (odoc :with-doc) 26 28 (jsont (and :with-test (>= 0.2.0)))))
+2
html5rw.opam
··· 11 11 depends: [ 12 12 "dune" {>= "3.20"} 13 13 "ocaml" {>= "5.1.0"} 14 + "astring" {>= "0.8.5"} 14 15 "bytesrw" {>= "0.3.0"} 15 16 "uutf" {>= "1.0.0"} 17 + "uuuu" {>= "0.3.0"} 16 18 "odoc" {with-doc} 17 19 "jsont" {with-test & >= "0.2.0"} 18 20 ]
+16 -58
lib/encoding/decode.ml
··· 1 1 (* HTML5 encoding detection and decoding *) 2 2 3 + (* UTF-8 replacement character *) 4 + let replacement_char = Uchar.of_int 0xFFFD 5 + 3 6 let decode_utf16 data ~is_le ~bom_len = 4 7 let len = Bytes.length data in 5 8 let buf = Buffer.create len in ··· 28 31 let high = code_unit - 0xD800 in 29 32 let low = code_unit2 - 0xDC00 in 30 33 let cp = 0x10000 + (high lsl 10) lor low in 31 - Buffer.add_char buf (Char.chr (0xF0 lor (cp lsr 18))); 32 - Buffer.add_char buf (Char.chr (0x80 lor ((cp lsr 12) land 0x3F))); 33 - Buffer.add_char buf (Char.chr (0x80 lor ((cp lsr 6) land 0x3F))); 34 - Buffer.add_char buf (Char.chr (0x80 lor (cp land 0x3F))) 34 + Uutf.Buffer.add_utf_8 buf (Uchar.of_int cp) 35 35 end else begin 36 36 (* Invalid surrogate, output replacement *) 37 - Buffer.add_string buf "\xEF\xBF\xBD" 37 + Uutf.Buffer.add_utf_8 buf replacement_char 38 38 end 39 39 end else if code_unit >= 0xD800 && code_unit <= 0xDFFF then begin 40 40 (* Lone surrogate *) 41 - Buffer.add_string buf "\xEF\xBF\xBD" 42 - end else if code_unit <= 0x7F then begin 43 - Buffer.add_char buf (Char.chr code_unit) 44 - end else if code_unit <= 0x7FF then begin 45 - Buffer.add_char buf (Char.chr (0xC0 lor (code_unit lsr 6))); 46 - Buffer.add_char buf (Char.chr (0x80 lor (code_unit land 0x3F))) 41 + Uutf.Buffer.add_utf_8 buf replacement_char 47 42 end else begin 48 - Buffer.add_char buf (Char.chr (0xE0 lor (code_unit lsr 12))); 49 - Buffer.add_char buf (Char.chr (0x80 lor ((code_unit lsr 6) land 0x3F))); 50 - Buffer.add_char buf (Char.chr (0x80 lor (code_unit land 0x3F))) 43 + Uutf.Buffer.add_utf_8 buf (Uchar.of_int code_unit) 51 44 end 52 45 done; 53 46 54 47 (* Odd trailing byte *) 55 - if !i < len then Buffer.add_string buf "\xEF\xBF\xBD"; 48 + if !i < len then Uutf.Buffer.add_utf_8 buf replacement_char; 56 49 57 50 Buffer.contents buf 58 51 ··· 85 78 | Encoding.Utf16be -> decode_utf16 data ~is_le:false ~bom_len 86 79 87 80 | Encoding.Windows_1252 -> 81 + (* Windows-1252 mapping table for 0x80-0x9F range *) 88 82 let len = Bytes.length data in 89 83 let buf = Buffer.create len in 90 84 let table = [| ··· 100 94 if b >= 0x80 && b <= 0x9F then table.(b - 0x80) 101 95 else b 102 96 in 103 - if cp <= 0x7F then 104 - Buffer.add_char buf (Char.chr cp) 105 - else if cp <= 0x7FF then begin 106 - Buffer.add_char buf (Char.chr (0xC0 lor (cp lsr 6))); 107 - Buffer.add_char buf (Char.chr (0x80 lor (cp land 0x3F))) 108 - end else begin 109 - Buffer.add_char buf (Char.chr (0xE0 lor (cp lsr 12))); 110 - Buffer.add_char buf (Char.chr (0x80 lor ((cp lsr 6) land 0x3F))); 111 - Buffer.add_char buf (Char.chr (0x80 lor (cp land 0x3F))) 112 - end 97 + Uutf.Buffer.add_utf_8 buf (Uchar.of_int cp) 113 98 done; 114 99 Buffer.contents buf 115 100 116 101 | Encoding.Iso_8859_2 -> 102 + (* Use uuuu for ISO-8859-2 decoding *) 117 103 let len = Bytes.length data in 118 104 let buf = Buffer.create len in 119 - let table = [| 120 - (* 0xA0-0xBF *) 121 - 0x00A0; 0x0104; 0x02D8; 0x0141; 0x00A4; 0x013D; 0x015A; 0x00A7; 122 - 0x00A8; 0x0160; 0x015E; 0x0164; 0x0179; 0x00AD; 0x017D; 0x017B; 123 - 0x00B0; 0x0105; 0x02DB; 0x0142; 0x00B4; 0x013E; 0x015B; 0x02C7; 124 - 0x00B8; 0x0161; 0x015F; 0x0165; 0x017A; 0x02DD; 0x017E; 0x017C; 125 - (* 0xC0-0xFF *) 126 - 0x0154; 0x00C1; 0x00C2; 0x0102; 0x00C4; 0x0139; 0x0106; 0x00C7; 127 - 0x010C; 0x00C9; 0x0118; 0x00CB; 0x011A; 0x00CD; 0x00CE; 0x010E; 128 - 0x0110; 0x0143; 0x0147; 0x00D3; 0x00D4; 0x0150; 0x00D6; 0x00D7; 129 - 0x0158; 0x016E; 0x00DA; 0x0170; 0x00DC; 0x00DD; 0x0162; 0x00DF; 130 - 0x0155; 0x00E1; 0x00E2; 0x0103; 0x00E4; 0x013A; 0x0107; 0x00E7; 131 - 0x010D; 0x00E9; 0x0119; 0x00EB; 0x011B; 0x00ED; 0x00EE; 0x010F; 132 - 0x0111; 0x0144; 0x0148; 0x00F3; 0x00F4; 0x0151; 0x00F6; 0x00F7; 133 - 0x0159; 0x016F; 0x00FA; 0x0171; 0x00FC; 0x00FD; 0x0163; 0x02D9; 134 - |] in 135 - for i = bom_len to len - 1 do 136 - let b = Char.code (Bytes.get data i) in 137 - let cp = 138 - if b >= 0xA0 then table.(b - 0xA0) 139 - else b 140 - in 141 - if cp <= 0x7F then 142 - Buffer.add_char buf (Char.chr cp) 143 - else if cp <= 0x7FF then begin 144 - Buffer.add_char buf (Char.chr (0xC0 lor (cp lsr 6))); 145 - Buffer.add_char buf (Char.chr (0x80 lor (cp land 0x3F))) 146 - end else begin 147 - Buffer.add_char buf (Char.chr (0xE0 lor (cp lsr 12))); 148 - Buffer.add_char buf (Char.chr (0x80 lor ((cp lsr 6) land 0x3F))); 149 - Buffer.add_char buf (Char.chr (0x80 lor (cp land 0x3F))) 150 - end 151 - done; 105 + let s = Bytes.sub_string data bom_len (len - bom_len) in 106 + Uuuu.String.fold `ISO_8859_2 (fun () _pos -> function 107 + | `Uchar u -> Uutf.Buffer.add_utf_8 buf u 108 + | `Malformed _ -> Uutf.Buffer.add_utf_8 buf replacement_char 109 + ) () s; 152 110 Buffer.contents buf 153 111 154 112 | Encoding.Euc_jp ->
+1 -1
lib/encoding/dune
··· 1 1 (library 2 2 (name html5rw_encoding) 3 3 (public_name html5rw.encoding) 4 - (libraries bytesrw uutf)) 4 + (libraries astring bytesrw uutf uuuu))
+1 -1
lib/encoding/labels.ml
··· 3 3 let normalize_label label = 4 4 if String.length label = 0 then None 5 5 else 6 - let s = String.lowercase_ascii (String.trim label) in 6 + let s = Astring.String.Ascii.lowercase (Astring.String.trim label) in 7 7 if String.length s = 0 then None 8 8 else 9 9 (* Security: never allow utf-7 *)
+3 -10
lib/encoding/prescan.ml
··· 1 1 (* HTML meta charset prescan per WHATWG spec *) 2 2 3 - let ascii_whitespace = ['\x09'; '\x0A'; '\x0C'; '\x0D'; '\x20'] 4 - 5 - let is_ascii_whitespace c = List.mem c ascii_whitespace 6 - 7 - let is_ascii_alpha c = 8 - (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') 9 - 10 - let ascii_lower c = 11 - if c >= 'A' && c <= 'Z' then Char.chr (Char.code c + 32) 12 - else c 3 + (* Character classification using Astring *) 4 + let is_ascii_whitespace c = c = '\x09' || c = '\x0A' || c = '\x0C' || c = '\x0D' || c = '\x20' 5 + let is_ascii_alpha = Astring.Char.Ascii.is_letter 13 6 14 7 let skip_whitespace data i len = 15 8 let j = ref i in
+4 -11
lib/entities/decode.ml
··· 1 1 (* HTML5 entity decoding *) 2 2 3 - let is_alpha c = 4 - (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') 5 - 6 - let is_alnum c = 7 - is_alpha c || (c >= '0' && c <= '9') 8 - 9 - let is_hex_digit c = 10 - (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F') 11 - 12 - let is_digit c = 13 - c >= '0' && c <= '9' 3 + (* Character classification using Astring *) 4 + let is_alnum = Astring.Char.Ascii.is_alphanum 5 + let is_hex_digit = Astring.Char.Ascii.is_hex_digit 6 + let is_digit = Astring.Char.Ascii.is_digit 14 7 15 8 let decode_entities_in_text text ~in_attribute = 16 9 let len = String.length text in
+2 -1
lib/entities/dune
··· 1 1 (library 2 2 (name html5rw_entities) 3 - (public_name html5rw.entities)) 3 + (public_name html5rw.entities) 4 + (libraries astring uutf)) 4 5 5 6 (rule 6 7 (target entity_table.ml)
+2 -15
lib/entities/numeric_ref.ml
··· 43 43 in 44 44 search 0 45 45 46 + (* Encode a Unicode codepoint to UTF-8 using uutf *) 46 47 let codepoint_to_utf8 cp = 47 48 let buf = Buffer.create 4 in 48 - if cp <= 0x7F then 49 - Buffer.add_char buf (Char.chr cp) 50 - else if cp <= 0x7FF then begin 51 - Buffer.add_char buf (Char.chr (0xC0 lor (cp lsr 6))); 52 - Buffer.add_char buf (Char.chr (0x80 lor (cp land 0x3F))) 53 - end else if cp <= 0xFFFF then begin 54 - Buffer.add_char buf (Char.chr (0xE0 lor (cp lsr 12))); 55 - Buffer.add_char buf (Char.chr (0x80 lor ((cp lsr 6) land 0x3F))); 56 - Buffer.add_char buf (Char.chr (0x80 lor (cp land 0x3F))) 57 - end else begin 58 - Buffer.add_char buf (Char.chr (0xF0 lor (cp lsr 18))); 59 - Buffer.add_char buf (Char.chr (0x80 lor ((cp lsr 12) land 0x3F))); 60 - Buffer.add_char buf (Char.chr (0x80 lor ((cp lsr 6) land 0x3F))); 61 - Buffer.add_char buf (Char.chr (0x80 lor (cp land 0x3F))) 62 - end; 49 + Uutf.Buffer.add_utf_8 buf (Uchar.of_int cp); 63 50 Buffer.contents buf 64 51 65 52 let replacement_char = "\xEF\xBF\xBD" (* U+FFFD in UTF-8 *)
+7 -4
lib/parser/constants.ml
··· 1 1 (* HTML5 spec constants *) 2 2 3 + (* Use Astring for string operations *) 4 + let lowercase = Astring.String.Ascii.lowercase 5 + 3 6 (* Void elements - no end tag allowed *) 4 7 let void_elements = [ 5 8 "area"; "base"; "br"; "col"; "embed"; "hr"; "img"; "input"; ··· 70 73 71 74 let adjust_mathml_attrs attrs = 72 75 List.map (fun (k, v) -> 73 - match List.assoc_opt (String.lowercase_ascii k) mathml_attr_adjustments with 76 + match List.assoc_opt (lowercase k) mathml_attr_adjustments with 74 77 | Some adjusted_k -> (adjusted_k, v) 75 78 | None -> (k, v) 76 79 ) attrs ··· 282 285 let is_heading = List.mem 283 286 284 287 let adjust_svg_tag_name name = 285 - match List.assoc_opt (String.lowercase_ascii name) svg_tag_adjustments with 288 + match List.assoc_opt (lowercase name) svg_tag_adjustments with 286 289 | Some adjusted -> adjusted 287 290 | None -> name 288 291 289 292 let adjust_svg_attrs attrs = 290 293 List.map (fun (name, value) -> 291 294 let adjusted_name = 292 - match List.assoc_opt (String.lowercase_ascii name) svg_attr_adjustments with 295 + match List.assoc_opt (lowercase name) svg_attr_adjustments with 293 296 | Some n -> n 294 297 | None -> name 295 298 in ··· 298 301 299 302 let adjust_foreign_attrs attrs = 300 303 List.map (fun (name, value) -> 301 - match List.assoc_opt (String.lowercase_ascii name) foreign_attr_adjustments with 304 + match List.assoc_opt (lowercase name) foreign_attr_adjustments with 302 305 | Some (prefix, local, _ns) -> 303 306 if prefix = "" then (local, value) 304 307 else (prefix ^ ":" ^ local, value)
+1 -1
lib/parser/dune
··· 1 1 (library 2 2 (name html5rw_parser) 3 3 (public_name html5rw.parser) 4 - (libraries bytesrw html5rw.tokenizer html5rw.dom html5rw.encoding html5rw.selector)) 4 + (libraries astring bytesrw html5rw.tokenizer html5rw.dom html5rw.encoding html5rw.selector))
+1 -1
lib/selector/dune
··· 1 1 (library 2 2 (name html5rw_selector) 3 3 (public_name html5rw.selector) 4 - (libraries html5rw.dom)) 4 + (libraries astring html5rw.dom))
+21 -25
lib/selector/selector_match.ml
··· 3 3 module Dom = Html5rw_dom 4 4 open Selector_ast 5 5 6 + (* Use Astring for string operations *) 7 + let lowercase = Astring.String.Ascii.lowercase 8 + let trim = Astring.String.trim 9 + let find_sub = Astring.String.find_sub 10 + let fields = Astring.String.fields 11 + 6 12 (* Check if haystack contains needle as a substring *) 7 13 let string_contains ~haystack ~needle = 8 - let needle_len = String.length needle in 9 - let haystack_len = String.length haystack in 10 - if needle_len > haystack_len then false 11 - else if needle_len = 0 then true 12 - else 13 - let rec check i = 14 - if i > haystack_len - needle_len then false 15 - else if String.sub haystack i needle_len = needle then true 16 - else check (i + 1) 17 - in 18 - check 0 14 + Option.is_some (find_sub ~sub:needle haystack) 19 15 20 16 let is_element node = 21 17 let name = node.Dom.name in ··· 58 54 match node.Dom.parent with 59 55 | None -> false 60 56 | Some parent -> 61 - let name = String.lowercase_ascii node.Dom.name in 57 + let name = lowercase node.Dom.name in 62 58 let rec find = function 63 59 | [] -> false 64 - | n :: _ when String.lowercase_ascii n.Dom.name = name -> n == node 60 + | n :: _ when lowercase n.Dom.name = name -> n == node 65 61 | _ :: rest -> find rest 66 62 in 67 63 find (get_element_children parent) ··· 70 66 match node.Dom.parent with 71 67 | None -> false 72 68 | Some parent -> 73 - let name = String.lowercase_ascii node.Dom.name in 69 + let name = lowercase node.Dom.name in 74 70 let rec find last = function 75 71 | [] -> (match last with Some l -> l == node | None -> false) 76 - | n :: rest when String.lowercase_ascii n.Dom.name = name -> find (Some n) rest 72 + | n :: rest when lowercase n.Dom.name = name -> find (Some n) rest 77 73 | _ :: rest -> find last rest 78 74 in 79 75 find None (get_element_children parent) ··· 94 90 match node.Dom.parent with 95 91 | None -> 0 96 92 | Some parent -> 97 - let name = String.lowercase_ascii node.Dom.name in 93 + let name = lowercase node.Dom.name in 98 94 let children = get_element_children parent in 99 95 let rec find idx = function 100 96 | [] -> 0 101 97 | n :: _ when n == node -> idx 102 - | n :: rest when String.lowercase_ascii n.Dom.name = name -> find (idx + 1) rest 98 + | n :: rest when lowercase n.Dom.name = name -> find (idx + 1) rest 103 99 | _ :: rest -> find idx rest 104 100 in 105 101 find 1 children 106 102 107 103 (* Parse nth expression: "odd", "even", "3", "2n+1", etc *) 108 104 let parse_nth expr = 109 - let expr = String.lowercase_ascii (String.trim expr) in 105 + let expr = lowercase (trim expr) in 110 106 if expr = "odd" then Some (2, 1) 111 107 else if expr = "even" then Some (2, 0) 112 108 else 113 - let expr = String.concat "" (String.split_on_char ' ' expr) in 109 + let expr = String.concat "" (fields ~is_sep:(fun c -> c = ' ') expr) in 114 110 if String.contains expr 'n' then 115 111 let parts = String.split_on_char 'n' expr in 116 112 match parts with ··· 145 141 | Type_universal -> true 146 142 | Type_tag -> 147 143 (match selector.name with 148 - | Some name -> String.lowercase_ascii node.Dom.name = String.lowercase_ascii name 144 + | Some name -> lowercase node.Dom.name = lowercase name 149 145 | None -> false) 150 146 | Type_id -> 151 147 (match selector.name with ··· 159 155 | Some cls -> 160 156 (match Dom.get_attr node "class" with 161 157 | Some class_attr -> 162 - let classes = String.split_on_char ' ' class_attr in 158 + let classes = fields ~is_sep:(fun c -> c = ' ') class_attr in 163 159 List.mem cls classes 164 160 | None -> false) 165 161 | None -> false) 166 162 | Type_attr -> 167 163 (match selector.name with 168 164 | Some attr_name -> 169 - let attr_name_lower = String.lowercase_ascii attr_name in 165 + let attr_name_lower = lowercase attr_name in 170 166 let node_value = 171 167 List.find_map (fun (k, v) -> 172 - if String.lowercase_ascii k = attr_name_lower then Some v 168 + if lowercase k = attr_name_lower then Some v 173 169 else None 174 170 ) node.Dom.attrs 175 171 in ··· 181 177 (match selector.operator with 182 178 | Some "=" -> attr_value = value 183 179 | Some "~=" -> 184 - let words = String.split_on_char ' ' attr_value in 180 + let words = fields ~is_sep:(fun c -> c = ' ') attr_value in 185 181 List.mem value words 186 182 | Some "|=" -> 187 183 attr_value = value || String.length attr_value > String.length value && ··· 204 200 | Some "only-of-type" -> is_first_of_type node && is_last_of_type node 205 201 | Some "empty" -> 206 202 not (List.exists (fun c -> 207 - is_element c || (c.Dom.name = "#text" && String.trim c.Dom.data <> "") 203 + is_element c || (c.Dom.name = "#text" && trim c.Dom.data <> "") 208 204 ) node.Dom.children) 209 205 | Some "root" -> 210 206 (match node.Dom.parent with
+1 -1
lib/tokenizer/dune
··· 1 1 (library 2 2 (name html5rw_tokenizer) 3 3 (public_name html5rw.tokenizer) 4 - (libraries bytesrw html5rw.entities)) 4 + (libraries astring bytesrw html5rw.entities))
+6 -8
lib/tokenizer/tokenizer.ml
··· 1 1 (* HTML5 Tokenizer - implements WHATWG tokenization algorithm *) 2 2 3 - let is_ascii_alpha c = (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') 4 - let is_ascii_upper c = c >= 'A' && c <= 'Z' 5 - let is_ascii_digit c = c >= '0' && c <= '9' 6 - let is_ascii_hex c = is_ascii_digit c || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F') 7 - let is_ascii_alnum c = is_ascii_alpha c || is_ascii_digit c 3 + (* Character classification using Astring *) 4 + let is_ascii_alpha = Astring.Char.Ascii.is_letter 5 + let is_ascii_digit = Astring.Char.Ascii.is_digit 6 + let is_ascii_hex = Astring.Char.Ascii.is_hex_digit 7 + let is_ascii_alnum = Astring.Char.Ascii.is_alphanum 8 8 let is_whitespace c = c = ' ' || c = '\t' || c = '\n' || c = '\x0C' || c = '\r' 9 - 10 - let ascii_lower c = 11 - if is_ascii_upper c then Char.chr (Char.code c + 32) else c 9 + let ascii_lower = Astring.Char.Ascii.lowercase 12 10 13 11 (* Token sink interface *) 14 12 module type SINK = sig