OCaml HTML5 parser/serialiser based on Python's JustHTML

fixes from using more third party libraries

+70 -137
+2
dune-project
··· 20 tree construction, encoding detection, and CSS selector queries.") 21 (depends 22 (ocaml (>= 5.1.0)) 23 (bytesrw (>= 0.3.0)) 24 (uutf (>= 1.0.0)) 25 (odoc :with-doc) 26 (jsont (and :with-test (>= 0.2.0)))))
··· 20 tree construction, encoding detection, and CSS selector queries.") 21 (depends 22 (ocaml (>= 5.1.0)) 23 + (astring (>= 0.8.5)) 24 (bytesrw (>= 0.3.0)) 25 (uutf (>= 1.0.0)) 26 + (uuuu (>= 0.3.0)) 27 (odoc :with-doc) 28 (jsont (and :with-test (>= 0.2.0)))))
+2
html5rw.opam
··· 11 depends: [ 12 "dune" {>= "3.20"} 13 "ocaml" {>= "5.1.0"} 14 "bytesrw" {>= "0.3.0"} 15 "uutf" {>= "1.0.0"} 16 "odoc" {with-doc} 17 "jsont" {with-test & >= "0.2.0"} 18 ]
··· 11 depends: [ 12 "dune" {>= "3.20"} 13 "ocaml" {>= "5.1.0"} 14 + "astring" {>= "0.8.5"} 15 "bytesrw" {>= "0.3.0"} 16 "uutf" {>= "1.0.0"} 17 + "uuuu" {>= "0.3.0"} 18 "odoc" {with-doc} 19 "jsont" {with-test & >= "0.2.0"} 20 ]
+16 -58
lib/encoding/decode.ml
··· 1 (* HTML5 encoding detection and decoding *) 2 3 let decode_utf16 data ~is_le ~bom_len = 4 let len = Bytes.length data in 5 let buf = Buffer.create len in ··· 28 let high = code_unit - 0xD800 in 29 let low = code_unit2 - 0xDC00 in 30 let cp = 0x10000 + (high lsl 10) lor low in 31 - Buffer.add_char buf (Char.chr (0xF0 lor (cp lsr 18))); 32 - Buffer.add_char buf (Char.chr (0x80 lor ((cp lsr 12) land 0x3F))); 33 - Buffer.add_char buf (Char.chr (0x80 lor ((cp lsr 6) land 0x3F))); 34 - Buffer.add_char buf (Char.chr (0x80 lor (cp land 0x3F))) 35 end else begin 36 (* Invalid surrogate, output replacement *) 37 - Buffer.add_string buf "\xEF\xBF\xBD" 38 end 39 end else if code_unit >= 0xD800 && code_unit <= 0xDFFF then begin 40 (* Lone surrogate *) 41 - Buffer.add_string buf "\xEF\xBF\xBD" 42 - end else if code_unit <= 0x7F then begin 43 - Buffer.add_char buf (Char.chr code_unit) 44 - end else if code_unit <= 0x7FF then begin 45 - Buffer.add_char buf (Char.chr (0xC0 lor (code_unit lsr 6))); 46 - Buffer.add_char buf (Char.chr (0x80 lor (code_unit land 0x3F))) 47 end else begin 48 - Buffer.add_char buf (Char.chr (0xE0 lor (code_unit lsr 12))); 49 - Buffer.add_char buf (Char.chr (0x80 lor ((code_unit lsr 6) land 0x3F))); 50 - Buffer.add_char buf (Char.chr (0x80 lor (code_unit land 0x3F))) 51 end 52 done; 53 54 (* Odd trailing byte *) 55 - if !i < len then Buffer.add_string buf "\xEF\xBF\xBD"; 56 57 Buffer.contents buf 58 ··· 85 | Encoding.Utf16be -> decode_utf16 data ~is_le:false ~bom_len 86 87 | Encoding.Windows_1252 -> 88 let len = Bytes.length data in 89 let buf = Buffer.create len in 90 let table = [| ··· 100 if b >= 0x80 && b <= 0x9F then table.(b - 0x80) 101 else b 102 in 103 - if cp <= 0x7F then 104 - Buffer.add_char buf (Char.chr cp) 105 - else if cp <= 0x7FF then begin 106 - Buffer.add_char buf (Char.chr (0xC0 lor (cp lsr 6))); 107 - Buffer.add_char buf (Char.chr (0x80 lor (cp land 0x3F))) 108 - end else begin 109 - Buffer.add_char buf (Char.chr (0xE0 lor (cp lsr 12))); 110 - Buffer.add_char buf (Char.chr (0x80 lor ((cp lsr 6) land 0x3F))); 111 - Buffer.add_char buf (Char.chr (0x80 lor (cp land 0x3F))) 112 - end 113 done; 114 Buffer.contents buf 115 116 | Encoding.Iso_8859_2 -> 117 let len = Bytes.length data in 118 let buf = Buffer.create len in 119 - let table = [| 120 - (* 0xA0-0xBF *) 121 - 0x00A0; 0x0104; 0x02D8; 0x0141; 0x00A4; 0x013D; 0x015A; 0x00A7; 122 - 0x00A8; 0x0160; 0x015E; 0x0164; 0x0179; 0x00AD; 0x017D; 0x017B; 123 - 0x00B0; 0x0105; 0x02DB; 0x0142; 0x00B4; 0x013E; 0x015B; 0x02C7; 124 - 0x00B8; 0x0161; 0x015F; 0x0165; 0x017A; 0x02DD; 0x017E; 0x017C; 125 - (* 0xC0-0xFF *) 126 - 0x0154; 0x00C1; 0x00C2; 0x0102; 0x00C4; 0x0139; 0x0106; 0x00C7; 127 - 0x010C; 0x00C9; 0x0118; 0x00CB; 0x011A; 0x00CD; 0x00CE; 0x010E; 128 - 0x0110; 0x0143; 0x0147; 0x00D3; 0x00D4; 0x0150; 0x00D6; 0x00D7; 129 - 0x0158; 0x016E; 0x00DA; 0x0170; 0x00DC; 0x00DD; 0x0162; 0x00DF; 130 - 0x0155; 0x00E1; 0x00E2; 0x0103; 0x00E4; 0x013A; 0x0107; 0x00E7; 131 - 0x010D; 0x00E9; 0x0119; 0x00EB; 0x011B; 0x00ED; 0x00EE; 0x010F; 132 - 0x0111; 0x0144; 0x0148; 0x00F3; 0x00F4; 0x0151; 0x00F6; 0x00F7; 133 - 0x0159; 0x016F; 0x00FA; 0x0171; 0x00FC; 0x00FD; 0x0163; 0x02D9; 134 - |] in 135 - for i = bom_len to len - 1 do 136 - let b = Char.code (Bytes.get data i) in 137 - let cp = 138 - if b >= 0xA0 then table.(b - 0xA0) 139 - else b 140 - in 141 - if cp <= 0x7F then 142 - Buffer.add_char buf (Char.chr cp) 143 - else if cp <= 0x7FF then begin 144 - Buffer.add_char buf (Char.chr (0xC0 lor (cp lsr 6))); 145 - Buffer.add_char buf (Char.chr (0x80 lor (cp land 0x3F))) 146 - end else begin 147 - Buffer.add_char buf (Char.chr (0xE0 lor (cp lsr 12))); 148 - Buffer.add_char buf (Char.chr (0x80 lor ((cp lsr 6) land 0x3F))); 149 - Buffer.add_char buf (Char.chr (0x80 lor (cp land 0x3F))) 150 - end 151 - done; 152 Buffer.contents buf 153 154 | Encoding.Euc_jp ->
··· 1 (* HTML5 encoding detection and decoding *) 2 3 + (* UTF-8 replacement character *) 4 + let replacement_char = Uchar.of_int 0xFFFD 5 + 6 let decode_utf16 data ~is_le ~bom_len = 7 let len = Bytes.length data in 8 let buf = Buffer.create len in ··· 31 let high = code_unit - 0xD800 in 32 let low = code_unit2 - 0xDC00 in 33 let cp = 0x10000 + (high lsl 10) lor low in 34 + Uutf.Buffer.add_utf_8 buf (Uchar.of_int cp) 35 end else begin 36 (* Invalid surrogate, output replacement *) 37 + Uutf.Buffer.add_utf_8 buf replacement_char 38 end 39 end else if code_unit >= 0xD800 && code_unit <= 0xDFFF then begin 40 (* Lone surrogate *) 41 + Uutf.Buffer.add_utf_8 buf replacement_char 42 end else begin 43 + Uutf.Buffer.add_utf_8 buf (Uchar.of_int code_unit) 44 end 45 done; 46 47 (* Odd trailing byte *) 48 + if !i < len then Uutf.Buffer.add_utf_8 buf replacement_char; 49 50 Buffer.contents buf 51 ··· 78 | Encoding.Utf16be -> decode_utf16 data ~is_le:false ~bom_len 79 80 | Encoding.Windows_1252 -> 81 + (* Windows-1252 mapping table for 0x80-0x9F range *) 82 let len = Bytes.length data in 83 let buf = Buffer.create len in 84 let table = [| ··· 94 if b >= 0x80 && b <= 0x9F then table.(b - 0x80) 95 else b 96 in 97 + Uutf.Buffer.add_utf_8 buf (Uchar.of_int cp) 98 done; 99 Buffer.contents buf 100 101 | Encoding.Iso_8859_2 -> 102 + (* Use uuuu for ISO-8859-2 decoding *) 103 let len = Bytes.length data in 104 let buf = Buffer.create len in 105 + let s = Bytes.sub_string data bom_len (len - bom_len) in 106 + Uuuu.String.fold `ISO_8859_2 (fun () _pos -> function 107 + | `Uchar u -> Uutf.Buffer.add_utf_8 buf u 108 + | `Malformed _ -> Uutf.Buffer.add_utf_8 buf replacement_char 109 + ) () s; 110 Buffer.contents buf 111 112 | Encoding.Euc_jp ->
+1 -1
lib/encoding/dune
··· 1 (library 2 (name html5rw_encoding) 3 (public_name html5rw.encoding) 4 - (libraries bytesrw uutf))
··· 1 (library 2 (name html5rw_encoding) 3 (public_name html5rw.encoding) 4 + (libraries astring bytesrw uutf uuuu))
+1 -1
lib/encoding/labels.ml
··· 3 let normalize_label label = 4 if String.length label = 0 then None 5 else 6 - let s = String.lowercase_ascii (String.trim label) in 7 if String.length s = 0 then None 8 else 9 (* Security: never allow utf-7 *)
··· 3 let normalize_label label = 4 if String.length label = 0 then None 5 else 6 + let s = Astring.String.Ascii.lowercase (Astring.String.trim label) in 7 if String.length s = 0 then None 8 else 9 (* Security: never allow utf-7 *)
+3 -10
lib/encoding/prescan.ml
··· 1 (* HTML meta charset prescan per WHATWG spec *) 2 3 - let ascii_whitespace = ['\x09'; '\x0A'; '\x0C'; '\x0D'; '\x20'] 4 - 5 - let is_ascii_whitespace c = List.mem c ascii_whitespace 6 - 7 - let is_ascii_alpha c = 8 - (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') 9 - 10 - let ascii_lower c = 11 - if c >= 'A' && c <= 'Z' then Char.chr (Char.code c + 32) 12 - else c 13 14 let skip_whitespace data i len = 15 let j = ref i in
··· 1 (* HTML meta charset prescan per WHATWG spec *) 2 3 + (* Character classification using Astring *) 4 + let is_ascii_whitespace c = c = '\x09' || c = '\x0A' || c = '\x0C' || c = '\x0D' || c = '\x20' 5 + let is_ascii_alpha = Astring.Char.Ascii.is_letter 6 7 let skip_whitespace data i len = 8 let j = ref i in
+4 -11
lib/entities/decode.ml
··· 1 (* HTML5 entity decoding *) 2 3 - let is_alpha c = 4 - (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') 5 - 6 - let is_alnum c = 7 - is_alpha c || (c >= '0' && c <= '9') 8 - 9 - let is_hex_digit c = 10 - (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F') 11 - 12 - let is_digit c = 13 - c >= '0' && c <= '9' 14 15 let decode_entities_in_text text ~in_attribute = 16 let len = String.length text in
··· 1 (* HTML5 entity decoding *) 2 3 + (* Character classification using Astring *) 4 + let is_alnum = Astring.Char.Ascii.is_alphanum 5 + let is_hex_digit = Astring.Char.Ascii.is_hex_digit 6 + let is_digit = Astring.Char.Ascii.is_digit 7 8 let decode_entities_in_text text ~in_attribute = 9 let len = String.length text in
+2 -1
lib/entities/dune
··· 1 (library 2 (name html5rw_entities) 3 - (public_name html5rw.entities)) 4 5 (rule 6 (target entity_table.ml)
··· 1 (library 2 (name html5rw_entities) 3 + (public_name html5rw.entities) 4 + (libraries astring uutf)) 5 6 (rule 7 (target entity_table.ml)
+2 -15
lib/entities/numeric_ref.ml
··· 43 in 44 search 0 45 46 let codepoint_to_utf8 cp = 47 let buf = Buffer.create 4 in 48 - if cp <= 0x7F then 49 - Buffer.add_char buf (Char.chr cp) 50 - else if cp <= 0x7FF then begin 51 - Buffer.add_char buf (Char.chr (0xC0 lor (cp lsr 6))); 52 - Buffer.add_char buf (Char.chr (0x80 lor (cp land 0x3F))) 53 - end else if cp <= 0xFFFF then begin 54 - Buffer.add_char buf (Char.chr (0xE0 lor (cp lsr 12))); 55 - Buffer.add_char buf (Char.chr (0x80 lor ((cp lsr 6) land 0x3F))); 56 - Buffer.add_char buf (Char.chr (0x80 lor (cp land 0x3F))) 57 - end else begin 58 - Buffer.add_char buf (Char.chr (0xF0 lor (cp lsr 18))); 59 - Buffer.add_char buf (Char.chr (0x80 lor ((cp lsr 12) land 0x3F))); 60 - Buffer.add_char buf (Char.chr (0x80 lor ((cp lsr 6) land 0x3F))); 61 - Buffer.add_char buf (Char.chr (0x80 lor (cp land 0x3F))) 62 - end; 63 Buffer.contents buf 64 65 let replacement_char = "\xEF\xBF\xBD" (* U+FFFD in UTF-8 *)
··· 43 in 44 search 0 45 46 + (* Encode a Unicode codepoint to UTF-8 using uutf *) 47 let codepoint_to_utf8 cp = 48 let buf = Buffer.create 4 in 49 + Uutf.Buffer.add_utf_8 buf (Uchar.of_int cp); 50 Buffer.contents buf 51 52 let replacement_char = "\xEF\xBF\xBD" (* U+FFFD in UTF-8 *)
+7 -4
lib/parser/constants.ml
··· 1 (* HTML5 spec constants *) 2 3 (* Void elements - no end tag allowed *) 4 let void_elements = [ 5 "area"; "base"; "br"; "col"; "embed"; "hr"; "img"; "input"; ··· 70 71 let adjust_mathml_attrs attrs = 72 List.map (fun (k, v) -> 73 - match List.assoc_opt (String.lowercase_ascii k) mathml_attr_adjustments with 74 | Some adjusted_k -> (adjusted_k, v) 75 | None -> (k, v) 76 ) attrs ··· 282 let is_heading = List.mem 283 284 let adjust_svg_tag_name name = 285 - match List.assoc_opt (String.lowercase_ascii name) svg_tag_adjustments with 286 | Some adjusted -> adjusted 287 | None -> name 288 289 let adjust_svg_attrs attrs = 290 List.map (fun (name, value) -> 291 let adjusted_name = 292 - match List.assoc_opt (String.lowercase_ascii name) svg_attr_adjustments with 293 | Some n -> n 294 | None -> name 295 in ··· 298 299 let adjust_foreign_attrs attrs = 300 List.map (fun (name, value) -> 301 - match List.assoc_opt (String.lowercase_ascii name) foreign_attr_adjustments with 302 | Some (prefix, local, _ns) -> 303 if prefix = "" then (local, value) 304 else (prefix ^ ":" ^ local, value)
··· 1 (* HTML5 spec constants *) 2 3 + (* Use Astring for string operations *) 4 + let lowercase = Astring.String.Ascii.lowercase 5 + 6 (* Void elements - no end tag allowed *) 7 let void_elements = [ 8 "area"; "base"; "br"; "col"; "embed"; "hr"; "img"; "input"; ··· 73 74 let adjust_mathml_attrs attrs = 75 List.map (fun (k, v) -> 76 + match List.assoc_opt (lowercase k) mathml_attr_adjustments with 77 | Some adjusted_k -> (adjusted_k, v) 78 | None -> (k, v) 79 ) attrs ··· 285 let is_heading = List.mem 286 287 let adjust_svg_tag_name name = 288 + match List.assoc_opt (lowercase name) svg_tag_adjustments with 289 | Some adjusted -> adjusted 290 | None -> name 291 292 let adjust_svg_attrs attrs = 293 List.map (fun (name, value) -> 294 let adjusted_name = 295 + match List.assoc_opt (lowercase name) svg_attr_adjustments with 296 | Some n -> n 297 | None -> name 298 in ··· 301 302 let adjust_foreign_attrs attrs = 303 List.map (fun (name, value) -> 304 + match List.assoc_opt (lowercase name) foreign_attr_adjustments with 305 | Some (prefix, local, _ns) -> 306 if prefix = "" then (local, value) 307 else (prefix ^ ":" ^ local, value)
+1 -1
lib/parser/dune
··· 1 (library 2 (name html5rw_parser) 3 (public_name html5rw.parser) 4 - (libraries bytesrw html5rw.tokenizer html5rw.dom html5rw.encoding html5rw.selector))
··· 1 (library 2 (name html5rw_parser) 3 (public_name html5rw.parser) 4 + (libraries astring bytesrw html5rw.tokenizer html5rw.dom html5rw.encoding html5rw.selector))
+1 -1
lib/selector/dune
··· 1 (library 2 (name html5rw_selector) 3 (public_name html5rw.selector) 4 - (libraries html5rw.dom))
··· 1 (library 2 (name html5rw_selector) 3 (public_name html5rw.selector) 4 + (libraries astring html5rw.dom))
+21 -25
lib/selector/selector_match.ml
··· 3 module Dom = Html5rw_dom 4 open Selector_ast 5 6 (* Check if haystack contains needle as a substring *) 7 let string_contains ~haystack ~needle = 8 - let needle_len = String.length needle in 9 - let haystack_len = String.length haystack in 10 - if needle_len > haystack_len then false 11 - else if needle_len = 0 then true 12 - else 13 - let rec check i = 14 - if i > haystack_len - needle_len then false 15 - else if String.sub haystack i needle_len = needle then true 16 - else check (i + 1) 17 - in 18 - check 0 19 20 let is_element node = 21 let name = node.Dom.name in ··· 58 match node.Dom.parent with 59 | None -> false 60 | Some parent -> 61 - let name = String.lowercase_ascii node.Dom.name in 62 let rec find = function 63 | [] -> false 64 - | n :: _ when String.lowercase_ascii n.Dom.name = name -> n == node 65 | _ :: rest -> find rest 66 in 67 find (get_element_children parent) ··· 70 match node.Dom.parent with 71 | None -> false 72 | Some parent -> 73 - let name = String.lowercase_ascii node.Dom.name in 74 let rec find last = function 75 | [] -> (match last with Some l -> l == node | None -> false) 76 - | n :: rest when String.lowercase_ascii n.Dom.name = name -> find (Some n) rest 77 | _ :: rest -> find last rest 78 in 79 find None (get_element_children parent) ··· 94 match node.Dom.parent with 95 | None -> 0 96 | Some parent -> 97 - let name = String.lowercase_ascii node.Dom.name in 98 let children = get_element_children parent in 99 let rec find idx = function 100 | [] -> 0 101 | n :: _ when n == node -> idx 102 - | n :: rest when String.lowercase_ascii n.Dom.name = name -> find (idx + 1) rest 103 | _ :: rest -> find idx rest 104 in 105 find 1 children 106 107 (* Parse nth expression: "odd", "even", "3", "2n+1", etc *) 108 let parse_nth expr = 109 - let expr = String.lowercase_ascii (String.trim expr) in 110 if expr = "odd" then Some (2, 1) 111 else if expr = "even" then Some (2, 0) 112 else 113 - let expr = String.concat "" (String.split_on_char ' ' expr) in 114 if String.contains expr 'n' then 115 let parts = String.split_on_char 'n' expr in 116 match parts with ··· 145 | Type_universal -> true 146 | Type_tag -> 147 (match selector.name with 148 - | Some name -> String.lowercase_ascii node.Dom.name = String.lowercase_ascii name 149 | None -> false) 150 | Type_id -> 151 (match selector.name with ··· 159 | Some cls -> 160 (match Dom.get_attr node "class" with 161 | Some class_attr -> 162 - let classes = String.split_on_char ' ' class_attr in 163 List.mem cls classes 164 | None -> false) 165 | None -> false) 166 | Type_attr -> 167 (match selector.name with 168 | Some attr_name -> 169 - let attr_name_lower = String.lowercase_ascii attr_name in 170 let node_value = 171 List.find_map (fun (k, v) -> 172 - if String.lowercase_ascii k = attr_name_lower then Some v 173 else None 174 ) node.Dom.attrs 175 in ··· 181 (match selector.operator with 182 | Some "=" -> attr_value = value 183 | Some "~=" -> 184 - let words = String.split_on_char ' ' attr_value in 185 List.mem value words 186 | Some "|=" -> 187 attr_value = value || String.length attr_value > String.length value && ··· 204 | Some "only-of-type" -> is_first_of_type node && is_last_of_type node 205 | Some "empty" -> 206 not (List.exists (fun c -> 207 - is_element c || (c.Dom.name = "#text" && String.trim c.Dom.data <> "") 208 ) node.Dom.children) 209 | Some "root" -> 210 (match node.Dom.parent with
··· 3 module Dom = Html5rw_dom 4 open Selector_ast 5 6 + (* Use Astring for string operations *) 7 + let lowercase = Astring.String.Ascii.lowercase 8 + let trim = Astring.String.trim 9 + let find_sub = Astring.String.find_sub 10 + let fields = Astring.String.fields 11 + 12 (* Check if haystack contains needle as a substring *) 13 let string_contains ~haystack ~needle = 14 + Option.is_some (find_sub ~sub:needle haystack) 15 16 let is_element node = 17 let name = node.Dom.name in ··· 54 match node.Dom.parent with 55 | None -> false 56 | Some parent -> 57 + let name = lowercase node.Dom.name in 58 let rec find = function 59 | [] -> false 60 + | n :: _ when lowercase n.Dom.name = name -> n == node 61 | _ :: rest -> find rest 62 in 63 find (get_element_children parent) ··· 66 match node.Dom.parent with 67 | None -> false 68 | Some parent -> 69 + let name = lowercase node.Dom.name in 70 let rec find last = function 71 | [] -> (match last with Some l -> l == node | None -> false) 72 + | n :: rest when lowercase n.Dom.name = name -> find (Some n) rest 73 | _ :: rest -> find last rest 74 in 75 find None (get_element_children parent) ··· 90 match node.Dom.parent with 91 | None -> 0 92 | Some parent -> 93 + let name = lowercase node.Dom.name in 94 let children = get_element_children parent in 95 let rec find idx = function 96 | [] -> 0 97 | n :: _ when n == node -> idx 98 + | n :: rest when lowercase n.Dom.name = name -> find (idx + 1) rest 99 | _ :: rest -> find idx rest 100 in 101 find 1 children 102 103 (* Parse nth expression: "odd", "even", "3", "2n+1", etc *) 104 let parse_nth expr = 105 + let expr = lowercase (trim expr) in 106 if expr = "odd" then Some (2, 1) 107 else if expr = "even" then Some (2, 0) 108 else 109 + let expr = String.concat "" (fields ~is_sep:(fun c -> c = ' ') expr) in 110 if String.contains expr 'n' then 111 let parts = String.split_on_char 'n' expr in 112 match parts with ··· 141 | Type_universal -> true 142 | Type_tag -> 143 (match selector.name with 144 + | Some name -> lowercase node.Dom.name = lowercase name 145 | None -> false) 146 | Type_id -> 147 (match selector.name with ··· 155 | Some cls -> 156 (match Dom.get_attr node "class" with 157 | Some class_attr -> 158 + let classes = fields ~is_sep:(fun c -> c = ' ') class_attr in 159 List.mem cls classes 160 | None -> false) 161 | None -> false) 162 | Type_attr -> 163 (match selector.name with 164 | Some attr_name -> 165 + let attr_name_lower = lowercase attr_name in 166 let node_value = 167 List.find_map (fun (k, v) -> 168 + if lowercase k = attr_name_lower then Some v 169 else None 170 ) node.Dom.attrs 171 in ··· 177 (match selector.operator with 178 | Some "=" -> attr_value = value 179 | Some "~=" -> 180 + let words = fields ~is_sep:(fun c -> c = ' ') attr_value in 181 List.mem value words 182 | Some "|=" -> 183 attr_value = value || String.length attr_value > String.length value && ··· 200 | Some "only-of-type" -> is_first_of_type node && is_last_of_type node 201 | Some "empty" -> 202 not (List.exists (fun c -> 203 + is_element c || (c.Dom.name = "#text" && trim c.Dom.data <> "") 204 ) node.Dom.children) 205 | Some "root" -> 206 (match node.Dom.parent with
+1 -1
lib/tokenizer/dune
··· 1 (library 2 (name html5rw_tokenizer) 3 (public_name html5rw.tokenizer) 4 - (libraries bytesrw html5rw.entities))
··· 1 (library 2 (name html5rw_tokenizer) 3 (public_name html5rw.tokenizer) 4 + (libraries astring bytesrw html5rw.entities))
+6 -8
lib/tokenizer/tokenizer.ml
··· 1 (* HTML5 Tokenizer - implements WHATWG tokenization algorithm *) 2 3 - let is_ascii_alpha c = (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') 4 - let is_ascii_upper c = c >= 'A' && c <= 'Z' 5 - let is_ascii_digit c = c >= '0' && c <= '9' 6 - let is_ascii_hex c = is_ascii_digit c || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F') 7 - let is_ascii_alnum c = is_ascii_alpha c || is_ascii_digit c 8 let is_whitespace c = c = ' ' || c = '\t' || c = '\n' || c = '\x0C' || c = '\r' 9 - 10 - let ascii_lower c = 11 - if is_ascii_upper c then Char.chr (Char.code c + 32) else c 12 13 (* Token sink interface *) 14 module type SINK = sig
··· 1 (* HTML5 Tokenizer - implements WHATWG tokenization algorithm *) 2 3 + (* Character classification using Astring *) 4 + let is_ascii_alpha = Astring.Char.Ascii.is_letter 5 + let is_ascii_digit = Astring.Char.Ascii.is_digit 6 + let is_ascii_hex = Astring.Char.Ascii.is_hex_digit 7 + let is_ascii_alnum = Astring.Char.Ascii.is_alphanum 8 let is_whitespace c = c = ' ' || c = '\t' || c = '\n' || c = '\x0C' || c = '\r' 9 + let ascii_lower = Astring.Char.Ascii.lowercase 10 11 (* Token sink interface *) 12 module type SINK = sig