···33let normalize_label label =
44 if String.length label = 0 then None
55 else
66- let s = String.lowercase_ascii (String.trim label) in
66+ let s = Astring.String.Ascii.lowercase (Astring.String.trim label) in
77 if String.length s = 0 then None
88 else
99 (* Security: never allow utf-7 *)
+3-10
lib/encoding/prescan.ml
···11(* HTML meta charset prescan per WHATWG spec *)
2233-let ascii_whitespace = ['\x09'; '\x0A'; '\x0C'; '\x0D'; '\x20']
44-55-let is_ascii_whitespace c = List.mem c ascii_whitespace
66-77-let is_ascii_alpha c =
88- (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')
99-1010-let ascii_lower c =
1111- if c >= 'A' && c <= 'Z' then Char.chr (Char.code c + 32)
1212- else c
33+(* Character classification using Astring *)
44+let is_ascii_whitespace c = c = '\x09' || c = '\x0A' || c = '\x0C' || c = '\x0D' || c = '\x20'
55+let is_ascii_alpha = Astring.Char.Ascii.is_letter
136147let skip_whitespace data i len =
158 let j = ref i in
+4-11
lib/entities/decode.ml
···11(* HTML5 entity decoding *)
2233-let is_alpha c =
44- (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')
55-66-let is_alnum c =
77- is_alpha c || (c >= '0' && c <= '9')
88-99-let is_hex_digit c =
1010- (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F')
1111-1212-let is_digit c =
1313- c >= '0' && c <= '9'
33+(* Character classification using Astring *)
44+let is_alnum = Astring.Char.Ascii.is_alphanum
55+let is_hex_digit = Astring.Char.Ascii.is_hex_digit
66+let is_digit = Astring.Char.Ascii.is_digit
147158let decode_entities_in_text text ~in_attribute =
169 let len = String.length text in
···4343 in
4444 search 0
45454646+(* Encode a Unicode codepoint to UTF-8 using uutf *)
4647let codepoint_to_utf8 cp =
4748 let buf = Buffer.create 4 in
4848- if cp <= 0x7F then
4949- Buffer.add_char buf (Char.chr cp)
5050- else if cp <= 0x7FF then begin
5151- Buffer.add_char buf (Char.chr (0xC0 lor (cp lsr 6)));
5252- Buffer.add_char buf (Char.chr (0x80 lor (cp land 0x3F)))
5353- end else if cp <= 0xFFFF then begin
5454- Buffer.add_char buf (Char.chr (0xE0 lor (cp lsr 12)));
5555- Buffer.add_char buf (Char.chr (0x80 lor ((cp lsr 6) land 0x3F)));
5656- Buffer.add_char buf (Char.chr (0x80 lor (cp land 0x3F)))
5757- end else begin
5858- Buffer.add_char buf (Char.chr (0xF0 lor (cp lsr 18)));
5959- Buffer.add_char buf (Char.chr (0x80 lor ((cp lsr 12) land 0x3F)));
6060- Buffer.add_char buf (Char.chr (0x80 lor ((cp lsr 6) land 0x3F)));
6161- Buffer.add_char buf (Char.chr (0x80 lor (cp land 0x3F)))
6262- end;
4949+ Uutf.Buffer.add_utf_8 buf (Uchar.of_int cp);
6350 Buffer.contents buf
64516552let replacement_char = "\xEF\xBF\xBD" (* U+FFFD in UTF-8 *)
+7-4
lib/parser/constants.ml
···11(* HTML5 spec constants *)
2233+(* Use Astring for string operations *)
44+let lowercase = Astring.String.Ascii.lowercase
55+36(* Void elements - no end tag allowed *)
47let void_elements = [
58 "area"; "base"; "br"; "col"; "embed"; "hr"; "img"; "input";
···70737174let adjust_mathml_attrs attrs =
7275 List.map (fun (k, v) ->
7373- match List.assoc_opt (String.lowercase_ascii k) mathml_attr_adjustments with
7676+ match List.assoc_opt (lowercase k) mathml_attr_adjustments with
7477 | Some adjusted_k -> (adjusted_k, v)
7578 | None -> (k, v)
7679 ) attrs
···282285let is_heading = List.mem
283286284287let adjust_svg_tag_name name =
285285- match List.assoc_opt (String.lowercase_ascii name) svg_tag_adjustments with
288288+ match List.assoc_opt (lowercase name) svg_tag_adjustments with
286289 | Some adjusted -> adjusted
287290 | None -> name
288291289292let adjust_svg_attrs attrs =
290293 List.map (fun (name, value) ->
291294 let adjusted_name =
292292- match List.assoc_opt (String.lowercase_ascii name) svg_attr_adjustments with
295295+ match List.assoc_opt (lowercase name) svg_attr_adjustments with
293296 | Some n -> n
294297 | None -> name
295298 in
···298301299302let adjust_foreign_attrs attrs =
300303 List.map (fun (name, value) ->
301301- match List.assoc_opt (String.lowercase_ascii name) foreign_attr_adjustments with
304304+ match List.assoc_opt (lowercase name) foreign_attr_adjustments with
302305 | Some (prefix, local, _ns) ->
303306 if prefix = "" then (local, value)
304307 else (prefix ^ ":" ^ local, value)
···33module Dom = Html5rw_dom
44open Selector_ast
5566+(* Use Astring for string operations *)
77+let lowercase = Astring.String.Ascii.lowercase
88+let trim = Astring.String.trim
99+let find_sub = Astring.String.find_sub
1010+let fields = Astring.String.fields
1111+612(* Check if haystack contains needle as a substring *)
713let string_contains ~haystack ~needle =
88- let needle_len = String.length needle in
99- let haystack_len = String.length haystack in
1010- if needle_len > haystack_len then false
1111- else if needle_len = 0 then true
1212- else
1313- let rec check i =
1414- if i > haystack_len - needle_len then false
1515- else if String.sub haystack i needle_len = needle then true
1616- else check (i + 1)
1717- in
1818- check 0
1414+ Option.is_some (find_sub ~sub:needle haystack)
19152016let is_element node =
2117 let name = node.Dom.name in
···5854 match node.Dom.parent with
5955 | None -> false
6056 | Some parent ->
6161- let name = String.lowercase_ascii node.Dom.name in
5757+ let name = lowercase node.Dom.name in
6258 let rec find = function
6359 | [] -> false
6464- | n :: _ when String.lowercase_ascii n.Dom.name = name -> n == node
6060+ | n :: _ when lowercase n.Dom.name = name -> n == node
6561 | _ :: rest -> find rest
6662 in
6763 find (get_element_children parent)
···7066 match node.Dom.parent with
7167 | None -> false
7268 | Some parent ->
7373- let name = String.lowercase_ascii node.Dom.name in
6969+ let name = lowercase node.Dom.name in
7470 let rec find last = function
7571 | [] -> (match last with Some l -> l == node | None -> false)
7676- | n :: rest when String.lowercase_ascii n.Dom.name = name -> find (Some n) rest
7272+ | n :: rest when lowercase n.Dom.name = name -> find (Some n) rest
7773 | _ :: rest -> find last rest
7874 in
7975 find None (get_element_children parent)
···9490 match node.Dom.parent with
9591 | None -> 0
9692 | Some parent ->
9797- let name = String.lowercase_ascii node.Dom.name in
9393+ let name = lowercase node.Dom.name in
9894 let children = get_element_children parent in
9995 let rec find idx = function
10096 | [] -> 0
10197 | n :: _ when n == node -> idx
102102- | n :: rest when String.lowercase_ascii n.Dom.name = name -> find (idx + 1) rest
9898+ | n :: rest when lowercase n.Dom.name = name -> find (idx + 1) rest
10399 | _ :: rest -> find idx rest
104100 in
105101 find 1 children
106102107103(* Parse nth expression: "odd", "even", "3", "2n+1", etc *)
108104let parse_nth expr =
109109- let expr = String.lowercase_ascii (String.trim expr) in
105105+ let expr = lowercase (trim expr) in
110106 if expr = "odd" then Some (2, 1)
111107 else if expr = "even" then Some (2, 0)
112108 else
113113- let expr = String.concat "" (String.split_on_char ' ' expr) in
109109+ let expr = String.concat "" (fields ~is_sep:(fun c -> c = ' ') expr) in
114110 if String.contains expr 'n' then
115111 let parts = String.split_on_char 'n' expr in
116112 match parts with
···145141 | Type_universal -> true
146142 | Type_tag ->
147143 (match selector.name with
148148- | Some name -> String.lowercase_ascii node.Dom.name = String.lowercase_ascii name
144144+ | Some name -> lowercase node.Dom.name = lowercase name
149145 | None -> false)
150146 | Type_id ->
151147 (match selector.name with
···159155 | Some cls ->
160156 (match Dom.get_attr node "class" with
161157 | Some class_attr ->
162162- let classes = String.split_on_char ' ' class_attr in
158158+ let classes = fields ~is_sep:(fun c -> c = ' ') class_attr in
163159 List.mem cls classes
164160 | None -> false)
165161 | None -> false)
166162 | Type_attr ->
167163 (match selector.name with
168164 | Some attr_name ->
169169- let attr_name_lower = String.lowercase_ascii attr_name in
165165+ let attr_name_lower = lowercase attr_name in
170166 let node_value =
171167 List.find_map (fun (k, v) ->
172172- if String.lowercase_ascii k = attr_name_lower then Some v
168168+ if lowercase k = attr_name_lower then Some v
173169 else None
174170 ) node.Dom.attrs
175171 in
···181177 (match selector.operator with
182178 | Some "=" -> attr_value = value
183179 | Some "~=" ->
184184- let words = String.split_on_char ' ' attr_value in
180180+ let words = fields ~is_sep:(fun c -> c = ' ') attr_value in
185181 List.mem value words
186182 | Some "|=" ->
187183 attr_value = value || String.length attr_value > String.length value &&
···204200 | Some "only-of-type" -> is_first_of_type node && is_last_of_type node
205201 | Some "empty" ->
206202 not (List.exists (fun c ->
207207- is_element c || (c.Dom.name = "#text" && String.trim c.Dom.data <> "")
203203+ is_element c || (c.Dom.name = "#text" && trim c.Dom.data <> "")
208204 ) node.Dom.children)
209205 | Some "root" ->
210206 (match node.Dom.parent with
···11(* HTML5 Tokenizer - implements WHATWG tokenization algorithm *)
2233-let is_ascii_alpha c = (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')
44-let is_ascii_upper c = c >= 'A' && c <= 'Z'
55-let is_ascii_digit c = c >= '0' && c <= '9'
66-let is_ascii_hex c = is_ascii_digit c || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F')
77-let is_ascii_alnum c = is_ascii_alpha c || is_ascii_digit c
33+(* Character classification using Astring *)
44+let is_ascii_alpha = Astring.Char.Ascii.is_letter
55+let is_ascii_digit = Astring.Char.Ascii.is_digit
66+let is_ascii_hex = Astring.Char.Ascii.is_hex_digit
77+let is_ascii_alnum = Astring.Char.Ascii.is_alphanum
88let is_whitespace c = c = ' ' || c = '\t' || c = '\n' || c = '\x0C' || c = '\r'
99-1010-let ascii_lower c =
1111- if is_ascii_upper c then Char.chr (Char.code c + 32) else c
99+let ascii_lower = Astring.Char.Ascii.lowercase
12101311(* Token sink interface *)
1412module type SINK = sig