···1(* HTML5 encoding detection and decoding *)
20003let decode_utf16 data ~is_le ~bom_len =
4 let len = Bytes.length data in
5 let buf = Buffer.create len in
···28 let high = code_unit - 0xD800 in
29 let low = code_unit2 - 0xDC00 in
30 let cp = 0x10000 + (high lsl 10) lor low in
31- Buffer.add_char buf (Char.chr (0xF0 lor (cp lsr 18)));
32- Buffer.add_char buf (Char.chr (0x80 lor ((cp lsr 12) land 0x3F)));
33- Buffer.add_char buf (Char.chr (0x80 lor ((cp lsr 6) land 0x3F)));
34- Buffer.add_char buf (Char.chr (0x80 lor (cp land 0x3F)))
35 end else begin
36 (* Invalid surrogate, output replacement *)
37- Buffer.add_string buf "\xEF\xBF\xBD"
38 end
39 end else if code_unit >= 0xD800 && code_unit <= 0xDFFF then begin
40 (* Lone surrogate *)
41- Buffer.add_string buf "\xEF\xBF\xBD"
42- end else if code_unit <= 0x7F then begin
43- Buffer.add_char buf (Char.chr code_unit)
44- end else if code_unit <= 0x7FF then begin
45- Buffer.add_char buf (Char.chr (0xC0 lor (code_unit lsr 6)));
46- Buffer.add_char buf (Char.chr (0x80 lor (code_unit land 0x3F)))
47 end else begin
48- Buffer.add_char buf (Char.chr (0xE0 lor (code_unit lsr 12)));
49- Buffer.add_char buf (Char.chr (0x80 lor ((code_unit lsr 6) land 0x3F)));
50- Buffer.add_char buf (Char.chr (0x80 lor (code_unit land 0x3F)))
51 end
52 done;
5354 (* Odd trailing byte *)
55- if !i < len then Buffer.add_string buf "\xEF\xBF\xBD";
5657 Buffer.contents buf
58···85 | Encoding.Utf16be -> decode_utf16 data ~is_le:false ~bom_len
8687 | Encoding.Windows_1252 ->
088 let len = Bytes.length data in
89 let buf = Buffer.create len in
90 let table = [|
···100 if b >= 0x80 && b <= 0x9F then table.(b - 0x80)
101 else b
102 in
103- if cp <= 0x7F then
104- Buffer.add_char buf (Char.chr cp)
105- else if cp <= 0x7FF then begin
106- Buffer.add_char buf (Char.chr (0xC0 lor (cp lsr 6)));
107- Buffer.add_char buf (Char.chr (0x80 lor (cp land 0x3F)))
108- end else begin
109- Buffer.add_char buf (Char.chr (0xE0 lor (cp lsr 12)));
110- Buffer.add_char buf (Char.chr (0x80 lor ((cp lsr 6) land 0x3F)));
111- Buffer.add_char buf (Char.chr (0x80 lor (cp land 0x3F)))
112- end
113 done;
114 Buffer.contents buf
115116 | Encoding.Iso_8859_2 ->
0117 let len = Bytes.length data in
118 let buf = Buffer.create len in
119- let table = [|
120- (* 0xA0-0xBF *)
121- 0x00A0; 0x0104; 0x02D8; 0x0141; 0x00A4; 0x013D; 0x015A; 0x00A7;
122- 0x00A8; 0x0160; 0x015E; 0x0164; 0x0179; 0x00AD; 0x017D; 0x017B;
123- 0x00B0; 0x0105; 0x02DB; 0x0142; 0x00B4; 0x013E; 0x015B; 0x02C7;
124- 0x00B8; 0x0161; 0x015F; 0x0165; 0x017A; 0x02DD; 0x017E; 0x017C;
125- (* 0xC0-0xFF *)
126- 0x0154; 0x00C1; 0x00C2; 0x0102; 0x00C4; 0x0139; 0x0106; 0x00C7;
127- 0x010C; 0x00C9; 0x0118; 0x00CB; 0x011A; 0x00CD; 0x00CE; 0x010E;
128- 0x0110; 0x0143; 0x0147; 0x00D3; 0x00D4; 0x0150; 0x00D6; 0x00D7;
129- 0x0158; 0x016E; 0x00DA; 0x0170; 0x00DC; 0x00DD; 0x0162; 0x00DF;
130- 0x0155; 0x00E1; 0x00E2; 0x0103; 0x00E4; 0x013A; 0x0107; 0x00E7;
131- 0x010D; 0x00E9; 0x0119; 0x00EB; 0x011B; 0x00ED; 0x00EE; 0x010F;
132- 0x0111; 0x0144; 0x0148; 0x00F3; 0x00F4; 0x0151; 0x00F6; 0x00F7;
133- 0x0159; 0x016F; 0x00FA; 0x0171; 0x00FC; 0x00FD; 0x0163; 0x02D9;
134- |] in
135- for i = bom_len to len - 1 do
136- let b = Char.code (Bytes.get data i) in
137- let cp =
138- if b >= 0xA0 then table.(b - 0xA0)
139- else b
140- in
141- if cp <= 0x7F then
142- Buffer.add_char buf (Char.chr cp)
143- else if cp <= 0x7FF then begin
144- Buffer.add_char buf (Char.chr (0xC0 lor (cp lsr 6)));
145- Buffer.add_char buf (Char.chr (0x80 lor (cp land 0x3F)))
146- end else begin
147- Buffer.add_char buf (Char.chr (0xE0 lor (cp lsr 12)));
148- Buffer.add_char buf (Char.chr (0x80 lor ((cp lsr 6) land 0x3F)));
149- Buffer.add_char buf (Char.chr (0x80 lor (cp land 0x3F)))
150- end
151- done;
152 Buffer.contents buf
153154 | Encoding.Euc_jp ->
···1(* HTML5 encoding detection and decoding *)
23+(* UTF-8 replacement character *)
4+let replacement_char = Uchar.of_int 0xFFFD
5+6let decode_utf16 data ~is_le ~bom_len =
7 let len = Bytes.length data in
8 let buf = Buffer.create len in
···31 let high = code_unit - 0xD800 in
32 let low = code_unit2 - 0xDC00 in
33 let cp = 0x10000 + (high lsl 10) lor low in
34+ Uutf.Buffer.add_utf_8 buf (Uchar.of_int cp)
00035 end else begin
36 (* Invalid surrogate, output replacement *)
37+ Uutf.Buffer.add_utf_8 buf replacement_char
38 end
39 end else if code_unit >= 0xD800 && code_unit <= 0xDFFF then begin
40 (* Lone surrogate *)
41+ Uutf.Buffer.add_utf_8 buf replacement_char
0000042 end else begin
43+ Uutf.Buffer.add_utf_8 buf (Uchar.of_int code_unit)
0044 end
45 done;
4647 (* Odd trailing byte *)
48+ if !i < len then Uutf.Buffer.add_utf_8 buf replacement_char;
4950 Buffer.contents buf
51···78 | Encoding.Utf16be -> decode_utf16 data ~is_le:false ~bom_len
7980 | Encoding.Windows_1252 ->
81+ (* Windows-1252 mapping table for 0x80-0x9F range *)
82 let len = Bytes.length data in
83 let buf = Buffer.create len in
84 let table = [|
···94 if b >= 0x80 && b <= 0x9F then table.(b - 0x80)
95 else b
96 in
97+ Uutf.Buffer.add_utf_8 buf (Uchar.of_int cp)
00000000098 done;
99 Buffer.contents buf
100101 | Encoding.Iso_8859_2 ->
102+ (* Use uuuu for ISO-8859-2 decoding *)
103 let len = Bytes.length data in
104 let buf = Buffer.create len in
105+ let s = Bytes.sub_string data bom_len (len - bom_len) in
106+ Uuuu.String.fold `ISO_8859_2 (fun () _pos -> function
107+ | `Uchar u -> Uutf.Buffer.add_utf_8 buf u
108+ | `Malformed _ -> Uutf.Buffer.add_utf_8 buf replacement_char
109+ ) () s;
0000000000000000000000000000110 Buffer.contents buf
111112 | Encoding.Euc_jp ->
···3let normalize_label label =
4 if String.length label = 0 then None
5 else
6- let s = String.lowercase_ascii (String.trim label) in
7 if String.length s = 0 then None
8 else
9 (* Security: never allow utf-7 *)
···3let normalize_label label =
4 if String.length label = 0 then None
5 else
6+ let s = Astring.String.Ascii.lowercase (Astring.String.trim label) in
7 if String.length s = 0 then None
8 else
9 (* Security: never allow utf-7 *)
+3-10
lib/encoding/prescan.ml
···1(* HTML meta charset prescan per WHATWG spec *)
23-let ascii_whitespace = ['\x09'; '\x0A'; '\x0C'; '\x0D'; '\x20']
4-5-let is_ascii_whitespace c = List.mem c ascii_whitespace
6-7-let is_ascii_alpha c =
8- (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')
9-10-let ascii_lower c =
11- if c >= 'A' && c <= 'Z' then Char.chr (Char.code c + 32)
12- else c
1314let skip_whitespace data i len =
15 let j = ref i in
···1(* HTML meta charset prescan per WHATWG spec *)
23+(* Character classification using Astring *)
4+let is_ascii_whitespace c = c = '\x09' || c = '\x0A' || c = '\x0C' || c = '\x0D' || c = '\x20'
5+let is_ascii_alpha = Astring.Char.Ascii.is_letter
000000067let skip_whitespace data i len =
8 let j = ref i in
+4-11
lib/entities/decode.ml
···1(* HTML5 entity decoding *)
23-let is_alpha c =
4- (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')
5-6-let is_alnum c =
7- is_alpha c || (c >= '0' && c <= '9')
8-9-let is_hex_digit c =
10- (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F')
11-12-let is_digit c =
13- c >= '0' && c <= '9'
1415let decode_entities_in_text text ~in_attribute =
16 let len = String.length text in
···1(* HTML5 entity decoding *)
23+(* Character classification using Astring *)
4+let is_alnum = Astring.Char.Ascii.is_alphanum
5+let is_hex_digit = Astring.Char.Ascii.is_hex_digit
6+let is_digit = Astring.Char.Ascii.is_digit
000000078let decode_entities_in_text text ~in_attribute =
9 let len = String.length text in
···43 in
44 search 0
45046let codepoint_to_utf8 cp =
47 let buf = Buffer.create 4 in
48- if cp <= 0x7F then
49- Buffer.add_char buf (Char.chr cp)
50- else if cp <= 0x7FF then begin
51- Buffer.add_char buf (Char.chr (0xC0 lor (cp lsr 6)));
52- Buffer.add_char buf (Char.chr (0x80 lor (cp land 0x3F)))
53- end else if cp <= 0xFFFF then begin
54- Buffer.add_char buf (Char.chr (0xE0 lor (cp lsr 12)));
55- Buffer.add_char buf (Char.chr (0x80 lor ((cp lsr 6) land 0x3F)));
56- Buffer.add_char buf (Char.chr (0x80 lor (cp land 0x3F)))
57- end else begin
58- Buffer.add_char buf (Char.chr (0xF0 lor (cp lsr 18)));
59- Buffer.add_char buf (Char.chr (0x80 lor ((cp lsr 12) land 0x3F)));
60- Buffer.add_char buf (Char.chr (0x80 lor ((cp lsr 6) land 0x3F)));
61- Buffer.add_char buf (Char.chr (0x80 lor (cp land 0x3F)))
62- end;
63 Buffer.contents buf
6465let replacement_char = "\xEF\xBF\xBD" (* U+FFFD in UTF-8 *)
···43 in
44 search 0
4546+(* Encode a Unicode codepoint to UTF-8 using uutf *)
47let codepoint_to_utf8 cp =
48 let buf = Buffer.create 4 in
49+ Uutf.Buffer.add_utf_8 buf (Uchar.of_int cp);
0000000000000050 Buffer.contents buf
5152let replacement_char = "\xEF\xBF\xBD" (* U+FFFD in UTF-8 *)
+7-4
lib/parser/constants.ml
···1(* HTML5 spec constants *)
20003(* Void elements - no end tag allowed *)
4let void_elements = [
5 "area"; "base"; "br"; "col"; "embed"; "hr"; "img"; "input";
···7071let adjust_mathml_attrs attrs =
72 List.map (fun (k, v) ->
73- match List.assoc_opt (String.lowercase_ascii k) mathml_attr_adjustments with
74 | Some adjusted_k -> (adjusted_k, v)
75 | None -> (k, v)
76 ) attrs
···282let is_heading = List.mem
283284let adjust_svg_tag_name name =
285- match List.assoc_opt (String.lowercase_ascii name) svg_tag_adjustments with
286 | Some adjusted -> adjusted
287 | None -> name
288289let adjust_svg_attrs attrs =
290 List.map (fun (name, value) ->
291 let adjusted_name =
292- match List.assoc_opt (String.lowercase_ascii name) svg_attr_adjustments with
293 | Some n -> n
294 | None -> name
295 in
···298299let adjust_foreign_attrs attrs =
300 List.map (fun (name, value) ->
301- match List.assoc_opt (String.lowercase_ascii name) foreign_attr_adjustments with
302 | Some (prefix, local, _ns) ->
303 if prefix = "" then (local, value)
304 else (prefix ^ ":" ^ local, value)
···1(* HTML5 spec constants *)
23+(* Use Astring for string operations *)
4+let lowercase = Astring.String.Ascii.lowercase
5+6(* Void elements - no end tag allowed *)
7let void_elements = [
8 "area"; "base"; "br"; "col"; "embed"; "hr"; "img"; "input";
···7374let adjust_mathml_attrs attrs =
75 List.map (fun (k, v) ->
76+ match List.assoc_opt (lowercase k) mathml_attr_adjustments with
77 | Some adjusted_k -> (adjusted_k, v)
78 | None -> (k, v)
79 ) attrs
···285let is_heading = List.mem
286287let adjust_svg_tag_name name =
288+ match List.assoc_opt (lowercase name) svg_tag_adjustments with
289 | Some adjusted -> adjusted
290 | None -> name
291292let adjust_svg_attrs attrs =
293 List.map (fun (name, value) ->
294 let adjusted_name =
295+ match List.assoc_opt (lowercase name) svg_attr_adjustments with
296 | Some n -> n
297 | None -> name
298 in
···301302let adjust_foreign_attrs attrs =
303 List.map (fun (name, value) ->
304+ match List.assoc_opt (lowercase name) foreign_attr_adjustments with
305 | Some (prefix, local, _ns) ->
306 if prefix = "" then (local, value)
307 else (prefix ^ ":" ^ local, value)
···3module Dom = Html5rw_dom
4open Selector_ast
50000006(* Check if haystack contains needle as a substring *)
7let string_contains ~haystack ~needle =
8- let needle_len = String.length needle in
9- let haystack_len = String.length haystack in
10- if needle_len > haystack_len then false
11- else if needle_len = 0 then true
12- else
13- let rec check i =
14- if i > haystack_len - needle_len then false
15- else if String.sub haystack i needle_len = needle then true
16- else check (i + 1)
17- in
18- check 0
1920let is_element node =
21 let name = node.Dom.name in
···58 match node.Dom.parent with
59 | None -> false
60 | Some parent ->
61- let name = String.lowercase_ascii node.Dom.name in
62 let rec find = function
63 | [] -> false
64- | n :: _ when String.lowercase_ascii n.Dom.name = name -> n == node
65 | _ :: rest -> find rest
66 in
67 find (get_element_children parent)
···70 match node.Dom.parent with
71 | None -> false
72 | Some parent ->
73- let name = String.lowercase_ascii node.Dom.name in
74 let rec find last = function
75 | [] -> (match last with Some l -> l == node | None -> false)
76- | n :: rest when String.lowercase_ascii n.Dom.name = name -> find (Some n) rest
77 | _ :: rest -> find last rest
78 in
79 find None (get_element_children parent)
···94 match node.Dom.parent with
95 | None -> 0
96 | Some parent ->
97- let name = String.lowercase_ascii node.Dom.name in
98 let children = get_element_children parent in
99 let rec find idx = function
100 | [] -> 0
101 | n :: _ when n == node -> idx
102- | n :: rest when String.lowercase_ascii n.Dom.name = name -> find (idx + 1) rest
103 | _ :: rest -> find idx rest
104 in
105 find 1 children
106107(* Parse nth expression: "odd", "even", "3", "2n+1", etc *)
108let parse_nth expr =
109- let expr = String.lowercase_ascii (String.trim expr) in
110 if expr = "odd" then Some (2, 1)
111 else if expr = "even" then Some (2, 0)
112 else
113- let expr = String.concat "" (String.split_on_char ' ' expr) in
114 if String.contains expr 'n' then
115 let parts = String.split_on_char 'n' expr in
116 match parts with
···145 | Type_universal -> true
146 | Type_tag ->
147 (match selector.name with
148- | Some name -> String.lowercase_ascii node.Dom.name = String.lowercase_ascii name
149 | None -> false)
150 | Type_id ->
151 (match selector.name with
···159 | Some cls ->
160 (match Dom.get_attr node "class" with
161 | Some class_attr ->
162- let classes = String.split_on_char ' ' class_attr in
163 List.mem cls classes
164 | None -> false)
165 | None -> false)
166 | Type_attr ->
167 (match selector.name with
168 | Some attr_name ->
169- let attr_name_lower = String.lowercase_ascii attr_name in
170 let node_value =
171 List.find_map (fun (k, v) ->
172- if String.lowercase_ascii k = attr_name_lower then Some v
173 else None
174 ) node.Dom.attrs
175 in
···181 (match selector.operator with
182 | Some "=" -> attr_value = value
183 | Some "~=" ->
184- let words = String.split_on_char ' ' attr_value in
185 List.mem value words
186 | Some "|=" ->
187 attr_value = value || String.length attr_value > String.length value &&
···204 | Some "only-of-type" -> is_first_of_type node && is_last_of_type node
205 | Some "empty" ->
206 not (List.exists (fun c ->
207- is_element c || (c.Dom.name = "#text" && String.trim c.Dom.data <> "")
208 ) node.Dom.children)
209 | Some "root" ->
210 (match node.Dom.parent with
···3module Dom = Html5rw_dom
4open Selector_ast
56+(* Use Astring for string operations *)
7+let lowercase = Astring.String.Ascii.lowercase
8+let trim = Astring.String.trim
9+let find_sub = Astring.String.find_sub
10+let fields = Astring.String.fields
11+12(* Check if haystack contains needle as a substring *)
13let string_contains ~haystack ~needle =
14+ Option.is_some (find_sub ~sub:needle haystack)
00000000001516let is_element node =
17 let name = node.Dom.name in
···54 match node.Dom.parent with
55 | None -> false
56 | Some parent ->
57+ let name = lowercase node.Dom.name in
58 let rec find = function
59 | [] -> false
60+ | n :: _ when lowercase n.Dom.name = name -> n == node
61 | _ :: rest -> find rest
62 in
63 find (get_element_children parent)
···66 match node.Dom.parent with
67 | None -> false
68 | Some parent ->
69+ let name = lowercase node.Dom.name in
70 let rec find last = function
71 | [] -> (match last with Some l -> l == node | None -> false)
72+ | n :: rest when lowercase n.Dom.name = name -> find (Some n) rest
73 | _ :: rest -> find last rest
74 in
75 find None (get_element_children parent)
···90 match node.Dom.parent with
91 | None -> 0
92 | Some parent ->
93+ let name = lowercase node.Dom.name in
94 let children = get_element_children parent in
95 let rec find idx = function
96 | [] -> 0
97 | n :: _ when n == node -> idx
98+ | n :: rest when lowercase n.Dom.name = name -> find (idx + 1) rest
99 | _ :: rest -> find idx rest
100 in
101 find 1 children
102103(* Parse nth expression: "odd", "even", "3", "2n+1", etc *)
104let parse_nth expr =
105+ let expr = lowercase (trim expr) in
106 if expr = "odd" then Some (2, 1)
107 else if expr = "even" then Some (2, 0)
108 else
109+ let expr = String.concat "" (fields ~is_sep:(fun c -> c = ' ') expr) in
110 if String.contains expr 'n' then
111 let parts = String.split_on_char 'n' expr in
112 match parts with
···141 | Type_universal -> true
142 | Type_tag ->
143 (match selector.name with
144+ | Some name -> lowercase node.Dom.name = lowercase name
145 | None -> false)
146 | Type_id ->
147 (match selector.name with
···155 | Some cls ->
156 (match Dom.get_attr node "class" with
157 | Some class_attr ->
158+ let classes = fields ~is_sep:(fun c -> c = ' ') class_attr in
159 List.mem cls classes
160 | None -> false)
161 | None -> false)
162 | Type_attr ->
163 (match selector.name with
164 | Some attr_name ->
165+ let attr_name_lower = lowercase attr_name in
166 let node_value =
167 List.find_map (fun (k, v) ->
168+ if lowercase k = attr_name_lower then Some v
169 else None
170 ) node.Dom.attrs
171 in
···177 (match selector.operator with
178 | Some "=" -> attr_value = value
179 | Some "~=" ->
180+ let words = fields ~is_sep:(fun c -> c = ' ') attr_value in
181 List.mem value words
182 | Some "|=" ->
183 attr_value = value || String.length attr_value > String.length value &&
···200 | Some "only-of-type" -> is_first_of_type node && is_last_of_type node
201 | Some "empty" ->
202 not (List.exists (fun c ->
203+ is_element c || (c.Dom.name = "#text" && trim c.Dom.data <> "")
204 ) node.Dom.children)
205 | Some "root" ->
206 (match node.Dom.parent with
···1(* HTML5 Tokenizer - implements WHATWG tokenization algorithm *)
23-let is_ascii_alpha c = (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')
4-let is_ascii_upper c = c >= 'A' && c <= 'Z'
5-let is_ascii_digit c = c >= '0' && c <= '9'
6-let is_ascii_hex c = is_ascii_digit c || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F')
7-let is_ascii_alnum c = is_ascii_alpha c || is_ascii_digit c
8let is_whitespace c = c = ' ' || c = '\t' || c = '\n' || c = '\x0C' || c = '\r'
9-10-let ascii_lower c =
11- if is_ascii_upper c then Char.chr (Char.code c + 32) else c
1213(* Token sink interface *)
14module type SINK = sig
···1(* HTML5 Tokenizer - implements WHATWG tokenization algorithm *)
23+(* Character classification using Astring *)
4+let is_ascii_alpha = Astring.Char.Ascii.is_letter
5+let is_ascii_digit = Astring.Char.Ascii.is_digit
6+let is_ascii_hex = Astring.Char.Ascii.is_hex_digit
7+let is_ascii_alnum = Astring.Char.Ascii.is_alphanum
8let is_whitespace c = c = ' ' || c = '\t' || c = '\n' || c = '\x0C' || c = '\r'
9+let ascii_lower = Astring.Char.Ascii.lowercase
001011(* Token sink interface *)
12module type SINK = sig