// web engine - experimental web browser
//! `cmap` — Character to Glyph Index Mapping table.
//!
//! Maps Unicode code points to glyph indices. Supports format 4 (BMP only) and
//! format 12 (full Unicode range).
//!
//! Reference: <https://learn.microsoft.com/en-us/typography/opentype/spec/cmap>
6
7use crate::font::parse::Reader;
8use crate::font::FontError;
9
10/// Parsed `cmap` table.
11#[derive(Debug)]
12pub struct CmapTable {
13 /// The best subtable we found (preferring format 12 over format 4).
14 subtable: CmapSubtable,
15}
16
17#[derive(Debug)]
18enum CmapSubtable {
19 Format4(Format4),
20 Format12(Format12),
21}
22
/// Decoded cmap format 4 subtable ("segment mapping to delta values").
///
/// Only code points up to U+FFFF can be described by this format. The first
/// four vectors are parallel: index `i` in each one describes segment `i`.
#[derive(Debug)]
struct Format4 {
    /// Last character code covered by each segment.
    end_codes: Vec<u16>,
    /// First character code covered by each segment.
    start_codes: Vec<u16>,
    /// Signed delta added (modulo 65536) when deriving a glyph index.
    id_deltas: Vec<i16>,
    /// Zero when the glyph index comes from the delta alone; otherwise the
    /// byte offset used to locate the entry in `glyph_indices`.
    id_range_offsets: Vec<u16>,
    /// The 16-bit words that follow the `idRangeOffset` array in the raw data.
    glyph_indices: Vec<u16>,
}
34
35/// cmap format 12: Segmented coverage for the full Unicode range.
36#[derive(Debug)]
37struct Format12 {
38 groups: Vec<SequentialMapGroup>,
39}
40
/// One run of consecutive code points that maps onto consecutive glyph IDs.
#[derive(Debug)]
struct SequentialMapGroup {
    /// First code point in the run.
    start_char: u32,
    /// Last code point in the run (inclusive).
    end_char: u32,
    /// Glyph ID assigned to `start_char`; later code points count up from it.
    start_glyph: u32,
}
47
48impl CmapTable {
49 /// Parse the `cmap` table from raw bytes.
50 ///
51 /// Selects the best available subtable:
52 /// 1. Platform 3 (Windows), Encoding 10 (Unicode full) — format 12
53 /// 2. Platform 0 (Unicode), Encoding 4 (Unicode full) — format 12
54 /// 3. Platform 3 (Windows), Encoding 1 (Unicode BMP) — format 4
55 /// 4. Platform 0 (Unicode), Encoding 3 (Unicode BMP) — format 4
56 /// 5. First platform 0 or 3 subtable that parses successfully
57 pub fn parse(data: &[u8]) -> Result<CmapTable, FontError> {
58 let r = Reader::new(data);
59 if r.len() < 4 {
60 return Err(FontError::MalformedTable("cmap"));
61 }
62
63 let num_tables = r.u16(2)? as usize;
64
65 // Collect encoding records.
66 struct EncodingRecord {
67 platform_id: u16,
68 encoding_id: u16,
69 offset: u32,
70 }
71
72 let mut records = Vec::with_capacity(num_tables);
73 for i in 0..num_tables {
74 let base = 4 + i * 8;
75 records.push(EncodingRecord {
76 platform_id: r.u16(base)?,
77 encoding_id: r.u16(base + 2)?,
78 offset: r.u32(base + 4)?,
79 });
80 }
81
82 // Try subtables in preference order.
83 // Priority: (3,10) > (0,4) > (0,6) > (3,1) > (0,3) > (0,*) > (3,*)
84 let priority = |pid: u16, eid: u16| -> u8 {
85 match (pid, eid) {
86 (3, 10) => 0,
87 (0, 4) => 1,
88 (0, 6) => 2,
89 (3, 1) => 3,
90 (0, 3) => 4,
91 (0, _) => 5,
92 (3, _) => 6,
93 _ => 255,
94 }
95 };
96
97 let mut best: Option<(u8, CmapSubtable)> = None;
98
99 for rec in &records {
100 let p = priority(rec.platform_id, rec.encoding_id);
101 if p == 255 {
102 continue;
103 }
104 if let Some((bp, _)) = &best {
105 if p >= *bp {
106 continue;
107 }
108 }
109
110 let offset = rec.offset as usize;
111 if offset + 2 > data.len() {
112 continue;
113 }
114 let format = r.u16(offset)?;
115
116 match format {
117 4 => {
118 if let Ok(st) = parse_format4(data, offset) {
119 best = Some((p, CmapSubtable::Format4(st)));
120 }
121 }
122 12 => {
123 if let Ok(st) = parse_format12(data, offset) {
124 best = Some((p, CmapSubtable::Format12(st)));
125 }
126 }
127 _ => {}
128 }
129 }
130
131 match best {
132 Some((_, subtable)) => Ok(CmapTable { subtable }),
133 None => Err(FontError::MalformedTable("cmap: no usable subtable")),
134 }
135 }
136
137 /// Look up a Unicode code point and return the corresponding glyph index.
138 ///
139 /// Returns `None` if the code point is not mapped (maps to glyph 0).
140 pub fn glyph_index(&self, codepoint: u32) -> Option<u16> {
141 let gid = match &self.subtable {
142 CmapSubtable::Format4(f4) => lookup_format4(f4, codepoint),
143 CmapSubtable::Format12(f12) => lookup_format12(f12, codepoint),
144 };
145 if gid == 0 {
146 None
147 } else {
148 Some(gid)
149 }
150 }
151}
152
153fn parse_format4(data: &[u8], offset: usize) -> Result<Format4, FontError> {
154 let r = Reader::new(data);
155 // format(2) + length(2) + language(2) + segCountX2(2)
156 if offset + 14 > data.len() {
157 return Err(FontError::MalformedTable("cmap format 4"));
158 }
159
160 let seg_count_x2 = r.u16(offset + 6)? as usize;
161 let seg_count = seg_count_x2 / 2;
162 // skip searchRange(2) + entrySelector(2) + rangeShift(2)
163 let end_codes_offset = offset + 14;
164 // After endCodes there is a reservedPad(2), then startCodes.
165 let start_codes_offset = end_codes_offset + seg_count_x2 + 2;
166 let id_delta_offset = start_codes_offset + seg_count_x2;
167 let id_range_offset = id_delta_offset + seg_count_x2;
168
169 let mut end_codes = Vec::with_capacity(seg_count);
170 let mut start_codes = Vec::with_capacity(seg_count);
171 let mut id_deltas = Vec::with_capacity(seg_count);
172 let mut id_range_offsets = Vec::with_capacity(seg_count);
173
174 for i in 0..seg_count {
175 end_codes.push(r.u16(end_codes_offset + i * 2)?);
176 start_codes.push(r.u16(start_codes_offset + i * 2)?);
177 id_deltas.push(r.i16(id_delta_offset + i * 2)?);
178 id_range_offsets.push(r.u16(id_range_offset + i * 2)?);
179 }
180
181 // Everything after idRangeOffset is the glyphIdArray.
182 let glyph_array_offset = id_range_offset + seg_count_x2;
183 let remaining_bytes = data.len().saturating_sub(glyph_array_offset);
184 let num_glyph_indices = remaining_bytes / 2;
185 let mut glyph_indices = Vec::with_capacity(num_glyph_indices);
186 for i in 0..num_glyph_indices {
187 glyph_indices.push(r.u16(glyph_array_offset + i * 2)?);
188 }
189
190 Ok(Format4 {
191 end_codes,
192 start_codes,
193 id_deltas,
194 id_range_offsets,
195 glyph_indices,
196 })
197}
198
199fn lookup_format4(f4: &Format4, codepoint: u32) -> u16 {
200 if codepoint > 0xFFFF {
201 return 0;
202 }
203 let cp = codepoint as u16;
204
205 for i in 0..f4.end_codes.len() {
206 if cp > f4.end_codes[i] {
207 continue;
208 }
209 if cp < f4.start_codes[i] {
210 return 0;
211 }
212
213 if f4.id_range_offsets[i] == 0 {
214 // Use delta.
215 return (cp as i32 + f4.id_deltas[i] as i32) as u16;
216 }
217
218 // Use range offset into glyphIdArray.
219 // The offset is relative to the position of idRangeOffset[i] in the data.
220 // index = idRangeOffset[i]/2 + (cp - startCode[i]) - segCount + i
221 let range_offset = f4.id_range_offsets[i] as usize;
222 let seg_count = f4.end_codes.len();
223 let idx = range_offset / 2 + (cp - f4.start_codes[i]) as usize;
224 // idx is relative to position of idRangeOffset[i], which is at
225 // range_offset_base + i*2 in the original data. We need to convert
226 // to an index into our glyph_indices array.
227 // The glyph_indices array starts at range_offset_base + seg_count*2.
228 // So the array index = idx - seg_count + i
229 let array_idx = idx.wrapping_sub(seg_count).wrapping_add(i);
230 if array_idx < f4.glyph_indices.len() {
231 let gid = f4.glyph_indices[array_idx];
232 if gid == 0 {
233 return 0;
234 }
235 return (gid as i32 + f4.id_deltas[i] as i32) as u16;
236 }
237
238 return 0;
239 }
240
241 0
242}
243
244fn parse_format12(data: &[u8], offset: usize) -> Result<Format12, FontError> {
245 let r = Reader::new(data);
246 // format(2) + reserved(2) + length(4) + language(4) + numGroups(4)
247 if offset + 16 > data.len() {
248 return Err(FontError::MalformedTable("cmap format 12"));
249 }
250
251 let num_groups = r.u32(offset + 12)? as usize;
252 let groups_offset = offset + 16;
253
254 let mut groups = Vec::with_capacity(num_groups);
255 for i in 0..num_groups {
256 let base = groups_offset + i * 12;
257 groups.push(SequentialMapGroup {
258 start_char: r.u32(base)?,
259 end_char: r.u32(base + 4)?,
260 start_glyph: r.u32(base + 8)?,
261 });
262 }
263
264 Ok(Format12 { groups })
265}
266
267fn lookup_format12(f12: &Format12, codepoint: u32) -> u16 {
268 // Binary search for the group containing codepoint.
269 let mut lo = 0usize;
270 let mut hi = f12.groups.len();
271 while lo < hi {
272 let mid = lo + (hi - lo) / 2;
273 let group = &f12.groups[mid];
274 if codepoint < group.start_char {
275 hi = mid;
276 } else if codepoint > group.end_char {
277 lo = mid + 1;
278 } else {
279 // Found it.
280 let gid = group.start_glyph + (codepoint - group.start_char);
281 return gid as u16;
282 }
283 }
284 0
285}