// web engine - experimental web browser
// (at main · 285 lines · 9.1 kB)
1//! `cmap` — Character to Glyph Index Mapping table. 2//! 3//! Maps Unicode code points to glyph indices. Supports format 4 (BMP) and 4//! format 12 (full Unicode). 5//! Reference: <https://learn.microsoft.com/en-us/typography/opentype/spec/cmap> 6 7use crate::font::parse::Reader; 8use crate::font::FontError; 9 10/// Parsed `cmap` table. 11#[derive(Debug)] 12pub struct CmapTable { 13 /// The best subtable we found (preferring format 12 over format 4). 14 subtable: CmapSubtable, 15} 16 17#[derive(Debug)] 18enum CmapSubtable { 19 Format4(Format4), 20 Format12(Format12), 21} 22 23/// cmap format 4: Segment mapping to delta values (BMP only). 24#[derive(Debug)] 25struct Format4 { 26 /// Parallel arrays defining segments. 27 end_codes: Vec<u16>, 28 start_codes: Vec<u16>, 29 id_deltas: Vec<i16>, 30 id_range_offsets: Vec<u16>, 31 /// The raw glyph index array following the segments. 32 glyph_indices: Vec<u16>, 33} 34 35/// cmap format 12: Segmented coverage for the full Unicode range. 36#[derive(Debug)] 37struct Format12 { 38 groups: Vec<SequentialMapGroup>, 39} 40 41#[derive(Debug)] 42struct SequentialMapGroup { 43 start_char: u32, 44 end_char: u32, 45 start_glyph: u32, 46} 47 48impl CmapTable { 49 /// Parse the `cmap` table from raw bytes. 50 /// 51 /// Selects the best available subtable: 52 /// 1. Platform 3 (Windows), Encoding 10 (Unicode full) — format 12 53 /// 2. Platform 0 (Unicode), Encoding 4 (Unicode full) — format 12 54 /// 3. Platform 3 (Windows), Encoding 1 (Unicode BMP) — format 4 55 /// 4. Platform 0 (Unicode), Encoding 3 (Unicode BMP) — format 4 56 /// 5. First platform 0 or 3 subtable that parses successfully 57 pub fn parse(data: &[u8]) -> Result<CmapTable, FontError> { 58 let r = Reader::new(data); 59 if r.len() < 4 { 60 return Err(FontError::MalformedTable("cmap")); 61 } 62 63 let num_tables = r.u16(2)? as usize; 64 65 // Collect encoding records. 
66 struct EncodingRecord { 67 platform_id: u16, 68 encoding_id: u16, 69 offset: u32, 70 } 71 72 let mut records = Vec::with_capacity(num_tables); 73 for i in 0..num_tables { 74 let base = 4 + i * 8; 75 records.push(EncodingRecord { 76 platform_id: r.u16(base)?, 77 encoding_id: r.u16(base + 2)?, 78 offset: r.u32(base + 4)?, 79 }); 80 } 81 82 // Try subtables in preference order. 83 // Priority: (3,10) > (0,4) > (0,6) > (3,1) > (0,3) > (0,*) > (3,*) 84 let priority = |pid: u16, eid: u16| -> u8 { 85 match (pid, eid) { 86 (3, 10) => 0, 87 (0, 4) => 1, 88 (0, 6) => 2, 89 (3, 1) => 3, 90 (0, 3) => 4, 91 (0, _) => 5, 92 (3, _) => 6, 93 _ => 255, 94 } 95 }; 96 97 let mut best: Option<(u8, CmapSubtable)> = None; 98 99 for rec in &records { 100 let p = priority(rec.platform_id, rec.encoding_id); 101 if p == 255 { 102 continue; 103 } 104 if let Some((bp, _)) = &best { 105 if p >= *bp { 106 continue; 107 } 108 } 109 110 let offset = rec.offset as usize; 111 if offset + 2 > data.len() { 112 continue; 113 } 114 let format = r.u16(offset)?; 115 116 match format { 117 4 => { 118 if let Ok(st) = parse_format4(data, offset) { 119 best = Some((p, CmapSubtable::Format4(st))); 120 } 121 } 122 12 => { 123 if let Ok(st) = parse_format12(data, offset) { 124 best = Some((p, CmapSubtable::Format12(st))); 125 } 126 } 127 _ => {} 128 } 129 } 130 131 match best { 132 Some((_, subtable)) => Ok(CmapTable { subtable }), 133 None => Err(FontError::MalformedTable("cmap: no usable subtable")), 134 } 135 } 136 137 /// Look up a Unicode code point and return the corresponding glyph index. 138 /// 139 /// Returns `None` if the code point is not mapped (maps to glyph 0). 
140 pub fn glyph_index(&self, codepoint: u32) -> Option<u16> { 141 let gid = match &self.subtable { 142 CmapSubtable::Format4(f4) => lookup_format4(f4, codepoint), 143 CmapSubtable::Format12(f12) => lookup_format12(f12, codepoint), 144 }; 145 if gid == 0 { 146 None 147 } else { 148 Some(gid) 149 } 150 } 151} 152 153fn parse_format4(data: &[u8], offset: usize) -> Result<Format4, FontError> { 154 let r = Reader::new(data); 155 // format(2) + length(2) + language(2) + segCountX2(2) 156 if offset + 14 > data.len() { 157 return Err(FontError::MalformedTable("cmap format 4")); 158 } 159 160 let seg_count_x2 = r.u16(offset + 6)? as usize; 161 let seg_count = seg_count_x2 / 2; 162 // skip searchRange(2) + entrySelector(2) + rangeShift(2) 163 let end_codes_offset = offset + 14; 164 // After endCodes there is a reservedPad(2), then startCodes. 165 let start_codes_offset = end_codes_offset + seg_count_x2 + 2; 166 let id_delta_offset = start_codes_offset + seg_count_x2; 167 let id_range_offset = id_delta_offset + seg_count_x2; 168 169 let mut end_codes = Vec::with_capacity(seg_count); 170 let mut start_codes = Vec::with_capacity(seg_count); 171 let mut id_deltas = Vec::with_capacity(seg_count); 172 let mut id_range_offsets = Vec::with_capacity(seg_count); 173 174 for i in 0..seg_count { 175 end_codes.push(r.u16(end_codes_offset + i * 2)?); 176 start_codes.push(r.u16(start_codes_offset + i * 2)?); 177 id_deltas.push(r.i16(id_delta_offset + i * 2)?); 178 id_range_offsets.push(r.u16(id_range_offset + i * 2)?); 179 } 180 181 // Everything after idRangeOffset is the glyphIdArray. 
182 let glyph_array_offset = id_range_offset + seg_count_x2; 183 let remaining_bytes = data.len().saturating_sub(glyph_array_offset); 184 let num_glyph_indices = remaining_bytes / 2; 185 let mut glyph_indices = Vec::with_capacity(num_glyph_indices); 186 for i in 0..num_glyph_indices { 187 glyph_indices.push(r.u16(glyph_array_offset + i * 2)?); 188 } 189 190 Ok(Format4 { 191 end_codes, 192 start_codes, 193 id_deltas, 194 id_range_offsets, 195 glyph_indices, 196 }) 197} 198 199fn lookup_format4(f4: &Format4, codepoint: u32) -> u16 { 200 if codepoint > 0xFFFF { 201 return 0; 202 } 203 let cp = codepoint as u16; 204 205 for i in 0..f4.end_codes.len() { 206 if cp > f4.end_codes[i] { 207 continue; 208 } 209 if cp < f4.start_codes[i] { 210 return 0; 211 } 212 213 if f4.id_range_offsets[i] == 0 { 214 // Use delta. 215 return (cp as i32 + f4.id_deltas[i] as i32) as u16; 216 } 217 218 // Use range offset into glyphIdArray. 219 // The offset is relative to the position of idRangeOffset[i] in the data. 220 // index = idRangeOffset[i]/2 + (cp - startCode[i]) - segCount + i 221 let range_offset = f4.id_range_offsets[i] as usize; 222 let seg_count = f4.end_codes.len(); 223 let idx = range_offset / 2 + (cp - f4.start_codes[i]) as usize; 224 // idx is relative to position of idRangeOffset[i], which is at 225 // range_offset_base + i*2 in the original data. We need to convert 226 // to an index into our glyph_indices array. 227 // The glyph_indices array starts at range_offset_base + seg_count*2. 
228 // So the array index = idx - seg_count + i 229 let array_idx = idx.wrapping_sub(seg_count).wrapping_add(i); 230 if array_idx < f4.glyph_indices.len() { 231 let gid = f4.glyph_indices[array_idx]; 232 if gid == 0 { 233 return 0; 234 } 235 return (gid as i32 + f4.id_deltas[i] as i32) as u16; 236 } 237 238 return 0; 239 } 240 241 0 242} 243 244fn parse_format12(data: &[u8], offset: usize) -> Result<Format12, FontError> { 245 let r = Reader::new(data); 246 // format(2) + reserved(2) + length(4) + language(4) + numGroups(4) 247 if offset + 16 > data.len() { 248 return Err(FontError::MalformedTable("cmap format 12")); 249 } 250 251 let num_groups = r.u32(offset + 12)? as usize; 252 let groups_offset = offset + 16; 253 254 let mut groups = Vec::with_capacity(num_groups); 255 for i in 0..num_groups { 256 let base = groups_offset + i * 12; 257 groups.push(SequentialMapGroup { 258 start_char: r.u32(base)?, 259 end_char: r.u32(base + 4)?, 260 start_glyph: r.u32(base + 8)?, 261 }); 262 } 263 264 Ok(Format12 { groups }) 265} 266 267fn lookup_format12(f12: &Format12, codepoint: u32) -> u16 { 268 // Binary search for the group containing codepoint. 269 let mut lo = 0usize; 270 let mut hi = f12.groups.len(); 271 while lo < hi { 272 let mid = lo + (hi - lo) / 2; 273 let group = &f12.groups[mid]; 274 if codepoint < group.start_char { 275 hi = mid; 276 } else if codepoint > group.end_char { 277 lo = mid + 1; 278 } else { 279 // Found it. 280 let gid = group.start_glyph + (codepoint - group.start_char); 281 return gid as u16; 282 } 283 } 284 0 285}