Write on the margins of the internet. Powered by the AT Protocol. margin.at
extension web atproto comments
at main 366 lines 10 kB view raw
1interface TextNodeIndex { 2 start: number; 3 node: Text; 4 length: number; 5} 6 7interface TextPoint { 8 node: Text; 9 offset: number; 10} 11 12export class DOMTextMatcher { 13 private textNodes: Text[] = []; 14 private corpus = ''; 15 private indices: TextNodeIndex[] = []; 16 private built = false; 17 private isPDF = false; 18 19 constructor() {} 20 21 reset(): void { 22 this.textNodes = []; 23 this.corpus = ''; 24 this.indices = []; 25 this.built = false; 26 this.isPDF = false; 27 } 28 29 private ensureBuilt(): void { 30 if (!this.built) { 31 this.buildMap(); 32 this.built = true; 33 } 34 } 35 36 private buildMap(): void { 37 this.isPDF = this.detectPDF(); 38 39 const root = this.isPDF ? this.getPDFTextRoot() : document.body; 40 41 if (!root) return; 42 43 const walker = document.createTreeWalker(root, NodeFilter.SHOW_TEXT, { 44 acceptNode: (node: Text) => { 45 if (!node.parentNode) return NodeFilter.FILTER_REJECT; 46 const parent = node.parentNode as Element; 47 const tag = parent.tagName; 48 if (['SCRIPT', 'STYLE', 'NOSCRIPT', 'TEXTAREA', 'INPUT'].includes(tag)) { 49 return NodeFilter.FILTER_REJECT; 50 } 51 if (!node.textContent?.trim()) return NodeFilter.FILTER_SKIP; 52 const htmlParent = parent as HTMLElement; 53 if (htmlParent.style) { 54 const display = htmlParent.style.display; 55 const visibility = htmlParent.style.visibility; 56 if (display === 'none' || visibility === 'hidden') { 57 return NodeFilter.FILTER_REJECT; 58 } 59 } 60 if (this.isPDF && !this.isInPDFTextLayer(parent)) { 61 return NodeFilter.FILTER_REJECT; 62 } 63 return NodeFilter.FILTER_ACCEPT; 64 }, 65 }); 66 67 let currentNode: Text | null; 68 let index = 0; 69 const parts: string[] = []; 70 71 while ((currentNode = walker.nextNode() as Text | null)) { 72 const text = currentNode.textContent || ''; 73 this.textNodes.push(currentNode); 74 parts.push(text); 75 this.indices.push({ 76 start: index, 77 node: currentNode, 78 length: text.length, 79 }); 80 index += text.length; 81 } 82 83 this.corpus = parts.join(''); 84 } 85 86 private detectPDF(): boolean { 87 if (document.querySelector('.pdfViewer') || document.querySelector('#viewer.pdfViewer')) { 88 return true; 89 } 90 if (document.querySelectorAll('.textLayer span').length > 0) { 91 return true; 92 } 93 if (/\.pdf(\?|#|$)/i.test(window.location.href)) { 94 return true; 95 } 96 return false; 97 } 98 99 private getPDFTextRoot(): Element | null { 100 const viewer = 101 document.querySelector('.pdfViewer') || document.querySelector('#viewerContainer'); 102 if (viewer) return viewer; 103 return document.body; 104 } 105 106 private isInPDFTextLayer(element: Element): boolean { 107 if (!this.isPDF) return true; 108 let node: Element | null = element; 109 while (node) { 110 if (node.classList?.contains('textLayer')) return true; 111 node = node.parentElement; 112 } 113 return false; 114 } 115 116 findRange(searchText: string, prefix?: string, suffix?: string): Range | null { 117 if (!searchText) return null; 118 119 this.ensureBuilt(); 120 121 let matchIndex = this.corpus.indexOf(searchText); 122 if (matchIndex !== -1) { 123 return this.createRange(matchIndex, matchIndex + searchText.length); 124 } 125 126 const normalizedSearch = searchText.replace(/\s+/g, ' ').trim(); 127 const normalizedCorpus = this.corpus.replace(/\s+/g, ' '); 128 matchIndex = normalizedCorpus.indexOf(normalizedSearch); 129 if (matchIndex !== -1) { 130 const originalPos = this.mapNormalizedToOriginal(matchIndex, normalizedSearch.length); 131 if (originalPos) { 132 return this.createRange(originalPos.start, originalPos.end); 133 } 134 } 135 136 if (prefix || suffix) { 137 const contextMatch = this.findWithContext(searchText, prefix, suffix); 138 if (contextMatch) { 139 return this.createRange(contextMatch.start, contextMatch.end); 140 } 141 } 142 143 const fuzzyMatch = this.fuzzyFindInCorpus(searchText); 144 if (fuzzyMatch) { 145 return this.createRange(fuzzyMatch.start, fuzzyMatch.end); 146 } 147 148 const substringMatch = this.findLongestSubstring(searchText); 149 if (substringMatch) { 150 return this.createRange(substringMatch.start, substringMatch.end); 151 } 152 153 return null; 154 } 155 156 private mapNormalizedToOriginal( 157 normalizedIndex: number, 158 normalizedLength: number 159 ): { start: number; end: number } | null { 160 let normPos = 0; 161 let origPos = 0; 162 let startOrig = -1; 163 164 while (origPos < this.corpus.length && normPos <= normalizedIndex + normalizedLength) { 165 if (normPos === normalizedIndex) { 166 startOrig = origPos; 167 } 168 169 const char = this.corpus[origPos]; 170 171 if (/\s/.test(char)) { 172 while (origPos + 1 < this.corpus.length && /\s/.test(this.corpus[origPos + 1])) { 173 origPos++; 174 } 175 normPos++; 176 } else { 177 normPos++; 178 } 179 origPos++; 180 181 if (normPos === normalizedIndex + normalizedLength) { 182 if (startOrig !== -1) { 183 return { start: startOrig, end: origPos }; 184 } 185 } 186 } 187 return null; 188 } 189 190 private findWithContext( 191 searchText: string, 192 prefix?: string, 193 suffix?: string 194 ): { start: number; end: number } | null { 195 const normalizedText = searchText.replace(/\s+/g, ' ').trim(); 196 const corpusLower = this.corpus.toLowerCase(); 197 const textLower = normalizedText.toLowerCase(); 198 199 let searchStart = 0; 200 let bestMatch: { start: number; end: number; score: number } | null = null; 201 202 while (searchStart < corpusLower.length) { 203 const idx = corpusLower.indexOf(textLower, searchStart); 204 if (idx === -1) break; 205 206 let score = 0; 207 208 if (prefix) { 209 const prefixNorm = prefix.replace(/\s+/g, ' ').trim().toLowerCase(); 210 const contextBefore = corpusLower.slice(Math.max(0, idx - prefixNorm.length - 10), idx); 211 if (contextBefore.includes(prefixNorm)) { 212 score += prefixNorm.length; 213 } 214 } 215 216 if (suffix) { 217 const suffixNorm = suffix.replace(/\s+/g, ' ').trim().toLowerCase(); 218 const contextAfter = corpusLower.slice( 219 idx + textLower.length, 220 idx + textLower.length + suffixNorm.length + 10 221 ); 222 if (contextAfter.includes(suffixNorm)) { 223 score += suffixNorm.length; 224 } 225 } 226 227 if (!bestMatch || score > bestMatch.score) { 228 bestMatch = { start: idx, end: idx + normalizedText.length, score }; 229 } 230 231 searchStart = idx + 1; 232 } 233 234 return bestMatch && bestMatch.score > 0 ? bestMatch : null; 235 } 236 237 private fuzzyFindInCorpus(searchText: string): { start: number; end: number } | null { 238 const searchWords = searchText 239 .trim() 240 .split(/\s+/) 241 .filter((w) => w.length > 0); 242 if (searchWords.length === 0) return null; 243 244 const corpusLower = this.corpus.toLowerCase(); 245 const firstWord = searchWords[0].toLowerCase(); 246 let searchStart = 0; 247 248 while (searchStart < corpusLower.length) { 249 const wordStart = corpusLower.indexOf(firstWord, searchStart); 250 if (wordStart === -1) break; 251 252 let corpusPos = wordStart; 253 let matched = true; 254 let lastMatchEnd = wordStart; 255 256 for (const word of searchWords) { 257 const wordLower = word.toLowerCase(); 258 while (corpusPos < corpusLower.length && /\s/.test(this.corpus[corpusPos])) { 259 corpusPos++; 260 } 261 const corpusSlice = corpusLower.slice(corpusPos, corpusPos + wordLower.length); 262 if (corpusSlice !== wordLower) { 263 matched = false; 264 break; 265 } 266 corpusPos += wordLower.length; 267 lastMatchEnd = corpusPos; 268 } 269 270 if (matched) { 271 return { start: wordStart, end: lastMatchEnd }; 272 } 273 searchStart = wordStart + 1; 274 } 275 276 return null; 277 } 278 279 private findLongestSubstring(searchText: string): { start: number; end: number } | null { 280 if (searchText.length < 20) return null; 281 282 const minLen = Math.floor(searchText.length * 0.6); 283 const corpusLower = this.corpus.toLowerCase(); 284 const searchLower = searchText.toLowerCase(); 285 286 for (let len = searchLower.length; len >= minLen; len--) { 287 const sub = searchLower.slice(0, len); 288 const idx = corpusLower.indexOf(sub); 289 if (idx !== -1) { 290 return { start: idx, end: idx + len }; 291 } 292 } 293 294 for (let len = searchLower.length; len >= minLen; len--) { 295 const sub = searchLower.slice(searchLower.length - len); 296 const idx = corpusLower.indexOf(sub); 297 if (idx !== -1) { 298 return { start: idx, end: idx + len }; 299 } 300 } 301 302 return null; 303 } 304 305 private createRange(start: number, end: number): Range | null { 306 const startPoint = this.mapIndexToPoint(start); 307 const endPoint = this.mapIndexToPoint(end); 308 309 if (startPoint && endPoint) { 310 const range = document.createRange(); 311 range.setStart(startPoint.node, startPoint.offset); 312 range.setEnd(endPoint.node, endPoint.offset); 313 return range; 314 } 315 return null; 316 } 317 318 private mapIndexToPoint(corpusIndex: number): TextPoint | null { 319 if (this.indices.length === 0) return null; 320 321 let lo = 0; 322 let hi = this.indices.length - 1; 323 324 while (lo <= hi) { 325 const mid = (lo + hi) >>> 1; 326 const info = this.indices[mid]; 327 328 if (corpusIndex < info.start) { 329 hi = mid - 1; 330 } else if (corpusIndex >= info.start + info.length) { 331 lo = mid + 1; 332 } else { 333 return { node: info.node, offset: corpusIndex - info.start }; 334 } 335 } 336 if (this.indices.length > 0) { 337 const last = this.indices[this.indices.length - 1]; 338 if (corpusIndex === last.start + last.length) { 339 return { node: last.node, offset: last.length }; 340 } 341 } 342 343 return null; 344 } 345} 346 347export function findCanonicalUrl(range: Range): string | null { 348 let node: Node | null = range.commonAncestorContainer; 349 if (node.nodeType === Node.TEXT_NODE) { 350 node = node.parentNode; 351 } 352 353 while (node && node !== document.body) { 354 const element = node as Element; 355 if ( 356 (element.tagName === 'BLOCKQUOTE' || element.tagName === 'Q') && 357 element.hasAttribute('cite') 358 ) { 359 if (element.contains(range.commonAncestorContainer)) { 360 return element.getAttribute('cite'); 361 } 362 } 363 node = node.parentNode; 364 } 365 return null; 366}