extension/src/utils/text-matcher.ts at main

margin.at / margin
fork atom
Write on the margins of the internet. Powered by the AT Protocol. margin.at
extension web atproto comments
fork atom
margin / extension / src / utils / text-matcher.ts
at main 366 lines 10 kB view raw
wrap content
scanash.com v0.3.0 - PDF support, we now bundle pdf.js - You can now turn highlights into annotations - Improvements and bug fixes 4w ago
a83bcdb0
  1interface TextNodeIndex {
  2  start: number;
  3  node: Text;
  4  length: number;
  5}
  6
  7interface TextPoint {
  8  node: Text;
  9  offset: number;
 10}
 11
 12export class DOMTextMatcher {
 13  private textNodes: Text[] = [];
 14  private corpus = '';
 15  private indices: TextNodeIndex[] = [];
 16  private built = false;
 17  private isPDF = false;
 18
 19  constructor() {}
 20
 21  reset(): void {
 22    this.textNodes = [];
 23    this.corpus = '';
 24    this.indices = [];
 25    this.built = false;
 26    this.isPDF = false;
 27  }
 28
 29  private ensureBuilt(): void {
 30    if (!this.built) {
 31      this.buildMap();
 32      this.built = true;
 33    }
 34  }
 35
 36  private buildMap(): void {
 37    this.isPDF = this.detectPDF();
 38
 39    const root = this.isPDF ? this.getPDFTextRoot() : document.body;
 40
 41    if (!root) return;
 42
 43    const walker = document.createTreeWalker(root, NodeFilter.SHOW_TEXT, {
 44      acceptNode: (node: Text) => {
 45        if (!node.parentNode) return NodeFilter.FILTER_REJECT;
 46        const parent = node.parentNode as Element;
 47        const tag = parent.tagName;
 48        if (['SCRIPT', 'STYLE', 'NOSCRIPT', 'TEXTAREA', 'INPUT'].includes(tag)) {
 49          return NodeFilter.FILTER_REJECT;
 50        }
 51        if (!node.textContent?.trim()) return NodeFilter.FILTER_SKIP;
 52        const htmlParent = parent as HTMLElement;
 53        if (htmlParent.style) {
 54          const display = htmlParent.style.display;
 55          const visibility = htmlParent.style.visibility;
 56          if (display === 'none' || visibility === 'hidden') {
 57            return NodeFilter.FILTER_REJECT;
 58          }
 59        }
 60        if (this.isPDF && !this.isInPDFTextLayer(parent)) {
 61          return NodeFilter.FILTER_REJECT;
 62        }
 63        return NodeFilter.FILTER_ACCEPT;
 64      },
 65    });
 66
 67    let currentNode: Text | null;
 68    let index = 0;
 69    const parts: string[] = [];
 70
 71    while ((currentNode = walker.nextNode() as Text | null)) {
 72      const text = currentNode.textContent || '';
 73      this.textNodes.push(currentNode);
 74      parts.push(text);
 75      this.indices.push({
 76        start: index,
 77        node: currentNode,
 78        length: text.length,
 79      });
 80      index += text.length;
 81    }
 82
 83    this.corpus = parts.join('');
 84  }
 85
 86  private detectPDF(): boolean {
 87    if (document.querySelector('.pdfViewer') || document.querySelector('#viewer.pdfViewer')) {
 88      return true;
 89    }
 90    if (document.querySelectorAll('.textLayer span').length > 0) {
 91      return true;
 92    }
 93    if (/\.pdf(\?|#|$)/i.test(window.location.href)) {
 94      return true;
 95    }
 96    return false;
 97  }
 98
 99  private getPDFTextRoot(): Element | null {
100    const viewer =
101      document.querySelector('.pdfViewer') || document.querySelector('#viewerContainer');
102    if (viewer) return viewer;
103    return document.body;
104  }
105
106  private isInPDFTextLayer(element: Element): boolean {
107    if (!this.isPDF) return true;
108    let node: Element | null = element;
109    while (node) {
110      if (node.classList?.contains('textLayer')) return true;
111      node = node.parentElement;
112    }
113    return false;
114  }
115
116  findRange(searchText: string, prefix?: string, suffix?: string): Range | null {
117    if (!searchText) return null;
118
119    this.ensureBuilt();
120
121    let matchIndex = this.corpus.indexOf(searchText);
122    if (matchIndex !== -1) {
123      return this.createRange(matchIndex, matchIndex + searchText.length);
124    }
125
126    const normalizedSearch = searchText.replace(/\s+/g, ' ').trim();
127    const normalizedCorpus = this.corpus.replace(/\s+/g, ' ');
128    matchIndex = normalizedCorpus.indexOf(normalizedSearch);
129    if (matchIndex !== -1) {
130      const originalPos = this.mapNormalizedToOriginal(matchIndex, normalizedSearch.length);
131      if (originalPos) {
132        return this.createRange(originalPos.start, originalPos.end);
133      }
134    }
135
136    if (prefix || suffix) {
137      const contextMatch = this.findWithContext(searchText, prefix, suffix);
138      if (contextMatch) {
139        return this.createRange(contextMatch.start, contextMatch.end);
140      }
141    }
142
143    const fuzzyMatch = this.fuzzyFindInCorpus(searchText);
144    if (fuzzyMatch) {
145      return this.createRange(fuzzyMatch.start, fuzzyMatch.end);
146    }
147
148    const substringMatch = this.findLongestSubstring(searchText);
149    if (substringMatch) {
150      return this.createRange(substringMatch.start, substringMatch.end);
151    }
152
153    return null;
154  }
155
156  private mapNormalizedToOriginal(
157    normalizedIndex: number,
158    normalizedLength: number
159  ): { start: number; end: number } | null {
160    let normPos = 0;
161    let origPos = 0;
162    let startOrig = -1;
163
164    while (origPos < this.corpus.length && normPos <= normalizedIndex + normalizedLength) {
165      if (normPos === normalizedIndex) {
166        startOrig = origPos;
167      }
168
169      const char = this.corpus[origPos];
170
171      if (/\s/.test(char)) {
172        while (origPos + 1 < this.corpus.length && /\s/.test(this.corpus[origPos + 1])) {
173          origPos++;
174        }
175        normPos++;
176      } else {
177        normPos++;
178      }
179      origPos++;
180
181      if (normPos === normalizedIndex + normalizedLength) {
182        if (startOrig !== -1) {
183          return { start: startOrig, end: origPos };
184        }
185      }
186    }
187    return null;
188  }
189
190  private findWithContext(
191    searchText: string,
192    prefix?: string,
193    suffix?: string
194  ): { start: number; end: number } | null {
195    const normalizedText = searchText.replace(/\s+/g, ' ').trim();
196    const corpusLower = this.corpus.toLowerCase();
197    const textLower = normalizedText.toLowerCase();
198
199    let searchStart = 0;
200    let bestMatch: { start: number; end: number; score: number } | null = null;
201
202    while (searchStart < corpusLower.length) {
203      const idx = corpusLower.indexOf(textLower, searchStart);
204      if (idx === -1) break;
205
206      let score = 0;
207
208      if (prefix) {
209        const prefixNorm = prefix.replace(/\s+/g, ' ').trim().toLowerCase();
210        const contextBefore = corpusLower.slice(Math.max(0, idx - prefixNorm.length - 10), idx);
211        if (contextBefore.includes(prefixNorm)) {
212          score += prefixNorm.length;
213        }
214      }
215
216      if (suffix) {
217        const suffixNorm = suffix.replace(/\s+/g, ' ').trim().toLowerCase();
218        const contextAfter = corpusLower.slice(
219          idx + textLower.length,
220          idx + textLower.length + suffixNorm.length + 10
221        );
222        if (contextAfter.includes(suffixNorm)) {
223          score += suffixNorm.length;
224        }
225      }
226
227      if (!bestMatch || score > bestMatch.score) {
228        bestMatch = { start: idx, end: idx + normalizedText.length, score };
229      }
230
231      searchStart = idx + 1;
232    }
233
234    return bestMatch && bestMatch.score > 0 ? bestMatch : null;
235  }
236
237  private fuzzyFindInCorpus(searchText: string): { start: number; end: number } | null {
238    const searchWords = searchText
239      .trim()
240      .split(/\s+/)
241      .filter((w) => w.length > 0);
242    if (searchWords.length === 0) return null;
243
244    const corpusLower = this.corpus.toLowerCase();
245    const firstWord = searchWords[0].toLowerCase();
246    let searchStart = 0;
247
248    while (searchStart < corpusLower.length) {
249      const wordStart = corpusLower.indexOf(firstWord, searchStart);
250      if (wordStart === -1) break;
251
252      let corpusPos = wordStart;
253      let matched = true;
254      let lastMatchEnd = wordStart;
255
256      for (const word of searchWords) {
257        const wordLower = word.toLowerCase();
258        while (corpusPos < corpusLower.length && /\s/.test(this.corpus[corpusPos])) {
259          corpusPos++;
260        }
261        const corpusSlice = corpusLower.slice(corpusPos, corpusPos + wordLower.length);
262        if (corpusSlice !== wordLower) {
263          matched = false;
264          break;
265        }
266        corpusPos += wordLower.length;
267        lastMatchEnd = corpusPos;
268      }
269
270      if (matched) {
271        return { start: wordStart, end: lastMatchEnd };
272      }
273      searchStart = wordStart + 1;
274    }
275
276    return null;
277  }
278
279  private findLongestSubstring(searchText: string): { start: number; end: number } | null {
280    if (searchText.length < 20) return null;
281
282    const minLen = Math.floor(searchText.length * 0.6);
283    const corpusLower = this.corpus.toLowerCase();
284    const searchLower = searchText.toLowerCase();
285
286    for (let len = searchLower.length; len >= minLen; len--) {
287      const sub = searchLower.slice(0, len);
288      const idx = corpusLower.indexOf(sub);
289      if (idx !== -1) {
290        return { start: idx, end: idx + len };
291      }
292    }
293
294    for (let len = searchLower.length; len >= minLen; len--) {
295      const sub = searchLower.slice(searchLower.length - len);
296      const idx = corpusLower.indexOf(sub);
297      if (idx !== -1) {
298        return { start: idx, end: idx + len };
299      }
300    }
301
302    return null;
303  }
304
305  private createRange(start: number, end: number): Range | null {
306    const startPoint = this.mapIndexToPoint(start);
307    const endPoint = this.mapIndexToPoint(end);
308
309    if (startPoint && endPoint) {
310      const range = document.createRange();
311      range.setStart(startPoint.node, startPoint.offset);
312      range.setEnd(endPoint.node, endPoint.offset);
313      return range;
314    }
315    return null;
316  }
317
318  private mapIndexToPoint(corpusIndex: number): TextPoint | null {
319    if (this.indices.length === 0) return null;
320
321    let lo = 0;
322    let hi = this.indices.length - 1;
323
324    while (lo <= hi) {
325      const mid = (lo + hi) >>> 1;
326      const info = this.indices[mid];
327
328      if (corpusIndex < info.start) {
329        hi = mid - 1;
330      } else if (corpusIndex >= info.start + info.length) {
331        lo = mid + 1;
332      } else {
333        return { node: info.node, offset: corpusIndex - info.start };
334      }
335    }
336    if (this.indices.length > 0) {
337      const last = this.indices[this.indices.length - 1];
338      if (corpusIndex === last.start + last.length) {
339        return { node: last.node, offset: last.length };
340      }
341    }
342
343    return null;
344  }
345}
346
347export function findCanonicalUrl(range: Range): string | null {
348  let node: Node | null = range.commonAncestorContainer;
349  if (node.nodeType === Node.TEXT_NODE) {
350    node = node.parentNode;
351  }
352
353  while (node && node !== document.body) {
354    const element = node as Element;
355    if (
356      (element.tagName === 'BLOCKQUOTE' || element.tagName === 'Q') &&
357      element.hasAttribute('cite')
358    ) {
359      if (element.contains(range.commonAncestorContainer)) {
360        return element.getAttribute('cite');
361      }
362    }
363    node = node.parentNode;
364  }
365  return null;
366}