Write on the margins of the internet. Powered by the AT Protocol.
margin.at
extension
web
atproto
comments
1interface TextNodeIndex {
2 start: number;
3 node: Text;
4 length: number;
5}
6
7interface TextPoint {
8 node: Text;
9 offset: number;
10}
11
12export class DOMTextMatcher {
13 private textNodes: Text[] = [];
14 private corpus = '';
15 private indices: TextNodeIndex[] = [];
16 private built = false;
17 private isPDF = false;
18
19 constructor() {}
20
21 reset(): void {
22 this.textNodes = [];
23 this.corpus = '';
24 this.indices = [];
25 this.built = false;
26 this.isPDF = false;
27 }
28
29 private ensureBuilt(): void {
30 if (!this.built) {
31 this.buildMap();
32 this.built = true;
33 }
34 }
35
36 private buildMap(): void {
37 this.isPDF = this.detectPDF();
38
39 const root = this.isPDF ? this.getPDFTextRoot() : document.body;
40
41 if (!root) return;
42
43 const walker = document.createTreeWalker(root, NodeFilter.SHOW_TEXT, {
44 acceptNode: (node: Text) => {
45 if (!node.parentNode) return NodeFilter.FILTER_REJECT;
46 const parent = node.parentNode as Element;
47 const tag = parent.tagName;
48 if (['SCRIPT', 'STYLE', 'NOSCRIPT', 'TEXTAREA', 'INPUT'].includes(tag)) {
49 return NodeFilter.FILTER_REJECT;
50 }
51 if (!node.textContent?.trim()) return NodeFilter.FILTER_SKIP;
52 const htmlParent = parent as HTMLElement;
53 if (htmlParent.style) {
54 const display = htmlParent.style.display;
55 const visibility = htmlParent.style.visibility;
56 if (display === 'none' || visibility === 'hidden') {
57 return NodeFilter.FILTER_REJECT;
58 }
59 }
60 if (this.isPDF && !this.isInPDFTextLayer(parent)) {
61 return NodeFilter.FILTER_REJECT;
62 }
63 return NodeFilter.FILTER_ACCEPT;
64 },
65 });
66
67 let currentNode: Text | null;
68 let index = 0;
69 const parts: string[] = [];
70
71 while ((currentNode = walker.nextNode() as Text | null)) {
72 const text = currentNode.textContent || '';
73 this.textNodes.push(currentNode);
74 parts.push(text);
75 this.indices.push({
76 start: index,
77 node: currentNode,
78 length: text.length,
79 });
80 index += text.length;
81 }
82
83 this.corpus = parts.join('');
84 }
85
86 private detectPDF(): boolean {
87 if (document.querySelector('.pdfViewer') || document.querySelector('#viewer.pdfViewer')) {
88 return true;
89 }
90 if (document.querySelectorAll('.textLayer span').length > 0) {
91 return true;
92 }
93 if (/\.pdf(\?|#|$)/i.test(window.location.href)) {
94 return true;
95 }
96 return false;
97 }
98
99 private getPDFTextRoot(): Element | null {
100 const viewer =
101 document.querySelector('.pdfViewer') || document.querySelector('#viewerContainer');
102 if (viewer) return viewer;
103 return document.body;
104 }
105
106 private isInPDFTextLayer(element: Element): boolean {
107 if (!this.isPDF) return true;
108 let node: Element | null = element;
109 while (node) {
110 if (node.classList?.contains('textLayer')) return true;
111 node = node.parentElement;
112 }
113 return false;
114 }
115
116 findRange(searchText: string, prefix?: string, suffix?: string): Range | null {
117 if (!searchText) return null;
118
119 this.ensureBuilt();
120
121 let matchIndex = this.corpus.indexOf(searchText);
122 if (matchIndex !== -1) {
123 return this.createRange(matchIndex, matchIndex + searchText.length);
124 }
125
126 const normalizedSearch = searchText.replace(/\s+/g, ' ').trim();
127 const normalizedCorpus = this.corpus.replace(/\s+/g, ' ');
128 matchIndex = normalizedCorpus.indexOf(normalizedSearch);
129 if (matchIndex !== -1) {
130 const originalPos = this.mapNormalizedToOriginal(matchIndex, normalizedSearch.length);
131 if (originalPos) {
132 return this.createRange(originalPos.start, originalPos.end);
133 }
134 }
135
136 if (prefix || suffix) {
137 const contextMatch = this.findWithContext(searchText, prefix, suffix);
138 if (contextMatch) {
139 return this.createRange(contextMatch.start, contextMatch.end);
140 }
141 }
142
143 const fuzzyMatch = this.fuzzyFindInCorpus(searchText);
144 if (fuzzyMatch) {
145 return this.createRange(fuzzyMatch.start, fuzzyMatch.end);
146 }
147
148 const substringMatch = this.findLongestSubstring(searchText);
149 if (substringMatch) {
150 return this.createRange(substringMatch.start, substringMatch.end);
151 }
152
153 return null;
154 }
155
156 private mapNormalizedToOriginal(
157 normalizedIndex: number,
158 normalizedLength: number
159 ): { start: number; end: number } | null {
160 let normPos = 0;
161 let origPos = 0;
162 let startOrig = -1;
163
164 while (origPos < this.corpus.length && normPos <= normalizedIndex + normalizedLength) {
165 if (normPos === normalizedIndex) {
166 startOrig = origPos;
167 }
168
169 const char = this.corpus[origPos];
170
171 if (/\s/.test(char)) {
172 while (origPos + 1 < this.corpus.length && /\s/.test(this.corpus[origPos + 1])) {
173 origPos++;
174 }
175 normPos++;
176 } else {
177 normPos++;
178 }
179 origPos++;
180
181 if (normPos === normalizedIndex + normalizedLength) {
182 if (startOrig !== -1) {
183 return { start: startOrig, end: origPos };
184 }
185 }
186 }
187 return null;
188 }
189
190 private findWithContext(
191 searchText: string,
192 prefix?: string,
193 suffix?: string
194 ): { start: number; end: number } | null {
195 const normalizedText = searchText.replace(/\s+/g, ' ').trim();
196 const corpusLower = this.corpus.toLowerCase();
197 const textLower = normalizedText.toLowerCase();
198
199 let searchStart = 0;
200 let bestMatch: { start: number; end: number; score: number } | null = null;
201
202 while (searchStart < corpusLower.length) {
203 const idx = corpusLower.indexOf(textLower, searchStart);
204 if (idx === -1) break;
205
206 let score = 0;
207
208 if (prefix) {
209 const prefixNorm = prefix.replace(/\s+/g, ' ').trim().toLowerCase();
210 const contextBefore = corpusLower.slice(Math.max(0, idx - prefixNorm.length - 10), idx);
211 if (contextBefore.includes(prefixNorm)) {
212 score += prefixNorm.length;
213 }
214 }
215
216 if (suffix) {
217 const suffixNorm = suffix.replace(/\s+/g, ' ').trim().toLowerCase();
218 const contextAfter = corpusLower.slice(
219 idx + textLower.length,
220 idx + textLower.length + suffixNorm.length + 10
221 );
222 if (contextAfter.includes(suffixNorm)) {
223 score += suffixNorm.length;
224 }
225 }
226
227 if (!bestMatch || score > bestMatch.score) {
228 bestMatch = { start: idx, end: idx + normalizedText.length, score };
229 }
230
231 searchStart = idx + 1;
232 }
233
234 return bestMatch && bestMatch.score > 0 ? bestMatch : null;
235 }
236
237 private fuzzyFindInCorpus(searchText: string): { start: number; end: number } | null {
238 const searchWords = searchText
239 .trim()
240 .split(/\s+/)
241 .filter((w) => w.length > 0);
242 if (searchWords.length === 0) return null;
243
244 const corpusLower = this.corpus.toLowerCase();
245 const firstWord = searchWords[0].toLowerCase();
246 let searchStart = 0;
247
248 while (searchStart < corpusLower.length) {
249 const wordStart = corpusLower.indexOf(firstWord, searchStart);
250 if (wordStart === -1) break;
251
252 let corpusPos = wordStart;
253 let matched = true;
254 let lastMatchEnd = wordStart;
255
256 for (const word of searchWords) {
257 const wordLower = word.toLowerCase();
258 while (corpusPos < corpusLower.length && /\s/.test(this.corpus[corpusPos])) {
259 corpusPos++;
260 }
261 const corpusSlice = corpusLower.slice(corpusPos, corpusPos + wordLower.length);
262 if (corpusSlice !== wordLower) {
263 matched = false;
264 break;
265 }
266 corpusPos += wordLower.length;
267 lastMatchEnd = corpusPos;
268 }
269
270 if (matched) {
271 return { start: wordStart, end: lastMatchEnd };
272 }
273 searchStart = wordStart + 1;
274 }
275
276 return null;
277 }
278
279 private findLongestSubstring(searchText: string): { start: number; end: number } | null {
280 if (searchText.length < 20) return null;
281
282 const minLen = Math.floor(searchText.length * 0.6);
283 const corpusLower = this.corpus.toLowerCase();
284 const searchLower = searchText.toLowerCase();
285
286 for (let len = searchLower.length; len >= minLen; len--) {
287 const sub = searchLower.slice(0, len);
288 const idx = corpusLower.indexOf(sub);
289 if (idx !== -1) {
290 return { start: idx, end: idx + len };
291 }
292 }
293
294 for (let len = searchLower.length; len >= minLen; len--) {
295 const sub = searchLower.slice(searchLower.length - len);
296 const idx = corpusLower.indexOf(sub);
297 if (idx !== -1) {
298 return { start: idx, end: idx + len };
299 }
300 }
301
302 return null;
303 }
304
305 private createRange(start: number, end: number): Range | null {
306 const startPoint = this.mapIndexToPoint(start);
307 const endPoint = this.mapIndexToPoint(end);
308
309 if (startPoint && endPoint) {
310 const range = document.createRange();
311 range.setStart(startPoint.node, startPoint.offset);
312 range.setEnd(endPoint.node, endPoint.offset);
313 return range;
314 }
315 return null;
316 }
317
318 private mapIndexToPoint(corpusIndex: number): TextPoint | null {
319 if (this.indices.length === 0) return null;
320
321 let lo = 0;
322 let hi = this.indices.length - 1;
323
324 while (lo <= hi) {
325 const mid = (lo + hi) >>> 1;
326 const info = this.indices[mid];
327
328 if (corpusIndex < info.start) {
329 hi = mid - 1;
330 } else if (corpusIndex >= info.start + info.length) {
331 lo = mid + 1;
332 } else {
333 return { node: info.node, offset: corpusIndex - info.start };
334 }
335 }
336 if (this.indices.length > 0) {
337 const last = this.indices[this.indices.length - 1];
338 if (corpusIndex === last.start + last.length) {
339 return { node: last.node, offset: last.length };
340 }
341 }
342
343 return null;
344 }
345}
346
347export function findCanonicalUrl(range: Range): string | null {
348 let node: Node | null = range.commonAncestorContainer;
349 if (node.nodeType === Node.TEXT_NODE) {
350 node = node.parentNode;
351 }
352
353 while (node && node !== document.body) {
354 const element = node as Element;
355 if (
356 (element.tagName === 'BLOCKQUOTE' || element.tagName === 'Q') &&
357 element.hasAttribute('cite')
358 ) {
359 if (element.contains(range.commonAncestorContainer)) {
360 return element.getAttribute('cite');
361 }
362 }
363 node = node.parentNode;
364 }
365 return null;
366}