a tool for shared writing and social publishing

de-duplicate publications and documents

+122
+122
src/utils/deduplicateRecords.ts
/**
 * Utilities for deduplicating records that may exist under both
 * pub.leaflet.* and site.standard.* namespaces.
 *
 * After the migration to site.standard.*, records can exist in both namespaces
 * with the same DID and rkey. This utility deduplicates them, preferring
 * site.standard.* records when available.
 */

import { AtUri } from "@atproto/syntax";

/** Collection (NSID) prefix that marks a record as living in the preferred namespace. */
const SITE_STANDARD_PREFIX = "site.standard.";

/**
 * Extracts the identity key (DID + rkey) from an AT URI.
 * This key uniquely identifies a record across namespaces.
 *
 * Returns `null` when the URI cannot be parsed or carries no record key; such
 * URIs do not identify a single record and therefore must not be merged with
 * anything else.
 *
 * @example
 * getRecordIdentityKey("at://did:plc:abc/pub.leaflet.document/3abc")
 * // Returns: "did:plc:abc/3abc"
 *
 * getRecordIdentityKey("at://did:plc:abc/site.standard.document/3abc")
 * // Returns: "did:plc:abc/3abc" (same key, different namespace)
 */
function getRecordIdentityKey(uri: string): string | null {
  try {
    const parsed = new AtUri(uri);
    // Without an rkey every URI of the same repo would share the key "<did>/"
    // and be collapsed into one entry, so treat it as "no identity".
    if (!parsed.rkey) {
      return null;
    }
    return `${parsed.host}/${parsed.rkey}`;
  } catch {
    return null;
  }
}

/**
 * Checks if a URI is from the site.standard namespace.
 *
 * The check is made against the parsed collection segment rather than the raw
 * string, so a pub.leaflet.* record whose rkey merely starts with
 * "site.standard." is not misclassified. Unparseable URIs yield `false`.
 */
function isSiteStandardUri(uri: string): boolean {
  try {
    return new AtUri(uri).collection.startsWith(SITE_STANDARD_PREFIX);
  } catch {
    return false;
  }
}

/**
 * Shared implementation behind both exported deduplication functions.
 *
 * Walks the input once and builds the output in place:
 * - a record whose URI yields no identity key is appended unchanged;
 * - the first record seen for a key is appended and its output slot remembered;
 * - a later record with the same key replaces the remembered slot only when it
 *   is a site.standard record and the one currently kept is not.
 *
 * The output therefore follows the order of first occurrence in the input.
 */
function dedupePreferringSiteStandard<T extends { uri: string }>(
  records: readonly T[]
): T[] {
  const output: T[] = [];
  const slotByKey = new Map<string, number>();

  for (const record of records) {
    const key = getRecordIdentityKey(record.uri);
    if (key === null) {
      // Invalid URI, keep the record as-is
      output.push(record);
      continue;
    }

    const slot = slotByKey.get(key);
    if (slot === undefined) {
      slotByKey.set(key, output.length);
      output.push(record);
      continue;
    }

    // Prefer site.standard records over pub.leaflet records. If both are in
    // the same namespace, or the kept one is already site.standard, the
    // earlier record wins.
    const kept = output[slot];
    if (
      kept !== undefined &&
      isSiteStandardUri(record.uri) &&
      !isSiteStandardUri(kept.uri)
    ) {
      output[slot] = record;
    }
  }

  return output;
}

/**
 * Deduplicates an array of records that have a `uri` property.
 *
 * When records exist under both pub.leaflet.* and site.standard.* namespaces
 * (same DID and rkey), this function keeps only the site.standard version.
 * Records whose `uri` cannot be parsed as a record-level AT URI are never
 * merged and are returned unchanged. The input array is not mutated.
 *
 * @param records - Array of records with a `uri` property
 * @returns Deduplicated array, preferring site.standard records
 *
 * @example
 * const docs = [
 *   { uri: "at://did:plc:abc/pub.leaflet.document/3abc", data: {...} },
 *   { uri: "at://did:plc:abc/site.standard.document/3abc", data: {...} },
 *   { uri: "at://did:plc:abc/pub.leaflet.document/3def", data: {...} },
 * ];
 * const deduped = deduplicateByUri(docs);
 * // Returns: [
 * //   { uri: "at://did:plc:abc/site.standard.document/3abc", data: {...} },
 * //   { uri: "at://did:plc:abc/pub.leaflet.document/3def", data: {...} },
 * // ]
 */
export function deduplicateByUri<T extends { uri: string }>(records: T[]): T[] {
  return dedupePreferringSiteStandard(records);
}

/**
 * Deduplicates records while preserving the original order based on the first
 * occurrence of each unique record.
 *
 * Same deduplication logic as deduplicateByUri: a site.standard record that
 * replaces an earlier pub.leaflet record takes over that earlier record's
 * position, and records with unparseable URIs stay where they were.
 *
 * @param records - Array of records with a `uri` property
 * @returns Deduplicated array in original order, preferring site.standard records
 */
export function deduplicateByUriOrdered<T extends { uri: string }>(
  records: T[]
): T[] {
  return dedupePreferringSiteStandard(records);
}