forked from
j4ck.xyz/tweets2bsky
A simple tool which lets you scrape twitter accounts and crosspost them to bluesky accounts! Comes with a CLI and a webapp for managing profiles! Works with images/videos/link embeds/threads.
1import fs from 'node:fs';
2import path from 'node:path';
3import { fileURLToPath } from 'node:url';
4import Database from 'better-sqlite3';
5
6const __filename = fileURLToPath(import.meta.url);
7const __dirname = path.dirname(__filename);
8
9const DB_DIR = path.join(__dirname, '..', 'data');
10if (!fs.existsSync(DB_DIR)) {
11 fs.mkdirSync(DB_DIR);
12}
13
14const db = new Database(path.join(DB_DIR, 'database.sqlite'));
15
16// Enable WAL mode for better concurrency
17db.pragma('journal_mode = WAL');
18
19// --- Migration Support ---
20const tableInfo = db.prepare('PRAGMA table_info(processed_tweets)').all() as any[];
21
22if (tableInfo.length > 0) {
23 let schemaChanged = false;
24 const hasBskyIdentifier = tableInfo.some((col) => col.name === 'bsky_identifier');
25 const hasTweetText = tableInfo.some((col) => col.name === 'tweet_text');
26 const hasTailUri = tableInfo.some((col) => col.name === 'bsky_tail_uri');
27
28 if (!hasBskyIdentifier || !hasTweetText || !hasTailUri) {
29 console.log('🔄 Upgrading database schema...');
30
31 // SQLite doesn't support easy PK changes, so we recreate the table if identifier is missing
32 // Or if we just need to add a column, we can do ALTER TABLE if it's not the PK.
33 // However, since we might need to do both or one, let's just do the full migration pattern
34 // to be safe and consistent.
35
36 db.transaction(() => {
37 // 1. Rename existing table
38 db.exec(`ALTER TABLE processed_tweets RENAME TO processed_tweets_old;`);
39
40 // 2. Create new table with all columns
41 db.exec(`
42 CREATE TABLE processed_tweets (
43 twitter_id TEXT NOT NULL,
44 twitter_username TEXT NOT NULL,
45 bsky_identifier TEXT NOT NULL,
46 tweet_text TEXT,
47 bsky_uri TEXT,
48 bsky_cid TEXT,
49 bsky_root_uri TEXT,
50 bsky_root_cid TEXT,
51 bsky_tail_uri TEXT,
52 bsky_tail_cid TEXT,
53 status TEXT NOT NULL,
54 created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
55 PRIMARY KEY (twitter_id, bsky_identifier)
56 );
57 `);
58
59 // 3. Migrate data
60 // Handle the case where the old table might not have had bsky_identifier
61 const oldColumns = tableInfo.map((c) => c.name);
62
63 // Construct the SELECT part based on available old columns
64 // If old table didn't have bsky_identifier, we default to 'unknown'
65 const identifierSelect = oldColumns.includes('bsky_identifier') ? 'bsky_identifier' : "'unknown'";
66
67 // If old table didn't have tweet_text, we default to NULL
68 const textSelect = oldColumns.includes('tweet_text') ? 'tweet_text' : "NULL";
69
70 const tailUriSelect = oldColumns.includes('bsky_tail_uri') ? 'bsky_tail_uri' : "NULL";
71 const tailCidSelect = oldColumns.includes('bsky_tail_cid') ? 'bsky_tail_cid' : "NULL";
72
73 db.exec(`
74 INSERT INTO processed_tweets (
75 twitter_id,
76 twitter_username,
77 bsky_identifier,
78 tweet_text,
79 bsky_uri,
80 bsky_cid,
81 bsky_root_uri,
82 bsky_root_cid,
83 bsky_tail_uri,
84 bsky_tail_cid,
85 status,
86 created_at
87 )
88 SELECT
89 twitter_id,
90 twitter_username,
91 ${identifierSelect},
92 ${textSelect},
93 bsky_uri,
94 bsky_cid,
95 bsky_root_uri,
96 bsky_root_cid,
97 ${tailUriSelect},
98 ${tailCidSelect},
99 status,
100 created_at
101 FROM processed_tweets_old;
102 `);
103
104 // 4. Drop old table
105 db.exec(`DROP TABLE processed_tweets_old;`);
106 })();
107 console.log('✅ Database upgraded successfully.');
108 }
109} else {
110 // Initialize fresh schema
111 db.exec(`
112 CREATE TABLE IF NOT EXISTS processed_tweets (
113 twitter_id TEXT NOT NULL,
114 twitter_username TEXT NOT NULL,
115 bsky_identifier TEXT NOT NULL,
116 tweet_text TEXT,
117 bsky_uri TEXT,
118 bsky_cid TEXT,
119 bsky_root_uri TEXT,
120 bsky_root_cid TEXT,
121 bsky_tail_uri TEXT,
122 bsky_tail_cid TEXT,
123 status TEXT NOT NULL, -- 'migrated', 'skipped', 'failed'
124 created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
125 PRIMARY KEY (twitter_id, bsky_identifier)
126 );
127 `);
128}
129
130db.exec(`
131 CREATE INDEX IF NOT EXISTS idx_twitter_username ON processed_tweets(twitter_username);
132 CREATE INDEX IF NOT EXISTS idx_bsky_identifier ON processed_tweets(bsky_identifier);
133`);
134
135export interface ProcessedTweet {
136 twitter_id: string;
137 twitter_username: string;
138 bsky_identifier: string;
139 tweet_text?: string;
140 bsky_uri?: string;
141 bsky_cid?: string;
142 bsky_root_uri?: string;
143 bsky_root_cid?: string;
144 bsky_tail_uri?: string;
145 bsky_tail_cid?: string;
146 status: 'migrated' | 'skipped' | 'failed';
147 created_at?: string;
148}
149
150export interface ProcessedTweetSearchResult extends ProcessedTweet {
151 score: number;
152}
153
154function normalizeSearchValue(value: string): string {
155 return value
156 .toLowerCase()
157 .replace(/[^a-z0-9@#._\-\s]+/g, ' ')
158 .replace(/\s+/g, ' ')
159 .trim();
160}
161
162function tokenizeSearchValue(value: string): string[] {
163 if (!value) {
164 return [];
165 }
166 return value.split(' ').filter((token) => token.length > 0);
167}
168
169function orderedSubsequenceScore(query: string, candidate: string): number {
170 if (!query || !candidate) {
171 return 0;
172 }
173
174 let matched = 0;
175 let searchIndex = 0;
176 for (const char of query) {
177 const foundIndex = candidate.indexOf(char, searchIndex);
178 if (foundIndex === -1) {
179 continue;
180 }
181 matched += 1;
182 searchIndex = foundIndex + 1;
183 }
184
185 return matched / query.length;
186}
187
188function buildBigrams(value: string): Set<string> {
189 const result = new Set<string>();
190 if (value.length < 2) {
191 if (value.length === 1) {
192 result.add(value);
193 }
194 return result;
195 }
196
197 for (let i = 0; i < value.length - 1; i += 1) {
198 result.add(value.slice(i, i + 2));
199 }
200
201 return result;
202}
203
204function diceCoefficient(a: string, b: string): number {
205 const aBigrams = buildBigrams(a);
206 const bBigrams = buildBigrams(b);
207 if (aBigrams.size === 0 || bBigrams.size === 0) {
208 return 0;
209 }
210
211 let overlap = 0;
212 for (const gram of aBigrams) {
213 if (bBigrams.has(gram)) {
214 overlap += 1;
215 }
216 }
217
218 return (2 * overlap) / (aBigrams.size + bBigrams.size);
219}
220
221function scoreCandidateField(query: string, tokens: string[], candidateValue?: string): number {
222 const candidate = normalizeSearchValue(candidateValue || '');
223 if (!query || !candidate) {
224 return 0;
225 }
226
227 let score = 0;
228 if (candidate === query) {
229 score += 170;
230 } else if (candidate.startsWith(query)) {
231 score += 140;
232 } else if (candidate.includes(query)) {
233 score += 112;
234 }
235
236 let matchedTokens = 0;
237 for (const token of tokens) {
238 if (candidate.includes(token)) {
239 matchedTokens += 1;
240 score += token.length >= 4 ? 18 : 12;
241 }
242 }
243
244 if (tokens.length > 0) {
245 score += (matchedTokens / tokens.length) * 48;
246 }
247
248 score += orderedSubsequenceScore(query, candidate) * 46;
249 score += diceCoefficient(query, candidate) * 55;
250
251 return score;
252}
253
254function scoreProcessedTweet(tweet: ProcessedTweet, query: string, tokens: string[]): number {
255 const usernameScore = scoreCandidateField(query, tokens, tweet.twitter_username) * 1.25;
256 const identifierScore = scoreCandidateField(query, tokens, tweet.bsky_identifier) * 1.18;
257 const textScore = scoreCandidateField(query, tokens, tweet.tweet_text) * 0.98;
258 const idScore = scoreCandidateField(query, tokens, tweet.twitter_id) * 0.72;
259
260 const maxScore = Math.max(usernameScore, identifierScore, textScore, idScore);
261 const blendedScore = maxScore + (usernameScore + identifierScore + textScore + idScore - maxScore) * 0.22;
262
263 const recencyBoost = (() => {
264 if (!tweet.created_at) return 0;
265 const timestamp = Date.parse(tweet.created_at);
266 if (!Number.isFinite(timestamp)) return 0;
267 const ageDays = (Date.now() - timestamp) / (24 * 60 * 60 * 1000);
268 return Math.max(0, 7 - ageDays);
269 })();
270
271 return blendedScore + recencyBoost;
272}
273
274export const dbService = {
275 getTweet(twitterId: string, bskyIdentifier: string): ProcessedTweet | null {
276 const stmt = db.prepare('SELECT * FROM processed_tweets WHERE twitter_id = ? AND bsky_identifier = ?');
277 const row = stmt.get(twitterId, bskyIdentifier) as any;
278 if (!row) return null;
279 return {
280 twitter_id: row.twitter_id,
281 twitter_username: row.twitter_username,
282 bsky_identifier: row.bsky_identifier,
283 tweet_text: row.tweet_text,
284 bsky_uri: row.bsky_uri,
285 bsky_cid: row.bsky_cid,
286 bsky_root_uri: row.bsky_root_uri,
287 bsky_root_cid: row.bsky_root_cid,
288 bsky_tail_uri: row.bsky_tail_uri,
289 bsky_tail_cid: row.bsky_tail_cid,
290 status: row.status,
291 created_at: row.created_at
292 };
293 },
294
295 saveTweet(tweet: ProcessedTweet) {
296 const stmt = db.prepare(`
297 INSERT OR REPLACE INTO processed_tweets
298 (twitter_id, twitter_username, bsky_identifier, tweet_text, bsky_uri, bsky_cid, bsky_root_uri, bsky_root_cid, bsky_tail_uri, bsky_tail_cid, status)
299 VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
300 `);
301 stmt.run(
302 tweet.twitter_id,
303 tweet.twitter_username,
304 tweet.bsky_identifier,
305 tweet.tweet_text || null,
306 tweet.bsky_uri || null,
307 tweet.bsky_cid || null,
308 tweet.bsky_root_uri || null,
309 tweet.bsky_root_cid || null,
310 tweet.bsky_tail_uri || null,
311 tweet.bsky_tail_cid || null,
312 tweet.status,
313 );
314 },
315
316 getTweetsByBskyIdentifier(bskyIdentifier: string): Record<string, any> {
317 const stmt = db.prepare('SELECT * FROM processed_tweets WHERE bsky_identifier = ?');
318 const rows = stmt.all(bskyIdentifier.toLowerCase()) as any[];
319 const map: Record<string, any> = {};
320 for (const row of rows) {
321 map[row.twitter_id] = {
322 uri: row.bsky_uri,
323 cid: row.bsky_cid,
324 root: row.bsky_root_uri ? { uri: row.bsky_root_uri, cid: row.bsky_root_cid } : undefined,
325 tail: (row.bsky_tail_uri && row.bsky_tail_cid) ? { uri: row.bsky_tail_uri, cid: row.bsky_tail_cid } : undefined,
326 migrated: row.status === 'migrated',
327 skipped: row.status === 'skipped',
328 };
329 }
330 return map;
331 },
332
333 getTweetsByUsername(username: string): Record<string, any> {
334 const stmt = db.prepare('SELECT * FROM processed_tweets WHERE twitter_username = ?');
335 const rows = stmt.all(username.toLowerCase()) as any[];
336 const map: Record<string, any> = {};
337 for (const row of rows) {
338 map[row.twitter_id] = {
339 uri: row.bsky_uri,
340 cid: row.bsky_cid,
341 root: row.bsky_root_uri ? { uri: row.bsky_root_uri, cid: row.bsky_root_cid } : undefined,
342 tail: (row.bsky_tail_uri && row.bsky_tail_cid) ? { uri: row.bsky_tail_uri, cid: row.bsky_tail_cid } : undefined,
343 migrated: row.status === 'migrated',
344 skipped: row.status === 'skipped'
345 };
346 }
347 return map;
348 },
349
350 getRecentProcessedTweets(limit = 50): ProcessedTweet[] {
351 const stmt = db.prepare('SELECT * FROM processed_tweets ORDER BY datetime(created_at) DESC, rowid DESC LIMIT ?');
352 return stmt.all(limit) as ProcessedTweet[];
353 },
354
355 searchMigratedTweets(query: string, limit = 60, scanLimit = 3000): ProcessedTweetSearchResult[] {
356 const normalizedQuery = normalizeSearchValue(query || '');
357 if (!normalizedQuery) {
358 return [];
359 }
360
361 const safeLimit = Number.isFinite(limit) ? Math.max(1, Math.min(limit, 200)) : 60;
362 const safeScanLimit = Number.isFinite(scanLimit) ? Math.max(safeLimit, Math.min(scanLimit, 8000)) : 3000;
363 const tokens = tokenizeSearchValue(normalizedQuery);
364
365 const stmt = db.prepare(
366 'SELECT * FROM processed_tweets WHERE status = "migrated" ORDER BY datetime(created_at) DESC, rowid DESC LIMIT ?',
367 );
368 const rows = stmt.all(safeScanLimit) as ProcessedTweet[];
369
370 return rows
371 .map((row) => ({
372 ...row,
373 score: scoreProcessedTweet(row, normalizedQuery, tokens),
374 }))
375 .filter((row) => row.score >= 22)
376 .sort((a, b) => {
377 if (b.score !== a.score) {
378 return b.score - a.score;
379 }
380 const aTime = a.created_at ? Date.parse(a.created_at) : 0;
381 const bTime = b.created_at ? Date.parse(b.created_at) : 0;
382 return (Number.isFinite(bTime) ? bTime : 0) - (Number.isFinite(aTime) ? aTime : 0);
383 })
384 .slice(0, safeLimit);
385 },
386
387 deleteTweetsByUsername(username: string) {
388 const stmt = db.prepare('DELETE FROM processed_tweets WHERE twitter_username = ?');
389 stmt.run(username.toLowerCase());
390 },
391
392 deleteTweetsByBskyIdentifier(bskyIdentifier: string) {
393 const stmt = db.prepare('DELETE FROM processed_tweets WHERE bsky_identifier = ?');
394 stmt.run(bskyIdentifier.toLowerCase());
395 },
396
397 repairUnknownIdentifiers(twitterUsername: string, bskyIdentifier: string) {
398 const stmt = db.prepare(
399 'UPDATE processed_tweets SET bsky_identifier = ? WHERE bsky_identifier = "unknown" AND twitter_username = ?',
400 );
401 stmt.run(bskyIdentifier.toLowerCase(), twitterUsername.toLowerCase());
402 },
403
404 clearAll() {
405 db.prepare('DELETE FROM processed_tweets').run();
406 },
407};