tangled
alpha
login
or
join now
indexx.dev
/
tweets2bsky
forked from
j4ck.xyz/tweets2bsky
0
fork
atom
A simple tool which lets you scrape twitter accounts and crosspost them to bluesky accounts! Comes with a CLI and a webapp for managing profiles! Works with images/videos/link embeds/threads.
0
fork
atom
overview
issues
pulls
pipelines
feat: include card media when available
jack
1 month ago
a072cde6
9a1e7580
+880
-507
1 changed file
expand all
collapse all
unified
split
src
index.ts
+880
-507
src/index.ts
···
2
import fs from 'node:fs';
3
import path from 'node:path';
4
import { fileURLToPath } from 'node:url';
5
-
import { BskyAgent, RichText } from '@atproto/api';
6
import type { BlobRef } from '@atproto/api';
7
import { Scraper } from '@the-convocation/twitter-scraper';
8
import type { Tweet as ScraperTweet } from '@the-convocation/twitter-scraper';
9
import axios from 'axios';
0
10
import { Command } from 'commander';
11
import * as francModule from 'franc-min';
12
import iso6391 from 'iso-639-1';
13
import puppeteer from 'puppeteer-core';
14
-
import * as cheerio from 'cheerio';
15
import sharp from 'sharp';
16
import { generateAltText } from './ai-manager.js';
17
···
21
const __filename = fileURLToPath(import.meta.url);
22
const __dirname = path.dirname(__filename);
23
24
-
// ============================================================================
25
// Type Definitions
26
-
// ============================================================================
27
28
interface ProcessedTweetEntry {
29
uri?: string;
···
44
expanded_url?: string;
45
}
46
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
47
interface MediaSize {
48
w: number;
49
h: number;
···
78
sizes?: MediaSizes;
79
original_info?: OriginalInfo;
80
video_info?: VideoInfo;
0
81
}
82
83
interface TweetEntities {
···
105
screen_name?: string;
106
id_str?: string;
107
};
0
0
108
}
109
110
interface AspectRatio {
···
120
121
import { dbService } from './db.js';
122
123
-
// ============================================================================
124
// State Management
125
-
// ============================================================================
126
127
const PROCESSED_DIR = path.join(__dirname, '..', 'processed');
128
129
async function migrateJsonToSqlite() {
130
if (!fs.existsSync(PROCESSED_DIR)) return;
131
-
132
-
const files = fs.readdirSync(PROCESSED_DIR).filter(f => f.endsWith('.json'));
133
if (files.length === 0) return;
134
135
console.log(`📦 Found ${files.length} legacy cache files. Migrating to SQLite...`);
136
const config = getConfig();
137
-
138
for (const file of files) {
139
const username = file.replace('.json', '').toLowerCase();
140
// Try to find a matching bskyIdentifier from config
141
-
const mapping = config.mappings.find(m => m.twitterUsernames.map(u => u.toLowerCase()).includes(username));
142
const bskyIdentifier = mapping?.bskyIdentifier || 'unknown';
143
144
try {
145
const filePath = path.join(PROCESSED_DIR, file);
146
const data = JSON.parse(fs.readFileSync(filePath, 'utf8')) as ProcessedTweetsMap;
147
-
148
for (const [twitterId, entry] of Object.entries(data)) {
149
dbService.saveTweet({
150
twitter_id: twitterId,
···
154
bsky_cid: entry.cid,
155
bsky_root_uri: entry.root?.uri,
156
bsky_root_cid: entry.root?.cid,
157
-
status: entry.migrated ? 'migrated' : (entry.skipped ? 'skipped' : 'failed')
158
});
159
}
160
// Move file to backup
···
172
dbService.repairUnknownIdentifiers(username, mapping.bskyIdentifier);
173
}
174
}
175
-
176
console.log('✅ Migration complete.');
177
}
178
···
180
return dbService.getTweetsByBskyIdentifier(bskyIdentifier);
181
}
182
183
-
function saveProcessedTweet(twitterUsername: string, bskyIdentifier: string, twitterId: string, entry: ProcessedTweetEntry): void {
0
0
0
0
0
184
dbService.saveTweet({
185
twitter_id: twitterId,
186
twitter_username: twitterUsername.toLowerCase(),
···
192
bsky_root_cid: entry.root?.cid,
193
bsky_tail_uri: entry.tail?.uri,
194
bsky_tail_cid: entry.tail?.cid,
195
-
status: entry.migrated || (entry.uri && entry.cid) ? 'migrated' : (entry.skipped ? 'skipped' : 'failed')
196
});
197
}
198
199
-
// ============================================================================
200
// Custom Twitter Client
201
-
// ============================================================================
202
203
let scraper: Scraper | null = null;
204
let currentTwitterCookies = { authToken: '', ct0: '' };
···
216
}
217
218
if (!authToken || !ct0) return null;
219
-
220
// Re-initialize if config changed, not yet initialized, or forced reset
221
-
if (
222
-
!scraper ||
223
-
forceReset ||
224
-
currentTwitterCookies.authToken !== authToken ||
225
-
currentTwitterCookies.ct0 !== ct0
226
-
) {
227
console.log(`🔄 Initializing Twitter scraper with ${useBackupCredentials ? 'BACKUP' : 'PRIMARY'} credentials...`);
228
scraper = new Scraper();
229
-
await scraper.setCookies([
230
-
`auth_token=${authToken}`,
231
-
`ct0=${ct0}`
232
-
]);
233
234
-
currentTwitterCookies = {
235
-
authToken: authToken,
236
-
ct0: ct0
237
};
238
}
239
return scraper;
···
247
await getTwitterScraper(true);
248
return true;
249
}
250
-
console.log("⚠️ No backup credentials available to switch to.");
251
return false;
252
}
253
254
function mapScraperTweetToLocalTweet(scraperTweet: ScraperTweet): Tweet {
255
-
const raw = scraperTweet.__raw_UNSTABLE;
256
-
if (!raw) {
257
-
// Fallback if raw data is missing (shouldn't happen for timeline tweets usually)
258
-
return {
259
-
id: scraperTweet.id,
260
-
id_str: scraperTweet.id,
261
-
text: scraperTweet.text,
262
-
full_text: scraperTweet.text,
263
-
isRetweet: scraperTweet.isRetweet,
264
-
// Construct minimal entities from parsed data
265
-
entities: {
266
-
urls: scraperTweet.urls.map((url: string) => ({ url, expanded_url: url })),
267
-
media: scraperTweet.photos.map((p: any) => ({
268
-
url: p.url,
269
-
expanded_url: p.url,
270
-
media_url_https: p.url,
271
-
type: 'photo',
272
-
ext_alt_text: p.alt_text,
273
-
})),
274
-
},
275
-
created_at: scraperTweet.timeParsed?.toUTCString()
276
-
};
277
-
}
278
-
279
return {
280
-
id: raw.id_str,
281
-
id_str: raw.id_str,
282
-
text: raw.full_text,
283
-
full_text: raw.full_text,
284
-
created_at: raw.created_at,
285
isRetweet: scraperTweet.isRetweet,
286
-
// biome-ignore lint/suspicious/noExplicitAny: raw types match compatible structure
287
-
entities: raw.entities as any,
288
-
// biome-ignore lint/suspicious/noExplicitAny: raw types match compatible structure
289
-
extended_entities: raw.extended_entities as any,
290
-
quoted_status_id_str: raw.quoted_status_id_str,
291
-
retweeted_status_id_str: raw.retweeted_status_id_str,
292
-
is_quote_status: !!raw.quoted_status_id_str,
293
-
in_reply_to_status_id_str: raw.in_reply_to_status_id_str,
294
-
// biome-ignore lint/suspicious/noExplicitAny: missing in LegacyTweetRaw type
295
-
in_reply_to_user_id_str: (raw as any).in_reply_to_user_id_str,
296
-
user: {
297
-
screen_name: scraperTweet.username,
298
-
id_str: scraperTweet.userId,
299
},
0
0
300
};
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
301
}
302
303
-
// ============================================================================
304
// Helper Functions
305
-
// ============================================================================
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
306
307
function detectLanguage(text: string): string[] {
308
if (!text || text.trim().length === 0) return ['en'];
···
335
return (response.request as any)?.res?.responseUrl || shortUrl;
336
} catch (e: any) {
337
if (e.code === 'ERR_FR_TOO_MANY_REDIRECTS' || e.response?.status === 403 || e.response?.status === 401) {
338
-
// Silent fallback for common expansion issues (redirect loops, login walls)
339
-
return shortUrl;
340
}
341
return shortUrl;
342
}
···
372
const isGif = mimeType === 'image/gif';
373
const isAnimation = isGif || isWebp;
374
375
-
if ((buffer.length > MAX_SIZE && (mimeType.startsWith('image/') || mimeType === 'application/octet-stream')) || (isPng && buffer.length > MAX_SIZE)) {
0
0
0
376
console.log(`[UPLOAD] ⚖️ Image too large (${(buffer.length / 1024).toFixed(2)} KB). Optimizing...`);
377
try {
378
let image = sharp(buffer);
···
386
while (currentBuffer.length > MAX_SIZE && attempts < 5) {
387
attempts++;
388
console.log(`[UPLOAD] 📉 Compression attempt ${attempts}: Width ${width}, Quality ${quality}...`);
389
-
390
if (isAnimation) {
391
-
// For animations (GIF/WebP), we can only do so much without losing frames
392
-
// Try to convert to WebP if it's a GIF, or optimize WebP
393
-
image = sharp(buffer, { animated: true });
394
-
if (isGif) {
395
-
// Convert GIF to WebP for better compression
396
-
image = image.webp({ quality: Math.max(quality, 50), effort: 6 });
397
-
finalMimeType = 'image/webp';
398
-
} else {
399
-
image = image.webp({ quality: Math.max(quality, 50), effort: 6 });
400
-
}
401
-
// Resize if really big
402
-
if (metadata.width && metadata.width > 800) {
403
-
image = image.resize({ width: 800, withoutEnlargement: true });
404
-
}
405
} else {
406
-
// Static images
407
-
if (width > 1600) width = 1600;
408
-
else if (attempts > 1) width = Math.floor(width * 0.8);
409
-
410
-
quality = Math.max(50, quality - 10);
411
-
412
-
image = sharp(buffer)
413
-
.resize({ width, withoutEnlargement: true })
414
-
.jpeg({ quality, mozjpeg: true });
415
-
416
-
finalMimeType = 'image/jpeg';
417
}
418
-
419
currentBuffer = await image.toBuffer();
420
if (currentBuffer.length <= MAX_SIZE) {
421
-
finalBuffer = currentBuffer;
422
-
console.log(`[UPLOAD] ✅ Optimized to ${(finalBuffer.length / 1024).toFixed(2)} KB`);
423
-
break;
424
}
425
}
426
-
427
if (finalBuffer.length > MAX_SIZE) {
428
-
console.warn(`[UPLOAD] ⚠️ Could not compress below limit. Current: ${(finalBuffer.length / 1024).toFixed(2)} KB. Upload might fail.`);
0
0
429
}
430
-
431
} catch (err) {
432
console.warn(`[UPLOAD] ⚠️ Optimization failed, attempting original upload:`, (err as Error).message);
433
finalBuffer = buffer;
···
455
'C:\\Program Files (x86)\\Google\\Chrome\\Application\\chrome.exe',
456
];
457
458
-
const executablePath = browserPaths.find(p => fs.existsSync(p));
459
-
460
if (!executablePath) {
461
console.warn(`[SCREENSHOT] ⏩ Skipping screenshot (no Chrome/Chromium found at common paths).`);
462
return null;
···
500
`;
501
502
await page.setContent(html, { waitUntil: 'networkidle0' });
503
-
504
// Wait for the twitter iframe to load and render
505
try {
506
await page.waitForSelector('iframe', { timeout: 10000 });
507
// Small extra wait for images inside iframe
508
-
await new Promise(r => setTimeout(r, 2000));
509
} catch (e) {
510
console.warn(`[SCREENSHOT] ⚠️ Timeout waiting for tweet iframe, taking screenshot anyway.`);
511
}
···
515
const box = await element.boundingBox();
516
const buffer = await element.screenshot({ type: 'png', omitBackground: true });
517
if (box) {
518
-
console.log(`[SCREENSHOT] ✅ Captured successfully (${(buffer.length / 1024).toFixed(2)} KB) - ${Math.round(box.width)}x${Math.round(box.height)}`);
519
-
return { buffer: buffer as Buffer, width: Math.round(box.width), height: Math.round(box.height) };
0
0
520
}
521
}
522
} catch (err) {
···
534
535
while (!blob) {
536
attempts++;
537
-
const statusUrl = new URL("https://video.bsky.app/xrpc/app.bsky.video.getJobStatus");
538
-
statusUrl.searchParams.append("jobId", jobId);
539
540
const statusResponse = await fetch(statusUrl);
541
if (!statusResponse.ok) {
···
553
if (statusData.jobStatus.blob) {
554
blob = statusData.jobStatus.blob;
555
console.log(`[VIDEO] 🎉 Video processing complete! Blob ref obtained.`);
556
-
} else if (state === "JOB_STATE_FAILED") {
557
-
throw new Error(`Video processing failed: ${statusData.jobStatus.error || "Unknown error"}`);
558
} else {
559
// Wait before next poll
560
await new Promise((resolve) => setTimeout(resolve, 5000));
···
562
563
if (attempts > 60) {
564
// ~5 minute timeout
565
-
throw new Error("Video processing timed out after 5 minutes.");
566
}
567
}
568
return blob!;
···
572
try {
573
const response = await axios.get(url, {
574
headers: {
575
-
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
576
-
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
0
577
'Accept-Language': 'en-US,en;q=0.9',
578
},
579
timeout: 10000,
580
maxRedirects: 5,
581
});
582
-
583
const $ = cheerio.load(response.data);
584
const title = $('meta[property="og:title"]').attr('content') || $('title').text() || '';
585
-
const description = $('meta[property="og:description"]').attr('content') || $('meta[name="description"]').attr('content') || '';
0
586
let thumbBlob: BlobRef | undefined;
587
588
let imageUrl = $('meta[property="og:image"]').attr('content');
589
if (imageUrl) {
590
-
if (!imageUrl.startsWith('http')) {
591
-
const baseUrl = new URL(url);
592
-
imageUrl = new URL(imageUrl, baseUrl.origin).toString();
593
-
}
594
-
try {
595
-
const { buffer, mimeType } = await downloadMedia(imageUrl);
596
-
thumbBlob = await uploadToBluesky(agent, buffer, mimeType);
597
-
} catch (e) {
598
-
// SIlently fail thumbnail upload
599
-
}
600
}
601
602
if (!title && !description) return null;
603
604
const external: any = {
605
-
uri: url,
606
-
title: title || url,
607
-
description: description,
608
};
609
610
if (thumbBlob) {
611
-
external.thumb = thumbBlob;
612
}
613
614
return {
615
-
$type: 'app.bsky.embed.external',
616
-
external,
617
};
618
-
619
} catch (err: any) {
620
if (err.code === 'ERR_FR_TOO_MANY_REDIRECTS') {
621
-
// Ignore redirect loops
622
-
return null;
623
}
624
console.warn(`Failed to fetch embed card for ${url}:`, err.message || err);
625
return null;
···
627
}
628
629
async function uploadVideoToBluesky(agent: BskyAgent, buffer: Buffer, filename: string): Promise<BlobRef> {
630
-
const sanitizedFilename = filename.split("?")[0] || "video.mp4";
631
console.log(
632
`[VIDEO] 🟢 Starting upload process for ${sanitizedFilename} (${(buffer.length / 1024 / 1024).toFixed(2)} MB)`,
633
);
···
640
641
// didDoc might be present in repoDesc
642
const pdsService = (repoDesc as any).didDoc?.service?.find(
643
-
(s: any) => s.id === "#atproto_pds" || s.type === "AtProtoPds",
644
);
645
const pdsUrl = pdsService?.serviceEndpoint;
646
-
const pdsHost = pdsUrl ? new URL(pdsUrl).host : "bsky.social";
647
648
console.log(`[VIDEO] 🌐 PDS Host detected: ${pdsHost}`);
649
console.log(`[VIDEO] 🔑 Requesting service auth token for audience: did:web:${pdsHost}...`);
650
651
const { data: serviceAuth } = await agent.com.atproto.server.getServiceAuth({
652
aud: `did:web:${pdsHost}`,
653
-
lxm: "com.atproto.repo.uploadBlob",
654
exp: Math.floor(Date.now() / 1000) + 60 * 30,
655
});
656
console.log(`[VIDEO] ✅ Service auth token obtained.`);
···
658
const token = serviceAuth.token;
659
660
// 2. Upload to Video Service
661
-
const uploadUrl = new URL("https://video.bsky.app/xrpc/app.bsky.video.uploadVideo");
662
-
uploadUrl.searchParams.append("did", agent.session!.did!);
663
-
uploadUrl.searchParams.append("name", sanitizedFilename);
664
665
console.log(`[VIDEO] 📤 Uploading to ${uploadUrl.href}...`);
666
const uploadResponse = await fetch(uploadUrl, {
667
-
method: "POST",
668
headers: {
669
Authorization: `Bearer ${token}`,
670
-
"Content-Type": "video/mp4",
671
},
672
body: new Blob([new Uint8Array(buffer)]),
673
});
···
678
// Handle specific error cases
679
try {
680
const errorJson = JSON.parse(errorText);
681
-
682
// Handle server overload gracefully
683
-
if (uploadResponse.status === 503 || errorJson.error === "Server does not have enough capacity to handle uploads") {
684
-
console.warn(`[VIDEO] ⚠️ Server overloaded (503). Skipping video upload and falling back to link.`);
685
-
throw new Error("VIDEO_FALLBACK_503");
0
0
0
686
}
687
688
-
if (errorJson.error === "already_exists" && errorJson.jobId) {
689
console.log(`[VIDEO] ♻️ Video already exists. Resuming with Job ID: ${errorJson.jobId}`);
690
return await pollForVideoProcessing(agent, errorJson.jobId);
691
}
692
-
if (errorJson.error === "unconfirmed_email" || (errorJson.jobStatus && errorJson.jobStatus.error === "unconfirmed_email")) {
693
-
console.error(`[VIDEO] 🛑 BLUESKY ERROR: Your email is unconfirmed. You MUST verify your email on Bluesky to upload videos.`);
694
-
throw new Error("Bluesky Email Unconfirmed - Video Upload Rejected");
0
0
0
0
0
695
}
696
} catch (e) {
697
-
if ((e as Error).message === "VIDEO_FALLBACK_503") throw e;
698
-
// Not JSON or missing fields, proceed with throwing original error
699
}
700
-
701
console.error(`[VIDEO] ❌ Server responded with ${uploadResponse.status}: ${errorText}`);
702
throw new Error(`Video upload failed: ${uploadResponse.status} ${errorText}`);
703
}
···
740
// 4. Force split
741
742
let splitIndex = -1;
743
-
744
// Check paragraphs
745
let checkIndex = remaining.lastIndexOf('\n\n', effectiveLimit);
746
if (checkIndex !== -1) splitIndex = checkIndex;
747
748
// Check sentences
749
if (splitIndex === -1) {
750
-
// Look for punctuation followed by space
751
-
const sentenceMatches = Array.from(remaining.substring(0, effectiveLimit).matchAll(/[.!?]\s/g));
752
-
if (sentenceMatches.length > 0) {
753
-
const lastMatch = sentenceMatches[sentenceMatches.length - 1];
754
-
if (lastMatch && lastMatch.index !== undefined) {
755
-
splitIndex = lastMatch.index + 1; // Include punctuation
756
-
}
757
}
0
758
}
759
760
// Check spaces
761
if (splitIndex === -1) {
762
-
checkIndex = remaining.lastIndexOf(' ', effectiveLimit);
763
-
if (checkIndex !== -1) splitIndex = checkIndex;
764
}
765
766
// Force split if no good break point found
767
if (splitIndex === -1) {
768
-
splitIndex = effectiveLimit;
769
}
770
771
chunks.push(remaining.substring(0, splitIndex).trim());
···
814
async function fetchUserTweets(username: string, limit: number, processedIds?: Set<string>): Promise<Tweet[]> {
815
const client = await getTwitterScraper();
816
if (!client) return [];
817
-
818
let retries = 3;
819
while (retries > 0) {
820
try {
821
const tweets: Tweet[] = [];
822
const generator = client.getTweets(username, limit);
823
let consecutiveProcessedCount = 0;
824
-
825
for await (const t of generator) {
826
const tweet = mapScraperTweetToLocalTweet(t);
827
const tweetId = tweet.id_str || tweet.id;
828
-
829
// Early stopping logic: if we see 3 consecutive tweets we've already processed, stop.
830
// This assumes timeline order (mostly true).
831
if (processedIds && tweetId && processedIds.has(tweetId)) {
832
-
consecutiveProcessedCount++;
833
-
if (consecutiveProcessedCount >= 3) {
834
-
console.log(`[${username}] 🛑 Found 3 consecutive processed tweets. Stopping fetch early.`);
835
-
break;
836
-
}
837
} else {
838
-
consecutiveProcessedCount = 0;
839
}
840
841
tweets.push(tweet);
···
844
return tweets;
845
} catch (e: any) {
846
retries--;
847
-
const isRetryable = e.message?.includes('ServiceUnavailable') || e.message?.includes('Timeout') || e.message?.includes('429') || e.message?.includes('401');
848
-
0
0
0
0
849
// Check for Twitter Internal Server Error (often returns 400 with specific body)
850
if (e?.response?.status === 400 && JSON.stringify(e?.response?.data || {}).includes('InternalServerError')) {
851
-
console.warn(`⚠️ Twitter Internal Server Error (Transient) for ${username}.`);
852
-
// Treat as retryable
853
-
if (retries > 0) {
854
-
await new Promise(r => setTimeout(r, 5000));
855
-
continue;
856
-
}
857
}
858
859
if (isRetryable) {
860
console.warn(`⚠️ Error fetching tweets for ${username} (${e.message}).`);
861
-
862
// Attempt credential switch if we have backups
863
if (await switchCredentials()) {
864
-
console.log(`🔄 Retrying with new credentials...`);
865
-
continue; // Retry loop with new credentials
866
}
867
868
if (retries > 0) {
869
-
console.log(`Waiting 5s before retry...`);
870
-
await new Promise(r => setTimeout(r, 5000));
871
-
continue;
872
}
873
}
874
-
875
console.warn(`Error fetching tweets for ${username}:`, e.message || e);
876
return [];
877
}
878
}
879
-
880
console.log(`[${username}] ⚠️ Scraper returned 0 tweets (or failed silently) after retries.`);
881
return [];
882
}
883
884
-
// ============================================================================
885
// Main Processing Logic
886
-
// ============================================================================
887
888
-
// ============================================================================
889
// Main Processing Logic
890
-
// ============================================================================
891
892
async function processTweets(
893
agent: BskyAgent,
···
909
});
910
911
const processedTweets = loadProcessedTweets(bskyIdentifier);
912
-
913
// Maintain a local map that updates in real-time for intra-batch replies
914
const localProcessedMap: ProcessedTweetsMap = { ...processedTweets };
915
···
936
if (isRetweet) {
937
console.log(`[${twitterUsername}] ⏩ Skipping retweet ${tweetId}.`);
938
if (!dryRun) {
939
-
// Save as skipped so we don't check it again
940
-
saveProcessedTweet(twitterUsername, bskyIdentifier, tweetId, { skipped: true, text: tweet.text });
941
-
localProcessedMap[tweetId] = { skipped: true, text: tweet.text };
942
}
943
continue;
944
}
···
967
// Parent missing from local batch/DB. Attempt to fetch it if it's a self-thread.
968
// We assume it's a self-thread if we don't have it, but we'll verify author after fetch.
969
console.log(`[${twitterUsername}] 🕵️ Parent ${replyStatusId} missing. Checking if backfillable...`);
970
-
971
let parentBackfilled = false;
972
try {
973
-
const scraper = await getTwitterScraper();
974
-
if (scraper) {
975
-
const parentRaw = await scraper.getTweet(replyStatusId);
976
-
if (parentRaw) {
977
-
const parentTweet = mapScraperTweetToLocalTweet(parentRaw);
978
-
const parentAuthor = parentTweet.user?.screen_name;
979
-
980
-
if (parentAuthor?.toLowerCase() === twitterUsername.toLowerCase()) {
981
-
console.log(`[${twitterUsername}] 🔄 Parent is ours (@${parentAuthor}). Backfilling parent first...`);
982
-
// Recursively process the parent
983
-
await processTweets(agent, twitterUsername, bskyIdentifier, [parentTweet], dryRun);
984
-
985
-
// Check if it was saved
986
-
const savedParent = dbService.getTweet(replyStatusId, bskyIdentifier);
987
-
if (savedParent && savedParent.status === 'migrated') {
988
-
// Update local map
989
-
localProcessedMap[replyStatusId] = {
990
-
uri: savedParent.bsky_uri,
991
-
cid: savedParent.bsky_cid,
992
-
root: (savedParent.bsky_root_uri && savedParent.bsky_root_cid) ? { uri: savedParent.bsky_root_uri, cid: savedParent.bsky_root_cid } : undefined,
993
-
tail: (savedParent.bsky_tail_uri && savedParent.bsky_tail_cid) ? { uri: savedParent.bsky_tail_uri, cid: savedParent.bsky_tail_cid } : undefined,
994
-
migrated: true
995
-
};
996
-
replyParentInfo = localProcessedMap[replyStatusId] ?? null;
997
-
parentBackfilled = true;
998
-
console.log(`[${twitterUsername}] ✅ Parent backfilled. Resuming thread.`);
999
-
}
1000
-
} else {
1001
-
console.log(`[${twitterUsername}] ⏩ Parent is by @${parentAuthor}. Skipping external reply.`);
1002
-
}
0
0
1003
}
0
0
0
1004
}
0
1005
} catch (e) {
1006
-
console.warn(`[${twitterUsername}] ⚠️ Failed to fetch/backfill parent ${replyStatusId}:`, e);
1007
}
1008
1009
if (!parentBackfilled) {
1010
-
console.log(`[${twitterUsername}] ⏩ Skipping external/unknown reply (Parent not found or external).`);
1011
-
if (!dryRun) {
1012
-
saveProcessedTweet(twitterUsername, bskyIdentifier, tweetId, { skipped: true, text: tweetText });
1013
-
localProcessedMap[tweetId] = { skipped: true, text: tweetText };
1014
-
}
1015
-
continue;
1016
}
1017
} else {
1018
console.log(`[${twitterUsername}] ⏩ Skipping external/unknown reply.`);
···
1025
}
1026
1027
// Removed early dryRun continue to allow verifying logic
1028
-
1029
let text = tweetText
1030
.replace(/&/g, '&')
1031
.replace(/</g, '<')
1032
.replace(/>/g, '>')
1033
.replace(/"/g, '"')
1034
.replace(/'/g, "'");
1035
-
1036
// 1. Link Expansion
1037
console.log(`[${twitterUsername}] 🔗 Expanding links...`);
1038
const urls = tweet.entities?.urls || [];
···
1047
const matches = text.match(tcoRegex) || [];
1048
for (const tco of matches) {
1049
// Avoid re-resolving if we already handled it via entities
1050
-
if (urls.some(u => u.url === tco)) continue;
1051
1052
console.log(`[${twitterUsername}] 🔍 Resolving fallback link: ${tco}`);
1053
const resolved = await expandUrl(tco);
1054
if (resolved !== tco) {
1055
-
text = text.replace(tco, resolved);
1056
-
// Add to urls array so it can be used for card embedding later
1057
-
urls.push({ url: tco, expanded_url: resolved });
0
0
0
0
0
0
0
0
0
0
0
0
1058
}
1059
}
1060
···
1072
mediaLinksToRemove.push(media.url);
1073
if (media.expanded_url) mediaLinksToRemove.push(media.expanded_url);
1074
}
1075
-
0
0
0
1076
let aspectRatio: AspectRatio | undefined;
1077
if (media.sizes?.large) {
1078
aspectRatio = { width: media.sizes.large.w, height: media.sizes.large.h };
···
1088
console.log(`[${twitterUsername}] 📥 Downloading image (high quality): ${path.basename(highQualityUrl)}`);
1089
updateAppStatus({ message: `Downloading high quality image...` });
1090
const { buffer, mimeType } = await downloadMedia(highQualityUrl);
1091
-
1092
let blob: BlobRef;
1093
if (dryRun) {
1094
-
console.log(`[${twitterUsername}] 🧪 [DRY RUN] Would upload image (${(buffer.length/1024).toFixed(2)} KB)`);
1095
-
blob = { ref: { toString: () => 'mock-blob' }, mimeType, size: buffer.length } as any;
0
0
1096
} else {
1097
-
console.log(`[${twitterUsername}] 📤 Uploading image to Bluesky...`);
1098
-
updateAppStatus({ message: `Uploading image to Bluesky...` });
1099
-
blob = await uploadToBluesky(agent, buffer, mimeType);
1100
}
1101
-
1102
let altText = media.ext_alt_text;
1103
if (!altText) {
1104
-
console.log(`[${twitterUsername}] 🤖 Generating alt text via Gemini...`);
1105
-
// Use original tweet text for context, not the modified/cleaned one
1106
-
altText = await generateAltText(buffer, mimeType, tweetText);
1107
-
if (altText) console.log(`[${twitterUsername}] ✅ Alt text generated: ${altText.substring(0, 50)}...`);
1108
}
1109
1110
images.push({ alt: altText || 'Image from Twitter', image: blob, aspectRatio });
···
1125
} else if (media.type === 'video' || media.type === 'animated_gif') {
1126
const variants = media.video_info?.variants || [];
1127
const duration = media.video_info?.duration_millis || 0;
1128
-
1129
-
if (duration > 180000) { // 3 minutes
1130
-
console.warn(`[${twitterUsername}] ⚠️ Video too long (${(duration / 1000).toFixed(1)}s). Fallback to link.`);
1131
-
const tweetUrl = `https://twitter.com/${twitterUsername}/status/${tweetId}`;
1132
-
if (!text.includes(tweetUrl)) text += `\n\nVideo: ${tweetUrl}`;
1133
-
continue;
0
1134
}
1135
1136
const mp4s = variants
···
1145
console.log(`[${twitterUsername}] 📥 Downloading video: ${videoUrl}`);
1146
updateAppStatus({ message: `Downloading video: ${path.basename(videoUrl)}` });
1147
const { buffer, mimeType } = await downloadMedia(videoUrl);
1148
-
1149
if (buffer.length <= 90 * 1024 * 1024) {
1150
const filename = videoUrl.split('/').pop() || 'video.mp4';
1151
if (dryRun) {
1152
-
console.log(`[${twitterUsername}] 🧪 [DRY RUN] Would upload video: ${filename} (${(buffer.length/1024/1024).toFixed(2)} MB)`);
1153
-
videoBlob = { ref: { toString: () => 'mock-video-blob' }, mimeType: 'video/mp4', size: buffer.length } as any;
0
0
0
0
0
0
1154
} else {
1155
-
updateAppStatus({ message: `Uploading video to Bluesky...` });
1156
-
videoBlob = await uploadVideoToBluesky(agent, buffer, filename);
1157
}
1158
videoAspectRatio = aspectRatio;
1159
console.log(`[${twitterUsername}] ✅ Video upload process complete.`);
1160
break; // Prioritize first video
1161
}
1162
-
1163
-
console.warn(`[${twitterUsername}] ⚠️ Video too large (${(buffer.length / 1024 / 1024).toFixed(2)}MB). Fallback to link.`);
0
0
1164
const tweetUrl = `https://twitter.com/${twitterUsername}/status/${tweetId}`;
1165
if (!text.includes(tweetUrl)) text += `\n\nVideo: ${tweetUrl}`;
1166
} catch (err) {
1167
const errMsg = (err as Error).message;
1168
-
if (errMsg !== "VIDEO_FALLBACK_503") {
1169
-
console.error(`[${twitterUsername}] ❌ Failed video upload flow:`, errMsg);
1170
}
1171
const tweetUrl = `https://twitter.com/${twitterUsername}/status/${tweetId}`;
1172
if (!text.includes(tweetUrl)) text += `\n\nVideo: ${tweetUrl}`;
···
1178
1179
// Cleanup text
1180
for (const link of mediaLinksToRemove) text = text.split(link).join('').trim();
0
0
0
0
0
0
0
0
0
0
1181
text = text.replace(/\n\s*\n/g, '\n\n').trim();
0
1182
1183
// 3. Quoting Logic
1184
let quoteEmbed: { $type: string; record: { uri: string; cid: string } } | null = null;
···
1194
} else {
1195
const quoteUrlEntity = urls.find((u) => u.expanded_url?.includes(quoteId));
1196
const qUrl = quoteUrlEntity?.expanded_url || `https://twitter.com/i/status/${quoteId}`;
1197
-
1198
// Check if it's a self-quote (same user)
1199
-
const isSelfQuote = qUrl.toLowerCase().includes(`twitter.com/${twitterUsername.toLowerCase()}/`) ||
1200
-
qUrl.toLowerCase().includes(`x.com/${twitterUsername.toLowerCase()}/`);
1201
-
0
1202
if (!isSelfQuote) {
1203
externalQuoteUrl = qUrl;
1204
console.log(`[${twitterUsername}] 🔗 Quoted tweet is external: ${externalQuoteUrl}`);
1205
-
1206
// Try to capture screenshot for external QTs if we have space for images
1207
if (images.length < 4 && !videoBlob) {
1208
const ssResult = await captureTweetScreenshot(externalQuoteUrl);
···
1210
try {
1211
let blob: BlobRef;
1212
if (dryRun) {
1213
-
console.log(`[${twitterUsername}] 🧪 [DRY RUN] Would upload screenshot for quote (${(ssResult.buffer.length/1024).toFixed(2)} KB)`);
1214
-
blob = { ref: { toString: () => 'mock-ss-blob' }, mimeType: 'image/png', size: ssResult.buffer.length } as any;
0
0
0
0
0
0
1215
} else {
1216
-
blob = await uploadToBluesky(agent, ssResult.buffer, 'image/png');
1217
}
1218
-
images.push({
1219
-
alt: `Quote Tweet: ${externalQuoteUrl}`,
1220
-
image: blob,
1221
-
aspectRatio: { width: ssResult.width, height: ssResult.height }
1222
});
1223
} catch (e) {
1224
console.warn(`[${twitterUsername}] ⚠️ Failed to upload screenshot blob.`);
···
1229
console.log(`[${twitterUsername}] 🔁 Quoted tweet is a self-quote, skipping link.`);
1230
}
1231
}
1232
-
} else if (images.length === 0 && !videoBlob) {
1233
-
// If no media and no quote, check for external links to embed
1234
-
// We prioritize the LAST link found as it's often the main content
1235
-
const potentialLinks = urls
1236
-
.map(u => u.expanded_url)
1237
-
.filter(u => u && !u.includes('twitter.com') && !u.includes('x.com')) as string[];
1238
-
1239
-
if (potentialLinks.length > 0) {
1240
-
const linkToEmbed = potentialLinks[potentialLinks.length - 1];
1241
-
if (linkToEmbed) {
1242
-
// Optimization: If text is too long, but removing the link makes it fit, do it!
1243
-
// The link will be present in the embed card anyway.
1244
-
if (text.length > 300 && text.includes(linkToEmbed)) {
1245
-
const lengthWithoutLink = text.length - linkToEmbed.length;
1246
-
// Allow some buffer (e.g. whitespace cleanup might save 1-2 chars)
1247
-
if (lengthWithoutLink <= 300) {
1248
-
console.log(`[${twitterUsername}] 📏 Optimizing: Removing link ${linkToEmbed} from text to avoid threading (Card will embed it).`);
1249
-
text = text.replace(linkToEmbed, '').trim();
1250
-
// Clean up potential double punctuation/spaces left behind
1251
-
text = text.replace(/\s\.$/, '.').replace(/\s\s+/g, ' ');
1252
-
}
1253
-
}
1254
1255
-
console.log(`[${twitterUsername}] 🃏 Fetching link card for: ${linkToEmbed}`);
1256
-
linkCard = await fetchEmbedUrlCard(agent, linkToEmbed);
0
0
0
0
0
0
0
0
0
0
0
0
0
1257
}
0
0
0
0
1258
}
0
1259
}
1260
1261
// Only append link for external quotes IF we couldn't natively embed it OR screenshot it
1262
-
const hasScreenshot = images.some(img => img.alt.startsWith('Quote Tweet:'));
1263
if (externalQuoteUrl && !quoteEmbed && !hasScreenshot && !text.includes(externalQuoteUrl)) {
1264
text += `\n\nQT: ${externalQuoteUrl}`;
1265
}
1266
0
0
0
0
0
0
0
1267
// 4. Threading and Posting
1268
const chunks = splitText(text);
1269
console.log(`[${twitterUsername}] 📝 Splitting text into ${chunks.length} chunks.`);
1270
-
1271
let lastPostInfo: ProcessedTweetEntry | null = replyParentInfo;
1272
1273
// We will save the first chunk as the "Root" of this tweet, and the last chunk as the "Tail".
···
1276
1277
for (let i = 0; i < chunks.length; i++) {
1278
let chunk = chunks[i] as string;
1279
-
1280
// Add (i/n) if split
1281
if (chunks.length > 1) {
1282
-
chunk += ` (${i + 1}/${chunks.length})`;
1283
}
1284
1285
console.log(`[${twitterUsername}] 📤 Posting chunk ${i + 1}/${chunks.length}...`);
1286
updateAppStatus({ message: `Posting chunk ${i + 1}/${chunks.length}...` });
1287
-
1288
const rt = new RichText({ text: chunk });
1289
await rt.detectFacets(agent);
1290
const detectedLangs = detectLanguage(chunk);
···
1331
let rootRef: { uri: string; cid: string } | null = null;
1332
1333
if (lastPostInfo?.uri && lastPostInfo?.cid) {
1334
-
// If this is the start of a new tweet (i=0), check if parent has a tail
1335
-
if (i === 0 && lastPostInfo.tail) {
1336
-
parentRef = lastPostInfo.tail;
1337
-
} else {
1338
-
// Otherwise (intra-tweet or parent has no tail), use the main uri/cid (which is the previous post/chunk)
1339
-
parentRef = { uri: lastPostInfo.uri, cid: lastPostInfo.cid };
1340
-
}
1341
-
1342
-
rootRef = lastPostInfo.root || { uri: lastPostInfo.uri, cid: lastPostInfo.cid };
1343
}
1344
1345
if (parentRef && rootRef) {
···
1353
// Retry logic for network/socket errors
1354
let response: any;
1355
let retries = 3;
1356
-
1357
if (dryRun) {
1358
-
console.log(`[${twitterUsername}] 🧪 [DRY RUN] Would post chunk ${i + 1}/${chunks.length}`);
1359
-
if (postRecord.embed) console.log(` - With embed: ${postRecord.embed.$type}`);
1360
-
if (postRecord.reply) console.log(` - As reply to: ${postRecord.reply.parent.uri}`);
1361
-
response = { uri: 'at://did:plc:mock/app.bsky.feed.post/mock', cid: 'mock-cid' };
1362
} else {
1363
-
while (retries > 0) {
1364
-
try {
1365
-
response = await agent.post(postRecord);
1366
-
break;
1367
-
} catch (err: any) {
1368
-
retries--;
1369
-
if (retries === 0) throw err;
1370
-
console.warn(`[${twitterUsername}] ⚠️ Post failed (Socket/Network), retrying in 5s... (${retries} retries left)`);
1371
-
await new Promise(r => setTimeout(r, 5000));
1372
-
}
0
1373
}
0
1374
}
1375
-
1376
const currentPostInfo = {
1377
-
uri: response.uri,
1378
-
cid: response.cid,
1379
-
root: postRecord.reply ? postRecord.reply.root : { uri: response.uri, cid: response.cid },
1380
-
// Text is just the current chunk text
1381
-
text: chunk
1382
};
1383
-
1384
if (i === 0) firstChunkInfo = currentPostInfo;
1385
lastChunkInfo = currentPostInfo;
1386
lastPostInfo = currentPostInfo; // Update for next iteration
1387
1388
console.log(`[${twitterUsername}] ✅ Chunk ${i + 1} posted successfully.`);
1389
-
1390
if (chunks.length > 1) {
1391
await new Promise((r) => setTimeout(r, 3000));
1392
}
···
1395
break;
1396
}
1397
}
1398
-
1399
// Save to DB and Map
1400
if (firstChunkInfo && lastChunkInfo) {
1401
-
const entry: ProcessedTweetEntry = {
1402
-
uri: firstChunkInfo.uri,
1403
-
cid: firstChunkInfo.cid,
1404
-
root: firstChunkInfo.root,
1405
-
tail: { uri: lastChunkInfo.uri, cid: lastChunkInfo.cid }, // Save tail!
1406
-
text: tweetText
1407
-
};
1408
-
1409
-
if (!dryRun) {
1410
-
saveProcessedTweet(twitterUsername, bskyIdentifier, tweetId, entry);
1411
-
localProcessedMap[tweetId] = entry; // Update local map for subsequent replies in this batch
1412
-
}
1413
}
1414
-
1415
// Add a random delay between 5s and 15s to be more human-like
1416
const wait = Math.floor(Math.random() * 10000) + 5000;
1417
console.log(`[${twitterUsername}] 😴 Pacing: Waiting ${wait / 1000}s before next tweet.`);
···
1431
requestId?: string,
1432
): Promise<void> {
1433
const config = getConfig();
1434
-
const mapping = config.mappings.find((m) => m.twitterUsernames.map(u => u.toLowerCase()).includes(twitterUsername.toLowerCase()));
0
0
1435
if (!mapping) {
1436
console.error(`No mapping found for twitter username: ${twitterUsername}`);
1437
return;
···
1439
1440
let agent = await getAgent(mapping);
1441
if (!agent) {
1442
-
if (dryRun) {
1443
-
console.log("⚠️ Could not login to Bluesky, but proceeding with MOCK AGENT for Dry Run.");
1444
-
// biome-ignore lint/suspicious/noExplicitAny: mock agent
1445
-
agent = {
1446
-
post: async (record: any) => ({ uri: 'at://did:plc:mock/app.bsky.feed.post/mock', cid: 'mock-cid' }),
1447
-
uploadBlob: async (data: any) => ({ data: { blob: { ref: { toString: () => 'mock-blob' } } } }),
1448
-
// Add other necessary methods if they are called outside of the already mocked dryRun blocks
1449
-
// But since we mocked the calls inside processTweets for dryRun, we just need the object to exist.
1450
-
session: { did: 'did:plc:mock' },
1451
-
com: { atproto: { repo: { describeRepo: async () => ({ data: {} }) } } }
1452
-
} as any;
1453
-
} else {
1454
-
return;
1455
-
}
1456
}
1457
1458
console.log(`Starting full history import for ${twitterUsername} -> ${mapping.bskyIdentifier}...`);
···
1463
1464
console.log(`Fetching tweets for ${twitterUsername}...`);
1465
updateAppStatus({ message: `Fetching tweets...` });
1466
-
1467
const client = await getTwitterScraper();
1468
if (client) {
1469
-
try {
1470
-
// Use getTweets which reliably fetches user timeline
1471
-
// limit defaults to 15 in function signature, but for history import we might want more.
1472
-
// However, the generator will fetch as much as we ask.
1473
-
const fetchLimit = limit || 100;
1474
-
const generator = client.getTweets(twitterUsername, fetchLimit);
1475
-
1476
-
for await (const scraperTweet of generator) {
1477
-
if (!ignoreCancellation) {
1478
-
const stillPending = getPendingBackfills().some(b => b.id === mapping.id && (!requestId || b.requestId === requestId));
1479
-
if (!stillPending) {
1480
-
console.log(`[${twitterUsername}] 🛑 Backfill cancelled.`);
1481
-
break;
1482
-
}
1483
1484
-
}
1485
-
1486
-
const t = mapScraperTweetToLocalTweet(scraperTweet);
1487
-
const tid = t.id_str || t.id;
1488
-
if (!tid) continue;
1489
-
1490
-
if (!processedTweets[tid] && !seenIds.has(tid)) {
1491
-
allFoundTweets.push(t);
1492
-
seenIds.add(tid);
1493
-
}
1494
-
1495
-
if (allFoundTweets.length >= fetchLimit) break;
1496
}
1497
-
} catch(e) {
1498
-
console.warn("Error during history fetch:", e);
0
0
0
0
0
0
0
0
0
0
1499
}
0
0
0
1500
}
1501
1502
console.log(`Fetch complete. Found ${allFoundTweets.length} new tweets to import.`);
···
1510
const activeTasks = new Map<string, Promise<void>>();
1511
1512
async function runAccountTask(mapping: AccountMapping, backfillRequest?: PendingBackfill, dryRun = false) {
1513
-
if (activeTasks.has(mapping.id)) return; // Already running
1514
1515
-
const task = (async () => {
1516
-
try {
1517
-
const agent = await getAgent(mapping);
1518
-
if (!agent) return;
1519
1520
-
const backfillReq = backfillRequest ?? getPendingBackfills().find(b => b.id === mapping.id);
1521
-
1522
-
if (backfillReq) {
1523
-
const limit = backfillReq.limit || 15;
1524
-
console.log(`[${mapping.bskyIdentifier}] Running backfill for ${mapping.twitterUsernames.length} accounts (limit ${limit})...`);
1525
-
updateAppStatus({
1526
-
state: 'backfilling',
1527
-
currentAccount: mapping.twitterUsernames[0],
1528
-
message: `Starting backfill (limit ${limit})...`,
1529
-
backfillMappingId: mapping.id,
1530
-
backfillRequestId: backfillReq.requestId,
1531
-
});
1532
-
1533
-
for (const twitterUsername of mapping.twitterUsernames) {
1534
-
const stillPending = getPendingBackfills().some(
1535
-
(b) => b.id === mapping.id && b.requestId === backfillReq.requestId,
1536
-
);
1537
-
if (!stillPending) {
1538
-
console.log(`[${mapping.bskyIdentifier}] 🛑 Backfill request replaced; stopping.`);
1539
-
break;
1540
-
}
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1541
1542
-
try {
1543
-
updateAppStatus({
1544
-
state: 'backfilling',
1545
-
currentAccount: twitterUsername,
1546
-
message: `Starting backfill (limit ${limit})...`,
1547
-
backfillMappingId: mapping.id,
1548
-
backfillRequestId: backfillReq.requestId,
1549
-
});
1550
-
await importHistory(twitterUsername, mapping.bskyIdentifier, limit, dryRun, false, backfillReq.requestId);
1551
-
} catch (err) {
1552
-
console.error(`❌ Error backfilling ${twitterUsername}:`, err);
1553
-
}
1554
-
}
1555
-
clearBackfill(mapping.id, backfillReq.requestId);
1556
-
updateAppStatus({
1557
-
state: 'idle',
1558
-
message: `Backfill complete for ${mapping.bskyIdentifier}`,
1559
-
backfillMappingId: undefined,
1560
-
backfillRequestId: undefined,
1561
-
});
1562
-
console.log(`[${mapping.bskyIdentifier}] Backfill complete.`);
1563
-
} else {
1564
-
updateAppStatus({ backfillMappingId: undefined, backfillRequestId: undefined });
1565
1566
-
// Pre-load processed IDs for optimization
1567
-
const processedMap = loadProcessedTweets(mapping.bskyIdentifier);
1568
-
const processedIds = new Set(Object.keys(processedMap));
1569
1570
-
for (const twitterUsername of mapping.twitterUsernames) {
1571
-
try {
1572
-
console.log(`[${twitterUsername}] 🏁 Starting check for new tweets...`);
1573
-
updateAppStatus({
1574
-
state: 'checking',
1575
-
currentAccount: twitterUsername,
1576
-
message: 'Fetching latest tweets...',
1577
-
backfillMappingId: undefined,
1578
-
backfillRequestId: undefined,
1579
-
});
1580
-
1581
-
// Use fetchUserTweets with early stopping optimization
1582
-
// Increase limit slightly since we have early stopping now
1583
-
const tweets = await fetchUserTweets(twitterUsername, 50, processedIds);
1584
-
1585
-
if (!tweets || tweets.length === 0) {
1586
-
console.log(`[${twitterUsername}] ℹ️ No tweets found (or fetch failed).`);
1587
-
continue;
1588
-
}
1589
-
1590
-
console.log(`[${twitterUsername}] 📥 Fetched ${tweets.length} tweets.`);
1591
-
await processTweets(agent, twitterUsername, mapping.bskyIdentifier, tweets, dryRun);
1592
-
} catch (err) {
1593
-
console.error(`❌ Error checking ${twitterUsername}:`, err);
1594
-
}
1595
-
}
1596
}
1597
-
} catch (err) {
1598
-
console.error(`Error processing mapping ${mapping.bskyIdentifier}:`, err);
1599
-
} finally {
1600
-
activeTasks.delete(mapping.id);
0
0
1601
}
1602
-
})();
0
0
0
0
0
0
1603
1604
-
activeTasks.set(mapping.id, task);
1605
-
return task; // Return task promise for await in main loop
1606
}
1607
0
1608
import {
1609
-
startServer,
1610
-
updateLastCheckTime,
1611
-
getPendingBackfills,
1612
clearBackfill,
1613
getNextCheckTime,
0
0
1614
updateAppStatus,
0
1615
} from './server.js';
1616
import type { PendingBackfill } from './server.js';
1617
-
import { AccountMapping } from './config-manager.js';
1618
1619
async function main(): Promise<void> {
1620
const program = new Command();
···
1655
console.error('Twitter credentials not set. Cannot import history.');
1656
process.exit(1);
1657
}
1658
-
const mapping = config.mappings.find(m => m.twitterUsernames.map(u => u.toLowerCase()).includes(options.username.toLowerCase()));
0
0
1659
if (!mapping) {
1660
console.error(`No mapping found for ${options.username}`);
1661
process.exit(1);
···
1675
// Concurrency limit for processing accounts
1676
const runLimit = pLimit(3);
1677
1678
-
const findMappingById = (mappings: AccountMapping[], id: string) =>
1679
-
mappings.find((mapping) => mapping.id === id);
1680
1681
// Main loop
1682
while (true) {
···
1719
for (const mapping of config.mappings) {
1720
if (!mapping.enabled) continue;
1721
1722
-
tasks.push(runLimit(async () => {
1723
-
await runAccountTask(mapping, undefined, options.dryRun);
1724
-
}));
0
0
1725
}
1726
1727
if (tasks.length > 0) {
···
2
import fs from 'node:fs';
3
import path from 'node:path';
4
import { fileURLToPath } from 'node:url';
5
+
import { type BskyAgent, RichText } from '@atproto/api';
6
import type { BlobRef } from '@atproto/api';
7
import { Scraper } from '@the-convocation/twitter-scraper';
8
import type { Tweet as ScraperTweet } from '@the-convocation/twitter-scraper';
9
import axios from 'axios';
10
+
import * as cheerio from 'cheerio';
11
import { Command } from 'commander';
12
import * as francModule from 'franc-min';
13
import iso6391 from 'iso-639-1';
14
import puppeteer from 'puppeteer-core';
0
15
import sharp from 'sharp';
16
import { generateAltText } from './ai-manager.js';
17
···
21
const __filename = fileURLToPath(import.meta.url);
22
const __dirname = path.dirname(__filename);
23
24
+
// ============================================================================
25
// Type Definitions
26
+
// ============================================================================
27
28
interface ProcessedTweetEntry {
29
uri?: string;
···
44
expanded_url?: string;
45
}
46
47
+
interface CardImageValue {
48
+
url?: string;
49
+
width?: number;
50
+
height?: number;
51
+
alt?: string;
52
+
}
53
+
54
+
interface CardBindingValue {
55
+
type?: string;
56
+
string_value?: string;
57
+
image_value?: CardImageValue;
58
+
}
59
+
60
+
interface CardBindingEntry {
61
+
key?: string;
62
+
value?: CardBindingValue;
63
+
}
64
+
65
+
type CardBindingValues = Record<string, CardBindingValue> | CardBindingEntry[];
66
+
67
+
interface TweetCard {
68
+
name?: string;
69
+
binding_values?: CardBindingValues;
70
+
url?: string;
71
+
}
72
+
73
interface MediaSize {
74
w: number;
75
h: number;
···
104
sizes?: MediaSizes;
105
original_info?: OriginalInfo;
106
video_info?: VideoInfo;
107
+
source?: 'tweet' | 'card';
108
}
109
110
interface TweetEntities {
···
132
screen_name?: string;
133
id_str?: string;
134
};
135
+
card?: TweetCard | null;
136
+
permanentUrl?: string;
137
}
138
139
interface AspectRatio {
···
149
150
import { dbService } from './db.js';
151
152
+
// ============================================================================
153
// State Management
154
+
// ============================================================================
155
156
const PROCESSED_DIR = path.join(__dirname, '..', 'processed');
157
158
async function migrateJsonToSqlite() {
159
if (!fs.existsSync(PROCESSED_DIR)) return;
160
+
161
+
const files = fs.readdirSync(PROCESSED_DIR).filter((f) => f.endsWith('.json'));
162
if (files.length === 0) return;
163
164
console.log(`📦 Found ${files.length} legacy cache files. Migrating to SQLite...`);
165
const config = getConfig();
166
+
167
for (const file of files) {
168
const username = file.replace('.json', '').toLowerCase();
169
// Try to find a matching bskyIdentifier from config
170
+
const mapping = config.mappings.find((m) => m.twitterUsernames.map((u) => u.toLowerCase()).includes(username));
171
const bskyIdentifier = mapping?.bskyIdentifier || 'unknown';
172
173
try {
174
const filePath = path.join(PROCESSED_DIR, file);
175
const data = JSON.parse(fs.readFileSync(filePath, 'utf8')) as ProcessedTweetsMap;
176
+
177
for (const [twitterId, entry] of Object.entries(data)) {
178
dbService.saveTweet({
179
twitter_id: twitterId,
···
183
bsky_cid: entry.cid,
184
bsky_root_uri: entry.root?.uri,
185
bsky_root_cid: entry.root?.cid,
186
+
status: entry.migrated ? 'migrated' : entry.skipped ? 'skipped' : 'failed',
187
});
188
}
189
// Move file to backup
···
201
dbService.repairUnknownIdentifiers(username, mapping.bskyIdentifier);
202
}
203
}
204
+
205
console.log('✅ Migration complete.');
206
}
207
···
209
return dbService.getTweetsByBskyIdentifier(bskyIdentifier);
210
}
211
212
+
function saveProcessedTweet(
213
+
twitterUsername: string,
214
+
bskyIdentifier: string,
215
+
twitterId: string,
216
+
entry: ProcessedTweetEntry,
217
+
): void {
218
dbService.saveTweet({
219
twitter_id: twitterId,
220
twitter_username: twitterUsername.toLowerCase(),
···
226
bsky_root_cid: entry.root?.cid,
227
bsky_tail_uri: entry.tail?.uri,
228
bsky_tail_cid: entry.tail?.cid,
229
+
status: entry.migrated || (entry.uri && entry.cid) ? 'migrated' : entry.skipped ? 'skipped' : 'failed',
230
});
231
}
232
233
+
// ============================================================================
234
// Custom Twitter Client
235
+
// ============================================================================
236
237
let scraper: Scraper | null = null;
238
let currentTwitterCookies = { authToken: '', ct0: '' };
···
250
}
251
252
if (!authToken || !ct0) return null;
253
+
254
// Re-initialize if config changed, not yet initialized, or forced reset
255
+
if (!scraper || forceReset || currentTwitterCookies.authToken !== authToken || currentTwitterCookies.ct0 !== ct0) {
0
0
0
0
0
256
console.log(`🔄 Initializing Twitter scraper with ${useBackupCredentials ? 'BACKUP' : 'PRIMARY'} credentials...`);
257
scraper = new Scraper();
258
+
await scraper.setCookies([`auth_token=${authToken}`, `ct0=${ct0}`]);
0
0
0
259
260
+
currentTwitterCookies = {
261
+
authToken: authToken,
262
+
ct0: ct0,
263
};
264
}
265
return scraper;
···
273
await getTwitterScraper(true);
274
return true;
275
}
276
+
console.log('⚠️ No backup credentials available to switch to.');
277
return false;
278
}
279
280
function mapScraperTweetToLocalTweet(scraperTweet: ScraperTweet): Tweet {
281
+
const raw = scraperTweet.__raw_UNSTABLE;
282
+
if (!raw) {
283
+
// Fallback if raw data is missing (shouldn't happen for timeline tweets usually)
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
284
return {
285
+
id: scraperTweet.id,
286
+
id_str: scraperTweet.id,
287
+
text: scraperTweet.text,
288
+
full_text: scraperTweet.text,
0
289
isRetweet: scraperTweet.isRetweet,
290
+
// Construct minimal entities from parsed data
291
+
entities: {
292
+
urls: scraperTweet.urls.map((url: string) => ({ url, expanded_url: url })),
293
+
media: scraperTweet.photos.map((p: any) => ({
294
+
url: p.url,
295
+
expanded_url: p.url,
296
+
media_url_https: p.url,
297
+
type: 'photo',
298
+
ext_alt_text: p.alt_text,
299
+
})),
0
0
0
300
},
301
+
created_at: scraperTweet.timeParsed?.toUTCString(),
302
+
permanentUrl: scraperTweet.permanentUrl,
303
};
304
+
}
305
+
306
+
return {
307
+
id: raw.id_str,
308
+
id_str: raw.id_str,
309
+
text: raw.full_text,
310
+
full_text: raw.full_text,
311
+
created_at: raw.created_at,
312
+
isRetweet: scraperTweet.isRetweet,
313
+
// biome-ignore lint/suspicious/noExplicitAny: raw types match compatible structure
314
+
entities: raw.entities as any,
315
+
// biome-ignore lint/suspicious/noExplicitAny: raw types match compatible structure
316
+
extended_entities: raw.extended_entities as any,
317
+
quoted_status_id_str: raw.quoted_status_id_str,
318
+
retweeted_status_id_str: raw.retweeted_status_id_str,
319
+
is_quote_status: !!raw.quoted_status_id_str,
320
+
in_reply_to_status_id_str: raw.in_reply_to_status_id_str,
321
+
// biome-ignore lint/suspicious/noExplicitAny: missing in LegacyTweetRaw type
322
+
in_reply_to_user_id_str: (raw as any).in_reply_to_user_id_str,
323
+
// biome-ignore lint/suspicious/noExplicitAny: card comes from raw tweet
324
+
card: (raw as any).card,
325
+
permanentUrl: scraperTweet.permanentUrl,
326
+
user: {
327
+
screen_name: scraperTweet.username,
328
+
id_str: scraperTweet.userId,
329
+
},
330
+
};
331
}
332
333
+
// ============================================================================
334
// Helper Functions
335
+
// ============================================================================
336
+
337
+
function normalizeCardBindings(bindingValues?: CardBindingValues): Record<string, CardBindingValue> {
338
+
if (!bindingValues) return {};
339
+
if (Array.isArray(bindingValues)) {
340
+
return bindingValues.reduce(
341
+
(acc, entry) => {
342
+
if (entry?.key && entry.value) acc[entry.key] = entry.value;
343
+
return acc;
344
+
},
345
+
{} as Record<string, CardBindingValue>,
346
+
);
347
+
}
348
+
return bindingValues as Record<string, CardBindingValue>;
349
+
}
350
+
351
+
function isLikelyUrl(value?: string): value is string {
352
+
if (!value) return false;
353
+
return /^https?:\/\//i.test(value);
354
+
}
355
+
356
+
function extractCardImageUrl(bindingValues: CardBindingValues, preferredKeys: string[]): string | undefined {
357
+
const normalized = normalizeCardBindings(bindingValues);
358
+
for (const key of preferredKeys) {
359
+
const value = normalized[key];
360
+
const imageUrl = value?.image_value?.url;
361
+
if (imageUrl) return imageUrl;
362
+
}
363
+
const fallbackValue = Object.values(normalized).find((value) => value?.image_value?.url);
364
+
return fallbackValue?.image_value?.url;
365
+
}
366
+
367
+
function extractCardLink(bindingValues: CardBindingValues, preferredKeys: string[]): string | undefined {
368
+
const normalized = normalizeCardBindings(bindingValues);
369
+
for (const key of preferredKeys) {
370
+
const value = normalized[key];
371
+
const link = value?.string_value;
372
+
if (isLikelyUrl(link)) return link;
373
+
}
374
+
const fallbackValue = Object.values(normalized).find((value) => isLikelyUrl(value?.string_value));
375
+
return fallbackValue?.string_value;
376
+
}
377
+
378
+
function extractCardTitle(bindingValues: CardBindingValues, preferredKeys: string[]): string | undefined {
379
+
const normalized = normalizeCardBindings(bindingValues);
380
+
for (const key of preferredKeys) {
381
+
const value = normalized[key];
382
+
const title = value?.string_value;
383
+
if (title && !isLikelyUrl(title)) return title;
384
+
}
385
+
const fallbackValue = Object.values(normalized).find(
386
+
(value) => value?.string_value && !isLikelyUrl(value?.string_value),
387
+
);
388
+
return fallbackValue?.string_value;
389
+
}
390
+
391
+
function extractCardAlt(bindingValues: CardBindingValues): string | undefined {
392
+
const normalized = normalizeCardBindings(bindingValues);
393
+
const altValue = Object.values(normalized).find((value) => value?.image_value?.alt);
394
+
return altValue?.image_value?.alt;
395
+
}
396
+
397
+
function appendCallToAction(text: string, link?: string, label = 'Sponsored') {
398
+
if (!link) return text;
399
+
if (text.includes(link)) return text;
400
+
return `${text}\n\n${label}: ${link}`.trim();
401
+
}
402
+
403
+
function detectCardMedia(tweet: Tweet): { imageUrls: string[]; link?: string; title?: string; alt?: string } {
404
+
if (!tweet.card?.binding_values) return { imageUrls: [] };
405
+
const bindings = tweet.card.binding_values;
406
+
407
+
const imageUrls: string[] = [];
408
+
const preferredImageKeys = [
409
+
'photo_image_full_size',
410
+
'photo_image_full_size_original',
411
+
'thumbnail_image',
412
+
'image',
413
+
'thumbnail',
414
+
'summary_photo_image',
415
+
'player_image',
416
+
];
417
+
const preferredLinkKeys = ['site', 'destination', 'landing_url', 'cta_link', 'card_url', 'url'];
418
+
const preferredTitleKeys = ['title', 'summary', 'card_title'];
419
+
420
+
const primaryImage = extractCardImageUrl(bindings, preferredImageKeys);
421
+
if (primaryImage) imageUrls.push(primaryImage);
422
+
423
+
const imageKeys = normalizeCardBindings(bindings);
424
+
Object.values(imageKeys).forEach((value) => {
425
+
const url = value?.image_value?.url;
426
+
if (url && !imageUrls.includes(url)) imageUrls.push(url);
427
+
});
428
+
429
+
const link = extractCardLink(bindings, preferredLinkKeys);
430
+
const title = extractCardTitle(bindings, preferredTitleKeys);
431
+
const alt = extractCardAlt(bindings);
432
+
433
+
return { imageUrls, link, title, alt };
434
+
}
435
+
436
+
function buildCardMediaEntities(tweet: Tweet): { media: MediaEntity[]; link?: string } {
437
+
const cardData = detectCardMedia(tweet);
438
+
if (cardData.imageUrls.length === 0) return { media: [] };
439
+
440
+
const media = cardData.imageUrls.slice(0, 4).map((url) => ({
441
+
media_url_https: url,
442
+
type: 'photo' as const,
443
+
ext_alt_text: cardData.alt || cardData.title || 'Sponsored image',
444
+
source: 'card' as const,
445
+
}));
446
+
447
+
return { media, link: cardData.link };
448
+
}
449
+
450
+
function ensureUrlEntity(entities: TweetEntities | undefined, link?: string) {
451
+
if (!link) return;
452
+
if (!entities) return;
453
+
const urls = entities.urls || [];
454
+
if (!urls.some((url) => url.expanded_url === link || url.url === link)) {
455
+
urls.push({ url: link, expanded_url: link });
456
+
entities.urls = urls;
457
+
}
458
+
}
459
+
460
+
function detectSponsoredCard(tweet: Tweet): boolean {
461
+
if (!tweet.card?.binding_values) return false;
462
+
const cardName = tweet.card.name?.toLowerCase() || '';
463
+
const cardMedia = detectCardMedia(tweet);
464
+
const hasMultipleImages = cardMedia.imageUrls.length > 1;
465
+
const promoKeywords = ['promo', 'unified', 'carousel', 'collection', 'amplify'];
466
+
const hasPromoName = promoKeywords.some((keyword) => cardName.includes(keyword));
467
+
return hasMultipleImages || hasPromoName;
468
+
}
469
+
470
+
function mergeMediaEntities(primary: MediaEntity[], secondary: MediaEntity[], limit = 4): MediaEntity[] {
471
+
const merged: MediaEntity[] = [];
472
+
const seen = new Set<string>();
473
+
const ordered = [
474
+
...primary.filter((media) => media?.source !== 'card'),
475
+
...primary.filter((media) => media?.source === 'card'),
476
+
...secondary.filter((media) => media?.source !== 'card'),
477
+
...secondary.filter((media) => media?.source === 'card'),
478
+
];
479
+
480
+
for (const media of ordered) {
481
+
if (!media?.media_url_https) continue;
482
+
if (seen.has(media.media_url_https)) continue;
483
+
merged.push(media);
484
+
seen.add(media.media_url_https);
485
+
if (merged.length >= limit) break;
486
+
}
487
+
488
+
return merged;
489
+
}
490
+
491
+
function detectCarouselLinks(tweet: Tweet): string[] {
492
+
if (!tweet.card?.binding_values) return [];
493
+
const bindings = normalizeCardBindings(tweet.card.binding_values);
494
+
const links = Object.values(bindings)
495
+
.map((value) => value?.string_value)
496
+
.filter((value): value is string => isLikelyUrl(value));
497
+
return [...new Set(links)];
498
+
}
499
+
500
+
function mergeUrlEntities(entities: TweetEntities | undefined, links: string[]) {
501
+
if (!entities || links.length === 0) return;
502
+
const urls = entities.urls || [];
503
+
links.forEach((link) => {
504
+
if (!urls.some((url) => url.expanded_url === link || url.url === link)) {
505
+
urls.push({ url: link, expanded_url: link });
506
+
}
507
+
});
508
+
entities.urls = urls;
509
+
}
510
+
511
+
function injectCardMedia(tweet: Tweet) {
512
+
if (!tweet.card?.binding_values) return;
513
+
const cardMedia = buildCardMediaEntities(tweet);
514
+
if (cardMedia.media.length === 0) return;
515
+
516
+
const existingMedia = tweet.extended_entities?.media || tweet.entities?.media || [];
517
+
const mergedMedia = mergeMediaEntities(existingMedia, cardMedia.media);
518
+
519
+
if (!tweet.extended_entities) tweet.extended_entities = {};
520
+
tweet.extended_entities.media = mergedMedia;
521
+
if (!tweet.entities) tweet.entities = {};
522
+
if (!tweet.entities.media) tweet.entities.media = mergedMedia;
523
+
524
+
if (cardMedia.link) {
525
+
ensureUrlEntity(tweet.entities, cardMedia.link);
526
+
}
527
+
528
+
const carouselLinks = detectCarouselLinks(tweet);
529
+
mergeUrlEntities(tweet.entities, carouselLinks);
530
+
}
531
+
532
+
function ensureSponsoredLinks(text: string, tweet: Tweet): string {
533
+
if (!tweet.card?.binding_values) return text;
534
+
const carouselLinks = detectCarouselLinks(tweet);
535
+
const cardLink = detectCardMedia(tweet).link;
536
+
const links = [...new Set([cardLink, ...carouselLinks].filter(Boolean))] as string[];
537
+
if (links.length === 0) return text;
538
+
539
+
const appendedLinks = links.slice(0, 2).map((link) => `Link: ${link}`);
540
+
const updatedText = `${text}\n\n${appendedLinks.join('\n')}`.trim();
541
+
return updatedText;
542
+
}
543
+
544
+
function addTextFallbacks(text: string): string {
545
+
return text.replace(/\s+$/g, '').trim();
546
+
}
547
+
548
+
async function fetchSyndicationMedia(tweetUrl: string): Promise<{ images: string[] }> {
549
+
try {
550
+
const normalized = tweetUrl.replace('twitter.com', 'x.com');
551
+
const res = await axios.get('https://publish.twitter.com/oembed', {
552
+
params: { url: normalized },
553
+
headers: { 'User-Agent': 'Mozilla/5.0' },
554
+
});
555
+
const html = res.data?.html as string | undefined;
556
+
if (!html) return { images: [] };
557
+
558
+
const match = html.match(/status\/(\d+)/);
559
+
const tweetId = match?.[1];
560
+
if (!tweetId) return { images: [] };
561
+
562
+
const syndicationUrl = `https://cdn.syndication.twimg.com/tweet-result?id=${tweetId}`;
563
+
const syndication = await axios.get(syndicationUrl, {
564
+
headers: { 'User-Agent': 'Mozilla/5.0', Accept: 'application/json' },
565
+
});
566
+
const data = syndication.data as Record<string, unknown>;
567
+
const images = (data?.photos as { url?: string }[] | undefined)
568
+
?.map((photo) => photo.url)
569
+
.filter(Boolean) as string[];
570
+
return { images: images || [] };
571
+
} catch (err) {
572
+
return { images: [] };
573
+
}
574
+
}
575
+
576
+
function injectSyndicationMedia(tweet: Tweet, syndication: { images: string[] }) {
577
+
if (syndication.images.length === 0) return;
578
+
const media = syndication.images.slice(0, 4).map((url) => ({
579
+
media_url_https: url,
580
+
type: 'photo' as const,
581
+
ext_alt_text: 'Image from Twitter',
582
+
source: 'card' as const,
583
+
}));
584
+
585
+
const existingMedia = tweet.extended_entities?.media || tweet.entities?.media || [];
586
+
const mergedMedia = mergeMediaEntities(existingMedia, media);
587
+
588
+
if (!tweet.extended_entities) tweet.extended_entities = {};
589
+
tweet.extended_entities.media = mergedMedia;
590
+
if (!tweet.entities) tweet.entities = {};
591
+
if (!tweet.entities.media) tweet.entities.media = mergedMedia;
592
+
}
593
594
function detectLanguage(text: string): string[] {
595
if (!text || text.trim().length === 0) return ['en'];
···
622
return (response.request as any)?.res?.responseUrl || shortUrl;
623
} catch (e: any) {
624
if (e.code === 'ERR_FR_TOO_MANY_REDIRECTS' || e.response?.status === 403 || e.response?.status === 401) {
625
+
// Silent fallback for common expansion issues (redirect loops, login walls)
626
+
return shortUrl;
627
}
628
return shortUrl;
629
}
···
659
const isGif = mimeType === 'image/gif';
660
const isAnimation = isGif || isWebp;
661
662
+
if (
663
+
(buffer.length > MAX_SIZE && (mimeType.startsWith('image/') || mimeType === 'application/octet-stream')) ||
664
+
(isPng && buffer.length > MAX_SIZE)
665
+
) {
666
console.log(`[UPLOAD] ⚖️ Image too large (${(buffer.length / 1024).toFixed(2)} KB). Optimizing...`);
667
try {
668
let image = sharp(buffer);
···
676
while (currentBuffer.length > MAX_SIZE && attempts < 5) {
677
attempts++;
678
console.log(`[UPLOAD] 📉 Compression attempt ${attempts}: Width ${width}, Quality ${quality}...`);
679
+
680
if (isAnimation) {
681
+
// For animations (GIF/WebP), we can only do so much without losing frames
682
+
// Try to convert to WebP if it's a GIF, or optimize WebP
683
+
image = sharp(buffer, { animated: true });
684
+
if (isGif) {
685
+
// Convert GIF to WebP for better compression
686
+
image = image.webp({ quality: Math.max(quality, 50), effort: 6 });
687
+
finalMimeType = 'image/webp';
688
+
} else {
689
+
image = image.webp({ quality: Math.max(quality, 50), effort: 6 });
690
+
}
691
+
// Resize if really big
692
+
if (metadata.width && metadata.width > 800) {
693
+
image = image.resize({ width: 800, withoutEnlargement: true });
694
+
}
695
} else {
696
+
// Static images
697
+
if (width > 1600) width = 1600;
698
+
else if (attempts > 1) width = Math.floor(width * 0.8);
699
+
700
+
quality = Math.max(50, quality - 10);
701
+
702
+
image = sharp(buffer).resize({ width, withoutEnlargement: true }).jpeg({ quality, mozjpeg: true });
703
+
704
+
finalMimeType = 'image/jpeg';
0
0
705
}
706
+
707
currentBuffer = await image.toBuffer();
708
if (currentBuffer.length <= MAX_SIZE) {
709
+
finalBuffer = currentBuffer;
710
+
console.log(`[UPLOAD] ✅ Optimized to ${(finalBuffer.length / 1024).toFixed(2)} KB`);
711
+
break;
712
}
713
}
714
+
715
if (finalBuffer.length > MAX_SIZE) {
716
+
console.warn(
717
+
`[UPLOAD] ⚠️ Could not compress below limit. Current: ${(finalBuffer.length / 1024).toFixed(2)} KB. Upload might fail.`,
718
+
);
719
}
0
720
} catch (err) {
721
console.warn(`[UPLOAD] ⚠️ Optimization failed, attempting original upload:`, (err as Error).message);
722
finalBuffer = buffer;
···
744
'C:\\Program Files (x86)\\Google\\Chrome\\Application\\chrome.exe',
745
];
746
747
+
const executablePath = browserPaths.find((p) => fs.existsSync(p));
748
+
749
if (!executablePath) {
750
console.warn(`[SCREENSHOT] ⏩ Skipping screenshot (no Chrome/Chromium found at common paths).`);
751
return null;
···
789
`;
790
791
await page.setContent(html, { waitUntil: 'networkidle0' });
792
+
793
// Wait for the twitter iframe to load and render
794
try {
795
await page.waitForSelector('iframe', { timeout: 10000 });
796
// Small extra wait for images inside iframe
797
+
await new Promise((r) => setTimeout(r, 2000));
798
} catch (e) {
799
console.warn(`[SCREENSHOT] ⚠️ Timeout waiting for tweet iframe, taking screenshot anyway.`);
800
}
···
804
const box = await element.boundingBox();
805
const buffer = await element.screenshot({ type: 'png', omitBackground: true });
806
if (box) {
807
+
console.log(
808
+
`[SCREENSHOT] ✅ Captured successfully (${(buffer.length / 1024).toFixed(2)} KB) - ${Math.round(box.width)}x${Math.round(box.height)}`,
809
+
);
810
+
return { buffer: buffer as Buffer, width: Math.round(box.width), height: Math.round(box.height) };
811
}
812
}
813
} catch (err) {
···
825
826
while (!blob) {
827
attempts++;
828
+
const statusUrl = new URL('https://video.bsky.app/xrpc/app.bsky.video.getJobStatus');
829
+
statusUrl.searchParams.append('jobId', jobId);
830
831
const statusResponse = await fetch(statusUrl);
832
if (!statusResponse.ok) {
···
844
if (statusData.jobStatus.blob) {
845
blob = statusData.jobStatus.blob;
846
console.log(`[VIDEO] 🎉 Video processing complete! Blob ref obtained.`);
847
+
} else if (state === 'JOB_STATE_FAILED') {
848
+
throw new Error(`Video processing failed: ${statusData.jobStatus.error || 'Unknown error'}`);
849
} else {
850
// Wait before next poll
851
await new Promise((resolve) => setTimeout(resolve, 5000));
···
853
854
if (attempts > 60) {
855
// ~5 minute timeout
856
+
throw new Error('Video processing timed out after 5 minutes.');
857
}
858
}
859
return blob!;
···
863
try {
864
const response = await axios.get(url, {
865
headers: {
866
+
'User-Agent':
867
+
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
868
+
Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
869
'Accept-Language': 'en-US,en;q=0.9',
870
},
871
timeout: 10000,
872
maxRedirects: 5,
873
});
874
+
875
const $ = cheerio.load(response.data);
876
const title = $('meta[property="og:title"]').attr('content') || $('title').text() || '';
877
+
const description =
878
+
$('meta[property="og:description"]').attr('content') || $('meta[name="description"]').attr('content') || '';
879
let thumbBlob: BlobRef | undefined;
880
881
let imageUrl = $('meta[property="og:image"]').attr('content');
882
if (imageUrl) {
883
+
if (!imageUrl.startsWith('http')) {
884
+
const baseUrl = new URL(url);
885
+
imageUrl = new URL(imageUrl, baseUrl.origin).toString();
886
+
}
887
+
try {
888
+
const { buffer, mimeType } = await downloadMedia(imageUrl);
889
+
thumbBlob = await uploadToBluesky(agent, buffer, mimeType);
890
+
} catch (e) {
891
+
// SIlently fail thumbnail upload
892
+
}
893
}
894
895
if (!title && !description) return null;
896
897
const external: any = {
898
+
uri: url,
899
+
title: title || url,
900
+
description: description,
901
};
902
903
if (thumbBlob) {
904
+
external.thumb = thumbBlob;
905
}
906
907
return {
908
+
$type: 'app.bsky.embed.external',
909
+
external,
910
};
0
911
} catch (err: any) {
912
if (err.code === 'ERR_FR_TOO_MANY_REDIRECTS') {
913
+
// Ignore redirect loops
914
+
return null;
915
}
916
console.warn(`Failed to fetch embed card for ${url}:`, err.message || err);
917
return null;
···
919
}
920
921
async function uploadVideoToBluesky(agent: BskyAgent, buffer: Buffer, filename: string): Promise<BlobRef> {
922
+
const sanitizedFilename = filename.split('?')[0] || 'video.mp4';
923
console.log(
924
`[VIDEO] 🟢 Starting upload process for ${sanitizedFilename} (${(buffer.length / 1024 / 1024).toFixed(2)} MB)`,
925
);
···
932
933
// didDoc might be present in repoDesc
934
const pdsService = (repoDesc as any).didDoc?.service?.find(
935
+
(s: any) => s.id === '#atproto_pds' || s.type === 'AtProtoPds',
936
);
937
const pdsUrl = pdsService?.serviceEndpoint;
938
+
const pdsHost = pdsUrl ? new URL(pdsUrl).host : 'bsky.social';
939
940
console.log(`[VIDEO] 🌐 PDS Host detected: ${pdsHost}`);
941
console.log(`[VIDEO] 🔑 Requesting service auth token for audience: did:web:${pdsHost}...`);
942
943
const { data: serviceAuth } = await agent.com.atproto.server.getServiceAuth({
944
aud: `did:web:${pdsHost}`,
945
+
lxm: 'com.atproto.repo.uploadBlob',
946
exp: Math.floor(Date.now() / 1000) + 60 * 30,
947
});
948
console.log(`[VIDEO] ✅ Service auth token obtained.`);
···
950
const token = serviceAuth.token;
951
952
// 2. Upload to Video Service
953
+
const uploadUrl = new URL('https://video.bsky.app/xrpc/app.bsky.video.uploadVideo');
954
+
uploadUrl.searchParams.append('did', agent.session!.did!);
955
+
uploadUrl.searchParams.append('name', sanitizedFilename);
956
957
console.log(`[VIDEO] 📤 Uploading to ${uploadUrl.href}...`);
958
const uploadResponse = await fetch(uploadUrl, {
959
+
method: 'POST',
960
headers: {
961
Authorization: `Bearer ${token}`,
962
+
'Content-Type': 'video/mp4',
963
},
964
body: new Blob([new Uint8Array(buffer)]),
965
});
···
970
// Handle specific error cases
971
try {
972
const errorJson = JSON.parse(errorText);
973
+
974
// Handle server overload gracefully
975
+
if (
976
+
uploadResponse.status === 503 ||
977
+
errorJson.error === 'Server does not have enough capacity to handle uploads'
978
+
) {
979
+
console.warn(`[VIDEO] ⚠️ Server overloaded (503). Skipping video upload and falling back to link.`);
980
+
throw new Error('VIDEO_FALLBACK_503');
981
}
982
983
+
if (errorJson.error === 'already_exists' && errorJson.jobId) {
984
console.log(`[VIDEO] ♻️ Video already exists. Resuming with Job ID: ${errorJson.jobId}`);
985
return await pollForVideoProcessing(agent, errorJson.jobId);
986
}
987
+
if (
988
+
errorJson.error === 'unconfirmed_email' ||
989
+
(errorJson.jobStatus && errorJson.jobStatus.error === 'unconfirmed_email')
990
+
) {
991
+
console.error(
992
+
`[VIDEO] 🛑 BLUESKY ERROR: Your email is unconfirmed. You MUST verify your email on Bluesky to upload videos.`,
993
+
);
994
+
throw new Error('Bluesky Email Unconfirmed - Video Upload Rejected');
995
}
996
} catch (e) {
997
+
if ((e as Error).message === 'VIDEO_FALLBACK_503') throw e;
998
+
// Not JSON or missing fields, proceed with throwing original error
999
}
1000
+
1001
console.error(`[VIDEO] ❌ Server responded with ${uploadResponse.status}: ${errorText}`);
1002
throw new Error(`Video upload failed: ${uploadResponse.status} ${errorText}`);
1003
}
···
1040
// 4. Force split
1041
1042
let splitIndex = -1;
1043
+
1044
// Check paragraphs
1045
let checkIndex = remaining.lastIndexOf('\n\n', effectiveLimit);
1046
if (checkIndex !== -1) splitIndex = checkIndex;
1047
1048
// Check sentences
1049
if (splitIndex === -1) {
1050
+
// Look for punctuation followed by space
1051
+
const sentenceMatches = Array.from(remaining.substring(0, effectiveLimit).matchAll(/[.!?]\s/g));
1052
+
if (sentenceMatches.length > 0) {
1053
+
const lastMatch = sentenceMatches[sentenceMatches.length - 1];
1054
+
if (lastMatch && lastMatch.index !== undefined) {
1055
+
splitIndex = lastMatch.index + 1; // Include punctuation
0
1056
}
1057
+
}
1058
}
1059
1060
// Check spaces
1061
if (splitIndex === -1) {
1062
+
checkIndex = remaining.lastIndexOf(' ', effectiveLimit);
1063
+
if (checkIndex !== -1) splitIndex = checkIndex;
1064
}
1065
1066
// Force split if no good break point found
1067
if (splitIndex === -1) {
1068
+
splitIndex = effectiveLimit;
1069
}
1070
1071
chunks.push(remaining.substring(0, splitIndex).trim());
···
1114
async function fetchUserTweets(username: string, limit: number, processedIds?: Set<string>): Promise<Tweet[]> {
1115
const client = await getTwitterScraper();
1116
if (!client) return [];
1117
+
1118
let retries = 3;
1119
while (retries > 0) {
1120
try {
1121
const tweets: Tweet[] = [];
1122
const generator = client.getTweets(username, limit);
1123
let consecutiveProcessedCount = 0;
1124
+
1125
for await (const t of generator) {
1126
const tweet = mapScraperTweetToLocalTweet(t);
1127
const tweetId = tweet.id_str || tweet.id;
1128
+
1129
// Early stopping logic: if we see 3 consecutive tweets we've already processed, stop.
1130
// This assumes timeline order (mostly true).
1131
if (processedIds && tweetId && processedIds.has(tweetId)) {
1132
+
consecutiveProcessedCount++;
1133
+
if (consecutiveProcessedCount >= 3) {
1134
+
console.log(`[${username}] 🛑 Found 3 consecutive processed tweets. Stopping fetch early.`);
1135
+
break;
1136
+
}
1137
} else {
1138
+
consecutiveProcessedCount = 0;
1139
}
1140
1141
tweets.push(tweet);
···
1144
return tweets;
1145
} catch (e: any) {
1146
retries--;
1147
+
const isRetryable =
1148
+
e.message?.includes('ServiceUnavailable') ||
1149
+
e.message?.includes('Timeout') ||
1150
+
e.message?.includes('429') ||
1151
+
e.message?.includes('401');
1152
+
1153
// Check for Twitter Internal Server Error (often returns 400 with specific body)
1154
if (e?.response?.status === 400 && JSON.stringify(e?.response?.data || {}).includes('InternalServerError')) {
1155
+
console.warn(`⚠️ Twitter Internal Server Error (Transient) for ${username}.`);
1156
+
// Treat as retryable
1157
+
if (retries > 0) {
1158
+
await new Promise((r) => setTimeout(r, 5000));
1159
+
continue;
1160
+
}
1161
}
1162
1163
if (isRetryable) {
1164
console.warn(`⚠️ Error fetching tweets for ${username} (${e.message}).`);
1165
+
1166
// Attempt credential switch if we have backups
1167
if (await switchCredentials()) {
1168
+
console.log(`🔄 Retrying with new credentials...`);
1169
+
continue; // Retry loop with new credentials
1170
}
1171
1172
if (retries > 0) {
1173
+
console.log(`Waiting 5s before retry...`);
1174
+
await new Promise((r) => setTimeout(r, 5000));
1175
+
continue;
1176
}
1177
}
1178
+
1179
console.warn(`Error fetching tweets for ${username}:`, e.message || e);
1180
return [];
1181
}
1182
}
1183
+
1184
console.log(`[${username}] ⚠️ Scraper returned 0 tweets (or failed silently) after retries.`);
1185
return [];
1186
}
1187
1188
+
// ============================================================================
1189
// Main Processing Logic
1190
+
// ============================================================================
1191
1192
+
// ============================================================================
1193
// Main Processing Logic
1194
+
// ============================================================================
1195
1196
async function processTweets(
1197
agent: BskyAgent,
···
1213
});
1214
1215
const processedTweets = loadProcessedTweets(bskyIdentifier);
1216
+
1217
// Maintain a local map that updates in real-time for intra-batch replies
1218
const localProcessedMap: ProcessedTweetsMap = { ...processedTweets };
1219
···
1240
if (isRetweet) {
1241
console.log(`[${twitterUsername}] ⏩ Skipping retweet ${tweetId}.`);
1242
if (!dryRun) {
1243
+
// Save as skipped so we don't check it again
1244
+
saveProcessedTweet(twitterUsername, bskyIdentifier, tweetId, { skipped: true, text: tweet.text });
1245
+
localProcessedMap[tweetId] = { skipped: true, text: tweet.text };
1246
}
1247
continue;
1248
}
···
1271
// Parent missing from local batch/DB. Attempt to fetch it if it's a self-thread.
1272
// We assume it's a self-thread if we don't have it, but we'll verify author after fetch.
1273
console.log(`[${twitterUsername}] 🕵️ Parent ${replyStatusId} missing. Checking if backfillable...`);
1274
+
1275
let parentBackfilled = false;
1276
try {
1277
+
const scraper = await getTwitterScraper();
1278
+
if (scraper) {
1279
+
const parentRaw = await scraper.getTweet(replyStatusId);
1280
+
if (parentRaw) {
1281
+
const parentTweet = mapScraperTweetToLocalTweet(parentRaw);
1282
+
const parentAuthor = parentTweet.user?.screen_name;
1283
+
1284
+
if (parentAuthor?.toLowerCase() === twitterUsername.toLowerCase()) {
1285
+
console.log(`[${twitterUsername}] 🔄 Parent is ours (@${parentAuthor}). Backfilling parent first...`);
1286
+
// Recursively process the parent
1287
+
await processTweets(agent, twitterUsername, bskyIdentifier, [parentTweet], dryRun);
1288
+
1289
+
// Check if it was saved
1290
+
const savedParent = dbService.getTweet(replyStatusId, bskyIdentifier);
1291
+
if (savedParent && savedParent.status === 'migrated') {
1292
+
// Update local map
1293
+
localProcessedMap[replyStatusId] = {
1294
+
uri: savedParent.bsky_uri,
1295
+
cid: savedParent.bsky_cid,
1296
+
root:
1297
+
savedParent.bsky_root_uri && savedParent.bsky_root_cid
1298
+
? { uri: savedParent.bsky_root_uri, cid: savedParent.bsky_root_cid }
1299
+
: undefined,
1300
+
tail:
1301
+
savedParent.bsky_tail_uri && savedParent.bsky_tail_cid
1302
+
? { uri: savedParent.bsky_tail_uri, cid: savedParent.bsky_tail_cid }
1303
+
: undefined,
1304
+
migrated: true,
1305
+
};
1306
+
replyParentInfo = localProcessedMap[replyStatusId] ?? null;
1307
+
parentBackfilled = true;
1308
+
console.log(`[${twitterUsername}] ✅ Parent backfilled. Resuming thread.`);
1309
}
1310
+
} else {
1311
+
console.log(`[${twitterUsername}] ⏩ Parent is by @${parentAuthor}. Skipping external reply.`);
1312
+
}
1313
}
1314
+
}
1315
} catch (e) {
1316
+
console.warn(`[${twitterUsername}] ⚠️ Failed to fetch/backfill parent ${replyStatusId}:`, e);
1317
}
1318
1319
if (!parentBackfilled) {
1320
+
console.log(`[${twitterUsername}] ⏩ Skipping external/unknown reply (Parent not found or external).`);
1321
+
if (!dryRun) {
1322
+
saveProcessedTweet(twitterUsername, bskyIdentifier, tweetId, { skipped: true, text: tweetText });
1323
+
localProcessedMap[tweetId] = { skipped: true, text: tweetText };
1324
+
}
1325
+
continue;
1326
}
1327
} else {
1328
console.log(`[${twitterUsername}] ⏩ Skipping external/unknown reply.`);
···
1335
}
1336
1337
// Removed early dryRun continue to allow verifying logic
1338
+
1339
let text = tweetText
1340
.replace(/&/g, '&')
1341
.replace(/</g, '<')
1342
.replace(/>/g, '>')
1343
.replace(/"/g, '"')
1344
.replace(/'/g, "'");
1345
+
1346
// 1. Link Expansion
1347
console.log(`[${twitterUsername}] 🔗 Expanding links...`);
1348
const urls = tweet.entities?.urls || [];
···
1357
const matches = text.match(tcoRegex) || [];
1358
for (const tco of matches) {
1359
// Avoid re-resolving if we already handled it via entities
1360
+
if (urls.some((u) => u.url === tco)) continue;
1361
1362
console.log(`[${twitterUsername}] 🔍 Resolving fallback link: ${tco}`);
1363
const resolved = await expandUrl(tco);
1364
if (resolved !== tco) {
1365
+
text = text.replace(tco, resolved);
1366
+
// Add to urls array so it can be used for card embedding later
1367
+
urls.push({ url: tco, expanded_url: resolved });
1368
+
}
1369
+
}
1370
+
1371
+
const isSponsoredCard = detectSponsoredCard(tweet);
1372
+
if (isSponsoredCard) {
1373
+
console.log(`[${twitterUsername}] 🧩 Sponsored/card payload detected. Extracting carousel media...`);
1374
+
injectCardMedia(tweet);
1375
+
} else if (tweet.permanentUrl) {
1376
+
const syndication = await fetchSyndicationMedia(tweet.permanentUrl);
1377
+
if (syndication.images.length > 0) {
1378
+
console.log(`[${twitterUsername}] 🧩 Syndication carousel detected. Extracting media...`);
1379
+
injectSyndicationMedia(tweet, syndication);
1380
}
1381
}
1382
···
1394
mediaLinksToRemove.push(media.url);
1395
if (media.expanded_url) mediaLinksToRemove.push(media.expanded_url);
1396
}
1397
+
if (media.source === 'card' && media.media_url_https) {
1398
+
mediaLinksToRemove.push(media.media_url_https);
1399
+
}
1400
+
1401
let aspectRatio: AspectRatio | undefined;
1402
if (media.sizes?.large) {
1403
aspectRatio = { width: media.sizes.large.w, height: media.sizes.large.h };
···
1413
console.log(`[${twitterUsername}] 📥 Downloading image (high quality): ${path.basename(highQualityUrl)}`);
1414
updateAppStatus({ message: `Downloading high quality image...` });
1415
const { buffer, mimeType } = await downloadMedia(highQualityUrl);
1416
+
1417
let blob: BlobRef;
1418
if (dryRun) {
1419
+
console.log(
1420
+
`[${twitterUsername}] 🧪 [DRY RUN] Would upload image (${(buffer.length / 1024).toFixed(2)} KB)`,
1421
+
);
1422
+
blob = { ref: { toString: () => 'mock-blob' }, mimeType, size: buffer.length } as any;
1423
} else {
1424
+
console.log(`[${twitterUsername}] 📤 Uploading image to Bluesky...`);
1425
+
updateAppStatus({ message: `Uploading image to Bluesky...` });
1426
+
blob = await uploadToBluesky(agent, buffer, mimeType);
1427
}
1428
+
1429
let altText = media.ext_alt_text;
1430
if (!altText) {
1431
+
console.log(`[${twitterUsername}] 🤖 Generating alt text via Gemini...`);
1432
+
// Use original tweet text for context, not the modified/cleaned one
1433
+
altText = await generateAltText(buffer, mimeType, tweetText);
1434
+
if (altText) console.log(`[${twitterUsername}] ✅ Alt text generated: ${altText.substring(0, 50)}...`);
1435
}
1436
1437
images.push({ alt: altText || 'Image from Twitter', image: blob, aspectRatio });
···
1452
} else if (media.type === 'video' || media.type === 'animated_gif') {
1453
const variants = media.video_info?.variants || [];
1454
const duration = media.video_info?.duration_millis || 0;
1455
+
1456
+
if (duration > 180000) {
1457
+
// 3 minutes
1458
+
console.warn(`[${twitterUsername}] ⚠️ Video too long (${(duration / 1000).toFixed(1)}s). Fallback to link.`);
1459
+
const tweetUrl = `https://twitter.com/${twitterUsername}/status/${tweetId}`;
1460
+
if (!text.includes(tweetUrl)) text += `\n\nVideo: ${tweetUrl}`;
1461
+
continue;
1462
}
1463
1464
const mp4s = variants
···
1473
console.log(`[${twitterUsername}] 📥 Downloading video: ${videoUrl}`);
1474
updateAppStatus({ message: `Downloading video: ${path.basename(videoUrl)}` });
1475
const { buffer, mimeType } = await downloadMedia(videoUrl);
1476
+
1477
if (buffer.length <= 90 * 1024 * 1024) {
1478
const filename = videoUrl.split('/').pop() || 'video.mp4';
1479
if (dryRun) {
1480
+
console.log(
1481
+
`[${twitterUsername}] 🧪 [DRY RUN] Would upload video: ${filename} (${(buffer.length / 1024 / 1024).toFixed(2)} MB)`,
1482
+
);
1483
+
videoBlob = {
1484
+
ref: { toString: () => 'mock-video-blob' },
1485
+
mimeType: 'video/mp4',
1486
+
size: buffer.length,
1487
+
} as any;
1488
} else {
1489
+
updateAppStatus({ message: `Uploading video to Bluesky...` });
1490
+
videoBlob = await uploadVideoToBluesky(agent, buffer, filename);
1491
}
1492
videoAspectRatio = aspectRatio;
1493
console.log(`[${twitterUsername}] ✅ Video upload process complete.`);
1494
break; // Prioritize first video
1495
}
1496
+
1497
+
console.warn(
1498
+
`[${twitterUsername}] ⚠️ Video too large (${(buffer.length / 1024 / 1024).toFixed(2)}MB). Fallback to link.`,
1499
+
);
1500
const tweetUrl = `https://twitter.com/${twitterUsername}/status/${tweetId}`;
1501
if (!text.includes(tweetUrl)) text += `\n\nVideo: ${tweetUrl}`;
1502
} catch (err) {
1503
const errMsg = (err as Error).message;
1504
+
if (errMsg !== 'VIDEO_FALLBACK_503') {
1505
+
console.error(`[${twitterUsername}] ❌ Failed video upload flow:`, errMsg);
1506
}
1507
const tweetUrl = `https://twitter.com/${twitterUsername}/status/${tweetId}`;
1508
if (!text.includes(tweetUrl)) text += `\n\nVideo: ${tweetUrl}`;
···
1514
1515
// Cleanup text
1516
for (const link of mediaLinksToRemove) text = text.split(link).join('').trim();
1517
+
if (isSponsoredCard) {
1518
+
const cardLinks = detectCarouselLinks(tweet);
1519
+
const cardPrimaryLink = detectCardMedia(tweet).link;
1520
+
const requestedLinks = [cardPrimaryLink, ...cardLinks].filter(Boolean) as string[];
1521
+
requestedLinks.forEach((link) => {
1522
+
if (!urls.some((u) => u.expanded_url === link || u.url === link)) {
1523
+
urls.push({ url: link, expanded_url: link });
1524
+
}
1525
+
});
1526
+
}
1527
text = text.replace(/\n\s*\n/g, '\n\n').trim();
1528
+
text = addTextFallbacks(text);
1529
1530
// 3. Quoting Logic
1531
let quoteEmbed: { $type: string; record: { uri: string; cid: string } } | null = null;
···
1541
} else {
1542
const quoteUrlEntity = urls.find((u) => u.expanded_url?.includes(quoteId));
1543
const qUrl = quoteUrlEntity?.expanded_url || `https://twitter.com/i/status/${quoteId}`;
1544
+
1545
// Check if it's a self-quote (same user)
1546
+
const isSelfQuote =
1547
+
qUrl.toLowerCase().includes(`twitter.com/${twitterUsername.toLowerCase()}/`) ||
1548
+
qUrl.toLowerCase().includes(`x.com/${twitterUsername.toLowerCase()}/`);
1549
+
1550
if (!isSelfQuote) {
1551
externalQuoteUrl = qUrl;
1552
console.log(`[${twitterUsername}] 🔗 Quoted tweet is external: ${externalQuoteUrl}`);
1553
+
1554
// Try to capture screenshot for external QTs if we have space for images
1555
if (images.length < 4 && !videoBlob) {
1556
const ssResult = await captureTweetScreenshot(externalQuoteUrl);
···
1558
try {
1559
let blob: BlobRef;
1560
if (dryRun) {
1561
+
console.log(
1562
+
`[${twitterUsername}] 🧪 [DRY RUN] Would upload screenshot for quote (${(ssResult.buffer.length / 1024).toFixed(2)} KB)`,
1563
+
);
1564
+
blob = {
1565
+
ref: { toString: () => 'mock-ss-blob' },
1566
+
mimeType: 'image/png',
1567
+
size: ssResult.buffer.length,
1568
+
} as any;
1569
} else {
1570
+
blob = await uploadToBluesky(agent, ssResult.buffer, 'image/png');
1571
}
1572
+
images.push({
1573
+
alt: `Quote Tweet: ${externalQuoteUrl}`,
1574
+
image: blob,
1575
+
aspectRatio: { width: ssResult.width, height: ssResult.height },
1576
});
1577
} catch (e) {
1578
console.warn(`[${twitterUsername}] ⚠️ Failed to upload screenshot blob.`);
···
1583
console.log(`[${twitterUsername}] 🔁 Quoted tweet is a self-quote, skipping link.`);
1584
}
1585
}
1586
+
} else if ((images.length === 0 && !videoBlob) || isSponsoredCard) {
1587
+
// If no media and no quote, check for external links to embed
1588
+
// We prioritize the LAST link found as it's often the main content
1589
+
const potentialLinks = urls
1590
+
.map((u) => u.expanded_url)
1591
+
.filter((u) => u && !u.includes('twitter.com') && !u.includes('x.com')) as string[];
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1592
1593
+
if (potentialLinks.length > 0) {
1594
+
const linkToEmbed = potentialLinks[potentialLinks.length - 1];
1595
+
if (linkToEmbed) {
1596
+
// Optimization: If text is too long, but removing the link makes it fit, do it!
1597
+
// The link will be present in the embed card anyway.
1598
+
if (text.length > 300 && text.includes(linkToEmbed)) {
1599
+
const lengthWithoutLink = text.length - linkToEmbed.length;
1600
+
// Allow some buffer (e.g. whitespace cleanup might save 1-2 chars)
1601
+
if (lengthWithoutLink <= 300) {
1602
+
console.log(
1603
+
`[${twitterUsername}] 📏 Optimizing: Removing link ${linkToEmbed} from text to avoid threading (Card will embed it).`,
1604
+
);
1605
+
text = text.replace(linkToEmbed, '').trim();
1606
+
// Clean up potential double punctuation/spaces left behind
1607
+
text = text.replace(/\s\.$/, '.').replace(/\s\s+/g, ' ');
1608
}
1609
+
}
1610
+
1611
+
console.log(`[${twitterUsername}] 🃏 Fetching link card for: ${linkToEmbed}`);
1612
+
linkCard = await fetchEmbedUrlCard(agent, linkToEmbed);
1613
}
1614
+
}
1615
}
1616
1617
// Only append link for external quotes IF we couldn't natively embed it OR screenshot it
1618
+
const hasScreenshot = images.some((img) => img.alt.startsWith('Quote Tweet:'));
1619
if (externalQuoteUrl && !quoteEmbed && !hasScreenshot && !text.includes(externalQuoteUrl)) {
1620
text += `\n\nQT: ${externalQuoteUrl}`;
1621
}
1622
1623
+
if (isSponsoredCard) {
1624
+
const hasCardImages = mediaEntities.some((media) => media.source === 'card');
1625
+
if (hasCardImages) {
1626
+
text = ensureSponsoredLinks(text, tweet);
1627
+
}
1628
+
}
1629
+
1630
// 4. Threading and Posting
1631
const chunks = splitText(text);
1632
console.log(`[${twitterUsername}] 📝 Splitting text into ${chunks.length} chunks.`);
1633
+
1634
let lastPostInfo: ProcessedTweetEntry | null = replyParentInfo;
1635
1636
// We will save the first chunk as the "Root" of this tweet, and the last chunk as the "Tail".
···
1639
1640
for (let i = 0; i < chunks.length; i++) {
1641
let chunk = chunks[i] as string;
1642
+
1643
// Add (i/n) if split
1644
if (chunks.length > 1) {
1645
+
chunk += ` (${i + 1}/${chunks.length})`;
1646
}
1647
1648
console.log(`[${twitterUsername}] 📤 Posting chunk ${i + 1}/${chunks.length}...`);
1649
updateAppStatus({ message: `Posting chunk ${i + 1}/${chunks.length}...` });
1650
+
1651
const rt = new RichText({ text: chunk });
1652
await rt.detectFacets(agent);
1653
const detectedLangs = detectLanguage(chunk);
···
1694
let rootRef: { uri: string; cid: string } | null = null;
1695
1696
if (lastPostInfo?.uri && lastPostInfo?.cid) {
1697
+
// If this is the start of a new tweet (i=0), check if parent has a tail
1698
+
if (i === 0 && lastPostInfo.tail) {
1699
+
parentRef = lastPostInfo.tail;
1700
+
} else {
1701
+
// Otherwise (intra-tweet or parent has no tail), use the main uri/cid (which is the previous post/chunk)
1702
+
parentRef = { uri: lastPostInfo.uri, cid: lastPostInfo.cid };
1703
+
}
1704
+
1705
+
rootRef = lastPostInfo.root || { uri: lastPostInfo.uri, cid: lastPostInfo.cid };
1706
}
1707
1708
if (parentRef && rootRef) {
···
1716
// Retry logic for network/socket errors
1717
let response: any;
1718
let retries = 3;
1719
+
1720
if (dryRun) {
1721
+
console.log(`[${twitterUsername}] 🧪 [DRY RUN] Would post chunk ${i + 1}/${chunks.length}`);
1722
+
if (postRecord.embed) console.log(` - With embed: ${postRecord.embed.$type}`);
1723
+
if (postRecord.reply) console.log(` - As reply to: ${postRecord.reply.parent.uri}`);
1724
+
response = { uri: 'at://did:plc:mock/app.bsky.feed.post/mock', cid: 'mock-cid' };
1725
} else {
1726
+
while (retries > 0) {
1727
+
try {
1728
+
response = await agent.post(postRecord);
1729
+
break;
1730
+
} catch (err: any) {
1731
+
retries--;
1732
+
if (retries === 0) throw err;
1733
+
console.warn(
1734
+
`[${twitterUsername}] ⚠️ Post failed (Socket/Network), retrying in 5s... (${retries} retries left)`,
1735
+
);
1736
+
await new Promise((r) => setTimeout(r, 5000));
1737
}
1738
+
}
1739
}
1740
+
1741
const currentPostInfo = {
1742
+
uri: response.uri,
1743
+
cid: response.cid,
1744
+
root: postRecord.reply ? postRecord.reply.root : { uri: response.uri, cid: response.cid },
1745
+
// Text is just the current chunk text
1746
+
text: chunk,
1747
};
1748
+
1749
if (i === 0) firstChunkInfo = currentPostInfo;
1750
lastChunkInfo = currentPostInfo;
1751
lastPostInfo = currentPostInfo; // Update for next iteration
1752
1753
console.log(`[${twitterUsername}] ✅ Chunk ${i + 1} posted successfully.`);
1754
+
1755
if (chunks.length > 1) {
1756
await new Promise((r) => setTimeout(r, 3000));
1757
}
···
1760
break;
1761
}
1762
}
1763
+
1764
// Save to DB and Map
1765
if (firstChunkInfo && lastChunkInfo) {
1766
+
const entry: ProcessedTweetEntry = {
1767
+
uri: firstChunkInfo.uri,
1768
+
cid: firstChunkInfo.cid,
1769
+
root: firstChunkInfo.root,
1770
+
tail: { uri: lastChunkInfo.uri, cid: lastChunkInfo.cid }, // Save tail!
1771
+
text: tweetText,
1772
+
};
1773
+
1774
+
if (!dryRun) {
1775
+
saveProcessedTweet(twitterUsername, bskyIdentifier, tweetId, entry);
1776
+
localProcessedMap[tweetId] = entry; // Update local map for subsequent replies in this batch
1777
+
}
1778
}
1779
+
1780
// Add a random delay between 5s and 15s to be more human-like
1781
const wait = Math.floor(Math.random() * 10000) + 5000;
1782
console.log(`[${twitterUsername}] 😴 Pacing: Waiting ${wait / 1000}s before next tweet.`);
···
1796
requestId?: string,
1797
): Promise<void> {
1798
const config = getConfig();
1799
+
const mapping = config.mappings.find((m) =>
1800
+
m.twitterUsernames.map((u) => u.toLowerCase()).includes(twitterUsername.toLowerCase()),
1801
+
);
1802
if (!mapping) {
1803
console.error(`No mapping found for twitter username: ${twitterUsername}`);
1804
return;
···
1806
1807
let agent = await getAgent(mapping);
1808
if (!agent) {
1809
+
if (dryRun) {
1810
+
console.log('⚠️ Could not login to Bluesky, but proceeding with MOCK AGENT for Dry Run.');
1811
+
// biome-ignore lint/suspicious/noExplicitAny: mock agent
1812
+
agent = {
1813
+
post: async (record: any) => ({ uri: 'at://did:plc:mock/app.bsky.feed.post/mock', cid: 'mock-cid' }),
1814
+
uploadBlob: async (data: any) => ({ data: { blob: { ref: { toString: () => 'mock-blob' } } } }),
1815
+
// Add other necessary methods if they are called outside of the already mocked dryRun blocks
1816
+
// But since we mocked the calls inside processTweets for dryRun, we just need the object to exist.
1817
+
session: { did: 'did:plc:mock' },
1818
+
com: { atproto: { repo: { describeRepo: async () => ({ data: {} }) } } },
1819
+
} as any;
1820
+
} else {
1821
+
return;
1822
+
}
1823
}
1824
1825
console.log(`Starting full history import for ${twitterUsername} -> ${mapping.bskyIdentifier}...`);
···
1830
1831
console.log(`Fetching tweets for ${twitterUsername}...`);
1832
updateAppStatus({ message: `Fetching tweets...` });
1833
+
1834
const client = await getTwitterScraper();
1835
if (client) {
1836
+
try {
1837
+
// Use getTweets which reliably fetches user timeline
1838
+
// limit defaults to 15 in function signature, but for history import we might want more.
1839
+
// However, the generator will fetch as much as we ask.
1840
+
const fetchLimit = limit || 100;
1841
+
const generator = client.getTweets(twitterUsername, fetchLimit);
0
0
0
0
0
0
0
0
1842
1843
+
for await (const scraperTweet of generator) {
1844
+
if (!ignoreCancellation) {
1845
+
const stillPending = getPendingBackfills().some(
1846
+
(b) => b.id === mapping.id && (!requestId || b.requestId === requestId),
1847
+
);
1848
+
if (!stillPending) {
1849
+
console.log(`[${twitterUsername}] 🛑 Backfill cancelled.`);
1850
+
break;
0
0
0
0
1851
}
1852
+
}
1853
+
1854
+
const t = mapScraperTweetToLocalTweet(scraperTweet);
1855
+
const tid = t.id_str || t.id;
1856
+
if (!tid) continue;
1857
+
1858
+
if (!processedTweets[tid] && !seenIds.has(tid)) {
1859
+
allFoundTweets.push(t);
1860
+
seenIds.add(tid);
1861
+
}
1862
+
1863
+
if (allFoundTweets.length >= fetchLimit) break;
1864
}
1865
+
} catch (e) {
1866
+
console.warn('Error during history fetch:', e);
1867
+
}
1868
}
1869
1870
console.log(`Fetch complete. Found ${allFoundTweets.length} new tweets to import.`);
···
1878
const activeTasks = new Map<string, Promise<void>>();
1879
1880
async function runAccountTask(mapping: AccountMapping, backfillRequest?: PendingBackfill, dryRun = false) {
1881
+
if (activeTasks.has(mapping.id)) return; // Already running
1882
1883
+
const task = (async () => {
1884
+
try {
1885
+
const agent = await getAgent(mapping);
1886
+
if (!agent) return;
1887
1888
+
const backfillReq = backfillRequest ?? getPendingBackfills().find((b) => b.id === mapping.id);
1889
+
1890
+
if (backfillReq) {
1891
+
const limit = backfillReq.limit || 15;
1892
+
console.log(
1893
+
`[${mapping.bskyIdentifier}] Running backfill for ${mapping.twitterUsernames.length} accounts (limit ${limit})...`,
1894
+
);
1895
+
updateAppStatus({
1896
+
state: 'backfilling',
1897
+
currentAccount: mapping.twitterUsernames[0],
1898
+
message: `Starting backfill (limit ${limit})...`,
1899
+
backfillMappingId: mapping.id,
1900
+
backfillRequestId: backfillReq.requestId,
1901
+
});
1902
+
1903
+
for (const twitterUsername of mapping.twitterUsernames) {
1904
+
const stillPending = getPendingBackfills().some(
1905
+
(b) => b.id === mapping.id && b.requestId === backfillReq.requestId,
1906
+
);
1907
+
if (!stillPending) {
1908
+
console.log(`[${mapping.bskyIdentifier}] 🛑 Backfill request replaced; stopping.`);
1909
+
break;
1910
+
}
1911
+
1912
+
try {
1913
+
updateAppStatus({
1914
+
state: 'backfilling',
1915
+
currentAccount: twitterUsername,
1916
+
message: `Starting backfill (limit ${limit})...`,
1917
+
backfillMappingId: mapping.id,
1918
+
backfillRequestId: backfillReq.requestId,
1919
+
});
1920
+
await importHistory(twitterUsername, mapping.bskyIdentifier, limit, dryRun, false, backfillReq.requestId);
1921
+
} catch (err) {
1922
+
console.error(`❌ Error backfilling ${twitterUsername}:`, err);
1923
+
}
1924
+
}
1925
+
clearBackfill(mapping.id, backfillReq.requestId);
1926
+
updateAppStatus({
1927
+
state: 'idle',
1928
+
message: `Backfill complete for ${mapping.bskyIdentifier}`,
1929
+
backfillMappingId: undefined,
1930
+
backfillRequestId: undefined,
1931
+
});
1932
+
console.log(`[${mapping.bskyIdentifier}] Backfill complete.`);
1933
+
} else {
1934
+
updateAppStatus({ backfillMappingId: undefined, backfillRequestId: undefined });
1935
1936
+
// Pre-load processed IDs for optimization
1937
+
const processedMap = loadProcessedTweets(mapping.bskyIdentifier);
1938
+
const processedIds = new Set(Object.keys(processedMap));
1939
+
1940
+
for (const twitterUsername of mapping.twitterUsernames) {
1941
+
try {
1942
+
console.log(`[${twitterUsername}] 🏁 Starting check for new tweets...`);
1943
+
updateAppStatus({
1944
+
state: 'checking',
1945
+
currentAccount: twitterUsername,
1946
+
message: 'Fetching latest tweets...',
1947
+
backfillMappingId: undefined,
1948
+
backfillRequestId: undefined,
1949
+
});
0
0
0
0
0
0
0
0
0
1950
1951
+
// Use fetchUserTweets with early stopping optimization
1952
+
// Increase limit slightly since we have early stopping now
1953
+
const tweets = await fetchUserTweets(twitterUsername, 50, processedIds);
1954
1955
+
if (!tweets || tweets.length === 0) {
1956
+
console.log(`[${twitterUsername}] ℹ️ No tweets found (or fetch failed).`);
1957
+
continue;
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1958
}
1959
+
1960
+
console.log(`[${twitterUsername}] 📥 Fetched ${tweets.length} tweets.`);
1961
+
await processTweets(agent, twitterUsername, mapping.bskyIdentifier, tweets, dryRun);
1962
+
} catch (err) {
1963
+
console.error(`❌ Error checking ${twitterUsername}:`, err);
1964
+
}
1965
}
1966
+
}
1967
+
} catch (err) {
1968
+
console.error(`Error processing mapping ${mapping.bskyIdentifier}:`, err);
1969
+
} finally {
1970
+
activeTasks.delete(mapping.id);
1971
+
}
1972
+
})();
1973
1974
+
activeTasks.set(mapping.id, task);
1975
+
return task; // Return task promise for await in main loop
1976
}
1977
1978
+
import type { AccountMapping } from './config-manager.js';
1979
import {
0
0
0
1980
clearBackfill,
1981
getNextCheckTime,
1982
+
getPendingBackfills,
1983
+
startServer,
1984
updateAppStatus,
1985
+
updateLastCheckTime,
1986
} from './server.js';
1987
import type { PendingBackfill } from './server.js';
0
1988
1989
async function main(): Promise<void> {
1990
const program = new Command();
···
2025
console.error('Twitter credentials not set. Cannot import history.');
2026
process.exit(1);
2027
}
2028
+
const mapping = config.mappings.find((m) =>
2029
+
m.twitterUsernames.map((u) => u.toLowerCase()).includes(options.username.toLowerCase()),
2030
+
);
2031
if (!mapping) {
2032
console.error(`No mapping found for ${options.username}`);
2033
process.exit(1);
···
2047
// Concurrency limit for processing accounts
2048
const runLimit = pLimit(3);
2049
2050
+
const findMappingById = (mappings: AccountMapping[], id: string) => mappings.find((mapping) => mapping.id === id);
0
2051
2052
// Main loop
2053
while (true) {
···
2090
for (const mapping of config.mappings) {
2091
if (!mapping.enabled) continue;
2092
2093
+
tasks.push(
2094
+
runLimit(async () => {
2095
+
await runAccountTask(mapping, undefined, options.dryRun);
2096
+
}),
2097
+
);
2098
}
2099
2100
if (tasks.length > 0) {