feat: include card media when available · indexx.dev/tweets2bsky@a072cde

A simple tool which lets you scrape twitter accounts and crosspost them to bluesky accounts! Comes with a CLI and a webapp for managing profiles! Works with images/videos/link embeds/threads.

fork atom

feat: include card media when available

jack 1 month ago a072cde6 9a1e7580

+880 -507

1 changed file

expand all

unified split

src

index.ts

+880 -507

src/index.ts

··· 2 import fs from 'node:fs'; 3 import path from 'node:path'; 4 import { fileURLToPath } from 'node:url'; 5 - import { BskyAgent, RichText } from '@atproto/api'; 6 import type { BlobRef } from '@atproto/api'; 7 import { Scraper } from '@the-convocation/twitter-scraper'; 8 import type { Tweet as ScraperTweet } from '@the-convocation/twitter-scraper'; 9 import axios from 'axios'; 10 import { Command } from 'commander'; 11 import * as francModule from 'franc-min'; 12 import iso6391 from 'iso-639-1'; 13 import puppeteer from 'puppeteer-core'; 14 - import * as cheerio from 'cheerio'; 15 import sharp from 'sharp'; 16 import { generateAltText } from './ai-manager.js'; 17 ··· 21 const __filename = fileURLToPath(import.meta.url); 22 const __dirname = path.dirname(__filename); 23 24 - // ============================================================================ 25 // Type Definitions 26 - // ============================================================================ 27 28 interface ProcessedTweetEntry { 29 uri?: string; ··· 44 expanded_url?: string; 45 } 46 47 interface MediaSize { 48 w: number; 49 h: number; ··· 78 sizes?: MediaSizes; 79 original_info?: OriginalInfo; 80 video_info?: VideoInfo; 81 } 82 83 interface TweetEntities { ··· 105 screen_name?: string; 106 id_str?: string; 107 }; 108 } 109 110 interface AspectRatio { ··· 120 121 import { dbService } from './db.js'; 122 123 - // ============================================================================ 124 // State Management 125 - // ============================================================================ 126 127 const PROCESSED_DIR = path.join(__dirname, '..', 'processed'); 128 129 async function migrateJsonToSqlite() { 130 if (!fs.existsSync(PROCESSED_DIR)) return; 131 - 132 - const files = fs.readdirSync(PROCESSED_DIR).filter(f => f.endsWith('.json')); 133 if (files.length === 0) return; 134 135 console.log(`📦 Found ${files.length} legacy cache files. Migrating to SQLite...`); 136 const config = getConfig(); 137 - 138 for (const file of files) { 139 const username = file.replace('.json', '').toLowerCase(); 140 // Try to find a matching bskyIdentifier from config 141 - const mapping = config.mappings.find(m => m.twitterUsernames.map(u => u.toLowerCase()).includes(username)); 142 const bskyIdentifier = mapping?.bskyIdentifier || 'unknown'; 143 144 try { 145 const filePath = path.join(PROCESSED_DIR, file); 146 const data = JSON.parse(fs.readFileSync(filePath, 'utf8')) as ProcessedTweetsMap; 147 - 148 for (const [twitterId, entry] of Object.entries(data)) { 149 dbService.saveTweet({ 150 twitter_id: twitterId, ··· 154 bsky_cid: entry.cid, 155 bsky_root_uri: entry.root?.uri, 156 bsky_root_cid: entry.root?.cid, 157 - status: entry.migrated ? 'migrated' : (entry.skipped ? 'skipped' : 'failed') 158 }); 159 } 160 // Move file to backup ··· 172 dbService.repairUnknownIdentifiers(username, mapping.bskyIdentifier); 173 } 174 } 175 - 176 console.log('✅ Migration complete.'); 177 } 178 ··· 180 return dbService.getTweetsByBskyIdentifier(bskyIdentifier); 181 } 182 183 - function saveProcessedTweet(twitterUsername: string, bskyIdentifier: string, twitterId: string, entry: ProcessedTweetEntry): void { 184 dbService.saveTweet({ 185 twitter_id: twitterId, 186 twitter_username: twitterUsername.toLowerCase(), ··· 192 bsky_root_cid: entry.root?.cid, 193 bsky_tail_uri: entry.tail?.uri, 194 bsky_tail_cid: entry.tail?.cid, 195 - status: entry.migrated || (entry.uri && entry.cid) ? 'migrated' : (entry.skipped ? 'skipped' : 'failed') 196 }); 197 } 198 199 - // ============================================================================ 200 // Custom Twitter Client 201 - // ============================================================================ 202 203 let scraper: Scraper | null = null; 204 let currentTwitterCookies = { authToken: '', ct0: '' }; ··· 216 } 217 218 if (!authToken || !ct0) return null; 219 - 220 // Re-initialize if config changed, not yet initialized, or forced reset 221 - if ( 222 - !scraper || 223 - forceReset || 224 - currentTwitterCookies.authToken !== authToken || 225 - currentTwitterCookies.ct0 !== ct0 226 - ) { 227 console.log(`🔄 Initializing Twitter scraper with ${useBackupCredentials ? 'BACKUP' : 'PRIMARY'} credentials...`); 228 scraper = new Scraper(); 229 - await scraper.setCookies([ 230 - `auth_token=${authToken}`, 231 - `ct0=${ct0}` 232 - ]); 233 234 - currentTwitterCookies = { 235 - authToken: authToken, 236 - ct0: ct0 237 }; 238 } 239 return scraper; ··· 247 await getTwitterScraper(true); 248 return true; 249 } 250 - console.log("⚠️ No backup credentials available to switch to."); 251 return false; 252 } 253 254 function mapScraperTweetToLocalTweet(scraperTweet: ScraperTweet): Tweet { 255 - const raw = scraperTweet.__raw_UNSTABLE; 256 - if (!raw) { 257 - // Fallback if raw data is missing (shouldn't happen for timeline tweets usually) 258 - return { 259 - id: scraperTweet.id, 260 - id_str: scraperTweet.id, 261 - text: scraperTweet.text, 262 - full_text: scraperTweet.text, 263 - isRetweet: scraperTweet.isRetweet, 264 - // Construct minimal entities from parsed data 265 - entities: { 266 - urls: scraperTweet.urls.map((url: string) => ({ url, expanded_url: url })), 267 - media: scraperTweet.photos.map((p: any) => ({ 268 - url: p.url, 269 - expanded_url: p.url, 270 - media_url_https: p.url, 271 - type: 'photo', 272 - ext_alt_text: p.alt_text, 273 - })), 274 - }, 275 - created_at: scraperTweet.timeParsed?.toUTCString() 276 - }; 277 - } 278 - 279 return { 280 - id: raw.id_str, 281 - id_str: raw.id_str, 282 - text: raw.full_text, 283 - full_text: raw.full_text, 284 - created_at: raw.created_at, 285 isRetweet: scraperTweet.isRetweet, 286 - // biome-ignore lint/suspicious/noExplicitAny: raw types match compatible structure 287 - entities: raw.entities as any, 288 - // biome-ignore lint/suspicious/noExplicitAny: raw types match compatible structure 289 - extended_entities: raw.extended_entities as any, 290 - quoted_status_id_str: raw.quoted_status_id_str, 291 - retweeted_status_id_str: raw.retweeted_status_id_str, 292 - is_quote_status: !!raw.quoted_status_id_str, 293 - in_reply_to_status_id_str: raw.in_reply_to_status_id_str, 294 - // biome-ignore lint/suspicious/noExplicitAny: missing in LegacyTweetRaw type 295 - in_reply_to_user_id_str: (raw as any).in_reply_to_user_id_str, 296 - user: { 297 - screen_name: scraperTweet.username, 298 - id_str: scraperTweet.userId, 299 }, 300 }; 301 } 302 303 - // ============================================================================ 304 // Helper Functions 305 - // ============================================================================ 306 307 function detectLanguage(text: string): string[] { 308 if (!text || text.trim().length === 0) return ['en']; ··· 335 return (response.request as any)?.res?.responseUrl || shortUrl; 336 } catch (e: any) { 337 if (e.code === 'ERR_FR_TOO_MANY_REDIRECTS' || e.response?.status === 403 || e.response?.status === 401) { 338 - // Silent fallback for common expansion issues (redirect loops, login walls) 339 - return shortUrl; 340 } 341 return shortUrl; 342 } ··· 372 const isGif = mimeType === 'image/gif'; 373 const isAnimation = isGif || isWebp; 374 375 - if ((buffer.length > MAX_SIZE && (mimeType.startsWith('image/') || mimeType === 'application/octet-stream')) || (isPng && buffer.length > MAX_SIZE)) { 376 console.log(`[UPLOAD] ⚖️ Image too large (${(buffer.length / 1024).toFixed(2)} KB). Optimizing...`); 377 try { 378 let image = sharp(buffer); ··· 386 while (currentBuffer.length > MAX_SIZE && attempts < 5) { 387 attempts++; 388 console.log(`[UPLOAD] 📉 Compression attempt ${attempts}: Width ${width}, Quality ${quality}...`); 389 - 390 if (isAnimation) { 391 - // For animations (GIF/WebP), we can only do so much without losing frames 392 - // Try to convert to WebP if it's a GIF, or optimize WebP 393 - image = sharp(buffer, { animated: true }); 394 - if (isGif) { 395 - // Convert GIF to WebP for better compression 396 - image = image.webp({ quality: Math.max(quality, 50), effort: 6 }); 397 - finalMimeType = 'image/webp'; 398 - } else { 399 - image = image.webp({ quality: Math.max(quality, 50), effort: 6 }); 400 - } 401 - // Resize if really big 402 - if (metadata.width && metadata.width > 800) { 403 - image = image.resize({ width: 800, withoutEnlargement: true }); 404 - } 405 } else { 406 - // Static images 407 - if (width > 1600) width = 1600; 408 - else if (attempts > 1) width = Math.floor(width * 0.8); 409 - 410 - quality = Math.max(50, quality - 10); 411 - 412 - image = sharp(buffer) 413 - .resize({ width, withoutEnlargement: true }) 414 - .jpeg({ quality, mozjpeg: true }); 415 - 416 - finalMimeType = 'image/jpeg'; 417 } 418 - 419 currentBuffer = await image.toBuffer(); 420 if (currentBuffer.length <= MAX_SIZE) { 421 - finalBuffer = currentBuffer; 422 - console.log(`[UPLOAD] ✅ Optimized to ${(finalBuffer.length / 1024).toFixed(2)} KB`); 423 - break; 424 } 425 } 426 - 427 if (finalBuffer.length > MAX_SIZE) { 428 - console.warn(`[UPLOAD] ⚠️ Could not compress below limit. Current: ${(finalBuffer.length / 1024).toFixed(2)} KB. Upload might fail.`); 429 } 430 - 431 } catch (err) { 432 console.warn(`[UPLOAD] ⚠️ Optimization failed, attempting original upload:`, (err as Error).message); 433 finalBuffer = buffer; ··· 455 'C:\\Program Files (x86)\\Google\\Chrome\\Application\\chrome.exe', 456 ]; 457 458 - const executablePath = browserPaths.find(p => fs.existsSync(p)); 459 - 460 if (!executablePath) { 461 console.warn(`[SCREENSHOT] ⏩ Skipping screenshot (no Chrome/Chromium found at common paths).`); 462 return null; ··· 500 `; 501 502 await page.setContent(html, { waitUntil: 'networkidle0' }); 503 - 504 // Wait for the twitter iframe to load and render 505 try { 506 await page.waitForSelector('iframe', { timeout: 10000 }); 507 // Small extra wait for images inside iframe 508 - await new Promise(r => setTimeout(r, 2000)); 509 } catch (e) { 510 console.warn(`[SCREENSHOT] ⚠️ Timeout waiting for tweet iframe, taking screenshot anyway.`); 511 } ··· 515 const box = await element.boundingBox(); 516 const buffer = await element.screenshot({ type: 'png', omitBackground: true }); 517 if (box) { 518 - console.log(`[SCREENSHOT] ✅ Captured successfully (${(buffer.length / 1024).toFixed(2)} KB) - ${Math.round(box.width)}x${Math.round(box.height)}`); 519 - return { buffer: buffer as Buffer, width: Math.round(box.width), height: Math.round(box.height) }; 520 } 521 } 522 } catch (err) { ··· 534 535 while (!blob) { 536 attempts++; 537 - const statusUrl = new URL("https://video.bsky.app/xrpc/app.bsky.video.getJobStatus"); 538 - statusUrl.searchParams.append("jobId", jobId); 539 540 const statusResponse = await fetch(statusUrl); 541 if (!statusResponse.ok) { ··· 553 if (statusData.jobStatus.blob) { 554 blob = statusData.jobStatus.blob; 555 console.log(`[VIDEO] 🎉 Video processing complete! Blob ref obtained.`); 556 - } else if (state === "JOB_STATE_FAILED") { 557 - throw new Error(`Video processing failed: ${statusData.jobStatus.error || "Unknown error"}`); 558 } else { 559 // Wait before next poll 560 await new Promise((resolve) => setTimeout(resolve, 5000)); ··· 562 563 if (attempts > 60) { 564 // ~5 minute timeout 565 - throw new Error("Video processing timed out after 5 minutes."); 566 } 567 } 568 return blob!; ··· 572 try { 573 const response = await axios.get(url, { 574 headers: { 575 - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36', 576 - 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8', 577 'Accept-Language': 'en-US,en;q=0.9', 578 }, 579 timeout: 10000, 580 maxRedirects: 5, 581 }); 582 - 583 const $ = cheerio.load(response.data); 584 const title = $('meta[property="og:title"]').attr('content') || $('title').text() || ''; 585 - const description = $('meta[property="og:description"]').attr('content') || $('meta[name="description"]').attr('content') || ''; 586 let thumbBlob: BlobRef | undefined; 587 588 let imageUrl = $('meta[property="og:image"]').attr('content'); 589 if (imageUrl) { 590 - if (!imageUrl.startsWith('http')) { 591 - const baseUrl = new URL(url); 592 - imageUrl = new URL(imageUrl, baseUrl.origin).toString(); 593 - } 594 - try { 595 - const { buffer, mimeType } = await downloadMedia(imageUrl); 596 - thumbBlob = await uploadToBluesky(agent, buffer, mimeType); 597 - } catch (e) { 598 - // SIlently fail thumbnail upload 599 - } 600 } 601 602 if (!title && !description) return null; 603 604 const external: any = { 605 - uri: url, 606 - title: title || url, 607 - description: description, 608 }; 609 610 if (thumbBlob) { 611 - external.thumb = thumbBlob; 612 } 613 614 return { 615 - $type: 'app.bsky.embed.external', 616 - external, 617 }; 618 - 619 } catch (err: any) { 620 if (err.code === 'ERR_FR_TOO_MANY_REDIRECTS') { 621 - // Ignore redirect loops 622 - return null; 623 } 624 console.warn(`Failed to fetch embed card for ${url}:`, err.message || err); 625 return null; ··· 627 } 628 629 async function uploadVideoToBluesky(agent: BskyAgent, buffer: Buffer, filename: string): Promise<BlobRef> { 630 - const sanitizedFilename = filename.split("?")[0] || "video.mp4"; 631 console.log( 632 `[VIDEO] 🟢 Starting upload process for ${sanitizedFilename} (${(buffer.length / 1024 / 1024).toFixed(2)} MB)`, 633 ); ··· 640 641 // didDoc might be present in repoDesc 642 const pdsService = (repoDesc as any).didDoc?.service?.find( 643 - (s: any) => s.id === "#atproto_pds" || s.type === "AtProtoPds", 644 ); 645 const pdsUrl = pdsService?.serviceEndpoint; 646 - const pdsHost = pdsUrl ? new URL(pdsUrl).host : "bsky.social"; 647 648 console.log(`[VIDEO] 🌐 PDS Host detected: ${pdsHost}`); 649 console.log(`[VIDEO] 🔑 Requesting service auth token for audience: did:web:${pdsHost}...`); 650 651 const { data: serviceAuth } = await agent.com.atproto.server.getServiceAuth({ 652 aud: `did:web:${pdsHost}`, 653 - lxm: "com.atproto.repo.uploadBlob", 654 exp: Math.floor(Date.now() / 1000) + 60 * 30, 655 }); 656 console.log(`[VIDEO] ✅ Service auth token obtained.`); ··· 658 const token = serviceAuth.token; 659 660 // 2. Upload to Video Service 661 - const uploadUrl = new URL("https://video.bsky.app/xrpc/app.bsky.video.uploadVideo"); 662 - uploadUrl.searchParams.append("did", agent.session!.did!); 663 - uploadUrl.searchParams.append("name", sanitizedFilename); 664 665 console.log(`[VIDEO] 📤 Uploading to ${uploadUrl.href}...`); 666 const uploadResponse = await fetch(uploadUrl, { 667 - method: "POST", 668 headers: { 669 Authorization: `Bearer ${token}`, 670 - "Content-Type": "video/mp4", 671 }, 672 body: new Blob([new Uint8Array(buffer)]), 673 }); ··· 678 // Handle specific error cases 679 try { 680 const errorJson = JSON.parse(errorText); 681 - 682 // Handle server overload gracefully 683 - if (uploadResponse.status === 503 || errorJson.error === "Server does not have enough capacity to handle uploads") { 684 - console.warn(`[VIDEO] ⚠️ Server overloaded (503). Skipping video upload and falling back to link.`); 685 - throw new Error("VIDEO_FALLBACK_503"); 686 } 687 688 - if (errorJson.error === "already_exists" && errorJson.jobId) { 689 console.log(`[VIDEO] ♻️ Video already exists. Resuming with Job ID: ${errorJson.jobId}`); 690 return await pollForVideoProcessing(agent, errorJson.jobId); 691 } 692 - if (errorJson.error === "unconfirmed_email" || (errorJson.jobStatus && errorJson.jobStatus.error === "unconfirmed_email")) { 693 - console.error(`[VIDEO] 🛑 BLUESKY ERROR: Your email is unconfirmed. You MUST verify your email on Bluesky to upload videos.`); 694 - throw new Error("Bluesky Email Unconfirmed - Video Upload Rejected"); 695 } 696 } catch (e) { 697 - if ((e as Error).message === "VIDEO_FALLBACK_503") throw e; 698 - // Not JSON or missing fields, proceed with throwing original error 699 } 700 - 701 console.error(`[VIDEO] ❌ Server responded with ${uploadResponse.status}: ${errorText}`); 702 throw new Error(`Video upload failed: ${uploadResponse.status} ${errorText}`); 703 } ··· 740 // 4. Force split 741 742 let splitIndex = -1; 743 - 744 // Check paragraphs 745 let checkIndex = remaining.lastIndexOf('\n\n', effectiveLimit); 746 if (checkIndex !== -1) splitIndex = checkIndex; 747 748 // Check sentences 749 if (splitIndex === -1) { 750 - // Look for punctuation followed by space 751 - const sentenceMatches = Array.from(remaining.substring(0, effectiveLimit).matchAll(/[.!?]\s/g)); 752 - if (sentenceMatches.length > 0) { 753 - const lastMatch = sentenceMatches[sentenceMatches.length - 1]; 754 - if (lastMatch && lastMatch.index !== undefined) { 755 - splitIndex = lastMatch.index + 1; // Include punctuation 756 - } 757 } 758 } 759 760 // Check spaces 761 if (splitIndex === -1) { 762 - checkIndex = remaining.lastIndexOf(' ', effectiveLimit); 763 - if (checkIndex !== -1) splitIndex = checkIndex; 764 } 765 766 // Force split if no good break point found 767 if (splitIndex === -1) { 768 - splitIndex = effectiveLimit; 769 } 770 771 chunks.push(remaining.substring(0, splitIndex).trim()); ··· 814 async function fetchUserTweets(username: string, limit: number, processedIds?: Set<string>): Promise<Tweet[]> { 815 const client = await getTwitterScraper(); 816 if (!client) return []; 817 - 818 let retries = 3; 819 while (retries > 0) { 820 try { 821 const tweets: Tweet[] = []; 822 const generator = client.getTweets(username, limit); 823 let consecutiveProcessedCount = 0; 824 - 825 for await (const t of generator) { 826 const tweet = mapScraperTweetToLocalTweet(t); 827 const tweetId = tweet.id_str || tweet.id; 828 - 829 // Early stopping logic: if we see 3 consecutive tweets we've already processed, stop. 830 // This assumes timeline order (mostly true). 831 if (processedIds && tweetId && processedIds.has(tweetId)) { 832 - consecutiveProcessedCount++; 833 - if (consecutiveProcessedCount >= 3) { 834 - console.log(`[${username}] 🛑 Found 3 consecutive processed tweets. Stopping fetch early.`); 835 - break; 836 - } 837 } else { 838 - consecutiveProcessedCount = 0; 839 } 840 841 tweets.push(tweet); ··· 844 return tweets; 845 } catch (e: any) { 846 retries--; 847 - const isRetryable = e.message?.includes('ServiceUnavailable') || e.message?.includes('Timeout') || e.message?.includes('429') || e.message?.includes('401'); 848 - 849 // Check for Twitter Internal Server Error (often returns 400 with specific body) 850 if (e?.response?.status === 400 && JSON.stringify(e?.response?.data || {}).includes('InternalServerError')) { 851 - console.warn(`⚠️ Twitter Internal Server Error (Transient) for ${username}.`); 852 - // Treat as retryable 853 - if (retries > 0) { 854 - await new Promise(r => setTimeout(r, 5000)); 855 - continue; 856 - } 857 } 858 859 if (isRetryable) { 860 console.warn(`⚠️ Error fetching tweets for ${username} (${e.message}).`); 861 - 862 // Attempt credential switch if we have backups 863 if (await switchCredentials()) { 864 - console.log(`🔄 Retrying with new credentials...`); 865 - continue; // Retry loop with new credentials 866 } 867 868 if (retries > 0) { 869 - console.log(`Waiting 5s before retry...`); 870 - await new Promise(r => setTimeout(r, 5000)); 871 - continue; 872 } 873 } 874 - 875 console.warn(`Error fetching tweets for ${username}:`, e.message || e); 876 return []; 877 } 878 } 879 - 880 console.log(`[${username}] ⚠️ Scraper returned 0 tweets (or failed silently) after retries.`); 881 return []; 882 } 883 884 - // ============================================================================ 885 // Main Processing Logic 886 - // ============================================================================ 887 888 - // ============================================================================ 889 // Main Processing Logic 890 - // ============================================================================ 891 892 async function processTweets( 893 agent: BskyAgent, ··· 909 }); 910 911 const processedTweets = loadProcessedTweets(bskyIdentifier); 912 - 913 // Maintain a local map that updates in real-time for intra-batch replies 914 const localProcessedMap: ProcessedTweetsMap = { ...processedTweets }; 915 ··· 936 if (isRetweet) { 937 console.log(`[${twitterUsername}] ⏩ Skipping retweet ${tweetId}.`); 938 if (!dryRun) { 939 - // Save as skipped so we don't check it again 940 - saveProcessedTweet(twitterUsername, bskyIdentifier, tweetId, { skipped: true, text: tweet.text }); 941 - localProcessedMap[tweetId] = { skipped: true, text: tweet.text }; 942 } 943 continue; 944 } ··· 967 // Parent missing from local batch/DB. Attempt to fetch it if it's a self-thread. 968 // We assume it's a self-thread if we don't have it, but we'll verify author after fetch. 969 console.log(`[${twitterUsername}] 🕵️ Parent ${replyStatusId} missing. Checking if backfillable...`); 970 - 971 let parentBackfilled = false; 972 try { 973 - const scraper = await getTwitterScraper(); 974 - if (scraper) { 975 - const parentRaw = await scraper.getTweet(replyStatusId); 976 - if (parentRaw) { 977 - const parentTweet = mapScraperTweetToLocalTweet(parentRaw); 978 - const parentAuthor = parentTweet.user?.screen_name; 979 - 980 - if (parentAuthor?.toLowerCase() === twitterUsername.toLowerCase()) { 981 - console.log(`[${twitterUsername}] 🔄 Parent is ours (@${parentAuthor}). Backfilling parent first...`); 982 - // Recursively process the parent 983 - await processTweets(agent, twitterUsername, bskyIdentifier, [parentTweet], dryRun); 984 - 985 - // Check if it was saved 986 - const savedParent = dbService.getTweet(replyStatusId, bskyIdentifier); 987 - if (savedParent && savedParent.status === 'migrated') { 988 - // Update local map 989 - localProcessedMap[replyStatusId] = { 990 - uri: savedParent.bsky_uri, 991 - cid: savedParent.bsky_cid, 992 - root: (savedParent.bsky_root_uri && savedParent.bsky_root_cid) ? { uri: savedParent.bsky_root_uri, cid: savedParent.bsky_root_cid } : undefined, 993 - tail: (savedParent.bsky_tail_uri && savedParent.bsky_tail_cid) ? { uri: savedParent.bsky_tail_uri, cid: savedParent.bsky_tail_cid } : undefined, 994 - migrated: true 995 - }; 996 - replyParentInfo = localProcessedMap[replyStatusId] ?? null; 997 - parentBackfilled = true; 998 - console.log(`[${twitterUsername}] ✅ Parent backfilled. Resuming thread.`); 999 - } 1000 - } else { 1001 - console.log(`[${twitterUsername}] ⏩ Parent is by @${parentAuthor}. Skipping external reply.`); 1002 - } 1003 } 1004 } 1005 } catch (e) { 1006 - console.warn(`[${twitterUsername}] ⚠️ Failed to fetch/backfill parent ${replyStatusId}:`, e); 1007 } 1008 1009 if (!parentBackfilled) { 1010 - console.log(`[${twitterUsername}] ⏩ Skipping external/unknown reply (Parent not found or external).`); 1011 - if (!dryRun) { 1012 - saveProcessedTweet(twitterUsername, bskyIdentifier, tweetId, { skipped: true, text: tweetText }); 1013 - localProcessedMap[tweetId] = { skipped: true, text: tweetText }; 1014 - } 1015 - continue; 1016 } 1017 } else { 1018 console.log(`[${twitterUsername}] ⏩ Skipping external/unknown reply.`); ··· 1025 } 1026 1027 // Removed early dryRun continue to allow verifying logic 1028 - 1029 let text = tweetText 1030 .replace(/&/g, '&') 1031 .replace(/</g, '<') 1032 .replace(/>/g, '>') 1033 .replace(/"/g, '"') 1034 .replace(/'/g, "'"); 1035 - 1036 // 1. Link Expansion 1037 console.log(`[${twitterUsername}] 🔗 Expanding links...`); 1038 const urls = tweet.entities?.urls || []; ··· 1047 const matches = text.match(tcoRegex) || []; 1048 for (const tco of matches) { 1049 // Avoid re-resolving if we already handled it via entities 1050 - if (urls.some(u => u.url === tco)) continue; 1051 1052 console.log(`[${twitterUsername}] 🔍 Resolving fallback link: ${tco}`); 1053 const resolved = await expandUrl(tco); 1054 if (resolved !== tco) { 1055 - text = text.replace(tco, resolved); 1056 - // Add to urls array so it can be used for card embedding later 1057 - urls.push({ url: tco, expanded_url: resolved }); 1058 } 1059 } 1060 ··· 1072 mediaLinksToRemove.push(media.url); 1073 if (media.expanded_url) mediaLinksToRemove.push(media.expanded_url); 1074 } 1075 - 1076 let aspectRatio: AspectRatio | undefined; 1077 if (media.sizes?.large) { 1078 aspectRatio = { width: media.sizes.large.w, height: media.sizes.large.h }; ··· 1088 console.log(`[${twitterUsername}] 📥 Downloading image (high quality): ${path.basename(highQualityUrl)}`); 1089 updateAppStatus({ message: `Downloading high quality image...` }); 1090 const { buffer, mimeType } = await downloadMedia(highQualityUrl); 1091 - 1092 let blob: BlobRef; 1093 if (dryRun) { 1094 - console.log(`[${twitterUsername}] 🧪 [DRY RUN] Would upload image (${(buffer.length/1024).toFixed(2)} KB)`); 1095 - blob = { ref: { toString: () => 'mock-blob' }, mimeType, size: buffer.length } as any; 1096 } else { 1097 - console.log(`[${twitterUsername}] 📤 Uploading image to Bluesky...`); 1098 - updateAppStatus({ message: `Uploading image to Bluesky...` }); 1099 - blob = await uploadToBluesky(agent, buffer, mimeType); 1100 } 1101 - 1102 let altText = media.ext_alt_text; 1103 if (!altText) { 1104 - console.log(`[${twitterUsername}] 🤖 Generating alt text via Gemini...`); 1105 - // Use original tweet text for context, not the modified/cleaned one 1106 - altText = await generateAltText(buffer, mimeType, tweetText); 1107 - if (altText) console.log(`[${twitterUsername}] ✅ Alt text generated: ${altText.substring(0, 50)}...`); 1108 } 1109 1110 images.push({ alt: altText || 'Image from Twitter', image: blob, aspectRatio }); ··· 1125 } else if (media.type === 'video' || media.type === 'animated_gif') { 1126 const variants = media.video_info?.variants || []; 1127 const duration = media.video_info?.duration_millis || 0; 1128 - 1129 - if (duration > 180000) { // 3 minutes 1130 - console.warn(`[${twitterUsername}] ⚠️ Video too long (${(duration / 1000).toFixed(1)}s). Fallback to link.`); 1131 - const tweetUrl = `https://twitter.com/${twitterUsername}/status/${tweetId}`; 1132 - if (!text.includes(tweetUrl)) text += `\n\nVideo: ${tweetUrl}`; 1133 - continue; 1134 } 1135 1136 const mp4s = variants ··· 1145 console.log(`[${twitterUsername}] 📥 Downloading video: ${videoUrl}`); 1146 updateAppStatus({ message: `Downloading video: ${path.basename(videoUrl)}` }); 1147 const { buffer, mimeType } = await downloadMedia(videoUrl); 1148 - 1149 if (buffer.length <= 90 * 1024 * 1024) { 1150 const filename = videoUrl.split('/').pop() || 'video.mp4'; 1151 if (dryRun) { 1152 - console.log(`[${twitterUsername}] 🧪 [DRY RUN] Would upload video: ${filename} (${(buffer.length/1024/1024).toFixed(2)} MB)`); 1153 - videoBlob = { ref: { toString: () => 'mock-video-blob' }, mimeType: 'video/mp4', size: buffer.length } as any; 1154 } else { 1155 - updateAppStatus({ message: `Uploading video to Bluesky...` }); 1156 - videoBlob = await uploadVideoToBluesky(agent, buffer, filename); 1157 } 1158 videoAspectRatio = aspectRatio; 1159 console.log(`[${twitterUsername}] ✅ Video upload process complete.`); 1160 break; // Prioritize first video 1161 } 1162 - 1163 - console.warn(`[${twitterUsername}] ⚠️ Video too large (${(buffer.length / 1024 / 1024).toFixed(2)}MB). Fallback to link.`); 1164 const tweetUrl = `https://twitter.com/${twitterUsername}/status/${tweetId}`; 1165 if (!text.includes(tweetUrl)) text += `\n\nVideo: ${tweetUrl}`; 1166 } catch (err) { 1167 const errMsg = (err as Error).message; 1168 - if (errMsg !== "VIDEO_FALLBACK_503") { 1169 - console.error(`[${twitterUsername}] ❌ Failed video upload flow:`, errMsg); 1170 } 1171 const tweetUrl = `https://twitter.com/${twitterUsername}/status/${tweetId}`; 1172 if (!text.includes(tweetUrl)) text += `\n\nVideo: ${tweetUrl}`; ··· 1178 1179 // Cleanup text 1180 for (const link of mediaLinksToRemove) text = text.split(link).join('').trim(); 1181 text = text.replace(/\n\s*\n/g, '\n\n').trim(); 1182 1183 // 3. Quoting Logic 1184 let quoteEmbed: { $type: string; record: { uri: string; cid: string } } | null = null; ··· 1194 } else { 1195 const quoteUrlEntity = urls.find((u) => u.expanded_url?.includes(quoteId)); 1196 const qUrl = quoteUrlEntity?.expanded_url || `https://twitter.com/i/status/${quoteId}`; 1197 - 1198 // Check if it's a self-quote (same user) 1199 - const isSelfQuote = qUrl.toLowerCase().includes(`twitter.com/${twitterUsername.toLowerCase()}/`) || 1200 - qUrl.toLowerCase().includes(`x.com/${twitterUsername.toLowerCase()}/`); 1201 - 1202 if (!isSelfQuote) { 1203 externalQuoteUrl = qUrl; 1204 console.log(`[${twitterUsername}] 🔗 Quoted tweet is external: ${externalQuoteUrl}`); 1205 - 1206 // Try to capture screenshot for external QTs if we have space for images 1207 if (images.length < 4 && !videoBlob) { 1208 const ssResult = await captureTweetScreenshot(externalQuoteUrl); ··· 1210 try { 1211 let blob: BlobRef; 1212 if (dryRun) { 1213 - console.log(`[${twitterUsername}] 🧪 [DRY RUN] Would upload screenshot for quote (${(ssResult.buffer.length/1024).toFixed(2)} KB)`); 1214 - blob = { ref: { toString: () => 'mock-ss-blob' }, mimeType: 'image/png', size: ssResult.buffer.length } as any; 1215 } else { 1216 - blob = await uploadToBluesky(agent, ssResult.buffer, 'image/png'); 1217 } 1218 - images.push({ 1219 - alt: `Quote Tweet: ${externalQuoteUrl}`, 1220 - image: blob, 1221 - aspectRatio: { width: ssResult.width, height: ssResult.height } 1222 }); 1223 } catch (e) { 1224 console.warn(`[${twitterUsername}] ⚠️ Failed to upload screenshot blob.`); ··· 1229 console.log(`[${twitterUsername}] 🔁 Quoted tweet is a self-quote, skipping link.`); 1230 } 1231 } 1232 - } else if (images.length === 0 && !videoBlob) { 1233 - // If no media and no quote, check for external links to embed 1234 - // We prioritize the LAST link found as it's often the main content 1235 - const potentialLinks = urls 1236 - .map(u => u.expanded_url) 1237 - .filter(u => u && !u.includes('twitter.com') && !u.includes('x.com')) as string[]; 1238 - 1239 - if (potentialLinks.length > 0) { 1240 - const linkToEmbed = potentialLinks[potentialLinks.length - 1]; 1241 - if (linkToEmbed) { 1242 - // Optimization: If text is too long, but removing the link makes it fit, do it! 1243 - // The link will be present in the embed card anyway. 1244 - if (text.length > 300 && text.includes(linkToEmbed)) { 1245 - const lengthWithoutLink = text.length - linkToEmbed.length; 1246 - // Allow some buffer (e.g. whitespace cleanup might save 1-2 chars) 1247 - if (lengthWithoutLink <= 300) { 1248 - console.log(`[${twitterUsername}] 📏 Optimizing: Removing link ${linkToEmbed} from text to avoid threading (Card will embed it).`); 1249 - text = text.replace(linkToEmbed, '').trim(); 1250 - // Clean up potential double punctuation/spaces left behind 1251 - text = text.replace(/\s\.$/, '.').replace(/\s\s+/g, ' '); 1252 - } 1253 - } 1254 1255 - console.log(`[${twitterUsername}] 🃏 Fetching link card for: ${linkToEmbed}`); 1256 - linkCard = await fetchEmbedUrlCard(agent, linkToEmbed); 1257 } 1258 } 1259 } 1260 1261 // Only append link for external quotes IF we couldn't natively embed it OR screenshot it 1262 - const hasScreenshot = images.some(img => img.alt.startsWith('Quote Tweet:')); 1263 if (externalQuoteUrl && !quoteEmbed && !hasScreenshot && !text.includes(externalQuoteUrl)) { 1264 text += `\n\nQT: ${externalQuoteUrl}`; 1265 } 1266 1267 // 4. Threading and Posting 1268 const chunks = splitText(text); 1269 console.log(`[${twitterUsername}] 📝 Splitting text into ${chunks.length} chunks.`); 1270 - 1271 let lastPostInfo: ProcessedTweetEntry | null = replyParentInfo; 1272 1273 // We will save the first chunk as the "Root" of this tweet, and the last chunk as the "Tail". ··· 1276 1277 for (let i = 0; i < chunks.length; i++) { 1278 let chunk = chunks[i] as string; 1279 - 1280 // Add (i/n) if split 1281 if (chunks.length > 1) { 1282 - chunk += ` (${i + 1}/${chunks.length})`; 1283 } 1284 1285 console.log(`[${twitterUsername}] 📤 Posting chunk ${i + 1}/${chunks.length}...`); 1286 updateAppStatus({ message: `Posting chunk ${i + 1}/${chunks.length}...` }); 1287 - 1288 const rt = new RichText({ text: chunk }); 1289 await rt.detectFacets(agent); 1290 const detectedLangs = detectLanguage(chunk); ··· 1331 let rootRef: { uri: string; cid: string } | null = null; 1332 1333 if (lastPostInfo?.uri && lastPostInfo?.cid) { 1334 - // If this is the start of a new tweet (i=0), check if parent has a tail 1335 - if (i === 0 && lastPostInfo.tail) { 1336 - parentRef = lastPostInfo.tail; 1337 - } else { 1338 - // Otherwise (intra-tweet or parent has no tail), use the main uri/cid (which is the previous post/chunk) 1339 - parentRef = { uri: lastPostInfo.uri, cid: lastPostInfo.cid }; 1340 - } 1341 - 1342 - rootRef = lastPostInfo.root || { uri: lastPostInfo.uri, cid: lastPostInfo.cid }; 1343 } 1344 1345 if (parentRef && rootRef) { ··· 1353 // Retry logic for network/socket errors 1354 let response: any; 1355 let retries = 3; 1356 - 1357 if (dryRun) { 1358 - console.log(`[${twitterUsername}] 🧪 [DRY RUN] Would post chunk ${i + 1}/${chunks.length}`); 1359 - if (postRecord.embed) console.log(` - With embed: ${postRecord.embed.$type}`); 1360 - if (postRecord.reply) console.log(` - As reply to: ${postRecord.reply.parent.uri}`); 1361 - response = { uri: 'at://did:plc:mock/app.bsky.feed.post/mock', cid: 'mock-cid' }; 1362 } else { 1363 - while (retries > 0) { 1364 - try { 1365 - response = await agent.post(postRecord); 1366 - break; 1367 - } catch (err: any) { 1368 - retries--; 1369 - if (retries === 0) throw err; 1370 - console.warn(`[${twitterUsername}] ⚠️ Post failed (Socket/Network), retrying in 5s... (${retries} retries left)`); 1371 - await new Promise(r => setTimeout(r, 5000)); 1372 - } 1373 } 1374 } 1375 - 1376 const currentPostInfo = { 1377 - uri: response.uri, 1378 - cid: response.cid, 1379 - root: postRecord.reply ? postRecord.reply.root : { uri: response.uri, cid: response.cid }, 1380 - // Text is just the current chunk text 1381 - text: chunk 1382 }; 1383 - 1384 if (i === 0) firstChunkInfo = currentPostInfo; 1385 lastChunkInfo = currentPostInfo; 1386 lastPostInfo = currentPostInfo; // Update for next iteration 1387 1388 console.log(`[${twitterUsername}] ✅ Chunk ${i + 1} posted successfully.`); 1389 - 1390 if (chunks.length > 1) { 1391 await new Promise((r) => setTimeout(r, 3000)); 1392 } ··· 1395 break; 1396 } 1397 } 1398 - 1399 // Save to DB and Map 1400 if (firstChunkInfo && lastChunkInfo) { 1401 - const entry: ProcessedTweetEntry = { 1402 - uri: firstChunkInfo.uri, 1403 - cid: firstChunkInfo.cid, 1404 - root: firstChunkInfo.root, 1405 - tail: { uri: lastChunkInfo.uri, cid: lastChunkInfo.cid }, // Save tail! 1406 - text: tweetText 1407 - }; 1408 - 1409 - if (!dryRun) { 1410 - saveProcessedTweet(twitterUsername, bskyIdentifier, tweetId, entry); 1411 - localProcessedMap[tweetId] = entry; // Update local map for subsequent replies in this batch 1412 - } 1413 } 1414 - 1415 // Add a random delay between 5s and 15s to be more human-like 1416 const wait = Math.floor(Math.random() * 10000) + 5000; 1417 console.log(`[${twitterUsername}] 😴 Pacing: Waiting ${wait / 1000}s before next tweet.`); ··· 1431 requestId?: string, 1432 ): Promise<void> { 1433 const config = getConfig(); 1434 - const mapping = config.mappings.find((m) => m.twitterUsernames.map(u => u.toLowerCase()).includes(twitterUsername.toLowerCase())); 1435 if (!mapping) { 1436 console.error(`No mapping found for twitter username: ${twitterUsername}`); 1437 return; ··· 1439 1440 let agent = await getAgent(mapping); 1441 if (!agent) { 1442 - if (dryRun) { 1443 - console.log("⚠️ Could not login to Bluesky, but proceeding with MOCK AGENT for Dry Run."); 1444 - // biome-ignore lint/suspicious/noExplicitAny: mock agent 1445 - agent = { 1446 - post: async (record: any) => ({ uri: 'at://did:plc:mock/app.bsky.feed.post/mock', cid: 'mock-cid' }), 1447 - uploadBlob: async (data: any) => ({ data: { blob: { ref: { toString: () => 'mock-blob' } } } }), 1448 - // Add other necessary methods if they are called outside of the already mocked dryRun blocks 1449 - // But since we mocked the calls inside processTweets for dryRun, we just need the object to exist. 1450 - session: { did: 'did:plc:mock' }, 1451 - com: { atproto: { repo: { describeRepo: async () => ({ data: {} }) } } } 1452 - } as any; 1453 - } else { 1454 - return; 1455 - } 1456 } 1457 1458 console.log(`Starting full history import for ${twitterUsername} -> ${mapping.bskyIdentifier}...`); ··· 1463 1464 console.log(`Fetching tweets for ${twitterUsername}...`); 1465 updateAppStatus({ message: `Fetching tweets...` }); 1466 - 1467 const client = await getTwitterScraper(); 1468 if (client) { 1469 - try { 1470 - // Use getTweets which reliably fetches user timeline 1471 - // limit defaults to 15 in function signature, but for history import we might want more. 1472 - // However, the generator will fetch as much as we ask. 1473 - const fetchLimit = limit || 100; 1474 - const generator = client.getTweets(twitterUsername, fetchLimit); 1475 - 1476 - for await (const scraperTweet of generator) { 1477 - if (!ignoreCancellation) { 1478 - const stillPending = getPendingBackfills().some(b => b.id === mapping.id && (!requestId || b.requestId === requestId)); 1479 - if (!stillPending) { 1480 - console.log(`[${twitterUsername}] 🛑 Backfill cancelled.`); 1481 - break; 1482 - } 1483 1484 - } 1485 - 1486 - const t = mapScraperTweetToLocalTweet(scraperTweet); 1487 - const tid = t.id_str || t.id; 1488 - if (!tid) continue; 1489 - 1490 - if (!processedTweets[tid] && !seenIds.has(tid)) { 1491 - allFoundTweets.push(t); 1492 - seenIds.add(tid); 1493 - } 1494 - 1495 - if (allFoundTweets.length >= fetchLimit) break; 1496 } 1497 - } catch(e) { 1498 - console.warn("Error during history fetch:", e); 1499 } 1500 } 1501 1502 console.log(`Fetch complete. Found ${allFoundTweets.length} new tweets to import.`); ··· 1510 const activeTasks = new Map<string, Promise<void>>(); 1511 1512 async function runAccountTask(mapping: AccountMapping, backfillRequest?: PendingBackfill, dryRun = false) { 1513 - if (activeTasks.has(mapping.id)) return; // Already running 1514 1515 - const task = (async () => { 1516 - try { 1517 - const agent = await getAgent(mapping); 1518 - if (!agent) return; 1519 1520 - const backfillReq = backfillRequest ?? getPendingBackfills().find(b => b.id === mapping.id); 1521 - 1522 - if (backfillReq) { 1523 - const limit = backfillReq.limit || 15; 1524 - console.log(`[${mapping.bskyIdentifier}] Running backfill for ${mapping.twitterUsernames.length} accounts (limit ${limit})...`); 1525 - updateAppStatus({ 1526 - state: 'backfilling', 1527 - currentAccount: mapping.twitterUsernames[0], 1528 - message: `Starting backfill (limit ${limit})...`, 1529 - backfillMappingId: mapping.id, 1530 - backfillRequestId: backfillReq.requestId, 1531 - }); 1532 - 1533 - for (const twitterUsername of mapping.twitterUsernames) { 1534 - const stillPending = getPendingBackfills().some( 1535 - (b) => b.id === mapping.id && b.requestId === backfillReq.requestId, 1536 - ); 1537 - if (!stillPending) { 1538 - console.log(`[${mapping.bskyIdentifier}] 🛑 Backfill request replaced; stopping.`); 1539 - break; 1540 - } 1541 1542 - try { 1543 - updateAppStatus({ 1544 - state: 'backfilling', 1545 - currentAccount: twitterUsername, 1546 - message: `Starting backfill (limit ${limit})...`, 1547 - backfillMappingId: mapping.id, 1548 - backfillRequestId: backfillReq.requestId, 1549 - }); 1550 - await importHistory(twitterUsername, mapping.bskyIdentifier, limit, dryRun, false, backfillReq.requestId); 1551 - } catch (err) { 1552 - console.error(`❌ Error backfilling ${twitterUsername}:`, err); 1553 - } 1554 - } 1555 - clearBackfill(mapping.id, backfillReq.requestId); 1556 - updateAppStatus({ 1557 - state: 'idle', 1558 - message: `Backfill complete for ${mapping.bskyIdentifier}`, 1559 - backfillMappingId: undefined, 1560 - backfillRequestId: undefined, 1561 - }); 1562 - console.log(`[${mapping.bskyIdentifier}] Backfill complete.`); 1563 - } else { 1564 - updateAppStatus({ backfillMappingId: undefined, backfillRequestId: undefined }); 1565 1566 - // Pre-load processed IDs for optimization 1567 - const processedMap = loadProcessedTweets(mapping.bskyIdentifier); 1568 - const processedIds = new Set(Object.keys(processedMap)); 1569 1570 - for (const twitterUsername of mapping.twitterUsernames) { 1571 - try { 1572 - console.log(`[${twitterUsername}] 🏁 Starting check for new tweets...`); 1573 - updateAppStatus({ 1574 - state: 'checking', 1575 - currentAccount: twitterUsername, 1576 - message: 'Fetching latest tweets...', 1577 - backfillMappingId: undefined, 1578 - backfillRequestId: undefined, 1579 - }); 1580 - 1581 - // Use fetchUserTweets with early stopping optimization 1582 - // Increase limit slightly since we have early stopping now 1583 - const tweets = await fetchUserTweets(twitterUsername, 50, processedIds); 1584 - 1585 - if (!tweets || tweets.length === 0) { 1586 - console.log(`[${twitterUsername}] ℹ️ No tweets found (or fetch failed).`); 1587 - continue; 1588 - } 1589 - 1590 - console.log(`[${twitterUsername}] 📥 Fetched ${tweets.length} tweets.`); 1591 - await processTweets(agent, twitterUsername, mapping.bskyIdentifier, tweets, dryRun); 1592 - } catch (err) { 1593 - console.error(`❌ Error checking ${twitterUsername}:`, err); 1594 - } 1595 - } 1596 } 1597 - } catch (err) { 1598 - console.error(`Error processing mapping ${mapping.bskyIdentifier}:`, err); 1599 - } finally { 1600 - activeTasks.delete(mapping.id); 1601 } 1602 - })(); 1603 1604 - activeTasks.set(mapping.id, task); 1605 - return task; // Return task promise for await in main loop 1606 } 1607 1608 import { 1609 - startServer, 1610 - updateLastCheckTime, 1611 - getPendingBackfills, 1612 clearBackfill, 1613 getNextCheckTime, 1614 updateAppStatus, 1615 } from './server.js'; 1616 import type { PendingBackfill } from './server.js'; 1617 - import { AccountMapping } from './config-manager.js'; 1618 1619 async function main(): Promise<void> { 1620 const program = new Command(); ··· 1655 console.error('Twitter credentials not set. Cannot import history.'); 1656 process.exit(1); 1657 } 1658 - const mapping = config.mappings.find(m => m.twitterUsernames.map(u => u.toLowerCase()).includes(options.username.toLowerCase())); 1659 if (!mapping) { 1660 console.error(`No mapping found for ${options.username}`); 1661 process.exit(1); ··· 1675 // Concurrency limit for processing accounts 1676 const runLimit = pLimit(3); 1677 1678 - const findMappingById = (mappings: AccountMapping[], id: string) => 1679 - mappings.find((mapping) => mapping.id === id); 1680 1681 // Main loop 1682 while (true) { ··· 1719 for (const mapping of config.mappings) { 1720 if (!mapping.enabled) continue; 1721 1722 - tasks.push(runLimit(async () => { 1723 - await runAccountTask(mapping, undefined, options.dryRun); 1724 - })); 1725 } 1726 1727 if (tasks.length > 0) {

··· 2 import fs from 'node:fs'; 3 import path from 'node:path'; 4 import { fileURLToPath } from 'node:url'; 5 + import { type BskyAgent, RichText } from '@atproto/api'; 6 import type { BlobRef } from '@atproto/api'; 7 import { Scraper } from '@the-convocation/twitter-scraper'; 8 import type { Tweet as ScraperTweet } from '@the-convocation/twitter-scraper'; 9 import axios from 'axios'; 10 + import * as cheerio from 'cheerio'; 11 import { Command } from 'commander'; 12 import * as francModule from 'franc-min'; 13 import iso6391 from 'iso-639-1'; 14 import puppeteer from 'puppeteer-core'; 15 import sharp from 'sharp'; 16 import { generateAltText } from './ai-manager.js'; 17 ··· 21 const __filename = fileURLToPath(import.meta.url); 22 const __dirname = path.dirname(__filename); 23 24 + // ============================================================================ 25 // Type Definitions 26 + // ============================================================================ 27 28 interface ProcessedTweetEntry { 29 uri?: string; ··· 44 expanded_url?: string; 45 } 46 47 + interface CardImageValue { 48 + url?: string; 49 + width?: number; 50 + height?: number; 51 + alt?: string; 52 + } 53 + 54 + interface CardBindingValue { 55 + type?: string; 56 + string_value?: string; 57 + image_value?: CardImageValue; 58 + } 59 + 60 + interface CardBindingEntry { 61 + key?: string; 62 + value?: CardBindingValue; 63 + } 64 + 65 + type CardBindingValues = Record<string, CardBindingValue> | CardBindingEntry[]; 66 + 67 + interface TweetCard { 68 + name?: string; 69 + binding_values?: CardBindingValues; 70 + url?: string; 71 + } 72 + 73 interface MediaSize { 74 w: number; 75 h: number; ··· 104 sizes?: MediaSizes; 105 original_info?: OriginalInfo; 106 video_info?: VideoInfo; 107 + source?: 'tweet' | 'card'; 108 } 109 110 interface TweetEntities { ··· 132 screen_name?: string; 133 id_str?: string; 134 }; 135 + card?: TweetCard | null; 136 + permanentUrl?: string; 137 } 138 139 interface AspectRatio { ··· 149 150 import { dbService } from './db.js'; 151 152 + // ============================================================================ 153 // State Management 154 + // ============================================================================ 155 156 const PROCESSED_DIR = path.join(__dirname, '..', 'processed'); 157 158 async function migrateJsonToSqlite() { 159 if (!fs.existsSync(PROCESSED_DIR)) return; 160 + 161 + const files = fs.readdirSync(PROCESSED_DIR).filter((f) => f.endsWith('.json')); 162 if (files.length === 0) return; 163 164 console.log(`📦 Found ${files.length} legacy cache files. Migrating to SQLite...`); 165 const config = getConfig(); 166 + 167 for (const file of files) { 168 const username = file.replace('.json', '').toLowerCase(); 169 // Try to find a matching bskyIdentifier from config 170 + const mapping = config.mappings.find((m) => m.twitterUsernames.map((u) => u.toLowerCase()).includes(username)); 171 const bskyIdentifier = mapping?.bskyIdentifier || 'unknown'; 172 173 try { 174 const filePath = path.join(PROCESSED_DIR, file); 175 const data = JSON.parse(fs.readFileSync(filePath, 'utf8')) as ProcessedTweetsMap; 176 + 177 for (const [twitterId, entry] of Object.entries(data)) { 178 dbService.saveTweet({ 179 twitter_id: twitterId, ··· 183 bsky_cid: entry.cid, 184 bsky_root_uri: entry.root?.uri, 185 bsky_root_cid: entry.root?.cid, 186 + status: entry.migrated ? 'migrated' : entry.skipped ? 'skipped' : 'failed', 187 }); 188 } 189 // Move file to backup ··· 201 dbService.repairUnknownIdentifiers(username, mapping.bskyIdentifier); 202 } 203 } 204 + 205 console.log('✅ Migration complete.'); 206 } 207 ··· 209 return dbService.getTweetsByBskyIdentifier(bskyIdentifier); 210 } 211 212 + function saveProcessedTweet( 213 + twitterUsername: string, 214 + bskyIdentifier: string, 215 + twitterId: string, 216 + entry: ProcessedTweetEntry, 217 + ): void { 218 dbService.saveTweet({ 219 twitter_id: twitterId, 220 twitter_username: twitterUsername.toLowerCase(), ··· 226 bsky_root_cid: entry.root?.cid, 227 bsky_tail_uri: entry.tail?.uri, 228 bsky_tail_cid: entry.tail?.cid, 229 + status: entry.migrated || (entry.uri && entry.cid) ? 'migrated' : entry.skipped ? 'skipped' : 'failed', 230 }); 231 } 232 233 + // ============================================================================ 234 // Custom Twitter Client 235 + // ============================================================================ 236 237 let scraper: Scraper | null = null; 238 let currentTwitterCookies = { authToken: '', ct0: '' }; ··· 250 } 251 252 if (!authToken || !ct0) return null; 253 + 254 // Re-initialize if config changed, not yet initialized, or forced reset 255 + if (!scraper || forceReset || currentTwitterCookies.authToken !== authToken || currentTwitterCookies.ct0 !== ct0) { 256 console.log(`🔄 Initializing Twitter scraper with ${useBackupCredentials ? 'BACKUP' : 'PRIMARY'} credentials...`); 257 scraper = new Scraper(); 258 + await scraper.setCookies([`auth_token=${authToken}`, `ct0=${ct0}`]); 259 260 + currentTwitterCookies = { 261 + authToken: authToken, 262 + ct0: ct0, 263 }; 264 } 265 return scraper; ··· 273 await getTwitterScraper(true); 274 return true; 275 } 276 + console.log('⚠️ No backup credentials available to switch to.'); 277 return false; 278 } 279 280 function mapScraperTweetToLocalTweet(scraperTweet: ScraperTweet): Tweet { 281 + const raw = scraperTweet.__raw_UNSTABLE; 282 + if (!raw) { 283 + // Fallback if raw data is missing (shouldn't happen for timeline tweets usually) 284 return { 285 + id: scraperTweet.id, 286 + id_str: scraperTweet.id, 287 + text: scraperTweet.text, 288 + full_text: scraperTweet.text, 289 isRetweet: scraperTweet.isRetweet, 290 + // Construct minimal entities from parsed data 291 + entities: { 292 + urls: scraperTweet.urls.map((url: string) => ({ url, expanded_url: url })), 293 + media: scraperTweet.photos.map((p: any) => ({ 294 + url: p.url, 295 + expanded_url: p.url, 296 + media_url_https: p.url, 297 + type: 'photo', 298 + ext_alt_text: p.alt_text, 299 + })), 300 }, 301 + created_at: scraperTweet.timeParsed?.toUTCString(), 302 + permanentUrl: scraperTweet.permanentUrl, 303 }; 304 + } 305 + 306 + return { 307 + id: raw.id_str, 308 + id_str: raw.id_str, 309 + text: raw.full_text, 310 + full_text: raw.full_text, 311 + created_at: raw.created_at, 312 + isRetweet: scraperTweet.isRetweet, 313 + // biome-ignore lint/suspicious/noExplicitAny: raw types match compatible structure 314 + entities: raw.entities as any, 315 + // biome-ignore lint/suspicious/noExplicitAny: raw types match compatible structure 316 + extended_entities: raw.extended_entities as any, 317 + quoted_status_id_str: raw.quoted_status_id_str, 318 + retweeted_status_id_str: raw.retweeted_status_id_str, 319 + is_quote_status: !!raw.quoted_status_id_str, 320 + in_reply_to_status_id_str: raw.in_reply_to_status_id_str, 321 + // biome-ignore lint/suspicious/noExplicitAny: missing in LegacyTweetRaw type 322 + in_reply_to_user_id_str: (raw as any).in_reply_to_user_id_str, 323 + // biome-ignore lint/suspicious/noExplicitAny: card comes from raw tweet 324 + card: (raw as any).card, 325 + permanentUrl: scraperTweet.permanentUrl, 326 + user: { 327 + screen_name: scraperTweet.username, 328 + id_str: scraperTweet.userId, 329 + }, 330 + }; 331 } 332 333 + // ============================================================================ 334 // Helper Functions 335 + // ============================================================================ 336 + 337 + function normalizeCardBindings(bindingValues?: CardBindingValues): Record<string, CardBindingValue> { 338 + if (!bindingValues) return {}; 339 + if (Array.isArray(bindingValues)) { 340 + return bindingValues.reduce( 341 + (acc, entry) => { 342 + if (entry?.key && entry.value) acc[entry.key] = entry.value; 343 + return acc; 344 + }, 345 + {} as Record<string, CardBindingValue>, 346 + ); 347 + } 348 + return bindingValues as Record<string, CardBindingValue>; 349 + } 350 + 351 + function isLikelyUrl(value?: string): value is string { 352 + if (!value) return false; 353 + return /^https?:\/\//i.test(value); 354 + } 355 + 356 + function extractCardImageUrl(bindingValues: CardBindingValues, preferredKeys: string[]): string | undefined { 357 + const normalized = normalizeCardBindings(bindingValues); 358 + for (const key of preferredKeys) { 359 + const value = normalized[key]; 360 + const imageUrl = value?.image_value?.url; 361 + if (imageUrl) return imageUrl; 362 + } 363 + const fallbackValue = Object.values(normalized).find((value) => value?.image_value?.url); 364 + return fallbackValue?.image_value?.url; 365 + } 366 + 367 + function extractCardLink(bindingValues: CardBindingValues, preferredKeys: string[]): string | undefined { 368 + const normalized = normalizeCardBindings(bindingValues); 369 + for (const key of preferredKeys) { 370 + const value = normalized[key]; 371 + const link = value?.string_value; 372 + if (isLikelyUrl(link)) return link; 373 + } 374 + const fallbackValue = Object.values(normalized).find((value) => isLikelyUrl(value?.string_value)); 375 + return fallbackValue?.string_value; 376 + } 377 + 378 + function extractCardTitle(bindingValues: CardBindingValues, preferredKeys: string[]): string | undefined { 379 + const normalized = normalizeCardBindings(bindingValues); 380 + for (const key of preferredKeys) { 381 + const value = normalized[key]; 382 + const title = value?.string_value; 383 + if (title && !isLikelyUrl(title)) return title; 384 + } 385 + const fallbackValue = Object.values(normalized).find( 386 + (value) => value?.string_value && !isLikelyUrl(value?.string_value), 387 + ); 388 + return fallbackValue?.string_value; 389 + } 390 + 391 + function extractCardAlt(bindingValues: CardBindingValues): string | undefined { 392 + const normalized = normalizeCardBindings(bindingValues); 393 + const altValue = Object.values(normalized).find((value) => value?.image_value?.alt); 394 + return altValue?.image_value?.alt; 395 + } 396 + 397 + function appendCallToAction(text: string, link?: string, label = 'Sponsored') { 398 + if (!link) return text; 399 + if (text.includes(link)) return text; 400 + return `${text}\n\n${label}: ${link}`.trim(); 401 + } 402 + 403 + function detectCardMedia(tweet: Tweet): { imageUrls: string[]; link?: string; title?: string; alt?: string } { 404 + if (!tweet.card?.binding_values) return { imageUrls: [] }; 405 + const bindings = tweet.card.binding_values; 406 + 407 + const imageUrls: string[] = []; 408 + const preferredImageKeys = [ 409 + 'photo_image_full_size', 410 + 'photo_image_full_size_original', 411 + 'thumbnail_image', 412 + 'image', 413 + 'thumbnail', 414 + 'summary_photo_image', 415 + 'player_image', 416 + ]; 417 + const preferredLinkKeys = ['site', 'destination', 'landing_url', 'cta_link', 'card_url', 'url']; 418 + const preferredTitleKeys = ['title', 'summary', 'card_title']; 419 + 420 + const primaryImage = extractCardImageUrl(bindings, preferredImageKeys); 421 + if (primaryImage) imageUrls.push(primaryImage); 422 + 423 + const imageKeys = normalizeCardBindings(bindings); 424 + Object.values(imageKeys).forEach((value) => { 425 + const url = value?.image_value?.url; 426 + if (url && !imageUrls.includes(url)) imageUrls.push(url); 427 + }); 428 + 429 + const link = extractCardLink(bindings, preferredLinkKeys); 430 + const title = extractCardTitle(bindings, preferredTitleKeys); 431 + const alt = extractCardAlt(bindings); 432 + 433 + return { imageUrls, link, title, alt }; 434 + } 435 + 436 + function buildCardMediaEntities(tweet: Tweet): { media: MediaEntity[]; link?: string } { 437 + const cardData = detectCardMedia(tweet); 438 + if (cardData.imageUrls.length === 0) return { media: [] }; 439 + 440 + const media = cardData.imageUrls.slice(0, 4).map((url) => ({ 441 + media_url_https: url, 442 + type: 'photo' as const, 443 + ext_alt_text: cardData.alt || cardData.title || 'Sponsored image', 444 + source: 'card' as const, 445 + })); 446 + 447 + return { media, link: cardData.link }; 448 + } 449 + 450 + function ensureUrlEntity(entities: TweetEntities | undefined, link?: string) { 451 + if (!link) return; 452 + if (!entities) return; 453 + const urls = entities.urls || []; 454 + if (!urls.some((url) => url.expanded_url === link || url.url === link)) { 455 + urls.push({ url: link, expanded_url: link }); 456 + entities.urls = urls; 457 + } 458 + } 459 + 460 + function detectSponsoredCard(tweet: Tweet): boolean { 461 + if (!tweet.card?.binding_values) return false; 462 + const cardName = tweet.card.name?.toLowerCase() || ''; 463 + const cardMedia = detectCardMedia(tweet); 464 + const hasMultipleImages = cardMedia.imageUrls.length > 1; 465 + const promoKeywords = ['promo', 'unified', 'carousel', 'collection', 'amplify']; 466 + const hasPromoName = promoKeywords.some((keyword) => cardName.includes(keyword)); 467 + return hasMultipleImages || hasPromoName; 468 + } 469 + 470 + function mergeMediaEntities(primary: MediaEntity[], secondary: MediaEntity[], limit = 4): MediaEntity[] { 471 + const merged: MediaEntity[] = []; 472 + const seen = new Set<string>(); 473 + const ordered = [ 474 + ...primary.filter((media) => media?.source !== 'card'), 475 + ...primary.filter((media) => media?.source === 'card'), 476 + ...secondary.filter((media) => media?.source !== 'card'), 477 + ...secondary.filter((media) => media?.source === 'card'), 478 + ]; 479 + 480 + for (const media of ordered) { 481 + if (!media?.media_url_https) continue; 482 + if (seen.has(media.media_url_https)) continue; 483 + merged.push(media); 484 + seen.add(media.media_url_https); 485 + if (merged.length >= limit) break; 486 + } 487 + 488 + return merged; 489 + } 490 + 491 + function detectCarouselLinks(tweet: Tweet): string[] { 492 + if (!tweet.card?.binding_values) return []; 493 + const bindings = normalizeCardBindings(tweet.card.binding_values); 494 + const links = Object.values(bindings) 495 + .map((value) => value?.string_value) 496 + .filter((value): value is string => isLikelyUrl(value)); 497 + return [...new Set(links)]; 498 + } 499 + 500 + function mergeUrlEntities(entities: TweetEntities | undefined, links: string[]) { 501 + if (!entities || links.length === 0) return; 502 + const urls = entities.urls || []; 503 + links.forEach((link) => { 504 + if (!urls.some((url) => url.expanded_url === link || url.url === link)) { 505 + urls.push({ url: link, expanded_url: link }); 506 + } 507 + }); 508 + entities.urls = urls; 509 + } 510 + 511 + function injectCardMedia(tweet: Tweet) { 512 + if (!tweet.card?.binding_values) return; 513 + const cardMedia = buildCardMediaEntities(tweet); 514 + if (cardMedia.media.length === 0) return; 515 + 516 + const existingMedia = tweet.extended_entities?.media || tweet.entities?.media || []; 517 + const mergedMedia = mergeMediaEntities(existingMedia, cardMedia.media); 518 + 519 + if (!tweet.extended_entities) tweet.extended_entities = {}; 520 + tweet.extended_entities.media = mergedMedia; 521 + if (!tweet.entities) tweet.entities = {}; 522 + if (!tweet.entities.media) tweet.entities.media = mergedMedia; 523 + 524 + if (cardMedia.link) { 525 + ensureUrlEntity(tweet.entities, cardMedia.link); 526 + } 527 + 528 + const carouselLinks = detectCarouselLinks(tweet); 529 + mergeUrlEntities(tweet.entities, carouselLinks); 530 + } 531 + 532 + function ensureSponsoredLinks(text: string, tweet: Tweet): string { 533 + if (!tweet.card?.binding_values) return text; 534 + const carouselLinks = detectCarouselLinks(tweet); 535 + const cardLink = detectCardMedia(tweet).link; 536 + const links = [...new Set([cardLink, ...carouselLinks].filter(Boolean))] as string[]; 537 + if (links.length === 0) return text; 538 + 539 + const appendedLinks = links.slice(0, 2).map((link) => `Link: ${link}`); 540 + const updatedText = `${text}\n\n${appendedLinks.join('\n')}`.trim(); 541 + return updatedText; 542 + } 543 + 544 + function addTextFallbacks(text: string): string { 545 + return text.replace(/\s+$/g, '').trim(); 546 + } 547 + 548 + async function fetchSyndicationMedia(tweetUrl: string): Promise<{ images: string[] }> { 549 + try { 550 + const normalized = tweetUrl.replace('twitter.com', 'x.com'); 551 + const res = await axios.get('https://publish.twitter.com/oembed', { 552 + params: { url: normalized }, 553 + headers: { 'User-Agent': 'Mozilla/5.0' }, 554 + }); 555 + const html = res.data?.html as string | undefined; 556 + if (!html) return { images: [] }; 557 + 558 + const match = html.match(/status\/(\d+)/); 559 + const tweetId = match?.[1]; 560 + if (!tweetId) return { images: [] }; 561 + 562 + const syndicationUrl = `https://cdn.syndication.twimg.com/tweet-result?id=${tweetId}`; 563 + const syndication = await axios.get(syndicationUrl, { 564 + headers: { 'User-Agent': 'Mozilla/5.0', Accept: 'application/json' }, 565 + }); 566 + const data = syndication.data as Record<string, unknown>; 567 + const images = (data?.photos as { url?: string }[] | undefined) 568 + ?.map((photo) => photo.url) 569 + .filter(Boolean) as string[]; 570 + return { images: images || [] }; 571 + } catch (err) { 572 + return { images: [] }; 573 + } 574 + } 575 + 576 + function injectSyndicationMedia(tweet: Tweet, syndication: { images: string[] }) { 577 + if (syndication.images.length === 0) return; 578 + const media = syndication.images.slice(0, 4).map((url) => ({ 579 + media_url_https: url, 580 + type: 'photo' as const, 581 + ext_alt_text: 'Image from Twitter', 582 + source: 'card' as const, 583 + })); 584 + 585 + const existingMedia = tweet.extended_entities?.media || tweet.entities?.media || []; 586 + const mergedMedia = mergeMediaEntities(existingMedia, media); 587 + 588 + if (!tweet.extended_entities) tweet.extended_entities = {}; 589 + tweet.extended_entities.media = mergedMedia; 590 + if (!tweet.entities) tweet.entities = {}; 591 + if (!tweet.entities.media) tweet.entities.media = mergedMedia; 592 + } 593 594 function detectLanguage(text: string): string[] { 595 if (!text || text.trim().length === 0) return ['en']; ··· 622 return (response.request as any)?.res?.responseUrl || shortUrl; 623 } catch (e: any) { 624 if (e.code === 'ERR_FR_TOO_MANY_REDIRECTS' || e.response?.status === 403 || e.response?.status === 401) { 625 + // Silent fallback for common expansion issues (redirect loops, login walls) 626 + return shortUrl; 627 } 628 return shortUrl; 629 } ··· 659 const isGif = mimeType === 'image/gif'; 660 const isAnimation = isGif || isWebp; 661 662 + if ( 663 + (buffer.length > MAX_SIZE && (mimeType.startsWith('image/') || mimeType === 'application/octet-stream')) || 664 + (isPng && buffer.length > MAX_SIZE) 665 + ) { 666 console.log(`[UPLOAD] ⚖️ Image too large (${(buffer.length / 1024).toFixed(2)} KB). Optimizing...`); 667 try { 668 let image = sharp(buffer); ··· 676 while (currentBuffer.length > MAX_SIZE && attempts < 5) { 677 attempts++; 678 console.log(`[UPLOAD] 📉 Compression attempt ${attempts}: Width ${width}, Quality ${quality}...`); 679 + 680 if (isAnimation) { 681 + // For animations (GIF/WebP), we can only do so much without losing frames 682 + // Try to convert to WebP if it's a GIF, or optimize WebP 683 + image = sharp(buffer, { animated: true }); 684 + if (isGif) { 685 + // Convert GIF to WebP for better compression 686 + image = image.webp({ quality: Math.max(quality, 50), effort: 6 }); 687 + finalMimeType = 'image/webp'; 688 + } else { 689 + image = image.webp({ quality: Math.max(quality, 50), effort: 6 }); 690 + } 691 + // Resize if really big 692 + if (metadata.width && metadata.width > 800) { 693 + image = image.resize({ width: 800, withoutEnlargement: true }); 694 + } 695 } else { 696 + // Static images 697 + if (width > 1600) width = 1600; 698 + else if (attempts > 1) width = Math.floor(width * 0.8); 699 + 700 + quality = Math.max(50, quality - 10); 701 + 702 + image = sharp(buffer).resize({ width, withoutEnlargement: true }).jpeg({ quality, mozjpeg: true }); 703 + 704 + finalMimeType = 'image/jpeg'; 705 } 706 + 707 currentBuffer = await image.toBuffer(); 708 if (currentBuffer.length <= MAX_SIZE) { 709 + finalBuffer = currentBuffer; 710 + console.log(`[UPLOAD] ✅ Optimized to ${(finalBuffer.length / 1024).toFixed(2)} KB`); 711 + break; 712 } 713 } 714 + 715 if (finalBuffer.length > MAX_SIZE) { 716 + console.warn( 717 + `[UPLOAD] ⚠️ Could not compress below limit. Current: ${(finalBuffer.length / 1024).toFixed(2)} KB. Upload might fail.`, 718 + ); 719 } 720 } catch (err) { 721 console.warn(`[UPLOAD] ⚠️ Optimization failed, attempting original upload:`, (err as Error).message); 722 finalBuffer = buffer; ··· 744 'C:\\Program Files (x86)\\Google\\Chrome\\Application\\chrome.exe', 745 ]; 746 747 + const executablePath = browserPaths.find((p) => fs.existsSync(p)); 748 + 749 if (!executablePath) { 750 console.warn(`[SCREENSHOT] ⏩ Skipping screenshot (no Chrome/Chromium found at common paths).`); 751 return null; ··· 789 `; 790 791 await page.setContent(html, { waitUntil: 'networkidle0' }); 792 + 793 // Wait for the twitter iframe to load and render 794 try { 795 await page.waitForSelector('iframe', { timeout: 10000 }); 796 // Small extra wait for images inside iframe 797 + await new Promise((r) => setTimeout(r, 2000)); 798 } catch (e) { 799 console.warn(`[SCREENSHOT] ⚠️ Timeout waiting for tweet iframe, taking screenshot anyway.`); 800 } ··· 804 const box = await element.boundingBox(); 805 const buffer = await element.screenshot({ type: 'png', omitBackground: true }); 806 if (box) { 807 + console.log( 808 + `[SCREENSHOT] ✅ Captured successfully (${(buffer.length / 1024).toFixed(2)} KB) - ${Math.round(box.width)}x${Math.round(box.height)}`, 809 + ); 810 + return { buffer: buffer as Buffer, width: Math.round(box.width), height: Math.round(box.height) }; 811 } 812 } 813 } catch (err) { ··· 825 826 while (!blob) { 827 attempts++; 828 + const statusUrl = new URL('https://video.bsky.app/xrpc/app.bsky.video.getJobStatus'); 829 + statusUrl.searchParams.append('jobId', jobId); 830 831 const statusResponse = await fetch(statusUrl); 832 if (!statusResponse.ok) { ··· 844 if (statusData.jobStatus.blob) { 845 blob = statusData.jobStatus.blob; 846 console.log(`[VIDEO] 🎉 Video processing complete! Blob ref obtained.`); 847 + } else if (state === 'JOB_STATE_FAILED') { 848 + throw new Error(`Video processing failed: ${statusData.jobStatus.error || 'Unknown error'}`); 849 } else { 850 // Wait before next poll 851 await new Promise((resolve) => setTimeout(resolve, 5000)); ··· 853 854 if (attempts > 60) { 855 // ~5 minute timeout 856 + throw new Error('Video processing timed out after 5 minutes.'); 857 } 858 } 859 return blob!; ··· 863 try { 864 const response = await axios.get(url, { 865 headers: { 866 + 'User-Agent': 867 + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36', 868 + Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8', 869 'Accept-Language': 'en-US,en;q=0.9', 870 }, 871 timeout: 10000, 872 maxRedirects: 5, 873 }); 874 + 875 const $ = cheerio.load(response.data); 876 const title = $('meta[property="og:title"]').attr('content') || $('title').text() || ''; 877 + const description = 878 + $('meta[property="og:description"]').attr('content') || $('meta[name="description"]').attr('content') || ''; 879 let thumbBlob: BlobRef | undefined; 880 881 let imageUrl = $('meta[property="og:image"]').attr('content'); 882 if (imageUrl) { 883 + if (!imageUrl.startsWith('http')) { 884 + const baseUrl = new URL(url); 885 + imageUrl = new URL(imageUrl, baseUrl.origin).toString(); 886 + } 887 + try { 888 + const { buffer, mimeType } = await downloadMedia(imageUrl); 889 + thumbBlob = await uploadToBluesky(agent, buffer, mimeType); 890 + } catch (e) { 891 + // SIlently fail thumbnail upload 892 + } 893 } 894 895 if (!title && !description) return null; 896 897 const external: any = { 898 + uri: url, 899 + title: title || url, 900 + description: description, 901 }; 902 903 if (thumbBlob) { 904 + external.thumb = thumbBlob; 905 } 906 907 return { 908 + $type: 'app.bsky.embed.external', 909 + external, 910 }; 911 } catch (err: any) { 912 if (err.code === 'ERR_FR_TOO_MANY_REDIRECTS') { 913 + // Ignore redirect loops 914 + return null; 915 } 916 console.warn(`Failed to fetch embed card for ${url}:`, err.message || err); 917 return null; ··· 919 } 920 921 async function uploadVideoToBluesky(agent: BskyAgent, buffer: Buffer, filename: string): Promise<BlobRef> { 922 + const sanitizedFilename = filename.split('?')[0] || 'video.mp4'; 923 console.log( 924 `[VIDEO] 🟢 Starting upload process for ${sanitizedFilename} (${(buffer.length / 1024 / 1024).toFixed(2)} MB)`, 925 ); ··· 932 933 // didDoc might be present in repoDesc 934 const pdsService = (repoDesc as any).didDoc?.service?.find( 935 + (s: any) => s.id === '#atproto_pds' || s.type === 'AtProtoPds', 936 ); 937 const pdsUrl = pdsService?.serviceEndpoint; 938 + const pdsHost = pdsUrl ? new URL(pdsUrl).host : 'bsky.social'; 939 940 console.log(`[VIDEO] 🌐 PDS Host detected: ${pdsHost}`); 941 console.log(`[VIDEO] 🔑 Requesting service auth token for audience: did:web:${pdsHost}...`); 942 943 const { data: serviceAuth } = await agent.com.atproto.server.getServiceAuth({ 944 aud: `did:web:${pdsHost}`, 945 + lxm: 'com.atproto.repo.uploadBlob', 946 exp: Math.floor(Date.now() / 1000) + 60 * 30, 947 }); 948 console.log(`[VIDEO] ✅ Service auth token obtained.`); ··· 950 const token = serviceAuth.token; 951 952 // 2. Upload to Video Service 953 + const uploadUrl = new URL('https://video.bsky.app/xrpc/app.bsky.video.uploadVideo'); 954 + uploadUrl.searchParams.append('did', agent.session!.did!); 955 + uploadUrl.searchParams.append('name', sanitizedFilename); 956 957 console.log(`[VIDEO] 📤 Uploading to ${uploadUrl.href}...`); 958 const uploadResponse = await fetch(uploadUrl, { 959 + method: 'POST', 960 headers: { 961 Authorization: `Bearer ${token}`, 962 + 'Content-Type': 'video/mp4', 963 }, 964 body: new Blob([new Uint8Array(buffer)]), 965 }); ··· 970 // Handle specific error cases 971 try { 972 const errorJson = JSON.parse(errorText); 973 + 974 // Handle server overload gracefully 975 + if ( 976 + uploadResponse.status === 503 || 977 + errorJson.error === 'Server does not have enough capacity to handle uploads' 978 + ) { 979 + console.warn(`[VIDEO] ⚠️ Server overloaded (503). Skipping video upload and falling back to link.`); 980 + throw new Error('VIDEO_FALLBACK_503'); 981 } 982 983 + if (errorJson.error === 'already_exists' && errorJson.jobId) { 984 console.log(`[VIDEO] ♻️ Video already exists. Resuming with Job ID: ${errorJson.jobId}`); 985 return await pollForVideoProcessing(agent, errorJson.jobId); 986 } 987 + if ( 988 + errorJson.error === 'unconfirmed_email' || 989 + (errorJson.jobStatus && errorJson.jobStatus.error === 'unconfirmed_email') 990 + ) { 991 + console.error( 992 + `[VIDEO] 🛑 BLUESKY ERROR: Your email is unconfirmed. You MUST verify your email on Bluesky to upload videos.`, 993 + ); 994 + throw new Error('Bluesky Email Unconfirmed - Video Upload Rejected'); 995 } 996 } catch (e) { 997 + if ((e as Error).message === 'VIDEO_FALLBACK_503') throw e; 998 + // Not JSON or missing fields, proceed with throwing original error 999 } 1000 + 1001 console.error(`[VIDEO] ❌ Server responded with ${uploadResponse.status}: ${errorText}`); 1002 throw new Error(`Video upload failed: ${uploadResponse.status} ${errorText}`); 1003 } ··· 1040 // 4. Force split 1041 1042 let splitIndex = -1; 1043 + 1044 // Check paragraphs 1045 let checkIndex = remaining.lastIndexOf('\n\n', effectiveLimit); 1046 if (checkIndex !== -1) splitIndex = checkIndex; 1047 1048 // Check sentences 1049 if (splitIndex === -1) { 1050 + // Look for punctuation followed by space 1051 + const sentenceMatches = Array.from(remaining.substring(0, effectiveLimit).matchAll(/[.!?]\s/g)); 1052 + if (sentenceMatches.length > 0) { 1053 + const lastMatch = sentenceMatches[sentenceMatches.length - 1]; 1054 + if (lastMatch && lastMatch.index !== undefined) { 1055 + splitIndex = lastMatch.index + 1; // Include punctuation 1056 } 1057 + } 1058 } 1059 1060 // Check spaces 1061 if (splitIndex === -1) { 1062 + checkIndex = remaining.lastIndexOf(' ', effectiveLimit); 1063 + if (checkIndex !== -1) splitIndex = checkIndex; 1064 } 1065 1066 // Force split if no good break point found 1067 if (splitIndex === -1) { 1068 + splitIndex = effectiveLimit; 1069 } 1070 1071 chunks.push(remaining.substring(0, splitIndex).trim()); ··· 1114 async function fetchUserTweets(username: string, limit: number, processedIds?: Set<string>): Promise<Tweet[]> { 1115 const client = await getTwitterScraper(); 1116 if (!client) return []; 1117 + 1118 let retries = 3; 1119 while (retries > 0) { 1120 try { 1121 const tweets: Tweet[] = []; 1122 const generator = client.getTweets(username, limit); 1123 let consecutiveProcessedCount = 0; 1124 + 1125 for await (const t of generator) { 1126 const tweet = mapScraperTweetToLocalTweet(t); 1127 const tweetId = tweet.id_str || tweet.id; 1128 + 1129 // Early stopping logic: if we see 3 consecutive tweets we've already processed, stop. 1130 // This assumes timeline order (mostly true). 1131 if (processedIds && tweetId && processedIds.has(tweetId)) { 1132 + consecutiveProcessedCount++; 1133 + if (consecutiveProcessedCount >= 3) { 1134 + console.log(`[${username}] 🛑 Found 3 consecutive processed tweets. Stopping fetch early.`); 1135 + break; 1136 + } 1137 } else { 1138 + consecutiveProcessedCount = 0; 1139 } 1140 1141 tweets.push(tweet); ··· 1144 return tweets; 1145 } catch (e: any) { 1146 retries--; 1147 + const isRetryable = 1148 + e.message?.includes('ServiceUnavailable') || 1149 + e.message?.includes('Timeout') || 1150 + e.message?.includes('429') || 1151 + e.message?.includes('401'); 1152 + 1153 // Check for Twitter Internal Server Error (often returns 400 with specific body) 1154 if (e?.response?.status === 400 && JSON.stringify(e?.response?.data || {}).includes('InternalServerError')) { 1155 + console.warn(`⚠️ Twitter Internal Server Error (Transient) for ${username}.`); 1156 + // Treat as retryable 1157 + if (retries > 0) { 1158 + await new Promise((r) => setTimeout(r, 5000)); 1159 + continue; 1160 + } 1161 } 1162 1163 if (isRetryable) { 1164 console.warn(`⚠️ Error fetching tweets for ${username} (${e.message}).`); 1165 + 1166 // Attempt credential switch if we have backups 1167 if (await switchCredentials()) { 1168 + console.log(`🔄 Retrying with new credentials...`); 1169 + continue; // Retry loop with new credentials 1170 } 1171 1172 if (retries > 0) { 1173 + console.log(`Waiting 5s before retry...`); 1174 + await new Promise((r) => setTimeout(r, 5000)); 1175 + continue; 1176 } 1177 } 1178 + 1179 console.warn(`Error fetching tweets for ${username}:`, e.message || e); 1180 return []; 1181 } 1182 } 1183 + 1184 console.log(`[${username}] ⚠️ Scraper returned 0 tweets (or failed silently) after retries.`); 1185 return []; 1186 } 1187 1188 + // ============================================================================ 1189 // Main Processing Logic 1190 + // ============================================================================ 1191 1192 + // ============================================================================ 1193 // Main Processing Logic 1194 + // ============================================================================ 1195 1196 async function processTweets( 1197 agent: BskyAgent, ··· 1213 }); 1214 1215 const processedTweets = loadProcessedTweets(bskyIdentifier); 1216 + 1217 // Maintain a local map that updates in real-time for intra-batch replies 1218 const localProcessedMap: ProcessedTweetsMap = { ...processedTweets }; 1219 ··· 1240 if (isRetweet) { 1241 console.log(`[${twitterUsername}] ⏩ Skipping retweet ${tweetId}.`); 1242 if (!dryRun) { 1243 + // Save as skipped so we don't check it again 1244 + saveProcessedTweet(twitterUsername, bskyIdentifier, tweetId, { skipped: true, text: tweet.text }); 1245 + localProcessedMap[tweetId] = { skipped: true, text: tweet.text }; 1246 } 1247 continue; 1248 } ··· 1271 // Parent missing from local batch/DB. Attempt to fetch it if it's a self-thread. 1272 // We assume it's a self-thread if we don't have it, but we'll verify author after fetch. 1273 console.log(`[${twitterUsername}] 🕵️ Parent ${replyStatusId} missing. Checking if backfillable...`); 1274 + 1275 let parentBackfilled = false; 1276 try { 1277 + const scraper = await getTwitterScraper(); 1278 + if (scraper) { 1279 + const parentRaw = await scraper.getTweet(replyStatusId); 1280 + if (parentRaw) { 1281 + const parentTweet = mapScraperTweetToLocalTweet(parentRaw); 1282 + const parentAuthor = parentTweet.user?.screen_name; 1283 + 1284 + if (parentAuthor?.toLowerCase() === twitterUsername.toLowerCase()) { 1285 + console.log(`[${twitterUsername}] 🔄 Parent is ours (@${parentAuthor}). Backfilling parent first...`); 1286 + // Recursively process the parent 1287 + await processTweets(agent, twitterUsername, bskyIdentifier, [parentTweet], dryRun); 1288 + 1289 + // Check if it was saved 1290 + const savedParent = dbService.getTweet(replyStatusId, bskyIdentifier); 1291 + if (savedParent && savedParent.status === 'migrated') { 1292 + // Update local map 1293 + localProcessedMap[replyStatusId] = { 1294 + uri: savedParent.bsky_uri, 1295 + cid: savedParent.bsky_cid, 1296 + root: 1297 + savedParent.bsky_root_uri && savedParent.bsky_root_cid 1298 + ? { uri: savedParent.bsky_root_uri, cid: savedParent.bsky_root_cid } 1299 + : undefined, 1300 + tail: 1301 + savedParent.bsky_tail_uri && savedParent.bsky_tail_cid 1302 + ? { uri: savedParent.bsky_tail_uri, cid: savedParent.bsky_tail_cid } 1303 + : undefined, 1304 + migrated: true, 1305 + }; 1306 + replyParentInfo = localProcessedMap[replyStatusId] ?? null; 1307 + parentBackfilled = true; 1308 + console.log(`[${twitterUsername}] ✅ Parent backfilled. Resuming thread.`); 1309 } 1310 + } else { 1311 + console.log(`[${twitterUsername}] ⏩ Parent is by @${parentAuthor}. Skipping external reply.`); 1312 + } 1313 } 1314 + } 1315 } catch (e) { 1316 + console.warn(`[${twitterUsername}] ⚠️ Failed to fetch/backfill parent ${replyStatusId}:`, e); 1317 } 1318 1319 if (!parentBackfilled) { 1320 + console.log(`[${twitterUsername}] ⏩ Skipping external/unknown reply (Parent not found or external).`); 1321 + if (!dryRun) { 1322 + saveProcessedTweet(twitterUsername, bskyIdentifier, tweetId, { skipped: true, text: tweetText }); 1323 + localProcessedMap[tweetId] = { skipped: true, text: tweetText }; 1324 + } 1325 + continue; 1326 } 1327 } else { 1328 console.log(`[${twitterUsername}] ⏩ Skipping external/unknown reply.`); ··· 1335 } 1336 1337 // Removed early dryRun continue to allow verifying logic 1338 + 1339 let text = tweetText 1340 .replace(/&/g, '&') 1341 .replace(/</g, '<') 1342 .replace(/>/g, '>') 1343 .replace(/"/g, '"') 1344 .replace(/'/g, "'"); 1345 + 1346 // 1. Link Expansion 1347 console.log(`[${twitterUsername}] 🔗 Expanding links...`); 1348 const urls = tweet.entities?.urls || []; ··· 1357 const matches = text.match(tcoRegex) || []; 1358 for (const tco of matches) { 1359 // Avoid re-resolving if we already handled it via entities 1360 + if (urls.some((u) => u.url === tco)) continue; 1361 1362 console.log(`[${twitterUsername}] 🔍 Resolving fallback link: ${tco}`); 1363 const resolved = await expandUrl(tco); 1364 if (resolved !== tco) { 1365 + text = text.replace(tco, resolved); 1366 + // Add to urls array so it can be used for card embedding later 1367 + urls.push({ url: tco, expanded_url: resolved }); 1368 + } 1369 + } 1370 + 1371 + const isSponsoredCard = detectSponsoredCard(tweet); 1372 + if (isSponsoredCard) { 1373 + console.log(`[${twitterUsername}] 🧩 Sponsored/card payload detected. Extracting carousel media...`); 1374 + injectCardMedia(tweet); 1375 + } else if (tweet.permanentUrl) { 1376 + const syndication = await fetchSyndicationMedia(tweet.permanentUrl); 1377 + if (syndication.images.length > 0) { 1378 + console.log(`[${twitterUsername}] 🧩 Syndication carousel detected. Extracting media...`); 1379 + injectSyndicationMedia(tweet, syndication); 1380 } 1381 } 1382 ··· 1394 mediaLinksToRemove.push(media.url); 1395 if (media.expanded_url) mediaLinksToRemove.push(media.expanded_url); 1396 } 1397 + if (media.source === 'card' && media.media_url_https) { 1398 + mediaLinksToRemove.push(media.media_url_https); 1399 + } 1400 + 1401 let aspectRatio: AspectRatio | undefined; 1402 if (media.sizes?.large) { 1403 aspectRatio = { width: media.sizes.large.w, height: media.sizes.large.h }; ··· 1413 console.log(`[${twitterUsername}] 📥 Downloading image (high quality): ${path.basename(highQualityUrl)}`); 1414 updateAppStatus({ message: `Downloading high quality image...` }); 1415 const { buffer, mimeType } = await downloadMedia(highQualityUrl); 1416 + 1417 let blob: BlobRef; 1418 if (dryRun) { 1419 + console.log( 1420 + `[${twitterUsername}] 🧪 [DRY RUN] Would upload image (${(buffer.length / 1024).toFixed(2)} KB)`, 1421 + ); 1422 + blob = { ref: { toString: () => 'mock-blob' }, mimeType, size: buffer.length } as any; 1423 } else { 1424 + console.log(`[${twitterUsername}] 📤 Uploading image to Bluesky...`); 1425 + updateAppStatus({ message: `Uploading image to Bluesky...` }); 1426 + blob = await uploadToBluesky(agent, buffer, mimeType); 1427 } 1428 + 1429 let altText = media.ext_alt_text; 1430 if (!altText) { 1431 + console.log(`[${twitterUsername}] 🤖 Generating alt text via Gemini...`); 1432 + // Use original tweet text for context, not the modified/cleaned one 1433 + altText = await generateAltText(buffer, mimeType, tweetText); 1434 + if (altText) console.log(`[${twitterUsername}] ✅ Alt text generated: ${altText.substring(0, 50)}...`); 1435 } 1436 1437 images.push({ alt: altText || 'Image from Twitter', image: blob, aspectRatio }); ··· 1452 } else if (media.type === 'video' || media.type === 'animated_gif') { 1453 const variants = media.video_info?.variants || []; 1454 const duration = media.video_info?.duration_millis || 0; 1455 + 1456 + if (duration > 180000) { 1457 + // 3 minutes 1458 + console.warn(`[${twitterUsername}] ⚠️ Video too long (${(duration / 1000).toFixed(1)}s). Fallback to link.`); 1459 + const tweetUrl = `https://twitter.com/${twitterUsername}/status/${tweetId}`; 1460 + if (!text.includes(tweetUrl)) text += `\n\nVideo: ${tweetUrl}`; 1461 + continue; 1462 } 1463 1464 const mp4s = variants ··· 1473 console.log(`[${twitterUsername}] 📥 Downloading video: ${videoUrl}`); 1474 updateAppStatus({ message: `Downloading video: ${path.basename(videoUrl)}` }); 1475 const { buffer, mimeType } = await downloadMedia(videoUrl); 1476 + 1477 if (buffer.length <= 90 * 1024 * 1024) { 1478 const filename = videoUrl.split('/').pop() || 'video.mp4'; 1479 if (dryRun) { 1480 + console.log( 1481 + `[${twitterUsername}] 🧪 [DRY RUN] Would upload video: ${filename} (${(buffer.length / 1024 / 1024).toFixed(2)} MB)`, 1482 + ); 1483 + videoBlob = { 1484 + ref: { toString: () => 'mock-video-blob' }, 1485 + mimeType: 'video/mp4', 1486 + size: buffer.length, 1487 + } as any; 1488 } else { 1489 + updateAppStatus({ message: `Uploading video to Bluesky...` }); 1490 + videoBlob = await uploadVideoToBluesky(agent, buffer, filename); 1491 } 1492 videoAspectRatio = aspectRatio; 1493 console.log(`[${twitterUsername}] ✅ Video upload process complete.`); 1494 break; // Prioritize first video 1495 } 1496 + 1497 + console.warn( 1498 + `[${twitterUsername}] ⚠️ Video too large (${(buffer.length / 1024 / 1024).toFixed(2)}MB). Fallback to link.`, 1499 + ); 1500 const tweetUrl = `https://twitter.com/${twitterUsername}/status/${tweetId}`; 1501 if (!text.includes(tweetUrl)) text += `\n\nVideo: ${tweetUrl}`; 1502 } catch (err) { 1503 const errMsg = (err as Error).message; 1504 + if (errMsg !== 'VIDEO_FALLBACK_503') { 1505 + console.error(`[${twitterUsername}] ❌ Failed video upload flow:`, errMsg); 1506 } 1507 const tweetUrl = `https://twitter.com/${twitterUsername}/status/${tweetId}`; 1508 if (!text.includes(tweetUrl)) text += `\n\nVideo: ${tweetUrl}`; ··· 1514 1515 // Cleanup text 1516 for (const link of mediaLinksToRemove) text = text.split(link).join('').trim(); 1517 + if (isSponsoredCard) { 1518 + const cardLinks = detectCarouselLinks(tweet); 1519 + const cardPrimaryLink = detectCardMedia(tweet).link; 1520 + const requestedLinks = [cardPrimaryLink, ...cardLinks].filter(Boolean) as string[]; 1521 + requestedLinks.forEach((link) => { 1522 + if (!urls.some((u) => u.expanded_url === link || u.url === link)) { 1523 + urls.push({ url: link, expanded_url: link }); 1524 + } 1525 + }); 1526 + } 1527 text = text.replace(/\n\s*\n/g, '\n\n').trim(); 1528 + text = addTextFallbacks(text); 1529 1530 // 3. Quoting Logic 1531 let quoteEmbed: { $type: string; record: { uri: string; cid: string } } | null = null; ··· 1541 } else { 1542 const quoteUrlEntity = urls.find((u) => u.expanded_url?.includes(quoteId)); 1543 const qUrl = quoteUrlEntity?.expanded_url || `https://twitter.com/i/status/${quoteId}`; 1544 + 1545 // Check if it's a self-quote (same user) 1546 + const isSelfQuote = 1547 + qUrl.toLowerCase().includes(`twitter.com/${twitterUsername.toLowerCase()}/`) || 1548 + qUrl.toLowerCase().includes(`x.com/${twitterUsername.toLowerCase()}/`); 1549 + 1550 if (!isSelfQuote) { 1551 externalQuoteUrl = qUrl; 1552 console.log(`[${twitterUsername}] 🔗 Quoted tweet is external: ${externalQuoteUrl}`); 1553 + 1554 // Try to capture screenshot for external QTs if we have space for images 1555 if (images.length < 4 && !videoBlob) { 1556 const ssResult = await captureTweetScreenshot(externalQuoteUrl); ··· 1558 try { 1559 let blob: BlobRef; 1560 if (dryRun) { 1561 + console.log( 1562 + `[${twitterUsername}] 🧪 [DRY RUN] Would upload screenshot for quote (${(ssResult.buffer.length / 1024).toFixed(2)} KB)`, 1563 + ); 1564 + blob = { 1565 + ref: { toString: () => 'mock-ss-blob' }, 1566 + mimeType: 'image/png', 1567 + size: ssResult.buffer.length, 1568 + } as any; 1569 } else { 1570 + blob = await uploadToBluesky(agent, ssResult.buffer, 'image/png'); 1571 } 1572 + images.push({ 1573 + alt: `Quote Tweet: ${externalQuoteUrl}`, 1574 + image: blob, 1575 + aspectRatio: { width: ssResult.width, height: ssResult.height }, 1576 }); 1577 } catch (e) { 1578 console.warn(`[${twitterUsername}] ⚠️ Failed to upload screenshot blob.`); ··· 1583 console.log(`[${twitterUsername}] 🔁 Quoted tweet is a self-quote, skipping link.`); 1584 } 1585 } 1586 + } else if ((images.length === 0 && !videoBlob) || isSponsoredCard) { 1587 + // If no media and no quote, check for external links to embed 1588 + // We prioritize the LAST link found as it's often the main content 1589 + const potentialLinks = urls 1590 + .map((u) => u.expanded_url) 1591 + .filter((u) => u && !u.includes('twitter.com') && !u.includes('x.com')) as string[]; 1592 1593 + if (potentialLinks.length > 0) { 1594 + const linkToEmbed = potentialLinks[potentialLinks.length - 1]; 1595 + if (linkToEmbed) { 1596 + // Optimization: If text is too long, but removing the link makes it fit, do it! 1597 + // The link will be present in the embed card anyway. 1598 + if (text.length > 300 && text.includes(linkToEmbed)) { 1599 + const lengthWithoutLink = text.length - linkToEmbed.length; 1600 + // Allow some buffer (e.g. whitespace cleanup might save 1-2 chars) 1601 + if (lengthWithoutLink <= 300) { 1602 + console.log( 1603 + `[${twitterUsername}] 📏 Optimizing: Removing link ${linkToEmbed} from text to avoid threading (Card will embed it).`, 1604 + ); 1605 + text = text.replace(linkToEmbed, '').trim(); 1606 + // Clean up potential double punctuation/spaces left behind 1607 + text = text.replace(/\s\.$/, '.').replace(/\s\s+/g, ' '); 1608 } 1609 + } 1610 + 1611 + console.log(`[${twitterUsername}] 🃏 Fetching link card for: ${linkToEmbed}`); 1612 + linkCard = await fetchEmbedUrlCard(agent, linkToEmbed); 1613 } 1614 + } 1615 } 1616 1617 // Only append link for external quotes IF we couldn't natively embed it OR screenshot it 1618 + const hasScreenshot = images.some((img) => img.alt.startsWith('Quote Tweet:')); 1619 if (externalQuoteUrl && !quoteEmbed && !hasScreenshot && !text.includes(externalQuoteUrl)) { 1620 text += `\n\nQT: ${externalQuoteUrl}`; 1621 } 1622 1623 + if (isSponsoredCard) { 1624 + const hasCardImages = mediaEntities.some((media) => media.source === 'card'); 1625 + if (hasCardImages) { 1626 + text = ensureSponsoredLinks(text, tweet); 1627 + } 1628 + } 1629 + 1630 // 4. Threading and Posting 1631 const chunks = splitText(text); 1632 console.log(`[${twitterUsername}] 📝 Splitting text into ${chunks.length} chunks.`); 1633 + 1634 let lastPostInfo: ProcessedTweetEntry | null = replyParentInfo; 1635 1636 // We will save the first chunk as the "Root" of this tweet, and the last chunk as the "Tail". ··· 1639 1640 for (let i = 0; i < chunks.length; i++) { 1641 let chunk = chunks[i] as string; 1642 + 1643 // Add (i/n) if split 1644 if (chunks.length > 1) { 1645 + chunk += ` (${i + 1}/${chunks.length})`; 1646 } 1647 1648 console.log(`[${twitterUsername}] 📤 Posting chunk ${i + 1}/${chunks.length}...`); 1649 updateAppStatus({ message: `Posting chunk ${i + 1}/${chunks.length}...` }); 1650 + 1651 const rt = new RichText({ text: chunk }); 1652 await rt.detectFacets(agent); 1653 const detectedLangs = detectLanguage(chunk); ··· 1694 let rootRef: { uri: string; cid: string } | null = null; 1695 1696 if (lastPostInfo?.uri && lastPostInfo?.cid) { 1697 + // If this is the start of a new tweet (i=0), check if parent has a tail 1698 + if (i === 0 && lastPostInfo.tail) { 1699 + parentRef = lastPostInfo.tail; 1700 + } else { 1701 + // Otherwise (intra-tweet or parent has no tail), use the main uri/cid (which is the previous post/chunk) 1702 + parentRef = { uri: lastPostInfo.uri, cid: lastPostInfo.cid }; 1703 + } 1704 + 1705 + rootRef = lastPostInfo.root || { uri: lastPostInfo.uri, cid: lastPostInfo.cid }; 1706 } 1707 1708 if (parentRef && rootRef) { ··· 1716 // Retry logic for network/socket errors 1717 let response: any; 1718 let retries = 3; 1719 + 1720 if (dryRun) { 1721 + console.log(`[${twitterUsername}] 🧪 [DRY RUN] Would post chunk ${i + 1}/${chunks.length}`); 1722 + if (postRecord.embed) console.log(` - With embed: ${postRecord.embed.$type}`); 1723 + if (postRecord.reply) console.log(` - As reply to: ${postRecord.reply.parent.uri}`); 1724 + response = { uri: 'at://did:plc:mock/app.bsky.feed.post/mock', cid: 'mock-cid' }; 1725 } else { 1726 + while (retries > 0) { 1727 + try { 1728 + response = await agent.post(postRecord); 1729 + break; 1730 + } catch (err: any) { 1731 + retries--; 1732 + if (retries === 0) throw err; 1733 + console.warn( 1734 + `[${twitterUsername}] ⚠️ Post failed (Socket/Network), retrying in 5s... (${retries} retries left)`, 1735 + ); 1736 + await new Promise((r) => setTimeout(r, 5000)); 1737 } 1738 + } 1739 } 1740 + 1741 const currentPostInfo = { 1742 + uri: response.uri, 1743 + cid: response.cid, 1744 + root: postRecord.reply ? postRecord.reply.root : { uri: response.uri, cid: response.cid }, 1745 + // Text is just the current chunk text 1746 + text: chunk, 1747 }; 1748 + 1749 if (i === 0) firstChunkInfo = currentPostInfo; 1750 lastChunkInfo = currentPostInfo; 1751 lastPostInfo = currentPostInfo; // Update for next iteration 1752 1753 console.log(`[${twitterUsername}] ✅ Chunk ${i + 1} posted successfully.`); 1754 + 1755 if (chunks.length > 1) { 1756 await new Promise((r) => setTimeout(r, 3000)); 1757 } ··· 1760 break; 1761 } 1762 } 1763 + 1764 // Save to DB and Map 1765 if (firstChunkInfo && lastChunkInfo) { 1766 + const entry: ProcessedTweetEntry = { 1767 + uri: firstChunkInfo.uri, 1768 + cid: firstChunkInfo.cid, 1769 + root: firstChunkInfo.root, 1770 + tail: { uri: lastChunkInfo.uri, cid: lastChunkInfo.cid }, // Save tail! 1771 + text: tweetText, 1772 + }; 1773 + 1774 + if (!dryRun) { 1775 + saveProcessedTweet(twitterUsername, bskyIdentifier, tweetId, entry); 1776 + localProcessedMap[tweetId] = entry; // Update local map for subsequent replies in this batch 1777 + } 1778 } 1779 + 1780 // Add a random delay between 5s and 15s to be more human-like 1781 const wait = Math.floor(Math.random() * 10000) + 5000; 1782 console.log(`[${twitterUsername}] 😴 Pacing: Waiting ${wait / 1000}s before next tweet.`); ··· 1796 requestId?: string, 1797 ): Promise<void> { 1798 const config = getConfig(); 1799 + const mapping = config.mappings.find((m) => 1800 + m.twitterUsernames.map((u) => u.toLowerCase()).includes(twitterUsername.toLowerCase()), 1801 + ); 1802 if (!mapping) { 1803 console.error(`No mapping found for twitter username: ${twitterUsername}`); 1804 return; ··· 1806 1807 let agent = await getAgent(mapping); 1808 if (!agent) { 1809 + if (dryRun) { 1810 + console.log('⚠️ Could not login to Bluesky, but proceeding with MOCK AGENT for Dry Run.'); 1811 + // biome-ignore lint/suspicious/noExplicitAny: mock agent 1812 + agent = { 1813 + post: async (record: any) => ({ uri: 'at://did:plc:mock/app.bsky.feed.post/mock', cid: 'mock-cid' }), 1814 + uploadBlob: async (data: any) => ({ data: { blob: { ref: { toString: () => 'mock-blob' } } } }), 1815 + // Add other necessary methods if they are called outside of the already mocked dryRun blocks 1816 + // But since we mocked the calls inside processTweets for dryRun, we just need the object to exist. 1817 + session: { did: 'did:plc:mock' }, 1818 + com: { atproto: { repo: { describeRepo: async () => ({ data: {} }) } } }, 1819 + } as any; 1820 + } else { 1821 + return; 1822 + } 1823 } 1824 1825 console.log(`Starting full history import for ${twitterUsername} -> ${mapping.bskyIdentifier}...`); ··· 1830 1831 console.log(`Fetching tweets for ${twitterUsername}...`); 1832 updateAppStatus({ message: `Fetching tweets...` }); 1833 + 1834 const client = await getTwitterScraper(); 1835 if (client) { 1836 + try { 1837 + // Use getTweets which reliably fetches user timeline 1838 + // limit defaults to 15 in function signature, but for history import we might want more. 1839 + // However, the generator will fetch as much as we ask. 1840 + const fetchLimit = limit || 100; 1841 + const generator = client.getTweets(twitterUsername, fetchLimit); 1842 1843 + for await (const scraperTweet of generator) { 1844 + if (!ignoreCancellation) { 1845 + const stillPending = getPendingBackfills().some( 1846 + (b) => b.id === mapping.id && (!requestId || b.requestId === requestId), 1847 + ); 1848 + if (!stillPending) { 1849 + console.log(`[${twitterUsername}] 🛑 Backfill cancelled.`); 1850 + break; 1851 } 1852 + } 1853 + 1854 + const t = mapScraperTweetToLocalTweet(scraperTweet); 1855 + const tid = t.id_str || t.id; 1856 + if (!tid) continue; 1857 + 1858 + if (!processedTweets[tid] && !seenIds.has(tid)) { 1859 + allFoundTweets.push(t); 1860 + seenIds.add(tid); 1861 + } 1862 + 1863 + if (allFoundTweets.length >= fetchLimit) break; 1864 } 1865 + } catch (e) { 1866 + console.warn('Error during history fetch:', e); 1867 + } 1868 } 1869 1870 console.log(`Fetch complete. Found ${allFoundTweets.length} new tweets to import.`); ··· 1878 const activeTasks = new Map<string, Promise<void>>(); 1879 1880 async function runAccountTask(mapping: AccountMapping, backfillRequest?: PendingBackfill, dryRun = false) { 1881 + if (activeTasks.has(mapping.id)) return; // Already running 1882 1883 + const task = (async () => { 1884 + try { 1885 + const agent = await getAgent(mapping); 1886 + if (!agent) return; 1887 1888 + const backfillReq = backfillRequest ?? getPendingBackfills().find((b) => b.id === mapping.id); 1889 + 1890 + if (backfillReq) { 1891 + const limit = backfillReq.limit || 15; 1892 + console.log( 1893 + `[${mapping.bskyIdentifier}] Running backfill for ${mapping.twitterUsernames.length} accounts (limit ${limit})...`, 1894 + ); 1895 + updateAppStatus({ 1896 + state: 'backfilling', 1897 + currentAccount: mapping.twitterUsernames[0], 1898 + message: `Starting backfill (limit ${limit})...`, 1899 + backfillMappingId: mapping.id, 1900 + backfillRequestId: backfillReq.requestId, 1901 + }); 1902 + 1903 + for (const twitterUsername of mapping.twitterUsernames) { 1904 + const stillPending = getPendingBackfills().some( 1905 + (b) => b.id === mapping.id && b.requestId === backfillReq.requestId, 1906 + ); 1907 + if (!stillPending) { 1908 + console.log(`[${mapping.bskyIdentifier}] 🛑 Backfill request replaced; stopping.`); 1909 + break; 1910 + } 1911 + 1912 + try { 1913 + updateAppStatus({ 1914 + state: 'backfilling', 1915 + currentAccount: twitterUsername, 1916 + message: `Starting backfill (limit ${limit})...`, 1917 + backfillMappingId: mapping.id, 1918 + backfillRequestId: backfillReq.requestId, 1919 + }); 1920 + await importHistory(twitterUsername, mapping.bskyIdentifier, limit, dryRun, false, backfillReq.requestId); 1921 + } catch (err) { 1922 + console.error(`❌ Error backfilling ${twitterUsername}:`, err); 1923 + } 1924 + } 1925 + clearBackfill(mapping.id, backfillReq.requestId); 1926 + updateAppStatus({ 1927 + state: 'idle', 1928 + message: `Backfill complete for ${mapping.bskyIdentifier}`, 1929 + backfillMappingId: undefined, 1930 + backfillRequestId: undefined, 1931 + }); 1932 + console.log(`[${mapping.bskyIdentifier}] Backfill complete.`); 1933 + } else { 1934 + updateAppStatus({ backfillMappingId: undefined, backfillRequestId: undefined }); 1935 1936 + // Pre-load processed IDs for optimization 1937 + const processedMap = loadProcessedTweets(mapping.bskyIdentifier); 1938 + const processedIds = new Set(Object.keys(processedMap)); 1939 + 1940 + for (const twitterUsername of mapping.twitterUsernames) { 1941 + try { 1942 + console.log(`[${twitterUsername}] 🏁 Starting check for new tweets...`); 1943 + updateAppStatus({ 1944 + state: 'checking', 1945 + currentAccount: twitterUsername, 1946 + message: 'Fetching latest tweets...', 1947 + backfillMappingId: undefined, 1948 + backfillRequestId: undefined, 1949 + }); 1950 1951 + // Use fetchUserTweets with early stopping optimization 1952 + // Increase limit slightly since we have early stopping now 1953 + const tweets = await fetchUserTweets(twitterUsername, 50, processedIds); 1954 1955 + if (!tweets || tweets.length === 0) { 1956 + console.log(`[${twitterUsername}] ℹ️ No tweets found (or fetch failed).`); 1957 + continue; 1958 } 1959 + 1960 + console.log(`[${twitterUsername}] 📥 Fetched ${tweets.length} tweets.`); 1961 + await processTweets(agent, twitterUsername, mapping.bskyIdentifier, tweets, dryRun); 1962 + } catch (err) { 1963 + console.error(`❌ Error checking ${twitterUsername}:`, err); 1964 + } 1965 } 1966 + } 1967 + } catch (err) { 1968 + console.error(`Error processing mapping ${mapping.bskyIdentifier}:`, err); 1969 + } finally { 1970 + activeTasks.delete(mapping.id); 1971 + } 1972 + })(); 1973 1974 + activeTasks.set(mapping.id, task); 1975 + return task; // Return task promise for await in main loop 1976 } 1977 1978 + import type { AccountMapping } from './config-manager.js'; 1979 import { 1980 clearBackfill, 1981 getNextCheckTime, 1982 + getPendingBackfills, 1983 + startServer, 1984 updateAppStatus, 1985 + updateLastCheckTime, 1986 } from './server.js'; 1987 import type { PendingBackfill } from './server.js'; 1988 1989 async function main(): Promise<void> { 1990 const program = new Command(); ··· 2025 console.error('Twitter credentials not set. Cannot import history.'); 2026 process.exit(1); 2027 } 2028 + const mapping = config.mappings.find((m) => 2029 + m.twitterUsernames.map((u) => u.toLowerCase()).includes(options.username.toLowerCase()), 2030 + ); 2031 if (!mapping) { 2032 console.error(`No mapping found for ${options.username}`); 2033 process.exit(1); ··· 2047 // Concurrency limit for processing accounts 2048 const runLimit = pLimit(3); 2049 2050 + const findMappingById = (mappings: AccountMapping[], id: string) => mappings.find((mapping) => mapping.id === id); 2051 2052 // Main loop 2053 while (true) { ··· 2090 for (const mapping of config.mappings) { 2091 if (!mapping.enabled) continue; 2092 2093 + tasks.push( 2094 + runLimit(async () => { 2095 + await runAccountTask(mapping, undefined, options.dryRun); 2096 + }), 2097 + ); 2098 } 2099 2100 if (tasks.length > 0) {