scraping atproto for the most followed accounts
at main 184 lines 5.0 kB view raw
import { Client, simpleFetchHandler } from "@atcute/client";
import { AppBskyActorDefs } from "@atcute/bluesky";
import type { Did } from "@atcute/lexicons";
import fs from "fs";

/** A repo DID paired with the PDS base URL it was listed on. */
interface didWithPds {
  did: Did;
  pds: string;
}

/** A detailed profile view annotated with the PDS the account was found on. */
interface profileWithPds extends AppBskyActorDefs.ProfileViewDetailed {
  pds: string;
}

/** Per-PDS metadata, as read from the `pdses` object in `data/data.json`. */
interface Pds {
  inviteCodeRequired?: boolean;
  version?: string;
  /** When set, the PDS is treated as failing and is skipped by `main`. */
  errorAt?: string;
}

/** The subset of a `com.atproto.sync.listRepos` response this script reads. */
interface ListReposPage {
  cursor?: string;
  repos?: { did: Did; active?: boolean }[];
}

/** Number of actors requested per `app.bsky.actor.getProfiles` call (the endpoint's documented maximum). */
const PROFILE_BATCH_SIZE = 25;

/** Hosts skipped on purpose: massive and full of zero-follower accounts. */
const EXCLUDED_HOSTS: ReadonlySet<string> = new Set([
  "https://atproto.brid.gy/",
  "https://pds.si46.world/",
]);

/** Client used for all profile lookups, pointed at the public Bluesky API. */
const client = new Client({
  handler: simpleFetchHandler({ service: "https://public.api.bsky.app" }),
});

/**
 * Tells whether `host` is one of Bluesky's own domains.
 *
 * Accepts an optional `http(s)://` scheme, an optional subdomain prefix and an
 * optional trailing slash around `bsky.network`, `bsky.app`, `bsky.dev` or
 * `bsky.social`. Anything with a path or port does not match.
 */
function isBlueskyHost(host: string): boolean {
  return /^(?:https?:\/\/)?(?:[^\/]+\.)?(?:bsky\.network|bsky\.app|bsky\.dev|bsky\.social)\/?$/.test(
    host,
  );
}

/**
 * Lists every active repo on a PDS by paging through
 * `com.atproto.sync.listRepos`.
 *
 * @param pds - Base URL of the PDS (e.g. `https://example.com/`).
 * @param cursor - Optional cursor to resume pagination from.
 * @param accounts - Accumulator the results are appended to (mutated in place).
 * @returns The accumulator. If a page request fails, whatever was collected
 *   before the failure is returned rather than being thrown away.
 */
async function getAccountsOnPds(
  pds: string,
  cursor: string | undefined = undefined,
  accounts: didWithPds[] = [],
): Promise<didWithPds[]> {
  let nextCursor = cursor;

  for (;;) {
    // Build the URL with the URL API so the cursor is percent-encoded and a
    // missing trailing slash on `pds` does not produce a broken path.
    const url = new URL("xrpc/com.atproto.sync.listRepos", pds);
    if (nextCursor) url.searchParams.set("cursor", nextCursor);

    const response = await fetch(url, {
      method: "GET",
      headers: { Accept: "application/json" },
    });

    if (!response.ok) {
      console.log(`failed to retrieve accounts for ${pds}`);
      // Keep the pages that were already fetched successfully.
      return accounts;
    }

    const page = (await response.json()) as ListReposPage | null;
    const repos = Array.isArray(page?.repos) ? page.repos : [];

    // Only keep the DID of each repo, tag it with its PDS, and drop repos
    // that are not marked active.
    for (const repo of repos) {
      if (repo.active) accounts.push({ did: repo.did, pds });
    }

    // Stop on the last page, or if the server keeps echoing the same cursor.
    if (!page?.cursor || page.cursor === nextCursor) break;
    nextCursor = page.cursor;
  }

  return accounts;
}

/**
 * Fetches detailed profiles for one batch of accounts and tags each profile
 * with the PDS its DID was found on.
 *
 * @param actorsWithPds - The batch to look up; callers pass at most
 *   `PROFILE_BATCH_SIZE` entries.
 * @returns The profiles returned by the API, or `[]` when the batch is empty
 *   or the request is not OK.
 */
async function getProfiles(
  actorsWithPds: didWithPds[],
): Promise<profileWithPds[]> {
  if (actorsWithPds.length === 0) return [];

  const dids = actorsWithPds.map((acc) => acc.did);
  const didToPds = new Map<Did, string>(
    actorsWithPds.map((acc): [Did, string] => [acc.did, acc.pds]),
  );

  const response = await client.get("app.bsky.actor.getProfiles", {
    params: { actors: dids },
  });

  if (!response.ok) {
    console.log(`failed to retrieve profiles for a batch of ${dids.length} accounts`);
    return [];
  }

  return response.data.profiles.map((profile) => ({
    ...profile,
    pds: didToPds.get(profile.did) ?? "",
  }));
}

/**
 * Collects the active accounts of every PDS in `pdses`, running up to
 * `concurrency` PDS crawls at a time.
 *
 * A failure on one PDS is logged and does not stop the others.
 *
 * @param pdses - PDS base URLs to crawl.
 * @param concurrency - Number of parallel workers; values below 1 are treated as 1.
 * @returns All accounts found, in completion order.
 */
async function fetchAllAccounts(
  pdses: string[],
  concurrency = 5,
): Promise<didWithPds[]> {
  const results: didWithPds[] = [];
  const queue = [...pdses];
  // Guard against 0/negative/NaN, which would otherwise start no workers and
  // silently return an empty result.
  const workerCount = Math.max(1, Math.floor(concurrency) || 1);

  // Each worker pulls from the shared queue until it is drained.
  const worker = async (): Promise<void> => {
    while (queue.length > 0) {
      const pds = queue.pop();
      if (!pds) continue;
      try {
        const accountsOnPds = await getAccountsOnPds(pds);
        results.push(...accountsOnPds);
        console.log(`Found ${accountsOnPds.length} accounts on ${pds}`);
      } catch (e: unknown) {
        const reason = e instanceof Error ? e.message : String(e);
        console.log(`fetch error for ${pds}: ${reason}`);
      }
    }
  };

  await Promise.all(Array.from({ length: workerCount }, worker));
  return results;
}

/**
 * Aggregates follower and account totals per PDS and writes them, sorted by
 * total followers (descending), to `data/pdses.json`.
 *
 * @param accounts - Profiles tagged with their PDS.
 */
async function createPdsList(accounts: profileWithPds[]): Promise<void> {
  const totals = new Map<string, { followers: number; accounts: number }>();

  for (const account of accounts) {
    const entry = totals.get(account.pds) ?? { followers: 0, accounts: 0 };
    entry.followers += account.followersCount ?? 0;
    entry.accounts += 1;
    totals.set(account.pds, entry);
  }

  const pdsesArray = [...totals.entries()]
    .sort((a, b) => b[1].followers - a[1].followers)
    .map(([pds, data]) => ({
      // Strip the scheme and trailing slash for display.
      pds: pds.replace("https://", "").replace(/\/$/, ""),
      ...data,
      "followers to accounts ratio": Math.round(data.followers / data.accounts),
    }));

  fs.writeFileSync("data/pdses.json", JSON.stringify(pdsesArray));
}

/**
 * Entry point: reads the PDS list from `data/data.json`, crawls every
 * non-Bluesky, non-failing, non-excluded PDS for accounts, fetches their
 * profiles, and writes `data/accounts.json` (sorted by followers) and
 * `data/pdses.json`.
 */
async function main(): Promise<void> {
  const raw = fs.readFileSync("data/data.json", "utf8");
  // `pdses` is a plain JSON object keyed by host URL, not a Map.
  const pdsMap: Record<string, Pds> | undefined = JSON.parse(raw).pdses;
  if (pdsMap === null || typeof pdsMap !== "object") {
    throw new Error("data/data.json does not contain a `pdses` object");
  }

  const pdses: string[] = [];
  for (const [host, val] of Object.entries(pdsMap)) {
    // Bluesky-hosted accounts are deliberately not counted.
    if (isBlueskyHost(host)) continue;

    // Skip PDSes recorded as failing.
    if (val.errorAt) continue;

    if (EXCLUDED_HOSTS.has(host)) continue;

    pdses.push(host);
  }

  const accounts = await fetchAllAccounts(pdses, 5);

  const accountsToWrite: profileWithPds[] = [];
  for (let i = 0; i < accounts.length; i += PROFILE_BATCH_SIZE) {
    const batch = accounts.slice(i, i + PROFILE_BATCH_SIZE);
    try {
      const fetchedProfiles = await getProfiles(batch);
      accountsToWrite.push(...fetchedProfiles);
    } catch (e: unknown) {
      // One failed batch should not discard everything gathered so far.
      const reason = e instanceof Error ? e.message : String(e);
      console.log(`profile fetch error at offset ${i}: ${reason}`);
    }
  }

  // Sort the accounts by followers count, highest first.
  accountsToWrite.sort(
    (a, b) => (b.followersCount ?? 0) - (a.followersCount ?? 0),
  );

  fs.writeFileSync("data/accounts.json", JSON.stringify(accountsToWrite));

  await createPdsList(accountsToWrite);
}

main().catch((e: unknown) => {
  console.error(e);
  process.exitCode = 1;
});