// Scraping atproto for the most-followed accounts
1import { Client, simpleFetchHandler } from "@atcute/client";
2import { AppBskyActorDefs } from "@atcute/bluesky";
3import type { Did } from "@atcute/lexicons";
4import fs from "fs";
5
/** An account DID together with the PDS it was listed on. */
interface didWithPds {
  /** The PDS string the DID was fetched from (the same value passed to the lister). */
  pds: string;
  /** DID of the account's repo. */
  did: Did;
}
10
/**
 * A detailed profile view as returned by `app.bsky.actor.getProfiles`,
 * extended with the PDS the account was discovered on.
 */
interface profileWithPds extends AppBskyActorDefs.ProfileViewDetailed {
  /** PDS the account was listed on; an empty string when it could not be matched. */
  pds: string;
}
14
/**
 * Metadata stored per PDS host in `data/data.json`.
 *
 * Every field is optional. Of these, only `errorAt` is read by the code in
 * this file (hosts that have it set are skipped).
 */
interface Pds {
  /** Set when the host is failing; presumably a timestamp — confirm against the data source. */
  errorAt?: string;
  /** Presumably the PDS software version string — confirm against the data source. */
  version?: string;
  /** Presumably whether sign-up on the host needs an invite code — confirm against the data source. */
  inviteCodeRequired?: boolean;
}
20
// Fetch handler bound to the public Bluesky API endpoint; no credentials are
// configured here.
const publicApiHandler = simpleFetchHandler({
  service: "https://public.api.bsky.app",
});

/** Shared XRPC client used for all profile lookups in this script. */
const client = new Client({ handler: publicApiHandler });
24
/**
 * Matches a bare host or an http(s) base URL whose host is one of the
 * Bluesky-operated domains (or any subdomain of one), optionally followed by
 * a port and a single trailing slash.
 *
 * Case-insensitive because DNS host names are; compiled once at module load.
 */
const BLUESKY_HOST_PATTERN =
  /^(?:https?:\/\/)?(?:[^\/]+\.)?(?:bsky\.network|bsky\.app|bsky\.dev|bsky\.social)(?::\d+)?\/?$/i;

/**
 * Tells whether a PDS host belongs to Bluesky itself.
 *
 * @param host - Host name or base URL, e.g. `https://foo.host.bsky.network/`.
 * @returns `true` when the host is `bsky.network`, `bsky.app`, `bsky.dev` or
 *   `bsky.social`, or a subdomain of one of them; `false` otherwise
 *   (including URLs that carry a path beyond the trailing slash).
 */
function isBlueskyHost(host: string): boolean {
  return BLUESKY_HOST_PATTERN.test(host);
}
30
/**
 * Lists the accounts hosted on a PDS by paging through
 * `com.atproto.sync.listRepos` until the server stops returning a cursor.
 *
 * @param pds - Base URL of the PDS, with or without a trailing slash. The
 *   value is copied unchanged into every returned entry.
 * @param cursor - Pagination cursor to resume from; omit to start at the
 *   first page.
 * @param accounts - Accumulator the results are appended to. It is mutated in
 *   place and is also the returned array.
 * @returns Every account collected so far. When a page request fails, the
 *   failure is logged and the accounts gathered from earlier pages are
 *   returned rather than discarded.
 */
async function getAccountsOnPds(
  pds: string,
  cursor: string | undefined = undefined,
  accounts: didWithPds[] = [],
): Promise<didWithPds[]> {
  // Relative URL resolution drops the last path segment unless the base ends
  // in "/", so normalise the base before resolving the endpoint against it.
  const base = pds.endsWith("/") ? pds : `${pds}/`;
  let nextCursor = cursor;

  for (;;) {
    const url = new URL("xrpc/com.atproto.sync.listRepos", base);
    // searchParams percent-encodes the cursor, which may contain reserved characters.
    if (nextCursor) url.searchParams.set("cursor", nextCursor);

    const response = await fetch(url.href, {
      method: "GET",
      // A GET has no body to describe; state the wanted response format instead.
      headers: { Accept: "application/json" },
    });

    if (!response.ok) {
      console.log(`failed to retrieve accounts for ${pds}`);
      return accounts;
    }

    const data = (await response.json()) as {
      cursor?: string;
      repos?: { did: Did; active?: boolean }[];
    };

    // Tolerate a response without a `repos` array instead of crashing.
    const repos = Array.isArray(data.repos) ? data.repos : [];
    for (const repo of repos) {
      // Skip only repos explicitly flagged inactive. Servers that omit the
      // flag are presumably older implementations — treat those as active.
      if (repo.active === false) continue;
      accounts.push({ did: repo.did, pds });
    }

    // Stop on the last page, and also when the server echoes the cursor we
    // just sent: requesting it again would loop forever.
    if (!data.cursor || data.cursor === nextCursor) return accounts;
    nextCursor = data.cursor;
  }
}
67
/**
 * Fetches detailed profiles for a batch of accounts through
 * `app.bsky.actor.getProfiles` and tags each one with the PDS it came from.
 *
 * @param actorsWithPds - Accounts to look up. The caller is responsible for
 *   keeping the batch within the endpoint's per-request limit.
 * @returns One entry per profile the API returned; accounts the API did not
 *   return are absent. An empty array is returned for an empty batch and when
 *   the request is not successful (the latter is logged).
 */
async function getProfiles(
  actorsWithPds: didWithPds[],
): Promise<profileWithPds[]> {
  // Nothing to look up: skip the network round-trip entirely.
  if (actorsWithPds.length === 0) return [];

  const dids = actorsWithPds.map((acc) => acc.did);
  const didToPds = new Map<string, string>();
  for (const { did, pds } of actorsWithPds) didToPds.set(did, pds);

  const response = await client.get("app.bsky.actor.getProfiles", {
    params: { actors: dids },
  });

  if (!response.ok) {
    console.log(`failed to retrieve profiles for ${dids.length} accounts`);
    return [];
  }

  return response.data.profiles.map((profile) => ({
    ...profile,
    pds: didToPds.get(profile.did) ?? "",
  }));
}
85
/**
 * Collects the accounts of many PDSes, querying a bounded number of them at
 * the same time.
 *
 * @param pdses - PDS base URLs to query. The array is not modified.
 * @param concurrency - Maximum number of PDSes queried simultaneously.
 *   Values below 1 (or non-numeric values) are treated as 1.
 * @returns All accounts found, in completion order. A PDS whose lookup throws
 *   is logged and contributes nothing.
 */
async function fetchAllAccounts(
  pdses: string[],
  concurrency = 5,
): Promise<didWithPds[]> {
  const results: didWithPds[] = [];
  const queue = [...pdses];

  // At least one worker, and never more workers than there is work.
  const requested = Math.floor(concurrency) || 1;
  const workerCount = Math.min(queue.length, Math.max(1, requested));

  // Each worker drains the shared queue until it is empty.
  const worker = async (): Promise<void> => {
    while (queue.length > 0) {
      const pds = queue.pop();
      if (!pds) continue;
      try {
        const accountsOnPds = await getAccountsOnPds(pds);
        // Append one by one: spreading a very large array into push() passes
        // every element as a call argument and can overflow the call stack.
        for (const account of accountsOnPds) results.push(account);
        console.log(`Found ${accountsOnPds.length} accounts on ${pds}`);
      } catch (e) {
        console.log(`fetch error ${e}`);
      }
    }
  };

  await Promise.all(Array.from({ length: workerCount }, worker));
  return results;
}
107
/**
 * Aggregates follower totals per PDS and writes the ranking to
 * `data/pdses.json`.
 *
 * Each output entry holds the PDS host (scheme and trailing slash removed),
 * the summed follower count of its accounts, the number of accounts, and the
 * rounded followers-per-account ratio. Entries are ordered by total followers,
 * highest first.
 *
 * @param accounts - Profiles tagged with their PDS; a missing
 *   `followersCount` counts as 0.
 */
async function createPdsList(accounts: profileWithPds[]): Promise<void> {
  const totals = new Map<string, { followers: number; accounts: number }>();

  for (const account of accounts) {
    // One lookup covers both the "seen before" and "first time" cases.
    const entry = totals.get(account.pds) ?? { followers: 0, accounts: 0 };
    entry.followers += account.followersCount ?? 0;
    entry.accounts += 1;
    totals.set(account.pds, entry);
  }

  const pdsesArray = [...totals]
    // sort pdses by followers count, descending
    .sort(([, a], [, b]) => b.followers - a.followers)
    .map(([pds, data]) => ({
      // strip the scheme (http or https) and the trailing slash
      pds: pds.replace(/^https?:\/\//, "").replace(/\/$/, ""),
      ...data,
      // `accounts` is at least 1 for every entry, so this never divides by zero
      "followers to accounts ratio": Math.round(data.followers / data.accounts),
    }));

  fs.writeFileSync("data/pdses.json", JSON.stringify(pdsesArray));
}
141
// finally do the thing
/**
 * Runs the whole scrape:
 *
 * 1. read the PDS list from `data/data.json`,
 * 2. drop Bluesky-run, failing and explicitly excluded hosts,
 * 3. list the accounts on every remaining PDS,
 * 4. fetch their profiles in batches,
 * 5. write the profiles, sorted by follower count, to `data/accounts.json`,
 * 6. write the per-PDS summary via `createPdsList`.
 */
async function main(): Promise<void> {
  // these are massive and full of zero-follower accounts
  const excludedHosts = new Set([
    "https://atproto.brid.gy/",
    "https://pds.si46.world/",
  ]);
  // Profiles are requested 25 at a time — presumably the per-request limit of
  // app.bsky.actor.getProfiles; confirm against the lexicon before raising it.
  const profileBatchSize = 25;

  const raw = fs.readFileSync("data/data.json", "utf8");
  // JSON.parse yields a plain object keyed by host, not a Map.
  const pdsMap: Record<string, Pds> = JSON.parse(raw).pdses;

  const pdses: string[] = [];
  for (const [host, val] of Object.entries(pdsMap)) {
    // i don't want to count bsky accounts
    if (isBlueskyHost(host)) continue;

    // remove any failing pdses
    if (val.errorAt) continue;

    if (excludedHosts.has(host)) continue;

    pdses.push(host);
  }

  const accounts = await fetchAllAccounts(pdses, 5);

  // Batches are fetched one after another rather than in parallel.
  const accountsToWrite: profileWithPds[] = [];
  for (let i = 0; i < accounts.length; i += profileBatchSize) {
    const batch = accounts.slice(i, i + profileBatchSize);
    const fetchedProfiles = await getProfiles(batch);
    accountsToWrite.push(...fetchedProfiles);
  }

  // sort the accounts by followers count
  accountsToWrite.sort(
    (a, b) => (b.followersCount ?? 0) - (a.followersCount ?? 0),
  );

  fs.writeFileSync("data/accounts.json", JSON.stringify(accountsToWrite));

  // Await so a failure while writing the summary rejects main() itself.
  await createPdsList(accountsToWrite);
}
183
// Entry point: report any failure and signal it through the exit code
// instead of leaving the promise unhandled.
main().catch((error: unknown) => {
  console.error(error);
  process.exitCode = 1;
});