···85- `resync_buffer`: Maps `{DID}|{Rev}` -> `Commit` (MessagePack). Used to buffer live events during backfill.
86- `counts`: Maps `k|{NAME}` or `r|{DID}|{COL}` -> `Count` (u64 BE Bytes).
87- `filter`: Stores filter config. Handled by the `db::filter` module. Includes mode key `m` -> `FilterMode` (MessagePack), and set entries for signals (`s|{NSID}`), collections (`c|{NSID}`), and excludes (`x|{DID}`) -> empty value.
## Safe commands
90
···85- `resync_buffer`: Maps `{DID}|{Rev}` -> `Commit` (MessagePack). Used to buffer live events during backfill.
86- `counts`: Maps `k|{NAME}` or `r|{DID}|{COL}` -> `Count` (u64 BE Bytes).
87- `filter`: Stores filter config. Handled by the `db::filter` module. Includes mode key `m` -> `FilterMode` (MessagePack), and set entries for signals (`s|{NSID}`), collections (`c|{NSID}`), and excludes (`x|{DID}`) -> empty value.
88+- `crawler`: Stores crawler state with prefixed keys. Failed crawl entries use `f|{DID}` -> empty value, representing repos that failed signal checking during crawl discovery.
## Safe commands
91
+258-80
src/crawler/mod.rs
···01use crate::db::{Db, keys, ser_repo_state};
2use crate::state::AppState;
3use crate::types::RepoState;
···6use jacquard_common::{IntoStatic, types::string::Did};
7use miette::{IntoDiagnostic, Result};
8use rand::Rng;
09use rand::rngs::SmallRng;
10use reqwest_middleware::{ClientBuilder, ClientWithMiddleware};
11use reqwest_retry::Jitter;
12use reqwest_retry::{RetryTransientMiddleware, policies::ExponentialBackoff};
13use smol_str::SmolStr;
14use std::sync::Arc;
015use std::time::Duration;
16use tracing::{debug, error, info, trace};
17use url::Url;
/// Crawls a relay's repo listing and queues unknown repos for backfill.
pub struct Crawler {
    /// Shared application state (database, filter config, backfill notification).
    state: Arc<AppState>,
    /// Relay to enumerate repos from; `run` rewrites a `wss` scheme before HTTP calls.
    relay_host: Url,
    /// HTTP client with retry middleware, shared across spawned check tasks.
    http: Arc<ClientWithMiddleware>,
    // Presumably an upper bound on the pending backfill queue — TODO confirm against run().
    max_pending: usize,
    // Presumably the pending level at which crawling resumes after throttling — TODO confirm.
    resume_pending: usize,
}
2627impl Crawler {
···55 http,
56 max_pending,
57 resume_pending,
058 }
59 }
6061 pub async fn run(self) -> Result<()> {
62 info!("crawler started");
630000000000000000000000064 let mut api_url = self.relay_host.clone();
65 if api_url.scheme() == "wss" {
66 api_url
···82 .await?
83 .map(|bytes| {
84 let s = String::from_utf8_lossy(&bytes);
85- info!("resuming crawler from cursor: {}", s);
86 s.into()
87 });
88 let mut was_throttled = false;
···181 && !filter.has_glob_signals();
182183 // 3. process repos
184- let mut unknown_repos = Vec::new();
185 for repo in output.repos {
186- let parsed_did: Did = repo.did.parse().unwrap();
187- let did_key = keys::repo_key(&parsed_did);
188189 let excl_key = crate::db::filter::exclude_key(repo.did.as_str())?;
190 if db.filter.contains_key(&excl_key).into_diagnostic()? {
···193194 // check if known
195 if !Db::contains_key(db.repos.clone(), &did_key).await? {
196- unknown_repos.push(repo);
197 }
198 }
199200- let mut valid_repos = Vec::new();
201- if check_signals && !unknown_repos.is_empty() {
202- let mut set = tokio::task::JoinSet::new();
203- for repo in unknown_repos {
204- let http = self.http.clone();
205- let api_url = api_url.clone();
206- let filter = filter.clone();
207- set.spawn(async move {
208- let mut found_signal = false;
209- for signal in filter.signals.iter() {
210- let mut list_records_url =
211- api_url.join("/xrpc/com.atproto.repo.listRecords").unwrap();
212- list_records_url
213- .query_pairs_mut()
214- .append_pair("repo", &repo.did)
215- .append_pair("collection", signal)
216- .append_pair("limit", "1");
217-218- match http.get(list_records_url).send().await {
219- Ok(res) => {
220- let Ok(bytes) = res.bytes().await else {
221- error!("failed to read bytes from listRecords response for repo {}, signal {signal}", repo.did);
222- continue;
223- };
224- if let Ok(out) = serde_json::from_slice::<ListRecordsOutput>(&bytes) {
225- if !out.records.is_empty() {
226- found_signal = true;
227- break;
228- }
229- }
230- }
231- Err(e) => {
232- error!(
233- "failed to listRecords for repo {}, signal {signal}: {e}",
234- repo.did
235- );
236- continue;
237- }
238- }
239- }
240-241- if !found_signal {
242- trace!(
243- "crawler skipped repo {}: no records match signals",
244- repo.did
245- );
246- }
247-248- (repo, found_signal)
249- });
250- }
251-252- while let Some(res) = set.join_next().await {
253- let (repo, found_signal) = res.into_diagnostic()?;
254- if found_signal {
255- valid_repos.push(repo);
256- }
257- }
258 } else {
259- valid_repos = unknown_repos;
260- }
261262- for repo in valid_repos {
263- let parsed_did: Did = repo.did.parse().unwrap();
264- let did_key = keys::repo_key(&parsed_did);
265- trace!("crawler found new repo: {}", repo.did);
266267 let state = RepoState::backfilling(rng.next_u64());
268 batch.insert(&db.repos, &did_key, ser_repo_state(&state)?);
269 batch.insert(&db.pending, keys::pending_key(state.index_id), &did_key);
270- to_queue.push(repo.did.clone());
271 }
272273 // 4. update cursor
···289 .await
290 .into_diagnostic()??;
291292- // update counts if we found new repos
293- if !to_queue.is_empty() {
294- let count = to_queue.len() as i64;
295- self.state.db.update_count_async("repos", count).await;
296- self.state.db.update_count_async("pending", count).await;
00297 }
00298299- // 5. notify backfill worker
300- if !to_queue.is_empty() {
301- self.state.notify_backfill();
000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000302 }
0303304- if cursor.is_none() {
305- tokio::time::sleep(Duration::from_secs(3600)).await;
0000000000000000000000000000000000000000306 }
0000307 }
0000000000000000000000000000000308 }
309}
···1+use crate::db::types::TrimmedDid;
2use crate::db::{Db, keys, ser_repo_state};
3use crate::state::AppState;
4use crate::types::RepoState;
···7use jacquard_common::{IntoStatic, types::string::Did};
8use miette::{IntoDiagnostic, Result};
9use rand::Rng;
10+use rand::RngExt;
11use rand::rngs::SmallRng;
12use reqwest_middleware::{ClientBuilder, ClientWithMiddleware};
13use reqwest_retry::Jitter;
14use reqwest_retry::{RetryTransientMiddleware, policies::ExponentialBackoff};
15use smol_str::SmolStr;
16use std::sync::Arc;
17+use std::sync::atomic::{AtomicUsize, Ordering};
18use std::time::Duration;
19use tracing::{debug, error, info, trace};
20use url::Url;
/// Outcome of checking a single repo for filter signals during crawl discovery.
///
/// `Ratelimited` and `Failed` are both recorded under the failed-crawl prefix in
/// the `crawler` partition so the repo can be re-checked by `retry_failed_repos`;
/// `Signal` and `NoSignal` clear any previously recorded failure.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum CrawlCheckResult {
    /// At least one record matching a configured signal collection was found.
    Signal,
    /// The repo was checked successfully but no signal records were found.
    NoSignal,
    /// Identity resolution stayed rate limited past the retry budget.
    Ratelimited,
    /// Resolution, the listRecords request, or response handling failed.
    Failed,
}
/// Crawls a relay's repo listing, signal-checks unknown repos, and queues
/// matches for backfill.
pub struct Crawler {
    /// Shared application state (database, filter config, resolver, backfill notification).
    state: Arc<AppState>,
    /// Relay to enumerate repos from; `run` rewrites a `wss` scheme before HTTP calls.
    relay_host: Url,
    /// HTTP client with retry middleware, shared across spawned check tasks.
    http: Arc<ClientWithMiddleware>,
    // Presumably an upper bound on the pending backfill queue — TODO confirm against run().
    max_pending: usize,
    // Presumably the pending level at which crawling resumes after throttling — TODO confirm.
    resume_pending: usize,
    /// Repos queued since the last rate-log tick; incremented by
    /// `account_new_repos` and swapped back to zero each minute by the
    /// logging task spawned in `run`.
    count: Arc<AtomicUsize>,
}
3738impl Crawler {
···66 http,
67 max_pending,
68 resume_pending,
69+ count: Arc::new(AtomicUsize::new(0)),
70 }
71 }
7273 pub async fn run(self) -> Result<()> {
74 info!("crawler started");
7576+ tokio::spawn({
77+ let count = self.count.clone();
78+ let mut last_time = std::time::Instant::now();
79+ let mut interval = tokio::time::interval(Duration::from_secs(60));
80+ async move {
81+ loop {
82+ interval.tick().await;
83+ let delta = count.swap(0, Ordering::Relaxed);
84+ if delta == 0 {
85+ continue;
86+ }
87+ let elapsed = last_time.elapsed().as_secs_f64();
88+ let rate = if elapsed > 0.0 {
89+ delta as f64 / elapsed
90+ } else {
91+ 0.0
92+ };
93+ info!("crawler: {rate:.2} repos/s ({delta} repos in {elapsed:.1}s)");
94+ last_time = std::time::Instant::now();
95+ }
96+ }
97+ });
98+99 let mut api_url = self.relay_host.clone();
100 if api_url.scheme() == "wss" {
101 api_url
···117 .await?
118 .map(|bytes| {
119 let s = String::from_utf8_lossy(&bytes);
120+ info!("resuming crawler from cursor: {s}");
121 s.into()
122 });
123 let mut was_throttled = false;
···216 && !filter.has_glob_signals();
217218 // 3. process repos
219+ let mut unknown_dids = Vec::new();
220 for repo in output.repos {
221+ let did_key = keys::repo_key(&repo.did);
0222223 let excl_key = crate::db::filter::exclude_key(repo.did.as_str())?;
224 if db.filter.contains_key(&excl_key).into_diagnostic()? {
···227228 // check if known
229 if !Db::contains_key(db.repos.clone(), &did_key).await? {
230+ unknown_dids.push(repo.did.into_static());
231 }
232 }
233234+ let valid_dids = if check_signals && !unknown_dids.is_empty() {
235+ self.check_signals_batch(&unknown_dids, &filter, &mut batch)
236+ .await?
0000000000000000000000000000000000000000000000000000000237 } else {
238+ unknown_dids
239+ };
240241+ for did in &valid_dids {
242+ let did_key = keys::repo_key(did);
243+ trace!("crawler found new repo: {did}");
0244245 let state = RepoState::backfilling(rng.next_u64());
246 batch.insert(&db.repos, &did_key, ser_repo_state(&state)?);
247 batch.insert(&db.pending, keys::pending_key(state.index_id), &did_key);
248+ to_queue.push(did.clone());
249 }
250251 // 4. update cursor
···267 .await
268 .into_diagnostic()??;
269270+ self.account_new_repos(to_queue.len()).await;
271+272+ if cursor.is_none() {
273+ // 6. retry previously failed repos before sleeping
274+ self.retry_failed_repos(&mut rng).await?;
275+276+ tokio::time::sleep(Duration::from_secs(3600)).await;
277 }
278+ }
279+ }
    /// Concurrently checks each DID in `dids` for at least one record in a
    /// configured signal collection.
    ///
    /// For every DID a task is spawned that (1) resolves the DID to its PDS URL,
    /// retrying rate-limited resolution with exponential backoff + jitter, and
    /// (2) calls `com.atproto.repo.listRecords` (limit 1) against each signal
    /// collection until a match is found.
    ///
    /// Side effects on `batch` (not committed here — the caller commits):
    /// - `Signal` / `NoSignal`: the failed-crawl marker for the DID is removed.
    /// - `Ratelimited` / `Failed`: a failed-crawl marker is inserted so
    ///   `retry_failed_repos` can re-check the DID later.
    ///
    /// Returns the DIDs that matched a signal. Errors only on task join failure.
    async fn check_signals_batch(
        &self,
        dids: &[Did<'static>],
        filter: &crate::filter::FilterConfig,
        batch: &mut fjall::OwnedWriteBatch,
    ) -> Result<Vec<Did<'static>>> {
        let db = &self.state.db;
        let mut valid = Vec::new();
        let mut set = tokio::task::JoinSet::new();

        for did in dids {
            // Each task owns its DID, HTTP client, resolver handle, and filter copy.
            let did = did.clone();
            let http = self.http.clone();
            let resolver = self.state.resolver.clone();
            let filter = filter.clone();
            set.spawn(async move {
                const MAX_RETRIES: u32 = 8;
                // Per-task RNG for backoff jitter.
                // NOTE(review): `rand::make_rng()` is not a std `rand` API — confirm it
                // comes from a local extension trait (see the `RngExt` import).
                let mut rng: SmallRng = rand::make_rng();

                // Resolve the DID to its PDS base URL, backing off on rate limits:
                // 1s, 2s, 4s, ... (2^attempt) plus 0–2s of jitter, up to MAX_RETRIES.
                let pds_url = {
                    let mut attempt = 0u32;
                    loop {
                        match resolver.resolve_identity_info(&did).await {
                            Ok((url, _)) => break url,
                            Err(crate::resolver::ResolverError::Ratelimited)
                                if attempt < MAX_RETRIES =>
                            {
                                let base = Duration::from_secs(1 << attempt);
                                let jitter = Duration::from_millis(rng.random_range(0..2000));
                                let try_in = base + jitter;
                                debug!(
                                    "crawler: rate limited resolving {did}, retry {}/{MAX_RETRIES} in {}s",
                                    attempt + 1,
                                    try_in.as_secs_f64()
                                );
                                tokio::time::sleep(try_in).await;
                                attempt += 1;
                            }
                            Err(crate::resolver::ResolverError::Ratelimited) => {
                                // Retry budget exhausted — record as retryable.
                                error!(
                                    "crawler: rate limited resolving {did} after {MAX_RETRIES} retries"
                                );
                                return (did, CrawlCheckResult::Ratelimited);
                            }
                            Err(e) => {
                                error!("crawler: failed to resolve {did}: {e}");
                                return (did, CrawlCheckResult::Failed);
                            }
                        }
                    }
                };

                // Probe each signal collection with limit=1; stop at the first hit.
                let mut found_signal = false;
                for signal in filter.signals.iter() {
                    let mut list_records_url =
                        pds_url.join("/xrpc/com.atproto.repo.listRecords").unwrap();
                    list_records_url
                        .query_pairs_mut()
                        .append_pair("repo", &did)
                        .append_pair("collection", signal)
                        .append_pair("limit", "1");

                    // NOTE(review): `.map(..).flatten()` on Result needs a toolchain with
                    // stable `Result::flatten`; `.and_then(..)` would be equivalent.
                    let res = http
                        .get(list_records_url)
                        .send()
                        .await
                        .into_diagnostic()
                        .map(|res| res.error_for_status().into_diagnostic())
                        .flatten();
                    match res {
                        Ok(res) => {
                            let Ok(bytes) = res.bytes().await else {
                                error!(
                                    "failed to read bytes from listRecords response for repo {did}, signal {signal}"
                                );
                                return (did, CrawlCheckResult::Failed);
                            };
                            match serde_json::from_slice::<ListRecordsOutput>(&bytes) {
                                Ok(out) => {
                                    if !out.records.is_empty() {
                                        // Any record in a signal collection qualifies the repo.
                                        found_signal = true;
                                        break;
                                    }
                                }
                                Err(e) => {
                                    error!(
                                        "failed to parse listRecords response for repo {did}, signal {signal}: {e}"
                                    );
                                    return (did, CrawlCheckResult::Failed);
                                }
                            }
                        }
                        Err(e) => {
                            error!(
                                "failed to listRecords for repo {did}, signal {signal}: {e}"
                            );
                            return (did, CrawlCheckResult::Failed);
                        }
                    }
                }

                if found_signal {
                    (did, CrawlCheckResult::Signal)
                } else {
                    trace!("crawler skipped repo {did}: no records match signals");
                    (did, CrawlCheckResult::NoSignal)
                }
            });
        }

        // Collect task results in completion order and stage the DB effects.
        while let Some(res) = set.join_next().await {
            let (did, result) = res.into_diagnostic()?;
            match result {
                CrawlCheckResult::Signal => {
                    batch.remove(&db.crawler, keys::crawler_failed_key(&did));
                    valid.push(did);
                }
                CrawlCheckResult::NoSignal => {
                    // Definitive answer: clear any stale failure marker.
                    batch.remove(&db.crawler, keys::crawler_failed_key(&did));
                }
                CrawlCheckResult::Ratelimited | CrawlCheckResult::Failed => {
                    // Mark for a later retry pass.
                    batch.insert(&db.crawler, keys::crawler_failed_key(&did), []);
                }
            }
        }

        Ok(valid)
    }
    /// Re-checks repos whose earlier signal check failed (entries under
    /// `keys::CRAWLER_FAILED_PREFIX` in the `crawler` partition) and queues any
    /// that now match a signal for backfill.
    ///
    /// No-op unless the filter is in `Filter` mode with at least one non-glob
    /// signal (matching the gate used during the main crawl loop). Commits its
    /// own write batch and updates counts via `account_new_repos`.
    async fn retry_failed_repos(&self, rng: &mut SmallRng) -> Result<()> {
        let db = &self.state.db;
        let filter = self.state.filter.load();

        // Same signal-check gate as the crawl loop: only meaningful in Filter
        // mode with concrete (non-glob) signals.
        let check_signals = filter.mode == crate::filter::FilterMode::Filter
            && !filter.signals.is_empty()
            && !filter.has_glob_signals();

        if !check_signals {
            return Ok(());
        }

        // Load every previously failed DID by stripping the key prefix.
        let mut failed_dids = Vec::new();
        for guard in db.crawler.prefix(keys::CRAWLER_FAILED_PREFIX) {
            let (key, _) = guard.into_inner().into_diagnostic()?;
            let did_bytes = &key[keys::CRAWLER_FAILED_PREFIX.len()..];
            let trimmed = TrimmedDid::try_from(did_bytes)?;
            failed_dids.push(trimmed.to_did());
        }

        if failed_dids.is_empty() {
            return Ok(());
        }

        info!(
            "crawler: retrying {} previously failed repos",
            failed_dids.len()
        );

        // check_signals_batch also updates/clears the failure markers in `batch`.
        let mut batch = db.inner.batch();
        let valid_dids = self
            .check_signals_batch(&failed_dids, &filter, &mut batch)
            .await?;

        for did in &valid_dids {
            let did_key = keys::repo_key(did);

            // The repo may have been discovered by the live crawl meanwhile.
            if Db::contains_key(db.repos.clone(), &did_key).await? {
                continue;
            }

            // Stage the repo as backfilling and enqueue it, mirroring the crawl loop.
            let state = RepoState::backfilling(rng.next_u64());
            batch.insert(&db.repos, &did_key, ser_repo_state(&state)?);
            batch.insert(&db.pending, keys::pending_key(state.index_id), &did_key);
        }

        // Commit on a blocking thread; the outer `?` handles the join error,
        // the inner `?` the commit error.
        tokio::task::spawn_blocking(move || batch.commit().into_diagnostic())
            .await
            .into_diagnostic()??;

        if !valid_dids.is_empty() {
            info!(
                "crawler: recovered {} repos from failed retry",
                valid_dids.len()
            );
            self.account_new_repos(valid_dids.len()).await;
        }

        Ok(())
    }
470+471+ async fn account_new_repos(&self, count: usize) {
472+ if count == 0 {
473+ return;
474+ }
475+476+ self.count.fetch_add(count, Ordering::Relaxed);
477+ self.state
478+ .db
479+ .update_count_async("repos", count as i64)
480+ .await;
481+ self.state
482+ .db
483+ .update_count_async("pending", count as i64)
484+ .await;
485+ self.state.notify_backfill();
486 }
487}