···
 - `resync_buffer`: Maps `{DID}|{Rev}` -> `Commit` (MessagePack). Used to buffer live events during backfill.
 - `counts`: Maps `k|{NAME}` or `r|{DID}|{COL}` -> `Count` (u64 BE bytes).
 - `filter`: Stores filter config. Handled by the `db::filter` module. Includes mode key `m` -> `FilterMode` (MessagePack), and set entries for signals (`s|{NSID}`), collections (`c|{NSID}`), and excludes (`x|{DID}`) -> empty value.
+- `crawler`: Stores crawler state with prefixed keys. Failed-crawl entries use `f|{DID}` -> empty value, representing repos that failed signal checking during crawl discovery (see the key sketch below).
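A minimal sketch of how these `f|{DID}` keys could be built and decoded, assuming plain byte concatenation (the real helpers live in `db::keys` and round-trip DIDs via `TrimmedDid`; the names and signatures here are illustrative, not the project's actual API):

```rust
// Hypothetical helpers mirroring the key layout described above.
pub const CRAWLER_FAILED_PREFIX: &[u8] = b"f|";

/// Builds the `f|{DID}` key marking a repo that failed signal checking.
pub fn crawler_failed_key(did: &str) -> Vec<u8> {
    let mut key = Vec::with_capacity(CRAWLER_FAILED_PREFIX.len() + did.len());
    key.extend_from_slice(CRAWLER_FAILED_PREFIX);
    key.extend_from_slice(did.as_bytes());
    key
}

/// Recovers the DID from a key yielded by an `f|` prefix scan.
pub fn did_from_failed_key(key: &[u8]) -> Option<&str> {
    key.strip_prefix(CRAWLER_FAILED_PREFIX)
        .and_then(|rest| std::str::from_utf8(rest).ok())
}
```

Because the marker is the whole key (the value is empty), clearing or re-recording a failure is a single `remove`/`insert`, and retry discovery is one prefix iteration.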

 ## Safe commands

src/crawler/mod.rs (+258 -80)
···
+use crate::db::types::TrimmedDid;
 use crate::db::{Db, keys, ser_repo_state};
 use crate::state::AppState;
 use crate::types::RepoState;
···
 use jacquard_common::{IntoStatic, types::string::Did};
 use miette::{IntoDiagnostic, Result};
 use rand::Rng;
+use rand::SeedableRng;
 use rand::rngs::SmallRng;
 use reqwest_middleware::{ClientBuilder, ClientWithMiddleware};
 use reqwest_retry::Jitter;
 use reqwest_retry::{RetryTransientMiddleware, policies::ExponentialBackoff};
 use smol_str::SmolStr;
 use std::sync::Arc;
+use std::sync::atomic::{AtomicUsize, Ordering};
 use std::time::Duration;
 use tracing::{debug, error, info, trace};
 use url::Url;

+enum CrawlCheckResult {
+    Signal,
+    NoSignal,
+    Ratelimited,
+    Failed,
+}
+
 pub struct Crawler {
     state: Arc<AppState>,
     relay_host: Url,
     http: Arc<ClientWithMiddleware>,
     max_pending: usize,
     resume_pending: usize,
+    count: Arc<AtomicUsize>,
 }

 impl Crawler {
···
             http,
             max_pending,
             resume_pending,
+            count: Arc::new(AtomicUsize::new(0)),
         }
     }

     pub async fn run(self) -> Result<()> {
         info!("crawler started");

+        // log crawl throughput roughly once a minute (idle ticks are skipped)
+        tokio::spawn({
+            let count = self.count.clone();
+            let mut last_time = std::time::Instant::now();
+            let mut interval = tokio::time::interval(Duration::from_secs(60));
+            async move {
+                loop {
+                    interval.tick().await;
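+                    // swap(0) atomically reads and resets the shared counter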
+                    let delta = count.swap(0, Ordering::Relaxed);
+                    if delta == 0 {
+                        continue;
+                    }
+                    let elapsed = last_time.elapsed().as_secs_f64();
+                    let rate = if elapsed > 0.0 {
+                        delta as f64 / elapsed
+                    } else {
+                        0.0
+                    };
+                    info!("crawler: {rate:.2} repos/s ({delta} repos in {elapsed:.1}s)");
+                    last_time = std::time::Instant::now();
+                }
+            }
+        });
+
         let mut api_url = self.relay_host.clone();
         if api_url.scheme() == "wss" {
             api_url
···
             .await?
             .map(|bytes| {
                 let s = String::from_utf8_lossy(&bytes);
-                info!("resuming crawler from cursor: {}", s);
+                info!("resuming crawler from cursor: {s}");
                 s.into()
             });
         let mut was_throttled = false;
···
                 && !filter.has_glob_signals();

             // 3. process repos
-            let mut unknown_repos = Vec::new();
+            let mut unknown_dids = Vec::new();
             for repo in output.repos {
-                let parsed_did: Did = repo.did.parse().unwrap();
-                let did_key = keys::repo_key(&parsed_did);
+                let did_key = keys::repo_key(&repo.did);

                 let excl_key = crate::db::filter::exclude_key(repo.did.as_str())?;
                 if db.filter.contains_key(&excl_key).into_diagnostic()? {
···

                 // check if known
                 if !Db::contains_key(db.repos.clone(), &did_key).await? {
-                    unknown_repos.push(repo);
+                    unknown_dids.push(repo.did.into_static());
                 }
             }

-            let mut valid_repos = Vec::new();
-            if check_signals && !unknown_repos.is_empty() {
-                let mut set = tokio::task::JoinSet::new();
-                for repo in unknown_repos {
-                    let http = self.http.clone();
-                    let api_url = api_url.clone();
-                    let filter = filter.clone();
-                    set.spawn(async move {
-                        let mut found_signal = false;
-                        for signal in filter.signals.iter() {
-                            let mut list_records_url =
-                                api_url.join("/xrpc/com.atproto.repo.listRecords").unwrap();
-                            list_records_url
-                                .query_pairs_mut()
-                                .append_pair("repo", &repo.did)
-                                .append_pair("collection", signal)
-                                .append_pair("limit", "1");
-
-                            match http.get(list_records_url).send().await {
-                                Ok(res) => {
-                                    let Ok(bytes) = res.bytes().await else {
-                                        error!("failed to read bytes from listRecords response for repo {}, signal {signal}", repo.did);
-                                        continue;
-                                    };
-                                    if let Ok(out) = serde_json::from_slice::<ListRecordsOutput>(&bytes) {
-                                        if !out.records.is_empty() {
-                                            found_signal = true;
-                                            break;
-                                        }
-                                    }
-                                }
-                                Err(e) => {
-                                    error!(
-                                        "failed to listRecords for repo {}, signal {signal}: {e}",
-                                        repo.did
-                                    );
-                                    continue;
-                                }
-                            }
-                        }
-
-                        if !found_signal {
-                            trace!(
-                                "crawler skipped repo {}: no records match signals",
-                                repo.did
-                            );
-                        }
-
-                        (repo, found_signal)
-                    });
-                }
-
-                while let Some(res) = set.join_next().await {
-                    let (repo, found_signal) = res.into_diagnostic()?;
-                    if found_signal {
-                        valid_repos.push(repo);
-                    }
-                }
+            let valid_dids = if check_signals && !unknown_dids.is_empty() {
+                self.check_signals_batch(&unknown_dids, &filter, &mut batch)
+                    .await?
             } else {
-                valid_repos = unknown_repos;
-            }
+                unknown_dids
+            };

-            for repo in valid_repos {
-                let parsed_did: Did = repo.did.parse().unwrap();
-                let did_key = keys::repo_key(&parsed_did);
-                trace!("crawler found new repo: {}", repo.did);
+            for did in &valid_dids {
+                let did_key = keys::repo_key(did);
+                trace!("crawler found new repo: {did}");

                 let state = RepoState::backfilling(rng.next_u64());
                 batch.insert(&db.repos, &did_key, ser_repo_state(&state)?);
                 batch.insert(&db.pending, keys::pending_key(state.index_id), &did_key);
-                to_queue.push(repo.did.clone());
+                to_queue.push(did.clone());
             }

             // 4. update cursor
···
                 .await
                 .into_diagnostic()??;

-            // update counts if we found new repos
-            if !to_queue.is_empty() {
-                let count = to_queue.len() as i64;
-                self.state.db.update_count_async("repos", count).await;
-                self.state.db.update_count_async("pending", count).await;
+            self.account_new_repos(to_queue.len()).await;
+
+            if cursor.is_none() {
+                // 6. retry previously failed repos before sleeping
+                self.retry_failed_repos(&mut rng).await?;
+
+                tokio::time::sleep(Duration::from_secs(3600)).await;
             }
+        }
+    }

-            // 5. notify backfill worker
-            if !to_queue.is_empty() {
-                self.state.notify_backfill();
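+    /// Resolves each DID's PDS and checks, concurrently, whether the repo
+    /// has at least one record in a configured signal collection. Definitive
+    /// outcomes clear the repo's `f|{DID}` failure marker; rate limits and
+    /// errors record one so the repo can be retried later.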
+    async fn check_signals_batch(
+        &self,
+        dids: &[Did<'static>],
+        filter: &crate::filter::FilterConfig,
+        batch: &mut fjall::OwnedWriteBatch,
+    ) -> Result<Vec<Did<'static>>> {
+        let db = &self.state.db;
+        let mut valid = Vec::new();
+        let mut set = tokio::task::JoinSet::new();
+
+        for did in dids {
+            let did = did.clone();
+            let http = self.http.clone();
+            let resolver = self.state.resolver.clone();
+            let filter = filter.clone();
+            set.spawn(async move {
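+                // resolve this DID's PDS first, then probe each signal collection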
+                const MAX_RETRIES: u32 = 8;
+                let mut rng = SmallRng::from_os_rng();
+
+                let pds_url = {
+                    let mut attempt = 0u32;
+                    loop {
+                        match resolver.resolve_identity_info(&did).await {
+                            Ok((url, _)) => break url,
+                            Err(crate::resolver::ResolverError::Ratelimited)
+                                if attempt < MAX_RETRIES =>
+                            {
+                                let base = Duration::from_secs(1 << attempt);
+                                let jitter = Duration::from_millis(rng.random_range(0..2000));
+                                let try_in = base + jitter;
+                                debug!(
+                                    "crawler: rate limited resolving {did}, retry {}/{MAX_RETRIES} in {}s",
+                                    attempt + 1,
+                                    try_in.as_secs_f64()
+                                );
+                                tokio::time::sleep(try_in).await;
+                                attempt += 1;
+                            }
+                            Err(crate::resolver::ResolverError::Ratelimited) => {
+                                error!(
+                                    "crawler: rate limited resolving {did} after {MAX_RETRIES} retries"
+                                );
+                                return (did, CrawlCheckResult::Ratelimited);
+                            }
+                            Err(e) => {
+                                error!("crawler: failed to resolve {did}: {e}");
+                                return (did, CrawlCheckResult::Failed);
+                            }
+                        }
+                    }
+                };
+
+                let mut found_signal = false;
+                for signal in filter.signals.iter() {
+                    let mut list_records_url =
+                        pds_url.join("/xrpc/com.atproto.repo.listRecords").unwrap();
+                    list_records_url
+                        .query_pairs_mut()
+                        .append_pair("repo", &did)
+                        .append_pair("collection", signal)
+                        .append_pair("limit", "1");
+
+                    let res = http
+                        .get(list_records_url)
+                        .send()
+                        .await
+                        .into_diagnostic()
+                        .and_then(|res| res.error_for_status().into_diagnostic());
+                    match res {
+                        Ok(res) => {
+                            let Ok(bytes) = res.bytes().await else {
+                                error!(
+                                    "failed to read bytes from listRecords response for repo {did}, signal {signal}"
+                                );
+                                return (did, CrawlCheckResult::Failed);
+                            };
+                            match serde_json::from_slice::<ListRecordsOutput>(&bytes) {
+                                Ok(out) => {
+                                    if !out.records.is_empty() {
+                                        found_signal = true;
+                                        break;
+                                    }
+                                }
+                                Err(e) => {
+                                    error!(
+                                        "failed to parse listRecords response for repo {did}, signal {signal}: {e}"
+                                    );
+                                    return (did, CrawlCheckResult::Failed);
+                                }
+                            }
+                        }
+                        Err(e) => {
+                            error!(
+                                "failed to listRecords for repo {did}, signal {signal}: {e}"
+                            );
+                            return (did, CrawlCheckResult::Failed);
+                        }
+                    }
+                }
+
+                if found_signal {
+                    (did, CrawlCheckResult::Signal)
+                } else {
+                    trace!("crawler skipped repo {did}: no records match signals");
+                    (did, CrawlCheckResult::NoSignal)
+                }
+            });
+        }
+
+        while let Some(res) = set.join_next().await {
+            let (did, result) = res.into_diagnostic()?;
+            match result {
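+                // definitive results clear any stale failure marker; rate
+                // limits and errors record one so the repo is retried later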
+                CrawlCheckResult::Signal => {
+                    batch.remove(&db.crawler, keys::crawler_failed_key(&did));
+                    valid.push(did);
+                }
+                CrawlCheckResult::NoSignal => {
+                    batch.remove(&db.crawler, keys::crawler_failed_key(&did));
+                }
+                CrawlCheckResult::Ratelimited | CrawlCheckResult::Failed => {
+                    batch.insert(&db.crawler, keys::crawler_failed_key(&did), []);
+                }
             }
+        }

-            if cursor.is_none() {
-                tokio::time::sleep(Duration::from_secs(3600)).await;
+        Ok(valid)
+    }
+
+    /// Re-checks repos whose signal check previously failed; any that now
+    /// match a signal are queued for backfill.
+    async fn retry_failed_repos(&self, rng: &mut SmallRng) -> Result<()> {
+        let db = &self.state.db;
+        let filter = self.state.filter.load();
+
+        let check_signals = filter.mode == crate::filter::FilterMode::Filter
+            && !filter.signals.is_empty()
+            && !filter.has_glob_signals();
+
+        if !check_signals {
+            return Ok(());
+        }
+
+        let mut failed_dids = Vec::new();
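+        // gather every f|{DID} marker left by earlier failed checks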
+        for guard in db.crawler.prefix(keys::CRAWLER_FAILED_PREFIX) {
+            let (key, _) = guard.into_inner().into_diagnostic()?;
+            let did_bytes = &key[keys::CRAWLER_FAILED_PREFIX.len()..];
+            let trimmed = TrimmedDid::try_from(did_bytes)?;
+            failed_dids.push(trimmed.to_did());
+        }
+
+        if failed_dids.is_empty() {
+            return Ok(());
+        }
+
+        info!(
+            "crawler: retrying {} previously failed repos",
+            failed_dids.len()
+        );
+
+        let mut batch = db.inner.batch();
+        let valid_dids = self
+            .check_signals_batch(&failed_dids, &filter, &mut batch)
+            .await?;
+
+        for did in &valid_dids {
+            let did_key = keys::repo_key(did);
+
+            // skip repos that were indexed through other paths in the meantime
+            if Db::contains_key(db.repos.clone(), &did_key).await? {
+                continue;
             }
+
+            let state = RepoState::backfilling(rng.next_u64());
+            batch.insert(&db.repos, &did_key, ser_repo_state(&state)?);
+            batch.insert(&db.pending, keys::pending_key(state.index_id), &did_key);
         }
+
+        tokio::task::spawn_blocking(move || batch.commit().into_diagnostic())
+            .await
+            .into_diagnostic()??;
+
+        if !valid_dids.is_empty() {
+            info!(
+                "crawler: recovered {} previously failed repos",
+                valid_dids.len()
+            );
+            self.account_new_repos(valid_dids.len()).await;
+        }
+
+        Ok(())
+    }
+
+    /// Bumps the throughput counter and the persisted `repos`/`pending`
+    /// counts, then wakes the backfill worker.
+    async fn account_new_repos(&self, count: usize) {
+        if count == 0 {
+            return;
+        }
+
+        self.count.fetch_add(count, Ordering::Relaxed);
+        self.state
+            .db
+            .update_count_async("repos", count as i64)
+            .await;
+        self.state
+            .db
+            .update_count_async("pending", count as i64)
+            .await;
+        self.state.notify_backfill();
     }
 }