···199199 let resp = async {
200200 let resp = http
201201 .get(describe_url)
202202+ .timeout(throttle.timeout())
202203 .send()
203204 .await
204205 .map_err(RequestError::Reqwest)?;
···232233 return (did, throttle.to_retry_state().into());
233234 }
234235 Err(RequestError::Reqwest(e)) => {
236236+ if e.is_timeout() && !throttle.record_timeout() {
237237+ // first or second timeout, just requeue
238238+ let mut retry_state = RetryState::new(60);
239239+ retry_state.status = e.status();
240240+ return (did, retry_state.into());
241241+ }
242242+                        // third consecutive timeout: fall through so is_throttle_worthy
242242+                        // can decide whether to throttle/ban the pds
243243+235244 if is_throttle_worthy(&e) {
236245 if let Some(mins) = throttle.record_failure() {
237246 warn!(url = %pds_url, mins, "throttling pds due to hard failure");
···282291 trace!("no signal-matching collections found");
283292 }
284293285285- (
294294+ return (
286295 did,
287296 found_signal
288297 .then_some(CrawlCheckResult::Signal)
289298 .unwrap_or(CrawlCheckResult::NoSignal),
290290- )
299299+ );
291300}
292301293302#[derive(Debug, Serialize, Deserialize)]
···439448 .into_diagnostic()
440449 .wrap_err("can't parse cursor")?
441450 .unwrap_or(Cursor::Next(None));
442442- let mut was_throttled = false;
443451452452+ match cursor {
453453+ Cursor::Next(Some(ref cursor)) => info!(cursor = %cursor, "resuming"),
454454+ Cursor::Next(None) => info!("starting from scratch"),
455455+ Cursor::Done => info!("was done, resuming"),
456456+ }
457457+458458+ let mut was_throttled = false;
444459 loop {
445460 // throttle check
446461 loop {
···535550 }
536551 };
537552538538- let output = match serde_json::from_slice::<ListReposOutput>(&bytes) {
539539- Ok(out) => out.into_static(),
540540- Err(e) => {
541541- error!(err = %e, "failed to parse listRepos response");
542542- continue;
543543- }
544544- };
553553+ let mut batch = db.inner.batch();
554554+ let mut to_queue = Vec::new();
555555+ let filter = crawler.state.filter.load();
545556546546- if output.repos.is_empty() {
547547- info!("finished enumeration (or empty page)");
548548- tokio::time::sleep(Duration::from_secs(3600)).await;
549549- continue;
            // outcome of parsing one listRepos page inside spawn_blocking,
            // handed back to the async loop. None overall = empty page / parse failure.
            struct ParseResult {
                // dids from this page that are not yet present in the repos keyspace
                unknown_dids: Vec<Did<'static>>,
                // pagination cursor for the next listRepos request, if the server sent one
                cursor: Option<smol_str::SmolStr>,
                // number of repos in this page (used for metrics + logging)
                count: usize,
            }
551562552552- debug!(count = output.repos.len(), "fetched repos");
553553- crawler
554554- .crawled_count
555555- .fetch_add(output.repos.len(), Ordering::Relaxed);
563563+ let parse_result = {
564564+ let repos = db.repos.clone();
565565+ let filter_ks = db.filter.clone();
566566+ let crawler_ks = db.crawler.clone();
567567+ tokio::task::spawn_blocking(move || -> miette::Result<Option<ParseResult>> {
568568+ let output = match serde_json::from_slice::<ListReposOutput>(&bytes) {
569569+ Ok(out) => out.into_static(),
570570+ Err(e) => {
571571+ error!(err = %e, "failed to parse listRepos response");
572572+ return Ok(None);
573573+ }
574574+ };
556575557557- let mut batch = db.inner.batch();
558558- let mut to_queue = Vec::new();
559559- let filter = crawler.state.filter.load();
576576+ if output.repos.is_empty() {
577577+ return Ok(None);
578578+ }
560579561561- let mut unknown_dids = Vec::new();
562562- for repo in output.repos {
563563- let did_key = keys::repo_key(&repo.did);
580580+ let count = output.repos.len();
581581+ let next_cursor = output.cursor.map(|c| c.as_str().into());
582582+ let mut unknown = Vec::new();
583583+ for repo in output.repos {
584584+ let excl_key = crate::db::filter::exclude_key(repo.did.as_str())?;
585585+ if filter_ks.contains_key(&excl_key).into_diagnostic()? {
586586+ continue;
587587+ }
564588565565- let excl_key = crate::db::filter::exclude_key(repo.did.as_str())?;
566566- if db.filter.contains_key(&excl_key).into_diagnostic()? {
567567- continue;
568568- }
589589+ // already in retry queue — let the retry thread handle it
590590+ let retry_key = keys::crawler_retry_key(&repo.did);
591591+ if crawler_ks.contains_key(&retry_key).into_diagnostic()? {
592592+ continue;
593593+ }
569594570570- // already in retry queue — let the retry thread handle it
571571- let retry_key = keys::crawler_retry_key(&repo.did);
572572- if db.crawler.contains_key(&retry_key).into_diagnostic()? {
573573- continue;
574574- }
595595+ let did_key = keys::repo_key(&repo.did);
596596+ if !repos.contains_key(&did_key).into_diagnostic()? {
597597+ unknown.push(repo.did.into_static());
598598+ }
599599+ }
575600576576- if !Db::contains_key(db.repos.clone(), &did_key).await? {
577577- unknown_dids.push(repo.did.into_static());
578578- }
579579- }
601601+ Ok(Some(ParseResult {
602602+ unknown_dids: unknown,
603603+ cursor: next_cursor,
604604+ count,
605605+ }))
606606+ })
607607+ .await
608608+ .into_diagnostic()??
609609+ };
610610+611611+ let Some(ParseResult {
612612+ unknown_dids,
613613+ cursor: next_cursor,
614614+ count,
615615+ }) = parse_result
616616+ else {
617617+ info!("finished enumeration (or empty page)");
618618+ tokio::time::sleep(Duration::from_secs(3600)).await;
619619+ continue;
620620+ };
621621+622622+ debug!(count, "fetched repos");
623623+ crawler.crawled_count.fetch_add(count, Ordering::Relaxed);
580624581625 let valid_dids = if filter.check_signals() && !unknown_dids.is_empty() {
582626 // we dont need to pass any existing since we have none; we are crawling after all
···597641 to_queue.push(did.clone());
598642 }
599643600600- if let Some(new_cursor) = output.cursor {
644644+ if let Some(new_cursor) = next_cursor {
601645 cursor = Cursor::Next(Some(new_cursor.as_str().into()));
602646 } else {
603647 info!("reached end of list.");
+19
src/crawler/throttle.rs
···22use std::future::Future;
33use std::sync::Arc;
44use std::sync::atomic::{AtomicI64, AtomicUsize, Ordering};
55+use std::time::Duration;
56use tokio::sync::{Notify, Semaphore, SemaphorePermit};
67use url::Url;
78···4748struct State {
4849 throttled_until: AtomicI64,
4950 consecutive_failures: AtomicUsize,
5151+ consecutive_timeouts: AtomicUsize,
5052 /// only fires on hard failures (timeout, TLS, bad gateway, etc).
5153 /// ratelimits do NOT fire this — they just store `throttled_until` and
5254 /// let tasks exit naturally, deferring to the background retry loop.
···5961 Self {
6062 throttled_until: AtomicI64::new(0),
6163 consecutive_failures: AtomicUsize::new(0),
6464+ consecutive_timeouts: AtomicUsize::new(0),
6265 failure_notify: Notify::new(),
6366 semaphore: Semaphore::new(PER_PDS_CONCURRENCY),
6467 }
···82858386 pub fn record_success(&self) {
8487 self.state.consecutive_failures.store(0, Ordering::Release);
8888+ self.state.consecutive_timeouts.store(0, Ordering::Release);
8589 self.state.throttled_until.store(0, Ordering::Release);
8690 }
8791···123127 self.state.failure_notify.notify_waiters();
124128125129 Some(minutes)
130130+ }
131131+132132+ /// returns current timeout duration — 3s, 6s, or 12s depending on prior timeouts.
133133+ pub fn timeout(&self) -> Duration {
134134+ let n = self.state.consecutive_timeouts.load(Ordering::Acquire);
135135+ Duration::from_secs(3 * 2u64.pow(n.min(2) as u32))
136136+ }
137137+138138+ pub fn record_timeout(&self) -> bool {
139139+ let timeouts = self
140140+ .state
141141+ .consecutive_timeouts
142142+ .fetch_add(1, Ordering::AcqRel)
143143+ + 1;
144144+ timeouts > 2
126145 }
127146128147 /// acquire a concurrency slot for this PDS. hold the returned permit
+2-4
src/db/mod.rs
···113113 let repos = open_ks(
114114 "repos",
115115 opts()
116116- // most lookups hit since repo must exist after discovery
117117- // we don't hit here if it's not tracked anyway (that happens in filter)
118118- .expect_point_read_hits(true)
116116+                // the crawler frequently probes for repos that are not yet tracked,
116116+                // so point reads are expected to miss
117117+ .expect_point_read_hits(false)
119118 .max_memtable_size(cfg.db_repos_memtable_size_mb * 1024 * 1024)
120119 .data_block_size_policy(BlockSizePolicy::all(kb(4))),
121120 )?;
···196195 let crawler = open_ks(
197196 "crawler",
198197 opts()
199199- .expect_point_read_hits(true)
200198 .max_memtable_size((kb(1024) * 16) as u64)
201199 .data_block_size_policy(BlockSizePolicy::all(kb(1))),
202200 )?;