AT Protocol indexer with flexible filtering, XRPC queries, and a cursor-backed event stream, built on fjall
at-protocol atproto indexer rust fjall

[crawler] fix cursor getting reset to 0

ptr.pet 6638fccb 218ce852

verified
+37 -20
+34 -14
src/crawler/mod.rs
··· 10 10 use rand::RngExt; 11 11 use rand::rngs::SmallRng; 12 12 use reqwest::StatusCode; 13 + use serde::{Deserialize, Serialize}; 13 14 use smol_str::SmolStr; 14 15 use std::future::Future; 15 16 use std::ops::Mul; ··· 376 377 ) 377 378 } 378 379 380 + #[derive(Debug, Serialize, Deserialize)] 381 + enum Cursor { 382 + Done, 383 + Next(Option<SmolStr>), 384 + } 385 + 379 386 pub mod throttle; 380 - use throttle::{OrThrottle, Throttler}; 387 + use throttle::{OrFailure, Throttler}; 381 388 382 389 pub struct Crawler { 383 390 state: Arc<AppState>, ··· 505 512 let db = &crawler.state.db; 506 513 507 514 let cursor_key = b"crawler_cursor"; 508 - let mut cursor: Option<SmolStr> = Db::get(db.cursors.clone(), cursor_key.to_vec()) 509 - .await? 510 - .map(|bytes| { 511 - let s = String::from_utf8_lossy(&bytes); 512 - info!(cursor = %s, "resuming"); 513 - s.into() 514 - }); 515 + let cursor_bytes = Db::get(db.cursors.clone(), cursor_key.to_vec()).await?; 516 + let mut cursor: Cursor = cursor_bytes 517 + .as_deref() 518 + .map(rmp_serde::from_slice) 519 + .transpose() 520 + .into_diagnostic()? 
521 + .unwrap_or(Cursor::Next(None)); 515 522 let mut was_throttled = false; 516 523 517 524 loop { ··· 569 576 list_repos_url 570 577 .query_pairs_mut() 571 578 .append_pair("limit", "1000"); 572 - if let Some(c) = &cursor { 579 + if let Cursor::Next(Some(c)) = &cursor { 573 580 list_repos_url 574 581 .query_pairs_mut() 575 582 .append_pair("cursor", c.as_str()); ··· 664 671 } 665 672 666 673 if let Some(new_cursor) = output.cursor { 667 - cursor = Some(new_cursor.as_str().into()); 674 + cursor = Cursor::Next(Some(new_cursor.as_str().into())); 668 675 batch.insert( 669 676 &db.cursors, 670 677 cursor_key.to_vec(), ··· 672 679 ); 673 680 } else { 674 681 info!("reached end of list."); 675 - cursor = None; 682 + cursor = Cursor::Done; 676 683 } 677 684 678 685 tokio::task::spawn_blocking(move || batch.commit().into_diagnostic()) ··· 681 688 682 689 crawler.account_new_repos(to_queue.len()).await; 683 690 684 - if cursor.is_none() { 691 + if matches!(cursor, Cursor::Done) { 685 692 tokio::time::sleep(Duration::from_secs(3600)).await; 686 693 } 687 694 } ··· 699 706 700 707 let mut rng: SmallRng = rand::make_rng(); 701 708 709 + let mut batch = db.inner.batch(); 702 710 for guard in db.crawler.prefix(keys::CRAWLER_RETRY_PREFIX) { 703 711 let (key, val) = guard.into_inner().into_diagnostic()?; 704 - let (retry_after, _) = keys::crawler_retry_parse_value(&val)?; 712 + let (retry_after, _) = match keys::crawler_retry_parse_value(&val) { 713 + Ok(x) => x, 714 + Err(_) => { 715 + // this handles the old db format 716 + // todo: remove this later!! its just for testing... 
717 + let retry_after = now + 60 * 5; 718 + batch.insert( 719 + &db.crawler, 720 + key.clone(), 721 + keys::crawler_retry_value(retry_after, 0), 722 + ); 723 + (retry_after, 0) 724 + } 725 + }; 705 726 let did = keys::crawler_retry_parse_key(&key)?.to_did(); 706 727 707 728 // we check an extra backoff of 1 - 7% just to make it less likely for ··· 727 748 info!(count = ready.len(), "retrying pending repos"); 728 749 729 750 let handle = tokio::runtime::Handle::current(); 730 - let mut batch = db.inner.batch(); 731 751 let filter = self.state.filter.load(); 732 752 let valid_dids = handle.block_on(self.check_signals_batch(&ready, &filter, &mut batch))?; 733 753
+3 -6
src/crawler/throttle.rs
··· 148 148 } 149 149 } 150 150 151 - /// extension trait that adds `.or_throttle()` to any future returning `Result<T, E>`. 152 - /// 153 - /// races the future against a hard-failure notification. soft ratelimits (429) do NOT 154 - /// trigger cancellation — those are handled by the background retry loop. 151 + /// adds a method for racing the future against a hard-failure notification. 155 152 #[allow(async_fn_in_trait)] 156 - pub trait OrThrottle<T, E>: Future<Output = Result<T, E>> { 153 + pub trait OrFailure<T, E>: Future<Output = Result<T, E>> { 157 154 async fn or_failure( 158 155 self, 159 156 handle: &ThrottleHandle, ··· 169 166 } 170 167 } 171 168 172 - impl<T, E, F: Future<Output = Result<T, E>>> OrThrottle<T, E> for F {} 169 + impl<T, E, F: Future<Output = Result<T, E>>> OrFailure<T, E> for F {}