Parakeet is a Rust-based Bluesky AppServer aiming to implement most of the functionality required to support the Bluesky client
appview atproto bluesky rust appserver

feat: backfill spidering

mia.omg.lol 8f8bf0f0 924b6f6a

verified
+89 -7
+12 -7
crates/consumer/src/backfill/mod.rs
··· 25 25 mod repo; 26 26 mod utils; 27 27 28 + const BF_QUEUE: &str = "backfill_queue"; 28 29 const DL_DUP_KEY: &str = "bf_completed"; 29 30 // There's a 4MiB limit on parakeet-index, so break delta batches up if there's loads. 30 31 // this should be plenty low enough to not trigger the size limit. (59k did slightly) ··· 80 81 break; 81 82 } 82 83 83 - let did: String = match self.redis.lpop("backfill_queue", None).await { 84 + let did: String = match self.redis.lpop(BF_QUEUE, None).await { 84 85 Ok(Some(did)) => did, 85 86 Ok(None) => { 86 87 tokio::time::sleep(Duration::from_millis(250)).await; ··· 96 97 97 98 let inner = self.inner.clone(); 98 99 let mut conn = self.pool.get().await?; 99 - let rc = self.redis.clone(); 100 + let mut rc = self.redis.clone(); 100 101 101 102 tracker.spawn(async move { 102 103 let _p = p; 103 104 tracing::trace!("backfilling {did}"); 104 105 105 - if let Err(e) = do_actor_backfill(&mut conn, rc, inner, &did).await { 106 + if let Err(e) = do_actor_backfill(&mut conn, &mut rc, inner, &did).await { 106 107 tracing::error!(did, "backfill failed: {e}"); 107 108 counter!("backfill_failure").increment(1); 108 109 } else { ··· 111 112 db::backfill_job_write(&mut conn, &did, "successful") 112 113 .await 113 114 .unwrap(); 115 + 116 + if let Err(e) = utils::handle_spider(&mut conn, &mut rc, &did).await { 117 + tracing::error!("failed to trigger spider for {did}: {e}"); 118 + } 114 119 } 115 120 }); 116 121 } ··· 123 128 124 129 async fn do_actor_backfill( 125 130 conn: &mut Object, 126 - mut rc: MultiplexedConnection, 131 + rc: &mut MultiplexedConnection, 127 132 mut inner: BackfillManagerInner, 128 133 did: &str, 129 134 ) -> eyre::Result<()> { ··· 191 196 } 192 197 } 193 198 194 - utils::enforce_ratelimit(&mut rc, &pds).await?; 199 + utils::enforce_ratelimit(rc, &pds).await?; 195 200 196 - match backfill_repo(conn, &mut rc, &mut inner, &pds, did).await { 201 + match backfill_repo(conn, rc, &mut inner, &pds, did).await { 197 202 Ok(Some((rem, reset))) => { 198 203 let _ = rc.zadd(utils::BF_REM_KEY, &pds, rem).await; 199 204 let _ = rc.zadd(utils::BF_RESET_KEY, &pds, reset).await; ··· 203 208 pds, 204 209 "got response with no ratelimit headers, using defaults." 205 210 ); 206 - utils::handle_default_ratelimit(&mut rc, &pds).await?; 211 + utils::handle_default_ratelimit(rc, &pds).await?; 207 212 } 208 213 Err(e) => { 209 214 tracing::error!(did, "backfill failed: {e}");
+41
crates/consumer/src/backfill/utils.rs
··· 9 9 use std::time::Duration; 10 10 use tracing::instrument; 11 11 12 + const SPIDER_KEY: &str = "bf_spider"; 12 13 pub const BF_RESET_KEY: &str = "bf_ratelimit_reset"; 13 14 pub const BF_REM_KEY: &str = "bf_ratelimit_rem"; 14 15 const BF_REM_DEFAULT: i32 = 1000; 16 + 17 + pub async fn handle_spider( 18 + conn: &mut Object, 19 + rc: &mut MultiplexedConnection, 20 + did: &str, 21 + ) -> eyre::Result<()> { 22 + let Some(spider_count) = 23 + redis::AsyncCommands::hget::<_, _, Option<i32>>(rc, SPIDER_KEY, did).await? 24 + else { 25 + return Ok(()); 26 + }; 27 + rc.hdel(SPIDER_KEY, did).await?; 28 + 29 + let new_count = spider_count - 1; 30 + 31 + let follows = conn 32 + .query("SELECT subject FROM follows WHERE did=$1", &[&did]) 33 + .await?; 34 + if follows.is_empty() { 35 + return Ok(()); 36 + } 37 + 38 + let follows = follows.iter().map(|v| v.get::<_, String>(0)); 39 + 40 + let items = follows 41 + .clone() 42 + .map(|follow| (follow, new_count)) 43 + .collect::<Vec<_>>(); 44 + 45 + if new_count > 0 { 46 + // write all the new accounts 47 + rc.hset_multiple(SPIDER_KEY, &items).await.unwrap(); 48 + } 49 + 50 + // and then to backfill 51 + let follows = follows.collect::<Vec<_>>(); 52 + rc.rpush(super::BF_QUEUE, &follows).await.unwrap(); 53 + 54 + Ok(()) 55 + } 15 56 16 57 #[derive(Debug, Deserialize)] 17 58 pub struct GetRepoStatusRes {
+35
crates/parakeet/src/xrpc/at_parakeet/admin.rs
··· 53 53 54 54 Ok(()) 55 55 } 56 + 57 + #[derive(Debug, Deserialize)] 58 + pub struct RequestSpiderReq { 59 + pub depth: i32, 60 + pub dids: Vec<String>, 61 + } 62 + 63 + pub async fn request_spider( 64 + State(mut state): State<GlobalState>, 65 + auth: AtpAuth, 66 + Json(form): Json<RequestSpiderReq>, 67 + ) -> XrpcResult<()> { 68 + if !check_admin_did(&state, &auth.0) { 69 + return Err(Error::new(StatusCode::FORBIDDEN, "Forbidden", None)); 70 + } 71 + 72 + let items = form 73 + .dids 74 + .iter() 75 + .clone() 76 + .map(|did| (did, form.depth)) 77 + .collect::<Vec<_>>(); 78 + 79 + if let Err(e) = state.redis_mp.hset_multiple("bf_spider", &items).await { 80 + tracing::error!("failed to push to spider store: {e}"); 81 + return Err(Error::server_error(None)); 82 + } 83 + 84 + if let Err(e) = state.redis_mp.rpush(BACKFILL_QUEUE, form.dids).await { 85 + tracing::error!("failed to push to backfill queue: {e}"); 86 + return Err(Error::server_error(None)); 87 + } 88 + 89 + Ok(()) 90 + }
+1
crates/parakeet/src/xrpc/at_parakeet/mod.rs
··· 8 8 Router::new() 9 9 .route("/at.parakeet.admin.backfillQueueSize", get(admin::backfill_queue_size)) 10 10 .route("/at.parakeet.admin.requestBackfill", post(admin::request_backfill)) 11 + .route("/at.parakeet.admin.requestSpider", post(admin::request_spider)) 11 12 } 12 13 13 14 pub fn check_admin_did(state: &crate::GlobalState, did: &String) -> bool {