at protocol indexer with flexible filtering, xrpc queries, and a cursor-backed event stream, built on fjall
at-protocol atproto indexer rust fjall

[crawler] use describeRepo instead of listRecords...

ptr.pet e263dacf d58e0261

verified
+72 -90
+72 -90
src/crawler/mod.rs
··· 5 5 use crate::util::{ErrorForStatus, RetryOutcome, RetryWithBackoff, parse_retry_after}; 6 6 use chrono::{DateTime, TimeDelta, Utc}; 7 7 use futures::FutureExt; 8 - use jacquard_api::com_atproto::repo::list_records::ListRecordsOutput; 8 + use jacquard_api::com_atproto::repo::describe_repo::DescribeRepoOutput; 9 9 use jacquard_api::com_atproto::sync::list_repos::ListReposOutput; 10 10 use jacquard_common::{IntoStatic, types::string::Did}; 11 11 use miette::{Context, IntoDiagnostic, Result}; ··· 193 193 Throttled, 194 194 } 195 195 196 - let mut found_signal = false; 197 - for signal in filter.signals.iter() { 198 - let res = async { 199 - let mut list_records_url = pds_url.join("/xrpc/com.atproto.repo.listRecords").unwrap(); 200 - list_records_url 201 - .query_pairs_mut() 202 - .append_pair("repo", &did) 203 - .append_pair("collection", signal) 204 - .append_pair("limit", "1"); 196 + let mut describe_url = pds_url.join("/xrpc/com.atproto.repo.describeRepo").unwrap(); 197 + describe_url.query_pairs_mut().append_pair("repo", &did); 205 198 206 - let resp = async { 207 - let resp = http 208 - .get(list_records_url.clone()) 209 - .send() 210 - .await 211 - .map_err(RequestError::Reqwest)?; 199 + let resp = async { 200 + let resp = http 201 + .get(describe_url) 202 + .send() 203 + .await 204 + .map_err(RequestError::Reqwest)?; 212 205 213 - // dont retry ratelimits since we will just put it in a queue to be tried again later 214 - if resp.status() == StatusCode::TOO_MANY_REQUESTS { 215 - return Err(RequestError::RateLimited(parse_retry_after(&resp))); 216 - } 206 + // dont retry ratelimits since we will just put it in a queue to be tried again later 207 + if resp.status() == StatusCode::TOO_MANY_REQUESTS { 208 + return Err(RequestError::RateLimited(parse_retry_after(&resp))); 209 + } 217 210 218 - resp.error_for_status().map_err(RequestError::Reqwest) 219 - } 220 - .or_failure(&throttle, || RequestError::Throttled) 221 - .await; 211 + resp.error_for_status().map_err(RequestError::Reqwest) 212 + } 213 + .or_failure(&throttle, || RequestError::Throttled) 214 + .await; 222 215 223 - let resp = match resp { 224 - Ok(r) => { 225 - throttle.record_success(); 226 - r 227 - } 228 - Err(RequestError::RateLimited(secs)) => { 229 - throttle.record_ratelimit(secs); 230 - return throttle 231 - .to_retry_state() 232 - .with_status(StatusCode::TOO_MANY_REQUESTS) 233 - .into(); 234 - } 235 - Err(RequestError::Throttled) => { 236 - return throttle.to_retry_state().into(); 216 + let resp = match resp { 217 + Ok(r) => { 218 + throttle.record_success(); 219 + r 220 + } 221 + Err(RequestError::RateLimited(secs)) => { 222 + throttle.record_ratelimit(secs); 223 + return ( 224 + did, 225 + throttle 226 + .to_retry_state() 227 + .with_status(StatusCode::TOO_MANY_REQUESTS) 228 + .into(), 229 + ); 230 + } 231 + Err(RequestError::Throttled) => { 232 + return (did, throttle.to_retry_state().into()); 233 + } 234 + Err(RequestError::Reqwest(e)) => { 235 + if is_throttle_worthy(&e) { 236 + if let Some(mins) = throttle.record_failure() { 237 + warn!(url = %pds_url, mins, "throttling pds due to hard failure"); 237 238 } 238 - Err(RequestError::Reqwest(e)) => { 239 - if is_throttle_worthy(&e) { 240 - if let Some(mins) = throttle.record_failure() { 241 - warn!(url = %pds_url, mins, "throttling pds due to hard failure"); 242 - } 243 - let mut retry_state = throttle.to_retry_state(); 244 - retry_state.status = e.status(); 245 - return retry_state.into(); 246 - } 239 + let mut retry_state = throttle.to_retry_state(); 240 + retry_state.status = e.status(); 241 + return (did, retry_state.into()); 242 + } 247 243 248 - match e.status() { 249 - Some(StatusCode::NOT_FOUND | StatusCode::GONE) => { 250 - trace!("repo not found"); 251 - return CrawlCheckResult::NoSignal; 252 - } 253 - Some(s) if s.is_client_error() => { 254 - error!(status = %s, "repo unavailable"); 255 - return CrawlCheckResult::NoSignal; 256 - } 257 - _ => { 258 - error!(err = %e, "repo errored"); 259 - let mut retry_state = RetryState::new(60 * 15); 260 - retry_state.status = e.status(); 261 - return retry_state.into(); 262 - } 263 - } 244 + match e.status() { 245 + Some(StatusCode::NOT_FOUND | StatusCode::GONE) => { 246 + trace!("repo not found"); 247 + return (did, CrawlCheckResult::NoSignal); 264 248 } 265 - }; 266 - 267 - let bytes = match resp.bytes().await { 268 - Ok(b) => b, 269 - Err(e) => { 270 - error!(err = %e, "failed to read listRecords response"); 271 - return RetryState::new(60 * 5).into(); 249 + Some(s) if s.is_client_error() => { 250 + error!(status = %s, "repo unavailable"); 251 + return (did, CrawlCheckResult::NoSignal); 272 252 } 273 - }; 274 - 275 - match serde_json::from_slice::<ListRecordsOutput>(&bytes) { 276 - Ok(out) if !out.records.is_empty() => return CrawlCheckResult::Signal, 277 - Ok(_) => {} 278 - Err(e) => { 279 - error!(err = %e, "failed to parse listRecords response"); 280 - return RetryState::new(60 * 10).into(); 253 + _ => { 254 + error!(err = %e, "repo errored"); 255 + let mut retry_state = RetryState::new(60 * 15); 256 + retry_state.status = e.status(); 257 + return (did, retry_state.into()); 281 258 } 282 259 } 260 + } 261 + }; 283 262 284 - CrawlCheckResult::NoSignal 263 + let bytes = match resp.bytes().await { 264 + Ok(b) => b, 265 + Err(e) => { 266 + error!(err = %e, "failed to read describeRepo response"); 267 + return (did, RetryState::new(60 * 5).into()); 285 268 } 286 - .instrument(tracing::info_span!("check", signal = %signal)) 287 - .await; 269 + }; 288 270 289 - match res { 290 - CrawlCheckResult::Signal => { 291 - found_signal = true; 292 - break; 293 - } 294 - CrawlCheckResult::NoSignal => continue, 295 - other => return (did, other), 271 + let out = match serde_json::from_slice::<DescribeRepoOutput>(&bytes) { 272 + Ok(out) => out, 273 + Err(e) => { 274 + error!(err = %e, "failed to parse describeRepo response"); 275 + return (did, RetryState::new(60 * 10).into()); 296 276 } 297 - } 277 + }; 278 + 279 + let found_signal = filter.signals.iter().any(|s| out.collections.contains(s)); 298 280 299 281 if !found_signal { 300 - trace!("no signal-matching records found"); 282 + trace!("no signal-matching collections found"); 301 283 } 302 284 303 285 (