···
 smol_str = "0.3"
 futures = "0.3"
 reqwest = { version = "0.12", features = ["json", "rustls-tls", "stream", "gzip", "brotli", "zstd", "http2"], default-features = false }
+reqwest-client = { package = "reqwest", version = "0.13.2", features = ["json", "rustls", "stream", "gzip", "brotli", "zstd", "http2"], default-features = false }
+reqwest-middleware = { version = "0.5.1", default-features = false, features = ["http2", "rustls"] }
+reqwest-retry = { version = "0.9.1" }
 axum = { version = "0.8.8", features = ["ws", "macros"] }
 tower-http = { version = "0.6.6", features = ["cors", "trace"] }
 tokio-stream = "0.1"
···
 glob = "0.3"
 ordermap = { version = "1.1.0", features = ["serde"] }
 arc-swap = "1.8.2"
+rustls = { version = "0.23", features = ["ring"] }
 
 [dev-dependencies]
 tempfile = "3.26.0"

README.md (+1 −1)

···
 | `NO_LZ4_COMPRESSION` | `false` | disable lz4 compression for storage. |
 | `ENABLE_FIREHOSE` | `true` | whether to ingest relay subscriptions. |
 | `ENABLE_BACKFILL` | `true` | whether to backfill from PDS instances. |
-| `ENABLE_CRAWLER` | `false` (if Filter), `true` (if Full) | whether to actively query the network for unknown repositories. |
+| `ENABLE_CRAWLER` | `false` (if Filter), `true` (if Full) | whether to actively query the network for unknown repositories. when in `Filter` mode without wildcard (`*`) signals, the crawler uses `com.atproto.repo.listRecords` to verify that a discovered repository has matching records before queuing it for backfill; this is usually much faster, since most repositories are filtered out without a full backfill (see the example request below the table). |
 | `DB_WORKER_THREADS` | `4` (`8` if full network) | database worker threads. |
 | `DB_MAX_JOURNALING_SIZE_MB` | `512` (`1024` if full network) | max database journaling size in MB. |
 | `DB_PENDING_MEMTABLE_SIZE_MB` | `64` (`192` if full network) | pending memtable size in MB. |
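
For illustration, the per-signal verification probe is a single request of this shape (the DID and collection are placeholders; the actual request is built in `src/crawler/mod.rs` below):

```
GET /xrpc/com.atproto.repo.listRecords?repo=<did>&collection=<signal-nsid>&limit=1
```
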

src/crawler/mod.rs (+151 −36)

···
 use crate::db::{Db, keys, ser_repo_state};
 use crate::state::AppState;
 use crate::types::RepoState;
-use jacquard::api::com_atproto::sync::list_repos::{ListRepos, ListReposOutput};
-use jacquard::prelude::*;
-use jacquard_common::CowStr;
+use jacquard::IntoStatic;
+use jacquard_api::com_atproto::repo::list_records::ListRecordsOutput;
+use jacquard_api::com_atproto::sync::list_repos::ListReposOutput;
+use jacquard_common::types::string::Did;
 use miette::{IntoDiagnostic, Result};
 use rand::Rng;
 use rand::rngs::SmallRng;
+use reqwest_middleware::{ClientBuilder, ClientWithMiddleware};
+use reqwest_retry::Jitter;
+use reqwest_retry::{RetryTransientMiddleware, policies::ExponentialBackoff};
 use smol_str::SmolStr;
 use std::sync::Arc;
 use std::time::Duration;
···
 pub struct Crawler {
     state: Arc<AppState>,
     relay_host: Url,
-    http: reqwest::Client,
+    http: Arc<ClientWithMiddleware>,
     max_pending: usize,
     resume_pending: usize,
 }
···
         max_pending: usize,
         resume_pending: usize,
     ) -> Self {
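+        // retry transient failures with exponentially backed-off, jittered
+        // delays, giving up after 8 attempts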
+        let retry_policy = ExponentialBackoff::builder()
+            .jitter(Jitter::Bounded)
+            .build_with_max_retries(8);
+        let reqwest_client = reqwest_client::Client::builder()
+            .user_agent(concat!(
+                env!("CARGO_PKG_NAME"),
+                "/",
+                env!("CARGO_PKG_VERSION")
+            ))
+            .gzip(true)
+            .build()
+            .expect("that reqwest will build");
+
+        let http = ClientBuilder::new(reqwest_client)
+            .with(RetryTransientMiddleware::new_with_policy(retry_policy))
+            .build();
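+        // Arc so the client can be cloned cheaply into per-repo verification tasks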
+        let http = Arc::new(http);
+
         Self {
             state,
             relay_host,
-            http: reqwest::Client::new(),
+            http,
             max_pending,
             resume_pending,
         }
···
     pub async fn run(self) -> Result<()> {
         info!("crawler started");
 
+        let mut api_url = self.relay_host.clone();
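+        // the relay host may be configured with a ws/wss (firehose) scheme;
+        // XRPC requests need the matching http/https scheme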
+        if api_url.scheme() == "wss" {
+            api_url
+                .set_scheme("https")
+                .map_err(|_| miette::miette!("invalid url: {api_url}"))?;
+        } else if api_url.scheme() == "ws" {
+            api_url
+                .set_scheme("http")
+                .map_err(|_| miette::miette!("invalid url: {api_url}"))?;
+        }
+
         let mut rng: SmallRng = rand::make_rng();
 
         let db = &self.state.db;
 
         // 1. load cursor
         let cursor_key = b"crawler_cursor";
-        let mut cursor: Option<SmolStr> =
-            if let Ok(Some(bytes)) = Db::get(db.cursors.clone(), cursor_key.to_vec()).await {
+        let mut cursor: Option<SmolStr> = Db::get(db.cursors.clone(), cursor_key.to_vec())
+            .await?
+            .map(|bytes| {
                 let s = String::from_utf8_lossy(&bytes);
                 info!("resuming crawler from cursor: {}", s);
-                Some(s.into())
-            } else {
-                None
-            };
+                s.into()
+            });
         let mut was_throttled = false;
 
         loop {
···
             }
 
             // 2. fetch listrepos
-            let req = ListRepos::new()
-                .limit(1000)
-                .maybe_cursor(cursor.clone().map(|c| CowStr::from(c.to_string())))
-                .build();
-
-            let mut url = self.relay_host.clone();
-            if url.scheme() == "wss" {
-                url.set_scheme("https")
-                    .map_err(|_| miette::miette!("invalid url: {url}"))?;
-            } else if url.scheme() == "ws" {
-                url.set_scheme("http")
-                    .map_err(|_| miette::miette!("invalid url: {url}"))?;
+            let mut list_repos_url = api_url
+                .join("/xrpc/com.atproto.sync.listRepos")
+                .into_diagnostic()?;
+            list_repos_url
+                .query_pairs_mut()
+                .append_pair("limit", "1000");
+            if let Some(c) = &cursor {
+                list_repos_url
+                    .query_pairs_mut()
+                    .append_pair("cursor", c.as_str());
             }
-            let res_result = self.http.xrpc(url).send(&req).await;
 
-            let output: ListReposOutput = match res_result {
-                Ok(res) => res.into_output().into_diagnostic()?,
+            let res_result = self.http.get(list_repos_url.clone()).send().await;
+            let bytes = match res_result {
+                Ok(res) => match res.bytes().await {
+                    Ok(b) => b,
+                    Err(e) => {
+                        error!(
+                            "crawler failed to read list repos response: {e}. retrying in 30s..."
+                        );
+                        tokio::time::sleep(Duration::from_secs(30)).await;
+                        continue;
+                    }
+                },
                 Err(e) => {
-                    let e = e
-                        .source_err()
-                        .map(|e| e.to_string())
-                        .unwrap_or_else(|| e.to_string());
                     error!("crawler failed to list repos: {e}. retrying in 30s...");
                     tokio::time::sleep(Duration::from_secs(30)).await;
                     continue;
                 }
             };
+            let output = serde_json::from_slice::<ListReposOutput>(&bytes)
+                .into_diagnostic()?
+                .into_static();
 
             if output.repos.is_empty() {
                 info!("crawler finished enumeration (or empty page). sleeping for 1 hour.");
···
 
             let mut batch = db.inner.batch();
             let mut to_queue = Vec::new();
+            let filter = self.state.filter.load();
+            // we can decide whether to backfill a repo much faster when only a
+            // known set of signals has to match, since we can just listRecords
+            // for those signals. with glob signals we can't do this, since we
+            // don't know which collections to check.
+            let check_signals = filter.mode == crate::filter::FilterMode::Filter
+                && !filter.signals.is_empty()
+                && !filter.has_glob_signals();
 
             // 3. process repos
+            let mut unknown_repos = Vec::new();
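+            // first pass: collect repos that aren't in the repos keyspace yet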
             for repo in output.repos {
-                let did_key = keys::repo_key(&repo.did);
+                let parsed_did: Did = repo.did.parse().unwrap();
+                let did_key = keys::repo_key(&parsed_did);
 
                 let excl_key = crate::db::filter::exclude_key(repo.did.as_str())?;
                 if db.filter.contains_key(&excl_key).into_diagnostic()? {
···
 
                 // check if known
                 if !Db::contains_key(db.repos.clone(), &did_key).await? {
-                    trace!("crawler found new repo: {}", repo.did);
+                    unknown_repos.push(repo);
+                }
+            }
+
+            let mut valid_repos = Vec::new();
+            if check_signals && !unknown_repos.is_empty() {
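+                // probe each unknown repo concurrently: one listRecords call
+                // per signal, stopping at the first match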
+                let mut set = tokio::task::JoinSet::new();
+                for repo in unknown_repos {
+                    let http = self.http.clone();
+                    let api_url = api_url.clone();
+                    let filter = filter.clone();
+                    set.spawn(async move {
+                        let mut found_signal = false;
+                        for signal in filter.signals.iter() {
+                            let mut list_records_url =
+                                api_url.join("/xrpc/com.atproto.repo.listRecords").unwrap();
+                            list_records_url
+                                .query_pairs_mut()
+                                .append_pair("repo", &repo.did)
+                                .append_pair("collection", signal)
+                                .append_pair("limit", "1");
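+                            // limit=1: a single record is enough to confirm the signal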
+
+                            match http.get(list_records_url).send().await {
+                                Ok(res) => {
+                                    let Ok(bytes) = res.bytes().await else {
+                                        error!("failed to read bytes from listRecords response for repo {}, signal {signal}", repo.did);
+                                        continue;
+                                    };
+                                    if let Ok(out) = serde_json::from_slice::<ListRecordsOutput>(&bytes) {
+                                        if !out.records.is_empty() {
+                                            found_signal = true;
+                                            break;
+                                        }
+                                    }
+                                }
+                                Err(e) => {
+                                    error!(
+                                        "failed to listRecords for repo {}, signal {signal}: {e}",
+                                        repo.did
+                                    );
+                                    continue;
+                                }
+                            }
+                        }
153153- batch.insert(&db.repos, &did_key, ser_repo_state(&state)?);
154154- batch.insert(&db.pending, keys::pending_key(state.index_id), &did_key);
155155- to_queue.push(repo.did.clone());
242242+ if !found_signal {
243243+ trace!(
244244+ "crawler skipped repo {}: no records match signals",
245245+ repo.did
246246+ );
247247+ }
248248+249249+ (repo, found_signal)
250250+ });
156251 }
+
+                while let Some(res) = set.join_next().await {
+                    let (repo, found_signal) = res.into_diagnostic()?;
+                    if found_signal {
+                        valid_repos.push(repo);
+                    }
+                }
+            } else {
+                valid_repos = unknown_repos;
+            }
+
+            for repo in valid_repos {
+                let parsed_did: Did = repo.did.parse().unwrap();
+                let did_key = keys::repo_key(&parsed_did);
+                trace!("crawler found new repo: {}", repo.did);
+
+                let state = RepoState::backfilling(rng.next_u64());
+                batch.insert(&db.repos, &did_key, ser_repo_state(&state)?);
+                batch.insert(&db.pending, keys::pending_key(state.index_id), &did_key);
+                to_queue.push(repo.did.clone());
             }
 
             // 4. update cursor

src/db/filter.rs (+12 −4)

···
 use fjall::{Keyspace, OwnedWriteBatch};
+use jacquard::IntoStatic;
+use jacquard::types::nsid::Nsid;
 use miette::{IntoDiagnostic, Result};
 
 use crate::db::types::TrimmedDid;
···
     for guard in ks.prefix(signal_prefix) {
         let (k, _) = guard.into_inner().into_diagnostic()?;
         let val = std::str::from_utf8(&k[signal_prefix.len()..]).into_diagnostic()?;
-        config.signals.push(smol_str::SmolStr::new(val));
+        config.signals.push(Nsid::new(val)?.into_static());
     }
 
     let col_prefix = [COLLECTION_PREFIX, SEP];
     for guard in ks.prefix(col_prefix) {
         let (k, _) = guard.into_inner().into_diagnostic()?;
         let val = std::str::from_utf8(&k[col_prefix.len()..]).into_diagnostic()?;
-        config.collections.push(smol_str::SmolStr::new(val));
+        config.collections.push(Nsid::new(val)?.into_static());
     }
 
     Ok(config)
···
     let config = load(&ks)?;
     assert_eq!(config.mode, FilterMode::Filter);
-    assert_eq!(config.signals, vec!["a.b.c"]);
-    assert_eq!(config.collections, vec!["d.e.f"]);
+    assert_eq!(
+        config.signals,
+        vec![Nsid::new("a.b.c").unwrap().into_static()]
+    );
+    assert_eq!(
+        config.collections,
+        vec![Nsid::new("d.e.f").unwrap().into_static()]
+    );
 
     let excludes = read_set(&ks, EXCLUDE_PREFIX)?;
     assert_eq!(excludes, vec!["did:plc:yk4q3id7id6p5z3bypvshc64"]);

src/filter.rs (+7 −8)

···
+use jacquard::types::nsid::Nsid;
 use serde::{Deserialize, Serialize};
-use smol_str::SmolStr;
 use std::sync::Arc;
 
 pub type FilterHandle = Arc<arc_swap::ArcSwap<FilterConfig>>;
···
     Full = 2,
 }
 
-/// hot-path in-memory config: only the small fields needed on every event.
-/// dids and excludes are large sets kept in the filter keyspace only.
 #[derive(Debug, Clone, Serialize)]
 pub struct FilterConfig {
     pub mode: FilterMode,
-    pub signals: Vec<SmolStr>,
-    pub collections: Vec<SmolStr>,
+    pub signals: Vec<Nsid<'static>>,
+    pub collections: Vec<Nsid<'static>>,
 }
 
 impl FilterConfig {
···
         }
     }
 
-    /// returns true if the collection matches the content filter.
-    /// if collections is empty, all collections match.
     pub fn matches_collection(&self, collection: &str) -> bool {
         if self.collections.is_empty() {
             return true;
···
         self.collections.iter().any(|p| nsid_matches(p, collection))
     }
 
-    /// returns true if the commit touches a collection covered by a signal.
    pub fn matches_signal(&self, collection: &str) -> bool {
         self.signals.iter().any(|p| nsid_matches(p, collection))
+    }
+
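+    /// returns true if any signal uses a trailing `.*` wildcard
+    /// (in which case the crawler's listRecords pre-check can't be used).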
5656+ self.signals.iter().any(|s| s.ends_with(".*"))
5857 }
5958}
6059