tangled
alpha
login
or
join now
ptr.pet
/
hydrant
24
fork
atom
at protocol indexer with flexible filtering, xrpc queries, and a cursor-backed event stream, built on fjall
at-protocol
atproto
indexer
rust
fjall
24
fork
atom
overview
issues
6
pulls
pipelines
[crawler] refactor HTTP backoff and structured logging
ptr.pet
1 week ago
5ba9461d
99d1ffac
verified
This commit was signed with the committer's known signature.
ptr.pet
SSH Key Fingerprint:
SHA256:Abmvag+juovVufZTxyWY8KcVgrznxvBjQpJesv071Aw=
+251
-284
4 changed files
expand all
collapse all
unified
split
Cargo.lock
Cargo.toml
src
crawler
mod.rs
resolver.rs
-60
Cargo.lock
···
1540
1540
"mimalloc",
1541
1541
"rand 0.10.0",
1542
1542
"reqwest",
1543
1543
-
"reqwest-middleware",
1544
1544
-
"reqwest-retry",
1545
1543
"rmp-serde",
1546
1544
"rustls",
1547
1545
"scc",
···
2866
2864
]
2867
2865
2868
2866
[[package]]
2869
2869
-
name = "reqwest-middleware"
2870
2870
-
version = "0.5.1"
2871
2871
-
source = "registry+https://github.com/rust-lang/crates.io-index"
2872
2872
-
checksum = "199dda04a536b532d0cc04d7979e39b1c763ea749bf91507017069c00b96056f"
2873
2873
-
dependencies = [
2874
2874
-
"anyhow",
2875
2875
-
"async-trait",
2876
2876
-
"http",
2877
2877
-
"reqwest",
2878
2878
-
"thiserror 2.0.18",
2879
2879
-
"tower-service",
2880
2880
-
]
2881
2881
-
2882
2882
-
[[package]]
2883
2883
-
name = "reqwest-retry"
2884
2884
-
version = "0.9.1"
2885
2885
-
source = "registry+https://github.com/rust-lang/crates.io-index"
2886
2886
-
checksum = "fe2412db2af7d2268e7a5406be0431f37d9eb67ff390f35b395716f5f06c2eaa"
2887
2887
-
dependencies = [
2888
2888
-
"anyhow",
2889
2889
-
"async-trait",
2890
2890
-
"futures",
2891
2891
-
"getrandom 0.2.17",
2892
2892
-
"http",
2893
2893
-
"hyper",
2894
2894
-
"reqwest",
2895
2895
-
"reqwest-middleware",
2896
2896
-
"retry-policies",
2897
2897
-
"thiserror 2.0.18",
2898
2898
-
"tokio",
2899
2899
-
"tracing",
2900
2900
-
"wasmtimer",
2901
2901
-
]
2902
2902
-
2903
2903
-
[[package]]
2904
2867
name = "resolv-conf"
2905
2868
version = "0.7.6"
2906
2869
source = "registry+https://github.com/rust-lang/crates.io-index"
2907
2870
checksum = "1e061d1b48cb8d38042de4ae0a7a6401009d6143dc80d2e2d6f31f0bdd6470c7"
2908
2908
-
2909
2909
-
[[package]]
2910
2910
-
name = "retry-policies"
2911
2911
-
version = "0.5.1"
2912
2912
-
source = "registry+https://github.com/rust-lang/crates.io-index"
2913
2913
-
checksum = "46a4bd6027df676bcb752d3724db0ea3c0c5fc1dd0376fec51ac7dcaf9cc69be"
2914
2914
-
dependencies = [
2915
2915
-
"rand 0.9.2",
2916
2916
-
]
2917
2871
2918
2872
[[package]]
2919
2873
name = "rfc6979"
···
4184
4138
"hashbrown 0.15.5",
4185
4139
"indexmap",
4186
4140
"semver",
4187
4187
-
]
4188
4188
-
4189
4189
-
[[package]]
4190
4190
-
name = "wasmtimer"
4191
4191
-
version = "0.4.3"
4192
4192
-
source = "registry+https://github.com/rust-lang/crates.io-index"
4193
4193
-
checksum = "1c598d6b99ea013e35844697fc4670d08339d5cda15588f193c6beedd12f644b"
4194
4194
-
dependencies = [
4195
4195
-
"futures",
4196
4196
-
"js-sys",
4197
4197
-
"parking_lot",
4198
4198
-
"pin-utils",
4199
4199
-
"slab",
4200
4200
-
"wasm-bindgen",
4201
4141
]
4202
4142
4203
4143
[[package]]
-2
Cargo.toml
···
23
23
smol_str = "0.3"
24
24
futures = "0.3"
25
25
reqwest = { version = "0.13.2", features = ["json", "rustls", "stream", "gzip", "brotli", "zstd", "http2"], default-features = false }
26
26
-
reqwest-middleware = { version = "0.5.1", default-features = false, features = ["http2", "rustls"] }
27
27
-
reqwest-retry = { version = "0.9.1" }
28
26
axum = { version = "0.8.8", features = ["ws", "macros"] }
29
27
tower-http = { version = "0.6.6", features = ["cors", "trace"] }
30
28
+250
-222
src/crawler/mod.rs
···
9
9
use rand::Rng;
10
10
use rand::RngExt;
11
11
use rand::rngs::SmallRng;
12
12
-
use reqwest_middleware::{ClientBuilder, ClientWithMiddleware};
13
13
-
use reqwest_retry::Jitter;
14
14
-
use reqwest_retry::{
15
15
-
RetryTransientMiddleware, Retryable, RetryableStrategy, default_on_request_failure,
16
16
-
default_on_request_success, policies::ExponentialBackoff,
17
17
-
};
18
18
-
use smol_str::{SmolStr, ToSmolStr};
19
19
-
use std::error::Error;
12
12
+
use reqwest::StatusCode;
13
13
+
use smol_str::SmolStr;
14
14
+
use std::future::Future;
20
15
use std::sync::Arc;
21
16
use std::sync::atomic::{AtomicUsize, Ordering};
22
17
use std::time::Duration;
23
23
-
use tracing::{debug, error, info, trace, warn};
18
18
+
use tracing::{Instrument, debug, error, info, trace, warn};
24
19
use url::Url;
25
20
26
21
enum CrawlCheckResult {
27
22
Signal,
28
23
NoSignal,
29
24
Ratelimited,
30
30
-
Failed,
25
25
+
Failed(Option<u16>),
31
26
}
32
27
33
33
-
struct NoTlsRetry;
28
28
+
/// outcome of [`retry_with_backoff`] when the operation does not succeed.
29
29
+
enum RetryOutcome<E> {
30
30
+
/// ratelimited after exhausting all retries
31
31
+
Ratelimited,
32
32
+
/// non-ratelimit failure, carrying the last error
33
33
+
Failed(E),
34
34
+
}
34
35
35
35
-
impl RetryableStrategy for NoTlsRetry {
36
36
-
fn handle(
37
37
-
&self,
38
38
-
res: &Result<reqwest::Response, reqwest_middleware::Error>,
39
39
-
) -> Option<Retryable> {
40
40
-
match res {
41
41
-
Ok(success) => default_on_request_success(success),
42
42
-
Err(error) => {
43
43
-
if let reqwest_middleware::Error::Reqwest(e) = error {
44
44
-
if e.is_timeout() {
45
45
-
return Some(Retryable::Fatal);
46
46
-
}
47
47
-
let mut src = e.source();
48
48
-
while let Some(s) = src {
49
49
-
if let Some(io_err) = s.downcast_ref::<std::io::Error>() {
50
50
-
if is_tls_cert_error(io_err) {
51
51
-
return Some(Retryable::Fatal);
52
52
-
}
36
36
+
/// retries an async operation with exponential backoff when ratelimited.
37
37
+
///
38
38
+
/// `op` is called on each attempt and returns `Result<T, E>`.
39
39
+
/// `is_ratelimited` classifies an error as a ratelimit (triggering a retry)
40
40
+
/// versus a fatal failure (returning immediately).
41
41
+
async fn retry_with_backoff<T, E, F, Fut>(
42
42
+
rng: &mut SmallRng,
43
43
+
max_retries: u32,
44
44
+
mut op: F,
45
45
+
is_ratelimited: impl Fn(&E) -> bool,
46
46
+
) -> Result<T, RetryOutcome<E>>
47
47
+
where
48
48
+
F: FnMut() -> Fut,
49
49
+
Fut: Future<Output = Result<T, E>>,
50
50
+
{
51
51
+
let mut attempt = 0u32;
52
52
+
loop {
53
53
+
match op().await {
54
54
+
Ok(val) => return Ok(val),
55
55
+
Err(e) if is_ratelimited(&e) => {
56
56
+
if attempt < max_retries {
57
57
+
let base = Duration::from_secs(1 << attempt);
58
58
+
let jitter = Duration::from_millis(rng.random_range(0..2000));
59
59
+
tokio::time::sleep(base + jitter).await;
60
60
+
attempt += 1;
61
61
+
} else {
62
62
+
return Err(RetryOutcome::Ratelimited);
63
63
+
}
64
64
+
}
65
65
+
Err(e) => return Err(RetryOutcome::Failed(e)),
66
66
+
}
67
67
+
}
68
68
+
}
69
69
+
70
70
+
async fn check_repo_signals(
71
71
+
http: Arc<reqwest::Client>,
72
72
+
resolver: crate::resolver::Resolver,
73
73
+
filter: Arc<crate::filter::FilterConfig>,
74
74
+
did: Did<'static>,
75
75
+
) -> (Did<'static>, CrawlCheckResult) {
76
76
+
const MAX_RETRIES: u32 = 5;
77
77
+
let mut rng: SmallRng = rand::make_rng();
78
78
+
79
79
+
let pds_url = retry_with_backoff(
80
80
+
&mut rng,
81
81
+
MAX_RETRIES,
82
82
+
|| resolver.resolve_identity_info(&did),
83
83
+
|e| matches!(e, crate::resolver::ResolverError::Ratelimited),
84
84
+
);
85
85
+
let pds_url = match pds_url.await {
86
86
+
Ok((url, _)) => url,
87
87
+
Err(RetryOutcome::Ratelimited) => {
88
88
+
error!(
89
89
+
retries = MAX_RETRIES,
90
90
+
"rate limited resolving identity, giving up"
91
91
+
);
92
92
+
return (did, CrawlCheckResult::Ratelimited);
93
93
+
}
94
94
+
Err(RetryOutcome::Failed(e)) => {
95
95
+
error!(err = %e, "failed to resolve identity");
96
96
+
return (did, CrawlCheckResult::Failed(None));
97
97
+
}
98
98
+
};
99
99
+
100
100
+
let mut found_signal = false;
101
101
+
for signal in filter.signals.iter() {
102
102
+
let res = async {
103
103
+
let mut list_records_url = pds_url.join("/xrpc/com.atproto.repo.listRecords").unwrap();
104
104
+
list_records_url
105
105
+
.query_pairs_mut()
106
106
+
.append_pair("repo", &did)
107
107
+
.append_pair("collection", signal)
108
108
+
.append_pair("limit", "1");
109
109
+
110
110
+
let res = retry_with_backoff(
111
111
+
&mut rng,
112
112
+
MAX_RETRIES,
113
113
+
|| async {
114
114
+
http.get(list_records_url.clone())
115
115
+
.send()
116
116
+
.await?
117
117
+
.error_for_status()
118
118
+
},
119
119
+
|e: &reqwest::Error| e.status() == Some(StatusCode::TOO_MANY_REQUESTS),
120
120
+
);
121
121
+
let res = match res.await {
122
122
+
Ok(r) => r,
123
123
+
Err(RetryOutcome::Ratelimited) => {
124
124
+
warn!(
125
125
+
retries = MAX_RETRIES,
126
126
+
"rate limited on listRecords, giving up"
127
127
+
);
128
128
+
return CrawlCheckResult::Ratelimited;
129
129
+
}
130
130
+
Err(RetryOutcome::Failed(e)) => {
131
131
+
match e.status() {
132
132
+
Some(StatusCode::NOT_FOUND | StatusCode::GONE) => {
133
133
+
trace!("repo not found");
53
134
}
54
54
-
src = s.source();
135
135
+
Some(s) if s.is_client_error() => {
136
136
+
error!(status = %s, "repo unavailable");
137
137
+
}
138
138
+
_ => {
139
139
+
error!(err = %e, "listRecords failed");
140
140
+
return CrawlCheckResult::Failed(e.status().map(|s| s.as_u16()));
141
141
+
}
55
142
}
143
143
+
return CrawlCheckResult::NoSignal;
56
144
}
57
57
-
let retryable = default_on_request_failure(error);
58
58
-
if retryable == Some(Retryable::Transient) {
59
59
-
if let reqwest_middleware::Error::Reqwest(e) = error {
60
60
-
let url = e.url().map(|u| u.as_str()).unwrap_or("unknown url");
61
61
-
let status = e
62
62
-
.status()
63
63
-
.map(|s| s.to_smolstr())
64
64
-
.unwrap_or_else(|| "unknown status".into());
65
65
-
warn!("retrying request {url}: {status}");
145
145
+
};
146
146
+
147
147
+
let bytes = match res.bytes().await {
148
148
+
Ok(b) => b,
149
149
+
Err(e) => {
150
150
+
error!(err = %e, "failed to read listRecords response");
151
151
+
return CrawlCheckResult::Failed(None);
152
152
+
}
153
153
+
};
154
154
+
155
155
+
match serde_json::from_slice::<ListRecordsOutput>(&bytes) {
156
156
+
Ok(out) => {
157
157
+
if !out.records.is_empty() {
158
158
+
return CrawlCheckResult::Signal;
66
159
}
67
160
}
68
68
-
retryable
161
161
+
Err(e) => {
162
162
+
error!(err = %e, "failed to parse listRecords response");
163
163
+
return CrawlCheckResult::Failed(None);
164
164
+
}
165
165
+
}
166
166
+
167
167
+
CrawlCheckResult::NoSignal
168
168
+
}
169
169
+
.instrument(tracing::info_span!("signal_check", signal = %signal))
170
170
+
.await;
171
171
+
172
172
+
match res {
173
173
+
CrawlCheckResult::Signal => {
174
174
+
found_signal = true;
175
175
+
break;
176
176
+
}
177
177
+
CrawlCheckResult::NoSignal => {
178
178
+
continue;
179
179
+
}
180
180
+
other => {
181
181
+
return (did, other);
69
182
}
70
183
}
71
184
}
72
72
-
}
73
185
74
74
-
fn is_tls_cert_error(io_err: &std::io::Error) -> bool {
75
75
-
let Some(inner) = io_err.get_ref() else {
76
76
-
return false;
77
77
-
};
78
78
-
if let Some(rustls_err) = inner.downcast_ref::<rustls::Error>() {
79
79
-
return matches!(rustls_err, rustls::Error::InvalidCertificate(_));
186
186
+
if !found_signal {
187
187
+
trace!("no signal-matching records found");
80
188
}
81
81
-
if let Some(nested_io) = inner.downcast_ref::<std::io::Error>() {
82
82
-
return is_tls_cert_error(nested_io);
83
83
-
}
84
84
-
false
189
189
+
190
190
+
(
191
191
+
did,
192
192
+
found_signal
193
193
+
.then_some(CrawlCheckResult::Signal)
194
194
+
.unwrap_or(CrawlCheckResult::NoSignal),
195
195
+
)
85
196
}
86
197
87
198
pub struct Crawler {
88
199
state: Arc<AppState>,
89
200
relay_host: Url,
90
90
-
http: Arc<ClientWithMiddleware>,
201
201
+
http: Arc<reqwest::Client>,
91
202
max_pending: usize,
92
203
resume_pending: usize,
93
204
count: Arc<AtomicUsize>,
···
100
211
max_pending: usize,
101
212
resume_pending: usize,
102
213
) -> Self {
103
103
-
let retry_policy = ExponentialBackoff::builder()
104
104
-
.jitter(Jitter::Bounded)
105
105
-
.build_with_max_retries(5);
106
106
-
let reqwest_client = reqwest::Client::builder()
107
107
-
.user_agent(concat!(
108
108
-
env!("CARGO_PKG_NAME"),
109
109
-
"/",
110
110
-
env!("CARGO_PKG_VERSION")
111
111
-
))
112
112
-
.gzip(true)
113
113
-
.build()
114
114
-
.expect("that reqwest will build");
115
115
-
116
116
-
let http = ClientBuilder::new(reqwest_client)
117
117
-
.with(RetryTransientMiddleware::new_with_policy_and_strategy(
118
118
-
retry_policy,
119
119
-
NoTlsRetry,
120
120
-
))
121
121
-
.build();
122
122
-
let http = Arc::new(http);
214
214
+
let http = Arc::new(
215
215
+
reqwest::Client::builder()
216
216
+
.user_agent(concat!(
217
217
+
env!("CARGO_PKG_NAME"),
218
218
+
"/",
219
219
+
env!("CARGO_PKG_VERSION")
220
220
+
))
221
221
+
.gzip(true)
222
222
+
.build()
223
223
+
.expect("that reqwest will build"),
224
224
+
);
123
225
124
226
Self {
125
227
state,
···
132
234
}
133
235
134
236
pub async fn run(self) -> Result<()> {
135
135
-
info!("crawler started");
136
136
-
137
237
tokio::spawn({
238
238
+
use std::time::Instant;
138
239
let count = self.count.clone();
139
139
-
let mut last_time = std::time::Instant::now();
240
240
+
let mut last_time = Instant::now();
140
241
let mut interval = tokio::time::interval(Duration::from_secs(60));
141
242
async move {
142
243
loop {
143
244
interval.tick().await;
144
245
let delta = count.swap(0, Ordering::Relaxed);
145
246
if delta == 0 {
247
247
+
debug!("no repos processed in 60s");
146
248
continue;
147
249
}
148
250
let elapsed = last_time.elapsed().as_secs_f64();
149
149
-
let rate = if elapsed > 0.0 {
150
150
-
delta as f64 / elapsed
151
151
-
} else {
152
152
-
0.0
153
153
-
};
154
154
-
info!("crawler: {rate:.2} repos/s ({delta} repos in {elapsed:.1}s)");
155
155
-
last_time = std::time::Instant::now();
251
251
+
let rate = (elapsed > 0.0)
252
252
+
.then(|| delta as f64 / elapsed)
253
253
+
.unwrap_or(0.0);
254
254
+
info!(rate, delta, elapsed, "crawler progress");
255
255
+
last_time = Instant::now();
156
256
}
157
257
}
158
258
});
159
259
160
160
-
let mut api_url = self.relay_host.clone();
161
161
-
if api_url.scheme() == "wss" {
162
162
-
api_url
260
260
+
let mut relay_url = self.relay_host.clone();
261
261
+
match relay_url.scheme() {
262
262
+
"wss" => relay_url
163
263
.set_scheme("https")
164
164
-
.map_err(|_| miette::miette!("invalid url: {api_url}"))?;
165
165
-
} else if api_url.scheme() == "ws" {
166
166
-
api_url
264
264
+
.map_err(|_| miette::miette!("invalid url: {relay_url}"))?,
265
265
+
"ws" => relay_url
167
266
.set_scheme("http")
168
168
-
.map_err(|_| miette::miette!("invalid url: {api_url}"))?;
267
267
+
.map_err(|_| miette::miette!("invalid url: {relay_url}"))?,
268
268
+
_ => {}
169
269
}
170
270
171
271
let mut rng: SmallRng = rand::make_rng();
···
178
278
.await?
179
279
.map(|bytes| {
180
280
let s = String::from_utf8_lossy(&bytes);
181
181
-
info!("resuming crawler from cursor: {s}");
281
281
+
info!(cursor = %s, "resuming");
182
282
s.into()
183
283
});
184
284
let mut was_throttled = false;
···
190
290
if pending > self.max_pending as u64 {
191
291
if !was_throttled {
192
292
debug!(
193
193
-
"crawler throttling: pending repos {} > max {}, sleeping...",
194
194
-
pending, self.max_pending
293
293
+
pending,
294
294
+
max = self.max_pending,
295
295
+
"throttling: above max pending"
195
296
);
196
297
was_throttled = true;
197
298
}
198
198
-
tokio::time::sleep(Duration::from_secs(10)).await;
199
299
} else if pending > self.resume_pending as u64 {
200
300
if !was_throttled {
201
301
debug!(
202
202
-
"crawler throttling: pending repos {} > max {}, entering cooldown...",
203
203
-
pending, self.max_pending
302
302
+
pending,
303
303
+
resume = self.resume_pending,
304
304
+
"throttling: entering cooldown"
204
305
);
205
306
was_throttled = true;
206
307
}
207
308
208
208
-
while self.state.db.get_count("pending").await > self.resume_pending as u64 {
309
309
+
loop {
310
310
+
let current_pending = self.state.db.get_count("pending").await;
311
311
+
if current_pending <= self.resume_pending as u64 {
312
312
+
break;
313
313
+
}
209
314
debug!(
210
210
-
"crawler cooldown: pending repos {} > resume {}, sleeping...",
211
211
-
self.state.db.get_count("pending").await,
212
212
-
self.resume_pending
315
315
+
pending = current_pending,
316
316
+
resume = self.resume_pending,
317
317
+
"cooldown, waiting"
213
318
);
214
214
-
tokio::time::sleep(Duration::from_secs(10)).await;
215
319
}
216
320
break;
217
321
} else {
218
322
if was_throttled {
219
219
-
info!("crawler resuming: throttling released");
323
323
+
info!("throttling released");
220
324
was_throttled = false;
221
325
}
222
326
break;
···
224
328
}
225
329
226
330
// 2. fetch listrepos
227
227
-
let mut list_repos_url = api_url
331
331
+
let mut list_repos_url = relay_url
228
332
.join("/xrpc/com.atproto.sync.listRepos")
229
333
.into_diagnostic()?;
230
334
list_repos_url
···
238
342
239
343
let res_result = self.http.get(list_repos_url.clone()).send().await;
240
344
let bytes = match res_result {
241
241
-
Ok(res) => match res.bytes().await {
242
242
-
Ok(b) => b,
243
243
-
Err(e) => {
244
244
-
error!(
245
245
-
"crawler failed to parse list repos response: {e}. retrying in 30s..."
246
246
-
);
247
247
-
tokio::time::sleep(Duration::from_secs(30)).await;
248
248
-
continue;
345
345
+
Ok(res) => {
346
346
+
match res.status() {
347
347
+
StatusCode::TOO_MANY_REQUESTS => {
348
348
+
warn!("rate limited by relay");
349
349
+
continue;
350
350
+
}
351
351
+
s if !s.is_success() => {
352
352
+
error!(status = %s, "cant crawl");
353
353
+
continue;
354
354
+
}
355
355
+
_ => {}
356
356
+
}
357
357
+
match res.bytes().await {
358
358
+
Ok(b) => b,
359
359
+
Err(e) => {
360
360
+
error!(err = %e, "cant read listRepos");
361
361
+
continue;
362
362
+
}
249
363
}
250
250
-
},
364
364
+
}
251
365
Err(e) => {
252
252
-
error!("crawler failed to list repos: {e}. retrying in 30s...");
253
253
-
tokio::time::sleep(Duration::from_secs(30)).await;
366
366
+
error!(err = %e, "crawler failed to list repos");
254
367
continue;
255
368
}
256
369
};
···
259
372
.into_static();
260
373
261
374
if output.repos.is_empty() {
262
262
-
info!("crawler finished enumeration (or empty page). sleeping for 1 hour.");
375
375
+
info!("finished enumeration (or empty page)");
263
376
tokio::time::sleep(Duration::from_secs(3600)).await;
264
377
continue;
265
378
}
266
379
267
267
-
debug!("crawler fetched {} repos...", output.repos.len());
380
380
+
debug!(count = output.repos.len(), "fetched repos");
268
381
269
382
let mut batch = db.inner.batch();
270
383
let mut to_queue = Vec::new();
···
301
414
302
415
for did in &valid_dids {
303
416
let did_key = keys::repo_key(did);
304
304
-
trace!("crawler found new repo: {did}");
417
417
+
trace!(did = %did, "found new repo");
305
418
306
419
let state = RepoState::backfilling_untracked(rng.next_u64());
307
420
batch.insert(&db.repos, &did_key, ser_repo_state(&state)?);
···
320
433
);
321
434
} else {
322
435
// end of pagination
323
323
-
info!("crawler reached end of list.");
436
436
+
info!("reached end of list.");
324
437
cursor = None;
325
438
}
326
439
···
342
455
async fn check_signals_batch(
343
456
&self,
344
457
dids: &[Did<'static>],
345
345
-
filter: &crate::filter::FilterConfig,
458
458
+
filter: &Arc<crate::filter::FilterConfig>,
346
459
batch: &mut fjall::OwnedWriteBatch,
347
460
) -> Result<Vec<Did<'static>>> {
348
461
let db = &self.state.db;
···
354
467
let http = self.http.clone();
355
468
let resolver = self.state.resolver.clone();
356
469
let filter = filter.clone();
357
357
-
set.spawn(async move {
358
358
-
const MAX_RETRIES: u32 = 5;
359
359
-
let mut rng: SmallRng = rand::make_rng();
360
360
-
361
361
-
let pds_url = {
362
362
-
let mut attempt = 0u32;
363
363
-
loop {
364
364
-
match resolver.resolve_identity_info(&did).await {
365
365
-
Ok((url, _)) => break url,
366
366
-
Err(crate::resolver::ResolverError::Ratelimited)
367
367
-
if attempt < MAX_RETRIES =>
368
368
-
{
369
369
-
let base = Duration::from_secs(1 << attempt);
370
370
-
let jitter = Duration::from_millis(rng.random_range(0..2000));
371
371
-
let try_in = base + jitter;
372
372
-
debug!(
373
373
-
"crawler: rate limited resolving {did}, retry {}/{MAX_RETRIES} in {}s",
374
374
-
attempt + 1,
375
375
-
try_in.as_secs_f64()
376
376
-
);
377
377
-
tokio::time::sleep(try_in).await;
378
378
-
attempt += 1;
379
379
-
}
380
380
-
Err(crate::resolver::ResolverError::Ratelimited) => {
381
381
-
error!(
382
382
-
"crawler: rate limited resolving {did} after {MAX_RETRIES} retries"
383
383
-
);
384
384
-
return (did, CrawlCheckResult::Ratelimited);
385
385
-
}
386
386
-
Err(e) => {
387
387
-
error!("crawler: failed to resolve {did}: {e}");
388
388
-
return (did, CrawlCheckResult::Failed);
389
389
-
}
390
390
-
}
391
391
-
}
392
392
-
};
393
393
-
394
394
-
let mut found_signal = false;
395
395
-
for signal in filter.signals.iter() {
396
396
-
let mut list_records_url =
397
397
-
pds_url.join("/xrpc/com.atproto.repo.listRecords").unwrap();
398
398
-
list_records_url
399
399
-
.query_pairs_mut()
400
400
-
.append_pair("repo", &did)
401
401
-
.append_pair("collection", signal)
402
402
-
.append_pair("limit", "1");
403
403
-
404
404
-
let res = http
405
405
-
.get(list_records_url)
406
406
-
.send()
407
407
-
.await
408
408
-
.into_diagnostic()
409
409
-
.map(|res| res.error_for_status().into_diagnostic())
410
410
-
.flatten();
411
411
-
match res {
412
412
-
Ok(res) => {
413
413
-
let Ok(bytes) = res.bytes().await else {
414
414
-
error!(
415
415
-
"failed to read bytes from listRecords response for repo {did}, signal {signal}"
416
416
-
);
417
417
-
return (did, CrawlCheckResult::Failed);
418
418
-
};
419
419
-
match serde_json::from_slice::<ListRecordsOutput>(&bytes) {
420
420
-
Ok(out) => {
421
421
-
if !out.records.is_empty() {
422
422
-
found_signal = true;
423
423
-
break;
424
424
-
}
425
425
-
}
426
426
-
Err(e) => {
427
427
-
error!(
428
428
-
"failed to parse listRecords response for repo {did}, signal {signal}: {e}"
429
429
-
);
430
430
-
return (did, CrawlCheckResult::Failed);
431
431
-
}
432
432
-
}
433
433
-
}
434
434
-
Err(e) => {
435
435
-
error!(
436
436
-
"failed to listRecords for repo {did}, signal {signal}: {e}"
437
437
-
);
438
438
-
return (did, CrawlCheckResult::Failed);
439
439
-
}
440
440
-
}
441
441
-
}
442
442
-
443
443
-
if found_signal {
444
444
-
(did, CrawlCheckResult::Signal)
445
445
-
} else {
446
446
-
trace!("crawler skipped repo {did}: no records match signals");
447
447
-
(did, CrawlCheckResult::NoSignal)
448
448
-
}
449
449
-
});
470
470
+
let span = tracing::info_span!("check_signals", did = %did);
471
471
+
set.spawn(check_repo_signals(http, resolver, filter, did).instrument(span));
450
472
}
451
473
452
474
while let Some(res) = set.join_next().await {
···
459
481
CrawlCheckResult::NoSignal => {
460
482
batch.remove(&db.crawler, keys::crawler_failed_key(&did));
461
483
}
462
462
-
CrawlCheckResult::Ratelimited | CrawlCheckResult::Failed => {
463
463
-
batch.insert(&db.crawler, keys::crawler_failed_key(&did), []);
484
484
+
CrawlCheckResult::Ratelimited => {
485
485
+
batch.insert(
486
486
+
&db.crawler,
487
487
+
keys::crawler_failed_key(&did),
488
488
+
429u16.to_be_bytes().as_ref(),
489
489
+
);
490
490
+
}
491
491
+
CrawlCheckResult::Failed(status) => {
492
492
+
let code = status.unwrap_or(0);
493
493
+
batch.insert(
494
494
+
&db.crawler,
495
495
+
keys::crawler_failed_key(&did),
496
496
+
code.to_be_bytes().as_ref(),
497
497
+
);
464
498
}
465
499
}
466
500
}
···
482
516
483
517
let mut failed_dids = Vec::new();
484
518
for guard in db.crawler.prefix(keys::CRAWLER_FAILED_PREFIX) {
485
485
-
let (key, _) = guard.into_inner().into_diagnostic()?;
519
519
+
let key = guard.key().into_diagnostic()?;
486
520
let did_bytes = &key[keys::CRAWLER_FAILED_PREFIX.len()..];
487
521
let trimmed = TrimmedDid::try_from(did_bytes)?;
488
522
failed_dids.push(trimmed.to_did());
···
492
526
return Ok(());
493
527
}
494
528
495
495
-
info!(
496
496
-
"crawler: retrying {} previously failed repos",
497
497
-
failed_dids.len()
498
498
-
);
529
529
+
info!("retrying {} previously failed repos", failed_dids.len());
499
530
500
531
let mut batch = db.inner.batch();
501
532
let valid_dids = self
···
519
550
.into_diagnostic()??;
520
551
521
552
if !valid_dids.is_empty() {
522
522
-
info!(
523
523
-
"crawler: recovered {} repos from failed retry",
524
524
-
valid_dids.len()
525
525
-
);
553
553
+
info!("recovered {} repos from failed retry", valid_dids.len());
526
554
self.account_new_repos(valid_dids.len()).await;
527
555
}
528
556
+1
src/resolver.rs
···
138
138
}
139
139
}
140
140
141
141
+
#[inline]
141
142
async fn resolve_doc(&self, did: &Did<'_>) -> Result<MiniDoc, ResolverError> {
142
143
let did_static = did.clone().into_static();
143
144
if let Some(entry) = self.inner.cache.get_async(&did_static).await {