tangled
alpha
login
or
join now
ptr.pet
/
hydrant
26
fork
atom
at protocol indexer with flexible filtering, xrpc queries, and a cursor-backed event stream, built on fjall
at-protocol
atproto
indexer
rust
fjall
26
fork
atom
overview
issues
6
pulls
pipelines
[crawler] limit concurrency per pds, implement smarter retries
ptr.pet
1 week ago
218ce852
571eb26d
verified
This commit was signed with the committer's
known signature
.
ptr.pet
SSH Key Fingerprint:
SHA256:Abmvag+juovVufZTxyWY8KcVgrznxvBjQpJesv071Aw=
+512
-325
7 changed files
expand all
collapse all
unified
split
src
crawler
ban.rs
mod.rs
throttle.rs
db
keys.rs
filter.rs
ingest
worker.rs
types.rs
-117
src/crawler/ban.rs
···
1
1
-
use scc::HashMap;
2
2
-
use std::future::Future;
3
3
-
use std::sync::Arc;
4
4
-
use std::sync::atomic::{AtomicI64, AtomicUsize, Ordering};
5
5
-
use tokio::sync::Notify;
6
6
-
use url::Url;
7
7
-
8
8
-
#[derive(Clone)]
9
9
-
pub struct BanTracker {
10
10
-
states: Arc<HashMap<Url, Arc<State>>>,
11
11
-
}
12
12
-
13
13
-
impl BanTracker {
14
14
-
pub fn new() -> Self {
15
15
-
Self {
16
16
-
states: Arc::new(HashMap::new()),
17
17
-
}
18
18
-
}
19
19
-
20
20
-
pub fn get_handle(&self, url: &Url) -> BanHandle {
21
21
-
let state = self
22
22
-
.states
23
23
-
.entry_sync(url.clone())
24
24
-
.or_insert_with(|| {
25
25
-
Arc::new(State {
26
26
-
banned_until: AtomicI64::new(0),
27
27
-
consecutive_failures: AtomicUsize::new(0),
28
28
-
ban_notify: Notify::new(),
29
29
-
})
30
30
-
})
31
31
-
.get()
32
32
-
.clone();
33
33
-
34
34
-
BanHandle { state }
35
35
-
}
36
36
-
}
37
37
-
38
38
-
struct State {
39
39
-
banned_until: AtomicI64,
40
40
-
consecutive_failures: AtomicUsize,
41
41
-
ban_notify: Notify,
42
42
-
}
43
43
-
44
44
-
pub struct BanHandle {
45
45
-
state: Arc<State>,
46
46
-
}
47
47
-
48
48
-
impl BanHandle {
49
49
-
pub fn is_banned(&self) -> bool {
50
50
-
let until = self.state.banned_until.load(Ordering::Acquire);
51
51
-
if until == 0 {
52
52
-
return false;
53
53
-
}
54
54
-
let now = chrono::Utc::now().timestamp();
55
55
-
now < until
56
56
-
}
57
57
-
58
58
-
pub fn record_success(&self) {
59
59
-
self.state.consecutive_failures.store(0, Ordering::Release);
60
60
-
self.state.banned_until.store(0, Ordering::Release);
61
61
-
}
62
62
-
63
63
-
// returns the amount of minutes banned if its a new ban
64
64
-
pub fn record_failure(&self) -> Option<i64> {
65
65
-
if self.is_banned() {
66
66
-
return None;
67
67
-
}
68
68
-
69
69
-
let failures = self
70
70
-
.state
71
71
-
.consecutive_failures
72
72
-
.fetch_add(1, Ordering::AcqRel)
73
73
-
+ 1;
74
74
-
75
75
-
// start with 30 minutes, double each consecutive failure
76
76
-
let base_minutes = 30;
77
77
-
let exponent = (failures as u32).saturating_sub(1);
78
78
-
let minutes = base_minutes * 2i64.pow(exponent.min(10));
79
79
-
let now = chrono::Utc::now().timestamp();
80
80
-
81
81
-
self.state
82
82
-
.banned_until
83
83
-
.store(now + minutes * 60, Ordering::Release);
84
84
-
85
85
-
self.state.ban_notify.notify_waiters();
86
86
-
87
87
-
Some(minutes)
88
88
-
}
89
89
-
90
90
-
pub async fn wait_for_ban(&self) {
91
91
-
loop {
92
92
-
let notified = self.state.ban_notify.notified();
93
93
-
if self.is_banned() {
94
94
-
return;
95
95
-
}
96
96
-
notified.await;
97
97
-
}
98
98
-
}
99
99
-
}
100
100
-
101
101
-
/// extension trait that adds `.or_ban()` to any future returning `Result<T, E>`.
102
102
-
#[allow(async_fn_in_trait)]
103
103
-
pub trait OrBan<T, E>: Future<Output = Result<T, E>> {
104
104
-
/// races the future against a ban notification.
105
105
-
/// if the pds is banned before the future completes, returns `on_ban()` immediately.
106
106
-
async fn or_ban(self, handle: &BanHandle, on_ban: impl FnOnce() -> E) -> Result<T, E>
107
107
-
where
108
108
-
Self: Sized,
109
109
-
{
110
110
-
tokio::select! {
111
111
-
res = self => res,
112
112
-
_ = handle.wait_for_ban() => Err(on_ban()),
113
113
-
}
114
114
-
}
115
115
-
}
116
116
-
117
117
-
impl<T, E, F: Future<Output = Result<T, E>>> OrBan<T, E> for F {}
+308
-201
src/crawler/mod.rs
···
1
1
-
use crate::db::types::TrimmedDid;
2
1
use crate::db::{Db, keys, ser_repo_state};
3
2
use crate::state::AppState;
4
3
use crate::types::RepoState;
5
5
-
use futures::TryFutureExt;
4
4
+
use futures::FutureExt;
6
5
use jacquard_api::com_atproto::repo::list_records::ListRecordsOutput;
7
6
use jacquard_api::com_atproto::sync::list_repos::ListReposOutput;
8
7
use jacquard_common::{IntoStatic, types::string::Did};
···
13
12
use reqwest::StatusCode;
14
13
use smol_str::SmolStr;
15
14
use std::future::Future;
15
15
+
use std::ops::Mul;
16
16
use std::sync::Arc;
17
17
use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
18
18
use std::time::Duration;
···
22
22
enum CrawlCheckResult {
23
23
Signal,
24
24
NoSignal,
25
25
-
Ratelimited,
26
26
-
Failed(Option<u16>),
25
25
+
/// task could not complete; should be retried at `retry_after` (unix timestamp).
26
26
+
/// `status` is the HTTP status that triggered this (0 for non-HTTP failures).
27
27
+
Retry {
28
28
+
retry_after: i64,
29
29
+
status: u16,
30
30
+
},
27
31
}
28
32
29
29
-
/// outcome of [`RetryWithBackoff::retry_with_backoff`] when the operation does not succeed.
33
33
+
/// outcome of [`RetryWithBackoff::retry`] when the operation does not succeed.
30
34
enum RetryOutcome<E> {
31
35
/// ratelimited after exhausting all retries
32
36
Ratelimited,
···
34
38
Failed(E),
35
39
}
36
40
37
37
-
/// extension trait that adds `retry_with_backoff` to async `FnMut` closures.
41
41
+
/// extension trait that adds `.retry()` to async `FnMut` closures.
42
42
+
///
43
43
+
/// `on_ratelimit` receives the error and current attempt number.
44
44
+
/// returning `Some(duration)` signals a transient failure and provides the backoff;
45
45
+
/// returning `None` signals a terminal failure.
38
46
trait RetryWithBackoff<T, E, Fut>: FnMut() -> Fut
39
47
where
40
48
Fut: Future<Output = Result<T, E>>,
···
42
50
async fn retry(
43
51
&mut self,
44
52
max_retries: u32,
45
45
-
is_ratelimited: impl Fn(&E) -> bool,
53
53
+
on_ratelimit: impl Fn(&E, u32) -> Option<Duration>,
46
54
) -> Result<T, RetryOutcome<E>> {
47
55
let mut attempt = 0u32;
48
56
loop {
49
57
match self().await {
50
58
Ok(val) => return Ok(val),
51
51
-
Err(e) if is_ratelimited(&e) => {
52
52
-
if attempt >= max_retries {
53
53
-
return Err(RetryOutcome::Ratelimited);
59
59
+
Err(e) => match on_ratelimit(&e, attempt) {
60
60
+
Some(_) if attempt >= max_retries => return Err(RetryOutcome::Ratelimited),
61
61
+
Some(backoff) => {
62
62
+
let jitter = Duration::from_millis(rand::rng().random_range(0..2000));
63
63
+
tokio::time::sleep(backoff + jitter).await;
64
64
+
attempt += 1;
54
65
}
55
55
-
let base = Duration::from_secs(1 << attempt);
56
56
-
let jitter = Duration::from_millis(rand::rng().random_range(0..2000));
57
57
-
tokio::time::sleep(base + jitter).await;
58
58
-
attempt += 1;
59
59
-
}
60
60
-
Err(e) => return Err(RetryOutcome::Failed(e)),
66
66
+
None => return Err(RetryOutcome::Failed(e)),
67
67
+
},
61
68
}
62
69
}
63
70
}
···
82
89
83
90
impl<F: Future<Output = Result<reqwest::Response, reqwest::Error>>> ErrorForStatus for F {}
84
91
85
85
-
// these two are cloudflare specific
92
92
+
/// extracts a retry delay in seconds from rate limit response headers.
93
93
+
///
94
94
+
/// checks in priority order:
95
95
+
/// - `retry-after: <seconds>` (relative)
96
96
+
/// - `ratelimit-reset: <unix timestamp>` (absolute) (ref pds sends this)
97
97
+
fn parse_retry_after(resp: &reqwest::Response) -> Option<u64> {
98
98
+
let headers = resp.headers();
99
99
+
100
100
+
let retry_after = headers
101
101
+
.get(reqwest::header::RETRY_AFTER)
102
102
+
.and_then(|v| v.to_str().ok())
103
103
+
.and_then(|s| s.parse::<u64>().ok());
104
104
+
105
105
+
let rate_limit_reset = headers
106
106
+
.get("ratelimit-reset")
107
107
+
.and_then(|v| v.to_str().ok())
108
108
+
.and_then(|s| s.parse::<i64>().ok())
109
109
+
.map(|ts| {
110
110
+
let now = chrono::Utc::now().timestamp();
111
111
+
(ts - now).max(1) as u64
112
112
+
});
113
113
+
114
114
+
retry_after.or(rate_limit_reset)
115
115
+
}
116
116
+
117
117
+
// cloudflare-specific status codes
86
118
const CONNECTION_TIMEOUT: StatusCode = unsafe {
87
119
match StatusCode::from_u16(522) {
88
120
Ok(s) => s,
89
89
-
_ => std::hint::unreachable_unchecked(), // status code is valid
121
121
+
_ => std::hint::unreachable_unchecked(),
90
122
}
91
123
};
92
124
const SITE_FROZEN: StatusCode = unsafe {
93
125
match StatusCode::from_u16(530) {
94
126
Ok(s) => s,
95
95
-
_ => std::hint::unreachable_unchecked(), // status code is valid
127
127
+
_ => std::hint::unreachable_unchecked(),
96
128
}
97
129
};
98
130
99
99
-
// we ban on:
100
100
-
// - timeouts
101
101
-
// - tls cert errors
102
102
-
// - bad gateway / gateway timeout, service unavailable, 522 and 530
103
103
-
fn is_ban_worthy(e: &reqwest::Error) -> bool {
131
131
+
fn is_throttle_worthy(e: &reqwest::Error) -> bool {
104
132
use std::error::Error;
105
133
106
134
if e.is_timeout() {
···
147
175
resolver: crate::resolver::Resolver,
148
176
filter: Arc<crate::filter::FilterConfig>,
149
177
did: Did<'static>,
150
150
-
tracker: Arc<BanTracker>,
178
178
+
throttler: Arc<Throttler>,
151
179
) -> (Did<'static>, CrawlCheckResult) {
152
180
const MAX_RETRIES: u32 = 5;
153
181
154
182
let pds_url = (|| resolver.resolve_identity_info(&did))
155
155
-
.retry(MAX_RETRIES, |e| {
183
183
+
.retry(MAX_RETRIES, |e, attempt| {
156
184
matches!(e, crate::resolver::ResolverError::Ratelimited)
185
185
+
.then(|| Duration::from_secs(1 << attempt.min(5)))
157
186
})
158
187
.await;
188
188
+
159
189
let pds_url = match pds_url {
160
190
Ok((url, _)) => url,
161
191
Err(RetryOutcome::Ratelimited) => {
···
163
193
retries = MAX_RETRIES,
164
194
"rate limited resolving identity, giving up"
165
195
);
166
166
-
return (did, CrawlCheckResult::Ratelimited);
196
196
+
// no pds handle to read retry_after from; use a short default
197
197
+
let retry_after = chrono::Utc::now().timestamp() + 60;
198
198
+
return (
199
199
+
did,
200
200
+
CrawlCheckResult::Retry {
201
201
+
retry_after,
202
202
+
status: 429,
203
203
+
},
204
204
+
);
167
205
}
168
206
Err(RetryOutcome::Failed(e)) => {
169
207
error!(err = %e, "failed to resolve identity");
170
170
-
return (did, CrawlCheckResult::Failed(None));
208
208
+
let retry_after = chrono::Utc::now().timestamp() + 60;
209
209
+
return (
210
210
+
did,
211
211
+
CrawlCheckResult::Retry {
212
212
+
retry_after,
213
213
+
status: 0,
214
214
+
},
215
215
+
);
171
216
}
172
217
};
173
218
174
174
-
let pds_handle = tracker.get_handle(&pds_url);
175
175
-
if pds_handle.is_banned() {
176
176
-
trace!(host = pds_url.host_str(), "skipping banned pds");
177
177
-
return (did, CrawlCheckResult::Failed(None));
219
219
+
let throttle = throttler.get_handle(&pds_url).await;
220
220
+
if throttle.is_throttled() {
221
221
+
trace!(host = pds_url.host_str(), "skipping throttled pds");
222
222
+
return (
223
223
+
did,
224
224
+
CrawlCheckResult::Retry {
225
225
+
retry_after: throttle.throttled_until(),
226
226
+
status: 0,
227
227
+
},
228
228
+
);
178
229
}
179
230
231
231
+
let _permit = throttle.acquire().unit_error().or_failure(&throttle, || ());
232
232
+
let Ok(_permit) = _permit.await else {
233
233
+
trace!(
234
234
+
host = pds_url.host_str(),
235
235
+
"pds failed while waiting for permit"
236
236
+
);
237
237
+
return (
238
238
+
did,
239
239
+
CrawlCheckResult::Retry {
240
240
+
retry_after: throttle.throttled_until(),
241
241
+
status: 0,
242
242
+
},
243
243
+
);
244
244
+
};
245
245
+
180
246
enum RequestError {
181
247
Reqwest(reqwest::Error),
182
182
-
Banned,
248
248
+
RateLimited(Option<u64>),
249
249
+
/// hard failure notification from another task on this PDS
250
250
+
Throttled,
183
251
}
184
252
185
253
let mut found_signal = false;
···
192
260
.append_pair("collection", signal)
193
261
.append_pair("limit", "1");
194
262
195
195
-
let res = (|| http.get(list_records_url.clone())
263
263
+
let resp = async {
264
264
+
let resp = http
265
265
+
.get(list_records_url.clone())
196
266
.send()
197
197
-
.error_for_status()
198
198
-
.map_err(RequestError::Reqwest)
199
199
-
.or_ban(&pds_handle, || RequestError::Banned))
200
200
-
.retry(MAX_RETRIES, |e: &RequestError| {
201
201
-
matches!(e, RequestError::Reqwest(e) if matches!(e.status(), Some(StatusCode::TOO_MANY_REQUESTS)))
202
202
-
})
203
203
-
.await;
204
204
-
let res = match res {
267
267
+
.await
268
268
+
.map_err(RequestError::Reqwest)?;
269
269
+
270
270
+
// dont retry ratelimits since we will just put it in a queue to be tried again later
271
271
+
if resp.status() == StatusCode::TOO_MANY_REQUESTS {
272
272
+
return Err(RequestError::RateLimited(parse_retry_after(&resp)));
273
273
+
}
274
274
+
275
275
+
resp.error_for_status().map_err(RequestError::Reqwest)
276
276
+
}
277
277
+
.or_failure(&throttle, || RequestError::Throttled)
278
278
+
.await;
279
279
+
280
280
+
let resp = match resp {
205
281
Ok(r) => {
206
206
-
pds_handle.record_success();
282
282
+
throttle.record_success();
207
283
r
208
284
}
209
209
-
Err(RetryOutcome::Ratelimited) => {
210
210
-
warn!(
211
211
-
retries = MAX_RETRIES,
212
212
-
"rate limited on listRecords, giving up"
213
213
-
);
214
214
-
return CrawlCheckResult::Ratelimited;
285
285
+
Err(RequestError::RateLimited(secs)) => {
286
286
+
throttle.record_ratelimit(secs);
287
287
+
return CrawlCheckResult::Retry {
288
288
+
retry_after: throttle.throttled_until(),
289
289
+
status: 429,
290
290
+
};
215
291
}
216
216
-
Err(RetryOutcome::Failed(e)) => match e {
217
217
-
RequestError::Banned => return CrawlCheckResult::Failed(None),
218
218
-
RequestError::Reqwest(e) => {
219
219
-
if is_ban_worthy(&e) {
220
220
-
if let Some(mins) = pds_handle.record_failure() {
221
221
-
tracing::warn!(url = %pds_url, mins, "banned pds");
222
222
-
}
223
223
-
return CrawlCheckResult::Failed(e.status().map(|s| s.as_u16()));
292
292
+
Err(RequestError::Throttled) => {
293
293
+
return CrawlCheckResult::Retry {
294
294
+
retry_after: throttle.throttled_until(),
295
295
+
status: 0,
296
296
+
};
297
297
+
}
298
298
+
Err(RequestError::Reqwest(e)) => {
299
299
+
if is_throttle_worthy(&e) {
300
300
+
if let Some(mins) = throttle.record_failure() {
301
301
+
warn!(url = %pds_url, mins, "throttling pds due to hard failure");
224
302
}
303
303
+
return CrawlCheckResult::Retry {
304
304
+
retry_after: throttle.throttled_until(),
305
305
+
status: e.status().map_or(0, |s| s.as_u16()),
306
306
+
};
307
307
+
}
225
308
226
226
-
match e.status() {
227
227
-
Some(StatusCode::NOT_FOUND | StatusCode::GONE) => {
228
228
-
trace!("repo not found");
229
229
-
}
230
230
-
Some(s) if s.is_client_error() => {
231
231
-
error!(status = %s, "repo unavailable");
232
232
-
}
233
233
-
_ => {
234
234
-
error!(err = %e, "listRecords failed");
235
235
-
return CrawlCheckResult::Failed(e.status().map(|s| s.as_u16()));
236
236
-
}
309
309
+
match e.status() {
310
310
+
Some(StatusCode::NOT_FOUND | StatusCode::GONE) => {
311
311
+
trace!("repo not found");
312
312
+
return CrawlCheckResult::NoSignal;
237
313
}
238
238
-
return CrawlCheckResult::NoSignal;
314
314
+
Some(s) if s.is_client_error() => {
315
315
+
error!(status = %s, "repo unavailable");
316
316
+
return CrawlCheckResult::NoSignal;
317
317
+
}
318
318
+
_ => {
319
319
+
error!(err = %e, "repo errored");
320
320
+
return CrawlCheckResult::Retry {
321
321
+
retry_after: chrono::Utc::now().timestamp() + 60,
322
322
+
status: e.status().map_or(0, |s| s.as_u16()),
323
323
+
};
324
324
+
}
239
325
}
240
240
-
},
326
326
+
}
241
327
};
242
328
243
243
-
let bytes = match res.bytes().await {
329
329
+
let bytes = match resp.bytes().await {
244
330
Ok(b) => b,
245
331
Err(e) => {
246
332
error!(err = %e, "failed to read listRecords response");
247
247
-
return CrawlCheckResult::Failed(None);
333
333
+
return CrawlCheckResult::Retry {
334
334
+
retry_after: chrono::Utc::now().timestamp() + 60,
335
335
+
status: 0,
336
336
+
};
248
337
}
249
338
};
250
339
251
340
match serde_json::from_slice::<ListRecordsOutput>(&bytes) {
252
252
-
Ok(out) => {
253
253
-
if !out.records.is_empty() {
254
254
-
return CrawlCheckResult::Signal;
255
255
-
}
256
256
-
}
341
341
+
Ok(out) if !out.records.is_empty() => return CrawlCheckResult::Signal,
342
342
+
Ok(_) => {}
257
343
Err(e) => {
258
344
error!(err = %e, "failed to parse listRecords response");
259
259
-
return CrawlCheckResult::Failed(None);
345
345
+
return CrawlCheckResult::Retry {
346
346
+
retry_after: chrono::Utc::now().timestamp() + 60,
347
347
+
status: 0,
348
348
+
};
260
349
}
261
350
}
262
351
263
352
CrawlCheckResult::NoSignal
264
353
}
265
265
-
.instrument(tracing::info_span!("signal_check", signal = %signal))
354
354
+
.instrument(tracing::info_span!("check", signal = %signal))
266
355
.await;
267
356
268
357
match res {
···
270
359
found_signal = true;
271
360
break;
272
361
}
273
273
-
CrawlCheckResult::NoSignal => {
274
274
-
continue;
275
275
-
}
276
276
-
other => {
277
277
-
return (did, other);
278
278
-
}
362
362
+
CrawlCheckResult::NoSignal => continue,
363
363
+
other => return (did, other),
279
364
}
280
365
}
281
366
···
291
376
)
292
377
}
293
378
294
294
-
pub mod ban;
295
295
-
use ban::{BanTracker, OrBan};
379
379
+
pub mod throttle;
380
380
+
use throttle::{OrThrottle, Throttler};
296
381
297
382
pub struct Crawler {
298
383
state: Arc<AppState>,
···
303
388
count: Arc<AtomicUsize>,
304
389
crawled_count: Arc<AtomicUsize>,
305
390
throttled: Arc<AtomicBool>,
306
306
-
tracker: Arc<BanTracker>,
391
391
+
pds_throttler: Arc<Throttler>,
307
392
}
308
393
309
394
impl Crawler {
···
334
419
count: Arc::new(AtomicUsize::new(0)),
335
420
crawled_count: Arc::new(AtomicUsize::new(0)),
336
421
throttled: Arc::new(AtomicBool::new(false)),
337
337
-
tracker: Arc::new(BanTracker::new()),
422
422
+
pds_throttler: Arc::new(Throttler::new()),
338
423
}
339
424
}
340
425
341
426
pub async fn run(self) -> Result<()> {
427
427
+
// stats ticker
342
428
tokio::spawn({
343
429
use std::time::Instant;
344
430
let count = self.count.clone();
345
431
let crawled_count = self.crawled_count.clone();
346
432
let throttled = self.throttled.clone();
433
433
+
let pds_throttler = self.pds_throttler.clone();
347
434
let mut last_time = Instant::now();
348
435
let mut interval = tokio::time::interval(Duration::from_secs(60));
349
436
async move {
···
352
439
let delta_processed = count.swap(0, Ordering::Relaxed);
353
440
let delta_crawled = crawled_count.swap(0, Ordering::Relaxed);
354
441
let is_throttled = throttled.load(Ordering::Relaxed);
442
442
+
443
443
+
pds_throttler.evict_clean().await;
355
444
356
445
if delta_processed == 0 && delta_crawled == 0 {
357
446
if is_throttled {
···
374
463
}
375
464
});
376
465
377
377
-
let mut relay_url = self.relay_host.clone();
466
466
+
let crawler = Arc::new(self);
467
467
+
std::thread::spawn({
468
468
+
let crawler = crawler.clone();
469
469
+
let handle = tokio::runtime::Handle::current();
470
470
+
move || {
471
471
+
use std::thread::sleep;
472
472
+
473
473
+
let _g = handle.enter();
474
474
+
475
475
+
loop {
476
476
+
match crawler.process_retry_queue() {
477
477
+
Ok(Some(next_ts)) => {
478
478
+
let secs = (next_ts - chrono::Utc::now().timestamp()).max(1) as u64;
479
479
+
sleep(Duration::from_secs(secs));
480
480
+
}
481
481
+
Ok(None) => {
482
482
+
sleep(Duration::from_secs(60));
483
483
+
}
484
484
+
Err(e) => {
485
485
+
error!(err = %e, "retry loop failed");
486
486
+
sleep(Duration::from_secs(60));
487
487
+
}
488
488
+
}
489
489
+
}
490
490
+
}
491
491
+
});
492
492
+
493
493
+
let mut relay_url = crawler.relay_host.clone();
378
494
match relay_url.scheme() {
379
495
"wss" => relay_url
380
496
.set_scheme("https")
···
386
502
}
387
503
388
504
let mut rng: SmallRng = rand::make_rng();
389
389
-
390
390
-
let db = &self.state.db;
505
505
+
let db = &crawler.state.db;
391
506
392
392
-
// 1. load cursor
393
507
let cursor_key = b"crawler_cursor";
394
508
let mut cursor: Option<SmolStr> = Db::get(db.cursors.clone(), cursor_key.to_vec())
395
509
.await?
···
401
515
let mut was_throttled = false;
402
516
403
517
loop {
404
404
-
// check throttling
518
518
+
// throttle check
405
519
loop {
406
406
-
let pending = self.state.db.get_count("pending").await;
407
407
-
if pending > self.max_pending as u64 {
520
520
+
let pending = crawler.state.db.get_count("pending").await;
521
521
+
if pending > crawler.max_pending as u64 {
408
522
if !was_throttled {
409
523
debug!(
410
524
pending,
411
411
-
max = self.max_pending,
525
525
+
max = crawler.max_pending,
412
526
"throttling: above max pending"
413
527
);
414
528
was_throttled = true;
415
415
-
self.throttled.store(true, Ordering::Relaxed);
529
529
+
crawler.throttled.store(true, Ordering::Relaxed);
416
530
}
417
531
tokio::time::sleep(Duration::from_secs(5)).await;
418
418
-
} else if pending > self.resume_pending as u64 {
532
532
+
} else if pending > crawler.resume_pending as u64 {
419
533
if !was_throttled {
420
534
debug!(
421
535
pending,
422
422
-
resume = self.resume_pending,
536
536
+
resume = crawler.resume_pending,
423
537
"throttling: entering cooldown"
424
538
);
425
539
was_throttled = true;
426
426
-
self.throttled.store(true, Ordering::Relaxed);
540
540
+
crawler.throttled.store(true, Ordering::Relaxed);
427
541
}
428
542
429
543
loop {
430
430
-
let current_pending = self.state.db.get_count("pending").await;
431
431
-
if current_pending <= self.resume_pending as u64 {
544
544
+
let current_pending = crawler.state.db.get_count("pending").await;
545
545
+
if current_pending <= crawler.resume_pending as u64 {
432
546
break;
433
547
}
434
548
debug!(
435
549
pending = current_pending,
436
436
-
resume = self.resume_pending,
550
550
+
resume = crawler.resume_pending,
437
551
"cooldown, waiting"
438
552
);
439
553
tokio::time::sleep(Duration::from_secs(5)).await;
···
443
557
if was_throttled {
444
558
info!("throttling released");
445
559
was_throttled = false;
446
446
-
self.throttled.store(false, Ordering::Relaxed);
560
560
+
crawler.throttled.store(false, Ordering::Relaxed);
447
561
}
448
562
break;
449
563
}
450
564
}
451
565
452
452
-
// 2. fetch listrepos
453
566
let mut list_repos_url = relay_url
454
567
.join("/xrpc/com.atproto.sync.listRepos")
455
568
.into_diagnostic()?;
···
463
576
}
464
577
465
578
let fetch_result = (|| {
466
466
-
self.http
579
579
+
crawler
580
580
+
.http
467
581
.get(list_repos_url.clone())
468
582
.send()
469
583
.error_for_status()
470
584
})
471
471
-
.retry(5, |e: &reqwest::Error| {
585
585
+
.retry(5, |e: &reqwest::Error, attempt| {
472
586
matches!(e.status(), Some(StatusCode::TOO_MANY_REQUESTS))
587
587
+
.then(|| Duration::from_secs(1 << attempt.min(5)))
473
588
})
474
589
.await;
475
590
···
508
623
}
509
624
510
625
debug!(count = output.repos.len(), "fetched repos");
511
511
-
self.crawled_count
626
626
+
crawler
627
627
+
.crawled_count
512
628
.fetch_add(output.repos.len(), Ordering::Relaxed);
513
629
514
630
let mut batch = db.inner.batch();
515
631
let mut to_queue = Vec::new();
516
516
-
let filter = self.state.filter.load();
517
517
-
// we can check whether or not to backfill repos faster if we only have to check
518
518
-
// certain known signals, since we can just listRecords for those signals
519
519
-
// if we have glob signals we cant do this since we dont know what signals to check
520
520
-
let check_signals = filter.mode == crate::filter::FilterMode::Filter
521
521
-
&& !filter.signals.is_empty()
522
522
-
&& !filter.has_glob_signals();
632
632
+
let filter = crawler.state.filter.load();
523
633
524
524
-
// 3. process repos
525
634
let mut unknown_dids = Vec::new();
526
635
for repo in output.repos {
527
636
let did_key = keys::repo_key(&repo.did);
···
531
640
continue;
532
641
}
533
642
534
534
-
// check if known
535
643
if !Db::contains_key(db.repos.clone(), &did_key).await? {
536
644
unknown_dids.push(repo.did.into_static());
537
645
}
538
646
}
539
647
540
540
-
let valid_dids = if check_signals && !unknown_dids.is_empty() {
541
541
-
self.check_signals_batch(&unknown_dids, &filter, &mut batch)
648
648
+
let valid_dids = if filter.check_signals() && !unknown_dids.is_empty() {
649
649
+
crawler
650
650
+
.check_signals_batch(&unknown_dids, &filter, &mut batch)
542
651
.await?
543
652
} else {
544
653
unknown_dids
···
548
657
let did_key = keys::repo_key(did);
549
658
trace!(did = %did, "found new repo");
550
659
551
551
-
let state = RepoState::backfilling_untracked(rng.next_u64());
660
660
+
let state = RepoState::untracked(rng.next_u64());
552
661
batch.insert(&db.repos, &did_key, ser_repo_state(&state)?);
553
662
batch.insert(&db.pending, keys::pending_key(state.index_id), &did_key);
554
663
to_queue.push(did.clone());
555
664
}
556
665
557
557
-
// 4. update cursor
558
666
if let Some(new_cursor) = output.cursor {
559
667
cursor = Some(new_cursor.as_str().into());
560
560
-
561
668
batch.insert(
562
669
&db.cursors,
563
670
cursor_key.to_vec(),
564
671
new_cursor.as_bytes().to_vec(),
565
672
);
566
673
} else {
567
567
-
// end of pagination
568
674
info!("reached end of list.");
569
675
cursor = None;
570
676
}
···
573
679
.await
574
680
.into_diagnostic()??;
575
681
576
576
-
self.account_new_repos(to_queue.len()).await;
682
682
+
crawler.account_new_repos(to_queue.len()).await;
577
683
578
684
if cursor.is_none() {
579
579
-
// 6. retry previously failed repos before sleeping
580
580
-
self.retry_failed_repos(&mut rng).await?;
581
581
-
582
685
tokio::time::sleep(Duration::from_secs(3600)).await;
583
686
}
584
687
}
585
688
}
586
689
690
690
+
/// scan the retry queue for entries whose `retry_after` timestamp has passed,
691
691
+
/// retry them, and return the earliest still-pending timestamp (if any) so the
692
692
+
/// caller knows when to wake up next.
693
693
+
fn process_retry_queue(&self) -> Result<Option<i64>> {
694
694
+
let db = &self.state.db;
695
695
+
let now = chrono::Utc::now().timestamp();
696
696
+
697
697
+
let mut ready: Vec<Did> = Vec::new();
698
698
+
let mut next_retry: Option<i64> = None;
699
699
+
700
700
+
let mut rng: SmallRng = rand::make_rng();
701
701
+
702
702
+
for guard in db.crawler.prefix(keys::CRAWLER_RETRY_PREFIX) {
703
703
+
let (key, val) = guard.into_inner().into_diagnostic()?;
704
704
+
let (retry_after, _) = keys::crawler_retry_parse_value(&val)?;
705
705
+
let did = keys::crawler_retry_parse_key(&key)?.to_did();
706
706
+
707
707
+
// we check an extra backoff of 1 - 7% just to make it less likely for
708
708
+
// many requests to coincide with each other
709
709
+
let backoff =
710
710
+
((retry_after - now).max(0) as f64).mul(rng.random_range(0.01..0.07)) as i64;
711
711
+
if retry_after + backoff > now {
712
712
+
next_retry = Some(
713
713
+
next_retry
714
714
+
.map(|earliest| earliest.min(retry_after))
715
715
+
.unwrap_or(retry_after),
716
716
+
);
717
717
+
continue;
718
718
+
}
719
719
+
720
720
+
ready.push(did);
721
721
+
}
722
722
+
723
723
+
if ready.is_empty() {
724
724
+
return Ok(next_retry);
725
725
+
}
726
726
+
727
727
+
info!(count = ready.len(), "retrying pending repos");
728
728
+
729
729
+
let handle = tokio::runtime::Handle::current();
730
730
+
let mut batch = db.inner.batch();
731
731
+
let filter = self.state.filter.load();
732
732
+
let valid_dids = handle.block_on(self.check_signals_batch(&ready, &filter, &mut batch))?;
733
733
+
734
734
+
let mut rng: SmallRng = rand::make_rng();
735
735
+
for did in &valid_dids {
736
736
+
let did_key = keys::repo_key(did);
737
737
+
738
738
+
if db.repos.contains_key(&did_key).into_diagnostic()? {
739
739
+
continue;
740
740
+
}
741
741
+
742
742
+
let state = RepoState::untracked(rng.next_u64());
743
743
+
batch.insert(&db.repos, &did_key, ser_repo_state(&state)?);
744
744
+
batch.insert(&db.pending, keys::pending_key(state.index_id), &did_key);
745
745
+
}
746
746
+
747
747
+
batch.commit().into_diagnostic()?;
748
748
+
749
749
+
if !valid_dids.is_empty() {
750
750
+
info!(count = valid_dids.len(), "recovered from retry queue");
751
751
+
handle.block_on(self.account_new_repos(valid_dids.len()));
752
752
+
}
753
753
+
754
754
+
Ok(next_retry)
755
755
+
}
756
756
+
587
757
async fn check_signals_batch(
588
758
&self,
589
759
dids: &[Did<'static>],
···
599
769
let http = self.http.clone();
600
770
let resolver = self.state.resolver.clone();
601
771
let filter = filter.clone();
602
602
-
let tracker = self.tracker.clone();
603
603
-
let span = tracing::info_span!("check_signals", did = %did);
604
604
-
set.spawn(check_repo_signals(http, resolver, filter, did, tracker).instrument(span));
772
772
+
let throttler = self.pds_throttler.clone();
773
773
+
let span = tracing::info_span!("signals", did = %did);
774
774
+
set.spawn(check_repo_signals(http, resolver, filter, did, throttler).instrument(span));
605
775
}
606
776
607
777
while let Some(res) = set.join_next().await {
608
778
let (did, result) = res.into_diagnostic()?;
609
779
match result {
610
780
CrawlCheckResult::Signal => {
611
611
-
batch.remove(&db.crawler, keys::crawler_failed_key(&did));
612
781
valid.push(did);
613
782
}
614
614
-
CrawlCheckResult::NoSignal => {
615
615
-
batch.remove(&db.crawler, keys::crawler_failed_key(&did));
616
616
-
}
617
617
-
CrawlCheckResult::Ratelimited => {
783
783
+
CrawlCheckResult::NoSignal => {}
784
784
+
CrawlCheckResult::Retry {
785
785
+
retry_after,
786
786
+
status,
787
787
+
} => {
618
788
batch.insert(
619
789
&db.crawler,
620
620
-
keys::crawler_failed_key(&did),
621
621
-
429u16.to_be_bytes().as_ref(),
622
622
-
);
623
623
-
}
624
624
-
CrawlCheckResult::Failed(status) => {
625
625
-
let code = status.unwrap_or(0);
626
626
-
batch.insert(
627
627
-
&db.crawler,
628
628
-
keys::crawler_failed_key(&did),
629
629
-
code.to_be_bytes().as_ref(),
790
790
+
keys::crawler_retry_key(&did),
791
791
+
keys::crawler_retry_value(retry_after, status),
630
792
);
631
793
}
632
794
}
633
795
}
634
796
635
797
Ok(valid)
636
636
-
}
637
637
-
638
638
-
async fn retry_failed_repos(&self, rng: &mut SmallRng) -> Result<()> {
639
639
-
let db = &self.state.db;
640
640
-
let filter = self.state.filter.load();
641
641
-
642
642
-
let check_signals = filter.mode == crate::filter::FilterMode::Filter
643
643
-
&& !filter.signals.is_empty()
644
644
-
&& !filter.has_glob_signals();
645
645
-
646
646
-
if !check_signals {
647
647
-
return Ok(());
648
648
-
}
649
649
-
650
650
-
let mut failed_dids = Vec::new();
651
651
-
for guard in db.crawler.prefix(keys::CRAWLER_FAILED_PREFIX) {
652
652
-
let key = guard.key().into_diagnostic()?;
653
653
-
let did_bytes = &key[keys::CRAWLER_FAILED_PREFIX.len()..];
654
654
-
let trimmed = TrimmedDid::try_from(did_bytes)?;
655
655
-
failed_dids.push(trimmed.to_did());
656
656
-
}
657
657
-
658
658
-
if failed_dids.is_empty() {
659
659
-
return Ok(());
660
660
-
}
661
661
-
662
662
-
info!("retrying {} previously failed repos", failed_dids.len());
663
663
-
664
664
-
let mut batch = db.inner.batch();
665
665
-
let valid_dids = self
666
666
-
.check_signals_batch(&failed_dids, &filter, &mut batch)
667
667
-
.await?;
668
668
-
669
669
-
for did in &valid_dids {
670
670
-
let did_key = keys::repo_key(did);
671
671
-
672
672
-
if Db::contains_key(db.repos.clone(), &did_key).await? {
673
673
-
continue;
674
674
-
}
675
675
-
676
676
-
let state = RepoState::backfilling_untracked(rng.next_u64());
677
677
-
batch.insert(&db.repos, &did_key, ser_repo_state(&state)?);
678
678
-
batch.insert(&db.pending, keys::pending_key(state.index_id), &did_key);
679
679
-
}
680
680
-
681
681
-
tokio::task::spawn_blocking(move || batch.commit().into_diagnostic())
682
682
-
.await
683
683
-
.into_diagnostic()??;
684
684
-
685
685
-
if !valid_dids.is_empty() {
686
686
-
info!("recovered {} repos from failed retry", valid_dids.len());
687
687
-
self.account_new_repos(valid_dids.len()).await;
688
688
-
}
689
689
-
690
690
-
Ok(())
691
798
}
692
799
693
800
async fn account_new_repos(&self, count: usize) {
+172
src/crawler/throttle.rs
···
1
1
+
use scc::HashMap;
2
2
+
use std::future::Future;
3
3
+
use std::sync::Arc;
4
4
+
use std::sync::atomic::{AtomicI64, AtomicUsize, Ordering};
5
5
+
use tokio::sync::{Notify, Semaphore, SemaphorePermit};
6
6
+
use url::Url;
7
7
+
8
8
+
/// max concurrent in-flight requests per PDS before we start queuing
/// ref pds allows 10 requests per second... so 10 should be fine
const PER_PDS_CONCURRENCY: usize = 10;

/// per-PDS throttle registry, keyed by PDS url.
///
/// cheap to clone: all clones share the same underlying state map.
#[derive(Clone)]
pub struct Throttler {
    // entries are created lazily by `get_handle`; handles hold their own
    // `Arc<State>`, so evicting a map entry never invalidates a live handle
    states: Arc<HashMap<Url, Arc<State>>>,
}

impl Throttler {
    pub fn new() -> Self {
        Self {
            states: Arc::new(HashMap::new()),
        }
    }

    /// get (or lazily create) the throttle handle for a PDS url.
    pub async fn get_handle(&self, url: &Url) -> ThrottleHandle {
        let state = self
            .states
            .entry_async(url.clone())
            .await
            .or_insert_with(|| Arc::new(State::new()))
            .get()
            .clone();

        ThrottleHandle { state }
    }

    /// drop entries with no active throttle and no consecutive failures.
    pub async fn evict_clean(&self) {
        self.states
            .retain_async(|_, v| {
                v.throttled_until.load(Ordering::Acquire) != 0
                    || v.consecutive_failures.load(Ordering::Acquire) != 0
            })
            .await;
    }
}
46
46
+
47
47
+
/// per-PDS throttle state shared between all handles for one url.
struct State {
    // unix timestamp until which requests are throttled; 0 means "not throttled"
    throttled_until: AtomicI64,
    // failure streak driving the exponential backoff in `record_failure`
    consecutive_failures: AtomicUsize,
    /// only fires on hard failures (timeout, TLS, bad gateway, etc).
    /// ratelimits do NOT fire this — they just store `throttled_until` and
    /// let tasks exit naturally, deferring to the background retry loop.
    failure_notify: Notify,
    // bounds concurrent in-flight requests to this PDS (`PER_PDS_CONCURRENCY`)
    semaphore: Semaphore,
}

impl State {
    fn new() -> Self {
        Self {
            throttled_until: AtomicI64::new(0),
            consecutive_failures: AtomicUsize::new(0),
            failure_notify: Notify::new(),
            semaphore: Semaphore::new(PER_PDS_CONCURRENCY),
        }
    }
}
67
67
+
68
68
+
pub struct ThrottleHandle {
69
69
+
state: Arc<State>,
70
70
+
}
71
71
+
72
72
+
impl ThrottleHandle {
73
73
+
pub fn is_throttled(&self) -> bool {
74
74
+
let until = self.state.throttled_until.load(Ordering::Acquire);
75
75
+
until != 0 && chrono::Utc::now().timestamp() < until
76
76
+
}
77
77
+
78
78
+
/// the unix timestamp at which this throttle expires (0 if not throttled).
79
79
+
pub fn throttled_until(&self) -> i64 {
80
80
+
self.state.throttled_until.load(Ordering::Acquire)
81
81
+
}
82
82
+
83
83
+
pub fn record_success(&self) {
84
84
+
self.state.consecutive_failures.store(0, Ordering::Release);
85
85
+
self.state.throttled_until.store(0, Ordering::Release);
86
86
+
}
87
87
+
88
88
+
/// called on a 429 response. `retry_after_secs` comes from the `Retry-After`
89
89
+
/// header if present; falls back to 60s. uses `fetch_max` so concurrent callers
90
90
+
/// don't race each other back to a shorter window.
91
91
+
///
92
92
+
/// deliberately does NOT notify waiters — 429s are soft and tasks should exit
93
93
+
/// naturally via the `Retry` result rather than being cancelled.
94
94
+
pub fn record_ratelimit(&self, retry_after_secs: Option<u64>) {
95
95
+
let secs = retry_after_secs.unwrap_or(60) as i64;
96
96
+
let until = chrono::Utc::now().timestamp() + secs;
97
97
+
self.state
98
98
+
.throttled_until
99
99
+
.fetch_max(until, Ordering::AcqRel);
100
100
+
}
101
101
+
102
102
+
/// called on hard failures (timeout, TLS error, bad gateway, etc).
103
103
+
/// returns throttle duration in minutes if this is a *new* throttle,
104
104
+
/// and notifies all in-flight tasks to cancel immediately.
105
105
+
pub fn record_failure(&self) -> Option<i64> {
106
106
+
if self.is_throttled() {
107
107
+
return None;
108
108
+
}
109
109
+
110
110
+
let failures = self
111
111
+
.state
112
112
+
.consecutive_failures
113
113
+
.fetch_add(1, Ordering::AcqRel)
114
114
+
+ 1;
115
115
+
116
116
+
// 30 min, 60 min, 120 min, ... capped at ~512 hours
117
117
+
let base_minutes = 30i64;
118
118
+
let exponent = (failures as u32).saturating_sub(1);
119
119
+
let minutes = base_minutes * 2i64.pow(exponent.min(10));
120
120
+
let until = chrono::Utc::now().timestamp() + minutes * 60;
121
121
+
122
122
+
self.state.throttled_until.store(until, Ordering::Release);
123
123
+
self.state.failure_notify.notify_waiters();
124
124
+
125
125
+
Some(minutes)
126
126
+
}
127
127
+
128
128
+
/// acquire a concurrency slot for this PDS. hold the returned permit
129
129
+
/// for the duration of the request.
130
130
+
pub async fn acquire(&self) -> SemaphorePermit<'_> {
131
131
+
self.state
132
132
+
.semaphore
133
133
+
.acquire()
134
134
+
.await
135
135
+
.expect("throttle semaphore unexpectedly closed")
136
136
+
}
137
137
+
138
138
+
/// resolves when this PDS gets a hard failure notification.
139
139
+
/// used by `or_throttle` and the semaphore acquire select to cancel in-flight work.
140
140
+
pub async fn wait_for_failure(&self) {
141
141
+
loop {
142
142
+
let notified = self.state.failure_notify.notified();
143
143
+
if self.is_throttled() {
144
144
+
return;
145
145
+
}
146
146
+
notified.await;
147
147
+
}
148
148
+
}
149
149
+
}
150
150
+
151
151
+
/// extension trait that adds `.or_failure()` to any future returning `Result<T, E>`.
///
/// races the future against a hard-failure notification. soft ratelimits (429) do NOT
/// trigger cancellation — those are handled by the background retry loop.
#[allow(async_fn_in_trait)]
pub trait OrThrottle<T, E>: Future<Output = Result<T, E>> {
    /// run `self` to completion unless `handle` records a hard failure first,
    /// in which case `self` is dropped and `on_throttle()` supplies the error.
    async fn or_failure(
        self,
        handle: &ThrottleHandle,
        on_throttle: impl FnOnce() -> E,
    ) -> Result<T, E>
    where
        Self: Sized,
    {
        tokio::select! {
            res = self => res,
            _ = handle.wait_for_failure() => Err(on_throttle()),
        }
    }
}

// blanket impl: every Result-returning future gets `.or_failure()` for free
impl<T, E, F: Future<Output = Result<T, E>>> OrThrottle<T, E> for F {}
+24
-4
src/db/keys.rs
···
132
132
prefix
133
133
}
134
134
135
135
-
pub const CRAWLER_FAILED_PREFIX: &[u8] = &[b'f', SEP];
135
135
+
/// key format: `ret|<did bytes>`
136
136
+
pub const CRAWLER_RETRY_PREFIX: &[u8] = b"ret|";
136
137
137
137
-
pub fn crawler_failed_key(did: &Did) -> Vec<u8> {
138
138
+
pub fn crawler_retry_key(did: &Did) -> Vec<u8> {
138
139
let repo = TrimmedDid::from(did);
139
139
-
let mut key = Vec::with_capacity(CRAWLER_FAILED_PREFIX.len() + repo.len());
140
140
-
key.extend_from_slice(CRAWLER_FAILED_PREFIX);
140
140
+
let mut key = Vec::with_capacity(CRAWLER_RETRY_PREFIX.len() + repo.len());
141
141
+
key.extend_from_slice(CRAWLER_RETRY_PREFIX);
141
142
repo.write_to_vec(&mut key);
142
143
key
143
144
}
145
145
+
146
146
+
/// value format: `<retry_after: i64 BE><status: u16 BE>`
///
/// encodes a retry timestamp and HTTP status into a fixed 10-byte value.
pub fn crawler_retry_value(retry_after: i64, status: u16) -> [u8; 10] {
    let ts = retry_after.to_be_bytes();
    let st = status.to_be_bytes();
    let mut buf = [0u8; 10];
    // big-endian timestamp first (8 bytes), then the status (2 bytes)
    buf[..8].copy_from_slice(&ts);
    buf[8..].copy_from_slice(&st);
    buf
}
153
153
+
154
154
+
/// decode a crawler retry value back into `(retry_after, status)`.
///
/// expects at least 10 bytes: an `i64` retry timestamp followed by a `u16`
/// HTTP status, both big-endian.
pub fn crawler_retry_parse_value(val: &[u8]) -> miette::Result<(i64, u16)> {
    miette::ensure!(val.len() >= 10, "crawler retry value too short");
    // length verified above, so the fixed-width conversions cannot fail
    let (ts_bytes, rest) = val.split_at(8);
    let retry_after = i64::from_be_bytes(ts_bytes.try_into().unwrap());
    let status = u16::from_be_bytes(rest[..2].try_into().unwrap());
    Ok((retry_after, status))
}
160
160
+
161
161
+
pub fn crawler_retry_parse_key(key: &[u8]) -> miette::Result<TrimmedDid<'_>> {
162
162
+
TrimmedDid::try_from(&key[CRAWLER_RETRY_PREFIX.len()..])
163
163
+
}
+5
-1
src/filter.rs
···
52
52
self.signals.iter().any(|p| nsid_matches(p, collection))
53
53
}
54
54
55
55
-
    // true when any signal is a wildcard pattern ending in `.*`
    // (matched elsewhere via `nsid_matches`)
    fn has_glob_signals(&self) -> bool {
        self.signals.iter().any(|s| s.ends_with(".*"))
    }

    /// whether cheap signal checking applies: `Filter` mode, at least one
    /// signal, and no glob signals.
    pub fn check_signals(&self) -> bool {
        self.mode == FilterMode::Filter && !self.signals.is_empty() && !self.has_glob_signals()
    }
58
62
}
59
63
+1
-1
src/ingest/worker.rs
···
556
556
557
557
debug!(did = %did, "discovered new account from firehose, queueing backfill");
558
558
559
559
-
let repo_state = RepoState::backfilling_untracked(rand::rng().next_u64());
559
559
+
let repo_state = RepoState::untracked(rand::rng().next_u64());
560
560
let mut batch = ctx.state.db.inner.batch();
561
561
batch.insert(
562
562
&ctx.state.db.repos,
+2
-1
src/types.rs
···
65
65
}
66
66
}
67
67
68
68
-
pub fn backfilling_untracked(index_id: u64) -> Self {
68
68
+
/// backfilling, but not tracked yet
69
69
+
pub fn untracked(index_id: u64) -> Self {
69
70
Self {
70
71
tracked: false,
71
72
..Self::backfilling(index_id)