tangled
alpha
login
or
join now
ptr.pet
/
hydrant
28
fork
atom
at protocol indexer with flexible filtering, xrpc queries, and a cursor-backed event stream, built on fjall
at-protocol
atproto
indexer
rust
fjall
28
fork
atom
overview
issues
6
pulls
pipelines
[crawler] restart on 'fatal' errors
ptr.pet
1 week ago
9b3bea0a
bea4e639
verified
This commit was signed with the committer's
known signature
.
ptr.pet
SSH Key Fingerprint:
SHA256:Abmvag+juovVufZTxyWY8KcVgrznxvBjQpJesv071Aw=
+19
-13
1 changed file
expand all
collapse all
unified
split
src
crawler
mod.rs
+19
-13
src/crawler/mod.rs
···
23
enum CrawlCheckResult {
24
Signal,
25
NoSignal,
26
-
/// task could not complete; should be retried at `retry_after` (unix timestamp).
27
-
/// `status` is the HTTP status that triggered this (0 for non-HTTP failures).
28
-
Retry {
29
-
retry_after: i64,
30
-
status: u16,
31
-
},
32
}
33
34
/// outcome of [`RetryWithBackoff::retry`] when the operation does not succeed.
···
431
}
432
433
pub async fn run(self) -> Result<()> {
0
0
434
// stats ticker
435
tokio::spawn({
436
use std::time::Instant;
437
-
let count = self.count.clone();
438
-
let crawled_count = self.crawled_count.clone();
439
-
let throttled = self.throttled.clone();
440
-
let pds_throttler = self.pds_throttler.clone();
441
let mut last_time = Instant::now();
442
let mut interval = tokio::time::interval(Duration::from_secs(60));
443
async move {
···
451
452
if delta_processed == 0 && delta_crawled == 0 {
453
if is_throttled {
454
-
info!("crawler throttled: pending queue full");
455
} else {
456
debug!("no repos crawled or processed in 60s");
457
}
···
463
processed = delta_processed,
464
crawled = delta_crawled,
465
elapsed,
466
-
"crawler progress"
467
);
468
last_time = Instant::now();
469
}
470
}
471
});
472
473
-
let crawler = Arc::new(self);
474
std::thread::spawn({
475
let crawler = crawler.clone();
476
let handle = tokio::runtime::Handle::current();
···
497
}
498
});
499
0
0
0
0
0
0
0
0
0
500
let mut relay_url = crawler.relay_host.clone();
501
match relay_url.scheme() {
502
"wss" => relay_url
···
23
enum CrawlCheckResult {
24
Signal,
25
NoSignal,
26
+
Retry { retry_after: i64, status: u16 },
0
0
0
0
0
27
}
28
29
/// outcome of [`RetryWithBackoff::retry`] when the operation does not succeed.
···
426
}
427
428
pub async fn run(self) -> Result<()> {
429
+
let crawler = Arc::new(self);
430
+
431
// stats ticker
432
tokio::spawn({
433
use std::time::Instant;
434
+
let count = crawler.count.clone();
435
+
let crawled_count = crawler.crawled_count.clone();
436
+
let throttled = crawler.throttled.clone();
437
+
let pds_throttler = crawler.pds_throttler.clone();
438
let mut last_time = Instant::now();
439
let mut interval = tokio::time::interval(Duration::from_secs(60));
440
async move {
···
448
449
if delta_processed == 0 && delta_crawled == 0 {
450
if is_throttled {
451
+
info!("throttled: pending queue full");
452
} else {
453
debug!("no repos crawled or processed in 60s");
454
}
···
460
processed = delta_processed,
461
crawled = delta_crawled,
462
elapsed,
463
+
"progress"
464
);
465
last_time = Instant::now();
466
}
467
}
468
});
469
470
+
// retry thread
471
std::thread::spawn({
472
let crawler = crawler.clone();
473
let handle = tokio::runtime::Handle::current();
···
494
}
495
});
496
497
+
loop {
498
+
if let Err(e) = Self::crawl(crawler.clone()).await {
499
+
error!(err = %e, "fatal error, restarting in 30s");
500
+
tokio::time::sleep(Duration::from_secs(30)).await;
501
+
}
502
+
}
503
+
}
504
+
505
+
async fn crawl(crawler: Arc<Self>) -> Result<()> {
506
let mut relay_url = crawler.relay_host.clone();
507
match relay_url.scheme() {
508
"wss" => relay_url