tangled
alpha
login
or
join now
parakeet.at
/
parakeet
63
fork
atom
Parakeet is a Rust-based Bluesky AppServer aiming to implement most of the functionality required to support the Bluesky client
appview
atproto
bluesky
rust
appserver
63
fork
atom
overview
issues
12
pulls
pipelines
feat(consumer): guess who rewrote backfill again?
mia.omg.lol
2 weeks ago
283d69b4
19c77fcd
verified
This commit was signed with the committer's
known signature
.
mia.omg.lol
SSH Key Fingerprint:
SHA256:eb+NhC0QEl+XKRuFP/97oH6LEz0TXTKPXGDIAI5y7CQ=
+523
-553
8 changed files
expand all
collapse all
unified
split
Cargo.lock
crates
consumer
Cargo.toml
src
backfill
downloader.rs
mod.rs
repo.rs
types.rs
utils.rs
config.rs
+166
-17
Cargo.lock
···
70
]
71
72
[[package]]
0
0
0
0
0
0
73
name = "android_system_properties"
74
version = "0.1.5"
75
source = "registry+https://github.com/rust-lang/crates.io-index"
···
488
checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b"
489
490
[[package]]
0
0
0
0
0
0
491
name = "bytes"
492
version = "1.11.0"
493
source = "registry+https://github.com/rust-lang/crates.io-index"
···
495
dependencies = [
496
"serde",
497
]
0
0
0
0
0
0
498
499
[[package]]
500
name = "bzip2-sys"
···
702
]
703
704
[[package]]
0
0
0
0
0
0
705
name = "compression-codecs"
706
version = "0.4.36"
707
source = "registry+https://github.com/rust-lang/crates.io-index"
···
741
"deadpool-postgres",
742
"eyre",
743
"figment",
744
-
"flume",
745
-
"foldhash",
746
"futures",
747
"ipld-core",
748
"iroh-car",
···
754
"parakeet-db",
755
"parakeet-index",
756
"redis",
0
757
"reqwest",
758
"serde",
759
"serde_bytes",
···
855
source = "registry+https://github.com/rust-lang/crates.io-index"
856
checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e"
857
dependencies = [
0
0
0
0
0
0
0
0
0
0
858
"crossbeam-utils",
859
]
860
···
1305
]
1306
1307
[[package]]
0
0
0
0
0
0
0
0
0
0
0
0
1308
name = "equivalent"
1309
version = "1.0.2"
1310
source = "registry+https://github.com/rust-lang/crates.io-index"
···
1385
checksum = "1d674e81391d1e1ab681a28d99df07927c6d4aa5b027d7da16ba32d1d21ecd99"
1386
1387
[[package]]
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1388
name = "flate2"
1389
version = "1.1.8"
1390
source = "registry+https://github.com/rust-lang/crates.io-index"
···
1396
1397
[[package]]
1398
name = "flume"
1399
-
version = "0.11.1"
1400
source = "registry+https://github.com/rust-lang/crates.io-index"
1401
-
checksum = "da0e4dd2a88388a1f4ccc7c9ce104604dab68d9f408dc34cd45823d5a9069095"
1402
dependencies = [
1403
-
"futures-core",
1404
-
"futures-sink",
1405
-
"nanorand",
1406
"spin 0.9.8",
1407
]
1408
···
1417
version = "0.1.5"
1418
source = "registry+https://github.com/rust-lang/crates.io-index"
1419
checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2"
0
0
0
0
0
0
1420
1421
[[package]]
1422
name = "foreign-types"
···
1709
source = "registry+https://github.com/rust-lang/crates.io-index"
1710
checksum = "9229cfe53dfd69f0609a49f65461bd93001ea1ef889cd5529dd176593f5338a1"
1711
dependencies = [
1712
-
"foldhash",
1713
]
1714
1715
[[package]]
···
1717
version = "0.16.1"
1718
source = "registry+https://github.com/rust-lang/crates.io-index"
1719
checksum = "841d1cc9bed7f9236f321df977030373f4a4163ae1a7dbfe1a51a2c1a51d9100"
0
0
0
0
0
1720
1721
[[package]]
1722
name = "headers"
···
2176
]
2177
2178
[[package]]
0
0
0
0
0
0
0
0
0
2179
name = "inventory"
2180
version = "0.3.21"
2181
source = "registry+https://github.com/rust-lang/crates.io-index"
···
2604
checksum = "112b39cec0b298b6c1999fee3e31427f74f676e4cb9879ed1a121b43661a4154"
2605
2606
[[package]]
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
2607
name = "lz4-sys"
2608
version = "1.11.1+lz4-1.10.0"
2609
source = "registry+https://github.com/rust-lang/crates.io-index"
···
2841
"wasm-bindgen",
2842
"wasm-bindgen-futures",
2843
"web-time",
2844
-
]
2845
-
2846
-
[[package]]
2847
-
name = "nanorand"
2848
-
version = "0.7.0"
2849
-
source = "registry+https://github.com/rust-lang/crates.io-index"
2850
-
checksum = "6a51313c5820b0b02bd422f4b44776fbf47961755c74ce64afc73bfad10226c3"
2851
-
dependencies = [
2852
-
"getrandom 0.2.17",
2853
]
2854
2855
[[package]]
···
3645
]
3646
3647
[[package]]
0
0
0
0
0
0
0
0
0
0
3648
name = "quinn"
3649
version = "0.11.9"
3650
source = "registry+https://github.com/rust-lang/crates.io-index"
···
3895
checksum = "7a2d987857b319362043e95f5353c0535c1f58eec5336fdfcf626430af7def58"
3896
3897
[[package]]
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
3898
name = "reqwest"
3899
version = "0.12.28"
3900
source = "registry+https://github.com/rust-lang/crates.io-index"
···
4197
]
4198
4199
[[package]]
0
0
0
0
0
0
4200
name = "semver"
4201
version = "1.0.27"
4202
source = "registry+https://github.com/rust-lang/crates.io-index"
···
4367
"proc-macro2",
4368
"quote",
4369
"syn 2.0.114",
0
0
0
0
0
0
0
0
0
0
0
4370
]
4371
4372
[[package]]
···
5401
checksum = "ba73ea9cf16a25df0c8caa16c51acb937d5712a8429db78a3ee29d5dcacd3a65"
5402
5403
[[package]]
0
0
0
0
0
0
5404
name = "vcpkg"
5405
version = "0.2.15"
5406
source = "registry+https://github.com/rust-lang/crates.io-index"
···
5916
version = "0.6.2"
5917
source = "registry+https://github.com/rust-lang/crates.io-index"
5918
checksum = "9edde0db4769d2dc68579893f2306b26c6ecfbe0ef499b013d731b7b9247e0b9"
0
0
0
0
0
0
5919
5920
[[package]]
5921
name = "yansi"
···
70
]
71
72
[[package]]
73
+
name = "allocator-api2"
74
+
version = "0.2.21"
75
+
source = "registry+https://github.com/rust-lang/crates.io-index"
76
+
checksum = "683d7910e743518b0e34f1186f92494becacb047c7b6bf616c96772180fef923"
77
+
78
+
[[package]]
79
name = "android_system_properties"
80
version = "0.1.5"
81
source = "registry+https://github.com/rust-lang/crates.io-index"
···
494
checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b"
495
496
[[package]]
497
+
name = "byteorder-lite"
498
+
version = "0.1.0"
499
+
source = "registry+https://github.com/rust-lang/crates.io-index"
500
+
checksum = "8f1fe948ff07f4bd06c30984e69f5b4899c516a3ef74f34df92a2df2ab535495"
501
+
502
+
[[package]]
503
name = "bytes"
504
version = "1.11.0"
505
source = "registry+https://github.com/rust-lang/crates.io-index"
···
507
dependencies = [
508
"serde",
509
]
510
+
511
+
[[package]]
512
+
name = "byteview"
513
+
version = "0.10.1"
514
+
source = "registry+https://github.com/rust-lang/crates.io-index"
515
+
checksum = "1c53ba0f290bfc610084c05582d9c5d421662128fc69f4bf236707af6fd321b9"
516
517
[[package]]
518
name = "bzip2-sys"
···
720
]
721
722
[[package]]
723
+
name = "compare"
724
+
version = "0.0.6"
725
+
source = "registry+https://github.com/rust-lang/crates.io-index"
726
+
checksum = "ea0095f6103c2a8b44acd6fd15960c801dafebf02e21940360833e0673f48ba7"
727
+
728
+
[[package]]
729
name = "compression-codecs"
730
version = "0.4.36"
731
source = "registry+https://github.com/rust-lang/crates.io-index"
···
765
"deadpool-postgres",
766
"eyre",
767
"figment",
768
+
"foldhash 0.1.5",
0
769
"futures",
770
"ipld-core",
771
"iroh-car",
···
777
"parakeet-db",
778
"parakeet-index",
779
"redis",
780
+
"repo-stream",
781
"reqwest",
782
"serde",
783
"serde_bytes",
···
879
source = "registry+https://github.com/rust-lang/crates.io-index"
880
checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e"
881
dependencies = [
882
+
"crossbeam-utils",
883
+
]
884
+
885
+
[[package]]
886
+
name = "crossbeam-skiplist"
887
+
version = "0.1.3"
888
+
source = "registry+https://github.com/rust-lang/crates.io-index"
889
+
checksum = "df29de440c58ca2cc6e587ec3d22347551a32435fbde9d2bff64e78a9ffa151b"
890
+
dependencies = [
891
+
"crossbeam-epoch",
892
"crossbeam-utils",
893
]
894
···
1339
]
1340
1341
[[package]]
1342
+
name = "enum_dispatch"
1343
+
version = "0.3.13"
1344
+
source = "registry+https://github.com/rust-lang/crates.io-index"
1345
+
checksum = "aa18ce2bc66555b3218614519ac839ddb759a7d6720732f979ef8d13be147ecd"
1346
+
dependencies = [
1347
+
"once_cell",
1348
+
"proc-macro2",
1349
+
"quote",
1350
+
"syn 2.0.114",
1351
+
]
1352
+
1353
+
[[package]]
1354
name = "equivalent"
1355
version = "1.0.2"
1356
source = "registry+https://github.com/rust-lang/crates.io-index"
···
1431
checksum = "1d674e81391d1e1ab681a28d99df07927c6d4aa5b027d7da16ba32d1d21ecd99"
1432
1433
[[package]]
1434
+
name = "fjall"
1435
+
version = "3.0.2"
1436
+
source = "registry+https://github.com/rust-lang/crates.io-index"
1437
+
checksum = "5a2799b4198427a08c774838e44d0b77f677208f19a1927671cd2cd36bb30d69"
1438
+
dependencies = [
1439
+
"byteorder-lite",
1440
+
"byteview",
1441
+
"dashmap",
1442
+
"flume",
1443
+
"log",
1444
+
"lsm-tree",
1445
+
"tempfile",
1446
+
"xxhash-rust",
1447
+
]
1448
+
1449
+
[[package]]
1450
name = "flate2"
1451
version = "1.1.8"
1452
source = "registry+https://github.com/rust-lang/crates.io-index"
···
1458
1459
[[package]]
1460
name = "flume"
1461
+
version = "0.12.0"
1462
source = "registry+https://github.com/rust-lang/crates.io-index"
1463
+
checksum = "5e139bc46ca777eb5efaf62df0ab8cc5fd400866427e56c68b22e414e53bd3be"
1464
dependencies = [
0
0
0
1465
"spin 0.9.8",
1466
]
1467
···
1476
version = "0.1.5"
1477
source = "registry+https://github.com/rust-lang/crates.io-index"
1478
checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2"
1479
+
1480
+
[[package]]
1481
+
name = "foldhash"
1482
+
version = "0.2.0"
1483
+
source = "registry+https://github.com/rust-lang/crates.io-index"
1484
+
checksum = "77ce24cb58228fbb8aa041425bb1050850ac19177686ea6e0f41a70416f56fdb"
1485
1486
[[package]]
1487
name = "foreign-types"
···
1774
source = "registry+https://github.com/rust-lang/crates.io-index"
1775
checksum = "9229cfe53dfd69f0609a49f65461bd93001ea1ef889cd5529dd176593f5338a1"
1776
dependencies = [
1777
+
"foldhash 0.1.5",
1778
]
1779
1780
[[package]]
···
1782
version = "0.16.1"
1783
source = "registry+https://github.com/rust-lang/crates.io-index"
1784
checksum = "841d1cc9bed7f9236f321df977030373f4a4163ae1a7dbfe1a51a2c1a51d9100"
1785
+
dependencies = [
1786
+
"allocator-api2",
1787
+
"equivalent",
1788
+
"foldhash 0.2.0",
1789
+
]
1790
1791
[[package]]
1792
name = "headers"
···
2246
]
2247
2248
[[package]]
2249
+
name = "interval-heap"
2250
+
version = "0.0.5"
2251
+
source = "registry+https://github.com/rust-lang/crates.io-index"
2252
+
checksum = "11274e5e8e89b8607cfedc2910b6626e998779b48a019151c7604d0adcb86ac6"
2253
+
dependencies = [
2254
+
"compare",
2255
+
]
2256
+
2257
+
[[package]]
2258
name = "inventory"
2259
version = "0.3.21"
2260
source = "registry+https://github.com/rust-lang/crates.io-index"
···
2683
checksum = "112b39cec0b298b6c1999fee3e31427f74f676e4cb9879ed1a121b43661a4154"
2684
2685
[[package]]
2686
+
name = "lsm-tree"
2687
+
version = "3.0.2"
2688
+
source = "registry+https://github.com/rust-lang/crates.io-index"
2689
+
checksum = "86e8d0b8e0cf2531a437788ce94d95570dbaabfe9888db20022c2d5ccec9b221"
2690
+
dependencies = [
2691
+
"byteorder-lite",
2692
+
"byteview",
2693
+
"crossbeam-skiplist",
2694
+
"enum_dispatch",
2695
+
"interval-heap",
2696
+
"log",
2697
+
"quick_cache",
2698
+
"rustc-hash",
2699
+
"self_cell",
2700
+
"sfa",
2701
+
"tempfile",
2702
+
"varint-rs",
2703
+
"xxhash-rust",
2704
+
]
2705
+
2706
+
[[package]]
2707
name = "lz4-sys"
2708
version = "1.11.1+lz4-1.10.0"
2709
source = "registry+https://github.com/rust-lang/crates.io-index"
···
2941
"wasm-bindgen",
2942
"wasm-bindgen-futures",
2943
"web-time",
0
0
0
0
0
0
0
0
0
2944
]
2945
2946
[[package]]
···
3736
]
3737
3738
[[package]]
3739
+
name = "quick_cache"
3740
+
version = "0.6.18"
3741
+
source = "registry+https://github.com/rust-lang/crates.io-index"
3742
+
checksum = "7ada44a88ef953a3294f6eb55d2007ba44646015e18613d2f213016379203ef3"
3743
+
dependencies = [
3744
+
"equivalent",
3745
+
"hashbrown 0.16.1",
3746
+
]
3747
+
3748
+
[[package]]
3749
name = "quinn"
3750
version = "0.11.9"
3751
source = "registry+https://github.com/rust-lang/crates.io-index"
···
3996
checksum = "7a2d987857b319362043e95f5353c0535c1f58eec5336fdfcf626430af7def58"
3997
3998
[[package]]
3999
+
name = "repo-stream"
4000
+
version = "0.4.0"
4001
+
source = "registry+https://github.com/rust-lang/crates.io-index"
4002
+
checksum = "6c95af4a1465ac3a5a5aa4921b60f8761c1e126deb53a5b1a2e40abd0ec6ae1f"
4003
+
dependencies = [
4004
+
"cid",
4005
+
"fjall",
4006
+
"hashbrown 0.16.1",
4007
+
"iroh-car",
4008
+
"log",
4009
+
"serde",
4010
+
"serde_bytes",
4011
+
"serde_ipld_dagcbor",
4012
+
"sha2",
4013
+
"thiserror 2.0.18",
4014
+
"tokio",
4015
+
]
4016
+
4017
+
[[package]]
4018
name = "reqwest"
4019
version = "0.12.28"
4020
source = "registry+https://github.com/rust-lang/crates.io-index"
···
4317
]
4318
4319
[[package]]
4320
+
name = "self_cell"
4321
+
version = "1.2.2"
4322
+
source = "registry+https://github.com/rust-lang/crates.io-index"
4323
+
checksum = "b12e76d157a900eb52e81bc6e9f3069344290341720e9178cde2407113ac8d89"
4324
+
4325
+
[[package]]
4326
name = "semver"
4327
version = "1.0.27"
4328
source = "registry+https://github.com/rust-lang/crates.io-index"
···
4493
"proc-macro2",
4494
"quote",
4495
"syn 2.0.114",
4496
+
]
4497
+
4498
+
[[package]]
4499
+
name = "sfa"
4500
+
version = "1.0.0"
4501
+
source = "registry+https://github.com/rust-lang/crates.io-index"
4502
+
checksum = "a1296838937cab56cd6c4eeeb8718ec777383700c33f060e2869867bd01d1175"
4503
+
dependencies = [
4504
+
"byteorder-lite",
4505
+
"log",
4506
+
"xxhash-rust",
4507
]
4508
4509
[[package]]
···
5538
checksum = "ba73ea9cf16a25df0c8caa16c51acb937d5712a8429db78a3ee29d5dcacd3a65"
5539
5540
[[package]]
5541
+
name = "varint-rs"
5542
+
version = "2.2.0"
5543
+
source = "registry+https://github.com/rust-lang/crates.io-index"
5544
+
checksum = "8f54a172d0620933a27a4360d3db3e2ae0dd6cceae9730751a036bbf182c4b23"
5545
+
5546
+
[[package]]
5547
name = "vcpkg"
5548
version = "0.2.15"
5549
source = "registry+https://github.com/rust-lang/crates.io-index"
···
6059
version = "0.6.2"
6060
source = "registry+https://github.com/rust-lang/crates.io-index"
6061
checksum = "9edde0db4769d2dc68579893f2306b26c6ecfbe0ef499b013d731b7b9247e0b9"
6062
+
6063
+
[[package]]
6064
+
name = "xxhash-rust"
6065
+
version = "0.8.15"
6066
+
source = "registry+https://github.com/rust-lang/crates.io-index"
6067
+
checksum = "fdd20c5420375476fbd4394763288da7eb0cc0b8c11deed431a91562af7335d3"
6068
6069
[[package]]
6070
name = "yansi"
+1
-1
crates/consumer/Cargo.toml
···
10
deadpool-postgres = { version = "0.14", features = ["serde"] }
11
eyre = "0.6.12"
12
figment = { version = "0.10.19", features = ["env", "toml"] }
13
-
flume = { version = "0.11", features = ["async"] }
14
foldhash = "0.1"
15
futures = "0.3.31"
16
ipld-core = "0.4"
···
38
tokio-util = { version = "0.7", features = ["io", "rt"] }
39
tracing = "0.1"
40
tracing-subscriber = { version = "0.3", features = ["env-filter", "json"] }
0
···
10
deadpool-postgres = { version = "0.14", features = ["serde"] }
11
eyre = "0.6.12"
12
figment = { version = "0.10.19", features = ["env", "toml"] }
0
13
foldhash = "0.1"
14
futures = "0.3.31"
15
ipld-core = "0.4"
···
37
tokio-util = { version = "0.7", features = ["io", "rt"] }
38
tracing = "0.1"
39
tracing-subscriber = { version = "0.3", features = ["env-filter", "json"] }
40
+
repo-stream = "0.4.0"
-338
crates/consumer/src/backfill/downloader.rs
···
1
-
use super::DL_DONE_KEY;
2
-
use crate::db;
3
-
use chrono::prelude::*;
4
-
use deadpool_postgres::{Client as PgClient, Pool};
5
-
use futures::TryStreamExt;
6
-
use jacquard_common::types::did::Did;
7
-
use jacquard_common::types::string::Handle;
8
-
use jacquard_identity::resolver::IdentityResolver;
9
-
use jacquard_identity::JacquardResolver;
10
-
use metrics::{counter, histogram};
11
-
use parakeet_db::types::{ActorStatus, ActorSyncState};
12
-
use redis::aio::MultiplexedConnection;
13
-
use redis::AsyncTypedCommands;
14
-
use reqwest::header::HeaderMap;
15
-
use reqwest::Client as HttpClient;
16
-
use std::path::{Path, PathBuf};
17
-
use tokio::sync::watch::Receiver as WatchReceiver;
18
-
use tokio::time::{Duration, Instant};
19
-
use tokio_postgres::types::Type;
20
-
use tokio_util::io::StreamReader;
21
-
use tokio_util::task::TaskTracker;
22
-
use tracing::instrument;
23
-
24
-
const BF_RESET_KEY: &str = "bf_download_ratelimit_reset";
25
-
const BF_REM_KEY: &str = "bf_download_ratelimit_rem";
26
-
const DL_DUP_KEY: &str = "bf_downloaded";
27
-
28
-
pub async fn downloader(
29
-
mut rc: MultiplexedConnection,
30
-
pool: Pool,
31
-
resolver: JacquardResolver,
32
-
tmp_dir: PathBuf,
33
-
concurrency: usize,
34
-
buffer: usize,
35
-
tracker: TaskTracker,
36
-
stop: WatchReceiver<bool>,
37
-
) {
38
-
let (tx, rx) = flume::bounded(64);
39
-
let mut conn = pool.get().await.unwrap();
40
-
41
-
let http = HttpClient::new();
42
-
43
-
for _ in 0..concurrency {
44
-
tracker.spawn(download_thread(
45
-
rc.clone(),
46
-
pool.clone(),
47
-
resolver.clone(),
48
-
http.clone(),
49
-
rx.clone(),
50
-
tmp_dir.clone(),
51
-
));
52
-
}
53
-
54
-
let status_stmt = conn.prepare_typed_cached(
55
-
"INSERT INTO actors (did, sync_state, last_indexed) VALUES ($1, 'processing', NOW()) ON CONFLICT (did) DO UPDATE SET sync_state = 'processing', last_indexed=NOW()",
56
-
&[Type::TEXT]
57
-
).await.unwrap();
58
-
59
-
loop {
60
-
if stop.has_changed().unwrap_or(true) {
61
-
tracing::info!("stopping downloader");
62
-
break;
63
-
}
64
-
65
-
if let Ok(count) = rc.llen(DL_DONE_KEY).await {
66
-
if count > buffer {
67
-
tracing::info!("waiting due to full buffer");
68
-
tokio::time::sleep(Duration::from_secs(5)).await;
69
-
continue;
70
-
}
71
-
}
72
-
73
-
let did: String = match rc.lpop("backfill_queue", None).await {
74
-
Ok(Some(did)) => did,
75
-
Ok(None) => {
76
-
tokio::time::sleep(Duration::from_millis(250)).await;
77
-
continue;
78
-
}
79
-
Err(e) => {
80
-
tracing::error!("failed to get item from backfill queue: {e}");
81
-
continue;
82
-
}
83
-
};
84
-
85
-
tracing::trace!("resolving repo {did}");
86
-
87
-
// has the repo already been downloaded?
88
-
if rc.sismember(DL_DUP_KEY, &did).await.unwrap_or_default() {
89
-
tracing::info!("skipping duplicate repo {did}");
90
-
continue;
91
-
}
92
-
93
-
// check if they're already synced in DB too
94
-
match db::actor_get_statuses(&mut conn, &did).await {
95
-
Ok(Some((_, state))) => {
96
-
if state == ActorSyncState::Synced || state == ActorSyncState::Processing {
97
-
tracing::info!("skipping duplicate repo {did}");
98
-
continue;
99
-
}
100
-
}
101
-
Ok(None) => {}
102
-
Err(e) => {
103
-
tracing::error!(did, "failed to check current repo status: {e}");
104
-
db::backfill_job_write(&mut conn, &did, "failed.resolve")
105
-
.await
106
-
.unwrap();
107
-
}
108
-
}
109
-
110
-
let jd = Did::raw(&did);
111
-
match resolver
112
-
.resolve_did_doc(&jd)
113
-
.await
114
-
.and_then(|v| v.into_owned())
115
-
{
116
-
Ok(did_doc) => {
117
-
let Some(service) = did_doc.pds_endpoint() else {
118
-
tracing::warn!("bad DID doc for {did}");
119
-
db::backfill_job_write(&mut conn, &did, "failed.resolve.did_svc")
120
-
.await
121
-
.unwrap();
122
-
continue;
123
-
};
124
-
let service = service.to_string();
125
-
126
-
// set the repo to processing
127
-
if let Err(e) = conn.execute(&status_stmt, &[&did.as_str()]).await {
128
-
tracing::error!("failed to update repo status for {did}: {e}");
129
-
continue;
130
-
}
131
-
132
-
let handle = did_doc.handles().first().cloned();
133
-
134
-
tracing::trace!("resolved repo {did} {service}");
135
-
if let Err(e) = tx.send_async((service, did, handle)).await {
136
-
tracing::error!("failed to send: {e}");
137
-
}
138
-
}
139
-
Err(e) => {
140
-
tracing::error!(did = did.as_str(), "failed to resolve DID doc: {e}");
141
-
db::actor_set_sync_status(&mut conn, &did, ActorSyncState::Dirty, Utc::now())
142
-
.await
143
-
.unwrap();
144
-
db::backfill_job_write(&mut conn, &did, "failed.resolve.did")
145
-
.await
146
-
.unwrap();
147
-
}
148
-
}
149
-
}
150
-
}
151
-
152
-
async fn download_thread(
153
-
mut rc: MultiplexedConnection,
154
-
pool: Pool,
155
-
resolver: JacquardResolver,
156
-
http: reqwest::Client,
157
-
rx: flume::Receiver<(String, String, Option<Handle<'static>>)>,
158
-
tmp_dir: PathBuf,
159
-
) {
160
-
tracing::debug!("spawning thread");
161
-
162
-
// this will return Err(_) and exit when all senders (only held above) are dropped
163
-
while let Ok((pds, did, maybe_handle)) = rx.recv_async().await {
164
-
if let Err(e) = enforce_ratelimit(&mut rc, &pds).await {
165
-
tracing::error!("ratelimiter error: {e}");
166
-
continue;
167
-
};
168
-
169
-
{
170
-
tracing::trace!("getting DB conn...");
171
-
let mut conn = pool.get().await.unwrap();
172
-
tracing::trace!("got DB conn...");
173
-
match check_and_update_repo_status(&http, &mut conn, &pds, &did).await {
174
-
Ok(true) => {}
175
-
Ok(false) => continue,
176
-
Err(e) => {
177
-
tracing::error!(pds, did, "failed to check repo status: {e}");
178
-
db::backfill_job_write(&mut conn, &did, "failed.resolve.status")
179
-
.await
180
-
.unwrap();
181
-
continue;
182
-
}
183
-
}
184
-
185
-
tracing::debug!("trying to resolve handle...");
186
-
if let Some(handle) = maybe_handle {
187
-
if let Err(e) = resolve_and_set_handle(&conn, &resolver, &did, &handle).await {
188
-
tracing::error!(pds, did = did.as_str(), "failed to resolve handle: {e}");
189
-
db::backfill_job_write(&mut conn, &did, "failed.resolve.handle")
190
-
.await
191
-
.unwrap();
192
-
}
193
-
}
194
-
}
195
-
196
-
let start = Instant::now();
197
-
198
-
tracing::trace!("downloading repo {did}");
199
-
200
-
match download_car(&http, &tmp_dir, &pds, &did).await {
201
-
Ok(Some((rem, reset))) => {
202
-
let _ = rc.zadd(BF_REM_KEY, &pds, rem).await;
203
-
let _ = rc.zadd(BF_RESET_KEY, &pds, reset).await;
204
-
}
205
-
Ok(_) => tracing::debug!(pds, "got response with no ratelimit headers."),
206
-
Err(e) => {
207
-
tracing::error!(pds, did, "failed to download repo: {e}");
208
-
continue;
209
-
}
210
-
}
211
-
212
-
histogram!("backfill_download_dur", "pds" => pds).record(start.elapsed().as_secs_f64());
213
-
214
-
let _ = rc.sadd(DL_DUP_KEY, &did).await;
215
-
if let Err(e) = rc.rpush(DL_DONE_KEY, &did).await {
216
-
tracing::error!(did, "failed to mark download complete: {e}");
217
-
} else {
218
-
counter!("backfill_downloaded").increment(1);
219
-
}
220
-
}
221
-
222
-
tracing::debug!("thread exiting");
223
-
}
224
-
225
-
async fn enforce_ratelimit(rc: &mut MultiplexedConnection, pds: &str) -> eyre::Result<()> {
226
-
let score = rc.zscore(BF_REM_KEY, pds).await?;
227
-
228
-
if let Some(rem) = score {
229
-
if (rem as i32) < 100 {
230
-
// if we've got None for some reason, just hope that the next req will contain the reset header.
231
-
if let Some(at) = rc.zscore(BF_RESET_KEY, pds).await? {
232
-
tracing::debug!("rate limit for {pds} resets at {at}");
233
-
let time = chrono::DateTime::from_timestamp(at as i64, 0).unwrap();
234
-
let delta = (time - Utc::now()).num_milliseconds().max(0);
235
-
236
-
tokio::time::sleep(Duration::from_millis(delta as u64)).await;
237
-
};
238
-
}
239
-
}
240
-
241
-
Ok(())
242
-
}
243
-
244
-
// you wouldn't...
245
-
#[instrument(skip(http, tmp_dir, pds))]
246
-
async fn download_car(
247
-
http: &HttpClient,
248
-
tmp_dir: &Path,
249
-
pds: &str,
250
-
did: &str,
251
-
) -> eyre::Result<Option<(i32, i32)>> {
252
-
let res = http
253
-
.get(format!("{pds}xrpc/com.atproto.sync.getRepo?did={did}"))
254
-
.send()
255
-
.await?
256
-
.error_for_status()?;
257
-
258
-
let mut file = tokio::fs::File::create_new(tmp_dir.join(did)).await?;
259
-
260
-
let headers = res.headers();
261
-
let ratelimit_rem = header_to_int(headers, "ratelimit-remaining");
262
-
let ratelimit_reset = header_to_int(headers, "ratelimit-reset");
263
-
264
-
let strm = res.bytes_stream().map_err(std::io::Error::other);
265
-
let mut reader = StreamReader::new(strm);
266
-
267
-
tokio::io::copy(&mut reader, &mut file).await?;
268
-
269
-
Ok(ratelimit_rem.zip(ratelimit_reset))
270
-
}
271
-
272
-
// there's no ratelimit handling here because we pretty much always call download_car after.
273
-
#[instrument(skip(http, conn, pds))]
274
-
async fn check_and_update_repo_status(
275
-
http: &HttpClient,
276
-
conn: &mut PgClient,
277
-
pds: &str,
278
-
repo: &str,
279
-
) -> eyre::Result<bool> {
280
-
match super::check_pds_repo_status(http, pds, repo).await? {
281
-
Some(status) => {
282
-
if !status.active {
283
-
tracing::debug!("repo is inactive");
284
-
285
-
let status = status
286
-
.status
287
-
.unwrap_or(crate::firehose::AtpAccountStatus::Deleted);
288
-
conn.execute(
289
-
"UPDATE actors SET sync_state='dirty', status=$2 WHERE did=$1",
290
-
&[&repo, &ActorStatus::from(status)],
291
-
)
292
-
.await?;
293
-
294
-
Ok(false)
295
-
} else {
296
-
Ok(true)
297
-
}
298
-
}
299
-
None => {
300
-
// this repo can't be found - set dirty and assume deleted.
301
-
tracing::debug!("repo was deleted");
302
-
conn.execute(
303
-
"UPDATE actors SET sync_state='dirty', status='deleted' WHERE did=$1",
304
-
&[&repo],
305
-
)
306
-
.await?;
307
-
308
-
Ok(false)
309
-
}
310
-
}
311
-
}
312
-
313
-
async fn resolve_and_set_handle(
314
-
conn: &PgClient,
315
-
resolver: &JacquardResolver,
316
-
did: &str,
317
-
handle: &Handle<'_>,
318
-
) -> eyre::Result<()> {
319
-
let handle_did = resolver.resolve_handle(handle).await?;
320
-
if handle_did == Did::raw(did) {
321
-
conn.execute(
322
-
"UPDATE actors SET handle=$2 WHERE did=$1",
323
-
&[&did, &handle.as_str()],
324
-
)
325
-
.await?;
326
-
} else {
327
-
tracing::warn!("requested DID ({did}) doesn't match handle");
328
-
}
329
-
330
-
Ok(())
331
-
}
332
-
333
-
fn header_to_int(headers: &HeaderMap, name: &str) -> Option<i32> {
334
-
headers
335
-
.get(name)
336
-
.and_then(|v| v.to_str().ok())
337
-
.and_then(|v| v.parse().ok())
338
-
}
···
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
+128
-65
crates/consumer/src/backfill/mod.rs
···
5
use chrono::prelude::*;
6
use deadpool_postgres::{Object, Pool, Transaction};
7
use ipld_core::cid::Cid;
0
8
use jacquard_identity::JacquardResolver;
9
use lexica::StrongRef;
10
use metrics::counter;
11
-
use parakeet_db::types::{ActorStatus, ActorSyncState};
12
use redis::aio::MultiplexedConnection;
13
use redis::AsyncTypedCommands;
14
-
use reqwest::{Client, StatusCode};
15
use std::path::PathBuf;
16
use std::str::FromStr;
17
use std::sync::Arc;
0
18
use tokio::sync::watch::Receiver as WatchReceiver;
19
use tokio::sync::Semaphore;
20
use tokio_util::task::TaskTracker;
21
use tracing::instrument;
22
23
-
mod downloader;
24
mod repo;
25
-
mod types;
26
27
-
const DL_DONE_KEY: &str = "bf_download_complete";
28
-
const PDS_SERVICE_ID: &str = "#atproto_pds";
29
// There's a 4MiB limit on parakeet-index, so break delta batches up if there's loads.
30
// this should be plenty low enough to not trigger the size limit. (59k did slightly)
31
const DELTA_BATCH_SIZE: usize = 32 * 1024;
···
34
pub struct BackfillManagerInner {
35
index_client: Option<parakeet_index::Client>,
36
tmp_dir: PathBuf,
0
0
37
}
38
39
pub struct BackfillManager {
40
pool: Pool,
41
redis: MultiplexedConnection,
42
-
resolver: JacquardResolver,
43
semaphore: Arc<Semaphore>,
44
-
opts: BackfillConfig,
45
inner: BackfillManagerInner,
46
}
47
···
55
) -> eyre::Result<Self> {
56
let semaphore = Arc::new(Semaphore::new(opts.workers as usize));
57
0
0
58
Ok(BackfillManager {
59
pool,
60
redis,
61
-
resolver,
62
semaphore,
63
inner: BackfillManagerInner {
64
index_client,
65
tmp_dir: PathBuf::from_str(&opts.download_tmp_dir)?,
0
0
66
},
67
-
opts,
68
})
69
}
70
71
pub async fn run(mut self, stop: WatchReceiver<bool>) -> eyre::Result<()> {
72
let tracker = TaskTracker::new();
73
74
-
tracker.spawn(downloader::downloader(
75
-
self.redis.clone(),
76
-
self.pool.clone(),
77
-
self.resolver,
78
-
self.inner.tmp_dir.clone(),
79
-
self.opts.download_workers,
80
-
self.opts.download_buffer,
81
-
tracker.clone(),
82
-
stop.clone(),
83
-
));
84
-
85
loop {
86
if stop.has_changed().unwrap_or(true) {
87
tracker.close();
···
89
break;
90
}
91
92
-
let Some(job): Option<String> = self.redis.lpop(DL_DONE_KEY, None).await? else {
93
-
tokio::time::sleep(tokio::time::Duration::from_millis(100)).await;
94
-
continue;
0
0
0
0
0
0
0
95
};
96
97
let p = self.semaphore.clone().acquire_owned().await?;
98
99
-
let mut inner = self.inner.clone();
100
let mut conn = self.pool.get().await?;
101
-
let mut rc = self.redis.clone();
102
103
tracker.spawn(async move {
104
let _p = p;
105
-
tracing::trace!("backfilling {job}");
106
107
-
if let Err(e) = backfill_actor(&mut conn, &mut rc, &mut inner, &job).await {
108
-
tracing::error!(did = &job, "backfill failed: {e}");
109
counter!("backfill_failure").increment(1);
110
-
111
-
db::backfill_job_write(&mut conn, &job, "failed.write")
112
-
.await
113
-
.unwrap();
114
} else {
115
counter!("backfill_success").increment(1);
116
117
-
db::backfill_job_write(&mut conn, &job, "successful")
118
.await
119
.unwrap();
120
-
}
121
-
122
-
if let Err(e) = tokio::fs::remove_file(inner.tmp_dir.join(&job)).await {
123
-
tracing::error!(did = &job, "failed to remove file: {e}");
124
}
125
});
126
}
···
131
}
132
}
133
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
134
#[instrument(skip(conn, rc, inner))]
135
-
async fn backfill_actor(
136
conn: &mut Object,
137
rc: &mut MultiplexedConnection,
138
inner: &mut BackfillManagerInner,
0
139
did: &str,
140
-
) -> eyre::Result<()> {
141
let mut t = conn.transaction().await?;
142
t.execute("SET CONSTRAINTS ALL DEFERRED", &[]).await?;
143
144
tracing::trace!("loading repo");
145
146
-
let (commit, mut deltas, copies) = repo::insert_repo(&mut t, rc, &inner.tmp_dir, did).await?;
0
0
147
148
-
db::actor_set_repo_state(&mut t, did, &commit.rev, commit.data).await?;
149
150
-
copies.submit(&mut t, did).await?;
151
152
t.execute(
153
"UPDATE actors SET sync_state=$2, last_indexed=$3 WHERE did=$1",
···
155
)
156
.await?;
157
158
-
handle_backfill_rows(&mut t, rc, &mut deltas, did, &commit.rev).await?;
159
160
tracing::trace!("insertion finished");
161
162
if let Some(index_client) = &mut inner.index_client {
163
// submit the deltas
164
-
let delta_store = deltas
0
165
.into_iter()
166
.map(|((uri, typ), delta)| parakeet_index::AggregateDeltaReq {
167
typ,
···
190
191
t.commit().await?;
192
193
-
Ok(())
194
}
195
196
async fn handle_backfill_rows(
···
245
db::backfill_delete_rows(conn, repo).await?;
246
247
Ok(())
248
-
}
249
-
250
-
async fn check_pds_repo_status(
251
-
client: &Client,
252
-
pds: &str,
253
-
repo: &str,
254
-
) -> eyre::Result<Option<types::GetRepoStatusRes>> {
255
-
let res = client
256
-
.get(format!(
257
-
"{pds}xrpc/com.atproto.sync.getRepoStatus?did={repo}"
258
-
))
259
-
.send()
260
-
.await?;
261
-
262
-
if [StatusCode::NOT_FOUND, StatusCode::BAD_REQUEST].contains(&res.status()) {
263
-
return Ok(None);
264
-
}
265
-
266
-
Ok(res.json().await?)
267
}
268
269
#[derive(Debug, Default)]
···
5
use chrono::prelude::*;
6
use deadpool_postgres::{Object, Pool, Transaction};
7
use ipld_core::cid::Cid;
8
+
use jacquard_common::types::did::Did;
9
use jacquard_identity::JacquardResolver;
10
use lexica::StrongRef;
11
use metrics::counter;
12
+
use parakeet_db::types::ActorSyncState;
13
use redis::aio::MultiplexedConnection;
14
use redis::AsyncTypedCommands;
15
+
use reqwest::Client;
16
use std::path::PathBuf;
17
use std::str::FromStr;
18
use std::sync::Arc;
19
+
use std::time::Duration;
20
use tokio::sync::watch::Receiver as WatchReceiver;
21
use tokio::sync::Semaphore;
22
use tokio_util::task::TaskTracker;
23
use tracing::instrument;
24
0
25
mod repo;
26
+
mod utils;
27
28
+
const DL_DUP_KEY: &str = "bf_completed";
0
29
// There's a 4MiB limit on parakeet-index, so break delta batches up if there's loads.
30
// this should be plenty low enough to not trigger the size limit. (59k did slightly)
31
const DELTA_BATCH_SIZE: usize = 32 * 1024;
···
34
pub struct BackfillManagerInner {
35
index_client: Option<parakeet_index::Client>,
36
tmp_dir: PathBuf,
37
+
resolver: JacquardResolver,
38
+
client: Client,
39
}
40
41
pub struct BackfillManager {
42
pool: Pool,
43
redis: MultiplexedConnection,
0
44
semaphore: Arc<Semaphore>,
0
45
inner: BackfillManagerInner,
46
}
47
···
55
) -> eyre::Result<Self> {
56
let semaphore = Arc::new(Semaphore::new(opts.workers as usize));
57
58
+
let client = Client::builder().brotli(true).build()?;
59
+
60
Ok(BackfillManager {
61
pool,
62
redis,
0
63
semaphore,
64
inner: BackfillManagerInner {
65
index_client,
66
tmp_dir: PathBuf::from_str(&opts.download_tmp_dir)?,
67
+
resolver,
68
+
client,
69
},
0
70
})
71
}
72
73
pub async fn run(mut self, stop: WatchReceiver<bool>) -> eyre::Result<()> {
74
let tracker = TaskTracker::new();
75
0
0
0
0
0
0
0
0
0
0
0
76
loop {
77
if stop.has_changed().unwrap_or(true) {
78
tracker.close();
···
80
break;
81
}
82
83
+
let did: String = match self.redis.lpop("backfill_queue", None).await {
84
+
Ok(Some(did)) => did,
85
+
Ok(None) => {
86
+
tokio::time::sleep(Duration::from_millis(250)).await;
87
+
continue;
88
+
}
89
+
Err(e) => {
90
+
tracing::error!("failed to get item from backfill queue: {e}");
91
+
continue;
92
+
}
93
};
94
95
let p = self.semaphore.clone().acquire_owned().await?;
96
97
+
let inner = self.inner.clone();
98
let mut conn = self.pool.get().await?;
99
+
let rc = self.redis.clone();
100
101
tracker.spawn(async move {
102
let _p = p;
103
+
tracing::trace!("backfilling {did}");
104
105
+
if let Err(e) = do_actor_backfill(&mut conn, rc, inner, &did).await {
106
+
tracing::error!(did, "backfill failed: {e}");
107
counter!("backfill_failure").increment(1);
0
0
0
0
108
} else {
109
counter!("backfill_success").increment(1);
110
111
+
db::backfill_job_write(&mut conn, &did, "successful")
112
.await
113
.unwrap();
0
0
0
0
114
}
115
});
116
}
···
121
}
122
}
123
124
+
async fn do_actor_backfill(
125
+
conn: &mut Object,
126
+
mut rc: MultiplexedConnection,
127
+
mut inner: BackfillManagerInner,
128
+
did: &str,
129
+
) -> eyre::Result<()> {
130
+
// has the repo already been downloaded?
131
+
if rc.sismember(DL_DUP_KEY, did).await.unwrap_or_default() {
132
+
tracing::info!("skipping duplicate repo {did}");
133
+
return Ok(());
134
+
}
135
+
136
+
// check if they're already synced in DB too
137
+
match db::actor_get_statuses(conn, did).await {
138
+
Ok(Some((_, state))) => {
139
+
if state == ActorSyncState::Synced || state == ActorSyncState::Processing {
140
+
tracing::info!("skipping duplicate repo {did}");
141
+
return Ok(());
142
+
}
143
+
}
144
+
Ok(None) => {}
145
+
Err(e) => {
146
+
tracing::error!(did, "failed to check current repo status: {e}");
147
+
db::backfill_job_write(conn, did, "failed.resolve").await?;
148
+
}
149
+
}
150
+
151
+
// set the repo to processing
152
+
conn.execute(
153
+
r#"INSERT INTO actors (did, sync_state, last_indexed) VALUES ($1, 'processing', NOW())
154
+
ON CONFLICT (did) DO UPDATE SET sync_state = 'processing', last_indexed=NOW()"#,
155
+
&[&did],
156
+
)
157
+
.await?;
158
+
159
+
let jd = Did::raw(did);
160
+
let (pds, handle) = match utils::resolve_service(&inner.resolver, &jd).await {
161
+
Ok(Some(data)) => data,
162
+
Ok(None) => {
163
+
tracing::warn!("bad DID doc for {did}");
164
+
db::actor_set_sync_status(conn, did, ActorSyncState::Dirty, Utc::now()).await?;
165
+
db::backfill_job_write(conn, did, "failed.resolve.did_svc").await?;
166
+
return Ok(());
167
+
}
168
+
Err(e) => {
169
+
tracing::error!(did, "failed to resolve DID doc: {e}");
170
+
db::actor_set_sync_status(conn, did, ActorSyncState::Dirty, Utc::now()).await?;
171
+
db::backfill_job_write(conn, did, "failed.resolve.did").await?;
172
+
return Ok(());
173
+
}
174
+
};
175
+
176
+
match utils::check_and_update_repo_status(&reqwest::Client::new(), conn, &pds, did).await {
177
+
Ok(true) => {}
178
+
Ok(false) => return Ok(()),
179
+
Err(e) => {
180
+
tracing::error!(pds, did, "failed to check repo status: {e}");
181
+
db::actor_set_sync_status(conn, did, ActorSyncState::Dirty, Utc::now()).await?;
182
+
db::backfill_job_write(conn, did, "failed.resolve.status").await?;
183
+
}
184
+
}
185
+
186
+
if let Some(handle) = handle {
187
+
// validate the handle and set it. we don't fail the whole backfill if this fails bc it's not
188
+
// really *that* important.
189
+
if let Err(e) = utils::resolve_and_set_handle(conn, &inner.resolver, &jd, handle).await {
190
+
tracing::warn!(pds, did, "failed to resolve handle: {e}");
191
+
}
192
+
}
193
+
194
+
utils::enforce_ratelimit(&mut rc, &pds).await?;
195
+
196
+
match backfill_repo(conn, &mut rc, &mut inner, &pds, did).await {
197
+
Ok(Some((rem, reset))) => {
198
+
let _ = rc.zadd(utils::BF_REM_KEY, &pds, rem).await;
199
+
let _ = rc.zadd(utils::BF_RESET_KEY, &pds, reset).await;
200
+
}
201
+
Ok(None) => tracing::debug!(pds, "got response with no ratelimit headers."),
202
+
Err(e) => {
203
+
tracing::error!(did, "backfill failed: {e}");
204
+
db::actor_set_sync_status(conn, did, ActorSyncState::Dirty, Utc::now()).await?;
205
+
db::backfill_job_write(conn, did, "failed.write").await?;
206
+
}
207
+
}
208
+
209
+
Ok(())
210
+
}
211
+
212
#[instrument(skip(conn, rc, inner))]
213
+
async fn backfill_repo(
214
conn: &mut Object,
215
rc: &mut MultiplexedConnection,
216
inner: &mut BackfillManagerInner,
217
+
pds: &str,
218
did: &str,
219
+
) -> eyre::Result<Option<(i32, i32)>> {
220
let mut t = conn.transaction().await?;
221
t.execute("SET CONSTRAINTS ALL DEFERRED", &[]).await?;
222
223
tracing::trace!("loading repo");
224
225
+
let mut completed =
226
+
repo::process_repo(&mut t, rc, &inner.client, &inner.tmp_dir, pds, did).await?;
227
+
let rev = &completed.commit.rev;
228
229
+
db::actor_set_repo_state(&mut t, did, rev, completed.commit.data).await?;
230
231
+
completed.copies.submit(&mut t, did).await?;
232
233
t.execute(
234
"UPDATE actors SET sync_state=$2, last_indexed=$3 WHERE did=$1",
···
236
)
237
.await?;
238
239
+
handle_backfill_rows(&mut t, rc, &mut completed.deltas, did, rev).await?;
240
241
tracing::trace!("insertion finished");
242
243
if let Some(index_client) = &mut inner.index_client {
244
// submit the deltas
245
+
let delta_store = completed
246
+
.deltas
247
.into_iter()
248
.map(|((uri, typ), delta)| parakeet_index::AggregateDeltaReq {
249
typ,
···
272
273
t.commit().await?;
274
275
+
Ok(completed.ratelimit_rem.zip(completed.ratelimit_reset))
276
}
277
278
async fn handle_backfill_rows(
···
327
db::backfill_delete_rows(conn, repo).await?;
328
329
Ok(())
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
330
}
331
332
#[derive(Debug, Default)]
+90
-66
crates/consumer/src/backfill/repo.rs
···
1
-
use super::{
2
-
types::{CarCommitEntry, CarEntry, CarRecordEntry},
3
-
CopyStore,
4
-
};
5
use crate::indexer::records;
6
use crate::indexer::types::{AggregateDeltaStore, RecordTypes};
7
use crate::utils::at_uri_is_by;
8
use crate::{db, indexer};
9
use deadpool_postgres::Transaction;
0
10
use ipld_core::cid::Cid;
11
-
use iroh_car::CarReader;
12
use metrics::counter;
13
use parakeet_index::AggregateType;
14
use redis::aio::MultiplexedConnection;
0
0
0
15
use std::collections::HashMap;
16
use std::path::Path;
17
-
use tokio::io::BufReader;
18
19
type BackfillDeltaStore = HashMap<(String, i32), i32>;
20
21
-
pub async fn insert_repo(
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
22
t: &mut Transaction<'_>,
23
rc: &mut MultiplexedConnection,
0
24
tmp_dir: &Path,
25
-
repo: &str,
26
-
) -> eyre::Result<(CarCommitEntry, BackfillDeltaStore, CopyStore)> {
27
-
let car = tokio::fs::File::open(tmp_dir.join(repo)).await?;
28
-
let mut car_stream = CarReader::new(BufReader::new(car)).await?;
0
0
0
0
29
30
-
// the root should be the commit block
31
-
let root = car_stream.header().roots().first().cloned().unwrap();
0
32
33
-
let mut commit = None;
34
-
let mut mst_nodes: HashMap<Cid, String> = HashMap::new();
35
-
let mut records: HashMap<Cid, RecordTypes> = HashMap::new();
36
let mut deltas = HashMap::new();
37
let mut copies = CopyStore::default();
38
39
-
while let Some((cid, block)) = car_stream.next_block().await? {
40
-
let Ok(block) = serde_ipld_dagcbor::from_slice::<CarEntry>(&block) else {
41
-
tracing::warn!("failed to parse block {cid}");
42
-
continue;
43
-
};
0
0
44
45
-
if root == cid {
46
-
if let CarEntry::Commit(commit_entry) = block {
47
-
commit = Some(commit_entry);
48
-
} else {
49
-
tracing::warn!("root did not point to a commit entry");
50
-
}
51
-
continue;
52
-
}
0
0
0
0
53
54
-
match block {
55
-
CarEntry::Commit(_) => {
56
-
tracing::debug!("got commit entry that was not in root")
57
-
}
58
-
CarEntry::Record(CarRecordEntry::Known(record)) => {
59
-
if let Some(path) = mst_nodes.remove(&cid) {
60
-
record_index(t, rc, &mut copies, &mut deltas, repo, &path, cid, record).await?;
61
-
} else {
62
-
records.insert(cid, record);
63
}
64
}
65
-
CarEntry::Record(CarRecordEntry::Other { ty }) => {
66
-
tracing::debug!("repo contains unknown record type: {ty} ({cid})");
67
-
}
68
-
CarEntry::Mst(mst) => {
69
-
let mut out = Vec::with_capacity(mst.e.len());
70
71
-
for node in mst.e {
72
-
let ks = String::from_utf8_lossy(&node.k);
0
0
0
0
73
74
-
let key = if node.p == 0 {
75
-
ks.to_string()
76
-
} else {
77
-
let (_, prev): &(Cid, String) = out.last().unwrap();
78
-
let prefix = &prev[..node.p as usize];
79
80
-
format!("{prefix}{ks}")
0
0
0
0
0
0
0
0
0
0
81
};
82
83
-
out.push((node.v, key.to_string()));
84
}
85
-
86
-
mst_nodes.extend(out);
87
}
88
-
}
89
-
}
90
91
-
for (cid, record) in records {
92
-
if let Some(path) = mst_nodes.remove(&cid) {
93
-
record_index(t, rc, &mut copies, &mut deltas, repo, &path, cid, record).await?;
94
-
} else {
95
-
tracing::warn!("couldn't find MST node for record {cid}")
96
}
97
-
}
98
-
99
-
let Some(commit) = commit else {
100
-
eyre::bail!("repo contained no commit?");
101
};
102
103
-
Ok((commit, deltas, copies))
0
0
0
0
0
0
104
}
105
106
async fn record_index(
···
1
+
use super::{utils::header_to_int, CopyStore};
0
0
0
2
use crate::indexer::records;
3
use crate::indexer::types::{AggregateDeltaStore, RecordTypes};
4
use crate::utils::at_uri_is_by;
5
use crate::{db, indexer};
6
use deadpool_postgres::Transaction;
7
+
use futures::TryStreamExt;
8
use ipld_core::cid::Cid;
0
9
use metrics::counter;
10
use parakeet_index::AggregateType;
11
use redis::aio::MultiplexedConnection;
12
+
use repo_stream::{Commit, DiskBuilder, Driver, DriverBuilder};
13
+
use reqwest::Client;
14
+
use serde::Deserialize;
15
use std::collections::HashMap;
16
use std::path::Path;
17
+
use tokio_util::io::StreamReader;
18
19
type BackfillDeltaStore = HashMap<(String, i32), i32>;
20
21
+
#[derive(Debug, Deserialize)]
22
+
#[serde(untagged)]
23
+
pub enum CarRecordEntry {
24
+
Known(RecordTypes),
25
+
Other {
26
+
#[serde(rename = "$type")]
27
+
ty: String,
28
+
},
29
+
}
30
+
31
+
pub struct CompletedRepoData {
32
+
pub commit: Commit,
33
+
pub deltas: BackfillDeltaStore,
34
+
pub copies: CopyStore,
35
+
pub ratelimit_rem: Option<i32>,
36
+
pub ratelimit_reset: Option<i32>,
37
+
}
38
+
39
+
pub async fn process_repo(
40
t: &mut Transaction<'_>,
41
rc: &mut MultiplexedConnection,
42
+
client: &Client,
43
tmp_dir: &Path,
44
+
pds: &str,
45
+
did: &str,
46
+
) -> eyre::Result<CompletedRepoData> {
47
+
let res = client
48
+
.get(format!("{pds}xrpc/com.atproto.sync.getRepo?did={did}"))
49
+
.send()
50
+
.await?
51
+
.error_for_status()?;
52
53
+
let headers = res.headers();
54
+
let ratelimit_rem = header_to_int(headers, "ratelimit-remaining");
55
+
let ratelimit_reset = header_to_int(headers, "ratelimit-reset");
56
57
+
let strm = res.bytes_stream().map_err(std::io::Error::other);
58
+
let reader = StreamReader::new(strm);
59
+
60
let mut deltas = HashMap::new();
61
let mut copies = CopyStore::default();
62
63
+
let commit = match DriverBuilder::new().load_car(reader).await? {
64
+
Driver::Memory(commit, mut driver) => {
65
+
tracing::debug!("parsing with memory driver");
66
+
while let Some(chunk) = driver.next_chunk(256).await? {
67
+
for output in chunk {
68
+
let path = output.rkey;
69
+
let cid = output.cid;
70
71
+
let record = serde_ipld_dagcbor::from_slice::<CarRecordEntry>(&output.data);
72
+
let record = match record {
73
+
Ok(CarRecordEntry::Known(record)) => record,
74
+
Ok(CarRecordEntry::Other { ty }) => {
75
+
tracing::debug!("repo contains unknown record type: {ty} ({cid})");
76
+
continue;
77
+
}
78
+
Err(err) => {
79
+
tracing::warn!("failed to parse block {cid}: {err}");
80
+
continue;
81
+
}
82
+
};
83
84
+
record_index(t, rc, &mut copies, &mut deltas, did, &path, cid, record).await?;
0
0
0
0
0
0
0
0
85
}
86
}
0
0
0
0
0
87
88
+
commit
89
+
}
90
+
Driver::Disk(paused) => {
91
+
tracing::debug!("parsing with disk driver");
92
+
let disk_store = DiskBuilder::new().open(tmp_dir.join(did)).await?;
93
+
let (commit, mut driver) = paused.finish_loading(disk_store).await?;
94
95
+
while let Some(chunk) = driver.next_chunk(256).await? {
96
+
for output in chunk {
97
+
let path = output.rkey;
98
+
let cid = output.cid;
0
99
100
+
let record = serde_ipld_dagcbor::from_slice::<CarRecordEntry>(&output.data);
101
+
let record = match record {
102
+
Ok(CarRecordEntry::Known(record)) => record,
103
+
Ok(CarRecordEntry::Other { ty }) => {
104
+
tracing::debug!("repo contains unknown record type: {ty} ({cid})");
105
+
continue;
106
+
}
107
+
Err(err) => {
108
+
tracing::warn!("failed to parse block {cid}: {err}");
109
+
continue;
110
+
}
111
};
112
113
+
record_index(t, rc, &mut copies, &mut deltas, did, &path, cid, record).await?;
114
}
0
0
115
}
0
0
116
117
+
commit
0
0
0
0
118
}
0
0
0
0
119
};
120
121
+
Ok(CompletedRepoData {
122
+
commit,
123
+
deltas,
124
+
copies,
125
+
ratelimit_rem,
126
+
ratelimit_reset,
127
+
})
128
}
129
130
async fn record_index(
-54
crates/consumer/src/backfill/types.rs
···
1
-
use crate::indexer::types::RecordTypes;
2
-
use ipld_core::cid::Cid;
3
-
use serde::Deserialize;
4
-
use serde_bytes::ByteBuf;
5
-
6
-
#[derive(Debug, Deserialize)]
7
-
#[serde(untagged)]
8
-
pub enum CarEntry {
9
-
Mst(CarMstEntry),
10
-
Commit(CarCommitEntry),
11
-
Record(CarRecordEntry),
12
-
}
13
-
14
-
#[derive(Debug, Deserialize)]
15
-
pub struct CarMstEntry {
16
-
pub l: Option<Cid>,
17
-
pub e: Vec<CarMstEntryNode>,
18
-
}
19
-
20
-
#[derive(Debug, Deserialize)]
21
-
pub struct CarMstEntryNode {
22
-
pub p: i32,
23
-
pub k: ByteBuf,
24
-
pub v: Cid,
25
-
pub t: Option<Cid>,
26
-
}
27
-
28
-
#[derive(Debug, Deserialize)]
29
-
pub struct CarCommitEntry {
30
-
pub did: String,
31
-
pub version: i32,
32
-
pub data: Cid,
33
-
pub rev: String,
34
-
pub prev: Option<Cid>,
35
-
pub sig: ByteBuf,
36
-
}
37
-
38
-
#[derive(Debug, Deserialize)]
39
-
#[serde(untagged)]
40
-
pub enum CarRecordEntry {
41
-
Known(RecordTypes),
42
-
Other {
43
-
#[serde(rename = "$type")]
44
-
ty: String,
45
-
},
46
-
}
47
-
48
-
#[derive(Debug, Deserialize)]
49
-
pub struct GetRepoStatusRes {
50
-
pub did: String,
51
-
pub active: bool,
52
-
pub status: Option<crate::firehose::AtpAccountStatus>,
53
-
pub rev: Option<String>,
54
-
}
···
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
+138
crates/consumer/src/backfill/utils.rs
···
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
···
1
+
use chrono::prelude::*;
2
+
use deadpool_postgres::Object;
3
+
use jacquard_common::types::{did::Did, string::Handle};
4
+
use jacquard_identity::{resolver::IdentityResolver, JacquardResolver};
5
+
use parakeet_db::types::ActorStatus;
6
+
use redis::{aio::MultiplexedConnection, AsyncTypedCommands};
7
+
use reqwest::{header::HeaderMap, Client, StatusCode};
8
+
use serde::Deserialize;
9
+
use std::time::Duration;
10
+
use tracing::instrument;
11
+
12
+
pub const BF_RESET_KEY: &str = "bf_ratelimit_reset";
13
+
pub const BF_REM_KEY: &str = "bf_ratelimit_rem";
14
+
15
+
#[derive(Debug, Deserialize)]
16
+
pub struct GetRepoStatusRes {
17
+
pub did: String,
18
+
pub active: bool,
19
+
pub status: Option<crate::firehose::AtpAccountStatus>,
20
+
pub rev: Option<String>,
21
+
}
22
+
23
+
/// Resolves `did`'s DID document and returns its PDS endpoint together
/// with the first handle listed in the document (if any).
///
/// Returns `Ok(None)` when the document resolves but declares no PDS
/// endpoint; resolver failures are propagated as `Err`.
pub async fn resolve_service(
    resolver: &JacquardResolver,
    did: &Did<'_>,
) -> jacquard_identity::resolver::Result<Option<(String, Option<Handle<'static>>)>> {
    let doc = resolver.resolve_did_doc_owned(did).await?;

    match doc.pds_endpoint() {
        None => Ok(None),
        Some(endpoint) => {
            let handle = doc.handles().first().cloned();
            Ok(Some((endpoint.to_string(), handle)))
        }
    }
}
38
+
39
+
pub async fn resolve_and_set_handle(
40
+
conn: &mut Object,
41
+
resolver: &JacquardResolver,
42
+
did: &Did<'_>,
43
+
handle: Handle<'_>,
44
+
) -> eyre::Result<()> {
45
+
let handle_did = resolver.resolve_handle(&handle).await?;
46
+
if handle_did == Did::raw(did) {
47
+
conn.execute(
48
+
"UPDATE actors SET handle=$2 WHERE did=$1",
49
+
&[&did.as_str(), &handle.as_str()],
50
+
)
51
+
.await?;
52
+
} else {
53
+
tracing::warn!("requested DID ({did}) doesn't match handle");
54
+
}
55
+
56
+
Ok(())
57
+
}
58
+
59
+
// there's no ratelimit handling here because we pretty much always call download_car after.
60
+
#[instrument(skip(client, conn, pds))]
61
+
pub async fn check_and_update_repo_status(
62
+
client: &Client,
63
+
conn: &mut Object,
64
+
pds: &str,
65
+
repo: &str,
66
+
) -> eyre::Result<bool> {
67
+
match check_pds_repo_status(client, pds, repo).await? {
68
+
Some(status) if status.active => return Ok(true),
69
+
Some(status) => {
70
+
tracing::debug!("repo is inactive");
71
+
72
+
let status = status
73
+
.status
74
+
.unwrap_or(crate::firehose::AtpAccountStatus::Deleted);
75
+
conn.execute(
76
+
"UPDATE actors SET sync_state='dirty', status=$2 WHERE did=$1",
77
+
&[&repo, &ActorStatus::from(status)],
78
+
)
79
+
.await?;
80
+
}
81
+
None => {
82
+
tracing::debug!("repo was deleted");
83
+
84
+
conn.execute(
85
+
"UPDATE actors SET sync_state='dirty', status='deleted' WHERE did=$1",
86
+
&[&repo],
87
+
)
88
+
.await?;
89
+
}
90
+
}
91
+
92
+
Ok(false)
93
+
}
94
+
95
+
pub async fn check_pds_repo_status(
96
+
client: &Client,
97
+
pds: &str,
98
+
repo: &str,
99
+
) -> eyre::Result<Option<GetRepoStatusRes>> {
100
+
let res = client
101
+
.get(format!(
102
+
"{pds}xrpc/com.atproto.sync.getRepoStatus?did={repo}"
103
+
))
104
+
.send()
105
+
.await?;
106
+
107
+
if [StatusCode::NOT_FOUND, StatusCode::BAD_REQUEST].contains(&res.status()) {
108
+
return Ok(None);
109
+
}
110
+
111
+
Ok(res.json().await?)
112
+
}
113
+
114
+
pub async fn enforce_ratelimit(rc: &mut MultiplexedConnection, pds: &str) -> eyre::Result<()> {
115
+
let score = rc.zscore(BF_REM_KEY, pds).await?;
116
+
117
+
if let Some(rem) = score {
118
+
if (rem as i32) < 100 {
119
+
// if we've got None for some reason, just hope that the next req will contain the reset header.
120
+
if let Some(at) = rc.zscore(BF_RESET_KEY, pds).await? {
121
+
tracing::debug!("rate limit for {pds} resets at {at}");
122
+
let time = DateTime::from_timestamp(at as i64, 0).unwrap();
123
+
let delta = (time - Utc::now()).num_milliseconds().max(0);
124
+
125
+
tokio::time::sleep(Duration::from_millis(delta as u64)).await;
126
+
};
127
+
}
128
+
}
129
+
130
+
Ok(())
131
+
}
132
+
133
+
pub fn header_to_int(headers: &HeaderMap, name: &str) -> Option<i32> {
134
+
headers
135
+
.get(name)
136
+
.and_then(|v| v.to_str().ok())
137
+
.and_then(|v| v.parse().ok())
138
+
}
-12
crates/consumer/src/config.rs
···
70
pub workers: u8,
71
#[serde(default)]
72
pub skip_aggregation: bool,
73
-
#[serde(default = "default_download_workers")]
74
-
pub download_workers: usize,
75
-
#[serde(default = "default_download_buffer")]
76
-
pub download_buffer: usize,
77
pub download_tmp_dir: String,
78
}
79
···
88
fn default_indexer_workers() -> u8 {
89
4
90
}
91
-
92
-
fn default_download_workers() -> usize {
93
-
25
94
-
}
95
-
96
-
fn default_download_buffer() -> usize {
97
-
25_000
98
-
}
···
70
pub workers: u8,
71
#[serde(default)]
72
pub skip_aggregation: bool,
0
0
0
0
73
pub download_tmp_dir: String,
74
}
75
···
84
fn default_indexer_workers() -> u8 {
85
4
86
}
0
0
0
0
0
0
0
0