···000000001use std::{cmp::Ordering, str::FromStr};
23use jacquard::{types::tid::Tid, url::Url};
···32 ParseCarError(#[from] crate::backfill::parse_car::Error),
33}
3435-/// backfill works as follows (https://docs.bsky.app/docs/advanced-guides/backfill)
36-///
37-/// 1. resolve did -> pds
38-/// 2. stream com.atproto.sync.subscribeRepos to a buffer
39-/// 3. get a car file from com.atproto.sync.getRepo (diff if a rev is stored in database)
40-/// 4. apply car file diff to database (incl rev)
41-/// 5. start playing events from buffer
42-/// 1. drop all events from other users
43-/// 2. drop all events with a lower rev than current rev
44-/// 3. apply event & update rev
45-/// 4. (non blocking) get blobs if missing
46-/// 5. (non blocking) parse for strongref and store strongrefs
47-/// 6. (non blocking) trigger garbage collection of blobs and strongref
48-/// 6. once buffer is empty, parse events live
49-pub async fn backfill(pds: &str, conn: &Pool<Postgres>) -> Result<(), Error> {
50 let db_rev = if let Some(rev) = query!(
51 "SELECT (rev) FROM meta WHERE did = $1",
52 config::USER.to_string()
···64 let pds = Url::from_str(&format!("https://{pds}/")).unwrap();
65 let car = load_car(config::USER.clone(), pds).await?;
6667- match car.partial_cmp(&db_rev) {
68- Some(val) => match val {
0000069 // car rev newer than db rev
70 // continue on; every other branch diverges
71 Ordering::Greater => {}
···80 // Most likely either the PDS or repo is broken, or the database has been corrupted.
81 // Check your PDS repo is working and/or drop the database."
82 // ),
83- },
84- // cant compare rev so assume all is ok and continue
85- None => {}
86 };
8788 // erase all old records and return if it fails
···9293 let data = parse_car(&car).await?;
94 let mut data = data.chunks(DB_MAX_REQ / 4);
000009596 while let Some(data) = data.next() {
97 let mut query = sqlx::QueryBuilder::new("INSERT INTO records(collection, rkey, record) ");
···114 }
115 _ => {}
116 };
0000117 }
118119 match query!(
···1+//! backfill works as follows (https://docs.bsky.app/docs/advanced-guides/backfill)
2+//!
3+//! 1. resolve did -> pds
4+//! 2. get a car file from com.atproto.sync.getRepo
5+//! 3. extract collection, rkey, and cbor data from each leaf
6+//! 4. convert cbor data to json
7+//! 5. store in db (limit to DB_MAX_REQ / 4 to avoid err)
8+9use std::{cmp::Ordering, str::FromStr};
1011use jacquard::{types::tid::Tid, url::Url};
···40 ParseCarError(#[from] crate::backfill::parse_car::Error),
41}
4243+pub async fn backfill(
44+ pds: &str,
45+ conn: &Pool<Postgres>,
46+ time: Option<std::time::Instant>,
47+) -> Result<(), Error> {
000000000048 let db_rev = if let Some(rev) = query!(
49 "SELECT (rev) FROM meta WHERE did = $1",
50 config::USER.to_string()
···62 let pds = Url::from_str(&format!("https://{pds}/")).unwrap();
63 let car = load_car(config::USER.clone(), pds).await?;
6465+ if let Some(time) = time {
66+ println!("Downloaded car file ({:?})", time.elapsed());
67+ }
68+ let time = time.map(|_| std::time::Instant::now());
69+70+ if let Some(val) = car.partial_cmp(&db_rev) {
71+ match val {
72 // car rev newer than db rev
73 // continue on; every other branch diverges
74 Ordering::Greater => {}
···83 // Most likely either the PDS or repo is broken, or the database has been corrupted.
84 // Check your PDS repo is working and/or drop the database."
85 // ),
86+ };
0087 };
8889 // erase all old records and return if it fails
···9394 let data = parse_car(&car).await?;
95 let mut data = data.chunks(DB_MAX_REQ / 4);
96+97+ if let Some(time) = time {
98+ println!("Parsed car file ({:?})", time.elapsed());
99+ }
100+ let time = time.map(|_| std::time::Instant::now());
101102 while let Some(data) = data.next() {
103 let mut query = sqlx::QueryBuilder::new("INSERT INTO records(collection, rkey, record) ");
···120 }
121 _ => {}
122 };
123+ }
124+125+ if let Some(time) = time {
126+ println!("Saved to database ({:?})", time.elapsed());
127 }
128129 match query!(
···1+//! get static and parsed environment variables
2+//!
3+//! USER is from env variable USER and parsed into a jacquard Did
4+//! POSTGRES_URL is from POSTGRES_USER, POSTGRES_PASSWORD, and POSTGRES_HOST
5+6use jacquard::types::string::Did;
7use std::env;
8use std::sync::LazyLock;
+2-16
src/db.rs
···001use crate::config;
2use sqlx::{Pool, Postgres, postgres::PgPool, query};
3···25 {
26 println!("Creating table `records`: \n{err}");
27 panic!("Could not instantiate db");
28- };
29-30- if let Err(err) = query!(
31- "CREATE TABLE IF NOT EXISTS foreign_records (
32- did TEXT,
33- collection TEXT,
34- rkey TEXT,
35- record JSON NOT NULL,
36- PRIMARY KEY (did, collection, rkey)
37- );"
38- )
39- .execute(&conn)
40- .await
41- {
42- println!("Creating table `foreign_records`: \n{err}");
43- panic!();
44 };
4546 if let Err(err) = query!(
···1+//! create a connection pool and setup tables before making avaliable
2+3use crate::config;
4use sqlx::{Pool, Postgres, postgres::PgPool, query};
5···27 {
28 println!("Creating table `records`: \n{err}");
29 panic!("Could not instantiate db");
000000000000000030 };
3132 if let Err(err) = query!(
···1+//! convert an ipld_core::ipld::Ipld enum into a serde_json::value::Value in the atproto data model
2+//!
3+//! a specific helper is required for this as Bytes and Link have differing representations to how serde_json handles them by default
4+//!
5+//! in general. types are naievely converted. the following types have special cases:
6+//! - `integer`: this could throw an error if the number is `x` in `i64::MIN < x < u64::MAX`
7+//! - `float`: always issues a warning since this is technically illegal. If its NaN or infinity, this errors as they cant be represented in json
8+//! - `bytes`: atproto JSON represents them as `{"$bytes": "BASE 64 NO PADDING"}`, but serde_json defaults to `[u8]`
9+//! - `link`: atproto JSON represents them as `{"$link": "BASE 32 NO PADDING"}`, but serde_json defaults to `[u8]`
10+11use base64::{Engine, prelude::BASE64_STANDARD_NO_PAD};
12use ipld_core::{cid::multibase::Base, ipld::Ipld};
13use log::warn;
···58 .map(|(k, v)| Ok::<_, Error>((k.clone(), ipld_to_json_value(v)?)))
59 .collect::<Result<Map<String, Value>, _>>()?,
60 ),
61+ Ipld::Link(cid) => json!({
62+ "$link": cid.to_string_of_base(Base::Base32Lower)?
63+ }),
64 })
65}