this repo has no description

improve docs and add timer to backfill

vielle.dev 6989da66 2761b44a

verified
+67 -42
+30 -20
src/backfill/mod.rs
··· 1 use std::{cmp::Ordering, str::FromStr}; 2 3 use jacquard::{types::tid::Tid, url::Url}; ··· 32 ParseCarError(#[from] crate::backfill::parse_car::Error), 33 } 34 35 - /// backfill works as follows (https://docs.bsky.app/docs/advanced-guides/backfill) 36 - /// 37 - /// 1. resolve did -> pds 38 - /// 2. stream com.atproto.sync.subscribeRepos to a buffer 39 - /// 3. get a car file from com.atproto.sync.getRepo (diff if a rev is stored in database) 40 - /// 4. apply car file diff to database (incl rev) 41 - /// 5. start playing events from buffer 42 - /// 1. drop all events from other users 43 - /// 2. drop all events with a lower rev than current rev 44 - /// 3. apply event & update rev 45 - /// 4. (non blocking) get blobs if missing 46 - /// 5. (non blocking) parse for strongref and store strongrefs 47 - /// 6. (non blocking) trigger garbage collection of blobs and strongref 48 - /// 6. once buffer is empty, parse events live 49 - pub async fn backfill(pds: &str, conn: &Pool<Postgres>) -> Result<(), Error> { 50 let db_rev = if let Some(rev) = query!( 51 "SELECT (rev) FROM meta WHERE did = $1", 52 config::USER.to_string() ··· 64 let pds = Url::from_str(&format!("https://{pds}/")).unwrap(); 65 let car = load_car(config::USER.clone(), pds).await?; 66 67 - match car.partial_cmp(&db_rev) { 68 - Some(val) => match val { 69 // car rev newer than db rev 70 // continue on; every other branch diverges 71 Ordering::Greater => {} ··· 80 // Most likely either the PDS or repo is broken, or the database has been corrupted. 81 // Check your PDS repo is working and/or drop the database." 82 // ), 83 - }, 84 - // cant compare rev so assume all is ok and continue 85 - None => {} 86 }; 87 88 // erase all old records and return if it fails ··· 92 93 let data = parse_car(&car).await?; 94 let mut data = data.chunks(DB_MAX_REQ / 4); 95 96 while let Some(data) = data.next() { 97 let mut query = sqlx::QueryBuilder::new("INSERT INTO records(collection, rkey, record) "); ··· 114 } 115 _ => {} 116 }; 117 } 118 119 match query!(
··· 1 + //! backfill works as follows (https://docs.bsky.app/docs/advanced-guides/backfill) 2 + //! 3 + //! 1. resolve did -> pds 4 + //! 2. get a car file from com.atproto.sync.getRepo 5 + //! 3. extract collection, rkey, and cbor data from each leaf 6 + //! 4. convert cbor data to json 7 + //! 5. store in db (limit to DB_MAX_REQ / 4 to avoid err) 8 + 9 use std::{cmp::Ordering, str::FromStr}; 10 11 use jacquard::{types::tid::Tid, url::Url}; ··· 40 ParseCarError(#[from] crate::backfill::parse_car::Error), 41 } 42 43 + pub async fn backfill( 44 + pds: &str, 45 + conn: &Pool<Postgres>, 46 + time: Option<std::time::Instant>, 47 + ) -> Result<(), Error> { 48 let db_rev = if let Some(rev) = query!( 49 "SELECT (rev) FROM meta WHERE did = $1", 50 config::USER.to_string() ··· 62 let pds = Url::from_str(&format!("https://{pds}/")).unwrap(); 63 let car = load_car(config::USER.clone(), pds).await?; 64 65 + if let Some(time) = time { 66 + println!("Downloaded car file ({:?})", time.elapsed()); 67 + } 68 + let time = time.map(|_| std::time::Instant::now()); 69 + 70 + if let Some(val) = car.partial_cmp(&db_rev) { 71 + match val { 72 // car rev newer than db rev 73 // continue on; every other branch diverges 74 Ordering::Greater => {} ··· 83 // Most likely either the PDS or repo is broken, or the database has been corrupted. 84 // Check your PDS repo is working and/or drop the database." 85 // ), 86 + }; 87 }; 88 89 // erase all old records and return if it fails ··· 93 94 let data = parse_car(&car).await?; 95 let mut data = data.chunks(DB_MAX_REQ / 4); 96 + 97 + if let Some(time) = time { 98 + println!("Parsed car file ({:?})", time.elapsed()); 99 + } 100 + let time = time.map(|_| std::time::Instant::now()); 101 102 while let Some(data) = data.next() { 103 let mut query = sqlx::QueryBuilder::new("INSERT INTO records(collection, rkey, record) "); ··· 120 } 121 _ => {} 122 }; 123 + } 124 + 125 + if let Some(time) = time { 126 + println!("Saved to database ({:?})", time.elapsed()); 127 } 128 129 match query!(
+5
src/config.rs
··· 1 use jacquard::types::string::Did; 2 use std::env; 3 use std::sync::LazyLock;
··· 1 + //! get static and parsed environment variables 2 + //! 3 + //! USER is from env variable USER and parsed into a jacquard Did 4 + //! POSTGRES_URL is from POSTGRES_USER, POSTGRES_PASSWORD, and POSTGRES_HOST 5 + 6 use jacquard::types::string::Did; 7 use std::env; 8 use std::sync::LazyLock;
+2 -16
src/db.rs
··· 1 use crate::config; 2 use sqlx::{Pool, Postgres, postgres::PgPool, query}; 3 ··· 25 { 26 println!("Creating table `records`: \n{err}"); 27 panic!("Could not instantiate db"); 28 - }; 29 - 30 - if let Err(err) = query!( 31 - "CREATE TABLE IF NOT EXISTS foreign_records ( 32 - did TEXT, 33 - collection TEXT, 34 - rkey TEXT, 35 - record JSON NOT NULL, 36 - PRIMARY KEY (did, collection, rkey) 37 - );" 38 - ) 39 - .execute(&conn) 40 - .await 41 - { 42 - println!("Creating table `foreign_records`: \n{err}"); 43 - panic!(); 44 }; 45 46 if let Err(err) = query!(
··· 1 + //! create a connection pool and setup tables before making avaliable 2 + 3 use crate::config; 4 use sqlx::{Pool, Postgres, postgres::PgPool, query}; 5 ··· 27 { 28 println!("Creating table `records`: \n{err}"); 29 panic!("Could not instantiate db"); 30 }; 31 32 if let Err(err) = query!(
+11 -4
src/main.rs
··· 7 mod db; 8 mod utils; 9 10 #[tokio::main] 11 - async fn main() -> Result<(), ()> { 12 env_logger::init(); 13 println!("User: {}", *config::USER); 14 let conn: Pool<Postgres> = db::init().await; ··· 19 Err(err) => panic!("{}", err), 20 }; 21 22 - let backfilled = backfill(&pds, &conn).await; 23 - if let Err(err) = backfilled { 24 println!("{}", err); 25 - return Err(()); 26 }; 27 28 println!("Completed sucessfully!"); 29 Ok(())
··· 7 mod db; 8 mod utils; 9 10 + #[derive(Debug)] 11 + struct Error; 12 + 13 #[tokio::main] 14 + async fn main() -> Result<(), Error> { 15 env_logger::init(); 16 println!("User: {}", *config::USER); 17 let conn: Pool<Postgres> = db::init().await; ··· 22 Err(err) => panic!("{}", err), 23 }; 24 25 + println!("Starting backfill"); 26 + let timer = std::time::Instant::now(); 27 + 28 + if let Err(err) = backfill(&pds, &conn, Some(timer)).await { 29 println!("{}", err); 30 + return Err(Error); 31 }; 32 + 33 + println!("Backfill complete. Took {:?}", timer.elapsed()); 34 35 println!("Completed sucessfully!"); 36 Ok(())
+13 -2
src/utils/ipld_json.rs
··· 1 use base64::{Engine, prelude::BASE64_STANDARD_NO_PAD}; 2 use ipld_core::{cid::multibase::Base, ipld::Ipld}; 3 use log::warn; ··· 48 .map(|(k, v)| Ok::<_, Error>((k.clone(), ipld_to_json_value(v)?))) 49 .collect::<Result<Map<String, Value>, _>>()?, 50 ), 51 - Ipld::Link(cid) => json!({"$link": 52 - cid.to_string_of_base(Base::Base32Lower)? }), 53 }) 54 }
··· 1 + //! convert an ipld_core::ipld::Ipld enum into a serde_json::value::Value in the atproto data model 2 + //! 3 + //! a specific helper is required for this as Bytes and Link have differing representations to how serde_json handles them by default 4 + //! 5 + //! in general. types are naievely converted. the following types have special cases: 6 + //! - `integer`: this could throw an error if the number is `x` in `i64::MIN < x < u64::MAX` 7 + //! - `float`: always issues a warning since this is technically illegal. If its NaN or infinity, this errors as they cant be represented in json 8 + //! - `bytes`: atproto JSON represents them as `{"$bytes": "BASE 64 NO PADDING"}`, but serde_json defaults to `[u8]` 9 + //! - `link`: atproto JSON represents them as `{"$link": "BASE 32 NO PADDING"}`, but serde_json defaults to `[u8]` 10 + 11 use base64::{Engine, prelude::BASE64_STANDARD_NO_PAD}; 12 use ipld_core::{cid::multibase::Base, ipld::Ipld}; 13 use log::warn; ··· 58 .map(|(k, v)| Ok::<_, Error>((k.clone(), ipld_to_json_value(v)?))) 59 .collect::<Result<Map<String, Value>, _>>()?, 60 ), 61 + Ipld::Link(cid) => json!({ 62 + "$link": cid.to_string_of_base(Base::Base32Lower)? 63 + }), 64 }) 65 }
+4
src/utils/mod.rs
··· 1 pub mod ipld_json; 2 pub mod resolver;
··· 1 + //! contains utility functions 2 + //! 3 + //! see sub modules for more details 4 + 5 pub mod ipld_json; 6 pub mod resolver;
+2
src/utils/resolver.rs
··· 1 use jacquard::prelude::IdentityResolver; 2 use jacquard::types::did::Did; 3 use thiserror::Error;
··· 1 + //! resolve a Did to a pds domain 2 + 3 use jacquard::prelude::IdentityResolver; 4 use jacquard::types::did::Did; 5 use thiserror::Error;