this repo has no description

improve docs and add timer to backfill

vielle.dev 6989da66 2761b44a

verified
+67 -42
+30 -20
src/backfill/mod.rs
··· 1 + //! backfill works as follows (https://docs.bsky.app/docs/advanced-guides/backfill) 2 + //! 3 + //! 1. resolve did -> pds 4 + //! 2. get a car file from com.atproto.sync.getRepo 5 + //! 3. extract collection, rkey, and cbor data from each leaf 6 + //! 4. convert cbor data to json 7 + //! 5. store in db (limit to DB_MAX_REQ / 4 to avoid err) 8 + 1 9 use std::{cmp::Ordering, str::FromStr}; 2 10 3 11 use jacquard::{types::tid::Tid, url::Url}; ··· 32 40 ParseCarError(#[from] crate::backfill::parse_car::Error), 33 41 } 34 42 35 - /// backfill works as follows (https://docs.bsky.app/docs/advanced-guides/backfill) 36 - /// 37 - /// 1. resolve did -> pds 38 - /// 2. stream com.atproto.sync.subscribeRepos to a buffer 39 - /// 3. get a car file from com.atproto.sync.getRepo (diff if a rev is stored in database) 40 - /// 4. apply car file diff to database (incl rev) 41 - /// 5. start playing events from buffer 42 - /// 1. drop all events from other users 43 - /// 2. drop all events with a lower rev than current rev 44 - /// 3. apply event & update rev 45 - /// 4. (non blocking) get blobs if missing 46 - /// 5. (non blocking) parse for strongref and store strongrefs 47 - /// 6. (non blocking) trigger garbage collection of blobs and strongref 48 - /// 6. once buffer is empty, parse events live 49 - pub async fn backfill(pds: &str, conn: &Pool<Postgres>) -> Result<(), Error> { 43 + pub async fn backfill( 44 + pds: &str, 45 + conn: &Pool<Postgres>, 46 + time: Option<std::time::Instant>, 47 + ) -> Result<(), Error> { 50 48 let db_rev = if let Some(rev) = query!( 51 49 "SELECT (rev) FROM meta WHERE did = $1", 52 50 config::USER.to_string() ··· 64 62 let pds = Url::from_str(&format!("https://{pds}/")).unwrap(); 65 63 let car = load_car(config::USER.clone(), pds).await?; 66 64 67 - match car.partial_cmp(&db_rev) { 68 - Some(val) => match val { 65 + if let Some(time) = time { 66 + println!("Downloaded car file ({:?})", time.elapsed()); 67 + } 68 + let time = time.map(|_| std::time::Instant::now()); 69 + 70 + if let Some(val) = car.partial_cmp(&db_rev) { 71 + match val { 69 72 // car rev newer than db rev 70 73 // continue on; every other branch diverges 71 74 Ordering::Greater => {} ··· 80 83 // Most likely either the PDS or repo is broken, or the database has been corrupted. 81 84 // Check your PDS repo is working and/or drop the database." 82 85 // ), 83 - }, 84 - // cant compare rev so assume all is ok and continue 85 - None => {} 86 + }; 86 87 }; 87 88 88 89 // erase all old records and return if it fails ··· 92 93 93 94 let data = parse_car(&car).await?; 94 95 let mut data = data.chunks(DB_MAX_REQ / 4); 96 + 97 + if let Some(time) = time { 98 + println!("Parsed car file ({:?})", time.elapsed()); 99 + } 100 + let time = time.map(|_| std::time::Instant::now()); 95 101 96 102 while let Some(data) = data.next() { 97 103 let mut query = sqlx::QueryBuilder::new("INSERT INTO records(collection, rkey, record) "); ··· 114 120 } 115 121 _ => {} 116 122 }; 123 + } 124 + 125 + if let Some(time) = time { 126 + println!("Saved to database ({:?})", time.elapsed()); 117 127 } 118 128 119 129 match query!(
+5
src/config.rs
··· 1 + //! get static and parsed environment variables 2 + //! 3 + //! USER is from env variable USER and parsed into a jacquard Did 4 + //! POSTGRES_URL is from POSTGRES_USER, POSTGRES_PASSWORD, and POSTGRES_HOST 5 + 1 6 use jacquard::types::string::Did; 2 7 use std::env; 3 8 use std::sync::LazyLock;
+2 -16
src/db.rs
··· 1 + //! create a connection pool and setup tables before making avaliable 2 + 1 3 use crate::config; 2 4 use sqlx::{Pool, Postgres, postgres::PgPool, query}; 3 5 ··· 25 27 { 26 28 println!("Creating table `records`: \n{err}"); 27 29 panic!("Could not instantiate db"); 28 - }; 29 - 30 - if let Err(err) = query!( 31 - "CREATE TABLE IF NOT EXISTS foreign_records ( 32 - did TEXT, 33 - collection TEXT, 34 - rkey TEXT, 35 - record JSON NOT NULL, 36 - PRIMARY KEY (did, collection, rkey) 37 - );" 38 - ) 39 - .execute(&conn) 40 - .await 41 - { 42 - println!("Creating table `foreign_records`: \n{err}"); 43 - panic!(); 44 30 }; 45 31 46 32 if let Err(err) = query!(
+11 -4
src/main.rs
··· 7 7 mod db; 8 8 mod utils; 9 9 10 + #[derive(Debug)] 11 + struct Error; 12 + 10 13 #[tokio::main] 11 - async fn main() -> Result<(), ()> { 14 + async fn main() -> Result<(), Error> { 12 15 env_logger::init(); 13 16 println!("User: {}", *config::USER); 14 17 let conn: Pool<Postgres> = db::init().await; ··· 19 22 Err(err) => panic!("{}", err), 20 23 }; 21 24 22 - let backfilled = backfill(&pds, &conn).await; 23 - if let Err(err) = backfilled { 25 + println!("Starting backfill"); 26 + let timer = std::time::Instant::now(); 27 + 28 + if let Err(err) = backfill(&pds, &conn, Some(timer)).await { 24 29 println!("{}", err); 25 - return Err(()); 30 + return Err(Error); 26 31 }; 32 + 33 + println!("Backfill complete. Took {:?}", timer.elapsed()); 27 34 28 35 println!("Completed sucessfully!"); 29 36 Ok(())
+13 -2
src/utils/ipld_json.rs
··· 1 + //! convert an ipld_core::ipld::Ipld enum into a serde_json::value::Value in the atproto data model 2 + //! 3 + //! a specific helper is required for this as Bytes and Link have differing representations to how serde_json handles them by default 4 + //! 5 + //! in general. types are naievely converted. the following types have special cases: 6 + //! - `integer`: this could throw an error if the number is `x` in `i64::MIN < x < u64::MAX` 7 + //! - `float`: always issues a warning since this is technically illegal. If its NaN or infinity, this errors as they cant be represented in json 8 + //! - `bytes`: atproto JSON represents them as `{"$bytes": "BASE 64 NO PADDING"}`, but serde_json defaults to `[u8]` 9 + //! - `link`: atproto JSON represents them as `{"$link": "BASE 32 NO PADDING"}`, but serde_json defaults to `[u8]` 10 + 1 11 use base64::{Engine, prelude::BASE64_STANDARD_NO_PAD}; 2 12 use ipld_core::{cid::multibase::Base, ipld::Ipld}; 3 13 use log::warn; ··· 48 58 .map(|(k, v)| Ok::<_, Error>((k.clone(), ipld_to_json_value(v)?))) 49 59 .collect::<Result<Map<String, Value>, _>>()?, 50 60 ), 51 - Ipld::Link(cid) => json!({"$link": 52 - cid.to_string_of_base(Base::Base32Lower)? }), 61 + Ipld::Link(cid) => json!({ 62 + "$link": cid.to_string_of_base(Base::Base32Lower)? 63 + }), 53 64 }) 54 65 }
+4
src/utils/mod.rs
··· 1 + //! contains utility functions 2 + //! 3 + //! see sub modules for more details 4 + 1 5 pub mod ipld_json; 2 6 pub mod resolver;
+2
src/utils/resolver.rs
··· 1 + //! resolve a Did to a pds domain 2 + 1 3 use jacquard::prelude::IdentityResolver; 2 4 use jacquard::types::did::Did; 3 5 use thiserror::Error;