Server tools to backfill, tail, mirror, and verify PLC logs

Postgres COPY IN for bulk loading (currently deadlocks at the end)

(or something)

+83 -273
+21 -10
src/bin/allegedly.rs
···
-use allegedly::{Dt, FolderSource, HttpSource, backfill, bin_init, pages_to_weeks, poll_upstream};
+use allegedly::{Db, Dt, ExportPage, FolderSource, HttpSource, backfill, bin_init, pages_to_weeks, poll_upstream, write_bulk as pages_to_pg};
 use clap::{Parser, Subcommand};
 use std::path::PathBuf;
 use url::Url;
···
         #[arg(long)]
         #[clap(default_value = "4")]
         source_workers: usize,
+        /// Bulk load into did-method-plc-compatible postgres instead of stdout
+        ///
+        /// Pass a postgres connection url like "postgresql://localhost:5432"
+        #[arg(long)]
+        to_postgres: Option<Url>,
     },
     /// Scrape a PLC server, collecting ops into weekly bundles
     ///
···
     },
 }
 
+async fn pages_to_stdout(rx: flume::Receiver<ExportPage>) -> Result<(), flume::RecvError> {
+    loop {
+        for op in rx.recv_async().await?.ops {
+            println!("{op}")
+        }
+    }
+}
+
 #[tokio::main]
 async fn main() {
     bin_init("main");
···
             http,
             dir,
             source_workers,
+            to_postgres,
         } => {
             let (tx, rx) = flume::bounded(1024); // big pages
             tokio::task::spawn(async move {
···
                     .unwrap();
                 }
             });
-            loop {
-                for op in rx.recv_async().await.unwrap().ops {
-                    println!("{op}")
-                }
+            if let Some(url) = to_postgres {
+                let db = Db::new(url.as_str());
+                pages_to_pg(db, rx).await.unwrap();
+            } else {
+                pages_to_stdout(rx).await.unwrap();
             }
         }
         Commands::Bundle {
···
             let start_at = after.or_else(|| Some(chrono::Utc::now()));
             let (tx, rx) = flume::bounded(0); // rendezvous, don't read ahead
             tokio::task::spawn(async move { poll_upstream(start_at, url, tx).await.unwrap() });
-            loop {
-                for op in rx.recv_async().await.unwrap().ops {
-                    println!("{op}")
-                }
-            }
+            pages_to_stdout(rx).await.unwrap();
         }
     }
 }
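Note: the new `--to-postgres` sink can also be driven straight from library code. A minimal sketch, assuming a reachable postgres at localhost; `produce_pages` is a hypothetical stand-in for any page producer (`backfill`, `poll_upstream`, ...):

use allegedly::{Db, ExportPage, write_bulk};

// Hypothetical producer: sends ExportPages and then drops the sender.
// Dropping the sender is what ends write_bulk's receive loop.
async fn produce_pages(_tx: flume::Sender<ExportPage>) {}

#[tokio::main]
async fn main() {
    // Bounded channel so a slow COPY IN backpressures the producer.
    let (tx, rx) = flume::bounded(1024);
    tokio::task::spawn(produce_pages(tx));
    let db = Db::new("postgresql://localhost:5432");
    write_bulk(db, rx).await.unwrap();
}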
-214
src/bin/backfill.rs
···
-use clap::Parser;
-use std::time::Duration;
-use url::Url;
-
-use allegedly::{Db, Dt, ExportPage, Op, bin_init, poll_upstream};
-
-const EXPORT_PAGE_QUEUE_SIZE: usize = 0; // rendezvous for now
-const WEEK_IN_SECONDS: u64 = 7 * 86400;
-
-#[derive(Parser)]
-struct Args {
-    /// Upstream PLC server to mirror
-    ///
-    /// default: https://plc.directory
-    #[arg(long, env)]
-    #[clap(default_value = "https://plc.directory")]
-    upstream: Url,
-    /// Bulk export source prefix
-    ///
-    /// Must be a prefix for urls ending with {WEEK_TIMESTAMP}.jsonl.gz
-    ///
-    /// default: https://plc.t3.storage.dev/plc.directory/
-    ///
-    /// pass "off" to skip fast bulk backfilling
-    #[arg(long, env)]
-    #[clap(default_value = "https://plc.t3.storage.dev/plc.directory/")]
-    upstream_bulk: Url,
-    /// The oldest available bulk upstream export timestamp
-    ///
-    /// Must be a week-truncated unix timestamp
-    ///
-    /// plc.directory's oldest week is `1668643200`; you probably don't want to change this.
-    #[arg(long, env)]
-    #[clap(default_value = "1668643200")]
-    bulk_epoch: u64,
-    /// Mirror PLC's postgres database
-    ///
-    /// URI string with credentials etc
-    #[arg(long, env)]
-    postgres: String,
-}
-
-async fn bulk_backfill((_upstream, epoch): (Url, u64), _tx: flume::Sender<ExportPage>) {
-    let immutable_cutoff = std::time::SystemTime::now() - Duration::from_secs((7 + 4) * 86400);
-    let immutable_ts = (immutable_cutoff.duration_since(std::time::SystemTime::UNIX_EPOCH))
-        .unwrap()
-        .as_secs();
-    let _immutable_week = (immutable_ts / WEEK_IN_SECONDS) * WEEK_IN_SECONDS;
-    let _week = epoch;
-    let _week_n = 0;
-    todo!();
-    // while week < immutable_week {
-    //     log::info!("backfilling week {week_n} ({week})");
-    //     let url = upstream.join(&format!("{week}.jsonl.gz")).unwrap();
-    //     week_to_pages(url, tx.clone()).await.unwrap();
-    //     week_n += 1;
-    //     week += WEEK_IN_SECONDS;
-    // }
-}
-
-async fn export_upstream(
-    upstream: Url,
-    bulk: (Url, u64),
-    tx: flume::Sender<ExportPage>,
-    pg_client: tokio_postgres::Client,
-) {
-    let latest = get_latest(&pg_client).await;
-
-    if latest.is_none() {
-        bulk_backfill(bulk, tx.clone()).await;
-    }
-    let mut upstream = upstream;
-    upstream.set_path("/export");
-    poll_upstream(latest, upstream, tx).await.unwrap();
-}
-
-async fn write_pages(
-    rx: flume::Receiver<ExportPage>,
-    mut pg_client: tokio_postgres::Client,
-) -> Result<(), anyhow::Error> {
-    // TODO: one big upsert at the end from select distinct on the other table
-
-    // let upsert_did = &pg_client
-    //     .prepare(
-    //         r#"
-    //         INSERT INTO dids (did) VALUES ($1)
-    //         ON CONFLICT DO NOTHING"#,
-    //     )
-    //     .await
-    //     .unwrap();
-
-    let insert_op = &pg_client
-        .prepare(
-            r#"
-            INSERT INTO operations (did, operation, cid, nullified, "createdAt")
-            VALUES ($1, $2, $3, $4, $5)
-            ON CONFLICT (did, cid) DO UPDATE
-            SET nullified = excluded.nullified,
-                "createdAt" = excluded."createdAt"
-            WHERE operations.nullified = excluded.nullified
-               OR operations."createdAt" = excluded."createdAt""#,
-        ) // idea: op is provable via cid, so leave it out. after did/cid (pk) that leaves nullified and createdAt
-        // that we want to notice changing.
-        // normal insert: no conflict, rows changed = 1
-        // conflict (exact match): where clause passes, rows changed = 1
-        // conflict (mismatch): where clause fails, rows changed = 0 (detect this and warn!)
-        .await
-        .unwrap();
-
-    while let Ok(page) = rx.recv_async().await {
-        log::trace!("got a page...");
-
-        let tx = pg_client.transaction().await.unwrap();
-
-        // TODO: probably figure out postgres COPY IN
-        // for now just write everything into a transaction
-
-        log::trace!("setting up inserts...");
-        for op_line in page
-            .ops
-            .into_iter()
-            .flat_map(|s| {
-                s.replace("}{", "}\n{")
-                    .split('\n')
-                    .map(|s| s.trim())
-                    .map(Into::into)
-                    .collect::<Vec<String>>()
-            })
-            .filter(|s| !s.is_empty())
-        {
-            let Ok(op) = serde_json::from_str::<Op>(&op_line)
-                .inspect_err(|e| log::error!("failing! at the {op_line}! {e}"))
-            else {
-                log::error!("ayeeeee just ignoring this error for now......");
-                continue;
-            };
-            // let client = &tx;
-
-            // client.execute(upsert_did, &[&op.did]).await.unwrap();
-
-            // let sp = tx.savepoint("op").await.unwrap();
-            let inserted = tx
-                .execute(
-                    insert_op,
-                    &[
-                        &op.did,
-                        &tokio_postgres::types::Json(op.operation),
-                        &op.cid,
-                        &op.nullified,
-                        &op.created_at,
-                    ],
-                )
-                .await
-                .unwrap();
-            if inserted != 1 {
-                log::warn!(
-                    "possible log modification: {inserted} rows changed after upserting {op:?}"
-                );
-            }
-            // {
-            //     if e.code() != Some(&tokio_postgres::error::SqlState::UNIQUE_VIOLATION) {
-            //         anyhow::bail!(e);
-            //     }
-            //     // TODO: assert that the row has not changed
-            //     log::warn!("ignoring dup");
-            // }
-        }
-
-        tx.commit().await.unwrap();
-    }
-    Ok(())
-}
-
-async fn get_latest(pg_client: &tokio_postgres::Client) -> Option<Dt> {
-    pg_client
-        .query_opt(
-            r#"SELECT "createdAt" FROM operations
-               ORDER BY "createdAt" DESC LIMIT 1"#,
-            &[],
-        )
-        .await
-        .unwrap()
-        .map(|r| r.get(0))
-}
-
-#[tokio::main]
-async fn main() -> anyhow::Result<()> {
-    bin_init("main");
-    let args = Args::parse();
-    let db = Db::new(&args.postgres);
-    let (tx, rx) = flume::bounded(EXPORT_PAGE_QUEUE_SIZE);
-
-    log::trace!("connecting postgres for export task...");
-    let pg_client = db.connect().await?;
-    let export_task = tokio::task::spawn(export_upstream(
-        args.upstream,
-        (args.upstream_bulk, args.bulk_epoch),
-        tx,
-        pg_client,
-    ));
-
-    log::trace!("connecting postgres for writer task...");
-    let pg_client = db.connect().await?;
-    let writer_task = tokio::task::spawn(write_pages(rx, pg_client));
-
-    tokio::select! {
-        z = export_task => log::warn!("export task ended: {z:?}"),
-        z = writer_task => log::warn!("writer task ended: {z:?}"),
-    };
-
-    log::error!("todo: shutdown");
-
-    Ok(())
-}
-47
src/bin/get_backfill_chunk_adsf.rs
···
-use allegedly::{HttpSource, Week, week_to_pages};
-use std::io::Write;
-
-#[tokio::main]
-async fn main() {
-    let url: url::Url = "https://plc.t3.storage.dev/plc.directory/".parse().unwrap();
-    let source = HttpSource(url);
-    // let source = FolderSource("./weekly/".into());
-    let week = Week::from_n(1699488000);
-
-    let (tx, rx) = flume::bounded(32);
-
-    tokio::task::spawn(async move {
-        week_to_pages(source, week, tx).await.unwrap();
-    });
-
-    let mut n = 0;
-
-    print!("receiving");
-    while let Ok(page) = rx.recv_async().await {
-        print!(".");
-        std::io::stdout().flush().unwrap();
-        n += page.ops.len();
-    }
-    println!();
-
-    println!("bye ({n})");
-
-    // let reader = CLIENT
-    //     .get("https://plc.t3.storage.dev/plc.directory/1699488000.jsonl.gz")
-    //     // .get("https://plc.t3.storage.dev/plc.directory/1669248000.jsonl.gz")
-    //     .send()
-    //     .await
-    //     .unwrap()
-    //     .error_for_status()
-    //     .unwrap()
-    //     .bytes_stream()
-    //     .map_err(io::Error::other)
-    //     .into_async_read();
-
-    // let decoder = GzipDecoder::new(io::BufReader::new(reader));
-    // let mut chunks = io::BufReader::new(decoder).lines().chunks(1000);
-    // while let Some(ref _chunk) = chunks.next().await {
-    //     print!(".");
-    // }
-    // println!();
-}
+1 -1
src/lib.rs
···
 
 pub use backfill::backfill;
 pub use client::CLIENT;
-pub use plc_pg::Db;
+pub use plc_pg::{Db, write_bulk};
 pub use poll::{get_page, poll_upstream};
 pub use weekly::{BundleSource, FolderSource, HttpSource, Week, pages_to_weeks, week_to_pages};
 
+61 -1
src/plc_pg.rs
···
-use tokio_postgres::{Client, Error as PgError, NoTls, connect};
+use crate::{ExportPage, Op};
+use tokio_postgres::{Client, types::{Type, Json}, Error as PgError, NoTls, connect, binary_copy::BinaryCopyInWriter};
+use std::pin::pin;
+
 
 /// a little tokio-postgres helper
+///
+/// It's Clone for convenience: clones don't share any underlying resources,
+/// so cloning doesn't pool or reuse connections.
 #[derive(Debug, Clone)]
 pub struct Db {
     pg_uri: String,
···
         Ok(client)
     }
 }
+
+pub async fn write_bulk(
+    db: Db,
+    pages: flume::Receiver<ExportPage>,
+) -> Result<(), PgError> {
+    let mut client = db.connect().await?;
+    let tx = client.transaction().await?;
+
+    // staging table for the bulk load, columns ordered to match the COPY below
+    tx.execute(
+        r#"
+        CREATE TABLE backfill (
+            did text not null,
+            cid text not null,
+            operation jsonb not null,
+            nullified boolean not null,
+            createdAt timestamptz not null
+        )"#,
+        &[],
+    )
+    .await?;
+
+    let types = &[
+        Type::TEXT,
+        Type::TEXT,
+        Type::JSONB,
+        Type::BOOL,
+        Type::TIMESTAMPTZ,
+    ];
+
+    let sink = tx.copy_in("COPY backfill FROM STDIN BINARY").await?;
+    let mut writer = pin!(BinaryCopyInWriter::new(sink, types));
+
+    while let Ok(page) = pages.recv_async().await {
+        for s in page.ops {
+            let Ok(op) = serde_json::from_str::<Op>(&s) else {
+                log::warn!("ignoring unparseable op: {s:?}");
+                continue;
+            };
+            writer
+                .as_mut()
+                .write(&[
+                    &op.did,
+                    &op.cid,
+                    &Json(op.operation),
+                    &op.nullified,
+                    &op.created_at,
+                ])
+                .await?;
+        }
+    }
+
+    let n = writer.as_mut().finish().await?;
+    log::info!("copied in {n} rows");
+
+    tx.commit().await?;
+
+    Ok(())
+}
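Note: the deleted backfill.rs left a TODO for "one big upsert at the end from select distinct on the other table". A sketch of what that follow-up merge out of the `backfill` staging table could look like, assuming the did-method-plc `operations` schema from the old per-row upsert (this version takes the newest row per (did, cid) and skips the nullified/"createdAt" mismatch warning the old upsert had):

// Hypothetical follow-up to write_bulk: fold staging rows into `operations`,
// keeping the newest row per (did, cid), then drop the staging table.
async fn merge_backfill(client: &tokio_postgres::Client) -> Result<u64, tokio_postgres::Error> {
    let n = client
        .execute(
            r#"
            INSERT INTO operations (did, operation, cid, nullified, "createdAt")
            SELECT did, operation, cid, nullified, createdAt FROM (
                SELECT DISTINCT ON (did, cid) * FROM backfill
                ORDER BY did, cid, createdAt DESC
            ) latest
            ON CONFLICT (did, cid) DO NOTHING"#,
            &[],
        )
        .await?;
    client.execute("DROP TABLE backfill", &[]).await?;
    Ok(n)
}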