Server tools to backfill, tail, mirror, and verify PLC logs

Catch up after bulk backfill

+171 -32
+11 -10
src/backfill.rs
··· 1 1 use crate::{BundleSource, Dt, ExportPage, Week, week_to_pages}; 2 2 use std::sync::Arc; 3 + use std::time::Instant; 3 4 use tokio::{sync::Mutex, task::JoinSet}; 4 5 5 6 const FIRST_WEEK: Week = Week::from_n(1668643200); ··· 20 21 21 22 let mut workers: JoinSet<anyhow::Result<()>> = JoinSet::new(); 22 23 24 + let t_step = Instant::now(); 25 + log::info!( 26 + "fetching backfill for {} weeks with {source_workers} workers...", 27 + weeks.lock().await.len() 28 + ); 29 + 23 30 // spin up the fetchers to work in parallel 24 31 for w in 0..source_workers { 25 32 let weeks = weeks.clone(); 26 33 let dest = dest.clone(); 27 34 let source = source.clone(); 28 35 workers.spawn(async move { 29 - log::trace!("about to get weeks..."); 30 - 31 36 while let Some(week) = weeks.lock().await.pop() { 32 - log::trace!( 33 - "worker {w}: fetching week {} (-{})", 34 - Into::<Dt>::into(week).to_rfc3339(), 35 - week.n_ago(), 36 - ); 37 + let when = Into::<Dt>::into(week).to_rfc3339(); 38 + log::trace!("worker {w}: fetching week {when} (-{})", week.n_ago()); 37 39 week_to_pages(source.clone(), week, dest.clone()).await?; 38 - log::trace!("week {}", Into::<Dt>::into(week).to_rfc3339()); 39 40 } 40 41 log::info!("done with the weeks ig"); 41 42 Ok(()) ··· 44 45 45 46 // TODO: handle missing/failed weeks 46 47 47 - // wait for them to finish 48 + // wait for the big backfill to finish 48 49 while let Some(res) = workers.join_next().await { 49 50 res??; 50 51 } 51 - 52 + log::info!("finished fetching backfill in {:?}", t_step.elapsed()); 52 53 Ok(()) 53 54 }
+83 -11
src/bin/allegedly.rs
··· 1 1 use allegedly::{ 2 - Db, Dt, ExportPage, FolderSource, HttpSource, backfill, bin_init, pages_to_weeks, 3 - poll_upstream, write_bulk as pages_to_pg, 2 + Db, Dt, ExportPage, FolderSource, HttpSource, PageBoundaryState, backfill, backfill_to_pg, 3 + bin_init, pages_to_pg, pages_to_weeks, poll_upstream, 4 4 }; 5 5 use clap::{Parser, Subcommand}; 6 6 use std::path::PathBuf; 7 + use tokio::sync::oneshot; 7 8 use url::Url; 8 9 9 10 #[derive(Debug, Parser)] ··· 45 46 /// Stop at the week ending before this date 46 47 #[arg(long)] 47 48 until: Option<Dt>, 49 + /// After the weekly imports, poll upstream until we're caught up 50 + #[arg(long, action)] 51 + catch_up: bool, 48 52 }, 49 53 /// Scrape a PLC server, collecting ops into weekly bundles 50 54 /// ··· 75 79 }, 76 80 } 77 81 78 - async fn pages_to_stdout(rx: flume::Receiver<ExportPage>) -> Result<(), flume::RecvError> { 82 + async fn pages_to_stdout( 83 + rx: flume::Receiver<ExportPage>, 84 + notify_last_at: Option<oneshot::Sender<Option<Dt>>>, 85 + ) -> Result<(), flume::RecvError> { 86 + let mut last_at = None; 79 87 while let Ok(page) = rx.recv_async().await { 80 - for op in page.ops { 81 - println!("{op}") 88 + for op in &page.ops { 89 + println!("{op}"); 90 + } 91 + if notify_last_at.is_some() 92 + && let Some(s) = PageBoundaryState::new(&page) 93 + { 94 + last_at = last_at.filter(|&l| l >= s.last_at).or(Some(s.last_at)); 82 95 } 83 96 } 97 + if let Some(notify) = notify_last_at { 98 + log::trace!("notifying last_at: {last_at:?}"); 99 + if notify.send(last_at).is_err() { 100 + log::error!("receiver for last_at dropped, can't notify"); 101 + }; 102 + } 84 103 Ok(()) 85 104 } 86 105 106 + /// page forwarder who drops its channels on receipt of a small page 107 + /// 108 + /// PLC will return up to 1000 ops on a page, and returns full pages until it 109 + /// has caught up, so this is a (hacky?) way to stop polling once we're up. 
110 + fn full_pages(rx: flume::Receiver<ExportPage>) -> flume::Receiver<ExportPage> { 111 + let (tx, fwd) = flume::bounded(0); 112 + tokio::task::spawn(async move { 113 + while let Ok(page) = rx.recv_async().await 114 + && page.ops.len() > 900 115 + { 116 + tx.send_async(page).await.unwrap(); 117 + } 118 + }); 119 + fwd 120 + } 121 + 87 122 #[tokio::main] 88 123 async fn main() { 89 124 bin_init("main"); ··· 98 133 to_postgres, 99 134 postgres_reset, 100 135 until, 136 + catch_up, 101 137 } => { 102 - let (tx, rx) = flume::bounded(32); // big pages 138 + let (tx, rx) = flume::bounded(32); // these are big pages 103 139 tokio::task::spawn(async move { 104 140 if let Some(dir) = dir { 105 141 log::info!("Reading weekly bundles from local folder {dir:?}"); ··· 113 149 .unwrap(); 114 150 } 115 151 }); 116 - if let Some(url) = to_postgres { 117 - let db = Db::new(url.as_str()).await.unwrap(); 118 - pages_to_pg(db, rx, postgres_reset).await.unwrap(); 152 + 153 + // postgres writer will notify us as soon as the very last op's time is known 154 + // so we can start catching up while pg is restoring indexes and stuff 155 + let (notify_last_at, rx_last) = if catch_up { 156 + let (tx, rx) = oneshot::channel(); 157 + (Some(tx), Some(rx)) 119 158 } else { 120 - pages_to_stdout(rx).await.unwrap(); 159 + (None, None) 160 + }; 161 + 162 + let to_postgres_url_bulk = to_postgres.clone(); 163 + let bulk_out_write = tokio::task::spawn(async move { 164 + if let Some(ref url) = to_postgres_url_bulk { 165 + let db = Db::new(url.as_str()).await.unwrap(); 166 + backfill_to_pg(db, postgres_reset, rx, notify_last_at) 167 + .await 168 + .unwrap(); 169 + } else { 170 + pages_to_stdout(rx, notify_last_at).await.unwrap(); 171 + } 172 + }); 173 + 174 + if let Some(rx_last) = rx_last { 175 + let mut upstream = args.upstream; 176 + upstream.set_path("/export"); 177 + // wait until the time for `after` is known 178 + let last_at = rx_last.await.unwrap(); 179 + log::info!("beginning catch-up from 
{last_at:?} while the writer finalizes stuff"); 180 + let (tx, rx) = flume::bounded(256); 181 + tokio::task::spawn( 182 + async move { poll_upstream(last_at, upstream, tx).await.unwrap() }, 183 + ); 184 + bulk_out_write.await.unwrap(); 185 + log::info!("writing catch-up pages"); 186 + let full_pages = full_pages(rx); 187 + if let Some(url) = to_postgres { 188 + let db = Db::new(url.as_str()).await.unwrap(); 189 + pages_to_pg(db, full_pages).await.unwrap(); 190 + } else { 191 + pages_to_stdout(full_pages, None).await.unwrap(); 192 + } 121 193 } 122 194 } 123 195 Commands::Bundle { ··· 139 211 let start_at = after.or_else(|| Some(chrono::Utc::now())); 140 212 let (tx, rx) = flume::bounded(1); 141 213 tokio::task::spawn(async move { poll_upstream(start_at, url, tx).await.unwrap() }); 142 - pages_to_stdout(rx).await.unwrap(); 214 + pages_to_stdout(rx, None).await.unwrap(); 143 215 } 144 216 } 145 217 }
+2 -2
src/lib.rs
··· 8 8 9 9 pub use backfill::backfill; 10 10 pub use client::CLIENT; 11 - pub use plc_pg::{Db, write_bulk}; 12 - pub use poll::{get_page, poll_upstream}; 11 + pub use plc_pg::{Db, backfill_to_pg, pages_to_pg}; 12 + pub use poll::{PageBoundaryState, get_page, poll_upstream}; 13 13 pub use weekly::{BundleSource, FolderSource, HttpSource, Week, pages_to_weeks, week_to_pages}; 14 14 15 15 pub type Dt = chrono::DateTime<chrono::Utc>;
+72 -6
src/plc_pg.rs
··· 1 - use crate::{ExportPage, Op}; 1 + use crate::{Dt, ExportPage, Op, PageBoundaryState}; 2 2 use std::pin::pin; 3 3 use std::time::Instant; 4 + use tokio::sync::oneshot; 4 5 use tokio_postgres::{ 5 6 Client, Error as PgError, NoTls, 6 7 binary_copy::BinaryCopyInWriter, ··· 71 72 } 72 73 } 73 74 75 + pub async fn pages_to_pg(db: Db, pages: flume::Receiver<ExportPage>) -> Result<(), PgError> { 76 + let mut client = db.connect().await?; 77 + 78 + let ops_stmt = client 79 + .prepare( 80 + r#"INSERT INTO operations (did, operation, cid, nullified, "createdAt") 81 + VALUES ($1, $2, $3, $4, $5)"#, 82 + ) 83 + .await?; 84 + let did_stmt = client 85 + .prepare(r#"INSERT INTO dids (did) VALUES ($1) ON CONFLICT do nothing"#) 86 + .await?; 87 + 88 + let t0 = Instant::now(); 89 + let mut ops_inserted = 0; 90 + let mut dids_inserted = 0; 91 + 92 + while let Ok(page) = pages.recv_async().await { 93 + log::trace!("writing page with {} ops", page.ops.len()); 94 + let tx = client.transaction().await?; 95 + for s in page.ops { 96 + let Ok(op) = serde_json::from_str::<Op>(&s) else { 97 + log::warn!("ignoring unparseable op {s:?}"); 98 + continue; 99 + }; 100 + ops_inserted += tx 101 + .execute( 102 + &ops_stmt, 103 + &[ 104 + &op.did, 105 + &Json(op.operation), 106 + &op.cid, 107 + &op.nullified, 108 + &op.created_at, 109 + ], 110 + ) 111 + .await?; 112 + dids_inserted += tx.execute(&did_stmt, &[&op.did]).await?; 113 + } 114 + tx.commit().await?; 115 + } 116 + 117 + log::info!( 118 + "no more pages. 
inserted {ops_inserted} ops and {dids_inserted} dids in {:?}", 119 + t0.elapsed() 120 + ); 121 + Ok(()) 122 + } 123 + 74 124 /// Dump rows into an empty operations table quickly 75 125 /// 76 126 /// you must run this after initializing the db with kysely migrations from the ··· 84 134 /// panics: if the operations or dids tables are not empty, unless reset is true 85 135 /// 86 136 /// recommended postgres setting: `max_wal_size=4GB` (or more) 87 - pub async fn write_bulk( 137 + pub async fn backfill_to_pg( 88 138 db: Db, 139 + reset: bool, 89 140 pages: flume::Receiver<ExportPage>, 90 - reset: bool, 141 + notify_last_at: Option<oneshot::Sender<Option<Dt>>>, 91 142 ) -> Result<(), PgError> { 92 143 let mut client = db.connect().await?; 93 144 94 145 let t0 = Instant::now(); 95 146 let tx = client.transaction().await?; 96 - tx.execute("SET LOCAL synchronous_commit = off", &[]).await?; 147 + tx.execute("SET LOCAL synchronous_commit = off", &[]) 148 + .await?; 97 149 98 150 let t_step = Instant::now(); 99 151 for table in ["operations", "dids"] { ··· 142 194 ) 143 195 .await?; 144 196 let mut writer = pin!(BinaryCopyInWriter::new(sync, types)); 197 + let mut last_at = None; 145 198 while let Ok(page) = pages.recv_async().await { 146 - for s in page.ops { 147 - let Ok(op) = serde_json::from_str::<Op>(&s) else { 199 + for s in &page.ops { 200 + let Ok(op) = serde_json::from_str::<Op>(s) else { 148 201 log::warn!("ignoring unparseable op: {s:?}"); 149 202 continue; 150 203 }; ··· 159 212 ]) 160 213 .await?; 161 214 } 215 + if notify_last_at.is_some() 216 + && let Some(s) = PageBoundaryState::new(&page) 217 + { 218 + last_at = last_at.filter(|&l| l >= s.last_at).or(Some(s.last_at)); 219 + } 162 220 } 221 + 222 + if let Some(notify) = notify_last_at { 223 + log::trace!("notifying last_at: {last_at:?}"); 224 + if notify.send(last_at).is_err() { 225 + log::error!("receiver for last_at dropped, can't notify"); 226 + }; 227 + } 228 + 163 229 let n = 
writer.as_mut().finish().await?; 164 230 log::trace!("COPY IN wrote {n} ops: {:?}", t_step.elapsed()); 165 231
+3 -3
src/poll.rs
··· 45 45 46 46 /// PLC 47 47 #[derive(Debug, PartialEq)] 48 - struct PageBoundaryState { 49 - last_at: Dt, 48 + pub struct PageBoundaryState { 49 + pub last_at: Dt, 50 50 keys_at: Vec<OpKey>, // expected to ~always be length one 51 51 } 52 52 ··· 66 66 // should unrefactor to make Op own its data again, parse (and deal with errors) 67 67 // upfront, and probably greatly simplify everything downstream. simple. 68 68 impl PageBoundaryState { 69 - fn new(page: &ExportPage) -> Option<Self> { 69 + pub fn new(page: &ExportPage) -> Option<Self> { 70 70 let mut skips = 0; 71 71 72 72 // grab the very last op