···1use crate::{BundleSource, Dt, ExportPage, Week, week_to_pages};
2use std::sync::Arc;
03use tokio::{sync::Mutex, task::JoinSet};
45const FIRST_WEEK: Week = Week::from_n(1668643200);
···2021 let mut workers: JoinSet<anyhow::Result<()>> = JoinSet::new();
2200000023 // spin up the fetchers to work in parallel
24 for w in 0..source_workers {
25 let weeks = weeks.clone();
26 let dest = dest.clone();
27 let source = source.clone();
28 workers.spawn(async move {
29- log::trace!("about to get weeks...");
30-31 while let Some(week) = weeks.lock().await.pop() {
32- log::trace!(
33- "worker {w}: fetching week {} (-{})",
34- Into::<Dt>::into(week).to_rfc3339(),
35- week.n_ago(),
36- );
37 week_to_pages(source.clone(), week, dest.clone()).await?;
38- log::trace!("week {}", Into::<Dt>::into(week).to_rfc3339());
39 }
40 log::info!("done with the weeks ig");
41 Ok(())
···4445 // TODO: handle missing/failed weeks
4647- // wait for them to finish
48 while let Some(res) = workers.join_next().await {
49 res??;
50 }
51-52 Ok(())
53}
···1use crate::{BundleSource, Dt, ExportPage, Week, week_to_pages};
2use std::sync::Arc;
3+use std::time::Instant;
4use tokio::{sync::Mutex, task::JoinSet};
56const FIRST_WEEK: Week = Week::from_n(1668643200);
···2122 let mut workers: JoinSet<anyhow::Result<()>> = JoinSet::new();
2324+ let t_step = Instant::now();
25+ log::info!(
26+ "fetching backfill for {} weeks with {source_workers} workers...",
27+ weeks.lock().await.len()
28+ );
29+30 // spin up the fetchers to work in parallel
31 for w in 0..source_workers {
32 let weeks = weeks.clone();
33 let dest = dest.clone();
34 let source = source.clone();
35 workers.spawn(async move {
0036 while let Some(week) = weeks.lock().await.pop() {
37+ let when = Into::<Dt>::into(week).to_rfc3339();
38+ log::trace!("worker {w}: fetching week {when} (-{})", week.n_ago());
00039 week_to_pages(source.clone(), week, dest.clone()).await?;
040 }
41 log::info!("done with the weeks ig");
42 Ok(())
···4546 // TODO: handle missing/failed weeks
4748+ // wait for the big backfill to finish
49 while let Some(res) = workers.join_next().await {
50 res??;
51 }
52+ log::info!("finished fetching backfill in {:?}", t_step.elapsed());
53 Ok(())
54}
+83-11
src/bin/allegedly.rs
···1use allegedly::{
2- Db, Dt, ExportPage, FolderSource, HttpSource, backfill, bin_init, pages_to_weeks,
3- poll_upstream, write_bulk as pages_to_pg,
4};
5use clap::{Parser, Subcommand};
6use std::path::PathBuf;
07use url::Url;
89#[derive(Debug, Parser)]
···45 /// Stop at the week ending before this date
46 #[arg(long)]
47 until: Option<Dt>,
00048 },
49 /// Scrape a PLC server, collecting ops into weekly bundles
50 ///
···75 },
76}
7778-async fn pages_to_stdout(rx: flume::Receiver<ExportPage>) -> Result<(), flume::RecvError> {
000079 while let Ok(page) = rx.recv_async().await {
80- for op in page.ops {
81- println!("{op}")
0000082 }
83 }
00000084 Ok(())
85}
86000000000000000087#[tokio::main]
88async fn main() {
89 bin_init("main");
···98 to_postgres,
99 postgres_reset,
100 until,
0101 } => {
102- let (tx, rx) = flume::bounded(32); // big pages
103 tokio::task::spawn(async move {
104 if let Some(dir) = dir {
105 log::info!("Reading weekly bundles from local folder {dir:?}");
···113 .unwrap();
114 }
115 });
116- if let Some(url) = to_postgres {
117- let db = Db::new(url.as_str()).await.unwrap();
118- pages_to_pg(db, rx, postgres_reset).await.unwrap();
000119 } else {
120- pages_to_stdout(rx).await.unwrap();
000000000000000000000000000000000121 }
122 }
123 Commands::Bundle {
···139 let start_at = after.or_else(|| Some(chrono::Utc::now()));
140 let (tx, rx) = flume::bounded(1);
141 tokio::task::spawn(async move { poll_upstream(start_at, url, tx).await.unwrap() });
142- pages_to_stdout(rx).await.unwrap();
143 }
144 }
145}
···1use allegedly::{
2+ Db, Dt, ExportPage, FolderSource, HttpSource, PageBoundaryState, backfill, backfill_to_pg,
3+ bin_init, pages_to_pg, pages_to_weeks, poll_upstream,
4};
5use clap::{Parser, Subcommand};
6use std::path::PathBuf;
7+use tokio::sync::oneshot;
8use url::Url;
910#[derive(Debug, Parser)]
···46 /// Stop at the week ending before this date
47 #[arg(long)]
48 until: Option<Dt>,
49+ /// After the weekly imports, poll upstream until we're caught up
50+ #[arg(long, action)]
51+ catch_up: bool,
52 },
53 /// Scrape a PLC server, collecting ops into weekly bundles
54 ///
···79 },
80}
8182+async fn pages_to_stdout(
83+ rx: flume::Receiver<ExportPage>,
84+ notify_last_at: Option<oneshot::Sender<Option<Dt>>>,
85+) -> Result<(), flume::RecvError> {
86+ let mut last_at = None;
87 while let Ok(page) = rx.recv_async().await {
88+ for op in &page.ops {
89+ println!("{op}");
90+ }
91+ if notify_last_at.is_some()
92+ && let Some(s) = PageBoundaryState::new(&page)
93+ {
94+ last_at = last_at.filter(|&l| l >= s.last_at).or(Some(s.last_at));
95 }
96 }
97+ if let Some(notify) = notify_last_at {
98+ log::trace!("notifying last_at: {last_at:?}");
99+ if notify.send(last_at).is_err() {
100+ log::error!("receiver for last_at dropped, can't notify");
101+ };
102+ }
103 Ok(())
104}
105106+/// page forwarder who drops its channels on receipt of a small page
107+///
108+/// PLC will return up to 1000 ops on a page, and returns full pages until it
109+/// has caught up, so this is a (hacky?) way to stop polling once we're up.
110+fn full_pages(rx: flume::Receiver<ExportPage>) -> flume::Receiver<ExportPage> {
111+ let (tx, fwd) = flume::bounded(0);
112+ tokio::task::spawn(async move {
113+ while let Ok(page) = rx.recv_async().await
114+ && page.ops.len() > 900
115+ {
116+ tx.send_async(page).await.unwrap();
117+ }
118+ });
119+ fwd
120+}
121+122#[tokio::main]
123async fn main() {
124 bin_init("main");
···133 to_postgres,
134 postgres_reset,
135 until,
136+ catch_up,
137 } => {
138+ let (tx, rx) = flume::bounded(32); // these are big pages
139 tokio::task::spawn(async move {
140 if let Some(dir) = dir {
141 log::info!("Reading weekly bundles from local folder {dir:?}");
···149 .unwrap();
150 }
151 });
152+153+ // postgres writer will notify us as soon as the very last op's time is known
154+ // so we can start catching up while pg is restoring indexes and stuff
155+ let (notify_last_at, rx_last) = if catch_up {
156+ let (tx, rx) = oneshot::channel();
157+ (Some(tx), Some(rx))
158 } else {
159+ (None, None)
160+ };
161+162+ let to_postgres_url_bulk = to_postgres.clone();
163+ let bulk_out_write = tokio::task::spawn(async move {
164+ if let Some(ref url) = to_postgres_url_bulk {
165+ let db = Db::new(url.as_str()).await.unwrap();
166+ backfill_to_pg(db, postgres_reset, rx, notify_last_at)
167+ .await
168+ .unwrap();
169+ } else {
170+ pages_to_stdout(rx, notify_last_at).await.unwrap();
171+ }
172+ });
173+174+ if let Some(rx_last) = rx_last {
175+ let mut upstream = args.upstream;
176+ upstream.set_path("/export");
177+ // wait until the time for `after` is known
178+ let last_at = rx_last.await.unwrap();
179+ log::info!("beginning catch-up from {last_at:?} while the writer finalizes stuff");
180+ let (tx, rx) = flume::bounded(256);
181+ tokio::task::spawn(
182+ async move { poll_upstream(last_at, upstream, tx).await.unwrap() },
183+ );
184+ bulk_out_write.await.unwrap();
185+ log::info!("writing catch-up pages");
186+ let full_pages = full_pages(rx);
187+ if let Some(url) = to_postgres {
188+ let db = Db::new(url.as_str()).await.unwrap();
189+ pages_to_pg(db, full_pages).await.unwrap();
190+ } else {
191+ pages_to_stdout(full_pages, None).await.unwrap();
192+ }
193 }
194 }
195 Commands::Bundle {
···211 let start_at = after.or_else(|| Some(chrono::Utc::now()));
212 let (tx, rx) = flume::bounded(1);
213 tokio::task::spawn(async move { poll_upstream(start_at, url, tx).await.unwrap() });
214+ pages_to_stdout(rx, None).await.unwrap();
215 }
216 }
217}
+2-2
src/lib.rs
···89pub use backfill::backfill;
10pub use client::CLIENT;
11-pub use plc_pg::{Db, write_bulk};
12-pub use poll::{get_page, poll_upstream};
13pub use weekly::{BundleSource, FolderSource, HttpSource, Week, pages_to_weeks, week_to_pages};
1415pub type Dt = chrono::DateTime<chrono::Utc>;
···89pub use backfill::backfill;
10pub use client::CLIENT;
11+pub use plc_pg::{Db, backfill_to_pg, pages_to_pg};
12+pub use poll::{PageBoundaryState, get_page, poll_upstream};
13pub use weekly::{BundleSource, FolderSource, HttpSource, Week, pages_to_weeks, week_to_pages};
1415pub type Dt = chrono::DateTime<chrono::Utc>;
+72-6
src/plc_pg.rs
···1-use crate::{ExportPage, Op};
2use std::pin::pin;
3use std::time::Instant;
04use tokio_postgres::{
5 Client, Error as PgError, NoTls,
6 binary_copy::BinaryCopyInWriter,
···71 }
72}
73000000000000000000000000000000000000000000000000074/// Dump rows into an empty operations table quickly
75///
76/// you must run this after initializing the db with kysely migrations from the
···84/// panics: if the operations or dids tables are not empty, unless reset is true
85///
86/// recommended postgres setting: `max_wal_size=4GB` (or more)
87-pub async fn write_bulk(
88 db: Db,
089 pages: flume::Receiver<ExportPage>,
90- reset: bool,
91) -> Result<(), PgError> {
92 let mut client = db.connect().await?;
9394 let t0 = Instant::now();
95 let tx = client.transaction().await?;
96- tx.execute("SET LOCAL synchronous_commit = off", &[]).await?;
09798 let t_step = Instant::now();
99 for table in ["operations", "dids"] {
···142 )
143 .await?;
144 let mut writer = pin!(BinaryCopyInWriter::new(sync, types));
0145 while let Ok(page) = pages.recv_async().await {
146- for s in page.ops {
147- let Ok(op) = serde_json::from_str::<Op>(&s) else {
148 log::warn!("ignoring unparseable op: {s:?}");
149 continue;
150 };
···159 ])
160 .await?;
161 }
00000162 }
00000000163 let n = writer.as_mut().finish().await?;
164 log::trace!("COPY IN wrote {n} ops: {:?}", t_step.elapsed());
165
···1+use crate::{Dt, ExportPage, Op, PageBoundaryState};
2use std::pin::pin;
3use std::time::Instant;
4+use tokio::sync::oneshot;
5use tokio_postgres::{
6 Client, Error as PgError, NoTls,
7 binary_copy::BinaryCopyInWriter,
···72 }
73}
7475+pub async fn pages_to_pg(db: Db, pages: flume::Receiver<ExportPage>) -> Result<(), PgError> {
76+ let mut client = db.connect().await?;
77+78+ let ops_stmt = client
79+ .prepare(
80+ r#"INSERT INTO operations (did, operation, cid, nullified, "createdAt")
81+ VALUES ($1, $2, $3, $4, $5)"#,
82+ )
83+ .await?;
84+ let did_stmt = client
85+ .prepare(r#"INSERT INTO dids (did) VALUES ($1) ON CONFLICT do nothing"#)
86+ .await?;
87+88+ let t0 = Instant::now();
89+ let mut ops_inserted = 0;
90+ let mut dids_inserted = 0;
91+92+ while let Ok(page) = pages.recv_async().await {
93+ log::trace!("writing page with {} ops", page.ops.len());
94+ let tx = client.transaction().await?;
95+ for s in page.ops {
96+ let Ok(op) = serde_json::from_str::<Op>(&s) else {
97+ log::warn!("ignoring unparseable op {s:?}");
98+ continue;
99+ };
100+ ops_inserted += tx
101+ .execute(
102+ &ops_stmt,
103+ &[
104+ &op.did,
105+ &Json(op.operation),
106+ &op.cid,
107+ &op.nullified,
108+ &op.created_at,
109+ ],
110+ )
111+ .await?;
112+ dids_inserted += tx.execute(&did_stmt, &[&op.did]).await?;
113+ }
114+ tx.commit().await?;
115+ }
116+117+ log::info!(
118+ "no more pages. inserted {ops_inserted} ops and {dids_inserted} dids in {:?}",
119+ t0.elapsed()
120+ );
121+ Ok(())
122+}
123+124/// Dump rows into an empty operations table quickly
125///
126/// you must run this after initializing the db with kysely migrations from the
···134/// panics: if the operations or dids tables are not empty, unless reset is true
135///
136/// recommended postgres setting: `max_wal_size=4GB` (or more)
137+pub async fn backfill_to_pg(
138 db: Db,
139+ reset: bool,
140 pages: flume::Receiver<ExportPage>,
141+ notify_last_at: Option<oneshot::Sender<Option<Dt>>>,
142) -> Result<(), PgError> {
143 let mut client = db.connect().await?;
144145 let t0 = Instant::now();
146 let tx = client.transaction().await?;
147+ tx.execute("SET LOCAL synchronous_commit = off", &[])
148+ .await?;
149150 let t_step = Instant::now();
151 for table in ["operations", "dids"] {
···194 )
195 .await?;
196 let mut writer = pin!(BinaryCopyInWriter::new(sync, types));
197+ let mut last_at = None;
198 while let Ok(page) = pages.recv_async().await {
199+ for s in &page.ops {
200+ let Ok(op) = serde_json::from_str::<Op>(s) else {
201 log::warn!("ignoring unparseable op: {s:?}");
202 continue;
203 };
···212 ])
213 .await?;
214 }
215+ if notify_last_at.is_some()
216+ && let Some(s) = PageBoundaryState::new(&page)
217+ {
218+ last_at = last_at.filter(|&l| l >= s.last_at).or(Some(s.last_at));
219+ }
220 }
221+222+ if let Some(notify) = notify_last_at {
223+ log::trace!("notifying last_at: {last_at:?}");
224+ if notify.send(last_at).is_err() {
225+ log::error!("receiver for last_at dropped, can't notify");
226+ };
227+ }
228+229 let n = writer.as_mut().finish().await?;
230 log::trace!("COPY IN wrote {n} ops: {:?}", t_step.elapsed());
231
+3-3
src/poll.rs
···4546/// PLC
47#[derive(Debug, PartialEq)]
48-struct PageBoundaryState {
49- last_at: Dt,
50 keys_at: Vec<OpKey>, // expected to ~always be length one
51}
52···66// should unrefactor to make Op own its data again, parse (and deal with errors)
67// upfront, and probably greatly simplify everything downstream. simple.
68impl PageBoundaryState {
69- fn new(page: &ExportPage) -> Option<Self> {
70 let mut skips = 0;
7172 // grab the very last op
···4546/// PLC
47#[derive(Debug, PartialEq)]
48+pub struct PageBoundaryState {
49+ pub last_at: Dt,
50 keys_at: Vec<OpKey>, // expected to ~always be length one
51}
52···66// should unrefactor to make Op own its data again, parse (and deal with errors)
67// upfront, and probably greatly simplify everything downstream. simple.
68impl PageBoundaryState {
69+ pub fn new(page: &ExportPage) -> Option<Self> {
70 let mut skips = 0;
7172 // grab the very last op