mod client;
mod plc_pg;
mod poll;
mod weekly;

// Public surface of the crate: backfill paging, the shared HTTP client,
// the Postgres handle, upstream polling, and weekly bucketing.
// NOTE(review): `mod backfill` is declared outside this visible chunk — confirm.
pub use backfill::week_to_pages;
pub use client::CLIENT;
pub use plc_pg::Db;
pub use poll::{get_page, poll_upstream};
pub use weekly::{Week, pages_to_weeks};

/// Shorthand for the UTC timestamps used throughout the crate.
pub type Dt = chrono::DateTime<chrono::Utc>;
1416
+9-2
src/poll.rs
/// Watermark of the most recently exported op, used to skip duplicates
/// when consecutive export pages overlap.
///
/// we assume that the order will at least be deterministic: this may be unsound
#[derive(Debug, PartialEq)]
pub struct LastOp {
    pub created_at: Dt, // any op greater is definitely not duplicated
    pk: (String, String), // did, cid — presumably the upstream primary key; TODO confirm
}
2424···117117 page.only_after_last(pl);
118118 }
119119 if !page.is_empty() {
120120- dest.send_async(page).await?;
120120+ match dest.try_send(page) {
121121+ Ok(()) => {}
122122+ Err(flume::TrySendError::Full(page)) => {
123123+ log::warn!("export: destination channel full, awaiting...");
124124+ dest.send_async(page).await?;
125125+ }
126126+ e => e?,
127127+ };
121128 }
122129123130 prev_last = next_last.or(prev_last);
+76
src/weekly.rs
···11+use crate::{Dt, ExportPage, Op};
22+use async_compression::tokio::write::GzipEncoder;
33+use std::path::PathBuf;
44+use tokio::{fs::File, io::AsyncWriteExt};
/// Number of seconds in one week; week buckets are aligned to the Unix epoch.
const WEEK_IN_SECONDS: i64 = 7 * 86400;

/// A week bucket, identified by the Unix timestamp of its first second
/// (always a multiple of [`WEEK_IN_SECONDS`]).
///
/// `Eq`/`Ord`/`Hash` are derived in addition to the original `PartialEq`:
/// the wrapped `i64` supports all of them, and a bucket key type is naturally
/// used in comparisons and as a map key (clippy: `derive_partial_eq_without_eq`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct Week(i64);
1010+1111+impl From<Dt> for Week {
1212+ fn from(dt: Dt) -> Self {
1313+ let ts = dt.timestamp();
1414+ let truncated = (ts / WEEK_IN_SECONDS) * WEEK_IN_SECONDS;
1515+ Week(truncated)
1616+ }
1717+}
1818+1919+impl From<Week> for Dt {
2020+ fn from(week: Week) -> Dt {
2121+ let Week(ts) = week;
2222+ Dt::from_timestamp(ts, 0).expect("the week to be in valid range")
2323+ }
2424+}
2525+2626+pub async fn pages_to_weeks(rx: flume::Receiver<ExportPage>, dir: PathBuf) -> anyhow::Result<()> {
2727+ pub use std::time::Instant;
2828+2929+ // ...there is certainly a nicer way to write this
3030+ let mut current_week: Option<Week> = None;
3131+ let dummy_file = File::create(dir.join("_dummy")).await?;
3232+ let mut encoder = GzipEncoder::new(dummy_file);
3333+3434+ let mut total_ops = 0;
3535+ let total_t0 = Instant::now();
3636+ let mut week_ops = 0;
3737+ let mut week_t0 = total_t0;
3838+ let mut week = 0;
3939+4040+ while let Ok(page) = rx.recv_async().await {
4141+ for mut s in page.ops {
4242+ let Ok(op) = serde_json::from_str::<Op>(&s)
4343+ .inspect_err(|e| log::error!("failed to parse plc op, ignoring: {e}"))
4444+ else {
4545+ continue;
4646+ };
4747+ let op_week = op.created_at.into();
4848+ if current_week.map(|w| w != op_week).unwrap_or(true) {
4949+ encoder.shutdown().await?;
5050+ let now = Instant::now();
5151+5252+ log::info!(
5353+ "done week {week:3 } ({:10 }): {week_ops:7 } ({:5.0 }/s) ops, {:5 }k total ({:5.0 }/s)",
5454+ current_week.unwrap_or(Week(0)).0,
5555+ (week_ops as f64) / (now - week_t0).as_secs_f64(),
5656+ total_ops / 1000,
5757+ (total_ops as f64) / (now - total_t0).as_secs_f64(),
5858+ );
5959+6060+ let file = File::create(dir.join(format!("{}.jsonl.gz", op_week.0))).await?;
6161+ encoder = GzipEncoder::with_quality(file, async_compression::Level::Best);
6262+ current_week = Some(op_week);
6363+ week_ops = 0;
6464+ week_t0 = now;
6565+ week += 1;
6666+ }
6767+ s.push('\n'); // hack
6868+ log::trace!("writing: {s}");
6969+ encoder.write_all(s.as_bytes()).await?;
7070+ total_ops += 1;
7171+ week_ops += 1;
7272+ }
7373+ }
7474+7575+ Ok(())
7676+}