···22 Db, Dt, ExportPage, FjallDb, FolderSource, HttpSource, backfill, backfill_to_fjall,
33 backfill_to_pg,
44 bin::{GlobalArgs, bin_init},
55- full_pages, logo, pages_to_fjall, pages_to_pg, pages_to_stdout, poll_upstream,
55+ fjall_to_pages, full_pages, logo, pages_to_fjall, pages_to_pg, pages_to_stdout, poll_upstream,
66};
77use clap::Parser;
88use reqwest::Url;
···2323 /// Local folder to fetch bundles from (overrides `http`)
2424 #[arg(long)]
2525 dir: Option<PathBuf>,
2626+ /// Local fjall database to fetch raw ops from (overrides `http` and `dir`)
2727+ #[arg(long, conflicts_with_all = ["dir"])]
2828+ from_fjall: Option<PathBuf>,
2629 /// Don't do weekly bulk-loading at all.
2730 ///
2831 /// overrides `http` and `dir`, makes catch_up redundant
···7275 Args {
7376 http,
7477 dir,
7878+ from_fjall,
7579 no_bulk,
7680 source_workers,
7781 to_postgres,
···131135 // fun mode
132136133137 // set up bulk sources
134134- if let Some(dir) = dir {
138138+ if let Some(fjall_path) = from_fjall {
139139+ log::trace!("opening source fjall db at {fjall_path:?}...");
140140+ let db = FjallDb::open(&fjall_path)?;
141141+ log::trace!("opened source fjall db");
142142+ tasks.spawn(fjall_to_pages(db, bulk_tx, until));
143143+ } else if let Some(dir) = dir {
135144 if http != DEFAULT_HTTP.parse()? {
136145 anyhow::bail!(
137146 "non-default bulk http setting can't be used with bulk dir setting ({dir:?})"
+1-1
src/lib.rs
···1919pub use cached_value::{CachedValue, Fetcher};
2020pub use client::{CLIENT, UA};
2121pub use mirror::{ExperimentalConf, ListenConf, serve, serve_fjall};
2222-pub use plc_fjall::{FjallDb, backfill_to_fjall, pages_to_fjall};
2222+pub use plc_fjall::{FjallDb, backfill_to_fjall, fjall_to_pages, pages_to_fjall};
2323pub use plc_pg::{Db, backfill_to_pg, pages_to_pg};
2424pub use poll::{PageBoundaryState, get_page, poll_upstream};
2525pub use ratelimit::{CreatePlcOpLimiter, GovernorMiddleware, IpLimiters};
+89-4
src/plc_fjall.rs
···11use crate::{Dt, ExportPage, Op as CommonOp, PageBoundaryState};
22use anyhow::Context;
33use data_encoding::{BASE32_NOPAD, BASE64URL_NOPAD};
44-use fjall::{Database, Keyspace, KeyspaceCreateOptions, OwnedWriteBatch, PersistMode};
44+use fjall::{
55+ Database, Keyspace, KeyspaceCreateOptions, OwnedWriteBatch, PersistMode,
66+ config::BlockSizePolicy,
77+};
58use serde::{Deserialize, Serialize};
69use std::collections::BTreeMap;
710use std::fmt;
···694697695698impl FjallDb {
696699 pub fn open(path: impl AsRef<Path>) -> fjall::Result<Self> {
700700+ const fn kb(kb: u32) -> u32 {
701701+ kb * 1_024
702702+ }
703703+ const fn mb(mb: u32) -> u64 {
704704+ kb(mb) as u64 * 1_024
705705+ }
706706+697707 let db = Database::builder(path)
698698- .max_journaling_size(/* 1 GiB */ 1_024 * 1_024 * 1_024)
708708+ // the default 32 MB is too low; we can afford more
709709+ // this should be configurable though!
710710+ .cache_size(mb(256))
699711 .open()?;
700712 let opts = KeyspaceCreateOptions::default;
701713 let ops = db.keyspace("ops", || {
702702- opts().max_memtable_size(/* 256 MiB */ 256 * 1_024 * 1_024)
714714+ opts()
715715+ // this mainly matters during backfill
716716+ .max_memtable_size(mb(192))
717717+ // this won't compress terribly well since it's a bunch of CIDs and signatures and did:keys
718718+ // and we want to keep reads fast since we'll be reading a lot...
719719+ .data_block_size_policy(BlockSizePolicy::new([kb(4), kb(8), kb(32)]))
720720+ // this has no downsides, since the only point reads we do that might miss are on by_did
721721+ .expect_point_read_hits(true)
703722 })?;
704723 let by_did = db.keyspace("by_did", || {
705705- opts().max_memtable_size(/* 128 MiB */ 128 * 1_024 * 1_024)
724724+ opts()
725725+ .max_memtable_size(mb(64))
726726+ // this isn't gonna compress well anyway, since it's just keys (did + timestamp + cid)
727727+ // and DIDs don't have many operations in the first place, so we can use small blocks
728728+ .data_block_size_policy(BlockSizePolicy::all(kb(2)))
706729 })?;
707730 Ok(Self {
708731 inner: Arc::new(FjallInner { db, ops, by_did }),
···946969 t0.elapsed()
947970 );
948971 Ok("pages_to_fjall")
972972+}
973973+974974+pub async fn fjall_to_pages(
975975+ db: FjallDb,
976976+ dest: mpsc::Sender<ExportPage>,
977977+ until: Option<Dt>,
978978+) -> anyhow::Result<&'static str> {
979979+ log::info!("starting fjall_to_pages backfill source...");
980980+981981+ let t0 = Instant::now();
982982+983983+ let dest_clone = dest.clone();
984984+ let ops_sent = tokio::task::spawn_blocking(move || -> anyhow::Result<usize> {
985985+ let iter = db.export_ops(None, usize::MAX)?;
986986+ let mut current_page = Vec::with_capacity(1000);
987987+ let mut count = 0;
988988+989989+ for op_res in iter {
990990+ let op = op_res?;
991991+992992+ if let Some(u) = until {
993993+ if op.created_at >= u {
994994+ break;
995995+ }
996996+ }
997997+998998+ let operation_str = serde_json::to_string(&op.operation)?;
999999+ let common_op = crate::Op {
10001000+ did: op.did,
10011001+ cid: op.cid,
10021002+ created_at: op.created_at,
10031003+ nullified: op.nullified,
10041004+ operation: serde_json::value::RawValue::from_string(operation_str)?,
10051005+ };
10061006+10071007+ current_page.push(common_op);
10081008+ count += 1;
10091009+10101010+ if current_page.len() >= 1000 {
10111011+ let page = ExportPage {
10121012+ ops: std::mem::take(&mut current_page),
10131013+ };
10141014+ if dest_clone.blocking_send(page).is_err() {
10151015+ break;
10161016+ }
10171017+ }
10181018+ }
10191019+10201020+ if !current_page.is_empty() {
10211021+ let page = ExportPage { ops: current_page };
10221022+ let _ = dest_clone.blocking_send(page);
10231023+ }
10241024+10251025+ Ok(count)
10261026+ })
10271027+ .await??;
10281028+10291029+ log::info!(
10301030+ "finished sending {ops_sent} ops from fjall in {:?}",
10311031+ t0.elapsed()
10321032+ );
10331033+ Ok("fjall_to_pages")
9491034}
95010359511036#[cfg(test)]