···1//! Consume a CAR from an AsyncRead, producing an ordered stream of records
203use crate::disk::{DiskError, DiskStore};
4-use crate::process::Processable;
5-use ipld_core::cid::Cid;
06use iroh_car::CarReader;
7-use serde::{Deserialize, Serialize};
8-use std::collections::HashMap;
9use std::convert::Infallible;
10use tokio::{io::AsyncRead, sync::mpsc};
1112-use crate::mst::{Commit, Node};
13use crate::walk::{Step, WalkError, Walker};
1415/// Errors that can happen while consuming and emitting blocks and records
···29 MissingRoot,
30 #[error("Storage error")]
31 StorageError(#[from] DiskError),
32- #[error("Encode error: {0}")]
33- BincodeEncodeError(#[from] bincode::error::EncodeError),
34 #[error("Tried to send on a closed channel")]
35 ChannelSendError, // SendError takes <T> which we don't need
36 #[error("Failed to join a task: {0}")]
37 JoinError(#[from] tokio::task::JoinError),
38}
3940-#[derive(Debug, thiserror::Error)]
41-pub enum DecodeError {
42- #[error(transparent)]
43- BincodeDecodeError(#[from] bincode::error::DecodeError),
44- #[error("extra bytes remained after decoding")]
45- ExtraGarbage,
46-}
47-48/// An in-order chunk of Rkey + (processed) Block pairs
49-pub type BlockChunk<T> = Vec<(String, T)>;
5051-#[derive(Debug, Clone, Serialize, Deserialize)]
52-pub(crate) enum MaybeProcessedBlock<T> {
53 /// A block that's *probably* a Node (but we can't know yet)
54 ///
55 /// It *can be* a record that suspiciously looks a lot like a node, so we
56 /// cannot eagerly turn it into a Node. We only know for sure what it is
57 /// when we actually walk down the MST
58- Raw(Vec<u8>),
59 /// A processed record from a block that was definitely not a Node
60 ///
61 /// Processing has to be fallible because the CAR can have totally-unused
···71 /// There's an alternative here, which would be to kick unprocessable blocks
72 /// back to Raw, or maybe even a new RawUnprocessable variant. Then we could
73 /// surface the typed error later if needed by trying to reprocess.
74- Processed(T),
75-}
76-77-impl<T: Processable> Processable for MaybeProcessedBlock<T> {
78- /// TODO this is probably a little broken
79- fn get_size(&self) -> usize {
80- use std::{cmp::max, mem::size_of};
81-82- // enum is always as big as its biggest member?
83- let base_size = max(size_of::<Vec<u8>>(), size_of::<T>());
84-85- let extra = match self {
86- Self::Raw(bytes) => bytes.len(),
87- Self::Processed(t) => t.get_size(),
88- };
89-90- base_size + extra
91- }
92}
9394-impl<T> MaybeProcessedBlock<T> {
95- fn maybe(process: fn(Vec<u8>) -> T, data: Vec<u8>) -> Self {
96 if Node::could_be(&data) {
97 MaybeProcessedBlock::Raw(data)
98 } else {
99 MaybeProcessedBlock::Processed(process(data))
100 }
101 }
00000000000000000000000000000102}
103104/// Read a CAR file, buffering blocks in memory or to disk
105-pub enum Driver<R: AsyncRead + Unpin, T: Processable> {
106 /// All blocks fit within the memory limit
107 ///
108 /// You probably want to check the commit's signature. You can go ahead and
109 /// walk the MST right away.
110- Memory(Commit, MemDriver<T>),
111 /// Blocks exceed the memory limit
112 ///
113 /// You'll need to provide a disk storage to continue. The commit will be
114 /// returned and can be validated only once all blocks are loaded.
115- Disk(NeedDisk<R, T>),
116}
117118/// Builder-style driver setup
···127 }
128}
129000000130impl DriverBuilder {
131 /// Begin configuring the driver with defaults
132 pub fn new() -> Self {
···143 /// Set the block processor
144 ///
145 /// Default: noop, raw blocks will be emitted
146- pub fn with_block_processor<T: Processable>(
147 self,
148- p: fn(Vec<u8>) -> T,
149- ) -> DriverBuilderWithProcessor<T> {
150 DriverBuilderWithProcessor {
151 mem_limit_mb: self.mem_limit_mb,
152- block_processor: p,
153 }
154 }
155 /// Begin processing an atproto MST from a CAR file
156- pub async fn load_car<R: AsyncRead + Unpin>(
157- &self,
158- reader: R,
159- ) -> Result<Driver<R, Vec<u8>>, DriveError> {
160- Driver::load_car(reader, crate::process::noop, self.mem_limit_mb).await
161 }
162}
163···165///
166/// start from `DriverBuilder`
167#[derive(Debug, Clone)]
168-pub struct DriverBuilderWithProcessor<T: Processable> {
169 pub mem_limit_mb: usize,
170- pub block_processor: fn(Vec<u8>) -> T,
171}
172173-impl<T: Processable> DriverBuilderWithProcessor<T> {
174 /// Set the in-memory size limit, in MiB
175 ///
176 /// Default: 16 MiB
···179 self
180 }
181 /// Begin processing an atproto MST from a CAR file
182- pub async fn load_car<R: AsyncRead + Unpin>(
183- &self,
184- reader: R,
185- ) -> Result<Driver<R, T>, DriveError> {
186 Driver::load_car(reader, self.block_processor, self.mem_limit_mb).await
187 }
188}
189190-impl<R: AsyncRead + Unpin, T: Processable> Driver<R, T> {
191 /// Begin processing an atproto MST from a CAR file
192 ///
193 /// Blocks will be loaded, processed, and buffered in memory. If the entire
···199 /// resumed by providing a `SqliteStorage` for on-disk block storage.
200 pub async fn load_car(
201 reader: R,
202- process: fn(Vec<u8>) -> T,
203 mem_limit_mb: usize,
204- ) -> Result<Driver<R, T>, DriveError> {
205 let max_size = mem_limit_mb * 2_usize.pow(20);
206 let mut mem_blocks = HashMap::new();
207···227 continue;
228 }
22900230 // remaining possible types: node, record, other. optimistically process
231 let maybe_processed = MaybeProcessedBlock::maybe(process, data);
232233 // stash (maybe processed) blocks in memory as long as we have room
234- mem_size += std::mem::size_of::<Cid>() + maybe_processed.get_size();
235 mem_blocks.insert(cid, maybe_processed);
236 if mem_size >= max_size {
237 return Ok(Driver::Disk(NeedDisk {
···275/// work the init function will do. We can drop the CAR reader before walking,
276/// so the sync/async boundaries become a little easier to work around.
277#[derive(Debug)]
278-pub struct MemDriver<T: Processable> {
279- blocks: HashMap<Cid, MaybeProcessedBlock<T>>,
280 walker: Walker,
281- process: fn(Vec<u8>) -> T,
282}
283284-impl<T: Processable> MemDriver<T> {
285 /// Step through the record outputs, in rkey order
286- pub async fn next_chunk(&mut self, n: usize) -> Result<Option<BlockChunk<T>>, DriveError> {
287 let mut out = Vec::with_capacity(n);
288 for _ in 0..n {
289 // walk as far as we can until we run out of blocks or find a record
···306}
307308/// A partially memory-loaded car file that needs disk spillover to continue
309-pub struct NeedDisk<R: AsyncRead + Unpin, T: Processable> {
310 car: CarReader<R>,
311 root: Cid,
312- process: fn(Vec<u8>) -> T,
313 max_size: usize,
314- mem_blocks: HashMap<Cid, MaybeProcessedBlock<T>>,
315 pub commit: Option<Commit>,
316}
317318-fn encode(v: impl Serialize) -> Result<Vec<u8>, bincode::error::EncodeError> {
319- bincode::serde::encode_to_vec(v, bincode::config::standard())
320-}
321-322-pub(crate) fn decode<T: Processable>(bytes: &[u8]) -> Result<T, DecodeError> {
323- let (t, n) = bincode::serde::decode_from_slice(bytes, bincode::config::standard())?;
324- if n != bytes.len() {
325- return Err(DecodeError::ExtraGarbage);
326- }
327- Ok(t)
328-}
329-330-impl<R: AsyncRead + Unpin, T: Processable + Send + 'static> NeedDisk<R, T> {
331 pub async fn finish_loading(
332 mut self,
333 mut store: DiskStore,
334- ) -> Result<(Commit, DiskDriver<T>), DriveError> {
335 // move store in and back out so we can manage lifetimes
336 // dump mem blocks into the store
337 store = tokio::task::spawn(async move {
338 let kvs = self
339 .mem_blocks
340 .into_iter()
341- .map(|(k, v)| Ok(encode(v).map(|v| (k.to_bytes(), v))?));
342343 store.put_many(kvs)?;
344 Ok::<_, DriveError>(store)
345 })
346 .await??;
347348- let (tx, mut rx) = mpsc::channel::<Vec<(Cid, MaybeProcessedBlock<T>)>>(1);
349350 let store_worker = tokio::task::spawn_blocking(move || {
351 while let Some(chunk) = rx.blocking_recv() {
352 let kvs = chunk
353 .into_iter()
354- .map(|(k, v)| Ok(encode(v).map(|v| (k.to_bytes(), v))?));
355 store.put_many(kvs)?;
356 }
357 Ok::<_, DriveError>(store)
···372 self.commit = Some(c);
373 continue;
374 }
000375 // remaining possible types: node, record, other. optimistically process
376 // TODO: get the actual in-memory size to compute disk spill
377 let maybe_processed = MaybeProcessedBlock::maybe(self.process, data);
378- mem_size += std::mem::size_of::<Cid>() + maybe_processed.get_size();
379 chunk.push((cid, maybe_processed));
380 if mem_size >= self.max_size {
381 // soooooo if we're setting the db cache to max_size and then letting
···418}
419420/// MST walker that reads from disk instead of an in-memory hashmap
421-pub struct DiskDriver<T: Clone> {
422- process: fn(Vec<u8>) -> T,
423 state: Option<BigState>,
424}
425426// for doctests only
427#[doc(hidden)]
428-pub fn _get_fake_disk_driver() -> DiskDriver<Vec<u8>> {
429- use crate::process::noop;
430 DiskDriver {
431 process: noop,
432 state: None,
433 }
434}
435436-impl<T: Processable + Send + 'static> DiskDriver<T> {
437 /// Walk the MST returning up to `n` rkey + record pairs
438 ///
439 /// ```no_run
440- /// # use repo_stream::{drive::{DiskDriver, DriveError, _get_fake_disk_driver}, process::noop};
441 /// # #[tokio::main]
442 /// # async fn main() -> Result<(), DriveError> {
443 /// # let mut disk_driver = _get_fake_disk_driver();
···449 /// # Ok(())
450 /// # }
451 /// ```
452- pub async fn next_chunk(&mut self, n: usize) -> Result<Option<BlockChunk<T>>, DriveError> {
453 let process = self.process;
454455 // state should only *ever* be None transiently while inside here
···458 // the big pain here is that we don't want to leave self.state in an
459 // invalid state (None), so all the error paths have to make sure it
460 // comes out again.
461- let (state, res) = tokio::task::spawn_blocking(
462- move || -> (BigState, Result<BlockChunk<T>, DriveError>) {
463 let mut out = Vec::with_capacity(n);
464465 for _ in 0..n {
···480 }
481482 (state, Ok::<_, DriveError>(out))
483- },
484- )
485- .await?; // on tokio JoinError, we'll be left with invalid state :(
486487 // *must* restore state before dealing with the actual result
488 self.state = Some(state);
···499 fn read_tx_blocking(
500 &mut self,
501 n: usize,
502- tx: mpsc::Sender<Result<BlockChunk<T>, DriveError>>,
503- ) -> Result<(), mpsc::error::SendError<Result<BlockChunk<T>, DriveError>>> {
504 let BigState { store, walker } = self.state.as_mut().expect("valid state");
505506 loop {
507- let mut out: BlockChunk<T> = Vec::with_capacity(n);
508509 for _ in 0..n {
510 // walk as far as we can until we run out of blocks or find a record
···546 /// benefit over just using `.next_chunk(n)`.
547 ///
548 /// ```no_run
549- /// # use repo_stream::{drive::{DiskDriver, DriveError, _get_fake_disk_driver}, process::noop};
550 /// # #[tokio::main]
551 /// # async fn main() -> Result<(), DriveError> {
552 /// # let mut disk_driver = _get_fake_disk_driver();
···565 mut self,
566 n: usize,
567 ) -> (
568- mpsc::Receiver<Result<BlockChunk<T>, DriveError>>,
569 tokio::task::JoinHandle<Self>,
570 ) {
571- let (tx, rx) = mpsc::channel::<Result<BlockChunk<T>, DriveError>>(1);
572573 // sketch: this worker is going to be allowed to execute without a join handle
574 let chan_task = tokio::task::spawn_blocking(move || {
···1//! Consume a CAR from an AsyncRead, producing an ordered stream of records
23+use crate::HashMap;
4use crate::disk::{DiskError, DiskStore};
5+use crate::mst::Node;
6+use bytes::Bytes;
7+use cid::Cid;
8use iroh_car::CarReader;
009use std::convert::Infallible;
10use tokio::{io::AsyncRead, sync::mpsc};
1112+use crate::mst::Commit;
13use crate::walk::{Step, WalkError, Walker};
1415/// Errors that can happen while consuming and emitting blocks and records
···29 MissingRoot,
30 #[error("Storage error")]
31 StorageError(#[from] DiskError),
0032 #[error("Tried to send on a closed channel")]
33 ChannelSendError, // SendError takes <T> which we don't need
34 #[error("Failed to join a task: {0}")]
35 JoinError(#[from] tokio::task::JoinError),
36}
370000000038/// An in-order chunk of Rkey + (processed) Block pairs
39+pub type BlockChunk = Vec<(String, Bytes)>;
4041+#[derive(Debug, Clone)]
42+pub(crate) enum MaybeProcessedBlock {
43 /// A block that's *probably* a Node (but we can't know yet)
44 ///
45 /// It *can be* a record that suspiciously looks a lot like a node, so we
46 /// cannot eagerly turn it into a Node. We only know for sure what it is
47 /// when we actually walk down the MST
48+ Raw(Bytes),
49 /// A processed record from a block that was definitely not a Node
50 ///
51 /// Processing has to be fallible because the CAR can have totally-unused
···61 /// There's an alternative here, which would be to kick unprocessable blocks
62 /// back to Raw, or maybe even a new RawUnprocessable variant. Then we could
63 /// surface the typed error later if needed by trying to reprocess.
64+ Processed(Bytes),
0000000000000000065}
6667+impl MaybeProcessedBlock {
68+ pub(crate) fn maybe(process: fn(Bytes) -> Bytes, data: Bytes) -> Self {
69 if Node::could_be(&data) {
70 MaybeProcessedBlock::Raw(data)
71 } else {
72 MaybeProcessedBlock::Processed(process(data))
73 }
74 }
75+ pub(crate) fn len(&self) -> usize {
76+ match self {
77+ MaybeProcessedBlock::Raw(b) => b.len(),
78+ MaybeProcessedBlock::Processed(b) => b.len(),
79+ }
80+ }
81+ pub(crate) fn into_bytes(self) -> Bytes {
82+ match self {
83+ MaybeProcessedBlock::Raw(b) => {
84+ let mut owned = b.try_into_mut().unwrap();
85+ owned.extend_from_slice(&[0x00]);
86+ owned.into()
87+ }
88+ MaybeProcessedBlock::Processed(b) => {
89+ let mut owned = b.try_into_mut().unwrap();
90+ owned.extend_from_slice(&[0x01]);
91+ owned.into()
92+ }
93+ }
94+ }
95+ pub(crate) fn from_bytes(mut b: Bytes) -> Self {
96+ // TODO: make sure bytes is not empty, that it's explicitly 0 or 1, etc
97+ let suffix = b.split_off(b.len() - 1);
98+ if *suffix == [0x00] {
99+ MaybeProcessedBlock::Raw(b)
100+ } else {
101+ MaybeProcessedBlock::Processed(b)
102+ }
103+ }
104}
105106/// Read a CAR file, buffering blocks in memory or to disk
107+pub enum Driver<R: AsyncRead + Unpin> {
108 /// All blocks fit within the memory limit
109 ///
110 /// You probably want to check the commit's signature. You can go ahead and
111 /// walk the MST right away.
112+ Memory(Commit, MemDriver),
113 /// Blocks exceed the memory limit
114 ///
115 /// You'll need to provide a disk storage to continue. The commit will be
116 /// returned and can be validated only once all blocks are loaded.
117+ Disk(NeedDisk<R>),
118}
119120/// Builder-style driver setup
···129 }
130}
/// Processor that just returns the raw blocks
///
/// This is the default block processor: record blocks are emitted as their
/// raw bytes, untouched.
#[inline]
pub fn noop(block: Bytes) -> Bytes {
    block
}
137+138impl DriverBuilder {
139 /// Begin configuring the driver with defaults
140 pub fn new() -> Self {
···151 /// Set the block processor
152 ///
153 /// Default: noop, raw blocks will be emitted
154+ pub fn with_block_processor(
155 self,
156+ block_processor: fn(Bytes) -> Bytes,
157+ ) -> DriverBuilderWithProcessor {
158 DriverBuilderWithProcessor {
159 mem_limit_mb: self.mem_limit_mb,
160+ block_processor,
161 }
162 }
163 /// Begin processing an atproto MST from a CAR file
164+ pub async fn load_car<R: AsyncRead + Unpin>(&self, reader: R) -> Result<Driver<R>, DriveError> {
165+ Driver::load_car(reader, noop, self.mem_limit_mb).await
000166 }
167}
168···170///
171/// start from `DriverBuilder`
172#[derive(Debug, Clone)]
173+pub struct DriverBuilderWithProcessor {
174 pub mem_limit_mb: usize,
175+ pub block_processor: fn(Bytes) -> Bytes,
176}
177178+impl DriverBuilderWithProcessor {
179 /// Set the in-memory size limit, in MiB
180 ///
181 /// Default: 16 MiB
···184 self
185 }
186 /// Begin processing an atproto MST from a CAR file
187+ pub async fn load_car<R: AsyncRead + Unpin>(&self, reader: R) -> Result<Driver<R>, DriveError> {
000188 Driver::load_car(reader, self.block_processor, self.mem_limit_mb).await
189 }
190}
191192+impl<R: AsyncRead + Unpin> Driver<R> {
193 /// Begin processing an atproto MST from a CAR file
194 ///
195 /// Blocks will be loaded, processed, and buffered in memory. If the entire
···201 /// resumed by providing a `SqliteStorage` for on-disk block storage.
202 pub async fn load_car(
203 reader: R,
204+ process: fn(Bytes) -> Bytes,
205 mem_limit_mb: usize,
206+ ) -> Result<Driver<R>, DriveError> {
207 let max_size = mem_limit_mb * 2_usize.pow(20);
208 let mut mem_blocks = HashMap::new();
209···229 continue;
230 }
231232+ let data = Bytes::from(data);
233+234 // remaining possible types: node, record, other. optimistically process
235 let maybe_processed = MaybeProcessedBlock::maybe(process, data);
236237 // stash (maybe processed) blocks in memory as long as we have room
238+ mem_size += maybe_processed.len();
239 mem_blocks.insert(cid, maybe_processed);
240 if mem_size >= max_size {
241 return Ok(Driver::Disk(NeedDisk {
···279/// work the init function will do. We can drop the CAR reader before walking,
280/// so the sync/async boundaries become a little easier to work around.
#[derive(Debug)]
pub struct MemDriver {
    /// All (maybe-processed) blocks from the CAR, keyed by CID
    blocks: HashMap<Cid, MaybeProcessedBlock>,
    /// MST walk state
    walker: Walker,
    /// User-supplied block processor, applied once a raw block is known to be
    /// a record rather than a node
    process: fn(Bytes) -> Bytes,
}
287288+impl MemDriver {
289 /// Step through the record outputs, in rkey order
290+ pub async fn next_chunk(&mut self, n: usize) -> Result<Option<BlockChunk>, DriveError> {
291 let mut out = Vec::with_capacity(n);
292 for _ in 0..n {
293 // walk as far as we can until we run out of blocks or find a record
···310}
311312/// A partially memory-loaded car file that needs disk spillover to continue
pub struct NeedDisk<R: AsyncRead + Unpin> {
    /// The partially-consumed CAR reader; loading resumes from here
    car: CarReader<R>,
    /// Root CID for the MST walk
    root: Cid,
    /// User-supplied block processor
    process: fn(Bytes) -> Bytes,
    /// In-memory size budget in bytes (mem_limit_mb * 2^20); chunks are
    /// flushed to disk once accumulated blocks reach it
    max_size: usize,
    /// Blocks buffered in memory before the limit was hit; dumped to the
    /// disk store by `finish_loading`
    mem_blocks: HashMap<Cid, MaybeProcessedBlock>,
    /// The signed commit, if one has been encountered in the CAR yet
    pub commit: Option<Commit>,
}
321322+impl<R: AsyncRead + Unpin> NeedDisk<R> {
000000000000323 pub async fn finish_loading(
324 mut self,
325 mut store: DiskStore,
326+ ) -> Result<(Commit, DiskDriver), DriveError> {
327 // move store in and back out so we can manage lifetimes
328 // dump mem blocks into the store
329 store = tokio::task::spawn(async move {
330 let kvs = self
331 .mem_blocks
332 .into_iter()
333+ .map(|(k, v)| (k.to_bytes(), v.into_bytes()));
334335 store.put_many(kvs)?;
336 Ok::<_, DriveError>(store)
337 })
338 .await??;
339340+ let (tx, mut rx) = mpsc::channel::<Vec<(Cid, MaybeProcessedBlock)>>(1);
341342 let store_worker = tokio::task::spawn_blocking(move || {
343 while let Some(chunk) = rx.blocking_recv() {
344 let kvs = chunk
345 .into_iter()
346+ .map(|(k, v)| (k.to_bytes(), v.into_bytes()));
347 store.put_many(kvs)?;
348 }
349 Ok::<_, DriveError>(store)
···364 self.commit = Some(c);
365 continue;
366 }
367+368+ let data = Bytes::from(data);
369+370 // remaining possible types: node, record, other. optimistically process
371 // TODO: get the actual in-memory size to compute disk spill
372 let maybe_processed = MaybeProcessedBlock::maybe(self.process, data);
373+ mem_size += maybe_processed.len();
374 chunk.push((cid, maybe_processed));
375 if mem_size >= self.max_size {
376 // soooooo if we're setting the db cache to max_size and then letting
···413}
414415/// MST walker that reads from disk instead of an in-memory hashmap
pub struct DiskDriver {
    /// User-supplied block processor
    process: fn(Bytes) -> Bytes,
    /// Disk store + walker. Only ever `None` transiently while `next_chunk`
    /// has moved it into a blocking task; restored before returning.
    state: Option<BigState>,
}
420421// for doctests only
422#[doc(hidden)]
423+pub fn _get_fake_disk_driver() -> DiskDriver {
0424 DiskDriver {
425 process: noop,
426 state: None,
427 }
428}
429430+impl DiskDriver {
431 /// Walk the MST returning up to `n` rkey + record pairs
432 ///
433 /// ```no_run
434+ /// # use repo_stream::{drive::{DiskDriver, DriveError, _get_fake_disk_driver}, noop};
435 /// # #[tokio::main]
436 /// # async fn main() -> Result<(), DriveError> {
437 /// # let mut disk_driver = _get_fake_disk_driver();
···443 /// # Ok(())
444 /// # }
445 /// ```
446+ pub async fn next_chunk(&mut self, n: usize) -> Result<Option<BlockChunk>, DriveError> {
447 let process = self.process;
448449 // state should only *ever* be None transiently while inside here
···452 // the big pain here is that we don't want to leave self.state in an
453 // invalid state (None), so all the error paths have to make sure it
454 // comes out again.
455+ let (state, res) =
456+ tokio::task::spawn_blocking(move || -> (BigState, Result<BlockChunk, DriveError>) {
457 let mut out = Vec::with_capacity(n);
458459 for _ in 0..n {
···474 }
475476 (state, Ok::<_, DriveError>(out))
477+ })
478+ .await?; // on tokio JoinError, we'll be left with invalid state :(
0479480 // *must* restore state before dealing with the actual result
481 self.state = Some(state);
···492 fn read_tx_blocking(
493 &mut self,
494 n: usize,
495+ tx: mpsc::Sender<Result<BlockChunk, DriveError>>,
496+ ) -> Result<(), mpsc::error::SendError<Result<BlockChunk, DriveError>>> {
497 let BigState { store, walker } = self.state.as_mut().expect("valid state");
498499 loop {
500+ let mut out: BlockChunk = Vec::with_capacity(n);
501502 for _ in 0..n {
503 // walk as far as we can until we run out of blocks or find a record
···539 /// benefit over just using `.next_chunk(n)`.
540 ///
541 /// ```no_run
542+ /// # use repo_stream::{drive::{DiskDriver, DriveError, _get_fake_disk_driver}, noop};
543 /// # #[tokio::main]
544 /// # async fn main() -> Result<(), DriveError> {
545 /// # let mut disk_driver = _get_fake_disk_driver();
···558 mut self,
559 n: usize,
560 ) -> (
561+ mpsc::Receiver<Result<BlockChunk, DriveError>>,
562 tokio::task::JoinHandle<Self>,
563 ) {
564+ let (tx, rx) = mpsc::channel::<Result<BlockChunk, DriveError>>(1);
565566 // sketch: this worker is going to be allowed to execute without a join handle
567 let chan_task = tokio::task::spawn_blocking(move || {
+16-6
src/lib.rs
···2728match DriverBuilder::new()
29 .with_mem_limit_mb(10)
30- .with_block_processor(|rec| rec.len()) // block processing: just extract the raw record size
0031 .load_car(reader)
32 .await?
33{
···35 // if all blocks fit within memory
36 Driver::Memory(_commit, mut driver) => {
37 while let Some(chunk) = driver.next_chunk(256).await? {
38- for (_rkey, size) in chunk {
000039 total_size += size;
40 }
41 }
···49 let (_commit, mut driver) = paused.finish_loading(store).await?;
5051 while let Some(chunk) = driver.next_chunk(256).await? {
52- for (_rkey, size) in chunk {
000053 total_size += size;
54 }
55 }
···7677pub mod disk;
78pub mod drive;
79-pub mod process;
8081pub use disk::{DiskBuilder, DiskError, DiskStore};
82-pub use drive::{DriveError, Driver, DriverBuilder, NeedDisk};
83pub use mst::Commit;
84-pub use process::Processable;
0
···2728match DriverBuilder::new()
29 .with_mem_limit_mb(10)
30+ .with_block_processor(
31+ |rec| rec.len().to_ne_bytes().to_vec().into()
32+ ) // block processing: just extract the raw record size
33 .load_car(reader)
34 .await?
35{
···37 // if all blocks fit within memory
38 Driver::Memory(_commit, mut driver) => {
39 while let Some(chunk) = driver.next_chunk(256).await? {
40+ for (_rkey, bytes) in chunk {
41+42+ let (int_bytes, _) = bytes.split_at(size_of::<usize>());
43+ let size = usize::from_ne_bytes(int_bytes.try_into().unwrap());
44+45 total_size += size;
46 }
47 }
···55 let (_commit, mut driver) = paused.finish_loading(store).await?;
5657 while let Some(chunk) = driver.next_chunk(256).await? {
58+ for (_rkey, bytes) in chunk {
59+60+ let (int_bytes, _) = bytes.split_at(size_of::<usize>());
61+ let size = usize::from_ne_bytes(int_bytes.try_into().unwrap());
62+63 total_size += size;
64 }
65 }
···8687pub mod disk;
88pub mod drive;
08990pub use disk::{DiskBuilder, DiskError, DiskStore};
91+pub use drive::{DriveError, Driver, DriverBuilder, NeedDisk, noop};
92pub use mst::Commit;
93+94+pub(crate) use hashbrown::HashMap;
+1-1
src/mst.rs
···3//! The primary aim is to work through the **tree** structure. Non-node blocks
4//! are left as raw bytes, for upper levels to parse into DAG-CBOR or whatever.
56-use ipld_core::cid::Cid;
7use serde::Deserialize;
89/// The top-level data object in a repository's tree is a signed commit.
···3//! The primary aim is to work through the **tree** structure. Non-node blocks
4//! are left as raw bytes, for upper levels to parse into DAG-CBOR or whatever.
56+use cid::Cid;
7use serde::Deserialize;
89/// The top-level data object in a repository's tree is a signed commit.
-108
src/process.rs
···1-/*!
2-Record processor function output trait
3-4-The return type must satisfy the `Processable` trait, which requires:
5-6-- `Clone` because two rkeys can refer to the same record by CID, which may
7- only appear once in the CAR file.
8-- `Serialize + DeserializeOwned` so it can be spilled to disk.
9-10-One required function must be implemented, `get_size()`: this should return the
11-approximate total off-stack size of the type. (the on-stack size will be added
12-automatically via `std::mem::get_size`).
13-14-Note that it is **not guaranteed** that the `process` function will run on a
15-block before storing it in memory or on disk: it's not possible to know if a
16-block is a record without actually walking the MST, so the best we can do is
17-apply `process` to any block that we know *cannot* be an MST node, and otherwise
18-store the raw block bytes.
19-20-Here's a silly processing function that just collects 'eyy's found in the raw
21-record bytes
22-23-```
24-# use repo_stream::Processable;
25-# use serde::{Serialize, Deserialize};
26-#[derive(Debug, Clone, Serialize, Deserialize)]
27-struct Eyy(usize, String);
28-29-impl Processable for Eyy {
30- fn get_size(&self) -> usize {
31- // don't need to compute the usize, it's on the stack
32- self.1.capacity() // in-mem size from the string's capacity, in bytes
33- }
34-}
35-36-fn process(raw: Vec<u8>) -> Vec<Eyy> {
37- let mut out = Vec::new();
38- let to_find = "eyy".as_bytes();
39- for i in 0..(raw.len() - 3) {
40- if &raw[i..(i+3)] == to_find {
41- out.push(Eyy(i, "eyy".to_string()));
42- }
43- }
44- out
45-}
46-```
47-48-The memory sizing stuff is a little sketch but probably at least approximately
49-works.
50-*/
51-52-use serde::{Serialize, de::DeserializeOwned};
53-54-/// Output trait for record processing
55-pub trait Processable: Clone + Serialize + DeserializeOwned {
56- /// Any additional in-memory size taken by the processed type
57- ///
58- /// Do not include stack size (`std::mem::size_of`)
59- fn get_size(&self) -> usize;
60-}
61-62-/// Processor that just returns the raw blocks
63-#[inline]
64-pub fn noop(block: Vec<u8>) -> Vec<u8> {
65- block
66-}
67-68-impl Processable for u8 {
69- fn get_size(&self) -> usize {
70- 0
71- }
72-}
73-74-impl Processable for usize {
75- fn get_size(&self) -> usize {
76- 0 // no additional space taken, just its stack size (newtype is free)
77- }
78-}
79-80-impl Processable for String {
81- fn get_size(&self) -> usize {
82- self.capacity()
83- }
84-}
85-86-impl<Item: Sized + Processable> Processable for Vec<Item> {
87- fn get_size(&self) -> usize {
88- let slot_size = std::mem::size_of::<Item>();
89- let direct_size = slot_size * self.capacity();
90- let items_referenced_size: usize = self.iter().map(|item| item.get_size()).sum();
91- direct_size + items_referenced_size
92- }
93-}
94-95-impl<Item: Processable> Processable for Option<Item> {
96- fn get_size(&self) -> usize {
97- self.as_ref().map(|item| item.get_size()).unwrap_or(0)
98- }
99-}
100-101-impl<Item: Processable, Error: Processable> Processable for Result<Item, Error> {
102- fn get_size(&self) -> usize {
103- match self {
104- Ok(item) => item.get_size(),
105- Err(err) => err.get_size(),
106- }
107- }
108-}