···11//! Consume a CAR from an AsyncRead, producing an ordered stream of records
2233+use crate::HashMap;
34use crate::disk::{DiskError, DiskStore};
44-use crate::process::Processable;
55-use ipld_core::cid::Cid;
55+use crate::mst::Node;
66+use bytes::Bytes;
77+use cid::Cid;
68use iroh_car::CarReader;
77-use serde::{Deserialize, Serialize};
88-use std::collections::HashMap;
99use std::convert::Infallible;
1010use tokio::{io::AsyncRead, sync::mpsc};
11111212-use crate::mst::{Commit, Node};
1212+use crate::mst::Commit;
1313use crate::walk::{Step, WalkError, Walker};
14141515/// Errors that can happen while consuming and emitting blocks and records
···2929 MissingRoot,
3030 #[error("Storage error")]
3131 StorageError(#[from] DiskError),
3232- #[error("Encode error: {0}")]
3333- BincodeEncodeError(#[from] bincode::error::EncodeError),
3432 #[error("Tried to send on a closed channel")]
3533 ChannelSendError, // SendError takes <T> which we don't need
3634 #[error("Failed to join a task: {0}")]
3735 JoinError(#[from] tokio::task::JoinError),
3836}
39374040-#[derive(Debug, thiserror::Error)]
4141-pub enum DecodeError {
4242- #[error(transparent)]
4343- BincodeDecodeError(#[from] bincode::error::DecodeError),
4444- #[error("extra bytes remained after decoding")]
4545- ExtraGarbage,
4646-}
4747-4838/// An in-order chunk of Rkey + (processed) Block pairs
4949-pub type BlockChunk<T> = Vec<(String, T)>;
3939+pub type BlockChunk = Vec<(String, Bytes)>;
50405151-#[derive(Debug, Clone, Serialize, Deserialize)]
5252-pub(crate) enum MaybeProcessedBlock<T> {
4141+#[derive(Debug, Clone)]
4242+pub(crate) enum MaybeProcessedBlock {
5343 /// A block that's *probably* a Node (but we can't know yet)
5444 ///
5545 /// It *can be* a record that suspiciously looks a lot like a node, so we
5646 /// cannot eagerly turn it into a Node. We only know for sure what it is
5747 /// when we actually walk down the MST
5858- Raw(Vec<u8>),
4848+ Raw(Bytes),
5949 /// A processed record from a block that was definitely not a Node
6050 ///
6151 /// Processing has to be fallible because the CAR can have totally-unused
···7161 /// There's an alternative here, which would be to kick unprocessable blocks
7262 /// back to Raw, or maybe even a new RawUnprocessable variant. Then we could
7363 /// surface the typed error later if needed by trying to reprocess.
7474- Processed(T),
7575-}
7676-7777-impl<T: Processable> Processable for MaybeProcessedBlock<T> {
7878- /// TODO this is probably a little broken
7979- fn get_size(&self) -> usize {
8080- use std::{cmp::max, mem::size_of};
8181-8282- // enum is always as big as its biggest member?
8383- let base_size = max(size_of::<Vec<u8>>(), size_of::<T>());
8484-8585- let extra = match self {
8686- Self::Raw(bytes) => bytes.len(),
8787- Self::Processed(t) => t.get_size(),
8888- };
8989-9090- base_size + extra
9191- }
6464+ Processed(Bytes),
9265}
93669494-impl<T> MaybeProcessedBlock<T> {
9595- fn maybe(process: fn(Vec<u8>) -> T, data: Vec<u8>) -> Self {
6767+impl MaybeProcessedBlock {
6868+ pub(crate) fn maybe(process: fn(Bytes) -> Bytes, data: Bytes) -> Self {
9669 if Node::could_be(&data) {
9770 MaybeProcessedBlock::Raw(data)
9871 } else {
9972 MaybeProcessedBlock::Processed(process(data))
10073 }
10174 }
7575+ pub(crate) fn len(&self) -> usize {
7676+ match self {
7777+ MaybeProcessedBlock::Raw(b) => b.len(),
7878+ MaybeProcessedBlock::Processed(b) => b.len(),
7979+ }
8080+ }
8181+ pub(crate) fn into_bytes(self) -> Bytes {
8282+ match self {
8383+ MaybeProcessedBlock::Raw(b) => {
8484+ let mut owned = b.try_into_mut().unwrap();
8585+ owned.extend_from_slice(&[0x00]);
8686+ owned.into()
8787+ }
8888+ MaybeProcessedBlock::Processed(b) => {
8989+ let mut owned = b.try_into_mut().unwrap();
9090+ owned.extend_from_slice(&[0x01]);
9191+ owned.into()
9292+ }
9393+ }
9494+ }
9595+ pub(crate) fn from_bytes(mut b: Bytes) -> Self {
 9696+        // TODO: validate input — return an error (not a silent guess) when `b` is
 9697+        // empty (the `b.len() - 1` below would underflow/panic) or when the suffix
 9698+        // byte is neither 0x00 (Raw) nor 0x01 (Processed)
9797+ let suffix = b.split_off(b.len() - 1);
9898+ if *suffix == [0x00] {
9999+ MaybeProcessedBlock::Raw(b)
100100+ } else {
101101+ MaybeProcessedBlock::Processed(b)
102102+ }
103103+ }
102104}
103105104106/// Read a CAR file, buffering blocks in memory or to disk
105105-pub enum Driver<R: AsyncRead + Unpin, T: Processable> {
107107+pub enum Driver<R: AsyncRead + Unpin> {
106108 /// All blocks fit within the memory limit
107109 ///
108110 /// You probably want to check the commit's signature. You can go ahead and
109111 /// walk the MST right away.
110110- Memory(Commit, MemDriver<T>),
112112+ Memory(Commit, MemDriver),
111113 /// Blocks exceed the memory limit
112114 ///
113115 /// You'll need to provide a disk storage to continue. The commit will be
114116 /// returned and can be validated only once all blocks are loaded.
115115- Disk(NeedDisk<R, T>),
117117+ Disk(NeedDisk<R>),
116118}
117119118120/// Builder-style driver setup
···127129 }
128130}
129131132132+/// Processor that just returns the raw blocks
133133+#[inline]
134134+pub fn noop(block: Bytes) -> Bytes {
135135+ block
136136+}
137137+130138impl DriverBuilder {
131139 /// Begin configuring the driver with defaults
132140 pub fn new() -> Self {
···143151 /// Set the block processor
144152 ///
145153 /// Default: noop, raw blocks will be emitted
146146- pub fn with_block_processor<T: Processable>(
154154+ pub fn with_block_processor(
147155 self,
148148- p: fn(Vec<u8>) -> T,
149149- ) -> DriverBuilderWithProcessor<T> {
156156+ block_processor: fn(Bytes) -> Bytes,
157157+ ) -> DriverBuilderWithProcessor {
150158 DriverBuilderWithProcessor {
151159 mem_limit_mb: self.mem_limit_mb,
152152- block_processor: p,
160160+ block_processor,
153161 }
154162 }
155163 /// Begin processing an atproto MST from a CAR file
156156- pub async fn load_car<R: AsyncRead + Unpin>(
157157- &self,
158158- reader: R,
159159- ) -> Result<Driver<R, Vec<u8>>, DriveError> {
160160- Driver::load_car(reader, crate::process::noop, self.mem_limit_mb).await
164164+ pub async fn load_car<R: AsyncRead + Unpin>(&self, reader: R) -> Result<Driver<R>, DriveError> {
165165+ Driver::load_car(reader, noop, self.mem_limit_mb).await
161166 }
162167}
163168···165170///
166171/// start from `DriverBuilder`
167172#[derive(Debug, Clone)]
168168-pub struct DriverBuilderWithProcessor<T: Processable> {
173173+pub struct DriverBuilderWithProcessor {
169174 pub mem_limit_mb: usize,
170170- pub block_processor: fn(Vec<u8>) -> T,
175175+ pub block_processor: fn(Bytes) -> Bytes,
171176}
172177173173-impl<T: Processable> DriverBuilderWithProcessor<T> {
178178+impl DriverBuilderWithProcessor {
174179 /// Set the in-memory size limit, in MiB
175180 ///
176181 /// Default: 16 MiB
···179184 self
180185 }
181186 /// Begin processing an atproto MST from a CAR file
182182- pub async fn load_car<R: AsyncRead + Unpin>(
183183- &self,
184184- reader: R,
185185- ) -> Result<Driver<R, T>, DriveError> {
187187+ pub async fn load_car<R: AsyncRead + Unpin>(&self, reader: R) -> Result<Driver<R>, DriveError> {
186188 Driver::load_car(reader, self.block_processor, self.mem_limit_mb).await
187189 }
188190}
189191190190-impl<R: AsyncRead + Unpin, T: Processable> Driver<R, T> {
192192+impl<R: AsyncRead + Unpin> Driver<R> {
191193 /// Begin processing an atproto MST from a CAR file
192194 ///
193195 /// Blocks will be loaded, processed, and buffered in memory. If the entire
···199201 /// resumed by providing a `SqliteStorage` for on-disk block storage.
200202 pub async fn load_car(
201203 reader: R,
202202- process: fn(Vec<u8>) -> T,
204204+ process: fn(Bytes) -> Bytes,
203205 mem_limit_mb: usize,
204204- ) -> Result<Driver<R, T>, DriveError> {
206206+ ) -> Result<Driver<R>, DriveError> {
205207 let max_size = mem_limit_mb * 2_usize.pow(20);
206208 let mut mem_blocks = HashMap::new();
207209···227229 continue;
228230 }
229231232232+ let data = Bytes::from(data);
233233+230234 // remaining possible types: node, record, other. optimistically process
231235 let maybe_processed = MaybeProcessedBlock::maybe(process, data);
232236233237 // stash (maybe processed) blocks in memory as long as we have room
234234- mem_size += std::mem::size_of::<Cid>() + maybe_processed.get_size();
238238+ mem_size += maybe_processed.len();
235239 mem_blocks.insert(cid, maybe_processed);
236240 if mem_size >= max_size {
237241 return Ok(Driver::Disk(NeedDisk {
···275279/// work the init function will do. We can drop the CAR reader before walking,
276280/// so the sync/async boundaries become a little easier to work around.
277281#[derive(Debug)]
278278-pub struct MemDriver<T: Processable> {
279279- blocks: HashMap<Cid, MaybeProcessedBlock<T>>,
282282+pub struct MemDriver {
283283+ blocks: HashMap<Cid, MaybeProcessedBlock>,
280284 walker: Walker,
281281- process: fn(Vec<u8>) -> T,
285285+ process: fn(Bytes) -> Bytes,
282286}
283287284284-impl<T: Processable> MemDriver<T> {
288288+impl MemDriver {
285289 /// Step through the record outputs, in rkey order
286286- pub async fn next_chunk(&mut self, n: usize) -> Result<Option<BlockChunk<T>>, DriveError> {
290290+ pub async fn next_chunk(&mut self, n: usize) -> Result<Option<BlockChunk>, DriveError> {
287291 let mut out = Vec::with_capacity(n);
288292 for _ in 0..n {
289293 // walk as far as we can until we run out of blocks or find a record
···306310}
307311308312/// A partially memory-loaded car file that needs disk spillover to continue
309309-pub struct NeedDisk<R: AsyncRead + Unpin, T: Processable> {
313313+pub struct NeedDisk<R: AsyncRead + Unpin> {
310314 car: CarReader<R>,
311315 root: Cid,
312312- process: fn(Vec<u8>) -> T,
316316+ process: fn(Bytes) -> Bytes,
313317 max_size: usize,
314314- mem_blocks: HashMap<Cid, MaybeProcessedBlock<T>>,
318318+ mem_blocks: HashMap<Cid, MaybeProcessedBlock>,
315319 pub commit: Option<Commit>,
316320}
317321318318-fn encode(v: impl Serialize) -> Result<Vec<u8>, bincode::error::EncodeError> {
319319- bincode::serde::encode_to_vec(v, bincode::config::standard())
320320-}
321321-322322-pub(crate) fn decode<T: Processable>(bytes: &[u8]) -> Result<T, DecodeError> {
323323- let (t, n) = bincode::serde::decode_from_slice(bytes, bincode::config::standard())?;
324324- if n != bytes.len() {
325325- return Err(DecodeError::ExtraGarbage);
326326- }
327327- Ok(t)
328328-}
329329-330330-impl<R: AsyncRead + Unpin, T: Processable + Send + 'static> NeedDisk<R, T> {
322322+impl<R: AsyncRead + Unpin> NeedDisk<R> {
331323 pub async fn finish_loading(
332324 mut self,
333325 mut store: DiskStore,
334334- ) -> Result<(Commit, DiskDriver<T>), DriveError> {
326326+ ) -> Result<(Commit, DiskDriver), DriveError> {
335327 // move store in and back out so we can manage lifetimes
336328 // dump mem blocks into the store
337329 store = tokio::task::spawn(async move {
338330 let kvs = self
339331 .mem_blocks
340332 .into_iter()
341341- .map(|(k, v)| Ok(encode(v).map(|v| (k.to_bytes(), v))?));
333333+ .map(|(k, v)| (k.to_bytes(), v.into_bytes()));
342334343335 store.put_many(kvs)?;
344336 Ok::<_, DriveError>(store)
345337 })
346338 .await??;
347339348348- let (tx, mut rx) = mpsc::channel::<Vec<(Cid, MaybeProcessedBlock<T>)>>(1);
340340+ let (tx, mut rx) = mpsc::channel::<Vec<(Cid, MaybeProcessedBlock)>>(1);
349341350342 let store_worker = tokio::task::spawn_blocking(move || {
351343 while let Some(chunk) = rx.blocking_recv() {
352344 let kvs = chunk
353345 .into_iter()
354354- .map(|(k, v)| Ok(encode(v).map(|v| (k.to_bytes(), v))?));
346346+ .map(|(k, v)| (k.to_bytes(), v.into_bytes()));
355347 store.put_many(kvs)?;
356348 }
357349 Ok::<_, DriveError>(store)
···372364 self.commit = Some(c);
373365 continue;
374366 }
367367+368368+ let data = Bytes::from(data);
369369+375370 // remaining possible types: node, record, other. optimistically process
376371 // TODO: get the actual in-memory size to compute disk spill
377372 let maybe_processed = MaybeProcessedBlock::maybe(self.process, data);
378378- mem_size += std::mem::size_of::<Cid>() + maybe_processed.get_size();
373373+ mem_size += maybe_processed.len();
379374 chunk.push((cid, maybe_processed));
380375 if mem_size >= self.max_size {
381376 // soooooo if we're setting the db cache to max_size and then letting
···418413}
419414420415/// MST walker that reads from disk instead of an in-memory hashmap
421421-pub struct DiskDriver<T: Clone> {
422422- process: fn(Vec<u8>) -> T,
416416+pub struct DiskDriver {
417417+ process: fn(Bytes) -> Bytes,
423418 state: Option<BigState>,
424419}
425420426421// for doctests only
427422#[doc(hidden)]
428428-pub fn _get_fake_disk_driver() -> DiskDriver<Vec<u8>> {
429429- use crate::process::noop;
423423+pub fn _get_fake_disk_driver() -> DiskDriver {
430424 DiskDriver {
431425 process: noop,
432426 state: None,
433427 }
434428}
435429436436-impl<T: Processable + Send + 'static> DiskDriver<T> {
430430+impl DiskDriver {
437431 /// Walk the MST returning up to `n` rkey + record pairs
438432 ///
439433 /// ```no_run
440440- /// # use repo_stream::{drive::{DiskDriver, DriveError, _get_fake_disk_driver}, process::noop};
434434+ /// # use repo_stream::{drive::{DiskDriver, DriveError, _get_fake_disk_driver}, noop};
441435 /// # #[tokio::main]
442436 /// # async fn main() -> Result<(), DriveError> {
443437 /// # let mut disk_driver = _get_fake_disk_driver();
···449443 /// # Ok(())
450444 /// # }
451445 /// ```
452452- pub async fn next_chunk(&mut self, n: usize) -> Result<Option<BlockChunk<T>>, DriveError> {
446446+ pub async fn next_chunk(&mut self, n: usize) -> Result<Option<BlockChunk>, DriveError> {
453447 let process = self.process;
454448455449 // state should only *ever* be None transiently while inside here
···458452 // the big pain here is that we don't want to leave self.state in an
459453 // invalid state (None), so all the error paths have to make sure it
460454 // comes out again.
461461- let (state, res) = tokio::task::spawn_blocking(
462462- move || -> (BigState, Result<BlockChunk<T>, DriveError>) {
455455+ let (state, res) =
456456+ tokio::task::spawn_blocking(move || -> (BigState, Result<BlockChunk, DriveError>) {
463457 let mut out = Vec::with_capacity(n);
464458465459 for _ in 0..n {
···480474 }
481475482476 (state, Ok::<_, DriveError>(out))
483483- },
484484- )
485485- .await?; // on tokio JoinError, we'll be left with invalid state :(
477477+ })
478478+ .await?; // on tokio JoinError, we'll be left with invalid state :(
486479487480 // *must* restore state before dealing with the actual result
488481 self.state = Some(state);
···499492 fn read_tx_blocking(
500493 &mut self,
501494 n: usize,
502502- tx: mpsc::Sender<Result<BlockChunk<T>, DriveError>>,
503503- ) -> Result<(), mpsc::error::SendError<Result<BlockChunk<T>, DriveError>>> {
495495+ tx: mpsc::Sender<Result<BlockChunk, DriveError>>,
496496+ ) -> Result<(), mpsc::error::SendError<Result<BlockChunk, DriveError>>> {
504497 let BigState { store, walker } = self.state.as_mut().expect("valid state");
505498506499 loop {
507507- let mut out: BlockChunk<T> = Vec::with_capacity(n);
500500+ let mut out: BlockChunk = Vec::with_capacity(n);
508501509502 for _ in 0..n {
510503 // walk as far as we can until we run out of blocks or find a record
···546539 /// benefit over just using `.next_chunk(n)`.
547540 ///
548541 /// ```no_run
549549- /// # use repo_stream::{drive::{DiskDriver, DriveError, _get_fake_disk_driver}, process::noop};
542542+ /// # use repo_stream::{drive::{DiskDriver, DriveError, _get_fake_disk_driver}, noop};
550543 /// # #[tokio::main]
551544 /// # async fn main() -> Result<(), DriveError> {
552545 /// # let mut disk_driver = _get_fake_disk_driver();
···565558 mut self,
566559 n: usize,
567560 ) -> (
568568- mpsc::Receiver<Result<BlockChunk<T>, DriveError>>,
561561+ mpsc::Receiver<Result<BlockChunk, DriveError>>,
569562 tokio::task::JoinHandle<Self>,
570563 ) {
571571- let (tx, rx) = mpsc::channel::<Result<BlockChunk<T>, DriveError>>(1);
564564+ let (tx, rx) = mpsc::channel::<Result<BlockChunk, DriveError>>(1);
572565573566 // sketch: this worker is going to be allowed to execute without a join handle
574567 let chan_task = tokio::task::spawn_blocking(move || {
+16-6
src/lib.rs
···27272828match DriverBuilder::new()
2929 .with_mem_limit_mb(10)
3030- .with_block_processor(|rec| rec.len()) // block processing: just extract the raw record size
3030+ .with_block_processor(
3131+ |rec| rec.len().to_ne_bytes().to_vec().into()
3232+ ) // block processing: just extract the raw record size
3133 .load_car(reader)
3234 .await?
3335{
···3537 // if all blocks fit within memory
3638 Driver::Memory(_commit, mut driver) => {
3739 while let Some(chunk) = driver.next_chunk(256).await? {
3838- for (_rkey, size) in chunk {
4040+ for (_rkey, bytes) in chunk {
4141+4242+ let (int_bytes, _) = bytes.split_at(size_of::<usize>());
4343+ let size = usize::from_ne_bytes(int_bytes.try_into().unwrap());
4444+3945 total_size += size;
4046 }
4147 }
···4955 let (_commit, mut driver) = paused.finish_loading(store).await?;
50565157 while let Some(chunk) = driver.next_chunk(256).await? {
5252- for (_rkey, size) in chunk {
5858+ for (_rkey, bytes) in chunk {
5959+6060+ let (int_bytes, _) = bytes.split_at(size_of::<usize>());
6161+ let size = usize::from_ne_bytes(int_bytes.try_into().unwrap());
6262+5363 total_size += size;
5464 }
5565 }
···76867787pub mod disk;
7888pub mod drive;
7979-pub mod process;
80898190pub use disk::{DiskBuilder, DiskError, DiskStore};
8282-pub use drive::{DriveError, Driver, DriverBuilder, NeedDisk};
9191+pub use drive::{DriveError, Driver, DriverBuilder, NeedDisk, noop};
8392pub use mst::Commit;
8484-pub use process::Processable;
9393+9494+pub(crate) use hashbrown::HashMap;
+1-1
src/mst.rs
···33//! The primary aim is to work through the **tree** structure. Non-node blocks
44//! are left as raw bytes, for upper levels to parse into DAG-CBOR or whatever.
5566-use ipld_core::cid::Cid;
66+use cid::Cid;
77use serde::Deserialize;
8899/// The top-level data object in a repository's tree is a signed commit.
-108
src/process.rs
···11-/*!
22-Record processor function output trait
33-44-The return type must satisfy the `Processable` trait, which requires:
55-66-- `Clone` because two rkeys can refer to the same record by CID, which may
77- only appear once in the CAR file.
88-- `Serialize + DeserializeOwned` so it can be spilled to disk.
99-1010-One required function must be implemented, `get_size()`: this should return the
1111-approximate total off-stack size of the type. (the on-stack size will be added
1212-automatically via `std::mem::get_size`).
1313-1414-Note that it is **not guaranteed** that the `process` function will run on a
1515-block before storing it in memory or on disk: it's not possible to know if a
1616-block is a record without actually walking the MST, so the best we can do is
1717-apply `process` to any block that we know *cannot* be an MST node, and otherwise
1818-store the raw block bytes.
1919-2020-Here's a silly processing function that just collects 'eyy's found in the raw
2121-record bytes
2222-2323-```
2424-# use repo_stream::Processable;
2525-# use serde::{Serialize, Deserialize};
2626-#[derive(Debug, Clone, Serialize, Deserialize)]
2727-struct Eyy(usize, String);
2828-2929-impl Processable for Eyy {
3030- fn get_size(&self) -> usize {
3131- // don't need to compute the usize, it's on the stack
3232- self.1.capacity() // in-mem size from the string's capacity, in bytes
3333- }
3434-}
3535-3636-fn process(raw: Vec<u8>) -> Vec<Eyy> {
3737- let mut out = Vec::new();
3838- let to_find = "eyy".as_bytes();
3939- for i in 0..(raw.len() - 3) {
4040- if &raw[i..(i+3)] == to_find {
4141- out.push(Eyy(i, "eyy".to_string()));
4242- }
4343- }
4444- out
4545-}
4646-```
4747-4848-The memory sizing stuff is a little sketch but probably at least approximately
4949-works.
5050-*/
5151-5252-use serde::{Serialize, de::DeserializeOwned};
5353-5454-/// Output trait for record processing
5555-pub trait Processable: Clone + Serialize + DeserializeOwned {
5656- /// Any additional in-memory size taken by the processed type
5757- ///
5858- /// Do not include stack size (`std::mem::size_of`)
5959- fn get_size(&self) -> usize;
6060-}
6161-6262-/// Processor that just returns the raw blocks
6363-#[inline]
6464-pub fn noop(block: Vec<u8>) -> Vec<u8> {
6565- block
6666-}
6767-6868-impl Processable for u8 {
6969- fn get_size(&self) -> usize {
7070- 0
7171- }
7272-}
7373-7474-impl Processable for usize {
7575- fn get_size(&self) -> usize {
7676- 0 // no additional space taken, just its stack size (newtype is free)
7777- }
7878-}
7979-8080-impl Processable for String {
8181- fn get_size(&self) -> usize {
8282- self.capacity()
8383- }
8484-}
8585-8686-impl<Item: Sized + Processable> Processable for Vec<Item> {
8787- fn get_size(&self) -> usize {
8888- let slot_size = std::mem::size_of::<Item>();
8989- let direct_size = slot_size * self.capacity();
9090- let items_referenced_size: usize = self.iter().map(|item| item.get_size()).sum();
9191- direct_size + items_referenced_size
9292- }
9393-}
9494-9595-impl<Item: Processable> Processable for Option<Item> {
9696- fn get_size(&self) -> usize {
9797- self.as_ref().map(|item| item.get_size()).unwrap_or(0)
9898- }
9999-}
100100-101101-impl<Item: Processable, Error: Processable> Processable for Result<Item, Error> {
102102- fn get_size(&self) -> usize {
103103- match self {
104104- Ok(item) => item.get_size(),
105105- Err(err) => err.get_size(),
106106- }
107107- }
108108-}
+18-20
src/walk.rs
···11//! Depth-first MST traversal
2233+use crate::HashMap;
34use crate::disk::DiskStore;
44-use crate::drive::{DecodeError, MaybeProcessedBlock};
55+use crate::drive::MaybeProcessedBlock;
56use crate::mst::Node;
66-use crate::process::Processable;
77-use ipld_core::cid::Cid;
77+use bytes::Bytes;
88+use cid::Cid;
89use sha2::{Digest, Sha256};
99-use std::collections::HashMap;
1010use std::convert::Infallible;
11111212/// Errors that can happen while walking
···2020 MstError(#[from] MstError),
2121 #[error("storage error: {0}")]
2222 StorageError(#[from] fjall::Error),
2323- #[error("Decode error: {0}")]
2424- DecodeError(#[from] DecodeError),
2523}
26242725/// Errors from invalid Rkeys
···45434644/// Walker outputs
4745#[derive(Debug)]
4848-pub enum Step<T> {
4646+pub enum Step {
4947 /// We needed this CID but it's not in the block store
5048 Missing(Cid),
5149 /// Reached the end of the MST! yay!
5250 Finish,
5351 /// A record was found!
5454- Found { rkey: String, data: T },
5252+ Found { rkey: String, data: Bytes },
5553}
56545755#[derive(Debug, Clone, PartialEq)]
···176174 }
177175178176 /// Advance through nodes until we find a record or can't go further
179179- pub fn step<T: Processable>(
177177+ pub fn step(
180178 &mut self,
181181- blocks: &mut HashMap<Cid, MaybeProcessedBlock<T>>,
182182- process: impl Fn(Vec<u8>) -> T,
183183- ) -> Result<Step<T>, WalkError> {
179179+ blocks: &mut HashMap<Cid, MaybeProcessedBlock>,
180180+ process: impl Fn(Bytes) -> Bytes,
181181+ ) -> Result<Step, WalkError> {
184182 loop {
185183 let Some(need) = self.stack.last_mut() else {
186184 log::trace!("tried to walk but we're actually done.");
···216214 };
217215 let rkey = rkey.clone();
218216 let data = match data {
219219- MaybeProcessedBlock::Raw(data) => process(data.to_vec()),
217217+ MaybeProcessedBlock::Raw(data) => process(data.clone()),
220218 MaybeProcessedBlock::Processed(t) => t.clone(),
221219 };
222220···237235 }
238236239237 /// blocking!!!!!!
240240- pub fn disk_step<T: Processable>(
238238+ pub fn disk_step(
241239 &mut self,
242240 reader: &mut DiskStore,
243243- process: impl Fn(Vec<u8>) -> T,
244244- ) -> Result<Step<T>, WalkError> {
241241+ process: impl Fn(Bytes) -> Bytes,
242242+ ) -> Result<Step, WalkError> {
245243 loop {
246244 let Some(need) = self.stack.last_mut() else {
247245 log::trace!("tried to walk but we're actually done.");
···252250 &mut Need::Node { depth, cid } => {
253251 let cid_bytes = cid.to_bytes();
254252 log::trace!("need node {cid:?}");
255255- let Some(block_bytes) = reader.get(&cid_bytes)? else {
253253+ let Some(block_slice) = reader.get(&cid_bytes)? else {
256254 log::trace!("node not found, resting");
257255 return Ok(Step::Missing(cid));
258256 };
259257260260- let block: MaybeProcessedBlock<T> = crate::drive::decode(&block_bytes)?;
258258+ let block = MaybeProcessedBlock::from_bytes(block_slice.into()); // TODO shouldn't fjalls slice already be bytes
261259262260 let MaybeProcessedBlock::Raw(data) = block else {
263261 return Err(WalkError::BadCommitFingerprint);
···274272 Need::Record { rkey, cid } => {
275273 log::trace!("need record {cid:?}");
276274 let cid_bytes = cid.to_bytes();
277277- let Some(data_bytes) = reader.get(&cid_bytes)? else {
275275+ let Some(data_slice) = reader.get(&cid_bytes)? else {
278276 log::trace!("record block not found, resting");
279277 return Ok(Step::Missing(*cid));
280278 };
281281- let data: MaybeProcessedBlock<T> = crate::drive::decode(&data_bytes)?;
279279+ let data = MaybeProcessedBlock::from_bytes(data_slice.into());
282280 let rkey = rkey.clone();
283281 let data = match data {
284282 MaybeProcessedBlock::Raw(data) => process(data),
+12-4
tests/non-huge-cars.rs
···1212 expected_sum: usize,
1313 expect_profile: bool,
1414) {
1515- let mut driver = match Driver::load_car(bytes, |block| block.len(), 10 /* MiB */)
1616- .await
1717- .unwrap()
1515+ let mut driver = match Driver::load_car(
1616+ bytes,
1717+ |block| block.len().to_ne_bytes().to_vec().into(),
1818+ 10, /* MiB */
1919+ )
2020+ .await
2121+ .unwrap()
1822 {
1923 Driver::Memory(_commit, mem_driver) => mem_driver,
2024 Driver::Disk(_) => panic!("too big"),
···2630 let mut prev_rkey = "".to_string();
27312832 while let Some(pairs) = driver.next_chunk(256).await.unwrap() {
2929- for (rkey, size) in pairs {
3333+ for (rkey, bytes) in pairs {
3034 records += 1;
3535+3636+ let (int_bytes, _) = bytes.split_at(size_of::<usize>());
3737+ let size = usize::from_ne_bytes(int_bytes.try_into().unwrap());
3838+3139 sum += size;
3240 if rkey == "app.bsky.actor.profile/self" {
3341 found_bsky_profile = true;