Fast and robust atproto CAR file processing in rust

oops check CIDs on read #3

merged opened by bad-example.com targeting main from hash-cost

turns out iroh-car doesn't verify block CIDs on read

85–300% overall repo-stream perf regression for in-memory CAR processing with this :(

Labels

None yet.

assignee

None yet.

Participants 1
AT URI
at://did:plc:hdhoaan3xa3jiuq4fg4mefid/sh.tangled.repo.pull/3mdo42f64d422
+56 -8
Interdiff #0 #1
Cargo.lock

This file has not been changed.

+4
Cargo.toml
··· 48 48 # [[bench]] 49 49 # name = "leading" 50 50 # harness = false 51 + 52 + [[bench]] 53 + name = "cid-check" 54 + harness = false
+7 -8
src/drive.rs
··· 8 8 }; 9 9 use cid::Cid; 10 10 use iroh_car::CarReader; 11 + use multihash_codetable::{Code, MultihashDigest}; 11 12 use std::convert::Infallible; 12 13 use tokio::{io::AsyncRead, sync::mpsc}; 13 14 ··· 123 124 block 124 125 } 125 126 127 + // iroh-car doesn't verify CIDs!!!!!! 128 + #[inline(always)] 129 + fn verify_block(given: Cid, block: &[u8]) -> bool { 130 + Cid::new_v1(0x71, Code::Sha2_256.digest(block)) == given 131 + } 132 + 126 133 /// Builder-style driver setup 127 134 #[derive(Debug, Clone)] 128 135 pub struct DriverBuilder { ··· 198 205 // try to load all the blocks into memory 199 206 let mut mem_size = 0; 200 207 while let Some((cid, data)) = car.next_block().await? { 201 - 202 208 // lkasdjflkajdsflkajsfdlkjasdf 203 209 if !verify_block(cid, &data) { 204 210 return Err(DriveError::BadCID); ··· 301 307 302 308 303 309 pub commit: Option<Commit>, 304 - } 305 - 306 - fn verify_block(given: Cid, block: &[u8]) -> bool { 307 - use multihash_codetable::{Code, MultihashDigest}; 308 - const RAW: u64 = 0x71; 309 - let calculated = cid::Cid::new_v1(RAW, Code::Sha2_256.digest(block)); 310 - calculated == given 311 310 } 312 311 313 312 impl<R: AsyncRead + Unpin> NeedDisk<R> {
+45
benches/cid-check.rs
··· 1 + use cid::Cid; 2 + use criterion::{Criterion, criterion_group, criterion_main}; 3 + use multihash_codetable::{Code, MultihashDigest}; 4 + use sha2::{Digest, Sha256}; 5 + 6 + fn multihash_verify(given: Cid, block: &[u8]) -> bool { 7 + let calculated = Cid::new_v1(0x71, Code::Sha2_256.digest(block)); 8 + calculated == given 9 + } 10 + 11 + fn effortful_verify(given: Cid, block: &[u8]) -> bool { 12 + // we know we're in atproto, so we can make a few assumptions 13 + if given.version() != cid::Version::V1 { 14 + return false; 15 + } 16 + let (codec, given_digest, _) = given.hash().into_inner(); 17 + if codec != 0x12 { 18 + return false; 19 + } 20 + given_digest[..32] == *Sha256::digest(block) 21 + } 22 + 23 + fn fastloose_verify(given: Cid, block: &[u8]) -> bool { 24 + let (_, given_digest, _) = given.hash().into_inner(); 25 + given_digest[..32] == *Sha256::digest(block) 26 + } 27 + 28 + pub fn criterion_benchmark(c: &mut Criterion) { 29 + let some_bytes: Vec<u8> = vec![0x1a, 0x00, 0xAA, 0x39, 0x8C].repeat(100); 30 + let cid = Cid::new_v1(0x71, Code::Sha2_256.digest(&some_bytes)); 31 + 32 + let mut g = c.benchmark_group("CID check"); 33 + g.bench_function("multihash", |b| { 34 + b.iter(|| multihash_verify(cid, &some_bytes)) 35 + }); 36 + g.bench_function("effortful", |b| { 37 + b.iter(|| effortful_verify(cid, &some_bytes)) 38 + }); 39 + g.bench_function("fastloose", |b| { 40 + b.iter(|| fastloose_verify(cid, &some_bytes)) 41 + }); 42 + } 43 + 44 + criterion_group!(benches, criterion_benchmark); 45 + criterion_main!(benches);

History

2 rounds 0 comments
sign up or login to add to the discussion
4 commits
expand
check CIDs from CAR blocks
little faster?
bench it
fmt
expand 0 comments
pull request successfully merged
1 commit
expand
check CIDs from CAR blocks
expand 0 comments