···2020use crate::IntoStatic;
2121use std::borrow::Cow;
/// Multicodec code for SHA2-256 hash
///
/// Value 0x12 ("sha2-256") from the multiformats multicodec table; used as the
/// multihash code when wrapping a SHA-256 digest for a CID.
pub const SHA2_256: u64 = 0x12;

/// Multicodec code for DAG-CBOR codec
///
/// Value 0x71 ("dag-cbor") from the multiformats multicodec table; used as the
/// codec field when constructing CIDv1s for DAG-CBOR-encoded blocks.
pub const DAG_CBOR: u64 = 0x71;
2828+2329/// Known multicodec key codecs for Multikey public keys
2430///
2531#[derive(Debug, Clone, Copy, PartialEq, Eq)]
···186192 }
187193}
188194189189-fn decode_uvarint(data: &[u8]) -> Option<(u64, usize)> {
195195+pub fn decode_uvarint(data: &[u8]) -> Option<(u64, usize)> {
190196 let mut x: u64 = 0;
191197 let mut s: u32 = 0;
192198 for (i, b) in data.iter().copied().enumerate() {
···202208 None
203209}
/// Encode `x` as an unsigned LEB128 (multiformats uvarint) byte sequence.
///
/// Each output byte carries 7 payload bits, least-significant group first;
/// the continuation (high) bit is set on every byte except the last.
pub fn encode_uvarint(mut x: u64) -> Vec<u8> {
    // A u64 needs at most ceil(64 / 7) = 10 bytes.
    let mut bytes = Vec::with_capacity(10);
    loop {
        let low7 = (x & 0x7F) as u8;
        x >>= 7;
        if x == 0 {
            // Final group: continuation bit clear.
            bytes.push(low7);
            return bytes;
        }
        bytes.push(low7 | 0x80);
    }
}
220220+221221+pub fn multikey(code: u64, key: &[u8]) -> String {
222222+ let mut buf = encode_uvarint(code);
223223+ buf.extend_from_slice(key);
224224+ multibase::encode(multibase::Base::Base58Btc, buf)
225225+}
226226+205227#[cfg(test)]
206228mod tests {
207229 use super::*;
208230 use multibase;
209209-210210- fn encode_uvarint(mut x: u64) -> Vec<u8> {
211211- let mut out = Vec::new();
212212- while x >= 0x80 {
213213- out.push(((x as u8) & 0x7F) | 0x80);
214214- x >>= 7;
215215- }
216216- out.push(x as u8);
217217- out
218218- }
219219-220220- fn multikey(code: u64, key: &[u8]) -> String {
221221- let mut buf = encode_uvarint(code);
222222- buf.extend_from_slice(key);
223223- multibase::encode(multibase::Base::Base58Btc, buf)
224224- }
225231226232 #[test]
227233 fn decode_ed25519() {
+4-6
crates/jacquard-common/src/types/tid.rs
···120120121121 /// Construct a TID from a timestamp (in microseconds) and clock ID
122122 pub fn from_time(timestamp: u64, clkid: u32) -> Self {
123123- let str = smol_str::format_smolstr!(
124124- "{0}{1:2>2}",
125125- s32_encode(timestamp as u64),
126126- s32_encode(Into::<u32>::into(clkid) as u64)
127127- );
128128- Self(str)
123123+ // Combine timestamp and clock ID into single u64: 53 bits timestamp + 10 bits clock ID
124124+ // 0TTTTTTTTTTTTTTT TTTTTTTTTTTTTTTT TTTTTTTTTTTTTTTT TTTTTTCCCCCCCCCC
125125+ let tid = (timestamp << 10) & 0x7FFF_FFFF_FFFF_FC00 | (clkid as u64 & 0x3FF);
126126+ Self(s32_encode(tid))
129127 }
130128131129 /// Extract the timestamp component (microseconds since UNIX epoch)
···11+//! CAR (Content Addressable aRchive) file I/O
22+//!
33+//! Provides utilities for reading and writing CAR files, which are the standard
44+//! format for AT Protocol repository export/import.
55+//!
66+//! # Examples
77+//!
88+//! Reading a CAR file:
99+//! ```ignore
1010+//! use jacquard_repo::car::reader::read_car;
1111+//!
1212+//! let blocks = read_car("repo.car").await?;
1313+//! ```
1414+//!
1515+//! Writing a CAR file:
1616+//! ```ignore
1717+//! use jacquard_repo::car::writer::write_car;
1818+//!
1919+//! let roots = vec![commit_cid];
2020+//! write_car("repo.car", roots, blocks).await?;
2121+//! ```
2222+2323+pub mod reader;
2424+pub mod writer;
2525+2626+// Re-export commonly used functions and types
2727+pub use reader::{parse_car_bytes, read_car, read_car_header, stream_car, ParsedCar};
2828+pub use writer::{export_repo_car, write_car, write_car_bytes};
+274
crates/jacquard-repo/src/car/reader.rs
···11+//! CAR file reading utilities
22+//!
33+//! Provides functions for reading CAR (Content Addressable aRchive) files into memory
44+//! or streaming them for large repositories.
55+66+use crate::error::Result;
77+use bytes::Bytes;
88+use cid::Cid as IpldCid;
99+use iroh_car::CarReader;
1010+use n0_future::stream::StreamExt;
1111+use std::collections::BTreeMap;
1212+use std::path::Path;
1313+use tokio::fs::File;
/// Parsed CAR file data
///
/// Produced by `parse_car_bytes`; bundles the primary (first) root CID with
/// every block found in the archive, keyed and sorted by CID.
#[derive(Debug, Clone)]
pub struct ParsedCar {
    /// The first root CID from the CAR header
    pub root: IpldCid,
    /// All blocks in the CAR file
    pub blocks: BTreeMap<IpldCid, Bytes>,
}
2323+2424+/// Read entire CAR file into memory
2525+///
2626+/// Returns BTreeMap of CID -> block data (sorted order for determinism).
2727+/// For large CAR files, consider using `stream_car()` instead.
2828+pub async fn read_car(path: impl AsRef<Path>) -> Result<BTreeMap<IpldCid, Bytes>> {
2929+ let file = File::open(path)
3030+ .await
3131+ .map_err(|e| crate::error::RepoError::io(e))?;
3232+3333+ let reader = CarReader::new(file)
3434+ .await
3535+ .map_err(|e| crate::error::RepoError::car(e))?;
3636+3737+ let mut blocks = BTreeMap::new();
3838+ let stream = reader.stream();
3939+ n0_future::pin!(stream);
4040+4141+ while let Some(result) = stream.next().await {
4242+ let (cid, data) = result.map_err(|e| crate::error::RepoError::car_parse(e))?;
4343+ blocks.insert(cid, Bytes::from(data));
4444+ }
4545+4646+ Ok(blocks)
4747+}
4848+4949+/// Read CAR file header (roots only)
5050+///
5151+/// Useful for checking roots without loading all blocks.
5252+pub async fn read_car_header(path: impl AsRef<Path>) -> Result<Vec<IpldCid>> {
5353+ let file = File::open(path)
5454+ .await
5555+ .map_err(|e| crate::error::RepoError::io(e))?;
5656+5757+ let reader = CarReader::new(file)
5858+ .await
5959+ .map_err(|e| crate::error::RepoError::car(e))?;
6060+6161+ Ok(reader.header().roots().to_vec())
6262+}
6363+6464+/// Parse CAR bytes into root and block map
6565+///
6666+/// For in-memory CAR data (e.g., from firehose commit messages, merkle proofs).
6767+/// Returns the first root CID and all blocks.
6868+pub async fn parse_car_bytes(data: &[u8]) -> Result<ParsedCar> {
6969+ let reader = CarReader::new(data)
7070+ .await
7171+ .map_err(|e| crate::error::RepoError::car_parse(e))?;
7272+7373+ let roots = reader.header().roots();
7474+ let root = roots
7575+ .first()
7676+ .copied()
7777+ .ok_or_else(|| crate::error::RepoError::invalid("CAR file has no roots"))?;
7878+7979+ let mut blocks = BTreeMap::new();
8080+ let stream = reader.stream();
8181+ n0_future::pin!(stream);
8282+8383+ while let Some(result) = stream.next().await {
8484+ let (cid, data) = result.map_err(|e| crate::error::RepoError::car_parse(e))?;
8585+ blocks.insert(cid, Bytes::from(data));
8686+ }
8787+8888+ Ok(ParsedCar { root, blocks })
8989+}
9090+9191+/// Stream CAR blocks without loading entire file into memory
9292+///
9393+/// Useful for processing large CAR files incrementally.
9494+pub async fn stream_car(path: impl AsRef<Path>) -> Result<CarBlockStream> {
9595+ let file = File::open(path)
9696+ .await
9797+ .map_err(|e| crate::error::RepoError::io(e))?;
9898+9999+ let reader = CarReader::new(file)
100100+ .await
101101+ .map_err(|e| crate::error::RepoError::car(e))?;
102102+103103+ let roots = reader.header().roots().to_vec();
104104+ let stream = Box::pin(reader.stream());
105105+106106+ Ok(CarBlockStream { stream, roots })
107107+}
/// Streaming CAR block reader
///
/// Iterates through CAR blocks without loading entire file into memory.
pub struct CarBlockStream {
    // Pinned, boxed block stream obtained from `CarReader::stream()`;
    // boxing erases the concrete stream type so this struct is nameable.
    stream: std::pin::Pin<
        Box<
            dyn n0_future::stream::Stream<
                Item = std::result::Result<(IpldCid, Vec<u8>), iroh_car::Error>,
            > + Send,
        >,
    >,
    // Roots copied out of the CAR header before streaming began.
    roots: Vec<IpldCid>,
}
122122+123123+impl CarBlockStream {
124124+ /// Get next block from the stream
125125+ ///
126126+ /// Returns `None` when stream is exhausted.
127127+ pub async fn next(&mut self) -> Result<Option<(IpldCid, Bytes)>> {
128128+ match self.stream.next().await {
129129+ Some(result) => {
130130+ let (cid, data) = result.map_err(|e| crate::error::RepoError::car_parse(e))?;
131131+ Ok(Some((cid, Bytes::from(data))))
132132+ }
133133+ None => Ok(None),
134134+ }
135135+ }
136136+137137+ /// Get the CAR file roots
138138+ pub fn roots(&self) -> &[IpldCid] {
139139+ &self.roots
140140+ }
141141+}
#[cfg(test)]
mod tests {
    use crate::DAG_CBOR_CID_CODEC;

    use super::*;
    use iroh_car::CarWriter;
    use jacquard_common::types::crypto::SHA2_256;
    use tempfile::NamedTempFile;
    use tokio::io::AsyncWriteExt;

    /// Build an in-memory CAR file with the given roots and blocks.
    async fn make_test_car(roots: Vec<IpldCid>, blocks: Vec<(IpldCid, Vec<u8>)>) -> Vec<u8> {
        let mut buf = Vec::new();
        let header = iroh_car::CarHeader::new_v1(roots);
        let mut writer = CarWriter::new(header, &mut buf);

        for (cid, data) in blocks {
            writer.write(cid, data).await.unwrap();
        }

        writer.finish().await.unwrap();
        // Flush is a no-op for a Vec-backed writer but kept for symmetry.
        buf.flush().await.unwrap();
        buf
    }

    /// Deterministic CIDv1 (dag-cbor codec, sha2-256 hash) derived from one byte.
    fn make_test_cid(value: u8) -> IpldCid {
        use sha2::{Digest, Sha256};
        let hash = Sha256::digest(&[value]);
        let mh = multihash::Multihash::wrap(SHA2_256, &hash).unwrap();
        IpldCid::new_v1(DAG_CBOR_CID_CODEC, mh) // dag-cbor codec
    }

    // parse_car_bytes: root comes from the header; all blocks are captured.
    #[tokio::test]
    async fn test_parse_car_with_blocks() {
        let cid1 = make_test_cid(1);
        let cid2 = make_test_cid(2);
        let data1 = vec![1, 2, 3];
        let data2 = vec![4, 5, 6];

        let car_bytes = make_test_car(
            vec![cid1],
            vec![(cid1, data1.clone()), (cid2, data2.clone())],
        )
        .await;

        let parsed = parse_car_bytes(&car_bytes).await.unwrap();
        assert_eq!(parsed.root, cid1);
        assert_eq!(parsed.blocks.len(), 2);
        assert_eq!(parsed.blocks.get(&cid1).unwrap().as_ref(), &data1);
        assert_eq!(parsed.blocks.get(&cid2).unwrap().as_ref(), &data2);
    }

    // read_car: round-trips a single block through a temp file on disk.
    #[tokio::test]
    async fn test_read_car_from_file() {
        let cid1 = make_test_cid(1);
        let data1 = vec![1, 2, 3];

        let car_bytes = make_test_car(vec![cid1], vec![(cid1, data1.clone())]).await;

        // Write to temp file
        let temp_file = NamedTempFile::new().unwrap();
        tokio::io::AsyncWriteExt::write_all(
            &mut tokio::fs::File::from_std(temp_file.reopen().unwrap()),
            &car_bytes,
        )
        .await
        .unwrap();

        // Read back
        let blocks = read_car(temp_file.path()).await.unwrap();
        assert_eq!(blocks.len(), 1);
        assert_eq!(blocks.get(&cid1).unwrap().as_ref(), &data1);
    }

    // read_car_header: roots are returned in header order, blocks untouched.
    #[tokio::test]
    async fn test_read_car_header() {
        let cid1 = make_test_cid(1);
        let cid2 = make_test_cid(2);
        let data1 = vec![1, 2, 3];

        let car_bytes = make_test_car(vec![cid1, cid2], vec![(cid1, data1)]).await;

        let temp_file = NamedTempFile::new().unwrap();
        tokio::io::AsyncWriteExt::write_all(
            &mut tokio::fs::File::from_std(temp_file.reopen().unwrap()),
            &car_bytes,
        )
        .await
        .unwrap();

        let roots = read_car_header(temp_file.path()).await.unwrap();
        assert_eq!(roots.len(), 2);
        assert_eq!(roots[0], cid1);
        assert_eq!(roots[1], cid2);
    }

    // stream_car: blocks arrive in file order, then the stream yields None.
    #[tokio::test]
    async fn test_stream_car() {
        let cid1 = make_test_cid(1);
        let cid2 = make_test_cid(2);
        let data1 = vec![1, 2, 3];
        let data2 = vec![4, 5, 6];

        let car_bytes = make_test_car(
            vec![cid1],
            vec![(cid1, data1.clone()), (cid2, data2.clone())],
        )
        .await;

        let temp_file = NamedTempFile::new().unwrap();
        tokio::io::AsyncWriteExt::write_all(
            &mut tokio::fs::File::from_std(temp_file.reopen().unwrap()),
            &car_bytes,
        )
        .await
        .unwrap();

        let mut stream = stream_car(temp_file.path()).await.unwrap();

        // Read first block
        let (cid, data) = stream.next().await.unwrap().unwrap();
        assert_eq!(cid, cid1);
        assert_eq!(data.as_ref(), &data1);

        // Read second block
        let (cid, data) = stream.next().await.unwrap().unwrap();
        assert_eq!(cid, cid2);
        assert_eq!(data.as_ref(), &data2);

        // Stream exhausted
        assert!(stream.next().await.unwrap().is_none());
    }
}
+218
crates/jacquard-repo/src/car/writer.rs
···11+//! CAR file writing utilities
22+//!
33+//! Provides functions for writing blocks to CAR (Content Addressable aRchive) files.
44+55+use crate::error::Result;
66+use crate::mst::tree::Mst;
77+use crate::storage::BlockStore;
88+use bytes::Bytes;
99+use cid::Cid as IpldCid;
1010+use iroh_car::CarWriter;
1111+use std::collections::BTreeMap;
1212+use std::path::Path;
1313+use tokio::fs::File;
1414+use tokio::io::AsyncWriteExt;
1515+1616+/// Write blocks to CAR file
1717+///
1818+/// Roots should contain commit CID(s).
1919+/// Blocks are written in sorted CID order (BTreeMap) for determinism.
2020+pub async fn write_car(
2121+ path: impl AsRef<Path>,
2222+ roots: Vec<IpldCid>,
2323+ blocks: BTreeMap<IpldCid, Bytes>,
2424+) -> Result<()> {
2525+ let file = File::create(path)
2626+ .await
2727+ .map_err(|e| crate::error::RepoError::io(e))?;
2828+2929+ let header = iroh_car::CarHeader::new_v1(roots);
3030+ let mut writer = CarWriter::new(header, file);
3131+3232+ for (cid, data) in blocks {
3333+ writer
3434+ .write(cid, data.as_ref())
3535+ .await
3636+ .map_err(|e| crate::error::RepoError::car(e))?;
3737+ }
3838+3939+ writer
4040+ .finish()
4141+ .await
4242+ .map_err(|e| crate::error::RepoError::car(e))?;
4343+4444+ Ok(())
4545+}
4646+4747+/// Write blocks to CAR bytes (in-memory)
4848+///
4949+/// Like `write_car()` but writes to a Vec<u8> instead of a file.
5050+/// Useful for tests and proof generation.
5151+pub async fn write_car_bytes(
5252+ root: IpldCid,
5353+ blocks: BTreeMap<IpldCid, Bytes>,
5454+) -> Result<Vec<u8>> {
5555+ let mut buffer = Vec::new();
5656+ let header = iroh_car::CarHeader::new_v1(vec![root]);
5757+ let mut writer = CarWriter::new(header, &mut buffer);
5858+5959+ for (cid, data) in blocks {
6060+ writer
6161+ .write(cid, data.as_ref())
6262+ .await
6363+ .map_err(|e| crate::error::RepoError::car(e))?;
6464+ }
6565+6666+ writer
6767+ .finish()
6868+ .await
6969+ .map_err(|e| crate::error::RepoError::car(e))?;
7070+7171+ buffer.flush().await.map_err(|e| crate::error::RepoError::io(e))?;
7272+7373+ Ok(buffer)
7474+}
7575+7676+/// Write MST + commit to CAR file
7777+///
7878+/// Streams blocks directly to CAR file:
7979+/// - Commit block (from storage)
8080+/// - All MST node blocks (from storage)
8181+/// - All record blocks (from storage)
8282+///
8383+/// Uses streaming to avoid loading all blocks into memory.
8484+pub async fn export_repo_car<S: BlockStore + Sync + 'static>(
8585+ path: impl AsRef<Path>,
8686+ commit_cid: IpldCid,
8787+ mst: &Mst<S>,
8888+) -> Result<()> {
8989+ let file = File::create(path)
9090+ .await
9191+ .map_err(|e| crate::error::RepoError::io(e))?;
9292+9393+ let header = iroh_car::CarHeader::new_v1(vec![commit_cid]);
9494+ let mut writer = CarWriter::new(header, file);
9595+9696+ // Write commit block first
9797+ let storage = mst.storage();
9898+ let commit_data = storage
9999+ .get(&commit_cid)
100100+ .await?
101101+ .ok_or_else(|| crate::error::RepoError::not_found("commit", &commit_cid))?;
102102+103103+ writer
104104+ .write(commit_cid, &commit_data)
105105+ .await
106106+ .map_err(|e| crate::error::RepoError::car(e))?;
107107+108108+ // Stream MST and record blocks
109109+ mst.write_blocks_to_car(&mut writer).await?;
110110+111111+ // Finish writing
112112+ writer
113113+ .finish()
114114+ .await
115115+ .map_err(|e| crate::error::RepoError::car(e))?;
116116+117117+ Ok(())
118118+}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::DAG_CBOR_CID_CODEC;
    use crate::car::reader::read_car;
    use crate::mst::tree::Mst;
    use crate::storage::memory::MemoryBlockStore;
    use jacquard_common::types::crypto::SHA2_256;
    use std::sync::Arc;
    use tempfile::NamedTempFile;

    /// Deterministic CIDv1 (dag-cbor codec, sha2-256 hash) derived from one byte.
    fn make_test_cid(value: u8) -> IpldCid {
        use sha2::{Digest, Sha256};
        let hash = Sha256::digest(&[value]);
        let mh = multihash::Multihash::wrap(SHA2_256, &hash).unwrap();

        IpldCid::new_v1(DAG_CBOR_CID_CODEC, mh)
    }

    // write_car then read_car: blocks round-trip byte-for-byte.
    #[tokio::test]
    async fn test_write_car_with_blocks() {
        let temp_file = NamedTempFile::new().unwrap();

        let cid1 = make_test_cid(1);
        let cid2 = make_test_cid(2);
        let data1 = Bytes::from_static(&[1, 2, 3]);
        let data2 = Bytes::from_static(&[4, 5, 6]);

        let mut blocks = BTreeMap::new();
        blocks.insert(cid1, data1.clone());
        blocks.insert(cid2, data2.clone());

        write_car(temp_file.path(), vec![cid1], blocks)
            .await
            .unwrap();

        // Read back and verify
        let read_blocks = read_car(temp_file.path()).await.unwrap();
        assert_eq!(read_blocks.len(), 2);
        assert_eq!(read_blocks.get(&cid1).unwrap(), &data1);
        assert_eq!(read_blocks.get(&cid2).unwrap(), &data2);
    }

    // export_repo_car: exported CAR contains commit, MST root, and records.
    #[tokio::test]
    async fn test_export_mst_to_car() {
        let storage = Arc::new(MemoryBlockStore::new());
        let mst = Mst::new(storage.clone());

        // Add some entries
        let cid1 = make_test_cid(1);
        let cid2 = make_test_cid(2);

        let mst = mst.add("app.bsky.feed.post/abc123", cid1).await.unwrap();
        let mst = mst.add("app.bsky.feed.post/def456", cid2).await.unwrap();

        // Persist MST blocks to storage
        mst.persist().await.unwrap();

        // Persist record blocks to storage
        storage
            .put_with_cid(cid1, Bytes::from_static(&[1, 1, 1]))
            .await
            .unwrap();
        storage
            .put_with_cid(cid2, Bytes::from_static(&[2, 2, 2]))
            .await
            .unwrap();

        // Create and persist commit block
        let commit_cid = make_test_cid(99);
        let commit_data = Bytes::from_static(&[99, 99, 99]);
        storage
            .put_with_cid(commit_cid, commit_data.clone())
            .await
            .unwrap();

        let temp_file = NamedTempFile::new().unwrap();

        // Export to CAR
        export_repo_car(temp_file.path(), commit_cid, &mst)
            .await
            .unwrap();

        // Read back and verify
        let blocks = read_car(temp_file.path()).await.unwrap();

        // Should have commit + MST nodes + record blocks
        assert!(blocks.contains_key(&commit_cid));
        assert_eq!(blocks.get(&commit_cid).unwrap(), &commit_data);

        // Should have at least the root node
        let root_cid = mst.root().await.unwrap();
        assert!(blocks.contains_key(&root_cid));

        // Should have record blocks
        assert!(blocks.contains_key(&cid1));
        assert!(blocks.contains_key(&cid2));
    }
}
+336
crates/jacquard-repo/src/commit/firehose.rs
···11+//! Firehose commit message structures
22+//!
33+//! These structures are vendored from `jacquard-api::com_atproto::sync::subscribe_repos`
44+//! to avoid a dependency on the full API crate. They represent firehose protocol messages,
55+//! which are DISTINCT from repository commit objects.
66+77+use bytes::Bytes;
88+use jacquard_common::IntoStatic;
99+use jacquard_common::types::string::{Did, Tid};
/// Firehose commit message (sync v1.0 and v1.1)
///
/// Represents an update of repository state in the firehose stream.
/// This is the message format sent over `com.atproto.sync.subscribeRepos`.
///
/// **Sync v1.0 vs v1.1:**
/// - v1.0: `prev_data` is None/skipped, consumers must have sufficient previous repository state to validate
/// - v1.1: `prev_data` includes previous MST root for inductive validation
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct FirehoseCommit<'a> {
    /// The repo this event comes from
    #[serde(borrow)]
    pub repo: Did<'a>,

    /// The rev of the emitted commit
    pub rev: Tid,

    /// The stream sequence number of this message
    pub seq: i64,

    /// The rev of the last emitted commit from this repo (if any)
    // NOTE(review): the doc says "if any" but the type is a plain Tid, not
    // Option<Tid> — presumably the lexicon marks 'since' nullable; confirm
    // this non-optional typing is intentional.
    pub since: Tid,

    /// Timestamp of when this message was originally broadcast
    pub time: jacquard_common::types::string::Datetime,

    /// Repo commit object CID
    ///
    /// This CID points to the repository commit block (with did, version, data, rev, prev, sig).
    /// It must be the first entry in the CAR header 'roots' list.
    #[serde(borrow)]
    pub commit: jacquard_common::types::cid::CidLink<'a>,

    /// CAR file containing relevant blocks
    ///
    /// Contains blocks as a diff since the previous repo state. The commit block
    /// must be included, and its CID must be the first root in the CAR header.
    ///
    /// For sync v1.1, may include additional MST node blocks needed for operation inversion.
    #[serde(with = "super::serde_bytes_helper")]
    pub blocks: Bytes,

    /// Operations in this commit
    #[serde(borrow)]
    pub ops: Vec<RepoOp<'a>>,

    /// Previous MST root CID (sync v1.1 only)
    ///
    /// The root CID of the MST tree for the previous commit (indicated by the 'since' field).
    /// Corresponds to the 'data' field in the previous repo commit object.
    ///
    /// **Sync v1.1 inductive validation:**
    /// - Enables validation without local MST state
    /// - Operations can be inverted (creates→deletes, deletes→creates with prev values)
    /// - Required for "inductive firehose" consumption
    ///
    /// **Sync v1.0:**
    /// - This field is None
    /// - Consumers must have previous repository state
    #[serde(skip_serializing_if = "Option::is_none")]
    #[serde(borrow)]
    pub prev_data: Option<jacquard_common::types::cid::CidLink<'a>>,

    /// Blob CIDs referenced in this commit
    #[serde(borrow)]
    pub blobs: Vec<jacquard_common::types::cid::CidLink<'a>>,

    /// DEPRECATED: Replaced by #sync event and data limits
    ///
    /// Indicates that this commit contained too many ops, or data size was too large.
    /// Consumers will need to make a separate request to get missing data.
    pub too_big: bool,

    /// DEPRECATED: Unused
    pub rebase: bool,
}
/// A repository operation (mutation of a single record)
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct RepoOp<'a> {
    /// Operation type: "create", "update", or "delete"
    // NOTE(review): stringly-typed by the wire format; values other than the
    // three above are not rejected at deserialization time.
    #[serde(borrow)]
    pub action: jacquard_common::CowStr<'a>,

    /// Collection/rkey path (e.g., "app.bsky.feed.post/abc123")
    #[serde(borrow)]
    pub path: jacquard_common::CowStr<'a>,

    /// For creates and updates, the new record CID. For deletions, None (null).
    #[serde(skip_serializing_if = "Option::is_none")]
    #[serde(borrow)]
    pub cid: Option<jacquard_common::types::cid::CidLink<'a>>,

    /// For updates and deletes, the previous record CID
    ///
    /// Required for sync v1.1 inductive firehose validation.
    /// For creates, this field should not be defined.
    #[serde(skip_serializing_if = "Option::is_none")]
    #[serde(borrow)]
    pub prev: Option<jacquard_common::types::cid::CidLink<'a>>,
}
impl IntoStatic for FirehoseCommit<'_> {
    type Output = FirehoseCommit<'static>;

    /// Convert every borrowed field into its owned (`'static`) counterpart.
    fn into_static(self) -> Self::Output {
        FirehoseCommit {
            repo: self.repo.into_static(),
            rev: self.rev,
            seq: self.seq,
            since: self.since,
            time: self.time,
            commit: self.commit.into_static(),
            // Bytes is already owned/refcounted; moved as-is.
            blocks: self.blocks,
            ops: self.ops.into_iter().map(|op| op.into_static()).collect(),
            prev_data: self.prev_data.map(|pd| pd.into_static()),
            blobs: self.blobs.into_iter().map(|b| b.into_static()).collect(),
            too_big: self.too_big,
            rebase: self.rebase,
        }
    }
}
135135+136136+impl IntoStatic for RepoOp<'_> {
137137+ type Output = RepoOp<'static>;
138138+139139+ fn into_static(self) -> Self::Output {
140140+ RepoOp {
141141+ action: self.action.into_static(),
142142+ path: self.path.into_static(),
143143+ cid: self.cid.into_static(),
144144+ prev: self.prev.map(|p| p.into_static()),
145145+ }
146146+ }
147147+}
148148+149149+/// Validation functions for firehose commit messages
150150+///
151151+/// These functions validate commits from the `com.atproto.sync.subscribeRepos` firehose.
152152+use crate::error::Result;
153153+use crate::mst::Mst;
154154+use crate::storage::BlockStore;
155155+use cid::Cid as IpldCid;
156156+use std::sync::Arc;
impl<'a> FirehoseCommit<'a> {
    /// Validate a sync v1.0 commit
    ///
    /// **Requirements:**
    /// - Must have previous MST state (potentially full repository)
    /// - All blocks needed for validation must be in `self.blocks`
    ///
    /// **Validation steps:**
    /// 1. Parse CAR blocks from `self.blocks` into temporary storage
    /// 2. Load commit object and verify signature
    /// 3. Apply operations to previous MST (using temporary storage for new blocks)
    /// 4. Verify result matches commit.data (new MST root)
    ///
    /// Returns the new MST root CID on success.
    ///
    /// NOTE(review): `self.ops` is never cross-checked against the computed
    /// diff here — validation relies entirely on tree-state comparison;
    /// confirm this is intentional.
    pub async fn validate_v1_0<S: BlockStore + Sync + 'static>(
        &self,
        prev_mst_root: Option<IpldCid>,
        prev_storage: Arc<S>,
        pubkey: &jacquard_common::types::crypto::PublicKey<'_>,
    ) -> Result<IpldCid> {
        // 1. Parse CAR blocks from the firehose message into temporary storage
        let parsed = crate::car::parse_car_bytes(&self.blocks).await?;
        let temp_storage = crate::storage::MemoryBlockStore::new_from_blocks(parsed.blocks);

        // 2. Create layered storage: reads from temp first, then prev; writes to temp only
        // This avoids copying all previous MST blocks
        // NOTE(review): temp_storage.clone() is presumably a cheap handle
        // clone (Arc-backed) rather than a deep copy — confirm.
        let layered_storage =
            crate::storage::LayeredBlockStore::new(temp_storage.clone(), prev_storage);

        // 3. Extract and verify commit object from temporary storage
        let commit_cid: IpldCid = self
            .commit
            .to_ipld()
            .map_err(|e| crate::error::RepoError::invalid(format!("Invalid commit CID: {}", e)))?;
        let commit_bytes = temp_storage
            .get(&commit_cid)
            .await?
            .ok_or_else(|| crate::error::RepoError::not_found("commit block", &commit_cid))?;

        let commit = super::Commit::from_cbor(&commit_bytes)?;

        // Verify DID matches
        if commit.did().as_ref() != self.repo.as_ref() {
            return Err(crate::error::RepoError::invalid_commit(format!(
                "DID mismatch: commit has {}, message has {}",
                commit.did(),
                self.repo
            )));
        }

        // Verify signature
        commit.verify(pubkey)?;

        let layered_arc = Arc::new(layered_storage);

        // 4. Load previous MST state from layered storage (or start empty)
        let prev_mst = if let Some(prev_root) = prev_mst_root {
            Mst::load(layered_arc.clone(), prev_root, None)
        } else {
            Mst::new(layered_arc.clone())
        };

        // 5. Load new MST from commit.data (claimed result)
        let expected_root = *commit.data();
        let new_mst = Mst::load(layered_arc, expected_root, None);

        // 6. Compute diff to get verified write ops (with actual prev values from tree state)
        let diff = prev_mst.diff(&new_mst).await?;
        let verified_ops = diff.to_verified_ops();

        // 7. Apply verified ops to prev MST
        let computed_mst = prev_mst.batch(&verified_ops).await?;

        // 8. Verify computed result matches claimed result
        let computed_root = computed_mst.get_pointer().await?;

        if computed_root != expected_root {
            return Err(crate::error::RepoError::invalid_commit(format!(
                "MST root mismatch: expected {}, got {}",
                expected_root, computed_root
            )));
        }

        Ok(expected_root)
    }

    /// Validate a sync v1.1 commit (inductive validation)
    ///
    /// **Requirements:**
    /// - `self.prev_data` must be Some (contains previous MST root)
    /// - All blocks needed for validation must be in `self.blocks`
    ///
    /// **Validation steps:**
    /// 1. Parse CAR blocks from `self.blocks` into temporary storage
    /// 2. Load commit object and verify signature
    /// 3. Start from `prev_data` MST root (loaded from temp storage)
    /// 4. Apply operations (with prev CID validation for updates/deletes)
    /// 5. Verify result matches commit.data (new MST root)
    ///
    /// Returns the new MST root CID on success.
    ///
    /// **Inductive property:** Can validate without any external state besides the blocks
    /// in this message. The `prev_data` field provides the starting MST root, and operations
    /// include `prev` CIDs for validation. All necessary blocks must be in the CAR bytes.
    pub async fn validate_v1_1(
        &self,
        pubkey: &jacquard_common::types::crypto::PublicKey<'_>,
    ) -> Result<IpldCid> {
        // 1. Require prev_data for v1.1
        let prev_data_cid: IpldCid = self
            .prev_data
            .as_ref()
            .ok_or_else(|| {
                crate::error::RepoError::invalid_commit(
                    "Sync v1.1 validation requires prev_data field",
                )
            })?
            .to_ipld()
            .map_err(|e| {
                crate::error::RepoError::invalid(format!("Invalid prev_data CID: {}", e))
            })?;

        // 2. Parse CAR blocks from the firehose message into temporary storage
        let parsed = crate::car::parse_car_bytes(&self.blocks).await?;
        let temp_storage = Arc::new(crate::storage::MemoryBlockStore::new_from_blocks(
            parsed.blocks,
        ));

        // 3. Extract and verify commit object from temporary storage
        let commit_cid: IpldCid = self
            .commit
            .to_ipld()
            .map_err(|e| crate::error::RepoError::invalid(format!("Invalid commit CID: {}", e)))?;
        let commit_bytes = temp_storage
            .get(&commit_cid)
            .await?
            .ok_or_else(|| crate::error::RepoError::not_found("commit block", &commit_cid))?;

        let commit = super::Commit::from_cbor(&commit_bytes)?;

        // Verify DID matches
        if commit.did().as_ref() != self.repo.as_ref() {
            return Err(crate::error::RepoError::invalid_commit(format!(
                "DID mismatch: commit has {}, message has {}",
                commit.did(),
                self.repo
            )));
        }

        // Verify signature
        commit.verify(pubkey)?;

        // 4. Load previous MST from prev_data (all blocks should be in temp_storage)
        let prev_mst = Mst::load(temp_storage.clone(), prev_data_cid, None);

        // 5. Load new MST from commit.data (claimed result)
        let expected_root = *commit.data();
        let new_mst = Mst::load(temp_storage, expected_root, None);

        // 6. Compute diff to get verified write ops (with actual prev values from tree state)
        let diff = prev_mst.diff(&new_mst).await?;
        let verified_ops = diff.to_verified_ops();

        // 7. Apply verified ops to prev MST
        let computed_mst = prev_mst.batch(&verified_ops).await?;

        // 8. Verify computed result matches claimed result
        let computed_root = computed_mst.get_pointer().await?;

        if computed_root != expected_root {
            return Err(crate::error::RepoError::invalid_commit(format!(
                "MST root mismatch: expected {}, got {}",
                expected_root, computed_root
            )));
        }

        Ok(expected_root)
    }
}
+239
crates/jacquard-repo/src/commit/mod.rs
···11+//! Commit structures and signature verification for AT Protocol repositories.
22+//!
33+//! This module provides repository commit object handling with signature support.
44+55+pub mod firehose;
66+pub mod proof;
77+pub(crate) mod serde_bytes_helper;
88+use crate::error::{CommitError, Result};
99+use bytes::Bytes;
1010+use cid::Cid as IpldCid;
1111+use jacquard_common::IntoStatic;
1212+use jacquard_common::types::crypto::PublicKey;
1313+use jacquard_common::types::string::Did;
1414+use jacquard_common::types::tid::Tid;
/// Repository commit object
///
/// This structure represents a signed commit in an AT Protocol repository.
/// Stored as a block in CAR files, identified by its CID.
///
/// **Version compatibility**: v2 and v3 commits differ only in how `prev` is
/// serialized (v2 uses it, v3 must include it even if null). This struct
/// handles both by always including `prev` in serialization.
///
/// NOTE(review): the serialized bytes of this struct determine the commit CID
/// (see `to_cid`), so do not reorder or rename fields without confirming how
/// `serde_ipld_dagcbor` orders map keys — a byte-level change here changes
/// every commit CID.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct Commit<'a> {
    /// Repository DID (the account that owns this repo)
    #[serde(borrow)]
    pub did: Did<'a>,

    /// Commit format version (2 or 3)
    pub version: i64,

    /// CID of the MST root node for this commit
    pub data: IpldCid,

    /// Revision TID (monotonically increasing per repo)
    pub rev: Tid,

    /// Previous commit CID (None for initial commit; still serialized as null)
    pub prev: Option<IpldCid>,

    /// Raw signature bytes over the unsigned commit (encoded as a CBOR byte string)
    #[serde(with = "serde_bytes_helper")]
    pub sig: Bytes,
}
4545+4646+impl<'a> Commit<'a> {
4747+ /// Create new unsigned commit (version = 3, sig empty)
4848+ pub fn new_unsigned(did: Did<'a>, data: IpldCid, rev: Tid, prev: Option<IpldCid>) -> Self {
4949+ Self {
5050+ did,
5151+ version: 3,
5252+ data,
5353+ rev,
5454+ prev,
5555+ sig: Bytes::new(),
5656+ }
5757+ }
5858+5959+ /// Sign this commit with a key
6060+ pub fn sign(mut self, key: &impl SigningKey) -> Result<Self> {
6161+ let unsigned = self.unsigned_bytes()?;
6262+ self.sig = key.sign_bytes(&unsigned)?;
6363+ Ok(self)
6464+ }
6565+6666+ /// Get the repository DID
6767+ pub fn did(&self) -> &Did<'a> {
6868+ &self.did
6969+ }
7070+7171+ /// Get the MST root CID
7272+ pub fn data(&self) -> &IpldCid {
7373+ &self.data
7474+ }
7575+7676+ /// Get the revision TID
7777+ pub fn rev(&self) -> &Tid {
7878+ &self.rev
7979+ }
8080+8181+ /// Get the previous commit CID
8282+ pub fn prev(&self) -> Option<&IpldCid> {
8383+ self.prev.as_ref()
8484+ }
8585+8686+ /// Get the signature bytes
8787+ pub fn sig(&self) -> &Bytes {
8888+ &self.sig
8989+ }
9090+9191+ /// Get unsigned commit bytes (for signing/verification)
9292+ pub(super) fn unsigned_bytes(&self) -> Result<Vec<u8>> {
9393+ // Serialize without signature field
9494+ let mut unsigned = self.clone();
9595+ unsigned.sig = Bytes::new();
9696+ serde_ipld_dagcbor::to_vec(&unsigned)
9797+ .map_err(|e| crate::error::CommitError::Serialization(Box::new(e)).into())
9898+ }
9999+100100+ /// Serialize to DAG-CBOR
101101+ pub fn to_cbor(&self) -> Result<Vec<u8>> {
102102+ serde_ipld_dagcbor::to_vec(self).map_err(|e| CommitError::Serialization(Box::new(e)).into())
103103+ }
104104+105105+ /// Deserialize from DAG-CBOR
106106+ pub fn from_cbor(data: &'a [u8]) -> Result<Self> {
107107+ serde_ipld_dagcbor::from_slice(data)
108108+ .map_err(|e| CommitError::Serialization(Box::new(e)).into())
109109+ }
110110+111111+ /// Compute CID of this commit
112112+ pub fn to_cid(&self) -> Result<IpldCid> {
113113+ let cbor = self.to_cbor()?;
114114+ crate::mst::util::compute_cid(&cbor)
115115+ }
116116+117117+ /// Verify signature against a public key from a DID document.
118118+ ///
119119+ /// The key type is inferred from the PublicKey codec.
120120+ pub fn verify(&self, pubkey: &PublicKey) -> std::result::Result<(), CommitError> {
121121+ let unsigned = self
122122+ .unsigned_bytes()
123123+ .map_err(|e| CommitError::Serialization(e.into()))?;
124124+ let signature = self.sig();
125125+126126+ use jacquard_common::types::crypto::KeyCodec;
127127+ match pubkey.codec {
128128+ KeyCodec::Ed25519 => {
129129+ let vk = pubkey
130130+ .to_ed25519()
131131+ .map_err(|e| CommitError::InvalidKey(e.to_string()))?;
132132+ let sig = ed25519_dalek::Signature::from_slice(signature.as_ref())
133133+ .map_err(|e| CommitError::InvalidSignature(e.to_string()))?;
134134+ vk.verify_strict(&unsigned, &sig)
135135+ .map_err(|_| CommitError::SignatureVerificationFailed)?;
136136+ }
137137+ KeyCodec::Secp256k1 => {
138138+ use k256::ecdsa::{Signature, VerifyingKey, signature::Verifier};
139139+ let vk = pubkey
140140+ .to_k256()
141141+ .map_err(|e| CommitError::InvalidKey(e.to_string()))?;
142142+ let verifying_key = VerifyingKey::from(&vk);
143143+ let sig = Signature::from_slice(signature.as_ref())
144144+ .map_err(|e| CommitError::InvalidSignature(e.to_string()))?;
145145+ verifying_key
146146+ .verify(&unsigned, &sig)
147147+ .map_err(|_| CommitError::SignatureVerificationFailed)?;
148148+ }
149149+ KeyCodec::P256 => {
150150+ use p256::ecdsa::{Signature, VerifyingKey, signature::Verifier};
151151+ let vk = pubkey
152152+ .to_p256()
153153+ .map_err(|e| CommitError::InvalidKey(e.to_string()))?;
154154+ let verifying_key = VerifyingKey::from(&vk);
155155+ let sig = Signature::from_slice(signature.as_ref())
156156+ .map_err(|e| CommitError::InvalidSignature(e.to_string()))?;
157157+ verifying_key
158158+ .verify(&unsigned, &sig)
159159+ .map_err(|_| CommitError::SignatureVerificationFailed)?;
160160+ }
161161+ KeyCodec::Unknown(code) => {
162162+ return Err(CommitError::UnsupportedKeyType(code));
163163+ }
164164+ }
165165+166166+ Ok(())
167167+ }
168168+}
169169+170170+impl IntoStatic for Commit<'_> {
171171+ type Output = Commit<'static>;
172172+173173+ fn into_static(self) -> Self::Output {
174174+ Commit {
175175+ did: self.did.into_static(),
176176+ version: self.version,
177177+ data: self.data,
178178+ rev: self.rev,
179179+ prev: self.prev,
180180+ sig: self.sig,
181181+ }
182182+ }
183183+}
/// Trait for signing keys.
///
/// Implemented for ed25519_dalek::SigningKey, k256::ecdsa::SigningKey, and p256::ecdsa::SigningKey.
///
/// Used by [`Commit::sign`] to produce the commit signature over the unsigned
/// commit bytes.
pub trait SigningKey {
    /// Sign the given data and return signature as Bytes
    fn sign_bytes(&self, data: &[u8]) -> Result<Bytes>;

    /// Get the public key bytes
    ///
    /// The provided ECDSA impls return the SEC1 compressed point; the Ed25519
    /// impl returns the raw 32-byte verifying key.
    fn public_key(&self) -> Vec<u8>;
}
195195+196196+// Ed25519 implementation
197197+impl SigningKey for ed25519_dalek::SigningKey {
198198+ fn sign_bytes(&self, data: &[u8]) -> Result<Bytes> {
199199+ use ed25519_dalek::Signer;
200200+ let sig = Signer::sign(self, data);
201201+ Ok(Bytes::copy_from_slice(&sig.to_bytes()))
202202+ }
203203+204204+ fn public_key(&self) -> Vec<u8> {
205205+ self.verifying_key().to_bytes().to_vec()
206206+ }
207207+}
208208+209209+// K-256 (secp256k1) implementation
210210+impl SigningKey for k256::ecdsa::SigningKey {
211211+ fn sign_bytes(&self, data: &[u8]) -> Result<Bytes> {
212212+ use k256::ecdsa::signature::Signer;
213213+ let sig: k256::ecdsa::Signature = Signer::sign(self, data);
214214+ Ok(Bytes::copy_from_slice(&sig.to_bytes()))
215215+ }
216216+217217+ fn public_key(&self) -> Vec<u8> {
218218+ self.verifying_key()
219219+ .to_encoded_point(true)
220220+ .as_bytes()
221221+ .to_vec()
222222+ }
223223+}
224224+225225+// P-256 implementation
226226+impl SigningKey for p256::ecdsa::SigningKey {
227227+ fn sign_bytes(&self, data: &[u8]) -> Result<Bytes> {
228228+ use p256::ecdsa::signature::Signer;
229229+ let sig: p256::ecdsa::Signature = Signer::sign(self, data);
230230+ Ok(Bytes::copy_from_slice(&sig.to_bytes()))
231231+ }
232232+233233+ fn public_key(&self) -> Vec<u8> {
234234+ self.verifying_key()
235235+ .to_encoded_point(true)
236236+ .as_bytes()
237237+ .to_vec()
238238+ }
239239+}
+774
crates/jacquard-repo/src/commit/proof.rs
···11+//! Record proof verification
22+//!
33+//! Verifies merkle proofs for individual record existence/non-existence.
44+//!
55+//! **Proof structure:**
66+//! - CAR file containing:
77+//! - Commit block (with signature)
88+//! - MST node blocks along the path to the record(s)
99+//! - Record blocks (if proving existence)
1010+//!
1111+//! **Verification:**
1212+//! 1. Parse CAR blocks into temporary storage
1313+//! 2. Load and verify commit (signature + DID)
1414+//! 3. Load MST using ONLY blocks from CAR
1515+//! 4. For each claim, check if record exists/matches in MST
1616+//!
1717+//! This is distinct from firehose commit validation - proofs verify individual
1818+//! records, not full repository commits.
1919+2020+use crate::BlockStore;
2121+use crate::error::ProofError;
2222+use crate::mst::Mst;
2323+use crate::storage::MemoryBlockStore;
2424+use cid::Cid as IpldCid;
2525+use jacquard_common::types::string::Did;
2626+use smol_str::format_smolstr;
2727+use std::sync::Arc;
/// A claim about a record's CID at a specific path
///
/// The MST key checked by [`verify_proofs`] is `"{collection}/{rkey}"`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct RecordClaim<'a> {
    /// Collection NSID (e.g., "app.bsky.feed.post")
    pub collection: jacquard_common::CowStr<'a>,

    /// Record key (TID or other identifier)
    pub rkey: jacquard_common::CowStr<'a>,

    /// Expected CID of the record
    /// - Some(cid): claiming record exists with this CID
    /// - None: claiming record does not exist
    pub cid: Option<IpldCid>,
}
/// Result of proof verification
///
/// Every input claim ends up in exactly one of the two lists.
#[derive(Debug)]
pub struct VerifyProofsOutput<'a> {
    /// Claims that were successfully verified
    pub verified: Vec<RecordClaim<'a>>,

    /// Claims that failed verification
    pub unverified: Vec<RecordClaim<'a>>,
}
5353+5454+/// Verify record proofs from a CAR file
5555+///
5656+/// **Inputs:**
5757+/// - `car_bytes`: CAR file containing commit + MST blocks + record blocks
5858+/// - `claims`: Records to verify (existence or non-existence)
5959+/// - `did`: Expected DID of the repository
6060+/// - `pubkey`: Public key for signature verification
6161+///
6262+/// **Returns:**
6363+/// - `verified`: Claims that match the MST state
6464+/// - `unverified`: Claims that don't match
6565+///
6666+/// **Security:**
6767+/// - Verifies commit signature using provided pubkey
6868+/// - Verifies DID matches
6969+/// - Uses ONLY blocks from CAR (merkle proof property)
7070+///
7171+/// # Example
7272+///
7373+/// ```rust,ignore
7474+/// let claims = vec![
7575+/// RecordClaim {
7676+/// collection: "app.bsky.feed.post".into(),
7777+/// rkey: "3l4qpz7ajrc2a".into(),
7878+/// cid: Some(record_cid), // Claiming this record exists
7979+/// },
8080+/// RecordClaim {
8181+/// collection: "app.bsky.feed.post".into(),
8282+/// rkey: "nonexistent".into(),
8383+/// cid: None, // Claiming this record doesn't exist
8484+/// },
8585+/// ];
8686+///
8787+/// let result = verify_proofs(car_bytes, claims, did, pubkey).await?;
8888+/// assert_eq!(result.verified.len(), 2); // Both claims verified
8989+/// ```
9090+pub async fn verify_proofs<'a>(
9191+ car_bytes: &[u8],
9292+ claims: Vec<RecordClaim<'a>>,
9393+ did: &Did<'_>,
9494+ pubkey: &jacquard_common::types::crypto::PublicKey<'_>,
9595+) -> Result<VerifyProofsOutput<'a>, ProofError> {
9696+ // 1. Parse CAR file
9797+ let parsed =
9898+ crate::car::parse_car_bytes(car_bytes)
9999+ .await
100100+ .map_err(|e| ProofError::CarParseFailed {
101101+ source: Box::new(e),
102102+ })?;
103103+104104+ // 2. Create storage with ONLY blocks from CAR (merkle proof property)
105105+ let storage = Arc::new(MemoryBlockStore::new_from_blocks(parsed.blocks));
106106+107107+ // 3. Load commit from CAR root
108108+ let commit_cid = parsed.root;
109109+ let commit_bytes = storage
110110+ .get(&commit_cid)
111111+ .await
112112+ .map_err(|_| ProofError::CommitNotFound)?
113113+ .ok_or(ProofError::CommitNotFound)?;
114114+115115+ let commit = super::Commit::from_cbor(&commit_bytes).map_err(|e| {
116116+ ProofError::CommitDeserializeFailed {
117117+ source: Box::new(e),
118118+ }
119119+ })?;
120120+121121+ // 4. Verify DID matches
122122+ if commit.did().as_ref() != did.as_ref() {
123123+ return Err(ProofError::DidMismatch {
124124+ commit_did: commit.did().to_string(),
125125+ expected_did: did.to_string(),
126126+ }
127127+ .into());
128128+ }
129129+130130+ // 5. Verify signature
131131+ // We need to extract the CommitError before it gets converted to RepoError
132132+ if let Err(e) = commit.verify(pubkey) {
133133+ return Err(ProofError::SignatureVerificationFailed { source: e }.into());
134134+ }
135135+136136+ // 6. Load MST using ONLY blocks from CAR
137137+ let mst = Mst::load(storage.clone(), *commit.data(), None);
138138+139139+ // 7. Verify each claim
140140+ let mut verified = Vec::new();
141141+ let mut unverified = Vec::new();
142142+143143+ for claim in claims {
144144+ let key = format_smolstr!("{}/{}", claim.collection, claim.rkey);
145145+ let found_cid = mst.get(&key).await.ok().flatten();
146146+147147+ match (&claim.cid, found_cid) {
148148+ // Claiming record doesn't exist
149149+ (None, None) => {
150150+ // Correct: record doesn't exist
151151+ verified.push(claim);
152152+ }
153153+ (None, Some(_)) => {
154154+ // Incorrect: claimed doesn't exist but it does
155155+ unverified.push(claim);
156156+ }
157157+ // Claiming record exists with specific CID
158158+ (Some(claimed_cid), Some(found)) if claimed_cid == &found => {
159159+ // Correct: CID matches
160160+ verified.push(claim);
161161+ }
162162+ (Some(_), _) => {
163163+ // Incorrect: CID mismatch or doesn't exist
164164+ unverified.push(claim);
165165+ }
166166+ }
167167+ }
168168+169169+ Ok(VerifyProofsOutput {
170170+ verified,
171171+ unverified,
172172+ })
173173+}
#[cfg(test)]
mod tests {
    //! End-to-end proof-verification tests: each test builds a real MST in a
    //! `MemoryBlockStore`, signs a commit over its root with a fresh secp256k1
    //! key, packs the relevant blocks into a CAR, and runs `verify_proofs`.
    use super::*;
    use crate::commit::Commit;
    use crate::mst::Mst;
    use crate::storage::MemoryBlockStore;
    use jacquard_common::types::crypto::PublicKey;
    use jacquard_common::types::string::Did;

    // Fresh random secp256k1 signing key for each test.
    fn test_signing_key() -> k256::ecdsa::SigningKey {
        use k256::ecdsa::SigningKey;
        use rand::rngs::OsRng;
        SigningKey::random(&mut OsRng)
    }

    // Wrap the verifying key as a `PublicKey` (compressed SEC1 bytes,
    // Secp256k1 codec) — the shape `Commit::verify` expects.
    fn test_pubkey(sk: &k256::ecdsa::SigningKey) -> PublicKey<'static> {
        use jacquard_common::types::crypto::KeyCodec;
        use std::borrow::Cow;
        let vk = sk.verifying_key();
        PublicKey {
            codec: KeyCodec::Secp256k1,
            bytes: Cow::Owned(vk.to_encoded_point(true).as_bytes().to_vec()),
        }
    }

    // Deterministic dummy CID: SHA2-256 multihash wrapping 32 bytes of `n`.
    fn test_cid(n: u8) -> IpldCid {
        let data = vec![n; 32];
        let mh =
            multihash::Multihash::wrap(jacquard_common::types::crypto::SHA2_256, &data).unwrap();
        IpldCid::new_v1(crate::DAG_CBOR_CID_CODEC, mh)
    }

    #[tokio::test]
    async fn test_verify_proofs_record_exists() {
        // Create MST with records
        let storage = Arc::new(MemoryBlockStore::new());
        let mst = Mst::new(storage.clone());

        let key1 = "app.bsky.feed.post/abc123";
        let cid1 = test_cid(1);

        let mst = mst.add(key1, cid1).await.unwrap();
        let mst_root = mst.persist().await.unwrap();

        // Create and sign commit
        let sk = test_signing_key();
        let pubkey = test_pubkey(&sk);
        let did = Did::new("did:plc:test").unwrap();

        let commit = Commit::new_unsigned(
            did.clone(),
            mst_root,
            jacquard_common::types::tid::Ticker::new().next(None),
            None,
        )
        .sign(&sk)
        .unwrap();

        let commit_cid = commit.to_cid().unwrap();
        let commit_bytes = commit.to_cbor().unwrap();
        storage.put(&commit_bytes).await.unwrap();

        // Generate CAR proof with: commit + MST path blocks
        let cids_for_proof = mst.cids_for_path(key1).await.unwrap();
        let mut car_blocks = std::collections::BTreeMap::new();

        // Add commit block
        car_blocks.insert(commit_cid, bytes::Bytes::from(commit_bytes));

        // Add MST blocks
        for cid in &cids_for_proof[..cids_for_proof.len() - 1] {
            // All except record CID
            if let Some(block) = storage.get(cid).await.unwrap() {
                car_blocks.insert(*cid, block);
            }
        }

        // Add record block
        car_blocks.insert(cid1, bytes::Bytes::from(vec![0x42])); // dummy record data

        // Write CAR
        let car_bytes = crate::car::write_car_bytes(commit_cid, car_blocks)
            .await
            .unwrap();

        // Verify proof
        let claims = vec![RecordClaim {
            collection: "app.bsky.feed.post".into(),
            rkey: "abc123".into(),
            cid: Some(cid1),
        }];

        let result = verify_proofs(&car_bytes, claims, &did, &pubkey)
            .await
            .unwrap();

        assert_eq!(result.verified.len(), 1);
        assert_eq!(result.unverified.len(), 0);
    }

    #[tokio::test]
    async fn test_verify_proofs_record_not_exists() {
        // Create MST with one record
        let storage = Arc::new(MemoryBlockStore::new());
        let mst = Mst::new(storage.clone());

        let key1 = "app.bsky.feed.post/abc123";
        let cid1 = test_cid(1);

        let mst = mst.add(key1, cid1).await.unwrap();
        let mst_root = mst.persist().await.unwrap();

        // Create and sign commit
        let sk = test_signing_key();
        let pubkey = test_pubkey(&sk);
        let did = Did::new("did:plc:test").unwrap();

        let commit = Commit::new_unsigned(
            did.clone(),
            mst_root,
            jacquard_common::types::tid::Ticker::new().next(None),
            None,
        )
        .sign(&sk)
        .unwrap();

        let commit_cid = commit.to_cid().unwrap();
        let commit_bytes = commit.to_cbor().unwrap();
        storage.put(&commit_bytes).await.unwrap();

        // Generate proof for non-existent record
        let nonexistent_key = "app.bsky.feed.post/xyz789";
        let cids_for_proof = mst.cids_for_path(nonexistent_key).await.unwrap();
        let mut car_blocks = std::collections::BTreeMap::new();

        // Add commit block
        car_blocks.insert(commit_cid, bytes::Bytes::from(commit_bytes));

        // Add MST blocks (proof of absence)
        for cid in &cids_for_proof {
            if let Some(block) = storage.get(cid).await.unwrap() {
                car_blocks.insert(*cid, block);
            }
        }

        // Write CAR
        let car_bytes = crate::car::write_car_bytes(commit_cid, car_blocks)
            .await
            .unwrap();

        // Verify proof of non-existence
        let claims = vec![RecordClaim {
            collection: "app.bsky.feed.post".into(),
            rkey: "xyz789".into(),
            cid: None, // Claiming it doesn't exist
        }];

        let result = verify_proofs(&car_bytes, claims, &did, &pubkey)
            .await
            .unwrap();

        assert_eq!(result.verified.len(), 1);
        assert_eq!(result.unverified.len(), 0);
    }

    #[tokio::test]
    async fn test_verify_proofs_multiple_claims_mixed() {
        // Test verifying multiple claims - some valid, some invalid
        let storage = Arc::new(MemoryBlockStore::new());
        let mst = Mst::new(storage.clone());

        let key1 = "app.bsky.feed.post/abc123";
        let key2 = "app.bsky.feed.post/def456";
        let cid1 = test_cid(1);
        let cid2 = test_cid(2);

        let mst = mst.add(key1, cid1).await.unwrap();
        let mst = mst.add(key2, cid2).await.unwrap();
        let mst_root = mst.persist().await.unwrap();

        let sk = test_signing_key();
        let pubkey = test_pubkey(&sk);
        let did = Did::new("did:plc:test").unwrap();

        let commit = Commit::new_unsigned(
            did.clone(),
            mst_root,
            jacquard_common::types::tid::Ticker::new().next(None),
            None,
        )
        .sign(&sk)
        .unwrap();

        let commit_cid = commit.to_cid().unwrap();
        let commit_bytes = commit.to_cbor().unwrap();
        storage.put(&commit_bytes).await.unwrap();

        // Generate CAR with both records
        let cids_for_proof1 = mst.cids_for_path(key1).await.unwrap();
        let cids_for_proof2 = mst.cids_for_path(key2).await.unwrap();
        let mut car_blocks = std::collections::BTreeMap::new();

        car_blocks.insert(commit_cid, bytes::Bytes::from(commit_bytes));

        // Add all MST blocks from both paths
        for cid in cids_for_proof1
            .iter()
            .chain(cids_for_proof2.iter())
            .take(cids_for_proof1.len() + cids_for_proof2.len() - 2)
        {
            if let Some(block) = storage.get(cid).await.unwrap() {
                car_blocks.insert(*cid, block);
            }
        }

        // Add record blocks
        car_blocks.insert(cid1, bytes::Bytes::from(vec![0x41]));
        car_blocks.insert(cid2, bytes::Bytes::from(vec![0x42]));

        let car_bytes = crate::car::write_car_bytes(commit_cid, car_blocks)
            .await
            .unwrap();

        // Mixed claims: valid, invalid CID, non-existent
        let claims = vec![
            RecordClaim {
                collection: "app.bsky.feed.post".into(),
                rkey: "abc123".into(),
                cid: Some(cid1), // Valid
            },
            RecordClaim {
                collection: "app.bsky.feed.post".into(),
                rkey: "def456".into(),
                cid: Some(test_cid(99)), // Wrong CID
            },
            RecordClaim {
                collection: "app.bsky.feed.post".into(),
                rkey: "xyz789".into(),
                cid: None, // Correctly doesn't exist
            },
        ];

        let result = verify_proofs(&car_bytes, claims, &did, &pubkey)
            .await
            .unwrap();

        assert_eq!(result.verified.len(), 2); // First and third should verify
        assert_eq!(result.unverified.len(), 1); // Second should fail
    }

    #[tokio::test]
    async fn test_verify_proofs_wrong_did() {
        // Test that verification fails when DID doesn't match
        let storage = Arc::new(MemoryBlockStore::new());
        let mst = Mst::new(storage.clone());

        let key1 = "app.bsky.feed.post/abc123";
        let cid1 = test_cid(1);

        let mst = mst.add(key1, cid1).await.unwrap();
        let mst_root = mst.persist().await.unwrap();

        let sk = test_signing_key();
        let pubkey = test_pubkey(&sk);
        let did = Did::new("did:plc:test").unwrap();

        let commit = Commit::new_unsigned(
            did.clone(),
            mst_root,
            jacquard_common::types::tid::Ticker::new().next(None),
            None,
        )
        .sign(&sk)
        .unwrap();

        let commit_cid = commit.to_cid().unwrap();
        let commit_bytes = commit.to_cbor().unwrap();
        storage.put(&commit_bytes).await.unwrap();

        let cids_for_proof = mst.cids_for_path(key1).await.unwrap();
        let mut car_blocks = std::collections::BTreeMap::new();
        car_blocks.insert(commit_cid, bytes::Bytes::from(commit_bytes));

        for cid in &cids_for_proof[..cids_for_proof.len() - 1] {
            if let Some(block) = storage.get(cid).await.unwrap() {
                car_blocks.insert(*cid, block);
            }
        }

        car_blocks.insert(cid1, bytes::Bytes::from(vec![0x42]));

        let car_bytes = crate::car::write_car_bytes(commit_cid, car_blocks)
            .await
            .unwrap();

        let claims = vec![RecordClaim {
            collection: "app.bsky.feed.post".into(),
            rkey: "abc123".into(),
            cid: Some(cid1),
        }];

        // Try to verify with WRONG DID
        let wrong_did = Did::new("did:plc:wrong").unwrap();
        let result = verify_proofs(&car_bytes, claims, &wrong_did, &pubkey).await;

        assert!(result.is_err());
        assert!(result.unwrap_err().to_string().contains("DID mismatch"));
    }

    #[tokio::test]
    async fn test_verify_proofs_bad_signature() {
        // Test that verification fails with wrong public key
        let storage = Arc::new(MemoryBlockStore::new());
        let mst = Mst::new(storage.clone());

        let key1 = "app.bsky.feed.post/abc123";
        let cid1 = test_cid(1);

        let mst = mst.add(key1, cid1).await.unwrap();
        let mst_root = mst.persist().await.unwrap();

        let sk = test_signing_key();
        let did = Did::new("did:plc:test").unwrap();

        let commit = Commit::new_unsigned(
            did.clone(),
            mst_root,
            jacquard_common::types::tid::Ticker::new().next(None),
            None,
        )
        .sign(&sk)
        .unwrap();

        let commit_cid = commit.to_cid().unwrap();
        let commit_bytes = commit.to_cbor().unwrap();
        storage.put(&commit_bytes).await.unwrap();

        let cids_for_proof = mst.cids_for_path(key1).await.unwrap();
        let mut car_blocks = std::collections::BTreeMap::new();
        car_blocks.insert(commit_cid, bytes::Bytes::from(commit_bytes));

        for cid in &cids_for_proof[..cids_for_proof.len() - 1] {
            if let Some(block) = storage.get(cid).await.unwrap() {
                car_blocks.insert(*cid, block);
            }
        }

        car_blocks.insert(cid1, bytes::Bytes::from(vec![0x42]));

        let car_bytes = crate::car::write_car_bytes(commit_cid, car_blocks)
            .await
            .unwrap();

        let claims = vec![RecordClaim {
            collection: "app.bsky.feed.post".into(),
            rkey: "abc123".into(),
            cid: Some(cid1),
        }];

        // Use WRONG public key
        let wrong_sk = test_signing_key();
        let wrong_pubkey = test_pubkey(&wrong_sk);

        let result = verify_proofs(&car_bytes, claims, &did, &wrong_pubkey).await;

        // Should fail signature verification
        assert!(matches!(
            result,
            Err(ProofError::SignatureVerificationFailed { source: _ })
        ));
    }

    #[tokio::test]
    async fn test_verify_proofs_missing_blocks() {
        // Test that verification fails when CAR is missing necessary blocks
        let storage = Arc::new(MemoryBlockStore::new());
        let mst = Mst::new(storage.clone());

        let key1 = "app.bsky.feed.post/abc123";
        let cid1 = test_cid(1);

        let mst = mst.add(key1, cid1).await.unwrap();
        let mst_root = mst.persist().await.unwrap();

        let sk = test_signing_key();
        let pubkey = test_pubkey(&sk);
        let did = Did::new("did:plc:test").unwrap();

        let commit = Commit::new_unsigned(
            did.clone(),
            mst_root,
            jacquard_common::types::tid::Ticker::new().next(None),
            None,
        )
        .sign(&sk)
        .unwrap();

        let commit_cid = commit.to_cid().unwrap();
        let commit_bytes = commit.to_cbor().unwrap();
        storage.put(&commit_bytes).await.unwrap();

        // Create CAR with ONLY commit block, missing MST blocks
        let mut car_blocks = std::collections::BTreeMap::new();
        car_blocks.insert(commit_cid, bytes::Bytes::from(commit_bytes));
        // Intentionally NOT adding MST blocks or record blocks

        let car_bytes = crate::car::write_car_bytes(commit_cid, car_blocks)
            .await
            .unwrap();

        let claims = vec![RecordClaim {
            collection: "app.bsky.feed.post".into(),
            rkey: "abc123".into(),
            cid: Some(cid1),
        }];

        let result = verify_proofs(&car_bytes, claims, &did, &pubkey)
            .await
            .unwrap();

        assert!(result.verified.is_empty())
    }

    #[tokio::test]
    async fn test_verify_proofs_empty_mst() {
        // Test proof verification on empty MST (claiming non-existence)
        let storage = Arc::new(MemoryBlockStore::new());
        let mst = Mst::new(storage.clone());

        let mst_root = mst.persist().await.unwrap();

        let sk = test_signing_key();
        let pubkey = test_pubkey(&sk);
        let did = Did::new("did:plc:test").unwrap();

        let commit = Commit::new_unsigned(
            did.clone(),
            mst_root,
            jacquard_common::types::tid::Ticker::new().next(None),
            None,
        )
        .sign(&sk)
        .unwrap();

        let commit_cid = commit.to_cid().unwrap();
        let commit_bytes = commit.to_cbor().unwrap();
        storage.put(&commit_bytes).await.unwrap();

        let cids_for_proof = mst
            .cids_for_path("app.bsky.feed.post/abc123")
            .await
            .unwrap();
        let mut car_blocks = std::collections::BTreeMap::new();
        car_blocks.insert(commit_cid, bytes::Bytes::from(commit_bytes));

        // Add any MST blocks (empty MST might still have root node)
        for cid in &cids_for_proof {
            if let Some(block) = storage.get(cid).await.unwrap() {
                car_blocks.insert(*cid, block);
            }
        }

        let car_bytes = crate::car::write_car_bytes(commit_cid, car_blocks)
            .await
            .unwrap();

        let claims = vec![RecordClaim {
            collection: "app.bsky.feed.post".into(),
            rkey: "abc123".into(),
            cid: None, // Claiming doesn't exist
        }];

        let result = verify_proofs(&car_bytes, claims, &did, &pubkey)
            .await
            .unwrap();

        assert_eq!(result.verified.len(), 1);
        assert_eq!(result.unverified.len(), 0);
    }

    #[tokio::test]
    async fn test_verify_proofs_claim_exists_in_empty_mst() {
        // Test that claiming existence in empty MST fails
        let storage = Arc::new(MemoryBlockStore::new());
        let mst = Mst::new(storage.clone());

        let mst_root = mst.persist().await.unwrap();

        let sk = test_signing_key();
        let pubkey = test_pubkey(&sk);
        let did = Did::new("did:plc:test").unwrap();

        let commit = Commit::new_unsigned(
            did.clone(),
            mst_root,
            jacquard_common::types::tid::Ticker::new().next(None),
            None,
        )
        .sign(&sk)
        .unwrap();

        let commit_cid = commit.to_cid().unwrap();
        let commit_bytes = commit.to_cbor().unwrap();
        storage.put(&commit_bytes).await.unwrap();

        let cids_for_proof = mst
            .cids_for_path("app.bsky.feed.post/abc123")
            .await
            .unwrap();
        let mut car_blocks = std::collections::BTreeMap::new();
        car_blocks.insert(commit_cid, bytes::Bytes::from(commit_bytes));

        for cid in &cids_for_proof {
            if let Some(block) = storage.get(cid).await.unwrap() {
                car_blocks.insert(*cid, block);
            }
        }

        let car_bytes = crate::car::write_car_bytes(commit_cid, car_blocks)
            .await
            .unwrap();

        let claims = vec![RecordClaim {
            collection: "app.bsky.feed.post".into(),
            rkey: "abc123".into(),
            cid: Some(test_cid(1)), // Claiming it exists
        }];

        let result = verify_proofs(&car_bytes, claims, &did, &pubkey)
            .await
            .unwrap();

        assert_eq!(result.verified.len(), 0);
        assert_eq!(result.unverified.len(), 1); // Should fail
    }

    #[tokio::test]
    async fn test_verify_proofs_invalid_claim() {
        // Create MST with records
        let storage = Arc::new(MemoryBlockStore::new());
        let mst = Mst::new(storage.clone());

        let key1 = "app.bsky.feed.post/abc123";
        let cid1 = test_cid(1);

        let mst = mst.add(key1, cid1).await.unwrap();
        let mst_root = mst.persist().await.unwrap();

        // Create and sign commit
        let sk = test_signing_key();
        let pubkey = test_pubkey(&sk);
        let did = Did::new("did:plc:test").unwrap();

        let commit = Commit::new_unsigned(
            did.clone(),
            mst_root,
            jacquard_common::types::tid::Ticker::new().next(None),
            None,
        )
        .sign(&sk)
        .unwrap();

        let commit_cid = commit.to_cid().unwrap();
        let commit_bytes = commit.to_cbor().unwrap();
        storage.put(&commit_bytes).await.unwrap();

        // Generate CAR proof
        let cids_for_proof = mst.cids_for_path(key1).await.unwrap();
        let mut car_blocks = std::collections::BTreeMap::new();

        car_blocks.insert(commit_cid, bytes::Bytes::from(commit_bytes));

        for cid in &cids_for_proof[..cids_for_proof.len() - 1] {
            if let Some(block) = storage.get(cid).await.unwrap() {
                car_blocks.insert(*cid, block);
            }
        }

        car_blocks.insert(cid1, bytes::Bytes::from(vec![0x42]));

        let car_bytes = crate::car::write_car_bytes(commit_cid, car_blocks)
            .await
            .unwrap();

        // Verify proof with WRONG CID
        let wrong_cid = test_cid(99);
        let claims = vec![RecordClaim {
            collection: "app.bsky.feed.post".into(),
            rkey: "abc123".into(),
            cid: Some(wrong_cid), // Wrong CID
        }];

        let result = verify_proofs(&car_bytes, claims, &did, &pubkey)
            .await
            .unwrap();

        assert_eq!(result.verified.len(), 0);
        assert_eq!(result.unverified.len(), 1); // Failed verification
    }
}
···11+//! Custom serde helpers for bytes::Bytes using serde_bytes
22+33+use bytes::Bytes;
44+use serde::{Deserializer, Serializer};
55+66+/// Serialize Bytes as a CBOR byte string
77+pub fn serialize<S>(bytes: &Bytes, serializer: S) -> Result<S::Ok, S::Error>
88+where
99+ S: Serializer,
1010+{
1111+ serde_bytes::serialize(bytes.as_ref(), serializer)
1212+}
1313+1414+/// Deserialize Bytes from a CBOR byte string
1515+pub fn deserialize<'de, D>(deserializer: D) -> Result<Bytes, D::Error>
1616+where
1717+ D: Deserializer<'de>,
1818+{
1919+ let vec: Vec<u8> = serde_bytes::deserialize(deserializer)?;
2020+ Ok(Bytes::from(vec))
2121+}
+410
crates/jacquard-repo/src/error.rs
···11+//! Error types for repository operations
22+33+use std::error::Error;
44+use std::fmt;
/// Boxed error type for error sources
///
/// Erases the concrete error type while staying `Send + Sync + 'static`, so
/// it can serve as the `#[source]` payload of [`RepoError`] across threads.
pub type BoxError = Box<dyn Error + Send + Sync + 'static>;

/// Result type alias for repository operations
///
/// Shorthand for `std::result::Result<T, RepoError>` used throughout the crate.
pub type Result<T> = std::result::Result<T, RepoError>;
/// Repository operation error with rich diagnostics
///
/// `Display` renders as `Kind: context: source`, omitting absent parts.
/// The optional help text is surfaced through the `miette::Diagnostic` derive.
#[derive(Debug, thiserror::Error, miette::Diagnostic)]
pub struct RepoError {
    // Coarse category; exposed read-only via `kind()`.
    kind: RepoErrorKind,
    // Underlying cause, if any; drives `Error::source`.
    #[source]
    source: Option<BoxError>,
    // Remediation hint picked up by miette.
    #[help]
    help: Option<String>,
    // Free-form context, printed between the kind and the source.
    context: Option<String>,
}
/// Error categories for repository operations
///
/// Coarse classification exposed through [`RepoError::kind`]; `Copy` + `Eq`
/// so callers can match on it cheaply.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum RepoErrorKind {
    /// Storage operation failed
    Storage,
    /// Invalid MST structure
    InvalidMst,
    /// Invalid commit structure
    InvalidCommit,
    /// Invalid key format
    InvalidKey,
    /// Invalid CID
    InvalidCid,
    /// Resource not found
    NotFound,
    /// Cryptographic operation failed
    Crypto,
    /// Serialization/deserialization failed
    Serialization,
    /// Data too large (exceeds protocol limits)
    TooLarge,
    /// CAR file operation failed
    Car,
    /// I/O error
    Io,
}
4949+5050+impl RepoError {
5151+ /// Create a new error with the given kind and optional source
5252+ pub fn new(kind: RepoErrorKind, source: Option<BoxError>) -> Self {
5353+ Self {
5454+ kind,
5555+ source,
5656+ help: None,
5757+ context: None,
5858+ }
5959+ }
6060+6161+ /// Add a help message to the error
6262+ pub fn with_help(mut self, help: impl Into<String>) -> Self {
6363+ self.help = Some(help.into());
6464+ self
6565+ }
6666+6767+ /// Add context information to the error
6868+ pub fn with_context(mut self, context: impl Into<String>) -> Self {
6969+ self.context = Some(context.into());
7070+ self
7171+ }
7272+7373+ /// Get the error kind
7474+ pub fn kind(&self) -> &RepoErrorKind {
7575+ &self.kind
7676+ }
7777+7878+ // Constructors for different error kinds
7979+8080+ /// Create a storage error
8181+ pub fn storage(source: impl Error + Send + Sync + 'static) -> Self {
8282+ Self::new(RepoErrorKind::Storage, Some(Box::new(source)))
8383+ }
8484+8585+ /// Create an invalid MST error
8686+ pub fn invalid_mst(msg: impl Into<String>) -> Self {
8787+ Self::new(RepoErrorKind::InvalidMst, Some(msg.into().into()))
8888+ .with_help("MST nodes must follow protocol structure")
8989+ }
9090+9191+ /// Create an invalid commit error
9292+ pub fn invalid_commit(msg: impl Into<String>) -> Self {
9393+ Self::new(RepoErrorKind::InvalidCommit, Some(msg.into().into()))
9494+ }
9595+9696+ /// Create an invalid key error
9797+ pub fn invalid_key(key: impl Into<String>) -> Self {
9898+ Self::new(RepoErrorKind::InvalidKey, None)
9999+ .with_help("MST keys must match [a-zA-Z0-9._:~-]+, max 256 bytes")
100100+ .with_context(format!("key: {}", key.into()))
101101+ }
102102+103103+ /// Create an invalid CID error
104104+ pub fn invalid_cid(msg: impl Into<String>) -> Self {
105105+ Self::new(RepoErrorKind::InvalidCid, Some(msg.into().into()))
106106+ }
107107+108108+ /// Create a not found error
109109+ pub fn not_found(resource: &str, id: impl fmt::Display) -> Self {
110110+ Self::new(RepoErrorKind::NotFound, None)
111111+ .with_context(format!("{} not found: {}", resource, id))
112112+ }
113113+114114+ /// Create an already exists error
115115+ pub fn already_exists(resource: &str, id: impl fmt::Display) -> Self {
116116+ Self::new(RepoErrorKind::InvalidMst, None)
117117+ .with_context(format!("{} already exists: {}", resource, id))
118118+ }
119119+120120+ /// Create a crypto error
121121+ pub fn crypto(source: impl Error + Send + Sync + 'static) -> Self {
122122+ Self::new(RepoErrorKind::Crypto, Some(Box::new(source)))
123123+ }
124124+125125+ /// Create a serialization error
126126+ pub fn serialization(source: impl Error + Send + Sync + 'static) -> Self {
127127+ Self::new(RepoErrorKind::Serialization, Some(Box::new(source)))
128128+ }
129129+130130+ /// Create a too large error
131131+ pub fn too_large(what: &str, size: usize, max: usize) -> Self {
132132+ Self::new(RepoErrorKind::TooLarge, None)
133133+ .with_context(format!("{} is {} bytes, max {}", what, size, max))
134134+ .with_help("See sync v1.1 protocol limits")
135135+ }
136136+137137+ /// Create a CAR file error
138138+ pub fn car(source: impl Error + Send + Sync + 'static) -> Self {
139139+ Self::new(RepoErrorKind::Car, Some(Box::new(source)))
140140+ }
141141+142142+ /// Create a CAR parse error (alias for car)
143143+ pub fn car_parse(source: impl Error + Send + Sync + 'static) -> Self {
144144+ Self::car(source).with_context("Failed to parse CAR file".to_string())
145145+ }
146146+147147+ /// Create an I/O error
148148+ pub fn io(source: impl Error + Send + Sync + 'static) -> Self {
149149+ Self::new(RepoErrorKind::Io, Some(Box::new(source)))
150150+ }
151151+152152+ /// Create a generic invalid error
153153+ pub fn invalid(msg: impl Into<String>) -> Self {
154154+ Self::new(RepoErrorKind::InvalidMst, Some(msg.into().into()))
155155+ }
156156+}
157157+158158+impl fmt::Display for RepoError {
159159+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
160160+ write!(f, "{:?}", self.kind)?;
161161+162162+ if let Some(ctx) = &self.context {
163163+ write!(f, ": {}", ctx)?;
164164+ }
165165+166166+ if let Some(src) = &self.source {
167167+ write!(f, ": {}", src)?;
168168+ }
169169+170170+ Ok(())
171171+ }
172172+}
173173+174174+// Internal granular errors
/// MST-specific errors
///
/// Granular failures raised by tree operations; converted into the coarse
/// [`RepoError`] taxonomy via the `From` impl below.
#[derive(Debug, thiserror::Error, miette::Diagnostic)]
pub enum MstError {
    /// Empty key not allowed
    #[error("Empty key not allowed")]
    EmptyKey,

    /// Key too long
    #[error("Key too long: {len} bytes (max {max})")]
    KeyTooLong {
        /// Actual key length
        len: usize,
        /// Maximum allowed length
        max: usize,
    },

    /// Invalid key characters
    #[error("Invalid key characters: {key}")]
    InvalidKeyChars {
        /// The invalid key
        key: String,
    },

    /// Node structure invalid
    #[error("Node structure invalid: {0}")]
    InvalidNode(String),

    /// Serialization failed
    #[error("Serialization failed")]
    Serialization(#[source] BoxError),
}
207207+208208+impl From<MstError> for RepoError {
209209+ fn from(e: MstError) -> Self {
210210+ match e {
211211+ MstError::EmptyKey => RepoError::invalid_key(""),
212212+ MstError::KeyTooLong { len, max } => {
213213+ RepoError::invalid_key(format!("length {}/{}", len, max))
214214+ }
215215+ MstError::InvalidKeyChars { key } => RepoError::invalid_key(key),
216216+ MstError::InvalidNode(msg) => RepoError::invalid_mst(msg),
217217+ MstError::Serialization(e) => RepoError::new(RepoErrorKind::Serialization, Some(e)),
218218+ }
219219+ }
220220+}
/// Commit-specific errors
///
/// Granular failures around commit parsing, key handling and signature checks;
/// converted into [`RepoError`] via the `From` impl below.
#[derive(Debug, thiserror::Error, miette::Diagnostic)]
pub enum CommitError {
    /// Invalid commit version
    #[error("Invalid commit version: {0}")]
    InvalidVersion(i64),

    /// Invalid signature format
    #[error("Invalid signature format: {0}")]
    InvalidSignature(String),

    /// Signature verification failed
    #[error("Signature verification failed")]
    SignatureVerificationFailed,

    /// Invalid key format
    #[error("Invalid key format: {0}")]
    InvalidKey(String),

    /// Unsupported key type
    ///
    /// Carries the offending multicodec code.
    #[error("Unsupported key type: {0}")]
    UnsupportedKeyType(u64),

    /// Serialization failed
    #[error("Serialization failed")]
    Serialization(#[source] BoxError),
}
249249+250250+impl From<CommitError> for RepoError {
251251+ fn from(e: CommitError) -> Self {
252252+ match e {
253253+ CommitError::InvalidVersion(v) => {
254254+ RepoError::invalid_commit(format!("unsupported version {}", v))
255255+ }
256256+ CommitError::InvalidSignature(msg) => {
257257+ RepoError::new(RepoErrorKind::Crypto, Some(msg.into()))
258258+ .with_context("invalid signature format".to_string())
259259+ }
260260+ CommitError::SignatureVerificationFailed => RepoError::new(RepoErrorKind::Crypto, None)
261261+ .with_context("signature verification failed".to_string()),
262262+ CommitError::InvalidKey(msg) => RepoError::new(RepoErrorKind::Crypto, Some(msg.into()))
263263+ .with_context("invalid key format".to_string()),
264264+ CommitError::UnsupportedKeyType(code) => RepoError::new(RepoErrorKind::Crypto, None)
265265+ .with_context(format!("unsupported key type: 0x{:x}", code)),
266266+ CommitError::Serialization(e) => RepoError::new(RepoErrorKind::Serialization, Some(e)),
267267+ }
268268+ }
269269+}
/// Diff-specific errors
///
/// Raised when a computed diff exceeds sync protocol limits.
// NOTE(review): unlike the sibling error enums, this one does not derive
// miette::Diagnostic — confirm whether that is intentional.
#[derive(Debug, thiserror::Error)]
pub enum DiffError {
    /// Too many operations
    #[error("Too many operations: {count} (max {max})")]
    TooManyOps {
        /// Actual operation count
        count: usize,
        /// Maximum allowed operations
        max: usize,
    },

    /// Diff too large
    #[error("Diff too large: {size} bytes (max {max})")]
    TooLarge {
        /// Actual size
        size: usize,
        /// Maximum size
        max: usize,
    },
}
292292+293293+impl From<DiffError> for RepoError {
294294+ fn from(e: DiffError) -> Self {
295295+ match e {
296296+ DiffError::TooManyOps { count, max } => {
297297+ RepoError::too_large("diff operation count", count, max)
298298+ }
299299+ DiffError::TooLarge { size, max } => RepoError::too_large("diff size", size, max),
300300+ }
301301+ }
302302+}
/// Proof verification errors
///
/// Each variant carries a miette diagnostic code under the `proof::` namespace
/// so failures can be identified programmatically and rendered with help text.
#[derive(Debug, thiserror::Error, miette::Diagnostic)]
pub enum ProofError {
    /// CAR file has no root CID
    #[error("CAR file has no root CID")]
    #[diagnostic(
        code(proof::no_root),
        help("CAR files for proofs must have exactly one root CID pointing to the commit")
    )]
    NoRoot,

    /// Commit block not found in CAR
    #[error("Commit block not found in CAR")]
    #[diagnostic(
        code(proof::commit_not_found),
        help("The CAR root CID must point to a valid commit block")
    )]
    CommitNotFound,

    /// DID mismatch between commit and expected
    #[error("DID mismatch: commit has {commit_did}, expected {expected_did}")]
    #[diagnostic(
        code(proof::did_mismatch),
        help("The commit must be signed by the expected DID")
    )]
    DidMismatch {
        /// DID in the commit
        commit_did: String,
        /// Expected DID
        expected_did: String,
    },

    /// Signature verification failed
    #[error("Signature verification failed")]
    #[diagnostic(
        code(proof::signature_failed),
        help("The commit signature must be valid for the provided public key")
    )]
    SignatureVerificationFailed {
        /// Underlying crypto error
        #[source]
        source: CommitError,
    },

    /// MST root block missing from CAR
    #[error("MST root block missing from CAR: {cid}")]
    #[diagnostic(
        code(proof::missing_mst_block),
        help("All MST blocks along the proof path must be included in the CAR file")
    )]
    MissingMstBlock {
        /// The missing CID (rendered as a string for the message)
        cid: String,
    },

    /// Invalid commit structure
    #[error("Invalid commit structure: {0}")]
    #[diagnostic(code(proof::invalid_commit))]
    InvalidCommit(String),

    /// CAR parsing failed
    #[error("CAR parsing failed")]
    #[diagnostic(code(proof::car_parse_failed))]
    CarParseFailed {
        /// Underlying error
        #[source]
        source: BoxError,
    },

    /// Commit deserialization failed
    #[error("Commit deserialization failed")]
    #[diagnostic(code(proof::commit_deserialize_failed))]
    CommitDeserializeFailed {
        /// Underlying error
        #[source]
        source: BoxError,
    },
}
382382+383383+impl From<ProofError> for RepoError {
384384+ fn from(e: ProofError) -> Self {
385385+ match &e {
386386+ ProofError::NoRoot => RepoError::invalid("CAR file has no root CID"),
387387+ ProofError::CommitNotFound => {
388388+ RepoError::new(RepoErrorKind::NotFound, Some(Box::new(e)))
389389+ }
390390+ ProofError::DidMismatch { .. } => {
391391+ RepoError::new(RepoErrorKind::InvalidCommit, Some(Box::new(e)))
392392+ }
393393+ ProofError::SignatureVerificationFailed { .. } => {
394394+ RepoError::new(RepoErrorKind::Crypto, Some(Box::new(e)))
395395+ }
396396+ ProofError::MissingMstBlock { .. } => {
397397+ RepoError::new(RepoErrorKind::NotFound, Some(Box::new(e)))
398398+ }
399399+ ProofError::InvalidCommit(_) => {
400400+ RepoError::new(RepoErrorKind::InvalidCommit, Some(Box::new(e)))
401401+ }
402402+ ProofError::CarParseFailed { .. } => {
403403+ RepoError::new(RepoErrorKind::Car, Some(Box::new(e)))
404404+ }
405405+ ProofError::CommitDeserializeFailed { .. } => {
406406+ RepoError::new(RepoErrorKind::Serialization, Some(Box::new(e)))
407407+ }
408408+ }
409409+ }
410410+}
+62
crates/jacquard-repo/src/lib.rs
···11+//! AT Protocol repository primitives
22+//!
33+//! This crate provides building blocks for working with AT Protocol repositories:
44+//!
55+//! - **MST (Merkle Search Tree)**: Immutable tree operations with deterministic structure
66+//! - **Commits**: Signed commit structures for Sync v1 (version 2) and v1.1 (version 3)
77+//! - **CAR I/O**: Import and export repositories in CAR (Content Addressable aRchive) format
88+//! - **Storage**: Pluggable block storage abstraction with in-memory and file-backed implementations
99+//!
1010+//! # Design Philosophy
1111+//!
1212+//! - Core primitives are always available (MST, commits, storage)
1313+//! - Optional high-level Repository API for convenience
1414+//! - Immutable MST operations for referential transparency
1515+//! - Zero-copy deserialization where possible
1616+//! - Support for both current and future sync protocol versions
1717+//!
1818+//! # Example
1919+//!
2020+//! ```rust,ignore
2121+//! use jacquard_repo::{Mst, MemoryBlockStore};
2222+//! use cid::Cid;
2323+//!
2424+//! # async fn example() -> Result<(), Box<dyn std::error::Error>> {
2525+//! let storage = MemoryBlockStore::new();
2626+//! let mst = Mst::new(storage);
2727+//!
2828+//! // Add entries
2929+//! let cid = /* ... */;
3030+//! let new_mst = mst.add("app.bsky.feed.post/abc123", cid).await?;
3131+//!
3232+//! // Retrieve
3333+//! if let Some(value) = new_mst.get("app.bsky.feed.post/abc123").await? {
3434+//! println!("Found: {}", value);
3535+//! }
3636+//! # Ok(())
3737+//! # }
3838+//! ```
3939+4040+#![warn(missing_docs)]
4141+#![warn(clippy::all)]
4242+#![deny(unsafe_code)]
/// CAR (Content Addressable aRchive) utilities
pub mod car;
/// Commit structures and signature verification
pub mod commit;
/// Error types for repository operations
pub mod error;
/// Merkle Search Tree implementation
pub mod mst;
/// High-level repository operations
pub mod repo;
/// Block storage abstraction
pub mod storage;

pub use error::{RepoError, RepoErrorKind, Result};
pub use mst::{Mst, MstDiff, WriteOp};
pub use repo::{CommitData, Repository};
pub use storage::{BlockStore, FileBlockStore, LayeredBlockStore, MemoryBlockStore};

/// DAG-CBOR codec identifier for CIDs (0x71)
pub const DAG_CBOR_CID_CODEC: u64 = 0x71;
+399
crates/jacquard-repo/src/mst/diff.rs
···11+//! MST diff calculation
22+33+use super::tree::Mst;
44+use crate::error::Result;
55+use crate::storage::BlockStore;
66+use cid::Cid as IpldCid;
77+use smol_str::SmolStr;
88+use std::collections::HashMap;
/// Diff between two MST states
///
/// Represents the changes needed to transform one tree into another.
/// Used for firehose validation and batch operations.
///
/// Ordering within each list is unspecified (it derives from HashMap
/// iteration in `Mst::diff`); consumers should not rely on it.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct MstDiff {
    /// New records created (key, new CID)
    pub creates: Vec<(SmolStr, IpldCid)>,

    /// Records updated (key, new CID, old CID)
    pub updates: Vec<(SmolStr, IpldCid, IpldCid)>,

    /// Records deleted (key, old CID)
    pub deletes: Vec<(SmolStr, IpldCid)>,
}
2525+2626+use super::tree::VerifiedWriteOp;
2727+2828+impl MstDiff {
2929+ /// Create empty diff
3030+ pub fn new() -> Self {
3131+ Self {
3232+ creates: Vec::new(),
3333+ updates: Vec::new(),
3434+ deletes: Vec::new(),
3535+ }
3636+ }
3737+3838+ /// Check if diff is empty (no changes)
3939+ pub fn is_empty(&self) -> bool {
4040+ self.creates.is_empty() && self.updates.is_empty() && self.deletes.is_empty()
4141+ }
4242+4343+ /// Count total operations
4444+ pub fn op_count(&self) -> usize {
4545+ self.creates.len() + self.updates.len() + self.deletes.len()
4646+ }
4747+4848+ /// Validate against sync v1.1 limits
4949+ ///
5050+ /// The sync protocol has a 200 operation limit per commit.
5151+ pub fn validate_limits(&self) -> Result<()> {
5252+ if self.op_count() > 200 {
5353+ return Err(crate::error::RepoError::too_large(
5454+ "diff operation count",
5555+ self.op_count(),
5656+ 200,
5757+ ));
5858+ }
5959+ Ok(())
6060+ }
6161+6262+ /// Convert diff to verified write operations
6363+ ///
6464+ /// Returns operations that can be safely applied with `batch()`.
6565+ /// All update/delete operations include verified prev CIDs.
6666+ pub fn to_verified_ops(&self) -> Vec<VerifiedWriteOp> {
6767+ let mut ops = Vec::with_capacity(self.op_count());
6868+6969+ // Add creates
7070+ for (key, cid) in &self.creates {
7171+ ops.push(VerifiedWriteOp::Create {
7272+ key: key.clone(),
7373+ cid: *cid,
7474+ });
7575+ }
7676+7777+ // Add updates (includes prev)
7878+ for (key, new_cid, old_cid) in &self.updates {
7979+ ops.push(VerifiedWriteOp::Update {
8080+ key: key.clone(),
8181+ cid: *new_cid,
8282+ prev: *old_cid,
8383+ });
8484+ }
8585+8686+ // Add deletes (includes prev)
8787+ for (key, old_cid) in &self.deletes {
8888+ ops.push(VerifiedWriteOp::Delete {
8989+ key: key.clone(),
9090+ prev: *old_cid,
9191+ });
9292+ }
9393+9494+ ops
9595+ }
9696+9797+ /// Convert diff to firehose repository operations
9898+ ///
9999+ /// Returns operations in the format used by `com.atproto.sync.subscribeRepos`.
100100+ /// All update/delete operations include prev CIDs for sync v1.1 validation.
101101+ pub fn to_repo_ops(&self) -> Vec<crate::commit::firehose::RepoOp<'_>> {
102102+ use jacquard_common::types::cid::CidLink;
103103+104104+ let mut ops = Vec::with_capacity(self.op_count());
105105+106106+ // Add creates
107107+ for (key, cid) in &self.creates {
108108+ ops.push(crate::commit::firehose::RepoOp {
109109+ action: "create".into(),
110110+ path: key.as_str().into(),
111111+ cid: Some(CidLink::from(*cid)),
112112+ prev: None,
113113+ });
114114+ }
115115+116116+ // Add updates
117117+ for (key, new_cid, old_cid) in &self.updates {
118118+ ops.push(crate::commit::firehose::RepoOp {
119119+ action: "update".into(),
120120+ path: key.as_str().into(),
121121+ cid: Some(CidLink::from(*new_cid)),
122122+ prev: Some(CidLink::from(*old_cid)),
123123+ });
124124+ }
125125+126126+ // Add deletes
127127+ for (key, old_cid) in &self.deletes {
128128+ ops.push(crate::commit::firehose::RepoOp {
129129+ action: "delete".into(),
130130+ path: key.as_str().into(),
131131+ cid: None, // null for deletes
132132+ prev: Some(CidLink::from(*old_cid)),
133133+ });
134134+ }
135135+136136+ ops
137137+ }
138138+}
139139+140140+impl Default for MstDiff {
141141+ fn default() -> Self {
142142+ Self::new()
143143+ }
144144+}
145145+146146+impl<S: BlockStore + Sync + 'static> Mst<S> {
147147+ /// Compute diff from this tree to another
148148+ ///
149149+ /// Returns operations needed to transform `self` into `other`.
150150+ /// - Creates: keys in `other` but not in `self`
151151+ /// - Updates: keys in both but with different CIDs
152152+ /// - Deletes: keys in `self` but not in `other`
153153+ pub async fn diff(&self, other: &Mst<S>) -> Result<MstDiff> {
154154+ // Collect all leaves from both trees
155155+ let self_leaves = self.leaves().await?;
156156+ let other_leaves = other.leaves().await?;
157157+158158+ // Build hashmaps for efficient lookup
159159+ let self_map: HashMap<SmolStr, IpldCid> = self_leaves.into_iter().collect();
160160+ let other_map: HashMap<SmolStr, IpldCid> = other_leaves.into_iter().collect();
161161+162162+ let mut diff = MstDiff::new();
163163+164164+ // Find creates and updates
165165+ for (key, new_cid) in &other_map {
166166+ match self_map.get(key) {
167167+ Some(old_cid) => {
168168+ // Key exists in both - check if CID changed
169169+ if old_cid != new_cid {
170170+ diff.updates.push((key.clone(), *new_cid, *old_cid));
171171+ }
172172+ }
173173+ None => {
174174+ // Key only in other - create
175175+ diff.creates.push((key.clone(), *new_cid));
176176+ }
177177+ }
178178+ }
179179+180180+ // Find deletes
181181+ for (key, old_cid) in &self_map {
182182+ if !other_map.contains_key(key) {
183183+ // Key only in self - delete
184184+ diff.deletes.push((key.clone(), *old_cid));
185185+ }
186186+ }
187187+188188+ Ok(diff)
189189+ }
190190+191191+ /// Compute diff from this tree to empty (all deletes)
192192+ ///
193193+ /// Returns diff representing deletion of all records in this tree.
194194+ pub async fn diff_to_empty(&self) -> Result<MstDiff> {
195195+ let leaves = self.leaves().await?;
196196+197197+ Ok(MstDiff {
198198+ creates: Vec::new(),
199199+ updates: Vec::new(),
200200+ deletes: leaves,
201201+ })
202202+ }
203203+}
#[cfg(test)]
mod tests {
    //! Unit tests for MST diff computation and limit validation.

    use jacquard_common::types::crypto::SHA2_256;

    use super::*;
    use crate::{DAG_CBOR_CID_CODEC, storage::memory::MemoryBlockStore};
    use std::sync::Arc;

    // Build a deterministic CID from a repeated byte, used as a fake record value.
    fn test_cid(n: u8) -> IpldCid {
        let data = vec![n; 32];
        let mh = multihash::Multihash::wrap(SHA2_256, &data).unwrap();
        IpldCid::new_v1(DAG_CBOR_CID_CODEC, mh)
    }

    #[tokio::test]
    async fn test_diff_empty_trees() {
        let storage = Arc::new(MemoryBlockStore::new());
        let tree1 = Mst::new(storage.clone());
        let tree2 = Mst::new(storage);

        let diff = tree1.diff(&tree2).await.unwrap();

        assert!(diff.is_empty());
        assert_eq!(diff.op_count(), 0);
    }

    #[tokio::test]
    async fn test_diff_creates() {
        let storage1 = Arc::new(MemoryBlockStore::new());
        let tree1 = Mst::new(storage1);

        let storage2 = Arc::new(MemoryBlockStore::new());
        let tree2 = Mst::new(storage2);
        let tree2 = tree2.add("a", test_cid(1)).await.unwrap();
        let tree2 = tree2.add("b", test_cid(2)).await.unwrap();

        let diff = tree1.diff(&tree2).await.unwrap();

        assert_eq!(diff.creates.len(), 2);
        assert_eq!(diff.updates.len(), 0);
        assert_eq!(diff.deletes.len(), 0);
        assert_eq!(diff.op_count(), 2);

        // Check creates content (membership only — ordering is unspecified)
        assert!(
            diff.creates
                .iter()
                .any(|(k, c)| k == "a" && *c == test_cid(1))
        );
        assert!(
            diff.creates
                .iter()
                .any(|(k, c)| k == "b" && *c == test_cid(2))
        );
    }

    #[tokio::test]
    async fn test_diff_deletes() {
        let storage1 = Arc::new(MemoryBlockStore::new());
        let tree1 = Mst::new(storage1);
        let tree1 = tree1.add("a", test_cid(1)).await.unwrap();
        let tree1 = tree1.add("b", test_cid(2)).await.unwrap();

        let storage2 = Arc::new(MemoryBlockStore::new());
        let tree2 = Mst::new(storage2);

        let diff = tree1.diff(&tree2).await.unwrap();

        assert_eq!(diff.creates.len(), 0);
        assert_eq!(diff.updates.len(), 0);
        assert_eq!(diff.deletes.len(), 2);
        assert_eq!(diff.op_count(), 2);

        // Check deletes content (membership only — ordering is unspecified)
        assert!(
            diff.deletes
                .iter()
                .any(|(k, c)| k == "a" && *c == test_cid(1))
        );
        assert!(
            diff.deletes
                .iter()
                .any(|(k, c)| k == "b" && *c == test_cid(2))
        );
    }

    #[tokio::test]
    async fn test_diff_updates() {
        let storage1 = Arc::new(MemoryBlockStore::new());
        let tree1 = Mst::new(storage1);
        let tree1 = tree1.add("a", test_cid(1)).await.unwrap();
        let tree1 = tree1.add("b", test_cid(2)).await.unwrap();

        let storage2 = Arc::new(MemoryBlockStore::new());
        let tree2 = Mst::new(storage2);
        let tree2 = tree2.add("a", test_cid(10)).await.unwrap(); // Changed CID
        let tree2 = tree2.add("b", test_cid(2)).await.unwrap(); // Same CID

        let diff = tree1.diff(&tree2).await.unwrap();

        assert_eq!(diff.creates.len(), 0);
        assert_eq!(diff.updates.len(), 1); // Only "a" changed
        assert_eq!(diff.deletes.len(), 0);
        assert_eq!(diff.op_count(), 1);

        // Check update content: tuple is (key, new CID, old CID)
        assert_eq!(diff.updates[0].0, "a");
        assert_eq!(diff.updates[0].1, test_cid(10)); // new CID
        assert_eq!(diff.updates[0].2, test_cid(1)); // old CID
    }

    #[tokio::test]
    async fn test_diff_mixed_operations() {
        let storage1 = Arc::new(MemoryBlockStore::new());
        let tree1 = Mst::new(storage1);
        let tree1 = tree1.add("a", test_cid(1)).await.unwrap();
        let tree1 = tree1.add("b", test_cid(2)).await.unwrap();
        let tree1 = tree1.add("c", test_cid(3)).await.unwrap();

        let storage2 = Arc::new(MemoryBlockStore::new());
        let tree2 = Mst::new(storage2);
        let tree2 = tree2.add("a", test_cid(10)).await.unwrap(); // Updated
        let tree2 = tree2.add("b", test_cid(2)).await.unwrap(); // Unchanged
        // "c" deleted
        let tree2 = tree2.add("d", test_cid(4)).await.unwrap(); // Created

        let diff = tree1.diff(&tree2).await.unwrap();

        assert_eq!(diff.creates.len(), 1); // "d"
        assert_eq!(diff.updates.len(), 1); // "a"
        assert_eq!(diff.deletes.len(), 1); // "c"
        assert_eq!(diff.op_count(), 3);
    }

    #[tokio::test]
    async fn test_diff_to_empty() {
        let storage = Arc::new(MemoryBlockStore::new());
        let tree = Mst::new(storage);
        let tree = tree.add("a", test_cid(1)).await.unwrap();
        let tree = tree.add("b", test_cid(2)).await.unwrap();
        let tree = tree.add("c", test_cid(3)).await.unwrap();

        let diff = tree.diff_to_empty().await.unwrap();

        assert_eq!(diff.creates.len(), 0);
        assert_eq!(diff.updates.len(), 0);
        assert_eq!(diff.deletes.len(), 3);
        assert_eq!(diff.op_count(), 3);
    }

    #[tokio::test]
    async fn test_validate_limits() {
        let mut diff = MstDiff::new();

        // Add 200 creates (at limit)
        for i in 0..200 {
            diff.creates
                .push((SmolStr::new(&format!("key{}", i)), test_cid(1)));
        }

        // Should be ok at exactly 200
        assert!(diff.validate_limits().is_ok());

        // Add one more - should fail
        diff.creates.push((SmolStr::new("key201"), test_cid(1)));
        assert!(diff.validate_limits().is_err());
    }

    #[tokio::test]
    async fn test_diff_symmetry() {
        // diff(A, B) should be inverse of diff(B, A)
        let storage1 = Arc::new(MemoryBlockStore::new());
        let tree1 = Mst::new(storage1);
        let tree1 = tree1.add("a", test_cid(1)).await.unwrap();
        let tree1 = tree1.add("b", test_cid(2)).await.unwrap();

        let storage2 = Arc::new(MemoryBlockStore::new());
        let tree2 = Mst::new(storage2);
        let tree2 = tree2.add("b", test_cid(2)).await.unwrap();
        let tree2 = tree2.add("c", test_cid(3)).await.unwrap();

        let diff1 = tree1.diff(&tree2).await.unwrap();
        let diff2 = tree2.diff(&tree1).await.unwrap();

        // diff1: creates="c", deletes="a"
        // diff2: creates="a", deletes="c"
        assert_eq!(diff1.creates.len(), 1);
        assert_eq!(diff1.deletes.len(), 1);
        assert_eq!(diff2.creates.len(), 1);
        assert_eq!(diff2.deletes.len(), 1);

        assert_eq!(diff1.creates[0].0, diff2.deletes[0].0); // "c"
        assert_eq!(diff1.deletes[0].0, diff2.creates[0].0); // "a"
    }
}
+10
crates/jacquard-repo/src/mst/mod.rs
···11+//! Merkle Search Tree implementation
/// MST node data structures (in-memory and wire format)
pub mod node;
/// Core immutable tree operations
pub mod tree;
/// MST utility helpers
pub mod util;
/// Diff calculation between MST states
pub mod diff;

pub use node::{NodeData, NodeEntry, TreeEntry};
pub use tree::{Mst, WriteOp};
pub use diff::MstDiff;
+124
crates/jacquard-repo/src/mst/node.rs
···11+//! MST node data structures
22+33+use bytes::Bytes;
44+use cid::Cid as IpldCid;
55+use smol_str::SmolStr;
/// Entry in an MST node - either a subtree or a leaf
///
/// This is the in-memory representation used for tree operations.
/// MST operations work on flat `Vec<NodeEntry>` where entries are interleaved:
/// `[Tree, Leaf, Tree, Leaf, Leaf, Tree]` etc.
///
/// The wire format (CBOR) is different - see `NodeData` and `TreeEntry`.
// NOTE(review): the `S: BlockStore` bound on the type definition (rather than
// only on impls) is required here because `Mst<S>` carries the same bound.
#[derive(Debug, Clone)]
pub enum NodeEntry<S: crate::storage::BlockStore> {
    /// Subtree reference
    ///
    /// Will be lazily loaded from storage when needed.
    Tree(crate::mst::Mst<S>),

    /// Leaf node with key-value pair
    Leaf {
        /// Full key (not prefix-compressed in memory)
        key: SmolStr,
        /// CID of the record value
        value: IpldCid,
    },
}
2929+3030+impl<S: crate::storage::BlockStore> NodeEntry<S> {
3131+ /// Check if this is a tree entry
3232+ pub fn is_tree(&self) -> bool {
3333+ matches!(self, NodeEntry::Tree(_))
3434+ }
3535+3636+ /// Check if this is a leaf entry
3737+ pub fn is_leaf(&self) -> bool {
3838+ matches!(self, NodeEntry::Leaf { .. })
3939+ }
4040+4141+ /// Get the key if this is a leaf
4242+ pub fn leaf_key(&self) -> Option<&str> {
4343+ match self {
4444+ NodeEntry::Leaf { key, .. } => Some(key.as_str()),
4545+ NodeEntry::Tree(_) => None,
4646+ }
4747+ }
4848+}
4949+5050+// ============================================================================
5151+// Wire format structures (for CBOR serialization)
5252+// ============================================================================
5353+//
5454+// These represent the on-disk/network format with prefix compression.
5555+// Conversion functions will be in util.rs.
/// Wire format entry (prefix-compressed leaf with optional subtree pointer)
///
/// This is what gets serialized to CBOR. In memory, we use the flat
/// `Vec<NodeEntry>` representation instead; see `NodeData` for the
/// conversion rules between the two shapes.
///
/// **IMPORTANT:** Fields MUST be in alphabetical order (k, p, t, v) to match
/// DAG-CBOR canonical form. Even though serde_ipld_dagcbor should handle this,
/// we define them in order to be explicit.
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
pub struct TreeEntry {
    /// Key suffix after prefix (stored as bytes in wire format)
    ///
    /// Must use serde_bytes to serialize as CBOR byte string (major type 2)
    /// instead of array of integers (major type 4)
    #[serde(rename = "k", with = "crate::commit::serde_bytes_helper")]
    pub key_suffix: Bytes,

    /// Prefix length (shared chars with previous key in node)
    ///
    /// Must be u8 (not usize) to match CBOR encoding in reference implementations
    #[serde(rename = "p")]
    pub prefix_len: u8,

    /// Optional subtree pointer (CID of child MST node)
    ///
    /// Serializes as explicit `null` when None (AT Protocol spec requirement for determinism).
    #[serde(rename = "t")]
    pub tree: Option<IpldCid>,

    /// CID of the record value
    #[serde(rename = "v")]
    pub value: IpldCid,
}
9090+9191+/// Wire format node data (serialized as DAG-CBOR)
9292+///
9393+/// This is the structure that gets written to storage. The in-memory
9494+/// representation uses `Vec<NodeEntry>` instead.
9595+///
9696+/// # Conversion rules
9797+///
9898+/// **Serialization (flat → wire):**
9999+/// - First entry if `Tree` → becomes `left` pointer
100100+/// - Each `Leaf` → becomes entry in `entries`
101101+/// - `Tree` after `Leaf` → becomes that leaf's `tree` pointer
102102+///
103103+/// **Deserialization (wire → flat):**
104104+/// - `left` if present → prepend `Tree` entry
105105+/// - Each entry → append `Leaf`
106106+/// - Each `tree` if present → append `Tree` entry
107107+///
108108+/// # Nullability requirement
109109+///
110110+/// **CRITICAL:** All `Option<T>` fields MUST serialize as explicit `null` (not skip).
111111+/// This is an AT Protocol spec requirement for cross-implementation determinism.
112112+/// Skipping vs explicit null produces different CBOR bytes → different CIDs → broken interop.
113113+#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
114114+pub struct NodeData {
115115+ /// Left-most subtree pointer
116116+ ///
117117+ /// Serializes as explicit `null` when None (AT Protocol spec requirement for determinism).
118118+ #[serde(rename = "l")]
119119+ pub left: Option<IpldCid>,
120120+121121+ /// Entries in this node (sorted by full key, prefix-compressed)
122122+ #[serde(rename = "e")]
123123+ pub entries: Vec<TreeEntry>,
124124+}
+1571
crates/jacquard-repo/src/mst/tree.rs
···11+//! Immutable Merkle Search Tree operations
22+33+use super::node::NodeEntry;
44+use super::util;
55+use crate::error::{RepoError, Result};
66+use crate::storage::BlockStore;
77+use cid::Cid as IpldCid;
88+use smol_str::SmolStr;
99+use std::sync::Arc;
1010+use tokio::sync::RwLock;
/// Write operation for batch application
///
/// Represents a single operation to apply to an MST.
/// For firehose operations where `prev` may be optional (v3).
///
/// Unlike [`VerifiedWriteOp`], the `prev` values here are optional and
/// carry no guarantee of having been checked against tree state.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum WriteOp {
    /// Create new record (error if exists)
    Create {
        /// Record key (collection/rkey)
        key: SmolStr,
        /// Record CID
        cid: IpldCid,
    },

    /// Update existing record (error if not exists)
    ///
    /// `prev` is optional in v3 (required in v2)
    Update {
        /// Record key (collection/rkey)
        key: SmolStr,
        /// New record CID
        cid: IpldCid,
        /// Previous CID (optional for validation)
        prev: Option<IpldCid>,
    },

    /// Delete record
    ///
    /// `prev` is optional in v3 (required in v2)
    Delete {
        /// Record key (collection/rkey)
        key: SmolStr,
        /// Previous CID (optional for validation)
        prev: Option<IpldCid>,
    },
}
/// Verified write operation with required prev fields
///
/// Used for operations where prev CID has been verified against tree state.
/// Safer than `WriteOp` because it always validates prev values.
///
/// Consumed by [`Mst::batch`], which re-checks every `prev` against the
/// current tree before applying the operation.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum VerifiedWriteOp {
    /// Create new record (verified not to exist)
    Create {
        /// Record key (collection/rkey)
        key: SmolStr,
        /// Record CID
        cid: IpldCid,
    },

    /// Update existing record (with verified prev CID)
    Update {
        /// Record key (collection/rkey)
        key: SmolStr,
        /// New record CID
        cid: IpldCid,
        /// Previous CID (required, validated)
        prev: IpldCid,
    },

    /// Delete record (with verified current CID)
    Delete {
        /// Record key (collection/rkey)
        key: SmolStr,
        /// Previous CID (required, validated)
        prev: IpldCid,
    },
}
/// Immutable Merkle Search Tree
///
/// MST operations return new tree instances, leaving the original unchanged.
/// This enables versioning and safe concurrent access to different tree versions.
///
/// Cloning an `Mst` is cheap: every field is behind an `Arc`, so clones
/// share the storage handle, the entry cache, and the pointer cell.
///
/// # Architecture
///
/// The tree uses a flat `Vec<NodeEntry>` representation in memory, where
/// `NodeEntry` is an enum of `Tree` (subtree) and `Leaf` (key-value pair).
///
/// Entries are interleaved: `[Tree, Leaf, Tree, Leaf, Leaf, Tree]` etc.
/// This representation makes operations simple (Vec slicing, splicing).
///
/// The wire format (CBOR) uses prefix compression and pointers (left/tree).
/// See `NodeData` and `TreeEntry` in node.rs for serialization format.
///
/// # Layer-based structure
///
/// Keys are hashed (SHA-256) and leading zero bits determine layer:
/// - More leading zeros = higher layer (deeper in tree)
/// - Layer = floor(leading_zeros / 2) for ~4 fanout
/// - Deterministic and insertion-order independent
#[derive(Debug, Clone)]
pub struct Mst<S: BlockStore> {
    /// Block storage for loading/saving nodes (shared via Arc)
    storage: Arc<S>,

    /// Flat list of entries (lazy-loaded, interior mutable)
    ///
    /// `None` means not yet loaded from storage.
    /// Empty `Vec` means tree has been loaded and has no entries.
    entries: Arc<RwLock<Option<Vec<NodeEntry<S>>>>>,

    /// CID pointer to this node in storage (interior mutable)
    pointer: Arc<RwLock<IpldCid>>,

    /// Whether pointer is stale (entries modified, interior mutable)
    ///
    /// When `true`, `pointer` doesn't match current `entries`.
    /// Call `get_pointer()` to recompute and update.
    outdated_pointer: Arc<RwLock<bool>>,

    /// Layer hint for this node
    ///
    /// `None` means layer unknown (will be computed from entries).
    /// Layer is the maximum layer of any key in this node.
    layer: Option<usize>,
}
130130+131131+impl<S: BlockStore + Sync + 'static> Mst<S> {
132132+ /// Create empty MST
133133+ pub fn new(storage: Arc<S>) -> Self {
134134+ Self {
135135+ storage,
136136+ entries: Arc::new(RwLock::new(Some(Vec::new()))),
137137+ pointer: Arc::new(RwLock::new(IpldCid::default())),
138138+ outdated_pointer: Arc::new(RwLock::new(true)),
139139+ layer: Some(0),
140140+ }
141141+ }
    /// Create MST with existing entries
    ///
    /// Used internally for tree operations.
    /// Computes CID from entries (doesn't persist to storage).
    ///
    /// # Errors
    ///
    /// Fails if the entries cannot be serialized to DAG-CBOR or the CID
    /// cannot be computed.
    pub(crate) async fn create(
        storage: Arc<S>,
        entries: Vec<NodeEntry<S>>,
        layer: Option<usize>,
    ) -> Result<Self> {
        // Serialize and compute CID (don't persist yet)
        let node_data = util::serialize_node_data(&entries).await?;
        let cbor =
            serde_ipld_dagcbor::to_vec(&node_data).map_err(|e| RepoError::serialization(e))?;
        let cid = util::compute_cid(&cbor)?;

        // Pointer is fresh (not outdated): it was just computed from `entries`.
        let mst = Self {
            storage,
            entries: Arc::new(RwLock::new(Some(entries))),
            pointer: Arc::new(RwLock::new(cid)),
            outdated_pointer: Arc::new(RwLock::new(false)),
            layer,
        };

        Ok(mst)
    }
168168+169169+ /// Load MST from CID (lazy)
170170+ ///
171171+ /// Doesn't actually load from storage until entries are accessed.
172172+ pub fn load(storage: Arc<S>, cid: IpldCid, layer: Option<usize>) -> Self {
173173+ Self {
174174+ storage,
175175+ entries: Arc::new(RwLock::new(None)), // Not loaded yet
176176+ pointer: Arc::new(RwLock::new(cid)),
177177+ outdated_pointer: Arc::new(RwLock::new(false)),
178178+ layer,
179179+ }
180180+ }
181181+182182+ /// Create new tree with modified entries
183183+ ///
184184+ /// Returns a new Mst with updated entries. Marks pointer as outdated.
185185+ async fn new_tree(&self, entries: Vec<NodeEntry<S>>) -> Result<Self> {
186186+ Ok(Self {
187187+ storage: self.storage.clone(),
188188+ entries: Arc::new(RwLock::new(Some(entries))),
189189+ pointer: self.pointer.clone(),
190190+ outdated_pointer: Arc::new(RwLock::new(true)),
191191+ layer: self.layer,
192192+ })
193193+ }
    /// Get entries (lazy load if needed)
    ///
    /// Returns a clone of the cached entries, loading them from storage and
    /// caching them on first access.
    async fn get_entries(&self) -> Result<Vec<NodeEntry<S>>> {
        // Fast path: already loaded. The read guard is dropped at the end of
        // this scope, before any storage I/O below.
        {
            let entries_guard = self.entries.read().await;
            if let Some(ref entries) = *entries_guard {
                return Ok(entries.clone());
            }
        }

        // Load from storage
        let pointer = *self.pointer.read().await;
        let node_bytes = self
            .storage
            .get(&pointer)
            .await?
            .ok_or_else(|| RepoError::not_found("MST node", &pointer))?;

        let node_data: super::node::NodeData =
            serde_ipld_dagcbor::from_slice(&node_bytes).map_err(|e| RepoError::serialization(e))?;

        let entries = util::deserialize_node_data(self.storage.clone(), &node_data, self.layer)?;

        // Cache the loaded entries. Concurrent callers may race to this
        // write, but both deserialized the same node, so the writes agree.
        {
            let mut entries_guard = self.entries.write().await;
            *entries_guard = Some(entries.clone());
        }

        Ok(entries)
    }
    /// Get CID pointer (recompute if outdated)
    ///
    /// Computes CID from current entries but doesn't persist to storage.
    /// Use `collect_blocks()` to gather blocks for persistence.
    pub async fn get_pointer(&self) -> Result<IpldCid> {
        // Fast path: cached pointer is still valid.
        let outdated = *self.outdated_pointer.read().await;
        if !outdated {
            return Ok(*self.pointer.read().await);
        }

        // Serialize and compute CID (don't persist yet)
        let entries = self.get_entries().await?;
        let node_data = util::serialize_node_data(&entries).await?;
        let cbor =
            serde_ipld_dagcbor::to_vec(&node_data).map_err(|e| RepoError::serialization(e))?;
        let cid = util::compute_cid(&cbor)?;

        // Update pointer and mark as fresh. The two cells are updated under
        // separate locks; pointer is written first so a reader that still
        // sees outdated=true simply recomputes the same CID.
        {
            let mut pointer_guard = self.pointer.write().await;
            *pointer_guard = cid;
        }
        {
            let mut outdated_guard = self.outdated_pointer.write().await;
            *outdated_guard = false;
        }

        Ok(cid)
    }
    /// Get root CID (alias for get_pointer)
    ///
    /// Convenience wrapper; recomputes the CID if entries were modified.
    pub async fn root(&self) -> Result<IpldCid> {
        self.get_pointer().await
    }

    /// Get shared reference to the block storage
    ///
    /// Useful for CAR export and other operations that need direct storage access.
    pub fn storage(&self) -> &Arc<S> {
        &self.storage
    }
    /// Get the layer of this node
    ///
    /// Layer is the maximum layer of any leaf key in this node.
    /// For nodes with no leaves, recursively checks subtrees.
    ///
    /// Returned as a boxed future because the function is recursive.
    fn get_layer<'a>(
        &'a self,
    ) -> std::pin::Pin<Box<dyn std::future::Future<Output = Result<usize>> + Send + 'a>> {
        Box::pin(async move {
            // Cached hint wins; set at construction for most paths.
            if let Some(layer) = self.layer {
                return Ok(layer);
            }

            // Compute layer from entries
            let entries = self.get_entries().await?;

            // Find first leaf and get its layer. Any leaf in this node is at
            // this node's layer, so the first one suffices.
            for entry in &entries {
                if let NodeEntry::Leaf { key, .. } = entry {
                    let layer = util::layer_for_key(key.as_str());
                    return Ok(layer);
                }
            }

            // No leaves found - check first subtree; a direct child sits one
            // layer below this node.
            for entry in &entries {
                if let NodeEntry::Tree(subtree) = entry {
                    let child_layer = subtree.get_layer().await?;
                    return Ok(child_layer + 1);
                }
            }

            // Empty tree
            Ok(0)
        })
    }
303303+304304+ /// Find index of first leaf >= key
305305+ ///
306306+ /// Returns `entries.len()` if all leaves are < key.
307307+ fn find_gt_or_equal_leaf_index_in(entries: &[NodeEntry<S>], key: &str) -> usize {
308308+ for (i, entry) in entries.iter().enumerate() {
309309+ if let NodeEntry::Leaf { key: leaf_key, .. } = entry {
310310+ if leaf_key.as_str() >= key {
311311+ return i;
312312+ }
313313+ }
314314+ }
315315+316316+ entries.len()
317317+ }
    /// Get a value by key
    ///
    /// Returns `Ok(None)` when the key is absent; errors only on invalid
    /// keys or storage failures. Boxed future because lookup recurses into
    /// subtrees.
    pub fn get<'a>(
        &'a self,
        key: &'a str,
    ) -> std::pin::Pin<Box<dyn std::future::Future<Output = Result<Option<IpldCid>>> + Send + 'a>>
    {
        Box::pin(async move {
            util::validate_key(key)?;

            let entries = self.get_entries().await?;
            let index = Self::find_gt_or_equal_leaf_index_in(&entries, key);

            // Check if we found exact match
            if index < entries.len() {
                if let NodeEntry::Leaf {
                    key: leaf_key,
                    value,
                } = &entries[index]
                {
                    if leaf_key.as_str() == key {
                        return Ok(Some(*value));
                    }
                }
            }

            // Not found at this level - check subtree before this index.
            // Keys smaller than the first leaf >= key can only live in the
            // subtree immediately to its left.
            if index > 0 {
                if let NodeEntry::Tree(subtree) = &entries[index - 1] {
                    return subtree.get(key).await;
                }
            }

            Ok(None)
        })
    }
    /// Add a key-value pair (returns new tree)
    ///
    /// The key's layer (derived from its hash) determines where it lands:
    /// at this node, inside a subtree below, or in a new root above.
    /// Adding a key that already exists replaces its value.
    pub fn add<'a>(
        &'a self,
        key: &'a str,
        cid: IpldCid,
    ) -> std::pin::Pin<Box<dyn std::future::Future<Output = Result<Mst<S>>> + Send + 'a>> {
        Box::pin(async move {
            util::validate_key(key)?;

            let key_layer = util::layer_for_key(key);
            let node_layer = self.get_layer().await?;
            let entries = self.get_entries().await?;

            if key_layer == node_layer {
                // Key belongs at this layer - insert here
                let index = Self::find_gt_or_equal_leaf_index_in(&entries, key);

                // Check if key already exists
                if index < entries.len() {
                    if let NodeEntry::Leaf { key: leaf_key, .. } = &entries[index] {
                        if leaf_key.as_str() == key {
                            // Key exists - replace by just inserting at same position
                            let mut new_entries = entries.clone();
                            new_entries[index] = NodeEntry::Leaf {
                                key: smol_str::SmolStr::new(key),
                                value: cid,
                            };
                            return self.new_tree(new_entries).await;
                        }
                    }
                }

                // Check entry before insertion point
                if index > 0 {
                    match &entries[index - 1] {
                        NodeEntry::Leaf { .. } => {
                            // Prev is Leaf - just splice in
                            self.splice_in(
                                NodeEntry::Leaf {
                                    key: smol_str::SmolStr::new(key),
                                    value: cid,
                                },
                                index,
                            )
                            .await
                        }
                        NodeEntry::Tree(subtree) => {
                            // Prev is Tree - split it around key: some of its
                            // keys sort below the new leaf and some above.
                            let (left, right) = subtree.split_around(key).await?;
                            self.replace_with_split(
                                index - 1,
                                left,
                                NodeEntry::Leaf {
                                    key: smol_str::SmolStr::new(key),
                                    value: cid,
                                },
                                right,
                            )
                            .await
                        }
                    }
                } else {
                    // At far left - splice in
                    self.splice_in(
                        NodeEntry::Leaf {
                            key: smol_str::SmolStr::new(key),
                            value: cid,
                        },
                        index,
                    )
                    .await
                }
            } else if key_layer < node_layer {
                // Key belongs on lower layer - recurse into subtree
                let index = Self::find_gt_or_equal_leaf_index_in(&entries, key);

                if index > 0 {
                    if let NodeEntry::Tree(prev_tree) = &entries[index - 1] {
                        // Prev is Tree - add to it
                        let new_subtree = prev_tree.add(key, cid).await?;
                        return self
                            .update_entry(index - 1, NodeEntry::Tree(new_subtree))
                            .await;
                    }
                }

                // No prev tree - create child and add to it
                let child = self.create_child().await?;
                let new_subtree = child.add(key, cid).await?;
                self.splice_in(NodeEntry::Tree(new_subtree), index).await
            } else {
                // Key belongs on higher layer - create parent layers
                let extra_layers = key_layer - node_layer;

                // Everything currently in the tree splits into the halves on
                // either side of the new key.
                let (mut left, mut right) = self.split_around(key).await?;

                // Create intermediate layers if gap > 1
                for _ in 1..extra_layers {
                    if let Some(l) = left {
                        left = Some(l.create_parent().await?);
                    }
                    if let Some(r) = right {
                        right = Some(r.create_parent().await?);
                    }
                }

                // Build new root: [left?, leaf, right?]
                let mut new_entries = Vec::new();
                if let Some(l) = left {
                    new_entries.push(NodeEntry::Tree(l));
                }
                new_entries.push(NodeEntry::Leaf {
                    key: smol_str::SmolStr::new(key),
                    value: cid,
                });
                if let Some(r) = right {
                    new_entries.push(NodeEntry::Tree(r));
                }

                Mst::create(self.storage.clone(), new_entries, Some(key_layer)).await
            }
        })
    }
478478+479479+ /// Delete a key (returns new tree)
480480+ pub fn delete<'a>(
481481+ &'a self,
482482+ key: &'a str,
483483+ ) -> std::pin::Pin<Box<dyn std::future::Future<Output = Result<Mst<S>>> + Send + 'a>> {
484484+ Box::pin(async move {
485485+ util::validate_key(key)?;
486486+487487+ let altered = self.delete_recurse(key).await?;
488488+ altered.trim_top().await
489489+ })
490490+ }
    /// Recursively delete a key
    ///
    /// Removes the leaf at whatever level it lives on. When the removed leaf
    /// sat between two subtrees, those neighbours are merged so the node
    /// keeps its interleaved Tree/Leaf shape.
    ///
    /// # Errors
    ///
    /// Fails with a not-found error when the key is absent.
    fn delete_recurse<'a>(
        &'a self,
        key: &'a str,
    ) -> std::pin::Pin<Box<dyn std::future::Future<Output = Result<Mst<S>>> + Send + 'a>> {
        Box::pin(async move {
            let entries = self.get_entries().await?;
            let index = Self::find_gt_or_equal_leaf_index_in(&entries, key);

            // Check if found at this level
            if index < entries.len() {
                if let NodeEntry::Leaf { key: leaf_key, .. } = &entries[index] {
                    if leaf_key.as_str() == key {
                        // Found it - delete this entry
                        let prev = if index > 0 {
                            Some(&entries[index - 1])
                        } else {
                            None
                        };
                        let next = entries.get(index + 1);

                        // Check if we need to merge Trees
                        if let (
                            Some(NodeEntry::Tree(prev_tree)),
                            Some(NodeEntry::Tree(next_tree)),
                        ) = (prev, next)
                        {
                            // Merge the two Trees
                            let merged = prev_tree.append_merge(next_tree).await?;

                            // Build: [0..index-1] + [merged] + [index+2..]
                            // `index >= 1` is guaranteed here because `prev`
                            // exists, so `index - 1` cannot underflow.
                            let mut new_entries = entries[..index - 1].to_vec();
                            new_entries.push(NodeEntry::Tree(merged));
                            new_entries.extend_from_slice(&entries[index + 2..]);

                            return self.new_tree(new_entries).await;
                        }

                        // Simple case: just remove the entry
                        return self.remove_entry(index).await;
                    }
                }
            }

            // Not found at this level - recurse into prev Tree
            if index > 0 {
                if let NodeEntry::Tree(prev_tree) = &entries[index - 1] {
                    let subtree = prev_tree.delete_recurse(key).await?;
                    let subtree_entries = subtree.get_entries().await?;

                    if subtree_entries.is_empty() {
                        // Subtree is now empty - remove it
                        return self.remove_entry(index - 1).await;
                    } else {
                        // Update with new subtree
                        return self.update_entry(index - 1, NodeEntry::Tree(subtree)).await;
                    }
                }
            }

            // Key not found
            Err(RepoError::not_found("key", key))
        })
    }
556556+557557+ /// Update an existing key (returns new tree)
558558+ pub async fn update(&self, key: &str, cid: IpldCid) -> Result<Mst<S>> {
559559+ util::validate_key(key)?;
560560+561561+ // Check key exists
562562+ if self.get(key).await?.is_none() {
563563+ return Err(RepoError::not_found("key", key));
564564+ }
565565+566566+ // Update is just add (which replaces)
567567+ self.add(key, cid).await
568568+ }
569569+570570+ /// Update entry at index
571571+ async fn update_entry(&self, index: usize, entry: NodeEntry<S>) -> Result<Mst<S>> {
572572+ let mut entries = self.get_entries().await?;
573573+ entries[index] = entry;
574574+ self.new_tree(entries).await
575575+ }
576576+577577+ /// Remove entry at index
578578+ async fn remove_entry(&self, index: usize) -> Result<Mst<S>> {
579579+ let mut entries = self.get_entries().await?;
580580+ entries.remove(index);
581581+ self.new_tree(entries).await
582582+ }
583583+584584+ /// Append entry to end
585585+ async fn append(&self, entry: NodeEntry<S>) -> Result<Mst<S>> {
586586+ let mut entries = self.get_entries().await?;
587587+ entries.push(entry);
588588+ self.new_tree(entries).await
589589+ }
590590+591591+ /// Prepend entry to start
592592+ async fn prepend(&self, entry: NodeEntry<S>) -> Result<Mst<S>> {
593593+ let mut entries = self.get_entries().await?;
594594+ entries.insert(0, entry);
595595+ self.new_tree(entries).await
596596+ }
597597+598598+ /// Splice in entry at index
599599+ async fn splice_in(&self, entry: NodeEntry<S>, index: usize) -> Result<Mst<S>> {
600600+ let mut entries = self.get_entries().await?;
601601+ entries.insert(index, entry);
602602+ self.new_tree(entries).await
603603+ }
604604+605605+ /// Get slice of entries
606606+ pub async fn slice(&self, start: usize, end: usize) -> Result<Vec<NodeEntry<S>>> {
607607+ let entries = self.get_entries().await?;
608608+ Ok(entries[start..end].to_vec())
609609+ }
    /// Trim top node if it only contains one subtree
    ///
    /// Applied after deletes: a root left with a single subtree entry is
    /// replaced by that subtree, repeated until the root holds more than one
    /// entry (or a leaf).
    fn trim_top(
        self,
    ) -> std::pin::Pin<Box<dyn std::future::Future<Output = Result<Mst<S>>> + Send>> {
        Box::pin(async move {
            let entries = self.get_entries().await?;

            if entries.len() == 1 {
                if let NodeEntry::Tree(subtree) = &entries[0] {
                    // Clone is cheap: Mst state is Arc-shared.
                    return subtree.clone().trim_top().await;
                }
            }

            Ok(self)
        })
    }
    /// Split tree around a key into left and right subtrees
    ///
    /// Returns (left, right) where:
    /// - left contains all entries < key
    /// - right contains all entries >= key
    ///
    /// Either side can be None if empty.
    pub fn split_around<'a>(
        &'a self,
        key: &'a str,
    ) -> std::pin::Pin<
        Box<dyn std::future::Future<Output = Result<(Option<Mst<S>>, Option<Mst<S>>)>> + Send + 'a>,
    > {
        Box::pin(async move {
            let entries = self.get_entries().await?;
            let index = Self::find_gt_or_equal_leaf_index_in(&entries, key);

            // Split at index
            let left_data = entries[..index].to_vec();
            let right_data = entries[index..].to_vec();

            let mut left = self.new_tree(left_data.clone()).await?;
            let mut right = self.new_tree(right_data).await?;

            // If last entry in left is a Tree, recursively split it: that
            // subtree can contain keys on both sides of `key`.
            if let Some(NodeEntry::Tree(last_tree)) = left_data.last() {
                let left_len = left_data.len();
                left = left.remove_entry(left_len - 1).await?;

                let (split_left, split_right) = last_tree.split_around(key).await?;

                // Reattach each half to the side it belongs to.
                if let Some(sl) = split_left {
                    left = left.append(NodeEntry::Tree(sl)).await?;
                }
                if let Some(sr) = split_right {
                    right = right.prepend(NodeEntry::Tree(sr)).await?;
                }
            }

            // Return None for empty sides
            let left_out = if left.get_entries().await?.is_empty() {
                None
            } else {
                Some(left)
            };

            let right_out = if right.get_entries().await?.is_empty() {
                None
            } else {
                Some(right)
            };

            Ok((left_out, right_out))
        })
    }
    /// Merge two adjacent subtrees
    ///
    /// All keys in `to_merge` must be > all keys in `self`.
    /// Used primarily for delete operations.
    ///
    /// # Errors
    ///
    /// Fails if the two nodes are not on the same layer.
    pub fn append_merge<'a>(
        &'a self,
        to_merge: &'a Mst<S>,
    ) -> std::pin::Pin<Box<dyn std::future::Future<Output = Result<Mst<S>>> + Send + 'a>> {
        Box::pin(async move {
            // Check same layer; merging across layers would corrupt the
            // layer invariant of the tree.
            let self_layer = self.get_layer().await?;
            let merge_layer = to_merge.get_layer().await?;

            if self_layer != merge_layer {
                return Err(RepoError::invalid_mst(
                    "Cannot merge MST nodes from different layers",
                ));
            }

            let mut self_entries = self.get_entries().await?;
            let merge_entries = to_merge.get_entries().await?;

            // Check if we need to merge adjacent Trees: concatenating would
            // otherwise leave two Tree entries side by side.
            let last_is_tree = matches!(self_entries.last(), Some(NodeEntry::Tree(_)));
            let first_is_tree = matches!(merge_entries.first(), Some(NodeEntry::Tree(_)));

            if last_is_tree && first_is_tree {
                // Both are Trees - recursively merge them
                if let (Some(NodeEntry::Tree(left_tree)), Some(NodeEntry::Tree(right_tree))) =
                    (self_entries.last(), merge_entries.first())
                {
                    let merged = left_tree.append_merge(right_tree).await?;

                    // Build new entries: self[..-1] + merged + merge[1..]
                    let mut new_entries = self_entries[..self_entries.len() - 1].to_vec();
                    new_entries.push(NodeEntry::Tree(merged));
                    new_entries.extend_from_slice(&merge_entries[1..]);

                    return self.new_tree(new_entries).await;
                }
            }

            // Simple case: just concatenate
            self_entries.extend(merge_entries);
            self.new_tree(self_entries).await
        })
    }
731731+732732+ /// Create empty child tree at layer-1
733733+ pub async fn create_child(&self) -> Result<Mst<S>> {
734734+ let layer = self.get_layer().await?;
735735+ let child_layer = if layer > 0 { Some(layer - 1) } else { Some(0) };
736736+737737+ Mst::create(self.storage.clone(), Vec::new(), child_layer).await
738738+ }
739739+740740+ /// Create parent tree at layer+1 containing self
741741+ pub async fn create_parent(self) -> Result<Mst<S>> {
742742+ let layer = self.get_layer().await?;
743743+744744+ Mst::create(
745745+ self.storage.clone(),
746746+ vec![NodeEntry::Tree(self)],
747747+ Some(layer + 1),
748748+ )
749749+ .await
750750+ }
751751+752752+ /// Replace entry at index with [left?, leaf, right?]
753753+ async fn replace_with_split(
754754+ &self,
755755+ index: usize,
756756+ left: Option<Mst<S>>,
757757+ leaf: NodeEntry<S>,
758758+ right: Option<Mst<S>>,
759759+ ) -> Result<Mst<S>> {
760760+ let entries = self.get_entries().await?;
761761+762762+ // Build: [0..index] + [left?] + [leaf] + [right?] + [index+1..]
763763+ let mut new_entries = entries[..index].to_vec();
764764+765765+ if let Some(l) = left {
766766+ new_entries.push(NodeEntry::Tree(l));
767767+ }
768768+ new_entries.push(leaf);
769769+ if let Some(r) = right {
770770+ new_entries.push(NodeEntry::Tree(r));
771771+ }
772772+773773+ new_entries.extend_from_slice(&entries[index + 1..]);
774774+775775+ self.new_tree(new_entries).await
776776+ }
    /// Get all leaf entries (key-CID pairs) in lexicographic order
    ///
    /// Recursively traverses the tree to collect all leaves.
    /// Used for diff calculation and tree listing.
    pub fn leaves<'a>(
        &'a self,
    ) -> std::pin::Pin<
        Box<
            dyn std::future::Future<Output = Result<Vec<(smol_str::SmolStr, IpldCid)>>> + Send + 'a,
        >,
    > {
        Box::pin(async move {
            let mut result = Vec::new();
            self.collect_leaves(&mut result).await?;
            Ok(result)
        })
    }

    /// Recursively collect all leaves into the result vector
    ///
    /// In-order walk: subtrees are expanded in place, so the output order
    /// follows the entry order of each node.
    fn collect_leaves<'a>(
        &'a self,
        result: &'a mut Vec<(smol_str::SmolStr, IpldCid)>,
    ) -> std::pin::Pin<Box<dyn std::future::Future<Output = Result<()>> + Send + 'a>> {
        Box::pin(async move {
            let entries = self.get_entries().await?;

            for entry in entries {
                match entry {
                    NodeEntry::Tree(subtree) => {
                        // Recurse into subtree
                        subtree.collect_leaves(result).await?;
                    }
                    NodeEntry::Leaf { key, value } => {
                        // Add leaf to result
                        result.push((key, value));
                    }
                }
            }

            Ok(())
        })
    }
    /// Apply batch of verified write operations (returns new tree)
    ///
    /// More efficient than individual operations as it only rebuilds
    /// the tree structure once per operation. Operations are applied in order.
    ///
    /// # Validation
    ///
    /// - Create: errors if key already exists
    /// - Update: errors if key doesn't exist OR prev CID doesn't match
    /// - Delete: errors if key doesn't exist OR prev CID doesn't match
    ///
    /// All operations validate prev CIDs against current tree state.
    pub async fn batch(&self, ops: &[VerifiedWriteOp]) -> Result<Mst<S>> {
        // Fold each op into a fresh tree; `self` is never modified.
        let mut tree = self.clone();

        for op in ops {
            tree = match op {
                VerifiedWriteOp::Create { key, cid } => {
                    // Check doesn't exist
                    if tree.get(key.as_str()).await?.is_some() {
                        return Err(RepoError::invalid_mst(format!(
                            "Cannot create: key already exists: {}",
                            key
                        )));
                    }
                    tree.add(key.as_str(), *cid).await?
                }

                VerifiedWriteOp::Update { key, cid, prev } => {
                    // Check exists and validate prev
                    let current = tree
                        .get(key.as_str())
                        .await?
                        .ok_or_else(|| RepoError::not_found("key", key.as_str()))?;

                    if &current != prev {
                        return Err(RepoError::invalid_mst(format!(
                            "Update prev CID mismatch for key {}: expected {}, got {}",
                            key, prev, current
                        )));
                    }

                    tree.add(key.as_str(), *cid).await?
                }

                VerifiedWriteOp::Delete { key, prev } => {
                    // Check exists and validate prev
                    let current = tree
                        .get(key.as_str())
                        .await?
                        .ok_or_else(|| RepoError::not_found("key", key.as_str()))?;

                    if &current != prev {
                        return Err(RepoError::invalid_mst(format!(
                            "Delete prev CID mismatch for key {}: expected {}, got {}",
                            key, prev, current
                        )));
                    }

                    tree.delete(key.as_str()).await?
                }
            };
        }

        Ok(tree)
    }
887887+888888+ /// Collect all blocks that need persisting
889889+ ///
890890+ /// Recursively walks the tree, serializing nodes and collecting blocks
891891+ /// that aren't already in storage. Skips nodes that are already persisted.
892892+ ///
893893+ /// Returns (root_cid, blocks) where blocks is a map of CID → bytes.
894894+ pub fn collect_blocks<'a>(
895895+ &'a self,
896896+ ) -> std::pin::Pin<
897897+ Box<
898898+ dyn std::future::Future<
899899+ Output = Result<(IpldCid, std::collections::BTreeMap<IpldCid, bytes::Bytes>)>,
900900+ > + Send
901901+ + 'a,
902902+ >,
903903+ > {
904904+ Box::pin(async move {
905905+ use bytes::Bytes;
906906+ use std::collections::BTreeMap;
907907+908908+ let mut blocks = BTreeMap::new();
909909+ let pointer = self.get_pointer().await?;
910910+911911+ // Check if already in storage
912912+ if self.storage.has(&pointer).await? {
913913+ return Ok((pointer, blocks));
914914+ }
915915+916916+ // Serialize this node
917917+ let entries = self.get_entries().await?;
918918+ let node_data = util::serialize_node_data(&entries).await?;
919919+ let cbor =
920920+ serde_ipld_dagcbor::to_vec(&node_data).map_err(|e| RepoError::serialization(e))?;
921921+ blocks.insert(pointer, Bytes::from(cbor));
922922+923923+ // Recursively collect from subtrees
924924+ for entry in &entries {
925925+ if let NodeEntry::Tree(subtree) = entry {
926926+ let (_, subtree_blocks) = subtree.collect_blocks().await?;
927927+ blocks.extend(subtree_blocks);
928928+ }
929929+ }
930930+931931+ Ok((pointer, blocks))
932932+ })
933933+ }
934934+935935+ /// Persist all unstored blocks to storage
936936+ ///
937937+ /// Convenience method that calls `collect_blocks()` and `put_many()`.
938938+ /// Returns the root CID after persisting.
939939+ pub async fn persist(&self) -> Result<IpldCid> {
940940+ let (root_cid, blocks) = self.collect_blocks().await?;
941941+942942+ if !blocks.is_empty() {
943943+ self.storage.put_many(blocks).await?;
944944+ }
945945+946946+ Ok(root_cid)
947947+ }
948948+949949+ /// Get all CIDs in the merkle path to a key
950950+ ///
951951+ /// Returns a list of CIDs representing the proof path from root to the target key:
952952+ /// - Always includes the root CID (this node's pointer)
953953+ /// - If key exists, includes the record CID
954954+ /// - Includes all intermediate MST node CIDs in the path
955955+ ///
956956+ /// This is used for generating merkle proofs for record existence/non-existence.
957957+ ///
958958+ /// # Example
959959+ ///
960960+ /// For a key that exists:
961961+ /// - Returns: `[root_cid, intermediate_node_cid?, ..., record_cid]`
962962+ ///
963963+ /// For a key that doesn't exist:
964964+ /// - Returns: `[root_cid, intermediate_node_cid?, ...]` (proves absence)
965965+ pub fn cids_for_path<'a>(
966966+ &'a self,
967967+ key: &'a str,
968968+ ) -> std::pin::Pin<Box<dyn std::future::Future<Output = Result<Vec<IpldCid>>> + Send + 'a>>
969969+ {
970970+ Box::pin(async move {
971971+ util::validate_key(key)?;
972972+973973+ let mut cids = vec![self.get_pointer().await?];
974974+ let entries = self.get_entries().await?;
975975+ let index = Self::find_gt_or_equal_leaf_index_in(&entries, key);
976976+977977+ // Check if we found exact match at this level
978978+ if index < entries.len() {
979979+ if let NodeEntry::Leaf {
980980+ key: leaf_key,
981981+ value,
982982+ } = &entries[index]
983983+ {
984984+ if leaf_key.as_str() == key {
985985+ cids.push(*value);
986986+ return Ok(cids);
987987+ }
988988+ }
989989+ }
990990+991991+ // Not found at this level - check subtree before this index
992992+ if index > 0 {
993993+ if let NodeEntry::Tree(subtree) = &entries[index - 1] {
994994+ let mut subtree_cids = subtree.cids_for_path(key).await?;
995995+ cids.append(&mut subtree_cids);
996996+ return Ok(cids);
997997+ }
998998+ }
999999+10001000+ // Key not found in tree
10011001+ Ok(cids)
10021002+ })
10031003+ }
10041004+10051005+ /// Write all MST and record blocks to CAR writer
10061006+ ///
10071007+ /// Streams blocks directly to the writer as the tree is walked:
10081008+ /// - All MST node blocks (read from storage)
10091009+ /// - All leaf record blocks (read from storage)
10101010+ ///
10111011+ /// This is suitable for CAR export and avoids loading all blocks into memory.
10121012+ pub async fn write_blocks_to_car<W: tokio::io::AsyncWrite + Send + Unpin>(
10131013+ &self,
10141014+ writer: &mut iroh_car::CarWriter<W>,
10151015+ ) -> Result<()> {
10161016+ let mut leaf_cids = Vec::new();
10171017+10181018+ // Walk tree, writing MST nodes and collecting leaf CIDs
10191019+ self.write_mst_nodes_to_car(writer, &mut leaf_cids).await?;
10201020+10211021+ // Fetch and write all leaf record blocks
10221022+ let leaf_blocks = self.storage.get_many(&leaf_cids).await?;
10231023+ for (cid, maybe_data) in leaf_cids.iter().zip(leaf_blocks) {
10241024+ if let Some(data) = maybe_data {
10251025+ writer
10261026+ .write(*cid, &data)
10271027+ .await
10281028+ .map_err(|e| RepoError::car(e))?;
10291029+ }
10301030+ }
10311031+10321032+ Ok(())
10331033+ }
10341034+10351035+ /// Recursively write MST nodes to CAR and collect leaf CIDs
10361036+ fn write_mst_nodes_to_car<'a, W: tokio::io::AsyncWrite + Send + Unpin>(
10371037+ &'a self,
10381038+ writer: &'a mut iroh_car::CarWriter<W>,
10391039+ leaf_cids: &'a mut Vec<IpldCid>,
10401040+ ) -> std::pin::Pin<Box<dyn std::future::Future<Output = Result<()>> + Send + 'a>> {
10411041+ Box::pin(async move {
10421042+ let pointer = self.get_pointer().await?;
10431043+10441044+ // Read MST node from storage and write to CAR
10451045+ let node_bytes = self
10461046+ .storage
10471047+ .get(&pointer)
10481048+ .await?
10491049+ .ok_or_else(|| RepoError::not_found("MST node", &pointer))?;
10501050+10511051+ writer
10521052+ .write(pointer, &node_bytes)
10531053+ .await
10541054+ .map_err(|e| RepoError::car(e))?;
10551055+10561056+ // Parse to get entries
10571057+ let entries = self.get_entries().await?;
10581058+10591059+ // Collect leaf CIDs and recurse into subtrees
10601060+ for entry in &entries {
10611061+ match entry {
10621062+ NodeEntry::Leaf { value, .. } => {
10631063+ leaf_cids.push(*value);
10641064+ }
10651065+ NodeEntry::Tree(subtree) => {
10661066+ subtree.write_mst_nodes_to_car(writer, leaf_cids).await?;
10671067+ }
10681068+ }
10691069+ }
10701070+10711071+ Ok(())
10721072+ })
10731073+ }
10741074+}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::{DAG_CBOR_CID_CODEC, storage::memory::MemoryBlockStore};
    use jacquard_common::types::crypto::SHA2_256;
    use smol_str::SmolStr;

    // Build a deterministic CIDv1 whose digest is 32 copies of `n`, so tests
    // can refer to distinct, reproducible CIDs by a small integer.
    fn test_cid(n: u8) -> IpldCid {
        let data = vec![n; 32];
        let mh = multihash::Multihash::wrap(SHA2_256, &data).unwrap();
        IpldCid::new_v1(DAG_CBOR_CID_CODEC, mh)
    }

    #[tokio::test]
    async fn test_empty_tree() {
        let storage = Arc::new(MemoryBlockStore::new());
        let mst = Mst::new(storage);

        let entries = mst.get_entries().await.unwrap();
        assert_eq!(entries.len(), 0);
    }

    #[tokio::test]
    async fn test_get_from_empty() {
        let storage = Arc::new(MemoryBlockStore::new());
        let mst = Mst::new(storage);

        let result = mst.get("test/key").await.unwrap();
        assert!(result.is_none());
    }

    #[tokio::test]
    async fn test_manually_constructed_tree() {
        // Test with manually constructed entries (no CBOR)
        let storage = Arc::new(MemoryBlockStore::new());

        let entries = vec![
            NodeEntry::Leaf {
                key: SmolStr::new("a"),
                value: test_cid(1),
            },
            NodeEntry::Leaf {
                key: SmolStr::new("b"),
                value: test_cid(2),
            },
            NodeEntry::Leaf {
                key: SmolStr::new("c"),
                value: test_cid(3),
            },
        ];

        let mst = Mst::create(storage, entries, Some(0)).await.unwrap();

        assert_eq!(mst.get("a").await.unwrap(), Some(test_cid(1)));
        assert_eq!(mst.get("b").await.unwrap(), Some(test_cid(2)));
        assert_eq!(mst.get("c").await.unwrap(), Some(test_cid(3)));
        assert_eq!(mst.get("d").await.unwrap(), None);
    }

    #[tokio::test]
    async fn test_add_single_key() {
        let storage = Arc::new(MemoryBlockStore::new());
        let mst = Mst::new(storage);

        let updated = mst.add("test/key", test_cid(1)).await.unwrap();

        assert_eq!(updated.get("test/key").await.unwrap(), Some(test_cid(1)));
    }

    #[tokio::test]
    async fn test_add_multiple_keys() {
        let storage = Arc::new(MemoryBlockStore::new());
        let mst = Mst::new(storage);

        let mst = mst.add("a", test_cid(1)).await.unwrap();
        let mst = mst.add("b", test_cid(2)).await.unwrap();
        let mst = mst.add("c", test_cid(3)).await.unwrap();

        assert_eq!(mst.get("a").await.unwrap(), Some(test_cid(1)));
        assert_eq!(mst.get("b").await.unwrap(), Some(test_cid(2)));
        assert_eq!(mst.get("c").await.unwrap(), Some(test_cid(3)));
    }

    #[tokio::test]
    async fn test_add_replace_key() {
        // Adding an existing key replaces its value.
        let storage = Arc::new(MemoryBlockStore::new());
        let mst = Mst::new(storage);

        let mst = mst.add("test", test_cid(1)).await.unwrap();
        let mst = mst.add("test", test_cid(2)).await.unwrap();

        assert_eq!(mst.get("test").await.unwrap(), Some(test_cid(2)));
    }

    #[tokio::test]
    async fn test_delete_single_key() {
        let storage = Arc::new(MemoryBlockStore::new());
        let mst = Mst::new(storage);

        let mst = mst.add("test", test_cid(1)).await.unwrap();
        let mst = mst.delete("test").await.unwrap();

        assert_eq!(mst.get("test").await.unwrap(), None);
        assert_eq!(mst.get_entries().await.unwrap().len(), 0);
    }

    #[tokio::test]
    async fn test_delete_from_multi_key_tree() {
        let storage = Arc::new(MemoryBlockStore::new());
        let mst = Mst::new(storage);

        let mst = mst.add("a", test_cid(1)).await.unwrap();
        let mst = mst.add("b", test_cid(2)).await.unwrap();
        let mst = mst.add("c", test_cid(3)).await.unwrap();

        let mst = mst.delete("b").await.unwrap();

        assert_eq!(mst.get("a").await.unwrap(), Some(test_cid(1)));
        assert_eq!(mst.get("b").await.unwrap(), None);
        assert_eq!(mst.get("c").await.unwrap(), Some(test_cid(3)));
    }

    #[tokio::test]
    async fn test_delete_nonexistent_key() {
        // Deleting a missing key is an error, not a no-op.
        let storage = Arc::new(MemoryBlockStore::new());
        let mst = Mst::new(storage);

        let mst = mst.add("a", test_cid(1)).await.unwrap();

        let result = mst.delete("b").await;
        assert!(result.is_err());
    }

    #[tokio::test]
    async fn test_serialization_roundtrip() {
        let storage = Arc::new(MemoryBlockStore::new());
        let mst = Mst::new(storage.clone());

        let mst = mst.add("a", test_cid(1)).await.unwrap();
        let mst = mst.add("b", test_cid(2)).await.unwrap();
        let mst = mst.add("c", test_cid(3)).await.unwrap();

        // Persist to storage
        let cid = mst.persist().await.unwrap();

        // Load from storage
        let reloaded = Mst::load(storage, cid, Some(0));

        // Verify all keys are present
        assert_eq!(reloaded.get("a").await.unwrap(), Some(test_cid(1)));
        assert_eq!(reloaded.get("b").await.unwrap(), Some(test_cid(2)));
        assert_eq!(reloaded.get("c").await.unwrap(), Some(test_cid(3)));
    }

    #[tokio::test]
    async fn test_cid_determinism() {
        // Same keys inserted in same order should produce same CID
        let storage1 = Arc::new(MemoryBlockStore::new());
        let mst1 = Mst::new(storage1);
        let mst1 = mst1.add("a", test_cid(1)).await.unwrap();
        let mst1 = mst1.add("b", test_cid(2)).await.unwrap();
        let mst1 = mst1.add("c", test_cid(3)).await.unwrap();
        let cid1 = mst1.get_pointer().await.unwrap();

        let storage2 = Arc::new(MemoryBlockStore::new());
        let mst2 = Mst::new(storage2);
        let mst2 = mst2.add("a", test_cid(1)).await.unwrap();
        let mst2 = mst2.add("b", test_cid(2)).await.unwrap();
        let mst2 = mst2.add("c", test_cid(3)).await.unwrap();
        let cid2 = mst2.get_pointer().await.unwrap();

        assert_eq!(cid1, cid2);
    }

    #[tokio::test]
    async fn test_insertion_order_determinism() {
        // Different insertion orders should produce same CID
        let storage1 = Arc::new(MemoryBlockStore::new());
        let mst1 = Mst::new(storage1);
        let mst1 = mst1.add("a", test_cid(1)).await.unwrap();
        let mst1 = mst1.add("b", test_cid(2)).await.unwrap();
        let mst1 = mst1.add("c", test_cid(3)).await.unwrap();
        let cid1 = mst1.get_pointer().await.unwrap();

        let storage2 = Arc::new(MemoryBlockStore::new());
        let mst2 = Mst::new(storage2);
        let mst2 = mst2.add("c", test_cid(3)).await.unwrap();
        let mst2 = mst2.add("a", test_cid(1)).await.unwrap();
        let mst2 = mst2.add("b", test_cid(2)).await.unwrap();
        let cid2 = mst2.get_pointer().await.unwrap();

        assert_eq!(cid1, cid2);
    }

    #[tokio::test]
    async fn test_batch_creates() {
        let storage = Arc::new(MemoryBlockStore::new());
        let mst = Mst::new(storage);

        let ops = vec![
            VerifiedWriteOp::Create {
                key: SmolStr::new("a"),
                cid: test_cid(1),
            },
            VerifiedWriteOp::Create {
                key: SmolStr::new("b"),
                cid: test_cid(2),
            },
            VerifiedWriteOp::Create {
                key: SmolStr::new("c"),
                cid: test_cid(3),
            },
        ];

        let mst = mst.batch(&ops).await.unwrap();

        assert_eq!(mst.get("a").await.unwrap(), Some(test_cid(1)));
        assert_eq!(mst.get("b").await.unwrap(), Some(test_cid(2)));
        assert_eq!(mst.get("c").await.unwrap(), Some(test_cid(3)));
    }

    #[tokio::test]
    async fn test_batch_mixed_operations() {
        let storage = Arc::new(MemoryBlockStore::new());
        let mst = Mst::new(storage);

        // Start with some keys
        let mst = mst.add("a", test_cid(1)).await.unwrap();
        let mst = mst.add("b", test_cid(2)).await.unwrap();
        let mst = mst.add("c", test_cid(3)).await.unwrap();

        let ops = vec![
            VerifiedWriteOp::Create {
                key: SmolStr::new("d"),
                cid: test_cid(4),
            },
            VerifiedWriteOp::Update {
                key: SmolStr::new("a"),
                cid: test_cid(10),
                prev: test_cid(1),
            },
            VerifiedWriteOp::Delete {
                key: SmolStr::new("b"),
                prev: test_cid(2),
            },
        ];

        let mst = mst.batch(&ops).await.unwrap();

        assert_eq!(mst.get("a").await.unwrap(), Some(test_cid(10))); // Updated
        assert_eq!(mst.get("b").await.unwrap(), None); // Deleted
        assert_eq!(mst.get("c").await.unwrap(), Some(test_cid(3))); // Unchanged
        assert_eq!(mst.get("d").await.unwrap(), Some(test_cid(4))); // Created
    }

    #[tokio::test]
    async fn test_batch_with_prev_validation() {
        let storage = Arc::new(MemoryBlockStore::new());
        let mst = Mst::new(storage);
        let mst = mst.add("a", test_cid(1)).await.unwrap();

        // Update with correct prev - should succeed
        let ops = vec![VerifiedWriteOp::Update {
            key: SmolStr::new("a"),
            cid: test_cid(2),
            prev: test_cid(1),
        }];
        let mst = mst.batch(&ops).await.unwrap();
        assert_eq!(mst.get("a").await.unwrap(), Some(test_cid(2)));

        // Update with wrong prev - should fail
        let ops = vec![VerifiedWriteOp::Update {
            key: SmolStr::new("a"),
            cid: test_cid(3),
            prev: test_cid(99), // Wrong CID
        }];
        assert!(mst.batch(&ops).await.is_err());

        // Delete with correct prev - should succeed
        let ops = vec![VerifiedWriteOp::Delete {
            key: SmolStr::new("a"),
            prev: test_cid(2),
        }];
        let mst = mst.batch(&ops).await.unwrap();
        assert_eq!(mst.get("a").await.unwrap(), None);
    }

    #[tokio::test]
    async fn test_batch_create_duplicate_error() {
        let storage = Arc::new(MemoryBlockStore::new());
        let mst = Mst::new(storage);
        let mst = mst.add("a", test_cid(1)).await.unwrap();

        let ops = vec![VerifiedWriteOp::Create {
            key: SmolStr::new("a"),
            cid: test_cid(2),
        }];

        // Should error because key already exists
        assert!(mst.batch(&ops).await.is_err());
    }

    #[tokio::test]
    async fn test_batch_update_nonexistent_error() {
        let storage = Arc::new(MemoryBlockStore::new());
        let mst = Mst::new(storage);

        let ops = vec![VerifiedWriteOp::Update {
            key: SmolStr::new("a"),
            cid: test_cid(1),
            prev: test_cid(99), // Doesn't matter since key doesn't exist
        }];

        // Should error because key doesn't exist
        assert!(mst.batch(&ops).await.is_err());
    }

    #[tokio::test]
    async fn test_batch_delete_nonexistent_error() {
        let storage = Arc::new(MemoryBlockStore::new());
        let mst = Mst::new(storage);

        let ops = vec![VerifiedWriteOp::Delete {
            key: SmolStr::new("a"),
            prev: test_cid(99), // Doesn't matter since key doesn't exist
        }];

        // Should error because key doesn't exist
        assert!(mst.batch(&ops).await.is_err());
    }

    #[tokio::test]
    async fn test_batch_empty() {
        let storage = Arc::new(MemoryBlockStore::new());
        let mst = Mst::new(storage);
        let mst = mst.add("a", test_cid(1)).await.unwrap();

        let ops = vec![];
        let mst = mst.batch(&ops).await.unwrap();

        // Should be unchanged
        assert_eq!(mst.get("a").await.unwrap(), Some(test_cid(1)));
    }

    #[tokio::test]
    async fn test_cids_for_path_simple() {
        // Test cids_for_path with a simple flat tree
        let storage = Arc::new(MemoryBlockStore::new());
        let mst = Mst::new(storage);

        let mst = mst.add("a", test_cid(1)).await.unwrap();
        let mst = mst.add("b", test_cid(2)).await.unwrap();
        let mst = mst.add("c", test_cid(3)).await.unwrap();

        // Get proof path for key "b"
        let cids = mst.cids_for_path("b").await.unwrap();

        // Should contain: root CID, record CID
        assert_eq!(cids.len(), 2);
        assert_eq!(cids[0], mst.get_pointer().await.unwrap());
        assert_eq!(cids[1], test_cid(2));
    }

    #[tokio::test]
    async fn test_cids_for_path_nonexistent() {
        // Test cids_for_path with a key that doesn't exist
        let storage = Arc::new(MemoryBlockStore::new());
        let mst = Mst::new(storage);

        let mst = mst.add("a", test_cid(1)).await.unwrap();
        let mst = mst.add("c", test_cid(3)).await.unwrap();

        // Get proof path for nonexistent key "b"
        let cids = mst.cids_for_path("b").await.unwrap();

        // Should contain root CID first, and NOT contain the record CID (proves absence)
        assert!(cids.len() >= 1, "Should have at least the root CID");
        assert_eq!(
            cids[0],
            mst.get_pointer().await.unwrap(),
            "First CID should be root"
        );
        assert!(
            !cids.contains(&test_cid(2)),
            "Should not contain nonexistent record"
        );
    }

    #[tokio::test]
    async fn test_cids_for_path_multiple_records() {
        // Test with multiple records across different collection paths (like rsky)
        let storage = Arc::new(MemoryBlockStore::new());
        let mst = Mst::new(storage);

        // Simulate records from different collections (app.bsky.feed.post, app.bsky.feed.like)
        let keys = vec![
            "app.bsky.feed.post/3l4qpz7ajrc2a",
            "app.bsky.feed.post/3l4qpz7bjrc2b",
            "app.bsky.feed.like/3l4qpz7cjrc2c",
            "app.bsky.feed.like/3l4qpz7djrc2d",
            "app.bsky.graph.follow/3l4qpz7ejrc2e",
        ];

        let mut mst = mst;
        for (i, key) in keys.iter().enumerate() {
            mst = mst.add(key, test_cid((i + 1) as u8)).await.unwrap();
        }

        // Get proof for each record
        for (i, key) in keys.iter().enumerate() {
            let cids = mst.cids_for_path(key).await.unwrap();

            // Should have root CID first
            assert_eq!(cids[0], mst.get_pointer().await.unwrap());

            // Should have record CID last (since record exists)
            assert_eq!(*cids.last().unwrap(), test_cid((i + 1) as u8));

            // Should have at least root + record
            assert!(cids.len() >= 2);
        }
    }

    #[tokio::test]
    async fn test_cids_for_path_proves_nonexistence() {
        // Test that we can prove a record doesn't exist in a tree with many records
        let storage = Arc::new(MemoryBlockStore::new());
        let mst = Mst::new(storage);

        // Add several records
        let existing_keys = vec![
            "com.example.posts/key1",
            "com.example.posts/key2",
            "com.example.posts/key4",
            "com.example.posts/key5",
        ];

        let mut mst = mst;
        for (i, key) in existing_keys.iter().enumerate() {
            mst = mst.add(key, test_cid((i + 1) as u8)).await.unwrap();
        }

        // Prove key3 doesn't exist (between key2 and key4)
        let nonexistent_key = "com.example.posts/key3";
        let cids = mst.cids_for_path(nonexistent_key).await.unwrap();

        // Should have root CID
        assert_eq!(cids[0], mst.get_pointer().await.unwrap());

        // Should NOT contain a record CID for key3
        assert!(!cids.contains(&test_cid(3)));

        // Proof is just the path showing where key3 WOULD be (proves absence)
        assert!(cids.len() >= 1);
    }

    #[tokio::test]
    async fn test_cids_for_path_collection_structure() {
        // Test proof generation for realistic collection/rkey structure
        let storage = Arc::new(MemoryBlockStore::new());
        let mst = Mst::new(storage);

        // Simulate a repo with multiple collections and records
        let records = vec![
            ("com.atproto.repo.strongRef", "abc123", test_cid(1)),
            ("app.bsky.feed.post", "post1", test_cid(2)),
            ("app.bsky.feed.post", "post2", test_cid(3)),
            ("app.bsky.feed.like", "like1", test_cid(4)),
            ("app.bsky.graph.follow", "follow1", test_cid(5)),
        ];

        let mut mst = mst;
        for (collection, rkey, cid) in &records {
            let key = format!("{}/{}", collection, rkey);
            mst = mst.add(&key, *cid).await.unwrap();
        }

        // Persist to storage so we have real MST blocks
        let root_cid = mst.persist().await.unwrap();
        assert_eq!(root_cid, mst.get_pointer().await.unwrap());

        // Get proofs for each record
        for (collection, rkey, expected_cid) in &records {
            let key = format!("{}/{}", collection, rkey);
            let cids = mst.cids_for_path(&key).await.unwrap();

            // Verify structure
            assert_eq!(cids[0], root_cid, "First CID should be root");
            assert_eq!(
                *cids.last().unwrap(),
                *expected_cid,
                "Last CID should be record"
            );
        }
    }
}
+278
crates/jacquard-repo/src/mst/util.rs
···11+//! MST utility functions
22+33+use super::node::{NodeData, NodeEntry, TreeEntry};
44+use crate::error::{MstError, Result};
55+use crate::storage::BlockStore;
66+use bytes::Bytes;
77+use cid::Cid as IpldCid;
88+use sha2::{Digest, Sha256};
99+1010+/// Compute CID from raw bytes
1111+///
1212+/// Uses SHA-256 hash and DAG-CBOR codec. Assumes data is already DAG-CBOR encoded.
1313+pub fn compute_cid(data: &[u8]) -> Result<IpldCid> {
1414+ use jacquard_common::types::crypto::{DAG_CBOR, SHA2_256};
1515+1616+ // SHA-256 hash
1717+ let mut sha = Sha256::new();
1818+ sha.update(data);
1919+ let hash = sha.finalize().to_vec();
2020+ // Build multihash using wrap (matches rsky approach)
2121+ let mh = multihash::Multihash::<64>::wrap(SHA2_256, hash.as_slice())
2222+ .map_err(|e| MstError::InvalidNode(e.to_string()))?;
2323+2424+ // Build CID with DAG-CBOR codec
2525+ Ok(IpldCid::new_v1(DAG_CBOR, mh))
2626+}
2727+2828+/// Serialize node to DAG-CBOR and compute CID
2929+///
3030+/// Uses SHA-256 hash and DAG-CBOR codec.
3131+pub fn node_to_cid(node: &NodeData) -> Result<IpldCid> {
3232+ let cbor =
3333+ serde_ipld_dagcbor::to_vec(node).map_err(|e| MstError::Serialization(Box::new(e)))?;
3434+ compute_cid(&cbor)
3535+}
3636+3737+/// Calculate layer (depth) for a key based on its hash
3838+///
3939+/// Per atproto spec: depth = floor(leading_zero_bits / 2)
4040+/// This gives a fanout of 4 (counting 2-bit chunks of zeros).
4141+/// More leading zeros = deeper layer.
4242+pub fn layer_for_key(key: &str) -> usize {
4343+ let hash = Sha256::digest(key.as_bytes());
4444+ leading_zeros(&hash) / 2
4545+}
/// Count leading zero bits in hash
///
/// Each all-zero byte contributes 8 bits; the first non-zero byte contributes
/// its own leading-zero count, after which counting stops.
fn leading_zeros(hash: &[u8]) -> usize {
    match hash.iter().position(|&b| b != 0) {
        // Every byte is zero: all bits count.
        None => hash.len() * 8,
        Some(i) => i * 8 + hash[i].leading_zeros() as usize,
    }
}
/// Validate MST key format
///
/// Keys must match: [a-zA-Z0-9._:~/-]+  ('/' is accepted because MST keys
/// are "collection/rkey" paths).
/// Max length: 256 bytes (atproto limit)
pub fn validate_key(key: &str) -> Result<()> {
    // Empty keys are never valid.
    if key.is_empty() {
        return Err(MstError::EmptyKey.into());
    }

    // Enforce the atproto 256-byte limit (`len()` counts bytes, not chars).
    if key.len() > 256 {
        return Err(MstError::KeyTooLong {
            len: key.len(),
            max: 256,
        }
        .into());
    }

    // Restrict to ASCII alphanumerics plus the punctuation set used in
    // record paths.
    if !key
        .chars()
        .all(|c| c.is_ascii_alphanumeric() || matches!(c, '.' | '_' | ':' | '~' | '-' | '/'))
    {
        return Err(MstError::InvalidKeyChars {
            key: key.to_string(),
        }
        .into());
    }

    Ok(())
}
/// Count shared prefix length between two strings, in bytes.
///
/// Callers use the returned value as a *byte* offset when slicing keys
/// (`&key[prefix_len..]`), so the prefix is counted over bytes rather than
/// `char`s. For the ASCII-only keys admitted by `validate_key` the two are
/// identical; counting bytes keeps the result consistent with byte slicing
/// if a non-ASCII key ever reaches this path.
pub fn common_prefix_len(a: &str, b: &str) -> usize {
    a.as_bytes()
        .iter()
        .zip(b.as_bytes())
        .take_while(|(x, y)| x == y)
        .count()
}
/// Serialize flat entries to wire format (with prefix compression)
///
/// Converts in-memory `Vec<NodeEntry>` to DAG-CBOR `NodeData`.
/// - First `Tree` entry → `left` pointer
/// - Each `Leaf` → entry with prefix compression
/// - `Tree` after `Leaf` → that leaf's `tree` pointer
///
/// Returns a boxed future because subtree pointers are awaited while walking.
pub fn serialize_node_data<'a, S: BlockStore + Sync + 'static>(
    entries: &'a [NodeEntry<S>],
) -> std::pin::Pin<Box<dyn std::future::Future<Output = Result<NodeData>> + Send + 'a>> {
    Box::pin(async move {
        let mut data = NodeData {
            left: None,
            entries: Vec::new(),
        };

        let mut i = 0;

        // First entry if Tree → becomes left pointer
        if let Some(NodeEntry::Tree(tree)) = entries.get(0) {
            data.left = Some(tree.get_pointer().await?);
            i += 1;
        }

        // Process remaining entries; `i` is advanced manually because each
        // iteration may consume a Leaf plus an optional trailing Tree.
        let mut last_key = String::new();
        while i < entries.len() {
            let entry = &entries[i];

            if let NodeEntry::Leaf { key, value } = entry {
                i += 1;

                // Calculate prefix with last key (byte offset into the key)
                let prefix_len = common_prefix_len(&last_key, key.as_str());
                let key_suffix = &key.as_str()[prefix_len..];

                // Check for Tree after this Leaf
                let tree_ptr = if let Some(NodeEntry::Tree(tree)) = entries.get(i) {
                    i += 1;
                    Some(tree.get_pointer().await?)
                } else {
                    None
                };

                data.entries.push(TreeEntry {
                    prefix_len: prefix_len.try_into().map_err(|_| {
                        MstError::InvalidNode(format!(
                            "Prefix length {} exceeds u8::MAX",
                            prefix_len
                        ))
                    })?,
                    key_suffix: Bytes::copy_from_slice(key_suffix.as_bytes()),
                    value: *value,
                    tree: tree_ptr,
                });

                last_key = key.as_str().to_string();
            } else {
                // A Tree here was neither consumed as `left` nor as a leaf's
                // `tree` pointer: two Trees were adjacent, which is invalid.
                return Err(
                    MstError::InvalidNode("Two Trees adjacent in flat entries".into()).into(),
                );
            }
        }

        Ok(data)
    })
}
162162+163163+/// Deserialize wire format to flat entries
164164+///
165165+/// Converts DAG-CBOR `NodeData` to in-memory `Vec<NodeEntry>`.
166166+/// - `left` pointer → prepend `Tree` entry
167167+/// - Each entry → `Leaf` with reconstructed full key
168168+/// - `tree` pointer → append `Tree` entry
169169+pub fn deserialize_node_data<S: BlockStore + Sync + 'static>(
170170+ storage: std::sync::Arc<S>,
171171+ data: &NodeData,
172172+ layer: Option<usize>,
173173+) -> Result<Vec<NodeEntry<S>>> {
174174+ use crate::mst::Mst;
175175+176176+ let mut entries = Vec::new();
177177+178178+ // Left pointer → prepend Tree
179179+ if let Some(left_cid) = data.left {
180180+ let child_layer = layer.map(|l| if l > 0 { l - 1 } else { 0 });
181181+ entries.push(NodeEntry::Tree(Mst::load(
182182+ storage.clone(),
183183+ left_cid,
184184+ child_layer,
185185+ )));
186186+ }
187187+188188+ // Process entries
189189+ let mut last_key = String::new();
190190+ for entry in &data.entries {
191191+ // Reconstruct full key from prefix
192192+ let key_str = std::str::from_utf8(&entry.key_suffix)
193193+ .map_err(|e| MstError::InvalidNode(format!("Invalid UTF-8 in key suffix: {}", e)))?;
194194+ let prefix_len = entry.prefix_len as usize;
195195+ let full_key = format!("{}{}", &last_key[..prefix_len], key_str);
196196+197197+ // Append Leaf
198198+ entries.push(NodeEntry::Leaf {
199199+ key: smol_str::SmolStr::new(&full_key),
200200+ value: entry.value,
201201+ });
202202+203203+ last_key = full_key;
204204+205205+ // Tree pointer → append Tree
206206+ if let Some(tree_cid) = entry.tree {
207207+ let child_layer = layer.map(|l| if l > 0 { l - 1 } else { 0 });
208208+ entries.push(NodeEntry::Tree(Mst::load(
209209+ storage.clone(),
210210+ tree_cid,
211211+ child_layer,
212212+ )));
213213+ }
214214+ }
215215+216216+ Ok(entries)
217217+}
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_validate_key_valid() {
        // Typical collection/rkey paths plus a minimal one-character key.
        for key in ["app.bsky.feed.post/abc123", "foo.bar/test-key_2024", "a"] {
            assert!(validate_key(key).is_ok());
        }
    }

    #[test]
    fn test_validate_key_empty() {
        // The empty string is never a valid key.
        assert!(validate_key("").is_err());
    }

    #[test]
    fn test_validate_key_too_long() {
        // A 257-character key must be rejected.
        assert!(validate_key(&"a".repeat(257)).is_err());
    }

    #[test]
    fn test_validate_key_invalid_chars() {
        for key in ["key with spaces", "key@invalid", "key#hash"] {
            assert!(validate_key(key).is_err());
        }
    }

    #[test]
    fn test_common_prefix_len() {
        let cases = [
            ("hello", "help", 3),
            ("abc", "abc", 3),
            ("abc", "def", 0),
            ("", "test", 0),
        ];
        for (a, b, expected) in cases {
            assert_eq!(common_prefix_len(a, b), expected);
        }
    }

    #[test]
    fn test_layer_for_key() {
        // Layers derive from a SHA-256 hash, so the value is bounded by the
        // digest's 256 bits, and the same key must always map to the same layer.
        let first = layer_for_key("app.bsky.feed.post/test");
        assert!(first < 256);
        assert_eq!(layer_for_key("app.bsky.feed.post/test"), first);
    }

    #[test]
    fn test_leading_zeros() {
        // (input bytes, expected count of leading zero bits)
        let cases: &[(&[u8], _)] = &[
            (&[0, 0, 0, 1], 31),  // 8 + 8 + 8 + 7
            (&[0xFF, 0, 0], 0),   // first byte starts with a 1 bit
            (&[0, 0x80], 8),      // 0x80 = 0b1000_0000
            (&[0, 0x01], 15),     // 8 + 7
        ];
        for (bytes, expected) in cases {
            assert_eq!(leading_zeros(bytes), *expected);
        }
    }
}
+888
crates/jacquard-repo/src/repo.rs
···11+//! High-level repository operations
22+//!
33+//! Optional convenience layer over MST primitives. Provides type-safe record operations,
44+//! batch writes, commit creation, and CAR export.
55+66+use crate::MstDiff;
77+use crate::commit::Commit;
88+use crate::error::Result;
99+use crate::mst::{Mst, WriteOp};
1010+use crate::storage::BlockStore;
1111+use cid::Cid as IpldCid;
1212+use jacquard_common::IntoStatic;
1313+use jacquard_common::types::string::{Did, Nsid, RecordKey, Tid};
1414+use jacquard_common::types::tid::Ticker;
1515+use std::collections::BTreeMap;
1616+use std::path::Path;
1717+use std::sync::Arc;
/// Commit data for repository updates
///
/// Contains signed commit and all blocks needed for persistence.
/// Follows the rsky pattern of separating commit formatting from application.
///
/// Produced by `Repository::format_commit` and consumed by
/// `Repository::apply_commit`; `to_firehose_commit` turns it into a stream event.
#[derive(Debug, Clone)]
pub struct CommitData {
    /// Commit CID
    pub cid: IpldCid,

    /// New revision TID
    pub rev: Tid,

    /// Previous revision TID (None for initial commit)
    pub since: Option<Tid>,

    /// Previous commit CID (None for initial commit)
    pub prev: Option<IpldCid>,

    /// New MST root CID
    pub data: IpldCid,

    /// Previous MST root CID (for sync v1.1)
    pub prev_data: Option<IpldCid>,

    /// All blocks to persist (MST nodes + commit block)
    ///
    /// Includes:
    /// - All new MST node blocks from `mst.collect_blocks()`
    /// - The commit block itself
    pub blocks: BTreeMap<IpldCid, bytes::Bytes>,

    /// Relevant blocks for firehose (sync v1.1 inductive validation)
    ///
    /// Subset of `blocks` containing:
    /// - Commit block
    /// - MST node blocks along paths for all changed keys
    /// - Includes "adjacent" blocks needed for operation inversion
    pub relevant_blocks: BTreeMap<IpldCid, bytes::Bytes>,
}
impl CommitData {
    /// Generate a firehose commit message (sync v1.1)
    ///
    /// Converts this commit into a `FirehoseCommit` with `prev_data` field
    /// and relevant blocks for inductive validation.
    ///
    /// # Parameters
    /// - `repo`: DID of the repository this event belongs to
    /// - `seq`: sequence number assigned by the event stream
    /// - `time`: event timestamp
    /// - `ops`: record-level operations included in this commit
    /// - `blobs`: blob references associated with this commit
    ///
    /// # Errors
    /// Propagates failures from CAR serialization of `relevant_blocks`.
    pub async fn to_firehose_commit(
        &self,
        repo: &Did<'_>,
        seq: i64,
        time: jacquard_common::types::string::Datetime,
        ops: Vec<crate::commit::firehose::RepoOp<'static>>,
        blobs: Vec<jacquard_common::types::cid::CidLink<'static>>,
    ) -> Result<crate::commit::firehose::FirehoseCommit<'static>> {
        use jacquard_common::types::cid::CidLink;

        // Convert relevant blocks to CAR format, rooted at the commit CID.
        let blocks_car =
            crate::car::write_car_bytes(self.cid, self.relevant_blocks.clone()).await?;

        Ok(crate::commit::firehose::FirehoseCommit {
            repo: repo.clone().into_static(),
            rev: self.rev.clone(),
            seq,
            // NOTE(review): when there is no prior revision, `since` falls
            // back to the new rev itself — confirm this matches firehose
            // consumer expectations for initial commits.
            since: self.since.clone().unwrap_or_else(|| self.rev.clone()),
            time,
            commit: CidLink::from(self.cid),
            blocks: blocks_car.into(),
            ops,
            prev_data: self.prev_data.map(CidLink::from),
            blobs,
            // This path never emits oversized or rebase events.
            too_big: false,
            rebase: false,
        })
    }
}
/// High-level repository operations
///
/// Provides a convenient API over MST primitives for common repository workflows.
///
/// # Example
///
/// ```rust,ignore
/// use jacquard_repo::{Repository, MemoryBlockStore};
/// use jacquard_common::types::string::{Did, Nsid, RecordKey};
///
/// # async fn example() -> Result<(), Box<dyn std::error::Error>> {
/// let storage = Arc::new(MemoryBlockStore::new());
/// let mut repo = create_test_repo(storage).await;
///
/// let collection = Nsid::new("app.bsky.feed.post")?;
/// let rkey = RecordKey::new("3l5yhcgz7y42y")?;
/// let record_cid = /* ... compute CID of record ... */;
///
/// repo.create_record(&collection, &rkey, record_cid).await?;
///
/// let did = Did::new("did:plc:example")?;
/// let signing_key = /* ... load key ... */;
/// let commit_cid = repo.commit(&did, None, &signing_key).await?;
///
/// repo.export_car("repo.car", commit_cid).await?;
/// # Ok(())
/// # }
/// ```
pub struct Repository<S: BlockStore> {
    // Current merkle search tree mapping "collection/rkey" → record CID.
    mst: Mst<S>,
    // Shared block store backing both MST nodes and commit blocks.
    storage: Arc<S>,
    // Most recent signed commit for this repository.
    commit: Commit<'static>,
    // CID of `commit`.
    commit_cid: IpldCid,
}
129129+130130+impl<S: BlockStore + Sync + 'static> Repository<S> {
131131+ /// Create repository from existing components
132132+ ///
133133+ /// Static constructor for when you already have the MST, commit, and CID.
134134+ pub fn new(storage: Arc<S>, mst: Mst<S>, commit: Commit<'static>, commit_cid: IpldCid) -> Self {
135135+ Self {
136136+ storage,
137137+ mst,
138138+ commit,
139139+ commit_cid,
140140+ }
141141+ }
142142+143143+ /// Load repository from commit CID
144144+ pub async fn from_commit(storage: Arc<S>, commit_cid: &IpldCid) -> Result<Self> {
145145+ let commit_bytes = storage
146146+ .get(commit_cid)
147147+ .await?
148148+ .ok_or_else(|| crate::error::RepoError::not_found("commit", commit_cid))?;
149149+150150+ let commit = Commit::from_cbor(&commit_bytes)?;
151151+ let mst_root = commit.data();
152152+153153+ let mst = Mst::load(storage.clone(), *mst_root, None);
154154+155155+ Ok(Self {
156156+ mst,
157157+ storage,
158158+ commit: commit.into_static(),
159159+ commit_cid: *commit_cid,
160160+ })
161161+ }
162162+163163+ /// Get a record by collection and rkey
164164+ pub async fn get_record<T: jacquard_common::types::recordkey::RecordKeyType>(
165165+ &self,
166166+ collection: &Nsid<'_>,
167167+ rkey: &RecordKey<T>,
168168+ ) -> Result<Option<IpldCid>> {
169169+ let key = format!("{}/{}", collection.as_ref(), rkey.as_ref());
170170+ self.mst.get(&key).await
171171+ }
172172+173173+ /// Create a record (error if exists)
174174+ pub async fn create_record<T: jacquard_common::types::recordkey::RecordKeyType>(
175175+ &mut self,
176176+ collection: &Nsid<'_>,
177177+ rkey: &RecordKey<T>,
178178+ record_cid: IpldCid,
179179+ ) -> Result<()> {
180180+ let key = format!("{}/{}", collection.as_ref(), rkey.as_ref());
181181+182182+ if self.mst.get(&key).await?.is_some() {
183183+ return Err(crate::error::RepoError::already_exists("record", &key));
184184+ }
185185+186186+ self.mst = self.mst.add(&key, record_cid).await?;
187187+ Ok(())
188188+ }
189189+190190+ /// Update a record (error if not exists, returns previous CID)
191191+ pub async fn update_record<T: jacquard_common::types::recordkey::RecordKeyType>(
192192+ &mut self,
193193+ collection: &Nsid<'_>,
194194+ rkey: &RecordKey<T>,
195195+ record_cid: IpldCid,
196196+ ) -> Result<IpldCid> {
197197+ let key = format!("{}/{}", collection.as_ref(), rkey.as_ref());
198198+199199+ let old_cid = self
200200+ .mst
201201+ .get(&key)
202202+ .await?
203203+ .ok_or_else(|| crate::error::RepoError::not_found("record", &key))?;
204204+205205+ self.mst = self.mst.update(&key, record_cid).await?;
206206+ Ok(old_cid)
207207+ }
208208+209209+ /// Delete a record (error if not exists, returns deleted CID)
210210+ pub async fn delete_record<T: jacquard_common::types::recordkey::RecordKeyType>(
211211+ &mut self,
212212+ collection: &Nsid<'_>,
213213+ rkey: &RecordKey<T>,
214214+ ) -> Result<IpldCid> {
215215+ let key = format!("{}/{}", collection.as_ref(), rkey.as_ref());
216216+217217+ let old_cid = self
218218+ .mst
219219+ .get(&key)
220220+ .await?
221221+ .ok_or_else(|| crate::error::RepoError::not_found("record", &key))?;
222222+223223+ self.mst = self.mst.delete(&key).await?;
224224+ Ok(old_cid)
225225+ }
226226+227227+ /// Apply write operations individually (validates existence/prev)
228228+ pub async fn create_writes(&mut self, ops: &[WriteOp]) -> Result<crate::mst::MstDiff> {
229229+ let old_mst = self.mst.clone();
230230+231231+ // Apply operations individually (add/update/delete verify existence)
232232+ for op in ops {
233233+ self.mst = match op {
234234+ WriteOp::Create { key, cid } => {
235235+ // Check doesn't exist
236236+ if self.mst.get(key.as_str()).await?.is_some() {
237237+ return Err(crate::error::RepoError::already_exists(
238238+ "record",
239239+ key.as_str(),
240240+ ));
241241+ }
242242+ self.mst.add(key.as_str(), *cid).await?
243243+ }
244244+ WriteOp::Update { key, cid, prev } => {
245245+ // Check exists
246246+ let current = self.mst.get(key.as_str()).await?.ok_or_else(|| {
247247+ crate::error::RepoError::not_found("record", key.as_str())
248248+ })?;
249249+250250+ // Validate prev if provided
251251+ if let Some(prev_cid) = prev {
252252+ if ¤t != prev_cid {
253253+ return Err(crate::error::RepoError::invalid(format!(
254254+ "Update prev CID mismatch for key {}: expected {}, got {}",
255255+ key, prev_cid, current
256256+ )));
257257+ }
258258+ }
259259+260260+ self.mst.add(key.as_str(), *cid).await?
261261+ }
262262+ WriteOp::Delete { key, prev } => {
263263+ // Check exists
264264+ let current = self.mst.get(key.as_str()).await?.ok_or_else(|| {
265265+ crate::error::RepoError::not_found("record", key.as_str())
266266+ })?;
267267+268268+ // Validate prev if provided
269269+ if let Some(prev_cid) = prev {
270270+ if ¤t != prev_cid {
271271+ return Err(crate::error::RepoError::invalid(format!(
272272+ "Delete prev CID mismatch for key {}: expected {}, got {}",
273273+ key, prev_cid, current
274274+ )));
275275+ }
276276+ }
277277+278278+ self.mst.delete(key.as_str()).await?
279279+ }
280280+ };
281281+ }
282282+283283+ old_mst.diff(&self.mst).await
284284+ }
285285+286286+ /// Apply write operations and create a commit
287287+ ///
288288+ /// Convenience method that calls `create_writes()` and `commit()`.
289289+ pub async fn apply_writes<K>(&mut self, ops: &[WriteOp], signing_key: &K) -> Result<MstDiff>
290290+ where
291291+ K: crate::commit::SigningKey,
292292+ {
293293+ let did = &self.commit.did.clone();
294294+ let cid = &self.commit_cid.clone();
295295+ let diff = self.create_writes(ops).await?;
296296+ self.commit(&did, Some(*cid), signing_key).await?;
297297+ Ok(diff)
298298+ }
299299+300300+ /// Format a commit (create signed commit + collect blocks)
301301+ ///
302302+ /// Creates signed commit and collects blocks for persistence and firehose:
303303+ /// - All MST node blocks from `mst.collect_blocks()`
304304+ /// - Commit block itself
305305+ /// - Relevant blocks for sync v1.1 (walks paths for all changed keys)
306306+ ///
307307+ /// Returns `(ops, CommitData)` - ops are needed for `to_firehose_commit()`.
308308+ pub async fn format_commit<K>(
309309+ &self,
310310+ did: &Did<'_>,
311311+ prev: Option<IpldCid>,
312312+ signing_key: &K,
313313+ ) -> Result<(Vec<crate::commit::firehose::RepoOp<'static>>, CommitData)>
314314+ where
315315+ K: crate::commit::SigningKey,
316316+ {
317317+ let rev = Ticker::new().next(Some(self.commit.rev.clone()));
318318+ let data = self.mst.root().await?;
319319+ let prev_data = *self.commit.data();
320320+321321+ // Create signed commit
322322+ let commit = Commit::new_unsigned(did.clone().into_static(), data, rev.clone(), prev)
323323+ .sign(signing_key)?;
324324+325325+ // Load previous MST to compute diff
326326+ let prev_mst = Mst::load(self.storage.clone(), prev_data, None);
327327+ let diff = prev_mst.diff(&self.mst).await?;
328328+329329+ // Collect all MST blocks for persistence
330330+ let (_root_cid, mut blocks) = self.mst.collect_blocks().await?;
331331+332332+ // Collect relevant blocks for firehose (walk paths for all changed keys)
333333+ let mut relevant_blocks = BTreeMap::new();
334334+335335+ // Walk paths for creates
336336+ for (key, _cid) in &diff.creates {
337337+ let path_cids = self.mst.cids_for_path(key.as_str()).await?;
338338+ for path_cid in path_cids {
339339+ if let Some(block) = blocks.get(&path_cid) {
340340+ relevant_blocks.insert(path_cid, block.clone());
341341+ } else if let Some(block) = self.storage.get(&path_cid).await? {
342342+ relevant_blocks.insert(path_cid, block);
343343+ }
344344+ }
345345+ }
346346+347347+ // Walk paths for updates
348348+ for (key, _new_cid, _old_cid) in &diff.updates {
349349+ let path_cids = self.mst.cids_for_path(key.as_str()).await?;
350350+ for path_cid in path_cids {
351351+ if let Some(block) = blocks.get(&path_cid) {
352352+ relevant_blocks.insert(path_cid, block.clone());
353353+ } else if let Some(block) = self.storage.get(&path_cid).await? {
354354+ relevant_blocks.insert(path_cid, block);
355355+ }
356356+ }
357357+ }
358358+359359+ // Walk paths for deletes (path may not exist in new tree, but walk as far as possible)
360360+ for (key, _old_cid) in &diff.deletes {
361361+ let path_cids = self.mst.cids_for_path(key.as_str()).await?;
362362+ for path_cid in path_cids {
363363+ if let Some(block) = blocks.get(&path_cid) {
364364+ relevant_blocks.insert(path_cid, block.clone());
365365+ } else if let Some(block) = self.storage.get(&path_cid).await? {
366366+ relevant_blocks.insert(path_cid, block);
367367+ }
368368+ }
369369+ }
370370+371371+ // Add commit block to both collections
372372+ let commit_cbor = commit.to_cbor()?;
373373+ let commit_cid = crate::mst::util::compute_cid(&commit_cbor)?;
374374+ let commit_bytes = bytes::Bytes::from(commit_cbor);
375375+ blocks.insert(commit_cid, commit_bytes.clone());
376376+ relevant_blocks.insert(commit_cid, commit_bytes);
377377+378378+ // Convert diff to repository operations
379379+ let ops = diff
380380+ .to_repo_ops()
381381+ .into_iter()
382382+ .map(|op| op.into_static())
383383+ .collect();
384384+385385+ Ok((
386386+ ops,
387387+ CommitData {
388388+ cid: commit_cid,
389389+ rev,
390390+ since: Some(self.commit.rev.clone()),
391391+ prev,
392392+ data,
393393+ prev_data: Some(prev_data),
394394+ blocks,
395395+ relevant_blocks,
396396+ },
397397+ ))
398398+ }
399399+400400+ /// Apply a commit (persist blocks to storage)
401401+ ///
402402+ /// Persists all blocks from `CommitData` and updates internal state.
403403+ pub async fn apply_commit(&mut self, commit_data: CommitData) -> Result<IpldCid> {
404404+ let commit_cid = commit_data.cid;
405405+406406+ // Persist all blocks (MST + commit)
407407+ self.storage.put_many(commit_data.blocks).await?;
408408+409409+ // Load and update internal state
410410+ let commit_bytes = self
411411+ .storage
412412+ .get(&commit_cid)
413413+ .await?
414414+ .ok_or_else(|| crate::error::RepoError::not_found("commit block", &commit_cid))?;
415415+ let commit = Commit::from_cbor(&commit_bytes)?;
416416+417417+ self.commit = commit.into_static();
418418+ self.commit_cid = commit_cid;
419419+420420+ // Reload MST from new root
421421+ self.mst = Mst::load(self.storage.clone(), *self.commit.data(), None);
422422+423423+ Ok(commit_cid)
424424+ }
425425+426426+ /// Create a commit for the current repository state
427427+ ///
428428+ /// Convenience method that calls `format_commit()` and `apply_commit()`.
429429+ pub async fn commit<K>(
430430+ &mut self,
431431+ did: &Did<'_>,
432432+ prev: Option<IpldCid>,
433433+ signing_key: &K,
434434+ ) -> Result<(Vec<crate::commit::firehose::RepoOp<'static>>, IpldCid)>
435435+ where
436436+ K: crate::commit::SigningKey,
437437+ {
438438+ let (ops, commit_data) = self.format_commit(did, prev, signing_key).await?;
439439+ Ok((ops, self.apply_commit(commit_data).await?))
440440+ }
441441+442442+ /// Export repository to CAR file
443443+ pub async fn export_car(&self, path: impl AsRef<Path>, commit_cid: IpldCid) -> Result<()> {
444444+ crate::car::export_repo_car(path, commit_cid, &self.mst).await
445445+ }
446446+447447+ /// Get the underlying MST
448448+ pub fn mst(&self) -> &Mst<S> {
449449+ &self.mst
450450+ }
451451+452452+ /// Get reference to the storage
453453+ pub fn storage(&self) -> &Arc<S> {
454454+ &self.storage
455455+ }
456456+457457+ /// Get the current commit
458458+ pub fn current_commit(&self) -> &Commit<'static> {
459459+ &self.commit
460460+ }
461461+462462+ /// Get the current commit CID
463463+ pub fn current_commit_cid(&self) -> &IpldCid {
464464+ &self.commit_cid
465465+ }
466466+467467+ /// Get the DID from the current commit
468468+ pub fn did(&self) -> &Did<'_> {
469469+ self.commit.did()
470470+ }
471471+}
472472+473473+#[cfg(test)]
474474+mod tests {
475475+ use std::str::FromStr;
476476+477477+ use super::*;
478478+ use crate::storage::MemoryBlockStore;
479479+ use jacquard_common::types::recordkey::Rkey;
480480+481481+ fn make_test_cid(value: u8) -> IpldCid {
482482+ use crate::DAG_CBOR_CID_CODEC;
483483+ use jacquard_common::types::crypto::SHA2_256;
484484+ use sha2::{Digest, Sha256};
485485+486486+ let hash = Sha256::digest(&[value]);
487487+ let mh = multihash::Multihash::wrap(SHA2_256, &hash).unwrap();
488488+ IpldCid::new_v1(DAG_CBOR_CID_CODEC, mh)
489489+ }
490490+491491+ async fn create_test_repo(storage: Arc<MemoryBlockStore>) -> Repository<MemoryBlockStore> {
492492+ let did = Did::new("did:plc:test").unwrap();
493493+ let signing_key = k256::ecdsa::SigningKey::random(&mut rand::rngs::OsRng);
494494+495495+ let mst = Mst::new(storage.clone());
496496+ let data = mst.persist().await.unwrap(); // Persist empty MST
497497+498498+ let rev = Ticker::new().next(None);
499499+ let commit = Commit::new_unsigned(did.into_static(), data, rev, None)
500500+ .sign(&signing_key)
501501+ .unwrap();
502502+503503+ let commit_cbor = commit.to_cbor().unwrap();
504504+ let commit_cid = storage.put(&commit_cbor).await.unwrap();
505505+506506+ Repository::new(storage, mst, commit.into_static(), commit_cid)
507507+ }
508508+509509+ #[tokio::test]
510510+ async fn test_create_and_get_record() {
511511+ let storage = Arc::new(MemoryBlockStore::new());
512512+ let mut repo = create_test_repo(storage.clone()).await;
513513+514514+ let collection = Nsid::new("app.bsky.feed.post").unwrap();
515515+ let rkey = RecordKey(Rkey::new("abc123").unwrap());
516516+ let cid = make_test_cid(1);
517517+518518+ repo.create_record(&collection, &rkey, cid).await.unwrap();
519519+520520+ let retrieved = repo.get_record(&collection, &rkey).await.unwrap();
521521+ assert_eq!(retrieved, Some(cid));
522522+ }
523523+524524+ #[tokio::test]
525525+ async fn test_create_duplicate_fails() {
526526+ let storage = Arc::new(MemoryBlockStore::new());
527527+ let mut repo = create_test_repo(storage).await;
528528+529529+ let collection = Nsid::new("app.bsky.feed.post").unwrap();
530530+ let rkey = RecordKey(Rkey::new("abc123").unwrap());
531531+ let cid = make_test_cid(1);
532532+533533+ repo.create_record(&collection, &rkey, cid).await.unwrap();
534534+535535+ let result = repo
536536+ .create_record(&collection, &rkey, make_test_cid(2))
537537+ .await;
538538+ assert!(result.is_err());
539539+ }
540540+541541+ #[tokio::test]
542542+ async fn test_update_record() {
543543+ let storage = Arc::new(MemoryBlockStore::new());
544544+ let mut repo = create_test_repo(storage).await;
545545+546546+ let collection = Nsid::new("app.bsky.feed.post").unwrap();
547547+ let rkey = RecordKey(Rkey::new("abc123").unwrap());
548548+ let cid1 = make_test_cid(1);
549549+ let cid2 = make_test_cid(2);
550550+551551+ repo.create_record(&collection, &rkey, cid1).await.unwrap();
552552+553553+ let old = repo.update_record(&collection, &rkey, cid2).await.unwrap();
554554+ assert_eq!(old, cid1);
555555+556556+ let retrieved = repo.get_record(&collection, &rkey).await.unwrap();
557557+ assert_eq!(retrieved, Some(cid2));
558558+ }
559559+560560+ #[tokio::test]
561561+ async fn test_update_nonexistent_fails() {
562562+ let storage = Arc::new(MemoryBlockStore::new());
563563+ let mut repo = create_test_repo(storage).await;
564564+565565+ let collection = Nsid::new("app.bsky.feed.post").unwrap();
566566+ let rkey = RecordKey(Rkey::new("abc123").unwrap());
567567+ let cid = make_test_cid(1);
568568+569569+ let result = repo.update_record(&collection, &rkey, cid).await;
570570+ assert!(result.is_err());
571571+ }
572572+573573+ #[tokio::test]
574574+ async fn test_delete_record() {
575575+ let storage = Arc::new(MemoryBlockStore::new());
576576+ let mut repo = create_test_repo(storage).await;
577577+578578+ let collection = Nsid::new("app.bsky.feed.post").unwrap();
579579+ let rkey = RecordKey(Rkey::new("abc123").unwrap());
580580+ let cid = make_test_cid(1);
581581+582582+ repo.create_record(&collection, &rkey, cid).await.unwrap();
583583+584584+ let deleted = repo.delete_record(&collection, &rkey).await.unwrap();
585585+ assert_eq!(deleted, cid);
586586+587587+ let retrieved = repo.get_record(&collection, &rkey).await.unwrap();
588588+ assert_eq!(retrieved, None);
589589+ }
590590+591591+ #[tokio::test]
592592+ async fn test_delete_nonexistent_fails() {
593593+ let storage = Arc::new(MemoryBlockStore::new());
594594+ let mut repo = create_test_repo(storage).await;
595595+596596+ let collection = Nsid::new("app.bsky.feed.post").unwrap();
597597+ let rkey = RecordKey(Rkey::new("abc123").unwrap());
598598+599599+ let result = repo.delete_record(&collection, &rkey).await;
600600+ assert!(result.is_err());
601601+ }
602602+603603+ #[tokio::test]
604604+ async fn test_apply_writes() {
605605+ let storage = Arc::new(MemoryBlockStore::new());
606606+ let mut repo = create_test_repo(storage).await;
607607+608608+ let ops = vec![
609609+ WriteOp::Create {
610610+ key: "app.bsky.feed.post/abc123".into(),
611611+ cid: make_test_cid(1),
612612+ },
613613+ WriteOp::Create {
614614+ key: "app.bsky.feed.post/def456".into(),
615615+ cid: make_test_cid(2),
616616+ },
617617+ ];
618618+619619+ let diff = repo.create_writes(&ops).await.unwrap();
620620+ assert_eq!(diff.creates.len(), 2);
621621+ assert_eq!(diff.updates.len(), 0);
622622+ assert_eq!(diff.deletes.len(), 0);
623623+ }
624624+625625+ #[tokio::test]
626626+ async fn test_from_commit() {
627627+ let storage = Arc::new(MemoryBlockStore::new());
628628+ let mut repo = create_test_repo(storage.clone()).await;
629629+630630+ let collection = Nsid::new("app.bsky.feed.post").unwrap();
631631+ let rkey = RecordKey(Rkey::new("abc123").unwrap());
632632+ let cid = make_test_cid(1);
633633+634634+ repo.create_record(&collection, &rkey, cid).await.unwrap();
635635+636636+ // Persist MST
637637+ repo.mst.persist().await.unwrap();
638638+639639+ // Create commit (need a signing key for this test)
640640+ let did = Did::new("did:plc:test").unwrap();
641641+ let signing_key = k256::ecdsa::SigningKey::random(&mut rand::rngs::OsRng);
642642+ let (_, commit_cid) = repo.commit(&did, None, &signing_key).await.unwrap();
643643+644644+ // Load from commit
645645+ let loaded_repo = Repository::from_commit(storage, &commit_cid).await.unwrap();
646646+647647+ let retrieved = loaded_repo.get_record(&collection, &rkey).await.unwrap();
648648+ assert_eq!(retrieved, Some(cid));
649649+ }
650650+651651+ #[tokio::test]
652652+ async fn test_commit_creates_valid_commit() {
653653+ let storage = Arc::new(MemoryBlockStore::new());
654654+ let mut repo = create_test_repo(storage.clone()).await;
655655+656656+ let collection = Nsid::new("app.bsky.feed.post").unwrap();
657657+ let rkey = RecordKey(Rkey::new("abc123").unwrap());
658658+ let cid = make_test_cid(1);
659659+660660+ repo.create_record(&collection, &rkey, cid).await.unwrap();
661661+ repo.mst.persist().await.unwrap();
662662+663663+ let did = Did::new("did:plc:test").unwrap();
664664+ let signing_key = k256::ecdsa::SigningKey::random(&mut rand::rngs::OsRng);
665665+ let (_, commit_cid) = repo.commit(&did, None, &signing_key).await.unwrap();
666666+667667+ // Verify commit was stored
668668+ let commit_bytes = storage.get(&commit_cid).await.unwrap();
669669+ assert!(commit_bytes.is_some());
670670+671671+ // Verify commit can be deserialized
672672+ let bytes = commit_bytes.unwrap();
673673+ let commit = Commit::from_cbor(&bytes).unwrap();
674674+ assert_eq!(commit.did().as_ref(), did.as_ref());
675675+ let root_cid = repo.mst.root().await.unwrap();
676676+ assert_eq!(commit.data(), &root_cid);
677677+ }
678678+679679+ #[tokio::test]
680680+ async fn test_sequential_operations() {
681681+ let storage = Arc::new(MemoryBlockStore::new());
682682+ let mut repo = create_test_repo(storage.clone()).await;
683683+684684+ let collection = Nsid::new("app.bsky.feed.post").unwrap();
685685+ let rkey = RecordKey(Rkey::new("test1").unwrap());
686686+ let cid1 = make_test_cid(1);
687687+ let cid2 = make_test_cid(2);
688688+689689+ // Create
690690+ repo.create_record(&collection, &rkey, cid1).await.unwrap();
691691+ let got = repo.get_record(&collection, &rkey).await.unwrap();
692692+ assert_eq!(got, Some(cid1));
693693+694694+ // Update
695695+ let old = repo.update_record(&collection, &rkey, cid2).await.unwrap();
696696+ assert_eq!(old, cid1);
697697+ let got = repo.get_record(&collection, &rkey).await.unwrap();
698698+ assert_eq!(got, Some(cid2));
699699+700700+ // Delete
701701+ let deleted = repo.delete_record(&collection, &rkey).await.unwrap();
702702+ assert_eq!(deleted, cid2);
703703+ let got = repo.get_record(&collection, &rkey).await.unwrap();
704704+ assert!(got.is_none());
705705+ }
706706+707707+ #[tokio::test]
708708+ async fn test_large_scale_operations() {
709709+ let storage = Arc::new(MemoryBlockStore::new());
710710+ let mut repo = create_test_repo(storage.clone()).await;
711711+712712+ let collection = Nsid::new("app.bsky.feed.post").unwrap();
713713+ let mut ticker = Ticker::new();
714714+715715+ // Add 100 records
716716+ let mut records = Vec::new();
717717+ for i in 0..100 {
718718+ let tid_str = ticker.next(None).into_static();
719719+ let rkey = RecordKey(Rkey::from_str(tid_str.as_str()).unwrap());
720720+ let cid = make_test_cid((i % 256) as u8);
721721+ repo.create_record(&collection, &rkey, cid).await.unwrap();
722722+ records.push((rkey, cid));
723723+ }
724724+725725+ // Verify all records exist
726726+ for (rkey, cid) in &records {
727727+ let got = repo.get_record(&collection, rkey).await.unwrap();
728728+ assert_eq!(got, Some(*cid));
729729+ }
730730+731731+ // Update first 20 records
732732+ for i in 0..20 {
733733+ let (rkey, _old_cid) = &records[i];
734734+ let new_cid = make_test_cid(((i + 100) % 256) as u8);
735735+ repo.update_record(&collection, rkey, new_cid)
736736+ .await
737737+ .unwrap();
738738+739739+ let got = repo.get_record(&collection, rkey).await.unwrap();
740740+ assert_eq!(got, Some(new_cid));
741741+ }
742742+743743+ // Delete last 20 records
744744+ for i in 80..100 {
745745+ let (rkey, cid) = &records[i];
746746+ let deleted = repo.delete_record(&collection, rkey).await.unwrap();
747747+ assert_eq!(deleted, *cid);
748748+749749+ let got = repo.get_record(&collection, rkey).await.unwrap();
750750+ assert!(got.is_none());
751751+ }
752752+ }
753753+754754+ #[tokio::test]
755755+ async fn test_commit_signature_verification() {
756756+ use jacquard_common::types::crypto::{KeyCodec, PublicKey};
757757+758758+ let storage = Arc::new(MemoryBlockStore::new());
759759+ let mut repo = create_test_repo(storage.clone()).await;
760760+761761+ let collection = Nsid::new("app.bsky.feed.post").unwrap();
762762+ let rkey = RecordKey(Rkey::new("abc123").unwrap());
763763+ let cid = make_test_cid(1);
764764+765765+ repo.create_record(&collection, &rkey, cid).await.unwrap();
766766+ repo.mst.persist().await.unwrap();
767767+768768+ let did = Did::new("did:plc:test").unwrap();
769769+ let signing_key = k256::ecdsa::SigningKey::random(&mut rand::rngs::OsRng);
770770+771771+ // Get public key from signing key
772772+ let verifying_key = signing_key.verifying_key();
773773+ let pubkey_bytes = verifying_key.to_encoded_point(true).as_bytes().to_vec();
774774+ let pubkey = PublicKey {
775775+ codec: KeyCodec::Secp256k1,
776776+ bytes: pubkey_bytes.into(),
777777+ };
778778+779779+ let (_, commit_cid) = repo.commit(&did, None, &signing_key).await.unwrap();
780780+781781+ // Load commit and verify signature
782782+ let commit_bytes = storage.get(&commit_cid).await.unwrap().unwrap();
783783+ let commit = Commit::from_cbor(&commit_bytes).unwrap();
784784+785785+ // Signature verification should succeed
786786+ commit.verify(&pubkey).unwrap();
787787+ }
788788+789789+ #[tokio::test]
790790+ async fn test_load_from_storage_with_multiple_commits() {
791791+ let storage = Arc::new(MemoryBlockStore::new());
792792+ let mut repo = create_test_repo(storage.clone()).await;
793793+794794+ let collection = Nsid::new("app.bsky.feed.post").unwrap();
795795+ let did = Did::new("did:plc:test").unwrap();
796796+ let signing_key = k256::ecdsa::SigningKey::random(&mut rand::rngs::OsRng);
797797+798798+ // Add some records and commit
799799+ let mut records = Vec::new();
800800+ for i in 0..10 {
801801+ let rkey = RecordKey(Rkey::from_str(&format!("record{}", i)).unwrap());
802802+ let cid = make_test_cid(i as u8);
803803+ repo.create_record(&collection, &rkey, cid).await.unwrap();
804804+ records.push((rkey, cid));
805805+ }
806806+ repo.mst.persist().await.unwrap();
807807+ let (_, commit_cid) = repo
808808+ .commit(&did, Some(repo.current_commit_cid().clone()), &signing_key)
809809+ .await
810810+ .unwrap();
811811+812812+ // Load repository from storage
813813+ let loaded_repo = Repository::from_commit(storage.clone(), &commit_cid)
814814+ .await
815815+ .unwrap();
816816+817817+ // Verify all records are accessible
818818+ for (rkey, cid) in &records {
819819+ let got = loaded_repo.get_record(&collection, rkey).await.unwrap();
820820+ assert_eq!(got, Some(*cid));
821821+ }
822822+823823+ // Verify metadata matches
824824+ assert_eq!(loaded_repo.did().as_ref(), did.as_ref());
825825+ assert_eq!(loaded_repo.current_commit().version, 3);
826826+ assert_eq!(loaded_repo.current_commit_cid(), &commit_cid);
827827+ }
    /// End-to-end check of `create_writes` applying a mixed batch — one
    /// create, one update, and one delete — in a single diff, then verifies
    /// the resulting record state (untouched records remain intact).
    #[tokio::test]
    async fn test_batch_mixed_operations() {
        let storage = Arc::new(MemoryBlockStore::new());
        let mut repo = create_test_repo(storage.clone()).await;

        let collection = Nsid::new("app.bsky.feed.post").unwrap();

        // Pre-populate with some records
        let rkey1 = RecordKey(Rkey::new("existing1").unwrap());
        let rkey2 = RecordKey(Rkey::new("existing2").unwrap());
        let rkey3 = RecordKey(Rkey::new("existing3").unwrap());
        repo.create_record(&collection, &rkey1, make_test_cid(1))
            .await
            .unwrap();
        repo.create_record(&collection, &rkey2, make_test_cid(2))
            .await
            .unwrap();
        repo.create_record(&collection, &rkey3, make_test_cid(3))
            .await
            .unwrap();

        // Batch operation: create new, update existing, delete existing.
        // Keys are full MST paths of the form "<collection>/<rkey>".
        // `prev: None` skips compare-and-swap checks on update/delete.
        let ops = vec![
            WriteOp::Create {
                key: format!("{}/{}", collection.as_ref(), "new1").into(),
                cid: make_test_cid(10),
            },
            WriteOp::Update {
                key: format!("{}/{}", collection.as_ref(), "existing1").into(),
                cid: make_test_cid(11),
                prev: None,
            },
            WriteOp::Delete {
                key: format!("{}/{}", collection.as_ref(), "existing2").into(),
                prev: None,
            },
        ];

        // The returned diff should bucket each op into its own category.
        let diff = repo.create_writes(&ops).await.unwrap();
        assert_eq!(diff.creates.len(), 1);
        assert_eq!(diff.updates.len(), 1);
        assert_eq!(diff.deletes.len(), 1);

        // Verify final state: new record exists, updated record has the new
        // CID, deleted record is gone, untouched record is unchanged.
        let new_rkey = RecordKey(Rkey::new("new1").unwrap());
        assert_eq!(
            repo.get_record(&collection, &new_rkey).await.unwrap(),
            Some(make_test_cid(10))
        );
        assert_eq!(
            repo.get_record(&collection, &rkey1).await.unwrap(),
            Some(make_test_cid(11))
        );
        assert_eq!(repo.get_record(&collection, &rkey2).await.unwrap(), None);
        assert_eq!(
            repo.get_record(&collection, &rkey3).await.unwrap(),
            Some(make_test_cid(3))
        );
    }
888888+}
+276
crates/jacquard-repo/src/storage/file.rs
···11+//! CAR file-backed block storage
22+33+use std::collections::BTreeMap;
44+use std::path::PathBuf;
55+use std::sync::{Arc, RwLock};
66+77+use bytes::Bytes;
88+use cid::Cid as IpldCid;
99+1010+use crate::error::Result;
1111+use crate::storage::BlockStore;
/// CAR file-backed block storage
///
/// Loads entire CAR file into memory on construction, writes back on flush.
/// For very large CAR files, consider database-backed storage instead.
///
/// Primarily useful for testing and simple file-based persistence.
#[derive(Debug, Clone)]
pub struct FileBlockStore {
    /// Target CAR file path; only touched by `load()` and `flush()`.
    path: PathBuf,
    /// In-memory block map (CID -> raw bytes), shared across clones.
    blocks: Arc<RwLock<BTreeMap<IpldCid, Bytes>>>,
    /// CAR roots (typically the latest commit CID for a repository CAR).
    roots: Arc<RwLock<Vec<IpldCid>>>,
    /// True when in-memory state has diverged from the file on disk.
    dirty: Arc<RwLock<bool>>,
}
2626+2727+impl FileBlockStore {
2828+ /// Load from existing CAR file
2929+ pub async fn load(path: impl Into<PathBuf>) -> Result<Self> {
3030+ let path = path.into();
3131+3232+ // Read header to get roots
3333+ let roots = crate::car::read_car_header(&path).await?;
3434+3535+ // Read all blocks
3636+ let blocks = crate::car::read_car(&path).await?;
3737+3838+ Ok(Self {
3939+ path,
4040+ blocks: Arc::new(RwLock::new(blocks)),
4141+ roots: Arc::new(RwLock::new(roots)),
4242+ dirty: Arc::new(RwLock::new(false)),
4343+ })
4444+ }
4545+4646+ /// Create new CAR file storage (empty)
4747+ ///
4848+ /// Creates an empty in-memory storage that will write to the given path
4949+ /// when `flush()` is called.
5050+ ///
5151+ /// The file is not created until the first flush.
5252+ pub fn new(path: impl Into<PathBuf>) -> Self {
5353+ Self {
5454+ path: path.into(),
5555+ blocks: Arc::new(RwLock::new(BTreeMap::new())),
5656+ roots: Arc::new(RwLock::new(Vec::new())),
5757+ dirty: Arc::new(RwLock::new(false)),
5858+ }
5959+ }
6060+6161+ /// Get the CAR file roots
6262+ ///
6363+ /// In a repository CAR file, roots typically contain the commit CID(s).
6464+ pub fn roots(&self) -> Vec<IpldCid> {
6565+ self.roots.read().unwrap().clone()
6666+ }
6767+6868+ /// Set the CAR file roots (for writing)
6969+ ///
7070+ /// This marks the storage as dirty. Call `flush()` to persist the change.
7171+ pub fn set_roots(&self, new_roots: Vec<IpldCid>) {
7272+ *self.roots.write().unwrap() = new_roots;
7373+ *self.dirty.write().unwrap() = true;
7474+ }
7575+7676+ /// Write blocks back to CAR file if dirty
7777+ ///
7878+ /// This is an async operation that writes the entire block store to the
7979+ /// CAR file. Only writes if there have been changes since the last flush.
8080+ ///
8181+ /// # Errors
8282+ ///
8383+ /// Returns an error if the CAR file cannot be written.
8484+ pub async fn flush(&self) -> Result<()> {
8585+ if !*self.dirty.read().unwrap() {
8686+ return Ok(());
8787+ }
8888+8989+ let blocks = self.blocks.read().unwrap().clone();
9090+ let roots = self.roots.read().unwrap().clone();
9191+ crate::car::write_car(&self.path, roots, blocks).await?;
9292+9393+ *self.dirty.write().unwrap() = false;
9494+ Ok(())
9595+ }
9696+9797+ /// Check if store has unflushed changes
9898+ pub fn is_dirty(&self) -> bool {
9999+ *self.dirty.read().unwrap()
100100+ }
101101+102102+ /// Get the path to the CAR file
103103+ pub fn path(&self) -> &std::path::Path {
104104+ &self.path
105105+ }
106106+}
107107+108108+impl BlockStore for FileBlockStore {
109109+ async fn get(&self, cid: &IpldCid) -> Result<Option<Bytes>> {
110110+ Ok(self.blocks.read().unwrap().get(cid).cloned())
111111+ }
112112+113113+ async fn put(&self, data: &[u8]) -> Result<IpldCid> {
114114+ let cid = crate::mst::util::compute_cid(data)?;
115115+ self.blocks
116116+ .write()
117117+ .unwrap()
118118+ .insert(cid, Bytes::copy_from_slice(data));
119119+ *self.dirty.write().unwrap() = true;
120120+ Ok(cid)
121121+ }
122122+123123+ async fn has(&self, cid: &IpldCid) -> Result<bool> {
124124+ Ok(self.blocks.read().unwrap().contains_key(cid))
125125+ }
126126+127127+ async fn put_many(
128128+ &self,
129129+ blocks: impl IntoIterator<Item = (IpldCid, Bytes)> + Send,
130130+ ) -> Result<()> {
131131+ let mut store = self.blocks.write().unwrap();
132132+ for (cid, data) in blocks {
133133+ store.insert(cid, data);
134134+ }
135135+ *self.dirty.write().unwrap() = true;
136136+ Ok(())
137137+ }
138138+139139+ async fn get_many(&self, cids: &[IpldCid]) -> Result<Vec<Option<Bytes>>> {
140140+ let store = self.blocks.read().unwrap();
141141+ let mut results = Vec::with_capacity(cids.len());
142142+ for cid in cids {
143143+ results.push(store.get(cid).cloned());
144144+ }
145145+ Ok(results)
146146+ }
147147+}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::DAG_CBOR_CID_CODEC;
    use jacquard_common::types::crypto::SHA2_256;
    use tempfile::NamedTempFile;

    /// Build a deterministic CID from a repeated byte (the digest is faked,
    /// not actually the hash of any data — fine for storage-keying tests).
    fn test_cid(n: u8) -> IpldCid {
        let data = vec![n; 32];
        let mh = multihash::Multihash::wrap(SHA2_256, &data).unwrap();
        IpldCid::new_v1(DAG_CBOR_CID_CODEC, mh)
    }

    /// A freshly created store is empty, clean, and remembers its path.
    #[tokio::test]
    async fn test_new_empty_store() {
        let temp_file = NamedTempFile::new().unwrap();
        let storage = FileBlockStore::new(temp_file.path());

        assert!(storage.roots().is_empty());
        assert!(!storage.is_dirty());
        assert_eq!(storage.path(), temp_file.path());
    }

    /// `put` stores a block retrievable by CID and marks the store dirty.
    #[tokio::test]
    async fn test_put_and_get() {
        let temp_file = NamedTempFile::new().unwrap();
        let storage = FileBlockStore::new(temp_file.path());

        let data = b"test data";
        let cid = storage.put(data).await.unwrap();

        assert!(storage.is_dirty());

        let retrieved = storage.get(&cid).await.unwrap().unwrap();
        assert_eq!(retrieved.as_ref(), data);
    }

    /// `has` reports presence for stored CIDs and absence for unknown ones.
    #[tokio::test]
    async fn test_has() {
        let temp_file = NamedTempFile::new().unwrap();
        let storage = FileBlockStore::new(temp_file.path());

        let data = b"test data";
        let cid = storage.put(data).await.unwrap();

        assert!(storage.has(&cid).await.unwrap());
        assert!(!storage.has(&test_cid(99)).await.unwrap());
    }

    /// Flush persists blocks + roots to the CAR file; `load` round-trips them.
    #[tokio::test]
    async fn test_flush_and_reload() {
        let temp_file = NamedTempFile::new().unwrap();

        // Create store, add data, flush
        let storage = FileBlockStore::new(temp_file.path());
        let data1 = b"test data 1";
        let data2 = b"test data 2";
        let cid1 = storage.put(data1).await.unwrap();
        let cid2 = storage.put(data2).await.unwrap();

        storage.set_roots(vec![cid1]);
        assert!(storage.is_dirty());

        storage.flush().await.unwrap();
        assert!(!storage.is_dirty());

        // Reload from file
        let storage2 = FileBlockStore::load(temp_file.path()).await.unwrap();

        assert_eq!(storage2.roots(), vec![cid1]);
        assert_eq!(storage2.get(&cid1).await.unwrap().unwrap().as_ref(), data1);
        assert_eq!(storage2.get(&cid2).await.unwrap().unwrap().as_ref(), data2);
        assert!(!storage2.is_dirty());
    }

    /// `put_many` inserts caller-supplied (CID, bytes) pairs and marks dirty.
    #[tokio::test]
    async fn test_put_many() {
        let temp_file = NamedTempFile::new().unwrap();
        let storage = FileBlockStore::new(temp_file.path());

        let data1 = Bytes::from_static(b"data 1");
        let data2 = Bytes::from_static(b"data 2");
        let cid1 = test_cid(1);
        let cid2 = test_cid(2);

        storage
            .put_many(vec![(cid1, data1.clone()), (cid2, data2.clone())])
            .await
            .unwrap();

        assert!(storage.is_dirty());
        assert_eq!(storage.get(&cid1).await.unwrap().unwrap(), data1);
        assert_eq!(storage.get(&cid2).await.unwrap().unwrap(), data2);
    }

    /// `get_many` preserves input order and yields `None` for missing blocks.
    #[tokio::test]
    async fn test_get_many() {
        let temp_file = NamedTempFile::new().unwrap();
        let storage = FileBlockStore::new(temp_file.path());

        let data1 = b"data 1";
        let data2 = b"data 2";
        let cid1 = storage.put(data1).await.unwrap();
        let cid2 = storage.put(data2).await.unwrap();
        let cid3 = test_cid(99); // Non-existent

        let results = storage.get_many(&[cid1, cid2, cid3]).await.unwrap();

        assert_eq!(results.len(), 3);
        assert_eq!(results[0].as_ref().unwrap().as_ref(), data1);
        assert_eq!(results[1].as_ref().unwrap().as_ref(), data2);
        assert!(results[2].is_none());
    }

    /// `set_roots` alone dirties the store; a flush cleans it again.
    #[tokio::test]
    async fn test_set_roots_marks_dirty() {
        let temp_file = NamedTempFile::new().unwrap();
        let storage = FileBlockStore::new(temp_file.path());

        assert!(!storage.is_dirty());

        storage.set_roots(vec![test_cid(1)]);
        assert!(storage.is_dirty());

        storage.flush().await.unwrap();
        assert!(!storage.is_dirty());
    }
}
+210
crates/jacquard-repo/src/storage/layered.rs
···11+//! Layered block storage for efficient firehose validation
22+//!
33+//! Provides a two-layer storage that reads from a writable layer first,
44+//! then falls back to a read-only base layer. All writes go to the writable layer.
55+//!
66+//! This is used for firehose validation to avoid copying the entire previous MST tree.
77+88+use crate::error::Result;
99+use crate::storage::BlockStore;
1010+use bytes::Bytes;
1111+use cid::Cid as IpldCid;
/// Layered block storage with a writable overlay and read-only base
///
/// Reads check the writable layer first, then fall back to the base layer.
/// All writes go to the writable layer only.
///
/// # Use Case
///
/// Firehose validation needs to:
/// 1. Load previous MST state from existing storage (base layer)
/// 2. Apply new operations that create new MST nodes (writable layer)
/// 3. Not pollute the base storage with temporary validation blocks
///
/// Without layering, we'd need to copy all previous MST blocks to temporary storage.
/// With layering, we just overlay temp storage on top of base storage.
///
/// # Example
///
/// ```rust,ignore
/// use jacquard_repo::storage::{LayeredBlockStore, MemoryBlockStore};
/// use std::sync::Arc;
///
/// # async fn example() -> Result<(), Box<dyn std::error::Error>> {
/// let base = Arc::new(MemoryBlockStore::new()); // existing repo storage
/// let writable = MemoryBlockStore::new();       // temp storage for validation
///
/// let layered = LayeredBlockStore::new(writable, base);
///
/// // Reads check writable first, then base
/// // Writes only go to writable
/// # Ok(())
/// # }
/// ```
#[derive(Clone)]
pub struct LayeredBlockStore<W: BlockStore, B: BlockStore> {
    /// Top layer: receives every write, consulted first on reads.
    writable: W,
    /// Bottom layer: read-only fallback, Arc-shared to avoid deep clones.
    base: std::sync::Arc<B>,
}
5050+5151+impl<W: BlockStore, B: BlockStore> LayeredBlockStore<W, B> {
5252+ /// Create a new layered storage
5353+ ///
5454+ /// - `writable`: Top layer receiving all writes
5555+ /// - `base`: Bottom layer for fallback reads (read-only, Arc-wrapped to avoid cloning)
5656+ pub fn new(writable: W, base: std::sync::Arc<B>) -> Self {
5757+ Self { writable, base }
5858+ }
5959+6060+ /// Get reference to the writable layer
6161+ pub fn writable(&self) -> &W {
6262+ &self.writable
6363+ }
6464+6565+ /// Get reference to the base layer
6666+ pub fn base(&self) -> &std::sync::Arc<B> {
6767+ &self.base
6868+ }
6969+}
7070+7171+impl<W: BlockStore + Sync + 'static, B: BlockStore + Sync + 'static> BlockStore
7272+ for LayeredBlockStore<W, B>
7373+{
7474+ async fn get(&self, cid: &IpldCid) -> Result<Option<Bytes>> {
7575+ // Check writable layer first
7676+ if let Some(data) = self.writable.get(cid).await? {
7777+ return Ok(Some(data));
7878+ }
7979+8080+ // Fall back to base layer
8181+ self.base.get(cid).await
8282+ }
8383+8484+ async fn put(&self, data: &[u8]) -> Result<IpldCid> {
8585+ // All writes go to writable layer
8686+ self.writable.put(data).await
8787+ }
8888+8989+ async fn has(&self, cid: &IpldCid) -> Result<bool> {
9090+ // Check writable first
9191+ if self.writable.has(cid).await? {
9292+ return Ok(true);
9393+ }
9494+9595+ // Fall back to base
9696+ self.base.has(cid).await
9797+ }
9898+9999+ async fn put_many(&self, blocks: impl IntoIterator<Item = (IpldCid, Bytes)> + Send) -> Result<()> {
100100+ // All writes go to writable layer
101101+ self.writable.put_many(blocks).await
102102+ }
103103+104104+ async fn get_many(&self, cids: &[IpldCid]) -> Result<Vec<Option<Bytes>>> {
105105+ let mut results = Vec::with_capacity(cids.len());
106106+107107+ for cid in cids {
108108+ results.push(self.get(cid).await?);
109109+ }
110110+111111+ Ok(results)
112112+ }
113113+}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::storage::MemoryBlockStore;

    /// A block present only in the overlay is served from the overlay.
    #[tokio::test]
    async fn test_layered_read_from_writable() {
        let base = std::sync::Arc::new(MemoryBlockStore::new());
        let writable = MemoryBlockStore::new();

        // Put data in writable layer
        let cid = writable.put(b"test data").await.unwrap();

        let layered = LayeredBlockStore::new(writable, base);

        // Should read from writable layer
        let data = layered.get(&cid).await.unwrap();
        assert_eq!(&*data.unwrap(), b"test data");
    }

    /// A block present only in the base is found via the fallback path.
    #[tokio::test]
    async fn test_layered_fallback_to_base() {
        let base = std::sync::Arc::new(MemoryBlockStore::new());
        let writable = MemoryBlockStore::new();

        // Put data in base layer
        let cid = base.put(b"base data").await.unwrap();

        let layered = LayeredBlockStore::new(writable, base);

        // Should fall back to base layer
        let data = layered.get(&cid).await.unwrap();
        assert_eq!(&*data.unwrap(), b"base data");
    }

    /// Distinct blocks in each layer resolve through the correct layer.
    ///
    /// NOTE(review): content addressing means identical bytes always get the
    /// same CID, so a true "overlay shadows base" conflict cannot be built via
    /// `put`; this test instead checks layer routing for distinct CIDs.
    #[tokio::test]
    async fn test_layered_writable_overrides_base() {
        let base = std::sync::Arc::new(MemoryBlockStore::new());
        let writable = MemoryBlockStore::new();

        // Same content in both layers collapses to the same CID.
        let cid = base.put(b"original").await.unwrap();
        let cid2 = writable.put(b"original").await.unwrap();
        assert_eq!(cid, cid2); // Same content = same CID

        // Put different data in each layer
        let base_cid = base.put(b"base content").await.unwrap();
        let writable_cid = writable.put(b"writable content").await.unwrap();

        let layered = LayeredBlockStore::new(writable, base);

        // Should get writable content for writable CID
        let data1 = layered.get(&writable_cid).await.unwrap().unwrap();
        assert_eq!(&*data1, b"writable content");

        // Should get base content for base CID
        let data2 = layered.get(&base_cid).await.unwrap().unwrap();
        assert_eq!(&*data2, b"base content");
    }

    /// Writes through the layered store land only in the overlay layer.
    #[tokio::test]
    async fn test_layered_writes_to_writable_only() {
        let base = std::sync::Arc::new(MemoryBlockStore::new());
        let writable = MemoryBlockStore::new();

        let layered = LayeredBlockStore::new(writable.clone(), base.clone());

        // Write through layered storage
        let cid = layered.put(b"new data").await.unwrap();

        // Should be in writable layer
        assert!(writable.has(&cid).await.unwrap());

        // Should NOT be in base layer
        assert!(!base.has(&cid).await.unwrap());
    }

    /// `has` consults both layers.
    #[tokio::test]
    async fn test_layered_has_checks_both_layers() {
        let base = std::sync::Arc::new(MemoryBlockStore::new());
        let writable = MemoryBlockStore::new();

        let base_cid = base.put(b"base").await.unwrap();
        let writable_cid = writable.put(b"writable").await.unwrap();

        let layered = LayeredBlockStore::new(writable, base);

        // Should find in both layers
        assert!(layered.has(&base_cid).await.unwrap());
        assert!(layered.has(&writable_cid).await.unwrap());
    }
}
+210
crates/jacquard-repo/src/storage/memory.rs
···11+//! In-memory block storage implementation
22+33+use crate::error::Result;
44+use crate::storage::BlockStore;
55+use bytes::Bytes;
66+use cid::Cid as IpldCid;
77+use std::collections::BTreeMap;
88+use std::sync::{Arc, RwLock};
/// In-memory block storage using BTreeMap
///
/// Useful for:
/// - Testing
/// - Temporary operations
/// - Small repositories that fit in memory
///
/// Uses `Bytes` for efficient reference-counted storage with cheap cloning.
/// Clones of the store share the same underlying map (see `Arc` below).
///
/// # Example
///
/// ```rust,ignore
/// use jacquard_repo::storage::{BlockStore, MemoryBlockStore};
///
/// # async fn example() -> Result<(), Box<dyn std::error::Error>> {
/// let storage = MemoryBlockStore::new();
///
/// let data = b"hello world";
/// let cid = storage.put(data).await?;
///
/// let retrieved = storage.get(&cid).await?;
/// assert_eq!(retrieved.as_deref(), Some(&data[..]));
/// # Ok(())
/// # }
/// ```
#[derive(Debug, Clone)]
pub struct MemoryBlockStore {
    /// CID-keyed block map; Arc-shared so clones observe the same data.
    blocks: Arc<RwLock<BTreeMap<IpldCid, Bytes>>>,
}
3939+4040+impl MemoryBlockStore {
4141+ /// Create new empty memory store
4242+ pub fn new() -> Self {
4343+ Self {
4444+ blocks: Arc::new(RwLock::new(BTreeMap::new())),
4545+ }
4646+ }
4747+4848+ /// Create new memory store from a map of blocks
4949+ pub fn new_from_blocks(blocks: BTreeMap<IpldCid, Bytes>) -> Self {
5050+ Self {
5151+ blocks: Arc::new(RwLock::new(blocks)),
5252+ }
5353+ }
5454+5555+ /// Get number of blocks stored
5656+ pub fn len(&self) -> usize {
5757+ self.blocks.read().unwrap().len()
5858+ }
5959+6060+ /// Check if store is empty
6161+ pub fn is_empty(&self) -> bool {
6262+ self.blocks.read().unwrap().is_empty()
6363+ }
6464+6565+ /// Clear all blocks
6666+ pub fn clear(&self) {
6767+ self.blocks.write().unwrap().clear();
6868+ }
6969+7070+ /// Put a block with a pre-computed CID (for testing)
7171+ ///
7272+ /// # Note
7373+ ///
7474+ /// This bypasses CID verification. Only use for testing.
7575+ #[cfg(test)]
7676+ pub(crate) async fn put_with_cid(&self, cid: IpldCid, data: impl Into<Bytes>) -> Result<()> {
7777+ self.blocks.write().unwrap().insert(cid, data.into());
7878+ Ok(())
7979+ }
8080+}
8181+8282+impl Default for MemoryBlockStore {
8383+ fn default() -> Self {
8484+ Self::new()
8585+ }
8686+}
8787+8888+impl BlockStore for MemoryBlockStore {
8989+ async fn get(&self, cid: &IpldCid) -> Result<Option<Bytes>> {
9090+ Ok(self.blocks.read().unwrap().get(cid).cloned())
9191+ }
9292+9393+ async fn put(&self, data: &[u8]) -> Result<IpldCid> {
9494+ let cid = crate::mst::util::compute_cid(data)?;
9595+ self.blocks
9696+ .write()
9797+ .unwrap()
9898+ .insert(cid, Bytes::copy_from_slice(data));
9999+ Ok(cid)
100100+ }
101101+102102+ async fn has(&self, cid: &IpldCid) -> Result<bool> {
103103+ Ok(self.blocks.read().unwrap().contains_key(cid))
104104+ }
105105+106106+ async fn put_many(&self, blocks: impl IntoIterator<Item = (IpldCid, Bytes)> + Send) -> Result<()> {
107107+ let mut store = self.blocks.write().unwrap();
108108+ for (cid, data) in blocks {
109109+ store.insert(cid, data);
110110+ }
111111+ Ok(())
112112+ }
113113+114114+ async fn get_many(&self, cids: &[IpldCid]) -> Result<Vec<Option<Bytes>>> {
115115+ let store = self.blocks.read().unwrap();
116116+ let mut results = Vec::with_capacity(cids.len());
117117+ for cid in cids {
118118+ results.push(store.get(cid).cloned());
119119+ }
120120+ Ok(results)
121121+ }
122122+}
#[cfg(test)]
mod tests {
    use super::*;

    /// A stored block can be read back byte-for-byte by its CID.
    #[tokio::test]
    async fn test_put_and_get() {
        let store = MemoryBlockStore::new();
        let data = b"test data";

        let cid = store.put(data).await.unwrap();
        let retrieved = store.get(&cid).await.unwrap();

        assert_eq!(retrieved.as_deref(), Some(&data[..]));
    }

    /// `has` reports presence for stored CIDs and absence for unknown ones.
    #[tokio::test]
    async fn test_has() {
        let store = MemoryBlockStore::new();
        let data = b"test data";

        let cid = store.put(data).await.unwrap();
        assert!(store.has(&cid).await.unwrap());

        let fake_cid = IpldCid::default();
        assert!(!store.has(&fake_cid).await.unwrap());
    }

    /// `put_many` inserts all supplied (CID, bytes) pairs.
    #[tokio::test]
    async fn test_put_many() {
        let store = MemoryBlockStore::new();

        let data1 = b"data1";
        let data2 = b"data2";
        let cid1 = crate::mst::util::compute_cid(data1).unwrap();
        let cid2 = crate::mst::util::compute_cid(data2).unwrap();

        store
            .put_many(vec![
                (cid1, Bytes::from_static(data1)),
                (cid2, Bytes::from_static(data2)),
            ])
            .await
            .unwrap();

        assert_eq!(store.len(), 2);
        assert!(store.has(&cid1).await.unwrap());
        assert!(store.has(&cid2).await.unwrap());
    }

    /// `get_many` preserves input order and yields `None` for misses.
    #[tokio::test]
    async fn test_get_many() {
        let store = MemoryBlockStore::new();

        let data1 = b"data1";
        let data2 = b"data2";
        let cid1 = store.put(data1).await.unwrap();
        let cid2 = store.put(data2).await.unwrap();
        let fake_cid = IpldCid::default();

        let results = store.get_many(&[cid1, fake_cid, cid2]).await.unwrap();

        assert_eq!(results.len(), 3);
        assert_eq!(results[0].as_deref(), Some(&data1[..]));
        assert_eq!(results[1], None);
        assert_eq!(results[2].as_deref(), Some(&data2[..]));
    }

    /// `clear` empties the store.
    #[tokio::test]
    async fn test_clear() {
        let store = MemoryBlockStore::new();
        store.put(b"data").await.unwrap();

        assert_eq!(store.len(), 1);
        store.clear();
        assert_eq!(store.len(), 0);
        assert!(store.is_empty());
    }

    /// Clones share the same underlying Arc-wrapped map.
    #[tokio::test]
    async fn test_clone_shares_storage() {
        let store1 = MemoryBlockStore::new();
        let store2 = store1.clone();

        let cid = store1.put(b"test").await.unwrap();
        assert!(store2.has(&cid).await.unwrap());
    }
}
+88
crates/jacquard-repo/src/storage/mod.rs
···11+//! Block storage abstraction for MST nodes and records
22+33+use bytes::Bytes;
44+use cid::Cid as IpldCid;
55+use crate::error::Result;
/// Async block storage trait
///
/// Provides CID-keyed block storage for MST nodes, commits, and record data.
/// Implementations might use:
/// - In-memory HashMap ([`MemoryBlockStore`](memory::MemoryBlockStore))
/// - CAR file ([`FileBlockStore`](file::FileBlockStore))
/// - SQLite/RocksDB (user-provided)
/// - Remote HTTP storage (user-provided)
///
/// Clone is required so MST can share storage references across tree
/// operations; implementations should make cloning cheap (e.g. Arc-backed
/// state, as the in-crate implementations do).
///
/// # WASM Compatibility
///
/// The trait uses `trait_variant` to conditionally require `Send` only on non-WASM targets,
/// allowing it to work in browser environments where `Send` is not available.
///
/// # Example
///
/// ```rust,ignore
/// use jacquard_repo::storage::{BlockStore, MemoryBlockStore};
///
/// # async fn example() -> Result<(), Box<dyn std::error::Error>> {
/// let storage = MemoryBlockStore::new();
///
/// // Store a block
/// let data = b"hello world";
/// let cid = storage.put(data).await?;
///
/// // Retrieve it
/// if let Some(retrieved) = storage.get(&cid).await? {
///     assert_eq!(retrieved, data);
/// }
/// # Ok(())
/// # }
/// ```
#[trait_variant::make(Send)]
pub trait BlockStore: Clone {
    /// Get a block by CID
    ///
    /// Returns `None` if the block is not found.
    async fn get(&self, cid: &IpldCid) -> Result<Option<Bytes>>;

    /// Put a block, return its CID
    ///
    /// The CID is calculated from the data using SHA-256 hash and DAG-CBOR codec.
    /// This ensures content addressing: the same data always produces the same CID.
    async fn put(&self, data: &[u8]) -> Result<IpldCid>;

    /// Check if a block exists without retrieving it
    ///
    /// This can be more efficient than `get()` for implementations that can check
    /// existence without reading the full block data.
    async fn has(&self, cid: &IpldCid) -> Result<bool>;

    /// Put many blocks at once (optimization for batch writes)
    ///
    /// Implementations should optimize this for batch operations where possible
    /// (e.g. single transaction, bulk insert). A simple implementation can just
    /// call `put()` individually.
    ///
    /// # Note
    ///
    /// The provided CIDs should match the data, but implementations may choose to
    /// recalculate and validate them.
    async fn put_many(&self, blocks: impl IntoIterator<Item = (IpldCid, Bytes)> + Send) -> Result<()>;

    /// Get multiple blocks at once (optimization for batch reads)
    ///
    /// Implementations should optimize this for batch operations where possible.
    /// A simple implementation can just call `get()` individually.
    ///
    /// Returns a vec of the same length as the input, with `None` for missing blocks.
    async fn get_many(&self, cids: &[IpldCid]) -> Result<Vec<Option<Bytes>>>;
}
8181+8282+pub mod file;
8383+pub mod layered;
8484+pub mod memory;
8585+8686+pub use file::FileBlockStore;
8787+pub use layered::LayeredBlockStore;
8888+pub use memory::MemoryBlockStore;
···11+#!/usr/bin/env python3
22+33+"""
44+Helper script to output MST keys with different letter prefixes, at given heights. Eg:
55+66+ A0/asdf - at MST height 0
77+"""
88+99+import hashlib
1010+import random
def height(key):
    """Return the MST height of `key`: the number of leading zero 2-bit
    chunks in the big-endian SHA-256 digest of the key.

    Walks the hex digest one nibble at a time; each nibble spans two 2-bit
    chunks, so a fully-zero nibble ('0') contributes 2 to the height.
    """
    h = hashlib.sha256(key).hexdigest()
    for i, c in enumerate(h):
        # Nibble >= 4 (binary 01xx / 1xxx): the nibble's *top* 2-bit chunk
        # is already nonzero, so this nibble adds nothing.
        # The original used `c > '4'`, which misclassified '4' (0b0100,
        # top chunk = 01) as contributing one extra zero chunk — off by one
        # versus the atproto leading-zeros definition (byte < 64 per step).
        if c >= '4':
            return i * 2
        # Nibble 1-3 (binary 00xx, xx != 0): top chunk zero, bottom nonzero.
        if c != '0':
            return i * 2 + 1
        # Nibble 0: both chunks zero — keep scanning.
    raise Exception("very suss")
def rand_key(letter, level):
    """Build a candidate key like b'A0/012345' from a random 6-digit suffix."""
    suffix = random.randint(0, 999999)
    return f"{letter}{level}/{suffix:06}".encode("utf8")
def gen_key(letter, level):
    """Sample random keys until one hashes to exactly `level`, then print it."""
    while True:
        candidate = rand_key(letter, level)
        if height(candidate) != level:
            continue
        print(candidate.decode("utf-8"))
        return
if __name__=="__main__":
    # Emit one fixture key per (letter, level) pair: 26 letters x heights 0-5.
    for letter in "ABCDEFGHIJKLMNOPQRSTUVWXYZ":
        for level in [0,1,2,3,4,5]:
            gen_key(letter, level)