···2323 blocks: BTreeMap<IpldCid, Bytes>,
2424) -> Result<()> {
2525 let path = path.as_ref();
2626- let file = File::create(path)
2727- .await
2828- .map_err(|e| RepoError::io(e).with_context(format!("creating CAR file: {}", path.display())))?;
2626+ let file = File::create(path).await.map_err(|e| {
2727+ RepoError::io(e).with_context(format!("creating CAR file: {}", path.display()))
2828+ })?;
29293030 let header = iroh_car::CarHeader::new_v1(roots);
3131 let mut writer = CarWriter::new(header, file);
···3737 .map_err(|e| RepoError::car(e).with_context(format!("writing block {}", cid)))?;
3838 }
39394040- writer.finish().await.map_err(|e| RepoError::car(e).with_context("finalizing CAR file"))?;
4040+ writer
4141+ .finish()
4242+ .await
4343+ .map_err(|e| RepoError::car(e).with_context("finalizing CAR file"))?;
41444245 Ok(())
4346}
···5861 .map_err(|e| RepoError::car(e).with_context(format!("writing block {}", cid)))?;
5962 }
60636161- writer.finish().await.map_err(|e| RepoError::car(e).with_context("finalizing CAR bytes"))?;
6464+ writer
6565+ .finish()
6666+ .await
6767+ .map_err(|e| RepoError::car(e).with_context("finalizing CAR bytes"))?;
62686363- buffer.flush().await.map_err(|e| RepoError::io(e).with_context("flushing CAR buffer"))?;
6969+ buffer
7070+ .flush()
7171+ .await
7272+ .map_err(|e| RepoError::io(e).with_context("flushing CAR buffer"))?;
64736574 Ok(buffer)
6675}
···7382/// - All record blocks (from storage)
7483///
7584/// Uses streaming to avoid loading all blocks into memory.
8585+///
8686+/// Should write in the correct order for [streaming car processing](https://github.com/bluesky-social/proposals/blob/main/0006-sync-iteration/README.md#streaming-car-processing) from sync v1.1
7687pub async fn export_repo_car<S: BlockStore + Sync + 'static>(
7788 path: impl AsRef<Path>,
7889 commit_cid: IpldCid,
7990 mst: &Mst<S>,
8091) -> Result<()> {
8192 let path = path.as_ref();
8282- let file = File::create(path)
8383- .await
8484- .map_err(|e| RepoError::io(e).with_context(format!("creating CAR export file: {}", path.display())))?;
9393+ let file = File::create(path).await.map_err(|e| {
9494+ RepoError::io(e).with_context(format!("creating CAR export file: {}", path.display()))
9595+ })?;
85968697 let header = iroh_car::CarHeader::new_v1(vec![commit_cid]);
8798 let mut writer = CarWriter::new(header, file);
···105116 mst.write_blocks_to_car(&mut writer).await?;
106117107118 // Finish writing
108108- writer.finish().await.map_err(|e| RepoError::car(e).with_context("finalizing CAR export"))?;
119119+ writer
120120+ .finish()
121121+ .await
122122+ .map_err(|e| RepoError::car(e).with_context("finalizing CAR export"))?;
109123110124 Ok(())
111125}
+190-6
crates/jacquard-repo/src/commit/firehose.rs
···87878888 /// DEPRECATED: Unused
8989 pub rebase: bool,
9090+9191+ /// Debug: block sources for validation analysis
9292+ #[cfg(debug_assertions)]
9393+ pub block_sources: BTreeMap<IpldCid, String>,
9494+9595+ #[cfg(debug_assertions)]
9696+ pub excluded_blocks: BTreeMap<IpldCid, Vec<u8>>, // blocks we skipped
9797+9898+ #[cfg(debug_assertions)]
9999+ pub excluded_metadata: BTreeMap<IpldCid, Vec<String>>, // context about excluded blocks
90100}
9110192102/// A repository operation (mutation of a single record)
···193203 blobs: self.blobs.into_iter().map(|b| b.into_static()).collect(),
194204 too_big: self.too_big,
195205 rebase: self.rebase,
206206+ #[cfg(debug_assertions)]
207207+ block_sources: self.block_sources,
208208+ #[cfg(debug_assertions)]
209209+ excluded_blocks: self.excluded_blocks,
210210+ #[cfg(debug_assertions)]
211211+ excluded_metadata: self.excluded_metadata,
196212 }
197213 }
198214}
···218234use crate::mst::{Mst, VerifiedWriteOp};
219235use crate::storage::{BlockStore, LayeredBlockStore, MemoryBlockStore};
220236use cid::Cid as IpldCid;
237237+use std::collections::BTreeMap;
221238use std::sync::Arc;
222239223240impl<'a> FirehoseCommit<'a> {
···324341 /// **Inductive property:** Can validate without any external state besides the blocks
325342 /// in this message. The `prev_data` field provides the starting MST root, and operations
326343 /// include `prev` CIDs for validation. All necessary blocks must be in the CAR bytes.
344344+ ///
345345+ /// Note: Because this uses the same merkle search tree struct as the repository itself,
346346+ /// this is far from the most efficient validation function possible. The repo
347347+ /// tree struct carries extra information. However,
348348+ /// it has the virtue of making everything self-validating.
327349 pub async fn validate_v1_1(&self, pubkey: &PublicKey<'_>) -> Result<IpldCid> {
328350 // 1. Require prev_data for v1.1
329351 let prev_data_cid: IpldCid = self
···337359338360 // 2. Parse CAR blocks from the firehose message into temporary storage
339361 let parsed = parse_car_bytes(&self.blocks).await?;
362362+363363+ #[cfg(debug_assertions)]
364364+ let provided_blocks = parsed
365365+ .blocks
366366+ .keys()
367367+ .cloned()
368368+ .collect::<std::collections::HashSet<_>>();
369369+ #[cfg(debug_assertions)]
370370+ let accessed_blocks = Arc::new(std::sync::RwLock::new(std::collections::HashSet::new()));
371371+372372+ #[cfg(debug_assertions)]
373373+ let missing_blocks = Arc::new(std::sync::RwLock::new(Vec::new()));
374374+375375+ #[cfg(debug_assertions)]
376376+ let block_categories = self.block_sources.clone();
377377+378378+ #[cfg(debug_assertions)]
379379+ let excluded_blocks_ref = self.excluded_blocks.clone();
380380+340381 let temp_storage = Arc::new(MemoryBlockStore::new_from_blocks(parsed.blocks));
341382383383+ #[cfg(debug_assertions)]
384384+ let tracking_storage = {
385385+ use crate::storage::BlockStore;
386386+ #[derive(Clone)]
387387+ struct TrackingStorage {
388388+ inner: Arc<MemoryBlockStore>,
389389+ accessed: Arc<std::sync::RwLock<std::collections::HashSet<IpldCid>>>,
390390+ missing: Arc<std::sync::RwLock<Vec<IpldCid>>>,
391391+ excluded: BTreeMap<IpldCid, Vec<u8>>,
392392+ }
393393+ impl BlockStore for TrackingStorage {
394394+ async fn get(&self, cid: &IpldCid) -> Result<Option<Bytes>> {
395395+ self.accessed.write().unwrap().insert(*cid);
396396+ let result = self.inner.get(cid).await?;
397397+398398+ if result.is_none() {
399399+ // Check if this block was excluded
400400+ if let Some(excluded_block) = self.excluded.get(cid) {
401401+ self.missing.write().unwrap().push(*cid);
402402+ eprintln!(
403403+ "[MISS] Block {} was EXCLUDED but needed during validation",
404404+ cid
405405+ );
406406+ // Return the excluded block so validation can continue
407407+ return Ok(Some(Bytes::copy_from_slice(&excluded_block)));
408408+ } else {
409409+ self.missing.write().unwrap().push(*cid);
410410+ eprintln!(
411411+ "[MISS] Block {} not found (never seen during commit creation)",
412412+ cid
413413+ );
414414+ }
415415+ }
416416+417417+ Ok(result)
418418+ }
419419+ async fn put(&self, data: &[u8]) -> Result<IpldCid> {
420420+ self.inner.put(data).await
421421+ }
422422+ async fn has(&self, cid: &IpldCid) -> Result<bool> {
423423+ self.inner.has(cid).await
424424+ }
425425+ async fn put_many(
426426+ &self,
427427+ blocks: impl IntoIterator<Item = (IpldCid, Bytes)> + Send,
428428+ ) -> Result<()> {
429429+ self.inner.put_many(blocks).await
430430+ }
431431+ async fn get_many(&self, cids: &[IpldCid]) -> Result<Vec<Option<Bytes>>> {
432432+ self.inner.get_many(cids).await
433433+ }
434434+ async fn apply_commit(&self, commit: crate::repo::CommitData) -> Result<()> {
435435+ self.inner.apply_commit(commit).await
436436+ }
437437+ }
438438+ Arc::new(TrackingStorage {
439439+ inner: temp_storage.clone(),
440440+ accessed: accessed_blocks.clone(),
441441+ missing: missing_blocks.clone(),
442442+ excluded: excluded_blocks_ref,
443443+ })
444444+ };
445445+446446+ #[cfg(not(debug_assertions))]
447447+ let tracking_storage = temp_storage.clone();
448448+342449 // 3. Extract and verify commit object from temporary storage
343450 let commit_cid: IpldCid = self
344451 .commit
345452 .to_ipld()
346453 .map_err(|e| RepoError::invalid_cid_conversion(e, "commit CID"))?;
347347- let commit_bytes = temp_storage
454454+ let commit_bytes = tracking_storage
348455 .get(&commit_cid)
349456 .await?
350457 .ok_or_else(|| RepoError::not_found("commit block", &commit_cid))?;
···366473367474 // 5. Load new MST from commit.data (claimed result)
368475 let expected_root = *commit.data();
369369- let mut new_mst = Mst::load(temp_storage, expected_root, None);
476476+477477+ let mut new_mst = Mst::load(tracking_storage, expected_root, None);
370478371479 let verified_ops = self
372480 .ops
···401509 )));
402510 }
403511512512+ #[cfg(debug_assertions)]
513513+ {
514514+ let accessed = accessed_blocks.read().unwrap();
515515+ let missing = missing_blocks.read().unwrap();
516516+ let unused: Vec<_> = provided_blocks.difference(&*accessed).copied().collect();
517517+518518+ println!("[validation stats]");
519519+ println!(" provided: {} blocks", provided_blocks.len());
520520+ println!(" accessed: {} blocks", accessed.len());
521521+522522+ if !missing.is_empty() {
523523+ println!(
524524+ " MISSING: {} blocks NEEDED but not provided!",
525525+ missing.len()
526526+ );
527527+528528+ // Show operation breakdown for this commit
529529+ let mut op_counts: BTreeMap<&str, usize> = BTreeMap::new();
530530+ for op in &self.ops {
531531+ *op_counts.entry(op.action.as_ref()).or_insert(0) += 1;
532532+ }
533533+ println!(" operations in this commit:");
534534+ for (action, count) in op_counts {
535535+ println!(" {}: {}", action, count);
536536+ }
537537+538538+ println!(" missing block CIDs:");
539539+ for cid in missing.iter() {
540540+ if self.excluded_blocks.contains_key(cid) {
541541+ println!(" {} (was excluded)", cid);
542542+ if let Some(metadata) = self.excluded_metadata.get(cid) {
543543+ for context in metadata {
544544+ println!(" -> {}", context);
545545+ }
546546+ }
547547+ } else {
548548+ println!(" {} (never seen)", cid);
549549+ }
550550+ }
551551+ }
552552+553553+ if !unused.is_empty() {
554554+ println!(
555555+ " UNUSED: {} blocks ({}%)",
556556+ unused.len(),
557557+ (unused.len() * 100) / provided_blocks.len()
558558+ );
559559+560560+ // Show breakdown by category
561561+ let mut category_stats: BTreeMap<&str, (usize, usize)> = BTreeMap::new();
562562+ for (cid, category) in &block_categories {
563563+ let stats = category_stats.entry(category).or_insert((0, 0));
564564+ stats.0 += 1; // total provided
565565+ if accessed.contains(cid) {
566566+ stats.1 += 1; // accessed
567567+ }
568568+ }
569569+570570+ println!("\n breakdown by category:");
571571+ for (category, (total, accessed_count)) in category_stats {
572572+ let unused_count = total - accessed_count;
573573+ let unused_pct = if total > 0 {
574574+ (unused_count * 100) / total
575575+ } else {
576576+ 0
577577+ };
578578+ println!(
579579+ " {}: {} total, {} accessed, {} unused ({}%)",
580580+ category, total, accessed_count, unused_count, unused_pct
581581+ );
582582+ }
583583+ } else {
584584+ println!(" ✓ all blocks were used");
585585+ }
586586+ }
587587+404588 Ok(expected_root)
405589 }
406590}
···408592#[cfg(test)]
409593mod tests {
410594 use super::*;
411411- use crate::commit::{Commit, SigningKey as _};
595595+ use crate::Repository;
596596+ use crate::commit::Commit;
412597 use crate::mst::{Mst, RecordWriteOp};
413598 use crate::storage::MemoryBlockStore;
414414- use crate::{CommitData, Repository};
415599 use jacquard_common::types::crypto::{KeyCodec, PublicKey};
416600 use jacquard_common::types::recordkey::Rkey;
417601 use jacquard_common::types::string::{Nsid, RecordKey};
···720904 let parsed = parse_car_bytes(&firehose_commit.blocks).await.unwrap();
721905 let commit_cid: IpldCid = firehose_commit.commit.to_ipld().unwrap();
722906723723- let mut blocks_without_commit: BTreeMap<IpldCid, bytes::Bytes> = parsed
907907+ let blocks_without_commit: BTreeMap<IpldCid, bytes::Bytes> = parsed
724908 .blocks
725909 .into_iter()
726910 .filter(|(cid, _)| cid != &commit_cid)
···8511035 .insert(fake_commit_cid, bytes::Bytes::from(fake_commit_cbor));
8521036 commit_data.cid = fake_commit_cid;
8531037854854- let mut firehose_commit = commit_data
10381038+ let firehose_commit = commit_data
8551039 .to_firehose_commit(&did, 1, Datetime::now(), repo_ops, vec![])
8561040 .await
8571041 .unwrap();
+47
crates/jacquard-repo/src/mst/cursor.rs
···77use cid::Cid as IpldCid;
88use smol_str::SmolStr;
991010+#[cfg(debug_assertions)]
1111+use std::collections::HashSet;
1212+#[cfg(debug_assertions)]
1313+use std::sync::{Arc, RwLock};
1414+1015/// Position within an MST traversal
1116#[derive(Debug, Clone)]
1217pub enum CursorPosition<S: BlockStore> {
···65706671 /// Current position in traversal
6772 current: CursorPosition<S>,
7373+7474+ /// Track CIDs accessed during traversal (debug only)
7575+ #[cfg(debug_assertions)]
7676+ accessed_cids: Option<Arc<RwLock<HashSet<IpldCid>>>>,
6877}
69787079impl<S: BlockStore + Sync + 'static> MstCursor<S> {
···7685 Self {
7786 path: Vec::new(),
7887 current: CursorPosition::Tree { mst: root },
8888+ #[cfg(debug_assertions)]
8989+ accessed_cids: None,
9090+ }
9191+ }
9393+ /// Create new cursor with CID access tracking enabled
9494+ ///
9595+ /// Records all CIDs accessed during traversal in the provided set.
9696+ #[cfg(debug_assertions)]
9797+ pub fn new_with_tracking(root: Mst<S>, tracking: Arc<RwLock<HashSet<IpldCid>>>) -> Self {
9898+ Self {
9999+ path: Vec::new(),
100100+ current: CursorPosition::Tree { mst: root },
101101+ accessed_cids: Some(tracking),
79102 }
80103 }
81104···103126 /// If at the root level (before stepping in), returns root's layer + 1.
104127 pub async fn layer(&self) -> Result<usize> {
105128 if let Some((walking_node, _, _)) = self.path.last() {
129129+ // Track CID access
130130+ #[cfg(debug_assertions)]
131131+ if let Some(ref tracking) = self.accessed_cids {
132132+ if let Ok(cid) = walking_node.get_pointer().await {
133133+ tracking.write().unwrap().insert(cid);
134134+ }
135135+ }
136136+106137 // We're inside a node - return its layer
107138 walking_node.get_layer().await
108139 } else {
···111142 // is one layer higher than being "inside" the root
112143 match &self.current {
113144 CursorPosition::Tree { mst } => {
145145+ // Track CID access
146146+ #[cfg(debug_assertions)]
147147+ if let Some(ref tracking) = self.accessed_cids {
148148+ if let Ok(cid) = mst.get_pointer().await {
149149+ tracking.write().unwrap().insert(cid);
150150+ }
151151+ }
152152+114153 let root_layer = mst.get_layer().await?;
115154 Ok(root_layer + 1)
116155 }
···186225187226 /// Descend into a tree node
188227 async fn step_into(&mut self, mst: Mst<S>) -> Result<()> {
228228+ // Track CID access
229229+ #[cfg(debug_assertions)]
230230+ if let Some(ref tracking) = self.accessed_cids {
231231+ if let Ok(cid) = mst.get_pointer().await {
232232+ tracking.write().unwrap().insert(cid);
233233+ }
234234+ }
235235+189236 let entries = mst.get_entries().await?;
190237191238 if entries.is_empty() {
+40-1
crates/jacquard-repo/src/mst/diff.rs
···44use std::future::Future;
55use std::pin::Pin;
6677+#[cfg(debug_assertions)]
88+use std::collections::HashSet;
99+#[cfg(debug_assertions)]
1010+use std::sync::{Arc, RwLock};
1111+712use super::cursor::{CursorPosition, MstCursor};
813use super::tree::Mst;
914use super::util::serialize_node_data;
···6065 /// When modifying a tree, old MST nodes along changed paths become unreachable.
6166 /// This tracks those nodes for garbage collection.
6267 pub removed_mst_blocks: Vec<IpldCid>,
6868+6969+ /// CIDs accessed from old tree during diff (debug only)
7070+ ///
7171+ /// Tracks all blocks touched when walking the old tree during diff.
7272+ /// This is the precise set of blocks needed for validation.
7373+ #[cfg(debug_assertions)]
7474+ pub old_tree_accessed: Vec<IpldCid>,
7575+7676+ /// CIDs accessed from new tree during diff (debug only)
7777+ #[cfg(debug_assertions)]
7878+ pub new_tree_accessed: Vec<IpldCid>,
6379}
64806581use super::tree::VerifiedWriteOp;
···7591 removed_cids: Vec::new(),
7692 new_mst_blocks: BTreeMap::new(),
7793 removed_mst_blocks: Vec::new(),
9494+ #[cfg(debug_assertions)]
9595+ old_tree_accessed: Vec::new(),
9696+ #[cfg(debug_assertions)]
9797+ new_tree_accessed: Vec::new(),
7898 }
7999 }
80100···234254 return Ok(());
235255 }
236256237237- // CIDs differ - use cursors to walk both trees
257257+ // CIDs differ - use cursors to walk both trees with tracking
258258+ #[cfg(debug_assertions)]
259259+ let old_tracking = Arc::new(RwLock::new(HashSet::new()));
260260+ #[cfg(debug_assertions)]
261261+ let new_tracking = Arc::new(RwLock::new(HashSet::new()));
262262+263263+ #[cfg(debug_assertions)]
264264+ let mut old_cursor = MstCursor::new_with_tracking(old.clone(), old_tracking.clone());
265265+ #[cfg(debug_assertions)]
266266+ let mut new_cursor = MstCursor::new_with_tracking(new.clone(), new_tracking.clone());
267267+268268+ #[cfg(not(debug_assertions))]
238269 let mut old_cursor = MstCursor::new(old.clone());
270270+ #[cfg(not(debug_assertions))]
239271 let mut new_cursor = MstCursor::new(new.clone());
240272241273 // Don't advance yet - let loop handle roots like any other tree comparison
···393425 }
394426 }
395427 }
428428+ }
429429+430430+ // Collect tracking data
431431+ #[cfg(debug_assertions)]
432432+ {
433433+ diff.old_tree_accessed = old_tracking.read().unwrap().iter().copied().collect();
434434+ diff.new_tree_accessed = new_tracking.read().unwrap().iter().copied().collect();
396435 }
397436398437 Ok(())
+121-118
crates/jacquard-repo/src/repo.rs
···58585959 /// CIDs of blocks to delete
6060 pub deleted_cids: Vec<IpldCid>,
6161+6262+ /// Debug: block sources for validation analysis
6363+ #[cfg(debug_assertions)]
6464+ pub block_sources: BTreeMap<IpldCid, String>,
6565+ #[cfg(debug_assertions)]
6666+ pub excluded_blocks: BTreeMap<IpldCid, Bytes>, // blocks we skipped
6767+ #[cfg(debug_assertions)]
6868+ pub excluded_metadata: BTreeMap<IpldCid, Vec<String>>, // context about excluded blocks
6169}
62706371impl CommitData {
···9199 blobs,
92100 too_big: false,
93101 rebase: false,
102102+ #[cfg(debug_assertions)]
103103+ block_sources: self.block_sources.clone(),
104104+ #[cfg(debug_assertions)]
105105+ excluded_blocks: self
106106+ .excluded_blocks
107107+ .iter()
108108+ .map(|(cid, b)| (cid.clone(), b.to_vec()))
109109+ .collect(),
110110+ #[cfg(debug_assertions)]
111111+ excluded_metadata: self.excluded_metadata.clone(),
94112 })
95113 }
96114}
···233251 blocks: blocks.clone(),
234252 relevant_blocks: blocks,
235253 deleted_cids: Vec::new(),
254254+ #[cfg(debug_assertions)]
255255+ block_sources: BTreeMap::new(),
256256+ #[cfg(debug_assertions)]
257257+ excluded_blocks: BTreeMap::new(),
258258+ #[cfg(debug_assertions)]
259259+ excluded_metadata: BTreeMap::new(),
236260 })
237261 }
238262···346370 // But these bulk operations would benefit significantly from cursor's skip_subtree()
347371 // to avoid traversing unrelated branches when searching lexicographically-organized data.
348372349349- /// Apply record write operations with inline data
350350- ///
351351- /// Serializes record data to DAG-CBOR, computes CIDs, stores data blocks,
352352- /// then applies write operations to the MST. Returns the diff for inspection.
353353- ///
354354- /// For creating commits with operations, use `create_commit()` instead.
355355- pub async fn apply_record_writes(&mut self, ops: &[RecordWriteOp<'_>]) -> Result<MstDiff> {
356356- use smol_str::format_smolstr;
357357-358358- let mut updated_tree = self.mst.clone();
359359-360360- for op in ops {
361361- updated_tree = match op {
362362- RecordWriteOp::Create {
363363- collection,
364364- rkey,
365365- record,
366366- } => {
367367- let key = format_smolstr!("{}/{}", collection.as_ref(), rkey.as_ref());
368368-369369- // Serialize record to DAG-CBOR
370370- let cbor = serde_ipld_dagcbor::to_vec(record).map_err(|e| {
371371- RepoError::serialization(e).with_context(format!(
372372- "serializing record data for {}/{}",
373373- collection.as_ref(),
374374- rkey.as_ref()
375375- ))
376376- })?;
377377-378378- // Compute CID and store data
379379- let cid = self.storage.put(&cbor).await?;
380380-381381- updated_tree.add(key.as_str(), cid).await?
382382- }
383383- RecordWriteOp::Update {
384384- collection,
385385- rkey,
386386- record,
387387- prev,
388388- } => {
389389- let key = format_smolstr!("{}/{}", collection.as_ref(), rkey.as_ref());
390390-391391- // Serialize record to DAG-CBOR
392392- let cbor = serde_ipld_dagcbor::to_vec(record).map_err(|e| {
393393- RepoError::serialization(e).with_context(format!(
394394- "serializing record data for {}/{}",
395395- collection.as_ref(),
396396- rkey.as_ref()
397397- ))
398398- })?;
399399-400400- // Compute CID and store data
401401- let cid = self.storage.put(&cbor).await?;
402402-403403- // Validate prev if provided
404404- if let Some(prev_cid) = prev {
405405- if &cid != prev_cid {
406406- return Err(RepoError::cid_mismatch(format!(
407407- "Update prev CID mismatch for key {}: expected {}, got {}",
408408- key, prev_cid, cid
409409- )));
410410- }
411411- }
412412-413413- updated_tree.add(key.as_str(), cid).await?
414414- }
415415- RecordWriteOp::Delete {
416416- collection,
417417- rkey,
418418- prev,
419419- } => {
420420- let key = format_smolstr!("{}/{}", collection.as_ref(), rkey.as_ref());
421421-422422- // Check exists
423423- let current = self
424424- .mst
425425- .get(key.as_str())
426426- .await?
427427- .ok_or_else(|| RepoError::not_found("record", key.as_str()))?;
428428-429429- // Validate prev if provided
430430- if let Some(prev_cid) = prev {
431431- if &current != prev_cid {
432432- return Err(RepoError::cid_mismatch(format!(
433433- "Delete prev CID mismatch for key {}: expected {}, got {}",
434434- key, prev_cid, current
435435- )));
436436- }
437437- }
438438-439439- updated_tree.delete(key.as_str()).await?
440440- }
441441- };
442442- }
443443-444444- // Compute diff before updating
445445- let diff = self.mst.diff(&updated_tree).await?;
446446-447447- println!("Repo before:\n{}", self);
448448- // Update mst
449449- self.mst = updated_tree;
450450-451451- println!("Repo after:\n{}", self);
452452- Ok(diff)
453453- }
454454-455373 /// Create a commit from record write operations
456374 ///
457375 /// Applies write operations, creates signed commit, and collects blocks:
···578496 let mut blocks = diff.new_mst_blocks;
579497 let mut relevant_blocks = BTreeMap::new();
580498581581- // Add the previous MST root block (needed to load prev_data in validation)
582582- if let Some(prev_root_block) = self.storage.get(&prev_data).await? {
583583- relevant_blocks.insert(prev_data, prev_root_block);
584584- }
499499+ #[cfg(debug_assertions)]
500500+ let mut block_sources: BTreeMap<IpldCid, &str> = BTreeMap::new();
501501+502502+ // // Add the previous MST root block (needed to load prev_data in validation)
503503+ // if let Some(prev_root_block) = self.storage.get(&prev_data).await? {
504504+ // #[cfg(debug_assertions)]
505505+ // block_sources.insert(prev_data, "prev_root");
506506+ // relevant_blocks.insert(prev_data, prev_root_block);
507507+ // }
585508586586- // Walk paths in both old and new trees for each operation
509509+ let mut new_tree_cids = std::collections::HashSet::new();
587510 for op in ops {
588511 let key = format_smolstr!("{}/{}", op.collection().as_ref(), op.rkey().as_ref());
589589-590512 updated_tree
591513 .blocks_for_path(&key, &mut relevant_blocks)
592514 .await?;
593515594594- self.mst.blocks_for_path(&key, &mut relevant_blocks).await?;
516516+ let new_path_cids = updated_tree.cids_for_path(&key).await?;
517517+ for cid in &new_path_cids {
518518+ new_tree_cids.insert(*cid);
519519+ #[cfg(debug_assertions)]
520520+ block_sources.insert(*cid, "new_tree_path");
521521+ }
522522+ }
523523+ let mut old_path_blocks = BTreeMap::new();
524524+ let mut old_path_cids = std::collections::HashSet::new();
525525+526526+ // Step 2: Walk OLD tree, only add blocks NOT in new tree (changed nodes)
527527+ for op in ops {
528528+ let key = format_smolstr!("{}/{}", op.collection().as_ref(), op.rkey().as_ref());
529529+530530+ self.mst.blocks_for_path(&key, &mut old_path_blocks).await?;
531531+ for cid in updated_tree.cids_for_path(&key).await? {
532532+ old_path_cids.insert(cid);
533533+ }
534534+ }
535535+536536+ let mut excluded_blocks = BTreeMap::new();
537537+ #[cfg(debug_assertions)]
538538+ let mut excluded_metadata: BTreeMap<IpldCid, Vec<String>> = BTreeMap::new();
539539+540540+ // Re-walk old tree paths to collect metadata about excluded blocks
541541+ #[cfg(debug_assertions)]
542542+ for (op_idx, op) in ops.iter().enumerate() {
543543+ let key = format_smolstr!("{}/{}", op.collection().as_ref(), op.rkey().as_ref());
544544+ let old_path_cids = self.mst.cids_for_path(&key).await?;
545545+546546+ for (depth, cid) in old_path_cids.iter().enumerate() {
547547+ if !new_tree_cids.contains(cid) {
548548+ let metadata = format!("op#{} ({}) path depth {}", op_idx, key, depth);
549549+ excluded_metadata.entry(*cid).or_insert_with(Vec::new).push(metadata);
550550+ }
551551+ }
595552 }
596553597597- // Add new leaf blocks to both collections (single iteration)
598598- for (cid, block) in &leaf_blocks {
599599- if diff.new_leaf_cids.contains(cid) {
600600- blocks.insert(*cid, block.clone());
601601- relevant_blocks.insert(*cid, block.clone());
554554+ for (cid, block) in old_path_blocks.into_iter() {
555555+ // Block CHANGED (its CID is absent from the new tree paths): record it as excluded
556556+ if !new_tree_cids.contains(&cid) {
557557+ //relevant_blocks.insert(cid, block);
558558+ excluded_blocks.insert(cid, block);
559559+ #[cfg(debug_assertions)]
560560+ block_sources.insert(cid, "old_tree_changed");
602561 }
603562 }
604563564564+ // // Add new leaf blocks to both collections (single iteration)
565565+ // for (cid, block) in &leaf_blocks {
566566+ // if diff.new_leaf_cids.contains(cid) {
567567+ // blocks.insert(*cid, block.clone());
568568+ // #[cfg(debug_assertions)]
569569+ // block_sources.insert(*cid, "new_leaf");
570570+ // relevant_blocks.insert(*cid, block.clone());
571571+ // }
572572+ // }
573573+574574+ // For DELETE operations, we need the deleted record blocks for inversion
575575+ // (when inverting a delete, we insert the prev CID back)
576576+ // for cid in &deleted_cids {
577577+ // if let Some(block) = self.storage.get(cid).await? {
578578+ // #[cfg(debug_assertions)]
579579+ // block_sources.insert(*cid, "deleted_leaf");
580580+ // relevant_blocks.insert(*cid, block);
581581+ // }
582582+ // }
583583+605584 // Step 6: Create and sign commit
606585 let rev = Ticker::new().next(Some(self.commit.rev.clone()));
607586 let commit = Commit::new_unsigned(did.clone().into_static(), data, rev.clone(), prev)
···613592614593 // Step 7: Add commit block to both collections
615594 blocks.insert(commit_cid, commit_bytes.clone());
595595+ #[cfg(debug_assertions)]
596596+ block_sources.insert(commit_cid, "commit");
616597 relevant_blocks.insert(commit_cid, commit_bytes);
617598599599+ #[cfg(debug_assertions)]
600600+ {
601601+ let mut by_source: BTreeMap<&str, usize> = BTreeMap::new();
602602+ for source in block_sources.values() {
603603+ *by_source.entry(source).or_insert(0) += 1;
604604+ }
605605+ println!("[commit creation] relevant_blocks by source:");
606606+ for (source, count) in by_source {
607607+ println!(" {}: {}", source, count);
608608+ }
609609+ println!(" TOTAL: {}", relevant_blocks.len());
610610+ }
611611+618612 // Step 8: Update internal MST state
619613 self.mst = updated_tree;
620614···630624 blocks,
631625 relevant_blocks,
632626 deleted_cids,
627627+ #[cfg(debug_assertions)]
628628+ block_sources: block_sources
629629+ .into_iter()
630630+ .map(|(k, v)| (k, v.to_string()))
631631+ .collect(),
632632+ #[cfg(debug_assertions)]
633633+ excluded_blocks,
634634+ #[cfg(debug_assertions)]
635635+ excluded_metadata,
633636 },
634637 ))
635638 }
···338338339339 firehose_commit.validate_v1_1(&pubkey).await.unwrap();
340340341341- for batch_num in 1..=2000 {
341341+ for batch_num in 1..=5000 {
342342 let batch_size = rng.gen_range(1..=20);
343343 let ops = generate_random_ops(&mut rng, &mut tracker, batch_size);
344344 let record_writes = test_ops_to_record_writes(ops, &collection);