···167167 matches!(self, Data::Null)
168168 }
169169170170+ /// Get the "$type" discriminator field if this is an object with a string "$type" field
171171+ ///
172172+ /// This is a shortcut for union type discrimination in AT Protocol.
173173+ /// Returns `None` if this is not an object or if the "$type" field is missing/not a string.
174174+ pub fn type_discriminator(&self) -> Option<&str> {
175175+ self.as_object()?.type_discriminator()
176176+ }
177177+170178 /// Serialize to canonical DAG-CBOR bytes for CID computation
171179 ///
172180 /// This produces the deterministic CBOR encoding used for content-addressing.
···350358 /// Get an iterator over the keys
351359 pub fn keys(&self) -> std::collections::btree_map::Keys<'_, SmolStr, Data<'s>> {
352360 self.0.keys()
361361+ }
362362+363363+ /// Get the "$type" discriminator field if present and it's a string
364364+ ///
365365+ /// This is a shortcut for union type discrimination in AT Protocol.
366366+ pub fn type_discriminator(&self) -> Option<&str> {
367367+ self.get("$type")?.as_str()
353368 }
354369355370 /// Get an iterator over the values
···568583 /// Check if this is a null value
569584 pub fn is_null(&self) -> bool {
570585 matches!(self, RawData::Null)
586586+ }
587587+588588+ /// Get the "$type" discriminator field if this is an object with a string "$type" field
589589+ ///
590590+ /// This is a shortcut for union type discrimination in AT Protocol.
591591+ /// Returns `None` if this is not an object or if the "$type" field is missing/not a string.
592592+ pub fn type_discriminator(&self) -> Option<&str> {
593593+ let obj = self.as_object()?;
594594+ let type_val = obj.get("$type")?;
595595+ type_val.as_str()
571596 }
572597573598 /// Serialize to canonical DAG-CBOR bytes for CID computation
+41
crates/jacquard-common/src/types/value/tests.rs
···12821282 let values: Vec<_> = result.values().collect();
12831283 assert_eq!(values.len(), 1);
12841284}
#[test]
fn test_type_discriminator() {
    // Object carrying a "$type" discriminator plus another field.
    let mut fields = BTreeMap::new();
    fields.insert(
        SmolStr::new_static("$type"),
        Data::String(AtprotoStr::String(CowStr::new_static("app.bsky.feed.post"))),
    );
    fields.insert(
        SmolStr::new_static("text"),
        Data::String(AtprotoStr::String(CowStr::new_static("hello"))),
    );
    let typed_obj = Object(fields);
    assert_eq!(typed_obj.type_discriminator(), Some("app.bsky.feed.post"));

    // Data wrapping that object sees the same discriminator.
    let typed_data = Data::Object(typed_obj.clone());
    assert_eq!(typed_data.type_discriminator(), Some("app.bsky.feed.post"));

    // Object without a "$type" field has no discriminator.
    let mut plain_fields = BTreeMap::new();
    plain_fields.insert(SmolStr::new_static("foo"), Data::Integer(42));
    let plain_obj = Object(plain_fields);
    assert_eq!(plain_obj.type_discriminator(), None);
    assert_eq!(Data::Object(plain_obj).type_discriminator(), None);

    // Non-object data never has a discriminator.
    assert_eq!(Data::Integer(42).type_discriminator(), None);

    // RawData object with a string "$type".
    let mut raw_fields = BTreeMap::new();
    raw_fields.insert(
        SmolStr::new_static("$type"),
        RawData::String(CowStr::new_static("test.type")),
    );
    let raw_obj = RawData::Object(raw_fields);
    assert_eq!(raw_obj.type_discriminator(), Some("test.type"));
}
···1313//! - [`union_registry`] - Tracks union types for collision detection
1414//! - [`fs`] - Filesystem utilities for lexicon storage
1515//! - [`derive_impl`] - Implementation functions for derive macros (used by jacquard-derive)
1616+//! - [`validation`] - Runtime validation of Data against lexicon schemas
16171718pub mod codegen;
1819pub mod corpus;
···2223pub mod lexicon;
2324pub mod schema;
2425pub mod union_registry;
2626+pub mod validation;
+9
crates/jacquard-lexicon/src/schema.rs
···6767 /// For fragments, this is the base NSID (without `#fragment`).
6868 fn nsid() -> &'static str;
    /// The definition name within the lexicon document
    ///
    /// Returns "main" for the primary definition, or the fragment name for other defs.
    /// For example, in a lexicon with multiple defs like `pub.leaflet.poll.definition`,
    /// the main type returns "main" while the `Option` type returns "option".
    ///
    /// A default of "main" is provided so existing implementations of this trait
    /// need no change; only fragment types must override it.
    fn def_name() -> &'static str {
        "main"
    }
7878+7079 /// The schema ID for this type
7180 ///
7281 /// Defaults to NSID. Override for fragments to include `#fragment` suffix.
+1278
crates/jacquard-lexicon/src/validation.rs
···11+//! Runtime validation of Data values against lexicon schemas
22+//!
33+//! This module provides infrastructure for validating untyped `Data` values against
44+//! lexicon schemas, enabling partial deserialization, debugging, and schema migration.
55+66+use crate::{lexicon::LexiconDoc, schema::LexiconSchemaRef};
77+use cid::Cid as IpldCid;
88+use dashmap::DashMap;
99+use jacquard_common::{
1010+ IntoStatic,
1111+ smol_str::{self, ToSmolStr},
1212+ types::value::Data,
1313+};
1414+use sha2::{Digest, Sha256};
1515+use smol_str::SmolStr;
1616+use std::{
1717+ fmt,
1818+ sync::{Arc, LazyLock, OnceLock},
1919+};
/// Path to a value within a data structure
///
/// Tracks the location of values during validation for precise error reporting.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ValidationPath {
    // Ordered segments from the root down to the current value; an empty
    // vec means the path points at the root.
    segments: Vec<PathSegment>,
}

/// A segment in a validation path
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum PathSegment {
    /// Object field access
    Field(SmolStr),
    /// Array index access
    Index(usize),
    /// Union variant discriminator
    UnionVariant(SmolStr),
}
3939+4040+impl ValidationPath {
4141+ /// Create a new empty path
4242+ pub fn new() -> Self {
4343+ Self {
4444+ segments: Vec::new(),
4545+ }
4646+ }
4747+4848+ /// Add a field segment to the path
4949+ pub fn push_field(&mut self, name: &str) {
5050+ self.segments.push(PathSegment::Field(name.into()));
5151+ }
5252+5353+ /// Add an index segment to the path
5454+ pub fn push_index(&mut self, idx: usize) {
5555+ self.segments.push(PathSegment::Index(idx));
5656+ }
5757+5858+ /// Add a union variant segment to the path
5959+ pub fn push_variant(&mut self, type_str: &str) {
6060+ self.segments
6161+ .push(PathSegment::UnionVariant(type_str.into()));
6262+ }
6363+6464+ /// Remove the last segment from the path
6565+ pub fn pop(&mut self) {
6666+ self.segments.pop();
6767+ }
6868+6969+ /// Get the depth of the path
7070+ pub fn depth(&self) -> usize {
7171+ self.segments.len()
7272+ }
7373+7474+ /// Check if the path is empty
7575+ pub fn is_empty(&self) -> bool {
7676+ self.segments.is_empty()
7777+ }
7878+}
7979+8080+impl Default for ValidationPath {
8181+ fn default() -> Self {
8282+ Self::new()
8383+ }
8484+}
8585+8686+impl fmt::Display for ValidationPath {
8787+ fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
8888+ if self.segments.is_empty() {
8989+ return write!(f, "(root)");
9090+ }
9191+9292+ for seg in &self.segments {
9393+ match seg {
9494+ PathSegment::Field(name) => write!(f, ".{}", name)?,
9595+ PathSegment::Index(idx) => write!(f, "[{}]", idx)?,
9696+ PathSegment::UnionVariant(t) => write!(f, "($type={})", t)?,
9797+ }
9898+ }
9999+ Ok(())
100100+ }
101101+}
/// Structural validation errors
///
/// These errors indicate that the data structure doesn't match the schema's type expectations.
#[derive(Debug, Clone, thiserror::Error, miette::Diagnostic)]
pub enum StructuralError {
    /// A value's data-model type differs from the type the schema declares.
    #[error("Type mismatch at {path}: expected {expected}, got {actual}")]
    TypeMismatch {
        path: ValidationPath,
        expected: jacquard_common::types::DataModelType,
        actual: jacquard_common::types::DataModelType,
    },

    /// An object is missing a field listed in the schema's `required` list.
    #[error("Missing required field at {path}: '{field}'")]
    MissingRequiredField {
        path: ValidationPath,
        field: SmolStr,
    },

    /// A union value has no string "$type" field to discriminate on.
    #[error("Missing union discriminator ($type) at {path}")]
    MissingUnionDiscriminator { path: ValidationPath },

    /// A closed union's "$type" matched none of the declared refs.
    #[error("Union type mismatch at {path}: $type='{actual_type}' not in [{expected_refs}]")]
    UnionNoMatch {
        path: ValidationPath,
        actual_type: SmolStr,
        expected_refs: SmolStr,
    },

    /// A schema ref pointed at an NSID/def not present in the registry.
    #[error("Unresolved ref at {path}: '{ref_nsid}'")]
    UnresolvedRef {
        path: ValidationPath,
        ref_nsid: SmolStr,
    },

    /// Ref resolution revisited a ref already on the resolution stack.
    #[error("Reference cycle detected at {path}: '{ref_nsid}' (stack: {stack})")]
    RefCycle {
        path: ValidationPath,
        ref_nsid: SmolStr,
        stack: SmolStr,
    },

    /// Nesting exceeded the validator's configured depth limit.
    #[error("Max validation depth exceeded at {path}: {max}")]
    MaxDepthExceeded { path: ValidationPath, max: usize },
}
/// Constraint validation errors
///
/// These errors indicate that the data violates lexicon constraints like max_length,
/// max_graphemes, ranges, etc. The structure is correct but values are out of bounds.
#[derive(Debug, Clone, thiserror::Error, miette::Diagnostic)]
pub enum ConstraintError {
    /// String byte length above the schema's `maxLength`.
    #[error("{path} exceeds max length: {actual} > {max}")]
    MaxLength {
        path: ValidationPath,
        max: usize,
        actual: usize,
    },

    /// Grapheme-cluster count above the schema's `maxGraphemes`.
    #[error("{path} exceeds max graphemes: {actual} > {max}")]
    MaxGraphemes {
        path: ValidationPath,
        max: usize,
        actual: usize,
    },

    /// String byte length below the schema's `minLength`.
    #[error("{path} below min length: {actual} < {min}")]
    MinLength {
        path: ValidationPath,
        min: usize,
        actual: usize,
    },

    /// Grapheme-cluster count below the schema's `minGraphemes`.
    #[error("{path} below min graphemes: {actual} < {min}")]
    MinGraphemes {
        path: ValidationPath,
        min: usize,
        actual: usize,
    },

    /// Integer above the schema's `maximum`.
    #[error("{path} value {actual} exceeds maximum: {max}")]
    Maximum {
        path: ValidationPath,
        max: i64,
        actual: i64,
    },

    /// Integer below the schema's `minimum`.
    #[error("{path} value {actual} below minimum: {min}")]
    Minimum {
        path: ValidationPath,
        min: i64,
        actual: i64,
    },
}
/// Unified validation error type
///
/// Wraps either a [`StructuralError`] or a [`ConstraintError`] so callers can
/// consume both classes of failure through one iterator.
#[derive(Debug, Clone, thiserror::Error)]
pub enum ValidationError {
    /// The data's shape doesn't match the schema.
    #[error(transparent)]
    Structural(#[from] StructuralError),

    /// The shape matches but a value violates a schema constraint.
    #[error(transparent)]
    Constraint(#[from] ConstraintError),
}
/// Registry of lexicon schemas for validation
///
/// Collects schemas from inventory at construction and supports runtime insertion.
#[derive(Debug, Clone)]
pub struct SchemaRegistry {
    /// Schema documents indexed by NSID (concurrent access safe)
    schemas: DashMap<SmolStr, LexiconDoc<'static>>,
}

impl SchemaRegistry {
    /// Build registry from inventory-collected schemas
    ///
    /// Iterates every `LexiconSchemaRef` registered via `inventory` at link time
    /// and materializes its document. If two entries share an NSID, the later
    /// iteration order wins — TODO confirm that duplicate NSIDs are not expected.
    pub fn from_inventory() -> Self {
        let schemas = DashMap::new();

        for entry in inventory::iter::<LexiconSchemaRef> {
            let doc = (entry.provider)();
            schemas.insert(entry.nsid.to_smolstr(), doc);
        }

        Self { schemas }
    }

    /// Create an empty registry
    pub fn new() -> Self {
        Self {
            schemas: DashMap::new(),
        }
    }

    /// Get schema by NSID
    ///
    /// IMPORTANT: Clone the returned schema immediately to avoid holding DashMap ref
    /// (a held guard can deadlock against concurrent writes to the same shard).
    pub fn get(&self, nsid: &str) -> Option<LexiconDoc<'static>> {
        self.schemas.get(nsid).map(|doc| doc.clone())
    }

    /// Insert or update a schema (for runtime schema loading)
    pub fn insert(&self, nsid: SmolStr, doc: LexiconDoc<'static>) {
        self.schemas.insert(nsid, doc);
    }

    /// Get specific def from a schema
    ///
    /// IMPORTANT: Returns cloned def to avoid holding DashMap ref
    pub fn get_def(
        &self,
        nsid: &str,
        def_name: &str,
    ) -> Option<crate::lexicon::LexUserType<'static>> {
        // Clone immediately to release DashMap ref before returning
        self.schemas
            .get(nsid)
            .and_then(|doc| doc.defs.get(def_name).cloned())
    }
}

impl Default for SchemaRegistry {
    // NOTE: defaults to the inventory-populated registry, not `new()`'s empty one.
    fn default() -> Self {
        Self::from_inventory()
    }
}
/// Cache key for validation results
///
/// Content-addressed by CID to enable efficient caching across identical data.
#[derive(Debug, Clone, Hash, Eq, PartialEq)]
struct ValidationCacheKey {
    // NSID of the schema the data was validated against.
    nsid: SmolStr,
    // Def name within that schema ("main" or a fragment name).
    def_name: SmolStr,
    // CID of the data's canonical DAG-CBOR encoding; identical values share a key.
    cid: IpldCid,
}

impl ValidationCacheKey {
    /// Create cache key from schema info and data
    ///
    /// Fails only if the data cannot be encoded to DAG-CBOR / hashed
    /// (see [`CidComputationError`]).
    fn from_data<T: crate::schema::LexiconSchema>(
        data: &Data,
    ) -> Result<Self, CidComputationError> {
        let cid = compute_data_cid(data)?;
        Ok(Self {
            nsid: SmolStr::new_static(T::nsid()),
            def_name: SmolStr::new_static(T::def_name()),
            cid,
        })
    }
}
/// Errors that can occur when computing CIDs
#[derive(Debug, thiserror::Error)]
pub enum CidComputationError {
    /// The Data value could not be encoded as canonical DAG-CBOR.
    #[error("Failed to serialize data to DAG-CBOR: {0}")]
    DagCborEncode(#[from] serde_ipld_dagcbor::EncodeError<std::collections::TryReserveError>),

    /// The SHA-256 digest could not be wrapped into a multihash.
    #[error("Failed to create multihash: {0}")]
    Multihash(#[from] multihash::Error),
}

/// Compute CID for Data value
///
/// Uses SHA-256 hash and DAG-CBOR codec for content addressing.
/// Produces a CIDv1; canonical DAG-CBOR encoding means identical values
/// always yield the same CID.
fn compute_data_cid(data: &Data) -> Result<IpldCid, CidComputationError> {
    // Serialize to DAG-CBOR
    let dag_cbor = data.to_dag_cbor()?;

    // Compute SHA-256 hash
    let hash = Sha256::digest(&dag_cbor);

    // Create multihash (code 0x12 = sha2-256)
    let multihash = multihash::Multihash::wrap(0x12, &hash)?;

    // Create CIDv1 with dag-cbor codec (0x71)
    Ok(IpldCid::new_v1(0x71, multihash))
}
/// Result of validating Data against a schema
///
/// Distinguishes between structural errors (type mismatches, missing fields) and
/// constraint violations (max_length, ranges, etc.). Constraint validation is lazy.
#[derive(Debug, Clone)]
pub struct ValidationResult {
    /// Structural errors (computed immediately)
    structural: Vec<StructuralError>,

    /// Constraint errors (computed on first access)
    constraints: OnceLock<Vec<ConstraintError>>,

    /// Context for lazy constraint validation
    // All three are `None` for results built without context; constraint
    // validation is then skipped and reports no violations.
    data: Option<Arc<Data<'static>>>,
    schema_ref: Option<(SmolStr, SmolStr)>, // (nsid, def_name)
    registry: Option<Arc<SchemaRegistry>>,
}

impl ValidationResult {
    /// Create a validation result with no errors
    pub fn valid() -> Self {
        Self {
            structural: Vec::new(),
            constraints: OnceLock::new(),
            data: None,
            schema_ref: None,
            registry: None,
        }
    }

    /// Create a validation result with structural errors
    pub fn with_structural_errors(errors: Vec<StructuralError>) -> Self {
        Self {
            structural: errors,
            constraints: OnceLock::new(),
            data: None,
            schema_ref: None,
            registry: None,
        }
    }

    /// Create a validation result with context for lazy constraint validation
    pub fn with_context(
        structural: Vec<StructuralError>,
        data: Arc<Data<'static>>,
        nsid: SmolStr,
        def_name: SmolStr,
        registry: Arc<SchemaRegistry>,
    ) -> Self {
        Self {
            structural,
            constraints: OnceLock::new(),
            data: Some(data),
            schema_ref: Some((nsid, def_name)),
            registry: Some(registry),
        }
    }

    /// Check if validation passed (no structural or constraint errors)
    ///
    /// Calling this triggers the lazy constraint pass if it hasn't run yet.
    pub fn is_valid(&self) -> bool {
        self.structural.is_empty() && self.constraint_errors().is_empty()
    }

    /// Check if structurally valid (ignoring constraint checks)
    pub fn is_structurally_valid(&self) -> bool {
        self.structural.is_empty()
    }

    /// Get structural errors
    pub fn structural_errors(&self) -> &[StructuralError] {
        &self.structural
    }

    /// Get constraint errors (computed lazily on first access)
    ///
    /// `OnceLock` guarantees the constraint pass runs at most once per result,
    /// even under concurrent access. Structurally-invalid or context-free
    /// results report no constraint errors.
    // NOTE(review): `validate_constraints` is defined elsewhere in this module
    // (outside the visible region) — confirm its signature takes
    // `Option<&Arc<SchemaRegistry>>` as the final argument.
    pub fn constraint_errors(&self) -> &[ConstraintError] {
        self.constraints.get_or_init(|| {
            // If no context or structurally invalid, skip constraint validation
            if !self.is_structurally_valid() || self.data.is_none() || self.schema_ref.is_none() {
                return Vec::new();
            }

            let data = self.data.as_ref().unwrap();
            let (nsid, def_name) = self.schema_ref.as_ref().unwrap();

            let mut path = ValidationPath::new();
            validate_constraints(
                &mut path,
                data,
                nsid.as_str(),
                def_name.as_str(),
                self.registry.as_ref(),
            )
        })
    }

    /// Check if there are any constraint violations
    pub fn has_constraint_violations(&self) -> bool {
        !self.constraint_errors().is_empty()
    }

    /// Get all errors (structural and constraint)
    pub fn all_errors(&self) -> impl Iterator<Item = ValidationError> + '_ {
        self.structural
            .iter()
            .cloned()
            .map(ValidationError::Structural)
            .chain(
                self.constraint_errors()
                    .iter()
                    .cloned()
                    .map(ValidationError::Constraint),
            )
    }
}
/// Schema validator with caching
///
/// Validates Data values against lexicon schemas, caching results by content hash.
pub struct SchemaValidator {
    registry: SchemaRegistry,
    // Content-addressed result cache. NOTE(review): entries are never evicted,
    // so long-lived processes validating many distinct values grow unboundedly —
    // confirm whether an eviction policy is planned.
    cache: DashMap<ValidationCacheKey, Arc<ValidationResult>>,
}

impl SchemaValidator {
    /// Get the global validator instance
    ///
    /// Lazily initialized from the inventory-collected schema registry on
    /// first use; subsequent calls return the same instance.
    pub fn global() -> &'static Self {
        static VALIDATOR: LazyLock<SchemaValidator> = LazyLock::new(|| SchemaValidator {
            registry: SchemaRegistry::from_inventory(),
            cache: DashMap::new(),
        });
        &VALIDATOR
    }

    /// Create a new validator with empty registry
    pub fn new() -> Self {
        Self {
            registry: SchemaRegistry::new(),
            cache: DashMap::new(),
        }
    }

    /// Validate data against a schema
    ///
    /// Results are cached by content hash for efficiency.
    ///
    /// # Errors
    /// Fails only if the data cannot be DAG-CBOR-encoded for the cache key;
    /// validation failures are reported inside the returned [`ValidationResult`].
    pub fn validate<T: crate::schema::LexiconSchema>(
        &self,
        data: &Data,
    ) -> Result<ValidationResult, CidComputationError> {
        // Compute cache key
        let key = ValidationCacheKey::from_data::<T>(data)?;

        // Check cache (clone Arc immediately to avoid holding ref)
        if let Some(cached) = self.cache.get(&key).map(|r| Arc::clone(&r)) {
            return Ok((*cached).clone());
        }

        // Cache miss: run structural validation now (constraints stay lazy)
        let result = self.validate_uncached::<T>(data);

        // Cache result
        self.cache.insert(key, Arc::new(result.clone()));

        Ok(result)
    }

    /// Validate without caching (internal)
    fn validate_uncached<T: crate::schema::LexiconSchema>(&self, data: &Data) -> ValidationResult {
        let def = match self.registry.get_def(T::nsid(), T::def_name()) {
            Some(d) => d,
            None => {
                // Schema not found - this is a structural error
                return ValidationResult::with_structural_errors(vec![
                    StructuralError::UnresolvedRef {
                        path: ValidationPath::new(),
                        ref_nsid: format!("{}#{}", T::nsid(), T::def_name()).into(),
                    },
                ]);
            }
        };

        let mut path = ValidationPath::new();
        let mut ctx = ValidationContext::new(T::nsid(), T::def_name());

        let errors = validate_def(&mut path, data, &def, &self.registry, &mut ctx);

        // If structurally valid, create result with context for lazy constraint validation
        if errors.is_empty() {
            // Convert data to owned for constraint validation
            let owned_data = Arc::new(data.clone().into_static());
            ValidationResult::with_context(
                errors,
                owned_data,
                SmolStr::new_static(T::nsid()),
                SmolStr::new_static(T::def_name()),
                Arc::new(self.registry.clone()),
            )
        } else {
            ValidationResult::with_structural_errors(errors)
        }
    }

    /// Get the schema registry
    pub fn registry(&self) -> &SchemaRegistry {
        &self.registry
    }
}

impl Default for SchemaValidator {
    // NOTE: unlike `SchemaRegistry::default()`, this is the EMPTY validator,
    // not the inventory-populated one — use `global()` for that.
    fn default() -> Self {
        Self::new()
    }
}
/// Validation context for tracking refs and preventing cycles
struct ValidationContext {
    // NSID of the lexicon document currently being validated against;
    // updated (and restored) as refs/unions cross document boundaries.
    current_nsid: String,
    // Def name currently being validated against.
    // NOTE(review): only written in the visible code, never read — confirm
    // whether it is used by constraint validation or can be dropped.
    current_def: String,
    // Stack of fully-qualified refs ("nsid#def") being resolved; used for
    // cycle detection in the Ref arm of `validate_property`.
    ref_stack: Vec<String>,
    // Hard recursion limit on path depth.
    max_depth: usize,
}

impl ValidationContext {
    fn new(nsid: &str, def_name: &str) -> Self {
        Self {
            current_nsid: nsid.to_string(),
            current_def: def_name.to_string(),
            ref_stack: Vec::new(),
            // 32 levels of nesting is far beyond any practical lexicon.
            max_depth: 32,
        }
    }
}
/// Normalize a ref string to (nsid, def_name)
///
/// Handles the three lexicon ref forms:
/// - `#frag`      — fragment-only, resolved against `current_nsid`
/// - `nsid#frag`  — fully qualified fragment ref
/// - `nsid`       — bare NSID, meaning that document's "main" def
fn normalize_ref(ref_str: &str, current_nsid: &str) -> (String, String) {
    match ref_str.split_once('#') {
        // "#option" -> fragment within the current document
        Some(("", fragment)) => (current_nsid.to_string(), fragment.to_string()),
        // "com.example.foo#bar" -> explicit document + fragment
        Some((nsid, fragment)) => (nsid.to_string(), fragment.to_string()),
        // "com.example.foo" -> the document's main def
        None => (ref_str.to_string(), "main".to_string()),
    }
}
565565+566566+/// Validate data against a lexicon def
567567+fn validate_def(
568568+ path: &mut ValidationPath,
569569+ data: &Data,
570570+ def: &crate::lexicon::LexUserType,
571571+ registry: &SchemaRegistry,
572572+ ctx: &mut ValidationContext,
573573+) -> Vec<StructuralError> {
574574+ use crate::lexicon::LexUserType;
575575+ use jacquard_common::types::DataModelType;
576576+577577+ match def {
578578+ LexUserType::Object(obj) => {
579579+ // Must be an object
580580+ let Data::Object(obj_data) = data else {
581581+ return vec![StructuralError::TypeMismatch {
582582+ path: path.clone(),
583583+ expected: DataModelType::Object,
584584+ actual: data.data_type(),
585585+ }];
586586+ };
587587+588588+ let mut errors = Vec::new();
589589+590590+ // Check required fields
591591+ if let Some(required) = &obj.required {
592592+ for field in required {
593593+ if !obj_data.get(field.as_ref()).is_some() {
594594+ errors.push(StructuralError::MissingRequiredField {
595595+ path: path.clone(),
596596+ field: field.clone(),
597597+ });
598598+ }
599599+ }
600600+ }
601601+602602+ // Validate each property that's present
603603+ for (name, prop) in &obj.properties {
604604+ if let Some(field_data) = obj_data.get(name.as_ref()) {
605605+ path.push_field(name.as_ref());
606606+ errors.extend(validate_property(path, field_data, prop, registry, ctx));
607607+ path.pop();
608608+ }
609609+ }
610610+611611+ errors
612612+ }
613613+ // Other def types (Record, Token, etc.) would go here
614614+ // For now, just handle Object since that's what our tests use
615615+ _ => Vec::new(),
616616+ }
617617+}
618618+619619+/// Validate data against a property schema
620620+fn validate_property(
621621+ path: &mut ValidationPath,
622622+ data: &Data,
623623+ prop: &crate::lexicon::LexObjectProperty,
624624+ registry: &SchemaRegistry,
625625+ ctx: &mut ValidationContext,
626626+) -> Vec<StructuralError> {
627627+ use crate::lexicon::LexObjectProperty;
628628+ use jacquard_common::types::DataModelType;
629629+630630+ match prop {
631631+ LexObjectProperty::String(_) => {
632632+ // Accept any string type
633633+ if !matches!(data.data_type(), DataModelType::String(_)) {
634634+ vec![StructuralError::TypeMismatch {
635635+ path: path.clone(),
636636+ expected: DataModelType::String(
637637+ jacquard_common::types::LexiconStringType::String,
638638+ ),
639639+ actual: data.data_type(),
640640+ }]
641641+ } else {
642642+ Vec::new()
643643+ }
644644+ }
645645+646646+ LexObjectProperty::Integer(_) => {
647647+ if !matches!(data.data_type(), DataModelType::Integer) {
648648+ vec![StructuralError::TypeMismatch {
649649+ path: path.clone(),
650650+ expected: DataModelType::Integer,
651651+ actual: data.data_type(),
652652+ }]
653653+ } else {
654654+ Vec::new()
655655+ }
656656+ }
657657+658658+ LexObjectProperty::Boolean(_) => {
659659+ if !matches!(data.data_type(), DataModelType::Boolean) {
660660+ vec![StructuralError::TypeMismatch {
661661+ path: path.clone(),
662662+ expected: DataModelType::Boolean,
663663+ actual: data.data_type(),
664664+ }]
665665+ } else {
666666+ Vec::new()
667667+ }
668668+ }
669669+670670+ LexObjectProperty::Object(obj) => {
671671+ let Data::Object(obj_data) = data else {
672672+ return vec![StructuralError::TypeMismatch {
673673+ path: path.clone(),
674674+ expected: DataModelType::Object,
675675+ actual: data.data_type(),
676676+ }];
677677+ };
678678+679679+ let mut errors = Vec::new();
680680+681681+ // Check required fields
682682+ if let Some(required) = &obj.required {
683683+ for field in required {
684684+ if !obj_data.get(field.as_ref()).is_some() {
685685+ errors.push(StructuralError::MissingRequiredField {
686686+ path: path.clone(),
687687+ field: field.clone(),
688688+ });
689689+ }
690690+ }
691691+ }
692692+693693+ // Recursively validate each property
694694+ for (name, schema_prop) in &obj.properties {
695695+ if let Some(field_data) = obj_data.get(name.as_ref()) {
696696+ path.push_field(name.as_ref());
697697+ errors.extend(validate_property(
698698+ path,
699699+ field_data,
700700+ schema_prop,
701701+ registry,
702702+ ctx,
703703+ ));
704704+ path.pop();
705705+ }
706706+ }
707707+708708+ errors
709709+ }
710710+711711+ LexObjectProperty::Array(arr) => {
712712+ let Data::Array(array) = data else {
713713+ return vec![StructuralError::TypeMismatch {
714714+ path: path.clone(),
715715+ expected: DataModelType::Array,
716716+ actual: data.data_type(),
717717+ }];
718718+ };
719719+720720+ let mut errors = Vec::new();
721721+ for (idx, item) in array.iter().enumerate() {
722722+ path.push_index(idx);
723723+ errors.extend(validate_array_item(path, item, &arr.items, registry, ctx));
724724+ path.pop();
725725+ }
726726+ errors
727727+ }
728728+729729+ LexObjectProperty::Union(u) => {
730730+ let Data::Object(obj) = data else {
731731+ return vec![StructuralError::TypeMismatch {
732732+ path: path.clone(),
733733+ expected: DataModelType::Object,
734734+ actual: data.data_type(),
735735+ }];
736736+ };
737737+738738+ // Get $type discriminator
739739+ let Some(type_str) = obj.type_discriminator() else {
740740+ return vec![StructuralError::MissingUnionDiscriminator { path: path.clone() }];
741741+ };
742742+743743+ // Try to match against refs
744744+ for variant_ref in &u.refs {
745745+ let (variant_nsid, variant_def) =
746746+ normalize_ref(variant_ref.as_ref(), &ctx.current_nsid);
747747+ let full_variant = format!("{}#{}", variant_nsid, variant_def);
748748+749749+ // Match by full ref or just nsid
750750+ if type_str == full_variant || type_str == variant_nsid {
751751+ // Found match - validate against this variant
752752+ let Some(variant_def_type) = registry.get_def(&variant_nsid, &variant_def)
753753+ else {
754754+ return vec![StructuralError::UnresolvedRef {
755755+ path: path.clone(),
756756+ ref_nsid: full_variant.into(),
757757+ }];
758758+ };
759759+760760+ path.push_variant(type_str);
761761+ let old_nsid = std::mem::replace(&mut ctx.current_nsid, variant_nsid);
762762+ let old_def = std::mem::replace(&mut ctx.current_def, variant_def);
763763+764764+ let errors = validate_def(path, data, &variant_def_type, registry, ctx);
765765+766766+ ctx.current_nsid = old_nsid;
767767+ ctx.current_def = old_def;
768768+ path.pop();
769769+770770+ return errors;
771771+ }
772772+ }
773773+774774+ // No match found
775775+ if u.closed.unwrap_or(false) {
776776+ // Closed union - this is an error
777777+ let expected_refs = u
778778+ .refs
779779+ .iter()
780780+ .map(|r| r.as_ref())
781781+ .collect::<Vec<_>>()
782782+ .join(", ");
783783+ vec![StructuralError::UnionNoMatch {
784784+ path: path.clone(),
785785+ actual_type: type_str.into(),
786786+ expected_refs: expected_refs.into(),
787787+ }]
788788+ } else {
789789+ // Open union - allow unknown variants
790790+ Vec::new()
791791+ }
792792+ }
793793+794794+ LexObjectProperty::Ref(r) => {
795795+ // Depth check
796796+ if path.depth() >= ctx.max_depth {
797797+ return vec![StructuralError::MaxDepthExceeded {
798798+ path: path.clone(),
799799+ max: ctx.max_depth,
800800+ }];
801801+ }
802802+803803+ // Normalize ref
804804+ let (ref_nsid, ref_def) = normalize_ref(r.r#ref.as_ref(), &ctx.current_nsid);
805805+ let full_ref = format!("{}#{}", ref_nsid, ref_def);
806806+807807+ // Cycle detection
808808+ if ctx.ref_stack.contains(&full_ref) {
809809+ let stack = ctx.ref_stack.join(" -> ");
810810+ return vec![StructuralError::RefCycle {
811811+ path: path.clone(),
812812+ ref_nsid: full_ref.into(),
813813+ stack: stack.into(),
814814+ }];
815815+ }
816816+817817+ // Look up ref
818818+ let Some(ref_def_type) = registry.get_def(&ref_nsid, &ref_def) else {
819819+ return vec![StructuralError::UnresolvedRef {
820820+ path: path.clone(),
821821+ ref_nsid: full_ref.into(),
822822+ }];
823823+ };
824824+825825+ // Push, validate, pop
826826+ ctx.ref_stack.push(full_ref);
827827+ let old_nsid = std::mem::replace(&mut ctx.current_nsid, ref_nsid);
828828+ let old_def = std::mem::replace(&mut ctx.current_def, ref_def);
829829+830830+ let errors = validate_def(path, data, &ref_def_type, registry, ctx);
831831+832832+ ctx.current_nsid = old_nsid;
833833+ ctx.current_def = old_def;
834834+ ctx.ref_stack.pop();
835835+836836+ errors
837837+ }
838838+839839+ LexObjectProperty::Bytes(_) => {
840840+ if !matches!(data.data_type(), DataModelType::Bytes) {
841841+ vec![StructuralError::TypeMismatch {
842842+ path: path.clone(),
843843+ expected: DataModelType::Bytes,
844844+ actual: data.data_type(),
845845+ }]
846846+ } else {
847847+ Vec::new()
848848+ }
849849+ }
850850+851851+ LexObjectProperty::CidLink(_) => {
852852+ if !matches!(data.data_type(), DataModelType::CidLink) {
853853+ vec![StructuralError::TypeMismatch {
854854+ path: path.clone(),
855855+ expected: DataModelType::CidLink,
856856+ actual: data.data_type(),
857857+ }]
858858+ } else {
859859+ Vec::new()
860860+ }
861861+ }
862862+863863+ LexObjectProperty::Blob(_) => {
864864+ if !matches!(data.data_type(), DataModelType::Blob) {
865865+ vec![StructuralError::TypeMismatch {
866866+ path: path.clone(),
867867+ expected: DataModelType::Blob,
868868+ actual: data.data_type(),
869869+ }]
870870+ } else {
871871+ Vec::new()
872872+ }
873873+ }
874874+875875+ LexObjectProperty::Unknown(_) => {
876876+ // Any type allowed
877877+ Vec::new()
878878+ }
879879+ }
880880+}
881881+882882+/// Validate array item against array item schema
883883+fn validate_array_item(
884884+ path: &mut ValidationPath,
885885+ data: &Data,
886886+ item_schema: &crate::lexicon::LexArrayItem,
887887+ registry: &SchemaRegistry,
888888+ ctx: &mut ValidationContext,
889889+) -> Vec<StructuralError> {
890890+ use crate::lexicon::LexArrayItem;
891891+892892+ match item_schema {
893893+ LexArrayItem::String(s) => validate_property(
894894+ path,
895895+ data,
896896+ &crate::lexicon::LexObjectProperty::String(s.clone()),
897897+ registry,
898898+ ctx,
899899+ ),
900900+ LexArrayItem::Integer(i) => validate_property(
901901+ path,
902902+ data,
903903+ &crate::lexicon::LexObjectProperty::Integer(i.clone()),
904904+ registry,
905905+ ctx,
906906+ ),
907907+ LexArrayItem::Boolean(b) => validate_property(
908908+ path,
909909+ data,
910910+ &crate::lexicon::LexObjectProperty::Boolean(b.clone()),
911911+ registry,
912912+ ctx,
913913+ ),
914914+ LexArrayItem::Object(o) => validate_property(
915915+ path,
916916+ data,
917917+ &crate::lexicon::LexObjectProperty::Object(o.clone()),
918918+ registry,
919919+ ctx,
920920+ ),
921921+ LexArrayItem::Unknown(u) => validate_property(
922922+ path,
923923+ data,
924924+ &crate::lexicon::LexObjectProperty::Unknown(u.clone()),
925925+ registry,
926926+ ctx,
927927+ ),
928928+ LexArrayItem::Bytes(b) => validate_property(
929929+ path,
930930+ data,
931931+ &crate::lexicon::LexObjectProperty::Bytes(b.clone()),
932932+ registry,
933933+ ctx,
934934+ ),
935935+ LexArrayItem::CidLink(c) => validate_property(
936936+ path,
937937+ data,
938938+ &crate::lexicon::LexObjectProperty::CidLink(c.clone()),
939939+ registry,
940940+ ctx,
941941+ ),
942942+ LexArrayItem::Blob(b) => validate_property(
943943+ path,
944944+ data,
945945+ &crate::lexicon::LexObjectProperty::Blob(b.clone()),
946946+ registry,
947947+ ctx,
948948+ ),
949949+ LexArrayItem::Ref(r) => validate_property(
950950+ path,
951951+ data,
952952+ &crate::lexicon::LexObjectProperty::Ref(r.clone()),
953953+ registry,
954954+ ctx,
955955+ ),
956956+ LexArrayItem::Union(u) => validate_property(
957957+ path,
958958+ data,
959959+ &crate::lexicon::LexObjectProperty::Union(u.clone()),
960960+ registry,
961961+ ctx,
962962+ ),
963963+ }
964964+}
965965+966966+// ============================================================================
967967+// CONSTRAINT VALIDATION
968968+// ============================================================================
969969+970970+/// Validate constraints on data against schema (entry point with optional registry)
971971+fn validate_constraints(
972972+ path: &mut ValidationPath,
973973+ data: &Data,
974974+ nsid: &str,
975975+ def_name: &str,
976976+ registry: Option<&Arc<SchemaRegistry>>,
977977+) -> Vec<ConstraintError> {
978978+ // Use provided registry or fall back to global inventory
979979+ let fallback_registry;
980980+ let registry_ref = match registry {
981981+ Some(r) => r.as_ref(),
982982+ None => {
983983+ fallback_registry = SchemaRegistry::from_inventory();
984984+ &fallback_registry
985985+ }
986986+ };
987987+988988+ validate_constraints_impl(path, data, nsid, def_name, registry_ref)
989989+}
990990+991991+/// Internal implementation that takes materialized registry
992992+fn validate_constraints_impl(
993993+ path: &mut ValidationPath,
994994+ data: &Data,
995995+ nsid: &str,
996996+ def_name: &str,
997997+ registry: &SchemaRegistry,
998998+) -> Vec<ConstraintError> {
999999+ use crate::lexicon::LexUserType;
10001000+10011001+ // Get schema def
10021002+ let Some(def) = registry.get_def(nsid, def_name) else {
10031003+ return Vec::new();
10041004+ };
10051005+10061006+ match def {
10071007+ LexUserType::Object(obj) => {
10081008+ let Data::Object(obj_data) = data else {
10091009+ return Vec::new();
10101010+ };
10111011+10121012+ let mut errors = Vec::new();
10131013+10141014+ // Check constraints on each property
10151015+ for (name, prop) in &obj.properties {
10161016+ if let Some(field_data) = obj_data.get(name.as_ref()) {
10171017+ path.push_field(name.as_ref());
10181018+ errors.extend(check_property_constraints(path, field_data, prop, registry));
10191019+ path.pop();
10201020+ }
10211021+ }
10221022+10231023+ errors
10241024+ }
10251025+ // Other def types would go here
10261026+ _ => Vec::new(),
10271027+ }
10281028+}
10291029+10301030+/// Check constraints on a property
10311031+fn check_property_constraints(
10321032+ path: &mut ValidationPath,
10331033+ data: &Data,
10341034+ prop: &crate::lexicon::LexObjectProperty,
10351035+ registry: &SchemaRegistry,
10361036+) -> Vec<ConstraintError> {
10371037+ use crate::lexicon::LexObjectProperty;
10381038+10391039+ match prop {
10401040+ LexObjectProperty::String(s) => {
10411041+ if let Data::String(str_val) = data {
10421042+ check_string_constraints(path, str_val.as_str(), s)
10431043+ } else {
10441044+ Vec::new()
10451045+ }
10461046+ }
10471047+10481048+ LexObjectProperty::Integer(i) => {
10491049+ if let Data::Integer(int_val) = data {
10501050+ check_integer_constraints(path, *int_val, i)
10511051+ } else {
10521052+ Vec::new()
10531053+ }
10541054+ }
10551055+10561056+ LexObjectProperty::Array(arr) => {
10571057+ if let Data::Array(array) = data {
10581058+ let mut errors = check_array_constraints(path, array, arr);
10591059+10601060+ // Also check constraints on array items
10611061+ for (idx, item) in array.iter().enumerate() {
10621062+ path.push_index(idx);
10631063+ errors.extend(check_array_item_constraints(
10641064+ path, item, &arr.items, registry,
10651065+ ));
10661066+ path.pop();
10671067+ }
10681068+10691069+ errors
10701070+ } else {
10711071+ Vec::new()
10721072+ }
10731073+ }
10741074+10751075+ LexObjectProperty::Object(obj) => {
10761076+ if let Data::Object(obj_data) = data {
10771077+ let mut errors = Vec::new();
10781078+10791079+ // Recursively check nested object properties
10801080+ for (name, schema_prop) in &obj.properties {
10811081+ if let Some(field_data) = obj_data.get(name.as_ref()) {
10821082+ path.push_field(name.as_ref());
10831083+ errors.extend(check_property_constraints(
10841084+ path,
10851085+ field_data,
10861086+ schema_prop,
10871087+ registry,
10881088+ ));
10891089+ path.pop();
10901090+ }
10911091+ }
10921092+10931093+ errors
10941094+ } else {
10951095+ Vec::new()
10961096+ }
10971097+ }
10981098+10991099+ LexObjectProperty::Ref(r) => {
11001100+ // Follow ref and check constraints
11011101+ let (ref_nsid, ref_def) = normalize_ref(r.r#ref.as_ref(), ""); // FIXME: need current nsid
11021102+11031103+ if registry.get_def(&ref_nsid, &ref_def).is_some() {
11041104+ validate_constraints_impl(path, data, &ref_nsid, &ref_def, registry)
11051105+ } else {
11061106+ Vec::new()
11071107+ }
11081108+ }
11091109+11101110+ // Other property types don't have constraints
11111111+ _ => Vec::new(),
11121112+ }
11131113+}
11141114+11151115+/// Check string constraints
11161116+fn check_string_constraints(
11171117+ path: &ValidationPath,
11181118+ value: &str,
11191119+ schema: &crate::lexicon::LexString,
11201120+) -> Vec<ConstraintError> {
11211121+ let mut errors = Vec::new();
11221122+11231123+ // Check byte length constraints
11241124+ let byte_len = value.len();
11251125+11261126+ if let Some(min) = schema.min_length {
11271127+ if byte_len < min as usize {
11281128+ errors.push(ConstraintError::MinLength {
11291129+ path: path.clone(),
11301130+ min: min as usize,
11311131+ actual: byte_len,
11321132+ });
11331133+ }
11341134+ }
11351135+11361136+ if let Some(max) = schema.max_length {
11371137+ if byte_len > max as usize {
11381138+ errors.push(ConstraintError::MaxLength {
11391139+ path: path.clone(),
11401140+ max: max as usize,
11411141+ actual: byte_len,
11421142+ });
11431143+ }
11441144+ }
11451145+11461146+ // Check grapheme count constraints
11471147+ if schema.min_graphemes.is_some() || schema.max_graphemes.is_some() {
11481148+ use unicode_segmentation::UnicodeSegmentation;
11491149+ let grapheme_count = value.graphemes(true).count();
11501150+11511151+ if let Some(min) = schema.min_graphemes {
11521152+ if grapheme_count < min as usize {
11531153+ errors.push(ConstraintError::MinGraphemes {
11541154+ path: path.clone(),
11551155+ min: min as usize,
11561156+ actual: grapheme_count,
11571157+ });
11581158+ }
11591159+ }
11601160+11611161+ if let Some(max) = schema.max_graphemes {
11621162+ if grapheme_count > max as usize {
11631163+ errors.push(ConstraintError::MaxGraphemes {
11641164+ path: path.clone(),
11651165+ max: max as usize,
11661166+ actual: grapheme_count,
11671167+ });
11681168+ }
11691169+ }
11701170+ }
11711171+11721172+ errors
11731173+}
11741174+11751175+/// Check integer constraints
11761176+fn check_integer_constraints(
11771177+ path: &ValidationPath,
11781178+ value: i64,
11791179+ schema: &crate::lexicon::LexInteger,
11801180+) -> Vec<ConstraintError> {
11811181+ let mut errors = Vec::new();
11821182+11831183+ if let Some(min) = schema.minimum {
11841184+ if value < min {
11851185+ errors.push(ConstraintError::Minimum {
11861186+ path: path.clone(),
11871187+ min,
11881188+ actual: value,
11891189+ });
11901190+ }
11911191+ }
11921192+11931193+ if let Some(max) = schema.maximum {
11941194+ if value > max {
11951195+ errors.push(ConstraintError::Maximum {
11961196+ path: path.clone(),
11971197+ max,
11981198+ actual: value,
11991199+ });
12001200+ }
12011201+ }
12021202+12031203+ errors
12041204+}
12051205+12061206+/// Check array length constraints
12071207+fn check_array_constraints(
12081208+ path: &ValidationPath,
12091209+ array: &jacquard_common::types::value::Array,
12101210+ schema: &crate::lexicon::LexArray,
12111211+) -> Vec<ConstraintError> {
12121212+ let mut errors = Vec::new();
12131213+ let len = array.len();
12141214+12151215+ if let Some(min) = schema.min_length {
12161216+ if len < min as usize {
12171217+ errors.push(ConstraintError::MinLength {
12181218+ path: path.clone(),
12191219+ min: min as usize,
12201220+ actual: len,
12211221+ });
12221222+ }
12231223+ }
12241224+12251225+ if let Some(max) = schema.max_length {
12261226+ if len > max as usize {
12271227+ errors.push(ConstraintError::MaxLength {
12281228+ path: path.clone(),
12291229+ max: max as usize,
12301230+ actual: len,
12311231+ });
12321232+ }
12331233+ }
12341234+12351235+ errors
12361236+}
12371237+12381238+/// Check constraints on array items
12391239+fn check_array_item_constraints(
12401240+ path: &mut ValidationPath,
12411241+ data: &Data,
12421242+ item_schema: &crate::lexicon::LexArrayItem,
12431243+ registry: &SchemaRegistry,
12441244+) -> Vec<ConstraintError> {
12451245+ use crate::lexicon::LexArrayItem;
12461246+12471247+ match item_schema {
12481248+ LexArrayItem::String(s) => check_property_constraints(
12491249+ path,
12501250+ data,
12511251+ &crate::lexicon::LexObjectProperty::String(s.clone()),
12521252+ registry,
12531253+ ),
12541254+ LexArrayItem::Integer(i) => check_property_constraints(
12551255+ path,
12561256+ data,
12571257+ &crate::lexicon::LexObjectProperty::Integer(i.clone()),
12581258+ registry,
12591259+ ),
12601260+ LexArrayItem::Object(o) => check_property_constraints(
12611261+ path,
12621262+ data,
12631263+ &crate::lexicon::LexObjectProperty::Object(o.clone()),
12641264+ registry,
12651265+ ),
12661266+ LexArrayItem::Ref(r) => check_property_constraints(
12671267+ path,
12681268+ data,
12691269+ &crate::lexicon::LexObjectProperty::Ref(r.clone()),
12701270+ registry,
12711271+ ),
12721272+ // Other array item types don't have constraints
12731273+ _ => Vec::new(),
12741274+ }
12751275+}
12761276+12771277+#[cfg(test)]
12781278+mod tests;