A better Rust ATProto crate
at main 453 lines 14 kB view raw
1//! # Schema Extraction 2//! 3//! Extract AT Protocol lexicon schemas from Rust types via `inventory` discovery. 4//! 5//! ## Usage Pattern 6//! 7//! This module provides schema extraction for types implementing `LexiconSchema`. 8//! The extraction binary discovers schemas at **link time** via `inventory`, so you need 9//! to create a binary in your workspace that links your schema types. 10//! 11//! ### Simple Usage 12//! 13//! ```rust,ignore 14//! // bin/extract_schemas.rs 15//! use jacquard_lexgen::schema_extraction; 16//! 17//! // Import your types so they get linked 18//! use my_app::models::*; 19//! 20//! fn main() -> miette::Result<()> { 21//! schema_extraction::run( 22//! "lexicons", // output directory 23//! true, // verbose 24//! ) 25//! } 26//! ``` 27//! 28//! ### Advanced Usage 29//! 30//! ```rust,ignore 31//! use jacquard_lexgen::schema_extraction::{ExtractOptions, SchemaExtractor}; 32//! use my_app::models::*; // Your schema types 33//! 34//! fn main() -> miette::Result<()> { 35//! let options = ExtractOptions { 36//! output_dir: "lexicons".into(), 37//! verbose: true, 38//! filter: Some("app.bsky".into()), // Only extract app.bsky.* schemas 39//! validate: true, 40//! pretty: true, 41//! }; 42//! 43//! SchemaExtractor::new(options).extract_all() 44//! } 45//! ``` 46//! 47//! ### Integration with Build Tools 48//! 49//! **Just:** 50//! ```justfile 51//! # Generate lexicon schemas from Rust types 52//! extract-schemas: 53//! cargo run --bin extract-schemas 54//! ``` 55//! 56//! **Cargo xtask:** 57//! ```rust,ignore 58//! // xtask/src/main.rs 59//! match args { 60//! "codegen" => { 61//! run_command("cargo", &["run", "--bin", "extract-schemas"])?; 62//! } 63//! } 64//! ``` 65//! 66//! **Pre-commit hook:** 67//! ```bash 68//! #!/bin/bash 69//! # Regenerate schemas when Rust files change 70//! if git diff --cached --name-only | grep -E '\.rs$'; then 71//! cargo run --bin extract-schemas 72//! git add lexicons/*.json 73//! fi 74//! ``` 75 76use jacquard_lexicon::lexicon::LexiconDoc; 77use jacquard_lexicon::schema::LexiconSchemaRef; 78use miette::{IntoDiagnostic, Result}; 79use std::collections::BTreeMap; 80use std::fs; 81use std::path::{Path, PathBuf}; 82 83/// Options for schema extraction 84pub struct ExtractOptions { 85 /// Output directory for generated schema files 86 pub output_dir: PathBuf, 87 /// Enable verbose output 88 pub verbose: bool, 89 /// Filter by NSID prefix (e.g., "app.bsky") 90 pub filter: Option<String>, 91 /// Validate schemas before writing 92 pub validate: bool, 93 /// Pretty-print JSON output 94 pub pretty: bool, 95} 96 97impl Default for ExtractOptions { 98 fn default() -> Self { 99 Self { 100 output_dir: PathBuf::from("lexicons"), 101 verbose: false, 102 filter: None, 103 validate: true, 104 pretty: true, 105 } 106 } 107} 108 109/// Run schema extraction with simple defaults 110/// 111/// Convenience function for the common case. For more control, use [`SchemaExtractor`]. 112/// 113/// # Arguments 114/// 115/// * `output_dir` - Directory to write schema files (will be created if needed) 116/// * `verbose` - Print progress information 117/// 118/// # Example 119/// 120/// ```rust,ignore 121/// use jacquard_lexgen::schema_extraction; 122/// use my_app::models::*; // Your types with #[derive(LexiconSchema)] 123/// 124/// fn main() -> miette::Result<()> { 125/// schema_extraction::run("lexicons", true) 126/// } 127/// ``` 128pub fn run(output_dir: impl AsRef<Path>, verbose: bool) -> Result<()> { 129 let options = ExtractOptions { 130 output_dir: output_dir.as_ref().to_path_buf(), 131 verbose, 132 ..Default::default() 133 }; 134 135 SchemaExtractor::new(options).extract_all() 136} 137 138pub struct SchemaExtractor { 139 options: ExtractOptions, 140} 141 142impl SchemaExtractor { 143 pub fn new(options: ExtractOptions) -> Self { 144 Self { options } 145 } 146 147 /// Extract all schemas from inventory 148 pub fn extract_all(&self) -> Result<()> { 149 if self.options.verbose { 150 println!("Discovering schemas via inventory..."); 151 } 152 153 // Collect all schema refs from inventory 154 let refs: Vec<&LexiconSchemaRef> = inventory::iter::<LexiconSchemaRef>().collect(); 155 156 if self.options.verbose { 157 println!("Found {} schema types", refs.len()); 158 } 159 160 // Group by base NSID 161 let grouped = self.group_by_base_nsid(&refs)?; 162 163 // Create output directory 164 fs::create_dir_all(&self.options.output_dir).into_diagnostic()?; 165 166 // Process each group 167 let mut written = 0; 168 for (base_nsid, group_refs) in grouped { 169 // Apply filter if specified 170 if let Some(filter) = &self.options.filter { 171 if !base_nsid.starts_with(filter) { 172 continue; 173 } 174 } 175 176 if self.options.verbose { 177 println!("Processing {} ({} types)", base_nsid, group_refs.len()); 178 } 179 180 self.write_lexicon(&base_nsid, &group_refs)?; 181 written += 1; 182 } 183 184 println!( 185 "✓ Wrote {} lexicon files to {}", 186 written, 187 self.options.output_dir.display() 188 ); 189 190 Ok(()) 191 } 192 193 /// Group refs by base NSID (strip fragment suffix) 194 fn group_by_base_nsid<'a>( 195 &self, 196 refs: &[&'a LexiconSchemaRef], 197 ) -> Result<BTreeMap<String, Vec<&'a LexiconSchemaRef>>> { 198 let mut groups: BTreeMap<String, Vec<&'a LexiconSchemaRef>> = BTreeMap::new(); 199 200 for schema_ref in refs { 201 let nsid = schema_ref.nsid; 202 203 // Split on # to get base NSID 204 let base_nsid = if let Some(pos) = nsid.find('#') { 205 &nsid[..pos] 206 } else { 207 nsid 208 }; 209 210 groups 211 .entry(base_nsid.to_string()) 212 .or_default() 213 .push(schema_ref); 214 } 215 216 Ok(groups) 217 } 218 219 /// Write a single lexicon file 220 fn write_lexicon(&self, base_nsid: &str, refs: &[&LexiconSchemaRef]) -> Result<()> { 221 // Generate all schemas in this group 222 let mut all_defs = BTreeMap::new(); 223 let mut primary_doc: Option<LexiconDoc> = None; 224 225 for schema_ref in refs { 226 let doc = (schema_ref.provider)(); 227 228 // Determine if this is the primary def or a fragment 229 if schema_ref.nsid.contains('#') { 230 // Fragment - extract def name and add to defs 231 let fragment_name = schema_ref.nsid.split('#').nth(1).unwrap(); 232 233 // Merge defs from fragment doc 234 for (def_name, def) in doc.defs { 235 // Use fragment name if def is "main", otherwise use as-is 236 let final_name = if def_name == "main" { 237 fragment_name.to_string() 238 } else { 239 def_name.to_string() 240 }; 241 all_defs.insert(final_name, def); 242 } 243 } else { 244 // Primary type - use as base doc 245 primary_doc = Some(doc); 246 } 247 } 248 249 // Build final doc 250 let mut final_doc = primary_doc.unwrap_or_else(|| { 251 // No primary doc - create one 252 use jacquard_lexicon::lexicon::Lexicon; 253 LexiconDoc { 254 lexicon: Lexicon::Lexicon1, 255 id: base_nsid.into(), 256 revision: None, 257 description: None, 258 defs: BTreeMap::new(), 259 } 260 }); 261 262 // Merge in all defs (convert String keys to SmolStr) 263 for (k, v) in all_defs { 264 final_doc.defs.insert(k.into(), v); 265 } 266 267 // Validate if requested 268 if self.options.validate { 269 self.validate_schema(&final_doc)?; 270 } 271 272 // Serialize to JSON with "main" def first 273 let json = self.serialize_with_main_first(&final_doc)?; 274 275 // Write to file 276 let filename = base_nsid.replace('.', "_") + ".json"; 277 let path = self.options.output_dir.join(&filename); 278 279 fs::write(&path, json).into_diagnostic()?; 280 281 if self.options.verbose { 282 println!(" Wrote {} ({} defs)", filename, final_doc.defs.len()); 283 } 284 285 Ok(()) 286 } 287 288 /// Validate a schema document 289 fn validate_schema(&self, doc: &LexiconDoc) -> Result<()> { 290 // Must have at least one def 291 if doc.defs.is_empty() { 292 return Err(miette::miette!("lexicon {} has no defs", doc.id)); 293 } 294 295 // Warn if no "main" def and doesn't follow .defs convention 296 if !doc.defs.contains_key("main") { 297 let id_str = doc.id.as_ref(); 298 if !id_str.ends_with(".defs") { 299 eprintln!( 300 "⚠️ Warning: lexicon {} has no 'main' def - consider naming it {}.defs", 301 id_str, id_str 302 ); 303 if self.options.verbose { 304 eprintln!( 305 " Lexicons without a primary type should use the .defs suffix (e.g., app.bsky.actor.defs)" 306 ); 307 } 308 } 309 } 310 311 // Validate NSID format 312 if !is_valid_nsid(&doc.id) { 313 return Err(miette::miette!("invalid NSID format: {}", doc.id)); 314 } 315 316 Ok(()) 317 } 318 319 /// Watch mode - regenerate on file changes 320 pub fn watch(&self) -> Result<()> { 321 println!("Watch mode not yet implemented"); 322 println!("Run with --help to see available options"); 323 Ok(()) 324 } 325 326 /// Serialize a lexicon doc with "main" def first 327 fn serialize_with_main_first(&self, doc: &LexiconDoc) -> Result<String> { 328 use serde_json::{Map, Value, json}; 329 330 // Build defs map with main first 331 let mut defs_map = Map::new(); 332 333 // Insert main first if it exists 334 if let Some(main_def) = doc.defs.get("main") { 335 let main_value = serde_json::to_value(main_def).into_diagnostic()?; 336 defs_map.insert("main".to_string(), main_value); 337 } 338 339 // Insert all other defs in sorted order 340 for (name, def) in &doc.defs { 341 if name != "main" { 342 let def_value = serde_json::to_value(def).into_diagnostic()?; 343 defs_map.insert(name.to_string(), def_value); 344 } 345 } 346 347 // Build final JSON object 348 let mut obj = Map::new(); 349 obj.insert("lexicon".to_string(), json!(1)); 350 obj.insert("id".to_string(), json!(doc.id.as_ref())); 351 352 if let Some(rev) = &doc.revision { 353 obj.insert("revision".to_string(), json!(rev)); 354 } 355 356 if let Some(desc) = &doc.description { 357 obj.insert("description".to_string(), json!(desc)); 358 } 359 360 obj.insert("defs".to_string(), Value::Object(defs_map)); 361 362 // Serialize with or without pretty printing 363 if self.options.pretty { 364 serde_json::to_string_pretty(&Value::Object(obj)).into_diagnostic() 365 } else { 366 serde_json::to_string(&Value::Object(obj)).into_diagnostic() 367 } 368 } 369} 370 371/// Validate NSID format: domain.name.record 372fn is_valid_nsid(nsid: &str) -> bool { 373 let parts: Vec<&str> = nsid.split('.').collect(); 374 375 // Must have at least 3 parts 376 if parts.len() < 3 { 377 return false; 378 } 379 380 // Each part must be valid 381 for part in parts { 382 if part.is_empty() { 383 return false; 384 } 385 386 // Must be alphanumeric, hyphens, or underscores 387 if !part 388 .chars() 389 .all(|c| c.is_alphanumeric() || c == '-' || c == '_') 390 { 391 return false; 392 } 393 } 394 395 true 396} 397 398#[cfg(test)] 399mod tests { 400 use super::*; 401 402 #[test] 403 fn test_is_valid_nsid() { 404 assert!(is_valid_nsid("com.example.test")); 405 assert!(is_valid_nsid("app.bsky.feed.post")); 406 assert!(is_valid_nsid("com.example.with_underscore")); 407 assert!(is_valid_nsid("com.example.with-hyphen")); 408 409 assert!(!is_valid_nsid("com.example")); // Too short 410 assert!(!is_valid_nsid("com")); // Too short 411 assert!(!is_valid_nsid("com.example.invalid!")); // Invalid char 412 assert!(!is_valid_nsid("com..example")); // Empty segment 413 } 414 415 #[test] 416 fn test_group_by_base_nsid() { 417 let refs = vec![ 418 LexiconSchemaRef { 419 nsid: "com.example.test", 420 def_name: "main", 421 provider: || unimplemented!("test provider"), 422 }, 423 LexiconSchemaRef { 424 nsid: "com.example.test#fragment", 425 def_name: "fragment", 426 provider: || unimplemented!("test provider"), 427 }, 428 LexiconSchemaRef { 429 nsid: "com.example.other", 430 def_name: "main", 431 provider: || unimplemented!("test provider"), 432 }, 433 ]; 434 435 let ref_ptrs: Vec<&LexiconSchemaRef> = refs.iter().collect(); 436 437 let extractor = SchemaExtractor::new(ExtractOptions { 438 output_dir: PathBuf::from("test"), 439 verbose: false, 440 filter: None, 441 validate: false, 442 pretty: true, 443 }); 444 445 let grouped = extractor.group_by_base_nsid(&ref_ptrs).unwrap(); 446 447 assert_eq!(grouped.len(), 2); 448 assert!(grouped.contains_key("com.example.test")); 449 assert!(grouped.contains_key("com.example.other")); 450 assert_eq!(grouped["com.example.test"].len(), 2); 451 assert_eq!(grouped["com.example.other"].len(), 1); 452 } 453}