//! A better Rust ATProto crate
//!
1//! # Schema Extraction
2//!
3//! Extract AT Protocol lexicon schemas from Rust types via `inventory` discovery.
4//!
5//! ## Usage Pattern
6//!
7//! This module provides schema extraction for types implementing `LexiconSchema`.
8//! The extraction binary discovers schemas at **link time** via `inventory`, so you need
9//! to create a binary in your workspace that links your schema types.
10//!
11//! ### Simple Usage
12//!
13//! ```rust,ignore
14//! // bin/extract_schemas.rs
15//! use jacquard_lexgen::schema_extraction;
16//!
17//! // Import your types so they get linked
18//! use my_app::models::*;
19//!
//! fn main() -> miette::Result<()> {
//!     schema_extraction::run(
//!         "lexicons", // output directory
//!         true,       // verbose
//!     )
//! }
//! ```
27//!
28//! ### Advanced Usage
29//!
30//! ```rust,ignore
31//! use jacquard_lexgen::schema_extraction::{ExtractOptions, SchemaExtractor};
32//! use my_app::models::*; // Your schema types
33//!
//! fn main() -> miette::Result<()> {
//!     let options = ExtractOptions {
//!         output_dir: "lexicons".into(),
//!         verbose: true,
//!         filter: Some("app.bsky".into()), // Only extract app.bsky.* schemas
//!         validate: true,
//!         pretty: true,
//!     };
//!
//!     SchemaExtractor::new(options).extract_all()
//! }
//! ```
46//!
47//! ### Integration with Build Tools
48//!
49//! **Just:**
50//! ```justfile
51//! # Generate lexicon schemas from Rust types
52//! extract-schemas:
//!     cargo run --bin extract-schemas
54//! ```
55//!
56//! **Cargo xtask:**
57//! ```rust,ignore
58//! // xtask/src/main.rs
//! match args {
//!     "codegen" => {
//!         run_command("cargo", &["run", "--bin", "extract-schemas"])?;
//!     }
//! }
64//! ```
65//!
66//! **Pre-commit hook:**
67//! ```bash
68//! #!/bin/bash
69//! # Regenerate schemas when Rust files change
//! if git diff --cached --name-only | grep -E '\.rs$'; then
//!     cargo run --bin extract-schemas
//!     git add lexicons/*.json
//! fi
74//! ```
75
76use jacquard_lexicon::lexicon::LexiconDoc;
77use jacquard_lexicon::schema::LexiconSchemaRef;
78use miette::{IntoDiagnostic, Result};
79use std::collections::BTreeMap;
80use std::fs;
81use std::path::{Path, PathBuf};
82
83/// Options for schema extraction
/// Options for schema extraction.
///
/// Use [`ExtractOptions::default()`] for sensible defaults and struct-update
/// syntax (`..Default::default()`) to override individual fields.
#[derive(Debug, Clone)]
pub struct ExtractOptions {
    /// Output directory for generated schema files (created if missing)
    pub output_dir: PathBuf,
    /// Enable verbose progress output
    pub verbose: bool,
    /// Only extract lexicons whose NSID starts with this prefix (e.g., "app.bsky")
    pub filter: Option<String>,
    /// Validate schemas before writing
    pub validate: bool,
    /// Pretty-print JSON output
    pub pretty: bool,
}
96
97impl Default for ExtractOptions {
98 fn default() -> Self {
99 Self {
100 output_dir: PathBuf::from("lexicons"),
101 verbose: false,
102 filter: None,
103 validate: true,
104 pretty: true,
105 }
106 }
107}
108
109/// Run schema extraction with simple defaults
110///
111/// Convenience function for the common case. For more control, use [`SchemaExtractor`].
112///
113/// # Arguments
114///
115/// * `output_dir` - Directory to write schema files (will be created if needed)
116/// * `verbose` - Print progress information
117///
118/// # Example
119///
120/// ```rust,ignore
121/// use jacquard_lexgen::schema_extraction;
122/// use my_app::models::*; // Your types with #[derive(LexiconSchema)]
123///
124/// fn main() -> miette::Result<()> {
///     schema_extraction::run("lexicons", true)
126/// }
127/// ```
128pub fn run(output_dir: impl AsRef<Path>, verbose: bool) -> Result<()> {
129 let options = ExtractOptions {
130 output_dir: output_dir.as_ref().to_path_buf(),
131 verbose,
132 ..Default::default()
133 };
134
135 SchemaExtractor::new(options).extract_all()
136}
137
/// Extracts lexicon schemas discovered via `inventory` and writes them to disk.
///
/// Construct with [`SchemaExtractor::new`], then call
/// [`extract_all`](Self::extract_all).
pub struct SchemaExtractor {
    // Extraction configuration: output location, filter, validation, formatting.
    options: ExtractOptions,
}
141
142impl SchemaExtractor {
143 pub fn new(options: ExtractOptions) -> Self {
144 Self { options }
145 }
146
147 /// Extract all schemas from inventory
148 pub fn extract_all(&self) -> Result<()> {
149 if self.options.verbose {
150 println!("Discovering schemas via inventory...");
151 }
152
153 // Collect all schema refs from inventory
154 let refs: Vec<&LexiconSchemaRef> = inventory::iter::<LexiconSchemaRef>().collect();
155
156 if self.options.verbose {
157 println!("Found {} schema types", refs.len());
158 }
159
160 // Group by base NSID
161 let grouped = self.group_by_base_nsid(&refs)?;
162
163 // Create output directory
164 fs::create_dir_all(&self.options.output_dir).into_diagnostic()?;
165
166 // Process each group
167 let mut written = 0;
168 for (base_nsid, group_refs) in grouped {
169 // Apply filter if specified
170 if let Some(filter) = &self.options.filter {
171 if !base_nsid.starts_with(filter) {
172 continue;
173 }
174 }
175
176 if self.options.verbose {
177 println!("Processing {} ({} types)", base_nsid, group_refs.len());
178 }
179
180 self.write_lexicon(&base_nsid, &group_refs)?;
181 written += 1;
182 }
183
184 println!(
185 "✓ Wrote {} lexicon files to {}",
186 written,
187 self.options.output_dir.display()
188 );
189
190 Ok(())
191 }
192
193 /// Group refs by base NSID (strip fragment suffix)
194 fn group_by_base_nsid<'a>(
195 &self,
196 refs: &[&'a LexiconSchemaRef],
197 ) -> Result<BTreeMap<String, Vec<&'a LexiconSchemaRef>>> {
198 let mut groups: BTreeMap<String, Vec<&'a LexiconSchemaRef>> = BTreeMap::new();
199
200 for schema_ref in refs {
201 let nsid = schema_ref.nsid;
202
203 // Split on # to get base NSID
204 let base_nsid = if let Some(pos) = nsid.find('#') {
205 &nsid[..pos]
206 } else {
207 nsid
208 };
209
210 groups
211 .entry(base_nsid.to_string())
212 .or_default()
213 .push(schema_ref);
214 }
215
216 Ok(groups)
217 }
218
219 /// Write a single lexicon file
220 fn write_lexicon(&self, base_nsid: &str, refs: &[&LexiconSchemaRef]) -> Result<()> {
221 // Generate all schemas in this group
222 let mut all_defs = BTreeMap::new();
223 let mut primary_doc: Option<LexiconDoc> = None;
224
225 for schema_ref in refs {
226 let doc = (schema_ref.provider)();
227
228 // Determine if this is the primary def or a fragment
229 if schema_ref.nsid.contains('#') {
230 // Fragment - extract def name and add to defs
231 let fragment_name = schema_ref.nsid.split('#').nth(1).unwrap();
232
233 // Merge defs from fragment doc
234 for (def_name, def) in doc.defs {
235 // Use fragment name if def is "main", otherwise use as-is
236 let final_name = if def_name == "main" {
237 fragment_name.to_string()
238 } else {
239 def_name.to_string()
240 };
241 all_defs.insert(final_name, def);
242 }
243 } else {
244 // Primary type - use as base doc
245 primary_doc = Some(doc);
246 }
247 }
248
249 // Build final doc
250 let mut final_doc = primary_doc.unwrap_or_else(|| {
251 // No primary doc - create one
252 use jacquard_lexicon::lexicon::Lexicon;
253 LexiconDoc {
254 lexicon: Lexicon::Lexicon1,
255 id: base_nsid.into(),
256 revision: None,
257 description: None,
258 defs: BTreeMap::new(),
259 }
260 });
261
262 // Merge in all defs (convert String keys to SmolStr)
263 for (k, v) in all_defs {
264 final_doc.defs.insert(k.into(), v);
265 }
266
267 // Validate if requested
268 if self.options.validate {
269 self.validate_schema(&final_doc)?;
270 }
271
272 // Serialize to JSON with "main" def first
273 let json = self.serialize_with_main_first(&final_doc)?;
274
275 // Write to file
276 let filename = base_nsid.replace('.', "_") + ".json";
277 let path = self.options.output_dir.join(&filename);
278
279 fs::write(&path, json).into_diagnostic()?;
280
281 if self.options.verbose {
282 println!(" Wrote {} ({} defs)", filename, final_doc.defs.len());
283 }
284
285 Ok(())
286 }
287
288 /// Validate a schema document
289 fn validate_schema(&self, doc: &LexiconDoc) -> Result<()> {
290 // Must have at least one def
291 if doc.defs.is_empty() {
292 return Err(miette::miette!("lexicon {} has no defs", doc.id));
293 }
294
295 // Warn if no "main" def and doesn't follow .defs convention
296 if !doc.defs.contains_key("main") {
297 let id_str = doc.id.as_ref();
298 if !id_str.ends_with(".defs") {
299 eprintln!(
300 "⚠️ Warning: lexicon {} has no 'main' def - consider naming it {}.defs",
301 id_str, id_str
302 );
303 if self.options.verbose {
304 eprintln!(
305 " Lexicons without a primary type should use the .defs suffix (e.g., app.bsky.actor.defs)"
306 );
307 }
308 }
309 }
310
311 // Validate NSID format
312 if !is_valid_nsid(&doc.id) {
313 return Err(miette::miette!("invalid NSID format: {}", doc.id));
314 }
315
316 Ok(())
317 }
318
319 /// Watch mode - regenerate on file changes
320 pub fn watch(&self) -> Result<()> {
321 println!("Watch mode not yet implemented");
322 println!("Run with --help to see available options");
323 Ok(())
324 }
325
326 /// Serialize a lexicon doc with "main" def first
327 fn serialize_with_main_first(&self, doc: &LexiconDoc) -> Result<String> {
328 use serde_json::{Map, Value, json};
329
330 // Build defs map with main first
331 let mut defs_map = Map::new();
332
333 // Insert main first if it exists
334 if let Some(main_def) = doc.defs.get("main") {
335 let main_value = serde_json::to_value(main_def).into_diagnostic()?;
336 defs_map.insert("main".to_string(), main_value);
337 }
338
339 // Insert all other defs in sorted order
340 for (name, def) in &doc.defs {
341 if name != "main" {
342 let def_value = serde_json::to_value(def).into_diagnostic()?;
343 defs_map.insert(name.to_string(), def_value);
344 }
345 }
346
347 // Build final JSON object
348 let mut obj = Map::new();
349 obj.insert("lexicon".to_string(), json!(1));
350 obj.insert("id".to_string(), json!(doc.id.as_ref()));
351
352 if let Some(rev) = &doc.revision {
353 obj.insert("revision".to_string(), json!(rev));
354 }
355
356 if let Some(desc) = &doc.description {
357 obj.insert("description".to_string(), json!(desc));
358 }
359
360 obj.insert("defs".to_string(), Value::Object(defs_map));
361
362 // Serialize with or without pretty printing
363 if self.options.pretty {
364 serde_json::to_string_pretty(&Value::Object(obj)).into_diagnostic()
365 } else {
366 serde_json::to_string(&Value::Object(obj)).into_diagnostic()
367 }
368 }
369}
370
371/// Validate NSID format: domain.name.record
/// Validate NSID structure: at least three dot-separated segments, each
/// non-empty and built only from alphanumerics, hyphens, or underscores.
fn is_valid_nsid(nsid: &str) -> bool {
    let mut segment_count = 0usize;

    for segment in nsid.split('.') {
        segment_count += 1;

        // Reject empty segments ("com..example") and illegal characters.
        let segment_ok = !segment.is_empty()
            && segment
                .chars()
                .all(|c| c.is_alphanumeric() || c == '-' || c == '_');
        if !segment_ok {
            return false;
        }
    }

    // domain.name.record requires at least three segments.
    segment_count >= 3
}
397
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_is_valid_nsid() {
        // Valid: three or more segments of alphanumerics / '-' / '_'.
        for good in [
            "com.example.test",
            "app.bsky.feed.post",
            "com.example.with_underscore",
            "com.example.with-hyphen",
        ] {
            assert!(is_valid_nsid(good), "expected {good} to be valid");
        }

        assert!(!is_valid_nsid("com.example")); // Too short
        assert!(!is_valid_nsid("com")); // Too short
        assert!(!is_valid_nsid("com.example.invalid!")); // Invalid char
        assert!(!is_valid_nsid("com..example")); // Empty segment
    }

    #[test]
    fn test_group_by_base_nsid() {
        let schema_refs = [
            LexiconSchemaRef {
                nsid: "com.example.test",
                def_name: "main",
                provider: || unimplemented!("test provider"),
            },
            LexiconSchemaRef {
                nsid: "com.example.test#fragment",
                def_name: "fragment",
                provider: || unimplemented!("test provider"),
            },
            LexiconSchemaRef {
                nsid: "com.example.other",
                def_name: "main",
                provider: || unimplemented!("test provider"),
            },
        ];
        let borrowed: Vec<&LexiconSchemaRef> = schema_refs.iter().collect();

        let extractor = SchemaExtractor::new(ExtractOptions {
            output_dir: PathBuf::from("test"),
            verbose: false,
            filter: None,
            validate: false,
            pretty: true,
        });

        let grouped = extractor.group_by_base_nsid(&borrowed).unwrap();

        // The fragment collapses onto its base NSID, so two groups remain.
        assert_eq!(grouped.len(), 2);
        assert!(grouped.contains_key("com.example.test"));
        assert!(grouped.contains_key("com.example.other"));
        assert_eq!(grouped["com.example.test"].len(), 2);
        assert_eq!(grouped["com.example.other"].len(), 1);
    }
}
453}