//! A better Rust ATProto crate
//!
1//! # Schema Extraction
2//!
3//! Extract AT Protocol lexicon schemas from Rust types via `inventory` discovery.
4//!
5//! ## Usage Pattern
6//!
7//! This module provides schema extraction for types implementing `LexiconSchema`.
8//! The extraction binary discovers schemas at **link time** via `inventory`, so you need
9//! to create a binary in your workspace that links your schema types.
10//!
11//! ### Simple Usage
12//!
13//! ```rust,ignore
14//! // bin/extract_schemas.rs
15//! use jacquard_lexgen::schema_extraction;
16//!
17//! // Import your types so they get linked
18//! use my_app::models::*;
19//!
//! fn main() -> miette::Result<()> {
//!     schema_extraction::run(
//!         "lexicons", // output directory
//!         true,       // verbose
//!     )
//! }
//! ```
27//!
28//! ### Advanced Usage
29//!
30//! ```rust,ignore
31//! use jacquard_lexgen::schema_extraction::{ExtractOptions, SchemaExtractor};
32//! use my_app::models::*; // Your schema types
33//!
//! fn main() -> miette::Result<()> {
//!     let options = ExtractOptions {
//!         output_dir: "lexicons".into(),
//!         verbose: true,
//!         filter: Some("app.bsky".into()), // Only extract app.bsky.* schemas
//!         validate: true,
//!         pretty: true,
//!     };
//!
//!     SchemaExtractor::new(options).extract_all()
//! }
//! ```
46//!
47//! ### Integration with Build Tools
48//!
49//! **Just:**
50//! ```justfile
51//! # Generate lexicon schemas from Rust types
52//! extract-schemas:
//!     cargo run --bin extract-schemas
54//! ```
55//!
56//! **Cargo xtask:**
57//! ```rust,ignore
58//! // xtask/src/main.rs
//! match args {
//!     "codegen" => {
//!         run_command("cargo", &["run", "--bin", "extract-schemas"])?;
//!     }
//! }
64//! ```
65//!
66//! **Pre-commit hook:**
67//! ```bash
68//! #!/bin/bash
69//! # Regenerate schemas when Rust files change
//! if git diff --cached --name-only | grep -E '\.rs$'; then
//!     cargo run --bin extract-schemas
//!     git add lexicons/*.json
//! fi
74//! ```
75
76use jacquard_lexicon::lexicon::LexiconDoc;
77use jacquard_lexicon::schema::LexiconSchemaRef;
78use miette::{IntoDiagnostic, Result};
79use std::collections::BTreeMap;
80use std::fs;
81use std::path::{Path, PathBuf};
82
83/// Options for schema extraction
/// Options for schema extraction.
///
/// Use [`ExtractOptions::default()`] for sensible defaults and struct-update
/// syntax (`..Default::default()`) to override individual fields.
#[derive(Debug, Clone)]
pub struct ExtractOptions {
    /// Output directory for generated schema files (created if missing)
    pub output_dir: PathBuf,
    /// Enable verbose progress output
    pub verbose: bool,
    /// Only extract lexicons whose NSID starts with this prefix (e.g., "app.bsky")
    pub filter: Option<String>,
    /// Validate schemas before writing
    pub validate: bool,
    /// Pretty-print JSON output
    pub pretty: bool,
}
96
97impl Default for ExtractOptions {
98 fn default() -> Self {
99 Self {
100 output_dir: PathBuf::from("lexicons"),
101 verbose: false,
102 filter: None,
103 validate: true,
104 pretty: true,
105 }
106 }
107}
108
109/// Run schema extraction with simple defaults
110///
111/// Convenience function for the common case. For more control, use [`SchemaExtractor`].
112///
113/// # Arguments
114///
115/// * `output_dir` - Directory to write schema files (will be created if needed)
116/// * `verbose` - Print progress information
117///
118/// # Example
119///
120/// ```rust,ignore
121/// use jacquard_lexgen::schema_extraction;
122/// use my_app::models::*; // Your types with #[derive(LexiconSchema)]
123///
124/// fn main() -> miette::Result<()> {
///     schema_extraction::run("lexicons", true)
126/// }
127/// ```
128pub fn run(output_dir: impl AsRef<Path>, verbose: bool) -> Result<()> {
129 let options = ExtractOptions {
130 output_dir: output_dir.as_ref().to_path_buf(),
131 verbose,
132 ..Default::default()
133 };
134
135 SchemaExtractor::new(options).extract_all()
136}
137
/// Extracts lexicon schemas discovered via `inventory` and writes them to disk.
///
/// Construct with [`SchemaExtractor::new`], then call
/// [`extract_all`](Self::extract_all).
pub struct SchemaExtractor {
    // Extraction configuration: output location, filter, validation, formatting.
    options: ExtractOptions,
}
141
142impl SchemaExtractor {
143 pub fn new(options: ExtractOptions) -> Self {
144 Self { options }
145 }
146
147 /// Extract all schemas from inventory
148 pub fn extract_all(&self) -> Result<()> {
149 if self.options.verbose {
150 println!("Discovering schemas via inventory...");
151 }
152
153 // Collect all schema refs from inventory
154 let refs: Vec<&LexiconSchemaRef> = inventory::iter::<LexiconSchemaRef>().collect();
155
156 if self.options.verbose {
157 println!("Found {} schema types", refs.len());
158 }
159
160 // Group by base NSID
161 let grouped = self.group_by_base_nsid(&refs)?;
162
163 // Create output directory
164 fs::create_dir_all(&self.options.output_dir).into_diagnostic()?;
165
166 // Process each group
167 let mut written = 0;
168 for (base_nsid, group_refs) in grouped {
169 // Apply filter if specified
170 if let Some(filter) = &self.options.filter {
171 if !base_nsid.starts_with(filter) {
172 continue;
173 }
174 }
175
176 if self.options.verbose {
177 println!("Processing {} ({} types)", base_nsid, group_refs.len());
178 }
179
180 self.write_lexicon(&base_nsid, &group_refs)?;
181 written += 1;
182 }
183
184 println!(
185 "✓ Wrote {} lexicon files to {}",
186 written,
187 self.options.output_dir.display()
188 );
189
190 Ok(())
191 }
192
193 /// Group refs by base NSID (strip fragment suffix)
194 fn group_by_base_nsid<'a>(
195 &self,
196 refs: &[&'a LexiconSchemaRef],
197 ) -> Result<BTreeMap<String, Vec<&'a LexiconSchemaRef>>> {
198 let mut groups: BTreeMap<String, Vec<&'a LexiconSchemaRef>> = BTreeMap::new();
199
200 for schema_ref in refs {
201 let nsid = schema_ref.nsid;
202
203 // Split on # to get base NSID
204 let base_nsid = if let Some(pos) = nsid.find('#') {
205 &nsid[..pos]
206 } else {
207 nsid
208 };
209
210 groups
211 .entry(base_nsid.to_string())
212 .or_default()
213 .push(schema_ref);
214 }
215
216 Ok(groups)
217 }
218
219 /// Write a single lexicon file
220 fn write_lexicon(&self, base_nsid: &str, refs: &[&LexiconSchemaRef]) -> Result<()> {
221 // Generate all schemas in this group
222 let mut all_defs = BTreeMap::new();
223 let mut primary_doc: Option<LexiconDoc> = None;
224
225 for schema_ref in refs {
226 let doc = (schema_ref.provider)();
227
228 // Determine if this is the primary def or a fragment
229 if schema_ref.nsid.contains('#') {
230 // Fragment - extract def name and add to defs
231 let fragment_name = schema_ref.nsid.split('#').nth(1).unwrap();
232
233 // Merge defs from fragment doc
234 for (def_name, def) in doc.defs {
235 // Use fragment name if def is "main", otherwise use as-is
236 let final_name = if def_name == "main" {
237 fragment_name.to_string()
238 } else {
239 def_name.to_string()
240 };
241 all_defs.insert(final_name, def);
242 }
243 } else {
244 // Primary type - use as base doc
245 primary_doc = Some(doc);
246 }
247 }
248
249 // Build final doc
250 let mut final_doc = primary_doc.unwrap_or_else(|| {
251 // No primary doc - create one
252 use jacquard_lexicon::lexicon::Lexicon;
253 LexiconDoc {
254 lexicon: Lexicon::Lexicon1,
255 id: base_nsid.into(),
256 revision: None,
257 description: None,
258 defs: BTreeMap::new(),
259 }
260 });
261
262 // Merge in all defs (convert String keys to SmolStr)
263 for (k, v) in all_defs {
264 final_doc.defs.insert(k.into(), v);
265 }
266
267 // Validate if requested
268 if self.options.validate {
269 self.validate_schema(&final_doc)?;
270 }
271
272 // Serialize to JSON with "main" def first
273 let json = self.serialize_with_main_first(&final_doc)?;
274
275 // Write to file
276 let filename = base_nsid.replace('.', "_") + ".json";
277 let path = self.options.output_dir.join(&filename);
278
279 fs::write(&path, json).into_diagnostic()?;
280
281 if self.options.verbose {
282 println!(" Wrote {} ({} defs)", filename, final_doc.defs.len());
283 }
284
285 Ok(())
286 }
287
288 /// Validate a schema document
289 fn validate_schema(&self, doc: &LexiconDoc) -> Result<()> {
290 // Must have at least one def
291 if doc.defs.is_empty() {
292 return Err(miette::miette!("lexicon {} has no defs", doc.id));
293 }
294
295 // Warn if no "main" def and doesn't follow .defs convention
296 if !doc.defs.contains_key("main") {
297 let id_str = doc.id.as_ref();
298 if !id_str.ends_with(".defs") {
299 eprintln!(
300 "⚠️ Warning: lexicon {} has no 'main' def - consider naming it {}.defs",
301 id_str, id_str
302 );
303 if self.options.verbose {
304 eprintln!(
305 " Lexicons without a primary type should use the .defs suffix (e.g., app.bsky.actor.defs)"
306 );
307 }
308 }
309 }
310
311 // Validate NSID format
312 if !is_valid_nsid(&doc.id) {
313 return Err(miette::miette!("invalid NSID format: {}", doc.id));
314 }
315
316 Ok(())
317 }
318
319 /// Watch mode - regenerate on file changes
320 pub fn watch(&self) -> Result<()> {
321 println!("Watch mode not yet implemented");
322 println!("Run with --help to see available options");
323 Ok(())
324 }
325
326 /// Serialize a lexicon doc with "main" def first
327 fn serialize_with_main_first(&self, doc: &LexiconDoc) -> Result<String> {
328 use serde_json::{Map, Value, json};
329
330 // Build defs map with main first
331 let mut defs_map = Map::new();
332
333 // Insert main first if it exists
334 if let Some(main_def) = doc.defs.get("main") {
335 let main_value = serde_json::to_value(main_def).into_diagnostic()?;
336 defs_map.insert("main".to_string(), main_value);
337 }
338
339 // Insert all other defs in sorted order
340 for (name, def) in &doc.defs {
341 if name != "main" {
342 let def_value = serde_json::to_value(def).into_diagnostic()?;
343 defs_map.insert(name.to_string(), def_value);
344 }
345 }
346
347 // Build final JSON object
348 let mut obj = Map::new();
349 obj.insert("lexicon".to_string(), json!(1));
350 obj.insert("id".to_string(), json!(doc.id.as_ref()));
351
352 if let Some(rev) = &doc.revision {
353 obj.insert("revision".to_string(), json!(rev));
354 }
355
356 if let Some(desc) = &doc.description {
357 obj.insert("description".to_string(), json!(desc));
358 }
359
360 obj.insert("defs".to_string(), Value::Object(defs_map));
361
362 // Serialize with or without pretty printing
363 if self.options.pretty {
364 serde_json::to_string_pretty(&Value::Object(obj)).into_diagnostic()
365 } else {
366 serde_json::to_string(&Value::Object(obj)).into_diagnostic()
367 }
368 }
369}
370
371/// Validate NSID format: domain.name.record
/// Validate NSID structure: at least three dot-separated segments, each
/// non-empty and built only from alphanumerics, hyphens, or underscores.
fn is_valid_nsid(nsid: &str) -> bool {
    let mut segment_count = 0usize;

    for segment in nsid.split('.') {
        segment_count += 1;

        // Reject empty segments ("com..example") and illegal characters.
        let segment_ok = !segment.is_empty()
            && segment
                .chars()
                .all(|c| c.is_alphanumeric() || c == '-' || c == '_');
        if !segment_ok {
            return false;
        }
    }

    // domain.name.record requires at least three segments.
    segment_count >= 3
}
397
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_is_valid_nsid() {
        // Valid: three or more segments of alphanumerics / '-' / '_'.
        for good in [
            "com.example.test",
            "app.bsky.feed.post",
            "com.example.with_underscore",
            "com.example.with-hyphen",
        ] {
            assert!(is_valid_nsid(good), "expected {good} to be valid");
        }

        assert!(!is_valid_nsid("com.example")); // Too short
        assert!(!is_valid_nsid("com")); // Too short
        assert!(!is_valid_nsid("com.example.invalid!")); // Invalid char
        assert!(!is_valid_nsid("com..example")); // Empty segment
    }

    #[test]
    fn test_group_by_base_nsid() {
        let schema_refs = [
            LexiconSchemaRef {
                nsid: "com.example.test",
                def_name: "main",
                provider: || unimplemented!("test provider"),
            },
            LexiconSchemaRef {
                nsid: "com.example.test#fragment",
                def_name: "fragment",
                provider: || unimplemented!("test provider"),
            },
            LexiconSchemaRef {
                nsid: "com.example.other",
                def_name: "main",
                provider: || unimplemented!("test provider"),
            },
        ];
        let borrowed: Vec<&LexiconSchemaRef> = schema_refs.iter().collect();

        let extractor = SchemaExtractor::new(ExtractOptions {
            output_dir: PathBuf::from("test"),
            verbose: false,
            filter: None,
            validate: false,
            pretty: true,
        });

        let grouped = extractor.group_by_base_nsid(&borrowed).unwrap();

        // The fragment collapses onto its base NSID, so two groups remain.
        assert_eq!(grouped.len(), 2);
        assert!(grouped.contains_key("com.example.test"));
        assert!(grouped.contains_key("com.example.other"));
        assert_eq!(grouped["com.example.test"].len(), 2);
        assert_eq!(grouped["com.example.other"].len(), 1);
    }
}
453}