A better Rust ATProto crate

embed candidate detection

Orual 0cbf3d8e 62e76554

+336 -13
+3
crates/jacquard/src/client.rs
··· 160 160 pub handle: Handle<'static>, 161 161 } 162 162 163 + #[cfg(feature = "api")] 163 164 impl From<CreateSessionOutput<'_>> for AtpSession { 164 165 fn from(output: CreateSessionOutput<'_>) -> Self { 165 166 Self { ··· 171 172 } 172 173 } 173 174 175 + #[cfg(feature = "api")] 174 176 impl From<RefreshSessionOutput<'_>> for AtpSession { 175 177 fn from(output: RefreshSessionOutput<'_>) -> Self { 176 178 Self { ··· 891 893 } 892 894 } 893 895 896 + #[cfg(feature = "api")] 894 897 impl<T: AgentSession + IdentityResolver> AgentSessionExt for T {} 895 898 896 899 impl<A: AgentSession> HttpClient for Agent<A> {
+1
crates/jacquard/src/lib.rs
··· 242 242 /// Prelude with the extension traits you're likely to want and some other stuff 243 243 pub mod prelude { 244 244 pub use crate::client::AgentSession; 245 + #[cfg(feature = "api")] 245 246 pub use crate::client::AgentSessionExt; 246 247 pub use crate::client::BasicClient; 247 248 pub use crate::common::http_client::HttpClient;
+332 -13
crates/jacquard/src/richtext.rs
··· 3 3 //! Provides parsing and building of rich text with facets (mentions, links, tags) 4 4 //! and detection of embed candidates (record and external embeds). 5 5 6 + #[cfg(feature = "api_bluesky")] 7 + use crate::api::app_bsky::richtext::facet::Facet; 6 8 use crate::common::CowStr; 7 9 use jacquard_common::IntoStatic; 8 10 use jacquard_common::types::did::{DID_REGEX, Did}; ··· 32 34 LazyLock::new(|| Regex::new(r"\[([^\]]+)\]\(([^)]+)\)").unwrap()); 33 35 34 36 static TRAILING_PUNCT_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\p{P}+$").unwrap()); 37 + 38 + /// Default domains that support at-URI extraction from URLs 39 + /// (bsky.app URL patterns like /profile/{actor}/post/{rkey}) 40 + #[cfg(feature = "api_bluesky")] 41 + static DEFAULT_EMBED_DOMAINS: &[&str] = &["bsky.app", "deer.social"]; 35 42 36 43 /// Marker type indicating all facets are resolved (no handles pending DID resolution) 37 44 pub struct Resolved; ··· 119 126 } 120 127 121 128 /// Entry point for parsing text with automatic facet detection 129 + /// 130 + /// Uses default embed domains (bsky.app, deer.social) for at-URI extraction. 131 + /// For custom domains, use [`parse_with_domains`]. 122 132 pub fn parse(text: impl Into<String>) -> RichTextBuilder<Unresolved> { 133 + #[cfg(feature = "api_bluesky")] 134 + { 135 + parse_with_domains(text, DEFAULT_EMBED_DOMAINS) 136 + } 137 + #[cfg(not(feature = "api_bluesky"))] 138 + { 139 + parse_with_domains(text, &[]) 140 + } 141 + } 142 + 143 + /// Parse text with custom embed domains for at-URI extraction 144 + /// 145 + /// This allows specifying additional domains (beyond bsky.app and deer.social) 146 + /// that use the same URL patterns for records (e.g., /profile/{actor}/post/{rkey}). 147 + #[cfg(feature = "api_bluesky")] 148 + pub fn parse_with_domains( 149 + text: impl Into<String>, 150 + embed_domains: &[&str], 151 + ) -> RichTextBuilder<Unresolved> { 123 152 let text = text.into(); 124 153 let mut facet_candidates = Vec::new(); 154 + let mut embed_candidates = Vec::new(); 125 155 126 156 // Step 1: Detect and strip markdown links first 127 157 let (text_processed, markdown_facets) = detect_markdown_links(&text); 158 + 159 + // Check markdown links for embed candidates 160 + for facet in &markdown_facets { 161 + if let FacetCandidate::MarkdownLink { url, .. } = facet { 162 + if let Some(embed) = classify_embed(url, embed_domains) { 163 + embed_candidates.push(embed); 164 + } 165 + } 166 + } 167 + 128 168 facet_candidates.extend(markdown_facets); 129 169 130 170 // Step 2: Detect mentions ··· 133 173 134 174 // Step 3: Detect URLs 135 175 let url_facets = detect_urls(&text_processed); 176 + 177 + // Check URLs for embed candidates 178 + for facet in &url_facets { 179 + if let FacetCandidate::Link { range } = facet { 180 + let url = &text_processed[range.clone()]; 181 + if let Some(embed) = classify_embed(url, embed_domains) { 182 + embed_candidates.push(embed); 183 + } 184 + } 185 + } 186 + 136 187 facet_candidates.extend(url_facets); 137 188 138 189 // Step 4: Detect tags ··· 142 193 RichTextBuilder { 143 194 text: text_processed, 144 195 facet_candidates, 145 - #[cfg(feature = "api_bluesky")] 146 - embed_candidates: Vec::new(), 196 + embed_candidates, 197 + _state: PhantomData, 198 + } 199 + } 200 + 201 + /// Parse text without embed detection (no api_bluesky feature) 202 + #[cfg(not(feature = "api_bluesky"))] 203 + pub fn parse_with_domains( 204 + text: impl Into<String>, 205 + _embed_domains: &[&str], 206 + ) -> RichTextBuilder<Unresolved> { 207 + let text = text.into(); 208 + let mut facet_candidates = Vec::new(); 209 + 210 + // Step 1: Detect and strip markdown links first 211 + let (text_processed, markdown_facets) = detect_markdown_links(&text); 212 + facet_candidates.extend(markdown_facets); 213 + 214 + // Step 2: Detect mentions 215 + let mention_facets = detect_mentions(&text_processed); 216 + facet_candidates.extend(mention_facets); 217 + 218 + // Step 3: Detect URLs 219 + let url_facets = detect_urls(&text_processed); 220 + facet_candidates.extend(url_facets); 221 + 222 + // Step 4: Detect tags 223 + let tag_facets = detect_tags(&text_processed); 224 + facet_candidates.extend(tag_facets); 225 + 226 + RichTextBuilder { 227 + text: text_processed, 228 + facet_candidates, 147 229 _state: PhantomData, 148 230 } 149 231 } ··· 408 490 facets 409 491 } 410 492 493 + /// Classifies a URL or at-URI as an embed candidate 494 + #[cfg(feature = "api_bluesky")] 495 + fn classify_embed(url: &str, embed_domains: &[&str]) -> Option<EmbedCandidate<'static>> { 496 + use crate::types::aturi::AtUri; 497 + 498 + // Check if it's an at:// URI 499 + if url.starts_with("at://") { 500 + if let Ok(at_uri) = AtUri::new(url) { 501 + return Some(EmbedCandidate::Record { 502 + at_uri: at_uri.into_static(), 503 + strong_ref: None, 504 + }); 505 + } 506 + } 507 + 508 + // Check if it's an HTTP(S) URL 509 + if url.starts_with("http://") || url.starts_with("https://") { 510 + // Try to extract at-uri from configured domain URL patterns 511 + if let Some(at_uri) = extract_at_uri_from_url(url, embed_domains) { 512 + return Some(EmbedCandidate::Record { 513 + at_uri, 514 + strong_ref: None, 515 + }); 516 + } 517 + 518 + // Otherwise, it's an external embed 519 + return Some(EmbedCandidate::External { 520 + url: CowStr::from(url.to_string()), 521 + metadata: None, 522 + }); 523 + } 524 + 525 + None 526 + } 527 + 528 + /// Extracts an at-URI from a URL with bsky.app-style path patterns 529 + /// 530 + /// Supports these patterns: 531 + /// - https://{domain}/profile/{handle|did}/post/{rkey} → at://{actor}/app.bsky.feed.post/{rkey} 532 + /// - https://{domain}/profile/{handle|did}/lists/{rkey} → at://{actor}/app.bsky.graph.list/{rkey} 533 + /// - https://{domain}/profile/{handle|did}/feed/{rkey} → at://{actor}/app.bsky.feed.generator/{rkey} 534 + /// - https://{domain}/starter-pack/{handle|did}/{rkey} → at://{actor}/app.bsky.graph.starterpack/{rkey} 535 + /// - https://{domain}/profile/{handle|did}/{collection}/{rkey} → at://{actor}/{collection}/{rkey} (if collection looks like NSID) 536 + /// 537 + /// Only works for domains in the provided `embed_domains` list. 538 + #[cfg(feature = "api_bluesky")] 539 + fn extract_at_uri_from_url( 540 + url: &str, 541 + embed_domains: &[&str], 542 + ) -> Option<crate::types::aturi::AtUri<'static>> { 543 + use crate::types::aturi::AtUri; 544 + 545 + // Parse URL 546 + let url_parsed = url::Url::parse(url).ok()?; 547 + 548 + // Check if domain is in allowed list 549 + let domain = url_parsed.domain()?; 550 + if !embed_domains.contains(&domain) { 551 + return None; 552 + } 553 + 554 + let path = url_parsed.path(); 555 + let segments: Vec<&str> = path.split('/').filter(|s| !s.is_empty()).collect(); 556 + 557 + let at_uri_str = match segments.as_slice() { 558 + // Known shortcuts 559 + ["profile", actor, "post", rkey] => { 560 + format!("at://{}/app.bsky.feed.post/{}", actor, rkey) 561 + } 562 + ["profile", actor, "lists", rkey] => { 563 + format!("at://{}/app.bsky.graph.list/{}", actor, rkey) 564 + } 565 + ["profile", actor, "feed", rkey] => { 566 + format!("at://{}/app.bsky.feed.generator/{}", actor, rkey) 567 + } 568 + ["starter-pack", actor, rkey] => { 569 + format!("at://{}/app.bsky.graph.starterpack/{}", actor, rkey) 570 + } 571 + // Generic pattern: /profile/{actor}/{collection}/{rkey} 572 + // Accept if collection looks like it could be an NSID (contains dots) 573 + ["profile", actor, collection, rkey] if collection.contains('.') => { 574 + format!("at://{}/{}/{}", actor, collection, rkey) 575 + } 576 + _ => return None, 577 + }; 578 + 579 + AtUri::new(&at_uri_str).ok().map(|u| u.into_static()) 580 + } 581 + 411 582 use jacquard_common::types::string::AtStrError; 412 583 use thiserror::Error; 413 584 ··· 429 600 /// Invalid byte range 430 601 #[error("Invalid byte range {start}..{end} for text of length {text_len}")] 431 602 InvalidRange { 603 + /// Range start position 432 604 start: usize, 605 + /// Range end position 433 606 end: usize, 607 + /// Total text length 434 608 text_len: usize, 435 609 }, 436 610 ··· 446 620 #[cfg(feature = "api_bluesky")] 447 621 impl RichTextBuilder<Resolved> { 448 622 /// Build the richtext (sync - all facets must be resolved) 449 - pub fn build( 450 - self, 451 - ) -> Result< 452 - ( 453 - String, 454 - Option<Vec<crate::api::app_bsky::richtext::facet::Facet<'static>>>, 455 - ), 456 - RichTextError, 457 - > { 623 + pub fn build(self) -> Result<(String, Option<Vec<Facet<'static>>>), RichTextError> { 458 624 use std::collections::BTreeMap; 459 625 if self.facet_candidates.is_empty() { 460 626 return Ok((self.text, None)); ··· 475 641 let text_len = self.text.len(); 476 642 477 643 for candidate in candidates { 644 + use crate::api::app_bsky::richtext::facet::{ByteSlice, Facet}; 645 + 478 646 let (range, feature) = match candidate { 479 647 FacetCandidate::MarkdownLink { display_range, url } => { 480 648 // MarkdownLink stores URL directly, use display_range for index ··· 574 742 }); 575 743 } 576 744 577 - facets.push(crate::api::app_bsky::richtext::facet::Facet { 578 - index: crate::api::app_bsky::richtext::facet::ByteSlice { 745 + facets.push(Facet { 746 + index: ByteSlice { 747 + byte_start: range.start as i64, 748 + byte_end: range.end as i64, 749 + extra_data: BTreeMap::new(), 750 + }, 751 + features: vec![feature], 752 + extra_data: BTreeMap::new(), 753 + }); 754 + 755 + last_end = range.end; 756 + } 757 + 758 + Ok((self.text, Some(facets.into_static()))) 759 + } 760 + } 761 + 762 + #[cfg(feature = "api_bluesky")] 763 + impl RichTextBuilder<Unresolved> { 764 + /// Build richtext, resolving handles to DIDs using the provided resolver 765 + pub async fn build_async<R>( 766 + self, 767 + resolver: &R, 768 + ) -> Result<(String, Option<Vec<Facet<'static>>>), RichTextError> 769 + where 770 + R: jacquard_identity::resolver::IdentityResolver + Sync, 771 + { 772 + use crate::api::app_bsky::richtext::facet::{ 773 + ByteSlice, FacetFeaturesItem, Link, Mention, Tag, 774 + }; 775 + use std::collections::BTreeMap; 776 + 777 + if self.facet_candidates.is_empty() { 778 + return Ok((self.text, None)); 779 + } 780 + 781 + // Sort facets by start position 782 + let mut candidates = self.facet_candidates; 783 + candidates.sort_by_key(|fc| match fc { 784 + FacetCandidate::MarkdownLink { display_range, .. } => display_range.start, 785 + FacetCandidate::Mention { range, .. } => range.start, 786 + FacetCandidate::Link { range } => range.start, 787 + FacetCandidate::Tag { range } => range.start, 788 + }); 789 + 790 + // Resolve handles and convert to Facet types 791 + let mut facets = Vec::with_capacity(candidates.len()); 792 + let mut last_end = 0; 793 + let text_len = self.text.len(); 794 + 795 + for candidate in candidates { 796 + let (range, feature) = match candidate { 797 + FacetCandidate::MarkdownLink { display_range, url } => { 798 + // MarkdownLink stores URL directly, use display_range for index 799 + 800 + let feature = FacetFeaturesItem::Link(Box::new(Link { 801 + uri: crate::types::uri::Uri::new_owned(&url)?, 802 + extra_data: BTreeMap::new(), 803 + })); 804 + (display_range, feature) 805 + } 806 + FacetCandidate::Mention { range, did } => { 807 + let did = if let Some(did) = did { 808 + // Already resolved 809 + did 810 + } else { 811 + // Extract handle from text and resolve 812 + if range.end > text_len { 813 + return Err(RichTextError::InvalidRange { 814 + start: range.start, 815 + end: range.end, 816 + text_len, 817 + }); 818 + } 819 + 820 + let handle_str = self.text[range.clone()].trim_start_matches('@'); 821 + let handle = jacquard_common::types::handle::Handle::new(handle_str)?; 822 + 823 + resolver.resolve_handle(&handle).await? 824 + }; 825 + 826 + let feature = FacetFeaturesItem::Mention(Box::new(Mention { 827 + did, 828 + extra_data: BTreeMap::new(), 829 + })); 830 + (range, feature) 831 + } 832 + FacetCandidate::Link { range } => { 833 + // Extract URL from text[range] and normalize 834 + 835 + if range.end > text_len { 836 + return Err(RichTextError::InvalidRange { 837 + start: range.start, 838 + end: range.end, 839 + text_len, 840 + }); 841 + } 842 + 843 + let mut url = self.text[range.clone()].to_string(); 844 + 845 + // Prepend https:// if URL doesn't have a scheme 846 + if !url.starts_with("http://") && !url.starts_with("https://") { 847 + url = format!("https://{}", url); 848 + } 849 + 850 + let feature = FacetFeaturesItem::Link(Box::new(Link { 851 + uri: crate::types::uri::Uri::new_owned(&url)?, 852 + extra_data: BTreeMap::new(), 853 + })); 854 + (range, feature) 855 + } 856 + FacetCandidate::Tag { range } => { 857 + // Extract tag from text[range] (includes #), strip # and trailing punct 858 + 859 + use smol_str::ToSmolStr; 860 + if range.end > text_len { 861 + return Err(RichTextError::InvalidRange { 862 + start: range.start, 863 + end: range.end, 864 + text_len, 865 + }); 866 + } 867 + 868 + let tag_with_hash = &self.text[range.clone()]; 869 + // Strip # prefix (could be # or #) 870 + let tag = tag_with_hash 871 + .trim_start_matches('#') 872 + .trim_start_matches('#'); 873 + 874 + let feature = FacetFeaturesItem::Tag(Box::new(Tag { 875 + tag: CowStr::from(tag.to_smolstr()), 876 + extra_data: BTreeMap::new(), 877 + })); 878 + (range, feature) 879 + } 880 + }; 881 + 882 + // Check overlap 883 + if range.start < last_end { 884 + return Err(RichTextError::OverlappingFacets(range.start, range.end)); 885 + } 886 + 887 + // Validate range 888 + if range.end > text_len { 889 + return Err(RichTextError::InvalidRange { 890 + start: range.start, 891 + end: range.end, 892 + text_len, 893 + }); 894 + } 895 + 896 + facets.push(Facet { 897 + index: ByteSlice { 579 898 byte_start: range.start as i64, 580 899 byte_end: range.end as i64, 581 900 extra_data: BTreeMap::new(),