A better Rust ATProto crate

resolution, incl opengraph stuff

Orual e706c991 0cbf3d8e

+341 -17
+166
Cargo.lock
··· 1432 1432 ] 1433 1433 1434 1434 [[package]] 1435 + name = "futf" 1436 + version = "0.1.5" 1437 + source = "registry+https://github.com/rust-lang/crates.io-index" 1438 + checksum = "df420e2e84819663797d1ec6544b13c5be84629e7bb00dc960d6917db2987843" 1439 + dependencies = [ 1440 + "mac", 1441 + "new_debug_unreachable", 1442 + ] 1443 + 1444 + [[package]] 1435 1445 name = "futures" 1436 1446 version = "0.3.31" 1437 1447 source = "registry+https://github.com/rust-lang/crates.io-index" ··· 1813 1823 ] 1814 1824 1815 1825 [[package]] 1826 + name = "html5ever" 1827 + version = "0.27.0" 1828 + source = "registry+https://github.com/rust-lang/crates.io-index" 1829 + checksum = "c13771afe0e6e846f1e67d038d4cb29998a6779f93c809212e4e9c32efd244d4" 1830 + dependencies = [ 1831 + "log", 1832 + "mac", 1833 + "markup5ever", 1834 + "proc-macro2", 1835 + "quote", 1836 + "syn 2.0.106", 1837 + ] 1838 + 1839 + [[package]] 1816 1840 name = "http" 1817 1841 version = "1.3.1" 1818 1842 source = "registry+https://github.com/rust-lang/crates.io-index" ··· 2252 2276 "trait-variant", 2253 2277 "url", 2254 2278 "viuer", 2279 + "webpage", 2255 2280 ] 2256 2281 2257 2282 [[package]] ··· 2775 2800 checksum = "112b39cec0b298b6c1999fee3e31427f74f676e4cb9879ed1a121b43661a4154" 2776 2801 2777 2802 [[package]] 2803 + name = "mac" 2804 + version = "0.1.1" 2805 + source = "registry+https://github.com/rust-lang/crates.io-index" 2806 + checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4" 2807 + 2808 + [[package]] 2778 2809 name = "malloc_buf" 2779 2810 version = "0.0.6" 2780 2811 source = "registry+https://github.com/rust-lang/crates.io-index" ··· 2784 2815 ] 2785 2816 2786 2817 [[package]] 2818 + name = "markup5ever" 2819 + version = "0.12.1" 2820 + source = "registry+https://github.com/rust-lang/crates.io-index" 2821 + checksum = "16ce3abbeba692c8b8441d036ef91aea6df8da2c6b6e21c7e14d3c18e526be45" 2822 + dependencies = [ 2823 + "log", 2824 + "phf", 2825 + "phf_codegen", 2826 + "string_cache", 2827 + "string_cache_codegen", 2828 + "tendril", 2829 + ] 2830 + 2831 + [[package]] 2832 + name = "markup5ever_rcdom" 2833 + version = "0.3.0" 2834 + source = "registry+https://github.com/rust-lang/crates.io-index" 2835 + checksum = "edaa21ab3701bfee5099ade5f7e1f84553fd19228cf332f13cd6e964bf59be18" 2836 + dependencies = [ 2837 + "html5ever", 2838 + "markup5ever", 2839 + "tendril", 2840 + "xml5ever", 2841 + ] 2842 + 2843 + [[package]] 2787 2844 name = "match-lookup" 2788 2845 version = "0.1.1" 2789 2846 source = "registry+https://github.com/rust-lang/crates.io-index" ··· 3283 3340 checksum = "9b4f627cb1b25917193a259e49bdad08f671f8d9708acfd5fe0a8c1455d87220" 3284 3341 3285 3342 [[package]] 3343 + name = "phf" 3344 + version = "0.11.3" 3345 + source = "registry+https://github.com/rust-lang/crates.io-index" 3346 + checksum = "1fd6780a80ae0c52cc120a26a1a42c1ae51b247a253e4e06113d23d2c2edd078" 3347 + dependencies = [ 3348 + "phf_shared", 3349 + ] 3350 + 3351 + [[package]] 3352 + name = "phf_codegen" 3353 + version = "0.11.3" 3354 + source = "registry+https://github.com/rust-lang/crates.io-index" 3355 + checksum = "aef8048c789fa5e851558d709946d6d79a8ff88c0440c587967f8e94bfb1216a" 3356 + dependencies = [ 3357 + "phf_generator", 3358 + "phf_shared", 3359 + ] 3360 + 3361 + [[package]] 3362 + name = "phf_generator" 3363 + version = "0.11.3" 3364 + source = "registry+https://github.com/rust-lang/crates.io-index" 3365 + checksum = "3c80231409c20246a13fddb31776fb942c38553c51e871f8cbd687a4cfb5843d" 3366 + dependencies = [ 3367 + "phf_shared", 3368 + "rand 0.8.5", 3369 + ] 3370 + 3371 + [[package]] 3372 + name = "phf_shared" 3373 + version = "0.11.3" 3374 + source = "registry+https://github.com/rust-lang/crates.io-index" 3375 + checksum = "67eabc2ef2a60eb7faa00097bd1ffdb5bd28e62bf39990626a582201b7a754e5" 3376 + dependencies = [ 3377 + "siphasher", 3378 + ] 3379 + 3380 + [[package]] 3286 3381 name = "pin-project" 3287 3382 version = "1.1.10" 3288 3383 source = "registry+https://github.com/rust-lang/crates.io-index" ··· 3377 3472 dependencies = [ 3378 3473 "zerocopy", 3379 3474 ] 3475 + 3476 + [[package]] 3477 + name = "precomputed-hash" 3478 + version = "0.1.1" 3479 + source = "registry+https://github.com/rust-lang/crates.io-index" 3480 + checksum = "925383efa346730478fb4838dbe9137d2a47675ad789c546d150a6e1dd4ab31c" 3380 3481 3381 3482 [[package]] 3382 3483 name = "pretty_assertions" ··· 4406 4507 ] 4407 4508 4408 4509 [[package]] 4510 + name = "siphasher" 4511 + version = "1.0.1" 4512 + source = "registry+https://github.com/rust-lang/crates.io-index" 4513 + checksum = "56199f7ddabf13fe5074ce809e7d3f42b42ae711800501b5b16ea82ad029c39d" 4514 + 4515 + [[package]] 4409 4516 name = "slab" 4410 4517 version = "0.4.11" 4411 4518 source = "registry+https://github.com/rust-lang/crates.io-index" ··· 4502 4609 checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" 4503 4610 4504 4611 [[package]] 4612 + name = "string_cache" 4613 + version = "0.8.9" 4614 + source = "registry+https://github.com/rust-lang/crates.io-index" 4615 + checksum = "bf776ba3fa74f83bf4b63c3dcbbf82173db2632ed8452cb2d891d33f459de70f" 4616 + dependencies = [ 4617 + "new_debug_unreachable", 4618 + "parking_lot", 4619 + "phf_shared", 4620 + "precomputed-hash", 4621 + "serde", 4622 + ] 4623 + 4624 + [[package]] 4625 + name = "string_cache_codegen" 4626 + version = "0.5.4" 4627 + source = "registry+https://github.com/rust-lang/crates.io-index" 4628 + checksum = "c711928715f1fe0fe509c53b43e993a9a557babc2d0a3567d0a3006f1ac931a0" 4629 + dependencies = [ 4630 + "phf_generator", 4631 + "phf_shared", 4632 + "proc-macro2", 4633 + "quote", 4634 + ] 4635 + 4636 + [[package]] 4505 4637 name = "strsim" 4506 4638 version = "0.11.1" 4507 4639 source = "registry+https://github.com/rust-lang/crates.io-index" ··· 4638 4770 "once_cell", 4639 4771 "rustix 1.1.2", 4640 4772 "windows-sys 0.61.2", 4773 + ] 4774 + 4775 + [[package]] 4776 + name = "tendril" 4777 + version = "0.4.3" 4778 + source = "registry+https://github.com/rust-lang/crates.io-index" 4779 + checksum = "d24a120c5fc464a3458240ee02c299ebcb9d67b5249c8848b09d639dca8d7bb0" 4780 + dependencies = [ 4781 + "futf", 4782 + "mac", 4783 + "utf-8", 4641 4784 ] 4642 4785 4643 4786 [[package]] ··· 5431 5574 ] 5432 5575 5433 5576 [[package]] 5577 + name = "webpage" 5578 + version = "2.0.1" 5579 + source = "registry+https://github.com/rust-lang/crates.io-index" 5580 + checksum = "70862efc041d46e6bbaa82bb9c34ae0596d090e86cbd14bd9e93b36ee6802eac" 5581 + dependencies = [ 5582 + "html5ever", 5583 + "markup5ever_rcdom", 5584 + "serde_json", 5585 + "url", 5586 + ] 5587 + 5588 + [[package]] 5434 5589 name = "webpki-roots" 5435 5590 version = "1.0.3" 5436 5591 source = "registry+https://github.com/rust-lang/crates.io-index" ··· 5977 6132 version = "0.6.1" 5978 6133 source = "registry+https://github.com/rust-lang/crates.io-index" 5979 6134 checksum = "ea2f10b9bb0928dfb1b42b65e1f9e36f7f54dbdf08457afefb38afcdec4fa2bb" 6135 + 6136 + [[package]] 6137 + name = "xml5ever" 6138 + version = "0.18.1" 6139 + source = "registry+https://github.com/rust-lang/crates.io-index" 6140 + checksum = "9bbb26405d8e919bc1547a5aa9abc95cbfa438f04844f5fdd9dc7596b748bf69" 6141 + dependencies = [ 6142 + "log", 6143 + "mac", 6144 + "markup5ever", 6145 + ] 5980 6146 5981 6147 [[package]] 5982 6148 name = "yansi"
+1
Cargo.toml
··· 83 83 84 84 # Text processing 85 85 regex = "1.11" 86 + webpage = { version = "2.0", default-features = false }
+1
crates/jacquard/Cargo.toml
··· 147 147 smol_str.workspace = true 148 148 percent-encoding.workspace = true 149 149 regex.workspace = true 150 + webpage.workspace = true 150 151 jose-jwk = { workspace = true, features = ["p256"] } 151 152 p256 = { workspace = true, features = ["ecdsa"] } 152 153 rand_core.workspace = true
+173 -17
crates/jacquard/src/richtext.rs
··· 46 46 /// Marker type indicating some facets may need resolution (handles → DIDs) 47 47 pub struct Unresolved; 48 48 49 + /// Rich text with facets (mentions, links, tags) 50 + #[derive(Debug, Clone)] 51 + #[cfg(feature = "api_bluesky")] 52 + pub struct RichText<'a> { 53 + /// The text content 54 + pub text: CowStr<'a>, 55 + /// Facets (mentions, links, tags) 56 + pub facets: Option<Vec<Facet<'a>>>, 57 + } 58 + 59 + #[cfg(feature = "api_bluesky")] 60 + impl RichText<'static> { 61 + /// Entry point for parsing text with automatic facet detection 62 + /// 63 + /// Uses default embed domains (bsky.app, deer.social) for at-URI extraction. 64 + pub fn parse(text: impl Into<String>) -> RichTextBuilder<Unresolved> { 65 + parse(text) 66 + } 67 + 68 + /// Entry point for manual richtext construction 69 + pub fn builder() -> RichTextBuilder<Resolved> { 70 + RichTextBuilder::builder() 71 + } 72 + } 73 + 49 74 /// Detected embed candidate from URL or at-URI 50 75 #[derive(Debug, Clone)] 51 76 #[cfg(feature = "api_bluesky")] ··· 84 109 text: String, 85 110 facet_candidates: Vec<FacetCandidate>, 86 111 #[cfg(feature = "api_bluesky")] 87 - embed_candidates: Vec<EmbedCandidate<'static>>, 112 + embed_candidates: Option<Vec<EmbedCandidate<'static>>>, 88 113 _state: PhantomData<State>, 89 114 } 90 115 ··· 193 218 RichTextBuilder { 194 219 text: text_processed, 195 220 facet_candidates, 196 - embed_candidates, 221 + embed_candidates: if embed_candidates.is_empty() { 222 + None 223 + } else { 224 + Some(embed_candidates) 225 + }, 197 226 _state: PhantomData, 198 227 } 199 228 } ··· 237 266 text: String::new(), 238 267 facet_candidates: Vec::new(), 239 268 #[cfg(feature = "api_bluesky")] 240 - embed_candidates: Vec::new(), 269 + embed_candidates: None, 241 270 _state: PhantomData, 242 271 } 243 272 } 244 273 245 274 /// Add a mention by handle (transitions to Unresolved state) 246 275 pub fn mention_handle( 247 - self, 276 + mut self, 248 277 handle: impl AsRef<str>, 249 278 range: Option<Range<usize>>, 250 279 ) -> RichTextBuilder<Unresolved> { ··· 255 284 self.find_substring(&search).unwrap_or(0..0) 256 285 }); 257 286 258 - let mut facet_candidates = self.facet_candidates; 259 - facet_candidates.push(FacetCandidate::Mention { range, did: None }); 287 + self.facet_candidates 288 + .push(FacetCandidate::Mention { range, did: None }); 260 289 261 290 RichTextBuilder { 262 291 text: self.text, 263 - facet_candidates, 292 + facet_candidates: self.facet_candidates, 264 293 #[cfg(feature = "api_bluesky")] 265 294 embed_candidates: self.embed_candidates, 266 295 _state: PhantomData, ··· 326 355 strong_ref: Option<crate::api::com_atproto::repo::strong_ref::StrongRef<'static>>, 327 356 ) -> Self { 328 357 self.embed_candidates 358 + .get_or_insert_with(Vec::new) 329 359 .push(EmbedCandidate::Record { at_uri, strong_ref }); 330 360 self 331 361 } ··· 337 367 url: impl Into<CowStr<'static>>, 338 368 metadata: Option<ExternalMetadata<'static>>, 339 369 ) -> Self { 340 - self.embed_candidates.push(EmbedCandidate::External { 341 - url: url.into(), 342 - metadata, 343 - }); 370 + self.embed_candidates 371 + .get_or_insert_with(Vec::new) 372 + .push(EmbedCandidate::External { 373 + url: url.into(), 374 + metadata, 375 + }); 344 376 self 345 377 } 346 378 ··· 620 652 #[cfg(feature = "api_bluesky")] 621 653 impl RichTextBuilder<Resolved> { 622 654 /// Build the richtext (sync - all facets must be resolved) 623 - pub fn build(self) -> Result<(String, Option<Vec<Facet<'static>>>), RichTextError> { 655 + pub fn build(self) -> Result<RichText<'static>, RichTextError> { 624 656 use std::collections::BTreeMap; 625 657 if self.facet_candidates.is_empty() { 626 - return Ok((self.text, None)); 658 + return Ok(RichText { 659 + text: CowStr::from(self.text), 660 + facets: None, 661 + }); 627 662 } 628 663 629 664 // Sort facets by start position ··· 755 790 last_end = range.end; 756 791 } 757 792 758 - Ok((self.text, Some(facets.into_static()))) 793 + Ok(RichText { 794 + text: CowStr::from(self.text), 795 + facets: Some(facets.into_static()), 796 + }) 759 797 } 760 798 } 761 799 ··· 765 803 pub async fn build_async<R>( 766 804 self, 767 805 resolver: &R, 768 - ) -> Result<(String, Option<Vec<Facet<'static>>>), RichTextError> 806 + ) -> Result<RichText<'static>, RichTextError> 769 807 where 770 808 R: jacquard_identity::resolver::IdentityResolver + Sync, 771 809 { ··· 775 813 use std::collections::BTreeMap; 776 814 777 815 if self.facet_candidates.is_empty() { 778 - return Ok((self.text, None)); 816 + return Ok(RichText { 817 + text: CowStr::from(self.text), 818 + facets: None, 819 + }); 779 820 } 780 821 781 822 // Sort facets by start position ··· 906 947 last_end = range.end; 907 948 } 908 949 909 - Ok((self.text, Some(facets.into_static()))) 950 + Ok(RichText { 951 + text: CowStr::from(self.text), 952 + facets: Some(facets.into_static()), 953 + }) 910 954 } 955 + 956 + /// Build richtext with embed resolution using HttpClient 957 + /// 958 + /// This resolves handles to DIDs and fetches OpenGraph metadata for external links. 959 + pub async fn build_with_embeds_async<C>( 960 + mut self, 961 + client: &C, 962 + ) -> Result<(RichText<'static>, Option<Vec<EmbedCandidate<'static>>>), RichTextError> 963 + where 964 + C: jacquard_common::http_client::HttpClient 965 + + jacquard_identity::resolver::IdentityResolver 966 + + Sync, 967 + { 968 + // Extract embed candidates 969 + let embed_candidates = self.embed_candidates.take().unwrap_or_default(); 970 + 971 + // Build facets (resolves handles) 972 + let richtext = self.build_async(client).await?; 973 + 974 + // Now resolve embed candidates 975 + let mut resolved_embeds = Vec::new(); 976 + 977 + for candidate in embed_candidates { 978 + match candidate { 979 + EmbedCandidate::Record { at_uri, strong_ref } => { 980 + // TODO: could fetch the record to get CID for strong_ref 981 + // For now, just pass through 982 + resolved_embeds.push(EmbedCandidate::Record { at_uri, strong_ref }); 983 + } 984 + EmbedCandidate::External { 985 + url, 986 + metadata: None, 987 + } => { 988 + // Fetch OpenGraph metadata 989 + match fetch_opengraph_metadata(client, &url).await { 990 + Ok(Some(metadata)) => { 991 + resolved_embeds.push(EmbedCandidate::External { 992 + url, 993 + metadata: Some(metadata), 994 + }); 995 + } 996 + Ok(None) | Err(_) => { 997 + // If we fail to fetch metadata, include embed without metadata 998 + resolved_embeds.push(EmbedCandidate::External { 999 + url, 1000 + metadata: None, 1001 + }); 1002 + } 1003 + } 1004 + } 1005 + other => resolved_embeds.push(other), 1006 + } 1007 + } 1008 + 1009 + Ok((richtext, Some(resolved_embeds).filter(|v| !v.is_empty()))) 1010 + } 1011 + } 1012 + 1013 + /// Fetch OpenGraph metadata from a URL using the webpage crate 1014 + #[cfg(feature = "api_bluesky")] 1015 + async fn fetch_opengraph_metadata<C>( 1016 + client: &C, 1017 + url: &str, 1018 + ) -> Result<Option<ExternalMetadata<'static>>, Box<dyn std::error::Error + Send + Sync>> 1019 + where 1020 + C: jacquard_common::http_client::HttpClient, 1021 + { 1022 + // Build HTTP GET request 1023 + let request = http::Request::builder() 1024 + .method("GET") 1025 + .uri(url) 1026 + .header("User-Agent", "jacquard/0.6") 1027 + .body(Vec::new()) 1028 + .map_err(|e| Box::new(e) as Box<dyn std::error::Error + Send + Sync>)?; 1029 + 1030 + // Fetch the page 1031 + let response = client 1032 + .send_http(request) 1033 + .await 1034 + .map_err(|e| Box::new(e) as Box<dyn std::error::Error + Send + Sync>)?; 1035 + 1036 + // Parse HTML body 1037 + let html = String::from_utf8_lossy(response.body()); 1038 + 1039 + // Use webpage crate to extract OpenGraph metadata 1040 + let info = webpage::HTML::from_string(html.to_string(), Some(url.to_string())) 1041 + .ok() 1042 + .map(|html| html.opengraph); 1043 + 1044 + if let Some(og) = info { 1045 + // Extract title, description, and thumbnail 1046 + 1047 + use jacquard_common::cowstr::ToCowStr; 1048 + let title = og.properties.get("title").map(|s| s.to_cowstr()); 1049 + 1050 + let description = og.properties.get("description").map(|s| s.to_cowstr()); 1051 + 1052 + let thumbnail = og.images.first().map(|img| CowStr::from(img.url.clone())); 1053 + 1054 + // Only return metadata if we have at least a title 1055 + if let Some(title) = title { 1056 + return Ok(Some(ExternalMetadata { 1057 + title: title.into_static(), 1058 + description: description 1059 + .unwrap_or_else(|| CowStr::new_static("")) 1060 + .into_static(), 1061 + thumbnail: thumbnail.into_static(), 1062 + })); 1063 + } 1064 + } 1065 + 1066 + Ok(None) 911 1067 }