//! Rich text facet structures and rendering for AT Protocol. //! //! This module provides structures for handling rich text facets (mentions, links, hashtags), //! parsing them from text, and rendering them as HTML for display in the UI. //! //! # Byte Offset Calculation //! //! This implementation correctly uses UTF-8 byte offsets as required by AT Protocol. //! The facets use "inclusive start and exclusive end" byte ranges. All parsing is done //! using `regex::bytes::Regex` which operates on byte slices and returns byte positions, //! ensuring correct handling of multi-byte UTF-8 characters (emojis, CJK, accented chars). use atproto_identity::resolve::IdentityResolver; use atproto_record::lexicon::app::bsky::richtext::facet::{ ByteSlice, Facet, FacetFeature, Link, Mention, Tag, }; use regex::bytes::Regex; use std::fmt::Write; /// Configuration for facet parsing and rendering limits #[derive(Debug, Clone, Copy)] pub struct FacetLimits { /// Maximum number of mention facets to process (default: 5) pub mentions_max: usize, /// Maximum number of tag facets to process (default: 5) pub tags_max: usize, /// Maximum number of link facets to process (default: 5) pub links_max: usize, /// Maximum total number of facets to process (default: 10) pub max: usize, } impl Default for FacetLimits { fn default() -> Self { Self { mentions_max: 5, tags_max: 5, links_max: 5, max: 10, } } } /// Mention span with byte positions and handle #[derive(Debug)] pub struct MentionSpan { pub start: usize, pub end: usize, pub handle: String, } /// URL span with byte positions and URL #[derive(Debug)] pub struct UrlSpan { pub start: usize, pub end: usize, pub url: String, } /// Tag span with byte positions and tag text #[derive(Debug)] pub struct TagSpan { pub start: usize, pub end: usize, pub tag: String, } /// Parse mentions from text and return their byte positions /// This function excludes mentions that appear within URLs pub fn parse_mentions(text: &str) -> Vec { let mut spans = Vec::new(); // First, parse all URLs to exclude mention matches within them let url_spans = parse_urls(text); // Regex based on: https://atproto.com/specs/handle#handle-identifier-syntax // Pattern: [$|\W](@([a-zA-Z0-9]([a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?\.)+[a-zA-Z]([a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?) let mention_regex = Regex::new( r"(?:^|[^\w])(@([a-zA-Z0-9]([a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?\.)+[a-zA-Z]([a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)" ).unwrap(); let text_bytes = text.as_bytes(); for capture in mention_regex.captures_iter(text_bytes) { if let Some(mention_match) = capture.get(1) { let start = mention_match.start(); let end = mention_match.end(); // Check if this mention overlaps with any URL let overlaps_url = url_spans.iter().any(|url| { // Check if mention is within or overlaps the URL span (start >= url.start && start < url.end) || (end > url.start && end <= url.end) }); // Only add the mention if it doesn't overlap with a URL if !overlaps_url { let handle = std::str::from_utf8(&mention_match.as_bytes()[1..]) .unwrap_or_default() .to_string(); spans.push(MentionSpan { start, end, handle }); } } } spans } /// Parse URLs from text and return their byte positions pub fn parse_urls(text: &str) -> Vec { let mut spans = Vec::new(); // Partial/naive URL regex based on: https://stackoverflow.com/a/3809435 // Pattern: [$|\W](https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]+\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*[-a-zA-Z0-9@%_\+~#//=])?) // Modified to use + instead of {1,6} to support longer TLDs and multi-level subdomains let url_regex = Regex::new( r"(?:^|[^\w])(https?://(?:www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]+\b(?:[-a-zA-Z0-9()@:%_\+.~#?&//=]*[-a-zA-Z0-9@%_\+~#//=])?)" ).unwrap(); let text_bytes = text.as_bytes(); for capture in url_regex.captures_iter(text_bytes) { if let Some(url_match) = capture.get(1) { let url = std::str::from_utf8(url_match.as_bytes()) .unwrap_or_default() .to_string(); spans.push(UrlSpan { start: url_match.start(), end: url_match.end(), url, }); } } spans } /// Parse hashtags from text and return their byte positions pub fn parse_tags(text: &str) -> Vec { let mut spans = Vec::new(); // Regex based on: https://github.com/bluesky-social/atproto/blob/d91988fe79030b61b556dd6f16a46f0c3b9d0b44/packages/api/src/rich-text/util.ts // Simplified for Rust - matches hashtags at word boundaries // Pattern matches: start of string or non-word char, then # or #, then tag content let tag_regex = Regex::new(r"(?:^|[^\w])([##])([\w]+(?:[\w]*)*)").unwrap(); let text_bytes = text.as_bytes(); // Work with bytes for proper position tracking for capture in tag_regex.captures_iter(text_bytes) { if let (Some(full_match), Some(hash_match), Some(tag_match)) = (capture.get(0), capture.get(1), capture.get(2)) { // Calculate the absolute byte position of the hash symbol // The full match includes the preceding character (if any) // so we need to adjust for that let match_start = full_match.start(); let hash_offset = hash_match.start() - full_match.start(); let start = match_start + hash_offset; let end = match_start + hash_offset + hash_match.len() + tag_match.len(); // Extract just the tag text (without the hash symbol) // Normalize to lowercase for case-insensitive tag matching let tag = std::str::from_utf8(tag_match.as_bytes()).unwrap_or_default(); // Only include tags that are not purely numeric if !tag.chars().all(|c| c.is_ascii_digit()) { spans.push(TagSpan { start, end, tag: tag.to_string(), }); } } } spans } /// Parse facets from text and return a vector of Facet objects. /// /// This function extracts mentions, URLs, and hashtags from the provided text /// and creates AT Protocol facets with proper byte indices. /// /// Mentions are resolved to actual DIDs using the provided identity resolver. /// If a handle cannot be resolved to a DID, the mention facet is skipped. /// /// # Arguments /// * `text` - The text to extract facets from /// * `identity_resolver` - Resolver for converting handles to DIDs /// * `limits` - Configuration for maximum facets per type and total /// /// # Returns /// Optional vector of facets. Returns None if no facets were found. pub async fn parse_facets_from_text( text: &str, identity_resolver: &dyn IdentityResolver, limits: &FacetLimits, ) -> Option> { let mut facets = Vec::new(); // Parse mentions (limited by mentions_max) let mention_spans = parse_mentions(text); let mut mention_count = 0; for mention in mention_spans { if mention_count >= limits.mentions_max { break; } // Try to resolve the handle to a DID // First try with at:// prefix, then without let at_uri = format!("at://{}", mention.handle); let did_result = match identity_resolver.resolve(&at_uri).await { Ok(doc) => Ok(doc), Err(_) => identity_resolver.resolve(&mention.handle).await, }; // Only add the mention facet if we successfully resolved the DID if let Ok(did_doc) = did_result { facets.push(Facet { index: ByteSlice { byte_start: mention.start, byte_end: mention.end, }, features: vec![FacetFeature::Mention(Mention { did: did_doc.id.to_string(), })], }); mention_count += 1; } // If resolution fails, we skip this mention facet entirely } // Parse URLs (limited by links_max) let url_spans = parse_urls(text); for (idx, url) in url_spans.into_iter().enumerate() { if idx >= limits.links_max { break; } facets.push(Facet { index: ByteSlice { byte_start: url.start, byte_end: url.end, }, features: vec![FacetFeature::Link(Link { uri: url.url })], }); } // Parse hashtags (limited by tags_max) let tag_spans = parse_tags(text); for (idx, tag_span) in tag_spans.into_iter().enumerate() { if idx >= limits.tags_max { break; } facets.push(Facet { index: ByteSlice { byte_start: tag_span.start, byte_end: tag_span.end, }, features: vec![FacetFeature::Tag(Tag { tag: tag_span.tag })], }); } // Apply global facet limit (truncate if exceeds max) if facets.len() > limits.max { facets.truncate(limits.max); } // Only return facets if we found any if !facets.is_empty() { Some(facets) } else { None } } /// HTML escape helper function fn html_escape(text: &str) -> String { text.chars() .map(|c| match c { '&' => "&".to_string(), '<' => "<".to_string(), '>' => ">".to_string(), '"' => """.to_string(), '\'' => "'".to_string(), c => c.to_string(), }) .collect() } /// Check if text contains HTML tags /// This is used to detect potentially malicious content fn contains_html_tags(text: &str) -> bool { // Look for patterns that indicate HTML tags // We're looking for < followed by either a letter, /, or ! let mut chars = text.chars().peekable(); while let Some(ch) = chars.next() { if ch == '<' && let Some(&next_ch) = chars.peek() { // Check if this looks like an HTML tag if next_ch.is_ascii_alphabetic() || next_ch == '/' || next_ch == '!' { return true; } } } false } /// Render text with facets as HTML. /// /// This function converts plain text with facet annotations into HTML with proper /// links for mentions, URLs, and hashtags based on the facet information. /// /// # HTML Output /// - Mentions: `@handle` /// - Links: `[url]` /// - Tags: `#tag` /// - Regular text is HTML-escaped for security /// /// # Arguments /// * `text` - The plain text content /// * `facets` - Optional facets to apply to the text /// * `limits` - Configuration for maximum facets per type and total /// /// # Returns /// HTML string with facets rendered as links pub fn render_text_with_facets_html( text: &str, facets: Option<&Vec>, limits: &FacetLimits, ) -> String { // First, check if the text contains HTML tags // If it does, treat it as suspicious and just clean it without applying facets if contains_html_tags(text) { // Use ammonia to strip ALL HTML and return plain text let cleaned = ammonia::clean(text); // Convert newlines to
tags after cleaning return cleaned.replace('\n', "
"); } let text_bytes = text.as_bytes(); // If no facets, just return escaped text let Some(facets) = facets else { return html_escape(text); }; // Sort facets by start position to process them in order let mut sorted_facets: Vec<_> = facets.iter().collect(); sorted_facets.sort_by_key(|f| f.index.byte_start); // Apply limits: count facets by type and limit total let mut mention_count = 0; let mut link_count = 0; let mut tag_count = 0; let mut total_count = 0; let filtered_facets: Vec<_> = sorted_facets .into_iter() .filter(|facet| { if total_count >= limits.max { return false; } // Check facet type and apply per-type limits let should_include = facet.features.first().is_some_and(|feature| match feature { FacetFeature::Mention(_) if mention_count < limits.mentions_max => { mention_count += 1; true } FacetFeature::Link(_) if link_count < limits.links_max => { link_count += 1; true } FacetFeature::Tag(_) if tag_count < limits.tags_max => { tag_count += 1; true } _ => false, }); if should_include { total_count += 1; } should_include }) .collect(); let mut html = String::new(); let mut last_end = 0; let text_len = text_bytes.len(); for facet in filtered_facets { // Validate facet indices are within bounds - skip invalid facets if facet.index.byte_start > text_len || facet.index.byte_end > text_len || facet.index.byte_start > facet.index.byte_end { continue; } // Add any text before this facet (HTML-escaped) if facet.index.byte_start > last_end { let text_before = std::str::from_utf8(&text_bytes[last_end..facet.index.byte_start]).unwrap_or(""); html.push_str(&html_escape(text_before)); } // Get the text covered by this facet let facet_text = std::str::from_utf8(&text_bytes[facet.index.byte_start..facet.index.byte_end]) .unwrap_or(""); // Process the facet based on its feature type // Only process the first feature (in practice, there should only be one per facet) if let Some(feature) = facet.features.first() { match feature { FacetFeature::Mention(mention) => { write!( &mut html, r#"{}"#, html_escape(&mention.did), html_escape(facet_text) ) .unwrap(); } FacetFeature::Link(link) => { // Only create link tags for safe URLs if link.uri.starts_with("http://") || link.uri.starts_with("https://") || link.uri.starts_with("/") { write!( &mut html, r#"{}"#, html_escape(&link.uri), html_escape(facet_text) ) .unwrap(); } else { // For unsafe URLs (like javascript:), just render as plain text html.push_str(&html_escape(facet_text)); } } FacetFeature::Tag(tag) => { // URL-encode the tag for the href attribute let encoded_tag = urlencoding::encode(&tag.tag); write!( &mut html, r##"{}"##, encoded_tag, html_escape(facet_text) ) .unwrap(); } } } last_end = facet.index.byte_end; } // Add any remaining text after the last facet if last_end < text_bytes.len() { let remaining_text = std::str::from_utf8(&text_bytes[last_end..]).unwrap_or(""); html.push_str(&html_escape(remaining_text)); } // Sanitize the final HTML output to ensure safety // Configure ammonia to only allow tags with specific attributes let mut builder = ammonia::Builder::new(); builder .tags(std::collections::HashSet::from(["a", "br"])) // Don't automatically add rel="nofollow" - we'll handle it in the attribute filter .link_rel(None) // Allow relative URLs (for internal links like /u/... and /t/...) .url_relative(ammonia::UrlRelative::PassThrough) .attribute_filter(|element, attribute, value| match (element, attribute) { ("a", "href") => { // Only allow safe URLs: relative paths starting with /, or http(s) URLs if value.starts_with('/') || value.starts_with("http://") || value.starts_with("https://") || value.starts_with("#") { Some(value.into()) } else { None } } ("a", "target") => { if value == "_blank" { Some(value.into()) } else { None } } ("a", "rel") => { // For external links, ensure nofollow is present if value.contains("noopener") || value.contains("noreferrer") { // Keep the existing rel value but add nofollow if not present if !value.contains("nofollow") { Some(format!("{} nofollow", value).into()) } else { Some(value.into()) } } else { // Just nofollow for other cases Some("nofollow".into()) } } ("br", _) => None, // br tags don't have attributes _ => None, }); builder.clean(&html).to_string() } #[cfg(test)] mod tests { use async_trait::async_trait; use atproto_identity::model::Document; use atproto_record::lexicon::app::bsky::richtext::facet::{ByteSlice, Link, Mention, Tag}; use std::collections::HashMap; use super::*; /// Mock identity resolver for testing struct MockIdentityResolver { handles_to_dids: HashMap, } impl MockIdentityResolver { fn new() -> Self { let mut handles_to_dids = HashMap::new(); handles_to_dids.insert( "alice.bsky.social".to_string(), "did:plc:alice123".to_string(), ); handles_to_dids.insert( "at://alice.bsky.social".to_string(), "did:plc:alice123".to_string(), ); Self { handles_to_dids } } fn add_identity(&mut self, handle: &str, did: &str) { self.handles_to_dids .insert(handle.to_string(), did.to_string()); self.handles_to_dids .insert(format!("at://{}", handle), did.to_string()); } } #[async_trait] impl IdentityResolver for MockIdentityResolver { async fn resolve(&self, handle: &str) -> anyhow::Result { let handle_key = handle.to_string(); if let Some(did) = self.handles_to_dids.get(&handle_key) { Ok(Document { context: vec![], id: did.clone(), also_known_as: vec![format!("at://{}", handle_key.trim_start_matches("at://"))], verification_method: vec![], service: vec![], extra: HashMap::new(), }) } else { Err(anyhow::anyhow!("Handle not found")) } } } #[test] fn test_html_escape() { assert_eq!(html_escape("Hello & "), "Hello & <world>"); assert_eq!( html_escape("\"quotes\" and 'apostrophes'"), ""quotes" and 'apostrophes'" ); assert_eq!(html_escape("Line 1\nLine 2"), "Line 1\nLine 2"); assert_eq!(html_escape("Normal text"), "Normal text"); } #[test] fn test_render_no_facets() { let text = "This is a description & it's great!"; let limits = FacetLimits::default(); let html = render_text_with_facets_html(text, None, &limits); // HTML tags are detected and stripped by ammonia // The tag is removed entirely assert_eq!(html, "This is a description & it's great!"); } #[test] fn test_render_with_html_tags() { let text = "Check this content!"; let limits = FacetLimits::default(); let html = render_text_with_facets_html(text, None, &limits); // The script tag should be completely removed assert_eq!(html, "Check this content!"); assert!(!html.contains("script")); assert!(!html.contains("alert")); } #[test] fn test_render_with_mention() { let text = "Contact @alice.bsky.social for details"; let limits = FacetLimits::default(); let facets = vec![Facet { index: ByteSlice { byte_start: 8, byte_end: 26, }, features: vec![FacetFeature::Mention(Mention { did: "did:plc:abc123".to_string(), })], }]; let html = render_text_with_facets_html(text, Some(&facets), &limits); assert_eq!( html, r#"Contact @alice.bsky.social for details"# ); } #[test] fn test_render_with_link() { let text = "Apply at https://example.com today!"; let limits = FacetLimits::default(); let facets = vec![Facet { index: ByteSlice { byte_start: 9, byte_end: 28, }, features: vec![FacetFeature::Link(Link { uri: "https://example.com".to_string(), })], }]; let html = render_text_with_facets_html(text, Some(&facets), &limits); assert_eq!( html, r#"Apply at https://example.com today!"# ); } #[test] fn test_render_with_tag() { let text = "Looking for #rust developers"; let limits = FacetLimits::default(); let facets = vec![Facet { index: ByteSlice { byte_start: 12, byte_end: 17, }, features: vec![FacetFeature::Tag(Tag { tag: "rust".to_string(), })], }]; let html = render_text_with_facets_html(text, Some(&facets), &limits); assert_eq!( html, r##"Looking for #rust developers"## ); } #[tokio::test] async fn test_parse_facets_from_text_comprehensive() { let mut resolver = MockIdentityResolver::new(); resolver.add_identity("bob.test.com", "did:plc:bob456"); let limits = FacetLimits::default(); let text = "Join @alice.bsky.social and @bob.test.com at https://example.com #rust #golang"; let facets = parse_facets_from_text(text, &resolver, &limits).await; assert!(facets.is_some()); let facets = facets.unwrap(); assert_eq!(facets.len(), 5); // 2 mentions, 1 URL, 2 hashtags // Check first mention assert_eq!(facets[0].index.byte_start, 5); assert_eq!(facets[0].index.byte_end, 23); if let FacetFeature::Mention(ref mention) = facets[0].features[0] { assert_eq!(mention.did, "did:plc:alice123"); } else { panic!("Expected Mention feature"); } // Check second mention assert_eq!(facets[1].index.byte_start, 28); assert_eq!(facets[1].index.byte_end, 41); if let FacetFeature::Mention(ref mention) = facets[1].features[0] { assert_eq!(mention.did, "did:plc:bob456"); } else { panic!("Expected Mention feature"); } // Check URL assert_eq!(facets[2].index.byte_start, 45); assert_eq!(facets[2].index.byte_end, 64); if let FacetFeature::Link(ref link) = facets[2].features[0] { assert_eq!(link.uri, "https://example.com"); } else { panic!("Expected Link feature"); } // Check first hashtag assert_eq!(facets[3].index.byte_start, 65); assert_eq!(facets[3].index.byte_end, 70); if let FacetFeature::Tag(ref tag) = facets[3].features[0] { assert_eq!(tag.tag, "rust"); } else { panic!("Expected Tag feature"); } // Check second hashtag assert_eq!(facets[4].index.byte_start, 71); assert_eq!(facets[4].index.byte_end, 78); if let FacetFeature::Tag(ref tag) = facets[4].features[0] { assert_eq!(tag.tag, "golang"); } else { panic!("Expected Tag feature"); } } #[tokio::test] async fn test_parse_facets_from_text_with_unresolvable_mention() { let resolver = MockIdentityResolver::new(); let limits = FacetLimits::default(); // Only alice.bsky.social is in the resolver, not unknown.handle.com let text = "Contact @unknown.handle.com for details #rust"; let facets = parse_facets_from_text(text, &resolver, &limits).await; assert!(facets.is_some()); let facets = facets.unwrap(); // Should only have 1 facet (the hashtag) since the mention couldn't be resolved assert_eq!(facets.len(), 1); // Check that it's the hashtag facet if let FacetFeature::Tag(ref tag) = facets[0].features[0] { assert_eq!(tag.tag, "rust"); } else { panic!("Expected Tag feature"); } } #[tokio::test] async fn test_parse_facets_from_text_empty() { let resolver = MockIdentityResolver::new(); let limits = FacetLimits::default(); let text = "No mentions, URLs, or hashtags here"; let facets = parse_facets_from_text(text, &resolver, &limits).await; assert!(facets.is_none()); } #[tokio::test] async fn test_parse_facets_from_text_url_with_at_mention() { let resolver = MockIdentityResolver::new(); let limits = FacetLimits::default(); // URLs with @ should not create mention facets let text = "Tangled https://tangled.org/@smokesignal.events"; let facets = parse_facets_from_text(text, &resolver, &limits).await; assert!(facets.is_some()); let facets = facets.unwrap(); // Should have exactly 1 facet (the URL), not 2 (URL + mention) assert_eq!( facets.len(), 1, "Expected 1 facet (URL only), got {}", facets.len() ); // Verify it's a link facet, not a mention if let FacetFeature::Link(ref link) = facets[0].features[0] { assert_eq!(link.uri, "https://tangled.org/@smokesignal.events"); } else { panic!("Expected Link feature, got Mention or Tag instead"); } } #[tokio::test] async fn test_parse_facets_with_mention_limit() { let mut resolver = MockIdentityResolver::new(); resolver.add_identity("bob.test.com", "did:plc:bob456"); resolver.add_identity("charlie.test.com", "did:plc:charlie789"); // Limit to 2 mentions let limits = FacetLimits { mentions_max: 2, tags_max: 5, links_max: 5, max: 10, }; let text = "Join @alice.bsky.social @bob.test.com @charlie.test.com"; let facets = parse_facets_from_text(text, &resolver, &limits).await; assert!(facets.is_some()); let facets = facets.unwrap(); // Should only have 2 mentions (alice and bob), charlie should be skipped assert_eq!(facets.len(), 2); // Verify they're both mentions for facet in &facets { assert!(matches!(facet.features[0], FacetFeature::Mention(_))); } } #[tokio::test] async fn test_parse_facets_with_global_limit() { let mut resolver = MockIdentityResolver::new(); resolver.add_identity("bob.test.com", "did:plc:bob456"); // Very restrictive global limit let limits = FacetLimits { mentions_max: 5, tags_max: 5, links_max: 5, max: 3, // Only allow 3 total facets }; let text = "Join @alice.bsky.social @bob.test.com at https://example.com #rust #golang #python"; let facets = parse_facets_from_text(text, &resolver, &limits).await; assert!(facets.is_some()); let facets = facets.unwrap(); // Should be truncated to 3 facets total assert_eq!(facets.len(), 3); } #[test] fn test_render_with_facet_limits() { let text = "Contact @alice @bob @charlie for details"; let limits = FacetLimits { mentions_max: 2, // Only render first 2 mentions tags_max: 5, links_max: 5, max: 10, }; let facets = vec![ Facet { index: ByteSlice { byte_start: 8, byte_end: 14, }, features: vec![FacetFeature::Mention(Mention { did: "did:plc:alice".to_string(), })], }, Facet { index: ByteSlice { byte_start: 15, byte_end: 19, }, features: vec![FacetFeature::Mention(Mention { did: "did:plc:bob".to_string(), })], }, Facet { index: ByteSlice { byte_start: 20, byte_end: 28, }, features: vec![FacetFeature::Mention(Mention { did: "did:plc:charlie".to_string(), })], }, ]; let html = render_text_with_facets_html(text, Some(&facets), &limits); // Should only render first 2 mentions, third should be plain text assert!(html.contains(r#"@alice"#)); assert!(html.contains(r#"@bob"#)); // Charlie should NOT be a link due to mention limit assert!(!html.contains(r#""#)); } #[test] fn test_parse_urls_multiple_links() { let text = "IETF124 is happening in Montreal, Nov 1st to 7th https://www.ietf.org/meeting/124/\n\nWe're confirmed for two days of ATProto community sessions on Monday, Nov 3rd & Tuesday, Mov 4th at ECTO Co-Op. Many of us will also be participating in the free-to-attend IETF hackathon on Sunday, Nov 2nd.\n\nLatest updates and attendees in the forum https://discourse.atprotocol.community/t/update-on-timing-and-plan-for-montreal/164"; let url_spans = parse_urls(text); // Debug output for (i, span) in url_spans.iter().enumerate() { println!( "URL {}: {} (start={}, end={})", i, span.url, span.start, span.end ); } // Should find both URLs assert_eq!( url_spans.len(), 2, "Expected 2 URLs but found {}", url_spans.len() ); if !url_spans.is_empty() { assert_eq!(url_spans[0].url, "https://www.ietf.org/meeting/124/"); } if url_spans.len() >= 2 { assert_eq!( url_spans[1].url, "https://discourse.atprotocol.community/t/update-on-timing-and-plan-for-montreal/164" ); } } #[test] fn test_parse_urls_with_html_entity() { // Test with the HTML entity & in the text let text = "IETF124 is happening in Montreal, Nov 1st to 7th https://www.ietf.org/meeting/124/\n\nWe're confirmed for two days of ATProto community sessions on Monday, Nov 3rd & Tuesday, Mov 4th at ECTO Co-Op. Many of us will also be participating in the free-to-attend IETF hackathon on Sunday, Nov 2nd.\n\nLatest updates and attendees in the forum https://discourse.atprotocol.community/t/update-on-timing-and-plan-for-montreal/164"; let url_spans = parse_urls(text); // Debug output for (i, span) in url_spans.iter().enumerate() { println!( "URL {}: {} (start={}, end={})", i, span.url, span.start, span.end ); println!( " Context before: {:?}", &text[span.start.saturating_sub(10)..span.start] ); println!( " Context after: {:?}", &text[span.end..std::cmp::min(span.end + 10, text.len())] ); } // Should find both URLs assert_eq!( url_spans.len(), 2, "Expected 2 URLs but found {}", url_spans.len() ); if !url_spans.is_empty() { assert_eq!(url_spans[0].url, "https://www.ietf.org/meeting/124/"); } if url_spans.len() >= 2 { assert_eq!( url_spans[1].url, "https://discourse.atprotocol.community/t/update-on-timing-and-plan-for-montreal/164" ); } } #[test] fn test_render_malicious_link() { let text = "Visit example.com for details"; let limits = FacetLimits::default(); let facets = vec![Facet { index: ByteSlice { byte_start: 6, byte_end: 17, }, features: vec![FacetFeature::Link(Link { uri: "javascript:alert('XSS')".to_string(), })], }]; let html = render_text_with_facets_html(text, Some(&facets), &limits); // JavaScript URLs should be blocked assert!(!html.contains("javascript:")); assert_eq!(html, "Visit example.com for details"); } #[test] fn test_byte_offset_with_html_entities() { // This test demonstrates that HTML entity escaping shifts byte positions. // The byte positions shift: // In original: '&' is at byte 8 (1 byte) // In escaped: '&' starts at byte 8 (5 bytes) // This causes facet byte offsets to be misaligned if text is escaped before rendering. // If we have a URL after the ampersand in the original: let original_with_url = "Nov 3rd & Tuesday https://example.com"; let escaped_with_url = "Nov 3rd & Tuesday https://example.com"; // Parse URLs from both versions let original_urls = parse_urls(original_with_url); let escaped_urls = parse_urls(escaped_with_url); println!("Original text: {:?}", original_with_url); println!( "Original URL found at: {:?}", original_urls.first().map(|u| (u.start, u.end)) ); println!("Escaped text: {:?}", escaped_with_url); println!( "Escaped URL found at: {:?}", escaped_urls.first().map(|u| (u.start, u.end)) ); // Both should find the URL, but at different byte positions assert_eq!(original_urls.len(), 1); assert_eq!(escaped_urls.len(), 1); // The byte positions will be different assert_eq!(original_urls[0].start, 18); // After "Nov 3rd & Tuesday " assert_eq!(escaped_urls[0].start, 22); // After "Nov 3rd & Tuesday " (4 extra bytes for &) } #[test] fn test_render_facets_with_ampersand_in_text() { // Test case from the bug report: text with & that should have two URL facets let text = "IETF124 is happening in Montreal, Nov 1st to 7th https://www.ietf.org/meeting/124/\n\nWe're confirmed for two days of ATProto community sessions on Monday, Nov 3rd & Tuesday, Mov 4th at ECTO Co-Op. Many of us will also be participating in the free-to-attend IETF hackathon on Sunday, Nov 2nd.\n\nLatest updates and attendees in the forum https://discourse.atprotocol.community/t/update-on-timing-and-plan-for-montreal/164"; // Parse facets from the original text let url_spans = parse_urls(text); assert_eq!(url_spans.len(), 2, "Should find 2 URLs"); // Create facets from the parsed URLs let facets = vec![ Facet { index: ByteSlice { byte_start: url_spans[0].start, byte_end: url_spans[0].end, }, features: vec![FacetFeature::Link(Link { uri: url_spans[0].url.clone(), })], }, Facet { index: ByteSlice { byte_start: url_spans[1].start, byte_end: url_spans[1].end, }, features: vec![FacetFeature::Link(Link { uri: url_spans[1].url.clone(), })], }, ]; // Render with facets - this should work correctly even with & in the text let limits = FacetLimits::default(); let html = render_text_with_facets_html(text, Some(&facets), &limits); // Both URLs should be rendered as links assert!( html.contains(r#"").count(), 2, "Should have 2 closing tags" ); } #[test] fn test_render_with_out_of_bounds_facet() { // Regression test for panic: "range end index 324 out of range for slice of length 323" // This can happen when facets come from external AT Protocol data with incorrect byte offsets let text = "Hello world"; // 11 bytes let limits = FacetLimits::default(); // Create a facet that extends beyond the text length let facets = vec![Facet { index: ByteSlice { byte_start: 6, byte_end: 20, // Beyond text length of 11 }, features: vec![FacetFeature::Link(Link { uri: "https://example.com".to_string(), })], }]; // This should NOT panic - invalid facets should be skipped let html = render_text_with_facets_html(text, Some(&facets), &limits); // The text should still be rendered (escaped), just without the invalid facet assert_eq!(html, "Hello world"); } #[test] fn test_render_with_facet_start_beyond_text() { // Test when facet start is beyond text length let text = "Short"; // 5 bytes let limits = FacetLimits::default(); let facets = vec![Facet { index: ByteSlice { byte_start: 100, // Way beyond text length byte_end: 110, }, features: vec![FacetFeature::Link(Link { uri: "https://example.com".to_string(), })], }]; // Should not panic let html = render_text_with_facets_html(text, Some(&facets), &limits); assert_eq!(html, "Short"); } #[test] fn test_render_with_inverted_facet_indices() { // Test when byte_start > byte_end (invalid) let text = "Hello world"; let limits = FacetLimits::default(); let facets = vec![Facet { index: ByteSlice { byte_start: 8, byte_end: 4, // Invalid: end before start }, features: vec![FacetFeature::Link(Link { uri: "https://example.com".to_string(), })], }]; // Should not panic let html = render_text_with_facets_html(text, Some(&facets), &limits); assert_eq!(html, "Hello world"); } #[test] fn test_parse_urls_from_atproto_record_text() { // Test parsing URLs from real AT Protocol record description text. // This demonstrates the correct byte positions that should be used for facets. let text = "Dev, Power Users, and Generally inquisitive folks get a completely unprofessionally amateur interview. Just a yap sesh where chat is part of the call!\n\n✨the daniel✨ & I will be on a Zoom call and I will stream out to https://stream.place/psingletary.com\n\nSubscribe to the publications! https://atprotocalls.leaflet.pub/"; let url_spans = parse_urls(text); assert_eq!(url_spans.len(), 2, "Should find 2 URLs"); // First URL: https://stream.place/psingletary.com assert_eq!(url_spans[0].url, "https://stream.place/psingletary.com"); assert_eq!(url_spans[0].start, 221); assert_eq!(url_spans[0].end, 257); // Second URL: https://atprotocalls.leaflet.pub/ assert_eq!(url_spans[1].url, "https://atprotocalls.leaflet.pub/"); assert_eq!(url_spans[1].start, 290); assert_eq!(url_spans[1].end, 323); // Verify the byte slices match the expected text let text_bytes = text.as_bytes(); assert_eq!( std::str::from_utf8(&text_bytes[221..257]).unwrap(), "https://stream.place/psingletary.com" ); assert_eq!( std::str::from_utf8(&text_bytes[290..323]).unwrap(), "https://atprotocalls.leaflet.pub/" ); // Note: The AT Protocol record had incorrect facet indices: // - First link: byteStart=222, byteEnd=258 (should be 221, 257) // - Second link: byteStart=291, byteEnd=324 (should be 290, 323) // This off-by-one error was in the source data, not our parser. } #[test] fn test_render_with_off_by_one_facet_indices() { // Regression test for facets with off-by-one byte indices from external AT Protocol data. // The facets in this test have byteStart values that are 1 byte too high, causing // the first character of the URL to appear outside the link tag. // // This test documents the current behavior: the renderer faithfully applies facets // at the specified byte positions, even if those positions are incorrect. // The root cause is incorrect facet generation by the client that created the record. let text = "Dev, Power Users, and Generally inquisitive folks get a completely unprofessionally amateur interview. Just a yap sesh where chat is part of the call!\n\n✨the daniel✨ & I will be on a Zoom call and I will stream out to https://stream.place/psingletary.com\n\nSubscribe to the publications! https://atprotocalls.leaflet.pub/"; // Verify text length - the second facet's byte_end (324) exceeds this assert_eq!(text.len(), 323, "Text should be 323 bytes"); let limits = FacetLimits::default(); // These facets have incorrect byte indices (off by 1) - this is real data from AT Protocol let facets = vec![ Facet { index: ByteSlice { byte_start: 222, // Should be 221 byte_end: 258, // Should be 257 (but 258 is within bounds) }, features: vec![FacetFeature::Link(Link { uri: "https://stream.place/psingletary.com".to_string(), })], }, Facet { index: ByteSlice { byte_start: 291, // Should be 290 byte_end: 324, // Should be 323 - but 324 > text.len() so this facet is SKIPPED }, features: vec![FacetFeature::Link(Link { uri: "https://atprotocalls.leaflet.pub/".to_string(), })], }, ]; let html = render_text_with_facets_html(text, Some(&facets), &limits); // Due to off-by-one facet indices, the 'h' from 'https' appears before the link tag assert!( html.contains(r#"stream out to h text.len() (323) // This is the bounds check in render_text_with_facets_html preventing out-of-bounds access assert!( !html.contains(r#"