The smokesignal.events web application
at main 1262 lines 47 kB view raw
1//! Rich text facet structures and rendering for AT Protocol. 2//! 3//! This module provides structures for handling rich text facets (mentions, links, hashtags), 4//! parsing them from text, and rendering them as HTML for display in the UI. 5//! 6//! # Byte Offset Calculation 7//! 8//! This implementation correctly uses UTF-8 byte offsets as required by AT Protocol. 9//! The facets use "inclusive start and exclusive end" byte ranges. All parsing is done 10//! using `regex::bytes::Regex` which operates on byte slices and returns byte positions, 11//! ensuring correct handling of multi-byte UTF-8 characters (emojis, CJK, accented chars). 12 13use atproto_identity::resolve::IdentityResolver; 14use atproto_record::lexicon::app::bsky::richtext::facet::{ 15 ByteSlice, Facet, FacetFeature, Link, Mention, Tag, 16}; 17use regex::bytes::Regex; 18use std::fmt::Write; 19 20/// Configuration for facet parsing and rendering limits 21#[derive(Debug, Clone, Copy)] 22pub struct FacetLimits { 23 /// Maximum number of mention facets to process (default: 5) 24 pub mentions_max: usize, 25 /// Maximum number of tag facets to process (default: 5) 26 pub tags_max: usize, 27 /// Maximum number of link facets to process (default: 5) 28 pub links_max: usize, 29 /// Maximum total number of facets to process (default: 10) 30 pub max: usize, 31} 32 33impl Default for FacetLimits { 34 fn default() -> Self { 35 Self { 36 mentions_max: 5, 37 tags_max: 5, 38 links_max: 5, 39 max: 10, 40 } 41 } 42} 43 44/// Mention span with byte positions and handle 45#[derive(Debug)] 46pub struct MentionSpan { 47 pub start: usize, 48 pub end: usize, 49 pub handle: String, 50} 51 52/// URL span with byte positions and URL 53#[derive(Debug)] 54pub struct UrlSpan { 55 pub start: usize, 56 pub end: usize, 57 pub url: String, 58} 59 60/// Tag span with byte positions and tag text 61#[derive(Debug)] 62pub struct TagSpan { 63 pub start: usize, 64 pub end: usize, 65 pub tag: String, 66} 67 68/// Parse mentions from text and return their byte positions 69/// This function excludes mentions that appear within URLs 70pub fn parse_mentions(text: &str) -> Vec<MentionSpan> { 71 let mut spans = Vec::new(); 72 73 // First, parse all URLs to exclude mention matches within them 74 let url_spans = parse_urls(text); 75 76 // Regex based on: https://atproto.com/specs/handle#handle-identifier-syntax 77 // Pattern: [$|\W](@([a-zA-Z0-9]([a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?\.)+[a-zA-Z]([a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?) 78 let mention_regex = Regex::new( 79 r"(?:^|[^\w])(@([a-zA-Z0-9]([a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?\.)+[a-zA-Z]([a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)" 80 ).unwrap(); 81 82 let text_bytes = text.as_bytes(); 83 for capture in mention_regex.captures_iter(text_bytes) { 84 if let Some(mention_match) = capture.get(1) { 85 let start = mention_match.start(); 86 let end = mention_match.end(); 87 88 // Check if this mention overlaps with any URL 89 let overlaps_url = url_spans.iter().any(|url| { 90 // Check if mention is within or overlaps the URL span 91 (start >= url.start && start < url.end) || (end > url.start && end <= url.end) 92 }); 93 94 // Only add the mention if it doesn't overlap with a URL 95 if !overlaps_url { 96 let handle = std::str::from_utf8(&mention_match.as_bytes()[1..]) 97 .unwrap_or_default() 98 .to_string(); 99 100 spans.push(MentionSpan { start, end, handle }); 101 } 102 } 103 } 104 105 spans 106} 107 108/// Parse URLs from text and return their byte positions 109pub fn parse_urls(text: &str) -> Vec<UrlSpan> { 110 let mut spans = Vec::new(); 111 112 // Partial/naive URL regex based on: https://stackoverflow.com/a/3809435 113 // Pattern: [$|\W](https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]+\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*[-a-zA-Z0-9@%_\+~#//=])?) 114 // Modified to use + instead of {1,6} to support longer TLDs and multi-level subdomains 115 let url_regex = Regex::new( 116 r"(?:^|[^\w])(https?://(?:www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]+\b(?:[-a-zA-Z0-9()@:%_\+.~#?&//=]*[-a-zA-Z0-9@%_\+~#//=])?)" 117 ).unwrap(); 118 119 let text_bytes = text.as_bytes(); 120 for capture in url_regex.captures_iter(text_bytes) { 121 if let Some(url_match) = capture.get(1) { 122 let url = std::str::from_utf8(url_match.as_bytes()) 123 .unwrap_or_default() 124 .to_string(); 125 126 spans.push(UrlSpan { 127 start: url_match.start(), 128 end: url_match.end(), 129 url, 130 }); 131 } 132 } 133 134 spans 135} 136 137/// Parse hashtags from text and return their byte positions 138pub fn parse_tags(text: &str) -> Vec<TagSpan> { 139 let mut spans = Vec::new(); 140 141 // Regex based on: https://github.com/bluesky-social/atproto/blob/d91988fe79030b61b556dd6f16a46f0c3b9d0b44/packages/api/src/rich-text/util.ts 142 // Simplified for Rust - matches hashtags at word boundaries 143 // Pattern matches: start of string or non-word char, then # or #, then tag content 144 let tag_regex = Regex::new(r"(?:^|[^\w])([##])([\w]+(?:[\w]*)*)").unwrap(); 145 146 let text_bytes = text.as_bytes(); 147 148 // Work with bytes for proper position tracking 149 for capture in tag_regex.captures_iter(text_bytes) { 150 if let (Some(full_match), Some(hash_match), Some(tag_match)) = 151 (capture.get(0), capture.get(1), capture.get(2)) 152 { 153 // Calculate the absolute byte position of the hash symbol 154 // The full match includes the preceding character (if any) 155 // so we need to adjust for that 156 let match_start = full_match.start(); 157 let hash_offset = hash_match.start() - full_match.start(); 158 let start = match_start + hash_offset; 159 let end = match_start + hash_offset + hash_match.len() + tag_match.len(); 160 161 // Extract just the tag text (without the hash symbol) 162 // Normalize to lowercase for case-insensitive tag matching 163 let tag = std::str::from_utf8(tag_match.as_bytes()).unwrap_or_default(); 164 165 // Only include tags that are not purely numeric 166 if !tag.chars().all(|c| c.is_ascii_digit()) { 167 spans.push(TagSpan { 168 start, 169 end, 170 tag: tag.to_string(), 171 }); 172 } 173 } 174 } 175 176 spans 177} 178 179/// Parse facets from text and return a vector of Facet objects. 180/// 181/// This function extracts mentions, URLs, and hashtags from the provided text 182/// and creates AT Protocol facets with proper byte indices. 183/// 184/// Mentions are resolved to actual DIDs using the provided identity resolver. 185/// If a handle cannot be resolved to a DID, the mention facet is skipped. 186/// 187/// # Arguments 188/// * `text` - The text to extract facets from 189/// * `identity_resolver` - Resolver for converting handles to DIDs 190/// * `limits` - Configuration for maximum facets per type and total 191/// 192/// # Returns 193/// Optional vector of facets. Returns None if no facets were found. 194pub async fn parse_facets_from_text( 195 text: &str, 196 identity_resolver: &dyn IdentityResolver, 197 limits: &FacetLimits, 198) -> Option<Vec<Facet>> { 199 let mut facets = Vec::new(); 200 201 // Parse mentions (limited by mentions_max) 202 let mention_spans = parse_mentions(text); 203 let mut mention_count = 0; 204 for mention in mention_spans { 205 if mention_count >= limits.mentions_max { 206 break; 207 } 208 209 // Try to resolve the handle to a DID 210 // First try with at:// prefix, then without 211 let at_uri = format!("at://{}", mention.handle); 212 let did_result = match identity_resolver.resolve(&at_uri).await { 213 Ok(doc) => Ok(doc), 214 Err(_) => identity_resolver.resolve(&mention.handle).await, 215 }; 216 217 // Only add the mention facet if we successfully resolved the DID 218 if let Ok(did_doc) = did_result { 219 facets.push(Facet { 220 index: ByteSlice { 221 byte_start: mention.start, 222 byte_end: mention.end, 223 }, 224 features: vec![FacetFeature::Mention(Mention { 225 did: did_doc.id.to_string(), 226 })], 227 }); 228 mention_count += 1; 229 } 230 // If resolution fails, we skip this mention facet entirely 231 } 232 233 // Parse URLs (limited by links_max) 234 let url_spans = parse_urls(text); 235 for (idx, url) in url_spans.into_iter().enumerate() { 236 if idx >= limits.links_max { 237 break; 238 } 239 facets.push(Facet { 240 index: ByteSlice { 241 byte_start: url.start, 242 byte_end: url.end, 243 }, 244 features: vec![FacetFeature::Link(Link { uri: url.url })], 245 }); 246 } 247 248 // Parse hashtags (limited by tags_max) 249 let tag_spans = parse_tags(text); 250 for (idx, tag_span) in tag_spans.into_iter().enumerate() { 251 if idx >= limits.tags_max { 252 break; 253 } 254 facets.push(Facet { 255 index: ByteSlice { 256 byte_start: tag_span.start, 257 byte_end: tag_span.end, 258 }, 259 features: vec![FacetFeature::Tag(Tag { tag: tag_span.tag })], 260 }); 261 } 262 263 // Apply global facet limit (truncate if exceeds max) 264 if facets.len() > limits.max { 265 facets.truncate(limits.max); 266 } 267 268 // Only return facets if we found any 269 if !facets.is_empty() { 270 Some(facets) 271 } else { 272 None 273 } 274} 275 276/// HTML escape helper function 277fn html_escape(text: &str) -> String { 278 text.chars() 279 .map(|c| match c { 280 '&' => "&amp;".to_string(), 281 '<' => "&lt;".to_string(), 282 '>' => "&gt;".to_string(), 283 '"' => "&quot;".to_string(), 284 '\'' => "&#39;".to_string(), 285 c => c.to_string(), 286 }) 287 .collect() 288} 289 290/// Check if text contains HTML tags 291/// This is used to detect potentially malicious content 292fn contains_html_tags(text: &str) -> bool { 293 // Look for patterns that indicate HTML tags 294 // We're looking for < followed by either a letter, /, or ! 295 let mut chars = text.chars().peekable(); 296 while let Some(ch) = chars.next() { 297 if ch == '<' 298 && let Some(&next_ch) = chars.peek() 299 { 300 // Check if this looks like an HTML tag 301 if next_ch.is_ascii_alphabetic() || next_ch == '/' || next_ch == '!' { 302 return true; 303 } 304 } 305 } 306 false 307} 308 309/// Render text with facets as HTML. 310/// 311/// This function converts plain text with facet annotations into HTML with proper 312/// links for mentions, URLs, and hashtags based on the facet information. 313/// 314/// # HTML Output 315/// - Mentions: `<a href="/[did]">@handle</a>` 316/// - Links: `<a href="[url]" target="_blank" rel="noopener noreferrer">[url]</a>` 317/// - Tags: `<a href="#[tag]">#tag</a>` 318/// - Regular text is HTML-escaped for security 319/// 320/// # Arguments 321/// * `text` - The plain text content 322/// * `facets` - Optional facets to apply to the text 323/// * `limits` - Configuration for maximum facets per type and total 324/// 325/// # Returns 326/// HTML string with facets rendered as links 327pub fn render_text_with_facets_html( 328 text: &str, 329 facets: Option<&Vec<Facet>>, 330 limits: &FacetLimits, 331) -> String { 332 // First, check if the text contains HTML tags 333 // If it does, treat it as suspicious and just clean it without applying facets 334 if contains_html_tags(text) { 335 // Use ammonia to strip ALL HTML and return plain text 336 let cleaned = ammonia::clean(text); 337 // Convert newlines to <br> tags after cleaning 338 return cleaned.replace('\n', "<br>"); 339 } 340 341 let text_bytes = text.as_bytes(); 342 343 // If no facets, just return escaped text 344 let Some(facets) = facets else { 345 return html_escape(text); 346 }; 347 348 // Sort facets by start position to process them in order 349 let mut sorted_facets: Vec<_> = facets.iter().collect(); 350 sorted_facets.sort_by_key(|f| f.index.byte_start); 351 352 // Apply limits: count facets by type and limit total 353 let mut mention_count = 0; 354 let mut link_count = 0; 355 let mut tag_count = 0; 356 let mut total_count = 0; 357 358 let filtered_facets: Vec<_> = sorted_facets 359 .into_iter() 360 .filter(|facet| { 361 if total_count >= limits.max { 362 return false; 363 } 364 365 // Check facet type and apply per-type limits 366 let should_include = facet.features.first().is_some_and(|feature| match feature { 367 FacetFeature::Mention(_) if mention_count < limits.mentions_max => { 368 mention_count += 1; 369 true 370 } 371 FacetFeature::Link(_) if link_count < limits.links_max => { 372 link_count += 1; 373 true 374 } 375 FacetFeature::Tag(_) if tag_count < limits.tags_max => { 376 tag_count += 1; 377 true 378 } 379 _ => false, 380 }); 381 382 if should_include { 383 total_count += 1; 384 } 385 386 should_include 387 }) 388 .collect(); 389 390 let mut html = String::new(); 391 let mut last_end = 0; 392 let text_len = text_bytes.len(); 393 394 for facet in filtered_facets { 395 // Validate facet indices are within bounds - skip invalid facets 396 if facet.index.byte_start > text_len 397 || facet.index.byte_end > text_len 398 || facet.index.byte_start > facet.index.byte_end 399 { 400 continue; 401 } 402 403 // Add any text before this facet (HTML-escaped) 404 if facet.index.byte_start > last_end { 405 let text_before = 406 std::str::from_utf8(&text_bytes[last_end..facet.index.byte_start]).unwrap_or(""); 407 html.push_str(&html_escape(text_before)); 408 } 409 410 // Get the text covered by this facet 411 let facet_text = 412 std::str::from_utf8(&text_bytes[facet.index.byte_start..facet.index.byte_end]) 413 .unwrap_or(""); 414 415 // Process the facet based on its feature type 416 // Only process the first feature (in practice, there should only be one per facet) 417 if let Some(feature) = facet.features.first() { 418 match feature { 419 FacetFeature::Mention(mention) => { 420 write!( 421 &mut html, 422 r#"<a href="/{}">{}</a>"#, 423 html_escape(&mention.did), 424 html_escape(facet_text) 425 ) 426 .unwrap(); 427 } 428 FacetFeature::Link(link) => { 429 // Only create link tags for safe URLs 430 if link.uri.starts_with("http://") 431 || link.uri.starts_with("https://") 432 || link.uri.starts_with("/") 433 { 434 write!( 435 &mut html, 436 r#"<a href="{}" target="_blank" rel="noopener noreferrer">{}</a>"#, 437 html_escape(&link.uri), 438 html_escape(facet_text) 439 ) 440 .unwrap(); 441 } else { 442 // For unsafe URLs (like javascript:), just render as plain text 443 html.push_str(&html_escape(facet_text)); 444 } 445 } 446 FacetFeature::Tag(tag) => { 447 // URL-encode the tag for the href attribute 448 let encoded_tag = urlencoding::encode(&tag.tag); 449 write!( 450 &mut html, 451 r##"<a href="#{}">{}</a>"##, 452 encoded_tag, 453 html_escape(facet_text) 454 ) 455 .unwrap(); 456 } 457 } 458 } 459 460 last_end = facet.index.byte_end; 461 } 462 463 // Add any remaining text after the last facet 464 if last_end < text_bytes.len() { 465 let remaining_text = std::str::from_utf8(&text_bytes[last_end..]).unwrap_or(""); 466 html.push_str(&html_escape(remaining_text)); 467 } 468 469 // Sanitize the final HTML output to ensure safety 470 // Configure ammonia to only allow <a> tags with specific attributes 471 let mut builder = ammonia::Builder::new(); 472 builder 473 .tags(std::collections::HashSet::from(["a", "br"])) 474 // Don't automatically add rel="nofollow" - we'll handle it in the attribute filter 475 .link_rel(None) 476 // Allow relative URLs (for internal links like /u/... and /t/...) 477 .url_relative(ammonia::UrlRelative::PassThrough) 478 .attribute_filter(|element, attribute, value| match (element, attribute) { 479 ("a", "href") => { 480 // Only allow safe URLs: relative paths starting with /, or http(s) URLs 481 if value.starts_with('/') 482 || value.starts_with("http://") 483 || value.starts_with("https://") 484 || value.starts_with("#") 485 { 486 Some(value.into()) 487 } else { 488 None 489 } 490 } 491 ("a", "target") => { 492 if value == "_blank" { 493 Some(value.into()) 494 } else { 495 None 496 } 497 } 498 ("a", "rel") => { 499 // For external links, ensure nofollow is present 500 if value.contains("noopener") || value.contains("noreferrer") { 501 // Keep the existing rel value but add nofollow if not present 502 if !value.contains("nofollow") { 503 Some(format!("{} nofollow", value).into()) 504 } else { 505 Some(value.into()) 506 } 507 } else { 508 // Just nofollow for other cases 509 Some("nofollow".into()) 510 } 511 } 512 ("br", _) => None, // br tags don't have attributes 513 _ => None, 514 }); 515 516 builder.clean(&html).to_string() 517} 518 519#[cfg(test)] 520mod tests { 521 use async_trait::async_trait; 522 use atproto_identity::model::Document; 523 use atproto_record::lexicon::app::bsky::richtext::facet::{ByteSlice, Link, Mention, Tag}; 524 use std::collections::HashMap; 525 526 use super::*; 527 528 /// Mock identity resolver for testing 529 struct MockIdentityResolver { 530 handles_to_dids: HashMap<String, String>, 531 } 532 533 impl MockIdentityResolver { 534 fn new() -> Self { 535 let mut handles_to_dids = HashMap::new(); 536 handles_to_dids.insert( 537 "alice.bsky.social".to_string(), 538 "did:plc:alice123".to_string(), 539 ); 540 handles_to_dids.insert( 541 "at://alice.bsky.social".to_string(), 542 "did:plc:alice123".to_string(), 543 ); 544 Self { handles_to_dids } 545 } 546 547 fn add_identity(&mut self, handle: &str, did: &str) { 548 self.handles_to_dids 549 .insert(handle.to_string(), did.to_string()); 550 self.handles_to_dids 551 .insert(format!("at://{}", handle), did.to_string()); 552 } 553 } 554 555 #[async_trait] 556 impl IdentityResolver for MockIdentityResolver { 557 async fn resolve(&self, handle: &str) -> anyhow::Result<Document> { 558 let handle_key = handle.to_string(); 559 560 if let Some(did) = self.handles_to_dids.get(&handle_key) { 561 Ok(Document { 562 context: vec![], 563 id: did.clone(), 564 also_known_as: vec![format!("at://{}", handle_key.trim_start_matches("at://"))], 565 verification_method: vec![], 566 service: vec![], 567 extra: HashMap::new(), 568 }) 569 } else { 570 Err(anyhow::anyhow!("Handle not found")) 571 } 572 } 573 } 574 575 #[test] 576 fn test_html_escape() { 577 assert_eq!(html_escape("Hello & <world>"), "Hello &amp; &lt;world&gt;"); 578 assert_eq!( 579 html_escape("\"quotes\" and 'apostrophes'"), 580 "&quot;quotes&quot; and &#39;apostrophes&#39;" 581 ); 582 assert_eq!(html_escape("Line 1\nLine 2"), "Line 1\nLine 2"); 583 assert_eq!(html_escape("Normal text"), "Normal text"); 584 } 585 586 #[test] 587 fn test_render_no_facets() { 588 let text = "This is a <test> description & it's great!"; 589 let limits = FacetLimits::default(); 590 let html = render_text_with_facets_html(text, None, &limits); 591 // HTML tags are detected and stripped by ammonia 592 // The <test> tag is removed entirely 593 assert_eq!(html, "This is a description &amp; it's great!"); 594 } 595 596 #[test] 597 fn test_render_with_html_tags() { 598 let text = "Check this <script>alert('XSS')</script> content!"; 599 let limits = FacetLimits::default(); 600 let html = render_text_with_facets_html(text, None, &limits); 601 // The script tag should be completely removed 602 assert_eq!(html, "Check this content!"); 603 assert!(!html.contains("script")); 604 assert!(!html.contains("alert")); 605 } 606 607 #[test] 608 fn test_render_with_mention() { 609 let text = "Contact @alice.bsky.social for details"; 610 let limits = FacetLimits::default(); 611 let facets = vec![Facet { 612 index: ByteSlice { 613 byte_start: 8, 614 byte_end: 26, 615 }, 616 features: vec![FacetFeature::Mention(Mention { 617 did: "did:plc:abc123".to_string(), 618 })], 619 }]; 620 621 let html = render_text_with_facets_html(text, Some(&facets), &limits); 622 assert_eq!( 623 html, 624 r#"Contact <a href="/did:plc:abc123">@alice.bsky.social</a> for details"# 625 ); 626 } 627 628 #[test] 629 fn test_render_with_link() { 630 let text = "Apply at https://example.com today!"; 631 let limits = FacetLimits::default(); 632 let facets = vec![Facet { 633 index: ByteSlice { 634 byte_start: 9, 635 byte_end: 28, 636 }, 637 features: vec![FacetFeature::Link(Link { 638 uri: "https://example.com".to_string(), 639 })], 640 }]; 641 642 let html = render_text_with_facets_html(text, Some(&facets), &limits); 643 assert_eq!( 644 html, 645 r#"Apply at <a href="https://example.com">https://example.com</a> today!"# 646 ); 647 } 648 649 #[test] 650 fn test_render_with_tag() { 651 let text = "Looking for #rust developers"; 652 let limits = FacetLimits::default(); 653 let facets = vec![Facet { 654 index: ByteSlice { 655 byte_start: 12, 656 byte_end: 17, 657 }, 658 features: vec![FacetFeature::Tag(Tag { 659 tag: "rust".to_string(), 660 })], 661 }]; 662 663 let html = render_text_with_facets_html(text, Some(&facets), &limits); 664 assert_eq!( 665 html, 666 r##"Looking for <a href="#rust">#rust</a> developers"## 667 ); 668 } 669 670 #[tokio::test] 671 async fn test_parse_facets_from_text_comprehensive() { 672 let mut resolver = MockIdentityResolver::new(); 673 resolver.add_identity("bob.test.com", "did:plc:bob456"); 674 675 let limits = FacetLimits::default(); 676 let text = "Join @alice.bsky.social and @bob.test.com at https://example.com #rust #golang"; 677 let facets = parse_facets_from_text(text, &resolver, &limits).await; 678 679 assert!(facets.is_some()); 680 let facets = facets.unwrap(); 681 assert_eq!(facets.len(), 5); // 2 mentions, 1 URL, 2 hashtags 682 683 // Check first mention 684 assert_eq!(facets[0].index.byte_start, 5); 685 assert_eq!(facets[0].index.byte_end, 23); 686 if let FacetFeature::Mention(ref mention) = facets[0].features[0] { 687 assert_eq!(mention.did, "did:plc:alice123"); 688 } else { 689 panic!("Expected Mention feature"); 690 } 691 692 // Check second mention 693 assert_eq!(facets[1].index.byte_start, 28); 694 assert_eq!(facets[1].index.byte_end, 41); 695 if let FacetFeature::Mention(ref mention) = facets[1].features[0] { 696 assert_eq!(mention.did, "did:plc:bob456"); 697 } else { 698 panic!("Expected Mention feature"); 699 } 700 701 // Check URL 702 assert_eq!(facets[2].index.byte_start, 45); 703 assert_eq!(facets[2].index.byte_end, 64); 704 if let FacetFeature::Link(ref link) = facets[2].features[0] { 705 assert_eq!(link.uri, "https://example.com"); 706 } else { 707 panic!("Expected Link feature"); 708 } 709 710 // Check first hashtag 711 assert_eq!(facets[3].index.byte_start, 65); 712 assert_eq!(facets[3].index.byte_end, 70); 713 if let FacetFeature::Tag(ref tag) = facets[3].features[0] { 714 assert_eq!(tag.tag, "rust"); 715 } else { 716 panic!("Expected Tag feature"); 717 } 718 719 // Check second hashtag 720 assert_eq!(facets[4].index.byte_start, 71); 721 assert_eq!(facets[4].index.byte_end, 78); 722 if let FacetFeature::Tag(ref tag) = facets[4].features[0] { 723 assert_eq!(tag.tag, "golang"); 724 } else { 725 panic!("Expected Tag feature"); 726 } 727 } 728 729 #[tokio::test] 730 async fn test_parse_facets_from_text_with_unresolvable_mention() { 731 let resolver = MockIdentityResolver::new(); 732 let limits = FacetLimits::default(); 733 734 // Only alice.bsky.social is in the resolver, not unknown.handle.com 735 let text = "Contact @unknown.handle.com for details #rust"; 736 let facets = parse_facets_from_text(text, &resolver, &limits).await; 737 738 assert!(facets.is_some()); 739 let facets = facets.unwrap(); 740 // Should only have 1 facet (the hashtag) since the mention couldn't be resolved 741 assert_eq!(facets.len(), 1); 742 743 // Check that it's the hashtag facet 744 if let FacetFeature::Tag(ref tag) = facets[0].features[0] { 745 assert_eq!(tag.tag, "rust"); 746 } else { 747 panic!("Expected Tag feature"); 748 } 749 } 750 751 #[tokio::test] 752 async fn test_parse_facets_from_text_empty() { 753 let resolver = MockIdentityResolver::new(); 754 let limits = FacetLimits::default(); 755 let text = "No mentions, URLs, or hashtags here"; 756 let facets = parse_facets_from_text(text, &resolver, &limits).await; 757 assert!(facets.is_none()); 758 } 759 760 #[tokio::test] 761 async fn test_parse_facets_from_text_url_with_at_mention() { 762 let resolver = MockIdentityResolver::new(); 763 let limits = FacetLimits::default(); 764 765 // URLs with @ should not create mention facets 766 let text = "Tangled https://tangled.org/@smokesignal.events"; 767 let facets = parse_facets_from_text(text, &resolver, &limits).await; 768 769 assert!(facets.is_some()); 770 let facets = facets.unwrap(); 771 772 // Should have exactly 1 facet (the URL), not 2 (URL + mention) 773 assert_eq!( 774 facets.len(), 775 1, 776 "Expected 1 facet (URL only), got {}", 777 facets.len() 778 ); 779 780 // Verify it's a link facet, not a mention 781 if let FacetFeature::Link(ref link) = facets[0].features[0] { 782 assert_eq!(link.uri, "https://tangled.org/@smokesignal.events"); 783 } else { 784 panic!("Expected Link feature, got Mention or Tag instead"); 785 } 786 } 787 788 #[tokio::test] 789 async fn test_parse_facets_with_mention_limit() { 790 let mut resolver = MockIdentityResolver::new(); 791 resolver.add_identity("bob.test.com", "did:plc:bob456"); 792 resolver.add_identity("charlie.test.com", "did:plc:charlie789"); 793 794 // Limit to 2 mentions 795 let limits = FacetLimits { 796 mentions_max: 2, 797 tags_max: 5, 798 links_max: 5, 799 max: 10, 800 }; 801 802 let text = "Join @alice.bsky.social @bob.test.com @charlie.test.com"; 803 let facets = parse_facets_from_text(text, &resolver, &limits).await; 804 805 assert!(facets.is_some()); 806 let facets = facets.unwrap(); 807 // Should only have 2 mentions (alice and bob), charlie should be skipped 808 assert_eq!(facets.len(), 2); 809 810 // Verify they're both mentions 811 for facet in &facets { 812 assert!(matches!(facet.features[0], FacetFeature::Mention(_))); 813 } 814 } 815 816 #[tokio::test] 817 async fn test_parse_facets_with_global_limit() { 818 let mut resolver = MockIdentityResolver::new(); 819 resolver.add_identity("bob.test.com", "did:plc:bob456"); 820 821 // Very restrictive global limit 822 let limits = FacetLimits { 823 mentions_max: 5, 824 tags_max: 5, 825 links_max: 5, 826 max: 3, // Only allow 3 total facets 827 }; 828 829 let text = 830 "Join @alice.bsky.social @bob.test.com at https://example.com #rust #golang #python"; 831 let facets = parse_facets_from_text(text, &resolver, &limits).await; 832 833 assert!(facets.is_some()); 834 let facets = facets.unwrap(); 835 // Should be truncated to 3 facets total 836 assert_eq!(facets.len(), 3); 837 } 838 839 #[test] 840 fn test_render_with_facet_limits() { 841 let text = "Contact @alice @bob @charlie for details"; 842 let limits = FacetLimits { 843 mentions_max: 2, // Only render first 2 mentions 844 tags_max: 5, 845 links_max: 5, 846 max: 10, 847 }; 848 849 let facets = vec![ 850 Facet { 851 index: ByteSlice { 852 byte_start: 8, 853 byte_end: 14, 854 }, 855 features: vec![FacetFeature::Mention(Mention { 856 did: "did:plc:alice".to_string(), 857 })], 858 }, 859 Facet { 860 index: ByteSlice { 861 byte_start: 15, 862 byte_end: 19, 863 }, 864 features: vec![FacetFeature::Mention(Mention { 865 did: "did:plc:bob".to_string(), 866 })], 867 }, 868 Facet { 869 index: ByteSlice { 870 byte_start: 20, 871 byte_end: 28, 872 }, 873 features: vec![FacetFeature::Mention(Mention { 874 did: "did:plc:charlie".to_string(), 875 })], 876 }, 877 ]; 878 879 let html = render_text_with_facets_html(text, Some(&facets), &limits); 880 // Should only render first 2 mentions, third should be plain text 881 assert!(html.contains(r#"<a href="/did:plc:alice">@alice</a>"#)); 882 assert!(html.contains(r#"<a href="/did:plc:bob">@bob</a>"#)); 883 // Charlie should NOT be a link due to mention limit 884 assert!(!html.contains(r#"<a href="/did:plc:charlie">"#)); 885 } 886 887 #[test] 888 fn test_parse_urls_multiple_links() { 889 let text = "IETF124 is happening in Montreal, Nov 1st to 7th https://www.ietf.org/meeting/124/\n\nWe're confirmed for two days of ATProto community sessions on Monday, Nov 3rd & Tuesday, Mov 4th at ECTO Co-Op. Many of us will also be participating in the free-to-attend IETF hackathon on Sunday, Nov 2nd.\n\nLatest updates and attendees in the forum https://discourse.atprotocol.community/t/update-on-timing-and-plan-for-montreal/164"; 890 891 let url_spans = parse_urls(text); 892 893 // Debug output 894 for (i, span) in url_spans.iter().enumerate() { 895 println!( 896 "URL {}: {} (start={}, end={})", 897 i, span.url, span.start, span.end 898 ); 899 } 900 901 // Should find both URLs 902 assert_eq!( 903 url_spans.len(), 904 2, 905 "Expected 2 URLs but found {}", 906 url_spans.len() 907 ); 908 909 if !url_spans.is_empty() { 910 assert_eq!(url_spans[0].url, "https://www.ietf.org/meeting/124/"); 911 } 912 913 if url_spans.len() >= 2 { 914 assert_eq!( 915 url_spans[1].url, 916 "https://discourse.atprotocol.community/t/update-on-timing-and-plan-for-montreal/164" 917 ); 918 } 919 } 920 921 #[test] 922 fn test_parse_urls_with_html_entity() { 923 // Test with the HTML entity &amp; in the text 924 let text = "IETF124 is happening in Montreal, Nov 1st to 7th https://www.ietf.org/meeting/124/\n\nWe're confirmed for two days of ATProto community sessions on Monday, Nov 3rd &amp; Tuesday, Mov 4th at ECTO Co-Op. Many of us will also be participating in the free-to-attend IETF hackathon on Sunday, Nov 2nd.\n\nLatest updates and attendees in the forum https://discourse.atprotocol.community/t/update-on-timing-and-plan-for-montreal/164"; 925 926 let url_spans = parse_urls(text); 927 928 // Debug output 929 for (i, span) in url_spans.iter().enumerate() { 930 println!( 931 "URL {}: {} (start={}, end={})", 932 i, span.url, span.start, span.end 933 ); 934 println!( 935 " Context before: {:?}", 936 &text[span.start.saturating_sub(10)..span.start] 937 ); 938 println!( 939 " Context after: {:?}", 940 &text[span.end..std::cmp::min(span.end + 10, text.len())] 941 ); 942 } 943 944 // Should find both URLs 945 assert_eq!( 946 url_spans.len(), 947 2, 948 "Expected 2 URLs but found {}", 949 url_spans.len() 950 ); 951 952 if !url_spans.is_empty() { 953 assert_eq!(url_spans[0].url, "https://www.ietf.org/meeting/124/"); 954 } 955 956 if url_spans.len() >= 2 { 957 assert_eq!( 958 url_spans[1].url, 959 "https://discourse.atprotocol.community/t/update-on-timing-and-plan-for-montreal/164" 960 ); 961 } 962 } 963 964 #[test] 965 fn test_render_malicious_link() { 966 let text = "Visit example.com for details"; 967 let limits = FacetLimits::default(); 968 let facets = vec![Facet { 969 index: ByteSlice { 970 byte_start: 6, 971 byte_end: 17, 972 }, 973 features: vec![FacetFeature::Link(Link { 974 uri: "javascript:alert('XSS')".to_string(), 975 })], 976 }]; 977 978 let html = render_text_with_facets_html(text, Some(&facets), &limits); 979 // JavaScript URLs should be blocked 980 assert!(!html.contains("javascript:")); 981 assert_eq!(html, "Visit example.com for details"); 982 } 983 984 #[test] 985 fn test_byte_offset_with_html_entities() { 986 // This test demonstrates that HTML entity escaping shifts byte positions. 987 // The byte positions shift: 988 // In original: '&' is at byte 8 (1 byte) 989 // In escaped: '&amp;' starts at byte 8 (5 bytes) 990 // This causes facet byte offsets to be misaligned if text is escaped before rendering. 991 992 // If we have a URL after the ampersand in the original: 993 let original_with_url = "Nov 3rd & Tuesday https://example.com"; 994 let escaped_with_url = "Nov 3rd &amp; Tuesday https://example.com"; 995 996 // Parse URLs from both versions 997 let original_urls = parse_urls(original_with_url); 998 let escaped_urls = parse_urls(escaped_with_url); 999 1000 println!("Original text: {:?}", original_with_url); 1001 println!( 1002 "Original URL found at: {:?}", 1003 original_urls.first().map(|u| (u.start, u.end)) 1004 ); 1005 println!("Escaped text: {:?}", escaped_with_url); 1006 println!( 1007 "Escaped URL found at: {:?}", 1008 escaped_urls.first().map(|u| (u.start, u.end)) 1009 ); 1010 1011 // Both should find the URL, but at different byte positions 1012 assert_eq!(original_urls.len(), 1); 1013 assert_eq!(escaped_urls.len(), 1); 1014 1015 // The byte positions will be different 1016 assert_eq!(original_urls[0].start, 18); // After "Nov 3rd & Tuesday " 1017 assert_eq!(escaped_urls[0].start, 22); // After "Nov 3rd &amp; Tuesday " (4 extra bytes for &amp;) 1018 } 1019 1020 #[test] 1021 fn test_render_facets_with_ampersand_in_text() { 1022 // Test case from the bug report: text with & that should have two URL facets 1023 let text = "IETF124 is happening in Montreal, Nov 1st to 7th https://www.ietf.org/meeting/124/\n\nWe're confirmed for two days of ATProto community sessions on Monday, Nov 3rd & Tuesday, Mov 4th at ECTO Co-Op. Many of us will also be participating in the free-to-attend IETF hackathon on Sunday, Nov 2nd.\n\nLatest updates and attendees in the forum https://discourse.atprotocol.community/t/update-on-timing-and-plan-for-montreal/164"; 1024 1025 // Parse facets from the original text 1026 let url_spans = parse_urls(text); 1027 assert_eq!(url_spans.len(), 2, "Should find 2 URLs"); 1028 1029 // Create facets from the parsed URLs 1030 let facets = vec![ 1031 Facet { 1032 index: ByteSlice { 1033 byte_start: url_spans[0].start, 1034 byte_end: url_spans[0].end, 1035 }, 1036 features: vec![FacetFeature::Link(Link { 1037 uri: url_spans[0].url.clone(), 1038 })], 1039 }, 1040 Facet { 1041 index: ByteSlice { 1042 byte_start: url_spans[1].start, 1043 byte_end: url_spans[1].end, 1044 }, 1045 features: vec![FacetFeature::Link(Link { 1046 uri: url_spans[1].url.clone(), 1047 })], 1048 }, 1049 ]; 1050 1051 // Render with facets - this should work correctly even with & in the text 1052 let limits = FacetLimits::default(); 1053 let html = render_text_with_facets_html(text, Some(&facets), &limits); 1054 1055 // Both URLs should be rendered as links 1056 assert!( 1057 html.contains(r#"<a href="https://www.ietf.org/meeting/124/""#), 1058 "First URL should be a link" 1059 ); 1060 assert!(html.contains(r#"<a href="https://discourse.atprotocol.community/t/update-on-timing-and-plan-for-montreal/164""#), "Second URL should be a link"); 1061 1062 // The ampersand should be HTML-escaped in the output 1063 assert!( 1064 html.contains("&amp;"), 1065 "Ampersand should be escaped in HTML output" 1066 ); 1067 1068 // Verify the links are properly closed 1069 assert_eq!( 1070 html.matches("</a>").count(), 1071 2, 1072 "Should have 2 closing </a> tags" 1073 ); 1074 } 1075 1076 #[test] 1077 fn test_render_with_out_of_bounds_facet() { 1078 // Regression test for panic: "range end index 324 out of range for slice of length 323" 1079 // This can happen when facets come from external AT Protocol data with incorrect byte offsets 1080 let text = "Hello world"; // 11 bytes 1081 let limits = FacetLimits::default(); 1082 1083 // Create a facet that extends beyond the text length 1084 let facets = vec![Facet { 1085 index: ByteSlice { 1086 byte_start: 6, 1087 byte_end: 20, // Beyond text length of 11 1088 }, 1089 features: vec![FacetFeature::Link(Link { 1090 uri: "https://example.com".to_string(), 1091 })], 1092 }]; 1093 1094 // This should NOT panic - invalid facets should be skipped 1095 let html = render_text_with_facets_html(text, Some(&facets), &limits); 1096 1097 // The text should still be rendered (escaped), just without the invalid facet 1098 assert_eq!(html, "Hello world"); 1099 } 1100 1101 #[test] 1102 fn test_render_with_facet_start_beyond_text() { 1103 // Test when facet start is beyond text length 1104 let text = "Short"; // 5 bytes 1105 let limits = FacetLimits::default(); 1106 1107 let facets = vec![Facet { 1108 index: ByteSlice { 1109 byte_start: 100, // Way beyond text length 1110 byte_end: 110, 1111 }, 1112 features: vec![FacetFeature::Link(Link { 1113 uri: "https://example.com".to_string(), 1114 })], 1115 }]; 1116 1117 // Should not panic 1118 let html = render_text_with_facets_html(text, Some(&facets), &limits); 1119 assert_eq!(html, "Short"); 1120 } 1121 1122 #[test] 1123 fn test_render_with_inverted_facet_indices() { 1124 // Test when byte_start > byte_end (invalid) 1125 let text = "Hello world"; 1126 let limits = FacetLimits::default(); 1127 1128 let facets = vec![Facet { 1129 index: ByteSlice { 1130 byte_start: 8, 1131 byte_end: 4, // Invalid: end before start 1132 }, 1133 features: vec![FacetFeature::Link(Link { 1134 uri: "https://example.com".to_string(), 1135 })], 1136 }]; 1137 1138 // Should not panic 1139 let html = render_text_with_facets_html(text, Some(&facets), &limits); 1140 assert_eq!(html, "Hello world"); 1141 } 1142 1143 #[test] 1144 fn test_parse_urls_from_atproto_record_text() { 1145 // Test parsing URLs from real AT Protocol record description text. 1146 // This demonstrates the correct byte positions that should be used for facets. 1147 let text = "Dev, Power Users, and Generally inquisitive folks get a completely unprofessionally amateur interview. Just a yap sesh where chat is part of the call!\n\n✨the daniel✨ & I will be on a Zoom call and I will stream out to https://stream.place/psingletary.com\n\nSubscribe to the publications! https://atprotocalls.leaflet.pub/"; 1148 1149 let url_spans = parse_urls(text); 1150 1151 assert_eq!(url_spans.len(), 2, "Should find 2 URLs"); 1152 1153 // First URL: https://stream.place/psingletary.com 1154 assert_eq!(url_spans[0].url, "https://stream.place/psingletary.com"); 1155 assert_eq!(url_spans[0].start, 221); 1156 assert_eq!(url_spans[0].end, 257); 1157 1158 // Second URL: https://atprotocalls.leaflet.pub/ 1159 assert_eq!(url_spans[1].url, "https://atprotocalls.leaflet.pub/"); 1160 assert_eq!(url_spans[1].start, 290); 1161 assert_eq!(url_spans[1].end, 323); 1162 1163 // Verify the byte slices match the expected text 1164 let text_bytes = text.as_bytes(); 1165 assert_eq!( 1166 std::str::from_utf8(&text_bytes[221..257]).unwrap(), 1167 "https://stream.place/psingletary.com" 1168 ); 1169 assert_eq!( 1170 std::str::from_utf8(&text_bytes[290..323]).unwrap(), 1171 "https://atprotocalls.leaflet.pub/" 1172 ); 1173 1174 // Note: The AT Protocol record had incorrect facet indices: 1175 // - First link: byteStart=222, byteEnd=258 (should be 221, 257) 1176 // - Second link: byteStart=291, byteEnd=324 (should be 290, 323) 1177 // This off-by-one error was in the source data, not our parser. 1178 } 1179 1180 #[test] 1181 fn test_render_with_off_by_one_facet_indices() { 1182 // Regression test for facets with off-by-one byte indices from external AT Protocol data. 1183 // The facets in this test have byteStart values that are 1 byte too high, causing 1184 // the first character of the URL to appear outside the link tag. 1185 // 1186 // This test documents the current behavior: the renderer faithfully applies facets 1187 // at the specified byte positions, even if those positions are incorrect. 1188 // The root cause is incorrect facet generation by the client that created the record. 1189 let text = "Dev, Power Users, and Generally inquisitive folks get a completely unprofessionally amateur interview. Just a yap sesh where chat is part of the call!\n\n✨the daniel✨ & I will be on a Zoom call and I will stream out to https://stream.place/psingletary.com\n\nSubscribe to the publications! https://atprotocalls.leaflet.pub/"; 1190 1191 // Verify text length - the second facet's byte_end (324) exceeds this 1192 assert_eq!(text.len(), 323, "Text should be 323 bytes"); 1193 1194 let limits = FacetLimits::default(); 1195 1196 // These facets have incorrect byte indices (off by 1) - this is real data from AT Protocol 1197 let facets = vec![ 1198 Facet { 1199 index: ByteSlice { 1200 byte_start: 222, // Should be 221 1201 byte_end: 258, // Should be 257 (but 258 is within bounds) 1202 }, 1203 features: vec![FacetFeature::Link(Link { 1204 uri: "https://stream.place/psingletary.com".to_string(), 1205 })], 1206 }, 1207 Facet { 1208 index: ByteSlice { 1209 byte_start: 291, // Should be 290 1210 byte_end: 324, // Should be 323 - but 324 > text.len() so this facet is SKIPPED 1211 }, 1212 features: vec![FacetFeature::Link(Link { 1213 uri: "https://atprotocalls.leaflet.pub/".to_string(), 1214 })], 1215 }, 1216 ]; 1217 1218 let html = render_text_with_facets_html(text, Some(&facets), &limits); 1219 1220 // Due to off-by-one facet indices, the 'h' from 'https' appears before the link tag 1221 assert!( 1222 html.contains(r#"stream out to h<a href="https://stream.place/psingletary.com""#), 1223 "First link should have 'h' outside due to off-by-one error. Got: {}", 1224 html 1225 ); 1226 1227 // The second facet is SKIPPED entirely because byte_end (324) > text.len() (323) 1228 // This is the bounds check in render_text_with_facets_html preventing out-of-bounds access 1229 assert!( 1230 !html.contains(r#"<a href="https://atprotocalls.leaflet.pub/""#), 1231 "Second link should NOT be rendered because facet is out of bounds. Got: {}", 1232 html 1233 ); 1234 assert!( 1235 html.contains("https://atprotocalls.leaflet.pub/"), 1236 "Second URL should appear as plain text. Got: {}", 1237 html 1238 ); 1239 1240 // Verify correct byte positions from our parser 1241 let url_spans = parse_urls(text); 1242 assert_eq!(url_spans.len(), 2, "Should find 2 URLs"); 1243 1244 // The correct byte positions from our parser 1245 assert_eq!( 1246 url_spans[0].start, 221, 1247 "First URL should start at byte 221, not 222" 1248 ); 1249 assert_eq!( 1250 url_spans[0].end, 257, 1251 "First URL should end at byte 257, not 258" 1252 ); 1253 assert_eq!( 1254 url_spans[1].start, 290, 1255 "Second URL should start at byte 290, not 291" 1256 ); 1257 assert_eq!( 1258 url_spans[1].end, 323, 1259 "Second URL should end at byte 323, not 324" 1260 ); 1261 } 1262}