The smokesignal.events web application
1//! Rich text facet structures and rendering for AT Protocol.
2//!
3//! This module provides structures for handling rich text facets (mentions, links, hashtags),
4//! parsing them from text, and rendering them as HTML for display in the UI.
5//!
6//! # Byte Offset Calculation
7//!
8//! This implementation correctly uses UTF-8 byte offsets as required by AT Protocol.
9//! The facets use "inclusive start and exclusive end" byte ranges. All parsing is done
10//! using `regex::bytes::Regex` which operates on byte slices and returns byte positions,
11//! ensuring correct handling of multi-byte UTF-8 characters (emojis, CJK, accented chars).
12
13use atproto_identity::resolve::IdentityResolver;
14use atproto_record::lexicon::app::bsky::richtext::facet::{
15 ByteSlice, Facet, FacetFeature, Link, Mention, Tag,
16};
17use regex::bytes::Regex;
18use std::fmt::Write;
19
/// Configuration for facet parsing and rendering limits.
///
/// Each per-kind field caps one facet type independently, while `max` caps the
/// combined number of facets. The type is a small `Copy` value and can be
/// compared for equality.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct FacetLimits {
    /// Maximum number of mention facets to process (default: 5)
    pub mentions_max: usize,
    /// Maximum number of tag facets to process (default: 5)
    pub tags_max: usize,
    /// Maximum number of link facets to process (default: 5)
    pub links_max: usize,
    /// Maximum total number of facets to process (default: 10)
    pub max: usize,
}
32
33impl Default for FacetLimits {
34 fn default() -> Self {
35 Self {
36 mentions_max: 5,
37 tags_max: 5,
38 links_max: 5,
39 max: 10,
40 }
41 }
42}
43
/// Mention span with byte positions and handle.
///
/// `start..end` is a UTF-8 byte range (inclusive start, exclusive end) that
/// covers the whole mention including the leading `@`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct MentionSpan {
    /// Byte offset of the `@` character.
    pub start: usize,
    /// Byte offset one past the last byte of the mention.
    pub end: usize,
    /// The mentioned handle, without the leading `@`.
    pub handle: String,
}
51
/// URL span with byte positions and URL.
///
/// `start..end` is a UTF-8 byte range (inclusive start, exclusive end) that
/// covers exactly the text stored in `url`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct UrlSpan {
    /// Byte offset of the first byte of the URL.
    pub start: usize,
    /// Byte offset one past the last byte of the URL.
    pub end: usize,
    /// The matched URL text.
    pub url: String,
}
59
/// Tag span with byte positions and tag text.
///
/// `start..end` is a UTF-8 byte range (inclusive start, exclusive end) that
/// covers the hash symbol and the tag text that follows it.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct TagSpan {
    /// Byte offset of the hash symbol.
    pub start: usize,
    /// Byte offset one past the last byte of the tag text.
    pub end: usize,
    /// The tag text, without the hash symbol.
    pub tag: String,
}
67
68/// Parse mentions from text and return their byte positions
69/// This function excludes mentions that appear within URLs
70pub fn parse_mentions(text: &str) -> Vec<MentionSpan> {
71 let mut spans = Vec::new();
72
73 // First, parse all URLs to exclude mention matches within them
74 let url_spans = parse_urls(text);
75
76 // Regex based on: https://atproto.com/specs/handle#handle-identifier-syntax
77 // Pattern: [$|\W](@([a-zA-Z0-9]([a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?\.)+[a-zA-Z]([a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)
78 let mention_regex = Regex::new(
79 r"(?:^|[^\w])(@([a-zA-Z0-9]([a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?\.)+[a-zA-Z]([a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)"
80 ).unwrap();
81
82 let text_bytes = text.as_bytes();
83 for capture in mention_regex.captures_iter(text_bytes) {
84 if let Some(mention_match) = capture.get(1) {
85 let start = mention_match.start();
86 let end = mention_match.end();
87
88 // Check if this mention overlaps with any URL
89 let overlaps_url = url_spans.iter().any(|url| {
90 // Check if mention is within or overlaps the URL span
91 (start >= url.start && start < url.end) || (end > url.start && end <= url.end)
92 });
93
94 // Only add the mention if it doesn't overlap with a URL
95 if !overlaps_url {
96 let handle = std::str::from_utf8(&mention_match.as_bytes()[1..])
97 .unwrap_or_default()
98 .to_string();
99
100 spans.push(MentionSpan { start, end, handle });
101 }
102 }
103 }
104
105 spans
106}
107
108/// Parse URLs from text and return their byte positions
109pub fn parse_urls(text: &str) -> Vec<UrlSpan> {
110 let mut spans = Vec::new();
111
112 // Partial/naive URL regex based on: https://stackoverflow.com/a/3809435
113 // Pattern: [$|\W](https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]+\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*[-a-zA-Z0-9@%_\+~#//=])?)
114 // Modified to use + instead of {1,6} to support longer TLDs and multi-level subdomains
115 let url_regex = Regex::new(
116 r"(?:^|[^\w])(https?://(?:www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]+\b(?:[-a-zA-Z0-9()@:%_\+.~#?&//=]*[-a-zA-Z0-9@%_\+~#//=])?)"
117 ).unwrap();
118
119 let text_bytes = text.as_bytes();
120 for capture in url_regex.captures_iter(text_bytes) {
121 if let Some(url_match) = capture.get(1) {
122 let url = std::str::from_utf8(url_match.as_bytes())
123 .unwrap_or_default()
124 .to_string();
125
126 spans.push(UrlSpan {
127 start: url_match.start(),
128 end: url_match.end(),
129 url,
130 });
131 }
132 }
133
134 spans
135}
136
137/// Parse hashtags from text and return their byte positions
138pub fn parse_tags(text: &str) -> Vec<TagSpan> {
139 let mut spans = Vec::new();
140
141 // Regex based on: https://github.com/bluesky-social/atproto/blob/d91988fe79030b61b556dd6f16a46f0c3b9d0b44/packages/api/src/rich-text/util.ts
142 // Simplified for Rust - matches hashtags at word boundaries
143 // Pattern matches: start of string or non-word char, then # or #, then tag content
144 let tag_regex = Regex::new(r"(?:^|[^\w])([##])([\w]+(?:[\w]*)*)").unwrap();
145
146 let text_bytes = text.as_bytes();
147
148 // Work with bytes for proper position tracking
149 for capture in tag_regex.captures_iter(text_bytes) {
150 if let (Some(full_match), Some(hash_match), Some(tag_match)) =
151 (capture.get(0), capture.get(1), capture.get(2))
152 {
153 // Calculate the absolute byte position of the hash symbol
154 // The full match includes the preceding character (if any)
155 // so we need to adjust for that
156 let match_start = full_match.start();
157 let hash_offset = hash_match.start() - full_match.start();
158 let start = match_start + hash_offset;
159 let end = match_start + hash_offset + hash_match.len() + tag_match.len();
160
161 // Extract just the tag text (without the hash symbol)
162 // Normalize to lowercase for case-insensitive tag matching
163 let tag = std::str::from_utf8(tag_match.as_bytes()).unwrap_or_default();
164
165 // Only include tags that are not purely numeric
166 if !tag.chars().all(|c| c.is_ascii_digit()) {
167 spans.push(TagSpan {
168 start,
169 end,
170 tag: tag.to_string(),
171 });
172 }
173 }
174 }
175
176 spans
177}
178
179/// Parse facets from text and return a vector of Facet objects.
180///
181/// This function extracts mentions, URLs, and hashtags from the provided text
182/// and creates AT Protocol facets with proper byte indices.
183///
184/// Mentions are resolved to actual DIDs using the provided identity resolver.
185/// If a handle cannot be resolved to a DID, the mention facet is skipped.
186///
187/// # Arguments
188/// * `text` - The text to extract facets from
189/// * `identity_resolver` - Resolver for converting handles to DIDs
190/// * `limits` - Configuration for maximum facets per type and total
191///
192/// # Returns
193/// Optional vector of facets. Returns None if no facets were found.
194pub async fn parse_facets_from_text(
195 text: &str,
196 identity_resolver: &dyn IdentityResolver,
197 limits: &FacetLimits,
198) -> Option<Vec<Facet>> {
199 let mut facets = Vec::new();
200
201 // Parse mentions (limited by mentions_max)
202 let mention_spans = parse_mentions(text);
203 let mut mention_count = 0;
204 for mention in mention_spans {
205 if mention_count >= limits.mentions_max {
206 break;
207 }
208
209 // Try to resolve the handle to a DID
210 // First try with at:// prefix, then without
211 let at_uri = format!("at://{}", mention.handle);
212 let did_result = match identity_resolver.resolve(&at_uri).await {
213 Ok(doc) => Ok(doc),
214 Err(_) => identity_resolver.resolve(&mention.handle).await,
215 };
216
217 // Only add the mention facet if we successfully resolved the DID
218 if let Ok(did_doc) = did_result {
219 facets.push(Facet {
220 index: ByteSlice {
221 byte_start: mention.start,
222 byte_end: mention.end,
223 },
224 features: vec![FacetFeature::Mention(Mention {
225 did: did_doc.id.to_string(),
226 })],
227 });
228 mention_count += 1;
229 }
230 // If resolution fails, we skip this mention facet entirely
231 }
232
233 // Parse URLs (limited by links_max)
234 let url_spans = parse_urls(text);
235 for (idx, url) in url_spans.into_iter().enumerate() {
236 if idx >= limits.links_max {
237 break;
238 }
239 facets.push(Facet {
240 index: ByteSlice {
241 byte_start: url.start,
242 byte_end: url.end,
243 },
244 features: vec![FacetFeature::Link(Link { uri: url.url })],
245 });
246 }
247
248 // Parse hashtags (limited by tags_max)
249 let tag_spans = parse_tags(text);
250 for (idx, tag_span) in tag_spans.into_iter().enumerate() {
251 if idx >= limits.tags_max {
252 break;
253 }
254 facets.push(Facet {
255 index: ByteSlice {
256 byte_start: tag_span.start,
257 byte_end: tag_span.end,
258 },
259 features: vec![FacetFeature::Tag(Tag { tag: tag_span.tag })],
260 });
261 }
262
263 // Apply global facet limit (truncate if exceeds max)
264 if facets.len() > limits.max {
265 facets.truncate(limits.max);
266 }
267
268 // Only return facets if we found any
269 if !facets.is_empty() {
270 Some(facets)
271 } else {
272 None
273 }
274}
275
/// HTML escape helper function.
///
/// Replaces the five characters that are significant in HTML text and
/// attribute values (`&`, `<`, `>`, `"`, `'`) with character references and
/// copies every other character through unchanged.
fn html_escape(text: &str) -> String {
    // Escaped output is at least as long as the input.
    let mut escaped = String::with_capacity(text.len());
    for c in text.chars() {
        match c {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            '"' => escaped.push_str("&quot;"),
            '\'' => escaped.push_str("&#x27;"),
            other => escaped.push(other),
        }
    }
    escaped
}
289
/// Report whether `text` contains something that looks like an HTML tag.
///
/// The check is purely lexical: it returns `true` as soon as a `<` is
/// immediately followed by an ASCII letter, `/`, or `!` (covering opening
/// tags, closing tags, and comments/doctypes). A lone `<`, or one followed by
/// anything else, does not count.
fn contains_html_tags(text: &str) -> bool {
    // Walk every pair of adjacent characters: (current, following).
    text.chars()
        .zip(text.chars().skip(1))
        .any(|(current, following)| {
            current == '<'
                && (following.is_ascii_alphabetic() || following == '/' || following == '!')
        })
}
308
309/// Render text with facets as HTML.
310///
311/// This function converts plain text with facet annotations into HTML with proper
312/// links for mentions, URLs, and hashtags based on the facet information.
313///
314/// # HTML Output
315/// - Mentions: `<a href="/[did]">@handle</a>`
316/// - Links: `<a href="[url]" target="_blank" rel="noopener noreferrer">[url]</a>`
317/// - Tags: `<a href="#[tag]">#tag</a>`
318/// - Regular text is HTML-escaped for security
319///
320/// # Arguments
321/// * `text` - The plain text content
322/// * `facets` - Optional facets to apply to the text
323/// * `limits` - Configuration for maximum facets per type and total
324///
325/// # Returns
326/// HTML string with facets rendered as links
327pub fn render_text_with_facets_html(
328 text: &str,
329 facets: Option<&Vec<Facet>>,
330 limits: &FacetLimits,
331) -> String {
332 // First, check if the text contains HTML tags
333 // If it does, treat it as suspicious and just clean it without applying facets
334 if contains_html_tags(text) {
335 // Use ammonia to strip ALL HTML and return plain text
336 let cleaned = ammonia::clean(text);
337 // Convert newlines to <br> tags after cleaning
338 return cleaned.replace('\n', "<br>");
339 }
340
341 let text_bytes = text.as_bytes();
342
343 // If no facets, just return escaped text
344 let Some(facets) = facets else {
345 return html_escape(text);
346 };
347
348 // Sort facets by start position to process them in order
349 let mut sorted_facets: Vec<_> = facets.iter().collect();
350 sorted_facets.sort_by_key(|f| f.index.byte_start);
351
352 // Apply limits: count facets by type and limit total
353 let mut mention_count = 0;
354 let mut link_count = 0;
355 let mut tag_count = 0;
356 let mut total_count = 0;
357
358 let filtered_facets: Vec<_> = sorted_facets
359 .into_iter()
360 .filter(|facet| {
361 if total_count >= limits.max {
362 return false;
363 }
364
365 // Check facet type and apply per-type limits
366 let should_include = facet.features.first().is_some_and(|feature| match feature {
367 FacetFeature::Mention(_) if mention_count < limits.mentions_max => {
368 mention_count += 1;
369 true
370 }
371 FacetFeature::Link(_) if link_count < limits.links_max => {
372 link_count += 1;
373 true
374 }
375 FacetFeature::Tag(_) if tag_count < limits.tags_max => {
376 tag_count += 1;
377 true
378 }
379 _ => false,
380 });
381
382 if should_include {
383 total_count += 1;
384 }
385
386 should_include
387 })
388 .collect();
389
390 let mut html = String::new();
391 let mut last_end = 0;
392 let text_len = text_bytes.len();
393
394 for facet in filtered_facets {
395 // Validate facet indices are within bounds - skip invalid facets
396 if facet.index.byte_start > text_len
397 || facet.index.byte_end > text_len
398 || facet.index.byte_start > facet.index.byte_end
399 {
400 continue;
401 }
402
403 // Add any text before this facet (HTML-escaped)
404 if facet.index.byte_start > last_end {
405 let text_before =
406 std::str::from_utf8(&text_bytes[last_end..facet.index.byte_start]).unwrap_or("");
407 html.push_str(&html_escape(text_before));
408 }
409
410 // Get the text covered by this facet
411 let facet_text =
412 std::str::from_utf8(&text_bytes[facet.index.byte_start..facet.index.byte_end])
413 .unwrap_or("");
414
415 // Process the facet based on its feature type
416 // Only process the first feature (in practice, there should only be one per facet)
417 if let Some(feature) = facet.features.first() {
418 match feature {
419 FacetFeature::Mention(mention) => {
420 write!(
421 &mut html,
422 r#"<a href="/{}">{}</a>"#,
423 html_escape(&mention.did),
424 html_escape(facet_text)
425 )
426 .unwrap();
427 }
428 FacetFeature::Link(link) => {
429 // Only create link tags for safe URLs
430 if link.uri.starts_with("http://")
431 || link.uri.starts_with("https://")
432 || link.uri.starts_with("/")
433 {
434 write!(
435 &mut html,
436 r#"<a href="{}" target="_blank" rel="noopener noreferrer">{}</a>"#,
437 html_escape(&link.uri),
438 html_escape(facet_text)
439 )
440 .unwrap();
441 } else {
442 // For unsafe URLs (like javascript:), just render as plain text
443 html.push_str(&html_escape(facet_text));
444 }
445 }
446 FacetFeature::Tag(tag) => {
447 // URL-encode the tag for the href attribute
448 let encoded_tag = urlencoding::encode(&tag.tag);
449 write!(
450 &mut html,
451 r##"<a href="#{}">{}</a>"##,
452 encoded_tag,
453 html_escape(facet_text)
454 )
455 .unwrap();
456 }
457 }
458 }
459
460 last_end = facet.index.byte_end;
461 }
462
463 // Add any remaining text after the last facet
464 if last_end < text_bytes.len() {
465 let remaining_text = std::str::from_utf8(&text_bytes[last_end..]).unwrap_or("");
466 html.push_str(&html_escape(remaining_text));
467 }
468
469 // Sanitize the final HTML output to ensure safety
470 // Configure ammonia to only allow <a> tags with specific attributes
471 let mut builder = ammonia::Builder::new();
472 builder
473 .tags(std::collections::HashSet::from(["a", "br"]))
474 // Don't automatically add rel="nofollow" - we'll handle it in the attribute filter
475 .link_rel(None)
476 // Allow relative URLs (for internal links like /u/... and /t/...)
477 .url_relative(ammonia::UrlRelative::PassThrough)
478 .attribute_filter(|element, attribute, value| match (element, attribute) {
479 ("a", "href") => {
480 // Only allow safe URLs: relative paths starting with /, or http(s) URLs
481 if value.starts_with('/')
482 || value.starts_with("http://")
483 || value.starts_with("https://")
484 || value.starts_with("#")
485 {
486 Some(value.into())
487 } else {
488 None
489 }
490 }
491 ("a", "target") => {
492 if value == "_blank" {
493 Some(value.into())
494 } else {
495 None
496 }
497 }
498 ("a", "rel") => {
499 // For external links, ensure nofollow is present
500 if value.contains("noopener") || value.contains("noreferrer") {
501 // Keep the existing rel value but add nofollow if not present
502 if !value.contains("nofollow") {
503 Some(format!("{} nofollow", value).into())
504 } else {
505 Some(value.into())
506 }
507 } else {
508 // Just nofollow for other cases
509 Some("nofollow".into())
510 }
511 }
512 ("br", _) => None, // br tags don't have attributes
513 _ => None,
514 });
515
516 builder.clean(&html).to_string()
517}
518
519#[cfg(test)]
520mod tests {
521 use async_trait::async_trait;
522 use atproto_identity::model::Document;
523 use atproto_record::lexicon::app::bsky::richtext::facet::{ByteSlice, Link, Mention, Tag};
524 use std::collections::HashMap;
525
526 use super::*;
527
528 /// Mock identity resolver for testing
529 struct MockIdentityResolver {
530 handles_to_dids: HashMap<String, String>,
531 }
532
533 impl MockIdentityResolver {
534 fn new() -> Self {
535 let mut handles_to_dids = HashMap::new();
536 handles_to_dids.insert(
537 "alice.bsky.social".to_string(),
538 "did:plc:alice123".to_string(),
539 );
540 handles_to_dids.insert(
541 "at://alice.bsky.social".to_string(),
542 "did:plc:alice123".to_string(),
543 );
544 Self { handles_to_dids }
545 }
546
547 fn add_identity(&mut self, handle: &str, did: &str) {
548 self.handles_to_dids
549 .insert(handle.to_string(), did.to_string());
550 self.handles_to_dids
551 .insert(format!("at://{}", handle), did.to_string());
552 }
553 }
554
555 #[async_trait]
556 impl IdentityResolver for MockIdentityResolver {
557 async fn resolve(&self, handle: &str) -> anyhow::Result<Document> {
558 let handle_key = handle.to_string();
559
560 if let Some(did) = self.handles_to_dids.get(&handle_key) {
561 Ok(Document {
562 context: vec![],
563 id: did.clone(),
564 also_known_as: vec![format!("at://{}", handle_key.trim_start_matches("at://"))],
565 verification_method: vec![],
566 service: vec![],
567 extra: HashMap::new(),
568 })
569 } else {
570 Err(anyhow::anyhow!("Handle not found"))
571 }
572 }
573 }
574
575 #[test]
576 fn test_html_escape() {
577 assert_eq!(html_escape("Hello & <world>"), "Hello & <world>");
578 assert_eq!(
579 html_escape("\"quotes\" and 'apostrophes'"),
580 ""quotes" and 'apostrophes'"
581 );
582 assert_eq!(html_escape("Line 1\nLine 2"), "Line 1\nLine 2");
583 assert_eq!(html_escape("Normal text"), "Normal text");
584 }
585
586 #[test]
587 fn test_render_no_facets() {
588 let text = "This is a <test> description & it's great!";
589 let limits = FacetLimits::default();
590 let html = render_text_with_facets_html(text, None, &limits);
591 // HTML tags are detected and stripped by ammonia
592 // The <test> tag is removed entirely
593 assert_eq!(html, "This is a description & it's great!");
594 }
595
596 #[test]
597 fn test_render_with_html_tags() {
598 let text = "Check this <script>alert('XSS')</script> content!";
599 let limits = FacetLimits::default();
600 let html = render_text_with_facets_html(text, None, &limits);
601 // The script tag should be completely removed
602 assert_eq!(html, "Check this content!");
603 assert!(!html.contains("script"));
604 assert!(!html.contains("alert"));
605 }
606
607 #[test]
608 fn test_render_with_mention() {
609 let text = "Contact @alice.bsky.social for details";
610 let limits = FacetLimits::default();
611 let facets = vec![Facet {
612 index: ByteSlice {
613 byte_start: 8,
614 byte_end: 26,
615 },
616 features: vec![FacetFeature::Mention(Mention {
617 did: "did:plc:abc123".to_string(),
618 })],
619 }];
620
621 let html = render_text_with_facets_html(text, Some(&facets), &limits);
622 assert_eq!(
623 html,
624 r#"Contact <a href="/did:plc:abc123">@alice.bsky.social</a> for details"#
625 );
626 }
627
628 #[test]
629 fn test_render_with_link() {
630 let text = "Apply at https://example.com today!";
631 let limits = FacetLimits::default();
632 let facets = vec![Facet {
633 index: ByteSlice {
634 byte_start: 9,
635 byte_end: 28,
636 },
637 features: vec![FacetFeature::Link(Link {
638 uri: "https://example.com".to_string(),
639 })],
640 }];
641
642 let html = render_text_with_facets_html(text, Some(&facets), &limits);
643 assert_eq!(
644 html,
645 r#"Apply at <a href="https://example.com">https://example.com</a> today!"#
646 );
647 }
648
649 #[test]
650 fn test_render_with_tag() {
651 let text = "Looking for #rust developers";
652 let limits = FacetLimits::default();
653 let facets = vec![Facet {
654 index: ByteSlice {
655 byte_start: 12,
656 byte_end: 17,
657 },
658 features: vec![FacetFeature::Tag(Tag {
659 tag: "rust".to_string(),
660 })],
661 }];
662
663 let html = render_text_with_facets_html(text, Some(&facets), &limits);
664 assert_eq!(
665 html,
666 r##"Looking for <a href="#rust">#rust</a> developers"##
667 );
668 }
669
670 #[tokio::test]
671 async fn test_parse_facets_from_text_comprehensive() {
672 let mut resolver = MockIdentityResolver::new();
673 resolver.add_identity("bob.test.com", "did:plc:bob456");
674
675 let limits = FacetLimits::default();
676 let text = "Join @alice.bsky.social and @bob.test.com at https://example.com #rust #golang";
677 let facets = parse_facets_from_text(text, &resolver, &limits).await;
678
679 assert!(facets.is_some());
680 let facets = facets.unwrap();
681 assert_eq!(facets.len(), 5); // 2 mentions, 1 URL, 2 hashtags
682
683 // Check first mention
684 assert_eq!(facets[0].index.byte_start, 5);
685 assert_eq!(facets[0].index.byte_end, 23);
686 if let FacetFeature::Mention(ref mention) = facets[0].features[0] {
687 assert_eq!(mention.did, "did:plc:alice123");
688 } else {
689 panic!("Expected Mention feature");
690 }
691
692 // Check second mention
693 assert_eq!(facets[1].index.byte_start, 28);
694 assert_eq!(facets[1].index.byte_end, 41);
695 if let FacetFeature::Mention(ref mention) = facets[1].features[0] {
696 assert_eq!(mention.did, "did:plc:bob456");
697 } else {
698 panic!("Expected Mention feature");
699 }
700
701 // Check URL
702 assert_eq!(facets[2].index.byte_start, 45);
703 assert_eq!(facets[2].index.byte_end, 64);
704 if let FacetFeature::Link(ref link) = facets[2].features[0] {
705 assert_eq!(link.uri, "https://example.com");
706 } else {
707 panic!("Expected Link feature");
708 }
709
710 // Check first hashtag
711 assert_eq!(facets[3].index.byte_start, 65);
712 assert_eq!(facets[3].index.byte_end, 70);
713 if let FacetFeature::Tag(ref tag) = facets[3].features[0] {
714 assert_eq!(tag.tag, "rust");
715 } else {
716 panic!("Expected Tag feature");
717 }
718
719 // Check second hashtag
720 assert_eq!(facets[4].index.byte_start, 71);
721 assert_eq!(facets[4].index.byte_end, 78);
722 if let FacetFeature::Tag(ref tag) = facets[4].features[0] {
723 assert_eq!(tag.tag, "golang");
724 } else {
725 panic!("Expected Tag feature");
726 }
727 }
728
729 #[tokio::test]
730 async fn test_parse_facets_from_text_with_unresolvable_mention() {
731 let resolver = MockIdentityResolver::new();
732 let limits = FacetLimits::default();
733
734 // Only alice.bsky.social is in the resolver, not unknown.handle.com
735 let text = "Contact @unknown.handle.com for details #rust";
736 let facets = parse_facets_from_text(text, &resolver, &limits).await;
737
738 assert!(facets.is_some());
739 let facets = facets.unwrap();
740 // Should only have 1 facet (the hashtag) since the mention couldn't be resolved
741 assert_eq!(facets.len(), 1);
742
743 // Check that it's the hashtag facet
744 if let FacetFeature::Tag(ref tag) = facets[0].features[0] {
745 assert_eq!(tag.tag, "rust");
746 } else {
747 panic!("Expected Tag feature");
748 }
749 }
750
751 #[tokio::test]
752 async fn test_parse_facets_from_text_empty() {
753 let resolver = MockIdentityResolver::new();
754 let limits = FacetLimits::default();
755 let text = "No mentions, URLs, or hashtags here";
756 let facets = parse_facets_from_text(text, &resolver, &limits).await;
757 assert!(facets.is_none());
758 }
759
760 #[tokio::test]
761 async fn test_parse_facets_from_text_url_with_at_mention() {
762 let resolver = MockIdentityResolver::new();
763 let limits = FacetLimits::default();
764
765 // URLs with @ should not create mention facets
766 let text = "Tangled https://tangled.org/@smokesignal.events";
767 let facets = parse_facets_from_text(text, &resolver, &limits).await;
768
769 assert!(facets.is_some());
770 let facets = facets.unwrap();
771
772 // Should have exactly 1 facet (the URL), not 2 (URL + mention)
773 assert_eq!(
774 facets.len(),
775 1,
776 "Expected 1 facet (URL only), got {}",
777 facets.len()
778 );
779
780 // Verify it's a link facet, not a mention
781 if let FacetFeature::Link(ref link) = facets[0].features[0] {
782 assert_eq!(link.uri, "https://tangled.org/@smokesignal.events");
783 } else {
784 panic!("Expected Link feature, got Mention or Tag instead");
785 }
786 }
787
788 #[tokio::test]
789 async fn test_parse_facets_with_mention_limit() {
790 let mut resolver = MockIdentityResolver::new();
791 resolver.add_identity("bob.test.com", "did:plc:bob456");
792 resolver.add_identity("charlie.test.com", "did:plc:charlie789");
793
794 // Limit to 2 mentions
795 let limits = FacetLimits {
796 mentions_max: 2,
797 tags_max: 5,
798 links_max: 5,
799 max: 10,
800 };
801
802 let text = "Join @alice.bsky.social @bob.test.com @charlie.test.com";
803 let facets = parse_facets_from_text(text, &resolver, &limits).await;
804
805 assert!(facets.is_some());
806 let facets = facets.unwrap();
807 // Should only have 2 mentions (alice and bob), charlie should be skipped
808 assert_eq!(facets.len(), 2);
809
810 // Verify they're both mentions
811 for facet in &facets {
812 assert!(matches!(facet.features[0], FacetFeature::Mention(_)));
813 }
814 }
815
816 #[tokio::test]
817 async fn test_parse_facets_with_global_limit() {
818 let mut resolver = MockIdentityResolver::new();
819 resolver.add_identity("bob.test.com", "did:plc:bob456");
820
821 // Very restrictive global limit
822 let limits = FacetLimits {
823 mentions_max: 5,
824 tags_max: 5,
825 links_max: 5,
826 max: 3, // Only allow 3 total facets
827 };
828
829 let text =
830 "Join @alice.bsky.social @bob.test.com at https://example.com #rust #golang #python";
831 let facets = parse_facets_from_text(text, &resolver, &limits).await;
832
833 assert!(facets.is_some());
834 let facets = facets.unwrap();
835 // Should be truncated to 3 facets total
836 assert_eq!(facets.len(), 3);
837 }
838
839 #[test]
840 fn test_render_with_facet_limits() {
841 let text = "Contact @alice @bob @charlie for details";
842 let limits = FacetLimits {
843 mentions_max: 2, // Only render first 2 mentions
844 tags_max: 5,
845 links_max: 5,
846 max: 10,
847 };
848
849 let facets = vec![
850 Facet {
851 index: ByteSlice {
852 byte_start: 8,
853 byte_end: 14,
854 },
855 features: vec![FacetFeature::Mention(Mention {
856 did: "did:plc:alice".to_string(),
857 })],
858 },
859 Facet {
860 index: ByteSlice {
861 byte_start: 15,
862 byte_end: 19,
863 },
864 features: vec![FacetFeature::Mention(Mention {
865 did: "did:plc:bob".to_string(),
866 })],
867 },
868 Facet {
869 index: ByteSlice {
870 byte_start: 20,
871 byte_end: 28,
872 },
873 features: vec![FacetFeature::Mention(Mention {
874 did: "did:plc:charlie".to_string(),
875 })],
876 },
877 ];
878
879 let html = render_text_with_facets_html(text, Some(&facets), &limits);
880 // Should only render first 2 mentions, third should be plain text
881 assert!(html.contains(r#"<a href="/did:plc:alice">@alice</a>"#));
882 assert!(html.contains(r#"<a href="/did:plc:bob">@bob</a>"#));
883 // Charlie should NOT be a link due to mention limit
884 assert!(!html.contains(r#"<a href="/did:plc:charlie">"#));
885 }
886
887 #[test]
888 fn test_parse_urls_multiple_links() {
889 let text = "IETF124 is happening in Montreal, Nov 1st to 7th https://www.ietf.org/meeting/124/\n\nWe're confirmed for two days of ATProto community sessions on Monday, Nov 3rd & Tuesday, Mov 4th at ECTO Co-Op. Many of us will also be participating in the free-to-attend IETF hackathon on Sunday, Nov 2nd.\n\nLatest updates and attendees in the forum https://discourse.atprotocol.community/t/update-on-timing-and-plan-for-montreal/164";
890
891 let url_spans = parse_urls(text);
892
893 // Debug output
894 for (i, span) in url_spans.iter().enumerate() {
895 println!(
896 "URL {}: {} (start={}, end={})",
897 i, span.url, span.start, span.end
898 );
899 }
900
901 // Should find both URLs
902 assert_eq!(
903 url_spans.len(),
904 2,
905 "Expected 2 URLs but found {}",
906 url_spans.len()
907 );
908
909 if !url_spans.is_empty() {
910 assert_eq!(url_spans[0].url, "https://www.ietf.org/meeting/124/");
911 }
912
913 if url_spans.len() >= 2 {
914 assert_eq!(
915 url_spans[1].url,
916 "https://discourse.atprotocol.community/t/update-on-timing-and-plan-for-montreal/164"
917 );
918 }
919 }
920
921 #[test]
922 fn test_parse_urls_with_html_entity() {
923 // Test with the HTML entity & in the text
924 let text = "IETF124 is happening in Montreal, Nov 1st to 7th https://www.ietf.org/meeting/124/\n\nWe're confirmed for two days of ATProto community sessions on Monday, Nov 3rd & Tuesday, Mov 4th at ECTO Co-Op. Many of us will also be participating in the free-to-attend IETF hackathon on Sunday, Nov 2nd.\n\nLatest updates and attendees in the forum https://discourse.atprotocol.community/t/update-on-timing-and-plan-for-montreal/164";
925
926 let url_spans = parse_urls(text);
927
928 // Debug output
929 for (i, span) in url_spans.iter().enumerate() {
930 println!(
931 "URL {}: {} (start={}, end={})",
932 i, span.url, span.start, span.end
933 );
934 println!(
935 " Context before: {:?}",
936 &text[span.start.saturating_sub(10)..span.start]
937 );
938 println!(
939 " Context after: {:?}",
940 &text[span.end..std::cmp::min(span.end + 10, text.len())]
941 );
942 }
943
944 // Should find both URLs
945 assert_eq!(
946 url_spans.len(),
947 2,
948 "Expected 2 URLs but found {}",
949 url_spans.len()
950 );
951
952 if !url_spans.is_empty() {
953 assert_eq!(url_spans[0].url, "https://www.ietf.org/meeting/124/");
954 }
955
956 if url_spans.len() >= 2 {
957 assert_eq!(
958 url_spans[1].url,
959 "https://discourse.atprotocol.community/t/update-on-timing-and-plan-for-montreal/164"
960 );
961 }
962 }
963
964 #[test]
965 fn test_render_malicious_link() {
966 let text = "Visit example.com for details";
967 let limits = FacetLimits::default();
968 let facets = vec![Facet {
969 index: ByteSlice {
970 byte_start: 6,
971 byte_end: 17,
972 },
973 features: vec![FacetFeature::Link(Link {
974 uri: "javascript:alert('XSS')".to_string(),
975 })],
976 }];
977
978 let html = render_text_with_facets_html(text, Some(&facets), &limits);
979 // JavaScript URLs should be blocked
980 assert!(!html.contains("javascript:"));
981 assert_eq!(html, "Visit example.com for details");
982 }
983
984 #[test]
985 fn test_byte_offset_with_html_entities() {
986 // This test demonstrates that HTML entity escaping shifts byte positions.
987 // The byte positions shift:
988 // In original: '&' is at byte 8 (1 byte)
989 // In escaped: '&' starts at byte 8 (5 bytes)
990 // This causes facet byte offsets to be misaligned if text is escaped before rendering.
991
992 // If we have a URL after the ampersand in the original:
993 let original_with_url = "Nov 3rd & Tuesday https://example.com";
994 let escaped_with_url = "Nov 3rd & Tuesday https://example.com";
995
996 // Parse URLs from both versions
997 let original_urls = parse_urls(original_with_url);
998 let escaped_urls = parse_urls(escaped_with_url);
999
1000 println!("Original text: {:?}", original_with_url);
1001 println!(
1002 "Original URL found at: {:?}",
1003 original_urls.first().map(|u| (u.start, u.end))
1004 );
1005 println!("Escaped text: {:?}", escaped_with_url);
1006 println!(
1007 "Escaped URL found at: {:?}",
1008 escaped_urls.first().map(|u| (u.start, u.end))
1009 );
1010
1011 // Both should find the URL, but at different byte positions
1012 assert_eq!(original_urls.len(), 1);
1013 assert_eq!(escaped_urls.len(), 1);
1014
1015 // The byte positions will be different
1016 assert_eq!(original_urls[0].start, 18); // After "Nov 3rd & Tuesday "
1017 assert_eq!(escaped_urls[0].start, 22); // After "Nov 3rd & Tuesday " (4 extra bytes for &)
1018 }
1019
#[test]
fn test_render_facets_with_ampersand_in_text() {
    // Test case from the bug report: text containing a raw '&' that should
    // still render two URL facets as links.
    let text = "IETF124 is happening in Montreal, Nov 1st to 7th https://www.ietf.org/meeting/124/\n\nWe're confirmed for two days of ATProto community sessions on Monday, Nov 3rd & Tuesday, Mov 4th at ECTO Co-Op. Many of us will also be participating in the free-to-attend IETF hackathon on Sunday, Nov 2nd.\n\nLatest updates and attendees in the forum https://discourse.atprotocol.community/t/update-on-timing-and-plan-for-montreal/164";

    // Parse facets from the original (unescaped) text
    let url_spans = parse_urls(text);
    assert_eq!(url_spans.len(), 2, "Should find 2 URLs");

    // Create one link facet per parsed URL, reusing the parser's byte offsets
    let facets: Vec<Facet> = url_spans
        .iter()
        .map(|span| Facet {
            index: ByteSlice {
                byte_start: span.start,
                byte_end: span.end,
            },
            features: vec![FacetFeature::Link(Link {
                uri: span.url.clone(),
            })],
        })
        .collect();

    // Render with facets - this should work correctly even with & in the text
    let limits = FacetLimits::default();
    let html = render_text_with_facets_html(text, Some(&facets), &limits);

    // Both URLs should be rendered as links
    assert!(
        html.contains(r#"<a href="https://www.ietf.org/meeting/124/""#),
        "First URL should be a link"
    );
    assert!(html.contains(r#"<a href="https://discourse.atprotocol.community/t/update-on-timing-and-plan-for-montreal/164""#), "Second URL should be a link");

    // The ampersand should be HTML-escaped in the output. Checking for the
    // full entity matters: a bare "&" would also match unescaped output.
    assert!(
        html.contains("&amp;"),
        "Ampersand should be escaped in HTML output"
    );

    // Verify the links are properly closed
    assert_eq!(
        html.matches("</a>").count(),
        2,
        "Should have 2 closing </a> tags"
    );
}
1075
#[test]
fn test_render_with_out_of_bounds_facet() {
    // Guards against the panic
    // "range end index 324 out of range for slice of length 323",
    // which can be triggered by externally supplied AT Protocol facets
    // carrying incorrect byte offsets.
    let text = "Hello world"; // 11 bytes long
    let limits = FacetLimits::default();

    // A link facet whose end offset (20) lies past the end of the text (11).
    let overrunning_link = Facet {
        index: ByteSlice {
            byte_start: 6,
            byte_end: 20,
        },
        features: vec![FacetFeature::Link(Link {
            uri: String::from("https://example.com"),
        })],
    };
    let facets = vec![overrunning_link];

    // Rendering must complete without panicking ...
    let html = render_text_with_facets_html(text, Some(&facets), &limits);

    // ... and yield the text alone, with no markup for the invalid facet.
    assert_eq!(html, "Hello world");
}
1100
#[test]
fn test_render_with_facet_start_beyond_text() {
    // A facet that begins after the last byte of the text must not panic.
    let text = "Short"; // 5 bytes long
    let limits = FacetLimits::default();

    // Both offsets (100..110) are far past the end of the 5-byte text.
    let detached_link = Facet {
        index: ByteSlice {
            byte_start: 100,
            byte_end: 110,
        },
        features: vec![FacetFeature::Link(Link {
            uri: String::from("https://example.com"),
        })],
    };
    let facets = vec![detached_link];

    // The call has to return normally, leaving the text free of link markup.
    let html = render_text_with_facets_html(text, Some(&facets), &limits);
    assert_eq!(html, "Short");
}
1121
#[test]
fn test_render_with_inverted_facet_indices() {
    // A facet with byte_start > byte_end is invalid and must not panic.
    let text = "Hello world";
    let limits = FacetLimits::default();

    // The end offset (4) precedes the start offset (8).
    let inverted_link = Facet {
        index: ByteSlice {
            byte_start: 8,
            byte_end: 4,
        },
        features: vec![FacetFeature::Link(Link {
            uri: String::from("https://example.com"),
        })],
    };
    let facets = vec![inverted_link];

    // The call has to return normally, leaving the text free of link markup.
    let html = render_text_with_facets_html(text, Some(&facets), &limits);
    assert_eq!(html, "Hello world");
}
1142
#[test]
fn test_parse_urls_from_atproto_record_text() {
    // Parses URLs out of the description text of a real AT Protocol record and
    // pins the byte positions that facets for this text should carry.
    let text = "Dev, Power Users, and Generally inquisitive folks get a completely unprofessionally amateur interview. Just a yap sesh where chat is part of the call!\n\n✨the daniel✨ & I will be on a Zoom call and I will stream out to https://stream.place/psingletary.com\n\nSubscribe to the publications! https://atprotocalls.leaflet.pub/";

    // Expected (url, byte start, byte end) per link, in order of appearance.
    let expected = [
        ("https://stream.place/psingletary.com", 221, 257),
        ("https://atprotocalls.leaflet.pub/", 290, 323),
    ];

    let url_spans = parse_urls(text);

    assert_eq!(url_spans.len(), 2, "Should find 2 URLs");

    // Every parsed span must report the expected URL and byte range.
    for (span, &(url, start, end)) in url_spans.iter().zip(expected.iter()) {
        assert_eq!(span.url, url);
        assert_eq!(span.start, start);
        assert_eq!(span.end, end);
    }

    // Slicing the raw bytes at those ranges must give back the URL text.
    let raw = text.as_bytes();
    for &(url, start, end) in expected.iter() {
        assert_eq!(std::str::from_utf8(&raw[start..end]).unwrap(), url);
    }

    // Note: The AT Protocol record had incorrect facet indices:
    // - First link: byteStart=222, byteEnd=258 (should be 221, 257)
    // - Second link: byteStart=291, byteEnd=324 (should be 290, 323)
    // This off-by-one error was in the source data, not our parser.
}
1179
#[test]
fn test_render_with_off_by_one_facet_indices() {
    // Regression test for externally produced AT Protocol facets whose byte
    // indices are one too high, which leaves the first character of the URL
    // outside the link tag.
    //
    // It records the current behavior: facets are applied at exactly the byte
    // positions they specify, even when those positions are wrong. The wrong
    // offsets originate from the client that generated the record.
    let text = "Dev, Power Users, and Generally inquisitive folks get a completely unprofessionally amateur interview. Just a yap sesh where chat is part of the call!\n\n✨the daniel✨ & I will be on a Zoom call and I will stream out to https://stream.place/psingletary.com\n\nSubscribe to the publications! https://atprotocalls.leaflet.pub/";

    // The second facet's byte_end (324) is larger than this length.
    assert_eq!(text.len(), 323, "Text should be 323 bytes");

    let limits = FacetLimits::default();

    // Builds a single-feature link facet covering the given byte range.
    let link_facet = |byte_start: usize, byte_end: usize, uri: &str| Facet {
        index: ByteSlice {
            byte_start,
            byte_end,
        },
        features: vec![FacetFeature::Link(Link {
            uri: uri.to_string(),
        })],
    };

    // Real-world data with indices that are off by one:
    // - first link:  222..258 where 221..257 is correct (258 is still in bounds)
    // - second link: 291..324 where 290..323 is correct (324 > text.len())
    let facets = vec![
        link_facet(222, 258, "https://stream.place/psingletary.com"),
        link_facet(291, 324, "https://atprotocalls.leaflet.pub/"),
    ];

    let html = render_text_with_facets_html(text, Some(&facets), &limits);

    // With the start shifted by one, the leading 'h' of 'https' lands before the tag.
    assert!(
        html.contains(r#"stream out to h<a href="https://stream.place/psingletary.com""#),
        "First link should have 'h' outside due to off-by-one error. Got: {}",
        html
    );

    // The second facet reaches past the end of the text, so no link is emitted
    // for it and its URL remains in the output as ordinary text.
    assert!(
        !html.contains(r#"<a href="https://atprotocalls.leaflet.pub/""#),
        "Second link should NOT be rendered because facet is out of bounds. Got: {}",
        html
    );
    assert!(
        html.contains("https://atprotocalls.leaflet.pub/"),
        "Second URL should appear as plain text. Got: {}",
        html
    );

    // Our own parser must report the correct byte positions for both URLs.
    let url_spans = parse_urls(text);
    assert_eq!(url_spans.len(), 2, "Should find 2 URLs");

    let (first, second) = (&url_spans[0], &url_spans[1]);
    assert_eq!(
        first.start, 221,
        "First URL should start at byte 221, not 222"
    );
    assert_eq!(first.end, 257, "First URL should end at byte 257, not 258");
    assert_eq!(
        second.start, 290,
        "Second URL should start at byte 290, not 291"
    );
    assert_eq!(
        second.end, 323,
        "Second URL should end at byte 323, not 324"
    );
}
1262}