src/facets.rs at main · smokesignal.events/smokesignal

smokesignal.events / smokesignal
fork atom
The smokesignal.events web application
fork atom
smokesignal / src / facets.rs
at main 1262 lines 47 kB view raw
wrap content
Nick Gerakines feature: community.lexicon.calendar.getRSVP 7w ago
6a200aab
   1//! Rich text facet structures and rendering for AT Protocol.
   2//!
   3//! This module provides structures for handling rich text facets (mentions, links, hashtags),
   4//! parsing them from text, and rendering them as HTML for display in the UI.
   5//!
   6//! # Byte Offset Calculation
   7//!
   8//! This implementation correctly uses UTF-8 byte offsets as required by AT Protocol.
   9//! The facets use "inclusive start and exclusive end" byte ranges. All parsing is done
  10//! using `regex::bytes::Regex` which operates on byte slices and returns byte positions,
  11//! ensuring correct handling of multi-byte UTF-8 characters (emojis, CJK, accented chars).
  12
  13use atproto_identity::resolve::IdentityResolver;
  14use atproto_record::lexicon::app::bsky::richtext::facet::{
  15    ByteSlice, Facet, FacetFeature, Link, Mention, Tag,
  16};
  17use regex::bytes::Regex;
  18use std::fmt::Write;
  19
  20/// Configuration for facet parsing and rendering limits
  21#[derive(Debug, Clone, Copy)]
  22pub struct FacetLimits {
  23    /// Maximum number of mention facets to process (default: 5)
  24    pub mentions_max: usize,
  25    /// Maximum number of tag facets to process (default: 5)
  26    pub tags_max: usize,
  27    /// Maximum number of link facets to process (default: 5)
  28    pub links_max: usize,
  29    /// Maximum total number of facets to process (default: 10)
  30    pub max: usize,
  31}
  32
  33impl Default for FacetLimits {
  34    fn default() -> Self {
  35        Self {
  36            mentions_max: 5,
  37            tags_max: 5,
  38            links_max: 5,
  39            max: 10,
  40        }
  41    }
  42}
  43
  44/// Mention span with byte positions and handle
  45#[derive(Debug)]
  46pub struct MentionSpan {
  47    pub start: usize,
  48    pub end: usize,
  49    pub handle: String,
  50}
  51
  52/// URL span with byte positions and URL
  53#[derive(Debug)]
  54pub struct UrlSpan {
  55    pub start: usize,
  56    pub end: usize,
  57    pub url: String,
  58}
  59
  60/// Tag span with byte positions and tag text
  61#[derive(Debug)]
  62pub struct TagSpan {
  63    pub start: usize,
  64    pub end: usize,
  65    pub tag: String,
  66}
  67
  68/// Parse mentions from text and return their byte positions
  69/// This function excludes mentions that appear within URLs
  70pub fn parse_mentions(text: &str) -> Vec<MentionSpan> {
  71    let mut spans = Vec::new();
  72
  73    // First, parse all URLs to exclude mention matches within them
  74    let url_spans = parse_urls(text);
  75
  76    // Regex based on: https://atproto.com/specs/handle#handle-identifier-syntax
  77    // Pattern: [$|\W](@([a-zA-Z0-9]([a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?\.)+[a-zA-Z]([a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)
  78    let mention_regex = Regex::new(
  79        r"(?:^|[^\w])(@([a-zA-Z0-9]([a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?\.)+[a-zA-Z]([a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)"
  80    ).unwrap();
  81
  82    let text_bytes = text.as_bytes();
  83    for capture in mention_regex.captures_iter(text_bytes) {
  84        if let Some(mention_match) = capture.get(1) {
  85            let start = mention_match.start();
  86            let end = mention_match.end();
  87
  88            // Check if this mention overlaps with any URL
  89            let overlaps_url = url_spans.iter().any(|url| {
  90                // Check if mention is within or overlaps the URL span
  91                (start >= url.start && start < url.end) || (end > url.start && end <= url.end)
  92            });
  93
  94            // Only add the mention if it doesn't overlap with a URL
  95            if !overlaps_url {
  96                let handle = std::str::from_utf8(&mention_match.as_bytes()[1..])
  97                    .unwrap_or_default()
  98                    .to_string();
  99
 100                spans.push(MentionSpan { start, end, handle });
 101            }
 102        }
 103    }
 104
 105    spans
 106}
 107
 108/// Parse URLs from text and return their byte positions
 109pub fn parse_urls(text: &str) -> Vec<UrlSpan> {
 110    let mut spans = Vec::new();
 111
 112    // Partial/naive URL regex based on: https://stackoverflow.com/a/3809435
 113    // Pattern: [$|\W](https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]+\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*[-a-zA-Z0-9@%_\+~#//=])?)
 114    // Modified to use + instead of {1,6} to support longer TLDs and multi-level subdomains
 115    let url_regex = Regex::new(
 116        r"(?:^|[^\w])(https?://(?:www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]+\b(?:[-a-zA-Z0-9()@:%_\+.~#?&//=]*[-a-zA-Z0-9@%_\+~#//=])?)"
 117    ).unwrap();
 118
 119    let text_bytes = text.as_bytes();
 120    for capture in url_regex.captures_iter(text_bytes) {
 121        if let Some(url_match) = capture.get(1) {
 122            let url = std::str::from_utf8(url_match.as_bytes())
 123                .unwrap_or_default()
 124                .to_string();
 125
 126            spans.push(UrlSpan {
 127                start: url_match.start(),
 128                end: url_match.end(),
 129                url,
 130            });
 131        }
 132    }
 133
 134    spans
 135}
 136
 137/// Parse hashtags from text and return their byte positions
 138pub fn parse_tags(text: &str) -> Vec<TagSpan> {
 139    let mut spans = Vec::new();
 140
 141    // Regex based on: https://github.com/bluesky-social/atproto/blob/d91988fe79030b61b556dd6f16a46f0c3b9d0b44/packages/api/src/rich-text/util.ts
 142    // Simplified for Rust - matches hashtags at word boundaries
 143    // Pattern matches: start of string or non-word char, then # or ＃, then tag content
 144    let tag_regex = Regex::new(r"(?:^|[^\w])([#＃])([\w]+(?:[\w]*)*)").unwrap();
 145
 146    let text_bytes = text.as_bytes();
 147
 148    // Work with bytes for proper position tracking
 149    for capture in tag_regex.captures_iter(text_bytes) {
 150        if let (Some(full_match), Some(hash_match), Some(tag_match)) =
 151            (capture.get(0), capture.get(1), capture.get(2))
 152        {
 153            // Calculate the absolute byte position of the hash symbol
 154            // The full match includes the preceding character (if any)
 155            // so we need to adjust for that
 156            let match_start = full_match.start();
 157            let hash_offset = hash_match.start() - full_match.start();
 158            let start = match_start + hash_offset;
 159            let end = match_start + hash_offset + hash_match.len() + tag_match.len();
 160
 161            // Extract just the tag text (without the hash symbol)
 162            // Normalize to lowercase for case-insensitive tag matching
 163            let tag = std::str::from_utf8(tag_match.as_bytes()).unwrap_or_default();
 164
 165            // Only include tags that are not purely numeric
 166            if !tag.chars().all(|c| c.is_ascii_digit()) {
 167                spans.push(TagSpan {
 168                    start,
 169                    end,
 170                    tag: tag.to_string(),
 171                });
 172            }
 173        }
 174    }
 175
 176    spans
 177}
 178
 179/// Parse facets from text and return a vector of Facet objects.
 180///
 181/// This function extracts mentions, URLs, and hashtags from the provided text
 182/// and creates AT Protocol facets with proper byte indices.
 183///
 184/// Mentions are resolved to actual DIDs using the provided identity resolver.
 185/// If a handle cannot be resolved to a DID, the mention facet is skipped.
 186///
 187/// # Arguments
 188/// * `text` - The text to extract facets from
 189/// * `identity_resolver` - Resolver for converting handles to DIDs
 190/// * `limits` - Configuration for maximum facets per type and total
 191///
 192/// # Returns
 193/// Optional vector of facets. Returns None if no facets were found.
 194pub async fn parse_facets_from_text(
 195    text: &str,
 196    identity_resolver: &dyn IdentityResolver,
 197    limits: &FacetLimits,
 198) -> Option<Vec<Facet>> {
 199    let mut facets = Vec::new();
 200
 201    // Parse mentions (limited by mentions_max)
 202    let mention_spans = parse_mentions(text);
 203    let mut mention_count = 0;
 204    for mention in mention_spans {
 205        if mention_count >= limits.mentions_max {
 206            break;
 207        }
 208
 209        // Try to resolve the handle to a DID
 210        // First try with at:// prefix, then without
 211        let at_uri = format!("at://{}", mention.handle);
 212        let did_result = match identity_resolver.resolve(&at_uri).await {
 213            Ok(doc) => Ok(doc),
 214            Err(_) => identity_resolver.resolve(&mention.handle).await,
 215        };
 216
 217        // Only add the mention facet if we successfully resolved the DID
 218        if let Ok(did_doc) = did_result {
 219            facets.push(Facet {
 220                index: ByteSlice {
 221                    byte_start: mention.start,
 222                    byte_end: mention.end,
 223                },
 224                features: vec![FacetFeature::Mention(Mention {
 225                    did: did_doc.id.to_string(),
 226                })],
 227            });
 228            mention_count += 1;
 229        }
 230        // If resolution fails, we skip this mention facet entirely
 231    }
 232
 233    // Parse URLs (limited by links_max)
 234    let url_spans = parse_urls(text);
 235    for (idx, url) in url_spans.into_iter().enumerate() {
 236        if idx >= limits.links_max {
 237            break;
 238        }
 239        facets.push(Facet {
 240            index: ByteSlice {
 241                byte_start: url.start,
 242                byte_end: url.end,
 243            },
 244            features: vec![FacetFeature::Link(Link { uri: url.url })],
 245        });
 246    }
 247
 248    // Parse hashtags (limited by tags_max)
 249    let tag_spans = parse_tags(text);
 250    for (idx, tag_span) in tag_spans.into_iter().enumerate() {
 251        if idx >= limits.tags_max {
 252            break;
 253        }
 254        facets.push(Facet {
 255            index: ByteSlice {
 256                byte_start: tag_span.start,
 257                byte_end: tag_span.end,
 258            },
 259            features: vec![FacetFeature::Tag(Tag { tag: tag_span.tag })],
 260        });
 261    }
 262
 263    // Apply global facet limit (truncate if exceeds max)
 264    if facets.len() > limits.max {
 265        facets.truncate(limits.max);
 266    }
 267
 268    // Only return facets if we found any
 269    if !facets.is_empty() {
 270        Some(facets)
 271    } else {
 272        None
 273    }
 274}
 275
 276/// HTML escape helper function
 277fn html_escape(text: &str) -> String {
 278    text.chars()
 279        .map(|c| match c {
 280            '&' => "&amp;".to_string(),
 281            '<' => "&lt;".to_string(),
 282            '>' => "&gt;".to_string(),
 283            '"' => "&quot;".to_string(),
 284            '\'' => "&#39;".to_string(),
 285            c => c.to_string(),
 286        })
 287        .collect()
 288}
 289
 290/// Check if text contains HTML tags
 291/// This is used to detect potentially malicious content
 292fn contains_html_tags(text: &str) -> bool {
 293    // Look for patterns that indicate HTML tags
 294    // We're looking for < followed by either a letter, /, or !
 295    let mut chars = text.chars().peekable();
 296    while let Some(ch) = chars.next() {
 297        if ch == '<'
 298            && let Some(&next_ch) = chars.peek()
 299        {
 300            // Check if this looks like an HTML tag
 301            if next_ch.is_ascii_alphabetic() || next_ch == '/' || next_ch == '!' {
 302                return true;
 303            }
 304        }
 305    }
 306    false
 307}
 308
 309/// Render text with facets as HTML.
 310///
 311/// This function converts plain text with facet annotations into HTML with proper
 312/// links for mentions, URLs, and hashtags based on the facet information.
 313///
 314/// # HTML Output
 315/// - Mentions: `<a href="/[did]">@handle</a>`
 316/// - Links: `<a href="[url]" target="_blank" rel="noopener noreferrer">[url]</a>`
 317/// - Tags: `<a href="#[tag]">#tag</a>`
 318/// - Regular text is HTML-escaped for security
 319///
 320/// # Arguments
 321/// * `text` - The plain text content
 322/// * `facets` - Optional facets to apply to the text
 323/// * `limits` - Configuration for maximum facets per type and total
 324///
 325/// # Returns
 326/// HTML string with facets rendered as links
 327pub fn render_text_with_facets_html(
 328    text: &str,
 329    facets: Option<&Vec<Facet>>,
 330    limits: &FacetLimits,
 331) -> String {
 332    // First, check if the text contains HTML tags
 333    // If it does, treat it as suspicious and just clean it without applying facets
 334    if contains_html_tags(text) {
 335        // Use ammonia to strip ALL HTML and return plain text
 336        let cleaned = ammonia::clean(text);
 337        // Convert newlines to <br> tags after cleaning
 338        return cleaned.replace('\n', "<br>");
 339    }
 340
 341    let text_bytes = text.as_bytes();
 342
 343    // If no facets, just return escaped text
 344    let Some(facets) = facets else {
 345        return html_escape(text);
 346    };
 347
 348    // Sort facets by start position to process them in order
 349    let mut sorted_facets: Vec<_> = facets.iter().collect();
 350    sorted_facets.sort_by_key(|f| f.index.byte_start);
 351
 352    // Apply limits: count facets by type and limit total
 353    let mut mention_count = 0;
 354    let mut link_count = 0;
 355    let mut tag_count = 0;
 356    let mut total_count = 0;
 357
 358    let filtered_facets: Vec<_> = sorted_facets
 359        .into_iter()
 360        .filter(|facet| {
 361            if total_count >= limits.max {
 362                return false;
 363            }
 364
 365            // Check facet type and apply per-type limits
 366            let should_include = facet.features.first().is_some_and(|feature| match feature {
 367                FacetFeature::Mention(_) if mention_count < limits.mentions_max => {
 368                    mention_count += 1;
 369                    true
 370                }
 371                FacetFeature::Link(_) if link_count < limits.links_max => {
 372                    link_count += 1;
 373                    true
 374                }
 375                FacetFeature::Tag(_) if tag_count < limits.tags_max => {
 376                    tag_count += 1;
 377                    true
 378                }
 379                _ => false,
 380            });
 381
 382            if should_include {
 383                total_count += 1;
 384            }
 385
 386            should_include
 387        })
 388        .collect();
 389
 390    let mut html = String::new();
 391    let mut last_end = 0;
 392    let text_len = text_bytes.len();
 393
 394    for facet in filtered_facets {
 395        // Validate facet indices are within bounds - skip invalid facets
 396        if facet.index.byte_start > text_len
 397            || facet.index.byte_end > text_len
 398            || facet.index.byte_start > facet.index.byte_end
 399        {
 400            continue;
 401        }
 402
 403        // Add any text before this facet (HTML-escaped)
 404        if facet.index.byte_start > last_end {
 405            let text_before =
 406                std::str::from_utf8(&text_bytes[last_end..facet.index.byte_start]).unwrap_or("");
 407            html.push_str(&html_escape(text_before));
 408        }
 409
 410        // Get the text covered by this facet
 411        let facet_text =
 412            std::str::from_utf8(&text_bytes[facet.index.byte_start..facet.index.byte_end])
 413                .unwrap_or("");
 414
 415        // Process the facet based on its feature type
 416        // Only process the first feature (in practice, there should only be one per facet)
 417        if let Some(feature) = facet.features.first() {
 418            match feature {
 419                FacetFeature::Mention(mention) => {
 420                    write!(
 421                        &mut html,
 422                        r#"<a href="/{}">{}</a>"#,
 423                        html_escape(&mention.did),
 424                        html_escape(facet_text)
 425                    )
 426                    .unwrap();
 427                }
 428                FacetFeature::Link(link) => {
 429                    // Only create link tags for safe URLs
 430                    if link.uri.starts_with("http://")
 431                        || link.uri.starts_with("https://")
 432                        || link.uri.starts_with("/")
 433                    {
 434                        write!(
 435                            &mut html,
 436                            r#"<a href="{}" target="_blank" rel="noopener noreferrer">{}</a>"#,
 437                            html_escape(&link.uri),
 438                            html_escape(facet_text)
 439                        )
 440                        .unwrap();
 441                    } else {
 442                        // For unsafe URLs (like javascript:), just render as plain text
 443                        html.push_str(&html_escape(facet_text));
 444                    }
 445                }
 446                FacetFeature::Tag(tag) => {
 447                    // URL-encode the tag for the href attribute
 448                    let encoded_tag = urlencoding::encode(&tag.tag);
 449                    write!(
 450                        &mut html,
 451                        r##"<a href="#{}">{}</a>"##,
 452                        encoded_tag,
 453                        html_escape(facet_text)
 454                    )
 455                    .unwrap();
 456                }
 457            }
 458        }
 459
 460        last_end = facet.index.byte_end;
 461    }
 462
 463    // Add any remaining text after the last facet
 464    if last_end < text_bytes.len() {
 465        let remaining_text = std::str::from_utf8(&text_bytes[last_end..]).unwrap_or("");
 466        html.push_str(&html_escape(remaining_text));
 467    }
 468
 469    // Sanitize the final HTML output to ensure safety
 470    // Configure ammonia to only allow <a> tags with specific attributes
 471    let mut builder = ammonia::Builder::new();
 472    builder
 473        .tags(std::collections::HashSet::from(["a", "br"]))
 474        // Don't automatically add rel="nofollow" - we'll handle it in the attribute filter
 475        .link_rel(None)
 476        // Allow relative URLs (for internal links like /u/... and /t/...)
 477        .url_relative(ammonia::UrlRelative::PassThrough)
 478        .attribute_filter(|element, attribute, value| match (element, attribute) {
 479            ("a", "href") => {
 480                // Only allow safe URLs: relative paths starting with /, or http(s) URLs
 481                if value.starts_with('/')
 482                    || value.starts_with("http://")
 483                    || value.starts_with("https://")
 484                    || value.starts_with("#")
 485                {
 486                    Some(value.into())
 487                } else {
 488                    None
 489                }
 490            }
 491            ("a", "target") => {
 492                if value == "_blank" {
 493                    Some(value.into())
 494                } else {
 495                    None
 496                }
 497            }
 498            ("a", "rel") => {
 499                // For external links, ensure nofollow is present
 500                if value.contains("noopener") || value.contains("noreferrer") {
 501                    // Keep the existing rel value but add nofollow if not present
 502                    if !value.contains("nofollow") {
 503                        Some(format!("{} nofollow", value).into())
 504                    } else {
 505                        Some(value.into())
 506                    }
 507                } else {
 508                    // Just nofollow for other cases
 509                    Some("nofollow".into())
 510                }
 511            }
 512            ("br", _) => None, // br tags don't have attributes
 513            _ => None,
 514        });
 515
 516    builder.clean(&html).to_string()
 517}
 518
 519#[cfg(test)]
 520mod tests {
 521    use async_trait::async_trait;
 522    use atproto_identity::model::Document;
 523    use atproto_record::lexicon::app::bsky::richtext::facet::{ByteSlice, Link, Mention, Tag};
 524    use std::collections::HashMap;
 525
 526    use super::*;
 527
 528    /// Mock identity resolver for testing
 529    struct MockIdentityResolver {
 530        handles_to_dids: HashMap<String, String>,
 531    }
 532
 533    impl MockIdentityResolver {
 534        fn new() -> Self {
 535            let mut handles_to_dids = HashMap::new();
 536            handles_to_dids.insert(
 537                "alice.bsky.social".to_string(),
 538                "did:plc:alice123".to_string(),
 539            );
 540            handles_to_dids.insert(
 541                "at://alice.bsky.social".to_string(),
 542                "did:plc:alice123".to_string(),
 543            );
 544            Self { handles_to_dids }
 545        }
 546
 547        fn add_identity(&mut self, handle: &str, did: &str) {
 548            self.handles_to_dids
 549                .insert(handle.to_string(), did.to_string());
 550            self.handles_to_dids
 551                .insert(format!("at://{}", handle), did.to_string());
 552        }
 553    }
 554
 555    #[async_trait]
 556    impl IdentityResolver for MockIdentityResolver {
 557        async fn resolve(&self, handle: &str) -> anyhow::Result<Document> {
 558            let handle_key = handle.to_string();
 559
 560            if let Some(did) = self.handles_to_dids.get(&handle_key) {
 561                Ok(Document {
 562                    context: vec![],
 563                    id: did.clone(),
 564                    also_known_as: vec![format!("at://{}", handle_key.trim_start_matches("at://"))],
 565                    verification_method: vec![],
 566                    service: vec![],
 567                    extra: HashMap::new(),
 568                })
 569            } else {
 570                Err(anyhow::anyhow!("Handle not found"))
 571            }
 572        }
 573    }
 574
 575    #[test]
 576    fn test_html_escape() {
 577        assert_eq!(html_escape("Hello & <world>"), "Hello &amp; &lt;world&gt;");
 578        assert_eq!(
 579            html_escape("\"quotes\" and 'apostrophes'"),
 580            "&quot;quotes&quot; and &#39;apostrophes&#39;"
 581        );
 582        assert_eq!(html_escape("Line 1\nLine 2"), "Line 1\nLine 2");
 583        assert_eq!(html_escape("Normal text"), "Normal text");
 584    }
 585
 586    #[test]
 587    fn test_render_no_facets() {
 588        let text = "This is a <test> description & it's great!";
 589        let limits = FacetLimits::default();
 590        let html = render_text_with_facets_html(text, None, &limits);
 591        // HTML tags are detected and stripped by ammonia
 592        // The <test> tag is removed entirely
 593        assert_eq!(html, "This is a  description &amp; it's great!");
 594    }
 595
 596    #[test]
 597    fn test_render_with_html_tags() {
 598        let text = "Check this <script>alert('XSS')</script> content!";
 599        let limits = FacetLimits::default();
 600        let html = render_text_with_facets_html(text, None, &limits);
 601        // The script tag should be completely removed
 602        assert_eq!(html, "Check this  content!");
 603        assert!(!html.contains("script"));
 604        assert!(!html.contains("alert"));
 605    }
 606
 607    #[test]
 608    fn test_render_with_mention() {
 609        let text = "Contact @alice.bsky.social for details";
 610        let limits = FacetLimits::default();
 611        let facets = vec![Facet {
 612            index: ByteSlice {
 613                byte_start: 8,
 614                byte_end: 26,
 615            },
 616            features: vec![FacetFeature::Mention(Mention {
 617                did: "did:plc:abc123".to_string(),
 618            })],
 619        }];
 620
 621        let html = render_text_with_facets_html(text, Some(&facets), &limits);
 622        assert_eq!(
 623            html,
 624            r#"Contact <a href="/did:plc:abc123">@alice.bsky.social</a> for details"#
 625        );
 626    }
 627
 628    #[test]
 629    fn test_render_with_link() {
 630        let text = "Apply at https://example.com today!";
 631        let limits = FacetLimits::default();
 632        let facets = vec![Facet {
 633            index: ByteSlice {
 634                byte_start: 9,
 635                byte_end: 28,
 636            },
 637            features: vec![FacetFeature::Link(Link {
 638                uri: "https://example.com".to_string(),
 639            })],
 640        }];
 641
 642        let html = render_text_with_facets_html(text, Some(&facets), &limits);
 643        assert_eq!(
 644            html,
 645            r#"Apply at <a href="https://example.com">https://example.com</a> today!"#
 646        );
 647    }
 648
 649    #[test]
 650    fn test_render_with_tag() {
 651        let text = "Looking for #rust developers";
 652        let limits = FacetLimits::default();
 653        let facets = vec![Facet {
 654            index: ByteSlice {
 655                byte_start: 12,
 656                byte_end: 17,
 657            },
 658            features: vec![FacetFeature::Tag(Tag {
 659                tag: "rust".to_string(),
 660            })],
 661        }];
 662
 663        let html = render_text_with_facets_html(text, Some(&facets), &limits);
 664        assert_eq!(
 665            html,
 666            r##"Looking for <a href="#rust">#rust</a> developers"##
 667        );
 668    }
 669
 670    #[tokio::test]
 671    async fn test_parse_facets_from_text_comprehensive() {
 672        let mut resolver = MockIdentityResolver::new();
 673        resolver.add_identity("bob.test.com", "did:plc:bob456");
 674
 675        let limits = FacetLimits::default();
 676        let text = "Join @alice.bsky.social and @bob.test.com at https://example.com #rust #golang";
 677        let facets = parse_facets_from_text(text, &resolver, &limits).await;
 678
 679        assert!(facets.is_some());
 680        let facets = facets.unwrap();
 681        assert_eq!(facets.len(), 5); // 2 mentions, 1 URL, 2 hashtags
 682
 683        // Check first mention
 684        assert_eq!(facets[0].index.byte_start, 5);
 685        assert_eq!(facets[0].index.byte_end, 23);
 686        if let FacetFeature::Mention(ref mention) = facets[0].features[0] {
 687            assert_eq!(mention.did, "did:plc:alice123");
 688        } else {
 689            panic!("Expected Mention feature");
 690        }
 691
 692        // Check second mention
 693        assert_eq!(facets[1].index.byte_start, 28);
 694        assert_eq!(facets[1].index.byte_end, 41);
 695        if let FacetFeature::Mention(ref mention) = facets[1].features[0] {
 696            assert_eq!(mention.did, "did:plc:bob456");
 697        } else {
 698            panic!("Expected Mention feature");
 699        }
 700
 701        // Check URL
 702        assert_eq!(facets[2].index.byte_start, 45);
 703        assert_eq!(facets[2].index.byte_end, 64);
 704        if let FacetFeature::Link(ref link) = facets[2].features[0] {
 705            assert_eq!(link.uri, "https://example.com");
 706        } else {
 707            panic!("Expected Link feature");
 708        }
 709
 710        // Check first hashtag
 711        assert_eq!(facets[3].index.byte_start, 65);
 712        assert_eq!(facets[3].index.byte_end, 70);
 713        if let FacetFeature::Tag(ref tag) = facets[3].features[0] {
 714            assert_eq!(tag.tag, "rust");
 715        } else {
 716            panic!("Expected Tag feature");
 717        }
 718
 719        // Check second hashtag
 720        assert_eq!(facets[4].index.byte_start, 71);
 721        assert_eq!(facets[4].index.byte_end, 78);
 722        if let FacetFeature::Tag(ref tag) = facets[4].features[0] {
 723            assert_eq!(tag.tag, "golang");
 724        } else {
 725            panic!("Expected Tag feature");
 726        }
 727    }
 728
 729    #[tokio::test]
 730    async fn test_parse_facets_from_text_with_unresolvable_mention() {
 731        let resolver = MockIdentityResolver::new();
 732        let limits = FacetLimits::default();
 733
 734        // Only alice.bsky.social is in the resolver, not unknown.handle.com
 735        let text = "Contact @unknown.handle.com for details #rust";
 736        let facets = parse_facets_from_text(text, &resolver, &limits).await;
 737
 738        assert!(facets.is_some());
 739        let facets = facets.unwrap();
 740        // Should only have 1 facet (the hashtag) since the mention couldn't be resolved
 741        assert_eq!(facets.len(), 1);
 742
 743        // Check that it's the hashtag facet
 744        if let FacetFeature::Tag(ref tag) = facets[0].features[0] {
 745            assert_eq!(tag.tag, "rust");
 746        } else {
 747            panic!("Expected Tag feature");
 748        }
 749    }
 750
 751    #[tokio::test]
 752    async fn test_parse_facets_from_text_empty() {
 753        let resolver = MockIdentityResolver::new();
 754        let limits = FacetLimits::default();
 755        let text = "No mentions, URLs, or hashtags here";
 756        let facets = parse_facets_from_text(text, &resolver, &limits).await;
 757        assert!(facets.is_none());
 758    }
 759
 760    #[tokio::test]
 761    async fn test_parse_facets_from_text_url_with_at_mention() {
 762        let resolver = MockIdentityResolver::new();
 763        let limits = FacetLimits::default();
 764
 765        // URLs with @ should not create mention facets
 766        let text = "Tangled https://tangled.org/@smokesignal.events";
 767        let facets = parse_facets_from_text(text, &resolver, &limits).await;
 768
 769        assert!(facets.is_some());
 770        let facets = facets.unwrap();
 771
 772        // Should have exactly 1 facet (the URL), not 2 (URL + mention)
 773        assert_eq!(
 774            facets.len(),
 775            1,
 776            "Expected 1 facet (URL only), got {}",
 777            facets.len()
 778        );
 779
 780        // Verify it's a link facet, not a mention
 781        if let FacetFeature::Link(ref link) = facets[0].features[0] {
 782            assert_eq!(link.uri, "https://tangled.org/@smokesignal.events");
 783        } else {
 784            panic!("Expected Link feature, got Mention or Tag instead");
 785        }
 786    }
 787
 788    #[tokio::test]
 789    async fn test_parse_facets_with_mention_limit() {
 790        let mut resolver = MockIdentityResolver::new();
 791        resolver.add_identity("bob.test.com", "did:plc:bob456");
 792        resolver.add_identity("charlie.test.com", "did:plc:charlie789");
 793
 794        // Limit to 2 mentions
 795        let limits = FacetLimits {
 796            mentions_max: 2,
 797            tags_max: 5,
 798            links_max: 5,
 799            max: 10,
 800        };
 801
 802        let text = "Join @alice.bsky.social @bob.test.com @charlie.test.com";
 803        let facets = parse_facets_from_text(text, &resolver, &limits).await;
 804
 805        assert!(facets.is_some());
 806        let facets = facets.unwrap();
 807        // Should only have 2 mentions (alice and bob), charlie should be skipped
 808        assert_eq!(facets.len(), 2);
 809
 810        // Verify they're both mentions
 811        for facet in &facets {
 812            assert!(matches!(facet.features[0], FacetFeature::Mention(_)));
 813        }
 814    }
 815
 816    #[tokio::test]
 817    async fn test_parse_facets_with_global_limit() {
 818        let mut resolver = MockIdentityResolver::new();
 819        resolver.add_identity("bob.test.com", "did:plc:bob456");
 820
 821        // Very restrictive global limit
 822        let limits = FacetLimits {
 823            mentions_max: 5,
 824            tags_max: 5,
 825            links_max: 5,
 826            max: 3, // Only allow 3 total facets
 827        };
 828
 829        let text =
 830            "Join @alice.bsky.social @bob.test.com at https://example.com #rust #golang #python";
 831        let facets = parse_facets_from_text(text, &resolver, &limits).await;
 832
 833        assert!(facets.is_some());
 834        let facets = facets.unwrap();
 835        // Should be truncated to 3 facets total
 836        assert_eq!(facets.len(), 3);
 837    }
 838
 839    #[test]
 840    fn test_render_with_facet_limits() {
 841        let text = "Contact @alice @bob @charlie for details";
 842        let limits = FacetLimits {
 843            mentions_max: 2, // Only render first 2 mentions
 844            tags_max: 5,
 845            links_max: 5,
 846            max: 10,
 847        };
 848
 849        let facets = vec![
 850            Facet {
 851                index: ByteSlice {
 852                    byte_start: 8,
 853                    byte_end: 14,
 854                },
 855                features: vec![FacetFeature::Mention(Mention {
 856                    did: "did:plc:alice".to_string(),
 857                })],
 858            },
 859            Facet {
 860                index: ByteSlice {
 861                    byte_start: 15,
 862                    byte_end: 19,
 863                },
 864                features: vec![FacetFeature::Mention(Mention {
 865                    did: "did:plc:bob".to_string(),
 866                })],
 867            },
 868            Facet {
 869                index: ByteSlice {
 870                    byte_start: 20,
 871                    byte_end: 28,
 872                },
 873                features: vec![FacetFeature::Mention(Mention {
 874                    did: "did:plc:charlie".to_string(),
 875                })],
 876            },
 877        ];
 878
 879        let html = render_text_with_facets_html(text, Some(&facets), &limits);
 880        // Should only render first 2 mentions, third should be plain text
 881        assert!(html.contains(r#"<a href="/did:plc:alice">@alice</a>"#));
 882        assert!(html.contains(r#"<a href="/did:plc:bob">@bob</a>"#));
 883        // Charlie should NOT be a link due to mention limit
 884        assert!(!html.contains(r#"<a href="/did:plc:charlie">"#));
 885    }
 886
 887    #[test]
 888    fn test_parse_urls_multiple_links() {
 889        let text = "IETF124 is happening in Montreal, Nov 1st to 7th https://www.ietf.org/meeting/124/\n\nWe're confirmed for two days of ATProto community sessions on Monday, Nov 3rd & Tuesday, Mov 4th at ECTO Co-Op. Many of us will also be participating in the free-to-attend IETF hackathon on Sunday, Nov 2nd.\n\nLatest updates and attendees in the forum https://discourse.atprotocol.community/t/update-on-timing-and-plan-for-montreal/164";
 890
 891        let url_spans = parse_urls(text);
 892
 893        // Debug output
 894        for (i, span) in url_spans.iter().enumerate() {
 895            println!(
 896                "URL {}: {} (start={}, end={})",
 897                i, span.url, span.start, span.end
 898            );
 899        }
 900
 901        // Should find both URLs
 902        assert_eq!(
 903            url_spans.len(),
 904            2,
 905            "Expected 2 URLs but found {}",
 906            url_spans.len()
 907        );
 908
 909        if !url_spans.is_empty() {
 910            assert_eq!(url_spans[0].url, "https://www.ietf.org/meeting/124/");
 911        }
 912
 913        if url_spans.len() >= 2 {
 914            assert_eq!(
 915                url_spans[1].url,
 916                "https://discourse.atprotocol.community/t/update-on-timing-and-plan-for-montreal/164"
 917            );
 918        }
 919    }
 920
 921    #[test]
 922    fn test_parse_urls_with_html_entity() {
 923        // Test with the HTML entity &amp; in the text
 924        let text = "IETF124 is happening in Montreal, Nov 1st to 7th https://www.ietf.org/meeting/124/\n\nWe're confirmed for two days of ATProto community sessions on Monday, Nov 3rd &amp; Tuesday, Mov 4th at ECTO Co-Op. Many of us will also be participating in the free-to-attend IETF hackathon on Sunday, Nov 2nd.\n\nLatest updates and attendees in the forum https://discourse.atprotocol.community/t/update-on-timing-and-plan-for-montreal/164";
 925
 926        let url_spans = parse_urls(text);
 927
 928        // Debug output
 929        for (i, span) in url_spans.iter().enumerate() {
 930            println!(
 931                "URL {}: {} (start={}, end={})",
 932                i, span.url, span.start, span.end
 933            );
 934            println!(
 935                "  Context before: {:?}",
 936                &text[span.start.saturating_sub(10)..span.start]
 937            );
 938            println!(
 939                "  Context after: {:?}",
 940                &text[span.end..std::cmp::min(span.end + 10, text.len())]
 941            );
 942        }
 943
 944        // Should find both URLs
 945        assert_eq!(
 946            url_spans.len(),
 947            2,
 948            "Expected 2 URLs but found {}",
 949            url_spans.len()
 950        );
 951
 952        if !url_spans.is_empty() {
 953            assert_eq!(url_spans[0].url, "https://www.ietf.org/meeting/124/");
 954        }
 955
 956        if url_spans.len() >= 2 {
 957            assert_eq!(
 958                url_spans[1].url,
 959                "https://discourse.atprotocol.community/t/update-on-timing-and-plan-for-montreal/164"
 960            );
 961        }
 962    }
 963
 964    #[test]
 965    fn test_render_malicious_link() {
 966        let text = "Visit example.com for details";
 967        let limits = FacetLimits::default();
 968        let facets = vec![Facet {
 969            index: ByteSlice {
 970                byte_start: 6,
 971                byte_end: 17,
 972            },
 973            features: vec![FacetFeature::Link(Link {
 974                uri: "javascript:alert('XSS')".to_string(),
 975            })],
 976        }];
 977
 978        let html = render_text_with_facets_html(text, Some(&facets), &limits);
 979        // JavaScript URLs should be blocked
 980        assert!(!html.contains("javascript:"));
 981        assert_eq!(html, "Visit example.com for details");
 982    }
 983
 984    #[test]
 985    fn test_byte_offset_with_html_entities() {
 986        // This test demonstrates that HTML entity escaping shifts byte positions.
 987        // The byte positions shift:
 988        // In original: '&' is at byte 8 (1 byte)
 989        // In escaped: '&amp;' starts at byte 8 (5 bytes)
 990        // This causes facet byte offsets to be misaligned if text is escaped before rendering.
 991
 992        // If we have a URL after the ampersand in the original:
 993        let original_with_url = "Nov 3rd & Tuesday https://example.com";
 994        let escaped_with_url = "Nov 3rd &amp; Tuesday https://example.com";
 995
 996        // Parse URLs from both versions
 997        let original_urls = parse_urls(original_with_url);
 998        let escaped_urls = parse_urls(escaped_with_url);
 999
1000        println!("Original text: {:?}", original_with_url);
1001        println!(
1002            "Original URL found at: {:?}",
1003            original_urls.first().map(|u| (u.start, u.end))
1004        );
1005        println!("Escaped text: {:?}", escaped_with_url);
1006        println!(
1007            "Escaped URL found at: {:?}",
1008            escaped_urls.first().map(|u| (u.start, u.end))
1009        );
1010
1011        // Both should find the URL, but at different byte positions
1012        assert_eq!(original_urls.len(), 1);
1013        assert_eq!(escaped_urls.len(), 1);
1014
1015        // The byte positions will be different
1016        assert_eq!(original_urls[0].start, 18); // After "Nov 3rd & Tuesday "
1017        assert_eq!(escaped_urls[0].start, 22); // After "Nov 3rd &amp; Tuesday " (4 extra bytes for &amp;)
1018    }
1019
1020    #[test]
1021    fn test_render_facets_with_ampersand_in_text() {
1022        // Test case from the bug report: text with & that should have two URL facets
1023        let text = "IETF124 is happening in Montreal, Nov 1st to 7th https://www.ietf.org/meeting/124/\n\nWe're confirmed for two days of ATProto community sessions on Monday, Nov 3rd & Tuesday, Mov 4th at ECTO Co-Op. Many of us will also be participating in the free-to-attend IETF hackathon on Sunday, Nov 2nd.\n\nLatest updates and attendees in the forum https://discourse.atprotocol.community/t/update-on-timing-and-plan-for-montreal/164";
1024
1025        // Parse facets from the original text
1026        let url_spans = parse_urls(text);
1027        assert_eq!(url_spans.len(), 2, "Should find 2 URLs");
1028
1029        // Create facets from the parsed URLs
1030        let facets = vec![
1031            Facet {
1032                index: ByteSlice {
1033                    byte_start: url_spans[0].start,
1034                    byte_end: url_spans[0].end,
1035                },
1036                features: vec![FacetFeature::Link(Link {
1037                    uri: url_spans[0].url.clone(),
1038                })],
1039            },
1040            Facet {
1041                index: ByteSlice {
1042                    byte_start: url_spans[1].start,
1043                    byte_end: url_spans[1].end,
1044                },
1045                features: vec![FacetFeature::Link(Link {
1046                    uri: url_spans[1].url.clone(),
1047                })],
1048            },
1049        ];
1050
1051        // Render with facets - this should work correctly even with & in the text
1052        let limits = FacetLimits::default();
1053        let html = render_text_with_facets_html(text, Some(&facets), &limits);
1054
1055        // Both URLs should be rendered as links
1056        assert!(
1057            html.contains(r#"<a href="https://www.ietf.org/meeting/124/""#),
1058            "First URL should be a link"
1059        );
1060        assert!(html.contains(r#"<a href="https://discourse.atprotocol.community/t/update-on-timing-and-plan-for-montreal/164""#), "Second URL should be a link");
1061
1062        // The ampersand should be HTML-escaped in the output
1063        assert!(
1064            html.contains("&amp;"),
1065            "Ampersand should be escaped in HTML output"
1066        );
1067
1068        // Verify the links are properly closed
1069        assert_eq!(
1070            html.matches("</a>").count(),
1071            2,
1072            "Should have 2 closing </a> tags"
1073        );
1074    }
1075
1076    #[test]
1077    fn test_render_with_out_of_bounds_facet() {
1078        // Regression test for panic: "range end index 324 out of range for slice of length 323"
1079        // This can happen when facets come from external AT Protocol data with incorrect byte offsets
1080        let text = "Hello world"; // 11 bytes
1081        let limits = FacetLimits::default();
1082
1083        // Create a facet that extends beyond the text length
1084        let facets = vec![Facet {
1085            index: ByteSlice {
1086                byte_start: 6,
1087                byte_end: 20, // Beyond text length of 11
1088            },
1089            features: vec![FacetFeature::Link(Link {
1090                uri: "https://example.com".to_string(),
1091            })],
1092        }];
1093
1094        // This should NOT panic - invalid facets should be skipped
1095        let html = render_text_with_facets_html(text, Some(&facets), &limits);
1096
1097        // The text should still be rendered (escaped), just without the invalid facet
1098        assert_eq!(html, "Hello world");
1099    }
1100
1101    #[test]
1102    fn test_render_with_facet_start_beyond_text() {
1103        // Test when facet start is beyond text length
1104        let text = "Short"; // 5 bytes
1105        let limits = FacetLimits::default();
1106
1107        let facets = vec![Facet {
1108            index: ByteSlice {
1109                byte_start: 100, // Way beyond text length
1110                byte_end: 110,
1111            },
1112            features: vec![FacetFeature::Link(Link {
1113                uri: "https://example.com".to_string(),
1114            })],
1115        }];
1116
1117        // Should not panic
1118        let html = render_text_with_facets_html(text, Some(&facets), &limits);
1119        assert_eq!(html, "Short");
1120    }
1121
1122    #[test]
1123    fn test_render_with_inverted_facet_indices() {
1124        // Test when byte_start > byte_end (invalid)
1125        let text = "Hello world";
1126        let limits = FacetLimits::default();
1127
1128        let facets = vec![Facet {
1129            index: ByteSlice {
1130                byte_start: 8,
1131                byte_end: 4, // Invalid: end before start
1132            },
1133            features: vec![FacetFeature::Link(Link {
1134                uri: "https://example.com".to_string(),
1135            })],
1136        }];
1137
1138        // Should not panic
1139        let html = render_text_with_facets_html(text, Some(&facets), &limits);
1140        assert_eq!(html, "Hello world");
1141    }
1142
1143    #[test]
1144    fn test_parse_urls_from_atproto_record_text() {
1145        // Test parsing URLs from real AT Protocol record description text.
1146        // This demonstrates the correct byte positions that should be used for facets.
1147        let text = "Dev, Power Users, and Generally inquisitive folks get a completely unprofessionally amateur interview. Just a yap sesh where chat is part of the call!\n\n✨the daniel✨ & I will be on a Zoom call and I will stream out to https://stream.place/psingletary.com\n\nSubscribe to the publications! https://atprotocalls.leaflet.pub/";
1148
1149        let url_spans = parse_urls(text);
1150
1151        assert_eq!(url_spans.len(), 2, "Should find 2 URLs");
1152
1153        // First URL: https://stream.place/psingletary.com
1154        assert_eq!(url_spans[0].url, "https://stream.place/psingletary.com");
1155        assert_eq!(url_spans[0].start, 221);
1156        assert_eq!(url_spans[0].end, 257);
1157
1158        // Second URL: https://atprotocalls.leaflet.pub/
1159        assert_eq!(url_spans[1].url, "https://atprotocalls.leaflet.pub/");
1160        assert_eq!(url_spans[1].start, 290);
1161        assert_eq!(url_spans[1].end, 323);
1162
1163        // Verify the byte slices match the expected text
1164        let text_bytes = text.as_bytes();
1165        assert_eq!(
1166            std::str::from_utf8(&text_bytes[221..257]).unwrap(),
1167            "https://stream.place/psingletary.com"
1168        );
1169        assert_eq!(
1170            std::str::from_utf8(&text_bytes[290..323]).unwrap(),
1171            "https://atprotocalls.leaflet.pub/"
1172        );
1173
1174        // Note: The AT Protocol record had incorrect facet indices:
1175        // - First link: byteStart=222, byteEnd=258 (should be 221, 257)
1176        // - Second link: byteStart=291, byteEnd=324 (should be 290, 323)
1177        // This off-by-one error was in the source data, not our parser.
1178    }
1179
1180    #[test]
1181    fn test_render_with_off_by_one_facet_indices() {
1182        // Regression test for facets with off-by-one byte indices from external AT Protocol data.
1183        // The facets in this test have byteStart values that are 1 byte too high, causing
1184        // the first character of the URL to appear outside the link tag.
1185        //
1186        // This test documents the current behavior: the renderer faithfully applies facets
1187        // at the specified byte positions, even if those positions are incorrect.
1188        // The root cause is incorrect facet generation by the client that created the record.
1189        let text = "Dev, Power Users, and Generally inquisitive folks get a completely unprofessionally amateur interview. Just a yap sesh where chat is part of the call!\n\n✨the daniel✨ & I will be on a Zoom call and I will stream out to https://stream.place/psingletary.com\n\nSubscribe to the publications! https://atprotocalls.leaflet.pub/";
1190
1191        // Verify text length - the second facet's byte_end (324) exceeds this
1192        assert_eq!(text.len(), 323, "Text should be 323 bytes");
1193
1194        let limits = FacetLimits::default();
1195
1196        // These facets have incorrect byte indices (off by 1) - this is real data from AT Protocol
1197        let facets = vec![
1198            Facet {
1199                index: ByteSlice {
1200                    byte_start: 222, // Should be 221
1201                    byte_end: 258,   // Should be 257 (but 258 is within bounds)
1202                },
1203                features: vec![FacetFeature::Link(Link {
1204                    uri: "https://stream.place/psingletary.com".to_string(),
1205                })],
1206            },
1207            Facet {
1208                index: ByteSlice {
1209                    byte_start: 291, // Should be 290
1210                    byte_end: 324, // Should be 323 - but 324 > text.len() so this facet is SKIPPED
1211                },
1212                features: vec![FacetFeature::Link(Link {
1213                    uri: "https://atprotocalls.leaflet.pub/".to_string(),
1214                })],
1215            },
1216        ];
1217
1218        let html = render_text_with_facets_html(text, Some(&facets), &limits);
1219
1220        // Due to off-by-one facet indices, the 'h' from 'https' appears before the link tag
1221        assert!(
1222            html.contains(r#"stream out to h<a href="https://stream.place/psingletary.com""#),
1223            "First link should have 'h' outside due to off-by-one error. Got: {}",
1224            html
1225        );
1226
1227        // The second facet is SKIPPED entirely because byte_end (324) > text.len() (323)
1228        // This is the bounds check in render_text_with_facets_html preventing out-of-bounds access
1229        assert!(
1230            !html.contains(r#"<a href="https://atprotocalls.leaflet.pub/""#),
1231            "Second link should NOT be rendered because facet is out of bounds. Got: {}",
1232            html
1233        );
1234        assert!(
1235            html.contains("https://atprotocalls.leaflet.pub/"),
1236            "Second URL should appear as plain text. Got: {}",
1237            html
1238        );
1239
1240        // Verify correct byte positions from our parser
1241        let url_spans = parse_urls(text);
1242        assert_eq!(url_spans.len(), 2, "Should find 2 URLs");
1243
1244        // The correct byte positions from our parser
1245        assert_eq!(
1246            url_spans[0].start, 221,
1247            "First URL should start at byte 221, not 222"
1248        );
1249        assert_eq!(
1250            url_spans[0].end, 257,
1251            "First URL should end at byte 257, not 258"
1252        );
1253        assert_eq!(
1254            url_spans[1].start, 290,
1255            "Second URL should start at byte 290, not 291"
1256        );
1257        assert_eq!(
1258            url_spans[1].end, 323,
1259            "Second URL should end at byte 323, not 324"
1260        );
1261    }
1262}