tests for richtext + fixes to sanitization · nonbinary.computer/jacquard@08342a3

nonbinary.computer / jacquard

fork atom

A better Rust ATProto crate

fork atom

tests for richtext + fixes to sanitization

Orual 5 months ago 08342a34 e706c991

build.yml

success 2min 51s

+715 -13

2 changed files

expand all

crates

jacquard

src

richtext

tests.rs

richtext.rs

+70 -13

crates/jacquard/src/richtext.rs

··· 18 18 // https://github.com/bluesky-social/atproto/blob/main/packages/api/src/rich-text/util.ts 19 19 20 20 static MENTION_REGEX: LazyLock<Regex> = 21 - LazyLock::new(|| Regex::new(r"(^|\s|$)(@)([a-zA-Z0-9.-]+)(\b)").unwrap()); 21 + LazyLock::new(|| Regex::new(r"(^|\s|\()(@)([a-zA-Z0-9.:-]+)(\b)").unwrap()); 22 22 23 23 static URL_REGEX: LazyLock<Regex> = LazyLock::new(|| { 24 24 Regex::new(r"(^|\s|\()((https?://[\S]+)|((?<domain>[a-z][a-z0-9]*(\.[a-z0-9]+)+)[\S]*))") ··· 34 34 LazyLock::new(|| Regex::new(r"\[([^\]]+)\]\(([^)]+)$").unwrap()); 35 35 36 36 static TRAILING_PUNCT_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\p{P}+$").unwrap()); 37 + 38 + // Sanitization regex - removes soft hyphens, zero-width chars, normalizes newlines 39 + // Matches one of the special chars, optionally followed by whitespace, repeated 40 + // This ensures at least one special char is in the match (won't match pure spaces) 41 + static SANITIZE_NEWLINES_REGEX: LazyLock<Regex> = LazyLock::new(|| { 42 + Regex::new(r"([\r\n\u{00AD}\u{2060}\u{200D}\u{200C}\u{200B}]\s*)+").unwrap() 43 + }); 37 44 38 45 /// Default domains that support at-URI extraction from URLs 39 46 /// (bsky.app URL patterns like /profile/{actor}/post/{rkey}) ··· 61 68 /// Entry point for parsing text with automatic facet detection 62 69 /// 63 70 /// Uses default embed domains (bsky.app, deer.social) for at-URI extraction. 64 - pub fn parse(text: impl Into<String>) -> RichTextBuilder<Unresolved> { 71 + pub fn parse(text: impl AsRef<str>) -> RichTextBuilder<Unresolved> { 65 72 parse(text) 66 73 } 67 74 ··· 150 157 }, 151 158 } 152 159 160 + /// Sanitize text by removing invisible characters and normalizing newlines 161 + /// 162 + /// This removes: 163 + /// - Soft hyphens (\u{00AD}) 164 + /// - Zero-width non-joiner (\u{200C}) 165 + /// - Zero-width joiner (\u{200D}) 166 + /// - Zero-width space (\u{200B}) 167 + /// - Word joiner (\u{2060}) 168 + /// 169 + /// And normalizes all newline variants (\r\n, \r, \n) to \n, while collapsing 170 + /// runs of newlines and invisible chars to at most two newlines. 171 + fn sanitize_text(text: &str) -> String { 172 + SANITIZE_NEWLINES_REGEX 173 + .replace_all(text, |caps: &regex::Captures| { 174 + let matched = caps.get(0).unwrap().as_str(); 175 + 176 + // Count newline sequences, treating \r\n as one unit 177 + let mut newline_sequences = 0; 178 + let mut chars = matched.chars().peekable(); 179 + 180 + while let Some(c) = chars.next() { 181 + if c == '\r' { 182 + // Check if followed by \n 183 + if chars.peek() == Some(&'\n') { 184 + chars.next(); // consume the \n 185 + } 186 + newline_sequences += 1; 187 + } else if c == '\n' { 188 + newline_sequences += 1; 189 + } 190 + // Skip invisible chars (they don't increment count) 191 + } 192 + 193 + if newline_sequences == 0 { 194 + // Only invisible chars, remove them 195 + "" 196 + } else if newline_sequences == 1 { 197 + "\n" 198 + } else { 199 + // Multiple newlines, collapse to \n\n (paragraph break) 200 + "\n\n" 201 + } 202 + }) 203 + .to_string() 204 + } 205 + 153 206 /// Entry point for parsing text with automatic facet detection 154 207 /// 155 208 /// Uses default embed domains (bsky.app, deer.social) for at-URI extraction. 156 209 /// For custom domains, use [`parse_with_domains`]. 157 - pub fn parse(text: impl Into<String>) -> RichTextBuilder<Unresolved> { 210 + pub fn parse(text: impl AsRef<str>) -> RichTextBuilder<Unresolved> { 158 211 #[cfg(feature = "api_bluesky")] 159 212 { 160 213 parse_with_domains(text, DEFAULT_EMBED_DOMAINS) ··· 171 224 /// that use the same URL patterns for records (e.g., /profile/{actor}/post/{rkey}). 172 225 #[cfg(feature = "api_bluesky")] 173 226 pub fn parse_with_domains( 174 - text: impl Into<String>, 227 + text: impl AsRef<str>, 175 228 embed_domains: &[&str], 176 229 ) -> RichTextBuilder<Unresolved> { 177 - let text = text.into(); 230 + // Step 0: Sanitize text (remove invisible chars, normalize newlines) 231 + let text = sanitize_text(text.as_ref()); 232 + 178 233 let mut facet_candidates = Vec::new(); 179 234 let mut embed_candidates = Vec::new(); 180 235 ··· 230 285 /// Parse text without embed detection (no api_bluesky feature) 231 286 #[cfg(not(feature = "api_bluesky"))] 232 287 pub fn parse_with_domains( 233 - text: impl Into<String>, 288 + text: impl AsRef<str>, 234 289 _embed_domains: &[&str], 235 290 ) -> RichTextBuilder<Unresolved> { 236 - let text = text.into(); 291 + // Step 0: Sanitize text (remove invisible chars, normalize newlines) 292 + let text = sanitize_text(text.as_ref()); 293 + 237 294 let mut facet_candidates = Vec::new(); 238 295 239 296 // Step 1: Detect and strip markdown links first ··· 299 356 300 357 impl<S> RichTextBuilder<S> { 301 358 /// Set the text content 302 - pub fn text(mut self, text: impl Into<String>) -> Self { 303 - self.text = text.into(); 359 + pub fn text(mut self, text: impl AsRef<str>) -> Self { 360 + self.text = sanitize_text(text.as_ref()); 304 361 self 305 362 } 306 363 ··· 800 857 #[cfg(feature = "api_bluesky")] 801 858 impl RichTextBuilder<Unresolved> { 802 859 /// Build richtext, resolving handles to DIDs using the provided resolver 803 - pub async fn build_async<R>( 804 - self, 805 - resolver: &R, 806 - ) -> Result<RichText<'static>, RichTextError> 860 + pub async fn build_async<R>(self, resolver: &R) -> Result<RichText<'static>, RichTextError> 807 861 where 808 862 R: jacquard_identity::resolver::IdentityResolver + Sync, 809 863 { ··· 1065 1119 1066 1120 Ok(None) 1067 1121 } 1122 + 1123 + #[cfg(test)] 1124 + mod tests;

+645

crates/jacquard/src/richtext/tests.rs

··· 1 + use super::*; 2 + 3 + #[test] 4 + fn test_parse_mentions() { 5 + let text = "Hey @alice.bsky.social check this out"; 6 + let builder = RichText::parse(text); 7 + 8 + assert_eq!(builder.facet_candidates.len(), 1); 9 + match &builder.facet_candidates[0] { 10 + FacetCandidate::Mention { range, .. } => { 11 + // Verify the text in the range includes the @ symbol 12 + assert_eq!(&builder.text[range.clone()], "@alice.bsky.social"); 13 + } 14 + _ => panic!("Expected mention facet"), 15 + } 16 + } 17 + 18 + #[test] 19 + fn test_parse_links() { 20 + let text = "Check out https://example.com for more info"; 21 + let builder = RichText::parse(text); 22 + 23 + assert!(builder.facet_candidates.iter().any(|fc| { 24 + matches!(fc, FacetCandidate::Link { range } if text[range.clone()].contains("example.com")) 25 + })); 26 + } 27 + 28 + #[test] 29 + fn test_parse_tags() { 30 + let text = "This is #cool and #awesome"; 31 + let builder = RichText::parse(text); 32 + 33 + let tags: Vec<_> = builder 34 + .facet_candidates 35 + .iter() 36 + .filter_map(|fc| match fc { 37 + FacetCandidate::Tag { range } => Some(&builder.text[range.clone()]), 38 + _ => None, 39 + }) 40 + .collect(); 41 + 42 + assert!(tags.contains(&"#cool")); 43 + assert!(tags.contains(&"#awesome")); 44 + } 45 + 46 + #[test] 47 + fn test_markdown_links() { 48 + let text = "Check out [this link](https://example.com)"; 49 + let builder = RichText::parse(text); 50 + 51 + // Should have stripped markdown syntax 52 + assert!(builder.text.contains("this link")); 53 + assert!(!builder.text.contains("[")); 54 + assert!(!builder.text.contains("]")); 55 + 56 + // Should have detected the link facet 57 + assert!(builder.facet_candidates.iter().any(|fc| matches!( 58 + fc, 59 + FacetCandidate::MarkdownLink { url, .. } if url == "https://example.com" 60 + ))); 61 + } 62 + 63 + #[test] 64 + #[cfg(feature = "api_bluesky")] 65 + fn test_builder_manual_construction() { 66 + let did = crate::types::did::Did::new_static("did:plc:z72i7hdynmk6r22z27h6tvur").unwrap(); 67 + 68 + let result = RichText::builder() 69 + .text("Hello @alice check out example.com".to_string()) 70 + .mention(&did, 6..12) 71 + .link("https://example.com", Some(23..34)) 72 + .build() 73 + .unwrap(); 74 + 75 + assert_eq!(result.text.as_str(), "Hello @alice check out example.com"); 76 + assert!(result.facets.is_some()); 77 + let facets = result.facets.unwrap(); 78 + assert_eq!(facets.len(), 2); 79 + } 80 + 81 + #[test] 82 + #[cfg(feature = "api_bluesky")] 83 + fn test_overlapping_facets_error() { 84 + let did1 = crate::types::did::Did::new_static("did:plc:z72i7hdynmk6r22z27h6tvur").unwrap(); 85 + let did2 = crate::types::did::Did::new_static("did:plc:ewvi7nxzyoun6zhxrhs64oiz").unwrap(); 86 + 87 + let result = RichText::builder() 88 + .text("Hello world".to_string()) 89 + .mention(&did1, 0..5) 90 + .mention(&did2, 3..8) // Overlaps with previous 91 + .build(); 92 + 93 + assert!(matches!( 94 + result, 95 + Err(RichTextError::OverlappingFacets(_, _)) 96 + )); 97 + } 98 + 99 + #[test] 100 + fn test_parse_did_mentions() { 101 + let text = "Hey @did:plc:z72i7hdynmk6r22z27h6tvur check this out"; 102 + let builder = RichText::parse(text); 103 + 104 + assert_eq!(builder.facet_candidates.len(), 1); 105 + match &builder.facet_candidates[0] { 106 + FacetCandidate::Mention { range, did } => { 107 + assert_eq!(&text[range.clone()], "@did:plc:z72i7hdynmk6r22z27h6tvur"); 108 + assert!(did.is_some()); // DID should be pre-resolved 109 + } 110 + _ => panic!("Expected mention facet"), 111 + } 112 + } 113 + 114 + #[test] 115 + fn test_bare_domain_link() { 116 + let text = "Visit example.com for info"; 117 + let builder = RichText::parse(text); 118 + 119 + assert!(builder.facet_candidates.iter().any(|fc| { 120 + matches!(fc, FacetCandidate::Link { range } if text[range.clone()].contains("example.com")) 121 + })); 122 + } 123 + 124 + #[test] 125 + fn test_trailing_punctuation_stripped() { 126 + let text = "Check https://example.com, and https://test.org."; 127 + let builder = RichText::parse(text); 128 + 129 + // Count link facets 130 + let link_count = builder 131 + .facet_candidates 132 + .iter() 133 + .filter(|fc| matches!(fc, FacetCandidate::Link { .. })) 134 + .count(); 135 + 136 + assert_eq!(link_count, 2); 137 + 138 + // Verify punctuation is not included in ranges 139 + for fc in &builder.facet_candidates { 140 + if let FacetCandidate::Link { range } = fc { 141 + let url = &text[range.clone()]; 142 + assert!(!url.ends_with(',')); 143 + assert!(!url.ends_with('.')); 144 + } 145 + } 146 + } 147 + 148 + #[test] 149 + #[cfg(feature = "api_bluesky")] 150 + fn test_embed_detection_external() { 151 + let text = "Check out https://external.com/article"; 152 + let builder = RichText::parse(text); 153 + 154 + assert!(builder.embed_candidates.is_some()); 155 + let embeds = builder.embed_candidates.unwrap(); 156 + assert_eq!(embeds.len(), 1); 157 + 158 + match &embeds[0] { 159 + EmbedCandidate::External { url, metadata } => { 160 + assert!(url.contains("external.com")); 161 + assert!(metadata.is_none()); 162 + } 163 + _ => panic!("Expected external embed"), 164 + } 165 + } 166 + 167 + #[test] 168 + #[cfg(feature = "api_bluesky")] 169 + fn test_embed_detection_bsky_post() { 170 + let text = "See https://bsky.app/profile/alice.bsky.social/post/abc123"; 171 + let builder = RichText::parse(text); 172 + 173 + assert!(builder.embed_candidates.is_some()); 174 + let embeds = builder.embed_candidates.unwrap(); 175 + assert_eq!(embeds.len(), 1); 176 + 177 + match &embeds[0] { 178 + EmbedCandidate::Record { at_uri, .. } => { 179 + assert_eq!( 180 + at_uri.as_str(), 181 + "at://alice.bsky.social/app.bsky.feed.post/abc123" 182 + ); 183 + } 184 + _ => panic!("Expected record embed"), 185 + } 186 + } 187 + 188 + #[test] 189 + #[cfg(feature = "api_bluesky")] 190 + fn test_markdown_link_with_embed() { 191 + let text = "Read [my post](https://bsky.app/profile/me.bsky.social/post/xyz)"; 192 + let builder = RichText::parse(text); 193 + 194 + // Should have markdown facet 195 + assert!( 196 + builder 197 + .facet_candidates 198 + .iter() 199 + .any(|fc| matches!(fc, FacetCandidate::MarkdownLink { .. })) 200 + ); 201 + 202 + // Should also detect embed 203 + assert!(builder.embed_candidates.is_some()); 204 + let embeds = builder.embed_candidates.unwrap(); 205 + assert_eq!(embeds.len(), 1); 206 + } 207 + 208 + // === Sanitization Tests === 209 + 210 + #[test] 211 + fn test_sanitize_soft_hyphen() { 212 + // Soft hyphens should be removed 213 + let text = "Hello\u{00AD}World"; 214 + let builder = RichText::parse(text); 215 + 216 + assert_eq!(builder.text, "HelloWorld"); 217 + } 218 + 219 + #[test] 220 + fn test_sanitize_zero_width_space() { 221 + // Zero-width spaces should be removed 222 + let text = "Hello\u{200B}World"; 223 + let builder = RichText::parse(text); 224 + 225 + assert_eq!(builder.text, "HelloWorld"); 226 + } 227 + 228 + #[test] 229 + fn test_sanitize_normalize_newlines() { 230 + // \r\n should normalize to \n 231 + let text = "Hello\r\nWorld"; 232 + let builder = RichText::parse(text); 233 + 234 + assert_eq!(builder.text, "Hello\nWorld"); 235 + } 236 + 237 + #[test] 238 + fn test_sanitize_collapse_multiple_newlines() { 239 + // Multiple newlines should collapse to \n\n 240 + let text = "Hello\n\n\n\nWorld"; 241 + let builder = RichText::parse(text); 242 + 243 + assert_eq!(builder.text, "Hello\n\nWorld"); 244 + } 245 + 246 + #[test] 247 + fn test_sanitize_mixed_invisible_and_newlines() { 248 + // Mix of invisible chars and newlines 249 + let text = "Hello\u{200B}\n\u{200C}\n\u{00AD}World"; 250 + let builder = RichText::parse(text); 251 + 252 + assert_eq!(builder.text, "Hello\n\nWorld"); 253 + } 254 + 255 + #[test] 256 + fn test_sanitize_preserves_facets() { 257 + // Make sure sanitization doesn't break facet detection 258 + let text = "Hey @alice.bsky.social\u{200B} check\u{00AD}out https://example.com"; 259 + let builder = RichText::parse(text); 260 + 261 + // Should still detect both mention and link 262 + assert!(builder 263 + .facet_candidates 264 + .iter() 265 + .any(|fc| matches!(fc, FacetCandidate::Mention { .. }))); 266 + assert!(builder 267 + .facet_candidates 268 + .iter() 269 + .any(|fc| matches!(fc, FacetCandidate::Link { .. }))); 270 + } 271 + 272 + #[test] 273 + fn test_sanitize_newlines_with_spaces() { 274 + // Newlines with spaces between should collapse 275 + let text = "Hello\n \n \nWorld"; 276 + let builder = RichText::parse(text); 277 + 278 + // 3 newlines with spaces -> collapses to \n\n 279 + assert_eq!(builder.text, "Hello\n\nWorld"); 280 + } 281 + 282 + #[test] 283 + fn test_sanitize_preserves_no_excess_newlines() { 284 + // Text without excessive newlines should be unchanged 285 + let text = "Hello\nWorld"; 286 + let builder = RichText::parse(text); 287 + 288 + assert_eq!(builder.text, "Hello\nWorld"); 289 + } 290 + 291 + #[test] 292 + fn test_sanitize_empty_input() { 293 + // Empty string should remain empty 294 + let text = ""; 295 + let builder = RichText::parse(text); 296 + 297 + assert_eq!(builder.text, ""); 298 + } 299 + 300 + #[test] 301 + fn test_sanitize_only_invisible_chars() { 302 + // Only invisible chars should be removed entirely 303 + let text = "\u{200B}\u{200C}\u{200D}\u{00AD}"; 304 + let builder = RichText::parse(text); 305 + 306 + assert_eq!(builder.text, ""); 307 + } 308 + 309 + #[test] 310 + fn test_sanitize_cr_normalization() { 311 + // Standalone \r should normalize to \n 312 + let text = "Hello\rWorld"; 313 + let builder = RichText::parse(text); 314 + 315 + assert_eq!(builder.text, "Hello\nWorld"); 316 + } 317 + 318 + #[test] 319 + fn test_sanitize_mixed_line_endings() { 320 + // Mix of \r\n, \r, \n should all normalize 321 + let text = "Line1\r\nLine2\rLine3\nLine4"; 322 + let builder = RichText::parse(text); 323 + 324 + assert_eq!(builder.text, "Line1\nLine2\nLine3\nLine4"); 325 + } 326 + 327 + #[test] 328 + fn test_sanitize_preserves_regular_spaces() { 329 + // Regular spaces without newlines should be preserved 330 + let text = "Hello World"; 331 + let builder = RichText::parse(text); 332 + 333 + assert_eq!(builder.text, "Hello World"); 334 + } 335 + 336 + // === Adversarial / Edge Case Tests === 337 + 338 + #[test] 339 + fn test_tag_too_long() { 340 + // Tags must be 64 chars or less 341 + let long_tag = "a".repeat(65); 342 + let text = format!("#{}", long_tag); 343 + let builder = RichText::parse(text); 344 + 345 + // Should NOT detect the tag 346 + assert!(builder 347 + .facet_candidates 348 + .iter() 349 + .all(|fc| !matches!(fc, FacetCandidate::Tag { .. }))); 350 + } 351 + 352 + #[test] 353 + fn test_tag_with_zero_width_chars() { 354 + // Zero-width joiners and other invisible unicode 355 + let text = "This is #cool\u{200B}tag"; 356 + let builder = RichText::parse(text); 357 + 358 + // Tag should stop at zero-width char 359 + let tags: Vec<_> = builder 360 + .facet_candidates 361 + .iter() 362 + .filter_map(|fc| match fc { 363 + FacetCandidate::Tag { range } => Some(&builder.text[range.clone()]), 364 + _ => None, 365 + }) 366 + .collect(); 367 + 368 + // Should only capture up to the zero-width char 369 + assert!(tags.iter().any(|t| t.starts_with("#cool"))); 370 + } 371 + 372 + #[test] 373 + fn test_url_with_parens() { 374 + // URLs like wikipedia with (parens) in them 375 + let text = "See https://en.wikipedia.org/wiki/Rust_(programming_language)"; 376 + let builder = RichText::parse(text); 377 + 378 + // Should capture the full URL including parens 379 + assert!(builder.facet_candidates.iter().any(|fc| { 380 + matches!(fc, FacetCandidate::Link { range } if text[range.clone()].contains("programming_language")) 381 + })); 382 + } 383 + 384 + #[test] 385 + fn test_markdown_link_unclosed() { 386 + // Malformed markdown should not be processed 387 + let text = "This is [unclosed link"; 388 + let builder = RichText::parse(text); 389 + 390 + // Should not detect markdown link, text unchanged 391 + assert_eq!(builder.text, text); 392 + assert!(builder 393 + .facet_candidates 394 + .iter() 395 + .all(|fc| !matches!(fc, FacetCandidate::MarkdownLink { .. }))); 396 + } 397 + 398 + #[test] 399 + fn test_nested_markdown_attempts() { 400 + // Try to nest markdown links 401 + let text = "[[nested](https://inner.com)](https://outer.com)"; 402 + let builder = RichText::parse(text); 403 + 404 + // Should only match the innermost valid pattern 405 + let markdown_count = builder 406 + .facet_candidates 407 + .iter() 408 + .filter(|fc| matches!(fc, FacetCandidate::MarkdownLink { .. })) 409 + .count(); 410 + 411 + // Regex should match leftmost, should get one 412 + assert!(markdown_count > 0); 413 + } 414 + 415 + #[test] 416 + fn test_mention_with_emoji() { 417 + // Handles can't have emoji but let's make sure it doesn't crash 418 + let text = "Hey @alice😎.bsky.social wassup"; 419 + let builder = RichText::parse(text); 420 + 421 + // Should not match or should stop at emoji 422 + let mentions: Vec<_> = builder 423 + .facet_candidates 424 + .iter() 425 + .filter_map(|fc| match fc { 426 + FacetCandidate::Mention { range, .. } => Some(&text[range.clone()]), 427 + _ => None, 428 + }) 429 + .collect(); 430 + 431 + // Either no mentions or mention stops before emoji 432 + for mention in mentions { 433 + assert!(!mention.contains('😎')); 434 + } 435 + } 436 + 437 + #[test] 438 + fn test_handle_with_trailing_dots() { 439 + // Handles like @alice... should not include trailing dots 440 + let text = "Hey @alice.bsky.social... how are you"; 441 + let builder = RichText::parse(text); 442 + 443 + if let Some(FacetCandidate::Mention { range, .. }) = builder.facet_candidates.first() { 444 + let mention = &text[range.clone()]; 445 + assert!(!mention.ends_with('.')); 446 + } 447 + } 448 + 449 + #[test] 450 + fn test_url_javascript_protocol() { 451 + // Should not detect javascript: or data: URLs 452 + let text = "Click javascript:alert(1) or data:text/html,<script>alert(1)</script>"; 453 + let builder = RichText::parse(text); 454 + 455 + // Should not match non-http(s) URLs 456 + for fc in &builder.facet_candidates { 457 + if let FacetCandidate::Link { range } = fc { 458 + let url = &text[range.clone()]; 459 + assert!(!url.starts_with("javascript:")); 460 + assert!(!url.starts_with("data:")); 461 + } 462 + } 463 + } 464 + 465 + #[test] 466 + fn test_extremely_long_url() { 467 + // Very long URL should still work (no panic) 468 + let long_path = "a/".repeat(1000); 469 + let text = format!("Visit https://example.com/{}", long_path); 470 + let builder = RichText::parse(text); 471 + 472 + // Should detect the URL without panicking 473 + assert!(builder 474 + .facet_candidates 475 + .iter() 476 + .any(|fc| matches!(fc, FacetCandidate::Link { .. }))); 477 + } 478 + 479 + #[test] 480 + fn test_empty_string() { 481 + let text = ""; 482 + let builder = RichText::parse(text); 483 + 484 + assert_eq!(builder.text, ""); 485 + assert!(builder.facet_candidates.is_empty()); 486 + } 487 + 488 + #[test] 489 + fn test_only_whitespace() { 490 + let text = " \t\n "; 491 + let builder = RichText::parse(text); 492 + 493 + assert!(builder.facet_candidates.is_empty()); 494 + } 495 + 496 + #[test] 497 + fn test_markdown_with_newlines() { 498 + // Markdown regex should not match across newlines 499 + let text = "This is [text\nwith](https://example.com) newline"; 500 + let builder = RichText::parse(text); 501 + 502 + // Current regex won't match \n in the display text part 503 + // Just make sure it doesn't panic 504 + let _ = builder.facet_candidates; 505 + } 506 + 507 + #[test] 508 + fn test_multiple_at_signs() { 509 + // @@alice should only match @alice 510 + let text = "Hey @@alice.bsky.social"; 511 + let builder = RichText::parse(text); 512 + 513 + // Regex requires word boundary before @, so @@ might not match 514 + // Or might match the second @ 515 + // Just verify it doesn't panic and produces valid ranges 516 + for fc in &builder.facet_candidates { 517 + if let FacetCandidate::Mention { range, .. } = fc { 518 + assert!(range.end <= text.len()); 519 + let _ = &text[range.clone()]; // Shouldn't panic 520 + } 521 + } 522 + } 523 + 524 + #[test] 525 + fn test_url_with_unicode_domain() { 526 + // IDN domains 527 + let text = "Visit https://例え.jp for info"; 528 + let builder = RichText::parse(text); 529 + 530 + // Current regex only matches ASCII domains, so this might not detect 531 + // Just make sure no panic 532 + let _ = builder.facet_candidates; 533 + } 534 + 535 + #[test] 536 + #[cfg(feature = "api_bluesky")] 537 + fn test_build_with_invalid_range() { 538 + let did = crate::types::did::Did::new_static("did:plc:z72i7hdynmk6r22z27h6tvur").unwrap(); 539 + 540 + // Range exceeds text length 541 + let result = RichText::builder() 542 + .text("Short".to_string()) 543 + .mention(&did, 0..100) 544 + .build(); 545 + 546 + assert!(matches!( 547 + result, 548 + Err(RichTextError::InvalidRange { .. }) 549 + )); 550 + } 551 + 552 + #[test] 553 + fn test_rtl_override_injection() { 554 + // Right-to-left override character attempts 555 + let text = "Hey @alice\u{202E}reversed\u{202C}.bsky.social"; 556 + let builder = RichText::parse(text); 557 + 558 + // Should either not match or handle gracefully 559 + let _ = builder.facet_candidates; 560 + } 561 + 562 + #[test] 563 + fn test_tag_empty_after_hash() { 564 + // Just # with nothing after 565 + let text = "This is # a test"; 566 + let builder = RichText::parse(text); 567 + 568 + // Should not detect empty tag 569 + assert!(builder 570 + .facet_candidates 571 + .iter() 572 + .all(|fc| !matches!(fc, FacetCandidate::Tag { .. }))); 573 + } 574 + 575 + // === Unicode Byte Boundary Tests === 576 + 577 + #[test] 578 + fn test_facet_ranges_valid_utf8_boundaries() { 579 + // All detected facet ranges must be valid UTF-8 boundaries 580 + let text = "Hey @alice.bsky.social check 你好 #tag🔥 and https://example.com/测试"; 581 + let builder = RichText::parse(text); 582 + 583 + for fc in &builder.facet_candidates { 584 + let range = match fc { 585 + FacetCandidate::Mention { range, .. } => range, 586 + FacetCandidate::Link { range } => range, 587 + FacetCandidate::Tag { range } => range, 588 + FacetCandidate::MarkdownLink { display_range, .. } => display_range, 589 + }; 590 + 591 + // This will panic if range is not on UTF-8 char boundary 592 + // Use builder.text (sanitized) not original text 593 + let slice = &builder.text[range.clone()]; 594 + // Verify it's valid UTF-8 595 + assert!(std::str::from_utf8(slice.as_bytes()).is_ok()); 596 + } 597 + } 598 + 599 + #[test] 600 + fn test_emoji_grapheme_clusters() { 601 + // Family emoji with ZWJ sequences: "👨‍👩‍👧‍👧" is 25 bytes but 1 grapheme 602 + let text = "Hello 👨‍👩‍👧‍👧 @alice.bsky.social"; 603 + let builder = RichText::parse(text); 604 + 605 + // Should still detect the mention after the emoji 606 + assert!(builder 607 + .facet_candidates 608 + .iter() 609 + .any(|fc| matches!(fc, FacetCandidate::Mention { .. }))); 610 + 611 + // Verify all ranges are valid against the sanitized text 612 + for fc in &builder.facet_candidates { 613 + if let FacetCandidate::Mention { range, .. } = fc { 614 + let _ = &builder.text[range.clone()]; // Shouldn't panic 615 + } 616 + } 617 + } 618 + 619 + #[test] 620 + fn test_tag_with_emoji() { 621 + // Tags can contain emoji 622 + let text = "This is #cool🔥"; 623 + let builder = RichText::parse(text); 624 + 625 + let tags: Vec<_> = builder 626 + .facet_candidates 627 + .iter() 628 + .filter_map(|fc| match fc { 629 + FacetCandidate::Tag { range } => Some(&builder.text[range.clone()]), 630 + _ => None, 631 + }) 632 + .collect(); 633 + 634 + // Should include emoji in tag 635 + assert!(tags.iter().any(|t| t.contains("🔥"))); 636 + } 637 + 638 + #[test] 639 + fn test_sanitize_newlines_with_emoji() { 640 + // Newlines with emoji should still collapse correctly 641 + let text = "Hello 🎉\n\n\n\nWorld 🌍"; 642 + let builder = RichText::parse(text); 643 + 644 + assert_eq!(builder.text, "Hello 🎉\n\nWorld 🌍"); 645 + }