···1818// https://github.com/bluesky-social/atproto/blob/main/packages/api/src/rich-text/util.ts
19192020static MENTION_REGEX: LazyLock<Regex> =
2121- LazyLock::new(|| Regex::new(r"(^|\s|\()(@)([a-zA-Z0-9.-]+)(\b)").unwrap());
2121+ LazyLock::new(|| Regex::new(r"(^|\s|\()(@)([a-zA-Z0-9.:-]+)(\b)").unwrap());
22222323static URL_REGEX: LazyLock<Regex> = LazyLock::new(|| {
2424 Regex::new(r"(^|\s|\()((https?://[\S]+)|((?<domain>[a-z][a-z0-9]*(\.[a-z0-9]+)+)[\S]*))")
···3434 LazyLock::new(|| Regex::new(r"\[([^\]]+)\]\(([^)]+)\)").unwrap());
35353636static TRAILING_PUNCT_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\p{P}+$").unwrap());
3737+3838+// Sanitization regex - removes soft hyphens, zero-width chars, normalizes newlines
3939+// Matches one of the special chars, optionally followed by whitespace, repeated
4040+// This ensures at least one special char is in the match (won't match pure spaces)
4141+static SANITIZE_NEWLINES_REGEX: LazyLock<Regex> = LazyLock::new(|| {
4242+ Regex::new(r"([\r\n\u{00AD}\u{2060}\u{200D}\u{200C}\u{200B}]\s*)+").unwrap()
4343+});
37443845/// Default domains that support at-URI extraction from URLs
3946/// (bsky.app URL patterns like /profile/{actor}/post/{rkey})
···6168 /// Entry point for parsing text with automatic facet detection
6269 ///
6370 /// Uses default embed domains (bsky.app, deer.social) for at-URI extraction.
6464- pub fn parse(text: impl Into<String>) -> RichTextBuilder<Unresolved> {
7171+ pub fn parse(text: impl AsRef<str>) -> RichTextBuilder<Unresolved> {
6572 parse(text)
6673 }
6774···150157 },
151158}
152159160160+/// Sanitize text by removing invisible characters and normalizing newlines
161161+///
162162+/// This removes:
163163+/// - Soft hyphens (\u{00AD})
164164+/// - Zero-width non-joiner (\u{200C})
165165+/// - Zero-width joiner (\u{200D})
166166+/// - Zero-width space (\u{200B})
167167+/// - Word joiner (\u{2060})
168168+///
169169+/// And normalizes all newline variants (\r\n, \r, \n) to \n, while collapsing
170170+/// runs of newlines and invisible chars to at most two newlines.
171171+fn sanitize_text(text: &str) -> String {
172172+ SANITIZE_NEWLINES_REGEX
173173+ .replace_all(text, |caps: ®ex::Captures| {
174174+ let matched = caps.get(0).unwrap().as_str();
175175+176176+ // Count newline sequences, treating \r\n as one unit
177177+ let mut newline_sequences = 0;
178178+ let mut chars = matched.chars().peekable();
179179+180180+ while let Some(c) = chars.next() {
181181+ if c == '\r' {
182182+ // Check if followed by \n
183183+ if chars.peek() == Some(&'\n') {
184184+ chars.next(); // consume the \n
185185+ }
186186+ newline_sequences += 1;
187187+ } else if c == '\n' {
188188+ newline_sequences += 1;
189189+ }
190190+ // Skip invisible chars (they don't increment count)
191191+ }
192192+193193+ if newline_sequences == 0 {
194194+ // Only invisible chars, remove them
195195+ ""
196196+ } else if newline_sequences == 1 {
197197+ "\n"
198198+ } else {
199199+ // Multiple newlines, collapse to \n\n (paragraph break)
200200+ "\n\n"
201201+ }
202202+ })
203203+ .to_string()
204204+}
205205+153206/// Entry point for parsing text with automatic facet detection
154207///
155208/// Uses default embed domains (bsky.app, deer.social) for at-URI extraction.
156209/// For custom domains, use [`parse_with_domains`].
157157-pub fn parse(text: impl Into<String>) -> RichTextBuilder<Unresolved> {
210210+pub fn parse(text: impl AsRef<str>) -> RichTextBuilder<Unresolved> {
158211 #[cfg(feature = "api_bluesky")]
159212 {
160213 parse_with_domains(text, DEFAULT_EMBED_DOMAINS)
···171224/// that use the same URL patterns for records (e.g., /profile/{actor}/post/{rkey}).
172225#[cfg(feature = "api_bluesky")]
173226pub fn parse_with_domains(
174174- text: impl Into<String>,
227227+ text: impl AsRef<str>,
175228 embed_domains: &[&str],
176229) -> RichTextBuilder<Unresolved> {
177177- let text = text.into();
230230+ // Step 0: Sanitize text (remove invisible chars, normalize newlines)
231231+ let text = sanitize_text(text.as_ref());
232232+178233 let mut facet_candidates = Vec::new();
179234 let mut embed_candidates = Vec::new();
180235···230285/// Parse text without embed detection (no api_bluesky feature)
231286#[cfg(not(feature = "api_bluesky"))]
232287pub fn parse_with_domains(
233233- text: impl Into<String>,
288288+ text: impl AsRef<str>,
234289 _embed_domains: &[&str],
235290) -> RichTextBuilder<Unresolved> {
236236- let text = text.into();
291291+ // Step 0: Sanitize text (remove invisible chars, normalize newlines)
292292+ let text = sanitize_text(text.as_ref());
293293+237294 let mut facet_candidates = Vec::new();
238295239296 // Step 1: Detect and strip markdown links first
···299356300357impl<S> RichTextBuilder<S> {
301358 /// Set the text content
302302- pub fn text(mut self, text: impl Into<String>) -> Self {
303303- self.text = text.into();
359359+ pub fn text(mut self, text: impl AsRef<str>) -> Self {
360360+ self.text = sanitize_text(text.as_ref());
304361 self
305362 }
306363···800857#[cfg(feature = "api_bluesky")]
801858impl RichTextBuilder<Unresolved> {
802859 /// Build richtext, resolving handles to DIDs using the provided resolver
803803- pub async fn build_async<R>(
804804- self,
805805- resolver: &R,
806806- ) -> Result<RichText<'static>, RichTextError>
860860+ pub async fn build_async<R>(self, resolver: &R) -> Result<RichText<'static>, RichTextError>
807861 where
808862 R: jacquard_identity::resolver::IdentityResolver + Sync,
809863 {
···1065111910661120 Ok(None)
10671121}
11221122+11231123+#[cfg(test)]
11241124+mod tests;
+645
crates/jacquard/src/richtext/tests.rs
···11+use super::*;

/// A plain handle mention yields exactly one mention candidate whose range
/// includes the leading `@`.
#[test]
fn test_parse_mentions() {
    let text = "Hey @alice.bsky.social check this out";
    let builder = RichText::parse(text);

    assert_eq!(builder.facet_candidates.len(), 1);
    match &builder.facet_candidates[0] {
        FacetCandidate::Mention { range, .. } => {
            // Verify the text in the range includes the @ symbol
            assert_eq!(&builder.text[range.clone()], "@alice.bsky.social");
        }
        _ => panic!("Expected mention facet"),
    }
}

/// An explicit `https://` URL is detected as a link candidate.
#[test]
fn test_parse_links() {
    let text = "Check out https://example.com for more info";
    let builder = RichText::parse(text);

    // Ranges index the parsed text, so slice `builder.text`, not the input.
    assert!(builder.facet_candidates.iter().any(|fc| {
        matches!(fc, FacetCandidate::Link { range } if builder.text[range.clone()].contains("example.com"))
    }));
}

/// Every `#tag` in the text is detected, with the `#` included in the range.
#[test]
fn test_parse_tags() {
    let text = "This is #cool and #awesome";
    let builder = RichText::parse(text);

    let tags: Vec<_> = builder
        .facet_candidates
        .iter()
        .filter_map(|fc| match fc {
            FacetCandidate::Tag { range } => Some(&builder.text[range.clone()]),
            _ => None,
        })
        .collect();

    assert!(tags.contains(&"#cool"));
    assert!(tags.contains(&"#awesome"));
}

/// Markdown link syntax is stripped from the text and recorded as a facet.
#[test]
fn test_markdown_links() {
    let text = "Check out [this link](https://example.com)";
    let builder = RichText::parse(text);

    // Should have stripped markdown syntax
    assert!(builder.text.contains("this link"));
    assert!(!builder.text.contains("["));
    assert!(!builder.text.contains("]"));

    // Should have detected the link facet
    assert!(builder.facet_candidates.iter().any(|fc| matches!(
        fc,
        FacetCandidate::MarkdownLink { url, .. } if url == "https://example.com"
    )));
}

/// Facets added by hand through the builder end up in the built rich text.
#[test]
#[cfg(feature = "api_bluesky")]
fn test_builder_manual_construction() {
    let did = crate::types::did::Did::new_static("did:plc:z72i7hdynmk6r22z27h6tvur").unwrap();

    let result = RichText::builder()
        .text("Hello @alice check out example.com")
        .mention(&did, 6..12)
        .link("https://example.com", Some(23..34))
        .build()
        .unwrap();

    assert_eq!(result.text.as_str(), "Hello @alice check out example.com");
    assert!(result.facets.is_some());
    let facets = result.facets.unwrap();
    assert_eq!(facets.len(), 2);
}

/// Two manually added facets with intersecting byte ranges are rejected.
#[test]
#[cfg(feature = "api_bluesky")]
fn test_overlapping_facets_error() {
    let did1 = crate::types::did::Did::new_static("did:plc:z72i7hdynmk6r22z27h6tvur").unwrap();
    let did2 = crate::types::did::Did::new_static("did:plc:ewvi7nxzyoun6zhxrhs64oiz").unwrap();

    let result = RichText::builder()
        .text("Hello world")
        .mention(&did1, 0..5)
        .mention(&did2, 3..8) // Overlaps with previous
        .build();

    assert!(matches!(
        result,
        Err(RichTextError::OverlappingFacets(_, _))
    ));
}

/// A `@did:...` mention is detected and carries its DID without resolution.
#[test]
fn test_parse_did_mentions() {
    let text = "Hey @did:plc:z72i7hdynmk6r22z27h6tvur check this out";
    let builder = RichText::parse(text);

    assert_eq!(builder.facet_candidates.len(), 1);
    match &builder.facet_candidates[0] {
        FacetCandidate::Mention { range, did } => {
            assert_eq!(
                &builder.text[range.clone()],
                "@did:plc:z72i7hdynmk6r22z27h6tvur"
            );
            assert!(did.is_some()); // DID should be pre-resolved
        }
        _ => panic!("Expected mention facet"),
    }
}

/// A bare domain without a scheme is still detected as a link.
#[test]
fn test_bare_domain_link() {
    let text = "Visit example.com for info";
    let builder = RichText::parse(text);

    assert!(builder.facet_candidates.iter().any(|fc| {
        matches!(fc, FacetCandidate::Link { range } if builder.text[range.clone()].contains("example.com"))
    }));
}

/// Punctuation directly after a URL is excluded from the link range.
#[test]
fn test_trailing_punctuation_stripped() {
    let text = "Check https://example.com, and https://test.org.";
    let builder = RichText::parse(text);

    // Count link facets
    let link_count = builder
        .facet_candidates
        .iter()
        .filter(|fc| matches!(fc, FacetCandidate::Link { .. }))
        .count();

    assert_eq!(link_count, 2);

    // Verify punctuation is not included in ranges
    for fc in &builder.facet_candidates {
        if let FacetCandidate::Link { range } = fc {
            let url = &builder.text[range.clone()];
            assert!(!url.ends_with(','));
            assert!(!url.ends_with('.'));
        }
    }
}

/// A link to an unrelated site produces an external embed candidate with no
/// metadata attached yet.
#[test]
#[cfg(feature = "api_bluesky")]
fn test_embed_detection_external() {
    let text = "Check out https://external.com/article";
    let builder = RichText::parse(text);

    assert!(builder.embed_candidates.is_some());
    let embeds = builder.embed_candidates.unwrap();
    assert_eq!(embeds.len(), 1);

    match &embeds[0] {
        EmbedCandidate::External { url, metadata } => {
            assert!(url.contains("external.com"));
            assert!(metadata.is_none());
        }
        _ => panic!("Expected external embed"),
    }
}

/// A bsky.app post URL is turned into a record embed with the matching at-URI.
#[test]
#[cfg(feature = "api_bluesky")]
fn test_embed_detection_bsky_post() {
    let text = "See https://bsky.app/profile/alice.bsky.social/post/abc123";
    let builder = RichText::parse(text);

    assert!(builder.embed_candidates.is_some());
    let embeds = builder.embed_candidates.unwrap();
    assert_eq!(embeds.len(), 1);

    match &embeds[0] {
        EmbedCandidate::Record { at_uri, .. } => {
            assert_eq!(
                at_uri.as_str(),
                "at://alice.bsky.social/app.bsky.feed.post/abc123"
            );
        }
        _ => panic!("Expected record embed"),
    }
}

/// A markdown link pointing at a post yields both a facet and an embed.
#[test]
#[cfg(feature = "api_bluesky")]
fn test_markdown_link_with_embed() {
    let text = "Read [my post](https://bsky.app/profile/me.bsky.social/post/xyz)";
    let builder = RichText::parse(text);

    // Should have markdown facet
    assert!(
        builder
            .facet_candidates
            .iter()
            .any(|fc| matches!(fc, FacetCandidate::MarkdownLink { .. }))
    );

    // Should also detect embed
    assert!(builder.embed_candidates.is_some());
    let embeds = builder.embed_candidates.unwrap();
    assert_eq!(embeds.len(), 1);
}

// === Sanitization Tests ===

/// U+00AD (soft hyphen) is dropped from the parsed text.
#[test]
fn test_sanitize_soft_hyphen() {
    let input = "Hello\u{00AD}World";
    let parsed = RichText::parse(input);

    assert_eq!(parsed.text, "HelloWorld");
}

/// U+200B (zero-width space) is dropped from the parsed text.
#[test]
fn test_sanitize_zero_width_space() {
    let input = "Hello\u{200B}World";
    let parsed = RichText::parse(input);

    assert_eq!(parsed.text, "HelloWorld");
}

/// A Windows line ending becomes a single `\n`.
#[test]
fn test_sanitize_normalize_newlines() {
    let input = "Hello\r\nWorld";
    let parsed = RichText::parse(input);

    assert_eq!(parsed.text, "Hello\nWorld");
}

/// Four consecutive newlines are reduced to a paragraph break (`\n\n`).
#[test]
fn test_sanitize_collapse_multiple_newlines() {
    let input = "Hello\n\n\n\nWorld";
    let parsed = RichText::parse(input);

    assert_eq!(parsed.text, "Hello\n\nWorld");
}

/// Invisible characters interleaved with two newlines leave just `\n\n`.
#[test]
fn test_sanitize_mixed_invisible_and_newlines() {
    let input = "Hello\u{200B}\n\u{200C}\n\u{00AD}World";
    let parsed = RichText::parse(input);

    assert_eq!(parsed.text, "Hello\n\nWorld");
}

/// Mentions and links are still found in text that contained invisible chars.
#[test]
fn test_sanitize_preserves_facets() {
    let input = "Hey @alice.bsky.social\u{200B} check\u{00AD}out https://example.com";
    let parsed = RichText::parse(input);

    // Both kinds of facet must survive sanitization.
    let has_mention = parsed
        .facet_candidates
        .iter()
        .any(|fc| matches!(fc, FacetCandidate::Mention { .. }));
    let has_link = parsed
        .facet_candidates
        .iter()
        .any(|fc| matches!(fc, FacetCandidate::Link { .. }));

    assert!(has_mention);
    assert!(has_link);
}

/// Newlines separated only by spaces count as one run and collapse together.
#[test]
fn test_sanitize_newlines_with_spaces() {
    let input = "Hello\n \n \nWorld";
    let parsed = RichText::parse(input);

    // Three newlines with spaces in between -> one paragraph break
    assert_eq!(parsed.text, "Hello\n\nWorld");
}

/// A single newline is left exactly as it was.
#[test]
fn test_sanitize_preserves_no_excess_newlines() {
    let input = "Hello\nWorld";
    let parsed = RichText::parse(input);

    assert_eq!(parsed.text, "Hello\nWorld");
}

/// Sanitizing the empty string yields the empty string.
#[test]
fn test_sanitize_empty_input() {
    let input = "";
    let parsed = RichText::parse(input);

    assert_eq!(parsed.text, "");
}

/// Input made solely of invisible characters sanitizes to nothing.
#[test]
fn test_sanitize_only_invisible_chars() {
    let input = "\u{200B}\u{200C}\u{200D}\u{00AD}";
    let parsed = RichText::parse(input);

    assert_eq!(parsed.text, "");
}

/// A lone carriage return is treated as a line break and becomes `\n`.
#[test]
fn test_sanitize_cr_normalization() {
    let input = "Hello\rWorld";
    let parsed = RichText::parse(input);

    assert_eq!(parsed.text, "Hello\nWorld");
}

/// `\r\n`, `\r` and `\n` in the same text all come out as `\n`.
#[test]
fn test_sanitize_mixed_line_endings() {
    let input = "Line1\r\nLine2\rLine3\nLine4";
    let parsed = RichText::parse(input);

    assert_eq!(parsed.text, "Line1\nLine2\nLine3\nLine4");
}

/// Ordinary spaces with no newline nearby are not touched.
#[test]
fn test_sanitize_preserves_regular_spaces() {
    let input = "Hello World";
    let parsed = RichText::parse(input);

    assert_eq!(parsed.text, "Hello World");
}

// === Adversarial / Edge Case Tests ===

/// A tag longer than the 64-character limit is not detected.
#[test]
fn test_tag_too_long() {
    // Tags must be 64 chars or less
    let long_tag = "a".repeat(65);
    let text = format!("#{long_tag}");
    let builder = RichText::parse(text);

    // Should NOT detect the tag
    assert!(builder
        .facet_candidates
        .iter()
        .all(|fc| !matches!(fc, FacetCandidate::Tag { .. })));
}

/// A zero-width space inside a tag does not prevent the tag being detected.
#[test]
fn test_tag_with_zero_width_chars() {
    // The zero-width space is stripped by sanitization before tag detection
    // runs, so it never reaches the tag matcher.
    let text = "This is #cool\u{200B}tag";
    let builder = RichText::parse(text);

    let tags: Vec<_> = builder
        .facet_candidates
        .iter()
        .filter_map(|fc| match fc {
            FacetCandidate::Tag { range } => Some(&builder.text[range.clone()]),
            _ => None,
        })
        .collect();

    // Whatever the tag's extent, it must begin with `#cool`
    assert!(tags.iter().any(|t| t.starts_with("#cool")));
}

/// Parentheses inside a URL path are kept as part of the link.
#[test]
fn test_url_with_parens() {
    // URLs like wikipedia with (parens) in them
    let text = "See https://en.wikipedia.org/wiki/Rust_(programming_language)";
    let builder = RichText::parse(text);

    // Should capture the URL far enough to include the parenthesized part
    assert!(builder.facet_candidates.iter().any(|fc| {
        matches!(fc, FacetCandidate::Link { range } if builder.text[range.clone()].contains("programming_language"))
    }));
}

/// An opening bracket with no closing markdown syntax is left untouched.
#[test]
fn test_markdown_link_unclosed() {
    // Malformed markdown should not be processed
    let text = "This is [unclosed link";
    let builder = RichText::parse(text);

    // Should not detect markdown link, text unchanged
    assert_eq!(builder.text, text);
    assert!(builder
        .facet_candidates
        .iter()
        .all(|fc| !matches!(fc, FacetCandidate::MarkdownLink { .. })));
}

/// Nested markdown link syntax still yields at least one markdown link.
#[test]
fn test_nested_markdown_attempts() {
    // Try to nest markdown links
    let text = "[[nested](https://inner.com)](https://outer.com)";
    let builder = RichText::parse(text);

    let markdown_count = builder
        .facet_candidates
        .iter()
        .filter(|fc| matches!(fc, FacetCandidate::MarkdownLink { .. }))
        .count();

    // Only require that some valid pattern was matched
    assert!(markdown_count > 0);
}

/// An emoji inside a handle never ends up inside a mention range.
#[test]
fn test_mention_with_emoji() {
    // Handles can't have emoji but let's make sure it doesn't crash
    let text = "Hey @alice😎.bsky.social wassup";
    let builder = RichText::parse(text);

    // Should not match or should stop at emoji
    let mentions: Vec<_> = builder
        .facet_candidates
        .iter()
        .filter_map(|fc| match fc {
            FacetCandidate::Mention { range, .. } => Some(&builder.text[range.clone()]),
            _ => None,
        })
        .collect();

    // Either no mentions or mention stops before emoji
    for mention in mentions {
        assert!(!mention.contains('😎'));
    }
}

/// Dots trailing a handle are not part of the mention.
#[test]
fn test_handle_with_trailing_dots() {
    // Handles like @alice... should not include trailing dots
    let text = "Hey @alice.bsky.social... how are you";
    let builder = RichText::parse(text);

    if let Some(FacetCandidate::Mention { range, .. }) = builder.facet_candidates.first() {
        let mention = &builder.text[range.clone()];
        assert!(!mention.ends_with('.'));
    }
}

/// `javascript:` and `data:` URLs are never reported as links.
#[test]
fn test_url_javascript_protocol() {
    // Should not detect javascript: or data: URLs
    let text = "Click javascript:alert(1) or data:text/html,<script>alert(1)</script>";
    let builder = RichText::parse(text);

    // Should not match non-http(s) URLs
    for fc in &builder.facet_candidates {
        if let FacetCandidate::Link { range } = fc {
            let url = &builder.text[range.clone()];
            assert!(!url.starts_with("javascript:"));
            assert!(!url.starts_with("data:"));
        }
    }
}

/// A URL with a very long path is detected without panicking.
#[test]
fn test_extremely_long_url() {
    // Very long URL should still work (no panic)
    let long_path = "a/".repeat(1000);
    let text = format!("Visit https://example.com/{long_path}");
    let builder = RichText::parse(text);

    // Should detect the URL without panicking
    assert!(builder
        .facet_candidates
        .iter()
        .any(|fc| matches!(fc, FacetCandidate::Link { .. })));
}

/// Empty input gives empty text and no facet candidates.
#[test]
fn test_empty_string() {
    let text = "";
    let builder = RichText::parse(text);

    assert_eq!(builder.text, "");
    assert!(builder.facet_candidates.is_empty());
}

/// Whitespace-only input produces no facet candidates.
#[test]
fn test_only_whitespace() {
    let text = " \t\n ";
    let builder = RichText::parse(text);

    assert!(builder.facet_candidates.is_empty());
}

/// Markdown whose display text spans a newline must not cause a panic.
#[test]
fn test_markdown_with_newlines() {
    let text = "This is [text\nwith](https://example.com) newline";
    let builder = RichText::parse(text);

    // No expectation about whether it matches; just make sure it doesn't panic
    let _ = builder.facet_candidates;
}

/// A doubled `@` yields only ranges that are valid for the parsed text.
#[test]
fn test_multiple_at_signs() {
    let text = "Hey @@alice.bsky.social";
    let builder = RichText::parse(text);

    // Whether `@@` matches at all is left open; any mention that is reported
    // must have a range that can be sliced out of the parsed text.
    for fc in &builder.facet_candidates {
        if let FacetCandidate::Mention { range, .. } = fc {
            assert!(range.end <= builder.text.len());
            let _ = &builder.text[range.clone()]; // Shouldn't panic
        }
    }
}

/// An internationalized domain name must not cause a panic.
#[test]
fn test_url_with_unicode_domain() {
    // IDN domains
    let text = "Visit https://例え.jp for info";
    let builder = RichText::parse(text);

    // Detection is not required here; just make sure no panic
    let _ = builder.facet_candidates;
}

/// A manual facet range that runs past the end of the text is rejected.
#[test]
#[cfg(feature = "api_bluesky")]
fn test_build_with_invalid_range() {
    let did = crate::types::did::Did::new_static("did:plc:z72i7hdynmk6r22z27h6tvur").unwrap();

    // Range exceeds text length
    let result = RichText::builder()
        .text("Short")
        .mention(&did, 0..100)
        .build();

    assert!(matches!(
        result,
        Err(RichTextError::InvalidRange { .. })
    ));
}

/// Bidirectional override characters inside a handle must not cause a panic.
#[test]
fn test_rtl_override_injection() {
    // Right-to-left override character attempts
    let text = "Hey @alice\u{202E}reversed\u{202C}.bsky.social";
    let builder = RichText::parse(text);

    // Should either not match or handle gracefully
    let _ = builder.facet_candidates;
}

/// A lone `#` followed by a space is not a tag.
#[test]
fn test_tag_empty_after_hash() {
    // Just # with nothing after
    let text = "This is # a test";
    let builder = RichText::parse(text);

    // Should not detect empty tag
    assert!(builder
        .facet_candidates
        .iter()
        .all(|fc| !matches!(fc, FacetCandidate::Tag { .. })));
}

// === Unicode Byte Boundary Tests ===

/// Every candidate range can be sliced out of the parsed text, i.e. it falls
/// on UTF-8 character boundaries even with multi-byte characters around.
#[test]
fn test_facet_ranges_valid_utf8_boundaries() {
    let text = "Hey @alice.bsky.social check 你好 #tag🔥 and https://example.com/测试";
    let builder = RichText::parse(text);

    for fc in &builder.facet_candidates {
        let range = match fc {
            FacetCandidate::Mention { range, .. } => range,
            FacetCandidate::Link { range } => range,
            FacetCandidate::Tag { range } => range,
            FacetCandidate::MarkdownLink { display_range, .. } => display_range,
        };

        // This will panic if range is not on UTF-8 char boundary
        // Use builder.text (sanitized) not original text
        let slice = &builder.text[range.clone()];
        // Verify it's valid UTF-8
        assert!(std::str::from_utf8(slice.as_bytes()).is_ok());
    }
}

/// A mention that follows a multi-codepoint emoji is still detected and its
/// range is valid for the parsed text.
#[test]
fn test_emoji_grapheme_clusters() {
    // Family emoji (man, woman, girl, girl joined by U+200D): 25 bytes but
    // 1 grapheme. Written with escapes so the invisible joiners are explicit.
    let text =
        "Hello \u{1F468}\u{200D}\u{1F469}\u{200D}\u{1F467}\u{200D}\u{1F467} @alice.bsky.social";
    let builder = RichText::parse(text);

    // Should still detect the mention after the emoji
    assert!(builder
        .facet_candidates
        .iter()
        .any(|fc| matches!(fc, FacetCandidate::Mention { .. })));

    // Verify all ranges are valid against the sanitized text
    for fc in &builder.facet_candidates {
        if let FacetCandidate::Mention { range, .. } = fc {
            let _ = &builder.text[range.clone()]; // Shouldn't panic
        }
    }
}

/// An emoji directly after the tag text is included in the tag range.
#[test]
fn test_tag_with_emoji() {
    // Tags can contain emoji
    let text = "This is #cool🔥";
    let builder = RichText::parse(text);

    let tags: Vec<_> = builder
        .facet_candidates
        .iter()
        .filter_map(|fc| match fc {
            FacetCandidate::Tag { range } => Some(&builder.text[range.clone()]),
            _ => None,
        })
        .collect();

    // Should include emoji in tag
    assert!(tags.iter().any(|t| t.contains("🔥")));
}

/// Newline collapsing leaves neighbouring emoji intact.
#[test]
fn test_sanitize_newlines_with_emoji() {
    // Newlines with emoji should still collapse correctly
    let text = "Hello 👋\n\n\n\nWorld 🌍";
    let builder = RichText::parse(text);

    assert_eq!(builder.text, "Hello 👋\n\nWorld 🌍");
}