A better Rust ATProto crate

fixed tag regex

Orual c251e98b c2bb8963

+24 -3
+24 -3
crates/jacquard/src/richtext.rs
··· 26 26 }); 27 27 28 28 static TAG_REGEX: LazyLock<Regex> = LazyLock::new(|| { 29 - // Simplified version - full unicode handling would need more work 30 - Regex::new(r"(^|\s)[#＃]([^\s\x{00AD}\x{2060}\x{200A}\x{200B}\x{200C}\x{200D}]+)").unwrap() 29 + // Pattern: (^|\s)[#＃](prefix* core+ suffix*)? 30 + // 31 + // - prefix: [^\s\u{00AD}...]* - any chars except spaces/zero-width (optional) 32 + // - core: [^\d\s\p{P}\u{00AD}...]+ - at least one char that's not digit/space/punct/zero-width (required) 33 + // - suffix: [^\s\u{00AD}...]* - any chars except spaces/zero-width (optional) 34 + // 35 + // Zero-width chars excluded: \u{00AD} (soft hyphen), \u{2060} (word joiner), 36 + // \u{200A}-\u{200D} (hair space, zero-width space/joiner/non-joiner), \u{20e2} (combining mark) 37 + // 38 + // Note: emoji modifier (\ufe0f) is filtered in detect_tags() since Rust regex 39 + // doesn't support negative lookahead 40 + Regex::new( 41 + r"(^|\s)[#＃]([^\s\u{00AD}\u{2060}\u{200A}\u{200B}\u{200C}\u{200D}\u{20e2}]*[^\d\s\p{P}\u{00AD}\u{2060}\u{200A}\u{200B}\u{200C}\u{200D}\u{20e2}]+[^\s\u{00AD}\u{2060}\u{200A}\u{200B}\u{200C}\u{200D}\u{20e2}]*)?" 42 + ).unwrap() 31 43 }); 32 44 33 45 static MARKDOWN_LINK_REGEX: LazyLock<Regex> = ··· 552 564 let mut facets = Vec::new(); 553 565 554 566 for cap in TAG_REGEX.captures_iter(text) { 555 - let tag_match = cap.get(2).unwrap(); 567 + // capture group 2 is optional, skip if empty (just # with nothing after) 568 + let tag_match = match cap.get(2) { 569 + Some(m) => m, 570 + None => continue, 571 + }; 556 572 let tag_str = tag_match.as_str(); 573 + 574 + // Filter out tags starting with emoji modifier (since regex can't do negative lookahead) 575 + if tag_str.starts_with('\u{fe0f}') { 576 + continue; 577 + } 557 578 558 579 // Calculate trimmed length after stripping trailing punctuation 559 580 let trimmed_len = if let Some(trimmed) = TRAILING_PUNCT_REGEX.find(tag_str) {