add a horrible at-uri parsing/normalization routine · microcosm.blue/microcosm-rs@283754e

microcosm.blue / microcosm-rs

fork atom

Constellation, Spacedust, Slingshot, UFOs: atproto crates and services for microcosm

fork atom

add a horrible at-uri parsing/normalization routine

ughhooooooofff

bad-example.com 1 year ago 283754e2 760e7808

+249 -5

2 changed files

expand all

unified split

src

at_uri.rs

lib.rs

+243

src/at_uri.rs

···

/// Maximum accepted at-uri length in bytes. The spec caps the overall length
/// at 8 kilobytes (and notes that the limit may be shortened in the future).
const MAX_AT_URI_LEN: usize = 8 * 1024;

/// Parse and normalize an at-uri.
///
/// See "Full AT URI Syntax" at <https://atproto.com/specs/at-uri-scheme>.
/// This parser is intentionally lax: it should accept all valid at-uris, and
/// may accept some invalid at-uris. Normalizing is a bit opinionated.
///
/// Returns `None` when the input
/// - is not pure ASCII or is longer than [`MAX_AT_URI_LEN`] bytes,
/// - contains a `%` that is not followed by exactly two hex digits,
/// - does not start with `at://` (scheme compared case-insensitively),
/// - has an empty authority, or
/// - has a path containing characters outside the RFC 3986 path alphabet.
///
/// Otherwise returns the normalized form:
/// - percent-encoded unreserved characters are decoded, and the remaining
///   percent-encodings have their hex digits uppercased,
/// - the scheme is lowercased,
/// - a handle authority is lowercased (anything starting with `did:` is left
///   untouched),
/// - the path has `.`/`..` segments resolved and duplicate or trailing
///   slashes removed,
/// - query and fragment are carried over as-is (they are not validated).
///
/// There is still more normalization (and validation of the authority, query
/// and fragment) left to do.
pub fn parse_at_uri(s: &str) -> Option<String> {
    // The overall URI is restricted to a subset of ASCII characters, with a
    // maximum overall length.
    if !s.is_ascii() || s.len() > MAX_AT_URI_LEN {
        return None;
    }

    let s = normalize_percent_encoding(s)?;

    // The scheme is `at` and an authority preceded by `//` is always required,
    // so the URI always starts with `at://`. The spec doesn't explicitly say,
    // but URI schemes are generally case-insensitive.
    let scheme = s.get(..5)?;
    if !scheme.eq_ignore_ascii_case("at://") {
        return None;
    }
    let rest = &s[5..];

    // Work backwards: peel off fragment, then query, then path; whatever
    // remains is the authority.
    let (rest, fragment) = match rest.split_once('#') {
        Some((before, f)) => (before, Some(f)),
        None => (rest, None),
    };
    let (rest, query) = match rest.split_once('?') {
        Some((before, q)) => (before, Some(q)),
        None => (rest, None),
    };
    let (authority, path) = match rest.split_once('/') {
        Some((before, p)) => (before, Some(p)),
        None => (rest, None),
    };

    // An authority section is required and must be non-empty.
    if authority.is_empty() {
        return None;
    }
    let mut authority = authority.to_string();
    // Handles are case-insensitive and normalized to lowercase; DIDs are not.
    if !authority.starts_with("did:") {
        authority.make_ascii_lowercase();
    }

    let path = match path {
        Some(p) => normalize_path(p)?,
        None => String::new(),
    };

    let mut out = String::with_capacity(s.len());
    out.push_str("at://");
    out.push_str(&authority);
    // `path` is either empty or already starts with `/`.
    out.push_str(&path);
    if let Some(q) = query {
        out.push('?');
        out.push_str(q);
    }
    if let Some(f) = fragment {
        out.push('#');
        out.push_str(f);
    }
    Some(out)
}

/// RFC 3986 "unreserved" characters: `A-Z a-z 0-9 . - _ ~`.
fn is_unreserved(c: char) -> bool {
    c.is_ascii_alphanumeric() || matches!(c, '.' | '-' | '_' | '~')
}

/// Characters that may appear in a URI path: unreserved, the `%` of a
/// percent-encoding, sub-delims, `:`, `@`, and the `/` segment separator.
fn is_path_char(c: char) -> bool {
    is_unreserved(c)
        || matches!(
            c,
            '/' | '%'
                | ':'
                | '@'
                | '!'
                | '$'
                | '&'
                | '\''
                | '('
                | ')'
                | '*'
                | '+'
                | ','
                | ';'
                | '='
        )
}

/// Validate and normalize percent-encoding across the whole URI.
///
/// Every `%` must be followed by exactly two hex digits, otherwise `None` is
/// returned. Encodings of unreserved characters are decoded (as RFC 3986
/// asks of normalizers); all other encodings are kept with uppercased hex.
fn normalize_percent_encoding(s: &str) -> Option<String> {
    let Some((head, tail)) = s.split_once('%') else {
        return Some(s.to_string());
    };
    let mut out = String::with_capacity(s.len());
    out.push_str(head);
    for segment in tail.split('%') {
        // Each segment starts right after a `%`: two hex digits, then literal text.
        let hex = segment.get(..2)?;
        // Check the digits explicitly: `from_str_radix` alone would also
        // accept a leading sign such as "+A".
        if !hex.bytes().all(|b| b.is_ascii_hexdigit()) {
            return None;
        }
        let decoded = char::from(u8::from_str_radix(hex, 16).ok()?);
        if is_unreserved(decoded) {
            out.push(decoded);
        } else {
            out.push('%');
            out.push_str(&hex.to_ascii_uppercase());
        }
        out.push_str(&segment[2..]);
    }
    Some(out)
}

/// Normalize the path part of an at-uri.
///
/// `p` is everything after the first `/` that follows the authority. Returns
/// `None` if it contains a character that is not allowed in a URI path.
/// Otherwise returns either an empty string or `/seg/seg…` with `.` and `..`
/// segments resolved and no empty segments (so no duplicate or trailing
/// slashes).
///
/// The segments are handled directly rather than via a generic URI-reference
/// parser: such a parser would read a first segment like `a:b` as a scheme,
/// and a path starting with `//` as an authority.
fn normalize_path(p: &str) -> Option<String> {
    if !p.chars().all(is_path_char) {
        return None;
    }
    let mut segments: Vec<&str> = Vec::new();
    for segment in p.split('/') {
        match segment {
            "" | "." => {}
            ".." => {
                // Going above the root is clamped to the root.
                segments.pop();
            }
            other => segments.push(other),
        }
    }
    let mut out = String::with_capacity(p.len() + 1);
    for segment in segments {
        out.push('/');
        out.push_str(segment);
    }
    Some(out)
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_at_uri_parse() {
        const CASES: &[(&str, Option<&str>, &str)] = &[
            ("", None, "empty"),
            (" ", None, "whitespace"),
            ("https://bad-example.com", None, "not at scheme"),
            ("at://µcosm.bad-example.com", None, "not ascii"),
            ("at://", None, "empty authority"),
            ("at:///app.bsky.feed.post", None, "empty authority with path"),
            (
                "at://bad-example.com",
                Some("at://bad-example.com"),
                "handle, authority-only",
            ),
            (
                "at://did:plc:hdhoaan3xa3jiuq4fg4mefid",
                Some("at://did:plc:hdhoaan3xa3jiuq4fg4mefid"),
                "DID, authority-only",
            ),
            (
                "at://bad-example.com/app.bsky.feed.post/3jwdwj2ctlk26",
                Some("at://bad-example.com/app.bsky.feed.post/3jwdwj2ctlk26"),
                "bsky post (handle)",
            ),
            (
                "at://did:plc:hdhoaan3xa3jiuq4fg4mefid/app.bsky.feed.post/3ldqksainxc27",
                Some("at://did:plc:hdhoaan3xa3jiuq4fg4mefid/app.bsky.feed.post/3ldqksainxc27"),
                "bsky post (DID)",
            ),
            (
                "AT://bad-example.com",
                Some("at://bad-example.com"),
                "scheme case is normalized",
            ),
            (
                "at://Bad-Example.COM",
                Some("at://bad-example.com"),
                "handle case is normalized",
            ),
            (
                "at://did:web:Example.com",
                Some("at://did:web:Example.com"),
                "DID case is left alone",
            ),
            (
                "at://bad-example.com?q=z",
                Some("at://bad-example.com?q=z"),
                "query is allowed",
            ),
            (
                "at://bad-example.com#a",
                Some("at://bad-example.com#a"),
                "fragment is allowed",
            ),
            (
                "at://bad-example.com/%",
                None,
                "invalid percent-encoding: ends with %",
            ),
            (
                "at://bad-example.com/%2",
                None,
                "invalid percent-encoding: ends with only one digit after %",
            ),
            (
                "at://bad-example.com/%ZZ",
                None,
                "invalid percent-encoding: non-hex after %",
            ),
            (
                "at://bad-example.com?q=%+A",
                None,
                "invalid percent-encoding: a sign is not a hex digit",
            ),
            (
                "at://bad-example.com/%3A",
                Some("at://bad-example.com/%3A"),
                "valid percent-encoding is left",
            ),
            (
                "at://bad-example.com/%3a",
                Some("at://bad-example.com/%3A"),
                "valid percent-encoding is hex-uppercased",
            ),
            (
                "at://bad-example.com/%61/%62",
                Some("at://bad-example.com/a/b"),
                "unreserved characters are percent-decoded",
            ),
            (
                "at://bad-example.com/a/../b",
                Some("at://bad-example.com/b"),
                "paths have traversals resolved (oof)",
            ),
            (
                "at://bad-example.com/../",
                Some("at://bad-example.com"),
                "paths always have trailing slashes removed",
            ),
            (
                "at://bad-example.com/a:b/c",
                Some("at://bad-example.com/a:b/c"),
                "colon in the first path segment is not a scheme",
            ),
            (
                "at://bad-example.com///a//b/",
                Some("at://bad-example.com/a/b"),
                "duplicate slashes are collapsed",
            ),
            (
                "at://bad-example.com/a b",
                None,
                "invalid path characters are rejected",
            ),
        ];

        for &(case, expected, detail) in CASES {
            assert_eq!(parse_at_uri(case), expected.map(String::from), "{detail}");
        }
    }
}

+6 -5

src/lib.rs

··· 1 use fluent_uri::Uri; 2 3 #[derive(Debug, PartialEq)] 4 pub enum Link { 5 AtUri(String), 6 Uri(String), 7 } 8 9 - // normalizing is a bit opinionated 10 - pub fn parse_at_uri(_s: &str) -> Option<String> { 11 - // TODO 12 - None 13 } 14 15 - // normalizing is a bit opinionated 16 pub fn parse_uri(s: &str) -> Option<String> { 17 Uri::parse(s).map(|u| u.normalize().into_string()).ok() 18 }

··· 1 use fluent_uri::Uri; 2 3 + pub mod at_uri; 4 + 5 #[derive(Debug, PartialEq)] 6 pub enum Link { 7 AtUri(String), 8 Uri(String), 9 } 10 11 + // normalizing is a bit opinionated but ehhh 12 + pub fn parse_at_uri(s: &str) -> Option<String> { 13 + at_uri::parse_at_uri(s) 14 } 15 16 + // normalizing is a bit opinionated but eh 17 pub fn parse_uri(s: &str) -> Option<String> { 18 Uri::parse(s).map(|u| u.normalize().into_string()).ok() 19 }