···1+use fluent_uri::{Uri, UriRef};
2+use std::sync::LazyLock;
3+4+static BASE: LazyLock<Uri<&str>> = LazyLock::new(|| Uri::parse("https://example.com").unwrap());
5+6+// normalizing is a bit opinionated but eh
7+/// see "Full AT URI Syntax" at https://atproto.com/specs/at-uri-scheme
8+/// this parser is intentinonally lax: it should accept all valid at-uris, and
9+/// may accept some invalid at-uris.
10+///
11+/// at the moment this implementation is quite bad and incomplete
12+pub fn parse_at_uri(s: &str) -> Option<String> {
13+ // for now, just working through the rules laid out in the docs in order,
14+ // without much regard for efficiency for now.
15+16+ // The overall URI is restricted to a subset of ASCII characters
17+ if !s.is_ascii() {
18+ return None;
19+ }
20+ // // A-Za-z0-9 . - _ ~
21+ // if !s.chars().all(|c| matches!(c, 'A'..='Z' | 'a'..='z' | '0'..='9' | '.' | '-' | '_' | '~')) {
22+ // return None
23+ // }
24+25+ // Maximum overall length is 8 kilobytes (which may be shortened in the future)
26+ if s.len() > (8 * 2_usize.pow(10)) {
27+ return None;
28+ }
29+30+ // Hex-encoding of characters is permitted (but in practice not necessary)
31+ // -> decode any unreserved characters. from rfc 3986:
32+ // -> For consistency, percent-encoded octets in the ranges of ALPHA
33+ // -> (%41-%5A and %61-%7A), DIGIT (%30-%39), hyphen (%2D), period (%2E),
34+ // -> underscore (%5F), or tilde (%7E) should not be created by URI
35+ // -> producers and, when found in a URI, should be decoded to their
36+ // -> corresponding unreserved characters by URI normalizers.
37+ let s = if let Some((unencoded_prefix, rest)) = s.split_once('%') {
38+ let mut out = String::with_capacity(s.len());
39+ out.push_str(unencoded_prefix);
40+ for segment in rest.split('%') {
41+ let Some((hex2, unencoded_suffix)) = segment.split_at_checked(2) else {
42+ return None; // bail: % must always be followed by 2 hex digits
43+ };
44+ let Ok(decoded) = u8::from_str_radix(hex2, 16).map(char::from) else {
45+ return None; // bail: % must be followed by decodable hex
46+ };
47+ if matches!(decoded, 'A'..='Z' | 'a'..='z' | '0'..='9' | '.' | '-' | '_' | '~') {
48+ out.push(decoded);
49+ } else {
50+ out.push('%');
51+ out.push_str(&hex2.to_ascii_uppercase()); // norm
52+ }
53+ out.push_str(unencoded_suffix);
54+ }
55+ out
56+ } else {
57+ s.to_string()
58+ };
59+60+ // The URI scheme is `at`, and an authority part preceded with double slashes is always
61+ // required, so the URI always starts at://
62+ // -> the spec doesn't explicitly say, but it seems like uri schemes are case-insensitive
63+ let (proto, rest) = s.split_at_checked(5)?;
64+ if !proto.eq_ignore_ascii_case("at://") {
65+ return None;
66+ }
67+68+ // An authority section is required and must be non-empty. the authority can be either an
69+ // atproto Handle, or a DID meeting the restrictions for use with atproto. note that the
70+ // authority part can not be interpreted as a host:port pair, because of the use of colon
71+ // characters (:) in DIDs. Colons and unreserved characters should not be escaped in DIDs,
72+ // but other reserved characters (including #, /, $, &, @) must be escaped.
73+ // Note that none of the current "blessed" DID methods for atproto allow these
74+ // characters in DID identifiers
75+76+ // An optional path section may follow the authority. The path may contain multiple segments
77+ // separated by a single slash (/). Generic URI path normalization rules may be used.
78+79+ // An optional query part is allowed, following generic URI syntax restrictions
80+81+ // An optional fragment part is allowed, using JSON Path syntax
82+83+ // -> work backwards from fragment, query, path -> authority
84+ let mut base = rest;
85+ let (mut fragment, mut query, mut path) = (None, None, None);
86+ if let Some((pre, f)) = base.split_once('#') {
87+ base = pre;
88+ fragment = Some(f);
89+ }
90+ if let Some((pre, q)) = base.split_once('?') {
91+ base = pre;
92+ query = Some(q);
93+ }
94+ if let Some((pre, p)) = base.split_once('/') {
95+ base = pre;
96+ path = Some(p);
97+ }
98+ let mut authority = base.to_string();
99+100+ if authority.is_empty() {
101+ return None;
102+ }
103+104+ // Normalization: Authority as handle: lowercased
105+ if !authority.starts_with("did:") {
106+ // lowercase handles
107+ authority.make_ascii_lowercase();
108+ }
109+110+ // Normalization: No trailing slashes in path part
111+ // Normalization: No duplicate slashes or "dot" sections in path part (/./ or /abc/../ for example)
112+ // -> be so lazy
113+ let path = match path {
114+ Some(p) => {
115+ let p = p.trim_end_matches('/');
116+ let uri_ref = UriRef::parse(p).ok()?; // fully bail if we can't parse path
117+ let resolved = uri_ref.resolve_against(&*BASE).unwrap(); // both fail conditions are specific to BASE
118+ let normalized = resolved.normalize().path().to_string();
119+ let without_trailing_slashes = normalized.trim_end_matches('/');
120+ Some(without_trailing_slashes.to_string())
121+ }
122+ None => None,
123+ };
124+125+ let mut out = format!("at://{authority}");
126+ if let Some(p) = path {
127+ // no need for `/` -- it's added by fluent_uri normalization
128+ out.push_str(&p);
129+ }
130+ if let Some(q) = query {
131+ out.push('?');
132+ out.push_str(q);
133+ }
134+ if let Some(f) = fragment {
135+ out.push('#');
136+ out.push_str(f);
137+ }
138+139+ Some(out)
140+141+ // there's a more normalization to do still. ugh.
142+}
#[cfg(test)]
mod tests {
    use super::*;

    /// Table-driven check of `parse_at_uri`: each row is (input, expected output, description).
    #[test]
    fn test_at_uri_parse() {
        for (case, expected, detail) in [
            ("", None, "empty"),
            (" ", None, "whitespace"),
            ("https://bad-example.com", None, "not at scheme"),
            ("at://µcosm.bad-example.com", None, "not ascii"),
            (
                "at://bad-example.com",
                Some("at://bad-example.com"),
                "handle, authority-only",
            ),
            (
                "at://did:plc:hdhoaan3xa3jiuq4fg4mefid",
                Some("at://did:plc:hdhoaan3xa3jiuq4fg4mefid"),
                "DID, authority-only",
            ),
            (
                "at://bad-example.com/app.bsky.feed.post/3jwdwj2ctlk26",
                Some("at://bad-example.com/app.bsky.feed.post/3jwdwj2ctlk26"),
                "bsky post (handle)",
            ),
            (
                "at://did:plc:hdhoaan3xa3jiuq4fg4mefid/app.bsky.feed.post/3ldqksainxc27",
                Some("at://did:plc:hdhoaan3xa3jiuq4fg4mefid/app.bsky.feed.post/3ldqksainxc27"),
                "bsky post (DID)",
            ),
            (
                "AT://bad-example.com",
                Some("at://bad-example.com"),
                "scheme case is normalized",
            ),
            (
                "at://BAD-example.com",
                Some("at://bad-example.com"),
                "handle case is normalized",
            ),
            (
                "at://bad-example.com?q=z",
                Some("at://bad-example.com?q=z"),
                "query is allowed",
            ),
            (
                "at://bad-example.com#a",
                Some("at://bad-example.com#a"),
                "fragment is allowed",
            ),
            (
                "at://bad-example.com/%",
                None,
                "invalid percent-encoding: ends with %",
            ),
            (
                "at://bad-example.com/%2",
                None,
                "invalid percent-encoding: ends with only one digit after %",
            ),
            (
                "at://bad-example.com/%ZZ",
                None,
                "invalid percent-encoding: non-hex after %",
            ),
            (
                "at://bad-example.com/%3A",
                Some("at://bad-example.com/%3A"),
                "valid percent-encoding is left",
            ),
            (
                "at://bad-example.com/%3a",
                Some("at://bad-example.com/%3A"),
                "valid percent-encoding is hex-uppercased",
            ),
            (
                "at://bad-example.com/%61/%62",
                Some("at://bad-example.com/a/b"),
                "unreserved characters are percent-decoded",
            ),
            (
                "at://bad-example.com/a/../b",
                Some("at://bad-example.com/b"),
                "paths have traversals resolved (oof)",
            ),
            (
                "at://bad-example.com/../",
                Some("at://bad-example.com"),
                "paths always have trailing slashes removed",
            ),
        ] {
            assert_eq!(
                parse_at_uri(case),
                expected.map(|s| s.to_string()),
                "{detail}"
            );
        }
    }
}
+6-5
src/lib.rs
···1use fluent_uri::Uri;
/// A link in normalized string form, tagged by which kind of URI it is.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Link {
    /// An AT URI.
    AtUri(String),
    /// Any other URI.
    Uri(String),
}
/// Placeholder AT URI parser: nothing is implemented yet, so every input yields `None`.
///
/// Normalizing is a bit opinionated.
pub fn parse_at_uri(_s: &str) -> Option<String> {
    // TODO
    Option::None
}
1415-// normalizing is a bit opinionated
16pub fn parse_uri(s: &str) -> Option<String> {
17 Uri::parse(s).map(|u| u.normalize().into_string()).ok()
18}
···1use fluent_uri::Uri;
23+pub mod at_uri;
/// A link in normalized string form, tagged by which kind of URI it is.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Link {
    /// An AT URI.
    AtUri(String),
    /// Any other URI.
    Uri(String),
}
1011+// normalizing is a bit opinionated but ehhh
12+pub fn parse_at_uri(s: &str) -> Option<String> {
13+ at_uri::parse_at_uri(s)
014}
1516+// normalizing is a bit opinionated but eh
17pub fn parse_uri(s: &str) -> Option<String> {
18 Uri::parse(s).map(|u| u.normalize().into_string()).ok()
19}