···11+use fluent_uri::{Uri, UriRef};
22+use std::sync::LazyLock;
33+44+static BASE: LazyLock<Uri<&str>> = LazyLock::new(|| Uri::parse("https://example.com").unwrap());
55+66+// normalizing is a bit opinionated but eh
77+/// see "Full AT URI Syntax" at https://atproto.com/specs/at-uri-scheme
88+/// this parser is intentinonally lax: it should accept all valid at-uris, and
99+/// may accept some invalid at-uris.
1010+///
1111+/// at the moment this implementation is quite bad and incomplete
1212+pub fn parse_at_uri(s: &str) -> Option<String> {
1313+ // for now, just working through the rules laid out in the docs in order,
1414+ // without much regard for efficiency for now.
1515+1616+ // The overall URI is restricted to a subset of ASCII characters
1717+ if !s.is_ascii() {
1818+ return None;
1919+ }
2020+ // // A-Za-z0-9 . - _ ~
2121+ // if !s.chars().all(|c| matches!(c, 'A'..='Z' | 'a'..='z' | '0'..='9' | '.' | '-' | '_' | '~')) {
2222+ // return None
2323+ // }
2424+2525+ // Maximum overall length is 8 kilobytes (which may be shortened in the future)
2626+ if s.len() > (8 * 2_usize.pow(10)) {
2727+ return None;
2828+ }
2929+3030+ // Hex-encoding of characters is permitted (but in practice not necessary)
3131+ // -> decode any unreserved characters. from rfc 3986:
3232+ // -> For consistency, percent-encoded octets in the ranges of ALPHA
3333+ // -> (%41-%5A and %61-%7A), DIGIT (%30-%39), hyphen (%2D), period (%2E),
3434+ // -> underscore (%5F), or tilde (%7E) should not be created by URI
3535+ // -> producers and, when found in a URI, should be decoded to their
3636+ // -> corresponding unreserved characters by URI normalizers.
3737+ let s = if let Some((unencoded_prefix, rest)) = s.split_once('%') {
3838+ let mut out = String::with_capacity(s.len());
3939+ out.push_str(unencoded_prefix);
4040+ for segment in rest.split('%') {
4141+ let Some((hex2, unencoded_suffix)) = segment.split_at_checked(2) else {
4242+ return None; // bail: % must always be followed by 2 hex digits
4343+ };
4444+ let Ok(decoded) = u8::from_str_radix(hex2, 16).map(char::from) else {
4545+ return None; // bail: % must be followed by decodable hex
4646+ };
4747+ if matches!(decoded, 'A'..='Z' | 'a'..='z' | '0'..='9' | '.' | '-' | '_' | '~') {
4848+ out.push(decoded);
4949+ } else {
5050+ out.push('%');
5151+ out.push_str(&hex2.to_ascii_uppercase()); // norm
5252+ }
5353+ out.push_str(unencoded_suffix);
5454+ }
5555+ out
5656+ } else {
5757+ s.to_string()
5858+ };
5959+6060+ // The URI scheme is `at`, and an authority part preceded with double slashes is always
6161+ // required, so the URI always starts at://
6262+ // -> the spec doesn't explicitly say, but it seems like uri schemes are case-insensitive
6363+ let (proto, rest) = s.split_at_checked(5)?;
6464+ if !proto.eq_ignore_ascii_case("at://") {
6565+ return None;
6666+ }
6767+6868+ // An authority section is required and must be non-empty. the authority can be either an
6969+ // atproto Handle, or a DID meeting the restrictions for use with atproto. note that the
7070+ // authority part can not be interpreted as a host:port pair, because of the use of colon
7171+ // characters (:) in DIDs. Colons and unreserved characters should not be escaped in DIDs,
7272+ // but other reserved characters (including #, /, $, &, @) must be escaped.
7373+ // Note that none of the current "blessed" DID methods for atproto allow these
7474+ // characters in DID identifiers
7575+7676+ // An optional path section may follow the authority. The path may contain multiple segments
7777+ // separated by a single slash (/). Generic URI path normalization rules may be used.
7878+7979+ // An optional query part is allowed, following generic URI syntax restrictions
8080+8181+ // An optional fragment part is allowed, using JSON Path syntax
8282+8383+ // -> work backwards from fragment, query, path -> authority
8484+ let mut base = rest;
8585+ let (mut fragment, mut query, mut path) = (None, None, None);
8686+ if let Some((pre, f)) = base.split_once('#') {
8787+ base = pre;
8888+ fragment = Some(f);
8989+ }
9090+ if let Some((pre, q)) = base.split_once('?') {
9191+ base = pre;
9292+ query = Some(q);
9393+ }
9494+ if let Some((pre, p)) = base.split_once('/') {
9595+ base = pre;
9696+ path = Some(p);
9797+ }
9898+ let mut authority = base.to_string();
9999+100100+ if authority.is_empty() {
101101+ return None;
102102+ }
103103+104104+ // Normalization: Authority as handle: lowercased
105105+ if !authority.starts_with("did:") {
106106+ // lowercase handles
107107+ authority.make_ascii_lowercase();
108108+ }
109109+110110+ // Normalization: No trailing slashes in path part
111111+ // Normalization: No duplicate slashes or "dot" sections in path part (/./ or /abc/../ for example)
112112+ // -> be so lazy
113113+ let path = match path {
114114+ Some(p) => {
115115+ let p = p.trim_end_matches('/');
116116+ let uri_ref = UriRef::parse(p).ok()?; // fully bail if we can't parse path
117117+ let resolved = uri_ref.resolve_against(&*BASE).unwrap(); // both fail conditions are specific to BASE
118118+ let normalized = resolved.normalize().path().to_string();
119119+ let without_trailing_slashes = normalized.trim_end_matches('/');
120120+ Some(without_trailing_slashes.to_string())
121121+ }
122122+ None => None,
123123+ };
124124+125125+ let mut out = format!("at://{authority}");
126126+ if let Some(p) = path {
127127+ // no need for `/` -- it's added by fluent_uri normalization
128128+ out.push_str(&p);
129129+ }
130130+ if let Some(q) = query {
131131+ out.push('?');
132132+ out.push_str(q);
133133+ }
134134+ if let Some(f) = fragment {
135135+ out.push('#');
136136+ out.push_str(f);
137137+ }
138138+139139+ Some(out)
140140+141141+ // there's a more normalization to do still. ugh.
142142+}
#[cfg(test)]
mod tests {
    use super::*;

    /// Table-driven check of `parse_at_uri`: each row is
    /// `(input, expected normalized output or None, description)`.
    #[test]
    fn test_at_uri_parse() {
        for (case, expected, detail) in [
            ("", None, "empty"),
            (" ", None, "whitespace"),
            ("https://bad-example.com", None, "not at scheme"),
            ("at://µcosm.bad-example.com", None, "not ascii"),
            (
                "at://bad-example.com",
                Some("at://bad-example.com"),
                "handle, authority-only",
            ),
            (
                "at://did:plc:hdhoaan3xa3jiuq4fg4mefid",
                Some("at://did:plc:hdhoaan3xa3jiuq4fg4mefid"),
                "DID, authority-only",
            ),
            (
                "at://bad-example.com/app.bsky.feed.post/3jwdwj2ctlk26",
                Some("at://bad-example.com/app.bsky.feed.post/3jwdwj2ctlk26"),
                "bsky post (handle)",
            ),
            (
                "at://did:plc:hdhoaan3xa3jiuq4fg4mefid/app.bsky.feed.post/3ldqksainxc27",
                Some("at://did:plc:hdhoaan3xa3jiuq4fg4mefid/app.bsky.feed.post/3ldqksainxc27"),
                "bsky post (DID)",
            ),
            (
                "AT://bad-example.com",
                Some("at://bad-example.com"),
                "scheme case is normalized",
            ),
            (
                "at://Bad-Example.com",
                Some("at://bad-example.com"),
                "handle case is normalized",
            ),
            (
                "at://bad-example.com?q=z",
                Some("at://bad-example.com?q=z"),
                "query is allowed",
            ),
            (
                "at://bad-example.com#a",
                Some("at://bad-example.com#a"),
                "fragment is allowed",
            ),
            (
                "at://bad-example.com/%",
                None,
                "invalid percent-encoding: ends with %",
            ),
            (
                "at://bad-example.com/%2",
                None,
                "invalid percent-encoding: ends with only one digit after %",
            ),
            (
                "at://bad-example.com/%ZZ",
                None,
                "invalid percent-encoding: non-hex after %",
            ),
            (
                "at://bad-example.com/%3A",
                Some("at://bad-example.com/%3A"),
                "valid percent-encoding is left",
            ),
            (
                "at://bad-example.com/%3a",
                Some("at://bad-example.com/%3A"),
                "valid percent-encoding is hex-uppercased",
            ),
            (
                "at://bad-example.com/%61/%62",
                Some("at://bad-example.com/a/b"),
                "unreserved characters are percent-decoded",
            ),
            (
                "at://bad-example.com/a/../b",
                Some("at://bad-example.com/b"),
                "paths have traversals resolved (oof)",
            ),
            (
                "at://bad-example.com/a/",
                Some("at://bad-example.com/a"),
                "a plain trailing slash is removed",
            ),
            (
                "at://bad-example.com/../",
                Some("at://bad-example.com"),
                "paths always have trailing slashes removed",
            ),
        ] {
            assert_eq!(
                parse_at_uri(case),
                expected.map(|s| s.to_string()),
                "{detail}"
            );
        }
    }
}
+6-5
src/lib.rs
···11use fluent_uri::Uri;
2233+pub mod at_uri;
/// A link, tagged by which kind of URI its string holds.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Link {
    /// An `at://` URI string.
    /// NOTE(review): presumably the normalized output of `parse_at_uri` -- confirm against callers.
    AtUri(String),
    /// Any other URI string.
    /// NOTE(review): presumably the normalized output of `parse_uri` -- confirm against callers.
    Uri(String),
}
81099-// normalizing is a bit opinionated
1010-pub fn parse_at_uri(_s: &str) -> Option<String> {
1111- // TODO
1212- None
1111+// normalizing is a bit opinionated but ehhh
1212+pub fn parse_at_uri(s: &str) -> Option<String> {
1313+ at_uri::parse_at_uri(s)
1314}
14151515-// normalizing is a bit opinionated
1616+// normalizing is a bit opinionated but eh
1617pub fn parse_uri(s: &str) -> Option<String> {
1718 Uri::parse(s).map(|u| u.normalize().into_string()).ok()
1819}