···1717 if !s.is_ascii() {
1818 return None;
1919 }
2020- // // A-Za-z0-9 . - _ ~
2121- // if !s.chars().all(|c| matches!(c, 'A'..='Z' | 'a'..='z' | '0'..='9' | '.' | '-' | '_' | '~')) {
2222- // return None
2323- // }
24202521 // Maximum overall length is 8 kilobytes (which may be shortened in the future)
2622 if s.len() > (8 * 2_usize.pow(10)) {
···59556056 // The URI scheme is `at`, and an authority part preceded with double slashes is always
6157 // required, so the URI always starts at://
6262- // -> the spec doesn't explicitly say, but it seems like uri schemes are case-insensitive
5858+ // -> the spec doesn't explicitly say, but uri schemes can be case-insensitive?
6359 let (proto, rest) = s.split_at_checked(5)?;
6460 if !proto.eq_ignore_ascii_case("at://") {
6561 return None;
···225221 (
226222 "at://bad-example.com/a/../b",
227223 Some("at://bad-example.com/b"),
228228- "paths have traversals resolved (oof)",
224224+ "paths have traversals resolved (oof)", // reminder to self: we are normalizing, not sanitizing
229225 ),
230226 (
231227 "at://bad-example.com/../",
+152
src/did.rs
/// Parses and validates an atproto DID of the form `did:<method>:<identifier>`.
///
/// See <https://atproto.com/specs/did#at-protocol-did-identifier-syntax>.
///
/// This parser is intentionally lax: it should accept all valid DIDs, and may
/// accept some invalid DIDs. In particular, percent-encoding inside the
/// identifier is not validated here, because that depends on the encoding
/// context of the caller.
///
/// Returns the input, unchanged, as an owned `String` when every check below
/// passes, and `None` otherwise. No normalization is performed; the only one
/// we might want is percent-decoding, which is left to the URI decoder.
pub fn parse_did(s: &str) -> Option<String> {
    // DID identifiers do not generally have a maximum length restriction, but
    // in the context of atproto there is an initial hard limit of 2 KB. The
    // spec lists this rule last; it is checked first because it is O(1) and
    // bounds the work done by the scans below. The outcome is the same.
    const MAX_DID_LEN: usize = 2 * 1024;
    if s.len() > MAX_DID_LEN {
        return None;
    }

    // The entire URI is made up of a subset of ASCII, containing letters
    // (A-Z, a-z), digits (0-9), period, underscore, colon, percent sign, or
    // hyphen (._:%-). This also rules out query (`?`) and fragment (`#`)
    // sections, which are not allowed in atproto DIDs.
    let charset_ok = s
        .bytes()
        .all(|b| b.is_ascii_alphanumeric() || matches!(b, b'.' | b'_' | b':' | b'%' | b'-'));
    if !charset_ok {
        return None;
    }

    // The URI is case-sensitive and starts with lowercase `did:`, followed by
    // the method segment terminated by `:`; the rest is the identifier.
    let (method, identifier) = s.strip_prefix("did:")?.split_once(':')?;

    // The method segment is ONE or more lowercase letters (a-z). `all` is
    // vacuously true for an empty string, so emptiness is checked explicitly;
    // otherwise inputs such as `did::z` would slip through.
    if method.is_empty() || !method.bytes().all(|b| b.is_ascii_lowercase()) {
        return None;
    }

    // The URI (and thus the identifier) may not end in ':'. It is not written
    // explicitly in the spec, but by its examples the identifier also cannot
    // be empty.
    if identifier.is_empty() || identifier.ends_with(':') {
        return None;
    }

    Some(s.to_string())
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Table-driven checks: each row is (input, expected output, description).
    #[test]
    fn test_did_parse() {
        // A plain array is enough here; the table is only iterated once, so
        // there is no need to heap-allocate it with `vec!`.
        for (case, expected, detail) in [
            ("", None, "empty str"),
            (" ", None, "whitespace str"),
            ("z", None, "not a did"),
            ("did:plc", None, "no identifier separator colon"),
            ("did:plc:", None, "missing identifier"),
            (
                "did:web:bad-example.com",
                Some("did:web:bad-example.com"),
                "web did",
            ),
            (
                "did:plc:hdhoaan3xa3jiuq4fg4mefid",
                Some("did:plc:hdhoaan3xa3jiuq4fg4mefid"),
                "plc did",
            ),
            (
                "DID:plc:hdhoaan3xa3jiuq4fg4mefid",
                None,
                "'did:' prefix must be lowercase",
            ),
            (
                "did:ok:z",
                Some("did:ok:z"),
                "unknown did methods are allowed",
            ),
            ("did:BAD:z", None, "non-lowercase methods are not allowed"),
            ("did:bad:z$z", None, "invalid chars are not allowed"),
            (
                "did:ok:z:z",
                Some("did:ok:z:z"),
                "colons are allowed in identifier",
            ),
            ("did:bad:z:", None, "colons are not allowed at the end"),
            ("did:bad:z?q=y", None, "queries are not allowed in atproto"),
            ("did:bad:z#a", None, "anchors are not allowed in atproto"),
        ] {
            assert_eq!(parse_did(case), expected.map(str::to_string), "{detail}");
        }
    }

    /// Inputs expected to parse as DIDs usable in atproto.
    #[test]
    fn test_doc_examples_atproto() {
        // https://atproto.com/specs/did#at-protocol-did-identifier-syntax
        for case in ["did:plc:z72i7hdynmk6r22z27h6tvur", "did:web:blueskyweb.xyz"] {
            assert!(parse_did(case).is_some(), "should pass: {case}");
        }
    }

    /// Inputs expected to parse, including methods other than `plc`/`web`
    /// and identifiers made of colons and punctuation.
    #[test]
    fn test_doc_examples_lexicon() {
        // https://atproto.com/specs/did#at-protocol-did-identifier-syntax
        for case in [
            "did:method:val:two",
            "did:m:v",
            "did:method::::val",
            "did:method:-:_:.",
            "did:key:zQ3shZc2QzApp2oymGvQbzP8eKheVshBHbU4ZYjeXqwSKEn6N",
        ] {
            assert!(parse_did(case).is_some(), "should pass: {case}");
        }
    }

    /// Inputs that must be rejected.
    #[test]
    fn test_doc_examples_invalid() {
        // https://atproto.com/specs/did#at-protocol-did-identifier-syntax
        for case in [
            "did:METHOD:val",
            "did:m123:val",
            "DID:method:val",
            "did:method:",
            "did:method:val/two",
            "did:method:val?two",
            "did:method:val#two",
        ] {
            assert!(parse_did(case).is_none(), "should fail: {case}");
        }
    }
}
+12-8
src/lib.rs
···11use fluent_uri::Uri;
2233pub mod at_uri;
44+pub mod did;
/// A parsed link, tagged by which parser accepted the input.
///
/// Each variant wraps the string returned by the corresponding parser.
/// `Clone`, `Eq` and `Hash` are derived in addition to `Debug`/`PartialEq`
/// so values can be copied, compared totally, and used as set/map keys.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum Link {
    /// Result of the at-uri parser.
    AtUri(String),
    /// Result of the generic URI parser.
    Uri(String),
    /// Result of the DID parser.
    Did(String),
}
15121613// normalizing is a bit opinionated but eh
···1916}
20172118pub fn parse_any(s: &str) -> Option<Link> {
2222- parse_at_uri(s)
2323- .map(Link::AtUri)
2424- .or_else(|| parse_uri(s).map(Link::Uri))
1919+ at_uri::parse_at_uri(s).map(Link::AtUri).or_else(|| {
2020+ did::parse_did(s)
2121+ .map(Link::Did)
2222+ .or_else(|| parse_uri(s).map(Link::Uri))
2323+ })
2524}
26252726#[cfg(test)]
···6059 "at://did:plc:44ybard66vv44zksje25o7dz/app.bsky.feed.post/3jwdwj2ctlk26".into()
6160 )),
6261 );
6262+6363+ assert_eq!(
6464+ parse_any("did:plc:44ybard66vv44zksje25o7dz"),
6565+ Some(Link::Did("did:plc:44ybard66vv44zksje25o7dz".into()))
6666+ )
6367 }
6468}