Constellation, Spacedust, Slingshot, UFOs: atproto crates and services for microcosm

parse dids

"links" is all references, so blocks, follows, etc. which are just direct dids also need to be detected.

+166 -14
+2 -6
src/at_uri.rs
··· 17 17 if !s.is_ascii() { 18 18 return None; 19 19 } 20 - // // A-Za-z0-9 . - _ ~ 21 - // if !s.chars().all(|c| matches!(c, 'A'..='Z' | 'a'..='z' | '0'..='9' | '.' | '-' | '_' | '~')) { 22 - // return None 23 - // } 24 20 25 21 // Maximum overall length is 8 kilobytes (which may be shortened in the future) 26 22 if s.len() > (8 * 2_usize.pow(10)) { ··· 59 55 60 56 // The URI scheme is `at`, and an authority part preceded with double slashes is always 61 57 // required, so the URI always starts at:// 62 - // -> the spec doesn't explicitly say, but it seems like uri schemes are case-insensitive 58 + // -> the spec doesn't explicitly say, but uri schemes can be case-insensitive? 63 59 let (proto, rest) = s.split_at_checked(5)?; 64 60 if !proto.eq_ignore_ascii_case("at://") { 65 61 return None; ··· 225 221 ( 226 222 "at://bad-example.com/a/../b", 227 223 Some("at://bad-example.com/b"), 228 - "paths have traversals resolved (oof)", 224 + "paths have traversals resolved (oof)", // reminder to self: we are normalizing, not sanitizing 229 225 ), 230 226 ( 231 227 "at://bad-example.com/../",
+152
src/did.rs
/// Parse an AT Protocol DID, returning it unchanged (as an owned `String`) if
/// it passes the syntax checks below, or `None` otherwise.
///
/// See <https://atproto.com/specs/did#at-protocol-did-identifier-syntax>.
///
/// This parser is intentionally lax: it should accept all valid DIDs, and may
/// accept some invalid ones. In particular, percent-encoding inside the
/// identifier is *not* validated here, because doing so correctly requires
/// knowing the encoding context; that is left to the URI decoder.
///
/// Checks performed:
/// - overall length is at most 2 KB (the atproto hard limit)
/// - only ASCII letters, digits, and `._:%-` appear anywhere
/// - the string starts with lowercase `did:`
/// - the method segment is one or more lowercase ASCII letters followed by `:`
/// - the identifier is non-empty and does not end with `:`
///
/// Query (`?`) and fragment (`#`) parts are rejected as a consequence of the
/// character check; the URI decoder is expected to split them off beforehand.
pub fn parse_did(s: &str) -> Option<String> {
    // DID identifiers do not generally have a maximum length restriction, but
    // in the context of atproto there is an initial hard limit of 2 KB.
    // Checked first because it is the cheapest test.
    const MAX_DID_LEN: usize = 2 * 1024;
    if s.len() > MAX_DID_LEN {
        return None;
    }

    // The entire URI is made up of a subset of ASCII: letters (A-Z, a-z),
    // digits (0-9), period, underscore, colon, percent sign, or hyphen.
    // Any non-ASCII byte fails this check as well.
    let is_allowed =
        |b: u8| b.is_ascii_alphanumeric() || matches!(b, b'.' | b'_' | b':' | b'%' | b'-');
    if !s.bytes().all(is_allowed) {
        return None;
    }

    // The URI is case-sensitive and starts with lowercase `did:`.
    let unprefixed = s.strip_prefix("did:")?;

    // The method segment is one or more lowercase letters (a-z), followed by
    // `:`. The explicit emptiness check matters: `all` is vacuously true for
    // an empty method, which would otherwise let `did::val` through.
    let (method, identifier) = unprefixed.split_once(':')?;
    if method.is_empty() || !method.bytes().all(|b| b.is_ascii_lowercase()) {
        return None;
    }

    // The identifier may not be empty (not written in the spec text, but
    // implied by its examples), and the URI may not end in `:`.
    if identifier.is_empty() || identifier.ends_with(':') {
        return None;
    }

    // The only normalization we might want would be percent-decoding, but
    // that is left to the URI decoder.
    Some(s.to_string())
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_did_parse() {
        for (case, expected, detail) in [
            ("", None, "empty str"),
            (" ", None, "whitespace str"),
            ("z", None, "not a did"),
            ("did:plc", None, "no identifier separator colon"),
            ("did:plc:", None, "missing identifier"),
            (
                "did:web:bad-example.com",
                Some("did:web:bad-example.com"),
                "web did",
            ),
            (
                "did:plc:hdhoaan3xa3jiuq4fg4mefid",
                Some("did:plc:hdhoaan3xa3jiuq4fg4mefid"),
                "plc did",
            ),
            (
                "DID:plc:hdhoaan3xa3jiuq4fg4mefid",
                None,
                "'did:' prefix must be lowercase",
            ),
            (
                "did:ok:z",
                Some("did:ok:z"),
                "unknown did methods are allowed",
            ),
            ("did:BAD:z", None, "non-lowercase methods are not allowed"),
            ("did::z", None, "empty methods are not allowed"),
            ("did:bad:z$z", None, "invalid chars are not allowed"),
            (
                "did:ok:z:z",
                Some("did:ok:z:z"),
                "colons are allowed in identifier",
            ),
            ("did:bad:z:", None, "colons are not allowed at the end"),
            ("did:bad:z?q=y", None, "queries are not allowed in atproto"),
            ("did:bad:z#a", None, "anchors are not allowed in atproto"),
        ] {
            assert_eq!(
                parse_did(case),
                expected.map(|s| s.to_string()),
                "{}",
                detail
            );
        }
    }

    #[test]
    fn test_length_limit() {
        // exactly 2048 bytes in total is still accepted
        let at_limit = format!("did:plc:{}", "a".repeat(2040));
        assert_eq!(at_limit.len(), 2048);
        assert!(parse_did(&at_limit).is_some(), "2 KB did should pass");

        // one byte over the limit is rejected
        let over_limit = format!("did:plc:{}", "a".repeat(2041));
        assert!(parse_did(&over_limit).is_none(), "over-long did should fail");
    }

    #[test]
    fn test_doc_examples_atproto() {
        // https://atproto.com/specs/did#at-protocol-did-identifier-syntax
        for case in ["did:plc:z72i7hdynmk6r22z27h6tvur", "did:web:blueskyweb.xyz"] {
            assert!(parse_did(case).is_some(), "should pass: {}", case)
        }
    }

    #[test]
    fn test_doc_examples_lexicon() {
        // https://atproto.com/specs/did#at-protocol-did-identifier-syntax
        for case in [
            "did:method:val:two",
            "did:m:v",
            "did:method::::val",
            "did:method:-:_:.",
            "did:key:zQ3shZc2QzApp2oymGvQbzP8eKheVshBHbU4ZYjeXqwSKEn6N",
        ] {
            assert!(parse_did(case).is_some(), "should pass: {}", case)
        }
    }

    #[test]
    fn test_doc_examples_invalid() {
        // https://atproto.com/specs/did#at-protocol-did-identifier-syntax
        for case in [
            "did:METHOD:val",
            "did:m123:val",
            "DID:method:val",
            "did:method:",
            "did:method:val/two",
            "did:method:val?two",
            "did:method:val#two",
        ] {
            assert!(parse_did(case).is_none(), "should fail: {}", case)
        }
    }
}
+12 -8
src/lib.rs
··· 1 1 use fluent_uri::Uri; 2 2 3 3 pub mod at_uri; 4 + pub mod did; 4 5 5 6 #[derive(Debug, PartialEq)] 6 7 pub enum Link { 7 8 AtUri(String), 8 9 Uri(String), 9 - } 10 - 11 - // normalizing is a bit opinionated but ehhh 12 - pub fn parse_at_uri(s: &str) -> Option<String> { 13 - at_uri::parse_at_uri(s) 10 + Did(String), 14 11 } 15 12 16 13 // normalizing is a bit opinionated but eh ··· 19 16 } 20 17 21 18 pub fn parse_any(s: &str) -> Option<Link> { 22 - parse_at_uri(s) 23 - .map(Link::AtUri) 24 - .or_else(|| parse_uri(s).map(Link::Uri)) 19 + at_uri::parse_at_uri(s).map(Link::AtUri).or_else(|| { 20 + did::parse_did(s) 21 + .map(Link::Did) 22 + .or_else(|| parse_uri(s).map(Link::Uri)) 23 + }) 25 24 } 26 25 27 26 #[cfg(test)] ··· 60 59 "at://did:plc:44ybard66vv44zksje25o7dz/app.bsky.feed.post/3jwdwj2ctlk26".into() 61 60 )), 62 61 ); 62 + 63 + assert_eq!( 64 + parse_any("did:plc:44ybard66vv44zksje25o7dz"), 65 + Some(Link::Did("did:plc:44ybard66vv44zksje25o7dz".into())) 66 + ) 63 67 } 64 68 }