···5555 </style>
5656 </head>
5757 <body class="{% block body_classes %}{% endblock %}">
5858- <h1><a href="/">This</a> is a <a href="https://github.com/at-ucosm/links/tree/main/constellation">constellation 🌌</a> API server from <a href="https://github.com/at-microcosm">microcosm</a> ✨</h1>
5858+ <h1><a href="/">This</a> is a <a href="https://github.com/at-microcosm/links/tree/main/constellation">constellation 🌌</a> API server from <a href="https://github.com/at-microcosm">microcosm</a> ✨</h1>
5959 {% block content %}{% endblock %}
60606161 <footer>
+2-2
jetstream/Cargo.toml
···10101111[dependencies]
1212async-trait = "0.1.83"
1313-atrium-api = { version = "0.25", default-features = false, features = [
1313+atrium-api = { version = "0.25.2", default-features = false, features = [
1414 "namespace-appbsky",
1515] }
1616tokio = { version = "1.44.2", features = ["full", "sync", "time"] }
···2222futures-util = "0.3.31"
2323url = "2.5.4"
2424serde = { version = "1.0.215", features = ["derive"] }
2525-serde_json = "1.0.132"
2525+serde_json = { version = "1.0.140", features = ["raw_value"] }
2626chrono = "0.4.38"
2727zstd = "0.13.2"
2828thiserror = "2.0.3"
···77use clap::Parser;
88use jetstream::{
99 events::{
1010- commit::{
1111- CommitEvent,
1212- CommitType,
1313- },
1414- JetstreamEvent::Commit,
1010+ CommitEvent,
1111+ CommitOp,
1212+ EventKind,
1313+ JetstreamEvent,
1514 },
1615 DefaultJetstreamEndpoints,
1716 JetstreamCompression,
···2524 /// The DIDs to listen for events on, if not provided we will listen for all DIDs.
2625 #[arg(short, long)]
2726 did: Option<Vec<string::Did>>,
2828- /// The NSID for the collection to listen for (e.g. `app.bsky.feed.post`).
2929- #[arg(short, long)]
3030- nsid: string::Nsid,
3127}
32283329#[tokio::main]
···3733 let dids = args.did.unwrap_or_default();
3834 let config = JetstreamConfig {
3935 endpoint: DefaultJetstreamEndpoints::USEastOne.into(),
4040- wanted_collections: vec![args.nsid.clone()],
3636+ wanted_collections: vec![string::Nsid::new("app.bsky.feed.post".to_string()).unwrap()],
4137 wanted_dids: dids.clone(),
4238 compression: JetstreamCompression::Zstd,
4339 ..Default::default()
···4642 let jetstream = JetstreamConnector::new(config)?;
4743 let mut receiver = jetstream.connect().await?;
48444949- println!(
5050- "Listening for '{}' events on DIDs: {:?}",
5151- args.nsid.as_str(),
5252- dids
5353- );
4545+ println!("Listening for 'app.bsky.feed.post' events on DIDs: {dids:?}");
54465547 while let Some(event) = receiver.recv().await {
5656- if let Commit(commit) = event {
5757- match commit {
5858- CommitEvent::CreateOrUpdate { info: _, commit }
5959- if commit.info.operation == CommitType::Create =>
6060- {
6161- if let AppBskyFeedPost(record) = commit.record {
6262- println!(
6363- "New post created! ({})\n\n'{}'",
6464- commit.info.rkey.as_str(),
6565- record.text
6666- );
6767- }
6868- }
6969- CommitEvent::Delete { info: _, commit } => {
7070- println!("A post has been deleted. ({})", commit.rkey.as_str());
7171- }
7272- _ => {}
4848+ if let JetstreamEvent {
4949+ kind: EventKind::Commit,
5050+ commit:
5151+ Some(CommitEvent {
5252+ operation: CommitOp::Create,
5353+ rkey,
5454+ record: Some(record),
5555+ ..
5656+ }),
5757+ ..
5858+ } = event
5959+ {
6060+ if let Ok(AppBskyFeedPost(rec)) = serde_json::from_str(record.get()) {
6161+ println!("New post created! ({})\n{:?}\n", rkey.as_str(), rec.text);
7362 }
7463 }
7564 }
+205
jetstream/src/events.rs
···11+use std::time::{
22+ Duration,
33+ SystemTime,
44+ SystemTimeError,
55+ UNIX_EPOCH,
66+};
77+88+use chrono::Utc;
99+use serde::{
1010+ Deserialize,
1111+ Serialize,
1212+};
1313+use serde_json::value::RawValue;
1414+1515+use crate::exports;
1616+1717+/// Opaque wrapper for the time_us cursor used by jetstream
1818+#[derive(Debug, Serialize, Deserialize, Copy, Clone, PartialEq, PartialOrd)]
1919+pub struct Cursor(u64);
2020+2121+#[derive(Debug, Deserialize)]
2222+#[serde(rename_all = "snake_case")]
2323+pub struct JetstreamEvent {
2424+ #[serde(rename = "time_us")]
2525+ pub cursor: Cursor,
2626+ pub did: exports::Did,
2727+ pub kind: EventKind,
2828+ pub commit: Option<CommitEvent>,
2929+ pub identity: Option<IdentityEvent>,
3030+ pub account: Option<AccountEvent>,
3131+}
3232+3333+#[derive(Debug, Deserialize, PartialEq)]
3434+#[serde(rename_all = "snake_case")]
3535+pub enum EventKind {
3636+ Commit,
3737+ Identity,
3838+ Account,
3939+}
4040+4141+#[derive(Debug, Deserialize)]
4242+#[serde(rename_all = "snake_case")]
4343+pub struct CommitEvent {
4444+ pub collection: exports::Nsid,
4545+ pub rkey: exports::RecordKey,
4646+ pub rev: String,
4747+ pub operation: CommitOp,
4848+ pub record: Option<Box<RawValue>>,
4949+ pub cid: Option<exports::Cid>,
5050+}
5151+5252+#[derive(Debug, Deserialize, PartialEq)]
5353+#[serde(rename_all = "snake_case")]
5454+pub enum CommitOp {
5555+ Create,
5656+ Update,
5757+ Delete,
5858+}
5959+6060+#[derive(Debug, Deserialize, PartialEq)]
6161+pub struct IdentityEvent {
6262+ pub did: exports::Did,
6363+ pub handle: Option<exports::Handle>,
6464+ pub seq: u64,
6565+ pub time: chrono::DateTime<Utc>,
6666+}
6767+6868+#[derive(Debug, Deserialize, PartialEq)]
6969+pub struct AccountEvent {
7070+ pub active: bool,
7171+ pub did: exports::Did,
7272+ pub seq: u64,
7373+ pub time: chrono::DateTime<Utc>,
7474+ pub status: Option<String>,
7575+}
7676+7777+impl Cursor {
7878+ /// Get a cursor that will consume all available jetstream replay
7979+ ///
8080+ /// This sets the cursor to zero.
8181+ ///
8282+ /// Jetstream instances typically only have a few days of replay.
8383+ pub fn from_start() -> Self {
8484+ Self(0)
8585+ }
8686+ /// Get a cursor for a specific time
8787+ ///
8888+ /// Panics: if t is older than the unix epoch: Jan 1, 1970.
8989+ ///
9090+ /// If you want to receive all available jetstream replay (typically a few days), use
9191+ /// .from_start()
9292+ ///
9393+ /// Warning: this exploits the internal implementation detail of jetstream cursors
9494+ /// being ~microsecond timestamps.
9595+ pub fn at(t: SystemTime) -> Self {
9696+ let unix_dt = t
9797+ .duration_since(UNIX_EPOCH)
9898+ .expect("cannot set jetstream cursor earlier than unix epoch");
9999+ Self(unix_dt.as_micros() as u64)
100100+ }
101101+ /// Get a cursor rewound from now by this amount
102102+ ///
103103+ /// Panics: if d is greater than the time since the unix epoch: Jan 1, 1970.
104104+ ///
105105+ /// Jetstream instances typically only have a few days of replay.
106106+ ///
107107+ /// Warning: this exploits the internal implementation detail of jetstream cursors
108108+ /// being ~microsecond timestamps.
109109+ pub fn back_by(d: Duration) -> Self {
110110+ Self::at(SystemTime::now() - d)
111111+ }
112112+ /// Get a Cursor from a raw u64
113113+ ///
114114+ /// For example, from a jetstream event's `time_us` field.
115115+ pub fn from_raw_u64(time_us: u64) -> Self {
116116+ Self(time_us)
117117+ }
118118+ /// Get the raw u64 value from this cursor.
119119+ pub fn to_raw_u64(&self) -> u64 {
120120+ self.0
121121+ }
122122+ /// Format the cursor value for use in a jetstream connection url querystring
123123+ pub fn to_jetstream(&self) -> String {
124124+ self.0.to_string()
125125+ }
126126+ /// Compute the time span since an earlier cursor or [SystemTime]
127127+ ///
128128+ /// Warning: this exploits the internal implementation detail of jetstream cursors
129129+ /// being ~microsecond timestamps.
130130+ pub fn duration_since(
131131+ &self,
132132+ earlier: impl Into<SystemTime>,
133133+ ) -> Result<Duration, SystemTimeError> {
134134+ let t: SystemTime = self.into();
135135+ t.duration_since(earlier.into())
136136+ }
137137+ /// Compute the age of the cursor vs the local clock
138138+ ///
139139+ /// Warning: this exploits the internal implementation detail of jetstream cursors
140140+ pub fn elapsed(&self) -> Result<Duration, SystemTimeError> {
141141+ let t: SystemTime = self.into();
142142+ t.elapsed()
143143+ }
144144+ /// Get the immediate next cursor value
145145+ ///
146146+ /// This is possible for the implementation of jetstream cursors
147147+ pub fn next(&self) -> Cursor {
148148+ Self(self.0 + 1)
149149+ }
150150+}
151151+152152+impl From<&Cursor> for SystemTime {
153153+ /// Convert a cursor directly to a [SystemTime]
154154+ ///
155155+ /// Warning: this exploits the internal implementation detail of jetstream cursors
156156+ /// being ~microsecond timestamps.
157157+ fn from(c: &Cursor) -> Self {
158158+ UNIX_EPOCH + Duration::from_micros(c.0)
159159+ }
160160+}
#[cfg(test)]
mod test {
    use super::*;

    /// CID carried by both fixtures below.
    const CID: &str = "bafyreidofvwoqvd2cnzbun6dkzgfucxh57tirf3ohhde7lsvh4fu3jehgy";

    /// Exact text of the `record` value in both fixtures; the raw record must
    /// come back byte-for-byte.
    const RECORD: &str = r#"{"$type":"app.bsky.feed.post","createdAt":"2025-04-01T16:58:06.154Z","langs":["en"],"text":"I wish apirl 1st would stop existing lol"}"#;

    /// A bare commit object deserializes and keeps its record unparsed.
    #[test]
    fn test_parse_commit_event() -> anyhow::Result<()> {
        let json = r#"{
            "rev":"3llrdsginou2i",
            "operation":"create",
            "collection":"app.bsky.feed.post",
            "rkey":"3llrdsglqdc2s",
            "cid": "bafyreidofvwoqvd2cnzbun6dkzgfucxh57tirf3ohhde7lsvh4fu3jehgy",
            "record": {"$type":"app.bsky.feed.post","createdAt":"2025-04-01T16:58:06.154Z","langs":["en"],"text":"I wish apirl 1st would stop existing lol"}
        }"#;
        let CommitEvent { cid, record, .. } = serde_json::from_str(json)?;
        assert_eq!(cid.unwrap(), CID.parse()?);
        assert_eq!(record.unwrap().get(), RECORD);
        Ok(())
    }

    /// A full event envelope deserializes, including the nested commit.
    #[test]
    fn test_parse_whole_event() -> anyhow::Result<()> {
        let json = r#"{"did":"did:plc:ai3dzf35cth7s3st7n7jsd7r","time_us":1743526687419798,"kind":"commit","commit":{"rev":"3llrdsginou2i","operation":"create","collection":"app.bsky.feed.post","rkey":"3llrdsglqdc2s","record":{"$type":"app.bsky.feed.post","createdAt":"2025-04-01T16:58:06.154Z","langs":["en"],"text":"I wish apirl 1st would stop existing lol"},"cid":"bafyreidofvwoqvd2cnzbun6dkzgfucxh57tirf3ohhde7lsvh4fu3jehgy"}}"#;
        let JetstreamEvent { kind, commit, .. } = serde_json::from_str(json)?;
        assert_eq!(kind, EventKind::Commit);
        assert!(commit.is_some());
        let CommitEvent { cid, record, .. } = commit.unwrap();
        assert_eq!(cid.unwrap(), CID.parse()?);
        assert_eq!(record.unwrap().get(), RECORD);
        Ok(())
    }
}
-40
jetstream/src/events/account.rs
···11-use chrono::Utc;
22-use serde::Deserialize;
33-44-use crate::{
55- events::EventInfo,
66- exports,
77-};
88-99-/// An event representing a change to an account.
1010-#[derive(Deserialize, Debug)]
1111-pub struct AccountEvent {
1212- /// Basic metadata included with every event.
1313- #[serde(flatten)]
1414- pub info: EventInfo,
1515- /// Account specific data bundled with this event.
1616- pub account: AccountData,
1717-}
1818-1919-/// Account specific data bundled with an account event.
2020-#[derive(Deserialize, Debug)]
2121-pub struct AccountData {
2222- /// Whether the account is currently active.
2323- pub active: bool,
2424- /// The DID of the account.
2525- pub did: exports::Did,
2626- pub seq: u64,
2727- pub time: chrono::DateTime<Utc>,
2828- /// If `active` is `false` this will be present to explain why the account is inactive.
2929- pub status: Option<AccountStatus>,
3030-}
3131-3232-/// The possible reasons an account might be listed as inactive.
3333-#[derive(Deserialize, Debug)]
3434-#[serde(rename_all = "lowercase")]
3535-pub enum AccountStatus {
3636- Deactivated,
3737- Deleted,
3838- Suspended,
3939- TakenDown,
4040-}
-55
jetstream/src/events/commit.rs
···11-use serde::Deserialize;
22-33-use crate::{
44- events::EventInfo,
55- exports,
66-};
77-88-/// An event representing a repo commit, which can be a `create`, `update`, or `delete` operation.
99-#[derive(Deserialize, Debug)]
1010-#[serde(untagged, rename_all = "snake_case")]
1111-pub enum CommitEvent<R> {
1212- CreateOrUpdate {
1313- #[serde(flatten)]
1414- info: EventInfo,
1515- commit: CommitData<R>,
1616- },
1717- Delete {
1818- #[serde(flatten)]
1919- info: EventInfo,
2020- commit: CommitInfo,
2121- },
2222-}
2323-2424-/// The type of commit operation that was performed.
2525-#[derive(Deserialize, Debug, PartialEq)]
2626-#[serde(rename_all = "snake_case")]
2727-pub enum CommitType {
2828- Create,
2929- Update,
3030- Delete,
3131-}
3232-3333-/// Basic commit specific info bundled with every event, also the only data included with a `delete`
3434-/// operation.
3535-#[derive(Deserialize, Debug)]
3636-pub struct CommitInfo {
3737- /// The type of commit operation that was performed.
3838- pub operation: CommitType,
3939- pub rev: String,
4040- pub rkey: exports::RecordKey,
4141- /// The NSID of the record type that this commit is associated with.
4242- pub collection: exports::Nsid,
4343-}
4444-4545-/// Detailed data bundled with a commit event. This data is only included when the event is
4646-/// `create` or `update`.
4747-#[derive(Deserialize, Debug)]
4848-pub struct CommitData<R> {
4949- #[serde(flatten)]
5050- pub info: CommitInfo,
5151- /// The CID of the record that was operated on.
5252- pub cid: exports::Cid,
5353- /// The record that was operated on.
5454- pub record: R,
5555-}
-28
jetstream/src/events/identity.rs
···11-use chrono::Utc;
22-use serde::Deserialize;
33-44-use crate::{
55- events::EventInfo,
66- exports,
77-};
88-99-/// An event representing a change to an identity.
1010-#[derive(Deserialize, Debug)]
1111-pub struct IdentityEvent {
1212- /// Basic metadata included with every event.
1313- #[serde(flatten)]
1414- pub info: EventInfo,
1515- /// Identity specific data bundled with this event.
1616- pub identity: IdentityData,
1717-}
1818-1919-/// Identity specific data bundled with an identity event.
2020-#[derive(Deserialize, Debug)]
2121-pub struct IdentityData {
2222- /// The DID of the identity.
2323- pub did: exports::Did,
2424- /// The handle associated with the identity.
2525- pub handle: Option<exports::Handle>,
2626- pub seq: u64,
2727- pub time: chrono::DateTime<Utc>,
2828-}
-138
jetstream/src/events/mod.rs
···11-pub mod account;
22-pub mod commit;
33-pub mod identity;
44-55-use std::time::{
66- Duration,
77- SystemTime,
88- SystemTimeError,
99- UNIX_EPOCH,
1010-};
1111-1212-use serde::Deserialize;
1313-1414-use crate::exports;
1515-1616-/// Opaque wrapper for the time_us cursor used by jetstream
1717-///
1818-/// Generally, you should use a cursor
1919-#[derive(Deserialize, Debug, Clone, PartialEq, PartialOrd)]
2020-pub struct Cursor(u64);
2121-2222-/// Basic data that is included with every event.
2323-#[derive(Deserialize, Debug)]
2424-pub struct EventInfo {
2525- pub did: exports::Did,
2626- pub time_us: Cursor,
2727- pub kind: EventKind,
2828-}
2929-3030-#[derive(Deserialize, Debug)]
3131-#[serde(untagged)]
3232-pub enum JetstreamEvent<R> {
3333- Commit(commit::CommitEvent<R>),
3434- Identity(identity::IdentityEvent),
3535- Account(account::AccountEvent),
3636-}
3737-3838-#[derive(Deserialize, Debug)]
3939-#[serde(rename_all = "snake_case")]
4040-pub enum EventKind {
4141- Commit,
4242- Identity,
4343- Account,
4444-}
4545-4646-impl<R> JetstreamEvent<R> {
4747- pub fn cursor(&self) -> Cursor {
4848- match self {
4949- JetstreamEvent::Commit(commit::CommitEvent::CreateOrUpdate { info, .. }) => {
5050- info.time_us.clone()
5151- }
5252- JetstreamEvent::Commit(commit::CommitEvent::Delete { info, .. }) => {
5353- info.time_us.clone()
5454- }
5555- JetstreamEvent::Identity(e) => e.info.time_us.clone(),
5656- JetstreamEvent::Account(e) => e.info.time_us.clone(),
5757- }
5858- }
5959-}
6060-6161-impl Cursor {
6262- /// Get a cursor that will consume all available jetstream replay
6363- ///
6464- /// This sets the cursor to zero.
6565- ///
6666- /// Jetstream instances typically only have a few days of replay.
6767- pub fn from_start() -> Self {
6868- Self(0)
6969- }
7070- /// Get a cursor for a specific time
7171- ///
7272- /// Panics: if t is older than the unix epoch: Jan 1, 1970.
7373- ///
7474- /// If you want to receive all available jetstream replay (typically a few days), use
7575- /// .from_start()
7676- ///
7777- /// Warning: this exploits the internal implementation detail of jetstream cursors
7878- /// being ~microsecond timestamps.
7979- pub fn at(t: SystemTime) -> Self {
8080- let unix_dt = t
8181- .duration_since(UNIX_EPOCH)
8282- .expect("cannot set jetstream cursor earlier than unix epoch");
8383- Self(unix_dt.as_micros() as u64)
8484- }
8585- /// Get a cursor rewound from now by this amount
8686- ///
8787- /// Panics: if d is greater than the time since the unix epoch: Jan 1, 1970.
8888- ///
8989- /// Jetstream instances typically only have a few days of replay.
9090- ///
9191- /// Warning: this exploits the internal implementation detail of jetstream cursors
9292- /// being ~microsecond timestamps.
9393- pub fn back_by(d: Duration) -> Self {
9494- Self::at(SystemTime::now() - d)
9595- }
9696- /// Get a Cursor from a raw u64
9797- ///
9898- /// For example, from a jetstream event's `time_us` field.
9999- pub fn from_raw_u64(time_us: u64) -> Self {
100100- Self(time_us)
101101- }
102102- /// Get the raw u64 value from this cursor.
103103- pub fn to_raw_u64(&self) -> u64 {
104104- self.0
105105- }
106106- /// Format the cursor value for use in a jetstream connection url querystring
107107- pub fn to_jetstream(&self) -> String {
108108- self.0.to_string()
109109- }
110110- /// Compute the time span since an earlier cursor or [SystemTime]
111111- ///
112112- /// Warning: this exploits the internal implementation detail of jetstream cursors
113113- /// being ~microsecond timestamps.
114114- pub fn duration_since(
115115- &self,
116116- earlier: impl Into<SystemTime>,
117117- ) -> Result<Duration, SystemTimeError> {
118118- let t: SystemTime = self.into();
119119- t.duration_since(earlier.into())
120120- }
121121- /// Compute the age of the cursor vs the local clock
122122- ///
123123- /// Warning: this exploits the internal implementation detail of jetstream cursors
124124- pub fn elapsed(&self) -> Result<Duration, SystemTimeError> {
125125- let t: SystemTime = self.into();
126126- t.elapsed()
127127- }
128128-}
129129-130130-impl From<&Cursor> for SystemTime {
131131- /// Convert a cursor directly to a [SystemTime]
132132- ///
133133- /// Warning: this exploits the internal implementation detail of jetstream cursors
134134- /// being ~microsecond timestamps.
135135- fn from(c: &Cursor) -> Self {
136136- UNIX_EPOCH + Duration::from_micros(c.0)
137137- }
138138-}
+21-40
jetstream/src/lib.rs
···33pub mod exports;
4455use std::{
66- io::{
77- Cursor as IoCursor,
88- Read,
99- },
1010- marker::PhantomData,
66+ io::Cursor as IoCursor,
117 time::{
128 Duration,
139 Instant,
1410 },
1511};
16121717-use atrium_api::record::KnownRecord;
1813use futures_util::{
1914 stream::StreamExt,
2015 SinkExt,
2116};
2222-use serde::de::DeserializeOwned;
2317use tokio::{
2418 net::TcpStream,
2519 sync::mpsc::{
···124118const JETSTREAM_ZSTD_DICTIONARY: &[u8] = include_bytes!("../zstd/dictionary");
125119126120/// A receiver channel for consuming Jetstream events.
127127-pub type JetstreamReceiver<R> = Receiver<JetstreamEvent<R>>;
121121+pub type JetstreamReceiver = Receiver<JetstreamEvent>;
128122129123/// An internal sender channel for sending Jetstream events to [JetstreamReceiver]'s.
130130-type JetstreamSender<R> = Sender<JetstreamEvent<R>>;
124124+type JetstreamSender = Sender<JetstreamEvent>;
131125132126/// A wrapper connector type for working with a WebSocket connection to a Jetstream instance to
133127/// receive and consume events. See [JetstreamConnector::connect] for more info.
134134-pub struct JetstreamConnector<R: DeserializeOwned> {
128128+pub struct JetstreamConnector {
135129 /// The configuration for the Jetstream connection.
136136- config: JetstreamConfig<R>,
130130+ config: JetstreamConfig,
137131}
138132139133pub enum JetstreamCompression {
···163157 }
164158}
165159166166-pub struct JetstreamConfig<R: DeserializeOwned = KnownRecord> {
160160+pub struct JetstreamConfig {
167161 /// A Jetstream endpoint to connect to with a WebSocket Scheme i.e.
168162 /// `wss://jetstream1.us-east.bsky.network/subscribe`.
169163 pub endpoint: String,
···200194 /// can help prevent that if your consumer sometimes pauses, at a cost of higher memory
201195 /// usage while events are buffered.
202196 pub channel_size: usize,
203203- /// Marker for record deserializable type.
204204- ///
205205- /// See examples/arbitrary_record.rs for an example using serde_json::Value
206206- ///
207207- /// You can omit this if you construct `JetstreamConfig { a: b, ..Default::default() }.
208208- /// If you have to specify it, use `std::marker::PhantomData` with no type parameters.
209209- pub record_type: PhantomData<R>,
210197}
211198212212-impl<R: DeserializeOwned> Default for JetstreamConfig<R> {
199199+impl Default for JetstreamConfig {
213200 fn default() -> Self {
214201 JetstreamConfig {
215202 endpoint: DefaultJetstreamEndpoints::USEastOne.into(),
···220207 omit_user_agent_jetstream_info: false,
221208 replay_on_reconnect: false,
222209 channel_size: 4096, // a few seconds of firehose buffer
223223- record_type: PhantomData,
224210 }
225211 }
226212}
227213228228-impl<R: DeserializeOwned> JetstreamConfig<R> {
214214+impl JetstreamConfig {
229215 /// Constructs a new endpoint URL with the given [JetstreamConfig] applied.
230216 pub fn get_request_builder(
231217 &self,
···313299 }
314300}
315301316316-impl<R: DeserializeOwned + Send + 'static> JetstreamConnector<R> {
302302+impl JetstreamConnector {
317303 /// Create a Jetstream connector with a valid [JetstreamConfig].
318304 ///
319305 /// After creation, you can call [connect] to connect to the provided Jetstream instance.
320320- pub fn new(config: JetstreamConfig<R>) -> Result<Self, ConfigValidationError> {
306306+ pub fn new(config: JetstreamConfig) -> Result<Self, ConfigValidationError> {
321307 // We validate the configuration here so any issues are caught early.
322308 config.validate()?;
323309 Ok(JetstreamConnector { config })
···327313 ///
328314 /// A [JetstreamReceiver] is returned which can be used to respond to events. When all instances
329315 /// of this receiver are dropped, the connection and task are automatically closed.
330330- pub async fn connect(&self) -> Result<JetstreamReceiver<R>, ConnectionError> {
316316+ pub async fn connect(&self) -> Result<JetstreamReceiver, ConnectionError> {
331317 self.connect_cursor(None).await
332318 }
333319···343329 pub async fn connect_cursor(
344330 &self,
345331 cursor: Option<Cursor>,
346346- ) -> Result<JetstreamReceiver<R>, ConnectionError> {
332332+ ) -> Result<JetstreamReceiver, ConnectionError> {
347333 // We validate the config again for good measure. Probably not necessary but it can't hurt.
348334 self.config
349335 .validate()
···365351 loop {
366352 let dict = DecoderDictionary::copy(JETSTREAM_ZSTD_DICTIONARY);
367353368368- let req = match build_request(connect_cursor.clone()) {
354354+ let req = match build_request(connect_cursor) {
369355 Ok(req) => req,
370356 Err(e) => {
371357 log::error!("Could not build jetstream websocket request: {e:?}");
···373359 }
374360 };
375361376376- let mut last_cursor = connect_cursor.clone();
362362+ let mut last_cursor = connect_cursor;
377363 retry_attempt += 1;
378364 if let Ok((ws_stream, _)) = connect_async(req).await {
379365 let t_connected = Instant::now();
···424410425411/// The main task that handles the WebSocket connection and sends [JetstreamEvent]'s to any
426412/// receivers that are listening for them.
427427-async fn websocket_task<R: DeserializeOwned>(
413413+async fn websocket_task(
428414 dictionary: DecoderDictionary<'_>,
429415 ws: WebSocketStream<MaybeTlsStream<TcpStream>>,
430430- send_channel: JetstreamSender<R>,
416416+ send_channel: JetstreamSender,
431417 last_cursor: &mut Option<Cursor>,
432418) -> Result<(), JetstreamEventError> {
433419 // TODO: Use the write half to allow the user to change configuration settings on the fly.
···439425 Some(Ok(message)) => {
440426 match message {
441427 Message::Text(json) => {
442442- let event: JetstreamEvent<R> = serde_json::from_str(&json)
428428+ let event: JetstreamEvent = serde_json::from_str(&json)
443429 .map_err(JetstreamEventError::ReceivedMalformedJSON)?;
444444- let event_cursor = event.cursor();
430430+ let event_cursor = event.cursor;
445431446432 if let Some(last) = last_cursor {
447433 if event_cursor <= *last {
···464450 }
465451 Message::Binary(zstd_json) => {
466452 let mut cursor = IoCursor::new(zstd_json);
467467- let mut decoder = zstd::stream::Decoder::with_prepared_dictionary(
453453+ let decoder = zstd::stream::Decoder::with_prepared_dictionary(
468454 &mut cursor,
469455 &dictionary,
470456 )
471457 .map_err(JetstreamEventError::CompressionDictionaryError)?;
472458473473- let mut json = String::new();
474474- decoder
475475- .read_to_string(&mut json)
476476- .map_err(JetstreamEventError::CompressionDecoderError)?;
477477-478478- let event: JetstreamEvent<R> = serde_json::from_str(&json)
459459+ let event: JetstreamEvent = serde_json::from_reader(decoder)
479460 .map_err(JetstreamEventError::ReceivedMalformedJSON)?;
480480- let event_cursor = event.cursor();
461461+ let event_cursor = event.cursor;
481462482463 if let Some(last) = last_cursor {
483464 if event_cursor <= *last {
···122122```bash
123123sudo sysctl -w net.ipv6.conf.default.disable_ipv6=1
124124```
125125+126126+127127+---
128128+129129+## fuzzing
130130+131131+got bit by https://github.com/cloudflare/cardinality-estimator/pull/12, so now we have a fuzz target.
132132+133133+install cargo-fuzz and then
134134+135135+```bash
136136+RUSTFLAGS="-Z sanitizer=address" cargo +nightly fuzz run cardinality_estimator
137137+```
138138+139139+to fuzz the counts value things
+92-140
ufos/src/consumer.rs
···11use jetstream::{
22- events::{
33- account::AccountEvent,
44- commit::{CommitData, CommitEvent, CommitInfo, CommitType},
55- Cursor, EventInfo, JetstreamEvent,
66- },
77- exports::Did,
22+ events::{Cursor, EventKind, JetstreamEvent},
33+ exports::{Did, Nsid},
84 DefaultJetstreamEndpoints, JetstreamCompression, JetstreamConfig, JetstreamConnector,
95 JetstreamReceiver,
106};
···128use std::time::Duration;
139use tokio::sync::mpsc::{channel, Receiver, Sender};
14101515-use crate::{CreateRecord, DeleteAccount, DeleteRecord, EventBatch, ModifyRecord, UpdateRecord};
1111+use crate::error::{BatchInsertError, FirehoseEventError};
1212+use crate::{DeleteAccount, EventBatch, UFOsCommit};
16131717-const MAX_BATCHED_RECORDS: usize = 128; // *non-blocking* limit. drops oldest batched record per collection once reached.
1818-const MAX_BATCHED_MODIFIES: usize = 512; // hard limit, total updates and deletes across all collections.
1919-const MAX_ACCOUNT_REMOVES: usize = 512; // hard limit, total account deletions. actually the least frequent event, but tiny.
2020-const MAX_BATCHED_COLLECTIONS: usize = 64; // hard limit, MAX_BATCHED_RECORDS applies per collection
2121-const MIN_BATCH_SPAN_SECS: f64 = 2.; // try to get a bit of rest a bit.
2222-const MAX_BATCH_SPAN_SECS: f64 = 60.; // hard limit of duration from oldest to latest event cursor within a batch, in seconds.
1414+pub const MAX_BATCHED_RECORDS: usize = 128; // *non-blocking* limit. drops oldest batched record per collection once reached.
1515+pub const MAX_ACCOUNT_REMOVES: usize = 1024; // hard limit, extremely unlikely to reach, but just in case
1616+pub const MAX_BATCHED_COLLECTIONS: usize = 64; // hard limit, MAX_BATCHED_RECORDS applies per-collection
1717+pub const MIN_BATCH_SPAN_SECS: f64 = 2.; // breathe
1818+pub const MAX_BATCH_SPAN_SECS: f64 = 60.; // hard limit, pause consumer if we're unable to send by now
1919+pub const SEND_TIMEOUT_S: f64 = 15.; // if the channel is blocked longer than this, something is probably up
2020+pub const BATCH_QUEUE_SIZE: usize = 1; // nearly-rendez-vous
23212424-const SEND_TIMEOUT_S: f64 = 60.;
2525-const BATCH_QUEUE_SIZE: usize = 512; // 4096 got OOM'd. update: 1024 also got OOM'd during L0 compaction blocking
2222+pub type LimitedBatch = EventBatch<MAX_BATCHED_RECORDS>;
2323+2424+#[derive(Debug, Default)]
2525+struct CurrentBatch {
2626+ initial_cursor: Option<Cursor>,
2727+ batch: LimitedBatch,
2828+}
26292730#[derive(Debug)]
2828-struct Batcher {
2929- jetstream_receiver: JetstreamReceiver<serde_json::Value>,
3030- batch_sender: Sender<EventBatch>,
3131- current_batch: EventBatch,
3131+pub struct Batcher {
3232+ jetstream_receiver: JetstreamReceiver,
3333+ batch_sender: Sender<LimitedBatch>,
3434+ current_batch: CurrentBatch,
3235}
33363437pub async fn consume(
3538 jetstream_endpoint: &str,
3639 cursor: Option<Cursor>,
3740 no_compress: bool,
3838-) -> anyhow::Result<Receiver<EventBatch>> {
4141+) -> anyhow::Result<Receiver<LimitedBatch>> {
3942 let endpoint = DefaultJetstreamEndpoints::endpoint_or_shortcut(jetstream_endpoint);
4043 if endpoint == jetstream_endpoint {
4141- eprintln!("connecting to jetstream at {endpoint}");
4444+ log::info!("connecting to jetstream at {endpoint}");
4245 } else {
4343- eprintln!("connecting to jetstream at {jetstream_endpoint} => {endpoint}");
4646+ log::info!("connecting to jetstream at {jetstream_endpoint} => {endpoint}");
4447 }
4545- let config: JetstreamConfig<serde_json::Value> = JetstreamConfig {
4848+ let config: JetstreamConfig = JetstreamConfig {
4649 endpoint,
4750 compression: if no_compress {
4851 JetstreamCompression::None
4952 } else {
5053 JetstreamCompression::Zstd
5154 },
5252- channel_size: 64, // small because we'd rather buffer events into batches
5555+ replay_on_reconnect: true,
5656+ channel_size: 1024, // buffer up to ~1s of jetstream events
5357 ..Default::default()
5458 };
5559 let jetstream_receiver = JetstreamConnector::new(config)?
5660 .connect_cursor(cursor)
5761 .await?;
5858- let (batch_sender, batch_reciever) = channel::<EventBatch>(BATCH_QUEUE_SIZE);
6262+ let (batch_sender, batch_reciever) = channel::<LimitedBatch>(BATCH_QUEUE_SIZE);
5963 let mut batcher = Batcher::new(jetstream_receiver, batch_sender);
6064 tokio::task::spawn(async move { batcher.run().await });
6165 Ok(batch_reciever)
6266}
63676468impl Batcher {
6565- fn new(
6666- jetstream_receiver: JetstreamReceiver<serde_json::Value>,
6767- batch_sender: Sender<EventBatch>,
6868- ) -> Self {
6969+ pub fn new(jetstream_receiver: JetstreamReceiver, batch_sender: Sender<LimitedBatch>) -> Self {
6970 Self {
7071 jetstream_receiver,
7172 batch_sender,
···7374 }
7475 }
75767676- async fn run(&mut self) -> anyhow::Result<()> {
7777+ pub async fn run(&mut self) -> anyhow::Result<()> {
7778 loop {
7879 if let Some(event) = self.jetstream_receiver.recv().await {
7980 self.handle_event(event).await?
···8384 }
8485 }
85868686- async fn handle_event(
8787- &mut self,
8888- event: JetstreamEvent<serde_json::Value>,
8989- ) -> anyhow::Result<()> {
9090- let event_cursor = event.cursor();
9191-9292- if let Some(earliest) = &self.current_batch.first_jetstream_cursor {
9393- if event_cursor.duration_since(earliest)? > Duration::from_secs_f64(MAX_BATCH_SPAN_SECS)
8787+ async fn handle_event(&mut self, event: JetstreamEvent) -> anyhow::Result<()> {
8888+ if let Some(earliest) = &self.current_batch.initial_cursor {
8989+ if event.cursor.duration_since(earliest)? > Duration::from_secs_f64(MAX_BATCH_SPAN_SECS)
9490 {
9595- self.send_current_batch_now().await?;
9191+ self.send_current_batch_now(false).await?;
9692 }
9793 } else {
9898- self.current_batch.first_jetstream_cursor = Some(event_cursor.clone());
9494+ self.current_batch.initial_cursor = Some(event.cursor);
9995 }
10096101101- match event {
102102- JetstreamEvent::Commit(CommitEvent::CreateOrUpdate { commit, info }) => {
103103- match commit.info.operation {
104104- CommitType::Create => self.handle_create_record(commit, info).await?,
105105- CommitType::Update => {
106106- self.handle_modify_record(modify_update(commit, info))
107107- .await?
108108- }
109109- CommitType::Delete => {
110110- panic!("jetstream Commit::CreateOrUpdate had Delete operation type")
111111- }
9797+ match event.kind {
9898+ EventKind::Commit => {
9999+ let commit = event
100100+ .commit
101101+ .ok_or(FirehoseEventError::CommitEventMissingCommit)?;
102102+ let (commit, nsid) = UFOsCommit::from_commit_info(commit, event.did, event.cursor)?;
103103+ self.handle_commit(commit, nsid).await?;
104104+ }
105105+ EventKind::Account => {
106106+ let account = event
107107+ .account
108108+ .ok_or(FirehoseEventError::AccountEventMissingAccount)?;
109109+ if !account.active {
110110+ self.handle_delete_account(event.did, event.cursor).await?;
112111 }
113112 }
114114- JetstreamEvent::Commit(CommitEvent::Delete { commit, info }) => {
115115- self.handle_modify_record(modify_delete(commit, info))
116116- .await?
117117- }
118118- JetstreamEvent::Account(AccountEvent { info, account }) if !account.active => {
119119- self.handle_remove_account(info.did, info.time_us).await?
120120- }
121121- JetstreamEvent::Account(_) => {} // ignore account *activations*
122122- JetstreamEvent::Identity(_) => {} // identity events are noops for us
123123- };
124124- self.current_batch.last_jetstream_cursor = Some(event_cursor.clone());
113113+ _ => {}
114114+ }
125115126116 // if the queue is empty and we have enough, send immediately. otherwise, let the current batch fill up.
127127- if let Some(earliest) = &self.current_batch.first_jetstream_cursor {
128128- if event_cursor.duration_since(earliest)?.as_secs_f64() > MIN_BATCH_SPAN_SECS
117117+ if let Some(earliest) = &self.current_batch.initial_cursor {
118118+ if event.cursor.duration_since(earliest)?.as_secs_f64() > MIN_BATCH_SPAN_SECS
129119 && self.batch_sender.capacity() == BATCH_QUEUE_SIZE
130120 {
131131- log::trace!("queue empty: immediately sending batch.");
132132- if let Err(send_err) = self
133133- .batch_sender
134134- .send(mem::take(&mut self.current_batch))
135135- .await
136136- {
137137- anyhow::bail!("Could not send batch, likely because the receiver closed or dropped: {send_err:?}");
138138- }
121121+ self.send_current_batch_now(true).await?;
139122 }
140123 }
141124 Ok(())
142125 }
143126144144- // holds up all consumer progress until it can send to the channel
145145- // use this when the current batch is too full to add more to it
146146- async fn send_current_batch_now(&mut self) -> anyhow::Result<()> {
147147- log::warn!(
148148- "attempting to send batch now (capacity: {})",
149149- self.batch_sender.capacity()
127127+ async fn handle_commit(&mut self, commit: UFOsCommit, collection: Nsid) -> anyhow::Result<()> {
128128+ let optimistic_res = self.current_batch.batch.insert_commit_by_nsid(
129129+ &collection,
130130+ commit,
131131+ MAX_BATCHED_COLLECTIONS,
150132 );
151151- self.batch_sender
152152- .send_timeout(
153153- mem::take(&mut self.current_batch),
154154- Duration::from_secs_f64(SEND_TIMEOUT_S),
155155- )
156156- .await?;
157157- Ok(())
158158- }
159133160160- async fn handle_create_record(
161161- &mut self,
162162- commit: CommitData<serde_json::Value>,
163163- info: EventInfo,
164164- ) -> anyhow::Result<()> {
165165- if !self
166166- .current_batch
167167- .record_creates
168168- .contains_key(&commit.info.collection)
169169- && self.current_batch.record_creates.len() >= MAX_BATCHED_COLLECTIONS
170170- {
171171- self.send_current_batch_now().await?;
134134+ if let Err(BatchInsertError::BatchFull(commit)) = optimistic_res {
135135+ self.send_current_batch_now(false).await?;
136136+ self.current_batch.batch.insert_commit_by_nsid(
137137+ &collection,
138138+ commit,
139139+ MAX_BATCHED_COLLECTIONS,
140140+ )?;
141141+ } else {
142142+ optimistic_res?;
172143 }
173173- let record = CreateRecord {
174174- did: info.did,
175175- rkey: commit.info.rkey,
176176- record: commit.record,
177177- cursor: info.time_us,
178178- };
179179- let collection = self
180180- .current_batch
181181- .record_creates
182182- .entry(commit.info.collection)
183183- .or_default();
184184- collection.total_seen += 1;
185185- collection.samples.push_front(record);
186186- collection.samples.truncate(MAX_BATCHED_RECORDS);
187187- Ok(())
188188- }
189144190190- async fn handle_modify_record(&mut self, modify_record: ModifyRecord) -> anyhow::Result<()> {
191191- if self.current_batch.record_modifies.len() >= MAX_BATCHED_MODIFIES {
192192- self.send_current_batch_now().await?;
193193- }
194194- self.current_batch.record_modifies.push(modify_record);
195145 Ok(())
196146 }
197147198198- async fn handle_remove_account(&mut self, did: Did, cursor: Cursor) -> anyhow::Result<()> {
199199- if self.current_batch.account_removes.len() >= MAX_ACCOUNT_REMOVES {
200200- self.send_current_batch_now().await?;
148148+ async fn handle_delete_account(&mut self, did: Did, cursor: Cursor) -> anyhow::Result<()> {
149149+ if self.current_batch.batch.account_removes.len() >= MAX_ACCOUNT_REMOVES {
150150+ self.send_current_batch_now(false).await?;
201151 }
202152 self.current_batch
153153+ .batch
203154 .account_removes
204155 .push(DeleteAccount { did, cursor });
205156 Ok(())
206157 }
207207-}
208158209209-fn modify_update(commit: CommitData<serde_json::Value>, info: EventInfo) -> ModifyRecord {
210210- ModifyRecord::Update(UpdateRecord {
211211- did: info.did,
212212- collection: commit.info.collection,
213213- rkey: commit.info.rkey,
214214- record: commit.record,
215215- cursor: info.time_us,
216216- })
217217-}
218218-219219-fn modify_delete(commit_info: CommitInfo, info: EventInfo) -> ModifyRecord {
220220- ModifyRecord::Delete(DeleteRecord {
221221- did: info.did,
222222- collection: commit_info.collection,
223223- rkey: commit_info.rkey,
224224- cursor: info.time_us,
225225- })
159159+ // holds up all consumer progress until it can send to the channel
160160+ // use this when the current batch is too full to add more to it
161161+ async fn send_current_batch_now(&mut self, small: bool) -> anyhow::Result<()> {
162162+ let beginning = match self.current_batch.initial_cursor.map(|c| c.elapsed()) {
163163+ None => "unknown".to_string(),
164164+ Some(Ok(t)) => format!("{:?}", t),
165165+ Some(Err(e)) => format!("+{:?}", e.duration()),
166166+ };
167167+ log::info!(
168168+ "sending batch now from {beginning}, {}, queue capacity: {}",
169169+ if small { "small" } else { "full" },
170170+ self.batch_sender.capacity(),
171171+ );
172172+ let current = mem::take(&mut self.current_batch);
173173+ self.batch_sender
174174+ .send_timeout(current.batch, Duration::from_secs_f64(SEND_TIMEOUT_S))
175175+ .await?;
176176+ Ok(())
177177+ }
226178}