tangled
alpha
login
or
join now
ptr.pet
/
Allegedly
forked from
microcosm.blue/Allegedly
0
fork
atom
Server tools to backfill, tail, mirror, and verify PLC logs
0
fork
atom
overview
issues
pulls
pipelines
one bin cli (tail to start)
bad-example.com
5 months ago
d68ab94e
c8224075
+242
-235
3 changed files
expand all
collapse all
unified
split
src
bin
backfill.rs
main.rs
tail.rs
+213
src/bin/backfill.rs
···
1
1
+
use clap::Parser;
2
2
+
use std::time::Duration;
3
3
+
use url::Url;
4
4
+
5
5
+
use allegedly::{Db, Dt, ExportPage, Op, bin_init, poll_upstream, week_to_pages};
6
6
+
7
7
+
// Capacity of the channel between the export producer and the postgres
// writer. 0 makes flume::bounded a rendezvous channel: the producer blocks
// until the writer accepts each page, giving natural backpressure.
const EXPORT_PAGE_QUEUE_SIZE: usize = 0; // rendezvous for now
// One week of seconds; bulk export files are keyed by week-truncated
// unix timestamps.
const WEEK_IN_SECONDS: u64 = 7 * 86400;
9
9
+
10
10
+
// CLI arguments for the backfill binary, parsed by clap from flags or env.
#[derive(Parser)]
struct Args {
    /// Upstream PLC server to mirror
    ///
    /// default: https://plc.directory
    #[arg(long, env)]
    #[clap(default_value = "https://plc.directory")]
    upstream: Url,
    /// Bulk export source prefix
    ///
    /// Must be a prefix for urls ending with {WEEK_TIMESTAMP}.jsonl.gz
    ///
    /// default: https://plc.t3.storage.dev/plc.directory/
    ///
    /// pass "off" to skip fast bulk backfilling
    // NOTE(review): the field type is `Url`, and "off" is not a valid
    // absolute URL, so clap's Url value parser will likely reject the
    // documented "off" sentinel; nothing downstream checks for it either.
    // Confirm whether this escape hatch is actually implemented.
    #[arg(long, env)]
    #[clap(default_value = "https://plc.t3.storage.dev/plc.directory/")]
    upstream_bulk: Url,
    /// The oldest available bulk upstream export timestamp
    ///
    /// Must be a week-truncated unix timestamp
    ///
    /// plc.directory's oldest week is `1668643200`; you probably don't want to change this.
    #[arg(long, env)]
    #[clap(default_value = "1668643200")]
    bulk_epoch: u64,
    /// Mirror PLC's postgres database
    ///
    /// URI string with credentials etc
    #[arg(long, env)]
    postgres: String,
}
42
42
+
43
43
+
/// Stream historical weekly bulk export files into `tx`, one week at a time,
/// starting at `epoch` (a week-truncated unix timestamp).
///
/// Stops before recent weeks: only files older than the cutoff below are
/// consumed here; newer ops are left to the live /export poller.
async fn bulk_backfill((upstream, epoch): (Url, u64), tx: flume::Sender<ExportPage>) {
    // Cutoff is now minus 11 days (one full week + 4 days of slack).
    // NOTE(review): presumably weeks newer than this may still be mutable
    // or unpublished upstream — confirm against the export publisher.
    let immutable_cutoff = std::time::SystemTime::now() - Duration::from_secs((7 + 4) * 86400);
    let immutable_ts = (immutable_cutoff.duration_since(std::time::SystemTime::UNIX_EPOCH))
        .unwrap()
        .as_secs();
    // Truncate the cutoff down to the start of its week.
    let immutable_week = (immutable_ts / WEEK_IN_SECONDS) * WEEK_IN_SECONDS;
    let mut week = epoch;
    let mut week_n = 0;
    while week < immutable_week {
        log::info!("backfilling week {week_n} ({week})");
        // Bulk files live at {prefix}{WEEK_TIMESTAMP}.jsonl.gz
        let url = upstream.join(&format!("{week}.jsonl.gz")).unwrap();
        // Panics on any fetch/decode failure; acceptable for a one-shot
        // backfill binary, but there is no retry.
        week_to_pages(url, tx.clone()).await.unwrap();
        week_n += 1;
        week += WEEK_IN_SECONDS;
    }
}
59
59
+
60
60
+
/// Produce export pages for the writer task: bulk-backfill first when the
/// local database is empty, then poll the upstream /export endpoint.
async fn export_upstream(
    upstream: Url,
    bulk: (Url, u64),
    tx: flume::Sender<ExportPage>,
    pg_client: tokio_postgres::Client,
) {
    // Newest op timestamp already stored locally; None means empty mirror.
    let latest = get_latest(&pg_client).await;

    if latest.is_none() {
        // Fresh database: take the fast path through weekly bulk exports.
        // NOTE(review): Args documents passing "off" for the bulk URL to
        // skip this step, but no check for it exists here — confirm.
        bulk_backfill(bulk, tx.clone()).await;
    }
    let mut upstream = upstream;
    upstream.set_path("/export");
    // Tail the live export feed, resuming from `latest` when present.
    poll_upstream(latest, upstream, tx).await.unwrap();
}
75
75
+
76
76
+
async fn write_pages(
77
77
+
rx: flume::Receiver<ExportPage>,
78
78
+
mut pg_client: tokio_postgres::Client,
79
79
+
) -> Result<(), anyhow::Error> {
80
80
+
// TODO: one big upsert at the end from select distinct on the other table
81
81
+
82
82
+
// let upsert_did = &pg_client
83
83
+
// .prepare(
84
84
+
// r#"
85
85
+
// INSERT INTO dids (did) VALUES ($1)
86
86
+
// ON CONFLICT DO NOTHING"#,
87
87
+
// )
88
88
+
// .await
89
89
+
// .unwrap();
90
90
+
91
91
+
let insert_op = &pg_client
92
92
+
.prepare(
93
93
+
r#"
94
94
+
INSERT INTO operations (did, operation, cid, nullified, "createdAt")
95
95
+
VALUES ($1, $2, $3, $4, $5)
96
96
+
ON CONFLICT (did, cid) DO UPDATE
97
97
+
SET nullified = excluded.nullified,
98
98
+
"createdAt" = excluded."createdAt"
99
99
+
WHERE operations.nullified = excluded.nullified
100
100
+
OR operations."createdAt" = excluded."createdAt""#,
101
101
+
) // idea: op is provable via cid, so leave it out. after did/cid (pk) that leaves nullified and createdAt
102
102
+
// that we want to notice changing.
103
103
+
// normal insert: no conflict, rows changed = 1
104
104
+
// conflict (exact match): where clause passes, rows changed = 1
105
105
+
// conflict (mismatch): where clause fails, rows changed = 0 (detect this and warn!)
106
106
+
.await
107
107
+
.unwrap();
108
108
+
109
109
+
while let Ok(page) = rx.recv_async().await {
110
110
+
log::trace!("got a page...");
111
111
+
112
112
+
let tx = pg_client.transaction().await.unwrap();
113
113
+
114
114
+
// TODO: probably figure out postgres COPY IN
115
115
+
// for now just write everything into a transaction
116
116
+
117
117
+
log::trace!("setting up inserts...");
118
118
+
for op_line in page
119
119
+
.ops
120
120
+
.into_iter()
121
121
+
.flat_map(|s| {
122
122
+
s.replace("}{", "}\n{")
123
123
+
.split('\n')
124
124
+
.map(|s| s.trim())
125
125
+
.map(Into::into)
126
126
+
.collect::<Vec<String>>()
127
127
+
})
128
128
+
.filter(|s| !s.is_empty())
129
129
+
{
130
130
+
let Ok(op) = serde_json::from_str::<Op>(&op_line)
131
131
+
.inspect_err(|e| log::error!("failing! at the {op_line}! {e}"))
132
132
+
else {
133
133
+
log::error!("ayeeeee just ignoring this error for now......");
134
134
+
continue;
135
135
+
};
136
136
+
// let client = &tx;
137
137
+
138
138
+
// client.execute(upsert_did, &[&op.did]).await.unwrap();
139
139
+
140
140
+
// let sp = tx.savepoint("op").await.unwrap();
141
141
+
let inserted = tx
142
142
+
.execute(
143
143
+
insert_op,
144
144
+
&[
145
145
+
&op.did,
146
146
+
&tokio_postgres::types::Json(op.operation),
147
147
+
&op.cid,
148
148
+
&op.nullified,
149
149
+
&op.created_at,
150
150
+
],
151
151
+
)
152
152
+
.await
153
153
+
.unwrap();
154
154
+
if inserted != 1 {
155
155
+
log::warn!(
156
156
+
"possible log modification: {inserted} rows changed after upserting {op:?}"
157
157
+
);
158
158
+
}
159
159
+
// {
160
160
+
// if e.code() != Some(&tokio_postgres::error::SqlState::UNIQUE_VIOLATION) {
161
161
+
// anyhow::bail!(e);
162
162
+
// }
163
163
+
// // TODO: assert that the row has not changed
164
164
+
// log::warn!("ignoring dup");
165
165
+
// }
166
166
+
}
167
167
+
168
168
+
tx.commit().await.unwrap();
169
169
+
}
170
170
+
Ok(())
171
171
+
}
172
172
+
173
173
+
/// Fetch the newest "createdAt" stored in `operations`, or `None` when the
/// table is empty. Panics on query failure.
async fn get_latest(pg_client: &tokio_postgres::Client) -> Option<Dt> {
    let query = r#"SELECT "createdAt" FROM operations
            ORDER BY "createdAt" DESC LIMIT 1"#;
    let maybe_row = pg_client.query_opt(query, &[]).await.unwrap();
    maybe_row.map(|row| row.get(0))
}
184
184
+
185
185
+
#[tokio::main]
async fn main() -> anyhow::Result<()> {
    // NOTE(review): this file is src/bin/backfill.rs but initializes as
    // "main" — presumably a leftover from the move out of main.rs; confirm
    // the intended bin name for logging/metrics.
    bin_init("main");
    let args = Args::parse();
    let db = Db::new(&args.postgres);
    // Rendezvous channel (capacity 0): the export task blocks until the
    // writer is ready for each page.
    let (tx, rx) = flume::bounded(EXPORT_PAGE_QUEUE_SIZE);

    log::trace!("connecting postgres for export task...");
    let pg_client = db.connect().await?;
    let export_task = tokio::task::spawn(export_upstream(
        args.upstream,
        (args.upstream_bulk, args.bulk_epoch),
        tx,
        pg_client,
    ));

    log::trace!("connecting postgres for writer task...");
    // Second, separate connection so the two tasks don't share one client.
    let pg_client = db.connect().await?;
    let writer_task = tokio::task::spawn(write_pages(rx, pg_client));

    // Run until either task ends (normally a panic/error), then fall through.
    tokio::select! {
        z = export_task => log::warn!("export task ended: {z:?}"),
        z = writer_task => log::warn!("writer task ended: {z:?}"),
    };

    log::error!("todo: shutdown");

    Ok(())
}
+29
-197
src/bin/main.rs
···
1
1
-
use clap::Parser;
2
2
-
use std::time::Duration;
1
1
+
use clap::{Parser, Subcommand};
3
2
use url::Url;
3
3
+
use allegedly::{Dt, bin_init, poll_upstream};
4
4
5
5
-
use allegedly::{Db, Dt, ExportPage, Op, bin_init, poll_upstream, week_to_pages};
6
6
-
7
7
-
const EXPORT_PAGE_QUEUE_SIZE: usize = 0; // rendezvous for now
8
8
-
const WEEK_IN_SECONDS: u64 = 7 * 86400;
9
9
-
10
10
-
#[derive(Parser)]
11
11
-
struct Args {
12
12
-
/// Upstream PLC server to mirror
13
13
-
///
14
14
-
/// default: https://plc.directory
5
5
+
// Top-level CLI: global flags shared by all subcommands, plus the command.
#[derive(Debug, Parser)]
struct Cli {
    /// Upstream PLC server
    #[arg(long, env)]
    #[clap(default_value = "https://plc.directory")]
    upstream: Url,
    // Which subcommand to run; see `Commands` below.
    #[command(subcommand)]
    command: Commands,
}
42
14
43
43
-
async fn bulk_backfill((upstream, epoch): (Url, u64), tx: flume::Sender<ExportPage>) {
44
44
-
let immutable_cutoff = std::time::SystemTime::now() - Duration::from_secs((7 + 4) * 86400);
45
45
-
let immutable_ts = (immutable_cutoff.duration_since(std::time::SystemTime::UNIX_EPOCH))
46
46
-
.unwrap()
47
47
-
.as_secs();
48
48
-
let immutable_week = (immutable_ts / WEEK_IN_SECONDS) * WEEK_IN_SECONDS;
49
49
-
let mut week = epoch;
50
50
-
let mut week_n = 0;
51
51
-
while week < immutable_week {
52
52
-
log::info!("backfilling week {week_n} ({week})");
53
53
-
let url = upstream.join(&format!("{week}.jsonl.gz")).unwrap();
54
54
-
week_to_pages(url, tx.clone()).await.unwrap();
55
55
-
week_n += 1;
56
56
-
week += WEEK_IN_SECONDS;
15
15
+
// Subcommands for the one-bin CLI (only `tail` so far).
#[derive(Debug, Subcommand)]
enum Commands {
    /// Poll an upstream PLC server and log new ops to stdout
    Tail {
        /// Begin replay from a specific timestamp
        #[arg(long)]
        after: Option<Dt>,
    },
}
59
24
60
60
-
async fn export_upstream(
61
61
-
upstream: Url,
62
62
-
bulk: (Url, u64),
63
63
-
tx: flume::Sender<ExportPage>,
64
64
-
pg_client: tokio_postgres::Client,
65
65
-
) {
66
66
-
let latest = get_latest(&pg_client).await;
67
67
-
68
68
-
if latest.is_none() {
69
69
-
bulk_backfill(bulk, tx.clone()).await;
70
70
-
}
71
71
-
let mut upstream = upstream;
72
72
-
upstream.set_path("/export");
73
73
-
poll_upstream(latest, upstream, tx).await.unwrap();
74
74
-
}
75
75
-
76
76
-
async fn write_pages(
77
77
-
rx: flume::Receiver<ExportPage>,
78
78
-
mut pg_client: tokio_postgres::Client,
79
79
-
) -> Result<(), anyhow::Error> {
80
80
-
// TODO: one big upsert at the end from select distinct on the other table
81
81
-
82
82
-
// let upsert_did = &pg_client
83
83
-
// .prepare(
84
84
-
// r#"
85
85
-
// INSERT INTO dids (did) VALUES ($1)
86
86
-
// ON CONFLICT DO NOTHING"#,
87
87
-
// )
88
88
-
// .await
89
89
-
// .unwrap();
90
90
-
91
91
-
let insert_op = &pg_client
92
92
-
.prepare(
93
93
-
r#"
94
94
-
INSERT INTO operations (did, operation, cid, nullified, "createdAt")
95
95
-
VALUES ($1, $2, $3, $4, $5)
96
96
-
ON CONFLICT (did, cid) DO UPDATE
97
97
-
SET nullified = excluded.nullified,
98
98
-
"createdAt" = excluded."createdAt"
99
99
-
WHERE operations.nullified = excluded.nullified
100
100
-
OR operations."createdAt" = excluded."createdAt""#,
101
101
-
) // idea: op is provable via cid, so leave it out. after did/cid (pk) that leaves nullified and createdAt
102
102
-
// that we want to notice changing.
103
103
-
// normal insert: no conflict, rows changed = 1
104
104
-
// conflict (exact match): where clause passes, rows changed = 1
105
105
-
// conflict (mismatch): where clause fails, rows changed = 0 (detect this and warn!)
106
106
-
.await
107
107
-
.unwrap();
108
108
-
109
109
-
while let Ok(page) = rx.recv_async().await {
110
110
-
log::trace!("got a page...");
25
25
+
#[tokio::main]
26
26
+
async fn main() {
27
27
+
bin_init("main");
111
28
112
112
-
let tx = pg_client.transaction().await.unwrap();
29
29
+
let args = Cli::parse();
113
30
114
114
-
// TODO: probably figure out postgres COPY IN
115
115
-
// for now just write everything into a transaction
116
116
-
117
117
-
log::trace!("setting up inserts...");
118
118
-
for op_line in page
119
119
-
.ops
120
120
-
.into_iter()
121
121
-
.flat_map(|s| {
122
122
-
s.replace("}{", "}\n{")
123
123
-
.split('\n')
124
124
-
.map(|s| s.trim())
125
125
-
.map(Into::into)
126
126
-
.collect::<Vec<String>>()
127
127
-
})
128
128
-
.filter(|s| !s.is_empty())
129
129
-
{
130
130
-
let Ok(op) = serde_json::from_str::<Op>(&op_line)
131
131
-
.inspect_err(|e| log::error!("failing! at the {op_line}! {e}"))
132
132
-
else {
133
133
-
log::error!("ayeeeee just ignoring this error for now......");
134
134
-
continue;
135
135
-
};
136
136
-
// let client = &tx;
137
137
-
138
138
-
// client.execute(upsert_did, &[&op.did]).await.unwrap();
139
139
-
140
140
-
// let sp = tx.savepoint("op").await.unwrap();
141
141
-
let inserted = tx
142
142
-
.execute(
143
143
-
insert_op,
144
144
-
&[
145
145
-
&op.did,
146
146
-
&tokio_postgres::types::Json(op.operation),
147
147
-
&op.cid,
148
148
-
&op.nullified,
149
149
-
&op.created_at,
150
150
-
],
151
151
-
)
152
152
-
.await
153
153
-
.unwrap();
154
154
-
if inserted != 1 {
155
155
-
log::warn!(
156
156
-
"possible log modification: {inserted} rows changed after upserting {op:?}"
157
157
-
);
31
31
+
match args.command {
32
32
+
Commands::Tail { after } => {
33
33
+
let mut url = args.upstream;
34
34
+
url.set_path("/export");
35
35
+
let start_at = after.or_else(|| Some(chrono::Utc::now()));
36
36
+
let (tx, rx) = flume::bounded(0); // rendezvous
37
37
+
tokio::task::spawn(async move { poll_upstream(start_at, url, tx).await.unwrap() });
38
38
+
loop {
39
39
+
for op in rx.recv_async().await.unwrap().ops {
40
40
+
println!("{op}")
41
41
+
}
158
42
}
159
159
-
// {
160
160
-
// if e.code() != Some(&tokio_postgres::error::SqlState::UNIQUE_VIOLATION) {
161
161
-
// anyhow::bail!(e);
162
162
-
// }
163
163
-
// // TODO: assert that the row has not changed
164
164
-
// log::warn!("ignoring dup");
165
165
-
// }
166
43
}
167
167
-
168
168
-
tx.commit().await.unwrap();
169
44
}
170
170
-
Ok(())
171
171
-
}
172
172
-
173
173
-
async fn get_latest(pg_client: &tokio_postgres::Client) -> Option<Dt> {
174
174
-
pg_client
175
175
-
.query_opt(
176
176
-
r#"SELECT "createdAt" FROM operations
177
177
-
ORDER BY "createdAt" DESC LIMIT 1"#,
178
178
-
&[],
179
179
-
)
180
180
-
.await
181
181
-
.unwrap()
182
182
-
.map(|r| r.get(0))
183
183
-
}
184
184
-
185
185
-
#[tokio::main]
186
186
-
async fn main() -> anyhow::Result<()> {
187
187
-
bin_init("main");
188
188
-
let args = Args::parse();
189
189
-
let db = Db::new(&args.postgres);
190
190
-
let (tx, rx) = flume::bounded(EXPORT_PAGE_QUEUE_SIZE);
191
191
-
192
192
-
log::trace!("connecting postgres for export task...");
193
193
-
let pg_client = db.connect().await?;
194
194
-
let export_task = tokio::task::spawn(export_upstream(
195
195
-
args.upstream,
196
196
-
(args.upstream_bulk, args.bulk_epoch),
197
197
-
tx,
198
198
-
pg_client,
199
199
-
));
200
200
-
201
201
-
log::trace!("connecting postgres for writer task...");
202
202
-
let pg_client = db.connect().await?;
203
203
-
let writer_task = tokio::task::spawn(write_pages(rx, pg_client));
204
204
-
205
205
-
tokio::select! {
206
206
-
z = export_task => log::warn!("export task ended: {z:?}"),
207
207
-
z = writer_task => log::warn!("writer task ended: {z:?}"),
208
208
-
};
209
209
-
210
210
-
log::error!("todo: shutdown");
211
211
-
212
212
-
Ok(())
213
45
}
-38
src/bin/tail.rs
···
1
1
-
use allegedly::{bin_init, poll_upstream};
2
2
-
use clap::Parser;
3
3
-
use url::Url;
4
4
-
5
5
-
#[derive(Parser)]
6
6
-
struct Args {
7
7
-
/// Upstream PLC server to poll
8
8
-
///
9
9
-
/// default: https://plc.directory
10
10
-
#[arg(long, env)]
11
11
-
#[clap(default_value = "https://plc.directory")]
12
12
-
upstream: Url,
13
13
-
}
14
14
-
15
15
-
#[tokio::main]
16
16
-
async fn main() {
17
17
-
bin_init("tail");
18
18
-
19
19
-
let mut url = Args::parse().upstream;
20
20
-
url.set_path("/export");
21
21
-
let now = chrono::Utc::now();
22
22
-
23
23
-
let (tx, rx) = flume::bounded(0); // rendezvous
24
24
-
tokio::task::spawn(async move {
25
25
-
if let Err(e) = poll_upstream(Some(now), url, tx).await {
26
26
-
log::error!("polling failed: {e}");
27
27
-
} else {
28
28
-
log::warn!("poller finished ok (weird?)");
29
29
-
}
30
30
-
});
31
31
-
32
32
-
while let Ok(page) = rx.recv_async().await {
33
33
-
for op in page.ops {
34
34
-
println!("{op}");
35
35
-
}
36
36
-
}
37
37
-
log::warn!("recv failed, bye");
38
38
-
}