tangled
alpha
login
or
join now
ptr.pet
/
nsid-tracker
3
fork
atom
tracks lexicons and how many times they appeared on the jetstream
3
fork
atom
overview
issues
pulls
pipelines
feat(server): zstd compression
ptr.pet
7 months ago
4174678b
6001b7f8
verified
This commit was signed with the committer's
known signature
.
ptr.pet
SSH Key Fingerprint:
SHA256:Abmvag+juovVufZTxyWY8KcVgrznxvBjQpJesv071Aw=
+178
-51
5 changed files
expand all
collapse all
unified
split
server
.gitignore
Cargo.toml
src
db
handle.rs
mod.rs
main.rs
+1
server/.gitignore
···
1
1
target
2
2
.fjall_data*
3
3
+
zstd_dict
+5
-1
server/Cargo.toml
···
3
3
version = "0.1.0"
4
4
edition = "2024"
5
5
6
6
+
[features]
7
7
+
default = ["compress"]
8
8
+
compress = ["dep:zstd"]
9
9
+
6
10
[dependencies]
7
11
anyhow = "1.0"
8
12
async-trait = "0.1"
···
30
34
rayon = "1.10.0"
31
35
parking_lot = { version = "0.12", features = ["send_guard", "hardware-lock-elision"] }
32
36
rclite = "0.2.7"
33
33
-
zstd = "0.13.3"
37
37
+
zstd = { version = "0.13.3", optional = true, features = ["experimental"] }
34
38
35
39
[target.'cfg(target_env = "msvc")'.dependencies]
36
40
snmalloc-rs = "0.3.8"
+113
-16
server/src/db/handle.rs
···
14
14
use rclite::Arc;
15
15
use smol_str::SmolStr;
16
16
17
17
+
#[cfg(feature = "compress")]
18
18
+
use zstd::bulk::{Compressor as ZstdCompressor, Decompressor as ZstdDecompressor};
19
19
+
17
20
use crate::{
18
21
db::{EventRecord, NsidHit, block},
19
22
error::AppResult,
20
23
utils::{CLOCK, DefaultRateTracker, RateTracker, ReadVariableExt, varints_unsigned_encoded},
21
24
};
22
25
23
23
-
pub type ItemDecoder = block::ItemDecoder<Cursor<Slice>, NsidHit>;
24
24
-
pub type ItemEncoder = block::ItemEncoder<Vec<u8>, NsidHit>;
26
26
+
// Per-thread zstd state, compiled only when the `compress` feature is enabled.
// Both slots start out as `None`; `with_compressor` / `with_decompressor`
// populate them the first time they run on a given thread.
#[cfg(feature = "compress")]
27
27
+
thread_local! {
28
28
+
// Cached bulk compressor for the current thread (`None` until first use).
static COMPRESSOR: std::cell::RefCell<Option<ZstdCompressor<'static>>> = std::cell::RefCell::new(None);
29
29
+
// Cached bulk decompressor for the current thread (`None` until first use).
static DECOMPRESSOR: std::cell::RefCell<Option<ZstdDecompressor<'static>>> = std::cell::RefCell::new(None);
30
30
+
}
31
31
+
32
32
+
type ItemDecoder = block::ItemDecoder<Cursor<Vec<u8>>, NsidHit>;
33
33
+
type ItemEncoder = block::ItemEncoder<Vec<u8>, NsidHit>;
25
34
pub type Item = block::Item<NsidHit>;
26
35
36
36
+
/// How block payloads are transformed before being stored and after being read.
///
/// `LexiconHandle::put_raw_block` and `LexiconHandle::get_raw_block` branch on
/// this value to decide whether a block goes through zstd.
#[derive(Clone)]
37
37
+
pub enum Compression {
38
38
+
/// Blocks are stored and returned as-is.
None,
39
39
+
/// Blocks go through zstd; the payload holds the dictionary bytes that are
/// handed to the compressor and decompressor via `set_dictionary`.
/// Only exists when the `compress` feature is enabled.
#[cfg(feature = "compress")]
40
40
+
Zstd(ByteView),
41
41
+
}
42
42
+
43
43
+
impl Compression {
44
44
+
#[cfg(feature = "compress")]
45
45
+
fn get_dict(&self) -> Option<&ByteView> {
46
46
+
match self {
47
47
+
Compression::None => None,
48
48
+
Compression::Zstd(dict) => Some(dict),
49
49
+
}
50
50
+
}
51
51
+
}
52
52
+
27
53
pub struct Block {
28
54
pub written: usize,
29
55
pub key: ByteView,
···
36
62
buf: Arc<Mutex<Vec<EventRecord>>>,
37
63
last_insert: AtomicU64, // relaxed
38
64
eps: DefaultRateTracker,
65
65
+
compress: Compression,
39
66
}
40
67
41
68
impl Debug for LexiconHandle {
···
55
82
}
56
83
57
84
impl LexiconHandle {
58
58
-
pub fn new(keyspace: &Keyspace, nsid: &str) -> Self {
85
85
+
pub fn new(keyspace: &Keyspace, nsid: &str, compress: Compression) -> Self {
59
86
let opts = PartitionCreateOptions::default()
60
87
.block_size(1024 * 128)
61
61
-
.compression(fjall::CompressionType::Miniz(9));
88
88
+
.compression(fjall::CompressionType::Lz4);
62
89
Self {
63
90
tree: keyspace.open_partition(nsid, opts).unwrap(),
64
91
nsid: nsid.into(),
65
92
buf: Default::default(),
66
93
last_insert: AtomicU64::new(0),
67
94
eps: RateTracker::new(Duration::from_secs(10)),
95
95
+
compress,
68
96
}
69
97
}
70
98
99
99
+
/// Runs `f` with this thread's cached zstd compressor, building it on first use.
///
/// The compressor lives in the `COMPRESSOR` thread-local. When the slot is
/// empty it is created at level 9 with checksums disabled and, if
/// `self.compress` carries a dictionary, with that dictionary loaded.
///
/// NOTE(review): the cache is keyed by thread, not by handle, so the
/// dictionary of whichever handle first initializes the slot on a thread is
/// the one used by every later call on that thread. This presumably relies on
/// all handles sharing the same `Compression` value — confirm.
///
/// # Panics
///
/// Panics if the compressor cannot be constructed or configured, or if the
/// dictionary cannot be loaded.
#[cfg(feature = "compress")]
fn with_compressor<T>(&self, f: impl FnOnce(&mut ZstdCompressor<'static>) -> T) -> T {
    // Single source of truth for the level used by both `new` and `set_dictionary`.
    const LEVEL: i32 = 9;

    COMPRESSOR.with_borrow_mut(|slot| {
        // `get_or_insert_with` performs the lazy initialization and hands back
        // the `&mut` in one step, so no `unsafe` unwrap is needed.
        let compressor = slot.get_or_insert_with(|| {
            let mut c = ZstdCompressor::new(LEVEL).expect("cant construct zstd compressor");
            c.include_checksum(false).unwrap();
            if let Some(dict) = self.compress.get_dict() {
                c.set_dictionary(LEVEL, dict).expect("cant set dict");
            }
            c
        });
        f(compressor)
    })
}
116
116
+
117
117
+
/// Compresses `data` with the current thread's zstd compressor.
///
/// Returns the compressed bytes, or the I/O error reported by zstd.
#[cfg(feature = "compress")]
pub fn compress(&self, data: impl AsRef<[u8]>) -> std::io::Result<Vec<u8>> {
    let raw = data.as_ref();
    self.with_compressor(|zstd| zstd.compress(raw))
}
121
121
+
122
122
+
/// Runs `f` with this thread's cached zstd decompressor, building it on first use.
///
/// The decompressor lives in the `DECOMPRESSOR` thread-local. When the slot is
/// empty it is created and, if `self.compress` carries a dictionary, that
/// dictionary is loaded into it.
///
/// NOTE(review): the cache is keyed by thread, not by handle, so the
/// dictionary of whichever handle first initializes the slot on a thread is
/// the one used by every later call on that thread. This presumably relies on
/// all handles sharing the same `Compression` value — confirm.
///
/// # Panics
///
/// Panics if the decompressor cannot be constructed or the dictionary cannot
/// be loaded.
#[cfg(feature = "compress")]
fn with_decompressor<T>(&self, f: impl FnOnce(&mut ZstdDecompressor<'static>) -> T) -> T {
    DECOMPRESSOR.with_borrow_mut(|slot| {
        // `get_or_insert_with` performs the lazy initialization and hands back
        // the `&mut` in one step, so no `unsafe` unwrap is needed.
        let decompressor = slot.get_or_insert_with(|| {
            let mut d = ZstdDecompressor::new().expect("cant construct zstd decompressor");
            if let Some(dict) = self.compress.get_dict() {
                d.set_dictionary(dict).expect("cant set dict");
            }
            d
        });
        f(decompressor)
    })
}
138
138
+
139
139
+
/// Decompresses `data` with the current thread's zstd decompressor.
///
/// Returns the decompressed bytes, or the I/O error reported by zstd.
#[cfg(feature = "compress")]
pub fn decompress(&self, data: impl AsRef<[u8]>) -> std::io::Result<Vec<u8>> {
    // 20 MiB, passed to zstd as the output capacity for a single block.
    const MAX_DECOMPRESSED_LEN: usize = 1024 * 1024 * 20;

    let compressed = data.as_ref();
    self.with_decompressor(|zstd| zstd.decompress(compressed, MAX_DECOMPRESSED_LEN))
}
145
145
+
71
146
pub fn nsid(&self) -> &SmolStr {
72
147
&self.nsid
73
148
}
···
123
198
}
124
199
125
200
let start_blocks_size = blocks_to_compact.len();
126
126
-
let keys_to_delete = blocks_to_compact.iter().map(|(key, _)| key);
201
201
+
let keys_to_delete = blocks_to_compact
202
202
+
.iter()
203
203
+
.map(|(key, _)| key)
204
204
+
.cloned()
205
205
+
.collect_vec();
127
206
let mut all_items =
128
207
blocks_to_compact
129
129
-
.iter()
208
208
+
.into_iter()
130
209
.try_fold(Vec::new(), |mut acc, (key, value)| {
131
131
-
let mut timestamps = Cursor::new(key);
132
132
-
let start_timestamp = timestamps.read_varint()?;
133
133
-
let decoder = block::ItemDecoder::new(Cursor::new(value), start_timestamp)?;
210
210
+
let decoder = self.get_decoder_for(key, value)?;
134
211
let mut items = decoder.collect::<Result<Vec<_>, _>>()?;
135
212
acc.append(&mut items);
136
213
AppResult::Ok(acc)
···
149
226
.into_par_iter()
150
227
.map(|chunk| {
151
228
let count = chunk.len();
152
152
-
Self::encode_block_from_items(chunk, count)
229
229
+
self.encode_block_from_items(chunk, count)
153
230
})
154
231
.collect::<Result<Vec<_>, _>>()?;
155
232
let end_blocks_size = new_blocks.len();
···
173
250
}
174
251
175
252
pub fn encode_block_from_items(
253
253
+
&self,
176
254
items: impl IntoIterator<Item = Item>,
177
255
count: usize,
178
256
) -> AppResult<Block> {
···
204
282
.into());
205
283
}
206
284
if let (Some(start_timestamp), Some(end_timestamp)) = (start_timestamp, end_timestamp) {
207
207
-
let value = writer.finish()?;
285
285
+
let data = self.put_raw_block(writer.finish()?)?;
208
286
let key = varints_unsigned_encoded([start_timestamp, end_timestamp]);
209
209
-
return Ok(Block {
210
210
-
written,
211
211
-
key,
212
212
-
data: value,
213
213
-
});
287
287
+
return Ok(Block { written, key, data });
214
288
}
215
289
Err(std::io::Error::new(std::io::ErrorKind::WriteZero, "no items are in queue").into())
216
290
}
···
228
302
)
229
303
})
230
304
.collect()
305
305
+
}
306
306
+
307
307
+
pub fn get_raw_block(&self, value: Slice) -> std::io::Result<Vec<u8>> {
308
308
+
match &self.compress {
309
309
+
Compression::None => Ok(value.as_ref().into()),
310
310
+
#[cfg(feature = "compress")]
311
311
+
Compression::Zstd(_) => self.decompress(value),
312
312
+
}
313
313
+
}
314
314
+
315
315
+
pub fn put_raw_block(&self, value: Vec<u8>) -> std::io::Result<Vec<u8>> {
316
316
+
match &self.compress {
317
317
+
Compression::None => Ok(value),
318
318
+
#[cfg(feature = "compress")]
319
319
+
Compression::Zstd(_) => self.compress(value),
320
320
+
}
321
321
+
}
322
322
+
323
323
+
/// Builds an item decoder for one stored block.
///
/// The start timestamp is read as the first varint of `key`; `value` is
/// passed through [`Self::get_raw_block`] before being handed to the decoder.
pub fn get_decoder_for(&self, key: Slice, value: Slice) -> AppResult<ItemDecoder> {
    // Read the key first so a malformed key fails before any block work is done.
    let start_timestamp = Cursor::new(key).read_varint()?;
    let raw_block = self.get_raw_block(value)?;
    Ok(ItemDecoder::new(Cursor::new(raw_block), start_timestamp)?)
}
232
329
}
+48
-23
server/src/db/mod.rs
···
3
3
fmt::Debug,
4
4
io::Cursor,
5
5
ops::{Bound, Deref, RangeBounds},
6
6
-
path::{Path, PathBuf},
6
6
+
path::Path,
7
7
time::Duration,
8
8
};
9
9
10
10
use byteview::StrView;
11
11
-
use fjall::{Config, Keyspace, Partition, PartitionCreateOptions};
11
11
+
use fjall::{Keyspace, Partition, PartitionCreateOptions};
12
12
use itertools::{Either, Itertools};
13
13
-
use rayon::iter::{IndexedParallelIterator, IntoParallelIterator, ParallelIterator};
13
13
+
use rayon::iter::{IntoParallelIterator, ParallelIterator};
14
14
use rclite::Arc;
15
15
use rkyv::{Archive, Deserialize, Serialize, rancor::Error};
16
16
use smol_str::{SmolStr, ToSmolStr};
···
18
18
use tokio_util::sync::CancellationToken;
19
19
20
20
use crate::{
21
21
-
db::handle::{ItemDecoder, LexiconHandle},
21
21
+
db::handle::{Compression, LexiconHandle},
22
22
error::{AppError, AppResult},
23
23
jetstream::JetstreamEvent,
24
24
utils::{RateTracker, ReadVariableExt, varints_unsigned_encoded},
···
77
77
78
78
pub struct DbConfig {
79
79
pub ks_config: fjall::Config,
80
80
+
#[cfg(feature = "compress")]
81
81
+
pub dict_path: std::path::PathBuf,
80
82
pub min_block_size: usize,
81
83
pub max_block_size: usize,
82
84
pub max_last_activity: u64,
···
98
100
fn default() -> Self {
99
101
Self {
100
102
ks_config: fjall::Config::default(),
103
103
+
#[cfg(feature = "compress")]
104
104
+
dict_path: "zstd_dict".parse().unwrap(),
101
105
min_block_size: 512,
102
106
max_block_size: 500_000,
103
107
max_last_activity: Duration::from_secs(10).as_nanos() as u64,
···
116
120
event_broadcaster: broadcast::Sender<(SmolStr, NsidCounts)>,
117
121
eps: RateTracker<100>,
118
122
cancel_token: CancellationToken,
123
123
+
compression: Compression,
119
124
}
120
125
121
126
impl Db {
122
127
pub fn new(cfg: DbConfig, cancel_token: CancellationToken) -> AppResult<Self> {
123
128
tracing::info!("opening db...");
124
129
let ks = cfg.ks_config.clone().open()?;
130
130
+
let _compression = Compression::None;
131
131
+
#[cfg(feature = "compress")]
132
132
+
let dict = std::fs::File::open(&cfg.dict_path).ok().and_then(|mut f| {
133
133
+
let meta = f.metadata().ok()?;
134
134
+
byteview::ByteView::from_reader(&mut f, meta.len() as usize).ok()
135
135
+
});
136
136
+
#[cfg(feature = "compress")]
137
137
+
let _compression = match dict {
138
138
+
Some(dict) => {
139
139
+
tracing::info!(
140
140
+
"using zstd compression with dict from {}",
141
141
+
cfg.dict_path.to_string_lossy()
142
142
+
);
143
143
+
Compression::Zstd(dict)
144
144
+
}
145
145
+
None => Compression::None,
146
146
+
};
125
147
Ok(Self {
126
148
cfg,
127
149
hits: Default::default(),
···
136
158
event_broadcaster: broadcast::channel(1000).0,
137
159
eps: RateTracker::new(Duration::from_secs(1)),
138
160
cancel_token,
161
161
+
compression: _compression,
139
162
})
140
163
}
141
164
···
213
236
.into_par_iter()
214
237
.map(|(i, items, handle)| {
215
238
let count = items.len();
216
216
-
let block = LexiconHandle::encode_block_from_items(items, count)?;
239
239
+
let block = handle.encode_block_from_items(items, count)?;
217
240
tracing::info!(
218
241
"{}: encoded block with {} items",
219
242
handle.nsid(),
···
282
305
Some(handle) => handle.clone(),
283
306
None => {
284
307
if self.ks.partition_exists(nsid.as_ref()) {
285
285
-
let handle = Arc::new(LexiconHandle::new(&self.ks, nsid.as_ref()));
308
308
+
let handle = Arc::new(LexiconHandle::new(
309
309
+
&self.ks,
310
310
+
nsid.as_ref(),
311
311
+
self.compression.clone(),
312
312
+
));
286
313
let _ = self.hits.insert(SmolStr::new(nsid), handle.clone());
287
314
handle
288
315
} else {
···
295
322
296
323
#[inline(always)]
297
324
fn ensure_handle(&self, nsid: &SmolStr) -> impl Deref<Target = Arc<LexiconHandle>> + use<'_> {
298
298
-
self.hits
299
299
-
.entry(nsid.clone())
300
300
-
.or_insert_with(|| Arc::new(LexiconHandle::new(&self.ks, &nsid)))
325
325
+
self.hits.entry(nsid.clone()).or_insert_with(|| {
326
326
+
Arc::new(LexiconHandle::new(
327
327
+
&self.ks,
328
328
+
&nsid,
329
329
+
self.compression.clone(),
330
330
+
))
331
331
+
})
301
332
}
302
333
303
334
pub fn ingest_events(&self, events: impl Iterator<Item = EventRecord>) -> AppResult<()> {
···
366
397
};
367
398
let block_lens = handle.iter().rev().try_fold(Vec::new(), |mut acc, item| {
368
399
let (key, value) = item?;
369
369
-
let mut timestamps = Cursor::new(key);
370
370
-
let start_timestamp = timestamps.read_varint()?;
371
371
-
let decoder = ItemDecoder::new(Cursor::new(value), start_timestamp)?;
400
400
+
let decoder = handle.get_decoder_for(key, value)?;
372
401
acc.push(decoder.item_count());
373
402
AppResult::Ok(acc)
374
403
})?;
···
380
409
})
381
410
}
382
411
383
383
-
// train zstd dict with 100 blocks from every lexicon
412
412
+
// train zstd dict with 1000 blocks from every lexicon
413
413
+
#[cfg(feature = "compress")]
384
414
pub fn train_zstd_dict(&self) -> AppResult<Vec<u8>> {
385
415
let samples = self
386
416
.get_nsids()
···
388
418
.map(|handle| {
389
419
handle
390
420
.iter()
391
391
-
.rev()
392
392
-
.map(|res| {
421
421
+
.map(move |res| {
393
422
res.map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, e))
394
394
-
.map(|(_, value)| Cursor::new(value))
423
423
+
.and_then(|(_, value)| Ok(Cursor::new(handle.get_raw_block(value)?)))
395
424
})
396
425
.take(1000)
397
426
})
···
420
449
return Either::Right(std::iter::empty());
421
450
};
422
451
423
423
-
let map_block = move |(key, val)| {
424
424
-
let mut key_reader = Cursor::new(key);
425
425
-
let start_timestamp = key_reader.read_varint::<u64>()?;
426
426
-
if start_timestamp < start_limit {
427
427
-
return Ok(None);
428
428
-
}
429
429
-
let items = handle::ItemDecoder::new(Cursor::new(val), start_timestamp)?
452
452
+
let map_block = |(key, val)| {
453
453
+
let decoder = handle.get_decoder_for(key, val)?;
454
454
+
let items = decoder
430
455
.take_while(move |item| {
431
456
item.as_ref().map_or(true, |item| {
432
457
item.timestamp <= end_limit && item.timestamp >= start_limit
+11
-11
server/src/main.rs
···
53
53
debug();
54
54
return;
55
55
}
56
56
-
Some("traindict") => {
57
57
-
train_zstd_dict();
58
58
-
return;
59
59
-
}
60
56
Some(x) => {
61
57
tracing::error!("unknown command: {}", x);
62
58
return;
···
211
207
db.sync(true).expect("cant sync db");
212
208
}
213
209
214
214
-
fn train_zstd_dict() {
215
215
-
let db = Db::new(DbConfig::default(), CancellationToken::new()).expect("couldnt create db");
216
216
-
let dict_data = db.train_zstd_dict().expect("cant train zstd dict");
217
217
-
std::fs::write("zstd_dict", dict_data).expect("cant save zstd dict")
218
218
-
}
219
219
-
220
210
fn debug() {
221
211
let db = Db::new(DbConfig::default(), CancellationToken::new()).expect("couldnt create db");
222
212
let info = db.info().expect("cant get db info");
···
246
236
DbConfig::default().ks(|c| {
247
237
c.max_journaling_size(u64::MAX)
248
238
.max_write_buffer_size(u64::MAX)
239
239
+
.compaction_workers(rayon::current_num_threads() * 4)
240
240
+
.flush_workers(rayon::current_num_threads() * 4)
249
241
}),
250
242
CancellationToken::new(),
251
243
)
···
269
261
270
262
fn migrate() {
271
263
let cancel_token = CancellationToken::new();
264
264
+
272
265
let from = Arc::new(
273
266
Db::new(
274
267
DbConfig::default().path(".fjall_data_from"),
···
276
269
)
277
270
.expect("couldnt create db"),
278
271
);
272
272
+
#[cfg(feature = "compress")]
273
273
+
std::fs::write(
274
274
+
"zstd_dict",
275
275
+
from.train_zstd_dict().expect("cant get zstd dict"),
276
276
+
)
277
277
+
.expect("cant write zstd dict");
278
278
+
279
279
let to = Arc::new(
280
280
Db::new(
281
281
DbConfig::default().path(".fjall_data_to").ks(|c| {
···
290
290
);
291
291
292
292
let nsids = from.get_nsids().collect::<Vec<_>>();
293
293
-
let eps_thread = std::thread::spawn({
293
293
+
let _eps_thread = std::thread::spawn({
294
294
let to = to.clone();
295
295
move || {
296
296
loop {