Tracks lexicons and how many times each one has appeared on the Jetstream.

refactor(server): delete old db impl, rename migrate to compact

ptr.pet 84d25308 48892f4c

verified
+10 -218
+1 -210
server/src/db/mod.rs
··· 69 69 } 70 70 } 71 71 72 - // counts is nsid -> NsidCounts 73 - // hits is tree per nsid: timestamp -> NsidHit 74 - pub struct DbOld { 75 - inner: Keyspace, 76 - hits: scc::HashIndex<SmolStr, Partition>, 77 - counts: Partition, 78 - event_broadcaster: broadcast::Sender<(SmolStr, NsidCounts)>, 79 - eps: Rate, 80 - } 81 - 82 - impl DbOld { 83 - pub fn new(path: impl AsRef<Path>) -> AppResult<Self> { 84 - tracing::info!("opening db..."); 85 - let ks = Config::new(path) 86 - .cache_size(8 * 1024 * 1024) // from talna 87 - .open()?; 88 - Ok(Self { 89 - hits: Default::default(), 90 - counts: ks.open_partition( 91 - "_counts", 92 - PartitionCreateOptions::default().compression(fjall::CompressionType::None), 93 - )?, 94 - inner: ks, 95 - event_broadcaster: broadcast::channel(1000).0, 96 - eps: Rate::new(Duration::from_secs(1)), 97 - }) 98 - } 99 - 100 - pub fn eps(&self) -> usize { 101 - self.eps.rate(&()) as usize 102 - } 103 - 104 - pub fn new_listener(&self) -> broadcast::Receiver<(SmolStr, NsidCounts)> { 105 - self.event_broadcaster.subscribe() 106 - } 107 - 108 - #[inline(always)] 109 - fn get_part_opts() -> PartitionCreateOptions { 110 - PartitionCreateOptions::default() 111 - .compression(fjall::CompressionType::Miniz(9)) 112 - .compaction_strategy(fjall::compaction::Strategy::Fifo(fjall::compaction::Fifo { 113 - limit: 5 * 1024 * 1024 * 1024, // 5 gb 114 - ttl_seconds: Some(60 * 60 * 24 * 30), // 30 days 115 - })) 116 - } 117 - 118 - #[inline(always)] 119 - fn maybe_run_in_nsid_tree<T>(&self, nsid: &str, f: impl FnOnce(&Partition) -> T) -> Option<T> { 120 - let _guard = scc::ebr::Guard::new(); 121 - let handle = match self.hits.peek(nsid, &_guard) { 122 - Some(handle) => handle.clone(), 123 - None => { 124 - if self.inner.partition_exists(nsid) { 125 - let handle = self 126 - .inner 127 - .open_partition(nsid, Self::get_part_opts()) 128 - .expect("cant open partition"); 129 - let _ = self.hits.insert(SmolStr::new(nsid), handle.clone()); 130 - handle 131 - } 
else { 132 - return None; 133 - } 134 - } 135 - }; 136 - Some(f(&handle)) 137 - } 138 - 139 - #[inline(always)] 140 - fn run_in_nsid_tree<T>( 141 - &self, 142 - nsid: &str, 143 - f: impl FnOnce(&Partition) -> AppResult<T>, 144 - ) -> AppResult<T> { 145 - f(self 146 - .hits 147 - .entry(SmolStr::new(nsid)) 148 - .or_insert_with(|| { 149 - let opts = Self::get_part_opts(); 150 - self.inner.open_partition(nsid, opts).unwrap() 151 - }) 152 - .get()) 153 - } 154 - 155 - pub fn record_event(&self, e: EventRecord) -> AppResult<()> { 156 - let EventRecord { 157 - nsid, 158 - timestamp, 159 - deleted, 160 - } = e; 161 - 162 - self.insert_event(&nsid, timestamp, deleted)?; 163 - // increment count 164 - let mut counts = self.get_count(&nsid)?; 165 - counts.last_seen = timestamp; 166 - if deleted { 167 - counts.deleted_count += 1; 168 - } else { 169 - counts.count += 1; 170 - } 171 - self.insert_count(&nsid, counts.clone())?; 172 - if self.event_broadcaster.receiver_count() > 0 { 173 - let _ = self.event_broadcaster.send((SmolStr::new(&nsid), counts)); 174 - } 175 - self.eps.observe(&(), 1); 176 - Ok(()) 177 - } 178 - 179 - #[inline(always)] 180 - fn insert_event(&self, nsid: &str, timestamp: u64, deleted: bool) -> AppResult<()> { 181 - self.run_in_nsid_tree(nsid, |tree| { 182 - tree.insert( 183 - timestamp.to_be_bytes(), 184 - unsafe { rkyv::to_bytes::<Error>(&NsidHit { deleted }).unwrap_unchecked() } 185 - .as_slice(), 186 - ) 187 - .map_err(AppError::from) 188 - }) 189 - } 190 - 191 - #[inline(always)] 192 - fn insert_count(&self, nsid: &str, counts: NsidCounts) -> AppResult<()> { 193 - self.counts 194 - .insert( 195 - nsid, 196 - unsafe { rkyv::to_bytes::<Error>(&counts).unwrap_unchecked() }.as_slice(), 197 - ) 198 - .map_err(AppError::from) 199 - } 200 - 201 - pub fn get_count(&self, nsid: &str) -> AppResult<NsidCounts> { 202 - let Some(raw) = self.counts.get(nsid)? 
else { 203 - return Ok(NsidCounts::default()); 204 - }; 205 - Ok(unsafe { rkyv::from_bytes_unchecked::<_, Error>(&raw).unwrap_unchecked() }) 206 - } 207 - 208 - pub fn get_counts(&self) -> impl Iterator<Item = AppResult<(SmolStr, NsidCounts)>> { 209 - self.counts.iter().map(|res| { 210 - res.map_err(AppError::from).map(|(key, val)| { 211 - ( 212 - SmolStr::new(unsafe { str::from_utf8_unchecked(&key) }), 213 - unsafe { rkyv::from_bytes_unchecked::<_, Error>(&val).unwrap_unchecked() }, 214 - ) 215 - }) 216 - }) 217 - } 218 - 219 - pub fn get_nsids(&self) -> impl Iterator<Item = impl Deref<Target = str> + 'static> { 220 - self.inner 221 - .list_partitions() 222 - .into_iter() 223 - .filter(|k| k.deref() != "_counts") 224 - } 225 - 226 - pub fn get_hits( 227 - &self, 228 - nsid: &str, 229 - range: impl RangeBounds<u64>, 230 - ) -> BoxedIter<AppResult<(u64, NsidHit)>> { 231 - let start = range.start_bound().cloned().map(u64::to_be_bytes); 232 - let end = range.end_bound().cloned().map(u64::to_be_bytes); 233 - 234 - self.maybe_run_in_nsid_tree(nsid, |tree| -> BoxedIter<AppResult<(u64, NsidHit)>> { 235 - Box::new(tree.range(TimestampRangeOld { start, end }).map(|res| { 236 - res.map_err(AppError::from).map(|(key, val)| { 237 - ( 238 - u64::from_be_bytes(key.as_ref().try_into().unwrap()), 239 - unsafe { rkyv::from_bytes_unchecked::<_, Error>(&val).unwrap_unchecked() }, 240 - ) 241 - }) 242 - })) 243 - }) 244 - .unwrap_or_else(|| Box::new(std::iter::empty())) 245 - } 246 - 247 - pub fn tracking_since(&self) -> AppResult<u64> { 248 - // HACK: we should actually store when we started tracking but im lazy 249 - // should be accurate enough 250 - self.maybe_run_in_nsid_tree("app.bsky.feed.like", |tree| { 251 - let Some((timestamp_raw, _)) = tree.first_key_value()? 
else { 252 - return Ok(0); 253 - }; 254 - Ok(u64::from_be_bytes( 255 - timestamp_raw.as_ref().try_into().unwrap(), 256 - )) 257 - }) 258 - .unwrap_or(Ok(0)) 259 - } 260 - } 261 - 262 72 type ItemDecoder = block::ItemDecoder<Cursor<Slice>, NsidHit>; 263 73 type ItemEncoder = block::ItemEncoder<Vec<u8>, NsidHit>; 264 74 type Item = block::Item<NsidHit>; ··· 376 186 event_broadcaster: broadcast::channel(1000).0, 377 187 eps: Rate::new(Duration::from_secs(1)), 378 188 min_block_size: 512, 379 - max_block_size: 100_000, 189 + max_block_size: 500_000, 380 190 max_last_activity: Duration::from_secs(10), 381 191 }) 382 192 } ··· 594 404 self.end.as_ref() 595 405 } 596 406 } 597 - 598 - type TimestampReprOld = [u8; 8]; 599 - 600 - struct TimestampRangeOld { 601 - start: Bound<TimestampReprOld>, 602 - end: Bound<TimestampReprOld>, 603 - } 604 - 605 - impl RangeBounds<TimestampReprOld> for TimestampRangeOld { 606 - #[inline(always)] 607 - fn start_bound(&self) -> Bound<&TimestampReprOld> { 608 - self.start.as_ref() 609 - } 610 - 611 - #[inline(always)] 612 - fn end_bound(&self) -> Bound<&TimestampReprOld> { 613 - self.end.as_ref() 614 - } 615 - }
+9 -8
server/src/main.rs
··· 9 9 10 10 use crate::{ 11 11 api::serve, 12 - db::{Db, DbOld, EventRecord}, 12 + db::{Db, EventRecord}, 13 13 error::AppError, 14 14 jetstream::JetstreamClient, 15 15 }; ··· 36 36 .init(); 37 37 38 38 match std::env::args().nth(1).as_deref() { 39 - Some("migrate") => { 40 - migrate(); 39 + Some("compact") => { 40 + compact(); 41 41 return; 42 42 } 43 43 Some("debug") => { ··· 143 143 } 144 144 } 145 145 146 - fn migrate() { 147 - let from = Arc::new(DbOld::new(".fjall_data").expect("couldnt create db")); 148 - let to = Arc::new(Db::new(".fjall_data_migrated").expect("couldnt create db")); 146 + fn compact() { 147 + let from = Arc::new(Db::new(".fjall_data_from").expect("couldnt create db")); 148 + let to = Arc::new(Db::new(".fjall_data_to").expect("couldnt create db")); 149 149 150 150 let mut threads = Vec::new(); 151 151 for nsid in from.get_nsids() { ··· 155 155 tracing::info!("migrating {} ...", nsid.deref()); 156 156 let mut count = 0_u64; 157 157 for hit in from.get_hits(&nsid, ..) { 158 - let (timestamp, data) = hit.expect("cant read event"); 158 + let hit = hit.expect("cant read event"); 159 + let data = hit.access(); 159 160 to.record_event(EventRecord { 160 161 nsid: nsid.to_smolstr(), 161 - timestamp, 162 + timestamp: hit.timestamp, 162 163 deleted: data.deleted, 163 164 }) 164 165 .expect("cant record event");