//! Online Minecraft written-book viewer: in-memory library index with
//! hash lookup and fuzzy search over titles, authors, and contents.
use ahash::{AHashMap, AHashSet};
use lru::LruCache;
use nara_core::{
    book::{Book, BookHash, BookSource},
    component::Component,
};
use smol_str::SmolStr;
use std::{cell::RefCell, num::NonZeroUsize};
use strsim::jaro_winkler;

/// Index of a book inside `Library::books`; ids are assigned sequentially
/// on insert and stay stable for the lifetime of the library.
pub type BookId = usize;

/// In-memory index of all books with lookup and fuzzy search helpers.
#[derive(Debug)]
pub struct Library {
    // Backing store; every other structure refers into this by `BookId`.
    books: Vec<Book>,

    // Exact lookup: content hash -> book id / original source.
    by_hash: AHashMap<BookHash, BookId>,
    source_by_hash: AHashMap<BookHash, BookSource>,
    // Lowercased author name -> ids of that author's books.
    by_author_lc: AHashMap<SmolStr, Vec<BookId>>,

    // normalized blobs for scoring (same index as `books`)
    norm_title: Vec<String>,
    norm_author: Vec<String>,
    norm_contents: Vec<String>,

    // trigram inverted indices -> candidate generation
    tri_title: AHashMap<u32, Vec<BookId>>,
    tri_author: AHashMap<u32, Vec<BookId>>,
    tri_contents: AHashMap<u32, Vec<BookId>>,

    // Score thresholds consumed by the fuzzy rankers (see `fuzzy_rank_with`,
    // which admits results scoring >= threshold * 0.75).
    content_threshold: f64,
    author_threshold: f64,
    title_threshold: f64,

    // Per-query result caches; `RefCell` gives single-threaded interior
    // mutability so `&self` query methods can update the LRU order.
    cache_books_by_author: RefCell<LruCache<SmolStr, Vec<BookId>>>,
    cache_fuzzy_title: RefCell<LruCache<FuzzyKey, Vec<(BookId, f64)>>>,
    cache_fuzzy_author: RefCell<LruCache<FuzzyKey, Vec<(BookId, f64)>>>,
    cache_fuzzy_contents: RefCell<LruCache<FuzzyKey, Vec<(BookId, f64)>>>,
    cache_fuzzy_all: RefCell<LruCache<FuzzyKey, Vec<(BookId, f64)>>>,

    // Number of empty books skipped by `add_book` when filtering is enabled.
    empty_books_filtered: u16,
}

/// Cache key for fuzzy searches.
#[derive(Debug, Clone, Hash, PartialEq, Eq)]
struct FuzzyKey {
    query: SmolStr,
    limit: usize,
}

impl Library {
    /// Creates an empty library with the given fuzzy-match score thresholds.
    pub fn new(
        content_threshold: f64,
        title_threshold: f64,
        author_threshold: f64,
    ) -> Self {
        Self {
            books: Vec::new(),
            by_hash: AHashMap::new(),
            source_by_hash: AHashMap::new(),
            by_author_lc: AHashMap::new(),
            norm_title: Vec::new(),
            norm_author: Vec::new(),
            norm_contents: Vec::new(),
            tri_title: AHashMap::new(),
            tri_author: AHashMap::new(),
            tri_contents: AHashMap::new(),
            content_threshold,
            title_threshold,
            author_threshold,
            cache_books_by_author: RefCell::new(new_lru(CACHE_BY_AUTHOR_CAP)),
            cache_fuzzy_title: RefCell::new(new_lru(CACHE_FUZZY_CAP)),
            cache_fuzzy_author: RefCell::new(new_lru(CACHE_FUZZY_CAP)),
            cache_fuzzy_contents: RefCell::new(new_lru(CACHE_FUZZY_CAP)),
            cache_fuzzy_all: RefCell::new(new_lru(CACHE_FUZZY_CAP)),
            empty_books_filtered: 0,
        }
    }

    /// Inserts a book and updates every index.
    ///
    /// Books whose plain text is empty are skipped when `filter_empty_books`
    /// is set (logging a warning if `warn_empty`); duplicates by content
    /// hash are ignored. All query caches are invalidated on insert.
    pub fn add_book(
        &mut self,
        book: Book,
        warn_empty: bool,
        filter_empty_books: bool,
    ) {
        if filter_empty_books && book_plain_text_empty(&book) {
            if warn_empty {
                tracing::warn!(
                    "Skipping empty book with source {0:?}: {1} by {2}",
                    book.metadata.source,
                    book.content.title,
                    book.content.author
                );
            }
            // Saturate so a pathological input stream cannot overflow the
            // u16 counter (a bare `+= 1` panics in debug builds past 65535).
            self.empty_books_filtered =
                self.empty_books_filtered.saturating_add(1);
            return;
        }

        let h = book.hash();
        if self.by_hash.contains_key(&h) {
            // Same content already indexed; keep the first copy.
            return;
        }

        let id = self.books.len();
        let source = book.metadata.source.clone();

        // Normalize once up front and reuse for both the author index and
        // the scoring blobs (previously the author was normalized twice).
        let title_norm = normalize(&book.content.title);
        let author_norm = normalize(&book.content.author);
        let contents_norm = normalize_contents(&book.content.pages);

        self.books.push(book);

        // exact-lookup indices
        self.by_hash.insert(h, id);
        self.source_by_hash.insert(h, source);

        if !author_norm.is_empty() {
            self.by_author_lc
                .entry(SmolStr::new(&author_norm))
                .or_default()
                .push(id);
        }

        // candidate-generation indices (borrow the blobs before moving them)
        index_trigrams(&mut self.tri_title, id, &title_norm);
        index_trigrams(&mut self.tri_author, id, &author_norm);
        index_trigrams(&mut self.tri_contents, id, &contents_norm);

        // normalized blobs (for scoring), same index as `books`
        self.norm_title.push(title_norm);
        self.norm_author.push(author_norm);
        self.norm_contents.push(contents_norm);

        // Any cached query result may now be stale.
        self.cache_books_by_author.borrow_mut().clear();
        self.cache_fuzzy_title.borrow_mut().clear();
        self.cache_fuzzy_author.borrow_mut().clear();
        self.cache_fuzzy_contents.borrow_mut().clear();
        self.cache_fuzzy_all.borrow_mut().clear();
    }

    /// Looks up a book by its content hash.
    #[inline]
    pub fn book_by_hash(&self, hash: BookHash) -> Option<&Book> {
        self.by_hash.get(&hash).map(|&id| &self.books[id])
    }

    /// Lists books for an author (case-insensitive match).
    #[inline]
    pub fn books_by_author<'a>(
        &'a self,
        author: &str,
    ) -> impl Iterator<Item = &'a Book> + 'a {
        let key = SmolStr::new(normalize(author));
        let ids = if key.is_empty() {
            Vec::new()
        } else if let Some(ids) =
            self.cache_books_by_author.borrow_mut().get(&key).cloned()
        {
            ids
        } else {
            // Negative results (unknown author -> empty list) are cached too.
            let ids = self.by_author_lc.get(&key).cloned().unwrap_or_default();
            self.cache_books_by_author
                .borrow_mut()
                .put(key, ids.clone());
            ids
        };

        ids.into_iter().map(|id| &self.books[id])
    }

    /// Fuzzy search over normalized titles.
171 pub fn fuzzy_title(&self, query: &str, limit: usize) -> Vec<(&Book, f64)> { 172 let key = SmolStr::new(normalize(query)); 173 if key.is_empty() || limit == 0 { 174 return Vec::new(); 175 } 176 177 let cache_key = FuzzyKey { 178 query: key.clone(), 179 limit, 180 }; 181 182 let scored = if let Some(scored) = 183 self.cache_fuzzy_title.borrow_mut().get(&cache_key).cloned() 184 { 185 scored 186 } else { 187 let scored = fuzzy_rank( 188 key.as_str(), 189 &self.norm_title, 190 &self.tri_title, 191 limit, 192 self.title_threshold, 193 ); 194 self.cache_fuzzy_title 195 .borrow_mut() 196 .put(cache_key, scored.clone()); 197 scored 198 }; 199 200 scored 201 .into_iter() 202 .map(|(id, s)| (&self.books[id], s)) 203 .collect() 204 } 205 206 /// Fuzzy search over normalized author names. 207 pub fn fuzzy_author(&self, query: &str, limit: usize) -> Vec<(&Book, f64)> { 208 let key = SmolStr::new(normalize(query)); 209 if key.is_empty() || limit == 0 { 210 return Vec::new(); 211 } 212 213 let cache_key = FuzzyKey { 214 query: key.clone(), 215 limit, 216 }; 217 218 let scored = if let Some(scored) = self 219 .cache_fuzzy_author 220 .borrow_mut() 221 .get(&cache_key) 222 .cloned() 223 { 224 scored 225 } else { 226 let scored = fuzzy_rank( 227 key.as_str(), 228 &self.norm_author, 229 &self.tri_author, 230 limit, 231 self.author_threshold, 232 ); 233 self.cache_fuzzy_author 234 .borrow_mut() 235 .put(cache_key, scored.clone()); 236 scored 237 }; 238 239 scored 240 .into_iter() 241 .map(|(id, s)| (&self.books[id], s)) 242 .collect() 243 } 244 245 /// Fuzzy search over normalized contents blobs. 
    pub fn fuzzy_contents(
        &self,
        query: &str,
        limit: usize,
    ) -> Vec<(&Book, f64)> {
        let key = SmolStr::new(normalize(query));
        if key.is_empty() || limit == 0 {
            return Vec::new();
        }

        let cache_key = FuzzyKey {
            query: key.clone(),
            limit,
        };

        // Serve from the LRU cache when possible; otherwise rank and cache.
        let scored = if let Some(scored) = self
            .cache_fuzzy_contents
            .borrow_mut()
            .get(&cache_key)
            .cloned()
        {
            scored
        } else {
            let scored = fuzzy_rank_contents(
                key.as_str(),
                &self.norm_contents,
                &self.tri_contents,
                limit,
                self.content_threshold,
            );
            self.cache_fuzzy_contents
                .borrow_mut()
                .put(cache_key, scored.clone());
            scored
        };

        // Resolve ids to book references only after caching, so the cache
        // stores plain `(BookId, f64)` pairs.
        scored
            .into_iter()
            .map(|(id, s)| (&self.books[id], s))
            .collect()
    }

    /// Combined fuzzy search (title + author + contents).
    ///
    /// Title and author scores are summed at full weight; contents scores
    /// are down-weighted by 0.7 before summing. Each field is over-fetched
    /// (limit * 4 or * 6, clamped) so the merged ranking has enough
    /// candidates before truncating to `limit`.
    pub fn fuzzy(&self, query: &str, limit: usize) -> Vec<(&Book, f64)> {
        let key = SmolStr::new(normalize(query));
        if key.is_empty() || limit == 0 {
            return Vec::new();
        }

        let cache_key = FuzzyKey {
            query: key.clone(),
            limit,
        };

        let scored = if let Some(scored) =
            self.cache_fuzzy_all.borrow_mut().get(&cache_key).cloned()
        {
            scored
        } else {
            // Per-book running total across the three fields.
            let mut totals: AHashMap<BookId, f64> = AHashMap::new();

            let title = fuzzy_rank(
                key.as_str(),
                &self.norm_title,
                &self.tri_title,
                (limit * 4).clamp(50, 2000),
                self.title_threshold,
            );
            for (id, s) in title {
                *totals.entry(id).or_insert(0.0) += s;
            }

            let author = fuzzy_rank(
                key.as_str(),
                &self.norm_author,
                &self.tri_author,
                (limit * 4).clamp(50, 2000),
                self.author_threshold,
            );
            for (id, s) in author {
                *totals.entry(id).or_insert(0.0) += s;
            }

            let contents = fuzzy_rank_contents(
                key.as_str(),
                &self.norm_contents,
                &self.tri_contents,
                (limit * 6).clamp(100, 4000),
                self.content_threshold,
            );
            for (id, s) in contents {
                // Contents matches count for less than title/author matches.
                *totals.entry(id).or_insert(0.0) += s * 0.7;
            }

            // Sort by combined score, descending; NaN-safe comparison falls
            // back to Equal.
            let mut scored: Vec<(BookId, f64)> = totals.into_iter().collect();
            scored.sort_by(|a, b| {
                b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal)
            });
            scored.truncate(limit);

            self.cache_fuzzy_all
                .borrow_mut()
                .put(cache_key, scored.clone());
            scored
        };

        scored
            .into_iter()
            .map(|(id, s)| (&self.books[id], s))
            .collect()
    }

    /// Returns the number of indexed books.
    #[inline]
    pub fn book_count(&self) -> usize {
        self.books.len()
    }

    /// Returns a list of all books in the library.
    #[inline]
    pub fn all_books<'a>(&'a self) -> impl Iterator<Item = &'a Book> + 'a {
        self.books.iter()
    }

    /// Returns the source for a book hash, if present.
    #[inline]
    pub fn source_for_hash(&self, hash: &BookHash) -> Option<&BookSource> {
        self.source_by_hash.get(hash)
    }
}

/// Lowercases and normalizes a query string.
#[inline]
fn normalize(s: &str) -> String {
    s.to_lowercase()
}

/// Returns true when the book has no pages or only whitespace-only pages.
fn book_plain_text_empty(book: &Book) -> bool {
    book.content.pages.is_empty()
        || book
            .content
            .pages
            .iter()
            .all(|page| page.normalize().trim().is_empty())
}

// Capacities for the query-result LRU caches.
const CACHE_BY_AUTHOR_CAP: usize = 1024;
const CACHE_FUZZY_CAP: usize = 256;

/// Helper to build LRU caches with non-zero capacity.
fn new_lru<K: std::hash::Hash + Eq, V>(cap: usize) -> LruCache<K, V> {
    LruCache::new(NonZeroUsize::new(cap).expect("cache cap must be > 0"))
}

// Upper bound (in bytes) on how much page text is indexed per book.
const MAX_CONTENT_INDEX_CHARS: usize = 16_384;

/// Joins and normalizes pages for the contents index.
403fn normalize_contents(pages: &[Component]) -> String { 404 let mut out = String::new(); 405 for p in pages { 406 if out.len() >= MAX_CONTENT_INDEX_CHARS { 407 break; 408 } 409 if !out.is_empty() { 410 out.push('\n'); 411 } 412 out.push_str(&p.to_plain_text()); 413 } 414 let mut out = out.to_lowercase(); 415 if out.len() > MAX_CONTENT_INDEX_CHARS { 416 out.truncate(MAX_CONTENT_INDEX_CHARS); 417 } 418 out 419} 420 421#[inline] 422fn query_terms(q: &str) -> Vec<&str> { 423 q.split_whitespace().filter(|t| !t.is_empty()).collect() 424} 425 426#[inline] 427fn token_coverage(terms: &[&str], haystack: &str) -> f64 { 428 if terms.is_empty() { 429 return 0.0; 430 } 431 let matched = terms.iter().filter(|t| haystack.contains(**t)).count(); 432 matched as f64 / terms.len() as f64 433} 434 435/// Encodes ASCII-ish trigrams into `u32`. 436fn trigrams(s: &str) -> AHashSet<u32> { 437 let b = s.as_bytes(); 438 let mut set = AHashSet::new(); 439 440 if b.is_empty() { 441 return set; 442 } 443 444 if b.len() < 3 { 445 let b0 = b.first().copied().unwrap_or(0); 446 let b1 = b.get(1).copied().unwrap_or(0); 447 let tri = ((b0 as u32) << 16) | ((b1 as u32) << 8); 448 set.insert(tri); 449 return set; 450 } 451 452 for w in b.windows(3) { 453 let tri = ((w[0] as u32) << 16) | ((w[1] as u32) << 8) | (w[2] as u32); 454 set.insert(tri); 455 } 456 457 set 458} 459 460/// Adds all trigrams from a string to the inverted index. 461fn index_trigrams( 462 index: &mut AHashMap<u32, Vec<BookId>>, 463 id: BookId, 464 norm: &str, 465) { 466 for tri in trigrams(norm) { 467 index.entry(tri).or_default().push(id); 468 } 469} 470 471/// Shared pipeline for fuzzy ranking with a custom scoring function. 
fn fuzzy_rank_with<F>(
    query: &str,
    norm_field: &[String],
    tri_index: &AHashMap<u32, Vec<BookId>>,
    limit: usize,
    score_threshold: f64,
    max_to_score: usize,
    mut score_fn: F,
) -> Vec<(BookId, f64)>
where
    // score_fn(id, trigram_overlap, normalized_field, query_trigrams,
    //          normalized_query, query_terms) -> score
    F: FnMut(BookId, u32, &str, &AHashSet<u32>, &str, &[&str]) -> f64,
{
    let q = normalize(query);
    if q.is_empty() || limit == 0 {
        return Vec::new();
    }

    let q_tris = trigrams(&q);
    let terms = query_terms(&q);
    // Candidate generation: count how many query trigrams each book shares.
    let mut counts: AHashMap<BookId, u32> = AHashMap::new();

    for tri in q_tris.iter() {
        if let Some(ids) = tri_index.get(tri) {
            for &id in ids {
                *counts.entry(id).or_insert(0) += 1;
            }
        }
    }

    // Always include obvious direct matches (substring / all-terms hits)
    // with a large count boost so they cannot be dropped by trigram
    // candidate-generation edge cases.
    for (id, s_norm) in norm_field.iter().enumerate() {
        let contains_query = s_norm.contains(&q);
        let has_all_terms =
            !terms.is_empty() && token_coverage(&terms, s_norm) == 1.0;
        if contains_query {
            *counts.entry(id).or_insert(0) += 10_000;
        }
        if has_all_terms {
            *counts.entry(id).or_insert(0) += 5_000;
        }
    }

    // Short queries (< 3 bytes have no real trigrams) or empty candidate
    // sets fall back to scoring every book.
    let candidates: Vec<(BookId, u32)> = if q.len() < 3 || counts.is_empty() {
        (0..norm_field.len()).map(|id| (id, 0)).collect()
    } else {
        let mut v: Vec<(BookId, u32)> = counts.into_iter().collect();
        v.sort_by(|a, b| b.1.cmp(&a.1));
        v.truncate(max_to_score);
        v
    };

    // Score each candidate; keep those above 75% of the threshold, plus
    // all direct substring matches regardless of score.
    let mut scored: Vec<(BookId, f64)> = Vec::with_capacity(candidates.len());
    for (id, overlap) in candidates {
        let s_norm = &norm_field[id];
        let score = score_fn(id, overlap, s_norm, &q_tris, &q, &terms);
        if score >= (score_threshold * 0.75) || s_norm.contains(&q) {
            scored.push((id, score));
        }
    }

    // Highest score first; NaN-safe comparison falls back to Equal.
    scored.sort_by(|a, b| {
        b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal)
    });
    scored.truncate(limit);

    scored
}

/// Fuzzy rank using Jaro-Winkler on the normalized field.
fn fuzzy_rank(
    query: &str,
    norm_field: &[String],
    tri_index: &AHashMap<u32, Vec<BookId>>,
    limit: usize,
    score_threshold: f64,
) -> Vec<(BookId, f64)> {
    // Score at most 30x the requested limit, bounded to [50, 5000].
    let max_to_score = (limit * 30).clamp(50, 5000);
    fuzzy_rank_with(
        query,
        norm_field,
        tri_index,
        limit,
        score_threshold,
        max_to_score,
        |_, overlap, s_norm, q_tris, q, terms| {
            // Fraction of query trigrams present in the field.
            let overlap_score = if q_tris.is_empty() {
                0.0
            } else {
                overlap as f64 / q_tris.len() as f64
            };
            let coverage = token_coverage(terms, s_norm);
            // Weighted blend of string similarity, trigram overlap, and
            // term coverage, plus fixed bonuses for exact / prefix /
            // substring / all-terms matches.
            let mut score = (jaro_winkler(q, s_norm) * 0.45)
                + (overlap_score * 0.25)
                + (coverage * 0.45);
            if s_norm == q {
                score += 1.0;
            }
            if s_norm.starts_with(q) {
                score += 0.45;
            }
            if s_norm.contains(q) {
                score += 0.6;
            }
            if !terms.is_empty() && coverage == 1.0 {
                score += 0.5;
            }
            score
        },
    )
}

/// Fuzzy rank using trigram overlap against contents blobs.
fn fuzzy_rank_contents(
    query: &str,
    norm_field: &[String],
    tri_index: &AHashMap<u32, Vec<BookId>>,
    limit: usize,
    score_threshold: f64,
) -> Vec<(BookId, f64)> {
    // Contents blobs are long, so score more candidates than `fuzzy_rank`.
    let max_to_score = (limit * 50).clamp(100, 10_000);
    fuzzy_rank_with(
        query,
        norm_field,
        tri_index,
        limit,
        score_threshold,
        max_to_score,
        |_, overlap, s_norm, q_tris, q, terms| {
            let q_tri_len = q_tris.len().max(1) as f64;
            let overlap_score = (overlap as f64) / q_tri_len;
            let coverage = token_coverage(terms, s_norm);
            // Overlap and coverage dominate; Jaro-Winkler contributes only
            // a small capped amount (whole-blob similarity is weak signal).
            let mut score = (overlap_score * 0.55) + (coverage * 0.75);
            if s_norm.contains(q) {
                score += 0.9;
            }
            if !terms.is_empty() && coverage == 1.0 {
                score += 0.5;
            }
            score += jaro_winkler(q, s_norm).min(0.75) * 0.15;
            score
        },
    )
}