//! Online Minecraft written book viewer: in-memory book index with fuzzy search.
1use ahash::{AHashMap, AHashSet};
2use lru::LruCache;
3use nara_core::{
4 book::{Book, BookHash, BookSource},
5 component::Component,
6};
7use smol_str::SmolStr;
8use std::{cell::RefCell, num::NonZeroUsize};
9use strsim::jaro_winkler;
10
/// Identifier of a book: its index into `Library::books` (assigned in insertion order).
pub type BookId = usize;
12
/// In-memory index of all books with lookup and fuzzy search helpers.
#[derive(Debug)]
pub struct Library {
    /// Backing store; a [`BookId`] is an index into this vector.
    books: Vec<Book>,

    /// Content hash -> book id, for exact duplicate detection and lookup.
    by_hash: AHashMap<BookHash, BookId>,
    /// Content hash -> where the book came from.
    source_by_hash: AHashMap<BookHash, BookSource>,
    /// Lowercased author name -> ids of that author's books.
    by_author_lc: AHashMap<SmolStr, Vec<BookId>>,

    // normalized blobs for scoring (same index as `books`)
    norm_title: Vec<String>,
    norm_author: Vec<String>,
    norm_contents: Vec<String>,

    // trigram inverted indices -> candidate generation
    tri_title: AHashMap<u32, Vec<BookId>>,
    tri_author: AHashMap<u32, Vec<BookId>>,
    tri_contents: AHashMap<u32, Vec<BookId>>,

    // Minimum fuzzy scores below which results are dropped (see `fuzzy_rank_with`).
    content_threshold: f64,
    author_threshold: f64,
    title_threshold: f64,

    // Memoized query results; cleared whenever a book is added.
    cache_books_by_author: RefCell<LruCache<SmolStr, Vec<BookId>>>,
    cache_fuzzy_title: RefCell<LruCache<FuzzyKey, Vec<(BookId, f64)>>>,
    cache_fuzzy_author: RefCell<LruCache<FuzzyKey, Vec<(BookId, f64)>>>,
    cache_fuzzy_contents: RefCell<LruCache<FuzzyKey, Vec<(BookId, f64)>>>,
    cache_fuzzy_all: RefCell<LruCache<FuzzyKey, Vec<(BookId, f64)>>>,

    /// Count of books skipped by the empty-book filter in `add_book`.
    empty_books_filtered: u16,
}
44
/// Cache key for fuzzy searches.
///
/// Results depend on both the normalized query text and the requested
/// result count, so both participate in the key.
#[derive(Debug, Clone, Hash, PartialEq, Eq)]
struct FuzzyKey {
    /// Query after `normalize` (lowercased).
    query: SmolStr,
    /// Maximum number of results requested.
    limit: usize,
}
51
52impl Library {
53 pub fn new(
54 content_threshold: f64,
55 title_threshold: f64,
56 author_threshold: f64,
57 ) -> Self {
58 Self {
59 books: Vec::new(),
60 by_hash: AHashMap::new(),
61 source_by_hash: AHashMap::new(),
62 by_author_lc: AHashMap::new(),
63 norm_title: Vec::new(),
64 norm_author: Vec::new(),
65 norm_contents: Vec::new(),
66 tri_title: AHashMap::new(),
67 tri_author: AHashMap::new(),
68 tri_contents: AHashMap::new(),
69 content_threshold,
70 title_threshold,
71 author_threshold,
72 cache_books_by_author: RefCell::new(new_lru(CACHE_BY_AUTHOR_CAP)),
73 cache_fuzzy_title: RefCell::new(new_lru(CACHE_FUZZY_CAP)),
74 cache_fuzzy_author: RefCell::new(new_lru(CACHE_FUZZY_CAP)),
75 cache_fuzzy_contents: RefCell::new(new_lru(CACHE_FUZZY_CAP)),
76 cache_fuzzy_all: RefCell::new(new_lru(CACHE_FUZZY_CAP)),
77 empty_books_filtered: 0,
78 }
79 }
80
81 /// Inserts a book
82 pub fn add_book(
83 &mut self,
84 book: Book,
85 warn_empty: bool,
86 filter_empty_books: bool,
87 ) {
88 if filter_empty_books && book_plain_text_empty(&book) {
89 if warn_empty {
90 tracing::warn!(
91 "Skipping empty book with source {0:?}: {1} by {2}",
92 book.metadata.source,
93 book.content.title,
94 book.content.author
95 );
96 }
97 self.empty_books_filtered += 1;
98 return;
99 };
100
101 let h = book.hash();
102 if self.by_hash.contains_key(&h) {
103 return;
104 }
105
106 let id = self.books.len();
107
108 let source = book.metadata.source.clone();
109 self.books.push(book);
110
111 // indices...
112 self.by_hash.insert(h, id);
113 self.source_by_hash.insert(h, source);
114
115 let author_lc = SmolStr::new(normalize(&self.books[id].content.author));
116 if !author_lc.is_empty() {
117 self.by_author_lc.entry(author_lc).or_default().push(id);
118 }
119
120 // normalized blobs (for scoring)
121 self.norm_title
122 .push(normalize(&self.books[id].content.title));
123 self.norm_author
124 .push(normalize(&self.books[id].content.author));
125 self.norm_contents
126 .push(normalize_contents(&self.books[id].content.pages));
127
128 // candidate-generation indices
129 index_trigrams(&mut self.tri_title, id, &self.norm_title[id]);
130 index_trigrams(&mut self.tri_author, id, &self.norm_author[id]);
131 index_trigrams(&mut self.tri_contents, id, &self.norm_contents[id]);
132
133 self.cache_books_by_author.borrow_mut().clear();
134 self.cache_fuzzy_title.borrow_mut().clear();
135 self.cache_fuzzy_author.borrow_mut().clear();
136 self.cache_fuzzy_contents.borrow_mut().clear();
137 self.cache_fuzzy_all.borrow_mut().clear();
138 }
139
140 /// Looks up a book by its content hash.
141 #[inline]
142 pub fn book_by_hash(&self, hash: BookHash) -> Option<&Book> {
143 self.by_hash.get(&hash).map(|&id| &self.books[id])
144 }
145
146 /// Lists books for an author (case-insensitive match).
147 #[inline]
148 pub fn books_by_author<'a>(
149 &'a self,
150 author: &str,
151 ) -> impl Iterator<Item = &'a Book> + 'a {
152 let key = SmolStr::new(normalize(author));
153 let ids = if key.is_empty() {
154 Vec::new()
155 } else if let Some(ids) =
156 self.cache_books_by_author.borrow_mut().get(&key).cloned()
157 {
158 ids
159 } else {
160 let ids = self.by_author_lc.get(&key).cloned().unwrap_or_default();
161 self.cache_books_by_author
162 .borrow_mut()
163 .put(key.clone(), ids.clone());
164 ids
165 };
166
167 ids.into_iter().map(|id| &self.books[id])
168 }
169
170 /// Fuzzy search over normalized titles.
171 pub fn fuzzy_title(&self, query: &str, limit: usize) -> Vec<(&Book, f64)> {
172 let key = SmolStr::new(normalize(query));
173 if key.is_empty() || limit == 0 {
174 return Vec::new();
175 }
176
177 let cache_key = FuzzyKey {
178 query: key.clone(),
179 limit,
180 };
181
182 let scored = if let Some(scored) =
183 self.cache_fuzzy_title.borrow_mut().get(&cache_key).cloned()
184 {
185 scored
186 } else {
187 let scored = fuzzy_rank(
188 key.as_str(),
189 &self.norm_title,
190 &self.tri_title,
191 limit,
192 self.title_threshold,
193 );
194 self.cache_fuzzy_title
195 .borrow_mut()
196 .put(cache_key, scored.clone());
197 scored
198 };
199
200 scored
201 .into_iter()
202 .map(|(id, s)| (&self.books[id], s))
203 .collect()
204 }
205
206 /// Fuzzy search over normalized author names.
207 pub fn fuzzy_author(&self, query: &str, limit: usize) -> Vec<(&Book, f64)> {
208 let key = SmolStr::new(normalize(query));
209 if key.is_empty() || limit == 0 {
210 return Vec::new();
211 }
212
213 let cache_key = FuzzyKey {
214 query: key.clone(),
215 limit,
216 };
217
218 let scored = if let Some(scored) = self
219 .cache_fuzzy_author
220 .borrow_mut()
221 .get(&cache_key)
222 .cloned()
223 {
224 scored
225 } else {
226 let scored = fuzzy_rank(
227 key.as_str(),
228 &self.norm_author,
229 &self.tri_author,
230 limit,
231 self.author_threshold,
232 );
233 self.cache_fuzzy_author
234 .borrow_mut()
235 .put(cache_key, scored.clone());
236 scored
237 };
238
239 scored
240 .into_iter()
241 .map(|(id, s)| (&self.books[id], s))
242 .collect()
243 }
244
245 /// Fuzzy search over normalized contents blobs.
246 pub fn fuzzy_contents(
247 &self,
248 query: &str,
249 limit: usize,
250 ) -> Vec<(&Book, f64)> {
251 let key = SmolStr::new(normalize(query));
252 if key.is_empty() || limit == 0 {
253 return Vec::new();
254 }
255
256 let cache_key = FuzzyKey {
257 query: key.clone(),
258 limit,
259 };
260
261 let scored = if let Some(scored) = self
262 .cache_fuzzy_contents
263 .borrow_mut()
264 .get(&cache_key)
265 .cloned()
266 {
267 scored
268 } else {
269 let scored = fuzzy_rank_contents(
270 key.as_str(),
271 &self.norm_contents,
272 &self.tri_contents,
273 limit,
274 self.content_threshold,
275 );
276 self.cache_fuzzy_contents
277 .borrow_mut()
278 .put(cache_key, scored.clone());
279 scored
280 };
281
282 scored
283 .into_iter()
284 .map(|(id, s)| (&self.books[id], s))
285 .collect()
286 }
287
288 /// Combined fuzzy search (title + author + contents).
289 pub fn fuzzy(&self, query: &str, limit: usize) -> Vec<(&Book, f64)> {
290 let key = SmolStr::new(normalize(query));
291 if key.is_empty() || limit == 0 {
292 return Vec::new();
293 }
294
295 let cache_key = FuzzyKey {
296 query: key.clone(),
297 limit,
298 };
299
300 let scored = if let Some(scored) =
301 self.cache_fuzzy_all.borrow_mut().get(&cache_key).cloned()
302 {
303 scored
304 } else {
305 let mut totals: AHashMap<BookId, f64> = AHashMap::new();
306
307 let title = fuzzy_rank(
308 key.as_str(),
309 &self.norm_title,
310 &self.tri_title,
311 (limit * 4).clamp(50, 2000),
312 self.title_threshold,
313 );
314 for (id, s) in title {
315 *totals.entry(id).or_insert(0.0) += s;
316 }
317
318 let author = fuzzy_rank(
319 key.as_str(),
320 &self.norm_author,
321 &self.tri_author,
322 (limit * 4).clamp(50, 2000),
323 self.author_threshold,
324 );
325 for (id, s) in author {
326 *totals.entry(id).or_insert(0.0) += s;
327 }
328
329 let contents = fuzzy_rank_contents(
330 key.as_str(),
331 &self.norm_contents,
332 &self.tri_contents,
333 (limit * 6).clamp(100, 4000),
334 self.content_threshold,
335 );
336 for (id, s) in contents {
337 *totals.entry(id).or_insert(0.0) += s * 0.7;
338 }
339
340 let mut scored: Vec<(BookId, f64)> = totals.into_iter().collect();
341 scored.sort_by(|a, b| {
342 b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal)
343 });
344 scored.truncate(limit);
345
346 self.cache_fuzzy_all
347 .borrow_mut()
348 .put(cache_key, scored.clone());
349 scored
350 };
351
352 scored
353 .into_iter()
354 .map(|(id, s)| (&self.books[id], s))
355 .collect()
356 }
357
358 /// Returns the number of indexed books.
359 #[inline]
360 pub fn book_count(&self) -> usize {
361 self.books.len()
362 }
363
364 /// Returns a list of all books in the library.
365 #[inline]
366 pub fn all_books<'a>(&'a self) -> impl Iterator<Item = &'a Book> + 'a {
367 self.books.iter()
368 }
369
370 /// Returns the source for a book hash, if present.
371 #[inline]
372 pub fn source_for_hash(&self, hash: &BookHash) -> Option<&BookSource> {
373 self.source_by_hash.get(hash)
374 }
375}
376
/// Normalizes a string for case-insensitive matching: full Unicode
/// lowercasing (the same routine is applied to queries and indexed text).
#[inline]
fn normalize(s: &str) -> String {
    str::to_lowercase(s)
}
382
383fn book_plain_text_empty(book: &Book) -> bool {
384 book.content.pages.is_empty()
385 || book
386 .content
387 .pages
388 .iter()
389 .all(|page| page.normalize().trim().is_empty())
390}
391
/// Capacity of the author-lookup LRU cache.
const CACHE_BY_AUTHOR_CAP: usize = 1024;
/// Capacity of each fuzzy-search result LRU cache.
const CACHE_FUZZY_CAP: usize = 256;
394
395/// Helper to build LRU caches with non-zero capacity.
396fn new_lru<K: std::hash::Hash + Eq, V>(cap: usize) -> LruCache<K, V> {
397 LruCache::new(NonZeroUsize::new(cap).expect("cache cap must be > 0"))
398}
399
/// Cap on the indexed contents blob per book — despite the name this is
/// measured in bytes (`String::len`), bounding index memory use.
const MAX_CONTENT_INDEX_CHARS: usize = 16_384;
401
402/// Joins and normalizes pages for the contents index.
403fn normalize_contents(pages: &[Component]) -> String {
404 let mut out = String::new();
405 for p in pages {
406 if out.len() >= MAX_CONTENT_INDEX_CHARS {
407 break;
408 }
409 if !out.is_empty() {
410 out.push('\n');
411 }
412 out.push_str(&p.to_plain_text());
413 }
414 let mut out = out.to_lowercase();
415 if out.len() > MAX_CONTENT_INDEX_CHARS {
416 out.truncate(MAX_CONTENT_INDEX_CHARS);
417 }
418 out
419}
420
/// Splits a query into whitespace-delimited terms.
///
/// `split_whitespace` never yields empty tokens, so no extra filtering is
/// required.
#[inline]
fn query_terms(q: &str) -> Vec<&str> {
    q.split_whitespace().collect()
}
425
/// Fraction of `terms` that occur as substrings of `haystack`
/// (0.0 when `terms` is empty).
#[inline]
fn token_coverage(terms: &[&str], haystack: &str) -> f64 {
    match terms.len() {
        0 => 0.0,
        total => {
            let hits =
                terms.iter().filter(|t| haystack.contains(**t)).count();
            hits as f64 / total as f64
        }
    }
}
434
435/// Encodes ASCII-ish trigrams into `u32`.
436fn trigrams(s: &str) -> AHashSet<u32> {
437 let b = s.as_bytes();
438 let mut set = AHashSet::new();
439
440 if b.is_empty() {
441 return set;
442 }
443
444 if b.len() < 3 {
445 let b0 = b.first().copied().unwrap_or(0);
446 let b1 = b.get(1).copied().unwrap_or(0);
447 let tri = ((b0 as u32) << 16) | ((b1 as u32) << 8);
448 set.insert(tri);
449 return set;
450 }
451
452 for w in b.windows(3) {
453 let tri = ((w[0] as u32) << 16) | ((w[1] as u32) << 8) | (w[2] as u32);
454 set.insert(tri);
455 }
456
457 set
458}
459
460/// Adds all trigrams from a string to the inverted index.
461fn index_trigrams(
462 index: &mut AHashMap<u32, Vec<BookId>>,
463 id: BookId,
464 norm: &str,
465) {
466 for tri in trigrams(norm) {
467 index.entry(tri).or_default().push(id);
468 }
469}
470
/// Shared pipeline for fuzzy ranking with a custom scoring function.
///
/// Stages:
/// 1. normalize the query and derive its trigram set and terms;
/// 2. count trigram overlaps per book via the inverted index;
/// 3. force-include direct substring / full-term matches with large
///    count boosts (these boosts also inflate the `overlap` value passed
///    to `score_fn` — deliberate, so direct matches rank very high);
/// 4. score the top `max_to_score` candidates with `score_fn`, keep
///    those above 75% of `score_threshold` (or containing the query),
///    sort descending and truncate to `limit`.
///
/// `score_fn` receives `(id, overlap_count, normalized_field, query_trigrams,
/// normalized_query, query_terms)` and returns a score.
fn fuzzy_rank_with<F>(
    query: &str,
    norm_field: &[String],
    tri_index: &AHashMap<u32, Vec<BookId>>,
    limit: usize,
    score_threshold: f64,
    max_to_score: usize,
    mut score_fn: F,
) -> Vec<(BookId, f64)>
where
    F: FnMut(BookId, u32, &str, &AHashSet<u32>, &str, &[&str]) -> f64,
{
    let q = normalize(query);
    if q.is_empty() || limit == 0 {
        return Vec::new();
    }

    let q_tris = trigrams(&q);
    let terms = query_terms(&q);
    // overlap counts per book: how many query trigrams hit each posting list
    let mut counts: AHashMap<BookId, u32> = AHashMap::new();

    for tri in q_tris.iter() {
        if let Some(ids) = tri_index.get(tri) {
            for &id in ids {
                *counts.entry(id).or_insert(0) += 1;
            }
        }
    }

    // Always include obvious direct matches so they don't disappear due to
    // trigram candidate generation edge-cases. NOTE: this is a full scan
    // of the field on every query.
    for (id, s_norm) in norm_field.iter().enumerate() {
        let contains_query = s_norm.contains(&q);
        let has_all_terms =
            !terms.is_empty() && token_coverage(&terms, s_norm) == 1.0;
        if contains_query {
            *counts.entry(id).or_insert(0) += 10_000;
        }
        if has_all_terms {
            *counts.entry(id).or_insert(0) += 5_000;
        }
    }

    // Short queries produce unreliable trigrams: fall back to scoring
    // every book with a zero overlap count (the boosts above are dropped
    // on this path).
    let candidates: Vec<(BookId, u32)> = if q.len() < 3 || counts.is_empty() {
        (0..norm_field.len()).map(|id| (id, 0)).collect()
    } else {
        let mut v: Vec<(BookId, u32)> = counts.into_iter().collect();
        v.sort_by(|a, b| b.1.cmp(&a.1));
        v.truncate(max_to_score);
        v
    };

    let mut scored: Vec<(BookId, f64)> = Vec::with_capacity(candidates.len());
    for (id, overlap) in candidates {
        let s_norm = &norm_field[id];
        let score = score_fn(id, overlap, s_norm, &q_tris, &q, &terms);
        // Relaxed threshold (75%); substring matches always survive.
        if score >= (score_threshold * 0.75) || s_norm.contains(&q) {
            scored.push((id, score));
        }
    }

    // NaN scores compare as Equal, keeping the sort total.
    scored.sort_by(|a, b| {
        b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal)
    });
    scored.truncate(limit);

    scored
}
540
541/// Fuzzy rank using Jaro-Winkler on the normalized field.
542fn fuzzy_rank(
543 query: &str,
544 norm_field: &[String],
545 tri_index: &AHashMap<u32, Vec<BookId>>,
546 limit: usize,
547 score_threshold: f64,
548) -> Vec<(BookId, f64)> {
549 let max_to_score = (limit * 30).clamp(50, 5000);
550 fuzzy_rank_with(
551 query,
552 norm_field,
553 tri_index,
554 limit,
555 score_threshold,
556 max_to_score,
557 |_, overlap, s_norm, q_tris, q, terms| {
558 let overlap_score = if q_tris.is_empty() {
559 0.0
560 } else {
561 overlap as f64 / q_tris.len() as f64
562 };
563 let coverage = token_coverage(terms, s_norm);
564 let mut score = (jaro_winkler(q, s_norm) * 0.45)
565 + (overlap_score * 0.25)
566 + (coverage * 0.45);
567 if s_norm == q {
568 score += 1.0;
569 }
570 if s_norm.starts_with(q) {
571 score += 0.45;
572 }
573 if s_norm.contains(q) {
574 score += 0.6;
575 }
576 if !terms.is_empty() && coverage == 1.0 {
577 score += 0.5;
578 }
579 score
580 },
581 )
582}
583
584/// Fuzzy rank using trigram overlap against contents blobs.
585fn fuzzy_rank_contents(
586 query: &str,
587 norm_field: &[String],
588 tri_index: &AHashMap<u32, Vec<BookId>>,
589 limit: usize,
590 score_threshold: f64,
591) -> Vec<(BookId, f64)> {
592 let max_to_score = (limit * 50).clamp(100, 10_000);
593 fuzzy_rank_with(
594 query,
595 norm_field,
596 tri_index,
597 limit,
598 score_threshold,
599 max_to_score,
600 |_, overlap, s_norm, q_tris, q, terms| {
601 let q_tri_len = q_tris.len().max(1) as f64;
602 let overlap_score = (overlap as f64) / q_tri_len;
603 let coverage = token_coverage(terms, s_norm);
604 let mut score = (overlap_score * 0.55) + (coverage * 0.75);
605 if s_norm.contains(q) {
606 score += 0.9;
607 }
608 if !terms.is_empty() && coverage == 1.0 {
609 score += 0.5;
610 }
611 score += jaro_winkler(q, s_norm).min(0.75) * 0.15;
612 score
613 },
614 )
615}