use ahash::{AHashMap, AHashSet}; use lru::LruCache; use nara_core::{ book::{Book, BookHash, BookSource}, component::Component, }; use smol_str::SmolStr; use std::{cell::RefCell, num::NonZeroUsize}; use strsim::jaro_winkler; pub type BookId = usize; /// In-memory index of all books with lookup and fuzzy search helpers. #[derive(Debug)] pub struct Library { books: Vec, by_hash: AHashMap, source_by_hash: AHashMap, by_author_lc: AHashMap>, // normalized blobs for scoring (same index as `books`) norm_title: Vec, norm_author: Vec, norm_contents: Vec, // trigram inverted indices -> candidate generation tri_title: AHashMap>, tri_author: AHashMap>, tri_contents: AHashMap>, content_threshold: f64, author_threshold: f64, title_threshold: f64, cache_books_by_author: RefCell>>, cache_fuzzy_title: RefCell>>, cache_fuzzy_author: RefCell>>, cache_fuzzy_contents: RefCell>>, cache_fuzzy_all: RefCell>>, empty_books_filtered: u16, } /// Cache key for fuzzy searches. #[derive(Debug, Clone, Hash, PartialEq, Eq)] struct FuzzyKey { query: SmolStr, limit: usize, } impl Library { pub fn new( content_threshold: f64, title_threshold: f64, author_threshold: f64, ) -> Self { Self { books: Vec::new(), by_hash: AHashMap::new(), source_by_hash: AHashMap::new(), by_author_lc: AHashMap::new(), norm_title: Vec::new(), norm_author: Vec::new(), norm_contents: Vec::new(), tri_title: AHashMap::new(), tri_author: AHashMap::new(), tri_contents: AHashMap::new(), content_threshold, title_threshold, author_threshold, cache_books_by_author: RefCell::new(new_lru(CACHE_BY_AUTHOR_CAP)), cache_fuzzy_title: RefCell::new(new_lru(CACHE_FUZZY_CAP)), cache_fuzzy_author: RefCell::new(new_lru(CACHE_FUZZY_CAP)), cache_fuzzy_contents: RefCell::new(new_lru(CACHE_FUZZY_CAP)), cache_fuzzy_all: RefCell::new(new_lru(CACHE_FUZZY_CAP)), empty_books_filtered: 0, } } /// Inserts a book pub fn add_book( &mut self, book: Book, warn_empty: bool, filter_empty_books: bool, ) { if filter_empty_books && book_plain_text_empty(&book) { if 
warn_empty { tracing::warn!( "Skipping empty book with source {0:?}: {1} by {2}", book.metadata.source, book.content.title, book.content.author ); } self.empty_books_filtered += 1; return; }; let h = book.hash(); if self.by_hash.contains_key(&h) { return; } let id = self.books.len(); let source = book.metadata.source.clone(); self.books.push(book); // indices... self.by_hash.insert(h, id); self.source_by_hash.insert(h, source); let author_lc = SmolStr::new(normalize(&self.books[id].content.author)); if !author_lc.is_empty() { self.by_author_lc.entry(author_lc).or_default().push(id); } // normalized blobs (for scoring) self.norm_title .push(normalize(&self.books[id].content.title)); self.norm_author .push(normalize(&self.books[id].content.author)); self.norm_contents .push(normalize_contents(&self.books[id].content.pages)); // candidate-generation indices index_trigrams(&mut self.tri_title, id, &self.norm_title[id]); index_trigrams(&mut self.tri_author, id, &self.norm_author[id]); index_trigrams(&mut self.tri_contents, id, &self.norm_contents[id]); self.cache_books_by_author.borrow_mut().clear(); self.cache_fuzzy_title.borrow_mut().clear(); self.cache_fuzzy_author.borrow_mut().clear(); self.cache_fuzzy_contents.borrow_mut().clear(); self.cache_fuzzy_all.borrow_mut().clear(); } /// Looks up a book by its content hash. #[inline] pub fn book_by_hash(&self, hash: BookHash) -> Option<&Book> { self.by_hash.get(&hash).map(|&id| &self.books[id]) } /// Lists books for an author (case-insensitive match). 
#[inline] pub fn books_by_author<'a>( &'a self, author: &str, ) -> impl Iterator + 'a { let key = SmolStr::new(normalize(author)); let ids = if key.is_empty() { Vec::new() } else if let Some(ids) = self.cache_books_by_author.borrow_mut().get(&key).cloned() { ids } else { let ids = self.by_author_lc.get(&key).cloned().unwrap_or_default(); self.cache_books_by_author .borrow_mut() .put(key.clone(), ids.clone()); ids }; ids.into_iter().map(|id| &self.books[id]) } /// Fuzzy search over normalized titles. pub fn fuzzy_title(&self, query: &str, limit: usize) -> Vec<(&Book, f64)> { let key = SmolStr::new(normalize(query)); if key.is_empty() || limit == 0 { return Vec::new(); } let cache_key = FuzzyKey { query: key.clone(), limit, }; let scored = if let Some(scored) = self.cache_fuzzy_title.borrow_mut().get(&cache_key).cloned() { scored } else { let scored = fuzzy_rank( key.as_str(), &self.norm_title, &self.tri_title, limit, self.title_threshold, ); self.cache_fuzzy_title .borrow_mut() .put(cache_key, scored.clone()); scored }; scored .into_iter() .map(|(id, s)| (&self.books[id], s)) .collect() } /// Fuzzy search over normalized author names. pub fn fuzzy_author(&self, query: &str, limit: usize) -> Vec<(&Book, f64)> { let key = SmolStr::new(normalize(query)); if key.is_empty() || limit == 0 { return Vec::new(); } let cache_key = FuzzyKey { query: key.clone(), limit, }; let scored = if let Some(scored) = self .cache_fuzzy_author .borrow_mut() .get(&cache_key) .cloned() { scored } else { let scored = fuzzy_rank( key.as_str(), &self.norm_author, &self.tri_author, limit, self.author_threshold, ); self.cache_fuzzy_author .borrow_mut() .put(cache_key, scored.clone()); scored }; scored .into_iter() .map(|(id, s)| (&self.books[id], s)) .collect() } /// Fuzzy search over normalized contents blobs. 
pub fn fuzzy_contents( &self, query: &str, limit: usize, ) -> Vec<(&Book, f64)> { let key = SmolStr::new(normalize(query)); if key.is_empty() || limit == 0 { return Vec::new(); } let cache_key = FuzzyKey { query: key.clone(), limit, }; let scored = if let Some(scored) = self .cache_fuzzy_contents .borrow_mut() .get(&cache_key) .cloned() { scored } else { let scored = fuzzy_rank_contents( key.as_str(), &self.norm_contents, &self.tri_contents, limit, self.content_threshold, ); self.cache_fuzzy_contents .borrow_mut() .put(cache_key, scored.clone()); scored }; scored .into_iter() .map(|(id, s)| (&self.books[id], s)) .collect() } /// Combined fuzzy search (title + author + contents). pub fn fuzzy(&self, query: &str, limit: usize) -> Vec<(&Book, f64)> { let key = SmolStr::new(normalize(query)); if key.is_empty() || limit == 0 { return Vec::new(); } let cache_key = FuzzyKey { query: key.clone(), limit, }; let scored = if let Some(scored) = self.cache_fuzzy_all.borrow_mut().get(&cache_key).cloned() { scored } else { let mut totals: AHashMap = AHashMap::new(); let title = fuzzy_rank( key.as_str(), &self.norm_title, &self.tri_title, (limit * 4).clamp(50, 2000), self.title_threshold, ); for (id, s) in title { *totals.entry(id).or_insert(0.0) += s; } let author = fuzzy_rank( key.as_str(), &self.norm_author, &self.tri_author, (limit * 4).clamp(50, 2000), self.author_threshold, ); for (id, s) in author { *totals.entry(id).or_insert(0.0) += s; } let contents = fuzzy_rank_contents( key.as_str(), &self.norm_contents, &self.tri_contents, (limit * 6).clamp(100, 4000), self.content_threshold, ); for (id, s) in contents { *totals.entry(id).or_insert(0.0) += s * 0.7; } let mut scored: Vec<(BookId, f64)> = totals.into_iter().collect(); scored.sort_by(|a, b| { b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal) }); scored.truncate(limit); self.cache_fuzzy_all .borrow_mut() .put(cache_key, scored.clone()); scored }; scored .into_iter() .map(|(id, s)| (&self.books[id], s)) .collect() 
} /// Returns the number of indexed books. #[inline] pub fn book_count(&self) -> usize { self.books.len() } /// Returns a list of all books in the library. #[inline] pub fn all_books<'a>(&'a self) -> impl Iterator + 'a { self.books.iter() } /// Returns the source for a book hash, if present. #[inline] pub fn source_for_hash(&self, hash: &BookHash) -> Option<&BookSource> { self.source_by_hash.get(hash) } } /// Lowercases and normalizes a query string. #[inline] fn normalize(s: &str) -> String { s.to_lowercase() } fn book_plain_text_empty(book: &Book) -> bool { book.content.pages.is_empty() || book .content .pages .iter() .all(|page| page.normalize().trim().is_empty()) } const CACHE_BY_AUTHOR_CAP: usize = 1024; const CACHE_FUZZY_CAP: usize = 256; /// Helper to build LRU caches with non-zero capacity. fn new_lru(cap: usize) -> LruCache { LruCache::new(NonZeroUsize::new(cap).expect("cache cap must be > 0")) } const MAX_CONTENT_INDEX_CHARS: usize = 16_384; /// Joins and normalizes pages for the contents index. fn normalize_contents(pages: &[Component]) -> String { let mut out = String::new(); for p in pages { if out.len() >= MAX_CONTENT_INDEX_CHARS { break; } if !out.is_empty() { out.push('\n'); } out.push_str(&p.to_plain_text()); } let mut out = out.to_lowercase(); if out.len() > MAX_CONTENT_INDEX_CHARS { out.truncate(MAX_CONTENT_INDEX_CHARS); } out } #[inline] fn query_terms(q: &str) -> Vec<&str> { q.split_whitespace().filter(|t| !t.is_empty()).collect() } #[inline] fn token_coverage(terms: &[&str], haystack: &str) -> f64 { if terms.is_empty() { return 0.0; } let matched = terms.iter().filter(|t| haystack.contains(**t)).count(); matched as f64 / terms.len() as f64 } /// Encodes ASCII-ish trigrams into `u32`. 
fn trigrams(s: &str) -> AHashSet { let b = s.as_bytes(); let mut set = AHashSet::new(); if b.is_empty() { return set; } if b.len() < 3 { let b0 = b.first().copied().unwrap_or(0); let b1 = b.get(1).copied().unwrap_or(0); let tri = ((b0 as u32) << 16) | ((b1 as u32) << 8); set.insert(tri); return set; } for w in b.windows(3) { let tri = ((w[0] as u32) << 16) | ((w[1] as u32) << 8) | (w[2] as u32); set.insert(tri); } set } /// Adds all trigrams from a string to the inverted index. fn index_trigrams( index: &mut AHashMap>, id: BookId, norm: &str, ) { for tri in trigrams(norm) { index.entry(tri).or_default().push(id); } } /// Shared pipeline for fuzzy ranking with a custom scoring function. fn fuzzy_rank_with( query: &str, norm_field: &[String], tri_index: &AHashMap>, limit: usize, score_threshold: f64, max_to_score: usize, mut score_fn: F, ) -> Vec<(BookId, f64)> where F: FnMut(BookId, u32, &str, &AHashSet, &str, &[&str]) -> f64, { let q = normalize(query); if q.is_empty() || limit == 0 { return Vec::new(); } let q_tris = trigrams(&q); let terms = query_terms(&q); let mut counts: AHashMap = AHashMap::new(); for tri in q_tris.iter() { if let Some(ids) = tri_index.get(tri) { for &id in ids { *counts.entry(id).or_insert(0) += 1; } } } // Always include obvious direct matches so they don't disappear due // trigram candidate generation edge-cases. 
for (id, s_norm) in norm_field.iter().enumerate() { let contains_query = s_norm.contains(&q); let has_all_terms = !terms.is_empty() && token_coverage(&terms, s_norm) == 1.0; if contains_query { *counts.entry(id).or_insert(0) += 10_000; } if has_all_terms { *counts.entry(id).or_insert(0) += 5_000; } } let candidates: Vec<(BookId, u32)> = if q.len() < 3 || counts.is_empty() { (0..norm_field.len()).map(|id| (id, 0)).collect() } else { let mut v: Vec<(BookId, u32)> = counts.into_iter().collect(); v.sort_by(|a, b| b.1.cmp(&a.1)); v.truncate(max_to_score); v }; let mut scored: Vec<(BookId, f64)> = Vec::with_capacity(candidates.len()); for (id, overlap) in candidates { let s_norm = &norm_field[id]; let score = score_fn(id, overlap, s_norm, &q_tris, &q, &terms); if score >= (score_threshold * 0.75) || s_norm.contains(&q) { scored.push((id, score)); } } scored.sort_by(|a, b| { b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal) }); scored.truncate(limit); scored } /// Fuzzy rank using Jaro-Winkler on the normalized field. fn fuzzy_rank( query: &str, norm_field: &[String], tri_index: &AHashMap>, limit: usize, score_threshold: f64, ) -> Vec<(BookId, f64)> { let max_to_score = (limit * 30).clamp(50, 5000); fuzzy_rank_with( query, norm_field, tri_index, limit, score_threshold, max_to_score, |_, overlap, s_norm, q_tris, q, terms| { let overlap_score = if q_tris.is_empty() { 0.0 } else { overlap as f64 / q_tris.len() as f64 }; let coverage = token_coverage(terms, s_norm); let mut score = (jaro_winkler(q, s_norm) * 0.45) + (overlap_score * 0.25) + (coverage * 0.45); if s_norm == q { score += 1.0; } if s_norm.starts_with(q) { score += 0.45; } if s_norm.contains(q) { score += 0.6; } if !terms.is_empty() && coverage == 1.0 { score += 0.5; } score }, ) } /// Fuzzy rank using trigram overlap against contents blobs. 
fn fuzzy_rank_contents(
    query: &str,
    norm_field: &[String],
    tri_index: &AHashMap<u32, Vec<BookId>>,
    limit: usize,
    score_threshold: f64,
) -> Vec<(BookId, f64)> {
    // Contents blobs are long, so a wider candidate pool is scored and
    // Jaro-Winkler is capped and down-weighted; trigram overlap and term
    // coverage dominate the score instead.
    let max_to_score = (limit * 50).clamp(100, 10_000);
    fuzzy_rank_with(
        query,
        norm_field,
        tri_index,
        limit,
        score_threshold,
        max_to_score,
        |_, overlap, s_norm, q_tris, q, terms| {
            // `.max(1)` guards the division when the query has no trigrams.
            let q_tri_len = q_tris.len().max(1) as f64;
            let overlap_score = (overlap as f64) / q_tri_len;
            let coverage = token_coverage(terms, s_norm);
            let mut score = (overlap_score * 0.55) + (coverage * 0.75);
            if s_norm.contains(q) {
                score += 0.9;
            }
            if !terms.is_empty() && coverage == 1.0 {
                score += 0.5;
            }
            score += jaro_winkler(q, s_norm).min(0.75) * 0.15;
            score
        },
    )
}