semantic bufo search find-bufo.com
bufo
at main 404 lines 12 kB view raw
1//! hybrid search combining semantic embeddings with keyword matching 2//! 3//! this implementation uses weighted fusion to balance semantic understanding with exact matches. 4//! 5//! ## search components 6//! 7//! ### 1. semantic search (vector/ANN) 8//! - voyage AI multimodal-3 embeddings via early fusion: 9//! - filename text (e.g., "bufo-jumping-on-bed" → "bufo jumping on bed") + image content 10//! - unified transformer encoder creates 1024-dim vectors 11//! - cosine distance similarity against turbopuffer 12//! - **strength**: finds semantically related bufos (e.g., "happy" → excited, smiling bufos) 13//! - **weakness**: may miss exact filename matches (e.g., "happy" might not surface "bufo-is-happy") 14//! 15//! ### 2. keyword search (BM25) 16//! - full-text search on bufo `name` field (filename without extension) 17//! - BM25 ranking: IDF-weighted term frequency with document length normalization 18//! - **strength**: excellent for exact/partial matches (e.g., "jumping" → "bufos-jumping-on-the-bed") 19//! - **weakness**: no semantic understanding (e.g., "happy" won't find "excited" or "smiling") 20//! 21//! ### 3. weighted fusion 22//! - formula: `score = α * semantic + (1-α) * keyword` 23//! - both scores normalized to 0-1 range before fusion 24//! - configurable `alpha` parameter (default 0.7): 25//! - `α=1.0`: pure semantic (best for conceptual queries like "apocalyptic", "in a giving mood") 26//! - `α=0.7`: default (70% semantic, 30% keyword - balances both strengths) 27//! - `α=0.5`: balanced (equal weight to semantic and keyword signals) 28//! - `α=0.0`: pure keyword (best for exact filename searches) 29//! 30//! ## references 31//! 32//! - voyage multimodal embeddings: https://docs.voyageai.com/docs/multimodal-embeddings 33//! - turbopuffer BM25: https://turbopuffer.com/docs/fts 34//! - weighted fusion: standard approach in modern hybrid search systems (2024) 35 36use crate::config::Config; 37use crate::embedding::VoyageEmbedder; 38use crate::filter::{ContentFilter, Filter, Filterable}; 39use crate::providers::{Embedder, VectorSearchError, VectorStore}; 40use crate::scoring::{cosine_distance_to_similarity, fuse_scores, normalize_bm25_scores, FusionConfig}; 41use crate::turbopuffer::TurbopufferStore; 42use actix_web::{web, HttpRequest, HttpResponse, Result as ActixResult}; 43use serde::{Deserialize, Serialize}; 44use std::collections::hash_map::DefaultHasher; 45use std::collections::HashMap; 46use std::hash::{Hash, Hasher}; 47 48#[derive(Debug, Deserialize)] 49pub struct SearchQuery { 50 pub query: String, 51 #[serde(default = "default_top_k")] 52 pub top_k: usize, 53 /// alpha parameter for weighted fusion (0.0 = pure keyword, 1.0 = pure semantic) 54 /// default 0.7 favors semantic search while still considering exact matches 55 #[serde(default = "default_alpha")] 56 pub alpha: f32, 57 /// family-friendly mode: filters out inappropriate content (default true) 58 #[serde(default = "default_family_friendly")] 59 pub family_friendly: bool, 60 /// comma-separated regex patterns to exclude from results (e.g., "excited,party") 61 #[serde(default)] 62 pub exclude: Option<String>, 63 /// comma-separated regex patterns to include (overrides exclude) 64 #[serde(default)] 65 pub include: Option<String>, 66} 67 68fn default_top_k() -> usize { 69 10 70} 71 72fn default_alpha() -> f32 { 73 0.7 74} 75 76fn default_family_friendly() -> bool { 77 true 78} 79 80#[derive(Debug, Serialize)] 81pub struct SearchResponse { 82 pub results: Vec<BufoResult>, 83} 84 85#[derive(Debug, Serialize, Clone)] 86pub struct BufoResult { 87 pub id: String, 88 pub url: String, 89 pub name: String, 90 pub score: f32, 91} 92 93impl Filterable for BufoResult { 94 fn name(&self) -> &str { 95 &self.name 96 } 97} 98 99/// errors that can occur during search 100#[derive(Debug, thiserror::Error)] 101pub enum SearchError { 102 #[error("embedding error: {0}")] 103 Embedding(#[from] crate::providers::EmbeddingError), 104 105 #[error("vector search error: {0}")] 106 VectorSearch(#[from] VectorSearchError), 107} 108 109impl SearchError { 110 fn into_actix_error(self) -> actix_web::Error { 111 match &self { 112 SearchError::VectorSearch(VectorSearchError::QueryTooLong { .. }) => { 113 actix_web::error::ErrorBadRequest( 114 "search query is too long (max 1024 characters for text search). try a shorter query." 115 ) 116 } 117 _ => actix_web::error::ErrorInternalServerError(self.to_string()), 118 } 119 } 120} 121 122/// generate etag for caching based on query parameters 123fn generate_etag( 124 query: &str, 125 top_k: usize, 126 alpha: f32, 127 family_friendly: bool, 128 exclude: &Option<String>, 129 include: &Option<String>, 130) -> String { 131 let mut hasher = DefaultHasher::new(); 132 query.hash(&mut hasher); 133 top_k.hash(&mut hasher); 134 alpha.to_bits().hash(&mut hasher); 135 family_friendly.hash(&mut hasher); 136 exclude.hash(&mut hasher); 137 include.hash(&mut hasher); 138 format!("\"{}\"", hasher.finish()) 139} 140 141/// execute hybrid search using the provided embedder and vector store 142async fn execute_hybrid_search<E: Embedder, V: VectorStore>( 143 query: &str, 144 top_k: usize, 145 fusion_config: &FusionConfig, 146 embedder: &E, 147 vector_store: &V, 148) -> Result<Vec<(String, f32, HashMap<String, String>)>, SearchError> { 149 // fetch extra results to ensure we have enough after filtering 150 let search_top_k = top_k * 5; 151 let query_owned = query.to_string(); 152 153 // generate query embedding 154 let _embed_span = logfire::span!( 155 "embedding.generate", 156 query = &query_owned, 157 model = embedder.name() 158 ) 159 .entered(); 160 161 let query_embedding = embedder.embed(query).await?; 162 163 logfire::info!( 164 "embedding generated", 165 query = &query_owned, 166 embedding_dim = query_embedding.len() as i64 167 ); 168 169 // run both searches in sequence (could parallelize with tokio::join! if needed) 170 let namespace = vector_store.name().to_string(); 171 172 let vector_results = { 173 let _span = logfire::span!( 174 "turbopuffer.vector_search", 175 query = &query_owned, 176 top_k = search_top_k as i64, 177 namespace = &namespace 178 ) 179 .entered(); 180 181 vector_store 182 .search_by_vector(&query_embedding, search_top_k) 183 .await? 184 }; 185 186 logfire::info!( 187 "vector search completed", 188 query = &query_owned, 189 results_found = vector_results.len() as i64 190 ); 191 192 let bm25_results = { 193 let _span = logfire::span!( 194 "turbopuffer.bm25_search", 195 query = &query_owned, 196 top_k = search_top_k as i64, 197 namespace = &namespace 198 ) 199 .entered(); 200 201 vector_store.search_by_keyword(query, search_top_k).await? 202 }; 203 204 // normalize scores 205 let semantic_scores: HashMap<String, f32> = vector_results 206 .iter() 207 .map(|r| (r.id.clone(), cosine_distance_to_similarity(r.score))) 208 .collect(); 209 210 let bm25_raw: Vec<(String, f32)> = bm25_results 211 .iter() 212 .map(|r| (r.id.clone(), r.score)) 213 .collect(); 214 let keyword_scores = normalize_bm25_scores(&bm25_raw); 215 216 let max_bm25 = bm25_raw 217 .iter() 218 .map(|(_, s)| *s) 219 .fold(f32::NEG_INFINITY, f32::max); 220 221 logfire::info!( 222 "bm25 search completed", 223 query = &query_owned, 224 results_found = bm25_results.len() as i64, 225 max_bm25 = max_bm25 as f64, 226 top_bm25_raw = bm25_raw.first().map(|(_, s)| *s).unwrap_or(0.0) as f64 227 ); 228 229 // fuse scores 230 let fused = fuse_scores(&semantic_scores, &keyword_scores, fusion_config); 231 232 logfire::info!( 233 "weighted fusion completed", 234 total_candidates = (vector_results.len() + bm25_results.len()) as i64, 235 alpha = fusion_config.alpha as f64, 236 pre_filter_results = fused.len() as i64 237 ); 238 239 // collect attributes from both result sets 240 let mut all_attributes: HashMap<String, HashMap<String, String>> = HashMap::new(); 241 for result in vector_results.into_iter().chain(bm25_results.into_iter()) { 242 all_attributes 243 .entry(result.id.clone()) 244 .or_insert(result.attributes); 245 } 246 247 // return fused results with attributes 248 Ok(fused 249 .into_iter() 250 .map(|(id, score)| { 251 let attrs = all_attributes.remove(&id).unwrap_or_default(); 252 (id, score, attrs) 253 }) 254 .collect()) 255} 256 257/// shared search implementation used by both POST and GET handlers 258async fn perform_search( 259 query_text: String, 260 top_k_val: usize, 261 alpha: f32, 262 family_friendly: bool, 263 exclude: Option<String>, 264 include: Option<String>, 265 config: &Config, 266) -> ActixResult<SearchResponse> { 267 let content_filter = ContentFilter::new( 268 family_friendly, 269 exclude.as_deref(), 270 include.as_deref(), 271 ); 272 273 let _search_span = logfire::span!( 274 "bufo_search", 275 query = &query_text, 276 top_k = top_k_val as i64, 277 alpha = alpha as f64, 278 family_friendly = family_friendly, 279 exclude_patterns_count = content_filter.exclude_pattern_count() as i64 280 ) 281 .entered(); 282 283 logfire::info!( 284 "search request received", 285 query = &query_text, 286 top_k = top_k_val as i64, 287 alpha = alpha as f64, 288 exclude_patterns = &content_filter.exclude_patterns_str() 289 ); 290 291 // create clients 292 let embedder = VoyageEmbedder::new(config.voyage_api_key.clone()); 293 let vector_store = TurbopufferStore::new( 294 config.turbopuffer_api_key.clone(), 295 config.turbopuffer_namespace.clone(), 296 ); 297 298 let fusion_config = FusionConfig::new(alpha); 299 300 // execute hybrid search 301 let fused_results = execute_hybrid_search( 302 &query_text, 303 top_k_val, 304 &fusion_config, 305 &embedder, 306 &vector_store, 307 ) 308 .await 309 .map_err(|e| e.into_actix_error())?; 310 311 // convert to BufoResults and apply filtering 312 let results: Vec<BufoResult> = fused_results 313 .into_iter() 314 .map(|(id, score, attrs)| BufoResult { 315 id: id.clone(), 316 url: attrs.get("url").cloned().unwrap_or_default(), 317 name: attrs.get("name").cloned().unwrap_or_else(|| id.clone()), 318 score, 319 }) 320 .filter(|result| content_filter.matches(result)) 321 .take(top_k_val) 322 .collect(); 323 324 let results_count = results.len() as i64; 325 let top_result_name = results 326 .first() 327 .map(|r| r.name.clone()) 328 .unwrap_or_else(|| "none".to_string()); 329 let top_score_val = results.first().map(|r| r.score as f64).unwrap_or(0.0); 330 let avg_score_val = if !results.is_empty() { 331 results.iter().map(|r| r.score as f64).sum::<f64>() / results.len() as f64 332 } else { 333 0.0 334 }; 335 336 logfire::info!( 337 "search completed successfully", 338 query = &query_text, 339 results_count = results_count, 340 top_result = &top_result_name, 341 top_score = top_score_val, 342 avg_score = avg_score_val 343 ); 344 345 Ok(SearchResponse { results }) 346} 347 348/// POST /api/search handler (existing API) 349pub async fn search( 350 query: web::Json<SearchQuery>, 351 config: web::Data<Config>, 352) -> ActixResult<HttpResponse> { 353 let response = perform_search( 354 query.query.clone(), 355 query.top_k, 356 query.alpha, 357 query.family_friendly, 358 query.exclude.clone(), 359 query.include.clone(), 360 &config, 361 ) 362 .await?; 363 Ok(HttpResponse::Ok().json(response)) 364} 365 366/// GET /api/search handler for shareable URLs 367pub async fn search_get( 368 query: web::Query<SearchQuery>, 369 config: web::Data<Config>, 370 req: HttpRequest, 371) -> ActixResult<HttpResponse> { 372 let etag = generate_etag( 373 &query.query, 374 query.top_k, 375 query.alpha, 376 query.family_friendly, 377 &query.exclude, 378 &query.include, 379 ); 380 381 if let Some(if_none_match) = req.headers().get("if-none-match") { 382 if if_none_match.to_str().unwrap_or("") == etag { 383 return Ok(HttpResponse::NotModified() 384 .insert_header(("etag", etag)) 385 .finish()); 386 } 387 } 388 389 let response = perform_search( 390 query.query.clone(), 391 query.top_k, 392 query.alpha, 393 query.family_friendly, 394 query.exclude.clone(), 395 query.include.clone(), 396 &config, 397 ) 398 .await?; 399 400 Ok(HttpResponse::Ok() 401 .insert_header(("etag", etag.clone())) 402 .insert_header(("cache-control", "public, max-age=300")) 403 .json(response)) 404}