semantic bufo search
find-bufo.com
bufo
1//! hybrid search combining semantic embeddings with keyword matching
2//!
3//! this implementation uses weighted fusion to balance semantic understanding with exact matches.
4//!
5//! ## search components
6//!
7//! ### 1. semantic search (vector/ANN)
8//! - voyage AI multimodal-3 embeddings via early fusion:
//!   - filename text (e.g., "bufo-jumping-on-bed" → "bufo jumping on bed") + image content
//!   - unified transformer encoder creates 1024-dim vectors
11//! - cosine distance similarity against turbopuffer
12//! - **strength**: finds semantically related bufos (e.g., "happy" → excited, smiling bufos)
13//! - **weakness**: may miss exact filename matches (e.g., "happy" might not surface "bufo-is-happy")
14//!
15//! ### 2. keyword search (BM25)
16//! - full-text search on bufo `name` field (filename without extension)
17//! - BM25 ranking: IDF-weighted term frequency with document length normalization
18//! - **strength**: excellent for exact/partial matches (e.g., "jumping" → "bufos-jumping-on-the-bed")
19//! - **weakness**: no semantic understanding (e.g., "happy" won't find "excited" or "smiling")
20//!
21//! ### 3. weighted fusion
22//! - formula: `score = α * semantic + (1-α) * keyword`
23//! - both scores normalized to 0-1 range before fusion
24//! - configurable `alpha` parameter (default 0.7):
//!   - `α=1.0`: pure semantic (best for conceptual queries like "apocalyptic", "in a giving mood")
//!   - `α=0.7`: default (70% semantic, 30% keyword - balances both strengths)
//!   - `α=0.5`: balanced (equal weight to semantic and keyword signals)
//!   - `α=0.0`: pure keyword (best for exact filename searches)
29//!
30//! ## references
31//!
32//! - voyage multimodal embeddings: https://docs.voyageai.com/docs/multimodal-embeddings
33//! - turbopuffer BM25: https://turbopuffer.com/docs/fts
34//! - weighted fusion: standard approach in modern hybrid search systems (2024)
35
36use crate::config::Config;
37use crate::embedding::VoyageEmbedder;
38use crate::filter::{ContentFilter, Filter, Filterable};
39use crate::providers::{Embedder, VectorSearchError, VectorStore};
40use crate::scoring::{cosine_distance_to_similarity, fuse_scores, normalize_bm25_scores, FusionConfig};
41use crate::turbopuffer::TurbopufferStore;
42use actix_web::{web, HttpRequest, HttpResponse, Result as ActixResult};
43use serde::{Deserialize, Serialize};
44use std::collections::hash_map::DefaultHasher;
45use std::collections::HashMap;
46use std::hash::{Hash, Hasher};
47
48#[derive(Debug, Deserialize)]
49pub struct SearchQuery {
50 pub query: String,
51 #[serde(default = "default_top_k")]
52 pub top_k: usize,
53 /// alpha parameter for weighted fusion (0.0 = pure keyword, 1.0 = pure semantic)
54 /// default 0.7 favors semantic search while still considering exact matches
55 #[serde(default = "default_alpha")]
56 pub alpha: f32,
57 /// family-friendly mode: filters out inappropriate content (default true)
58 #[serde(default = "default_family_friendly")]
59 pub family_friendly: bool,
60 /// comma-separated regex patterns to exclude from results (e.g., "excited,party")
61 #[serde(default)]
62 pub exclude: Option<String>,
63 /// comma-separated regex patterns to include (overrides exclude)
64 #[serde(default)]
65 pub include: Option<String>,
66}
67
68fn default_top_k() -> usize {
69 10
70}
71
72fn default_alpha() -> f32 {
73 0.7
74}
75
76fn default_family_friendly() -> bool {
77 true
78}
79
80#[derive(Debug, Serialize)]
81pub struct SearchResponse {
82 pub results: Vec<BufoResult>,
83}
84
85#[derive(Debug, Serialize, Clone)]
86pub struct BufoResult {
87 pub id: String,
88 pub url: String,
89 pub name: String,
90 pub score: f32,
91}
92
93impl Filterable for BufoResult {
94 fn name(&self) -> &str {
95 &self.name
96 }
97}
98
99/// errors that can occur during search
100#[derive(Debug, thiserror::Error)]
101pub enum SearchError {
102 #[error("embedding error: {0}")]
103 Embedding(#[from] crate::providers::EmbeddingError),
104
105 #[error("vector search error: {0}")]
106 VectorSearch(#[from] VectorSearchError),
107}
108
109impl SearchError {
110 fn into_actix_error(self) -> actix_web::Error {
111 match &self {
112 SearchError::VectorSearch(VectorSearchError::QueryTooLong { .. }) => {
113 actix_web::error::ErrorBadRequest(
114 "search query is too long (max 1024 characters for text search). try a shorter query."
115 )
116 }
117 _ => actix_web::error::ErrorInternalServerError(self.to_string()),
118 }
119 }
120}
121
122/// generate etag for caching based on query parameters
123fn generate_etag(
124 query: &str,
125 top_k: usize,
126 alpha: f32,
127 family_friendly: bool,
128 exclude: &Option<String>,
129 include: &Option<String>,
130) -> String {
131 let mut hasher = DefaultHasher::new();
132 query.hash(&mut hasher);
133 top_k.hash(&mut hasher);
134 alpha.to_bits().hash(&mut hasher);
135 family_friendly.hash(&mut hasher);
136 exclude.hash(&mut hasher);
137 include.hash(&mut hasher);
138 format!("\"{}\"", hasher.finish())
139}
140
141/// execute hybrid search using the provided embedder and vector store
142async fn execute_hybrid_search<E: Embedder, V: VectorStore>(
143 query: &str,
144 top_k: usize,
145 fusion_config: &FusionConfig,
146 embedder: &E,
147 vector_store: &V,
148) -> Result<Vec<(String, f32, HashMap<String, String>)>, SearchError> {
149 // fetch extra results to ensure we have enough after filtering
150 let search_top_k = top_k * 5;
151 let query_owned = query.to_string();
152
153 // generate query embedding
154 let _embed_span = logfire::span!(
155 "embedding.generate",
156 query = &query_owned,
157 model = embedder.name()
158 )
159 .entered();
160
161 let query_embedding = embedder.embed(query).await?;
162
163 logfire::info!(
164 "embedding generated",
165 query = &query_owned,
166 embedding_dim = query_embedding.len() as i64
167 );
168
169 // run both searches in sequence (could parallelize with tokio::join! if needed)
170 let namespace = vector_store.name().to_string();
171
172 let vector_results = {
173 let _span = logfire::span!(
174 "turbopuffer.vector_search",
175 query = &query_owned,
176 top_k = search_top_k as i64,
177 namespace = &namespace
178 )
179 .entered();
180
181 vector_store
182 .search_by_vector(&query_embedding, search_top_k)
183 .await?
184 };
185
186 logfire::info!(
187 "vector search completed",
188 query = &query_owned,
189 results_found = vector_results.len() as i64
190 );
191
192 let bm25_results = {
193 let _span = logfire::span!(
194 "turbopuffer.bm25_search",
195 query = &query_owned,
196 top_k = search_top_k as i64,
197 namespace = &namespace
198 )
199 .entered();
200
201 vector_store.search_by_keyword(query, search_top_k).await?
202 };
203
204 // normalize scores
205 let semantic_scores: HashMap<String, f32> = vector_results
206 .iter()
207 .map(|r| (r.id.clone(), cosine_distance_to_similarity(r.score)))
208 .collect();
209
210 let bm25_raw: Vec<(String, f32)> = bm25_results
211 .iter()
212 .map(|r| (r.id.clone(), r.score))
213 .collect();
214 let keyword_scores = normalize_bm25_scores(&bm25_raw);
215
216 let max_bm25 = bm25_raw
217 .iter()
218 .map(|(_, s)| *s)
219 .fold(f32::NEG_INFINITY, f32::max);
220
221 logfire::info!(
222 "bm25 search completed",
223 query = &query_owned,
224 results_found = bm25_results.len() as i64,
225 max_bm25 = max_bm25 as f64,
226 top_bm25_raw = bm25_raw.first().map(|(_, s)| *s).unwrap_or(0.0) as f64
227 );
228
229 // fuse scores
230 let fused = fuse_scores(&semantic_scores, &keyword_scores, fusion_config);
231
232 logfire::info!(
233 "weighted fusion completed",
234 total_candidates = (vector_results.len() + bm25_results.len()) as i64,
235 alpha = fusion_config.alpha as f64,
236 pre_filter_results = fused.len() as i64
237 );
238
239 // collect attributes from both result sets
240 let mut all_attributes: HashMap<String, HashMap<String, String>> = HashMap::new();
241 for result in vector_results.into_iter().chain(bm25_results.into_iter()) {
242 all_attributes
243 .entry(result.id.clone())
244 .or_insert(result.attributes);
245 }
246
247 // return fused results with attributes
248 Ok(fused
249 .into_iter()
250 .map(|(id, score)| {
251 let attrs = all_attributes.remove(&id).unwrap_or_default();
252 (id, score, attrs)
253 })
254 .collect())
255}
256
257/// shared search implementation used by both POST and GET handlers
258async fn perform_search(
259 query_text: String,
260 top_k_val: usize,
261 alpha: f32,
262 family_friendly: bool,
263 exclude: Option<String>,
264 include: Option<String>,
265 config: &Config,
266) -> ActixResult<SearchResponse> {
267 let content_filter = ContentFilter::new(
268 family_friendly,
269 exclude.as_deref(),
270 include.as_deref(),
271 );
272
273 let _search_span = logfire::span!(
274 "bufo_search",
275 query = &query_text,
276 top_k = top_k_val as i64,
277 alpha = alpha as f64,
278 family_friendly = family_friendly,
279 exclude_patterns_count = content_filter.exclude_pattern_count() as i64
280 )
281 .entered();
282
283 logfire::info!(
284 "search request received",
285 query = &query_text,
286 top_k = top_k_val as i64,
287 alpha = alpha as f64,
288 exclude_patterns = &content_filter.exclude_patterns_str()
289 );
290
291 // create clients
292 let embedder = VoyageEmbedder::new(config.voyage_api_key.clone());
293 let vector_store = TurbopufferStore::new(
294 config.turbopuffer_api_key.clone(),
295 config.turbopuffer_namespace.clone(),
296 );
297
298 let fusion_config = FusionConfig::new(alpha);
299
300 // execute hybrid search
301 let fused_results = execute_hybrid_search(
302 &query_text,
303 top_k_val,
304 &fusion_config,
305 &embedder,
306 &vector_store,
307 )
308 .await
309 .map_err(|e| e.into_actix_error())?;
310
311 // convert to BufoResults and apply filtering
312 let results: Vec<BufoResult> = fused_results
313 .into_iter()
314 .map(|(id, score, attrs)| BufoResult {
315 id: id.clone(),
316 url: attrs.get("url").cloned().unwrap_or_default(),
317 name: attrs.get("name").cloned().unwrap_or_else(|| id.clone()),
318 score,
319 })
320 .filter(|result| content_filter.matches(result))
321 .take(top_k_val)
322 .collect();
323
324 let results_count = results.len() as i64;
325 let top_result_name = results
326 .first()
327 .map(|r| r.name.clone())
328 .unwrap_or_else(|| "none".to_string());
329 let top_score_val = results.first().map(|r| r.score as f64).unwrap_or(0.0);
330 let avg_score_val = if !results.is_empty() {
331 results.iter().map(|r| r.score as f64).sum::<f64>() / results.len() as f64
332 } else {
333 0.0
334 };
335
336 logfire::info!(
337 "search completed successfully",
338 query = &query_text,
339 results_count = results_count,
340 top_result = &top_result_name,
341 top_score = top_score_val,
342 avg_score = avg_score_val
343 );
344
345 Ok(SearchResponse { results })
346}
347
348/// POST /api/search handler (existing API)
349pub async fn search(
350 query: web::Json<SearchQuery>,
351 config: web::Data<Config>,
352) -> ActixResult<HttpResponse> {
353 let response = perform_search(
354 query.query.clone(),
355 query.top_k,
356 query.alpha,
357 query.family_friendly,
358 query.exclude.clone(),
359 query.include.clone(),
360 &config,
361 )
362 .await?;
363 Ok(HttpResponse::Ok().json(response))
364}
365
366/// GET /api/search handler for shareable URLs
367pub async fn search_get(
368 query: web::Query<SearchQuery>,
369 config: web::Data<Config>,
370 req: HttpRequest,
371) -> ActixResult<HttpResponse> {
372 let etag = generate_etag(
373 &query.query,
374 query.top_k,
375 query.alpha,
376 query.family_friendly,
377 &query.exclude,
378 &query.include,
379 );
380
381 if let Some(if_none_match) = req.headers().get("if-none-match") {
382 if if_none_match.to_str().unwrap_or("") == etag {
383 return Ok(HttpResponse::NotModified()
384 .insert_header(("etag", etag))
385 .finish());
386 }
387 }
388
389 let response = perform_search(
390 query.query.clone(),
391 query.top_k,
392 query.alpha,
393 query.family_friendly,
394 query.exclude.clone(),
395 query.include.clone(),
396 &config,
397 )
398 .await?;
399
400 Ok(HttpResponse::Ok()
401 .insert_header(("etag", etag.clone()))
402 .insert_header(("cache-control", "public, max-age=300"))
403 .json(response))
404}