semantic bufo search find-bufo.com
bufo

add regex-based exclude patterns for search results

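- filter results by comma-separated regex patterns via the `exclude` param (patterns are regular expressions, not globs; invalid patterns are silently skipped, and because the list is split on `,` a single pattern cannot itself contain a comma)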
- filtering happens before truncation, so up to top_k results are returned whenever enough of the fetched candidates (top_k * 5) survive filtering
- frontend UI exposes exclude input with links to regex101 and claude

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>

+85 -14
+1
Cargo.lock
··· 683 "opentelemetry 0.26.0", 684 "opentelemetry-instrumentation-actix-web", 685 "opentelemetry-otlp 0.26.0", 686 "reqwest", 687 "serde", 688 "serde_json",
··· 683 "opentelemetry 0.26.0", 684 "opentelemetry-instrumentation-actix-web", 685 "opentelemetry-otlp 0.26.0", 686 + "regex", 687 "reqwest", 688 "serde", 689 "serde_json",
+1
Cargo.toml
··· 24 opentelemetry = { version = "0.26", features = ["trace", "metrics"] } 25 opentelemetry-instrumentation-actix-web = { version = "0.23", features = ["metrics"] } 26 opentelemetry-otlp = { version = "0.26", features = ["trace", "http-proto", "reqwest-client", "reqwest-rustls"] }
··· 24 opentelemetry = { version = "0.26", features = ["trace", "metrics"] } 25 opentelemetry-instrumentation-actix-web = { version = "0.23", features = ["metrics"] } 26 opentelemetry-otlp = { version = "0.26", features = ["trace", "http-proto", "reqwest-client", "reqwest-rustls"] } 27 + regex = "1.12"
+44 -13
src/search.rs
··· 44 use crate::embedding::EmbeddingClient; 45 use crate::turbopuffer::{QueryRequest, TurbopufferClient, TurbopufferError}; 46 use actix_web::{web, HttpRequest, HttpResponse, Result as ActixResult}; 47 use serde::{Deserialize, Serialize}; 48 use std::collections::hash_map::DefaultHasher; 49 use std::hash::{Hash, Hasher}; ··· 60 /// family-friendly mode: filters out inappropriate content (default true) 61 #[serde(default = "default_family_friendly")] 62 pub family_friendly: bool, 63 } 64 65 fn default_top_k() -> usize { ··· 98 } 99 100 /// generate etag for caching based on query parameters 101 - fn generate_etag(query: &str, top_k: usize, alpha: f32, family_friendly: bool) -> String { 102 let mut hasher = DefaultHasher::new(); 103 query.hash(&mut hasher); 104 top_k.hash(&mut hasher); 105 // convert f32 to bits for consistent hashing 106 alpha.to_bits().hash(&mut hasher); 107 family_friendly.hash(&mut hasher); 108 format!("\"{}\"", hasher.finish()) 109 } 110 ··· 114 top_k_val: usize, 115 alpha: f32, 116 family_friendly: bool, 117 config: &Config, 118 ) -> ActixResult<SearchResponse> { 119 120 let _search_span = logfire::span!( 121 "bufo_search", 122 query = &query_text, 123 top_k = top_k_val as i64, 124 alpha = alpha as f64, 125 - family_friendly = family_friendly 126 ).entered(); 127 128 logfire::info!( 129 "search request received", 130 query = &query_text, 131 top_k = top_k_val as i64, 132 - alpha = alpha as f64 133 ); 134 135 let embedding_client = EmbeddingClient::new(config.voyage_api_key.clone()); ··· 170 ); 171 172 // run vector search (semantic) 173 - let search_top_k = top_k_val * 2; // get more results for better fusion 174 let vector_request = QueryRequest { 175 rank_by: vec![ 176 serde_json::json!("vector"), ··· 299 // and keyword-only results from appearing when alpha=1.0 (pure semantic) 300 fused_scores.retain(|(_, score)| *score > 0.001); 301 302 - // sort by fused score (descending) and take top_k 303 fused_scores.sort_by(|a, b| 
b.1.partial_cmp(&a.1).unwrap()); 304 - fused_scores.truncate(top_k_val); 305 306 logfire::info!( 307 "weighted fusion completed", 308 total_candidates = all_results.len() as i64, 309 alpha = alpha as f64, 310 - final_results = fused_scores.len() as i64 311 ); 312 313 - // convert to bufo results 314 let inappropriate_bufos = get_inappropriate_bufos(); 315 let results: Vec<BufoResult> = fused_scores 316 .into_iter() ··· 340 }) 341 .filter(|result| { 342 // filter out inappropriate bufos if family_friendly mode is enabled 343 - if family_friendly { 344 - !inappropriate_bufos.iter().any(|&blocked| result.name.contains(blocked)) 345 - } else { 346 - true 347 } 348 }) 349 .collect(); 350 351 let results_count = results.len() as i64; ··· 379 query.top_k, 380 query.alpha, 381 query.family_friendly, 382 &config 383 ).await?; 384 Ok(HttpResponse::Ok().json(response)) ··· 391 req: HttpRequest, 392 ) -> ActixResult<HttpResponse> { 393 // generate etag for caching 394 - let etag = generate_etag(&query.query, query.top_k, query.alpha, query.family_friendly); 395 396 // check if client has cached version 397 if let Some(if_none_match) = req.headers().get("if-none-match") { ··· 407 query.top_k, 408 query.alpha, 409 query.family_friendly, 410 &config 411 ).await?; 412
··· 44 use crate::embedding::EmbeddingClient; 45 use crate::turbopuffer::{QueryRequest, TurbopufferClient, TurbopufferError}; 46 use actix_web::{web, HttpRequest, HttpResponse, Result as ActixResult}; 47 + use regex::Regex; 48 use serde::{Deserialize, Serialize}; 49 use std::collections::hash_map::DefaultHasher; 50 use std::hash::{Hash, Hasher}; ··· 61 /// family-friendly mode: filters out inappropriate content (default true) 62 #[serde(default = "default_family_friendly")] 63 pub family_friendly: bool, 64 + /// comma-separated glob patterns to exclude from results (e.g., "*party*,*sad*") 65 + #[serde(default)] 66 + pub exclude: Option<String>, 67 } 68 69 fn default_top_k() -> usize { ··· 102 } 103 104 /// generate etag for caching based on query parameters 105 + fn generate_etag(query: &str, top_k: usize, alpha: f32, family_friendly: bool, exclude: &Option<String>) -> String { 106 let mut hasher = DefaultHasher::new(); 107 query.hash(&mut hasher); 108 top_k.hash(&mut hasher); 109 // convert f32 to bits for consistent hashing 110 alpha.to_bits().hash(&mut hasher); 111 family_friendly.hash(&mut hasher); 112 + exclude.hash(&mut hasher); 113 format!("\"{}\"", hasher.finish()) 114 } 115 ··· 119 top_k_val: usize, 120 alpha: f32, 121 family_friendly: bool, 122 + exclude: Option<String>, 123 config: &Config, 124 ) -> ActixResult<SearchResponse> { 125 + // parse and compile exclusion regex patterns from comma-separated string 126 + let exclude_patterns: Vec<Regex> = exclude 127 + .as_ref() 128 + .map(|s| { 129 + s.split(',') 130 + .map(|p| p.trim()) 131 + .filter(|p| !p.is_empty()) 132 + .filter_map(|p| Regex::new(p).ok()) // silently skip invalid patterns 133 + .collect() 134 + }) 135 + .unwrap_or_default(); 136 137 let _search_span = logfire::span!( 138 "bufo_search", 139 query = &query_text, 140 top_k = top_k_val as i64, 141 alpha = alpha as f64, 142 + family_friendly = family_friendly, 143 + exclude_patterns_count = exclude_patterns.len() as i64 144 ).entered(); 145 
146 + let exclude_patterns_str: String = exclude_patterns.iter().map(|r| r.as_str()).collect::<Vec<_>>().join(","); 147 logfire::info!( 148 "search request received", 149 query = &query_text, 150 top_k = top_k_val as i64, 151 + alpha = alpha as f64, 152 + exclude_patterns = &exclude_patterns_str 153 ); 154 155 let embedding_client = EmbeddingClient::new(config.voyage_api_key.clone()); ··· 190 ); 191 192 // run vector search (semantic) 193 + // fetch extra results to ensure we have enough after filtering by family_friendly and exclude patterns 194 + let search_top_k = top_k_val * 5; 195 let vector_request = QueryRequest { 196 rank_by: vec![ 197 serde_json::json!("vector"), ··· 320 // and keyword-only results from appearing when alpha=1.0 (pure semantic) 321 fused_scores.retain(|(_, score)| *score > 0.001); 322 323 + // sort by fused score (descending) 324 fused_scores.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap()); 325 326 logfire::info!( 327 "weighted fusion completed", 328 total_candidates = all_results.len() as i64, 329 alpha = alpha as f64, 330 + pre_filter_results = fused_scores.len() as i64 331 ); 332 333 + // convert to bufo results and apply ALL filtering BEFORE truncating 334 + // this ensures we return top_k results after filtering, not fewer 335 let inappropriate_bufos = get_inappropriate_bufos(); 336 let results: Vec<BufoResult> = fused_scores 337 .into_iter() ··· 361 }) 362 .filter(|result| { 363 // filter out inappropriate bufos if family_friendly mode is enabled 364 + if family_friendly && inappropriate_bufos.iter().any(|&blocked| result.name.contains(blocked)) { 365 + return false; 366 } 367 + 368 + // filter out results matching any exclude regex pattern 369 + for pattern in &exclude_patterns { 370 + if pattern.is_match(&result.name) { 371 + return false; 372 + } 373 + } 374 + 375 + true 376 }) 377 + .take(top_k_val) // take top_k AFTER filtering 378 .collect(); 379 380 let results_count = results.len() as i64; ··· 408 query.top_k, 409 query.alpha, 
410 query.family_friendly, 411 + query.exclude.clone(), 412 &config 413 ).await?; 414 Ok(HttpResponse::Ok().json(response)) ··· 421 req: HttpRequest, 422 ) -> ActixResult<HttpResponse> { 423 // generate etag for caching 424 + let etag = generate_etag(&query.query, query.top_k, query.alpha, query.family_friendly, &query.exclude); 425 426 // check if client has cached version 427 if let Some(if_none_match) = req.headers().get("if-none-match") { ··· 437 query.top_k, 438 query.alpha, 439 query.family_friendly, 440 + query.exclude.clone(), 441 &config 442 ).await?; 443
+39 -1
static/index.html
··· 213 accent-color: #667eea; 214 } 215 216 .sample-queries-container { 217 text-align: center; 218 margin-bottom: 30px; ··· 600 <span>enabled</span> 601 </label> 602 </div> 603 </div> 604 </div> 605 ··· 632 const alphaSlider = document.getElementById('alphaSlider'); 633 const alphaValue = document.getElementById('alphaValue'); 634 const familyFriendlyCheckbox = document.getElementById('familyFriendlyCheckbox'); 635 636 let hasSearched = false; 637 ··· 655 656 const alpha = parseFloat(alphaSlider.value); 657 const familyFriendly = familyFriendlyCheckbox.checked; 658 659 // hide bufo after first search 660 if (!hasSearched) { ··· 669 params.set('top_k', '20'); 670 params.set('alpha', alpha.toString()); 671 params.set('family_friendly', familyFriendly.toString()); 672 const newUrl = `${window.location.pathname}?${params.toString()}`; 673 - window.history.pushState({ query, alpha, familyFriendly }, '', newUrl); 674 } 675 676 searchButton.disabled = true; ··· 685 params.set('top_k', '20'); 686 params.set('alpha', alpha.toString()); 687 params.set('family_friendly', familyFriendly.toString()); 688 689 const response = await fetch(`/api/search?${params.toString()}`, { 690 method: 'GET', ··· 761 if (e.state.familyFriendly !== undefined) { 762 familyFriendlyCheckbox.checked = e.state.familyFriendly; 763 } 764 search(false); 765 } 766 }); ··· 771 const query = params.get('q'); 772 const alpha = params.get('alpha'); 773 const familyFriendly = params.get('family_friendly'); 774 775 if (alpha) { 776 alphaSlider.value = alpha; ··· 779 780 if (familyFriendly !== null) { 781 familyFriendlyCheckbox.checked = familyFriendly === 'true'; 782 } 783 784 if (query) {
··· 213 accent-color: #667eea; 214 } 215 216 + .option-group a { 217 + color: #667eea; 218 + text-decoration: none; 219 + } 220 + 221 + .option-group a:hover { 222 + text-decoration: underline; 223 + } 224 + 225 .sample-queries-container { 226 text-align: center; 227 margin-bottom: 30px; ··· 609 <span>enabled</span> 610 </label> 611 </div> 612 + 613 + <div class="option-group"> 614 + <div class="option-label"> 615 + <span class="option-name">exclude patterns</span> 616 + </div> 617 + <div class="option-description"> 618 + comma-separated <a href="https://regex101.com/" target="_blank">regex</a> patterns to exclude (e.g., excited,party) 619 + <br> 620 + <span style="color: #999; font-size: 0.9em;">new to regex? <a href="https://claude.ai" target="_blank">claude</a> can write patterns for you</span> 621 + </div> 622 + <input 623 + type="text" 624 + id="excludeInput" 625 + placeholder="pattern1,pattern2" 626 + style="width: 100%; padding: 10px; font-size: 14px;" 627 + > 628 + </div> 629 </div> 630 </div> 631 ··· 658 const alphaSlider = document.getElementById('alphaSlider'); 659 const alphaValue = document.getElementById('alphaValue'); 660 const familyFriendlyCheckbox = document.getElementById('familyFriendlyCheckbox'); 661 + const excludeInput = document.getElementById('excludeInput'); 662 663 let hasSearched = false; 664 ··· 682 683 const alpha = parseFloat(alphaSlider.value); 684 const familyFriendly = familyFriendlyCheckbox.checked; 685 + const exclude = excludeInput.value.trim(); 686 687 // hide bufo after first search 688 if (!hasSearched) { ··· 697 params.set('top_k', '20'); 698 params.set('alpha', alpha.toString()); 699 params.set('family_friendly', familyFriendly.toString()); 700 + if (exclude) params.set('exclude', exclude); 701 const newUrl = `${window.location.pathname}?${params.toString()}`; 702 + window.history.pushState({ query, alpha, familyFriendly, exclude }, '', newUrl); 703 } 704 705 searchButton.disabled = true; ··· 714 params.set('top_k', '20'); 
715 params.set('alpha', alpha.toString()); 716 params.set('family_friendly', familyFriendly.toString()); 717 + if (exclude) params.set('exclude', exclude); 718 719 const response = await fetch(`/api/search?${params.toString()}`, { 720 method: 'GET', ··· 791 if (e.state.familyFriendly !== undefined) { 792 familyFriendlyCheckbox.checked = e.state.familyFriendly; 793 } 794 + if (e.state.exclude !== undefined) { 795 + excludeInput.value = e.state.exclude; 796 + } 797 search(false); 798 } 799 }); ··· 804 const query = params.get('q'); 805 const alpha = params.get('alpha'); 806 const familyFriendly = params.get('family_friendly'); 807 + const exclude = params.get('exclude'); 808 809 if (alpha) { 810 alphaSlider.value = alpha; ··· 813 814 if (familyFriendly !== null) { 815 familyFriendlyCheckbox.checked = familyFriendly === 'true'; 816 + } 817 + 818 + if (exclude) { 819 + excludeInput.value = exclude; 820 } 821 822 if (query) {