semantic bufo search find-bufo.com
bufo

add regex-based exclude patterns for search results

- filter results by name against comma-separated regex patterns passed via the `exclude` param (invalid patterns are silently skipped)
- filtering happens before truncation, so up to top_k results are returned after exclusions (fewer only when too few candidates survive filtering)
- frontend UI exposes exclude input with links to regex101 and claude

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>

+85 -14
+1
Cargo.lock
··· 683 683 "opentelemetry 0.26.0", 684 684 "opentelemetry-instrumentation-actix-web", 685 685 "opentelemetry-otlp 0.26.0", 686 + "regex", 686 687 "reqwest", 687 688 "serde", 688 689 "serde_json",
+1
Cargo.toml
··· 24 24 opentelemetry = { version = "0.26", features = ["trace", "metrics"] } 25 25 opentelemetry-instrumentation-actix-web = { version = "0.23", features = ["metrics"] } 26 26 opentelemetry-otlp = { version = "0.26", features = ["trace", "http-proto", "reqwest-client", "reqwest-rustls"] } 27 + regex = "1.12"
+44 -13
src/search.rs
··· 44 44 use crate::embedding::EmbeddingClient; 45 45 use crate::turbopuffer::{QueryRequest, TurbopufferClient, TurbopufferError}; 46 46 use actix_web::{web, HttpRequest, HttpResponse, Result as ActixResult}; 47 + use regex::Regex; 47 48 use serde::{Deserialize, Serialize}; 48 49 use std::collections::hash_map::DefaultHasher; 49 50 use std::hash::{Hash, Hasher}; ··· 60 61 /// family-friendly mode: filters out inappropriate content (default true) 61 62 #[serde(default = "default_family_friendly")] 62 63 pub family_friendly: bool, 64 + /// comma-separated glob patterns to exclude from results (e.g., "*party*,*sad*") 65 + #[serde(default)] 66 + pub exclude: Option<String>, 63 67 } 64 68 65 69 fn default_top_k() -> usize { ··· 98 102 } 99 103 100 104 /// generate etag for caching based on query parameters 101 - fn generate_etag(query: &str, top_k: usize, alpha: f32, family_friendly: bool) -> String { 105 + fn generate_etag(query: &str, top_k: usize, alpha: f32, family_friendly: bool, exclude: &Option<String>) -> String { 102 106 let mut hasher = DefaultHasher::new(); 103 107 query.hash(&mut hasher); 104 108 top_k.hash(&mut hasher); 105 109 // convert f32 to bits for consistent hashing 106 110 alpha.to_bits().hash(&mut hasher); 107 111 family_friendly.hash(&mut hasher); 112 + exclude.hash(&mut hasher); 108 113 format!("\"{}\"", hasher.finish()) 109 114 } 110 115 ··· 114 119 top_k_val: usize, 115 120 alpha: f32, 116 121 family_friendly: bool, 122 + exclude: Option<String>, 117 123 config: &Config, 118 124 ) -> ActixResult<SearchResponse> { 125 + // parse and compile exclusion regex patterns from comma-separated string 126 + let exclude_patterns: Vec<Regex> = exclude 127 + .as_ref() 128 + .map(|s| { 129 + s.split(',') 130 + .map(|p| p.trim()) 131 + .filter(|p| !p.is_empty()) 132 + .filter_map(|p| Regex::new(p).ok()) // silently skip invalid patterns 133 + .collect() 134 + }) 135 + .unwrap_or_default(); 119 136 120 137 let _search_span = logfire::span!( 121 138 
"bufo_search", 122 139 query = &query_text, 123 140 top_k = top_k_val as i64, 124 141 alpha = alpha as f64, 125 - family_friendly = family_friendly 142 + family_friendly = family_friendly, 143 + exclude_patterns_count = exclude_patterns.len() as i64 126 144 ).entered(); 127 145 146 + let exclude_patterns_str: String = exclude_patterns.iter().map(|r| r.as_str()).collect::<Vec<_>>().join(","); 128 147 logfire::info!( 129 148 "search request received", 130 149 query = &query_text, 131 150 top_k = top_k_val as i64, 132 - alpha = alpha as f64 151 + alpha = alpha as f64, 152 + exclude_patterns = &exclude_patterns_str 133 153 ); 134 154 135 155 let embedding_client = EmbeddingClient::new(config.voyage_api_key.clone()); ··· 170 190 ); 171 191 172 192 // run vector search (semantic) 173 - let search_top_k = top_k_val * 2; // get more results for better fusion 193 + // fetch extra results to ensure we have enough after filtering by family_friendly and exclude patterns 194 + let search_top_k = top_k_val * 5; 174 195 let vector_request = QueryRequest { 175 196 rank_by: vec![ 176 197 serde_json::json!("vector"), ··· 299 320 // and keyword-only results from appearing when alpha=1.0 (pure semantic) 300 321 fused_scores.retain(|(_, score)| *score > 0.001); 301 322 302 - // sort by fused score (descending) and take top_k 323 + // sort by fused score (descending) 303 324 fused_scores.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap()); 304 - fused_scores.truncate(top_k_val); 305 325 306 326 logfire::info!( 307 327 "weighted fusion completed", 308 328 total_candidates = all_results.len() as i64, 309 329 alpha = alpha as f64, 310 - final_results = fused_scores.len() as i64 330 + pre_filter_results = fused_scores.len() as i64 311 331 ); 312 332 313 - // convert to bufo results 333 + // convert to bufo results and apply ALL filtering BEFORE truncating 334 + // this ensures we return top_k results after filtering, not fewer 314 335 let inappropriate_bufos = get_inappropriate_bufos(); 315 336 
let results: Vec<BufoResult> = fused_scores 316 337 .into_iter() ··· 340 361 }) 341 362 .filter(|result| { 342 363 // filter out inappropriate bufos if family_friendly mode is enabled 343 - if family_friendly { 344 - !inappropriate_bufos.iter().any(|&blocked| result.name.contains(blocked)) 345 - } else { 346 - true 364 + if family_friendly && inappropriate_bufos.iter().any(|&blocked| result.name.contains(blocked)) { 365 + return false; 347 366 } 367 + 368 + // filter out results matching any exclude regex pattern 369 + for pattern in &exclude_patterns { 370 + if pattern.is_match(&result.name) { 371 + return false; 372 + } 373 + } 374 + 375 + true 348 376 }) 377 + .take(top_k_val) // take top_k AFTER filtering 349 378 .collect(); 350 379 351 380 let results_count = results.len() as i64; ··· 379 408 query.top_k, 380 409 query.alpha, 381 410 query.family_friendly, 411 + query.exclude.clone(), 382 412 &config 383 413 ).await?; 384 414 Ok(HttpResponse::Ok().json(response)) ··· 391 421 req: HttpRequest, 392 422 ) -> ActixResult<HttpResponse> { 393 423 // generate etag for caching 394 - let etag = generate_etag(&query.query, query.top_k, query.alpha, query.family_friendly); 424 + let etag = generate_etag(&query.query, query.top_k, query.alpha, query.family_friendly, &query.exclude); 395 425 396 426 // check if client has cached version 397 427 if let Some(if_none_match) = req.headers().get("if-none-match") { ··· 407 437 query.top_k, 408 438 query.alpha, 409 439 query.family_friendly, 440 + query.exclude.clone(), 410 441 &config 411 442 ).await?; 412 443
+39 -1
static/index.html
··· 213 213 accent-color: #667eea; 214 214 } 215 215 216 + .option-group a { 217 + color: #667eea; 218 + text-decoration: none; 219 + } 220 + 221 + .option-group a:hover { 222 + text-decoration: underline; 223 + } 224 + 216 225 .sample-queries-container { 217 226 text-align: center; 218 227 margin-bottom: 30px; ··· 600 609 <span>enabled</span> 601 610 </label> 602 611 </div> 612 + 613 + <div class="option-group"> 614 + <div class="option-label"> 615 + <span class="option-name">exclude patterns</span> 616 + </div> 617 + <div class="option-description"> 618 + comma-separated <a href="https://regex101.com/" target="_blank">regex</a> patterns to exclude (e.g., excited,party) 619 + <br> 620 + <span style="color: #999; font-size: 0.9em;">new to regex? <a href="https://claude.ai" target="_blank">claude</a> can write patterns for you</span> 621 + </div> 622 + <input 623 + type="text" 624 + id="excludeInput" 625 + placeholder="pattern1,pattern2" 626 + style="width: 100%; padding: 10px; font-size: 14px;" 627 + > 628 + </div> 603 629 </div> 604 630 </div> 605 631 ··· 632 658 const alphaSlider = document.getElementById('alphaSlider'); 633 659 const alphaValue = document.getElementById('alphaValue'); 634 660 const familyFriendlyCheckbox = document.getElementById('familyFriendlyCheckbox'); 661 + const excludeInput = document.getElementById('excludeInput'); 635 662 636 663 let hasSearched = false; 637 664 ··· 655 682 656 683 const alpha = parseFloat(alphaSlider.value); 657 684 const familyFriendly = familyFriendlyCheckbox.checked; 685 + const exclude = excludeInput.value.trim(); 658 686 659 687 // hide bufo after first search 660 688 if (!hasSearched) { ··· 669 697 params.set('top_k', '20'); 670 698 params.set('alpha', alpha.toString()); 671 699 params.set('family_friendly', familyFriendly.toString()); 700 + if (exclude) params.set('exclude', exclude); 672 701 const newUrl = `${window.location.pathname}?${params.toString()}`; 673 - window.history.pushState({ query, alpha, 
familyFriendly }, '', newUrl); 702 + window.history.pushState({ query, alpha, familyFriendly, exclude }, '', newUrl); 674 703 } 675 704 676 705 searchButton.disabled = true; ··· 685 714 params.set('top_k', '20'); 686 715 params.set('alpha', alpha.toString()); 687 716 params.set('family_friendly', familyFriendly.toString()); 717 + if (exclude) params.set('exclude', exclude); 688 718 689 719 const response = await fetch(`/api/search?${params.toString()}`, { 690 720 method: 'GET', ··· 761 791 if (e.state.familyFriendly !== undefined) { 762 792 familyFriendlyCheckbox.checked = e.state.familyFriendly; 763 793 } 794 + if (e.state.exclude !== undefined) { 795 + excludeInput.value = e.state.exclude; 796 + } 764 797 search(false); 765 798 } 766 799 }); ··· 771 804 const query = params.get('q'); 772 805 const alpha = params.get('alpha'); 773 806 const familyFriendly = params.get('family_friendly'); 807 + const exclude = params.get('exclude'); 774 808 775 809 if (alpha) { 776 810 alphaSlider.value = alpha; ··· 779 813 780 814 if (familyFriendly !== null) { 781 815 familyFriendlyCheckbox.checked = familyFriendly === 'true'; 816 + } 817 + 818 + if (exclude) { 819 + excludeInput.value = exclude; 782 820 } 783 821 784 822 if (query) {