detectors experiments · angrydutchman.peedee.es/plcbundle@40336cb

+629

cmd/plcbundle/detector.go

··· 1 + // cmd/plcbundle/detector.go 2 + package main 3 + 4 + import ( 5 + "bufio" 6 + "context" 7 + "encoding/json" 8 + "flag" 9 + "fmt" 10 + "os" 11 + "sort" 12 + "strings" 13 + "time" 14 + 15 + "tangled.org/atscan.net/plcbundle/detector" 16 + "tangled.org/atscan.net/plcbundle/plc" 17 + ) 18 + 19 + type defaultLogger struct{} 20 + 21 + func (d *defaultLogger) Printf(format string, v ...interface{}) { 22 + fmt.Fprintf(os.Stderr, format+"\n", v...) 23 + } 24 + 25 + func cmdDetector() { 26 + if len(os.Args) < 3 { 27 + printDetectorUsage() 28 + os.Exit(1) 29 + } 30 + 31 + subcommand := os.Args[2] 32 + 33 + switch subcommand { 34 + case "list": 35 + cmdDetectorList() 36 + case "test": 37 + cmdDetectorTest() 38 + case "run": 39 + cmdDetectorRun() 40 + case "filter": // ← Add this 41 + cmdDetectorFilter() 42 + case "info": 43 + cmdDetectorInfo() 44 + default: 45 + fmt.Fprintf(os.Stderr, "Unknown detector subcommand: %s\n", subcommand) 46 + printDetectorUsage() 47 + os.Exit(1) 48 + } 49 + } 50 + 51 + func printDetectorUsage() { 52 + fmt.Printf(`Usage: plcbundle detector <command> [options] 53 + 54 + Commands: 55 + list List available detectors 56 + test Test a detector on specific bundles 57 + run Run detector and output CSV results 58 + filter Filter JSONL operations from stdin 59 + info Show detailed detector information 60 + 61 + Examples: 62 + plcbundle detector list 63 + plcbundle detector test nostr --bundle 42 64 + plcbundle detector run all --bundles 1-100 > results.csv 65 + plcbundle backfill | plcbundle detector filter all > filtered.jsonl 66 + plcbundle detector info nostr 67 + `) 68 + } 69 + 70 + // cmdDetectorFilter reads JSONL from stdin, filters OUT spam, outputs clean operations 71 + func cmdDetectorFilter() { 72 + if len(os.Args) < 4 { 73 + fmt.Fprintf(os.Stderr, "Usage: plcbundle detector filter <detector1> [detector2...] [--confidence 0.9]\n") 74 + fmt.Fprintf(os.Stderr, "\nFilters OUT operations that match detectors (outputs clean data)\n\n") 75 + fmt.Fprintf(os.Stderr, "Examples:\n") 76 + fmt.Fprintf(os.Stderr, " plcbundle backfill | plcbundle detector filter all > clean.jsonl\n") 77 + fmt.Fprintf(os.Stderr, " plcbundle export --bundle 1 | plcbundle detector filter invalid_handle > clean.jsonl\n") 78 + os.Exit(1) 79 + } 80 + 81 + // Manually separate detector names from flags 82 + var detectorNames []string 83 + var flagArgs []string 84 + 85 + for i := 3; i < len(os.Args); i++ { 86 + arg := os.Args[i] 87 + if strings.HasPrefix(arg, "-") { 88 + flagArgs = os.Args[i:] 89 + break 90 + } 91 + detectorNames = append(detectorNames, arg) 92 + } 93 + 94 + if len(detectorNames) == 0 { 95 + fmt.Fprintf(os.Stderr, "Error: at least one detector name required\n") 96 + os.Exit(1) 97 + } 98 + 99 + // Parse flags 100 + fs := flag.NewFlagSet("detector filter", flag.ExitOnError) 101 + confidence := fs.Float64("confidence", 0.90, "minimum confidence") 102 + fs.Parse(flagArgs) 103 + 104 + // Setup registry 105 + registry := detector.DefaultRegistry() 106 + 107 + // Handle "all" keyword 108 + if len(detectorNames) == 1 && detectorNames[0] == "all" { 109 + detectorNames = registry.Names() 110 + fmt.Fprintf(os.Stderr, "Using all detectors: %s\n", strings.Join(detectorNames, ", ")) 111 + } 112 + 113 + // Get all detectors 114 + detectors := make([]detector.Detector, 0, len(detectorNames)) 115 + for _, name := range detectorNames { 116 + d, err := registry.Get(name) 117 + if err != nil { 118 + fmt.Fprintf(os.Stderr, "Error: %v\n", err) 119 + os.Exit(1) 120 + } 121 + detectors = append(detectors, d) 122 + } 123 + 124 + // Log to stderr 125 + fmt.Fprintf(os.Stderr, "Filtering OUT spam with %d detector(s)\n", len(detectorNames)) 126 + if len(detectorNames) <= 5 { 127 + fmt.Fprintf(os.Stderr, "Detectors: %s\n", strings.Join(detectorNames, ", ")) 128 + } 129 + fmt.Fprintf(os.Stderr, "Min confidence: %.2f\n\n", *confidence) 130 + 131 + ctx := context.Background() 132 + scanner := bufio.NewScanner(os.Stdin) 133 + 134 + // Set large buffer for long lines 135 + buf := make([]byte, 0, 64*1024) 136 + scanner.Buffer(buf, 1024*1024) 137 + 138 + cleanCount := 0 139 + filteredCount := 0 140 + totalCount := 0 141 + totalBytes := int64(0) // ← Add total bytes 142 + filteredBytes := int64(0) // ← Add filtered bytes 143 + 144 + // Read JSONL from stdin 145 + for scanner.Scan() { 146 + line := scanner.Bytes() 147 + if len(line) == 0 { 148 + continue 149 + } 150 + 151 + totalCount++ 152 + opSize := int64(len(line)) 153 + totalBytes += opSize // ← Track total 154 + 155 + // Parse operation 156 + var op plc.PLCOperation 157 + if err := json.Unmarshal(line, &op); err != nil { 158 + fmt.Fprintf(os.Stderr, "Warning: failed to parse line %d: %v\n", totalCount, err) 159 + continue 160 + } 161 + 162 + // Run all detectors on this operation 163 + isSpam := false 164 + 165 + for _, det := range detectors { 166 + match, err := det.Detect(ctx, op) 167 + if err != nil { 168 + continue 169 + } 170 + 171 + if match != nil && match.Confidence >= *confidence { 172 + // Detected as spam - filter it out 173 + isSpam = true 174 + break 175 + } 176 + } 177 + 178 + // Output only if NOT spam (clean operation) 179 + if !isSpam { 180 + cleanCount++ 181 + fmt.Println(string(line)) 182 + } else { 183 + filteredCount++ 184 + filteredBytes += opSize // ← Track filtered bytes 185 + } 186 + 187 + // Progress to stderr 188 + if totalCount%1000 == 0 { 189 + fmt.Fprintf(os.Stderr, "Processed: %d | Clean: %d | Filtered: %d | Saved: %s\r", 190 + totalCount, cleanCount, filteredCount, formatBytes(filteredBytes)) 191 + } 192 + } 193 + 194 + if err := scanner.Err(); err != nil { 195 + fmt.Fprintf(os.Stderr, "\nError reading stdin: %v\n", err) 196 + os.Exit(1) 197 + } 198 + 199 + // Final stats to stderr 200 + fmt.Fprintf(os.Stderr, "\n\n") 201 + fmt.Fprintf(os.Stderr, "✓ Filter complete\n") 202 + fmt.Fprintf(os.Stderr, " Total operations: %d\n", totalCount) 203 + fmt.Fprintf(os.Stderr, " Clean: %d (%.2f%%)\n", cleanCount, float64(cleanCount)/float64(totalCount)*100) 204 + fmt.Fprintf(os.Stderr, " Filtered out: %d (%.2f%%)\n", filteredCount, float64(filteredCount)/float64(totalCount)*100) 205 + fmt.Fprintf(os.Stderr, " Total size: %s\n", formatBytes(totalBytes)) 206 + fmt.Fprintf(os.Stderr, " Filtered size: %s (%.2f%%)\n", formatBytes(filteredBytes), float64(filteredBytes)/float64(totalBytes)*100) 207 + fmt.Fprintf(os.Stderr, " Clean size: %s (%.2f%%)\n", formatBytes(totalBytes-filteredBytes), float64(totalBytes-filteredBytes)/float64(totalBytes)*100) 208 + fmt.Fprintf(os.Stderr, " Detectors used: %d\n", len(detectorNames)) 209 + } 210 + 211 + func cmdDetectorList() { 212 + registry := detector.DefaultRegistry() 213 + detectors := registry.List() 214 + 215 + // Sort by name 216 + sort.Slice(detectors, func(i, j int) bool { 217 + return detectors[i].Name() < detectors[j].Name() 218 + }) 219 + 220 + fmt.Printf("Available detectors:\n\n") 221 + for _, d := range detectors { 222 + fmt.Printf(" %-20s %s (v%s)\n", d.Name(), d.Description(), d.Version()) 223 + } 224 + fmt.Printf("\nUse 'plcbundle detector info <name>' for details\n") 225 + } 226 + 227 + func cmdDetectorTest() { 228 + // Extract detector name first 229 + if len(os.Args) < 4 { 230 + fmt.Fprintf(os.Stderr, "Usage: plcbundle detector test <detector-name> --bundle N\n") 231 + os.Exit(1) 232 + } 233 + 234 + detectorName := os.Args[3] 235 + 236 + // Parse flags from os.Args[4:] 237 + fs := flag.NewFlagSet("detector test", flag.ExitOnError) 238 + bundleNum := fs.Int("bundle", 0, "bundle number to test") 239 + confidence := fs.Float64("confidence", 0.90, "minimum confidence threshold") 240 + verbose := fs.Bool("v", false, "verbose output") 241 + fs.Parse(os.Args[4:]) // ← Changed from os.Args[3:] 242 + 243 + if *bundleNum == 0 { 244 + fmt.Fprintf(os.Stderr, "Error: --bundle required\n") 245 + os.Exit(1) 246 + } 247 + 248 + // Load bundle 249 + mgr, _, err := getManager("") 250 + if err != nil { 251 + fmt.Fprintf(os.Stderr, "Error: %v\n", err) 252 + os.Exit(1) 253 + } 254 + defer mgr.Close() 255 + 256 + ctx := context.Background() 257 + bundle, err := mgr.LoadBundle(ctx, *bundleNum) 258 + if err != nil { 259 + fmt.Fprintf(os.Stderr, "Error loading bundle: %v\n", err) 260 + os.Exit(1) 261 + } 262 + 263 + fmt.Printf("Testing detector '%s' on bundle %06d...\n", detectorName, *bundleNum) 264 + fmt.Printf("Min confidence: %.2f\n\n", *confidence) 265 + 266 + // Run detector 267 + registry := detector.DefaultRegistry() 268 + config := detector.DefaultConfig() 269 + config.MinConfidence = *confidence 270 + 271 + runner := detector.NewRunner(registry, config, &defaultLogger{}) 272 + results, err := runner.RunOnBundle(ctx, detectorName, bundle) 273 + if err != nil { 274 + fmt.Fprintf(os.Stderr, "Detection failed: %v\n", err) 275 + os.Exit(1) 276 + } 277 + 278 + // Calculate stats 279 + stats := detector.CalculateStats(results, len(bundle.Operations)) 280 + 281 + // Display results 282 + fmt.Printf("Results:\n") 283 + fmt.Printf(" Total operations: %d\n", stats.TotalOperations) 284 + fmt.Printf(" Matches found: %d (%.2f%%)\n", stats.MatchedCount, stats.MatchRate*100) 285 + fmt.Printf("\n") 286 + 287 + if len(stats.ByReason) > 0 { 288 + fmt.Printf("Breakdown by reason:\n") 289 + for reason, count := range stats.ByReason { 290 + pct := float64(count) / float64(stats.MatchedCount) * 100 291 + fmt.Printf(" %-25s %d (%.1f%%)\n", reason, count, pct) 292 + } 293 + fmt.Printf("\n") 294 + } 295 + 296 + if len(stats.ByCategory) > 0 { 297 + fmt.Printf("Breakdown by category:\n") 298 + for category, count := range stats.ByCategory { 299 + pct := float64(count) / float64(stats.MatchedCount) * 100 300 + fmt.Printf(" %-25s %d (%.1f%%)\n", category, count, pct) 301 + } 302 + fmt.Printf("\n") 303 + } 304 + 305 + if len(stats.ByConfidence) > 0 { 306 + fmt.Printf("Confidence distribution:\n") 307 + for bucket, count := range stats.ByConfidence { 308 + pct := float64(count) / float64(stats.MatchedCount) * 100 309 + fmt.Printf(" %-25s %d (%.1f%%)\n", bucket, count, pct) 310 + } 311 + fmt.Printf("\n") 312 + } 313 + 314 + if *verbose && len(results) > 0 { 315 + fmt.Printf("Sample matches (first 10):\n") 316 + displayCount := 10 317 + if len(results) < displayCount { 318 + displayCount = len(results) 319 + } 320 + 321 + for i := 0; i < displayCount; i++ { 322 + res := results[i] 323 + fmt.Printf(" %d. Position %d: %s\n", i+1, res.Position, res.DID) 324 + fmt.Printf(" Reason: %s (confidence: %.2f)\n", res.Match.Reason, res.Match.Confidence) 325 + if res.Match.Note != "" { 326 + fmt.Printf(" Note: %s\n", res.Match.Note) 327 + } 328 + } 329 + 330 + if len(results) > displayCount { 331 + fmt.Printf(" ... and %d more\n", len(results)-displayCount) 332 + } 333 + } 334 + } 335 + 336 + func cmdDetectorRun() { 337 + if len(os.Args) < 4 { 338 + fmt.Fprintf(os.Stderr, "Usage: plcbundle detector run <detector1> [detector2...] --bundles 1-100\n") 339 + fmt.Fprintf(os.Stderr, "\nUse 'all' to run all available detectors\n") 340 + os.Exit(1) 341 + } 342 + 343 + // Manually separate detector names from flags 344 + var detectorNames []string 345 + var flagArgs []string 346 + 347 + for i := 3; i < len(os.Args); i++ { 348 + arg := os.Args[i] 349 + if strings.HasPrefix(arg, "-") { 350 + // This and all remaining are flags 351 + flagArgs = os.Args[i:] 352 + break 353 + } 354 + // Detector name 355 + detectorNames = append(detectorNames, arg) 356 + } 357 + 358 + if len(detectorNames) == 0 { 359 + fmt.Fprintf(os.Stderr, "Error: at least one detector name required\n") 360 + fmt.Fprintf(os.Stderr, "\nExamples:\n") 361 + fmt.Fprintf(os.Stderr, " plcbundle detector run invalid_handle --bundles 1-100\n") 362 + fmt.Fprintf(os.Stderr, " plcbundle detector run invalid_handle aka_spam --bundles 1-100\n") 363 + fmt.Fprintf(os.Stderr, " plcbundle detector run all --bundles 1-100\n") 364 + os.Exit(1) 365 + } 366 + 367 + // Parse flags 368 + fs := flag.NewFlagSet("detector run", flag.ExitOnError) 369 + bundleRange := fs.String("bundles", "", "bundle range (e.g., '1-100')") 370 + confidence := fs.Float64("confidence", 0.90, "minimum confidence") 371 + fs.Parse(flagArgs) 372 + 373 + if *bundleRange == "" { 374 + fmt.Fprintf(os.Stderr, "Error: --bundles required\n") 375 + os.Exit(1) 376 + } 377 + 378 + // Parse bundle range 379 + start, end, err := parseBundleRange(*bundleRange) 380 + if err != nil { 381 + fmt.Fprintf(os.Stderr, "Error: %v\n", err) 382 + os.Exit(1) 383 + } 384 + 385 + // Load manager 386 + mgr, _, err := getManager("") 387 + if err != nil { 388 + fmt.Fprintf(os.Stderr, "Error: %v\n", err) 389 + os.Exit(1) 390 + } 391 + defer mgr.Close() 392 + 393 + // Setup registry 394 + registry := detector.DefaultRegistry() 395 + config := detector.DefaultConfig() 396 + config.MinConfidence = *confidence 397 + 398 + // Handle "all" keyword - expand to all available detectors 399 + if len(detectorNames) == 1 && detectorNames[0] == "all" { 400 + detectorNames = registry.Names() 401 + fmt.Fprintf(os.Stderr, "Using all available detectors: %s\n", strings.Join(detectorNames, ", ")) 402 + } 403 + 404 + // Log to stderr 405 + fmt.Fprintf(os.Stderr, "Running %d detector(s) on bundles %d-%d...\n", len(detectorNames), start, end) 406 + if len(detectorNames) <= 5 { 407 + fmt.Fprintf(os.Stderr, "Detectors: %s\n", strings.Join(detectorNames, ", ")) 408 + } 409 + fmt.Fprintf(os.Stderr, "Min confidence: %.2f\n\n", *confidence) 410 + 411 + // Get all detectors 412 + detectors := make([]detector.Detector, 0, len(detectorNames)) 413 + for _, name := range detectorNames { 414 + d, err := registry.Get(name) 415 + if err != nil { 416 + fmt.Fprintf(os.Stderr, "Error: %v\n", err) 417 + os.Exit(1) 418 + } 419 + detectors = append(detectors, d) 420 + } 421 + 422 + ctx := context.Background() 423 + 424 + // Write CSV header to stdout 425 + fmt.Println("bundle,position,cid,detectors,confidence,detected_at,size") 426 + 427 + // Track statistics 428 + totalOps := 0 429 + matchCount := 0 430 + totalBytes := int64(0) 431 + matchedBytes := int64(0) 432 + bundlesProcessed := 0 433 + detectorMatchCounts := make(map[string]int) 434 + 435 + totalBundles := end - start + 1 436 + 437 + // Create progress bar with byte tracking enabled 438 + fmt.Fprintf(os.Stderr, "Processing bundles:\n") 439 + progress := NewProgressBar(totalBundles) 440 + progress.showBytes = true // Enable byte tracking 441 + 442 + // Process bundles and stream results 443 + for bundleNum := start; bundleNum <= end; bundleNum++ { 444 + bundle, err := mgr.LoadBundle(ctx, bundleNum) 445 + if err != nil { 446 + // Don't update progress on error, just log 447 + progress.Finish() 448 + fmt.Fprintf(os.Stderr, "\n⚠️ Warning: failed to load bundle %d: %v\n", bundleNum, err) 449 + progress = NewProgressBar(totalBundles) 450 + progress.showBytes = true 451 + progress.SetWithBytes(bundleNum-start, totalBytes) 452 + continue 453 + } 454 + 455 + bundlesProcessed++ 456 + totalOps += len(bundle.Operations) 457 + 458 + // Process each operation with all detectors 459 + for position, op := range bundle.Operations { 460 + // Calculate operation size first 461 + var opSize int 462 + if len(op.RawJSON) > 0 { 463 + opSize = len(op.RawJSON) 464 + } else { 465 + // Fallback: marshal to get size 466 + data, _ := json.Marshal(op) 467 + opSize = len(data) 468 + } 469 + totalBytes += int64(opSize) 470 + 471 + // Collect all matches for this operation 472 + var matchedDetectors []string 473 + var maxConfidence float64 474 + var detectedAt time.Time 475 + 476 + // Run all detectors on this operation 477 + for _, det := range detectors { 478 + match, err := det.Detect(ctx, op) 479 + if err != nil { 480 + continue 481 + } 482 + 483 + // Skip if no match or confidence too low 484 + if match == nil || match.Confidence < *confidence { 485 + continue 486 + } 487 + 488 + // Collect detector name 489 + matchedDetectors = append(matchedDetectors, det.Name()) 490 + detectorMatchCounts[det.Name()]++ 491 + 492 + // Track highest confidence 493 + if match.Confidence > maxConfidence { 494 + maxConfidence = match.Confidence 495 + } 496 + 497 + // Use current time for first match 498 + if detectedAt.IsZero() { 499 + detectedAt = time.Now() 500 + } 501 + } 502 + 503 + // Output only if at least one detector matched 504 + if len(matchedDetectors) > 0 { 505 + matchCount++ 506 + matchedBytes += int64(opSize) 507 + 508 + fmt.Printf("%d,%d,%s,%s,%.2f,%s,%d\n", 509 + bundleNum, 510 + position, 511 + op.CID, 512 + strings.Join(matchedDetectors, ";"), 513 + maxConfidence, 514 + detectedAt.Format("2006-01-02T15:04:05Z"), 515 + opSize, 516 + ) 517 + } 518 + } 519 + 520 + // Update progress with bytes 521 + progress.SetWithBytes(bundleNum-start+1, totalBytes) 522 + } 523 + 524 + // Finish progress bar 525 + progress.Finish() 526 + 527 + // Final stats to stderr 528 + fmt.Fprintf(os.Stderr, "\n") 529 + fmt.Fprintf(os.Stderr, "✓ Detection complete\n") 530 + fmt.Fprintf(os.Stderr, " Bundles processed: %d\n", bundlesProcessed) 531 + fmt.Fprintf(os.Stderr, " Total operations: %d\n", totalOps) 532 + fmt.Fprintf(os.Stderr, " Matches found: %d (%.2f%%)\n", matchCount, float64(matchCount)/float64(totalOps)*100) 533 + fmt.Fprintf(os.Stderr, " Clean operations: %d (%.2f%%)\n", totalOps-matchCount, float64(totalOps-matchCount)/float64(totalOps)*100) 534 + fmt.Fprintf(os.Stderr, "\n") 535 + fmt.Fprintf(os.Stderr, " Total size: %s\n", formatBytes(totalBytes)) 536 + fmt.Fprintf(os.Stderr, " Matched size: %s (%.2f%%)\n", formatBytes(matchedBytes), float64(matchedBytes)/float64(totalBytes)*100) 537 + fmt.Fprintf(os.Stderr, " Clean size: %s (%.2f%%)\n", formatBytes(totalBytes-matchedBytes), float64(totalBytes-matchedBytes)/float64(totalBytes)*100) 538 + 539 + if matchedBytes > 0 { 540 + fmt.Fprintf(os.Stderr, "\n") 541 + fmt.Fprintf(os.Stderr, " 💾 Potential savings if filtered: %s (%.2f%% reduction)\n", 542 + formatBytes(matchedBytes), 543 + float64(matchedBytes)/float64(totalBytes)*100) 544 + } 545 + 546 + fmt.Fprintf(os.Stderr, "\n") 547 + fmt.Fprintf(os.Stderr, " Detectors used: %d\n", len(detectorNames)) 548 + 549 + // Show breakdown by detector if multiple used 550 + if len(detectorNames) > 1 { 551 + fmt.Fprintf(os.Stderr, "\n") 552 + fmt.Fprintf(os.Stderr, " Matches by detector:\n") 553 + for _, name := range detectorNames { 554 + count := detectorMatchCounts[name] 555 + if count > 0 { 556 + pct := float64(count) / float64(matchCount) * 100 557 + fmt.Fprintf(os.Stderr, " %-20s %d (%.1f%%)\n", name, count, pct) 558 + } else { 559 + fmt.Fprintf(os.Stderr, " %-20s 0\n", name) 560 + } 561 + } 562 + } 563 + } 564 + 565 + func cmdDetectorInfo() { 566 + if len(os.Args) < 4 { 567 + fmt.Fprintf(os.Stderr, "Usage: plcbundle detector info <name>\n") 568 + os.Exit(1) 569 + } 570 + 571 + detectorName := os.Args[3] 572 + 573 + registry := detector.DefaultRegistry() 574 + d, err := registry.Get(detectorName) 575 + if err != nil { 576 + fmt.Fprintf(os.Stderr, "Error: %v\n", err) 577 + os.Exit(1) 578 + } 579 + 580 + fmt.Printf("Detector: %s\n", d.Name()) 581 + fmt.Printf("Version: %s\n", d.Version()) 582 + fmt.Printf("Description: %s\n", d.Description()) 583 + fmt.Printf("\n") 584 + 585 + // Show example usage 586 + fmt.Printf("Usage examples:\n") 587 + fmt.Printf(" # Test on single bundle\n") 588 + fmt.Printf(" plcbundle detector test %s --bundle 42\n\n", d.Name()) 589 + fmt.Printf(" # Run on range and save\n") 590 + fmt.Printf(" plcbundle detector run %s --bundles 1-100 --output results.csv\n\n", d.Name()) 591 + fmt.Printf(" # Use with filter creation\n") 592 + fmt.Printf(" plcbundle filter detect --detector %s --bundles 1-100\n", d.Name()) 593 + } 594 + 595 + // Helper functions 596 + 597 + func parseBundleRange(rangeStr string) (start, end int, err error) { 598 + // Handle single bundle number 599 + if !strings.Contains(rangeStr, "-") { 600 + var num int 601 + _, err = fmt.Sscanf(rangeStr, "%d", &num) 602 + if err != nil { 603 + return 0, 0, fmt.Errorf("invalid bundle number: %w", err) 604 + } 605 + return num, num, nil 606 + } 607 + 608 + // Handle range (e.g., "1-100") 609 + parts := strings.Split(rangeStr, "-") 610 + if len(parts) != 2 { 611 + return 0, 0, fmt.Errorf("invalid range format (expected: N or start-end)") 612 + } 613 + 614 + _, err = fmt.Sscanf(parts[0], "%d", &start) 615 + if err != nil { 616 + return 0, 0, fmt.Errorf("invalid start: %w", err) 617 + } 618 + 619 + _, err = fmt.Sscanf(parts[1], "%d", &end) 620 + if err != nil { 621 + return 0, 0, fmt.Errorf("invalid end: %w", err) 622 + } 623 + 624 + if start > end { 625 + return 0, 0, fmt.Errorf("start must be <= end") 626 + } 627 + 628 + return start, end, nil 629 + }

+97 -12

cmd/plcbundle/main.go

··· 2 2 3 3 import ( 4 4 "context" 5 + "encoding/json" 5 6 "flag" 6 7 "fmt" 7 8 "net/http" ··· 82 83 cmdServe() 83 84 case "compare": 84 85 cmdCompare() 86 + case "detector": 87 + cmdDetector() 85 88 case "version": 86 89 fmt.Printf("plcbundle version %s\n", version) 87 90 fmt.Printf(" commit: %s\n", gitCommit) ··· 110 113 mempool Show mempool status and operations 111 114 serve Start HTTP server to serve bundle data 112 115 compare Compare local index with target index 116 + detector 113 117 version Show version 114 118 115 119 Security Model: ··· 845 849 846 850 func cmdExport() { 847 851 fs := flag.NewFlagSet("export", flag.ExitOnError) 848 - count := fs.Int("count", 1000, "number of operations to export") 852 + bundles := fs.String("bundles", "", "bundle number or range (e.g., '42' or '1-100')") 853 + all := fs.Bool("all", false, "export all bundles") 854 + count := fs.Int("count", 0, "limit number of operations (0 = all)") 849 855 after := fs.String("after", "", "timestamp to start after (RFC3339)") 850 856 fs.Parse(os.Args[2:]) 851 857 858 + // Validate flags 859 + if !*all && *bundles == "" { 860 + fmt.Fprintf(os.Stderr, "Usage: plcbundle export --bundles <number|range> [options]\n") 861 + fmt.Fprintf(os.Stderr, " or: plcbundle export --all [options]\n") 862 + fmt.Fprintf(os.Stderr, "\nExamples:\n") 863 + fmt.Fprintf(os.Stderr, " plcbundle export --bundles 42\n") 864 + fmt.Fprintf(os.Stderr, " plcbundle export --bundles 1-100\n") 865 + fmt.Fprintf(os.Stderr, " plcbundle export --all\n") 866 + fmt.Fprintf(os.Stderr, " plcbundle export --all --count 50000\n") 867 + fmt.Fprintf(os.Stderr, " plcbundle export --bundles 42 | jq .\n") 868 + os.Exit(1) 869 + } 870 + 871 + // Load manager 852 872 mgr, _, err := getManager("") 853 873 if err != nil { 854 874 fmt.Fprintf(os.Stderr, "Error: %v\n", err) ··· 856 876 } 857 877 defer mgr.Close() 858 878 859 - // Parse after time 879 + // Determine bundle range 880 + var start, end int 881 + if *all { 882 + // Export all bundles 883 + index := mgr.GetIndex() 884 + bundles := index.GetBundles() 885 + if len(bundles) == 0 { 886 + fmt.Fprintf(os.Stderr, "No bundles available\n") 887 + os.Exit(1) 888 + } 889 + start = bundles[0].BundleNumber 890 + end = bundles[len(bundles)-1].BundleNumber 891 + 892 + fmt.Fprintf(os.Stderr, "Exporting all bundles (%d-%d)\n", start, end) 893 + } else { 894 + // Parse bundle range 895 + start, end, err = parseBundleRange(*bundles) 896 + if err != nil { 897 + fmt.Fprintf(os.Stderr, "Error: %v\n", err) 898 + os.Exit(1) 899 + } 900 + fmt.Fprintf(os.Stderr, "Exporting bundles %d-%d\n", start, end) 901 + } 902 + 903 + // Log to stderr 904 + if *count > 0 { 905 + fmt.Fprintf(os.Stderr, "Limit: %d operations\n", *count) 906 + } 907 + if *after != "" { 908 + fmt.Fprintf(os.Stderr, "After: %s\n", *after) 909 + } 910 + fmt.Fprintf(os.Stderr, "\n") 911 + 912 + // Parse after time if provided 860 913 var afterTime time.Time 861 914 if *after != "" { 862 915 afterTime, err = time.Parse(time.RFC3339, *after) ··· 867 920 } 868 921 869 922 ctx := context.Background() 870 - ops, err := mgr.ExportOperations(ctx, afterTime, *count) 871 - if err != nil { 872 - fmt.Fprintf(os.Stderr, "Export failed: %v\n", err) 873 - os.Exit(1) 874 - } 923 + exported := 0 875 924 876 - // Output as JSONL 877 - for _, op := range ops { 878 - if len(op.RawJSON) > 0 { 879 - fmt.Println(string(op.RawJSON)) 925 + // Export operations from bundles 926 + for bundleNum := start; bundleNum <= end; bundleNum++ { 927 + // Check if we've reached the limit 928 + if *count > 0 && exported >= *count { 929 + break 930 + } 931 + 932 + fmt.Fprintf(os.Stderr, "Processing bundle %d...\r", bundleNum) 933 + 934 + bundle, err := mgr.LoadBundle(ctx, bundleNum) 935 + if err != nil { 936 + fmt.Fprintf(os.Stderr, "\nWarning: failed to load bundle %d: %v\n", bundleNum, err) 937 + continue 938 + } 939 + 940 + // Output operations 941 + for _, op := range bundle.Operations { 942 + // Check after time filter 943 + if !afterTime.IsZero() && op.CreatedAt.Before(afterTime) { 944 + continue 945 + } 946 + 947 + // Check count limit 948 + if *count > 0 && exported >= *count { 949 + break 950 + } 951 + 952 + // Output operation as JSONL 953 + if len(op.RawJSON) > 0 { 954 + fmt.Println(string(op.RawJSON)) 955 + } else { 956 + // Fallback to marshaling 957 + data, _ := json.Marshal(op) 958 + fmt.Println(string(data)) 959 + } 960 + 961 + exported++ 880 962 } 881 963 } 882 964 883 - fmt.Fprintf(os.Stderr, "Exported %d operations\n", len(ops)) 965 + // Final stats to stderr 966 + fmt.Fprintf(os.Stderr, "\n\n") 967 + fmt.Fprintf(os.Stderr, "✓ Export complete\n") 968 + fmt.Fprintf(os.Stderr, " Exported: %d operations\n", exported) 884 969 } 885 970 886 971 func cmdBackfill() {

+6 -5

cmd/plcbundle/progress.go

··· 2 2 3 3 import ( 4 4 "fmt" 5 + "os" 5 6 "strings" 6 7 "sync" 7 8 "time" ··· 80 81 pb.current = pb.total 81 82 pb.currentBytes = pb.totalBytes 82 83 pb.print() 83 - fmt.Println() // New line after completion 84 + fmt.Fprintf(os.Stderr, "\n") // ← FIXED: Use stderr 84 85 } 85 86 86 87 // print renders the progress bar (must be called with lock held) ··· 113 114 eta = time.Duration(float64(remaining)/speed) * time.Second 114 115 } 115 116 116 - // Print progress bar 117 - if pb.showBytes && pb.totalBytes > 0 { 117 + // Show MB/s if bytes are being tracked (changed condition) 118 + if pb.showBytes && pb.currentBytes > 0 { 118 119 // Calculate MB/s (using decimal units: 1 MB = 1,000,000 bytes) 119 120 mbProcessed := float64(pb.currentBytes) / (1000 * 1000) 120 121 mbPerSec := mbProcessed / elapsed.Seconds() 121 122 122 - fmt.Printf("\r [%s] %6.2f%% | %d/%d bundles | %.1f/s | %.1f MB/s | ETA: %s ", 123 + fmt.Fprintf(os.Stderr, "\r [%s] %6.2f%% | %d/%d bundles | %.1f/s | %.1f MB/s | ETA: %s ", 123 124 bar, 124 125 percent, 125 126 pb.current, ··· 128 129 mbPerSec, 129 130 formatETA(eta)) 130 131 } else { 131 - fmt.Printf("\r [%s] %6.2f%% | %d/%d bundles | %.1f/s | ETA: %s ", 132 + fmt.Fprintf(os.Stderr, "\r [%s] %6.2f%% | %d/%d bundles | %.1f/s | ETA: %s ", 132 133 bar, 133 134 percent, 134 135 pb.current,

+467

detector/builtin.go

··· 1 + // detector/builtin.go 2 + package detector 3 + 4 + import ( 5 + "context" 6 + "regexp" 7 + "strings" 8 + 9 + "tangled.org/atscan.net/plcbundle/plc" 10 + ) 11 + 12 + // InvalidHandleDetector detects operations with invalid handle patterns 13 + type InvalidHandleDetector struct { 14 + // Valid handle regex: lowercase letters, numbers, hyphens, dots only 15 + validHandlePattern *regexp.Regexp 16 + } 17 + 18 + func NewInvalidHandleDetector() *InvalidHandleDetector { 19 + return &InvalidHandleDetector{ 20 + // Valid handle: alphanumeric, hyphens, dots (no underscores!) 21 + validHandlePattern: regexp.MustCompile(`^at://[a-z0-9][a-z0-9-]*(\.[a-z0-9][a-z0-9-]*)*\.[a-z]+$`), 22 + } 23 + } 24 + 25 + func (d *InvalidHandleDetector) Name() string { return "invalid_handle" } 26 + func (d *InvalidHandleDetector) Description() string { 27 + return "Detects operations with invalid handle patterns (underscores, invalid chars)" 28 + } 29 + func (d *InvalidHandleDetector) Version() string { return "1.0.0" } 30 + 31 + func (d *InvalidHandleDetector) Detect(ctx context.Context, op plc.PLCOperation) (*Match, error) { 32 + if alsoKnownAs, ok := op.Operation["alsoKnownAs"].([]interface{}); ok { 33 + for _, aka := range alsoKnownAs { 34 + if str, ok := aka.(string); ok { 35 + // Check if it's an at:// handle 36 + if !strings.HasPrefix(str, "at://") { 37 + continue 38 + } 39 + 40 + // Check for underscore (invalid in Bluesky handles) 41 + if strings.Contains(str, "_") { 42 + return &Match{ 43 + Reason: "underscore_in_handle", 44 + Category: "invalid_handle", 45 + Confidence: 0.99, 46 + Note: "Handle contains underscore which is invalid in Bluesky", 47 + Metadata: map[string]interface{}{ 48 + "invalid_handle": str, 49 + "violation": "underscore_character", 50 + }, 51 + }, nil 52 + } 53 + 54 + // Check if handle matches valid pattern 55 + if !d.validHandlePattern.MatchString(str) { 56 + return &Match{ 57 + Reason: "invalid_handle_pattern", 58 + Category: "invalid_handle", 59 + Confidence: 0.95, 60 + Note: "Handle does not match valid Bluesky handle pattern", 61 + Metadata: map[string]interface{}{ 62 + "invalid_handle": str, 63 + "violation": "pattern_mismatch", 64 + }, 65 + }, nil 66 + } 67 + } 68 + } 69 + } 70 + 71 + return nil, nil 72 + } 73 + 74 + // AlsoKnownAsSpamDetector detects excessive/garbage alsoKnownAs entries 75 + type AlsoKnownAsSpamDetector struct { 76 + maxLegitimateEntries int 77 + minGarbageLength int 78 + } 79 + 80 + func NewAlsoKnownAsSpamDetector() *AlsoKnownAsSpamDetector { 81 + return &AlsoKnownAsSpamDetector{ 82 + maxLegitimateEntries: 3, // Normal operations have 1-3 entries 83 + minGarbageLength: 100, // Garbage strings are very long 84 + } 85 + } 86 + 87 + func (d *AlsoKnownAsSpamDetector) Name() string { return "aka_spam" } 88 + func (d *AlsoKnownAsSpamDetector) Description() string { 89 + return "Detects spam through excessive or garbage alsoKnownAs entries" 90 + } 91 + func (d *AlsoKnownAsSpamDetector) Version() string { return "1.0.0" } 92 + 93 + func (d *AlsoKnownAsSpamDetector) Detect(ctx context.Context, op plc.PLCOperation) (*Match, error) { 94 + if alsoKnownAs, ok := op.Operation["alsoKnownAs"].([]interface{}); ok { 95 + entryCount := len(alsoKnownAs) 96 + 97 + // Count different types of entries 98 + atURICount := 0 99 + garbageCount := 0 100 + var garbageExamples []string 101 + 102 + for _, aka := range alsoKnownAs { 103 + if str, ok := aka.(string); ok { 104 + if strings.HasPrefix(str, "at://") { 105 + atURICount++ 106 + } else if len(str) > d.minGarbageLength { 107 + garbageCount++ 108 + if len(garbageExamples) < 2 { 109 + // Store first few for evidence 110 + preview := str 111 + if len(preview) > 50 { 112 + preview = preview[:50] + "..." 113 + } 114 + garbageExamples = append(garbageExamples, preview) 115 + } 116 + } 117 + } 118 + } 119 + 120 + // Detection: Excessive entries 121 + if entryCount > d.maxLegitimateEntries { 122 + confidence := 0.80 123 + if garbageCount > 0 { 124 + confidence = 0.95 // Higher confidence if garbage detected 125 + } 126 + 127 + return &Match{ 128 + Reason: "excessive_aka_entries", 129 + Category: "spam", 130 + Confidence: confidence, 131 + Note: "Operation has excessive alsoKnownAs entries", 132 + Metadata: map[string]interface{}{ 133 + "total_entries": entryCount, 134 + "at_uri_count": atURICount, 135 + "garbage_count": garbageCount, 136 + "garbage_examples": garbageExamples, 137 + }, 138 + }, nil 139 + } 140 + 141 + // Detection: Garbage entries present (even if count is low) 142 + if garbageCount > 0 { 143 + return &Match{ 144 + Reason: "garbage_aka_entries", 145 + Category: "spam", 146 + Confidence: 0.98, 147 + Note: "Operation contains garbage/random strings in alsoKnownAs", 148 + Metadata: map[string]interface{}{ 149 + "total_entries": entryCount, 150 + "garbage_count": garbageCount, 151 + "garbage_examples": garbageExamples, 152 + }, 153 + }, nil 154 + } 155 + } 156 + 157 + return nil, nil 158 + } 159 + 160 + // CompositeSpamDetector combines multiple signals for higher confidence 161 + type CompositeSpamDetector struct { 162 + invalidHandle *InvalidHandleDetector 163 + akaSpam *AlsoKnownAsSpamDetector 164 + } 165 + 166 + func NewCompositeSpamDetector() *CompositeSpamDetector { 167 + return &CompositeSpamDetector{ 168 + invalidHandle: NewInvalidHandleDetector(), 169 + akaSpam: NewAlsoKnownAsSpamDetector(), 170 + } 171 + } 172 + 173 + func (d *CompositeSpamDetector) Name() string { return "composite_spam" } 174 + func (d *CompositeSpamDetector) Description() string { 175 + return "Combines multiple spam signals for high-confidence detection" 176 + } 177 + func (d *CompositeSpamDetector) Version() string { return "1.0.0" } 178 + 179 + func (d *CompositeSpamDetector) Detect(ctx context.Context, op plc.PLCOperation) (*Match, error) { 180 + // Check both detectors 181 + invalidHandleMatch, _ := d.invalidHandle.Detect(ctx, op) 182 + akaSpamMatch, _ := d.akaSpam.Detect(ctx, op) 183 + 184 + // If both match, very high confidence 185 + if invalidHandleMatch != nil && akaSpamMatch != nil { 186 + return &Match{ 187 + Reason: "multiple_spam_indicators", 188 + Category: "spam", 189 + Confidence: 0.99, 190 + Note: "Operation has both invalid handle and excessive alsoKnownAs entries", 191 + Metadata: map[string]interface{}{ 192 + "invalid_handle_reason": invalidHandleMatch.Reason, 193 + "aka_spam_reason": akaSpamMatch.Reason, 194 + "invalid_handle_data": invalidHandleMatch.Metadata, 195 + "aka_spam_data": akaSpamMatch.Metadata, 196 + }, 197 + }, nil 198 + } 199 + 200 + // Return whichever matched 201 + if invalidHandleMatch != nil { 202 + return invalidHandleMatch, nil 203 + } 204 + if akaSpamMatch != nil { 205 + return akaSpamMatch, nil 206 + } 207 + 208 + return nil, nil 209 + } 210 + 211 + // SpamPDSDetector detects known spam PDS endpoints 212 + type SpamPDSDetector struct { 213 + spamEndpoints map[string]bool 214 + spamDomains map[string]bool 215 + } 216 + 217 + func NewSpamPDSDetector() *SpamPDSDetector { 218 + return &SpamPDSDetector{ 219 + spamEndpoints: map[string]bool{ 220 + "pds.trump.com": true, 221 + // Add more as discovered 222 + }, 223 + spamDomains: map[string]bool{ 224 + "trump.com": true, 225 + "donald.trump.com": true, 226 + // Add more as discovered 227 + }, 228 + } 229 + } 230 + 231 + func (d *SpamPDSDetector) Name() string { return "spam_pds" } 232 + func (d *SpamPDSDetector) Description() string { 233 + return "Detects operations using known spam PDS endpoints and fake domain claims" 234 + } 235 + func (d *SpamPDSDetector) Version() string { return "1.0.0" } 236 + 237 + func (d *SpamPDSDetector) Detect(ctx context.Context, op plc.PLCOperation) (*Match, error) { 238 + // Check PDS endpoint 239 + if services, ok := op.Operation["services"].(map[string]interface{}); ok { 240 + if pds, ok := services["atproto_pds"].(map[string]interface{}); ok { 241 + if endpoint, ok := pds["endpoint"].(string); ok { 242 + host := extractHost(endpoint) 243 + 244 + // Check if it's a known spam PDS 245 + if d.spamEndpoints[host] { 246 + return &Match{ 247 + Reason: "spam_pds_endpoint", 248 + Category: "spam", 249 + Confidence: 0.99, 250 + Note: "Operation uses known spam PDS endpoint", 251 + Metadata: map[string]interface{}{ 252 + "endpoint": endpoint, 253 + "host": host, 254 + }, 255 + }, nil 256 + } 257 + } 258 + } 259 + } 260 + 261 + // Check for spam domain claims in alsoKnownAs 262 + if alsoKnownAs, ok := op.Operation["alsoKnownAs"].([]interface{}); ok { 263 + for _, aka := range alsoKnownAs { 264 + if str, ok := aka.(string); ok { 265 + if !strings.HasPrefix(str, "at://") { 266 + continue 267 + } 268 + 269 + // Extract domain from at:// URI 270 + domain := strings.TrimPrefix(str, "at://") 271 + if idx := strings.Index(domain, "/"); idx > 0 { 272 + domain = domain[:idx] 273 + } 274 + 275 + // Check if claiming spam domain 276 + if d.spamDomains[domain] { 277 + return &Match{ 278 + Reason: "fake_domain_claim", 279 + Category: "impersonation", 280 + Confidence: 0.99, 281 + Note: "Operation claims known spam/fake domain", 282 + Metadata: map[string]interface{}{ 283 + "claimed_domain": domain, 284 + "handle": str, 285 + }, 286 + }, nil 287 + } 288 + 289 + // Check for subdomain patterns (like jr.donald.trump.com) 290 + for spamDomain := range d.spamDomains { 291 + if strings.HasSuffix(domain, "."+spamDomain) || domain == spamDomain { 292 + return &Match{ 293 + Reason: "fake_domain_claim", 294 + Category: "impersonation", 295 + Confidence: 0.99, 296 + Note: "Operation claims domain related to known spam domain", 297 + Metadata: map[string]interface{}{ 298 + "claimed_domain": domain, 299 + "spam_domain": spamDomain, 300 + }, 301 + }, nil 302 + } 303 + } 304 + } 305 + } 306 + } 307 + 308 + return nil, nil 309 + } 310 + 311 + // ServiceAbuseDetector detects operations with abused service structures 312 + type ServiceAbuseDetector struct { 313 + maxServiceTypeLength int 314 + maxEndpointLength int 315 + maxHandleLength int 316 + } 317 + 318 + func NewServiceAbuseDetector() *ServiceAbuseDetector { 319 + return &ServiceAbuseDetector{ 320 + maxServiceTypeLength: 100, // Normal types are short (e.g., "AtprotoPersonalDataServer") 321 + maxEndpointLength: 200, // Normal endpoints are reasonable URLs 322 + maxHandleLength: 100, // Normal handles are short 323 + } 324 + } 325 + 326 + func (d *ServiceAbuseDetector) Name() string { return "service_abuse" } 327 + func (d *ServiceAbuseDetector) Description() string { 328 + return "Detects operations with abused service structures (random strings, numeric keys)" 329 + } 330 + func (d *ServiceAbuseDetector) Version() string { return "1.0.0" } 331 + 332 + func (d *ServiceAbuseDetector) Detect(ctx context.Context, op plc.PLCOperation) (*Match, error) { 333 + if services, ok := op.Operation["services"].(map[string]interface{}); ok { 334 + // Check for numeric service keys (spam uses "0", "1", "2" instead of proper names) 335 + hasNumericKeys := false 336 + numericKeyCount := 0 337 + 338 + for key := range services { 339 + // Check if key is a digit 340 + if len(key) == 1 && key >= "0" && key <= "9" { 341 + hasNumericKeys = true 342 + numericKeyCount++ 343 + } 344 + } 345 + 346 + if hasNumericKeys && numericKeyCount > 1 { 347 + return &Match{ 348 + Reason: "numeric_service_keys", 349 + Category: "service_abuse", 350 + Confidence: 0.98, 351 + Note: "Services use numeric keys instead of proper names", 352 + Metadata: map[string]interface{}{ 353 + "numeric_key_count": numericKeyCount, 354 + }, 355 + }, nil 356 + } 357 + 358 + // Check each service for abuse patterns 359 + for serviceName, serviceData := range services { 360 + if serviceMap, ok := serviceData.(map[string]interface{}); ok { 361 + // Check service type length 362 + if serviceType, ok := serviceMap["type"].(string); ok { 363 + if len(serviceType) > d.maxServiceTypeLength { 364 + return &Match{ 365 + Reason: "excessive_service_type_length", 366 + Category: "service_abuse", 367 + Confidence: 0.99, 368 + Note: "Service type field contains excessively long random string", 369 + Metadata: map[string]interface{}{ 370 + "service_name": serviceName, 371 + "type_length": len(serviceType), 372 + "type_preview": serviceType[:50] + "...", 373 + }, 374 + }, nil 375 + } 376 + } 377 + 378 + // Check endpoint length 379 + if endpoint, ok := serviceMap["endpoint"].(string); ok { 380 + if len(endpoint) > d.maxEndpointLength { 381 + return &Match{ 382 + Reason: "excessive_endpoint_length", 383 + Category: "service_abuse", 384 + Confidence: 0.99, 385 + Note: "Service endpoint contains excessively long random string", 386 + Metadata: map[string]interface{}{ 387 + "service_name": serviceName, 388 + "endpoint_length": len(endpoint), 389 + "endpoint_preview": endpoint[:min(100, len(endpoint))] + "...", 390 + }, 391 + }, nil 392 + } 393 + } 394 + } 395 + } 396 + } 397 + 398 + // Check for excessively long handles in alsoKnownAs 399 + if alsoKnownAs, ok := op.Operation["alsoKnownAs"].([]interface{}); ok { 400 + for _, aka := range alsoKnownAs { 401 + if str, ok := aka.(string); ok { 402 + if strings.HasPrefix(str, "at://") { 403 + handle := strings.TrimPrefix(str, "at://") 404 + if len(handle) > d.maxHandleLength { 405 + return &Match{ 406 + Reason: "excessive_handle_length", 407 + Category: "service_abuse", 408 + Confidence: 0.98, 409 + Note: "Handle contains excessively long random string", 410 + Metadata: map[string]interface{}{ 411 + "handle_length": len(handle), 412 + "handle_preview": handle[:min(50, len(handle))] + "...", 413 + }, 414 + }, nil 415 + } 416 + } 417 + } 418 + } 419 + } 420 + 421 + // Check for empty verificationMethods (common in this spam) 422 + if vm, ok := op.Operation["verificationMethods"].(map[string]interface{}); ok { 423 + if len(vm) == 0 { 424 + // Empty verificationMethods alone isn't enough, but combined with other signals... 425 + // Check if there are other suspicious signals 426 + if services, ok := op.Operation["services"].(map[string]interface{}); ok { 427 + if len(services) > 2 { 428 + // Multiple services + empty verificationMethods = suspicious 429 + return &Match{ 430 + Reason: "empty_verification_methods", 431 + Category: "service_abuse", 432 + Confidence: 0.85, 433 + Note: "Empty verificationMethods with multiple services", 434 + Metadata: map[string]interface{}{ 435 + "service_count": len(services), 436 + }, 437 + }, nil 438 + } 439 + } 440 + } 441 + } 442 + 443 + return nil, nil 444 + } 445 + 446 + // Helper function for min 447 + func min(a, b int) int { 448 + if a < b { 449 + return a 450 + } 451 + return b 452 + } 453 + 454 + // Helper functions 455 + 456 + func extractHost(endpoint string) string { 457 + // Extract host from URL 458 + endpoint = strings.TrimPrefix(endpoint, "http://") 459 + endpoint = strings.TrimPrefix(endpoint, "https://") 460 + if idx := strings.Index(endpoint, "/"); idx > 0 { 461 + endpoint = endpoint[:idx] 462 + } 463 + if idx := strings.Index(endpoint, ":"); idx > 0 { 464 + endpoint = endpoint[:idx] 465 + } 466 + return endpoint 467 + }

+63

detector/detector.go

··· 1 + // detector/detector.go 2 + package detector 3 + 4 + import ( 5 + "context" 6 + "time" 7 + 8 + "tangled.org/atscan.net/plcbundle/plc" 9 + ) 10 + 11 + // Detector represents a spam detection algorithm 12 + type Detector interface { 13 + // Name returns the detector's unique identifier 14 + Name() string 15 + 16 + // Description returns a human-readable description 17 + Description() string 18 + 19 + // Detect analyzes an operation and returns a match result 20 + Detect(ctx context.Context, op plc.PLCOperation) (*Match, error) 21 + 22 + // Version returns the detector version 23 + Version() string 24 + } 25 + 26 + // Match represents a positive spam detection 27 + type Match struct { 28 + Reason string // Short identifier (e.g., "nostr_crosspost") 29 + Category string // Broader category (e.g., "cross_posting") 30 + Confidence float64 // 0.0 to 1.0 31 + Note string // Optional human-readable explanation 32 + Metadata map[string]interface{} // Additional context 33 + } 34 + 35 + // Result represents the outcome of running a detector on an operation 36 + type Result struct { 37 + BundleNumber int 38 + Position int 39 + DID string 40 + CID string // ← Add this field 41 + Match *Match // nil if no match 42 + Error error 43 + DetectorName string 44 + DetectedAt time.Time 45 + } 46 + 47 + // Config holds detector configuration 48 + type Config struct { 49 + MinConfidence float64 50 + Timeout time.Duration 51 + Parallel bool 52 + Workers int 53 + } 54 + 55 + // DefaultConfig returns sensible defaults 56 + func DefaultConfig() *Config { 57 + return &Config{ 58 + MinConfidence: 0.90, 59 + Timeout: 5 * time.Second, 60 + Parallel: true, 61 + Workers: 4, 62 + } 63 + }

+87

detector/registry.go

··· 1 + // detector/registry.go 2 + package detector 3 + 4 + import ( 5 + "fmt" 6 + "sync" 7 + ) 8 + 9 + // Registry manages available detectors 10 + type Registry struct { 11 + detectors map[string]Detector 12 + mu sync.RWMutex 13 + } 14 + 15 + // NewRegistry creates a new detector registry 16 + func NewRegistry() *Registry { 17 + return &Registry{ 18 + detectors: make(map[string]Detector), 19 + } 20 + } 21 + 22 + // Register adds a detector to the registry 23 + func (r *Registry) Register(d Detector) error { 24 + r.mu.Lock() 25 + defer r.mu.Unlock() 26 + 27 + name := d.Name() 28 + if _, exists := r.detectors[name]; exists { 29 + return fmt.Errorf("detector %q already registered", name) 30 + } 31 + 32 + r.detectors[name] = d 33 + return nil 34 + } 35 + 36 + // Get retrieves a detector by name 37 + func (r *Registry) Get(name string) (Detector, error) { 38 + r.mu.RLock() 39 + defer r.mu.RUnlock() 40 + 41 + d, ok := r.detectors[name] 42 + if !ok { 43 + return nil, fmt.Errorf("detector %q not found", name) 44 + } 45 + 46 + return d, nil 47 + } 48 + 49 + // List returns all registered detectors 50 + func (r *Registry) List() []Detector { 51 + r.mu.RLock() 52 + defer r.mu.RUnlock() 53 + 54 + detectors := make([]Detector, 0, len(r.detectors)) 55 + for _, d := range r.detectors { 56 + detectors = append(detectors, d) 57 + } 58 + 59 + return detectors 60 + } 61 + 62 + // Names returns all detector names 63 + func (r *Registry) Names() []string { 64 + r.mu.RLock() 65 + defer r.mu.RUnlock() 66 + 67 + names := make([]string, 0, len(r.detectors)) 68 + for name := range r.detectors { 69 + names = append(names, name) 70 + } 71 + 72 + return names 73 + } 74 + 75 + // DefaultRegistry returns a registry with built-in detectors 76 + func DefaultRegistry() *Registry { 77 + r := NewRegistry() 78 + 79 + // Register real spam detectors 80 + r.Register(NewInvalidHandleDetector()) 81 + r.Register(NewAlsoKnownAsSpamDetector()) 82 + r.Register(NewCompositeSpamDetector()) 83 + r.Register(NewSpamPDSDetector()) 84 + r.Register(NewServiceAbuseDetector()) 85 + 86 + return r 87 + }

+216

detector/runner.go

··· 1 + // detector/runner.go 2 + package detector 3 + 4 + import ( 5 + "context" 6 + "fmt" 7 + "sync" 8 + "time" 9 + 10 + "tangled.org/atscan.net/plcbundle/bundle" 11 + "tangled.org/atscan.net/plcbundle/plc" 12 + ) 13 + 14 + // Runner executes detectors against operations 15 + type Runner struct { 16 + registry *Registry 17 + config *Config 18 + logger Logger 19 + } 20 + 21 + type Logger interface { 22 + Printf(format string, v ...interface{}) 23 + } 24 + 25 + // NewRunner creates a new detector runner 26 + func NewRunner(registry *Registry, config *Config, logger Logger) *Runner { 27 + if config == nil { 28 + config = DefaultConfig() 29 + } 30 + return &Runner{ 31 + registry: registry, 32 + config: config, 33 + logger: logger, 34 + } 35 + } 36 + 37 + // RunOnBundle runs detector(s) on all operations in a bundle 38 + func (r *Runner) RunOnBundle(ctx context.Context, detectorName string, b *bundle.Bundle) ([]*Result, error) { 39 + detector, err := r.registry.Get(detectorName) 40 + if err != nil { 41 + return nil, err 42 + } 43 + 44 + var results []*Result 45 + 46 + if r.config.Parallel { 47 + results = r.runParallel(ctx, detector, b) 48 + } else { 49 + results = r.runSequential(ctx, detector, b) 50 + } 51 + 52 + // Filter by minimum confidence 53 + filtered := make([]*Result, 0) 54 + for _, res := range results { 55 + if res.Match != nil && res.Match.Confidence >= r.config.MinConfidence { 56 + filtered = append(filtered, res) 57 + } 58 + } 59 + 60 + return filtered, nil 61 + } 62 + 63 + func (r *Runner) runSequential(ctx context.Context, detector Detector, b *bundle.Bundle) []*Result { 64 + results := make([]*Result, 0) 65 + 66 + for pos, op := range b.Operations { 67 + select { 68 + case <-ctx.Done(): 69 + return results 70 + default: 71 + } 72 + 73 + result := r.detectOne(ctx, detector, b.BundleNumber, pos, op) 74 + if result.Match != nil || result.Error != nil { 75 + results = append(results, result) 76 + } 77 + } 78 + 79 + return results 80 + } 81 + 82 + func (r *Runner) runParallel(ctx context.Context, detector Detector, b *bundle.Bundle) []*Result { 83 + type job struct { 84 + pos int 85 + op plc.PLCOperation 86 + } 87 + 88 + jobs := make(chan job, len(b.Operations)) 89 + resultsChan := make(chan *Result, len(b.Operations)) 90 + 91 + // Start workers 92 + var wg sync.WaitGroup 93 + for i := 0; i < r.config.Workers; i++ { 94 + wg.Add(1) 95 + go func() { 96 + defer wg.Done() 97 + for j := range jobs { 98 + select { 99 + case <-ctx.Done(): 100 + return 101 + default: 102 + } 103 + 104 + result := r.detectOne(ctx, detector, b.BundleNumber, j.pos, j.op) 105 + if result.Match != nil || result.Error != nil { 106 + resultsChan <- result 107 + } 108 + } 109 + }() 110 + } 111 + 112 + // Send jobs 113 + for pos, op := range b.Operations { 114 + jobs <- job{pos: pos, op: op} 115 + } 116 + close(jobs) 117 + 118 + // Wait for completion 119 + go func() { 120 + wg.Wait() 121 + close(resultsChan) 122 + }() 123 + 124 + // Collect results 125 + results := make([]*Result, 0) 126 + for result := range resultsChan { 127 + results = append(results, result) 128 + } 129 + 130 + return results 131 + } 132 + 133 + func (r *Runner) detectOne(ctx context.Context, detector Detector, bundleNum, pos int, op plc.PLCOperation) *Result { 134 + // Create timeout context 135 + detectCtx, cancel := context.WithTimeout(ctx, r.config.Timeout) 136 + defer cancel() 137 + 138 + result := &Result{ 139 + BundleNumber: bundleNum, 140 + Position: pos, 141 + DID: op.DID, 142 + CID: op.CID, // ← Add this 143 + DetectorName: detector.Name(), 144 + DetectedAt: time.Now(), 145 + } 146 + 147 + match, err := detector.Detect(detectCtx, op) 148 + result.Match = match 149 + result.Error = err 150 + 151 + return result 152 + } 153 + 154 + // RunMultipleDetectors runs multiple detectors on a bundle 155 + func (r *Runner) RunMultipleDetectors(ctx context.Context, detectorNames []string, b *bundle.Bundle) (map[string][]*Result, error) { 156 + allResults := make(map[string][]*Result) 157 + 158 + for _, name := range detectorNames { 159 + results, err := r.RunOnBundle(ctx, name, b) 160 + if err != nil { 161 + return nil, fmt.Errorf("detector %s failed: %w", name, err) 162 + } 163 + allResults[name] = results 164 + } 165 + 166 + return allResults, nil 167 + } 168 + 169 + // Stats represents detection statistics 170 + type Stats struct { 171 + TotalOperations int 172 + MatchedCount int 173 + MatchRate float64 174 + ByReason map[string]int 175 + ByCategory map[string]int 176 + ByConfidence map[string]int // 0.9-1.0, 0.8-0.9, etc. 177 + } 178 + 179 + // CalculateStats computes statistics from results 180 + func CalculateStats(results []*Result, totalOps int) *Stats { 181 + stats := &Stats{ 182 + TotalOperations: totalOps, 183 + MatchedCount: len(results), 184 + ByReason: make(map[string]int), 185 + ByCategory: make(map[string]int), 186 + ByConfidence: make(map[string]int), 187 + } 188 + 189 + if totalOps > 0 { 190 + stats.MatchRate = float64(len(results)) / float64(totalOps) 191 + } 192 + 193 + for _, res := range results { 194 + if res.Match == nil { 195 + continue 196 + } 197 + 198 + stats.ByReason[res.Match.Reason]++ 199 + stats.ByCategory[res.Match.Category]++ 200 + 201 + // Confidence buckets 202 + conf := res.Match.Confidence 203 + switch { 204 + case conf >= 0.95: 205 + stats.ByConfidence["0.95-1.00"]++ 206 + case conf >= 0.90: 207 + stats.ByConfidence["0.90-0.95"]++ 208 + case conf >= 0.85: 209 + stats.ByConfidence["0.85-0.90"]++ 210 + default: 211 + stats.ByConfidence["0.00-0.85"]++ 212 + } 213 + } 214 + 215 + return stats 216 + }