detectors experiments · angrydutchman.peedee.es/plcbundle@40336cb

+629

cmd/plcbundle/detector.go

···

··· 1 + // cmd/plcbundle/detector.go 2 + package main 3 + 4 + import ( 5 + "bufio" 6 + "context" 7 + "encoding/json" 8 + "flag" 9 + "fmt" 10 + "os" 11 + "sort" 12 + "strings" 13 + "time" 14 + 15 + "tangled.org/atscan.net/plcbundle/detector" 16 + "tangled.org/atscan.net/plcbundle/plc" 17 + ) 18 + 19 + type defaultLogger struct{} 20 + 21 + func (d *defaultLogger) Printf(format string, v ...interface{}) { 22 + fmt.Fprintf(os.Stderr, format+"\n", v...) 23 + } 24 + 25 + func cmdDetector() { 26 + if len(os.Args) < 3 { 27 + printDetectorUsage() 28 + os.Exit(1) 29 + } 30 + 31 + subcommand := os.Args[2] 32 + 33 + switch subcommand { 34 + case "list": 35 + cmdDetectorList() 36 + case "test": 37 + cmdDetectorTest() 38 + case "run": 39 + cmdDetectorRun() 40 + case "filter": // ← Add this 41 + cmdDetectorFilter() 42 + case "info": 43 + cmdDetectorInfo() 44 + default: 45 + fmt.Fprintf(os.Stderr, "Unknown detector subcommand: %s\n", subcommand) 46 + printDetectorUsage() 47 + os.Exit(1) 48 + } 49 + } 50 + 51 + func printDetectorUsage() { 52 + fmt.Printf(`Usage: plcbundle detector <command> [options] 53 + 54 + Commands: 55 + list List available detectors 56 + test Test a detector on specific bundles 57 + run Run detector and output CSV results 58 + filter Filter JSONL operations from stdin 59 + info Show detailed detector information 60 + 61 + Examples: 62 + plcbundle detector list 63 + plcbundle detector test nostr --bundle 42 64 + plcbundle detector run all --bundles 1-100 > results.csv 65 + plcbundle backfill | plcbundle detector filter all > filtered.jsonl 66 + plcbundle detector info nostr 67 + `) 68 + } 69 + 70 + // cmdDetectorFilter reads JSONL from stdin, filters OUT spam, outputs clean operations 71 + func cmdDetectorFilter() { 72 + if len(os.Args) < 4 { 73 + fmt.Fprintf(os.Stderr, "Usage: plcbundle detector filter <detector1> [detector2...] [--confidence 0.9]\n") 74 + fmt.Fprintf(os.Stderr, "\nFilters OUT operations that match detectors (outputs clean data)\n\n") 75 + fmt.Fprintf(os.Stderr, "Examples:\n") 76 + fmt.Fprintf(os.Stderr, " plcbundle backfill | plcbundle detector filter all > clean.jsonl\n") 77 + fmt.Fprintf(os.Stderr, " plcbundle export --bundle 1 | plcbundle detector filter invalid_handle > clean.jsonl\n") 78 + os.Exit(1) 79 + } 80 + 81 + // Manually separate detector names from flags 82 + var detectorNames []string 83 + var flagArgs []string 84 + 85 + for i := 3; i < len(os.Args); i++ { 86 + arg := os.Args[i] 87 + if strings.HasPrefix(arg, "-") { 88 + flagArgs = os.Args[i:] 89 + break 90 + } 91 + detectorNames = append(detectorNames, arg) 92 + } 93 + 94 + if len(detectorNames) == 0 { 95 + fmt.Fprintf(os.Stderr, "Error: at least one detector name required\n") 96 + os.Exit(1) 97 + } 98 + 99 + // Parse flags 100 + fs := flag.NewFlagSet("detector filter", flag.ExitOnError) 101 + confidence := fs.Float64("confidence", 0.90, "minimum confidence") 102 + fs.Parse(flagArgs) 103 + 104 + // Setup registry 105 + registry := detector.DefaultRegistry() 106 + 107 + // Handle "all" keyword 108 + if len(detectorNames) == 1 && detectorNames[0] == "all" { 109 + detectorNames = registry.Names() 110 + fmt.Fprintf(os.Stderr, "Using all detectors: %s\n", strings.Join(detectorNames, ", ")) 111 + } 112 + 113 + // Get all detectors 114 + detectors := make([]detector.Detector, 0, len(detectorNames)) 115 + for _, name := range detectorNames { 116 + d, err := registry.Get(name) 117 + if err != nil { 118 + fmt.Fprintf(os.Stderr, "Error: %v\n", err) 119 + os.Exit(1) 120 + } 121 + detectors = append(detectors, d) 122 + } 123 + 124 + // Log to stderr 125 + fmt.Fprintf(os.Stderr, "Filtering OUT spam with %d detector(s)\n", len(detectorNames)) 126 + if len(detectorNames) <= 5 { 127 + fmt.Fprintf(os.Stderr, "Detectors: %s\n", strings.Join(detectorNames, ", ")) 128 + } 129 + fmt.Fprintf(os.Stderr, "Min confidence: %.2f\n\n", *confidence) 130 + 131 + ctx := context.Background() 132 + scanner := bufio.NewScanner(os.Stdin) 133 + 134 + // Set large buffer for long lines 135 + buf := make([]byte, 0, 64*1024) 136 + scanner.Buffer(buf, 1024*1024) 137 + 138 + cleanCount := 0 139 + filteredCount := 0 140 + totalCount := 0 141 + totalBytes := int64(0) // ← Add total bytes 142 + filteredBytes := int64(0) // ← Add filtered bytes 143 + 144 + // Read JSONL from stdin 145 + for scanner.Scan() { 146 + line := scanner.Bytes() 147 + if len(line) == 0 { 148 + continue 149 + } 150 + 151 + totalCount++ 152 + opSize := int64(len(line)) 153 + totalBytes += opSize // ← Track total 154 + 155 + // Parse operation 156 + var op plc.PLCOperation 157 + if err := json.Unmarshal(line, &op); err != nil { 158 + fmt.Fprintf(os.Stderr, "Warning: failed to parse line %d: %v\n", totalCount, err) 159 + continue 160 + } 161 + 162 + // Run all detectors on this operation 163 + isSpam := false 164 + 165 + for _, det := range detectors { 166 + match, err := det.Detect(ctx, op) 167 + if err != nil { 168 + continue 169 + } 170 + 171 + if match != nil && match.Confidence >= *confidence { 172 + // Detected as spam - filter it out 173 + isSpam = true 174 + break 175 + } 176 + } 177 + 178 + // Output only if NOT spam (clean operation) 179 + if !isSpam { 180 + cleanCount++ 181 + fmt.Println(string(line)) 182 + } else { 183 + filteredCount++ 184 + filteredBytes += opSize // ← Track filtered bytes 185 + } 186 + 187 + // Progress to stderr 188 + if totalCount%1000 == 0 { 189 + fmt.Fprintf(os.Stderr, "Processed: %d | Clean: %d | Filtered: %d | Saved: %s\r", 190 + totalCount, cleanCount, filteredCount, formatBytes(filteredBytes)) 191 + } 192 + } 193 + 194 + if err := scanner.Err(); err != nil { 195 + fmt.Fprintf(os.Stderr, "\nError reading stdin: %v\n", err) 196 + os.Exit(1) 197 + } 198 + 199 + // Final stats to stderr 200 + fmt.Fprintf(os.Stderr, "\n\n") 201 + fmt.Fprintf(os.Stderr, "✓ Filter complete\n") 202 + fmt.Fprintf(os.Stderr, " Total operations: %d\n", totalCount) 203 + fmt.Fprintf(os.Stderr, " Clean: %d (%.2f%%)\n", cleanCount, float64(cleanCount)/float64(totalCount)*100) 204 + fmt.Fprintf(os.Stderr, " Filtered out: %d (%.2f%%)\n", filteredCount, float64(filteredCount)/float64(totalCount)*100) 205 + fmt.Fprintf(os.Stderr, " Total size: %s\n", formatBytes(totalBytes)) 206 + fmt.Fprintf(os.Stderr, " Filtered size: %s (%.2f%%)\n", formatBytes(filteredBytes), float64(filteredBytes)/float64(totalBytes)*100) 207 + fmt.Fprintf(os.Stderr, " Clean size: %s (%.2f%%)\n", formatBytes(totalBytes-filteredBytes), float64(totalBytes-filteredBytes)/float64(totalBytes)*100) 208 + fmt.Fprintf(os.Stderr, " Detectors used: %d\n", len(detectorNames)) 209 + } 210 + 211 + func cmdDetectorList() { 212 + registry := detector.DefaultRegistry() 213 + detectors := registry.List() 214 + 215 + // Sort by name 216 + sort.Slice(detectors, func(i, j int) bool { 217 + return detectors[i].Name() < detectors[j].Name() 218 + }) 219 + 220 + fmt.Printf("Available detectors:\n\n") 221 + for _, d := range detectors { 222 + fmt.Printf(" %-20s %s (v%s)\n", d.Name(), d.Description(), d.Version()) 223 + } 224 + fmt.Printf("\nUse 'plcbundle detector info <name>' for details\n") 225 + } 226 + 227 + func cmdDetectorTest() { 228 + // Extract detector name first 229 + if len(os.Args) < 4 { 230 + fmt.Fprintf(os.Stderr, "Usage: plcbundle detector test <detector-name> --bundle N\n") 231 + os.Exit(1) 232 + } 233 + 234 + detectorName := os.Args[3] 235 + 236 + // Parse flags from os.Args[4:] 237 + fs := flag.NewFlagSet("detector test", flag.ExitOnError) 238 + bundleNum := fs.Int("bundle", 0, "bundle number to test") 239 + confidence := fs.Float64("confidence", 0.90, "minimum confidence threshold") 240 + verbose := fs.Bool("v", false, "verbose output") 241 + fs.Parse(os.Args[4:]) // ← Changed from os.Args[3:] 242 + 243 + if *bundleNum == 0 { 244 + fmt.Fprintf(os.Stderr, "Error: --bundle required\n") 245 + os.Exit(1) 246 + } 247 + 248 + // Load bundle 249 + mgr, _, err := getManager("") 250 + if err != nil { 251 + fmt.Fprintf(os.Stderr, "Error: %v\n", err) 252 + os.Exit(1) 253 + } 254 + defer mgr.Close() 255 + 256 + ctx := context.Background() 257 + bundle, err := mgr.LoadBundle(ctx, *bundleNum) 258 + if err != nil { 259 + fmt.Fprintf(os.Stderr, "Error loading bundle: %v\n", err) 260 + os.Exit(1) 261 + } 262 + 263 + fmt.Printf("Testing detector '%s' on bundle %06d...\n", detectorName, *bundleNum) 264 + fmt.Printf("Min confidence: %.2f\n\n", *confidence) 265 + 266 + // Run detector 267 + registry := detector.DefaultRegistry() 268 + config := detector.DefaultConfig() 269 + config.MinConfidence = *confidence 270 + 271 + runner := detector.NewRunner(registry, config, &defaultLogger{}) 272 + results, err := runner.RunOnBundle(ctx, detectorName, bundle) 273 + if err != nil { 274 + fmt.Fprintf(os.Stderr, "Detection failed: %v\n", err) 275 + os.Exit(1) 276 + } 277 + 278 + // Calculate stats 279 + stats := detector.CalculateStats(results, len(bundle.Operations)) 280 + 281 + // Display results 282 + fmt.Printf("Results:\n") 283 + fmt.Printf(" Total operations: %d\n", stats.TotalOperations) 284 + fmt.Printf(" Matches found: %d (%.2f%%)\n", stats.MatchedCount, stats.MatchRate*100) 285 + fmt.Printf("\n") 286 + 287 + if len(stats.ByReason) > 0 { 288 + fmt.Printf("Breakdown by reason:\n") 289 + for reason, count := range stats.ByReason { 290 + pct := float64(count) / float64(stats.MatchedCount) * 100 291 + fmt.Printf(" %-25s %d (%.1f%%)\n", reason, count, pct) 292 + } 293 + fmt.Printf("\n") 294 + } 295 + 296 + if len(stats.ByCategory) > 0 { 297 + fmt.Printf("Breakdown by category:\n") 298 + for category, count := range stats.ByCategory { 299 + pct := float64(count) / float64(stats.MatchedCount) * 100 300 + fmt.Printf(" %-25s %d (%.1f%%)\n", category, count, pct) 301 + } 302 + fmt.Printf("\n") 303 + } 304 + 305 + if len(stats.ByConfidence) > 0 { 306 + fmt.Printf("Confidence distribution:\n") 307 + for bucket, count := range stats.ByConfidence { 308 + pct := float64(count) / float64(stats.MatchedCount) * 100 309 + fmt.Printf(" %-25s %d (%.1f%%)\n", bucket, count, pct) 310 + } 311 + fmt.Printf("\n") 312 + } 313 + 314 + if *verbose && len(results) > 0 { 315 + fmt.Printf("Sample matches (first 10):\n") 316 + displayCount := 10 317 + if len(results) < displayCount { 318 + displayCount = len(results) 319 + } 320 + 321 + for i := 0; i < displayCount; i++ { 322 + res := results[i] 323 + fmt.Printf(" %d. Position %d: %s\n", i+1, res.Position, res.DID) 324 + fmt.Printf(" Reason: %s (confidence: %.2f)\n", res.Match.Reason, res.Match.Confidence) 325 + if res.Match.Note != "" { 326 + fmt.Printf(" Note: %s\n", res.Match.Note) 327 + } 328 + } 329 + 330 + if len(results) > displayCount { 331 + fmt.Printf(" ... and %d more\n", len(results)-displayCount) 332 + } 333 + } 334 + } 335 + 336 + func cmdDetectorRun() { 337 + if len(os.Args) < 4 { 338 + fmt.Fprintf(os.Stderr, "Usage: plcbundle detector run <detector1> [detector2...] --bundles 1-100\n") 339 + fmt.Fprintf(os.Stderr, "\nUse 'all' to run all available detectors\n") 340 + os.Exit(1) 341 + } 342 + 343 + // Manually separate detector names from flags 344 + var detectorNames []string 345 + var flagArgs []string 346 + 347 + for i := 3; i < len(os.Args); i++ { 348 + arg := os.Args[i] 349 + if strings.HasPrefix(arg, "-") { 350 + // This and all remaining are flags 351 + flagArgs = os.Args[i:] 352 + break 353 + } 354 + // Detector name 355 + detectorNames = append(detectorNames, arg) 356 + } 357 + 358 + if len(detectorNames) == 0 { 359 + fmt.Fprintf(os.Stderr, "Error: at least one detector name required\n") 360 + fmt.Fprintf(os.Stderr, "\nExamples:\n") 361 + fmt.Fprintf(os.Stderr, " plcbundle detector run invalid_handle --bundles 1-100\n") 362 + fmt.Fprintf(os.Stderr, " plcbundle detector run invalid_handle aka_spam --bundles 1-100\n") 363 + fmt.Fprintf(os.Stderr, " plcbundle detector run all --bundles 1-100\n") 364 + os.Exit(1) 365 + } 366 + 367 + // Parse flags 368 + fs := flag.NewFlagSet("detector run", flag.ExitOnError) 369 + bundleRange := fs.String("bundles", "", "bundle range (e.g., '1-100')") 370 + confidence := fs.Float64("confidence", 0.90, "minimum confidence") 371 + fs.Parse(flagArgs) 372 + 373 + if *bundleRange == "" { 374 + fmt.Fprintf(os.Stderr, "Error: --bundles required\n") 375 + os.Exit(1) 376 + } 377 + 378 + // Parse bundle range 379 + start, end, err := parseBundleRange(*bundleRange) 380 + if err != nil { 381 + fmt.Fprintf(os.Stderr, "Error: %v\n", err) 382 + os.Exit(1) 383 + } 384 + 385 + // Load manager 386 + mgr, _, err := getManager("") 387 + if err != nil { 388 + fmt.Fprintf(os.Stderr, "Error: %v\n", err) 389 + os.Exit(1) 390 + } 391 + defer mgr.Close() 392 + 393 + // Setup registry 394 + registry := detector.DefaultRegistry() 395 + config := detector.DefaultConfig() 396 + config.MinConfidence = *confidence 397 + 398 + // Handle "all" keyword - expand to all available detectors 399 + if len(detectorNames) == 1 && detectorNames[0] == "all" { 400 + detectorNames = registry.Names() 401 + fmt.Fprintf(os.Stderr, "Using all available detectors: %s\n", strings.Join(detectorNames, ", ")) 402 + } 403 + 404 + // Log to stderr 405 + fmt.Fprintf(os.Stderr, "Running %d detector(s) on bundles %d-%d...\n", len(detectorNames), start, end) 406 + if len(detectorNames) <= 5 { 407 + fmt.Fprintf(os.Stderr, "Detectors: %s\n", strings.Join(detectorNames, ", ")) 408 + } 409 + fmt.Fprintf(os.Stderr, "Min confidence: %.2f\n\n", *confidence) 410 + 411 + // Get all detectors 412 + detectors := make([]detector.Detector, 0, len(detectorNames)) 413 + for _, name := range detectorNames { 414 + d, err := registry.Get(name) 415 + if err != nil { 416 + fmt.Fprintf(os.Stderr, "Error: %v\n", err) 417 + os.Exit(1) 418 + } 419 + detectors = append(detectors, d) 420 + } 421 + 422 + ctx := context.Background() 423 + 424 + // Write CSV header to stdout 425 + fmt.Println("bundle,position,cid,detectors,confidence,detected_at,size") 426 + 427 + // Track statistics 428 + totalOps := 0 429 + matchCount := 0 430 + totalBytes := int64(0) 431 + matchedBytes := int64(0) 432 + bundlesProcessed := 0 433 + detectorMatchCounts := make(map[string]int) 434 + 435 + totalBundles := end - start + 1 436 + 437 + // Create progress bar with byte tracking enabled 438 + fmt.Fprintf(os.Stderr, "Processing bundles:\n") 439 + progress := NewProgressBar(totalBundles) 440 + progress.showBytes = true // Enable byte tracking 441 + 442 + // Process bundles and stream results 443 + for bundleNum := start; bundleNum <= end; bundleNum++ { 444 + bundle, err := mgr.LoadBundle(ctx, bundleNum) 445 + if err != nil { 446 + // Don't update progress on error, just log 447 + progress.Finish() 448 + fmt.Fprintf(os.Stderr, "\n⚠️ Warning: failed to load bundle %d: %v\n", bundleNum, err) 449 + progress = NewProgressBar(totalBundles) 450 + progress.showBytes = true 451 + progress.SetWithBytes(bundleNum-start, totalBytes) 452 + continue 453 + } 454 + 455 + bundlesProcessed++ 456 + totalOps += len(bundle.Operations) 457 + 458 + // Process each operation with all detectors 459 + for position, op := range bundle.Operations { 460 + // Calculate operation size first 461 + var opSize int 462 + if len(op.RawJSON) > 0 { 463 + opSize = len(op.RawJSON) 464 + } else { 465 + // Fallback: marshal to get size 466 + data, _ := json.Marshal(op) 467 + opSize = len(data) 468 + } 469 + totalBytes += int64(opSize) 470 + 471 + // Collect all matches for this operation 472 + var matchedDetectors []string 473 + var maxConfidence float64 474 + var detectedAt time.Time 475 + 476 + // Run all detectors on this operation 477 + for _, det := range detectors { 478 + match, err := det.Detect(ctx, op) 479 + if err != nil { 480 + continue 481 + } 482 + 483 + // Skip if no match or confidence too low 484 + if match == nil || match.Confidence < *confidence { 485 + continue 486 + } 487 + 488 + // Collect detector name 489 + matchedDetectors = append(matchedDetectors, det.Name()) 490 + detectorMatchCounts[det.Name()]++ 491 + 492 + // Track highest confidence 493 + if match.Confidence > maxConfidence { 494 + maxConfidence = match.Confidence 495 + } 496 + 497 + // Use current time for first match 498 + if detectedAt.IsZero() { 499 + detectedAt = time.Now() 500 + } 501 + } 502 + 503 + // Output only if at least one detector matched 504 + if len(matchedDetectors) > 0 { 505 + matchCount++ 506 + matchedBytes += int64(opSize) 507 + 508 + fmt.Printf("%d,%d,%s,%s,%.2f,%s,%d\n", 509 + bundleNum, 510 + position, 511 + op.CID, 512 + strings.Join(matchedDetectors, ";"), 513 + maxConfidence, 514 + detectedAt.Format("2006-01-02T15:04:05Z"), 515 + opSize, 516 + ) 517 + } 518 + } 519 + 520 + // Update progress with bytes 521 + progress.SetWithBytes(bundleNum-start+1, totalBytes) 522 + } 523 + 524 + // Finish progress bar 525 + progress.Finish() 526 + 527 + // Final stats to stderr 528 + fmt.Fprintf(os.Stderr, "\n") 529 + fmt.Fprintf(os.Stderr, "✓ Detection complete\n") 530 + fmt.Fprintf(os.Stderr, " Bundles processed: %d\n", bundlesProcessed) 531 + fmt.Fprintf(os.Stderr, " Total operations: %d\n", totalOps) 532 + fmt.Fprintf(os.Stderr, " Matches found: %d (%.2f%%)\n", matchCount, float64(matchCount)/float64(totalOps)*100) 533 + fmt.Fprintf(os.Stderr, " Clean operations: %d (%.2f%%)\n", totalOps-matchCount, float64(totalOps-matchCount)/float64(totalOps)*100) 534 + fmt.Fprintf(os.Stderr, "\n") 535 + fmt.Fprintf(os.Stderr, " Total size: %s\n", formatBytes(totalBytes)) 536 + fmt.Fprintf(os.Stderr, " Matched size: %s (%.2f%%)\n", formatBytes(matchedBytes), float64(matchedBytes)/float64(totalBytes)*100) 537 + fmt.Fprintf(os.Stderr, " Clean size: %s (%.2f%%)\n", formatBytes(totalBytes-matchedBytes), float64(totalBytes-matchedBytes)/float64(totalBytes)*100) 538 + 539 + if matchedBytes > 0 { 540 + fmt.Fprintf(os.Stderr, "\n") 541 + fmt.Fprintf(os.Stderr, " 💾 Potential savings if filtered: %s (%.2f%% reduction)\n", 542 + formatBytes(matchedBytes), 543 + float64(matchedBytes)/float64(totalBytes)*100) 544 + } 545 + 546 + fmt.Fprintf(os.Stderr, "\n") 547 + fmt.Fprintf(os.Stderr, " Detectors used: %d\n", len(detectorNames)) 548 + 549 + // Show breakdown by detector if multiple used 550 + if len(detectorNames) > 1 { 551 + fmt.Fprintf(os.Stderr, "\n") 552 + fmt.Fprintf(os.Stderr, " Matches by detector:\n") 553 + for _, name := range detectorNames { 554 + count := detectorMatchCounts[name] 555 + if count > 0 { 556 + pct := float64(count) / float64(matchCount) * 100 557 + fmt.Fprintf(os.Stderr, " %-20s %d (%.1f%%)\n", name, count, pct) 558 + } else { 559 + fmt.Fprintf(os.Stderr, " %-20s 0\n", name) 560 + } 561 + } 562 + } 563 + } 564 + 565 + func cmdDetectorInfo() { 566 + if len(os.Args) < 4 { 567 + fmt.Fprintf(os.Stderr, "Usage: plcbundle detector info <name>\n") 568 + os.Exit(1) 569 + } 570 + 571 + detectorName := os.Args[3] 572 + 573 + registry := detector.DefaultRegistry() 574 + d, err := registry.Get(detectorName) 575 + if err != nil { 576 + fmt.Fprintf(os.Stderr, "Error: %v\n", err) 577 + os.Exit(1) 578 + } 579 + 580 + fmt.Printf("Detector: %s\n", d.Name()) 581 + fmt.Printf("Version: %s\n", d.Version()) 582 + fmt.Printf("Description: %s\n", d.Description()) 583 + fmt.Printf("\n") 584 + 585 + // Show example usage 586 + fmt.Printf("Usage examples:\n") 587 + fmt.Printf(" # Test on single bundle\n") 588 + fmt.Printf(" plcbundle detector test %s --bundle 42\n\n", d.Name()) 589 + fmt.Printf(" # Run on range and save\n") 590 + fmt.Printf(" plcbundle detector run %s --bundles 1-100 --output results.csv\n\n", d.Name()) 591 + fmt.Printf(" # Use with filter creation\n") 592 + fmt.Printf(" plcbundle filter detect --detector %s --bundles 1-100\n", d.Name()) 593 + } 594 + 595 + // Helper functions 596 + 597 + func parseBundleRange(rangeStr string) (start, end int, err error) { 598 + // Handle single bundle number 599 + if !strings.Contains(rangeStr, "-") { 600 + var num int 601 + _, err = fmt.Sscanf(rangeStr, "%d", &num) 602 + if err != nil { 603 + return 0, 0, fmt.Errorf("invalid bundle number: %w", err) 604 + } 605 + return num, num, nil 606 + } 607 + 608 + // Handle range (e.g., "1-100") 609 + parts := strings.Split(rangeStr, "-") 610 + if len(parts) != 2 { 611 + return 0, 0, fmt.Errorf("invalid range format (expected: N or start-end)") 612 + } 613 + 614 + _, err = fmt.Sscanf(parts[0], "%d", &start) 615 + if err != nil { 616 + return 0, 0, fmt.Errorf("invalid start: %w", err) 617 + } 618 + 619 + _, err = fmt.Sscanf(parts[1], "%d", &end) 620 + if err != nil { 621 + return 0, 0, fmt.Errorf("invalid end: %w", err) 622 + } 623 + 624 + if start > end { 625 + return 0, 0, fmt.Errorf("start must be <= end") 626 + } 627 + 628 + return start, end, nil 629 + }

+97 -12

cmd/plcbundle/main.go

··· 2 3 import ( 4 "context" 5 "flag" 6 "fmt" 7 "net/http" ··· 82 cmdServe() 83 case "compare": 84 cmdCompare() 85 case "version": 86 fmt.Printf("plcbundle version %s\n", version) 87 fmt.Printf(" commit: %s\n", gitCommit) ··· 110 mempool Show mempool status and operations 111 serve Start HTTP server to serve bundle data 112 compare Compare local index with target index 113 version Show version 114 115 Security Model: ··· 845 846 func cmdExport() { 847 fs := flag.NewFlagSet("export", flag.ExitOnError) 848 - count := fs.Int("count", 1000, "number of operations to export") 849 after := fs.String("after", "", "timestamp to start after (RFC3339)") 850 fs.Parse(os.Args[2:]) 851 852 mgr, _, err := getManager("") 853 if err != nil { 854 fmt.Fprintf(os.Stderr, "Error: %v\n", err) ··· 856 } 857 defer mgr.Close() 858 859 - // Parse after time 860 var afterTime time.Time 861 if *after != "" { 862 afterTime, err = time.Parse(time.RFC3339, *after) ··· 867 } 868 869 ctx := context.Background() 870 - ops, err := mgr.ExportOperations(ctx, afterTime, *count) 871 - if err != nil { 872 - fmt.Fprintf(os.Stderr, "Export failed: %v\n", err) 873 - os.Exit(1) 874 - } 875 876 - // Output as JSONL 877 - for _, op := range ops { 878 - if len(op.RawJSON) > 0 { 879 - fmt.Println(string(op.RawJSON)) 880 } 881 } 882 883 - fmt.Fprintf(os.Stderr, "Exported %d operations\n", len(ops)) 884 } 885 886 func cmdBackfill() {

··· 2 3 import ( 4 "context" 5 + "encoding/json" 6 "flag" 7 "fmt" 8 "net/http" ··· 83 cmdServe() 84 case "compare": 85 cmdCompare() 86 + case "detector": 87 + cmdDetector() 88 case "version": 89 fmt.Printf("plcbundle version %s\n", version) 90 fmt.Printf(" commit: %s\n", gitCommit) ··· 113 mempool Show mempool status and operations 114 serve Start HTTP server to serve bundle data 115 compare Compare local index with target index 116 + detector 117 version Show version 118 119 Security Model: ··· 849 850 func cmdExport() { 851 fs := flag.NewFlagSet("export", flag.ExitOnError) 852 + bundles := fs.String("bundles", "", "bundle number or range (e.g., '42' or '1-100')") 853 + all := fs.Bool("all", false, "export all bundles") 854 + count := fs.Int("count", 0, "limit number of operations (0 = all)") 855 after := fs.String("after", "", "timestamp to start after (RFC3339)") 856 fs.Parse(os.Args[2:]) 857 858 + // Validate flags 859 + if !*all && *bundles == "" { 860 + fmt.Fprintf(os.Stderr, "Usage: plcbundle export --bundles <number|range> [options]\n") 861 + fmt.Fprintf(os.Stderr, " or: plcbundle export --all [options]\n") 862 + fmt.Fprintf(os.Stderr, "\nExamples:\n") 863 + fmt.Fprintf(os.Stderr, " plcbundle export --bundles 42\n") 864 + fmt.Fprintf(os.Stderr, " plcbundle export --bundles 1-100\n") 865 + fmt.Fprintf(os.Stderr, " plcbundle export --all\n") 866 + fmt.Fprintf(os.Stderr, " plcbundle export --all --count 50000\n") 867 + fmt.Fprintf(os.Stderr, " plcbundle export --bundles 42 | jq .\n") 868 + os.Exit(1) 869 + } 870 + 871 + // Load manager 872 mgr, _, err := getManager("") 873 if err != nil { 874 fmt.Fprintf(os.Stderr, "Error: %v\n", err) ··· 876 } 877 defer mgr.Close() 878 879 + // Determine bundle range 880 + var start, end int 881 + if *all { 882 + // Export all bundles 883 + index := mgr.GetIndex() 884 + bundles := index.GetBundles() 885 + if len(bundles) == 0 { 886 + fmt.Fprintf(os.Stderr, "No bundles available\n") 887 + os.Exit(1) 888 + } 889 + start = bundles[0].BundleNumber 890 + end = bundles[len(bundles)-1].BundleNumber 891 + 892 + fmt.Fprintf(os.Stderr, "Exporting all bundles (%d-%d)\n", start, end) 893 + } else { 894 + // Parse bundle range 895 + start, end, err = parseBundleRange(*bundles) 896 + if err != nil { 897 + fmt.Fprintf(os.Stderr, "Error: %v\n", err) 898 + os.Exit(1) 899 + } 900 + fmt.Fprintf(os.Stderr, "Exporting bundles %d-%d\n", start, end) 901 + } 902 + 903 + // Log to stderr 904 + if *count > 0 { 905 + fmt.Fprintf(os.Stderr, "Limit: %d operations\n", *count) 906 + } 907 + if *after != "" { 908 + fmt.Fprintf(os.Stderr, "After: %s\n", *after) 909 + } 910 + fmt.Fprintf(os.Stderr, "\n") 911 + 912 + // Parse after time if provided 913 var afterTime time.Time 914 if *after != "" { 915 afterTime, err = time.Parse(time.RFC3339, *after) ··· 920 } 921 922 ctx := context.Background() 923 + exported := 0 924 925 + // Export operations from bundles 926 + for bundleNum := start; bundleNum <= end; bundleNum++ { 927 + // Check if we've reached the limit 928 + if *count > 0 && exported >= *count { 929 + break 930 + } 931 + 932 + fmt.Fprintf(os.Stderr, "Processing bundle %d...\r", bundleNum) 933 + 934 + bundle, err := mgr.LoadBundle(ctx, bundleNum) 935 + if err != nil { 936 + fmt.Fprintf(os.Stderr, "\nWarning: failed to load bundle %d: %v\n", bundleNum, err) 937 + continue 938 + } 939 + 940 + // Output operations 941 + for _, op := range bundle.Operations { 942 + // Check after time filter 943 + if !afterTime.IsZero() && op.CreatedAt.Before(afterTime) { 944 + continue 945 + } 946 + 947 + // Check count limit 948 + if *count > 0 && exported >= *count { 949 + break 950 + } 951 + 952 + // Output operation as JSONL 953 + if len(op.RawJSON) > 0 { 954 + fmt.Println(string(op.RawJSON)) 955 + } else { 956 + // Fallback to marshaling 957 + data, _ := json.Marshal(op) 958 + fmt.Println(string(data)) 959 + } 960 + 961 + exported++ 962 } 963 } 964 965 + // Final stats to stderr 966 + fmt.Fprintf(os.Stderr, "\n\n") 967 + fmt.Fprintf(os.Stderr, "✓ Export complete\n") 968 + fmt.Fprintf(os.Stderr, " Exported: %d operations\n", exported) 969 } 970 971 func cmdBackfill() {

+6 -5

cmd/plcbundle/progress.go

··· 2 3 import ( 4 "fmt" 5 "strings" 6 "sync" 7 "time" ··· 80 pb.current = pb.total 81 pb.currentBytes = pb.totalBytes 82 pb.print() 83 - fmt.Println() // New line after completion 84 } 85 86 // print renders the progress bar (must be called with lock held) ··· 113 eta = time.Duration(float64(remaining)/speed) * time.Second 114 } 115 116 - // Print progress bar 117 - if pb.showBytes && pb.totalBytes > 0 { 118 // Calculate MB/s (using decimal units: 1 MB = 1,000,000 bytes) 119 mbProcessed := float64(pb.currentBytes) / (1000 * 1000) 120 mbPerSec := mbProcessed / elapsed.Seconds() 121 122 - fmt.Printf("\r [%s] %6.2f%% | %d/%d bundles | %.1f/s | %.1f MB/s | ETA: %s ", 123 bar, 124 percent, 125 pb.current, ··· 128 mbPerSec, 129 formatETA(eta)) 130 } else { 131 - fmt.Printf("\r [%s] %6.2f%% | %d/%d bundles | %.1f/s | ETA: %s ", 132 bar, 133 percent, 134 pb.current,

··· 2 3 import ( 4 "fmt" 5 + "os" 6 "strings" 7 "sync" 8 "time" ··· 81 pb.current = pb.total 82 pb.currentBytes = pb.totalBytes 83 pb.print() 84 + fmt.Fprintf(os.Stderr, "\n") // ← FIXED: Use stderr 85 } 86 87 // print renders the progress bar (must be called with lock held) ··· 114 eta = time.Duration(float64(remaining)/speed) * time.Second 115 } 116 117 + // Show MB/s if bytes are being tracked (changed condition) 118 + if pb.showBytes && pb.currentBytes > 0 { 119 // Calculate MB/s (using decimal units: 1 MB = 1,000,000 bytes) 120 mbProcessed := float64(pb.currentBytes) / (1000 * 1000) 121 mbPerSec := mbProcessed / elapsed.Seconds() 122 123 + fmt.Fprintf(os.Stderr, "\r [%s] %6.2f%% | %d/%d bundles | %.1f/s | %.1f MB/s | ETA: %s ", 124 bar, 125 percent, 126 pb.current, ··· 129 mbPerSec, 130 formatETA(eta)) 131 } else { 132 + fmt.Fprintf(os.Stderr, "\r [%s] %6.2f%% | %d/%d bundles | %.1f/s | ETA: %s ", 133 bar, 134 percent, 135 pb.current,

+467

detector/builtin.go

···

··· 1 + // detector/builtin.go 2 + package detector 3 + 4 + import ( 5 + "context" 6 + "regexp" 7 + "strings" 8 + 9 + "tangled.org/atscan.net/plcbundle/plc" 10 + ) 11 + 12 + // InvalidHandleDetector detects operations with invalid handle patterns 13 + type InvalidHandleDetector struct { 14 + // Valid handle regex: lowercase letters, numbers, hyphens, dots only 15 + validHandlePattern *regexp.Regexp 16 + } 17 + 18 + func NewInvalidHandleDetector() *InvalidHandleDetector { 19 + return &InvalidHandleDetector{ 20 + // Valid handle: alphanumeric, hyphens, dots (no underscores!) 21 + validHandlePattern: regexp.MustCompile(`^at://[a-z0-9][a-z0-9-]*(\.[a-z0-9][a-z0-9-]*)*\.[a-z]+$`), 22 + } 23 + } 24 + 25 + func (d *InvalidHandleDetector) Name() string { return "invalid_handle" } 26 + func (d *InvalidHandleDetector) Description() string { 27 + return "Detects operations with invalid handle patterns (underscores, invalid chars)" 28 + } 29 + func (d *InvalidHandleDetector) Version() string { return "1.0.0" } 30 + 31 + func (d *InvalidHandleDetector) Detect(ctx context.Context, op plc.PLCOperation) (*Match, error) { 32 + if alsoKnownAs, ok := op.Operation["alsoKnownAs"].([]interface{}); ok { 33 + for _, aka := range alsoKnownAs { 34 + if str, ok := aka.(string); ok { 35 + // Check if it's an at:// handle 36 + if !strings.HasPrefix(str, "at://") { 37 + continue 38 + } 39 + 40 + // Check for underscore (invalid in Bluesky handles) 41 + if strings.Contains(str, "_") { 42 + return &Match{ 43 + Reason: "underscore_in_handle", 44 + Category: "invalid_handle", 45 + Confidence: 0.99, 46 + Note: "Handle contains underscore which is invalid in Bluesky", 47 + Metadata: map[string]interface{}{ 48 + "invalid_handle": str, 49 + "violation": "underscore_character", 50 + }, 51 + }, nil 52 + } 53 + 54 + // Check if handle matches valid pattern 55 + if !d.validHandlePattern.MatchString(str) { 56 + return &Match{ 57 + Reason: "invalid_handle_pattern", 58 + Category: "invalid_handle", 59 + Confidence: 0.95, 60 + Note: "Handle does not match valid Bluesky handle pattern", 61 + Metadata: map[string]interface{}{ 62 + "invalid_handle": str, 63 + "violation": "pattern_mismatch", 64 + }, 65 + }, nil 66 + } 67 + } 68 + } 69 + } 70 + 71 + return nil, nil 72 + } 73 + 74 + // AlsoKnownAsSpamDetector detects excessive/garbage alsoKnownAs entries 75 + type AlsoKnownAsSpamDetector struct { 76 + maxLegitimateEntries int 77 + minGarbageLength int 78 + } 79 + 80 + func NewAlsoKnownAsSpamDetector() *AlsoKnownAsSpamDetector { 81 + return &AlsoKnownAsSpamDetector{ 82 + maxLegitimateEntries: 3, // Normal operations have 1-3 entries 83 + minGarbageLength: 100, // Garbage strings are very long 84 + } 85 + } 86 + 87 + func (d *AlsoKnownAsSpamDetector) Name() string { return "aka_spam" } 88 + func (d *AlsoKnownAsSpamDetector) Description() string { 89 + return "Detects spam through excessive or garbage alsoKnownAs entries" 90 + } 91 + func (d *AlsoKnownAsSpamDetector) Version() string { return "1.0.0" } 92 + 93 + func (d *AlsoKnownAsSpamDetector) Detect(ctx context.Context, op plc.PLCOperation) (*Match, error) { 94 + if alsoKnownAs, ok := op.Operation["alsoKnownAs"].([]interface{}); ok { 95 + entryCount := len(alsoKnownAs) 96 + 97 + // Count different types of entries 98 + atURICount := 0 99 + garbageCount := 0 100 + var garbageExamples []string 101 + 102 + for _, aka := range alsoKnownAs { 103 + if str, ok := aka.(string); ok { 104 + if strings.HasPrefix(str, "at://") { 105 + atURICount++ 106 + } else if len(str) > d.minGarbageLength { 107 + garbageCount++ 108 + if len(garbageExamples) < 2 { 109 + // Store first few for evidence 110 + preview := str 111 + if len(preview) > 50 { 112 + preview = preview[:50] + "..." 113 + } 114 + garbageExamples = append(garbageExamples, preview) 115 + } 116 + } 117 + } 118 + } 119 + 120 + // Detection: Excessive entries 121 + if entryCount > d.maxLegitimateEntries { 122 + confidence := 0.80 123 + if garbageCount > 0 { 124 + confidence = 0.95 // Higher confidence if garbage detected 125 + } 126 + 127 + return &Match{ 128 + Reason: "excessive_aka_entries", 129 + Category: "spam", 130 + Confidence: confidence, 131 + Note: "Operation has excessive alsoKnownAs entries", 132 + Metadata: map[string]interface{}{ 133 + "total_entries": entryCount, 134 + "at_uri_count": atURICount, 135 + "garbage_count": garbageCount, 136 + "garbage_examples": garbageExamples, 137 + }, 138 + }, nil 139 + } 140 + 141 + // Detection: Garbage entries present (even if count is low) 142 + if garbageCount > 0 { 143 + return &Match{ 144 + Reason: "garbage_aka_entries", 145 + Category: "spam", 146 + Confidence: 0.98, 147 + Note: "Operation contains garbage/random strings in alsoKnownAs", 148 + Metadata: map[string]interface{}{ 149 + "total_entries": entryCount, 150 + "garbage_count": garbageCount, 151 + "garbage_examples": garbageExamples, 152 + }, 153 + }, nil 154 + } 155 + } 156 + 157 + return nil, nil 158 + } 159 + 160 + // CompositeSpamDetector combines multiple signals for higher confidence 161 + type CompositeSpamDetector struct { 162 + invalidHandle *InvalidHandleDetector 163 + akaSpam *AlsoKnownAsSpamDetector 164 + } 165 + 166 + func NewCompositeSpamDetector() *CompositeSpamDetector { 167 + return &CompositeSpamDetector{ 168 + invalidHandle: NewInvalidHandleDetector(), 169 + akaSpam: NewAlsoKnownAsSpamDetector(), 170 + } 171 + } 172 + 173 + func (d *CompositeSpamDetector) Name() string { return "composite_spam" } 174 + func (d *CompositeSpamDetector) Description() string { 175 + return "Combines multiple spam signals for high-confidence detection" 176 + } 177 + func (d *CompositeSpamDetector) Version() string { return "1.0.0" } 178 + 179 + func (d *CompositeSpamDetector) Detect(ctx context.Context, op plc.PLCOperation) (*Match, error) { 180 + // Check both detectors 181 + invalidHandleMatch, _ := d.invalidHandle.Detect(ctx, op) 182 + akaSpamMatch, _ := d.akaSpam.Detect(ctx, op) 183 + 184 + // If both match, very high confidence 185 + if invalidHandleMatch != nil && akaSpamMatch != nil { 186 + return &Match{ 187 + Reason: "multiple_spam_indicators", 188 + Category: "spam", 189 + Confidence: 0.99, 190 + Note: "Operation has both invalid handle and excessive alsoKnownAs entries", 191 + Metadata: map[string]interface{}{ 192 + "invalid_handle_reason": invalidHandleMatch.Reason, 193 + "aka_spam_reason": akaSpamMatch.Reason, 194 + "invalid_handle_data": invalidHandleMatch.Metadata, 195 + "aka_spam_data": akaSpamMatch.Metadata, 196 + }, 197 + }, nil 198 + } 199 + 200 + // Return whichever matched 201 + if invalidHandleMatch != nil { 202 + return invalidHandleMatch, nil 203 + } 204 + if akaSpamMatch != nil { 205 + return akaSpamMatch, nil 206 + } 207 + 208 + return nil, nil 209 + } 210 + 211 + // SpamPDSDetector detects known spam PDS endpoints 212 + type SpamPDSDetector struct { 213 + spamEndpoints map[string]bool 214 + spamDomains map[string]bool 215 + } 216 + 217 + func NewSpamPDSDetector() *SpamPDSDetector { 218 + return &SpamPDSDetector{ 219 + spamEndpoints: map[string]bool{ 220 + "pds.trump.com": true, 221 + // Add more as discovered 222 + }, 223 + spamDomains: map[string]bool{ 224 + "trump.com": true, 225 + "donald.trump.com": true, 226 + // Add more as discovered 227 + }, 228 + } 229 + } 230 + 231 + func (d *SpamPDSDetector) Name() string { return "spam_pds" } 232 + func (d *SpamPDSDetector) Description() string { 233 + return "Detects operations using known spam PDS endpoints and fake domain claims" 234 + } 235 + func (d *SpamPDSDetector) Version() string { return "1.0.0" } 236 + 237 + func (d *SpamPDSDetector) Detect(ctx context.Context, op plc.PLCOperation) (*Match, error) { 238 + // Check PDS endpoint 239 + if services, ok := op.Operation["services"].(map[string]interface{}); ok { 240 + if pds, ok := services["atproto_pds"].(map[string]interface{}); ok { 241 + if endpoint, ok := pds["endpoint"].(string); ok { 242 + host := extractHost(endpoint) 243 + 244 + // Check if it's a known spam PDS 245 + if d.spamEndpoints[host] { 246 + return &Match{ 247 + Reason: "spam_pds_endpoint", 248 + Category: "spam", 249 + Confidence: 0.99, 250 + Note: "Operation uses known spam PDS endpoint", 251 + Metadata: map[string]interface{}{ 252 + "endpoint": endpoint, 253 + "host": host, 254 + }, 255 + }, nil 256 + } 257 + } 258 + } 259 + } 260 + 261 + // Check for spam domain claims in alsoKnownAs 262 + if alsoKnownAs, ok := op.Operation["alsoKnownAs"].([]interface{}); ok { 263 + for _, aka := range alsoKnownAs { 264 + if str, ok := aka.(string); ok { 265 + if !strings.HasPrefix(str, "at://") { 266 + continue 267 + } 268 + 269 + // Extract domain from at:// URI 270 + domain := strings.TrimPrefix(str, "at://") 271 + if idx := strings.Index(domain, "/"); idx > 0 { 272 + domain = domain[:idx] 273 + } 274 + 275 + // Check if claiming spam domain 276 + if d.spamDomains[domain] { 277 + return &Match{ 278 + Reason: "fake_domain_claim", 279 + Category: "impersonation", 280 + Confidence: 0.99, 281 + Note: "Operation claims known spam/fake domain", 282 + Metadata: map[string]interface{}{ 283 + "claimed_domain": domain, 284 + "handle": str, 285 + }, 286 + }, nil 287 + } 288 + 289 + // Check for subdomain patterns (like jr.donald.trump.com) 290 + for spamDomain := range d.spamDomains { 291 + if strings.HasSuffix(domain, "."+spamDomain) || domain == spamDomain { 292 + return &Match{ 293 + Reason: "fake_domain_claim", 294 + Category: "impersonation", 295 + Confidence: 0.99, 296 + Note: "Operation claims domain related to known spam domain", 297 + Metadata: map[string]interface{}{ 298 + "claimed_domain": domain, 299 + "spam_domain": spamDomain, 300 + }, 301 + }, nil 302 + } 303 + } 304 + } 305 + } 306 + } 307 + 308 + return nil, nil 309 + } 310 + 311 + // ServiceAbuseDetector detects operations with abused service structures 312 + type ServiceAbuseDetector struct { 313 + maxServiceTypeLength int 314 + maxEndpointLength int 315 + maxHandleLength int 316 + } 317 + 318 + func NewServiceAbuseDetector() *ServiceAbuseDetector { 319 + return &ServiceAbuseDetector{ 320 + maxServiceTypeLength: 100, // Normal types are short (e.g., "AtprotoPersonalDataServer") 321 + maxEndpointLength: 200, // Normal endpoints are reasonable URLs 322 + maxHandleLength: 100, // Normal handles are short 323 + } 324 + } 325 + 326 + func (d *ServiceAbuseDetector) Name() string { return "service_abuse" } 327 + func (d *ServiceAbuseDetector) Description() string { 328 + return "Detects operations with abused service structures (random strings, numeric keys)" 329 + } 330 + func (d *ServiceAbuseDetector) Version() string { return "1.0.0" } 331 + 332 + func (d *ServiceAbuseDetector) Detect(ctx context.Context, op plc.PLCOperation) (*Match, error) { 333 + if services, ok := op.Operation["services"].(map[string]interface{}); ok { 334 + // Check for numeric service keys (spam uses "0", "1", "2" instead of proper names) 335 + hasNumericKeys := false 336 + numericKeyCount := 0 337 + 338 + for key := range services { 339 + // Check if key is a digit 340 + if len(key) == 1 && key >= "0" && key <= "9" { 341 + hasNumericKeys = true 342 + numericKeyCount++ 343 + } 344 + } 345 + 346 + if hasNumericKeys && numericKeyCount > 1 { 347 + return &Match{ 348 + Reason: "numeric_service_keys", 349 + Category: "service_abuse", 350 + Confidence: 0.98, 351 + Note: "Services use numeric keys instead of proper names", 352 + Metadata: map[string]interface{}{ 353 + "numeric_key_count": numericKeyCount, 354 + }, 355 + }, nil 356 + } 357 + 358 + // Check each service for abuse patterns 359 + for serviceName, serviceData := range services { 360 + if serviceMap, ok := serviceData.(map[string]interface{}); ok { 361 + // Check service type length 362 + if serviceType, ok := serviceMap["type"].(string); ok { 363 + if len(serviceType) > d.maxServiceTypeLength { 364 + return &Match{ 365 + Reason: "excessive_service_type_length", 366 + Category: "service_abuse", 367 + Confidence: 0.99, 368 + Note: "Service type field contains excessively long random string", 369 + Metadata: map[string]interface{}{ 370 + "service_name": serviceName, 371 + "type_length": len(serviceType), 372 + "type_preview": serviceType[:50] + "...", 373 + }, 374 + }, nil 375 + } 376 + } 377 + 378 + // Check endpoint length 379 + if endpoint, ok := serviceMap["endpoint"].(string); ok { 380 + if len(endpoint) > d.maxEndpointLength { 381 + return &Match{ 382 + Reason: "excessive_endpoint_length", 383 + Category: "service_abuse", 384 + Confidence: 0.99, 385 + Note: "Service endpoint contains excessively long random string", 386 + Metadata: map[string]interface{}{ 387 + "service_name": serviceName, 388 + "endpoint_length": len(endpoint), 389 + "endpoint_preview": endpoint[:min(100, len(endpoint))] + "...", 390 + }, 391 + }, nil 392 + } 393 + } 394 + } 395 + } 396 + } 397 + 398 + // Check for excessively long handles in alsoKnownAs 399 + if alsoKnownAs, ok := op.Operation["alsoKnownAs"].([]interface{}); ok { 400 + for _, aka := range alsoKnownAs { 401 + if str, ok := aka.(string); ok { 402 + if strings.HasPrefix(str, "at://") { 403 + handle := strings.TrimPrefix(str, "at://") 404 + if len(handle) > d.maxHandleLength { 405 + return &Match{ 406 + Reason: "excessive_handle_length", 407 + Category: "service_abuse", 408 + Confidence: 0.98, 409 + Note: "Handle contains excessively long random string", 410 + Metadata: map[string]interface{}{ 411 + "handle_length": len(handle), 412 + "handle_preview": handle[:min(50, len(handle))] + "...", 413 + }, 414 + }, nil 415 + } 416 + } 417 + } 418 + } 419 + } 420 + 421 + // Check for empty verificationMethods (common in this spam) 422 + if vm, ok := op.Operation["verificationMethods"].(map[string]interface{}); ok { 423 + if len(vm) == 0 { 424 + // Empty verificationMethods alone isn't enough, but combined with other signals... 425 + // Check if there are other suspicious signals 426 + if services, ok := op.Operation["services"].(map[string]interface{}); ok { 427 + if len(services) > 2 { 428 + // Multiple services + empty verificationMethods = suspicious 429 + return &Match{ 430 + Reason: "empty_verification_methods", 431 + Category: "service_abuse", 432 + Confidence: 0.85, 433 + Note: "Empty verificationMethods with multiple services", 434 + Metadata: map[string]interface{}{ 435 + "service_count": len(services), 436 + }, 437 + }, nil 438 + } 439 + } 440 + } 441 + } 442 + 443 + return nil, nil 444 + } 445 + 446 + // Helper function for min 447 + func min(a, b int) int { 448 + if a < b { 449 + return a 450 + } 451 + return b 452 + } 453 + 454 + // Helper functions 455 + 456 + func extractHost(endpoint string) string { 457 + // Extract host from URL 458 + endpoint = strings.TrimPrefix(endpoint, "http://") 459 + endpoint = strings.TrimPrefix(endpoint, "https://") 460 + if idx := strings.Index(endpoint, "/"); idx > 0 { 461 + endpoint = endpoint[:idx] 462 + } 463 + if idx := strings.Index(endpoint, ":"); idx > 0 { 464 + endpoint = endpoint[:idx] 465 + } 466 + return endpoint 467 + }

+63

detector/detector.go

···

··· 1 + // detector/detector.go 2 + package detector 3 + 4 + import ( 5 + "context" 6 + "time" 7 + 8 + "tangled.org/atscan.net/plcbundle/plc" 9 + ) 10 + 11 + // Detector represents a spam detection algorithm 12 + type Detector interface { 13 + // Name returns the detector's unique identifier 14 + Name() string 15 + 16 + // Description returns a human-readable description 17 + Description() string 18 + 19 + // Detect analyzes an operation and returns a match result 20 + Detect(ctx context.Context, op plc.PLCOperation) (*Match, error) 21 + 22 + // Version returns the detector version 23 + Version() string 24 + } 25 + 26 + // Match represents a positive spam detection 27 + type Match struct { 28 + Reason string // Short identifier (e.g., "nostr_crosspost") 29 + Category string // Broader category (e.g., "cross_posting") 30 + Confidence float64 // 0.0 to 1.0 31 + Note string // Optional human-readable explanation 32 + Metadata map[string]interface{} // Additional context 33 + } 34 + 35 + // Result represents the outcome of running a detector on an operation 36 + type Result struct { 37 + BundleNumber int 38 + Position int 39 + DID string 40 + CID string // ← Add this field 41 + Match *Match // nil if no match 42 + Error error 43 + DetectorName string 44 + DetectedAt time.Time 45 + } 46 + 47 + // Config holds detector configuration 48 + type Config struct { 49 + MinConfidence float64 50 + Timeout time.Duration 51 + Parallel bool 52 + Workers int 53 + } 54 + 55 + // DefaultConfig returns sensible defaults 56 + func DefaultConfig() *Config { 57 + return &Config{ 58 + MinConfidence: 0.90, 59 + Timeout: 5 * time.Second, 60 + Parallel: true, 61 + Workers: 4, 62 + } 63 + }

+87

detector/registry.go

···

··· 1 + // detector/registry.go 2 + package detector 3 + 4 + import ( 5 + "fmt" 6 + "sync" 7 + ) 8 + 9 + // Registry manages available detectors 10 + type Registry struct { 11 + detectors map[string]Detector 12 + mu sync.RWMutex 13 + } 14 + 15 + // NewRegistry creates a new detector registry 16 + func NewRegistry() *Registry { 17 + return &Registry{ 18 + detectors: make(map[string]Detector), 19 + } 20 + } 21 + 22 + // Register adds a detector to the registry 23 + func (r *Registry) Register(d Detector) error { 24 + r.mu.Lock() 25 + defer r.mu.Unlock() 26 + 27 + name := d.Name() 28 + if _, exists := r.detectors[name]; exists { 29 + return fmt.Errorf("detector %q already registered", name) 30 + } 31 + 32 + r.detectors[name] = d 33 + return nil 34 + } 35 + 36 + // Get retrieves a detector by name 37 + func (r *Registry) Get(name string) (Detector, error) { 38 + r.mu.RLock() 39 + defer r.mu.RUnlock() 40 + 41 + d, ok := r.detectors[name] 42 + if !ok { 43 + return nil, fmt.Errorf("detector %q not found", name) 44 + } 45 + 46 + return d, nil 47 + } 48 + 49 + // List returns all registered detectors 50 + func (r *Registry) List() []Detector { 51 + r.mu.RLock() 52 + defer r.mu.RUnlock() 53 + 54 + detectors := make([]Detector, 0, len(r.detectors)) 55 + for _, d := range r.detectors { 56 + detectors = append(detectors, d) 57 + } 58 + 59 + return detectors 60 + } 61 + 62 + // Names returns all detector names 63 + func (r *Registry) Names() []string { 64 + r.mu.RLock() 65 + defer r.mu.RUnlock() 66 + 67 + names := make([]string, 0, len(r.detectors)) 68 + for name := range r.detectors { 69 + names = append(names, name) 70 + } 71 + 72 + return names 73 + } 74 + 75 + // DefaultRegistry returns a registry with built-in detectors 76 + func DefaultRegistry() *Registry { 77 + r := NewRegistry() 78 + 79 + // Register real spam detectors 80 + r.Register(NewInvalidHandleDetector()) 81 + r.Register(NewAlsoKnownAsSpamDetector()) 82 + r.Register(NewCompositeSpamDetector()) 83 + r.Register(NewSpamPDSDetector()) 84 + r.Register(NewServiceAbuseDetector()) 85 + 86 + return r 87 + }

+216

detector/runner.go

···

··· 1 + // detector/runner.go 2 + package detector 3 + 4 + import ( 5 + "context" 6 + "fmt" 7 + "sync" 8 + "time" 9 + 10 + "tangled.org/atscan.net/plcbundle/bundle" 11 + "tangled.org/atscan.net/plcbundle/plc" 12 + ) 13 + 14 + // Runner executes detectors against operations 15 + type Runner struct { 16 + registry *Registry 17 + config *Config 18 + logger Logger 19 + } 20 + 21 + type Logger interface { 22 + Printf(format string, v ...interface{}) 23 + } 24 + 25 + // NewRunner creates a new detector runner 26 + func NewRunner(registry *Registry, config *Config, logger Logger) *Runner { 27 + if config == nil { 28 + config = DefaultConfig() 29 + } 30 + return &Runner{ 31 + registry: registry, 32 + config: config, 33 + logger: logger, 34 + } 35 + } 36 + 37 + // RunOnBundle runs detector(s) on all operations in a bundle 38 + func (r *Runner) RunOnBundle(ctx context.Context, detectorName string, b *bundle.Bundle) ([]*Result, error) { 39 + detector, err := r.registry.Get(detectorName) 40 + if err != nil { 41 + return nil, err 42 + } 43 + 44 + var results []*Result 45 + 46 + if r.config.Parallel { 47 + results = r.runParallel(ctx, detector, b) 48 + } else { 49 + results = r.runSequential(ctx, detector, b) 50 + } 51 + 52 + // Filter by minimum confidence 53 + filtered := make([]*Result, 0) 54 + for _, res := range results { 55 + if res.Match != nil && res.Match.Confidence >= r.config.MinConfidence { 56 + filtered = append(filtered, res) 57 + } 58 + } 59 + 60 + return filtered, nil 61 + } 62 + 63 + func (r *Runner) runSequential(ctx context.Context, detector Detector, b *bundle.Bundle) []*Result { 64 + results := make([]*Result, 0) 65 + 66 + for pos, op := range b.Operations { 67 + select { 68 + case <-ctx.Done(): 69 + return results 70 + default: 71 + } 72 + 73 + result := r.detectOne(ctx, detector, b.BundleNumber, pos, op) 74 + if result.Match != nil || result.Error != nil { 75 + results = append(results, result) 76 + } 77 + } 78 + 79 + return results 80 + } 81 + 82 + func (r *Runner) runParallel(ctx context.Context, detector Detector, b *bundle.Bundle) []*Result { 83 + type job struct { 84 + pos int 85 + op plc.PLCOperation 86 + } 87 + 88 + jobs := make(chan job, len(b.Operations)) 89 + resultsChan := make(chan *Result, len(b.Operations)) 90 + 91 + // Start workers 92 + var wg sync.WaitGroup 93 + for i := 0; i < r.config.Workers; i++ { 94 + wg.Add(1) 95 + go func() { 96 + defer wg.Done() 97 + for j := range jobs { 98 + select { 99 + case <-ctx.Done(): 100 + return 101 + default: 102 + } 103 + 104 + result := r.detectOne(ctx, detector, b.BundleNumber, j.pos, j.op) 105 + if result.Match != nil || result.Error != nil { 106 + resultsChan <- result 107 + } 108 + } 109 + }() 110 + } 111 + 112 + // Send jobs 113 + for pos, op := range b.Operations { 114 + jobs <- job{pos: pos, op: op} 115 + } 116 + close(jobs) 117 + 118 + // Wait for completion 119 + go func() { 120 + wg.Wait() 121 + close(resultsChan) 122 + }() 123 + 124 + // Collect results 125 + results := make([]*Result, 0) 126 + for result := range resultsChan { 127 + results = append(results, result) 128 + } 129 + 130 + return results 131 + } 132 + 133 + func (r *Runner) detectOne(ctx context.Context, detector Detector, bundleNum, pos int, op plc.PLCOperation) *Result { 134 + // Create timeout context 135 + detectCtx, cancel := context.WithTimeout(ctx, r.config.Timeout) 136 + defer cancel() 137 + 138 + result := &Result{ 139 + BundleNumber: bundleNum, 140 + Position: pos, 141 + DID: op.DID, 142 + CID: op.CID, // ← Add this 143 + DetectorName: detector.Name(), 144 + DetectedAt: time.Now(), 145 + } 146 + 147 + match, err := detector.Detect(detectCtx, op) 148 + result.Match = match 149 + result.Error = err 150 + 151 + return result 152 + } 153 + 154 + // RunMultipleDetectors runs multiple detectors on a bundle 155 + func (r *Runner) RunMultipleDetectors(ctx context.Context, detectorNames []string, b *bundle.Bundle) (map[string][]*Result, error) { 156 + allResults := make(map[string][]*Result) 157 + 158 + for _, name := range detectorNames { 159 + results, err := r.RunOnBundle(ctx, name, b) 160 + if err != nil { 161 + return nil, fmt.Errorf("detector %s failed: %w", name, err) 162 + } 163 + allResults[name] = results 164 + } 165 + 166 + return allResults, nil 167 + } 168 + 169 + // Stats represents detection statistics 170 + type Stats struct { 171 + TotalOperations int 172 + MatchedCount int 173 + MatchRate float64 174 + ByReason map[string]int 175 + ByCategory map[string]int 176 + ByConfidence map[string]int // 0.9-1.0, 0.8-0.9, etc. 177 + } 178 + 179 + // CalculateStats computes statistics from results 180 + func CalculateStats(results []*Result, totalOps int) *Stats { 181 + stats := &Stats{ 182 + TotalOperations: totalOps, 183 + MatchedCount: len(results), 184 + ByReason: make(map[string]int), 185 + ByCategory: make(map[string]int), 186 + ByConfidence: make(map[string]int), 187 + } 188 + 189 + if totalOps > 0 { 190 + stats.MatchRate = float64(len(results)) / float64(totalOps) 191 + } 192 + 193 + for _, res := range results { 194 + if res.Match == nil { 195 + continue 196 + } 197 + 198 + stats.ByReason[res.Match.Reason]++ 199 + stats.ByCategory[res.Match.Category]++ 200 + 201 + // Confidence buckets 202 + conf := res.Match.Confidence 203 + switch { 204 + case conf >= 0.95: 205 + stats.ByConfidence["0.95-1.00"]++ 206 + case conf >= 0.90: 207 + stats.ByConfidence["0.90-0.95"]++ 208 + case conf >= 0.85: 209 + stats.ByConfidence["0.85-0.90"]++ 210 + default: 211 + stats.ByConfidence["0.00-0.85"]++ 212 + } 213 + } 214 + 215 + return stats 216 + }