tangled
alpha
login
or
join now
angrydutchman.peedee.es
/
plcbundle
forked from
atscan.net/plcbundle
0
fork
atom
A Transparent and Verifiable Way to Sync the AT Protocol's PLC Directory
0
fork
atom
overview
issues
pulls
pipelines
more work on detectors
tree.fail
4 months ago
e8349a48
7eefd51f
+288
-337
8 changed files
expand all
collapse all
unified
split
cmd
plcbundle
detector.go
detector
builtin.go
registry.go
script.go
go.mod
go.sum
scripts
benchmark-detector.js
detector-template.js
+138
-285
cmd/plcbundle/detector.go
···
36
36
cmdDetectorTest()
37
37
case "run":
38
38
cmdDetectorRun()
39
39
-
case "filter": // ← Add this
39
39
+
case "filter":
40
40
cmdDetectorFilter()
41
41
case "info":
42
42
cmdDetectorInfo()
···
82
82
func cmdDetectorFilter() {
83
83
if len(os.Args) < 4 {
84
84
fmt.Fprintf(os.Stderr, "Usage: plcbundle detector filter <detector1|script.js> [detector2...] [--confidence 0.9]\n")
85
85
-
fmt.Fprintf(os.Stderr, "\nFilters OUT operations that match detectors (outputs clean data)\n\n")
86
86
-
fmt.Fprintf(os.Stderr, "Examples:\n")
87
87
-
fmt.Fprintf(os.Stderr, " plcbundle backfill | plcbundle detector filter all > clean.jsonl\n")
88
88
-
fmt.Fprintf(os.Stderr, " plcbundle export --bundle 1 | plcbundle detector filter invalid_handle > clean.jsonl\n")
89
89
-
fmt.Fprintf(os.Stderr, " plcbundle backfill | plcbundle detector filter ./my_detector.js > clean.jsonl\n")
90
85
os.Exit(1)
91
86
}
92
87
93
93
-
// Manually separate detector names from flags
88
88
+
// Parse detector names and flags
94
89
var detectorNames []string
95
90
var flagArgs []string
96
96
-
97
91
for i := 3; i < len(os.Args); i++ {
98
98
-
arg := os.Args[i]
99
99
-
if strings.HasPrefix(arg, "-") {
92
92
+
if strings.HasPrefix(os.Args[i], "-") {
100
93
flagArgs = os.Args[i:]
101
94
break
102
95
}
103
103
-
detectorNames = append(detectorNames, arg)
96
96
+
detectorNames = append(detectorNames, os.Args[i])
104
97
}
105
98
106
99
if len(detectorNames) == 0 {
···
108
101
os.Exit(1)
109
102
}
110
103
111
111
-
// Parse flags
112
104
fs := flag.NewFlagSet("detector filter", flag.ExitOnError)
113
105
confidence := fs.Float64("confidence", 0.90, "minimum confidence")
114
106
fs.Parse(flagArgs)
115
107
116
116
-
// Setup registry
117
117
-
registry := detector.DefaultRegistry()
118
118
-
119
119
-
// Track script detectors for cleanup
120
120
-
var scriptDetectors []*detector.ScriptDetector
121
121
-
defer func() {
122
122
-
for _, sd := range scriptDetectors {
123
123
-
sd.Close()
124
124
-
}
125
125
-
}()
126
126
-
127
127
-
// Handle "all" keyword
128
128
-
if len(detectorNames) == 1 && detectorNames[0] == "all" {
129
129
-
detectorNames = registry.Names()
130
130
-
fmt.Fprintf(os.Stderr, "Using all detectors: %s\n", strings.Join(detectorNames, ", "))
131
131
-
}
132
132
-
133
133
-
// Get all detectors
134
134
-
detectors := make([]detector.Detector, 0, len(detectorNames))
135
135
-
for _, name := range detectorNames {
136
136
-
// Check if it's a .js file (script detector)
137
137
-
if strings.HasSuffix(name, ".js") {
138
138
-
scriptDetector, err := detector.NewScriptDetector(name)
139
139
-
if err != nil {
140
140
-
fmt.Fprintf(os.Stderr, "Error loading script %s: %v\n", name, err)
141
141
-
os.Exit(1)
142
142
-
}
143
143
-
scriptDetectors = append(scriptDetectors, scriptDetector)
144
144
-
registry.Register(scriptDetector)
145
145
-
detectors = append(detectors, scriptDetector)
146
146
-
fmt.Fprintf(os.Stderr, "✓ Started detector server: %s\n", scriptDetector.Name())
147
147
-
} else {
148
148
-
d, err := registry.Get(name)
149
149
-
if err != nil {
150
150
-
fmt.Fprintf(os.Stderr, "Error: %v\n", err)
151
151
-
os.Exit(1)
152
152
-
}
153
153
-
detectors = append(detectors, d)
154
154
-
}
108
108
+
// Load detectors (common logic)
109
109
+
setup, err := parseAndLoadDetectors(detectorNames, *confidence)
110
110
+
if err != nil {
111
111
+
fmt.Fprintf(os.Stderr, "Error: %v\n", err)
112
112
+
os.Exit(1)
155
113
}
114
114
+
defer setup.cleanup()
156
115
157
157
-
// Log to stderr
158
158
-
fmt.Fprintf(os.Stderr, "Filtering OUT spam with %d detector(s)\n", len(detectors))
159
159
-
if len(detectorNames) <= 5 {
160
160
-
fmt.Fprintf(os.Stderr, "Detectors: %s\n", strings.Join(detectorNames, ", "))
161
161
-
}
162
162
-
fmt.Fprintf(os.Stderr, "Min confidence: %.2f\n\n", *confidence)
116
116
+
fmt.Fprintf(os.Stderr, "Filtering with %d detector(s), min confidence: %.2f\n\n", len(setup.detectors), *confidence)
163
117
164
118
ctx := context.Background()
165
119
scanner := bufio.NewScanner(os.Stdin)
166
166
-
167
167
-
// Set large buffer for long lines
168
120
buf := make([]byte, 0, 64*1024)
169
121
scanner.Buffer(buf, 1024*1024)
170
122
171
171
-
cleanCount := 0
172
172
-
filteredCount := 0
173
173
-
totalCount := 0
174
174
-
totalBytes := int64(0)
175
175
-
filteredBytes := int64(0)
123
123
+
cleanCount, filteredCount, totalCount := 0, 0, 0
124
124
+
totalBytes, filteredBytes := int64(0), int64(0)
176
125
177
177
-
// Read JSONL from stdin
178
126
for scanner.Scan() {
179
127
line := scanner.Bytes()
180
128
if len(line) == 0 {
···
182
130
}
183
131
184
132
totalCount++
185
185
-
opSize := int64(len(line))
186
186
-
totalBytes += opSize
133
133
+
totalBytes += int64(len(line))
187
134
188
188
-
// Parse operation
189
135
var op plc.PLCOperation
190
136
if err := json.Unmarshal(line, &op); err != nil {
191
191
-
fmt.Fprintf(os.Stderr, "Warning: failed to parse line %d: %v\n", totalCount, err)
192
137
continue
193
138
}
194
139
195
195
-
// Run all detectors on this operation
196
196
-
isSpam := false
197
197
-
198
198
-
for _, det := range detectors {
199
199
-
match, err := det.Detect(ctx, op)
200
200
-
if err != nil {
201
201
-
continue
202
202
-
}
203
203
-
204
204
-
if match != nil && match.Confidence >= *confidence {
205
205
-
// Detected as spam - filter it out
206
206
-
isSpam = true
207
207
-
break
208
208
-
}
209
209
-
}
140
140
+
// Run detection (common logic)
141
141
+
labels, _ := detectOperation(ctx, setup.detectors, op, setup.confidence)
210
142
211
211
-
// Output only if NOT spam (clean operation)
212
212
-
if !isSpam {
143
143
+
if len(labels) == 0 {
213
144
cleanCount++
214
145
fmt.Println(string(line))
215
146
} else {
216
147
filteredCount++
217
217
-
filteredBytes += opSize
148
148
+
filteredBytes += int64(len(line))
218
149
}
219
150
220
220
-
// Progress to stderr
221
151
if totalCount%1000 == 0 {
222
222
-
fmt.Fprintf(os.Stderr, "Processed: %d | Clean: %d | Filtered: %d | Saved: %s\r",
223
223
-
totalCount, cleanCount, filteredCount, formatBytes(filteredBytes))
152
152
+
fmt.Fprintf(os.Stderr, "Processed: %d | Clean: %d | Filtered: %d\r", totalCount, cleanCount, filteredCount)
224
153
}
225
154
}
226
155
227
227
-
if err := scanner.Err(); err != nil {
228
228
-
fmt.Fprintf(os.Stderr, "\nError reading stdin: %v\n", err)
229
229
-
os.Exit(1)
230
230
-
}
231
231
-
232
232
-
// Final stats to stderr
233
233
-
fmt.Fprintf(os.Stderr, "\n\n")
234
234
-
fmt.Fprintf(os.Stderr, "✓ Filter complete\n")
235
235
-
fmt.Fprintf(os.Stderr, " Total operations: %d\n", totalCount)
236
236
-
fmt.Fprintf(os.Stderr, " Clean: %d (%.2f%%)\n", cleanCount, float64(cleanCount)/float64(totalCount)*100)
237
237
-
fmt.Fprintf(os.Stderr, " Filtered out: %d (%.2f%%)\n", filteredCount, float64(filteredCount)/float64(totalCount)*100)
238
238
-
fmt.Fprintf(os.Stderr, "\n")
239
239
-
fmt.Fprintf(os.Stderr, " Total size: %s\n", formatBytes(totalBytes))
240
240
-
fmt.Fprintf(os.Stderr, " Filtered size: %s (%.2f%%)\n", formatBytes(filteredBytes), float64(filteredBytes)/float64(totalBytes)*100)
241
241
-
fmt.Fprintf(os.Stderr, " Clean size: %s (%.2f%%)\n", formatBytes(totalBytes-filteredBytes), float64(totalBytes-filteredBytes)/float64(totalBytes)*100)
242
242
-
fmt.Fprintf(os.Stderr, "\n")
243
243
-
fmt.Fprintf(os.Stderr, " Detectors used: %d\n", len(detectors))
156
156
+
// Stats
157
157
+
fmt.Fprintf(os.Stderr, "\n\n✓ Filter complete\n")
158
158
+
fmt.Fprintf(os.Stderr, " Total: %d | Clean: %d (%.2f%%) | Filtered: %d (%.2f%%)\n",
159
159
+
totalCount, cleanCount, float64(cleanCount)/float64(totalCount)*100,
160
160
+
filteredCount, float64(filteredCount)/float64(totalCount)*100)
161
161
+
fmt.Fprintf(os.Stderr, " Size saved: %s (%.2f%%)\n", formatBytes(filteredBytes), float64(filteredBytes)/float64(totalBytes)*100)
244
162
}
245
163
246
164
func cmdDetectorList() {
···
371
289
func cmdDetectorRun() {
372
290
if len(os.Args) < 4 {
373
291
fmt.Fprintf(os.Stderr, "Usage: plcbundle detector run <detector1|script.js> [detector2...] [--bundles 1-100]\n")
374
374
-
fmt.Fprintf(os.Stderr, "\nExamples:\n")
375
375
-
fmt.Fprintf(os.Stderr, " plcbundle detector run invalid_handle --bundles 1-100\n")
376
376
-
fmt.Fprintf(os.Stderr, " plcbundle detector run invalid_handle aka_spam --bundles 1-100\n")
377
377
-
fmt.Fprintf(os.Stderr, " plcbundle detector run ./my_detector.js --bundles 1-100\n")
378
378
-
fmt.Fprintf(os.Stderr, " plcbundle detector run all # runs on all bundles\n")
379
292
os.Exit(1)
380
293
}
381
294
382
382
-
// Manually separate detector names from flags
295
295
+
// Parse detector names and flags
383
296
var detectorNames []string
384
297
var flagArgs []string
385
385
-
386
298
for i := 3; i < len(os.Args); i++ {
387
387
-
arg := os.Args[i]
388
388
-
if strings.HasPrefix(arg, "-") {
299
299
+
if strings.HasPrefix(os.Args[i], "-") {
389
300
flagArgs = os.Args[i:]
390
301
break
391
302
}
392
392
-
detectorNames = append(detectorNames, arg)
303
303
+
detectorNames = append(detectorNames, os.Args[i])
393
304
}
394
305
395
306
if len(detectorNames) == 0 {
···
397
308
os.Exit(1)
398
309
}
399
310
400
400
-
// Parse flags
401
311
fs := flag.NewFlagSet("detector run", flag.ExitOnError)
402
402
-
bundleRange := fs.String("bundles", "", "bundle range (e.g., '1-100'), default: all bundles")
312
312
+
bundleRange := fs.String("bundles", "", "bundle range, default: all bundles")
403
313
confidence := fs.Float64("confidence", 0.90, "minimum confidence")
404
314
fs.Parse(flagArgs)
405
315
406
406
-
// Load manager (needed to determine bundle range)
316
316
+
// Load manager
407
317
mgr, _, err := getManager("")
408
318
if err != nil {
409
319
fmt.Fprintf(os.Stderr, "Error: %v\n", err)
···
414
324
// Determine bundle range
415
325
var start, end int
416
326
if *bundleRange == "" {
417
417
-
// Default to all bundles
418
327
index := mgr.GetIndex()
419
328
bundles := index.GetBundles()
420
329
if len(bundles) == 0 {
···
423
332
}
424
333
start = bundles[0].BundleNumber
425
334
end = bundles[len(bundles)-1].BundleNumber
426
426
-
fmt.Fprintf(os.Stderr, "No --bundles specified, using all available bundles: %d-%d\n", start, end)
335
335
+
fmt.Fprintf(os.Stderr, "Using all bundles: %d-%d\n", start, end)
427
336
} else {
428
428
-
// Parse provided range
429
337
start, end, err = parseBundleRange(*bundleRange)
430
338
if err != nil {
431
339
fmt.Fprintf(os.Stderr, "Error: %v\n", err)
···
433
341
}
434
342
}
435
343
436
436
-
// Setup registry
437
437
-
registry := detector.DefaultRegistry()
438
438
-
config := detector.DefaultConfig()
439
439
-
config.MinConfidence = *confidence
440
440
-
441
441
-
// Track script detectors for cleanup
442
442
-
var scriptDetectors []*detector.ScriptDetector
443
443
-
defer func() {
444
444
-
for _, sd := range scriptDetectors {
445
445
-
sd.Close()
446
446
-
}
447
447
-
}()
448
448
-
449
449
-
// Handle "all" keyword - expand to all available detectors
450
450
-
if len(detectorNames) == 1 && detectorNames[0] == "all" {
451
451
-
detectorNames = registry.Names()
452
452
-
fmt.Fprintf(os.Stderr, "Using all available detectors: %s\n", strings.Join(detectorNames, ", "))
344
344
+
// Load detectors (common logic)
345
345
+
setup, err := parseAndLoadDetectors(detectorNames, *confidence)
346
346
+
if err != nil {
347
347
+
fmt.Fprintf(os.Stderr, "Error: %v\n", err)
348
348
+
os.Exit(1)
453
349
}
350
350
+
defer setup.cleanup()
454
351
455
455
-
// Load detectors (built-in or scripts)
456
456
-
detectors := make([]detector.Detector, 0, len(detectorNames))
457
457
-
for _, name := range detectorNames {
458
458
-
// Check if it's a .js file (script detector)
459
459
-
if strings.HasSuffix(name, ".js") {
460
460
-
scriptDetector, err := detector.NewScriptDetector(name)
461
461
-
if err != nil {
462
462
-
fmt.Fprintf(os.Stderr, "Error loading script %s: %v\n", name, err)
463
463
-
os.Exit(1)
464
464
-
}
465
465
-
// Track for cleanup
466
466
-
scriptDetectors = append(scriptDetectors, scriptDetector)
467
467
-
// Register it so it can be used
468
468
-
registry.Register(scriptDetector)
469
469
-
detectors = append(detectors, scriptDetector)
470
470
-
fmt.Fprintf(os.Stderr, "✓ Started detector server: %s\n", scriptDetector.Name())
471
471
-
} else {
472
472
-
// Try to get built-in detector
473
473
-
d, err := registry.Get(name)
474
474
-
if err != nil {
475
475
-
fmt.Fprintf(os.Stderr, "Error: %v\n", err)
476
476
-
os.Exit(1)
477
477
-
}
478
478
-
detectors = append(detectors, d)
479
479
-
}
480
480
-
}
481
481
-
482
482
-
// Log to stderr
483
483
-
fmt.Fprintf(os.Stderr, "Running %d detector(s) on bundles %d-%d...\n", len(detectors), start, end)
484
484
-
if len(detectorNames) <= 5 {
485
485
-
fmt.Fprintf(os.Stderr, "Detectors: %s\n", strings.Join(detectorNames, ", "))
486
486
-
}
352
352
+
fmt.Fprintf(os.Stderr, "Running %d detector(s) on bundles %d-%d\n", len(setup.detectors), start, end)
487
353
fmt.Fprintf(os.Stderr, "Min confidence: %.2f\n\n", *confidence)
488
354
489
355
ctx := context.Background()
490
490
-
491
491
-
// Write CSV header to stdout
492
356
fmt.Println("bundle,position,cid,size,confidence,labels")
493
357
494
494
-
// Track statistics
495
495
-
totalOps := 0
496
496
-
matchCount := 0
497
497
-
totalBytes := int64(0)
498
498
-
matchedBytes := int64(0)
499
499
-
bundlesProcessed := 0
500
500
-
detectorMatchCounts := make(map[string]int)
501
501
-
358
358
+
// Stats
359
359
+
totalOps, matchCount := 0, 0
360
360
+
totalBytes, matchedBytes := int64(0), int64(0)
502
361
totalBundles := end - start + 1
503
503
-
504
504
-
// Create progress bar with byte tracking enabled
505
505
-
fmt.Fprintf(os.Stderr, "Processing bundles:\n")
506
362
progress := NewProgressBar(totalBundles)
507
363
progress.showBytes = true
508
364
509
509
-
// Process bundles and stream results
365
365
+
// Process bundles
510
366
for bundleNum := start; bundleNum <= end; bundleNum++ {
511
367
bundle, err := mgr.LoadBundle(ctx, bundleNum)
512
368
if err != nil {
513
513
-
progress.Finish()
514
514
-
fmt.Fprintf(os.Stderr, "\n⚠️ Warning: failed to load bundle %d: %v\n", bundleNum, err)
515
515
-
progress = NewProgressBar(totalBundles)
516
516
-
progress.showBytes = true
517
517
-
progress.SetWithBytes(bundleNum-start, totalBytes)
518
369
continue
519
370
}
520
371
521
521
-
bundlesProcessed++
522
372
totalOps += len(bundle.Operations)
523
373
524
524
-
// Process each operation with all detectors
525
374
for position, op := range bundle.Operations {
526
526
-
// Calculate operation size
527
527
-
var opSize int
528
528
-
if len(op.RawJSON) > 0 {
529
529
-
opSize = len(op.RawJSON)
530
530
-
} else {
375
375
+
opSize := len(op.RawJSON)
376
376
+
if opSize == 0 {
531
377
data, _ := json.Marshal(op)
532
378
opSize = len(data)
533
379
}
534
380
totalBytes += int64(opSize)
535
381
536
536
-
// Collect all matches for this operation
537
537
-
var matchedLabels []string
538
538
-
var maxConfidence float64
539
539
-
540
540
-
// Run all detectors on this operation
541
541
-
for _, det := range detectors {
542
542
-
match, err := det.Detect(ctx, op)
543
543
-
if err != nil {
544
544
-
continue
545
545
-
}
546
546
-
547
547
-
// Skip if no match or confidence too low
548
548
-
if match == nil || match.Confidence < *confidence {
549
549
-
continue
550
550
-
}
551
551
-
552
552
-
// Extract labels from match metadata
553
553
-
var labels []string
554
554
-
if labelList, ok := match.Metadata["labels"].([]string); ok {
555
555
-
labels = labelList
556
556
-
} else if labelList, ok := match.Metadata["labels"].([]interface{}); ok {
557
557
-
for _, l := range labelList {
558
558
-
if str, ok := l.(string); ok {
559
559
-
labels = append(labels, str)
560
560
-
}
561
561
-
}
562
562
-
}
563
563
-
564
564
-
// If no labels in metadata, use detector name
565
565
-
if len(labels) == 0 {
566
566
-
labels = []string{det.Name()}
567
567
-
}
382
382
+
// Run detection (common logic)
383
383
+
labels, confidence := detectOperation(ctx, setup.detectors, op, setup.confidence)
568
384
569
569
-
// Collect all labels
570
570
-
matchedLabels = append(matchedLabels, labels...)
571
571
-
detectorMatchCounts[det.Name()]++
572
572
-
573
573
-
// Track highest confidence
574
574
-
if match.Confidence > maxConfidence {
575
575
-
maxConfidence = match.Confidence
576
576
-
}
577
577
-
}
578
578
-
579
579
-
// Output only if at least one detector matched
580
580
-
if len(matchedLabels) > 0 {
385
385
+
if len(labels) > 0 {
581
386
matchCount++
582
387
matchedBytes += int64(opSize)
583
388
584
584
-
// Extract last 4 chars of CID
585
389
cidShort := op.CID
586
390
if len(cidShort) > 4 {
587
391
cidShort = cidShort[len(cidShort)-4:]
588
392
}
589
393
590
394
fmt.Printf("%d,%d,%s,%d,%.2f,%s\n",
591
591
-
bundleNum,
592
592
-
position,
593
593
-
cidShort,
594
594
-
opSize,
595
595
-
maxConfidence,
596
596
-
strings.Join(matchedLabels, ";"),
597
597
-
)
395
395
+
bundleNum, position, cidShort, opSize, confidence, strings.Join(labels, ";"))
598
396
}
599
397
}
600
398
601
601
-
// Update progress with bytes
602
399
progress.SetWithBytes(bundleNum-start+1, totalBytes)
603
400
}
604
401
605
605
-
// Finish progress bar
606
402
progress.Finish()
607
403
608
608
-
// Final stats to stderr
609
609
-
fmt.Fprintf(os.Stderr, "\n")
610
610
-
fmt.Fprintf(os.Stderr, "✓ Detection complete\n")
611
611
-
fmt.Fprintf(os.Stderr, " Bundles processed: %d\n", bundlesProcessed)
404
404
+
// Stats
405
405
+
fmt.Fprintf(os.Stderr, "\n✓ Detection complete\n")
612
406
fmt.Fprintf(os.Stderr, " Total operations: %d\n", totalOps)
613
407
fmt.Fprintf(os.Stderr, " Matches found: %d (%.2f%%)\n", matchCount, float64(matchCount)/float64(totalOps)*100)
614
614
-
fmt.Fprintf(os.Stderr, " Clean operations: %d (%.2f%%)\n", totalOps-matchCount, float64(totalOps-matchCount)/float64(totalOps)*100)
615
615
-
fmt.Fprintf(os.Stderr, "\n")
616
408
fmt.Fprintf(os.Stderr, " Total size: %s\n", formatBytes(totalBytes))
617
409
fmt.Fprintf(os.Stderr, " Matched size: %s (%.2f%%)\n", formatBytes(matchedBytes), float64(matchedBytes)/float64(totalBytes)*100)
618
618
-
fmt.Fprintf(os.Stderr, " Clean size: %s (%.2f%%)\n", formatBytes(totalBytes-matchedBytes), float64(totalBytes-matchedBytes)/float64(totalBytes)*100)
619
619
-
620
620
-
if matchedBytes > 0 {
621
621
-
fmt.Fprintf(os.Stderr, "\n")
622
622
-
fmt.Fprintf(os.Stderr, " 💾 Potential savings if filtered: %s (%.2f%% reduction)\n",
623
623
-
formatBytes(matchedBytes),
624
624
-
float64(matchedBytes)/float64(totalBytes)*100)
625
625
-
}
626
626
-
627
627
-
fmt.Fprintf(os.Stderr, "\n")
628
628
-
fmt.Fprintf(os.Stderr, " Detectors used: %d\n", len(detectors))
629
629
-
630
630
-
// Show breakdown by detector if multiple used
631
631
-
if len(detectors) > 1 {
632
632
-
fmt.Fprintf(os.Stderr, "\n")
633
633
-
fmt.Fprintf(os.Stderr, " Matches by detector:\n")
634
634
-
for _, det := range detectors {
635
635
-
name := det.Name()
636
636
-
count := detectorMatchCounts[name]
637
637
-
if count > 0 {
638
638
-
pct := float64(count) / float64(matchCount) * 100
639
639
-
fmt.Fprintf(os.Stderr, " %-20s %d (%.1f%%)\n", name, count, pct)
640
640
-
} else {
641
641
-
fmt.Fprintf(os.Stderr, " %-20s 0\n", name)
642
642
-
}
643
643
-
}
644
644
-
}
645
410
}
646
411
647
412
func cmdDetectorInfo() {
···
709
474
710
475
return start, end, nil
711
476
}
477
477
+
478
478
+
// Common detector setup
479
479
+
type detectorSetup struct {
480
480
+
detectors []detector.Detector
481
481
+
scriptDetectors []interface{ Close() error }
482
482
+
confidence float64
483
483
+
}
484
484
+
485
485
+
func (ds *detectorSetup) cleanup() {
486
486
+
for _, sd := range ds.scriptDetectors {
487
487
+
sd.Close()
488
488
+
}
489
489
+
}
490
490
+
491
491
+
// parseAndLoadDetectors handles common detector loading logic
492
492
+
func parseAndLoadDetectors(detectorNames []string, confidence float64) (*detectorSetup, error) {
493
493
+
registry := detector.DefaultRegistry()
494
494
+
495
495
+
if len(detectorNames) == 1 && detectorNames[0] == "all" {
496
496
+
detectorNames = registry.Names()
497
497
+
fmt.Fprintf(os.Stderr, "Using all detectors: %s\n", strings.Join(detectorNames, ", "))
498
498
+
}
499
499
+
500
500
+
setup := &detectorSetup{
501
501
+
detectors: make([]detector.Detector, 0, len(detectorNames)),
502
502
+
scriptDetectors: make([]interface{ Close() error }, 0),
503
503
+
confidence: confidence,
504
504
+
}
505
505
+
506
506
+
for _, name := range detectorNames {
507
507
+
if strings.HasSuffix(name, ".js") {
508
508
+
sd, err := detector.NewScriptDetector(name) // Simple single process
509
509
+
if err != nil {
510
510
+
setup.cleanup()
511
511
+
return nil, fmt.Errorf("error loading script %s: %w", name, err)
512
512
+
}
513
513
+
setup.scriptDetectors = append(setup.scriptDetectors, sd)
514
514
+
registry.Register(sd)
515
515
+
setup.detectors = append(setup.detectors, sd)
516
516
+
fmt.Fprintf(os.Stderr, "✓ Started detector server: %s\n", sd.Name())
517
517
+
} else {
518
518
+
d, err := registry.Get(name)
519
519
+
if err != nil {
520
520
+
setup.cleanup()
521
521
+
return nil, err
522
522
+
}
523
523
+
setup.detectors = append(setup.detectors, d)
524
524
+
}
525
525
+
}
526
526
+
527
527
+
return setup, nil
528
528
+
}
529
529
+
530
530
+
// detectOperation runs all detectors on an operation and returns labels + confidence
531
531
+
func detectOperation(ctx context.Context, detectors []detector.Detector, op plc.PLCOperation, minConfidence float64) ([]string, float64) {
532
532
+
var matchedLabels []string
533
533
+
var maxConfidence float64
534
534
+
535
535
+
for _, det := range detectors {
536
536
+
match, err := det.Detect(ctx, op)
537
537
+
if err != nil || match == nil || match.Confidence < minConfidence {
538
538
+
continue
539
539
+
}
540
540
+
541
541
+
// Extract labels
542
542
+
var labels []string
543
543
+
if labelList, ok := match.Metadata["labels"].([]string); ok {
544
544
+
labels = labelList
545
545
+
} else if labelList, ok := match.Metadata["labels"].([]interface{}); ok {
546
546
+
for _, l := range labelList {
547
547
+
if str, ok := l.(string); ok {
548
548
+
labels = append(labels, str)
549
549
+
}
550
550
+
}
551
551
+
}
552
552
+
553
553
+
if len(labels) == 0 {
554
554
+
labels = []string{det.Name()}
555
555
+
}
556
556
+
557
557
+
matchedLabels = append(matchedLabels, labels...)
558
558
+
if match.Confidence > maxConfidence {
559
559
+
maxConfidence = match.Confidence
560
560
+
}
561
561
+
}
562
562
+
563
563
+
return matchedLabels, maxConfidence
564
564
+
}
+18
detector/builtin.go
···
9
9
"tangled.org/atscan.net/plcbundle/plc"
10
10
)
11
11
12
12
+
// NoOpDetector is an empty detector for speed testing
13
13
+
type NoOpDetector struct{}
14
14
+
15
15
+
func NewNoOpDetector() *NoOpDetector {
16
16
+
return &NoOpDetector{}
17
17
+
}
18
18
+
19
19
+
func (d *NoOpDetector) Name() string { return "noop" }
20
20
+
func (d *NoOpDetector) Description() string {
21
21
+
return "Empty detector for benchmarking (always returns no match)"
22
22
+
}
23
23
+
func (d *NoOpDetector) Version() string { return "1.0.0" }
24
24
+
25
25
+
func (d *NoOpDetector) Detect(ctx context.Context, op plc.PLCOperation) (*Match, error) {
26
26
+
// Instant return - no work done
27
27
+
return nil, nil
28
28
+
}
29
29
+
12
30
// InvalidHandleDetector detects operations with invalid handle patterns
13
31
type InvalidHandleDetector struct {
14
32
// Valid handle regex based on AT Protocol handle specification
+2
detector/registry.go
···
76
76
func DefaultRegistry() *Registry {
77
77
r := NewRegistry()
78
78
79
79
+
r.Register(NewNoOpDetector())
80
80
+
79
81
// Register real spam detectors
80
82
r.Register(NewInvalidHandleDetector())
81
83
r.Register(NewAlsoKnownAsSpamDetector())
+17
-50
detector/script.go
···
28
28
}
29
29
30
30
func NewScriptDetector(scriptPath string) (*ScriptDetector, error) {
31
31
-
// Verify bun is available
32
31
if _, err := exec.LookPath("bun"); err != nil {
33
32
return nil, fmt.Errorf("bun runtime not found in PATH")
34
33
}
···
38
37
return nil, fmt.Errorf("invalid script path: %w", err)
39
38
}
40
39
41
41
-
// Use filename as detector name
42
40
name := strings.TrimSuffix(filepath.Base(scriptPath), filepath.Ext(scriptPath))
43
43
-
44
44
-
// Create unique socket path
45
41
socketPath := filepath.Join(os.TempDir(), fmt.Sprintf("detector-%d-%s.sock", os.Getpid(), name))
46
42
47
43
sd := &ScriptDetector{
···
50
46
socketPath: socketPath,
51
47
}
52
48
53
53
-
// Start the server
54
49
if err := sd.startServer(); err != nil {
55
50
return nil, err
56
51
}
···
63
58
func (d *ScriptDetector) Version() string { return "1.0.0" }
64
59
65
60
func (d *ScriptDetector) startServer() error {
66
66
-
// Read user's script
67
61
userCode, err := os.ReadFile(d.scriptPath)
68
62
if err != nil {
69
63
return fmt.Errorf("failed to read script: %w", err)
70
64
}
71
65
72
72
-
// Create wrapper script with socket server
73
66
wrapperScript := d.createSocketWrapper(string(userCode))
74
74
-
75
75
-
// Remove old socket if exists
76
67
os.Remove(d.socketPath)
77
68
78
78
-
// Start Bun server with script piped to stdin
79
69
d.serverCmd = exec.Command("bun", "run", "-", d.socketPath)
80
70
d.serverCmd.Stdout = os.Stderr
81
71
d.serverCmd.Stderr = os.Stderr
82
72
83
83
-
// Get stdin pipe
84
73
stdin, err := d.serverCmd.StdinPipe()
85
74
if err != nil {
86
75
return fmt.Errorf("failed to create stdin pipe: %w", err)
87
76
}
88
77
89
89
-
// Start the process
90
78
if err := d.serverCmd.Start(); err != nil {
91
79
return fmt.Errorf("failed to start server: %w", err)
92
80
}
93
81
94
94
-
// Write wrapper script to stdin
95
95
-
if _, err := stdin.Write([]byte(wrapperScript)); err != nil {
96
96
-
d.serverCmd.Process.Kill()
97
97
-
return fmt.Errorf("failed to write script: %w", err)
98
98
-
}
82
82
+
stdin.Write([]byte(wrapperScript))
99
83
stdin.Close()
100
84
101
101
-
// Wait for socket to be ready and connect
102
85
if err := d.connectToServer(); err != nil {
103
86
d.serverCmd.Process.Kill()
104
87
os.Remove(d.socketPath)
···
116
99
// Unix socket server
117
100
const socketPath = process.argv[2];
118
101
119
119
-
// Clean up old socket
120
102
try {
121
103
await Bun.file(socketPath).unlink();
122
104
} catch {}
···
125
107
unix: socketPath,
126
108
socket: {
127
109
data(socket, data) {
128
128
-
const lines = data.toString().split('\n').filter(line => line.trim());
129
129
-
130
130
-
for (const line of lines) {
131
131
-
try {
132
132
-
const operation = JSON.parse(line);
133
133
-
const labels = detect({ op: operation }) || [];
134
134
-
socket.write(JSON.stringify({ labels }) + '\n');
135
135
-
} catch (error) {
136
136
-
socket.write(JSON.stringify({ labels: [], error: error.message }) + '\n');
137
137
-
}
110
110
+
try {
111
111
+
const operation = JSON.parse(data.toString());
112
112
+
const labels = detect({ op: operation }) || [];
113
113
+
socket.write(JSON.stringify({ labels }) + '\n');
114
114
+
} catch (error) {
115
115
+
socket.write(JSON.stringify({ labels: [], error: error.message }) + '\n');
138
116
}
139
117
},
140
140
-
error(socket, error) {
141
141
-
console.error('Socket error:', error);
142
142
-
},
143
143
-
close(socket) {
144
144
-
console.error('Socket closed');
145
145
-
}
118
118
+
error(socket, error) {},
119
119
+
close(socket) {}
146
120
}
147
121
});
148
122
···
151
125
}
152
126
153
127
func (d *ScriptDetector) connectToServer() error {
154
154
-
// Try to connect with retries
155
128
maxRetries := 50
156
129
for i := 0; i < maxRetries; i++ {
157
130
conn, err := net.Dial("unix", d.socketPath)
···
171
144
return nil, fmt.Errorf("not connected to server")
172
145
}
173
146
174
174
-
// Serialize operation
175
175
-
data, err := json.Marshal(op)
176
176
-
if err != nil {
177
177
-
return nil, fmt.Errorf("failed to serialize operation: %w", err)
147
147
+
// Use RawJSON directly
148
148
+
data := op.RawJSON
149
149
+
if len(data) == 0 {
150
150
+
var err error
151
151
+
data, err = json.Marshal(op)
152
152
+
if err != nil {
153
153
+
return nil, fmt.Errorf("failed to serialize operation: %w", err)
154
154
+
}
178
155
}
179
156
180
180
-
// Write to socket with newline delimiter
181
157
if _, err := d.writer.Write(data); err != nil {
182
158
return nil, fmt.Errorf("failed to write to socket: %w", err)
183
159
}
···
188
164
return nil, fmt.Errorf("failed to flush: %w", err)
189
165
}
190
166
191
191
-
// Read response
192
167
line, err := d.reader.ReadString('\n')
193
168
if err != nil {
194
169
return nil, fmt.Errorf("failed to read response: %w", err)
195
170
}
196
171
197
197
-
// Parse response
198
172
var result struct {
199
173
Labels []string `json:"labels"`
200
174
Error string `json:"error,omitempty"`
···
208
182
return nil, fmt.Errorf("detector error: %s", result.Error)
209
183
}
210
184
211
211
-
// No match if no labels
212
185
if len(result.Labels) == 0 {
213
186
return nil, nil
214
187
}
215
188
216
216
-
// Convert labels to Match
217
189
return &Match{
218
190
Reason: strings.Join(result.Labels, "_"),
219
191
Category: "custom",
···
226
198
}, nil
227
199
}
228
200
229
229
-
// Close shuts down the server
230
201
func (d *ScriptDetector) Close() error {
231
231
-
// Close connection
232
202
if d.conn != nil {
233
203
d.conn.Close()
234
204
d.conn = nil
235
205
}
236
206
237
237
-
// Kill server process
238
207
if d.serverCmd != nil && d.serverCmd.Process != nil {
239
208
d.serverCmd.Process.Kill()
240
209
d.serverCmd.Wait()
241
210
}
242
211
243
243
-
// Remove socket file
244
212
os.Remove(d.socketPath)
245
245
-
246
213
return nil
247
214
}
+5
go.mod
···
5
5
require github.com/klauspost/compress v1.18.1
6
6
7
7
require github.com/gorilla/websocket v1.5.3
8
8
+
9
9
+
require (
10
10
+
github.com/vmihailenco/msgpack/v5 v5.4.1 // indirect
11
11
+
github.com/vmihailenco/tagparser/v2 v2.0.0 // indirect
12
12
+
)
+4
go.sum
···
2
2
github.com/gorilla/websocket v1.5.3/go.mod h1:YR8l580nyteQvAITg2hZ9XVh4b55+EU/adAjf1fMHhE=
3
3
github.com/klauspost/compress v1.18.1 h1:bcSGx7UbpBqMChDtsF28Lw6v/G94LPrrbMbdC3JH2co=
4
4
github.com/klauspost/compress v1.18.1/go.mod h1:ZQFFVG+MdnR0P+l6wpXgIL4NTtwiKIdBnrBd8Nrxr+0=
5
5
+
github.com/vmihailenco/msgpack/v5 v5.4.1 h1:cQriyiUvjTwOHg8QZaPihLWeRAAVoCpE00IUPn0Bjt8=
6
6
+
github.com/vmihailenco/msgpack/v5 v5.4.1/go.mod h1:GaZTsDaehaPpQVyxrf5mtQlH+pc21PIudVV/E3rRQok=
7
7
+
github.com/vmihailenco/tagparser/v2 v2.0.0 h1:y09buUbR+b5aycVFQs/g70pqKVZNBmxwAhO7/IwNM9g=
8
8
+
github.com/vmihailenco/tagparser/v2 v2.0.0/go.mod h1:Wri+At7QHww0WTrCBeu4J6bNtoV6mEfg5OIWRZA9qds=
+100
scripts/benchmark-detector.js
···
1
1
+
#!/usr/bin/env bun
2
2
+
3
3
+
// User's detect function
4
4
+
function detect({ op }) {
5
5
+
const labels = [];
6
6
+
7
7
+
if (op.did.match(/^did:plc:aa/)) {
8
8
+
labels.push('test')
9
9
+
}
10
10
+
11
11
+
return labels;
12
12
+
}
13
13
+
14
14
+
// ==========================================
15
15
+
// Pure Bun bundle processor
16
16
+
// ==========================================
17
17
+
18
18
+
import { spawn } from 'bun';
19
19
+
import { readdir } from 'fs/promises';
20
20
+
21
21
+
const BUNDLE_DIR = process.argv[2] || './';
22
22
+
const START_BUNDLE = parseInt(process.argv[3]) || 1;
23
23
+
const END_BUNDLE = parseInt(process.argv[4]) || 100;
24
24
+
25
25
+
console.error(`Processing bundles ${START_BUNDLE}-${END_BUNDLE} from ${BUNDLE_DIR}`);
26
26
+
console.error('');
27
27
+
28
28
+
// CSV header
29
29
+
console.log('bundle,position,cid,size,confidence,labels');
30
30
+
31
31
+
let totalOps = 0;
32
32
+
let matchCount = 0;
33
33
+
let totalBytes = 0;
34
34
+
let matchedBytes = 0;
35
35
+
36
36
+
const startTime = Date.now();
37
37
+
38
38
+
for (let bundleNum = START_BUNDLE; bundleNum <= END_BUNDLE; bundleNum++) {
39
39
+
const bundleFile = `${BUNDLE_DIR}/${bundleNum.toString().padStart(6, '0')}.jsonl.zst`;
40
40
+
41
41
+
try {
42
42
+
// Decompress bundle using zstd command
43
43
+
const proc = spawn(['zstd', '-d', '-c', bundleFile]);
44
44
+
const text = await new Response(proc.stdout).text();
45
45
+
46
46
+
const lines = text.split('\n').filter(line => line.trim());
47
47
+
48
48
+
for (let position = 0; position < lines.length; position++) {
49
49
+
const line = lines[position];
50
50
+
if (!line) continue;
51
51
+
52
52
+
totalOps++;
53
53
+
const opSize = line.length;
54
54
+
totalBytes += opSize;
55
55
+
56
56
+
try {
57
57
+
const op = JSON.parse(line);
58
58
+
const labels = detect({ op });
59
59
+
60
60
+
if (labels && labels.length > 0) {
61
61
+
matchCount++;
62
62
+
matchedBytes += opSize;
63
63
+
64
64
+
// Extract last 4 chars of CID
65
65
+
const cidShort = op.cid.slice(-4);
66
66
+
67
67
+
console.log(
68
68
+
`${bundleNum},${position},${cidShort},${opSize},0.95,${labels.join(';')}`
69
69
+
);
70
70
+
}
71
71
+
} catch (err) {
72
72
+
console.error(`Error parsing operation: ${err.message}`);
73
73
+
}
74
74
+
}
75
75
+
76
76
+
// Progress
77
77
+
if (bundleNum % 10 === 0) {
78
78
+
const elapsed = (Date.now() - startTime) / 1000;
79
79
+
const opsPerSec = (totalOps / elapsed).toFixed(0);
80
80
+
console.error(`Processed ${bundleNum}/${END_BUNDLE} bundles | ${totalOps} ops | ${opsPerSec} ops/sec\r`);
81
81
+
}
82
82
+
83
83
+
} catch (err) {
84
84
+
console.error(`\nError loading bundle ${bundleNum}: ${err.message}`);
85
85
+
}
86
86
+
}
87
87
+
88
88
+
const elapsed = (Date.now() - startTime) / 1000;
89
89
+
90
90
+
// Stats
91
91
+
console.error('\n');
92
92
+
console.error('✓ Detection complete');
93
93
+
console.error(` Total operations: ${totalOps}`);
94
94
+
console.error(` Matches found: ${matchCount} (${(matchCount/totalOps*100).toFixed(2)}%)`);
95
95
+
console.error(` Total size: ${(totalBytes / 1e6).toFixed(1)} MB`);
96
96
+
console.error(` Matched size: ${(matchedBytes / 1e6).toFixed(1)} MB (${(matchedBytes/totalBytes*100).toFixed(2)}%)`);
97
97
+
console.error('');
98
98
+
console.error(` Time elapsed: ${elapsed.toFixed(2)}s`);
99
99
+
console.error(` Throughput: ${(totalOps / elapsed).toFixed(0)} ops/sec`);
100
100
+
console.error(` Speed: ${(totalBytes / elapsed / 1e6).toFixed(1)} MB/sec`);
+4
-2
scripts/detector-template.js
···
3
3
// op.did - DID identifier
4
4
// op.cid - Content ID
5
5
// op.createdAt - Timestamp
6
6
-
7
7
-
console.log(op.did)
8
6
9
7
const labels = [];
10
8
11
9
// Add your detection logic here
12
10
// Return array of label strings
13
11
// Return empty array [] for no match
12
12
+
13
13
+
if (op.did.match(/^did:plc:aa/)) {
14
14
+
labels.push('test')
15
15
+
}
14
16
15
17
return labels;
16
18
}