···135135 </div>
136136 {{end}}
137137138138- <!-- Run GC button (only if there are actions to take) -->
138138+ <!-- Individual action buttons -->
139139 {{if or .Preview.OrphanedRecords .Preview.OrphanedBlobs .Preview.MissingRecords}}
140140- <div class="flex items-center gap-3 mt-6">
140140+ <div class="flex flex-wrap items-center gap-3 mt-6">
141141+ {{if .Preview.MissingRecords}}
142142+ <button class="btn btn-warning gap-2"
143143+ hx-post="/admin/api/gc/reconcile"
144144+ hx-target="#gc-results"
145145+ hx-swap="innerHTML"
146146+ hx-indicator="#gc-loading">
147147+ {{ icon "file-plus" "size-4" }}
148148+ Reconcile {{len .Preview.MissingRecords}} Records
149149+ </button>
150150+ {{end}}
151151+ {{if .Preview.OrphanedRecords}}
141152 <button class="btn btn-error gap-2"
142142- hx-post="/admin/api/gc/run"
153153+ hx-post="/admin/api/gc/delete-records"
143154 hx-target="#gc-results"
144155 hx-swap="innerHTML"
145145- hx-confirm="Are you sure you want to run garbage collection?"
156156+ hx-confirm="Delete {{len .Preview.OrphanedRecords}} orphaned layer records?"
146157 hx-indicator="#gc-loading">
147147- {{ icon "zap" "size-4" }}
148148- Run GC
158158+ {{ icon "file-x" "size-4" }}
159159+ Delete {{len .Preview.OrphanedRecords}} Orphaned Records
149160 </button>
161161+ {{end}}
162162+ {{if .Preview.OrphanedBlobs}}
163163+ <button class="btn btn-error gap-2"
164164+ hx-post="/admin/api/gc/delete-blobs"
165165+ hx-target="#gc-results"
166166+ hx-swap="innerHTML"
167167+ hx-confirm="Delete {{len .Preview.OrphanedBlobs}} orphaned blobs from S3? This cannot be undone."
168168+ hx-indicator="#gc-loading">
169169+ {{ icon "trash-2" "size-4" }}
170170+ Delete {{len .Preview.OrphanedBlobs}} Orphaned Blobs
171171+ </button>
172172+ {{end}}
150173 </div>
174174+ <p class="text-sm text-base-content/50 mt-2">Run Scan again after each operation to see updated counts.</p>
151175 {{end}}
152176153177 <!-- Nothing to clean -->
+2-2
pkg/hold/config.go
···141141 Secret string `yaml:"secret" comment:"Shared secret for scanner WebSocket auth. Empty disables scanning."`
142142143143 // Minimum interval between re-scans of the same manifest. 0 disables proactive scanning.
144144- RescanInterval time.Duration `yaml:"rescan_interval" comment:"Minimum interval between re-scans of the same manifest. When set, the hold proactively scans manifests when the scanner is idle. Default: 24h. Set to 0 to disable."`
144144+ RescanInterval time.Duration `yaml:"rescan_interval" comment:"Minimum interval between re-scans of the same manifest. When set, the hold proactively scans manifests when the scanner is idle. Default: 168h (7 days). Set to 0 to disable."`
145145}
146146147147// DatabaseConfig defines embedded PDS database settings
···223223 v.SetDefault("gc.enabled", false)
224224 // Scanner defaults
225225 v.SetDefault("scanner.secret", "")
226226- v.SetDefault("scanner.rescan_interval", "24h")
226226+ v.SetDefault("scanner.rescan_interval", "168h") // 7 days
227227228228 // Log shipper defaults
229229 v.SetDefault("log_shipper.batch_size", 100)
+112
pkg/hold/gc/gc.go
···296296 return preview, nil
297297}
298298299299+// Reconcile creates missing layer records without deleting anything.
300300+// Requires a prior Preview() to identify missing records.
301301+func (gc *GarbageCollector) Reconcile(ctx context.Context) (*GCResult, error) {
302302+ if !gc.tryStart() {
303303+ return nil, fmt.Errorf("GC operation already in progress")
304304+ }
305305+ defer gc.finish()
306306+307307+ gc.mu.Lock()
308308+ preview := gc.lastPreview
309309+ gc.mu.Unlock()
310310+311311+ if preview == nil {
312312+ return nil, fmt.Errorf("no preview available — run Scan first")
313313+ }
314314+ if len(preview.MissingRecords) == 0 {
315315+ return &GCResult{}, nil
316316+ }
317317+318318+ start := time.Now()
319319+ result := &GCResult{}
320320+321321+ gc.logger.Info("Starting reconciliation", "missingRecords", len(preview.MissingRecords))
322322+ gc.reconcileMissingRecords(ctx, preview.MissingRecords, result)
323323+ result.Duration = time.Since(start)
324324+325325+ gc.mu.Lock()
326326+ gc.lastResult = result
327327+ gc.lastResultAt = time.Now()
328328+ gc.mu.Unlock()
329329+330330+ return result, nil
331331+}
332332+333333+// DeleteOrphanedRecords deletes layer records whose manifests no longer exist.
334334+// Requires a prior Preview() to identify orphaned records.
335335+func (gc *GarbageCollector) DeleteOrphanedRecords(ctx context.Context) (*GCResult, error) {
336336+ if !gc.tryStart() {
337337+ return nil, fmt.Errorf("GC operation already in progress")
338338+ }
339339+ defer gc.finish()
340340+341341+ gc.mu.Lock()
342342+ preview := gc.lastPreview
343343+ gc.mu.Unlock()
344344+345345+ if preview == nil {
346346+ return nil, fmt.Errorf("no preview available — run Scan first")
347347+ }
348348+ if len(preview.OrphanedRecords) == 0 {
349349+ return &GCResult{}, nil
350350+ }
351351+352352+ start := time.Now()
353353+ result := &GCResult{
354354+ OrphanedRecords: int64(len(preview.OrphanedRecords)),
355355+ }
356356+357357+ rkeys := make([]string, len(preview.OrphanedRecords))
358358+ for i, r := range preview.OrphanedRecords {
359359+ rkeys[i] = r.Rkey
360360+ }
361361+362362+ gc.logger.Info("Deleting orphaned records", "count", len(rkeys))
363363+ if err := gc.deleteOrphanedRecords(ctx, rkeys, result); err != nil {
364364+ return nil, fmt.Errorf("delete orphaned records: %w", err)
365365+ }
366366+ result.Duration = time.Since(start)
367367+368368+ gc.mu.Lock()
369369+ gc.lastResult = result
370370+ gc.lastResultAt = time.Now()
371371+ gc.mu.Unlock()
372372+373373+ return result, nil
374374+}
375375+376376+// DeleteOrphanedBlobs walks S3 and deletes blobs not referenced by any manifest.
377377+// Runs a fresh analysis to build the current referenced set (reflects any reconciliation
378378+// done since the last preview).
379379+func (gc *GarbageCollector) DeleteOrphanedBlobs(ctx context.Context) (*GCResult, error) {
380380+ if !gc.tryStart() {
381381+ return nil, fmt.Errorf("GC operation already in progress")
382382+ }
383383+ defer gc.finish()
384384+385385+ start := time.Now()
386386+ result := &GCResult{}
387387+388388+ gc.logger.Info("Starting orphaned blob deletion (fresh analysis)")
389389+390390+ // Fresh analysis so the referenced set includes any records reconciled since preview
391391+ analysis, err := gc.analyzeRecords(ctx)
392392+ if err != nil {
393393+ return nil, fmt.Errorf("analyze records: %w", err)
394394+ }
395395+396396+ result.ReferencedBlobs = int64(len(analysis.referenced))
397397+398398+ if err := gc.deleteOrphanedBlobs(ctx, analysis.referenced, result); err != nil {
399399+ return nil, fmt.Errorf("delete orphaned blobs: %w", err)
400400+ }
401401+ result.Duration = time.Since(start)
402402+403403+ gc.mu.Lock()
404404+ gc.lastResult = result
405405+ gc.lastResultAt = time.Now()
406406+ gc.mu.Unlock()
407407+408408+ return result, nil
409409+}
410410+299411// analyzeRecords performs Phase 1 analysis: builds referenced set, finds orphaned records,
300412// and identifies missing layer records. Pure analysis — no mutations.
301413// Discovers users, fetches manifests, scans records, identifies missing records.
+2-2
pkg/hold/oci/xrpc.go
···380380 }
381381 }
382382383383- // Enqueue scan job if scanner is connected
384384- if h.scanBroadcaster != nil {
383383+ // Enqueue scan job if scanner is connected (skip manifest lists — children get their own jobs)
384384+ if h.scanBroadcaster != nil && !isMultiArch {
385385 tier := "deckhand"
386386 if stats != nil && stats.Tier != "" {
387387 tier = stats.Tier
+146-48
pkg/hold/pds/scan_broadcaster.go
···551551 "total", msg.Summary.Total)
552552}
553553554554-// handleError marks a job as failed
554554+// handleError marks a job as failed and creates a scan record so the proactive
555555+// scanner treats it as "stale" rather than "never scanned" (avoids retry loops).
555556func (sb *ScanBroadcaster) handleError(sub *ScanSubscriber, msg ScannerMessage) {
556556- _, err := sb.db.Exec(`
557557+ ctx := context.Background()
558558+559559+ // Get job details to create failure scan record
560560+ var manifestDigest, repository, userDID string
561561+ err := sb.db.QueryRow(`
562562+ SELECT manifest_digest, repository, user_did
563563+ FROM scan_jobs WHERE seq = ?
564564+ `, msg.Seq).Scan(&manifestDigest, &repository, &userDID)
565565+ if err != nil {
566566+ slog.Error("Failed to get job details for failure record",
567567+ "seq", msg.Seq, "error", err)
568568+ } else {
569569+ // Create a scan record with zero counts and nil blobs — marks it as
570570+ // "scanned" so the proactive scheduler won't retry until rescan interval
571571+ scanRecord := atproto.NewScanRecord(
572572+ manifestDigest, repository, userDID,
573573+ nil, nil, // no SBOM or vuln report
574574+ 0, 0, 0, 0, 0,
575575+ "failed: "+truncateError(msg.Error, 200),
576576+ )
577577+ if _, _, err := sb.pds.CreateScanRecord(ctx, scanRecord); err != nil {
578578+ slog.Error("Failed to store failure scan record",
579579+ "seq", msg.Seq, "error", err)
580580+ }
581581+ }
582582+583583+ // Mark job as failed
584584+ _, err = sb.db.Exec(`
557585 UPDATE scan_jobs SET status = 'failed', completed_at = ?
558586 WHERE seq = ?
559587 `, time.Now(), msg.Seq)
···567595 "seq", msg.Seq,
568596 "subscriberId", sub.id,
569597 "error", msg.Error)
598598+}
// truncateError caps s at maxLen bytes for storage in a scan record.
// If a cut at maxLen would land inside a multi-byte UTF-8 rune, the cut
// point is moved back to the start of that rune so the result is always
// valid UTF-8 (a naive s[:maxLen] can split a rune and emit garbage).
func truncateError(s string, maxLen int) string {
	if len(s) <= maxLen {
		return s
	}
	cut := maxLen
	// UTF-8 continuation bytes have the form 10xxxxxx; back up past them
	// to the leading byte of the rune.
	for cut > 0 && s[cut]&0xC0 == 0x80 {
		cut--
	}
	return s[:cut]
}
571606572607// drainPendingJobs sends pending/timed-out jobs to a newly connected scanner.
···650685 }
651686}
652687653653-// reDispatchTimedOut finds jobs that were assigned but not acked/completed within timeout.
688688+// reDispatchTimedOut finds jobs that were assigned but not acked/completed within timeout,
689689+// and also marks stuck processing jobs as failed.
654690// Collects timed-out rows first, closes cursor, then resets and re-dispatches
655691// to avoid holding a SELECT cursor open during UPDATEs (prevents SQLite BUSY).
656692func (sb *ScanBroadcaster) reDispatchTimedOut() {
657693 timeout := time.Now().Add(-sb.ackTimeout)
658694695695+ // Fail processing jobs stuck for >10 minutes (scanner likely crashed mid-scan)
696696+ processingTimeout := time.Now().Add(-10 * time.Minute)
697697+ res, err := sb.db.Exec(`
698698+ UPDATE scan_jobs SET status = 'failed', completed_at = ?
699699+ WHERE status = 'processing' AND assigned_at < ?
700700+ `, time.Now(), processingTimeout)
701701+ if err != nil {
702702+ slog.Error("Failed to clean up stuck processing jobs", "error", err)
703703+ } else if n, _ := res.RowsAffected(); n > 0 {
704704+ slog.Warn("Cleaned up stuck processing jobs", "count", n)
705705+ }
706706+659707 rows, err := sb.db.Query(`
660708 SELECT seq, manifest_digest, repository, tag, user_did, user_handle, hold_did, hold_endpoint, tier, config_json, layers_json
661709 FROM scan_jobs
···798846func (sb *ScanBroadcaster) proactiveScanLoop() {
799847 defer sb.wg.Done()
800848801801- // Wait a bit before starting to let the system settle
849849+ // Wait for the system to settle and DID list to populate
802850 select {
803851 case <-sb.stopCh:
804852 return
805805- case <-time.After(30 * time.Second):
853853+ case <-time.After(45 * time.Second):
806854 }
807855856856+ // Run immediately on startup, then every 60s
857857+ slog.Info("Proactive scan loop started")
858858+ sb.tryEnqueueProactiveScan()
859859+808860 ticker := time.NewTicker(60 * time.Second)
809861 defer ticker.Stop()
810862···824876// Uses the cached DID list from the relay (refreshed by refreshManifestDIDsLoop).
825877func (sb *ScanBroadcaster) tryEnqueueProactiveScan() {
826878 if !sb.hasConnectedScanners() {
879879+ slog.Debug("Proactive scan: no scanners connected, skipping")
827880 return
828881 }
829882 if sb.hasActiveJobs() {
883883+ slog.Debug("Proactive scan: active jobs in queue, skipping")
830884 return
831885 }
832886···839893 sb.manifestDIDsMu.RUnlock()
840894841895 if len(userDIDs) == 0 {
896896+ slog.Debug("Proactive scan: no manifest DIDs cached from relay, skipping")
842897 return
843898 }
844899···854909 }
855910}
// scanCandidate is a manifest that needs scanning, with its scan freshness.
// Candidates are collected per user and prioritized: never-scanned first,
// then the one with the oldest scannedAt.
type scanCandidate struct {
	manifest   atproto.ManifestRecord // manifest record fetched from the user's PDS
	userDID    string                 // DID of the owning user
	userHandle string                 // handle of the owning user
	scannedAt  time.Time              // time of last scan; zero value = never scanned
}
919919+857920// tryEnqueueForUser fetches manifests from a user's PDS and enqueues a scan for the
858858-// first one that needs scanning. Returns true if a job was enqueued.
921921+// one that most needs it: never-scanned manifests first, then the stalest scan.
922922+// Returns true if a job was enqueued.
859923func (sb *ScanBroadcaster) tryEnqueueForUser(ctx context.Context, userDID string) bool {
860924 // Resolve user DID to PDS endpoint and handle
861925 did, userHandle, pdsEndpoint, err := atproto.ResolveIdentity(ctx, userDID)
···865929 return false
866930 }
867931868868- // Fetch manifest records from user's PDS
932932+ // Collect all scannable manifests with their scan age
933933+ var unscanned []scanCandidate
934934+ var oldest *scanCandidate
935935+869936 client := atproto.NewClient(pdsEndpoint, did, "")
870937 var cursor string
871938 for {
···879946 for _, record := range records {
880947 var manifest atproto.ManifestRecord
881948 if err := json.Unmarshal(record.Value, &manifest); err != nil {
882882- slog.Debug("Proactive scan: failed to unmarshal manifest record",
883883- "uri", record.URI, "error", err)
884949 continue
885950 }
886951···898963 continue
899964 }
900965901901- // Skip if config is nil (shouldn't happen for image manifests, but be safe)
966966+ // Skip if config is nil
902967 if manifest.Config == nil {
903968 continue
904969 }
905970906906- // Check if already scanned recently
907907- if sb.isRecentlyScanned(ctx, manifest.Digest) {
971971+ // Check scan status
972972+ _, scanRecord, err := sb.pds.GetScanRecord(ctx, manifest.Digest)
973973+ if err != nil {
974974+ // No scan record — never scanned
975975+ unscanned = append(unscanned, scanCandidate{
976976+ manifest: manifest,
977977+ userDID: did,
978978+ userHandle: userHandle,
979979+ })
908980 continue
909981 }
910982911911- // Construct and enqueue scan job
912912- configJSON, _ := json.Marshal(manifest.Config)
913913- layersJSON, _ := json.Marshal(manifest.Layers)
983983+ scannedAt, err := time.Parse(time.RFC3339, scanRecord.ScannedAt)
984984+ if err != nil {
985985+ // Can't parse timestamp — treat as never scanned
986986+ unscanned = append(unscanned, scanCandidate{
987987+ manifest: manifest,
988988+ userDID: did,
989989+ userHandle: userHandle,
990990+ })
991991+ continue
992992+ }
914993915915- slog.Info("Enqueuing proactive scan",
916916- "manifestDigest", manifest.Digest,
917917- "repository", manifest.Repository,
918918- "userDID", did)
994994+ // Skip if scanned recently
995995+ if time.Since(scannedAt) < sb.rescanInterval {
996996+ continue
997997+ }
919998920920- if err := sb.Enqueue(&ScanJobEvent{
921921- ManifestDigest: manifest.Digest,
922922- Repository: manifest.Repository,
923923- UserDID: did,
924924- UserHandle: userHandle,
925925- Tier: "deckhand",
926926- Config: configJSON,
927927- Layers: layersJSON,
928928- }); err != nil {
929929- slog.Error("Proactive scan: failed to enqueue",
930930- "manifest", manifest.Digest, "error", err)
931931- return false
999999+ // Stale scan — track the oldest
10001000+ if oldest == nil || scannedAt.Before(oldest.scannedAt) {
10011001+ oldest = &scanCandidate{
10021002+ manifest: manifest,
10031003+ userDID: did,
10041004+ userHandle: userHandle,
10051005+ scannedAt: scannedAt,
10061006+ }
9321007 }
933933- return true
9341008 }
93510099361010 if nextCursor == "" || len(records) == 0 {
···9391013 cursor = nextCursor
9401014 }
9411015942942- return false
10161016+ // Prefer never-scanned, then oldest stale scan
10171017+ var pick *scanCandidate
10181018+ if len(unscanned) > 0 {
10191019+ pick = &unscanned[0]
10201020+ } else if oldest != nil {
10211021+ pick = oldest
10221022+ }
10231023+10241024+ if pick == nil {
10251025+ return false
10261026+ }
10271027+10281028+ configJSON, _ := json.Marshal(pick.manifest.Config)
10291029+ layersJSON, _ := json.Marshal(pick.manifest.Layers)
10301030+10311031+ reason := "never scanned"
10321032+ if !pick.scannedAt.IsZero() {
10331033+ reason = fmt.Sprintf("last scanned %s ago", time.Since(pick.scannedAt).Truncate(time.Minute))
10341034+ }
10351035+10361036+ slog.Info("Enqueuing proactive scan",
10371037+ "manifestDigest", pick.manifest.Digest,
10381038+ "repository", pick.manifest.Repository,
10391039+ "userDID", pick.userDID,
10401040+ "reason", reason)
10411041+10421042+ if err := sb.Enqueue(&ScanJobEvent{
10431043+ ManifestDigest: pick.manifest.Digest,
10441044+ Repository: pick.manifest.Repository,
10451045+ UserDID: pick.userDID,
10461046+ UserHandle: pick.userHandle,
10471047+ Tier: "deckhand",
10481048+ Config: configJSON,
10491049+ Layers: layersJSON,
10501050+ }); err != nil {
10511051+ slog.Error("Proactive scan: failed to enqueue",
10521052+ "manifest", pick.manifest.Digest, "error", err)
10531053+ return false
10541054+ }
10551055+ return true
9431056}
94410579451058// isOurManifest checks if a manifest's holdDID matches this hold directly,
···10261139 }
1027114010281141 return false
10291029-}
10301030-10311031-// isRecentlyScanned checks if a manifest has been scanned within the rescan interval.
10321032-func (sb *ScanBroadcaster) isRecentlyScanned(ctx context.Context, manifestDigest string) bool {
10331033- _, scanRecord, err := sb.pds.GetScanRecord(ctx, manifestDigest)
10341034- if err != nil {
10351035- return false // Not scanned or error reading → needs scanning
10361036- }
10371037-10381038- scannedAt, err := time.Parse(time.RFC3339, scanRecord.ScannedAt)
10391039- if err != nil {
10401040- return false // Can't parse timestamp → treat as needing scan
10411041- }
10421042-10431043- return time.Since(scannedAt) < sb.rescanInterval
10441142}
1045114310461144// hasConnectedScanners returns true if at least one scanner is connected.