plyr.fm — audio streaming app
1#!/usr/bin/env -S uv run --script --quiet
2"""audit which tracks have been processed by ML features.
3
4## Context
5
6Several features run track audio through external ML services:
7- **genre classification**: effnet-discogs on Replicate — stored in track.extra["genre_predictions"]
8- **CLAP embeddings**: laion/clap-htsat-unfused on Modal — stored in turbopuffer
9- **auto-tagging**: applies genre predictions as tags — leaves no extra flag (cleaned up after)
10
11this script reports which tracks and artists have been processed, for privacy
12policy and terms-of-service auditing.
13
14## Usage
15
16```bash
17# from repo root (requires DATABASE_URL or backend config)
18cd backend && uv run python ../scripts/ml_audit.py
19
20# show track-level detail instead of just counts
21cd backend && uv run python ../scripts/ml_audit.py --verbose
22
23# check turbopuffer embedding counts too (requires TURBOPUFFER_API_KEY)
24cd backend && uv run python ../scripts/ml_audit.py --check-embeddings
25```
26"""
27
28import argparse
29import asyncio
30import logging
31import os
32
33from sqlalchemy import func, select, text
34
35from backend.config import settings
36
# suppress SQLAlchemy echo noise from debug mode.
# NOTE: this must happen BEFORE the model/database imports below — presumably
# they read settings.app.debug at import time (hence the noqa: E402 imports).
# TODO confirm against backend.utilities.database.
settings.app.debug = False

from backend.models import Artist, Track  # noqa: E402
from backend.utilities.database import db_session  # noqa: E402

# bare-message format: this script's output is the report itself,
# so no timestamps/level prefixes.
logging.basicConfig(
    level=logging.INFO,
    format="%(message)s",
)
logger = logging.getLogger(__name__)
48
49
async def audit_genre_classifications(verbose: bool) -> None:
    """report tracks with genre predictions in extra.

    logs a per-artist summary (track counts, descending); with ``verbose``
    also logs a per-track table (id, title, handle, created) newest-first.
    """
    # shared predicate for both queries; kept as raw text() because extra is
    # a JSON(B) column tested for key presence.
    has_predictions = text("tracks.extra->'genre_predictions' IS NOT NULL")

    async with db_session() as db:
        # summary by artist
        summary = await db.execute(
            select(
                Artist.handle,
                func.count(Track.id).label("track_count"),
            )
            .join(Artist, Track.artist_did == Artist.did)
            .where(has_predictions)
            .group_by(Artist.handle)
            .order_by(func.count(Track.id).desc())
        )
        rows = summary.all()

        total = sum(r.track_count for r in rows)
        logger.info(
            "genre classification: %d tracks across %d artists", total, len(rows)
        )
        for row in rows:
            logger.info(" %s: %d tracks", row.handle, row.track_count)

        if verbose and total > 0:
            detail = await db.execute(
                select(
                    Track.id,
                    Track.title,
                    Artist.handle,
                    Track.created_at,
                )
                .join(Artist, Track.artist_did == Artist.did)
                .where(has_predictions)
                .order_by(Track.created_at.desc())
            )
            logger.info("")
            logger.info(" %-6s %-40s %-25s %s", "id", "title", "handle", "created")
            logger.info(" %s", "-" * 100)
            for row in detail.all():
                # keep the title column exactly 40 chars wide
                title = row.title[:38] + ".." if len(row.title) > 40 else row.title
                logger.info(
                    " %-6d %-40s %-25s %s",
                    row.id,
                    title,
                    row.handle,
                    row.created_at.strftime("%Y-%m-%d %H:%M"),
                )
97
98
async def audit_auto_tagged(verbose: bool) -> None:
    """report tracks with auto_tag flag still pending (not yet processed)."""
    pending_stmt = select(func.count(Track.id)).where(
        text("(tracks.extra->>'auto_tag')::boolean = true")
    )
    async with db_session() as db:
        pending = (await db.execute(pending_stmt)).scalar() or 0

    # the flag is removed after processing, so anything left is a backlog
    if pending:
        logger.info(
            "\nauto-tag: %d tracks pending (flag not yet cleaned up)", pending
        )
    else:
        logger.info(
            "\nauto-tag: no pending flags (all processed or none requested)"
        )
116
117
async def audit_embeddings(verbose: bool) -> None:
    """check turbopuffer for embedding counts.

    skips (with a warning) when the turbopuffer client or
    TURBOPUFFER_API_KEY is missing, so the rest of the audit still runs.
    """
    try:
        import turbopuffer as tpuf
    except ImportError:
        logger.warning("turbopuffer not installed, skipping embedding audit")
        return

    api_key = os.environ.get("TURBOPUFFER_API_KEY")
    if not api_key:
        logger.warning("TURBOPUFFER_API_KEY not set, skipping embedding audit")
        return

    tpuf.api_key = api_key
    namespace = os.environ.get("TURBOPUFFER_NAMESPACE", "plyr-tracks")
    ns = tpuf.Namespace(namespace)

    # turbopuffer requires rank_by, so we ANN-query a zero vector purely to
    # enumerate rows. top_k caps the scan: counts are a lower bound when the
    # namespace holds more vectors than this.
    top_k = 10000

    try:
        zero_vec = [0.0] * 512  # assumes 512-dim CLAP embeddings — TODO confirm
        results = ns.query(
            rank_by=["vector", "ANN", zero_vec],
            top_k=top_k,
            include_attributes=["title", "artist_handle"],
        )

        if not results:
            logger.info("\nembeddings (%s): 0 vectors", namespace)
            return

        # count by artist; a row's attributes may be missing OR None,
        # so guard both before .get()
        artist_counts: dict[str, int] = {}
        for row in results:
            attrs = getattr(row, "attributes", None) or {}
            handle = attrs.get("artist_handle", "unknown")
            artist_counts[handle] = artist_counts.get(handle, 0) + 1

        total = len(results)
        logger.info(
            "\nembeddings (%s): %d vectors across %d artists",
            namespace,
            total,
            len(artist_counts),
        )
        if total == top_k:
            # hit the query cap — there may be more vectors than reported
            logger.warning(" result set hit top_k=%d; counts may be truncated", top_k)
        for handle, count in sorted(artist_counts.items(), key=lambda x: -x[1]):
            logger.info(" %s: %d tracks", handle, count)

    except Exception as e:
        # best-effort audit: report the failure and continue the script
        logger.error("failed to query turbopuffer: %s", e)
167
168
async def main() -> None:
    """entry point: parse CLI flags and run each audit in sequence."""
    parser = argparse.ArgumentParser(description="audit ML-processed tracks")
    parser.add_argument(
        "--verbose", "-v", help="show track-level detail", action="store_true"
    )
    parser.add_argument(
        "--check-embeddings",
        help="also check turbopuffer for embedding counts",
        action="store_true",
    )
    args = parser.parse_args()

    # overall track count gives context for the per-feature numbers below
    async with db_session() as db:
        total = (await db.execute(select(func.count(Track.id)))).scalar() or 0
    logger.info("total tracks in database: %d\n", total)

    await audit_genre_classifications(args.verbose)
    await audit_auto_tagged(args.verbose)

    if args.check_embeddings:
        await audit_embeddings(args.verbose)

    logger.info("")


if __name__ == "__main__":
    asyncio.run(main())