audio streaming app plyr.fm
at b21fb4027643b58b9a2e1b7dcd64fec5c282b922 197 lines 6.5 kB view raw
#!/usr/bin/env -S uv run --script --quiet
"""audit which tracks have been processed by ML features.

## Context

Several features run track audio through external ML services:
- **genre classification**: effnet-discogs on Replicate — stored in track.extra["genre_predictions"]
- **CLAP embeddings**: laion/clap-htsat-unfused on Modal — stored in turbopuffer
- **auto-tagging**: applies genre predictions as tags — leaves no extra flag (cleaned up after)

this script reports which tracks and artists have been processed, for privacy
policy and terms-of-service auditing.

## Usage

```bash
# from repo root (requires DATABASE_URL or backend config)
cd backend && uv run python ../scripts/ml_audit.py

# show track-level detail instead of just counts
cd backend && uv run python ../scripts/ml_audit.py --verbose

# check turbopuffer embedding counts too (requires TURBOPUFFER_API_KEY)
cd backend && uv run python ../scripts/ml_audit.py --check-embeddings
```
"""

import argparse
import asyncio
import logging
import os
from collections import Counter

from sqlalchemy import func, select, text

from backend.config import settings

# suppress SQLAlchemy echo noise from debug mode
settings.app.debug = False

from backend.models import Artist, Track  # noqa: E402
from backend.utilities.database import db_session  # noqa: E402

logging.basicConfig(
    level=logging.INFO,
    format="%(message)s",
)
logger = logging.getLogger(__name__)

# SQL predicate shared by the summary and detail queries in
# audit_genre_classifications — defined once so the two stay in sync.
_HAS_GENRE_PREDICTIONS = text("tracks.extra->'genre_predictions' IS NOT NULL")


async def audit_genre_classifications(verbose: bool) -> None:
    """report tracks with genre predictions in extra.

    Logs a per-artist summary; when ``verbose`` is set (and at least one
    track matched), also logs a table with one row per processed track.
    """
    async with db_session() as db:
        # summary by artist
        summary = await db.execute(
            select(
                Artist.handle,
                func.count(Track.id).label("track_count"),
            )
            .join(Artist, Track.artist_did == Artist.did)
            .where(_HAS_GENRE_PREDICTIONS)
            .group_by(Artist.handle)
            .order_by(func.count(Track.id).desc())
        )
        rows = summary.all()

        total = sum(r.track_count for r in rows)
        logger.info(
            "genre classification: %d tracks across %d artists", total, len(rows)
        )
        for row in rows:
            logger.info(" %s: %d tracks", row.handle, row.track_count)

        if verbose and total > 0:
            detail = await db.execute(
                select(
                    Track.id,
                    Track.title,
                    Artist.handle,
                    Track.created_at,
                )
                .join(Artist, Track.artist_did == Artist.did)
                .where(_HAS_GENRE_PREDICTIONS)
                .order_by(Track.created_at.desc())
            )
            logger.info("")
            logger.info(" %-6s %-40s %-25s %s", "id", "title", "handle", "created")
            logger.info(" %s", "-" * 100)
            for row in detail.all():
                # truncate long titles so the fixed-width columns stay aligned
                title = row.title[:38] + ".." if len(row.title) > 40 else row.title
                logger.info(
                    " %-6d %-40s %-25s %s",
                    row.id,
                    title,
                    row.handle,
                    row.created_at.strftime("%Y-%m-%d %H:%M"),
                )


async def audit_auto_tagged(verbose: bool) -> None:
    """report tracks with auto_tag flag still pending (not yet processed).

    ``verbose`` is accepted for signature parity with the other audit
    functions; there is no per-track detail to show here, so it is unused.
    """
    async with db_session() as db:
        result = await db.execute(
            select(func.count(Track.id)).where(
                text("(tracks.extra->>'auto_tag')::boolean = true")
            )
        )
        pending = result.scalar() or 0
        if pending > 0:
            logger.info(
                "\nauto-tag: %d tracks pending (flag not yet cleaned up)", pending
            )
        else:
            logger.info(
                "\nauto-tag: no pending flags (all processed or none requested)"
            )


async def audit_embeddings(verbose: bool) -> None:
    """check turbopuffer for embedding counts.

    Skips with a warning (rather than failing the whole audit) when the
    turbopuffer client is not installed or TURBOPUFFER_API_KEY is unset.

    NOTE(review): counts are capped by ``top_k=10000`` — namespaces with
    more vectors than that will under-report.
    """
    try:
        import turbopuffer as tpuf
    except ImportError:
        logger.warning("turbopuffer not installed, skipping embedding audit")
        return

    api_key = os.environ.get("TURBOPUFFER_API_KEY")
    if not api_key:
        logger.warning("TURBOPUFFER_API_KEY not set, skipping embedding audit")
        return

    tpuf.api_key = api_key
    namespace = os.environ.get("TURBOPUFFER_NAMESPACE", "plyr-tracks")
    ns = tpuf.Namespace(namespace)

    try:
        # query with a zero vector to get total count
        # turbopuffer requires rank_by for queries
        # (512 matches the CLAP embedding dimension — TODO confirm against
        # the Modal embedding job)
        zero_vec = [0.0] * 512
        results = ns.query(
            rank_by=["vector", "ANN", zero_vec],
            top_k=10000,
            include_attributes=["title", "artist_handle"],
        )

        if not results:
            logger.info("\nembeddings (%s): 0 vectors", namespace)
            return

        # count by artist; `attributes` may be missing or None on a row,
        # so fall back to an empty dict before the lookup
        artist_counts = Counter(
            (getattr(row, "attributes", None) or {}).get("artist_handle", "unknown")
            for row in results
        )

        total = len(results)
        logger.info(
            "\nembeddings (%s): %d vectors across %d artists",
            namespace,
            total,
            len(artist_counts),
        )
        for handle, count in artist_counts.most_common():
            logger.info(" %s: %d tracks", handle, count)

    except Exception as e:
        logger.error("failed to query turbopuffer: %s", e)


async def main() -> None:
    """parse CLI flags and run each audit in sequence."""
    parser = argparse.ArgumentParser(description="audit ML-processed tracks")
    parser.add_argument(
        "--verbose", "-v", action="store_true", help="show track-level detail"
    )
    parser.add_argument(
        "--check-embeddings",
        action="store_true",
        help="also check turbopuffer for embedding counts",
    )
    args = parser.parse_args()

    # total track count for context
    async with db_session() as db:
        result = await db.execute(select(func.count(Track.id)))
        total = result.scalar() or 0
        logger.info("total tracks in database: %d\n", total)

    await audit_genre_classifications(args.verbose)
    await audit_auto_tagged(args.verbose)

    if args.check_embeddings:
        await audit_embeddings(args.verbose)

    logger.info("")


if __name__ == "__main__":
    asyncio.run(main())