audio streaming app plyr.fm
at b21fb4027643b58b9a2e1b7dcd64fec5c282b922 197 lines 6.5 kB view raw
#!/usr/bin/env -S uv run --script --quiet
"""audit which tracks have been processed by ML features.

## Context

Several features run track audio through external ML services:
- **genre classification**: effnet-discogs on Replicate — stored in track.extra["genre_predictions"]
- **CLAP embeddings**: laion/clap-htsat-unfused on Modal — stored in turbopuffer
- **auto-tagging**: applies genre predictions as tags — leaves no extra flag (cleaned up after)

this script reports which tracks and artists have been processed, for privacy
policy and terms-of-service auditing.

## Usage

```bash
# from repo root (requires DATABASE_URL or backend config)
cd backend && uv run python ../scripts/ml_audit.py

# show track-level detail instead of just counts
cd backend && uv run python ../scripts/ml_audit.py --verbose

# check turbopuffer embedding counts too (requires TURBOPUFFER_API_KEY)
cd backend && uv run python ../scripts/ml_audit.py --check-embeddings
```
"""

import argparse
import asyncio
import logging
import os
from collections import Counter

from sqlalchemy import func, select, text

from backend.config import settings

# suppress SQLAlchemy echo noise from debug mode
settings.app.debug = False

from backend.models import Artist, Track  # noqa: E402
from backend.utilities.database import db_session  # noqa: E402

logging.basicConfig(
    level=logging.INFO,
    format="%(message)s",
)
logger = logging.getLogger(__name__)

# SQL predicate shared by the summary and detail queries in
# audit_genre_classifications — defined once so the two stay in sync.
_HAS_GENRE_PREDICTIONS = text("tracks.extra->'genre_predictions' IS NOT NULL")


async def audit_genre_classifications(verbose: bool) -> None:
    """report tracks with genre predictions in extra.

    Logs a per-artist summary; when ``verbose`` is set (and at least one
    track matched), also logs a table with one row per processed track.
    """
    async with db_session() as db:
        # summary by artist
        summary = await db.execute(
            select(
                Artist.handle,
                func.count(Track.id).label("track_count"),
            )
            .join(Artist, Track.artist_did == Artist.did)
            .where(_HAS_GENRE_PREDICTIONS)
            .group_by(Artist.handle)
            .order_by(func.count(Track.id).desc())
        )
        rows = summary.all()

        total = sum(r.track_count for r in rows)
        logger.info(
            "genre classification: %d tracks across %d artists", total, len(rows)
        )
        for row in rows:
            logger.info(" %s: %d tracks", row.handle, row.track_count)

        if verbose and total > 0:
            detail = await db.execute(
                select(
                    Track.id,
                    Track.title,
                    Artist.handle,
                    Track.created_at,
                )
                .join(Artist, Track.artist_did == Artist.did)
                .where(_HAS_GENRE_PREDICTIONS)
                .order_by(Track.created_at.desc())
            )
            logger.info("")
            logger.info(" %-6s %-40s %-25s %s", "id", "title", "handle", "created")
            logger.info(" %s", "-" * 100)
            for row in detail.all():
                # truncate long titles so the fixed-width columns stay aligned
                title = row.title[:38] + ".." if len(row.title) > 40 else row.title
                logger.info(
                    " %-6d %-40s %-25s %s",
                    row.id,
                    title,
                    row.handle,
                    row.created_at.strftime("%Y-%m-%d %H:%M"),
                )


async def audit_auto_tagged(verbose: bool) -> None:
    """report tracks with auto_tag flag still pending (not yet processed).

    ``verbose`` is accepted for signature parity with the other audit
    functions; there is no per-track detail to show here, so it is unused.
    """
    async with db_session() as db:
        result = await db.execute(
            select(func.count(Track.id)).where(
                text("(tracks.extra->>'auto_tag')::boolean = true")
            )
        )
        pending = result.scalar() or 0
        if pending > 0:
            logger.info(
                "\nauto-tag: %d tracks pending (flag not yet cleaned up)", pending
            )
        else:
            logger.info(
                "\nauto-tag: no pending flags (all processed or none requested)"
            )


async def audit_embeddings(verbose: bool) -> None:
    """check turbopuffer for embedding counts.

    Skips with a warning (rather than failing the whole audit) when the
    turbopuffer client is not installed or TURBOPUFFER_API_KEY is unset.

    NOTE(review): counts are capped by ``top_k=10000`` — namespaces with
    more vectors than that will under-report.
    """
    try:
        import turbopuffer as tpuf
    except ImportError:
        logger.warning("turbopuffer not installed, skipping embedding audit")
        return

    api_key = os.environ.get("TURBOPUFFER_API_KEY")
    if not api_key:
        logger.warning("TURBOPUFFER_API_KEY not set, skipping embedding audit")
        return

    tpuf.api_key = api_key
    namespace = os.environ.get("TURBOPUFFER_NAMESPACE", "plyr-tracks")
    ns = tpuf.Namespace(namespace)

    try:
        # query with a zero vector to get total count
        # turbopuffer requires rank_by for queries
        # (512 matches the CLAP embedding dimension — TODO confirm against
        # the Modal embedding job)
        zero_vec = [0.0] * 512
        results = ns.query(
            rank_by=["vector", "ANN", zero_vec],
            top_k=10000,
            include_attributes=["title", "artist_handle"],
        )

        if not results:
            logger.info("\nembeddings (%s): 0 vectors", namespace)
            return

        # count by artist; `attributes` may be missing or None on a row,
        # so fall back to an empty dict before the lookup
        artist_counts = Counter(
            (getattr(row, "attributes", None) or {}).get("artist_handle", "unknown")
            for row in results
        )

        total = len(results)
        logger.info(
            "\nembeddings (%s): %d vectors across %d artists",
            namespace,
            total,
            len(artist_counts),
        )
        for handle, count in artist_counts.most_common():
            logger.info(" %s: %d tracks", handle, count)

    except Exception as e:
        logger.error("failed to query turbopuffer: %s", e)


async def main() -> None:
    """parse CLI flags and run each audit in sequence."""
    parser = argparse.ArgumentParser(description="audit ML-processed tracks")
    parser.add_argument(
        "--verbose", "-v", action="store_true", help="show track-level detail"
    )
    parser.add_argument(
        "--check-embeddings",
        action="store_true",
        help="also check turbopuffer for embedding counts",
    )
    args = parser.parse_args()

    # total track count for context
    async with db_session() as db:
        result = await db.execute(select(func.count(Track.id)))
        total = result.scalar() or 0
        logger.info("total tracks in database: %d\n", total)

    await audit_genre_classifications(args.verbose)
    await audit_auto_tagged(args.verbose)

    if args.check_embeddings:
        await audit_embeddings(args.verbose)

    logger.info("")


if __name__ == "__main__":
    asyncio.run(main())