plyr.fm — audio streaming app

feat: add robots.txt and sitemap.xml for SEO (#820)

* feat: add robots.txt and sitemap.xml for SEO

- API robots.txt: block all crawlers (it's an API, not a website)
- frontend robots.txt: allow search engines, block AI training bots
(GPTBot, ClaudeBot, CCBot, Google-Extended), allow AI search bots
(ChatGPT-User, Claude-User, PerplexityBot), disallow private paths
- sitemap.xml: dynamic generation via +server.ts route
- includes static pages, tracks, artists, albums
- backend /sitemap-data endpoint provides minimal data
- CDN-cached for 1 hour (s-maxage=3600)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

* fix: move deferred imports to top of file

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>

Authored by zzstoatzz.io with Claude Opus 4.5 and committed via GitHub (commits fa921b56, f61ce5d1).

+198 -5
+63 -3
backend/src/backend/main.py
@app.get("/robots.txt", include_in_schema=False)
async def robots_txt():
    """serve robots.txt telling crawlers not to index this API.

    the API has no human-facing pages, so every crawler is disallowed
    everywhere; the public-facing rules live in the frontend's robots.txt.
    """
    # PlainTextResponse already defaults to media_type="text/plain",
    # so passing it explicitly was redundant.
    return PlainTextResponse("User-agent: *\nDisallow: /\n")


def _lastmod(dt) -> str:
    """format a timestamp as YYYY-MM-DD for sitemap <lastmod> values."""
    return dt.strftime("%Y-%m-%d")


@app.get("/sitemap-data")
async def sitemap_data(
    db: Annotated[AsyncSession, Depends(get_db)],
) -> dict[str, Any]:
    """return minimal data needed to generate sitemap.xml.

    returns tracks, artists, and albums with just IDs/slugs and timestamps.
    the frontend renders this into XML at /sitemap.xml.
    """
    # tracks: id + created_at used as lastmod — presumably track pages don't
    # change after upload; TODO confirm tracks aren't editable
    tracks_result = await db.execute(
        select(Track.id, Track.created_at).order_by(Track.created_at.desc())
    )
    tracks = [
        {"id": row.id, "updated": _lastmod(row.created_at)}
        for row in tracks_result.all()
    ]

    # artists: inner join on Track keeps only artists with at least one
    # track, so the sitemap never points at an empty profile page
    artists_result = await db.execute(
        select(Artist.handle, Artist.updated_at)
        .join(Track, Artist.did == Track.artist_did)
        .distinct()
        .order_by(Artist.updated_at.desc())
    )
    artists = [
        {"handle": row.handle, "updated": _lastmod(row.updated_at)}
        for row in artists_result.all()
    ]

    # albums: the owning artist's handle is needed to build the album URL
    albums_result = await db.execute(
        select(Album.slug, Artist.handle, Album.updated_at)
        .join(Artist, Album.artist_did == Artist.did)
        .order_by(Album.updated_at.desc())
    )
    albums = [
        {
            "handle": row.handle,
            "slug": row.slug,
            "updated": _lastmod(row.updated_at),
        }
        for row in albums_result.all()
    ]

    return {"tracks": tracks, "artists": artists, "albums": albums}
+81
frontend/src/routes/sitemap.xml/+server.ts
··· 1 + import { API_URL } from '$lib/config'; 2 + import type { RequestHandler } from './$types'; 3 + 4 + interface SitemapData { 5 + tracks: Array<{ id: number; updated: string }>; 6 + artists: Array<{ handle: string; updated: string }>; 7 + albums: Array<{ handle: string; slug: string; updated: string }>; 8 + } 9 + 10 + // static pages with their approximate update frequency 11 + const STATIC_PAGES = [ 12 + { path: '', updated: '2026-01-28' }, // homepage 13 + { path: '/terms', updated: '2026-01-20' }, 14 + { path: '/privacy', updated: '2026-01-20' }, 15 + { path: '/costs', updated: '2026-01-01' } 16 + ]; 17 + 18 + export const GET: RequestHandler = async ({ fetch }) => { 19 + const baseUrl = 'https://plyr.fm'; 20 + 21 + // fetch dynamic content from backend 22 + let data: SitemapData = { tracks: [], artists: [], albums: [] }; 23 + try { 24 + const response = await fetch(`${API_URL}/sitemap-data`); 25 + if (response.ok) { 26 + data = await response.json(); 27 + } 28 + } catch { 29 + // if backend is down, still return static pages 30 + } 31 + 32 + // build XML 33 + const urls: string[] = []; 34 + 35 + // static pages 36 + for (const page of STATIC_PAGES) { 37 + urls.push(` 38 + <url> 39 + <loc>${baseUrl}${page.path}</loc> 40 + <lastmod>${page.updated}</lastmod> 41 + </url>`); 42 + } 43 + 44 + // track pages 45 + for (const track of data.tracks) { 46 + urls.push(` 47 + <url> 48 + <loc>${baseUrl}/track/${track.id}</loc> 49 + <lastmod>${track.updated}</lastmod> 50 + </url>`); 51 + } 52 + 53 + // artist pages 54 + for (const artist of data.artists) { 55 + urls.push(` 56 + <url> 57 + <loc>${baseUrl}/u/${artist.handle}</loc> 58 + <lastmod>${artist.updated}</lastmod> 59 + </url>`); 60 + } 61 + 62 + // album pages 63 + for (const album of data.albums) { 64 + urls.push(` 65 + <url> 66 + <loc>${baseUrl}/u/${album.handle}/album/${album.slug}</loc> 67 + <lastmod>${album.updated}</lastmod> 68 + </url>`); 69 + } 70 + 71 + const xml = `<?xml version="1.0" encoding="UTF-8"?> 72 
+ <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">${urls.join('')} 73 + </urlset>`; 74 + 75 + return new Response(xml, { 76 + headers: { 77 + 'Content-Type': 'application/xml', 78 + 'Cache-Control': 'max-age=0, s-maxage=3600' // CDN caches for 1 hour 79 + } 80 + }); 81 + };
+54 -2
frontend/static/robots.txt
# plyr.fm robots.txt

# search engines - full access to public content
User-agent: Googlebot
Allow: /

User-agent: Bingbot
Allow: /

# AI training crawlers - block
# these crawl content for model training, not search
User-agent: GPTBot
Disallow: /

User-agent: ClaudeBot
Disallow: /

User-agent: CCBot
Disallow: /

User-agent: Google-Extended
Disallow: /

User-agent: Applebot-Extended
Disallow: /

User-agent: anthropic-ai
Disallow: /

User-agent: Bytespider
Disallow: /

# AI search/assistant crawlers - allow
# these fetch content to answer user queries
User-agent: ChatGPT-User
Allow: /

User-agent: Claude-User
Allow: /

User-agent: PerplexityBot
Allow: /

# default rules for all other crawlers
User-agent: *
Allow: /
Disallow: /portal
Disallow: /settings
Disallow: /library
Disallow: /liked
Disallow: /upload
Disallow: /profile/setup
Disallow: /login

Sitemap: https://plyr.fm/sitemap.xml