feat: add whitewind support to backfill-pds script + fix dep URLs

search for standard sites pub-search.waow.tech

search zig blog atproto

- backfill-pds now handles com.whtwnd.blog.entry collection
- extracts markdown content from whitewind's content field
- sets platform to "whitewind" and skips entries whose visibility is "author" (author-only)
- prefers publishedAt for date extraction, falling back to createdAt when publishedAt is missing or empty
- update tangled.sh URLs to tangled.org in build.zig.zon

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

zzstoatzz.io 1 month ago a3da00c9 8067e3f1

+24 -14

2 changed files

expand all

unified split

backend

build.zig.zon

scripts

backfill-pds

+2 -2

backend/build.zig.zon

··· 13 13 .hash = "zql-0.0.1-alpha-xNRI4IRNAABUb9gLat5FWUaZDD5HvxAxet_-elgR_A_y", 14 14 }, 15 15 .zat = .{ 16 - .url = "https://tangled.sh/zat.dev/zat/archive/main", 16 + .url = "https://tangled.org/zat.dev/zat/archive/main", 17 17 .hash = "zat-0.1.0-5PuC7s8BAwDFmyYbkyZ8w4azed6eR_cqmHbh8f00NBSB", 18 18 }, 19 19 .zqlite = .{ ··· 21 21 .hash = "zqlite-0.0.1-RWLaYz6bmAAT7E_jxopXf-j5Ea8VQldnxsd6TU8sa0Bb", 22 22 }, 23 23 .logfire = .{ 24 - .url = "https://tangled.sh/zzstoatzz.io/logfire-zig/archive/main", 24 + .url = "https://tangled.org/zzstoatzz.io/logfire-zig/archive/main", 25 25 .hash = "logfire_zig-0.1.0-x2yDLgdwAADOXAZLNQJ8FUH5v1vfFwe5CApJtQ7c_pZd", 26 26 }, 27 27 },

+22 -12

scripts/backfill-pds

··· 161 161 if not title: 162 162 return None 163 163 164 - # Get content - try textContent (site.standard), then leaflet blocks 164 + # Skip author-only whitewind entries 165 + if collection == "com.whtwnd.blog.entry": 166 + if value.get("visibility") == "author": 167 + return None 168 + 169 + # Get content - try textContent (site.standard), then content as string 170 + # (whitewind stores markdown), then leaflet blocks 165 171 content = value.get("textContent") or "" 166 172 if not content: 167 - # Try leaflet-style pages/blocks at top level (pub.leaflet.document) 168 - pages = value.get("pages", []) 169 - if pages: 170 - content = extract_leaflet_blocks(pages) 171 - if not content: 172 - # Try content.pages (site.standard.document with pub.leaflet.content) 173 173 content_obj = value.get("content") 174 - if isinstance(content_obj, dict): 174 + if isinstance(content_obj, str): 175 + content = content_obj 176 + elif isinstance(content_obj, dict): 175 177 pages = content_obj.get("pages", []) 176 178 if pages: 177 179 content = extract_leaflet_blocks(pages) 180 + if not content: 181 + # Try leaflet-style pages/blocks at top level (pub.leaflet.document) 182 + pages = value.get("pages", []) 183 + if pages: 184 + content = extract_leaflet_blocks(pages) 178 185 179 - # Get created_at 180 - created_at = value.get("createdAt", "") 186 + # Get created_at (prefer publishedAt for leaflet/standard, createdAt for whitewind) 187 + created_at = value.get("publishedAt") or value.get("createdAt", "") 181 188 182 189 # Get publication reference - try "publication" (leaflet) then "site" (site.standard) 183 190 publication = value.get("publication") or value.get("site") ··· 196 203 if not isinstance(tags, list): 197 204 tags = [] 198 205 199 - # Determine platform from collection (site.standard is a lexicon, not a platform) 206 + # Determine platform from collection 200 207 if collection.startswith("pub.leaflet"): 201 208 platform = "leaflet" 202 209 elif 
collection.startswith("blog.pckt"): 203 210 platform = "pckt" 211 + elif collection.startswith("com.whtwnd"): 212 + platform = "whitewind" 204 213 else: 205 214 # site.standard.* and others - platform will be detected from publication basePath 206 215 platform = "unknown" ··· 250 259 "pub.leaflet.publication", 251 260 "site.standard.document", 252 261 "site.standard.publication", 262 + "com.whtwnd.blog.entry", 253 263 ] 254 264 255 265 total_docs = 0 ··· 277 287 parts = uri.split("/") 278 288 rkey = parts[-1] 279 289 280 - if collection.endswith(".document"): 290 + if collection.endswith(".document") or collection == "com.whtwnd.blog.entry": 281 291 doc = extract_document(record, collection) 282 292 if not doc: 283 293 print(f" skip {uri} (no title)")