Web archiver with MASL bundle mode for ATProto. Captures web pages as content-addressed bundles stored on your PDS, with optional IPFS pinning.

Consolidate to the ing.dasl.masl collection (Web Tiles format)

- COLLECTION_BUNDLE now writes to ing.dasl.masl instead of
systems.witchcraft.archive.bundle
- record format: name + resources at top level (MASL required),
archive metadata under systems.witchcraft.archive namespace
- blob refs inlined into resources[path].src (tiles spec compliant)
- simplified list/search/verify to use single format
- added migrate_bundles.py for migrating old bundle records
- fixed site_archive.py auth (pass env vars to subprocess)

+274 -65
+211
migrate_bundles.py
#!/usr/bin/env python3
"""
migrate_bundles.py - Migrate records from systems.witchcraft.archive.bundle to ing.dasl.masl

Converts the new-format bundle records to proper ing.dasl.masl tile records,
with archive metadata namespaced under systems.witchcraft.archive.

Usage:
    python migrate_bundles.py --dry-run  # Preview what would be migrated
    python migrate_bundles.py            # Actually migrate
    python migrate_bundles.py --cleanup  # Delete old bundle records after migration
"""

import argparse

import requests
from pykeepass import PyKeePass

# Source and destination collections for the migration.
OLD_COLLECTION = 'systems.witchcraft.archive.bundle'
NEW_COLLECTION = 'ing.dasl.masl'

# Seconds before any PDS request is abandoned; without a timeout a hung
# connection would stall the whole migration indefinitely.
TIMEOUT = 30


def get_session():
    """Authenticate against the PDS using credentials from the KeePass vault.

    Returns:
        (pds_url, session) where session is the createSession JSON response
        (contains at least 'accessJwt' and 'did').
    Raises:
        requests.HTTPError: if the PDS rejects the login.
    """
    kp = PyKeePass('/home/astra/clawd/.kira.kdbx', keyfile='/home/astra/clawd/.keyfile')
    entry = kp.find_entries(title='Bluesky - kira.pds.witchcraft.systems', first=True)
    pds = 'https://pds.witchcraft.systems'
    resp = requests.post(f'{pds}/xrpc/com.atproto.server.createSession',
                         json={'identifier': entry.username, 'password': entry.password},
                         timeout=TIMEOUT)
    resp.raise_for_status()
    return pds, resp.json()


def _list_records(pds, did, collection):
    """Page through com.atproto.repo.listRecords for one collection.

    Follows the cursor until the PDS stops returning one, so the full
    collection is returned regardless of size.
    """
    records = []
    cursor = None
    while True:
        params = {'repo': did, 'collection': collection, 'limit': 100}
        if cursor:
            params['cursor'] = cursor
        resp = requests.get(f'{pds}/xrpc/com.atproto.repo.listRecords',
                            params=params, timeout=TIMEOUT)
        resp.raise_for_status()
        data = resp.json()
        records.extend(data.get('records', []))
        cursor = data.get('cursor')
        if not cursor:
            break
    return records


def list_bundle_records(pds, did):
    """Get all records from systems.witchcraft.archive.bundle."""
    return _list_records(pds, did, OLD_COLLECTION)


def list_masl_records(pds, did):
    """Get all records from ing.dasl.masl (to check for duplicates)."""
    return _list_records(pds, did, NEW_COLLECTION)


def convert_bundle_to_masl(bundle_val):
    """Convert a systems.witchcraft.archive.bundle record to ing.dasl.masl format.

    Bundle format:
        {url, masl: {name, resources}, blobs, title, capturedAt, ...}

    Target MASL format (matching tiles spec + archive metadata):
        {$type: ing.dasl.masl, name, resources, systems.witchcraft.archive: {url, title, ...}}

    Resources need blob refs inlined (resources[path].src = blob ref).
    """
    masl = bundle_val.get('masl', {})
    blobs = bundle_val.get('blobs', {})

    # Build resources map with inline blob refs (matching old format)
    resources = {}
    for path, res_data in masl.get('resources', {}).items():
        entry = {}
        if isinstance(res_data, dict):
            # Copy content-type
            if 'content-type' in res_data:
                entry['content-type'] = res_data['content-type']
            # Copy CID src string
            if 'src' in res_data:
                entry['src'] = res_data['src']
        elif isinstance(res_data, str):
            entry['src'] = res_data

        # ATProto blob ref wins over the bare CID string when both exist:
        # the blob ref is what lets the content be fetched from the PDS.
        if path in blobs:
            entry['src'] = blobs[path]

        resources[path] = entry

    # MASL required fields at top level
    record = {
        '$type': NEW_COLLECTION,
        'name': masl.get('name', bundle_val.get('title', '(untitled)')),
        'resources': resources,
    }

    # Archive metadata, namespaced so it does not collide with tiles-spec keys.
    archive_meta = {}
    for key in ['url', 'title', 'capturedAt', 'totalSize', 'wordCount',
                'contentHash', 'rootIpfsCid', 'resourceCount', 'pinned', 'finalUrl']:
        if key in bundle_val:
            archive_meta[key] = bundle_val[key]

    if archive_meta:
        record['systems.witchcraft.archive'] = archive_meta

    return record


def main():
    """Migrate bundle records, optionally previewing (--dry-run) or
    deleting the successfully-migrated originals afterwards (--cleanup)."""
    parser = argparse.ArgumentParser(description='Migrate bundle records to ing.dasl.masl')
    parser.add_argument('--dry-run', action='store_true', help='Preview without making changes')
    parser.add_argument('--cleanup', action='store_true', help='Delete old bundle records after migration')
    args = parser.parse_args()

    pds, session = get_session()
    token = session['accessJwt']
    did = session['did']

    # Get existing records
    bundle_records = list_bundle_records(pds, did)
    masl_records = list_masl_records(pds, did)

    # URLs already present in ing.dasl.masl -- used to skip duplicates.
    existing_urls = set()
    for r in masl_records:
        archive = r['value'].get('systems.witchcraft.archive', {})
        url = archive.get('url', '')
        if url:
            existing_urls.add(url)

    print(f'found {len(bundle_records)} bundle records')
    print(f'found {len(masl_records)} existing MASL records')
    print(f'existing URLs: {len(existing_urls)}')
    print()

    migrated = 0
    skipped = 0
    # rkeys whose putRecord failed: their originals must NOT be deleted during
    # cleanup, otherwise the only copy of that archive would be lost.
    failed_rkeys = set()

    for rec in bundle_records:
        rkey = rec['uri'].split('/')[-1]
        val = rec['value']
        url = val.get('url', '')
        title = val.get('title', val.get('masl', {}).get('name', '?'))

        # Check if already exists in MASL collection
        if url and url in existing_urls:
            print(f'  skip (exists): {rkey} - {title}')
            skipped += 1
            continue

        masl_record = convert_bundle_to_masl(val)

        if args.dry_run:
            print(f'  would migrate: {rkey} - {title}')
            print(f'    resources: {len(masl_record.get("resources", {}))} files')
            migrated += 1
            continue

        # Re-create under the new collection with the same rkey so at:// URIs
        # stay predictable across the migration.
        resp = requests.post(f'{pds}/xrpc/com.atproto.repo.putRecord',
                             headers={'Authorization': f'Bearer {token}'},
                             json={
                                 'repo': did,
                                 'collection': NEW_COLLECTION,
                                 'rkey': rkey,
                                 'record': masl_record
                             },
                             timeout=TIMEOUT)

        if resp.ok:
            print(f'  migrated: {rkey} - {title}')
            migrated += 1
        else:
            failed_rkeys.add(rkey)
            print(f'  FAILED: {rkey} - {resp.status_code}: {resp.text[:200]}')

    print(f'\n{"would migrate" if args.dry_run else "migrated"}: {migrated}, skipped: {skipped}')

    # Cleanup old records -- only those that were migrated (or already existed
    # in the new collection); failed migrations keep their originals.
    if args.cleanup and not args.dry_run:
        to_delete = [r for r in bundle_records
                     if r['uri'].split('/')[-1] not in failed_rkeys]
        print(f'\ncleaning up {len(to_delete)} old bundle records...')
        for rec in to_delete:
            rkey = rec['uri'].split('/')[-1]
            resp = requests.post(f'{pds}/xrpc/com.atproto.repo.deleteRecord',
                                 headers={'Authorization': f'Bearer {token}'},
                                 json={
                                     'repo': did,
                                     'collection': OLD_COLLECTION,
                                     'rkey': rkey
                                 },
                                 timeout=TIMEOUT)
            if resp.ok:
                print(f'  deleted: {rkey}')
            else:
                print(f'  delete FAILED: {rkey} - {resp.status_code}')


if __name__ == '__main__':
    main()
+20 -1
site_archive.py
··· 212 212 213 213 # --- archiving --- 214 214 215 + def _get_archive_env(): 216 + """Get environment variables needed for web_archive.py auth.""" 217 + env = os.environ.copy() 218 + if 'ATP_HANDLE' not in env or not env.get('ATP_HANDLE'): 219 + try: 220 + from pykeepass import PyKeePass 221 + kp = PyKeePass('/home/astra/clawd/.kira.kdbx', 222 + keyfile='/home/astra/clawd/.keyfile') 223 + entry = kp.find_entries(title='Bluesky - kira.pds.witchcraft.systems', 224 + first=True) 225 + env['ATP_PDS_URL'] = 'https://pds.witchcraft.systems' 226 + env['ATP_HANDLE'] = entry.username 227 + env['ATP_PASSWORD'] = entry.password 228 + except Exception as e: 229 + print(f" warning: could not load credentials: {e}") 230 + return env 231 + 232 + 215 233 def archive_page_as_bundle(url, no_ipfs=False): 216 234 """Archive a single page using web_archive.py's bundle mode. 217 235 Returns the record URI and rkey, or None on failure.""" ··· 221 239 cmd.append('--no-ipfs') 222 240 223 241 try: 224 - result = subprocess.run(cmd, capture_output=True, text=True, timeout=120) 242 + env = _get_archive_env() 243 + result = subprocess.run(cmd, capture_output=True, text=True, timeout=120, env=env) 225 244 output = result.stdout 226 245 227 246 # Parse the URI from output
+43 -64
web_archive.py
··· 39 39 ATP_HANDLE = os.environ.get("ATP_HANDLE", "") 40 40 ATP_PASSWORD = os.environ.get("ATP_PASSWORD", "") 41 41 COLLECTION_CAPTURE = "systems.witchcraft.archive.capture" 42 - COLLECTION_BUNDLE = "systems.witchcraft.archive.bundle" 43 - # Legacy collection for backwards compat with old records 44 - COLLECTION_MASL_LEGACY = "ing.dasl.masl" 42 + COLLECTION_BUNDLE = "ing.dasl.masl" 43 + COLLECTION_MASL_LEGACY = COLLECTION_BUNDLE # consolidated 45 44 46 45 # Max subresources to fetch per bundle (safety limit) 47 46 MAX_SUBRESOURCES = 100 ··· 282 281 283 282 def create_bundle_record(session, name, url, resources_map, blobs_map, 284 283 captured_at, archive_meta=None): 285 - """Create a systems.witchcraft.archive.bundle record on ATProto. 284 + """Create an ing.dasl.masl record on ATProto (Web Tiles format). 286 285 287 - The record wraps a MASL-shaped bundle (with CID strings in src fields) 288 - alongside ATProto blob refs for actual content retrieval. 286 + Uses MASL bundle format with archive metadata namespaced under 287 + systems.witchcraft.archive. 289 288 290 - resources_map: dict of path -> {src: "cid_string", content-type: str} 291 - (MASL-conformant: src is a CID, not a blob ref) 292 - blobs_map: dict of path -> blob_ref (ATProto blob refs for fetching) 289 + resources_map: dict of path -> {src: blob_ref, content-type: str} 290 + blobs_map: dict of path -> blob_ref (kept for compat, merged into resources) 293 291 archive_meta: dict with url, capturedAt, title, etc. 
294 292 """ 293 + # Merge blob refs into resources (MASL format: resources[path].src = blob ref) 294 + merged_resources = {} 295 + for path, res_data in resources_map.items(): 296 + entry = dict(res_data) if isinstance(res_data, dict) else {"src": res_data} 297 + if path in blobs_map: 298 + entry["src"] = blobs_map[path] 299 + merged_resources[path] = entry 300 + 295 301 record = { 296 302 "$type": COLLECTION_BUNDLE, 297 - # Archive metadata at top level 303 + # MASL required fields at top level 304 + "name": name, 305 + "resources": merged_resources, 306 + } 307 + 308 + # Archive metadata namespaced 309 + archive_ns = { 298 310 "url": archive_meta.get("url", url) if archive_meta else url, 299 - "capturedAt": captured_at, 300 311 "title": archive_meta.get("title", name) if archive_meta else name, 301 - # MASL-shaped bundle data 302 - "masl": { 303 - "name": name, 304 - "resources": resources_map, 305 - }, 306 - # ATProto blob refs keyed by path (for content retrieval from PDS) 307 - "blobs": blobs_map, 312 + "capturedAt": captured_at, 308 313 } 309 - 310 - # Add extra archive metadata 311 314 if archive_meta: 312 315 for key in ["wordCount", "totalSize", "resourceCount", 313 316 "rootIpfsCid", "contentHash", "pinned", "finalUrl"]: 314 317 if key in archive_meta: 315 - record[key] = archive_meta[key] 318 + archive_ns[key] = archive_meta[key] 319 + record["systems.witchcraft.archive"] = archive_ns 316 320 317 321 resp = requests.post(f"{PDS}/xrpc/com.atproto.repo.createRecord", 318 322 headers={"Authorization": f"Bearer {session['accessJwt']}"}, ··· 538 542 session = get_session() 539 543 all_records = [] 540 544 541 - collections = ([COLLECTION_CAPTURE, COLLECTION_BUNDLE, COLLECTION_MASL_LEGACY] 545 + collections = ([COLLECTION_CAPTURE, COLLECTION_BUNDLE] 542 546 if not collection else [collection]) 543 547 544 548 for coll in collections: ··· 565 569 # Sort by captured time (newest first) 566 570 def sort_key(r): 567 571 val = r.get("value", {}) 568 - # New 
bundles: capturedAt at top level 569 - # Legacy MASL: capturedAt in systems.witchcraft.archive namespace 570 572 meta = val.get("systems.witchcraft.archive", {}) 571 - return val.get("capturedAt", meta.get("capturedAt", "")) 573 + return meta.get("capturedAt", val.get("capturedAt", "")) 572 574 all_records.sort(key=sort_key, reverse=True) 573 575 574 576 for rec in all_records[:limit]: ··· 578 580 is_bundle = coll in (COLLECTION_BUNDLE, COLLECTION_MASL_LEGACY) 579 581 580 582 if is_bundle: 581 - if coll == COLLECTION_BUNDLE: 582 - # New format: metadata at top level, MASL in .masl field 583 - title = val.get("title", "(untitled)")[:60] 584 - url = val.get("url", "")[:60] 585 - captured = val.get("capturedAt", "")[:19] 586 - cid = val.get("rootIpfsCid", "") 587 - res_count = val.get("resourceCount", 588 - len(val.get("masl", {}).get("resources", {}))) 589 - total_size = val.get("totalSize", 0) 590 - else: 591 - # Legacy MASL format 592 - meta = val.get("systems.witchcraft.archive", {}) 593 - title = val.get("name", meta.get("title", "(untitled)"))[:60] 594 - url = meta.get("url", "")[:60] 595 - captured = meta.get("capturedAt", "")[:19] 596 - cid = meta.get("rootIpfsCid", "") 597 - res_count = meta.get("resourceCount", 598 - len(val.get("resources", {}))) 599 - total_size = meta.get("totalSize", 0) 583 + meta = val.get("systems.witchcraft.archive", {}) 584 + title = val.get("name", meta.get("title", "(untitled)"))[:60] 585 + url = meta.get("url", "")[:60] 586 + captured = meta.get("capturedAt", "")[:19] 587 + cid = meta.get("rootIpfsCid", "") 588 + res_count = meta.get("resourceCount", len(val.get("resources", {}))) 589 + total_size = meta.get("totalSize", 0) 600 590 tag = f"[BUNDLE {res_count} files, {total_size:,}b]" 601 591 else: 602 592 title = val.get("title", "(untitled)")[:60] ··· 619 609 620 610 # Try all collections 621 611 rec = None 622 - for coll in [COLLECTION_CAPTURE, COLLECTION_BUNDLE, COLLECTION_MASL_LEGACY]: 612 + for coll in [COLLECTION_CAPTURE, 
COLLECTION_BUNDLE]: 623 613 try: 624 614 resp = requests.get(f"{PDS}/xrpc/com.atproto.repo.getRecord", 625 615 headers={"Authorization": f"Bearer {session['accessJwt']}"}, ··· 639 629 is_bundle = rec["_collection"] in (COLLECTION_BUNDLE, COLLECTION_MASL_LEGACY) 640 630 641 631 if is_bundle: 642 - if rec["_collection"] == COLLECTION_BUNDLE: 643 - # New format 644 - url = val.get("finalUrl", val.get("url", "")) 645 - stored_cid = val.get("rootIpfsCid", "") 646 - stored_hash = val.get("contentHash", "") 647 - captured_at = val.get("capturedAt", "") 648 - else: 649 - # Legacy MASL format 650 - meta = val.get("systems.witchcraft.archive", {}) 651 - url = meta.get("finalUrl", meta.get("url", "")) 652 - stored_cid = meta.get("rootIpfsCid", "") 653 - stored_hash = meta.get("contentHash", "") 654 - captured_at = meta.get("capturedAt", "") 632 + meta = val.get("systems.witchcraft.archive", {}) 633 + url = meta.get("finalUrl", meta.get("url", "")) 634 + stored_cid = meta.get("rootIpfsCid", "") 635 + stored_hash = meta.get("contentHash", "") 636 + captured_at = meta.get("capturedAt", "") 655 637 else: 656 638 url = val.get("finalUrl", val.get("url", "")) 657 639 stored_cid = val.get("ipfsCid", val.get("cid", "")) ··· 688 670 query_lower = query.lower() 689 671 matches = [] 690 672 691 - for coll in [COLLECTION_CAPTURE, COLLECTION_BUNDLE, COLLECTION_MASL_LEGACY]: 673 + for coll in [COLLECTION_CAPTURE, COLLECTION_BUNDLE]: 692 674 try: 693 675 resp = requests.get(f"{PDS}/xrpc/com.atproto.repo.listRecords", 694 676 headers={"Authorization": f"Bearer {session['accessJwt']}"}, ··· 696 678 resp.raise_for_status() 697 679 for rec in resp.json().get("records", []): 698 680 val = rec.get("value", {}) 699 - if coll == COLLECTION_BUNDLE: 700 - url = val.get("url", "").lower() 701 - title = val.get("title", "").lower() 702 - elif coll == COLLECTION_MASL_LEGACY: 681 + if coll in (COLLECTION_BUNDLE, COLLECTION_MASL_LEGACY): 703 682 meta = val.get("systems.witchcraft.archive", {}) 704 683 url 
= meta.get("url", "").lower() 705 684 title = val.get("name", meta.get("title", "")).lower() ··· 721 700 val = rec.get("value", {}) 722 701 rkey = rec["uri"].split("/")[-1] 723 702 coll = rec["_collection"] 724 - is_bundle = coll == COLLECTION_MASL 703 + is_bundle = coll == COLLECTION_BUNDLE 725 704 726 705 if is_bundle: 727 706 meta = val.get("systems.witchcraft.archive", {})