A community-based topic aggregation platform built on atproto

feat(aggregators): implement Kagi News RSS aggregator core

Implements Phase 1 of the Kagi News aggregator system, a reference
implementation for the Coves aggregator architecture.

Core components:
- RSS Fetcher: Fetches feeds with retry logic and error handling
- HTML Parser: Extracts structured data from Kagi's HTML descriptions
(summary, highlights, perspectives, quotes, sources)
- Rich Text Formatter: Formats content with proper Coves facets
- State Manager: JSON-based deduplication with rolling window
- Config Manager: YAML configuration with environment variable support
- Coves Client: HTTP client for authentication and post creation
- Main Orchestrator: Coordinates all components with error isolation

Key features:
- Verified feed structure: 3 H3 sections (Highlights, Perspectives, Sources)
- Historical context woven into summary/highlights
- UTF-8 byte position calculation for facets
- Feed-level and item-level error isolation
- Structured logging throughout

Implementation uses Python 3.11+ with:
- feedparser for RSS parsing
- beautifulsoup4 for HTML extraction
- requests for HTTP operations
- pyyaml for configuration

+1467
+41
aggregators/kagi-news/.gitignore
··· 1 + # Environment and config 2 + .env 3 + config.yaml 4 + venv/ 5 + 6 + # State files 7 + data/*.json 8 + data/world.xml 9 + 10 + # Python 11 + __pycache__/ 12 + *.py[cod] 13 + *$py.class 14 + *.so 15 + .Python 16 + build/ 17 + develop-eggs/ 18 + dist/ 19 + downloads/ 20 + eggs/ 21 + .eggs/ 22 + lib/ 23 + lib64/ 24 + parts/ 25 + sdist/ 26 + var/ 27 + wheels/ 28 + *.egg-info/ 29 + .installed.cfg 30 + *.egg 31 + 32 + # Testing 33 + .pytest_cache/ 34 + .coverage 35 + htmlcov/ 36 + 37 + # IDE 38 + .vscode/ 39 + .idea/ 40 + *.swp 41 + *.swo
+3
aggregators/kagi-news/src/__init__.py
··· 1 + """Kagi News RSS Aggregator for Coves.""" 2 + 3 + __version__ = "0.1.0"
+165
aggregators/kagi-news/src/config.py
··· 1 + """ 2 + Configuration Loader for Kagi News Aggregator. 3 + 4 + Loads and validates configuration from YAML files. 5 + """ 6 + import os 7 + import logging 8 + from pathlib import Path 9 + from typing import Dict, Any 10 + import yaml 11 + from urllib.parse import urlparse 12 + 13 + from src.models import AggregatorConfig, FeedConfig 14 + 15 + logger = logging.getLogger(__name__) 16 + 17 + 18 + class ConfigError(Exception): 19 + """Configuration error.""" 20 + pass 21 + 22 + 23 + class ConfigLoader: 24 + """ 25 + Loads and validates aggregator configuration. 26 + 27 + Supports: 28 + - Loading from YAML file 29 + - Environment variable overrides 30 + - Validation of required fields 31 + - URL validation 32 + """ 33 + 34 + def __init__(self, config_path: Path): 35 + """ 36 + Initialize config loader. 37 + 38 + Args: 39 + config_path: Path to config.yaml file 40 + """ 41 + self.config_path = Path(config_path) 42 + 43 + def load(self) -> AggregatorConfig: 44 + """ 45 + Load and validate configuration. 46 + 47 + Returns: 48 + AggregatorConfig object 49 + 50 + Raises: 51 + ConfigError: If config is invalid or missing 52 + """ 53 + # Check file exists 54 + if not self.config_path.exists(): 55 + raise ConfigError(f"Configuration file not found: {self.config_path}") 56 + 57 + # Load YAML 58 + try: 59 + with open(self.config_path, 'r') as f: 60 + config_data = yaml.safe_load(f) 61 + except yaml.YAMLError as e: 62 + raise ConfigError(f"Failed to parse YAML: {e}") 63 + 64 + if not config_data: 65 + raise ConfigError("Configuration file is empty") 66 + 67 + # Validate and parse 68 + try: 69 + return self._parse_config(config_data) 70 + except Exception as e: 71 + raise ConfigError(f"Invalid configuration: {e}") 72 + 73 + def _parse_config(self, data: Dict[str, Any]) -> AggregatorConfig: 74 + """ 75 + Parse and validate configuration data. 
76 + 77 + Args: 78 + data: Parsed YAML data 79 + 80 + Returns: 81 + AggregatorConfig object 82 + 83 + Raises: 84 + ConfigError: If validation fails 85 + """ 86 + # Get coves_api_url (with env override) 87 + coves_api_url = os.getenv('COVES_API_URL', data.get('coves_api_url')) 88 + if not coves_api_url: 89 + raise ConfigError("Missing required field: coves_api_url") 90 + 91 + # Validate URL 92 + if not self._is_valid_url(coves_api_url): 93 + raise ConfigError(f"Invalid URL for coves_api_url: {coves_api_url}") 94 + 95 + # Get log level (default to info) 96 + log_level = data.get('log_level', 'info') 97 + 98 + # Parse feeds 99 + feeds_data = data.get('feeds', []) 100 + if not feeds_data: 101 + raise ConfigError("Configuration must include at least one feed") 102 + 103 + feeds = [] 104 + for feed_data in feeds_data: 105 + feed = self._parse_feed(feed_data) 106 + feeds.append(feed) 107 + 108 + logger.info(f"Loaded configuration with {len(feeds)} feeds ({sum(1 for f in feeds if f.enabled)} enabled)") 109 + 110 + return AggregatorConfig( 111 + coves_api_url=coves_api_url, 112 + feeds=feeds, 113 + log_level=log_level 114 + ) 115 + 116 + def _parse_feed(self, data: Dict[str, Any]) -> FeedConfig: 117 + """ 118 + Parse and validate a single feed configuration. 
119 + 120 + Args: 121 + data: Feed configuration data 122 + 123 + Returns: 124 + FeedConfig object 125 + 126 + Raises: 127 + ConfigError: If validation fails 128 + """ 129 + # Required fields 130 + required_fields = ['name', 'url', 'community_handle'] 131 + for field in required_fields: 132 + if field not in data: 133 + raise ConfigError(f"Missing required field in feed config: {field}") 134 + 135 + name = data['name'] 136 + url = data['url'] 137 + community_handle = data['community_handle'] 138 + enabled = data.get('enabled', True) # Default to True 139 + 140 + # Validate URL 141 + if not self._is_valid_url(url): 142 + raise ConfigError(f"Invalid URL for feed '{name}': {url}") 143 + 144 + return FeedConfig( 145 + name=name, 146 + url=url, 147 + community_handle=community_handle, 148 + enabled=enabled 149 + ) 150 + 151 + def _is_valid_url(self, url: str) -> bool: 152 + """ 153 + Validate URL format. 154 + 155 + Args: 156 + url: URL to validate 157 + 158 + Returns: 159 + True if valid, False otherwise 160 + """ 161 + try: 162 + result = urlparse(url) 163 + return all([result.scheme, result.netloc]) 164 + except Exception: 165 + return False
+175
aggregators/kagi-news/src/coves_client.py
··· 1 + """ 2 + Coves API Client for posting to communities. 3 + 4 + Handles authentication and posting via XRPC. 5 + """ 6 + import logging 7 + import requests 8 + from typing import Dict, List, Optional 9 + from atproto import Client 10 + 11 + logger = logging.getLogger(__name__) 12 + 13 + 14 + class CovesClient: 15 + """ 16 + Client for posting to Coves communities via XRPC. 17 + 18 + Handles: 19 + - Authentication with aggregator credentials 20 + - Creating posts in communities (social.coves.post.create) 21 + - External embed formatting 22 + """ 23 + 24 + def __init__(self, api_url: str, handle: str, password: str, pds_url: Optional[str] = None): 25 + """ 26 + Initialize Coves client. 27 + 28 + Args: 29 + api_url: Coves AppView URL for posting (e.g., "http://localhost:8081") 30 + handle: Aggregator handle (e.g., "kagi-news.coves.social") 31 + password: Aggregator password/app password 32 + pds_url: Optional PDS URL for authentication (defaults to api_url) 33 + """ 34 + self.api_url = api_url 35 + self.pds_url = pds_url or api_url # Auth through PDS, post through AppView 36 + self.handle = handle 37 + self.password = password 38 + self.client = Client(base_url=self.pds_url) # Use PDS for auth 39 + self._authenticated = False 40 + 41 + def authenticate(self): 42 + """ 43 + Authenticate with Coves API. 44 + 45 + Uses com.atproto.server.createSession directly to avoid 46 + Bluesky-specific endpoints that don't exist on Coves PDS. 
47 + 48 + Raises: 49 + Exception: If authentication fails 50 + """ 51 + try: 52 + logger.info(f"Authenticating as {self.handle}") 53 + 54 + # Use createSession directly (avoid app.bsky.actor.getProfile) 55 + session = self.client.com.atproto.server.create_session( 56 + {"identifier": self.handle, "password": self.password} 57 + ) 58 + 59 + # Manually set session (skip profile fetch) 60 + self.client._session = session 61 + self._authenticated = True 62 + self.did = session.did 63 + 64 + logger.info(f"Authentication successful (DID: {self.did})") 65 + except Exception as e: 66 + logger.error(f"Authentication failed: {e}") 67 + raise 68 + 69 + def create_post( 70 + self, 71 + community_handle: str, 72 + content: str, 73 + facets: List[Dict], 74 + embed: Optional[Dict] = None 75 + ) -> str: 76 + """ 77 + Create a post in a community. 78 + 79 + Args: 80 + community_handle: Community handle (e.g., "world-news.coves.social") 81 + content: Post content (rich text) 82 + facets: Rich text facets (formatting, links) 83 + embed: Optional external embed 84 + 85 + Returns: 86 + AT Proto URI of created post (e.g., "at://did:plc:.../social.coves.post/...") 87 + 88 + Raises: 89 + Exception: If post creation fails 90 + """ 91 + if not self._authenticated: 92 + self.authenticate() 93 + 94 + try: 95 + # Prepare post data for social.coves.post.create endpoint 96 + post_data = { 97 + "community": community_handle, 98 + "content": content, 99 + "facets": facets 100 + } 101 + 102 + # Add embed if provided 103 + if embed: 104 + post_data["embed"] = embed 105 + 106 + # Use Coves-specific endpoint (not direct PDS write) 107 + # This provides validation, authorization, and business logic 108 + logger.info(f"Creating post in community: {community_handle}") 109 + 110 + # Make direct HTTP request to XRPC endpoint 111 + url = f"{self.api_url}/xrpc/social.coves.post.create" 112 + headers = { 113 + "Authorization": f"Bearer {self.client._session.access_jwt}", 114 + "Content-Type": 
"application/json" 115 + } 116 + 117 + response = requests.post(url, json=post_data, headers=headers, timeout=30) 118 + 119 + # Log detailed error if request fails 120 + if not response.ok: 121 + error_body = response.text 122 + logger.error(f"Post creation failed ({response.status_code}): {error_body}") 123 + response.raise_for_status() 124 + 125 + result = response.json() 126 + post_uri = result["uri"] 127 + logger.info(f"Post created: {post_uri}") 128 + return post_uri 129 + 130 + except Exception as e: 131 + logger.error(f"Failed to create post: {e}") 132 + raise 133 + 134 + def create_external_embed( 135 + self, 136 + uri: str, 137 + title: str, 138 + description: str, 139 + thumb: Optional[str] = None 140 + ) -> Dict: 141 + """ 142 + Create external embed object for hot-linked content. 143 + 144 + Args: 145 + uri: External URL (story link) 146 + title: Story title 147 + description: Story description/summary 148 + thumb: Optional thumbnail image URL 149 + 150 + Returns: 151 + External embed dictionary 152 + """ 153 + embed = { 154 + "$type": "social.coves.embed.external", 155 + "external": { 156 + "uri": uri, 157 + "title": title, 158 + "description": description 159 + } 160 + } 161 + 162 + if thumb: 163 + embed["external"]["thumb"] = thumb 164 + 165 + return embed 166 + 167 + def _get_timestamp(self) -> str: 168 + """ 169 + Get current timestamp in ISO 8601 format. 170 + 171 + Returns: 172 + ISO timestamp string 173 + """ 174 + from datetime import datetime, timezone 175 + return datetime.now(timezone.utc).isoformat().replace("+00:00", "Z")
+300
aggregators/kagi-news/src/html_parser.py
··· 1 + """ 2 + Kagi News HTML description parser. 3 + 4 + Parses the HTML content from RSS feed item descriptions 5 + into structured data. 6 + """ 7 + import re 8 + import logging 9 + from typing import Dict, List, Optional 10 + from datetime import datetime 11 + from bs4 import BeautifulSoup 12 + from urllib.parse import urlparse 13 + 14 + from src.models import KagiStory, Perspective, Quote, Source 15 + 16 + logger = logging.getLogger(__name__) 17 + 18 + 19 + class KagiHTMLParser: 20 + """Parses Kagi News HTML descriptions into structured data.""" 21 + 22 + def parse(self, html_description: str) -> Dict: 23 + """ 24 + Parse HTML description into structured data. 25 + 26 + Args: 27 + html_description: HTML content from RSS item description 28 + 29 + Returns: 30 + Dictionary with extracted data: 31 + - summary: str 32 + - image_url: Optional[str] 33 + - image_alt: Optional[str] 34 + - highlights: List[str] 35 + - quote: Optional[Dict[str, str]] 36 + - perspectives: List[Dict] 37 + - sources: List[Dict] 38 + """ 39 + soup = BeautifulSoup(html_description, 'html.parser') 40 + 41 + return { 42 + 'summary': self._extract_summary(soup), 43 + 'image_url': self._extract_image_url(soup), 44 + 'image_alt': self._extract_image_alt(soup), 45 + 'highlights': self._extract_highlights(soup), 46 + 'quote': self._extract_quote(soup), 47 + 'perspectives': self._extract_perspectives(soup), 48 + 'sources': self._extract_sources(soup), 49 + } 50 + 51 + def parse_to_story( 52 + self, 53 + title: str, 54 + link: str, 55 + guid: str, 56 + pub_date: datetime, 57 + categories: List[str], 58 + html_description: str 59 + ) -> KagiStory: 60 + """ 61 + Parse HTML and create a KagiStory object. 
62 + 63 + Args: 64 + title: Story title 65 + link: Story URL 66 + guid: Unique identifier 67 + pub_date: Publication date 68 + categories: List of categories 69 + html_description: HTML content from description 70 + 71 + Returns: 72 + KagiStory object 73 + """ 74 + parsed = self.parse(html_description) 75 + 76 + # Convert parsed data to model objects 77 + perspectives = [ 78 + Perspective( 79 + actor=p['actor'], 80 + description=p['description'], 81 + source_url=p['source_url'] 82 + ) 83 + for p in parsed['perspectives'] 84 + ] 85 + 86 + sources = [ 87 + Source( 88 + title=s['title'], 89 + url=s['url'], 90 + domain=s['domain'] 91 + ) 92 + for s in parsed['sources'] 93 + ] 94 + 95 + quote = None 96 + if parsed['quote']: 97 + quote = Quote( 98 + text=parsed['quote']['text'], 99 + attribution=parsed['quote']['attribution'] 100 + ) 101 + 102 + return KagiStory( 103 + title=title, 104 + link=link, 105 + guid=guid, 106 + pub_date=pub_date, 107 + categories=categories, 108 + summary=parsed['summary'], 109 + highlights=parsed['highlights'], 110 + perspectives=perspectives, 111 + quote=quote, 112 + sources=sources, 113 + image_url=parsed['image_url'], 114 + image_alt=parsed['image_alt'] 115 + ) 116 + 117 + def _extract_summary(self, soup: BeautifulSoup) -> str: 118 + """Extract summary from first <p> tag.""" 119 + p_tag = soup.find('p') 120 + if p_tag: 121 + return p_tag.get_text(strip=True) 122 + return "" 123 + 124 + def _extract_image_url(self, soup: BeautifulSoup) -> Optional[str]: 125 + """Extract image URL from <img> tag.""" 126 + img_tag = soup.find('img') 127 + if img_tag and img_tag.get('src'): 128 + return img_tag['src'] 129 + return None 130 + 131 + def _extract_image_alt(self, soup: BeautifulSoup) -> Optional[str]: 132 + """Extract image alt text from <img> tag.""" 133 + img_tag = soup.find('img') 134 + if img_tag and img_tag.get('alt'): 135 + return img_tag['alt'] 136 + return None 137 + 138 + def _extract_highlights(self, soup: BeautifulSoup) -> List[str]: 139 
+ """Extract highlights list from H3 section.""" 140 + highlights = [] 141 + 142 + # Find "Highlights:" h3 tag 143 + h3_tags = soup.find_all('h3') 144 + for h3 in h3_tags: 145 + if 'Highlights' in h3.get_text(): 146 + # Get the <ul> that follows this h3 147 + ul = h3.find_next_sibling('ul') 148 + if ul: 149 + for li in ul.find_all('li'): 150 + highlights.append(li.get_text(strip=True)) 151 + break 152 + 153 + return highlights 154 + 155 + def _extract_quote(self, soup: BeautifulSoup) -> Optional[Dict[str, str]]: 156 + """Extract quote from <blockquote> tag.""" 157 + blockquote = soup.find('blockquote') 158 + if not blockquote: 159 + return None 160 + 161 + text = blockquote.get_text(strip=True) 162 + 163 + # Try to split on " - " to separate quote from attribution 164 + if ' - ' in text: 165 + quote_text, attribution = text.rsplit(' - ', 1) 166 + return { 167 + 'text': quote_text.strip(), 168 + 'attribution': attribution.strip() 169 + } 170 + 171 + # If no attribution found, entire text is the quote 172 + # Try to infer attribution from context (often mentioned in highlights/perspectives) 173 + return { 174 + 'text': text, 175 + 'attribution': self._infer_quote_attribution(soup, text) 176 + } 177 + 178 + def _infer_quote_attribution(self, soup: BeautifulSoup, quote_text: str) -> str: 179 + """ 180 + Try to infer quote attribution from context. 181 + 182 + This is a fallback when quote doesn't have explicit attribution. 
183 + """ 184 + # For now, check if any perspective mentions similar keywords 185 + perspectives_section = soup.find('h3', string=re.compile(r'Perspectives')) 186 + if perspectives_section: 187 + ul = perspectives_section.find_next_sibling('ul') 188 + if ul: 189 + for li in ul.find_all('li'): 190 + li_text = li.get_text() 191 + # Extract actor name (before first colon) 192 + if ':' in li_text: 193 + actor = li_text.split(':', 1)[0].strip() 194 + return actor 195 + 196 + return "Unknown" 197 + 198 + def _extract_perspectives(self, soup: BeautifulSoup) -> List[Dict]: 199 + """Extract perspectives from H3 section.""" 200 + perspectives = [] 201 + 202 + # Find "Perspectives:" h3 tag 203 + h3_tags = soup.find_all('h3') 204 + for h3 in h3_tags: 205 + if 'Perspectives' in h3.get_text(): 206 + # Get the <ul> that follows this h3 207 + ul = h3.find_next_sibling('ul') 208 + if ul: 209 + for li in ul.find_all('li'): 210 + perspective = self._parse_perspective_li(li) 211 + if perspective: 212 + perspectives.append(perspective) 213 + break 214 + 215 + return perspectives 216 + 217 + def _parse_perspective_li(self, li) -> Optional[Dict]: 218 + """ 219 + Parse a single perspective <li> element. 220 + 221 + Format: "Actor: Description. 
(Source)" 222 + """ 223 + # Get full text 224 + full_text = li.get_text() 225 + 226 + # Extract actor (before first colon) 227 + if ':' not in full_text: 228 + return None 229 + 230 + actor, rest = full_text.split(':', 1) 231 + actor = actor.strip() 232 + 233 + # Find the <a> tag for source URL 234 + a_tag = li.find('a') 235 + source_url = a_tag['href'] if a_tag and a_tag.get('href') else "" 236 + 237 + # Extract description (between colon and source link) 238 + # Remove the source citation part in parentheses 239 + description = rest 240 + 241 + # Remove source citation like "(The Straits Times)" from description 242 + if a_tag: 243 + # Remove the link text and surrounding parentheses 244 + link_text = a_tag.get_text() 245 + description = description.replace(f"({link_text})", "").strip() 246 + 247 + # Clean up trailing period 248 + description = description.strip('. ') 249 + 250 + return { 251 + 'actor': actor, 252 + 'description': description, 253 + 'source_url': source_url 254 + } 255 + 256 + def _extract_sources(self, soup: BeautifulSoup) -> List[Dict]: 257 + """Extract sources list from H3 section.""" 258 + sources = [] 259 + 260 + # Find "Sources:" h3 tag 261 + h3_tags = soup.find_all('h3') 262 + for h3 in h3_tags: 263 + if 'Sources' in h3.get_text(): 264 + # Get the <ul> that follows this h3 265 + ul = h3.find_next_sibling('ul') 266 + if ul: 267 + for li in ul.find_all('li'): 268 + source = self._parse_source_li(li) 269 + if source: 270 + sources.append(source) 271 + break 272 + 273 + return sources 274 + 275 + def _parse_source_li(self, li) -> Optional[Dict]: 276 + """ 277 + Parse a single source <li> element. 
278 + 279 + Format: "<a href='...'>Title</a> - domain.com" 280 + """ 281 + a_tag = li.find('a') 282 + if not a_tag or not a_tag.get('href'): 283 + return None 284 + 285 + title = a_tag.get_text(strip=True) 286 + url = a_tag['href'] 287 + 288 + # Extract domain from URL 289 + parsed_url = urlparse(url) 290 + domain = parsed_url.netloc 291 + 292 + # Remove "www." prefix if present 293 + if domain.startswith('www.'): 294 + domain = domain[4:] 295 + 296 + return { 297 + 'title': title, 298 + 'url': url, 299 + 'domain': domain 300 + }
+243
aggregators/kagi-news/src/main.py
··· 1 + """ 2 + Main Orchestration Script for Kagi News Aggregator. 3 + 4 + Coordinates all components to: 5 + 1. Fetch RSS feeds 6 + 2. Parse HTML content 7 + 3. Format as rich text 8 + 4. Deduplicate stories 9 + 5. Post to Coves communities 10 + 6. Track state 11 + 12 + Designed to run via CRON (single execution, then exit). 13 + """ 14 + import os 15 + import sys 16 + import logging 17 + from pathlib import Path 18 + from datetime import datetime 19 + from typing import Optional 20 + 21 + from src.config import ConfigLoader 22 + from src.rss_fetcher import RSSFetcher 23 + from src.html_parser import KagiHTMLParser 24 + from src.richtext_formatter import RichTextFormatter 25 + from src.state_manager import StateManager 26 + from src.coves_client import CovesClient 27 + 28 + # Setup logging 29 + logging.basicConfig( 30 + level=logging.INFO, 31 + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' 32 + ) 33 + logger = logging.getLogger(__name__) 34 + 35 + 36 + class Aggregator: 37 + """ 38 + Main aggregator orchestration. 39 + 40 + Coordinates all components to fetch, parse, format, and post stories. 41 + """ 42 + 43 + def __init__( 44 + self, 45 + config_path: Path, 46 + state_file: Path, 47 + coves_client: Optional[CovesClient] = None 48 + ): 49 + """ 50 + Initialize aggregator. 
51 + 52 + Args: 53 + config_path: Path to config.yaml 54 + state_file: Path to state.json 55 + coves_client: Optional CovesClient (for testing) 56 + """ 57 + # Load configuration 58 + logger.info("Loading configuration...") 59 + config_loader = ConfigLoader(config_path) 60 + self.config = config_loader.load() 61 + 62 + # Initialize components 63 + logger.info("Initializing components...") 64 + self.rss_fetcher = RSSFetcher() 65 + self.html_parser = KagiHTMLParser() 66 + self.richtext_formatter = RichTextFormatter() 67 + self.state_manager = StateManager(state_file) 68 + self.state_file = state_file 69 + 70 + # Initialize Coves client (or use provided one for testing) 71 + if coves_client: 72 + self.coves_client = coves_client 73 + else: 74 + # Get credentials from environment 75 + aggregator_handle = os.getenv('AGGREGATOR_HANDLE') 76 + aggregator_password = os.getenv('AGGREGATOR_PASSWORD') 77 + pds_url = os.getenv('PDS_URL') # Optional: separate PDS for auth 78 + 79 + if not aggregator_handle or not aggregator_password: 80 + raise ValueError( 81 + "Missing AGGREGATOR_HANDLE or AGGREGATOR_PASSWORD environment variables" 82 + ) 83 + 84 + self.coves_client = CovesClient( 85 + api_url=self.config.coves_api_url, 86 + handle=aggregator_handle, 87 + password=aggregator_password, 88 + pds_url=pds_url # Auth through PDS if specified 89 + ) 90 + 91 + def run(self): 92 + """ 93 + Run aggregator: fetch, parse, post, and update state. 94 + 95 + This is the main entry point for CRON execution. 
96 + """ 97 + logger.info("=" * 60) 98 + logger.info("Starting Kagi News Aggregator") 99 + logger.info("=" * 60) 100 + 101 + # Get enabled feeds only 102 + enabled_feeds = [f for f in self.config.feeds if f.enabled] 103 + logger.info(f"Processing {len(enabled_feeds)} enabled feeds") 104 + 105 + # Authenticate once at the start 106 + try: 107 + self.coves_client.authenticate() 108 + except Exception as e: 109 + logger.error(f"Failed to authenticate: {e}") 110 + logger.error("Cannot continue without authentication") 111 + return 112 + 113 + # Process each feed 114 + for feed_config in enabled_feeds: 115 + try: 116 + self._process_feed(feed_config) 117 + except Exception as e: 118 + # Log error but continue with other feeds 119 + logger.error(f"Error processing feed '{feed_config.name}': {e}", exc_info=True) 120 + continue 121 + 122 + logger.info("=" * 60) 123 + logger.info("Aggregator run completed") 124 + logger.info("=" * 60) 125 + 126 + def _process_feed(self, feed_config): 127 + """ 128 + Process a single RSS feed. 
129 + 130 + Args: 131 + feed_config: FeedConfig object 132 + """ 133 + logger.info(f"Processing feed: {feed_config.name} -> {feed_config.community_handle}") 134 + 135 + # Fetch RSS feed 136 + try: 137 + feed = self.rss_fetcher.fetch_feed(feed_config.url) 138 + except Exception as e: 139 + logger.error(f"Failed to fetch feed '{feed_config.name}': {e}") 140 + raise 141 + 142 + # Check for feed errors 143 + if feed.bozo: 144 + logger.warning(f"Feed '{feed_config.name}' has parsing issues (bozo flag set)") 145 + 146 + # Process entries 147 + new_posts = 0 148 + skipped_posts = 0 149 + 150 + for entry in feed.entries: 151 + try: 152 + # Check if already posted 153 + guid = entry.guid if hasattr(entry, 'guid') else entry.link 154 + if self.state_manager.is_posted(feed_config.url, guid): 155 + skipped_posts += 1 156 + logger.debug(f"Skipping already-posted story: {guid}") 157 + continue 158 + 159 + # Parse story 160 + story = self.html_parser.parse_to_story( 161 + title=entry.title, 162 + link=entry.link, 163 + guid=guid, 164 + pub_date=entry.published_parsed, 165 + categories=[tag.term for tag in entry.tags] if hasattr(entry, 'tags') else [], 166 + html_description=entry.description 167 + ) 168 + 169 + # Format as rich text 170 + rich_text = self.richtext_formatter.format_full(story) 171 + 172 + # Create external embed 173 + embed = self.coves_client.create_external_embed( 174 + uri=story.link, 175 + title=story.title, 176 + description=story.summary[:200] if len(story.summary) > 200 else story.summary, 177 + thumb=story.image_url 178 + ) 179 + 180 + # Post to community 181 + try: 182 + post_uri = self.coves_client.create_post( 183 + community_handle=feed_config.community_handle, 184 + content=rich_text["content"], 185 + facets=rich_text["facets"], 186 + embed=embed 187 + ) 188 + 189 + # Mark as posted (only if successful) 190 + self.state_manager.mark_posted(feed_config.url, guid, post_uri) 191 + new_posts += 1 192 + logger.info(f"Posted: {story.title[:50]}... 
-> {post_uri}") 193 + 194 + except Exception as e: 195 + # Don't update state if posting failed 196 + logger.error(f"Failed to post story '{story.title}': {e}") 197 + continue 198 + 199 + except Exception as e: 200 + # Log error but continue with other entries 201 + logger.error(f"Error processing entry: {e}", exc_info=True) 202 + continue 203 + 204 + # Update last run timestamp 205 + self.state_manager.update_last_run(feed_config.url, datetime.now()) 206 + 207 + logger.info( 208 + f"Feed '{feed_config.name}': {new_posts} new posts, {skipped_posts} duplicates" 209 + ) 210 + 211 + 212 + def main(): 213 + """ 214 + Main entry point for command-line execution. 215 + 216 + Usage: 217 + python -m src.main 218 + """ 219 + # Get paths from environment or use defaults 220 + config_path = Path(os.getenv('CONFIG_PATH', 'config.yaml')) 221 + state_file = Path(os.getenv('STATE_FILE', 'data/state.json')) 222 + 223 + # Validate config file exists 224 + if not config_path.exists(): 225 + logger.error(f"Configuration file not found: {config_path}") 226 + logger.error("Please create config.yaml (see config.example.yaml)") 227 + sys.exit(1) 228 + 229 + # Create aggregator and run 230 + try: 231 + aggregator = Aggregator( 232 + config_path=config_path, 233 + state_file=state_file 234 + ) 235 + aggregator.run() 236 + sys.exit(0) 237 + except Exception as e: 238 + logger.error(f"Aggregator failed: {e}", exc_info=True) 239 + sys.exit(1) 240 + 241 + 242 + if __name__ == '__main__': 243 + main()
+79
aggregators/kagi-news/src/models.py
··· 1 + """ 2 + Data models for Kagi News RSS aggregator. 3 + """ 4 + from dataclasses import dataclass, field 5 + from datetime import datetime 6 + from typing import List, Optional 7 + 8 + 9 + @dataclass 10 + class Source: 11 + """A news source citation.""" 12 + title: str 13 + url: str 14 + domain: str 15 + 16 + 17 + @dataclass 18 + class Perspective: 19 + """A perspective from a particular actor/stakeholder.""" 20 + actor: str 21 + description: str 22 + source_url: str 23 + 24 + 25 + @dataclass 26 + class Quote: 27 + """A notable quote from the story.""" 28 + text: str 29 + attribution: str 30 + 31 + 32 + @dataclass 33 + class KagiStory: 34 + """ 35 + Structured representation of a Kagi News story. 36 + 37 + Parsed from RSS feed item with HTML description. 38 + """ 39 + # RSS metadata 40 + title: str 41 + link: str # Kagi story permalink 42 + guid: str 43 + pub_date: datetime 44 + categories: List[str] = field(default_factory=list) 45 + 46 + # Parsed from HTML description 47 + summary: str = "" 48 + highlights: List[str] = field(default_factory=list) 49 + perspectives: List[Perspective] = field(default_factory=list) 50 + quote: Optional[Quote] = None 51 + sources: List[Source] = field(default_factory=list) 52 + image_url: Optional[str] = None 53 + image_alt: Optional[str] = None 54 + 55 + def __post_init__(self): 56 + """Validate required fields.""" 57 + if not self.title: 58 + raise ValueError("title is required") 59 + if not self.link: 60 + raise ValueError("link is required") 61 + if not self.guid: 62 + raise ValueError("guid is required") 63 + 64 + 65 + @dataclass 66 + class FeedConfig: 67 + """Configuration for a single RSS feed.""" 68 + name: str 69 + url: str 70 + community_handle: str 71 + enabled: bool = True 72 + 73 + 74 + @dataclass 75 + class AggregatorConfig: 76 + """Full aggregator configuration.""" 77 + coves_api_url: str 78 + feeds: List[FeedConfig] 79 + log_level: str = "info"
+177
aggregators/kagi-news/src/richtext_formatter.py
··· 1 + """ 2 + Rich Text Formatter for Coves posts. 3 + 4 + Converts KagiStory objects to Coves rich text format with facets. 5 + Handles UTF-8 byte position calculation for multi-byte characters. 6 + """ 7 + import logging 8 + from typing import Dict, List, Tuple 9 + from src.models import KagiStory, Perspective, Source 10 + 11 + logger = logging.getLogger(__name__) 12 + 13 + 14 + class RichTextFormatter: 15 + """ 16 + Formats KagiStory into Coves rich text with facets. 17 + 18 + Applies: 19 + - Bold facets for section headers and perspective actors 20 + - Italic facets for quotes 21 + - Link facets for all URLs 22 + """ 23 + 24 + def format_full(self, story: KagiStory) -> Dict: 25 + """ 26 + Format KagiStory into full rich text format. 27 + 28 + Args: 29 + story: KagiStory object to format 30 + 31 + Returns: 32 + Dictionary with 'content' (str) and 'facets' (list) 33 + """ 34 + builder = RichTextBuilder() 35 + 36 + # Summary 37 + builder.add_text(story.summary) 38 + builder.add_text("\n\n") 39 + 40 + # Highlights (if present) 41 + if story.highlights: 42 + builder.add_bold("Highlights:") 43 + builder.add_text("\n") 44 + for highlight in story.highlights: 45 + builder.add_text(f"• {highlight}\n") 46 + builder.add_text("\n") 47 + 48 + # Perspectives (if present) 49 + if story.perspectives: 50 + builder.add_bold("Perspectives:") 51 + builder.add_text("\n") 52 + for perspective in story.perspectives: 53 + # Bold the actor name 54 + actor_with_colon = f"{perspective.actor}:" 55 + builder.add_bold(actor_with_colon) 56 + builder.add_text(f" {perspective.description} (") 57 + 58 + # Add link to source 59 + source_link_text = "Source" 60 + builder.add_link(source_link_text, perspective.source_url) 61 + builder.add_text(")\n") 62 + builder.add_text("\n") 63 + 64 + # Quote (if present) 65 + if story.quote: 66 + quote_text = f'"{story.quote.text}"' 67 + builder.add_italic(quote_text) 68 + builder.add_text(f" — {story.quote.attribution}\n\n") 69 + 70 + # Sources (if present) 
class RichTextBuilder:
    """
    Helper class to build rich text content with facets.

    Handles UTF-8 byte position tracking automatically: facet offsets are
    byte indices into the UTF-8 encoding of the content, so multi-byte
    characters (accented letters, emoji) are counted correctly.
    """

    def __init__(self):
        # Text fragments in insertion order; joined on build().
        self.content_parts = []
        # Facet dicts in lexicon wire format (index + features).
        self.facets = []
        # Running UTF-8 byte length of everything appended so far.
        # Tracking this incrementally avoids re-joining and re-encoding
        # the whole buffer on every append, which made repeated adds
        # O(n^2) in total content size.
        self._byte_len = 0

    def add_text(self, text: str):
        """Add plain text without any facets."""
        self.content_parts.append(text)
        self._byte_len += len(text.encode('utf-8'))

    def add_bold(self, text: str):
        """Add text with bold facet."""
        self._add_faceted(text, [{"$type": "social.coves.richtext.facet#bold"}])

    def add_italic(self, text: str):
        """Add text with italic facet."""
        self._add_faceted(text, [{"$type": "social.coves.richtext.facet#italic"}])

    def add_link(self, text: str, uri: str):
        """Add text with a link facet pointing at *uri*."""
        self._add_faceted(text, [{
            "$type": "social.coves.richtext.facet#link",
            "uri": uri
        }])

    def _add_faceted(self, text: str, features: list):
        """
        Append *text* and record one facet spanning exactly that text.

        Shared implementation for add_bold/add_italic/add_link, which
        previously each duplicated the byte-span bookkeeping.
        """
        start_byte = self._get_current_byte_position()
        self.add_text(text)
        end_byte = self._get_current_byte_position()

        self.facets.append({
            "index": {
                "byteStart": start_byte,
                "byteEnd": end_byte
            },
            "features": features
        })

    def _get_current_byte_position(self) -> int:
        """
        Get the current byte position in the content.

        Returns the running UTF-8 byte count maintained by add_text, so
        multi-byte characters are handled correctly without re-encoding.
        """
        return self._byte_len

    def build(self) -> Dict:
        """
        Build the final rich text object.

        Returns:
            Dictionary with 'content' (joined text) and 'facets'
            (sorted by byteStart for deterministic output).
        """
        content = ''.join(self.content_parts)

        # Sort facets by start position for consistency
        sorted_facets = sorted(self.facets, key=lambda f: f['index']['byteStart'])

        return {
            "content": content,
            "facets": sorted_facets
        }
+71
aggregators/kagi-news/src/rss_fetcher.py
"""
RSS feed fetcher with retry logic and error handling.
"""
import time
import logging
import requests
import feedparser
from typing import Optional

logger = logging.getLogger(__name__)


class RSSFetcher:
    """Fetches RSS feeds over HTTP with exponential-backoff retries."""

    def __init__(self, timeout: int = 30, max_retries: int = 3):
        """
        Initialize RSS fetcher.

        Args:
            timeout: Request timeout in seconds (must be positive)
            max_retries: Maximum number of attempts (must be >= 1)

        Raises:
            ValueError: If timeout or max_retries is out of range.
                Without this guard, max_retries <= 0 made fetch_feed()
                fall through its loop and execute ``raise last_error``
                with last_error still None -> a confusing TypeError.
        """
        if timeout <= 0:
            raise ValueError("timeout must be positive")
        if max_retries < 1:
            raise ValueError("max_retries must be at least 1")
        self.timeout = timeout
        self.max_retries = max_retries

    def fetch_feed(self, url: str) -> feedparser.FeedParserDict:
        """
        Fetch and parse an RSS feed.

        Args:
            url: RSS feed URL

        Returns:
            Parsed feed object

        Raises:
            ValueError: If URL is empty
            requests.RequestException: If all retry attempts fail
        """
        if not url:
            raise ValueError("URL cannot be empty")

        last_error = None

        for attempt in range(self.max_retries):
            try:
                logger.info(f"Fetching feed from {url} (attempt {attempt + 1}/{self.max_retries})")

                response = requests.get(url, timeout=self.timeout)
                response.raise_for_status()

                # Parse with feedparser
                feed = feedparser.parse(response.content)

                # feedparser never raises on malformed XML; it sets the
                # `bozo` flag instead. Log it so broken feeds are visible
                # rather than silently producing partial data.
                if feed.bozo:
                    logger.warning(f"Feed parsed with errors (bozo): {feed.get('bozo_exception')}")

                logger.info(f"Successfully fetched feed: {feed.feed.get('title', 'Unknown')}")
                return feed

            except requests.RequestException as e:
                last_error = e
                logger.warning(f"Fetch attempt {attempt + 1} failed: {e}")

                if attempt < self.max_retries - 1:
                    # Exponential backoff: 1s, 2s, 4s, ...
                    sleep_time = 2 ** attempt
                    logger.info(f"Retrying in {sleep_time} seconds...")
                    time.sleep(sleep_time)

        # All retries exhausted; max_retries >= 1 guarantees last_error is set.
        logger.error(f"Failed to fetch feed after {self.max_retries} attempts")
        raise last_error
+213
aggregators/kagi-news/src/state_manager.py
"""
State Manager for tracking posted stories.

Handles deduplication by tracking which stories have already been posted.
Uses JSON file for persistence.
"""
import json
import logging
from pathlib import Path
from datetime import datetime, timedelta
from typing import Optional, Dict, List

logger = logging.getLogger(__name__)


class StateManager:
    """
    Manages aggregator state for deduplication.

    Tracks:
    - Posted GUIDs per feed (with timestamps)
    - Last successful run timestamp per feed
    - Automatic cleanup of old entries

    On-disk layout:
        {"feeds": {"<feed_url>": {
            "posted_guids": [{"guid", "post_uri", "posted_at"}, ...],
            "last_successful_run": "<iso timestamp>" | null}}}
    """

    def __init__(self, state_file: Path, max_guids_per_feed: int = 100, max_age_days: int = 30):
        """
        Initialize state manager.

        Args:
            state_file: Path to JSON state file
            max_guids_per_feed: Maximum GUIDs to keep per feed (default: 100)
            max_age_days: Maximum age in days for GUIDs (default: 30)
        """
        self.state_file = Path(state_file)
        self.max_guids_per_feed = max_guids_per_feed
        self.max_age_days = max_age_days
        self.state = self._load_state()

    def _load_state(self) -> Dict:
        """
        Load state from file, or create fresh state if the file is
        missing, corrupt, or has an unexpected shape.
        """
        if not self.state_file.exists():
            logger.info(f"Creating new state file at {self.state_file}")
            state = {'feeds': {}}
            self._save_state(state)
            return state

        try:
            with open(self.state_file, 'r') as f:
                state = json.load(f)
        except json.JSONDecodeError as e:
            logger.error(f"Failed to load state file: {e}. Creating new state.")
            state = {'feeds': {}}
            self._save_state(state)
            return state

        # Valid JSON with the wrong top-level type (e.g. a hand-edited
        # file) would otherwise crash later with TypeError/KeyError;
        # treat it like a corrupt file and start over.
        if not isinstance(state, dict):
            logger.error("State file has unexpected structure. Creating new state.")
            state = {'feeds': {}}
            self._save_state(state)
            return state

        # Tolerate a missing 'feeds' key rather than failing on first use.
        state.setdefault('feeds', {})

        logger.info(f"Loaded state from {self.state_file}")
        return state

    def _save_state(self, state: Optional[Dict] = None):
        """
        Save state to file atomically.

        Writes to a temporary sibling file and renames it over the
        target, so a crash mid-write can never leave a truncated or
        half-written state file (which would wipe all dedup history).
        """
        if state is None:
            state = self.state

        # Ensure parent directory exists
        self.state_file.parent.mkdir(parents=True, exist_ok=True)

        tmp_path = self.state_file.with_suffix(self.state_file.suffix + '.tmp')
        with open(tmp_path, 'w') as f:
            json.dump(state, f, indent=2)
        # Path.replace is an atomic rename on POSIX filesystems.
        tmp_path.replace(self.state_file)

    def _ensure_feed_exists(self, feed_url: str):
        """Ensure a (possibly empty) feed entry exists in state."""
        if feed_url not in self.state['feeds']:
            self.state['feeds'][feed_url] = {
                'posted_guids': [],
                'last_successful_run': None
            }

    def is_posted(self, feed_url: str, guid: str) -> bool:
        """
        Check if a story has already been posted.

        Args:
            feed_url: RSS feed URL
            guid: Story GUID

        Returns:
            True if already posted, False otherwise
        """
        self._ensure_feed_exists(feed_url)

        posted_guids = self.state['feeds'][feed_url]['posted_guids']
        return any(entry['guid'] == guid for entry in posted_guids)

    def mark_posted(self, feed_url: str, guid: str, post_uri: str):
        """
        Mark a story as posted and persist the state.

        Args:
            feed_url: RSS feed URL
            guid: Story GUID
            post_uri: AT Proto URI of created post
        """
        self._ensure_feed_exists(feed_url)

        # NOTE: naive local time; cleanup_old_entries parses these back
        # with fromisoformat and compares against datetime.now(), so
        # both sides stay consistent.
        entry = {
            'guid': guid,
            'post_uri': post_uri,
            'posted_at': datetime.now().isoformat()
        }
        self.state['feeds'][feed_url]['posted_guids'].append(entry)

        # Auto-cleanup to keep state file manageable
        self.cleanup_old_entries(feed_url)

        # Save state
        self._save_state()

        logger.info(f"Marked as posted: {guid} -> {post_uri}")

    def get_last_run(self, feed_url: str) -> Optional[datetime]:
        """
        Get last successful run timestamp for a feed.

        Args:
            feed_url: RSS feed URL

        Returns:
            Datetime of last run, or None if never run
        """
        self._ensure_feed_exists(feed_url)

        timestamp_str = self.state['feeds'][feed_url]['last_successful_run']
        if timestamp_str is None:
            return None

        return datetime.fromisoformat(timestamp_str)

    def update_last_run(self, feed_url: str, timestamp: datetime):
        """
        Update last successful run timestamp and persist the state.

        Args:
            feed_url: RSS feed URL
            timestamp: Timestamp of successful run
        """
        self._ensure_feed_exists(feed_url)

        self.state['feeds'][feed_url]['last_successful_run'] = timestamp.isoformat()
        self._save_state()

        logger.info(f"Updated last run for {feed_url}: {timestamp}")

    def cleanup_old_entries(self, feed_url: str):
        """
        Remove old entries from state.

        Removes entries that are:
        - Older than max_age_days
        - Beyond max_guids_per_feed limit (keeps most recent)

        Args:
            feed_url: RSS feed URL
        """
        self._ensure_feed_exists(feed_url)

        posted_guids = self.state['feeds'][feed_url]['posted_guids']

        # Filter out entries older than max_age_days
        cutoff_date = datetime.now() - timedelta(days=self.max_age_days)
        filtered = [
            entry for entry in posted_guids
            if datetime.fromisoformat(entry['posted_at']) > cutoff_date
        ]

        # Keep only the most recent max_guids_per_feed entries
        # (sorted by posted_at, most recent first).
        filtered.sort(key=lambda x: x['posted_at'], reverse=True)
        filtered = filtered[:self.max_guids_per_feed]

        # Update state
        old_count = len(posted_guids)
        new_count = len(filtered)
        self.state['feeds'][feed_url]['posted_guids'] = filtered

        if old_count != new_count:
            logger.info(f"Cleaned up {old_count - new_count} old entries for {feed_url}")

    def get_posted_count(self, feed_url: str) -> int:
        """
        Get count of posted items for a feed.

        Args:
            feed_url: RSS feed URL

        Returns:
            Number of posted items
        """
        self._ensure_feed_exists(feed_url)
        return len(self.state['feeds'][feed_url]['posted_guids'])

    def get_all_posted_guids(self, feed_url: str) -> List[str]:
        """
        Get all posted GUIDs for a feed.

        Args:
            feed_url: RSS feed URL

        Returns:
            List of GUIDs
        """
        self._ensure_feed_exists(feed_url)
        return [entry['guid'] for entry in self.state['feeds'][feed_url]['posted_guids']]