Merge branch 'main' of tangled.sh:cameron.stream/void

cameron.stream / void

fork atom

a digital person for bluesky

fork atom

cameron.stream 2 months ago 6d0a4501 1f978880

+819 -18

1 changed file

expand all

unified split

bsky_utils.py

+819 -18

bsky_utils.py

··· 115 115 return obj 116 116 117 117 118 + def extract_links_from_facets(record_text: str, facets: list) -> list: 119 + """ 120 + Extract link URLs from facets with their associated text. 121 + 122 + Args: 123 + record_text: The post text (needed to extract link text using byte offsets) 124 + facets: List of facet objects from post record 125 + 126 + Returns: 127 + List of dicts with 'url' and 'text' keys 128 + """ 129 + links = [] 130 + text_bytes = record_text.encode('utf-8') 131 + 132 + for facet in facets: 133 + for feature in facet.features: 134 + if hasattr(feature, 'uri'): # Link facet 135 + byte_start = facet.index.byte_start 136 + byte_end = facet.index.byte_end 137 + try: 138 + link_text = text_bytes[byte_start:byte_end].decode('utf-8') 139 + except (UnicodeDecodeError, IndexError): 140 + link_text = feature.uri # Fallback to URL itself 141 + links.append({ 142 + 'url': feature.uri, 143 + 'text': link_text 144 + }) 145 + return links 146 + 147 + 148 + def extract_images_from_embed(embed, include_thumbnails: bool = True) -> list[dict]: 149 + """Extract image URLs and alt text from a post embed (View type). 150 + 151 + This function handles the View types returned by get_post_thread(), 152 + which contain CDN URLs for images (unlike raw record embeds which 153 + only have BlobRefs). 154 + 155 + Also extracts thumbnails from external links and videos when include_thumbnails=True. 156 + 157 + Args: 158 + embed: The embed object from post.embed (View type) 159 + include_thumbnails: Whether to include thumbnails from links/videos (default True) 160 + 161 + Returns: 162 + List of dicts with 'fullsize', 'thumb', 'alt', and optional 'source' keys 163 + """ 164 + images = [] 165 + if not embed: 166 + return images 167 + 168 + embed_type = getattr(embed, 'py_type', '') 169 + 170 + # Direct image embed (app.bsky.embed.images#view) 171 + if 'images' in embed_type and 'record' not in embed_type: 172 + for img in embed.images: 173 + images.append({ 174 + 'fullsize': getattr(img, 'fullsize', None), 175 + 'thumb': getattr(img, 'thumb', None), 176 + 'alt': getattr(img, 'alt', '') or '' 177 + }) 178 + 179 + # External link with thumbnail (app.bsky.embed.external#view) 180 + elif 'external' in embed_type and 'record' not in embed_type and include_thumbnails: 181 + if hasattr(embed, 'external') and embed.external: 182 + thumb = getattr(embed.external, 'thumb', None) 183 + if thumb: 184 + title = getattr(embed.external, 'title', '') or '' 185 + images.append({ 186 + 'fullsize': thumb, # External links only have thumb, use as fullsize too 187 + 'thumb': thumb, 188 + 'alt': f"Link preview: {title}" if title else 'Link preview image', 189 + 'source': 'external_link' 190 + }) 191 + 192 + # Video with thumbnail (app.bsky.embed.video#view) 193 + elif 'video' in embed_type and 'record' not in embed_type and include_thumbnails: 194 + thumb = getattr(embed, 'thumbnail', None) 195 + if thumb: 196 + alt = getattr(embed, 'alt', '') or 'Video thumbnail' 197 + images.append({ 198 + 'fullsize': thumb, 199 + 'thumb': thumb, 200 + 'alt': alt, 201 + 'source': 'video' 202 + }) 203 + 204 + # Quote post with media (app.bsky.embed.recordWithMedia#view) 205 + elif 'recordWithMedia' in embed_type and hasattr(embed, 'media'): 206 + media_type = getattr(embed.media, 'py_type', '') 207 + # Images in media 208 + if 'images' in media_type and hasattr(embed.media, 'images'): 209 + for img in embed.media.images: 210 + images.append({ 211 + 'fullsize': getattr(img, 'fullsize', None), 212 + 'thumb': getattr(img, 'thumb', None), 213 + 'alt': getattr(img, 'alt', '') or '' 214 + }) 215 + # External link thumbnail in media 216 + elif 'external' in media_type and include_thumbnails: 217 + if hasattr(embed.media, 'external') and embed.media.external: 218 + thumb = getattr(embed.media.external, 'thumb', None) 219 + if thumb: 220 + title = getattr(embed.media.external, 'title', '') or '' 221 + images.append({ 222 + 'fullsize': thumb, 223 + 'thumb': thumb, 224 + 'alt': f"Link preview: {title}" if title else 'Link preview image', 225 + 'source': 'external_link' 226 + }) 227 + # Video thumbnail in media 228 + elif 'video' in media_type and include_thumbnails: 229 + thumb = getattr(embed.media, 'thumbnail', None) 230 + if thumb: 231 + alt = getattr(embed.media, 'alt', '') or 'Video thumbnail' 232 + images.append({ 233 + 'fullsize': thumb, 234 + 'thumb': thumb, 235 + 'alt': alt, 236 + 'source': 'video' 237 + }) 238 + 239 + # Quote post - check for images in nested embeds (app.bsky.embed.record#view) 240 + elif 'record' in embed_type and 'recordWithMedia' not in embed_type: 241 + if hasattr(embed, 'record') and embed.record: 242 + record = embed.record 243 + if hasattr(record, 'embeds') and record.embeds: 244 + for nested in record.embeds: 245 + nested_type = getattr(nested, 'py_type', '') 246 + # Nested images 247 + if 'images' in nested_type and hasattr(nested, 'images'): 248 + for img in nested.images: 249 + images.append({ 250 + 'fullsize': getattr(img, 'fullsize', None), 251 + 'thumb': getattr(img, 'thumb', None), 252 + 'alt': getattr(img, 'alt', '') or '', 253 + 'source': 'quoted_post' 254 + }) 255 + # Nested external link thumbnail 256 + elif 'external' in nested_type and include_thumbnails: 257 + if hasattr(nested, 'external') and nested.external: 258 + thumb = getattr(nested.external, 'thumb', None) 259 + if thumb: 260 + title = getattr(nested.external, 'title', '') or '' 261 + images.append({ 262 + 'fullsize': thumb, 263 + 'thumb': thumb, 264 + 'alt': f"Link preview: {title}" if title else 'Link preview image', 265 + 'source': 'quoted_post_link' 266 + }) 267 + # Nested video thumbnail 268 + elif 'video' in nested_type and include_thumbnails: 269 + thumb = getattr(nested, 'thumbnail', None) 270 + if thumb: 271 + alt = getattr(nested, 'alt', '') or 'Video thumbnail' 272 + images.append({ 273 + 'fullsize': thumb, 274 + 'thumb': thumb, 275 + 'alt': alt, 276 + 'source': 'quoted_post_video' 277 + }) 278 + 279 + return images 280 + 281 + 282 + def extract_images_from_thread(thread_data, max_images: int = 8) -> list[dict]: 283 + """Extract all images from a thread, up to max_images. 284 + 285 + Traverses the thread structure and extracts image URLs from post embeds. 286 + Images are collected in chronological order (parents before children). 287 + 288 + Args: 289 + thread_data: The thread data from get_post_thread 290 + max_images: Maximum number of images to extract (default 8) 291 + 292 + Returns: 293 + List of image dicts with 'fullsize', 'thumb', 'alt', 'author_handle' keys 294 + """ 295 + images = [] 296 + 297 + def traverse_thread(node): 298 + if not node or len(images) >= max_images: 299 + return 300 + 301 + # Traverse parent first (chronological order) 302 + if hasattr(node, 'parent') and node.parent: 303 + traverse_thread(node.parent) 304 + 305 + # Extract images from this post's embed (View type, not record.embed) 306 + if hasattr(node, 'post') and node.post: 307 + post = node.post 308 + if hasattr(post, 'embed') and post.embed: 309 + post_images = extract_images_from_embed(post.embed) 310 + author_handle = getattr(post.author, 'handle', 'unknown') if hasattr(post, 'author') else 'unknown' 311 + for img in post_images: 312 + if len(images) >= max_images: 313 + break 314 + img['author_handle'] = author_handle 315 + images.append(img) 316 + 317 + # Traverse replies 318 + if hasattr(node, 'replies') and node.replies: 319 + for reply in node.replies: 320 + if len(images) >= max_images: 321 + break 322 + traverse_thread(reply) 323 + 324 + if hasattr(thread_data, 'thread'): 325 + traverse_thread(thread_data.thread) 326 + 327 + return images 328 + 329 + 330 + def extract_external_link_from_embed(embed) -> dict | None: 331 + """Extract external link card data from a post embed (View type). 332 + 333 + External links are shown as "link cards" with URL, title, description, 334 + and optional thumbnail. 335 + 336 + Args: 337 + embed: The embed object from post.embed (View type) 338 + 339 + Returns: 340 + Dict with 'url', 'title', 'description', 'thumbnail' keys, or None 341 + """ 342 + if not embed: 343 + return None 344 + 345 + embed_type = getattr(embed, 'py_type', '') 346 + 347 + # Direct external link embed (app.bsky.embed.external#view) 348 + if 'external' in embed_type and hasattr(embed, 'external'): 349 + external = embed.external 350 + return { 351 + 'url': getattr(external, 'uri', ''), 352 + 'title': getattr(external, 'title', ''), 353 + 'description': getattr(external, 'description', ''), 354 + 'thumbnail': getattr(external, 'thumb', None) 355 + } 356 + 357 + # RecordWithMedia with external link (app.bsky.embed.recordWithMedia#view) 358 + if 'recordWithMedia' in embed_type and hasattr(embed, 'media'): 359 + media_type = getattr(embed.media, 'py_type', '') 360 + if 'external' in media_type and hasattr(embed.media, 'external'): 361 + external = embed.media.external 362 + return { 363 + 'url': getattr(external, 'uri', ''), 364 + 'title': getattr(external, 'title', ''), 365 + 'description': getattr(external, 'description', ''), 366 + 'thumbnail': getattr(external, 'thumb', None) 367 + } 368 + 369 + return None 370 + 371 + 372 + def extract_quote_post_from_embed(embed) -> dict | None: 373 + """Extract quoted post data from a record embed (View type). 374 + 375 + Quote posts embed another post, which can include the quoted text, 376 + author, and any media attached to the quoted post. 377 + 378 + Args: 379 + embed: The embed object from post.embed (View type) 380 + 381 + Returns: 382 + Dict with quote post data, or None if not a quote or unavailable 383 + """ 384 + if not embed: 385 + return None 386 + 387 + embed_type = getattr(embed, 'py_type', '') 388 + 389 + # Get the record object (works for both record and recordWithMedia) 390 + record = None 391 + if 'recordWithMedia' in embed_type and hasattr(embed, 'record'): 392 + # recordWithMedia has record.record for the actual quote 393 + record = getattr(embed.record, 'record', None) 394 + elif 'record' in embed_type and hasattr(embed, 'record'): 395 + record = embed.record 396 + 397 + if not record: 398 + return None 399 + 400 + record_type = getattr(record, 'py_type', '') 401 + 402 + # Handle different quote post states 403 + if 'viewNotFound' in record_type: 404 + return { 405 + 'status': 'not_found', 406 + 'uri': getattr(record, 'uri', ''), 407 + 'message': 'Quoted post was deleted or not found' 408 + } 409 + 410 + if 'viewBlocked' in record_type: 411 + return { 412 + 'status': 'blocked', 413 + 'uri': getattr(record, 'uri', ''), 414 + 'message': 'Quoted post is from a blocked account' 415 + } 416 + 417 + if 'viewDetached' in record_type: 418 + return { 419 + 'status': 'detached', 420 + 'uri': getattr(record, 'uri', ''), 421 + 'message': 'Quoted post was detached' 422 + } 423 + 424 + # Normal quote post (viewRecord) 425 + if 'viewRecord' in record_type or hasattr(record, 'author'): 426 + result = { 427 + 'status': 'available', 428 + 'uri': getattr(record, 'uri', ''), 429 + } 430 + 431 + # Extract author info 432 + if hasattr(record, 'author') and record.author: 433 + author = record.author 434 + result['author'] = { 435 + 'handle': getattr(author, 'handle', 'unknown'), 436 + 'display_name': getattr(author, 'display_name', '') or getattr(author, 'handle', 'unknown') 437 + } 438 + 439 + # Extract the quoted post text from value 440 + # The 'value' field contains the actual post record 441 + if hasattr(record, 'value') and record.value: 442 + value = record.value 443 + # value can be a dict or an object 444 + if isinstance(value, dict): 445 + result['text'] = value.get('text', '') 446 + elif hasattr(value, 'text'): 447 + result['text'] = getattr(value, 'text', '') 448 + 449 + # Extract engagement metrics if present 450 + metrics = {} 451 + if hasattr(record, 'like_count') and record.like_count is not None: 452 + metrics['likes'] = record.like_count 453 + if hasattr(record, 'repost_count') and record.repost_count is not None: 454 + metrics['reposts'] = record.repost_count 455 + if hasattr(record, 'reply_count') and record.reply_count is not None: 456 + metrics['replies'] = record.reply_count 457 + if hasattr(record, 'quote_count') and record.quote_count is not None: 458 + metrics['quotes'] = record.quote_count 459 + if metrics: 460 + result['metrics'] = metrics 461 + 462 + # Add thread context hints (for hybrid thread navigation) 463 + thread_context = {} 464 + 465 + # Reply count indicates replies exist below this post 466 + if metrics.get('replies'): 467 + thread_context['reply_count'] = metrics['replies'] 468 + 469 + # Check if quoted post is itself a reply (has parents above) 470 + if hasattr(record, 'value') and record.value: 471 + value = record.value 472 + reply_ref = value.get('reply') if isinstance(value, dict) else getattr(value, 'reply', None) 473 + if reply_ref: 474 + thread_context['has_parents'] = True 475 + 476 + if thread_context: 477 + result['thread_context'] = thread_context 478 + 479 + # Check for nested embeds in the quoted post 480 + if hasattr(record, 'embeds') and record.embeds: 481 + nested_embeds = [] 482 + for nested in record.embeds: 483 + nested_type = getattr(nested, 'py_type', '') 484 + if 'images' in nested_type: 485 + nested_embeds.append({'type': 'images', 'count': len(getattr(nested, 'images', []))}) 486 + elif 'video' in nested_type: 487 + nested_embeds.append({'type': 'video'}) 488 + elif 'external' in nested_type: 489 + ext = getattr(nested, 'external', None) 490 + if ext: 491 + nested_embeds.append({ 492 + 'type': 'external_link', 493 + 'url': getattr(ext, 'uri', ''), 494 + 'title': getattr(ext, 'title', '') 495 + }) 496 + if nested_embeds: 497 + result['embeds'] = nested_embeds 498 + 499 + return result 500 + 501 + return None 502 + 503 + 504 + def extract_embed_data(embed) -> dict | None: 505 + """Extract structured data from any embed type. 506 + 507 + This is the main entry point for embed extraction. It detects the embed 508 + type and delegates to the appropriate extraction function. 509 + 510 + Args: 511 + embed: The embed object from post.embed (View type) 512 + 513 + Returns: 514 + Dict with embed type and extracted data, or None if no embed 515 + """ 516 + if not embed: 517 + return None 518 + 519 + embed_type = getattr(embed, 'py_type', '') 520 + 521 + # Images 522 + if 'images' in embed_type and 'record' not in embed_type: 523 + images = extract_images_from_embed(embed) 524 + if images: 525 + return { 526 + 'type': 'images', 527 + 'images': images 528 + } 529 + 530 + # External link 531 + if 'external' in embed_type and 'record' not in embed_type: 532 + link = extract_external_link_from_embed(embed) 533 + if link: 534 + return { 535 + 'type': 'external_link', 536 + 'link': link 537 + } 538 + 539 + # Quote post (record) 540 + if embed_type == 'app.bsky.embed.record#view': 541 + quote = extract_quote_post_from_embed(embed) 542 + if quote: 543 + return { 544 + 'type': 'quote_post', 545 + 'quote': quote 546 + } 547 + 548 + # Quote post with media (recordWithMedia) 549 + if 'recordWithMedia' in embed_type: 550 + result = {'type': 'quote_with_media'} 551 + 552 + # Extract the quote 553 + quote = extract_quote_post_from_embed(embed) 554 + if quote: 555 + result['quote'] = quote 556 + 557 + # Extract the media 558 + if hasattr(embed, 'media'): 559 + media_type = getattr(embed.media, 'py_type', '') 560 + if 'images' in media_type: 561 + images = extract_images_from_embed(embed) 562 + if images: 563 + result['media'] = {'type': 'images', 'images': images} 564 + elif 'external' in media_type: 565 + link = extract_external_link_from_embed(embed) 566 + if link: 567 + result['media'] = {'type': 'external_link', 'link': link} 568 + elif 'video' in media_type: 569 + # Basic video info 570 + result['media'] = { 571 + 'type': 'video', 572 + 'thumbnail': getattr(embed.media, 'thumbnail', None), 573 + 'alt': getattr(embed.media, 'alt', None) 574 + } 575 + 576 + return result 577 + 578 + # Video (basic handling) 579 + if 'video' in embed_type: 580 + return { 581 + 'type': 'video', 582 + 'thumbnail': getattr(embed, 'thumbnail', None), 583 + 'alt': getattr(embed, 'alt', None) 584 + } 585 + 586 + return None 587 + 588 + 118 589 def flatten_thread_structure(thread_data): 119 590 """ 120 591 Flatten a nested thread structure into a list while preserving all data. 121 - 592 + 122 593 Args: 123 594 thread_data: The thread data from get_post_thread 124 - 595 + 125 596 Returns: 126 597 Dict with 'posts' key containing a list of posts in chronological order 127 598 """ 128 599 posts = [] 129 - 600 + 130 601 def traverse_thread(node): 131 602 """Recursively traverse the thread structure to collect posts.""" 132 603 if not node: 133 604 return 134 - 605 + 135 606 # If this node has a parent, traverse it first (to maintain chronological order) 136 607 if hasattr(node, 'parent') and node.parent: 137 608 traverse_thread(node.parent) 138 - 609 + 139 610 # Then add this node's post 140 611 if hasattr(node, 'post') and node.post: 141 - # Convert to dict if needed to ensure we can process it 142 - if hasattr(node.post, '__dict__'): 143 - post_dict = node.post.__dict__.copy() 144 - elif isinstance(node.post, dict): 145 - post_dict = node.post.copy() 146 - else: 147 - post_dict = {} 148 - 612 + # Extract post data by accessing properties directly (not __dict__) 613 + # AT Protocol objects store data in properties, not __dict__ 614 + post = node.post 615 + 616 + # Build post dict with proper property access 617 + post_dict = {} 618 + 619 + # Extract basic fields 620 + if hasattr(post, 'uri'): 621 + post_dict['uri'] = post.uri 622 + if hasattr(post, 'cid'): 623 + post_dict['cid'] = post.cid 624 + 625 + # Extract author info 626 + if hasattr(post, 'author') and post.author: 627 + author = post.author 628 + post_dict['author'] = { 629 + 'handle': getattr(author, 'handle', 'unknown'), 630 + 'display_name': getattr(author, 'display_name', 'unknown'), 631 + 'did': getattr(author, 'did', 'unknown') 632 + } 633 + 634 + # Extract record info (text, created_at, etc.) 635 + if hasattr(post, 'record') and post.record: 636 + record = post.record 637 + record_dict = { 638 + 'text': getattr(record, 'text', ''), 639 + 'createdAt': getattr(record, 'created_at', 'unknown') 640 + } 641 + 642 + # Extract links from facets if present 643 + if hasattr(record, 'facets') and record.facets: 644 + links = extract_links_from_facets( 645 + getattr(record, 'text', ''), 646 + record.facets 647 + ) 648 + if links: 649 + record_dict['links'] = links 650 + 651 + post_dict['record'] = record_dict 652 + 653 + # Extract embed data from post.embed (View type with CDN URLs) 654 + # This is different from record.embed which only has raw BlobRefs 655 + if hasattr(post, 'embed') and post.embed: 656 + embed_data = extract_embed_data(post.embed) 657 + if embed_data: 658 + post_dict['embed'] = embed_data 659 + 660 + # Extract parent_uri for tree visualization 661 + parent_uri = None 662 + if hasattr(post, 'record') and post.record: 663 + record_obj = post.record 664 + if hasattr(record_obj, 'reply') and record_obj.reply: 665 + reply_ref = record_obj.reply 666 + if hasattr(reply_ref, 'parent') and reply_ref.parent: 667 + if hasattr(reply_ref.parent, 'uri'): 668 + parent_uri = reply_ref.parent.uri 669 + post_dict['parent_uri'] = parent_uri 670 + 149 671 posts.append(post_dict) 150 - 672 + 673 + # Then traverse any replies (going DOWN the thread) 674 + if hasattr(node, 'replies') and node.replies: 675 + for reply in node.replies: 676 + traverse_thread(reply) 677 + 151 678 # Handle the thread structure 152 679 if hasattr(thread_data, 'thread'): 153 680 # Start from the main thread node 154 681 traverse_thread(thread_data.thread) 155 682 elif hasattr(thread_data, '__dict__') and 'thread' in thread_data.__dict__: 156 683 traverse_thread(thread_data.__dict__['thread']) 157 - 684 + 158 685 # Return a simple structure with posts list 159 686 return {'posts': posts} 160 687 ··· 173 700 return len(flattened.get('posts', [])) 174 701 175 702 176 - def thread_to_yaml_string(thread, strip_metadata=True): 703 + def compute_tree_prefixes(posts: List[Dict]) -> Dict[str, str]: 704 + """ 705 + Compute tree-style prefixes based on parent relationships. 706 + 707 + Args: 708 + posts: List of post dicts, each with 'uri' and 'parent_uri' keys 709 + 710 + Returns: 711 + Dict mapping uri -> prefix string (e.g., "├─ ", "│ └─ ") 712 + """ 713 + if not posts: 714 + return {} 715 + 716 + uri_to_post = {p.get('uri'): p for p in posts if p.get('uri')} 717 + children_map: Dict[str, List[str]] = {} # parent_uri -> [child_uris] 718 + root_uris: List[str] = [] 719 + 720 + for post in posts: 721 + uri = post.get('uri') 722 + if not uri: 723 + continue 724 + parent_uri = post.get('parent_uri') 725 + if not parent_uri or parent_uri not in uri_to_post: 726 + root_uris.append(uri) 727 + else: 728 + children_map.setdefault(parent_uri, []).append(uri) 729 + 730 + prefixes: Dict[str, str] = {} 731 + visited: set = set() 732 + 733 + def compute_recursive(uri: str, ancestors_last: List[bool]): 734 + if uri in visited: 735 + return 736 + visited.add(uri) 737 + 738 + prefix_parts = [] 739 + for is_last in ancestors_last[:-1]: 740 + prefix_parts.append(" " if is_last else "│ ") 741 + if ancestors_last: 742 + prefix_parts.append("└─ " if ancestors_last[-1] else "├─ ") 743 + prefixes[uri] = "".join(prefix_parts) 744 + 745 + children = children_map.get(uri, []) 746 + for i, child_uri in enumerate(children): 747 + compute_recursive(child_uri, ancestors_last + [i == len(children) - 1]) 748 + 749 + for i, root_uri in enumerate(root_uris): 750 + if len(root_uris) == 1: 751 + prefixes[root_uri] = "" 752 + children = children_map.get(root_uri, []) 753 + for j, child_uri in enumerate(children): 754 + compute_recursive(child_uri, [j == len(children) - 1]) 755 + else: 756 + compute_recursive(root_uri, [i == len(root_uris) - 1]) 757 + 758 + return prefixes 759 + 760 + 761 + def build_tree_view(posts: List[Dict]) -> str: 762 + """ 763 + Build a tree-style text visualization of a thread. 764 + 765 + Args: 766 + posts: List of post dicts with uri, parent_uri, author, record fields 767 + 768 + Returns: 769 + Multi-line string showing thread structure with tree prefixes 770 + """ 771 + if not posts: 772 + return "(empty thread)" 773 + 774 + prefixes = compute_tree_prefixes(posts) 775 + lines = [] 776 + 777 + for post in posts: 778 + uri = post.get('uri', '') 779 + prefix = prefixes.get(uri, '') 780 + 781 + author = post.get('author', {}) 782 + handle = author.get('handle', 'unknown') 783 + record = post.get('record', {}) 784 + text = record.get('text', '').replace('\n', ' | ') 785 + 786 + lines.append(f"{prefix}@{handle}: {text}") 787 + 788 + return "\n".join(lines) 789 + 790 + 791 + def thread_to_yaml_string(thread, strip_metadata=True, include_tree_view=True): 177 792 """ 178 793 Convert thread data to a YAML-formatted string for LLM parsing. 179 794 180 795 Args: 181 796 thread: The thread data from get_post_thread 182 797 strip_metadata: Whether to strip metadata fields for cleaner output 798 + include_tree_view: Whether to prepend a tree visualization of the thread 183 799 184 800 Returns: 185 - YAML-formatted string representation of the thread 801 + String representation of the thread with optional tree view and YAML data 186 802 """ 187 803 # First flatten the thread structure to avoid deep nesting 188 804 flattened = flatten_thread_structure(thread) 805 + posts = flattened.get('posts', []) 806 + 807 + output_parts = [] 808 + 809 + # Build tree visualization if requested 810 + if include_tree_view and posts: 811 + tree_view = build_tree_view(posts) 812 + output_parts.append("THREAD STRUCTURE:") 813 + output_parts.append(tree_view) 814 + output_parts.append("") 815 + output_parts.append("FULL POST DATA:") 189 816 190 817 # Convert complex objects to basic types 191 818 basic_thread = convert_to_basic_types(flattened) ··· 196 823 else: 197 824 cleaned_thread = basic_thread 198 825 199 - return yaml.dump(cleaned_thread, indent=2, allow_unicode=True, default_flow_style=False) 826 + yaml_output = yaml.dump(cleaned_thread, indent=2, allow_unicode=True, default_flow_style=False) 827 + output_parts.append(yaml_output) 828 + 829 + return "\n".join(output_parts) 200 830 201 831 202 832 ··· 512 1142 except Exception as e: 513 1143 logger.error(f"Error fetching post thread: {e}") 514 1144 return None 1145 + 1146 + 1147 + def find_last_consecutive_post_in_chain(thread_node, author_handle: str): 1148 + """ 1149 + Find the last consecutive post in the direct reply chain by the same author. 1150 + 1151 + Starting from the given thread node, this function traverses down the direct reply chain 1152 + (not all branches) to find the last consecutive post made by the specified author. 1153 + 1154 + Args: 1155 + thread_node: The thread node to start from (usually the mention post's thread node) 1156 + author_handle: The handle of the author to match (e.g., "user.bsky.social") 1157 + 1158 + Returns: 1159 + Tuple of (uri, cid, text) for the last consecutive post by the author, or None if no consecutive posts 1160 + 1161 + Example: 1162 + If the thread structure is: 1163 + - Post A by @alice (mention) -> thread_node starts here 1164 + - Post B by @alice (consecutive) 1165 + - Post C by @alice (consecutive) 1166 + - Post D by @bob (different author, stop here) 1167 + 1168 + Returns (uri_C, cid_C, text_C) 1169 + """ 1170 + if not thread_node: 1171 + return None 1172 + 1173 + # Start with the current node's post 1174 + current_post = None 1175 + if hasattr(thread_node, 'post') and thread_node.post: 1176 + current_post = thread_node.post 1177 + 1178 + if not current_post: 1179 + return None 1180 + 1181 + # Check if current post is by the target author 1182 + current_author = None 1183 + if hasattr(current_post, 'author') and hasattr(current_post.author, 'handle'): 1184 + current_author = current_post.author.handle 1185 + 1186 + if current_author != author_handle: 1187 + # Current post is not by target author, can't find consecutive posts 1188 + return None 1189 + 1190 + # Track the last consecutive post (start with current) 1191 + last_uri = current_post.uri if hasattr(current_post, 'uri') else None 1192 + last_cid = current_post.cid if hasattr(current_post, 'cid') else None 1193 + last_text = "" 1194 + if hasattr(current_post, 'record') and hasattr(current_post.record, 'text'): 1195 + last_text = current_post.record.text 1196 + 1197 + # Traverse down the direct reply chain 1198 + current_node = thread_node 1199 + while True: 1200 + # Check if there are replies to this node 1201 + if not hasattr(current_node, 'replies') or not current_node.replies: 1202 + # No more replies, we've found the last consecutive post 1203 + break 1204 + 1205 + # For direct chain traversal, we look for replies by the same author 1206 + # If there are multiple replies, we'll take the first one by the same author 1207 + next_node = None 1208 + for reply in current_node.replies: 1209 + if hasattr(reply, 'post') and reply.post: 1210 + reply_author = None 1211 + if hasattr(reply.post, 'author') and hasattr(reply.post.author, 'handle'): 1212 + reply_author = reply.post.author.handle 1213 + 1214 + if reply_author == author_handle: 1215 + # Found a consecutive post by same author 1216 + next_node = reply 1217 + break 1218 + 1219 + if not next_node: 1220 + # No more consecutive posts by same author 1221 + break 1222 + 1223 + # Update last post info to this consecutive post 1224 + current_node = next_node 1225 + current_post = current_node.post 1226 + 1227 + if hasattr(current_post, 'uri'): 1228 + last_uri = current_post.uri 1229 + if hasattr(current_post, 'cid'): 1230 + last_cid = current_post.cid 1231 + if hasattr(current_post, 'record') and hasattr(current_post.record, 'text'): 1232 + last_text = current_post.record.text 1233 + 1234 + # Return the last consecutive post's metadata 1235 + # Only return if we actually have valid URI and CID 1236 + if last_uri and last_cid: 1237 + return (last_uri, last_cid, last_text) 1238 + 1239 + return None 1240 + 1241 + 1242 + def find_consecutive_parent_posts_by_author(thread_node, author_handle: str) -> List[Dict]: 1243 + """ 1244 + Find consecutive posts by the same author in the parent chain. 1245 + 1246 + Starting from the given thread node, this function traverses UP the parent chain 1247 + to find all consecutive posts made by the specified author. 1248 + 1249 + This is the inverse of find_last_consecutive_post_in_chain which traverses DOWN. 1250 + 1251 + Args: 1252 + thread_node: The thread node to start from (the notification post's thread node) 1253 + author_handle: The handle of the author to match (e.g., "user.bsky.social") 1254 + 1255 + Returns: 1256 + List of post dicts for consecutive posts by the author in the parent chain, 1257 + in chronological order (oldest first). Returns empty list if no parent posts 1258 + by the same author. 1259 + 1260 + Example: 1261 + If the thread structure is: 1262 + - Post A by @alice (first part) 1263 + - Post B by @alice (consecutive) <- start from here (notification) 1264 + 1265 + Returns [Post A dict] (not including Post B since that's the current node) 1266 + """ 1267 + parent_posts = [] 1268 + 1269 + if not thread_node: 1270 + return parent_posts 1271 + 1272 + # Traverse up the parent chain 1273 + current_node = thread_node 1274 + while True: 1275 + # Check if this node has a parent 1276 + if not hasattr(current_node, 'parent') or not current_node.parent: 1277 + break 1278 + 1279 + parent_node = current_node.parent 1280 + if not hasattr(parent_node, 'post') or not parent_node.post: 1281 + break 1282 + 1283 + parent_post = parent_node.post 1284 + 1285 + # Check if parent is by the same author 1286 + parent_author = None 1287 + if hasattr(parent_post, 'author') and hasattr(parent_post.author, 'handle'): 1288 + parent_author = parent_post.author.handle 1289 + 1290 + if parent_author != author_handle: 1291 + # Parent is by different author, stop here 1292 + break 1293 + 1294 + # Collect this parent post 1295 + post_dict = { 1296 + 'uri': getattr(parent_post, 'uri', ''), 1297 + 'cid': getattr(parent_post, 'cid', ''), 1298 + 'author': { 1299 + 'handle': parent_author, 1300 + 'display_name': getattr(parent_post.author, 'display_name', '') if hasattr(parent_post, 'author') else '', 1301 + 'did': getattr(parent_post.author, 'did', '') if hasattr(parent_post, 'author') else '' 1302 + }, 1303 + 'record': { 1304 + 'text': getattr(parent_post.record, 'text', '') if hasattr(parent_post, 'record') else '', 1305 + 'createdAt': getattr(parent_post.record, 'created_at', '') if hasattr(parent_post, 'record') else '' 1306 + } 1307 + } 1308 + parent_posts.append(post_dict) 1309 + 1310 + # Move up to the next parent 1311 + current_node = parent_node 1312 + 1313 + # Return in chronological order (oldest first) 1314 + parent_posts.reverse() 1315 + return parent_posts 515 1316 516 1317 517 1318 def reply_to_notification(client: Client, notification: Any, reply_text: str, lang: str = "en-US", correlation_id: Optional[str] = None) -> Optional[Dict[str, Any]]: