A digital entity named phi that roams Bluesky.
at mcp-refactor · 88 lines · 3.0 kB · view raw
"""Eval test configuration.

Pytest fixtures for the evaluation suite: a sandboxed phi agent (built
without MCP tools, so it can never actually post) and an LLM-as-judge
response evaluator.
"""

import os
from collections.abc import Awaitable, Callable
from pathlib import Path

import pytest
from pydantic import BaseModel
from pydantic_ai import Agent

from bot.agent import Response
from bot.config import Settings
from bot.memory import NamespaceMemory


class EvaluationResult(BaseModel):
    # passed: whether the response satisfied the evaluation criteria
    # explanation: the judge's reasoning, surfaced in the failure message
    passed: bool
    explanation: str


@pytest.fixture(scope="session")
def settings() -> Settings:
    """Load application settings once per test session."""
    return Settings()


@pytest.fixture(scope="session")
def phi_agent(settings):
    """Test agent without MCP tools to prevent posting.

    Skips the session when no Anthropic key is configured. Exports API keys
    into the environment so the model clients used by pydantic_ai can find
    them.
    """
    if not settings.anthropic_api_key:
        pytest.skip("Requires ANTHROPIC_API_KEY")

    # The skip above guarantees the Anthropic key is truthy, so no need to
    # re-check it here; only avoid clobbering a key already in the env.
    if not os.environ.get("ANTHROPIC_API_KEY"):
        os.environ["ANTHROPIC_API_KEY"] = settings.anthropic_api_key
    if settings.openai_api_key and not os.environ.get("OPENAI_API_KEY"):
        os.environ["OPENAI_API_KEY"] = settings.openai_api_key

    personality = Path(settings.personality_file).read_text()

    class TestAgent:
        """Stand-in for the production agent: same process_mention contract,
        but constructed without MCP tools so nothing reaches Bluesky."""

        def __init__(self) -> None:
            # Memory is optional — enabled only when both backing services
            # (Turbopuffer + OpenAI embeddings) are configured.
            self.memory: NamespaceMemory | None = None
            if settings.turbopuffer_api_key and settings.openai_api_key:
                self.memory = NamespaceMemory(api_key=settings.turbopuffer_api_key)

            self.agent = Agent[dict, Response](
                name="phi",
                model="anthropic:claude-3-5-haiku-latest",
                system_prompt=personality,
                output_type=Response,
                deps_type=dict,
            )

        async def process_mention(
            self,
            mention_text: str,
            author_handle: str,
            thread_context: str,
            thread_uri: str | None = None,
        ) -> Response:
            """Assemble the prompt (thread context, memory context, new
            message) and run the agent, returning its structured Response."""
            memory_context = ""
            if self.memory:
                try:
                    memory_context = await self.memory.build_conversation_context(
                        author_handle, include_core=True, query=mention_text
                    )
                except Exception:
                    # Best-effort: a memory-backend failure must not fail
                    # the eval run; proceed without memory context.
                    pass

            parts: list[str] = []
            if thread_context != "No previous messages in this thread.":
                parts.append(thread_context)
            if memory_context:
                parts.append(memory_context)
            parts.append(f"\nNew message from @{author_handle}: {mention_text}")

            result = await self.agent.run(
                "\n\n".join(parts), deps={"thread_uri": thread_uri}
            )
            return result.output

    return TestAgent()


@pytest.fixture
def evaluate_response() -> Callable[[str, str], Awaitable[None]]:
    """LLM-as-judge evaluator."""

    async def _evaluate(criteria: str, response: str) -> None:
        """Raise AssertionError when *response* does not meet *criteria*."""
        evaluator = Agent(
            model="anthropic:claude-opus-4-20250514",
            output_type=EvaluationResult,
            system_prompt=f"Evaluate if this response meets the criteria: {criteria}\n\nResponse: {response}",
        )
        result = await evaluator.run("Evaluate.")
        if not result.output.passed:
            raise AssertionError(f"{result.output.explanation}\n\nResponse: {response}")

    return _evaluate