# phi: a digital entity that roams Bluesky (bsky) — eval test configuration for its agent.
1"""Eval test configuration."""
2
3import os
4from collections.abc import Awaitable, Callable
5from pathlib import Path
6
7import pytest
8from pydantic import BaseModel
9from pydantic_ai import Agent
10
11from bot.agent import Response
12from bot.config import Settings
13from bot.memory import NamespaceMemory
14
15
class EvaluationResult(BaseModel):
    """Structured verdict produced by the LLM judge in ``evaluate_response``."""

    # True when the judged response satisfies the given criteria.
    passed: bool
    # The judge's reasoning behind the pass/fail verdict.
    explanation: str
19
20
@pytest.fixture(scope="session")
def settings():
    """Load project configuration once and share it across the eval session."""
    loaded = Settings()
    return loaded
24
25
@pytest.fixture(scope="session")
def phi_agent(settings):
    """Session-scoped test agent without MCP tools to prevent posting.

    Skips the whole session when no Anthropic key is configured. Exports
    configured API keys into the environment (the model providers read them
    from there) without clobbering values already set externally.

    Returns:
        A TestAgent instance exposing ``process_mention`` with the same
        contract as the production agent, but with no posting tools attached.
    """
    if not settings.anthropic_api_key:
        pytest.skip("Requires ANTHROPIC_API_KEY")

    # After the skip above, anthropic_api_key is known to be truthy, so the
    # original re-check of it here was redundant and has been dropped.
    if not os.environ.get("ANTHROPIC_API_KEY"):
        os.environ["ANTHROPIC_API_KEY"] = settings.anthropic_api_key
    if settings.openai_api_key and not os.environ.get("OPENAI_API_KEY"):
        os.environ["OPENAI_API_KEY"] = settings.openai_api_key

    personality = Path(settings.personality_file).read_text()

    class TestAgent:
        """Minimal stand-in for the production agent: same ``process_mention``
        interface, no MCP tools, so nothing can actually be posted."""

        def __init__(self):
            # Memory is optional: enabled only when both keys are present
            # (presumably OpenAI is needed for embeddings — confirm against
            # NamespaceMemory).
            self.memory = None
            if settings.turbopuffer_api_key and settings.openai_api_key:
                self.memory = NamespaceMemory(api_key=settings.turbopuffer_api_key)

            self.agent = Agent[dict, Response](
                name="phi",
                model="anthropic:claude-3-5-haiku-latest",
                system_prompt=personality,
                output_type=Response,
                deps_type=dict,
            )

        async def process_mention(
            self,
            mention_text: str,
            author_handle: str,
            thread_context: str,
            thread_uri: str | None = None,
        ) -> Response:
            """Assemble the prompt (thread context + memory + new message) and run the agent."""
            memory_context = ""
            if self.memory:
                try:
                    memory_context = await self.memory.build_conversation_context(
                        author_handle, include_core=True, query=mention_text
                    )
                except Exception:
                    # Best-effort: memory enriches the prompt but must never
                    # fail an eval run.
                    pass

            parts = []
            # Sentinel string indicating an empty thread; skip it so the
            # prompt only carries real context.
            if thread_context != "No previous messages in this thread.":
                parts.append(thread_context)
            if memory_context:
                parts.append(memory_context)
            parts.append(f"\nNew message from @{author_handle}: {mention_text}")

            result = await self.agent.run("\n\n".join(parts), deps={"thread_uri": thread_uri})
            return result.output

    return TestAgent()
72
73
@pytest.fixture
def evaluate_response() -> Callable[[str, str], Awaitable[None]]:
    """LLM-as-judge evaluator.

    Returns an async callable ``(criteria, response)`` that raises
    AssertionError (with the judge's explanation) when the response does not
    meet the criteria.
    """

    async def _evaluate(criteria: str, response: str) -> None:
        judge = Agent(
            model="anthropic:claude-opus-4-20250514",
            output_type=EvaluationResult,
            system_prompt=f"Evaluate if this response meets the criteria: {criteria}\n\nResponse: {response}",
        )
        verdict = (await judge.run("Evaluate.")).output
        if not verdict.passed:
            raise AssertionError(f"{verdict.explanation}\n\nResponse: {response}")

    return _evaluate
88 return _evaluate