# phi: a digital entity that roams Bluesky (bsky) — eval test configuration for its agent.
1"""Eval test configuration."""
2
3import os
4from collections.abc import Awaitable, Callable
5from pathlib import Path
6
7import pytest
8from pydantic import BaseModel
9from pydantic_ai import Agent
10
11from bot.agent import Response
12from bot.config import Settings
13from bot.memory import NamespaceMemory
14
15
class EvaluationResult(BaseModel):
    """Structured verdict produced by the LLM judge in ``evaluate_response``."""

    # True when the judged response satisfies the given criteria.
    passed: bool
    # The judge's reasoning behind the pass/fail verdict.
    explanation: str
19
20
@pytest.fixture(scope="session")
def settings():
    """Load project configuration once and share it across the eval session."""
    loaded = Settings()
    return loaded
24
25
@pytest.fixture(scope="session")
def phi_agent(settings):
    """Session-scoped test agent without MCP tools to prevent posting.

    Skips the whole session when no Anthropic key is configured. Exports
    configured API keys into the environment (the model providers read them
    from there) without clobbering values already set externally.

    Returns:
        A TestAgent instance exposing ``process_mention`` with the same
        contract as the production agent, but with no posting tools attached.
    """
    if not settings.anthropic_api_key:
        pytest.skip("Requires ANTHROPIC_API_KEY")

    # After the skip above, anthropic_api_key is known to be truthy, so the
    # original re-check of it here was redundant and has been dropped.
    if not os.environ.get("ANTHROPIC_API_KEY"):
        os.environ["ANTHROPIC_API_KEY"] = settings.anthropic_api_key
    if settings.openai_api_key and not os.environ.get("OPENAI_API_KEY"):
        os.environ["OPENAI_API_KEY"] = settings.openai_api_key

    personality = Path(settings.personality_file).read_text()

    class TestAgent:
        """Minimal stand-in for the production agent: same ``process_mention``
        interface, no MCP tools, so nothing can actually be posted."""

        def __init__(self):
            # Memory is optional: enabled only when both keys are present
            # (presumably OpenAI is needed for embeddings — confirm against
            # NamespaceMemory).
            self.memory = None
            if settings.turbopuffer_api_key and settings.openai_api_key:
                self.memory = NamespaceMemory(api_key=settings.turbopuffer_api_key)

            self.agent = Agent[dict, Response](
                name="phi",
                model="anthropic:claude-3-5-haiku-latest",
                system_prompt=personality,
                output_type=Response,
                deps_type=dict,
            )

        async def process_mention(
            self,
            mention_text: str,
            author_handle: str,
            thread_context: str,
            thread_uri: str | None = None,
        ) -> Response:
            """Assemble the prompt (thread context + memory + new message) and run the agent."""
            memory_context = ""
            if self.memory:
                try:
                    memory_context = await self.memory.build_conversation_context(
                        author_handle, include_core=True, query=mention_text
                    )
                except Exception:
                    # Best-effort: memory enriches the prompt but must never
                    # fail an eval run.
                    pass

            parts = []
            # Sentinel string indicating an empty thread; skip it so the
            # prompt only carries real context.
            if thread_context != "No previous messages in this thread.":
                parts.append(thread_context)
            if memory_context:
                parts.append(memory_context)
            parts.append(f"\nNew message from @{author_handle}: {mention_text}")

            result = await self.agent.run("\n\n".join(parts), deps={"thread_uri": thread_uri})
            return result.output

    return TestAgent()
72
73
@pytest.fixture
def evaluate_response() -> Callable[[str, str], Awaitable[None]]:
    """LLM-as-judge evaluator.

    Returns an async callable ``(criteria, response)`` that raises
    AssertionError (with the judge's explanation) when the response does not
    meet the criteria.
    """

    async def _evaluate(criteria: str, response: str) -> None:
        judge = Agent(
            model="anthropic:claude-opus-4-20250514",
            output_type=EvaluationResult,
            system_prompt=f"Evaluate if this response meets the criteria: {criteria}\n\nResponse: {response}",
        )
        verdict = (await judge.run("Evaluate.")).output
        if not verdict.passed:
            raise AssertionError(f"{verdict.explanation}\n\nResponse: {response}")

    return _evaluate
88 return _evaluate