Evaluation

any_agent.evaluation.LlmJudge

Source code in src/any_agent/evaluation/llm_judge.py
class LlmJudge:
    def __init__(
        self,
        model_id: str,
        framework: AgentFramework = AgentFramework.TINYAGENT,
        output_type: type[BaseModel] = EvaluationOutput,
        model_args: dict[str, Any] | None = None,
        system_prompt: str = LLM_JUDGE_SYSTEM_PROMPT,
    ):
        if model_args is None:
            model_args = {}
        self.model_id = model_id
        self.framework = framework
        self.model_args = model_args
        self.output_type = output_type
        self.system_prompt = system_prompt.format(
            response_schema=self.output_type.model_json_schema()
        )
        # If LiteLLM detects that the model supports response_format, set it to the output_type automatically
        if supports_response_schema(model=self.model_id):
            self.model_args["response_format"] = self.output_type

    def _create_prompt(self, context: str, question: str, prompt: str) -> str:
        if "{context}" not in prompt or "{question}" not in prompt:
            msg = "Prompt must contain the following placeholders: {context} and {question}"
            raise ValueError(msg)
        return prompt.format(
            context=context,
            question=question,
        )

    def run(
        self,
        context: str,
        question: str,
        prompt_template: str = DEFAULT_PROMPT_TEMPLATE,
    ) -> BaseModel:
        """Run the judge synchronously.

        Args:
            context: Any relevant information that may be needed to answer the question
            question: The question to ask the agent
            prompt_template: The prompt to use for the LLM

        Returns:
            The evaluation result

        """
        return run_async_in_sync(self.run_async(context, question, prompt_template))

    async def run_async(
        self,
        context: str,
        question: str,
        prompt_template: str = DEFAULT_PROMPT_TEMPLATE,
    ) -> BaseModel:
        """Run the LLM asynchronously.

        Args:
            context: Any relevant information that may be needed to answer the question
            question: The question to ask the agent
            prompt_template: The prompt to use for the LLM

        Returns:
            The evaluation result

        """
        prompt = self._create_prompt(context, question, prompt_template)

        # Make the LLM call
        response = await acompletion(
            model=self.model_id,
            messages=[
                {"role": "system", "content": self.system_prompt},
                {"role": "user", "content": prompt},
            ],
            **self.model_args,
        )

        return self.output_type.model_validate_json(
            response.choices[0].message["content"]
        )
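
Example usage (an illustrative sketch, not generated from the source): construct an LlmJudge and call run synchronously. The model id is a placeholder; any LiteLLM-compatible model id should work, and the default output_type is EvaluationOutput.

from any_agent.evaluation import LlmJudge

# Placeholder model id; use any LiteLLM-compatible model id.
judge = LlmJudge(model_id="openai/gpt-4.1-mini")

result = judge.run(
    context="The capital of France is Paris.",
    question="Does the context correctly state the capital of France?",
)
print(result)  # EvaluationOutput instance (the default output_type)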

run(context, question, prompt_template=DEFAULT_PROMPT_TEMPLATE)

Run the judge synchronously.

Parameters:

    context (str, required): Any relevant information that may be needed to answer the question
    question (str, required): The question to ask the agent
    prompt_template (str, default: DEFAULT_PROMPT_TEMPLATE): The prompt to use for the LLM

Returns:

    BaseModel: The evaluation result

Source code in src/any_agent/evaluation/llm_judge.py
def run(
    self,
    context: str,
    question: str,
    prompt_template: str = DEFAULT_PROMPT_TEMPLATE,
) -> BaseModel:
    """Run the judge synchronously.

    Args:
        context: Any relevant information that may be needed to answer the question
        question: The question to ask the agent
        prompt_template: The prompt to use for the LLM

    Returns:
        The evaluation result

    """
    return run_async_in_sync(self.run_async(context, question, prompt_template))
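
Example usage (illustrative sketch): passing a custom prompt_template. The template must contain both {context} and {question} placeholders, otherwise _create_prompt raises a ValueError; the model id is a placeholder.

from any_agent.evaluation import LlmJudge

judge = LlmJudge(model_id="openai/gpt-4.1-mini")  # placeholder model id

# The template must keep both placeholders or _create_prompt raises ValueError.
custom_template = (
    "Context:\n{context}\n\n"
    "Using only the context above, answer the question:\n{question}"
)

result = judge.run(
    context="All 42 unit tests passed in CI.",
    question="Did the full test suite pass?",
    prompt_template=custom_template,
)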

run_async(context, question, prompt_template=DEFAULT_PROMPT_TEMPLATE) async

Run the LLM asynchronously.

Parameters:

    context (str, required): Any relevant information that may be needed to answer the question
    question (str, required): The question to ask the agent
    prompt_template (str, default: DEFAULT_PROMPT_TEMPLATE): The prompt to use for the LLM

Returns:

    BaseModel: The evaluation result

Source code in src/any_agent/evaluation/llm_judge.py
async def run_async(
    self,
    context: str,
    question: str,
    prompt_template: str = DEFAULT_PROMPT_TEMPLATE,
) -> BaseModel:
    """Run the LLM asynchronously.

    Args:
        context: Any relevant information that may be needed to answer the question
        question: The question to ask the agent
        prompt_template: The prompt to use for the LLM

    Returns:
        The evaluation result

    """
    prompt = self._create_prompt(context, question, prompt_template)

    # Make the LLM call
    response = await acompletion(
        model=self.model_id,
        messages=[
            {"role": "system", "content": self.system_prompt},
            {"role": "user", "content": prompt},
        ],
        **self.model_args,
    )

    return self.output_type.model_validate_json(
        response.choices[0].message["content"]
    )
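
Example usage (illustrative sketch): calling run_async with a custom output_type. PassFail is a hypothetical schema defined only for this example; the judge's system prompt is formatted with its JSON schema and the LLM response is validated against it.

import asyncio

from pydantic import BaseModel

from any_agent.evaluation import LlmJudge


class PassFail(BaseModel):
    """Hypothetical custom schema; any pydantic model can be used as output_type."""

    passed: bool
    reasoning: str


async def main() -> None:
    # Placeholder model id.
    judge = LlmJudge(model_id="openai/gpt-4.1-mini", output_type=PassFail)
    result = await judge.run_async(
        context="The deployment finished without errors.",
        question="Did the deployment succeed?",
    )
    print(result.passed, result.reasoning)


asyncio.run(main())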

any_agent.evaluation.AgentJudge

An agent that evaluates the correctness of another agent's trace.

Source code in src/any_agent/evaluation/agent_judge.py
class AgentJudge:
    """An agent that evaluates the correctness of another agent's trace."""

    def __init__(
        self,
        model_id: str,
        framework: AgentFramework = AgentFramework.TINYAGENT,
        output_type: type[BaseModel] = EvaluationOutput,
        model_args: dict[str, Any] | None = None,
    ):
        self.model_id = model_id
        self.framework = framework
        self.model_args = model_args
        self.output_type = output_type

    def run(
        self,
        trace: AgentTrace,
        question: str,
        additional_tools: list[Callable[[], Any]] | None = None,
    ) -> BaseModel:
        """Run the agent judge.

        Args:
            trace: The agent trace to evaluate
            question: The question to ask the agent
            additional_tools: Additional tools to use for the agent

        Returns:
            The evaluation result

        """
        if additional_tools is None:
            additional_tools = []
        return run_async_in_sync(self.run_async(trace, question, additional_tools))

    async def run_async(
        self,
        trace: AgentTrace,
        question: str,
        additional_tools: list[Callable[[], Any]] | None = None,
    ) -> BaseModel:
        """Run the agent judge asynchronously.

        Args:
            trace: The agent trace to evaluate
            question: The question to ask the agent
            additional_tools: Additional tools to use for the agent
        Returns:
            The evaluation result

        """
        if additional_tools is None:
            additional_tools = []
        tooling = TraceTools(trace)

        agent_config = AgentConfig(
            model_id=self.model_id,
            instructions=AGENT_INSTRUCTIONS.format(
                response_schema=self.output_type.model_json_schema()
            ),
            tools=tooling.get_all_tools() + additional_tools,
            output_type=self.output_type,
            model_args=self.model_args,
        )

        agent = await AnyAgent.create_async(
            self.framework,
            agent_config=agent_config,
        )
        agent_trace = await agent.run_async(question)
        if not isinstance(agent_trace.final_output, self.output_type):
            msg = f"Agent output is not an {self.output_type} instance."
            raise ValueError(msg)
        return agent_trace.final_output
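
Example usage (illustrative sketch): evaluating an agent trace. The model ids are placeholders, and the synchronous AnyAgent.create / run calls are assumed to mirror the create_async / run_async calls shown in the source above.

from any_agent import AgentConfig, AnyAgent
from any_agent.evaluation import AgentJudge

# Produce a trace to evaluate (model ids are placeholders).
agent = AnyAgent.create("tinyagent", AgentConfig(model_id="openai/gpt-4.1-nano"))
trace = agent.run("What is the capital of France?")

judge = AgentJudge(model_id="openai/gpt-4.1-mini")
evaluation = judge.run(
    trace=trace,
    question="Did the agent correctly answer the question about the capital of France?",
)
print(evaluation)  # EvaluationOutput instance (the default output_type)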

run(trace, question, additional_tools=None)

Run the agent judge.

Parameters:

    trace (AgentTrace, required): The agent trace to evaluate
    question (str, required): The question to ask the agent
    additional_tools (list[Callable[[], Any]] | None, default: None): Additional tools to use for the agent

Returns:

    BaseModel: The evaluation result

Source code in src/any_agent/evaluation/agent_judge.py
def run(
    self,
    trace: AgentTrace,
    question: str,
    additional_tools: list[Callable[[], Any]] | None = None,
) -> BaseModel:
    """Run the agent judge.

    Args:
        trace: The agent trace to evaluate
        question: The question to ask the agent
        additional_tools: Additional tools to use for the agent

    Returns:
        The evaluation result

    """
    if additional_tools is None:
        additional_tools = []
    return run_async_in_sync(self.run_async(trace, question, additional_tools))
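
Example usage (illustrative sketch, continuing the previous one): supplying additional_tools. Each entry is a zero-argument callable the judge agent can call alongside the built-in trace tools; get_expected_answer is a hypothetical helper, and judge and trace are the objects built in the previous sketch.

# Hypothetical extra tool exposed to the judge agent; it takes no arguments,
# matching the list[Callable[[], Any]] type of additional_tools.
def get_expected_answer() -> str:
    """Return the answer the evaluated agent was expected to give."""
    return "Paris"


evaluation = judge.run(
    trace=trace,  # AgentTrace and judge from the previous sketch
    question="Does the final answer in the trace match the expected answer?",
    additional_tools=[get_expected_answer],
)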

run_async(trace, question, additional_tools=None) async

Run the agent judge asynchronously.

Parameters:

    trace (AgentTrace, required): The agent trace to evaluate
    question (str, required): The question to ask the agent
    additional_tools (list[Callable[[], Any]] | None, default: None): Additional tools to use for the agent

Returns:

    BaseModel: The evaluation result

Source code in src/any_agent/evaluation/agent_judge.py
async def run_async(
    self,
    trace: AgentTrace,
    question: str,
    additional_tools: list[Callable[[], Any]] | None = None,
) -> BaseModel:
    """Run the agent judge asynchronously.

    Args:
        trace: The agent trace to evaluate
        question: The question to ask the agent
        additional_tools: Additional tools to use for the agent
    Returns:
        The evaluation result

    """
    if additional_tools is None:
        additional_tools = []
    tooling = TraceTools(trace)

    agent_config = AgentConfig(
        model_id=self.model_id,
        instructions=AGENT_INSTRUCTIONS.format(
            response_schema=self.output_type.model_json_schema()
        ),
        tools=tooling.get_all_tools() + additional_tools,
        output_type=self.output_type,
        model_args=self.model_args,
    )

    agent = await AnyAgent.create_async(
        self.framework,
        agent_config=agent_config,
    )
    agent_trace = await agent.run_async(question)
    if not isinstance(agent_trace.final_output, self.output_type):
        msg = f"Agent output is not an {self.output_type} instance."
        raise ValueError(msg)
    return agent_trace.final_output
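
Example usage (illustrative sketch): running the judge asynchronously with a custom output_type. TraceVerdict is a hypothetical schema, the model ids are placeholders, and passing the framework as a string is assumed to be accepted by AnyAgent.create_async.

import asyncio

from pydantic import BaseModel

from any_agent import AgentConfig, AnyAgent
from any_agent.evaluation import AgentJudge


class TraceVerdict(BaseModel):
    """Hypothetical schema the judge agent is instructed to answer with."""

    correct: bool
    explanation: str


async def main() -> None:
    # Produce a trace to evaluate (placeholder model ids).
    agent = await AnyAgent.create_async(
        "tinyagent", agent_config=AgentConfig(model_id="openai/gpt-4.1-nano")
    )
    trace = await agent.run_async("What is the capital of France?")

    judge = AgentJudge(model_id="openai/gpt-4.1-mini", output_type=TraceVerdict)
    verdict = await judge.run_async(
        trace=trace,
        question="Did the agent give the correct capital of France?",
    )
    print(verdict.correct, verdict.explanation)


asyncio.run(main())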