Evaluation

`any_agent.evaluation.LlmJudge`

Source code in src/any_agent/evaluation/llm_judge.py

class LlmJudge:
    """Judge that answers an evaluation question with a single LLM completion call.

    The reply is parsed as JSON and validated against ``output_type``.
    """

    def __init__(
        self,
        model_id: str,
        framework: AgentFramework = AgentFramework.TINYAGENT,
        output_type: type[BaseModel] = EvaluationOutput,
        model_args: dict[str, Any] | None = None,
        system_prompt: str = LLM_JUDGE_SYSTEM_PROMPT,
    ):
        """Configure the judge.

        Args:
            model_id: Model identifier passed as ``model`` to the completion call.
            framework: Stored on the instance; not used by the methods of this class.
            output_type: Pydantic model the LLM reply is validated against.
            model_args: Extra keyword arguments forwarded to the completion call.
                The mapping is copied, so the caller's dict is never modified.
            system_prompt: System prompt; formatted with ``response_schema`` set to
                the JSON schema of ``output_type``.

        """
        self.model_id = model_id
        self.framework = framework
        # Keep a private shallow copy: `response_format` may be injected below, and
        # writing it into the caller's dict would leak into every other user of
        # that dict (e.g. a second judge built from the same arguments).
        self.model_args: dict[str, Any] = {} if model_args is None else dict(model_args)
        self.output_type = output_type
        self.system_prompt = system_prompt.format(
            response_schema=self.output_type.model_json_schema()
        )
        # If LiteLLM detects that the model supports response_format, set it to the output_type automatically
        if supports_response_schema(model=self.model_id):
            self.model_args["response_format"] = self.output_type

    def _create_prompt(self, context: str, question: str, prompt: str) -> str:
        """Fill the ``{context}`` and ``{question}`` placeholders of ``prompt``.

        Args:
            context: Text substituted for ``{context}``.
            question: Text substituted for ``{question}``.
            prompt: Template that must contain both placeholders.

        Returns:
            The formatted prompt.

        Raises:
            ValueError: If either placeholder is missing from ``prompt``.

        """
        if "{context}" not in prompt or "{question}" not in prompt:
            msg = "Prompt must contain the following placeholders: {context} and {question}"
            raise ValueError(msg)
        return prompt.format(
            context=context,
            question=question,
        )

    def run(
        self,
        context: str,
        question: str,
        prompt_template: str = DEFAULT_PROMPT_TEMPLATE,
    ) -> BaseModel:
        """Run the judge synchronously.

        Args:
            context: Any relevant information that may be needed to answer the question
            question: The question to ask the agent
            prompt_template: The prompt to use for the LLM

        Returns:
            The evaluation result

        """
        return run_async_in_sync(self.run_async(context, question, prompt_template))

    async def run_async(
        self,
        context: str,
        question: str,
        prompt_template: str = DEFAULT_PROMPT_TEMPLATE,
    ) -> BaseModel:
        """Run the judge asynchronously.

        Args:
            context: Any relevant information that may be needed to answer the question
            question: The question to ask the agent
            prompt_template: The prompt to use for the LLM

        Returns:
            The evaluation result, validated as an instance of ``output_type``

        Raises:
            ValueError: If ``prompt_template`` lacks the ``{context}`` or
                ``{question}`` placeholder.

        """
        prompt = self._create_prompt(context, question, prompt_template)

        # Make the LLM call
        response = await acompletion(
            model=self.model_id,
            messages=[
                {"role": "system", "content": self.system_prompt},
                {"role": "user", "content": prompt},
            ],
            **self.model_args,
        )

        # The content of the first choice is expected to be a JSON document
        # matching the schema that was embedded in the system prompt.
        return self.output_type.model_validate_json(
            response.choices[0].message["content"]
        )

`run(context, question, prompt_template=DEFAULT_PROMPT_TEMPLATE)`

Run the judge synchronously.

Parameters:

Name	Type	Description	Default
`context`	`str`	Any relevant information that may be needed to answer the question	required
`question`	`str`	The question to ask the agent	required
`prompt_template`	`str`	The prompt to use for the LLM	`DEFAULT_PROMPT_TEMPLATE`

Returns:

Type	Description
`BaseModel`	The evaluation result

Source code in src/any_agent/evaluation/llm_judge.py

def run(
    self,
    context: str,
    question: str,
    prompt_template: str = DEFAULT_PROMPT_TEMPLATE,
) -> BaseModel:
    """Evaluate synchronously by delegating to `run_async`.

    Args:
        context: Any relevant information that may be needed to answer the question
        question: The question the judge has to answer
        prompt_template: Template used to build the prompt sent to the LLM

    Returns:
        The evaluation result produced by `run_async`

    """
    # Build the coroutine first, then hand it to the sync bridge.
    pending_evaluation = self.run_async(context, question, prompt_template)
    return run_async_in_sync(pending_evaluation)

`run_async(context, question, prompt_template=DEFAULT_PROMPT_TEMPLATE)` `async`

Run the judge asynchronously.

Parameters:

Name	Type	Description	Default
`context`	`str`	Any relevant information that may be needed to answer the question	required
`question`	`str`	The question to ask the agent	required
`prompt_template`	`str`	The prompt to use for the LLM	`DEFAULT_PROMPT_TEMPLATE`

Returns:

Type	Description
`BaseModel`	The evaluation result

Source code in src/any_agent/evaluation/llm_judge.py

async def run_async(
    self,
    context: str,
    question: str,
    prompt_template: str = DEFAULT_PROMPT_TEMPLATE,
) -> BaseModel:
    """Ask the LLM for a verdict and validate its reply.

    Args:
        context: Any relevant information that may be needed to answer the question
        question: The question the judge has to answer
        prompt_template: Template used to build the user prompt

    Returns:
        The reply of the first choice, validated against `self.output_type`

    """
    user_prompt = self._create_prompt(context, question, prompt_template)

    # System prompt first, then the filled-in template as the user turn.
    conversation = [
        {"role": "system", "content": self.system_prompt},
        {"role": "user", "content": user_prompt},
    ]
    completion = await acompletion(
        model=self.model_id,
        messages=conversation,
        **self.model_args,
    )

    raw_verdict = completion.choices[0].message["content"]
    return self.output_type.model_validate_json(raw_verdict)

`any_agent.evaluation.AgentJudge`

An agent that evaluates the correctness of another agent's trace.

Source code in src/any_agent/evaluation/agent_judge.py

class AgentJudge:
    """An agent that evaluates the correctness of another agent's trace."""

    def __init__(
        self,
        model_id: str,
        framework: AgentFramework = AgentFramework.TINYAGENT,
        output_type: type[BaseModel] = EvaluationOutput,
        model_args: dict[str, Any] | None = None,
    ):
        """Store the configuration used to build the judging agent.

        Args:
            model_id: Model identifier placed in the agent configuration
            framework: Framework used to create the judging agent
            output_type: Type the agent's final output must be an instance of
            model_args: Model arguments placed in the agent configuration as given

        """
        self.model_id = model_id
        self.framework = framework
        self.output_type = output_type
        self.model_args = model_args

    def run(
        self,
        trace: AgentTrace,
        question: str,
        additional_tools: list[Callable[[], Any]] | None = None,
    ) -> AgentTrace:
        """Run the agent judge.

        Args:
            trace: The agent trace to evaluate
            question: The question to ask the agent
            additional_tools: Additional tools to use for the agent

        Returns:
            The trace of the evaluation run.
            You can access the evaluation result in the `final_output`
            property.

        """
        extra_tools = [] if additional_tools is None else additional_tools
        pending_evaluation = self.run_async(trace, question, extra_tools)
        return run_async_in_sync(pending_evaluation)

    async def run_async(
        self,
        trace: AgentTrace,
        question: str,
        additional_tools: list[Callable[[], Any]] | None = None,
    ) -> AgentTrace:
        """Run the agent judge asynchronously.

        Args:
            trace: The agent trace to evaluate
            question: The question to ask the agent
            additional_tools: Additional tools to use for the agent

        Returns:
            The trace of the evaluation run.
            You can access the evaluation result in the `final_output`
            property.

        Raises:
            ValueError: If the final output is not an instance of the
                configured output type.

        """
        extra_tools = [] if additional_tools is None else additional_tools
        trace_tools = TraceTools(trace)

        # The expected answer schema is embedded in the agent's instructions.
        judge_instructions = AGENT_INSTRUCTIONS.format(
            response_schema=self.output_type.model_json_schema()
        )
        judge_config = AgentConfig(
            model_id=self.model_id,
            instructions=judge_instructions,
            tools=trace_tools.get_all_tools() + extra_tools,
            output_type=self.output_type,
            model_args=self.model_args,
        )

        judge = await AnyAgent.create_async(
            self.framework,
            agent_config=judge_config,
        )
        evaluation_trace = await judge.run_async(question)

        if isinstance(evaluation_trace.final_output, self.output_type):
            return evaluation_trace
        msg = f"Agent output is not an {self.output_type} instance."
        raise ValueError(msg)

`run(trace, question, additional_tools=None)`

Run the agent judge.

Parameters:

Name	Type	Description	Default
`trace`	`AgentTrace`	The agent trace to evaluate	required
`question`	`str`	The question to ask the agent	required
`additional_tools`	`list[Callable[[], Any]] \| None`	Additional tools to use for the agent	`None`

Returns:

Type	Description
`AgentTrace`	The trace of the evaluation run. You can access the evaluation result in the `final_output` property.

Source code in src/any_agent/evaluation/agent_judge.py

def run(
    self,
    trace: AgentTrace,
    question: str,
    additional_tools: list[Callable[[], Any]] | None = None,
) -> AgentTrace:
    """Evaluate a trace synchronously by delegating to `run_async`.

    Args:
        trace: The agent trace to evaluate
        question: The question to ask the agent
        additional_tools: Additional tools to use for the agent;
            `None` is replaced by an empty list

    Returns:
        The trace of the evaluation run.
        You can access the evaluation result in the `final_output`
        property.

    """
    extra_tools = [] if additional_tools is None else additional_tools
    pending_evaluation = self.run_async(trace, question, extra_tools)
    return run_async_in_sync(pending_evaluation)

`run_async(trace, question, additional_tools=None)` `async`

Run the agent judge asynchronously.

Parameters:

Name	Type	Description	Default
`trace`	`AgentTrace`	The agent trace to evaluate	required
`question`	`str`	The question to ask the agent	required
`additional_tools`	`list[Callable[[], Any]] \| None`	Additional tools to use for the agent	`None`

Returns:

Type	Description
`AgentTrace`	The trace of the evaluation run. You can access the evaluation result in the `final_output` property.

Source code in src/any_agent/evaluation/agent_judge.py

async def run_async(
    self,
    trace: AgentTrace,
    question: str,
    additional_tools: list[Callable[[], Any]] | None = None,
) -> AgentTrace:
    """Run the agent judge asynchronously.

    Args:
        trace: The agent trace to evaluate
        question: The question to ask the agent
        additional_tools: Additional tools to use for the agent

    Returns:
        The trace of the evaluation run.
        You can access the evaluation result in the `final_output`
        property.

    Raises:
        ValueError: If the final output is not an instance of
            `self.output_type`.

    """
    extra_tools = [] if additional_tools is None else additional_tools
    trace_tools = TraceTools(trace)

    # The expected answer schema is embedded in the agent's instructions.
    judge_instructions = AGENT_INSTRUCTIONS.format(
        response_schema=self.output_type.model_json_schema()
    )
    judge_config = AgentConfig(
        model_id=self.model_id,
        instructions=judge_instructions,
        tools=trace_tools.get_all_tools() + extra_tools,
        output_type=self.output_type,
        model_args=self.model_args,
    )

    judge = await AnyAgent.create_async(
        self.framework,
        agent_config=judge_config,
    )
    evaluation_trace = await judge.run_async(question)

    if isinstance(evaluation_trace.final_output, self.output_type):
        return evaluation_trace
    msg = f"Agent output is not an {self.output_type} instance."
    raise ValueError(msg)

Evaluation

any_agent.evaluation.LlmJudge

run(context, question, prompt_template=DEFAULT_PROMPT_TEMPLATE)

run_async(context, question, prompt_template=DEFAULT_PROMPT_TEMPLATE) async

any_agent.evaluation.AgentJudge

run(trace, question, additional_tools=None)

run_async(trace, question, additional_tools=None) async

`any_agent.evaluation.LlmJudge`

`run(context, question, prompt_template=DEFAULT_PROMPT_TEMPLATE)`

`run_async(context, question, prompt_template=DEFAULT_PROMPT_TEMPLATE)` `async`

`any_agent.evaluation.AgentJudge`

`run(trace, question, additional_tools=None)`

`run_async(trace, question, additional_tools=None)` `async`