Skip to content

Evaluation

Evaluates agent performance by passing trace text and a question to an LLM.

def __init__(
self,
model_id: str,
framework: AgentFramework = "tinyagent",
output_type: type[BaseModel] = <class 'schemas.EvaluationOutput'>,
model_args: dict[str, Any] | None = None,
system_prompt: str = "You are an expert evaluator that analyzes contextual information to answer specific questions about agent performance and behavior.
You will be provided with:
1. Contextual information of an agent's execution that may be relevant to the evaluation question
2. A specific evaluation question to answer
Your task is to carefully analyze the context and provide a judgment on whether the agent's performance meets the criteria specified in the question.
EVALUATION GUIDELINES:
- Be objective and thorough in your analysis
- If the question asks about specific actions, look for evidence of those actions in the context
- If unsure, err on the side of being more critical rather than lenient
Your output must match the following JSON schema:
{response_schema}",
)
Parameters:

- `self` — required
- `model_id` (`str`) — required
- `framework` (`AgentFramework`) — default: `"tinyagent"`
- `output_type` (`type[BaseModel]`) — default: `schemas.EvaluationOutput`
- `model_args` (`dict[str, Any] | None`) — default: `None`
- `system_prompt` (`str`) — default:

  "You are an expert evaluator that analyzes contextual information to answer specific questions about agent performance and behavior.

  You will be provided with:

  1. Contextual information of an agent's execution that may be relevant to the evaluation question
  2. A specific evaluation question to answer

  Your task is to carefully analyze the context and provide a judgment on whether the agent's performance meets the criteria specified in the question.

  EVALUATION GUIDELINES:

  - Be objective and thorough in your analysis
  - If the question asks about specific actions, look for evidence of those actions in the context
  - If unsure, err on the side of being more critical rather than lenient

  Your output must match the following JSON schema: {response_schema}"

Run the judge synchronously.

def run(
self,
context: str,
question: str,
prompt_template: str = "Please answer the evaluation question given the following contextual information:
CONTEXT:
{context}
EVALUATION QUESTION:
{question}",
) -> BaseModel
Parameters:

- `self` — required
- `context` (`str`) — required. Any relevant information that may be needed to answer the question.
- `question` (`str`) — required. The evaluation question to answer.
- `prompt_template` (`str`) — the prompt to use for the LLM; default:

  "Please answer the evaluation question given the following contextual information:

  CONTEXT: {context}

  EVALUATION QUESTION: {question}"

Run the judge asynchronously.

async def run_async(
self,
context: str,
question: str,
prompt_template: str = "Please answer the evaluation question given the following contextual information:
CONTEXT:
{context}
EVALUATION QUESTION:
{question}",
) -> BaseModel
Parameters:

- `self` — required
- `context` (`str`) — required. Any relevant information that may be needed to answer the question.
- `question` (`str`) — required. The evaluation question to answer.
- `prompt_template` (`str`) — the prompt to use for the LLM; default:

  "Please answer the evaluation question given the following contextual information:

  CONTEXT: {context}

  EVALUATION QUESTION: {question}"


An agent that evaluates the correctness of another agent’s trace.

Agent-based evaluator with built-in tools for trace inspection.

def __init__(
self,
model_id: str,
framework: AgentFramework = "tinyagent",
output_type: type[BaseModel] = <class 'schemas.EvaluationOutput'>,
model_args: dict[str, Any] | None = None,
)
Parameters:

- `self` — required
- `model_id` (`str`) — required
- `framework` (`AgentFramework`) — default: `"tinyagent"`
- `output_type` (`type[BaseModel]`) — default: `schemas.EvaluationOutput`
- `model_args` (`dict[str, Any] | None`) — default: `None`

Run the agent judge.

def run(
self,
trace: AgentTrace,
question: str,
additional_tools: list[Callable[[], Any]] | None = None,
) -> AgentTrace
Parameters:

- `self` — required
- `trace` (`AgentTrace`) — required. The agent trace to evaluate.
- `question` (`str`) — required. The question to ask the agent.
- `additional_tools` (`list[Callable[[], Any]] | None`) — default: `None`. Additional tools to use for the agent.

Run the agent judge asynchronously.

async def run_async(
self,
trace: AgentTrace,
question: str,
additional_tools: list[Callable[[], Any]] | None = None,
) -> AgentTrace
Parameters:

- `self` — required
- `trace` (`AgentTrace`) — required. The agent trace to evaluate.
- `question` (`str`) — required. The question to ask the agent.
- `additional_tools` (`list[Callable[[], Any]] | None`) — default: `None`. Additional tools to use for the agent.