Skip to content

Completion

Completion

any_llm.completion(model, messages, *, provider=None, tools=None, tool_choice=None, temperature=None, top_p=None, max_tokens=None, response_format=None, stream=None, n=None, stop=None, presence_penalty=None, frequency_penalty=None, seed=None, api_key=None, api_base=None, api_timeout=None, user=None, parallel_tool_calls=None, logprobs=None, top_logprobs=None, logit_bias=None, stream_options=None, max_completion_tokens=None, reasoning_effort='auto', **kwargs)

Create a chat completion.

Parameters:

Name Type Description Default
model str

Model identifier. Recommended: Use with separate provider parameter (e.g., model='gpt-4', provider='openai'). Alternative: Combined format 'provider:model' (e.g., 'openai:gpt-4'). Legacy format 'provider/model' is also supported but deprecated.

required
provider str | ProviderName | None

Recommended: Provider name to use for the request (e.g., 'openai', 'mistral'). When provided, the model parameter should contain only the model name.

None
messages list[dict[str, Any] | ChatCompletionMessage]

List of messages for the conversation

required
tools list[dict[str, Any] | Callable[..., Any]] | None

List of tools for tool calling. Can be Python callables or OpenAI tool format dicts

None
tool_choice str | dict[str, Any] | None

Controls which tools the model can call

None
temperature float | None

Controls randomness in the response (0.0 to 2.0)

None
top_p float | None

Controls diversity via nucleus sampling (0.0 to 1.0)

None
max_tokens int | None

Maximum number of tokens to generate

None
response_format dict[str, Any] | type[BaseModel] | None

Format specification for the response

None
stream bool | None

Whether to stream the response

None
n int | None

Number of completions to generate

None
stop str | list[str] | None

Stop sequences for generation

None
presence_penalty float | None

Penalize new tokens based on presence in text

None
frequency_penalty float | None

Penalize new tokens based on frequency in text

None
seed int | None

Random seed for reproducible results

None
api_key str | None

API key for the provider

None
api_base str | None

Base URL for the provider API

None
api_timeout float | None

Request timeout in seconds

None
user str | None

Unique identifier for the end user

None
parallel_tool_calls bool | None

Whether to allow parallel tool calls

None
logprobs bool | None

Include token-level log probabilities in the response

None
top_logprobs int | None

Number of alternatives to return when logprobs are requested

None
logit_bias dict[str, float] | None

Bias the likelihood of specified tokens during generation

None
stream_options dict[str, Any] | None

Additional options controlling streaming behavior

None
max_completion_tokens int | None

Maximum number of tokens for the completion

None
reasoning_effort Literal['minimal', 'low', 'medium', 'high', 'auto'] | None

Reasoning effort level for models that support it. "auto" will map to each provider's default.

'auto'
**kwargs Any

Additional provider-specific parameters

{}

Returns:

Type Description
ChatCompletion | Iterator[ChatCompletionChunk]

The completion response from the provider

Source code in src/any_llm/api.py
def completion(
    model: str,
    messages: list[dict[str, Any] | ChatCompletionMessage],
    *,
    provider: str | ProviderName | None = None,
    tools: list[dict[str, Any] | Callable[..., Any]] | None = None,
    tool_choice: str | dict[str, Any] | None = None,
    temperature: float | None = None,
    top_p: float | None = None,
    max_tokens: int | None = None,
    response_format: dict[str, Any] | type[BaseModel] | None = None,
    stream: bool | None = None,
    n: int | None = None,
    stop: str | list[str] | None = None,
    presence_penalty: float | None = None,
    frequency_penalty: float | None = None,
    seed: int | None = None,
    api_key: str | None = None,
    api_base: str | None = None,
    api_timeout: float | None = None,
    user: str | None = None,
    parallel_tool_calls: bool | None = None,
    logprobs: bool | None = None,
    top_logprobs: int | None = None,
    logit_bias: dict[str, float] | None = None,
    stream_options: dict[str, Any] | None = None,
    max_completion_tokens: int | None = None,
    reasoning_effort: Literal["minimal", "low", "medium", "high", "auto"] | None = "auto",
    **kwargs: Any,
) -> ChatCompletion | Iterator[ChatCompletionChunk]:
    """Create a chat completion (synchronous entry point).

    Every named argument is handed to `_process_completion_params`, which
    returns the provider instance together with the prepared completion
    parameters; the request is then delegated to that provider's
    `completion` method. Extra `**kwargs` are forwarded to both calls.

    Args:
        model: Model identifier. **Recommended**: pass only the model name and
            name the provider separately (e.g., model='gpt-4', provider='openai').
            **Alternative**: combined format 'provider:model' (e.g., 'openai:gpt-4').
            The legacy 'provider/model' format is still supported but deprecated.
        messages: List of messages for the conversation
        provider: **Recommended**: Provider name to use for the request
            (e.g., 'openai', 'mistral'). When provided, `model` should contain
            only the model name.
        tools: List of tools for tool calling. Can be Python callables or
            OpenAI tool format dicts
        tool_choice: Controls which tools the model can call
        temperature: Controls randomness in the response (0.0 to 2.0)
        top_p: Controls diversity via nucleus sampling (0.0 to 1.0)
        max_tokens: Maximum number of tokens to generate
        response_format: Format specification for the response
        stream: Whether to stream the response
        n: Number of completions to generate
        stop: Stop sequences for generation
        presence_penalty: Penalize new tokens based on presence in text
        frequency_penalty: Penalize new tokens based on frequency in text
        seed: Random seed for reproducible results
        api_key: API key for the provider
        api_base: Base URL for the provider API
        api_timeout: Request timeout in seconds
        user: Unique identifier for the end user
        parallel_tool_calls: Whether to allow parallel tool calls
        logprobs: Include token-level log probabilities in the response
        top_logprobs: Number of alternatives to return when logprobs are requested
        logit_bias: Bias the likelihood of specified tokens during generation
        stream_options: Additional options controlling streaming behavior
        max_completion_tokens: Maximum number of tokens for the completion
        reasoning_effort: Reasoning effort level for models that support it.
            "auto" will map to each provider's default.
        **kwargs: Additional provider-specific parameters

    Returns:
        The completion response from the provider

    """
    # Gather the explicit arguments once so the hand-off below is a single,
    # easy-to-audit expansion. Insertion order mirrors the signature of the
    # original keyword-by-keyword call.
    request_args: dict[str, Any] = {
        "model": model,
        "provider": provider,
        "messages": messages,
        "tools": tools,
        "tool_choice": tool_choice,
        "temperature": temperature,
        "top_p": top_p,
        "max_tokens": max_tokens,
        "response_format": response_format,
        "stream": stream,
        "n": n,
        "stop": stop,
        "presence_penalty": presence_penalty,
        "frequency_penalty": frequency_penalty,
        "seed": seed,
        "api_key": api_key,
        "api_base": api_base,
        "api_timeout": api_timeout,
        "user": user,
        "parallel_tool_calls": parallel_tool_calls,
        "logprobs": logprobs,
        "top_logprobs": top_logprobs,
        "logit_bias": logit_bias,
        "stream_options": stream_options,
        "max_completion_tokens": max_completion_tokens,
        "reasoning_effort": reasoning_effort,
    }

    backend, prepared_params = _process_completion_params(**request_args, **kwargs)

    # The provider call receives the same extra kwargs a second time.
    return backend.completion(prepared_params, **kwargs)

any_llm.acompletion(model, messages, *, provider=None, tools=None, tool_choice=None, temperature=None, top_p=None, max_tokens=None, response_format=None, stream=None, n=None, stop=None, presence_penalty=None, frequency_penalty=None, seed=None, api_key=None, api_base=None, api_timeout=None, user=None, parallel_tool_calls=None, logprobs=None, top_logprobs=None, logit_bias=None, stream_options=None, max_completion_tokens=None, reasoning_effort='auto', **kwargs) async

Create a chat completion asynchronously.

Parameters:

Name Type Description Default
model str

Model identifier. Recommended: Use with separate provider parameter (e.g., model='gpt-4', provider='openai'). Alternative: Combined format 'provider:model' (e.g., 'openai:gpt-4'). Legacy format 'provider/model' is also supported but deprecated.

required
provider str | ProviderName | None

Recommended: Provider name to use for the request (e.g., 'openai', 'mistral'). When provided, the model parameter should contain only the model name.

None
messages list[dict[str, Any] | ChatCompletionMessage]

List of messages for the conversation

required
tools list[dict[str, Any] | Callable[..., Any]] | None

List of tools for tool calling. Can be Python callables or OpenAI tool format dicts

None
tool_choice str | dict[str, Any] | None

Controls which tools the model can call

None
temperature float | None

Controls randomness in the response (0.0 to 2.0)

None
top_p float | None

Controls diversity via nucleus sampling (0.0 to 1.0)

None
max_tokens int | None

Maximum number of tokens to generate

None
response_format dict[str, Any] | type[BaseModel] | None

Format specification for the response

None
stream bool | None

Whether to stream the response

None
n int | None

Number of completions to generate

None
stop str | list[str] | None

Stop sequences for generation

None
presence_penalty float | None

Penalize new tokens based on presence in text

None
frequency_penalty float | None

Penalize new tokens based on frequency in text

None
seed int | None

Random seed for reproducible results

None
api_key str | None

API key for the provider

None
api_base str | None

Base URL for the provider API

None
api_timeout float | None

Request timeout in seconds

None
user str | None

Unique identifier for the end user

None
parallel_tool_calls bool | None

Whether to allow parallel tool calls

None
logprobs bool | None

Include token-level log probabilities in the response

None
top_logprobs int | None

Number of alternatives to return when logprobs are requested

None
logit_bias dict[str, float] | None

Bias the likelihood of specified tokens during generation

None
stream_options dict[str, Any] | None

Additional options controlling streaming behavior

None
max_completion_tokens int | None

Maximum number of tokens for the completion

None
reasoning_effort Literal['minimal', 'low', 'medium', 'high', 'auto'] | None

Reasoning effort level for models that support it. "auto" will map to each provider's default.

'auto'
**kwargs Any

Additional provider-specific parameters

{}

Returns:

Type Description
ChatCompletion | AsyncIterator[ChatCompletionChunk]

The completion response from the provider

Source code in src/any_llm/api.py
async def acompletion(
    model: str,
    messages: list[dict[str, Any] | ChatCompletionMessage],
    *,
    provider: str | ProviderName | None = None,
    tools: list[dict[str, Any] | Callable[..., Any]] | None = None,
    tool_choice: str | dict[str, Any] | None = None,
    temperature: float | None = None,
    top_p: float | None = None,
    max_tokens: int | None = None,
    response_format: dict[str, Any] | type[BaseModel] | None = None,
    stream: bool | None = None,
    n: int | None = None,
    stop: str | list[str] | None = None,
    presence_penalty: float | None = None,
    frequency_penalty: float | None = None,
    seed: int | None = None,
    api_key: str | None = None,
    api_base: str | None = None,
    api_timeout: float | None = None,
    user: str | None = None,
    parallel_tool_calls: bool | None = None,
    logprobs: bool | None = None,
    top_logprobs: int | None = None,
    logit_bias: dict[str, float] | None = None,
    stream_options: dict[str, Any] | None = None,
    max_completion_tokens: int | None = None,
    reasoning_effort: Literal["minimal", "low", "medium", "high", "auto"] | None = "auto",
    **kwargs: Any,
) -> ChatCompletion | AsyncIterator[ChatCompletionChunk]:
    """Create a chat completion asynchronously.

    Every named argument is handed to `_process_completion_params`, which
    returns the provider instance together with the prepared completion
    parameters; the request is then delegated to (and awaited on) that
    provider's `acompletion` method. Extra `**kwargs` are forwarded to both
    calls.

    Args:
        model: Model identifier. **Recommended**: pass only the model name and
            name the provider separately (e.g., model='gpt-4', provider='openai').
            **Alternative**: combined format 'provider:model' (e.g., 'openai:gpt-4').
            The legacy 'provider/model' format is still supported but deprecated.
        messages: List of messages for the conversation
        provider: **Recommended**: Provider name to use for the request
            (e.g., 'openai', 'mistral'). When provided, `model` should contain
            only the model name.
        tools: List of tools for tool calling. Can be Python callables or
            OpenAI tool format dicts
        tool_choice: Controls which tools the model can call
        temperature: Controls randomness in the response (0.0 to 2.0)
        top_p: Controls diversity via nucleus sampling (0.0 to 1.0)
        max_tokens: Maximum number of tokens to generate
        response_format: Format specification for the response
        stream: Whether to stream the response
        n: Number of completions to generate
        stop: Stop sequences for generation
        presence_penalty: Penalize new tokens based on presence in text
        frequency_penalty: Penalize new tokens based on frequency in text
        seed: Random seed for reproducible results
        api_key: API key for the provider
        api_base: Base URL for the provider API
        api_timeout: Request timeout in seconds
        user: Unique identifier for the end user
        parallel_tool_calls: Whether to allow parallel tool calls
        logprobs: Include token-level log probabilities in the response
        top_logprobs: Number of alternatives to return when logprobs are requested
        logit_bias: Bias the likelihood of specified tokens during generation
        stream_options: Additional options controlling streaming behavior
        max_completion_tokens: Maximum number of tokens for the completion
        reasoning_effort: Reasoning effort level for models that support it.
            "auto" will map to each provider's default.
        **kwargs: Additional provider-specific parameters

    Returns:
        The completion response from the provider

    """
    # Gather the explicit arguments once so the hand-off below is a single,
    # easy-to-audit expansion. Insertion order mirrors the signature of the
    # original keyword-by-keyword call.
    request_args: dict[str, Any] = {
        "model": model,
        "provider": provider,
        "messages": messages,
        "tools": tools,
        "tool_choice": tool_choice,
        "temperature": temperature,
        "top_p": top_p,
        "max_tokens": max_tokens,
        "response_format": response_format,
        "stream": stream,
        "n": n,
        "stop": stop,
        "presence_penalty": presence_penalty,
        "frequency_penalty": frequency_penalty,
        "seed": seed,
        "api_key": api_key,
        "api_base": api_base,
        "api_timeout": api_timeout,
        "user": user,
        "parallel_tool_calls": parallel_tool_calls,
        "logprobs": logprobs,
        "top_logprobs": top_logprobs,
        "logit_bias": logit_bias,
        "stream_options": stream_options,
        "max_completion_tokens": max_completion_tokens,
        "reasoning_effort": reasoning_effort,
    }

    # Parameter processing itself is a plain (non-awaited) call; only the
    # provider request is awaited.
    backend, prepared_params = _process_completion_params(**request_args, **kwargs)

    # The provider call receives the same extra kwargs a second time.
    return await backend.acompletion(prepared_params, **kwargs)