Skip to content

Completion

The completion and acompletion functions are the primary way to generate chat completions across all supported providers. They accept an OpenAI-compatible parameter set and return OpenAI-compatible response types.

def completion(
    model: str,
    messages: list[dict[str, Any] | ChatCompletionMessage],
    *,
    provider: str | LLMProvider | None = None,
    tools: list[dict[str, Any] | Callable[..., Any]] | None = None,
    tool_choice: str | dict[str, Any] | None = None,
    temperature: float | None = None,
    top_p: float | None = None,
    max_tokens: int | None = None,
    response_format: dict[str, Any] | type | None = None,
    stream: bool | None = None,
    n: int | None = None,
    stop: str | list[str] | None = None,
    presence_penalty: float | None = None,
    frequency_penalty: float | None = None,
    seed: int | None = None,
    api_key: str | None = None,
    api_base: str | None = None,
    user: str | None = None,
    session_label: str | None = None,
    parallel_tool_calls: bool | None = None,
    logprobs: bool | None = None,
    top_logprobs: int | None = None,
    logit_bias: dict[str, float] | None = None,
    stream_options: dict[str, Any] | None = None,
    max_completion_tokens: int | None = None,
    reasoning_effort: Literal['none', 'minimal', 'low', 'medium', 'high', 'xhigh', 'auto'] | None = "auto",
    client_args: dict[str, Any] | None = None,
    **kwargs: Any,
) -> ChatCompletion | Iterator[ChatCompletionChunk]

Async variant with the same parameters. Returns ChatCompletion | AsyncIterator[ChatCompletionChunk].

async def acompletion(
    model: str,
    messages: list[dict[str, Any] | ChatCompletionMessage],
    *,
    provider: str | LLMProvider | None = None,
    tools: list[dict[str, Any] | Callable[..., Any]] | None = None,
    tool_choice: str | dict[str, Any] | None = None,
    temperature: float | None = None,
    top_p: float | None = None,
    max_tokens: int | None = None,
    response_format: dict[str, Any] | type | None = None,
    stream: bool | None = None,
    n: int | None = None,
    stop: str | list[str] | None = None,
    presence_penalty: float | None = None,
    frequency_penalty: float | None = None,
    seed: int | None = None,
    api_key: str | None = None,
    api_base: str | None = None,
    user: str | None = None,
    session_label: str | None = None,
    parallel_tool_calls: bool | None = None,
    logprobs: bool | None = None,
    top_logprobs: int | None = None,
    logit_bias: dict[str, float] | None = None,
    stream_options: dict[str, Any] | None = None,
    max_completion_tokens: int | None = None,
    reasoning_effort: Literal['none', 'minimal', 'low', 'medium', 'high', 'xhigh', 'auto'] | None = "auto",
    client_args: dict[str, Any] | None = None,
    **kwargs: Any,
) -> ChatCompletion | AsyncIterator[ChatCompletionChunk]
| Parameter | Type | Default | Description |
| --- | --- | --- | --- |
| model | str | required | Model identifier. Recommended: use with the separate provider parameter (e.g., model='gpt-4', provider='openai'). Alternative: combined format 'provider:model' (e.g., 'openai:gpt-4'). The legacy format 'provider/model' is also supported but deprecated. |
| messages | list[dict[str, Any] \| ChatCompletionMessage] | required | List of messages for the conversation |
| provider | str \| LLMProvider \| None | None | Recommended: provider name to use for the request (e.g., 'openai', 'mistral'). When provided, the model parameter should contain only the model name. |
| tools | list[dict[str, Any] \| Callable[..., Any]] \| None | None | List of tools for tool calling. Can be Python callables or OpenAI tool format dicts |
| tool_choice | str \| dict[str, Any] \| None | None | Controls which tools the model can call |
| temperature | float \| None | None | Controls randomness in the response (0.0 to 2.0) |
| top_p | float \| None | None | Controls diversity via nucleus sampling (0.0 to 1.0) |
| max_tokens | int \| None | None | Maximum number of tokens to generate |
| response_format | dict[str, Any] \| type \| None | None | Format specification for the response |
| stream | bool \| None | None | Whether to stream the response |
| n | int \| None | None | Number of completions to generate |
| stop | str \| list[str] \| None | None | Stop sequences for generation |
| presence_penalty | float \| None | None | Penalize new tokens based on presence in text |
| frequency_penalty | float \| None | None | Penalize new tokens based on frequency in text |
| seed | int \| None | None | Random seed for reproducible results |
| api_key | str \| None | None | API key for the provider |
| api_base | str \| None | None | Base URL for the provider API |
| user | str \| None | None | Unique identifier for the end user |
| session_label | str \| None | None | Optional user session label metadata for platform traces; exported as anyllm.user_session_label |
| parallel_tool_calls | bool \| None | None | Whether to allow parallel tool calls |
| logprobs | bool \| None | None | Include token-level log probabilities in the response |
| top_logprobs | int \| None | None | Number of alternatives to return when logprobs are requested |
| logit_bias | dict[str, float] \| None | None | Bias the likelihood of specified tokens during generation |
| stream_options | dict[str, Any] \| None | None | Additional options controlling streaming behavior |
| max_completion_tokens | int \| None | None | Maximum number of tokens for the completion |
| reasoning_effort | Literal['none', 'minimal', 'low', 'medium', 'high', 'xhigh', 'auto'] \| None | "auto" | Reasoning effort level for models that support it. "auto" will map to each provider's default. |
| client_args | dict[str, Any] \| None | None | Additional provider-specific arguments that will be passed to the provider's client instantiation. |
| **kwargs | Any | required | Additional provider-specific arguments that will be passed to the provider's API call. |
  • Non-streaming (stream=None or stream=False): Returns a ChatCompletion object.
  • Streaming (stream=True): Returns an Iterator[ChatCompletionChunk] (sync) or AsyncIterator[ChatCompletionChunk] (async).
  • Structured output (when response_format is a Pydantic model or dataclass): Returns a ParsedChatCompletion[T] with a .choices[0].message.parsed field containing the deserialized object.
from any_llm import completion

response = completion(
    model="mistral-small-latest",
    provider="mistral",
    messages=[{"role": "user", "content": "What is the capital of France?"}],
)
print(response.choices[0].message.content)
for chunk in completion(
    model="gpt-4.1-mini",
    provider="openai",
    messages=[{"role": "user", "content": "Tell me a story."}],
    stream=True,
):
    print(chunk.choices[0].delta.content or "", end="")
import asyncio

from any_llm import acompletion


async def main():
    response = await acompletion(
        model="claude-sonnet-4-20250514",
        provider="anthropic",
        messages=[{"role": "user", "content": "Hello!"}],
    )
    print(response.choices[0].message.content)


asyncio.run(main())
from pydantic import BaseModel

from any_llm import completion


class CityInfo(BaseModel):
    name: str
    country: str
    population: int


response = completion(
    model="gpt-4.1-mini",
    provider="openai",
    messages=[{"role": "user", "content": "Tell me about Paris."}],
    response_format=CityInfo,
)
city = response.choices[0].message.parsed
print(f"{city.name}, {city.country} - pop. {city.population}")
from any_llm import completion


def get_weather(location: str, unit: str = "F") -> str:
    """Get weather information for a location.

    Args:
        location: The city or location to get weather for
        unit: Temperature unit, either 'C' or 'F'

    Returns:
        Current weather description
    """
    return f"Weather in {location} is sunny and 75{unit}!"


response = completion(
    model="mistral-small-latest",
    provider="mistral",
    messages=[{"role": "user", "content": "What's the weather in Paris?"}],
    tools=[get_weather],
)