Python API
Note: this API is in preview and is subject to change.
- Install and import
- Model class
- Config class
- GeneratorParams class
- Generator class
- Tokenizer class
- TokenizerStream class
- NamedTensors class
- Tensor class
- Adapters class
- MultiModalProcessor class
- Images class
- Audios class
- Utility functions
Install and import
The Python API is delivered by the onnxruntime-genai Python package.
pip install onnxruntime-genai
import onnxruntime_genai
Model class
Load a model
onnxruntime_genai.Model(config_path: str) -> Model
onnxruntime_genai.Model(config: onnxruntime_genai.Config) -> Model
Properties
-
type
: Returns the model type as a string.
model = onnxruntime_genai.Model("config.json")
print(model.type)
-
device_type
: Returns the device type as a string.
print(model.device_type)
Methods
-
create_multimodal_processor() -> MultiModalProcessor
processor = model.create_multimodal_processor()
Config class
onnxruntime_genai.Config(config_path: str) -> Config
Methods
-
append_provider(provider: str)
config = onnxruntime_genai.Config("config.json")
config.append_provider("CUDAExecutionProvider")
-
set_provider_option(option: str, value: str)
config.set_provider_option("device_id", "0")
-
clear_providers()
config.clear_providers()
GeneratorParams class
onnxruntime_genai.GeneratorParams(model: Model) -> GeneratorParams
Methods
-
set_inputs(named_tensors: NamedTensors)
params = onnxruntime_genai.GeneratorParams(model)
named_tensors = onnxruntime_genai.NamedTensors()
params.set_inputs(named_tensors)
-
set_model_input(name: str, value: numpy.ndarray)
import numpy as np
params.set_model_input("input_ids", np.array([1, 2, 3], dtype=np.int32))
-
try_graph_capture_with_max_batch_size(max_batch_size: int)
params.try_graph_capture_with_max_batch_size(8)
-
set_search_options(**options)
params.set_search_options(temperature=0.7, top_p=0.9)
-
set_guidance(type: str, data: str)
params.set_guidance("regex", "answer: [A-Za-z ]+")
Generator class
onnxruntime_genai.Generator(model: Model, params: GeneratorParams) -> Generator
Methods
-
is_done() -> bool
generator = onnxruntime_genai.Generator(model, params)
done = generator.is_done()
-
get_output(name: str) -> numpy.ndarray
output = generator.get_output("output_ids")
-
append_tokens(tokens: numpy.ndarray[int32])
generator.append_tokens(np.array([4, 5], dtype=np.int32))
-
append_tokens(tokens: onnxruntime_genai.Tensor)
tensor = onnxruntime_genai.Tensor(np.array([4, 5], dtype=np.int32))
generator.append_tokens(tensor)
-
get_logits() -> numpy.ndarray[float32]
logits = generator.get_logits()
-
set_logits(new_logits: numpy.ndarray[float32])
generator.set_logits(np.zeros_like(logits))
-
generate_next_token()
generator.generate_next_token()
-
rewind_to(new_length: int)
generator.rewind_to(2)
-
get_next_tokens() -> numpy.ndarray[int32]
next_tokens = generator.get_next_tokens()
-
get_sequence(index: int) -> numpy.ndarray[int32]
sequence = generator.get_sequence(0)
-
set_active_adapter(adapters: onnxruntime_genai.Adapters, adapter_name: str)
adapters = onnxruntime_genai.Adapters(model)
generator.set_active_adapter(adapters, "adapter_name")
Tokenizer class
onnxruntime_genai.Tokenizer(model: Model) -> Tokenizer
Methods
-
encode(text: str) -> numpy.ndarray[int32]
tokenizer = onnxruntime_genai.Tokenizer(model)
tokens = tokenizer.encode("Hello world")
-
to_token_id(text: str) -> int
token_id = tokenizer.to_token_id("Hello")
-
decode(tokens: numpy.ndarray[int32]) -> str
text = tokenizer.decode(tokens)
-
apply_chat_template(template_str: str, messages: str, tools: str = None, add_generation_prompt: bool = False) -> str
chat = tokenizer.apply_chat_template("{user}: {message}", messages='[{"role": "user", "content": "Hi!"}]', add_generation_prompt=True)
-
encode_batch(texts: list[str]) -> onnxruntime_genai.Tensor
batch_tensor = tokenizer.encode_batch(["Hello", "World"])
-
decode_batch(tokens: onnxruntime_genai.Tensor) -> list[str]
texts = tokenizer.decode_batch(batch_tensor)
-
create_stream() -> TokenizerStream
stream = tokenizer.create_stream()
TokenizerStream class
onnxruntime_genai.TokenizerStream(tokenizer: Tokenizer) -> TokenizerStream
Methods
-
decode(token: int32) -> str
token_str = stream.decode(123)
NamedTensors class
onnxruntime_genai.NamedTensors() -> NamedTensors
Methods
-
__getitem__(name: str) -> onnxruntime_genai.Tensor
tensor = named_tensors["input_ids"]
-
__setitem__(name: str, value: numpy.ndarray or onnxruntime_genai.Tensor)
named_tensors["input_ids"] = np.array([1, 2, 3], dtype=np.int32)
-
__contains__(name: str) -> bool
exists = "input_ids" in named_tensors
-
__delitem__(name: str)
del named_tensors["input_ids"]
-
__len__() -> int
length = len(named_tensors)
-
keys() -> list[str]
keys = named_tensors.keys()
Tensor class
onnxruntime_genai.Tensor(array: numpy.ndarray) -> Tensor
Methods
-
shape() -> list[int]
tensor = onnxruntime_genai.Tensor(np.array([1, 2, 3]))
print(tensor.shape())
-
type() -> int
print(tensor.type())
-
data() -> memoryview
data = tensor.data()
-
as_numpy() -> numpy.ndarray
arr = tensor.as_numpy()
Adapters class
onnxruntime_genai.Adapters(model: Model) -> Adapters
Methods
-
unload(adapter_name: str)
adapters.unload("adapter_name")
-
load(file: str, name: str)
adapters.load("adapter_file.bin", "adapter_name")
MultiModalProcessor class
onnxruntime_genai.MultiModalProcessor(model: Model) -> MultiModalProcessor
Methods
-
__call__(prompt: str = None, images: Images = None, audios: Audios = None) -> onnxruntime_genai.Tensor
result = processor(prompt="Describe this image", images=onnxruntime_genai.Images.open("image.png"))
-
create_stream() -> TokenizerStream
stream = processor.create_stream()
-
decode(tokens: numpy.ndarray[int32]) -> str
text = processor.decode(tokens)
Images class
onnxruntime_genai.Images.open(*image_paths: str) -> Images
onnxruntime_genai.Images.open_bytes(*image_datas: bytes) -> Images
images = onnxruntime_genai.Images.open("image1.png", "image2.jpg")
with open("image1.png", "rb") as f:
images_bytes = onnxruntime_genai.Images.open_bytes(f.read())
Audios class
onnxruntime_genai.Audios.open(*audio_paths: str) -> Audios
onnxruntime_genai.Audios.open_bytes(*audio_datas: bytes) -> Audios
audios = onnxruntime_genai.Audios.open("audio1.wav")
with open("audio1.wav", "rb") as f:
audios_bytes = onnxruntime_genai.Audios.open_bytes(f.read())
Utility functions
-
onnxruntime_genai.set_log_options(**options)
onnxruntime_genai.set_log_options(verbose=True)
-
onnxruntime_genai.is_cuda_available() -> bool
print(onnxruntime_genai.is_cuda_available())
-
onnxruntime_genai.is_dml_available() -> bool
print(onnxruntime_genai.is_dml_available())
-
onnxruntime_genai.is_rocm_available() -> bool
print(onnxruntime_genai.is_rocm_available())
-
onnxruntime_genai.is_webgpu_available() -> bool
print(onnxruntime_genai.is_webgpu_available())
-
onnxruntime_genai.is_qnn_available() -> bool
print(onnxruntime_genai.is_qnn_available())
-
onnxruntime_genai.is_openvino_available() -> bool
print(onnxruntime_genai.is_openvino_available())
-
onnxruntime_genai.set_current_gpu_device_id(device_id: int)
onnxruntime_genai.set_current_gpu_device_id(0)
-
onnxruntime_genai.get_current_gpu_device_id() -> int
print(onnxruntime_genai.get_current_gpu_device_id())