Python API

Note: this API is in preview and is subject to change.

Install and import

The Python API is provided by the onnxruntime-genai Python package.

pip install onnxruntime-genai

import onnxruntime_genai

Model class

Load a model

Loads a model from a folder on disk (the folder containing genai_config.json and the ONNX model files), or from a previously created Config object.

onnxruntime_genai.Model(config_path: str) -> Model
onnxruntime_genai.Model(config: onnxruntime_genai.Config) -> Model

Properties

  • type: Returns the model type as a string.

    model = onnxruntime_genai.Model("path/to/model_folder")
    print(model.type)
    
  • device_type: Returns the device type as a string.

    print(model.device_type)
    

Methods

  • create_multimodal_processor() -> MultiModalProcessor

    processor = model.create_multimodal_processor()
    

Config class

onnxruntime_genai.Config(config_path: str) -> Config

Methods

  • append_provider(provider: str)

    config = onnxruntime_genai.Config("path/to/model_folder")
    config.append_provider("cuda")
    
  • set_provider_option(option: str, value: str)

    config.set_provider_option("device_id", "0")
    
  • clear_providers()

    config.clear_providers()
    
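A minimal sketch tying these calls together, assuming a model folder prepared for onnxruntime-genai and that "cuda" is the provider name expected by your installed package:

    # Illustrative model folder containing genai_config.json and the ONNX model files.
    config = onnxruntime_genai.Config("path/to/model_folder")

    # Start from a clean provider list, then request CUDA.
    config.clear_providers()
    config.append_provider("cuda")

    # Build the model from the configured Config object.
    model = onnxruntime_genai.Model(config)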

GeneratorParams class

onnxruntime_genai.GeneratorParams(model: Model) -> GeneratorParams

Methods

  • set_inputs(named_tensors: NamedTensors)

    params = onnxruntime_genai.GeneratorParams(model)
    named_tensors = onnxruntime_genai.NamedTensors()
    params.set_inputs(named_tensors)
    
  • set_model_input(name: str, value: numpy.ndarray)

    import numpy as np
    params.set_model_input("input_ids", np.array([1, 2, 3], dtype=np.int32))
    
  • try_graph_capture_with_max_batch_size(max_batch_size: int)

    params.try_graph_capture_with_max_batch_size(8)
    
  • set_search_options(**options)

    params.set_search_options(temperature=0.7, top_p=0.9)
    
  • set_guidance(type: str, data: str)

    params.set_guidance("regex", "yes|no")
    
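A short sketch of the typical setup, assuming a model already loaded as model. The search options shown (do_sample, max_length, temperature, top_p) are common ones; the full set of accepted options depends on the model's genai_config.json:

    params = onnxruntime_genai.GeneratorParams(model)

    # Enable sampling and bound the total sequence length.
    params.set_search_options(do_sample=True, max_length=256, temperature=0.7, top_p=0.9)

    # The params object is then passed to the Generator constructor (see the next section).
    generator = onnxruntime_genai.Generator(model, params)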

Generator class

onnxruntime_genai.Generator(model: Model, params: GeneratorParams) -> Generator

Methods

  • is_done() -> bool

    generator = onnxruntime_genai.Generator(model, params)
    done = generator.is_done()
    
  • get_output(name: str) -> numpy.ndarray

    output = generator.get_output("logits")
    
  • append_tokens(tokens: numpy.ndarray[int32])

    generator.append_tokens(np.array([4, 5], dtype=np.int32))
    
  • append_tokens(tokens: onnxruntime_genai.Tensor)

    tensor = onnxruntime_genai.Tensor(np.array([4, 5], dtype=np.int32))
    generator.append_tokens(tensor)
    
  • get_logits() -> numpy.ndarray[float32]

    logits = generator.get_logits()
    
  • set_logits(new_logits: numpy.ndarray[float32])

    generator.set_logits(np.zeros_like(logits))
    
  • generate_next_token()

    generator.generate_next_token()
    
  • rewind_to(new_length: int)

    generator.rewind_to(2)
    
  • get_next_tokens() -> numpy.ndarray[int32]

    next_tokens = generator.get_next_tokens()
    
  • get_sequence(index: int) -> numpy.ndarray[int32]

    sequence = generator.get_sequence(0)
    
  • set_active_adapter(adapters: onnxruntime_genai.Adapters, adapter_name: str)

    adapters = onnxruntime_genai.Adapters(model)
    generator.set_active_adapter(adapters, "adapter_name")
    
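Putting these methods together, a minimal text-generation loop looks like the following sketch; it assumes model has already been loaded and uses the Tokenizer class documented in the next section:

    tokenizer = onnxruntime_genai.Tokenizer(model)

    params = onnxruntime_genai.GeneratorParams(model)
    params.set_search_options(max_length=128)

    generator = onnxruntime_genai.Generator(model, params)

    # Feed the prompt tokens to the generator.
    generator.append_tokens(tokenizer.encode("Hello, my name is"))

    # Generate until an end-of-sequence token or max_length is reached.
    while not generator.is_done():
        generator.generate_next_token()

    # Decode the full sequence for the first (and only) batch entry.
    print(tokenizer.decode(generator.get_sequence(0)))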

Tokenizer class

onnxruntime_genai.Tokenizer(model: Model) -> Tokenizer

Methods

  • encode(text: str) -> numpy.ndarray[int32]

    tokenizer = onnxruntime_genai.Tokenizer(model)
    tokens = tokenizer.encode("Hello world")
    
  • to_token_id(text: str) -> int

    token_id = tokenizer.to_token_id("Hello")
    
  • decode(tokens: numpy.ndarray[int32]) -> str

    text = tokenizer.decode(tokens)
    
  • apply_chat_template(template_str: str, messages: str, tools: str = None, add_generation_prompt: bool = False) -> str

    messages = '[{"role": "user", "content": "Hi!"}]'
    chat = tokenizer.apply_chat_template(messages=messages, add_generation_prompt=True)
    
  • encode_batch(texts: list[str]) -> onnxruntime_genai.Tensor

    batch_tensor = tokenizer.encode_batch(["Hello", "World"])
    
  • decode_batch(tokens: onnxruntime_genai.Tensor) -> list[str]

    texts = tokenizer.decode_batch(batch_tensor)
    
  • create_stream() -> TokenizerStream

    stream = tokenizer.create_stream()
    
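As a sketch of a chat-style flow, assuming model, tokenizer, and a generator created as in the previous sections. The messages argument is a JSON-formatted string of role/content objects, and the exact template behaviour is model-specific:

    import json

    messages = json.dumps([{"role": "user", "content": "Tell me a joke."}])

    # Render the chat template, tokenize it, and feed the tokens to the generator.
    prompt = tokenizer.apply_chat_template(messages=messages, add_generation_prompt=True)
    generator.append_tokens(tokenizer.encode(prompt))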

TokenizerStream class

onnxruntime_genai.TokenizerStream(tokenizer: Tokenizer) -> TokenizerStream

Methods

  • decode(token: int32) -> str

    token_str = stream.decode(123)
    
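A TokenizerStream is typically used to print text as tokens are produced, because some characters only become printable once a subsequent token arrives. A minimal streaming loop, assuming tokenizer and generator set up as in the earlier sections:

    stream = tokenizer.create_stream()

    while not generator.is_done():
        generator.generate_next_token()
        # get_next_tokens() returns one token per batch entry; take the first.
        new_token = generator.get_next_tokens()[0]
        print(stream.decode(new_token), end="", flush=True)
    print()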

NamedTensors class

onnxruntime_genai.NamedTensors() -> NamedTensors

Methods

  • __getitem__(name: str) -> onnxruntime_genai.Tensor

    tensor = named_tensors["input_ids"]
    
  • __setitem__(name: str, value: numpy.ndarray or onnxruntime_genai.Tensor)

    named_tensors["input_ids"] = np.array([1, 2, 3], dtype=np.int32)
    
  • __contains__(name: str) -> bool

    exists = "input_ids" in named_tensors
    
  • __delitem__(name: str)

    del named_tensors["input_ids"]
    
  • __len__() -> int

    length = len(named_tensors)
    
  • keys() -> list[str]

    keys = named_tensors.keys()
    
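NamedTensors behaves like a dictionary of model inputs. It is usually produced by a MultiModalProcessor, but it can also be filled by hand; the sketch below assumes an existing model and uses only the methods documented here, with illustrative input names that depend on the model:

    import numpy as np

    named_tensors = onnxruntime_genai.NamedTensors()

    # Values may be numpy arrays or onnxruntime_genai.Tensor objects.
    named_tensors["input_ids"] = np.array([[1, 2, 3]], dtype=np.int32)

    if "input_ids" in named_tensors:
        print(named_tensors.keys())

    # Hand the collected inputs to GeneratorParams.
    params = onnxruntime_genai.GeneratorParams(model)
    params.set_inputs(named_tensors)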

Tensor class

onnxruntime_genai.Tensor(array: numpy.ndarray) -> Tensor

Methods

  • shape() -> list[int]

    tensor = onnxruntime_genai.Tensor(np.array([1, 2, 3]))
    print(tensor.shape())
    
  • type() -> int

    print(tensor.type())
    
  • data() -> memoryview

    data = tensor.data()
    
  • as_numpy() -> numpy.ndarray

    arr = tensor.as_numpy()
    
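A short round-trip sketch between numpy and Tensor:

    import numpy as np

    arr = np.array([[1, 2, 3]], dtype=np.int32)
    tensor = onnxruntime_genai.Tensor(arr)

    print(tensor.shape())      # [1, 3]
    print(tensor.type())       # element-type code as an integer
    round_trip = tensor.as_numpy()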

Adapters class

onnxruntime_genai.Adapters(model: Model) -> Adapters

Methods

  • unload(adapter_name: str)

    adapters.unload("adapter_name")
    
  • load(file: str, name: str)

    adapters.load("adapter_file.onnx_adapter", "adapter_name")
    
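A sketch of switching adapters at generation time, assuming an existing model and an adapter file exported for that model (the file path and adapter name are illustrative):

    adapters = onnxruntime_genai.Adapters(model)
    adapters.load("adapters/travel.onnx_adapter", "travel")

    params = onnxruntime_genai.GeneratorParams(model)
    generator = onnxruntime_genai.Generator(model, params)

    # Activate the loaded adapter for this generator before generating tokens.
    generator.set_active_adapter(adapters, "travel")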

MultiModalProcessor class

onnxruntime_genai.MultiModalProcessor(model: Model) -> MultiModalProcessor

Methods

  • __call__(prompt: str = None, images: Images = None, audios: Audios = None) -> onnxruntime_genai.NamedTensors

    inputs = processor(prompt="Describe this image", images=onnxruntime_genai.Images.open("image.png"))
    
  • create_stream() -> TokenizerStream

    stream = processor.create_stream()
    
  • decode(tokens: numpy.ndarray[int32]) -> str

    text = processor.decode(tokens)
    

Images class

Methods

  • onnxruntime_genai.Images.open(*image_paths: str) -> Images

    images = onnxruntime_genai.Images.open("image1.png", "image2.jpg")
    
  • onnxruntime_genai.Images.open_bytes(*image_datas: bytes) -> Images

    with open("image1.png", "rb") as f:
        images_bytes = onnxruntime_genai.Images.open_bytes(f.read())

Audios class

Methods

  • onnxruntime_genai.Audios.open(*audio_paths: str) -> Audios

    audios = onnxruntime_genai.Audios.open("audio1.wav")
    
  • onnxruntime_genai.Audios.open_bytes(*audio_datas: bytes) -> Audios

    with open("audio1.wav", "rb") as f:
        audios_bytes = onnxruntime_genai.Audios.open_bytes(f.read())
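
Putting the multimodal classes together, a vision prompt can be processed as in the sketch below. It assumes an existing vision-capable model, and the image placeholder tag in the prompt is model-specific, so treat it as illustrative:

    processor = model.create_multimodal_processor()
    stream = processor.create_stream()

    images = onnxruntime_genai.Images.open("image.png")
    prompt = "<|user|>\n<|image_1|>\nDescribe this image.<|end|>\n<|assistant|>\n"

    # The processor returns the named model inputs for the prompt + image pair.
    inputs = processor(prompt, images=images)

    params = onnxruntime_genai.GeneratorParams(model)
    params.set_inputs(inputs)
    params.set_search_options(max_length=1024)

    generator = onnxruntime_genai.Generator(model, params)
    while not generator.is_done():
        generator.generate_next_token()
        print(stream.decode(generator.get_next_tokens()[0]), end="", flush=True)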

Utility functions

  • onnxruntime_genai.set_log_options(**options)

    onnxruntime_genai.set_log_options(enabled=True)
    
  • onnxruntime_genai.is_cuda_available() -> bool

    print(onnxruntime_genai.is_cuda_available())
    
  • onnxruntime_genai.is_dml_available() -> bool

    print(onnxruntime_genai.is_dml_available())
    
  • onnxruntime_genai.is_rocm_available() -> bool

    print(onnxruntime_genai.is_rocm_available())
    
  • onnxruntime_genai.is_webgpu_available() -> bool

    print(onnxruntime_genai.is_webgpu_available())
    
  • onnxruntime_genai.is_qnn_available() -> bool

    print(onnxruntime_genai.is_qnn_available())
    
  • onnxruntime_genai.is_openvino_available() -> bool

    print(onnxruntime_genai.is_openvino_available())
    
  • onnxruntime_genai.set_current_gpu_device_id(device_id: int)

    onnxruntime_genai.set_current_gpu_device_id(0)
    
  • onnxruntime_genai.get_current_gpu_device_id() -> int

    print(onnxruntime_genai.get_current_gpu_device_id())
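
As a small sketch, the availability checks can be combined with the device-id helpers before loading a model (the device id 0 is illustrative):

    if onnxruntime_genai.is_cuda_available():
        onnxruntime_genai.set_current_gpu_device_id(0)
        print("Generating on GPU", onnxruntime_genai.get_current_gpu_device_id())
    else:
        print("CUDA is not available; running on CPU")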