ONNX Runtime GenAI C++ API

Note: this API is in preview and is subject to change.

Overview

This document describes the C++ API for ONNX Runtime GenAI.
Below are the main classes and methods, with code snippets and descriptions for each.


OgaModel

Create

Creates a model from a configuration directory (optionally with runtime settings), or from an existing OgaConfig object.

auto model = OgaModel::Create("path/to/model_dir");
auto model2 = OgaModel::Create("path/to/model_dir", *settings);
auto model3 = OgaModel::Create(*config);

GetType

Gets the type of the model.

auto type = model->GetType();

GetDeviceType

Gets the device type used by the model.

auto device_type = model->GetDeviceType();

OgaConfig

Create

Creates a configuration object from a config path.

auto config = OgaConfig::Create("path/to/model_dir");

ClearProviders

Clears all providers from the configuration.

config->ClearProviders();

AppendProvider

Appends a provider to the configuration.

config->AppendProvider("CUDAExecutionProvider");

SetProviderOption

Sets a provider option in the configuration.

config->SetProviderOption("CUDAExecutionProvider", "device_id", "0");

Overlay

Overlays a JSON string onto the configuration.

config->Overlay("{\"option\": \"value\"}");

OgaRuntimeSettings

Create

Creates a runtime settings object.

auto settings = OgaRuntimeSettings::Create();

SetHandle

Sets a named handle in the runtime settings.

settings->SetHandle("custom_handle", handle_ptr);

OgaTokenizer

Create

Creates a tokenizer for the given model.

auto tokenizer = OgaTokenizer::Create(*model);

Encode

Encodes a string and adds the encoded sequence of tokens to the provided OgaSequences.

auto sequences = OgaSequences::Create();
tokenizer->Encode("Hello world", *sequences);

EncodeBatch

Encodes a batch of strings and returns the encoded tokens as a tensor.

const char* texts[] = {"Hello", "World"};
auto tensor = tokenizer->EncodeBatch(texts, 2);

ToTokenId

Converts a string to its corresponding token ID.

int32_t token_id = tokenizer->ToTokenId("Hello");

Decode

Decodes a sequence of tokens into a string.

auto str = tokenizer->Decode(tokens, token_count);

ApplyChatTemplate

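Applies a chat template to the given messages and tools and returns the resulting prompt string. The final argument specifies whether a generation prompt is added.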

auto templated = tokenizer->ApplyChatTemplate("template", "messages", "tools", true);

DecodeBatch

Decodes a batch of token sequences.

auto decoded = tokenizer->DecodeBatch(*tensor);

OgaTokenizerStream

Create

Creates a tokenizer stream for incremental decoding.

auto stream = OgaTokenizerStream::Create(*tokenizer);

Decode

Decodes a single token in the stream. If this results in a word being generated, it will be returned.

const char* chunk = stream->Decode(token);

OgaSequences

Create

Creates an empty OgaSequences object.

auto sequences = OgaSequences::Create();

Count

Returns the number of sequences.

size_t n = sequences->Count();

SequenceCount

Returns the number of tokens in the sequence at the given index.

size_t tokens = sequences->SequenceCount(0);

SequenceData

Returns a pointer to the token data for the sequence at the given index.

const int32_t* data = sequences->SequenceData(0);

Append

Appends a new sequence of tokens to the sequences, or appends a single token to the sequence at the given index.

sequences->Append(tokens, token_count);
sequences->Append(token, sequence_index);

OgaGeneratorParams

Create

Creates generator parameters for the given model.

auto params = OgaGeneratorParams::Create(*model);

SetSearchOption

Sets a numeric search option.

params->SetSearchOption("max_length", 128);

SetSearchOptionBool

Sets a boolean search option.

params->SetSearchOptionBool("do_sample", true);

SetModelInput

Sets an additional model input.

params->SetModelInput("input_name", *tensor);

SetInputs

Sets named tensors as inputs.

params->SetInputs(*named_tensors);

SetGuidance

Sets the guidance type and the corresponding guidance data.

params->SetGuidance("type", "data");

OgaGenerator

Create

Creates a generator from the given model and parameters.

auto generator = OgaGenerator::Create(*model, *params);

IsDone

Checks if generation is complete.

bool done = generator->IsDone();

AppendTokenSequences

Appends token sequences to the generator.

generator->AppendTokenSequences(*sequences);

AppendTokens

Appends tokens to the generator.

generator->AppendTokens(tokens, token_count);

IsSessionTerminated

Checks if the session is terminated.

bool terminated = generator->IsSessionTerminated();

GenerateNextToken

Generates the next token.

generator->GenerateNextToken();

RewindTo

Rewinds the sequence to a new length.

generator->RewindTo(new_length);

SetRuntimeOption

Sets a runtime option.

generator->SetRuntimeOption("terminate_session", "1");

GetSequenceCount

Returns the number of tokens in the sequence at the given index.

size_t count = generator->GetSequenceCount(0);

GetSequenceData

Returns a pointer to the sequence data at the given index.

const int32_t* data = generator->GetSequenceData(0);

GetOutput

Gets a named output tensor.

auto tensor = generator->GetOutput("output_name");

GetLogits

Gets the logits tensor.

auto logits = generator->GetLogits();

SetLogits

Sets the logits tensor.

generator->SetLogits(*tensor);

SetActiveAdapter

Sets the active adapter for the generator.

generator->SetActiveAdapter(*adapters, "adapter_name");

OgaTensor

Create

Creates a tensor from a data buffer with the given shape and element type.

auto tensor = OgaTensor::Create(data, shape, shape_dims_count, element_type);

Type

Returns the element type of the tensor.

auto type = tensor->Type();

Shape

Returns the shape of the tensor.

auto shape = tensor->Shape();

Data

Returns a pointer to the tensor data.

void* data = tensor->Data();

OgaImages

Load

Loads images from file paths or memory buffers.

std::vector<const char*> image_paths = {"img1.png", "img2.png"};
auto images = OgaImages::Load(image_paths);

auto images2 = OgaImages::Load(image_data_ptrs, image_sizes, count);

OgaAudios

Load

Loads audio data from file paths or memory buffers.

std::vector<const char*> audio_paths = {"audio1.wav", "audio2.wav"};
auto audios = OgaAudios::Load(audio_paths);

auto audios2 = OgaAudios::Load(audio_data_ptrs, audio_sizes, count);

OgaNamedTensors

Create

Creates a named tensors object.

auto named_tensors = OgaNamedTensors::Create();

Get

Gets a tensor by name.

auto tensor = named_tensors->Get("input_name");

Set

Sets a tensor by name.

named_tensors->Set("input_name", *tensor);

Delete

Deletes a tensor by name.

named_tensors->Delete("input_name");

Count

Returns the number of named tensors.

size_t count = named_tensors->Count();

GetNames

Gets the names of all tensors.

auto names = named_tensors->GetNames();

OgaAdapters

Create

Creates an adapters manager for the given model.

auto adapters = OgaAdapters::Create(*model);

LoadAdapter

Loads an adapter from file.

adapters->LoadAdapter("adapter_file_path", "adapter_name");

UnloadAdapter

Unloads an adapter by name.

adapters->UnloadAdapter("adapter_name");

OgaMultiModalProcessor

Create

Creates a multi-modal processor for the given model.

auto processor = OgaMultiModalProcessor::Create(*model);

ProcessImages

Processes a prompt together with images and returns named tensors.

auto named_tensors = processor->ProcessImages("prompt", images.get());

ProcessAudios

Processes audio inputs and returns named tensors.

auto named_tensors = processor->ProcessAudios(audios.get());

ProcessImagesAndAudios

Processes a prompt together with both images and audio inputs and returns named tensors.

auto named_tensors = processor->ProcessImagesAndAudios("prompt", images.get(), audios.get());

Decode

Decodes a sequence of tokens into a string.

auto str = processor->Decode(tokens, token_count);

OgaHandle

Constructor / Destructor

Manages the lifetime of the global Oga runtime: constructing an OgaHandle initializes the runtime, and destroying it shuts the runtime down.

OgaHandle handle;

Oga Utility Functions

SetLogBool

Sets a boolean logging option.

Oga::SetLogBool("option_name", true);

SetLogString

Sets a string logging option.

Oga::SetLogString("option_name", "value");

SetCurrentGpuDeviceId

Sets the current GPU device ID.

Oga::SetCurrentGpuDeviceId(0);

GetCurrentGpuDeviceId

Gets the current GPU device ID.

int id = Oga::GetCurrentGpuDeviceId();