Skip to content
C++
Domain Track
Difficulty 3/5

C++ for AI / Machine Learning

C++ in AI and ML — high-performance inference, ONNX Runtime, libtorch, custom CUDA kernels, SIMD, and deploying ML models without Python.

Why C++ in AI/ML

Python is the training language; C++ is the deployment language. Most ML frameworks have C++ backends (PyTorch/libtorch, TensorFlow, ONNX Runtime) and expose native C++ APIs. Use C++ when you need:

  • Low-latency inference — no Python GIL, no interpreter overhead
  • Embedded / edge deployment — no Python runtime on device
  • Custom operators / CUDA kernels — extend frameworks in C++
  • Production serving — integrate ML directly into your C++ service

ONNX Runtime — universal inference engine

ONNX Runtime runs models exported from PyTorch, TensorFlow, scikit-learn, and more.

cpp
// CMakeLists.txt:
// find_package(onnxruntime REQUIRED)
// target_link_libraries(myapp onnxruntime::onnxruntime)

#include <onnxruntime_cxx_api.h>
#include <vector>

/// Minimal ONNX Runtime inference example.
///
/// Creates a CPU session for "model.onnx", feeds a single float tensor of
/// shape {1, 3, 224, 224} filled with 0.5, and fetches one output tensor.
/// The tensor names "input" and "output" are placeholders and must match the
/// names stored in the model being loaded.
int main() {
    Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "inference");
    Ort::SessionOptions opts;
    opts.SetIntraOpNumThreads(4);
    // Enable GPU (the C++ wrapper takes a provider-options struct):
    // OrtCUDAProviderOptions cuda_opts{};
    // opts.AppendExecutionProvider_CUDA(cuda_opts);

    // The model path is an ORTCHAR_T string: wchar_t on Windows, char
    // elsewhere. ORT_TSTR produces the right literal type on each platform,
    // whereas a hard-coded L"..." literal only compiles on Windows.
    Ort::Session session(env, ORT_TSTR("model.onnx"), opts);

    // Input tensor setup
    std::vector<float> input_data(1 * 3 * 224 * 224, 0.5f);  // batch=1, CHW
    std::vector<int64_t> input_shape = {1, 3, 224, 224};

    Ort::MemoryInfo mem_info = Ort::MemoryInfo::CreateCpu(
        OrtArenaAllocator, OrtMemTypeDefault);

    // The tensor wraps input_data's buffer without copying it, so input_data
    // has to stay alive until Run() has returned.
    Ort::Value input_tensor = Ort::Value::CreateTensor<float>(
        mem_info, input_data.data(), input_data.size(),
        input_shape.data(), input_shape.size());

    // Run inference
    const char* input_names[] = {"input"};
    const char* output_names[] = {"output"};

    auto outputs = session.Run(
        Ort::RunOptions{nullptr},
        input_names, &input_tensor, 1,
        output_names, 1);

    // Read output
    float* out_data = outputs[0].GetTensorMutableData<float>();
    auto out_shape = outputs[0].GetTensorTypeAndShapeInfo().GetShape();
    // out_data[0..N-1] = logits, where N is the product of out_shape
}

libtorch — PyTorch C++ API

cpp
#include <torch/torch.h>
#include <torch/script.h>  // TorchScript

// Load a TorchScript model (export with torch.jit.script or torch.jit.trace)
torch::jit::script::Module model = torch::jit::load("model.pt");
model.eval();

// Create input tensor
auto input = torch::randn({1, 3, 224, 224});
if (torch::cuda::is_available()) input = input.to(torch::kCUDA);

// Run inference
std::vector<torch::jit::IValue> inputs = {input};
auto output = model.forward(inputs).toTensor();

// Postprocess
auto probs = torch::softmax(output, 1);
auto [top_prob, top_idx] = probs.topk(5, 1);

Defining a model in C++ (nn::Module)

cpp
struct Net : torch::nn::Module {
    torch::nn::Linear fc1{nullptr}, fc2{nullptr};

    Net(int in_features, int hidden, int num_classes) {
        fc1 = register_module("fc1", torch::nn::Linear(in_features, hidden));
        fc2 = register_module("fc2", torch::nn::Linear(hidden, num_classes));
    }

    torch::Tensor forward(torch::Tensor x) {
        x = torch::relu(fc1->forward(x));
        x = fc2->forward(x);
        return x;
    }
};

Net net(784, 256, 10);
auto optimizer = torch::optim::Adam(net.parameters(), 1e-3);

// Training step
optimizer.zero_grad();
auto loss = torch::nn::functional::cross_entropy(net.forward(batch_x), batch_y);
loss.backward();
optimizer.step();

BLAS/LAPACK for linear algebra

For inference without a full ML framework — matrix multiplications power transformers, CNNs, MLPs.

cpp
// OpenBLAS / MKL — dgemm for double, sgemm for float

// C = alpha*A*B + beta*C
// A: m×k, B: k×n, C: m×n (row-major storage, matching the CblasRowMajor argument below)
#include <cblas.h>

/// Single-precision matrix product C = A * B via BLAS sgemm.
///
/// A is m×k, B is k×n and C is m×n, all passed as row-major arrays whose
/// leading dimension equals their column count. With beta = 0 the previous
/// contents of C do not contribute to the result.
void matmul(const float* A, const float* B, float* C, int m, int k, int n) {
    const float alpha = 1.0f;  // scale applied to A*B
    const float beta = 0.0f;   // scale applied to the existing C

    // Leading dimensions: row length of each row-major matrix.
    const int lda = k;
    const int ldb = n;
    const int ldc = n;

    cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
                m, n, k,
                alpha, A, lda,
                B, ldb,
                beta, C, ldc);
}
// ~10-100× faster than naive loops on large matrices (SIMD + threading)

Custom CUDA kernel

cpp
// softmax kernel — simple illustrative version
//
// Computes a numerically stable softmax of input[0..n-1] into output[0..n-1].
//
// Launch requirements that follow from the code:
//   * a single block: indexing uses threadIdx.x only, one element per thread,
//     so blockDim.x must be >= n;
//   * blockDim.x must be a power of two, because the halving tree reduction
//     would otherwise skip elements;
//   * blockDim.x * sizeof(float) bytes of dynamic shared memory.
// FLT_MAX comes from <cfloat> / <float.h>.
__global__ void softmax_kernel(const float* input, float* output, int n) {
    int tid = threadIdx.x;
    extern __shared__ float smem[];

    const bool active = tid < n;

    // Load and find max (numerically stable softmax); padding threads
    // contribute -FLT_MAX so they never win the max.
    smem[tid] = active ? input[tid] : -FLT_MAX;
    __syncthreads();

    for (int stride = blockDim.x / 2; stride > 0; stride /= 2) {
        if (tid < stride) smem[tid] = fmaxf(smem[tid], smem[tid + stride]);
        __syncthreads();
    }
    float max_val = smem[0];

    // Every thread must have read smem[0] before thread 0 reuses that slot
    // for its exponential below; without this barrier a slow thread could
    // read an exp value instead of the max.
    __syncthreads();

    // Keep the exponential in a register so it is computed only once.
    const float e = active ? expf(input[tid] - max_val) : 0.0f;
    smem[tid] = e;
    __syncthreads();

    // Sum reduction over the exponentials (padding threads contribute 0).
    for (int stride = blockDim.x / 2; stride > 0; stride /= 2) {
        if (tid < stride) smem[tid] += smem[tid + stride];
        __syncthreads();
    }
    float sum = smem[0];

    if (active) output[tid] = e / sum;
}

// Launch: one block, one thread per element, and one float of dynamic
// shared memory per thread.
int n = 1024;
const size_t smem_bytes = n * sizeof(float);
softmax_kernel<<<1, n, smem_bytes>>>(d_input, d_output, n);

Embedding a tokenizer with HuggingFace tokenizers C++ bindings

cpp
// tokenizers-cpp (HuggingFace tokenizers Rust → C++ via FFI)
#include "tokenizers_cpp.h"

// NOTE(review): confirm FromFile(), Encode(text, bool) and GetIds() against
// the tokenizers-cpp version you build with — the factory and Encode
// signatures may differ between releases.
auto tokenizer = tokenizers::Tokenizer::FromFile("tokenizer.json");

std::string text = "The quick brown fox";
auto encoding = tokenizer->Encode(text, /*add_special_tokens=*/true);

// Fetch the ids once. Calling GetIds() separately for begin() and end() would
// pair iterators from two different temporaries if it returns by value.
const auto& ids = encoding.GetIds();
std::vector<int> input_ids(ids.begin(), ids.end());
// input_ids ready for transformer model

On-device inference: NCNN (Tencent, mobile/edge)

cpp
#include "ncnn/net.h"

ncnn::Net net;
net.opt.use_vulkan_compute = true;  // GPU on Android/iOS; set before loading
net.load_param("mobilenet.param");
net.load_model("mobilenet.bin");

// rgb_data holds RGB pixels, but mean_vals below are listed in B,G,R order
// (103.94 / 116.78 / 123.68). Convert to BGR while resizing so each channel
// is paired with its own mean.
// NOTE(review): this assumes the model expects BGR input — if it was trained
// on RGB, keep PIXEL_RGB and reverse mean_vals instead.
ncnn::Mat in = ncnn::Mat::from_pixels_resize(rgb_data, ncnn::Mat::PIXEL_RGB2BGR,
    orig_w, orig_h, 224, 224);

// Normalize: (pixel - mean) * norm, per channel
const float mean_vals[3] = {103.94f, 116.78f, 123.68f};
const float norm_vals[3] = {0.017f, 0.017f, 0.017f};
in.substract_mean_normalize(mean_vals, norm_vals);  // spelling is ncnn's own

ncnn::Extractor ex = net.create_extractor();
ex.input("data", in);

ncnn::Mat out;
ex.extract("prob", out);
// out[i] = probability of class i

Key libraries

| Library | Purpose | Notes |
|---|---|---|
| ONNX Runtime | Universal inference (CPU/GPU/TensorRT) | Microsoft, production-grade |
| libtorch | PyTorch C++ API | Training + inference |
| TensorFlow C API | TF inference | Official C API, C++ wrapper available |
| NCNN | Mobile/edge inference | No dependencies, Vulkan support |
| MNN | Mobile neural network | Alibaba, fast mobile inference |
| OpenBLAS / MKL | BLAS operations | Core of all matrix computation |
| Eigen | Matrix/vector math | Header-only, no BLAS needed |
| FAISS | Vector similarity search | Facebook AI, used for embeddings |
| Triton Inference Server | gRPC/HTTP serving | NVIDIA, ONNX/TensorRT/TorchScript |

SIMD for inference hot paths

cpp
#include <immintrin.h>

// AVX2 dot product — 8 floats per instruction
//
// Returns sum(a[i] * b[i]) for i in [0, n). Neither pointer needs to be
// aligned (unaligned loads are used). n <= 0 yields 0.
//
// The target attribute lets GCC/Clang compile this one function with AVX2 and
// FMA enabled even when the rest of the translation unit is built for
// baseline x86-64. The caller is still responsible for only invoking it on
// CPUs that support AVX2 and FMA.
#if defined(__GNUC__) || defined(__clang__)
__attribute__((target("avx2,fma")))
#endif
float dot_avx2(const float* a, const float* b, int n) {
    __m256 sum = _mm256_setzero_ps();
    int i = 0;
    // Main loop: 8 lanes per iteration while a full vector remains.
    for (; i + 8 <= n; i += 8) {
        const __m256 va = _mm256_loadu_ps(a + i);
        const __m256 vb = _mm256_loadu_ps(b + i);
        sum = _mm256_fmadd_ps(va, vb, sum);  // FMA: sum += va * vb
    }
    // Horizontal sum of the 8 partial sums
    float tmp[8];
    _mm256_storeu_ps(tmp, sum);
    float result = tmp[0]+tmp[1]+tmp[2]+tmp[3]+tmp[4]+tmp[5]+tmp[6]+tmp[7];
    // Scalar remainder: the last n % 8 elements
    for (; i < n; ++i) result += a[i] * b[i];
    return result;
}

Quantization (INT8/FP16)

cpp
// ONNX Runtime: session options with graph optimizations enabled.
// NOTE: this does not quantize anything at load time.
Ort::SessionOptions opts;
// ORT_ENABLE_ALL selects the highest graph-optimization level; it does not
// apply quantization passes or change the model's numeric precision.
opts.SetGraphOptimizationLevel(ORT_ENABLE_ALL);

// For INT8, load a pre-quantized model (quantized offline with onnxruntime tools)
// NOTE(review): confirm the command below against your onnxruntime version —
// quantize_static is documented as a Python function taking a calibration data reader.
// python -m onnxruntime.quantization.quantize_static model.onnx model_int8.onnx calibration_data

// libtorch FP16
auto model_half = model.to(torch::kHalf);
auto input_half = input.to(torch::kHalf);
auto output = model_half.forward({input_half}).toTensor();