Domain Track
Difficulty 3/5
C++ for AI / Machine Learning
C++ in AI and ML — high-performance inference, ONNX Runtime, libtorch, custom CUDA kernels, SIMD, and deploying ML models without Python.
Why C++ in AI/ML
Python is the training language; C++ is the deployment language. Most ML frameworks have C++ backends (PyTorch/libtorch, TensorFlow, ONNX Runtime) and expose native C++ APIs. Use C++ when you need:
- Low-latency inference — no Python GIL, no interpreter overhead
- Embedded / edge deployment — no Python runtime on device
- Custom operators / CUDA kernels — extend frameworks in C++
- Production serving — integrate ML directly into your C++ service
ONNX Runtime — universal inference engine
ONNX Runtime runs models exported from PyTorch, TensorFlow, scikit-learn, and more.
cpp
// CMakeLists.txt:
// find_package(onnxruntime REQUIRED)
// target_link_libraries(myapp onnxruntime::onnxruntime)
#include <onnxruntime_cxx_api.h>
#include <vector>
// Minimal ONNX Runtime inference: load model.onnx, feed one 1x3x224x224 float
// tensor, and read back the first output.
int main() {
    // Environment shared by all sessions; logs WARNING and above under the id "inference".
    Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "inference");

    Ort::SessionOptions opts;
    opts.SetIntraOpNumThreads(4);
    // opts.AppendExecutionProvider_CUDA(0); // enable GPU

    // The model path is an ORTCHAR_T string: wchar_t on Windows, char elsewhere.
    // ORT_TSTR picks the right literal kind; a hard-coded L"..." only builds on Windows.
    Ort::Session session(env, ORT_TSTR("model.onnx"), opts);

    // Input tensor setup
    std::vector<float> input_data(1 * 3 * 224 * 224, 0.5f); // batch=1, CHW
    std::vector<int64_t> input_shape = {1, 3, 224, 224};
    Ort::MemoryInfo mem_info = Ort::MemoryInfo::CreateCpu(
        OrtArenaAllocator, OrtMemTypeDefault);
    // The tensor is handed input_data's buffer directly, so input_data must
    // stay alive (and unmodified) until Run() has returned.
    Ort::Value input_tensor = Ort::Value::CreateTensor<float>(
        mem_info, input_data.data(), input_data.size(),
        input_shape.data(), input_shape.size());

    // Run inference
    // NOTE(review): "input"/"output" are assumed to be the tensor names stored
    // in model.onnx — verify against the exported graph.
    const char* input_names[] = {"input"};
    const char* output_names[] = {"output"};
    auto outputs = session.Run(
        Ort::RunOptions{nullptr},
        input_names, &input_tensor, 1,
        output_names, 1);

    // Read output
    float* out_data = outputs[0].GetTensorMutableData<float>();
    auto out_shape = outputs[0].GetTensorTypeAndShapeInfo().GetShape();
    // out_data[0..N-1] = logits
}
libtorch — PyTorch C++ API
cpp
#include <torch/torch.h>
#include <torch/script.h> // TorchScript
// Load a TorchScript model (export with torch.jit.script or torch.jit.trace)
torch::jit::script::Module model = torch::jit::load("model.pt");
model.eval();
// Create input tensor
auto input = torch::randn({1, 3, 224, 224});
if (torch::cuda::is_available()) input = input.to(torch::kCUDA);
// Run inference
std::vector<torch::jit::IValue> inputs = {input};
auto output = model.forward(inputs).toTensor();
// Postprocess
auto probs = torch::softmax(output, 1);
auto [top_prob, top_idx] = probs.topk(5, 1);Defining a model in C++ (nn::Module)
cpp
struct Net : torch::nn::Module {
torch::nn::Linear fc1{nullptr}, fc2{nullptr};
Net(int in_features, int hidden, int num_classes) {
fc1 = register_module("fc1", torch::nn::Linear(in_features, hidden));
fc2 = register_module("fc2", torch::nn::Linear(hidden, num_classes));
}
torch::Tensor forward(torch::Tensor x) {
x = torch::relu(fc1->forward(x));
x = fc2->forward(x);
return x;
}
};
Net net(784, 256, 10);
auto optimizer = torch::optim::Adam(net.parameters(), 1e-3);
// Training step
optimizer.zero_grad();
auto loss = torch::nn::functional::cross_entropy(net.forward(batch_x), batch_y);
loss.backward();
optimizer.step();BLAS/LAPACK for linear algebra
For inference without a full ML framework — matrix multiplications power transformers, CNNs, MLPs.
cpp
// OpenBLAS / MKL — dgemm for double, sgemm for float
// C = alpha*A*B + beta*C
// A: m×k, B: k×n, C: m×n (row-major storage, as passed to cblas_sgemm below)
#include <cblas.h>
// Single-precision matrix product C = A * B via cblas_sgemm.
//   A: m x k, B: k x n, C: m x n, all passed as row-major with no transpose.
//   C is overwritten (beta = 0), so its previous contents are not used.
void matmul(const float* A, const float* B, float* C, int m, int k, int n) {
    // Leading dimensions handed to cblas_sgemm for the row-major layout.
    const int lda = k;
    const int ldb = n;
    const int ldc = n;
    const float alpha = 1.0f;  // scale applied to A*B
    const float beta = 0.0f;   // scale applied to the incoming C
    cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
                m, n, k,
                alpha, A, lda,
                B, ldb,
                beta, C, ldc);
}
// ~10-100× faster than naive loops on large matrices (SIMD + threading)
Custom CUDA kernel
cpp
// softmax kernel — simple illustrative version
// Numerically stable softmax of one n-element vector, computed by a single block.
//
// Launch requirements, all following from the indexing below:
//   - one block with blockDim.x >= n (thread t handles element t; blockIdx is unused)
//   - blockDim.x is a power of two (the halving reductions skip elements otherwise)
//   - blockDim.x * sizeof(float) bytes of dynamic shared memory
//
//   input:  n floats to normalise
//   output: n floats, output[i] = exp(input[i] - max) / sum_j exp(input[j] - max)
__global__ void softmax_kernel(const float* input, float* output, int n) {
    int tid = threadIdx.x;
    extern __shared__ float smem[];

    // Pass 1: block-wide maximum. Threads past n contribute -FLT_MAX so they never win.
    smem[tid] = (tid < n) ? input[tid] : -FLT_MAX;
    __syncthreads();
    for (int stride = blockDim.x / 2; stride > 0; stride /= 2) {
        if (tid < stride) smem[tid] = fmaxf(smem[tid], smem[tid + stride]);
        __syncthreads();
    }
    float max_val = smem[0];
    // All threads must have read smem[0] before anyone overwrites smem for the
    // sum pass; without this barrier thread 0 can clobber the max while other
    // warps are still about to read it.
    __syncthreads();

    // Pass 2: block-wide sum of exponentials. Keep this thread's term in a
    // register so it is not recomputed for the final division.
    float exp_val = (tid < n) ? expf(input[tid] - max_val) : 0.0f;
    smem[tid] = exp_val;
    __syncthreads();
    for (int stride = blockDim.x / 2; stride > 0; stride /= 2) {
        if (tid < stride) smem[tid] += smem[tid + stride];
        __syncthreads();
    }
    float sum = smem[0];

    if (tid < n) output[tid] = exp_val / sum;
}
// Launch: one block of n threads, with one float of dynamic shared memory per thread
int n = 1024;
const size_t smem_bytes = n * sizeof(float);
softmax_kernel<<<1, n, smem_bytes>>>(d_input, d_output, n);
Embedding a tokenizer with HuggingFace tokenizers C++ bindings
cpp
// tokenizers-cpp (HuggingFace tokenizers Rust → C++ via FFI)
#include "tokenizers_cpp.h"
auto tokenizer = tokenizers::Tokenizer::FromFile("tokenizer.json");
std::string text = "The quick brown fox";
auto encoding = tokenizer->Encode(text, /*add_special_tokens=*/true);
std::vector<int> input_ids(encoding.GetIds().begin(), encoding.GetIds().end());
// input_ids ready for transformer modelOn-device inference: NCNN (Tencent, mobile/edge)
cpp
#include "ncnn/net.h"
ncnn::Net net;
net.opt.use_vulkan_compute = true; // GPU on Android/iOS
net.load_param("mobilenet.param");
net.load_model("mobilenet.bin");
ncnn::Mat in = ncnn::Mat::from_pixels_resize(rgb_data, ncnn::Mat::PIXEL_RGB,
orig_w, orig_h, 224, 224);
// Normalize
const float mean_vals[3] = {103.94f, 116.78f, 123.68f};
const float norm_vals[3] = {0.017f, 0.017f, 0.017f};
in.substract_mean_normalize(mean_vals, norm_vals);
ncnn::Extractor ex = net.create_extractor();
ex.input("data", in);
ncnn::Mat out;
ex.extract("prob", out);
// out[i] = probability of class iKey libraries
| Library | Purpose | Notes |
|---|---|---|
| ONNX Runtime | Universal inference (CPU/GPU/TensorRT) | Microsoft, production-grade |
| libtorch | PyTorch C++ API | Training + inference |
| TensorFlow C API | TF inference | Official C API, C++ wrapper available |
| NCNN | Mobile/edge inference | No dependencies, Vulkan support |
| MNN | Mobile neural network | Alibaba, fast mobile inference |
| OpenBLAS / MKL | BLAS operations | Core of all matrix computation |
| Eigen | Matrix/vector math | Header-only, no BLAS needed |
| FAISS | Vector similarity search | Facebook AI, used for embeddings |
| Triton Inference Server | gRPC/HTTP serving | NVIDIA, ONNX/TensorRT/TorchScript |
SIMD for inference hot paths
cpp
#include <immintrin.h>
// AVX2 dot product — 8 floats per instruction
// Dot product of a[0..n) and b[0..n) using 256-bit FMA on 8 floats at a time.
// Unaligned loads are used, so a and b need no particular alignment.
// Returns 0 when n <= 0.
float dot_avx2(const float* a, const float* b, int n) {
    __m256 acc = _mm256_setzero_ps();

    // Vector part: whole groups of 8 elements.
    int pos = 0;
    while (pos + 8 <= n) {
        const __m256 lhs = _mm256_loadu_ps(a + pos);
        const __m256 rhs = _mm256_loadu_ps(b + pos);
        acc = _mm256_fmadd_ps(lhs, rhs, acc);  // acc += lhs * rhs
        pos += 8;
    }

    // Horizontal sum: spill the 8 lanes and add them left to right.
    float lanes[8];
    _mm256_storeu_ps(lanes, acc);
    float total = lanes[0];
    for (int lane = 1; lane < 8; ++lane) {
        total += lanes[lane];
    }

    // Scalar tail: the last n % 8 elements.
    while (pos < n) {
        total += a[pos] * b[pos];
        ++pos;
    }
    return total;
}
Quantization (INT8/FP16)
cpp
// ONNX Runtime INT8 quantization at load time
Ort::SessionOptions opts;
// ORT_ENABLE_ALL applies all quantization passes
opts.SetGraphOptimizationLevel(ORT_ENABLE_ALL);
// Or load a pre-quantized INT8 model (quantized offline with onnxruntime tools)
// python -m onnxruntime.quantization.quantize_static model.onnx model_int8.onnx calibration_data
// libtorch FP16
auto model_half = model.to(torch::kHalf);
auto input_half = input.to(torch::kHalf);
auto output = model_half.forward({input_half}).toTensor();