Custom Allocators for HFT in C++
"Memory allocation strategies for ultra-low-latency C++: slab allocators, arena/pool allocators, PMR, and eliminating heap allocation on the hot path."
TL;DR
malloc/free are non-deterministic — they can take anywhere from 50ns to 50µs depending on heap state. In HFT, no allocation happens on the hot path. All memory is pre-allocated at startup and recycled. Use slab allocators for fixed-size objects, arenas for per-message scratch space, and C++17 PMR to retrofit standard containers.
Why allocator latency matters
malloc() latency (approximate):
Best case (free list hit): ~50 ns
Thread cache miss: ~200 ns
mmap() call: ~1–5 µs
Heavy fragmentation: ~10–50 µs
Target order latency: < 1 µs
A single std::string construction on the hot path can eat your entire latency budget.
Slab allocator — fixed-size objects
A slab allocator keeps a free list of same-size blocks. Alloc/free = pointer swap.
/// Fixed-capacity pool of equally sized slots for objects of type T.
///
/// All storage is embedded in the allocator object, so construct() and
/// destroy() never touch the heap. Free slots form an intrusive singly linked
/// list that reuses each slot's own bytes for the link.
///
/// There is no synchronisation: use one pool per thread or guard it externally.
/// The pool has no destructor logic, so objects still alive when the pool goes
/// away do not get their destructors run.
template<typename T, size_t Capacity>
class SlabAllocator {
    static_assert(Capacity > 0, "SlabAllocator needs at least one slot");

public:
    SlabAllocator() {
        // Pre-build the free list: slot i links to slot i + 1, the last ends it.
        for (size_t i = 0; i + 1 < Capacity; ++i)
            storage_[i].next = &storage_[i + 1];
        storage_[Capacity - 1].next = nullptr;
        free_head_ = &storage_[0];
    }

    // free_head_ and every link point into this object's own storage_, so a
    // copied or moved pool would hand out slots that belong to the source.
    SlabAllocator(const SlabAllocator&) = delete;
    SlabAllocator& operator=(const SlabAllocator&) = delete;

    /// Constructs a T as `T(args...)` in a free slot.
    /// @return the new object, or nullptr when every slot is in use.
    /// If T's constructor throws, the slot stays on the free list and the
    /// exception propagates.
    template<typename... Args>
    T* construct(Args&&... args) {
        Slot* const slot = free_head_;
        if (!slot) return nullptr;  // pool exhausted
        Slot* const next = slot->next;

        // The constructor writes over the bytes that hold the link. If it
        // throws part-way, put the link back so the free list stays intact.
        struct LinkRestorer {
            Slot* slot;
            Slot* next;
            bool armed;
            ~LinkRestorer() { if (armed) slot->next = next; }
        } restorer{slot, next, true};

        T* const obj = ::new (static_cast<void*>(slot)) T(std::forward<Args>(args)...);
        restorer.armed = false;
        free_head_ = next;  // pop only once construction has succeeded
        return obj;
    }

    /// Runs ~T() and returns the slot to the free list.
    /// @param p an object obtained from construct() on this pool; nullptr is ignored.
    void destroy(T* p) {
        if (!p) return;
        p->~T();
        auto* slot = reinterpret_cast<Slot*>(p);
        slot->next = free_head_;
        free_head_ = slot;
    }

    /// Number of free slots. Walks the free list, so this is O(free slots);
    /// intended for diagnostics rather than the hot path.
    size_t available() const {
        size_t count = 0;
        for (const Slot* s = free_head_; s; s = s->next) ++count;
        return count;
    }

private:
    // A slot is either raw storage for one T or, while free, a link to the
    // next free slot.
    union Slot {
        alignas(T) std::byte data[sizeof(T)];
        Slot* next;
    };

    std::array<Slot, Capacity> storage_;
    Slot* free_head_;
};
// Usage: order pool — pre-allocated, zero runtime allocation
struct Order { uint64_t id; double price; int32_t qty; Side side; };
static SlabAllocator<Order, 65536> order_pool;
Order* o = order_pool.construct(next_id++, 150.25, 100, Side::Buy);
// ... process order ...
order_pool.destroy(o);Arena allocator — per-message scratch space
An arena bumps a pointer forward on each allocation; resetting that pointer to the base frees everything at once in O(1), without running any destructors.
/// Bump ("arena") allocator over a single heap buffer obtained once in the
/// constructor.
///
/// alloc() advances an offset; nothing is freed individually. reset() rewinds
/// the offset so the whole buffer can be reused. There is no synchronisation.
class ArenaAllocator {
public:
    /// @param capacity size of the backing buffer in bytes.
    explicit ArenaAllocator(size_t capacity)
        : buf_(std::make_unique<std::byte[]>(capacity))
        , capacity_(capacity)
        , offset_(0) {}

    /// Reserves `size` bytes whose address is a multiple of `align`.
    /// @param align must be a non-zero power of two.
    /// @return the block, or nullptr if `align` is invalid or the request
    ///         does not fit in the remaining space.
    void* alloc(size_t size, size_t align = alignof(std::max_align_t)) {
        if (align == 0 || (align & (align - 1)) != 0) return nullptr;

        // Align the real address, not just the offset: the buffer itself is
        // only guaranteed the default new[] alignment, so offset-based
        // rounding breaks for larger alignments. std::align also rejects
        // requests that do not fit, without the wrap-around that
        // `aligned + size` can suffer for very large sizes.
        void* cursor = buf_.get() + offset_;
        size_t space = capacity_ - offset_;
        void* const block = std::align(align, size, cursor, space);
        if (!block) return nullptr;  // out of space

        offset_ = static_cast<size_t>(static_cast<std::byte*>(block) - buf_.get()) + size;
        return block;
    }

    /// Allocates storage for a T and constructs it as `T(args...)`.
    /// @return the new object, or nullptr if the arena is out of space.
    template<typename T, typename... Args>
    T* create(Args&&... args) {
        void* const p = alloc(sizeof(T), alignof(T));
        return p ? new (p) T(std::forward<Args>(args)...) : nullptr;
    }

    /// Rewinds the arena in O(1). No destructors are called, so objects made
    /// with create() must not rely on theirs running.
    void reset() { offset_ = 0; }

    /// Bytes consumed so far, including alignment padding.
    size_t used() const { return offset_; }

private:
    std::unique_ptr<std::byte[]> buf_;
    size_t capacity_;
    size_t offset_;
};
// Per-message processing: allocate scratch, process, reset
static ArenaAllocator msg_arena{64 * 1024}; // 64KB per-message scratch
void processMarketUpdate(const RawMessage& raw) {
msg_arena.reset(); // free everything from last message — O(1)
auto* parsed = msg_arena.create<ParsedUpdate>(raw);
auto* enriched = msg_arena.create<EnrichedUpdate>(*parsed);
// ... all temporaries allocated from arena, no heap ...
}Lock-free pool — multi-threaded variant
#include <array>
#include <atomic>
#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <new>
/// Fixed-capacity pool of storage slots for T that several threads can
/// acquire from and release to concurrently without locks.
///
/// The free list is a Treiber stack. Its head packs a 32-bit node index with
/// a 32-bit tag that is bumped on every successful push and pop. The tag is
/// what defeats ABA: a thread that read the head, stalled, and then sees the
/// same node back on top after other threads popped and re-pushed it will
/// fail its compare-exchange instead of installing a stale `next`.
///
/// acquire() hands out raw, unconstructed storage; the caller must construct
/// a T there (placement new) before passing it to release(), which runs ~T().
template<typename T, size_t N>
class LockFreePool {
    static_assert(N > 0, "LockFreePool needs at least one node");
    static_assert(N < 0xFFFFFFFFu, "node indices must fit in 32 bits");

public:
    LockFreePool() {
        // Chain node i -> i + 1; the last node terminates the list.
        for (size_t i = 0; i + 1 < N; ++i)
            nodes_[i].next.store(static_cast<std::uint32_t>(i + 1),
                                 std::memory_order_relaxed);
        nodes_[N - 1].next.store(kNull, std::memory_order_relaxed);
        head_.store(pack(0, 0), std::memory_order_release);
    }

    /// Pops one slot.
    /// @return storage suitably sized and aligned for a T, or nullptr when
    ///         the pool is exhausted.
    T* acquire() {
        std::uint64_t head = head_.load(std::memory_order_acquire);
        for (;;) {
            const std::uint32_t idx = index_of(head);
            if (idx == kNull) return nullptr;  // pool exhausted
            // May read a stale link if another thread popped and re-pushed
            // this node meanwhile; the tag check in the CAS rejects that case.
            const std::uint32_t next = nodes_[idx].next.load(std::memory_order_relaxed);
            // Acquire on success so the new owner's writes are ordered after
            // whatever the previous owner did before releasing the slot.
            // On failure `head` is refreshed and the loop retries.
            if (head_.compare_exchange_weak(head, pack(next, tag_of(head) + 1),
                                            std::memory_order_acquire,
                                            std::memory_order_acquire))
                return reinterpret_cast<T*>(nodes_[idx].storage);
        }
    }

    /// Destroys *p and pushes its slot back onto the free list.
    /// @param p a pointer obtained from acquire() on this pool that holds a
    ///          live T; nullptr is ignored.
    void release(T* p) {
        if (!p) return;
        p->~T();
        // storage is Node's first member, so the T's address is the node's.
        auto* const node = reinterpret_cast<Node*>(p);
        const auto idx = static_cast<std::uint32_t>(node - nodes_.data());

        std::uint64_t head = head_.load(std::memory_order_relaxed);
        do {
            node->next.store(index_of(head), std::memory_order_relaxed);
        } while (!head_.compare_exchange_weak(head, pack(idx, tag_of(head) + 1),
                                              std::memory_order_release,
                                              std::memory_order_relaxed));
    }

private:
    struct Node {
        alignas(T) std::byte storage[sizeof(T)];
        std::atomic<std::uint32_t> next;  // index of the next free node, or kNull
    };

    static constexpr std::uint32_t kNull = 0xFFFFFFFFu;  // "no node" sentinel

    static constexpr std::uint64_t pack(std::uint32_t index, std::uint32_t tag) noexcept {
        return (static_cast<std::uint64_t>(tag) << 32) | index;
    }
    static constexpr std::uint32_t index_of(std::uint64_t head) noexcept {
        return static_cast<std::uint32_t>(head);
    }
    static constexpr std::uint32_t tag_of(std::uint64_t head) noexcept {
        return static_cast<std::uint32_t>(head >> 32);
    }

    // The head is the contended word; give it a 64-byte-aligned start.
    alignas(64) std::atomic<std::uint64_t> head_;
    std::array<Node, N> nodes_;
};
// C++17 PMR — retrofit standard containers
std::pmr lets you use standard containers backed by a custom allocator:
#include <memory_resource>
#include <vector>
#include <string>
// Thread-local bump arena for per-request processing.
//
// Each thread gets its own 256 KiB buffer and a monotonic_buffer_resource
// layered over it. The upstream is null_memory_resource(), so once the buffer
// is used up further allocations throw std::bad_alloc instead of falling back
// to the heap.
class ThreadLocalArena {
    static constexpr size_t BUFFER_SIZE = 256 * 1024;
    static thread_local std::byte buffer_[BUFFER_SIZE];
    static thread_local std::pmr::monotonic_buffer_resource resource_;

public:
    // The calling thread's arena, for passing to std::pmr containers.
    static std::pmr::memory_resource* get() { return &resource_; }

    // Rewinds the calling thread's arena to the start of its buffer.
    // release() restores the resource to its state at construction, which is
    // what destroying and re-creating it in place achieved, without ending
    // the lifetime of a thread_local object by hand. Memory handed out before
    // the reset must no longer be used.
    static void reset() { resource_.release(); }
};

thread_local std::byte ThreadLocalArena::buffer_[ThreadLocalArena::BUFFER_SIZE];
thread_local std::pmr::monotonic_buffer_resource ThreadLocalArena::resource_{
    ThreadLocalArena::buffer_, ThreadLocalArena::BUFFER_SIZE,
    std::pmr::null_memory_resource()
};
// Now use pmr containers — no heap allocation
void processOrder(const RawOrder& raw) {
ThreadLocalArena::reset();
auto* mr = ThreadLocalArena::get();
// pmr::vector, pmr::string — backed by our arena
std::pmr::vector<Leg> legs(mr);
legs.reserve(8);
std::pmr::string symbol(raw.symbol, mr);
// ... all allocations from arena ...
}Pinning memory to prevent page faults
Page faults during live trading can cause multi-millisecond spikes:
#include <sys/mman.h>
// Maps `size` bytes of private anonymous memory, pre-faulted and pinned in RAM.
// Returns nullptr if either mmap() or mlock() fails; on an mlock() failure the
// mapping is released again first.
void* allocateLocked(size_t size) {
    constexpr int kProtection = PROT_READ | PROT_WRITE;
    constexpr int kMapFlags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_POPULATE;

    void* const region = mmap(nullptr, size, kProtection, kMapFlags, -1, 0);
    if (region == MAP_FAILED) return nullptr;

    // Pin the pages: no swapping, no page fault on later access.
    if (mlock(region, size) == 0) {
        // Write to every page now so nothing is left to fault in later.
        memset(region, 0, size);
        return region;
    }

    munmap(region, size);
    return nullptr;
}
// At startup — lock all critical pools
// Base addresses of the regions that initMemory() obtains from
// allocateLocked(). A member is nullptr if its allocateLocked() call failed.
// Sizes in the member comments are the ones initMemory() requests.
struct MemoryLayout {
void* order_pool; // 64MB
void* market_data; // 128MB
void* scratch; // 16MB
} g_mem; // single global instance, filled in by initMemory()
// Reserves and pins the three global regions recorded in g_mem.
// NOTE(review): the results are stored unchecked — allocateLocked() returns
// nullptr on failure, so callers should verify g_mem before use.
void initMemory() {
    constexpr size_t kMiB = 1024 * 1024;

    g_mem.order_pool  = allocateLocked(64 * kMiB);
    g_mem.market_data = allocateLocked(128 * kMiB);
    g_mem.scratch     = allocateLocked(16 * kMiB);
}
// Huge pages (2MB / 1GB)
Huge pages reduce TLB pressure — each TLB entry covers 2MB instead of 4KB:
// Linux transparent huge pages — hint to OS
// Map an ordinary anonymous region first, then ask for it to be backed by
// huge pages. Neither result is checked in this snippet: mmap() can return
// MAP_FAILED, and the madvise() return value is ignored.
void* p = mmap(nullptr, size, PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
madvise(p, size, MADV_HUGEPAGE);
// Or explicit huge pages (requires hugepages configured in /proc)
void* p = mmap(nullptr, size, PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
# Reserve 512 huge pages (2MB each = 1GB)
echo 512 | sudo tee /proc/sys/vm/nr_hugepages
Allocation audit — ensuring no hot-path allocs
// Override global new to catch unexpected allocations
#ifndef NDEBUG
// True while the *current thread* is inside a NoAllocGuard scope.
// Thread-local on purpose: the guard is meant for the hot-path thread, and a
// process-wide flag would also trap allocations made by unrelated threads
// (logging, housekeeping) while the hot path is running.
static thread_local bool g_alloc_forbidden = false;

// Replacement for the global allocation function. Traps if the calling thread
// has forbidden allocation; otherwise forwards to malloc().
// Overloads that do not route through this function (for example the
// aligned-new overloads) are not audited.
void* operator new(size_t size) {
    if (g_alloc_forbidden) {
        // Break into debugger or log + abort
        __builtin_trap();
    }
    // operator new must return a distinct non-null pointer even for a
    // zero-byte request, and must report failure by throwing, never by
    // returning nullptr.
    if (size == 0) size = 1;
    if (void* p = malloc(size)) return p;
    throw std::bad_alloc{};
}

// Matching deallocation function: memory obtained from malloc() above must be
// returned with free(). Deallocation is not trapped.
void operator delete(void* p) noexcept {
    free(p);
}

// In your hot-path thread
// Scope guard: forbids allocation on the current thread for its lifetime and
// restores the previous state on exit, so guards can be nested.
struct NoAllocGuard {
    NoAllocGuard() : previous_(g_alloc_forbidden) { g_alloc_forbidden = true; }
    ~NoAllocGuard() { g_alloc_forbidden = previous_; }

    NoAllocGuard(const NoAllocGuard&) = delete;
    NoAllocGuard& operator=(const NoAllocGuard&) = delete;

private:
    bool previous_;  // state to restore when this guard goes out of scope
};
// Hot-path loop run under a NoAllocGuard for its whole lifetime.
void hotPathThread() {
    NoAllocGuard guard;  // stays in force until the function returns
    for (;;) {
        if (!running) break;
        // A call to the replaced operator new from in here traps in debug builds.
        processMarketData();
    }
}
#endif