SIMD Game Math in C++
"Vectorized 3D math for games: SSE/AVX2 intrinsics, GLM's SIMD backend, SoA for batch transforms, and auto-vectorization hints."
TL;DR
SIMD (Single Instruction Multiple Data) processes 4–8 floats in one instruction. For game math — transforming thousands of entities, frustum culling, physics broadphase — SIMD is 4–8× faster than scalar code. Use GLM's SIMD backend for convenience or raw intrinsics for maximum control.
Why game math needs SIMD
A typical frame might transform 10,000 entities:
Scalar: 10,000 × 4×4 matrix × vec4 = 640,000 multiplications
SSE: Same work in ~160,000 instructions (4× throughput)
AVX2: Same work in ~80,000 instructions (8× throughput)SIMD is not micro-optimization here — it's the difference between 2ms and 0.25ms for bulk transforms.
GLM with SIMD
GLM auto-uses SSE/AVX when you define the right macros:
// Before any GLM include
#define GLM_FORCE_INTRINSICS // enable SIMD
#define GLM_FORCE_AVX2 // target AVX2 (or GLM_FORCE_SSE42)
#define GLM_FORCE_INLINE // force-inline hot functions
#define GLM_FORCE_ALIGNED_GENTYPES // 16-byte aligned types
#include <glm/glm.hpp>
#include <glm/gtc/matrix_transform.hpp>
// Same API — SIMD happens inside GLM
glm::vec4 v{1.0f, 0.0f, 0.0f, 1.0f};
glm::mat4 m = glm::rotate(glm::mat4{1.0f}, angle, glm::vec3{0, 1, 0});
glm::vec4 result = m * v; // uses SIMD internallyRaw SSE intrinsics
For critical paths where you need full control:
#include <immintrin.h> // SSE2/SSE4/AVX2
// Dot product of two vec3s (padded to 16 bytes)
float dot_sse(const float* a, const float* b) {
__m128 va = _mm_load_ps(a); // requires 16-byte alignment
__m128 vb = _mm_load_ps(b);
__m128 mul = _mm_mul_ps(va, vb);
// Horizontal add: mul[0]+mul[1]+mul[2]
__m128 shuf = _mm_shuffle_ps(mul, mul, _MM_SHUFFLE(2, 3, 0, 1));
__m128 sums = _mm_add_ps(mul, shuf);
shuf = _mm_movehl_ps(shuf, sums);
sums = _mm_add_ss(sums, shuf);
return _mm_cvtss_f32(sums);
}
// Normalize 4 vec3s simultaneously (AoSoA layout)
void normalize4(float* __restrict__ xs, float* __restrict__ ys,
float* __restrict__ zs) {
__m128 vx = _mm_load_ps(xs);
__m128 vy = _mm_load_ps(ys);
__m128 vz = _mm_load_ps(zs);
// len_sq = x*x + y*y + z*z
__m128 len_sq = _mm_add_ps(_mm_add_ps(_mm_mul_ps(vx, vx),
_mm_mul_ps(vy, vy)),
_mm_mul_ps(vz, vz));
// rsqrt = 1 / sqrt(len_sq) — fast approximate
__m128 rsqrt = _mm_rsqrt_ps(len_sq);
_mm_store_ps(xs, _mm_mul_ps(vx, rsqrt));
_mm_store_ps(ys, _mm_mul_ps(vy, rsqrt));
_mm_store_ps(zs, _mm_mul_ps(vz, rsqrt));
}SoA for batch transforms
The secret to SIMD efficiency: Structure of Arrays layout.
// AoS — BAD for SIMD: positions interleaved
struct Transform {
float x, y, z, w;
float rx, ry, rz, rw;
float sx, sy, sz, sw;
};
std::vector<Transform> transforms; // x y z w rx ry rz rw ...
// SoA — GOOD for SIMD: same component packed together
struct TransformSoA {
std::vector<float> x, y, z; // positions
std::vector<float> qx, qy, qz, qw; // rotations
std::vector<float> sx, sy, sz; // scales
};Bulk world-space position update (SSE)
void updateWorldPositions(TransformSoA& transforms,
const float* parent_x, const float* parent_y,
const float* parent_z, size_t count) {
// Process 4 entities per iteration
size_t simd_count = count & ~3u;
for (size_t i = 0; i < simd_count; i += 4) {
__m128 px = _mm_load_ps(&parent_x[i]);
__m128 py = _mm_load_ps(&parent_y[i]);
__m128 pz = _mm_load_ps(&parent_z[i]);
__m128 lx = _mm_load_ps(&transforms.x[i]);
__m128 ly = _mm_load_ps(&transforms.y[i]);
__m128 lz = _mm_load_ps(&transforms.z[i]);
_mm_store_ps(&transforms.x[i], _mm_add_ps(px, lx));
_mm_store_ps(&transforms.y[i], _mm_add_ps(py, ly));
_mm_store_ps(&transforms.z[i], _mm_add_ps(pz, lz));
}
// Scalar tail
for (size_t i = simd_count; i < count; ++i) {
transforms.x[i] += parent_x[i];
transforms.y[i] += parent_y[i];
transforms.z[i] += parent_z[i];
}
}AVX2 — 8 floats at once
// Requires: -mavx2 or /arch:AVX2
void scaleVelocities_avx2(float* vx, float* vy, float* vz,
float drag, size_t count) {
__m256 drag8 = _mm256_set1_ps(drag); // broadcast scalar to all 8 lanes
size_t i = 0;
for (; i + 8 <= count; i += 8) {
_mm256_store_ps(vx + i, _mm256_mul_ps(_mm256_load_ps(vx + i), drag8));
_mm256_store_ps(vy + i, _mm256_mul_ps(_mm256_load_ps(vy + i), drag8));
_mm256_store_ps(vz + i, _mm256_mul_ps(_mm256_load_ps(vz + i), drag8));
}
for (; i < count; ++i) {
vx[i] *= drag; vy[i] *= drag; vz[i] *= drag;
}
}SIMD frustum culling
Cull 4 sphere bounding volumes against a frustum in one pass:
// Each frustum plane: ax + by + cz + d >= 0 means "inside"
struct FrustumPlane { float a, b, c, d; };
struct FrustumSoA {
// 6 planes × {a,b,c,d}
float a[6], b[6], c[6], d[6];
};
// Returns bitmask: bit i set if sphere i is visible
uint32_t cullSpheres_sse(const FrustumSoA& f, const float* cx,
const float* cy, const float* cz,
const float* radii) {
__m128 x = _mm_load_ps(cx);
__m128 y = _mm_load_ps(cy);
__m128 z = _mm_load_ps(cz);
__m128 r = _mm_load_ps(radii);
__m128 inside = _mm_set1_ps(-1.0f); // all bits set
for (int p = 0; p < 6; ++p) {
__m128 pa = _mm_set1_ps(f.a[p]);
__m128 pb = _mm_set1_ps(f.b[p]);
__m128 pc = _mm_set1_ps(f.c[p]);
__m128 pd = _mm_set1_ps(f.d[p]);
// dist = a*cx + b*cy + c*cz + d
__m128 dist = _mm_add_ps(_mm_add_ps(_mm_add_ps(
_mm_mul_ps(pa, x),
_mm_mul_ps(pb, y)),
_mm_mul_ps(pc, z)),
pd);
// Sphere is outside plane if dist < -radius
__m128 outside = _mm_cmplt_ps(dist, _mm_sub_ps(_mm_setzero_ps(), r));
inside = _mm_andnot_ps(outside, inside);
}
return static_cast<uint32_t>(_mm_movemask_ps(inside));
}Auto-vectorization hints
The compiler can often vectorize for you — help it:
// Hint: restrict means no aliasing — allows auto-vectorize
void addVelocity(float* __restrict__ x, const float* __restrict__ vx,
float dt, size_t n) {
for (size_t i = 0; i < n; ++i)
x[i] += vx[i] * dt; // compiler vectorizes this
}
// Aligned allocation — required for _mm_load_ps (unaligned = _mm_loadu_ps)
float* allocAligned(size_t count) {
return static_cast<float*>(
std::aligned_alloc(32, count * sizeof(float)));
}
// Or use aligned_storage in a struct
struct alignas(32) Vec8 { float v[8]; };Checking vectorization
# GCC/Clang — emit vectorization report
g++ -O2 -march=native -fopt-info-vec-optimized -fopt-info-vec-missed src.cpp
# Clang
clang++ -O2 -march=native -Rpass=loop-vectorize -Rpass-missed=loop-vectorize src.cppPortable SIMD with std::experimental::simd (C++23)
#include <experimental/simd>
namespace stdx = std::experimental;
void normalize_simd(float* x, float* y, float* z, size_t n) {
using floatv = stdx::native_simd<float>;
constexpr size_t W = floatv::size();
for (size_t i = 0; i < n; i += W) {
floatv vx(&x[i], stdx::element_aligned);
floatv vy(&y[i], stdx::element_aligned);
floatv vz(&z[i], stdx::element_aligned);
floatv len = stdx::sqrt(vx*vx + vy*vy + vz*vz);
vx /= len; vy /= len; vz /= len;
vx.copy_to(&x[i], stdx::element_aligned);
vy.copy_to(&y[i], stdx::element_aligned);
vz.copy_to(&z[i], stdx::element_aligned);
}
}std::simd is standard C++23 and generates optimal code on all ISAs (SSE, AVX2, NEON) without intrinsics.