SIMD Game Math in C++

TL;DR

SIMD (Single Instruction Multiple Data) processes 4–8 floats in one instruction. For game math — transforming thousands of entities, frustum culling, physics broadphase — SIMD is 4–8× faster than scalar code. Use GLM's SIMD backend for convenience or raw intrinsics for maximum control.

Why game math needs SIMD

A typical frame might transform 10,000 entities:

cpp

Godbolt

Scalar:   10,000 × 4×4 matrix × vec4 = 640,000 multiplications
SSE:      Same work in ~160,000 instructions (4× throughput)
AVX2:     Same work in ~80,000 instructions (8× throughput)

SIMD is not micro-optimization here — it's the difference between 2ms and 0.25ms for bulk transforms.

GLM with SIMD

GLM auto-uses SSE/AVX when you define the right macros:

cpp

Godbolt

// Before any GLM include
#define GLM_FORCE_INTRINSICS    // enable SIMD
#define GLM_FORCE_AVX2          // target AVX2 (or GLM_FORCE_SSE42)
#define GLM_FORCE_INLINE        // force-inline hot functions
#define GLM_FORCE_ALIGNED_GENTYPES  // 16-byte aligned types

#include <glm/glm.hpp>
#include <glm/gtc/matrix_transform.hpp>

// Same API — SIMD happens inside GLM
glm::vec4 v{1.0f, 0.0f, 0.0f, 1.0f};
glm::mat4 m = glm::rotate(glm::mat4{1.0f}, angle, glm::vec3{0, 1, 0});
glm::vec4 result = m * v;  // uses SIMD internally

Raw SSE intrinsics

For critical paths where you need full control:

cpp

Godbolt

#include <immintrin.h>  // SSE2/SSE4/AVX2

// Dot product of two vec3s (padded to 16 bytes)
float dot_sse(const float* a, const float* b) {
    __m128 va = _mm_load_ps(a);  // requires 16-byte alignment
    __m128 vb = _mm_load_ps(b);
    __m128 mul = _mm_mul_ps(va, vb);
    // Horizontal add: mul[0]+mul[1]+mul[2]
    __m128 shuf = _mm_shuffle_ps(mul, mul, _MM_SHUFFLE(2, 3, 0, 1));
    __m128 sums = _mm_add_ps(mul, shuf);
    shuf = _mm_movehl_ps(shuf, sums);
    sums = _mm_add_ss(sums, shuf);
    return _mm_cvtss_f32(sums);
}

// Normalize 4 vec3s simultaneously (AoSoA layout)
void normalize4(float* __restrict__ xs, float* __restrict__ ys,
                float* __restrict__ zs) {
    __m128 vx = _mm_load_ps(xs);
    __m128 vy = _mm_load_ps(ys);
    __m128 vz = _mm_load_ps(zs);

    // len_sq = x*x + y*y + z*z
    __m128 len_sq = _mm_add_ps(_mm_add_ps(_mm_mul_ps(vx, vx),
                                           _mm_mul_ps(vy, vy)),
                                _mm_mul_ps(vz, vz));

    // rsqrt = 1 / sqrt(len_sq) — fast approximate
    __m128 rsqrt = _mm_rsqrt_ps(len_sq);

    _mm_store_ps(xs, _mm_mul_ps(vx, rsqrt));
    _mm_store_ps(ys, _mm_mul_ps(vy, rsqrt));
    _mm_store_ps(zs, _mm_mul_ps(vz, rsqrt));
}

SoA for batch transforms

The secret to SIMD efficiency: Structure of Arrays layout.

cpp

Godbolt

// AoS — BAD for SIMD: positions interleaved
struct Transform {
    float x, y, z, w;
    float rx, ry, rz, rw;
    float sx, sy, sz, sw;
};
std::vector<Transform> transforms;  // x y z w rx ry rz rw ...

// SoA — GOOD for SIMD: same component packed together
struct TransformSoA {
    std::vector<float> x, y, z;    // positions
    std::vector<float> qx, qy, qz, qw;  // rotations
    std::vector<float> sx, sy, sz;  // scales
};

Bulk world-space position update (SSE)

cpp

Godbolt

void updateWorldPositions(TransformSoA& transforms,
                           const float* parent_x, const float* parent_y,
                           const float* parent_z, size_t count) {
    // Process 4 entities per iteration
    size_t simd_count = count & ~3u;
    for (size_t i = 0; i < simd_count; i += 4) {
        __m128 px = _mm_load_ps(&parent_x[i]);
        __m128 py = _mm_load_ps(&parent_y[i]);
        __m128 pz = _mm_load_ps(&parent_z[i]);

        __m128 lx = _mm_load_ps(&transforms.x[i]);
        __m128 ly = _mm_load_ps(&transforms.y[i]);
        __m128 lz = _mm_load_ps(&transforms.z[i]);

        _mm_store_ps(&transforms.x[i], _mm_add_ps(px, lx));
        _mm_store_ps(&transforms.y[i], _mm_add_ps(py, ly));
        _mm_store_ps(&transforms.z[i], _mm_add_ps(pz, lz));
    }

    // Scalar tail
    for (size_t i = simd_count; i < count; ++i) {
        transforms.x[i] += parent_x[i];
        transforms.y[i] += parent_y[i];
        transforms.z[i] += parent_z[i];
    }
}

AVX2 — 8 floats at once

cpp

Godbolt

// Requires: -mavx2 or /arch:AVX2
void scaleVelocities_avx2(float* vx, float* vy, float* vz,
                            float drag, size_t count) {
    __m256 drag8 = _mm256_set1_ps(drag);  // broadcast scalar to all 8 lanes

    size_t i = 0;
    for (; i + 8 <= count; i += 8) {
        _mm256_store_ps(vx + i, _mm256_mul_ps(_mm256_load_ps(vx + i), drag8));
        _mm256_store_ps(vy + i, _mm256_mul_ps(_mm256_load_ps(vy + i), drag8));
        _mm256_store_ps(vz + i, _mm256_mul_ps(_mm256_load_ps(vz + i), drag8));
    }
    for (; i < count; ++i) {
        vx[i] *= drag; vy[i] *= drag; vz[i] *= drag;
    }
}

SIMD frustum culling

Cull 4 sphere bounding volumes against a frustum in one pass:

cpp

Godbolt

// Each frustum plane: ax + by + cz + d >= 0 means "inside"
struct FrustumPlane { float a, b, c, d; };

struct FrustumSoA {
    // 6 planes × {a,b,c,d}
    float a[6], b[6], c[6], d[6];
};

// Returns bitmask: bit i set if sphere i is visible
uint32_t cullSpheres_sse(const FrustumSoA& f, const float* cx,
                          const float* cy, const float* cz,
                          const float* radii) {
    __m128 x = _mm_load_ps(cx);
    __m128 y = _mm_load_ps(cy);
    __m128 z = _mm_load_ps(cz);
    __m128 r = _mm_load_ps(radii);

    __m128 inside = _mm_set1_ps(-1.0f);  // all bits set

    for (int p = 0; p < 6; ++p) {
        __m128 pa = _mm_set1_ps(f.a[p]);
        __m128 pb = _mm_set1_ps(f.b[p]);
        __m128 pc = _mm_set1_ps(f.c[p]);
        __m128 pd = _mm_set1_ps(f.d[p]);

        // dist = a*cx + b*cy + c*cz + d
        __m128 dist = _mm_add_ps(_mm_add_ps(_mm_add_ps(
            _mm_mul_ps(pa, x),
            _mm_mul_ps(pb, y)),
            _mm_mul_ps(pc, z)),
            pd);

        // Sphere is outside plane if dist < -radius
        __m128 outside = _mm_cmplt_ps(dist, _mm_sub_ps(_mm_setzero_ps(), r));
        inside = _mm_andnot_ps(outside, inside);
    }

    return static_cast<uint32_t>(_mm_movemask_ps(inside));
}

Auto-vectorization hints

The compiler can often vectorize for you — help it:

cpp

Godbolt

// Hint: restrict means no aliasing — allows auto-vectorize
void addVelocity(float* __restrict__ x, const float* __restrict__ vx,
                  float dt, size_t n) {
    for (size_t i = 0; i < n; ++i)
        x[i] += vx[i] * dt;  // compiler vectorizes this
}

// Aligned allocation — required for _mm_load_ps (unaligned = _mm_loadu_ps)
float* allocAligned(size_t count) {
    return static_cast<float*>(
        std::aligned_alloc(32, count * sizeof(float)));
}

// Or use aligned_storage in a struct
struct alignas(32) Vec8 { float v[8]; };

Checking vectorization

bash

# GCC/Clang — emit vectorization report
g++ -O2 -march=native -fopt-info-vec-optimized -fopt-info-vec-missed src.cpp

# Clang
clang++ -O2 -march=native -Rpass=loop-vectorize -Rpass-missed=loop-vectorize src.cpp

Portable SIMD with `std::experimental::simd` (C++23)

cpp

Godbolt

#include <experimental/simd>
namespace stdx = std::experimental;

void normalize_simd(float* x, float* y, float* z, size_t n) {
    using floatv = stdx::native_simd<float>;
    constexpr size_t W = floatv::size();

    for (size_t i = 0; i < n; i += W) {
        floatv vx(&x[i], stdx::element_aligned);
        floatv vy(&y[i], stdx::element_aligned);
        floatv vz(&z[i], stdx::element_aligned);

        floatv len = stdx::sqrt(vx*vx + vy*vy + vz*vz);
        vx /= len; vy /= len; vz /= len;

        vx.copy_to(&x[i], stdx::element_aligned);
        vy.copy_to(&y[i], stdx::element_aligned);
        vz.copy_to(&z[i], stdx::element_aligned);
    }
}

std::simd is standard C++23 and generates optimal code on all ISAs (SSE, AVX2, NEON) without intrinsics.