C++ in Embedded & IoT
"C++ for embedded systems: freestanding C++, MISRA-C++, RTOS integration (FreeRTOS, Zephyr), bare-metal patterns, deterministic timing, and zero-overhead abstractions."
C++ on microcontrollers
Modern C++ (C++14/17/20) is fully usable on bare-metal microcontrollers — ARM Cortex-M, RISC-V, AVR, ESP32. The key insight: most of C++ has zero overhead. Templates, constexpr, RAII, and inline functions generate identical machine code to their C equivalents when optimized. The overhead comes from specific features: exceptions, RTTI, dynamic allocation.
Freestanding C++
A freestanding implementation provides only a minimal subset of the standard library — no dynamic memory, no filesystem, no threads, no I/O streams. Most embedded compilers target freestanding mode.
// Available in freestanding C++
#include <cstdint> // uint8_t, uint32_t, ...
#include <cstddef> // size_t, ptrdiff_t, nullptr_t
#include <type_traits> // all type trait templates
#include <limits> // std::numeric_limits
#include <bit> // std::bit_cast, popcount (C++20)
#include <array> // std::array (formally freestanding only from C++26; most embedded toolchains ship it anyway)
#include <span> // std::span (C++20; formally freestanding from C++26)
// NOT available without hosted stdlib:
// #include <vector> // dynamic memory
// #include <string> // dynamic memory
// #include <iostream> // file I/O, streams
// #include <thread> // OS threads
Disable exceptions and RTTI
# CMakeLists.txt for embedded
target_compile_options(firmware PRIVATE
    -fno-exceptions                                    # disables exception handling code
    # The next two flags are C++-only. Restrict them to C++ sources so that C
    # files in the same target (e.g. an RTOS kernel) do not trigger
    # "valid for C++ but not for C" warnings.
    $<$<COMPILE_LANGUAGE:CXX>:-fno-rtti>               # disables typeid and dynamic_cast
    $<$<COMPILE_LANGUAGE:CXX>:-fno-threadsafe-statics> # no mutex for static local init
    -Os                                                # optimize for size
)
Zero-overhead abstractions for hardware
Register abstraction with constexpr
// Model a memory-mapped peripheral register whose address is a compile-time
// constant. All members are static, so no object is ever instantiated.
//
// Address: absolute address of the register.
// T:       access width of the register (defaults to 32 bits).
//
// NOTE: set() and clear() are read-modify-write sequences, not atomic
// operations. If an interrupt handler touches the same register, protect the
// call with a critical section.
template<std::uintptr_t Address, typename T = uint32_t>
struct Register {
    // Volatile reference to the register, so every access reaches the hardware.
    static volatile T& ref() noexcept {
        return *reinterpret_cast<volatile T*>(Address);
    }

    // Read the current register value.
    static T read() noexcept { return ref(); }

    // Overwrite the whole register with val.
    static void write(T val) noexcept { ref() = val; }

    // Set every bit that is 1 in mask. The read and the write are spelled out
    // separately: compound assignment (|=) on a volatile lvalue is deprecated
    // in C++20 and draws -Wvolatile / -Wdeprecated-volatile warnings.
    static void set(T mask) noexcept {
        write(static_cast<T>(read() | mask));
    }

    // Clear every bit that is 1 in mask. ~mask is promoted to int when T is
    // narrower than int, so the result is cast back to T explicitly.
    static void clear(T mask) noexcept {
        write(static_cast<T>(read() & static_cast<T>(~mask)));
    }
};
// STM32F4 GPIOA ODR (Output Data Register)
using GPIOA_ODR = Register<0x40020014>;
GPIOA_ODR::set(1 << 5); // set pin PA5
GPIOA_ODR::clear(1 << 5); // clear pin PA5
// Generates: LDR/ORR/STR — same as hand-written C
GPIO abstraction without virtual dispatch
// Compile-time GPIO pin for an STM32F4-style port: every call is a static
// function that inlines to a single load or store.
//
// Port: base address of the GPIO port (e.g. 0x40020000 for GPIOA).
// Pin:  pin number within the port, 0-15.
//
// Register offsets used (STM32F4 GPIO register map):
//   0x10 IDR  — input data register
//   0x18 BSRR — bit set/reset register: writing 1 to bits 0-15 sets the pin,
//               writing 1 to bits 16-31 resets it, zeros are ignored.
//
// set()/clear() go through BSRR so that only this pin changes, atomically and
// without a read-modify-write. Storing `mask` to ODR (0x14) would drive every
// other pin of the port low, and storing `mask` to BSRR sets the pin rather
// than clearing it.
template<uint32_t Port, uint8_t Pin>
struct GPIO {
    static_assert(Pin < 16, "a GPIO port has 16 pins (0-15)");

    static constexpr uint32_t mask = 1u << Pin;          // set bit in BSRR / data bit in IDR
    static constexpr uint32_t reset_mask = mask << 16;   // reset bit in BSRR
    static constexpr uint32_t idr_offset = 0x10;
    static constexpr uint32_t bsrr_offset = 0x18;

    // Drive the pin high; other pins are untouched.
    static void set() noexcept {
        reg(bsrr_offset) = mask;
    }

    // Drive the pin low; other pins are untouched.
    static void clear() noexcept {
        reg(bsrr_offset) = reset_mask;
    }

    // Return true when the pin's input level is high.
    static bool read() noexcept {
        return (reg(idr_offset) & mask) != 0;
    }

private:
    // Volatile reference to the port register at the given byte offset.
    static volatile uint32_t& reg(uint32_t offset) noexcept {
        return *reinterpret_cast<volatile uint32_t*>(
            static_cast<std::uintptr_t>(Port) + offset);
    }
};
// Usage — all calls inlined, zero overhead
using LED = GPIO<0x40020000, 5>; // GPIOA Pin 5
LED::set();
LED::clear();
RAII on bare metal
RAII works perfectly without the heap:
// Interrupt guard — disables interrupts for the lifetime of the object.
//
// The constructor saves the value returned by __get_PRIMASK() and then calls
// __disable_irq(). The destructor calls __enable_irq() only when the saved
// value was zero, i.e. interrupts were not already masked on entry. Guards can
// therefore be nested without the inner one re-enabling interrupts early.
// (__get_PRIMASK / __disable_irq / __enable_irq are expected to come from the
// CMSIS-Core header of the target device.)
class CriticalSection {
public:
    CriticalSection() noexcept : saved_primask_(__get_PRIMASK()) {
        __disable_irq();
    }

    ~CriticalSection() noexcept {
        if (saved_primask_ == 0u) {
            __enable_irq();
        }
    }

    // A guard represents one save/restore pair. Copying or assigning it would
    // duplicate or overwrite the saved state, so both are disabled.
    CriticalSection(const CriticalSection&) = delete;
    CriticalSection& operator=(const CriticalSection&) = delete;

private:
    uint32_t saved_primask_;  // PRIMASK value observed before masking
};
void update_shared_state() {
CriticalSection cs; // interrupts disabled here
shared_data_.value = compute();
// cs destructor re-enables interrupts automatically
} // even on early return (or on an exception, if exceptions are enabled)
Static memory — no heap required
// Static buffer instead of std::vector: a fixed-capacity sequence of up to N
// elements stored inside the object itself, so it never touches the heap.
// Requires <cstddef> (std::byte, std::size_t) and <new> (placement new,
// std::launder).
//
// Elements are constructed on push_back() and destroyed when the container is
// destroyed or assigned over, so T may own resources. Copying copies element by
// element instead of duplicating raw bytes.
template<typename T, std::size_t N>
class StaticVector {
    alignas(T) std::byte storage_[sizeof(T) * N];
    std::size_t size_ = 0;

    // Address of slot i; the slot may or may not hold a live object.
    T* slot(std::size_t i) noexcept {
        return reinterpret_cast<T*>(storage_ + i * sizeof(T));
    }
    const T* slot(std::size_t i) const noexcept {
        return reinterpret_cast<const T*>(storage_ + i * sizeof(T));
    }

    // Destroy all live elements, last to first, leaving the container empty.
    void destroy_all() noexcept {
        while (size_ > 0) {
            --size_;
            std::launder(slot(size_))->~T();
        }
    }

    // Append a copy of every element of other; *this must be empty.
    void copy_from(const StaticVector& other) noexcept {
        for (std::size_t i = 0; i < other.size_; ++i) {
            push_back(other[i]);
        }
    }

public:
    StaticVector() noexcept = default;

    StaticVector(const StaticVector& other) noexcept { copy_from(other); }

    StaticVector& operator=(const StaticVector& other) noexcept {
        if (this != &other) {
            destroy_all();
            copy_from(other);
        }
        return *this;
    }

    ~StaticVector() { destroy_all(); }

    // Append a copy of val. Returns false, and changes nothing, when full.
    bool push_back(const T& val) noexcept {
        if (size_ >= N) return false;
        // T(val) rather than T{val}, so an initializer_list constructor of T
        // is never selected by accident. ::new bypasses any class-specific
        // operator new.
        ::new (static_cast<void*>(slot(size_))) T(val);
        ++size_;
        return true;
    }

    // Unchecked element access; i must be less than size().
    T& operator[](std::size_t i) noexcept {
        return *std::launder(slot(i));
    }
    const T& operator[](std::size_t i) const noexcept {
        return *std::launder(slot(i));
    }

    std::size_t size() const noexcept { return size_; }
    std::size_t capacity() const noexcept { return N; }
};
StaticVector<SensorReading, 64> readings;
readings.push_back({timestamp, value});
RTOS integration
FreeRTOS with C++ tasks
#include "FreeRTOS.h"
#include "task.h"
// Wrap a FreeRTOS task in a C++ class. The task control block and the stack
// are members of the object, so creating the task needs no heap.
//
// The kernel receives `this` as the task parameter and keeps pointers to
// stack_ and task_buffer_. The object must therefore outlive the task and must
// not be copied or moved.
class SensorTask {
    // Stack size in StackType_t words. Used both for the buffer and for the
    // depth passed to xTaskCreateStatic, so the two can never disagree.
    static constexpr uint32_t kStackDepthWords = 256;

    TaskHandle_t handle_{nullptr};
    StaticTask_t task_buffer_;
    StackType_t stack_[kStackDepthWords];

public:
    SensorTask() = default;
    SensorTask(const SensorTask&) = delete;
    SensorTask& operator=(const SensorTask&) = delete;

    // Create the task at the given priority. Calling start() again after the
    // task exists is a no-op: re-creating it would reinitialize buffers that
    // the running task is still using.
    void start(UBaseType_t priority) {
        if (handle_ != nullptr) {
            return;
        }
        handle_ = xTaskCreateStatic(
            &SensorTask::task_func,
            "Sensor",
            kStackDepthWords,
            this,            // handed back to task_func as `param`
            priority,
            stack_,
            &task_buffer_
        );
    }

    // Task body: wakes every 10 ms and reads the sensor. Never returns.
    void run() {
        TickType_t last_wake = xTaskGetTickCount();
        for (;;) {
            // vTaskDelayUntil measures from the previous wake time, so the
            // period does not drift with the time spent in read_sensor().
            vTaskDelayUntil(&last_wake, pdMS_TO_TICKS(10));
            read_sensor();
        }
    }

private:
    // C-compatible entry point: recovers the object and enters run().
    static void task_func(void* param) {
        static_cast<SensorTask*>(param)->run();
    }

    void read_sensor() {
        // ADC read, filter, enqueue
    }
};
// Static allocation — no heap needed for the task itself
SensorTask sensor_task;
Thread-safe queue between tasks
#include "queue.h"
// Typed wrapper around a statically allocated FreeRTOS queue holding up to N
// items of type T. Requires <cstddef> and <type_traits>.
//
// FreeRTOS queues copy items by value as raw bytes (sizeof(T) per item), so T
// must be trivially copyable. The kernel keeps pointers to storage_ and
// queue_def_, so the object must not be copied or moved.
//
// send()/receive() use the task-level API; they are not for use inside an ISR.
template<typename T, std::size_t N>
class RTOSQueue {
    static_assert(std::is_trivially_copyable<T>::value,
                  "RTOSQueue items are copied bytewise by the kernel");
    static_assert(N > 0, "RTOSQueue needs a capacity of at least one item");

    QueueHandle_t handle_;
    StaticQueue_t queue_def_;
    uint8_t storage_[N * sizeof(T)];

public:
    RTOSQueue() {
        handle_ = xQueueCreateStatic(N, sizeof(T), storage_, &queue_def_);
    }

    RTOSQueue(const RTOSQueue&) = delete;
    RTOSQueue& operator=(const RTOSQueue&) = delete;

    // Copy item into the queue, waiting up to `timeout` ticks for free space.
    // Returns true on success, false if the queue stayed full.
    bool send(const T& item, TickType_t timeout = 0) noexcept {
        return xQueueSend(handle_, &item, timeout) == pdTRUE;
    }

    // Copy the oldest queued item into `item`, waiting up to `timeout` ticks.
    // Returns true on success, false if nothing arrived in time.
    bool receive(T& item, TickType_t timeout = portMAX_DELAY) noexcept {
        return xQueueReceive(handle_, &item, timeout) == pdTRUE;
    }
};
RTOSQueue<SensorReading, 16> sensor_queue;
MISRA-C++ guidelines
MISRA-C++ (2023 edition) is a subset of C++ for safety-critical systems (automotive, aerospace, medical). Key rules:
| Rule | Rationale |
|---|---|
| No dynamic memory allocation after init | Heap fragmentation, non-deterministic timing |
| No recursion | Stack depth unpredictable |
| No exceptions | Overhead, non-determinism |
| No RTTI | Code size, overhead |
| Limited use of templates | Complexity, code bloat |
| All paths must return a value | Undefined behavior if not |
| No implicit integral conversions | Width-dependent bugs |
// MISRA compliant pattern
// Explicit types, no implicit conversion
uint32_t read_adc() noexcept; // noexcept — no exceptions
// Explicit cast where widening occurs
uint32_t raw = read_adc();
uint64_t accumulated = static_cast<uint64_t>(raw) + prev_sum;
// No VLAs, no alloca — use fixed-size arrays
void process(uint8_t data[32], uint8_t len) noexcept;
// Better with std::span (C++20; note that MISRA C++:2023 itself targets C++17):
void process(std::span<const uint8_t, 32> data) noexcept;
Linker script essentials
/* Typical ARM Cortex-M linker script (simplified) */
MEMORY {
    FLASH (rx)  : ORIGIN = 0x08000000, LENGTH = 256K
    SRAM  (rwx) : ORIGIN = 0x20000000, LENGTH = 64K
}
SECTIONS {
    .text : { *(.text*) *(.rodata*) } > FLASH

    /* Constructor tables walked by __libc_init_array(). KEEP stops
       --gc-sections from discarding them; the start/end symbols are what the
       C library iterates over. */
    .preinit_array : {
        PROVIDE_HIDDEN(__preinit_array_start = .);
        KEEP(*(.preinit_array*))
        PROVIDE_HIDDEN(__preinit_array_end = .);
    } > FLASH
    .init_array : {
        PROVIDE_HIDDEN(__init_array_start = .);
        KEEP(*(SORT(.init_array.*)))
        KEEP(*(.init_array*))
        PROVIDE_HIDDEN(__init_array_end = .);
    } > FLASH

    .data : {
        _data_start = .;
        *(.data*)
        _data_end = .;
    } > SRAM AT > FLASH /* stored in FLASH, runs from SRAM */
    /* FLASH address of the .data initializers: the source of the startup copy */
    _data_load = LOADADDR(.data);

    .bss : {
        _bss_start = .;
        *(.bss*)
        *(COMMON)
        _bss_end = .;
    } > SRAM
    _stack_top = ORIGIN(SRAM) + LENGTH(SRAM);
}
C++ startup must call global constructors (.init_array) before main():
// startup.cpp — reset entry point, runs before main
extern "C" void __libc_init_array(); // C library routine that calls global constructors
int main();                          // declared here so the call below compiles

// Entry point taken on reset. extern "C" keeps the symbol name unmangled so
// the vector table can refer to it.
extern "C" void Reset_Handler() {
    // 1. Copy .data from FLASH to SRAM   (elided in this snippet)
    // 2. Zero .bss                       (elided in this snippet)
    // Both must happen before any C++ code runs: constructors and main() read
    // initialized and zero-initialized globals.

    // 3. Call global constructors
    __libc_init_array();

    // 4. Call main
    main();

    // 5. Loop forever (never return from main on embedded)
    for (;;) {}
}
Size optimization
# Optimize for size
-Os # size-optimized (similar to -O2 but avoids size-increasing opts)
-Oz # (Clang) even more aggressive size reduction
# Strip debug symbols for release
-s # strip all symbols
objcopy --strip-debug firmware.elf firmware_stripped.elf
# Check binary size breakdown
size firmware.elf # text/data/bss breakdown
nm --size-sort firmware.elf | tail -20 # largest symbols
# Link-time optimization — removes dead code across TUs
-flto
Toolchains
| Target | Toolchain |
|---|---|
| ARM Cortex-M (STM32, NXP) | arm-none-eabi-gcc / arm-none-eabi-g++ |
| RISC-V | riscv32-unknown-elf-g++ |
| AVR (Arduino) | avr-g++ |
| ESP32 (Xtensa) | ESP-IDF with xtensa-esp32-elf-g++ |
| ESP32-C3 (RISC-V) | ESP-IDF with riscv32-esp-elf-g++ |
| Nordic nRF5x | arm-none-eabi-g++ + nRF SDK / Zephyr |