Domain Deep-Dive
Expert
Bare-Metal C++ Startup & Linker Scripts
"ARM Cortex-M startup sequence in C++: vector tables, Reset_Handler, .data/.bss init, global constructors, and linker script anatomy."
TL;DR
On bare-metal, the C++ runtime (cstartup) initializes before main(): copies .data from flash to RAM, zeroes .bss, and calls global constructors. You must provide a vector table and Reset_Handler. Understand this sequence to debug startup failures and control init order.
Boot sequence (ARM Cortex-M)
cpp
Power on / Reset
↓
CPU loads Stack Pointer from vector_table[0]
CPU loads PC (program counter) from vector_table[1] = Reset_Handler
↓
Reset_Handler:
1. Set up stack (if needed)
2. Copy .data from flash to RAM
3. Zero .bss
4. Call __libc_init_array() → calls global constructors
5. Call main()
6. If main() returns → infinite loop or reset
Vector table
cpp
// startup.cpp
#include <cstdint>
// Exception and interrupt entry points. Every handler except Reset_Handler
// is declared as a weak alias of Default_Handler, so the application can
// replace any of them simply by defining a function with the same name.
extern "C" {
// Catch-all target for every handler that has not been overridden.
void Default_Handler() {
    for (;;) {
        // infinite loop — attach debugger to find which fault
    }
}

void Reset_Handler();

// Core exceptions
__attribute__((weak, alias("Default_Handler"))) void NMI_Handler();
__attribute__((weak, alias("Default_Handler"))) void HardFault_Handler();
__attribute__((weak, alias("Default_Handler"))) void MemManage_Handler();
__attribute__((weak, alias("Default_Handler"))) void BusFault_Handler();
__attribute__((weak, alias("Default_Handler"))) void UsageFault_Handler();
__attribute__((weak, alias("Default_Handler"))) void SVC_Handler();
__attribute__((weak, alias("Default_Handler"))) void PendSV_Handler();
__attribute__((weak, alias("Default_Handler"))) void SysTick_Handler();

// Device-specific IRQs
__attribute__((weak, alias("Default_Handler"))) void USART1_IRQHandler();
// ... more IRQs
}
// Linker provides these symbols.
// They are position markers defined in the linker script: only their
// addresses are meaningful, so always use &symbol and never read the value.
extern uint32_t _estack; // end (highest address) of the stack region; used as the initial SP
extern uint32_t _sidata; // start of .data in flash
extern uint32_t _sdata; // start of .data in RAM
extern uint32_t _edata; // end of .data in RAM
extern uint32_t _sbss; // start of .bss
extern uint32_t _ebss; // end of .bss
// Vector table — must be at 0x00000000 or remapped via VTOR.
//
// Entry [0] is the initial stack pointer; every following entry is the
// address of an exception or interrupt handler, indexed by exception number.
//
// `used` is required: a namespace-scope const object has internal linkage in
// C++, and nothing in the program refers to this array by name. Without the
// attribute the compiler is free to discard it before the linker script's
// KEEP(*(.isr_vector)) ever sees the section.
__attribute__((section(".isr_vector"), used))
const uint32_t vector_table[] = {
    reinterpret_cast<uint32_t>(&_estack),            // [0] Initial stack pointer
    reinterpret_cast<uint32_t>(Reset_Handler),       // [1] Reset
    reinterpret_cast<uint32_t>(NMI_Handler),         // [2] NMI
    reinterpret_cast<uint32_t>(HardFault_Handler),   // [3] Hard Fault
    reinterpret_cast<uint32_t>(MemManage_Handler),   // [4] MemManage
    reinterpret_cast<uint32_t>(BusFault_Handler),    // [5] BusFault
    reinterpret_cast<uint32_t>(UsageFault_Handler),  // [6] UsageFault
    0, 0, 0, 0,                                      // [7-10] Reserved
    reinterpret_cast<uint32_t>(SVC_Handler),         // [11] SVCall
    0, 0,                                            // [12-13] Reserved
    reinterpret_cast<uint32_t>(PendSV_Handler),      // [14] PendSV
    reinterpret_cast<uint32_t>(SysTick_Handler),     // [15] SysTick
    // External interrupts start at [16]
    reinterpret_cast<uint32_t>(USART1_IRQHandler),
    // ...
};
Reset_Handler — the C++ runtime init
cpp
extern "C" void Reset_Handler() {
    // Entry points supplied by the C library and the application.
    extern void __libc_init_array();
    extern int main();

    // 1. Initialise .data: copy its image word by word from flash into RAM.
    const uint32_t* flash_word = &_sidata;
    for (uint32_t* ram_word = &_sdata; ram_word < &_edata; ++ram_word, ++flash_word) {
        *ram_word = *flash_word;
    }

    // 2. Clear .bss word by word.
    for (uint32_t* bss_word = &_sbss; bss_word < &_ebss; ++bss_word) {
        *bss_word = 0;
    }

    // 3. Run C++ static initialisation.
    //    __libc_init_array() calls the functions listed in .preinit_array
    //    and .init_array, which include constructors of global C++ objects.
    __libc_init_array();

    // 4. Hand over to the application.
    main();

    // 5. Park the CPU if main() ever returns.
    for (;;) {
    }
}
Linker script
/* STM32F4xx example — 1MB flash, 192KB RAM in total (128KB main SRAM + 64KB CCM) */
/* Each region gives a name, access attributes, start address and size; output sections below are assigned to regions with "> NAME". */
MEMORY {
FLASH (rx) : ORIGIN = 0x08000000, LENGTH = 1024K
RAM (rwx) : ORIGIN = 0x20000000, LENGTH = 128K
CCMRAM(rwx) : ORIGIN = 0x10000000, LENGTH = 64K /* Core-Coupled Memory */
}
ENTRY(Reset_Handler)
SECTIONS {
/* Vector table — must be first in flash */
/* This is the first output section assigned to FLASH, so it lands at the FLASH origin. KEEP() prevents section garbage collection from discarding the table. */
.isr_vector : {
. = ALIGN(4);
KEEP(*(.isr_vector))
. = ALIGN(4);
} > FLASH
/* Code + read-only data */
.text : {
. = ALIGN(4);
*(.text)
*(.text*)
*(.rodata)
*(.rodata*)
/* C++ init/fini arrays */
/* Each array is bracketed by a start/end symbol pair. KEEP() protects the
   entries from section garbage collection. SORT() orders the suffixed
   .init_array.* / .fini_array.* input sections by name, and they are placed
   ahead of the remaining unsuffixed entries. */
. = ALIGN(4);
PROVIDE_HIDDEN(__preinit_array_start = .);
KEEP(*(.preinit_array*))
PROVIDE_HIDDEN(__preinit_array_end = .);
PROVIDE_HIDDEN(__init_array_start = .);
KEEP(*(SORT(.init_array.*)))
KEEP(*(.init_array*))
PROVIDE_HIDDEN(__init_array_end = .);
PROVIDE_HIDDEN(__fini_array_start = .);
KEEP(*(SORT(.fini_array.*)))
KEEP(*(.fini_array*))
PROVIDE_HIDDEN(__fini_array_end = .);
} > FLASH
/* .data init values in flash, loaded to RAM at startup */
/* "> RAM AT > FLASH": the section is linked to run at RAM addresses while its
   initial image is stored in FLASH. _sidata is the flash (load) address of
   that image; _sdata and _edata bound the RAM copy and are word-aligned. */
_sidata = LOADADDR(.data);
.data : {
. = ALIGN(4);
_sdata = .;
*(.data)
*(.data*)
. = ALIGN(4);
_edata = .;
} > RAM AT > FLASH
/* Zero-initialized data */
/* Occupies RAM only; _sbss and _ebss are the word-aligned bounds of the area. COMMON collects common symbols alongside the .bss input sections. */
.bss : {
. = ALIGN(4);
_sbss = .;
*(.bss)
*(.bss*)
*(COMMON)
. = ALIGN(4);
_ebss = .;
} > RAM
/* Stack: a fixed 8KB region reserved in RAM directly after .bss (not at the
   very top of RAM). _sstack is its lowest address and _estack its end;
   both are 8-byte aligned. NOLOAD: the region has no contents to load. */
.stack (NOLOAD) : {
. = ALIGN(8);
_sstack = .;
. = . + 0x2000; /* 8KB stack */
. = ALIGN(8);
_estack = .;
} > RAM
/* Discard C++ exception tables (if -fno-exceptions) */
/* Input sections listed under /DISCARD/ are dropped from the output image. */
/DISCARD/ : {
*(.eh_frame)
*(.ARM.extab)
*(.ARM.exidx)
}
}
Global constructors
Global C++ objects are constructed before main() via .init_array:
cpp
// These are constructed in linker-determined order before main()
// (assumes MyClass needs a runtime constructor call — TODO confirm against its definition)
static MyClass obj1; // constructor called pre-main
static std::array<int,10> a; // trivially constructed
// Problem: order of construction across translation units is unspecified
// If obj2 depends on obj1 being initialized first, it may fail
// Solution 1: Construct on first use (lazy init)
// Returns a reference to the single function-local MyClass instance.
// NOTE(review): the thread-safety claim below relies on the toolchain's
// static-initialization guards — confirm they are enabled in this build
// (e.g. not compiled with -fno-threadsafe-statics) and consider whether the
// first call can happen from an ISR.
MyClass& getObj() {
static MyClass obj; // constructed on first call, thread-safe in C++11
return obj;
}
// Solution 2: constexpr data — the value is fixed at compile time, so there
// is no runtime initializer whose order could matter.
constexpr std::array<int, 4> lookup{1, 2, 4, 8}; // in .rodata, no constructor
// Solution 3: Constant-initializable types — plain data members with
// default member initializers and no user-written constructor.
struct Config {
    int baud_rate{115200};
    int timeout_ms{100};
};
// Config has constexpr-compatible constructor → zero-cost init
Debugging startup failures
cpp
// HardFault_Handler with register dump
extern "C" void HardFault_Handler() {
    // Capture the fault status and fault address registers in volatile
    // locals so the values can be inspected once execution stops below.
    volatile uint32_t cfsr(SCB->CFSR);
    volatile uint32_t hfsr(SCB->HFSR);
    volatile uint32_t mmfar(SCB->MMFAR);
    volatile uint32_t bfar(SCB->BFAR);

    // If debugger is attached, break here
    __BKPT(0);

    // Otherwise stay put.
    for (;;) {
    }
}
// Check stack usage
// _sstack and _estack are the linker-script bounds of the stack region
// (lowest address and end). Only their addresses are used.
extern uint32_t _estack, _sstack;

// Prints a high-water-mark estimate of stack usage.
//
// Scans upward from the bottom of the stack region for the first word that
// no longer holds the 0xDEADBEEF fill pattern; everything from that word up
// to _estack is reported as used.
//
// Precondition: the stack region must have been filled with 0xDEADBEEF
// before it was used (typically early in startup). Without that fill the
// first word almost never matches and the whole stack is reported as used.
void checkStackUsage() {
    // Stack grows down — untouched pattern words remain at the low end.
    const uint32_t* p = &_sstack;
    while (p < &_estack && *p == 0xDEADBEEF)
        ++p;
    const uint32_t used = static_cast<uint32_t>((&_estack - p) * sizeof(uint32_t));
    // uint32_t is not `unsigned int` on every toolchain (it is `unsigned long`
    // with arm-none-eabi GCC), so convert explicitly to match %u.
    printf("Stack used: %u bytes\n", static_cast<unsigned>(used));
}
Placing code in CCMRAM
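Core-Coupled Memory (CCMRAM) is zero-wait-state RAM accessible only by the CPU core (no DMA access) — ideal for ISRs and hot paths. Caveat: on STM32F4 the CCM is data-only (it sits on the D-bus, so the core cannot fetch instructions from it) and is best used for stacks and hot data; executing code from CCM as shown below requires a family whose CCM is also reachable for instruction fetches, such as STM32F3 or STM32G4: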
cpp
// Place ISR in CCMRAM for lowest latency.
// The linkage specification has to come first: GCC and Clang reject an
// __attribute__ written in front of `extern "C"`.
extern "C" __attribute__((section(".ccmram")))
void USART1_IRQHandler() {
    // Single-cycle access from CCMRAM
    rx_queue.push(USART1->DR & 0xFF);
}
Add to linker script:
/* Code and initialized data placed in .ccmram run from CCMRAM but must be
   stored in FLASH. A NOLOAD section is never programmed or loaded, so any
   function placed in it would be garbage after reset.
   Startup code must copy [_sccmram, _eccmram) from _siccmram before the
   first use, in the same way it copies .data. */
_siccmram = LOADADDR(.ccmram);
.ccmram : {
    . = ALIGN(4);
    _sccmram = .;
    *(.ccmram)
    *(.ccmram*)
    . = ALIGN(4);
    _eccmram = .;
} > CCMRAM AT > FLASH
Edit on GitHub
Updated 2026-05-01T00:00:00.000Z