Domain Track
Difficulty 3/5Compiler Development in C++
Writing compilers and interpreters in C++ — lexing, parsing, AST design, visitor pattern, LLVM IR generation, Clang LibTooling, and JIT compilation.
TL;DR
C++ is the dominant language for compiler development. This page covers building language tools from scratch and using LLVM/Clang APIs for industrial-grade work.
cpp
// Classic pipeline
Source text
→ Lexer → Token stream
→ Parser → AST
→ Semantic → Typed AST
→ Codegen → LLVM IR / bytecode / machine codeLexer (Tokenizer)
cpp
enum class TokenKind {
Number, Plus, Minus, Star, Slash,
LParen, RParen, Eof, Invalid
};
struct Token {
TokenKind kind;
std::string_view text;
int line, col;
};
class Lexer {
std::string_view src_;
size_t pos_ = 0;
int line_ = 1, col_ = 1;
char peek(int ahead = 0) const {
return pos_ + ahead < src_.size() ? src_[pos_ + ahead] : '\0';
}
char advance() {
char c = src_[pos_++];
if (c == '\n') { ++line_; col_ = 1; } else ++col_;
return c;
}
public:
explicit Lexer(std::string_view src) : src_(src) {}
Token next() {
// Skip whitespace
while (pos_ < src_.size() && std::isspace(peek())) advance();
if (pos_ >= src_.size()) return {TokenKind::Eof, {}, line_, col_};
int start_line = line_, start_col = col_;
size_t start = pos_;
char c = advance();
if (std::isdigit(c)) {
while (std::isdigit(peek())) advance();
if (peek() == '.' && std::isdigit(peek(1))) {
advance();
while (std::isdigit(peek())) advance();
}
return {TokenKind::Number, src_.substr(start, pos_ - start),
start_line, start_col};
}
switch (c) {
case '+': return {TokenKind::Plus, "+", start_line, start_col};
case '-': return {TokenKind::Minus, "-", start_line, start_col};
case '*': return {TokenKind::Star, "*", start_line, start_col};
case '/': return {TokenKind::Slash, "/", start_line, start_col};
case '(': return {TokenKind::LParen, "(", start_line, start_col};
case ')': return {TokenKind::RParen, ")", start_line, start_col};
default: return {TokenKind::Invalid, src_.substr(start,1), start_line, start_col};
}
}
};AST Design
cpp
// Polymorphic AST with unique_ptr ownership
struct Expr {
virtual ~Expr() = default;
};
using ExprPtr = std::unique_ptr<Expr>;
struct NumberExpr : Expr {
double value;
explicit NumberExpr(double v) : value(v) {}
};
struct BinaryExpr : Expr {
char op;
ExprPtr left, right;
BinaryExpr(char op, ExprPtr l, ExprPtr r)
: op(op), left(std::move(l)), right(std::move(r)) {}
};
struct UnaryExpr : Expr {
char op;
ExprPtr operand;
};
// Variant-based AST (no virtual, value semantics)
using ExprV = std::variant<
struct NumberNode,
struct BinaryNode,
struct VariableNode
>;
struct NumberNode { double value; };
struct BinaryNode {
char op;
std::shared_ptr<ExprV> left, right;
};
struct VariableNode { std::string name; };Recursive Descent Parser
cpp
class Parser {
Lexer lexer_;
Token current_;
Token consume() {
Token t = current_;
current_ = lexer_.next();
return t;
}
Token expect(TokenKind kind) {
if (current_.kind != kind)
throw std::runtime_error(
std::format("expected {} at {}:{}", (int)kind, current_.line, current_.col));
return consume();
}
public:
explicit Parser(std::string_view src) : lexer_(src) {
current_ = lexer_.next(); // prime the pump
}
ExprPtr parse_expr() { return parse_additive(); }
private:
ExprPtr parse_primary() {
if (current_.kind == TokenKind::Number) {
double val = std::stod(std::string(consume().text));
return std::make_unique<NumberExpr>(val);
}
if (current_.kind == TokenKind::LParen) {
consume();
auto e = parse_expr();
expect(TokenKind::RParen);
return e;
}
throw std::runtime_error("expected expression");
}
ExprPtr parse_multiplicative() {
auto left = parse_primary();
while (current_.kind == TokenKind::Star || current_.kind == TokenKind::Slash) {
char op = consume().text[0];
left = std::make_unique<BinaryExpr>(op, std::move(left), parse_primary());
}
return left;
}
ExprPtr parse_additive() {
auto left = parse_multiplicative();
while (current_.kind == TokenKind::Plus || current_.kind == TokenKind::Minus) {
char op = consume().text[0];
left = std::make_unique<BinaryExpr>(op, std::move(left), parse_multiplicative());
}
return left;
}
};Visitor Pattern for AST Traversal
cpp
// Visitor interface
struct ExprVisitor {
virtual ~ExprVisitor() = default;
virtual void visit(NumberExpr&) = 0;
virtual void visit(BinaryExpr&) = 0;
};
// Add accept() to AST nodes
struct Expr {
virtual ~Expr() = default;
virtual void accept(ExprVisitor& v) = 0;
};
struct NumberExpr : Expr {
double value;
void accept(ExprVisitor& v) override { v.visit(*this); }
};
struct BinaryExpr : Expr {
char op; ExprPtr left, right;
void accept(ExprVisitor& v) override { v.visit(*this); }
};
// Evaluator visitor
class Evaluator : public ExprVisitor {
double result_;
public:
double evaluate(Expr& e) { e.accept(*this); return result_; }
void visit(NumberExpr& e) override { result_ = e.value; }
void visit(BinaryExpr& e) override {
double l = evaluate(*e.left);
double r = evaluate(*e.right);
switch (e.op) {
case '+': result_ = l + r; break;
case '-': result_ = l - r; break;
case '*': result_ = l * r; break;
case '/': result_ = l / r; break;
}
}
};
// std::visit with variant AST (no virtual)
double eval(const ExprV& node) {
return std::visit(overloaded{
[](const NumberNode& n) { return n.value; },
[](const BinaryNode& b) {
double l = eval(*b.left), r = eval(*b.right);
switch (b.op) {
case '+': return l + r;
case '-': return l - r;
case '*': return l * r;
case '/': return l / r;
default: throw std::runtime_error("bad op");
}
},
[](const VariableNode&) -> double { throw std::runtime_error("no vars yet"); },
}, node);
}LLVM IR Generation
cpp
#include <llvm/IR/LLVMContext.h>
#include <llvm/IR/Module.h>
#include <llvm/IR/IRBuilder.h>
class LLVMCodegen : public ExprVisitor {
llvm::LLVMContext ctx_;
std::unique_ptr<llvm::Module> module_;
llvm::IRBuilder<> builder_;
llvm::Value* value_ = nullptr;
public:
LLVMCodegen() : module_(std::make_unique<llvm::Module>("jit", ctx_)),
builder_(ctx_) {}
llvm::Value* codegen(Expr& e) { e.accept(*this); return value_; }
void visit(NumberExpr& e) override {
value_ = llvm::ConstantFP::get(ctx_, llvm::APFloat(e.value));
}
void visit(BinaryExpr& e) override {
llvm::Value* l = codegen(*e.left);
llvm::Value* r = codegen(*e.right);
switch (e.op) {
case '+': value_ = builder_.CreateFAdd(l, r, "addtmp"); break;
case '-': value_ = builder_.CreateFSub(l, r, "subtmp"); break;
case '*': value_ = builder_.CreateFMul(l, r, "multmp"); break;
case '/': value_ = builder_.CreateFDiv(l, r, "divtmp"); break;
}
}
};Clang LibTooling
Use Clang's AST APIs to analyze or transform C++ source:
cpp
#include "clang/Tooling/CommonOptionsParser.h"
#include "clang/Tooling/Tooling.h"
#include "clang/ASTMatchers/ASTMatchers.h"
#include "clang/ASTMatchers/ASTMatchFinder.h"
using namespace clang;
using namespace clang::ast_matchers;
// Find all calls to a specific function
class FunctionCallFinder : public MatchFinder::MatchCallback {
public:
void run(const MatchFinder::MatchResult& result) override {
if (const auto* call = result.Nodes.getNodeAs<CallExpr>("call"))
llvm::outs() << "Found call at line "
<< result.SourceManager->getSpellingLineNumber(call->getBeginLoc())
<< "\n";
}
};
int main(int argc, const char** argv) {
auto op = clang::tooling::CommonOptionsParser::create(argc, argv, MyCategory);
clang::tooling::ClangTool tool(op->getCompilations(), op->getSourcePathList());
FunctionCallFinder finder;
MatchFinder match_finder;
match_finder.addMatcher(
callExpr(callee(functionDecl(hasName("malloc")))).bind("call"),
&finder);
return tool.run(clang::tooling::newFrontendActionFactory(&match_finder).get());
}Symbol Table Design
cpp
template<typename Value>
class SymbolTable {
std::vector<std::unordered_map<std::string, Value>> scopes_;
public:
void push_scope() { scopes_.emplace_back(); }
void pop_scope() { scopes_.pop_back(); }
void define(std::string name, Value val) {
scopes_.back()[std::move(name)] = std::move(val);
}
std::optional<Value> lookup(const std::string& name) const {
for (auto it = scopes_.rbegin(); it != scopes_.rend(); ++it) {
auto found = it->find(name);
if (found != it->end()) return found->second;
}
return {};
}
};
// Usage in semantic analysis
struct Type { std::string name; bool is_const; };
SymbolTable<Type> sym;
sym.push_scope(); // function scope
sym.define("x", Type{"int", false});
sym.define("y", Type{"double", true});
auto t = sym.lookup("x"); // {int, false}
sym.pop_scope();Key Libraries
| Library | Purpose | Link |
|---|---|---|
| LLVM + Clang | Production compiler backend + frontend | llvm.org |
| libclang | C API for Clang AST (simpler than LibTooling) | Part of LLVM |
| tree-sitter | Fast incremental parser (LSP use cases) | tree-sitter.github.io |
| RE/flex | Fast lexer generator | github.com/Genivia/RE-flex |
| Bison/Flex | Classic LALR parser + lexer generators | gnu.org |
| ANTLR4 | LL(*) parser generator with C++ runtime | antlr.org |
| MLIR | Multi-level IR (part of LLVM) | mlir.llvm.org |