Skip to content
C++
Domain Track
Difficulty 3/5

Compiler Development in C++

Writing compilers and interpreters in C++ — lexing, parsing, AST design, visitor pattern, LLVM IR generation, Clang LibTooling, and JIT compilation.

TL;DR

C++ is the dominant language for compiler development. This page covers building language tools from scratch and using LLVM/Clang APIs for industrial-grade work.

cpp
// Classic pipeline
Source text
  → Lexer    → Token stream
  → Parser   → AST
  → Semantic → Typed AST
  → Codegen  → LLVM IR / bytecode / machine code

Lexer (Tokenizer)

cpp
enum class TokenKind {
    Number, Plus, Minus, Star, Slash,
    LParen, RParen, Eof, Invalid
};

struct Token {
    TokenKind kind;
    std::string_view text;
    int line, col;
};

class Lexer {
    std::string_view src_;
    size_t pos_ = 0;
    int line_ = 1, col_ = 1;

    char peek(int ahead = 0) const {
        return pos_ + ahead < src_.size() ? src_[pos_ + ahead] : '\0';
    }

    char advance() {
        char c = src_[pos_++];
        if (c == '\n') { ++line_; col_ = 1; } else ++col_;
        return c;
    }

public:
    explicit Lexer(std::string_view src) : src_(src) {}

    Token next() {
        // Skip whitespace
        while (pos_ < src_.size() && std::isspace(peek())) advance();
        if (pos_ >= src_.size()) return {TokenKind::Eof, {}, line_, col_};

        int start_line = line_, start_col = col_;
        size_t start = pos_;

        char c = advance();

        if (std::isdigit(c)) {
            while (std::isdigit(peek())) advance();
            if (peek() == '.' && std::isdigit(peek(1))) {
                advance();
                while (std::isdigit(peek())) advance();
            }
            return {TokenKind::Number, src_.substr(start, pos_ - start),
                    start_line, start_col};
        }
        switch (c) {
        case '+': return {TokenKind::Plus,   "+", start_line, start_col};
        case '-': return {TokenKind::Minus,  "-", start_line, start_col};
        case '*': return {TokenKind::Star,   "*", start_line, start_col};
        case '/': return {TokenKind::Slash,  "/", start_line, start_col};
        case '(': return {TokenKind::LParen, "(", start_line, start_col};
        case ')': return {TokenKind::RParen, ")", start_line, start_col};
        default:  return {TokenKind::Invalid, src_.substr(start,1), start_line, start_col};
        }
    }
};

AST Design

cpp
// Polymorphic AST with unique_ptr ownership
struct Expr {
    virtual ~Expr() = default;
};
using ExprPtr = std::unique_ptr<Expr>;

struct NumberExpr : Expr {
    double value;
    explicit NumberExpr(double v) : value(v) {}
};

struct BinaryExpr : Expr {
    char op;
    ExprPtr left, right;
    BinaryExpr(char op, ExprPtr l, ExprPtr r)
        : op(op), left(std::move(l)), right(std::move(r)) {}
};

struct UnaryExpr : Expr {
    char op;
    ExprPtr operand;
};

// Variant-based AST (no virtual, value semantics)
using ExprV = std::variant<
    struct NumberNode,
    struct BinaryNode,
    struct VariableNode
>;

struct NumberNode { double value; };
struct BinaryNode {
    char op;
    std::shared_ptr<ExprV> left, right;
};
struct VariableNode { std::string name; };

Recursive Descent Parser

cpp
class Parser {
    Lexer lexer_;
    Token current_;

    Token consume() {
        Token t = current_;
        current_ = lexer_.next();
        return t;
    }

    Token expect(TokenKind kind) {
        if (current_.kind != kind)
            throw std::runtime_error(
                std::format("expected {} at {}:{}", (int)kind, current_.line, current_.col));
        return consume();
    }

public:
    explicit Parser(std::string_view src) : lexer_(src) {
        current_ = lexer_.next();  // prime the pump
    }

    ExprPtr parse_expr() { return parse_additive(); }

private:
    ExprPtr parse_primary() {
        if (current_.kind == TokenKind::Number) {
            double val = std::stod(std::string(consume().text));
            return std::make_unique<NumberExpr>(val);
        }
        if (current_.kind == TokenKind::LParen) {
            consume();
            auto e = parse_expr();
            expect(TokenKind::RParen);
            return e;
        }
        throw std::runtime_error("expected expression");
    }

    ExprPtr parse_multiplicative() {
        auto left = parse_primary();
        while (current_.kind == TokenKind::Star || current_.kind == TokenKind::Slash) {
            char op = consume().text[0];
            left = std::make_unique<BinaryExpr>(op, std::move(left), parse_primary());
        }
        return left;
    }

    ExprPtr parse_additive() {
        auto left = parse_multiplicative();
        while (current_.kind == TokenKind::Plus || current_.kind == TokenKind::Minus) {
            char op = consume().text[0];
            left = std::make_unique<BinaryExpr>(op, std::move(left), parse_multiplicative());
        }
        return left;
    }
};

Visitor Pattern for AST Traversal

cpp
// Visitor interface
struct ExprVisitor {
    virtual ~ExprVisitor() = default;
    virtual void visit(NumberExpr&) = 0;
    virtual void visit(BinaryExpr&) = 0;
};

// Add accept() to AST nodes
struct Expr {
    virtual ~Expr() = default;
    virtual void accept(ExprVisitor& v) = 0;
};

struct NumberExpr : Expr {
    double value;
    void accept(ExprVisitor& v) override { v.visit(*this); }
};

struct BinaryExpr : Expr {
    char op; ExprPtr left, right;
    void accept(ExprVisitor& v) override { v.visit(*this); }
};

// Evaluator visitor
class Evaluator : public ExprVisitor {
    double result_;
public:
    double evaluate(Expr& e) { e.accept(*this); return result_; }

    void visit(NumberExpr& e) override { result_ = e.value; }
    void visit(BinaryExpr& e) override {
        double l = evaluate(*e.left);
        double r = evaluate(*e.right);
        switch (e.op) {
        case '+': result_ = l + r; break;
        case '-': result_ = l - r; break;
        case '*': result_ = l * r; break;
        case '/': result_ = l / r; break;
        }
    }
};

// std::visit with variant AST (no virtual)
double eval(const ExprV& node) {
    return std::visit(overloaded{
        [](const NumberNode& n) { return n.value; },
        [](const BinaryNode& b) {
            double l = eval(*b.left), r = eval(*b.right);
            switch (b.op) {
            case '+': return l + r;
            case '-': return l - r;
            case '*': return l * r;
            case '/': return l / r;
            default: throw std::runtime_error("bad op");
            }
        },
        [](const VariableNode&) -> double { throw std::runtime_error("no vars yet"); },
    }, node);
}

LLVM IR Generation

cpp
#include <llvm/IR/LLVMContext.h>
#include <llvm/IR/Module.h>
#include <llvm/IR/IRBuilder.h>

class LLVMCodegen : public ExprVisitor {
    llvm::LLVMContext ctx_;
    std::unique_ptr<llvm::Module> module_;
    llvm::IRBuilder<> builder_;
    llvm::Value* value_ = nullptr;

public:
    LLVMCodegen() : module_(std::make_unique<llvm::Module>("jit", ctx_)),
                    builder_(ctx_) {}

    llvm::Value* codegen(Expr& e) { e.accept(*this); return value_; }

    void visit(NumberExpr& e) override {
        value_ = llvm::ConstantFP::get(ctx_, llvm::APFloat(e.value));
    }

    void visit(BinaryExpr& e) override {
        llvm::Value* l = codegen(*e.left);
        llvm::Value* r = codegen(*e.right);
        switch (e.op) {
        case '+': value_ = builder_.CreateFAdd(l, r, "addtmp"); break;
        case '-': value_ = builder_.CreateFSub(l, r, "subtmp"); break;
        case '*': value_ = builder_.CreateFMul(l, r, "multmp"); break;
        case '/': value_ = builder_.CreateFDiv(l, r, "divtmp"); break;
        }
    }
};

Clang LibTooling

Use Clang's AST APIs to analyze or transform C++ source:

cpp
#include "clang/Tooling/CommonOptionsParser.h"
#include "clang/Tooling/Tooling.h"
#include "clang/ASTMatchers/ASTMatchers.h"
#include "clang/ASTMatchers/ASTMatchFinder.h"

using namespace clang;
using namespace clang::ast_matchers;

// Find all calls to a specific function
class FunctionCallFinder : public MatchFinder::MatchCallback {
public:
    void run(const MatchFinder::MatchResult& result) override {
        if (const auto* call = result.Nodes.getNodeAs<CallExpr>("call"))
            llvm::outs() << "Found call at line "
                         << result.SourceManager->getSpellingLineNumber(call->getBeginLoc())
                         << "\n";
    }
};

int main(int argc, const char** argv) {
    auto op = clang::tooling::CommonOptionsParser::create(argc, argv, MyCategory);
    clang::tooling::ClangTool tool(op->getCompilations(), op->getSourcePathList());

    FunctionCallFinder finder;
    MatchFinder match_finder;
    match_finder.addMatcher(
        callExpr(callee(functionDecl(hasName("malloc")))).bind("call"),
        &finder);

    return tool.run(clang::tooling::newFrontendActionFactory(&match_finder).get());
}

Symbol Table Design

cpp
template<typename Value>
class SymbolTable {
    std::vector<std::unordered_map<std::string, Value>> scopes_;

public:
    void push_scope() { scopes_.emplace_back(); }
    void pop_scope()  { scopes_.pop_back(); }

    void define(std::string name, Value val) {
        scopes_.back()[std::move(name)] = std::move(val);
    }

    std::optional<Value> lookup(const std::string& name) const {
        for (auto it = scopes_.rbegin(); it != scopes_.rend(); ++it) {
            auto found = it->find(name);
            if (found != it->end()) return found->second;
        }
        return {};
    }
};

// Usage in semantic analysis
struct Type { std::string name; bool is_const; };
SymbolTable<Type> sym;

sym.push_scope();    // function scope
sym.define("x", Type{"int", false});
sym.define("y", Type{"double", true});
auto t = sym.lookup("x");  // {int, false}
sym.pop_scope();

Key Libraries

LibraryPurposeLink
LLVM + ClangProduction compiler backend + frontendllvm.org
libclangC API for Clang AST (simpler than LibTooling)Part of LLVM
tree-sitterFast incremental parser (LSP use cases)tree-sitter.github.io
RE/flexFast lexer generatorgithub.com/Genivia/RE-flex
Bison/FlexClassic LALR parser + lexer generatorsgnu.org
ANTLR4LL(*) parser generator with C++ runtimeantlr.org
MLIRMulti-level IR (part of LLVM)mlir.llvm.org