LuaJIT internals and performance-engineering notes (bilingual).
Compare the original and the translation side by side.
"Measure, don't guess."
"The fastest code is code that doesn't run."
"Understand your hardware or it will humble you."
"用数据说话,而非猜测。"
"最快的代码是无需运行的代码。"
"了解你的硬件,否则它会让你受挫。"
// LuaJIT's key insight: trace hot paths, not methods
// A trace is a linear sequence of operations
// One compiled trace: a linear IR sequence plus its generated machine code.
typedef struct Trace {
uint32_t *mcode; // Generated machine code for this trace
IRIns *ir; // IR instructions (linear array, no internal control flow)
uint16_t nins; // Number of IR instructions
uint16_t nk; // Number of IR constants (stored separately from instructions)
SnapShot *snap; // Side-exit snapshots: interpreter state per guard
uint16_t nsnap; // Number of snapshots
// Link to next trace (used to close loops / chain side traces)
struct Trace *link;
} Trace;
// Recording: capture operations as they execute
// Recording: capture bytecode operations as they execute and append the
// equivalent IR to the current trace. J->slots maps Lua stack slots to
// IR references (TRefs) so later instructions can use recorded results.
void record_instruction(JitState *J, BCIns ins) {
    switch (bc_op(ins)) {
    case BC_ADDVN: {
        // Braced case scope: a declaration directly after a case label is
        // a constraint violation before C23, and without braces the
        // declaration's scope would leak into the following cases
        // (jumping over its initialization).
        // Record: result = slot[A] + constant[D]
        TRef tr = emitir(IR_ADD, J->slots[bc_a(ins)],
                         lj_ir_knum(J, bc_d(ins)));
        J->slots[bc_a(ins)] = tr;
        break;
    }
    // ... other bytecodes
    default:
        break; // unhandled opcodes record nothing (illustrative)
    }
}
// Key: traces are LINEAR
// No control flow in the trace itself
// Side exits handle divergence
// LuaJIT的核心洞见:追踪热点路径,而非方法
// 追踪是一系列线性的操作序列
// One compiled trace: linear IR plus machine code
// (translation of the Trace struct above).
typedef struct Trace {
uint32_t *mcode; // Generated machine code
IRIns *ir; // IR instructions
uint16_t nins; // Number of IR instructions
uint16_t nk; // Number of constants
SnapShot *snap; // Side-exit snapshots
uint16_t nsnap; // Number of snapshots
// Link to the next trace (used for loops)
struct Trace *link;
} Trace;
// 记录:捕获执行时的操作
// Recording: capture operations as they execute
// (translation of the record_instruction above).
void record_instruction(JitState *J, BCIns ins) {
    switch (bc_op(ins)) {
    case BC_ADDVN: {
        // Braced case scope: a declaration directly after a case label is
        // invalid before C23 and would leak scope into later cases.
        // Record: result = slot[A] + constant[D]
        TRef tr = emitir(IR_ADD, J->slots[bc_a(ins)],
                         lj_ir_knum(J, bc_d(ins)));
        J->slots[bc_a(ins)] = tr;
        break;
    }
    // ... other bytecodes
    default:
        break;
    }
}
// 核心:追踪是线性的
// 追踪本身无控制流
// 侧出口处理分支发散
// LuaJIT IR: compact, cache-friendly, no pointers
// One IR instruction: fixed 8-byte record, referenced by index, not pointer.
typedef struct IRIns {
uint16_t op; // Operation code + result type, packed together
uint16_t op1; // First operand (IR reference or stack slot)
uint16_t op2; // Second operand
uint16_t prev; // Previous instruction with the same opcode (CSE hash chain)
} IRIns; // 8 bytes, fits in cache line nicely
// IR references are indices, not pointers
// Enables: compact storage, easy serialization, cache efficiency
#define IRREF_BIAS 0x8000
#define irref_isk(r) ((r) < IRREF_BIAS) // Is constant?
// Constants stored separately, referenced by negative indices
// Instructions stored linearly, referenced by positive indices
// Example IR sequence for: x = a + b * c
// K001 NUM 3.14 -- constant
// 0001 SLOAD #1 -- load slot 1 (a)
// 0002 SLOAD #2 -- load slot 2 (b)
// 0003 SLOAD #3 -- load slot 3 (c)
// 0004 MUL 0002 0003 -- b * c
// 0005 ADD 0001 0004 -- a + (b * c)
// LuaJIT IR:紧凑、缓存友好、无指针
// One IR instruction (translation of IRIns above): fixed 8-byte record.
typedef struct IRIns {
uint16_t op; // Operation + type
uint16_t op1; // First operand (IR reference or slot)
uint16_t op2; // Second operand
uint16_t prev; // Previous instruction (for CSE chains)
} IRIns; // 8 bytes, fits a cache line nicely
// IR引用是索引,而非指针
// 支持:紧凑存储、易于序列化、缓存高效
#define IRREF_BIAS 0x8000
#define irref_isk(r) ((r) < IRREF_BIAS) // 是否为常量?
// 常量单独存储,通过负索引引用
// 指令线性存储,通过正索引引用
// 示例IR序列:x = a + b * c
// K001 NUM 3.14 -- 常量
// 0001 SLOAD #1 -- 加载槽位1(a)
// 0002 SLOAD #2 -- 加载槽位2(b)
// 0003 SLOAD #3 -- 加载槽位3(c)
// 0004 MUL 0002 0003 -- b * c
// 0005 ADD 0001 0004 -- a + (b * c)
// Traces assume types and values
// Guards verify assumptions, exit if wrong
// Emit a runtime type guard for the value referenced by tr.
// Traces speculate on types; the guard verifies the speculation at
// runtime and side-exits to the interpreter when it fails.
// NOTE(review): reads ir->t — assumes IRIns exposes a type tag `t`;
// the 8-byte IRIns shown in this file packs the type into `op`, so
// this field layout is illustrative — confirm against the real IRIns.
void emit_guard(JitState *J, IRType expected, TRef tr) {
IRIns *ir = &J->cur.ir[tref_ref(tr)];
if (ir->t != expected) {
// Emit type guard: IR_GUARD checks tr's runtime type against `expected`
emitir(IR_GUARD, tr, expected);
// Record a snapshot so the side exit can rebuild interpreter state
snapshot_add(J);
}
}
// Side exit: restore interpreter state, continue there
// Snapshot of the Lua-slot -> IR-value mapping at a guard, used to
// reconstruct the interpreter stack when the guard fails.
typedef struct SnapShot {
uint16_t ref; // First IR ref in snapshot
uint8_t nslots; // Number of slots to restore
uint8_t topslot; // Highest live slot number
uint32_t *map; // Slot -> IR ref mapping
} SnapShot;
// When guard fails:
// 1. Look up snapshot for this guard
// 2. Restore Lua stack from IR values
// 3. Jump back to interpreter
// 4. Maybe record a new trace from exit point
// 追踪会假设类型和值
// 守卫验证假设,不匹配则退出
// Emit a runtime type guard (translation of emit_guard above).
void emit_guard(JitState *J, IRType expected, TRef tr) {
IRIns *ir = &J->cur.ir[tref_ref(tr)];
if (ir->t != expected) {
// Emit the type guard
emitir(IR_GUARD, tr, expected);
// Record a snapshot for the side exit
snapshot_add(J);
}
}
// 侧出口:恢复解释器状态,在该点继续执行
// Snapshot for a side exit (translation of SnapShot above).
typedef struct SnapShot {
uint16_t ref; // First IR ref in the snapshot
uint8_t nslots; // Number of slots to restore
uint8_t topslot; // Top slot number
uint32_t *map; // Slot -> IR ref mapping
} SnapShot;
// 当守卫失败时:
// 1. 查找该守卫对应的快照
// 2. 从IR值恢复Lua栈
// 3. 跳转回解释器
// 4. 可能从退出点记录新的追踪
// LuaJIT generates assembly directly
// Every instruction chosen deliberately
// x86-64 code emission helpers
// Emit an "op r1, r2" register-register instruction for x86-64.
// Machine code is written back-to-front (*--as->mcp), so bytes must be
// stored in REVERSE instruction order: ModRM first, then the opcode,
// then any REX prefix last — yielding [REX] opcode ModRM in memory.
// (The original emitted REX first, which placed the prefix AFTER the
// ModRM byte in the finished instruction stream.)
static void emit_rr(ASMState *as, x86Op op, Reg r1, Reg r2) {
    // ModRM: mod=11 (register direct), reg=r1, rm=r2
    *--as->mcp = 0xc0 | ((r1 & 7) << 3) | (r2 & 7);
    *--as->mcp = op;
    // REX prefix only when an extended register (r8-r15) is involved:
    // REX.R (bit 2) extends the reg field, REX.B (bit 0) the rm field.
    if (r1 >= 8 || r2 >= 8) {
        *--as->mcp = 0x40 | (((r1 >> 3) & 1) << 2) | ((r2 >> 3) & 1);
    }
}
// Register allocation: linear scan, but smarter
// Allocate backwards from trace end for better results
// Register allocation over the trace IR: linear scan in REVERSE order.
// Walking from the last instruction back to the first means every use
// of a value has been seen before its definition is allocated, so
// lifetimes are fully known at the definition — better spill decisions.
void ra_allocate(ASMState *as) {
// Process IR in reverse order, from curins down to stopins.
// NOTE(review): if IRRef is an unsigned type and as->stopins can be 0,
// `ref >= as->stopins` never becomes false — confirm IRRef's signedness.
for (IRRef ref = as->curins; ref >= as->stopins; ref--) {
IRIns *ir = &as->ir[ref];
// Allocate (or reuse) the destination register for this result
Reg dest = ra_dest(as, ir);
// Then allocate registers for the source operands
ra_left(as, ir, dest);
ra_right(as, ir);
}
}
// Key insight: backwards allocation sees all uses
// Can make better spill decisions
// LuaJIT直接生成汇编代码
// 每条指令都经过精心选择
// x86-64代码生成助手
// Emit "op r1, r2" for x86-64 (translation of emit_rr above).
// Bytes are written back-to-front (*--as->mcp), so they must be stored
// in reverse instruction order: ModRM, opcode, then the REX prefix
// last, so memory ends up as [REX] opcode ModRM.
static void emit_rr(ASMState *as, x86Op op, Reg r1, Reg r2) {
    // ModRM: mod=11 (register direct), reg=r1, rm=r2
    *--as->mcp = 0xc0 | ((r1 & 7) << 3) | (r2 & 7);
    *--as->mcp = op;
    // REX prefix only when r8-r15 is involved (REX.R=bit2, REX.B=bit0)
    if (r1 >= 8 || r2 >= 8) {
        *--as->mcp = 0x40 | (((r1 >> 3) & 1) << 2) | ((r2 >> 3) & 1);
    }
}
// 寄存器分配:线性扫描,但更智能
// 从追踪末尾反向分配以获得更好的结果
// Register allocation: reverse linear scan (translation of ra_allocate above).
void ra_allocate(ASMState *as) {
// Process the IR in reverse order
for (IRRef ref = as->curins; ref >= as->stopins; ref--) {
IRIns *ir = &as->ir[ref];
// Allocate the destination register
Reg dest = ra_dest(as, ir);
// Allocate the source registers
ra_left(as, ir, dest);
ra_right(as, ir);
}
}
// 核心洞见:反向分配能看到所有使用场景
// 可以做出更优的溢出决策
// Cache-friendly data structures are critical
// BAD: Linked list of variable-size nodes — each step is a dependent
// pointer load, so traversal speed is bounded by memory latency.
struct Node {
struct Node *next; // forces pointer chasing on every step
int type; // tag selecting the active union member below
union {
double num;
struct String *str;
// ...
} value;
};
// GOOD: Separate arrays by type (SoA) — sequential, prefetch-friendly.
struct ValueArray {
uint8_t *types; // Type tags: sequential access
TValue *values; // Values: sequential access, same index as types
size_t count; // Number of elements in both arrays
};
// Iteration patterns matter enormously.
// Sequential SoA scan — claimed ~10x faster than pointer chasing.
// (Illustrative fragment: n, arr, sum, TYPE_NUMBER are assumed to be
// defined by the surrounding code.)
for (size_t i = 0; i < n; i++) {
if (arr->types[i] == TYPE_NUMBER) {
sum += arr->values[i].n;
}
}
// 缓存友好的数据结构至关重要
// 糟糕的设计:可变大小节点的链表
// BAD: a linked list of variable-size nodes (translation of the example above).
struct Node {
struct Node *next;
int type; // tag selecting the active union member
union {
double num;
struct String *str;
// ...
} value;
};
// GOOD: separate arrays by type, SoA (translation of the example above).
struct ValueArray {
uint8_t *types; // Type tags: sequential access
TValue *values; // Values: sequential access
size_t count; // Element count
};
// Iteration patterns matter enormously (translation of the loop above);
// this sequential scan is ~10x faster than pointer chasing.
for (size_t i = 0; i < n; i++) {
if (arr->types[i] == TYPE_NUMBER) {
sum += arr->values[i].n;
}
}
// Polymorphic inline cache for property access
// Avoids hash lookup in common case
// Per-call-site cache of one (object shape -> property slot offset) pair.
typedef struct InlineCache {
uint32_t shape_id; // Expected object shape id
uint16_t offset; // Cached property slot offset for that shape
uint16_t _pad; // Explicit padding keeps the struct at 8 bytes
} InlineCache;
// Property access through a polymorphic inline cache.
// The cache memoizes one (shape_id -> slot offset) pair per call site,
// turning the common-case property read into a compare plus an indexed
// load instead of a hash-table lookup.
TValue get_property_cached(Object *obj, String *key, InlineCache *ic) {
    if (!likely(obj->shape_id == ic->shape_id)) {
        // Slow path: full shape-table lookup, then refill the cache so
        // the next access with this shape takes the fast path.
        uint16_t slot = shape_lookup(obj->shape, key);
        ic->shape_id = obj->shape_id;
        ic->offset = slot;
        return obj->slots[slot];
    }
    // Fast path: cached shape matches — direct slot access.
    return obj->slots[ic->offset];
}
// Monomorphic: one shape, one offset
// Polymorphic: small set of shapes
// Megamorphic: too many shapes, fall back to hash
// 用于属性访问的多态内联缓存
// 常见场景下避免哈希查找
// Inline cache entry (translation of InlineCache above).
typedef struct InlineCache {
uint32_t shape_id; // Expected object shape id
uint16_t offset; // Cached property offset
uint16_t _pad; // Padding
} InlineCache;
// Cached property access (translation of get_property_cached above).
TValue get_property_cached(Object *obj, String *key, InlineCache *ic) {
// Fast path: the object's shape matches the cached shape
if (likely(obj->shape_id == ic->shape_id)) {
return obj->slots[ic->offset]; // Direct slot access!
}
// Slow path: full lookup, then update the cache for next time
uint16_t offset = shape_lookup(obj->shape, key);
ic->shape_id = obj->shape_id;
ic->offset = offset;
return obj->slots[offset];
}
// 单态:一种形状,一个偏移量
// 多态:少量形状集合
// 超态:形状过多,回退到哈希查找
// CPUs predict branches; help them be right
// BAD: Unpredictable branches in hot loop
// BAD: Unpredictable branch in a hot loop — a ~50%-taken, data-dependent
// branch defeats the predictor. (Illustrative fragments: data, n, sum,
// threshold, compare are assumed to be defined by the surrounding code.)
for (int i = 0; i < n; i++) {
if (data[i] > threshold) { // 50% taken = unpredictable
sum += data[i];
}
}
// GOOD: Branchless version — the comparison becomes an all-ones/all-zero
// mask, so there is no branch to mispredict.
for (int i = 0; i < n; i++) {
int mask = -(data[i] > threshold); // 0 or -1 (all bits set)
sum += data[i] & mask;
}
// GOOD: Sort first if possible — after sorting, the comparison outcome
// is monotone, so every branch becomes predictable.
qsort(data, n, sizeof(int), compare);
for (int i = 0; i < n && data[i] <= threshold; i++) {
// All branches now predictable
}
// Loop unrolling: reduce branch overhead.
// NOTE(review): handles multiples of 4 only — the remaining n % 4
// elements need a tail loop, omitted here.
for (int i = 0; i + 4 <= n; i += 4) {
sum += data[i];
sum += data[i + 1];
sum += data[i + 2];
sum += data[i + 3];
}
// CPU会预测分支;帮助CPU做出正确预测
// 糟糕的写法:热点循环中存在不可预测的分支
// BAD: unpredictable branch in a hot loop (translation of the examples above).
for (int i = 0; i < n; i++) {
if (data[i] > threshold) { // ~50% taken = unpredictable
sum += data[i];
}
}
// GOOD: branchless version
for (int i = 0; i < n; i++) {
int mask = -(data[i] > threshold); // 0 or -1
sum += data[i] & mask;
}
// GOOD: sort first if possible
qsort(data, n, sizeof(int), compare);
for (int i = 0; i < n && data[i] <= threshold; i++) {
// All branches are now predictable
}
// Loop unrolling: reduces branch overhead (tail elements not handled here)
for (int i = 0; i + 4 <= n; i += 4) {
sum += data[i];
sum += data[i + 1];
sum += data[i + 2];
sum += data[i + 3];
}
// Generate specialized code for each type combination
// LuaJIT specializes aggressively
// Generic add (slow)
// Generic add: full dynamic dispatch on both operand types (slow path).
// NOTE(review): as written, control can fall off the end when neither
// branch matches — the elided metamethod code presumably returns;
// reaching the end of a non-void function and using the result is UB.
TValue generic_add(TValue a, TValue b) {
if (tvisnum(a) && tvisnum(b)) {
return numV(numV(a) + numV(b)); // number + number
} else if (tvisstr(a) || tvisstr(b)) {
return concat(tostring(a), tostring(b)); // string coercion + concat
}
// ... metamethod lookup
}
// Specialized add for numbers (fast)
// Generated when trace shows both args are numbers
// Specialized add for the number/number case (fast path).
// Generated once trace recording has proven both operands are numbers;
// the preceding type guards make the raw addition safe.
double specialized_add_nn(double a, double b) {
    double sum = a + b; // compiles to a single addsd instruction
    return sum;
}
// Type guards ensure specialization is valid
// Side exit if types don't match expected
// 为每种类型组合生成专用代码
// LuaJIT会进行激进的专用化
// 通用加法(慢)
// Generic add, slow path (translation of generic_add above).
TValue generic_add(TValue a, TValue b) {
if (tvisnum(a) && tvisnum(b)) {
return numV(numV(a) + numV(b));
} else if (tvisstr(a) || tvisstr(b)) {
return concat(tostring(a), tostring(b));
}
// ... metamethod lookup
}
// 数字专用加法(快)
// 当追踪显示两个参数都是数字时生成
// Specialized number add (translation of specialized_add_nn above).
double specialized_add_nn(double a, double b) {
return a + b; // single instruction
}
// 类型守卫确保专用化有效
// 类型不匹配时触发侧出口
CPU Pipeline Awareness
══════════════════════════════════════════════════════════════
Latency (cycles) Operation
────────────────────────────────────────────────────────────
1 Register-to-register ALU
3-4 L1 cache hit
~12 L2 cache hit
~40 L3 cache hit
~200 Main memory
~10-20 Branch mispredict penalty
~100+ Page fault
Key insight: Memory is the bottleneck
Computation is nearly free by comparison
Optimize for memory access patterns first
CPU流水线感知
══════════════════════════════════════════════════════════════
延迟(周期数) 操作
────────────────────────────────────────────────────────────
1 寄存器到寄存器的ALU操作
3-4 L1缓存命中
~12 L2缓存命中
~40 L3缓存命中
~200 主内存访问
~10-20 分支预测错误惩罚
~100+ 页错误
核心洞见:内存是瓶颈
相比之下计算几乎是免费的
优先优化内存访问模式