Loading...
Loading...
Compare original and translation side by side
| Workload | Approach | Section |
|---|---|---|
| Training loop | Manual timing with warmup (`time.perf_counter()` + `torch.cuda.synchronize()`) | Loop Workloads — Manual Timing |
| Single kernel or op | Write CUDA event benchmark (pre-allocate, warmup, event pairs) | Non-Loop Workloads — CUDA Event Benchmarking |
| Add timeline labels for nsys | Use NVTX annotations | NVTX Reference |
| 工作负载 | 方法 | 章节 |
|---|---|---|
| 训练循环 | 带预热的手动计时(`time.perf_counter()` + `torch.cuda.synchronize()`) | 循环类工作负载——手动计时 |
| 单个内核或算子 | 编写CUDA事件基准测试代码(预分配、预热、事件对) | 非循环类工作负载——CUDA事件基准测试 |
| 为nsys添加时间线标签 | 使用NVTX注解 | NVTX参考 |
`time.perf_counter()` `torch.cuda.synchronize()` `cute.compile()` `time.perf_counter()` `torch.cuda.synchronize()` `cute.compile()` `torch.cuda.synchronize()` `time.perf_counter()` `torch.cuda.synchronize()` `time.perf_counter()`
import time
import statistics

import torch

WARMUP = 5
NUM_ITERS = 30
BATCH_SIZE = 128  # global batch size for throughput calculation

iter_times = []  # per-iteration wall time in ms, post-warmup only
data_times = []  # ms between the previous iteration's end and this batch arriving
prev_iter_end = None  # end timestamp of the previous iteration; None before the first

for i, batch in enumerate(dataloader):
    if i >= WARMUP + NUM_ITERS:
        break
    # The dataloader has just handed over `batch`; everything since the
    # previous iteration ended is counted as data time.
    t_data_end = time.perf_counter()

    # Synchronize before starting the clock so earlier queued GPU work is
    # not charged to this iteration.
    torch.cuda.synchronize()
    t_start = time.perf_counter()

    # ... existing training loop body ...

    # Synchronize again so GPU work launched by this iteration has finished
    # before the clock stops.
    torch.cuda.synchronize()
    t_end = time.perf_counter()

    if i >= WARMUP:
        iter_ms = (t_end - t_start) * 1000
        iter_times.append(iter_ms)
        if prev_iter_end is not None:
            data_times.append((t_data_end - prev_iter_end) * 1000)
        print(f"[{i:04d}]: iter {iter_ms:.2f} ms, fps {BATCH_SIZE / (iter_ms / 1000):.2f}")
    prev_iter_end = t_end

if iter_times:
    mean_iter_ms = statistics.mean(iter_times)
    print(f"Average: iter {mean_iter_ms:.2f} ms, "
          f"fps {BATCH_SIZE / (mean_iter_ms / 1000):.2f}")
    if data_times:
        # Report the data-wait share so a data / iter ratio can be read off
        # directly instead of being recomputed by hand.
        mean_data_ms = statistics.mean(data_times)
        print(f"Average: data {mean_data_ms:.2f} ms, "
              f"data / iter {mean_data_ms / mean_iter_ms:.2f}")
else:
    # statistics.mean() raises StatisticsError on an empty list.
    print("No timed iterations: the dataloader yielded fewer than WARMUP + 1 batches")

import time
import torch
WARMUP = 5
NUM_ITERS = 30
BATCH_SIZE = 128 # 用于计算吞吐量的全局批次大小
iter_times = []
data_times = []
for i, batch in enumerate(dataloader):
if i >= WARMUP + NUM_ITERS:
break
t_data_end = time.perf_counter()
torch.cuda.synchronize()
t_start = time.perf_counter()
# ... 现有训练循环主体 ...
torch.cuda.synchronize()
t_end = time.perf_counter()
if i >= WARMUP:
iter_ms = (t_end - t_start) * 1000
iter_times.append(iter_ms)
if i > 0:
data_times.append((t_data_end - prev_iter_end) * 1000)
print(f"[{i:04d}]: 迭代耗时 {iter_ms:.2f} 毫秒,每秒帧数 {BATCH_SIZE / (iter_ms / 1000):.2f}")
prev_iter_end = t_end
import statistics
print(f"平均值: 迭代耗时 {statistics.mean(iter_times):.2f} 毫秒,"
f"每秒帧数 {BATCH_SIZE / (statistics.mean(iter_times) / 1000):.2f}")data / iter > 0.2data / iter > 0.2torch.cuda.synchronize()time.perf_counter()nsys profiletorch.cuda.synchronize()time.perf_counter()nsys profileimport torch
def benchmark(fn, warmup=50, iters=100):
    """Return the average time of one ``fn()`` call in milliseconds.

    ``fn`` is called ``warmup`` times untimed, then ``iters`` times between a
    single pair of CUDA events. The elapsed time between the two events is
    divided by ``iters``.

    Args:
        fn: Zero-argument callable that launches the work to measure.
        warmup: Number of untimed calls made before measurement starts.
        iters: Number of timed calls; must be at least 1.

    Returns:
        Milliseconds per iteration as a float.

    Raises:
        ValueError: If ``iters`` is less than 1.
    """
    # Reject a zero/negative count up front instead of failing with a
    # ZeroDivisionError (or returning a negative time) after the run.
    if iters < 1:
        raise ValueError(f"iters must be >= 1, got {iters}")

    for _ in range(warmup):
        fn()
    # Let the warmup work finish so it is not included in the timed region.
    torch.cuda.synchronize()

    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)

    start.record()
    for _ in range(iters):
        fn()
    end.record()
    # elapsed_time() needs both events to have completed.
    torch.cuda.synchronize()

    return start.elapsed_time(end) / iters  # ms per iteration
import torch
def benchmark(fn, warmup=50, iters=100):
    """Return the average time of one ``fn()`` call in milliseconds.

    ``fn`` is called ``warmup`` times untimed, then ``iters`` times between a
    single pair of CUDA events. The elapsed time between the two events is
    divided by ``iters``.

    Args:
        fn: Zero-argument callable that launches the work to measure.
        warmup: Number of untimed calls made before measurement starts.
        iters: Number of timed calls; must be at least 1.

    Returns:
        Milliseconds per iteration as a float.

    Raises:
        ValueError: If ``iters`` is less than 1.
    """
    # Reject a zero/negative count up front instead of failing with a
    # ZeroDivisionError (or returning a negative time) after the run.
    if iters < 1:
        raise ValueError(f"iters must be >= 1, got {iters}")

    for _ in range(warmup):
        fn()
    # Let the warmup work finish so it is not included in the timed region.
    torch.cuda.synchronize()

    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)

    start.record()
    for _ in range(iters):
        fn()
    end.record()
    # elapsed_time() needs both events to have completed.
    torch.cuda.synchronize()

    return start.elapsed_time(end) / iters  # ms per iteration
import torch
import statistics
def benchmark_detailed(fn, warmup=50, iters=100):
    """Time each ``fn()`` call separately and summarize the distribution.

    ``fn`` is called ``warmup`` times untimed, then ``iters`` times, each call
    bracketed by its own pair of CUDA events.

    Args:
        fn: Zero-argument callable that launches the work to measure.
        warmup: Number of untimed calls made before measurement starts.
        iters: Number of timed calls; must be at least 1.

    Returns:
        Dict with float values in milliseconds under the keys ``mean_ms``,
        ``median_ms``, ``std_ms``, ``min_ms`` and ``max_ms``. ``std_ms`` is
        0.0 when only one sample was taken.

    Raises:
        ValueError: If ``iters`` is less than 1.
    """
    # With no samples statistics.mean() would raise an opaque StatisticsError.
    if iters < 1:
        raise ValueError(f"iters must be >= 1, got {iters}")

    for _ in range(warmup):
        fn()
    # Let the warmup work finish so it is not included in the first sample.
    torch.cuda.synchronize()

    # Create every event before the timed loop starts.
    starts = [torch.cuda.Event(enable_timing=True) for _ in range(iters)]
    ends = [torch.cuda.Event(enable_timing=True) for _ in range(iters)]

    for start, end in zip(starts, ends):
        start.record()
        fn()
        end.record()
    # Single synchronization after the loop; elapsed_time() needs the events
    # to have completed.
    torch.cuda.synchronize()

    times = [start.elapsed_time(end) for start, end in zip(starts, ends)]
    return {
        "mean_ms": statistics.mean(times),
        "median_ms": statistics.median(times),
        # stdev() requires at least two samples.
        "std_ms": statistics.stdev(times) if len(times) > 1 else 0.0,
        "min_ms": min(times),
        "max_ms": max(times),
    }
import torch
import statistics
def benchmark_detailed(fn, warmup=50, iters=100):
for _ in range(warmup):
fn()
torch.cuda.synchronize()
starts = [torch.cuda.Event(enable_timing=True) for _ in range(iters)]
ends = [torch.cuda.Event(enable_timing=True) for _ in range(iters)]
for i in range(iters):
starts[i].record()
fn()
ends[i].record()
torch.cuda.synchronize()
times = [starts[i].elapsed_time(ends[i]) for i in range(iters)]
return {
"mean_ms": statistics.mean(times),
"median_ms": statistics.median(times),
"std_ms": statistics.stdev(times) if len(times) > 1 else 0,
"min_ms": min(times),
"max_ms": max(times),
}| Anti-Pattern | Problem |
|---|---|
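| Calling `torch.cuda.synchronize()` before and after every iteration | Adds ~10–50 µs overhead per iteration |
| Using `time.perf_counter()` alone | Measures CPU time, misses async GPU execution |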
| Missing warmup | First iterations include JIT, clock ramp-up, context init |
| Allocating tensors inside measurement loop | Allocation overhead pollutes timing |
| Reporting only mean | Hides variance, outliers, bimodal distributions |
| 反模式 | 问题 |
|---|---|
| 每次迭代前后都调用 `torch.cuda.synchronize()` | 每迭代增加约10-50微秒的开销 |
| 仅使用 `time.perf_counter()` | 测量的是CPU时间,无法捕捉异步GPU执行 |
| 缺少预热阶段 | 前几次迭代包含JIT编译、时钟加速、上下文初始化等开销 |
| 在测量循环内分配张量 | 分配开销会干扰计时结果 |
| 仅报告平均值 | 隐藏了方差、异常值和双峰分布情况 |
import nvtximport nvtx
- **Do** annotate training phases (forward, backward, optimizer, data loading) for nsys timeline clarity.
- **Do not** annotate for measurement — use CUDA events or manual timing instead.
- **Do not** over-annotate — too many fine-grained ranges add visual clutter and minor overhead.
For NVTX domains, categories, payloads, and legacy API details, see [references/nvtx-api.md](references/nvtx-api.md).
- **建议**:为训练阶段(前向传播、反向传播、优化器、数据加载)添加注解,提升nsys时间线的清晰度。
- **不建议**:使用注解进行测量——应改用CUDA事件或手动计时。
- **不建议**:过度注解——过多细粒度范围会增加视觉混乱和轻微开销。
关于NVTX域、类别、负载和旧版API的详细信息,请参考[references/nvtx-api.md](references/nvtx-api.md)。
`device_time` `cuda_time` `device_time` `cuda_time`