Loading...
Loading...
AI for Science 场景下的昇腾 NPU Profiling 采集与性能分析 Skill,用于在华为 Ascend NPU 上使用 torch_npu.profiler 采集 L0、L1、L2 级性能数据,分析训练或推理中的算子耗时、调用栈、内存与瓶颈,并指导后续调优。
npx skill4agent add ascend-ai-coding/awesome-ascend-skills ai-for-science-ai4s-profiling

CUDA 代码迁移提示:将 `cuda`、`.cuda()`、`torch.device("cuda")` 等替换为 `npu` 对应写法,并 `import torch_npu`(或使用 `transfer_to_npu` 自动迁移)后,即可使用 `torch_npu.profiler`。

| 项目 | 要求 |
|---|---|
| 硬件 | Ascend910 系列(至少 1 卡) |
| CANN | ≥ 8.0(推荐 8.2+) |
| Python | 3.8 – 3.10 |
| PyTorch | 与 CANN 版本匹配 |
| torch_npu | 与 PyTorch 版本一致 |
| 磁盘空间 | 建议 ≥ 10GB 可用(L2 数据量较大) |
0. 环境准备与校验
→ 1. 确定采集场景(训练 / 推理)
→ 2. 选择采集级别(L0 / L1 / L2)
→ 3. 植入 Profiling 代码
→ 4. 执行采集
→ 5. 数据解析与可视化
→ 6. 性能分析与调优建议
→ 7. GPU 对比采集(可选)

source /usr/local/Ascend/ascend-toolkit/set_env.sh
cat /usr/local/Ascend/ascend-toolkit/latest/version.cfg
npu-smi info

python3 -c "import torch; import torch_npu; print(torch.npu.is_available()); a = torch.randn(3,4).npu(); print(a.device)"

预期输出依次为 `True` 和 `npu:0`。

# CPU 绑核:将训练进程绑定到固定 CPU 核心,减少跨核调度带来的性能抖动
export CPU_AFFINITY_CONF=1
# 流水优化:开启 Host 侧任务下发与 Device 侧执行的异步流水,
# 减少 Host 等待 Device 的空闲时间,提升整体吞吐
# 0=关闭, 1=开启, 2=开启增强模式(推荐)
export TASK_QUEUE_ENABLE=2

说明:`CPU_AFFINITY_CONF=1` 与 `TASK_QUEUE_ENABLE=2` 需在 `source set_env.sh` 之后导出才能生效。

`schedule` 通过 `skip_first`、`wait`、`warmup`、`active`、`repeat` 参数控制采集窗口:

schedule = torch_npu.profiler.schedule(
wait=1, warmup=1, active=1, repeat=1, skip_first=20
)

每个 step 结束时调用 `prof.step()` 推进采集状态机。推理场景示例:

with torch_npu.profiler.profile(
activities=[torch_npu.profiler.ProfilerActivity.NPU],
on_trace_ready=torch_npu.profiler.tensorboard_trace_handler("./npu-profiling-inference")
) as prof:
model(input_data)
    prof.step()

| 级别 | 说明 | 数据量 | 典型用途 |
|---|---|---|---|
| L0(默认) | 仅采集 NPU 活动,最小膨胀 | 小 | 快速定位热点、整体耗时分布 |
| L1 | 采集 CPU + NPU + 算子详情 | 中 | 分析 Cube/Vector/MatMul/Conv 算子耗时 |
| L2 | L1 + 调用栈 + 内存 | 大 | 深度分析调用链、内存瓶颈 |
# L0 profiling template (training): NPU-only activities for minimal overhead.
import torch
import torch_npu
# schedule: skip the first 20 warm-up steps, then record one
# (wait=1, warmup=1, active=1) cycle; traces are written in TensorBoard
# format under ./npu-profiling-L0.
with torch_npu.profiler.profile(
    activities=[torch_npu.profiler.ProfilerActivity.NPU],
    schedule=torch_npu.profiler.schedule(
        wait=1, warmup=1, active=1, repeat=1, skip_first=20
    ),
    on_trace_ready=torch_npu.profiler.tensorboard_trace_handler(
        "./npu-profiling-L0"
    )
) as prof:
    # NOTE(review): assumes `train_dataloader` and `train_one_step` are
    # defined by the surrounding training script.
    for step, batch in enumerate(train_dataloader):
        train_one_step(batch)
        prof.step()

L1 级别需通过 experimental_config 将 profiler_level 设为 `ProfilerLevel.Level1`:

import torch
import torch_npu
# L1 profiling template (training): CPU + NPU activities with operator
# detail (input shapes), but no call stacks or memory tracking.
with torch_npu.profiler.profile(
    activities=[
        torch_npu.profiler.ProfilerActivity.CPU,
        torch_npu.profiler.ProfilerActivity.NPU,
    ],
    with_stack=False,      # call stacks are L2-only
    record_shapes=True,    # record operator input shapes
    profile_memory=False,  # memory tracking is L2-only
    # Fix: use torch_npu.profiler.schedule (was torch.profiler.schedule),
    # consistent with every other torch_npu snippet in this document.
    schedule=torch_npu.profiler.schedule(
        wait=1, warmup=1, active=1, repeat=1, skip_first=20
    ),
    experimental_config=torch_npu.profiler._ExperimentalConfig(
        profiler_level=torch_npu.profiler.ProfilerLevel.Level1
    ),
    on_trace_ready=torch_npu.profiler.tensorboard_trace_handler(
        "./npu-profiling-L1"
    )
) as prof:
    # NOTE(review): assumes `train_dataloader` and `train_one_step` are
    # defined by the surrounding training script.
    for step, batch in enumerate(train_dataloader):
        train_one_step(batch)
        prof.step()

L1 配置要点:`record_shapes=True`、`with_stack=False`、`profile_memory=False`,并设置 `experimental_config`。L2 示例:

import torch
import torch_npu
# L2 profiling template (training): adds Python call stacks and memory
# tracking on top of the L1 configuration. Expect a much larger trace.
with torch_npu.profiler.profile(
    activities=[
        torch_npu.profiler.ProfilerActivity.CPU,
        torch_npu.profiler.ProfilerActivity.NPU,
    ],
    with_stack=True,      # capture Python call stacks
    record_shapes=True,   # record operator input shapes
    profile_memory=True,  # track allocator/memory events
    # Fix: use torch_npu.profiler.schedule (was torch.profiler.schedule),
    # consistent with every other torch_npu snippet in this document.
    schedule=torch_npu.profiler.schedule(
        wait=1, warmup=1, active=1, repeat=1, skip_first=20
    ),
    # NOTE(review): this document's "L2" keeps profiler_level=Level1 and
    # relies on with_stack/profile_memory; torch_npu also offers
    # ProfilerLevel.Level2 for more device detail — confirm which is intended.
    experimental_config=torch_npu.profiler._ExperimentalConfig(
        profiler_level=torch_npu.profiler.ProfilerLevel.Level1
    ),
    on_trace_ready=torch_npu.profiler.tensorboard_trace_handler(
        "./npu-profiling-L2"
    )
) as prof:
    # NOTE(review): assumes `train_dataloader` and `train_one_step` are
    # defined by the surrounding training script.
    for step, batch in enumerate(train_dataloader):
        train_one_step(batch)
        prof.step()

L2 在 L1 基础上开启 `with_stack=True` 与 `profile_memory=True`(同样使用 `with` 写法)。PyTorch Lightning 场景示例:

import torch
import torch_npu
import pytorch_lightning as pl
class NPUProfilingCallback(pl.Callback):
    """PyTorch Lightning callback for Ascend NPU profiling with L0/L1/L2 level switching."""
    def __init__(self, output_dir="./npu-profiling", level="L0",
                 skip_first=20, wait=1, warmup=1, active=1, repeat=1):
        super().__init__()
        # Directory for TensorBoard-format trace output.
        self.output_dir = output_dir
        # Profiling level: "L0" (NPU only), "L1" (+CPU/operator detail),
        # "L2" (L1 + call stacks + memory).
        self.level = level
        # Parameters forwarded to torch_npu.profiler.schedule.
        self.skip_first = skip_first
        self.wait = wait
        self.warmup = warmup
        self.active = active
        self.repeat = repeat
        # Active profiler instance; None while not profiling.
        self.prof = None
    def _build_profile_kwargs(self):
        # Build the torch_npu.profiler.profile kwargs for the selected level.
        kwargs = {
            "activities": [torch_npu.profiler.ProfilerActivity.NPU],
            "schedule": torch_npu.profiler.schedule(
                wait=self.wait, warmup=self.warmup,
                active=self.active, repeat=self.repeat,
                skip_first=self.skip_first,
            ),
            "on_trace_ready": torch_npu.profiler.tensorboard_trace_handler(
                self.output_dir
            ),
        }
        if self.level in ("L1", "L2"):
            # L1/L2 additionally trace host-side (CPU) activity,
            # record operator shapes, and set the experimental profiler level.
            kwargs["activities"].insert(
                0, torch_npu.profiler.ProfilerActivity.CPU
            )
            kwargs["record_shapes"] = True
            kwargs["experimental_config"] = (
                torch_npu.profiler._ExperimentalConfig(
                    profiler_level=torch_npu.profiler.ProfilerLevel.Level1
                )
            )
            if self.level == "L2":
                # L2 = L1 plus Python call stacks and memory tracking.
                kwargs["with_stack"] = True
                kwargs["profile_memory"] = True
        return kwargs
    def on_train_start(self, trainer, pl_module):
        """Create and start the profiler when training begins."""
        self.prof = torch_npu.profiler.profile(**self._build_profile_kwargs())
        self.prof.start()
    def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx):
        """Advance the profiler schedule once per training batch."""
        if self.prof is not None:
            # Synchronize so this step's device work is fully attributed
            # before the schedule advances.
            torch.npu.synchronize()
            self.prof.step()
    def on_train_end(self, trainer, pl_module):
        """Stop profiling and flush traces at the end of training."""
        if self.prof is not None:
            self.prof.stop()
            self.prof = None

使用方式(在 Trainer 中注册该回调):

trainer = pl.Trainer(
callbacks=[
...,
NPUProfilingCallback(
output_dir="./npu-profiling-L1",
level="L1",
skip_first=20,
),
],
...
)

HuggingFace `Trainer` 场景:通过自定义 `TrainerCallback` 植入采集:

import torch
import torch_npu
from transformers import TrainerCallback
class NPUProfilingTrainerCallback(TrainerCallback):
    """HuggingFace Trainer callback that collects L0 Ascend NPU profiling data."""
    def __init__(self, output_dir="./npu-profiling", skip_first=20):
        # Directory for TensorBoard-format trace output.
        self.output_dir = output_dir
        # Number of initial steps to skip before the profiling cycle starts.
        self.skip_first = skip_first
        # Active profiler instance; None while not profiling.
        self.prof = None
    def on_train_begin(self, args, state, control, **kwargs):
        """Create and start an NPU-only (L0) profiler at training start."""
        self.prof = torch_npu.profiler.profile(
            activities=[torch_npu.profiler.ProfilerActivity.NPU],
            schedule=torch_npu.profiler.schedule(
                wait=1, warmup=1, active=1, repeat=1,
                skip_first=self.skip_first,
            ),
            on_trace_ready=torch_npu.profiler.tensorboard_trace_handler(
                self.output_dir
            ),
        )
        self.prof.start()
    def on_step_end(self, args, state, control, **kwargs):
        """Advance the profiler schedule once per optimizer step."""
        if self.prof is not None:
            # Synchronize so this step's device work is fully attributed
            # before the schedule advances.
            torch.npu.synchronize()
            self.prof.step()
    def on_train_end(self, args, state, control, **kwargs):
        """Stop profiling and flush traces when training ends."""
        if self.prof is not None:
            self.prof.stop()
            self.prof = None

DeepSpeed 多卡场景:`prof.step()` 应在 `model_engine.step()` 之后调用,且仅在 rank 0 上采集:

import torch
import torch_npu
# DeepSpeed multi-card template: only rank 0 creates the profiler, so a
# single trace directory is produced instead of one per process.
if local_rank == 0:
    prof = torch_npu.profiler.profile(
        activities=[torch_npu.profiler.ProfilerActivity.NPU],
        schedule=torch_npu.profiler.schedule(
            wait=1, warmup=1, active=1, repeat=1, skip_first=20
        ),
        on_trace_ready=torch_npu.profiler.tensorboard_trace_handler(
            "./npu-profiling-deepspeed"
        ),
    )
    prof.start()
for step, batch in enumerate(train_dataloader):
    # NOTE(review): assumes the DeepSpeed-wrapped model returns the loss
    # directly when called — confirm against the actual model.
    loss = model_engine(batch)
    model_engine.backward(loss)
    model_engine.step()
    # Advance the profiler only after the optimizer step, and only on rank 0.
    if local_rank == 0:
        prof.step()
if local_rank == 0:
    prof.stop()

GPU 对比采集:使用原生 `torch.profiler`,将 `torch_npu.profiler.profile` 换为 `torch.profiler.profile`,activities 使用 `torch.profiler.ProfilerActivity.CUDA`,且不需要 `experimental_config`:

with torch.profiler.profile(
activities=[torch.profiler.ProfilerActivity.CUDA],
schedule=torch.profiler.schedule(
wait=1, warmup=1, active=1, repeat=1, skip_first=20
),
on_trace_ready=torch.profiler.tensorboard_trace_handler(
"./gpu-profiling-L0"
)
) as prof:
for step, batch in enumerate(train_dataloader):
train_one_step(batch)
        prof.step()

GPU 上 CPU + CUDA 联合采集:

with torch.profiler.profile(
activities=[
torch.profiler.ProfilerActivity.CPU,
torch.profiler.ProfilerActivity.CUDA,
],
schedule=torch.profiler.schedule(
wait=1, warmup=1, active=1, repeat=1, skip_first=20
),
on_trace_ready=torch.profiler.tensorboard_trace_handler(
"./gpu-profiling-with-cpu"
)
) as prof:
for step, batch in enumerate(train_dataloader):
train_one_step(batch)
        prof.step()

| 场景 | 级别 | 模板位置 |
|---|---|---|
| 训练 + 快速概览 | L0 | 第 3.1 节 |
| 训练 + 算子分析 | L1 | 第 3.2 节 |
| 训练 + 深度分析 | L2 | 第 3.3 节 |
| 推理 | L0 | 第 1.2 节 |
| PyTorch Lightning | L0/L1/L2 | 第 4.1 节 |
| HuggingFace Trainer | L0 | 第 4.2 节 |
| DeepSpeed 多卡 | L0 | 第 4.3 节 |
| GPU 对比 | - | 第 5 节 |
python scripts/validate_profiling_env.py --device npu:0 --output-dir ./profiling_output

分析清单参见 `references/analysis-checklist.md`。