Getting Started with Nsight Compute Profiling
- 1. Reference links
- 2. Download and install nsight-compute-linux-2024.2.1.2-34372528.run
- 3. Getting GPU device information
- 4. Getting GPU information (devinfo.cu)
- 5. How to saturate one SM from the compute side
- 6. NumPy reference implementation: sum(a*b+c)
- 7. CUDA implementation
- 8. Common build commands
- 9. Listing the sections supported by Nsight Compute
- 10. Default Sections directory (shows the metrics, formulas, and performance-estimation Python scripts Nsight Compute uses)
- 11. Common ncu commands
- 12. Use GPU Speed Of Light Throughput to find the macro-level bottleneck first, then follow its guidance to the relevant hot spots
- 13. Measuring the cycles needed by the fma instruction (one fma has a latency of 38 cycles; each additional instruction adds 4 cycles)
- 14. Measuring the cycles needed by a shared-memory load instruction
- 15. Measuring the cycles needed by a global load instruction (one load takes 51 cycles; each additional instruction adds 4 cycles)
- 16. Scaling the problem size up by 52x
- 17. Summary
This article is a hands-on demonstration of Nsight Compute.
1. Reference links
- Assembler Instructions with C Expression Operands
- PTX FMA指令
- Inline PTX Assembly in CUDA
- Your GPU Compute Capability
- Multiprocessor Level
- Arithmetic Instructions
2. Download and install nsight-compute-linux-2024.2.1.2-34372528.run
/usr/local/NVIDIA-Nsight-Compute/ncu --version
NVIDIA (R) Nsight Compute Command Line Profiler
Copyright (c) 2018-2024 NVIDIA Corporation
Version 2024.2.1.0 (build 34372528) (public-release)
3. Getting GPU device information
cd /usr/local/cuda/extras/CUPTI/samples/checkpoint_kernels
make
/usr/local/NVIDIA-Nsight-Compute/ncu --metrics device__attribute_limits_max_cta_per_sm,\
device__attribute_max_warps_per_multiprocessor,\
device__attribute_num_schedulers_per_multiprocessor,\
device__attribute_max_warps_per_scheduler,\
device__attribute_max_ipc_per_scheduler,\
device__attribute_max_ipc_per_multiprocessor,\
device__attribute_max_registers_per_thread ./checkpoint_kernels
cd -
Output:
--------------------------------------------------- ----------- ------------
Metric Name Metric Unit Metric Value
--------------------------------------------------- ----------- ------------
device__attribute_limits_max_cta_per_sm 16
device__attribute_max_ipc_per_multiprocessor 4
device__attribute_max_ipc_per_scheduler 1
device__attribute_max_registers_per_thread 255
device__attribute_max_warps_per_multiprocessor 48
device__attribute_max_warps_per_scheduler 12
device__attribute_num_schedulers_per_multiprocessor 4
--------------------------------------------------- ----------- ------------
Explanation:
device__attribute_limits_max_cta_per_sm              maximum number of CTAs per SM                        16
device__attribute_max_warps_per_multiprocessor       maximum number of warps per SM                       48
device__attribute_num_schedulers_per_multiprocessor  number of warp schedulers per SM                      4
device__attribute_max_warps_per_scheduler            maximum number of warps per scheduler                 12
device__attribute_max_ipc_per_scheduler              maximum instructions issued per clock per scheduler    1
device__attribute_max_ipc_per_multiprocessor         maximum instructions issued per clock per SM            4
device__attribute_max_registers_per_thread           maximum number of registers per thread                255
The number of instructions required to hide a latency of L clock cycles depends on the respective throughputs of these instructions
4L for devices of compute capability 5.x, 6.1, 6.2, 7.x and 8.x since for these devices,
a multiprocessor issues one instruction per warp over one clock cycle for four warps at a time, as mentioned in Compute Capabilities.
From the Arithmetic Instructions table, one SM can execute 4 fused multiply-add instructions per cycle.
Instruction latency is also affected by many factors (register dependencies, other in-flight instructions, whether operands are ready, and so on), so more threads are needed to hide it.
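Plugging the attribute values above into the 4L rule is simple arithmetic (no extra profiling needed): with 4 schedulers per SM and a dependent FMA latency of 4 cycles, one SM needs

$$4\ \text{schedulers/SM} \times 4\ \text{cycles of FMA latency} = 16\ \text{warps/SM} \;\Rightarrow\; 16 \times 32 = 512\ \text{threads/SM}$$

to keep every scheduler issuing while each FMA result is still in flight, which is exactly the block size chosen in section 5 below.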
4. Getting GPU information
tee devinfo.cu<<-'EOF'
#include <iostream>
#include <cuda_runtime.h>
int main()
{
int deviceid=0;
cudaSetDevice(deviceid);
cudaDeviceProp deviceProp;
cudaGetDeviceProperties(&deviceProp, deviceid);
int maxThreadsPerBlock = deviceProp.maxThreadsPerBlock;
int sharedMemoryPerBlock = deviceProp.sharedMemPerBlock;
int maxBlocksPerMultiprocessor = deviceProp.maxBlocksPerMultiProcessor;
int smCount = deviceProp.multiProcessorCount;
std::cout << "Device name: " << deviceProp.name << std::endl;
std::cout << "Max threads per block: " << maxThreadsPerBlock << std::endl;
std::cout << "Shared memory per block: " << sharedMemoryPerBlock << " bytes" << std::endl;
std::cout << "Max blocks per SM: " << maxBlocksPerMultiprocessor << std::endl;
std::cout << "Number of SMs: " << smCount << std::endl;
}
EOF
/usr/local/cuda/bin/nvcc -std=c++17 -arch=sm_70 -o devinfo devinfo.cu -I /usr/local/cuda/include -L /usr/local/cuda/lib64 -lcuda
./devinfo
Output:
Device name: NVIDIA GeForce RTX 3060
Max threads per block: 1024
Shared memory per block: 49152 bytes
Max blocks per SM: 16
Number of SMs: 28
5. How to saturate one SM from the compute side:
- Each SM can issue at most 4 instructions per clock, and a warp executes one instruction at a time
- In other words, one SM can have 32*4 threads issuing at the same time
- To hide latency better (the FMA latency is 4 cycles), set the block size to 4*32*4 = 512 threads (see the occupancy sketch after this list)
- With 28 SMs, set the grid size to 28
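The following is a small sketch, not part of the original samples (the file name occupancy_check.cu and the stand-in kernel DummyFma are made up here), that uses the CUDA occupancy API to confirm that a 512-thread block can reach the occupancy assumed above; with the real MyKernel from section 7 the call is identical.
tee occupancy_check.cu<<-'EOF'
#include <cstdio>
#include <cuda_runtime.h>
// Stand-in kernel with the same shape as MyKernel in section 7 (a*b+c per element).
__global__ void DummyFma(const float *a, const float *b, const float *c, float *d)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    d[i] = a[i] * b[i] + c[i];
}
int main()
{
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);
    int block_size = 512;
    int blocks_per_sm = 0;
    // How many 512-thread blocks of DummyFma can be resident on one SM
    // (0 bytes of dynamic shared memory requested).
    cudaOccupancyMaxActiveBlocksPerMultiprocessor(&blocks_per_sm, DummyFma, block_size, 0);
    int active_warps = blocks_per_sm * block_size / 32;
    printf("blocks/SM: %d, active warps/SM: %d (hardware limit: %d warps)\n",
           blocks_per_sm, active_warps, prop.maxThreadsPerMultiProcessor / 32);
    return 0;
}
EOF
/usr/local/cuda/bin/nvcc -std=c++17 occupancy_check.cu -o occupancy_check
./occupancy_check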
6. NumPy reference implementation: sum(a*b+c)
tee sample_2.py<<-'EOF'
import numpy as np
grid_count=28
block_count=512
total_count=grid_count*block_count
input_a=np.arange(total_count,dtype=np.float32)*0.001
input_b=np.arange(total_count,dtype=np.float32)*0.002
input_c=np.arange(total_count,dtype=np.float32)*0.003
output_d=input_a*input_b+input_c
output_reduce_sum=np.sum(output_d).astype(np.float32)
print(output_reduce_sum)
EOF
python sample_2.py
Output:
2272286.2
7. CUDA implementation
tee sample_2.cu<<-'EOF'
#include <iostream>
#include <cuda_runtime.h>
#include <vector>
#include <stdio.h>
#include <assert.h>
#include <cstdio>
#include <cuda.h>
#define CHECK_CUDA(call) \
do { \
cudaError_t err = call; \
if (err != cudaSuccess) { \
std::cerr << "CUDA error at " << __FILE__ << ":" << __LINE__; \
std::cerr << " code=" << err << " (" << cudaGetErrorString(err) << ")" << std::endl; \
exit(EXIT_FAILURE); \
} \
} while (0)
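// warp_reduce_sum below does a butterfly reduction with __shfl_xor_sync: each step
// exchanges values between lanes whose IDs differ in one bit, so after log2(32) = 5
// steps every lane of the warp holds the sum over all 32 lanes.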
__device__ __forceinline__ float warp_reduce_sum(float val) {
#if 1
#pragma unroll
for (int mask = warpSize >> 1; mask >= 1; mask >>= 1) {
val += __shfl_xor_sync(0xffffffff, val, mask);
}
return val;
#else
unsigned int mask = 0xffffffff;
for (int offset = warpSize / 2; offset > 0; offset /= 2) {
val += __shfl_down_sync(mask, val, offset);
}
return val;
#endif
}
__global__ void MyKernel(float *a,float *b,float *c,float *output)
{
int tid = threadIdx.x + blockIdx.x * blockDim.x;
int lane = threadIdx.x % warpSize;
int wid = threadIdx.x / warpSize;
float d=0;
__shared__ float shared[32];
#if 1
float _a=a[tid];
float _b=b[tid];
float _c=c[tid];
//d=a*b+c
asm volatile("fma.rn.ftz.f32 %0,%1,%2,%3;" : "=f"(d) : "f"(_a),"f"(_b),"f"(_c));
#else
asm volatile("fma.rn.ftz.f32 %0,%1,%2,%3;" : "=f"(d) : "f"(a[tid]),"f"(b[tid]),"f"(c[tid]));
#endif
//warp-level partial sums, stored in shared memory
float val = warp_reduce_sum(d);
if (lane == 0) //lane 0 of each warp stores that warp's result
{
shared[wid] = val;
}
__syncthreads();
float sum=0;
if (threadIdx.x == 0) //the first thread of each block sums the per-warp results in shared memory
{
int warp_count = blockDim.x / warpSize;
for (int i = 0; i <warp_count ; i++){
sum += shared[i];
}
atomicAdd(output, sum);
}
}
int main(int argc,char *argv[])
{
int deviceid=0;
cudaSetDevice(deviceid);
cudaStream_t stream;
cudaStreamCreate(&stream);
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
int block_size=512;
int grid_size=28;
int total_count=block_size*grid_size;
int total_size=sizeof(float)*total_count;
float *host_a = (float *)malloc(total_size);
float *host_b = (float *)malloc(total_size);
float *host_c = (float *)malloc(total_size);
float *host_d = (float *)malloc(sizeof(float));
float *dev_a;
float *dev_b;
float *dev_c;
float *dev_d;
host_d[0]=0.0;
CHECK_CUDA(cudaMalloc(&dev_a, total_size));
CHECK_CUDA(cudaMalloc(&dev_b, total_size));
CHECK_CUDA(cudaMalloc(&dev_c, total_size));
CHECK_CUDA(cudaMalloc(&dev_d, sizeof(float)));
for(int i = 0; i < total_count; ++i) host_a[i] = i*0.001;
for(int i = 0; i < total_count; ++i) host_b[i] = i*0.002;
for(int i = 0; i < total_count; ++i) host_c[i] = i*0.003;
CHECK_CUDA(cudaMemcpy(dev_a, host_a, total_size,cudaMemcpyHostToDevice));
CHECK_CUDA(cudaMemcpy(dev_b, host_b, total_size,cudaMemcpyHostToDevice));
CHECK_CUDA(cudaMemcpy(dev_c, host_c, total_size,cudaMemcpyHostToDevice));
for(int i=0;i<2;i++)
{
CHECK_CUDA(cudaMemcpy(dev_a, host_a, total_size,cudaMemcpyHostToDevice));
CHECK_CUDA(cudaMemcpy(dev_b, host_b, total_size,cudaMemcpyHostToDevice));
CHECK_CUDA(cudaMemcpy(dev_c, host_c, total_size,cudaMemcpyHostToDevice));
CHECK_CUDA(cudaMemcpy(dev_d, host_d, sizeof(float),cudaMemcpyHostToDevice));
cudaEventRecord(start, stream);
MyKernel<<<grid_size,block_size,sizeof(float)*32, stream>>>(dev_a,dev_b,dev_c,dev_d);
cudaEventRecord(stop, stream);
CHECK_CUDA(cudaEventSynchronize(stop));
float milliseconds = 0;
cudaEventElapsedTime(&milliseconds, start, stop);
printf("%d %.3f\n",i,milliseconds);
}
cudaMemcpy(host_d, dev_d,sizeof(float), cudaMemcpyDeviceToHost);
printf("output:%f\n",host_d[0]);
free(host_a);
free(host_b);
free(host_c);
free(host_d);
CHECK_CUDA(cudaFree(dev_a));
CHECK_CUDA(cudaFree(dev_b));
CHECK_CUDA(cudaFree(dev_c));
CHECK_CUDA(cudaFree(dev_d));
CHECK_CUDA(cudaEventDestroy(start));
CHECK_CUDA(cudaEventDestroy(stop));
CHECK_CUDA(cudaStreamDestroy(stream));
return 0;
}
EOF
/usr/local/cuda/bin/nvcc -std=c++17 -lineinfo sample_2.cu -o sample_2
./sample_2
Output:
2272286.000000
(The small difference from the NumPy result 2272286.2 comes from float32 rounding: the parallel reduction accumulates in a different order than NumPy's summation.)
8. Common build commands
# Compile the device code to PTX
/usr/local/cuda/bin/nvcc -std=c++17 -maxrregcount=24 -arch=sm_86 -ptx sample_2.cu -o sample_2.ptx
# Compile the PTX to a cubin
/usr/local/cuda/bin/nvcc -maxrregcount=24 -O0 -arch=sm_86 sample_2.ptx -cubin -o sample_2.cubin
# Package the cubin into a fatbin
/usr/local/cuda/bin/nvcc -maxrregcount=24 -arch=sm_86 sample_2.cubin -fatbin -o sample_2.fatbin
# Link object files with relocatable device code and .ptx, .cubin, and .fatbin files into an object file with executable device code, which can be passed to the host linker.
/usr/local/cuda/bin/nvcc -maxrregcount=24 --gpu-architecture=sm_86 --device-link sample_2.fatbin --output-file link.o
# Compile the host code
/usr/local/cuda/bin/nvcc -std=c++17 -c sample_2.cu -o sample_2.o
# Link the host and device code into an executable
/usr/local/cuda/bin/nvcc -o sample_2 sample_2.o link.o -L /usr/local/cuda/lib64 -lcuda
# Show resource usage
/usr/local/cuda/bin/cuobjdump --dump-resource-usage sample_2.fatbin
# View the PTX instructions
cat sample_2.ptx
# View the SASS instructions
/usr/local/cuda/bin/cuobjdump --dump-sass sample_2.fatbin
9. Listing the sections supported by Nsight Compute
/usr/local/NVIDIA-Nsight-Compute/ncu --list-sections
C2CLink
ComputeWorkloadAnalysis
InstructionStats
LaunchStats
MemoryWorkloadAnalysis
MemoryWorkloadAnalysis_Chart
MemoryWorkloadAnalysis_Tables
NumaAffinity
Nvlink
Nvlink_Tables
Nvlink_Topology
Occupancy
PmSampling
PmSampling_WarpStates
SchedulerStats
SourceCounters
SpeedOfLight
SpeedOfLight_HierarchicalDoubleRooflineChart
SpeedOfLight_HierarchicalHalfRooflineChart
SpeedOfLight_HierarchicalSingleRooflineChart
SpeedOfLight_HierarchicalTensorRooflineChart
SpeedOfLight_RooflineChart
WarpStateStats
WorkloadDistribution
10. Default Sections directory (it contains the metrics, formulas, and performance-estimation Python scripts that Nsight Compute uses)
ls -l /root/Documents/NVIDIA\ Nsight\ Compute/2024.2.1/Sections/*
'/root/Documents/NVIDIA Nsight Compute/2024.2.1/Sections/AchievedOccupancy.py'
'/root/Documents/NVIDIA Nsight Compute/2024.2.1/Sections/C2CLink.section'
'/root/Documents/NVIDIA Nsight Compute/2024.2.1/Sections/Compute.py'
'/root/Documents/NVIDIA Nsight Compute/2024.2.1/Sections/ComputeWorkloadAnalysis.section'
'/root/Documents/NVIDIA Nsight Compute/2024.2.1/Sections/CPIStall.py'
'/root/Documents/NVIDIA Nsight Compute/2024.2.1/Sections/FPInstructions.py'
'/root/Documents/NVIDIA Nsight Compute/2024.2.1/Sections/HighPipeUtilization.py'
'/root/Documents/NVIDIA Nsight Compute/2024.2.1/Sections/InstructionStatistics.section'
'/root/Documents/NVIDIA Nsight Compute/2024.2.1/Sections/IssueSlotUtilization.py'
'/root/Documents/NVIDIA Nsight Compute/2024.2.1/Sections/LaunchStatistics.py'
'/root/Documents/NVIDIA Nsight Compute/2024.2.1/Sections/LaunchStatistics.section'
'/root/Documents/NVIDIA Nsight Compute/2024.2.1/Sections/MemoryApertureUsage.py'
'/root/Documents/NVIDIA Nsight Compute/2024.2.1/Sections/MemoryCacheAccessPattern.py'
'/root/Documents/NVIDIA Nsight Compute/2024.2.1/Sections/MemoryL2Compression.py'
'/root/Documents/NVIDIA Nsight Compute/2024.2.1/Sections/Memory.py'
'/root/Documents/NVIDIA Nsight Compute/2024.2.1/Sections/MemoryWorkloadAnalysis_Chart.section'
'/root/Documents/NVIDIA Nsight Compute/2024.2.1/Sections/MemoryWorkloadAnalysis.section'
'/root/Documents/NVIDIA Nsight Compute/2024.2.1/Sections/MemoryWorkloadAnalysis_Tables.section'
'/root/Documents/NVIDIA Nsight Compute/2024.2.1/Sections/NumaAffinity.section'
'/root/Documents/NVIDIA Nsight Compute/2024.2.1/Sections/Nvlink.section'
'/root/Documents/NVIDIA Nsight Compute/2024.2.1/Sections/Nvlink_Tables.section'
'/root/Documents/NVIDIA Nsight Compute/2024.2.1/Sections/Nvlink_Topology.section'
'/root/Documents/NVIDIA Nsight Compute/2024.2.1/Sections/NvRules.py'
'/root/Documents/NVIDIA Nsight Compute/2024.2.1/Sections/Occupancy.section'
'/root/Documents/NVIDIA Nsight Compute/2024.2.1/Sections/PCSamplingData.py'
'/root/Documents/NVIDIA Nsight Compute/2024.2.1/Sections/PmSampling.section'
'/root/Documents/NVIDIA Nsight Compute/2024.2.1/Sections/PmSampling_WarpStates.section'
'/root/Documents/NVIDIA Nsight Compute/2024.2.1/Sections/RequestedMetrics.py'
'/root/Documents/NVIDIA Nsight Compute/2024.2.1/Sections/SchedulerStatistics.section'
'/root/Documents/NVIDIA Nsight Compute/2024.2.1/Sections/SharedMemoryConflicts.py'
'/root/Documents/NVIDIA Nsight Compute/2024.2.1/Sections/SlowPipeLimiter.py'
'/root/Documents/NVIDIA Nsight Compute/2024.2.1/Sections/SourceCounters.section'
'/root/Documents/NVIDIA Nsight Compute/2024.2.1/Sections/SpeedOfLight_HierarchicalDoubleRooflineChart.section'
'/root/Documents/NVIDIA Nsight Compute/2024.2.1/Sections/SpeedOfLight_HierarchicalHalfRooflineChart.section'
'/root/Documents/NVIDIA Nsight Compute/2024.2.1/Sections/SpeedOfLight_HierarchicalSingleRooflineChart.section'
'/root/Documents/NVIDIA Nsight Compute/2024.2.1/Sections/SpeedOfLight_HierarchicalTensorRooflineChart.section'
'/root/Documents/NVIDIA Nsight Compute/2024.2.1/Sections/SpeedOfLight.py'
'/root/Documents/NVIDIA Nsight Compute/2024.2.1/Sections/SpeedOfLight_RooflineChart.section'
'/root/Documents/NVIDIA Nsight Compute/2024.2.1/Sections/SpeedOfLight_Roofline.py'
'/root/Documents/NVIDIA Nsight Compute/2024.2.1/Sections/SpeedOfLight.section'
'/root/Documents/NVIDIA Nsight Compute/2024.2.1/Sections/TheoreticalOccupancy.py'
'/root/Documents/NVIDIA Nsight Compute/2024.2.1/Sections/ThreadDivergence.py'
'/root/Documents/NVIDIA Nsight Compute/2024.2.1/Sections/UncoalescedAccess.chart'
'/root/Documents/NVIDIA Nsight Compute/2024.2.1/Sections/UncoalescedAccess.py'
'/root/Documents/NVIDIA Nsight Compute/2024.2.1/Sections/UncoalescedSharedAccess.chart'
'/root/Documents/NVIDIA Nsight Compute/2024.2.1/Sections/UncoalescedSharedAccess.py'
'/root/Documents/NVIDIA Nsight Compute/2024.2.1/Sections/version.txt'
'/root/Documents/NVIDIA Nsight Compute/2024.2.1/Sections/WarpStateStatistics.section'
'/root/Documents/NVIDIA Nsight Compute/2024.2.1/Sections/WorkloadDistribution.section'
'/root/Documents/NVIDIA Nsight Compute/2024.2.1/Sections/WorkloadImbalance.py'
11. Common ncu commands
# Collect the full section/metric set and export it to a report file
/usr/local/NVIDIA-Nsight-Compute/ncu --set full --export ncu_report -f ./sample_2
# Collect a specific section
/usr/local/NVIDIA-Nsight-Compute/ncu --section ComputeWorkloadAnalysis --print-details all ./sample_2
# Collect a specific section plus specific metrics
/usr/local/NVIDIA-Nsight-Compute/ncu --section WarpStateStats --metrics smsp__pcsamp_sample_count,group:smsp__pcsamp_warp_stall_reasons,group:smsp__pcsamp_warp_stall_reasons_not_issued ./sample_2
/usr/local/NVIDIA-Nsight-Compute/ncu --metrics group:smsp__pcsamp_warp_stall_reasons ./sample_2
/usr/local/NVIDIA-Nsight-Compute/ncu --metrics group:smsp__pcsamp_warp_stall_reasons_not_issued ./sample_2
/usr/local/NVIDIA-Nsight-Compute/ncu --metrics group:memory__dram_table ./sample_2
/usr/local/NVIDIA-Nsight-Compute/ncu --metrics gpu__compute_memory_throughput.avg.pct_of_peak_sustained_elapsed,breakdown:gpu__compute_memory_throughput.avg.pct_of_peak_sustained_elapsed ./sample_2
/usr/local/NVIDIA-Nsight-Compute/ncu --metrics sm__throughput.avg.pct_of_peak_sustained_elapsed,breakdown:sm__throughput.avg.pct_of_peak_sustained_elapsed ./sample_2
/usr/local/NVIDIA-Nsight-Compute/ncu --metrics l1tex__lsuin_requests,l1tex__f_wavefronts,dram__sectors,dramc__sectors,fbpa__dram_sectors,l1tex__data_bank_reads,l1tex__data_pipe_lsu_wavefronts,l1tex__lsuin_requests,l1tex__t_bytes_lookup_miss ./sample_2
/usr/local/NVIDIA-Nsight-Compute/ncu --metrics sass__inst_executed_per_opcode,sm__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active --target-processes all --export ncu_report -f ./sample_2
# Print per-instance metric values from a saved report
/usr/local/NVIDIA-Nsight-Compute/ncu -i ncu_report.ncu-rep --print-details all --print-units auto --print-metric-instances values
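# When composing the --metrics lists above, the metric names supported by the installed ncu and the target GPU can be listed with:
/usr/local/NVIDIA-Nsight-Compute/ncu --query-metrics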
12. Use GPU Speed Of Light Throughput to identify the macro-level performance bottleneck first, then follow its guidance to the relevant hot spots
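The Speed Of Light data comes from the SpeedOfLight sections listed in step 9; they can be collected from the command line in the same way as the other sections, for example:
/usr/local/NVIDIA-Nsight-Compute/ncu --section SpeedOfLight --section SpeedOfLight_RooflineChart --print-details all ./sample_2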
13. Measuring the cycles needed by the fma instruction (one fma has a latency of 38 cycles; each additional instruction adds 4 cycles)
tee sample_3.cu<<-'EOF'
#include <iostream>
#include <cuda_runtime.h>
template<int LoopCount>
__global__ void measure_load_latency(int *tid, int *out,int *out_val, int N) {
int idx = threadIdx.x + blockIdx.x * blockDim.x;
if (idx >= N) return;
int temp[LoopCount];
unsigned int start_cycle, end_cycle;
//%%clock:A predefined, read-only 32-bit unsigned cycle counter.
float _a=tid[idx];
float _b=tid[idx];
float _c=tid[idx];
__syncthreads();
asm volatile ("mov.u32 %0, %%clock;" : "=r"(start_cycle) :: "memory");
#pragma unroll
for(int i=0;i<LoopCount;i++)
{
asm volatile("fma.rn.ftz.f32 %0,%1,%2,%3;" : "=f"(_c) : "f"(_a),"f"(_b),"f"(_c));
}
asm volatile ("mov.u32 %0, %%clock;" : "=r"(end_cycle) :: "memory");
out[idx] = end_cycle - start_cycle;
out_val[idx]=(int)_c;
}
__host__ void measure_load_latency_on_gpu(int *h_in, int N) {
int *d_in, *d_out,*d_out_val;;
int h_out[N];
cudaMalloc((void**)&d_in, 128*N * sizeof(int));
cudaMalloc((void**)&d_out, 2*N * sizeof(int));
cudaMalloc((void**)&d_out_val, N * sizeof(int));
printf("-----------------------------------------------\n");
cudaMemcpy(d_in, h_in, N * sizeof(int), cudaMemcpyHostToDevice);
int blocks = 1;
int threads_per_block = 32;
{
measure_load_latency<1><<<blocks, threads_per_block>>>((int*)d_in, d_out,d_out_val,N);
cudaDeviceSynchronize();
cudaMemcpy(h_out, d_out, sizeof(int)*threads_per_block, cudaMemcpyDeviceToHost);
std::cout << "latency (in cycles): " << h_out[0] << std::endl;
}
{
measure_load_latency<2><<<blocks, threads_per_block>>>((int*)d_in, d_out,d_out_val,N);
cudaDeviceSynchronize();
cudaMemcpy(h_out, d_out, sizeof(int)*threads_per_block, cudaMemcpyDeviceToHost);
std::cout << "latency (in cycles): " << h_out[0] << std::endl;
}
{
measure_load_latency<3><<<blocks, threads_per_block>>>((int*)d_in, d_out,d_out_val,N);
cudaDeviceSynchronize();
cudaMemcpy(h_out, d_out, sizeof(int)*threads_per_block, cudaMemcpyDeviceToHost);
std::cout << "latency (in cycles): " << h_out[0] << std::endl;
}
{
measure_load_latency<4><<<blocks, threads_per_block>>>((int*)d_in, d_out,d_out_val,N);
cudaDeviceSynchronize();
cudaMemcpy(h_out, d_out, sizeof(int)*threads_per_block, cudaMemcpyDeviceToHost);
std::cout << "latency (in cycles): " << h_out[0] << std::endl;
}
cudaFree(d_in);
cudaFree(d_out);
cudaFree(d_out_val);
}
int main() {
const int N = 32;
int h_in[N] = {42}; // Sample input data
measure_load_latency_on_gpu(h_in, N);
measure_load_latency_on_gpu(h_in, N);
return 0;
}
EOF
/usr/local/cuda/bin/nvcc -std=c++17 -lineinfo sample_3.cu -o sample_3
./sample_3
Output:
-----------------------------------------------
latency (in cycles): 38
latency (in cycles): 42
latency (in cycles): 46
latency (in cycles): 50
-----------------------------------------------
latency (in cycles): 38
latency (in cycles): 42
latency (in cycles): 46
latency (in cycles): 50
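A linear fit of the measurements above (plain arithmetic on the printed values; the constant term presumably also absorbs the overhead of the two %%clock reads):

$$\text{cycles}(k) \approx 34 + 4k, \qquad k = \text{number of dependent fma instructions}$$

so each additional dependent fma costs 4 more cycles, matching the 4-cycle arithmetic latency assumed in section 5.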
14. Measuring the cycles needed by a shared-memory load instruction
tee sample_3.cu<<-'EOF'
#include <iostream>
#include <cuda_runtime.h>
template<int LoopCount>
__global__ void measure_load_latency(int *in, int *out,int *out_val, int N) {
int idx = threadIdx.x + blockIdx.x * blockDim.x;
if (idx >= N) return;
int temp[LoopCount];
__shared__ float shared[1024];
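// Note: shared[] is float while in[] and temp[] are int, so the timed loop below also
// includes a float->int conversion per iteration, not just the shared-memory load itself.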
shared[idx]=in[idx];
unsigned int start_cycle, end_cycle;
//%%clock:A predefined, read-only 32-bit unsigned cycle counter.
__syncthreads();
asm volatile ("mov.u32 %0, %%clock;" : "=r"(start_cycle) :: "memory");
#pragma unroll
for(int i=0;i<LoopCount;i++)
{
temp[i] = shared[idx+32*i];
}
asm volatile ("mov.u32 %0, %%clock;" : "=r"(end_cycle) :: "memory");
out[idx] = end_cycle - start_cycle;
int total=0;
for(int i=0;i<LoopCount;i++)
{
total+=temp[i];
}
out_val[idx]=total; //keep the result live so the compiler does not optimize the loads away
}
__host__ void measure_load_latency_on_gpu(int *h_in, int N) {
int *d_in, *d_out,*d_out_val;;
int h_out[N];
cudaMalloc((void**)&d_in, 128*N * sizeof(int));
cudaMalloc((void**)&d_out, 2*N * sizeof(int));
cudaMalloc((void**)&d_out_val, N * sizeof(int));
cudaMemcpy(d_in, h_in, N * sizeof(int), cudaMemcpyHostToDevice);
printf("-------------------------------------------\n");
int blocks = 1;
int threads_per_block = 32;
{
cudaCtxResetPersistingL2Cache();
measure_load_latency<1><<<blocks, threads_per_block>>>((int*)d_in, d_out,d_out_val,N);
cudaDeviceSynchronize();
cudaMemcpy(h_out, d_out, sizeof(int)*threads_per_block, cudaMemcpyDeviceToHost);
std::cout << "Load latency (in cycles): " << h_out[0] << std::endl;
}
{
cudaCtxResetPersistingL2Cache();
measure_load_latency<2><<<blocks, threads_per_block>>>((int*)d_in, d_out,d_out_val,N);
cudaDeviceSynchronize();
cudaMemcpy(h_out, d_out, sizeof(int)*threads_per_block, cudaMemcpyDeviceToHost);
std::cout << "Load latency (in cycles): " << h_out[0] << std::endl;
}
{
cudaCtxResetPersistingL2Cache();
measure_load_latency<3><<<blocks, threads_per_block>>>((int*)d_in, d_out,d_out_val,N);
cudaDeviceSynchronize();
cudaMemcpy(h_out, d_out, sizeof(int)*threads_per_block, cudaMemcpyDeviceToHost);
std::cout << "Load latency (in cycles): " << h_out[0] << std::endl;
}
{
cudaCtxResetPersistingL2Cache();
measure_load_latency<4><<<blocks, threads_per_block>>>((int*)d_in, d_out,d_out_val,N);
cudaDeviceSynchronize();
cudaMemcpy(h_out, d_out, sizeof(int)*threads_per_block, cudaMemcpyDeviceToHost);
std::cout << "Load latency (in cycles): " << h_out[0] << std::endl;
}
cudaFree(d_in);
cudaFree(d_out);
cudaFree(d_out_val);
}
int main() {
const int N = 32;
int h_in[N] = {42}; // Sample input data
measure_load_latency_on_gpu(h_in, N);
measure_load_latency_on_gpu(h_in, N);
return 0;
}
EOF
/usr/local/cuda/bin/nvcc -std=c++17 -lineinfo sample_3.cu -o sample_3
./sample_3
Output:
-------------------------------------------
Load latency (in cycles): 38
Load latency (in cycles): 69
Load latency (in cycles): 77
Load latency (in cycles): 85
-------------------------------------------
Load latency (in cycles): 38
Load latency (in cycles): 69
Load latency (in cycles): 77
Load latency (in cycles): 85
15. Measuring the cycles needed by a global load instruction (one load takes 51 cycles; each additional instruction adds 4 cycles)
tee sample_3.cu<<-'EOF'
#include <iostream>
#include <cuda_runtime.h>
template<int LoopCount>
__global__ void measure_load_latency(int *in, int *out,int *out_val, int N) {
int idx = threadIdx.x + blockIdx.x * blockDim.x;
if (idx >= N) return;
int temp[LoopCount];
unsigned int start_cycle, end_cycle;
//%%clock:A predefined, read-only 32-bit unsigned cycle counter.
__syncthreads();
asm volatile ("mov.u32 %0, %%clock;" : "=r"(start_cycle) :: "memory");
#pragma unroll
for(int i=0;i<LoopCount;i++)
{
temp[i] = in[idx+32*i];
}
asm volatile ("mov.u32 %0, %%clock;" : "=r"(end_cycle) :: "memory");
out[idx] = end_cycle - start_cycle;
int total=0;
for(int i=0;i<LoopCount;i++)
{
total+=temp[i];
}
out_val[idx]=total;
}
__host__ void measure_load_latency_on_gpu(int *h_in, int N) {
int *d_in, *d_out,*d_out_val;;
int h_out[N];
cudaMalloc((void**)&d_in, 128*N * sizeof(int));
cudaMalloc((void**)&d_out, 2*N * sizeof(int));
cudaMalloc((void**)&d_out_val, N * sizeof(int));
cudaMemcpy(d_in, h_in, N * sizeof(int), cudaMemcpyHostToDevice);
int blocks = 1;
int threads_per_block = 32;
{
cudaCtxResetPersistingL2Cache();
measure_load_latency<1><<<blocks, threads_per_block>>>((int*)d_in, d_out,d_out_val,N);
cudaDeviceSynchronize();
cudaMemcpy(h_out, d_out, sizeof(int)*threads_per_block, cudaMemcpyDeviceToHost);
std::cout << "Load latency (in cycles): " << h_out[0] << std::endl;
}
{
cudaCtxResetPersistingL2Cache();
measure_load_latency<2><<<blocks, threads_per_block>>>((int*)d_in, d_out,d_out_val,N);
cudaDeviceSynchronize();
cudaMemcpy(h_out, d_out, sizeof(int)*threads_per_block, cudaMemcpyDeviceToHost);
std::cout << "Load latency (in cycles): " << h_out[0] << std::endl;
}
{
cudaCtxResetPersistingL2Cache();
measure_load_latency<3><<<blocks, threads_per_block>>>((int*)d_in, d_out,d_out_val,N);
cudaDeviceSynchronize();
cudaMemcpy(h_out, d_out, sizeof(int)*threads_per_block, cudaMemcpyDeviceToHost);
std::cout << "Load latency (in cycles): " << h_out[0] << std::endl;
}
cudaFree(d_in);
cudaFree(d_out);
cudaFree(d_out_val);
}
int main() {
const int N = 32;
int h_in[N] = {42}; // Sample input data
measure_load_latency_on_gpu(h_in, N);
return 0;
}
EOF
/usr/local/cuda/bin/nvcc -std=c++17 -lineinfo sample_3.cu -o sample_3
./sample_3
Output:
Load latency (in cycles): 51
Load latency (in cycles): 55
Load latency (in cycles): 59
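As with the fma test, a linear fit of these values (arithmetic only) gives

$$\text{cycles}(k) \approx 47 + 4k, \qquad k = \text{number of global loads}$$

i.e. roughly 51 cycles until the first load completes and 4 more cycles per additional load issued back to back.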
Check whether the accesses are coalesced
/usr/local/NVIDIA-Nsight-Compute/ncu --metrics smsp__sass_inst_executed_op_global_ld.sum,memory_l2_theoretical_sectors_global,memory_l2_theoretical_sectors_global_ideal,l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum,sm__sass_l1tex_t_requests_pipe_lsu_mem_global_op_ldgsts_cache_access.sum,sm__sass_l1tex_t_requests_pipe_lsu_mem_global_op_ldgsts_cache_bypass.sum,l1tex__t_output_wavefronts_pipe_lsu_mem_global_op_ld.sum ./sample_3
Output:
Section: Command line profiler metrics
------------------------------------------------------------------------ ----------- ------------
Metric Name Metric Unit Metric Value
------------------------------------------------------------------------ ----------- ------------
l1tex__t_output_wavefronts_pipe_lsu_mem_global_op_ld.sum 1
l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum 1
memory_l2_theoretical_sectors_global sectors 12
memory_l2_theoretical_sectors_global_ideal sectors 12
sm__sass_l1tex_t_requests_pipe_lsu_mem_global_op_ldgsts_cache_access.sum 0
sm__sass_l1tex_t_requests_pipe_lsu_mem_global_op_ldgsts_cache_bypass.sum 0
smsp__sass_inst_executed_op_global_ld.sum inst 1
------------------------------------------------------------------------ ----------- ------------
void measure_load_latency<2>(int *, int *, int *, int) (1, 1, 1)x(32, 1, 1), Context 1, Stream 7, Device 0, CC 8.6
Section: Command line profiler metrics
------------------------------------------------------------------------ ----------- ------------
Metric Name Metric Unit Metric Value
------------------------------------------------------------------------ ----------- ------------
l1tex__t_output_wavefronts_pipe_lsu_mem_global_op_ld.sum 2
l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum 2
memory_l2_theoretical_sectors_global sectors 16
memory_l2_theoretical_sectors_global_ideal sectors 16
sm__sass_l1tex_t_requests_pipe_lsu_mem_global_op_ldgsts_cache_access.sum 0
sm__sass_l1tex_t_requests_pipe_lsu_mem_global_op_ldgsts_cache_bypass.sum 0
smsp__sass_inst_executed_op_global_ld.sum inst 2
------------------------------------------------------------------------ ----------- ------------
void measure_load_latency<3>(int *, int *, int *, int) (1, 1, 1)x(32, 1, 1), Context 1, Stream 7, Device 0, CC 8.6
Section: Command line profiler metrics
------------------------------------------------------------------------ ----------- ------------
Metric Name Metric Unit Metric Value
------------------------------------------------------------------------ ----------- ------------
l1tex__t_output_wavefronts_pipe_lsu_mem_global_op_ld.sum 3
l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum 3
memory_l2_theoretical_sectors_global sectors 20
memory_l2_theoretical_sectors_global_ideal sectors 20
sm__sass_l1tex_t_requests_pipe_lsu_mem_global_op_ldgsts_cache_access.sum 0
sm__sass_l1tex_t_requests_pipe_lsu_mem_global_op_ldgsts_cache_bypass.sum 0
smsp__sass_inst_executed_op_global_ld.sum inst 3
------------------------------------------------------------------------ ----------- ------------
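Reading the tables (a back-of-the-envelope check, assuming the usual 32-byte L2 sectors): each warp-wide access to 32 consecutive 4-byte ints touches 128 bytes = 4 sectors, and every kernel instance issues k global loads from in[] plus the two stores to out[] and out_val[], so

$$\text{sectors} = 4 \times (k + 2) = 12,\ 16,\ 20 \quad\text{for } k = 1, 2, 3,$$

which matches memory_l2_theoretical_sectors_global above. Since the theoretical value equals the *_ideal value, the accesses are fully coalesced.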
From the global-load latencies above, a load needs more than 50 cycles before its data arrives.
Next, the problem size is scaled up by 52x so that there are always instructions to issue while earlier loads are still in flight, which lets us measure how much of the DRAM bandwidth the kernel can use.
16. Scaling the problem size up by 52x
tee sample_2_opt.cu<<-'EOF'
#include <iostream>
#include <cuda_runtime.h>
#include <vector>
#include <stdio.h>
#include <assert.h>
#include <cstdio>
#include <cuda.h>
#define CHECK_CUDA(call) \
do { \
cudaError_t err = call; \
if (err != cudaSuccess) { \
std::cerr << "CUDA error at " << __FILE__ << ":" << __LINE__; \
std::cerr << " code=" << err << " (" << cudaGetErrorString(err) << ")" << std::endl; \
exit(EXIT_FAILURE); \
} \
} while (0)
__device__ __forceinline__ float warp_reduce_sum(float val) {
#if 1
#pragma unroll
for (int mask = warpSize >> 1; mask >= 1; mask >>= 1) {
val += __shfl_xor_sync(0xffffffff, val, mask);
}
return val;
#else
unsigned int mask = 0xffffffff;
for (int offset = warpSize / 2; offset > 0; offset /= 2) {
val += __shfl_down_sync(mask, val, offset);
}
return val;
#endif
}
__global__ void MyKernelOpt(float *a,float *b,float *c,float *output)
{
int tid = threadIdx.x + blockIdx.x * blockDim.x;
int lane = threadIdx.x % warpSize;
int wid = threadIdx.x / warpSize;
float d=0;
__shared__ float shared[32];
#if 1
float _a=a[tid];
float _b=b[tid];
float _c=c[tid];
//d=a*b+c
asm volatile("fma.rn.ftz.f32 %0,%1,%2,%3;" : "=f"(d) : "f"(_a),"f"(_b),"f"(_c));
#else
asm volatile("fma.rn.ftz.f32 %0,%1,%2,%3;" : "=f"(d) : "f"(a[tid]),"f"(b[tid]),"f"(c[tid]));
#endif
//warp-level partial sums, stored in shared memory
float val = warp_reduce_sum(d);
if (lane == 0) //lane 0 of each warp stores that warp's result
{
shared[wid] = val;
}
__syncthreads();
float sum=0;
if (threadIdx.x == 0) //the first thread of each block sums the per-warp results in shared memory
{
int warp_count = blockDim.x / warpSize;
for (int i = 0; i <warp_count ; i++){
sum += shared[i];
}
atomicAdd(output, sum);
}
}
int main(int argc,char *argv[])
{
int deviceid=0;
cudaSetDevice(deviceid);
cudaStream_t stream;
cudaStreamCreate(&stream);
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
int block_size=512;
int grid_size=28*52;// 52x larger than in sample_2
int total_count=block_size*grid_size;
int total_size=sizeof(float)*total_count;
float *host_a = (float *)malloc(total_size);
float *host_b = (float *)malloc(total_size);
float *host_c = (float *)malloc(total_size);
float *host_d = (float *)malloc(sizeof(float));
float *dev_a;
float *dev_b;
float *dev_c;
float *dev_d;
host_d[0]=0.0;
CHECK_CUDA(cudaMalloc(&dev_a, total_size));
CHECK_CUDA(cudaMalloc(&dev_b, total_size));
CHECK_CUDA(cudaMalloc(&dev_c, total_size));
CHECK_CUDA(cudaMalloc(&dev_d, sizeof(float)));
for(int i = 0; i < total_count; ++i) host_a[i] = i*0.001;
for(int i = 0; i < total_count; ++i) host_b[i] = i*0.002;
for(int i = 0; i < total_count; ++i) host_c[i] = i*0.003;
CHECK_CUDA(cudaMemcpy(dev_a, host_a, total_size,cudaMemcpyHostToDevice));
CHECK_CUDA(cudaMemcpy(dev_b, host_b, total_size,cudaMemcpyHostToDevice));
CHECK_CUDA(cudaMemcpy(dev_c, host_c, total_size,cudaMemcpyHostToDevice));
for(int i=0;i<2;i++)
{
CHECK_CUDA(cudaMemcpy(dev_a, host_a, total_size,cudaMemcpyHostToDevice));
CHECK_CUDA(cudaMemcpy(dev_b, host_b, total_size,cudaMemcpyHostToDevice));
CHECK_CUDA(cudaMemcpy(dev_c, host_c, total_size,cudaMemcpyHostToDevice));
CHECK_CUDA(cudaMemcpy(dev_d, host_d, sizeof(float),cudaMemcpyHostToDevice));
cudaEventRecord(start, stream);
MyKernelOpt<<<grid_size,block_size,sizeof(float)*32, stream>>>(dev_a,dev_b,dev_c,dev_d);
cudaEventRecord(stop, stream);
CHECK_CUDA(cudaEventSynchronize(stop));
float milliseconds = 0;
cudaEventElapsedTime(&milliseconds, start, stop);
printf("%d %.3f\n",i,milliseconds);
}
cudaMemcpy(host_d, dev_d,sizeof(float), cudaMemcpyDeviceToHost);
printf("output:%f\n",host_d[0]);
free(host_a);
free(host_b);
free(host_c);
free(host_d);
CHECK_CUDA(cudaFree(dev_a));
CHECK_CUDA(cudaFree(dev_b));
CHECK_CUDA(cudaFree(dev_c));
CHECK_CUDA(cudaFree(dev_d));
CHECK_CUDA(cudaEventDestroy(start));
CHECK_CUDA(cudaEventDestroy(stop));
CHECK_CUDA(cudaStreamDestroy(stream));
return 0;
}
EOF
/usr/local/cuda/bin/nvcc -std=c++17 -lineinfo sample_2_opt.cu -o sample_2_opt
./sample_2_opt
/usr/local/NVIDIA-Nsight-Compute/ncu --set full --export ncu_report_opt -f ./sample_2_opt
17. Summary
- The kernel is memory-bound; at the original size it was too small to saturate DRAM bandwidth, and after scaling it up it gets close to the theoretical memory throughput
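A rough estimate of why this size is enough to expose the DRAM bottleneck (assuming an RTX 3060 with roughly 360 GB/s of peak DRAM bandwidth; the exact number depends on the board):

$$3\ \text{arrays} \times 28 \times 52 \times 512 \times 4\,\text{B} \approx 8.9\,\text{MB read per launch} \;\Rightarrow\; 8.9\,\text{MB} / 360\,\text{GB/s} \approx 25\,\mu\text{s},$$

so the kernel time is dominated by streaming the three input arrays from DRAM, consistent with the memory-bound conclusion above.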
Original article: https://blog.csdn.net/m0_61864577/article/details/140618800