
Getting Started with Nsight Compute Profiling

This article is a hands-on demonstration of Nsight Compute.

1. Reference links

2. Download and install nsight-compute-linux-2024.2.1.2-34372528.run

/usr/local/NVIDIA-Nsight-Compute/ncu --version
NVIDIA (R) Nsight Compute Command Line Profiler
Copyright (c) 2018-2024 NVIDIA Corporation
Version 2024.2.1.0 (build 34372528) (public-release)

3. Query GPU device attributes with ncu

cd /usr/local/cuda/extras/CUPTI/samples/checkpoint_kernels
make
/usr/local/NVIDIA-Nsight-Compute/ncu --metrics device__attribute_limits_max_cta_per_sm,\
device__attribute_max_warps_per_multiprocessor,\
device__attribute_num_schedulers_per_multiprocessor,\
device__attribute_max_warps_per_scheduler,\
device__attribute_max_ipc_per_scheduler,\
device__attribute_max_ipc_per_multiprocessor,\
device__attribute_max_registers_per_thread ./checkpoint_kernels
cd -

Output

 --------------------------------------------------- ----------- ------------
 Metric Name                                         Metric Unit Metric Value
 --------------------------------------------------- ----------- ------------
 device__attribute_limits_max_cta_per_sm                                   16
 device__attribute_max_ipc_per_multiprocessor                               4
 device__attribute_max_ipc_per_scheduler                                    1
 device__attribute_max_registers_per_thread                               255
 device__attribute_max_warps_per_multiprocessor                            48
 device__attribute_max_warps_per_scheduler                                 12
 device__attribute_num_schedulers_per_multiprocessor                        4
 --------------------------------------------------- ----------- ------------

Explanation

device__attribute_limits_max_cta_per_sm              maximum number of CTAs (thread blocks) per SM           16
device__attribute_max_warps_per_multiprocessor       maximum number of warps per SM                          48
device__attribute_num_schedulers_per_multiprocessor  number of warp schedulers per SM                          4
device__attribute_max_warps_per_scheduler            maximum number of warps per scheduler                    12
device__attribute_max_ipc_per_scheduler              maximum instructions issued per scheduler per clock       1
device__attribute_max_ipc_per_multiprocessor         maximum instructions issued per SM per clock              4
device__attribute_max_registers_per_thread           maximum number of registers per thread                  255
As the CUDA C++ Programming Guide puts it: "The number of instructions required to hide a latency of L clock cycles depends on the respective throughputs of these instructions:
4L for devices of compute capability 5.x, 6.1, 6.2, 7.x and 8.x since for these devices,
a multiprocessor issues one instruction per warp over one clock cycle for four warps at a time, as mentioned in Compute Capabilities."
From the Arithmetic Instructions table we can see that one SM can process 4 multiply-add instructions per cycle.
Instruction latency is affected by many factors (register dependencies, other in-flight instructions, whether operands are ready, and so on), so more threads are needed to hide it.
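For example, taking an arithmetic latency of about L = 4 cycles on this device (as assumed in section 5 below), roughly 4*L = 16 instructions from other warps must be ready to issue before a dependent result is needed, which is why far more warps are kept resident than the 4 a scheduler group can issue per clock.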

4. Query GPU information with the CUDA runtime API

tee devinfo.cu<<-'EOF'
#include <iostream>
#include <cuda_runtime.h>
int main()
{
    int deviceid=0;
    cudaSetDevice(deviceid);  
    cudaDeviceProp deviceProp;
    cudaGetDeviceProperties(&deviceProp, deviceid);

    int maxThreadsPerBlock = deviceProp.maxThreadsPerBlock;
    int sharedMemoryPerBlock = deviceProp.sharedMemPerBlock;
    int maxBlocksPerMultiprocessor = deviceProp.maxBlocksPerMultiProcessor;
    int smCount = deviceProp.multiProcessorCount;

    std::cout << "Device name: " << deviceProp.name << std::endl;
    std::cout << "Max threads per block: " << maxThreadsPerBlock << std::endl;
    std::cout << "Shared memory per block: " << sharedMemoryPerBlock << " bytes" << std::endl;
    std::cout << "Max blocks per SM: " << maxBlocksPerMultiprocessor << std::endl;
    std::cout << "Number of SMs: " << smCount << std::endl;
}
EOF
/usr/local/cuda/bin/nvcc -std=c++17 -arch=sm_70 -o devinfo devinfo.cu -I /usr/local/cuda/include -L /usr/local/cuda/lib64  -lcuda
./devinfo

Output

Device name: NVIDIA GeForce RTX 3060
Max threads per block: 1024
Shared memory per block: 49152 bytes
Max blocks per SM: 16
Number of SMs: 28
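
The per-SM thread limit can be queried as well; a one-line addition to the program above (maxThreadsPerMultiProcessor is a standard cudaDeviceProp field):

    std::cout << "Max threads per SM: " << deviceProp.maxThreadsPerMultiProcessor << std::endl;

On this CC 8.6 device it should print 1536, i.e. 48 warps of 32 threads per SM, matching device__attribute_max_warps_per_multiprocessor from section 3.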

5. How to saturate one SM from the compute side:

  • Each SM can issue at most 4 instructions per clock, and a warp executes one instruction at a time
  • That is, one SM issues instructions for at most 32*4 = 128 threads per clock
  • To better hide latency (the multiply-add latency is about 4 cycles), set the block size to 4*32*4 = 512 threads; a host-side sizing sketch follows this list
  • With 28 SMs, set the grid size to 28
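
A minimal host-side sketch of this sizing logic (the 4-cycle latency factor and the one-block-per-SM choice are the assumptions from the bullets above; this program is not part of the original post):

#include <cstdio>
#include <cuda_runtime.h>

int main() {
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);

    const int warps_per_issue = 4;  // 4 schedulers per SM, each issues 1 warp per clock (section 3)
    const int latency_factor  = 4;  // assumed ~4-cycle multiply-add latency to hide
    int block_size = warps_per_issue * prop.warpSize * latency_factor;  // 4*32*4 = 512
    int grid_size  = prop.multiProcessorCount;                          // one block per SM

    printf("block_size=%d grid_size=%d\n", block_size, grid_size);
    return 0;
}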

6. NumPy reference implementation: sum(a*b+c)

tee sample_2.py<<-'EOF'
import numpy as np
grid_count=28
block_count=512
total_count=grid_count*block_count
input_a=np.arange(total_count,dtype=np.float32)*0.001
input_b=np.arange(total_count,dtype=np.float32)*0.002
input_c=np.arange(total_count,dtype=np.float32)*0.003
output_d=input_a*input_b+input_c
output_reduce_sum=np.sum(output_d).astype(np.float32)
print(output_reduce_sum)
EOF
python sample_2.py

Output

2272286.2

7. CUDA implementation

tee sample_2.cu<<-'EOF'
#include <iostream>
#include <cuda_runtime.h>
#include <vector>
#include <stdio.h>
#include <assert.h>
#include <cstdio>
#include <cuda.h>

#define CHECK_CUDA(call)                                           \
    do {                                                           \
        cudaError_t err = call;                                    \
        if (err != cudaSuccess) {                                  \
            std::cerr << "CUDA error at " << __FILE__ << ":" << __LINE__; \
            std::cerr << " code=" << err << " (" << cudaGetErrorString(err) << ")" << std::endl; \
            exit(EXIT_FAILURE);                                    \
        }                                                          \
    } while (0)

__device__ __forceinline__ float warp_reduce_sum(float val) {
#if 1
  #pragma unroll
  for (int mask = warpSize >> 1; mask >= 1; mask >>= 1) {
    val += __shfl_xor_sync(0xffffffff, val, mask);
  }
  return val;
#else 
    unsigned int mask = 0xffffffff;
    for (int offset = warpSize / 2; offset > 0; offset /= 2) {
        val += __shfl_down_sync(mask, val, offset);
    }
    return val;
#endif  
}

__global__ void MyKernel(float *a,float *b,float *c,float *output)
{
    int tid  = threadIdx.x + blockIdx.x * blockDim.x;
    int lane = threadIdx.x % warpSize;
    int wid = threadIdx.x / warpSize;
    
    float d=0;    
    __shared__ float shared[32];
#if 1
float _a=a[tid];
float _b=b[tid];
float _c=c[tid];  
    //d=a*b+c
    asm volatile("fma.rn.ftz.f32 %0,%1,%2,%3;" : "=f"(d) : "f"(_a),"f"(_b),"f"(_c));    
#else
    asm volatile("fma.rn.ftz.f32 %0,%1,%2,%3;" : "=f"(d) : "f"(a[tid]),"f"(b[tid]),"f"(c[tid]));          
#endif
    
    // per-warp partial sum, stored in shared memory
    float val = warp_reduce_sum(d);
    if (lane == 0) // one thread per warp stores the warp result
    {
       shared[wid] = val;
    }
    __syncthreads();
    
    float sum=0;
    if (threadIdx.x == 0) // the first thread of each block sums the per-warp results in shared memory
    {
        int warp_count = blockDim.x / warpSize;
        for (int i = 0; i <warp_count ; i++){
            sum += shared[i];
        }
        atomicAdd(output, sum);
    }
}

int main(int argc,char *argv[])
{
    int deviceid=0;
    cudaSetDevice(deviceid);
    cudaStream_t stream;
    cudaStreamCreate(&stream);

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    int block_size=512;
    int grid_size=28;
    
    int total_count=block_size*grid_size;
    int total_size=sizeof(float)*total_count;
    
    float *host_a = (float *)malloc(total_size);
    float *host_b = (float *)malloc(total_size);
    float *host_c = (float *)malloc(total_size);
    float *host_d = (float *)malloc(sizeof(float));
    
    float *dev_a;
    float *dev_b;
    float *dev_c;
    float *dev_d;
    
    host_d[0]=0.0;
    CHECK_CUDA(cudaMalloc(&dev_a, total_size));
    CHECK_CUDA(cudaMalloc(&dev_b, total_size));
    CHECK_CUDA(cudaMalloc(&dev_c, total_size));
    CHECK_CUDA(cudaMalloc(&dev_d, sizeof(float)));
    
    for(int i = 0; i < total_count; ++i) host_a[i] = i*0.001;
    for(int i = 0; i < total_count; ++i) host_b[i] = i*0.002;
    for(int i = 0; i < total_count; ++i) host_c[i] = i*0.003;
    
    CHECK_CUDA(cudaMemcpy(dev_a, host_a, total_size,cudaMemcpyHostToDevice));
    CHECK_CUDA(cudaMemcpy(dev_b, host_b, total_size,cudaMemcpyHostToDevice));
    CHECK_CUDA(cudaMemcpy(dev_c, host_c, total_size,cudaMemcpyHostToDevice));

    for(int i=0;i<2;i++)
    {
        CHECK_CUDA(cudaMemcpy(dev_a, host_a, total_size,cudaMemcpyHostToDevice));
        CHECK_CUDA(cudaMemcpy(dev_b, host_b, total_size,cudaMemcpyHostToDevice));
        CHECK_CUDA(cudaMemcpy(dev_c, host_c, total_size,cudaMemcpyHostToDevice));    
        CHECK_CUDA(cudaMemcpy(dev_d, host_d, sizeof(float),cudaMemcpyHostToDevice));
        cudaEventRecord(start, stream);
        MyKernel<<<grid_size,block_size,sizeof(float)*32, stream>>>(dev_a,dev_b,dev_c,dev_d);
        cudaEventRecord(stop, stream);
        CHECK_CUDA(cudaEventSynchronize(stop));
        float milliseconds = 0;
        cudaEventElapsedTime(&milliseconds, start, stop);
        printf("%d %.3f\n",i,milliseconds);
    }        
    cudaMemcpy(host_d, dev_d,sizeof(float), cudaMemcpyDeviceToHost);
    printf("output:%f\n",host_d[0]);    
    
    free(host_a);
    free(host_b);
    free(host_c);
    free(host_d);
    
    CHECK_CUDA(cudaFree(dev_a));
    CHECK_CUDA(cudaFree(dev_b));
    CHECK_CUDA(cudaFree(dev_c));
    CHECK_CUDA(cudaFree(dev_d));
    
    CHECK_CUDA(cudaEventDestroy(start));
    CHECK_CUDA(cudaEventDestroy(stop));
    CHECK_CUDA(cudaStreamDestroy(stream));
    return 0;
}
EOF
/usr/local/cuda/bin/nvcc -std=c++17 -lineinfo sample_2.cu -o sample_2
./sample_2

Output

2272286.000000
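
The result (2272286.000000) differs from the NumPy value (2272286.2) in the last digit because the GPU accumulates entirely in float32 and in a different order, so the rounding differs.

As an aside, the warp/block reduction above can also be written with cooperative groups instead of hand-rolled shuffles. A minimal sketch (assumes CUDA 11 or newer; the kernel name MyKernelCg and the use of fmaf instead of inline PTX are my own choices, not from the original code):

#include <cooperative_groups.h>
#include <cooperative_groups/reduce.h>
namespace cg = cooperative_groups;

__global__ void MyKernelCg(const float *a, const float *b, const float *c, float *output)
{
    cg::thread_block block = cg::this_thread_block();
    cg::thread_block_tile<32> warp = cg::tiled_partition<32>(block);

    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    float d = fmaf(a[tid], b[tid], c[tid]);                    // d = a*b + c

    __shared__ float shared[32];
    float warp_sum = cg::reduce(warp, d, cg::plus<float>());   // warp-level sum
    if (warp.thread_rank() == 0)
        shared[warp.meta_group_rank()] = warp_sum;              // one slot per warp
    block.sync();

    if (threadIdx.x == 0) {                                     // first thread sums the per-warp results
        float sum = 0.f;
        for (unsigned i = 0; i < blockDim.x / 32; ++i) sum += shared[i];
        atomicAdd(output, sum);
    }
}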

8. Common compilation commands

# Generate PTX from the device code
/usr/local/cuda/bin/nvcc -std=c++17 -maxrregcount=24 -arch=sm_86 -ptx sample_2.cu -o sample_2.ptx
# Generate a cubin from the PTX
/usr/local/cuda/bin/nvcc -maxrregcount=24 -O0 -arch=sm_86 sample_2.ptx -cubin -o sample_2.cubin
# Generate a fatbin from the cubin
/usr/local/cuda/bin/nvcc -maxrregcount=24 -arch=sm_86 sample_2.cubin -fatbin -o sample_2.fatbin
# Link object files with relocatable device code and .ptx, .cubin, and .fatbin files into an object file with executable device code, which can be passed to the host linker.
/usr/local/cuda/bin/nvcc -maxrregcount=24 --gpu-architecture=sm_86 --device-link sample_2.fatbin --output-file link.o
# Compile the host code
/usr/local/cuda/bin/nvcc -std=c++17 -c sample_2.cu -o sample_2.o
# Link the host and device code into an executable
/usr/local/cuda/bin/nvcc -o sample_2 sample_2.o link.o  -L /usr/local/cuda/lib64 -lcuda
# Show resource usage
/usr/local/cuda/bin/cuobjdump --dump-resource-usage sample_2.fatbin
# Show the PTX instructions
cat sample_2.ptx
# Show the SASS instructions
/usr/local/cuda/bin/cuobjdump --dump-sass sample_2.fatbin
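# Alternative way to view SASS: nvdisasm ships with the CUDA toolkit and, to my knowledge, accepts cubin files directly
/usr/local/cuda/bin/nvdisasm sample_2.cubin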

9. List the sections supported by Nsight Compute

/usr/local/NVIDIA-Nsight-Compute/ncu --list-sections

C2CLink                                     
ComputeWorkloadAnalysis                     
InstructionStats                            
LaunchStats                                 
MemoryWorkloadAnalysis                      
MemoryWorkloadAnalysis_Chart                
MemoryWorkloadAnalysis_Tables               
NumaAffinity                                
Nvlink                                      
Nvlink_Tables                               
Nvlink_Topology                             
Occupancy                                   
PmSampling                                  
PmSampling_WarpStates                       
SchedulerStats                              
SourceCounters                              
SpeedOfLight                                
SpeedOfLight_HierarchicalDoubleRooflineChart
SpeedOfLight_HierarchicalHalfRooflineChart  
SpeedOfLight_HierarchicalSingleRooflineChart
SpeedOfLight_HierarchicalTensorRooflineChart
SpeedOfLight_RooflineChart                  
WarpStateStats                              
WorkloadDistribution

10. Default section storage path (these files show the metrics, formulas, and Python rule scripts Nsight Compute uses for its performance estimates)

ls -l /root/Documents/NVIDIA\ Nsight\ Compute/2024.2.1/Sections/*

'/root/Documents/NVIDIA Nsight Compute/2024.2.1/Sections/AchievedOccupancy.py'
'/root/Documents/NVIDIA Nsight Compute/2024.2.1/Sections/C2CLink.section'
'/root/Documents/NVIDIA Nsight Compute/2024.2.1/Sections/Compute.py'
'/root/Documents/NVIDIA Nsight Compute/2024.2.1/Sections/ComputeWorkloadAnalysis.section'
'/root/Documents/NVIDIA Nsight Compute/2024.2.1/Sections/CPIStall.py'
'/root/Documents/NVIDIA Nsight Compute/2024.2.1/Sections/FPInstructions.py'
'/root/Documents/NVIDIA Nsight Compute/2024.2.1/Sections/HighPipeUtilization.py'
'/root/Documents/NVIDIA Nsight Compute/2024.2.1/Sections/InstructionStatistics.section'
'/root/Documents/NVIDIA Nsight Compute/2024.2.1/Sections/IssueSlotUtilization.py'
'/root/Documents/NVIDIA Nsight Compute/2024.2.1/Sections/LaunchStatistics.py'
'/root/Documents/NVIDIA Nsight Compute/2024.2.1/Sections/LaunchStatistics.section'
'/root/Documents/NVIDIA Nsight Compute/2024.2.1/Sections/MemoryApertureUsage.py'
'/root/Documents/NVIDIA Nsight Compute/2024.2.1/Sections/MemoryCacheAccessPattern.py'
'/root/Documents/NVIDIA Nsight Compute/2024.2.1/Sections/MemoryL2Compression.py'
'/root/Documents/NVIDIA Nsight Compute/2024.2.1/Sections/Memory.py'
'/root/Documents/NVIDIA Nsight Compute/2024.2.1/Sections/MemoryWorkloadAnalysis_Chart.section'
'/root/Documents/NVIDIA Nsight Compute/2024.2.1/Sections/MemoryWorkloadAnalysis.section'
'/root/Documents/NVIDIA Nsight Compute/2024.2.1/Sections/MemoryWorkloadAnalysis_Tables.section'
'/root/Documents/NVIDIA Nsight Compute/2024.2.1/Sections/NumaAffinity.section'
'/root/Documents/NVIDIA Nsight Compute/2024.2.1/Sections/Nvlink.section'
'/root/Documents/NVIDIA Nsight Compute/2024.2.1/Sections/Nvlink_Tables.section'
'/root/Documents/NVIDIA Nsight Compute/2024.2.1/Sections/Nvlink_Topology.section'
'/root/Documents/NVIDIA Nsight Compute/2024.2.1/Sections/NvRules.py'
'/root/Documents/NVIDIA Nsight Compute/2024.2.1/Sections/Occupancy.section'
'/root/Documents/NVIDIA Nsight Compute/2024.2.1/Sections/PCSamplingData.py'
'/root/Documents/NVIDIA Nsight Compute/2024.2.1/Sections/PmSampling.section'
'/root/Documents/NVIDIA Nsight Compute/2024.2.1/Sections/PmSampling_WarpStates.section'
'/root/Documents/NVIDIA Nsight Compute/2024.2.1/Sections/RequestedMetrics.py'
'/root/Documents/NVIDIA Nsight Compute/2024.2.1/Sections/SchedulerStatistics.section'
'/root/Documents/NVIDIA Nsight Compute/2024.2.1/Sections/SharedMemoryConflicts.py'
'/root/Documents/NVIDIA Nsight Compute/2024.2.1/Sections/SlowPipeLimiter.py'
'/root/Documents/NVIDIA Nsight Compute/2024.2.1/Sections/SourceCounters.section'
'/root/Documents/NVIDIA Nsight Compute/2024.2.1/Sections/SpeedOfLight_HierarchicalDoubleRooflineChart.section'
'/root/Documents/NVIDIA Nsight Compute/2024.2.1/Sections/SpeedOfLight_HierarchicalHalfRooflineChart.section'
'/root/Documents/NVIDIA Nsight Compute/2024.2.1/Sections/SpeedOfLight_HierarchicalSingleRooflineChart.section'
'/root/Documents/NVIDIA Nsight Compute/2024.2.1/Sections/SpeedOfLight_HierarchicalTensorRooflineChart.section'
'/root/Documents/NVIDIA Nsight Compute/2024.2.1/Sections/SpeedOfLight.py'
'/root/Documents/NVIDIA Nsight Compute/2024.2.1/Sections/SpeedOfLight_RooflineChart.section'
'/root/Documents/NVIDIA Nsight Compute/2024.2.1/Sections/SpeedOfLight_Roofline.py'
'/root/Documents/NVIDIA Nsight Compute/2024.2.1/Sections/SpeedOfLight.section'
'/root/Documents/NVIDIA Nsight Compute/2024.2.1/Sections/TheoreticalOccupancy.py'
'/root/Documents/NVIDIA Nsight Compute/2024.2.1/Sections/ThreadDivergence.py'
'/root/Documents/NVIDIA Nsight Compute/2024.2.1/Sections/UncoalescedAccess.chart'
'/root/Documents/NVIDIA Nsight Compute/2024.2.1/Sections/UncoalescedAccess.py'
'/root/Documents/NVIDIA Nsight Compute/2024.2.1/Sections/UncoalescedSharedAccess.chart'
'/root/Documents/NVIDIA Nsight Compute/2024.2.1/Sections/UncoalescedSharedAccess.py'
'/root/Documents/NVIDIA Nsight Compute/2024.2.1/Sections/version.txt'
'/root/Documents/NVIDIA Nsight Compute/2024.2.1/Sections/WarpStateStatistics.section'
'/root/Documents/NVIDIA Nsight Compute/2024.2.1/Sections/WorkloadDistribution.section'
'/root/Documents/NVIDIA Nsight Compute/2024.2.1/Sections/WorkloadImbalance.py'

11. Common ncu commands

# Collect the full metric set and export a report
/usr/local/NVIDIA-Nsight-Compute/ncu --set full --export ncu_report -f ./sample_2

# Collect a specific section
/usr/local/NVIDIA-Nsight-Compute/ncu --section ComputeWorkloadAnalysis --print-details all ./sample_2

# Collect a specific section together with specific metrics
/usr/local/NVIDIA-Nsight-Compute/ncu --section WarpStateStats --metrics smsp__pcsamp_sample_count,group:smsp__pcsamp_warp_stall_reasons,group:smsp__pcsamp_warp_stall_reasons_not_issued ./sample_2

/usr/local/NVIDIA-Nsight-Compute/ncu --metrics group:smsp__pcsamp_warp_stall_reasons ./sample_2
/usr/local/NVIDIA-Nsight-Compute/ncu --metrics group:smsp__pcsamp_warp_stall_reasons_not_issued ./sample_2
/usr/local/NVIDIA-Nsight-Compute/ncu --metrics group:memory__dram_table ./sample_2

/usr/local/NVIDIA-Nsight-Compute/ncu --metrics gpu__compute_memory_throughput.avg.pct_of_peak_sustained_elapsed,breakdown:gpu__compute_memory_throughput.avg.pct_of_peak_sustained_elapsed  ./sample_2
/usr/local/NVIDIA-Nsight-Compute/ncu --metrics sm__throughput.avg.pct_of_peak_sustained_elapsed,breakdown:sm__throughput.avg.pct_of_peak_sustained_elapsed ./sample_2
/usr/local/NVIDIA-Nsight-Compute/ncu --metrics l1tex__lsuin_requests,l1tex__f_wavefronts,dram__sectors,dramc__sectors,fbpa__dram_sectors,l1tex__data_bank_reads,l1tex__data_pipe_lsu_wavefronts,l1tex__lsuin_requests,l1tex__t_bytes_lookup_miss  ./sample_2

/usr/local/NVIDIA-Nsight-Compute/ncu --metrics sass__inst_executed_per_opcode,sm__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active --target-processes all --export ncu_report -f ./sample_2
# Print per-instance values from a report
/usr/local/NVIDIA-Nsight-Compute/ncu -i ncu_report.ncu-rep --print-details all --print-units auto --print-metric-instances values
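# Re-emit an imported report as CSV for further post-processing (--csv and --page are standard ncu options, to the best of my knowledge)
/usr/local/NVIDIA-Nsight-Compute/ncu -i ncu_report.ncu-rep --page details --csv > ncu_report.csv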

12. First use the GPU Speed Of Light Throughput section to identify the high-level bottleneck, then follow its guidance to the relevant detail sections

(Screenshots of the Nsight Compute report pages omitted.)

13. Measure the cycle count of FMA instructions (one FMA shows a latency of 38 cycles here; each additional dependent FMA adds 4 cycles)

tee sample_3.cu<<-'EOF'
#include <iostream>
#include <cuda_runtime.h>

template<int LoopCount>
__global__ void measure_load_latency(int *in, int *out, int *out_val, int N) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx >= N) return;
    unsigned int start_cycle, end_cycle;
    //%%clock: a predefined, read-only 32-bit unsigned cycle counter.
    
    float _a = in[idx];
    float _b = in[idx];
    float _c = in[idx];

    __syncthreads();    
    asm volatile ("mov.u32 %0, %%clock;" : "=r"(start_cycle) :: "memory");
    #pragma unroll
    for(int i=0;i<LoopCount;i++)
    {
        asm volatile("fma.rn.ftz.f32 %0,%1,%2,%3;" : "=f"(_c) : "f"(_a),"f"(_b),"f"(_c));
    }
    asm volatile ("mov.u32 %0, %%clock;" : "=r"(end_cycle) :: "memory");        
    out[idx] = end_cycle - start_cycle;
    out_val[idx]=(int)_c;
}

__host__ void measure_load_latency_on_gpu(int *h_in, int N) {
    int *d_in, *d_out, *d_out_val;
    int h_out[N];

    cudaMalloc((void**)&d_in, 128*N * sizeof(int));
    cudaMalloc((void**)&d_out,  2*N * sizeof(int));
    cudaMalloc((void**)&d_out_val,  N * sizeof(int));
    printf("-----------------------------------------------\n");
    cudaMemcpy(d_in, h_in, N * sizeof(int), cudaMemcpyHostToDevice);
    
    int blocks = 1;
    int threads_per_block = 32;
    {
        measure_load_latency<1><<<blocks, threads_per_block>>>((int*)d_in, d_out,d_out_val,N);
        cudaDeviceSynchronize();    
        cudaMemcpy(h_out, d_out, sizeof(int)*threads_per_block, cudaMemcpyDeviceToHost);    
        std::cout << "latency (in cycles): " << h_out[0] << std::endl;
    }
    {
        measure_load_latency<2><<<blocks, threads_per_block>>>((int*)d_in, d_out,d_out_val,N);
        cudaDeviceSynchronize();    
        cudaMemcpy(h_out, d_out, sizeof(int)*threads_per_block, cudaMemcpyDeviceToHost);    
        std::cout << "latency (in cycles): " << h_out[0] << std::endl;
    }
    {
        measure_load_latency<3><<<blocks, threads_per_block>>>((int*)d_in, d_out,d_out_val,N);
        cudaDeviceSynchronize();    
        cudaMemcpy(h_out, d_out, sizeof(int)*threads_per_block, cudaMemcpyDeviceToHost);    
        std::cout << "latency (in cycles): " << h_out[0] << std::endl;
    }
    {
        measure_load_latency<4><<<blocks, threads_per_block>>>((int*)d_in, d_out,d_out_val,N);
        cudaDeviceSynchronize();    
        cudaMemcpy(h_out, d_out, sizeof(int)*threads_per_block, cudaMemcpyDeviceToHost);    
        std::cout << "latency (in cycles): " << h_out[0] << std::endl;
    }    
    cudaFree(d_in);
    cudaFree(d_out);
    cudaFree(d_out_val);
}

int main() {
    const int N = 32;
    int h_in[N] = {42};  // Sample input data
    measure_load_latency_on_gpu(h_in, N);    
    measure_load_latency_on_gpu(h_in, N);    
    return 0;
}
EOF
/usr/local/cuda/bin/nvcc -std=c++17 -lineinfo sample_3.cu -o sample_3
./sample_3

Output

-----------------------------------------------
latency (in cycles): 38
latency (in cycles): 42
latency (in cycles): 46
latency (in cycles): 50
-----------------------------------------------
latency (in cycles): 38
latency (in cycles): 42
latency (in cycles): 46
latency (in cycles): 50
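
Reading off the numbers: the measured time grows by exactly 4 cycles per additional dependent FMA, i.e. roughly cycles(N) ≈ 34 + 4*N. The 4-cycle increment matches the arithmetic latency assumed in section 5, and the constant part presumably covers the two %%clock reads plus issue overhead around the measured region.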

14. Measure the cycle count of shared memory load instructions

tee sample_3.cu<<-'EOF'
#include <iostream>
#include <cuda_runtime.h>

template<int LoopCount>
__global__ void measure_load_latency(int *in, int *out,int *out_val, int N) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx >= N) return;
    int temp[LoopCount];
    __shared__ float shared[1024];
    shared[idx]=in[idx];
    unsigned int start_cycle, end_cycle;
    //%%clock:A predefined, read-only 32-bit unsigned cycle counter.
    __syncthreads();
    asm volatile ("mov.u32 %0, %%clock;" : "=r"(start_cycle) :: "memory");
    #pragma unroll
    for(int i=0;i<LoopCount;i++)
    {
        temp[i] = shared[idx+32*i];
    }
    asm volatile ("mov.u32 %0, %%clock;" : "=r"(end_cycle) :: "memory");        
    out[idx] = end_cycle - start_cycle;
    int total=0;
    for(int i=0;i<LoopCount;i++)
    {
        total+=temp[i];
    }
    out_val[idx]=total; // keep the loads from being optimized away by the compiler
}

__host__ void measure_load_latency_on_gpu(int *h_in, int N) {
    int *d_in, *d_out, *d_out_val;
    int h_out[N];

    cudaMalloc((void**)&d_in, 128*N * sizeof(int));
    cudaMalloc((void**)&d_out,  2*N * sizeof(int));
    cudaMalloc((void**)&d_out_val,  N * sizeof(int));
    
    cudaMemcpy(d_in, h_in, N * sizeof(int), cudaMemcpyHostToDevice);
    printf("-------------------------------------------\n");
    int blocks = 1;
    int threads_per_block = 32;
    {
        cudaCtxResetPersistingL2Cache();
        measure_load_latency<1><<<blocks, threads_per_block>>>((int*)d_in, d_out,d_out_val,N);
        cudaDeviceSynchronize();    
        cudaMemcpy(h_out, d_out, sizeof(int)*threads_per_block, cudaMemcpyDeviceToHost);    
        std::cout << "Load latency (in cycles): " << h_out[0] << std::endl;
    }
    {
        cudaCtxResetPersistingL2Cache();
        measure_load_latency<2><<<blocks, threads_per_block>>>((int*)d_in, d_out,d_out_val,N);
        cudaDeviceSynchronize();    
        cudaMemcpy(h_out, d_out, sizeof(int)*threads_per_block, cudaMemcpyDeviceToHost);    
        std::cout << "Load latency (in cycles): " << h_out[0] << std::endl;
    }
    {
        cudaCtxResetPersistingL2Cache();
        measure_load_latency<3><<<blocks, threads_per_block>>>((int*)d_in, d_out,d_out_val,N);
        cudaDeviceSynchronize();    
        cudaMemcpy(h_out, d_out, sizeof(int)*threads_per_block, cudaMemcpyDeviceToHost);    
        std::cout << "Load latency (in cycles): " << h_out[0] << std::endl;
    }
    {
        cudaCtxResetPersistingL2Cache();
        measure_load_latency<4><<<blocks, threads_per_block>>>((int*)d_in, d_out,d_out_val,N);
        cudaDeviceSynchronize();    
        cudaMemcpy(h_out, d_out, sizeof(int)*threads_per_block, cudaMemcpyDeviceToHost);    
        std::cout << "Load latency (in cycles): " << h_out[0] << std::endl;
    }
    cudaFree(d_in);
    cudaFree(d_out);
    cudaFree(d_out_val);
}

int main() {
    const int N = 32;
    int h_in[N] = {42};  // Sample input data
    measure_load_latency_on_gpu(h_in, N);    
    measure_load_latency_on_gpu(h_in, N);    
    return 0;
}
EOF
/usr/local/cuda/bin/nvcc -std=c++17 -lineinfo sample_3.cu -o sample_3
./sample_3

Output

-------------------------------------------
Load latency (in cycles): 38
Load latency (in cycles): 69
Load latency (in cycles): 77
Load latency (in cycles): 85
-------------------------------------------
Load latency (in cycles): 38
Load latency (in cycles): 69
Load latency (in cycles): 77
Load latency (in cycles): 85
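
Reading off the numbers: going from one to two loads jumps from 38 to 69 cycles, and each further shared-memory load adds about 8 cycles (69 → 77 → 85), roughly twice the per-instruction increment of the FMA chain above; take this as an observation on this particular device rather than a general rule.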

15. Measure the cycle count of global load instructions (one load takes 51 cycles here; each additional load adds 4 cycles)

tee sample_3.cu<<-'EOF'
#include <iostream>
#include <cuda_runtime.h>

template<int LoopCount>
__global__ void measure_load_latency(int *in, int *out,int *out_val, int N) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx >= N) return;
    int temp[LoopCount];
    unsigned int start_cycle, end_cycle;
    //%%clock:A predefined, read-only 32-bit unsigned cycle counter.
    __syncthreads();
    asm volatile ("mov.u32 %0, %%clock;" : "=r"(start_cycle) :: "memory");
    #pragma unroll
    for(int i=0;i<LoopCount;i++)
    {
        temp[i] = in[idx+32*i];
    }
    asm volatile ("mov.u32 %0, %%clock;" : "=r"(end_cycle) :: "memory");        
    out[idx] = end_cycle - start_cycle;
    int total=0;
    for(int i=0;i<LoopCount;i++)
    {
        total+=temp[i];
    }
    out_val[idx]=total;
}

__host__ void measure_load_latency_on_gpu(int *h_in, int N) {
    int *d_in, *d_out, *d_out_val;
    int h_out[N];

    cudaMalloc((void**)&d_in, 128*N * sizeof(int));
    cudaMalloc((void**)&d_out,  2*N * sizeof(int));
    cudaMalloc((void**)&d_out_val,  N * sizeof(int));
    
    cudaMemcpy(d_in, h_in, N * sizeof(int), cudaMemcpyHostToDevice);
    
    int blocks = 1;
    int threads_per_block = 32;
    {
        cudaCtxResetPersistingL2Cache();
        measure_load_latency<1><<<blocks, threads_per_block>>>((int*)d_in, d_out,d_out_val,N);
        cudaDeviceSynchronize();    
        cudaMemcpy(h_out, d_out, sizeof(int)*threads_per_block, cudaMemcpyDeviceToHost);    
        std::cout << "Load latency (in cycles): " << h_out[0] << std::endl;
    }
    {
        cudaCtxResetPersistingL2Cache();
        measure_load_latency<2><<<blocks, threads_per_block>>>((int*)d_in, d_out,d_out_val,N);
        cudaDeviceSynchronize();    
        cudaMemcpy(h_out, d_out, sizeof(int)*threads_per_block, cudaMemcpyDeviceToHost);    
        std::cout << "Load latency (in cycles): " << h_out[0] << std::endl;
    }
    {
        cudaCtxResetPersistingL2Cache();
        measure_load_latency<3><<<blocks, threads_per_block>>>((int*)d_in, d_out,d_out_val,N);
        cudaDeviceSynchronize();    
        cudaMemcpy(h_out, d_out, sizeof(int)*threads_per_block, cudaMemcpyDeviceToHost);    
        std::cout << "Load latency (in cycles): " << h_out[0] << std::endl;
    }
    cudaFree(d_in);
    cudaFree(d_out);
    cudaFree(d_out_val);
}

int main() {
    const int N = 32;
    int h_in[N] = {42};  // Sample input data
    measure_load_latency_on_gpu(h_in, N);    
    return 0;
}
EOF
/usr/local/cuda/bin/nvcc -std=c++17 -lineinfo sample_3.cu -o sample_3
./sample_3

Output

Load latency (in cycles): 51
Load latency (in cycles): 55
Load latency (in cycles): 59

Check whether the accesses are aligned and coalesced

/usr/local/NVIDIA-Nsight-Compute/ncu  --metrics  smsp__sass_inst_executed_op_global_ld.sum,memory_l2_theoretical_sectors_global,memory_l2_theoretical_sectors_global_ideal,l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum,sm__sass_l1tex_t_requests_pipe_lsu_mem_global_op_ldgsts_cache_access.sum,sm__sass_l1tex_t_requests_pipe_lsu_mem_global_op_ldgsts_cache_bypass.sum,l1tex__t_output_wavefronts_pipe_lsu_mem_global_op_ld.sum  ./sample_3

Output

Section: Command line profiler metrics
------------------------------------------------------------------------ ----------- ------------
Metric Name                                                              Metric Unit Metric Value
------------------------------------------------------------------------ ----------- ------------
l1tex__t_output_wavefronts_pipe_lsu_mem_global_op_ld.sum                                        1
l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum                                                 1
memory_l2_theoretical_sectors_global                                         sectors           12
memory_l2_theoretical_sectors_global_ideal                                   sectors           12
sm__sass_l1tex_t_requests_pipe_lsu_mem_global_op_ldgsts_cache_access.sum                        0
sm__sass_l1tex_t_requests_pipe_lsu_mem_global_op_ldgsts_cache_bypass.sum                        0
smsp__sass_inst_executed_op_global_ld.sum                                       inst            1
------------------------------------------------------------------------ ----------- ------------

void measure_load_latency<2>(int *, int *, int *, int) (1, 1, 1)x(32, 1, 1), Context 1, Stream 7, Device 0, CC 8.6
Section: Command line profiler metrics
------------------------------------------------------------------------ ----------- ------------
Metric Name                                                              Metric Unit Metric Value
------------------------------------------------------------------------ ----------- ------------
l1tex__t_output_wavefronts_pipe_lsu_mem_global_op_ld.sum                                        2
l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum                                                 2
memory_l2_theoretical_sectors_global                                         sectors           16
memory_l2_theoretical_sectors_global_ideal                                   sectors           16
sm__sass_l1tex_t_requests_pipe_lsu_mem_global_op_ldgsts_cache_access.sum                        0
sm__sass_l1tex_t_requests_pipe_lsu_mem_global_op_ldgsts_cache_bypass.sum                        0
smsp__sass_inst_executed_op_global_ld.sum                                       inst            2
------------------------------------------------------------------------ ----------- ------------

void measure_load_latency<3>(int *, int *, int *, int) (1, 1, 1)x(32, 1, 1), Context 1, Stream 7, Device 0, CC 8.6
Section: Command line profiler metrics
------------------------------------------------------------------------ ----------- ------------
Metric Name                                                              Metric Unit Metric Value
------------------------------------------------------------------------ ----------- ------------
l1tex__t_output_wavefronts_pipe_lsu_mem_global_op_ld.sum                                        3
l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum                                                 3
memory_l2_theoretical_sectors_global                                         sectors           20
memory_l2_theoretical_sectors_global_ideal                                   sectors           20
sm__sass_l1tex_t_requests_pipe_lsu_mem_global_op_ldgsts_cache_access.sum                        0
sm__sass_l1tex_t_requests_pipe_lsu_mem_global_op_ldgsts_cache_bypass.sum                        0
smsp__sass_inst_executed_op_global_ld.sum                                       inst            3
------------------------------------------------------------------------ ----------- ------------

From the global load latency above, it takes more than 50 cycles before the loaded data actually arrives.
Next, scale the data up by roughly 50x so that instructions keep being issued while earlier loads are still in flight, and measure the DRAM bandwidth utilization.

16. Scale the problem size up by 52x

tee sample_2_opt.cu<<-'EOF'
#include <iostream>
#include <cuda_runtime.h>
#include <vector>
#include <stdio.h>
#include <assert.h>
#include <cstdio>
#include <cuda.h>

#define CHECK_CUDA(call)                                           \
    do {                                                           \
        cudaError_t err = call;                                    \
        if (err != cudaSuccess) {                                  \
            std::cerr << "CUDA error at " << __FILE__ << ":" << __LINE__; \
            std::cerr << " code=" << err << " (" << cudaGetErrorString(err) << ")" << std::endl; \
            exit(EXIT_FAILURE);                                    \
        }                                                          \
    } while (0)

__device__ __forceinline__ float warp_reduce_sum(float val) {
#if 1
  #pragma unroll
  for (int mask = warpSize >> 1; mask >= 1; mask >>= 1) {
    val += __shfl_xor_sync(0xffffffff, val, mask);
  }
  return val;
#else 
    unsigned int mask = 0xffffffff;
    for (int offset = warpSize / 2; offset > 0; offset /= 2) {
        val += __shfl_down_sync(mask, val, offset);
    }
    return val;
#endif  
}

__global__ void MyKernelOpt(float *a,float *b,float *c,float *output)
{
    int tid  = threadIdx.x + blockIdx.x * blockDim.x;
    int lane = threadIdx.x % warpSize;
    int wid = threadIdx.x / warpSize;
    
    float d=0;    
    __shared__ float shared[32];
#if 1
float _a=a[tid];
float _b=b[tid];
float _c=c[tid];  
    //d=a*b+c
    asm volatile("fma.rn.ftz.f32 %0,%1,%2,%3;" : "=f"(d) : "f"(_a),"f"(_b),"f"(_c));    
#else
    asm volatile("fma.rn.ftz.f32 %0,%1,%2,%3;" : "=f"(d) : "f"(a[tid]),"f"(b[tid]),"f"(c[tid]));          
#endif
    
    // per-warp partial sum, stored in shared memory
    float val = warp_reduce_sum(d);
    if (lane == 0) // one thread per warp stores the warp result
    {
       shared[wid] = val;
    }
    __syncthreads();
    
    float sum=0;
    if (threadIdx.x == 0) // the first thread of each block sums the per-warp results in shared memory
    {
        int warp_count = blockDim.x / warpSize;
        for (int i = 0; i <warp_count ; i++){
            sum += shared[i];
        }
        atomicAdd(output, sum);
    }
}

int main(int argc,char *argv[])
{
    int deviceid=0;
    cudaSetDevice(deviceid);
    cudaStream_t stream;
    cudaStreamCreate(&stream);

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    int block_size=512;
    int grid_size=28*52; // 52x larger than before
    
    int total_count=block_size*grid_size;
    int total_size=sizeof(float)*total_count;
    
    float *host_a = (float *)malloc(total_size);
    float *host_b = (float *)malloc(total_size);
    float *host_c = (float *)malloc(total_size);
    float *host_d = (float *)malloc(sizeof(float));
    
    float *dev_a;
    float *dev_b;
    float *dev_c;
    float *dev_d;
    
    host_d[0]=0.0;
    CHECK_CUDA(cudaMalloc(&dev_a, total_size));
    CHECK_CUDA(cudaMalloc(&dev_b, total_size));
    CHECK_CUDA(cudaMalloc(&dev_c, total_size));
    CHECK_CUDA(cudaMalloc(&dev_d, sizeof(float)));
    
    for(int i = 0; i < total_count; ++i) host_a[i] = i*0.001;
    for(int i = 0; i < total_count; ++i) host_b[i] = i*0.002;
    for(int i = 0; i < total_count; ++i) host_c[i] = i*0.003;
    
    CHECK_CUDA(cudaMemcpy(dev_a, host_a, total_size,cudaMemcpyHostToDevice));
    CHECK_CUDA(cudaMemcpy(dev_b, host_b, total_size,cudaMemcpyHostToDevice));
    CHECK_CUDA(cudaMemcpy(dev_c, host_c, total_size,cudaMemcpyHostToDevice));

    for(int i=0;i<2;i++)
    {
        CHECK_CUDA(cudaMemcpy(dev_a, host_a, total_size,cudaMemcpyHostToDevice));
        CHECK_CUDA(cudaMemcpy(dev_b, host_b, total_size,cudaMemcpyHostToDevice));
        CHECK_CUDA(cudaMemcpy(dev_c, host_c, total_size,cudaMemcpyHostToDevice));    
        CHECK_CUDA(cudaMemcpy(dev_d, host_d, sizeof(float),cudaMemcpyHostToDevice));
        cudaEventRecord(start, stream);
        MyKernelOpt<<<grid_size,block_size,sizeof(float)*32, stream>>>(dev_a,dev_b,dev_c,dev_d);
        cudaEventRecord(stop, stream);
        CHECK_CUDA(cudaEventSynchronize(stop));
        float milliseconds = 0;
        cudaEventElapsedTime(&milliseconds, start, stop);
        printf("%d %.3f\n",i,milliseconds);
    }        
    cudaMemcpy(host_d, dev_d,sizeof(float), cudaMemcpyDeviceToHost);
    printf("output:%f\n",host_d[0]);    
    
    free(host_a);
    free(host_b);
    free(host_c);
    free(host_d);
    
    CHECK_CUDA(cudaFree(dev_a));
    CHECK_CUDA(cudaFree(dev_b));
    CHECK_CUDA(cudaFree(dev_c));
    CHECK_CUDA(cudaFree(dev_d));
    
    CHECK_CUDA(cudaEventDestroy(start));
    CHECK_CUDA(cudaEventDestroy(stop));
    CHECK_CUDA(cudaStreamDestroy(stream));
    return 0;
}
EOF
/usr/local/cuda/bin/nvcc -std=c++17 -lineinfo sample_2_opt.cu -o sample_2_opt
./sample_2_opt
/usr/local/NVIDIA-Nsight-Compute/ncu --set full --export ncu_report_opt -f ./sample_2_opt

(Screenshots of the Nsight Compute report for sample_2_opt omitted.)

17. Summary

  • This kernel is memory bound; the original problem size was too small to saturate DRAM bandwidth, and after scaling it up the kernel gets close to the theoretical memory throughput (a rough bandwidth cross-check is sketched below)
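
A rough cross-check of that conclusion (a sketch under my own assumptions: a, b and c are each read from DRAM exactly once, the scalar output is negligible, and the kernel time is the value printed by ./sample_2_opt; ~360 GB/s is the commonly quoted peak bandwidth of an RTX 3060):

#include <cstdio>
int main() {
    const long long elems = 512LL * 28 * 52;              // block_size * grid_size after scaling
    const double bytes    = 3.0 * sizeof(float) * elems;  // a, b and c each read once
    const double ms       = 0.05;                         // placeholder: substitute the time printed by ./sample_2_opt
    printf("effective bandwidth: %.1f GB/s\n", bytes / (ms * 1e-3) / 1e9);
    return 0;
}

If this number is close to the DRAM throughput reported by the Speed Of Light section, it confirms that the kernel is limited by memory bandwidth rather than compute.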

Original article: https://blog.csdn.net/m0_61864577/article/details/140618800
