CUDA与TensorRT学习二:CUDA编程入门

文章目录

- 一、理解CUDA的grid和Block
  - 1）第一个cuda项目
- 二、理解.cu和.cpp的相互引用及Makefile
- 三、利用CUDA矩阵乘法(matmul)计算、Error Handle 及硬件信息获取
  - 1）矩阵乘法
  - 2）Error Handle
  - 3）硬件信息获取
- 四、安装Nsight system and compute
- 五、共享内存、Bank Conflict原因和解决方法、TRT用Cuda进行预处理/后处理来加速、Stream 与Event（用Cuda写流提高并发性）
- 六、双线性插值与仿射变换

一、理解CUDA的grid和Block

目标
理解Cuda中一维、二维、三维的grid、block的写法，以及遍历thread的方法

1）第一个cuda项目

修改项目的Makefile.config
总体文件目录
代码

#include <cuda_runtime.h>
#include <stdio.h>

/*
 * Demo program: prints block/thread indices and grid/block dimensions for
 * 1D and 2D launch configurations. Every kernel below takes no arguments
 * and only calls device-side printf, so the output order across threads is
 * not guaranteed.
 */

/* Prints the 3D block index and the 3D thread index (z, y, x order). */
__global__ void print_idx_kernel() {
    printf("block idx: (%3d, %3d, %3d), thread idx: (%3d, %3d, %3d)\n",
           blockIdx.z, blockIdx.y, blockIdx.x,
           threadIdx.z, threadIdx.y, threadIdx.x);
}

/* Prints the grid dimensions and the block dimensions (z, y, x order). */
__global__ void print_dim_kernel() {
    printf("grid dimension: (%3d, %3d, %3d), block dimension: (%3d, %3d, %3d)\n",
           gridDim.z, gridDim.y, gridDim.x,
           blockDim.z, blockDim.y, blockDim.x);
}

/* Prints the 3D block index and the thread index flattened within its block. */
__global__ void print_thread_idx_per_block_kernel() {
    /* Flatten (z, y, x) inside the block: x varies fastest, then y, then z. */
    int index = threadIdx.z * blockDim.x * blockDim.y
              + threadIdx.y * blockDim.x
              + threadIdx.x;

    printf("block idx: (%3d, %3d, %3d), thread idx: %3d\n",
           blockIdx.z, blockIdx.y, blockIdx.x,
           index);
}

/*
 * Prints the flattened block index, the flattened thread index within the
 * block, and the flattened thread index within the whole grid.
 */
__global__ void print_thread_idx_per_grid_kernel() {
    /* Number of threads in one block. */
    int bSize  = blockDim.z * blockDim.y * blockDim.x;

    /* Flatten the block's (z, y, x) position inside the grid. */
    int bIndex = blockIdx.z * gridDim.x * gridDim.y
               + blockIdx.y * gridDim.x
               + blockIdx.x;

    /* Flatten the thread's (z, y, x) position inside its block. */
    int tIndex = threadIdx.z * blockDim.x * blockDim.y
               + threadIdx.y * blockDim.x
               + threadIdx.x;

    /* All threads of the preceding blocks come first, then this block's. */
    int index  = bIndex * bSize + tIndex;

    printf("block idx: %3d, thread idx in block: %3d, thread idx: %3d\n",
           bIndex, tIndex, index);
}

/*
 * Prints the 3D block index, the thread index flattened within its block,
 * and the thread's (x, y) coordinate across the whole grid.
 */
__global__ void print_cord_kernel() {
    int index = threadIdx.z * blockDim.x * blockDim.y
              + threadIdx.y * blockDim.x
              + threadIdx.x;

    /* Grid-wide coordinate = block offset + offset inside the block. */
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;

    printf("block idx: (%3d, %3d, %3d), thread idx: %3d, cord: (%3d, %3d)\n",
           blockIdx.z, blockIdx.y, blockIdx.x,
           index, x, y);
}

/*
 * Waits for the device to finish and reports any error from the preceding
 * kernel launch. A kernel launch returns no status itself: configuration
 * errors are read with cudaGetLastError(), execution errors are returned by
 * the synchronizing call.
 */
static void sync_and_report(const char* where) {
    cudaError_t launchErr = cudaGetLastError();
    cudaError_t syncErr   = cudaDeviceSynchronize();

    if (launchErr != cudaSuccess) {
        fprintf(stderr, "%s: kernel launch failed: %s\n",
                where, cudaGetErrorString(launchErr));
    }
    if (syncErr != cudaSuccess) {
        fprintf(stderr, "%s: device synchronize failed: %s\n",
                where, cudaGetErrorString(syncErr));
    }
}

/* Launches a 1D grid of 1D blocks: 8 threads split into blocks of 4. */
void print_one_dim() {
    int inputSize       = 8;
    int threadsPerBlock = 4;
    /* Plain division: inputSize is a multiple of threadsPerBlock here. */
    int blocksPerGrid   = inputSize / threadsPerBlock;

    dim3 block(threadsPerBlock);
    dim3 grid(blocksPerGrid);

    /* It is recommended to try each of these kernels in turn. */
    // print_idx_kernel<<<grid, block>>>();
    // print_dim_kernel<<<grid, block>>>();
    // print_thread_idx_per_block_kernel<<<grid, block>>>();
    print_thread_idx_per_grid_kernel<<<grid, block>>>();

    sync_and_report("print_one_dim");
}

/* Launches a 2D grid of 2D blocks: a 4x4 input split into 2x2 blocks. */
void print_two_dim() {
    int inputWidth      = 4;
    int threadsPerBlock = 2;
    /* Plain division: inputWidth is a multiple of threadsPerBlock here. */
    int blocksPerGrid   = inputWidth / threadsPerBlock;

    dim3 block(threadsPerBlock, threadsPerBlock);
    dim3 grid(blocksPerGrid, blocksPerGrid);

    /* It is recommended to try each of these kernels in turn. */
    // print_idx_kernel<<<grid, block>>>();
    // print_dim_kernel<<<grid, block>>>();
    // print_thread_idx_per_block_kernel<<<grid, block>>>();
    print_thread_idx_per_grid_kernel<<<grid, block>>>();

    sync_and_report("print_two_dim");
}

/* Launches print_cord_kernel on a 4x4 input split into 2x2 blocks. */
void print_cord() {
    int inputWidth      = 4;
    int threadsPerBlock = 2;
    /* Plain division: inputWidth is a multiple of threadsPerBlock here. */
    int blocksPerGrid   = inputWidth / threadsPerBlock;

    dim3 block(threadsPerBlock, threadsPerBlock);
    dim3 grid(blocksPerGrid, blocksPerGrid);

    print_cord_kernel<<<grid, block>>>();

    sync_and_report("print_cord");
}

int main() {
    /*
     * "synchronize" comes in several flavours:
     *   cudaDeviceSynchronize: synchronizes CPU and GPU; the CPU does not
     *                          execute later statements until all CUDA work
     *                          issued before this call has finished.
     *   cudaStreamSynchronize: similar to cudaDeviceSynchronize, but for one
     *                          specific stream; only work in that stream is
     *                          waited for, other streams are ignored.
     *   cudaThreadSynchronize: no longer recommended.
     *   __syncthreads:         synchronization within a thread block.
     */
    // print_one_dim();
    // print_two_dim();
    print_cord();
    return 0;
}

注意
__global__ 表示核函数kernel
需求：找到某个block下面的thread

代码如下（先走z，然后y，最后x）
在这里插入图片描述
一般的优化

二、理解.cu和.cpp的相互引用及Makefile

编译器
不再是gcc或g++，而是nvcc，这样才不会编译报错
编译项目一指令

# Compile print_index.cu with nvcc into an executable named "app";
# -I adds the CUDA header directory to the include search path.
nvcc print_index.cu  -o app -I  /usr/local/cuda/include/

cuda_check作用
发生错误的时候告诉你错误发生在哪里

/*
 * CUDA_CHECK(call): evaluates `call` (an expression yielding cudaError_t)
 * exactly once. If the result is not cudaSuccess, prints the source file and
 * line of the macro use, the numeric error code and the text returned by
 * cudaGetErrorString, then terminates the process with exit(1).
 *
 * The body is wrapped in do { ... } while (0) so that `CUDA_CHECK(x);`
 * behaves as a single statement, including inside an unbraced if/else.
 * The local name carries a trailing underscore so it cannot collide with an
 * identifier used inside `call`.
 */
#define CUDA_CHECK(call)                                                     \
    do {                                                                     \
        cudaError_t cuda_check_error_ = (call);                              \
        if (cuda_check_error_ != cudaSuccess) {                              \
            printf("ERROR: %s:%d, ", __FILE__, __LINE__);                    \
            printf("CODE:%d, DETAIL:%s\n", (int)cuda_check_error_,           \
                   cudaGetErrorString(cuda_check_error_));                   \
            exit(1);                                                         \
        }                                                                    \
    } while (0)

三、利用CUDA矩阵乘法(matmul)计算、Error Handle 及硬件信息获取

1）矩阵乘法

目的
理解使用cuda进行矩阵运算的加速方法，tile的用意
项目目录

2）Error Handle

3）硬件信息获取

目标
学习使用cuda runtime api显示GPU硬件信息，以及理解GPU硬件信息重要性
项目布局
打印效果
相关代码

/*
 * Enumerates every CUDA device visible to the runtime and logs its
 * architecture-related properties (name, compute capability, memory and
 * cache sizes, clock rates, SM count, warp size) followed by its
 * launch-parameter limits (blocks per SM, threads per block, maximum block
 * and grid dimensions).
 *
 * Returns 0 on success, 1 if a CUDA runtime call fails.
 */
int main() {
    /* Initialized so a failed query can never leave the loop bound undefined. */
    int count = 0;

    cudaError_t err = cudaGetDeviceCount(&count);
    if (err != cudaSuccess) {
        fprintf(stderr, "cudaGetDeviceCount failed: %s\n", cudaGetErrorString(err));
        return 1;
    }

    for (int index = 0; index < count; ++index) {
        cudaDeviceProp prop;

        /* Only query the properties if selecting the device succeeded. */
        err = cudaSetDevice(index);
        if (err == cudaSuccess) {
            err = cudaGetDeviceProperties(&prop, index);
        }
        if (err != cudaSuccess) {
            fprintf(stderr, "Querying device %d failed: %s\n", index, cudaGetErrorString(err));
            return 1;
        }

        LOG("%-40s",             "*********************Architecture related**********************");
        LOG("%-40s%d%s",         "Device id: ",                   index, "");
        LOG("%-40s%s%s",         "Device name: ",                 prop.name, "");
        /* major.minor rendered as a single decimal number, e.g. 8 and 6 -> 8.6 */
        LOG("%-40s%.1f%s",       "Device compute capability: ",   prop.major + (float)prop.minor / 10, "");
        /* Byte counts are scaled by powers of two: 1<<30 = GB, 1<<20 = MB, 1<<10 = KB. */
        LOG("%-40s%.2f%s",       "GPU global meory size: ",       (float)prop.totalGlobalMem / (1<<30), "GB");
        LOG("%-40s%.2f%s",       "L2 cache size: ",               (float)prop.l2CacheSize / (1<<20), "MB");
        LOG("%-40s%.2f%s",       "Shared memory per block: ",     (float)prop.sharedMemPerBlock / (1<<10), "KB");
        LOG("%-40s%.2f%s",       "Shared memory per SM: ",        (float)prop.sharedMemPerMultiprocessor / (1<<10), "KB");
        /* NOTE(review): the 1E-6 factor assumes the clock fields are in kHz, as the CUDA docs state - confirm for your toolkit version. */
        LOG("%-40s%.2f%s",       "Device clock rate: ",           prop.clockRate*1E-6, "GHz");
        LOG("%-40s%.2f%s",       "Device memory clock rate: ",    prop.memoryClockRate*1E-6, "Ghz");
        LOG("%-40s%d%s",         "Number of SM: ",                prop.multiProcessorCount, "");
        LOG("%-40s%d%s",         "Warp size: ",                   prop.warpSize, "");
        LOG("%-40s",             "*********************Parameter related************************");
        LOG("%-40s%d%s",         "Max block numbers: ",           prop.maxBlocksPerMultiProcessor, "");
        LOG("%-40s%d%s",         "Max threads per block: ",       prop.maxThreadsPerBlock, "");
        LOG("%-40s%d:%d:%d%s",   "Max block dimension size:",     prop.maxThreadsDim[0], prop.maxThreadsDim[1], prop.maxThreadsDim[2], "");
        LOG("%-40s%d:%d:%d%s",   "Max grid dimension size: ",     prop.maxGridSize[0], prop.maxGridSize[1], prop.maxGridSize[2], "");

        /* Blank line between devices. */
        printf("\n");
    }

    return 0;
}