A CUDA implementation of matrix multiplication
1. Model
Multiplying two matrices A (of size m×k) and B (of size k×n) can be pictured as combining the rows of A with the columns of B to form a rectangular cuboid of size m×k×n. The value at each coordinate of the cuboid is the product of the two entries of A and B onto which that coordinate projects. Each entry of the result is then obtained by summing the cuboid's values along a row of A (equivalently, down a column of B), i.e., along the shared inner dimension.
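In index notation this is simply the usual definition of the matrix product: for A of size m×k and B of size k×n,

$$C_{ij} = \sum_{l=0}^{k-1} A_{il} B_{lj}, \qquad 0 \le i < m,\quad 0 \le j < n.$$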
2. Program
Based on this view, the following CUDA program (matrix.hpp) was written to compute the matrix product.
#ifndef __MATRIX_HPP__
#define __MATRIX_HPP__
#include <iostream>
#include <cstring>        // memset (used by upper_triangular)
#include <cuda_runtime.h> // cudaMalloc / cudaMemset / cudaMemcpy / cudaFree
template<int row_num_, int col_num_, typename val_type_>
struct matrix_device
{
static const int row_num = row_num_;
static const int col_num = col_num_;
using val_type = val_type_;
val_type* data;
matrix_device() : data(nullptr) {}
};
template<int row_num_, int col_num_, typename val_type_>
struct matrix_host
{
static const int row_num = row_num_;
static const int col_num = col_num_;
using val_type = val_type_;
val_type data[row_num * col_num];
};
template<int row_num_, int col_num_, typename val_type_>
__device__ val_type_& mat_at(matrix_device<row_num_, col_num_, val_type_>& mat, int row, int col) {
return mat.data[row * col_num_ + col];
}
template<int row_num_, int col_num_, typename val_type_>
__host__ val_type_& mat_at(matrix_host<row_num_, col_num_, val_type_>& mat, int row, int col) {
return mat.data[row * col_num_ + col];
}
template<int row_num_, int col_num_, typename val_type_>
void print(matrix_host<row_num_, col_num_, val_type_>& mat) {
for (int i = 0; i < row_num_; ++i) {
for (int j = 0; j < col_num_; ++j) {
std::cout << mat_at(mat, i, j) << " ";
}
std::cout << std::endl;
}
}
template<int row_num_, int col_num_, typename val_type_>
void init(matrix_device<row_num_, col_num_, val_type_>& mat) {
if (mat.data == nullptr)
{
cudaMalloc(&mat.data, row_num_ * col_num_ * sizeof(val_type_));
}
cudaMemset(mat.data, 0, row_num_ * col_num_ * sizeof(val_type_));
}
template<int row_num_, int col_num_, typename val_type_>
void eyes(matrix_host<row_num_, col_num_, val_type_>& mat, val_type_ value = 1) {
for (int i = 0; i < row_num_; ++i)
{
for (int j = 0; j < col_num_; ++j)
{
mat_at(mat, i, j) = (i == j) ? value : 0;
}
}
}
template<int row_num, int col_num, typename val_type>
void ones(matrix_host<row_num, col_num, val_type>& mat, val_type value = 1)
{
for (int i = 0; i < row_num; ++i)
{
for (int j = 0; j < col_num; ++j)
{
mat_at(mat, i, j) = value;
}
}
}
template<int row_num_, int col_num_, typename val_type_>
void upper_triangular(matrix_host<row_num_, col_num_, val_type_>& mat, val_type_ value = 1) {
memset(mat.data, 0, row_num_ * col_num_ * sizeof(val_type_));
for (int i = 0; i < row_num_; ++i) {
for (int j = i; j < col_num_; ++j) {
mat_at(mat, i, j) = value; // j starts at i, so only the upper triangle is written; memset zeroed the rest
}
}
}
template<int row_num_, int col_num_, typename val_type_>
__host__ void eyes(matrix_device<row_num_, col_num_, val_type_>& mat, val_type_ value = 1) {
init(mat);
// Build the matrix on the host first
matrix_host<row_num_, col_num_, val_type_> host_mat;
eyes(host_mat, value);
// Copy the host matrix to the device
cudaMemcpy(mat.data, host_mat.data, row_num_ * col_num_ * sizeof(val_type_), cudaMemcpyHostToDevice);
}
template<int row_num_, int col_num_, typename val_type_>
__host__ void ones(matrix_device<row_num_, col_num_, val_type_>& mat, val_type_ value = 1) {
init(mat);
// Build the matrix on the host first
matrix_host<row_num_, col_num_, val_type_> host_mat;
ones(host_mat, value);
// Copy the host matrix to the device
cudaMemcpy(mat.data, host_mat.data, row_num_ * col_num_ * sizeof(val_type_), cudaMemcpyHostToDevice);
}
template<int row_num_, int col_num_, typename val_type_>
__host__ void upper_triangular(matrix_device<row_num_, col_num_, val_type_>& mat, val_type_ value = 1) {
init(mat);
// Build the matrix on the host first
matrix_host<row_num_, col_num_, val_type_> host_mat;
upper_triangular(host_mat, value);
// Copy the host matrix to the device
cudaMemcpy(mat.data, host_mat.data, row_num_ * col_num_ * sizeof(val_type_), cudaMemcpyHostToDevice);
}
template<int row_num_, int col_num_, typename val_type_>
void release(matrix_device<row_num_, col_num_, val_type_>& mat) {
cudaFree(mat.data);
mat.data = nullptr;
}
template<int row_num_, int col_num_, typename val_type_>
matrix_device<row_num_, col_num_, val_type_> type_cast(matrix_host<row_num_, col_num_, val_type_>& mat) {
matrix_device<row_num_, col_num_, val_type_> device_mat;
cudaMalloc(&device_mat.data, row_num_ * col_num_ * sizeof(val_type_));
cudaMemcpy(device_mat.data, mat.data, row_num_ * col_num_ * sizeof(val_type_), cudaMemcpyHostToDevice);
return device_mat;
}
template<int row_num_, int col_num_, typename val_type_>
matrix_host<row_num_, col_num_, val_type_> type_cast(matrix_device<row_num_, col_num_, val_type_>& mat) {
matrix_host<row_num_, col_num_, val_type_> host_mat;
cudaMemcpy(host_mat.data, mat.data, row_num_ * col_num_ * sizeof(val_type_), cudaMemcpyDeviceToHost);
return host_mat;
}
struct offset_type
{
int x;
int y;
int z;
};
template<int row_num, int adj_num, int col_num, int tile, typename val_type>
__global__ void mat_dot_kernel(matrix_device<row_num, adj_num, val_type> a, matrix_device<adj_num, col_num, val_type> b, matrix_device<row_num, col_num, val_type> c, int griddim)
{
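// offset.x walks over row tiles of A and C, offset.y over tiles of the shared
// inner dimension, and offset.z (strided by tile * griddim) over column tiles
// of B and C, so a fixed number of blocks can cover the whole output matrix.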
offset_type offset;
offset.x = 0;
offset.y = 0;
offset.z = 0;
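// Shared-memory tiles: sa caches a tile of A, sb a tile of B (stored with the
// inner-dimension index first), and sc accumulates the partial result for one tile of C.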
__shared__ val_type sa[tile][tile];
__shared__ val_type sb[tile][tile];
__shared__ val_type sc[tile][tile];
sc[threadIdx.x][threadIdx.y] = 0;
while (offset.z < col_num)
{
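// Global row/column indices of the A, B and C elements handled by this thread
// for the current offsets; blockIdx.x shifts the output column tile.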
int a_r = threadIdx.x + offset.x;
int a_c = threadIdx.y + offset.y;
int b_r = threadIdx.y + offset.y;
int b_c = threadIdx.x + offset.z + blockIdx.x * tile;
int c_r = threadIdx.x + offset.x;
int c_c = threadIdx.y + offset.z + blockIdx.x * tile;
sa[threadIdx.x][threadIdx.y] = (a_r < row_num && a_c < adj_num) ? mat_at(a, a_r, a_c) : 0;
sb[threadIdx.y][threadIdx.x] = (b_r < adj_num && b_c < col_num) ? mat_at(b, b_r, b_c) : 0;
__syncthreads();
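// Accumulate the partial dot product over the current tile of the inner dimension.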
for (int i = 0; i < tile; ++i)
{
sc[threadIdx.x][threadIdx.y] += sa[threadIdx.x][i] * sb[i][threadIdx.y];
}
__syncthreads();
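// Advance along the inner dimension; C is written only once all inner tiles
// for this output tile have been accumulated.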
offset.y += tile;
if (offset.y < adj_num)
{
continue;
}
if (c_r < row_num && c_c < col_num)
{
mat_at(c, c_r, c_c) = sc[threadIdx.x][threadIdx.y];
}
offset.y = 0;
sc[threadIdx.x][threadIdx.y] = 0; // sc is kept while offset.y advances; when offset.y is reset, sc must be reset too
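// Move to the next row tile; once every row tile is done, stride to this block's next column tile.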
offset.x += tile;
if (offset.x < row_num)
{
continue;
}
offset.x = 0;
offset.z += tile * griddim;
}
}
template<int row_num, int adj_num, int col_num, typename val_type>
void mat_dot(matrix_device<row_num, adj_num, val_type>& a, matrix_device<adj_num, col_num, val_type>& b, matrix_device<row_num, col_num, val_type>& c)
{
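// 16x16 thread tiles; the grid is capped at 4 blocks and each block strides
// over the remaining column tiles inside the kernel.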
const int tile = 16;
int griddim = (col_num + tile - 1) / tile;
griddim = (griddim > 4)? 4 : griddim;
griddim = (griddim < 1)? 1 : griddim;
dim3 threadsPerBlock(tile, tile);
dim3 numBlocks(griddim, 1);
mat_dot_kernel<row_num, adj_num, col_num, tile, val_type><<<numBlocks, threadsPerBlock>>>(a, b, c, griddim);
cudaDeviceSynchronize();
}
// RAII proxy for matrix_device that performs cudaMalloc on construction and cudaFree on destruction
template<int row_num_, int col_num_, typename val_type_>
struct matrix_device_proxy
{
static const int row_num = row_num_;
static const int col_num = col_num_;
using val_type = val_type_;
matrix_device<row_num, col_num, val_type> mat;
matrix_device_proxy()
{
init(mat);
}
matrix_device<row_num, col_num, val_type>& operator()()
{
return mat;
}
~matrix_device_proxy()
{
release(mat);
}
};
#endif
3. Experiment
A small test program was written to compute a matrix product:
#include "matrix.hpp"
int main()
{
constexpr int row_num = 30;
constexpr int adj_num = 30;
constexpr int col_num = 30;
matrix_device_proxy<row_num, adj_num, double> A;
eyes(A(), 18.0);
matrix_device_proxy<adj_num, col_num, double> B;
eyes(B(), 2.0);
matrix_device_proxy<row_num, col_num, double> C;
mat_dot(A(), B(), C());
auto C_host = type_cast(C()); // copy the result back to the host (a temporary cannot bind to print's non-const reference)
print(C_host);
cudaDeviceReset();
return 0;
}
This program multiplies two 30×30 diagonal matrices, 18·I and 2·I. The printed result is a 30×30 diagonal matrix with 36.0 (= 18 × 2) on the diagonal, which shows that the product is computed correctly.
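Beyond eyeballing the printout, the GPU result can be checked against a plain CPU reference. The helper below is only a sketch and is not part of the original code; the name check_against_cpu and the tolerance parameter are my own additions. It indexes the host matrices directly in row-major order (the same layout mat_at uses) so that it can take const references.

#include "matrix.hpp"
#include <cmath>

// Hypothetical helper: recompute the product on the CPU and compare it with
// the matrix copied back from the device.
template<int R, int K, int C, typename T>
bool check_against_cpu(const matrix_host<R, K, T>& a,
                       const matrix_host<K, C, T>& b,
                       const matrix_host<R, C, T>& c_gpu,
                       T tol = T(1e-9))
{
    for (int i = 0; i < R; ++i) {
        for (int j = 0; j < C; ++j) {
            T acc = 0;
            for (int k = 0; k < K; ++k) {
                acc += a.data[i * K + k] * b.data[k * C + j];
            }
            if (std::abs(acc - c_gpu.data[i * C + j]) > tol) {
                return false;
            }
        }
    }
    return true;
}

In main it could be called after mat_dot, e.g. with auto A_host = type_cast(A()); auto B_host = type_cast(B()); and the C_host variable shown above. Assuming the test program is saved as main.cu next to matrix.hpp, everything should build with something like nvcc -O2 -o mat_dot main.cu.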
Original article: https://blog.csdn.net/Dr_Jack/article/details/145140969