
Implementing Matrix Multiplication in CUDA

1. Model

The product of two matrices A and B can be pictured as a rectangular box formed by combining the rows of A with the columns of B. The value at each position inside the box is the product of the two entries it projects onto in A and B, and each entry of the result is obtained by summing the box's values along the direction of A's rows (equivalently, along B's columns): C(i, j) = sum over k of A(i, k) * B(k, j).
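
In code, that definition is a plain triple loop. The sketch below is a host-only reference; the function name mat_dot_reference and the fixed-size array parameters are illustrative and are not part of the CUDA program that follows:

template<int row_num, int adj_num, int col_num, typename val_type>
void mat_dot_reference(const val_type (&a)[row_num][adj_num],
                       const val_type (&b)[adj_num][col_num],
                       val_type (&c)[row_num][col_num])
{
    for (int i = 0; i < row_num; ++i)
        for (int j = 0; j < col_num; ++j)
        {
            val_type sum = 0;
            for (int k = 0; k < adj_num; ++k)   // sum along the shared dimension
                sum += a[i][k] * b[k][j];
            c[i][j] = sum;
        }
}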

2. Program

Based on this picture, the following CUDA program computes the matrix product.

#ifndef __MATRIX_HPP__
#define __MATRIX_HPP__
#include <iostream>
#include <cstring>   // memset

template<int row_num_, int col_num_, typename val_type_>
struct matrix_device
{
    static const int row_num = row_num_;
    static const int col_num = col_num_;
    using val_type = val_type_;

    val_type* data;
    matrix_device() : data(nullptr) {}
};

template<int row_num_, int col_num_, typename val_type_>
struct matrix_host
{
    static const int row_num = row_num_;
    static const int col_num = col_num_;
    using val_type = val_type_;

    val_type data[row_num * col_num];
};

template<int row_num_, int col_num_, typename val_type_>
__device__ val_type_& mat_at(matrix_device<row_num_, col_num_, val_type_>& mat, int row, int col) {
    return mat.data[row * col_num_ + col];
}

template<int row_num_, int col_num_, typename val_type_>
__host__ val_type_& mat_at(matrix_host<row_num_, col_num_, val_type_>& mat, int row, int col) {
    return mat.data[row * col_num_ + col];
}

template<int row_num_, int col_num_, typename val_type_>
void print(matrix_host<row_num_, col_num_, val_type_>& mat) {
    for (int i = 0; i < row_num_; ++i) {
        for (int j = 0; j < col_num_; ++j) {
            std::cout << mat_at(mat, i, j) << " ";
        }
        std::cout << std::endl;
    }
}

template<int row_num_, int col_num_, typename val_type_>
void init(matrix_device<row_num_, col_num_, val_type_>& mat) {
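    // Allocate device storage if it has not been allocated yet, then zero it.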
    if (mat.data == nullptr) 
    {
        cudaMalloc(&mat.data, row_num_ * col_num_ * sizeof(val_type_));
    }
    cudaMemset(mat.data, 0, row_num_ * col_num_ * sizeof(val_type_));
}

template<int row_num_, int col_num_, typename val_type_>
void eyes(matrix_host<row_num_, col_num_, val_type_>& mat, val_type_ value = 1) {
    for (int i = 0; i < row_num_; ++i) 
    {
        for (int j = 0; j < col_num_; ++j) 
        {
            mat_at(mat, i, j) = (i == j) ? value : 0;
        }
    }
}

template<int row_num, int col_num, typename val_type>
void ones(matrix_host<row_num, col_num, val_type>& mat, val_type value = 1)
{
    for (int i = 0; i < row_num; ++i)
    {
        for (int j = 0; j < col_num; ++j)
        {
            mat_at(mat, i, j) = value;
        }
    }
}

template<int row_num_, int col_num_, typename val_type_>
void upper_triangular(matrix_host<row_num_, col_num_, val_type_>& mat, val_type_ value = 1) {
    memset(mat.data, 0, row_num_ * col_num_ * sizeof(val_type_));
    for (int i = 0; i < row_num_; ++i) {
        for (int j = i; j < col_num_; ++j) {
            mat_at(mat, i, j) = value;   // j starts at i, so only the upper triangle is touched
        }
    }
}

template<int row_num_, int col_num_, typename val_type_>
__host__ void eyes(matrix_device<row_num_, col_num_, val_type_>& mat, val_type_ value = 1) {
    init(mat);
    // Build the matrix on the host first
    matrix_host<row_num_, col_num_, val_type_> host_mat;
    eyes(host_mat, value);
    // Copy the host matrix to the device
    cudaMemcpy(mat.data, host_mat.data, row_num_ * col_num_ * sizeof(val_type_), cudaMemcpyHostToDevice);
}

template<int row_num_, int col_num_, typename val_type_>
__host__ void ones(matrix_device<row_num_, col_num_, val_type_>& mat, val_type_ value = 1) {
    init(mat);
    // Build the matrix on the host first
    matrix_host<row_num_, col_num_, val_type_> host_mat;
    ones(host_mat, value);
    // Copy the host matrix to the device
    cudaMemcpy(mat.data, host_mat.data, row_num_ * col_num_ * sizeof(val_type_), cudaMemcpyHostToDevice);
}

template<int row_num_, int col_num_, typename val_type_>
__host__ void upper_triangular(matrix_device<row_num_, col_num_, val_type_>& mat, val_type_ value = 1) {
    init(mat);
    // Build the matrix on the host first
    matrix_host<row_num_, col_num_, val_type_> host_mat;
    upper_triangular(host_mat, value);
    // Copy the host matrix to the device
    cudaMemcpy(mat.data, host_mat.data, row_num_ * col_num_ * sizeof(val_type_), cudaMemcpyHostToDevice);
}

template<int row_num_, int col_num_, typename val_type_>
void release(matrix_device<row_num_, col_num_, val_type_>& mat) {
    cudaFree(mat.data);
    mat.data = nullptr;
}

template<int row_num_, int col_num_, typename val_type_>
matrix_device<row_num_, col_num_, val_type_> type_cast(matrix_host<row_num_, col_num_, val_type_>& mat) {
    matrix_device<row_num_, col_num_, val_type_> device_mat;
    cudaMalloc(&device_mat.data, row_num_ * col_num_ * sizeof(val_type_));
    cudaMemcpy(device_mat.data, mat.data, row_num_ * col_num_ * sizeof(val_type_), cudaMemcpyHostToDevice);
    return device_mat;
}

template<int row_num_, int col_num_, typename val_type_>
matrix_host<row_num_, col_num_, val_type_> type_cast(matrix_device<row_num_, col_num_, val_type_>& mat) {
    matrix_host<row_num_, col_num_, val_type_> host_mat;
    cudaMemcpy(host_mat.data, mat.data, row_num_ * col_num_ * sizeof(val_type_), cudaMemcpyDeviceToHost);
    return host_mat;
}

struct offset_type
{
    int x;
    int y;
    int z;
};

template<int row_num, int adj_num, int col_num, int tile, typename val_type>
__global__ void mat_dot_kernel(matrix_device<row_num, adj_num, val_type> a, matrix_device<adj_num, col_num, val_type> b, matrix_device<row_num, col_num, val_type> c, int griddim)
{
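    // All threads advance these offsets in lockstep:
    //   offset.x - current tile of output rows (rows of a and c),
    //   offset.y - current tile of the shared dimension adj_num,
    //   offset.z - current group of output-column tiles; this block handles the
    //              columns starting at offset.z + blockIdx.x * tile.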
    offset_type offset;
    offset.x = 0;
    offset.y = 0;
    offset.z = 0;
    __shared__ val_type sa[tile][tile];
    __shared__ val_type sb[tile][tile];
    __shared__ val_type sc[tile][tile];
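    // sc accumulates this thread's output element across all tiles of the shared dimension.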
    sc[threadIdx.x][threadIdx.y] = 0;
    for (;offset.z < col_num;)
    {
        int a_r = threadIdx.x + offset.x;
        int a_c = threadIdx.y + offset.y;
        int b_r = threadIdx.y + offset.y;
        int b_c = threadIdx.x + offset.z + blockIdx.x * tile;
        int c_r = threadIdx.x + offset.x;
        int c_c = threadIdx.y + offset.z + blockIdx.x * tile;
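        // Load one tile of a and one tile of b into shared memory, zero-padding out-of-range elements.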
        sa[threadIdx.x][threadIdx.y] = (a_r < row_num && a_c < adj_num) ? mat_at(a, a_r, a_c) : 0;
        sb[threadIdx.y][threadIdx.x] = (b_r < adj_num && b_c < col_num) ? mat_at(b, b_r, b_c) : 0;
        __syncthreads();
        for (int i = 0; i < tile; ++i)
        {
            sc[threadIdx.x][threadIdx.y] += sa[threadIdx.x][i] * sb[i][threadIdx.y];
        }
        __syncthreads();
        offset.y += tile;
        if (offset.y < adj_num)
        {
            continue;
        }
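        // The whole shared dimension has been accumulated: write this thread's output element.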
        if (c_r < row_num && c_c < col_num)
        {
            mat_at(c, c_r, c_c) = sc[threadIdx.x][threadIdx.y];
        }
        offset.y = 0;
        sc[threadIdx.x][threadIdx.y] = 0;       // sc persists while offset.y advances; once offset.y is reset, sc must be reset too
        offset.x += tile;
        if (offset.x < row_num)
        {
            continue;
        }
        offset.x = 0;
        offset.z += tile * griddim;
    }
}

template<int row_num, int adj_num, int col_num, typename val_type>
void mat_dot(matrix_device<row_num, adj_num, val_type>& a, matrix_device<adj_num, col_num, val_type>& b, matrix_device<row_num, col_num, val_type>& c)
{
    const int tile = 16;
    int griddim = (col_num + tile - 1) / tile;
    griddim = (griddim > 4)? 4 : griddim;
    griddim = (griddim < 1)? 1 : griddim;
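    // Each block is a tile x tile (16x16) thread array; at most 4 blocks are launched,
    // and each block loops inside the kernel over its share of output rows and column strips.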
    dim3 threadsPerBlock(tile, tile);
    dim3 numBlocks(griddim, 1);
    mat_dot_kernel<row_num, adj_num, col_num, tile, val_type><<<numBlocks, threadsPerBlock>>>(a, b, c, griddim);
    cudaDeviceSynchronize();
}

// Proxy for matrix_device that automatically performs cudaMalloc and cudaFree
template<int row_num_, int col_num_, typename val_type_>
struct matrix_device_proxy
{
    static const int row_num = row_num_;
    static const int col_num = col_num_;
    using val_type = val_type_;
    matrix_device<row_num, col_num, val_type> mat;

    matrix_device_proxy()
    {
        init(mat);
    }

    matrix_device<row_num, col_num, val_type>& operator()()
    {
        return mat;
    }

    ~matrix_device_proxy()
    {
        release(mat);
    }

};
#endif

3. Experiment

A test program was written to exercise the multiplication:

#include "matrix.hpp"
int main()
{
    constexpr int row_num = 30;
    constexpr int adj_num = 30;
    constexpr int col_num = 30;
    matrix_device_proxy<row_num, adj_num, double> A;
    eyes(A(), 18.0);
    matrix_device_proxy<adj_num, col_num, double> B;
    eyes(B(), 2.0);
    matrix_device_proxy<row_num, col_num, double> C;
    mat_dot(A(), B(), C());
    auto host_c = type_cast(C());   // copy the result back to the host
    print(host_c);
    cudaDeviceReset();
    return 0;
}

This program multiplies two 30×30 diagonal matrices (with 18 and 2 on their diagonals), so the expected product is a diagonal matrix with 36 on the diagonal. The output is shown below:
[Figure: result of the multiplication]
As the output shows, the product is computed correctly.
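
Diagonal matrices only exercise one entry per row, so a stricter check is to compare the GPU result against a CPU reference for random inputs. The sketch below assumes the header above is saved as matrix.hpp; the helper randomize and the mismatch tolerance are illustrative additions, not part of the original program:

#include "matrix.hpp"
#include <cmath>
#include <cstdlib>

// Illustrative helper: fill a host matrix with small pseudo-random values.
template<int r, int c, typename T>
void randomize(matrix_host<r, c, T>& m)
{
    for (int i = 0; i < r; ++i)
        for (int j = 0; j < c; ++j)
            mat_at(m, i, j) = static_cast<T>(std::rand() % 10);
}

int main()
{
    constexpr int R = 30, K = 30, C = 30;
    matrix_host<R, K, double> a; randomize(a);
    matrix_host<K, C, double> b; randomize(b);

    // GPU product using the code from section 2.
    matrix_device<R, K, double> da = type_cast(a);
    matrix_device<K, C, double> db = type_cast(b);
    matrix_device_proxy<R, C, double> dc;
    mat_dot(da, db, dc());
    matrix_host<R, C, double> gpu = type_cast(dc());

    // CPU reference and element-wise comparison.
    int mismatches = 0;
    for (int i = 0; i < R; ++i)
        for (int j = 0; j < C; ++j)
        {
            double ref = 0;
            for (int k = 0; k < K; ++k)
                ref += mat_at(a, i, k) * mat_at(b, k, j);
            if (std::abs(ref - mat_at(gpu, i, j)) > 1e-9)
                ++mismatches;
        }
    std::cout << "mismatches: " << mismatches << std::endl;

    release(da);
    release(db);
    return 0;
}

A mismatch count of 0 indicates the tiled kernel agrees with the straightforward definition.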


Original article: https://blog.csdn.net/Dr_Jack/article/details/145140969
