Matrix Multiplication Implementation in the CUDA C++ API, with and without shared memory


Here is my implementation of matrix multiplication using the CUDA C++ API.
I tried to separate the source code into multiple files for easier maintenance and readability.



matrix.hpp



By creating this struct, I wanted to keep things tidy and avoid passing a lot of parameters to functions and kernels later.



#pragma once

struct matrix
{
    matrix(int rows, int cols)
    {
        this->rows = rows;
        this->cols = cols;
        this->size = rows * cols;
    }

    double *elements;
    int rows;
    int cols;
    int size;
};


kernels.cuh



Here I have placed the kernel prototypes. I wrote two versions of the matrix multiplication: one that makes use of shared memory and one that does not.



#pragma once

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "matrix.hpp"

#if SHARED == 1
__global__ void matrix_multiplication_kernel(matrix a, matrix b, matrix c, unsigned int tile_size);
#elif SHARED == 0
__global__ void matrix_multiplication_kernel(matrix a, matrix b, matrix c);
#endif


kernels.cu



Here are the actual implementations of the kernels.



#include "kernels.cuh"

#if SHARED == 1
__global__ void matrix_multiplication_kernel(matrix a, matrix b, matrix c, unsigned int tile_size)
int bx = blockIdx.x;
int by = blockIdx.y;

int tx = threadIdx.x;
int ty = threadIdx.y;

int row = by * blockDim.y + ty;
int col = bx * blockDim.x + tx;

extern __shared__ double buffer;
double *a_shared = &buffer[0];
double *b_shared = &buffer[tile_size * tile_size];

double sum = 0;

for (int k = 0; k < (tile_size + a.cols - 1) / tile_size; k++)
if (k * tile_size + tx < a.cols && row < a.rows)
a_shared[ty * tile_size + tx] = a.elements[row * a.cols + (k * tile_size + tx)];
else
a_shared[ty * tile_size + tx] = 0.0;

if (k * tile_size + ty < b.rows && col < b.cols)
b_shared[ty * tile_size + tx] = b.elements[(k * tile_size + ty) * b.cols + col];
else
b_shared[ty * tile_size + tx] = 0.0;

__syncthreads();
#pragma unroll
for (int n = 0; n < tile_size; ++n)
sum += a_shared[ty * tile_size + n] * b_shared[n * tile_size + tx];

__syncthreads();

if (row < c.rows && col < c.cols)
c.elements[row * c.cols + col] = sum;


#elif SHARED == 0
__global__ void matrix_multiplication_kernel(matrix a, matrix b, matrix c)
int bx = blockIdx.x;
int by = blockIdx.y;

int tx = threadIdx.x;
int ty = threadIdx.y;

int row = by * blockDim.y + ty;
int col = bx * blockDim.x + tx;

if (row < c.rows && col < c.cols)
double sum = 0;
#pragma unroll
for (int k = 0; k < a.cols && k < b.rows; k++)
sum += a.elements[row * a.cols + k] * b.elements[k * b.cols + col];

c.elements[row * c.cols + col] = sum;


#endif


wrappers.cu



I created some wrapper functions in this file in order to keep my main function clean and offer some kind of high-level abstraction to the user.



#include "wrappers.cuh"
#include <iostream>

void matrix_multiplication(matrix a, matrix b, matrix c, unsigned int block_size)
cudaError_t error;
dim3 dimBlock;
dim3 dimGrid;
dimBlock.x = block_size;
dimBlock.y = block_size;
dimBlock.z = 1;
dimGrid.x = (c.cols - 1) / dimBlock.x + 1;
dimGrid.y = (c.rows - 1) / dimBlock.y + 1;
dimGrid.z = 1;

cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
float milliseconds = 0;

cudaEventRecord(start);
#if SHARED == 1
unsigned int tile_size = block_size;
matrix_multiplication_kernel <<<dimGrid, dimBlock, 2 * tile_size * tile_size * sizeof(double)>>> (a, b, c, tile_size);
#elif SHARED == 0
matrix_multiplication_kernel <<<dimGrid, dimBlock>>> (a, b, c);
#endif
cudaEventRecord(stop);

cudaEventSynchronize(stop);

cudaEventElapsedTime(&milliseconds, start, stop);
std::cout << "kernel execution time" << " " << milliseconds << " " << "ms" << std::endl;

error = cudaDeviceSynchronize();
if (error != cudaSuccess)
std::cerr << cudaGetErrorString(error) << std::endl;




wrappers.cuh



Here are the prototypes of the wrapper functions.



#pragma once

#include "kernels.cuh"

void matrix_multiplication(matrix a, matrix b, matrix c, unsigned int block_size);


main.cpp



Here is the main function.



#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include "wrappers.cuh"

#include <iostream>
#include <string>

void print(matrix m, std::string label)
{
    std::cout << label << "[" << m.rows << "x" << m.cols << "] = " << std::endl;
    for (int row = 0; row < m.rows; row++)
    {
        for (int col = 0; col < m.cols; col++)
            std::cout << m.elements[row * m.cols + col] << "\t";

        std::cout << std::endl;
    }
}

int main(int argc, char **argv)
{
    if (argc != 8)
    {
        std::cout << "NAME" << std::endl;
        std::cout << "\t" << "matrix-multiplication" << std::endl;
        std::cout << std::endl;
        return 0;
    }

    int nDevices;
    cudaGetDeviceCount(&nDevices);
    for (int i = 0; i < nDevices; i++)
    {
        cudaDeviceProp prop;
        cudaGetDeviceProperties(&prop, i);
        std::cout << "GPU #" << prop.pciDeviceID << " " << prop.name;
        std::cout << std::endl;
    }

    int a_rows = std::stoi(argv[1]);
    int a_cols = std::stoi(argv[2]);

    int b_rows = std::stoi(argv[3]);
    int b_cols = std::stoi(argv[4]);

    int c_rows = std::stoi(argv[5]);
    int c_cols = std::stoi(argv[6]);

    int block_size = std::stoi(argv[7]);

    matrix a(a_rows, a_cols);
    matrix b(b_rows, b_cols);
    matrix c(c_rows, c_cols);

    cudaMallocManaged(&a.elements, a.size * sizeof(double));
    cudaMallocManaged(&b.elements, b.size * sizeof(double));
    cudaMallocManaged(&c.elements, c.size * sizeof(double));

    fill_col(a, block_size); // Implementation not shown here
    fill_row(b, block_size); // Implementation not shown here

    matrix_multiplication(a, b, c, block_size);

    print(a, "a");
    print(b, "b");
    print(c, "c");

    cudaFree(a.elements);
    cudaFree(b.elements);
    cudaFree(c.elements);

    return 0;
}

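Since the kernel variant is chosen by the preprocessor, the SHARED macro has to be defined at compile time; for example (an illustrative build line, not from the original post):

nvcc -DSHARED=1 main.cpp kernels.cu wrappers.cu -o matrix-multiplication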


So ... what do you think? Does it look good? Do you have any suggestions to make?







asked Apr 16 at 1:25 by xorz57, edited Apr 17 at 23:30





















  • Where do you actually allocate elements?
    – Zeta
    Apr 16 at 4:06










  • @Zeta I guess you need the main ... I will include it but I will leave some functions out for simplicity. Those are for initialization of data and printing data.
    – xorz57
    Apr 16 at 10:13
















2 Answers











matrix(int rows, int cols)
{
    this->rows = rows;
    this->cols = cols;
    this->size = rows * cols;
}



0_0



Did you actually mean



matrix(int rows, int cols) : rows(rows), cols(cols), size(rows * cols) {}


?



(Additionally beware of integer overflow; size_t would be better here.)



cudaMallocManaged(&a.elements, a.size * sizeof(double));


Managing an object's resources in external calls is generally not a good idea. Two things can be done here:



  1. cudaMallocManaged could be called from inside the matrix's constructor.


  2. Consequently, cudaFree could be called from matrix's destructor, but a better (IMHO) solution is to turn elements into a unique_ptr and call cudaFree from elements' deleter. Aside from raising consistency, this makes your matrix DefaultMoveable. (A minimal sketch follows below.)
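
A minimal sketch of that approach, assuming managed memory as in the question (the cuda_element_deleter type is illustrative, not part of any CUDA API):

#include <memory>
#include "cuda_runtime.h"

struct cuda_element_deleter
{
    void operator()(double *p) const { cudaFree(p); }
};

struct matrix
{
    matrix(int rows, int cols) : rows(rows), cols(cols), size(rows * cols)
    {
        double *p = nullptr;
        cudaMallocManaged(&p, size * sizeof(double)); // allocation owned by the object from birth
        elements.reset(p);
    }

    std::unique_ptr<double[], cuda_element_deleter> elements; // cudaFree runs automatically on destruction
    int rows;
    int cols;
    int size;
};

Note that the kernels in the question take matrix by value, so with this change they would need a non-owning view (raw pointer plus dimensions) as their parameter instead.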






– bipll, answered Apr 16 at 12:03, edited Apr 17 at 8:55
In addition to what bipll noted, the elements member is not initialized by the constructor, but left in a garbage state. At the very least, make it nullptr with an inline data member initializer.

And it doesn’t have a destructor. Shouldn’t it free the memory? I think you really want a unique_ptr with a custom deleter.

The compiler generated assignment and copy members for you, but they will do the wrong thing. You should mark them =delete to disable that. (A minimal sketch of these fixes follows below.)
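
A minimal sketch with those fixes applied to the struct from the question (keeping the raw pointer; combine with bipll's unique_ptr suggestion as preferred):

struct matrix
{
    matrix(int rows, int cols) : rows(rows), cols(cols), size(rows * cols) {}

    matrix(const matrix &) = delete;            // copying would alias the allocation
    matrix &operator=(const matrix &) = delete; // and double-free it once a destructor exists

    double *elements = nullptr; // inline data member initializer: no more garbage state
    int rows;
    int cols;
    int size;
};

Note that passing matrix to a kernel by value is itself a copy, so after this change the kernel parameters would also have to become a non-owning view.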






– JDługosz, answered Apr 18 at 6:21