Matrix Multiplication Implementation in the CUDA C++ API, with and without shared memory


Here is my implementation of matrix multiplication using the CUDA C++ API.
I tried to separate the source code into multiple files for easier maintenance and readability.



matrix.hpp



By creating this struct, I wanted to keep things tidy and avoid passing a lot of parameters to functions and kernels later.



#pragma once

struct matrix
{
    matrix(int rows, int cols)
    {
        this->rows = rows;
        this->cols = cols;
        this->size = rows * cols;
    }

    double *elements;
    int rows;
    int cols;
    int size;
};


kernels.cuh



Here I have placed the kernel prototypes. I wrote two versions of the matrix multiplication: one that makes use of shared memory and one that does not.



#pragma once

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "matrix.hpp"

#if SHARED == 1
__global__ void matrix_multiplication_kernel(matrix a, matrix b, matrix c, unsigned int tile_size);
#elif SHARED == 0
__global__ void matrix_multiplication_kernel(matrix a, matrix b, matrix c);
#endif


kernels.cu



Here are the actual implementations of the kernels.



#include "kernels.cuh"

#if SHARED == 1
__global__ void matrix_multiplication_kernel(matrix a, matrix b, matrix c, unsigned int tile_size)
int bx = blockIdx.x;
int by = blockIdx.y;

int tx = threadIdx.x;
int ty = threadIdx.y;

int row = by * blockDim.y + ty;
int col = bx * blockDim.x + tx;

extern __shared__ double buffer;
double *a_shared = &buffer[0];
double *b_shared = &buffer[tile_size * tile_size];

double sum = 0;

for (int k = 0; k < (tile_size + a.cols - 1) / tile_size; k++)
if (k * tile_size + tx < a.cols && row < a.rows)
a_shared[ty * tile_size + tx] = a.elements[row * a.cols + (k * tile_size + tx)];
else
a_shared[ty * tile_size + tx] = 0.0;

if (k * tile_size + ty < b.rows && col < b.cols)
b_shared[ty * tile_size + tx] = b.elements[(k * tile_size + ty) * b.cols + col];
else
b_shared[ty * tile_size + tx] = 0.0;

__syncthreads();
#pragma unroll
for (int n = 0; n < tile_size; ++n)
sum += a_shared[ty * tile_size + n] * b_shared[n * tile_size + tx];

__syncthreads();

if (row < c.rows && col < c.cols)
c.elements[row * c.cols + col] = sum;


#elif SHARED == 0
__global__ void matrix_multiplication_kernel(matrix a, matrix b, matrix c)
int bx = blockIdx.x;
int by = blockIdx.y;

int tx = threadIdx.x;
int ty = threadIdx.y;

int row = by * blockDim.y + ty;
int col = bx * blockDim.x + tx;

if (row < c.rows && col < c.cols)
double sum = 0;
#pragma unroll
for (int k = 0; k < a.cols && k < b.rows; k++)
sum += a.elements[row * a.cols + k] * b.elements[k * b.cols + col];

c.elements[row * c.cols + col] = sum;


#endif


wrappers.cu



I created some wrapper functions in this file in order to keep my main function clean and offer some kind of high-level abstraction to the user.



#include "wrappers.cuh"
#include <iostream>

void matrix_multiplication(matrix a, matrix b, matrix c, unsigned int block_size)
cudaError_t error;
dim3 dimBlock;
dim3 dimGrid;
dimBlock.x = block_size;
dimBlock.y = block_size;
dimBlock.z = 1;
dimGrid.x = (c.cols - 1) / dimBlock.x + 1;
dimGrid.y = (c.rows - 1) / dimBlock.y + 1;
dimGrid.z = 1;

cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
float milliseconds = 0;

cudaEventRecord(start);
#if SHARED == 1
unsigned int tile_size = block_size;
matrix_multiplication_kernel <<<dimGrid, dimBlock, 2 * tile_size * tile_size * sizeof(double)>>> (a, b, c, tile_size);
#elif SHARED == 0
matrix_multiplication_kernel <<<dimGrid, dimBlock>>> (a, b, c);
#endif
cudaEventRecord(stop);

cudaEventSynchronize(stop);

cudaEventElapsedTime(&milliseconds, start, stop);
std::cout << "kernel execution time" << " " << milliseconds << " " << "ms" << std::endl;

error = cudaDeviceSynchronize();
if (error != cudaSuccess)
std::cerr << cudaGetErrorString(error) << std::endl;




wrappers.cuh



Here are the prototypes of the wrapper functions.



#pragma once

#include "kernels.cuh"

void matrix_multiplication(matrix a, matrix b, matrix c, unsigned int block_size);


main.cpp



Here is the main function.



#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include "wrappers.cuh"

#include <iostream>
#include <string>

void print(matrix m, std::string label)
{
    std::cout << label << "[" << m.rows << "x" << m.cols << "] = " << std::endl;
    for (int row = 0; row < m.rows; row++)
    {
        for (int col = 0; col < m.cols; col++)
            std::cout << m.elements[row * m.cols + col] << "\t";

        std::cout << std::endl;
    }
}

int main(int argc, char **argv)
{
    if (argc != 8)
    {
        std::cout << "NAME" << std::endl;
        std::cout << "\t" << "matrix-multiplication" << std::endl;
        std::cout << std::endl;
        return 0;
    }

    int nDevices;
    cudaGetDeviceCount(&nDevices);
    for (int i = 0; i < nDevices; i++)
    {
        cudaDeviceProp prop;
        cudaGetDeviceProperties(&prop, i);
        std::cout << "GPU #" << prop.pciDeviceID << " " << prop.name;
        std::cout << std::endl;
    }

    int a_rows = std::stoi(argv[1]);
    int a_cols = std::stoi(argv[2]);

    int b_rows = std::stoi(argv[3]);
    int b_cols = std::stoi(argv[4]);

    int c_rows = std::stoi(argv[5]);
    int c_cols = std::stoi(argv[6]);

    int block_size = std::stoi(argv[7]);

    matrix a(a_rows, a_cols);
    matrix b(b_rows, b_cols);
    matrix c(c_rows, c_cols);

    cudaMallocManaged(&a.elements, a.size * sizeof(double));
    cudaMallocManaged(&b.elements, b.size * sizeof(double));
    cudaMallocManaged(&c.elements, c.size * sizeof(double));

    fill_col(a, block_size); // Implementation not shown here
    fill_row(b, block_size); // Implementation not shown here

    matrix_multiplication(a, b, c, block_size);

    print(a, "a");
    print(b, "b");
    print(c, "c");

    cudaFree(a.elements);
    cudaFree(b.elements);
    cudaFree(c.elements);

    return 0;
}

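Since the kernel variant is chosen by the preprocessor, the SHARED macro has to be defined at compile time; for example (an illustrative build line, not from the original post):

nvcc -DSHARED=1 main.cpp kernels.cu wrappers.cu -o matrix-multiplication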


So ... what do you think? Does it look good? Do you have any suggestions to make?







asked Apr 16 at 1:25 by xorz57, edited Apr 17 at 23:30





















  • Where do you actually allocate elements?
    – Zeta
    Apr 16 at 4:06










  • @Zeta I guess you need the main ... I will include it but I will leave some functions out for simplicity. Those are for initialization of data and printing data.
    – xorz57
    Apr 16 at 10:13
















2 Answers











matrix(int rows, int cols)
{
    this->rows = rows;
    this->cols = cols;
    this->size = rows * cols;
}



0_0



Did you actually mean



matrix(int rows, int cols) : rows(rows), cols(cols), size(rows * cols) {}


?



(Additionally beware of integer overflow; size_t would be better here.)



cudaMallocManaged(&a.elements, a.size * sizeof(double));


Managing an object's resources in external calls is generally not a good idea. Two things can be done here:



  1. cudaMallocManaged could be called from inside the matrix's constructor.


  2. Consequently, cudaFree could be called from matrix's destructor, but a better (IMHO) solution is to turn elements into a unique_ptr and call cudaFree from elements' deleter. Aside from raising consistency, this makes your matrix DefaultMoveable. (A minimal sketch follows below.)
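
A minimal sketch of that approach, assuming managed memory as in the question (the cuda_element_deleter type is illustrative, not part of any CUDA API):

#include <memory>
#include "cuda_runtime.h"

struct cuda_element_deleter
{
    void operator()(double *p) const { cudaFree(p); }
};

struct matrix
{
    matrix(int rows, int cols) : rows(rows), cols(cols), size(rows * cols)
    {
        double *p = nullptr;
        cudaMallocManaged(&p, size * sizeof(double)); // allocation owned by the object from birth
        elements.reset(p);
    }

    std::unique_ptr<double[], cuda_element_deleter> elements; // cudaFree runs automatically on destruction
    int rows;
    int cols;
    int size;
};

Note that the kernels in the question take matrix by value, so with this change they would need a non-owning view (raw pointer plus dimensions) as their parameter instead.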






– bipll, answered Apr 16 at 12:03, edited Apr 17 at 8:55
In addition to what bipll noted, the elements member is not initialized by the constructor, but left in a garbage state. At the very least, make it nullptr with an inline data member initializer.

And it doesn’t have a destructor. Shouldn’t it free the memory? I think you really want a unique_ptr with a custom deleter.

The compiler generated assignment and copy members for you, but they will do the wrong thing. You should mark them =delete to disable that. (A minimal sketch of these fixes follows below.)
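
A minimal sketch with those fixes applied to the struct from the question (keeping the raw pointer; combine with bipll's unique_ptr suggestion as preferred):

struct matrix
{
    matrix(int rows, int cols) : rows(rows), cols(cols), size(rows * cols) {}

    matrix(const matrix &) = delete;            // copying would alias the allocation
    matrix &operator=(const matrix &) = delete; // and double-free it once a destructor exists

    double *elements = nullptr; // inline data member initializer: no more garbage state
    int rows;
    int cols;
    int size;
};

Note that passing matrix to a kernel by value is itself a copy, so after this change the kernel parameters would also have to become a non-owning view.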






– JDługosz, answered Apr 18 at 6:21