Matrix Multiplication Implementation in CUDA C++ API with and without shared memory
Here is my implementation of matrix multiplication using the CUDA C++ API.
I tried to separate the source code into multiple files for easier maintenance and readability.
matrix.hpp
By creating this struct I wanted to keep things tidy and avoid passing a lot of parameters to functions and kernels later.
#pragma once

struct matrix
{
    matrix(int rows, int cols)
    {
        this->rows = rows;
        this->cols = cols;
        this->size = rows * cols;
    }

    double *elements;
    int rows;
    int cols;
    int size;
};
kernels.cuh
Here I have placed the kernel prototypes. I wrote two versions of the matrix multiplication: one that uses shared memory and one that does not.
#pragma once
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "matrix.hpp"
#if SHARED == 1
__global__ void matrix_multiplication_kernel(matrix a, matrix b, matrix c, unsigned int tile_size);
#elif SHARED == 0
__global__ void matrix_multiplication_kernel(matrix a, matrix b, matrix c);
#endif
kernels.cu
Here are the actual implementations of the kernels.
#include "kernels.cuh"
#if SHARED == 1
__global__ void matrix_multiplication_kernel(matrix a, matrix b, matrix c, unsigned int tile_size)
int bx = blockIdx.x;
int by = blockIdx.y;
int tx = threadIdx.x;
int ty = threadIdx.y;
int row = by * blockDim.y + ty;
int col = bx * blockDim.x + tx;
extern __shared__ double buffer;
double *a_shared = &buffer[0];
double *b_shared = &buffer[tile_size * tile_size];
double sum = 0;
for (int k = 0; k < (tile_size + a.cols - 1) / tile_size; k++)
if (k * tile_size + tx < a.cols && row < a.rows)
a_shared[ty * tile_size + tx] = a.elements[row * a.cols + (k * tile_size + tx)];
else
a_shared[ty * tile_size + tx] = 0.0;
if (k * tile_size + ty < b.rows && col < b.cols)
b_shared[ty * tile_size + tx] = b.elements[(k * tile_size + ty) * b.cols + col];
else
b_shared[ty * tile_size + tx] = 0.0;
__syncthreads();
#pragma unroll
for (int n = 0; n < tile_size; ++n)
sum += a_shared[ty * tile_size + n] * b_shared[n * tile_size + tx];
__syncthreads();
if (row < c.rows && col < c.cols)
c.elements[row * c.cols + col] = sum;
#elif SHARED == 0
__global__ void matrix_multiplication_kernel(matrix a, matrix b, matrix c)
int bx = blockIdx.x;
int by = blockIdx.y;
int tx = threadIdx.x;
int ty = threadIdx.y;
int row = by * blockDim.y + ty;
int col = bx * blockDim.x + tx;
if (row < c.rows && col < c.cols)
double sum = 0;
#pragma unroll
for (int k = 0; k < a.cols && k < b.rows; k++)
sum += a.elements[row * a.cols + k] * b.elements[k * b.cols + col];
c.elements[row * c.cols + col] = sum;
#endif
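A side note on the inner loop: #pragma unroll can only fully unroll a loop whose trip count is known at compile time, and tile_size here is a runtime argument. A common alternative, shown below purely as a sketch (the templated kernel name and fixed tile size are my assumptions, not part of the reviewed code), is to make the tile size a template parameter; this lets the compiler actually unroll the loop and replaces the dynamic shared memory with statically sized tiles:

template <int TILE>
__global__ void matrix_multiplication_kernel_tiled(matrix a, matrix b, matrix c)
{
    // Statically sized shared tiles; no dynamic shared memory at launch.
    __shared__ double a_shared[TILE][TILE];
    __shared__ double b_shared[TILE][TILE];

    int tx = threadIdx.x;
    int ty = threadIdx.y;
    int row = blockIdx.y * TILE + ty;
    int col = blockIdx.x * TILE + tx;

    double sum = 0;

    for (int k = 0; k < (a.cols + TILE - 1) / TILE; k++)
    {
        // Guarded loads, zero-padding out-of-range elements (as in the original).
        a_shared[ty][tx] = (k * TILE + tx < a.cols && row < a.rows)
                         ? a.elements[row * a.cols + k * TILE + tx] : 0.0;
        b_shared[ty][tx] = (k * TILE + ty < b.rows && col < b.cols)
                         ? b.elements[(k * TILE + ty) * b.cols + col] : 0.0;

        __syncthreads();

#pragma unroll
        for (int n = 0; n < TILE; ++n)   // TILE is a compile-time constant
            sum += a_shared[ty][n] * b_shared[n][tx];

        __syncthreads();
    }

    if (row < c.rows && col < c.cols)
        c.elements[row * c.cols + col] = sum;
}

The launch would then be matrix_multiplication_kernel_tiled<16><<<dimGrid, dimBlock>>>(a, b, c), at the cost of instantiating the kernel once per tile size you want to support.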
wrappers.cu
I created some wrapper functions in this file in order to keep my main function clean and offer some kind of high-level abstraction to the user.
#include "wrappers.cuh"
#include <iostream>
void matrix_multiplication(matrix a, matrix b, matrix c, unsigned int block_size)
cudaError_t error;
dim3 dimBlock;
dim3 dimGrid;
dimBlock.x = block_size;
dimBlock.y = block_size;
dimBlock.z = 1;
dimGrid.x = (c.cols - 1) / dimBlock.x + 1;
dimGrid.y = (c.rows - 1) / dimBlock.y + 1;
dimGrid.z = 1;
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
float milliseconds = 0;
cudaEventRecord(start);
#if SHARED == 1
unsigned int tile_size = block_size;
matrix_multiplication_kernel <<<dimGrid, dimBlock, 2 * tile_size * tile_size * sizeof(double)>>> (a, b, c, tile_size);
#elif SHARED == 0
matrix_multiplication_kernel <<<dimGrid, dimBlock>>> (a, b, c);
#endif
cudaEventRecord(stop);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&milliseconds, start, stop);
std::cout << "kernel execution time" << " " << milliseconds << " " << "ms" << std::endl;
error = cudaDeviceSynchronize();
if (error != cudaSuccess)
std::cerr << cudaGetErrorString(error) << std::endl;
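One caveat worth stating explicitly: only cudaDeviceSynchronize is checked here, so the event calls and the kernel launch itself can still fail silently. A small helper along the lines of the sketch below (my addition, not part of the original sources) makes it cheap to check every runtime call, and cudaGetLastError() right after the launch catches invalid launch configurations:

#include <iostream>

#include "cuda_runtime.h"

// Hypothetical helper: wraps a CUDA runtime call and reports any
// failure together with the file and line where it happened.
#define CUDA_CHECK(call)                                         \
    do {                                                         \
        cudaError_t err_ = (call);                               \
        if (err_ != cudaSuccess)                                 \
            std::cerr << __FILE__ << ":" << __LINE__ << ": "     \
                      << cudaGetErrorString(err_) << std::endl;  \
    } while (0)

// Usage:
//   CUDA_CHECK(cudaEventRecord(start));
//   matrix_multiplication_kernel<<<dimGrid, dimBlock>>>(a, b, c);
//   CUDA_CHECK(cudaGetLastError());   // reports launch-time errors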
wrappers.cuh
Here are the prototypes of the wrapper functions.
#pragma once
#include "kernels.cuh"
void matrix_multiplication(matrix a, matrix b, matrix c, unsigned int block_size);
main.cpp
Here is the main function.
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "wrappers.cuh"
#include <iostream>
#include <string>
void print(matrix m, std::string label)
std::cout << label << "[" << m.rows << "x" << m.cols << "] = " << std::endl;
for (int row = 0; row < m.rows; row++)
for (int col = 0; col < m.cols; col++)
std::cout << m.elements[row * m.cols + col] << "t";
std::cout << std::endl;
int main(int argc, char **argv)
if (argc != 8)
std::cout << "NAME" << std::endl;
std::cout << "t" << "matrix-multiplication" << std::endl;
std::cout << std::endl;
return 0;
int nDevices;
cudaGetDeviceCount(&nDevices);
for (int i = 0; i < nDevices; i++)
cudaDeviceProp prop;
cudaGetDeviceProperties(&prop, i);
std::cout << "GPU #" << prop.pciDeviceID << " " << prop.name;
std::cout << std::endl;
int a_rows = std::stoi(argv[1]);
int a_cols = std::stoi(argv[2]);
int b_rows = std::stoi(argv[3]);
int b_cols = std::stoi(argv[4]);
int c_rows = std::stoi(argv[5]);
int c_cols = std::stoi(argv[6]);
int block_size = std::stoi(argv[7]);
matrix a(a_rows, a_cols);
matrix b(b_rows, b_cols);
matrix c(c_rows, c_cols);
cudaMallocManaged(&a.elements, a.size * sizeof(double));
cudaMallocManaged(&b.elements, b.size * sizeof(double));
cudaMallocManaged(&c.elements, c.size * sizeof(double));
fill_col(a, block_size); // Implementation not shown here
fill_row(b, block_size); // Implementation not shown here
matrix_multiplication(a, b, c, block_size);
print(a, "a");
print(b, "b");
print(c, "c");
cudaFree(a.elements);
cudaFree(b.elements);
cudaFree(c.elements);
return 0;
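Since SHARED selects the kernel at preprocessing time, the macro has to be defined consistently for every translation unit. Assuming nvcc and the file names above (the exact invocation is my assumption; the post does not show a build command), building and running might look like:

nvcc -DSHARED=1 -o matrix-multiplication main.cpp kernels.cu wrappers.cu
./matrix-multiplication 64 32 32 16 64 16 16

where the seven arguments are a_rows a_cols b_rows b_cols c_rows c_cols block_size, and -DSHARED=0 selects the naive kernel instead.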
So ... what do you think? Does it look good? Do you have any suggestions to make?
c++ performance beginner matrix cuda
asked Apr 16 at 1:25, edited Apr 17 at 23:30, by xorz57

Where do you actually allocate elements? – Zeta, Apr 16 at 4:06

@Zeta I guess you need the main ... I will include it but I will leave some functions out for simplicity. Those are for initialization of data and printing data. – xorz57, Apr 16 at 10:13
2 Answers
matrix(int rows, int cols)
{
    this->rows = rows;
    this->cols = cols;
    this->size = rows * cols;
}

0_0

Did you actually mean matrix(int rows, int cols) : rows(rows), cols(cols), size(rows * cols) {}? (Additionally, beware of integer overflow; size_t would be better here.)

cudaMallocManaged(&a.elements, a.size * sizeof(double));

Managing an object's resources in external calls is generally not a good idea. Two things can be done here:

- cudaMallocManaged could be called from inside the matrix's constructor.
- Consequently, cudaFree could be called from matrix's destructor, but a better (IMHO) solution is to turn elements into a unique_ptr and call cudaFree from elements' deleter. Aside from raising consistency, this makes your matrix DefaultMoveable.

– bipll, answered Apr 16 at 12:03, edited Apr 17 at 8:55
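For concreteness, the unique_ptr route described above could be sketched as follows (this is my illustration of the suggestion, not code from the post; the deleter name is made up). Note that a move-only owner can no longer be passed by value to a __global__ kernel, so the kernels would then receive elements.get() plus the dimensions rather than the struct itself.

#pragma once

#include <cstddef>
#include <memory>

#include "cuda_runtime.h"

// Deleter that releases memory obtained from cudaMallocManaged.
struct cuda_free_deleter
{
    void operator()(double *p) const { cudaFree(p); }
};

struct matrix
{
    matrix(int rows, int cols)
        : rows(rows), cols(cols), size(static_cast<std::size_t>(rows) * cols)
    {
        double *p = nullptr;
        cudaMallocManaged(&p, size * sizeof(double));
        elements.reset(p);   // from here on, cudaFree is called automatically
    }

    std::unique_ptr<double[], cuda_free_deleter> elements;
    int rows;
    int cols;
    std::size_t size;
};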
In addition to what bipll noted, the elements member is not initialized by the constructor, but left in a garbage state. At the very least, make it nullptr with an inline data member initializer.

And it doesn't have a destructor. Shouldn't it free the memory? I think you really want a unique_ptr with a custom deleter.

The compiler generated assignment and copy members for you, but they will do the wrong thing. You should mark them =delete to disable that.

– JDługosz, answered Apr 18 at 6:21
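Applied directly to the posted struct, those smaller fixes might look like the following sketch (my illustration, not the answerer's code). One caveat: the kernels currently take matrix by value, so deleting the copy operations would also require passing the kernels a raw pointer and dimensions, or a trivially copyable view, instead of the owning struct.

#include "cuda_runtime.h"

struct matrix
{
    matrix(int rows, int cols) : rows(rows), cols(cols), size(rows * cols) {}

    ~matrix() { cudaFree(elements); }   // cudaFree(nullptr) is documented as a no-op

    matrix(const matrix &) = delete;              // a shallow copy would double-free
    matrix &operator=(const matrix &) = delete;   // once the destructor frees elements

    double *elements = nullptr;   // inline member initializer, as suggested
    int rows;
    int cols;
    int size;
};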