Complex multiplication and integration with CUDA
Clash Royale CLAN TAG#URR8PPP
.everyoneloves__top-leaderboard:empty,.everyoneloves__mid-leaderboard:empty margin-bottom:0;
up vote
3
down vote
favorite
I want to perform element-wise multiplication on two complex vectors and accumulate ("integrate") the result into a vector called acc_y
. The acc_y
vector is updated on every iteration and then averaged out at the end.
I have modified vector addition code for it.
kernel for the multiply and integration :
// Complex multiply-accumulate: acc_y[i] += x1[i] * x2[i] (complex product),
// accumulated across kernel invocations in double precision.
// Grid-stride loop: correct for any <<<numBlock, numThread>>> configuration.
// NOTE: yre/yim are unused here but kept to preserve the original call signature.
// No atomics needed: each index i is touched by exactly one thread.
__global__ void cvctmac(int M, float *yre, float *yim,
                        const float *__restrict__ x1re, const float *__restrict__ x1im,
                        const float *__restrict__ x2re, const float *__restrict__ x2im,
                        double *acc_yre, double *acc_yim)
{
    int index  = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = blockDim.x * gridDim.x;

    for (int i = index; i < M; i += stride)
    {
        // Load each operand once; each value is reused by both output components.
        float a = x1re[i], b = x1im[i];
        float c = x2re[i], d = x2im[i];
        // (a + ib)(c + id) = (ac - bd) + i(ad + bc).
        // Promote to double before multiplying so the products match the
        // precision of the double accumulators (avoids float rounding of ac-bd).
        acc_yre[i] += (double)a * c - (double)b * d;
        acc_yim[i] += (double)a * d + (double)b * c;
    }
}
// Averaging pass: divide every accumulated element by the iteration count,
// turning the running sums in xre/xim into means.
// iter is passed as double so the division needs no per-element conversion.
// Grid-stride loop: correct for any <<<numBlock, numThread>>> configuration.
__global__ void cavg(int M, double iter, double *xre, double *xim)
{
    int index  = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = blockDim.x * gridDim.x;

    // Grid-stride approach: each thread handles indices index, index+stride, ...
    for (int i = index; i < M; i += stride)
    {
        xre[i] /= iter;
        xim[i] /= iter;
    }
}
As the code is performing complex arithmetic, I am doing 4 operations per thread (2 for the multiplication and 2 for the accumulation). What would be the way to optimize the kernel cvctmac?
Will shared memory help here?
I have used cuComplex.h also but getting the same performance.
host code :
for (j = 0; j < iter; j++)
// Generate data in host
for (i = j * M, c = 0; i < M * (j + 1); i++, c++)
x1re[c] = (float)i;
x1im[c] = 0.0;
x2re[c] = 1.0;
x2im[c] = 0.0;
// Copy host to device
cudaMemcpy(dx1re, x1re, M * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(dx1im, x1im, M * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(dx2re, x2re, M * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(dx2im, x2im, M * sizeof(float), cudaMemcpyHostToDevice);
//mac
cvctmac<<<numBlock, numThread>>>(
M,
dyre, dyim,
dx1re, dx1im,
dx2re, dx2im,
dacc_yre, dacc_yim
);
// Avg
cavg<<<numBlock, numThread>>>(
M, (double) iter,
dacc_yre, dacc_yim
);
Please suggest the way to optimize the code.
performance c numerical-methods cuda
add a comment |
up vote
3
down vote
favorite
I want to perform multiplication on two vectors and integrate it in a vector called acc_y
. The acc_y
variable will update over every iteration and averaged out.
I have modified vector addition code for it.
kernel for the multiply and integration :
// Complex multiply-accumulate: acc_y[i] += x1[i] * x2[i] (complex product),
// accumulated across kernel invocations in double precision.
// Grid-stride loop: correct for any <<<numBlock, numThread>>> configuration.
// NOTE: yre/yim are unused here but kept to preserve the original call signature.
// No atomics needed: each index i is touched by exactly one thread.
__global__ void cvctmac(int M, float *yre, float *yim,
                        const float *__restrict__ x1re, const float *__restrict__ x1im,
                        const float *__restrict__ x2re, const float *__restrict__ x2im,
                        double *acc_yre, double *acc_yim)
{
    int index  = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = blockDim.x * gridDim.x;

    for (int i = index; i < M; i += stride)
    {
        // Load each operand once; each value is reused by both output components.
        float a = x1re[i], b = x1im[i];
        float c = x2re[i], d = x2im[i];
        // (a + ib)(c + id) = (ac - bd) + i(ad + bc).
        // Promote to double before multiplying so the products match the
        // precision of the double accumulators (avoids float rounding of ac-bd).
        acc_yre[i] += (double)a * c - (double)b * d;
        acc_yim[i] += (double)a * d + (double)b * c;
    }
}
// Averaging pass: divide every accumulated element by the iteration count,
// turning the running sums in xre/xim into means.
// iter is passed as double so the division needs no per-element conversion.
// Grid-stride loop: correct for any <<<numBlock, numThread>>> configuration.
__global__ void cavg(int M, double iter, double *xre, double *xim)
{
    int index  = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = blockDim.x * gridDim.x;

    // Grid-stride approach: each thread handles indices index, index+stride, ...
    for (int i = index; i < M; i += stride)
    {
        xre[i] /= iter;
        xim[i] /= iter;
    }
}
As the code is performing complex arithmetic, I am doing 4 operations per thread(2 for multiplication and 2 for integration). what will be the way to optimize the kernel cvctmac?
Will shared memory help here?
I have used cuComplex.h also but getting the same performance.
host code :
for (j = 0; j < iter; j++)
// Generate data in host
for (i = j * M, c = 0; i < M * (j + 1); i++, c++)
x1re[c] = (float)i;
x1im[c] = 0.0;
x2re[c] = 1.0;
x2im[c] = 0.0;
// Copy host to device
cudaMemcpy(dx1re, x1re, M * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(dx1im, x1im, M * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(dx2re, x2re, M * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(dx2im, x2im, M * sizeof(float), cudaMemcpyHostToDevice);
//mac
cvctmac<<<numBlock, numThread>>>(
M,
dyre, dyim,
dx1re, dx1im,
dx2re, dx2im,
dacc_yre, dacc_yim
);
// Avg
cavg<<<numBlock, numThread>>>(
M, (double) iter,
dacc_yre, dacc_yim
);
Please suggest the way to optimize the code.
performance c numerical-methods cuda
It would be useful if you also stated on what CUDA API and Compute Capability you want your source code to be optimized.
— xorz57
Apr 16 at 1:34
The code is for runtime API and for compute capability of 6.1.
— sbhatporia
Apr 16 at 12:32
add a comment |
up vote
3
down vote
favorite
up vote
3
down vote
favorite
I want to perform multiplication on two vectors and integrate it in a vector called acc_y
. The acc_y
variable will update over every iteration and averaged out.
I have modified vector addition code for it.
kernel for the multiply and integration :
// Complex multiply-accumulate: acc_y[i] += x1[i] * x2[i] (complex product),
// accumulated across kernel invocations in double precision.
// Grid-stride loop: correct for any <<<numBlock, numThread>>> configuration.
// NOTE: yre/yim are unused here but kept to preserve the original call signature.
// No atomics needed: each index i is touched by exactly one thread.
__global__ void cvctmac(int M, float *yre, float *yim,
                        const float *__restrict__ x1re, const float *__restrict__ x1im,
                        const float *__restrict__ x2re, const float *__restrict__ x2im,
                        double *acc_yre, double *acc_yim)
{
    int index  = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = blockDim.x * gridDim.x;

    for (int i = index; i < M; i += stride)
    {
        // Load each operand once; each value is reused by both output components.
        float a = x1re[i], b = x1im[i];
        float c = x2re[i], d = x2im[i];
        // (a + ib)(c + id) = (ac - bd) + i(ad + bc).
        // Promote to double before multiplying so the products match the
        // precision of the double accumulators (avoids float rounding of ac-bd).
        acc_yre[i] += (double)a * c - (double)b * d;
        acc_yim[i] += (double)a * d + (double)b * c;
    }
}
// Averaging pass: divide every accumulated element by the iteration count,
// turning the running sums in xre/xim into means.
// iter is passed as double so the division needs no per-element conversion.
// Grid-stride loop: correct for any <<<numBlock, numThread>>> configuration.
__global__ void cavg(int M, double iter, double *xre, double *xim)
{
    int index  = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = blockDim.x * gridDim.x;

    // Grid-stride approach: each thread handles indices index, index+stride, ...
    for (int i = index; i < M; i += stride)
    {
        xre[i] /= iter;
        xim[i] /= iter;
    }
}
As the code is performing complex arithmetic, I am doing 4 operations per thread(2 for multiplication and 2 for integration). what will be the way to optimize the kernel cvctmac?
Will shared memory help here?
I have used cuComplex.h also but getting the same performance.
host code :
for (j = 0; j < iter; j++)
// Generate data in host
for (i = j * M, c = 0; i < M * (j + 1); i++, c++)
x1re[c] = (float)i;
x1im[c] = 0.0;
x2re[c] = 1.0;
x2im[c] = 0.0;
// Copy host to device
cudaMemcpy(dx1re, x1re, M * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(dx1im, x1im, M * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(dx2re, x2re, M * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(dx2im, x2im, M * sizeof(float), cudaMemcpyHostToDevice);
//mac
cvctmac<<<numBlock, numThread>>>(
M,
dyre, dyim,
dx1re, dx1im,
dx2re, dx2im,
dacc_yre, dacc_yim
);
// Avg
cavg<<<numBlock, numThread>>>(
M, (double) iter,
dacc_yre, dacc_yim
);
Please suggest the way to optimize the code.
performance c numerical-methods cuda
I want to perform multiplication on two vectors and integrate it in a vector called acc_y
. The acc_y
variable will update over every iteration and averaged out.
I have modified vector addition code for it.
kernel for the multiply and integration :
// Complex multiply-accumulate: acc_y[i] += x1[i] * x2[i] (complex product),
// accumulated across kernel invocations in double precision.
// Grid-stride loop: correct for any <<<numBlock, numThread>>> configuration.
// NOTE: yre/yim are unused here but kept to preserve the original call signature.
// No atomics needed: each index i is touched by exactly one thread.
__global__ void cvctmac(int M, float *yre, float *yim,
                        const float *__restrict__ x1re, const float *__restrict__ x1im,
                        const float *__restrict__ x2re, const float *__restrict__ x2im,
                        double *acc_yre, double *acc_yim)
{
    int index  = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = blockDim.x * gridDim.x;

    for (int i = index; i < M; i += stride)
    {
        // Load each operand once; each value is reused by both output components.
        float a = x1re[i], b = x1im[i];
        float c = x2re[i], d = x2im[i];
        // (a + ib)(c + id) = (ac - bd) + i(ad + bc).
        // Promote to double before multiplying so the products match the
        // precision of the double accumulators (avoids float rounding of ac-bd).
        acc_yre[i] += (double)a * c - (double)b * d;
        acc_yim[i] += (double)a * d + (double)b * c;
    }
}
// Averaging pass: divide every accumulated element by the iteration count,
// turning the running sums in xre/xim into means.
// iter is passed as double so the division needs no per-element conversion.
// Grid-stride loop: correct for any <<<numBlock, numThread>>> configuration.
__global__ void cavg(int M, double iter, double *xre, double *xim)
{
    int index  = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = blockDim.x * gridDim.x;

    // Grid-stride approach: each thread handles indices index, index+stride, ...
    for (int i = index; i < M; i += stride)
    {
        xre[i] /= iter;
        xim[i] /= iter;
    }
}
As the code is performing complex arithmetic, I am doing 4 operations per thread(2 for multiplication and 2 for integration). what will be the way to optimize the kernel cvctmac?
Will shared memory help here?
I have used cuComplex.h also but getting the same performance.
host code :
for (j = 0; j < iter; j++)
// Generate data in host
for (i = j * M, c = 0; i < M * (j + 1); i++, c++)
x1re[c] = (float)i;
x1im[c] = 0.0;
x2re[c] = 1.0;
x2im[c] = 0.0;
// Copy host to device
cudaMemcpy(dx1re, x1re, M * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(dx1im, x1im, M * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(dx2re, x2re, M * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(dx2im, x2im, M * sizeof(float), cudaMemcpyHostToDevice);
//mac
cvctmac<<<numBlock, numThread>>>(
M,
dyre, dyim,
dx1re, dx1im,
dx2re, dx2im,
dacc_yre, dacc_yim
);
// Avg
cavg<<<numBlock, numThread>>>(
M, (double) iter,
dacc_yre, dacc_yim
);
Please suggest the way to optimize the code.
performance c numerical-methods cuda
edited Mar 21 at 13:40
asked Mar 20 at 11:33
sbhatporia
164
164
It would be useful if you also stated on what CUDA API and Compute Capability you want your source code to be optimized.
â xorz57
Apr 16 at 1:34
The code is for runtime API and for compute capability of 6.1.
â sbhatporia
Apr 16 at 12:32
add a comment |Â
It would be useful if you also stated on what CUDA API and Compute Capability you want your source code to be optimized.
â xorz57
Apr 16 at 1:34
The code is for runtime API and for compute capability of 6.1.
â sbhatporia
Apr 16 at 12:32
It would be useful if you also stated on what CUDA API and Compute Capability you want your source code to be optimized.
â xorz57
Apr 16 at 1:34
It would be useful if you also stated on what CUDA API and Compute Capability you want your source code to be optimized.
â xorz57
Apr 16 at 1:34
The code is for runtime API and for compute capability of 6.1.
â sbhatporia
Apr 16 at 12:32
The code is for runtime API and for compute capability of 6.1.
â sbhatporia
Apr 16 at 12:32
add a comment |Â
active
oldest
votes
active
oldest
votes
active
oldest
votes
active
oldest
votes
active
oldest
votes
Sign up or log in
StackExchange.ready(function ()
StackExchange.helpers.onClickDraftSave('#login-link');
);
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
StackExchange.ready(
function ()
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fcodereview.stackexchange.com%2fquestions%2f190022%2fcomplex-multiplication-and-integration-with-cuda%23new-answer', 'question_page');
);
Post as a guest
Sign up or log in
StackExchange.ready(function ()
StackExchange.helpers.onClickDraftSave('#login-link');
);
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Sign up or log in
StackExchange.ready(function ()
StackExchange.helpers.onClickDraftSave('#login-link');
);
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Sign up or log in
StackExchange.ready(function ()
StackExchange.helpers.onClickDraftSave('#login-link');
);
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
It would be useful if you also stated on what CUDA API and Compute Capability you want your source code to be optimized.
â xorz57
Apr 16 at 1:34
The code is for runtime API and for compute capability of 6.1.
â sbhatporia
Apr 16 at 12:32