Complex multiplication and integration with CUDA

The name of the pictureThe name of the pictureThe name of the pictureClash Royale CLAN TAG#URR8PPP





.everyoneloves__top-leaderboard:empty,.everyoneloves__mid-leaderboard:empty margin-bottom:0;







up vote
3
down vote

favorite












I want to perform multiplication on two vectors and integrate it in a vector called acc_y. The acc_y variable will update over every iteration and averaged out.
I have modified vector addition code for it.



kernel for the multiply and integration :



// Fused complex multiply-accumulate over two split-complex vectors:
//   acc_y[i] += x1[i] * x2[i]   (complex product, accumulated in double)
// Inputs are single-precision split re/im arrays of length M; the accumulator
// is double precision so repeated accumulation across host iterations does not
// lose precision. yre/yim are unused here but kept to preserve the caller-facing
// signature. Launch with any 1-D grid; the grid-stride loop covers all M
// elements regardless of launch configuration.
__global__ void cvctmac(int M, float *yre, float *yim,
                        const float *__restrict__ x1re, const float *__restrict__ x1im,
                        const float *__restrict__ x2re, const float *__restrict__ x2im,
                        double *acc_yre, double *acc_yim)
{
    int index  = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = blockDim.x * gridDim.x;

    // Grid-stride loop; each element is loaded once and reused for both
    // the real and imaginary parts of (a+bi)(c+di) = (ac - bd) + (ad + bc)i.
    for (int i = index; i < M; i += stride) {
        float a = x1re[i], b = x1im[i];
        float c = x2re[i], d = x2im[i];
        acc_yre[i] += (double)(a * c - b * d);
        acc_yim[i] += (double)(a * d + b * c);
    }
}

// In-place averaging: divides each accumulated element by the iteration
// count, turning the running sums in xre/xim into means.
// Preconditions: iter != 0; xre/xim point to device arrays of length M.
// Launch with any 1-D grid; the grid-stride loop covers all M elements.
__global__ void cavg(int M, double iter, double *xre, double *xim)
{
    int index  = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = blockDim.x * gridDim.x;

    // Hoist the reciprocal so the loop body uses two multiplies instead of
    // two (much slower) double divisions per element.
    double inv = 1.0 / iter;

    // Grid-stride approach: correct for any launch configuration.
    for (int i = index; i < M; i += stride) {
        xre[i] *= inv;
        xim[i] *= inv;
    }
}



As the code is performing complex arithmetic, I am doing 4 operations per thread (2 for the multiplication and 2 for the integration). What would be the best way to optimize the kernel cvctmac?



Will shared memory help here?



I have used cuComplex.h also but getting the same performance.



host code :



// Accumulate iter batches of complex products into dacc_y*, then average once.
// NOTE(review): braces reconstructed — cavg is placed AFTER the loop, since
// averaging inside the loop would divide the accumulator by iter on every
// pass; confirm against the original source.
for (j = 0; j < iter; j++) {

    // Generate the j-th batch of test data on the host.
    for (i = j * M, c = 0; i < M * (j + 1); i++, c++) {
        x1re[c] = (float)i;
        x1im[c] = 0.0f;
        x2re[c] = 1.0f;
        x2im[c] = 0.0f;
    }

    // Copy host to device. These synchronous copies dominate the runtime;
    // pinned host memory + cudaMemcpyAsync on streams would let transfer
    // overlap with the kernel of the previous iteration.
    cudaMemcpy(dx1re, x1re, M * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(dx1im, x1im, M * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(dx2re, x2re, M * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(dx2im, x2im, M * sizeof(float), cudaMemcpyHostToDevice);

    // Multiply-accumulate this batch into the double-precision accumulator.
    cvctmac<<<numBlock, numThread>>>(
        M,
        dyre, dyim,
        dx1re, dx1im,
        dx2re, dx2im,
        dacc_yre, dacc_yim
    );
}

// Average the accumulator once, after all iterations have been summed.
cavg<<<numBlock, numThread>>>(
    M, (double) iter,
    dacc_yre, dacc_yim
);


Please suggest the way to optimize the code.







share|improve this question





















  • It would be useful if you also stated on what CUDA API and Compute Capability you want your source code to be optimized.
    – xorz57
    Apr 16 at 1:34










  • The code is for runtime API and for compute capability of 6.1.
    – sbhatporia
    Apr 16 at 12:32
















up vote
3
down vote

favorite












I want to perform multiplication on two vectors and integrate it in a vector called acc_y. The acc_y variable will update over every iteration and averaged out.
I have modified vector addition code for it.



kernel for the multiply and integration :



__global__ void cvctmac (int M,float *yre,float *yim,float *x1re,float *x1im,float *x2re,float *x2im,double *acc_yre,double *acc_yim) 
int index = blockIdx.x * blockDim.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;

// Multiplication
for (int i = index; i < M; i += stride)
acc_yre[i] += x1re[i] * x2re[i] - x1im[i] * x2im[i];
acc_yim[i] += x1re[i] * x2im[i] + x1im[i] * x2re[i];


__global__ void cavg(int M,double iter,double *xre,double *xim)
// Averaging
int index = blockIdx.x * blockDim.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
// Grid-stride approch
for (int i = index; i < M; i += stride)
xre[i] /= iter;
xim[i] /= iter;




As the code is performing complex arithmetic, I am doing 4 operations per thread(2 for multiplication and 2 for integration). what will be the way to optimize the kernel cvctmac?



Will shared memory help here?



I have used cuComplex.h also but getting the same performance.



host code :



for (j = 0; j < iter; j++) 

// Generate data in host
for (i = j * M, c = 0; i < M * (j + 1); i++, c++)
x1re[c] = (float)i;
x1im[c] = 0.0;
x2re[c] = 1.0;
x2im[c] = 0.0;


// Copy host to device
cudaMemcpy(dx1re, x1re, M * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(dx1im, x1im, M * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(dx2re, x2re, M * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(dx2im, x2im, M * sizeof(float), cudaMemcpyHostToDevice);

//mac
cvctmac<<<numBlock, numThread>>>(
M,
dyre, dyim,
dx1re, dx1im,
dx2re, dx2im,
dacc_yre, dacc_yim
);

// Avg
cavg<<<numBlock, numThread>>>(
M, (double) iter,
dacc_yre, dacc_yim
);


Please suggest the way to optimize the code.







share|improve this question





















  • It would be useful if you also stated on what CUDA API and Compute Capability you want your source code to be optimized.
    – xorz57
    Apr 16 at 1:34










  • The code is for runtime API and for compute capability of 6.1.
    – sbhatporia
    Apr 16 at 12:32












up vote
3
down vote

favorite









up vote
3
down vote

favorite











I want to perform multiplication on two vectors and integrate it in a vector called acc_y. The acc_y variable will update over every iteration and averaged out.
I have modified vector addition code for it.



kernel for the multiply and integration :



__global__ void cvctmac (int M,float *yre,float *yim,float *x1re,float *x1im,float *x2re,float *x2im,double *acc_yre,double *acc_yim) 
int index = blockIdx.x * blockDim.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;

// Multiplication
for (int i = index; i < M; i += stride)
acc_yre[i] += x1re[i] * x2re[i] - x1im[i] * x2im[i];
acc_yim[i] += x1re[i] * x2im[i] + x1im[i] * x2re[i];


__global__ void cavg(int M,double iter,double *xre,double *xim)
// Averaging
int index = blockIdx.x * blockDim.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
// Grid-stride approch
for (int i = index; i < M; i += stride)
xre[i] /= iter;
xim[i] /= iter;




As the code is performing complex arithmetic, I am doing 4 operations per thread(2 for multiplication and 2 for integration). what will be the way to optimize the kernel cvctmac?



Will shared memory help here?



I have used cuComplex.h also but getting the same performance.



host code :



for (j = 0; j < iter; j++) 

// Generate data in host
for (i = j * M, c = 0; i < M * (j + 1); i++, c++)
x1re[c] = (float)i;
x1im[c] = 0.0;
x2re[c] = 1.0;
x2im[c] = 0.0;


// Copy host to device
cudaMemcpy(dx1re, x1re, M * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(dx1im, x1im, M * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(dx2re, x2re, M * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(dx2im, x2im, M * sizeof(float), cudaMemcpyHostToDevice);

//mac
cvctmac<<<numBlock, numThread>>>(
M,
dyre, dyim,
dx1re, dx1im,
dx2re, dx2im,
dacc_yre, dacc_yim
);

// Avg
cavg<<<numBlock, numThread>>>(
M, (double) iter,
dacc_yre, dacc_yim
);


Please suggest the way to optimize the code.







share|improve this question













I want to perform multiplication on two vectors and integrate it in a vector called acc_y. The acc_y variable will update over every iteration and averaged out.
I have modified vector addition code for it.



kernel for the multiply and integration :



__global__ void cvctmac (int M,float *yre,float *yim,float *x1re,float *x1im,float *x2re,float *x2im,double *acc_yre,double *acc_yim) 
int index = blockIdx.x * blockDim.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;

// Multiplication
for (int i = index; i < M; i += stride)
acc_yre[i] += x1re[i] * x2re[i] - x1im[i] * x2im[i];
acc_yim[i] += x1re[i] * x2im[i] + x1im[i] * x2re[i];


__global__ void cavg(int M,double iter,double *xre,double *xim)
// Averaging
int index = blockIdx.x * blockDim.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
// Grid-stride approch
for (int i = index; i < M; i += stride)
xre[i] /= iter;
xim[i] /= iter;




As the code is performing complex arithmetic, I am doing 4 operations per thread(2 for multiplication and 2 for integration). what will be the way to optimize the kernel cvctmac?



Will shared memory help here?



I have used cuComplex.h also but getting the same performance.



host code :



for (j = 0; j < iter; j++) 

// Generate data in host
for (i = j * M, c = 0; i < M * (j + 1); i++, c++)
x1re[c] = (float)i;
x1im[c] = 0.0;
x2re[c] = 1.0;
x2im[c] = 0.0;


// Copy host to device
cudaMemcpy(dx1re, x1re, M * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(dx1im, x1im, M * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(dx2re, x2re, M * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(dx2im, x2im, M * sizeof(float), cudaMemcpyHostToDevice);

//mac
cvctmac<<<numBlock, numThread>>>(
M,
dyre, dyim,
dx1re, dx1im,
dx2re, dx2im,
dacc_yre, dacc_yim
);

// Avg
cavg<<<numBlock, numThread>>>(
M, (double) iter,
dacc_yre, dacc_yim
);


Please suggest the way to optimize the code.









share|improve this question












share|improve this question




share|improve this question








edited Mar 21 at 13:40
























asked Mar 20 at 11:33









sbhatporia

164




164











  • It would be useful if you also stated on what CUDA API and Compute Capability you want your source code to be optimized.
    – xorz57
    Apr 16 at 1:34










  • The code is for runtime API and for compute capability of 6.1.
    – sbhatporia
    Apr 16 at 12:32
















  • It would be useful if you also stated on what CUDA API and Compute Capability you want your source code to be optimized.
    – xorz57
    Apr 16 at 1:34










  • The code is for runtime API and for compute capability of 6.1.
    – sbhatporia
    Apr 16 at 12:32















It would be useful if you also stated on what CUDA API and Compute Capability you want your source code to be optimized.
– xorz57
Apr 16 at 1:34




It would be useful if you also stated on what CUDA API and Compute Capability you want your source code to be optimized.
– xorz57
Apr 16 at 1:34












The code is for runtime API and for compute capability of 6.1.
– sbhatporia
Apr 16 at 12:32




The code is for runtime API and for compute capability of 6.1.
– sbhatporia
Apr 16 at 12:32















active

oldest

votes











Your Answer




StackExchange.ifUsing("editor", function ()
return StackExchange.using("mathjaxEditing", function ()
StackExchange.MarkdownEditor.creationCallbacks.add(function (editor, postfix)
StackExchange.mathjaxEditing.prepareWmdForMathJax(editor, postfix, [["\$", "\$"]]);
);
);
, "mathjax-editing");

StackExchange.ifUsing("editor", function ()
StackExchange.using("externalEditor", function ()
StackExchange.using("snippets", function ()
StackExchange.snippets.init();
);
);
, "code-snippets");

StackExchange.ready(function()
var channelOptions =
tags: "".split(" "),
id: "196"
;
initTagRenderer("".split(" "), "".split(" "), channelOptions);

StackExchange.using("externalEditor", function()
// Have to fire editor after snippets, if snippets enabled
if (StackExchange.settings.snippets.snippetsEnabled)
StackExchange.using("snippets", function()
createEditor();
);

else
createEditor();

);

function createEditor()
StackExchange.prepareEditor(
heartbeatType: 'answer',
convertImagesToLinks: false,
noModals: false,
showLowRepImageUploadWarning: true,
reputationToPostImages: null,
bindNavPrevention: true,
postfix: "",
onDemand: true,
discardSelector: ".discard-answer"
,immediatelyShowMarkdownHelp:true
);



);








 

draft saved


draft discarded


















StackExchange.ready(
function ()
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fcodereview.stackexchange.com%2fquestions%2f190022%2fcomplex-multiplication-and-integration-with-cuda%23new-answer', 'question_page');

);

Post as a guest



































active

oldest

votes













active

oldest

votes









active

oldest

votes






active

oldest

votes










 

draft saved


draft discarded


























 


draft saved


draft discarded














StackExchange.ready(
function ()
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fcodereview.stackexchange.com%2fquestions%2f190022%2fcomplex-multiplication-and-integration-with-cuda%23new-answer', 'question_page');

);

Post as a guest













































































Popular posts from this blog

Greedy Best First Search implementation in Rust

Function to Return a JSON Like Objects Using VBA Collections and Arrays

C++11 CLH Lock Implementation