Complex multiplication and integration with CUDA

The name of the pictureThe name of the pictureThe name of the pictureClash Royale CLAN TAG#URR8PPP





.everyoneloves__top-leaderboard:empty,.everyoneloves__mid-leaderboard:empty margin-bottom:0;







up vote
3
down vote

favorite












I want to perform multiplication on two vectors and integrate it in a vector called acc_y. The acc_y variable will update over every iteration and averaged out.
I have modified vector addition code for it.



kernel for the multiply and integration :



// Fused complex multiply-accumulate over two split-complex vectors:
//   acc_y[i] += x1[i] * x2[i]   (complex product, accumulated in double)
// Inputs are single-precision split re/im arrays of length M; the accumulator
// is double precision so repeated accumulation across host iterations does not
// lose precision. yre/yim are unused here but kept to preserve the caller-facing
// signature. Launch with any 1-D grid; the grid-stride loop covers all M
// elements regardless of launch configuration.
__global__ void cvctmac(int M, float *yre, float *yim,
                        const float *__restrict__ x1re, const float *__restrict__ x1im,
                        const float *__restrict__ x2re, const float *__restrict__ x2im,
                        double *acc_yre, double *acc_yim)
{
    int index  = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = blockDim.x * gridDim.x;

    // Grid-stride loop; each element is loaded once and reused for both
    // the real and imaginary parts of (a+bi)(c+di) = (ac - bd) + (ad + bc)i.
    for (int i = index; i < M; i += stride) {
        float a = x1re[i], b = x1im[i];
        float c = x2re[i], d = x2im[i];
        acc_yre[i] += (double)(a * c - b * d);
        acc_yim[i] += (double)(a * d + b * c);
    }
}

// In-place averaging: divides each accumulated element by the iteration
// count, turning the running sums in xre/xim into means.
// Preconditions: iter != 0; xre/xim point to device arrays of length M.
// Launch with any 1-D grid; the grid-stride loop covers all M elements.
__global__ void cavg(int M, double iter, double *xre, double *xim)
{
    int index  = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = blockDim.x * gridDim.x;

    // Hoist the reciprocal so the loop body uses two multiplies instead of
    // two (much slower) double divisions per element.
    double inv = 1.0 / iter;

    // Grid-stride approach: correct for any launch configuration.
    for (int i = index; i < M; i += stride) {
        xre[i] *= inv;
        xim[i] *= inv;
    }
}



As the code is performing complex arithmetic, I am doing 4 operations per thread (2 for the multiplication and 2 for the integration). What would be the best way to optimize the kernel cvctmac?



Will shared memory help here?



I have used cuComplex.h also but getting the same performance.



host code :



// Accumulate iter batches of complex products into dacc_y*, then average once.
// NOTE(review): braces reconstructed — cavg is placed AFTER the loop, since
// averaging inside the loop would divide the accumulator by iter on every
// pass; confirm against the original source.
for (j = 0; j < iter; j++) {

    // Generate the j-th batch of test data on the host.
    for (i = j * M, c = 0; i < M * (j + 1); i++, c++) {
        x1re[c] = (float)i;
        x1im[c] = 0.0f;
        x2re[c] = 1.0f;
        x2im[c] = 0.0f;
    }

    // Copy host to device. These synchronous copies dominate the runtime;
    // pinned host memory + cudaMemcpyAsync on streams would let transfer
    // overlap with the kernel of the previous iteration.
    cudaMemcpy(dx1re, x1re, M * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(dx1im, x1im, M * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(dx2re, x2re, M * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(dx2im, x2im, M * sizeof(float), cudaMemcpyHostToDevice);

    // Multiply-accumulate this batch into the double-precision accumulator.
    cvctmac<<<numBlock, numThread>>>(
        M,
        dyre, dyim,
        dx1re, dx1im,
        dx2re, dx2im,
        dacc_yre, dacc_yim
    );
}

// Average the accumulator once, after all iterations have been summed.
cavg<<<numBlock, numThread>>>(
    M, (double) iter,
    dacc_yre, dacc_yim
);


Please suggest the way to optimize the code.







share|improve this question





















  • It would be useful if you also stated on what CUDA API and Compute Capability you want your source code to be optimized.
    – xorz57
    Apr 16 at 1:34










  • The code is for runtime API and for compute capability of 6.1.
    – sbhatporia
    Apr 16 at 12:32
















up vote
3
down vote

favorite












I want to perform multiplication on two vectors and integrate it in a vector called acc_y. The acc_y variable will update over every iteration and averaged out.
I have modified vector addition code for it.



kernel for the multiply and integration :



__global__ void cvctmac (int M,float *yre,float *yim,float *x1re,float *x1im,float *x2re,float *x2im,double *acc_yre,double *acc_yim) 
int index = blockIdx.x * blockDim.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;

// Multiplication
for (int i = index; i < M; i += stride)
acc_yre[i] += x1re[i] * x2re[i] - x1im[i] * x2im[i];
acc_yim[i] += x1re[i] * x2im[i] + x1im[i] * x2re[i];


__global__ void cavg(int M,double iter,double *xre,double *xim)
// Averaging
int index = blockIdx.x * blockDim.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
// Grid-stride approch
for (int i = index; i < M; i += stride)
xre[i] /= iter;
xim[i] /= iter;




As the code is performing complex arithmetic, I am doing 4 operations per thread(2 for multiplication and 2 for integration). what will be the way to optimize the kernel cvctmac?



Will shared memory help here?



I have used cuComplex.h also but getting the same performance.



host code :



for (j = 0; j < iter; j++) 

// Generate data in host
for (i = j * M, c = 0; i < M * (j + 1); i++, c++)
x1re[c] = (float)i;
x1im[c] = 0.0;
x2re[c] = 1.0;
x2im[c] = 0.0;


// Copy host to device
cudaMemcpy(dx1re, x1re, M * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(dx1im, x1im, M * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(dx2re, x2re, M * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(dx2im, x2im, M * sizeof(float), cudaMemcpyHostToDevice);

//mac
cvctmac<<<numBlock, numThread>>>(
M,
dyre, dyim,
dx1re, dx1im,
dx2re, dx2im,
dacc_yre, dacc_yim
);

// Avg
cavg<<<numBlock, numThread>>>(
M, (double) iter,
dacc_yre, dacc_yim
);


Please suggest the way to optimize the code.







share|improve this question





















  • It would be useful if you also stated on what CUDA API and Compute Capability you want your source code to be optimized.
    – xorz57
    Apr 16 at 1:34










  • The code is for runtime API and for compute capability of 6.1.
    – sbhatporia
    Apr 16 at 12:32












up vote
3
down vote

favorite









up vote
3
down vote

favorite











I want to perform multiplication on two vectors and integrate it in a vector called acc_y. The acc_y variable will update over every iteration and averaged out.
I have modified vector addition code for it.



kernel for the multiply and integration :



__global__ void cvctmac (int M,float *yre,float *yim,float *x1re,float *x1im,float *x2re,float *x2im,double *acc_yre,double *acc_yim) 
int index = blockIdx.x * blockDim.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;

// Multiplication
for (int i = index; i < M; i += stride)
acc_yre[i] += x1re[i] * x2re[i] - x1im[i] * x2im[i];
acc_yim[i] += x1re[i] * x2im[i] + x1im[i] * x2re[i];


__global__ void cavg(int M,double iter,double *xre,double *xim)
// Averaging
int index = blockIdx.x * blockDim.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
// Grid-stride approch
for (int i = index; i < M; i += stride)
xre[i] /= iter;
xim[i] /= iter;




As the code is performing complex arithmetic, I am doing 4 operations per thread(2 for multiplication and 2 for integration). what will be the way to optimize the kernel cvctmac?



Will shared memory help here?



I have used cuComplex.h also but getting the same performance.



host code :



for (j = 0; j < iter; j++) 

// Generate data in host
for (i = j * M, c = 0; i < M * (j + 1); i++, c++)
x1re[c] = (float)i;
x1im[c] = 0.0;
x2re[c] = 1.0;
x2im[c] = 0.0;


// Copy host to device
cudaMemcpy(dx1re, x1re, M * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(dx1im, x1im, M * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(dx2re, x2re, M * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(dx2im, x2im, M * sizeof(float), cudaMemcpyHostToDevice);

//mac
cvctmac<<<numBlock, numThread>>>(
M,
dyre, dyim,
dx1re, dx1im,
dx2re, dx2im,
dacc_yre, dacc_yim
);

// Avg
cavg<<<numBlock, numThread>>>(
M, (double) iter,
dacc_yre, dacc_yim
);


Please suggest the way to optimize the code.







share|improve this question













I want to perform multiplication on two vectors and integrate it in a vector called acc_y. The acc_y variable will update over every iteration and averaged out.
I have modified vector addition code for it.



kernel for the multiply and integration :



__global__ void cvctmac (int M,float *yre,float *yim,float *x1re,float *x1im,float *x2re,float *x2im,double *acc_yre,double *acc_yim) 
int index = blockIdx.x * blockDim.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;

// Multiplication
for (int i = index; i < M; i += stride)
acc_yre[i] += x1re[i] * x2re[i] - x1im[i] * x2im[i];
acc_yim[i] += x1re[i] * x2im[i] + x1im[i] * x2re[i];


__global__ void cavg(int M,double iter,double *xre,double *xim)
// Averaging
int index = blockIdx.x * blockDim.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
// Grid-stride approch
for (int i = index; i < M; i += stride)
xre[i] /= iter;
xim[i] /= iter;




As the code is performing complex arithmetic, I am doing 4 operations per thread(2 for multiplication and 2 for integration). what will be the way to optimize the kernel cvctmac?



Will shared memory help here?



I have used cuComplex.h also but getting the same performance.



host code :



for (j = 0; j < iter; j++) 

// Generate data in host
for (i = j * M, c = 0; i < M * (j + 1); i++, c++)
x1re[c] = (float)i;
x1im[c] = 0.0;
x2re[c] = 1.0;
x2im[c] = 0.0;


// Copy host to device
cudaMemcpy(dx1re, x1re, M * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(dx1im, x1im, M * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(dx2re, x2re, M * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(dx2im, x2im, M * sizeof(float), cudaMemcpyHostToDevice);

//mac
cvctmac<<<numBlock, numThread>>>(
M,
dyre, dyim,
dx1re, dx1im,
dx2re, dx2im,
dacc_yre, dacc_yim
);

// Avg
cavg<<<numBlock, numThread>>>(
M, (double) iter,
dacc_yre, dacc_yim
);


Please suggest the way to optimize the code.









share|improve this question












share|improve this question




share|improve this question








edited Mar 21 at 13:40
























asked Mar 20 at 11:33









sbhatporia

164




164











  • It would be useful if you also stated on what CUDA API and Compute Capability you want your source code to be optimized.
    – xorz57
    Apr 16 at 1:34










  • The code is for runtime API and for compute capability of 6.1.
    – sbhatporia
    Apr 16 at 12:32
















  • It would be useful if you also stated on what CUDA API and Compute Capability you want your source code to be optimized.
    – xorz57
    Apr 16 at 1:34










  • The code is for runtime API and for compute capability of 6.1.
    – sbhatporia
    Apr 16 at 12:32















It would be useful if you also stated on what CUDA API and Compute Capability you want your source code to be optimized.
– xorz57
Apr 16 at 1:34




It would be useful if you also stated on what CUDA API and Compute Capability you want your source code to be optimized.
– xorz57
Apr 16 at 1:34












The code is for runtime API and for compute capability of 6.1.
– sbhatporia
Apr 16 at 12:32




The code is for runtime API and for compute capability of 6.1.
– sbhatporia
Apr 16 at 12:32















active

oldest

votes











Your Answer




StackExchange.ifUsing("editor", function ()
return StackExchange.using("mathjaxEditing", function ()
StackExchange.MarkdownEditor.creationCallbacks.add(function (editor, postfix)
StackExchange.mathjaxEditing.prepareWmdForMathJax(editor, postfix, [["\$", "\$"]]);
);
);
, "mathjax-editing");

StackExchange.ifUsing("editor", function ()
StackExchange.using("externalEditor", function ()
StackExchange.using("snippets", function ()
StackExchange.snippets.init();
);
);
, "code-snippets");

StackExchange.ready(function()
var channelOptions =
tags: "".split(" "),
id: "196"
;
initTagRenderer("".split(" "), "".split(" "), channelOptions);

StackExchange.using("externalEditor", function()
// Have to fire editor after snippets, if snippets enabled
if (StackExchange.settings.snippets.snippetsEnabled)
StackExchange.using("snippets", function()
createEditor();
);

else
createEditor();

);

function createEditor()
StackExchange.prepareEditor(
heartbeatType: 'answer',
convertImagesToLinks: false,
noModals: false,
showLowRepImageUploadWarning: true,
reputationToPostImages: null,
bindNavPrevention: true,
postfix: "",
onDemand: true,
discardSelector: ".discard-answer"
,immediatelyShowMarkdownHelp:true
);



);








 

draft saved


draft discarded


















StackExchange.ready(
function ()
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fcodereview.stackexchange.com%2fquestions%2f190022%2fcomplex-multiplication-and-integration-with-cuda%23new-answer', 'question_page');

);

Post as a guest



































active

oldest

votes













active

oldest

votes









active

oldest

votes






active

oldest

votes










 

draft saved


draft discarded


























 


draft saved


draft discarded














StackExchange.ready(
function ()
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fcodereview.stackexchange.com%2fquestions%2f190022%2fcomplex-multiplication-and-integration-with-cuda%23new-answer', 'question_page');

);

Post as a guest













































































Popular posts from this blog

Greedy Best First Search implementation in Rust

Function to Return a JSON Like Objects Using VBA Collections and Arrays

C++11 CLH Lock Implementation