CUDA Template-matching program

The name of the pictureThe name of the pictureThe name of the pictureClash Royale CLAN TAG#URR8PPP





.everyoneloves__top-leaderboard:empty,.everyoneloves__mid-leaderboard:empty margin-bottom:0;







up vote
0
down vote

favorite












I have started learning CUDA programming. As my first code, I have tried to implement a simple template-matching program. But as I am very new to CUDA and parallel programming, I am not sure how I can improve and optimize my code to gain the most from CUDA and the parallel execution of this code.
Here is the code I have written so far:



#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <chrono>
#include <cstdint>   // uint64_t, uint8_t (previously relied on transitive includes)
#include <cstdlib>   // std::abort (used by error checking in main)
#include <iostream>
#include <math.h>

// Fills `img` (height x width, row-major) with each pixel's linear index:
// img[h*width + w] = h*width + w.
// NOTE(review): the `val` parameter is accepted but never used — kept for
// interface compatibility; confirm whether a constant fill was intended.
void populate(uint64_t *img, int height, int width, uint8_t val)
{
    (void)val; // intentionally unused, see note above
    uint64_t *ptr = img;
    for (int h = 0; h < height; h++) {
        for (int w = 0; w < width; w++) {
            *ptr = (h * width) + w;
            ptr++;
        }
    }
}




// One thread per image row (launch as addRow<<<1, height>>>): builds, for
// this thread's row, running prefix sums of pixel values (L1), squared
// values (L2), x-weighted values (Lx) and y-weighted values (Ly).
// `height` is unused here (the launch config supplies the row count) but is
// kept for interface compatibility.
//
// Fix: the original read lx[-1] and ly[-1] when i == 0 (out-of-bounds,
// undefined behaviour); the first element is now initialised explicitly,
// matching the l1/l2 handling.
__global__
void addRow(uint64_t *Img, int height, int width, uint64_t *L1, uint64_t *L2, uint64_t *Lx, uint64_t *Ly)
{
    int row_start = threadIdx.x * width;

    uint64_t *img = &Img[row_start];
    uint64_t *l1 = &L1[row_start];
    uint64_t *l2 = &L2[row_start];
    uint64_t *lx = &Lx[row_start];
    uint64_t *ly = &Ly[row_start];
    for (int i = 0; i < width; i++) {
        if (i == 0) {
            l1[i] = img[i];
            l2[i] = img[i] * img[i];
            lx[i] = 0;                        // img[0] * 0
            ly[i] = img[i] * threadIdx.x;
        } else {
            l1[i] = l1[i - 1] + img[i];
            l2[i] = l2[i - 1] + (img[i] * img[i]);
            lx[i] = lx[i - 1] + img[i] * i;
            ly[i] = ly[i - 1] + img[i] * threadIdx.x;
        }
    }
}


// One thread per image column (launch as addColumn<<<1, width>>>): completes
// the 2-D integral images by walking top to bottom and adding each row's
// partial sums into the row below. Must run after addRow has produced the
// per-row prefix sums. `Img` is unused but kept for interface compatibility.
__global__
void addColumn(uint64_t *Img, int height, int width, uint64_t *L1, uint64_t *L2, uint64_t *Lx, uint64_t *Ly)
{
    const int col = threadIdx.x;
    uint64_t *l1 = L1 + col;
    uint64_t *l2 = L2 + col;
    uint64_t *lx = Lx + col;
    uint64_t *ly = Ly + col;

    // Start at the second row; each element absorbs the value directly above.
    for (int row = 1; row < height; row++) {
        const int cur = row * width;
        const int above = cur - width;
        l1[cur] += l1[above];
        l2[cur] += l2[above];
        lx[cur] += lx[above];
        ly[cur] += ly[above];
    }
}


// Derives the template's four scalar features from its integral images
// (each blk_size x blk_size, fully summed by addRow/addColumn):
//   V1 — mean pixel value, V2 — variance, V3/V4 — centred, normalised
//   x- and y-weighted sums. Corner indices follow the four-corner
//   integral-image box-sum rule over the whole template block.
void calculateTemplateFeatures(uint64_t *L1, uint64_t *L2, uint64_t *Lx, uint64_t *Ly, int blk_size, float *V1, float *V2, float *V3, float *V4)
{
    // Corner offsets of the template block inside its own integral image.
    const int topLeft  = 0;
    const int topRight = blk_size - 1;
    const int botLeft  = blk_size * (blk_size - 1);
    const int botRight = (blk_size * blk_size) - 1;

    const float halfSpan = (float)(blk_size - 1) / 2; // centre coordinate of the block
    const int   area     = blk_size * blk_size;       // pixel count of the block

    // Raw box sums (unsigned 64-bit arithmetic, then converted to float —
    // same evaluation order/semantics as the surrounding code).
    *V1 = (L1[botRight] + L1[topLeft]) - (L1[topRight] + L1[botLeft]);
    *V2 = (L2[botRight] + L2[topLeft]) - (L2[topRight] + L2[botLeft]);
    *V3 = (Lx[botRight] + Lx[topLeft]) - (Lx[topRight] + Lx[botLeft]);
    *V4 = (Ly[botRight] + Ly[topLeft]) - (Ly[topRight] + Ly[botLeft]);

    // Centre the coordinate-weighted sums, then normalise.
    *V3 = (*V3 - halfSpan * *V1) / (area * blk_size);
    *V4 = (*V4 - halfSpan * *V1) / (area * blk_size);

    // Mean and variance of the pixel values.
    *V1 = *V1 / area;
    *V2 = (*V2 / area) - (*V1 * *V1);
}




// Per-position feature computation for the input image: intended to derive
// V1..V4 from the integral images and write the distance to the template
// features (TV1..TV4) into `differences` (one entry per candidate position).
// NOTE(review): only the signature survived in this capture of the source —
// the kernel body is missing; recover the implementation before building,
// as the <<<1, dim3(height, width)>>> launch in main depends on it.
__global__
void calculateFeatures(uint64_t *L1, uint64_t *L2, uint64_t *Lx, uint64_t *Ly, int height, int width, int blk_size, float *V1, float *V2, float *V3, float *V4, float TV1, float TV2, float TV3, float TV4, float *differences)

// Returns the index of the smallest value in `diffs` (length `size`, size >= 1).
// Fix: the running minimum was declared `int`, silently truncating the float
// distances — e.g. {0.5f, 0.3f} both truncated to 0 and index 0 was returned
// instead of 1. The minimum is now tracked as a float.
int findMin(float *diffs, int size)
{
    float min = diffs[0];
    int min_idx = 0;
    for (int i = 1; i < size; i++) {
        if (min > diffs[i]) {
            min_idx = i;
            min = diffs[i];
        }
    }
    return min_idx;
}


// Drives the template-matching pipeline: allocate unified memory, build the
// integral images on the GPU (row pass then column pass), compute the
// template's features on the CPU, then launch the per-position feature
// comparison on the GPU.
// Fix: every CUDA API call previously went unchecked; failures (allocation,
// launch-config, in-kernel faults) were silently ignored. All calls are now
// verified, and kernel launches are followed by cudaGetLastError().
int main(void)
{
    // Abort with a readable message on any CUDA API failure.
    auto check = [](cudaError_t err) {
        if (err != cudaSuccess) {
            std::cerr << "CUDA error: " << cudaGetErrorString(err) << '\n';
            std::abort();
        }
    };

    const int height = 16;
    const int width = 16;
    const int tmplate_blk_size = 4;
    const int features_width = width - tmplate_blk_size + 1;
    const int features_height = height - tmplate_blk_size + 1;

    uint64_t *Img, *L1, *L2, *Lx, *Ly;
    uint64_t *Template, *TL1, *TL2, *TLx, *TLy;
    float *V1, *V2, *V3, *V4; // Features of input image
    float TV1, TV2, TV3, TV4; // Features of template image
    float *Differences;       // Distance between image features and template features

    // Allocate Unified Memory for the input image - accessible from CPU or GPU.
    const int image_size = height * width;
    const int feature_size = features_width * features_height;
    check(cudaMallocManaged(&Img, image_size * sizeof(uint64_t)));
    check(cudaMallocManaged(&L1, image_size * sizeof(uint64_t)));
    check(cudaMallocManaged(&L2, image_size * sizeof(uint64_t)));
    check(cudaMallocManaged(&Lx, image_size * sizeof(uint64_t)));
    check(cudaMallocManaged(&Ly, image_size * sizeof(uint64_t)));
    check(cudaMallocManaged(&V1, feature_size * sizeof(float)));
    check(cudaMallocManaged(&V2, feature_size * sizeof(float)));
    check(cudaMallocManaged(&V3, feature_size * sizeof(float)));
    check(cudaMallocManaged(&V4, feature_size * sizeof(float)));
    check(cudaMallocManaged(&Differences, feature_size * sizeof(float)));

    // Allocate Unified Memory for the template image - accessible from CPU or GPU.
    const int template_size = tmplate_blk_size * tmplate_blk_size;
    check(cudaMallocManaged(&Template, template_size * sizeof(uint64_t)));
    check(cudaMallocManaged(&TL1, template_size * sizeof(uint64_t)));
    check(cudaMallocManaged(&TL2, template_size * sizeof(uint64_t)));
    check(cudaMallocManaged(&TLx, template_size * sizeof(uint64_t)));
    check(cudaMallocManaged(&TLy, template_size * sizeof(uint64_t)));

    populate(Img, height, width, 1);
    populate(Template, tmplate_blk_size, tmplate_blk_size, 1);

    // Row prefix sums (one thread per row) for image and template.
    addRow<<<1, height>>>(Img, height, width, L1, L2, Lx, Ly);
    addRow<<<1, tmplate_blk_size>>>(Template, tmplate_blk_size, tmplate_blk_size, TL1, TL2, TLx, TLy);
    check(cudaGetLastError());
    check(cudaDeviceSynchronize());

    // Column pass (one thread per column) completes the integral images.
    addColumn<<<1, width>>>(Img, height, width, L1, L2, Lx, Ly);
    addColumn<<<1, tmplate_blk_size>>>(Template, tmplate_blk_size, tmplate_blk_size, TL1, TL2, TLx, TLy);
    check(cudaGetLastError());
    check(cudaDeviceSynchronize());

    // For the template we only need one value per feature, so the CPU is
    // simpler and faster than a kernel launch.
    calculateTemplateFeatures(TL1, TL2, TLx, TLy, tmplate_blk_size, &TV1, &TV2, &TV3, &TV4);

    dim3 threads(height, width);
    calculateFeatures<<<1, threads>>>(L1, L2, Lx, Ly, height, width, tmplate_blk_size, V1, V2, V3, V4, TV1, TV2, TV3, TV4, Differences);
    check(cudaGetLastError());
    check(cudaDeviceSynchronize());

    // Free memory.
    cudaFree(Img);
    cudaFree(L1);
    cudaFree(L2);
    cudaFree(Lx);
    cudaFree(Ly);
    cudaFree(V1);
    cudaFree(V2);
    cudaFree(V3);
    cudaFree(V4);
    cudaFree(Template);
    cudaFree(TL1);
    cudaFree(TL2);
    cudaFree(TLx);
    cudaFree(TLy);
    cudaFree(Differences);

    return 0;
}







share|improve this question

















  • 1




    Please tell us more about what you mean by template matching, ideally with examples.
    – 200_success
    Apr 12 at 21:01










  • Don't forget to mention the compute capability of your GPU and the CUDA API version as well.
    – xorz57
    Apr 13 at 0:58
















up vote
0
down vote

favorite












I have started learning CUDA programming. As my first code, I have tried to implement a simple template matching program. but as I am very new to CUDA and parallel programming, I am not sure how I can improve and optimize my code to gain most from CUDA and parallel execution of this code.
Here is the code I have written so far:



#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <iostream>
#include <math.h>
#include <chrono>

void populate(uint64_t *img, int height, int width, uint8_t val)
uint64_t * ptr = img;
for (int h = 0; h < height; h++)
for (int w = 0; w < width; w++)
*ptr = (h*width) + w;
ptr++;




__global__
void addRow(uint64_t *Img, int height, int width, uint64_t *L1, uint64_t *L2, uint64_t *Lx, uint64_t *Ly)

int row_start = threadIdx.x * width;

uint64_t *img = &Img[row_start];
uint64_t *l1 = &L1[row_start];
uint64_t *l2 = &L2[row_start];
uint64_t *lx = &Lx[row_start];
uint64_t *ly = &Ly[row_start];
for (int i = 0; i < width; i++)
if (i == 0)
l1[i] = img[i];
l2[i] = img[i] * img[i];

else
l1[i] = l1[i - 1] + img[i];
l2[i] = l2[i - 1] + (img[i] * img[i]);

lx[i] = lx[i-1] + img[i] * i;
ly[i] = ly[i - 1] + img[i] * threadIdx.x;

return;


__global__
void addColumn(uint64_t *Img, int height, int width, uint64_t *L1, uint64_t *L2, uint64_t *Lx, uint64_t *Ly)

int column_start = threadIdx.x;
uint64_t *img = &Img[column_start];
uint64_t *l1 = &L1[column_start];
uint64_t *l2 = &L2[column_start];
uint64_t *lx = &Lx[column_start];
uint64_t *ly = &Ly[column_start];
int pre_rd = 0;
int rd = 0;
for (int i = 0; i < height; i++)
if (i != 0)
l1[rd] += l1[pre_rd];
l2[rd] += l2[pre_rd];
lx[rd] += lx[pre_rd];
ly[rd] += ly[pre_rd];

pre_rd = rd;
rd += width;

return;


void calculateTemplateFeatures(uint64_t *L1, uint64_t *L2, uint64_t *Lx, uint64_t *Ly, int blk_size, float *V1, float *V2, float *V3, float *V4)
int a = 0;
int b = blk_size - 1;
int c = blk_size * (blk_size - 1);
int d = (blk_size * blk_size) - 1;
float blk_hlf = (float)(blk_size - 1) / 2;
int blk_pow = blk_size * blk_size;
*V1 = (L1[d] + L1[a]) - (L1[b] + L1[c]);
*V2 = (L2[d] + L2[a]) - (L2[b] + L2[c]);
*V3 = (Lx[d] + Lx[a]) - (Lx[b] + Lx[c]);
*V4 = (Ly[d] + Ly[a]) - (Ly[b] + Ly[c]);

*V3 = (*V3 - blk_hlf * *V1) / (blk_pow * blk_size);
*V4 = (*V4 - blk_hlf * *V1) / (blk_pow * blk_size);

*V1 = *V1 / blk_pow;
*V2 = (*V2 / blk_pow) - (*V1 * *V1);




__global__
void calculateFeatures(uint64_t *L1, uint64_t *L2, uint64_t *Lx, uint64_t *Ly, int height, int width, int blk_size, float *V1, float *V2, float *V3, float *V4, float TV1, float TV2, float TV3, float TV4, float *differences)

int findMin(float *diffs,int size)
int min = diffs[0];
int min_idx = 0;
for (int i = 1; i < size; i++)
if (min > diffs[i])
min_idx = i;
min = diffs[i];


return min_idx;


int main(void)


int height = 16;
int width = 16;
int tmplate_blk_size = 4;
int features_width = width - tmplate_blk_size + 1;
int features_height = height - tmplate_blk_size + 1;

uint64_t *Img, *L1, *L2, *Lx, *Ly;
uint64_t *Template, *TL1, *TL2, *TLx, *TLy;
float *V1, *V2, *V3, *V4; //Features of input image
float TV1, TV2, TV3, TV4; //Features of template image
float *Differences; //Stores distance between image features and template features

// Allocate Unified Memory for input image – accessible from CPU or GPU
int image_size = height * width;
int feature_size = features_width * features_height;
cudaMallocManaged(&Img, image_size * sizeof(uint64_t));
cudaMallocManaged(&L1, image_size * sizeof(uint64_t));
cudaMallocManaged(&L2, image_size * sizeof(uint64_t));
cudaMallocManaged(&Lx, image_size * sizeof(uint64_t));
cudaMallocManaged(&Ly, image_size * sizeof(uint64_t));
cudaMallocManaged(&V1, feature_size * sizeof(float));
cudaMallocManaged(&V2, feature_size * sizeof(float));
cudaMallocManaged(&V3, feature_size * sizeof(float));
cudaMallocManaged(&V4, feature_size * sizeof(float));
cudaMallocManaged(&Differences, feature_size * sizeof(float));

//Allocate Unified Memory for template image – accessible from CPU or GPU
int template_size = tmplate_blk_size * tmplate_blk_size;
cudaMallocManaged(&Template, template_size * sizeof(uint64_t));
cudaMallocManaged(&TL1, template_size * sizeof(uint64_t));
cudaMallocManaged(&TL2, template_size * sizeof(uint64_t));
cudaMallocManaged(&TLx, template_size * sizeof(uint64_t));
cudaMallocManaged(&TLy, template_size * sizeof(uint64_t));

populate(Img, height, width, 1);
populate(Template, tmplate_blk_size, tmplate_blk_size, 1);

addRow<<<1, height>>>(Img, height, width, L1, L2, Lx, Ly);
addRow<<<1, tmplate_blk_size>>>(Template, tmplate_blk_size, tmplate_blk_size, TL1, TL2, TLx, TLy);
cudaDeviceSynchronize();

addColumn<<<1, width>>>(Img, height, width, L1, L2, Lx, Ly);
addColumn<<<1, tmplate_blk_size >>>(Template, tmplate_blk_size, tmplate_blk_size, TL1, TL2, TLx, TLy);
cudaDeviceSynchronize();

//Calculate all features
//For template, we only need one value for each feature so it's better and faster to run it on CPU
calculateTemplateFeatures(TL1, TL2, TLx, TLy, tmplate_blk_size, &TV1, &TV2, &TV3, &TV4);

dim3 threads(height, width);
calculateFeatures<<<1, threads>>>(L1, L2, Lx, Ly, height, width, tmplate_blk_size, V1, V2, V3, V4, TV1, TV2, TV3, TV4, Differences);
cudaDeviceSynchronize();

// Free memory
cudaFree(Img);
cudaFree(L1);
cudaFree(L2);
cudaFree(Lx);
cudaFree(Ly);
cudaFree(V1);
cudaFree(V2);
cudaFree(V3);
cudaFree(V4);
cudaFree(Template);
cudaFree(TL1);
cudaFree(TL2);
cudaFree(TLx);
cudaFree(TLy);
cudaFree(Differences);

return 0;







share|improve this question

















  • 1




    Please tell us more about what you mean by template matching, ideally with examples.
    – 200_success
    Apr 12 at 21:01










  • Don't forget to mention the compute capability of your GPU and the CUDA API version as well.
    – xorz57
    Apr 13 at 0:58












up vote
0
down vote

favorite









up vote
0
down vote

favorite











I have started learning CUDA programming. As my first code, I have tried to implement a simple template matching program. but as I am very new to CUDA and parallel programming, I am not sure how I can improve and optimize my code to gain most from CUDA and parallel execution of this code.
Here is the code I have written so far:



#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <iostream>
#include <math.h>
#include <chrono>

void populate(uint64_t *img, int height, int width, uint8_t val)
uint64_t * ptr = img;
for (int h = 0; h < height; h++)
for (int w = 0; w < width; w++)
*ptr = (h*width) + w;
ptr++;




__global__
void addRow(uint64_t *Img, int height, int width, uint64_t *L1, uint64_t *L2, uint64_t *Lx, uint64_t *Ly)

int row_start = threadIdx.x * width;

uint64_t *img = &Img[row_start];
uint64_t *l1 = &L1[row_start];
uint64_t *l2 = &L2[row_start];
uint64_t *lx = &Lx[row_start];
uint64_t *ly = &Ly[row_start];
for (int i = 0; i < width; i++)
if (i == 0)
l1[i] = img[i];
l2[i] = img[i] * img[i];

else
l1[i] = l1[i - 1] + img[i];
l2[i] = l2[i - 1] + (img[i] * img[i]);

lx[i] = lx[i-1] + img[i] * i;
ly[i] = ly[i - 1] + img[i] * threadIdx.x;

return;


__global__
void addColumn(uint64_t *Img, int height, int width, uint64_t *L1, uint64_t *L2, uint64_t *Lx, uint64_t *Ly)

int column_start = threadIdx.x;
uint64_t *img = &Img[column_start];
uint64_t *l1 = &L1[column_start];
uint64_t *l2 = &L2[column_start];
uint64_t *lx = &Lx[column_start];
uint64_t *ly = &Ly[column_start];
int pre_rd = 0;
int rd = 0;
for (int i = 0; i < height; i++)
if (i != 0)
l1[rd] += l1[pre_rd];
l2[rd] += l2[pre_rd];
lx[rd] += lx[pre_rd];
ly[rd] += ly[pre_rd];

pre_rd = rd;
rd += width;

return;


void calculateTemplateFeatures(uint64_t *L1, uint64_t *L2, uint64_t *Lx, uint64_t *Ly, int blk_size, float *V1, float *V2, float *V3, float *V4)
int a = 0;
int b = blk_size - 1;
int c = blk_size * (blk_size - 1);
int d = (blk_size * blk_size) - 1;
float blk_hlf = (float)(blk_size - 1) / 2;
int blk_pow = blk_size * blk_size;
*V1 = (L1[d] + L1[a]) - (L1[b] + L1[c]);
*V2 = (L2[d] + L2[a]) - (L2[b] + L2[c]);
*V3 = (Lx[d] + Lx[a]) - (Lx[b] + Lx[c]);
*V4 = (Ly[d] + Ly[a]) - (Ly[b] + Ly[c]);

*V3 = (*V3 - blk_hlf * *V1) / (blk_pow * blk_size);
*V4 = (*V4 - blk_hlf * *V1) / (blk_pow * blk_size);

*V1 = *V1 / blk_pow;
*V2 = (*V2 / blk_pow) - (*V1 * *V1);




__global__
void calculateFeatures(uint64_t *L1, uint64_t *L2, uint64_t *Lx, uint64_t *Ly, int height, int width, int blk_size, float *V1, float *V2, float *V3, float *V4, float TV1, float TV2, float TV3, float TV4, float *differences)

int findMin(float *diffs,int size)
int min = diffs[0];
int min_idx = 0;
for (int i = 1; i < size; i++)
if (min > diffs[i])
min_idx = i;
min = diffs[i];


return min_idx;


int main(void)


int height = 16;
int width = 16;
int tmplate_blk_size = 4;
int features_width = width - tmplate_blk_size + 1;
int features_height = height - tmplate_blk_size + 1;

uint64_t *Img, *L1, *L2, *Lx, *Ly;
uint64_t *Template, *TL1, *TL2, *TLx, *TLy;
float *V1, *V2, *V3, *V4; //Features of input image
float TV1, TV2, TV3, TV4; //Features of template image
float *Differences; //Stores distance between image features and template features

// Allocate Unified Memory for input image – accessible from CPU or GPU
int image_size = height * width;
int feature_size = features_width * features_height;
cudaMallocManaged(&Img, image_size * sizeof(uint64_t));
cudaMallocManaged(&L1, image_size * sizeof(uint64_t));
cudaMallocManaged(&L2, image_size * sizeof(uint64_t));
cudaMallocManaged(&Lx, image_size * sizeof(uint64_t));
cudaMallocManaged(&Ly, image_size * sizeof(uint64_t));
cudaMallocManaged(&V1, feature_size * sizeof(float));
cudaMallocManaged(&V2, feature_size * sizeof(float));
cudaMallocManaged(&V3, feature_size * sizeof(float));
cudaMallocManaged(&V4, feature_size * sizeof(float));
cudaMallocManaged(&Differences, feature_size * sizeof(float));

//Allocate Unified Memory for template image – accessible from CPU or GPU
int template_size = tmplate_blk_size * tmplate_blk_size;
cudaMallocManaged(&Template, template_size * sizeof(uint64_t));
cudaMallocManaged(&TL1, template_size * sizeof(uint64_t));
cudaMallocManaged(&TL2, template_size * sizeof(uint64_t));
cudaMallocManaged(&TLx, template_size * sizeof(uint64_t));
cudaMallocManaged(&TLy, template_size * sizeof(uint64_t));

populate(Img, height, width, 1);
populate(Template, tmplate_blk_size, tmplate_blk_size, 1);

addRow<<<1, height>>>(Img, height, width, L1, L2, Lx, Ly);
addRow<<<1, tmplate_blk_size>>>(Template, tmplate_blk_size, tmplate_blk_size, TL1, TL2, TLx, TLy);
cudaDeviceSynchronize();

addColumn<<<1, width>>>(Img, height, width, L1, L2, Lx, Ly);
addColumn<<<1, tmplate_blk_size >>>(Template, tmplate_blk_size, tmplate_blk_size, TL1, TL2, TLx, TLy);
cudaDeviceSynchronize();

//Calculate all features
//For template, we only need one value for each feature so it's better and faster to run it on CPU
calculateTemplateFeatures(TL1, TL2, TLx, TLy, tmplate_blk_size, &TV1, &TV2, &TV3, &TV4);

dim3 threads(height, width);
calculateFeatures<<<1, threads>>>(L1, L2, Lx, Ly, height, width, tmplate_blk_size, V1, V2, V3, V4, TV1, TV2, TV3, TV4, Differences);
cudaDeviceSynchronize();

// Free memory
cudaFree(Img);
cudaFree(L1);
cudaFree(L2);
cudaFree(Lx);
cudaFree(Ly);
cudaFree(V1);
cudaFree(V2);
cudaFree(V3);
cudaFree(V4);
cudaFree(Template);
cudaFree(TL1);
cudaFree(TL2);
cudaFree(TLx);
cudaFree(TLy);
cudaFree(Differences);

return 0;







share|improve this question













I have started learning CUDA programming. As my first code, I have tried to implement a simple template matching program. but as I am very new to CUDA and parallel programming, I am not sure how I can improve and optimize my code to gain most from CUDA and parallel execution of this code.
Here is the code I have written so far:



#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <iostream>
#include <math.h>
#include <chrono>

void populate(uint64_t *img, int height, int width, uint8_t val)
uint64_t * ptr = img;
for (int h = 0; h < height; h++)
for (int w = 0; w < width; w++)
*ptr = (h*width) + w;
ptr++;




__global__
void addRow(uint64_t *Img, int height, int width, uint64_t *L1, uint64_t *L2, uint64_t *Lx, uint64_t *Ly)

int row_start = threadIdx.x * width;

uint64_t *img = &Img[row_start];
uint64_t *l1 = &L1[row_start];
uint64_t *l2 = &L2[row_start];
uint64_t *lx = &Lx[row_start];
uint64_t *ly = &Ly[row_start];
for (int i = 0; i < width; i++)
if (i == 0)
l1[i] = img[i];
l2[i] = img[i] * img[i];

else
l1[i] = l1[i - 1] + img[i];
l2[i] = l2[i - 1] + (img[i] * img[i]);

lx[i] = lx[i-1] + img[i] * i;
ly[i] = ly[i - 1] + img[i] * threadIdx.x;

return;


__global__
void addColumn(uint64_t *Img, int height, int width, uint64_t *L1, uint64_t *L2, uint64_t *Lx, uint64_t *Ly)

int column_start = threadIdx.x;
uint64_t *img = &Img[column_start];
uint64_t *l1 = &L1[column_start];
uint64_t *l2 = &L2[column_start];
uint64_t *lx = &Lx[column_start];
uint64_t *ly = &Ly[column_start];
int pre_rd = 0;
int rd = 0;
for (int i = 0; i < height; i++)
if (i != 0)
l1[rd] += l1[pre_rd];
l2[rd] += l2[pre_rd];
lx[rd] += lx[pre_rd];
ly[rd] += ly[pre_rd];

pre_rd = rd;
rd += width;

return;


void calculateTemplateFeatures(uint64_t *L1, uint64_t *L2, uint64_t *Lx, uint64_t *Ly, int blk_size, float *V1, float *V2, float *V3, float *V4)
int a = 0;
int b = blk_size - 1;
int c = blk_size * (blk_size - 1);
int d = (blk_size * blk_size) - 1;
float blk_hlf = (float)(blk_size - 1) / 2;
int blk_pow = blk_size * blk_size;
*V1 = (L1[d] + L1[a]) - (L1[b] + L1[c]);
*V2 = (L2[d] + L2[a]) - (L2[b] + L2[c]);
*V3 = (Lx[d] + Lx[a]) - (Lx[b] + Lx[c]);
*V4 = (Ly[d] + Ly[a]) - (Ly[b] + Ly[c]);

*V3 = (*V3 - blk_hlf * *V1) / (blk_pow * blk_size);
*V4 = (*V4 - blk_hlf * *V1) / (blk_pow * blk_size);

*V1 = *V1 / blk_pow;
*V2 = (*V2 / blk_pow) - (*V1 * *V1);




__global__
void calculateFeatures(uint64_t *L1, uint64_t *L2, uint64_t *Lx, uint64_t *Ly, int height, int width, int blk_size, float *V1, float *V2, float *V3, float *V4, float TV1, float TV2, float TV3, float TV4, float *differences)

int findMin(float *diffs,int size)
int min = diffs[0];
int min_idx = 0;
for (int i = 1; i < size; i++)
if (min > diffs[i])
min_idx = i;
min = diffs[i];


return min_idx;


int main(void)


int height = 16;
int width = 16;
int tmplate_blk_size = 4;
int features_width = width - tmplate_blk_size + 1;
int features_height = height - tmplate_blk_size + 1;

uint64_t *Img, *L1, *L2, *Lx, *Ly;
uint64_t *Template, *TL1, *TL2, *TLx, *TLy;
float *V1, *V2, *V3, *V4; //Features of input image
float TV1, TV2, TV3, TV4; //Features of template image
float *Differences; //Stores distance between image features and template features

// Allocate Unified Memory for input image – accessible from CPU or GPU
int image_size = height * width;
int feature_size = features_width * features_height;
cudaMallocManaged(&Img, image_size * sizeof(uint64_t));
cudaMallocManaged(&L1, image_size * sizeof(uint64_t));
cudaMallocManaged(&L2, image_size * sizeof(uint64_t));
cudaMallocManaged(&Lx, image_size * sizeof(uint64_t));
cudaMallocManaged(&Ly, image_size * sizeof(uint64_t));
cudaMallocManaged(&V1, feature_size * sizeof(float));
cudaMallocManaged(&V2, feature_size * sizeof(float));
cudaMallocManaged(&V3, feature_size * sizeof(float));
cudaMallocManaged(&V4, feature_size * sizeof(float));
cudaMallocManaged(&Differences, feature_size * sizeof(float));

//Allocate Unified Memory for template image – accessible from CPU or GPU
int template_size = tmplate_blk_size * tmplate_blk_size;
cudaMallocManaged(&Template, template_size * sizeof(uint64_t));
cudaMallocManaged(&TL1, template_size * sizeof(uint64_t));
cudaMallocManaged(&TL2, template_size * sizeof(uint64_t));
cudaMallocManaged(&TLx, template_size * sizeof(uint64_t));
cudaMallocManaged(&TLy, template_size * sizeof(uint64_t));

populate(Img, height, width, 1);
populate(Template, tmplate_blk_size, tmplate_blk_size, 1);

addRow<<<1, height>>>(Img, height, width, L1, L2, Lx, Ly);
addRow<<<1, tmplate_blk_size>>>(Template, tmplate_blk_size, tmplate_blk_size, TL1, TL2, TLx, TLy);
cudaDeviceSynchronize();

addColumn<<<1, width>>>(Img, height, width, L1, L2, Lx, Ly);
addColumn<<<1, tmplate_blk_size >>>(Template, tmplate_blk_size, tmplate_blk_size, TL1, TL2, TLx, TLy);
cudaDeviceSynchronize();

//Calculate all features
//For template, we only need one value for each feature so it's better and faster to run it on CPU
calculateTemplateFeatures(TL1, TL2, TLx, TLy, tmplate_blk_size, &TV1, &TV2, &TV3, &TV4);

dim3 threads(height, width);
calculateFeatures<<<1, threads>>>(L1, L2, Lx, Ly, height, width, tmplate_blk_size, V1, V2, V3, V4, TV1, TV2, TV3, TV4, Differences);
cudaDeviceSynchronize();

// Free memory
cudaFree(Img);
cudaFree(L1);
cudaFree(L2);
cudaFree(Lx);
cudaFree(Ly);
cudaFree(V1);
cudaFree(V2);
cudaFree(V3);
cudaFree(V4);
cudaFree(Template);
cudaFree(TL1);
cudaFree(TL2);
cudaFree(TLx);
cudaFree(TLy);
cudaFree(Differences);

return 0;









share|improve this question












share|improve this question




share|improve this question








edited Apr 12 at 21:01









200_success

123k14142399




123k14142399









asked Apr 12 at 18:13









Masoud

71




71







  • 1




    Please tell us more about what you mean by template matching, ideally with examples.
    – 200_success
    Apr 12 at 21:01










  • Don't forget to mention the compute capability of your GPU and the CUDA API version as well.
    – xorz57
    Apr 13 at 0:58












  • 1




    Please tell us more about what you mean by template matching, ideally with examples.
    – 200_success
    Apr 12 at 21:01










  • Don't forget to mention the compute capability of your GPU and the CUDA API version as well.
    – xorz57
    Apr 13 at 0:58







1




1




Please tell us more about what you mean by template matching, ideally with examples.
– 200_success
Apr 12 at 21:01




Please tell us more about what you mean by template matching, ideally with examples.
– 200_success
Apr 12 at 21:01












Don't forget to mention the compute capability of your GPU and the CUDA API version as well.
– xorz57
Apr 13 at 0:58




Don't forget to mention the compute capability of your GPU and the CUDA API version as well.
– xorz57
Apr 13 at 0:58















active

oldest

votes











Your Answer




StackExchange.ifUsing("editor", function ()
return StackExchange.using("mathjaxEditing", function ()
StackExchange.MarkdownEditor.creationCallbacks.add(function (editor, postfix)
StackExchange.mathjaxEditing.prepareWmdForMathJax(editor, postfix, [["\$", "\$"]]);
);
);
, "mathjax-editing");

StackExchange.ifUsing("editor", function ()
StackExchange.using("externalEditor", function ()
StackExchange.using("snippets", function ()
StackExchange.snippets.init();
);
);
, "code-snippets");

StackExchange.ready(function()
var channelOptions =
tags: "".split(" "),
id: "196"
;
initTagRenderer("".split(" "), "".split(" "), channelOptions);

StackExchange.using("externalEditor", function()
// Have to fire editor after snippets, if snippets enabled
if (StackExchange.settings.snippets.snippetsEnabled)
StackExchange.using("snippets", function()
createEditor();
);

else
createEditor();

);

function createEditor()
StackExchange.prepareEditor(
heartbeatType: 'answer',
convertImagesToLinks: false,
noModals: false,
showLowRepImageUploadWarning: true,
reputationToPostImages: null,
bindNavPrevention: true,
postfix: "",
onDemand: true,
discardSelector: ".discard-answer"
,immediatelyShowMarkdownHelp:true
);



);








 

draft saved


draft discarded


















StackExchange.ready(
function ()
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fcodereview.stackexchange.com%2fquestions%2f191904%2fcuda-template-matching-program%23new-answer', 'question_page');

);

Post as a guest



































active

oldest

votes













active

oldest

votes









active

oldest

votes






active

oldest

votes










 

draft saved


draft discarded


























 


draft saved


draft discarded














StackExchange.ready(
function ()
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fcodereview.stackexchange.com%2fquestions%2f191904%2fcuda-template-matching-program%23new-answer', 'question_page');

);

Post as a guest













































































Popular posts from this blog

Greedy Best First Search implementation in Rust

Function to Return a JSON Like Objects Using VBA Collections and Arrays

C++11 CLH Lock Implementation