CUDA Template-matching program

The name of the pictureThe name of the pictureThe name of the pictureClash Royale CLAN TAG#URR8PPP





.everyoneloves__top-leaderboard:empty,.everyoneloves__mid-leaderboard:empty margin-bottom:0;







up vote
0
down vote

favorite












I have started learning CUDA programming. As my first code, I have tried to implement a simple template-matching program. But as I am very new to CUDA and parallel programming, I am not sure how I can improve and optimize my code to gain the most from CUDA and the parallel execution of this code.
Here is the code I have written so far:



#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <chrono>
#include <cstdint>   // uint64_t, uint8_t (previously relied on transitive includes)
#include <cstdlib>   // std::abort (used by error checking in main)
#include <iostream>
#include <math.h>

// Fills `img` (height x width, row-major) with each pixel's linear index:
// img[h*width + w] = h*width + w.
// NOTE(review): the `val` parameter is accepted but never used — kept for
// interface compatibility; confirm whether a constant fill was intended.
void populate(uint64_t *img, int height, int width, uint8_t val)
{
    (void)val; // intentionally unused, see note above
    uint64_t *ptr = img;
    for (int h = 0; h < height; h++) {
        for (int w = 0; w < width; w++) {
            *ptr = (h * width) + w;
            ptr++;
        }
    }
}




// One thread per image row (launch as addRow<<<1, height>>>): builds, for
// this thread's row, running prefix sums of pixel values (L1), squared
// values (L2), x-weighted values (Lx) and y-weighted values (Ly).
// `height` is unused here (the launch config supplies the row count) but is
// kept for interface compatibility.
//
// Fix: the original read lx[-1] and ly[-1] when i == 0 (out-of-bounds,
// undefined behaviour); the first element is now initialised explicitly,
// matching the l1/l2 handling.
__global__
void addRow(uint64_t *Img, int height, int width, uint64_t *L1, uint64_t *L2, uint64_t *Lx, uint64_t *Ly)
{
    int row_start = threadIdx.x * width;

    uint64_t *img = &Img[row_start];
    uint64_t *l1 = &L1[row_start];
    uint64_t *l2 = &L2[row_start];
    uint64_t *lx = &Lx[row_start];
    uint64_t *ly = &Ly[row_start];
    for (int i = 0; i < width; i++) {
        if (i == 0) {
            l1[i] = img[i];
            l2[i] = img[i] * img[i];
            lx[i] = 0;                        // img[0] * 0
            ly[i] = img[i] * threadIdx.x;
        } else {
            l1[i] = l1[i - 1] + img[i];
            l2[i] = l2[i - 1] + (img[i] * img[i]);
            lx[i] = lx[i - 1] + img[i] * i;
            ly[i] = ly[i - 1] + img[i] * threadIdx.x;
        }
    }
}


// One thread per image column (launch as addColumn<<<1, width>>>): completes
// the 2-D integral images by walking top to bottom and adding each row's
// partial sums into the row below. Must run after addRow has produced the
// per-row prefix sums. `Img` is unused but kept for interface compatibility.
__global__
void addColumn(uint64_t *Img, int height, int width, uint64_t *L1, uint64_t *L2, uint64_t *Lx, uint64_t *Ly)
{
    const int col = threadIdx.x;
    uint64_t *l1 = L1 + col;
    uint64_t *l2 = L2 + col;
    uint64_t *lx = Lx + col;
    uint64_t *ly = Ly + col;

    // Start at the second row; each element absorbs the value directly above.
    for (int row = 1; row < height; row++) {
        const int cur = row * width;
        const int above = cur - width;
        l1[cur] += l1[above];
        l2[cur] += l2[above];
        lx[cur] += lx[above];
        ly[cur] += ly[above];
    }
}


// Derives the template's four scalar features from its integral images
// (each blk_size x blk_size, fully summed by addRow/addColumn):
//   V1 — mean pixel value, V2 — variance, V3/V4 — centred, normalised
//   x- and y-weighted sums. Corner indices follow the four-corner
//   integral-image box-sum rule over the whole template block.
void calculateTemplateFeatures(uint64_t *L1, uint64_t *L2, uint64_t *Lx, uint64_t *Ly, int blk_size, float *V1, float *V2, float *V3, float *V4)
{
    // Corner offsets of the template block inside its own integral image.
    const int topLeft  = 0;
    const int topRight = blk_size - 1;
    const int botLeft  = blk_size * (blk_size - 1);
    const int botRight = (blk_size * blk_size) - 1;

    const float halfSpan = (float)(blk_size - 1) / 2; // centre coordinate of the block
    const int   area     = blk_size * blk_size;       // pixel count of the block

    // Raw box sums (unsigned 64-bit arithmetic, then converted to float —
    // same evaluation order/semantics as the surrounding code).
    *V1 = (L1[botRight] + L1[topLeft]) - (L1[topRight] + L1[botLeft]);
    *V2 = (L2[botRight] + L2[topLeft]) - (L2[topRight] + L2[botLeft]);
    *V3 = (Lx[botRight] + Lx[topLeft]) - (Lx[topRight] + Lx[botLeft]);
    *V4 = (Ly[botRight] + Ly[topLeft]) - (Ly[topRight] + Ly[botLeft]);

    // Centre the coordinate-weighted sums, then normalise.
    *V3 = (*V3 - halfSpan * *V1) / (area * blk_size);
    *V4 = (*V4 - halfSpan * *V1) / (area * blk_size);

    // Mean and variance of the pixel values.
    *V1 = *V1 / area;
    *V2 = (*V2 / area) - (*V1 * *V1);
}




// Per-position feature computation for the input image: intended to derive
// V1..V4 from the integral images and write the distance to the template
// features (TV1..TV4) into `differences` (one entry per candidate position).
// NOTE(review): only the signature survived in this capture of the source —
// the kernel body is missing; recover the implementation before building,
// as the <<<1, dim3(height, width)>>> launch in main depends on it.
__global__
void calculateFeatures(uint64_t *L1, uint64_t *L2, uint64_t *Lx, uint64_t *Ly, int height, int width, int blk_size, float *V1, float *V2, float *V3, float *V4, float TV1, float TV2, float TV3, float TV4, float *differences)

// Returns the index of the smallest value in `diffs` (length `size`, size >= 1).
// Fix: the running minimum was declared `int`, silently truncating the float
// distances — e.g. {0.5f, 0.3f} both truncated to 0 and index 0 was returned
// instead of 1. The minimum is now tracked as a float.
int findMin(float *diffs, int size)
{
    float min = diffs[0];
    int min_idx = 0;
    for (int i = 1; i < size; i++) {
        if (min > diffs[i]) {
            min_idx = i;
            min = diffs[i];
        }
    }
    return min_idx;
}


// Drives the template-matching pipeline: allocate unified memory, build the
// integral images on the GPU (row pass then column pass), compute the
// template's features on the CPU, then launch the per-position feature
// comparison on the GPU.
// Fix: every CUDA API call previously went unchecked; failures (allocation,
// launch-config, in-kernel faults) were silently ignored. All calls are now
// verified, and kernel launches are followed by cudaGetLastError().
int main(void)
{
    // Abort with a readable message on any CUDA API failure.
    auto check = [](cudaError_t err) {
        if (err != cudaSuccess) {
            std::cerr << "CUDA error: " << cudaGetErrorString(err) << '\n';
            std::abort();
        }
    };

    const int height = 16;
    const int width = 16;
    const int tmplate_blk_size = 4;
    const int features_width = width - tmplate_blk_size + 1;
    const int features_height = height - tmplate_blk_size + 1;

    uint64_t *Img, *L1, *L2, *Lx, *Ly;
    uint64_t *Template, *TL1, *TL2, *TLx, *TLy;
    float *V1, *V2, *V3, *V4; // Features of input image
    float TV1, TV2, TV3, TV4; // Features of template image
    float *Differences;       // Distance between image features and template features

    // Allocate Unified Memory for the input image - accessible from CPU or GPU.
    const int image_size = height * width;
    const int feature_size = features_width * features_height;
    check(cudaMallocManaged(&Img, image_size * sizeof(uint64_t)));
    check(cudaMallocManaged(&L1, image_size * sizeof(uint64_t)));
    check(cudaMallocManaged(&L2, image_size * sizeof(uint64_t)));
    check(cudaMallocManaged(&Lx, image_size * sizeof(uint64_t)));
    check(cudaMallocManaged(&Ly, image_size * sizeof(uint64_t)));
    check(cudaMallocManaged(&V1, feature_size * sizeof(float)));
    check(cudaMallocManaged(&V2, feature_size * sizeof(float)));
    check(cudaMallocManaged(&V3, feature_size * sizeof(float)));
    check(cudaMallocManaged(&V4, feature_size * sizeof(float)));
    check(cudaMallocManaged(&Differences, feature_size * sizeof(float)));

    // Allocate Unified Memory for the template image - accessible from CPU or GPU.
    const int template_size = tmplate_blk_size * tmplate_blk_size;
    check(cudaMallocManaged(&Template, template_size * sizeof(uint64_t)));
    check(cudaMallocManaged(&TL1, template_size * sizeof(uint64_t)));
    check(cudaMallocManaged(&TL2, template_size * sizeof(uint64_t)));
    check(cudaMallocManaged(&TLx, template_size * sizeof(uint64_t)));
    check(cudaMallocManaged(&TLy, template_size * sizeof(uint64_t)));

    populate(Img, height, width, 1);
    populate(Template, tmplate_blk_size, tmplate_blk_size, 1);

    // Row prefix sums (one thread per row) for image and template.
    addRow<<<1, height>>>(Img, height, width, L1, L2, Lx, Ly);
    addRow<<<1, tmplate_blk_size>>>(Template, tmplate_blk_size, tmplate_blk_size, TL1, TL2, TLx, TLy);
    check(cudaGetLastError());
    check(cudaDeviceSynchronize());

    // Column pass (one thread per column) completes the integral images.
    addColumn<<<1, width>>>(Img, height, width, L1, L2, Lx, Ly);
    addColumn<<<1, tmplate_blk_size>>>(Template, tmplate_blk_size, tmplate_blk_size, TL1, TL2, TLx, TLy);
    check(cudaGetLastError());
    check(cudaDeviceSynchronize());

    // For the template we only need one value per feature, so the CPU is
    // simpler and faster than a kernel launch.
    calculateTemplateFeatures(TL1, TL2, TLx, TLy, tmplate_blk_size, &TV1, &TV2, &TV3, &TV4);

    dim3 threads(height, width);
    calculateFeatures<<<1, threads>>>(L1, L2, Lx, Ly, height, width, tmplate_blk_size, V1, V2, V3, V4, TV1, TV2, TV3, TV4, Differences);
    check(cudaGetLastError());
    check(cudaDeviceSynchronize());

    // Free memory.
    cudaFree(Img);
    cudaFree(L1);
    cudaFree(L2);
    cudaFree(Lx);
    cudaFree(Ly);
    cudaFree(V1);
    cudaFree(V2);
    cudaFree(V3);
    cudaFree(V4);
    cudaFree(Template);
    cudaFree(TL1);
    cudaFree(TL2);
    cudaFree(TLx);
    cudaFree(TLy);
    cudaFree(Differences);

    return 0;
}







share|improve this question

















  • 1




    Please tell us more about what you mean by template matching, ideally with examples.
    – 200_success
    Apr 12 at 21:01










  • Don't forget to mention the compute capability of your GPU and the CUDA API version as well.
    – xorz57
    Apr 13 at 0:58
















up vote
0
down vote

favorite












I have started learning CUDA programming. As my first code, I have tried to implement a simple template matching program. but as I am very new to CUDA and parallel programming, I am not sure how I can improve and optimize my code to gain most from CUDA and parallel execution of this code.
Here is the code I have written so far:



#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <iostream>
#include <math.h>
#include <chrono>

void populate(uint64_t *img, int height, int width, uint8_t val)
uint64_t * ptr = img;
for (int h = 0; h < height; h++)
for (int w = 0; w < width; w++)
*ptr = (h*width) + w;
ptr++;




__global__
void addRow(uint64_t *Img, int height, int width, uint64_t *L1, uint64_t *L2, uint64_t *Lx, uint64_t *Ly)

int row_start = threadIdx.x * width;

uint64_t *img = &Img[row_start];
uint64_t *l1 = &L1[row_start];
uint64_t *l2 = &L2[row_start];
uint64_t *lx = &Lx[row_start];
uint64_t *ly = &Ly[row_start];
for (int i = 0; i < width; i++)
if (i == 0)
l1[i] = img[i];
l2[i] = img[i] * img[i];

else
l1[i] = l1[i - 1] + img[i];
l2[i] = l2[i - 1] + (img[i] * img[i]);

lx[i] = lx[i-1] + img[i] * i;
ly[i] = ly[i - 1] + img[i] * threadIdx.x;

return;


__global__
void addColumn(uint64_t *Img, int height, int width, uint64_t *L1, uint64_t *L2, uint64_t *Lx, uint64_t *Ly)

int column_start = threadIdx.x;
uint64_t *img = &Img[column_start];
uint64_t *l1 = &L1[column_start];
uint64_t *l2 = &L2[column_start];
uint64_t *lx = &Lx[column_start];
uint64_t *ly = &Ly[column_start];
int pre_rd = 0;
int rd = 0;
for (int i = 0; i < height; i++)
if (i != 0)
l1[rd] += l1[pre_rd];
l2[rd] += l2[pre_rd];
lx[rd] += lx[pre_rd];
ly[rd] += ly[pre_rd];

pre_rd = rd;
rd += width;

return;


void calculateTemplateFeatures(uint64_t *L1, uint64_t *L2, uint64_t *Lx, uint64_t *Ly, int blk_size, float *V1, float *V2, float *V3, float *V4)
int a = 0;
int b = blk_size - 1;
int c = blk_size * (blk_size - 1);
int d = (blk_size * blk_size) - 1;
float blk_hlf = (float)(blk_size - 1) / 2;
int blk_pow = blk_size * blk_size;
*V1 = (L1[d] + L1[a]) - (L1[b] + L1[c]);
*V2 = (L2[d] + L2[a]) - (L2[b] + L2[c]);
*V3 = (Lx[d] + Lx[a]) - (Lx[b] + Lx[c]);
*V4 = (Ly[d] + Ly[a]) - (Ly[b] + Ly[c]);

*V3 = (*V3 - blk_hlf * *V1) / (blk_pow * blk_size);
*V4 = (*V4 - blk_hlf * *V1) / (blk_pow * blk_size);

*V1 = *V1 / blk_pow;
*V2 = (*V2 / blk_pow) - (*V1 * *V1);




__global__
void calculateFeatures(uint64_t *L1, uint64_t *L2, uint64_t *Lx, uint64_t *Ly, int height, int width, int blk_size, float *V1, float *V2, float *V3, float *V4, float TV1, float TV2, float TV3, float TV4, float *differences)

int findMin(float *diffs,int size)
int min = diffs[0];
int min_idx = 0;
for (int i = 1; i < size; i++)
if (min > diffs[i])
min_idx = i;
min = diffs[i];


return min_idx;


int main(void)


int height = 16;
int width = 16;
int tmplate_blk_size = 4;
int features_width = width - tmplate_blk_size + 1;
int features_height = height - tmplate_blk_size + 1;

uint64_t *Img, *L1, *L2, *Lx, *Ly;
uint64_t *Template, *TL1, *TL2, *TLx, *TLy;
float *V1, *V2, *V3, *V4; //Features of input image
float TV1, TV2, TV3, TV4; //Features of template image
float *Differences; //Stores distance between image features and template features

// Allocate Unified Memory for input image – accessible from CPU or GPU
int image_size = height * width;
int feature_size = features_width * features_height;
cudaMallocManaged(&Img, image_size * sizeof(uint64_t));
cudaMallocManaged(&L1, image_size * sizeof(uint64_t));
cudaMallocManaged(&L2, image_size * sizeof(uint64_t));
cudaMallocManaged(&Lx, image_size * sizeof(uint64_t));
cudaMallocManaged(&Ly, image_size * sizeof(uint64_t));
cudaMallocManaged(&V1, feature_size * sizeof(float));
cudaMallocManaged(&V2, feature_size * sizeof(float));
cudaMallocManaged(&V3, feature_size * sizeof(float));
cudaMallocManaged(&V4, feature_size * sizeof(float));
cudaMallocManaged(&Differences, feature_size * sizeof(float));

//Allocate Unified Memory for template image – accessible from CPU or GPU
int template_size = tmplate_blk_size * tmplate_blk_size;
cudaMallocManaged(&Template, template_size * sizeof(uint64_t));
cudaMallocManaged(&TL1, template_size * sizeof(uint64_t));
cudaMallocManaged(&TL2, template_size * sizeof(uint64_t));
cudaMallocManaged(&TLx, template_size * sizeof(uint64_t));
cudaMallocManaged(&TLy, template_size * sizeof(uint64_t));

populate(Img, height, width, 1);
populate(Template, tmplate_blk_size, tmplate_blk_size, 1);

addRow<<<1, height>>>(Img, height, width, L1, L2, Lx, Ly);
addRow<<<1, tmplate_blk_size>>>(Template, tmplate_blk_size, tmplate_blk_size, TL1, TL2, TLx, TLy);
cudaDeviceSynchronize();

addColumn<<<1, width>>>(Img, height, width, L1, L2, Lx, Ly);
addColumn<<<1, tmplate_blk_size >>>(Template, tmplate_blk_size, tmplate_blk_size, TL1, TL2, TLx, TLy);
cudaDeviceSynchronize();

//Calculate all features
//For template, we only need one value for each feature so it's better and faster to run it on CPU
calculateTemplateFeatures(TL1, TL2, TLx, TLy, tmplate_blk_size, &TV1, &TV2, &TV3, &TV4);

dim3 threads(height, width);
calculateFeatures<<<1, threads>>>(L1, L2, Lx, Ly, height, width, tmplate_blk_size, V1, V2, V3, V4, TV1, TV2, TV3, TV4, Differences);
cudaDeviceSynchronize();

// Free memory
cudaFree(Img);
cudaFree(L1);
cudaFree(L2);
cudaFree(Lx);
cudaFree(Ly);
cudaFree(V1);
cudaFree(V2);
cudaFree(V3);
cudaFree(V4);
cudaFree(Template);
cudaFree(TL1);
cudaFree(TL2);
cudaFree(TLx);
cudaFree(TLy);
cudaFree(Differences);

return 0;







share|improve this question

















  • 1




    Please tell us more about what you mean by template matching, ideally with examples.
    – 200_success
    Apr 12 at 21:01










  • Don't forget to mention the compute capability of your GPU and the CUDA API version as well.
    – xorz57
    Apr 13 at 0:58












up vote
0
down vote

favorite









up vote
0
down vote

favorite











I have started learning CUDA programming. As my first code, I have tried to implement a simple template matching program. but as I am very new to CUDA and parallel programming, I am not sure how I can improve and optimize my code to gain most from CUDA and parallel execution of this code.
Here is the code I have written so far:



#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <iostream>
#include <math.h>
#include <chrono>

void populate(uint64_t *img, int height, int width, uint8_t val)
uint64_t * ptr = img;
for (int h = 0; h < height; h++)
for (int w = 0; w < width; w++)
*ptr = (h*width) + w;
ptr++;




__global__
void addRow(uint64_t *Img, int height, int width, uint64_t *L1, uint64_t *L2, uint64_t *Lx, uint64_t *Ly)

int row_start = threadIdx.x * width;

uint64_t *img = &Img[row_start];
uint64_t *l1 = &L1[row_start];
uint64_t *l2 = &L2[row_start];
uint64_t *lx = &Lx[row_start];
uint64_t *ly = &Ly[row_start];
for (int i = 0; i < width; i++)
if (i == 0)
l1[i] = img[i];
l2[i] = img[i] * img[i];

else
l1[i] = l1[i - 1] + img[i];
l2[i] = l2[i - 1] + (img[i] * img[i]);

lx[i] = lx[i-1] + img[i] * i;
ly[i] = ly[i - 1] + img[i] * threadIdx.x;

return;


__global__
void addColumn(uint64_t *Img, int height, int width, uint64_t *L1, uint64_t *L2, uint64_t *Lx, uint64_t *Ly)

int column_start = threadIdx.x;
uint64_t *img = &Img[column_start];
uint64_t *l1 = &L1[column_start];
uint64_t *l2 = &L2[column_start];
uint64_t *lx = &Lx[column_start];
uint64_t *ly = &Ly[column_start];
int pre_rd = 0;
int rd = 0;
for (int i = 0; i < height; i++)
if (i != 0)
l1[rd] += l1[pre_rd];
l2[rd] += l2[pre_rd];
lx[rd] += lx[pre_rd];
ly[rd] += ly[pre_rd];

pre_rd = rd;
rd += width;

return;


void calculateTemplateFeatures(uint64_t *L1, uint64_t *L2, uint64_t *Lx, uint64_t *Ly, int blk_size, float *V1, float *V2, float *V3, float *V4)
int a = 0;
int b = blk_size - 1;
int c = blk_size * (blk_size - 1);
int d = (blk_size * blk_size) - 1;
float blk_hlf = (float)(blk_size - 1) / 2;
int blk_pow = blk_size * blk_size;
*V1 = (L1[d] + L1[a]) - (L1[b] + L1[c]);
*V2 = (L2[d] + L2[a]) - (L2[b] + L2[c]);
*V3 = (Lx[d] + Lx[a]) - (Lx[b] + Lx[c]);
*V4 = (Ly[d] + Ly[a]) - (Ly[b] + Ly[c]);

*V3 = (*V3 - blk_hlf * *V1) / (blk_pow * blk_size);
*V4 = (*V4 - blk_hlf * *V1) / (blk_pow * blk_size);

*V1 = *V1 / blk_pow;
*V2 = (*V2 / blk_pow) - (*V1 * *V1);




__global__
void calculateFeatures(uint64_t *L1, uint64_t *L2, uint64_t *Lx, uint64_t *Ly, int height, int width, int blk_size, float *V1, float *V2, float *V3, float *V4, float TV1, float TV2, float TV3, float TV4, float *differences)

int findMin(float *diffs,int size)
int min = diffs[0];
int min_idx = 0;
for (int i = 1; i < size; i++)
if (min > diffs[i])
min_idx = i;
min = diffs[i];


return min_idx;


int main(void)


int height = 16;
int width = 16;
int tmplate_blk_size = 4;
int features_width = width - tmplate_blk_size + 1;
int features_height = height - tmplate_blk_size + 1;

uint64_t *Img, *L1, *L2, *Lx, *Ly;
uint64_t *Template, *TL1, *TL2, *TLx, *TLy;
float *V1, *V2, *V3, *V4; //Features of input image
float TV1, TV2, TV3, TV4; //Features of template image
float *Differences; //Stores distance between image features and template features

// Allocate Unified Memory for input image – accessible from CPU or GPU
int image_size = height * width;
int feature_size = features_width * features_height;
cudaMallocManaged(&Img, image_size * sizeof(uint64_t));
cudaMallocManaged(&L1, image_size * sizeof(uint64_t));
cudaMallocManaged(&L2, image_size * sizeof(uint64_t));
cudaMallocManaged(&Lx, image_size * sizeof(uint64_t));
cudaMallocManaged(&Ly, image_size * sizeof(uint64_t));
cudaMallocManaged(&V1, feature_size * sizeof(float));
cudaMallocManaged(&V2, feature_size * sizeof(float));
cudaMallocManaged(&V3, feature_size * sizeof(float));
cudaMallocManaged(&V4, feature_size * sizeof(float));
cudaMallocManaged(&Differences, feature_size * sizeof(float));

//Allocate Unified Memory for template image – accessible from CPU or GPU
int template_size = tmplate_blk_size * tmplate_blk_size;
cudaMallocManaged(&Template, template_size * sizeof(uint64_t));
cudaMallocManaged(&TL1, template_size * sizeof(uint64_t));
cudaMallocManaged(&TL2, template_size * sizeof(uint64_t));
cudaMallocManaged(&TLx, template_size * sizeof(uint64_t));
cudaMallocManaged(&TLy, template_size * sizeof(uint64_t));

populate(Img, height, width, 1);
populate(Template, tmplate_blk_size, tmplate_blk_size, 1);

addRow<<<1, height>>>(Img, height, width, L1, L2, Lx, Ly);
addRow<<<1, tmplate_blk_size>>>(Template, tmplate_blk_size, tmplate_blk_size, TL1, TL2, TLx, TLy);
cudaDeviceSynchronize();

addColumn<<<1, width>>>(Img, height, width, L1, L2, Lx, Ly);
addColumn<<<1, tmplate_blk_size >>>(Template, tmplate_blk_size, tmplate_blk_size, TL1, TL2, TLx, TLy);
cudaDeviceSynchronize();

//Calculate all features
//For template, we only need one value for each feature so it's better and faster to run it on CPU
calculateTemplateFeatures(TL1, TL2, TLx, TLy, tmplate_blk_size, &TV1, &TV2, &TV3, &TV4);

dim3 threads(height, width);
calculateFeatures<<<1, threads>>>(L1, L2, Lx, Ly, height, width, tmplate_blk_size, V1, V2, V3, V4, TV1, TV2, TV3, TV4, Differences);
cudaDeviceSynchronize();

// Free memory
cudaFree(Img);
cudaFree(L1);
cudaFree(L2);
cudaFree(Lx);
cudaFree(Ly);
cudaFree(V1);
cudaFree(V2);
cudaFree(V3);
cudaFree(V4);
cudaFree(Template);
cudaFree(TL1);
cudaFree(TL2);
cudaFree(TLx);
cudaFree(TLy);
cudaFree(Differences);

return 0;







share|improve this question













I have started learning CUDA programming. As my first code, I have tried to implement a simple template matching program. but as I am very new to CUDA and parallel programming, I am not sure how I can improve and optimize my code to gain most from CUDA and parallel execution of this code.
Here is the code I have written so far:



#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <iostream>
#include <math.h>
#include <chrono>

void populate(uint64_t *img, int height, int width, uint8_t val)
uint64_t * ptr = img;
for (int h = 0; h < height; h++)
for (int w = 0; w < width; w++)
*ptr = (h*width) + w;
ptr++;




__global__
void addRow(uint64_t *Img, int height, int width, uint64_t *L1, uint64_t *L2, uint64_t *Lx, uint64_t *Ly)

int row_start = threadIdx.x * width;

uint64_t *img = &Img[row_start];
uint64_t *l1 = &L1[row_start];
uint64_t *l2 = &L2[row_start];
uint64_t *lx = &Lx[row_start];
uint64_t *ly = &Ly[row_start];
for (int i = 0; i < width; i++)
if (i == 0)
l1[i] = img[i];
l2[i] = img[i] * img[i];

else
l1[i] = l1[i - 1] + img[i];
l2[i] = l2[i - 1] + (img[i] * img[i]);

lx[i] = lx[i-1] + img[i] * i;
ly[i] = ly[i - 1] + img[i] * threadIdx.x;

return;


__global__
void addColumn(uint64_t *Img, int height, int width, uint64_t *L1, uint64_t *L2, uint64_t *Lx, uint64_t *Ly)

int column_start = threadIdx.x;
uint64_t *img = &Img[column_start];
uint64_t *l1 = &L1[column_start];
uint64_t *l2 = &L2[column_start];
uint64_t *lx = &Lx[column_start];
uint64_t *ly = &Ly[column_start];
int pre_rd = 0;
int rd = 0;
for (int i = 0; i < height; i++)
if (i != 0)
l1[rd] += l1[pre_rd];
l2[rd] += l2[pre_rd];
lx[rd] += lx[pre_rd];
ly[rd] += ly[pre_rd];

pre_rd = rd;
rd += width;

return;


void calculateTemplateFeatures(uint64_t *L1, uint64_t *L2, uint64_t *Lx, uint64_t *Ly, int blk_size, float *V1, float *V2, float *V3, float *V4)
int a = 0;
int b = blk_size - 1;
int c = blk_size * (blk_size - 1);
int d = (blk_size * blk_size) - 1;
float blk_hlf = (float)(blk_size - 1) / 2;
int blk_pow = blk_size * blk_size;
*V1 = (L1[d] + L1[a]) - (L1[b] + L1[c]);
*V2 = (L2[d] + L2[a]) - (L2[b] + L2[c]);
*V3 = (Lx[d] + Lx[a]) - (Lx[b] + Lx[c]);
*V4 = (Ly[d] + Ly[a]) - (Ly[b] + Ly[c]);

*V3 = (*V3 - blk_hlf * *V1) / (blk_pow * blk_size);
*V4 = (*V4 - blk_hlf * *V1) / (blk_pow * blk_size);

*V1 = *V1 / blk_pow;
*V2 = (*V2 / blk_pow) - (*V1 * *V1);




__global__
void calculateFeatures(uint64_t *L1, uint64_t *L2, uint64_t *Lx, uint64_t *Ly, int height, int width, int blk_size, float *V1, float *V2, float *V3, float *V4, float TV1, float TV2, float TV3, float TV4, float *differences)

int findMin(float *diffs,int size)
int min = diffs[0];
int min_idx = 0;
for (int i = 1; i < size; i++)
if (min > diffs[i])
min_idx = i;
min = diffs[i];


return min_idx;


int main(void)


int height = 16;
int width = 16;
int tmplate_blk_size = 4;
int features_width = width - tmplate_blk_size + 1;
int features_height = height - tmplate_blk_size + 1;

uint64_t *Img, *L1, *L2, *Lx, *Ly;
uint64_t *Template, *TL1, *TL2, *TLx, *TLy;
float *V1, *V2, *V3, *V4; //Features of input image
float TV1, TV2, TV3, TV4; //Features of template image
float *Differences; //Stores distance between image features and template features

// Allocate Unified Memory for input image – accessible from CPU or GPU
int image_size = height * width;
int feature_size = features_width * features_height;
cudaMallocManaged(&Img, image_size * sizeof(uint64_t));
cudaMallocManaged(&L1, image_size * sizeof(uint64_t));
cudaMallocManaged(&L2, image_size * sizeof(uint64_t));
cudaMallocManaged(&Lx, image_size * sizeof(uint64_t));
cudaMallocManaged(&Ly, image_size * sizeof(uint64_t));
cudaMallocManaged(&V1, feature_size * sizeof(float));
cudaMallocManaged(&V2, feature_size * sizeof(float));
cudaMallocManaged(&V3, feature_size * sizeof(float));
cudaMallocManaged(&V4, feature_size * sizeof(float));
cudaMallocManaged(&Differences, feature_size * sizeof(float));

//Allocate Unified Memory for template image – accessible from CPU or GPU
int template_size = tmplate_blk_size * tmplate_blk_size;
cudaMallocManaged(&Template, template_size * sizeof(uint64_t));
cudaMallocManaged(&TL1, template_size * sizeof(uint64_t));
cudaMallocManaged(&TL2, template_size * sizeof(uint64_t));
cudaMallocManaged(&TLx, template_size * sizeof(uint64_t));
cudaMallocManaged(&TLy, template_size * sizeof(uint64_t));

populate(Img, height, width, 1);
populate(Template, tmplate_blk_size, tmplate_blk_size, 1);

addRow<<<1, height>>>(Img, height, width, L1, L2, Lx, Ly);
addRow<<<1, tmplate_blk_size>>>(Template, tmplate_blk_size, tmplate_blk_size, TL1, TL2, TLx, TLy);
cudaDeviceSynchronize();

addColumn<<<1, width>>>(Img, height, width, L1, L2, Lx, Ly);
addColumn<<<1, tmplate_blk_size >>>(Template, tmplate_blk_size, tmplate_blk_size, TL1, TL2, TLx, TLy);
cudaDeviceSynchronize();

//Calculate all features
//For template, we only need one value for each feature so it's better and faster to run it on CPU
calculateTemplateFeatures(TL1, TL2, TLx, TLy, tmplate_blk_size, &TV1, &TV2, &TV3, &TV4);

dim3 threads(height, width);
calculateFeatures<<<1, threads>>>(L1, L2, Lx, Ly, height, width, tmplate_blk_size, V1, V2, V3, V4, TV1, TV2, TV3, TV4, Differences);
cudaDeviceSynchronize();

// Free memory
cudaFree(Img);
cudaFree(L1);
cudaFree(L2);
cudaFree(Lx);
cudaFree(Ly);
cudaFree(V1);
cudaFree(V2);
cudaFree(V3);
cudaFree(V4);
cudaFree(Template);
cudaFree(TL1);
cudaFree(TL2);
cudaFree(TLx);
cudaFree(TLy);
cudaFree(Differences);

return 0;









share|improve this question












share|improve this question




share|improve this question








edited Apr 12 at 21:01









200_success

123k14142399




123k14142399









asked Apr 12 at 18:13









Masoud

71




71







  • 1




    Please tell us more about what you mean by template matching, ideally with examples.
    – 200_success
    Apr 12 at 21:01










  • Don't forget to mention the compute capability of your GPU and the CUDA API version as well.
    – xorz57
    Apr 13 at 0:58












  • 1




    Please tell us more about what you mean by template matching, ideally with examples.
    – 200_success
    Apr 12 at 21:01










  • Don't forget to mention the compute capability of your GPU and the CUDA API version as well.
    – xorz57
    Apr 13 at 0:58







1




1




Please tell us more about what you mean by template matching, ideally with examples.
– 200_success
Apr 12 at 21:01




Please tell us more about what you mean by template matching, ideally with examples.
– 200_success
Apr 12 at 21:01












Don't forget to mention the compute capability of your GPU and the CUDA API version as well.
– xorz57
Apr 13 at 0:58




Don't forget to mention the compute capability of your GPU and the CUDA API version as well.
– xorz57
Apr 13 at 0:58















active

oldest

votes











Your Answer




StackExchange.ifUsing("editor", function ()
return StackExchange.using("mathjaxEditing", function ()
StackExchange.MarkdownEditor.creationCallbacks.add(function (editor, postfix)
StackExchange.mathjaxEditing.prepareWmdForMathJax(editor, postfix, [["\$", "\$"]]);
);
);
, "mathjax-editing");

StackExchange.ifUsing("editor", function ()
StackExchange.using("externalEditor", function ()
StackExchange.using("snippets", function ()
StackExchange.snippets.init();
);
);
, "code-snippets");

StackExchange.ready(function()
var channelOptions =
tags: "".split(" "),
id: "196"
;
initTagRenderer("".split(" "), "".split(" "), channelOptions);

StackExchange.using("externalEditor", function()
// Have to fire editor after snippets, if snippets enabled
if (StackExchange.settings.snippets.snippetsEnabled)
StackExchange.using("snippets", function()
createEditor();
);

else
createEditor();

);

function createEditor()
StackExchange.prepareEditor(
heartbeatType: 'answer',
convertImagesToLinks: false,
noModals: false,
showLowRepImageUploadWarning: true,
reputationToPostImages: null,
bindNavPrevention: true,
postfix: "",
onDemand: true,
discardSelector: ".discard-answer"
,immediatelyShowMarkdownHelp:true
);



);








 

draft saved


draft discarded


















StackExchange.ready(
function ()
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fcodereview.stackexchange.com%2fquestions%2f191904%2fcuda-template-matching-program%23new-answer', 'question_page');

);

Post as a guest



































active

oldest

votes













active

oldest

votes









active

oldest

votes






active

oldest

votes










 

draft saved


draft discarded


























 


draft saved


draft discarded














StackExchange.ready(
function ()
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fcodereview.stackexchange.com%2fquestions%2f191904%2fcuda-template-matching-program%23new-answer', 'question_page');

);

Post as a guest













































































Popular posts from this blog

Greedy Best First Search implementation in Rust

Function to Return a JSON Like Objects Using VBA Collections and Arrays

C++11 CLH Lock Implementation