CUDA Template-matching program
I have started learning CUDA programming. As my first project, I tried to implement a simple template-matching program. Since I am very new to CUDA and parallel programming, I am not sure how to improve and optimize this code to get the most out of CUDA and parallel execution.
Here is the code I have written so far:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
#include <math.h>
#include <chrono>
void populate(uint64_t *img, int height, int width, uint8_t val)
uint64_t * ptr = img;
for (int h = 0; h < height; h++)
for (int w = 0; w < width; w++)
*ptr = (h*width) + w;
ptr++;
__global__
void addRow(uint64_t *Img, int height, int width, uint64_t *L1, uint64_t *L2, uint64_t *Lx, uint64_t *Ly)
int row_start = threadIdx.x * width;
uint64_t *img = &Img[row_start];
uint64_t *l1 = &L1[row_start];
uint64_t *l2 = &L2[row_start];
uint64_t *lx = &Lx[row_start];
uint64_t *ly = &Ly[row_start];
for (int i = 0; i < width; i++)
if (i == 0)
l1[i] = img[i];
l2[i] = img[i] * img[i];
else
l1[i] = l1[i - 1] + img[i];
l2[i] = l2[i - 1] + (img[i] * img[i]);
lx[i] = lx[i-1] + img[i] * i;
ly[i] = ly[i - 1] + img[i] * threadIdx.x;
return;
__global__
void addColumn(uint64_t *Img, int height, int width, uint64_t *L1, uint64_t *L2, uint64_t *Lx, uint64_t *Ly)
int column_start = threadIdx.x;
uint64_t *img = &Img[column_start];
uint64_t *l1 = &L1[column_start];
uint64_t *l2 = &L2[column_start];
uint64_t *lx = &Lx[column_start];
uint64_t *ly = &Ly[column_start];
int pre_rd = 0;
int rd = 0;
for (int i = 0; i < height; i++)
if (i != 0)
l1[rd] += l1[pre_rd];
l2[rd] += l2[pre_rd];
lx[rd] += lx[pre_rd];
ly[rd] += ly[pre_rd];
pre_rd = rd;
rd += width;
return;
void calculateTemplateFeatures(uint64_t *L1, uint64_t *L2, uint64_t *Lx, uint64_t *Ly, int blk_size, float *V1, float *V2, float *V3, float *V4)
int a = 0;
int b = blk_size - 1;
int c = blk_size * (blk_size - 1);
int d = (blk_size * blk_size) - 1;
float blk_hlf = (float)(blk_size - 1) / 2;
int blk_pow = blk_size * blk_size;
*V1 = (L1[d] + L1[a]) - (L1[b] + L1[c]);
*V2 = (L2[d] + L2[a]) - (L2[b] + L2[c]);
*V3 = (Lx[d] + Lx[a]) - (Lx[b] + Lx[c]);
*V4 = (Ly[d] + Ly[a]) - (Ly[b] + Ly[c]);
*V3 = (*V3 - blk_hlf * *V1) / (blk_pow * blk_size);
*V4 = (*V4 - blk_hlf * *V1) / (blk_pow * blk_size);
*V1 = *V1 / blk_pow;
*V2 = (*V2 / blk_pow) - (*V1 * *V1);
__global__
void calculateFeatures(uint64_t *L1, uint64_t *L2, uint64_t *Lx, uint64_t *Ly, int height, int width, int blk_size, float *V1, float *V2, float *V3, float *V4, float TV1, float TV2, float TV3, float TV4, float *differences)
int findMin(float *diffs,int size)
int min = diffs[0];
int min_idx = 0;
for (int i = 1; i < size; i++)
if (min > diffs[i])
min_idx = i;
min = diffs[i];
return min_idx;
int main(void)
int height = 16;
int width = 16;
int tmplate_blk_size = 4;
int features_width = width - tmplate_blk_size + 1;
int features_height = height - tmplate_blk_size + 1;
uint64_t *Img, *L1, *L2, *Lx, *Ly;
uint64_t *Template, *TL1, *TL2, *TLx, *TLy;
float *V1, *V2, *V3, *V4; //Features of input image
float TV1, TV2, TV3, TV4; //Features of template image
float *Differences; //Stores distance between image features and template features
// Allocate Unified Memory for input image â accessible from CPU or GPU
int image_size = height * width;
int feature_size = features_width * features_height;
cudaMallocManaged(&Img, image_size * sizeof(uint64_t));
cudaMallocManaged(&L1, image_size * sizeof(uint64_t));
cudaMallocManaged(&L2, image_size * sizeof(uint64_t));
cudaMallocManaged(&Lx, image_size * sizeof(uint64_t));
cudaMallocManaged(&Ly, image_size * sizeof(uint64_t));
cudaMallocManaged(&V1, feature_size * sizeof(float));
cudaMallocManaged(&V2, feature_size * sizeof(float));
cudaMallocManaged(&V3, feature_size * sizeof(float));
cudaMallocManaged(&V4, feature_size * sizeof(float));
cudaMallocManaged(&Differences, feature_size * sizeof(float));
//Allocate Unified Memory for template image â accessible from CPU or GPU
int template_size = tmplate_blk_size * tmplate_blk_size;
cudaMallocManaged(&Template, template_size * sizeof(uint64_t));
cudaMallocManaged(&TL1, template_size * sizeof(uint64_t));
cudaMallocManaged(&TL2, template_size * sizeof(uint64_t));
cudaMallocManaged(&TLx, template_size * sizeof(uint64_t));
cudaMallocManaged(&TLy, template_size * sizeof(uint64_t));
populate(Img, height, width, 1);
populate(Template, tmplate_blk_size, tmplate_blk_size, 1);
addRow<<<1, height>>>(Img, height, width, L1, L2, Lx, Ly);
addRow<<<1, tmplate_blk_size>>>(Template, tmplate_blk_size, tmplate_blk_size, TL1, TL2, TLx, TLy);
cudaDeviceSynchronize();
addColumn<<<1, width>>>(Img, height, width, L1, L2, Lx, Ly);
addColumn<<<1, tmplate_blk_size >>>(Template, tmplate_blk_size, tmplate_blk_size, TL1, TL2, TLx, TLy);
cudaDeviceSynchronize();
//Calculate all features
//For template, we only need one value for each feature so it's better and faster to run it on CPU
calculateTemplateFeatures(TL1, TL2, TLx, TLy, tmplate_blk_size, &TV1, &TV2, &TV3, &TV4);
dim3 threads(height, width);
calculateFeatures<<<1, threads>>>(L1, L2, Lx, Ly, height, width, tmplate_blk_size, V1, V2, V3, V4, TV1, TV2, TV3, TV4, Differences);
cudaDeviceSynchronize();
// Free memory
cudaFree(Img);
cudaFree(L1);
cudaFree(L2);
cudaFree(Lx);
cudaFree(Ly);
cudaFree(V1);
cudaFree(V2);
cudaFree(V3);
cudaFree(V4);
cudaFree(Template);
cudaFree(TL1);
cudaFree(TL2);
cudaFree(TLx);
cudaFree(TLy);
cudaFree(Differences);
return 0;
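The body of calculateFeatures is not included in the post, so the intended distance computation is unknown. As a rough illustration only, here is a minimal sketch of what such a kernel could look like, assuming one thread per candidate window position, the same corner combination and normalisation as calculateTemplateFeatures, and a sum of squared feature differences written to differences. The kernel name, indexing, and distance formula are assumptions, not the author's code.

#include <cstdint>
#include "cuda_runtime.h"

// Hypothetical sketch: one thread per (row, col) window position,
// matching the <<<1, dim3(height, width)>>> launch used in main.
__global__
void calculateFeaturesSketch(const uint64_t *L1, const uint64_t *L2,
                             const uint64_t *Lx, const uint64_t *Ly,
                             int height, int width, int blk_size,
                             float *V1, float *V2, float *V3, float *V4,
                             float TV1, float TV2, float TV3, float TV4,
                             float *differences)
{
    int row = threadIdx.x;
    int col = threadIdx.y;
    int fw = width - blk_size + 1;
    int fh = height - blk_size + 1;
    if (row >= fh || col >= fw) return;     // window would fall off the image

    // Four corners of the blk_size x blk_size window in the integral images,
    // mirroring the a/b/c/d indices of calculateTemplateFeatures.
    int a = row * width + col;                                      // top-left
    int b = row * width + (col + blk_size - 1);                     // top-right
    int c = (row + blk_size - 1) * width + col;                     // bottom-left
    int d = (row + blk_size - 1) * width + (col + blk_size - 1);    // bottom-right

    float blk_hlf = (float)(blk_size - 1) / 2;
    int blk_pow = blk_size * blk_size;

    float v1 = (float)((L1[d] + L1[a]) - (L1[b] + L1[c]));
    float v2 = (float)((L2[d] + L2[a]) - (L2[b] + L2[c]));
    float v3 = (float)((Lx[d] + Lx[a]) - (Lx[b] + Lx[c]));
    float v4 = (float)((Ly[d] + Ly[a]) - (Ly[b] + Ly[c]));

    // Same normalisation as calculateTemplateFeatures.
    v3 = (v3 - blk_hlf * v1) / (blk_pow * blk_size);
    v4 = (v4 - blk_hlf * v1) / (blk_pow * blk_size);
    v1 = v1 / blk_pow;
    v2 = (v2 / blk_pow) - (v1 * v1);

    int out = row * fw + col;
    V1[out] = v1; V2[out] = v2; V3[out] = v3; V4[out] = v4;

    // Assumed distance metric: sum of squared differences to the template features.
    float d1 = v1 - TV1, d2 = v2 - TV2, d3 = v3 - TV3, d4 = v4 - TV4;
    differences[out] = d1 * d1 + d2 * d2 + d3 * d3 + d4 * d4;
}

With that layout, the index returned by findMin(Differences, feature_size) would map back to a window position as (min_idx / features_width, min_idx % features_width).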
Tags: performance, beginner, c, cuda
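A general point for any CUDA code like the above: none of the cudaMallocManaged calls or kernel launches are checked for errors, so failures pass silently. A minimal checking pattern using only standard CUDA runtime calls (the macro name is a placeholder of my choosing):

#include <cstdio>
#include <cstdlib>
#include "cuda_runtime.h"

// Hypothetical helper macro; any name would do.
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err__ = (call);                                        \
        if (err__ != cudaSuccess) {                                        \
            std::fprintf(stderr, "CUDA error %s at %s:%d\n",               \
                         cudaGetErrorString(err__), __FILE__, __LINE__);   \
            std::exit(EXIT_FAILURE);                                       \
        }                                                                  \
    } while (0)

// Usage:
//   CUDA_CHECK(cudaMallocManaged(&Img, image_size * sizeof(uint64_t)));
//   addRow<<<1, height>>>(Img, height, width, L1, L2, Lx, Ly);
//   CUDA_CHECK(cudaGetLastError());        // launch-configuration errors
//   CUDA_CHECK(cudaDeviceSynchronize());   // errors raised during kernel execution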
asked Apr 12 at 18:13 by Masoud; edited Apr 12 at 21:01 by 200_success

Please tell us more about what you mean by template matching, ideally with examples. – 200_success, Apr 12 at 21:01

Don't forget to mention the compute capability of your GPU and the CUDA API version as well. – xorz57, Apr 13 at 0:58
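For reference, the compute capability and CUDA runtime version that xorz57 asks about can also be queried programmatically; a minimal standalone sketch using the standard runtime API:

#include <cstdio>
#include "cuda_runtime.h"

int main()
{
    cudaDeviceProp prop;
    int runtime_version = 0;
    cudaGetDeviceProperties(&prop, 0);        // properties of device 0
    cudaRuntimeGetVersion(&runtime_version);  // e.g. 9020 for CUDA 9.2
    std::printf("%s: compute capability %d.%d, runtime %d\n",
                prop.name, prop.major, prop.minor, runtime_version);
    return 0;
}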