RGB to HSV image conversion with CUDA, with branching and branchless versions

.everyoneloves__top-leaderboard:empty,.everyoneloves__mid-leaderboard:empty margin-bottom:0;

up vote
0
down vote

favorite

I have created two CUDA kernels for converting a 3 channel 16-bit RGB image to a 3 channel 16-bit HSV image. The first kernel below performs the usual algorithm in a fairly straight forward manner as described here and here and uses branching.

__global__ void DC_rgb16ToHsv16(const uint16_t *dvc_rgb16,
 size_t rgbPitch,
 size_t W, size_t H,
 uint16_t *dvc_hsv16,
 size_t hsvPitch) 
 const int col = blockIdx.x*blockDim.x + threadIdx.x;
 const int row = blockIdx.y*blockDim.y + threadIdx.y;
 if (row < H && col < W) 
 const size_t rgbPitch16 = rgbPitch/2; // pitch in 16-bit words
 const uint16_t *inpix = dvc_rgb16 + row*rgbPitch16 + 3*col;

 constexpr uint16_t uint16_max = 65535;
 constexpr float norm = 1.0f/uint16_max; // normalize RGB input
 const float nR = inpix[0]*norm;
 const float nG = inpix[1]*norm;
 const float nB = inpix[2]*norm;

 const float nV = fmaxf(nR, fmaxf(nG, nB)); // V = max channel value

 const float cmin = fminf(nR, fminf(nG, nB));
 const float delta = nV - cmin;
 const float nS = nV > 0 ? delta / nV : 0.0f; // S = saturation level
 float nH = 0.0; // use H=S=V=0 for black
 if (nV > 0 && delta != 0.0) // not black (hue undefined) nor gray 
 const float invDelta = 1.0/delta;
 const float nCr = (nV - nR) * invDelta;
 const float nCg = (nV - nG) * invDelta; 
 const float nCb = (nV - nB) * invDelta; 
 if (nR == nV)
 nH = nCb - nCg; // may be negative
 else if (nG == nV)
 nH = 2.0f + nCr - nCb;
 else if (nB == nV)
 nH = 4.0f + nCg - nCr; 
 nH *= 1.0f/6;
 if (nH < 0.0)
 nH += 1.0f;
 

 const uint16_t H = uint16_t(lroundf(nH*uint16_max));
 const uint16_t S = uint16_t(lroundf(nS*uint16_max));
 const uint16_t V = uint16_t(lroundf(nV*uint16_max));

 const size_t hsvPitch16 = hsvPitch/2;
 uint16_t *outpix = dvc_hsv16 + row*hsvPitch16 + 3*col;
 outpix[0] = H;
 outpix[1] = S;
 outpix[2] = V;

Knowing the branchless, vectorized code should perform better (especially on a SIMD architecture like CUDA) I followed the idea found
here to implement my hopefully "faster" kernel as listed below.
There is one unavoidable branch in the outer if-statement and I was hoping the use of a ternary operator would result in a condition move (as opposed to a branch).

__global__ void DC_fast_rgb16ToHsv16(const uint16_t *dvc_rgb16,
 size_t rgbPitch,
 size_t W, size_t H,
 uint16_t *dvc_hsv16,
 size_t hsvPitch) 
 const int col = blockIdx.x*blockDim.x + threadIdx.x;
 const int row = blockIdx.y*blockDim.y + threadIdx.y;
 if (row < H && col < W) 
 const size_t rgbPitch16 = rgbPitch/2; // pitch in 16-bit words
 const uint16_t *inpix = dvc_rgb16 + row*rgbPitch16 + 3*col;

 const uint16_t R = inpix[0];
 const uint16_t G = inpix[1];
 const uint16_t B = inpix[2];
 constexpr uint16_t uint16_max = 65535;
 constexpr float norm = 1.0f/uint16_max; // normalize RGB input
 const float4 rgb = make_float4(float(R)*norm,float(G)*norm,float(B)*norm, 0.0);

 const float4 K = make_float4(0.0, -1.0/3.0, 2.0/3.0, -1.0);
 const float4 p = 
 rgb.y < rgb.z ? make_float4(rgb.z, rgb.y, K.w, K.z) : make_float4(rgb.y, rgb.z, K.x, K.y);
 const float4 q =
 rgb.x < p.x ? make_float4(p.x, p.y, p.w, rgb.x) : make_float4(rgb.x, p.y, p.z, p.x);
 const float d = q.x - fminf(q.w, q.y);
 const float e = 1.0e-10;
 const float4 hsv = make_float4(fabsf(q.z + (q.w - q.y)/(6.0*d + e)), d/(q.x + e), q.x, 0.0);

 const size_t hsvPitch16 = hsvPitch/2;
 uint16_t *outpix = dvc_hsv16 + row*hsvPitch16 + 3*col;
 outpix[0] = uint16_t(lroundf(hsv.x*uint16_max));
 outpix[1] = uint16_t(lroundf(hsv.y*uint16_max));
 outpix[2] = uint16_t(lroundf(hsv.z*uint16_max));

What I have found in early testing is that the "branchless" version is almost 1.5x slower! Anyway, I thought I would post this here to see if there are any CUDA experts that can explain this. I can always try to analyze the generated PTX code, but I haven't attempted this yet.

edited May 25 at 19:39

200_success

123k14143399

asked May 25 at 17:58

wcochran

1023

Just to clarify, both programs accomplish the same task, but just in a different way?
â€“Â Phrancis
May 25 at 18:09

Are you seeking open-ended suggestions to improve your code? Or are you asking specifically for an explanation of these functions' performance?
â€“Â 200_success
May 25 at 18:13

@PhrancisYes -- both kernels are (should be) equivalent.
â€“Â wcochran
May 25 at 19:29

@200_success I am seeking for the fastest 16-bit RGB to HSV conversion in CUDA. In the process, I am wondering why the branchless version is slower. I would welcome any insights or suggestions.
â€“Â wcochran
May 25 at 19:31

Voted down -- wow this community is harsh.
â€“Â wcochran
May 25 at 19:33

Â |Â
show 2 more comments

up vote
0
down vote

favorite

__global__ void DC_rgb16ToHsv16(const uint16_t *dvc_rgb16,
 size_t rgbPitch,
 size_t W, size_t H,
 uint16_t *dvc_hsv16,
 size_t hsvPitch) 
 const int col = blockIdx.x*blockDim.x + threadIdx.x;
 const int row = blockIdx.y*blockDim.y + threadIdx.y;
 if (row < H && col < W) 
 const size_t rgbPitch16 = rgbPitch/2; // pitch in 16-bit words
 const uint16_t *inpix = dvc_rgb16 + row*rgbPitch16 + 3*col;

 constexpr uint16_t uint16_max = 65535;
 constexpr float norm = 1.0f/uint16_max; // normalize RGB input
 const float nR = inpix[0]*norm;
 const float nG = inpix[1]*norm;
 const float nB = inpix[2]*norm;

 const float nV = fmaxf(nR, fmaxf(nG, nB)); // V = max channel value

 const float cmin = fminf(nR, fminf(nG, nB));
 const float delta = nV - cmin;
 const float nS = nV > 0 ? delta / nV : 0.0f; // S = saturation level
 float nH = 0.0; // use H=S=V=0 for black
 if (nV > 0 && delta != 0.0) // not black (hue undefined) nor gray 
 const float invDelta = 1.0/delta;
 const float nCr = (nV - nR) * invDelta;
 const float nCg = (nV - nG) * invDelta; 
 const float nCb = (nV - nB) * invDelta; 
 if (nR == nV)
 nH = nCb - nCg; // may be negative
 else if (nG == nV)
 nH = 2.0f + nCr - nCb;
 else if (nB == nV)
 nH = 4.0f + nCg - nCr; 
 nH *= 1.0f/6;
 if (nH < 0.0)
 nH += 1.0f;
 

 const uint16_t H = uint16_t(lroundf(nH*uint16_max));
 const uint16_t S = uint16_t(lroundf(nS*uint16_max));
 const uint16_t V = uint16_t(lroundf(nV*uint16_max));

 const size_t hsvPitch16 = hsvPitch/2;
 uint16_t *outpix = dvc_hsv16 + row*hsvPitch16 + 3*col;
 outpix[0] = H;
 outpix[1] = S;
 outpix[2] = V;

__global__ void DC_fast_rgb16ToHsv16(const uint16_t *dvc_rgb16,
 size_t rgbPitch,
 size_t W, size_t H,
 uint16_t *dvc_hsv16,
 size_t hsvPitch) 
 const int col = blockIdx.x*blockDim.x + threadIdx.x;
 const int row = blockIdx.y*blockDim.y + threadIdx.y;
 if (row < H && col < W) 
 const size_t rgbPitch16 = rgbPitch/2; // pitch in 16-bit words
 const uint16_t *inpix = dvc_rgb16 + row*rgbPitch16 + 3*col;

 const uint16_t R = inpix[0];
 const uint16_t G = inpix[1];
 const uint16_t B = inpix[2];
 constexpr uint16_t uint16_max = 65535;
 constexpr float norm = 1.0f/uint16_max; // normalize RGB input
 const float4 rgb = make_float4(float(R)*norm,float(G)*norm,float(B)*norm, 0.0);

 const float4 K = make_float4(0.0, -1.0/3.0, 2.0/3.0, -1.0);
 const float4 p = 
 rgb.y < rgb.z ? make_float4(rgb.z, rgb.y, K.w, K.z) : make_float4(rgb.y, rgb.z, K.x, K.y);
 const float4 q =
 rgb.x < p.x ? make_float4(p.x, p.y, p.w, rgb.x) : make_float4(rgb.x, p.y, p.z, p.x);
 const float d = q.x - fminf(q.w, q.y);
 const float e = 1.0e-10;
 const float4 hsv = make_float4(fabsf(q.z + (q.w - q.y)/(6.0*d + e)), d/(q.x + e), q.x, 0.0);

 const size_t hsvPitch16 = hsvPitch/2;
 uint16_t *outpix = dvc_hsv16 + row*hsvPitch16 + 3*col;
 outpix[0] = uint16_t(lroundf(hsv.x*uint16_max));
 outpix[1] = uint16_t(lroundf(hsv.y*uint16_max));
 outpix[2] = uint16_t(lroundf(hsv.z*uint16_max));

edited May 25 at 19:39

200_success

123k14143399

asked May 25 at 17:58

wcochran

1023

Just to clarify, both programs accomplish the same task, but just in a different way?
â€“Â Phrancis
May 25 at 18:09

Are you seeking open-ended suggestions to improve your code? Or are you asking specifically for an explanation of these functions' performance?
â€“Â 200_success
May 25 at 18:13

@PhrancisYes -- both kernels are (should be) equivalent.
â€“Â wcochran
May 25 at 19:29

@200_success I am seeking for the fastest 16-bit RGB to HSV conversion in CUDA. In the process, I am wondering why the branchless version is slower. I would welcome any insights or suggestions.
â€“Â wcochran
May 25 at 19:31

Voted down -- wow this community is harsh.
â€“Â wcochran
May 25 at 19:33

Â |Â
show 2 more comments

up vote
0
down vote

favorite

__global__ void DC_rgb16ToHsv16(const uint16_t *dvc_rgb16,
 size_t rgbPitch,
 size_t W, size_t H,
 uint16_t *dvc_hsv16,
 size_t hsvPitch) 
 const int col = blockIdx.x*blockDim.x + threadIdx.x;
 const int row = blockIdx.y*blockDim.y + threadIdx.y;
 if (row < H && col < W) 
 const size_t rgbPitch16 = rgbPitch/2; // pitch in 16-bit words
 const uint16_t *inpix = dvc_rgb16 + row*rgbPitch16 + 3*col;

 constexpr uint16_t uint16_max = 65535;
 constexpr float norm = 1.0f/uint16_max; // normalize RGB input
 const float nR = inpix[0]*norm;
 const float nG = inpix[1]*norm;
 const float nB = inpix[2]*norm;

 const float nV = fmaxf(nR, fmaxf(nG, nB)); // V = max channel value

 const float cmin = fminf(nR, fminf(nG, nB));
 const float delta = nV - cmin;
 const float nS = nV > 0 ? delta / nV : 0.0f; // S = saturation level
 float nH = 0.0; // use H=S=V=0 for black
 if (nV > 0 && delta != 0.0) // not black (hue undefined) nor gray 
 const float invDelta = 1.0/delta;
 const float nCr = (nV - nR) * invDelta;
 const float nCg = (nV - nG) * invDelta; 
 const float nCb = (nV - nB) * invDelta; 
 if (nR == nV)
 nH = nCb - nCg; // may be negative
 else if (nG == nV)
 nH = 2.0f + nCr - nCb;
 else if (nB == nV)
 nH = 4.0f + nCg - nCr; 
 nH *= 1.0f/6;
 if (nH < 0.0)
 nH += 1.0f;
 

 const uint16_t H = uint16_t(lroundf(nH*uint16_max));
 const uint16_t S = uint16_t(lroundf(nS*uint16_max));
 const uint16_t V = uint16_t(lroundf(nV*uint16_max));

 const size_t hsvPitch16 = hsvPitch/2;
 uint16_t *outpix = dvc_hsv16 + row*hsvPitch16 + 3*col;
 outpix[0] = H;
 outpix[1] = S;
 outpix[2] = V;

__global__ void DC_fast_rgb16ToHsv16(const uint16_t *dvc_rgb16,
 size_t rgbPitch,
 size_t W, size_t H,
 uint16_t *dvc_hsv16,
 size_t hsvPitch) 
 const int col = blockIdx.x*blockDim.x + threadIdx.x;
 const int row = blockIdx.y*blockDim.y + threadIdx.y;
 if (row < H && col < W) 
 const size_t rgbPitch16 = rgbPitch/2; // pitch in 16-bit words
 const uint16_t *inpix = dvc_rgb16 + row*rgbPitch16 + 3*col;

 const uint16_t R = inpix[0];
 const uint16_t G = inpix[1];
 const uint16_t B = inpix[2];
 constexpr uint16_t uint16_max = 65535;
 constexpr float norm = 1.0f/uint16_max; // normalize RGB input
 const float4 rgb = make_float4(float(R)*norm,float(G)*norm,float(B)*norm, 0.0);

 const float4 K = make_float4(0.0, -1.0/3.0, 2.0/3.0, -1.0);
 const float4 p = 
 rgb.y < rgb.z ? make_float4(rgb.z, rgb.y, K.w, K.z) : make_float4(rgb.y, rgb.z, K.x, K.y);
 const float4 q =
 rgb.x < p.x ? make_float4(p.x, p.y, p.w, rgb.x) : make_float4(rgb.x, p.y, p.z, p.x);
 const float d = q.x - fminf(q.w, q.y);
 const float e = 1.0e-10;
 const float4 hsv = make_float4(fabsf(q.z + (q.w - q.y)/(6.0*d + e)), d/(q.x + e), q.x, 0.0);

 const size_t hsvPitch16 = hsvPitch/2;
 uint16_t *outpix = dvc_hsv16 + row*hsvPitch16 + 3*col;
 outpix[0] = uint16_t(lroundf(hsv.x*uint16_max));
 outpix[1] = uint16_t(lroundf(hsv.y*uint16_max));
 outpix[2] = uint16_t(lroundf(hsv.z*uint16_max));

edited May 25 at 19:39

200_success

123k14143399

asked May 25 at 17:58

wcochran

1023

__global__ void DC_rgb16ToHsv16(const uint16_t *dvc_rgb16,
 size_t rgbPitch,
 size_t W, size_t H,
 uint16_t *dvc_hsv16,
 size_t hsvPitch) 
 const int col = blockIdx.x*blockDim.x + threadIdx.x;
 const int row = blockIdx.y*blockDim.y + threadIdx.y;
 if (row < H && col < W) 
 const size_t rgbPitch16 = rgbPitch/2; // pitch in 16-bit words
 const uint16_t *inpix = dvc_rgb16 + row*rgbPitch16 + 3*col;

 constexpr uint16_t uint16_max = 65535;
 constexpr float norm = 1.0f/uint16_max; // normalize RGB input
 const float nR = inpix[0]*norm;
 const float nG = inpix[1]*norm;
 const float nB = inpix[2]*norm;

 const float nV = fmaxf(nR, fmaxf(nG, nB)); // V = max channel value

 const float cmin = fminf(nR, fminf(nG, nB));
 const float delta = nV - cmin;
 const float nS = nV > 0 ? delta / nV : 0.0f; // S = saturation level
 float nH = 0.0; // use H=S=V=0 for black
 if (nV > 0 && delta != 0.0) // not black (hue undefined) nor gray 
 const float invDelta = 1.0/delta;
 const float nCr = (nV - nR) * invDelta;
 const float nCg = (nV - nG) * invDelta; 
 const float nCb = (nV - nB) * invDelta; 
 if (nR == nV)
 nH = nCb - nCg; // may be negative
 else if (nG == nV)
 nH = 2.0f + nCr - nCb;
 else if (nB == nV)
 nH = 4.0f + nCg - nCr; 
 nH *= 1.0f/6;
 if (nH < 0.0)
 nH += 1.0f;
 

 const uint16_t H = uint16_t(lroundf(nH*uint16_max));
 const uint16_t S = uint16_t(lroundf(nS*uint16_max));
 const uint16_t V = uint16_t(lroundf(nV*uint16_max));

 const size_t hsvPitch16 = hsvPitch/2;
 uint16_t *outpix = dvc_hsv16 + row*hsvPitch16 + 3*col;
 outpix[0] = H;
 outpix[1] = S;
 outpix[2] = V;

__global__ void DC_fast_rgb16ToHsv16(const uint16_t *dvc_rgb16,
 size_t rgbPitch,
 size_t W, size_t H,
 uint16_t *dvc_hsv16,
 size_t hsvPitch) 
 const int col = blockIdx.x*blockDim.x + threadIdx.x;
 const int row = blockIdx.y*blockDim.y + threadIdx.y;
 if (row < H && col < W) 
 const size_t rgbPitch16 = rgbPitch/2; // pitch in 16-bit words
 const uint16_t *inpix = dvc_rgb16 + row*rgbPitch16 + 3*col;

 const uint16_t R = inpix[0];
 const uint16_t G = inpix[1];
 const uint16_t B = inpix[2];
 constexpr uint16_t uint16_max = 65535;
 constexpr float norm = 1.0f/uint16_max; // normalize RGB input
 const float4 rgb = make_float4(float(R)*norm,float(G)*norm,float(B)*norm, 0.0);

 const float4 K = make_float4(0.0, -1.0/3.0, 2.0/3.0, -1.0);
 const float4 p = 
 rgb.y < rgb.z ? make_float4(rgb.z, rgb.y, K.w, K.z) : make_float4(rgb.y, rgb.z, K.x, K.y);
 const float4 q =
 rgb.x < p.x ? make_float4(p.x, p.y, p.w, rgb.x) : make_float4(rgb.x, p.y, p.z, p.x);
 const float d = q.x - fminf(q.w, q.y);
 const float e = 1.0e-10;
 const float4 hsv = make_float4(fabsf(q.z + (q.w - q.y)/(6.0*d + e)), d/(q.x + e), q.x, 0.0);

 const size_t hsvPitch16 = hsvPitch/2;
 uint16_t *outpix = dvc_hsv16 + row*hsvPitch16 + 3*col;
 outpix[0] = uint16_t(lroundf(hsv.x*uint16_max));
 outpix[1] = uint16_t(lroundf(hsv.y*uint16_max));
 outpix[2] = uint16_t(lroundf(hsv.z*uint16_max));

edited May 25 at 19:39

200_success

123k14143399

asked May 25 at 17:58

wcochran

1023

edited May 25 at 19:39

200_success

123k14143399

edited May 25 at 19:39

200_success

123k14143399

edited May 25 at 19:39

200_success

123k14143399

asked May 25 at 17:58

wcochran

1023

asked May 25 at 17:58

wcochran

1023

asked May 25 at 17:58

wcochran

1023

Just to clarify, both programs accomplish the same task, but just in a different way?
â€“Â Phrancis
May 25 at 18:09

Are you seeking open-ended suggestions to improve your code? Or are you asking specifically for an explanation of these functions' performance?
â€“Â 200_success
May 25 at 18:13

@PhrancisYes -- both kernels are (should be) equivalent.
â€“Â wcochran
May 25 at 19:29

@200_success I am seeking for the fastest 16-bit RGB to HSV conversion in CUDA. In the process, I am wondering why the branchless version is slower. I would welcome any insights or suggestions.
â€“Â wcochran
May 25 at 19:31

Voted down -- wow this community is harsh.
â€“Â wcochran
May 25 at 19:33

Â |Â
show 2 more comments

Just to clarify, both programs accomplish the same task, but just in a different way?
â€“Â Phrancis
May 25 at 18:09

Are you seeking open-ended suggestions to improve your code? Or are you asking specifically for an explanation of these functions' performance?
â€“Â 200_success
May 25 at 18:13

@PhrancisYes -- both kernels are (should be) equivalent.
â€“Â wcochran
May 25 at 19:29

@200_success I am seeking for the fastest 16-bit RGB to HSV conversion in CUDA. In the process, I am wondering why the branchless version is slower. I would welcome any insights or suggestions.
â€“Â wcochran
May 25 at 19:31

Voted down -- wow this community is harsh.
â€“Â wcochran
May 25 at 19:33

Just to clarify, both programs accomplish the same task, but just in a different way?
â€“Â Phrancis
May 25 at 18:09

Are you seeking open-ended suggestions to improve your code? Or are you asking specifically for an explanation of these functions' performance?
â€“Â 200_success
May 25 at 18:13

@PhrancisYes -- both kernels are (should be) equivalent.
â€“Â wcochran
May 25 at 19:29

@200_success I am seeking for the fastest 16-bit RGB to HSV conversion in CUDA. In the process, I am wondering why the branchless version is slower. I would welcome any insights or suggestions.
â€“Â wcochran
May 25 at 19:31

Voted down -- wow this community is harsh.
â€“Â wcochran
May 25 at 19:33

Â |Â
show 2 more comments

active

oldest

votes

Your Answer

StackExchange.ifUsing("editor", function ()
return StackExchange.using("mathjaxEditing", function ()
StackExchange.MarkdownEditor.creationCallbacks.add(function (editor, postfix)
StackExchange.mathjaxEditing.prepareWmdForMathJax(editor, postfix, [["\$", "\$"]]);
);
);
, "mathjax-editing");

StackExchange.ifUsing("editor", function ()
StackExchange.using("externalEditor", function ()
StackExchange.using("snippets", function ()
StackExchange.snippets.init();
);
);
, "code-snippets");

StackExchange.ready(function()
var channelOptions =
tags: "".split(" "),
id: "196"
;
initTagRenderer("".split(" "), "".split(" "), channelOptions);

StackExchange.using("externalEditor", function()
// Have to fire editor after snippets, if snippets enabled
if (StackExchange.settings.snippets.snippetsEnabled)
StackExchange.using("snippets", function()
createEditor();
);

else
createEditor();

);

function createEditor()
StackExchange.prepareEditor(
heartbeatType: 'answer',
convertImagesToLinks: false,
noModals: false,
showLowRepImageUploadWarning: true,
reputationToPostImages: null,
bindNavPrevention: true,
postfix: "",
onDemand: true,
discardSelector: ".discard-answer"
,immediatelyShowMarkdownHelp:true
);

);

draft saved

draft discarded

StackExchange.ready(
function ()
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fcodereview.stackexchange.com%2fquestions%2f195176%2frgb-to-hsv-image-conversion-with-cuda-with-branching-and-branchless-versions%23new-answer', 'question_page');

);

Post as a guest

Name

active

oldest

votes

draft saved

draft discarded

draft saved

draft discarded

Post as a guest

Name

搜尋此網誌

trjhtr