RGB to HSV image conversion with CUDA, with branching and branchless versions

The name of the pictureThe name of the pictureThe name of the pictureClash Royale CLAN TAG#URR8PPP





.everyoneloves__top-leaderboard:empty,.everyoneloves__mid-leaderboard:empty margin-bottom:0;







up vote
0
down vote

favorite












I have created two CUDA kernels for converting a 3 channel 16-bit RGB image to a 3 channel 16-bit HSV image. The first kernel below performs the usual algorithm in a fairly straight forward manner as described here and here and uses branching.



__global__ void DC_rgb16ToHsv16(const uint16_t *dvc_rgb16,
size_t rgbPitch,
size_t W, size_t H,
uint16_t *dvc_hsv16,
size_t hsvPitch)
const int col = blockIdx.x*blockDim.x + threadIdx.x;
const int row = blockIdx.y*blockDim.y + threadIdx.y;
if (row < H && col < W)
const size_t rgbPitch16 = rgbPitch/2; // pitch in 16-bit words
const uint16_t *inpix = dvc_rgb16 + row*rgbPitch16 + 3*col;

constexpr uint16_t uint16_max = 65535;
constexpr float norm = 1.0f/uint16_max; // normalize RGB input
const float nR = inpix[0]*norm;
const float nG = inpix[1]*norm;
const float nB = inpix[2]*norm;

const float nV = fmaxf(nR, fmaxf(nG, nB)); // V = max channel value

const float cmin = fminf(nR, fminf(nG, nB));
const float delta = nV - cmin;
const float nS = nV > 0 ? delta / nV : 0.0f; // S = saturation level
float nH = 0.0; // use H=S=V=0 for black
if (nV > 0 && delta != 0.0) // not black (hue undefined) nor gray
const float invDelta = 1.0/delta;
const float nCr = (nV - nR) * invDelta;
const float nCg = (nV - nG) * invDelta;
const float nCb = (nV - nB) * invDelta;
if (nR == nV)
nH = nCb - nCg; // may be negative
else if (nG == nV)
nH = 2.0f + nCr - nCb;
else if (nB == nV)
nH = 4.0f + nCg - nCr;
nH *= 1.0f/6;
if (nH < 0.0)
nH += 1.0f;


const uint16_t H = uint16_t(lroundf(nH*uint16_max));
const uint16_t S = uint16_t(lroundf(nS*uint16_max));
const uint16_t V = uint16_t(lroundf(nV*uint16_max));

const size_t hsvPitch16 = hsvPitch/2;
uint16_t *outpix = dvc_hsv16 + row*hsvPitch16 + 3*col;
outpix[0] = H;
outpix[1] = S;
outpix[2] = V;




Knowing the branchless, vectorized code should perform better (especially on a SIMD architecture like CUDA) I followed the idea found
here to implement my hopefully "faster" kernel as listed below.
There is one unavoidable branch in the outer if-statement and I was hoping the use of a ternary operator would result in a condition move (as opposed to a branch).



__global__ void DC_fast_rgb16ToHsv16(const uint16_t *dvc_rgb16,
size_t rgbPitch,
size_t W, size_t H,
uint16_t *dvc_hsv16,
size_t hsvPitch)
const int col = blockIdx.x*blockDim.x + threadIdx.x;
const int row = blockIdx.y*blockDim.y + threadIdx.y;
if (row < H && col < W)
const size_t rgbPitch16 = rgbPitch/2; // pitch in 16-bit words
const uint16_t *inpix = dvc_rgb16 + row*rgbPitch16 + 3*col;

const uint16_t R = inpix[0];
const uint16_t G = inpix[1];
const uint16_t B = inpix[2];
constexpr uint16_t uint16_max = 65535;
constexpr float norm = 1.0f/uint16_max; // normalize RGB input
const float4 rgb = make_float4(float(R)*norm,float(G)*norm,float(B)*norm, 0.0);

const float4 K = make_float4(0.0, -1.0/3.0, 2.0/3.0, -1.0);
const float4 p =
rgb.y < rgb.z ? make_float4(rgb.z, rgb.y, K.w, K.z) : make_float4(rgb.y, rgb.z, K.x, K.y);
const float4 q =
rgb.x < p.x ? make_float4(p.x, p.y, p.w, rgb.x) : make_float4(rgb.x, p.y, p.z, p.x);
const float d = q.x - fminf(q.w, q.y);
const float e = 1.0e-10;
const float4 hsv = make_float4(fabsf(q.z + (q.w - q.y)/(6.0*d + e)), d/(q.x + e), q.x, 0.0);

const size_t hsvPitch16 = hsvPitch/2;
uint16_t *outpix = dvc_hsv16 + row*hsvPitch16 + 3*col;
outpix[0] = uint16_t(lroundf(hsv.x*uint16_max));
outpix[1] = uint16_t(lroundf(hsv.y*uint16_max));
outpix[2] = uint16_t(lroundf(hsv.z*uint16_max));




What I have found in early testing is that the "branchless" version is almost 1.5x slower! Anyway, I thought I would post this here to see if there are any CUDA experts that can explain this. I can always try to analyze the generated PTX code, but I haven't attempted this yet.







share|improve this question





















  • Just to clarify, both programs accomplish the same task, but just in a different way?
    – Phrancis
    May 25 at 18:09










  • Are you seeking open-ended suggestions to improve your code? Or are you asking specifically for an explanation of these functions' performance?
    – 200_success
    May 25 at 18:13










  • @PhrancisYes -- both kernels are (should be) equivalent.
    – wcochran
    May 25 at 19:29










  • @200_success I am seeking for the fastest 16-bit RGB to HSV conversion in CUDA. In the process, I am wondering why the branchless version is slower. I would welcome any insights or suggestions.
    – wcochran
    May 25 at 19:31










  • Voted down -- wow this community is harsh.
    – wcochran
    May 25 at 19:33
















up vote
0
down vote

favorite












I have created two CUDA kernels for converting a 3 channel 16-bit RGB image to a 3 channel 16-bit HSV image. The first kernel below performs the usual algorithm in a fairly straight forward manner as described here and here and uses branching.



__global__ void DC_rgb16ToHsv16(const uint16_t *dvc_rgb16,
size_t rgbPitch,
size_t W, size_t H,
uint16_t *dvc_hsv16,
size_t hsvPitch)
const int col = blockIdx.x*blockDim.x + threadIdx.x;
const int row = blockIdx.y*blockDim.y + threadIdx.y;
if (row < H && col < W)
const size_t rgbPitch16 = rgbPitch/2; // pitch in 16-bit words
const uint16_t *inpix = dvc_rgb16 + row*rgbPitch16 + 3*col;

constexpr uint16_t uint16_max = 65535;
constexpr float norm = 1.0f/uint16_max; // normalize RGB input
const float nR = inpix[0]*norm;
const float nG = inpix[1]*norm;
const float nB = inpix[2]*norm;

const float nV = fmaxf(nR, fmaxf(nG, nB)); // V = max channel value

const float cmin = fminf(nR, fminf(nG, nB));
const float delta = nV - cmin;
const float nS = nV > 0 ? delta / nV : 0.0f; // S = saturation level
float nH = 0.0; // use H=S=V=0 for black
if (nV > 0 && delta != 0.0) // not black (hue undefined) nor gray
const float invDelta = 1.0/delta;
const float nCr = (nV - nR) * invDelta;
const float nCg = (nV - nG) * invDelta;
const float nCb = (nV - nB) * invDelta;
if (nR == nV)
nH = nCb - nCg; // may be negative
else if (nG == nV)
nH = 2.0f + nCr - nCb;
else if (nB == nV)
nH = 4.0f + nCg - nCr;
nH *= 1.0f/6;
if (nH < 0.0)
nH += 1.0f;


const uint16_t H = uint16_t(lroundf(nH*uint16_max));
const uint16_t S = uint16_t(lroundf(nS*uint16_max));
const uint16_t V = uint16_t(lroundf(nV*uint16_max));

const size_t hsvPitch16 = hsvPitch/2;
uint16_t *outpix = dvc_hsv16 + row*hsvPitch16 + 3*col;
outpix[0] = H;
outpix[1] = S;
outpix[2] = V;




Knowing the branchless, vectorized code should perform better (especially on a SIMD architecture like CUDA) I followed the idea found
here to implement my hopefully "faster" kernel as listed below.
There is one unavoidable branch in the outer if-statement and I was hoping the use of a ternary operator would result in a condition move (as opposed to a branch).



__global__ void DC_fast_rgb16ToHsv16(const uint16_t *dvc_rgb16,
size_t rgbPitch,
size_t W, size_t H,
uint16_t *dvc_hsv16,
size_t hsvPitch)
const int col = blockIdx.x*blockDim.x + threadIdx.x;
const int row = blockIdx.y*blockDim.y + threadIdx.y;
if (row < H && col < W)
const size_t rgbPitch16 = rgbPitch/2; // pitch in 16-bit words
const uint16_t *inpix = dvc_rgb16 + row*rgbPitch16 + 3*col;

const uint16_t R = inpix[0];
const uint16_t G = inpix[1];
const uint16_t B = inpix[2];
constexpr uint16_t uint16_max = 65535;
constexpr float norm = 1.0f/uint16_max; // normalize RGB input
const float4 rgb = make_float4(float(R)*norm,float(G)*norm,float(B)*norm, 0.0);

const float4 K = make_float4(0.0, -1.0/3.0, 2.0/3.0, -1.0);
const float4 p =
rgb.y < rgb.z ? make_float4(rgb.z, rgb.y, K.w, K.z) : make_float4(rgb.y, rgb.z, K.x, K.y);
const float4 q =
rgb.x < p.x ? make_float4(p.x, p.y, p.w, rgb.x) : make_float4(rgb.x, p.y, p.z, p.x);
const float d = q.x - fminf(q.w, q.y);
const float e = 1.0e-10;
const float4 hsv = make_float4(fabsf(q.z + (q.w - q.y)/(6.0*d + e)), d/(q.x + e), q.x, 0.0);

const size_t hsvPitch16 = hsvPitch/2;
uint16_t *outpix = dvc_hsv16 + row*hsvPitch16 + 3*col;
outpix[0] = uint16_t(lroundf(hsv.x*uint16_max));
outpix[1] = uint16_t(lroundf(hsv.y*uint16_max));
outpix[2] = uint16_t(lroundf(hsv.z*uint16_max));




What I have found in early testing is that the "branchless" version is almost 1.5x slower! Anyway, I thought I would post this here to see if there are any CUDA experts that can explain this. I can always try to analyze the generated PTX code, but I haven't attempted this yet.







share|improve this question





















  • Just to clarify, both programs accomplish the same task, but just in a different way?
    – Phrancis
    May 25 at 18:09










  • Are you seeking open-ended suggestions to improve your code? Or are you asking specifically for an explanation of these functions' performance?
    – 200_success
    May 25 at 18:13










  • @PhrancisYes -- both kernels are (should be) equivalent.
    – wcochran
    May 25 at 19:29










  • @200_success I am seeking for the fastest 16-bit RGB to HSV conversion in CUDA. In the process, I am wondering why the branchless version is slower. I would welcome any insights or suggestions.
    – wcochran
    May 25 at 19:31










  • Voted down -- wow this community is harsh.
    – wcochran
    May 25 at 19:33












up vote
0
down vote

favorite









up vote
0
down vote

favorite











I have created two CUDA kernels for converting a 3 channel 16-bit RGB image to a 3 channel 16-bit HSV image. The first kernel below performs the usual algorithm in a fairly straight forward manner as described here and here and uses branching.



__global__ void DC_rgb16ToHsv16(const uint16_t *dvc_rgb16,
size_t rgbPitch,
size_t W, size_t H,
uint16_t *dvc_hsv16,
size_t hsvPitch)
const int col = blockIdx.x*blockDim.x + threadIdx.x;
const int row = blockIdx.y*blockDim.y + threadIdx.y;
if (row < H && col < W)
const size_t rgbPitch16 = rgbPitch/2; // pitch in 16-bit words
const uint16_t *inpix = dvc_rgb16 + row*rgbPitch16 + 3*col;

constexpr uint16_t uint16_max = 65535;
constexpr float norm = 1.0f/uint16_max; // normalize RGB input
const float nR = inpix[0]*norm;
const float nG = inpix[1]*norm;
const float nB = inpix[2]*norm;

const float nV = fmaxf(nR, fmaxf(nG, nB)); // V = max channel value

const float cmin = fminf(nR, fminf(nG, nB));
const float delta = nV - cmin;
const float nS = nV > 0 ? delta / nV : 0.0f; // S = saturation level
float nH = 0.0; // use H=S=V=0 for black
if (nV > 0 && delta != 0.0) // not black (hue undefined) nor gray
const float invDelta = 1.0/delta;
const float nCr = (nV - nR) * invDelta;
const float nCg = (nV - nG) * invDelta;
const float nCb = (nV - nB) * invDelta;
if (nR == nV)
nH = nCb - nCg; // may be negative
else if (nG == nV)
nH = 2.0f + nCr - nCb;
else if (nB == nV)
nH = 4.0f + nCg - nCr;
nH *= 1.0f/6;
if (nH < 0.0)
nH += 1.0f;


const uint16_t H = uint16_t(lroundf(nH*uint16_max));
const uint16_t S = uint16_t(lroundf(nS*uint16_max));
const uint16_t V = uint16_t(lroundf(nV*uint16_max));

const size_t hsvPitch16 = hsvPitch/2;
uint16_t *outpix = dvc_hsv16 + row*hsvPitch16 + 3*col;
outpix[0] = H;
outpix[1] = S;
outpix[2] = V;




Knowing the branchless, vectorized code should perform better (especially on a SIMD architecture like CUDA) I followed the idea found
here to implement my hopefully "faster" kernel as listed below.
There is one unavoidable branch in the outer if-statement and I was hoping the use of a ternary operator would result in a condition move (as opposed to a branch).



__global__ void DC_fast_rgb16ToHsv16(const uint16_t *dvc_rgb16,
size_t rgbPitch,
size_t W, size_t H,
uint16_t *dvc_hsv16,
size_t hsvPitch)
const int col = blockIdx.x*blockDim.x + threadIdx.x;
const int row = blockIdx.y*blockDim.y + threadIdx.y;
if (row < H && col < W)
const size_t rgbPitch16 = rgbPitch/2; // pitch in 16-bit words
const uint16_t *inpix = dvc_rgb16 + row*rgbPitch16 + 3*col;

const uint16_t R = inpix[0];
const uint16_t G = inpix[1];
const uint16_t B = inpix[2];
constexpr uint16_t uint16_max = 65535;
constexpr float norm = 1.0f/uint16_max; // normalize RGB input
const float4 rgb = make_float4(float(R)*norm,float(G)*norm,float(B)*norm, 0.0);

const float4 K = make_float4(0.0, -1.0/3.0, 2.0/3.0, -1.0);
const float4 p =
rgb.y < rgb.z ? make_float4(rgb.z, rgb.y, K.w, K.z) : make_float4(rgb.y, rgb.z, K.x, K.y);
const float4 q =
rgb.x < p.x ? make_float4(p.x, p.y, p.w, rgb.x) : make_float4(rgb.x, p.y, p.z, p.x);
const float d = q.x - fminf(q.w, q.y);
const float e = 1.0e-10;
const float4 hsv = make_float4(fabsf(q.z + (q.w - q.y)/(6.0*d + e)), d/(q.x + e), q.x, 0.0);

const size_t hsvPitch16 = hsvPitch/2;
uint16_t *outpix = dvc_hsv16 + row*hsvPitch16 + 3*col;
outpix[0] = uint16_t(lroundf(hsv.x*uint16_max));
outpix[1] = uint16_t(lroundf(hsv.y*uint16_max));
outpix[2] = uint16_t(lroundf(hsv.z*uint16_max));




What I have found in early testing is that the "branchless" version is almost 1.5x slower! Anyway, I thought I would post this here to see if there are any CUDA experts that can explain this. I can always try to analyze the generated PTX code, but I haven't attempted this yet.







share|improve this question













I have created two CUDA kernels for converting a 3 channel 16-bit RGB image to a 3 channel 16-bit HSV image. The first kernel below performs the usual algorithm in a fairly straight forward manner as described here and here and uses branching.



__global__ void DC_rgb16ToHsv16(const uint16_t *dvc_rgb16,
size_t rgbPitch,
size_t W, size_t H,
uint16_t *dvc_hsv16,
size_t hsvPitch)
const int col = blockIdx.x*blockDim.x + threadIdx.x;
const int row = blockIdx.y*blockDim.y + threadIdx.y;
if (row < H && col < W)
const size_t rgbPitch16 = rgbPitch/2; // pitch in 16-bit words
const uint16_t *inpix = dvc_rgb16 + row*rgbPitch16 + 3*col;

constexpr uint16_t uint16_max = 65535;
constexpr float norm = 1.0f/uint16_max; // normalize RGB input
const float nR = inpix[0]*norm;
const float nG = inpix[1]*norm;
const float nB = inpix[2]*norm;

const float nV = fmaxf(nR, fmaxf(nG, nB)); // V = max channel value

const float cmin = fminf(nR, fminf(nG, nB));
const float delta = nV - cmin;
const float nS = nV > 0 ? delta / nV : 0.0f; // S = saturation level
float nH = 0.0; // use H=S=V=0 for black
if (nV > 0 && delta != 0.0) // not black (hue undefined) nor gray
const float invDelta = 1.0/delta;
const float nCr = (nV - nR) * invDelta;
const float nCg = (nV - nG) * invDelta;
const float nCb = (nV - nB) * invDelta;
if (nR == nV)
nH = nCb - nCg; // may be negative
else if (nG == nV)
nH = 2.0f + nCr - nCb;
else if (nB == nV)
nH = 4.0f + nCg - nCr;
nH *= 1.0f/6;
if (nH < 0.0)
nH += 1.0f;


const uint16_t H = uint16_t(lroundf(nH*uint16_max));
const uint16_t S = uint16_t(lroundf(nS*uint16_max));
const uint16_t V = uint16_t(lroundf(nV*uint16_max));

const size_t hsvPitch16 = hsvPitch/2;
uint16_t *outpix = dvc_hsv16 + row*hsvPitch16 + 3*col;
outpix[0] = H;
outpix[1] = S;
outpix[2] = V;




Knowing the branchless, vectorized code should perform better (especially on a SIMD architecture like CUDA) I followed the idea found
here to implement my hopefully "faster" kernel as listed below.
There is one unavoidable branch in the outer if-statement and I was hoping the use of a ternary operator would result in a condition move (as opposed to a branch).



__global__ void DC_fast_rgb16ToHsv16(const uint16_t *dvc_rgb16,
size_t rgbPitch,
size_t W, size_t H,
uint16_t *dvc_hsv16,
size_t hsvPitch)
const int col = blockIdx.x*blockDim.x + threadIdx.x;
const int row = blockIdx.y*blockDim.y + threadIdx.y;
if (row < H && col < W)
const size_t rgbPitch16 = rgbPitch/2; // pitch in 16-bit words
const uint16_t *inpix = dvc_rgb16 + row*rgbPitch16 + 3*col;

const uint16_t R = inpix[0];
const uint16_t G = inpix[1];
const uint16_t B = inpix[2];
constexpr uint16_t uint16_max = 65535;
constexpr float norm = 1.0f/uint16_max; // normalize RGB input
const float4 rgb = make_float4(float(R)*norm,float(G)*norm,float(B)*norm, 0.0);

const float4 K = make_float4(0.0, -1.0/3.0, 2.0/3.0, -1.0);
const float4 p =
rgb.y < rgb.z ? make_float4(rgb.z, rgb.y, K.w, K.z) : make_float4(rgb.y, rgb.z, K.x, K.y);
const float4 q =
rgb.x < p.x ? make_float4(p.x, p.y, p.w, rgb.x) : make_float4(rgb.x, p.y, p.z, p.x);
const float d = q.x - fminf(q.w, q.y);
const float e = 1.0e-10;
const float4 hsv = make_float4(fabsf(q.z + (q.w - q.y)/(6.0*d + e)), d/(q.x + e), q.x, 0.0);

const size_t hsvPitch16 = hsvPitch/2;
uint16_t *outpix = dvc_hsv16 + row*hsvPitch16 + 3*col;
outpix[0] = uint16_t(lroundf(hsv.x*uint16_max));
outpix[1] = uint16_t(lroundf(hsv.y*uint16_max));
outpix[2] = uint16_t(lroundf(hsv.z*uint16_max));




What I have found in early testing is that the "branchless" version is almost 1.5x slower! Anyway, I thought I would post this here to see if there are any CUDA experts that can explain this. I can always try to analyze the generated PTX code, but I haven't attempted this yet.









share|improve this question












share|improve this question




share|improve this question








edited May 25 at 19:39









200_success

123k14143399




123k14143399









asked May 25 at 17:58









wcochran

1023




1023











  • Just to clarify, both programs accomplish the same task, but just in a different way?
    – Phrancis
    May 25 at 18:09










  • Are you seeking open-ended suggestions to improve your code? Or are you asking specifically for an explanation of these functions' performance?
    – 200_success
    May 25 at 18:13










  • @PhrancisYes -- both kernels are (should be) equivalent.
    – wcochran
    May 25 at 19:29










  • @200_success I am seeking for the fastest 16-bit RGB to HSV conversion in CUDA. In the process, I am wondering why the branchless version is slower. I would welcome any insights or suggestions.
    – wcochran
    May 25 at 19:31










  • Voted down -- wow this community is harsh.
    – wcochran
    May 25 at 19:33
















  • Just to clarify, both programs accomplish the same task, but just in a different way?
    – Phrancis
    May 25 at 18:09










  • Are you seeking open-ended suggestions to improve your code? Or are you asking specifically for an explanation of these functions' performance?
    – 200_success
    May 25 at 18:13










  • @PhrancisYes -- both kernels are (should be) equivalent.
    – wcochran
    May 25 at 19:29










  • @200_success I am seeking for the fastest 16-bit RGB to HSV conversion in CUDA. In the process, I am wondering why the branchless version is slower. I would welcome any insights or suggestions.
    – wcochran
    May 25 at 19:31










  • Voted down -- wow this community is harsh.
    – wcochran
    May 25 at 19:33















Just to clarify, both programs accomplish the same task, but just in a different way?
– Phrancis
May 25 at 18:09




Just to clarify, both programs accomplish the same task, but just in a different way?
– Phrancis
May 25 at 18:09












Are you seeking open-ended suggestions to improve your code? Or are you asking specifically for an explanation of these functions' performance?
– 200_success
May 25 at 18:13




Are you seeking open-ended suggestions to improve your code? Or are you asking specifically for an explanation of these functions' performance?
– 200_success
May 25 at 18:13












@PhrancisYes -- both kernels are (should be) equivalent.
– wcochran
May 25 at 19:29




@PhrancisYes -- both kernels are (should be) equivalent.
– wcochran
May 25 at 19:29












@200_success I am seeking for the fastest 16-bit RGB to HSV conversion in CUDA. In the process, I am wondering why the branchless version is slower. I would welcome any insights or suggestions.
– wcochran
May 25 at 19:31




@200_success I am seeking for the fastest 16-bit RGB to HSV conversion in CUDA. In the process, I am wondering why the branchless version is slower. I would welcome any insights or suggestions.
– wcochran
May 25 at 19:31












Voted down -- wow this community is harsh.
– wcochran
May 25 at 19:33




Voted down -- wow this community is harsh.
– wcochran
May 25 at 19:33















active

oldest

votes











Your Answer




StackExchange.ifUsing("editor", function ()
return StackExchange.using("mathjaxEditing", function ()
StackExchange.MarkdownEditor.creationCallbacks.add(function (editor, postfix)
StackExchange.mathjaxEditing.prepareWmdForMathJax(editor, postfix, [["\$", "\$"]]);
);
);
, "mathjax-editing");

StackExchange.ifUsing("editor", function ()
StackExchange.using("externalEditor", function ()
StackExchange.using("snippets", function ()
StackExchange.snippets.init();
);
);
, "code-snippets");

StackExchange.ready(function()
var channelOptions =
tags: "".split(" "),
id: "196"
;
initTagRenderer("".split(" "), "".split(" "), channelOptions);

StackExchange.using("externalEditor", function()
// Have to fire editor after snippets, if snippets enabled
if (StackExchange.settings.snippets.snippetsEnabled)
StackExchange.using("snippets", function()
createEditor();
);

else
createEditor();

);

function createEditor()
StackExchange.prepareEditor(
heartbeatType: 'answer',
convertImagesToLinks: false,
noModals: false,
showLowRepImageUploadWarning: true,
reputationToPostImages: null,
bindNavPrevention: true,
postfix: "",
onDemand: true,
discardSelector: ".discard-answer"
,immediatelyShowMarkdownHelp:true
);



);








 

draft saved


draft discarded


















StackExchange.ready(
function ()
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fcodereview.stackexchange.com%2fquestions%2f195176%2frgb-to-hsv-image-conversion-with-cuda-with-branching-and-branchless-versions%23new-answer', 'question_page');

);

Post as a guest



































active

oldest

votes













active

oldest

votes









active

oldest

votes






active

oldest

votes










 

draft saved


draft discarded


























 


draft saved


draft discarded














StackExchange.ready(
function ()
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fcodereview.stackexchange.com%2fquestions%2f195176%2frgb-to-hsv-image-conversion-with-cuda-with-branching-and-branchless-versions%23new-answer', 'question_page');

);

Post as a guest













































































Popular posts from this blog

Python Lists

Aion

JavaScript Array Iteration Methods