Computing the gradient of a function
I have some Python code using NumPy that computes the gradient of a function, and it is a big bottleneck in my application. So my first attempt was to use Cython to improve the performance.
Using online guides, I was able to port it to Cython fairly easily, but I only got a very moderate speedup of around 15%. The function contains many loops, so I was hoping Cython would give a much bigger improvement.
The Cython code looks as follows. These are helper functions that only get called from Cython.
cimport numpy as np
cimport cython

cdef extern from "math.h":
    double fabs(double x)

@cython.boundscheck(False)
@cython.wraparound(False)
@cython.nonecheck(False)
cdef cget_cubic_bspline_weight(double u):
    u = fabs(u)
    if u < 2.0:
        if u < 1.0:
            return 2.0 / 3.0 - u ** 2 + 0.5 * u ** 3
        else:
            return ((2.0 - u) ** 3) / 6.0
    return 0.0

@cython.boundscheck(False)
@cython.wraparound(False)
@cython.nonecheck(False)
cdef cget_cubic_spline_first_der_weight(double u):
    cdef double o = u
    u = fabs(u)
    cdef double v
    if u < 2.0:
        if u < 1.0:
            return (1.5 * u - 2.0) * o
        else:
            u -= 2.0
            v = -0.5 * u * u
            if o < 0.0:
                return -v
            return v
    return 0.0
The following is the main function that computes the gradient:
@cython.boundscheck(False)
@cython.wraparound(False)
@cython.nonecheck(False)
cpdef gradient_2d(np.ndarray[double, ndim=2, mode="c"] reference,
                  np.ndarray[double, ndim=2, mode="c"] warped,
                  np.ndarray[double, ndim=5, mode="fortran"] warped_gradient,
                  np.ndarray[double, ndim=5, mode="fortran"] result_gradient,
                  double[:] entropies,
                  np.ndarray[double, ndim=2, mode="c"] jhlog,
                  np.ndarray[double, ndim=2, mode="fortran"] reflog,
                  np.ndarray[double, ndim=2, mode="fortran"] warlog,
                  int[:] bins,
                  int height, int width):
    war_x = warped_gradient[..., 0]
    war_y = warped_gradient[..., 1]
    res_x = result_gradient[..., 0]
    res_y = result_gradient[..., 1]
    nmi = (entropies[0] + entropies[1]) / entropies[2]
    for y in range(height):
        for x in range(width):
            ref = reference[x, y]
            war = warped[x, y]
            jd = [0.0] * 2
            rd = [0.0] * 2
            wd = [0.0] * 2
            for r in range(int(ref - 1.0), int(ref + 3.0)):
                if (-1 < r and r < bins[0]):
                    for w in range(int(war - 1.0), int(war + 3.0)):
                        if (-1 < w and w < bins[1]):
                            c = cget_cubic_bspline_weight(ref - float(r)) * \
                                cget_cubic_spline_first_der_weight(war - float(w))
                            jl = jhlog[r, w]
                            rl = reflog[r, 0]
                            wl = warlog[0, w]
                            jd[0] += c * war_x[x, y] * jl
                            rd[0] += c * war_x[x, y] * rl
                            wd[0] += c * war_x[x, y] * wl
                            jd[1] += c * war_y[x, y] * jl
                            rd[1] += c * war_y[x, y] * rl
                            wd[1] += c * war_y[x, y] * wl
            res_x[x, y] = (rd[0] + wd[0] - nmi * jd[0]) / (entropies[2] * entropies[3])
            res_y[x, y] = (rd[1] + wd[1] - nmi * jd[1]) / (entropies[2] * entropies[3])
Now, I call this as:
speed.gradient_2d(self.rdata, self.wdata, warped_grad_image,
                  result_gradient.data, self.entropies,
                  self.jhlog, self.reflog, self.warlog, self.bins,
                  int(self.rdata.shape[1]), int(self.rdata.shape[0]))
Everything except the last two parameters is a NumPy array, typed as described in the Cython function signature. The pure Python version of the code is essentially identical; I can post it if that helps.
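(For these typed buffer arguments to be accepted at all, each array has to match the declared dtype and memory order exactly; a hypothetical example of arrays that would satisfy the signature, with made-up shapes and bin counts:)
import numpy as np

# Hypothetical shapes and bin counts; only the dtype and memory order
# matter here, and they must match the Cython signature exactly.
height, width = 256, 256
nbins = 64
reference = np.zeros((width, height), dtype=np.float64, order="C")
warped = np.zeros((width, height), dtype=np.float64, order="C")
warped_gradient = np.zeros((width, height, 1, 1, 2), dtype=np.float64, order="F")
result_gradient = np.zeros((width, height, 1, 1, 2), dtype=np.float64, order="F")
entropies = np.zeros(4, dtype=np.float64)
jhlog = np.zeros((nbins, nbins), dtype=np.float64, order="C")
reflog = np.zeros((nbins, 1), dtype=np.float64, order="F")
warlog = np.zeros((1, nbins), dtype=np.float64, order="F")
bins = np.array([nbins, nbins], dtype=np.intc)  # int[:] expects a C int buffer, not the default int64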
I compiled the whole thing with the following setup.py:
from distutils.core import setup
from distutils.extension import Extension
from Cython.Build import cythonize
import numpy

ext = Extension("speed",
                sources=["perf/speed.pyx"],
                include_dirs=[numpy.get_include()],
                language="c++",
                libraries=[],
                extra_link_args=[])

setup(ext_modules=cythonize([ext]))
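(As an aside, cythonize also accepts annotate=True, which writes an HTML report, perf/speed.html, highlighting every line that still goes through Python objects; a sketch of the same setup.py with annotation enabled:)
# The same extension as above, built with Cython's annotation report enabled.
from distutils.core import setup
from distutils.extension import Extension
from Cython.Build import cythonize
import numpy

ext = Extension("speed",
                sources=["perf/speed.pyx"],
                include_dirs=[numpy.get_include()],
                language="c++")

# annotate=True emits perf/speed.html; yellow lines mark Python interaction.
setup(ext_modules=cythonize([ext], annotate=True))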
Again, because I have so many loops in my code, I was under the impression that the Cython version would be much faster, but I only got a 15% improvement. I followed this guide for the implementation, and as far as I can tell I did pretty much everything it recommends. Any suggestions on what I could try next would be greatly appreciated!
Inlining the top helper functions seems to only degrade the performance slightly.
python performance numpy cython
edited Apr 29 at 17:59 by Jamal
asked Apr 29 at 17:21 by Luca
2 Answers
OK, after playing around a bit, it turns out that the main thing that boosts the speed is using C types (cdef declarations) for the scalars. Here is the modified code, which gives roughly a 13x speedup. I am leaving it here in case it is of use to someone else. I am sure more performance can be extracted, but I would be hitting diminishing returns.
@cython.boundscheck(False)
@cython.wraparound(False)
@cython.nonecheck(False)
cpdef gradient_2d(np.ndarray[double, ndim=2, mode="c"] reference,
                  np.ndarray[double, ndim=2, mode="c"] warped,
                  np.ndarray[double, ndim=5, mode="fortran"] warped_gradient,
                  np.ndarray[double, ndim=5, mode="fortran"] result_gradient,
                  double[:] entropies,
                  np.ndarray[double, ndim=2, mode="c"] jhlog,
                  np.ndarray[double, ndim=2, mode="fortran"] reflog,
                  np.ndarray[double, ndim=2, mode="fortran"] warlog,
                  int[:] bins,
                  int height, int width):
    war_x = warped_gradient[..., 0]
    war_y = warped_gradient[..., 1]
    res_x = result_gradient[..., 0]
    res_y = result_gradient[..., 1]
    cdef double nmi = (entropies[0] + entropies[1]) / entropies[2]
    cdef double norm = entropies[2] * entropies[3]
    cdef double jd[2]
    cdef double rd[2]
    cdef double wd[2]
    cdef double ref
    cdef double war
    cdef double c_war_x_x_y
    cdef double c_war_y_x_y
    cdef double jl
    cdef double rl
    cdef double wl
    for y in range(height):
        for x in range(width):
            ref = reference[x, y]
            war = warped[x, y]
            jd[0] = jd[1] = 0.0
            rd[0] = rd[1] = 0.0
            wd[0] = wd[1] = 0.0
            for r in range(int(ref - 1.0), int(ref + 3.0)):
                if (-1 < r and r < bins[0]):
                    for w in range(int(war - 1.0), int(war + 3.0)):
                        if (-1 < w and w < bins[1]):
                            c = cget_cubic_bspline_weight(ref - r) * \
                                cget_cubic_spline_first_der_weight(war - w)
                            jl = jhlog[r, w]
                            rl = reflog[r, 0]
                            wl = warlog[0, w]
                            c_war_x_x_y = c * war_x[x, y]
                            c_war_y_x_y = c * war_y[x, y]
                            jd[0] += c_war_x_x_y * jl
                            rd[0] += c_war_x_x_y * rl
                            wd[0] += c_war_x_x_y * wl
                            jd[1] += c_war_y_x_y * jl
                            rd[1] += c_war_y_x_y * rl
                            wd[1] += c_war_y_x_y * wl
            res_x[x, y] = (rd[0] + wd[0] - nmi * jd[0]) / norm
            res_y[x, y] = (rd[1] + wd[1] - nmi * jd[1]) / norm
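A further step in the same direction, not shown above, would be to give the loop counters and the weight product C types as well, so the for loops compile down to plain C loops. A minimal sketch of the extra declarations, using the same variable names (they would go with the other cdef lines at the top of gradient_2d):
# Hypothetical additional declarations for gradient_2d; typed indices
# let Cython generate C for-loops instead of Python range iteration.
cdef Py_ssize_t x, y
cdef int r, w
cdef double c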
There has always been this complicated tradeoff between space used and performance...
– FreezePhoenix, Apr 29 at 19:42
Yeah, in my case just using C types for a few scalar values resulted in a massive speedup.
– Luca, Apr 29 at 19:53
So what you want is for this to run faster.
To begin with, there are several things to improve on.
Let's start with the second code block.
@cython.boundscheck(False)
@cython.wraparound(False)
@cython.nonecheck(False)
cpdef gradient_2d(np.ndarray[double, ndim=2, mode="c"] reference,
                  np.ndarray[double, ndim=2, mode="c"] warped,
                  np.ndarray[double, ndim=5, mode="fortran"] warped_gradient,
                  np.ndarray[double, ndim=5, mode="fortran"] result_gradient,
                  double[:] entropies,
                  np.ndarray[double, ndim=2, mode="c"] jhlog,
                  np.ndarray[double, ndim=2, mode="fortran"] reflog,
                  np.ndarray[double, ndim=2, mode="fortran"] warlog,
                  int[:] bins,
                  int height, int width):
    war_x = warped_gradient[..., 0]
    war_y = warped_gradient[..., 1]
    res_x = result_gradient[..., 0]
    res_y = result_gradient[..., 1]
    nmi = (entropies[0] + entropies[1]) / entropies[2]
    for y in range(height):
        for x in range(width):
            ref = reference[x, y]
            war = warped[x, y]
            jd = [0.0] * 2
            rd = [0.0] * 2
            wd = [0.0] * 2
            for r in range(int(ref - 1.0), int(ref + 3.0)):
                if (-1 < r and r < bins[0]):
                    for w in range(int(war - 1.0), int(war + 3.0)):
                        if (-1 < w and w < bins[1]):
                            c = cget_cubic_bspline_weight(ref - float(r)) * \
                                cget_cubic_spline_first_der_weight(war - float(w))
                            jl = jhlog[r, w]
                            rl = reflog[r, 0]
                            wl = warlog[0, w]
                            # Why are we accessing [x, y] of war_x this many times?
                            # jd[0] += c * war_x[x, y] * jl
                            # rd[0] += c * war_x[x, y] * rl
                            # wd[0] += c * war_x[x, y] * wl
                            # Let's do this instead:
                            c_war_x_x_y = c * war_x[x, y]
                            jd[0] += c_war_x_x_y * jl
                            rd[0] += c_war_x_x_y * rl
                            wd[0] += c_war_x_x_y * wl
                            # Same here:
                            # jd[1] += c * war_y[x, y] * jl
                            # rd[1] += c * war_y[x, y] * rl
                            # wd[1] += c * war_y[x, y] * wl
                            c_war_y_x_y = c * war_y[x, y]
                            jd[1] += c_war_y_x_y * jl
                            rd[1] += c_war_y_x_y * rl
                            wd[1] += c_war_y_x_y * wl
            res_x[x, y] = (rd[0] + wd[0] - nmi * jd[0]) / (entropies[2] * entropies[3])
            res_y[x, y] = (rd[1] + wd[1] - nmi * jd[1]) / (entropies[2] * entropies[3])
There are several more areas for improvement. But keep doing these sorts of improvements and you should be fine.
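One concrete example of such an area, assuming the helpers are unchanged from the question: both cdef helpers are declared without a C return type, so every call still builds and unpacks a Python float. A sketch of what the first one could look like with a typed return (the second can be treated the same way; inline is optional, the return type is what matters):
# Sketch only: a C return type keeps the whole call in C.
cimport cython
from libc.math cimport fabs

@cython.boundscheck(False)
@cython.wraparound(False)
cdef inline double cget_cubic_bspline_weight(double u):
    u = fabs(u)
    if u < 2.0:
        if u < 1.0:
            return 2.0 / 3.0 - u ** 2 + 0.5 * u ** 3
        return ((2.0 - u) ** 3) / 6.0
    return 0.0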
Good point. Thanks, I will have a closer look. I was relying on the compiler to cache these accesses so this wouldn't be an issue, but it could make a difference!
– Luca, Apr 29 at 18:27