Writing benchmarks
With Transonic, writing benchmarks for the different accelerators is very simple. We present an example on this page.
Other examples can be found here:
Comparison Numba vs Pythran (JIT)
We take this file with only pure-Numpy code from this blog post by Florian LE BOURDAIS.
import numpy as np
def laplace_numpy(image):
    """Threshold the discrete 5-point Laplacian of a 2D image (pure NumPy).

    The outermost rows and columns are dropped by the slicing, so for a 2D
    array of shape ``(h, w)`` the result has shape ``(h - 2, w - 2)``. Each
    element is True where the absolute value of the Laplacian at the
    corresponding interior pixel is greater than 0.05.
    """
    # Shifted slices of the image: the four direct neighbours of every
    # interior pixel, and the interior pixels themselves.
    above = image[:-2, 1:-1]
    below = image[2:, 1:-1]
    left = image[1:-1, :-2]
    right = image[1:-1, 2:]
    center = image[1:-1, 1:-1]
    stencil = above + below + left + right - 4 * center
    return np.abs(stencil) > 0.05
def laplace_loops(image):
    """Threshold the 5-point Laplacian of a 2D image using explicit loops.

    Returns a ``np.uint8`` array of shape ``(h - 2, w - 2)`` for an input of
    shape ``(h, w)``. An element is 1 where the absolute value of the
    Laplacian at the corresponding interior pixel is greater than 0.05, and
    0 otherwise.
    """
    n_rows, n_cols = image.shape[0], image.shape[1]
    result = np.empty((n_rows - 2, n_cols - 2), np.uint8)
    for row in range(1, n_rows - 1):
        for col in range(1, n_cols - 1):
            neighbours = (
                image[row - 1, col]
                + image[row + 1, col]
                + image[row, col - 1]
                + image[row, col + 1]
            )
            # The boolean comparison is stored as 0/1 in the uint8 output.
            result[row - 1, col - 1] = (
                np.abs(neighbours - 4 * image[row, col]) > 0.05
            )
    return result
Our code for a benchmark in JIT mode:
import numba
from pure_numpy import laplace_loops, laplace_numpy
from transonic import jit
# Variants of the vectorized function: wrapped with Transonic's ``jit``
# (default backend with native/xsimd options, "python" backend, "numba"
# backend) and, for comparison, compiled directly with ``numba.njit``.
laplace_transonic_pythran = jit(native=True, xsimd=True)(laplace_numpy)
laplace_transonic_python = jit(backend="python")(laplace_numpy)
laplace_transonic_numba = jit(backend="numba")(laplace_numpy)
laplace_numba = numba.njit(laplace_numpy)

# The same set of variants for the explicit-loop implementation.
laplace_transonic_pythran_loops = jit(native=True, xsimd=True)(laplace_loops)
laplace_transonic_python_loops = jit(backend="python")(laplace_loops)
laplace_transonic_numba_loops = jit(backend="numba")(laplace_loops)
laplace_numba_loops = numba.njit(laplace_loops)
if __name__ == "__main__":
    from skimage.color import rgb2gray
    from skimage.data import astronaut

    from transonic import wait_for_all_extensions

    # Benchmark input: the scikit-image "astronaut" picture in grayscale.
    image = astronaut()
    image = rgb2gray(image)

    # warm the functions: call each variant once before any timing is done
    laplace_transonic_python(image)
    laplace_transonic_pythran(image)
    laplace_transonic_pythran_loops(image)
    laplace_transonic_numba(image)
    laplace_transonic_numba_loops(image)
    laplace_numba(image)
    laplace_numba_loops(image)

    # NOTE(review): presumably blocks until the JIT extensions requested
    # above have been built -- confirm against the Transonic documentation.
    wait_for_all_extensions()

    # again warming
    laplace_transonic_numba(image)
    laplace_transonic_numba_loops(image)

    import pythran

    from transonic import __version__
    from transonic.util import timeit

    # Namespace in which the timed call strings are evaluated.
    loc = locals()

    def bench(call, norm=None):
        """Time the statement ``call`` and print it relative to ``norm``.

        ``call`` is a source string (e.g. ``"f(image)"``) timed with
        ``timeit`` in the ``loc`` namespace. The printed value is the
        measured time divided by ``norm``; when ``norm`` is None the
        measurement itself is the reference, so 1.00 is printed. The
        un-normalized measurement is returned so that it can be reused as
        ``norm`` for later calls.
        """
        ret = result = timeit(call, globals=loc)
        if norm is None:
            norm = result
        result /= norm
        # The label is the function name, i.e. the text before the first "(".
        print(f"{call.split('(')[0]:33s}: {result:.2f}")
        return ret

    print(
        f"transonic {__version__}\n"
        f"pythran {pythran.__version__}\n"
        f"numba {numba.__version__}\n"
    )

    # The first measurement is the reference for all the following ratios.
    norm = bench("laplace_transonic_pythran(image)")
    print(f"norm = {norm:.2e} s")
    bench("laplace_transonic_pythran_loops(image)", norm=norm)
    bench("laplace_numba(image)", norm=norm)
    bench("laplace_transonic_numba(image)", norm=norm)
    bench("laplace_numba_loops(image)", norm=norm)
    bench("laplace_transonic_numba_loops(image)", norm=norm)
    bench("laplace_numpy(image)", norm=norm)
    bench("laplace_transonic_python(image)", norm=norm)
gives:
transonic 0.4.0
pythran 0.9.3post1
numba 0.45.1
laplace_transonic_pythran : 1.00
norm = 1.44e-04 s
laplace_transonic_pythran_loops : 0.94
laplace_numba : 8.82
laplace_transonic_numba : 8.80
laplace_numba_loops : 0.94
laplace_transonic_numba_loops : 0.94
laplace_numpy : 6.94
laplace_transonic_python : 7.03
The warmup is much longer for Transonic-Pythran, but remember that it is a cached JIT, so this is an issue only for the first call of the function. When we reimport the module, there is no warmup.
Then we see that Pythran is very good at optimizing high-level NumPy code! In contrast (with my setup and on my computer), Numba has not been able to optimize this function. However, Numba is good at speeding up code with loops!
Note that the Transonic overhead is negligible even for this very small case
(the shape of the image is (512, 512)).
Note
We don’t use the fastmath option of Numba because the Numba backend
does not support it yet!
Ahead-of-time compilation
import numba
import numpy as np
from transonic import Array, boost
# Type used to annotate the image argument of the functions below: a 2D
# array of float64 (the "C" flag presumably requests C-contiguous memory
# layout -- confirm against the Transonic documentation).
Image = Array[np.float64, "2d", "C"]
def laplace_numpy(image: Image):
    """Return the thresholded 5-point Laplacian of a 2D image (NumPy slicing).

    The border rows and columns are dropped by the slicing, so an input of
    shape ``(h, w)`` gives a result of shape ``(h - 2, w - 2)``. Each element
    is True where the absolute value of the Laplacian at the corresponding
    interior pixel is greater than 0.05.
    """
    interior = image[1:-1, 1:-1]
    # Sum of the four direct neighbours (up, down, left, right).
    neighbour_sum = (
        image[:-2, 1:-1] + image[2:, 1:-1] + image[1:-1, :-2] + image[1:-1, 2:]
    )
    return np.abs(neighbour_sum - 4 * interior) > 0.05
def laplace_loops(image: Image):
    """Return the thresholded 5-point Laplacian of a 2D image (explicit loops).

    The result is a ``np.uint8`` array of shape ``(h - 2, w - 2)`` for an
    input of shape ``(h, w)``, holding 1 where the absolute value of the
    Laplacian at the corresponding interior pixel is greater than 0.05 and
    0 otherwise.
    """
    height, width = image.shape[0], image.shape[1]
    mask = np.empty((height - 2, width - 2), np.uint8)
    for y in range(1, height - 1):
        for x in range(1, width - 1):
            around = (
                image[y - 1, x]
                + image[y + 1, x]
                + image[y, x - 1]
                + image[y, x + 1]
            )
            # The boolean comparison is stored as 0/1 in the uint8 output.
            mask[y - 1, x - 1] = np.abs(around - 4 * image[y, x]) > 0.05
    return mask
# Variants of the vectorized function: wrapped with Transonic's ``boost``
# for each backend ("pythran", "cython", "numba", "python") and, for
# comparison, compiled directly with ``numba.njit``.
laplace_transonic_pythran = boost(backend="pythran")(laplace_numpy)
laplace_transonic_cython = boost(backend="cython")(laplace_numpy)
laplace_transonic_numba = boost(backend="numba")(laplace_numpy)
laplace_transonic_python = boost(backend="python")(laplace_numpy)
laplace_numba = numba.njit(laplace_numpy)

# Variants of the explicit-loop implementation. No Cython variant is created
# from ``laplace_loops`` here; a dedicated Cython function is defined
# separately in this file.
laplace_loops_transonic_pythran = boost(backend="pythran")(laplace_loops)
laplace_loops_transonic_python = boost(backend="python")(laplace_loops)
laplace_loops_transonic_numba = boost(backend="numba")(laplace_loops)
laplace_loops_numba = numba.njit(laplace_loops)
# For Cython, we need to add more type annotations
# NOTE(review): boundscheck/wraparound are presumably forwarded to Cython as
# compiler directives -- confirm against the Transonic documentation.
@boost(backend="cython", boundscheck=False, wraparound=False)
def laplace_loops_transonic_cython(image: Image):
    """Laplace operator for 2D images, with typed locals for Cython.

    Same loop structure as ``laplace_loops``, but every local variable
    carries a type annotation and the builtin ``abs`` is used instead of
    ``np.abs``. Returns a ``np.uint8`` array of shape ``(h - 2, w - 2)``
    holding 1 where the absolute Laplacian is greater than 0.05, else 0.
    """
    # Loop indices and image dimensions are declared as integers.
    i: int
    j: int
    h: int = image.shape[0]
    w: int = image.shape[1]
    laplacian: Array[np.uint8, "2d"] = np.empty((h - 2, w - 2), np.uint8)
    for i in range(1, h - 1):
        for j in range(1, w - 1):
            # The boolean comparison is stored as 0/1 in the uint8 output.
            laplacian[i - 1, j - 1] = (
                abs(
                    image[i - 1, j]
                    + image[i + 1, j]
                    + image[i, j - 1]
                    + image[i, j + 1]
                    - 4 * image[i, j]
                )
                > 0.05
            )
    return laplacian
if __name__ == "__main__":
    from skimage.color import rgb2gray
    from skimage.data import astronaut

    # Benchmark input: the scikit-image "astronaut" picture in grayscale.
    image = astronaut()
    image = rgb2gray(image)

    # call these functions to warm them
    # (only the Numba-based variants are called before the timings)
    laplace_transonic_numba(image)
    laplace_loops_transonic_numba(image)
    laplace_numba(image)
    laplace_loops_numba(image)

    import pythran

    from transonic import __version__
    from transonic.util import timeit

    # Namespace in which the timed call strings are evaluated.
    loc = locals()

    def bench(call, norm=None):
        """Time the statement ``call`` and print it relative to ``norm``.

        ``call`` is a source string (e.g. ``"f(image)"``) timed with
        ``timeit`` in the ``loc`` namespace. The printed value is the
        measured time divided by ``norm``; when ``norm`` is None the
        measurement itself is the reference, so 1.00 is printed. The
        un-normalized measurement is returned so that it can be reused as
        ``norm`` for later calls.
        """
        ret = result = timeit(call, globals=loc)
        if norm is None:
            norm = result
        result /= norm
        # The label is the function name, i.e. the text before the first "(".
        print(f"{call.split('(')[0]:33s}: {result:.2f}")
        return ret

    print(
        f"transonic {__version__}\n"
        f"pythran {pythran.__version__}\n"
        f"numba {numba.__version__}\n"
    )

    # The first measurement is the reference for all the following ratios.
    norm = bench("laplace_transonic_pythran(image)")
    print(f"norm = {norm:.2e} s")
    bench("laplace_loops_transonic_pythran(image)", norm=norm)
    bench("laplace_transonic_cython(image)", norm=norm)
    bench("laplace_loops_transonic_cython(image)", norm=norm)
    bench("laplace_numba(image)", norm=norm)
    bench("laplace_transonic_numba(image)", norm=norm)
    bench("laplace_loops_numba(image)", norm=norm)
    bench("laplace_loops_transonic_numba(image)", norm=norm)
    bench("laplace_numpy(image)", norm=norm)
    bench("laplace_transonic_python(image)", norm=norm)
The results are:
transonic 0.4.0
pythran 0.9.3post1
numba 0.45.1
laplace_transonic_pythran : 1.00
norm = 1.42e-04 s
laplace_loops_transonic_pythran : 0.95
laplace_transonic_cython : 8.36
laplace_loops_transonic_cython : 2.61
laplace_numba : 8.94
laplace_transonic_numba : 8.93
laplace_loops_numba : 0.95
laplace_loops_transonic_numba : 0.95
laplace_numpy : 7.01
laplace_transonic_python : 7.00