Writing benchmarks
With Transonic, writing benchmarks for the different accelerators is very simple. We present an example on this page.
Other examples can be found here:
Comparison Numba vs Pythran (JIT)
We take this file with only pure-Numpy code from this blog post by Florian LE BOURDAIS.
import numpy as np
def laplace_numpy(image):
    """Threshold the discrete Laplacian of a 2D image using NumPy slices.

    The 5-point stencil (four direct neighbours minus four times the
    centre) is evaluated on every interior pixel at once through shifted
    views of ``image``.  The one-pixel border has no complete neighbourhood
    and therefore has no entry in the result.

    Parameters
    ----------
    image : 2D array
        Input image of shape ``(h, w)``.

    Returns
    -------
    2D boolean array of shape ``(h - 2, w - 2)``
        ``True`` where the absolute value of the Laplacian exceeds 0.05.
    """
    laplacian = (
        image[:-2, 1:-1]  # neighbour in the previous row
        + image[2:, 1:-1]  # neighbour in the next row
        + image[1:-1, :-2]  # neighbour in the previous column
        + image[1:-1, 2:]  # neighbour in the next column
        - 4 * image[1:-1, 1:-1]  # centre pixel
    )
    thresh = np.abs(laplacian) > 0.05
    return thresh
def laplace_loops(image):
    """Threshold the discrete Laplacian of a 2D image using explicit loops.

    Same 5-point stencil as the vectorised variant, but written pixel by
    pixel so that compilers which favour loops can be benchmarked.

    Parameters
    ----------
    image : 2D array
        Input image of shape ``(h, w)``.

    Returns
    -------
    2D ``np.uint8`` array of shape ``(h - 2, w - 2)``
        1 where the absolute value of the Laplacian exceeds 0.05, else 0.
        Note that the dtype is ``uint8`` here, not boolean.
    """
    h = image.shape[0]
    w = image.shape[1]
    # Only interior pixels have four neighbours, hence the (h - 2, w - 2)
    # output; every element is written below, so ``np.empty`` is safe.
    laplacian = np.empty((h - 2, w - 2), np.uint8)
    for i in range(1, h - 1):
        for j in range(1, w - 1):
            # Output index is shifted by one with respect to the image.
            laplacian[i - 1, j - 1] = (
                np.abs(
                    image[i - 1, j]
                    + image[i + 1, j]
                    + image[i, j - 1]
                    + image[i, j + 1]
                    - 4 * image[i, j]
                )
                > 0.05
            )
    return laplacian
Our code for a benchmark in JIT mode:
from transonic import jit
import numba
from pure_numpy import laplace_numpy, laplace_loops
# Vectorised kernel (laplace_numpy) wrapped by each accelerator under test.
# No backend is passed on the next line; judging by the variable name this
# is presumably Transonic's default (Pythran) backend -- TODO confirm.
laplace_transonic_pythran = jit(native=True, xsimd=True)(laplace_numpy)
laplace_transonic_python = jit(backend="python")(laplace_numpy)
laplace_transonic_numba = jit(backend="numba")(laplace_numpy)
# Numba applied directly, without going through Transonic.
laplace_numba = numba.njit(laplace_numpy)
# Same set of wrappers for the explicit-loop kernel (laplace_loops).
laplace_transonic_pythran_loops = jit(native=True, xsimd=True)(laplace_loops)
laplace_transonic_python_loops = jit(backend="python")(laplace_loops)
laplace_transonic_numba_loops = jit(backend="numba")(laplace_loops)
laplace_numba_loops = numba.njit(laplace_loops)
if __name__ == "__main__":
    # Benchmark driver: build a test image, trigger every JIT compilation
    # once, then time each variant relative to the Transonic-Pythran one.
    from transonic import wait_for_all_extensions

    from skimage.data import astronaut
    from skimage.color import rgb2gray

    image = astronaut()
    image = rgb2gray(image)

    # warm the functions
    laplace_transonic_python(image)
    laplace_transonic_pythran(image)
    laplace_transonic_pythran_loops(image)
    laplace_transonic_numba(image)
    laplace_transonic_numba_loops(image)
    laplace_numba(image)
    laplace_numba_loops(image)

    # Presumably blocks until the extensions requested above are compiled
    # -- TODO confirm against the Transonic documentation.
    wait_for_all_extensions()
    # again warming
    laplace_transonic_numba(image)
    laplace_transonic_numba_loops(image)

    from transonic.util import timeit
    from transonic import __version__
    import pythran

    # Namespace in which the statement strings given to ``bench`` are
    # evaluated; it holds ``image`` and every wrapped function.
    loc = locals()

    def bench(call, norm=None):
        """Time the statement ``call`` and print it relative to ``norm``.

        Parameters
        ----------
        call : str
            Statement to time, e.g. ``"laplace_numba(image)"``.  It is
            passed to ``timeit`` with ``loc`` as its globals.
        norm : float, optional
            Reference duration.  When omitted, the measured duration is
            its own reference and 1.00 is printed.

        Returns
        -------
        The un-normalised value returned by ``timeit``.
        """
        # ``ret`` keeps the raw measurement; ``result`` is rebound to the
        # normalised value by the division below.
        ret = result = timeit(call, globals=loc)
        if norm is None:
            norm = result
        result /= norm
        # The label is the function name, i.e. the text before the "(".
        print(f"{call.split('(')[0]:33s}: {result:.2f}")
        return ret

    print(
        f"transonic {__version__}\n"
        f"pythran {pythran.__version__}\n"
        f"numba {numba.__version__}\n"
    )

    # The first measurement is the reference for all the following ones.
    norm = bench("laplace_transonic_pythran(image)")
    print(f"norm = {norm:.2e} s")
    bench("laplace_transonic_pythran_loops(image)", norm=norm)
    bench("laplace_numba(image)", norm=norm)
    bench("laplace_transonic_numba(image)", norm=norm)
    bench("laplace_numba_loops(image)", norm=norm)
    bench("laplace_transonic_numba_loops(image)", norm=norm)
    bench("laplace_numpy(image)", norm=norm)
    bench("laplace_transonic_python(image)", norm=norm)
gives:
transonic 0.4.0
pythran 0.9.3post1
numba 0.45.1
laplace_transonic_pythran : 1.00
norm = 1.44e-04 s
laplace_transonic_pythran_loops : 0.94
laplace_numba : 8.82
laplace_transonic_numba : 8.80
laplace_numba_loops : 0.94
laplace_transonic_numba_loops : 0.94
laplace_numpy : 6.94
laplace_transonic_python : 7.03
The warmup is much longer for Transonic-Pythran but remember that it is a cached JIT so it is an issue only for the first call of the function. When we reimport the module, there is no warmup.
Then we see that Pythran is very good at optimizing high-level NumPy code! In contrast (with my setup and on my computer), Numba has not been able to optimize this function. However, Numba is good at speeding up the code with loops!
Note that the Transonic overhead is negligible even for this very small case (the shape of the image is (512, 512)).
Note
We don’t use the fastmath option of Numba because the Numba backend does not support it yet!
Ahead-of-time compilation
from transonic import boost, Array
import numba
import numpy as np
# Annotation alias for the ``image`` parameter of the kernels in this file;
# presumably a 2-D, C-contiguous float64 array -- TODO confirm against the
# Transonic ``Array`` documentation.
Image = Array[np.float64, "2d", "C"]
def laplace_numpy(image: Image):
    """Threshold the discrete Laplacian of a 2D image using NumPy slices.

    The 5-point stencil (four direct neighbours minus four times the
    centre) is evaluated on every interior pixel at once through shifted
    views of ``image``.  The one-pixel border has no complete neighbourhood
    and therefore has no entry in the result.

    Parameters
    ----------
    image : Image
        Input image of shape ``(h, w)``.

    Returns
    -------
    2D boolean array of shape ``(h - 2, w - 2)``
        ``True`` where the absolute value of the Laplacian exceeds 0.05.
    """
    laplacian = (
        image[:-2, 1:-1]  # neighbour in the previous row
        + image[2:, 1:-1]  # neighbour in the next row
        + image[1:-1, :-2]  # neighbour in the previous column
        + image[1:-1, 2:]  # neighbour in the next column
        - 4 * image[1:-1, 1:-1]  # centre pixel
    )
    thresh = np.abs(laplacian) > 0.05
    return thresh
def laplace_loops(image: Image):
    """Threshold the discrete Laplacian of a 2D image using explicit loops.

    Same 5-point stencil as the vectorised variant, but written pixel by
    pixel so that compilers which favour loops can be benchmarked.

    Parameters
    ----------
    image : Image
        Input image of shape ``(h, w)``.

    Returns
    -------
    2D ``np.uint8`` array of shape ``(h - 2, w - 2)``
        1 where the absolute value of the Laplacian exceeds 0.05, else 0.
        Note that the dtype is ``uint8`` here, not boolean.
    """
    h = image.shape[0]
    w = image.shape[1]
    # Only interior pixels have four neighbours, hence the (h - 2, w - 2)
    # output; every element is written below, so ``np.empty`` is safe.
    laplacian = np.empty((h - 2, w - 2), np.uint8)
    for i in range(1, h - 1):
        for j in range(1, w - 1):
            # Output index is shifted by one with respect to the image.
            laplacian[i - 1, j - 1] = (
                np.abs(
                    image[i - 1, j]
                    + image[i + 1, j]
                    + image[i, j - 1]
                    + image[i, j + 1]
                    - 4 * image[i, j]
                )
                > 0.05
            )
    return laplacian
# Vectorised kernel (laplace_numpy) wrapped by each ahead-of-time backend.
laplace_transonic_pythran = boost(backend="pythran")(laplace_numpy)
laplace_transonic_cython = boost(backend="cython")(laplace_numpy)
laplace_transonic_numba = boost(backend="numba")(laplace_numpy)
laplace_transonic_python = boost(backend="python")(laplace_numpy)
# Numba applied directly, without going through Transonic.
laplace_numba = numba.njit(laplace_numpy)
# Explicit-loop kernel (laplace_loops) with the same backends, except
# Cython, which gets its own more heavily annotated function.
laplace_loops_transonic_pythran = boost(backend="pythran")(laplace_loops)
laplace_loops_transonic_python = boost(backend="python")(laplace_loops)
laplace_loops_transonic_numba = boost(backend="numba")(laplace_loops)
laplace_loops_numba = numba.njit(laplace_loops)
# For Cython, we need to add more type annotations
@boost(backend="cython", boundscheck=False, wraparound=False)
def laplace_loops_transonic_cython(image: Image):
    """Threshold the discrete Laplacian of a 2D image (Cython loop variant).

    Same algorithm as ``laplace_loops``, with every local variable
    annotated and the built-in ``abs`` used in place of ``np.abs``.
    ``boundscheck=False`` and ``wraparound=False`` are passed to ``boost``;
    presumably they are forwarded as Cython directives -- TODO confirm.

    Parameters
    ----------
    image : Image
        Input image of shape ``(h, w)``.

    Returns
    -------
    2D ``np.uint8`` array of shape ``(h - 2, w - 2)``
        1 where the absolute value of the Laplacian exceeds 0.05, else 0.
    """
    # Bare annotations declare the types of the loop indices.
    i: int
    j: int
    h: int = image.shape[0]
    w: int = image.shape[1]
    # Every element is written below, so ``np.empty`` is safe.
    laplacian: Array[np.uint8, "2d"] = np.empty((h - 2, w - 2), np.uint8)
    for i in range(1, h - 1):
        for j in range(1, w - 1):
            # Output index is shifted by one with respect to the image.
            laplacian[i - 1, j - 1] = (
                abs(
                    image[i - 1, j]
                    + image[i + 1, j]
                    + image[i, j - 1]
                    + image[i, j + 1]
                    - 4 * image[i, j]
                )
                > 0.05
            )
    return laplacian
if __name__ == "__main__":
    # Benchmark driver: build a test image, warm the Numba variants, then
    # time each variant relative to the Transonic-Pythran one.
    from skimage.data import astronaut
    from skimage.color import rgb2gray

    image = astronaut()
    image = rgb2gray(image)

    # call these functions to warm them
    laplace_transonic_numba(image)
    laplace_loops_transonic_numba(image)
    laplace_numba(image)
    laplace_loops_numba(image)

    from transonic.util import timeit
    from transonic import __version__
    import pythran

    # Namespace in which the statement strings given to ``bench`` are
    # evaluated; it holds ``image`` and every wrapped function.
    loc = locals()

    def bench(call, norm=None):
        """Time the statement ``call`` and print it relative to ``norm``.

        Parameters
        ----------
        call : str
            Statement to time, e.g. ``"laplace_numba(image)"``.  It is
            passed to ``timeit`` with ``loc`` as its globals.
        norm : float, optional
            Reference duration.  When omitted, the measured duration is
            its own reference and 1.00 is printed.

        Returns
        -------
        The un-normalised value returned by ``timeit``.
        """
        # ``ret`` keeps the raw measurement; ``result`` is rebound to the
        # normalised value by the division below.
        ret = result = timeit(call, globals=loc)
        if norm is None:
            norm = result
        result /= norm
        # The label is the function name, i.e. the text before the "(".
        print(f"{call.split('(')[0]:33s}: {result:.2f}")
        return ret

    print(
        f"transonic {__version__}\n"
        f"pythran {pythran.__version__}\n"
        f"numba {numba.__version__}\n"
    )

    # The first measurement is the reference for all the following ones.
    norm = bench("laplace_transonic_pythran(image)")
    print(f"norm = {norm:.2e} s")
    bench("laplace_loops_transonic_pythran(image)", norm=norm)
    bench("laplace_transonic_cython(image)", norm=norm)
    bench("laplace_loops_transonic_cython(image)", norm=norm)
    bench("laplace_numba(image)", norm=norm)
    bench("laplace_transonic_numba(image)", norm=norm)
    bench("laplace_loops_numba(image)", norm=norm)
    bench("laplace_loops_transonic_numba(image)", norm=norm)
    bench("laplace_numpy(image)", norm=norm)
    bench("laplace_transonic_python(image)", norm=norm)
The results are:
transonic 0.4.0
pythran 0.9.3post1
numba 0.45.1
laplace_transonic_pythran : 1.00
norm = 1.42e-04 s
laplace_loops_transonic_pythran : 0.95
laplace_transonic_cython : 8.36
laplace_loops_transonic_cython : 2.61
laplace_numba : 8.94
laplace_transonic_numba : 8.93
laplace_loops_numba : 0.95
laplace_loops_transonic_numba : 0.95
laplace_numpy : 7.01
laplace_transonic_python : 7.00