Writing benchmarks

With Transonic, writing benchmarks for the different accelerators is very simple. We present an example on this page.

Other examples can be found here:

Comparison Numba vs Pythran (JIT)

We take this file, which contains only pure-NumPy code, from this blog post by Florian LE BOURDAIS.

import numpy as np


def laplace_numpy(image):
    """Threshold the discrete Laplacian of a 2D image (vectorized version).

    The 5-point stencil (sum of the four direct neighbours minus four times
    the centre pixel) is evaluated for every interior pixel at once by adding
    shifted slices of ``image``. The border pixels are dropped, so the result
    has two rows and two columns fewer than the input.

    Returns a boolean array that is True wherever the absolute value of the
    Laplacian exceeds 0.05.
    """
    above = image[:-2, 1:-1]
    below = image[2:, 1:-1]
    left = image[1:-1, :-2]
    right = image[1:-1, 2:]
    centre = image[1:-1, 1:-1]
    # Same left-to-right evaluation order as a single chained expression.
    stencil = above + below + left + right - 4 * centre
    return np.abs(stencil) > 0.05


def laplace_loops(image):
    """Threshold the discrete Laplacian of a 2D image (explicit-loop version).

    Every interior pixel is visited with two nested loops; the 5-point
    stencil (four direct neighbours minus four times the centre) is computed
    and compared with 0.05 in absolute value.

    Returns a ``uint8`` array with two rows and two columns fewer than
    ``image``, holding 1 where the threshold is exceeded and 0 elsewhere.
    """
    n_rows, n_cols = image.shape[0], image.shape[1]
    mask = np.empty((n_rows - 2, n_cols - 2), np.uint8)
    for row in range(1, n_rows - 1):
        for col in range(1, n_cols - 1):
            neighbours = (
                image[row - 1, col]
                + image[row + 1, col]
                + image[row, col - 1]
                + image[row, col + 1]
            )
            value = neighbours - 4 * image[row, col]
            # The output drops the one-pixel border, hence the -1 offsets.
            mask[row - 1, col - 1] = np.abs(value) > 0.05
    return mask

Our code for a benchmark in JIT mode:

import numba
from pure_numpy import laplace_loops, laplace_numpy

from transonic import jit

# Wrap the vectorized NumPy implementation once per backend so that the
# variants can be timed against each other.
# NOTE(review): no `backend` is passed on the first line; judging by the
# variable name this presumably selects Transonic's default (Pythran)
# backend -- confirm.
laplace_transonic_pythran = jit(native=True, xsimd=True)(laplace_numpy)
laplace_transonic_python = jit(backend="python")(laplace_numpy)
laplace_transonic_numba = jit(backend="numba")(laplace_numpy)
# Numba used directly, without Transonic, as a reference point.
laplace_numba = numba.njit(laplace_numpy)

# Same set of wrappers for the explicit-loop implementation.
laplace_transonic_pythran_loops = jit(native=True, xsimd=True)(laplace_loops)
laplace_transonic_python_loops = jit(backend="python")(laplace_loops)
laplace_transonic_numba_loops = jit(backend="numba")(laplace_loops)
laplace_numba_loops = numba.njit(laplace_loops)

if __name__ == "__main__":
    # Benchmark driver: build a test image, warm up every wrapped function,
    # then time each one and print the timings relative to a reference.
    from skimage.color import rgb2gray
    from skimage.data import astronaut

    from transonic import wait_for_all_extensions

    # Test input: the scikit-image "astronaut" sample converted to grayscale.
    image = astronaut()
    image = rgb2gray(image)

    # Warm the functions: call each wrapper once before timing so that the
    # first-call (warm-up) cost is not part of the measurement.
    laplace_transonic_python(image)
    laplace_transonic_pythran(image)
    laplace_transonic_pythran_loops(image)
    laplace_transonic_numba(image)
    laplace_transonic_numba_loops(image)
    laplace_numba(image)
    laplace_numba_loops(image)

    # Presumably blocks until the extensions requested above are built --
    # TODO confirm against the Transonic documentation.
    wait_for_all_extensions()

    # Warm the Transonic-Numba wrappers a second time, after the wait.
    # NOTE(review): the reason for this second call is not visible here.
    laplace_transonic_numba(image)
    laplace_transonic_numba_loops(image)

    import pythran
    from transonic import __version__
    from transonic.util import timeit

    # Namespace handed to `timeit` so that the statement strings below can
    # resolve `image` and the wrapped functions.
    loc = locals()

    def bench(call, norm=None):
        """Time the statement `call` and print the result relative to `norm`.

        Parameters
        ----------
        call : str
            Statement to time, evaluated with `loc` as its globals. The text
            before the first "(" is used as the printed label.
        norm : float, optional
            Reference timing used as divisor. When None, the measured timing
            itself is the reference, so the printed ratio is 1.00.

        Returns
        -------
        The raw (un-normalized) timing returned by `timeit`.
        """
        # `ret` keeps the raw timing; `result` is rebound to the ratio below.
        ret = result = timeit(call, globals=loc)
        if norm is None:
            norm = result
        result /= norm
        print(f"{call.split('(')[0]:33s}: {result:.2f}")
        return ret

    # Record the versions of the tools the timings were obtained with.
    print(
        f"transonic {__version__}\n"
        f"pythran {pythran.__version__}\n"
        f"numba {numba.__version__}\n"
    )

    # The Transonic-Pythran NumPy version is the reference (ratio 1.00); all
    # other timings are printed as multiples of it.
    norm = bench("laplace_transonic_pythran(image)")
    print(f"norm = {norm:.2e} s")
    bench("laplace_transonic_pythran_loops(image)", norm=norm)
    bench("laplace_numba(image)", norm=norm)
    bench("laplace_transonic_numba(image)", norm=norm)
    bench("laplace_numba_loops(image)", norm=norm)
    bench("laplace_transonic_numba_loops(image)", norm=norm)
    bench("laplace_numpy(image)", norm=norm)
    bench("laplace_transonic_python(image)", norm=norm)

gives:

transonic 0.4.0
pythran 0.9.3post1
numba 0.45.1

laplace_transonic_pythran        : 1.00
norm = 1.44e-04 s
laplace_transonic_pythran_loops  : 0.94
laplace_numba                    : 8.82
laplace_transonic_numba          : 8.80
laplace_numba_loops              : 0.94
laplace_transonic_numba_loops    : 0.94
laplace_numpy                    : 6.94
laplace_transonic_python         : 7.03

The warmup is much longer for Transonic-Pythran but remember that it is a cached JIT so it is an issue only for the first call of the function. When we reimport the module, there is no warmup.

Then we see that Pythran is very good at optimizing high-level NumPy code! In contrast (with my setup and on my computer), Numba has not been able to optimize this function. However, Numba is good at speeding up the code with loops!

Note that the Transonic overhead is negligible even for this very small case (the shape of the image is (512, 512)).

Note

We don’t use the fastmath option of Numba because the Numba backend does not support it yet!

Ahead-of-time compilation

import numba
import numpy as np

from transonic import Array, boost

# Type alias used to annotate the `image` argument of the functions below:
# a float64 array declared with "2d" and "C" (presumably two dimensions and
# C memory order -- TODO confirm against the Transonic documentation).
Image = Array[np.float64, "2d", "C"]


def laplace_numpy(image: Image):
    """Threshold the discrete Laplacian of a 2D image (vectorized version).

    The four direct neighbours of every interior pixel are summed with
    shifted slices of ``image`` and four times the centre pixel is
    subtracted. The border is dropped, so the result has two rows and two
    columns fewer than the input.

    Returns a boolean array that is True wherever the absolute value of the
    Laplacian exceeds 0.05.
    """
    neighbour_sum = (
        image[:-2, 1:-1] + image[2:, 1:-1] + image[1:-1, :-2] + image[1:-1, 2:]
    )
    response = neighbour_sum - 4 * image[1:-1, 1:-1]
    return np.abs(response) > 0.05


def laplace_loops(image: Image):
    """Threshold the discrete Laplacian of a 2D image (explicit-loop version).

    Two nested loops visit every interior pixel, evaluate the 5-point
    stencil (four direct neighbours minus four times the centre) and compare
    its absolute value with 0.05.

    Returns a ``uint8`` array with two rows and two columns fewer than
    ``image``, holding 1 where the threshold is exceeded and 0 elsewhere.
    """
    height = image.shape[0]
    width = image.shape[1]
    out = np.empty((height - 2, width - 2), np.uint8)
    for y in range(1, height - 1):
        for x in range(1, width - 1):
            stencil = (
                image[y - 1, x]
                + image[y + 1, x]
                + image[y, x - 1]
                + image[y, x + 1]
                - 4 * image[y, x]
            )
            # The output drops the one-pixel border, hence the -1 offsets.
            out[y - 1, x - 1] = np.abs(stencil) > 0.05
    return out


# Wrap the vectorized NumPy implementation with `boost`, once per backend,
# so that the variants can be timed against each other.
laplace_transonic_pythran = boost(backend="pythran")(laplace_numpy)
laplace_transonic_cython = boost(backend="cython")(laplace_numpy)
laplace_transonic_numba = boost(backend="numba")(laplace_numpy)
laplace_transonic_python = boost(backend="python")(laplace_numpy)
# Numba used directly, without Transonic, as a reference point.
laplace_numba = numba.njit(laplace_numpy)


# Same wrappers for the explicit-loop implementation. There is no Cython
# entry in this group: a separate, more heavily annotated loop function is
# defined for that backend.
laplace_loops_transonic_pythran = boost(backend="pythran")(laplace_loops)
laplace_loops_transonic_python = boost(backend="python")(laplace_loops)
laplace_loops_transonic_numba = boost(backend="numba")(laplace_loops)
laplace_loops_numba = numba.njit(laplace_loops)


# For Cython, we need to add more type annotations


@boost(backend="cython", boundscheck=False, wraparound=False)
def laplace_loops_transonic_cython(image: Image):
    """Threshold the discrete Laplacian of a 2D image (loop version for Cython).

    Same computation as the plain loop implementation: for every interior
    pixel, the four direct neighbours are summed, four times the centre is
    subtracted, and the absolute value is compared with 0.05.

    The loop indices, the image dimensions and the output array carry
    explicit type annotations because this function is built with the Cython
    backend, which needs more type information.

    Returns a ``uint8`` array with two rows and two columns fewer than
    ``image``, holding 1 where the threshold is exceeded and 0 elsewhere.
    """
    # Bare annotations: declare the types of the loop indices up front.
    i: int
    j: int
    h: int = image.shape[0]
    w: int = image.shape[1]
    laplacian: Array[np.uint8, "2d"] = np.empty((h - 2, w - 2), np.uint8)
    for i in range(1, h - 1):
        for j in range(1, w - 1):
            laplacian[i - 1, j - 1] = (
                # NOTE(review): builtin `abs` is used here whereas the other
                # loop version uses `np.abs`; presumably intentional for the
                # Cython backend -- confirm.
                abs(
                    image[i - 1, j]
                    + image[i + 1, j]
                    + image[i, j - 1]
                    + image[i, j + 1]
                    - 4 * image[i, j]
                )
                > 0.05
            )
    return laplacian


if __name__ == "__main__":
    # Benchmark driver: build a test image, warm up the Numba-based
    # functions, then time every variant and print the timings relative to a
    # reference.

    from skimage.color import rgb2gray
    from skimage.data import astronaut

    # Test input: the scikit-image "astronaut" sample converted to grayscale.
    image = astronaut()
    image = rgb2gray(image)

    # call these functions to warm them
    # Only the Numba-based variants are called here; presumably the other
    # backends are already compiled ahead of time -- TODO confirm.
    laplace_transonic_numba(image)
    laplace_loops_transonic_numba(image)
    laplace_numba(image)
    laplace_loops_numba(image)

    import pythran
    from transonic import __version__
    from transonic.util import timeit

    # Namespace handed to `timeit` so that the statement strings below can
    # resolve `image` and the wrapped functions.
    loc = locals()

    def bench(call, norm=None):
        """Time the statement `call` and print the result relative to `norm`.

        Parameters
        ----------
        call : str
            Statement to time, evaluated with `loc` as its globals. The text
            before the first "(" is used as the printed label.
        norm : float, optional
            Reference timing used as divisor. When None, the measured timing
            itself is the reference, so the printed ratio is 1.00.

        Returns
        -------
        The raw (un-normalized) timing returned by `timeit`.
        """
        # `ret` keeps the raw timing; `result` is rebound to the ratio below.
        ret = result = timeit(call, globals=loc)
        if norm is None:
            norm = result
        result /= norm
        print(f"{call.split('(')[0]:33s}: {result:.2f}")
        return ret

    # Record the versions of the tools the timings were obtained with.
    print(
        f"transonic {__version__}\n"
        f"pythran {pythran.__version__}\n"
        f"numba {numba.__version__}\n"
    )

    # The Transonic-Pythran NumPy version is the reference (ratio 1.00); all
    # other timings are printed as multiples of it.
    norm = bench("laplace_transonic_pythran(image)")
    print(f"norm = {norm:.2e} s")
    bench("laplace_loops_transonic_pythran(image)", norm=norm)
    bench("laplace_transonic_cython(image)", norm=norm)
    bench("laplace_loops_transonic_cython(image)", norm=norm)
    bench("laplace_numba(image)", norm=norm)
    bench("laplace_transonic_numba(image)", norm=norm)
    bench("laplace_loops_numba(image)", norm=norm)
    bench("laplace_loops_transonic_numba(image)", norm=norm)
    bench("laplace_numpy(image)", norm=norm)
    bench("laplace_transonic_python(image)", norm=norm)

The results are:

transonic 0.4.0
pythran 0.9.3post1
numba 0.45.1

laplace_transonic_pythran        : 1.00
norm = 1.42e-04 s
laplace_loops_transonic_pythran  : 0.95
laplace_transonic_cython         : 8.36
laplace_loops_transonic_cython   : 2.61
laplace_numba                    : 8.94
laplace_transonic_numba          : 8.93
laplace_loops_numba              : 0.95
laplace_loops_transonic_numba    : 0.95
laplace_numpy                    : 7.01
laplace_transonic_python         : 7.00