Writing benchmarks

With Transonic, writing benchmarks for the different accelerators is very simple. We present an example on this page.

Other examples can be found here:

Comparison Numba vs Pythran (JIT)

We take this file with only pure-Numpy code from this blog post by Florian LE BOURDAIS.

import numpy as np


def laplace_numpy(image):
    """Flag strong Laplacian responses in a 2D image using NumPy slicing.

    The 5-point Laplacian is evaluated on every interior pixel by summing
    four shifted views of ``image`` and subtracting four times the centre
    view, so the result has two rows and two columns fewer than the input.

    Returns an array that is True wherever the absolute value of the
    Laplacian is greater than 0.05.
    """
    above = image[:-2, 1:-1]
    below = image[2:, 1:-1]
    left = image[1:-1, :-2]
    right = image[1:-1, 2:]
    centre = image[1:-1, 1:-1]
    # Same left-to-right evaluation order as a single chained expression.
    response = above + below + left + right - 4 * centre
    return np.abs(response) > 0.05


def laplace_loops(image):
    """Flag strong Laplacian responses in a 2D image using explicit loops.

    Every interior pixel is visited; the 5-point Laplacian is computed from
    its four direct neighbours and compared with the 0.05 threshold.

    Returns a ``np.uint8`` array with two rows and two columns fewer than
    ``image``, holding 1 where the absolute Laplacian exceeds 0.05 and 0
    elsewhere.
    """
    n_rows = image.shape[0]
    n_cols = image.shape[1]
    edges = np.empty((n_rows - 2, n_cols - 2), np.uint8)
    for row in range(1, n_rows - 1):
        for col in range(1, n_cols - 1):
            stencil = (
                image[row - 1, col]
                + image[row + 1, col]
                + image[row, col - 1]
                + image[row, col + 1]
                - 4 * image[row, col]
            )
            # The output is shifted by one because the border is dropped.
            edges[row - 1, col - 1] = np.abs(stencil) > 0.05
    return edges

Our code for a benchmark in JIT mode:

from transonic import jit
import numba

from pure_numpy import laplace_numpy, laplace_loops

# Variants of the vectorized NumPy implementation, one per accelerator.
# The benchmark below refers to these names inside statement strings, so
# they must stay in sync with the `bench(...)` calls.
# No backend is passed on the next line; the name suggests Pythran,
# presumably Transonic's default backend (confirm).
laplace_transonic_pythran = jit(native=True, xsimd=True)(laplace_numpy)
laplace_transonic_python = jit(backend="python")(laplace_numpy)
laplace_transonic_numba = jit(backend="numba")(laplace_numpy)
# Numba used directly, without Transonic, as a reference point.
laplace_numba = numba.njit(laplace_numpy)

# The same set of variants for the explicit-loop implementation.
laplace_transonic_pythran_loops = jit(native=True, xsimd=True)(laplace_loops)
# NOTE(review): the next variant is neither warmed nor timed in the
# `__main__` block below.
laplace_transonic_python_loops = jit(backend="python")(laplace_loops)
laplace_transonic_numba_loops = jit(backend="numba")(laplace_loops)
laplace_numba_loops = numba.njit(laplace_loops)

if __name__ == "__main__":
    # Benchmark driver: call every variant once, wait for the extensions,
    # then time each call and print it relative to the first measurement.
    from transonic import wait_for_all_extensions

    from skimage.data import astronaut
    from skimage.color import rgb2gray

    # Input data: the scikit-image "astronaut" picture passed through
    # rgb2gray.
    image = astronaut()
    image = rgb2gray(image)

    # warm the functions
    # (first calls, made before any timing so that the cost of the first
    # call is not part of the measurements)
    laplace_transonic_python(image)
    laplace_transonic_pythran(image)
    laplace_transonic_pythran_loops(image)
    laplace_transonic_numba(image)
    laplace_transonic_numba_loops(image)
    laplace_numba(image)
    laplace_numba_loops(image)

    # Presumably blocks until the extensions requested above are built --
    # confirm against the Transonic documentation.
    wait_for_all_extensions()

    # again warming
    # NOTE(review): only the Transonic-Numba variants are called a second
    # time; the reason is not visible here.
    laplace_transonic_numba(image)
    laplace_transonic_numba_loops(image)

    from transonic.util import timeit
    from transonic import __version__
    import pythran

    # This code runs at module level, so locals() is the module namespace:
    # it gives the timed statement strings access to `image` and to every
    # function variant defined above.
    loc = locals()

    def bench(call, norm=None):
        """Time the statement string ``call`` and print it relative to ``norm``.

        ``call`` is handed to ``timeit`` with ``loc`` as its globals. When
        ``norm`` is None the measurement is divided by itself, so 1.00 is
        printed. The printed label is the part of ``call`` before the first
        ``(``.

        Returns the un-normalised result of ``timeit`` (assuming it is an
        immutable number, so the in-place division below rebinds only
        ``result``).
        """
        ret = result = timeit(call, globals=loc)
        if norm is None:
            norm = result
        result /= norm
        print(f"{call.split('(')[0]:33s}: {result:.2f}")
        return ret

    # Header with the versions of the tools being compared.
    print(
        f"transonic {__version__}\n"
        f"pythran {pythran.__version__}\n"
        f"numba {numba.__version__}\n"
    )

    # The first measurement is the reference; all others are ratios to it.
    norm = bench("laplace_transonic_pythran(image)")
    print(f"norm = {norm:.2e} s")
    bench("laplace_transonic_pythran_loops(image)", norm=norm)
    bench("laplace_numba(image)", norm=norm)
    bench("laplace_transonic_numba(image)", norm=norm)
    bench("laplace_numba_loops(image)", norm=norm)
    bench("laplace_transonic_numba_loops(image)", norm=norm)
    bench("laplace_numpy(image)", norm=norm)
    bench("laplace_transonic_python(image)", norm=norm)

gives:

transonic 0.4.0
pythran 0.9.3post1
numba 0.45.1

laplace_transonic_pythran        : 1.00
norm = 1.44e-04 s
laplace_transonic_pythran_loops  : 0.94
laplace_numba                    : 8.82
laplace_transonic_numba          : 8.80
laplace_numba_loops              : 0.94
laplace_transonic_numba_loops    : 0.94
laplace_numpy                    : 6.94
laplace_transonic_python         : 7.03

The warmup is much longer for Transonic-Pythran but remember that it is a cached JIT so it is an issue only for the first call of the function. When we reimport the module, there is no warmup.

Then we see that Pythran is very good at optimizing high-level NumPy code! In contrast (with my setup and on my computer), Numba has not been able to optimize this function. However, Numba is good at speeding up the code with loops!

Note that the Transonic overhead is negligible even for this very small case (the shape of the image is (512, 512)).

Note

We don’t use the fastmath option of Numba because the Numba backend does not support it yet!

Ahead-of-time compilation

from transonic import boost, Array
import numba

import numpy as np

Image = Array[np.float64, "2d", "C"]


def laplace_numpy(image: Image):
    """Flag strong Laplacian responses in a 2D image using NumPy slicing.

    The four neighbour views are summed, four times the centre view is
    subtracted, and the absolute value is compared with 0.05. Only
    interior pixels are evaluated, so the result has two rows and two
    columns fewer than ``image``.

    Returns an array that is True where the threshold is exceeded.
    """
    centre = image[1:-1, 1:-1]
    neighbours = (
        image[:-2, 1:-1] + image[2:, 1:-1] + image[1:-1, :-2] + image[1:-1, 2:]
    )
    return np.abs(neighbours - 4 * centre) > 0.05


def laplace_loops(image: Image):
    """Flag strong Laplacian responses in a 2D image using explicit loops.

    Each interior pixel is combined with its four direct neighbours
    (5-point Laplacian) and the absolute value is compared with 0.05.

    Returns a ``np.uint8`` array with two rows and two columns fewer than
    ``image``: 1 where the threshold is exceeded, 0 elsewhere.
    """
    height = image.shape[0]
    width = image.shape[1]
    mask = np.empty((height - 2, width - 2), np.uint8)
    for i in range(1, height - 1):
        for j in range(1, width - 1):
            value = (
                image[i - 1, j]
                + image[i + 1, j]
                + image[i, j - 1]
                + image[i, j + 1]
                - 4 * image[i, j]
            )
            # Output indices are shifted by one: the border is not stored.
            mask[i - 1, j - 1] = np.abs(value) > 0.05
    return mask


# Variants of the vectorized NumPy implementation, one per `boost` backend.
# The benchmark below refers to these names inside statement strings, so
# they must stay in sync with the `bench(...)` calls.
laplace_transonic_pythran = boost(backend="pythran")(laplace_numpy)
laplace_transonic_cython = boost(backend="cython")(laplace_numpy)
laplace_transonic_numba = boost(backend="numba")(laplace_numpy)
laplace_transonic_python = boost(backend="python")(laplace_numpy)
# Numba used directly, without Transonic, as a reference point.
laplace_numba = numba.njit(laplace_numpy)


# The same variants for the explicit-loop implementation. The Cython loop
# version is defined separately below as `laplace_loops_transonic_cython`.
laplace_loops_transonic_pythran = boost(backend="pythran")(laplace_loops)
# NOTE(review): the next variant is not timed in the `__main__` block below.
laplace_loops_transonic_python = boost(backend="python")(laplace_loops)
laplace_loops_transonic_numba = boost(backend="numba")(laplace_loops)
laplace_loops_numba = numba.njit(laplace_loops)


# For Cython, we need to add more type annotations

# NOTE(review): boundscheck/wraparound match the names of Cython compiler
# directives; presumably they are forwarded to Cython -- confirm.
@boost(backend="cython", boundscheck=False, wraparound=False)
def laplace_loops_transonic_cython(image: Image):
    """Laplace operator for 2D images, loop version annotated for Cython.

    Uses the same 5-point stencil and 0.05 threshold as ``laplace_loops``,
    but every local variable carries a type annotation and the builtin
    ``abs`` is used instead of ``np.abs``.

    Returns a ``np.uint8`` array of shape ``(h - 2, w - 2)`` holding 1 where
    the absolute Laplacian exceeds 0.05 and 0 elsewhere.
    """
    # Loop indices are declared up front so that they are annotated too.
    i: int
    j: int
    h: int = image.shape[0]
    w: int = image.shape[1]
    laplacian: Array[np.uint8, "2d"] = np.empty((h - 2, w - 2), np.uint8)
    # Only interior pixels are visited; the output index is shifted by one.
    for i in range(1, h - 1):
        for j in range(1, w - 1):
            laplacian[i - 1, j - 1] = (
                abs(
                    image[i - 1, j]
                    + image[i + 1, j]
                    + image[i, j - 1]
                    + image[i, j + 1]
                    - 4 * image[i, j]
                )
                > 0.05
            )
    return laplacian


if __name__ == "__main__":

    # Benchmark driver: time every variant and print it relative to the
    # first measurement.
    from skimage.data import astronaut
    from skimage.color import rgb2gray

    # Input data: the scikit-image "astronaut" picture passed through
    # rgb2gray.
    image = astronaut()
    image = rgb2gray(image)

    # call these functions to warm them
    # (only the Numba-based variants are called before timing)
    laplace_transonic_numba(image)
    laplace_loops_transonic_numba(image)
    laplace_numba(image)
    laplace_loops_numba(image)

    from transonic.util import timeit
    from transonic import __version__
    import pythran

    # This code runs at module level, so locals() is the module namespace:
    # it gives the timed statement strings access to `image` and to every
    # function variant defined above.
    loc = locals()

    def bench(call, norm=None):
        """Time the statement string ``call`` and print it relative to ``norm``.

        ``call`` is handed to ``timeit`` with ``loc`` as its globals. When
        ``norm`` is None the measurement is divided by itself, so 1.00 is
        printed. The printed label is the part of ``call`` before the first
        ``(``.

        Returns the un-normalised result of ``timeit`` (assuming it is an
        immutable number, so the in-place division below rebinds only
        ``result``).
        """
        ret = result = timeit(call, globals=loc)
        if norm is None:
            norm = result
        result /= norm
        print(f"{call.split('(')[0]:33s}: {result:.2f}")
        return ret

    # Header with the versions of the tools being compared.
    print(
        f"transonic {__version__}\n"
        f"pythran {pythran.__version__}\n"
        f"numba {numba.__version__}\n"
    )

    # The first measurement is the reference; all others are ratios to it.
    norm = bench("laplace_transonic_pythran(image)")
    print(f"norm = {norm:.2e} s")
    bench("laplace_loops_transonic_pythran(image)", norm=norm)
    bench("laplace_transonic_cython(image)", norm=norm)
    bench("laplace_loops_transonic_cython(image)", norm=norm)
    bench("laplace_numba(image)", norm=norm)
    bench("laplace_transonic_numba(image)", norm=norm)
    bench("laplace_loops_numba(image)", norm=norm)
    bench("laplace_loops_transonic_numba(image)", norm=norm)
    bench("laplace_numpy(image)", norm=norm)
    bench("laplace_transonic_python(image)", norm=norm)

The results are:

transonic 0.4.0
pythran 0.9.3post1
numba 0.45.1

laplace_transonic_pythran        : 1.00
norm = 1.42e-04 s
laplace_loops_transonic_pythran  : 0.95
laplace_transonic_cython         : 8.36
laplace_loops_transonic_cython   : 2.61
laplace_numba                    : 8.94
laplace_transonic_numba          : 8.93
laplace_loops_numba              : 0.95
laplace_loops_transonic_numba    : 0.95
laplace_numpy                    : 7.01
laplace_transonic_python         : 7.00