Benchmark projection vector

import numpy as np
from transonic import boost, Array, Type

A = Array[Type(np.float64, np.complex128), "3d"]
Af = "float[:,:,:]"
A = Af  # issue fused type with Cython


def proj(vx: A, vy: A, vz: A, kx: Af, ky: Af, kz: Af, inv_k_square_nozero: Af):
    tmp = (kx * vx + ky * vy + kz * vz) * inv_k_square_nozero
    vx -= kx * tmp
    vy -= ky * tmp
    vz -= kz * tmp


def proj_loop(
    vx: A, vy: A, vz: A, kx: Af, ky: Af, kz: Af, inv_k_square_nozero: Af
):

    # type annotations only useful for Cython
    n0: int
    n1: int
    n2: int
    i0: int
    i1: int
    i2: int
    tmp: float

    n0, n1, n2 = kx.shape[0], kx.shape[1], kx.shape[2]

    for i0 in range(n0):
        for i1 in range(n1):
            for i2 in range(n2):
                tmp = (
                    kx[i0, i1, i2] * vx[i0, i1, i2]
                    + ky[i0, i1, i2] * vy[i0, i1, i2]
                    + kz[i0, i1, i2] * vz[i0, i1, i2]
                ) * inv_k_square_nozero[i0, i1, i2]

                vx[i0, i1, i2] -= kx[i0, i1, i2] * tmp
                vy[i0, i1, i2] -= ky[i0, i1, i2] * tmp
                vz[i0, i1, i2] -= kz[i0, i1, i2] * tmp


proj_pythran = boost(backend="pythran")(proj)
proj_numba = boost(backend="numba")(proj)
proj_cython = boost(backend="cython")(proj)

proj_loop_pythran = boost(backend="pythran")(proj_loop)
proj_loop_numba = boost(backend="numba")(proj_loop)
proj_loop_cython = boost(backend="cython", boundscheck=False, wraparound=False)(
    proj_loop
)


if __name__ == "__main__":
    from textwrap import dedent

    from transonic.util import print_versions, timeit_verbose

    loc = locals()

    print_versions()

    setup = dedent(
        """
        shape = n0, n1, n2 = 64, 512, 512
        k0 = np.linspace(0, 100, n0)
        k1 = np.linspace(0, 100, n1)
        k2 = np.linspace(0, 100, n2)
        K1, K0, K2 = np.meshgrid(k1, k0, k2, copy=False)
        kz = np.ascontiguousarray(K0)
        ky = np.ascontiguousarray(K1)
        kx = np.ascontiguousarray(K2)

        k_square_nozero = K0 ** 2 + K1 ** 2 + K2 ** 2
        k_square_nozero[0, 0, 0] = 1e-14
        inv_k_square_nozero = 1.0 / k_square_nozero

        vx = np.ones(shape)
        vy = np.ones(shape)
        vz = np.ones(shape)
    """
    )

    print()
    norm = timeit_verbose(
        "proj(vx, vy, vz, kx, ky, kz, inv_k_square_nozero)",
        setup=setup,
        globals=loc,
    )

    for backend in ("cython", "numba", "pythran"):
        timeit_verbose(
            f"proj_{backend}(vx, vy, vz, kx, ky, kz, inv_k_square_nozero)",
            setup=setup,
            globals=loc,
            norm=norm,
        )
        timeit_verbose(
            f"proj_loop_{backend}(vx, vy, vz, kx, ky, kz, inv_k_square_nozero)",
            setup=setup,
            globals=loc,
            norm=norm,
        )

This example uses the boost decorator, so the Python file needs to be transpiled by Transonic and the accelerator files need to be compiled. You can for example run from the directory doc/examples/bench_proj_perp:

make clean
make
python bench.py

The last command gives something like:

Transonic 0.4.1
Pythran 0.9.3post1
Numba 0.46.0
Cython 0.29.13

proj                             : 1.00 * norm
norm = 5.76e-01 s
proj_cython                      : 1.26 * norm
proj_loop_cython                 : 0.18 * norm
proj_numba                       : 1.34 * norm
proj_loop_numba                  : 0.15 * norm
proj_pythran                     : 0.42 * norm
proj_loop_pythran                : 0.14 * norm

For the solution with loops, the 3 backends are equally good.

Pythran also accelerates the high level implementation, but in this case, it is still more than twice slower than the implementation with loops.