Benchmark projection vector
import numpy as np
from transonic import boost, Array, Type
A = Array[Type(np.float64, np.complex128), "3d"]
Af = "float[:,:,:]"
A = Af # issue fused type with Cython
def proj(vx: A, vy: A, vz: A, kx: Af, ky: Af, kz: Af, inv_k_square_nozero: Af):
tmp = (kx * vx + ky * vy + kz * vz) * inv_k_square_nozero
vx -= kx * tmp
vy -= ky * tmp
vz -= kz * tmp
def proj_loop(
vx: A, vy: A, vz: A, kx: Af, ky: Af, kz: Af, inv_k_square_nozero: Af
):
# type annotations only useful for Cython
n0: int
n1: int
n2: int
i0: int
i1: int
i2: int
tmp: float
n0, n1, n2 = kx.shape[0], kx.shape[1], kx.shape[2]
for i0 in range(n0):
for i1 in range(n1):
for i2 in range(n2):
tmp = (
kx[i0, i1, i2] * vx[i0, i1, i2]
+ ky[i0, i1, i2] * vy[i0, i1, i2]
+ kz[i0, i1, i2] * vz[i0, i1, i2]
) * inv_k_square_nozero[i0, i1, i2]
vx[i0, i1, i2] -= kx[i0, i1, i2] * tmp
vy[i0, i1, i2] -= ky[i0, i1, i2] * tmp
vz[i0, i1, i2] -= kz[i0, i1, i2] * tmp
proj_pythran = boost(backend="pythran")(proj)
proj_numba = boost(backend="numba")(proj)
proj_cython = boost(backend="cython")(proj)
proj_loop_pythran = boost(backend="pythran")(proj_loop)
proj_loop_numba = boost(backend="numba")(proj_loop)
proj_loop_cython = boost(backend="cython", boundscheck=False, wraparound=False)(
proj_loop
)
if __name__ == "__main__":
from textwrap import dedent
from transonic.util import print_versions, timeit_verbose
loc = locals()
print_versions()
setup = dedent(
"""
shape = n0, n1, n2 = 64, 512, 512
k0 = np.linspace(0, 100, n0)
k1 = np.linspace(0, 100, n1)
k2 = np.linspace(0, 100, n2)
K1, K0, K2 = np.meshgrid(k1, k0, k2, copy=False)
kz = np.ascontiguousarray(K0)
ky = np.ascontiguousarray(K1)
kx = np.ascontiguousarray(K2)
k_square_nozero = K0 ** 2 + K1 ** 2 + K2 ** 2
k_square_nozero[0, 0, 0] = 1e-14
inv_k_square_nozero = 1.0 / k_square_nozero
vx = np.ones(shape)
vy = np.ones(shape)
vz = np.ones(shape)
"""
)
print()
norm = timeit_verbose(
"proj(vx, vy, vz, kx, ky, kz, inv_k_square_nozero)",
setup=setup,
globals=loc,
)
for backend in ("cython", "numba", "pythran"):
timeit_verbose(
f"proj_{backend}(vx, vy, vz, kx, ky, kz, inv_k_square_nozero)",
setup=setup,
globals=loc,
norm=norm,
)
timeit_verbose(
f"proj_loop_{backend}(vx, vy, vz, kx, ky, kz, inv_k_square_nozero)",
setup=setup,
globals=loc,
norm=norm,
)
This example uses the boost
decorator, so the Python file needs to be
transpiled by Transonic and the accelerator files need to be compiled. You can
for example run from the directory doc/examples/bench_proj_perp
:
make clean
make
python bench.py
The last command gives something like:
Transonic 0.4.1
Pythran 0.9.3post1
Numba 0.46.0
Cython 0.29.13
proj : 1.00 * norm
norm = 5.76e-01 s
proj_cython : 1.26 * norm
proj_loop_cython : 0.18 * norm
proj_numba : 1.34 * norm
proj_loop_numba : 0.15 * norm
proj_pythran : 0.42 * norm
proj_loop_pythran : 0.14 * norm
For the solution with loops, the 3 backends are equally good.
Pythran also accelerates the high level implementation, but in this case, it is still more than twice slower than the implementation with loops.