# Benchmark projection vector

```import numpy as np
from transonic import boost, Array, Type

A = Array[Type(np.float64, np.complex128), "3d"]
Af = "float[:,:,:]"
A = Af  # issue fused type with Cython

def proj(vx: A, vy: A, vz: A, kx: Af, ky: Af, kz: Af, inv_k_square_nozero: Af):
tmp = (kx * vx + ky * vy + kz * vz) * inv_k_square_nozero
vx -= kx * tmp
vy -= ky * tmp
vz -= kz * tmp

def proj_loop(
vx: A, vy: A, vz: A, kx: Af, ky: Af, kz: Af, inv_k_square_nozero: Af
):

# type annotations only useful for Cython
n0: int
n1: int
n2: int
i0: int
i1: int
i2: int
tmp: float

n0, n1, n2 = kx.shape[0], kx.shape[1], kx.shape[2]

for i0 in range(n0):
for i1 in range(n1):
for i2 in range(n2):
tmp = (
kx[i0, i1, i2] * vx[i0, i1, i2]
+ ky[i0, i1, i2] * vy[i0, i1, i2]
+ kz[i0, i1, i2] * vz[i0, i1, i2]
) * inv_k_square_nozero[i0, i1, i2]

vx[i0, i1, i2] -= kx[i0, i1, i2] * tmp
vy[i0, i1, i2] -= ky[i0, i1, i2] * tmp
vz[i0, i1, i2] -= kz[i0, i1, i2] * tmp

proj_pythran = boost(backend="pythran")(proj)
proj_numba = boost(backend="numba")(proj)
proj_cython = boost(backend="cython")(proj)

proj_loop_pythran = boost(backend="pythran")(proj_loop)
proj_loop_numba = boost(backend="numba")(proj_loop)
proj_loop_cython = boost(backend="cython", boundscheck=False, wraparound=False)(
proj_loop
)

if __name__ == "__main__":
from textwrap import dedent

from transonic.util import print_versions, timeit_verbose

loc = locals()

print_versions()

setup = dedent(
"""
shape = n0, n1, n2 = 64, 512, 512
k0 = np.linspace(0, 100, n0)
k1 = np.linspace(0, 100, n1)
k2 = np.linspace(0, 100, n2)
K1, K0, K2 = np.meshgrid(k1, k0, k2, copy=False)
kz = np.ascontiguousarray(K0)
ky = np.ascontiguousarray(K1)
kx = np.ascontiguousarray(K2)

k_square_nozero = K0 ** 2 + K1 ** 2 + K2 ** 2
k_square_nozero[0, 0, 0] = 1e-14
inv_k_square_nozero = 1.0 / k_square_nozero

vx = np.ones(shape)
vy = np.ones(shape)
vz = np.ones(shape)
"""
)

print()
norm = timeit_verbose(
"proj(vx, vy, vz, kx, ky, kz, inv_k_square_nozero)",
setup=setup,
globals=loc,
)

for backend in ("cython", "numba", "pythran"):
timeit_verbose(
f"proj_{backend}(vx, vy, vz, kx, ky, kz, inv_k_square_nozero)",
setup=setup,
globals=loc,
norm=norm,
)
timeit_verbose(
f"proj_loop_{backend}(vx, vy, vz, kx, ky, kz, inv_k_square_nozero)",
setup=setup,
globals=loc,
norm=norm,
)
```

This example uses the `boost` decorator, so the Python file needs to be transpiled by Transonic and the accelerator files need to be compiled. You can for example run from the directory `doc/examples/bench_proj_perp`:

```make clean
make
python bench.py
```

The last command gives something like:

```Transonic 0.4.1
Pythran 0.9.3post1
Numba 0.46.0
Cython 0.29.13

proj                             : 1.00 * norm
norm = 5.76e-01 s
proj_cython                      : 1.26 * norm
proj_loop_cython                 : 0.18 * norm
proj_numba                       : 1.34 * norm
proj_loop_numba                  : 0.15 * norm
proj_pythran                     : 0.42 * norm
proj_loop_pythran                : 0.14 * norm
```

For the solution with loops, the 3 backends are equally good.

Pythran also accelerates the high level implementation, but in this case, it is still more than twice slower than the implementation with loops.