Slowest to Fastest:
2. with sequential addressing.

Further removing instruction bottlenecks:
Numba code (possible off-by-one errors):
import numba
from numba import cuda

TPB = 128  # threads per block; assumed a power of two (shared-memory size must be a compile-time constant)

@cuda.jit
def reduce(out, a, size):
    cache = cuda.shared.array(TPB, numba.float32)
    i = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x
    local_i = cuda.threadIdx.x
    stride = TPB // 2
    # First reduction on the global read: the low half of the block sums pairs
    # that are `stride` apart; the rest of shared memory is zeroed so every
    # later read is defined.
    cache[local_i] = 0.0
    if local_i < stride:
        if i + stride < size:
            cache[local_i] = a[i] + a[i + stride]
        elif i < size:
            cache[local_i] = a[i]
    cuda.syncthreads()
    # The global read already combined elements `stride` apart, so halve once.
    stride >>= 1
    # Sequential addressing with thread ids: the active threads are the
    # contiguous low ids, so warps stay non-divergent until one warp remains.
    while stride > 32:
        if local_i < stride:
            cache[local_i] = cache[local_i] + cache[local_i + stride]
        cuda.syncthreads()
        stride >>= 1
    # Only 1 warp left, so no need to syncthreads(). This relies on
    # warp-synchronous execution; on GPUs with independent thread scheduling
    # (Volta and newer) cuda.syncwarp() between steps is the safer choice.
    if local_i < 32:
        if local_i + 32 < TPB: cache[local_i] = cache[local_i] + cache[local_i + 32]
        if local_i + 16 < TPB: cache[local_i] = cache[local_i] + cache[local_i + 16]
        if local_i + 8 < TPB: cache[local_i] = cache[local_i] + cache[local_i + 8]
        if local_i + 4 < TPB: cache[local_i] = cache[local_i] + cache[local_i + 4]
        if local_i + 2 < TPB: cache[local_i] = cache[local_i] + cache[local_i + 2]
        if local_i + 1 < TPB: cache[local_i] = cache[local_i] + cache[local_i + 1]
    # Write one partial sum per block (out is assumed to have one slot per block).
    if local_i == 0:
        out[cuda.blockIdx.x] = cache[0]
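
A minimal launch sketch, assuming the `reduce` kernel and `TPB` constant above (the helper name `sum_on_gpu` and the final host-side sum of the per-block partials are illustrative, not part of the original): each block writes one partial sum, so the output array needs one slot per block.

import numpy as np
from numba import cuda

def sum_on_gpu(x):
    size = x.shape[0]
    blocks = (size + TPB - 1) // TPB               # one block per TPB input elements
    a = cuda.to_device(x.astype(np.float32))
    partials = cuda.device_array(blocks, dtype=np.float32)
    reduce[blocks, TPB](partials, a, size)
    # Combine the per-block partials on the host; a second reduce launch over
    # `partials` would also work if the number of blocks is itself large.
    return partials.copy_to_host().sum()

x = np.random.rand(1_000_000).astype(np.float32)
print(sum_on_gpu(x), x.sum())  # should agree up to float32 rounding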