Calling an accelerated FFT from a Python Numba CUDA kernel

2022-04-02  python fft numba cuda jit

Problem description

I need to compute the Fourier transform of a 256-element Float64 signal. The requirement is that I have to call these FFTs from inside a cuda.jit section, and they must complete within 25 µsec. Alas, cuda.jit-compiled functions do not allow calling external libraries, so I wrote my own. Alas, my single-core code is still far too slow (about 250 µsec on a Quadro P4000). Is there a better way?

I created a single-core FFT function that produces correct results, but it is about 10x too slow. I don't understand how to make good use of multiple cores.
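(An aside, not part of the original question:) since external FFT libraries cannot be called from inside a cuda.jit function, correctness is easiest to confirm on the host. A hedged validation sketch, with a hypothetical matches_reference helper, compares the GPU result against numpy.fft.fft:

import numpy as np

# Hypothetical helper: gpu_spectrum should already be copied back to the host
# (e.g. with cupy.asnumpy or copy_to_host()); it is compared against NumPy's
# reference FFT of the same real-valued samples.
def matches_reference(samples, gpu_spectrum, tol=1e-9):
    reference = np.fft.fft(np.asarray(samples, dtype=np.float64))
    return np.allclose(np.asarray(gpu_spectrum), reference, atol=tol)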

---fft.py

from numba import cuda, boolean, void, int32, float32, float64, complex128
import math, sys, cmath

def _transform_radix2(vector, inverse, out):    
    n = len(vector) 
    levels = int32(math.log(float32(n))/math.log(float32(2)))

    assert 2**levels==n # error: Length is not a power of 2 

    # uncomment either the Numba CUDA or the NumPy memory allocation (conditional compilation would be nicer)
    exptable = cuda.local.array(1024, dtype=complex128)   
    #exptable = np.zeros(1024, np.complex128)

    assert (n // 2) <= len(exptable)  # error: FFT length > MAXFFTSIZE

    coef = complex128((2j if inverse else -2j) * math.pi / n)   
    for i in range(n // 2):                       
        exptable[i] = cmath.exp(i * coef)       

    for i in range(n):
        x = i   
        y = 0
        for j in range(levels):
            y = (y << 1) | (x & 1)
            x >>= 1
        out[i] = vector[y]      

    size = 2
    while size <= n:
        halfsize = size // 2
        tablestep = n // size
        for i in range(0, n, size):
            k = 0
            for j in range(i, i + halfsize):
                temp = out[j + halfsize] * exptable[k]    
                out[j + halfsize] = out[j] - temp
                out[j] += temp
                k += tablestep
        size *= 2

    scale=float64(n if inverse else 1)
    for i in range(n):
        out[i]=out[i]/scale   # the inverse requires a scaling

# now create the Numba.cuda version to be called by a GPU
gtransform_radix2 = cuda.jit(device=True)(_transform_radix2)

---test.py

from numba import cuda, void, float64, complex128, boolean
import cupy as cp
import numpy as np
import timeit  
import fft

@cuda.jit(void(float64[:],boolean, complex128[:]))    
def fftbench(y, inverse, FT):
  Y  = cuda.local.array(256, dtype=complex128)

  for i in range(len(y)):
    Y[i]=complex128(y[i])    
  fft.gtransform_radix2(Y, False, FT)


fmt = '\nbest [%2d/%2d] iterations, min:[%9.3f], max:[%9.3f], mean:[%9.3f], std:[%9.3f] usec'
a=[127.734375 ,130.87890625 ,132.1953125  ,129.62109375 ,118.6015625
 ,110.2890625  ,106.55078125 ,104.8203125  ,106.1875     ,109.328125
 ,113.5        ,118.6640625  ,125.71875    ,127.625      ,120.890625
 ,114.04296875 ,112.0078125  ,112.71484375 ,110.18359375 ,104.8828125
 ,104.47265625 ,106.65625    ,109.53515625 ,110.73828125 ,111.2421875
 ,112.28125    ,112.38671875 ,112.7734375  ,112.7421875  ,113.1328125
 ,113.24609375 ,113.15625    ,113.66015625 ,114.19921875 ,114.5
 ,114.5546875  ,115.09765625 ,115.2890625  ,115.7265625  ,115.41796875
 ,115.73828125 ,116.         ,116.55078125 ,116.5625     ,116.33984375
 ,116.63671875 ,117.015625   ,117.25       ,117.41015625 ,117.6640625
 ,117.859375   ,117.91015625 ,118.38671875 ,118.51171875 ,118.69921875
 ,118.80859375 ,118.67578125 ,118.78125    ,118.49609375 ,119.0078125
 ,119.09375    ,119.15234375 ,119.33984375 ,119.31640625 ,119.6640625
 ,119.890625   ,119.80078125 ,119.69140625 ,119.65625    ,119.83984375
 ,119.9609375  ,120.15625    ,120.2734375  ,120.47265625 ,120.671875
 ,120.796875   ,120.4609375  ,121.1171875  ,121.35546875 ,120.94921875
 ,120.984375   ,121.35546875 ,120.87109375 ,120.8359375  ,121.2265625
 ,121.2109375  ,120.859375   ,121.17578125 ,121.60546875 ,121.84375
 ,121.5859375  ,121.6796875  ,121.671875   ,121.78125    ,121.796875
 ,121.8828125  ,121.9921875  ,121.8984375  ,122.1640625  ,121.9375
 ,122.         ,122.3515625  ,122.359375   ,122.1875     ,122.01171875
 ,121.91015625 ,122.11328125 ,122.1171875  ,122.6484375  ,122.81640625
 ,122.33984375 ,122.265625   ,122.78125    ,122.44921875 ,122.34765625
 ,122.59765625 ,122.63671875 ,122.6796875  ,122.6171875  ,122.34375
 ,122.359375   ,122.7109375  ,122.83984375 ,122.546875   ,122.25390625
 ,122.06640625 ,122.578125   ,122.7109375  ,122.83203125 ,122.5390625
 ,122.2421875  ,122.06640625 ,122.265625   ,122.13671875 ,121.8046875
 ,121.87890625 ,121.88671875 ,122.2265625  ,121.63671875 ,121.14453125
 ,120.84375    ,120.390625   ,119.875      ,119.34765625 ,119.0390625
 ,118.4609375  ,117.828125   ,117.1953125  ,116.9921875  ,116.046875
 ,115.16015625 ,114.359375   ,113.1875     ,110.390625   ,108.41796875
 ,111.90234375 ,117.296875   ,127.0234375  ,147.58984375 ,158.625
 ,129.8515625  ,120.96484375 ,124.90234375 ,130.17578125 ,136.47265625
 ,143.9296875  ,150.24609375 ,141.         ,117.71484375 ,109.80859375
 ,115.24609375 ,118.44140625 ,120.640625   ,120.9921875  ,111.828125
 ,101.6953125  ,111.21484375 ,114.91015625 ,115.2265625  ,118.21875
 ,125.3359375  ,139.44140625 ,139.76953125 ,135.84765625 ,137.3671875
 ,141.67578125 ,139.53125    ,136.44921875 ,135.08203125 ,135.7890625
 ,137.58203125 ,138.7265625  ,154.33203125 ,172.01171875 ,152.24609375
 ,129.8046875  ,125.59375    ,125.234375   ,127.32421875 ,132.8984375
 ,147.98828125 ,152.328125   ,153.7734375  ,155.09765625 ,156.66796875
 ,159.0546875  ,151.83203125 ,138.91796875 ,138.0546875  ,140.671875
 ,143.48046875 ,143.99609375 ,146.875      ,146.7578125  ,141.15234375
 ,141.5        ,140.76953125 ,140.8828125  ,145.5625     ,150.78125
 ,148.89453125 ,150.02734375 ,150.70703125 ,152.24609375 ,148.47265625
 ,131.95703125 ,125.40625    ,123.265625   ,123.57421875 ,129.859375
 ,135.6484375  ,144.51171875 ,155.05078125 ,158.4453125  ,140.8125
 ,100.08984375 ,104.29296875 ,128.55078125 ,139.9921875  ,143.38671875
 ,143.69921875 ,137.734375   ,124.48046875 ,116.73828125 ,114.84765625
 ,113.85546875 ,117.45703125 ,122.859375   ,125.8515625  ,133.22265625
 ,139.484375   ,135.75       ,122.69921875 ,115.7734375  ,116.9375
 ,127.57421875] 
y1 =cp.zeros(len(a), cp.float64)    # real-valued samples, matching the float64[:] kernel signature
FT1=cp.zeros(len(a), cp.complex128)

for i in range(len(a)):
  y1[i]=a[i]  # copy the samples into a device array to feed the FFT

r=1000
series=sorted(timeit.repeat("fftbench[1, 1](y1, False, FT1)", number=1, repeat=r, globals=globals()))
series=series[0:r-5]
print(fmt % (len(series), r, 1e6*np.min(series), 1e6*np.max(series), 1e6*np.mean(series), 1e6*np.std(series)))



What I am looking for: a faster implementation, with t << 25 µsec.
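An aside on the benchmark above (not raised in the original question): Numba CUDA kernel launches are asynchronous, so timing a bare launch mostly measures launch overhead. A hedged variant that adds an explicit device synchronization, reusing the fftbench, y1 and FT1 names from test.py, could look like this:

from numba import cuda
import numpy as np
import timeit

def time_kernel(repeat=1000):
    # cuda.synchronize() makes the measurement cover the kernel execution,
    # not just the asynchronous launch call.
    def run_once():
        fftbench[1, 1](y1, False, FT1)
        cuda.synchronize()
    samples = timeit.repeat(run_once, number=1, repeat=repeat)
    return 1e6 * np.min(samples), 1e6 * np.mean(samples)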

Solution

The drawback of your algorithm is that, even on the GPU, it runs on a single core (a single thread).

To understand how to design algorithms on NVIDIA GPGPUs, I suggest looking at the CUDA C Programming Guide and then at the Numba documentation to apply it in Python code.

Furthermore, to understand what is going wrong in your code, I recommend using the NVIDIA profiler.

The following parts of this answer explain how to apply these basics to your example.


Running multiple threads

To improve performance, you first need to launch several threads that can run in parallel. CUDA handles threads as follows:

  • Threads are grouped into blocks of n threads (n ≤ 1024).

  • Every thread within the same block can be synchronized and has access to a (fast) common memory space called "shared memory".

  • You can run several blocks in parallel in a "grid", but you lose the synchronization mechanism across blocks.

The syntax to run multiple threads is:

fftbench[griddim, blockdim](y1, False, FT1)

For simplicity, I will only use one block of size 256:

fftbench[1, 256](y1, False, FT1)
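To make the launch configuration concrete, here is a minimal, self-contained sketch (a hypothetical scale_by_two kernel, unrelated to the FFT code) showing how griddim and blockdim map to per-thread work:

from numba import cuda
import numpy as np

@cuda.jit
def scale_by_two(arr):
    # Global index of this thread across the whole grid.
    i = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x
    if i < arr.shape[0]:        # guard threads that fall outside the array
        arr[i] *= 2.0

data = cuda.to_device(np.arange(512, dtype=np.float64))
scale_by_two[2, 256](data)      # griddim=2 blocks of blockdim=256 threads each
print(data.copy_to_host()[:4])  # [0. 2. 4. 6.]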

Memory

To improve GPU performance, it is important to look at where the data will be stored. There are three main spaces:

  • Global memory: this is the "RAM" of your GPU; it is slow and has a high latency. This is where all your arrays end up when you send them to the GPU.

  • Shared memory: this is memory with faster access; all the threads of a block have access to the same shared memory.

  • Local memory: physically the same as global memory, but each thread accesses its own local memory.

Typically, if you use the same data several times, you should try to store it in shared memory to avoid the latency of global memory.

In your code, you can store exptable in shared memory:

exptable = cuda.shared.array(1024, dtype=complex128)

And if n is not too big, you may want to use a working array instead of out:

working = cuda.shared.array(256, dtype=complex128)
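As a rough check on this design choice (an aside with assumed, typical limits): a complex128 element is 16 bytes, so the two shared arrays together use about 20 KB, which fits comfortably in the roughly 48 KB of shared memory usually available per block:

# Back-of-the-envelope shared-memory budget for one block.
COMPLEX128_BYTES = 16
exptable_bytes = 1024 * COMPLEX128_BYTES   # 16384 bytes
working_bytes  = 256  * COMPLEX128_BYTES   #  4096 bytes
print(exptable_bytes + working_bytes)      # 20480 bytes, under a typical 48 KB/block limit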

Assigning a task to each thread

Of course, if you do not change the function, every thread will do the same work, which will only slow your program down.

In this example, we will assign each thread to one cell of the array. To do so, we have to get the unique id of the thread within its block:

idx = cuda.threadIdx.x

Now we will be able to speed up the for loops; let's handle them one by one:

exptable = cuda.shared.array(1024, dtype=complex128)   
...
for i in range(n // 2):                       
    exptable[i] = cmath.exp(i * coef)       

The goal is: we want the first n/2 threads to fill this array, so that afterwards every thread can use it.

So in this case, simply replace the for loop with a condition on the thread idx:

if idx < n // 2:
    exptable[idx] = cmath.exp(idx * coef)

The last two loops are easier: each thread will handle one cell of the array:

for i in range(n):
    x = i   
    y = 0
    for j in range(levels):
        y = (y << 1) | (x & 1)
        x >>= 1
    out[i] = vector[y]   

becomes

x = idx   
y = 0
for j in range(levels):
    y = (y << 1) | (x & 1)
    x >>= 1
working[idx] = vector[y]

for i in range(n):
    out[i]=out[i]/scale   # the inverse requires a scaling

becomes

out[idx]=working[idx]/scale   # the inverse requires a scaling

I used the shared array working, but you can replace it with out if you want to use global memory.

Now let's look at the while loop. We said that we want each thread to handle only one cell of the array, so we can try to parallelize the two for loops inside it.

    ...
    for i in range(0, n, size):
        k = 0
        for j in range(i, i + halfsize):
            temp = out[j + halfsize] * exptable[k]    
            out[j + halfsize] = out[j] - temp
            out[j] += temp
            k += tablestep
    ...

To keep it simple, I will only use half of the threads: we take the first 128 threads and determine j as follows:

    ...
    if idx < 128:
        j = (idx%halfsize) + size*(idx//halfsize)
    ...

and k as:

    k = tablestep*(idx%halfsize)

So the loop becomes:

size = 2
while size <= n:
    halfsize = size // 2
    tablestep = n // size

    if idx < 128:
        j = (idx%halfsize) + size*(idx//halfsize)            
        k = tablestep*(idx%halfsize)
        temp = working[j + halfsize] * exptable[k]
        working[j + halfsize] = working[j] - temp
        working[j] += temp
    size *= 2
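As a quick host-side sanity check (a sketch, not part of the kernel), you can verify that this idx-based indexing visits exactly the same (j, k) pairs as the original nested loops for n = 256:

def original_pairs(n, size):
    halfsize, tablestep = size // 2, n // size
    pairs = set()
    for i in range(0, n, size):
        k = 0
        for j in range(i, i + halfsize):
            pairs.add((j, k))
            k += tablestep
    return pairs

def thread_pairs(n, size):
    halfsize, tablestep = size // 2, n // size
    pairs = set()
    for idx in range(n // 2):   # the first 128 "threads" when n == 256
        j = (idx % halfsize) + size * (idx // halfsize)
        k = tablestep * (idx % halfsize)
        pairs.add((j, k))
    return pairs

n, size = 256, 2
while size <= n:
    assert original_pairs(n, size) == thread_pairs(n, size)
    size *= 2
print("index mapping matches for every stage")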

Synchronization

Last but not least, we need to synchronize all these threads. Indeed, the program will not work if we do not. On the GPU, threads may not run at the same time, so problems can occur when data produced by one thread is used by another, for example:

  • exptable[0] is used by THREAD_2 before THREAD_0 has finished storing its value

  • working[j + halfsize] is modified by another thread before you have stored it in temp

To prevent this, we can use the function:

cuda.syncthreads()

All the threads of the same block will finish this line before executing the rest of the code.
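Here is a minimal, self-contained illustration of the pattern (a hypothetical broadcast_from_thread0 kernel, not part of the FFT code): one thread writes a shared value that every other thread then reads, so a barrier is required between the write and the reads:

from numba import cuda, float64
import numpy as np

@cuda.jit
def broadcast_from_thread0(src, dst):
    buf = cuda.shared.array(1, dtype=float64)
    if cuda.threadIdx.x == 0:
        buf[0] = src[0]        # produced by a single thread...
    cuda.syncthreads()         # ...must be visible before the others read it
    dst[cuda.threadIdx.x] = buf[0]

src = cuda.to_device(np.array([42.0]))
dst = cuda.device_array(256, dtype=np.float64)
broadcast_from_thread0[1, 256](src, dst)
print(dst.copy_to_host()[:3])  # [42. 42. 42.]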

In this example, you need to synchronize at two points: after the initialization of working and after each iteration of the while loop.

Your code then looks like this:

def _transform_radix2(vector, inverse, out):    
  n = len(vector) 
  idx = cuda.threadIdx.x   # unique index of this thread within its block
  levels = int32(math.log(float32(n))/math.log(float32(2)))

  assert 2**levels==n # error: Length is not a power of 2 

  exptable = cuda.shared.array(1024, dtype=complex128)
  working = cuda.shared.array(256, dtype=complex128)

  assert (n // 2) <= len(exptable)  # error: FFT length > MAXFFTSIZE

  coef = complex128((2j if inverse else -2j) * math.pi / n)   
  if idx < n // 2:
    exptable[idx] = cmath.exp(idx * coef)    

  x = idx   
  y = 0
  for j in range(levels):
    y = (y << 1) | (x & 1)
    x >>= 1
  working[idx] = vector[y]    
  cuda.syncthreads()

  size = 2
  while size <= n:
    halfsize = size // 2
    tablestep = n // size

    if idx < 128:
      j = (idx%halfsize) + size*(idx//halfsize)            
      k = tablestep*(idx%halfsize)
      temp = working[j + halfsize] * exptable[k]
      working[j + halfsize] = working[j] - temp
      working[j] += temp
    size *= 2
    cuda.syncthreads()

  scale=float64(n if inverse else 1)
  out[idx]=working[idx]/scale   # the inverse requires a scaling
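The answer does not show the updated host side; one possible way to wire the rewritten device function up, reusing the names from fft.py and test.py (a sketch under that assumption, not a definitive harness), is:

# Compile the rewritten function as a device function, as in fft.py.
gtransform_radix2 = cuda.jit(device=True)(_transform_radix2)

@cuda.jit(void(float64[:], boolean, complex128[:]))
def fftbench(y, inverse, FT):
    idx = cuda.threadIdx.x
    Y = cuda.shared.array(256, dtype=complex128)   # shared: threads read bit-reversed cells written by others
    Y[idx] = complex128(y[idx])
    cuda.syncthreads()                             # all inputs must be visible before the FFT starts
    gtransform_radix2(Y, inverse, FT)

# One block of 256 threads, i.e. one thread per sample.
fftbench[1, 256](y1, False, FT1)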

I think your question is a good introduction to some basics of GPGPU computing, and I have tried to answer it in a didactic way. The final code is far from perfect and could be optimized a lot further; if you want to learn more about GPU optimization, I strongly recommend reading the CUDA Programming Guide.
