CUDA:将单个位集与位集数组进行异或
我希望将单个位集与一组其他位集(~100k)进行XOR运算,并计算每个XOR结果的集合位数。单个位集的大小约为20k位。
位集已转换为unsigned int
数组,以便能够使用内在__popc()
函数。‘Bunch’已连续驻留在设备内存中。
我当前的内核代码如下:
// Grid/Blocks used for kernel invocation
dim3 block(32);
dim3 grid((bunch_size / 31) + 32);
__global__ void kernelXOR(uint * bitset, uint * bunch, int * set_bits, int bitset_size, int bunch_size) {
int tid = blockIdx.x*blockDim.x + threadIdx.x;
if (tid < bunch_size){ // 1 Thread for each bitset in the 'bunch'
int sum = 0;
uint xor_res = 0;
for (int i = 0; i < bitset_size; ++i){ // Iterate through every uint-block of the bitsets
xor_res = bitset[i] ^ bunch[bitset_size * tid + i];
sum += __popc(xor_res);
}
set_bits[tid] = sum;
}
}
然而,与并行化的C++/Boost版本相比,我认为使用CUDA没有任何好处。
是否有优化此内核的潜力?
解决方案
是否有优化此内核的潜力?
我在这里看到了两个问题(它们是任何CUDA程序员的前两个经典的主要优化目标):
您希望尝试有效地使用全局内存。您对
bitset
和bunch
的访问未合并。(高效使用内存子系统)通常不建议每个块使用32个线程,这可能会限制您的总体占用。每比特集一个线程也有潜在的问题。(公开足够的并行性)
如果没有比较测试用例,就不可能说解决这些问题是否符合您对收益的定义。此外,在CUDA中,当单独考虑这样的简单内存限制问题时,很少会有兴趣。但是,我们(可能)可以提高您的内核的性能。
我们将使用一系列想法:
- 让每个块处理一个位集,而不是每个线程,以启用合并
- 使用共享内存加载比较位集并重新使用
- 仅使用足够的块使GPU饱和,striding loops
- 使用
const ... __restrict__
样式装饰可能受益于RO缓存
这里有一个工作示例:
$ cat t1649.cu
#include <iostream>
#include <cstdlib>
const int my_bitset_size = 20000/(32);
const int my_bunch_size = 100000;
typedef unsigned uint;
//using one thread per bitset in the bunch
__global__ void kernelXOR(uint * bitset, uint * bunch, int * set_bits, int bitset_size, int bunch_size) {
int tid = blockIdx.x*blockDim.x + threadIdx.x;
if (tid < bunch_size){ // 1 Thread for each bitset in the 'bunch'
int sum = 0;
uint xor_res = 0;
for (int i = 0; i < bitset_size; ++i){ // Iterate through every uint-block of the bitsets
xor_res = bitset[i] ^ bunch[bitset_size * tid + i];
sum += __popc(xor_res);
}
set_bits[tid] = sum;
}
}
const int nTPB = 256;
// one block per bitset, multiple bitsets per block
__global__ void kernelXOR_imp(const uint * __restrict__ bitset, const uint * __restrict__ bunch, int * __restrict__ set_bits, int bitset_size, int bunch_size) {
__shared__ uint sbitset[my_bitset_size]; // could also be dynamically allocated for varying bitset sizes
__shared__ int ssum[nTPB];
// load shared, block-stride loop
for (int idx = threadIdx.x; idx < bitset_size; idx += blockDim.x) sbitset[idx] = bitset[idx];
__syncthreads();
// stride across all bitsets in bunch
for (int bidx = blockIdx.x; bidx < bunch_size; bidx += gridDim.x){
int my_sum = 0;
for (int idx = threadIdx.x; idx < bitset_size; idx += blockDim.x) my_sum += __popc(sbitset[idx] ^ bunch[bidx*bitset_size + idx]);
// block level parallel reduction
ssum[threadIdx.x] = my_sum;
for (int ridx = nTPB>>1; ridx > 0; ridx >>=1){
__syncthreads();
if (threadIdx.x < ridx) ssum[threadIdx.x] += ssum[threadIdx.x+ridx];}
if (!threadIdx.x) set_bits[bidx] = ssum[0];}
}
int main(){
// data setup
uint *d_cbitset, *d_bitsets, *h_cbitset, *h_bitsets;
int *d_r, *h_r, *h_ri;
h_cbitset = new uint[my_bitset_size];
h_bitsets = new uint[my_bitset_size*my_bunch_size];
h_r = new int[my_bunch_size];
h_ri = new int[my_bunch_size];
for (int i = 0; i < my_bitset_size*my_bunch_size; i++){
h_bitsets[i] = rand();
if (i < my_bitset_size) h_cbitset[i] = rand();}
cudaMalloc(&d_cbitset, my_bitset_size*sizeof(uint));
cudaMalloc(&d_bitsets, my_bitset_size*my_bunch_size*sizeof(uint));
cudaMalloc(&d_r, my_bunch_size*sizeof(int));
cudaMemcpy(d_cbitset, h_cbitset, my_bitset_size*sizeof(uint), cudaMemcpyHostToDevice);
cudaMemcpy(d_bitsets, h_bitsets, my_bitset_size*my_bunch_size*sizeof(uint), cudaMemcpyHostToDevice);
// original
// Grid/Blocks used for kernel invocation
dim3 block(32);
dim3 grid((my_bunch_size / 31) + 32);
kernelXOR<<<grid, block>>>(d_cbitset, d_bitsets, d_r, my_bitset_size, my_bunch_size);
cudaMemcpy(h_r, d_r, my_bunch_size*sizeof(int), cudaMemcpyDeviceToHost);
// improved
dim3 iblock(nTPB);
dim3 igrid(640);
kernelXOR_imp<<<igrid, iblock>>>(d_cbitset, d_bitsets, d_r, my_bitset_size, my_bunch_size);
cudaMemcpy(h_ri, d_r, my_bunch_size*sizeof(int), cudaMemcpyDeviceToHost);
for (int i = 0; i < my_bunch_size; i++)
if (h_r[i] != h_ri[i]) {std::cout << "mismatch at i: " << i << " was: " << h_ri[i] << " should be: " << h_r[i] << std::endl; return 0;}
std::cout << "Results match." << std::endl;
return 0;
}
$ nvcc -o t1649 t1649.cu
$ cuda-memcheck ./t1649
========= CUDA-MEMCHECK
Results match.
========= ERROR SUMMARY: 0 errors
$ nvprof ./t1649
==18868== NVPROF is profiling process 18868, command: ./t1649
Results match.
==18868== Profiling application: ./t1649
==18868== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 97.06% 71.113ms 2 35.557ms 2.3040us 71.111ms [CUDA memcpy HtoD]
2.26% 1.6563ms 1 1.6563ms 1.6563ms 1.6563ms kernelXOR(unsigned int*, unsigned int*, int*, int, int)
0.59% 432.68us 1 432.68us 432.68us 432.68us kernelXOR_imp(unsigned int const *, unsigned int const *, int*, int, int)
0.09% 64.770us 2 32.385us 31.873us 32.897us [CUDA memcpy DtoH]
API calls: 78.20% 305.44ms 3 101.81ms 11.373us 304.85ms cudaMalloc
18.99% 74.161ms 4 18.540ms 31.554us 71.403ms cudaMemcpy
1.39% 5.4121ms 4 1.3530ms 675.30us 3.3410ms cuDeviceTotalMem
1.26% 4.9393ms 388 12.730us 303ns 530.95us cuDeviceGetAttribute
0.11% 442.37us 4 110.59us 102.61us 125.59us cuDeviceGetName
0.03% 128.18us 2 64.088us 21.789us 106.39us cudaLaunchKernel
0.01% 35.764us 4 8.9410us 2.9670us 18.982us cuDeviceGetPCIBusId
0.00% 8.3090us 8 1.0380us 540ns 1.3870us cuDeviceGet
0.00% 5.9530us 3 1.9840us 310ns 3.9900us cuDeviceGetCount
0.00% 2.8800us 4 720ns 574ns 960ns cuDeviceGetUuid
$
在这种情况下,在我的Tesla V100上,对于您的问题大小,我看到内核性能大约提高了4倍。然而,与数据移动的成本相比,这里的内核性能微不足道。因此,如果这是您在GPU上做的唯一一件事,那么这些类型的优化不太可能在您的比较测试用例中产生显著差异。
上面的代码在块级和网格级使用跨度循环,这意味着它对于几乎任何选择的线程块大小(请选择32的倍数)以及网格大小都应该正确。这并不意味着任何/所有的选择都会平等地发挥作用。线程块大小的选择是为了考虑几乎完全占用的可能性(所以不要选择32)。网格大小的选择是实现每个SM的完全占用的块数乘以短消息的数量。这些应该是近乎最佳的选择,但根据我的测试,更多的块并不会真正降低性能,并且假设相应地计算块的数量,对于几乎任何线程块大小(除了32),性能应该大致恒定。
相关文章