Hillis Steele 扫描(并行前缀扫描算法)
在本文中,我们讨论一种称为 Hillis-Steele 扫描的扫描算法,也称为并行前缀扫描算法。这里的扫描操作本质上是指计算数组的前缀和。Hillis-Steele 扫描是一种以并行方式执行扫描操作的算法。对于大小为 N 的数组 x[],该算法的步骤如下:
- 令变量 d 在范围 $[1, \log_2 N]$ 内依次迭代;在每一轮中,对所有下标 k 并行地检查是否满足 $k \ge 2^{d-1}$。如果满足,则令 $x[k] \leftarrow x[k - 2^{d-1}] + x[k]$。
视觉表现:
当深度 d 达到 $\log_2 N$ 时,计算终止,此时数组中保存的即为前缀和。同一层内的所有加法操作并行执行,而各层($d = 1, d = 2, \ldots, \log_2 N$)之间按顺序依次进行。
下面是该算法在 CUDA C++ 中的实现:
C++
// C++ program for the above approach
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <chrono>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <iostream>
using namespace std::chrono;
using namespace std;
// Reports a failed CUDA runtime call and terminates the program.
//   err  - status code returned by the CUDA call being checked
//   file - source file name to show in the message
//   line - source line number to show in the message
// Returns normally only when err is cudaSuccess.
static void HandleError(cudaError_t err,
                        const char* file,
                        int line)
{
    // Nothing to report for a successful call.
    if (err == cudaSuccess)
        return;

    // Print the runtime's description of the error together with
    // the location that was passed in, then stop the program.
    const char* description = cudaGetErrorString(err);
    printf("\n%s in %s at line %d\n", description, file, line);
    exit(EXIT_FAILURE);
}
// Checks the status returned by a CUDA runtime call and forwards it to
// HandleError together with the file and line of the call site.
// The definition spans two lines, so the first line must end with a
// backslash; without it the macro body would be just "(".
#define HANDLE_ERROR(err) \
    (HandleError((err), __FILE__, __LINE__))
// One step of the Hillis-Steele scan.
//
// For every index i in [0, n):
//   dst[i] = src[i] + src[i - space]   when i >= space
//   dst[i] = src[i]                    otherwise
//
// src and dst must be different buffers: each thread reads an element
// that another thread of the same launch is responsible for writing,
// so updating a single array in place would be a data race.
//
// Launch layout: one-dimensional thread blocks on a 1D or 2D grid.
// The grid may contain more threads than elements; surplus threads
// return without touching memory. No shared memory is used.
template <typename T>
__global__ void
Hillis_Steele_Scan_Kernel(const T* __restrict__ src,
                          T* __restrict__ dst,
                          __int64 space,
                          __int64 n)
{
    // Widen to 64 bits before multiplying so that large grids do not
    // overflow 32-bit unsigned arithmetic.
    const __int64 x = threadIdx.x
                      + static_cast<__int64>(blockDim.x) * blockIdx.x;
    const __int64 y = threadIdx.y
                      + static_cast<__int64>(blockDim.y) * blockIdx.y;

    // Flatten the (possibly 2D) launch into a unique thread ID.
    const __int64 tid
        = x + y * (static_cast<__int64>(gridDim.x) * blockDim.x);

    // Threads past the end of the array have nothing to do.
    if (tid >= n)
        return;

    T value = src[tid];
    if (tid >= space)
        value += src[tid - space];
    dst[tid] = value;
}

// Computes the inclusive prefix sum of input[0..N) on the GPU.
//
//   input - N elements readable by cudaMemcpy; it is not modified
//   N     - number of elements
//
// Returns a buffer of N elements allocated with cudaMallocManaged that
// holds the prefix sums; the caller releases it with cudaFree.
// Returns nullptr when input is null or N <= 0.
// Any CUDA failure is reported through HANDLE_ERROR.
template <typename T>
T* Hillis_Steele_Scan(T* input, __int64 N)
{
    if (input == nullptr || N <= 0)
        return nullptr;

    const size_t bytes = sizeof(T) * static_cast<size_t>(N);

    // Two buffers are used in ping-pong fashion: every step reads
    // from `current` and writes to `scratch`, then the roles swap.
    T* current = nullptr;
    T* scratch = nullptr;
    HANDLE_ERROR(cudaMallocManaged(&current, bytes));
    HANDLE_ERROR(cudaMallocManaged(&scratch, bytes));

    // cudaMemcpyDefault lets the runtime infer the copy direction
    // from the pointer values.
    HANDLE_ERROR(cudaMemcpy(current, input, bytes,
                            cudaMemcpyDefault));

    // Launch parameters: enough blocks to cover all N elements
    // (ceiling division), folded into a 2D grid when one grid
    // dimension would be too large.
    const __int64 threadsPerBlock = 256;
    const __int64 maxGridDim = 65535;
    const __int64 blocksNeeded
        = (N + threadsPerBlock - 1) / threadsPerBlock;
    const __int64 gridX
        = blocksNeeded < maxGridDim ? blocksNeeded : maxGridDim;
    const __int64 gridY = (blocksNeeded + gridX - 1) / gridX;

    dim3 THREADS(static_cast<unsigned int>(threadsPerBlock), 1, 1);
    dim3 BLOCKS(static_cast<unsigned int>(gridX),
                static_cast<unsigned int>(gridY), 1);

    // The stride doubles every step (1, 2, 4, ...) until it reaches N,
    // which gives ceil(log2(N)) steps and therefore also covers sizes
    // that are not a power of two.
    for (__int64 space = 1; space < N; space *= 2) {
        Hillis_Steele_Scan_Kernel<T><<<BLOCKS, THREADS>>>(
            current, scratch, space, N);

        // A kernel launch returns no status itself; query it here.
        HANDLE_ERROR(cudaGetLastError());

        // The output of this step is the input of the next one.
        // Launches on the same stream run in order, so the next step
        // observes the completed result.
        T* finished = scratch;
        scratch = current;
        current = finished;
    }

    // Wait for the GPU so the caller can read the result on the host.
    HANDLE_ERROR(cudaDeviceSynchronize());

    HANDLE_ERROR(cudaFree(scratch));
    return current;
}
// Driver code: runs the scan on an array filled with ones and prints
// the first `arraysize` results, which should read 1, 2, 3, ...
int main()
{
    // Number of meaningful elements in the input.
    const __int64 arraysize = 10;

    // The working size N is the smallest power of two that is not
    // smaller than arraysize. Integer arithmetic is used instead of
    // a floating-point log2 to avoid rounding problems.
    __int64 exponent = 0;
    __int64 N = 1;
    while (N < arraysize) {
        N *= 2;
        ++exponent;
    }
    cout << "\n\nELEMS --> 2^" << exponent << " = " << N
         << " >= " << arraysize;

    // Allocate the input in managed memory so that both the host
    // loop below and the GPU can access it.
    __int64* inputArr = nullptr;
    HANDLE_ERROR(cudaMallocManaged(&inputArr,
                                   (sizeof(__int64) * N)));
    HANDLE_ERROR(cudaDeviceSynchronize());

    // Test data: an array of ones, so the expected prefix sums are
    // 1, 2, 3, 4, ..., N.
    for (__int64 i = 0; i < N; i++) {
        inputArr[i] = 1;
    }

    // Time the scan, including its allocations and copies.
    high_resolution_clock::time_point tg1
        = high_resolution_clock::now();
    __int64* out = Hillis_Steele_Scan(inputArr, N);
    high_resolution_clock::time_point tg2
        = high_resolution_clock::now();

    // Elapsed time in seconds as a double; shown in milliseconds.
    duration<double> time_span
        = duration_cast<duration<double> >(tg2 - tg1);
    cout << "\nTime Taken : "
         << time_span.count() * 1000
         << " ms";
    cout << endl;

    // Make sure no GPU work is pending before the host reads the
    // managed result buffer.
    HANDLE_ERROR(cudaDeviceSynchronize());

    // Print only the entries that correspond to real input elements.
    for (__int64 i = 0; i < arraysize; i++)
        std::cout << '\t' << out[i];
    std::cout << std::endl;

    // Release both managed allocations.
    HANDLE_ERROR(cudaFree(out));
    HANDLE_ERROR(cudaFree(inputArr));
    return 0;
}
复杂度分析: O(log N) 时间和 O(N) 个处理器
想要从精选的视频和练习题中学习,请查看C++ 基础课程,从基础到高级 C++ 和C++ STL 课程,了解语言和 STL。要完成从学习语言到 DS Algo 等的准备工作,请参阅完整的面试准备课程。