Asynchronous executions of CUDA memory copies and cuFFT
我有一个 CUDA 程序,用于计算较大规模(例如 N = 600000 个采样点)信号的 FFT。
是否有可能,例如,先把输入数据的第一部分复制到 GPU,在对它进行计算的同时,异步地复制下一部分数据?
由于 DFT 基本上是时间值乘以复指数函数的总和,因此我认为应该可以"按块"计算 FFT。
cuFFT 支持这种做法吗?一般来说,它是一个好的计算思路吗?
编辑
为了更清楚,我不想在不同的数组上并行计算不同的 FFT。假设我在时域中有一个很长的正弦信号,我想知道信号中包含哪些频率。例如,我的想法是先把信号长度的三分之一复制到 GPU,接着复制下一个三分之一,并同时对已复制的前三分之一输入值并行计算 FFT。然后复制最后三分之一并更新输出值,直到处理完所有时间采样点。这样最终应该得到一个输出数组,在该正弦信号的频率处出现一个峰值。
请考虑以上评论,尤其是:
考虑到以上两点,我认为只有按照下面代码所示的方式正确使用零填充,才能"模仿"你想要达到的效果。正如您将看到的,设 N 为数据总长度,把每个三分之一段零填充到完整长度 N 后分别做 FFT,再把三个频谱逐元素相加,即可与对完整信号做一次 FFT 的结果等价,同时还能让各段的主机到设备拷贝与计算相互重叠。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 | #include <stdio.h> #include <cufft.h> #define BLOCKSIZE 32 #define NUM_STREAMS 3 /**********/ /* iDivUp */ /*********/ int iDivUp(int a, int b) { return ((a % b) != 0) ? (a / b + 1) : (a / b); } /********************/ /* CUDA ERROR CHECK */ /********************/ #define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); } inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true) { if (code != cudaSuccess) { fprintf(stderr,"GPUassert: %s %s %d\ ", cudaGetErrorString(code), file, line); if (abort) exit(code); } } /******************/ /* SUMMING KERNEL */ /******************/ __global__ void kernel(float2 *vec1, float2 *vec2, float2 *vec3, float2 *out, int N) { int tid = threadIdx.x + blockIdx.x * blockDim.x; if (tid < N) { out[tid].x = vec1[tid].x + vec2[tid].x + vec3[tid].x; out[tid].y = vec1[tid].y + vec2[tid].y + vec3[tid].y; } } /********/ /* MAIN */ /********/ int main() { const int N = 600000; const int Npartial = N / NUM_STREAMS; // --- Host input data initialization float2 *h_in1 = new float2[Npartial]; float2 *h_in2 = new float2[Npartial]; float2 *h_in3 = new float2[Npartial]; for (int i = 0; i < Npartial; i++) { h_in1[i].x = 1.f; h_in1[i].y = 0.f; h_in2[i].x = 1.f; h_in2[i].y = 0.f; h_in3[i].x = 1.f; h_in3[i].y = 0.f; } // --- Host output data initialization float2 *h_out = new float2[N]; // --- Registers host memory as page-locked (required for asynch cudaMemcpyAsync) gpuErrchk(cudaHostRegister(h_in1, Npartial*sizeof(float2), cudaHostRegisterPortable)); gpuErrchk(cudaHostRegister(h_in2, 
Npartial*sizeof(float2), cudaHostRegisterPortable)); gpuErrchk(cudaHostRegister(h_in3, Npartial*sizeof(float2), cudaHostRegisterPortable)); // --- Device input data allocation float2 *d_in1; gpuErrchk(cudaMalloc((void**)&d_in1, N*sizeof(float2))); float2 *d_in2; gpuErrchk(cudaMalloc((void**)&d_in2, N*sizeof(float2))); float2 *d_in3; gpuErrchk(cudaMalloc((void**)&d_in3, N*sizeof(float2))); float2 *d_out1; gpuErrchk(cudaMalloc((void**)&d_out1, N*sizeof(float2))); float2 *d_out2; gpuErrchk(cudaMalloc((void**)&d_out2, N*sizeof(float2))); float2 *d_out3; gpuErrchk(cudaMalloc((void**)&d_out3, N*sizeof(float2))); float2 *d_out; gpuErrchk(cudaMalloc((void**)&d_out, N*sizeof(float2))); // --- Zero padding gpuErrchk(cudaMemset(d_in1, 0, N*sizeof(float2))); gpuErrchk(cudaMemset(d_in2, 0, N*sizeof(float2))); gpuErrchk(cudaMemset(d_in3, 0, N*sizeof(float2))); // --- Creates CUDA streams cudaStream_t streams[NUM_STREAMS]; for (int i = 0; i < NUM_STREAMS; i++) gpuErrchk(cudaStreamCreate(&streams[i])); // --- Creates cuFFT plans and sets them in streams cufftHandle* plans = (cufftHandle*) malloc(sizeof(cufftHandle)*NUM_STREAMS); for (int i = 0; i < NUM_STREAMS; i++) { cufftPlan1d(&plans[i], N, CUFFT_C2C, 1); cufftSetStream(plans[i], streams[i]); } // --- Async memcopyes and computations gpuErrchk(cudaMemcpyAsync(d_in1, h_in1, Npartial*sizeof(float2), cudaMemcpyHostToDevice, streams[0])); gpuErrchk(cudaMemcpyAsync(&d_in2[Npartial], h_in2, Npartial*sizeof(float2), cudaMemcpyHostToDevice, streams[1])); gpuErrchk(cudaMemcpyAsync(&d_in3[2*Npartial], h_in3, Npartial*sizeof(float2), cudaMemcpyHostToDevice, streams[2])); cufftExecC2C(plans[0], (cufftComplex*)d_in1, (cufftComplex*)d_out1, CUFFT_FORWARD); cufftExecC2C(plans[1], (cufftComplex*)d_in2, (cufftComplex*)d_out2, CUFFT_FORWARD); cufftExecC2C(plans[2], (cufftComplex*)d_in3, (cufftComplex*)d_out3, CUFFT_FORWARD); for(int i = 0; i < NUM_STREAMS; i++) gpuErrchk(cudaStreamSynchronize(streams[i])); kernel<<<iDivUp(BLOCKSIZE,N), 
BLOCKSIZE>>>(d_out1, d_out2, d_out3, d_out, N); gpuErrchk(cudaPeekAtLastError()); gpuErrchk(cudaDeviceSynchronize()); gpuErrchk(cudaMemcpy(h_out, d_out, N*sizeof(float2), cudaMemcpyDeviceToHost)); for (int i=0; i<N; i++) printf("i = %i; real(h_out) = %f; imag(h_out) = %f\ ", i, h_out[i].x, h_out[i].y); // --- Releases resources gpuErrchk(cudaHostUnregister(h_in1)); gpuErrchk(cudaHostUnregister(h_in2)); gpuErrchk(cudaHostUnregister(h_in3)); gpuErrchk(cudaFree(d_in1)); gpuErrchk(cudaFree(d_in2)); gpuErrchk(cudaFree(d_in3)); gpuErrchk(cudaFree(d_out1)); gpuErrchk(cudaFree(d_out2)); gpuErrchk(cudaFree(d_out3)); gpuErrchk(cudaFree(d_out)); for(int i = 0; i < NUM_STREAMS; i++) gpuErrchk(cudaStreamDestroy(streams[i])); delete[] h_in1; delete[] h_in2; delete[] h_in3; delete[] h_out; cudaDeviceReset(); return 0; } |
这是上述代码在 Kepler K20c 卡上运行时的时间线。如您所见,计算与异步内存传输重叠。