Equivalent of usleep() in CUDA kernel?
我想在CUDA内核中调用类似 usleep() 的函数。基本代码如下:
// Minimal reproducer from the question. It is NOT expected to compile:
// usleep() is a host-only function, and nvcc rejects the call made from the
// __global__ kernel below.
#include <unistd.h>
#include <stdio.h>
#include <cuda.h>
#include <sys/time.h>

// Tries to pause each thread by forwarding its argument to usleep().
// NOTE: although the parameter is named "..._in_ms", it is a useconds_t and
// is passed unchanged to usleep(), which takes microseconds.
__global__ void gpu_uSleep(useconds_t wait_time_in_ms)
{
    usleep(wait_time_in_ms);  // host function called from device code
}

int main(void)
{
    // Input parameters -- arbitrary.
    // TODO: set these exactly for full occupancy
    const int rows = 16;
    const int cols = 16;
    const int tile = 16;

    // One tile x tile thread block per tile of the rows x cols domain.
    const dim3 threadsPerBlock(tile, tile);
    const dim3 blocksPerGrid(rows / tile, cols / tile);

    const useconds_t delay = 1000;

    // Execute the kernel, then block the host until it has finished.
    gpu_uSleep<<<blocksPerGrid, threadsPerBlock>>>(delay);
    cudaDeviceSynchronize();

    return 0;
}
当我尝试使用NVCC编译这段代码时,出现以下错误:
1 2 | error: calling a host function("usleep") from a __device__/__global__ function("gpu_uSleep") is not allowed |
很明显,在内核内部不允许使用主机函数,例如 usleep()。有什么合适的替代方法吗?
您可以基于 clock() 或 clock64() 进行自旋等待(spin)。CUDA SDK 的 concurrentKernels 示例就是这样做的:
// Busy-waits on the device cycle counter until at least `clock_count` cycles
// have elapsed, then stores the measured cycle count in d_o[0].
//
//   d_o         - device pointer; element 0 receives the elapsed cycle count.
//   clock_count - minimum number of cycles to spin. The elapsed count is
//                 tracked in 32 bits, so this must be smaller than 2^32;
//                 a larger value can never be reached and the loop would
//                 not terminate.
//
// Every thread that executes the kernel spins and writes d_o[0].
__global__ void clock_block(clock_t *d_o, clock_t clock_count)
{
    // The counter behind clock() is 32 bits wide and wraps silently. Keeping
    // the samples as unsigned int makes the subtraction below modulo 2^32,
    // so the elapsed count stays correct across a wrap instead of turning
    // negative and stretching the wait by up to a full counter period.
    const unsigned int start_clock = (unsigned int)clock();
    clock_t clock_offset = 0;

    while (clock_offset < clock_count)
    {
        const unsigned int end_clock = (unsigned int)clock();
        clock_offset = (clock_t)(end_clock - start_clock);
    }

    // Writing the result to global memory also keeps the compiler from
    // discarding the spin loop.
    d_o[0] = clock_offset;
}
我建议使用clock64()。由于clock()和clock64()返回的是时钟周期数,要换算成实际时间,必须先用cudaGetDeviceProperties()查询时钟频率(cudaDeviceProp::clockRate,单位为kHz)。频率可能是动态变化的,因此很难保证自旋循环的精确时长。
您可以用一个不断读取 clock() 的循环来进行忙等待。
要等待至少10,000个时钟周期:
// Device-code fragment: spin until at least 10,000 clock cycles have passed.
// `global_now` is a pointer into global memory declared elsewhere.
clock_t t0 = clock();
clock_t now;
clock_t elapsed;
do {
    now = clock();
    // If the counter wrapped since t0 was taken (now <= t0), add the
    // distance from t0 to the 32-bit limit to the current reading.
    if (now > t0) {
        elapsed = now - t0;
    } else {
        elapsed = now + (0xffffffff - t0);
    }
} while (elapsed < 10000);
// Store "now" in global memory so the compiler cannot optimize the whole
// loop away.
*global_now = now;
注意:这段代码未经测试。其中处理溢出的代码借自 @Pedro 的这个回答。有关 clock() 的详细说明,请参阅《CUDA C 编程指南》;另外还有 clock64() 可用。
对于较新版本的CUDA,以及计算能力(Compute Capability)为7.0或更高的设备(Volta、Turing、Ampere等),可以使用 __nanosleep() 原语:
1 | void __nanosleep(unsigned ns); |
这样就不必再像较早的答案所建议的那样进行忙等待了。