并行编程实战——CUDA编程的事件-开发者社区

一、CUDA中的事件

大家可能在别的开发语言中都学习过事件这个概念，其实CUDA中的事件与它们类似。不过，CUDA中的事件更贴近其字面本身的意义：它类似于一种标志，是用来密切监视设备执行进度的同步工具。同时，应用程序可以在程序中的任意位置异步地记录事件，并查询这些事件何时完成，从而实现精确的计时。当事件之前的所有任务（或者可选地，指定流中的所有命令）都已完成时，该事件即告完成。对于记录在流零（即空流或默认流）中的事件，则要等到所有流中此前的全部任务和命令都完成之后才算完成。
也就是说，在CUDA编程中，事件既可以作为同步工具又可以作为精确测量执行时间的工具。

二、CUDA事件主要应用场景

通过上面的说明，大家可以基本明白CUDA事件的定义，那么对事件来说其应用场景是什么呢？

性能测量
利用事件既可以测量内核的执行时间，也可以对流水线式的操作（如数据拷贝与计算重叠等）进行计时。

// Times a two-stream copy -> kernel -> copy pipeline with a pair of events.
// This is a fragment: start/stop (events), stream[] (streams), MyKernel,
// size and the input*/output* buffers are declared elsewhere.

// Mark the beginning of the timed region in stream 0 (the default stream).
cudaEventRecord(start, 0);
for (int i = 0; i < 2; ++i) {
    // Each stream handles its own slice: upload, compute, download.
    cudaMemcpyAsync(inputDev + i * size, inputHost + i * size,
                    size, cudaMemcpyHostToDevice, stream[i]);
    MyKernel<<<100, 512, 0, stream[i]>>>(outputDev + i * size,
                                         inputDev + i * size, size);
    cudaMemcpyAsync(outputHost + i * size, outputDev + i * size,
                    size, cudaMemcpyDeviceToHost, stream[i]);
}
// Mark the end of the timed region, then block the host until it is reached.
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);

// Time elapsed between the two recorded events.
float elapsedTime;
cudaEventElapsedTime(&elapsedTime, start, stop);

流的同步
主要用于控制流间的依赖关系，如多流的同步，不同阶段间的计算同步以及多GPU的数据依赖和Graphic，另外还可以进行动态工作流的控制处理。

// Writes 0x1 through the pointer A.
__global__ void foo1(char *A) {
    *A = 0x1;
}

// Prints the byte that B points to.
__global__ void foo2(char *B) {
    // *B == *A == 0x1 assuming foo2 waits for foo1
    // to complete before launching
    printf("%d\n", *B);
}

// Fragment: A, B, input, output, size, event and stream1..3 are declared
// elsewhere. Per the comments below, A and B alias the same memory.

// Aliases are allowed at operation boundaries.
// NOTE(review): the original call omitted the cudaMemcpyKind argument (the
// stream was passed in its place); cudaMemcpyDefault lets the runtime infer
// the direction from the pointers - confirm this matches the intended copy.
cudaMemcpyAsync(B, input, size, cudaMemcpyDefault, stream1);
foo1<<<1, 1, 0, stream1>>>(A);  // allowing foo1 to access A.
cudaEventRecord(event, stream1);

// Both the launch of foo2 and the cudaMemcpyAsync below (which both read)
// wait for foo1 (which writes) to complete before proceeding.
cudaStreamWaitEvent(stream2, event);
foo2<<<1, 1, 0, stream2>>>(B);
cudaStreamWaitEvent(stream3, event);
cudaMemcpyAsync(output, B, size, cudaMemcpyDefault, stream3);

注：上面代码来自CUDA官网

三、事件和流同步

虽然事件可以进行流同步，但与流同步还是有一些不同之处，主要有：

事件的同步机制更灵活，可以多流间控制。而流同步一般是整个流内部
事件控制比流同步更精确，上文也提到了，事件可以进行点的控制，而流同步一般是整个流
事件与流同步相比，开销更低
事件应用比流同步要广泛，除了同步外还可以进行计时等处理

技术概念之间互相对比，可以更好地加深对它们的理解。

四、例程

针对上面的说明，可以看下面的例程：

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>

// Aborts with file/line and the CUDA error string when a runtime call fails.
// Ignored return codes otherwise surface later as unrelated-looking failures.
#define CUDA_CHECK(call)                                                  \
    do {                                                                  \
        cudaError_t err_ = (call);                                        \
        if (err_ != cudaSuccess) {                                        \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__, \
                    cudaGetErrorString(err_));                            \
            exit(EXIT_FAILURE);                                           \
        }                                                                 \
    } while (0)

// Scales each of the first `num` elements of `data` by `factor` and adds
// idx * 0.001f, where idx is the thread's flat global index.
// Launch layout: 1D grid of 1D blocks, one thread per element; threads with
// idx >= num do nothing, so the grid may be rounded up.
__global__ void kernelFunc(float *data, int num, float factor) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < num) {
        data[idx] = data[idx] * factor + idx * 0.001f;
    }
}

// Demonstrates three uses of CUDA events:
//   1. timing a multi-stream copy/compute/copy pipeline,
//   2. making several streams wait on work recorded in another stream,
//   3. polling an event from the host with cudaEventQuery.
int main() {
    const int N = 1 << 20;  // 1,048,576 elements (~1M)
    const int numStreams = 4;
    const int chunkSize = N / numStreams;
    const size_t chunkBytes = chunkSize * sizeof(float);
    const size_t totalBytes = N * sizeof(float);

    printf("mul stream sync test...\n");

    // Host buffer allocated with cudaMallocHost, used by the async copies below.
    float *hData = NULL;
    CUDA_CHECK(cudaMallocHost(&hData, totalBytes));
    for (int i = 0; i < N; i++) {
        hData[i] = (float)rand() / RAND_MAX;
    }

    float *dData = NULL;
    CUDA_CHECK(cudaMalloc(&dData, totalBytes));

    cudaStream_t streams[numStreams];
    for (int i = 0; i < numStreams; i++) {
        CUDA_CHECK(cudaStreamCreate(&streams[i]));
    }

    // 1. Timing: start/stop bracket the whole pipeline; the per-stream
    //    kernelEvents mark the point right after each kernel launch and are
    //    created without timing support.
    cudaEvent_t startEvent, stopEvent;
    cudaEvent_t kernelEvents[numStreams];
    CUDA_CHECK(cudaEventCreate(&startEvent));
    CUDA_CHECK(cudaEventCreate(&stopEvent));
    for (int i = 0; i < numStreams; i++) {
        CUDA_CHECK(cudaEventCreateWithFlags(&kernelEvents[i],
                                            cudaEventDisableTiming));
    }

    CUDA_CHECK(cudaEventRecord(startEvent, 0));

    int threadsPerBlock = 256;
    // Ceil-division so a partial final block still covers the chunk tail.
    int blocksPerChunk = (chunkSize + threadsPerBlock - 1) / threadsPerBlock;

    for (int i = 0; i < numStreams; i++) {
        int offset = i * chunkSize;
        CUDA_CHECK(cudaMemcpyAsync(&dData[offset], &hData[offset], chunkBytes,
                                   cudaMemcpyHostToDevice, streams[i]));
        kernelFunc<<<blocksPerChunk, threadsPerBlock, 0, streams[i]>>>(
            &dData[offset], chunkSize, (float)(i + 1) * 0.5f);
        // Kernel launches return nothing; launch errors are read back here.
        CUDA_CHECK(cudaGetLastError());
        CUDA_CHECK(cudaEventRecord(kernelEvents[i], streams[i]));
        CUDA_CHECK(cudaMemcpyAsync(&hData[offset], &dData[offset], chunkBytes,
                                   cudaMemcpyDeviceToHost, streams[i]));
    }

    for (int i = 0; i < numStreams; i++) {
        CUDA_CHECK(cudaStreamSynchronize(streams[i]));
    }

    // Record the finish time point and wait for it before reading the timer.
    CUDA_CHECK(cudaEventRecord(stopEvent, 0));
    CUDA_CHECK(cudaEventSynchronize(stopEvent));

    float totalTime = 0.f;
    CUDA_CHECK(cudaEventElapsedTime(&totalTime, startEvent, stopEvent));
    printf("sum time: %.3f ms\n", totalTime);

    // 2. Stream dependency: streams 1..numStreams-1 wait for the kernel
    //    launched in stream 0 before running their own kernels.
    printf("\n stream sync :\n");
    cudaEvent_t syncEvent;
    CUDA_CHECK(cudaEventCreate(&syncEvent));

    kernelFunc<<<blocksPerChunk, threadsPerBlock, 0, streams[0]>>>(
        dData, chunkSize, 2.0f);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaEventRecord(syncEvent, streams[0]));

    for (int i = 1; i < numStreams; i++) {
        CUDA_CHECK(cudaStreamWaitEvent(streams[i], syncEvent, 0));
        kernelFunc<<<blocksPerChunk, threadsPerBlock, 0, streams[i]>>>(
            &dData[i * chunkSize], chunkSize, 1.5f);
        CUDA_CHECK(cudaGetLastError());
    }

    for (int i = 0; i < numStreams; i++) {
        CUDA_CHECK(cudaStreamSynchronize(streams[i]));
    }
    printf("sync finish \n");

    // 3. Event query: poll from the host instead of blocking, falling back
    //    to a blocking wait once maxChecks polls have been made.
    printf("\n event query:\n");
    cudaEvent_t queryEvent;
    CUDA_CHECK(cudaEventCreate(&queryEvent));

    kernelFunc<<<blocksPerChunk, threadsPerBlock>>>(dData, chunkSize, 1.0f);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaEventRecord(queryEvent, 0));

    int maxChecks = 100;
    int checkCount = 0;
    cudaError_t queryStatus;
    while ((queryStatus = cudaEventQuery(queryEvent)) == cudaErrorNotReady) {
        checkCount++;
        if (checkCount < maxChecks) {
            // Short host-side delay between polls; volatile keeps the
            // compiler from deleting the otherwise unused loop.
            volatile int dummy = 0;
            for (int j = 0; j < 1000; j++) {
                dummy += j;
            }
        } else {
            CUDA_CHECK(cudaEventSynchronize(queryEvent));
            queryStatus = cudaSuccess;
            break;
        }
    }
    // Anything other than "not ready" or success is a real error.
    CUDA_CHECK(queryStatus);
    printf("event query count: %d\n", checkCount);

    // Release events, streams and memory.
    CUDA_CHECK(cudaEventDestroy(syncEvent));
    CUDA_CHECK(cudaEventDestroy(queryEvent));
    CUDA_CHECK(cudaEventDestroy(startEvent));
    CUDA_CHECK(cudaEventDestroy(stopEvent));
    for (int i = 0; i < numStreams; i++) {
        CUDA_CHECK(cudaEventDestroy(kernelEvents[i]));
        CUDA_CHECK(cudaStreamDestroy(streams[i]));
    }

    CUDA_CHECK(cudaFree(dData));
    CUDA_CHECK(cudaFreeHost(hData));
    CUDA_CHECK(cudaDeviceReset());

    printf("\n all finish！\n");
    return 0;
}