cuda优化c63code

yulovemm
0 ℃
2021-01-10

整理文档很辛苦，赏杯茶钱您下走！

还剩 ... 页未读，继续阅读 >>

免费阅读已结束，点击下载阅读编辑剩下 ... 页

阅读已结束，您可以下载文档离线阅读编辑

资源描述

目录

1 实验准备 ........................................ 1
  1.1 实验环境说明 ................................ 1
  1.2 瓶颈分析 .................................... 1
2 cuda并行优化 .................................... 2
  2.1 cuda编程模型 ................................ 2
  2.2 SAD并行优化思路 ............................. 2
3 测试结果与比较 .................................. 8
  3.1 运行时间 .................................... 8
  3.2 图像质量 .................................... 10
4 总结 ............................................ 11

1 实验准备

1.1 实验环境说明

本次实验程序首先在本地PC进行调试与优化，最后在老师实验室的服务器上进行测试与验证。实验环境如下：

本地配置：ubuntu 18.04 + cuda 10.0 + Nvidia Geforce MX150
服务器配置：linux + cuda + Nvidia Tesla K20c
测试用例：FOREMAN_352x288.yuv，像素为352x288

1.2 瓶颈分析

由于给出的源程序没有改变，所以采用gprof性能分析工具给出的结果与第一次实验一致。如图1-1所示，在运动估计模块，计算当前帧与参考帧block之间的sad函数占用了整个程序90%以上的时间，所以本次实验的优化目标依然是利用GPU线程级并行化技术加速块之间sad的计算。

图1-1 源程序各函数时间占比

2 cuda并行优化

2.1 cuda编程模型

在cuda编程模型中，使用核函数对输入的数据在GPU上进行处理，而核函数的执行有三层构架。核函数会被多个线程(thread)并行执行，若干个线程会被组织成一个线程块(block)，同一个线程块中的线程可以同步，也可以通过共享内存来进行相互通信。而在此之上，多个线程块会构成网格(grid)。线程块与网格均可以是一维、二维或者是三维的。其构架如图2-1所示。

图2-1 cuda线程组织关系

2.2 SAD并行优化思路

通过1.2小节的瓶颈分析，我们得知本次优化的主要目标函数是sad_block_8x8()，这个函数的主要作用是计算两个8x8块的误差和。在运动估计的过程中，需要在搜索范围内遍历所有的参考帧中的块计算与当前块的sad，并找出最小的sad。而这个过程每一块是相互独立的，因此可以从这里入手，将每个
块并行执行。由于数据在GPU与CPU之间进行拷贝十分耗时，在进行程序设计时一次将一帧数据传入GPU，避免数据频繁地调入调出。其数据与Grid、block与thread的对应关系如图2-2所示。

图2-2 数据的组织

将每一帧数据组织成二维网格(grid)，大小为mb_cols×mb_rows，每个线程块Block恰好对应一个8x8的图像块，而Block的大小与搜索范围对应，每个线程thread计算出两个8x8图像块的sad。其具体实现如下：

void me_block_8x8_cuda(struct c63_common *cm, uint8_t *orig, uint8_t *ref, int cc)
{
    int range = cm->me_search_range;
    int mb_x = cm->mb_cols;
    int mb_y = cm->mb_rows;
    int total = mb_x * mb_y;
    if (cc > 0) {
        range /= 2;
        mb_x /= 2;
        mb_y /= 2;
    }
    int w = cm->padw[cc]; // x, cols
    int h = cm->padh[cc]; // y, rows
    size_t size;
    // load frame1 and frame2 to device memory
    uint8_t *d_orig;
    size = w * h * sizeof(uint8_t);
    cudaMalloc(&d_orig, size);
    cudaMemcpy(d_orig, orig, size, cudaMemcpyHostToDevice);
    uint8_t *d_ref;
    cudaMalloc(&d_ref, size);
    cudaMemcpy(d_ref, ref, size, cudaMemcpyHostToDevice);
    // Allocate sad in device memory and host memory
    int *sad, *d_sad;
    size = mb_x * mb_y * 4 * range * range * sizeof(int);
    cudaMalloc(&d_sad, size);
    sad = (int *)malloc(size);
    memset(sad, INT_MAX, size);
    cudaMemcpy(d_sad, sad, size, cudaMemcpyHostToDevice);
    // to see if the cuda kernel function launch failed
    cudaError_t cudaStatus = cudaGetLastError();
    // Invoke kernel
    dim3 dimBlock(2 * range, 2 * range);
    dim3 dimGrid(mb_x, mb_y);
    compute_sad<<<dimGrid, dimBlock>>>(d_orig, d_ref, w, d_sad, h);
    // print error message
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "Kernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
    }
    cudaDeviceSynchronize();
    // Read sad from device memory
    cudaMemcpy(sad, d_sad, size, cudaMemcpyDeviceToHost);
    // compute best_sad
    int best_sad = INT_MAX;
    int i, j, x, y;
    int left, top, right, bottom;
    struct macroblock *mb;
    struct macroblock_cuda *h_mbs, *d_mbs;
    Init_mb_gpu(&h_mbs, &d_mbs, total);
    cudaMemcpy(h_mbs->gpu_mv_x, h_mbs->mv_x, sizeof(uint8_t) * mb_x * mb_y, cudaMemcpyHostToDevice);
    cudaMemcpy(h_mbs->gpu_mv_y, h_mbs->mv_y, sizeof(uint8_t) * mb_x * mb_y, cudaMemcpyHostToDevice);
    cudaDeviceSynchronize();
    dim3 thrperblock(32, 32, 1);
    int grid_dim = ceil(total / 1024.0);
    FreeMacroblockCuda(h_mbs, d_mbs);
    cudaFree(d_orig);
    cudaFree(d_ref);
    cudaFree(d_sad);
    free(sad);
}

首先使用cudaMalloc函数分配GPU上内存，使用cudaMemcpy函数将数据读入GPU，使用核函数compute_sad进行计算，之后将所得的sad计算结果送入核函数find_best_mv寻找每个8x8图像块最小的sad并确定每个8x8图像块的位移。最终将数据送回CPU进行存
储。compute_sad核函数实现如下：__global__voidcompute_sad(uint8_t*block1,uint8_t*block2,intstride,int*result,inth)__global__voidcompute_sad(uint8_t*block1,uint8_t*block2,intstride,int*result,inth){intxbound,ybound;intleft=blockIdx.x*8-blockDim.x/2;inttop=blockIdx.y*8-blockDim.y/2;intright=blockIdx.x*8+blockDim.x/2;intbottom=blockIdx.y*8+blockDim.y/2;if(left0)left=0;if(top0)top=0;if(right(stride-8))right=stride-8;if(bottom(h-8))bottom=h-8;xbound=right-left;ybound=bottom-top;6uint8_t*search_range_ref=block2+top*stride+left;intsad_value;__shared__uint8_torig_block[64];if(threadIdx.x8&&threadIdx.y8){orig_block[threadIdx.y*8+threadIdx.x]=(block1+blockIdx.y*8*stride+blockIdx.x*8)[threadIdx.y*stride+threadIdx.x];}__syncthreads();constunsignedintmask=0x3210+0x1111*(threadIdx.x%4);uint8_t*ref_top_row_aligned=search_range_ref+threadIdx.y*stride+(threadIdx.x/4)*4;//v_2.0if(threadIdx.xxbound&&threadIdx.yybound){sad_value=0;#pragmaunrollfor(unsignedintv=8;v--;){uint32_t*ref_row=(uint32