我的 kernel 如下。profiler 结果显示 Global Load Efficiency 只有 50%。我的理解是:在对 P1_d 赋值的时候,实部和虚部是分开写入的,所以内存访问没能合并,是这样的吗?如果要提高效率,该怎么做呢?
/*
 * Builds two complex fields from a per-pixel amplitude and an accumulated phase:
 *
 *   phase1    = sum over k in [2, Zs) of wf65_d[k*row*col + idx] * zercoef_d[k]
 *   phase2    = phase1 + theta_d[idx]
 *   P1_d[idx] = pupil_d[idx] * (cos(phase1), sin(phase1))
 *   P2_d[idx] = pupil_d[idx] * (cos(phase2), sin(phase2))
 *
 * where idx = j*col + i, i is the x thread index and j is the y thread index.
 *
 * Launch layout: 2D grid; the x dimension must cover `col` (the contiguous
 * axis) and the y dimension must cover `row`. Threads outside that range exit
 * without touching memory.
 *
 * Parameters:
 *   P1_d, P2_d  output arrays, row*col elements each (.real / .imag written).
 *   pupil_d     per-pixel amplitude, row*col elements.
 *   zercoef_d   per-term coefficients, indexed by k up to Zs-1.
 *   wf65_d      stack of planes, each row*col elements, indexed by k up to Zs-1.
 *   theta_d     per-pixel phase offset added for P2 only, row*col elements.
 *   row, col    plane dimensions; col is the row pitch in elements.
 *   high        unused; kept so existing launch sites keep working.
 *
 * `Zs` (number of terms) is expected to be defined elsewhere in the file.
 * Terms k = 0 and k = 1 are skipped, exactly as in the original loop.
 *
 * NOTE(review): the original loop body read `wf65` / `zercoef`, which are not
 * declared in this block, while the `wf65_d` / `zercoef_d` parameters were
 * unused. This version reads the parameters; confirm there were no file-level
 * arrays with the un-suffixed names that were meant instead.
 *
 * NOTE(review): the inputs are double but the arithmetic is done in float with
 * fast-math intrinsics, as in the original; this is kept as a deliberate
 * speed/precision trade-off.
 */
__global__ void computer_P1_with_P2(compx *P1_d,compx *P2_d,double *pupil_d,const double *zercoef_d,double *wf65_d,double *theta_d,int row,int col,int high)
{
    const int i = blockIdx.x * blockDim.x + threadIdx.x;   // column (fast axis)
    const int j = blockIdx.y * blockDim.y + threadIdx.y;   // row

    // idx = j*col + i, so i must stay below col and j below row. The original
    // guard had the two limits swapped, which is only harmless when row == col.
    if (i >= col || j >= row)
    {
        return;
    }

    // 64-bit offsets: k*row*col can exceed INT_MAX for large stacks.
    const long long planeSize = (long long)row * col;
    const long long idx = (long long)j * col + i;

    float pupil = pupil_d[idx];

    // Accumulate the phase over terms 2..Zs-1. Consecutive threads in x read
    // consecutive elements of each plane.
    float wavefront1 = 0;
    for (int k = 2; k < Zs; k++)
    {
        wavefront1 += wf65_d[k * planeSize + idx] * zercoef_d[k];
    }
    float wavefront2 = wavefront1 + theta_d[idx];

    // One fast sin/cos evaluation per phase instead of two separate calls.
    float sin1, cos1, sin2, cos2;
    __sincosf(wavefront1, &sin1, &cos1);
    __sincosf(wavefront2, &sin2, &cos2);

    P1_d[idx].real = pupil * cos1;
    P1_d[idx].imag = pupil * sin1;
    P2_d[idx].real = pupil * cos2;
    P2_d[idx].imag = pupil * sin2;
}
|