ice 发表于 2013-9-22 15:04 
LZ您好:
请您以profiler为准,并给出测试截图以便分析。
// Time a single matrixMul launch with CUDA events and report its throughput.
//
// Sequence (same as the original two-iteration loop):
//   1. one untimed launch (presumably a warm-up -- confirm with the author),
//   2. record start1, launch again, record stop1, wait for stop1.
// Both events and both launches are issued to the NULL (default) stream, so
// start1 is only reached after the untimed launch has finished executing.

// Untimed launch.
matrixMul<<<blocks,threads>>>(d_c,d_a,d_b,N,N);

// Timed launch, bracketed by the two events.
cudaEventRecord(start1, NULL);
matrixMul<<<blocks,threads>>>(d_c,d_a,d_b,N,N);
cudaEventRecord(stop1, NULL);

// Kernel launches do not return an error code themselves; a bad launch
// configuration is only visible through cudaGetLastError(). It is queried
// after stop1 has been recorded so that the check adds no host work between
// the timed launch and the stop event.
cudaError_t launchStatus1 = cudaGetLastError();
if (launchStatus1 != cudaSuccess)
{
    printf("matrixMul launch failed: %s\n", cudaGetErrorString(launchStatus1));
}

// Block the host until stop1 has completed; errors raised while the kernel
// was executing are reported here.
cudaError_t syncStatus1 = cudaEventSynchronize(stop1);
if (syncStatus1 != cudaSuccess)
{
    printf("matrixMul execution failed: %s\n", cudaGetErrorString(syncStatus1));
}

float msecTotal1 = 0.0f;
cudaError_t elapsedStatus1 = cudaEventElapsedTime(&msecTotal1, start1, stop1);
if (elapsedStatus1 != cudaSuccess)
{
    printf("cudaEventElapsedTime failed: %s\n", cudaGetErrorString(elapsedStatus1));
}

// Only one launch is timed, so the total is also the per-launch time.
float msecPerMatrixMul1 = msecTotal1;
// 2*N^3 operations: one multiply and one add per inner-product term.
double flopsPerMatrixMul1 = 2.0 * (double)N * (double)N * (double)N;
// Keep the whole conversion in double precision (the original used
// float-suffixed literals, which rounded the 1e-9 scale factor to float).
double gigaFlops1 = (flopsPerMatrixMul1 * 1.0e-9) / ((double)msecPerMatrixMul1 / 1000.0);
printf( "matrixMul1 Performance= %.2f GFlop/s, Time= %.3f msec, Size= %.0f Ops\n", gigaFlops1,msecPerMatrixMul1,flopsPerMatrixMul1);
C:\Users\wang\Desktop\实验结果.jpg |