在学习cuda的时候,运行了《CUDA for Engineers an Introduction to High-Performance Parallel Computing》的dd_ld_shared这个案例时,一直出错,这个案例主要是输入一个一维数组,然后每个数组值取其前后两值做个加加减减的运算,然后输出打印,发现打印的值都是初始值,说明这个核没有运行过。然后我在每个cuda函数加了各check_error的函数,发现运行到从显存写出数据到内存的时候,说是非法的内存访问(illeage memory之类的),那么也间接说明了那个核根本没有运算。这个案例用到了动态申请共享内存。所有代码如下:
#include <math.h>
#include <stdio.h>
static void HandleError( cudaError_t err,
const char *file,
int line ) {
if (err != cudaSuccess) {
printf( "%s in %s at line %d\n", cudaGetErrorString( err ),
file, line );
exit( EXIT_FAILURE );
}
}
#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))
#define TPB 64
#define RAD 1
__global__ void ddKernel(float *d_out, const float *d_in, int size, float h) {
const int i = threadIdx.x + blockDim.x * blockIdx.x;
if (i >= size) return;
const int s_idx = threadIdx.x + RAD;
extern __shared__ float s_in[];
// Regular cells
s_in[s_idx] = d_in;
// Halo cells
if (threadIdx.x < RAD) {
// careful: the two lines below will also access d_in[-1] and d_in[size+1] which
// are undefined! This bug is fixed in heat_2d (cf. idxClip function)
s_in[s_idx - RAD] = d_in[i - RAD];
s_in[s_idx + blockDim.x] = d_in[i + blockDim.x];
}
__syncthreads();
d_out = (s_in[s_idx-1] - 2.f*s_in[s_idx] + s_in[s_idx+1])/(h*h);
}
int main(){
const float PI = 3.1415927;
const int N = 150;
const float h = 2*PI/N;
float x[N] = {0.0};
float u[N] = {0.0};
float result_parallel[N] = {0.0};
for(int i=0;i<N;i++)
{
x = 2*PI*i/N;
u = sinf(x);
}
float *d_in = 0;
float *d_out = 0;
HANDLE_ERROR(cudaMalloc(&d_in,N*sizeof(float)));
HANDLE_ERROR(cudaMalloc(&d_out,N*sizeof(float)));
HANDLE_ERROR(cudaMemcpy(d_in,&u[0],N*sizeof(float),cudaMemcpyHostToDevice));
const size_t smemSize = (TPB+2*RAD)*sizeof(float);
ddKernel<<<(N+TPB-1)/TPB,TPB,smemSize>>>(d_out,d_in,N,h);
HANDLE_ERROR(cudaMemcpy(result_parallel,d_out,N*sizeof(float),cudaMemcpyDeviceToHost));
HANDLE_ERROR(cudaFree(d_in));
HANDLE_ERROR(cudaFree(d_out));
FILE *outfile = fopen("results.csv", "w");
for (int i = 1; i < N - 1; ++i) {
fprintf(outfile, "%f,%f,%f,%f\n", x, u,
result_parallel, result_parallel + u);
}
fclose(outfile);
}
我用的显卡RTX2080,TITTAN V都试过,CUDA版本9.1和10.1试过。我想是不是新的显卡或者cuda将什么功能给改变了什么,望高手能指导一下,谢谢。
|