本帖最后由 jfjkzero 于 2020-12-11 14:55 编辑
最近在学习 CUDA C 编程。在并行运行一个简单的解调算法时,统计时间后发现运行速度越来越慢(但运算结果仍然正确)。后来简化到只运行其中一个核函数,虽然复杂度下降了,但还是会越跑越慢;尝试过每一轮都执行 cudaFree 和 cudaMalloc 也没用。这是为什么呢?
刚刚还发现了,对于我用过的一些矩阵加法、矩阵点乘的教程例子,加上for循环跑很多很多次,也是会出现这种越来越慢的情况。
环境:
Windows 10,Visual Studio 2019 Community
cuda 10.2
cudnn 7.6.5
用几年前的笔记本进行测试——显卡950M
跪求各路大神救救了
简化后作为测试的代码如下:
- #include <stdio.h>
- #include<cuda.h>
- #include<cuda_runtime.h>
- #include "..\common\book.h"
- #include <math.h>
- #include <time.h>
// Launch configuration used by main: BLOCK_NUM blocks of THREAD_NUM threads.
#define BLOCK_NUM 16
#define THREAD_NUM 16
// Side length of the square sample matrix.
#define R_SIZE 256
// Total number of samples. Parenthesized so the macro stays a single value
// when it is expanded inside a larger expression (e.g. `x / M_SIZE`).
#define M_SIZE (R_SIZE * R_SIZE)
// Number of entries in SNR[].
#define SNR_LEN 7
// Number of frames simulated per SNR point.
#define N 100
// Not referenced by the code in this listing.
#define pi 3.1415926535
// SNR points iterated by main and reported as "SNR = ... dB".
double SNR[SNR_LEN] = { 0, 1, 2, 3, 4, 5, 6 };
/*
 * Hard-decision demodulation of R_SIZE * R_SIZE received samples.
 *
 * For every sample the kernel computes LLR = 4 * r / sigma and stores 1 in
 * demodSignal_HD when the LLR is strictly positive, 0 otherwise.
 *
 * ReceivedSignal_R : device buffer holding R_SIZE * R_SIZE doubles (read).
 * demodSignal_HD   : device buffer holding R_SIZE * R_SIZE shorts (written).
 * sigma            : divisor applied to every sample.
 *
 * Launch layout: 1-D grid of 1-D blocks, any size. Each thread starts at its
 * global index and advances by the total thread count, so neighbouring
 * threads touch neighbouring addresses and no thread indexes past the end of
 * the buffers regardless of the launch configuration.
 */
__global__ void DeModuate(double* ReceivedSignal_R, short int* demodSignal_HD, double sigma)
{
    const int sampleCount = R_SIZE * R_SIZE;
    const int step = gridDim.x * blockDim.x;

    for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < sampleCount; idx += step)
    {
        const double demodSignal_LLR = 4 * ReceivedSignal_R[idx] / sigma;
        demodSignal_HD[idx] = (demodSignal_LLR > 0 ? 1 : 0);
    }
}
/*
 * Timing harness: launches DeModuate N times for each of the SNR_LEN SNR
 * points and prints the accumulated GPU time after every frame, followed by
 * one summary line per SNR point.
 *
 * Kernel launches are asynchronous, so wrapping the launch in clock() only
 * measures how long it takes to enqueue the work; once the launch queue is
 * full each launch blocks until an earlier kernel retires, which makes the
 * measured time appear to grow. The launch is therefore bracketed with CUDA
 * events and the stop event is synchronized before the elapsed time is read.
 */
int main(int arc, char* argv[])
{
    (void)arc;
    (void)argv;

    // NOTE(review): the posted listing uses sigma, err_Uncoded,
    // err_Uncoded_temp and BER_Uncoded without declaring them. They are
    // declared here with placeholder values so the harness builds; replace
    // them with the real noise model / error counters.
    double sigma = 1.0;
    int err_Uncoded = 0;
    int err_Uncoded_temp = 0;
    double BER_Uncoded[SNR_LEN] = { 0 };

    const size_t sampleCount = (size_t)(R_SIZE * R_SIZE);

    // GPU declaration
    double* Dev_ReceivedSignal_R = NULL;
    short int* Dev_demodSignal_HD = NULL;

    // GPU malloc
    HANDLE_ERROR(cudaMalloc((void**)&Dev_ReceivedSignal_R, sizeof(double) * sampleCount));
    HANDLE_ERROR(cudaMalloc((void**)&Dev_demodSignal_HD, sizeof(short int) * sampleCount));

    // The listing never uploads received samples; zero-fill the input so the
    // kernel does not read uninitialized device memory.
    HANDLE_ERROR(cudaMemset(Dev_ReceivedSignal_R, 0, sizeof(double) * sampleCount));

    // Events used to time each launch on the GPU.
    cudaEvent_t evStart, evStop;
    HANDLE_ERROR(cudaEventCreate(&evStart));
    HANDLE_ERROR(cudaEventCreate(&evStop));

    // Accumulated kernel time in seconds across all frames and SNR points.
    double DeMod_time = 0;

    for (int s = 0; s < SNR_LEN; s++)
    {
        err_Uncoded = 0;
        for (int frame = 0; frame < N; frame++)
        {
            HANDLE_ERROR(cudaEventRecord(evStart, 0));
            DeModuate<<<BLOCK_NUM, THREAD_NUM>>>(Dev_ReceivedSignal_R, Dev_demodSignal_HD, sigma);
            HANDLE_ERROR(cudaGetLastError());          // catches bad launch configurations
            HANDLE_ERROR(cudaEventRecord(evStop, 0));
            HANDLE_ERROR(cudaEventSynchronize(evStop)); // wait until the kernel has finished

            float frameMs = 0.0f;
            HANDLE_ERROR(cudaEventElapsedTime(&frameMs, evStart, evStop));
            DeMod_time += frameMs / 1000.0;

            printf("SNR = %1.1f, %4d/%d sim finished, ori_err = %4d, time: %f\n", SNR[s], frame, N, err_Uncoded_temp, DeMod_time);
        }
    }

    for (int s = 0; s < SNR_LEN; s++)
    {
        printf("SNR = %1.1f dB,BER_Uncoded= %1.10f;\n", SNR[s], BER_Uncoded[s]);
    }

    // GPU free
    HANDLE_ERROR(cudaEventDestroy(evStart));
    HANDLE_ERROR(cudaEventDestroy(evStop));
    HANDLE_ERROR(cudaFree(Dev_demodSignal_HD));
    HANDLE_ERROR(cudaFree(Dev_ReceivedSignal_R));
    (void)err_Uncoded;
    return 0;
}
复制代码
|