用户
 找回密码
 立即注册
回帖奖励 1 CUDA币 回复本帖可获得 1 CUDA币奖励! 每人限 1 次
yuhuilcm 该用户已被删除
发表于 2013-9-17 18:04:05
17540
最近在对一个PSO算法进行CUDA编程优化,核函数具体代码如下:
#include <stdlib.h>
#include <stdio.h>
#include <windows.h>
#include<iostream>
#include <time.h>
#include <math.h>
#include <curand.h>
#include <curand_kernel.h>
#include <cuda_gl_interop.h>
#include <cuda_runtime.h>
using namespace std;
#define FeaturesNum1 5
#define THREDANUM1 32
#define M_PI1       3.14159265358979323846
#define PNum1 24
#define PDim1 17
__constant__  int pso_lanmark_GPU[FeaturesNum1][3];
__constant__   double  Featuresinfo_GPU[FeaturesNum1][3];
__constant__ double alpas_GPU[FeaturesNum1][3][10];
__constant__ double pose_Xup_GPU[17];
__constant__   double pose_Xdown_GPU[17];
__constant__   double Vmax_GPU[17];

__device__ double myrand1(unsigned int &x,unsigned int &y,unsigned int &z,unsigned int &c,int tidx)
{  
unsigned long long t, A = 698769069ULL;  
x = 69069*x+1234+tidx*1000;  
y ^= (y<<13+tidx);
y ^= (y>>17+tidx);
y ^= (y<<5+tidx);  
t = (A*z + c);
c = (t >> 32);
z = t;
return (double)((x+y+z)/RAND_MAX)/RAND_MAX/4;  
}

__global__ void ParticleFly_GPU(double* CudaDat ,double* data_out,int isize,int circulate)
{

__shared__ double Fit[PNum1];
__shared__ int GBestIndex ;
int tidx = threadIdx.x;  
    unsigned int x;
    unsigned int y;
    unsigned int z;
    unsigned int c;  
      x=123456789;
      y=362436000;
   z=521288629;
   c=7654321;
__shared__ double test[PNum1];
__shared__ double Wmax ;
__shared__ double Wmin ;
__shared__ double W;
__shared__ double C1;
__shared__ double C2;
__shared__ double optical_center[2];
Wmax = 0.9;
Wmin = 0.4;
W = 1;
C1 = 2;
C2 = 2;
optical_center[0] = 172.766;
optical_center[1] = 306.3046;

  for(int k = 0;k<circulate;k++)
    {
    if(tidx < PNum1)
         {
     if( k <= 1500)
    W = Wmax - k*(Wmax-Wmin)/circulate;
    else
    W = Wmin;
   for(int j=0; j<PDim1; j++)   //合并访问优化的问题
   {
    *(CudaDat + (PDim1 + j)*PNum1 + tidx)  = W*(*(CudaDat + (PDim1 + j)*PNum1 + tidx)) +//修改速度
      myrand1(x,y,z,c,tidx)*C1*(*(CudaDat + (2*PDim1 + j)*PNum1 + tidx) -(*(CudaDat + j*PNum1 + tidx)) )+
      myrand1(x,y,z,c,tidx)*C2*(*(CudaDat + (2*PDim1 + j)*PNum1 + GBestIndex) -(*(CudaDat + j*PNum1 + tidx)));
   
    if(*(CudaDat + (PDim1 + j)*PNum1 + tidx) > Vmax_GPU[j]) *(CudaDat + (PDim1 + j)*PNum1 + tidx) = Vmax_GPU[j];
   
    if(CudaDat[(PDim1 + j)*PNum1 + tidx]<-Vmax_GPU[j]) *(CudaDat + (PDim1 + j)*PNum1 + tidx) = -Vmax_GPU[j];
     *(CudaDat +  j*PNum1 + tidx) += (*(CudaDat + (PDim1 + j)*PNum1 + tidx)); //修改坐标
     
    if(CudaDat[j*PNum1 + tidx]>pose_Xup_GPU[j])
     CudaDat[j*PNum1 + tidx]=pose_Xup_GPU[j];//保护
   
    if(CudaDat[j*PNum1 + tidx]<pose_Xdown_GPU[j])
     CudaDat[j*PNum1 + tidx]=pose_Xdown_GPU[j];

   }
          for(int i = 0;i<17;i++)
     test[i] = CudaDat[i*PNum1];

//////////////////////////////////////GetFit////////////////////////////////
  double cost = 0;   
  double angle_cos[3],angle_sin[3];

  angle_cos[0] = cos(*(CudaDat + 3*PNum1 + tidx));
  angle_cos[1] = cos(*(CudaDat + 4*PNum1 + tidx));
  angle_cos[2] = cos(*(CudaDat + 5*PNum1 + tidx));
  angle_sin[0] = sin(*(CudaDat + 3*PNum1 + tidx));
  angle_sin[1] = sin(*(CudaDat + 4*PNum1 + tidx));
  angle_sin[2] = sin(*(CudaDat + 5*PNum1 + tidx));

  test[1] = *(CudaDat + 3*PNum1 + tidx);
  for(int i=0; i< FeaturesNum1; i++)
  {
   double alpasx = 0,alpasy =0,alpasz=0;
   for(int j=0; j<10; j++)
   {
    alpasx += *(CudaDat + (j+7)*PNum1 + tidx)*alpas_GPU[i][0][j];
    alpasy += *(CudaDat + (j+7)*PNum1 + tidx)*alpas_GPU[i][1][j];
    alpasz += *(CudaDat + (j+7)*PNum1 + tidx)*alpas_GPU[i][2][j];
   
   }
   double xx = Featuresinfo_GPU[i][0] + alpasx;
   double xy = Featuresinfo_GPU[i][1] + alpasy;
   double xz = Featuresinfo_GPU[i][2] + alpasz;
   /*double xx = Featuresinfo[i][0];
   double xy = Featuresinfo[i][1];
   double xz = Featuresinfo[i][2];

   double wx = xx*angle_cos[2]*angle_cos[1] + xy*(angle_cos[2]*angle_sin[1]*angle_sin[0]-angle_sin[2]*angle_cos[0])
    + xz*(angle_cos[2]*angle_sin[1]*angle_cos[0]+angle_sin[2]*angle_sin[0]) +*(CudaDat+ tidx);
   double wy = xx*angle_sin[2]*angle_cos[1] + xy*(angle_sin[2]*angle_sin[1]*angle_sin[0]+angle_cos[2]*angle_cos[0])
    + xz*(angle_sin[2]*angle_sin[1]*angle_cos[0]-angle_cos[2]*angle_sin[0]) + *(CudaDat + 1*PNum1 + tidx);
   double wz = -xx*angle_sin[1] + xy*angle_cos[1]*angle_sin[0] +xz*angle_cos[1]*angle_cos[0] + *(CudaDat + 2*PNum1 + tidx);
   double px = optical_center[0] + *(CudaDat + 6*PNum1 + tidx)*(wx/wz) - (double)pso_lanmark_GPU[i][0];
   double py = optical_center[1] - *(CudaDat + 6*PNum1 + tidx)*(wy/wz) - (double)pso_lanmark_GPU[i][1];
   cost = cost +  px*px + py*py;
   //cost = cost + abs(px)+abs(py);
  }
   Fit[tidx] = -1*cost;  //Fit得到全部一样的值

    if( Fit[tidx] >= *(CudaDat + (3*PDim1+1)*PNum1 + tidx))
           {
     *(CudaDat + (3*PDim1+1)*PNum1 + tidx) = Fit[tidx];
     for(int j=0; j<PDim1; j++)
     *(CudaDat + (2*PDim1+j)*PNum1 + tidx) = *(CudaDat + j*PNum1 + tidx);
           }
  }
    __syncthreads();
    if(tidx < PNum1)
     {
      if(tidx == 0)  //用第一个线程计算GBestIndex
      {
      GBestIndex = 0;
     for(int i=0; i<PNum1; i++)
      if(*(CudaDat + (3*PDim1+1)*PNum1 + i) >= *(CudaDat + (3*PDim1+1)*PNum1 + GBestIndex) && i!=GBestIndex) GBestIndex = i;
      }
   }
    }
      __syncthreads();
    if(tidx == 0 )
    {
     for(int i = 0;i<isize;i++)
     {
    //data_out[i] = CudaDat[GBestIndex*isize + i];
     data_out[i] = *(CudaDat + i*PNum1 + GBestIndex);
        }
     }
}

extern "C"
clock_t ParticleFly_CPU(double* CudaDat,double* data_out,int isize,int circulate,int pso_lanmark1[5][3])
{
  
  double Featuresinfo1[FeaturesNum1][3]={{-31.1956,34.2044,94.8308},{30.3130,34.2285,94.5579},{0.0072,10.1444,128.9015},{-29.3909,-33.8801, 98.1061},{29.1029,-33.5093,97.8227}};
  double alpas1[FeaturesNum1][3][10]=.......................................

        double pose_Xup1[] = {   450, 450,1000, M_PI1/2, M_PI1/2, M_PI1/2, 900, 0.002653,  0.0016677, 0.0013104, 0.0009396, 0.0008288, 0.0006270, 0.0005781, 0.0005434, 0.0005332, 0.0004848}; //自变量上界   
        double pose_Xdown1[] = {-450,-450,1000,-M_PI1/2,-M_PI1/2,-M_PI1/2,-900,-0.002653, -0.0016677,-0.0013104,-0.0009396,-0.0008288,-0.0006270,-0.0005781,-0.0005434,-0.0005332,-0.0004848}; //自变量下界  
  double Vmax[PDim1];

    for(int i=0; i<PDim1; i++)
     Vmax[i] = (pose_Xup1[i]-pose_Xdown1[i])*0.1;
  

double* CudaDat_GPU;
double* data_out_GPU;
  clock_t start = clock();
  
cudaMalloc((void**) &CudaDat_GPU,sizeof(double)*PNum1*isize);
cudaMalloc((void**) &data_out_GPU,sizeof(double)*isize);
cudaMemset(CudaDat_GPU, 0, sizeof(double)*PNum1*isize);
cudaMemset(data_out, 0, sizeof(double)*isize);
clock_t start1 = clock();

cudaMemcpy(CudaDat_GPU,CudaDat,sizeof(double)*PNum1*isize,cudaMemcpyHostToDevice);

cudaMemcpyToSymbol( pso_lanmark_GPU, pso_lanmark1,
                                sizeof(int) *5*3);
cudaMemcpyToSymbol( Featuresinfo_GPU, Featuresinfo1,
                                sizeof(double) * 5*3);
cudaMemcpyToSymbol( alpas_GPU, alpas1,
                                sizeof(int) *5* 3*10);
cudaMemcpyToSymbol( pose_Xup_GPU, pose_Xup1,
                                sizeof(double) * 17);
cudaMemcpyToSymbol( pose_Xdown_GPU, pose_Xdown1,
                                sizeof(double) * 17);
cudaMemcpyToSymbol( Vmax_GPU, Vmax,
                                sizeof(double) * 17);

    clock_t end1 = clock();

ParticleFly_GPU<<<1,THREDANUM1>>>(CudaDat_GPU,data_out_GPU,isize,circulate);
cudaThreadSynchronize();
clock_t end2 = clock();
cudaMemcpy(data_out,data_out_GPU,sizeof(double)*isize,cudaMemcpyDeviceToHost);
    clock_t end = clock();

cudaFree(CudaDat_GPU);
cudaFree(data_out_GPU);

cout<<start1 - start<<"  -------  "<<end1 - start1<<"------"<<end2-end1<<"------"<<end-end2<<endl;
return end - start;
}
,我的配置是pentium dual coreE5300 2.6FHZ,GTX650TI显卡,这个CPU函数调用四次,通过clock()函数测量时间,得到cudaMalloc 部分第一次时间消耗70 毫秒,核函数时间消耗200毫秒左右,对于传入数据,我已经做了一些合并访问的修改,性能提高了2倍左右(以前核函数400毫秒)。
我想问的是-------
                     对与这个函数,还有哪些地方可以改进的地方,我觉得这个运行速度还有很大的提升空间,自己刚刚学校CUDA不久,跪求指点一二!!!!!
使用道具 举报 回复
发新帖
您需要登录后才可以回帖 登录 | 立即注册