用户
 找回密码
 立即注册
发表于 2021-7-9 22:15:48
30820
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <cuda_runtime.h>
#include <cusparse.h>
#include<iostream>
using namespace std;

// error check macros
#define CUSPARSE_CHECK(x) {cusparseStatus_t _c=x; if (_c != CUSPARSE_STATUS_SUCCESS) {printf("cusparse fail: %d, line: %d\n", (int)_c, __LINE__); exit(-1);}}

#define cudaCheckErrors(msg) \
    do { \
        cudaError_t __err = cudaGetLastError(); \
        if (__err != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                msg, cudaGetErrorString(__err), \
                __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)

int main( )
{
    //CSR format of matrix A and Vector b
    double* h_valA;
    int* h_csrRowPtrA;
    int* h_csrColIndA;
    double* h_b;
    int n, nnzA;
    n = 3; nnzA = 5;

    h_valA = (double*)malloc(nnzA * sizeof(double));
    h_csrRowPtrA = (int*)malloc((n + 1) * sizeof(int));
    h_csrColIndA = (int*)malloc(nnzA * sizeof(int));
    h_b = (double*)malloc(n * sizeof(double));

    h_valA[0] = 3.0;
    h_valA[1] = 2.0;
    h_valA[2] = 2.0;
    h_valA[3] = 2.0;
    h_valA[4] = 1.0;

    h_csrRowPtrA[0] = 0;
    h_csrRowPtrA[1] = 2;
    h_csrRowPtrA[2] = 3;
    h_csrRowPtrA[3] = 5;

    h_csrColIndA[0] = 0;
    h_csrColIndA[1] = 2;
    h_csrColIndA[2] = 1;
    h_csrColIndA[3] = 0;
    h_csrColIndA[4] = 2;

    h_b[0] = 3.5;
    h_b[1] = 1.5;
    h_b[2] = 2.0;

    //CSR format of matrix A and Vector b (device)
    double* valA;
    int* csrRowPtrA;
    int* csrColIndA;
    double* b;

    cudaMalloc((void**)&valA, nnzA * sizeof(double));
    cudaMalloc((void**)&csrRowPtrA, (n + 1) * sizeof(int));
    cudaMalloc((void**)&csrColIndA, nnzA * sizeof(int));
    cudaMalloc((void**)&b, n * sizeof(double));
    cudaCheckErrors("cudaMalloc fail");

    cudaMemcpy(valA, h_valA, (size_t)(nnzA * sizeof(double)), cudaMemcpyHostToDevice);
    cudaMemcpy(csrRowPtrA, h_csrRowPtrA, (size_t)((n + 1) * sizeof(int)), cudaMemcpyHostToDevice);
    cudaMemcpy(csrColIndA, h_csrColIndA, (size_t)(nnzA * sizeof(int)), cudaMemcpyHostToDevice);
    cudaMemcpy(b, h_b, (size_t)(n * sizeof(double)), cudaMemcpyHostToDevice);
    cudaCheckErrors("cudaMemcpy fail");

    //Initialize cuSPARSE
    cusparseHandle_t handle;
    CUSPARSE_CHECK(cusparseCreate(&handle));
    cusparseStatus_t status;

    cusparseMatDescr_t descrA;
    status = cusparseCreateMatDescr(&descrA);
    CUSPARSE_CHECK(status);

    cusparseMatDescr_t  descr_L;
    status = cusparseCreateMatDescr(&descr_L);
    CUSPARSE_CHECK(status);
    status = cusparseSetMatIndexBase(descr_L, CUSPARSE_INDEX_BASE_ZERO);
    CUSPARSE_CHECK(status);
    status = cusparseSetMatType(descr_L, CUSPARSE_MATRIX_TYPE_GENERAL);
    CUSPARSE_CHECK(status);
    status = cusparseSetMatFillMode(descr_L, CUSPARSE_FILL_MODE_LOWER);
    CUSPARSE_CHECK(status);
    status = cusparseSetMatDiagType(descr_L, CUSPARSE_DIAG_TYPE_UNIT);
    CUSPARSE_CHECK(status);

    cusparseMatDescr_t  descr_U;
    status = cusparseCreateMatDescr(&descr_U);
    status = cusparseSetMatIndexBase(descr_U, CUSPARSE_INDEX_BASE_ZERO);
    CUSPARSE_CHECK(status);
    status = cusparseSetMatType(descr_U, CUSPARSE_MATRIX_TYPE_GENERAL);
    CUSPARSE_CHECK(status);
    status = cusparseSetMatFillMode(descr_U, CUSPARSE_FILL_MODE_UPPER);
    CUSPARSE_CHECK(status);
    status = cusparseSetMatDiagType(descr_U, CUSPARSE_DIAG_TYPE_NON_UNIT);
    CUSPARSE_CHECK(status);

    //Query space and allocate memory
    csrilu02Info_t info_A = 0; CUSPARSE_CHECK(cusparseCreateCsrilu02Info(&info_A));
    csrsv2Info_t info_L = 0; CUSPARSE_CHECK(cusparseCreateCsrsv2Info(&info_L));
    csrsv2Info_t info_U = 0; CUSPARSE_CHECK(cusparseCreateCsrsv2Info(&info_U));

    int pBufferSize_A; int pBufferSize_L; int pBufferSize_U;
    status = cusparseDcsrilu02_bufferSize(handle, n, nnzA, descrA, valA, csrRowPtrA, csrColIndA, info_A, &pBufferSize_A);
    CUSPARSE_CHECK(status);
    status = cusparseDcsrsv2_bufferSize(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, n, nnzA, descr_L, valA, csrRowPtrA, csrColIndA, info_L, &pBufferSize_L);
    CUSPARSE_CHECK(status);
    status = cusparseDcsrsv2_bufferSize(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, n, nnzA, descr_U, valA, csrRowPtrA, csrColIndA, info_U, &pBufferSize_U);
    CUSPARSE_CHECK(status);

    int pBufferSize = max(pBufferSize_A, max(pBufferSize_L, pBufferSize_U));
    void* pBuffer = 0;
    cudaMalloc((void**)&pBuffer, pBufferSize);
    cudaCheckErrors("cudaMalloc fail");

    // LU decomposition analysis
    int structural_zero;
    status = cusparseDcsrilu02_analysis(handle, n, nnzA, descrA, valA, csrRowPtrA, csrColIndA, info_A, CUSPARSE_SOLVE_POLICY_NO_LEVEL, pBuffer);
    CUSPARSE_CHECK(status);
    status = cusparseXcsrilu02_zeroPivot(handle, info_A, &structural_zero);
    CUSPARSE_CHECK(status);
    if (CUSPARSE_STATUS_ZERO_PIVOT == status)
    {
        printf("A(%d,%d) is missing\n", structural_zero, structural_zero);
    }
    status = cusparseDcsrsv2_analysis(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, n, nnzA, descr_L, valA, csrRowPtrA, csrColIndA, info_L, CUSPARSE_SOLVE_POLICY_NO_LEVEL, pBuffer);
    CUSPARSE_CHECK(status);
    status = cusparseDcsrsv2_analysis(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, n, nnzA, descr_U, valA, csrRowPtrA, csrColIndA, info_U, CUSPARSE_SOLVE_POLICY_USE_LEVEL, pBuffer);
    CUSPARSE_CHECK(status);

    // A = L * U
    int numerical_zero;
    status = cusparseDcsrilu02(handle, n, nnzA, descrA, valA, csrRowPtrA, csrColIndA, info_A, CUSPARSE_SOLVE_POLICY_NO_LEVEL, pBuffer);
    CUSPARSE_CHECK(status);
    status = cusparseXcsrilu02_zeroPivot(handle, info_A, &numerical_zero);
    CUSPARSE_CHECK(status);
    if (CUSPARSE_STATUS_ZERO_PIVOT == status)
    {
        printf("U(%d,%d) is zero\n", numerical_zero, numerical_zero);
    }

    // b = L * Z
    double* d_z;  
    cudaMalloc(&d_z, n * sizeof(double));
    cudaCheckErrors("cudaMalloc fail");

    const double alpha = 1.0;
    status = cusparseDcsrsv2_solve(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, n, nnzA, &alpha, descr_L, valA, csrRowPtrA, csrColIndA, info_L, b, d_z, CUSPARSE_SOLVE_POLICY_NO_LEVEL, pBuffer);
    CUSPARSE_CHECK(status);

    // Z = U * X
    double* d_x;
    cudaMalloc((void**)&d_x, n * sizeof(double));
    cudaCheckErrors("cudaMalloc fail");
    status = cusparseDcsrsv2_solve(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, n, nnzA, &alpha, descr_U, valA, csrRowPtrA, csrColIndA, info_U, d_z, d_x, CUSPARSE_SOLVE_POLICY_USE_LEVEL, pBuffer);
    CUSPARSE_CHECK(status);

    // Ax = A * x
    const double beta = 0.0;
    double* d_Ax;
    cudaMalloc((void**)&d_Ax, n * sizeof(double));
    cudaCheckErrors("cudaMalloc fail");
    cusparseSpMatDescr_t matA;
    cusparseDnVecDescr_t vecX, vecAx;
    void*   dBuffer    = NULL;
    size_t  bufferSize = 0;
    status = cusparseCreateCsr(&matA, n, n, nnzA, csrRowPtrA, csrColIndA, valA, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, CUDA_R_64F);
    CUSPARSE_CHECK(status);
    status = cusparseCreateDnVec(&vecX, n, d_x, CUDA_R_64F);
    CUSPARSE_CHECK(status);
    status = cusparseCreateDnVec(&vecAx, n, d_Ax, CUDA_R_64F);
    CUSPARSE_CHECK(status);
    cusparseSpMV_bufferSize(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, matA, vecX, &beta, vecAx, CUDA_R_64F, CUSPARSE_MV_ALG_DEFAULT, &bufferSize);
    CUSPARSE_CHECK(status);
    cudaMalloc(&dBuffer, bufferSize);
    cudaCheckErrors("cudaMalloc fail");
    cusparseSpMV(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, matA, vecX, &beta, vecAx, CUDA_R_64F, CUSPARSE_MV_ALG_DEFAULT, dBuffer);
    CUSPARSE_CHECK(status);

    // Return the data from GPU to CPU
   double* h_x = (double*)malloc(n * sizeof(double));
   cudaMemcpy(h_x, d_x, n * sizeof(double), cudaMemcpyDeviceToHost);
   cudaCheckErrors("cudaMemcpy fail");
   printf("Final result\n");
   for (int k = 0; k < n; k++)
   {
       printf("x[%i] = %f\n", k, h_x[k]);
   }
   double* h_Ax = (double*)malloc(n * sizeof(double));
   cudaMemcpy(h_Ax, d_Ax, n * sizeof(double), cudaMemcpyDeviceToHost);
   cudaCheckErrors("cudaMalloc fail");
   printf("Relative error analysis\n");
   for (int k = 0; k < n; k++)
   {
       printf("h_Ax[%i] = %f\n", k, h_Ax[k]);
   }



    if (h_valA)free(h_valA);
    if (h_csrRowPtrA)free(h_csrRowPtrA);
    if (h_csrColIndA)free(h_csrColIndA);
    if (h_x)free(h_x);
    if (valA)cudaFree(valA);
    if (csrRowPtrA)cudaFree(csrRowPtrA);
    if (csrColIndA)cudaFree(csrColIndA);
    if (d_z)cudaFree(d_z);
    if (d_x)cudaFree(d_x);
    if (dBuffer)cudaFree(dBuffer);
    if (d_Ax)cudaFree(d_Ax);
    CUSPARSE_CHECK(cusparseDestroy(handle));
    CUSPARSE_CHECK(cusparseDestroyMatDescr(descrA));
    CUSPARSE_CHECK(cusparseDestroyMatDescr(descr_L));
    CUSPARSE_CHECK(cusparseDestroyMatDescr(descr_U));
    CUSPARSE_CHECK(cusparseDestroyCsrilu02Info(info_A));
    CUSPARSE_CHECK(cusparseDestroyCsrsv2Info(info_L));
    CUSPARSE_CHECK(cusparseDestroyCsrsv2Info(info_U));
    CUSPARSE_CHECK(cusparseDestroySpMat(matA));
    CUSPARSE_CHECK(cusparseDestroyDnVec(vecX));
    CUSPARSE_CHECK(cusparseDestroyDnVec(vecAx));

    return 0;
}


本帖子中包含更多资源

您需要 登录 才可以下载或查看,没有帐号?立即注册

x
使用道具 举报 回复
发新帖
您需要登录后才可以回帖 登录 | 立即注册