accelerInt/complexInverse_8cu_source.html

 #include "header.cuh"
 #include "solver_props.cuh"
 #include <cuComplex.h>


 /**
  * Finds the position of the first element of largest absolute value.
  *
  * Elements are read through the INDEX macro, i.e. complexArr[INDEX(i)]
  * for i in [0, n).
  *
  * @param n           number of elements to inspect
  * @param complexArr  vector of complex values
  * @return index of the first element whose cuCabs magnitude is maximal
  *         (0 when n == 1)
  */
 __device__
 int getComplexMax (const int n, const cuDoubleComplex * __restrict__ complexArr) {

     // a single-element vector trivially has its maximum at position 0
     if (n == 1) {
         return 0;
     }

     int bestIdx = 0;
     double bestMag = cuCabs(complexArr[INDEX(0)]);

     int k = 1;
     while (k < n) {
         const double mag = cuCabs(complexArr[INDEX(k)]);
         // strict comparison keeps the earliest index on ties
         if (mag > bestMag) {
             bestMag = mag;
             bestIdx = k;
         }
         ++k;
     }

     return bestIdx;
 }


 /**
  * Multiplies every element of a vector by a complex constant, in place.
  *
  * @param n     number of elements to scale
  * @param val   complex factor applied to each element
  * @param arrX  vector to scale; element i is accessed as arrX[INDEX(i)]
  */
 __device__
 void scaleComplex (const int n, const cuDoubleComplex val, cuDoubleComplex* __restrict__ arrX) {

     int k = 0;
     while (k < n) {
         cuDoubleComplex* const elem = &arrX[INDEX(k)];
         *elem = cuCmul(*elem, val);
         ++k;
     }

 }


 /*
 __device__
 void swapComplex (const int n, cuDoubleComplex* __restrict__ arrX, const int incX,
     cuDoubleComplex* __restrict__ arrY, const int incY) {

     int ix = 0;
     int iy = 0;

     for (int i = 0; i < n; ++i) {
         cuDoubleComplex temp = arrX[INDEX(ix)];
         arrX[INDEX(ix)] = arrY[INDEX(iy)];
         arrY[INDEX(iy)] = temp;
         ix += incX;
         iy += incY;
     }

 }*/


 /**
  * Rank-1 update A := alpha * x * y^T + A on an n x n block.
  *
  * Element (i, j) of the block is accessed as A[INDEX(i + lda * j)];
  * x is read with unit increment and y with increment incY.
  *
  * @param n      number of rows and columns updated
  * @param alpha  scalar multiplier
  * @param arrX   vector x, element i at arrX[INDEX(i)]
  * @param arrY   vector y, element j at arrY[INDEX(j * incY)]
  * @param incY   increment between consecutive elements of y
  * @param A      matrix block updated in place
  * @param lda    leading dimension used to step between columns of A
  */
 __device__
 void complexGERU (const int n, const cuDoubleComplex alpha, const cuDoubleComplex* arrX,
                                     const cuDoubleComplex* arrY, const int incY, cuDoubleComplex* A, const int lda) {

     for (int col = 0; col < n; ++col) {

         const cuDoubleComplex yVal = arrY[INDEX(col * incY)];

         // a zero entry of y leaves the whole column of A unchanged
         if (!(cuCabs(yVal) > 0.0)) {
             continue;
         }

         const cuDoubleComplex scaled = cuCmul(alpha, yVal);
         const int colOffset = lda * col;

         for (int row = 0; row < n; ++row) {
             A[INDEX(row + colOffset)] = cuCfma(arrX[INDEX(row)], scaled, A[INDEX(row + colOffset)]);
         }
     }

 }


 /**
  * In-place product x := A * x using only the upper triangle (diagonal
  * included) of the leading n x n part of A.
  *
  * Element (i, j) of A is accessed as A[INDEX(i + lda * j)].
  *
  * @param n    order of the triangular block and length of x
  * @param x    vector overwritten with the product, element i at x[INDEX(i)]
  * @param lda  leading dimension used to step between columns of A
  * @param A    matrix whose upper triangle is read
  */
 __device__
 void multiplyComplexUpperMV (const int n, cuDoubleComplex* x, const int lda, const cuDoubleComplex* A) {

     for (int col = 0; col < n; ++col) {

         const cuDoubleComplex xCol = x[INDEX(col)];

         // zero entries of x contribute nothing and stay zero
         if (!(cuCabs(xCol) > 0.0)) {
             continue;
         }

         const int colOffset = lda * col;

         // x[row] += x[col] * A(row, col) for the strictly upper part
         for (int row = 0; row < col; ++row) {
             x[INDEX(row)] = cuCfma(xCol, A[INDEX(row + colOffset)], x[INDEX(row)]);
         }

         // x[col] *= A(col, col)
         x[INDEX(col)] = cuCmul(x[INDEX(col)], A[INDEX(col + colOffset)]);
     }

 }


 /**
  * Accumulates y := alpha * A * x + y (beta is implicitly 1, so y is never
  * pre-scaled).
  *
  * Element (i, j) of A is accessed as A[INDEX(i + lda * j)].
  *
  * NOTE: the column loop runs over j in [0, n - 1), so only the first
  * n - 1 columns of A and entries of arrX are used; the caller passes a
  * value of n that is one larger than the number of columns consumed.
  *
  * @param m      number of rows of A / elements of arrY updated
  * @param n      one more than the number of columns of A that are used
  * @param lda    leading dimension used to step between columns of A
  * @param alpha  scalar multiplier
  * @param A      input matrix
  * @param arrX   input vector x, element j at arrX[INDEX(j)]
  * @param arrY   vector y updated in place, element i at arrY[INDEX(i)]
  */
 __device__
 void complexGEMV (const int m, const int n, const int lda, const cuDoubleComplex alpha, const cuDoubleComplex* A,
                                     const cuDoubleComplex* arrX, cuDoubleComplex* arrY) {

     const int lastCol = n - 1;

     for (int col = 0; col < lastCol; ++col) {

         const cuDoubleComplex xVal = arrX[INDEX(col)];

         // a zero entry of x contributes nothing to y
         if (!(cuCabs(xVal) > 0.0)) {
             continue;
         }

         const cuDoubleComplex scaled = cuCmul(alpha, xVal);
         const int colOffset = lda * col;

         // y[row] += (alpha * x[col]) * A(row, col)
         for (int row = 0; row < m; ++row) {
             arrY[INDEX(row)] = cuCfma(scaled, A[INDEX(row + colOffset)], arrY[INDEX(row)]);
         }
     }

 }


 /**
  * Computes, in place, the LU factorization of the leading n x n part of A
  * using partial pivoting with row interchanges.
  *
  * Element (row, col) of A is accessed as A[INDEX(row + STRIDE * col)].
  *
  * @param n         order of the matrix being factored
  * @param A         matrix overwritten with its L (below the diagonal) and
  *                  U (diagonal and above) factors
  * @param indPivot  indPivot[INDEX(j)] receives the row that was
  *                  interchanged with row j
  * @param info      status flag, read and written. It must be 0 on entry for
  *                  a singularity to be reported. On the first exactly-zero
  *                  pivot the factorization stops and *info is set to the
  *                  ONE-BASED column index (j + 1), following the LAPACK
  *                  convention, so that a zero pivot in the very first column
  *                  cannot be confused with success (*info == 0). It is not
  *                  modified when the factorization succeeds.
  */
 __device__
 void getComplexLU (const int n, cuDoubleComplex* __restrict__ A,
                     int* __restrict__ indPivot, int* __restrict__ info) {

     for (int j = 0; j < n; ++j) {

         // find pivot and test for singularity

         int jp = j + getComplexMax (n - j, &A[GRID_DIM * (j + (STRIDE * j))]);
         indPivot[INDEX(j)] = jp;

         if (cuCabs(A[INDEX(jp + (STRIDE * j))]) > 0.0) {

             // interchange rows j and jp across all n columns
             if (jp != j)
             {
                 for (int i = 0; i < n; ++i) {
                     cuDoubleComplex temp = A[INDEX(STRIDE * i + j)];
                     A[INDEX(STRIDE * i + j)] = A[INDEX(STRIDE * i + jp)];
                     A[INDEX(STRIDE * i + jp)] = temp;
                 }
             }

             // compute elements j+1:n-1 of the jth column (divide by the pivot)

             if (j < STRIDE - 1)
                 scaleComplex (n - j - 1, cuCdiv(make_cuDoubleComplex(1.0, 0.0), A[INDEX(j + (STRIDE * j))]), &A[GRID_DIM * (j + 1 + (STRIDE * j))]);

         } else if (*info == 0) {
             // Report the singular column one-based: with a zero-based value a
             // zero pivot at j == 0 would leave *info == 0, i.e. "success".
             *info = j + 1;
             break;
         }

         // update trailing submatrix
         if (j < n - 1)
             complexGERU (n - j - 1, make_cuDoubleComplex(-1.0, 0.0), &A[GRID_DIM * (j + 1 + (STRIDE * j))], &A[GRID_DIM * (j + STRIDE * (j + 1))], STRIDE, &A[GRID_DIM * (j + 1 + STRIDE * (j + 1))], STRIDE);

     }
 }

 /**
  * Computes, in place, the inverse of a matrix from its LU factorization.
  *
  * Element (row, col) of A is accessed as A[INDEX(row + STRIDE * col)].
  *
  * @param n         order of the matrix
  * @param A         on entry the L and U factors; on successful exit the
  *                  inverse of the original matrix
  * @param indPivot  pivot indices produced by the factorization; row j was
  *                  interchanged with row indPivot[INDEX(j)]
  * @param work      scratch vector; entries work[INDEX(1)] .. work[INDEX(n-1)]
  *                  are used
  * @return 0 on success. If the diagonal entry U(j, j) is exactly zero the
  *         routine stops and returns the ONE-BASED index j + 1 (LAPACK
  *         convention), so that a zero at U(0, 0) is not reported as 0,
  *         i.e. as success.
  */
 __device__
 int getComplexInverseLU (const int n, cuDoubleComplex* __restrict__ A,
                             const int* __restrict__ indPivot,
                             cuDoubleComplex* __restrict__ work) {

     // form inv(U)
     for (int j = 0; j < n; ++j) {
         // A zero-based return value would make a singular U(0,0) look like
         // success, hence j + 1.
         if (cuCabs(A[INDEX(j + (STRIDE * j))]) == 0)
             return j + 1;
         A[INDEX(j + (STRIDE * j))] = cuCdiv(make_cuDoubleComplex(1.0, 0.0), A[INDEX(j + (STRIDE * j))]);
         cuDoubleComplex Ajj = cuCmul(make_cuDoubleComplex(-1.0, 0.0), A[INDEX(j + (STRIDE * j))]);

         // compute elements 0:j-1 of jth column
         multiplyComplexUpperMV (j, &A[GRID_DIM * (STRIDE * j)], STRIDE, A);

         // scale by -1 / U(j, j)
         scaleComplex (j, Ajj, &A[GRID_DIM * (STRIDE * j)]);
     }

     // solve equation inv(A)*L = inv(U) for inv(A)

     for (int j = n - 1; j >= 0; --j) {

         // copy current column of L to work and replace with 0.0s
         for (int i = j + 1; i < n; ++i) {
             work[INDEX(i)] = A[INDEX(i + (STRIDE * j))];
             A[INDEX(i + (STRIDE * j))] = make_cuDoubleComplex(0.0, 0.0);
         }

         // compute current column of inv(A); complexGEMV consumes one column
         // fewer than its n argument, i.e. the n - j - 1 columns right of j
         if (j < n - 1)
             complexGEMV (n, n - j, STRIDE, make_cuDoubleComplex(-1.0, 0.0), &A[GRID_DIM * (STRIDE * (j + 1))], &work[GRID_DIM * (j + 1)], &A[GRID_DIM * (STRIDE * j)]);

     }

     // apply column interchanges, undoing the row pivots in reverse order

     for (int j = n - 2; j >= 0; --j) {

         int jp = indPivot[INDEX(j)];
         if (jp != j)
         {
             for (int i = 0; i < n; ++i) {
                 cuDoubleComplex temp = A[INDEX(STRIDE * j + i)];
                 A[INDEX(STRIDE * j + i)] = A[INDEX(STRIDE * jp + i)];
                 A[INDEX(STRIDE * jp + i)] = temp;
             }
         }
     }
     return 0;
 }

 /**
  * Inverts a general matrix in place: LU factorization via getComplexLU
  * followed by getComplexInverseLU.
  *
  * @param n     order of the matrix
  * @param A     matrix to invert, overwritten by the routines called
  * @param ipiv  pivot index storage filled by getComplexLU
  * @param info  status flag; if it is nonzero after the factorization the
  *              inversion step is skipped, otherwise it receives the return
  *              value of getComplexInverseLU
  * @param work  scratch vector forwarded to getComplexInverseLU
  */
 __device__
 void getComplexInverse (const int n, cuDoubleComplex* __restrict__ A,
                             int* __restrict__ ipiv, int* __restrict__ info,
                             cuDoubleComplex* __restrict__ work) {

     // step 1: factorize
     getComplexLU (n, A, ipiv, info);

     // step 2: invert, but only when the factorization left the flag clear
     if (*info == 0) {
         *info = getComplexInverseLU (n, A, ipiv, work);
     }
 }

 /**
  * Computes, in place, an LU factorization of an upper Hessenberg matrix,
  * choosing at each step between the diagonal and the sub-diagonal entry as
  * pivot.
  *
  * Element (row, col) of A is accessed as A[INDEX(col * STRIDE + row)].
  *
  * @param n         order of the matrix
  * @param A         matrix overwritten with its factors; the multiplier of
  *                  step i is stored at (i + 1, i)
  * @param indPivot  indPivot[INDEX(i)] is i + 1 when rows i and i + 1 were
  *                  interchanged at step i and i otherwise; the last entry is
  *                  always n - 1
  * @param info      set to the ONE-BASED step index (i + 1) at the first
  *                  exactly-zero pivot, after which the routine returns
  *                  immediately. One-based (LAPACK convention) so that a zero
  *                  pivot at step 0 is not reported as 0, i.e. as success.
  *                  Not written on success, so the caller must zero it first.
  */
 __device__
 void getHessenbergLU(const int n, cuDoubleComplex* A, int* __restrict__ indPivot, int* __restrict__ info)
 {
     // column index of the most recent step that did NOT interchange rows
     int last_pivot = 0;
     for (int i = 0; i < n - 1; i ++)
     {
         // pivot on whichever of A(i, i) and A(i + 1, i) is larger in magnitude
         if (cuCabs(A[INDEX(i * STRIDE + i)]) < cuCabs(A[INDEX(i * STRIDE + i + 1)]))
         {
             // swap rows i and i + 1, restricted to columns k >= last_pivot
             for(int k = 0; k < n; ++k)
             {
                 if (k >= last_pivot)
                 {
                     cuDoubleComplex temp = A[INDEX(k * STRIDE + i)];
                     A[INDEX(k * STRIDE + i)] = A[INDEX(k * STRIDE + i + 1)];
                     A[INDEX(k * STRIDE + i + 1)] = temp;
                 }
             }
             indPivot[INDEX(i)] = i + 1;
         }
         else
         {
             indPivot[INDEX(i)] = i;
             last_pivot = i;
         }
         if (cuCabs(A[INDEX(i * STRIDE + i)]) > 0.0)
         {
             // eliminate the sub-diagonal entry: row(i + 1) -= tau * row(i)
             cuDoubleComplex tau = cuCdiv(A[INDEX(i * STRIDE + i + 1)], A[INDEX(i * STRIDE + i)]);
             for (int j = i + 1; j < n; j++)
             {
                 A[INDEX(j * STRIDE + i + 1)] = cuCsub(A[INDEX(j * STRIDE + i + 1)], cuCmul(tau, A[INDEX(j * STRIDE + i)]));
             }
             // keep the multiplier in the slot that was just eliminated
             A[INDEX(i * STRIDE + i + 1)] = tau;
         }
         else
         {
             // Report one-based: a zero-based value would make a zero pivot at
             // i == 0 indistinguishable from success (*info == 0).
             *info = i + 1;
             return;
         }
     }
     //last index is not pivoted
     indPivot[INDEX(n - 1)] = n - 1;
 }

 /**
  * Inverts an upper Hessenberg matrix in place: LU factorization via
  * getHessenbergLU followed by getComplexInverseLU.
  *
  * @param n     order of the matrix
  * @param A     matrix to invert, overwritten by the routines called
  * @param ipiv  pivot index storage filled by getHessenbergLU
  * @param info  status flag; if it is nonzero after the factorization the
  *              inversion step is skipped, otherwise it receives the return
  *              value of getComplexInverseLU
  * @param work  scratch vector forwarded to getComplexInverseLU
  */
 __device__
 void getComplexInverseHessenberg (const int n, cuDoubleComplex* __restrict__ A,
                                     int* __restrict__ ipiv, int* __restrict__ info,
                                     cuDoubleComplex* __restrict__ work)
 {
     // step 1: factorize
     getHessenbergLU (n, A, ipiv, info);

     // step 2: invert, but only when the factorization left the flag clear
     if (*info == 0) {
         *info = getComplexInverseLU (n, A, ipiv, work);
     }
 }
getHessenbergLU
__device__ void getHessenbergLU(const int n, cuDoubleComplex *A, int *__restrict__ indPivot, int *__restrict__ info)
Computes the LU factorization of a (n x STRIDE) Hessenberg Matrix using partial pivoting with row int...
Definition: complexInverse.cu:367

scaleComplex
__device__ void scaleComplex(const int n, const cuDoubleComplex val, cuDoubleComplex *__restrict__ arrX)
scaleComplex scales a vector (with increment equal to one) by a constant val.
Definition: complexInverse.cu:49

GRID_DIM
#define GRID_DIM
The total number of threads in the Grid, provides an offset between vector entries.
Definition: gpu_macros.cuh:20

STRIDE
#define STRIDE
the matrix dimensions
Definition: radau2a_props.cuh:20

solver_props.cuh
simple convenience file to include the correct solver properties file

complexGEMV
__device__ void complexGEMV(const int m, const int n, const int lda, const cuDoubleComplex alpha, const cuDoubleComplex *A, const cuDoubleComplex *arrX, cuDoubleComplex *arrY)
Computes the matrix-vector operation $y \leftarrow \alpha A x + y$, where alpha is a scalar, x and y are vectors and A is an m by n...
Definition: complexInverse.cu:176

getComplexMax
__device__ int getComplexMax(const int n, const cuDoubleComplex *__restrict__ complexArr)
getComplexMax finds the index of the first element having maximum absolute value. ...
Definition: complexInverse.cu:22

header.cuh
An example header file that defines system size, memory functions and other required methods for inte...

getComplexInverseHessenberg
__device__ void getComplexInverseHessenberg(const int n, cuDoubleComplex *__restrict__ A, int *__restrict__ ipiv, int *__restrict__ info, cuDoubleComplex *__restrict__ work)
getComplexInverseHessenberg computes the inverse of an upper Hessenberg matrix A using a LU factoriza...
Definition: complexInverse.cu:421

getComplexLU
__device__ void getComplexLU(const int n, cuDoubleComplex *__restrict__ A, int *__restrict__ indPivot, int *__restrict__ info)
Computes the LU factorization of a (n x n) matrix using partial pivoting with row interchanges...
Definition: complexInverse.cu:216

complexGERU
__device__ void complexGERU(const int n, const cuDoubleComplex alpha, const cuDoubleComplex *arrX, const cuDoubleComplex *arrY, const int incY, cuDoubleComplex *A, const int lda)
complexGERU performs the rank 1 operation $A \leftarrow \alpha x y^{T} + A$, where alpha is a scalar, arrX and arrY are n element vecto...
Definition: complexInverse.cu:102

getComplexInverse
__device__ void getComplexInverse(const int n, cuDoubleComplex *__restrict__ A, int *__restrict__ ipiv, int *__restrict__ info, cuDoubleComplex *__restrict__ work)
getComplexInverse computes the inverse of a general matrix A using a LU factorization method ...
Definition: complexInverse.cu:332

multiplyComplexUpperMV
__device__ void multiplyComplexUpperMV(const int n, cuDoubleComplex *x, const int lda, const cuDoubleComplex *A)
Performs the matrix-vector operation $x \leftarrow A x$, where A is upper triangular.
Definition: complexInverse.cu:138

INDEX
#define INDEX(i)
Convenience macro to get the value of a vector at index i, calculated as i * GRID_DIM + T_ID...
Definition: gpu_macros.cuh:24

getComplexInverseLU
__device__ int getComplexInverseLU(const int n, cuDoubleComplex *__restrict__ A, const int *__restrict__ indPivot, cuDoubleComplex *__restrict__ work)
getComplexInverseLU computes the inverse of a matrix using the LU factorization computed by getHessen...
Definition: complexInverse.cu:270