Actual source code: fnutilcuda.cu

slepc-3.18.1 2022-11-02
Report Typos and Errors
  1: /*
  2:    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  3:    SLEPc - Scalable Library for Eigenvalue Problem Computations
  4:    Copyright (c) 2002-, Universitat Politecnica de Valencia, Spain

  6:    This file is part of SLEPc.
  7:    SLEPc is distributed under a 2-clause BSD license (see LICENSE).
  8:    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  9: */
 10: /*
 11:    Utility subroutines common to several impls
 12: */

 14: #include "fnutilcuda.h"

 16: __global__ void set_diagonal_kernel(PetscInt n,PetscScalar *d_pa,PetscInt ld,PetscScalar v,PetscInt xcount)
 17: {
 18:   PetscInt x;
 19:   x = xcount*gridDim.x*blockDim.x+blockIdx.x*blockDim.x+threadIdx.x;

 21:   if (x<n) {
 22:     d_pa[x+x*ld] = v;
 23:   }
 24: }

 26: __host__ PetscErrorCode set_diagonal(PetscInt n,PetscScalar *d_pa,PetscInt ld,PetscScalar v)
 27: {
 28:   PetscInt    i,dimGrid_xcount;
 29:   dim3        blocks3d,threads3d;

 31:   SlepcKernelSetGrid1D(n,&blocks3d,&threads3d,&dimGrid_xcount);
 32:   for (i=0;i<dimGrid_xcount;i++) {
 33:     set_diagonal_kernel<<<blocks3d, threads3d>>>(n,d_pa,ld,v,i);
 34:     cudaGetLastError();
 35:   }
 36:   return 0;
 37: }

 39: __global__ void set_Cdiagonal_kernel(PetscInt n,PetscComplex *d_pa,PetscInt ld,PetscReal vr,PetscReal vi,PetscInt xcount)
 40: {
 41:   PetscInt x;
 42:   x = xcount*gridDim.x*blockDim.x+blockIdx.x*blockDim.x+threadIdx.x;

 44:   if (x<n) {
 45:     d_pa[x+x*ld] = thrust::complex<PetscReal>(vr, vi);
 46:   }
 47: }

 49: __host__ PetscErrorCode set_Cdiagonal(PetscInt n,PetscComplex *d_pa,PetscInt ld,PetscReal vr,PetscReal vi)
 50: {
 51:   PetscInt    i,dimGrid_xcount;
 52:   dim3        blocks3d,threads3d;

 54:   SlepcKernelSetGrid1D(n,&blocks3d,&threads3d,&dimGrid_xcount);
 55:   for (i=0;i<dimGrid_xcount;i++) {
 56:     set_Cdiagonal_kernel<<<blocks3d, threads3d>>>(n,d_pa,ld,vr,vi,i);
 57:     cudaGetLastError();
 58:   }
 59:   return 0;
 60: }

 62: __global__ void shift_diagonal_kernel(PetscInt n,PetscScalar *d_pa,PetscInt ld,PetscScalar v,PetscInt xcount)
 63: {
 64:   PetscInt x;
 65:   x = xcount*gridDim.x*blockDim.x+blockIdx.x*blockDim.x+threadIdx.x;

 67:   if (x<n) {
 68:     d_pa[x+x*ld] += v;
 69:   }
 70: }

 72: __host__ PetscErrorCode shift_diagonal(PetscInt n,PetscScalar *d_pa,PetscInt ld,PetscScalar v)
 73: {
 74:   PetscInt    i,dimGrid_xcount;
 75:   dim3        blocks3d,threads3d;

 77:   SlepcKernelSetGrid1D(n,&blocks3d,&threads3d,&dimGrid_xcount);
 78:   for (i=0;i<dimGrid_xcount;i++) {
 79:     shift_diagonal_kernel<<<blocks3d, threads3d>>>(n,d_pa,ld,v,i);
 80:     cudaGetLastError();
 81:   }
 82:   return 0;
 83: }

 85: __global__ void shift_Cdiagonal_kernel(PetscInt n,PetscComplex *d_pa,PetscInt ld,PetscReal vr,PetscReal vi,PetscInt xcount)
 86: {
 87:   PetscInt x;
 88:   x = xcount*gridDim.x*blockDim.x+blockIdx.x*blockDim.x+threadIdx.x;

 90:   if (x<n) {
 91:     d_pa[x+x*ld] += thrust::complex<PetscReal>(vr, vi);
 92:   }
 93: }

 95: __host__ PetscErrorCode shift_Cdiagonal(PetscInt n,PetscComplex *d_pa,PetscInt ld,PetscReal vr,PetscReal vi)
 96: {
 97:   PetscInt    i,dimGrid_xcount;
 98:   dim3        blocks3d,threads3d;

100:   SlepcKernelSetGrid1D(n,&blocks3d,&threads3d,&dimGrid_xcount);
101:   for (i=0;i<dimGrid_xcount;i++) {
102:     shift_Cdiagonal_kernel<<<blocks3d, threads3d>>>(n,d_pa,ld,vr,vi,i);
103:     cudaGetLastError();
104:   }
105:   return 0;
106: }

/*
   Kernel: copy the m x n array of real scalars d_pb (leading dimension
   ldb) into the complex array d_pa (leading dimension lda), promoting
   each entry via the implicit PetscScalar -> PetscComplex conversion.
   xcount/ycount are tile indices supplied by the host wrapper so that
   arrays larger than the maximum grid size are handled in several
   launches.

   NOTE(review): the inner loops advance by blockDim.x/blockDim.y rather
   than gridDim*blockDim (a grid-stride); with more than one block per
   tile, threads of different blocks cover overlapping indices.  The
   assignment is idempotent, so the result is still correct -- confirm the
   stride is intentional (matches the other 2D kernels in this file).
*/
__global__ void copy_array2D_S2C_kernel(PetscInt m,PetscInt n,PetscComplex *d_pa,PetscInt lda,PetscScalar *d_pb,PetscInt ldb,PetscInt xcount,PetscInt ycount)
{
  PetscInt x,y,i,j;

  /* starting global indices for this thread within tile (xcount,ycount) */
  x = xcount*gridDim.x*blockDim.x+blockIdx.x*blockDim.x+threadIdx.x;
  y = ycount*gridDim.y*blockDim.y+blockIdx.y*blockDim.y+threadIdx.y;
  for (j=y;j<n;j+=blockDim.y) {
    for (i=x;i<m;i+=blockDim.x) {
      d_pa[i+j*lda] = d_pb[i+j*ldb];   /* real -> complex promotion */
    }
  }
}

/*
   copy_array2D_S2C - copy an m x n real device array into a complex
   device array (real-to-complex promotion).

   Input Parameters:
     m,n  - array dimensions
     d_pa - destination complex device array, leading dimension lda
     d_pb - source real device array, leading dimension ldb

   The kernel is launched once per 2D tile (dimGrid_xcount x
   dimGrid_ycount) for arrays exceeding the maximum grid size.  Launch
   errors are now propagated instead of being silently discarded.
*/
__host__ PetscErrorCode copy_array2D_S2C(PetscInt m,PetscInt n,PetscComplex *d_pa,PetscInt lda,PetscScalar *d_pb,PetscInt ldb)
{
  PetscInt    i,j,dimGrid_xcount,dimGrid_ycount;
  dim3        blocks3d,threads3d;
  cudaError_t cerr;

  SlepcKernelSetGrid2DTiles(m,n,&blocks3d,&threads3d,&dimGrid_xcount,&dimGrid_ycount);
  for (i=0;i<dimGrid_xcount;i++) {
    for (j=0;j<dimGrid_ycount;j++) {
      copy_array2D_S2C_kernel<<<blocks3d,threads3d>>>(m,n,d_pa,lda,d_pb,ldb,i,j);
      cerr = cudaGetLastError();              /* catches bad launch configuration */
      if (cerr != cudaSuccess) return PETSC_ERR_GPU;
    }
  }
  return 0;
}

/*
   Kernel: copy the real parts of the m x n complex array d_pb (leading
   dimension ldb) into the real array d_pa (leading dimension lda);
   imaginary parts are dropped via PetscRealPartComplex.  xcount/ycount
   are tile indices supplied by the host wrapper for arrays larger than
   the maximum grid size.

   NOTE(review): the inner loops stride by blockDim.x/blockDim.y, not
   gridDim*blockDim; with more than one block per tile different blocks
   revisit the same indices.  The assignment is idempotent so the result
   is unaffected -- confirm intentional (same pattern as the S2C kernel).
*/
__global__ void copy_array2D_C2S_kernel(PetscInt m,PetscInt n,PetscScalar *d_pa,PetscInt lda,PetscComplex *d_pb,PetscInt ldb,PetscInt xcount,PetscInt ycount)
{
  PetscInt x,y,i,j;

  /* starting global indices for this thread within tile (xcount,ycount) */
  x = xcount*gridDim.x*blockDim.x+blockIdx.x*blockDim.x+threadIdx.x;
  y = ycount*gridDim.y*blockDim.y+blockIdx.y*blockDim.y+threadIdx.y;
  for (j=y;j<n;j+=blockDim.y) {
    for (i=x;i<m;i+=blockDim.x) {
      d_pa[i+j*lda] = PetscRealPartComplex(d_pb[i+j*ldb]);   /* discard imaginary part */
    }
  }
}

/*
   copy_array2D_C2S - copy the real parts of an m x n complex device
   array into a real device array (imaginary parts are discarded).

   Input Parameters:
     m,n  - array dimensions
     d_pa - destination real device array, leading dimension lda
     d_pb - source complex device array, leading dimension ldb

   The kernel is launched once per 2D tile (dimGrid_xcount x
   dimGrid_ycount) for arrays exceeding the maximum grid size.  Launch
   errors are now propagated instead of being silently discarded.
*/
__host__ PetscErrorCode copy_array2D_C2S(PetscInt m,PetscInt n,PetscScalar *d_pa,PetscInt lda,PetscComplex *d_pb,PetscInt ldb)
{
  PetscInt    i,j,dimGrid_xcount,dimGrid_ycount;
  dim3        blocks3d,threads3d;
  cudaError_t cerr;

  SlepcKernelSetGrid2DTiles(m,n,&blocks3d,&threads3d,&dimGrid_xcount,&dimGrid_ycount);
  for (i=0;i<dimGrid_xcount;i++) {
    for (j=0;j<dimGrid_ycount;j++) {
      copy_array2D_C2S_kernel<<<blocks3d,threads3d>>>(m,n,d_pa,lda,d_pb,ldb,i,j);
      cerr = cudaGetLastError();              /* catches bad launch configuration */
      if (cerr != cudaSuccess) return PETSC_ERR_GPU;
    }
  }
  return 0;
}

/*
   Kernel: update each entry a of the m x n complex array d_pa (leading
   dimension lda) as a += conj(a), i.e. each entry becomes 2*Re(a) (a
   purely real value).  xcount/ycount are tile indices supplied by the
   host wrapper for arrays larger than the maximum grid size.

   NOTE(review): unlike the plain-copy kernels in this file, this update
   is NOT idempotent.  The loops stride by blockDim.x/blockDim.y rather
   than gridDim*blockDim, so if a tile is launched with more than one
   block per dimension, different blocks would revisit the same entry and
   apply the conjugate-add more than once (a data race with a wrong
   result).  Presumably SlepcKernelSetGrid2DTiles guarantees each index
   is covered exactly once -- TODO confirm against its implementation.
*/
__global__ void add_array2D_Conj_kernel(PetscInt m,PetscInt n,PetscComplex *d_pa,PetscInt lda,PetscInt xcount,PetscInt ycount)
{
  PetscInt x,y,i,j;

  /* starting global indices for this thread within tile (xcount,ycount) */
  x = xcount*gridDim.x*blockDim.x+blockIdx.x*blockDim.x+threadIdx.x;
  y = ycount*gridDim.y*blockDim.y+blockIdx.y*blockDim.y+threadIdx.y;
  for (j=y;j<n;j+=blockDim.y) {
    for (i=x;i<m;i+=blockDim.x) {
      d_pa[i+j*lda] += PetscConj(d_pa[i+j*lda]);   /* a + conj(a) = 2*Re(a) */
    }
  }
}

/*
   add_array2D_Conj - update each entry a of an m x n complex device
   array as a += conj(a), leaving 2*Re(a) in every position.

   Input Parameters:
     m,n  - array dimensions
     d_pa - complex device array, leading dimension lda

   The kernel is launched once per 2D tile (dimGrid_xcount x
   dimGrid_ycount) for arrays exceeding the maximum grid size.  Launch
   errors are now propagated instead of being silently discarded.
*/
__host__ PetscErrorCode add_array2D_Conj(PetscInt m,PetscInt n,PetscComplex *d_pa,PetscInt lda)
{
  PetscInt    i,j,dimGrid_xcount,dimGrid_ycount;
  dim3        blocks3d,threads3d;
  cudaError_t cerr;

  SlepcKernelSetGrid2DTiles(m,n,&blocks3d,&threads3d,&dimGrid_xcount,&dimGrid_ycount);
  for (i=0;i<dimGrid_xcount;i++) {
    for (j=0;j<dimGrid_ycount;j++) {
      add_array2D_Conj_kernel<<<blocks3d,threads3d>>>(m,n,d_pa,lda,i,j);
      cerr = cudaGetLastError();              /* catches bad launch configuration */
      if (cerr != cudaSuccess) return PETSC_ERR_GPU;
    }
  }
  return 0;
}

/*
   Kernel: scan the m x n complex array d_pa (leading dimension lda) and
   lower the device flag *d_result to PETSC_FALSE if any entry has a
   nonzero imaginary part.  The flag must be initialized to PETSC_TRUE by
   the caller; kernels only ever clear it, never set it.  xcount/ycount
   are tile indices supplied by the host wrapper.

   Concurrency: many threads may write PETSC_FALSE to *d_result
   simultaneously -- all writers store the same value, so the race only
   affects which thread wins, not the outcome.  The initial read of
   *d_result is an early-exit optimization: once some thread has cleared
   the flag, later blocks may skip the scan entirely (they are not
   guaranteed to observe the cleared value, which is harmless).
*/
__global__ void getisreal_array2D_kernel(PetscInt m,PetscInt n,PetscComplex *d_pa,PetscInt lda,PetscBool *d_result,PetscInt xcount,PetscInt ycount)
{
  PetscInt x,y,i,j;

  /* starting global indices for this thread within tile (xcount,ycount) */
  x = xcount*gridDim.x*blockDim.x+blockIdx.x*blockDim.x+threadIdx.x;
  y = ycount*gridDim.y*blockDim.y+blockIdx.y*blockDim.y+threadIdx.y;
  if (*d_result) {
    for (j=y;j<n;j+=blockDim.y) {
      for (i=x;i<m;i+=blockDim.x) {
        if (PetscImaginaryPartComplex(d_pa[i+j*lda])) *d_result=PETSC_FALSE;
      }
    }
  }
}

/*
   getisreal_array2D - determine whether every entry of an m x n complex
   device array has zero imaginary part.

   Input Parameters:
     m,n      - array dimensions
     d_pa     - complex device array, leading dimension lda
     d_result - device pointer to a PetscBool; set to PETSC_TRUE here and
                cleared to PETSC_FALSE by the kernel if any entry is not
                real.  The result is left on the device: the caller must
                copy it back to the host to inspect it.

   CUDA API and launch errors are now propagated instead of being
   silently discarded.
*/
__host__ PetscErrorCode getisreal_array2D(PetscInt m,PetscInt n,PetscComplex *d_pa,PetscInt lda,PetscBool *d_result)
{
  PetscInt    i,j,dimGrid_xcount,dimGrid_ycount;
  PetscBool   result=PETSC_TRUE;
  dim3        blocks3d,threads3d;
  cudaError_t cerr;

  /* initialize the device flag; kernels only ever lower it to FALSE */
  cerr = cudaMemcpy(d_result,&result,sizeof(PetscBool),cudaMemcpyHostToDevice);
  if (cerr != cudaSuccess) return PETSC_ERR_GPU;
  SlepcKernelSetGrid2DTiles(m,n,&blocks3d,&threads3d,&dimGrid_xcount,&dimGrid_ycount);
  for (i=0;i<dimGrid_xcount;i++) {
    for (j=0;j<dimGrid_ycount;j++) {
      getisreal_array2D_kernel<<<blocks3d,threads3d>>>(m,n,d_pa,lda,d_result,i,j);
      cerr = cudaGetLastError();              /* catches bad launch configuration */
      if (cerr != cudaSuccess) return PETSC_ERR_GPU;
    }
  }
  return 0;
}

/*
   Kernel: per-block partial product of the diagonal entries of an n x n
   matrix d_pa (leading dimension ld).  Each block reduces the diagonal
   entries assigned to its threads in shared memory and writes the result
   to d_v[blockIdx.x]; the host combines the per-block partials.  xcount
   is the launch-tile index for sizes exceeding the 1D grid limit.

   Assumptions (to be guaranteed by SlepcKernelSetGrid1D -- TODO confirm):
   blockDim.x <= SLEPC_BLOCK_SIZE_X (shared-array bound) and blockDim.x
   is a power of two, otherwise the halving reduction below skips or
   double-counts entries.
*/
__global__ void mult_diagonal_kernel(PetscInt n,PetscScalar *d_pa,PetscInt ld,PetscScalar *d_v,PetscInt xcount)
{
  PetscInt               x;
  unsigned int           bs=blockDim.x;
  __shared__ PetscScalar shrdres[SLEPC_BLOCK_SIZE_X];

  x = xcount*gridDim.x*blockDim.x+blockIdx.x*blockDim.x+threadIdx.x;
  /* out-of-range lanes load 1.0, the multiplicative identity, so they do
     not perturb the product */
  shrdres[threadIdx.x] = (x<n)? d_pa[x+ld*x]: 1.0;
  __syncthreads();

  /* reduction */
  /* classic shared-memory tree reduction: active thread count halves each
     step; every __syncthreads() is outside the conditional so all threads
     of the block reach it (no divergent barrier) */
  if ((bs >= 512) && (threadIdx.x < 256)) { shrdres[threadIdx.x] *= shrdres[threadIdx.x + 256]; } __syncthreads();
  if ((bs >= 256) && (threadIdx.x < 128)) { shrdres[threadIdx.x] *= shrdres[threadIdx.x + 128]; } __syncthreads();
  if ((bs >= 128) && (threadIdx.x <  64)) { shrdres[threadIdx.x] *= shrdres[threadIdx.x +  64]; } __syncthreads();
  if ((bs >=  64) && (threadIdx.x <  32)) { shrdres[threadIdx.x] *= shrdres[threadIdx.x +  32]; } __syncthreads();
  if ((bs >=  32) && (threadIdx.x <  16)) { shrdres[threadIdx.x] *= shrdres[threadIdx.x +  16]; } __syncthreads();
  if ((bs >=  16) && (threadIdx.x <   8)) { shrdres[threadIdx.x] *= shrdres[threadIdx.x +   8]; } __syncthreads();
  if ((bs >=   8) && (threadIdx.x <   4)) { shrdres[threadIdx.x] *= shrdres[threadIdx.x +   4]; } __syncthreads();
  if ((bs >=   4) && (threadIdx.x <   2)) { shrdres[threadIdx.x] *= shrdres[threadIdx.x +   2]; } __syncthreads();
  if ((bs >=   2) && (threadIdx.x <   1)) { shrdres[threadIdx.x] *= shrdres[threadIdx.x +   1]; } __syncthreads();
  /* thread 0 publishes this block's partial product */
  if (threadIdx.x == 0) d_v[blockIdx.x] = shrdres[threadIdx.x];
}

/*
   mult_diagonal - compute the product of the diagonal entries of an
   n x n device matrix.

   Input Parameters:
     n    - matrix dimension
     d_pa - device pointer to the matrix
     ld   - leading dimension of d_pa

   Output Parameter:
     v    - host pointer receiving the product of all diagonal entries

   Each launch produces one partial product per block in d_part; the host
   copies them back after every tile and folds them into *v.  All CUDA
   and PETSc allocation/copy results are now checked, and both buffers
   are released on every error path (previously errors were silently
   discarded).
*/
__host__ PetscErrorCode mult_diagonal(PetscInt n,PetscScalar *d_pa,PetscInt ld,PetscScalar *v)
{
  PetscInt       i,j,dimGrid_xcount;
  PetscScalar    *part=NULL,*d_part=NULL;
  dim3           blocks3d,threads3d;
  cudaError_t    cerr;
  PetscErrorCode ierr;

  SlepcKernelSetGrid1D(n,&blocks3d,&threads3d,&dimGrid_xcount);
  cerr = cudaMalloc((void **)&d_part,sizeof(PetscScalar)*blocks3d.x);
  if (cerr != cudaSuccess) return PETSC_ERR_GPU;
  ierr = PetscMalloc1(blocks3d.x,&part);
  if (ierr) { cudaFree(d_part); return ierr; }
  for (i=0;i<dimGrid_xcount;i++) {
    mult_diagonal_kernel<<<blocks3d,threads3d>>>(n,d_pa,ld,d_part,i);
    cerr = cudaGetLastError();                /* catches bad launch configuration */
    /* blocking copy of the per-block partial products; also surfaces
       asynchronous kernel execution errors */
    if (cerr == cudaSuccess) cerr = cudaMemcpy(part,d_part,blocks3d.x*sizeof(PetscScalar),cudaMemcpyDeviceToHost);
    if (cerr != cudaSuccess) { cudaFree(d_part); PetscFree(part); return PETSC_ERR_GPU; }
    if (i == 0) {
      *v = part[0];   /* seed the running product with the first partial */
      j = 1;
    } else j = 0;
    for (; j<(int)blocks3d.x; j++) *v *= part[j];
  }
  cerr = cudaFree(d_part);
  ierr = PetscFree(part);
  if (cerr != cudaSuccess) return PETSC_ERR_GPU;
  return ierr;
}