Actual source code: bvorthogcuda.cu
 
   slepc-3.10.1 2018-10-23
   
  1: /*
  2:    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  3:    SLEPc - Scalable Library for Eigenvalue Problem Computations
  4:    Copyright (c) 2002-2018, Universitat Politecnica de Valencia, Spain
  6:    This file is part of SLEPc.
  7:    SLEPc is distributed under a 2-clause BSD license (see LICENSE).
  8:    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  9: */
 10: /*
 11:    BV orthogonalization routines (CUDA)
 12: */
 14: #include <slepc/private/bvimpl.h>          /*I   "slepcbv.h"   I*/
 15: #include <slepcblaslapack.h>
 17: /*
 18:    BV_CleanCoefficients_CUDA - Sets to zero all entries of column j of the bv buffer
 19: */
 20: PetscErrorCode BV_CleanCoefficients_CUDA(BV bv,PetscInt j,PetscScalar *h)
 21: {
 23:   PetscScalar    *d_hh,*d_a;
 24:   PetscInt       i;
 27:   if (!h) {
 28:     VecCUDAGetArrayReadWrite(bv->buffer,&d_a);
 29:     d_hh = d_a + j*(bv->nc+bv->m);
 30:     cudaMemset(d_hh,0,(bv->nc+j)*sizeof(PetscScalar));
 31:     WaitForGPU();CHKERRCUDA(ierr);
 32:     VecCUDARestoreArrayReadWrite(bv->buffer,&d_a);
 33:   } else { /* cpu memory */
 34:     for (i=0;i<bv->nc+j;i++) h[i] = 0.0;
 35:   }
 36:   return(0);
 37: }
 39: /*
 40:    BV_AddCoefficients_CUDA - Add the contents of the scratch (0-th column) of the bv buffer
 41:    into column j of the bv buffer
 42:  */
 43: PetscErrorCode BV_AddCoefficients_CUDA(BV bv,PetscInt j,PetscScalar *h,PetscScalar *c)
 44: {
 46:   PetscScalar    *d_h,*d_c,sone=1.0;
 47:   PetscInt       i;
 48:   PetscBLASInt   one=1;
 49:   cublasStatus_t cberr;
 50:   cublasHandle_t cublasv2handle;
 53:   if (!h) {
 54:     PetscCUBLASGetHandle(&cublasv2handle);
 55:     VecCUDAGetArrayReadWrite(bv->buffer,&d_c);
 56:     d_h = d_c + j*(bv->nc+bv->m);
 57:     cberr = cublasXaxpy(cublasv2handle,bv->nc+j,&sone,d_c,one,d_h,one);CHKERRCUBLAS(cberr);
 58:     WaitForGPU();CHKERRCUDA(ierr);
 59:     VecCUDARestoreArrayReadWrite(bv->buffer,&d_c);
 60:   } else { /* cpu memory */
 61:     for (i=0;i<bv->nc+j;i++) h[i] += c[i];
 62:   }
 63:   return(0);
 64: }
 66: /*
 67:    BV_SetValue_CUDA - Sets value in row j (counted after the constraints) of column k
 68:    of the coefficients array
 69: */
 70: PetscErrorCode BV_SetValue_CUDA(BV bv,PetscInt j,PetscInt k,PetscScalar *h,PetscScalar value)
 71: {
 73:   PetscScalar    *d_h,*a;
 74:   cudaError_t    cerr;
 77:   if (!h) {
 78:     VecCUDAGetArrayReadWrite(bv->buffer,&a);
 79:     d_h = a + k*(bv->nc+bv->m) + bv->nc+j;
 80:     cerr = cudaMemcpy(d_h,&value,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
 81:     WaitForGPU();CHKERRCUDA(ierr);
 82:     VecCUDARestoreArrayReadWrite(bv->buffer,&a);
 83:   } else { /* cpu memory */
 84:     h[bv->nc+j] = value;
 85:   }
 86:   return(0);
 87: }
 89: /*
 90:    BV_SquareSum_CUDA - Returns the value h'*h, where h represents the contents of the
 91:    coefficients array (up to position j)
 92: */
 93: PetscErrorCode BV_SquareSum_CUDA(BV bv,PetscInt j,PetscScalar *h,PetscReal *sum)
 94: {
 95:   PetscErrorCode    ierr;
 96:   const PetscScalar *d_h;
 97:   PetscScalar       dot;
 98:   PetscInt          i;
 99:   PetscBLASInt      one=1;
100:   cublasStatus_t    cberr;
101:   cublasHandle_t    cublasv2handle;
104:   if (!h) {
105:     PetscCUBLASGetHandle(&cublasv2handle);
106:     VecCUDAGetArrayRead(bv->buffer,&d_h);
107:     cberr = cublasXdotc(cublasv2handle,bv->nc+j,d_h,one,d_h,one,&dot);CHKERRCUBLAS(cberr);
108:     WaitForGPU();CHKERRCUDA(ierr);
109:     *sum = PetscRealPart(dot);
110:     VecCUDARestoreArrayRead(bv->buffer,&d_h);
111:   } else { /* cpu memory */
112:     *sum = 0.0;
113:     for (i=0;i<bv->nc+j;i++) *sum += PetscRealPart(h[i]*PetscConj(h[i]));
114:   }
115:   return(0);
116: }
118: #define X_AXIS        0
119: #define BLOCK_SIZE_X 64
120: #define TILE_SIZE_X  16 /* work to be done by any thread on axis x */
122: /*
123:    Set the kernels grid dimensions
124:    xcount: number of kernel calls needed for the requested size
125:  */
126: PetscErrorCode SetGrid1D(PetscInt n, dim3 *dimGrid, dim3 *dimBlock,PetscInt *xcount)
127: {
128:   PetscInt              one=1,card;
129:   struct cudaDeviceProp devprop;
130:   cudaError_t           cerr;
133:   *xcount = 1;
134:   if (n>BLOCK_SIZE_X) {
135:     dimBlock->x = BLOCK_SIZE_X;
136:     dimGrid->x = (n+BLOCK_SIZE_X*TILE_SIZE_X-one)/BLOCK_SIZE_X*TILE_SIZE_X;
137:   } else {
138:     dimBlock->x = (n+TILE_SIZE_X-one)/TILE_SIZE_X;
139:     dimGrid->x = one;
140:   }
141:   cerr = cudaGetDevice(&card);CHKERRCUDA(cerr);
142:   cerr = cudaGetDeviceProperties(&devprop,card);CHKERRCUDA(cerr);
143:   if (dimGrid->x>(unsigned)devprop.maxGridSize[X_AXIS]) {
144:     *xcount = (dimGrid->x+devprop.maxGridSize[X_AXIS]-one)/devprop.maxGridSize[X_AXIS];
145:     dimGrid->x = devprop.maxGridSize[X_AXIS];
146:   }
147:   return(0);
148: }
150: /* pointwise multiplication */
151: __global__ void PointwiseMult_kernel(PetscInt xcount,PetscScalar *a,const PetscScalar *b,PetscInt n)
152: {
153:   PetscInt i,x;
155:   x = xcount*gridDim.x*blockDim.x+blockIdx.x*blockDim.x*TILE_SIZE_X+threadIdx.x*TILE_SIZE_X;
156:   for (i=x;i<x+TILE_SIZE_X&&i<n;i++) {
157:     a[i] *= PetscRealPart(b[i]);
158:   }
159: }
161: /* pointwise division */
162: __global__ void PointwiseDiv_kernel(PetscInt xcount,PetscScalar *a,const PetscScalar *b,PetscInt n)
163: {
164:   PetscInt i,x;
166:   x = xcount*gridDim.x*blockDim.x+blockIdx.x*blockDim.x*TILE_SIZE_X+threadIdx.x*TILE_SIZE_X;
167:   for (i=x;i<x+TILE_SIZE_X&&i<n;i++) {
168:     a[i] /= PetscRealPart(b[i]);
169:   }
170: }
172: /*
173:    BV_ApplySignature_CUDA - Computes the pointwise product h*omega, where h represents
174:    the contents of the coefficients array (up to position j) and omega is the signature;
175:    if inverse=TRUE then the operation is h/omega
176: */
177: PetscErrorCode BV_ApplySignature_CUDA(BV bv,PetscInt j,PetscScalar *h,PetscBool inverse)
178: {
179:   PetscErrorCode    ierr;
180:   PetscScalar       *d_h;
181:   const PetscScalar *d_omega,*omega;
182:   PetscInt          i,xcount;
183:   dim3              blocks3d, threads3d;
184:   cudaError_t       cerr;
187:   if (!(bv->nc+j)) return(0);
188:   if (!h) {
189:     VecCUDAGetArrayReadWrite(bv->buffer,&d_h);
190:     VecCUDAGetArrayRead(bv->omega,&d_omega);
191:     SetGrid1D(bv->nc+j,&blocks3d,&threads3d,&xcount);
192:     if (inverse) {
193:       for (i=0;i<xcount;i++) {
194:         PointwiseDiv_kernel<<<blocks3d,threads3d>>>(i,d_h,d_omega,bv->nc+j);
195:       }
196:     } else {
197:       for (i=0;i<xcount;i++) {
198:         PointwiseMult_kernel<<<blocks3d,threads3d>>>(i,d_h,d_omega,bv->nc+j);
199:       }
200:     }
201:     cerr = cudaGetLastError();CHKERRCUDA(cerr);
202:     WaitForGPU();CHKERRCUDA(ierr);
203:     VecCUDARestoreArrayRead(bv->omega,&d_omega);
204:     VecCUDARestoreArrayReadWrite(bv->buffer,&d_h);
205:   } else {
206:     VecGetArrayRead(bv->omega,&omega);
207:     if (inverse) for (i=0;i<bv->nc+j;i++) h[i] /= PetscRealPart(omega[i]);
208:     else for (i=0;i<bv->nc+j;i++) h[i] *= PetscRealPart(omega[i]);
209:     VecRestoreArrayRead(bv->omega,&omega);
210:   }
211:   return(0);
212: }
214: /*
215:    BV_SquareRoot_CUDA - Returns the square root of position j (counted after the constraints)
216:    of the coefficients array
217: */
218: PetscErrorCode BV_SquareRoot_CUDA(BV bv,PetscInt j,PetscScalar *h,PetscReal *beta)
219: {
220:   PetscErrorCode    ierr;
221:   const PetscScalar *d_h;
222:   PetscScalar       hh;
223:   cudaError_t       cerr;
226:   if (!h) {
227:     VecCUDAGetArrayRead(bv->buffer,&d_h);
228:     cerr = cudaMemcpy(&hh,d_h+bv->nc+j,sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
229:     WaitForGPU();CHKERRCUDA(ierr);
230:     BV_SafeSqrt(bv,hh,beta);
231:     VecCUDARestoreArrayRead(bv->buffer,&d_h);
232:   } else {
233:     BV_SafeSqrt(bv,h[bv->nc+j],beta);
234:   }
235:   return(0);
236: }
238: /*
239:    BV_StoreCoefficients_CUDA - Copy the contents of the coefficients array to an array dest
240:    provided by the caller (only values from l to j are copied)
241: */
242: PetscErrorCode BV_StoreCoefficients_CUDA(BV bv,PetscInt j,PetscScalar *h,PetscScalar *dest)
243: {
244:   PetscErrorCode    ierr;
245:   const PetscScalar *d_h,*d_a;
246:   PetscInt          i;
247:   cudaError_t       cerr;
250:   if (!h) {
251:     VecCUDAGetArrayRead(bv->buffer,&d_a);
252:     d_h = d_a + j*(bv->nc+bv->m)+bv->nc;
253:     cerr = cudaMemcpy(dest-bv->l,d_h,(j-bv->l)*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
254:     WaitForGPU();CHKERRCUDA(ierr);
255:     VecCUDARestoreArrayRead(bv->buffer,&d_a);
256:   } else {
257:     for (i=bv->l;i<j;i++) dest[i-bv->l] = h[bv->nc+i];
258:   }
259:   return(0);
260: }