Actual source code: sveccuda.cu

slepc-3.14.2 2021-02-01
/*
   - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
   SLEPc - Scalable Library for Eigenvalue Problem Computations
   Copyright (c) 2002-2020, Universitat Politecnica de Valencia, Spain

   This file is part of SLEPc.
   SLEPc is distributed under a 2-clause BSD license (see LICENSE).
   - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
*/
/*
   BV implemented as a single Vec (CUDA version)
*/

#include <slepc/private/bvimpl.h>
#include "../src/sys/classes/bv/impls/svec/svec.h"
#include <petsccublas.h>

#if defined(PETSC_USE_COMPLEX)
#include <thrust/device_ptr.h>
#include <thrust/transform.h>   /* declares thrust::transform, used below */
#endif

#define BLOCKSIZE 64

/*
    B := alpha*A + beta*B

    A,B are nxk (ld=n)
 */
static PetscErrorCode BVAXPY_BLAS_CUDA(BV bv,PetscInt n_,PetscInt k_,PetscScalar alpha,const PetscScalar *d_A,PetscScalar beta,PetscScalar *d_B)
{
  PetscBLASInt   m=0,one=1;
  cublasStatus_t cberr;
  cublasHandle_t cublasv2handle;

  PetscCUBLASGetHandle(&cublasv2handle);
  PetscBLASIntCast(n_*k_,&m);
  PetscLogGpuTimeBegin();
  if (beta!=(PetscScalar)1.0) {
    cberr = cublasXscal(cublasv2handle,m,&beta,d_B,one);CHKERRCUBLAS(cberr);
    PetscLogGpuFlops(1.0*m);
  }
  cberr = cublasXaxpy(cublasv2handle,m,&alpha,d_A,one,d_B,one);CHKERRCUBLAS(cberr);
  PetscLogGpuTimeEnd();
  PetscLogGpuFlops(2.0*m);
  return(0);
}
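/*
   Illustration (editor's sketch, not part of the original source): the helper
   above treats the n x k matrices as one contiguous vector of length m = n*k,
   so B := alpha*A + beta*B reduces to one SCAL plus one AXPY. A minimal
   standalone cuBLAS version of the same idea, assuming double precision and
   device pointers dA,dB of length m:

     cublasHandle_t h;
     cublasCreate(&h);
     if (beta != 1.0) cublasDscal(h,m,&beta,dB,1);  // B := beta*B
     cublasDaxpy(h,m,&alpha,dA,1,dB,1);             // B := alpha*A + B
     cublasDestroy(h);
*/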

/*
    C := alpha*A*B + beta*C
*/
PetscErrorCode BVMult_Svec_CUDA(BV Y,PetscScalar alpha,PetscScalar beta,BV X,Mat Q)
{
  PetscErrorCode    ierr;
  BV_SVEC           *y = (BV_SVEC*)Y->data,*x = (BV_SVEC*)X->data;
  const PetscScalar *d_px,*d_A;
  PetscScalar       *d_py,*q,*d_q,*d_B,*d_C;
  PetscInt          ldq,mq;
  PetscBLASInt      m=0,n=0,k=0,ldq_=0;
  cublasStatus_t    cberr;
  cudaError_t       cerr;
  cublasHandle_t    cublasv2handle;

  if (!Y->n) return(0);
  VecCUDAGetArrayRead(x->v,&d_px);
  if (beta==(PetscScalar)0.0) {
    VecCUDAGetArrayWrite(y->v,&d_py);
  } else {
    VecCUDAGetArray(y->v,&d_py);
  }
  d_A = d_px+(X->nc+X->l)*X->n;
  d_C = d_py+(Y->nc+Y->l)*Y->n;
  if (Q) {
    PetscBLASIntCast(Y->n,&m);
    PetscBLASIntCast(Y->k-Y->l,&n);
    PetscBLASIntCast(X->k-X->l,&k);
    PetscCUBLASGetHandle(&cublasv2handle);
    MatGetSize(Q,&ldq,&mq);
    PetscBLASIntCast(ldq,&ldq_);
    MatDenseGetArray(Q,&q);
    cerr = cudaMalloc((void**)&d_q,ldq*mq*sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMemcpy(d_q,q,ldq*mq*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    PetscLogCpuToGpu(ldq*mq*sizeof(PetscScalar));
    d_B = d_q+Y->l*ldq+X->l;
    PetscLogGpuTimeBegin();
    cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,m,n,k,&alpha,d_A,m,d_B,ldq_,&beta,d_C,m);CHKERRCUBLAS(cberr);
    PetscLogGpuTimeEnd();
    MatDenseRestoreArray(Q,&q);
    cerr = cudaFree(d_q);CHKERRCUDA(cerr);
    PetscLogGpuFlops(2.0*m*n*k);
  } else {
    BVAXPY_BLAS_CUDA(Y,Y->n,Y->k-Y->l,alpha,d_A,beta,d_C);
  }
  VecCUDARestoreArrayRead(x->v,&d_px);
  if (beta==(PetscScalar)0.0) {  /* restore must match the get above */
    VecCUDARestoreArrayWrite(y->v,&d_py);
  } else {
    VecCUDARestoreArray(y->v,&d_py);
  }
  return(0);
}
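/*
   Usage sketch (editor's illustration, not part of the original source): this
   kernel is reached through the public BVMult() interface. Assuming kx,ky are
   the numbers of active columns of X and Y, and Q is a sequential dense Mat:

     Mat Q;
     MatCreateSeqDense(PETSC_COMM_SELF,kx,ky,NULL,&Q);
     ... fill Q via MatDenseGetArray()/MatDenseRestoreArray() ...
     BVMult(Y,alpha,beta,X,Q);   // Y := beta*Y + alpha*X*Q
     MatDestroy(&Q);
*/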

/*
    y := alpha*A*x + beta*y
*/
PetscErrorCode BVMultVec_Svec_CUDA(BV X,PetscScalar alpha,PetscScalar beta,Vec y,PetscScalar *q)
{
  PetscErrorCode    ierr;
  BV_SVEC           *x = (BV_SVEC*)X->data;
  const PetscScalar *d_px,*d_A;
  PetscScalar       *d_py,*d_q,*d_x,*d_y;
  PetscBLASInt      n=0,k=0,one=1;
  cublasStatus_t    cberr;
  cublasHandle_t    cublasv2handle;
  cudaError_t       cerr;

  PetscBLASIntCast(X->n,&n);
  PetscBLASIntCast(X->k-X->l,&k);
  PetscCUBLASGetHandle(&cublasv2handle);
  VecCUDAGetArrayRead(x->v,&d_px);
  if (beta==(PetscScalar)0.0) {
    VecCUDAGetArrayWrite(y,&d_py);
  } else {
    VecCUDAGetArray(y,&d_py);
  }
  if (!q) {
    VecCUDAGetArray(X->buffer,&d_q);
  } else {
    cerr = cudaMalloc((void**)&d_q,k*sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMemcpy(d_q,q,k*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    PetscLogCpuToGpu(k*sizeof(PetscScalar));
  }
  d_A = d_px+(X->nc+X->l)*X->n;
  d_x = d_q;
  d_y = d_py;
  PetscLogGpuTimeBegin();
  cberr = cublasXgemv(cublasv2handle,CUBLAS_OP_N,n,k,&alpha,d_A,n,d_x,one,&beta,d_y,one);CHKERRCUBLAS(cberr);
  PetscLogGpuTimeEnd();
  VecCUDARestoreArrayRead(x->v,&d_px);
  if (beta==(PetscScalar)0.0) {
    VecCUDARestoreArrayWrite(y,&d_py);
  } else {
    VecCUDARestoreArray(y,&d_py);
  }
  if (!q) {
    VecCUDARestoreArray(X->buffer,&d_q);
  } else {
    cerr = cudaFree(d_q);CHKERRCUDA(cerr);
  }
  PetscLogGpuFlops(2.0*n*k);
  return(0);
}
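/*
   Usage sketch (editor's illustration, not part of the original source): the
   public entry point is BVMultVec(). Assuming X has k active columns and q is
   a host array of k coefficients:

     PetscScalar *q;
     PetscMalloc1(k,&q);
     ... fill q ...
     BVMultVec(X,1.0,0.0,y,q);   // y := X*q
     PetscFree(q);
*/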

/*
    A(:,s:e-1) := A*B(:,s:e-1)
*/
PetscErrorCode BVMultInPlace_Svec_CUDA(BV V,Mat Q,PetscInt s,PetscInt e)
{
  BV_SVEC        *ctx = (BV_SVEC*)V->data;
  PetscScalar    *d_pv,*q,*d_q,*d_A,*d_B,*d_work,sone=1.0,szero=0.0;
  PetscInt       j,ldq,nq;
  PetscBLASInt   m=0,n=0,k=0,l,ldq_=0,bs=BLOCKSIZE;
  cublasStatus_t cberr;
  size_t         freemem,totmem;
  cublasHandle_t cublasv2handle;
  cudaError_t    cerr;

  if (!V->n) return(0);
  PetscBLASIntCast(V->n,&m);
  PetscBLASIntCast(e-s,&n);
  PetscBLASIntCast(V->k-V->l,&k);
  MatGetSize(Q,&ldq,&nq);
  PetscBLASIntCast(ldq,&ldq_);
  VecCUDAGetArray(ctx->v,&d_pv);
  MatDenseGetArray(Q,&q);
  cerr = cudaMalloc((void**)&d_q,ldq*nq*sizeof(PetscScalar));CHKERRCUDA(cerr);
  cerr = cudaMemcpy(d_q,q,ldq*nq*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  PetscLogCpuToGpu(ldq*nq*sizeof(PetscScalar));
  PetscCUBLASGetHandle(&cublasv2handle);
  PetscLogGpuTimeBegin();
  /* try to allocate the whole matrix */
  cerr = cudaMemGetInfo(&freemem,&totmem);CHKERRCUDA(cerr);
  if (freemem>=m*n*sizeof(PetscScalar)) {
    cerr = cudaMalloc((void**)&d_work,m*n*sizeof(PetscScalar));CHKERRCUDA(cerr);
    d_A = d_pv+(V->nc+V->l)*m;
    d_B = d_q+V->l*ldq+V->l+(s-V->l)*ldq;
    cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,m,n,k,&sone,d_A,m,d_B,ldq_,&szero,d_work,m);CHKERRCUBLAS(cberr);
    for (j=0;j<n;j++) {
      cerr = cudaMemcpy(d_A+(s-V->l+j)*m,d_work+(j*m),m*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
    }
  } else {
    bs = freemem/(m*sizeof(PetscScalar));
    cerr = cudaMalloc((void**)&d_work,bs*n*sizeof(PetscScalar));CHKERRCUDA(cerr);
    l = m % bs;
    if (l) {
      d_A = d_pv+(V->nc+V->l)*m;
      d_B = d_q+V->l*ldq+V->l+(s-V->l)*ldq;
      cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,l,n,k,&sone,d_A,m,d_B,ldq_,&szero,d_work,l);CHKERRCUBLAS(cberr);
      for (j=0;j<n;j++) {
        cerr = cudaMemcpy(d_A+(s-V->l+j)*m,d_work+(j*l),l*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
      }
    }
    for (;l<m;l+=bs) {
      d_A = d_pv+(V->nc+V->l)*m+l;
      d_B = d_q+V->l*ldq+V->l+(s-V->l)*ldq;
      cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,bs,n,k,&sone,d_A,m,d_B,ldq_,&szero,d_work,bs);CHKERRCUBLAS(cberr);
      for (j=0;j<n;j++) {
        cerr = cudaMemcpy(d_A+(s-V->l+j)*m,d_work+(j*bs),bs*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
      }
    }
  }
  PetscLogGpuTimeEnd();
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  MatDenseRestoreArray(Q,&q);
  cerr = cudaFree(d_q);CHKERRCUDA(cerr);
  cerr = cudaFree(d_work);CHKERRCUDA(cerr);
  VecCUDARestoreArray(ctx->v,&d_pv);
  PetscLogGpuFlops(2.0*m*n*k);
  return(0);
}
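/*
   Note (editor's addition): when the full m x n work matrix does not fit in
   free device memory, the routine above falls back to computing the product
   in blocks of bs rows, reusing a smaller bs x n buffer. The public entry
   point is BVMultInPlace(); assuming Q is a square sequential dense matrix
   of order k (the number of active columns):

     BVMultInPlace(V,Q,s,e);   // V(:,s:e-1) := V*Q(:,s:e-1)
*/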

/*
    A(:,s:e-1) := A*B(s:e-1,:)'
*/
PetscErrorCode BVMultInPlaceTranspose_Svec_CUDA(BV V,Mat Q,PetscInt s,PetscInt e)
{
  BV_SVEC        *ctx = (BV_SVEC*)V->data;
  PetscScalar    *d_pv,*q,*d_q,*d_A,*d_B,*d_work,sone=1.0,szero=0.0;
  PetscInt       j,ldq,nq;
  PetscBLASInt   m=0,n=0,k=0,ldq_=0;
  cublasStatus_t cberr;
  cublasHandle_t cublasv2handle;
  cudaError_t    cerr;

  if (!V->n) return(0);
  PetscBLASIntCast(V->n,&m);
  PetscBLASIntCast(e-s,&n);
  PetscBLASIntCast(V->k-V->l,&k);
  MatGetSize(Q,&ldq,&nq);
  PetscBLASIntCast(ldq,&ldq_);
  VecCUDAGetArray(ctx->v,&d_pv);
  MatDenseGetArray(Q,&q);
  PetscCUBLASGetHandle(&cublasv2handle);
  cerr = cudaMalloc((void**)&d_q,ldq*nq*sizeof(PetscScalar));CHKERRCUDA(cerr);
  cerr = cudaMemcpy(d_q,q,ldq*nq*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  PetscLogCpuToGpu(ldq*nq*sizeof(PetscScalar));
  PetscLogGpuTimeBegin();
  cerr = cudaMalloc((void**)&d_work,m*n*sizeof(PetscScalar));CHKERRCUDA(cerr);
  d_A = d_pv+(V->nc+V->l)*m;
  d_B = d_q+V->l*ldq+s;
  cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_C,m,n,k,&sone,d_A,m,d_B,ldq_,&szero,d_work,m);CHKERRCUBLAS(cberr);
  for (j=0;j<n;j++) {
    cerr = cudaMemcpy(d_A+(s-V->l+j)*m,d_work+(j*m),m*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
  }
  PetscLogGpuTimeEnd();
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  MatDenseRestoreArray(Q,&q);
  cerr = cudaFree(d_q);CHKERRCUDA(cerr);
  cerr = cudaFree(d_work);CHKERRCUDA(cerr);
  VecCUDARestoreArray(ctx->v,&d_pv);
  PetscLogGpuFlops(2.0*m*n*k);
  return(0);
}
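/*
   Usage sketch (editor's illustration, not part of the original source): the
   public entry point is BVMultInPlaceTranspose(), which applies the
   (conjugate) transpose of Q:

     BVMultInPlaceTranspose(V,Q,s,e);   // V(:,s:e-1) := V*Q(s:e-1,:)'
*/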

/*
    C := A'*B
*/
PetscErrorCode BVDot_Svec_CUDA(BV X,BV Y,Mat M)
{
  PetscErrorCode    ierr;
  BV_SVEC           *x = (BV_SVEC*)X->data,*y = (BV_SVEC*)Y->data;
  const PetscScalar *d_px,*d_py,*d_A,*d_B;
  PetscScalar       *pm,*d_work,sone=1.0,szero=0.0,*C,*CC;
  PetscInt          j,ldm;
  PetscBLASInt      m=0,n=0,k=0,ldm_=0;
  PetscMPIInt       len;
  cublasStatus_t    cberr;
  cublasHandle_t    cublasv2handle;
  cudaError_t       cerr;

  PetscBLASIntCast(Y->k-Y->l,&m);
  PetscBLASIntCast(X->k-X->l,&n);
  PetscBLASIntCast(X->n,&k);
  MatGetSize(M,&ldm,NULL);
  PetscBLASIntCast(ldm,&ldm_);
  VecCUDAGetArrayRead(x->v,&d_px);
  VecCUDAGetArrayRead(y->v,&d_py);
  MatDenseGetArray(M,&pm);
  PetscCUBLASGetHandle(&cublasv2handle);
  cerr = cudaMalloc((void**)&d_work,m*n*sizeof(PetscScalar));CHKERRCUDA(cerr);
  d_A = d_py+(Y->nc+Y->l)*Y->n;
  d_B = d_px+(X->nc+X->l)*X->n;
  C = pm+X->l*ldm+Y->l;
  if (x->mpi) {
    if (ldm==m) {
      BVAllocateWork_Private(X,m*n);
      if (k) {
        PetscLogGpuTimeBegin();
        cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_C,CUBLAS_OP_N,m,n,k,&sone,d_A,k,d_B,k,&szero,d_work,ldm_);CHKERRCUBLAS(cberr);
        PetscLogGpuTimeEnd();
        cerr = cudaMemcpy(X->work,d_work,m*n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
        PetscLogGpuToCpu(m*n*sizeof(PetscScalar));
      } else {
        PetscArrayzero(X->work,m*n);
      }
      PetscMPIIntCast(m*n,&len);
      MPIU_Allreduce(X->work,C,len,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)X));
    } else {
      BVAllocateWork_Private(X,2*m*n);
      CC = X->work+m*n;
      if (k) {
        PetscLogGpuTimeBegin();
        cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_C,CUBLAS_OP_N,m,n,k,&sone,d_A,k,d_B,k,&szero,d_work,m);CHKERRCUBLAS(cberr);
        PetscLogGpuTimeEnd();
        cerr = cudaMemcpy(X->work,d_work,m*n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
        PetscLogGpuToCpu(m*n*sizeof(PetscScalar));
      } else {
        PetscArrayzero(X->work,m*n);
      }
      PetscMPIIntCast(m*n,&len);
      MPIU_Allreduce(X->work,CC,len,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)X));
      for (j=0;j<n;j++) {
        PetscArraycpy(C+j*ldm,CC+j*m,m);
      }
    }
  } else {
    if (k) {
      BVAllocateWork_Private(X,m*n);
      PetscLogGpuTimeBegin();
      cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_C,CUBLAS_OP_N,m,n,k,&sone,d_A,k,d_B,k,&szero,d_work,m);CHKERRCUBLAS(cberr);
      PetscLogGpuTimeEnd();
      cerr = cudaMemcpy(X->work,d_work,m*n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
      PetscLogGpuToCpu(m*n*sizeof(PetscScalar));
      for (j=0;j<n;j++) {
        PetscArraycpy(C+j*ldm,X->work+j*m,m);
      }
    }
  }
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  cerr = cudaFree(d_work);CHKERRCUDA(cerr);
  MatDenseRestoreArray(M,&pm);
  VecCUDARestoreArrayRead(x->v,&d_px);
  VecCUDARestoreArrayRead(y->v,&d_py);
  PetscLogGpuFlops(2.0*m*n*k);
  return(0);
}
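/*
   Usage sketch (editor's illustration, not part of the original source): the
   public entry point is BVDot(), which stores Y'*X (over the active columns)
   into a preallocated sequential dense matrix. Assuming ky,kx are the numbers
   of active columns of Y and X:

     Mat M;
     MatCreateSeqDense(PETSC_COMM_SELF,ky,kx,NULL,&M);
     BVDot(X,Y,M);   // M(i,j) = y_i'*x_j, reduced over all processes
     MatDestroy(&M);
*/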

#if defined(PETSC_USE_COMPLEX)
struct conjugate
{
  __host__ __device__
    PetscScalar operator()(PetscScalar x)
    {
      return PetscConj(x);
    }
};

PetscErrorCode ConjugateCudaArray(PetscScalar *a, PetscInt n)
{
  cudaError_t                     cerr;
  thrust::device_ptr<PetscScalar> ptr;

  try {
    ptr = thrust::device_pointer_cast(a);
    thrust::transform(ptr,ptr+n,ptr,conjugate());
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
  } catch (char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
  }
  return(0);
}
#endif

/*
    y := A'*x computed as y' := x'*A
    (in complex arithmetic x'*A yields conj(A'*x), hence the final fix-up
    with ConjugateCudaArray)
*/
PetscErrorCode BVDotVec_Svec_CUDA(BV X,Vec y,PetscScalar *q)
{
  PetscErrorCode    ierr;
  BV_SVEC           *x = (BV_SVEC*)X->data;
  const PetscScalar *d_A,*d_x,*d_px,*d_py;
  PetscScalar       *d_work,szero=0.0,sone=1.0,*qq=q;
  PetscBLASInt      n=0,k=0,one=1;
  PetscMPIInt       len;
  Vec               z = y;
  cublasStatus_t    cberr;
  cublasHandle_t    cublasv2handle;
  cudaError_t       cerr;

  PetscBLASIntCast(X->n,&n);
  PetscBLASIntCast(X->k-X->l,&k);
  PetscCUBLASGetHandle(&cublasv2handle);
  if (X->matrix) {
    BV_IPMatMult(X,y);
    z = X->Bx;
  }
  VecCUDAGetArrayRead(x->v,&d_px);
  VecCUDAGetArrayRead(z,&d_py);
  if (!q) {
    VecCUDAGetArrayWrite(X->buffer,&d_work);
  } else {
    cerr = cudaMalloc((void**)&d_work,k*sizeof(PetscScalar));CHKERRCUDA(cerr);
  }
  d_A = d_px+(X->nc+X->l)*X->n;
  d_x = d_py;
  if (x->mpi) {
    BVAllocateWork_Private(X,k);
    if (n) {
      PetscLogGpuTimeBegin();
#if defined(PETSC_USE_COMPLEX)
      cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_C,CUBLAS_OP_N,one,k,n,&sone,d_x,n,d_A,n,&szero,d_work,one);CHKERRCUBLAS(cberr);
      ConjugateCudaArray(d_work,k);
#else
      cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,one,k,n,&sone,d_x,one,d_A,n,&szero,d_work,one);CHKERRCUBLAS(cberr);
#endif
      PetscLogGpuTimeEnd();
      cerr = cudaMemcpy(X->work,d_work,k*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
      PetscLogGpuToCpu(k*sizeof(PetscScalar));
    } else {
      PetscArrayzero(X->work,k);
    }
    if (!q) {
      VecCUDARestoreArrayWrite(X->buffer,&d_work);
      VecGetArray(X->buffer,&qq);
    } else {
      cerr = cudaFree(d_work);CHKERRCUDA(cerr);
    }
    PetscMPIIntCast(k,&len);
    MPIU_Allreduce(X->work,qq,len,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)X));
    if (!q) { VecRestoreArray(X->buffer,&qq); }
  } else {
    if (n) {
      PetscLogGpuTimeBegin();
#if defined(PETSC_USE_COMPLEX)
      cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_C,CUBLAS_OP_N,one,k,n,&sone,d_x,n,d_A,n,&szero,d_work,one);CHKERRCUBLAS(cberr);
      ConjugateCudaArray(d_work,k);
#else
      cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,one,k,n,&sone,d_x,one,d_A,n,&szero,d_work,one);CHKERRCUBLAS(cberr);
#endif
      PetscLogGpuTimeEnd();
    }
    if (!q) {
      VecCUDARestoreArrayWrite(X->buffer,&d_work);
    } else {
      cerr = cudaMemcpy(q,d_work,k*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
      PetscLogGpuToCpu(k*sizeof(PetscScalar));
      cerr = cudaFree(d_work);CHKERRCUDA(cerr);
    }
  }
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  VecCUDARestoreArrayRead(z,&d_py);
  VecCUDARestoreArrayRead(x->v,&d_px);
  PetscLogGpuFlops(2.0*n*k);
  return(0);
}
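/*
   Usage sketch (editor's illustration, not part of the original source): the
   public entry point is BVDotVec(). Assuming X has k active columns and m is
   a host array of length k:

     PetscScalar *m;
     PetscMalloc1(k,&m);
     BVDotVec(X,y,m);   // m = X'*y, reduced over all processes
     PetscFree(m);
*/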

/*
    y := A'*x computed as y' := x'*A
    (local part only: unlike BVDotVec_Svec_CUDA, no reduction across processes)
*/
PetscErrorCode BVDotVec_Local_Svec_CUDA(BV X,Vec y,PetscScalar *m)
{
  PetscErrorCode    ierr;
  BV_SVEC           *x = (BV_SVEC*)X->data;
  const PetscScalar *d_A,*d_x,*d_px,*d_py;
  PetscScalar       *d_y,szero=0.0,sone=1.0;
  PetscBLASInt      n=0,k=0,one=1;
  Vec               z = y;
  cublasStatus_t    cberr;
  cublasHandle_t    cublasv2handle;
  cudaError_t       cerr;

  PetscBLASIntCast(X->n,&n);
  PetscBLASIntCast(X->k-X->l,&k);
  if (X->matrix) {
    BV_IPMatMult(X,y);
    z = X->Bx;
  }
  PetscCUBLASGetHandle(&cublasv2handle);
  VecCUDAGetArrayRead(x->v,&d_px);
  VecCUDAGetArrayRead(z,&d_py);
  d_A = d_px+(X->nc+X->l)*X->n;
  d_x = d_py;
  if (n) {
    cerr = cudaMalloc((void**)&d_y,k*sizeof(PetscScalar));CHKERRCUDA(cerr);
    PetscLogGpuTimeBegin();
#if defined(PETSC_USE_COMPLEX)
    cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_C,CUBLAS_OP_N,one,k,n,&sone,d_x,n,d_A,n,&szero,d_y,one);CHKERRCUBLAS(cberr);
    ConjugateCudaArray(d_y,k);
#else
    cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,one,k,n,&sone,d_x,one,d_A,n,&szero,d_y,one);CHKERRCUBLAS(cberr);
#endif
    PetscLogGpuTimeEnd();
    cerr = cudaMemcpy(m,d_y,k*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    PetscLogGpuToCpu(k*sizeof(PetscScalar));
    cerr = cudaFree(d_y);CHKERRCUDA(cerr);
  }
  VecCUDARestoreArrayRead(z,&d_py);
  VecCUDARestoreArrayRead(x->v,&d_px);
  PetscLogGpuFlops(2.0*n*k);
  return(0);
}

/*
    Scale a single column (j>=0) or all active columns (j<0) by alpha
*/
PetscErrorCode BVScale_Svec_CUDA(BV bv,PetscInt j,PetscScalar alpha)
{
  BV_SVEC        *ctx = (BV_SVEC*)bv->data;
  PetscScalar    *d_array, *d_A;
  PetscBLASInt   n=0,one=1;
  cublasStatus_t cberr;
  cublasHandle_t cublasv2handle;
  cudaError_t    cerr;

  VecCUDAGetArray(ctx->v,&d_array);
  if (j<0) {
    d_A = d_array+(bv->nc+bv->l)*bv->n;
    PetscBLASIntCast((bv->k-bv->l)*bv->n,&n);
  } else {
    d_A = d_array+(bv->nc+j)*bv->n;
    PetscBLASIntCast(bv->n,&n);
  }
  if (alpha == (PetscScalar)0.0) {
    cerr = cudaMemset(d_A,0,n*sizeof(PetscScalar));CHKERRCUDA(cerr);
  } else if (alpha != (PetscScalar)1.0) {
    PetscCUBLASGetHandle(&cublasv2handle);
    PetscLogGpuTimeBegin();
    cberr = cublasXscal(cublasv2handle,n,&alpha,d_A,one);CHKERRCUBLAS(cberr);
    PetscLogGpuTimeEnd();
    PetscLogGpuFlops(1.0*n);
  }
  VecCUDARestoreArray(ctx->v,&d_array);
  return(0);
}
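/*
   Usage sketch (editor's illustration, not part of the original source): the
   public entry points are BVScale() and BVScaleColumn():

     BVScale(bv,alpha);           // scale all active columns (j<0 path above)
     BVScaleColumn(bv,j,alpha);   // scale a single column (j>=0 path above)
*/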

PetscErrorCode BVMatMult_Svec_CUDA(BV V,Mat A,BV W)
{
  PetscErrorCode    ierr;
  BV_SVEC           *v = (BV_SVEC*)V->data,*w = (BV_SVEC*)W->data;
  Mat               Vmat,Wmat;
  const PetscScalar *d_pv;
  PetscScalar       *d_pw;
  PetscInt          j;

  if (V->vmm) {
    BVGetMat(V,&Vmat);
    BVGetMat(W,&Wmat);
    MatProductCreateWithMat(A,Vmat,NULL,Wmat);
    MatProductSetType(Wmat,MATPRODUCT_AB);
    MatProductSetFromOptions(Wmat);
    MatProductSymbolic(Wmat);
    MatProductNumeric(Wmat);
    MatProductClear(Wmat);
    BVRestoreMat(V,&Vmat);
    BVRestoreMat(W,&Wmat);
  } else {
    VecCUDAGetArrayRead(v->v,&d_pv);
    VecCUDAGetArrayWrite(w->v,&d_pw);
    for (j=0;j<V->k-V->l;j++) {
      VecCUDAPlaceArray(V->cv[1],(PetscScalar *)d_pv+(V->nc+V->l+j)*V->n);
      VecCUDAPlaceArray(W->cv[1],d_pw+(W->nc+W->l+j)*W->n);
      MatMult(A,V->cv[1],W->cv[1]);
      VecCUDAResetArray(V->cv[1]);
      VecCUDAResetArray(W->cv[1]);
    }
    VecCUDARestoreArrayRead(v->v,&d_pv);
    VecCUDARestoreArrayWrite(w->v,&d_pw);
  }
  return(0);
}
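/*
   Usage sketch (editor's illustration, not part of the original source): the
   public entry point is BVMatMult(), which applies A to every active column
   of V and stores the result in the corresponding columns of W:

     BVMatMult(V,A,W);   // W(:,j) := A*V(:,j) for each active column j
*/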

PetscErrorCode BVCopy_Svec_CUDA(BV V,BV W)
{
  PetscErrorCode    ierr;
  BV_SVEC           *v = (BV_SVEC*)V->data,*w = (BV_SVEC*)W->data;
  const PetscScalar *d_pv,*d_pvc;
  PetscScalar       *d_pw,*d_pwc;
  cudaError_t       cerr;

  VecCUDAGetArrayRead(v->v,&d_pv);
  VecCUDAGetArrayWrite(w->v,&d_pw);
  d_pvc = d_pv+(V->nc+V->l)*V->n;
  d_pwc = d_pw+(W->nc+W->l)*W->n;
  cerr = cudaMemcpy(d_pwc,d_pvc,(V->k-V->l)*V->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
  VecCUDARestoreArrayRead(v->v,&d_pv);
  VecCUDARestoreArrayWrite(w->v,&d_pw);
  return(0);
}

PetscErrorCode BVCopyColumn_Svec_CUDA(BV V,PetscInt j,PetscInt i)
{
  BV_SVEC        *v = (BV_SVEC*)V->data;
  PetscScalar    *d_pv;
  cudaError_t    cerr;

  VecCUDAGetArray(v->v,&d_pv);
  cerr = cudaMemcpy(d_pv+(V->nc+i)*V->n,d_pv+(V->nc+j)*V->n,V->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
  VecCUDARestoreArray(v->v,&d_pv);
  return(0);
}
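/*
   Usage sketch (editor's illustration, not part of the original source):

     BVCopy(V,W);          // copy all active columns of V into W
     BVCopyColumn(V,j,i);  // copy column j onto column i within the same BV
*/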

PetscErrorCode BVResize_Svec_CUDA(BV bv,PetscInt m,PetscBool copy)
{
  PetscErrorCode    ierr;
  BV_SVEC           *ctx = (BV_SVEC*)bv->data;
  const PetscScalar *d_pv;
  PetscScalar       *d_pnew;
  PetscInt          bs;
  Vec               vnew;
  char              str[50];
  cudaError_t       cerr;

  VecGetBlockSize(bv->t,&bs);
  VecCreate(PetscObjectComm((PetscObject)bv->t),&vnew);
  VecSetType(vnew,((PetscObject)bv->t)->type_name);
  VecSetSizes(vnew,m*bv->n,PETSC_DECIDE);
  VecSetBlockSize(vnew,bs);
  PetscLogObjectParent((PetscObject)bv,(PetscObject)vnew);
  if (((PetscObject)bv)->name) {
    PetscSNPrintf(str,sizeof(str),"%s_0",((PetscObject)bv)->name);
    PetscObjectSetName((PetscObject)vnew,str);
  }
  if (copy) {
    VecCUDAGetArrayRead(ctx->v,&d_pv);
    VecCUDAGetArrayWrite(vnew,&d_pnew);
    cerr = cudaMemcpy(d_pnew,d_pv,PetscMin(m,bv->m)*bv->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
    VecCUDARestoreArrayRead(ctx->v,&d_pv);
    VecCUDARestoreArrayWrite(vnew,&d_pnew);
  }
  VecDestroy(&ctx->v);
  ctx->v = vnew;
  return(0);
}

PetscErrorCode BVGetColumn_Svec_CUDA(BV bv,PetscInt j,Vec *v)
{
  BV_SVEC        *ctx = (BV_SVEC*)bv->data;
  PetscScalar    *d_pv;
  PetscInt       l;

  l = BVAvailableVec;
  VecCUDAGetArray(ctx->v,&d_pv);
  VecCUDAPlaceArray(bv->cv[l],d_pv+(bv->nc+j)*bv->n);
  return(0);
}

PetscErrorCode BVRestoreColumn_Svec_CUDA(BV bv,PetscInt j,Vec *v)
{
  BV_SVEC        *ctx = (BV_SVEC*)bv->data;
  PetscInt       l;

  l = (j==bv->ci[0])? 0: 1;
  VecCUDAResetArray(bv->cv[l]);
  VecCUDARestoreArray(ctx->v,NULL);
  return(0);
}
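/*
   Usage sketch (editor's illustration, not part of the original source):
   columns are borrowed through BVGetColumn() and must be returned; since the
   implementation keeps only two work Vecs (bv->cv[0] and bv->cv[1]), only a
   couple of columns can be held at the same time:

     Vec vj;
     BVGetColumn(bv,j,&vj);
     ... operate on vj as a regular CUDA Vec ...
     BVRestoreColumn(bv,j,&vj);
*/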

PetscErrorCode BVRestoreSplit_Svec_CUDA(BV bv,BV *L,BV *R)
{
  PetscErrorCode    ierr;
  Vec               v;
  const PetscScalar *d_pv;
  PetscObjectState  lstate,rstate;
  PetscBool         change=PETSC_FALSE;

  /* force sync flag to PETSC_CUDA_BOTH */
  if (L) {
    PetscObjectStateGet((PetscObject)*L,&lstate);
    if (lstate != bv->lstate) {
      v = ((BV_SVEC*)bv->L->data)->v;
      VecCUDAGetArrayRead(v,&d_pv);
      VecCUDARestoreArrayRead(v,&d_pv);
      change = PETSC_TRUE;
    }
  }
  if (R) {
    PetscObjectStateGet((PetscObject)*R,&rstate);
    if (rstate != bv->rstate) {
      v = ((BV_SVEC*)bv->R->data)->v;
      VecCUDAGetArrayRead(v,&d_pv);
      VecCUDARestoreArrayRead(v,&d_pv);
      change = PETSC_TRUE;
    }
  }
  if (change) {
    v = ((BV_SVEC*)bv->data)->v;
    VecCUDAGetArray(v,(PetscScalar **)&d_pv);
    VecCUDARestoreArray(v,(PetscScalar **)&d_pv);
  }
  return(0);
}
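/*
   Usage sketch (editor's illustration, not part of the original source): this
   routine is reached through BVRestoreSplit(), which ends the access started
   with BVGetSplit():

     BV L,R;
     BVGetSplit(bv,&L,&R);   // L: leading columns, R: remaining columns
     ... work with L and/or R ...
     BVRestoreSplit(bv,&L,&R);
*/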

PetscErrorCode BVGetMat_Svec_CUDA(BV bv,Mat *A)
{
  BV_SVEC        *ctx = (BV_SVEC*)bv->data;
  PetscScalar    *vv,*aa;
  PetscBool      create=PETSC_FALSE;
  PetscInt       m,cols;

  m = bv->k-bv->l;
  if (!bv->Aget) create=PETSC_TRUE;
  else {
    MatDenseCUDAGetArray(bv->Aget,&aa);
    if (aa) SETERRQ(PetscObjectComm((PetscObject)bv),PETSC_ERR_ARG_WRONGSTATE,"BVGetMat already called on this BV");
    MatGetSize(bv->Aget,NULL,&cols);
    if (cols!=m) {
      MatDestroy(&bv->Aget);
      create=PETSC_TRUE;
    }
  }
  VecCUDAGetArray(ctx->v,&vv);
  if (create) {
    MatCreateDenseCUDA(PetscObjectComm((PetscObject)bv),bv->n,PETSC_DECIDE,bv->N,m,vv,&bv->Aget); /* pass a pointer to avoid allocation of storage */
    MatDenseCUDAReplaceArray(bv->Aget,NULL);  /* replace with a null pointer, the value after BVRestoreMat */
    PetscLogObjectParent((PetscObject)bv,(PetscObject)bv->Aget);
  }
  MatDenseCUDAPlaceArray(bv->Aget,vv+(bv->nc+bv->l)*bv->n);  /* set the actual pointer */
  *A = bv->Aget;
  return(0);
}

PetscErrorCode BVRestoreMat_Svec_CUDA(BV bv,Mat *A)
{
  BV_SVEC        *ctx = (BV_SVEC*)bv->data;
  PetscScalar    *vv,*aa;

  MatDenseCUDAGetArray(bv->Aget,&aa);
  vv = aa-(bv->nc+bv->l)*bv->n;
  MatDenseCUDAResetArray(bv->Aget);
  VecCUDARestoreArray(ctx->v,&vv);
  *A = NULL;
  return(0);
}
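/*
   Usage sketch (editor's illustration, not part of the original source):
   BVGetMat()/BVRestoreMat() expose the active columns as a dense CUDA Mat
   without copying the data:

     Mat A;
     BVGetMat(bv,&A);      // A wraps the storage of the active columns
     ... use A, e.g. pass it to matrix-matrix products ...
     BVRestoreMat(bv,&A);
*/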