Actual source code: bvlapack.c

slepc-3.15.0 2021-03-31
Report Typos and Errors
  1: /*
  2:    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  3:    SLEPc - Scalable Library for Eigenvalue Problem Computations
  4:    Copyright (c) 2002-2021, Universitat Politecnica de Valencia, Spain

  6:    This file is part of SLEPc.
  7:    SLEPc is distributed under a 2-clause BSD license (see LICENSE).
  8:    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  9: */
 10: /*
 11:    BV private kernels that use the LAPACK
 12: */

 14: #include <slepc/private/bvimpl.h>
 15: #include <slepcblaslapack.h>

 17: /*
 18:     Reduction operation to compute sqrt(x**2+y**2) when normalizing vectors
 19: */
 20: SLEPC_EXTERN void MPIAPI SlepcPythag(void *in,void *inout,PetscMPIInt *len,MPI_Datatype *datatype)
 21: {
 22:   PetscBLASInt i,n=*len;
 23:   PetscReal    *x = (PetscReal*)in,*y = (PetscReal*)inout;

 26:   if (PetscUnlikely(*datatype!=MPIU_REAL)) {
 27:     (*PetscErrorPrintf)("Only implemented for MPIU_REAL data type");
 28:     MPI_Abort(MPI_COMM_WORLD,1);
 29:   }
 30:   for (i=0;i<n;i++) y[i] = SlepcAbs(x[i],y[i]);
 31:   PetscFunctionReturnVoid();
 32: }

/*
    Compute ||A|| for an mxn dense matrix A (column-major, leading dimension m).

    type selects NORM_FROBENIUS/NORM_2, NORM_1 (max column sum) or
    NORM_INFINITY (max row sum). If mpi is PETSC_TRUE, A holds only this
    process's rows and the local contributions are reduced over the BV's
    communicator to produce the global norm in *nrm.
    NOTE(review): other NormType values leave *nrm unchanged.
*/
PetscErrorCode BVNorm_LAPACK_Private(BV bv,PetscInt m_,PetscInt n_,const PetscScalar *A,NormType type,PetscReal *nrm,PetscBool mpi)
{
  PetscBLASInt   m,n,i,j;
  PetscMPIInt    len;
  PetscReal      lnrm,*rwork=NULL,*rwork2=NULL;

  /* disable FP traps around LAPACK calls, which may raise harmless exceptions */
  PetscFPTrapPush(PETSC_FP_TRAP_OFF);
  PetscBLASIntCast(m_,&m);
  PetscBLASIntCast(n_,&n);
  if (type==NORM_FROBENIUS || type==NORM_2) {
    /* local Frobenius norm; lange with "F" ignores the work array */
    lnrm = LAPACKlange_("F",&m,&n,(PetscScalar*)A,&m,rwork);
    if (mpi) {
      /* MPIU_LAPY2 combines local values as sqrt(x^2+y^2) */
      MPIU_Allreduce(&lnrm,nrm,1,MPIU_REAL,MPIU_LAPY2,PetscObjectComm((PetscObject)bv));
    } else *nrm = lnrm;
    PetscLogFlops(2.0*m*n);
  } else if (type==NORM_1) {
    if (mpi) {
      /* column sums must be accumulated across processes before taking the max,
         so compute them by hand instead of calling lange */
      BVAllocateWork_Private(bv,2*n_);
      rwork = (PetscReal*)bv->work;
      rwork2 = rwork+n_;
      PetscArrayzero(rwork,n_);
      PetscArrayzero(rwork2,n_);
      for (j=0;j<n_;j++) {
        for (i=0;i<m_;i++) {
          rwork[j] += PetscAbsScalar(A[i+j*m_]);
        }
      }
      PetscMPIIntCast(n_,&len);
      /* sum the per-column absolute sums, then take the largest one */
      MPIU_Allreduce(rwork,rwork2,len,MPIU_REAL,MPIU_SUM,PetscObjectComm((PetscObject)bv));
      *nrm = 0.0;
      for (j=0;j<n_;j++) if (rwork2[j] > *nrm) *nrm = rwork2[j];
    } else {
      /* "O" (one-norm) does not use the work array either */
      *nrm = LAPACKlange_("O",&m,&n,(PetscScalar*)A,&m,rwork);
    }
    PetscLogFlops(1.0*m*n);
  } else if (type==NORM_INFINITY) {
    /* lange "I" needs a real work array of length m for the row sums */
    BVAllocateWork_Private(bv,m_);
    rwork = (PetscReal*)bv->work;
    lnrm = LAPACKlange_("I",&m,&n,(PetscScalar*)A,&m,rwork);
    if (mpi) {
      /* rows are distributed, so the global max row sum is the max of local ones */
      MPIU_Allreduce(&lnrm,nrm,1,MPIU_REAL,MPIU_MAX,PetscObjectComm((PetscObject)bv));
    } else *nrm = lnrm;
    PetscLogFlops(1.0*m*n);
  }
  PetscFPTrapPop();
  return(0);
}

/*
    Normalize the columns of an mxn dense matrix A (column-major, local rows
    only when mpi is PETSC_TRUE, in which case norms are reduced over the BV's
    communicator).

    In real arithmetic, eigi (if non-NULL) marks columns belonging to
    2-column blocks (eigi[j] != 0): the two columns share a single joint norm
    and are scaled together — presumably complex-conjugate eigenvector pairs
    stored as (real,imag); TODO confirm against callers.
*/
PetscErrorCode BVNormalize_LAPACK_Private(BV bv,PetscInt m_,PetscInt n_,const PetscScalar *A,PetscScalar *eigi,PetscBool mpi)
{
  PetscBLASInt   m,j,k,info,zero=0;
  PetscMPIInt    len;
  PetscReal      *norms,*rwork=NULL,*rwork2=NULL,done=1.0;

  PetscFPTrapPush(PETSC_FP_TRAP_OFF);
  PetscBLASIntCast(m_,&m);
  /* workspace: rwork holds local norms, rwork2 the reduced (global) ones */
  BVAllocateWork_Private(bv,2*n_);
  rwork = (PetscReal*)bv->work;
  rwork2 = rwork+n_;
  /* compute local norms */
  for (j=0;j<n_;j++) {
    k = 1;                                /* number of columns in this block */
#if !defined(PETSC_USE_COMPLEX)
    if (eigi && eigi[j] != 0.0) k = 2;    /* treat the pair as one block */
#endif
    /* Frobenius norm of the m x k block starting at column j */
    rwork[j] = LAPACKlange_("F",&m,&k,(PetscScalar*)(A+j*m_),&m,rwork2);
    if (k==2) { rwork[j+1] = rwork[j]; j++; }   /* both columns get the joint norm */
  }
  /* reduction to get global norms */
  if (mpi) {
    PetscMPIIntCast(n_,&len);
    PetscArrayzero(rwork2,n_);
    /* MPIU_LAPY2 combines per-process norms as sqrt(x^2+y^2) */
    MPIU_Allreduce(rwork,rwork2,len,MPIU_REAL,MPIU_LAPY2,PetscObjectComm((PetscObject)bv));
    norms = rwork2;
  } else norms = rwork;
  /* scale columns */
  for (j=0;j<n_;j++) {
    k = 1;
#if !defined(PETSC_USE_COMPLEX)
    if (eigi && eigi[j] != 0.0) k = 2;
#endif
    /* lascl multiplies the m x k block by done/norms[j] (cfrom=norms[j], cto=1) */
    PetscStackCallBLAS("LAPACKlascl",LAPACKlascl_("G",&zero,&zero,norms+j,&done,&m,&k,(PetscScalar*)(A+j*m_),&m,&info));
    SlepcCheckLapackInfo("lascl",info);
    if (k==2) j++;    /* skip second column of a pair, already scaled */
  }
  PetscLogFlops(3.0*m*n_);
  PetscFPTrapPop();
  return(0);
}

/*
   Compute the upper Cholesky factor in R and its inverse in S.
   If S == R then the inverse overwrites the Cholesky factor.

   Only the active block, rows/columns bv->l .. bv->k-1, is processed.
   If the first Cholesky factorization fails (matrix not numerically SPD),
   the factorization is retried after adding a small diagonal perturbation.
*/
PetscErrorCode BVMatCholInv_LAPACK_Private(BV bv,Mat R,Mat S)
{
  PetscInt       i,k,l,n,m,ld,lds;
  PetscScalar    *pR,*pS;
  PetscBLASInt   info,n_ = 0,m_ = 0,ld_,lds_;

  l = bv->l;
  k = bv->k;
  MatGetSize(R,&m,NULL);
  n = k-l;                 /* size of the active block */
  PetscBLASIntCast(m,&m_);
  PetscBLASIntCast(n,&n_);
  ld  = m;                 /* leading dimension of R */
  ld_ = m_;
  MatDenseGetArray(R,&pR);

  if (S==R) {
    /* in-place: use BV workspace as scratch copy of the original block */
    BVAllocateWork_Private(bv,m*k);
    pS = bv->work;
    lds = ld;
    lds_ = ld_;
  } else {
    MatDenseGetArray(S,&pS);
    MatGetSize(S,&lds,NULL);
    PetscBLASIntCast(lds,&lds_);
  }

  /* save a copy of matrix in S (needed to restore R if potrf fails) */
  for (i=l;i<k;i++) {
    PetscArraycpy(pS+i*lds+l,pR+i*ld+l,n);
  }

  /* compute upper Cholesky factor in R */
  PetscFPTrapPush(PETSC_FP_TRAP_OFF);
  PetscStackCallBLAS("LAPACKpotrf",LAPACKpotrf_("U",&n_,pR+l*ld+l,&ld_,&info));
  PetscLogFlops((1.0*n*n*n)/3.0);

  if (info) {  /* LAPACKpotrf failed, retry on diagonally perturbed matrix */
    for (i=l;i<k;i++) {
      /* restore the saved block and nudge the diagonal to make it SPD */
      PetscArraycpy(pR+i*ld+l,pS+i*lds+l,n);
      pR[i+i*ld] += 50.0*PETSC_MACHINE_EPSILON;
    }
    PetscStackCallBLAS("LAPACKpotrf",LAPACKpotrf_("U",&n_,pR+l*ld+l,&ld_,&info));
    SlepcCheckLapackInfo("potrf",info);
    PetscLogFlops((1.0*n*n*n)/3.0);
  }

  /* compute S = inv(R) */
  if (S==R) {
    /* triangular inverse overwrites the Cholesky factor */
    PetscStackCallBLAS("LAPACKtrtri",LAPACKtrtri_("U","N",&n_,pR+l*ld+l,&ld_,&info));
  } else {
    /* clear target columns, copy the factor there, invert out of place */
    PetscArrayzero(pS+l*lds,(k-l)*k);
    for (i=l;i<k;i++) {
      PetscArraycpy(pS+i*lds+l,pR+i*ld+l,n);
    }
    PetscStackCallBLAS("LAPACKtrtri",LAPACKtrtri_("U","N",&n_,pS+l*lds+l,&lds_,&info));
  }
  SlepcCheckLapackInfo("trtri",info);
  PetscFPTrapPop();
  PetscLogFlops(0.33*n*n*n);

  /* Zero out entries below the diagonal (columns l..k-2, rows i+1..k-1) */
  for (i=l;i<k-1;i++) {
    PetscArrayzero(pR+i*ld+i+1,(k-i-1));
    if (S!=R) { PetscArrayzero(pS+i*lds+i+1,(k-i-1)); }
  }
  MatDenseRestoreArray(R,&pR);
  if (S!=R) { MatDenseRestoreArray(S,&pS); }
  return(0);
}

211: /*
212:    Compute the inverse of an upper triangular matrix R, store it in S.
213:    If S == R then the inverse overwrites R.
214:  */
215: PetscErrorCode BVMatTriInv_LAPACK_Private(BV bv,Mat R,Mat S)
216: {
218:   PetscInt       i,k,l,n,m,ld,lds;
219:   PetscScalar    *pR,*pS;
220:   PetscBLASInt   info,n_,m_ = 0,ld_,lds_;

223:   l = bv->l;
224:   k = bv->k;
225:   MatGetSize(R,&m,NULL);
226:   n = k-l;
227:   PetscBLASIntCast(m,&m_);
228:   PetscBLASIntCast(n,&n_);
229:   ld  = m;
230:   ld_ = m_;
231:   MatDenseGetArray(R,&pR);

233:   if (S==R) {
234:     BVAllocateWork_Private(bv,m*k);
235:     pS = bv->work;
236:     lds = ld;
237:     lds_ = ld_;
238:   } else {
239:     MatDenseGetArray(S,&pS);
240:     MatGetSize(S,&lds,NULL);
241:     PetscBLASIntCast(lds,&lds_);
242:   }

244:   /* compute S = inv(R) */
245:   PetscFPTrapPush(PETSC_FP_TRAP_OFF);
246:   if (S==R) {
247:     PetscStackCallBLAS("LAPACKtrtri",LAPACKtrtri_("U","N",&n_,pR+l*ld+l,&ld_,&info));
248:   } else {
249:     PetscArrayzero(pS+l*lds,(k-l)*k);
250:     for (i=l;i<k;i++) {
251:       PetscArraycpy(pS+i*lds+l,pR+i*ld+l,n);
252:     }
253:     PetscStackCallBLAS("LAPACKtrtri",LAPACKtrtri_("U","N",&n_,pS+l*lds+l,&lds_,&info));
254:   }
255:   SlepcCheckLapackInfo("trtri",info);
256:   PetscFPTrapPop();
257:   PetscLogFlops(0.33*n*n*n);

259:   MatDenseRestoreArray(R,&pR);
260:   if (S!=R) { MatDenseRestoreArray(S,&pS); }
261:   return(0);
262: }

/*
   Compute the matrix to be used for post-multiplying the basis in the SVQB
   block orthogonalization method.
   On input R = V'*V, on output S = D*U*Lambda^{-1/2} where (U,Lambda) is
   the eigendecomposition of D*R*D with D=diag(R)^{-1/2}.
   If S == R then the result overwrites R.

   Only the active block, rows/columns bv->l .. bv->k-1, is processed.
   When S != R, R is additionally overwritten with inv(S).
*/
PetscErrorCode BVMatSVQB_LAPACK_Private(BV bv,Mat R,Mat S)
{
  PetscInt       i,j,k,l,n,m,ld,lds;
  PetscScalar    *pR,*pS,*D,*work,a;
  PetscReal      *eig,dummy;
  PetscBLASInt   info,lwork,n_,m_ = 0,ld_,lds_;
#if defined(PETSC_USE_COMPLEX)
  PetscReal      *rwork,rdummy;
#endif

  l = bv->l;
  k = bv->k;
  MatGetSize(R,&m,NULL);
  n = k-l;                 /* size of the active block */
  PetscBLASIntCast(m,&m_);
  PetscBLASIntCast(n,&n_);
  ld  = m;                 /* leading dimension of R */
  ld_ = m_;
  MatDenseGetArray(R,&pR);

  if (S==R) {
    pS = pR;               /* in-place: operate directly on R's array */
    lds = ld;
    lds_ = ld_;
  } else {
    MatDenseGetArray(S,&pS);
    MatGetSize(S,&lds,NULL);
    PetscBLASIntCast(lds,&lds_);
  }
  PetscFPTrapPush(PETSC_FP_TRAP_OFF);

  /* workspace query and memory allocation */
  lwork = -1;              /* lwork=-1: syev only returns optimal size in a */
#if defined(PETSC_USE_COMPLEX)
  PetscStackCallBLAS("LAPACKsyev",LAPACKsyev_("V","L",&n_,pS,&lds_,&dummy,&a,&lwork,&rdummy,&info));
  PetscBLASIntCast((PetscInt)PetscRealPart(a),&lwork);
  PetscMalloc4(n,&eig,n,&D,lwork,&work,PetscMax(1,3*n-2),&rwork);
#else
  PetscStackCallBLAS("LAPACKsyev",LAPACKsyev_("V","L",&n_,pS,&lds_,&dummy,&a,&lwork,&info));
  PetscBLASIntCast((PetscInt)a,&lwork);
  PetscMalloc3(n,&eig,n,&D,lwork,&work);
#endif

  /* copy and scale matrix: S = D*R*D with D = diag(R)^{-1/2} */
  for (i=l;i<k;i++) D[i-l] = 1.0/PetscSqrtReal(PetscRealPart(pR[i+i*ld]));
  for (i=l;i<k;i++) for (j=l;j<k;j++) pS[i+j*lds] = pR[i+j*ld]*D[i-l];
  for (j=l;j<k;j++) for (i=l;i<k;i++) pS[i+j*lds] *= D[j-l];

  /* compute eigendecomposition: S is overwritten by the eigenvectors U */
#if defined(PETSC_USE_COMPLEX)
  PetscStackCallBLAS("LAPACKsyev",LAPACKsyev_("V","L",&n_,pS+l*lds+l,&lds_,eig,work,&lwork,rwork,&info));
#else
  PetscStackCallBLAS("LAPACKsyev",LAPACKsyev_("V","L",&n_,pS+l*lds+l,&lds_,eig,work,&lwork,&info));
#endif
  SlepcCheckLapackInfo("syev",info);

  if (S!=R) {   /* R = U' */
    for (i=l;i<k;i++) for (j=l;j<k;j++) pR[i+j*ld] = pS[j+i*lds];
  }

  /* compute S = D*U*Lambda^{-1/2} */
  for (i=l;i<k;i++) for (j=l;j<k;j++) pS[i+j*lds] *= D[i-l];
  for (j=l;j<k;j++) for (i=l;i<k;i++) pS[i+j*lds] /= PetscSqrtReal(eig[j-l]);

  if (S!=R) {   /* compute R = inv(S) = Lambda^{1/2}*U'/D */
    for (i=l;i<k;i++) for (j=l;j<k;j++) pR[i+j*ld] *= PetscSqrtReal(eig[i-l]);
    for (j=l;j<k;j++) for (i=l;i<k;i++) pR[i+j*ld] /= D[j-l];
  }

#if defined(PETSC_USE_COMPLEX)
  PetscFree4(eig,D,work,rwork);
#else
  PetscFree3(eig,D,work);
#endif
  PetscLogFlops(9.0*n*n*n);
  PetscFPTrapPop();

  MatDenseRestoreArray(R,&pR);
  if (S!=R) { MatDenseRestoreArray(S,&pS); }
  return(0);
}

355: /*
356:     QR factorization of an mxn matrix via parallel TSQR
357: */
358: PetscErrorCode BVOrthogonalize_LAPACK_TSQR(BV bv,PetscInt m_,PetscInt n_,PetscScalar *Q,PetscScalar *R,PetscInt ldr)
359: {
361:   PetscInt       level,plevel,nlevels,powtwo,lda,worklen;
362:   PetscBLASInt   m,n,i,j,k,l,s = 0,nb,sz,lwork,info;
363:   PetscScalar    *tau,*work,*A=NULL,*QQ=NULL,*Qhalf,*C=NULL,one=1.0,zero=0.0;
364:   PetscMPIInt    rank,size,count,stride;
365:   MPI_Datatype   tmat;

368:   PetscFPTrapPush(PETSC_FP_TRAP_OFF);
369:   PetscBLASIntCast(m_,&m);
370:   PetscBLASIntCast(n_,&n);
371:   k  = PetscMin(m,n);
372:   nb = 16;
373:   lda = 2*n;
374:   MPI_Comm_size(PetscObjectComm((PetscObject)bv),&size);CHKERRMPI(ierr);
375:   MPI_Comm_rank(PetscObjectComm((PetscObject)bv),&rank);CHKERRMPI(ierr);
376:   nlevels = (PetscInt)PetscCeilReal(PetscLog2Real((PetscReal)size));
377:   powtwo  = PetscPowInt(2,(PetscInt)PetscFloorReal(PetscLog2Real((PetscReal)size)));
378:   worklen = n+n*nb;
379:   if (nlevels) worklen += n*lda+n*lda*nlevels+n*lda;
380:   BVAllocateWork_Private(bv,worklen);
381:   tau  = bv->work;
382:   work = bv->work+n;
383:   PetscBLASIntCast(n*nb,&lwork);
384:   if (nlevels) {
385:     A  = bv->work+n+n*nb;
386:     QQ = bv->work+n+n*nb+n*lda;
387:     C  = bv->work+n+n*nb+n*lda+n*lda*nlevels;
388:   }

390:   /* Compute QR */
391:   PetscStackCallBLAS("LAPACKgeqrf",LAPACKgeqrf_(&m,&n,Q,&m,tau,work,&lwork,&info));
392:   SlepcCheckLapackInfo("geqrf",info);

394:   /* Extract R */
395:   if (R || nlevels) {
396:     for (j=0;j<n;j++) {
397:       for (i=0;i<=PetscMin(j,m-1);i++) {
398:         if (nlevels) A[i+j*lda] = Q[i+j*m];
399:         else R[i+j*ldr] = Q[i+j*m];
400:       }
401:       for (i=PetscMin(j,m-1)+1;i<n;i++) {
402:         if (nlevels) A[i+j*lda] = 0.0;
403:         else R[i+j*ldr] = 0.0;
404:       }
405:     }
406:   }

408:   /* Compute orthogonal matrix in Q */
409:   PetscStackCallBLAS("LAPACKorgqr",LAPACKorgqr_(&m,&k,&k,Q,&m,tau,work,&lwork,&info));
410:   SlepcCheckLapackInfo("orgqr",info);

412:   if (nlevels) {

414:     PetscMPIIntCast(n,&count);
415:     PetscMPIIntCast(lda,&stride);
416:     PetscBLASIntCast(lda,&l);
417:     MPI_Type_vector(count,count,stride,MPIU_SCALAR,&tmat);CHKERRMPI(ierr);
418:     MPI_Type_commit(&tmat);CHKERRMPI(ierr);

420:     for (level=nlevels;level>=1;level--) {

422:       plevel = PetscPowInt(2,level);
423:       PetscBLASIntCast(plevel*PetscFloorReal(rank/(PetscReal)plevel)+(rank+PetscPowInt(2,level-1))%plevel,&s);

425:       /* Stack triangular matrices */
426:       if (rank<s && s<size) {  /* send top part, receive bottom part */
427:         MPI_Sendrecv(A,1,tmat,s,111,A+n,1,tmat,s,111,PetscObjectComm((PetscObject)bv),MPI_STATUS_IGNORE);CHKERRMPI(ierr);
428:       } else if (s<size) {  /* copy top to bottom, receive top part */
429:         MPI_Sendrecv(A,1,tmat,rank,111,A+n,1,tmat,rank,111,PetscObjectComm((PetscObject)bv),MPI_STATUS_IGNORE);CHKERRMPI(ierr);
430:         MPI_Sendrecv(A+n,1,tmat,s,111,A,1,tmat,s,111,PetscObjectComm((PetscObject)bv),MPI_STATUS_IGNORE);CHKERRMPI(ierr);
431:       }
432:       if (level<nlevels && size!=powtwo) {  /* for cases when size is not a power of 2 */
433:         if (rank<size-powtwo) {  /* send bottom part */
434:           MPI_Send(A+n,1,tmat,rank+powtwo,111,PetscObjectComm((PetscObject)bv));CHKERRMPI(ierr);
435:         } else if (rank>=powtwo) {  /* receive bottom part */
436:           MPI_Recv(A+n,1,tmat,rank-powtwo,111,PetscObjectComm((PetscObject)bv),MPI_STATUS_IGNORE);CHKERRMPI(ierr);
437:         }
438:       }
439:       /* Compute QR and build orthogonal matrix */
440:       if (level<nlevels || (level==nlevels && s<size)) {
441:         PetscStackCallBLAS("LAPACKgeqrf",LAPACKgeqrf_(&l,&n,A,&l,tau,work,&lwork,&info));
442:         SlepcCheckLapackInfo("geqrf",info);
443:         PetscArraycpy(QQ+(level-1)*n*lda,A,n*lda);
444:         PetscStackCallBLAS("LAPACKorgqr",LAPACKorgqr_(&l,&n,&n,QQ+(level-1)*n*lda,&l,tau,work,&lwork,&info));
445:         SlepcCheckLapackInfo("orgqr",info);
446:         for (j=0;j<n;j++) {
447:           for (i=j+1;i<n;i++) A[i+j*lda] = 0.0;
448:         }
449:       } else if (level==nlevels) {  /* only one triangular matrix, set Q=I */
450:         PetscArrayzero(QQ+(level-1)*n*lda,n*lda);
451:         for (j=0;j<n;j++) QQ[j+j*lda+(level-1)*n*lda] = 1.0;
452:       }
453:     }

455:     /* Extract R */
456:     if (R) {
457:       for (j=0;j<n;j++) {
458:         for (i=0;i<=j;i++) R[i+j*ldr] = A[i+j*lda];
459:         for (i=j+1;i<n;i++) R[i+j*ldr] = 0.0;
460:       }
461:     }

463:     /* Accumulate orthogonal matrices */
464:     for (level=1;level<=nlevels;level++) {
465:       plevel = PetscPowInt(2,level);
466:       PetscBLASIntCast(plevel*PetscFloorReal(rank/(PetscReal)plevel)+(rank+PetscPowInt(2,level-1))%plevel,&s);
467:       Qhalf = (rank<s)? QQ+(level-1)*n*lda: QQ+(level-1)*n*lda+n;
468:       if (level<nlevels) {
469:         PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&l,&n,&n,&one,QQ+level*n*lda,&l,Qhalf,&l,&zero,C,&l));
470:         PetscArraycpy(QQ+level*n*lda,C,n*lda);
471:       } else {
472:         for (i=0;i<m/l;i++) {
473:           PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&l,&n,&n,&one,Q+i*l,&m,Qhalf,&l,&zero,C,&l));
474:           for (j=0;j<n;j++) { PetscArraycpy(Q+i*l+j*m,C+j*l,l); }
475:         }
476:         sz = m%l;
477:         if (sz) {
478:           PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&sz,&n,&n,&one,Q+(m/l)*l,&m,Qhalf,&l,&zero,C,&l));
479:           for (j=0;j<n;j++) { PetscArraycpy(Q+(m/l)*l+j*m,C+j*l,sz); }
480:         }
481:       }
482:     }

484:     MPI_Type_free(&tmat);CHKERRMPI(ierr);
485:   }

487:   PetscLogFlops(3.0*m*n*n);
488:   PetscFPTrapPop();
489:   return(0);
490: }

492: /*
493:     Reduction operation to compute [~,Rout]=qr([Rin1;Rin2]) in the TSQR algorithm;
494:     all matrices are upper triangular stored in packed format
495: */
496: SLEPC_EXTERN void MPIAPI SlepcGivensPacked(void *in,void *inout,PetscMPIInt *len,MPI_Datatype *datatype)
497: {
499:   PetscBLASInt   n,i,j,k,one=1;
500:   PetscMPIInt    tsize;
501:   PetscScalar    v,s,*R2=(PetscScalar*)in,*R1=(PetscScalar*)inout;
502:   PetscReal      c;

505:   MPI_Type_size(*datatype,&tsize);CHKERRABORT(PETSC_COMM_SELF,ierr);  /* we assume len=1 */
506:   tsize /= sizeof(PetscScalar);
507:   n = (-1+(PetscBLASInt)PetscSqrtReal(1+8*tsize))/2;
508:   for (j=0;j<n;j++) {
509:     for (i=0;i<=j;i++) {
510:       LAPACKlartg_(R1+(2*n-j-1)*j/2+j,R2+(2*n-i-1)*i/2+j,&c,&s,&v);
511:       R1[(2*n-j-1)*j/2+j] = v;
512:       k = n-j-1;
513:       if (k) BLASrot_(&k,R1+(2*n-j-1)*j/2+j+1,&one,R2+(2*n-i-1)*i/2+j+1,&one,&c,&s);
514:     }
515:   }
516:   PetscFunctionReturnVoid();
517: }

519: /*
520:     Computes the R factor of the QR factorization of an mxn matrix via parallel TSQR
521: */
522: PetscErrorCode BVOrthogonalize_LAPACK_TSQR_OnlyR(BV bv,PetscInt m_,PetscInt n_,PetscScalar *Q,PetscScalar *R,PetscInt ldr)
523: {
525:   PetscInt       worklen;
526:   PetscBLASInt   m,n,i,j,s,nb,lwork,info;
527:   PetscScalar    *tau,*work,*A=NULL,*R1=NULL,*R2=NULL;
528:   PetscMPIInt    size,count;
529:   MPI_Datatype   tmat;

532:   PetscFPTrapPush(PETSC_FP_TRAP_OFF);
533:   PetscBLASIntCast(m_,&m);
534:   PetscBLASIntCast(n_,&n);
535:   nb = 16;
536:   s  = n+n*(n-1)/2;  /* length of packed triangular matrix */
537:   MPI_Comm_size(PetscObjectComm((PetscObject)bv),&size);CHKERRMPI(ierr);
538:   worklen = n+n*nb+2*s+m*n;
539:   BVAllocateWork_Private(bv,worklen);
540:   tau  = bv->work;
541:   work = bv->work+n;
542:   R1   = bv->work+n+n*nb;
543:   R2   = bv->work+n+n*nb+s;
544:   A    = bv->work+n+n*nb+2*s;
545:   PetscBLASIntCast(n*nb,&lwork);
546:   PetscArraycpy(A,Q,m*n);

548:   /* Compute QR */
549:   PetscStackCallBLAS("LAPACKgeqrf",LAPACKgeqrf_(&m,&n,A,&m,tau,work,&lwork,&info));
550:   SlepcCheckLapackInfo("geqrf",info);

552:   if (size==1) {
553:     /* Extract R */
554:     for (j=0;j<n;j++) {
555:       for (i=0;i<=PetscMin(j,m-1);i++) R[i+j*ldr] = A[i+j*m];
556:       for (i=PetscMin(j,m-1)+1;i<n;i++) R[i+j*ldr] = 0.0;
557:     }
558:   } else {
559:     /* Use MPI reduction operation to obtain global R */
560:     PetscMPIIntCast(s,&count);
561:     MPI_Type_contiguous(count,MPIU_SCALAR,&tmat);CHKERRMPI(ierr);
562:     MPI_Type_commit(&tmat);CHKERRMPI(ierr);
563:     for (i=0;i<n;i++) {
564:       for (j=i;j<n;j++) R1[(2*n-i-1)*i/2+j] = (i<m)?A[i+j*m]:0.0;
565:     }
566:     MPIU_Allreduce(R1,R2,1,tmat,MPIU_TSQR,PetscObjectComm((PetscObject)bv));
567:     for (i=0;i<n;i++) {
568:       for (j=0;j<i;j++) R[i+j*ldr] = 0.0;
569:       for (j=i;j<n;j++) R[i+j*ldr] = R2[(2*n-i-1)*i/2+j];
570:     }
571:     MPI_Type_free(&tmat);CHKERRMPI(ierr);
572:   }

574:   PetscLogFlops(3.0*m*n*n);
575:   PetscFPTrapPop();
576:   return(0);
577: }