Actual source code: fnutil.c (slepc-3.15.0, 2021-03-31)
/*
   - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
   SLEPc - Scalable Library for Eigenvalue Problem Computations
   Copyright (c) 2002-2021, Universitat Politecnica de Valencia, Spain

   This file is part of SLEPc.
   SLEPc is distributed under a 2-clause BSD license (see LICENSE).
   - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
*/
/*
   Utility subroutines common to several impls
*/

#include <slepc/private/fnimpl.h>
#include <slepcblaslapack.h>
/*
   Compute the square root of an upper quasi-triangular matrix T,
   using Higham's algorithm (LAA 88, 1987). T is overwritten with sqrtm(T).
*/
static PetscErrorCode SlepcMatDenseSqrt(PetscBLASInt n,PetscScalar *T,PetscBLASInt ld)
{
  PetscScalar  one=1.0,mone=-1.0;
  PetscReal    scal;
  PetscBLASInt i,j,si,sj,r,ione=1,info;
#if !defined(PETSC_USE_COMPLEX)
  PetscReal    alpha,theta,mu,mu2;
#endif

  PetscFunctionBegin;
  for (j=0;j<n;j++) {
#if defined(PETSC_USE_COMPLEX)
    sj = 1;
    T[j+j*ld] = PetscSqrtScalar(T[j+j*ld]);
#else
    sj = (j==n-1 || T[j+1+j*ld] == 0.0)? 1: 2;
    if (sj==1) {
      if (T[j+j*ld]<0.0) SETERRQ(PETSC_COMM_SELF,1,"Matrix has a real negative eigenvalue, no real primary square root exists");
      T[j+j*ld] = PetscSqrtReal(T[j+j*ld]);
    } else {
      /* square root of 2x2 block */
      theta = (T[j+j*ld]+T[j+1+(j+1)*ld])/2.0;
      mu    = (T[j+j*ld]-T[j+1+(j+1)*ld])/2.0;
      mu2   = -mu*mu-T[j+1+j*ld]*T[j+(j+1)*ld];
      mu    = PetscSqrtReal(mu2);
      if (theta>0.0) alpha = PetscSqrtReal((theta+PetscSqrtReal(theta*theta+mu2))/2.0);
      else alpha = mu/PetscSqrtReal(2.0*(-theta+PetscSqrtReal(theta*theta+mu2)));
      T[j+j*ld]       /= 2.0*alpha;
      T[j+1+(j+1)*ld] /= 2.0*alpha;
      T[j+(j+1)*ld]   /= 2.0*alpha;
      T[j+1+j*ld]     /= 2.0*alpha;
      T[j+j*ld]       += alpha-theta/(2.0*alpha);
      T[j+1+(j+1)*ld] += alpha-theta/(2.0*alpha);
    }
#endif
    for (i=j-1;i>=0;i--) {
#if defined(PETSC_USE_COMPLEX)
      si = 1;
#else
      si = (i==0 || T[i+(i-1)*ld] == 0.0)? 1: 2;
      if (si==2) i--;
#endif
      /* solve Sylvester equation of order si x sj */
      r = j-i-si;
      if (r) PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&si,&sj,&r,&mone,T+i+(i+si)*ld,&ld,T+i+si+j*ld,&ld,&one,T+i+j*ld,&ld));
      PetscStackCallBLAS("LAPACKtrsyl",LAPACKtrsyl_("N","N",&ione,&si,&sj,T+i+i*ld,&ld,T+j+j*ld,&ld,T+i+j*ld,&ld,&scal,&info));
      SlepcCheckLapackInfo("trsyl",info);
      if (scal!=1.0) SETERRQ1(PETSC_COMM_SELF,1,"Current implementation cannot handle scale factor %g",scal);
    }
    if (sj==2) j++;
  }
  PetscFunctionReturn(0);
}
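
/*
   Worked example (for illustration; not part of the library): Higham's 2x2
   formulas above, applied to the block B = [0 1; -1 0], whose eigenvalues
   are +/-i.  Here theta = 0, mu2 = 1, mu = 1, and since theta <= 0,
   alpha = mu/sqrt(2*(-theta+sqrt(theta^2+mu2))) = 1/sqrt(2).  The block is
   then updated to R = B/(2*alpha) + (alpha - theta/(2*alpha))*I, i.e.

       R = (1/sqrt(2)) * [1 1; -1 1]

   and indeed R^2 = (1/2) * [0 2; -2 0] = B, the principal square root.
*/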

#define BLOCKSIZE 64

/*
   Schur method for the square root of an upper quasi-triangular matrix T.
   T is overwritten with sqrtm(T).
   If firstonly then only the first column of T will contain relevant values.
*/
PetscErrorCode FNSqrtmSchur(FN fn,PetscBLASInt n,PetscScalar *T,PetscBLASInt ld,PetscBool firstonly)
{
  PetscErrorCode ierr;
  PetscBLASInt   i,j,k,r,ione=1,sdim,lwork,*s,*p,info,bs=BLOCKSIZE;
  PetscScalar    *wr,*W,*Q,*work,one=1.0,zero=0.0,mone=-1.0;
  PetscInt       m,nblk;
  PetscReal      scal;
#if defined(PETSC_USE_COMPLEX)
  PetscReal      *rwork;
#else
  PetscReal      *wi;
#endif

  PetscFunctionBegin;
  m     = n;
  nblk  = (m+bs-1)/bs;
  lwork = 5*n;
  k     = firstonly? 1: n;

  /* compute Schur decomposition A*Q = Q*T */
#if !defined(PETSC_USE_COMPLEX)
  ierr = PetscMalloc7(m,&wr,m,&wi,m*k,&W,m*m,&Q,lwork,&work,nblk,&s,nblk,&p);CHKERRQ(ierr);
  PetscStackCallBLAS("LAPACKgees",LAPACKgees_("V","N",NULL,&n,T,&ld,&sdim,wr,wi,Q,&ld,work,&lwork,NULL,&info));
#else
  ierr = PetscMalloc7(m,&wr,m,&rwork,m*k,&W,m*m,&Q,lwork,&work,nblk,&s,nblk,&p);CHKERRQ(ierr);
  PetscStackCallBLAS("LAPACKgees",LAPACKgees_("V","N",NULL,&n,T,&ld,&sdim,wr,Q,&ld,work,&lwork,rwork,NULL,&info));
#endif
  SlepcCheckLapackInfo("gees",info);

  /* determine block sizes and positions, to avoid cutting 2x2 blocks */
  j = 0;
  p[j] = 0;
  do {
    s[j] = PetscMin(bs,n-p[j]);
#if !defined(PETSC_USE_COMPLEX)
    if (p[j]+s[j]!=n && T[p[j]+s[j]+(p[j]+s[j]-1)*ld]!=0.0) s[j]++;
#endif
    if (p[j]+s[j]==n) break;
    j++;
    p[j] = p[j-1]+s[j-1];
  } while (1);
  nblk = j+1;

  for (j=0;j<nblk;j++) {
    /* evaluate f(T_jj) */
    ierr = SlepcMatDenseSqrt(s[j],T+p[j]+p[j]*ld,ld);CHKERRQ(ierr);
    for (i=j-1;i>=0;i--) {
      /* solve Sylvester equation for block (i,j) */
      r = p[j]-p[i]-s[i];
      if (r) PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",s+i,s+j,&r,&mone,T+p[i]+(p[i]+s[i])*ld,&ld,T+p[i]+s[i]+p[j]*ld,&ld,&one,T+p[i]+p[j]*ld,&ld));
      PetscStackCallBLAS("LAPACKtrsyl",LAPACKtrsyl_("N","N",&ione,s+i,s+j,T+p[i]+p[i]*ld,&ld,T+p[j]+p[j]*ld,&ld,T+p[i]+p[j]*ld,&ld,&scal,&info));
      SlepcCheckLapackInfo("trsyl",info);
      if (scal!=1.0) SETERRQ1(PETSC_COMM_SELF,1,"Current implementation cannot handle scale factor %g",scal);
    }
  }

  /* backtransform B = Q*T*Q' */
  PetscStackCallBLAS("BLASgemm",BLASgemm_("N","C",&n,&k,&n,&one,T,&ld,Q,&ld,&zero,W,&ld));
  PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&k,&n,&one,Q,&ld,W,&ld,&zero,T,&ld));

  /* flop count: Schur decomposition, triangular square root, and backtransform */
  ierr = PetscLogFlops(25.0*n*n*n+n*n*n/3.0+4.0*n*n*k);CHKERRQ(ierr);

#if !defined(PETSC_USE_COMPLEX)
  ierr = PetscFree7(wr,wi,W,Q,work,s,p);CHKERRQ(ierr);
#else
  ierr = PetscFree7(wr,rwork,W,Q,work,s,p);CHKERRQ(ierr);
#endif
  PetscFunctionReturn(0);
}
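
/*
   Usage sketch (illustrative only; assumes a dense n x n matrix stored
   column-major in T with leading dimension ld, and a valid FN object fn):

     ierr = FNSqrtmSchur(fn,n,T,ld,PETSC_FALSE);CHKERRQ(ierr);

   overwrites T with its principal square root. Passing PETSC_TRUE instead
   keeps only the first column of the result, which is cheaper when one
   only needs sqrtm(A)*e_1.
*/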

#define DBMAXIT 25

/*
   Computes the principal square root of the matrix T using the product form
   of the Denman-Beavers iteration.
   T is overwritten with sqrtm(T) or inv(sqrtm(T)) depending on flag inv.
*/
PetscErrorCode FNSqrtmDenmanBeavers(FN fn,PetscBLASInt n,PetscScalar *T,PetscBLASInt ld,PetscBool inv)
{
  PetscScalar        *Told,*M=NULL,*invM,*work,work1,prod,alpha;
  PetscScalar        szero=0.0,sone=1.0,smone=-1.0,spfive=0.5,sp25=0.25;
  PetscReal          tol,Mres=0.0,detM,g,reldiff,fnormdiff,fnormT,rwork[1];
  PetscBLASInt       N,i,it,*piv=NULL,info,query=-1,lwork;
  const PetscBLASInt one=1;
  PetscBool          converged=PETSC_FALSE,scale=PETSC_FALSE;
  PetscErrorCode     ierr;
  unsigned int       ftz;

  PetscFunctionBegin;
  N = n*n;
  tol = PetscSqrtReal((PetscReal)n)*PETSC_MACHINE_EPSILON/2;
  ierr = SlepcSetFlushToZero(&ftz);CHKERRQ(ierr);

  /* query work size */
  PetscStackCallBLAS("LAPACKgetri",LAPACKgetri_(&n,M,&ld,piv,&work1,&query,&info));
  ierr = PetscBLASIntCast((PetscInt)PetscRealPart(work1),&lwork);CHKERRQ(ierr);
  ierr = PetscMalloc5(lwork,&work,n,&piv,n*n,&Told,n*n,&M,n*n,&invM);CHKERRQ(ierr);
  ierr = PetscArraycpy(M,T,n*n);CHKERRQ(ierr);

  if (inv) { /* start recurrence with I instead of A */
    ierr = PetscArrayzero(T,n*n);CHKERRQ(ierr);
    for (i=0;i<n;i++) T[i+i*ld] += 1.0;
  }

  for (it=0;it<DBMAXIT && !converged;it++) {

    if (scale) { /* g = (abs(det(M)))^(-1/(2*n)) */
      ierr = PetscArraycpy(invM,M,n*n);CHKERRQ(ierr);
      PetscStackCallBLAS("LAPACKgetrf",LAPACKgetrf_(&n,&n,invM,&ld,piv,&info));
      SlepcCheckLapackInfo("getrf",info);
      prod = invM[0];
      for (i=1;i<n;i++) prod *= invM[i+i*ld];
      detM = PetscAbsScalar(prod);
      g = PetscPowReal(detM,-1.0/(2.0*n));
      alpha = g;
      PetscStackCallBLAS("BLASscal",BLASscal_(&N,&alpha,T,&one));
      alpha = g*g;
      PetscStackCallBLAS("BLASscal",BLASscal_(&N,&alpha,M,&one));
      ierr = PetscLogFlops(2.0*n*n*n/3.0+2.0*n*n);CHKERRQ(ierr);
    }

    ierr = PetscArraycpy(Told,T,n*n);CHKERRQ(ierr);
    ierr = PetscArraycpy(invM,M,n*n);CHKERRQ(ierr);

    PetscStackCallBLAS("LAPACKgetrf",LAPACKgetrf_(&n,&n,invM,&ld,piv,&info));
    SlepcCheckLapackInfo("getrf",info);
    PetscStackCallBLAS("LAPACKgetri",LAPACKgetri_(&n,invM,&ld,piv,work,&lwork,&info));
    SlepcCheckLapackInfo("getri",info);
    ierr = PetscLogFlops(2.0*n*n*n/3.0+4.0*n*n*n/3.0);CHKERRQ(ierr);

    for (i=0;i<n;i++) invM[i+i*ld] += 1.0;
    PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&spfive,Told,&ld,invM,&ld,&szero,T,&ld));
    for (i=0;i<n;i++) invM[i+i*ld] -= 1.0;

    PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&N,&sone,invM,&one,M,&one));
    PetscStackCallBLAS("BLASscal",BLASscal_(&N,&sp25,M,&one));
    for (i=0;i<n;i++) M[i+i*ld] -= 0.5;
    ierr = PetscLogFlops(2.0*n*n*n+2.0*n*n);CHKERRQ(ierr);

    Mres = LAPACKlange_("F",&n,&n,M,&n,rwork);
    for (i=0;i<n;i++) M[i+i*ld] += 1.0;

    if (scale) {
      /* reldiff = norm(T - Told,'fro')/norm(T,'fro') */
      PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&N,&smone,T,&one,Told,&one));
      fnormdiff = LAPACKlange_("F",&n,&n,Told,&n,rwork);
      fnormT    = LAPACKlange_("F",&n,&n,T,&n,rwork);
      ierr = PetscLogFlops(7.0*n*n);CHKERRQ(ierr);
      reldiff = fnormdiff/fnormT;
      ierr = PetscInfo4(fn,"it: %D reldiff: %g scale: %g tol*scale: %g\n",it,(double)reldiff,(double)g,(double)tol*g);CHKERRQ(ierr);
      if (reldiff<1e-2) scale = PETSC_FALSE; /* switch off scaling */
    }

    if (Mres<=tol) converged = PETSC_TRUE;
  }

  if (Mres>tol) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"SQRTM not converged after %d iterations",DBMAXIT);
  ierr = PetscFree5(work,piv,Told,M,invM);CHKERRQ(ierr);
  ierr = SlepcResetFlushToZero(&ftz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
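
/*
   Scalar sanity check (illustrative only): for a 1x1 matrix A = 4, the
   product-form recurrence above reads T_{k+1} = T_k*(1+1/M_k)/2 and
   M_{k+1} = (M_k + 1/M_k)/4 + 1/2, starting from T_0 = M_0 = 4:

     k    T_k         M_k
     0    4           4
     1    2.5         1.5625
     2    2.05        1.050625
     3    2.00061     1.00061
     4    2.0000002   1.0000001

   T_k converges quadratically to sqrt(4) = 2 while M_k -> 1, which is the
   convergence test used above (Mres = ||M-I||_F <= tol).
*/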

#define NSMAXIT 50

/*
   Computes the principal square root of the matrix A using the Newton-Schulz
   iteration. A is overwritten with sqrtm(A) or inv(sqrtm(A)) depending on flag inv.
*/
PetscErrorCode FNSqrtmNewtonSchulz(FN fn,PetscBLASInt n,PetscScalar *A,PetscBLASInt ld,PetscBool inv)
{
  PetscScalar    *Y=A,*Yold,*Z,*Zold,*M;
  PetscScalar    szero=0.0,sone=1.0,smone=-1.0,spfive=0.5,sthree=3.0;
  PetscReal      sqrtnrm,tol,Yres=0.0,nrm,rwork[1],done=1.0;
  PetscBLASInt   info,i,it,N,one=1,zero=0;
  PetscBool      converged=PETSC_FALSE;
  PetscErrorCode ierr;
  unsigned int   ftz;

  PetscFunctionBegin;
  N = n*n;
  tol = PetscSqrtReal((PetscReal)n)*PETSC_MACHINE_EPSILON/2;
  ierr = SlepcSetFlushToZero(&ftz);CHKERRQ(ierr);

  ierr = PetscMalloc4(N,&Yold,N,&Z,N,&Zold,N,&M);CHKERRQ(ierr);

  /* scale A so that ||I-A|| < 1 */
  ierr = PetscArraycpy(Z,A,N);CHKERRQ(ierr);
  for (i=0;i<n;i++) Z[i+i*ld] -= 1.0;
  nrm = LAPACKlange_("fro",&n,&n,Z,&n,rwork);
  sqrtnrm = PetscSqrtReal(nrm);
  PetscStackCallBLAS("LAPACKlascl",LAPACKlascl_("G",&zero,&zero,&nrm,&done,&N,&one,A,&N,&info));
  SlepcCheckLapackInfo("lascl",info);
  tol *= nrm;
  ierr = PetscInfo2(fn,"||I-A||_F = %g, new tol: %g\n",(double)nrm,(double)tol);CHKERRQ(ierr);
  ierr = PetscLogFlops(2.0*n*n);CHKERRQ(ierr);

  /* Z = I */
  ierr = PetscArrayzero(Z,N);CHKERRQ(ierr);
  for (i=0;i<n;i++) Z[i+i*ld] = 1.0;

  for (it=0;it<NSMAXIT && !converged;it++) {
    /* Yold = Y, Zold = Z */
    ierr = PetscArraycpy(Yold,Y,N);CHKERRQ(ierr);
    ierr = PetscArraycpy(Zold,Z,N);CHKERRQ(ierr);

    /* M = (3*I-Zold*Yold) */
    ierr = PetscArrayzero(M,N);CHKERRQ(ierr);
    for (i=0;i<n;i++) M[i+i*ld] = sthree;
    PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&smone,Zold,&ld,Yold,&ld,&sone,M,&ld));

    /* Y = (1/2)*Yold*M, Z = (1/2)*M*Zold */
    PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&spfive,Yold,&ld,M,&ld,&szero,Y,&ld));
    PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&spfive,M,&ld,Zold,&ld,&szero,Z,&ld));

    /* reldiff = norm(Y-Yold,'fro')/norm(Y,'fro') */
    PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&N,&smone,Y,&one,Yold,&one));
    Yres = LAPACKlange_("fro",&n,&n,Yold,&n,rwork);
    if (PetscIsNanReal(Yres)) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_FP,"The computed norm is not-a-number");
    if (Yres<=tol) converged = PETSC_TRUE;
    ierr = PetscInfo2(fn,"it: %D res: %g\n",it,(double)Yres);CHKERRQ(ierr);

    ierr = PetscLogFlops(6.0*n*n*n+2.0*n*n);CHKERRQ(ierr);
  }

  if (Yres>tol) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"SQRTM not converged after %d iterations",NSMAXIT);

  /* undo scaling */
  if (inv) {
    ierr = PetscArraycpy(A,Z,N);CHKERRQ(ierr);
    PetscStackCallBLAS("LAPACKlascl",LAPACKlascl_("G",&zero,&zero,&sqrtnrm,&done,&N,&one,A,&N,&info));
  } else PetscStackCallBLAS("LAPACKlascl",LAPACKlascl_("G",&zero,&zero,&done,&sqrtnrm,&N,&one,A,&N,&info));
  SlepcCheckLapackInfo("lascl",info);

  ierr = PetscFree4(Yold,Z,Zold,M);CHKERRQ(ierr);
  ierr = SlepcResetFlushToZero(&ftz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
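
/*
   Scalar sanity check (illustrative only): for a = 0.5 (already satisfying
   |1-a| < 1, so no scaling is needed) the coupled recurrence above reads
   y_{k+1} = y_k*(3-z_k*y_k)/2, z_{k+1} = (3-z_k*y_k)*z_k/2, with y_0 = a,
   z_0 = 1:

     k    y_k         z_k
     0    0.5         1
     1    0.625       1.25
     2    0.693359    1.386719
     3    0.706706    1.413412

   y_k -> sqrt(0.5) = 0.707107 and z_k -> 1/sqrt(0.5) = 1.414214, which is
   why the routine can return either sqrtm(A) (in Y) or inv(sqrtm(A)) (in Z).
*/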

#if defined(PETSC_HAVE_CUDA)
#include "../src/sys/classes/fn/impls/cuda/fnutilcuda.h"
#include <slepccublas.h>

/*
 * Matrix square root by Newton-Schulz iteration. CUDA version.
 * Computes the principal square root of the matrix A using the
 * Newton-Schulz iteration. A is overwritten with sqrtm(A) or
 * inv(sqrtm(A)) depending on flag inv.
 */
PetscErrorCode FNSqrtmNewtonSchulz_CUDA(FN fn,PetscBLASInt n,PetscScalar *A,PetscBLASInt ld,PetscBool inv)
{
  PetscScalar        *d_A,*d_Yold,*d_Z,*d_Zold,*d_M;
  PetscReal          nrm,sqrtnrm;
  const PetscScalar  szero=0.0,sone=1.0,smone=-1.0,spfive=0.5,sthree=3.0;
  PetscReal          tol,Yres=0.0,alpha;
  PetscInt           it;
  PetscBLASInt       N;
  const PetscBLASInt one=1,zero=0;
  PetscBool          converged=PETSC_FALSE;
  cublasHandle_t     cublasv2handle;
  PetscErrorCode     ierr;
  cublasStatus_t     cberr;
  cudaError_t        cerr;

  PetscFunctionBegin;
  ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
  N = n*n;
  tol = PetscSqrtReal((PetscReal)n)*PETSC_MACHINE_EPSILON/2;

  cerr = cudaMalloc((void **)&d_A,sizeof(PetscScalar)*N);CHKERRCUDA(cerr);
  cerr = cudaMalloc((void **)&d_Yold,sizeof(PetscScalar)*N);CHKERRCUDA(cerr);
  cerr = cudaMalloc((void **)&d_Z,sizeof(PetscScalar)*N);CHKERRCUDA(cerr);
  cerr = cudaMalloc((void **)&d_Zold,sizeof(PetscScalar)*N);CHKERRCUDA(cerr);
  cerr = cudaMalloc((void **)&d_M,sizeof(PetscScalar)*N);CHKERRCUDA(cerr);

  /* Y = A; */
  cerr = cudaMemcpy(d_A,A,sizeof(PetscScalar)*N,cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  /* Z = I; */
  cerr = cudaMemset(d_Z,zero,sizeof(PetscScalar)*N);CHKERRCUDA(cerr);
  ierr = set_diagonal(n,d_Z,ld,sone);CHKERRQ(ierr);

  /* scale A so that ||I-A|| < 1 */
  cberr = cublasXaxpy(cublasv2handle,N,&smone,d_A,one,d_Z,one);CHKERRCUBLAS(cberr);
  cberr = cublasXnrm2(cublasv2handle,N,d_Z,one,&nrm);CHKERRCUBLAS(cberr);
  sqrtnrm = PetscSqrtReal(nrm);
  alpha = 1.0/nrm;
  cberr = cublasXscal(cublasv2handle,N,&alpha,d_A,one);CHKERRCUBLAS(cberr);
  tol *= nrm;
  ierr = PetscInfo2(fn,"||I-A||_F = %g, new tol: %g\n",(double)nrm,(double)tol);CHKERRQ(ierr);
  ierr = PetscLogFlops(2.0*n*n);CHKERRQ(ierr);

  /* Z = I; */
  cerr = cudaMemset(d_Z,zero,sizeof(PetscScalar)*N);CHKERRCUDA(cerr);
  ierr = set_diagonal(n,d_Z,ld,sone);CHKERRQ(ierr);

  for (it=0;it<NSMAXIT && !converged;it++) {
    /* Yold = Y, Zold = Z */
    cerr = cudaMemcpy(d_Yold,d_A,sizeof(PetscScalar)*N,cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(d_Zold,d_Z,sizeof(PetscScalar)*N,cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);

    /* M = (3*I - Zold*Yold) */
    cerr = cudaMemset(d_M,zero,sizeof(PetscScalar)*N);CHKERRCUDA(cerr);
    ierr = set_diagonal(n,d_M,ld,sthree);CHKERRQ(ierr);
    cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&smone,d_Zold,ld,d_Yold,ld,&sone,d_M,ld);CHKERRCUBLAS(cberr);

    /* Y = (1/2) * Yold * M, Z = (1/2) * M * Zold */
    cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&spfive,d_Yold,ld,d_M,ld,&szero,d_A,ld);CHKERRCUBLAS(cberr);
    cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&spfive,d_M,ld,d_Zold,ld,&szero,d_Z,ld);CHKERRCUBLAS(cberr);

    /* reldiff = norm(Y-Yold,'fro')/norm(Y,'fro') */
    cberr = cublasXaxpy(cublasv2handle,N,&smone,d_A,one,d_Yold,one);CHKERRCUBLAS(cberr);
    cberr = cublasXnrm2(cublasv2handle,N,d_Yold,one,&Yres);CHKERRCUBLAS(cberr);
    if (PetscIsNanReal(Yres)) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_FP,"The computed norm is not-a-number");
    if (Yres<=tol) converged = PETSC_TRUE;
    ierr = PetscInfo2(fn,"it: %D res: %g\n",it,(double)Yres);CHKERRQ(ierr);

    ierr = PetscLogFlops(6.0*n*n*n+2.0*n*n);CHKERRQ(ierr);
  }

  if (Yres>tol) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"SQRTM not converged after %d iterations",NSMAXIT);

  /* undo scaling */
  if (inv) {
    sqrtnrm = 1.0/sqrtnrm;
    cberr = cublasXscal(cublasv2handle,N,&sqrtnrm,d_Z,one);CHKERRCUBLAS(cberr);
    cerr = cudaMemcpy(A,d_Z,sizeof(PetscScalar)*N,cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  } else {
    cberr = cublasXscal(cublasv2handle,N,&sqrtnrm,d_A,one);CHKERRCUBLAS(cberr);
    cerr = cudaMemcpy(A,d_A,sizeof(PetscScalar)*N,cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  }

  cerr = cudaFree(d_A);CHKERRCUDA(cerr);
  cerr = cudaFree(d_Yold);CHKERRCUDA(cerr);
  cerr = cudaFree(d_Z);CHKERRCUDA(cerr);
  cerr = cudaFree(d_Zold);CHKERRCUDA(cerr);
  cerr = cudaFree(d_M);CHKERRCUDA(cerr);
  PetscFunctionReturn(0);
}
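
/*
   Note: the residual above is computed with cublasXnrm2 on the length-N
   vectorization of the matrix, which equals the Frobenius norm used by
   LAPACKlange_("fro",...) in the host version, since
   ||A||_F = sqrt(sum_ij |a_ij|^2) = ||vec(A)||_2.
*/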

#if defined(PETSC_HAVE_MAGMA)
#include <slepcmagma.h>

/*
 * Matrix square root by product form of Denman-Beavers iteration. CUDA version.
 * Computes the principal square root of the matrix T using the product form
 * of the Denman-Beavers iteration. T is overwritten with sqrtm(T) or
 * inv(sqrtm(T)) depending on flag inv.
 */
PetscErrorCode FNSqrtmDenmanBeavers_CUDAm(FN fn,PetscBLASInt n,PetscScalar *T,PetscBLASInt ld,PetscBool inv)
{
  PetscScalar    *d_T,*d_Told,*d_M,*d_invM,*d_work,zero=0.0,sone=1.0,smone=-1.0,spfive=0.5,sneg_pfive=-0.5,sp25=0.25,alpha,prod;
  PetscReal      tol,Mres=0.0,detM,g,reldiff,fnormdiff,fnormT;
  PetscInt       i,it,lwork,nb;
  PetscBLASInt   N,one=1,info,*piv=NULL;
  PetscBool      converged=PETSC_FALSE,scale=PETSC_FALSE;
  cublasHandle_t cublasv2handle;
  PetscErrorCode ierr;
  cublasStatus_t cberr;
  cudaError_t    cerr;
  magma_int_t    mierr;

  PetscFunctionBegin;
  ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
  magma_init();
  N = n*n;
  tol = PetscSqrtReal((PetscReal)n)*PETSC_MACHINE_EPSILON/2;

  /* query work size */
  nb = magma_get_xgetri_nb(n);
  lwork = nb*n;
  ierr = PetscMalloc1(n,&piv);CHKERRQ(ierr);
  cerr = cudaMalloc((void **)&d_work,sizeof(PetscScalar)*lwork);CHKERRCUDA(cerr);
  cerr = cudaMalloc((void **)&d_T,sizeof(PetscScalar)*N);CHKERRCUDA(cerr);
  cerr = cudaMalloc((void **)&d_Told,sizeof(PetscScalar)*N);CHKERRCUDA(cerr);
  cerr = cudaMalloc((void **)&d_M,sizeof(PetscScalar)*N);CHKERRCUDA(cerr);
  cerr = cudaMalloc((void **)&d_invM,sizeof(PetscScalar)*N);CHKERRCUDA(cerr);

  cerr = cudaMemcpy(d_M,T,sizeof(PetscScalar)*N,cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  if (inv) { /* start recurrence with I instead of A */
    cerr = cudaMemset(d_T,zero,sizeof(PetscScalar)*N);CHKERRCUDA(cerr);
    ierr = set_diagonal(n,d_T,ld,1.0);CHKERRQ(ierr);
  } else {
    cerr = cudaMemcpy(d_T,T,sizeof(PetscScalar)*N,cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  }

  for (it=0;it<DBMAXIT && !converged;it++) {

    if (scale) { /* g = (abs(det(M)))^(-1/(2*n)) */
      cerr = cudaMemcpy(d_invM,d_M,sizeof(PetscScalar)*N,cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
      mierr = magma_xgetrf_gpu(n,n,d_invM,ld,piv,&info);CHKERRMAGMA(mierr);
      if (info < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"LAPACKgetrf: Illegal value on argument %d",PetscAbsInt(info));
      if (info > 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"LAPACKgetrf: Matrix is singular. U(%d,%d) is zero",info,info);

      /* XXX pending: compute the product of the diagonal entries on the GPU */
      /* mult_diagonal(d_invM,n,ld,&detM); */
      cerr = cudaMemcpy(T,d_invM,sizeof(PetscScalar)*N,cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
      prod = T[0];
      for (i=1;i<n;i++) prod *= T[i+i*ld];
      detM = PetscAbsScalar(prod);
      g = PetscPowReal(detM,-1.0/(2.0*n));
      alpha = g;
      cberr = cublasXscal(cublasv2handle,N,&alpha,d_T,one);CHKERRCUBLAS(cberr);
      alpha = g*g;
      cberr = cublasXscal(cublasv2handle,N,&alpha,d_M,one);CHKERRCUBLAS(cberr);
      ierr = PetscLogFlops(2.0*n*n*n/3.0+2.0*n*n);CHKERRQ(ierr);
    }

    cerr = cudaMemcpy(d_Told,d_T,sizeof(PetscScalar)*N,cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(d_invM,d_M,sizeof(PetscScalar)*N,cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);

    mierr = magma_xgetrf_gpu(n,n,d_invM,ld,piv,&info);CHKERRMAGMA(mierr);
    if (info < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"LAPACKgetrf: Illegal value on argument %d",PetscAbsInt(info));
    if (info > 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"LAPACKgetrf: Matrix is singular. U(%d,%d) is zero",info,info);
    mierr = magma_xgetri_gpu(n,d_invM,ld,piv,d_work,lwork,&info);CHKERRMAGMA(mierr);
    if (info < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"LAPACKgetri: Illegal value on argument %d",PetscAbsInt(info));
    if (info > 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"LAPACKgetri: Matrix is singular. U(%d,%d) is zero",info,info);
    ierr = PetscLogFlops(2.0*n*n*n/3.0+4.0*n*n*n/3.0);CHKERRQ(ierr);

    ierr = shift_diagonal(n,d_invM,ld,sone);CHKERRQ(ierr);
    cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&spfive,d_Told,ld,d_invM,ld,&zero,d_T,ld);CHKERRCUBLAS(cberr);
    ierr = shift_diagonal(n,d_invM,ld,smone);CHKERRQ(ierr);

    cberr = cublasXaxpy(cublasv2handle,N,&sone,d_invM,one,d_M,one);CHKERRCUBLAS(cberr);
    cberr = cublasXscal(cublasv2handle,N,&sp25,d_M,one);CHKERRCUBLAS(cberr);
    ierr = shift_diagonal(n,d_M,ld,sneg_pfive);CHKERRQ(ierr);
    ierr = PetscLogFlops(2.0*n*n*n+2.0*n*n);CHKERRQ(ierr);

    cberr = cublasXnrm2(cublasv2handle,N,d_M,one,&Mres);CHKERRCUBLAS(cberr);
    ierr = shift_diagonal(n,d_M,ld,sone);CHKERRQ(ierr);

    if (scale) {
      /* reldiff = norm(T - Told,'fro')/norm(T,'fro') */
      cberr = cublasXaxpy(cublasv2handle,N,&smone,d_T,one,d_Told,one);CHKERRCUBLAS(cberr);
      cberr = cublasXnrm2(cublasv2handle,N,d_Told,one,&fnormdiff);CHKERRCUBLAS(cberr);
      cberr = cublasXnrm2(cublasv2handle,N,d_T,one,&fnormT);CHKERRCUBLAS(cberr);
      ierr = PetscLogFlops(7.0*n*n);CHKERRQ(ierr);
      reldiff = fnormdiff/fnormT;
      ierr = PetscInfo4(fn,"it: %D reldiff: %g scale: %g tol*scale: %g\n",it,(double)reldiff,(double)g,(double)tol*g);CHKERRQ(ierr);
      if (reldiff<1e-2) scale = PETSC_FALSE; /* switch to no scaling */
    }

    ierr = PetscInfo2(fn,"it: %D Mres: %g\n",it,(double)Mres);CHKERRQ(ierr);
    if (Mres<=tol) converged = PETSC_TRUE;
  }

  if (Mres>tol) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"SQRTM not converged after %d iterations",DBMAXIT);
  cerr = cudaMemcpy(T,d_T,sizeof(PetscScalar)*N,cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  ierr = PetscFree(piv);CHKERRQ(ierr);
  cerr = cudaFree(d_work);CHKERRCUDA(cerr);
  cerr = cudaFree(d_T);CHKERRCUDA(cerr);
  cerr = cudaFree(d_Told);CHKERRCUDA(cerr);
  cerr = cudaFree(d_M);CHKERRCUDA(cerr);
  cerr = cudaFree(d_invM);CHKERRCUDA(cerr);
  magma_finalize();
  PetscFunctionReturn(0);
}
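
/*
   Scalar illustration of the determinantal scaling used above (not part of
   the library): for a 1x1 matrix M = 4, g = |det(M)|^(-1/(2n)) = 4^(-1/2)
   = 0.5, so T becomes g*T = 2 and M becomes g^2*M = 1.  For scalars the
   scaled iteration thus converges in a single step; for matrices it
   equalizes the eigenvalue magnitudes, which typically reduces the number
   of Denman-Beavers iterations needed in the initial phase.
*/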

#endif /* PETSC_HAVE_MAGMA */

#endif /* PETSC_HAVE_CUDA */

#define ITMAX 5
#define SWAP(a,b,t) {t=a;a=b;b=t;}

/*
   Estimate norm(A^m,1) by block 1-norm power method (required workspace is 11*n)
*/
static PetscErrorCode SlepcNormEst1(PetscBLASInt n,PetscScalar *A,PetscInt m,PetscScalar *work,PetscRandom rand,PetscReal *nrm)
{
  PetscScalar    *X,*Y,*Z,*S,*S_old,*aux,val,sone=1.0,szero=0.0;
  PetscReal      est=0.0,est_old,vals[2]={0.0,0.0},*zvals,maxzval[2],raux;
  PetscBLASInt   i,j,t=2,it=0,ind[2],est_j=0,m1;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  X = work;
  Y = work + 2*n;
  Z = work + 4*n;
  S = work + 6*n;
  S_old = work + 8*n;
  zvals = (PetscReal*)(work + 10*n);

  for (i=0;i<n;i++) {  /* X has columns of unit 1-norm */
    X[i] = 1.0/n;
    ierr = PetscRandomGetValue(rand,&val);CHKERRQ(ierr);
    if (PetscRealPart(val) < 0.5) X[i+n] = -1.0/n;
    else X[i+n] = 1.0/n;
  }
  for (i=0;i<t*n;i++) S[i] = 0.0;
  ind[0] = 0; ind[1] = 0;
  est_old = 0;
  while (1) {
    it++;
    for (j=0;j<m;j++) {  /* Y = A^m*X */
      PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&t,&n,&sone,A,&n,X,&n,&szero,Y,&n));
      if (j<m-1) SWAP(X,Y,aux);
    }
    for (j=0;j<t;j++) {  /* vals[j] = norm(Y(:,j),1) */
      vals[j] = 0.0;
      for (i=0;i<n;i++) vals[j] += PetscAbsScalar(Y[i+j*n]);
    }
    if (vals[0]<vals[1]) {
      SWAP(vals[0],vals[1],raux);
      m1 = 1;
    } else m1 = 0;
    est = vals[0];
    if (est>est_old || it==2) est_j = ind[m1];
    if (it>=2 && est<=est_old) {
      est = est_old;
      break;
    }
    est_old = est;
    if (it>ITMAX) break;
    SWAP(S,S_old,aux);
    for (i=0;i<t*n;i++) {  /* S = sign(Y) */
      S[i] = (PetscRealPart(Y[i]) < 0.0)? -1.0: 1.0;
    }
    for (j=0;j<m;j++) {  /* Z = (A^T)^m*S */
      PetscStackCallBLAS("BLASgemm",BLASgemm_("C","N",&n,&t,&n,&sone,A,&n,S,&n,&szero,Z,&n));
      if (j<m-1) SWAP(S,Z,aux);
    }
    maxzval[0] = -1; maxzval[1] = -1;
    ind[0] = 0; ind[1] = 0;
    for (i=0;i<n;i++) {  /* zvals[i] = norm(Z(i,:),inf) */
      zvals[i] = PetscMax(PetscAbsScalar(Z[i+0*n]),PetscAbsScalar(Z[i+1*n]));
      if (zvals[i]>maxzval[0]) {
        maxzval[0] = zvals[i];
        ind[0] = i;
      } else if (zvals[i]>maxzval[1]) {
        maxzval[1] = zvals[i];
        ind[1] = i;
      }
    }
    if (it>=2 && maxzval[0]==zvals[est_j]) break;
    for (i=0;i<t*n;i++) X[i] = 0.0;
    for (j=0;j<t;j++) X[ind[j]+j*n] = 1.0;
  }
  *nrm = est;
  /* flop count is roughly it * 2*m products with t columns each = 4*it*m*t*n*n */
  ierr = PetscLogFlops(4.0*it*m*t*n*n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
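
/*
   Worked trace (illustrative, using a single probe column instead of the
   t=2 used above): take A = [1 2; 0 3] and m = 1, so norm(A,1) = 5 (second
   column).  Starting from x = (1/2,1/2)^T, the first sweep gives
   y = A*x = (1.5,1.5)^T and the estimate est = norm(y,1) = 3; then
   s = sign(y) = (1,1)^T and z = A^T*s = (1,5)^T, whose largest entry is
   z_2 = 5, so the next probe is x = e_2.  The second sweep gives
   y = A*e_2 = (2,3)^T and est = 5, the exact 1-norm.  The block version
   runs t = 2 such probes at once (one deterministic, one randomized) to
   make the estimate more reliable.
*/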

#define SMALLN 100

/*
   Estimate norm(A^m,1) (required workspace is 2*n*n)
*/
PetscErrorCode SlepcNormAm(PetscBLASInt n,PetscScalar *A,PetscInt m,PetscScalar *work,PetscRandom rand,PetscReal *nrm)
{
  PetscScalar    *v=work,*w=work+n*n,*aux,sone=1.0,szero=0.0;
  PetscReal      rwork[1],tmp;
  PetscBLASInt   i,j,one=1;
  PetscBool      isrealpos=PETSC_TRUE;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (n<SMALLN) { /* compute matrix power explicitly */
    if (m==1) {
      *nrm = LAPACKlange_("O",&n,&n,A,&n,rwork);
      ierr = PetscLogFlops(1.0*n*n);CHKERRQ(ierr);
    } else { /* m>=2 */
      PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&sone,A,&n,A,&n,&szero,v,&n));
      for (j=0;j<m-2;j++) {
        PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&sone,A,&n,v,&n,&szero,w,&n));
        SWAP(v,w,aux);
      }
      *nrm = LAPACKlange_("O",&n,&n,v,&n,rwork);
      ierr = PetscLogFlops(2.0*n*n*n*(m-1)+1.0*n*n);CHKERRQ(ierr);
    }
  } else {
    for (i=0;i<n;i++)
      for (j=0;j<n;j++)
#if defined(PETSC_USE_COMPLEX)
        if (PetscRealPart(A[i+j*n])<0.0 || PetscImaginaryPart(A[i+j*n])!=0.0) { isrealpos = PETSC_FALSE; break; }
#else
        if (A[i+j*n]<0.0) { isrealpos = PETSC_FALSE; break; }
#endif
    if (isrealpos) { /* for positive matrices only */
      for (i=0;i<n;i++) v[i] = 1.0;
      for (j=0;j<m;j++) { /* w = A'*v */
        PetscStackCallBLAS("BLASgemv",BLASgemv_("C",&n,&n,&sone,A,&n,v,&one,&szero,w,&one));
        SWAP(v,w,aux);
      }
      ierr = PetscLogFlops(2.0*n*n*m);CHKERRQ(ierr);
      *nrm = 0.0;
      for (i=0;i<n;i++) if ((tmp = PetscAbsScalar(v[i])) > *nrm) *nrm = tmp; /* norm(v,inf) */
    } else {
      ierr = SlepcNormEst1(n,A,m,work,rand,nrm);CHKERRQ(ierr);
    }
  }
  PetscFunctionReturn(0);
}
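
/*
   Usage sketch (illustrative only): to estimate ||A^5||_1 for a dense n x n
   column-major array A, using 2*n*n scalars of workspace and a random
   context for the randomized probe column of the estimator:

     PetscReal   nrm;
     PetscScalar *work;
     PetscRandom rand;
     ierr = PetscRandomCreate(PETSC_COMM_SELF,&rand);CHKERRQ(ierr);
     ierr = PetscMalloc1(2*n*n,&work);CHKERRQ(ierr);
     ierr = SlepcNormAm(n,A,5,work,rand,&nrm);CHKERRQ(ierr);
     ierr = PetscFree(work);CHKERRQ(ierr);
     ierr = PetscRandomDestroy(&rand);CHKERRQ(ierr);

   For small n the power A^5 is formed explicitly and the norm is exact;
   otherwise the routine uses the nonnegative-matrix shortcut or the
   randomized estimator above, so nrm is in general an estimate.
*/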