/** * \file mat_utils.txx * \author Dhairya Malhotra, dhairya.malhotra@gmail.com * \date 2-11-2011 * \brief This file contains BLAS and LAPACK wrapper functions. */ #include #include #include #include #include #include #include #include #include #include #include #if defined(PVFMM_HAVE_CUDA) #include #include #endif namespace pvfmm{ namespace mat{ template inline void gemm(char TransA, char TransB, int M, int N, int K, T alpha, T *A, int lda, T *B, int ldb, T beta, T *C, int ldc){ if((TransA=='N' || TransA=='n') && (TransB=='N' || TransB=='n')){ for(size_t n=0;n inline void gemm(char TransA, char TransB, int M, int N, int K, float alpha, float *A, int lda, float *B, int ldb, float beta, float *C, int ldc){ sgemm_(&TransA, &TransB, &M, &N, &K, &alpha, A, &lda, B, &ldb, &beta, C, &ldc); } template<> inline void gemm(char TransA, char TransB, int M, int N, int K, double alpha, double *A, int lda, double *B, int ldb, double beta, double *C, int ldc){ dgemm_(&TransA, &TransB, &M, &N, &K, &alpha, A, &lda, B, &ldb, &beta, C, &ldc); } #if defined(PVFMM_HAVE_CUDA) //template //inline void cublasgemm(char TransA, char TransB, int M, int N, int K, T alpha, T*A, int lda, T *B, int ldb, T beta, T *C, int ldc){ // assert(false); //} template<> inline void cublasgemm(char TransA, char TransB, int M, int N, int K, float alpha, float *A, int lda, float *B, int ldb, float beta, float *C, int ldc) { cublasOperation_t cublasTransA, cublasTransB; cublasHandle_t *handle = CUDA_Lock::acquire_handle(); if (TransA == 'T' || TransA == 't') cublasTransA = CUBLAS_OP_T; else if (TransA == 'N' || TransA == 'n') cublasTransA = CUBLAS_OP_N; if (TransB == 'T' || TransB == 't') cublasTransB = CUBLAS_OP_T; else if (TransB == 'N' || TransB == 'n') cublasTransB = CUBLAS_OP_N; cublasStatus_t status = cublasSgemm(*handle, cublasTransA, cublasTransB, M, N, K, &alpha, A, lda, B, ldb, &beta, C, ldc); } template<> inline void cublasgemm(char TransA, char TransB, int M, int N, int K, double alpha, double *A, int lda, double *B, int ldb, double beta, double *C, int ldc){ cublasOperation_t cublasTransA, cublasTransB; cublasHandle_t *handle = CUDA_Lock::acquire_handle(); if (TransA == 'T' || TransA == 't') cublasTransA = CUBLAS_OP_T; else if (TransA == 'N' || TransA == 'n') cublasTransA = CUBLAS_OP_N; if (TransB == 'T' || TransB == 't') cublasTransB = CUBLAS_OP_T; else if (TransB == 'N' || TransB == 'n') cublasTransB = CUBLAS_OP_N; cublasStatus_t status = cublasDgemm(*handle, cublasTransA, cublasTransB, M, N, K, &alpha, A, lda, B, ldb, &beta, C, ldc); } #endif #define U(i,j) U_[(i)*dim[0]+(j)] #define S(i,j) S_[(i)*dim[1]+(j)] #define V(i,j) V_[(i)*dim[1]+(j)] //#define SVD_DEBUG template void GivensL(T* S_, const size_t dim[2], size_t m, T a, T b){ T r=sqrt(a*a+b*b); T c=a/r; T s=-b/r; #pragma omp parallel for for(size_t i=0;i void GivensR(T* S_, const size_t dim[2], size_t m, T a, T b){ T r=sqrt(a*a+b*b); T c=a/r; T s=-b/r; #pragma omp parallel for for(size_t i=0;i void SVD(const size_t dim[2], T* U_, T* S_, T* V_, T eps=-1){ assert(dim[0]>=dim[1]); #ifdef SVD_DEBUG Matrix M0(dim[0],dim[1],S_); #endif { // Bi-diagonalization size_t n=std::min(dim[0],dim[1]); std::vector house_vec(std::max(dim[0],dim[1])); for(size_t i=0;i=n-1) continue; { T x1=S(i,i+1); if(x1<0) x1=-x1; T x_inv_norm=0; for(size_t j=i+1;j1.0) eps*=0.5; eps*=64.0; } while(k0S(i,i)?S_max:S(i,i)); //while(k0eps*(fabs(S(n-1,n-1))+fabs(S(n,n)))) n++; while(neps*S_max) n++; T mu=0; { // Compute mu T C[3][2]; C[0][0]=S(n-2,n-2)*S(n-2,n-2)+S(n-3,n-2)*S(n-3,n-2); C[0][1]=S(n-2,n-2)*S(n-2,n-1); C[1][0]=S(n-2,n-2)*S(n-2,n-1); C[1][1]=S(n-1,n-1)*S(n-1,n-1)+S(n-2,n-1)*S(n-2,n-1); T b=-(C[0][0]+C[1][1])/2; T c= C[0][0]*C[1][1] - C[0][1]*C[1][0]; T d=sqrt(b*b-c); T lambda1=-b+d; T lambda2=-b-d; T d1=lambda1-C[1][1]; d1=(d1<0?-d1:d1); T d2=lambda2-C[1][1]; d2=(d2<0?-d2:d2); mu=(d1 U0(dim[0],dim[0],U_); Matrix S0(dim[0],dim[1],S_); Matrix V0(dim[1],dim[1],V_); Matrix E=M0-U0*S0*V0; T max_err=0; T max_nondiag0=0; T max_nondiag1=0; for(size_t i=0;ij+0 || i+0j+1 || i+1 inline void svd(char *JOBU, char *JOBVT, int *M, int *N, T *A, int *LDA, T *S, T *U, int *LDU, T *VT, int *LDVT, T *WORK, int *LWORK, int *INFO){ const size_t dim[2]={std::max(*N,*M), std::min(*N,*M)}; T* U_=mem::aligned_new(dim[0]*dim[0]); memset(U_, 0, dim[0]*dim[0]*sizeof(T)); T* V_=mem::aligned_new(dim[1]*dim[1]); memset(V_, 0, dim[1]*dim[1]*sizeof(T)); T* S_=mem::aligned_new(dim[0]*dim[1]); const size_t lda=*LDA; const size_t ldu=*LDU; const size_t ldv=*LDVT; if(dim[1]==*M){ for(size_t i=0;i(dim, U_, S_, V_, (T)-1); for(size_t i=0;i(U_); mem::aligned_delete(S_); mem::aligned_delete(V_); if(0){ // Verify const size_t dim[2]={std::max(*N,*M), std::min(*N,*M)}; const size_t lda=*LDA; const size_t ldu=*LDU; const size_t ldv=*LDVT; Matrix A1(*M,*N); Matrix S1(dim[1],dim[1]); Matrix U1(*M,dim[1]); Matrix V1(dim[1],*N); for(size_t i=0;i<*N;i++) for(size_t j=0;j<*M;j++){ A1[j][i]=A[j+i*lda]; } S1.SetZero(); for(size_t i=0;i inline void svd(char *JOBU, char *JOBVT, int *M, int *N, float *A, int *LDA, float *S, float *U, int *LDU, float *VT, int *LDVT, float *WORK, int *LWORK, int *INFO){ sgesvd_(JOBU,JOBVT,M,N,A,LDA,S,U,LDU,VT,LDVT,WORK,LWORK,INFO); } template<> inline void svd(char *JOBU, char *JOBVT, int *M, int *N, double *A, int *LDA, double *S, double *U, int *LDU, double *VT, int *LDVT, double *WORK, int *LWORK, int *INFO){ dgesvd_(JOBU,JOBVT,M,N,A,LDA,S,U,LDU,VT,LDVT,WORK,LWORK,INFO); } /** * \brief Computes the pseudo inverse of matrix M(n1xn2) (in row major form) * and returns the output M_(n2xn1). Original contents of M are destroyed. */ template void pinv(T* M, int n1, int n2, T eps, T* M_){ if(n1*n2==0) return; int m = n2; int n = n1; int k = (m(m*k); T* tS =mem::aligned_new(k); T* tVT=mem::aligned_new(k*n); //SVD int INFO=0; char JOBU = 'S'; char JOBVT = 'S'; //int wssize = max(3*min(m,n)+max(m,n), 5*min(m,n)); int wssize = 3*(mn?m:n); int wssize1 = 5*(mwssize1?wssize:wssize1); T* wsbuf = mem::aligned_new(wssize); svd(&JOBU, &JOBVT, &m, &n, &M[0], &m, &tS[0], &tU[0], &m, &tVT[0], &k, wsbuf, &wssize, &INFO); if(INFO!=0) std::cout<(wsbuf); T eps_=tS[0]*eps; for(int i=0;i('T','T',n,m,k,1.0,&tVT[0],k,&tU[0],m,0.0,M_,n); mem::aligned_delete(tU); mem::aligned_delete(tS); mem::aligned_delete(tVT); } }//end namespace }//end namespace