#include SCTL_INCLUDE(matrix.hpp) #include SCTL_INCLUDE(math_utils.hpp) #if defined(SCTL_HAVE_CUDA) #include #include #endif #if defined(SCTL_HAVE_BLAS) #include SCTL_INCLUDE(blas.h) #endif #if defined(SCTL_HAVE_LAPACK) #include SCTL_INCLUDE(lapack.h) #endif #include #include #include #include #include #include #include namespace SCTL_NAMESPACE { namespace mat { template inline void gemm(char TransA, char TransB, int M, int N, int K, ValueType alpha, Iterator A, int lda, Iterator B, int ldb, ValueType beta, Iterator C, int ldc) { if ((TransA == 'N' || TransA == 'n') && (TransB == 'N' || TransB == 'n')) { #pragma omp parallel for schedule(static) for (Long n = 0; n < N; n++) { // Columns of C for (Long m = 0; m < M; m++) { // Rows of C ValueType AxB = 0; for (Long k = 0; k < K; k++) { AxB += A[m + lda * k] * B[k + ldb * n]; } C[m + ldc * n] = alpha * AxB + (beta == 0 ? 0 : beta * C[m + ldc * n]); } } } else if (TransA == 'N' || TransA == 'n') { #pragma omp parallel for schedule(static) for (Long n = 0; n < N; n++) { // Columns of C for (Long m = 0; m < M; m++) { // Rows of C ValueType AxB = 0; for (Long k = 0; k < K; k++) { AxB += A[m + lda * k] * B[n + ldb * k]; } C[m + ldc * n] = alpha * AxB + (beta == 0 ? 0 : beta * C[m + ldc * n]); } } } else if (TransB == 'N' || TransB == 'n') { #pragma omp parallel for schedule(static) for (Long n = 0; n < N; n++) { // Columns of C for (Long m = 0; m < M; m++) { // Rows of C ValueType AxB = 0; for (Long k = 0; k < K; k++) { AxB += A[k + lda * m] * B[k + ldb * n]; } C[m + ldc * n] = alpha * AxB + (beta == 0 ? 0 : beta * C[m + ldc * n]); } } } else { #pragma omp parallel for schedule(static) for (Long n = 0; n < N; n++) { // Columns of C for (Long m = 0; m < M; m++) { // Rows of C ValueType AxB = 0; for (Long k = 0; k < K; k++) { AxB += A[k + lda * m] * B[n + ldb * k]; } C[m + ldc * n] = alpha * AxB + (beta == 0 ? 0 : beta * C[m + ldc * n]); } } } } #if defined(SCTL_HAVE_BLAS) template <> inline void gemm(char TransA, char TransB, int M, int N, int K, float alpha, Iterator A, int lda, Iterator B, int ldb, float beta, Iterator C, int ldc) { sgemm_(&TransA, &TransB, &M, &N, &K, &alpha, &A[0], &lda, &B[0], &ldb, &beta, &C[0], &ldc); } template <> inline void gemm(char TransA, char TransB, int M, int N, int K, double alpha, Iterator A, int lda, Iterator B, int ldb, double beta, Iterator C, int ldc) { dgemm_(&TransA, &TransB, &M, &N, &K, &alpha, &A[0], &lda, &B[0], &ldb, &beta, &C[0], &ldc); } #endif #if defined(SCTL_HAVE_CUDA) template <> inline void cublasgemm(char TransA, char TransB, int M, int N, int K, float alpha, Iterator A, int lda, Iterator B, int ldb, float beta, Iterator C, int ldc) { cublasOperation_t cublasTransA, cublasTransB; cublasHandle_t *handle = CUDA_Lock::acquire_handle(); if (TransA == 'T' || TransA == 't') cublasTransA = CUBLAS_OP_T; else if (TransA == 'N' || TransA == 'n') cublasTransA = CUBLAS_OP_N; if (TransB == 'T' || TransB == 't') cublasTransB = CUBLAS_OP_T; else if (TransB == 'N' || TransB == 'n') cublasTransB = CUBLAS_OP_N; cublasStatus_t status = cublasSgemm(*handle, cublasTransA, cublasTransB, M, N, K, &alpha, A, lda, B, ldb, &beta, C, ldc); } template <> inline void cublasgemm(char TransA, char TransB, int M, int N, int K, double alpha, Iterator A, int lda, Iterator B, int ldb, double beta, Iterator C, int ldc) { cublasOperation_t cublasTransA, cublasTransB; cublasHandle_t *handle = CUDA_Lock::acquire_handle(); if (TransA == 'T' || TransA == 't') cublasTransA = CUBLAS_OP_T; else if (TransA == 'N' || TransA == 'n') cublasTransA = CUBLAS_OP_N; if (TransB == 'T' || TransB == 't') cublasTransB = CUBLAS_OP_T; else if (TransB == 'N' || TransB == 'n') cublasTransB = CUBLAS_OP_N; cublasStatus_t status = cublasDgemm(*handle, cublasTransA, cublasTransB, M, N, K, &alpha, A, lda, B, ldb, &beta, C, ldc); } #endif //#define SCTL_SVD_DEBUG template static inline void GivensL(Iterator S_, const StaticArray &dim, Long m, ValueType a, ValueType b) { auto S = [S_,dim](Long i, Long j) -> ValueType& { return S_[(i) * dim[1] + (j)]; }; ValueType r = sqrt(a * a + b * b); if (r == 0) return; ValueType c = a / r; ValueType s = -b / r; #pragma omp parallel for for (Long i = 0; i < dim[1]; i++) { ValueType S0 = S(m + 0, i); ValueType S1 = S(m + 1, i); S(m, i) += S0 * (c - 1); S(m, i) += S1 * (-s); S(m + 1, i) += S0 * (s); S(m + 1, i) += S1 * (c - 1); } } template static inline void GivensR(Iterator S_, const StaticArray &dim, Long m, ValueType a, ValueType b) { auto S = [S_,dim](Long i, Long j) -> ValueType& { return S_[(i) * dim[1] + (j)]; }; ValueType r = sqrt(a * a + b * b); if (r == 0) return; ValueType c = a / r; ValueType s = -b / r; #pragma omp parallel for for (Long i = 0; i < dim[0]; i++) { ValueType S0 = S(i, m + 0); ValueType S1 = S(i, m + 1); S(i, m) += S0 * (c - 1); S(i, m) += S1 * (-s); S(i, m + 1) += S0 * (s); S(i, m + 1) += S1 * (c - 1); } } template static inline void SVD(const StaticArray &dim, Iterator U_, Iterator S_, Iterator V_, ValueType eps = -1) { auto U = [U_,dim](Long i, Long j) -> ValueType& { return U_[(i) * dim[0] + (j)]; }; auto S = [S_,dim](Long i, Long j) -> ValueType& { return S_[(i) * dim[1] + (j)]; }; auto V = [V_,dim](Long i, Long j) -> ValueType& { return V_[(i) * dim[1] + (j)]; }; assert(dim[0] >= dim[1]); #ifdef SCTL_SVD_DEBUG Matrix M0(dim[0], dim[1], S_); #endif { // Bi-diagonalization Long n = std::min(dim[0], dim[1]); std::vector house_vec(std::max(dim[0], dim[1])); for (Long i = 0; i < n; i++) { // Column Householder { ValueType x1 = S(i, i); if (x1 < 0) x1 = -x1; ValueType x_inv_norm = 0; for (Long j = i; j < dim[0]; j++) { x_inv_norm += S(j, i) * S(j, i); } if (x_inv_norm > 0) x_inv_norm = 1 / sqrt(x_inv_norm); ValueType alpha = sqrt(1 + x1 * x_inv_norm); ValueType beta = x_inv_norm / alpha; if (x_inv_norm == 0) alpha = 0; // nothing to do house_vec[i] = -alpha; for (Long j = i + 1; j < dim[0]; j++) { house_vec[j] = -beta * S(j, i); } if (S(i, i) < 0) for (Long j = i + 1; j < dim[0]; j++) { house_vec[j] = -house_vec[j]; } } #pragma omp parallel for for (Long k = i; k < dim[1]; k++) { ValueType dot_prod = 0; for (Long j = i; j < dim[0]; j++) { dot_prod += S(j, k) * house_vec[j]; } for (Long j = i; j < dim[0]; j++) { S(j, k) -= dot_prod * house_vec[j]; } } #pragma omp parallel for for (Long k = 0; k < dim[0]; k++) { ValueType dot_prod = 0; for (Long j = i; j < dim[0]; j++) { dot_prod += U(k, j) * house_vec[j]; } for (Long j = i; j < dim[0]; j++) { U(k, j) -= dot_prod * house_vec[j]; } } // Row Householder if (i >= n - 1) continue; { ValueType x1 = S(i, i + 1); if (x1 < 0) x1 = -x1; ValueType x_inv_norm = 0; for (Long j = i + 1; j < dim[1]; j++) { x_inv_norm += S(i, j) * S(i, j); } if (x_inv_norm > 0) x_inv_norm = 1 / sqrt(x_inv_norm); ValueType alpha = sqrt(1 + x1 * x_inv_norm); ValueType beta = x_inv_norm / alpha; if (x_inv_norm == 0) alpha = 0; // nothing to do house_vec[i + 1] = -alpha; for (Long j = i + 2; j < dim[1]; j++) { house_vec[j] = -beta * S(i, j); } if (S(i, i + 1) < 0) for (Long j = i + 2; j < dim[1]; j++) { house_vec[j] = -house_vec[j]; } } #pragma omp parallel for for (Long k = i; k < dim[0]; k++) { ValueType dot_prod = 0; for (Long j = i + 1; j < dim[1]; j++) { dot_prod += S(k, j) * house_vec[j]; } for (Long j = i + 1; j < dim[1]; j++) { S(k, j) -= dot_prod * house_vec[j]; } } #pragma omp parallel for for (Long k = 0; k < dim[1]; k++) { ValueType dot_prod = 0; for (Long j = i + 1; j < dim[1]; j++) { dot_prod += V(j, k) * house_vec[j]; } for (Long j = i + 1; j < dim[1]; j++) { V(j, k) -= dot_prod * house_vec[j]; } } } } Long k0 = 0; Long iter = 0; if (eps < 0) eps = 64.0 * machine_eps(); while (k0 < dim[1] - 1) { // Diagonalization iter++; ValueType S_max = 0.0; for (Long i = 0; i < dim[1]; i++) S_max = (S_max > fabs(S(i, i)) ? S_max : fabs(S(i, i))); for (Long i = 0; i < dim[1] - 1; i++) S_max = (S_max > fabs(S(i, i + 1)) ? S_max : fabs(S(i, i + 1))); // while(k0(S(k0,k0+1))<=eps*(fabs(S(k0,k0))+fabs(S(k0+1,k0+1)))) k0++; while (k0 < dim[1] - 1 && fabs(S(k0, k0 + 1)) <= eps * S_max) k0++; if (k0 == dim[1] - 1) continue; Long n = k0 + 2; // while(n(S(n-1,n))>eps*(fabs(S(n-1,n-1))+fabs(S(n,n)))) n++; while (n < dim[1] && fabs(S(n - 1, n)) > eps * S_max) n++; ValueType alpha = 0; ValueType beta = 0; if (n - k0 == 2 && fabs(S(k0, k0)) < eps * S_max && fabs(S(k0 + 1, k0 + 1)) < eps * S_max) { // Compute mu alpha=0; beta=1; } else { StaticArray C; C[0 * 2 + 0] = S(n - 2, n - 2) * S(n - 2, n - 2); if (n - k0 > 2) C[0 * 2 + 0] += S(n - 3, n - 2) * S(n - 3, n - 2); C[0 * 2 + 1] = S(n - 2, n - 2) * S(n - 2, n - 1); C[1 * 2 + 0] = S(n - 2, n - 2) * S(n - 2, n - 1); C[1 * 2 + 1] = S(n - 1, n - 1) * S(n - 1, n - 1) + S(n - 2, n - 1) * S(n - 2, n - 1); ValueType b = -(C[0 * 2 + 0] + C[1 * 2 + 1]) / 2; ValueType c = C[0 * 2 + 0] * C[1 * 2 + 1] - C[0 * 2 + 1] * C[1 * 2 + 0]; ValueType d = 0; if (fabs(b * b - c) > eps*b*b) d = sqrt(b * b - c); else { ValueType b = (C[0 * 2 + 0] - C[1 * 2 + 1]) / 2; ValueType c = -C[0 * 2 + 1] * C[1 * 2 + 0]; if (b * b - c > 0) d = sqrt(b * b - c); } ValueType lambda1 = -b + d; ValueType lambda2 = -b - d; ValueType d1 = lambda1 - C[1 * 2 + 1]; d1 = (d1 < 0 ? -d1 : d1); ValueType d2 = lambda2 - C[1 * 2 + 1]; d2 = (d2 < 0 ? -d2 : d2); ValueType mu = (d1 < d2 ? lambda1 : lambda2); alpha = S(k0, k0) * S(k0, k0) - mu; beta = S(k0, k0) * S(k0, k0 + 1); } for (Long k = k0; k < n - 1; k++) { StaticArray dimU; dimU[0] = dim[0]; dimU[1] = dim[0]; StaticArray dimV; dimV[0] = dim[1]; dimV[1] = dim[1]; GivensR(S_, dim, k, alpha, beta); GivensL(V_, dimV, k, alpha, beta); alpha = S(k, k); beta = S(k + 1, k); GivensL(S_, dim, k, alpha, beta); GivensR(U_, dimU, k, alpha, beta); alpha = S(k, k + 1); beta = S(k, k + 2); } { // Make S bi-diagonal again for (Long i0 = k0; i0 < n - 1; i0++) { for (Long i1 = 0; i1 < dim[1]; i1++) { if (i0 > i1 || i0 + 1 < i1) S(i0, i1) = 0; } } for (Long i0 = 0; i0 < dim[0]; i0++) { for (Long i1 = k0; i1 < n - 1; i1++) { if (i0 > i1 || i0 + 1 < i1) S(i0, i1) = 0; } } for (Long i = 0; i < dim[1] - 1; i++) { if (fabs(S(i, i + 1)) <= eps * S_max) { S(i, i + 1) = 0; } } } // std::cout< U0(dim[0], dim[0], U_); Matrix S0(dim[0], dim[1], S_); Matrix V0(dim[1], dim[1], V_); Matrix E = M0 - U0 * S0 * V0; ValueType max_err = 0; ValueType max_nondiag0 = 0; ValueType max_nondiag1 = 0; for (Long i = 0; i < E.Dim(0); i++) for (Long j = 0; j < E.Dim(1); j++) { if (max_err < fabs(E[i][j])) max_err = fabs(E[i][j]); if ((i > j + 0 || i + 0 < j) && max_nondiag0 < fabs(S0[i][j])) max_nondiag0 = fabs(S0[i][j]); if ((i > j + 1 || i + 1 < j) && max_nondiag1 < fabs(S0[i][j])) max_nondiag1 = fabs(S0[i][j]); } std::cout << max_err << '\n'; std::cout << max_nondiag0 << '\n'; std::cout << max_nondiag1 << '\n'; #endif } } #undef SCTL_SVD_DEBUG template inline void svd(char *JOBU, char *JOBVT, int *M, int *N, Iterator A, int *LDA, Iterator S, Iterator U, int *LDU, Iterator VT, int *LDVT, Iterator WORK, int *LWORK, int *INFO) { StaticArray dim; dim[0] = std::max(*N, *M); dim[1] = std::min(*N, *M); Iterator U_ = aligned_new(dim[0] * dim[0]); memset(U_, 0, dim[0] * dim[0]); Iterator V_ = aligned_new(dim[1] * dim[1]); memset(V_, 0, dim[1] * dim[1]); Iterator S_ = aligned_new(dim[0] * dim[1]); const Long lda = *LDA; const Long ldu = *LDU; const Long ldv = *LDVT; if (dim[1] == *M) { for (Long i = 0; i < dim[0]; i++) for (Long j = 0; j < dim[1]; j++) { S_[i * dim[1] + j] = A[i * lda + j]; } } else { for (Long i = 0; i < dim[0]; i++) for (Long j = 0; j < dim[1]; j++) { S_[i * dim[1] + j] = A[j * lda + i]; } } for (Long i = 0; i < dim[0]; i++) { U_[i * dim[0] + i] = 1; } for (Long i = 0; i < dim[1]; i++) { V_[i * dim[1] + i] = 1; } SVD(dim, U_, S_, V_, (ValueType) - 1); for (Long i = 0; i < dim[1]; i++) { // Set S S[i] = S_[i * dim[1] + i]; } if (dim[1] == *M) { // Set U for (Long i = 0; i < dim[1]; i++) for (Long j = 0; j < *M; j++) { U[j + ldu * i] = V_[j + i * dim[1]] * (S[i] < 0.0 ? -1.0 : 1.0); } } else { for (Long i = 0; i < dim[1]; i++) for (Long j = 0; j < *M; j++) { U[j + ldu * i] = U_[i + j * dim[0]] * (S[i] < 0.0 ? -1.0 : 1.0); } } if (dim[0] == *N) { // Set V for (Long i = 0; i < *N; i++) for (Long j = 0; j < dim[1]; j++) { VT[j + ldv * i] = U_[j + i * dim[0]]; } } else { for (Long i = 0; i < *N; i++) for (Long j = 0; j < dim[1]; j++) { VT[j + ldv * i] = V_[i + j * dim[1]]; } } for (Long i = 0; i < dim[1]; i++) { S[i] = S[i] * (S[i] < 0.0 ? -1.0 : 1.0); } aligned_delete(U_); aligned_delete(S_); aligned_delete(V_); if (0) { // Verify StaticArray dim; dim[0] = std::max(*N, *M); dim[1] = std::min(*N, *M); const Long lda = *LDA; const Long ldu = *LDU; const Long ldv = *LDVT; Matrix A1(*M, *N); Matrix S1(dim[1], dim[1]); Matrix U1(*M, dim[1]); Matrix V1(dim[1], *N); for (Long i = 0; i < *N; i++) for (Long j = 0; j < *M; j++) { A1[j][i] = A[j + i * lda]; } S1.SetZero(); for (Long i = 0; i < dim[1]; i++) { // Set S S1[i][i] = S[i]; } for (Long i = 0; i < dim[1]; i++) for (Long j = 0; j < *M; j++) { U1[j][i] = U[j + ldu * i]; } for (Long i = 0; i < *N; i++) for (Long j = 0; j < dim[1]; j++) { V1[j][i] = VT[j + ldv * i]; } std::cout << U1 *S1 *V1 - A1 << '\n'; } } #if defined(SCTL_HAVE_LAPACK) template <> inline void svd(char *JOBU, char *JOBVT, int *M, int *N, Iterator A, int *LDA, Iterator S, Iterator U, int *LDU, Iterator VT, int *LDVT, Iterator WORK, int *LWORK, int *INFO) { sgesvd_(JOBU, JOBVT, M, N, &A[0], LDA, &S[0], &U[0], LDU, &VT[0], LDVT, &WORK[0], LWORK, INFO); } template <> inline void svd(char *JOBU, char *JOBVT, int *M, int *N, Iterator A, int *LDA, Iterator S, Iterator U, int *LDU, Iterator VT, int *LDVT, Iterator WORK, int *LWORK, int *INFO) { dgesvd_(JOBU, JOBVT, M, N, &A[0], LDA, &S[0], &U[0], LDU, &VT[0], LDVT, &WORK[0], LWORK, INFO); } #endif /** * \brief Computes the pseudo inverse of matrix M(n1xn2) (in row major form) * and returns the output M_(n2xn1). Original contents of M are destroyed. */ template inline void pinv(Iterator M, int n1, int n2, ValueType eps, Iterator M_) { if (n1 * n2 == 0) return; int m = n2; int n = n1; int k = (m < n ? m : n); Iterator tU = aligned_new(m * k); Iterator tS = aligned_new(k); Iterator tVT = aligned_new(k * n); // SVD int INFO = 0; char JOBU = 'S'; char JOBVT = 'S'; // int wssize = max(3*min(m,n)+max(m,n), 5*min(m,n)); int wssize = 3 * (m < n ? m : n) + (m > n ? m : n); int wssize1 = 5 * (m < n ? m : n); wssize = (wssize > wssize1 ? wssize : wssize1); Iterator wsbuf = aligned_new(wssize); svd(&JOBU, &JOBVT, &m, &n, M, &m, tS, tU, &m, tVT, &k, wsbuf, &wssize, &INFO); if (INFO != 0) std::cout << INFO << '\n'; assert(INFO == 0); aligned_delete(wsbuf); ValueType eps_ = tS[0] * eps; for (int i = 0; i < k; i++) if (tS[i] < eps_) tS[i] = 0; else tS[i] = 1 / tS[i]; for (int i = 0; i < m; i++) { for (int j = 0; j < k; j++) { tU[i + j * m] *= tS[j]; } } gemm('T', 'T', n, m, k, 1.0, tVT, k, tU, m, 0.0, M_, n); aligned_delete(tU); aligned_delete(tS); aligned_delete(tVT); } } // end namespace mat } // end namespace SCTL_NAMESPACE