|  | @@ -469,9 +469,8 @@ Matrix<typename FMMNode::Real_t>& FMM_Pts<FMMNode>::Precomp(int level, Mat_Type
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  |        // Evaluate potential at check surface due to equivalent surface.
 | 
	
		
			
				|  |  |        Matrix<Real_t> M_e2c(n_ue*aux_ker_dim[0],n_uc*aux_ker_dim[1]);
 | 
	
		
			
				|  |  | -      Kernel<Real_t>::Eval(&ue_coord[0], n_ue,
 | 
	
		
			
				|  |  | -                           &uc_coord[0], n_uc, &(M_e2c[0][0]),
 | 
	
		
			
				|  |  | -                           aux_kernel.ker_poten, aux_ker_dim);
 | 
	
		
			
				|  |  | +      aux_kernel.BuildMatrix(&ue_coord[0], n_ue,
 | 
	
		
			
				|  |  | +                             &uc_coord[0], n_uc, &(M_e2c[0][0]));
 | 
	
		
			
				|  |  |        M=M_e2c.pinv(); //check 2 equivalent
 | 
	
		
			
				|  |  |        break;
 | 
	
		
			
				|  |  |      }
 | 
	
	
		
			
				|  | @@ -489,9 +488,8 @@ Matrix<typename FMMNode::Real_t>& FMM_Pts<FMMNode>::Precomp(int level, Mat_Type
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  |        // Evaluate potential at check surface due to equivalent surface.
 | 
	
		
			
				|  |  |        Matrix<Real_t> M_e2c(n_eq*aux_ker_dim[0],n_ch*aux_ker_dim[1]);
 | 
	
		
			
				|  |  | -      Kernel<Real_t>::Eval(&equiv_surf[0], n_eq,
 | 
	
		
			
				|  |  | -                           &check_surf[0], n_ch, &(M_e2c[0][0]),
 | 
	
		
			
				|  |  | -                           aux_kernel.ker_poten,aux_ker_dim);
 | 
	
		
			
				|  |  | +      aux_kernel.BuildMatrix(&equiv_surf[0], n_eq,
 | 
	
		
			
				|  |  | +                             &check_surf[0], n_ch, &(M_e2c[0][0]));
 | 
	
		
			
				|  |  |        M=M_e2c.pinv(); //check 2 equivalent
 | 
	
		
			
				|  |  |        break;
 | 
	
		
			
				|  |  |      }
 | 
	
	
		
			
				|  | @@ -516,9 +514,8 @@ Matrix<typename FMMNode::Real_t>& FMM_Pts<FMMNode>::Precomp(int level, Mat_Type
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  |        // Evaluate potential at check surface due to equivalent surface.
 | 
	
		
			
				|  |  |        Matrix<Real_t> M_ce2c(n_ue*aux_ker_dim[0],n_uc*aux_ker_dim[1]);
 | 
	
		
			
				|  |  | -      Kernel<Real_t>::Eval(&equiv_surf[0], n_ue,
 | 
	
		
			
				|  |  | -                           &check_surf[0], n_uc, &(M_ce2c[0][0]),
 | 
	
		
			
				|  |  | -                           aux_kernel.ker_poten, aux_ker_dim);
 | 
	
		
			
				|  |  | +      aux_kernel.BuildMatrix(&equiv_surf[0], n_ue,
 | 
	
		
			
				|  |  | +                             &check_surf[0], n_uc, &(M_ce2c[0][0]));
 | 
	
		
			
				|  |  |        Matrix<Real_t>& M_c2e = Precomp(level, UC2UE_Type, 0);
 | 
	
		
			
				|  |  |        M=M_ce2c*M_c2e;
 | 
	
		
			
				|  |  |        break;
 | 
	
	
		
			
				|  | @@ -540,9 +537,8 @@ Matrix<typename FMMNode::Real_t>& FMM_Pts<FMMNode>::Precomp(int level, Mat_Type
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  |        // Evaluate potential at check surface due to equivalent surface.
 | 
	
		
			
				|  |  |        Matrix<Real_t> M_pe2c(n_de*aux_ker_dim[0],n_dc*aux_ker_dim[1]);
 | 
	
		
			
				|  |  | -      Kernel<Real_t>::Eval(&equiv_surf[0], n_de,
 | 
	
		
			
				|  |  | -                           &check_surf[0], n_dc, &(M_pe2c[0][0]),
 | 
	
		
			
				|  |  | -                           aux_kernel.ker_poten,aux_ker_dim);
 | 
	
		
			
				|  |  | +      aux_kernel.BuildMatrix(&equiv_surf[0], n_de,
 | 
	
		
			
				|  |  | +                             &check_surf[0], n_dc, &(M_pe2c[0][0]));
 | 
	
		
			
				|  |  |        Matrix<Real_t>& M_c2e=Precomp(level,DC2DE_Type,0);
 | 
	
		
			
				|  |  |        M=M_pe2c*M_c2e;
 | 
	
		
			
				|  |  |        break;
 | 
	
	
		
			
				|  | @@ -598,7 +594,7 @@ Matrix<typename FMMNode::Real_t>& FMM_Pts<FMMNode>::Precomp(int level, Mat_Type
 | 
	
		
			
				|  |  |        std::vector<Real_t> r_trg(COORD_DIM,0.0);
 | 
	
		
			
				|  |  |        std::vector<Real_t> conv_poten(n3*aux_ker_dim[0]*aux_ker_dim[1]);
 | 
	
		
			
				|  |  |        std::vector<Real_t> conv_coord=conv_grid(MultipoleOrder(),coord_diff,level);
 | 
	
		
			
				|  |  | -      Kernel<Real_t>::Eval(&conv_coord[0],n3,&r_trg[0],1,&conv_poten[0],aux_kernel.ker_poten,aux_ker_dim);
 | 
	
		
			
				|  |  | +      aux_kernel.BuildMatrix(&conv_coord[0],n3,&r_trg[0],1,&conv_poten[0]);
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  |        //Rearrange data.
 | 
	
		
			
				|  |  |        Matrix<Real_t> M_conv(n3,aux_ker_dim[0]*aux_ker_dim[1],&conv_poten[0],false);
 | 
	
	
		
			
				|  | @@ -606,9 +602,9 @@ Matrix<typename FMMNode::Real_t>& FMM_Pts<FMMNode>::Precomp(int level, Mat_Type
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  |        //Compute FFTW plan.
 | 
	
		
			
				|  |  |        int nnn[3]={n1,n1,n1};
 | 
	
		
			
				|  |  | -      void *fftw_in, *fftw_out;
 | 
	
		
			
				|  |  | -      fftw_in  = fftw_malloc(  n3 *aux_ker_dim[0]*aux_ker_dim[1]*sizeof(Real_t));
 | 
	
		
			
				|  |  | -      fftw_out = fftw_malloc(2*n3_*aux_ker_dim[0]*aux_ker_dim[1]*sizeof(Real_t));
 | 
	
		
			
				|  |  | +      Real_t *fftw_in, *fftw_out;
 | 
	
		
			
				|  |  | +      fftw_in  = mem::aligned_malloc<Real_t>(  n3 *aux_ker_dim[0]*aux_ker_dim[1]*sizeof(Real_t));
 | 
	
		
			
				|  |  | +      fftw_out = mem::aligned_malloc<Real_t>(2*n3_*aux_ker_dim[0]*aux_ker_dim[1]*sizeof(Real_t));
 | 
	
		
			
				|  |  |        #pragma omp critical (FFTW_PLAN)
 | 
	
		
			
				|  |  |        {
 | 
	
		
			
				|  |  |          if (!vprecomp_fft_flag){
 | 
	
	
		
			
				|  | @@ -625,8 +621,8 @@ Matrix<typename FMMNode::Real_t>& FMM_Pts<FMMNode>::Precomp(int level, Mat_Type
 | 
	
		
			
				|  |  |        M=M_;
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  |        //Free memory.
 | 
	
		
			
				|  |  | -      fftw_free(fftw_in);
 | 
	
		
			
				|  |  | -      fftw_free(fftw_out);
 | 
	
		
			
				|  |  | +      mem::aligned_free<Real_t>(fftw_in);
 | 
	
		
			
				|  |  | +      mem::aligned_free<Real_t>(fftw_out);
 | 
	
		
			
				|  |  |        break;
 | 
	
		
			
				|  |  |      }
 | 
	
		
			
				|  |  |      case V1_Type:
 | 
	
	
		
			
				|  | @@ -752,9 +748,8 @@ Matrix<typename FMMNode::Real_t>& FMM_Pts<FMMNode>::Precomp(int level, Mat_Type
 | 
	
		
			
				|  |  |                ue_coord[0]=x0*s; ue_coord[1]=x1*s; ue_coord[2]=x2*s;
 | 
	
		
			
				|  |  |                std::vector<Real_t> src_coord=u_equiv_surf(MultipoleOrder(), ue_coord, level);
 | 
	
		
			
				|  |  |                Matrix<Real_t> M_tmp(n_surf*aux_ker_dim[0], n_surf*aux_ker_dim[1]);
 | 
	
		
			
				|  |  | -              Kernel<Real_t>::Eval(&src_coord[0], n_surf,
 | 
	
		
			
				|  |  | -                                   &trg_coord[0], n_surf, &(M_tmp[0][0]),
 | 
	
		
			
				|  |  | -                                   aux_kernel.ker_poten, aux_ker_dim);
 | 
	
		
			
				|  |  | +              aux_kernel.BuildMatrix(&src_coord[0], n_surf,
 | 
	
		
			
				|  |  | +                                     &trg_coord[0], n_surf, &(M_tmp[0][0]));
 | 
	
		
			
				|  |  |                M_ue2dc+=M_tmp;
 | 
	
		
			
				|  |  |              }
 | 
	
		
			
				|  |  |  
 | 
	
	
		
			
				|  | @@ -789,9 +784,8 @@ Matrix<typename FMMNode::Real_t>& FMM_Pts<FMMNode>::Precomp(int level, Mat_Type
 | 
	
		
			
				|  |  |                size_t n_eq=equiv_surf.size()/3;
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  |                // Evaluate potential at check surface due to equivalent surface.
 | 
	
		
			
				|  |  | -              Kernel<Real_t>::Eval(&equiv_surf[0], n_eq,
 | 
	
		
			
				|  |  | -                                   &check_surf[0], n_ch, &(M_de2dc[0][0]),
 | 
	
		
			
				|  |  | -                                   aux_kernel.ker_poten,aux_ker_dim);
 | 
	
		
			
				|  |  | +              aux_kernel.BuildMatrix(&equiv_surf[0], n_eq,
 | 
	
		
			
				|  |  | +                                     &check_surf[0], n_ch, &(M_de2dc[0][0]));
 | 
	
		
			
				|  |  |              }
 | 
	
		
			
				|  |  |              Matrix<Real_t> M_ue2dc=M*M_de2dc;
 | 
	
		
			
				|  |  |  
 | 
	
	
		
			
				|  | @@ -827,9 +821,8 @@ Matrix<typename FMMNode::Real_t>& FMM_Pts<FMMNode>::Precomp(int level, Mat_Type
 | 
	
		
			
				|  |  |            { // Evaluate potential at corner due to upward and dnward equivalent surface.
 | 
	
		
			
				|  |  |              { // Error from local expansion.
 | 
	
		
			
				|  |  |                Matrix<Real_t> M_e2pt(n_surf*aux_ker_dim[0],n_corner*aux_ker_dim[1]);
 | 
	
		
			
				|  |  | -              Kernel<Real_t>::Eval(&dn_equiv_surf[0], n_surf,
 | 
	
		
			
				|  |  | -                                   &corner_pts[0], n_corner, &(M_e2pt[0][0]),
 | 
	
		
			
				|  |  | -                                   aux_kernel.ker_poten,aux_ker_dim);
 | 
	
		
			
				|  |  | +              aux_kernel.BuildMatrix(&dn_equiv_surf[0], n_surf,
 | 
	
		
			
				|  |  | +                                     &corner_pts[0], n_corner, &(M_e2pt[0][0]));
 | 
	
		
			
				|  |  |                M_err=M*M_e2pt;
 | 
	
		
			
				|  |  |              }
 | 
	
		
			
				|  |  |              for(size_t k=0;k<4;k++){ // Error from colleagues of root.
 | 
	
	
		
			
				|  | @@ -841,9 +834,8 @@ Matrix<typename FMMNode::Real_t>& FMM_Pts<FMMNode>::Precomp(int level, Mat_Type
 | 
	
		
			
				|  |  |                                      corner_pts[k*COORD_DIM+2]-j2};
 | 
	
		
			
				|  |  |                  if(fabs(pt_coord[0]-0.5)>1.0 || fabs(pt_coord[1]-0.5)>1.0 || fabs(pt_coord[2]-0.5)>1.0){
 | 
	
		
			
				|  |  |                    Matrix<Real_t> M_e2pt(n_surf*aux_ker_dim[0],aux_ker_dim[1]);
 | 
	
		
			
				|  |  | -                  Kernel<Real_t>::Eval(&up_equiv_surf[0], n_surf,
 | 
	
		
			
				|  |  | -                                       &pt_coord[0], 1, &(M_e2pt[0][0]),
 | 
	
		
			
				|  |  | -                                       aux_kernel.ker_poten,aux_ker_dim);
 | 
	
		
			
				|  |  | +                  aux_kernel.BuildMatrix(&up_equiv_surf[0], n_surf,
 | 
	
		
			
				|  |  | +                                         &pt_coord[0], 1, &(M_e2pt[0][0]));
 | 
	
		
			
				|  |  |                    for(size_t i=0;i<M_e2pt.Dim(0);i++)
 | 
	
		
			
				|  |  |                      for(size_t j=0;j<M_e2pt.Dim(1);j++)
 | 
	
		
			
				|  |  |                        M_err[i][k*aux_ker_dim[1]+j]+=M_e2pt[i][j];
 | 
	
	
		
			
				|  | @@ -1921,7 +1913,7 @@ void FMM_Pts<FMMNode>::FFT_UpEquiv(size_t dof, size_t m, size_t ker_dim0, Vector
 | 
	
		
			
				|  |  |          //Compute flops.
 | 
	
		
			
				|  |  |          #ifndef FFTW3_MKL
 | 
	
		
			
				|  |  |          double add, mul, fma;
 | 
	
		
			
				|  |  | -        fftw_flops(vlist_fftplan, &add, &mul, &fma);
 | 
	
		
			
				|  |  | +        FFTW_t<Real_t>::fftw_flops(vlist_fftplan, &add, &mul, &fma);
 | 
	
		
			
				|  |  |          #ifndef __INTEL_OFFLOAD0
 | 
	
		
			
				|  |  |          Profile::Add_FLOP((long long)(add+mul+2*fma));
 | 
	
		
			
				|  |  |          #endif
 | 
	
	
		
			
				|  | @@ -1968,13 +1960,13 @@ void FMM_Pts<FMMNode>::FFT_Check2Equiv(size_t dof, size_t m, size_t ker_dim1, Ve
 | 
	
		
			
				|  |  |      if(!vlist_ifft_flag){
 | 
	
		
			
				|  |  |        //Build FFTW plan.
 | 
	
		
			
				|  |  |        int nnn[3]={(int)n1,(int)n1,(int)n1};
 | 
	
		
			
				|  |  | -      void *fftw_in, *fftw_out;
 | 
	
		
			
				|  |  | -      fftw_in  = fftw_malloc(2*n3_*ker_dim1*sizeof(Real_t)*chld_cnt);
 | 
	
		
			
				|  |  | -      fftw_out = fftw_malloc(  n3 *ker_dim1*sizeof(Real_t)*chld_cnt);
 | 
	
		
			
				|  |  | +      Real_t *fftw_in, *fftw_out;
 | 
	
		
			
				|  |  | +      fftw_in  = mem::aligned_malloc<Real_t>(2*n3_*ker_dim1*chld_cnt);
 | 
	
		
			
				|  |  | +      fftw_out = mem::aligned_malloc<Real_t>(  n3 *ker_dim1*chld_cnt);
 | 
	
		
			
				|  |  |        vlist_ifftplan = FFTW_t<Real_t>::fft_plan_many_dft_c2r(COORD_DIM,nnn,ker_dim1*chld_cnt,
 | 
	
		
			
				|  |  |            (typename FFTW_t<Real_t>::cplx*)fftw_in, NULL, 1, n3_, (Real_t*)(fftw_out),NULL, 1, n3, FFTW_ESTIMATE);
 | 
	
		
			
				|  |  | -      fftw_free(fftw_in);
 | 
	
		
			
				|  |  | -      fftw_free(fftw_out);
 | 
	
		
			
				|  |  | +      mem::aligned_free<Real_t>(fftw_in);
 | 
	
		
			
				|  |  | +      mem::aligned_free<Real_t>(fftw_out);
 | 
	
		
			
				|  |  |        vlist_ifft_flag=true;
 | 
	
		
			
				|  |  |      }
 | 
	
		
			
				|  |  |    }
 | 
	
	
		
			
				|  | @@ -2004,7 +1996,7 @@ void FMM_Pts<FMMNode>::FFT_Check2Equiv(size_t dof, size_t m, size_t ker_dim1, Ve
 | 
	
		
			
				|  |  |          //Compute flops.
 | 
	
		
			
				|  |  |          #ifndef FFTW3_MKL
 | 
	
		
			
				|  |  |          double add, mul, fma;
 | 
	
		
			
				|  |  | -        fftw_flops(vlist_ifftplan, &add, &mul, &fma);
 | 
	
		
			
				|  |  | +        FFTW_t<Real_t>::fftw_flops(vlist_ifftplan, &add, &mul, &fma);
 | 
	
		
			
				|  |  |          #ifndef __INTEL_OFFLOAD0
 | 
	
		
			
				|  |  |          Profile::Add_FLOP((long long)(add+mul+2*fma));
 | 
	
		
			
				|  |  |          #endif
 | 
	
	
		
			
				|  | @@ -2030,6 +2022,396 @@ void FMM_Pts<FMMNode>::FFT_Check2Equiv(size_t dof, size_t m, size_t ker_dim1, Ve
 | 
	
		
			
				|  |  |    }
 | 
	
		
			
				|  |  |  }
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  | +template<class Real_t>
 | 
	
		
			
				|  |  | +inline void matmult_8x8x2(Real_t*& M_, Real_t*& IN0, Real_t*& IN1, Real_t*& OUT0, Real_t*& OUT1){
 | 
	
		
			
				|  |  | +  // Generic code.
 | 
	
		
			
				|  |  | +  Real_t out_reg000, out_reg001, out_reg010, out_reg011;
 | 
	
		
			
				|  |  | +  Real_t out_reg100, out_reg101, out_reg110, out_reg111;
 | 
	
		
			
				|  |  | +  Real_t  in_reg000,  in_reg001,  in_reg010,  in_reg011;
 | 
	
		
			
				|  |  | +  Real_t  in_reg100,  in_reg101,  in_reg110,  in_reg111;
 | 
	
		
			
				|  |  | +  Real_t   m_reg000,   m_reg001,   m_reg010,   m_reg011;
 | 
	
		
			
				|  |  | +  Real_t   m_reg100,   m_reg101,   m_reg110,   m_reg111;
 | 
	
		
			
				|  |  | +  //#pragma unroll
 | 
	
		
			
				|  |  | +  for(int i1=0;i1<8;i1+=2){
 | 
	
		
			
				|  |  | +    Real_t* IN0_=IN0;
 | 
	
		
			
				|  |  | +    Real_t* IN1_=IN1;
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    out_reg000=OUT0[ 0]; out_reg001=OUT0[ 1];
 | 
	
		
			
				|  |  | +    out_reg010=OUT0[ 2]; out_reg011=OUT0[ 3];
 | 
	
		
			
				|  |  | +    out_reg100=OUT1[ 0]; out_reg101=OUT1[ 1];
 | 
	
		
			
				|  |  | +    out_reg110=OUT1[ 2]; out_reg111=OUT1[ 3];
 | 
	
		
			
				|  |  | +    //#pragma unroll
 | 
	
		
			
				|  |  | +    for(int i2=0;i2<8;i2+=2){
 | 
	
		
			
				|  |  | +      m_reg000=M_[ 0]; m_reg001=M_[ 1];
 | 
	
		
			
				|  |  | +      m_reg010=M_[ 2]; m_reg011=M_[ 3];
 | 
	
		
			
				|  |  | +      m_reg100=M_[16]; m_reg101=M_[17];
 | 
	
		
			
				|  |  | +      m_reg110=M_[18]; m_reg111=M_[19];
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      in_reg000=IN0_[0]; in_reg001=IN0_[1];
 | 
	
		
			
				|  |  | +      in_reg010=IN0_[2]; in_reg011=IN0_[3];
 | 
	
		
			
				|  |  | +      in_reg100=IN1_[0]; in_reg101=IN1_[1];
 | 
	
		
			
				|  |  | +      in_reg110=IN1_[2]; in_reg111=IN1_[3];
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      out_reg000 += m_reg000*in_reg000 - m_reg001*in_reg001;
 | 
	
		
			
				|  |  | +      out_reg001 += m_reg000*in_reg001 + m_reg001*in_reg000;
 | 
	
		
			
				|  |  | +      out_reg010 += m_reg010*in_reg000 - m_reg011*in_reg001;
 | 
	
		
			
				|  |  | +      out_reg011 += m_reg010*in_reg001 + m_reg011*in_reg000;
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      out_reg000 += m_reg100*in_reg010 - m_reg101*in_reg011;
 | 
	
		
			
				|  |  | +      out_reg001 += m_reg100*in_reg011 + m_reg101*in_reg010;
 | 
	
		
			
				|  |  | +      out_reg010 += m_reg110*in_reg010 - m_reg111*in_reg011;
 | 
	
		
			
				|  |  | +      out_reg011 += m_reg110*in_reg011 + m_reg111*in_reg010;
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      out_reg100 += m_reg000*in_reg100 - m_reg001*in_reg101;
 | 
	
		
			
				|  |  | +      out_reg101 += m_reg000*in_reg101 + m_reg001*in_reg100;
 | 
	
		
			
				|  |  | +      out_reg110 += m_reg010*in_reg100 - m_reg011*in_reg101;
 | 
	
		
			
				|  |  | +      out_reg111 += m_reg010*in_reg101 + m_reg011*in_reg100;
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      out_reg100 += m_reg100*in_reg110 - m_reg101*in_reg111;
 | 
	
		
			
				|  |  | +      out_reg101 += m_reg100*in_reg111 + m_reg101*in_reg110;
 | 
	
		
			
				|  |  | +      out_reg110 += m_reg110*in_reg110 - m_reg111*in_reg111;
 | 
	
		
			
				|  |  | +      out_reg111 += m_reg110*in_reg111 + m_reg111*in_reg110;
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      M_+=32; // Jump to (column+2).
 | 
	
		
			
				|  |  | +      IN0_+=4;
 | 
	
		
			
				|  |  | +      IN1_+=4;
 | 
	
		
			
				|  |  | +    }
 | 
	
		
			
				|  |  | +    OUT0[ 0]=out_reg000; OUT0[ 1]=out_reg001;
 | 
	
		
			
				|  |  | +    OUT0[ 2]=out_reg010; OUT0[ 3]=out_reg011;
 | 
	
		
			
				|  |  | +    OUT1[ 0]=out_reg100; OUT1[ 1]=out_reg101;
 | 
	
		
			
				|  |  | +    OUT1[ 2]=out_reg110; OUT1[ 3]=out_reg111;
 | 
	
		
			
				|  |  | +    M_+=4-64*2; // Jump back to first column (row+2).
 | 
	
		
			
				|  |  | +    OUT0+=4;
 | 
	
		
			
				|  |  | +    OUT1+=4;
 | 
	
		
			
				|  |  | +  }
 | 
	
		
			
				|  |  | +}
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +#if defined(__AVX__) || defined(__SSE3__)
 | 
	
		
			
				|  |  | +template<>
 | 
	
		
			
				|  |  | +inline void matmult_8x8x2<double>(double*& M_, double*& IN0, double*& IN1, double*& OUT0, double*& OUT1){
 | 
	
		
			
				|  |  | +#ifdef __AVX__ //AVX code.
 | 
	
		
			
				|  |  | +  __m256d out00,out01,out10,out11;
 | 
	
		
			
				|  |  | +  __m256d out20,out21,out30,out31;
 | 
	
		
			
				|  |  | +  double* in0__ = IN0;
 | 
	
		
			
				|  |  | +  double* in1__ = IN1;
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  out00 = _mm256_load_pd(OUT0);
 | 
	
		
			
				|  |  | +  out01 = _mm256_load_pd(OUT1);
 | 
	
		
			
				|  |  | +  out10 = _mm256_load_pd(OUT0+4);
 | 
	
		
			
				|  |  | +  out11 = _mm256_load_pd(OUT1+4);
 | 
	
		
			
				|  |  | +  out20 = _mm256_load_pd(OUT0+8);
 | 
	
		
			
				|  |  | +  out21 = _mm256_load_pd(OUT1+8);
 | 
	
		
			
				|  |  | +  out30 = _mm256_load_pd(OUT0+12);
 | 
	
		
			
				|  |  | +  out31 = _mm256_load_pd(OUT1+12);
 | 
	
		
			
				|  |  | +  for(int i2=0;i2<8;i2+=2){
 | 
	
		
			
				|  |  | +    __m256d m00;
 | 
	
		
			
				|  |  | +    __m256d ot00;
 | 
	
		
			
				|  |  | +    __m256d mt0,mtt0;
 | 
	
		
			
				|  |  | +    __m256d in00,in00_r,in01,in01_r;
 | 
	
		
			
				|  |  | +    in00 = _mm256_broadcast_pd((const __m128d*)in0__);
 | 
	
		
			
				|  |  | +    in00_r = _mm256_permute_pd(in00,5);
 | 
	
		
			
				|  |  | +    in01 = _mm256_broadcast_pd((const __m128d*)in1__);
 | 
	
		
			
				|  |  | +    in01_r = _mm256_permute_pd(in01,5);
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    m00 = _mm256_load_pd(M_);
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    mt0 = _mm256_unpacklo_pd(m00,m00);
 | 
	
		
			
				|  |  | +    ot00 = _mm256_mul_pd(mt0,in00);
 | 
	
		
			
				|  |  | +    mtt0 = _mm256_unpackhi_pd(m00,m00);
 | 
	
		
			
				|  |  | +    out00 = _mm256_add_pd(out00,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in00_r)));
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    ot00 = _mm256_mul_pd(mt0,in01);
 | 
	
		
			
				|  |  | +    out01 = _mm256_add_pd(out01,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in01_r)));
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    m00 = _mm256_load_pd(M_+4);
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    mt0 = _mm256_unpacklo_pd(m00,m00);
 | 
	
		
			
				|  |  | +    ot00 = _mm256_mul_pd(mt0,in00);
 | 
	
		
			
				|  |  | +    mtt0 = _mm256_unpackhi_pd(m00,m00);
 | 
	
		
			
				|  |  | +    out10 = _mm256_add_pd(out10,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in00_r)));
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    ot00 = _mm256_mul_pd(mt0,in01);
 | 
	
		
			
				|  |  | +    out11 = _mm256_add_pd(out11,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in01_r)));
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    m00 = _mm256_load_pd(M_+8);
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    mt0 = _mm256_unpacklo_pd(m00,m00);
 | 
	
		
			
				|  |  | +    ot00 = _mm256_mul_pd(mt0,in00);
 | 
	
		
			
				|  |  | +    mtt0 = _mm256_unpackhi_pd(m00,m00);
 | 
	
		
			
				|  |  | +    out20 = _mm256_add_pd(out20,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in00_r)));
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    ot00 = _mm256_mul_pd(mt0,in01);
 | 
	
		
			
				|  |  | +    out21 = _mm256_add_pd(out21,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in01_r)));
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    m00 = _mm256_load_pd(M_+12);
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    mt0 = _mm256_unpacklo_pd(m00,m00);
 | 
	
		
			
				|  |  | +    ot00 = _mm256_mul_pd(mt0,in00);
 | 
	
		
			
				|  |  | +    mtt0 = _mm256_unpackhi_pd(m00,m00);
 | 
	
		
			
				|  |  | +    out30 = _mm256_add_pd(out30,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in00_r)));
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    ot00 = _mm256_mul_pd(mt0,in01);
 | 
	
		
			
				|  |  | +    out31 = _mm256_add_pd(out31,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in01_r)));
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    in00 = _mm256_broadcast_pd((const __m128d*) (in0__+2));
 | 
	
		
			
				|  |  | +    in00_r = _mm256_permute_pd(in00,5);
 | 
	
		
			
				|  |  | +    in01 = _mm256_broadcast_pd((const __m128d*) (in1__+2));
 | 
	
		
			
				|  |  | +    in01_r = _mm256_permute_pd(in01,5);
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    m00 = _mm256_load_pd(M_+16);
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    mt0 = _mm256_unpacklo_pd(m00,m00);
 | 
	
		
			
				|  |  | +    ot00 = _mm256_mul_pd(mt0,in00);
 | 
	
		
			
				|  |  | +    mtt0 = _mm256_unpackhi_pd(m00,m00);
 | 
	
		
			
				|  |  | +    out00 = _mm256_add_pd(out00,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in00_r)));
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    ot00 = _mm256_mul_pd(mt0,in01);
 | 
	
		
			
				|  |  | +    out01 = _mm256_add_pd(out01,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in01_r)));
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    m00 = _mm256_load_pd(M_+20);
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    mt0 = _mm256_unpacklo_pd(m00,m00);
 | 
	
		
			
				|  |  | +    ot00 = _mm256_mul_pd(mt0,in00);
 | 
	
		
			
				|  |  | +    mtt0 = _mm256_unpackhi_pd(m00,m00);
 | 
	
		
			
				|  |  | +    out10 = _mm256_add_pd(out10,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in00_r)));
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    ot00 = _mm256_mul_pd(mt0,in01);
 | 
	
		
			
				|  |  | +    out11 = _mm256_add_pd(out11,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in01_r)));
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    m00 = _mm256_load_pd(M_+24);
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    mt0 = _mm256_unpacklo_pd(m00,m00);
 | 
	
		
			
				|  |  | +    ot00 = _mm256_mul_pd(mt0,in00);
 | 
	
		
			
				|  |  | +    mtt0 = _mm256_unpackhi_pd(m00,m00);
 | 
	
		
			
				|  |  | +    out20 = _mm256_add_pd(out20,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in00_r)));
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    ot00 = _mm256_mul_pd(mt0,in01);
 | 
	
		
			
				|  |  | +    out21 = _mm256_add_pd(out21,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in01_r)));
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    m00 = _mm256_load_pd(M_+28);
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    mt0 = _mm256_unpacklo_pd(m00,m00);
 | 
	
		
			
				|  |  | +    ot00 = _mm256_mul_pd(mt0,in00);
 | 
	
		
			
				|  |  | +    mtt0 = _mm256_unpackhi_pd(m00,m00);
 | 
	
		
			
				|  |  | +    out30 = _mm256_add_pd(out30,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in00_r)));
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    ot00 = _mm256_mul_pd(mt0,in01);
 | 
	
		
			
				|  |  | +    out31 = _mm256_add_pd(out31,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in01_r)));
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    M_ += 32;
 | 
	
		
			
				|  |  | +    in0__ += 4;
 | 
	
		
			
				|  |  | +    in1__ += 4;
 | 
	
		
			
				|  |  | +  }
 | 
	
		
			
				|  |  | +  _mm256_store_pd(OUT0,out00);
 | 
	
		
			
				|  |  | +  _mm256_store_pd(OUT1,out01);
 | 
	
		
			
				|  |  | +  _mm256_store_pd(OUT0+4,out10);
 | 
	
		
			
				|  |  | +  _mm256_store_pd(OUT1+4,out11);
 | 
	
		
			
				|  |  | +  _mm256_store_pd(OUT0+8,out20);
 | 
	
		
			
				|  |  | +  _mm256_store_pd(OUT1+8,out21);
 | 
	
		
			
				|  |  | +  _mm256_store_pd(OUT0+12,out30);
 | 
	
		
			
				|  |  | +  _mm256_store_pd(OUT1+12,out31);
 | 
	
		
			
				|  |  | +#elif defined __SSE3__ // SSE code.
 | 
	
		
			
				|  |  | +  __m128d out00, out01, out10, out11;
 | 
	
		
			
				|  |  | +  __m128d in00, in01, in10, in11;
 | 
	
		
			
				|  |  | +  __m128d m00, m01, m10, m11;
 | 
	
		
			
				|  |  | +  //#pragma unroll
 | 
	
		
			
				|  |  | +  for(int i1=0;i1<8;i1+=2){
 | 
	
		
			
				|  |  | +    double* IN0_=IN0;
 | 
	
		
			
				|  |  | +    double* IN1_=IN1;
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    out00 =_mm_load_pd (OUT0  );
 | 
	
		
			
				|  |  | +    out10 =_mm_load_pd (OUT0+2);
 | 
	
		
			
				|  |  | +    out01 =_mm_load_pd (OUT1  );
 | 
	
		
			
				|  |  | +    out11 =_mm_load_pd (OUT1+2);
 | 
	
		
			
				|  |  | +    //#pragma unroll
 | 
	
		
			
				|  |  | +    for(int i2=0;i2<8;i2+=2){
 | 
	
		
			
				|  |  | +      m00 =_mm_load1_pd (M_   );
 | 
	
		
			
				|  |  | +      m10 =_mm_load1_pd (M_+ 2);
 | 
	
		
			
				|  |  | +      m01 =_mm_load1_pd (M_+16);
 | 
	
		
			
				|  |  | +      m11 =_mm_load1_pd (M_+18);
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      in00 =_mm_load_pd (IN0_  );
 | 
	
		
			
				|  |  | +      in10 =_mm_load_pd (IN0_+2);
 | 
	
		
			
				|  |  | +      in01 =_mm_load_pd (IN1_  );
 | 
	
		
			
				|  |  | +      in11 =_mm_load_pd (IN1_+2);
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      out00 = _mm_add_pd   (out00, _mm_mul_pd(m00 , in00 ));
 | 
	
		
			
				|  |  | +      out00 = _mm_add_pd   (out00, _mm_mul_pd(m01 , in10 ));
 | 
	
		
			
				|  |  | +      out01 = _mm_add_pd   (out01, _mm_mul_pd(m00 , in01 ));
 | 
	
		
			
				|  |  | +      out01 = _mm_add_pd   (out01, _mm_mul_pd(m01 , in11 ));
 | 
	
		
			
				|  |  | +      out10 = _mm_add_pd   (out10, _mm_mul_pd(m10 , in00 ));
 | 
	
		
			
				|  |  | +      out10 = _mm_add_pd   (out10, _mm_mul_pd(m11 , in10 ));
 | 
	
		
			
				|  |  | +      out11 = _mm_add_pd   (out11, _mm_mul_pd(m10 , in01 ));
 | 
	
		
			
				|  |  | +      out11 = _mm_add_pd   (out11, _mm_mul_pd(m11 , in11 ));
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      m00 =_mm_load1_pd (M_+   1);
 | 
	
		
			
				|  |  | +      m10 =_mm_load1_pd (M_+ 2+1);
 | 
	
		
			
				|  |  | +      m01 =_mm_load1_pd (M_+16+1);
 | 
	
		
			
				|  |  | +      m11 =_mm_load1_pd (M_+18+1);
 | 
	
		
			
				|  |  | +      in00 =_mm_shuffle_pd (in00,in00,_MM_SHUFFLE2(0,1));
 | 
	
		
			
				|  |  | +      in01 =_mm_shuffle_pd (in01,in01,_MM_SHUFFLE2(0,1));
 | 
	
		
			
				|  |  | +      in10 =_mm_shuffle_pd (in10,in10,_MM_SHUFFLE2(0,1));
 | 
	
		
			
				|  |  | +      in11 =_mm_shuffle_pd (in11,in11,_MM_SHUFFLE2(0,1));
 | 
	
		
			
				|  |  | +      out00 = _mm_addsub_pd(out00, _mm_mul_pd(m00, in00));
 | 
	
		
			
				|  |  | +      out00 = _mm_addsub_pd(out00, _mm_mul_pd(m01, in10));
 | 
	
		
			
				|  |  | +      out01 = _mm_addsub_pd(out01, _mm_mul_pd(m00, in01));
 | 
	
		
			
				|  |  | +      out01 = _mm_addsub_pd(out01, _mm_mul_pd(m01, in11));
 | 
	
		
			
				|  |  | +      out10 = _mm_addsub_pd(out10, _mm_mul_pd(m10, in00));
 | 
	
		
			
				|  |  | +      out10 = _mm_addsub_pd(out10, _mm_mul_pd(m11, in10));
 | 
	
		
			
				|  |  | +      out11 = _mm_addsub_pd(out11, _mm_mul_pd(m10, in01));
 | 
	
		
			
				|  |  | +      out11 = _mm_addsub_pd(out11, _mm_mul_pd(m11, in11));
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      M_+=32; // Jump to (column+2).
 | 
	
		
			
				|  |  | +      IN0_+=4;
 | 
	
		
			
				|  |  | +      IN1_+=4;
 | 
	
		
			
				|  |  | +    }
 | 
	
		
			
				|  |  | +    _mm_store_pd (OUT0  ,out00);
 | 
	
		
			
				|  |  | +    _mm_store_pd (OUT0+2,out10);
 | 
	
		
			
				|  |  | +    _mm_store_pd (OUT1  ,out01);
 | 
	
		
			
				|  |  | +    _mm_store_pd (OUT1+2,out11);
 | 
	
		
			
				|  |  | +    M_+=4-64*2; // Jump back to first column (row+2).
 | 
	
		
			
				|  |  | +    OUT0+=4;
 | 
	
		
			
				|  |  | +    OUT1+=4;
 | 
	
		
			
				|  |  | +  }
 | 
	
		
			
				|  |  | +#endif
 | 
	
		
			
				|  |  | +}
 | 
	
		
			
				|  |  | +#endif
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +#if defined(__SSE3__)
 | 
	
		
			
				|  |  | +template<>
 | 
	
		
			
				|  |  | +inline void matmult_8x8x2<float>(float*& M_, float*& IN0, float*& IN1, float*& OUT0, float*& OUT1){
 | 
	
		
			
				|  |  | +#if defined __SSE3__ // SSE code.
 | 
	
		
			
				|  |  | +  __m128 out00,out01,out10,out11;
 | 
	
		
			
				|  |  | +  __m128 out20,out21,out30,out31;
 | 
	
		
			
				|  |  | +  float* in0__ = IN0;
 | 
	
		
			
				|  |  | +  float* in1__ = IN1;
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  out00 = _mm_load_ps(OUT0);
 | 
	
		
			
				|  |  | +  out01 = _mm_load_ps(OUT1);
 | 
	
		
			
				|  |  | +  out10 = _mm_load_ps(OUT0+4);
 | 
	
		
			
				|  |  | +  out11 = _mm_load_ps(OUT1+4);
 | 
	
		
			
				|  |  | +  out20 = _mm_load_ps(OUT0+8);
 | 
	
		
			
				|  |  | +  out21 = _mm_load_ps(OUT1+8);
 | 
	
		
			
				|  |  | +  out30 = _mm_load_ps(OUT0+12);
 | 
	
		
			
				|  |  | +  out31 = _mm_load_ps(OUT1+12);
 | 
	
		
			
				|  |  | +  for(int i2=0;i2<8;i2+=2){
 | 
	
		
			
				|  |  | +    __m128 m00;
 | 
	
		
			
				|  |  | +    __m128 ot00;
 | 
	
		
			
				|  |  | +    __m128 mt0,mtt0;
 | 
	
		
			
				|  |  | +    __m128 in00,in00_r,in01,in01_r;
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    in00 = _mm_castpd_ps(_mm_load_pd1((const double*)in0__));
 | 
	
		
			
				|  |  | +    in00_r = _mm_shuffle_ps(in00,in00,_MM_SHUFFLE(2,3,0,1));
 | 
	
		
			
				|  |  | +    in01 = _mm_castpd_ps(_mm_load_pd1((const double*)in1__));
 | 
	
		
			
				|  |  | +    in01_r = _mm_shuffle_ps(in01,in01,_MM_SHUFFLE(2,3,0,1));
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    m00 = _mm_load_ps(M_);
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    mt0  = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(2,2,0,0));
 | 
	
		
			
				|  |  | +    out00= _mm_add_ps   (out00,_mm_mul_ps( mt0,in00  ));
 | 
	
		
			
				|  |  | +    mtt0 = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(3,3,1,1));
 | 
	
		
			
				|  |  | +    out00= _mm_addsub_ps(out00,_mm_mul_ps(mtt0,in00_r));
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    out01 = _mm_add_ps   (out01,_mm_mul_ps( mt0,in01  ));
 | 
	
		
			
				|  |  | +    out01 = _mm_addsub_ps(out01,_mm_mul_ps(mtt0,in01_r));
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    m00 = _mm_load_ps(M_+4);
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    mt0  = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(2,2,0,0));
 | 
	
		
			
				|  |  | +    out10= _mm_add_ps   (out10,_mm_mul_ps( mt0,in00  ));
 | 
	
		
			
				|  |  | +    mtt0 = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(3,3,1,1));
 | 
	
		
			
				|  |  | +    out10= _mm_addsub_ps(out10,_mm_mul_ps(mtt0,in00_r));
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    out11 = _mm_add_ps   (out11,_mm_mul_ps( mt0,in01  ));
 | 
	
		
			
				|  |  | +    out11 = _mm_addsub_ps(out11,_mm_mul_ps(mtt0,in01_r));
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    m00 = _mm_load_ps(M_+8);
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    mt0  = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(2,2,0,0));
 | 
	
		
			
				|  |  | +    out20= _mm_add_ps   (out20,_mm_mul_ps( mt0,in00  ));
 | 
	
		
			
				|  |  | +    mtt0 = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(3,3,1,1));
 | 
	
		
			
				|  |  | +    out20= _mm_addsub_ps(out20,_mm_mul_ps(mtt0,in00_r));
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    out21 = _mm_add_ps   (out21,_mm_mul_ps( mt0,in01  ));
 | 
	
		
			
				|  |  | +    out21 = _mm_addsub_ps(out21,_mm_mul_ps(mtt0,in01_r));
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    m00 = _mm_load_ps(M_+12);
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    mt0  = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(2,2,0,0));
 | 
	
		
			
				|  |  | +    out30= _mm_add_ps   (out30,_mm_mul_ps( mt0,  in00));
 | 
	
		
			
				|  |  | +    mtt0 = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(3,3,1,1));
 | 
	
		
			
				|  |  | +    out30= _mm_addsub_ps(out30,_mm_mul_ps(mtt0,in00_r));
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    out31 = _mm_add_ps   (out31,_mm_mul_ps( mt0,in01  ));
 | 
	
		
			
				|  |  | +    out31 = _mm_addsub_ps(out31,_mm_mul_ps(mtt0,in01_r));
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    in00 = _mm_castpd_ps(_mm_load_pd1((const double*) (in0__+2)));
 | 
	
		
			
				|  |  | +    in00_r = _mm_shuffle_ps(in00,in00,_MM_SHUFFLE(2,3,0,1));
 | 
	
		
			
				|  |  | +    in01 = _mm_castpd_ps(_mm_load_pd1((const double*) (in1__+2)));
 | 
	
		
			
				|  |  | +    in01_r = _mm_shuffle_ps(in01,in01,_MM_SHUFFLE(2,3,0,1));
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    m00 = _mm_load_ps(M_+16);
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    mt0  = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(2,2,0,0));
 | 
	
		
			
				|  |  | +    out00= _mm_add_ps   (out00,_mm_mul_ps( mt0,in00  ));
 | 
	
		
			
				|  |  | +    mtt0 = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(3,3,1,1));
 | 
	
		
			
				|  |  | +    out00= _mm_addsub_ps(out00,_mm_mul_ps(mtt0,in00_r));
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    out01 = _mm_add_ps   (out01,_mm_mul_ps( mt0,in01  ));
 | 
	
		
			
				|  |  | +    out01 = _mm_addsub_ps(out01,_mm_mul_ps(mtt0,in01_r));
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    m00 = _mm_load_ps(M_+20);
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    mt0  = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(2,2,0,0));
 | 
	
		
			
				|  |  | +    out10= _mm_add_ps   (out10,_mm_mul_ps( mt0,in00  ));
 | 
	
		
			
				|  |  | +    mtt0 = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(3,3,1,1));
 | 
	
		
			
				|  |  | +    out10= _mm_addsub_ps(out10,_mm_mul_ps(mtt0,in00_r));
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    out11 = _mm_add_ps   (out11,_mm_mul_ps( mt0,in01 ));
 | 
	
		
			
				|  |  | +    out11 = _mm_addsub_ps(out11,_mm_mul_ps(mtt0,in01_r));
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    m00 = _mm_load_ps(M_+24);
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    mt0  = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(2,2,0,0));
 | 
	
		
			
				|  |  | +    out20= _mm_add_ps   (out20,_mm_mul_ps( mt0,in00  ));
 | 
	
		
			
				|  |  | +    mtt0 = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(3,3,1,1));
 | 
	
		
			
				|  |  | +    out20= _mm_addsub_ps(out20,_mm_mul_ps(mtt0,in00_r));
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    out21 = _mm_add_ps   (out21,_mm_mul_ps( mt0,in01  ));
 | 
	
		
			
				|  |  | +    out21 = _mm_addsub_ps(out21,_mm_mul_ps(mtt0,in01_r));
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    m00 = _mm_load_ps(M_+28);
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    mt0  = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(2,2,0,0));
 | 
	
		
			
				|  |  | +    out30= _mm_add_ps   (out30,_mm_mul_ps( mt0,in00  ));
 | 
	
		
			
				|  |  | +    mtt0 = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(3,3,1,1));
 | 
	
		
			
				|  |  | +    out30= _mm_addsub_ps(out30,_mm_mul_ps(mtt0,in00_r));
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    out31 = _mm_add_ps   (out31,_mm_mul_ps( mt0,in01  ));
 | 
	
		
			
				|  |  | +    out31 = _mm_addsub_ps(out31,_mm_mul_ps(mtt0,in01_r));
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    M_ += 32;
 | 
	
		
			
				|  |  | +    in0__ += 4;
 | 
	
		
			
				|  |  | +    in1__ += 4;
 | 
	
		
			
				|  |  | +  }
 | 
	
		
			
				|  |  | +  _mm_store_ps(OUT0,out00);
 | 
	
		
			
				|  |  | +  _mm_store_ps(OUT1,out01);
 | 
	
		
			
				|  |  | +  _mm_store_ps(OUT0+4,out10);
 | 
	
		
			
				|  |  | +  _mm_store_ps(OUT1+4,out11);
 | 
	
		
			
				|  |  | +  _mm_store_ps(OUT0+8,out20);
 | 
	
		
			
				|  |  | +  _mm_store_ps(OUT1+8,out21);
 | 
	
		
			
				|  |  | +  _mm_store_ps(OUT0+12,out30);
 | 
	
		
			
				|  |  | +  _mm_store_ps(OUT1+12,out31);
 | 
	
		
			
				|  |  | +#endif
 | 
	
		
			
				|  |  | +}
 | 
	
		
			
				|  |  | +#endif
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  |  template <class Real_t>
 | 
	
		
			
				|  |  |  void VListHadamard(size_t dof, size_t M_dim, size_t ker_dim0, size_t ker_dim1, Vector<size_t>& interac_dsp,
 | 
	
		
			
				|  |  |      Vector<size_t>& interac_vec, Vector<Real_t*>& precomp_mat, Vector<Real_t>& fft_in, Vector<Real_t>& fft_out){
 | 
	
	
		
			
				|  | @@ -2051,6 +2433,7 @@ void VListHadamard(size_t dof, size_t M_dim, size_t ker_dim0, size_t ker_dim1, V
 | 
	
		
			
				|  |  |    // Build list of interaction pairs (in, out vectors).
 | 
	
		
			
				|  |  |    size_t mat_cnt=precomp_mat.Dim();
 | 
	
		
			
				|  |  |    size_t blk1_cnt=interac_dsp.Dim()/mat_cnt;
 | 
	
		
			
				|  |  | +  const size_t V_BLK_SIZE=V_BLK_CACHE*64/sizeof(Real_t);
 | 
	
		
			
				|  |  |    Real_t** IN_ =new Real_t*[2*V_BLK_SIZE*blk1_cnt*mat_cnt];
 | 
	
		
			
				|  |  |    Real_t** OUT_=new Real_t*[2*V_BLK_SIZE*blk1_cnt*mat_cnt];
 | 
	
		
			
				|  |  |    #pragma omp parallel for
 | 
	
	
		
			
				|  | @@ -2113,257 +2496,7 @@ void VListHadamard(size_t dof, size_t M_dim, size_t ker_dim0, size_t ker_dim1, V
 | 
	
		
			
				|  |  |            }
 | 
	
		
			
				|  |  |  #endif
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  | -#ifdef __AVX__ //AVX code.
 | 
	
		
			
				|  |  | -          __m256d out00,out01,out10,out11;
 | 
	
		
			
				|  |  | -          __m256d out20,out21,out30,out31;
 | 
	
		
			
				|  |  | -          double* in0__ = IN0;
 | 
	
		
			
				|  |  | -          double* in1__ = IN1;
 | 
	
		
			
				|  |  | -
 | 
	
		
			
				|  |  | -          out00 = _mm256_load_pd(OUT0);
 | 
	
		
			
				|  |  | -          out01 = _mm256_load_pd(OUT1);
 | 
	
		
			
				|  |  | -          out10 = _mm256_load_pd(OUT0+4);
 | 
	
		
			
				|  |  | -          out11 = _mm256_load_pd(OUT1+4);
 | 
	
		
			
				|  |  | -          out20 = _mm256_load_pd(OUT0+8);
 | 
	
		
			
				|  |  | -          out21 = _mm256_load_pd(OUT1+8);
 | 
	
		
			
				|  |  | -          out30 = _mm256_load_pd(OUT0+12);
 | 
	
		
			
				|  |  | -          out31 = _mm256_load_pd(OUT1+12);
 | 
	
		
			
				|  |  | -          for(int i2=0;i2<8;i2+=2){
 | 
	
		
			
				|  |  | -            __m256d m00;
 | 
	
		
			
				|  |  | -            __m256d ot00;
 | 
	
		
			
				|  |  | -            __m256d mt0,mtt0;
 | 
	
		
			
				|  |  | -            __m256d in00,in00_r,in01,in01_r;
 | 
	
		
			
				|  |  | -            in00 = _mm256_broadcast_pd((const __m128d*)in0__);
 | 
	
		
			
				|  |  | -            in00_r = _mm256_permute_pd(in00,5);
 | 
	
		
			
				|  |  | -            in01 = _mm256_broadcast_pd((const __m128d*)in1__);
 | 
	
		
			
				|  |  | -            in01_r = _mm256_permute_pd(in01,5);
 | 
	
		
			
				|  |  | -
 | 
	
		
			
				|  |  | -            m00 = _mm256_load_pd(M_);
 | 
	
		
			
				|  |  | -
 | 
	
		
			
				|  |  | -            mt0 = _mm256_unpacklo_pd(m00,m00);
 | 
	
		
			
				|  |  | -            ot00 = _mm256_mul_pd(mt0,in00);
 | 
	
		
			
				|  |  | -            mtt0 = _mm256_unpackhi_pd(m00,m00);
 | 
	
		
			
				|  |  | -            out00 = _mm256_add_pd(out00,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in00_r)));
 | 
	
		
			
				|  |  | -
 | 
	
		
			
				|  |  | -            ot00 = _mm256_mul_pd(mt0,in01);
 | 
	
		
			
				|  |  | -            out01 = _mm256_add_pd(out01,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in01_r)));
 | 
	
		
			
				|  |  | -
 | 
	
		
			
				|  |  | -            m00 = _mm256_load_pd(M_+4);
 | 
	
		
			
				|  |  | -
 | 
	
		
			
				|  |  | -            mt0 = _mm256_unpacklo_pd(m00,m00);
 | 
	
		
			
				|  |  | -            ot00 = _mm256_mul_pd(mt0,in00);
 | 
	
		
			
				|  |  | -            mtt0 = _mm256_unpackhi_pd(m00,m00);
 | 
	
		
			
				|  |  | -            out10 = _mm256_add_pd(out10,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in00_r)));
 | 
	
		
			
				|  |  | -
 | 
	
		
			
				|  |  | -            ot00 = _mm256_mul_pd(mt0,in01);
 | 
	
		
			
				|  |  | -            out11 = _mm256_add_pd(out11,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in01_r)));
 | 
	
		
			
				|  |  | -
 | 
	
		
			
				|  |  | -            m00 = _mm256_load_pd(M_+8);
 | 
	
		
			
				|  |  | -
 | 
	
		
			
				|  |  | -            mt0 = _mm256_unpacklo_pd(m00,m00);
 | 
	
		
			
				|  |  | -            ot00 = _mm256_mul_pd(mt0,in00);
 | 
	
		
			
				|  |  | -            mtt0 = _mm256_unpackhi_pd(m00,m00);
 | 
	
		
			
				|  |  | -            out20 = _mm256_add_pd(out20,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in00_r)));
 | 
	
		
			
				|  |  | -
 | 
	
		
			
				|  |  | -            ot00 = _mm256_mul_pd(mt0,in01);
 | 
	
		
			
				|  |  | -            out21 = _mm256_add_pd(out21,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in01_r)));
 | 
	
		
			
				|  |  | -
 | 
	
		
			
				|  |  | -            m00 = _mm256_load_pd(M_+12);
 | 
	
		
			
				|  |  | -
 | 
	
		
			
				|  |  | -            mt0 = _mm256_unpacklo_pd(m00,m00);
 | 
	
		
			
				|  |  | -            ot00 = _mm256_mul_pd(mt0,in00);
 | 
	
		
			
				|  |  | -            mtt0 = _mm256_unpackhi_pd(m00,m00);
 | 
	
		
			
				|  |  | -            out30 = _mm256_add_pd(out30,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in00_r)));
 | 
	
		
			
				|  |  | -
 | 
	
		
			
				|  |  | -            ot00 = _mm256_mul_pd(mt0,in01);
 | 
	
		
			
				|  |  | -            out31 = _mm256_add_pd(out31,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in01_r)));
 | 
	
		
			
				|  |  | -
 | 
	
		
			
				|  |  | -
 | 
	
		
			
				|  |  | -            in00 = _mm256_broadcast_pd((const __m128d*) (in0__+2));
 | 
	
		
			
				|  |  | -            in00_r = _mm256_permute_pd(in00,5);
 | 
	
		
			
				|  |  | -            in01 = _mm256_broadcast_pd((const __m128d*) (in1__+2));
 | 
	
		
			
				|  |  | -            in01_r = _mm256_permute_pd(in01,5);
 | 
	
		
			
				|  |  | -
 | 
	
		
			
				|  |  | -
 | 
	
		
			
				|  |  | -            m00 = _mm256_load_pd(M_+16);
 | 
	
		
			
				|  |  | -
 | 
	
		
			
				|  |  | -            mt0 = _mm256_unpacklo_pd(m00,m00);
 | 
	
		
			
				|  |  | -            ot00 = _mm256_mul_pd(mt0,in00);
 | 
	
		
			
				|  |  | -            mtt0 = _mm256_unpackhi_pd(m00,m00);
 | 
	
		
			
				|  |  | -            out00 = _mm256_add_pd(out00,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in00_r)));
 | 
	
		
			
				|  |  | -
 | 
	
		
			
				|  |  | -            ot00 = _mm256_mul_pd(mt0,in01);
 | 
	
		
			
				|  |  | -            out01 = _mm256_add_pd(out01,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in01_r)));
 | 
	
		
			
				|  |  | -
 | 
	
		
			
				|  |  | -
 | 
	
		
			
				|  |  | -            m00 = _mm256_load_pd(M_+20);
 | 
	
		
			
				|  |  | -
 | 
	
		
			
				|  |  | -            mt0 = _mm256_unpacklo_pd(m00,m00);
 | 
	
		
			
				|  |  | -            ot00 = _mm256_mul_pd(mt0,in00);
 | 
	
		
			
				|  |  | -            mtt0 = _mm256_unpackhi_pd(m00,m00);
 | 
	
		
			
				|  |  | -            out10 = _mm256_add_pd(out10,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in00_r)));
 | 
	
		
			
				|  |  | -
 | 
	
		
			
				|  |  | -            ot00 = _mm256_mul_pd(mt0,in01);
 | 
	
		
			
				|  |  | -            out11 = _mm256_add_pd(out11,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in01_r)));
 | 
	
		
			
				|  |  | -
 | 
	
		
			
				|  |  | -            m00 = _mm256_load_pd(M_+24);
 | 
	
		
			
				|  |  | -
 | 
	
		
			
				|  |  | -            mt0 = _mm256_unpacklo_pd(m00,m00);
 | 
	
		
			
				|  |  | -            ot00 = _mm256_mul_pd(mt0,in00);
 | 
	
		
			
				|  |  | -            mtt0 = _mm256_unpackhi_pd(m00,m00);
 | 
	
		
			
				|  |  | -            out20 = _mm256_add_pd(out20,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in00_r)));
 | 
	
		
			
				|  |  | -
 | 
	
		
			
				|  |  | -            ot00 = _mm256_mul_pd(mt0,in01);
 | 
	
		
			
				|  |  | -            out21 = _mm256_add_pd(out21,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in01_r)));
 | 
	
		
			
				|  |  | -
 | 
	
		
			
				|  |  | -            m00 = _mm256_load_pd(M_+28);
 | 
	
		
			
				|  |  | -
 | 
	
		
			
				|  |  | -            mt0 = _mm256_unpacklo_pd(m00,m00);
 | 
	
		
			
				|  |  | -            ot00 = _mm256_mul_pd(mt0,in00);
 | 
	
		
			
				|  |  | -            mtt0 = _mm256_unpackhi_pd(m00,m00);
 | 
	
		
			
				|  |  | -            out30 = _mm256_add_pd(out30,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in00_r)));
 | 
	
		
			
				|  |  | -
 | 
	
		
			
				|  |  | -            ot00 = _mm256_mul_pd(mt0,in01);
 | 
	
		
			
				|  |  | -            out31 = _mm256_add_pd(out31,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in01_r)));
 | 
	
		
			
				|  |  | -
 | 
	
		
			
				|  |  | -            M_ += 32;
 | 
	
		
			
				|  |  | -            in0__ += 4;
 | 
	
		
			
				|  |  | -            in1__ += 4;
 | 
	
		
			
				|  |  | -          }
 | 
	
		
			
				|  |  | -          _mm256_store_pd(OUT0,out00);
 | 
	
		
			
				|  |  | -          _mm256_store_pd(OUT1,out01);
 | 
	
		
			
				|  |  | -          _mm256_store_pd(OUT0+4,out10);
 | 
	
		
			
				|  |  | -          _mm256_store_pd(OUT1+4,out11);
 | 
	
		
			
				|  |  | -          _mm256_store_pd(OUT0+8,out20);
 | 
	
		
			
				|  |  | -          _mm256_store_pd(OUT1+8,out21);
 | 
	
		
			
				|  |  | -          _mm256_store_pd(OUT0+12,out30);
 | 
	
		
			
				|  |  | -          _mm256_store_pd(OUT1+12,out31);
 | 
	
		
			
				|  |  | -
 | 
	
		
			
				|  |  | -#elif defined __SSE3__ // SSE code.
 | 
	
		
			
				|  |  | -          __m128d out00, out01, out10, out11;
 | 
	
		
			
				|  |  | -          __m128d in00, in01, in10, in11;
 | 
	
		
			
				|  |  | -          __m128d m00, m01, m10, m11;
 | 
	
		
			
				|  |  | -          //#pragma unroll
 | 
	
		
			
				|  |  | -          for(int i1=0;i1<8;i1+=2){
 | 
	
		
			
				|  |  | -            double* IN0_=IN0;
 | 
	
		
			
				|  |  | -            double* IN1_=IN1;
 | 
	
		
			
				|  |  | -
 | 
	
		
			
				|  |  | -            out00 =_mm_load_pd (OUT0  );
 | 
	
		
			
				|  |  | -            out10 =_mm_load_pd (OUT0+2);
 | 
	
		
			
				|  |  | -            out01 =_mm_load_pd (OUT1  );
 | 
	
		
			
				|  |  | -            out11 =_mm_load_pd (OUT1+2);
 | 
	
		
			
				|  |  | -            //#pragma unroll
 | 
	
		
			
				|  |  | -            for(int i2=0;i2<8;i2+=2){
 | 
	
		
			
				|  |  | -              m00 =_mm_load1_pd (M_   );
 | 
	
		
			
				|  |  | -              m10 =_mm_load1_pd (M_+ 2);
 | 
	
		
			
				|  |  | -              m01 =_mm_load1_pd (M_+16);
 | 
	
		
			
				|  |  | -              m11 =_mm_load1_pd (M_+18);
 | 
	
		
			
				|  |  | -
 | 
	
		
			
				|  |  | -              in00 =_mm_load_pd (IN0_  );
 | 
	
		
			
				|  |  | -              in10 =_mm_load_pd (IN0_+2);
 | 
	
		
			
				|  |  | -              in01 =_mm_load_pd (IN1_  );
 | 
	
		
			
				|  |  | -              in11 =_mm_load_pd (IN1_+2);
 | 
	
		
			
				|  |  | -
 | 
	
		
			
				|  |  | -              out00 = _mm_add_pd   (out00, _mm_mul_pd(m00 , in00 ));
 | 
	
		
			
				|  |  | -              out00 = _mm_add_pd   (out00, _mm_mul_pd(m01 , in10 ));
 | 
	
		
			
				|  |  | -              out01 = _mm_add_pd   (out01, _mm_mul_pd(m00 , in01 ));
 | 
	
		
			
				|  |  | -              out01 = _mm_add_pd   (out01, _mm_mul_pd(m01 , in11 ));
 | 
	
		
			
				|  |  | -              out10 = _mm_add_pd   (out10, _mm_mul_pd(m10 , in00 ));
 | 
	
		
			
				|  |  | -              out10 = _mm_add_pd   (out10, _mm_mul_pd(m11 , in10 ));
 | 
	
		
			
				|  |  | -              out11 = _mm_add_pd   (out11, _mm_mul_pd(m10 , in01 ));
 | 
	
		
			
				|  |  | -              out11 = _mm_add_pd   (out11, _mm_mul_pd(m11 , in11 ));
 | 
	
		
			
				|  |  | -
 | 
	
		
			
				|  |  | -
 | 
	
		
			
				|  |  | -              m00 =_mm_load1_pd (M_+   1);
 | 
	
		
			
				|  |  | -              m10 =_mm_load1_pd (M_+ 2+1);
 | 
	
		
			
				|  |  | -              m01 =_mm_load1_pd (M_+16+1);
 | 
	
		
			
				|  |  | -              m11 =_mm_load1_pd (M_+18+1);
 | 
	
		
			
				|  |  | -              in00 =_mm_shuffle_pd (in00,in00,_MM_SHUFFLE2(0,1));
 | 
	
		
			
				|  |  | -              in01 =_mm_shuffle_pd (in01,in01,_MM_SHUFFLE2(0,1));
 | 
	
		
			
				|  |  | -              in10 =_mm_shuffle_pd (in10,in10,_MM_SHUFFLE2(0,1));
 | 
	
		
			
				|  |  | -              in11 =_mm_shuffle_pd (in11,in11,_MM_SHUFFLE2(0,1));
 | 
	
		
			
				|  |  | -              out00 = _mm_addsub_pd(out00, _mm_mul_pd(m00, in00));
 | 
	
		
			
				|  |  | -              out00 = _mm_addsub_pd(out00, _mm_mul_pd(m01, in10));
 | 
	
		
			
				|  |  | -              out01 = _mm_addsub_pd(out01, _mm_mul_pd(m00, in01));
 | 
	
		
			
				|  |  | -              out01 = _mm_addsub_pd(out01, _mm_mul_pd(m01, in11));
 | 
	
		
			
				|  |  | -              out10 = _mm_addsub_pd(out10, _mm_mul_pd(m10, in00));
 | 
	
		
			
				|  |  | -              out10 = _mm_addsub_pd(out10, _mm_mul_pd(m11, in10));
 | 
	
		
			
				|  |  | -              out11 = _mm_addsub_pd(out11, _mm_mul_pd(m10, in01));
 | 
	
		
			
				|  |  | -              out11 = _mm_addsub_pd(out11, _mm_mul_pd(m11, in11));
 | 
	
		
			
				|  |  | -
 | 
	
		
			
				|  |  | -              M_+=32; // Jump to (column+2).
 | 
	
		
			
				|  |  | -              IN0_+=4;
 | 
	
		
			
				|  |  | -              IN1_+=4;
 | 
	
		
			
				|  |  | -            }
 | 
	
		
			
				|  |  | -            _mm_store_pd (OUT0  ,out00);
 | 
	
		
			
				|  |  | -            _mm_store_pd (OUT0+2,out10);
 | 
	
		
			
				|  |  | -            _mm_store_pd (OUT1  ,out01);
 | 
	
		
			
				|  |  | -            _mm_store_pd (OUT1+2,out11);
 | 
	
		
			
				|  |  | -            M_+=4-64*2; // Jump back to first column (row+2).
 | 
	
		
			
				|  |  | -            OUT0+=4;
 | 
	
		
			
				|  |  | -            OUT1+=4;
 | 
	
		
			
				|  |  | -          }
 | 
	
		
			
				|  |  | -
 | 
	
		
			
				|  |  | -#else // Generic code.
 | 
	
		
			
				|  |  | -          Real_t out_reg000, out_reg001, out_reg010, out_reg011;
 | 
	
		
			
				|  |  | -          Real_t out_reg100, out_reg101, out_reg110, out_reg111;
 | 
	
		
			
				|  |  | -          Real_t  in_reg000,  in_reg001,  in_reg010,  in_reg011;
 | 
	
		
			
				|  |  | -          Real_t  in_reg100,  in_reg101,  in_reg110,  in_reg111;
 | 
	
		
			
				|  |  | -          Real_t   m_reg000,   m_reg001,   m_reg010,   m_reg011;
 | 
	
		
			
				|  |  | -          Real_t   m_reg100,   m_reg101,   m_reg110,   m_reg111;
 | 
	
		
			
				|  |  | -          //#pragma unroll
 | 
	
		
			
				|  |  | -          for(int i1=0;i1<8;i1+=2){
 | 
	
		
			
				|  |  | -            Real_t* IN0_=IN0;
 | 
	
		
			
				|  |  | -            Real_t* IN1_=IN1;
 | 
	
		
			
				|  |  | -
 | 
	
		
			
				|  |  | -            out_reg000=OUT0[ 0]; out_reg001=OUT0[ 1];
 | 
	
		
			
				|  |  | -            out_reg010=OUT0[ 2]; out_reg011=OUT0[ 3];
 | 
	
		
			
				|  |  | -            out_reg100=OUT1[ 0]; out_reg101=OUT1[ 1];
 | 
	
		
			
				|  |  | -            out_reg110=OUT1[ 2]; out_reg111=OUT1[ 3];
 | 
	
		
			
				|  |  | -            //#pragma unroll
 | 
	
		
			
				|  |  | -            for(int i2=0;i2<8;i2+=2){
 | 
	
		
			
				|  |  | -              m_reg000=M_[ 0]; m_reg001=M_[ 1];
 | 
	
		
			
				|  |  | -              m_reg010=M_[ 2]; m_reg011=M_[ 3];
 | 
	
		
			
				|  |  | -              m_reg100=M_[16]; m_reg101=M_[17];
 | 
	
		
			
				|  |  | -              m_reg110=M_[18]; m_reg111=M_[19];
 | 
	
		
			
				|  |  | -
 | 
	
		
			
				|  |  | -              in_reg000=IN0_[0]; in_reg001=IN0_[1];
 | 
	
		
			
				|  |  | -              in_reg010=IN0_[2]; in_reg011=IN0_[3];
 | 
	
		
			
				|  |  | -              in_reg100=IN1_[0]; in_reg101=IN1_[1];
 | 
	
		
			
				|  |  | -              in_reg110=IN1_[2]; in_reg111=IN1_[3];
 | 
	
		
			
				|  |  | -
 | 
	
		
			
				|  |  | -              out_reg000 += m_reg000*in_reg000 - m_reg001*in_reg001;
 | 
	
		
			
				|  |  | -              out_reg001 += m_reg000*in_reg001 + m_reg001*in_reg000;
 | 
	
		
			
				|  |  | -              out_reg010 += m_reg010*in_reg000 - m_reg011*in_reg001;
 | 
	
		
			
				|  |  | -              out_reg011 += m_reg010*in_reg001 + m_reg011*in_reg000;
 | 
	
		
			
				|  |  | -
 | 
	
		
			
				|  |  | -              out_reg000 += m_reg100*in_reg010 - m_reg101*in_reg011;
 | 
	
		
			
				|  |  | -              out_reg001 += m_reg100*in_reg011 + m_reg101*in_reg010;
 | 
	
		
			
				|  |  | -              out_reg010 += m_reg110*in_reg010 - m_reg111*in_reg011;
 | 
	
		
			
				|  |  | -              out_reg011 += m_reg110*in_reg011 + m_reg111*in_reg010;
 | 
	
		
			
				|  |  | -
 | 
	
		
			
				|  |  | -              out_reg100 += m_reg000*in_reg100 - m_reg001*in_reg101;
 | 
	
		
			
				|  |  | -              out_reg101 += m_reg000*in_reg101 + m_reg001*in_reg100;
 | 
	
		
			
				|  |  | -              out_reg110 += m_reg010*in_reg100 - m_reg011*in_reg101;
 | 
	
		
			
				|  |  | -              out_reg111 += m_reg010*in_reg101 + m_reg011*in_reg100;
 | 
	
		
			
				|  |  | -
 | 
	
		
			
				|  |  | -              out_reg100 += m_reg100*in_reg110 - m_reg101*in_reg111;
 | 
	
		
			
				|  |  | -              out_reg101 += m_reg100*in_reg111 + m_reg101*in_reg110;
 | 
	
		
			
				|  |  | -              out_reg110 += m_reg110*in_reg110 - m_reg111*in_reg111;
 | 
	
		
			
				|  |  | -              out_reg111 += m_reg110*in_reg111 + m_reg111*in_reg110;
 | 
	
		
			
				|  |  | -
 | 
	
		
			
				|  |  | -              M_+=32; // Jump to (column+2).
 | 
	
		
			
				|  |  | -              IN0_+=4;
 | 
	
		
			
				|  |  | -              IN1_+=4;
 | 
	
		
			
				|  |  | -            }
 | 
	
		
			
				|  |  | -            OUT0[ 0]=out_reg000; OUT0[ 1]=out_reg001;
 | 
	
		
			
				|  |  | -            OUT0[ 2]=out_reg010; OUT0[ 3]=out_reg011;
 | 
	
		
			
				|  |  | -            OUT1[ 0]=out_reg100; OUT1[ 1]=out_reg101;
 | 
	
		
			
				|  |  | -            OUT1[ 2]=out_reg110; OUT1[ 3]=out_reg111;
 | 
	
		
			
				|  |  | -            M_+=4-64*2; // Jump back to first column (row+2).
 | 
	
		
			
				|  |  | -            OUT0+=4;
 | 
	
		
			
				|  |  | -            OUT1+=4;
 | 
	
		
			
				|  |  | -          }
 | 
	
		
			
				|  |  | -#endif
 | 
	
		
			
				|  |  | +          matmult_8x8x2(M_, IN0, IN1, OUT0, OUT1);
 | 
	
		
			
				|  |  |          }
 | 
	
		
			
				|  |  |          M += M_dim*128;
 | 
	
		
			
				|  |  |        }
 | 
	
	
		
			
				|  | @@ -2506,7 +2639,7 @@ void FMM_Pts<FMMNode>::V_ListSetup(SetupData<Real_t>&  setup_data, std::vector<M
 | 
	
		
			
				|  |  |          std::vector<FMMNode*>& nodes_out_=nodes_blk_out[blk0];
 | 
	
		
			
				|  |  |          for(size_t i=0;i<nodes_in_.size();i++) nodes_in_[i]->node_id=i;
 | 
	
		
			
				|  |  |          { // Next blocking level.
 | 
	
		
			
				|  |  | -          size_t n_blk1=nodes_out_.size()*(ker_dim0+ker_dim1)/V_BLK_SIZE; //64 vectors should fit in L1.
 | 
	
		
			
				|  |  | +          size_t n_blk1=nodes_out_.size()*(ker_dim0+ker_dim1)*sizeof(Real_t)/(64*V_BLK_CACHE);
 | 
	
		
			
				|  |  |            if(n_blk1==0) n_blk1=1;
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  |            size_t interac_dsp_=0;
 | 
	
	
		
			
				|  | @@ -2584,15 +2717,12 @@ void FMM_Pts<FMMNode>::V_ListSetup(SetupData<Real_t>&  setup_data, std::vector<M
 | 
	
		
			
				|  |  |  template <class FMMNode>
 | 
	
		
			
				|  |  |  void FMM_Pts<FMMNode>::V_List     (SetupData<Real_t>&  setup_data, bool device){
 | 
	
		
			
				|  |  |    assert(!device); //Can not run on accelerator yet.
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  int np;
 | 
	
		
			
				|  |  | +  MPI_Comm_size(comm,&np);
 | 
	
		
			
				|  |  |    if(setup_data.interac_data.Dim(0)==0 || setup_data.interac_data.Dim(1)==0){
 | 
	
		
			
				|  |  | -    Profile::Tic("Host2Device",&this->comm,false,25);
 | 
	
		
			
				|  |  | -    Profile::Toc();
 | 
	
		
			
				|  |  | -    Profile::Tic("FFT",&comm,false,100);
 | 
	
		
			
				|  |  | -    Profile::Toc();
 | 
	
		
			
				|  |  | -    Profile::Tic("HadamardProduct",&comm,false,100);
 | 
	
		
			
				|  |  | -    Profile::Toc();
 | 
	
		
			
				|  |  | -    Profile::Tic("IFFT",&comm,false,100);
 | 
	
		
			
				|  |  | -    Profile::Toc();
 | 
	
		
			
				|  |  | +    if(np>1) Profile::Tic("Host2Device",&this->comm,false,25);
 | 
	
		
			
				|  |  | +    if(np>1) Profile::Toc();
 | 
	
		
			
				|  |  |      return;
 | 
	
		
			
				|  |  |    }
 | 
	
		
			
				|  |  |  
 | 
	
	
		
			
				|  | @@ -2697,10 +2827,10 @@ void FMM_Pts<FMMNode>::V_List     (SetupData<Real_t>&  setup_data, bool device){
 | 
	
		
			
				|  |  |        Vector<Real_t>  buffer(buffer_dim, (Real_t*)&buff[(input_dim+output_dim)*sizeof(Real_t)],false);
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  |        { //  FFT
 | 
	
		
			
				|  |  | -        //Profile::Tic("FFT",&comm,false,100);
 | 
	
		
			
				|  |  | +        if(np==1) Profile::Tic("FFT",&comm,false,100);
 | 
	
		
			
				|  |  |          Vector<Real_t>  input_data_( input_data.dim[0]* input_data.dim[1],  input_data[0], false);
 | 
	
		
			
				|  |  |          FFT_UpEquiv(dof, m, ker_dim0,  fft_vec[blk0],  input_data_, fft_in, buffer);
 | 
	
		
			
				|  |  | -        //Profile::Toc();
 | 
	
		
			
				|  |  | +        if(np==1) Profile::Toc();
 | 
	
		
			
				|  |  |        }
 | 
	
		
			
				|  |  |        { // Hadamard
 | 
	
		
			
				|  |  |  #ifdef PVFMM_HAVE_PAPI
 | 
	
	
		
			
				|  | @@ -2709,9 +2839,9 @@ void FMM_Pts<FMMNode>::V_List     (SetupData<Real_t>&  setup_data, bool device){
 | 
	
		
			
				|  |  |          if (PAPI_start(EventSet) != PAPI_OK) std::cout << "handle_error3" << std::endl;
 | 
	
		
			
				|  |  |  #endif
 | 
	
		
			
				|  |  |  #endif
 | 
	
		
			
				|  |  | -        //Profile::Tic("HadamardProduct",&comm,false,100);
 | 
	
		
			
				|  |  | +        if(np==1) Profile::Tic("HadamardProduct",&comm,false,100);
 | 
	
		
			
				|  |  |          VListHadamard<Real_t>(dof, M_dim, ker_dim0, ker_dim1, interac_dsp[blk0], interac_vec[blk0], precomp_mat, fft_in, fft_out);
 | 
	
		
			
				|  |  | -        //Profile::Toc();
 | 
	
		
			
				|  |  | +        if(np==1) Profile::Toc();
 | 
	
		
			
				|  |  |  #ifdef PVFMM_HAVE_PAPI
 | 
	
		
			
				|  |  |  #ifdef __VERBOSE__
 | 
	
		
			
				|  |  |          if (PAPI_stop(EventSet, values) != PAPI_OK) std::cout << "handle_error4" << std::endl;
 | 
	
	
		
			
				|  | @@ -2720,11 +2850,11 @@ void FMM_Pts<FMMNode>::V_List     (SetupData<Real_t>&  setup_data, bool device){
 | 
	
		
			
				|  |  |  #endif
 | 
	
		
			
				|  |  |        }
 | 
	
		
			
				|  |  |        { // IFFT
 | 
	
		
			
				|  |  | -        //Profile::Tic("IFFT",&comm,false,100);
 | 
	
		
			
				|  |  | +        if(np==1) Profile::Tic("IFFT",&comm,false,100);
 | 
	
		
			
				|  |  |          Matrix<Real_t> M(M_d.dim[0],M_d.dim[1],M_d[0],false);
 | 
	
		
			
				|  |  |          Vector<Real_t> output_data_(output_data.dim[0]*output_data.dim[1], output_data[0], false);
 | 
	
		
			
				|  |  |          FFT_Check2Equiv(dof, m, ker_dim1, ifft_vec[blk0], fft_out, output_data_, buffer, M);
 | 
	
		
			
				|  |  | -        //Profile::Toc();
 | 
	
		
			
				|  |  | +        if(np==1) Profile::Toc();
 | 
	
		
			
				|  |  |        }
 | 
	
		
			
				|  |  |      }
 | 
	
		
			
				|  |  |    }
 | 
	
	
		
			
				|  | @@ -3107,7 +3237,7 @@ void FMM_Pts<FMMNode>::EvalListPts(SetupData<Real_t>& setup_data, bool device){
 | 
	
		
			
				|  |  |              }
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  |              single_layer_kernel(                s_coord   , src_cnt[i][2*j+0], input_data[0]+src_value[i][2*j+0], dof,
 | 
	
		
			
				|  |  | -                                coord_data[0]+trg_coord[i], trg_cnt[i]       , t_value);
 | 
	
		
			
				|  |  | +                                coord_data[0]+trg_coord[i], trg_cnt[i]       , t_value, NULL);
 | 
	
		
			
				|  |  |              interac_cnt+=src_cnt[i][2*j+0]*trg_cnt[i];
 | 
	
		
			
				|  |  |            }
 | 
	
		
			
				|  |  |            if(ptr_double_layer_kernel!=(size_t)NULL){// Double layer kernel
 | 
	
	
		
			
				|  | @@ -3120,7 +3250,7 @@ void FMM_Pts<FMMNode>::EvalListPts(SetupData<Real_t>& setup_data, bool device){
 | 
	
		
			
				|  |  |              }
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  |              double_layer_kernel(                s_coord   , src_cnt[i][2*j+1], input_data[0]+src_value[i][2*j+1], dof,
 | 
	
		
			
				|  |  | -                                coord_data[0]+trg_coord[i], trg_cnt[i]       , t_value);
 | 
	
		
			
				|  |  | +                                coord_data[0]+trg_coord[i], trg_cnt[i]       , t_value, NULL);
 | 
	
		
			
				|  |  |              interac_cnt+=src_cnt[i][2*j+1]*trg_cnt[i];
 | 
	
		
			
				|  |  |            }
 | 
	
		
			
				|  |  |          }
 |