|
@@ -15,22 +15,7 @@
|
|
|
#include <vector.hpp>
|
|
|
#include <matrix.hpp>
|
|
|
#include <precomp_mat.hpp>
|
|
|
-
|
|
|
-#ifdef __SSE__
|
|
|
-#include <xmmintrin.h>
|
|
|
-#endif
|
|
|
-#ifdef __SSE2__
|
|
|
-#include <emmintrin.h>
|
|
|
-#endif
|
|
|
-#ifdef __SSE3__
|
|
|
-#include <pmmintrin.h>
|
|
|
-#endif
|
|
|
-#ifdef __AVX__
|
|
|
-#include <immintrin.h>
|
|
|
-#endif
|
|
|
-#if defined(__MIC__)
|
|
|
-#include <immintrin.h>
|
|
|
-#endif
|
|
|
+#include <intrin_wrapper.hpp>
|
|
|
|
|
|
namespace pvfmm{
|
|
|
|
|
@@ -702,153 +687,219 @@ void Kernel<T>::BuildMatrix(T* r_src, int src_cnt,
|
|
|
}
|
|
|
}
|
|
|
|
|
|
-////////////////////////////////////////////////////////////////////////////////
|
|
|
-//////// LAPLACE KERNEL ////////
|
|
|
-////////////////////////////////////////////////////////////////////////////////
|
|
|
|
|
|
/**
|
|
|
- * \brief Green's function for the Poisson's equation. Kernel tensor
|
|
|
- * dimension = 1x1.
|
|
|
+ * \brief Generic kernel which rearranges data for vectorization, calls the
|
|
|
+ * actual uKernel and copies data to the output array in the original order.
|
|
|
*/
|
|
|
-template <class T>
|
|
|
-void laplace_poten(T* r_src, int src_cnt, T* v_src, int dof, T* r_trg, int trg_cnt, T* k_out, mem::MemoryManager* mem_mgr){
|
|
|
-#ifndef __MIC__
|
|
|
- Profile::Add_FLOP((long long)trg_cnt*(long long)src_cnt*(12*dof));
|
|
|
-#endif
|
|
|
-
|
|
|
- const T OOFP = 1.0/(4.0*const_pi<T>());
|
|
|
- for(int t=0;t<trg_cnt;t++){
|
|
|
- for(int i=0;i<dof;i++){
|
|
|
- T p=0;
|
|
|
- for(int s=0;s<src_cnt;s++){
|
|
|
- T dX_reg=r_trg[3*t ]-r_src[3*s ];
|
|
|
- T dY_reg=r_trg[3*t+1]-r_src[3*s+1];
|
|
|
- T dZ_reg=r_trg[3*t+2]-r_src[3*s+2];
|
|
|
- T invR = (dX_reg*dX_reg+dY_reg*dY_reg+dZ_reg*dZ_reg);
|
|
|
- if (invR!=0) invR = 1.0/sqrt(invR);
|
|
|
- p += v_src[s*dof+i]*invR;
|
|
|
- }
|
|
|
- k_out[t*dof+i] += p*OOFP;
|
|
|
+template <class Real_t, int SRC_DIM, int TRG_DIM, void (*uKernel)(Matrix<Real_t>&, Matrix<Real_t>&, Matrix<Real_t>&, Matrix<Real_t>&)>
|
|
|
+void generic_kernel(Real_t* r_src, int src_cnt, Real_t* v_src, int dof, Real_t* r_trg, int trg_cnt, Real_t* v_trg, mem::MemoryManager* mem_mgr){
|
|
|
+ assert(dof==1);
|
|
|
+ int VecLen=8;
|
|
|
+ if(sizeof(Real_t)==sizeof( float)) VecLen=8;
|
|
|
+ if(sizeof(Real_t)==sizeof(double)) VecLen=4;
|
|
|
+
|
|
|
+ #define STACK_BUFF_SIZE 4096
|
|
|
+ Real_t stack_buff[STACK_BUFF_SIZE+MEM_ALIGN];
|
|
|
+ Real_t* buff=NULL;
|
|
|
+
|
|
|
+ Matrix<Real_t> src_coord;
|
|
|
+ Matrix<Real_t> src_value;
|
|
|
+ Matrix<Real_t> trg_coord;
|
|
|
+ Matrix<Real_t> trg_value;
|
|
|
+ { // Rearrange data in src_coord, src_value, trg_coord, trg_value
|
|
|
+ size_t src_cnt_, trg_cnt_; // counts after zero padding
|
|
|
+ src_cnt_=((src_cnt+VecLen-1)/VecLen)*VecLen;
|
|
|
+ trg_cnt_=((trg_cnt+VecLen-1)/VecLen)*VecLen;
|
|
|
+
|
|
|
+ size_t buff_size=src_cnt_*(COORD_DIM+SRC_DIM)+
|
|
|
+ trg_cnt_*(COORD_DIM+TRG_DIM);
|
|
|
+ if(buff_size>STACK_BUFF_SIZE){ // Allocate buff
|
|
|
+ buff=mem::aligned_new<Real_t>(buff_size, mem_mgr);
|
|
|
}
|
|
|
- }
|
|
|
-}
|
|
|
-
|
|
|
-template <class T>
|
|
|
-void laplace_poten_(T* r_src, int src_cnt, T* v_src, int dof, T* r_trg, int trg_cnt, T* k_out, mem::MemoryManager* mem_mgr){
|
|
|
-//void laplace_poten(T* r_src_, int src_cnt, T* v_src_, int dof, T* r_trg_, int trg_cnt, T* k_out_){
|
|
|
-// int dim=3; //Only supporting 3D
|
|
|
-// T* r_src=mem::aligned_malloc<T>(src_cnt*dim);
|
|
|
-// T* r_trg=mem::aligned_malloc<T>(trg_cnt*dim);
|
|
|
-// T* v_src=mem::aligned_malloc<T>(src_cnt );
|
|
|
-// T* k_out=mem::aligned_malloc<T>(trg_cnt );
|
|
|
-// mem::memcopy(r_src,r_src_,src_cnt*dim*sizeof(T));
|
|
|
-// mem::memcopy(r_trg,r_trg_,trg_cnt*dim*sizeof(T));
|
|
|
-// mem::memcopy(v_src,v_src_,src_cnt *sizeof(T));
|
|
|
-// mem::memcopy(k_out,k_out_,trg_cnt *sizeof(T));
|
|
|
-
|
|
|
- #define EVAL_BLKSZ 32
|
|
|
- #define MAX_DOF 100
|
|
|
- //Compute source to target interactions.
|
|
|
- const T OOFP = 1.0/(4.0*const_pi<T>());
|
|
|
|
|
|
- if(dof==1){
|
|
|
- for (int t_=0; t_<trg_cnt; t_+=EVAL_BLKSZ)
|
|
|
- for (int s_=0; s_<src_cnt; s_+=EVAL_BLKSZ){
|
|
|
- int src_blk=s_+EVAL_BLKSZ; src_blk=(src_blk>src_cnt?src_cnt:src_blk);
|
|
|
- int trg_blk=t_+EVAL_BLKSZ; trg_blk=(trg_blk>trg_cnt?trg_cnt:trg_blk);
|
|
|
- for(int t=t_;t<trg_blk;t++){
|
|
|
- T p=0;
|
|
|
- for(int s=s_;s<src_blk;s++){
|
|
|
- T dX_reg=r_trg[3*t ]-r_src[3*s ];
|
|
|
- T dY_reg=r_trg[3*t+1]-r_src[3*s+1];
|
|
|
- T dZ_reg=r_trg[3*t+2]-r_src[3*s+2];
|
|
|
- T invR = (dX_reg*dX_reg+dY_reg*dY_reg+dZ_reg*dZ_reg);
|
|
|
- if (invR!=0) invR = 1.0/sqrt(invR);
|
|
|
- p += v_src[s]*invR;
|
|
|
+ Real_t* buff_ptr=buff;
|
|
|
+ if(!buff_ptr){ // use stack_buff
|
|
|
+ uintptr_t ptr=(uintptr_t)stack_buff;
|
|
|
+ static uintptr_t ALIGN_MINUS_ONE=MEM_ALIGN-1;
|
|
|
+ static uintptr_t NOT_ALIGN_MINUS_ONE=~ALIGN_MINUS_ONE;
|
|
|
+ ptr=((ptr+ALIGN_MINUS_ONE) & NOT_ALIGN_MINUS_ONE);
|
|
|
+ buff_ptr=(Real_t*)ptr;
|
|
|
+ }
|
|
|
+ src_coord.ReInit(COORD_DIM, src_cnt_,buff_ptr,false); buff_ptr+=COORD_DIM*src_cnt_;
|
|
|
+ src_value.ReInit( SRC_DIM, src_cnt_,buff_ptr,false); buff_ptr+= SRC_DIM*src_cnt_;
|
|
|
+ trg_coord.ReInit(COORD_DIM, trg_cnt_,buff_ptr,false); buff_ptr+=COORD_DIM*trg_cnt_;
|
|
|
+ trg_value.ReInit( TRG_DIM, trg_cnt_,buff_ptr,false);//buff_ptr+= TRG_DIM*trg_cnt_;
|
|
|
+ { // Set src_coord
|
|
|
+ size_t i=0;
|
|
|
+ for( ;i<src_cnt ;i++){
|
|
|
+ for(size_t j=0;j<COORD_DIM;j++){
|
|
|
+ src_coord[j][i]=r_src[i*COORD_DIM+j];
|
|
|
+ }
|
|
|
+ }
|
|
|
+ for( ;i<src_cnt_;i++){
|
|
|
+ for(size_t j=0;j<COORD_DIM;j++){
|
|
|
+ src_coord[j][i]=0;
|
|
|
}
|
|
|
- k_out[t] += p*OOFP;
|
|
|
}
|
|
|
}
|
|
|
- }else if(dof==2){
|
|
|
- T p[MAX_DOF];
|
|
|
- for (int t_=0; t_<trg_cnt; t_+=EVAL_BLKSZ)
|
|
|
- for (int s_=0; s_<src_cnt; s_+=EVAL_BLKSZ){
|
|
|
- int src_blk=s_+EVAL_BLKSZ; src_blk=(src_blk>src_cnt?src_cnt:src_blk);
|
|
|
- int trg_blk=t_+EVAL_BLKSZ; trg_blk=(trg_blk>trg_cnt?trg_cnt:trg_blk);
|
|
|
- for(int t=t_;t<trg_blk;t++){
|
|
|
- p[0]=0; p[1]=0;
|
|
|
- for(int s=s_;s<src_blk;s++){
|
|
|
- T dX_reg=r_trg[3*t ]-r_src[3*s ];
|
|
|
- T dY_reg=r_trg[3*t+1]-r_src[3*s+1];
|
|
|
- T dZ_reg=r_trg[3*t+2]-r_src[3*s+2];
|
|
|
- T invR = (dX_reg*dX_reg+dY_reg*dY_reg+dZ_reg*dZ_reg);
|
|
|
- if (invR!=0) invR = 1.0/sqrt(invR);
|
|
|
- p[0] += v_src[s*dof+0]*invR;
|
|
|
- p[1] += v_src[s*dof+1]*invR;
|
|
|
+ { // Set src_value
|
|
|
+ size_t i=0;
|
|
|
+ for( ;i<src_cnt ;i++){
|
|
|
+ for(size_t j=0;j<SRC_DIM;j++){
|
|
|
+ src_value[j][i]=v_src[i*SRC_DIM+j];
|
|
|
+ }
|
|
|
+ }
|
|
|
+ for( ;i<src_cnt_;i++){
|
|
|
+ for(size_t j=0;j<SRC_DIM;j++){
|
|
|
+ src_value[j][i]=0;
|
|
|
}
|
|
|
- k_out[t*dof+0] += p[0]*OOFP;
|
|
|
- k_out[t*dof+1] += p[1]*OOFP;
|
|
|
}
|
|
|
}
|
|
|
- }else if(dof==3){
|
|
|
- T p[MAX_DOF];
|
|
|
- for (int t_=0; t_<trg_cnt; t_+=EVAL_BLKSZ)
|
|
|
- for (int s_=0; s_<src_cnt; s_+=EVAL_BLKSZ){
|
|
|
- int src_blk=s_+EVAL_BLKSZ; src_blk=(src_blk>src_cnt?src_cnt:src_blk);
|
|
|
- int trg_blk=t_+EVAL_BLKSZ; trg_blk=(trg_blk>trg_cnt?trg_cnt:trg_blk);
|
|
|
- for(int t=t_;t<trg_blk;t++){
|
|
|
- p[0]=0; p[1]=0; p[2]=0;
|
|
|
- for(int s=s_;s<src_blk;s++){
|
|
|
- T dX_reg=r_trg[3*t ]-r_src[3*s ];
|
|
|
- T dY_reg=r_trg[3*t+1]-r_src[3*s+1];
|
|
|
- T dZ_reg=r_trg[3*t+2]-r_src[3*s+2];
|
|
|
- T invR = (dX_reg*dX_reg+dY_reg*dY_reg+dZ_reg*dZ_reg);
|
|
|
- if (invR!=0) invR = 1.0/sqrt(invR);
|
|
|
- p[0] += v_src[s*dof+0]*invR;
|
|
|
- p[1] += v_src[s*dof+1]*invR;
|
|
|
- p[2] += v_src[s*dof+2]*invR;
|
|
|
+ { // Set trg_coord
|
|
|
+ size_t i=0;
|
|
|
+ for( ;i<trg_cnt ;i++){
|
|
|
+ for(size_t j=0;j<COORD_DIM;j++){
|
|
|
+ trg_coord[j][i]=r_trg[i*COORD_DIM+j];
|
|
|
+ }
|
|
|
+ }
|
|
|
+ for( ;i<trg_cnt_;i++){
|
|
|
+ for(size_t j=0;j<COORD_DIM;j++){
|
|
|
+ trg_coord[j][i]=0;
|
|
|
}
|
|
|
- k_out[t*dof+0] += p[0]*OOFP;
|
|
|
- k_out[t*dof+1] += p[1]*OOFP;
|
|
|
- k_out[t*dof+2] += p[2]*OOFP;
|
|
|
}
|
|
|
}
|
|
|
- }else{
|
|
|
- T p[MAX_DOF];
|
|
|
- for (int t_=0; t_<trg_cnt; t_+=EVAL_BLKSZ)
|
|
|
- for (int s_=0; s_<src_cnt; s_+=EVAL_BLKSZ){
|
|
|
- int src_blk=s_+EVAL_BLKSZ; src_blk=(src_blk>src_cnt?src_cnt:src_blk);
|
|
|
- int trg_blk=t_+EVAL_BLKSZ; trg_blk=(trg_blk>trg_cnt?trg_cnt:trg_blk);
|
|
|
- for(int t=t_;t<trg_blk;t++){
|
|
|
- for(int i=0;i<dof;i++) p[i]=0;
|
|
|
- for(int s=s_;s<src_blk;s++){
|
|
|
- T dX_reg=r_trg[3*t ]-r_src[3*s ];
|
|
|
- T dY_reg=r_trg[3*t+1]-r_src[3*s+1];
|
|
|
- T dZ_reg=r_trg[3*t+2]-r_src[3*s+2];
|
|
|
- T invR = (dX_reg*dX_reg+dY_reg*dY_reg+dZ_reg*dZ_reg);
|
|
|
- if (invR!=0) invR = 1.0/sqrt(invR);
|
|
|
- for(int i=0;i<dof;i++)
|
|
|
- p[i] += v_src[s*dof+i]*invR;
|
|
|
+ { // Set trg_value
|
|
|
+ size_t i=0;
|
|
|
+ for( ;i<trg_cnt_;i++){
|
|
|
+ for(size_t j=0;j<TRG_DIM;j++){
|
|
|
+ trg_value[j][i]=0;
|
|
|
}
|
|
|
- for(int i=0;i<dof;i++)
|
|
|
- k_out[t*dof+i] += p[i]*OOFP;
|
|
|
}
|
|
|
}
|
|
|
}
|
|
|
-#ifndef __MIC__
|
|
|
- Profile::Add_FLOP((long long)trg_cnt*(long long)src_cnt*(10+2*dof));
|
|
|
-#endif
|
|
|
- #undef MAX_DOF
|
|
|
- #undef EVAL_BLKSZ
|
|
|
-
|
|
|
-// for (int t=0; t<trg_cnt; t++)
|
|
|
-// k_out_[t] += k_out[t];
|
|
|
-// mem::aligned_free(r_src);
|
|
|
-// mem::aligned_free(r_trg);
|
|
|
-// mem::aligned_free(v_src);
|
|
|
-// mem::aligned_free(k_out);
|
|
|
+ uKernel(src_coord,src_value,trg_coord,trg_value);
|
|
|
+ { // Set v_trg
|
|
|
+ for(size_t i=0;i<trg_cnt ;i++){
|
|
|
+ for(size_t j=0;j<TRG_DIM;j++){
|
|
|
+ v_trg[i*TRG_DIM+j]+=trg_value[j][i];
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ if(buff){ // Free memory: buff
|
|
|
+ mem::aligned_delete<Real_t>(buff, mem_mgr);
|
|
|
+ }
|
|
|
+}
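The rearrangement block above is the heart of `generic_kernel`: interleaved (x,y,z) coordinates and densities are transposed into a structure-of-arrays layout, and both point counts are padded up to a multiple of `VecLen` so the uKernel can issue full-width vector loads with no scalar remainder loop. Padded sources carry zero density and therefore add nothing to the sums; padded targets are computed but never copied back into `v_trg`. Small problems reuse the aligned on-stack buffer, larger ones fall back to `mem::aligned_new`. A minimal scalar sketch of the same packing step (the helper name `soa_pack` is illustrative, not part of PVFMM):

```cpp
#include <cstddef>
#include <vector>

// Illustrative helper (not part of PVFMM): copy `cnt` interleaved 3-D points
// into separate x/y/z arrays, each zero-padded to a multiple of `vec_len`,
// mirroring src_cnt_=((src_cnt+VecLen-1)/VecLen)*VecLen above.
void soa_pack(const double* xyz, std::size_t cnt, std::size_t vec_len,
              std::vector<double>& x, std::vector<double>& y, std::vector<double>& z){
  std::size_t cnt_=((cnt+vec_len-1)/vec_len)*vec_len;  // e.g. cnt=13, vec_len=4 -> 16
  x.assign(cnt_,0.0); y.assign(cnt_,0.0); z.assign(cnt_,0.0);
  for(std::size_t i=0;i<cnt;i++){
    x[i]=xyz[i*3+0];
    y[i]=xyz[i*3+1];
    z[i]=xyz[i*3+2];
  }
  // The tail entries stay zero: zero-density padded sources contribute nothing,
  // and padded targets are simply never copied back to the caller's array.
}
```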
|
|
|
+
|
|
|
+
|
|
|
+////////////////////////////////////////////////////////////////////////////////
|
|
|
+//////// LAPLACE KERNEL ////////
|
|
|
+////////////////////////////////////////////////////////////////////////////////
|
|
|
+
|
|
|
+/**
|
|
|
+ * \brief Green's function for Poisson's equation. Kernel tensor
|
|
|
+ * dimension = 1x1.
|
|
|
+ */
|
|
|
+template <class Real_t, class Vec_t=Real_t, Vec_t (*RINV_INTRIN)(Vec_t)=rinv_intrin0<Vec_t> >
|
|
|
+void laplace_poten_uKernel(Matrix<Real_t>& src_coord, Matrix<Real_t>& src_value, Matrix<Real_t>& trg_coord, Matrix<Real_t>& trg_value){
|
|
|
+ #define SRC_BLK 1000
|
|
|
+ size_t VecLen=sizeof(Vec_t)/sizeof(Real_t);
|
|
|
+
|
|
|
+ //// Number of Newton iterations
|
|
|
+ size_t NWTN_ITER=0;
|
|
|
+ if(RINV_INTRIN==rinv_intrin0<Vec_t,Real_t>) NWTN_ITER=0;
|
|
|
+ if(RINV_INTRIN==rinv_intrin1<Vec_t,Real_t>) NWTN_ITER=1;
|
|
|
+ if(RINV_INTRIN==rinv_intrin2<Vec_t,Real_t>) NWTN_ITER=2;
|
|
|
+ if(RINV_INTRIN==rinv_intrin3<Vec_t,Real_t>) NWTN_ITER=3;
|
|
|
+
|
|
|
+ Real_t nwtn_scal=1; // scaling factor for Newton iterations
|
|
|
+ for(int i=0;i<NWTN_ITER;i++){
|
|
|
+ nwtn_scal=2*nwtn_scal*nwtn_scal*nwtn_scal;
|
|
|
+ }
|
|
|
+ const Real_t OOFP = 1.0/(4*nwtn_scal*const_pi<Real_t>());
|
|
|
+
|
|
|
+ size_t src_cnt_=src_coord.Dim(1);
|
|
|
+ size_t trg_cnt_=trg_coord.Dim(1);
|
|
|
+ for(size_t sblk=0;sblk<src_cnt_;sblk+=SRC_BLK){
|
|
|
+ size_t src_cnt=src_cnt_-sblk;
|
|
|
+ if(src_cnt>SRC_BLK) src_cnt=SRC_BLK;
|
|
|
+ for(size_t t=0;t<trg_cnt_;t+=VecLen){
|
|
|
+ Vec_t tx=load_intrin<Vec_t>(&trg_coord[0][t]);
|
|
|
+ Vec_t ty=load_intrin<Vec_t>(&trg_coord[1][t]);
|
|
|
+ Vec_t tz=load_intrin<Vec_t>(&trg_coord[2][t]);
|
|
|
+ Vec_t tv=zero_intrin<Vec_t>();
|
|
|
+ for(size_t s=sblk;s<sblk+src_cnt;s++){
|
|
|
+ Vec_t dx=sub_intrin(tx,bcast_intrin<Vec_t>(&src_coord[0][s]));
|
|
|
+ Vec_t dy=sub_intrin(ty,bcast_intrin<Vec_t>(&src_coord[1][s]));
|
|
|
+ Vec_t dz=sub_intrin(tz,bcast_intrin<Vec_t>(&src_coord[2][s]));
|
|
|
+ Vec_t sv= bcast_intrin<Vec_t>(&src_value[0][s]) ;
|
|
|
+
|
|
|
+ Vec_t r2= mul_intrin(dx,dx) ;
|
|
|
+ r2=add_intrin(r2,mul_intrin(dy,dy));
|
|
|
+ r2=add_intrin(r2,mul_intrin(dz,dz));
|
|
|
+
|
|
|
+ Vec_t rinv=RINV_INTRIN(r2);
|
|
|
+ tv=add_intrin(tv,mul_intrin(rinv,sv));
|
|
|
+ }
|
|
|
+ Vec_t oofp=set_intrin<Vec_t,Real_t>(OOFP);
|
|
|
+ tv=add_intrin(mul_intrin(tv,oofp),load_intrin<Vec_t>(&trg_value[0][t]));
|
|
|
+ store_intrin(&trg_value[0][t],tv);
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ { // Add FLOPS
|
|
|
+ #ifndef __MIC__
|
|
|
+ Profile::Add_FLOP((long long)trg_cnt_*(long long)src_cnt_*(12+4*(NWTN_ITER)));
|
|
|
+ #endif
|
|
|
+ }
|
|
|
+ #undef SRC_BLK
|
|
|
}
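The only subtle constant here is `nwtn_scal`. The uKernel evaluates the Laplace single-layer sum u(x_i) = (1/(4*pi)) * sum_j q_j / |x_i - y_j|, but `RINV_INTRIN` (from intrin_wrapper.hpp) presumably starts from a hardware reciprocal-square-root estimate and refines it with Newton steps that omit the usual 1/2 factor to save a multiply in the inner loop. Each such step scales the result by a known constant, and the accumulated scale follows s <- 2*s^3, which is exactly the `nwtn_scal=2*nwtn_scal*nwtn_scal*nwtn_scal` recurrence; dividing the 1/(4*pi) prefactor by `nwtn_scal` once undoes the scaling. A scalar model of that assumption:

```cpp
#include <cmath>
#include <cstdio>

// Scalar model (an assumption about intrin_wrapper.hpp, not a copy of it):
// the exact Newton update for y ~ 1/sqrt(r2) is y <- y*(1.5 - 0.5*r2*y*y).
// Omitting the 1/2 factors avoids one multiply but scales the result; if y
// already carries a scale s, the step reads y <- y*(3*s*s - r2*y*y) and the
// scale grows to 2*s^3, i.e. the nwtn_scal recurrence in the kernel above.
int main(){
  double r2=2.0;                 // squared source-target distance
  double y=1.01/std::sqrt(r2);   // crude initial estimate (stand-in for a hardware rsqrt)
  double s=1.0;                  // accumulated scale (nwtn_scal)
  for(int k=0;k<2;k++){
    y=y*(3.0*s*s - r2*y*y);      // Newton step without the 0.5 multiplier
    s=2.0*s*s*s;                 // scale grows as 2*s^3: 1 -> 2 -> 16
  }
  std::printf("y/s=%.12f  1/sqrt(r2)=%.12f\n", y/s, 1.0/std::sqrt(r2));
  return 0;
}
```

With the scale folded into `OOFP`, the inner vector loop spends no instructions on normalization.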
|
|
|
|
|
|
+template <class T, int newton_iter>
|
|
|
+void laplace_poten(T* r_src, int src_cnt, T* v_src, int dof, T* r_trg, int trg_cnt, T* v_trg, mem::MemoryManager* mem_mgr){
|
|
|
+ #define LAP_KER_NWTN(nwtn) if(newton_iter==nwtn) \
|
|
|
+ generic_kernel<Real_t, 1, 1, laplace_poten_uKernel<Real_t,Vec_t, rinv_intrin##nwtn<Vec_t,Real_t> > > \
|
|
|
+ ((Real_t*)r_src, src_cnt, (Real_t*)v_src, dof, (Real_t*)r_trg, trg_cnt, (Real_t*)v_trg, mem_mgr)
|
|
|
+ #define LAPLACE_KERNEL LAP_KER_NWTN(0); LAP_KER_NWTN(1); LAP_KER_NWTN(2); LAP_KER_NWTN(3);
|
|
|
+
|
|
|
+ if(mem::TypeTraits<T>::ID()==mem::TypeTraits<float>::ID()){
|
|
|
+ typedef float Real_t;
|
|
|
+ #if defined __MIC__
|
|
|
+ #define Vec_t Real_t
|
|
|
+ #elif defined __AVX__
|
|
|
+ #define Vec_t __m256
|
|
|
+ #elif defined __SSE3__
|
|
|
+ #define Vec_t __m128
|
|
|
+ #else
|
|
|
+ #define Vec_t Real_t
|
|
|
+ #endif
|
|
|
+ LAPLACE_KERNEL;
|
|
|
+ #undef Vec_t
|
|
|
+ }else if(mem::TypeTraits<T>::ID()==mem::TypeTraits<double>::ID()){
|
|
|
+ typedef double Real_t;
|
|
|
+ #if defined __MIC__
|
|
|
+ #define Vec_t Real_t
|
|
|
+ #elif defined __AVX__
|
|
|
+ #define Vec_t __m256d
|
|
|
+ #elif defined __SSE3__
|
|
|
+ #define Vec_t __m128d
|
|
|
+ #else
|
|
|
+ #define Vec_t Real_t
|
|
|
+ #endif
|
|
|
+ LAPLACE_KERNEL;
|
|
|
+ #undef Vec_t
|
|
|
+ }else{
|
|
|
+ typedef T Real_t;
|
|
|
+ #define Vec_t Real_t
|
|
|
+ LAPLACE_KERNEL;
|
|
|
+ #undef Vec_t
|
|
|
+ }
|
|
|
+
|
|
|
+ #undef LAP_KER_NWTN
|
|
|
+ #undef LAPLACE_KERNEL
|
|
|
+}
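To make the dispatch concrete: for `T=double` on an AVX build, `LAPLACE_KERNEL` expands to four `if(newton_iter==n)` branches, and since `newton_iter` is a compile-time template argument only one of them can ever execute. Hand-expanding the `newton_iter==2` branch (every identifier below comes from the code above; this is an illustration of the macro expansion, not additional source):

```cpp
// LAP_KER_NWTN(2), with Real_t=double and Vec_t defined as __m256d, expands to:
if(newton_iter==2)
  generic_kernel<double, 1, 1,
      laplace_poten_uKernel<double, __m256d, rinv_intrin2<__m256d,double> > >
      ((double*)r_src, src_cnt, (double*)v_src, dof,
       (double*)r_trg, trg_cnt, (double*)v_trg, mem_mgr);
```

Each instantiation of `laplace_poten<T,newton_iter>` therefore calls exactly one uKernel specialization, with the vector type chosen once per build from the `__MIC__`/`__AVX__`/`__SSE3__` preprocessor checks.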
|
|
|
+
|
|
|
+
|
|
|
// Laplace double layer potential.
|
|
|
template <class T>
|
|
|
void laplace_dbl_poten(T* r_src, int src_cnt, T* v_src, int dof, T* r_trg, int trg_cnt, T* k_out, mem::MemoryManager* mem_mgr){
|
|
@@ -965,530 +1016,9 @@ void laplace_grad(T* r_src, int src_cnt, T* v_src, int dof, T* r_trg, int trg_cn
|
|
|
}
|
|
|
}
|
|
|
|
|
|
-#ifndef __MIC__
|
|
|
-#ifdef USE_SSE
|
|
|
-namespace
|
|
|
-{
|
|
|
-#define IDEAL_ALIGNMENT 16
|
|
|
-#define SIMD_LEN (int)(IDEAL_ALIGNMENT / sizeof(double))
|
|
|
-#define DECL_SIMD_ALIGNED __declspec(align(IDEAL_ALIGNMENT))
|
|
|
- void laplaceSSE(
|
|
|
- const int ns,
|
|
|
- const int nt,
|
|
|
- const double *sx,
|
|
|
- const double *sy,
|
|
|
- const double *sz,
|
|
|
- const double *tx,
|
|
|
- const double *ty,
|
|
|
- const double *tz,
|
|
|
- const double *srcDen,
|
|
|
- double *trgVal)
|
|
|
- {
|
|
|
- if ( size_t(sx)%IDEAL_ALIGNMENT || size_t(sy)%IDEAL_ALIGNMENT || size_t(sz)%IDEAL_ALIGNMENT )
|
|
|
- abort();
|
|
|
-
|
|
|
- double OOFP = 1.0/(4.0*const_pi<double>());
|
|
|
- __m128d temp;
|
|
|
-
|
|
|
- double aux_arr[SIMD_LEN+1];
|
|
|
- double *tempval;
|
|
|
- // if aux_arr is misaligned
|
|
|
- if (size_t(aux_arr)%IDEAL_ALIGNMENT) tempval = aux_arr + 1;
|
|
|
- else tempval = aux_arr;
|
|
|
- if (size_t(tempval)%IDEAL_ALIGNMENT) abort();
|
|
|
-
|
|
|
- /*! One over four pi */
|
|
|
- __m128d oofp = _mm_set1_pd (OOFP);
|
|
|
- __m128d half = _mm_set1_pd (0.5);
|
|
|
- __m128d opf = _mm_set1_pd (1.5);
|
|
|
- __m128d zero = _mm_setzero_pd ();
|
|
|
-
|
|
|
- // loop over sources
|
|
|
- int i = 0;
|
|
|
- for (; i < nt; i++) {
|
|
|
- temp = _mm_setzero_pd();
|
|
|
-
|
|
|
- __m128d txi = _mm_load1_pd (&tx[i]);
|
|
|
- __m128d tyi = _mm_load1_pd (&ty[i]);
|
|
|
- __m128d tzi = _mm_load1_pd (&tz[i]);
|
|
|
- int j = 0;
|
|
|
- // Load and calculate in groups of SIMD_LEN
|
|
|
- for (; j + SIMD_LEN <= ns; j+=SIMD_LEN) {
|
|
|
- __m128d sxj = _mm_load_pd (&sx[j]);
|
|
|
- __m128d syj = _mm_load_pd (&sy[j]);
|
|
|
- __m128d szj = _mm_load_pd (&sz[j]);
|
|
|
- __m128d sden = _mm_set_pd (srcDen[j+1], srcDen[j]);
|
|
|
-
|
|
|
- __m128d dX, dY, dZ;
|
|
|
- __m128d dR2;
|
|
|
- __m128d S;
|
|
|
-
|
|
|
- dX = _mm_sub_pd(txi , sxj);
|
|
|
- dY = _mm_sub_pd(tyi , syj);
|
|
|
- dZ = _mm_sub_pd(tzi , szj);
|
|
|
-
|
|
|
- sxj = _mm_mul_pd(dX, dX);
|
|
|
- syj = _mm_mul_pd(dY, dY);
|
|
|
- szj = _mm_mul_pd(dZ, dZ);
|
|
|
-
|
|
|
- dR2 = _mm_add_pd(sxj, syj);
|
|
|
- dR2 = _mm_add_pd(szj, dR2);
|
|
|
- __m128d reqzero = _mm_cmpeq_pd (dR2, zero);
|
|
|
-
|
|
|
- __m128d xhalf = _mm_mul_pd (half, dR2);
|
|
|
- __m128 dR2_s = _mm_cvtpd_ps(dR2);
|
|
|
- __m128 S_s = _mm_rsqrt_ps(dR2_s);
|
|
|
- __m128d S_d = _mm_cvtps_pd(S_s);
|
|
|
- // To handle the condition when src and trg coincide
|
|
|
- S_d = _mm_andnot_pd (reqzero, S_d);
|
|
|
-
|
|
|
- S = _mm_mul_pd (S_d, S_d);
|
|
|
- S = _mm_mul_pd (S, xhalf);
|
|
|
- S = _mm_sub_pd (opf, S);
|
|
|
- S = _mm_mul_pd (S, S_d);
|
|
|
-
|
|
|
- sden = _mm_mul_pd (sden, S);
|
|
|
- temp = _mm_add_pd (sden, temp);
|
|
|
- }
|
|
|
- temp = _mm_mul_pd (temp, oofp);
|
|
|
-
|
|
|
- _mm_store_pd(tempval, temp);
|
|
|
- for (int k = 0; k < SIMD_LEN; k++) {
|
|
|
- trgVal[i] += tempval[k];
|
|
|
- }
|
|
|
-
|
|
|
- for (; j < ns; j++) {
|
|
|
- double x = tx[i] - sx[j];
|
|
|
- double y = ty[i] - sy[j];
|
|
|
- double z = tz[i] - sz[j];
|
|
|
- double r2 = x*x + y*y + z*z;
|
|
|
- double r = sqrt(r2);
|
|
|
- double invdr;
|
|
|
- if (r == 0)
|
|
|
- invdr = 0;
|
|
|
- else
|
|
|
- invdr = 1/r;
|
|
|
- double den = srcDen[j];
|
|
|
- trgVal[i] += den*invdr*OOFP;
|
|
|
- }
|
|
|
- }
|
|
|
-
|
|
|
- return;
|
|
|
- }
|
|
|
-
|
|
|
- void laplaceDblSSE(
|
|
|
- const int ns,
|
|
|
- const int nt,
|
|
|
- const double *sx,
|
|
|
- const double *sy,
|
|
|
- const double *sz,
|
|
|
- const double *tx,
|
|
|
- const double *ty,
|
|
|
- const double *tz,
|
|
|
- const double *srcDen,
|
|
|
- double *trgVal)
|
|
|
- {
|
|
|
- if ( size_t(sx)%IDEAL_ALIGNMENT || size_t(sy)%IDEAL_ALIGNMENT || size_t(sz)%IDEAL_ALIGNMENT )
|
|
|
- abort();
|
|
|
-
|
|
|
- double OOFP = 1.0/(4.0*const_pi<double>());
|
|
|
- __m128d temp;
|
|
|
-
|
|
|
- double aux_arr[SIMD_LEN+1];
|
|
|
- double *tempval;
|
|
|
- // if aux_arr is misaligned
|
|
|
- if (size_t(aux_arr)%IDEAL_ALIGNMENT) tempval = aux_arr + 1;
|
|
|
- else tempval = aux_arr;
|
|
|
- if (size_t(tempval)%IDEAL_ALIGNMENT) abort();
|
|
|
-
|
|
|
- /*! One over four pi */
|
|
|
- __m128d oofp = _mm_set1_pd (OOFP);
|
|
|
- __m128d half = _mm_set1_pd (0.5);
|
|
|
- __m128d opf = _mm_set1_pd (1.5);
|
|
|
- __m128d zero = _mm_setzero_pd ();
|
|
|
-
|
|
|
- // loop over sources
|
|
|
- int i = 0;
|
|
|
- for (; i < nt; i++) {
|
|
|
- temp = _mm_setzero_pd();
|
|
|
-
|
|
|
- __m128d txi = _mm_load1_pd (&tx[i]);
|
|
|
- __m128d tyi = _mm_load1_pd (&ty[i]);
|
|
|
- __m128d tzi = _mm_load1_pd (&tz[i]);
|
|
|
- int j = 0;
|
|
|
- // Load and calculate in groups of SIMD_LEN
|
|
|
- for (; j + SIMD_LEN <= ns; j+=SIMD_LEN) {
|
|
|
- __m128d sxj = _mm_load_pd (&sx[j]);
|
|
|
- __m128d syj = _mm_load_pd (&sy[j]);
|
|
|
- __m128d szj = _mm_load_pd (&sz[j]);
|
|
|
-
|
|
|
- __m128d snormx = _mm_set_pd (srcDen[(j+1)*4+0], srcDen[j*4+0]);
|
|
|
- __m128d snormy = _mm_set_pd (srcDen[(j+1)*4+1], srcDen[j*4+1]);
|
|
|
- __m128d snormz = _mm_set_pd (srcDen[(j+1)*4+2], srcDen[j*4+2]);
|
|
|
- __m128d sden = _mm_set_pd (srcDen[(j+1)*4+3], srcDen[j*4+3]);
|
|
|
-
|
|
|
- __m128d dX, dY, dZ;
|
|
|
- __m128d dR2;
|
|
|
- __m128d S;
|
|
|
- __m128d S2;
|
|
|
- __m128d S3;
|
|
|
-
|
|
|
- dX = _mm_sub_pd(txi , sxj);
|
|
|
- dY = _mm_sub_pd(tyi , syj);
|
|
|
- dZ = _mm_sub_pd(tzi , szj);
|
|
|
-
|
|
|
- sxj = _mm_mul_pd(dX, dX);
|
|
|
- syj = _mm_mul_pd(dY, dY);
|
|
|
- szj = _mm_mul_pd(dZ, dZ);
|
|
|
-
|
|
|
- dR2 = _mm_add_pd(sxj, syj);
|
|
|
- dR2 = _mm_add_pd(szj, dR2);
|
|
|
- __m128d reqzero = _mm_cmpeq_pd (dR2, zero);
|
|
|
-
|
|
|
- __m128d xhalf = _mm_mul_pd (half, dR2);
|
|
|
- __m128 dR2_s = _mm_cvtpd_ps(dR2);
|
|
|
- __m128 S_s = _mm_rsqrt_ps(dR2_s);
|
|
|
- __m128d S_d = _mm_cvtps_pd(S_s);
|
|
|
- // To handle the condition when src and trg coincide
|
|
|
- S_d = _mm_andnot_pd (reqzero, S_d);
|
|
|
-
|
|
|
- S = _mm_mul_pd (S_d, S_d);
|
|
|
- S = _mm_mul_pd (S, xhalf);
|
|
|
- S = _mm_sub_pd (opf, S);
|
|
|
- S = _mm_mul_pd (S, S_d);
|
|
|
- S2 = _mm_mul_pd (S, S);
|
|
|
- S3 = _mm_mul_pd (S2, S);
|
|
|
-
|
|
|
- __m128d S3_sden=_mm_mul_pd(S3, sden);
|
|
|
-
|
|
|
- __m128d dot_sum = _mm_add_pd(_mm_mul_pd(snormx,dX),_mm_mul_pd(snormy,dY));
|
|
|
- dot_sum = _mm_add_pd(dot_sum,_mm_mul_pd(snormz,dZ));
|
|
|
- temp = _mm_add_pd(_mm_mul_pd(S3_sden,dot_sum),temp);
|
|
|
- }
|
|
|
- temp = _mm_mul_pd (temp, oofp);
|
|
|
- _mm_store_pd(tempval, temp);
|
|
|
-
|
|
|
- for (int k = 0; k < SIMD_LEN; k++) {
|
|
|
- trgVal[i] += tempval[k];
|
|
|
- }
|
|
|
-
|
|
|
- for (; j < ns; j++) {
|
|
|
- double x = tx[i] - sx[j];
|
|
|
- double y = ty[i] - sy[j];
|
|
|
- double z = tz[i] - sz[j];
|
|
|
- double r2 = x*x + y*y + z*z;
|
|
|
- double r = sqrt(r2);
|
|
|
- double invdr;
|
|
|
- if (r == 0)
|
|
|
- invdr = 0;
|
|
|
- else
|
|
|
- invdr = 1/r;
|
|
|
- double invdr2=invdr*invdr;
|
|
|
- double invdr3=invdr2*invdr;
|
|
|
-
|
|
|
- double dot_sum = x*srcDen[j*4+0] + y*srcDen[j*4+1] + z*srcDen[j*4+2];
|
|
|
- trgVal[i] += OOFP*invdr3*x*srcDen[j*4+3]*dot_sum;
|
|
|
- }
|
|
|
- }
|
|
|
-
|
|
|
- return;
|
|
|
- }
|
|
|
-
|
|
|
- void laplaceGradSSE(
|
|
|
- const int ns,
|
|
|
- const int nt,
|
|
|
- const double *sx,
|
|
|
- const double *sy,
|
|
|
- const double *sz,
|
|
|
- const double *tx,
|
|
|
- const double *ty,
|
|
|
- const double *tz,
|
|
|
- const double *srcDen,
|
|
|
- double *trgVal)
|
|
|
- {
|
|
|
- if ( size_t(sx)%IDEAL_ALIGNMENT || size_t(sy)%IDEAL_ALIGNMENT || size_t(sz)%IDEAL_ALIGNMENT )
|
|
|
- abort();
|
|
|
-
|
|
|
- double OOFP = 1.0/(4.0*const_pi<double>());
|
|
|
- __m128d tempx; __m128d tempy; __m128d tempz;
|
|
|
-
|
|
|
- double aux_arr[3*SIMD_LEN+1];
|
|
|
- double *tempvalx, *tempvaly, *tempvalz;
|
|
|
- // if aux_arr is misaligned
|
|
|
- if (size_t(aux_arr)%IDEAL_ALIGNMENT) tempvalx = aux_arr + 1;
|
|
|
- else tempvalx = aux_arr;
|
|
|
- if (size_t(tempvalx)%IDEAL_ALIGNMENT) abort();
|
|
|
-
|
|
|
- tempvaly=tempvalx+SIMD_LEN;
|
|
|
- tempvalz=tempvaly+SIMD_LEN;
|
|
|
-
|
|
|
- /*! One over four pi */
|
|
|
- __m128d oofp = _mm_set1_pd (OOFP);
|
|
|
- __m128d half = _mm_set1_pd (0.5);
|
|
|
- __m128d opf = _mm_set1_pd (1.5);
|
|
|
- __m128d zero = _mm_setzero_pd ();
|
|
|
-
|
|
|
- // loop over sources
|
|
|
- int i = 0;
|
|
|
- for (; i < nt; i++) {
|
|
|
- tempx = _mm_setzero_pd();
|
|
|
- tempy = _mm_setzero_pd();
|
|
|
- tempz = _mm_setzero_pd();
|
|
|
-
|
|
|
- __m128d txi = _mm_load1_pd (&tx[i]);
|
|
|
- __m128d tyi = _mm_load1_pd (&ty[i]);
|
|
|
- __m128d tzi = _mm_load1_pd (&tz[i]);
|
|
|
- int j = 0;
|
|
|
- // Load and calculate in groups of SIMD_LEN
|
|
|
- for (; j + SIMD_LEN <= ns; j+=SIMD_LEN) {
|
|
|
- __m128d sxj = _mm_load_pd (&sx[j]);
|
|
|
- __m128d syj = _mm_load_pd (&sy[j]);
|
|
|
- __m128d szj = _mm_load_pd (&sz[j]);
|
|
|
- __m128d sden = _mm_set_pd (srcDen[j+1], srcDen[j]);
|
|
|
-
|
|
|
- __m128d dX, dY, dZ;
|
|
|
- __m128d dR2;
|
|
|
- __m128d S;
|
|
|
- __m128d S2;
|
|
|
- __m128d S3;
|
|
|
-
|
|
|
- dX = _mm_sub_pd(txi , sxj);
|
|
|
- dY = _mm_sub_pd(tyi , syj);
|
|
|
- dZ = _mm_sub_pd(tzi , szj);
|
|
|
-
|
|
|
- sxj = _mm_mul_pd(dX, dX);
|
|
|
- syj = _mm_mul_pd(dY, dY);
|
|
|
- szj = _mm_mul_pd(dZ, dZ);
|
|
|
-
|
|
|
- dR2 = _mm_add_pd(sxj, syj);
|
|
|
- dR2 = _mm_add_pd(szj, dR2);
|
|
|
- __m128d reqzero = _mm_cmpeq_pd (dR2, zero);
|
|
|
-
|
|
|
- __m128d xhalf = _mm_mul_pd (half, dR2);
|
|
|
- __m128 dR2_s = _mm_cvtpd_ps(dR2);
|
|
|
- __m128 S_s = _mm_rsqrt_ps(dR2_s);
|
|
|
- __m128d S_d = _mm_cvtps_pd(S_s);
|
|
|
- // To handle the condition when src and trg coincide
|
|
|
- S_d = _mm_andnot_pd (reqzero, S_d);
|
|
|
-
|
|
|
- S = _mm_mul_pd (S_d, S_d);
|
|
|
- S = _mm_mul_pd (S, xhalf);
|
|
|
- S = _mm_sub_pd (opf, S);
|
|
|
- S = _mm_mul_pd (S, S_d);
|
|
|
- S2 = _mm_mul_pd (S, S);
|
|
|
- S3 = _mm_mul_pd (S2, S);
|
|
|
-
|
|
|
- __m128d S3_sden=_mm_mul_pd(S3, sden);
|
|
|
- tempx = _mm_add_pd(_mm_mul_pd(S3_sden,dX),tempx);
|
|
|
- tempy = _mm_add_pd(_mm_mul_pd(S3_sden,dY),tempy);
|
|
|
- tempz = _mm_add_pd(_mm_mul_pd(S3_sden,dZ),tempz);
|
|
|
-
|
|
|
- }
|
|
|
- tempx = _mm_mul_pd (tempx, oofp);
|
|
|
- tempy = _mm_mul_pd (tempy, oofp);
|
|
|
- tempz = _mm_mul_pd (tempz, oofp);
|
|
|
-
|
|
|
- _mm_store_pd(tempvalx, tempx);
|
|
|
- _mm_store_pd(tempvaly, tempy);
|
|
|
- _mm_store_pd(tempvalz, tempz);
|
|
|
-
|
|
|
- for (int k = 0; k < SIMD_LEN; k++) {
|
|
|
- trgVal[i*3 ] += tempvalx[k];
|
|
|
- trgVal[i*3+1] += tempvaly[k];
|
|
|
- trgVal[i*3+2] += tempvalz[k];
|
|
|
- }
|
|
|
-
|
|
|
- for (; j < ns; j++) {
|
|
|
- double x = tx[i] - sx[j];
|
|
|
- double y = ty[i] - sy[j];
|
|
|
- double z = tz[i] - sz[j];
|
|
|
- double r2 = x*x + y*y + z*z;
|
|
|
- double r = sqrt(r2);
|
|
|
- double invdr;
|
|
|
- if (r == 0)
|
|
|
- invdr = 0;
|
|
|
- else
|
|
|
- invdr = 1/r;
|
|
|
- double invdr2=invdr*invdr;
|
|
|
- double invdr3=invdr2*invdr;
|
|
|
-
|
|
|
- trgVal[i*3 ] += OOFP*invdr3*x*srcDen[j];
|
|
|
- trgVal[i*3+1] += OOFP*invdr3*y*srcDen[j];
|
|
|
- trgVal[i*3+2] += OOFP*invdr3*z*srcDen[j];
|
|
|
- }
|
|
|
- }
|
|
|
-
|
|
|
- return;
|
|
|
- }
|
|
|
-#undef SIMD_LEN
|
|
|
-
|
|
|
-#define X(s,k) (s)[(k)*COORD_DIM]
|
|
|
-#define Y(s,k) (s)[(k)*COORD_DIM+1]
|
|
|
-#define Z(s,k) (s)[(k)*COORD_DIM+2]
|
|
|
- void laplaceSSEShuffle(const int ns, const int nt, float const src[], float const trg[], float const den[], float pot[], mem::MemoryManager* mem_mgr=NULL)
|
|
|
- {
|
|
|
- // TODO
|
|
|
- }
|
|
|
-
|
|
|
- void laplaceSSEShuffle(const int ns, const int nt, double const src[], double const trg[], double const den[], double pot[], mem::MemoryManager* mem_mgr=NULL)
|
|
|
- {
|
|
|
- double* buff=NULL;
|
|
|
- buff=mem::aligned_new<double>((ns+1+nt)*3,mem_mgr);
|
|
|
-
|
|
|
- double* buff_=buff;
|
|
|
- pvfmm::Vector<double> xs(ns+1,buff_,false); buff_+=ns+1;
|
|
|
- pvfmm::Vector<double> ys(ns+1,buff_,false); buff_+=ns+1;
|
|
|
- pvfmm::Vector<double> zs(ns+1,buff_,false); buff_+=ns+1;
|
|
|
-
|
|
|
- pvfmm::Vector<double> xt(nt ,buff_,false); buff_+=nt ;
|
|
|
- pvfmm::Vector<double> yt(nt ,buff_,false); buff_+=nt ;
|
|
|
- pvfmm::Vector<double> zt(nt ,buff_,false); buff_+=nt ;
|
|
|
-
|
|
|
- //std::vector<double> xs(ns+1);
|
|
|
- //std::vector<double> ys(ns+1);
|
|
|
- //std::vector<double> zs(ns+1);
|
|
|
-
|
|
|
- //std::vector<double> xt(nt );
|
|
|
- //std::vector<double> yt(nt );
|
|
|
- //std::vector<double> zt(nt );
|
|
|
-
|
|
|
- int x_shift = size_t(&xs[0]) % IDEAL_ALIGNMENT ? 1:0;
|
|
|
- int y_shift = size_t(&ys[0]) % IDEAL_ALIGNMENT ? 1:0;
|
|
|
- int z_shift = size_t(&zs[0]) % IDEAL_ALIGNMENT ? 1:0;
|
|
|
-
|
|
|
- //1. reshuffle memory
|
|
|
- for (int k =0;k<ns;k++){
|
|
|
- xs[k+x_shift]=X(src,k);
|
|
|
- ys[k+y_shift]=Y(src,k);
|
|
|
- zs[k+z_shift]=Z(src,k);
|
|
|
- }
|
|
|
- for (int k=0;k<nt;k++){
|
|
|
- xt[k]=X(trg,k);
|
|
|
- yt[k]=Y(trg,k);
|
|
|
- zt[k]=Z(trg,k);
|
|
|
- }
|
|
|
-
|
|
|
- //2. perform caclulation
|
|
|
- laplaceSSE(ns,nt,&xs[x_shift],&ys[y_shift],&zs[z_shift],&xt[0],&yt[0],&zt[0],den,pot);
|
|
|
-
|
|
|
- mem::aligned_delete<double>(buff,mem_mgr);
|
|
|
- return;
|
|
|
- }
|
|
|
-
|
|
|
- void laplaceDblSSEShuffle(const int ns, const int nt, float const src[], float const trg[], float const den[], float pot[], mem::MemoryManager* mem_mgr=NULL)
|
|
|
- {
|
|
|
- // TODO
|
|
|
- }
|
|
|
-
|
|
|
- void laplaceDblSSEShuffle(const int ns, const int nt, double const src[], double const trg[], double const den[], double pot[], mem::MemoryManager* mem_mgr=NULL)
|
|
|
- {
|
|
|
- std::vector<double> xs(ns+1); std::vector<double> xt(nt);
|
|
|
- std::vector<double> ys(ns+1); std::vector<double> yt(nt);
|
|
|
- std::vector<double> zs(ns+1); std::vector<double> zt(nt);
|
|
|
-
|
|
|
- int x_shift = size_t(&xs[0]) % IDEAL_ALIGNMENT ? 1:0;
|
|
|
- int y_shift = size_t(&ys[0]) % IDEAL_ALIGNMENT ? 1:0;
|
|
|
- int z_shift = size_t(&zs[0]) % IDEAL_ALIGNMENT ? 1:0;
|
|
|
-
|
|
|
- //1. reshuffle memory
|
|
|
- for (int k =0;k<ns;k++){
|
|
|
- xs[k+x_shift]=X(src,k);
|
|
|
- ys[k+y_shift]=Y(src,k);
|
|
|
- zs[k+z_shift]=Z(src,k);
|
|
|
- }
|
|
|
- for (int k=0;k<nt;k++){
|
|
|
- xt[k]=X(trg,k);
|
|
|
- yt[k]=Y(trg,k);
|
|
|
- zt[k]=Z(trg,k);
|
|
|
- }
|
|
|
-
|
|
|
- //2. perform caclulation
|
|
|
- laplaceDblSSE(ns,nt,&xs[x_shift],&ys[y_shift],&zs[z_shift],&xt[0],&yt[0],&zt[0],den,pot);
|
|
|
- return;
|
|
|
- }
|
|
|
-
|
|
|
- void laplaceGradSSEShuffle(const int ns, const int nt, float const src[], float const trg[], float const den[], float pot[], mem::MemoryManager* mem_mgr=NULL)
|
|
|
- {
|
|
|
- // TODO
|
|
|
- }
|
|
|
-
|
|
|
- void laplaceGradSSEShuffle(const int ns, const int nt, double const src[], double const trg[], double const den[], double pot[], mem::MemoryManager* mem_mgr=NULL)
|
|
|
- {
|
|
|
- int tid=omp_get_thread_num();
|
|
|
- static std::vector<std::vector<double> > xs_(100); static std::vector<std::vector<double> > xt_(100);
|
|
|
- static std::vector<std::vector<double> > ys_(100); static std::vector<std::vector<double> > yt_(100);
|
|
|
- static std::vector<std::vector<double> > zs_(100); static std::vector<std::vector<double> > zt_(100);
|
|
|
-
|
|
|
- std::vector<double>& xs=xs_[tid]; std::vector<double>& xt=xt_[tid];
|
|
|
- std::vector<double>& ys=ys_[tid]; std::vector<double>& yt=yt_[tid];
|
|
|
- std::vector<double>& zs=zs_[tid]; std::vector<double>& zt=zt_[tid];
|
|
|
- xs.resize(ns+1); xt.resize(nt);
|
|
|
- ys.resize(ns+1); yt.resize(nt);
|
|
|
- zs.resize(ns+1); zt.resize(nt);
|
|
|
-
|
|
|
- int x_shift = size_t(&xs[0]) % IDEAL_ALIGNMENT ? 1:0;
|
|
|
- int y_shift = size_t(&ys[0]) % IDEAL_ALIGNMENT ? 1:0;
|
|
|
- int z_shift = size_t(&zs[0]) % IDEAL_ALIGNMENT ? 1:0;
|
|
|
-
|
|
|
- //1. reshuffle memory
|
|
|
- for (int k =0;k<ns;k++){
|
|
|
- xs[k+x_shift]=X(src,k);
|
|
|
- ys[k+y_shift]=Y(src,k);
|
|
|
- zs[k+z_shift]=Z(src,k);
|
|
|
- }
|
|
|
- for (int k=0;k<nt;k++){
|
|
|
- xt[k]=X(trg,k);
|
|
|
- yt[k]=Y(trg,k);
|
|
|
- zt[k]=Z(trg,k);
|
|
|
- }
|
|
|
-
|
|
|
- //2. perform caclulation
|
|
|
- laplaceGradSSE(ns,nt,&xs[x_shift],&ys[y_shift],&zs[z_shift],&xt[0],&yt[0],&zt[0],den,pot);
|
|
|
- return;
|
|
|
- }
|
|
|
-#undef X
|
|
|
-#undef Y
|
|
|
-#undef Z
|
|
|
-
|
|
|
-#undef IDEAL_ALIGNMENT
|
|
|
-#undef DECL_SIMD_ALIGNED
|
|
|
-}
|
|
|
-
|
|
|
-template <>
|
|
|
-void laplace_poten<double>(double* r_src, int src_cnt, double* v_src, int dof, double* r_trg, int trg_cnt, double* k_out, mem::MemoryManager* mem_mgr){
|
|
|
- Profile::Add_FLOP((long long)trg_cnt*(long long)src_cnt*(12*dof));
|
|
|
-
|
|
|
- if(dof==1){
|
|
|
- laplaceSSEShuffle(src_cnt, trg_cnt, r_src, r_trg, v_src, k_out, mem_mgr);
|
|
|
- return;
|
|
|
- }
|
|
|
-}
|
|
|
-
|
|
|
-template <>
|
|
|
-void laplace_dbl_poten<double>(double* r_src, int src_cnt, double* v_src, int dof, double* r_trg, int trg_cnt, double* k_out, mem::MemoryManager* mem_mgr){
|
|
|
- Profile::Add_FLOP((long long)trg_cnt*(long long)src_cnt*(19*dof));
|
|
|
-
|
|
|
- if(dof==1){
|
|
|
- laplaceDblSSEShuffle(src_cnt, trg_cnt, r_src, r_trg, v_src, k_out, mem_mgr);
|
|
|
- return;
|
|
|
- }
|
|
|
-}
|
|
|
-
|
|
|
-template <>
|
|
|
-void laplace_grad<double>(double* r_src, int src_cnt, double* v_src, int dof, double* r_trg, int trg_cnt, double* k_out, mem::MemoryManager* mem_mgr){
|
|
|
- Profile::Add_FLOP((long long)trg_cnt*(long long)src_cnt*(10+12*dof));
|
|
|
-
|
|
|
- if(dof==1){
|
|
|
- laplaceGradSSEShuffle(src_cnt, trg_cnt, r_src, r_trg, v_src, k_out, mem_mgr);
|
|
|
- return;
|
|
|
- }
|
|
|
-}
|
|
|
-#endif
|
|
|
-#endif
|
|
|
-
|
|
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
|
-//////// STOKES KERNEL ////////
|
|
|
+//////// STOKES KERNEL ////////
|
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
|
|
|
|
/**
|