/**
 * \file kernel.txx
 * \author Dhairya Malhotra, dhairya.malhotra@gmail.com
 * \date 12-20-2011
 * \brief This file contains the implementation of the struct Kernel and also
 * the implementation of various kernels for FMM.
 */

#ifdef USE_SSE
#include <emmintrin.h>
#endif

#include <cmath>
#include <cstdlib>
#include <cstring>
#include <vector>
#include <profile.hpp>

namespace pvfmm{

/**
 * \brief Constructor.
 */
template <class T>
Kernel<T>::Kernel(): dim(0){
  ker_dim[0]=0;
  ker_dim[1]=0;
}

/**
 * \brief Constructor.
 */
template <class T>
Kernel<T>::Kernel(Ker_t poten, Ker_t dbl_poten, const char* name, int dim_,
                  const int (&k_dim)[2], bool homogen_, T ker_scale,
                  size_t dev_poten, size_t dev_dbl_poten){
  dim=dim_;
  ker_dim[0]=k_dim[0];
  ker_dim[1]=k_dim[1];
  ker_poten=poten;
  dbl_layer_poten=dbl_poten;
  homogen=homogen_;
  poten_scale=ker_scale;
  ker_name=std::string(name);
  dev_ker_poten=dev_poten;
  dev_dbl_layer_poten=dev_dbl_poten;
}

/**
 * \brief Compute the transformation matrix (on the source strength vector)
 * to get potential at target coordinates due to sources at the given
 * coordinates.
 * \param[in] r_src Coordinates of source points.
 * \param[in] src_cnt Number of source points.
 * \param[in] r_trg Coordinates of target points.
 * \param[in] trg_cnt Number of target points.
 * \param[out] k_out Output array with potential values.
 */
template <class T>
void Kernel<T>::BuildMatrix(T* r_src, int src_cnt, T* r_trg, int trg_cnt, T* k_out){
  Eval(r_src, src_cnt, r_trg, trg_cnt, k_out, ker_poten, ker_dim);
}

template <class T>
void Kernel<T>::Eval(T* r_src, int src_cnt, T* r_trg, int trg_cnt, T* k_out,
                     typename Kernel<T>::Ker_t eval_kernel, int* ker_dim){
  int dim=3; //Only supporting 3D
  memset(k_out, 0, src_cnt*ker_dim[0]*trg_cnt*ker_dim[1]*sizeof(T));
  for(int i=0;i<src_cnt;i++)
  for(int j=0;j<ker_dim[0];j++){
    std::vector<T> v_src(ker_dim[0],0);
    v_src[j]=1.0;
    eval_kernel(&r_src[i*dim], 1, &v_src[0], 1, r_trg, trg_cnt,
                &k_out[(i*ker_dim[0]+j)*trg_cnt*ker_dim[1]]);
  }
}

////////////////////////////////////////////////////////////////////////////////
////////                      LAPLACE KERNEL                           ////////
////////////////////////////////////////////////////////////////////////////////

#ifndef __MIC__
#ifdef USE_SSE
namespace {

#define IDEAL_ALIGNMENT 16
#define SIMD_LEN (int)(IDEAL_ALIGNMENT / sizeof(double))
#define DECL_SIMD_ALIGNED __declspec(align(IDEAL_ALIGNMENT))
#define OOFP_R 1.0/(4.0*M_PI)

void laplaceSSE(const int ns, const int nt,
                const double *sx, const double *sy, const double *sz,
                const double *tx, const double *ty, const double *tz,
                const double *srcDen, double *trgVal){
  if ( size_t(sx)%IDEAL_ALIGNMENT || size_t(sy)%IDEAL_ALIGNMENT || size_t(sz)%IDEAL_ALIGNMENT )
    abort();

  double OOFP = 1.0/(4.0*M_PI);
  __m128d temp;

  double aux_arr[SIMD_LEN+1];
  double *tempval;
  // if aux_arr is misaligned
  if (size_t(aux_arr)%IDEAL_ALIGNMENT) tempval = aux_arr + 1;
  else tempval = aux_arr;
  if (size_t(tempval)%IDEAL_ALIGNMENT) abort();
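
  // The main loop below approximates 1/sqrt(r^2) with the single-precision
  // _mm_rsqrt_ps estimate and refines it with one Newton-Raphson step,
  // S = S_d*(1.5 - 0.5*r^2*S_d*S_d), before scaling by the source density.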
  /*! One over four pi */
  __m128d oofp = _mm_set1_pd (OOFP_R);
  __m128d half = _mm_set1_pd (0.5);
  __m128d opf  = _mm_set1_pd (1.5);
  __m128d zero = _mm_setzero_pd ();

  // loop over targets
  int i = 0;
  for (; i < nt; i++) {
    temp = _mm_setzero_pd();
    __m128d txi = _mm_load1_pd (&tx[i]);
    __m128d tyi = _mm_load1_pd (&ty[i]);
    __m128d tzi = _mm_load1_pd (&tz[i]);
    int j = 0;
    // Load and calculate in groups of SIMD_LEN
    for (; j + SIMD_LEN <= ns; j+=SIMD_LEN) {
      __m128d sxj = _mm_load_pd (&sx[j]);
      __m128d syj = _mm_load_pd (&sy[j]);
      __m128d szj = _mm_load_pd (&sz[j]);
      __m128d sden = _mm_set_pd (srcDen[j+1], srcDen[j]);

      __m128d dX, dY, dZ, dR2, S;
      dX = _mm_sub_pd(txi, sxj);  dY = _mm_sub_pd(tyi, syj);  dZ = _mm_sub_pd(tzi, szj);
      sxj = _mm_mul_pd(dX, dX);   syj = _mm_mul_pd(dY, dY);   szj = _mm_mul_pd(dZ, dZ);
      dR2 = _mm_add_pd(sxj, syj);
      dR2 = _mm_add_pd(szj, dR2);
      __m128d reqzero = _mm_cmpeq_pd (dR2, zero);

      __m128d xhalf = _mm_mul_pd (half, dR2);
      __m128  dR2_s = _mm_cvtpd_ps(dR2);
      __m128  S_s   = _mm_rsqrt_ps(dR2_s);
      __m128d S_d   = _mm_cvtps_pd(S_s);
      // To handle the condition when src and trg coincide
      S_d = _mm_andnot_pd (reqzero, S_d);

      S = _mm_mul_pd (S_d, S_d);
      S = _mm_mul_pd (S, xhalf);
      S = _mm_sub_pd (opf, S);
      S = _mm_mul_pd (S, S_d);

      sden = _mm_mul_pd (sden, S);
      temp = _mm_add_pd (sden, temp);
    }
    temp = _mm_mul_pd (temp, oofp);
    _mm_store_pd(tempval, temp);
    for (int k = 0; k < SIMD_LEN; k++) {
      trgVal[i] += tempval[k];
    }

    for (; j < ns; j++) {
      double x = tx[i] - sx[j];
      double y = ty[i] - sy[j];
      double z = tz[i] - sz[j];
      double r2 = x*x + y*y + z*z;
      double r = sqrt(r2);
      double invdr;
      if (r == 0) invdr = 0;
      else invdr = 1/r;
      double den = srcDen[j];
      trgVal[i] += den*invdr*OOFP;
    }
  }
  return;
}

void laplaceDblSSE(const int ns, const int nt,
                   const double *sx, const double *sy, const double *sz,
                   const double *tx, const double *ty, const double *tz,
                   const double *srcDen, double *trgVal){
  if ( size_t(sx)%IDEAL_ALIGNMENT || size_t(sy)%IDEAL_ALIGNMENT || size_t(sz)%IDEAL_ALIGNMENT )
    abort();

  double OOFP = 1.0/(4.0*M_PI);
  __m128d temp;

  double aux_arr[SIMD_LEN+1];
  double *tempval;
  // if aux_arr is misaligned
  if (size_t(aux_arr)%IDEAL_ALIGNMENT) tempval = aux_arr + 1;
  else tempval = aux_arr;
  if (size_t(tempval)%IDEAL_ALIGNMENT) abort();
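
  // For the double-layer kernel, srcDen packs four doubles per source point:
  // the dipole direction (srcDen[4*j+0..2]) and the density (srcDen[4*j+3]).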
  /*! One over four pi */
  __m128d oofp = _mm_set1_pd (OOFP_R);
  __m128d half = _mm_set1_pd (0.5);
  __m128d opf  = _mm_set1_pd (1.5);
  __m128d zero = _mm_setzero_pd ();

  // loop over targets
  int i = 0;
  for (; i < nt; i++) {
    temp = _mm_setzero_pd();
    __m128d txi = _mm_load1_pd (&tx[i]);
    __m128d tyi = _mm_load1_pd (&ty[i]);
    __m128d tzi = _mm_load1_pd (&tz[i]);
    int j = 0;
    // Load and calculate in groups of SIMD_LEN
    for (; j + SIMD_LEN <= ns; j+=SIMD_LEN) {
      __m128d sxj = _mm_load_pd (&sx[j]);
      __m128d syj = _mm_load_pd (&sy[j]);
      __m128d szj = _mm_load_pd (&sz[j]);
      __m128d snormx = _mm_set_pd (srcDen[(j+1)*4+0], srcDen[j*4+0]);
      __m128d snormy = _mm_set_pd (srcDen[(j+1)*4+1], srcDen[j*4+1]);
      __m128d snormz = _mm_set_pd (srcDen[(j+1)*4+2], srcDen[j*4+2]);
      __m128d sden   = _mm_set_pd (srcDen[(j+1)*4+3], srcDen[j*4+3]);

      __m128d dX, dY, dZ, dR2, S, S2, S3;
      dX = _mm_sub_pd(txi, sxj);  dY = _mm_sub_pd(tyi, syj);  dZ = _mm_sub_pd(tzi, szj);
      sxj = _mm_mul_pd(dX, dX);   syj = _mm_mul_pd(dY, dY);   szj = _mm_mul_pd(dZ, dZ);
      dR2 = _mm_add_pd(sxj, syj);
      dR2 = _mm_add_pd(szj, dR2);
      __m128d reqzero = _mm_cmpeq_pd (dR2, zero);

      __m128d xhalf = _mm_mul_pd (half, dR2);
      __m128  dR2_s = _mm_cvtpd_ps(dR2);
      __m128  S_s   = _mm_rsqrt_ps(dR2_s);
      __m128d S_d   = _mm_cvtps_pd(S_s);
      // To handle the condition when src and trg coincide
      S_d = _mm_andnot_pd (reqzero, S_d);

      S = _mm_mul_pd (S_d, S_d);
      S = _mm_mul_pd (S, xhalf);
      S = _mm_sub_pd (opf, S);
      S = _mm_mul_pd (S, S_d);
      S2 = _mm_mul_pd (S, S);
      S3 = _mm_mul_pd (S2, S);

      __m128d S3_sden = _mm_mul_pd(S3, sden);
      __m128d dot_sum = _mm_add_pd(_mm_mul_pd(snormx,dX),_mm_mul_pd(snormy,dY));
      dot_sum = _mm_add_pd(dot_sum,_mm_mul_pd(snormz,dZ));
      temp = _mm_add_pd(_mm_mul_pd(S3_sden,dot_sum),temp);
    }
    temp = _mm_mul_pd (temp, oofp);
    _mm_store_pd(tempval, temp);
    for (int k = 0; k < SIMD_LEN; k++) {
      trgVal[i] += tempval[k];
    }

    for (; j < ns; j++) {
      double x = tx[i] - sx[j];
      double y = ty[i] - sy[j];
      double z = tz[i] - sz[j];
      double r2 = x*x + y*y + z*z;
      double r = sqrt(r2);
      double invdr;
      if (r == 0) invdr = 0;
      else invdr = 1/r;
      double invdr2=invdr*invdr;
      double invdr3=invdr2*invdr;
      double dot_sum = x*srcDen[j*4+0] + y*srcDen[j*4+1] + z*srcDen[j*4+2];
      trgVal[i] += OOFP*invdr3*srcDen[j*4+3]*dot_sum;
    }
  }
  return;
}

void laplaceGradSSE(const int ns, const int nt,
                    const double *sx, const double *sy, const double *sz,
                    const double *tx, const double *ty, const double *tz,
                    const double *srcDen, double *trgVal){
  if ( size_t(sx)%IDEAL_ALIGNMENT || size_t(sy)%IDEAL_ALIGNMENT || size_t(sz)%IDEAL_ALIGNMENT )
    abort();

  double OOFP = 1.0/(4.0*M_PI);
  __m128d tempx;
  __m128d tempy;
  __m128d tempz;

  double aux_arr[3*SIMD_LEN+1];
  double *tempvalx, *tempvaly, *tempvalz;
  // if aux_arr is misaligned
  if (size_t(aux_arr)%IDEAL_ALIGNMENT) tempvalx = aux_arr + 1;
  else tempvalx = aux_arr;
  if (size_t(tempvalx)%IDEAL_ALIGNMENT) abort();
  tempvaly=tempvalx+SIMD_LEN;
  tempvalz=tempvaly+SIMD_LEN;
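
  // Each source j adds srcDen[j]*(t-s)/(4*pi*r^3) to the three gradient
  // components accumulated for target t.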
  /*! One over four pi */
  __m128d oofp = _mm_set1_pd (OOFP_R);
  __m128d half = _mm_set1_pd (0.5);
  __m128d opf  = _mm_set1_pd (1.5);
  __m128d zero = _mm_setzero_pd ();

  // loop over targets
  int i = 0;
  for (; i < nt; i++) {
    tempx = _mm_setzero_pd();
    tempy = _mm_setzero_pd();
    tempz = _mm_setzero_pd();
    __m128d txi = _mm_load1_pd (&tx[i]);
    __m128d tyi = _mm_load1_pd (&ty[i]);
    __m128d tzi = _mm_load1_pd (&tz[i]);
    int j = 0;
    // Load and calculate in groups of SIMD_LEN
    for (; j + SIMD_LEN <= ns; j+=SIMD_LEN) {
      __m128d sxj = _mm_load_pd (&sx[j]);
      __m128d syj = _mm_load_pd (&sy[j]);
      __m128d szj = _mm_load_pd (&sz[j]);
      __m128d sden = _mm_set_pd (srcDen[j+1], srcDen[j]);

      __m128d dX, dY, dZ, dR2, S, S2, S3;
      dX = _mm_sub_pd(txi, sxj);  dY = _mm_sub_pd(tyi, syj);  dZ = _mm_sub_pd(tzi, szj);
      sxj = _mm_mul_pd(dX, dX);   syj = _mm_mul_pd(dY, dY);   szj = _mm_mul_pd(dZ, dZ);
      dR2 = _mm_add_pd(sxj, syj);
      dR2 = _mm_add_pd(szj, dR2);
      __m128d reqzero = _mm_cmpeq_pd (dR2, zero);

      __m128d xhalf = _mm_mul_pd (half, dR2);
      __m128  dR2_s = _mm_cvtpd_ps(dR2);
      __m128  S_s   = _mm_rsqrt_ps(dR2_s);
      __m128d S_d   = _mm_cvtps_pd(S_s);
      // To handle the condition when src and trg coincide
      S_d = _mm_andnot_pd (reqzero, S_d);

      S = _mm_mul_pd (S_d, S_d);
      S = _mm_mul_pd (S, xhalf);
      S = _mm_sub_pd (opf, S);
      S = _mm_mul_pd (S, S_d);
      S2 = _mm_mul_pd (S, S);
      S3 = _mm_mul_pd (S2, S);

      __m128d S3_sden = _mm_mul_pd(S3, sden);
      tempx = _mm_add_pd(_mm_mul_pd(S3_sden,dX),tempx);
      tempy = _mm_add_pd(_mm_mul_pd(S3_sden,dY),tempy);
      tempz = _mm_add_pd(_mm_mul_pd(S3_sden,dZ),tempz);
    }
    tempx = _mm_mul_pd (tempx, oofp);
    tempy = _mm_mul_pd (tempy, oofp);
    tempz = _mm_mul_pd (tempz, oofp);
    _mm_store_pd(tempvalx, tempx);
    _mm_store_pd(tempvaly, tempy);
    _mm_store_pd(tempvalz, tempz);
    for (int k = 0; k < SIMD_LEN; k++) {
      trgVal[i*3  ] += tempvalx[k];
      trgVal[i*3+1] += tempvaly[k];
      trgVal[i*3+2] += tempvalz[k];
    }

    for (; j < ns; j++) {
      double x = tx[i] - sx[j];
      double y = ty[i] - sy[j];
      double z = tz[i] - sz[j];
      double r2 = x*x + y*y + z*z;
      double r = sqrt(r2);
      double invdr;
      if (r == 0) invdr = 0;
      else invdr = 1/r;
      double invdr2=invdr*invdr;
      double invdr3=invdr2*invdr;
      trgVal[i*3  ] += OOFP*invdr3*x*srcDen[j];
      trgVal[i*3+1] += OOFP*invdr3*y*srcDen[j];
      trgVal[i*3+2] += OOFP*invdr3*z*srcDen[j];
    }
  }
  return;
}

#undef OOFP_R
#undef SIMD_LEN

#define X(s,k) (s)[(k)*COORD_DIM]
#define Y(s,k) (s)[(k)*COORD_DIM+1]
#define Z(s,k) (s)[(k)*COORD_DIM+2]

void laplaceSSEShuffle(const int ns, const int nt, float const src[], float const trg[], float const den[], float pot[]){
  // TODO
}

void laplaceSSEShuffle(const int ns, const int nt, double const src[], double const trg[], double const den[], double pot[]){
  std::vector<double> xs(ns+1);   std::vector<double> xt(nt);
  std::vector<double> ys(ns+1);   std::vector<double> yt(nt);
  std::vector<double> zs(ns+1);   std::vector<double> zt(nt);

  int x_shift = size_t(&xs[0]) % IDEAL_ALIGNMENT ? 1:0;
  int y_shift = size_t(&ys[0]) % IDEAL_ALIGNMENT ? 1:0;
  int z_shift = size_t(&zs[0]) % IDEAL_ALIGNMENT ? 1:0;

  //1. reshuffle memory
  for (int k=0;k<ns;k++){
    xs[k+x_shift]=X(src,k);
    ys[k+y_shift]=Y(src,k);
    zs[k+z_shift]=Z(src,k);
  }
  for (int k=0;k<nt;k++){
    xt[k]=X(trg,k);
    yt[k]=Y(trg,k);
    zt[k]=Z(trg,k);
  }

  //2. perform calculation
  laplaceSSE(ns,nt,&xs[x_shift],&ys[y_shift],&zs[z_shift],&xt[0],&yt[0],&zt[0],den,pot);
  return;
}

void laplaceDblSSEShuffle(const int ns, const int nt, double const src[], double const trg[], double const den[], double pot[]){
  std::vector<double> xs(ns+1);   std::vector<double> xt(nt);
  std::vector<double> ys(ns+1);   std::vector<double> yt(nt);
  std::vector<double> zs(ns+1);   std::vector<double> zt(nt);

  int x_shift = size_t(&xs[0]) % IDEAL_ALIGNMENT ? 1:0;
  int y_shift = size_t(&ys[0]) % IDEAL_ALIGNMENT ? 1:0;
  int z_shift = size_t(&zs[0]) % IDEAL_ALIGNMENT ? 1:0;

  //1. reshuffle memory
  for (int k=0;k<ns;k++){
    xs[k+x_shift]=X(src,k);
    ys[k+y_shift]=Y(src,k);
    zs[k+z_shift]=Z(src,k);
  }
  for (int k=0;k<nt;k++){
    xt[k]=X(trg,k);
    yt[k]=Y(trg,k);
    zt[k]=Z(trg,k);
  }

  //2. perform calculation (den holds 4 doubles per source: normal and density)
  laplaceDblSSE(ns,nt,&xs[x_shift],&ys[y_shift],&zs[z_shift],&xt[0],&yt[0],&zt[0],den,pot);
  return;
}
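
// Note on the shuffle wrappers: the interleaved (x,y,z) coordinates are copied
// into separate arrays of length ns+1 so that, by skipping one leading element
// when an allocation happens to be offset by 8 bytes (the *_shift flags), each
// coordinate array handed to the SSE kernels starts on a 16-byte boundary.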
void laplaceGradSSEShuffle(const int ns, const int nt, double const src[], double const trg[], double const den[], double pot[]){
  std::vector<double> xs(ns+1);   std::vector<double> xt(nt);
  std::vector<double> ys(ns+1);   std::vector<double> yt(nt);
  std::vector<double> zs(ns+1);   std::vector<double> zt(nt);

  int x_shift = size_t(&xs[0]) % IDEAL_ALIGNMENT ? 1:0;
  int y_shift = size_t(&ys[0]) % IDEAL_ALIGNMENT ? 1:0;
  int z_shift = size_t(&zs[0]) % IDEAL_ALIGNMENT ? 1:0;

  //1. reshuffle memory
  for (int k=0;k<ns;k++){
    xs[k+x_shift]=X(src,k);
    ys[k+y_shift]=Y(src,k);
    zs[k+z_shift]=Z(src,k);
  }
  for (int k=0;k<nt;k++){
    xt[k]=X(trg,k);
    yt[k]=Y(trg,k);
    zt[k]=Z(trg,k);
  }

  //2. perform calculation
  laplaceGradSSE(ns,nt,&xs[x_shift],&ys[y_shift],&zs[z_shift],&xt[0],&yt[0],&zt[0],den,pot);
  return;
}

}// namespace
#endif
#endif

/**
 * \brief Green's function for the Poisson equation (Laplace single-layer
 * potential): u(t) = 1/(4*pi) * sum_s q_s / |t-s|.
 */
template <class T>
void laplace_poten(T* r_src, int src_cnt, T* v_src, int dof, T* r_trg, int trg_cnt, T* k_out){
#ifndef __MIC__
  Profile::Add_FLOP((long long)trg_cnt*(long long)src_cnt*(12*dof));
#ifdef USE_SSE
  if(dof==1){
    laplaceSSEShuffle(src_cnt, trg_cnt, r_src, r_trg, v_src, k_out);
    return;
  }
#endif
#endif

  const T OOFP = 1.0/(4.0*M_PI);
  for(int t=0;t<trg_cnt;t++)
  for(int i=0;i<dof;i++){
    T p=0;
    for(int s=0;s<src_cnt;s++){
      T dX=r_trg[3*t  ]-r_src[3*s  ];
      T dY=r_trg[3*t+1]-r_src[3*s+1];
      T dZ=r_trg[3*t+2]-r_src[3*s+2];
      T R2=dX*dX+dY*dY+dZ*dZ;
      T invR=0; if (R2!=0) invR=1.0/sqrt(R2);
      p += v_src[s*dof+i]*invR;
    }
    k_out[t*dof+i] += p*OOFP;
  }
}

// Blocked variant of laplace_poten, kept for reference.
template <class T>
void laplace_poten_(T* r_src, int src_cnt, T* v_src, int dof, T* r_trg, int trg_cnt, T* k_out){
//void laplace_poten(T* r_src_, int src_cnt, T* v_src_, int dof, T* r_trg_, int trg_cnt, T* k_out_){
//  int dim=3; //Only supporting 3D
//  T* r_src=new T[src_cnt*dim];
//  T* r_trg=new T[trg_cnt*dim];
//  T* v_src=new T[src_cnt    ];
//  T* k_out=new T[trg_cnt    ];
//  mem::memcopy(r_src,r_src_,src_cnt*dim*sizeof(T));
//  mem::memcopy(r_trg,r_trg_,trg_cnt*dim*sizeof(T));
//  mem::memcopy(v_src,v_src_,src_cnt    *sizeof(T));
//  mem::memcopy(k_out,k_out_,trg_cnt    *sizeof(T));

#define EVAL_BLKSZ 32
#define MAX_DOF 100
  //Compute source to target interactions.
  const T OOFP = 1.0/(4.0*M_PI);

  if(dof==1){
    for (int t_=0; t_<trg_cnt; t_+=EVAL_BLKSZ)
    for (int s_=0; s_<src_cnt; s_+=EVAL_BLKSZ){
      int src_blk=s_+EVAL_BLKSZ; src_blk=(src_blk>src_cnt?src_cnt:src_blk);
      int trg_blk=t_+EVAL_BLKSZ; trg_blk=(trg_blk>trg_cnt?trg_cnt:trg_blk);
      for(int t=t_;t<trg_blk;t++){
        T p=0;
        for(int s=s_;s<src_blk;s++){
          T dX=r_trg[3*t]-r_src[3*s], dY=r_trg[3*t+1]-r_src[3*s+1], dZ=r_trg[3*t+2]-r_src[3*s+2];
          T R2=dX*dX+dY*dY+dZ*dZ;
          T invR=0; if (R2!=0) invR=1.0/sqrt(R2);
          p += v_src[s]*invR;
        }
        k_out[t] += p*OOFP;
      }
    }
  }else if(dof==2){
    for (int t_=0; t_<trg_cnt; t_+=EVAL_BLKSZ)
    for (int s_=0; s_<src_cnt; s_+=EVAL_BLKSZ){
      int src_blk=s_+EVAL_BLKSZ; src_blk=(src_blk>src_cnt?src_cnt:src_blk);
      int trg_blk=t_+EVAL_BLKSZ; trg_blk=(trg_blk>trg_cnt?trg_cnt:trg_blk);
      for(int t=t_;t<trg_blk;t++){
        T p0=0, p1=0;
        for(int s=s_;s<src_blk;s++){
          T dX=r_trg[3*t]-r_src[3*s], dY=r_trg[3*t+1]-r_src[3*s+1], dZ=r_trg[3*t+2]-r_src[3*s+2];
          T R2=dX*dX+dY*dY+dZ*dZ;
          T invR=0; if (R2!=0) invR=1.0/sqrt(R2);
          p0 += v_src[s*dof+0]*invR;
          p1 += v_src[s*dof+1]*invR;
        }
        k_out[t*dof+0] += p0*OOFP;
        k_out[t*dof+1] += p1*OOFP;
      }
    }
  }else if(dof==3){
    for (int t_=0; t_<trg_cnt; t_+=EVAL_BLKSZ)
    for (int s_=0; s_<src_cnt; s_+=EVAL_BLKSZ){
      int src_blk=s_+EVAL_BLKSZ; src_blk=(src_blk>src_cnt?src_cnt:src_blk);
      int trg_blk=t_+EVAL_BLKSZ; trg_blk=(trg_blk>trg_cnt?trg_cnt:trg_blk);
      for(int t=t_;t<trg_blk;t++){
        T p0=0, p1=0, p2=0;
        for(int s=s_;s<src_blk;s++){
          T dX=r_trg[3*t]-r_src[3*s], dY=r_trg[3*t+1]-r_src[3*s+1], dZ=r_trg[3*t+2]-r_src[3*s+2];
          T R2=dX*dX+dY*dY+dZ*dZ;
          T invR=0; if (R2!=0) invR=1.0/sqrt(R2);
          p0 += v_src[s*dof+0]*invR;
          p1 += v_src[s*dof+1]*invR;
          p2 += v_src[s*dof+2]*invR;
        }
        k_out[t*dof+0] += p0*OOFP;
        k_out[t*dof+1] += p1*OOFP;
        k_out[t*dof+2] += p2*OOFP;
      }
    }
  }else{
    for (int t_=0; t_<trg_cnt; t_+=EVAL_BLKSZ)
    for (int s_=0; s_<src_cnt; s_+=EVAL_BLKSZ){
      int src_blk=s_+EVAL_BLKSZ; src_blk=(src_blk>src_cnt?src_cnt:src_blk);
      int trg_blk=t_+EVAL_BLKSZ; trg_blk=(trg_blk>trg_cnt?trg_cnt:trg_blk);
      for(int t=t_;t<trg_blk;t++){
        T p[MAX_DOF];
        for(int i=0;i<dof;i++) p[i]=0;
        for(int s=s_;s<src_blk;s++){
          T dX=r_trg[3*t]-r_src[3*s], dY=r_trg[3*t+1]-r_src[3*s+1], dZ=r_trg[3*t+2]-r_src[3*s+2];
          T R2=dX*dX+dY*dY+dZ*dZ;
          T invR=0; if (R2!=0) invR=1.0/sqrt(R2);
          for(int i=0;i<dof;i++) p[i] += v_src[s*dof+i]*invR;
        }
        for(int i=0;i<dof;i++) k_out[t*dof+i] += p[i]*OOFP;
      }
    }
  }
#undef EVAL_BLKSZ
#undef MAX_DOF
}

/**
 * \brief Laplace double-layer potential; the source density holds
 * [n_x,n_y,n_z,q] per source (cf. laplaceDblSSE above).
 */
template <class T>
void laplace_dbl_poten(T* r_src, int src_cnt, T* v_src, int dof, T* r_trg, int trg_cnt, T* k_out){
#ifndef __MIC__
  Profile::Add_FLOP((long long)trg_cnt*(long long)src_cnt*(19*dof));
#ifdef USE_SSE
  if(dof==1){
    laplaceDblSSEShuffle(src_cnt, trg_cnt, r_src, r_trg, v_src, k_out);
    return;
  }
#endif
#endif

  const T OOFP = -1.0/(4.0*M_PI);
  for(int t=0;t<trg_cnt;t++)
  for(int i=0;i<dof;i++){
    T p=0;
    for(int s=0;s<src_cnt;s++){
      T dX=r_trg[3*t  ]-r_src[3*s  ];
      T dY=r_trg[3*t+1]-r_src[3*s+1];
      T dZ=r_trg[3*t+2]-r_src[3*s+2];
      T R2=dX*dX+dY*dY+dZ*dZ;
      T invR=0; if (R2!=0) invR=1.0/sqrt(R2);
      T invR3=invR*invR*invR;
      T dot_sum = dX*v_src[(s*dof+i)*4+0]
                + dY*v_src[(s*dof+i)*4+1]
                + dZ*v_src[(s*dof+i)*4+2];
      p += v_src[(s*dof+i)*4+3]*dot_sum*invR3;
    }
    k_out[t*dof+i] += p*OOFP;
  }
}

/**
 * \brief Gradient of the Laplace potential; 3 output values per target.
 */
template <class T>
void laplace_grad(T* r_src, int src_cnt, T* v_src, int dof, T* r_trg, int trg_cnt, T* k_out){
#ifndef __MIC__
  Profile::Add_FLOP((long long)trg_cnt*(long long)src_cnt*(10+12*dof));
#ifdef USE_SSE
  if(dof==1){
    laplaceGradSSEShuffle(src_cnt, trg_cnt, r_src, r_trg, v_src, k_out);
    return;
  }
#endif
#endif

  const T OOFP = -1.0/(4.0*M_PI);
  if(dof==1){
    for(int t=0;t<trg_cnt;t++){
      T p[3]={0,0,0};
      for(int s=0;s<src_cnt;s++){
        T dX=r_trg[3*t  ]-r_src[3*s  ];
        T dY=r_trg[3*t+1]-r_src[3*s+1];
        T dZ=r_trg[3*t+2]-r_src[3*s+2];
        T R2=dX*dX+dY*dY+dZ*dZ;
        T invR=0; if (R2!=0) invR=1.0/sqrt(R2);
        T invR3=invR*invR*invR;
        p[0] += dX*invR3*v_src[s];
        p[1] += dY*invR3*v_src[s];
        p[2] += dZ*invR3*v_src[s];
      }
      k_out[t*3+0] += p[0]*OOFP;
      k_out[t*3+1] += p[1]*OOFP;
      k_out[t*3+2] += p[2]*OOFP;
    }
  }else{
    for(int t=0;t<trg_cnt;t++)
    for(int i=0;i<dof;i++){
      T p[3]={0,0,0};
      for(int s=0;s<src_cnt;s++){
        T dX=r_trg[3*t  ]-r_src[3*s  ];
        T dY=r_trg[3*t+1]-r_src[3*s+1];
        T dZ=r_trg[3*t+2]-r_src[3*s+2];
        T R2=dX*dX+dY*dY+dZ*dZ;
        T invR=0; if (R2!=0) invR=1.0/sqrt(R2);
        T invR3=invR*invR*invR;
        p[0] += dX*invR3*v_src[s*dof+i];
        p[1] += dY*invR3*v_src[s*dof+i];
        p[2] += dZ*invR3*v_src[s*dof+i];
      }
      k_out[(t*dof+i)*3+0] += p[0]*OOFP;
      k_out[(t*dof+i)*3+1] += p[1]*OOFP;
      k_out[(t*dof+i)*3+2] += p[2]*OOFP;
    }
  }
}

#ifndef __MIC__
#ifdef USE_SSE
namespace {

// SSE shuffle wrappers for the Stokes kernels. The signatures are inferred
// from the call sites in stokes_vel/stokes_press/stokes_stress/stokes_grad
// below; the SSE evaluation routines invoked here (stokesVelSSE,
// stokesPressureSSE, stokesStressSSE, stokesGradSSE) are assumed to be
// defined analogously to the Laplace SSE kernels above.

void stokesDirectSSEShuffle(const int ns, const int nt, double const src[], double const trg[], double const den[], double pot[], const double mu){
  std::vector<double> xs(ns+1);   std::vector<double> xt(nt);
  std::vector<double> ys(ns+1);   std::vector<double> yt(nt);
  std::vector<double> zs(ns+1);   std::vector<double> zt(nt);

  int x_shift = size_t(&xs[0]) % IDEAL_ALIGNMENT ? 1:0;
  int y_shift = size_t(&ys[0]) % IDEAL_ALIGNMENT ? 1:0;
  int z_shift = size_t(&zs[0]) % IDEAL_ALIGNMENT ? 1:0;

  //1. reshuffle memory
  for (int k=0;k<ns;k++){ xs[k+x_shift]=X(src,k); ys[k+y_shift]=Y(src,k); zs[k+z_shift]=Z(src,k); }
  for (int k=0;k<nt;k++){ xt[k]=X(trg,k); yt[k]=Y(trg,k); zt[k]=Z(trg,k); }

  //2. perform calculation (assumed SSE kernel)
  stokesVelSSE(ns,nt,&xs[x_shift],&ys[y_shift],&zs[z_shift],&xt[0],&yt[0],&zt[0],den,pot,mu);
  return;
}

void stokesPressureSSEShuffle(const int ns, const int nt, double const src[], double const trg[], double const den[], double pot[]){
  std::vector<double> xs(ns+1);   std::vector<double> xt(nt);
  std::vector<double> ys(ns+1);   std::vector<double> yt(nt);
  std::vector<double> zs(ns+1);   std::vector<double> zt(nt);

  int x_shift = size_t(&xs[0]) % IDEAL_ALIGNMENT ? 1:0;
  int y_shift = size_t(&ys[0]) % IDEAL_ALIGNMENT ? 1:0;
  int z_shift = size_t(&zs[0]) % IDEAL_ALIGNMENT ? 1:0;

  //1. reshuffle memory
  for (int k=0;k<ns;k++){ xs[k+x_shift]=X(src,k); ys[k+y_shift]=Y(src,k); zs[k+z_shift]=Z(src,k); }
  for (int k=0;k<nt;k++){ xt[k]=X(trg,k); yt[k]=Y(trg,k); zt[k]=Z(trg,k); }

  //2. perform calculation (assumed SSE kernel)
  stokesPressureSSE(ns,nt,&xs[x_shift],&ys[y_shift],&zs[z_shift],&xt[0],&yt[0],&zt[0],den,pot);
  return;
}

void stokesStressSSEShuffle(const int ns, const int nt, double const src[], double const trg[], double const den[], double pot[]){
  std::vector<double> xs(ns+1);   std::vector<double> xt(nt);
  std::vector<double> ys(ns+1);   std::vector<double> yt(nt);
  std::vector<double> zs(ns+1);   std::vector<double> zt(nt);

  int x_shift = size_t(&xs[0]) % IDEAL_ALIGNMENT ? 1:0;
  int y_shift = size_t(&ys[0]) % IDEAL_ALIGNMENT ? 1:0;
  int z_shift = size_t(&zs[0]) % IDEAL_ALIGNMENT ? 1:0;

  //1. reshuffle memory
  for (int k=0;k<ns;k++){ xs[k+x_shift]=X(src,k); ys[k+y_shift]=Y(src,k); zs[k+z_shift]=Z(src,k); }
  for (int k=0;k<nt;k++){ xt[k]=X(trg,k); yt[k]=Y(trg,k); zt[k]=Z(trg,k); }

  //2. perform calculation (assumed SSE kernel)
  stokesStressSSE(ns,nt,&xs[x_shift],&ys[y_shift],&zs[z_shift],&xt[0],&yt[0],&zt[0],den,pot);
  return;
}

void stokesGradSSEShuffle(const int ns, const int nt, double const src[], double const trg[], double const den[], double pot[], const double mu){
  std::vector<double> xs(ns+1);   std::vector<double> xt(nt);
  std::vector<double> ys(ns+1);   std::vector<double> yt(nt);
  std::vector<double> zs(ns+1);   std::vector<double> zt(nt);

  int x_shift = size_t(&xs[0]) % IDEAL_ALIGNMENT ? 1:0;
  int y_shift = size_t(&ys[0]) % IDEAL_ALIGNMENT ? 1:0;
  int z_shift = size_t(&zs[0]) % IDEAL_ALIGNMENT ? 1:0;

  //1. reshuffle memory
  for (int k=0;k<ns;k++){ xs[k+x_shift]=X(src,k); ys[k+y_shift]=Y(src,k); zs[k+z_shift]=Z(src,k); }
  for (int k=0;k<nt;k++){ xt[k]=X(trg,k); yt[k]=Y(trg,k); zt[k]=Z(trg,k); }

  //2. perform calculation (assumed SSE kernel)
  stokesGradSSE(ns,nt,&xs[x_shift],&ys[y_shift],&zs[z_shift],&xt[0],&yt[0],&zt[0],den,pot,mu);
  return;
}

}// namespace
#endif
#endif
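
////////////////////////////////////////////////////////////////////////////////
////////                       STOKES KERNEL                           ////////
////////////////////////////////////////////////////////////////////////////////

// The velocity kernel below sums the free-space Stokeslet,
//   u(t) = 1/(8*pi*mu) * sum_s ( f_s/|t-s| + ((t-s).f_s)*(t-s)/|t-s|^3 ),
// accumulating (f*R + dR*(f.dR))/R^3 per source before scaling by OOEPMU.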
template <class T>
void stokes_vel(T* r_src, int src_cnt, T* v_src_, int dof, T* r_trg, int trg_cnt, T* k_out){
  const T mu=1.0;
#ifndef __MIC__
  Profile::Add_FLOP((long long)trg_cnt*(long long)src_cnt*(28*dof));
#ifdef USE_SSE
  stokesDirectSSEShuffle(src_cnt, trg_cnt, r_src, r_trg, v_src_, k_out, mu);
  return;
#endif
#endif

  const T OOEPMU = 1.0/(8.0*M_PI*mu);
  for(int t=0;t<trg_cnt;t++)
  for(int i=0;i<dof;i++){
    T p[3]={0,0,0};
    for(int s=0;s<src_cnt;s++){
      T dR[3]={r_trg[3*t  ]-r_src[3*s  ],
               r_trg[3*t+1]-r_src[3*s+1],
               r_trg[3*t+2]-r_src[3*s+2]};
      T R=dR[0]*dR[0]+dR[1]*dR[1]+dR[2]*dR[2];
      if (R!=0){
        T invR=1.0/sqrt(R);
        T invR3=invR*invR*invR;
        const T* f=&v_src_[(s*dof+i)*3];
        T fdotR=f[0]*dR[0]+f[1]*dR[1]+f[2]*dR[2];
        p[0] += (f[0]*R+dR[0]*fdotR)*invR3;
        p[1] += (f[1]*R+dR[1]*fdotR)*invR3;
        p[2] += (f[2]*R+dR[2]*fdotR)*invR3;
      }
    }
    k_out[(t*dof+i)*3+0] += p[0]*OOEPMU;
    k_out[(t*dof+i)*3+1] += p[1]*OOEPMU;
    k_out[(t*dof+i)*3+2] += p[2]*OOEPMU;
  }
}

// Stokes double-layer velocity; 6 values per source, assumed [n_x,n_y,n_z,f_x,f_y,f_z].
template <class T>
void stokes_dbl_vel(T* r_src, int src_cnt, T* v_src, int dof, T* r_trg, int trg_cnt, T* k_out){
#ifndef __MIC__
  Profile::Add_FLOP((long long)trg_cnt*(long long)src_cnt*(32*dof));
#endif

  const T mu=1.0;
  const T SOEPMU = -6.0/(8.0*M_PI*mu);
  for(int t=0;t<trg_cnt;t++)
  for(int i=0;i<dof;i++){
    T p[3]={0,0,0};
    for(int s=0;s<src_cnt;s++){
      T dR[3]={r_trg[3*t  ]-r_src[3*s  ],
               r_trg[3*t+1]-r_src[3*s+1],
               r_trg[3*t+2]-r_src[3*s+2]};
      T R=dR[0]*dR[0]+dR[1]*dR[1]+dR[2]*dR[2];
      if (R!=0){
        T invR=1.0/sqrt(R);
        T invR5=invR*invR*invR*invR*invR;
        const T* n=&v_src[(s*dof+i)*6+0];
        const T* f=&v_src[(s*dof+i)*6+3];
        T inner_prod=(n[0]*dR[0]+n[1]*dR[1]+n[2]*dR[2])*
                     (f[0]*dR[0]+f[1]*dR[1]+f[2]*dR[2])*invR5;
        p[0] += dR[0]*inner_prod;
        p[1] += dR[1]*inner_prod;
        p[2] += dR[2]*inner_prod;
      }
    }
    k_out[(t*dof+i)*3+0] += p[0]*SOEPMU;
    k_out[(t*dof+i)*3+1] += p[1]*SOEPMU;
    k_out[(t*dof+i)*3+2] += p[2]*SOEPMU;
  }
}

// Stokes pressure: p(t) = 1/(4*pi) * sum_s (f_s.(t-s))/|t-s|^3.
template <class T>
void stokes_press(T* r_src, int src_cnt, T* v_src_, int dof, T* r_trg, int trg_cnt, T* k_out){
#ifndef __MIC__
  Profile::Add_FLOP((long long)trg_cnt*(long long)src_cnt*(17*dof));
#ifdef USE_SSE
  stokesPressureSSEShuffle(src_cnt, trg_cnt, r_src, r_trg, v_src_, k_out);
  return;
#endif
#endif

  const T OOFP = 1.0/(4.0*M_PI);
  for(int t=0;t<trg_cnt;t++)
  for(int i=0;i<dof;i++){
    T p=0;
    for(int s=0;s<src_cnt;s++){
      T dR[3]={r_trg[3*t  ]-r_src[3*s  ],
               r_trg[3*t+1]-r_src[3*s+1],
               r_trg[3*t+2]-r_src[3*s+2]};
      T R=dR[0]*dR[0]+dR[1]*dR[1]+dR[2]*dR[2];
      if (R!=0){
        T invR=1.0/sqrt(R);
        T invR3=invR*invR*invR;
        const T* f=&v_src_[(s*dof+i)*3];
        p += (f[0]*dR[0]+f[1]*dR[1]+f[2]*dR[2])*invR3;
      }
    }
    k_out[t*dof+i] += p*OOFP;
  }
}

// Stokes stress: sigma_jk(t) = -3/(4*pi) * sum_s (f_s.(t-s)) (t-s)_j (t-s)_k / |t-s|^5;
// the 9 components per target are stored row-major (assumed ordering).
template <class T>
void stokes_stress(T* r_src, int src_cnt, T* v_src_, int dof, T* r_trg, int trg_cnt, T* k_out){
#ifndef __MIC__
  Profile::Add_FLOP((long long)trg_cnt*(long long)src_cnt*(45*dof));
#ifdef USE_SSE
  stokesStressSSEShuffle(src_cnt, trg_cnt, r_src, r_trg, v_src_, k_out);
  return;
#endif
#endif

  const T TOFP = -3.0/(4.0*M_PI);
  for(int t=0;t<trg_cnt;t++)
  for(int i=0;i<dof;i++){
    T p[9]={0,0,0, 0,0,0, 0,0,0};
    for(int s=0;s<src_cnt;s++){
      T dR[3]={r_trg[3*t  ]-r_src[3*s  ],
               r_trg[3*t+1]-r_src[3*s+1],
               r_trg[3*t+2]-r_src[3*s+2]};
      T R=dR[0]*dR[0]+dR[1]*dR[1]+dR[2]*dR[2];
      if (R!=0){
        T invR=1.0/sqrt(R);
        T invR5=invR*invR*invR*invR*invR;
        const T* f=&v_src_[(s*dof+i)*3];
        T fdotR5=(f[0]*dR[0]+f[1]*dR[1]+f[2]*dR[2])*invR5;
        p[0] += fdotR5*dR[0]*dR[0]; p[1] += fdotR5*dR[0]*dR[1]; p[2] += fdotR5*dR[0]*dR[2];
        p[3] += fdotR5*dR[1]*dR[0]; p[4] += fdotR5*dR[1]*dR[1]; p[5] += fdotR5*dR[1]*dR[2];
        p[6] += fdotR5*dR[2]*dR[0]; p[7] += fdotR5*dR[2]*dR[1]; p[8] += fdotR5*dR[2]*dR[2];
      }
    }
    for(int j=0;j<9;j++) k_out[(t*dof+i)*9+j] += p[j]*TOFP;
  }
}

// Gradient of the Stokeslet velocity; the 3x3 output per target is stored
// row-major as du_j/dt_k (assumed ordering).
template <class T>
void stokes_grad(T* r_src, int src_cnt, T* v_src_, int dof, T* r_trg, int trg_cnt, T* k_out){
  const T mu=1.0;
#ifndef __MIC__
  Profile::Add_FLOP((long long)trg_cnt*(long long)src_cnt*(89*dof));
#ifdef USE_SSE
  stokesGradSSEShuffle(src_cnt, trg_cnt, r_src, r_trg, v_src_, k_out, mu);
  return;
#endif
#endif

  const T OOEPMU = 1.0/(8.0*M_PI*mu);
  for(int t=0;t<trg_cnt;t++)
  for(int i=0;i<dof;i++){
    T p[9]={0,0,0, 0,0,0, 0,0,0};
    for(int s=0;s<src_cnt;s++){
      T dR[3]={r_trg[3*t  ]-r_src[3*s  ],
               r_trg[3*t+1]-r_src[3*s+1],
               r_trg[3*t+2]-r_src[3*s+2]};
      T R=dR[0]*dR[0]+dR[1]*dR[1]+dR[2]*dR[2];
      if (R!=0){
        T invR=1.0/sqrt(R);
        T invR2=invR*invR;
        T invR3=invR2*invR;
        const T* f=&v_src_[(s*dof+i)*3];
        T fdotR=f[0]*dR[0]+f[1]*dR[1]+f[2]*dR[2];
        for(int j=0;j<3;j++)
        for(int k=0;k<3;k++)
          p[j*3+k] += ((j==k?fdotR:0) + f[k]*dR[j] - f[j]*dR[k]
                       - 3.0*dR[j]*dR[k]*fdotR*invR2)*invR3;
      }
    }
    for(int j=0;j<9;j++) k_out[(t*dof+i)*9+j] += p[j]*OOEPMU;
  }
}

// Biot-Savart kernel; the cross-product orientation below is assumed.
template <class T>
void biot_savart(T* r_src, int src_cnt, T* v_src_, int dof, T* r_trg, int trg_cnt, T* k_out){
#ifndef __MIC__
  Profile::Add_FLOP((long long)trg_cnt*(long long)src_cnt*(26*dof));
#endif

  const T OOFP = -1.0/(4.0*M_PI);
  for(int t=0;t<trg_cnt;t++)
  for(int i=0;i<dof;i++){
    T p[3]={0,0,0};
    for(int s=0;s<src_cnt;s++){
      T dR[3]={r_trg[3*t  ]-r_src[3*s  ],
               r_trg[3*t+1]-r_src[3*s+1],
               r_trg[3*t+2]-r_src[3*s+2]};
      T R=dR[0]*dR[0]+dR[1]*dR[1]+dR[2]*dR[2];
      if (R!=0){
        T invR=1.0/sqrt(R);
        T invR3=invR*invR*invR;
        const T* j_=&v_src_[(s*dof+i)*3];
        p[0] += (j_[1]*dR[2]-j_[2]*dR[1])*invR3;
        p[1] += (j_[2]*dR[0]-j_[0]*dR[2])*invR3;
        p[2] += (j_[0]*dR[1]-j_[1]*dR[0])*invR3;
      }
    }
    k_out[(t*dof+i)*3+0] += p[0]*OOFP;
    k_out[(t*dof+i)*3+1] += p[1]*OOFP;
    k_out[(t*dof+i)*3+2] += p[2]*OOFP;
  }
}

// Helmholtz kernel with fixed wavenumber mu=20*pi; source densities and
// target values are complex, stored as [real,imag] pairs.
template <class T>
void helmholtz_poten(T* r_src, int src_cnt, T* v_src, int dof, T* r_trg, int trg_cnt, T* k_out){
#ifndef __MIC__
  Profile::Add_FLOP((long long)trg_cnt*(long long)src_cnt*(24*dof));
#endif

  const T mu = (20.0*M_PI);
  for(int t=0;t<trg_cnt;t++)
  for(int i=0;i<dof;i++){
    T p[2]={0,0};
    for(int s=0;s<src_cnt;s++){
      T dR[3]={r_trg[3*t  ]-r_src[3*s  ],
               r_trg[3*t+1]-r_src[3*s+1],
               r_trg[3*t+2]-r_src[3*s+2]};
      T R=dR[0]*dR[0]+dR[1]*dR[1]+dR[2]*dR[2];
      if (R!=0){
        R=sqrt(R);
        T invR=1.0/R;
        T G[2]={cos(mu*R)*invR, sin(mu*R)*invR}; // exp(i*mu*R)/R
        p[0] += v_src[(s*dof+i)*2+0]*G[0] - v_src[(s*dof+i)*2+1]*G[1];
        p[1] += v_src[(s*dof+i)*2+0]*G[1] + v_src[(s*dof+i)*2+1]*G[0];
      }
    }
    k_out[(t*dof+i)*2+0] += p[0];
    k_out[(t*dof+i)*2+1] += p[1];
  }
}

template <class T>
void helmholtz_grad(T* r_src, int src_cnt, T* v_src, int dof, T* r_trg, int trg_cnt, T* k_out){
  //TODO Implement this.
}

}//end namespace
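
// Illustrative usage sketch (not part of the library): wrapping the Laplace
// single-layer kernel in a Kernel<T> object and building a dense interaction
// matrix with BuildMatrix. The name, homogeneity flag, scaling constant and
// device handles passed below are placeholder values chosen for the example;
// ns and nt are assumed to be the numbers of source and target points, with
// coordinates already filled into r_src and r_trg.
//
// \code
//   const int k_dim[2]={1,1};
//   pvfmm::Kernel<double> laplace_ker(pvfmm::laplace_poten<double>,
//                                     pvfmm::laplace_dbl_poten<double>,
//                                     "laplace", 3, k_dim,
//                                     true, 1.0, (size_t)NULL, (size_t)NULL);
//   std::vector<double> r_src(3*ns), r_trg(3*nt);     // interleaved x,y,z
//   std::vector<double> M(ns*k_dim[0]*nt*k_dim[1]);   // dense interaction matrix
//   laplace_ker.BuildMatrix(&r_src[0], ns, &r_trg[0], nt, &M[0]);
// \endcode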