/**
 * \file kernel.txx
 * \author Dhairya Malhotra, dhairya.malhotra@gmail.com
 * \date 12-20-2011
 * \brief This file contains the implementation of the struct Kernel and the
 * implementations of various kernels for FMM.
 */

#include <cmath>
#include <cstdlib>
#include <cstring> // memset (used in Kernel::BuildMatrix)
#include <vector>

#include <mem_utils.hpp>
#include <profile.hpp>
#include <vector.hpp>

#ifdef __SSE__
#include <xmmintrin.h>
#endif
#ifdef __SSE2__
#include <emmintrin.h>
#endif
#ifdef __SSE3__
#include <pmmintrin.h>
#endif
#ifdef __AVX__
#include <immintrin.h>
#endif
#if defined(__MIC__)
#include <immintrin.h>
#endif

namespace pvfmm{
/**
 * \brief Constructor.
 */
template <class T>
Kernel<T>::Kernel(): dim(0){
  ker_dim[0]=0;
  ker_dim[1]=0;
}

/**
 * \brief Constructor.
 */
template <class T>
Kernel<T>::Kernel(Ker_t poten, Ker_t dbl_poten, const char* name, int dim_,
                  std::pair<int,int> k_dim, bool homogen_, T ker_scale,
                  size_t dev_poten, size_t dev_dbl_poten){
  dim=dim_;
  ker_dim[0]=k_dim.first;
  ker_dim[1]=k_dim.second;
  ker_poten=poten;
  dbl_layer_poten=dbl_poten;
  homogen=homogen_;
  poten_scale=ker_scale;
  ker_name=std::string(name);
  dev_ker_poten=dev_poten;
  dev_dbl_layer_poten=dev_dbl_poten;
}
/**
 * \brief Compute the matrix that maps the source strength vector to the
 * potential at the target coordinates, for sources at the given
 * coordinates.
 * \param[in] r_src Coordinates of source points.
 * \param[in] src_cnt Number of source points.
 * \param[in] r_trg Coordinates of target points.
 * \param[in] trg_cnt Number of target points.
 * \param[out] k_out Output array with potential values.
 */
template <class T>
void Kernel<T>::BuildMatrix(T* r_src, int src_cnt,
                            T* r_trg, int trg_cnt, T* k_out){
  int dim=3; //Only supporting 3D
  memset(k_out, 0, src_cnt*ker_dim[0]*trg_cnt*ker_dim[1]*sizeof(T));
  for(int i=0;i<src_cnt;i++) //TODO Optimize this.
    for(int j=0;j<ker_dim[0];j++){
      std::vector<T> v_src(ker_dim[0],0);
      v_src[j]=1.0;
      ker_poten(&r_src[i*dim], 1, &v_src[0], 1, r_trg, trg_cnt,
                &k_out[(i*ker_dim[0]+j)*trg_cnt*ker_dim[1]], NULL);
    }
}
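
// Note (added annotation, not in the original source): BuildMatrix applies the
// kernel to unit source strengths one component at a time, so k_out holds the
// dense src-to-trg interaction matrix in row-major order:
//   k_out[(i*ker_dim[0]+j)*trg_cnt*ker_dim[1] + t*ker_dim[1] + k]
//     = K_jk(r_src[i], r_trg[t]).
// A minimal usage sketch (assuming an initialized Kernel<double> instance ker):
//   std::vector<double> M(src_cnt*ker.ker_dim[0]*trg_cnt*ker.ker_dim[1]);
//   ker.BuildMatrix(r_src, src_cnt, r_trg, trg_cnt, &M[0]);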

////////////////////////////////////////////////////////////////////////////////
////////                       LAPLACE KERNEL                           ////////
////////////////////////////////////////////////////////////////////////////////

/**
 * \brief Green's function for the Poisson equation. Kernel tensor
 * dimension = 1x1.
 */
template <class T>
void laplace_poten(T* r_src, int src_cnt, T* v_src, int dof, T* r_trg, int trg_cnt, T* k_out, mem::MemoryManager* mem_mgr){
#ifndef __MIC__
  Profile::Add_FLOP((long long)trg_cnt*(long long)src_cnt*(12*dof));
#endif
  const T OOFP = 1.0/(4.0*const_pi<T>());
  for(int t=0;t<trg_cnt;t++){
    for(int i=0;i<dof;i++){
      T p=0;
      for(int s=0;s<src_cnt;s++){
        T dX_reg=r_trg[3*t  ]-r_src[3*s  ];
        T dY_reg=r_trg[3*t+1]-r_src[3*s+1];
        T dZ_reg=r_trg[3*t+2]-r_src[3*s+2];
        T invR = (dX_reg*dX_reg+dY_reg*dY_reg+dZ_reg*dZ_reg);
        if (invR!=0) invR = 1.0/sqrt(invR);
        p += v_src[s*dof+i]*invR;
      }
      k_out[t*dof+i] += p*OOFP;
    }
  }
}
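
// For reference (added annotation, not in the original source): the sum above
// evaluates the free-space Green's function of the 3D Laplace equation,
//   u(t) = 1/(4*pi) * sum_s q_s / |t - s|,
// where the singular self term is skipped by leaving invR = 0 whenever the
// squared distance is zero.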

template <class T>
void laplace_poten_(T* r_src, int src_cnt, T* v_src, int dof, T* r_trg, int trg_cnt, T* k_out, mem::MemoryManager* mem_mgr){
//void laplace_poten(T* r_src_, int src_cnt, T* v_src_, int dof, T* r_trg_, int trg_cnt, T* k_out_){
//  int dim=3; //Only supporting 3D
//  T* r_src=new T[src_cnt*dim];
//  T* r_trg=new T[trg_cnt*dim];
//  T* v_src=new T[src_cnt    ];
//  T* k_out=new T[trg_cnt    ];
//  mem::memcopy(r_src,r_src_,src_cnt*dim*sizeof(T));
//  mem::memcopy(r_trg,r_trg_,trg_cnt*dim*sizeof(T));
//  mem::memcopy(v_src,v_src_,src_cnt    *sizeof(T));
//  mem::memcopy(k_out,k_out_,trg_cnt    *sizeof(T));

#define EVAL_BLKSZ 32
#define MAX_DOF 100
  //Compute source to target interactions.
  const T OOFP = 1.0/(4.0*const_pi<T>());

  if(dof==1){
    for (int t_=0; t_<trg_cnt; t_+=EVAL_BLKSZ)
    for (int s_=0; s_<src_cnt; s_+=EVAL_BLKSZ){
      int src_blk=s_+EVAL_BLKSZ; src_blk=(src_blk>src_cnt?src_cnt:src_blk);
      int trg_blk=t_+EVAL_BLKSZ; trg_blk=(trg_blk>trg_cnt?trg_cnt:trg_blk);
      for(int t=t_;t<trg_blk;t++){
        T p=0;
        for(int s=s_;s<src_blk;s++){
          T dX_reg=r_trg[3*t  ]-r_src[3*s  ];
          T dY_reg=r_trg[3*t+1]-r_src[3*s+1];
          T dZ_reg=r_trg[3*t+2]-r_src[3*s+2];
          T invR = (dX_reg*dX_reg+dY_reg*dY_reg+dZ_reg*dZ_reg);
          if (invR!=0) invR = 1.0/sqrt(invR);
          p += v_src[s]*invR;
        }
        k_out[t] += p*OOFP;
      }
    }
  }else if(dof==2){
    T p[MAX_DOF];
    for (int t_=0; t_<trg_cnt; t_+=EVAL_BLKSZ)
    for (int s_=0; s_<src_cnt; s_+=EVAL_BLKSZ){
      int src_blk=s_+EVAL_BLKSZ; src_blk=(src_blk>src_cnt?src_cnt:src_blk);
      int trg_blk=t_+EVAL_BLKSZ; trg_blk=(trg_blk>trg_cnt?trg_cnt:trg_blk);
      for(int t=t_;t<trg_blk;t++){
        p[0]=0; p[1]=0;
        for(int s=s_;s<src_blk;s++){
          T dX_reg=r_trg[3*t  ]-r_src[3*s  ];
          T dY_reg=r_trg[3*t+1]-r_src[3*s+1];
          T dZ_reg=r_trg[3*t+2]-r_src[3*s+2];
          T invR = (dX_reg*dX_reg+dY_reg*dY_reg+dZ_reg*dZ_reg);
          if (invR!=0) invR = 1.0/sqrt(invR);
          p[0] += v_src[s*dof+0]*invR;
          p[1] += v_src[s*dof+1]*invR;
        }
        k_out[t*dof+0] += p[0]*OOFP;
        k_out[t*dof+1] += p[1]*OOFP;
      }
    }
  }else if(dof==3){
    T p[MAX_DOF];
    for (int t_=0; t_<trg_cnt; t_+=EVAL_BLKSZ)
    for (int s_=0; s_<src_cnt; s_+=EVAL_BLKSZ){
      int src_blk=s_+EVAL_BLKSZ; src_blk=(src_blk>src_cnt?src_cnt:src_blk);
      int trg_blk=t_+EVAL_BLKSZ; trg_blk=(trg_blk>trg_cnt?trg_cnt:trg_blk);
      for(int t=t_;t<trg_blk;t++){
        p[0]=0; p[1]=0; p[2]=0;
        for(int s=s_;s<src_blk;s++){
          T dX_reg=r_trg[3*t  ]-r_src[3*s  ];
          T dY_reg=r_trg[3*t+1]-r_src[3*s+1];
          T dZ_reg=r_trg[3*t+2]-r_src[3*s+2];
          T invR = (dX_reg*dX_reg+dY_reg*dY_reg+dZ_reg*dZ_reg);
          if (invR!=0) invR = 1.0/sqrt(invR);
          p[0] += v_src[s*dof+0]*invR;
          p[1] += v_src[s*dof+1]*invR;
          p[2] += v_src[s*dof+2]*invR;
        }
        k_out[t*dof+0] += p[0]*OOFP;
        k_out[t*dof+1] += p[1]*OOFP;
        k_out[t*dof+2] += p[2]*OOFP;
      }
    }
  }else{
    T p[MAX_DOF];
    for (int t_=0; t_<trg_cnt; t_+=EVAL_BLKSZ)
    for (int s_=0; s_<src_cnt; s_+=EVAL_BLKSZ){
      int src_blk=s_+EVAL_BLKSZ; src_blk=(src_blk>src_cnt?src_cnt:src_blk);
      int trg_blk=t_+EVAL_BLKSZ; trg_blk=(trg_blk>trg_cnt?trg_cnt:trg_blk);
      for(int t=t_;t<trg_blk;t++){
        for(int i=0;i<dof;i++) p[i]=0;
        for(int s=s_;s<src_blk;s++){
          T dX_reg=r_trg[3*t  ]-r_src[3*s  ];
          T dY_reg=r_trg[3*t+1]-r_src[3*s+1];
          T dZ_reg=r_trg[3*t+2]-r_src[3*s+2];
          T invR = (dX_reg*dX_reg+dY_reg*dY_reg+dZ_reg*dZ_reg);
          if (invR!=0) invR = 1.0/sqrt(invR);
          for(int i=0;i<dof;i++)
            p[i] += v_src[s*dof+i]*invR;
        }
        for(int i=0;i<dof;i++)
          k_out[t*dof+i] += p[i]*OOFP;
      }
    }
  }

#ifndef __MIC__
  Profile::Add_FLOP((long long)trg_cnt*(long long)src_cnt*(10+2*dof));
#endif
#undef MAX_DOF
#undef EVAL_BLKSZ

//  for (int t=0; t<trg_cnt; t++)
//    k_out_[t] += k_out[t];
//  delete[] r_src;
//  delete[] r_trg;
//  delete[] v_src;
//  delete[] k_out;
}
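
// Design note (added annotation, not in the original source): laplace_poten_
// is a cache-blocked variant of laplace_poten. It walks the interactions in
// EVAL_BLKSZ x EVAL_BLKSZ tiles so a block of source and target coordinates
// stays resident in cache, and it unrolls the dof = 1, 2, 3 cases so the
// per-component accumulators can live in registers.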

// Laplace double-layer potential.
template <class T>
void laplace_dbl_poten(T* r_src, int src_cnt, T* v_src, int dof, T* r_trg, int trg_cnt, T* k_out, mem::MemoryManager* mem_mgr){
#ifndef __MIC__
  Profile::Add_FLOP((long long)trg_cnt*(long long)src_cnt*(19*dof));
#endif
  const T OOFP = -1.0/(4.0*const_pi<T>());
  for(int t=0;t<trg_cnt;t++){
    for(int i=0;i<dof;i++){
      T p=0;
      for(int s=0;s<src_cnt;s++){
        T dX_reg=r_trg[3*t  ]-r_src[3*s  ];
        T dY_reg=r_trg[3*t+1]-r_src[3*s+1];
        T dZ_reg=r_trg[3*t+2]-r_src[3*s+2];
        T invR = (dX_reg*dX_reg+dY_reg*dY_reg+dZ_reg*dZ_reg);
        if (invR!=0) invR = 1.0/sqrt(invR);
        p = v_src[(s*dof+i)*4+3]*invR*invR*invR;
        k_out[t*dof+i] += p*OOFP*( dX_reg*v_src[(s*dof+i)*4+0] +
                                   dY_reg*v_src[(s*dof+i)*4+1] +
                                   dZ_reg*v_src[(s*dof+i)*4+2] );
      }
    }
  }
}
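
// For reference (added annotation, not in the original source): each source
// carries a dipole direction n = v_src[..+0..2] and a density q = v_src[..+3];
// the loop above accumulates the Laplace double-layer potential
//   u(t) = -1/(4*pi) * sum_s q_s * (n_s . (t - s)) / |t - s|^3.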

// Laplace gradient kernel.
template <class T>
void laplace_grad(T* r_src, int src_cnt, T* v_src, int dof, T* r_trg, int trg_cnt, T* k_out, mem::MemoryManager* mem_mgr){
#ifndef __MIC__
  Profile::Add_FLOP((long long)trg_cnt*(long long)src_cnt*(10+12*dof));
#endif
  const T OOFP = -1.0/(4.0*const_pi<T>());
  if(dof==1){
    for(int t=0;t<trg_cnt;t++){
      T p=0;
      for(int s=0;s<src_cnt;s++){
        T dX_reg=r_trg[3*t  ]-r_src[3*s  ];
        T dY_reg=r_trg[3*t+1]-r_src[3*s+1];
        T dZ_reg=r_trg[3*t+2]-r_src[3*s+2];
        T invR = (dX_reg*dX_reg+dY_reg*dY_reg+dZ_reg*dZ_reg);
        if (invR!=0) invR = 1.0/sqrt(invR);
        p = v_src[s]*invR*invR*invR;
        k_out[(t)*3+0] += p*OOFP*dX_reg;
        k_out[(t)*3+1] += p*OOFP*dY_reg;
        k_out[(t)*3+2] += p*OOFP*dZ_reg;
      }
    }
  }else if(dof==2){
    for(int t=0;t<trg_cnt;t++){
      T p=0;
      for(int s=0;s<src_cnt;s++){
        T dX_reg=r_trg[3*t  ]-r_src[3*s  ];
        T dY_reg=r_trg[3*t+1]-r_src[3*s+1];
        T dZ_reg=r_trg[3*t+2]-r_src[3*s+2];
        T invR = (dX_reg*dX_reg+dY_reg*dY_reg+dZ_reg*dZ_reg);
        if (invR!=0) invR = 1.0/sqrt(invR);
        p = v_src[s*dof+0]*invR*invR*invR;
        k_out[(t*dof+0)*3+0] += p*OOFP*dX_reg;
        k_out[(t*dof+0)*3+1] += p*OOFP*dY_reg;
        k_out[(t*dof+0)*3+2] += p*OOFP*dZ_reg;
        p = v_src[s*dof+1]*invR*invR*invR;
        k_out[(t*dof+1)*3+0] += p*OOFP*dX_reg;
        k_out[(t*dof+1)*3+1] += p*OOFP*dY_reg;
        k_out[(t*dof+1)*3+2] += p*OOFP*dZ_reg;
      }
    }
  }else if(dof==3){
    for(int t=0;t<trg_cnt;t++){
      T p=0;
      for(int s=0;s<src_cnt;s++){
        T dX_reg=r_trg[3*t  ]-r_src[3*s  ];
        T dY_reg=r_trg[3*t+1]-r_src[3*s+1];
        T dZ_reg=r_trg[3*t+2]-r_src[3*s+2];
        T invR = (dX_reg*dX_reg+dY_reg*dY_reg+dZ_reg*dZ_reg);
        if (invR!=0) invR = 1.0/sqrt(invR);
        p = v_src[s*dof+0]*invR*invR*invR;
        k_out[(t*dof+0)*3+0] += p*OOFP*dX_reg;
        k_out[(t*dof+0)*3+1] += p*OOFP*dY_reg;
        k_out[(t*dof+0)*3+2] += p*OOFP*dZ_reg;
        p = v_src[s*dof+1]*invR*invR*invR;
        k_out[(t*dof+1)*3+0] += p*OOFP*dX_reg;
        k_out[(t*dof+1)*3+1] += p*OOFP*dY_reg;
        k_out[(t*dof+1)*3+2] += p*OOFP*dZ_reg;
        p = v_src[s*dof+2]*invR*invR*invR;
        k_out[(t*dof+2)*3+0] += p*OOFP*dX_reg;
        k_out[(t*dof+2)*3+1] += p*OOFP*dY_reg;
        k_out[(t*dof+2)*3+2] += p*OOFP*dZ_reg;
      }
    }
  }else{
    for(int t=0;t<trg_cnt;t++){
      for(int i=0;i<dof;i++){
        T p=0;
        for(int s=0;s<src_cnt;s++){
          T dX_reg=r_trg[3*t  ]-r_src[3*s  ];
          T dY_reg=r_trg[3*t+1]-r_src[3*s+1];
          T dZ_reg=r_trg[3*t+2]-r_src[3*s+2];
          T invR = (dX_reg*dX_reg+dY_reg*dY_reg+dZ_reg*dZ_reg);
          if (invR!=0) invR = 1.0/sqrt(invR);
          p = v_src[s*dof+i]*invR*invR*invR;
          k_out[(t*dof+i)*3+0] += p*OOFP*dX_reg;
          k_out[(t*dof+i)*3+1] += p*OOFP*dY_reg;
          k_out[(t*dof+i)*3+2] += p*OOFP*dZ_reg;
        }
      }
    }
  }
}
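
// For reference (added annotation, not in the original source): the quantity
// accumulated above is the gradient (with respect to the target) of the
// single-layer potential,
//   grad u(t) = -1/(4*pi) * sum_s q_s * (t - s) / |t - s|^3,
// stored as 3 consecutive output components per (target, dof) pair.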

#ifndef __MIC__
#ifdef USE_SSE
namespace
{
#define IDEAL_ALIGNMENT 16
#define SIMD_LEN (int)(IDEAL_ALIGNMENT / sizeof(double))
#define DECL_SIMD_ALIGNED __declspec(align(IDEAL_ALIGNMENT))

void laplaceSSE(
    const int ns,
    const int nt,
    const double *sx,
    const double *sy,
    const double *sz,
    const double *tx,
    const double *ty,
    const double *tz,
    const double *srcDen,
    double *trgVal)
{
  if ( size_t(sx)%IDEAL_ALIGNMENT || size_t(sy)%IDEAL_ALIGNMENT || size_t(sz)%IDEAL_ALIGNMENT )
    abort();

  double OOFP = 1.0/(4.0*const_pi<double>());
  __m128d temp;

  double aux_arr[SIMD_LEN+1];
  double *tempval;
  // if aux_arr is misaligned
  if (size_t(aux_arr)%IDEAL_ALIGNMENT) tempval = aux_arr + 1;
  else tempval = aux_arr;
  if (size_t(tempval)%IDEAL_ALIGNMENT) abort();

  /*! One over four pi */
  __m128d oofp = _mm_set1_pd (OOFP);
  __m128d half = _mm_set1_pd (0.5);
  __m128d opf  = _mm_set1_pd (1.5);
  __m128d zero = _mm_setzero_pd ();

  // loop over targets
  int i = 0;
  for (; i < nt; i++) {
    temp = _mm_setzero_pd();
    __m128d txi = _mm_load1_pd (&tx[i]);
    __m128d tyi = _mm_load1_pd (&ty[i]);
    __m128d tzi = _mm_load1_pd (&tz[i]);
    int j = 0;
    // Load and calculate in groups of SIMD_LEN
    for (; j + SIMD_LEN <= ns; j+=SIMD_LEN) {
      __m128d sxj = _mm_load_pd (&sx[j]);
      __m128d syj = _mm_load_pd (&sy[j]);
      __m128d szj = _mm_load_pd (&sz[j]);
      __m128d sden = _mm_set_pd (srcDen[j+1], srcDen[j]);

      __m128d dX, dY, dZ;
      __m128d dR2;
      __m128d S;

      dX = _mm_sub_pd(txi , sxj);
      dY = _mm_sub_pd(tyi , syj);
      dZ = _mm_sub_pd(tzi , szj);

      sxj = _mm_mul_pd(dX, dX);
      syj = _mm_mul_pd(dY, dY);
      szj = _mm_mul_pd(dZ, dZ);

      dR2 = _mm_add_pd(sxj, syj);
      dR2 = _mm_add_pd(szj, dR2);
      __m128d reqzero = _mm_cmpeq_pd (dR2, zero);

      __m128d xhalf = _mm_mul_pd (half, dR2);
      __m128 dR2_s = _mm_cvtpd_ps(dR2);
      __m128 S_s   = _mm_rsqrt_ps(dR2_s);
      __m128d S_d  = _mm_cvtps_pd(S_s);
      // To handle the condition when src and trg coincide
      S_d = _mm_andnot_pd (reqzero, S_d);

      // One Newton-Raphson step refines the low-precision rsqrt estimate:
      // S = S_d * (1.5 - 0.5*dR2*S_d*S_d)
      S = _mm_mul_pd (S_d, S_d);
      S = _mm_mul_pd (S, xhalf);
      S = _mm_sub_pd (opf, S);
      S = _mm_mul_pd (S, S_d);

      sden = _mm_mul_pd (sden, S);
      temp = _mm_add_pd (sden, temp);
    }
    temp = _mm_mul_pd (temp, oofp);
    _mm_store_pd(tempval, temp);
    for (int k = 0; k < SIMD_LEN; k++) {
      trgVal[i] += tempval[k];
    }
    // Scalar remainder loop for the last (ns % SIMD_LEN) sources.
    for (; j < ns; j++) {
      double x = tx[i] - sx[j];
      double y = ty[i] - sy[j];
      double z = tz[i] - sz[j];
      double r2 = x*x + y*y + z*z;
      double r = sqrt(r2);
      double invdr;
      if (r == 0)
        invdr = 0;
      else
        invdr = 1/r;
      double den = srcDen[j];
      trgVal[i] += den*invdr*OOFP;
    }
  }
  return;
}
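
// Design note (added annotation, not in the original source): _mm_rsqrt_ps
// delivers only a ~12-bit estimate of 1/sqrt(r2), so every SSE kernel in this
// file rounds the squared distance to single precision, takes the fast
// reciprocal square root, converts back to double, and applies one
// Newton-Raphson step. Conceptually, per element:
//   double s = (double)(1.0f/sqrtf((float)r2)); // coarse rsqrt estimate
//   s = s*(1.5 - 0.5*r2*s*s);                   // one Newton step
// The _mm_andnot_pd with the (r2 == 0) mask zeroes the estimate so coincident
// source/target pairs contribute nothing.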

void laplaceDblSSE(
    const int ns,
    const int nt,
    const double *sx,
    const double *sy,
    const double *sz,
    const double *tx,
    const double *ty,
    const double *tz,
    const double *srcDen,
    double *trgVal)
{
  if ( size_t(sx)%IDEAL_ALIGNMENT || size_t(sy)%IDEAL_ALIGNMENT || size_t(sz)%IDEAL_ALIGNMENT )
    abort();

  double OOFP = 1.0/(4.0*const_pi<double>());
  __m128d temp;

  double aux_arr[SIMD_LEN+1];
  double *tempval;
  // if aux_arr is misaligned
  if (size_t(aux_arr)%IDEAL_ALIGNMENT) tempval = aux_arr + 1;
  else tempval = aux_arr;
  if (size_t(tempval)%IDEAL_ALIGNMENT) abort();

  /*! One over four pi */
  __m128d oofp = _mm_set1_pd (OOFP);
  __m128d half = _mm_set1_pd (0.5);
  __m128d opf  = _mm_set1_pd (1.5);
  __m128d zero = _mm_setzero_pd ();

  // loop over targets
  int i = 0;
  for (; i < nt; i++) {
    temp = _mm_setzero_pd();
    __m128d txi = _mm_load1_pd (&tx[i]);
    __m128d tyi = _mm_load1_pd (&ty[i]);
    __m128d tzi = _mm_load1_pd (&tz[i]);
    int j = 0;
    // Load and calculate in groups of SIMD_LEN
    for (; j + SIMD_LEN <= ns; j+=SIMD_LEN) {
      __m128d sxj = _mm_load_pd (&sx[j]);
      __m128d syj = _mm_load_pd (&sy[j]);
      __m128d szj = _mm_load_pd (&sz[j]);

      __m128d snormx = _mm_set_pd (srcDen[(j+1)*4+0], srcDen[j*4+0]);
      __m128d snormy = _mm_set_pd (srcDen[(j+1)*4+1], srcDen[j*4+1]);
      __m128d snormz = _mm_set_pd (srcDen[(j+1)*4+2], srcDen[j*4+2]);
      __m128d sden   = _mm_set_pd (srcDen[(j+1)*4+3], srcDen[j*4+3]);

      __m128d dX, dY, dZ;
      __m128d dR2;
      __m128d S;
      __m128d S2;
      __m128d S3;

      dX = _mm_sub_pd(txi , sxj);
      dY = _mm_sub_pd(tyi , syj);
      dZ = _mm_sub_pd(tzi , szj);

      sxj = _mm_mul_pd(dX, dX);
      syj = _mm_mul_pd(dY, dY);
      szj = _mm_mul_pd(dZ, dZ);

      dR2 = _mm_add_pd(sxj, syj);
      dR2 = _mm_add_pd(szj, dR2);
      __m128d reqzero = _mm_cmpeq_pd (dR2, zero);

      __m128d xhalf = _mm_mul_pd (half, dR2);
      __m128 dR2_s = _mm_cvtpd_ps(dR2);
      __m128 S_s   = _mm_rsqrt_ps(dR2_s);
      __m128d S_d  = _mm_cvtps_pd(S_s);
      // To handle the condition when src and trg coincide
      S_d = _mm_andnot_pd (reqzero, S_d);

      S  = _mm_mul_pd (S_d, S_d);
      S  = _mm_mul_pd (S, xhalf);
      S  = _mm_sub_pd (opf, S);
      S  = _mm_mul_pd (S, S_d);
      S2 = _mm_mul_pd (S, S);
      S3 = _mm_mul_pd (S2, S);

      __m128d S3_sden=_mm_mul_pd(S3, sden);

      __m128d dot_sum = _mm_add_pd(_mm_mul_pd(snormx,dX),_mm_mul_pd(snormy,dY));
      dot_sum = _mm_add_pd(dot_sum,_mm_mul_pd(snormz,dZ));

      temp = _mm_add_pd(_mm_mul_pd(S3_sden,dot_sum),temp);
    }
    temp = _mm_mul_pd (temp, oofp);
    _mm_store_pd(tempval, temp);
    for (int k = 0; k < SIMD_LEN; k++) {
      trgVal[i] += tempval[k];
    }
    // Scalar remainder loop for the last (ns % SIMD_LEN) sources.
    for (; j < ns; j++) {
      double x = tx[i] - sx[j];
      double y = ty[i] - sy[j];
      double z = tz[i] - sz[j];
      double r2 = x*x + y*y + z*z;
      double r = sqrt(r2);
      double invdr;
      if (r == 0)
        invdr = 0;
      else
        invdr = 1/r;
      double invdr2=invdr*invdr;
      double invdr3=invdr2*invdr;
      double dot_sum = x*srcDen[j*4+0] + y*srcDen[j*4+1] + z*srcDen[j*4+2];
      trgVal[i] += OOFP*invdr3*srcDen[j*4+3]*dot_sum; // same quantity as the vectorized loop above
    }
  }
  return;
}

void laplaceGradSSE(
    const int ns,
    const int nt,
    const double *sx,
    const double *sy,
    const double *sz,
    const double *tx,
    const double *ty,
    const double *tz,
    const double *srcDen,
    double *trgVal)
{
  if ( size_t(sx)%IDEAL_ALIGNMENT || size_t(sy)%IDEAL_ALIGNMENT || size_t(sz)%IDEAL_ALIGNMENT )
    abort();

  double OOFP = 1.0/(4.0*const_pi<double>());
  __m128d tempx; __m128d tempy; __m128d tempz;

  double aux_arr[3*SIMD_LEN+1];
  double *tempvalx, *tempvaly, *tempvalz;
  // if aux_arr is misaligned
  if (size_t(aux_arr)%IDEAL_ALIGNMENT) tempvalx = aux_arr + 1;
  else tempvalx = aux_arr;
  if (size_t(tempvalx)%IDEAL_ALIGNMENT) abort();
  tempvaly=tempvalx+SIMD_LEN;
  tempvalz=tempvaly+SIMD_LEN;

  /*! One over four pi */
  __m128d oofp = _mm_set1_pd (OOFP);
  __m128d half = _mm_set1_pd (0.5);
  __m128d opf  = _mm_set1_pd (1.5);
  __m128d zero = _mm_setzero_pd ();

  // loop over targets
  int i = 0;
  for (; i < nt; i++) {
    tempx = _mm_setzero_pd();
    tempy = _mm_setzero_pd();
    tempz = _mm_setzero_pd();

    __m128d txi = _mm_load1_pd (&tx[i]);
    __m128d tyi = _mm_load1_pd (&ty[i]);
    __m128d tzi = _mm_load1_pd (&tz[i]);
    int j = 0;
    // Load and calculate in groups of SIMD_LEN
    for (; j + SIMD_LEN <= ns; j+=SIMD_LEN) {
      __m128d sxj = _mm_load_pd (&sx[j]);
      __m128d syj = _mm_load_pd (&sy[j]);
      __m128d szj = _mm_load_pd (&sz[j]);
      __m128d sden = _mm_set_pd (srcDen[j+1], srcDen[j]);

      __m128d dX, dY, dZ;
      __m128d dR2;
      __m128d S;
      __m128d S2;
      __m128d S3;

      dX = _mm_sub_pd(txi , sxj);
      dY = _mm_sub_pd(tyi , syj);
      dZ = _mm_sub_pd(tzi , szj);

      sxj = _mm_mul_pd(dX, dX);
      syj = _mm_mul_pd(dY, dY);
      szj = _mm_mul_pd(dZ, dZ);

      dR2 = _mm_add_pd(sxj, syj);
      dR2 = _mm_add_pd(szj, dR2);
      __m128d reqzero = _mm_cmpeq_pd (dR2, zero);

      __m128d xhalf = _mm_mul_pd (half, dR2);
      __m128 dR2_s = _mm_cvtpd_ps(dR2);
      __m128 S_s   = _mm_rsqrt_ps(dR2_s);
      __m128d S_d  = _mm_cvtps_pd(S_s);
      // To handle the condition when src and trg coincide
      S_d = _mm_andnot_pd (reqzero, S_d);

      S  = _mm_mul_pd (S_d, S_d);
      S  = _mm_mul_pd (S, xhalf);
      S  = _mm_sub_pd (opf, S);
      S  = _mm_mul_pd (S, S_d);
      S2 = _mm_mul_pd (S, S);
      S3 = _mm_mul_pd (S2, S);

      __m128d S3_sden=_mm_mul_pd(S3, sden);
      tempx = _mm_add_pd(_mm_mul_pd(S3_sden,dX),tempx);
      tempy = _mm_add_pd(_mm_mul_pd(S3_sden,dY),tempy);
      tempz = _mm_add_pd(_mm_mul_pd(S3_sden,dZ),tempz);
    }
    tempx = _mm_mul_pd (tempx, oofp);
    tempy = _mm_mul_pd (tempy, oofp);
    tempz = _mm_mul_pd (tempz, oofp);

    _mm_store_pd(tempvalx, tempx);
    _mm_store_pd(tempvaly, tempy);
    _mm_store_pd(tempvalz, tempz);
    for (int k = 0; k < SIMD_LEN; k++) {
      trgVal[i*3  ] += tempvalx[k];
      trgVal[i*3+1] += tempvaly[k];
      trgVal[i*3+2] += tempvalz[k];
    }
    // Scalar remainder loop for the last (ns % SIMD_LEN) sources.
    for (; j < ns; j++) {
      double x = tx[i] - sx[j];
      double y = ty[i] - sy[j];
      double z = tz[i] - sz[j];
      double r2 = x*x + y*y + z*z;
      double r = sqrt(r2);
      double invdr;
      if (r == 0)
        invdr = 0;
      else
        invdr = 1/r;
      double invdr2=invdr*invdr;
      double invdr3=invdr2*invdr;

      trgVal[i*3  ] += OOFP*invdr3*x*srcDen[j];
      trgVal[i*3+1] += OOFP*invdr3*y*srcDen[j];
      trgVal[i*3+2] += OOFP*invdr3*z*srcDen[j];
    }
  }
  return;
}
#undef SIMD_LEN

#define X(s,k) (s)[(k)*COORD_DIM]
#define Y(s,k) (s)[(k)*COORD_DIM+1]
#define Z(s,k) (s)[(k)*COORD_DIM+2]

void laplaceSSEShuffle(const int ns, const int nt, float const src[], float const trg[], float const den[], float pot[], mem::MemoryManager* mem_mgr=NULL)
{
  // TODO
}

void laplaceSSEShuffle(const int ns, const int nt, double const src[], double const trg[], double const den[], double pot[], mem::MemoryManager* mem_mgr=NULL)
{
  double* buff=NULL;
  if(mem_mgr) buff=(double*)mem_mgr->malloc((ns+1+nt)*3*sizeof(double));
  else buff=(double*)malloc((ns+1+nt)*3*sizeof(double));

  double* buff_=buff;
  pvfmm::Vector<double> xs(ns+1,buff_,false); buff_+=ns+1;
  pvfmm::Vector<double> ys(ns+1,buff_,false); buff_+=ns+1;
  pvfmm::Vector<double> zs(ns+1,buff_,false); buff_+=ns+1;
  pvfmm::Vector<double> xt(nt  ,buff_,false); buff_+=nt  ;
  pvfmm::Vector<double> yt(nt  ,buff_,false); buff_+=nt  ;
  pvfmm::Vector<double> zt(nt  ,buff_,false); buff_+=nt  ;
  //std::vector<double> xs(ns+1);
  //std::vector<double> ys(ns+1);
  //std::vector<double> zs(ns+1);
  //std::vector<double> xt(nt  );
  //std::vector<double> yt(nt  );
  //std::vector<double> zt(nt  );

  int x_shift = size_t(&xs[0]) % IDEAL_ALIGNMENT ? 1:0;
  int y_shift = size_t(&ys[0]) % IDEAL_ALIGNMENT ? 1:0;
  int z_shift = size_t(&zs[0]) % IDEAL_ALIGNMENT ? 1:0;

  //1. reshuffle memory
  for (int k =0;k<ns;k++){
    xs[k+x_shift]=X(src,k);
    ys[k+y_shift]=Y(src,k);
    zs[k+z_shift]=Z(src,k);
  }
  for (int k=0;k<nt;k++){
    xt[k]=X(trg,k);
    yt[k]=Y(trg,k);
    zt[k]=Z(trg,k);
  }

  //2. perform calculation
  laplaceSSE(ns,nt,&xs[x_shift],&ys[y_shift],&zs[z_shift],&xt[0],&yt[0],&zt[0],den,pot);

  if(mem_mgr) mem_mgr->free(buff);
  else free(buff);
  return;
}
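
// Design note (added annotation, not in the original source): the *SSEShuffle
// helpers convert the interleaved xyzxyz... coordinate layout (AoS) into three
// separate x/y/z arrays (SoA) so that _mm_load_pd can read aligned pairs. The
// x_shift / y_shift / z_shift offsets skip one leading element whenever a
// buffer does not start on a 16-byte boundary, which is why each source array
// is allocated with ns+1 entries.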

void laplaceDblSSEShuffle(const int ns, const int nt, float const src[], float const trg[], float const den[], float pot[], mem::MemoryManager* mem_mgr=NULL)
{
  // TODO
}

void laplaceDblSSEShuffle(const int ns, const int nt, double const src[], double const trg[], double const den[], double pot[], mem::MemoryManager* mem_mgr=NULL)
{
  std::vector<double> xs(ns+1); std::vector<double> xt(nt);
  std::vector<double> ys(ns+1); std::vector<double> yt(nt);
  std::vector<double> zs(ns+1); std::vector<double> zt(nt);

  int x_shift = size_t(&xs[0]) % IDEAL_ALIGNMENT ? 1:0;
  int y_shift = size_t(&ys[0]) % IDEAL_ALIGNMENT ? 1:0;
  int z_shift = size_t(&zs[0]) % IDEAL_ALIGNMENT ? 1:0;

  //1. reshuffle memory
  for (int k =0;k<ns;k++){
    xs[k+x_shift]=X(src,k);
    ys[k+y_shift]=Y(src,k);
    zs[k+z_shift]=Z(src,k);
  }
  for (int k=0;k<nt;k++){
    xt[k]=X(trg,k);
    yt[k]=Y(trg,k);
    zt[k]=Z(trg,k);
  }

  //2. perform calculation
  laplaceDblSSE(ns,nt,&xs[x_shift],&ys[y_shift],&zs[z_shift],&xt[0],&yt[0],&zt[0],den,pot);
  return;
}

void laplaceGradSSEShuffle(const int ns, const int nt, float const src[], float const trg[], float const den[], float pot[], mem::MemoryManager* mem_mgr=NULL)
{
  // TODO
}

void laplaceGradSSEShuffle(const int ns, const int nt, double const src[], double const trg[], double const den[], double pot[], mem::MemoryManager* mem_mgr=NULL)
{
  // Static per-thread scratch buffers, indexed by the OpenMP thread id
  // (supports up to 100 threads).
  int tid=omp_get_thread_num();
  static std::vector<std::vector<double> > xs_(100); static std::vector<std::vector<double> > xt_(100);
  static std::vector<std::vector<double> > ys_(100); static std::vector<std::vector<double> > yt_(100);
  static std::vector<std::vector<double> > zs_(100); static std::vector<std::vector<double> > zt_(100);

  std::vector<double>& xs=xs_[tid]; std::vector<double>& xt=xt_[tid];
  std::vector<double>& ys=ys_[tid]; std::vector<double>& yt=yt_[tid];
  std::vector<double>& zs=zs_[tid]; std::vector<double>& zt=zt_[tid];
  xs.resize(ns+1); xt.resize(nt);
  ys.resize(ns+1); yt.resize(nt);
  zs.resize(ns+1); zt.resize(nt);

  int x_shift = size_t(&xs[0]) % IDEAL_ALIGNMENT ? 1:0;
  int y_shift = size_t(&ys[0]) % IDEAL_ALIGNMENT ? 1:0;
  int z_shift = size_t(&zs[0]) % IDEAL_ALIGNMENT ? 1:0;

  //1. reshuffle memory
  for (int k =0;k<ns;k++){
    xs[k+x_shift]=X(src,k);
    ys[k+y_shift]=Y(src,k);
    zs[k+z_shift]=Z(src,k);
  }
  for (int k=0;k<nt;k++){
    xt[k]=X(trg,k);
    yt[k]=Y(trg,k);
    zt[k]=Z(trg,k);
  }

  //2. perform calculation
  laplaceGradSSE(ns,nt,&xs[x_shift],&ys[y_shift],&zs[z_shift],&xt[0],&yt[0],&zt[0],den,pot);
  return;
}

#undef X
#undef Y
#undef Z
#undef IDEAL_ALIGNMENT
#undef DECL_SIMD_ALIGNED
}

template <>
void laplace_poten<double>(double* r_src, int src_cnt, double* v_src, int dof, double* r_trg, int trg_cnt, double* k_out, mem::MemoryManager* mem_mgr){
  Profile::Add_FLOP((long long)trg_cnt*(long long)src_cnt*(12*dof));
  if(dof==1){
    laplaceSSEShuffle(src_cnt, trg_cnt, r_src, r_trg, v_src, k_out, mem_mgr);
    return;
  }
}

template <>
void laplace_dbl_poten<double>(double* r_src, int src_cnt, double* v_src, int dof, double* r_trg, int trg_cnt, double* k_out, mem::MemoryManager* mem_mgr){
  Profile::Add_FLOP((long long)trg_cnt*(long long)src_cnt*(19*dof));
  if(dof==1){
    laplaceDblSSEShuffle(src_cnt, trg_cnt, r_src, r_trg, v_src, k_out, mem_mgr);
    return;
  }
}

template <>
void laplace_grad<double>(double* r_src, int src_cnt, double* v_src, int dof, double* r_trg, int trg_cnt, double* k_out, mem::MemoryManager* mem_mgr){
  Profile::Add_FLOP((long long)trg_cnt*(long long)src_cnt*(10+12*dof));
  if(dof==1){
    laplaceGradSSEShuffle(src_cnt, trg_cnt, r_src, r_trg, v_src, k_out, mem_mgr);
    return;
  }
}
#endif
#endif

////////////////////////////////////////////////////////////////////////////////
////////                        STOKES KERNEL                           ////////
////////////////////////////////////////////////////////////////////////////////

/**
 * \brief Green's function for the Stokes equation. Kernel tensor
 * dimension = 3x3.
 */
template <class T>
void stokes_vel(T* r_src, int src_cnt, T* v_src_, int dof, T* r_trg, int trg_cnt, T* k_out, mem::MemoryManager* mem_mgr){
#ifndef __MIC__
  Profile::Add_FLOP((long long)trg_cnt*(long long)src_cnt*(28*dof));
#endif
  const T mu=1.0;
  const T OOEPMU = 1.0/(8.0*const_pi<T>()*mu);
  for(int t=0;t<trg_cnt;t++){
    for(int i=0;i<dof;i++){
      T p[3]={0,0,0};
      for(int s=0;s<src_cnt;s++){
        T dR[3]={r_trg[3*t  ]-r_src[3*s  ],
                 r_trg[3*t+1]-r_src[3*s+1],
                 r_trg[3*t+2]-r_src[3*s+2]};
        T R = (dR[0]*dR[0]+dR[1]*dR[1]+dR[2]*dR[2]);
        if (R!=0){
          T invR2=1.0/R;
          T invR=sqrt(invR2);
          T v_src[3]={v_src_[(s*dof+i)*3  ],
                      v_src_[(s*dof+i)*3+1],
                      v_src_[(s*dof+i)*3+2]};
          T inner_prod=(v_src[0]*dR[0] +
                        v_src[1]*dR[1] +
                        v_src[2]*dR[2])* invR2;
          p[0] += (v_src[0] + dR[0]*inner_prod)*invR;
          p[1] += (v_src[1] + dR[1]*inner_prod)*invR;
          p[2] += (v_src[2] + dR[2]*inner_prod)*invR;
        }
      }
      k_out[(t*dof+i)*3+0] += p[0]*OOEPMU;
      k_out[(t*dof+i)*3+1] += p[1]*OOEPMU;
      k_out[(t*dof+i)*3+2] += p[2]*OOEPMU;
    }
  }
}
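
// For reference (added annotation, not in the original source): stokes_vel
// evaluates the Stokeslet (Oseen tensor) with unit viscosity,
//   u(t) = 1/(8*pi*mu) * sum_s ( f_s/|r| + (f_s . r) r / |r|^3 ),  r = t - s,
// skipping the singular self term when |r| = 0.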

template <class T>
void stokes_sym_dip(T* r_src, int src_cnt, T* v_src, int dof, T* r_trg, int trg_cnt, T* k_out, mem::MemoryManager* mem_mgr){
#ifndef __MIC__
  Profile::Add_FLOP((long long)trg_cnt*(long long)src_cnt*(47*dof));
#endif
  const T mu=1.0;
  const T OOEPMU = -1.0/(8.0*const_pi<T>()*mu);
  for(int t=0;t<trg_cnt;t++){
    for(int i=0;i<dof;i++){
      T p[3]={0,0,0};
      for(int s=0;s<src_cnt;s++){
        T dR[3]={r_trg[3*t  ]-r_src[3*s  ],
                 r_trg[3*t+1]-r_src[3*s+1],
                 r_trg[3*t+2]-r_src[3*s+2]};
        T R = (dR[0]*dR[0]+dR[1]*dR[1]+dR[2]*dR[2]);
        if (R!=0){
          T invR2=1.0/R;
          T invR=sqrt(invR2);
          T invR3=invR2*invR;

          T* f=&v_src[(s*dof+i)*6+0];
          T* n=&v_src[(s*dof+i)*6+3];

          T r_dot_n=(n[0]*dR[0]+n[1]*dR[1]+n[2]*dR[2]);
          T r_dot_f=(f[0]*dR[0]+f[1]*dR[1]+f[2]*dR[2]);
          T n_dot_f=(f[0]* n[0]+f[1]* n[1]+f[2]* n[2]);

          p[0] += dR[0]*(n_dot_f - 3*r_dot_n*r_dot_f*invR2)*invR3;
          p[1] += dR[1]*(n_dot_f - 3*r_dot_n*r_dot_f*invR2)*invR3;
          p[2] += dR[2]*(n_dot_f - 3*r_dot_n*r_dot_f*invR2)*invR3;
        }
      }
      k_out[(t*dof+i)*3+0] += p[0]*OOEPMU;
      k_out[(t*dof+i)*3+1] += p[1]*OOEPMU;
      k_out[(t*dof+i)*3+2] += p[2]*OOEPMU;
    }
  }
}

template <class T>
void stokes_press(T* r_src, int src_cnt, T* v_src_, int dof, T* r_trg, int trg_cnt, T* k_out, mem::MemoryManager* mem_mgr){
#ifndef __MIC__
  Profile::Add_FLOP((long long)trg_cnt*(long long)src_cnt*(17*dof));
#endif
  const T OOFP = 1.0/(4.0*const_pi<T>());
  for(int t=0;t<trg_cnt;t++){
    for(int i=0;i<dof;i++){
      T p=0;
      for(int s=0;s<src_cnt;s++){
        T dR[3]={r_trg[3*t  ]-r_src[3*s  ],
                 r_trg[3*t+1]-r_src[3*s+1],
                 r_trg[3*t+2]-r_src[3*s+2]};
        T R = (dR[0]*dR[0]+dR[1]*dR[1]+dR[2]*dR[2]);
        if (R!=0){
          T invR2=1.0/R;
          T invR=sqrt(invR2);
          T invR3=invR2*invR;
          T v_src[3]={v_src_[(s*dof+i)*3  ],
                      v_src_[(s*dof+i)*3+1],
                      v_src_[(s*dof+i)*3+2]};
          T inner_prod=(v_src[0]*dR[0] +
                        v_src[1]*dR[1] +
                        v_src[2]*dR[2])* invR3;
          p += inner_prod;
        }
      }
      k_out[t*dof+i] += p*OOFP;
    }
  }
}
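
// For reference (added annotation, not in the original source): the pressure
// field associated with a collection of Stokeslets,
//   p(t) = 1/(4*pi) * sum_s (f_s . (t - s)) / |t - s|^3.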

template <class T>
void stokes_stress(T* r_src, int src_cnt, T* v_src_, int dof, T* r_trg, int trg_cnt, T* k_out, mem::MemoryManager* mem_mgr){
#ifndef __MIC__
  Profile::Add_FLOP((long long)trg_cnt*(long long)src_cnt*(45*dof));
#endif
  const T TOFP = -3.0/(4.0*const_pi<T>());
  for(int t=0;t<trg_cnt;t++){
    for(int i=0;i<dof;i++){
      T p[9]={0,0,0,
              0,0,0,
              0,0,0};
      for(int s=0;s<src_cnt;s++){
        T dR[3]={r_trg[3*t  ]-r_src[3*s  ],
                 r_trg[3*t+1]-r_src[3*s+1],
                 r_trg[3*t+2]-r_src[3*s+2]};
        T R = (dR[0]*dR[0]+dR[1]*dR[1]+dR[2]*dR[2]);
        if (R!=0){
          T invR2=1.0/R;
          T invR=sqrt(invR2);
          T invR3=invR2*invR;
          T invR5=invR3*invR2;
          T v_src[3]={v_src_[(s*dof+i)*3  ],
                      v_src_[(s*dof+i)*3+1],
                      v_src_[(s*dof+i)*3+2]};
          T inner_prod=(v_src[0]*dR[0] +
                        v_src[1]*dR[1] +
                        v_src[2]*dR[2])* invR5;
          p[0] += inner_prod*dR[0]*dR[0]; p[1] += inner_prod*dR[1]*dR[0]; p[2] += inner_prod*dR[2]*dR[0];
          p[3] += inner_prod*dR[0]*dR[1]; p[4] += inner_prod*dR[1]*dR[1]; p[5] += inner_prod*dR[2]*dR[1];
          p[6] += inner_prod*dR[0]*dR[2]; p[7] += inner_prod*dR[1]*dR[2]; p[8] += inner_prod*dR[2]*dR[2];
        }
      }
      k_out[(t*dof+i)*9+0] += p[0]*TOFP;
      k_out[(t*dof+i)*9+1] += p[1]*TOFP;
      k_out[(t*dof+i)*9+2] += p[2]*TOFP;
      k_out[(t*dof+i)*9+3] += p[3]*TOFP;
      k_out[(t*dof+i)*9+4] += p[4]*TOFP;
      k_out[(t*dof+i)*9+5] += p[5]*TOFP;
      k_out[(t*dof+i)*9+6] += p[6]*TOFP;
      k_out[(t*dof+i)*9+7] += p[7]*TOFP;
      k_out[(t*dof+i)*9+8] += p[8]*TOFP;
    }
  }
}
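
// For reference (added annotation, not in the original source): each 3x3 block
// accumulated above is the Stokeslet stress tensor contracted with the source
// force f,
//   sigma_ij(t) = -3/(4*pi) * sum_s (f_s . r) r_i r_j / |r|^5,  with r = t - s,
// stored as 9 consecutive output components per (target, dof) pair.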

template <class T>
void stokes_grad(T* r_src, int src_cnt, T* v_src_, int dof, T* r_trg, int trg_cnt, T* k_out, mem::MemoryManager* mem_mgr){
#ifndef __MIC__
  Profile::Add_FLOP((long long)trg_cnt*(long long)src_cnt*(89*dof));
#endif
  const T mu=1.0;
  const T OOEPMU = 1.0/(8.0*const_pi<T>()*mu);
  for(int t=0;t<trg_cnt;t++){
    for(int i=0;i<dof;i++){
      T p[9]={0,0,0,
              0,0,0,
              0,0,0};
      for(int s=0;s<src_cnt;s++){
        T dR[3]={r_trg[3*t  ]-r_src[3*s  ],
                 r_trg[3*t+1]-r_src[3*s+1],
                 r_trg[3*t+2]-r_src[3*s+2]};
        T R = (dR[0]*dR[0]+dR[1]*dR[1]+dR[2]*dR[2]);
        if (R!=0){
          T invR2=1.0/R;
          T invR=sqrt(invR2);
          T invR3=invR2*invR;
          T v_src[3]={v_src_[(s*dof+i)*3  ],
                      v_src_[(s*dof+i)*3+1],
                      v_src_[(s*dof+i)*3+2]};
          T inner_prod=(v_src[0]*dR[0] +
                        v_src[1]*dR[1] +
                        v_src[2]*dR[2]);

          p[0] += (                              inner_prod*(1-3*dR[0]*dR[0]*invR2))*invR3; //6
          p[1] += (dR[1]*v_src[0]-v_src[1]*dR[0]+inner_prod*( -3*dR[1]*dR[0]*invR2))*invR3; //9
          p[2] += (dR[2]*v_src[0]-v_src[2]*dR[0]+inner_prod*( -3*dR[2]*dR[0]*invR2))*invR3;

          p[3] += (dR[0]*v_src[1]-v_src[0]*dR[1]+inner_prod*( -3*dR[0]*dR[1]*invR2))*invR3;
          p[4] += (                              inner_prod*(1-3*dR[1]*dR[1]*invR2))*invR3;
          p[5] += (dR[2]*v_src[1]-v_src[2]*dR[1]+inner_prod*( -3*dR[2]*dR[1]*invR2))*invR3;

          p[6] += (dR[0]*v_src[2]-v_src[0]*dR[2]+inner_prod*( -3*dR[0]*dR[2]*invR2))*invR3;
          p[7] += (dR[1]*v_src[2]-v_src[1]*dR[2]+inner_prod*( -3*dR[1]*dR[2]*invR2))*invR3;
          p[8] += (                              inner_prod*(1-3*dR[2]*dR[2]*invR2))*invR3;
        }
      }
      k_out[(t*dof+i)*9+0] += p[0]*OOEPMU;
      k_out[(t*dof+i)*9+1] += p[1]*OOEPMU;
      k_out[(t*dof+i)*9+2] += p[2]*OOEPMU;
      k_out[(t*dof+i)*9+3] += p[3]*OOEPMU;
      k_out[(t*dof+i)*9+4] += p[4]*OOEPMU;
      k_out[(t*dof+i)*9+5] += p[5]*OOEPMU;
      k_out[(t*dof+i)*9+6] += p[6]*OOEPMU;
      k_out[(t*dof+i)*9+7] += p[7]*OOEPMU;
      k_out[(t*dof+i)*9+8] += p[8]*OOEPMU;
    }
  }
}

#ifndef __MIC__
#ifdef USE_SSE
namespace
{
#define IDEAL_ALIGNMENT 16
#define SIMD_LEN (int)(IDEAL_ALIGNMENT / sizeof(double))
#define DECL_SIMD_ALIGNED __declspec(align(IDEAL_ALIGNMENT))

void stokesDirectVecSSE(
    const int ns,
    const int nt,
    const double *sx,
    const double *sy,
    const double *sz,
    const double *tx,
    const double *ty,
    const double *tz,
    const double *srcDen,
    double *trgVal,
    const double cof )
{
  if ( size_t(sx)%IDEAL_ALIGNMENT || size_t(sy)%IDEAL_ALIGNMENT || size_t(sz)%IDEAL_ALIGNMENT )
    abort();

  double mu = cof;
  double OOEP = 1.0/(8.0*const_pi<double>());
  __m128d tempx;
  __m128d tempy;
  __m128d tempz;
  double oomeu = 1/mu;

  double aux_arr[3*SIMD_LEN+1];
  double *tempvalx;
  double *tempvaly;
  double *tempvalz;
  if (size_t(aux_arr)%IDEAL_ALIGNMENT) // if aux_arr is misaligned
  {
    tempvalx = aux_arr + 1;
    if (size_t(tempvalx)%IDEAL_ALIGNMENT)
      abort();
  }
  else
    tempvalx = aux_arr;
  tempvaly=tempvalx+SIMD_LEN;
  tempvalz=tempvaly+SIMD_LEN;

  /*! One over eight pi */
  __m128d ooep = _mm_set1_pd (OOEP);
  __m128d half = _mm_set1_pd (0.5);
  __m128d opf  = _mm_set1_pd (1.5);
  __m128d zero = _mm_setzero_pd ();
  __m128d oomu = _mm_set1_pd (1/mu);

  // loop over targets
  int i = 0;
  for (; i < nt; i++) {
    tempx = _mm_setzero_pd();
    tempy = _mm_setzero_pd();
    tempz = _mm_setzero_pd();

    __m128d txi = _mm_load1_pd (&tx[i]);
    __m128d tyi = _mm_load1_pd (&ty[i]);
    __m128d tzi = _mm_load1_pd (&tz[i]);
    int j = 0;
    // Load and calculate in groups of SIMD_LEN
    for (; j + SIMD_LEN <= ns; j+=SIMD_LEN) {
      __m128d sxj = _mm_load_pd (&sx[j]);
      __m128d syj = _mm_load_pd (&sy[j]);
      __m128d szj = _mm_load_pd (&sz[j]);
      __m128d sdenx = _mm_set_pd (srcDen[(j+1)*3],   srcDen[j*3]);
      __m128d sdeny = _mm_set_pd (srcDen[(j+1)*3+1], srcDen[j*3+1]);
      __m128d sdenz = _mm_set_pd (srcDen[(j+1)*3+2], srcDen[j*3+2]);

      __m128d dX, dY, dZ;
      __m128d dR2;
      __m128d S;

      dX = _mm_sub_pd(txi , sxj);
      dY = _mm_sub_pd(tyi , syj);
      dZ = _mm_sub_pd(tzi , szj);

      sxj = _mm_mul_pd(dX, dX);
      syj = _mm_mul_pd(dY, dY);
      szj = _mm_mul_pd(dZ, dZ);

      dR2 = _mm_add_pd(sxj, syj);
      dR2 = _mm_add_pd(szj, dR2);
      __m128d temp = _mm_cmpeq_pd (dR2, zero);

      __m128d xhalf = _mm_mul_pd (half, dR2);
      __m128 dR2_s = _mm_cvtpd_ps(dR2);
      __m128 S_s   = _mm_rsqrt_ps(dR2_s);
      __m128d S_d  = _mm_cvtps_pd(S_s);
      // To handle the condition when src and trg coincide
      S_d = _mm_andnot_pd (temp, S_d);

      S = _mm_mul_pd (S_d, S_d);
      S = _mm_mul_pd (S, xhalf);
      S = _mm_sub_pd (opf, S);
      S = _mm_mul_pd (S, S_d);

      __m128d dotx = _mm_mul_pd (dX, sdenx);
      __m128d doty = _mm_mul_pd (dY, sdeny);
      __m128d dotz = _mm_mul_pd (dZ, sdenz);
      __m128d dot_sum = _mm_add_pd (dotx, doty);
      dot_sum = _mm_add_pd (dot_sum, dotz);

      dot_sum = _mm_mul_pd (dot_sum, S);
      dot_sum = _mm_mul_pd (dot_sum, S);

      dotx = _mm_mul_pd (dot_sum, dX);
      doty = _mm_mul_pd (dot_sum, dY);
      dotz = _mm_mul_pd (dot_sum, dZ);

      sdenx = _mm_add_pd (sdenx, dotx);
      sdeny = _mm_add_pd (sdeny, doty);
      sdenz = _mm_add_pd (sdenz, dotz);

      sdenx = _mm_mul_pd (sdenx, S);
      sdeny = _mm_mul_pd (sdeny, S);
      sdenz = _mm_mul_pd (sdenz, S);

      tempx = _mm_add_pd (sdenx, tempx);
      tempy = _mm_add_pd (sdeny, tempy);
      tempz = _mm_add_pd (sdenz, tempz);
    }
    tempx = _mm_mul_pd (tempx, ooep);
    tempy = _mm_mul_pd (tempy, ooep);
    tempz = _mm_mul_pd (tempz, ooep);

    tempx = _mm_mul_pd (tempx, oomu);
    tempy = _mm_mul_pd (tempy, oomu);
    tempz = _mm_mul_pd (tempz, oomu);

    _mm_store_pd(tempvalx, tempx);
    _mm_store_pd(tempvaly, tempy);
    _mm_store_pd(tempvalz, tempz);
    for (int k = 0; k < SIMD_LEN; k++) {
      trgVal[i*3]   += tempvalx[k];
      trgVal[i*3+1] += tempvaly[k];
      trgVal[i*3+2] += tempvalz[k];
    }
    // Scalar remainder loop for the last (ns % SIMD_LEN) sources.
    for (; j < ns; j++) {
      double x = tx[i] - sx[j];
      double y = ty[i] - sy[j];
      double z = tz[i] - sz[j];
      double r2 = x*x + y*y + z*z;
      double r = sqrt(r2);
      double invdr;
      if (r == 0)
        invdr = 0;
      else
        invdr = 1/r;
      double dot = (x*srcDen[j*3] + y*srcDen[j*3+1] + z*srcDen[j*3+2]) * invdr * invdr;
      double denx = srcDen[j*3]   + dot*x;
      double deny = srcDen[j*3+1] + dot*y;
      double denz = srcDen[j*3+2] + dot*z;

      trgVal[i*3]   += denx*invdr*OOEP*oomeu;
      trgVal[i*3+1] += deny*invdr*OOEP*oomeu;
      trgVal[i*3+2] += denz*invdr*OOEP*oomeu;
    }
  }
  return;
}

void stokesPressureSSE(
    const int ns,
    const int nt,
    const double *sx,
    const double *sy,
    const double *sz,
    const double *tx,
    const double *ty,
    const double *tz,
    const double *srcDen,
    double *trgVal)
{
  if ( size_t(sx)%IDEAL_ALIGNMENT || size_t(sy)%IDEAL_ALIGNMENT || size_t(sz)%IDEAL_ALIGNMENT )
    abort();

  double OOFP = 1.0/(4.0*const_pi<double>());
  __m128d temp_press;

  double aux_arr[SIMD_LEN+1];
  double *tempval_press;
  if (size_t(aux_arr)%IDEAL_ALIGNMENT) // if aux_arr is misaligned
  {
    tempval_press = aux_arr + 1;
    if (size_t(tempval_press)%IDEAL_ALIGNMENT)
      abort();
  }
  else
    tempval_press = aux_arr;

  /*! One over four pi */
  __m128d oofp = _mm_set1_pd (OOFP);
  __m128d half = _mm_set1_pd (0.5);
  __m128d opf  = _mm_set1_pd (1.5);
  __m128d zero = _mm_setzero_pd ();

  // loop over targets
  int i = 0;
  for (; i < nt; i++) {
    temp_press = _mm_setzero_pd();

    __m128d txi = _mm_load1_pd (&tx[i]);
    __m128d tyi = _mm_load1_pd (&ty[i]);
    __m128d tzi = _mm_load1_pd (&tz[i]);
    int j = 0;
    // Load and calculate in groups of SIMD_LEN
    for (; j + SIMD_LEN <= ns; j+=SIMD_LEN) {
      __m128d sxj = _mm_load_pd (&sx[j]);
      __m128d syj = _mm_load_pd (&sy[j]);
      __m128d szj = _mm_load_pd (&sz[j]);
      __m128d sdenx = _mm_set_pd (srcDen[(j+1)*3],   srcDen[j*3]);
      __m128d sdeny = _mm_set_pd (srcDen[(j+1)*3+1], srcDen[j*3+1]);
      __m128d sdenz = _mm_set_pd (srcDen[(j+1)*3+2], srcDen[j*3+2]);

      __m128d dX, dY, dZ;
      __m128d dR2;
      __m128d S;

      dX = _mm_sub_pd(txi , sxj);
      dY = _mm_sub_pd(tyi , syj);
      dZ = _mm_sub_pd(tzi , szj);

      sxj = _mm_mul_pd(dX, dX);
      syj = _mm_mul_pd(dY, dY);
      szj = _mm_mul_pd(dZ, dZ);

      dR2 = _mm_add_pd(sxj, syj);
      dR2 = _mm_add_pd(szj, dR2);
      __m128d temp = _mm_cmpeq_pd (dR2, zero);

      __m128d xhalf = _mm_mul_pd (half, dR2);
      __m128 dR2_s = _mm_cvtpd_ps(dR2);
      __m128 S_s   = _mm_rsqrt_ps(dR2_s);
      __m128d S_d  = _mm_cvtps_pd(S_s);
      // To handle the condition when src and trg coincide
      S_d = _mm_andnot_pd (temp, S_d);

      S = _mm_mul_pd (S_d, S_d);
      S = _mm_mul_pd (S, xhalf);
      S = _mm_sub_pd (opf, S);
      S = _mm_mul_pd (S, S_d);

      __m128d dotx = _mm_mul_pd (dX, sdenx);
      __m128d doty = _mm_mul_pd (dY, sdeny);
      __m128d dotz = _mm_mul_pd (dZ, sdenz);
      __m128d dot_sum = _mm_add_pd (dotx, doty);
      dot_sum = _mm_add_pd (dot_sum, dotz);

      dot_sum = _mm_mul_pd (dot_sum, S);
      dot_sum = _mm_mul_pd (dot_sum, S);
      dot_sum = _mm_mul_pd (dot_sum, S);

      temp_press = _mm_add_pd (dot_sum, temp_press);
    }
    temp_press = _mm_mul_pd (temp_press, oofp);

    _mm_store_pd(tempval_press, temp_press);
    for (int k = 0; k < SIMD_LEN; k++) {
      trgVal[i] += tempval_press[k];
    }
    // Scalar remainder loop for the last (ns % SIMD_LEN) sources.
    for (; j < ns; j++) {
      double x = tx[i] - sx[j];
      double y = ty[i] - sy[j];
      double z = tz[i] - sz[j];
      double r2 = x*x + y*y + z*z;
      double r = sqrt(r2);
      double invdr;
      if (r == 0)
        invdr = 0;
      else
        invdr = 1/r;
      double dot = (x*srcDen[j*3] + y*srcDen[j*3+1] + z*srcDen[j*3+2]) * invdr * invdr * invdr;

      trgVal[i] += dot*OOFP;
    }
  }
  return;
}
  1213. void stokesStressSSE(
  1214. const int ns,
  1215. const int nt,
  1216. const double *sx,
  1217. const double *sy,
  1218. const double *sz,
  1219. const double *tx,
  1220. const double *ty,
  1221. const double *tz,
  1222. const double *srcDen,
  1223. double *trgVal)
  1224. {
  1225. if ( size_t(sx)%IDEAL_ALIGNMENT || size_t(sy)%IDEAL_ALIGNMENT || size_t(sz)%IDEAL_ALIGNMENT )
  1226. abort();
  1227. double TOFP = -3.0/(4.0*const_pi<double>());
  1228. __m128d tempxx; __m128d tempxy; __m128d tempxz;
  1229. __m128d tempyx; __m128d tempyy; __m128d tempyz;
  1230. __m128d tempzx; __m128d tempzy; __m128d tempzz;
  1231. double aux_arr[9*SIMD_LEN+1];
  1232. double *tempvalxx, *tempvalxy, *tempvalxz;
  1233. double *tempvalyx, *tempvalyy, *tempvalyz;
  1234. double *tempvalzx, *tempvalzy, *tempvalzz;
  1235. if (size_t(aux_arr)%IDEAL_ALIGNMENT) // if aux_arr is misaligned
  1236. {
  1237. tempvalxx = aux_arr + 1;
  1238. if (size_t(tempvalxx)%IDEAL_ALIGNMENT)
  1239. abort();
  1240. }
  1241. else
  1242. tempvalxx = aux_arr;
  1243. tempvalxy=tempvalxx+SIMD_LEN;
  1244. tempvalxz=tempvalxy+SIMD_LEN;
  1245. tempvalyx=tempvalxz+SIMD_LEN;
  1246. tempvalyy=tempvalyx+SIMD_LEN;
  1247. tempvalyz=tempvalyy+SIMD_LEN;
  1248. tempvalzx=tempvalyz+SIMD_LEN;
  1249. tempvalzy=tempvalzx+SIMD_LEN;
  1250. tempvalzz=tempvalzy+SIMD_LEN;
  1251. /*! One over eight pi */
  1252. __m128d tofp = _mm_set1_pd (TOFP);
  1253. __m128d half = _mm_set1_pd (0.5);
  1254. __m128d opf = _mm_set1_pd (1.5);
  1255. __m128d zero = _mm_setzero_pd ();
  1256. // loop over sources
  1257. int i = 0;
  1258. for (; i < nt; i++) {
  1259. tempxx = _mm_setzero_pd(); tempxy = _mm_setzero_pd(); tempxz = _mm_setzero_pd();
  1260. tempyx = _mm_setzero_pd(); tempyy = _mm_setzero_pd(); tempyz = _mm_setzero_pd();
  1261. tempzx = _mm_setzero_pd(); tempzy = _mm_setzero_pd(); tempzz = _mm_setzero_pd();
  1262. __m128d txi = _mm_load1_pd (&tx[i]);
  1263. __m128d tyi = _mm_load1_pd (&ty[i]);
  1264. __m128d tzi = _mm_load1_pd (&tz[i]);
  1265. int j = 0;
  1266. // Load and calculate in groups of SIMD_LEN
  1267. for (; j + SIMD_LEN <= ns; j+=SIMD_LEN) {
  1268. __m128d sxj = _mm_load_pd (&sx[j]);
  1269. __m128d syj = _mm_load_pd (&sy[j]);
  1270. __m128d szj = _mm_load_pd (&sz[j]);
  1271. __m128d sdenx = _mm_set_pd (srcDen[(j+1)*3], srcDen[j*3]);
  1272. __m128d sdeny = _mm_set_pd (srcDen[(j+1)*3+1], srcDen[j*3+1]);
  1273. __m128d sdenz = _mm_set_pd (srcDen[(j+1)*3+2], srcDen[j*3+2]);
  1274. __m128d dX, dY, dZ;
  1275. __m128d dR2;
  1276. __m128d S;
  1277. __m128d S2;
  1278. dX = _mm_sub_pd(txi , sxj);
  1279. dY = _mm_sub_pd(tyi , syj);
  1280. dZ = _mm_sub_pd(tzi , szj);
  1281. sxj = _mm_mul_pd(dX, dX);
  1282. syj = _mm_mul_pd(dY, dY);
  1283. szj = _mm_mul_pd(dZ, dZ);
  1284. dR2 = _mm_add_pd(sxj, syj);
  1285. dR2 = _mm_add_pd(szj, dR2);
  1286. __m128d temp = _mm_cmpeq_pd (dR2, zero);
  1287. __m128d xhalf = _mm_mul_pd (half, dR2);
  1288. __m128 dR2_s = _mm_cvtpd_ps(dR2);
  1289. __m128 S_s = _mm_rsqrt_ps(dR2_s);
  1290. __m128d S_d = _mm_cvtps_pd(S_s);
  1291. // To handle the condition when src and trg coincide
  1292. S_d = _mm_andnot_pd (temp, S_d);
  1293. S = _mm_mul_pd (S_d, S_d);
  1294. S = _mm_mul_pd (S, xhalf);
  1295. S = _mm_sub_pd (opf, S);
  1296. S = _mm_mul_pd (S, S_d);
  1297. S2 = _mm_mul_pd (S, S);
  1298. __m128d dotx = _mm_mul_pd (dX, sdenx);
  1299. __m128d doty = _mm_mul_pd (dY, sdeny);
  1300. __m128d dotz = _mm_mul_pd (dZ, sdenz);
  1301. __m128d dot_sum = _mm_add_pd (dotx, doty);
  1302. dot_sum = _mm_add_pd (dot_sum, dotz);
  1303. dot_sum = _mm_mul_pd (dot_sum, S);
  1304. dot_sum = _mm_mul_pd (dot_sum, S2);
  1305. dot_sum = _mm_mul_pd (dot_sum, S2);
      dotx = _mm_mul_pd (dot_sum, dX);
      doty = _mm_mul_pd (dot_sum, dY);
      dotz = _mm_mul_pd (dot_sum, dZ);
      tempxx = _mm_add_pd (_mm_mul_pd(dotx,dX), tempxx);
      tempxy = _mm_add_pd (_mm_mul_pd(dotx,dY), tempxy);
      tempxz = _mm_add_pd (_mm_mul_pd(dotx,dZ), tempxz);
      tempyx = _mm_add_pd (_mm_mul_pd(doty,dX), tempyx);
      tempyy = _mm_add_pd (_mm_mul_pd(doty,dY), tempyy);
      tempyz = _mm_add_pd (_mm_mul_pd(doty,dZ), tempyz);
      tempzx = _mm_add_pd (_mm_mul_pd(dotz,dX), tempzx);
      tempzy = _mm_add_pd (_mm_mul_pd(dotz,dY), tempzy);
      tempzz = _mm_add_pd (_mm_mul_pd(dotz,dZ), tempzz);
    }
    tempxx = _mm_mul_pd (tempxx, tofp);
    tempxy = _mm_mul_pd (tempxy, tofp);
    tempxz = _mm_mul_pd (tempxz, tofp);
    tempyx = _mm_mul_pd (tempyx, tofp);
    tempyy = _mm_mul_pd (tempyy, tofp);
    tempyz = _mm_mul_pd (tempyz, tofp);
    tempzx = _mm_mul_pd (tempzx, tofp);
    tempzy = _mm_mul_pd (tempzy, tofp);
    tempzz = _mm_mul_pd (tempzz, tofp);
    _mm_store_pd(tempvalxx, tempxx); _mm_store_pd(tempvalxy, tempxy); _mm_store_pd(tempvalxz, tempxz);
    _mm_store_pd(tempvalyx, tempyx); _mm_store_pd(tempvalyy, tempyy); _mm_store_pd(tempvalyz, tempyz);
    _mm_store_pd(tempvalzx, tempzx); _mm_store_pd(tempvalzy, tempzy); _mm_store_pd(tempvalzz, tempzz);
    for (int k = 0; k < SIMD_LEN; k++) {
      trgVal[i*9  ] += tempvalxx[k];
      trgVal[i*9+1] += tempvalxy[k];
      trgVal[i*9+2] += tempvalxz[k];
      trgVal[i*9+3] += tempvalyx[k];
      trgVal[i*9+4] += tempvalyy[k];
      trgVal[i*9+5] += tempvalyz[k];
      trgVal[i*9+6] += tempvalzx[k];
      trgVal[i*9+7] += tempvalzy[k];
      trgVal[i*9+8] += tempvalzz[k];
    }
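    // Scalar remainder loop for the last (ns % SIMD_LEN) sources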
    for (; j < ns; j++) {
      double x = tx[i] - sx[j];
      double y = ty[i] - sy[j];
      double z = tz[i] - sz[j];
      double r2 = x*x + y*y + z*z;
      double r = sqrt(r2);
      double invdr;
      if (r == 0)
        invdr = 0;
      else
        invdr = 1/r;
      double invdr2=invdr*invdr;
      double dot = (x*srcDen[j*3] + y*srcDen[j*3+1] + z*srcDen[j*3+2]) * invdr2 * invdr2 * invdr;
      double denx = dot*x;
      double deny = dot*y;
      double denz = dot*z;
      trgVal[i*9  ] += denx*x*TOFP;
      trgVal[i*9+1] += denx*y*TOFP;
      trgVal[i*9+2] += denx*z*TOFP;
      trgVal[i*9+3] += deny*x*TOFP;
      trgVal[i*9+4] += deny*y*TOFP;
      trgVal[i*9+5] += deny*z*TOFP;
      trgVal[i*9+6] += denz*x*TOFP;
      trgVal[i*9+7] += denz*y*TOFP;
      trgVal[i*9+8] += denz*z*TOFP;
    }
  }
  return;
}
void stokesGradSSE(
    const int ns,
    const int nt,
    const double *sx,
    const double *sy,
    const double *sz,
    const double *tx,
    const double *ty,
    const double *tz,
    const double *srcDen,
    double *trgVal,
    const double cof )
{
  if ( size_t(sx)%IDEAL_ALIGNMENT || size_t(sy)%IDEAL_ALIGNMENT || size_t(sz)%IDEAL_ALIGNMENT )
    abort();
  double mu = cof;
  double OOEP = 1.0/(8.0*const_pi<double>());
  __m128d tempxx; __m128d tempxy; __m128d tempxz;
  __m128d tempyx; __m128d tempyy; __m128d tempyz;
  __m128d tempzx; __m128d tempzy; __m128d tempzz;
  double oomeu = 1/mu; // one over mu, used in the scalar remainder loop
  double aux_arr[9*SIMD_LEN+1];
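  // One spare double so that either aux_arr or aux_arr+1 is
  // IDEAL_ALIGNMENT-aligned for the _mm_store_pd calls below.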
  double *tempvalxx, *tempvalxy, *tempvalxz;
  double *tempvalyx, *tempvalyy, *tempvalyz;
  double *tempvalzx, *tempvalzy, *tempvalzz;
  if (size_t(aux_arr)%IDEAL_ALIGNMENT) // if aux_arr is misaligned
  {
    tempvalxx = aux_arr + 1;
    if (size_t(tempvalxx)%IDEAL_ALIGNMENT)
      abort();
  }
  else
    tempvalxx = aux_arr;
  tempvalxy=tempvalxx+SIMD_LEN;
  tempvalxz=tempvalxy+SIMD_LEN;
  tempvalyx=tempvalxz+SIMD_LEN;
  tempvalyy=tempvalyx+SIMD_LEN;
  tempvalyz=tempvalyy+SIMD_LEN;
  tempvalzx=tempvalyz+SIMD_LEN;
  tempvalzy=tempvalzx+SIMD_LEN;
  tempvalzz=tempvalzy+SIMD_LEN;
  /*! One over eight pi */
  __m128d ooep = _mm_set1_pd (OOEP);
  __m128d half = _mm_set1_pd (0.5);
  __m128d opf = _mm_set1_pd (1.5);
  __m128d three = _mm_set1_pd (3.0);
  __m128d zero = _mm_setzero_pd ();
  __m128d oomu = _mm_set1_pd (1/mu);
  __m128d ooepmu = _mm_mul_pd(ooep,oomu);
  // loop over targets
  int i = 0;
  for (; i < nt; i++) {
    tempxx = _mm_setzero_pd(); tempxy = _mm_setzero_pd(); tempxz = _mm_setzero_pd();
    tempyx = _mm_setzero_pd(); tempyy = _mm_setzero_pd(); tempyz = _mm_setzero_pd();
    tempzx = _mm_setzero_pd(); tempzy = _mm_setzero_pd(); tempzz = _mm_setzero_pd();
    __m128d txi = _mm_load1_pd (&tx[i]);
    __m128d tyi = _mm_load1_pd (&ty[i]);
    __m128d tzi = _mm_load1_pd (&tz[i]);
    int j = 0;
    // Load and calculate in groups of SIMD_LEN
    for (; j + SIMD_LEN <= ns; j+=SIMD_LEN) {
      __m128d sxj = _mm_load_pd (&sx[j]);
      __m128d syj = _mm_load_pd (&sy[j]);
      __m128d szj = _mm_load_pd (&sz[j]);
      __m128d sdenx = _mm_set_pd (srcDen[(j+1)*3], srcDen[j*3]);
      __m128d sdeny = _mm_set_pd (srcDen[(j+1)*3+1], srcDen[j*3+1]);
      __m128d sdenz = _mm_set_pd (srcDen[(j+1)*3+2], srcDen[j*3+2]);
      __m128d dX, dY, dZ;
      __m128d dR2;
      __m128d S;
      __m128d S2;
      __m128d S3;
      dX = _mm_sub_pd(txi , sxj);
      dY = _mm_sub_pd(tyi , syj);
      dZ = _mm_sub_pd(tzi , szj);
      sxj = _mm_mul_pd(dX, dX);
      syj = _mm_mul_pd(dY, dY);
      szj = _mm_mul_pd(dZ, dZ);
      dR2 = _mm_add_pd(sxj, syj);
      dR2 = _mm_add_pd(szj, dR2);
      __m128d temp = _mm_cmpeq_pd (dR2, zero);
      __m128d xhalf = _mm_mul_pd (half, dR2);
      __m128 dR2_s = _mm_cvtpd_ps(dR2);
      __m128 S_s = _mm_rsqrt_ps(dR2_s);
      __m128d S_d = _mm_cvtps_pd(S_s);
      // Zero the inverse distance where src and trg coincide (dR2 == 0)
      S_d = _mm_andnot_pd (temp, S_d);
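      // Same Newton-Raphson rsqrt refinement as above:
      //   S ~= 1/r, S2 ~= 1/r^2, S3 ~= 1/r^3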
      S = _mm_mul_pd (S_d, S_d);
      S = _mm_mul_pd (S, xhalf);
      S = _mm_sub_pd (opf, S);
      S = _mm_mul_pd (S, S_d);
      S2 = _mm_mul_pd (S, S);
      S3 = _mm_mul_pd (S2, S);
      __m128d dotx = _mm_mul_pd (dX, sdenx);
      __m128d doty = _mm_mul_pd (dY, sdeny);
      __m128d dotz = _mm_mul_pd (dZ, sdenz);
      __m128d dot_sum = _mm_add_pd (dotx, doty);
      dot_sum = _mm_add_pd (dot_sum, dotz);
      dot_sum = _mm_mul_pd (dot_sum, S2);
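      // dot_sum now holds (dR . f) / r^2; each accumulator below mirrors the
      // corresponding scalar expression in the remainder loop.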
      tempxx = _mm_add_pd(_mm_mul_pd(S3,_mm_add_pd(_mm_sub_pd(_mm_mul_pd(dX, sdenx), _mm_mul_pd(sdenx, dX)), _mm_mul_pd(dot_sum, _mm_sub_pd(dR2 , _mm_mul_pd(three, _mm_mul_pd(dX, dX)))))),tempxx);
      tempxy = _mm_add_pd(_mm_mul_pd(S3,_mm_add_pd(_mm_sub_pd(_mm_mul_pd(dY, sdenx), _mm_mul_pd(sdeny, dX)), _mm_mul_pd(dot_sum, _mm_sub_pd(zero, _mm_mul_pd(three, _mm_mul_pd(dY, dX)))))),tempxy);
      tempxz = _mm_add_pd(_mm_mul_pd(S3,_mm_add_pd(_mm_sub_pd(_mm_mul_pd(dZ, sdenx), _mm_mul_pd(sdenz, dX)), _mm_mul_pd(dot_sum, _mm_sub_pd(zero, _mm_mul_pd(three, _mm_mul_pd(dZ, dX)))))),tempxz);
      tempyx = _mm_add_pd(_mm_mul_pd(S3,_mm_add_pd(_mm_sub_pd(_mm_mul_pd(dX, sdeny), _mm_mul_pd(sdenx, dY)), _mm_mul_pd(dot_sum, _mm_sub_pd(zero, _mm_mul_pd(three, _mm_mul_pd(dX, dY)))))),tempyx);
      tempyy = _mm_add_pd(_mm_mul_pd(S3,_mm_add_pd(_mm_sub_pd(_mm_mul_pd(dY, sdeny), _mm_mul_pd(sdeny, dY)), _mm_mul_pd(dot_sum, _mm_sub_pd(dR2 , _mm_mul_pd(three, _mm_mul_pd(dY, dY)))))),tempyy);
      tempyz = _mm_add_pd(_mm_mul_pd(S3,_mm_add_pd(_mm_sub_pd(_mm_mul_pd(dZ, sdeny), _mm_mul_pd(sdenz, dY)), _mm_mul_pd(dot_sum, _mm_sub_pd(zero, _mm_mul_pd(three, _mm_mul_pd(dZ, dY)))))),tempyz);
      tempzx = _mm_add_pd(_mm_mul_pd(S3,_mm_add_pd(_mm_sub_pd(_mm_mul_pd(dX, sdenz), _mm_mul_pd(sdenx, dZ)), _mm_mul_pd(dot_sum, _mm_sub_pd(zero, _mm_mul_pd(three, _mm_mul_pd(dX, dZ)))))),tempzx);
      tempzy = _mm_add_pd(_mm_mul_pd(S3,_mm_add_pd(_mm_sub_pd(_mm_mul_pd(dY, sdenz), _mm_mul_pd(sdeny, dZ)), _mm_mul_pd(dot_sum, _mm_sub_pd(zero, _mm_mul_pd(three, _mm_mul_pd(dY, dZ)))))),tempzy);
      tempzz = _mm_add_pd(_mm_mul_pd(S3,_mm_add_pd(_mm_sub_pd(_mm_mul_pd(dZ, sdenz), _mm_mul_pd(sdenz, dZ)), _mm_mul_pd(dot_sum, _mm_sub_pd(dR2 , _mm_mul_pd(three, _mm_mul_pd(dZ, dZ)))))),tempzz);
    }
    tempxx = _mm_mul_pd (tempxx, ooepmu);
    tempxy = _mm_mul_pd (tempxy, ooepmu);
    tempxz = _mm_mul_pd (tempxz, ooepmu);
    tempyx = _mm_mul_pd (tempyx, ooepmu);
    tempyy = _mm_mul_pd (tempyy, ooepmu);
    tempyz = _mm_mul_pd (tempyz, ooepmu);
    tempzx = _mm_mul_pd (tempzx, ooepmu);
    tempzy = _mm_mul_pd (tempzy, ooepmu);
    tempzz = _mm_mul_pd (tempzz, ooepmu);
    _mm_store_pd(tempvalxx, tempxx); _mm_store_pd(tempvalxy, tempxy); _mm_store_pd(tempvalxz, tempxz);
    _mm_store_pd(tempvalyx, tempyx); _mm_store_pd(tempvalyy, tempyy); _mm_store_pd(tempvalyz, tempyz);
    _mm_store_pd(tempvalzx, tempzx); _mm_store_pd(tempvalzy, tempzy); _mm_store_pd(tempvalzz, tempzz);
    for (int k = 0; k < SIMD_LEN; k++) {
      trgVal[i*9  ] += tempvalxx[k];
      trgVal[i*9+1] += tempvalxy[k];
      trgVal[i*9+2] += tempvalxz[k];
      trgVal[i*9+3] += tempvalyx[k];
      trgVal[i*9+4] += tempvalyy[k];
      trgVal[i*9+5] += tempvalyz[k];
      trgVal[i*9+6] += tempvalzx[k];
      trgVal[i*9+7] += tempvalzy[k];
      trgVal[i*9+8] += tempvalzz[k];
    }
    for (; j < ns; j++) {
      double x = tx[i] - sx[j];
      double y = ty[i] - sy[j];
      double z = tz[i] - sz[j];
      double r2 = x*x + y*y + z*z;
      double r = sqrt(r2);
      double invdr;
      if (r == 0)
        invdr = 0;
      else
        invdr = 1/r;
      double invdr2=invdr*invdr;
      double invdr3=invdr2*invdr;
      double dot = (x*srcDen[j*3] + y*srcDen[j*3+1] + z*srcDen[j*3+2]);
      trgVal[i*9  ] += OOEP*oomeu*invdr3*( x*srcDen[j*3  ] - srcDen[j*3  ]*x + dot*(1-3*x*x*invdr2) );
      trgVal[i*9+1] += OOEP*oomeu*invdr3*( y*srcDen[j*3  ] - srcDen[j*3+1]*x + dot*(0-3*y*x*invdr2) );
      trgVal[i*9+2] += OOEP*oomeu*invdr3*( z*srcDen[j*3  ] - srcDen[j*3+2]*x + dot*(0-3*z*x*invdr2) );
      trgVal[i*9+3] += OOEP*oomeu*invdr3*( x*srcDen[j*3+1] - srcDen[j*3  ]*y + dot*(0-3*x*y*invdr2) );
      trgVal[i*9+4] += OOEP*oomeu*invdr3*( y*srcDen[j*3+1] - srcDen[j*3+1]*y + dot*(1-3*y*y*invdr2) );
      trgVal[i*9+5] += OOEP*oomeu*invdr3*( z*srcDen[j*3+1] - srcDen[j*3+2]*y + dot*(0-3*z*y*invdr2) );
      trgVal[i*9+6] += OOEP*oomeu*invdr3*( x*srcDen[j*3+2] - srcDen[j*3  ]*z + dot*(0-3*x*z*invdr2) );
      trgVal[i*9+7] += OOEP*oomeu*invdr3*( y*srcDen[j*3+2] - srcDen[j*3+1]*z + dot*(0-3*y*z*invdr2) );
      trgVal[i*9+8] += OOEP*oomeu*invdr3*( z*srcDen[j*3+2] - srcDen[j*3+2]*z + dot*(1-3*z*z*invdr2) );
    }
  }
  return;
}
#undef SIMD_LEN

#define X(s,k) (s)[(k)*COORD_DIM]
#define Y(s,k) (s)[(k)*COORD_DIM+1]
#define Z(s,k) (s)[(k)*COORD_DIM+2]
void stokesDirectSSEShuffle(const int ns, const int nt, double const src[], double const trg[], double const den[], double pot[], const double kernel_coef, mem::MemoryManager* mem_mgr=NULL)
{
  std::vector<double> xs(ns+1); std::vector<double> xt(nt);
  std::vector<double> ys(ns+1); std::vector<double> yt(nt);
  std::vector<double> zs(ns+1); std::vector<double> zt(nt);
  int x_shift = size_t(&xs[0]) % IDEAL_ALIGNMENT ? 1:0;
  int y_shift = size_t(&ys[0]) % IDEAL_ALIGNMENT ? 1:0;
  int z_shift = size_t(&zs[0]) % IDEAL_ALIGNMENT ? 1:0;
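  // Each source vector has one spare element (ns+1): if std::vector's buffer
  // is not IDEAL_ALIGNMENT-aligned, start writing at index 1 so the pointer
  // handed to the SSE kernel is aligned.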
  //1. reshuffle memory
  for (int k=0;k<ns;k++){
    xs[k+x_shift]=X(src,k);
    ys[k+y_shift]=Y(src,k);
    zs[k+z_shift]=Z(src,k);
  }
  for (int k=0;k<nt;k++){
    xt[k]=X(trg,k);
    yt[k]=Y(trg,k);
    zt[k]=Z(trg,k);
  }
  //2. perform calculation
  stokesDirectVecSSE(ns,nt,&xs[x_shift],&ys[y_shift],&zs[z_shift],&xt[0],&yt[0],&zt[0],den,pot,kernel_coef);
  return;
}
void stokesPressureSSEShuffle(const int ns, const int nt, double const src[], double const trg[], double const den[], double pot[], mem::MemoryManager* mem_mgr=NULL)
{
  std::vector<double> xs(ns+1); std::vector<double> xt(nt);
  std::vector<double> ys(ns+1); std::vector<double> yt(nt);
  std::vector<double> zs(ns+1); std::vector<double> zt(nt);
  int x_shift = size_t(&xs[0]) % IDEAL_ALIGNMENT ? 1:0;
  int y_shift = size_t(&ys[0]) % IDEAL_ALIGNMENT ? 1:0;
  int z_shift = size_t(&zs[0]) % IDEAL_ALIGNMENT ? 1:0;
  //1. reshuffle memory
  for (int k=0;k<ns;k++){
    xs[k+x_shift]=X(src,k);
    ys[k+y_shift]=Y(src,k);
    zs[k+z_shift]=Z(src,k);
  }
  for (int k=0;k<nt;k++){
    xt[k]=X(trg,k);
    yt[k]=Y(trg,k);
    zt[k]=Z(trg,k);
  }
  //2. perform calculation
  stokesPressureSSE(ns,nt,&xs[x_shift],&ys[y_shift],&zs[z_shift],&xt[0],&yt[0],&zt[0],den,pot);
  return;
}
void stokesStressSSEShuffle(const int ns, const int nt, double const src[], double const trg[], double const den[], double pot[], mem::MemoryManager* mem_mgr=NULL)
{
  std::vector<double> xs(ns+1); std::vector<double> xt(nt);
  std::vector<double> ys(ns+1); std::vector<double> yt(nt);
  std::vector<double> zs(ns+1); std::vector<double> zt(nt);
  int x_shift = size_t(&xs[0]) % IDEAL_ALIGNMENT ? 1:0;
  int y_shift = size_t(&ys[0]) % IDEAL_ALIGNMENT ? 1:0;
  int z_shift = size_t(&zs[0]) % IDEAL_ALIGNMENT ? 1:0;
  //1. reshuffle memory
  for (int k=0;k<ns;k++){
    xs[k+x_shift]=X(src,k);
    ys[k+y_shift]=Y(src,k);
    zs[k+z_shift]=Z(src,k);
  }
  for (int k=0;k<nt;k++){
    xt[k]=X(trg,k);
    yt[k]=Y(trg,k);
    zt[k]=Z(trg,k);
  }
  //2. perform calculation
  stokesStressSSE(ns,nt,&xs[x_shift],&ys[y_shift],&zs[z_shift],&xt[0],&yt[0],&zt[0],den,pot);
  return;
}
void stokesGradSSEShuffle(const int ns, const int nt, double const src[], double const trg[], double const den[], double pot[], const double kernel_coef, mem::MemoryManager* mem_mgr=NULL)
{
  std::vector<double> xs(ns+1); std::vector<double> xt(nt);
  std::vector<double> ys(ns+1); std::vector<double> yt(nt);
  std::vector<double> zs(ns+1); std::vector<double> zt(nt);
  int x_shift = size_t(&xs[0]) % IDEAL_ALIGNMENT ? 1:0;
  int y_shift = size_t(&ys[0]) % IDEAL_ALIGNMENT ? 1:0;
  int z_shift = size_t(&zs[0]) % IDEAL_ALIGNMENT ? 1:0;
  //1. reshuffle memory
  for (int k=0;k<ns;k++){
    xs[k+x_shift]=X(src,k);
    ys[k+y_shift]=Y(src,k);
    zs[k+z_shift]=Z(src,k);
  }
  for (int k=0;k<nt;k++){
    xt[k]=X(trg,k);
    yt[k]=Y(trg,k);
    zt[k]=Z(trg,k);
  }
  //2. perform calculation
  stokesGradSSE(ns,nt,&xs[x_shift],&ys[y_shift],&zs[z_shift],&xt[0],&yt[0],&zt[0],den,pot,kernel_coef);
  return;
}
#undef X
#undef Y
#undef Z
#undef IDEAL_ALIGNMENT
#undef DECL_SIMD_ALIGNED
}
template <>
void stokes_vel<double>(double* r_src, int src_cnt, double* v_src_, int dof, double* r_trg, int trg_cnt, double* k_out, mem::MemoryManager* mem_mgr){
  Profile::Add_FLOP((long long)trg_cnt*(long long)src_cnt*(28*dof));
  const double mu=1.0;
  stokesDirectSSEShuffle(src_cnt, trg_cnt, r_src, r_trg, v_src_, k_out, mu, mem_mgr);
}
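/* A minimal usage sketch for these double specializations (illustrative only;
 * the xyz-interleaved layout is inferred from the X/Y/Z macros above, and
 * k_out must be zero-initialized because the kernels accumulate with +=):
 *
 *   std::vector<double> r_src(3*src_cnt), v_src(3*src_cnt);
 *   std::vector<double> r_trg(3*trg_cnt), k_out(3*trg_cnt, 0.0);
 *   stokes_vel<double>(&r_src[0], src_cnt, &v_src[0], 1,
 *                      &r_trg[0], trg_cnt, &k_out[0], NULL);
 */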
template <>
void stokes_press<double>(double* r_src, int src_cnt, double* v_src_, int dof, double* r_trg, int trg_cnt, double* k_out, mem::MemoryManager* mem_mgr){
  Profile::Add_FLOP((long long)trg_cnt*(long long)src_cnt*(17*dof));
  stokesPressureSSEShuffle(src_cnt, trg_cnt, r_src, r_trg, v_src_, k_out, mem_mgr);
  return;
}
template <>
void stokes_stress<double>(double* r_src, int src_cnt, double* v_src_, int dof, double* r_trg, int trg_cnt, double* k_out, mem::MemoryManager* mem_mgr){
  Profile::Add_FLOP((long long)trg_cnt*(long long)src_cnt*(45*dof));
  stokesStressSSEShuffle(src_cnt, trg_cnt, r_src, r_trg, v_src_, k_out, mem_mgr);
}
template <>
void stokes_grad<double>(double* r_src, int src_cnt, double* v_src_, int dof, double* r_trg, int trg_cnt, double* k_out, mem::MemoryManager* mem_mgr){
  Profile::Add_FLOP((long long)trg_cnt*(long long)src_cnt*(89*dof));
  const double mu=1.0;
  stokesGradSSEShuffle(src_cnt, trg_cnt, r_src, r_trg, v_src_, k_out, mu, mem_mgr);
}
#endif
#endif

////////////////////////////////////////////////////////////////////////////////
////////                     BIOT-SAVART KERNEL                         ////////
////////////////////////////////////////////////////////////////////////////////
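/**
 * \brief Biot-Savart kernel. As implemented below,
 * u(t) = 1/(4 pi) sum_s v_s x (t - s) / |t - s|^3.
 * Kernel tensor dimension = 3x3.
 */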
template <class T>
void biot_savart(T* r_src, int src_cnt, T* v_src_, int dof, T* r_trg, int trg_cnt, T* k_out, mem::MemoryManager* mem_mgr){
#ifndef __MIC__
  Profile::Add_FLOP((long long)trg_cnt*(long long)src_cnt*(26*dof));
#endif
  const T OOFP = -1.0/(4.0*const_pi<T>());
  for(int t=0;t<trg_cnt;t++){
    for(int i=0;i<dof;i++){
      T p[3]={0,0,0};
      for(int s=0;s<src_cnt;s++){
        T dR[3]={r_trg[3*t  ]-r_src[3*s  ],
                 r_trg[3*t+1]-r_src[3*s+1],
                 r_trg[3*t+2]-r_src[3*s+2]};
        T R2 = (dR[0]*dR[0]+dR[1]*dR[1]+dR[2]*dR[2]);
        if (R2!=0){
          T invR2=1.0/R2;
          T invR=sqrt(invR2);
          T invR3=invR*invR2;
          T v_src[3]={v_src_[(s*dof+i)*3  ],
                      v_src_[(s*dof+i)*3+1],
                      v_src_[(s*dof+i)*3+2]};
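          // p -= (v_src x dR)/|dR|^3; with OOFP = -1/(4 pi) applied below,
          // the net contribution per source is +(v_src x dR)/(4 pi |dR|^3)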
          p[0] -= (v_src[1]*dR[2]-v_src[2]*dR[1])*invR3;
          p[1] -= (v_src[2]*dR[0]-v_src[0]*dR[2])*invR3;
          p[2] -= (v_src[0]*dR[1]-v_src[1]*dR[0])*invR3;
        }
      }
      k_out[(t*dof+i)*3+0] += p[0]*OOFP;
      k_out[(t*dof+i)*3+1] += p[1]*OOFP;
      k_out[(t*dof+i)*3+2] += p[2]*OOFP;
    }
  }
}
////////////////////////////////////////////////////////////////////////////////
////////                      HELMHOLTZ KERNEL                          ////////
////////////////////////////////////////////////////////////////////////////////

/**
 * \brief Green's function for the Helmholtz equation. Kernel tensor
 * dimension = 2x2 (real and imaginary parts).
 */
template <class T>
void helmholtz_poten(T* r_src, int src_cnt, T* v_src, int dof, T* r_trg, int trg_cnt, T* k_out, mem::MemoryManager* mem_mgr){
#ifndef __MIC__
  Profile::Add_FLOP((long long)trg_cnt*(long long)src_cnt*(24*dof));
#endif
  const T mu = (20.0*const_pi<T>()); // hardcoded wavenumber
  for(int t=0;t<trg_cnt;t++){
    for(int i=0;i<dof;i++){
      T p[2]={0,0};
      for(int s=0;s<src_cnt;s++){
        T dX_reg=r_trg[3*t  ]-r_src[3*s  ];
        T dY_reg=r_trg[3*t+1]-r_src[3*s+1];
        T dZ_reg=r_trg[3*t+2]-r_src[3*s+2];
        T R = (dX_reg*dX_reg+dY_reg*dY_reg+dZ_reg*dZ_reg);
        if (R!=0){
          R = sqrt(R);
          T invR=1.0/R;
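          // G = e^{i mu R}/R = (cos(mu R) + i sin(mu R))/R, stored as {Re, Im};
          // the accumulation below is a complex multiply-accumulate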
          T G[2]={cos(mu*R)*invR, sin(mu*R)*invR};
          p[0] += v_src[(s*dof+i)*2+0]*G[0] - v_src[(s*dof+i)*2+1]*G[1];
          p[1] += v_src[(s*dof+i)*2+0]*G[1] + v_src[(s*dof+i)*2+1]*G[0];
        }
      }
      k_out[(t*dof+i)*2+0] += p[0];
      k_out[(t*dof+i)*2+1] += p[1];
    }
  }
}
template <class T>
void helmholtz_grad(T* r_src, int src_cnt, T* v_src, int dof, T* r_trg, int trg_cnt, T* k_out, mem::MemoryManager* mem_mgr){
  //TODO Implement this.
}

}//end namespace