kernel.txx 82 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
7227822792280228122822283228422852286228722882289229022912292229322942295229622972298229923002301230223032304230523062307230823092310231123122313231423152316231723182319232023212322232323242325232623272328232923302331233223332334233523362337233823392340234123422343234423452346234723482349235023512352235323542355235623572358235923602361236223632364236523662367236823692370237123722373237423752376237723782379238023812382238323842385238623872388238923902391239223932394239523962397239823992400240124022403240424052406240724082409241024112412241324142415
  1. /**
  2. * \file kernel.txx
  3. * \author Dhairya Malhotra, dhairya.malhotra@gmail.com
  4. * \date 12-20-2011
  5. * \brief This file contains the implementation of the struct Kernel and also the
  6. * implementation of various kernels for FMM.
  7. */
#include <algorithm>
#include <cmath>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <vector>

#include <mem_mgr.hpp>
#include <profile.hpp>
#include <vector.hpp>
#include <matrix.hpp>
#include <precomp_mat.hpp>
#include <intrin_wrapper.hpp>
  17. namespace pvfmm{
  18. /**
  19. * \brief Constructor.
  20. */
  21. template <class T>
  22. Kernel<T>::Kernel(Ker_t poten, Ker_t dbl_poten, const char* name, int dim_, std::pair<int,int> k_dim,
  23. size_t dev_poten, size_t dev_dbl_poten){
  24. dim=dim_;
  25. ker_dim[0]=k_dim.first;
  26. ker_dim[1]=k_dim.second;
  27. ker_poten=poten;
  28. dbl_layer_poten=dbl_poten;
  29. ker_name=std::string(name);
  30. dev_ker_poten=dev_poten;
  31. dev_dbl_layer_poten=dev_dbl_poten;
  32. k_s2m=NULL;
  33. k_s2l=NULL;
  34. k_s2t=NULL;
  35. k_m2m=NULL;
  36. k_m2l=NULL;
  37. k_m2t=NULL;
  38. k_l2l=NULL;
  39. k_l2t=NULL;
  40. scale_invar=true;
  41. src_scal.Resize(ker_dim[0]); src_scal.SetZero();
  42. trg_scal.Resize(ker_dim[1]); trg_scal.SetZero();
  43. perm_vec.Resize(Perm_Count);
  44. for(size_t p_type=0;p_type<C_Perm;p_type++){
  45. perm_vec[p_type ]=Permutation<T>(ker_dim[0]);
  46. perm_vec[p_type+C_Perm]=Permutation<T>(ker_dim[1]);
  47. }
  48. init=false;
  49. }
/**
 * \brief Initialize the kernel (lazy; runs at most once per object).
 * Empirically analyzes the kernel function by sampling it at random points:
 * (1) determines whether the kernel is scale-invariant and, if so, computes
 *     per-component source/target scaling exponents (src_scal, trg_scal);
 * (2) determines the kernel's symmetry permutations under coordinate
 *     reflections and axis swaps (perm_vec);
 * (3) if `verbose`, prints kernel diagnostics including the accuracy of the
 *     equivalent-surface (multipole) approximation;
 * (4) defaults any unset auxiliary FMM kernels to `this`, sanity-checks
 *     their dimensions, and initializes them recursively.
 * \param[in] verbose Print kernel information and accuracy diagnostics.
 */
template <class T>
void Kernel<T>::Initialize(bool verbose) const{
  if(init) return;
  init=true;
  // Compute machine epsilon for type T by repeated halving.
  T eps=1.0;
  while(eps+(T)1.0>1.0) eps*=0.5;
  T scal=1.0;
  if(ker_dim[0]*ker_dim[1]>0){ // Determine scaling
    M_scal[i0][i1] will hold the log2 scaling exponent of kernel
    // component (i0,i1); -1 marks components too small to measure.
    Matrix<T> M_scal(ker_dim[0],ker_dim[1]);
    size_t N=1024;
    T eps_=N*eps; // tolerance scaled by the sample count
    T src_coord[3]={0,0,0};
    std::vector<T> trg_coord1(N*COORD_DIM);
    Matrix<T> M1(N,ker_dim[0]*ker_dim[1]);
    // Shrink `scal` until the sampled kernel values are no longer
    // negligibly small (guards against kernels that vanish near r=1).
    while(true){
      T abs_sum=0;
      // First half of targets at radius ~scal; rejection-sample points
      // in the unit cube with r>=0.25 to stay away from the source.
      for(size_t i=0;i<N/2;i++){
        T x,y,z,r;
        do{
          x=(drand48()-0.5);
          y=(drand48()-0.5);
          z=(drand48()-0.5);
          r=sqrt(x*x+y*y+z*z);
        }while(r<0.25);
        trg_coord1[i*COORD_DIM+0]=x*scal;
        trg_coord1[i*COORD_DIM+1]=y*scal;
        trg_coord1[i*COORD_DIM+2]=z*scal;
      }
      // Second half of targets at radius ~1/scal.
      for(size_t i=N/2;i<N;i++){
        T x,y,z,r;
        do{
          x=(drand48()-0.5);
          y=(drand48()-0.5);
          z=(drand48()-0.5);
          r=sqrt(x*x+y*y+z*z);
        }while(r<0.25);
        trg_coord1[i*COORD_DIM+0]=x*1.0/scal;
        trg_coord1[i*COORD_DIM+1]=y*1.0/scal;
        trg_coord1[i*COORD_DIM+2]=z*1.0/scal;
      }
      // Evaluate all kernel components source->target; row i of M1 holds
      // the full ker_dim[0] x ker_dim[1] interaction for target i.
      for(size_t i=0;i<N;i++){
        BuildMatrix(&src_coord [ 0], 1,
                    &trg_coord1[i*COORD_DIM], 1, &(M1[i][0]));
        for(size_t j=0;j<ker_dim[0]*ker_dim[1];j++){
          abs_sum+=fabs(M1[i][j]);
        }
      }
      if(abs_sum>sqrt(eps) || scal<eps) break;
      scal=scal*0.5;
    }
    // Re-evaluate at the same targets scaled by 1/2: for a homogeneous
    // kernel, M2 = 2^alpha * M1 componentwise.
    std::vector<T> trg_coord2(N*COORD_DIM);
    Matrix<T> M2(N,ker_dim[0]*ker_dim[1]);
    for(size_t i=0;i<N*COORD_DIM;i++){
      trg_coord2[i]=trg_coord1[i]*0.5;
    }
    for(size_t i=0;i<N;i++){
      BuildMatrix(&src_coord [ 0], 1,
                  &trg_coord2[i*COORD_DIM], 1, &(M2[i][0]));
    }
    T max_val=0;
    for(size_t i=0;i<N;i++){
      for(size_t j=0;j<ker_dim[0]*ker_dim[1];j++){
        max_val=std::max<T>(max_val,M1[i][j]);
        max_val=std::max<T>(max_val,M2[i][j]);
      }
    }
    // Per component: estimate the scaling factor s from the least-squares
    // ratio dot12/dot11, then its exponent log2(s). If the residual is too
    // large the kernel is not scale-invariant.
    for(size_t i=0;i<ker_dim[0]*ker_dim[1];i++){
      T dot11=0, dot12=0, dot22=0;
      for(size_t j=0;j<N;j++){
        dot11+=M1[j][i]*M1[j][i];
        dot12+=M1[j][i]*M2[j][i];
        dot22+=M2[j][i]*M2[j][i];
      }
      if(dot11>max_val*max_val*eps_ &&
         dot22>max_val*max_val*eps_ ){
        T s=dot12/dot11;
        M_scal[0][i]=log(s)/log(2.0);
        T err=sqrt(0.5*(dot22/dot11)/(s*s)-0.5);
        if(err>eps_){
          scale_invar=false;
          M_scal[0][i]=0.0;
        }
        assert(M_scal[0][i]>=0.0); // Kernel function must decay
      }else M_scal[0][i]=-1;
    }
    src_scal.Resize(ker_dim[0]); src_scal.SetZero();
    trg_scal.Resize(ker_dim[1]); trg_scal.SetZero();
    if(scale_invar){
      // Solve (in the least-squares sense, via pseudo-inverse) for per-row
      // source exponents and per-column target exponents such that
      // src_scal[i0] + trg_scal[i1] = M_scal[i0][i1]; the extra row pins
      // the first unknown so the decomposition is unique.
      Matrix<T> b(ker_dim[0]*ker_dim[1]+1,1); b.SetZero();
      mem::memcopy(&b[0][0],&M_scal[0][0],ker_dim[0]*ker_dim[1]*sizeof(T));
      Matrix<T> M(ker_dim[0]*ker_dim[1]+1,ker_dim[0]+ker_dim[1]); M.SetZero();
      M[ker_dim[0]*ker_dim[1]][0]=1;
      for(size_t i0=0;i0<ker_dim[0];i0++)
      for(size_t i1=0;i1<ker_dim[1];i1++){
        size_t j=i0*ker_dim[1]+i1;
        if(b[j][0]>=0){ // skip components flagged -1 (unmeasurable)
          M[j][ 0+ i0]=1;
          M[j][i1+ker_dim[0]]=1;
        }
      }
      Matrix<T> x=M.pinv()*b;
      for(size_t i=0;i<ker_dim[0];i++){
        src_scal[i]=x[i][0];
      }
      for(size_t i=0;i<ker_dim[1];i++){
        trg_scal[i]=x[ker_dim[0]+i][0];
      }
      // Verify the separable decomposition actually reproduces M_scal;
      // otherwise declare the kernel not scale-invariant.
      for(size_t i0=0;i0<ker_dim[0];i0++)
      for(size_t i1=0;i1<ker_dim[1];i1++){
        if(M_scal[i0][i1]>=0){
          if(fabs(src_scal[i0]+trg_scal[i1]-M_scal[i0][i1])>eps_){
            scale_invar=false;
          }
        }
      }
    }
    if(!scale_invar){
      src_scal.SetZero();
      trg_scal.SetZero();
      //std::cout<<ker_name<<" not-scale-invariant\n";
    }
  }
  if(ker_dim[0]*ker_dim[1]>0){ // Determine symmetry
    size_t N=1024;
    T eps_=N*eps;
    T src_coord[3]={0,0,0};
    std::vector<T> trg_coord1(N*COORD_DIM);
    std::vector<T> trg_coord2(N*COORD_DIM);
    // Same two-radius rejection sampling as in the scaling pass.
    for(size_t i=0;i<N/2;i++){
      T x,y,z,r;
      do{
        x=(drand48()-0.5);
        y=(drand48()-0.5);
        z=(drand48()-0.5);
        r=sqrt(x*x+y*y+z*z);
      }while(r<0.25);
      trg_coord1[i*COORD_DIM+0]=x*scal;
      trg_coord1[i*COORD_DIM+1]=y*scal;
      trg_coord1[i*COORD_DIM+2]=z*scal;
    }
    for(size_t i=N/2;i<N;i++){
      T x,y,z,r;
      do{
        x=(drand48()-0.5);
        y=(drand48()-0.5);
        z=(drand48()-0.5);
        r=sqrt(x*x+y*y+z*z);
      }while(r<0.25);
      trg_coord1[i*COORD_DIM+0]=x*1.0/scal;
      trg_coord1[i*COORD_DIM+1]=y*1.0/scal;
      trg_coord1[i*COORD_DIM+2]=z*1.0/scal;
    }
    for(size_t p_type=0;p_type<C_Perm;p_type++){ // For each symmetry transform
      switch(p_type){ // Set trg_coord2
        case ReflecX:
          for(size_t i=0;i<N;i++){
            trg_coord2[i*COORD_DIM+0]=-trg_coord1[i*COORD_DIM+0];
            trg_coord2[i*COORD_DIM+1]= trg_coord1[i*COORD_DIM+1];
            trg_coord2[i*COORD_DIM+2]= trg_coord1[i*COORD_DIM+2];
          }
          break;
        case ReflecY:
          for(size_t i=0;i<N;i++){
            trg_coord2[i*COORD_DIM+0]= trg_coord1[i*COORD_DIM+0];
            trg_coord2[i*COORD_DIM+1]=-trg_coord1[i*COORD_DIM+1];
            trg_coord2[i*COORD_DIM+2]= trg_coord1[i*COORD_DIM+2];
          }
          break;
        case ReflecZ:
          for(size_t i=0;i<N;i++){
            trg_coord2[i*COORD_DIM+0]= trg_coord1[i*COORD_DIM+0];
            trg_coord2[i*COORD_DIM+1]= trg_coord1[i*COORD_DIM+1];
            trg_coord2[i*COORD_DIM+2]=-trg_coord1[i*COORD_DIM+2];
          }
          break;
        case SwapXY:
          for(size_t i=0;i<N;i++){
            trg_coord2[i*COORD_DIM+0]= trg_coord1[i*COORD_DIM+1];
            trg_coord2[i*COORD_DIM+1]= trg_coord1[i*COORD_DIM+0];
            trg_coord2[i*COORD_DIM+2]= trg_coord1[i*COORD_DIM+2];
          }
          break;
        case SwapXZ:
          for(size_t i=0;i<N;i++){
            trg_coord2[i*COORD_DIM+0]= trg_coord1[i*COORD_DIM+2];
            trg_coord2[i*COORD_DIM+1]= trg_coord1[i*COORD_DIM+1];
            trg_coord2[i*COORD_DIM+2]= trg_coord1[i*COORD_DIM+0];
          }
          break;
        default:
          for(size_t i=0;i<N;i++){
            trg_coord2[i*COORD_DIM+0]= trg_coord1[i*COORD_DIM+0];
            trg_coord2[i*COORD_DIM+1]= trg_coord1[i*COORD_DIM+1];
            trg_coord2[i*COORD_DIM+2]= trg_coord1[i*COORD_DIM+2];
          }
      }
      // M11/M22 encode, per kernel component, an equivalence-class id:
      // components with (nearly) identical magnitude profiles get the same
      // |id|, with sign recording relative orientation under the transform.
      Matrix<long long> M11, M22;
      {
        Matrix<T> M1(N,ker_dim[0]*ker_dim[1]); M1.SetZero();
        Matrix<T> M2(N,ker_dim[0]*ker_dim[1]); M2.SetZero();
        for(size_t i=0;i<N;i++){
          BuildMatrix(&src_coord [ 0], 1,
                      &trg_coord1[i*COORD_DIM], 1, &(M1[i][0]));
          BuildMatrix(&src_coord [ 0], 1,
                      &trg_coord2[i*COORD_DIM], 1, &(M2[i][0]));
        }
        // Normalized cross-correlations between all pairs of components:
        // |dot| == 1 means the two columns are (anti-)parallel.
        Matrix<T> dot11(ker_dim[0]*ker_dim[1],ker_dim[0]*ker_dim[1]);dot11.SetZero();
        Matrix<T> dot12(ker_dim[0]*ker_dim[1],ker_dim[0]*ker_dim[1]);dot12.SetZero();
        Matrix<T> dot22(ker_dim[0]*ker_dim[1],ker_dim[0]*ker_dim[1]);dot22.SetZero();
        std::vector<T> norm1(ker_dim[0]*ker_dim[1]);
        std::vector<T> norm2(ker_dim[0]*ker_dim[1]);
        {
          for(size_t k=0;k<N;k++)
          for(size_t i=0;i<ker_dim[0]*ker_dim[1];i++)
          for(size_t j=0;j<ker_dim[0]*ker_dim[1];j++){
            dot11[i][j]+=M1[k][i]*M1[k][j];
            dot12[i][j]+=M1[k][i]*M2[k][j];
            dot22[i][j]+=M2[k][i]*M2[k][j];
          }
          for(size_t i=0;i<ker_dim[0]*ker_dim[1];i++){
            norm1[i]=sqrt(dot11[i][i]);
            norm2[i]=sqrt(dot22[i][i]);
          }
          for(size_t i=0;i<ker_dim[0]*ker_dim[1];i++)
          for(size_t j=0;j<ker_dim[0]*ker_dim[1];j++){
            dot11[i][j]/=(norm1[i]*norm1[j]);
            dot12[i][j]/=(norm1[i]*norm2[j]);
            dot22[i][j]/=(norm2[i]*norm2[j]);
          }
        }
        long long flag=1;
        M11.Resize(ker_dim[0],ker_dim[1]); M11.SetZero();
        M22.Resize(ker_dim[0],ker_dim[1]); M22.SetZero();
        for(size_t i=0;i<ker_dim[0]*ker_dim[1];i++){
          if(norm1[i]>eps_ && M11[0][i]==0){
            for(size_t j=0;j<ker_dim[0]*ker_dim[1];j++){
              if(fabs(norm1[i]-norm1[j])<eps_ && fabs(fabs(dot11[i][j])-1.0)<eps_){
                M11[0][j]=(dot11[i][j]>0?flag:-flag);
              }
              if(fabs(norm1[i]-norm2[j])<eps_ && fabs(fabs(dot12[i][j])-1.0)<eps_){
                M22[0][j]=(dot12[i][j]>0?flag:-flag);
              }
            }
            flag++;
          }
        }
      }
      // P1/P2: candidate row (source) and column (target) permutation
      // compatibility matrices — P[i][j]==1 iff row i of M11 and row j of
      // M22 contain the same multiset of class ids (ignoring sign).
      Matrix<long long> P1, P2;
      { // P1
        Matrix<long long>& P=P1;
        Matrix<long long> M1=M11;
        Matrix<long long> M2=M22;
        for(size_t i=0;i<M1.Dim(0);i++){
          for(size_t j=0;j<M1.Dim(1);j++){
            if(M1[i][j]<0) M1[i][j]=-M1[i][j];
            if(M2[i][j]<0) M2[i][j]=-M2[i][j];
          }
          std::sort(&M1[i][0],&M1[i][M1.Dim(1)]);
          std::sort(&M2[i][0],&M2[i][M2.Dim(1)]);
        }
        P.Resize(M1.Dim(0),M1.Dim(0));
        for(size_t i=0;i<M1.Dim(0);i++)
        for(size_t j=0;j<M1.Dim(0);j++){
          P[i][j]=1;
          for(size_t k=0;k<M1.Dim(1);k++)
          if(M1[i][k]!=M2[j][k]){
            P[i][j]=0;
            break;
          }
        }
      }
      { // P2
        Matrix<long long>& P=P2;
        Matrix<long long> M1=M11.Transpose();
        Matrix<long long> M2=M22.Transpose();
        for(size_t i=0;i<M1.Dim(0);i++){
          for(size_t j=0;j<M1.Dim(1);j++){
            if(M1[i][j]<0) M1[i][j]=-M1[i][j];
            if(M2[i][j]<0) M2[i][j]=-M2[i][j];
          }
          std::sort(&M1[i][0],&M1[i][M1.Dim(1)]);
          std::sort(&M2[i][0],&M2[i][M2.Dim(1)]);
        }
        P.Resize(M1.Dim(0),M1.Dim(0));
        for(size_t i=0;i<M1.Dim(0);i++)
        for(size_t j=0;j<M1.Dim(0);j++){
          P[i][j]=1;
          for(size_t k=0;k<M1.Dim(1);k++)
          if(M1[i][k]!=M2[j][k]){
            P[i][j]=0;
            break;
          }
        }
      }
      // Enumerate every valid permutation allowed by P1 / P2 using an
      // odometer-style walk over each row's admissible columns; only
      // assignments that form a true permutation (a bijection) are kept.
      std::vector<Permutation<long long> > P1vec, P2vec;
      { // P1vec
        Matrix<long long>& Pmat=P1;
        std::vector<Permutation<long long> >& Pvec=P1vec;
        Permutation<long long> P(Pmat.Dim(0));
        Vector<PERM_INT_T>& perm=P.perm;
        perm.SetZero();
        // First permutation
        for(size_t i=0;i<P.Dim();i++)
        for(size_t j=0;j<P.Dim();j++){
          if(Pmat[i][j]){
            perm[i]=j;
            break;
          }
        }
        Vector<PERM_INT_T> perm_tmp;
        while(true){ // Next permutation
          // Accept the current assignment only if sorted entries are
          // exactly 0..Dim-1, i.e. it is a bijection.
          perm_tmp=perm;
          std::sort(&perm_tmp[0],&perm_tmp[0]+perm_tmp.Dim());
          for(size_t i=0;i<perm_tmp.Dim();i++){
            if(perm_tmp[i]!=i) break;
            if(i==perm_tmp.Dim()-1){
              Pvec.push_back(P);
            }
          }
          // Advance to the next admissible assignment (odometer carry).
          bool last=false;
          for(size_t i=0;i<P.Dim();i++){
            PERM_INT_T tmp=perm[i];
            for(size_t j=perm[i]+1;j<P.Dim();j++){
              if(Pmat[i][j]){
                perm[i]=j;
                break;
              }
            }
            if(perm[i]>tmp) break;
            // This digit wrapped: reset to its first admissible value and carry.
            for(size_t j=0;j<P.Dim();j++){
              if(Pmat[i][j]){
                perm[i]=j;
                break;
              }
            }
            if(i==P.Dim()-1) last=true;
          }
          if(last) break;
        }
      }
      { // P2vec (same enumeration as P1vec, over the column permutations)
        Matrix<long long>& Pmat=P2;
        std::vector<Permutation<long long> >& Pvec=P2vec;
        Permutation<long long> P(Pmat.Dim(0));
        Vector<PERM_INT_T>& perm=P.perm;
        perm.SetZero();
        // First permutation
        for(size_t i=0;i<P.Dim();i++)
        for(size_t j=0;j<P.Dim();j++){
          if(Pmat[i][j]){
            perm[i]=j;
            break;
          }
        }
        Vector<PERM_INT_T> perm_tmp;
        while(true){ // Next permutation
          perm_tmp=perm;
          std::sort(&perm_tmp[0],&perm_tmp[0]+perm_tmp.Dim());
          for(size_t i=0;i<perm_tmp.Dim();i++){
            if(perm_tmp[i]!=i) break;
            if(i==perm_tmp.Dim()-1){
              Pvec.push_back(P);
            }
          }
          bool last=false;
          for(size_t i=0;i<P.Dim();i++){
            PERM_INT_T tmp=perm[i];
            for(size_t j=perm[i]+1;j<P.Dim();j++){
              if(Pmat[i][j]){
                perm[i]=j;
                break;
              }
            }
            if(perm[i]>tmp) break;
            for(size_t j=0;j<P.Dim();j++){
              if(Pmat[i][j]){
                perm[i]=j;
                break;
              }
            }
            if(i==P.Dim()-1) last=true;
          }
          if(last) break;
        }
      }
      { // Find pairs which actually work (neglect scaling)
        std::vector<Permutation<long long> > P1vec_, P2vec_;
        Matrix<long long> M1=M11;
        Matrix<long long> M2=M22;
        for(size_t i=0;i<M1.Dim(0);i++){
          for(size_t j=0;j<M1.Dim(1);j++){
            if(M1[i][j]<0) M1[i][j]=-M1[i][j];
            if(M2[i][j]<0) M2[i][j]=-M2[i][j];
          }
        }
        // Keep only (row-perm, col-perm) pairs mapping |M22| onto |M11|.
        Matrix<long long> M;
        for(size_t i=0;i<P1vec.size();i++)
        for(size_t j=0;j<P2vec.size();j++){
          M=P1vec[i]*M2*P2vec[j];
          for(size_t k=0;k<M.Dim(0)*M.Dim(1);k++){
            if(M[0][k]!=M1[0][k]) break;
            if(k==M.Dim(0)*M.Dim(1)-1){
              P1vec_.push_back(P1vec[i]);
              P2vec_.push_back(P2vec[j]);
            }
          }
        }
        P1vec=P1vec_;
        P2vec=P2vec_;
      }
      Permutation<T> P1_, P2_;
      { // Find pairs which actually work
        for(size_t k=0;k<P1vec.size();k++){
          Permutation<long long> P1=P1vec[k];
          Permutation<long long> P2=P2vec[k];
          Matrix<long long> M1= M11 ;
          Matrix<long long> M2=P1*M22*P2;
          // Solve (via pseudo-inverse) for +/-1 sign factors on rows and
          // columns so that signs match as well; the extra pinned row
          // fixes the overall sign ambiguity.
          Matrix<T> M(M1.Dim(0)*M1.Dim(1)+1,M1.Dim(0)+M1.Dim(1));
          M.SetZero(); M[M1.Dim(0)*M1.Dim(1)][0]=1.0;
          for(size_t i=0;i<M1.Dim(0);i++)
          for(size_t j=0;j<M1.Dim(1);j++){
            size_t k=i*M1.Dim(1)+j; // NOTE: shadows the outer loop index k
            M[k][ i]= M1[i][j];
            M[k][M1.Dim(0)+j]=-M2[i][j];
          }
          M=M.pinv();
          { // Construct new permutation
            Permutation<long long> P1_(M1.Dim(0));
            Permutation<long long> P2_(M1.Dim(1));
            for(size_t i=0;i<M1.Dim(0);i++){
              P1_.scal[i]=(M[i][M1.Dim(0)*M1.Dim(1)]>0?1:-1);
            }
            for(size_t i=0;i<M1.Dim(1);i++){
              P2_.scal[i]=(M[M1.Dim(0)+i][M1.Dim(0)*M1.Dim(1)]>0?1:-1);
            }
            P1=P1_*P1 ;
            P2=P2 *P2_;
          }
          // Accept this pair only if the signed match is now exact.
          bool done=true;
          Matrix<long long> Merr=P1*M22*P2-M11;
          for(size_t i=0;i<Merr.Dim(0)*Merr.Dim(1);i++){
            if(Merr[0][i]){
              done=false;
              break;
            }
          }
          if(done){
            P1_=Permutation<T>(P1.Dim());
            P2_=Permutation<T>(P2.Dim());
            for(size_t i=0;i<P1.Dim();i++){
              P1_.perm[i]=P1.perm[i];
              P1_.scal[i]=P1.scal[i];
            }
            for(size_t i=0;i<P2.Dim();i++){
              P2_.perm[i]=P2.perm[i];
              P2_.scal[i]=P2.scal[i];
            }
            break;
          }
        }
      }
      //std::cout<<P1_<<'\n';
      //std::cout<<P2_<<'\n';
      perm_vec[p_type       ]=P1_.Transpose();
      perm_vec[p_type+C_Perm]=P2_;
    }
    // If any symmetry transform failed to produce a permutation, disable
    // symmetry optimizations for this kernel entirely.
    for(size_t i=0;i<2*C_Perm;i++){
      if(perm_vec[i].Dim()==0){
        perm_vec.Resize(0);
        std::cout<<"no-symmetry for: "<<ker_name<<'\n';
        break;
      }
    }
  }
  if(verbose){ // Display kernel information
    std::cout<<"\n";
    std::cout<<"Kernel Name    : "<<ker_name<<'\n';
    std::cout<<"Precision      : "<<(double)eps<<'\n';
    std::cout<<"Symmetry       : "<<(perm_vec.Dim()>0?"yes":"no")<<'\n';
    std::cout<<"Scale Invariant: "<<(scale_invar?"yes":"no")<<'\n';
    if(scale_invar && ker_dim[0]*ker_dim[1]>0){
      std::cout<<"Scaling Matrix :\n";
      Matrix<T> Src(ker_dim[0],1);
      Matrix<T> Trg(1,ker_dim[1]);
      for(size_t i=0;i<ker_dim[0];i++) Src[i][0]=pow(2.0,src_scal[i]);
      for(size_t i=0;i<ker_dim[1];i++) Trg[0][i]=pow(2.0,trg_scal[i]);
      std::cout<<Src*Trg;
    }
    if(ker_dim[0]*ker_dim[1]>0){ // Accuracy of multipole expansion
      std::cout<<"Error          : ";
      for(T rad=1.0; rad>1.0e-2; rad*=0.5){
        int m=8; // multipole order
        // Build equivalent/check surfaces: points on the boundary shell
        // of an m^3 grid, scaled by RAD0 (equiv) and RAD1 (check).
        std::vector<T> equiv_surf;
        std::vector<T> check_surf;
        for(int i0=0;i0<m;i0++){
          for(int i1=0;i1<m;i1++){
            for(int i2=0;i2<m;i2++){
              if(i0== 0 || i1== 0 || i2== 0 ||
                 i0==m-1 || i1==m-1 || i2==m-1){
                // Range: [-1/3,1/3]^3
                T x=((T)2*i0-(m-1))/(m-1)/3;
                T y=((T)2*i1-(m-1))/(m-1)/3;
                T z=((T)2*i2-(m-1))/(m-1)/3;
                equiv_surf.push_back(x*RAD0*rad);
                equiv_surf.push_back(y*RAD0*rad);
                equiv_surf.push_back(z*RAD0*rad);
                check_surf.push_back(x*RAD1*rad);
                check_surf.push_back(y*RAD1*rad);
                check_surf.push_back(z*RAD1*rad);
              }
            }
          }
        }
        size_t n_equiv=equiv_surf.size()/COORD_DIM;
        size_t n_check=equiv_surf.size()/COORD_DIM;
        size_t n_src=m;
        size_t n_trg=m;
        // Random sources inside the box; random targets on a sphere of
        // radius sqrt(COORD_DIM)*rad (well separated from the sources).
        std::vector<T> src_coord;
        std::vector<T> trg_coord;
        for(size_t i=0;i<n_src*COORD_DIM;i++){
          src_coord.push_back((2*drand48()-1)/3*rad);
        }
        for(size_t i=0;i<n_trg;i++){
          T x,y,z,r;
          do{
            x=(drand48()-0.5);
            y=(drand48()-0.5);
            z=(drand48()-0.5);
            r=sqrt(x*x+y*y+z*z);
          }while(r==0.0);
          trg_coord.push_back(x/r*sqrt((T)COORD_DIM)*rad);
          trg_coord.push_back(y/r*sqrt((T)COORD_DIM)*rad);
          trg_coord.push_back(z/r*sqrt((T)COORD_DIM)*rad);
        }
        Matrix<T> M_s2c(n_src*ker_dim[0],n_check*ker_dim[1]);
        BuildMatrix( &src_coord[0], n_src,
                    &check_surf[0], n_check, &(M_s2c[0][0]));
        Matrix<T> M_e2c(n_equiv*ker_dim[0],n_check*ker_dim[1]);
        BuildMatrix(&equiv_surf[0], n_equiv,
                    &check_surf[0], n_check, &(M_e2c[0][0]));
        // Regularized pseudo-inverse of M_e2c via SVD with singular-value
        // thresholding (split into two factors M_c2e0 * M_c2e1).
        Matrix<T> M_c2e0, M_c2e1;
        {
          Matrix<T> U,S,V;
          M_e2c.SVD(U,S,V);
          T eps=1, max_S=0; // NOTE: local eps shadows the outer machine-epsilon
          while(eps*(T)0.5+(T)1.0>1.0) eps*=0.5;
          for(size_t i=0;i<std::min(S.Dim(0),S.Dim(1));i++){
            if(fabs(S[i][i])>max_S) max_S=fabs(S[i][i]);
          }
          for(size_t i=0;i<S.Dim(0);i++) S[i][i]=(S[i][i]>eps*max_S*4?1.0/S[i][i]:0.0);
          M_c2e0=V.Transpose()*S;
          M_c2e1=U.Transpose();
        }
        Matrix<T> M_e2t(n_equiv*ker_dim[0],n_trg*ker_dim[1]);
        BuildMatrix(&equiv_surf[0], n_equiv,
                    &trg_coord[0], n_trg , &(M_e2t[0][0]));
        Matrix<T> M_s2t(n_src*ker_dim[0],n_trg*ker_dim[1]);
        BuildMatrix( &src_coord[0], n_src,
                    &trg_coord[0], n_trg , &(M_s2t[0][0]));
        // Relative error of the equivalent-surface approximation vs the
        // direct source-to-target interaction.
        Matrix<T> M=(M_s2c*M_c2e0)*(M_c2e1*M_e2t)-M_s2t;
        T max_error=0, max_value=0;
        for(size_t i=0;i<M.Dim(0);i++)
        for(size_t j=0;j<M.Dim(1);j++){
          max_error=std::max<T>(max_error,fabs(M    [i][j]));
          max_value=std::max<T>(max_value,fabs(M_s2t[i][j]));
        }
        std::cout<<(double)(max_error/max_value)<<' ';
        if(scale_invar) break; // one radius suffices for invariant kernels
      }
      std::cout<<"\n";
    }
    std::cout<<"\n";
  }
  { // Initialize auxiliary FMM kernels
    // Any translation-stage kernel left unset defaults to this kernel.
    if(!k_s2m) k_s2m=this;
    if(!k_s2l) k_s2l=this;
    if(!k_s2t) k_s2t=this;
    if(!k_m2m) k_m2m=this;
    if(!k_m2l) k_m2l=this;
    if(!k_m2t) k_m2t=this;
    if(!k_l2l) k_l2l=this;
    if(!k_l2t) k_l2t=this;
    // Dimension-compatibility checks across the FMM translation stages.
    assert(k_s2t->ker_dim[0]==ker_dim[0]);
    assert(k_s2m->ker_dim[0]==k_s2l->ker_dim[0]);
    assert(k_s2m->ker_dim[0]==k_s2t->ker_dim[0]);
    assert(k_m2m->ker_dim[0]==k_m2l->ker_dim[0]);
    assert(k_m2m->ker_dim[0]==k_m2t->ker_dim[0]);
    assert(k_l2l->ker_dim[0]==k_l2t->ker_dim[0]);
    assert(k_s2t->ker_dim[1]==ker_dim[1]);
    assert(k_s2m->ker_dim[1]==k_m2m->ker_dim[1]);
    assert(k_s2l->ker_dim[1]==k_l2l->ker_dim[1]);
    assert(k_m2l->ker_dim[1]==k_l2l->ker_dim[1]);
    assert(k_s2t->ker_dim[1]==k_m2t->ker_dim[1]);
    assert(k_s2t->ker_dim[1]==k_l2t->ker_dim[1]);
    k_s2m->Initialize(verbose);
    k_s2l->Initialize(verbose);
    k_s2t->Initialize(verbose);
    k_m2m->Initialize(verbose);
    k_m2l->Initialize(verbose);
    k_m2t->Initialize(verbose);
    k_l2l->Initialize(verbose);
    k_l2t->Initialize(verbose);
  }
}
  657. /**
  658. * \brief Compute the transformation matrix (on the source strength vector)
  659. * to get potential at target coordinates due to sources at the given
  660. * coordinates.
  661. * \param[in] r_src Coordinates of source points.
  662. * \param[in] src_cnt Number of source points.
  663. * \param[in] r_trg Coordinates of target points.
  664. * \param[in] trg_cnt Number of target points.
  665. * \param[out] k_out Output array with potential values.
  666. */
  667. template <class T>
  668. void Kernel<T>::BuildMatrix(T* r_src, int src_cnt,
  669. T* r_trg, int trg_cnt, T* k_out) const{
  670. int dim=3; //Only supporting 3D
  671. memset(k_out, 0, src_cnt*ker_dim[0]*trg_cnt*ker_dim[1]*sizeof(T));
  672. for(int i=0;i<src_cnt;i++) //TODO Optimize this.
  673. for(int j=0;j<ker_dim[0];j++){
  674. std::vector<T> v_src(ker_dim[0],0);
  675. v_src[j]=1.0;
  676. ker_poten(&r_src[i*dim], 1, &v_src[0], 1, r_trg, trg_cnt,
  677. &k_out[(i*ker_dim[0]+j)*trg_cnt*ker_dim[1]], NULL);
  678. }
  679. }
  680. /**
  681. * \brief Generic kernel which rearranges data for vectorization, calls the
  682. * actual uKernel and copies data to the output array in the original order.
  683. */
  684. template <class Real_t, int SRC_DIM, int TRG_DIM, void (*uKernel)(Matrix<Real_t>&, Matrix<Real_t>&, Matrix<Real_t>&, Matrix<Real_t>&)>
  685. void generic_kernel(Real_t* r_src, int src_cnt, Real_t* v_src, int dof, Real_t* r_trg, int trg_cnt, Real_t* v_trg, mem::MemoryManager* mem_mgr){
  686. assert(dof==1);
  687. int VecLen=8;
  688. if(sizeof(Real_t)==sizeof( float)) VecLen=8;
  689. if(sizeof(Real_t)==sizeof(double)) VecLen=4;
  690. #define STACK_BUFF_SIZE 4096
  691. Real_t stack_buff[STACK_BUFF_SIZE+MEM_ALIGN];
  692. Real_t* buff=NULL;
  693. Matrix<Real_t> src_coord;
  694. Matrix<Real_t> src_value;
  695. Matrix<Real_t> trg_coord;
  696. Matrix<Real_t> trg_value;
  697. { // Rearrange data in src_coord, src_coord, trg_coord, trg_value
  698. size_t src_cnt_, trg_cnt_; // counts after zero padding
  699. src_cnt_=((src_cnt+VecLen-1)/VecLen)*VecLen;
  700. trg_cnt_=((trg_cnt+VecLen-1)/VecLen)*VecLen;
  701. size_t buff_size=src_cnt_*(COORD_DIM+SRC_DIM)+
  702. trg_cnt_*(COORD_DIM+TRG_DIM);
  703. if(buff_size>STACK_BUFF_SIZE){ // Allocate buff
  704. buff=mem::aligned_new<Real_t>(buff_size, mem_mgr);
  705. }
  706. Real_t* buff_ptr=buff;
  707. if(!buff_ptr){ // use stack_buff
  708. uintptr_t ptr=(uintptr_t)stack_buff;
  709. static uintptr_t ALIGN_MINUS_ONE=MEM_ALIGN-1;
  710. static uintptr_t NOT_ALIGN_MINUS_ONE=~ALIGN_MINUS_ONE;
  711. ptr=((ptr+ALIGN_MINUS_ONE) & NOT_ALIGN_MINUS_ONE);
  712. buff_ptr=(Real_t*)ptr;
  713. }
  714. src_coord.ReInit(COORD_DIM, src_cnt_,buff_ptr,false); buff_ptr+=COORD_DIM*src_cnt_;
  715. src_value.ReInit( SRC_DIM, src_cnt_,buff_ptr,false); buff_ptr+= SRC_DIM*src_cnt_;
  716. trg_coord.ReInit(COORD_DIM, trg_cnt_,buff_ptr,false); buff_ptr+=COORD_DIM*trg_cnt_;
  717. trg_value.ReInit( TRG_DIM, trg_cnt_,buff_ptr,false);//buff_ptr+= TRG_DIM*trg_cnt_;
  718. { // Set src_coord
  719. size_t i=0;
  720. for( ;i<src_cnt ;i++){
  721. for(size_t j=0;j<COORD_DIM;j++){
  722. src_coord[j][i]=r_src[i*COORD_DIM+j];
  723. }
  724. }
  725. for( ;i<src_cnt_;i++){
  726. for(size_t j=0;j<COORD_DIM;j++){
  727. src_coord[j][i]=0;
  728. }
  729. }
  730. }
  731. { // Set src_value
  732. size_t i=0;
  733. for( ;i<src_cnt ;i++){
  734. for(size_t j=0;j<SRC_DIM;j++){
  735. src_value[j][i]=v_src[i*SRC_DIM+j];
  736. }
  737. }
  738. for( ;i<src_cnt_;i++){
  739. for(size_t j=0;j<SRC_DIM;j++){
  740. src_value[j][i]=0;
  741. }
  742. }
  743. }
  744. { // Set trg_coord
  745. size_t i=0;
  746. for( ;i<trg_cnt ;i++){
  747. for(size_t j=0;j<COORD_DIM;j++){
  748. trg_coord[j][i]=r_trg[i*COORD_DIM+j];
  749. }
  750. }
  751. for( ;i<trg_cnt_;i++){
  752. for(size_t j=0;j<COORD_DIM;j++){
  753. trg_coord[j][i]=0;
  754. }
  755. }
  756. }
  757. { // Set trg_value
  758. size_t i=0;
  759. for( ;i<trg_cnt_;i++){
  760. for(size_t j=0;j<TRG_DIM;j++){
  761. trg_value[j][i]=0;
  762. }
  763. }
  764. }
  765. }
  766. uKernel(src_coord,src_value,trg_coord,trg_value);
  767. { // Set v_trg
  768. for(size_t i=0;i<trg_cnt ;i++){
  769. for(size_t j=0;j<TRG_DIM;j++){
  770. v_trg[i*TRG_DIM+j]+=trg_value[j][i];
  771. }
  772. }
  773. }
  774. if(buff){ // Free memory: buff
  775. mem::aligned_delete<Real_t>(buff);
  776. }
  777. }
  778. ////////////////////////////////////////////////////////////////////////////////
  779. //////// LAPLACE KERNEL ////////
  780. ////////////////////////////////////////////////////////////////////////////////
  781. /**
 * \brief Green's function for the Poisson equation. Kernel tensor
  783. * dimension = 1x1.
  784. */
template <class Real_t, class Vec_t=Real_t, Vec_t (*RINV_INTRIN)(Vec_t)=rinv_intrin0<Vec_t> >
void laplace_poten_uKernel(Matrix<Real_t>& src_coord, Matrix<Real_t>& src_value, Matrix<Real_t>& trg_coord, Matrix<Real_t>& trg_value){
  #define SRC_BLK 1000
  size_t VecLen=sizeof(Vec_t)/sizeof(Real_t); // SIMD width in Real_t elements
  //// Number of newton iterations (identified from the rsqrt variant in use)
  size_t NWTN_ITER=0;
  if(RINV_INTRIN==(Vec_t (*)(Vec_t))rinv_intrin0<Vec_t,Real_t>) NWTN_ITER=0;
  if(RINV_INTRIN==(Vec_t (*)(Vec_t))rinv_intrin1<Vec_t,Real_t>) NWTN_ITER=1;
  if(RINV_INTRIN==(Vec_t (*)(Vec_t))rinv_intrin2<Vec_t,Real_t>) NWTN_ITER=2;
  if(RINV_INTRIN==(Vec_t (*)(Vec_t))rinv_intrin3<Vec_t,Real_t>) NWTN_ITER=3;

  // Scaling left over by the un-normalized Newton rsqrt variants; one factor
  // of nwtn_scal per rinv, folded into the 1/(4*pi) constant below.
  Real_t nwtn_scal=1; // scaling factor for newton iterations
  for(int i=0;i<NWTN_ITER;i++){
    nwtn_scal=2*nwtn_scal*nwtn_scal*nwtn_scal;
  }
  const Real_t OOFP = 1.0/(4*nwtn_scal*const_pi<Real_t>());

  size_t src_cnt_=src_coord.Dim(1);
  size_t trg_cnt_=trg_coord.Dim(1);
  for(size_t sblk=0;sblk<src_cnt_;sblk+=SRC_BLK){ // block over sources for cache reuse
    size_t src_cnt=src_cnt_-sblk;
    if(src_cnt>SRC_BLK) src_cnt=SRC_BLK;
    for(size_t t=0;t<trg_cnt_;t+=VecLen){ // one SIMD vector of targets at a time
      Vec_t tx=load_intrin<Vec_t>(&trg_coord[0][t]);
      Vec_t ty=load_intrin<Vec_t>(&trg_coord[1][t]);
      Vec_t tz=load_intrin<Vec_t>(&trg_coord[2][t]);
      Vec_t tv=zero_intrin<Vec_t>(); // accumulated (unscaled) potential
      for(size_t s=sblk;s<sblk+src_cnt;s++){
        Vec_t dx=sub_intrin(tx,bcast_intrin<Vec_t>(&src_coord[0][s]));
        Vec_t dy=sub_intrin(ty,bcast_intrin<Vec_t>(&src_coord[1][s]));
        Vec_t dz=sub_intrin(tz,bcast_intrin<Vec_t>(&src_coord[2][s]));
        Vec_t sv=              bcast_intrin<Vec_t>(&src_value[0][s]) ;
        Vec_t r2=        mul_intrin(dx,dx) ;
        r2=add_intrin(r2,mul_intrin(dy,dy));
        r2=add_intrin(r2,mul_intrin(dz,dz));
        Vec_t rinv=RINV_INTRIN(r2); // approximate 1/r (scaled by nwtn_scal)
        tv=add_intrin(tv,mul_intrin(rinv,sv));
      }
      // Apply the 1/(4*pi) scaling and accumulate into the output row.
      Vec_t oofp=set_intrin<Vec_t,Real_t>(OOFP);
      tv=add_intrin(mul_intrin(tv,oofp),load_intrin<Vec_t>(&trg_value[0][t]));
      store_intrin(&trg_value[0][t],tv);
    }
  }
  { // Add FLOPS
    #ifndef __MIC__
    Profile::Add_FLOP((long long)trg_cnt_*(long long)src_cnt_*(12+4*(NWTN_ITER)));
    #endif
  }
  #undef SRC_BLK
}
// Dispatcher for the Laplace single-layer potential: selects the SIMD vector
// type matching the precision of T and the compile-time ISA (MIC/AVX/SSE3/
// scalar fallback), then calls generic_kernel with the uKernel specialized
// for the requested number of Newton rsqrt refinement steps (newton_iter).
template <class T, int newton_iter=0>
void laplace_poten(T* r_src, int src_cnt, T* v_src, int dof, T* r_trg, int trg_cnt, T* v_trg, mem::MemoryManager* mem_mgr){
  #define LAP_KER_NWTN(nwtn) if(newton_iter==nwtn) \
  generic_kernel<Real_t, 1, 1, laplace_poten_uKernel<Real_t,Vec_t, rinv_intrin##nwtn<Vec_t,Real_t> > > \
  ((Real_t*)r_src, src_cnt, (Real_t*)v_src, dof, (Real_t*)r_trg, trg_cnt, (Real_t*)v_trg, mem_mgr)
  #define LAPLACE_KERNEL LAP_KER_NWTN(0); LAP_KER_NWTN(1); LAP_KER_NWTN(2); LAP_KER_NWTN(3);

  if(mem::TypeTraits<T>::ID()==mem::TypeTraits<float>::ID()){ // single precision
    typedef float Real_t;
    #if defined __MIC__
    #define Vec_t Real_t
    #elif defined __AVX__
    #define Vec_t __m256
    #elif defined __SSE3__
    #define Vec_t __m128
    #else
    #define Vec_t Real_t
    #endif
    LAPLACE_KERNEL;
    #undef Vec_t
  }else if(mem::TypeTraits<T>::ID()==mem::TypeTraits<double>::ID()){ // double precision
    typedef double Real_t;
    #if defined __MIC__
    #define Vec_t Real_t
    #elif defined __AVX__
    #define Vec_t __m256d
    #elif defined __SSE3__
    #define Vec_t __m128d
    #else
    #define Vec_t Real_t
    #endif
    LAPLACE_KERNEL;
    #undef Vec_t
  }else{ // generic scalar fallback for other value types
    typedef T Real_t;
    #define Vec_t Real_t
    LAPLACE_KERNEL;
    #undef Vec_t
  }
  #undef LAP_KER_NWTN
  #undef LAPLACE_KERNEL
}
  874. // Laplace double layer potential.
// uKernel for the Laplace double-layer (dipole) potential. src_value has 4
// rows: rows 0-2 are the dipole direction (surface normal), row 3 is the
// density. Output is 1 row (scalar potential), accumulated into trg_value.
template <class Real_t, class Vec_t=Real_t, Vec_t (*RINV_INTRIN)(Vec_t)=rinv_intrin0<Vec_t> >
void laplace_dbl_uKernel(Matrix<Real_t>& src_coord, Matrix<Real_t>& src_value, Matrix<Real_t>& trg_coord, Matrix<Real_t>& trg_value){
  #define SRC_BLK 500
  size_t VecLen=sizeof(Vec_t)/sizeof(Real_t); // SIMD width in Real_t elements
  //// Number of newton iterations (identified from the rsqrt variant in use)
  size_t NWTN_ITER=0;
  if(RINV_INTRIN==(Vec_t (*)(Vec_t))rinv_intrin0<Vec_t,Real_t>) NWTN_ITER=0;
  if(RINV_INTRIN==(Vec_t (*)(Vec_t))rinv_intrin1<Vec_t,Real_t>) NWTN_ITER=1;
  if(RINV_INTRIN==(Vec_t (*)(Vec_t))rinv_intrin2<Vec_t,Real_t>) NWTN_ITER=2;
  if(RINV_INTRIN==(Vec_t (*)(Vec_t))rinv_intrin3<Vec_t,Real_t>) NWTN_ITER=3;

  Real_t nwtn_scal=1; // scaling factor for newton iterations
  for(int i=0;i<NWTN_ITER;i++){
    nwtn_scal=2*nwtn_scal*nwtn_scal*nwtn_scal;
  }
  // Cubed nwtn_scal here because rinv appears to the third power (r3inv).
  const Real_t OOFP = -1.0/(4*nwtn_scal*nwtn_scal*nwtn_scal*const_pi<Real_t>());
  size_t src_cnt_=src_coord.Dim(1);
  size_t trg_cnt_=trg_coord.Dim(1);
  for(size_t sblk=0;sblk<src_cnt_;sblk+=SRC_BLK){ // block over sources for cache reuse
    size_t src_cnt=src_cnt_-sblk;
    if(src_cnt>SRC_BLK) src_cnt=SRC_BLK;
    for(size_t t=0;t<trg_cnt_;t+=VecLen){ // one SIMD vector of targets at a time
      Vec_t tx=load_intrin<Vec_t>(&trg_coord[0][t]);
      Vec_t ty=load_intrin<Vec_t>(&trg_coord[1][t]);
      Vec_t tz=load_intrin<Vec_t>(&trg_coord[2][t]);
      Vec_t tv=zero_intrin<Vec_t>(); // accumulated (unscaled) potential
      for(size_t s=sblk;s<sblk+src_cnt;s++){
        Vec_t dx=sub_intrin(tx,bcast_intrin<Vec_t>(&src_coord[0][s]));
        Vec_t dy=sub_intrin(ty,bcast_intrin<Vec_t>(&src_coord[1][s]));
        Vec_t dz=sub_intrin(tz,bcast_intrin<Vec_t>(&src_coord[2][s]));
        Vec_t sn0=             bcast_intrin<Vec_t>(&src_value[0][s]) ; // normal x
        Vec_t sn1=             bcast_intrin<Vec_t>(&src_value[1][s]) ; // normal y
        Vec_t sn2=             bcast_intrin<Vec_t>(&src_value[2][s]) ; // normal z
        Vec_t sv=              bcast_intrin<Vec_t>(&src_value[3][s]) ; // density
        Vec_t r2=        mul_intrin(dx,dx) ;
        r2=add_intrin(r2,mul_intrin(dy,dy));
        r2=add_intrin(r2,mul_intrin(dz,dz));
        Vec_t rinv=RINV_INTRIN(r2); // approximate 1/r (scaled)
        Vec_t r3inv=mul_intrin(mul_intrin(rinv,rinv),rinv);
        // (r . n): projection of the displacement onto the dipole direction.
        Vec_t rdotn=            mul_intrin(sn0,dx);
        rdotn=add_intrin(rdotn, mul_intrin(sn1,dy));
        rdotn=add_intrin(rdotn, mul_intrin(sn2,dz));
        sv=mul_intrin(sv,rdotn);
        tv=add_intrin(tv,mul_intrin(r3inv,sv));
      }
      Vec_t oofp=set_intrin<Vec_t,Real_t>(OOFP);
      tv=add_intrin(mul_intrin(tv,oofp),load_intrin<Vec_t>(&trg_value[0][t]));
      store_intrin(&trg_value[0][t],tv);
    }
  }
  { // Add FLOPS
    #ifndef __MIC__
    Profile::Add_FLOP((long long)trg_cnt_*(long long)src_cnt_*(20+4*(NWTN_ITER)));
    #endif
  }
  #undef SRC_BLK
}
// Dispatcher for the Laplace double-layer potential (4 source components:
// normal + density, 1 target component). Same Vec_t/ISA selection scheme as
// laplace_poten; newton_iter selects the rsqrt refinement variant.
template <class T, int newton_iter=0>
void laplace_dbl_poten(T* r_src, int src_cnt, T* v_src, int dof, T* r_trg, int trg_cnt, T* v_trg, mem::MemoryManager* mem_mgr){
  #define LAP_KER_NWTN(nwtn) if(newton_iter==nwtn) \
  generic_kernel<Real_t, 4, 1, laplace_dbl_uKernel<Real_t,Vec_t, rinv_intrin##nwtn<Vec_t,Real_t> > > \
  ((Real_t*)r_src, src_cnt, (Real_t*)v_src, dof, (Real_t*)r_trg, trg_cnt, (Real_t*)v_trg, mem_mgr)
  #define LAPLACE_KERNEL LAP_KER_NWTN(0); LAP_KER_NWTN(1); LAP_KER_NWTN(2); LAP_KER_NWTN(3);

  if(mem::TypeTraits<T>::ID()==mem::TypeTraits<float>::ID()){ // single precision
    typedef float Real_t;
    #if defined __MIC__
    #define Vec_t Real_t
    #elif defined __AVX__
    #define Vec_t __m256
    #elif defined __SSE3__
    #define Vec_t __m128
    #else
    #define Vec_t Real_t
    #endif
    LAPLACE_KERNEL;
    #undef Vec_t
  }else if(mem::TypeTraits<T>::ID()==mem::TypeTraits<double>::ID()){ // double precision
    typedef double Real_t;
    #if defined __MIC__
    #define Vec_t Real_t
    #elif defined __AVX__
    #define Vec_t __m256d
    #elif defined __SSE3__
    #define Vec_t __m128d
    #else
    #define Vec_t Real_t
    #endif
    LAPLACE_KERNEL;
    #undef Vec_t
  }else{ // generic scalar fallback for other value types
    typedef T Real_t;
    #define Vec_t Real_t
    LAPLACE_KERNEL;
    #undef Vec_t
  }
  #undef LAP_KER_NWTN
  #undef LAPLACE_KERNEL
}
// Laplace gradient kernel.
// uKernel for the gradient of the Laplace single-layer potential: 1 source
// component (density), 3 target components (gradient), accumulated into
// trg_value rows 0-2.
template <class Real_t, class Vec_t=Real_t, Vec_t (*RINV_INTRIN)(Vec_t)=rinv_intrin0<Vec_t> >
void laplace_grad_uKernel(Matrix<Real_t>& src_coord, Matrix<Real_t>& src_value, Matrix<Real_t>& trg_coord, Matrix<Real_t>& trg_value){
  #define SRC_BLK 500
  size_t VecLen=sizeof(Vec_t)/sizeof(Real_t); // SIMD width in Real_t elements
  //// Number of newton iterations (identified from the rsqrt variant in use)
  size_t NWTN_ITER=0;
  if(RINV_INTRIN==(Vec_t (*)(Vec_t))rinv_intrin0<Vec_t,Real_t>) NWTN_ITER=0;
  if(RINV_INTRIN==(Vec_t (*)(Vec_t))rinv_intrin1<Vec_t,Real_t>) NWTN_ITER=1;
  if(RINV_INTRIN==(Vec_t (*)(Vec_t))rinv_intrin2<Vec_t,Real_t>) NWTN_ITER=2;
  if(RINV_INTRIN==(Vec_t (*)(Vec_t))rinv_intrin3<Vec_t,Real_t>) NWTN_ITER=3;

  Real_t nwtn_scal=1; // scaling factor for newton iterations
  for(int i=0;i<NWTN_ITER;i++){
    nwtn_scal=2*nwtn_scal*nwtn_scal*nwtn_scal;
  }
  // Cubed nwtn_scal here because rinv appears to the third power (r3inv).
  const Real_t OOFP = -1.0/(4*nwtn_scal*nwtn_scal*nwtn_scal*const_pi<Real_t>());
  size_t src_cnt_=src_coord.Dim(1);
  size_t trg_cnt_=trg_coord.Dim(1);
  for(size_t sblk=0;sblk<src_cnt_;sblk+=SRC_BLK){ // block over sources for cache reuse
    size_t src_cnt=src_cnt_-sblk;
    if(src_cnt>SRC_BLK) src_cnt=SRC_BLK;
    for(size_t t=0;t<trg_cnt_;t+=VecLen){ // one SIMD vector of targets at a time
      Vec_t tx=load_intrin<Vec_t>(&trg_coord[0][t]);
      Vec_t ty=load_intrin<Vec_t>(&trg_coord[1][t]);
      Vec_t tz=load_intrin<Vec_t>(&trg_coord[2][t]);
      Vec_t tv0=zero_intrin<Vec_t>(); // gradient accumulator, x component
      Vec_t tv1=zero_intrin<Vec_t>(); // gradient accumulator, y component
      Vec_t tv2=zero_intrin<Vec_t>(); // gradient accumulator, z component
      for(size_t s=sblk;s<sblk+src_cnt;s++){
        Vec_t dx=sub_intrin(tx,bcast_intrin<Vec_t>(&src_coord[0][s]));
        Vec_t dy=sub_intrin(ty,bcast_intrin<Vec_t>(&src_coord[1][s]));
        Vec_t dz=sub_intrin(tz,bcast_intrin<Vec_t>(&src_coord[2][s]));
        Vec_t sv=              bcast_intrin<Vec_t>(&src_value[0][s]) ;
        Vec_t r2=        mul_intrin(dx,dx) ;
        r2=add_intrin(r2,mul_intrin(dy,dy));
        r2=add_intrin(r2,mul_intrin(dz,dz));
        Vec_t rinv=RINV_INTRIN(r2); // approximate 1/r (scaled)
        Vec_t r3inv=mul_intrin(mul_intrin(rinv,rinv),rinv);
        sv=mul_intrin(sv,r3inv); // density / r^3
        tv0=add_intrin(tv0,mul_intrin(sv,dx));
        tv1=add_intrin(tv1,mul_intrin(sv,dy));
        tv2=add_intrin(tv2,mul_intrin(sv,dz));
      }
      Vec_t oofp=set_intrin<Vec_t,Real_t>(OOFP);
      tv0=add_intrin(mul_intrin(tv0,oofp),load_intrin<Vec_t>(&trg_value[0][t]));
      tv1=add_intrin(mul_intrin(tv1,oofp),load_intrin<Vec_t>(&trg_value[1][t]));
      tv2=add_intrin(mul_intrin(tv2,oofp),load_intrin<Vec_t>(&trg_value[2][t]));
      store_intrin(&trg_value[0][t],tv0);
      store_intrin(&trg_value[1][t],tv1);
      store_intrin(&trg_value[2][t],tv2);
    }
  }
  { // Add FLOPS
    #ifndef __MIC__
    Profile::Add_FLOP((long long)trg_cnt_*(long long)src_cnt_*(19+4*(NWTN_ITER)));
    #endif
  }
  #undef SRC_BLK
}
// Dispatcher for the Laplace gradient kernel (1 source component, 3 target
// components). Same Vec_t/ISA selection scheme as laplace_poten; newton_iter
// selects the rsqrt refinement variant.
template <class T, int newton_iter=0>
void laplace_grad(T* r_src, int src_cnt, T* v_src, int dof, T* r_trg, int trg_cnt, T* v_trg, mem::MemoryManager* mem_mgr){
  #define LAP_KER_NWTN(nwtn) if(newton_iter==nwtn) \
  generic_kernel<Real_t, 1, 3, laplace_grad_uKernel<Real_t,Vec_t, rinv_intrin##nwtn<Vec_t,Real_t> > > \
  ((Real_t*)r_src, src_cnt, (Real_t*)v_src, dof, (Real_t*)r_trg, trg_cnt, (Real_t*)v_trg, mem_mgr)
  #define LAPLACE_KERNEL LAP_KER_NWTN(0); LAP_KER_NWTN(1); LAP_KER_NWTN(2); LAP_KER_NWTN(3);

  if(mem::TypeTraits<T>::ID()==mem::TypeTraits<float>::ID()){ // single precision
    typedef float Real_t;
    #if defined __MIC__
    #define Vec_t Real_t
    #elif defined __AVX__
    #define Vec_t __m256
    #elif defined __SSE3__
    #define Vec_t __m128
    #else
    #define Vec_t Real_t
    #endif
    LAPLACE_KERNEL;
    #undef Vec_t
  }else if(mem::TypeTraits<T>::ID()==mem::TypeTraits<double>::ID()){ // double precision
    typedef double Real_t;
    #if defined __MIC__
    #define Vec_t Real_t
    #elif defined __AVX__
    #define Vec_t __m256d
    #elif defined __SSE3__
    #define Vec_t __m128d
    #else
    #define Vec_t Real_t
    #endif
    LAPLACE_KERNEL;
    #undef Vec_t
  }else{ // generic scalar fallback for other value types
    typedef T Real_t;
    #define Vec_t Real_t
    LAPLACE_KERNEL;
    #undef Vec_t
  }
  #undef LAP_KER_NWTN
  #undef LAPLACE_KERNEL
}
// Singleton Laplace single-layer kernel object ("laplace", 3D, tensor
// dimension 1x1); the generic-precision build uses 1 Newton rsqrt step.
template<class T> const Kernel<T>& LaplaceKernel<T>::potential(){
  static Kernel<T> potn_ker=BuildKernel<T, laplace_poten<T,1>, laplace_dbl_poten<T,1> >("laplace" , 3, std::pair<int,int>(1,1));
  return potn_ker;
}
// Singleton Laplace gradient kernel object ("laplace_grad", 3D, tensor
// dimension 1x3); the potential kernel is supplied for the auxiliary FMM
// translation operators. Generic precision uses 1 Newton rsqrt step.
template<class T> const Kernel<T>& LaplaceKernel<T>::gradient(){
  static Kernel<T> potn_ker=BuildKernel<T, laplace_poten<T,1>, laplace_dbl_poten<T,1> >("laplace" , 3, std::pair<int,int>(1,1));
  static Kernel<T> grad_ker=BuildKernel<T, laplace_grad <T,1> >("laplace_grad", 3, std::pair<int,int>(1,3),
      &potn_ker, &potn_ker, NULL, &potn_ker, &potn_ker, NULL, &potn_ker, NULL);
  return grad_ker;
}
// Double-precision specialization: uses 2 Newton rsqrt refinement steps for
// the extra accuracy double requires.
template<> const Kernel<double>& LaplaceKernel<double>::potential(){
  typedef double T;
  static Kernel<T> potn_ker=BuildKernel<T, laplace_poten<T,2>, laplace_dbl_poten<T,2> >("laplace" , 3, std::pair<int,int>(1,1));
  return potn_ker;
}
// Double-precision specialization of the gradient kernel: 2 Newton rsqrt
// refinement steps, otherwise identical to the generic version.
template<> const Kernel<double>& LaplaceKernel<double>::gradient(){
  typedef double T;
  static Kernel<T> potn_ker=BuildKernel<T, laplace_poten<T,2>, laplace_dbl_poten<T,2> >("laplace" , 3, std::pair<int,int>(1,1));
  static Kernel<T> grad_ker=BuildKernel<T, laplace_grad <T,2> >("laplace_grad", 3, std::pair<int,int>(1,3),
      &potn_ker, &potn_ker, NULL, &potn_ker, &potn_ker, NULL, &potn_ker, NULL);
  return grad_ker;
}
  1094. ////////////////////////////////////////////////////////////////////////////////
  1095. //////// STOKES KERNEL ////////
  1096. ////////////////////////////////////////////////////////////////////////////////
  1097. /**
 * \brief Green's function for the Stokes equations. Kernel tensor
  1099. * dimension = 3x3.
  1100. */
template <class Real_t, class Vec_t=Real_t, Vec_t (*RINV_INTRIN)(Vec_t)=rinv_intrin0<Vec_t> >
void stokes_vel_uKernel(Matrix<Real_t>& src_coord, Matrix<Real_t>& src_value, Matrix<Real_t>& trg_coord, Matrix<Real_t>& trg_value){
  #define SRC_BLK 500
  size_t VecLen=sizeof(Vec_t)/sizeof(Real_t); // SIMD width in Real_t elements
  //// Number of newton iterations (identified from the rsqrt variant in use)
  size_t NWTN_ITER=0;
  if(RINV_INTRIN==(Vec_t (*)(Vec_t))rinv_intrin0<Vec_t,Real_t>) NWTN_ITER=0;
  if(RINV_INTRIN==(Vec_t (*)(Vec_t))rinv_intrin1<Vec_t,Real_t>) NWTN_ITER=1;
  if(RINV_INTRIN==(Vec_t (*)(Vec_t))rinv_intrin2<Vec_t,Real_t>) NWTN_ITER=2;
  if(RINV_INTRIN==(Vec_t (*)(Vec_t))rinv_intrin3<Vec_t,Real_t>) NWTN_ITER=3;

  Real_t nwtn_scal=1; // scaling factor for newton iterations
  for(int i=0;i<NWTN_ITER;i++){
    nwtn_scal=2*nwtn_scal*nwtn_scal*nwtn_scal;
  }
  const Real_t OOEP = 1.0/(8*nwtn_scal*const_pi<Real_t>());
  // Extra compensation for the two additional rinv factors in rinv2 below.
  Vec_t inv_nwrn_scal2=set_intrin<Vec_t,Real_t>(1.0/(nwtn_scal*nwtn_scal));

  size_t src_cnt_=src_coord.Dim(1);
  size_t trg_cnt_=trg_coord.Dim(1);
  for(size_t sblk=0;sblk<src_cnt_;sblk+=SRC_BLK){ // block over sources for cache reuse
    size_t src_cnt=src_cnt_-sblk;
    if(src_cnt>SRC_BLK) src_cnt=SRC_BLK;
    for(size_t t=0;t<trg_cnt_;t+=VecLen){ // one SIMD vector of targets at a time
      Vec_t tx=load_intrin<Vec_t>(&trg_coord[0][t]);
      Vec_t ty=load_intrin<Vec_t>(&trg_coord[1][t]);
      Vec_t tz=load_intrin<Vec_t>(&trg_coord[2][t]);
      Vec_t tvx=zero_intrin<Vec_t>(); // velocity accumulator, x component
      Vec_t tvy=zero_intrin<Vec_t>(); // velocity accumulator, y component
      Vec_t tvz=zero_intrin<Vec_t>(); // velocity accumulator, z component
      for(size_t s=sblk;s<sblk+src_cnt;s++){
        Vec_t dx=sub_intrin(tx,bcast_intrin<Vec_t>(&src_coord[0][s]));
        Vec_t dy=sub_intrin(ty,bcast_intrin<Vec_t>(&src_coord[1][s]));
        Vec_t dz=sub_intrin(tz,bcast_intrin<Vec_t>(&src_coord[2][s]));
        Vec_t svx=             bcast_intrin<Vec_t>(&src_value[0][s]) ; // force x
        Vec_t svy=             bcast_intrin<Vec_t>(&src_value[1][s]) ; // force y
        Vec_t svz=             bcast_intrin<Vec_t>(&src_value[2][s]) ; // force z
        Vec_t r2=        mul_intrin(dx,dx) ;
        r2=add_intrin(r2,mul_intrin(dy,dy));
        r2=add_intrin(r2,mul_intrin(dz,dz));
        Vec_t rinv=RINV_INTRIN(r2); // approximate 1/r (scaled)
        Vec_t rinv2=mul_intrin(mul_intrin(rinv,rinv),inv_nwrn_scal2);
        // (f . dR) / r^2, then the Oseen-tensor form rinv*(f + dR*(f.dR)/r^2).
        Vec_t inner_prod=                      mul_intrin(svx,dx) ;
        inner_prod=add_intrin(inner_prod,      mul_intrin(svy,dy));
        inner_prod=add_intrin(inner_prod,      mul_intrin(svz,dz));
        inner_prod=mul_intrin(inner_prod,rinv2);
        tvx=add_intrin(tvx,mul_intrin(rinv,add_intrin(svx,mul_intrin(dx,inner_prod))));
        tvy=add_intrin(tvy,mul_intrin(rinv,add_intrin(svy,mul_intrin(dy,inner_prod))));
        tvz=add_intrin(tvz,mul_intrin(rinv,add_intrin(svz,mul_intrin(dz,inner_prod))));
      }
      Vec_t ooep=set_intrin<Vec_t,Real_t>(OOEP);
      tvx=add_intrin(mul_intrin(tvx,ooep),load_intrin<Vec_t>(&trg_value[0][t]));
      tvy=add_intrin(mul_intrin(tvy,ooep),load_intrin<Vec_t>(&trg_value[1][t]));
      tvz=add_intrin(mul_intrin(tvz,ooep),load_intrin<Vec_t>(&trg_value[2][t]));
      store_intrin(&trg_value[0][t],tvx);
      store_intrin(&trg_value[1][t],tvy);
      store_intrin(&trg_value[2][t],tvz);
    }
  }
  { // Add FLOPS
    #ifndef __MIC__
    Profile::Add_FLOP((long long)trg_cnt_*(long long)src_cnt_*(29+4*(NWTN_ITER)));
    #endif
  }
  #undef SRC_BLK
}
// Dispatcher for the Stokes velocity (Stokeslet) kernel, 3 source and 3
// target components. Same Vec_t/ISA selection scheme as the Laplace
// dispatchers; newton_iter selects the rsqrt refinement variant.
template <class T, int newton_iter=0>
void stokes_vel(T* r_src, int src_cnt, T* v_src, int dof, T* r_trg, int trg_cnt, T* v_trg, mem::MemoryManager* mem_mgr){
  #define STK_KER_NWTN(nwtn) if(newton_iter==nwtn) \
  generic_kernel<Real_t, 3, 3, stokes_vel_uKernel<Real_t,Vec_t, rinv_intrin##nwtn<Vec_t,Real_t> > > \
  ((Real_t*)r_src, src_cnt, (Real_t*)v_src, dof, (Real_t*)r_trg, trg_cnt, (Real_t*)v_trg, mem_mgr)
  #define STOKES_KERNEL STK_KER_NWTN(0); STK_KER_NWTN(1); STK_KER_NWTN(2); STK_KER_NWTN(3);

  if(mem::TypeTraits<T>::ID()==mem::TypeTraits<float>::ID()){ // single precision
    typedef float Real_t;
    #if defined __MIC__
    #define Vec_t Real_t
    #elif defined __AVX__
    #define Vec_t __m256
    #elif defined __SSE3__
    #define Vec_t __m128
    #else
    #define Vec_t Real_t
    #endif
    STOKES_KERNEL;
    #undef Vec_t
  }else if(mem::TypeTraits<T>::ID()==mem::TypeTraits<double>::ID()){ // double precision
    typedef double Real_t;
    #if defined __MIC__
    #define Vec_t Real_t
    #elif defined __AVX__
    #define Vec_t __m256d
    #elif defined __SSE3__
    #define Vec_t __m128d
    #else
    #define Vec_t Real_t
    #endif
    STOKES_KERNEL;
    #undef Vec_t
  }else{ // generic scalar fallback for other value types
    typedef T Real_t;
    #define Vec_t Real_t
    STOKES_KERNEL;
    #undef Vec_t
  }
  #undef STK_KER_NWTN
  #undef STOKES_KERNEL
}
  1206. template <class T>
  1207. void stokes_sym_dip(T* r_src, int src_cnt, T* v_src, int dof, T* r_trg, int trg_cnt, T* k_out, mem::MemoryManager* mem_mgr){
  1208. #ifndef __MIC__
  1209. Profile::Add_FLOP((long long)trg_cnt*(long long)src_cnt*(47*dof));
  1210. #endif
  1211. const T mu=1.0;
  1212. const T OOEPMU = -1.0/(8.0*const_pi<T>()*mu);
  1213. for(int t=0;t<trg_cnt;t++){
  1214. for(int i=0;i<dof;i++){
  1215. T p[3]={0,0,0};
  1216. for(int s=0;s<src_cnt;s++){
  1217. T dR[3]={r_trg[3*t ]-r_src[3*s ],
  1218. r_trg[3*t+1]-r_src[3*s+1],
  1219. r_trg[3*t+2]-r_src[3*s+2]};
  1220. T R = (dR[0]*dR[0]+dR[1]*dR[1]+dR[2]*dR[2]);
  1221. if (R!=0){
  1222. T invR2=1.0/R;
  1223. T invR=sqrt(invR2);
  1224. T invR3=invR2*invR;
  1225. T* f=&v_src[(s*dof+i)*6+0];
  1226. T* n=&v_src[(s*dof+i)*6+3];
  1227. T r_dot_n=(n[0]*dR[0]+n[1]*dR[1]+n[2]*dR[2]);
  1228. T r_dot_f=(f[0]*dR[0]+f[1]*dR[1]+f[2]*dR[2]);
  1229. T n_dot_f=(f[0]* n[0]+f[1]* n[1]+f[2]* n[2]);
  1230. p[0] += dR[0]*(n_dot_f - 3*r_dot_n*r_dot_f*invR2)*invR3;
  1231. p[1] += dR[1]*(n_dot_f - 3*r_dot_n*r_dot_f*invR2)*invR3;
  1232. p[2] += dR[2]*(n_dot_f - 3*r_dot_n*r_dot_f*invR2)*invR3;
  1233. }
  1234. }
  1235. k_out[(t*dof+i)*3+0] += p[0]*OOEPMU;
  1236. k_out[(t*dof+i)*3+1] += p[1]*OOEPMU;
  1237. k_out[(t*dof+i)*3+2] += p[2]*OOEPMU;
  1238. }
  1239. }
  1240. }
  1241. template <class T>
  1242. void stokes_press(T* r_src, int src_cnt, T* v_src_, int dof, T* r_trg, int trg_cnt, T* k_out, mem::MemoryManager* mem_mgr){
  1243. #ifndef __MIC__
  1244. Profile::Add_FLOP((long long)trg_cnt*(long long)src_cnt*(17*dof));
  1245. #endif
  1246. const T OOFP = 1.0/(4.0*const_pi<T>());
  1247. for(int t=0;t<trg_cnt;t++){
  1248. for(int i=0;i<dof;i++){
  1249. T p=0;
  1250. for(int s=0;s<src_cnt;s++){
  1251. T dR[3]={r_trg[3*t ]-r_src[3*s ],
  1252. r_trg[3*t+1]-r_src[3*s+1],
  1253. r_trg[3*t+2]-r_src[3*s+2]};
  1254. T R = (dR[0]*dR[0]+dR[1]*dR[1]+dR[2]*dR[2]);
  1255. if (R!=0){
  1256. T invR2=1.0/R;
  1257. T invR=sqrt(invR2);
  1258. T invR3=invR2*invR;
  1259. T v_src[3]={v_src_[(s*dof+i)*3 ],
  1260. v_src_[(s*dof+i)*3+1],
  1261. v_src_[(s*dof+i)*3+2]};
  1262. T inner_prod=(v_src[0]*dR[0] +
  1263. v_src[1]*dR[1] +
  1264. v_src[2]*dR[2])* invR3;
  1265. p += inner_prod;
  1266. }
  1267. }
  1268. k_out[t*dof+i] += p*OOFP;
  1269. }
  1270. }
  1271. }
  1272. template <class T>
  1273. void stokes_stress(T* r_src, int src_cnt, T* v_src_, int dof, T* r_trg, int trg_cnt, T* k_out, mem::MemoryManager* mem_mgr){
  1274. #ifndef __MIC__
  1275. Profile::Add_FLOP((long long)trg_cnt*(long long)src_cnt*(45*dof));
  1276. #endif
  1277. const T TOFP = -3.0/(4.0*const_pi<T>());
  1278. for(int t=0;t<trg_cnt;t++){
  1279. for(int i=0;i<dof;i++){
  1280. T p[9]={0,0,0,
  1281. 0,0,0,
  1282. 0,0,0};
  1283. for(int s=0;s<src_cnt;s++){
  1284. T dR[3]={r_trg[3*t ]-r_src[3*s ],
  1285. r_trg[3*t+1]-r_src[3*s+1],
  1286. r_trg[3*t+2]-r_src[3*s+2]};
  1287. T R = (dR[0]*dR[0]+dR[1]*dR[1]+dR[2]*dR[2]);
  1288. if (R!=0){
  1289. T invR2=1.0/R;
  1290. T invR=sqrt(invR2);
  1291. T invR3=invR2*invR;
  1292. T invR5=invR3*invR2;
  1293. T v_src[3]={v_src_[(s*dof+i)*3 ],
  1294. v_src_[(s*dof+i)*3+1],
  1295. v_src_[(s*dof+i)*3+2]};
  1296. T inner_prod=(v_src[0]*dR[0] +
  1297. v_src[1]*dR[1] +
  1298. v_src[2]*dR[2])* invR5;
  1299. p[0] += inner_prod*dR[0]*dR[0]; p[1] += inner_prod*dR[1]*dR[0]; p[2] += inner_prod*dR[2]*dR[0];
  1300. p[3] += inner_prod*dR[0]*dR[1]; p[4] += inner_prod*dR[1]*dR[1]; p[5] += inner_prod*dR[2]*dR[1];
  1301. p[6] += inner_prod*dR[0]*dR[2]; p[7] += inner_prod*dR[1]*dR[2]; p[8] += inner_prod*dR[2]*dR[2];
  1302. }
  1303. }
  1304. k_out[(t*dof+i)*9+0] += p[0]*TOFP;
  1305. k_out[(t*dof+i)*9+1] += p[1]*TOFP;
  1306. k_out[(t*dof+i)*9+2] += p[2]*TOFP;
  1307. k_out[(t*dof+i)*9+3] += p[3]*TOFP;
  1308. k_out[(t*dof+i)*9+4] += p[4]*TOFP;
  1309. k_out[(t*dof+i)*9+5] += p[5]*TOFP;
  1310. k_out[(t*dof+i)*9+6] += p[6]*TOFP;
  1311. k_out[(t*dof+i)*9+7] += p[7]*TOFP;
  1312. k_out[(t*dof+i)*9+8] += p[8]*TOFP;
  1313. }
  1314. }
  1315. }
// Stokes velocity-gradient kernel (scalar reference implementation): 9
// output values (row-major 3x3 gradient tensor) per target and dof,
// accumulated into k_out. Self-interactions are skipped.
template <class T>
void stokes_grad(T* r_src, int src_cnt, T* v_src_, int dof, T* r_trg, int trg_cnt, T* k_out, mem::MemoryManager* mem_mgr){
#ifndef __MIC__
  Profile::Add_FLOP((long long)trg_cnt*(long long)src_cnt*(89*dof));
#endif
  const T mu=1.0; // viscosity
  const T OOEPMU = 1.0/(8.0*const_pi<T>()*mu);
  for(int t=0;t<trg_cnt;t++){
    for(int i=0;i<dof;i++){
      T p[9]={0,0,0,
              0,0,0,
              0,0,0}; // row-major 3x3 accumulator
      for(int s=0;s<src_cnt;s++){
        T dR[3]={r_trg[3*t  ]-r_src[3*s  ],
                 r_trg[3*t+1]-r_src[3*s+1],
                 r_trg[3*t+2]-r_src[3*s+2]};
        T R = (dR[0]*dR[0]+dR[1]*dR[1]+dR[2]*dR[2]);
        if (R!=0){ // skip the singular self-interaction
          T invR2=1.0/R;
          T invR=sqrt(invR2);
          T invR3=invR2*invR;
          T v_src[3]={v_src_[(s*dof+i)*3  ],
                      v_src_[(s*dof+i)*3+1],
                      v_src_[(s*dof+i)*3+2]};
          T inner_prod=(v_src[0]*dR[0] +
                        v_src[1]*dR[1] +
                        v_src[2]*dR[2]); // (f . dR)
          // Diagonal entries have no antisymmetric part; off-diagonal
          // entries carry the (dR[a]*f[b] - f[a]*dR[b]) term. Trailing
          // comments give the FLOP count per accumulation.
          p[0] += (                               inner_prod*(1-3*dR[0]*dR[0]*invR2))*invR3; //6
          p[1] += (dR[1]*v_src[0]-v_src[1]*dR[0]+inner_prod*( -3*dR[1]*dR[0]*invR2))*invR3; //9
          p[2] += (dR[2]*v_src[0]-v_src[2]*dR[0]+inner_prod*( -3*dR[2]*dR[0]*invR2))*invR3;
          p[3] += (dR[0]*v_src[1]-v_src[0]*dR[1]+inner_prod*( -3*dR[0]*dR[1]*invR2))*invR3;
          p[4] += (                               inner_prod*(1-3*dR[1]*dR[1]*invR2))*invR3;
          p[5] += (dR[2]*v_src[1]-v_src[2]*dR[1]+inner_prod*( -3*dR[2]*dR[1]*invR2))*invR3;
          p[6] += (dR[0]*v_src[2]-v_src[0]*dR[2]+inner_prod*( -3*dR[0]*dR[2]*invR2))*invR3;
          p[7] += (dR[1]*v_src[2]-v_src[1]*dR[2]+inner_prod*( -3*dR[1]*dR[2]*invR2))*invR3;
          p[8] += (                               inner_prod*(1-3*dR[2]*dR[2]*invR2))*invR3;
        }
      }
      k_out[(t*dof+i)*9+0] += p[0]*OOEPMU;
      k_out[(t*dof+i)*9+1] += p[1]*OOEPMU;
      k_out[(t*dof+i)*9+2] += p[2]*OOEPMU;
      k_out[(t*dof+i)*9+3] += p[3]*OOEPMU;
      k_out[(t*dof+i)*9+4] += p[4]*OOEPMU;
      k_out[(t*dof+i)*9+5] += p[5]*OOEPMU;
      k_out[(t*dof+i)*9+6] += p[6]*OOEPMU;
      k_out[(t*dof+i)*9+7] += p[7]*OOEPMU;
      k_out[(t*dof+i)*9+8] += p[8]*OOEPMU;
    }
  }
}
  1366. #ifndef __MIC__
  1367. #ifdef USE_SSE
  1368. namespace
  1369. {
  1370. #define IDEAL_ALIGNMENT 16
  1371. #define SIMD_LEN (int)(IDEAL_ALIGNMENT / sizeof(double))
  1372. #define DECL_SIMD_ALIGNED __declspec(align(IDEAL_ALIGNMENT))
// SSE2 double-precision Stokes pressure kernel. Source coordinate arrays
// must be IDEAL_ALIGNMENT-aligned (checked below); results are accumulated
// into trgVal. Uses a single-precision rsqrt estimate refined by one
// Newton-Raphson step in double precision.
void stokesPressureSSE(
    const int ns,
    const int nt,
    const double *sx,
    const double *sy,
    const double *sz,
    const double *tx,
    const double *ty,
    const double *tz,
    const double *srcDen,
    double *trgVal)
{
  if ( size_t(sx)%IDEAL_ALIGNMENT || size_t(sy)%IDEAL_ALIGNMENT || size_t(sz)%IDEAL_ALIGNMENT )
    abort();
  double OOFP = 1.0/(4.0*const_pi<double>());
  __m128d temp_press;
  // Aligned scratch for the horizontal reduction of the SIMD accumulator.
  double aux_arr[SIMD_LEN+1];
  double *tempval_press;
  if (size_t(aux_arr)%IDEAL_ALIGNMENT) // if aux_arr is misaligned
  {
    tempval_press = aux_arr + 1;
    if (size_t(tempval_press)%IDEAL_ALIGNMENT)
      abort();
  }
  else
    tempval_press = aux_arr;
  /*! One over four pi */
  __m128d oofp = _mm_set1_pd (OOFP);
  __m128d half = _mm_set1_pd (0.5);
  __m128d opf = _mm_set1_pd (1.5);
  __m128d zero = _mm_setzero_pd ();
  // loop over targets
  int i = 0;
  for (; i < nt; i++) {
    temp_press = _mm_setzero_pd();
    __m128d txi = _mm_load1_pd (&tx[i]);
    __m128d tyi = _mm_load1_pd (&ty[i]);
    __m128d tzi = _mm_load1_pd (&tz[i]);
    int j = 0;
    // Load and calculate in groups of SIMD_LEN
    for (; j + SIMD_LEN <= ns; j+=SIMD_LEN) {
      __m128d sxj = _mm_load_pd (&sx[j]);
      __m128d syj = _mm_load_pd (&sy[j]);
      __m128d szj = _mm_load_pd (&sz[j]);
      // Gather the interleaved (x,y,z) source densities for two sources.
      __m128d sdenx = _mm_set_pd (srcDen[(j+1)*3], srcDen[j*3]);
      __m128d sdeny = _mm_set_pd (srcDen[(j+1)*3+1], srcDen[j*3+1]);
      __m128d sdenz = _mm_set_pd (srcDen[(j+1)*3+2], srcDen[j*3+2]);
      __m128d dX, dY, dZ;
      __m128d dR2;
      __m128d S;
      dX = _mm_sub_pd(txi , sxj);
      dY = _mm_sub_pd(tyi , syj);
      dZ = _mm_sub_pd(tzi , szj);
      sxj = _mm_mul_pd(dX, dX);
      syj = _mm_mul_pd(dY, dY);
      szj = _mm_mul_pd(dZ, dZ);
      dR2 = _mm_add_pd(sxj, syj);
      dR2 = _mm_add_pd(szj, dR2);
      __m128d temp = _mm_cmpeq_pd (dR2, zero); // mask of lanes where src == trg
      __m128d xhalf = _mm_mul_pd (half, dR2);
      // 1/sqrt(dR2): single-precision rsqrt estimate ...
      __m128 dR2_s = _mm_cvtpd_ps(dR2);
      __m128 S_s = _mm_rsqrt_ps(dR2_s);
      __m128d S_d = _mm_cvtps_pd(S_s);
      // To handle the condition when src and trg coincide
      S_d = _mm_andnot_pd (temp, S_d);
      // ... refined by one Newton step: S = S_d*(1.5 - 0.5*dR2*S_d*S_d).
      S = _mm_mul_pd (S_d, S_d);
      S = _mm_mul_pd (S, xhalf);
      S = _mm_sub_pd (opf, S);
      S = _mm_mul_pd (S, S_d);
      __m128d dotx = _mm_mul_pd (dX, sdenx);
      __m128d doty = _mm_mul_pd (dY, sdeny);
      __m128d dotz = _mm_mul_pd (dZ, sdenz);
      __m128d dot_sum = _mm_add_pd (dotx, doty);
      dot_sum = _mm_add_pd (dot_sum, dotz);
      dot_sum = _mm_mul_pd (dot_sum, S); // (f . dR) / r^3 via three factors of 1/r
      dot_sum = _mm_mul_pd (dot_sum, S);
      dot_sum = _mm_mul_pd (dot_sum, S);
      temp_press = _mm_add_pd (dot_sum, temp_press);
    }
    temp_press = _mm_mul_pd (temp_press, oofp);
    _mm_store_pd(tempval_press, temp_press);
    for (int k = 0; k < SIMD_LEN; k++) {
      trgVal[i] += tempval_press[k]; // horizontal reduction of SIMD lanes
    }
    // Scalar tail loop for the remaining (ns % SIMD_LEN) sources.
    for (; j < ns; j++) {
      double x = tx[i] - sx[j];
      double y = ty[i] - sy[j];
      double z = tz[i] - sz[j];
      double r2 = x*x + y*y + z*z;
      double r = sqrt(r2);
      double invdr;
      if (r == 0)
        invdr = 0; // self-interaction contributes nothing
      else
        invdr = 1/r;
      double dot = (x*srcDen[j*3] + y*srcDen[j*3+1] + z*srcDen[j*3+2]) * invdr * invdr * invdr;
      trgVal[i] += dot*OOFP;
    }
  }
  return;
}
void stokesStressSSE(
    const int ns,
    const int nt,
    const double *sx,
    const double *sy,
    const double *sz,
    const double *tx,
    const double *ty,
    const double *tz,
    const double *srcDen,
    double *trgVal)
{
  // Direct (P2P) evaluation of the Stokes stress kernel using SSE2
  // double-precision intrinsics.
  //
  // For each target i and source j, with r = trg_i - src_j and density
  // f = srcDen[3j..3j+2], the nine components of the 3x3 output are
  // accumulated as (see the scalar remainder loop for the reference form):
  //     trgVal[9i + 3a+b] += TOFP * (r.f) * r_a * r_b / |r|^5
  // The self-interaction (r == 0) contributes zero.
  //
  // Preconditions: sx/sy/sz must be IDEAL_ALIGNMENT-aligned (callers go
  // through stokesStressSSEShuffle, which repacks coordinates to satisfy
  // this); srcDen is interleaved (fx,fy,fz) per source; trgVal is
  // accumulated into, not overwritten.
  if ( size_t(sx)%IDEAL_ALIGNMENT || size_t(sy)%IDEAL_ALIGNMENT || size_t(sz)%IDEAL_ALIGNMENT )
    abort();
  double TOFP = -3.0/(4.0*const_pi<double>()); // kernel prefactor -3/(4*pi)
  // Nine vector accumulators, one per tensor component.
  __m128d tempxx; __m128d tempxy; __m128d tempxz;
  __m128d tempyx; __m128d tempyy; __m128d tempyz;
  __m128d tempzx; __m128d tempzy; __m128d tempzz;
  // Scratch buffer used to spill the vector accumulators; one spare element
  // lets the base pointer be bumped by one double to reach alignment.
  double aux_arr[9*SIMD_LEN+1];
  double *tempvalxx, *tempvalxy, *tempvalxz;
  double *tempvalyx, *tempvalyy, *tempvalyz;
  double *tempvalzx, *tempvalzy, *tempvalzz;
  if (size_t(aux_arr)%IDEAL_ALIGNMENT) // if aux_arr is misaligned
  {
    tempvalxx = aux_arr + 1;
    if (size_t(tempvalxx)%IDEAL_ALIGNMENT)
      abort();
  }
  else
    tempvalxx = aux_arr;
  // Slice the aligned scratch buffer into nine SIMD_LEN-sized segments.
  tempvalxy=tempvalxx+SIMD_LEN;
  tempvalxz=tempvalxy+SIMD_LEN;
  tempvalyx=tempvalxz+SIMD_LEN;
  tempvalyy=tempvalyx+SIMD_LEN;
  tempvalyz=tempvalyy+SIMD_LEN;
  tempvalzx=tempvalyz+SIMD_LEN;
  tempvalzy=tempvalzx+SIMD_LEN;
  tempvalzz=tempvalzy+SIMD_LEN;
  /*! Broadcast constants: the -3/(4*pi) prefactor and the 0.5/1.5
      coefficients of the Newton step that refines the rsqrt estimate. */
  __m128d tofp = _mm_set1_pd (TOFP);
  __m128d half = _mm_set1_pd (0.5);
  __m128d opf = _mm_set1_pd (1.5);
  __m128d zero = _mm_setzero_pd ();
  // loop over targets; each target is broadcast against SIMD_LEN sources
  int i = 0;
  for (; i < nt; i++) {
    tempxx = _mm_setzero_pd(); tempxy = _mm_setzero_pd(); tempxz = _mm_setzero_pd();
    tempyx = _mm_setzero_pd(); tempyy = _mm_setzero_pd(); tempyz = _mm_setzero_pd();
    tempzx = _mm_setzero_pd(); tempzy = _mm_setzero_pd(); tempzz = _mm_setzero_pd();
    __m128d txi = _mm_load1_pd (&tx[i]);
    __m128d tyi = _mm_load1_pd (&ty[i]);
    __m128d tzi = _mm_load1_pd (&tz[i]);
    int j = 0;
    // Load and calculate in groups of SIMD_LEN
    for (; j + SIMD_LEN <= ns; j+=SIMD_LEN) {
      __m128d sxj = _mm_load_pd (&sx[j]);
      __m128d syj = _mm_load_pd (&sy[j]);
      __m128d szj = _mm_load_pd (&sz[j]);
      // Gather the interleaved densities of sources j and j+1 into lanes.
      __m128d sdenx = _mm_set_pd (srcDen[(j+1)*3], srcDen[j*3]);
      __m128d sdeny = _mm_set_pd (srcDen[(j+1)*3+1], srcDen[j*3+1]);
      __m128d sdenz = _mm_set_pd (srcDen[(j+1)*3+2], srcDen[j*3+2]);
      __m128d dX, dY, dZ;
      __m128d dR2;
      __m128d S;
      __m128d S2;
      dX = _mm_sub_pd(txi , sxj);
      dY = _mm_sub_pd(tyi , syj);
      dZ = _mm_sub_pd(tzi , szj);
      // Coordinate registers are dead now and reused for the squares.
      sxj = _mm_mul_pd(dX, dX);
      syj = _mm_mul_pd(dY, dY);
      szj = _mm_mul_pd(dZ, dZ);
      dR2 = _mm_add_pd(sxj, syj);
      dR2 = _mm_add_pd(szj, dR2);              // dR2 = |r|^2
      __m128d temp = _mm_cmpeq_pd (dR2, zero); // lane mask: src == trg
      __m128d xhalf = _mm_mul_pd (half, dR2);
      // 1/sqrt via single-precision rsqrt estimate widened back to double.
      __m128 dR2_s = _mm_cvtpd_ps(dR2);
      __m128 S_s = _mm_rsqrt_ps(dR2_s);
      __m128d S_d = _mm_cvtps_pd(S_s);
      // To handle the condition when src and trg coincide
      S_d = _mm_andnot_pd (temp, S_d);
      // One Newton-Raphson refinement: S = S_d * (1.5 - 0.5*dR2*S_d^2).
      S = _mm_mul_pd (S_d, S_d);
      S = _mm_mul_pd (S, xhalf);
      S = _mm_sub_pd (opf, S);
      S = _mm_mul_pd (S, S_d);
      S2 = _mm_mul_pd (S, S);
      // dot_sum = (r . f) * S * S2 * S2 = (r . f) / |r|^5
      __m128d dotx = _mm_mul_pd (dX, sdenx);
      __m128d doty = _mm_mul_pd (dY, sdeny);
      __m128d dotz = _mm_mul_pd (dZ, sdenz);
      __m128d dot_sum = _mm_add_pd (dotx, doty);
      dot_sum = _mm_add_pd (dot_sum, dotz);
      dot_sum = _mm_mul_pd (dot_sum, S);
      dot_sum = _mm_mul_pd (dot_sum, S2);
      dot_sum = _mm_mul_pd (dot_sum, S2);
      // Outer-product accumulation: temp_ab += dot_sum * r_a * r_b.
      dotx = _mm_mul_pd (dot_sum, dX);
      doty = _mm_mul_pd (dot_sum, dY);
      dotz = _mm_mul_pd (dot_sum, dZ);
      tempxx = _mm_add_pd (_mm_mul_pd(dotx,dX), tempxx);
      tempxy = _mm_add_pd (_mm_mul_pd(dotx,dY), tempxy);
      tempxz = _mm_add_pd (_mm_mul_pd(dotx,dZ), tempxz);
      tempyx = _mm_add_pd (_mm_mul_pd(doty,dX), tempyx);
      tempyy = _mm_add_pd (_mm_mul_pd(doty,dY), tempyy);
      tempyz = _mm_add_pd (_mm_mul_pd(doty,dZ), tempyz);
      tempzx = _mm_add_pd (_mm_mul_pd(dotz,dX), tempzx);
      tempzy = _mm_add_pd (_mm_mul_pd(dotz,dY), tempzy);
      tempzz = _mm_add_pd (_mm_mul_pd(dotz,dZ), tempzz);
    }
    // Apply the prefactor, spill the vector accumulators, then reduce the
    // SIMD lanes into the nine scalar outputs of target i.
    tempxx = _mm_mul_pd (tempxx, tofp);
    tempxy = _mm_mul_pd (tempxy, tofp);
    tempxz = _mm_mul_pd (tempxz, tofp);
    tempyx = _mm_mul_pd (tempyx, tofp);
    tempyy = _mm_mul_pd (tempyy, tofp);
    tempyz = _mm_mul_pd (tempyz, tofp);
    tempzx = _mm_mul_pd (tempzx, tofp);
    tempzy = _mm_mul_pd (tempzy, tofp);
    tempzz = _mm_mul_pd (tempzz, tofp);
    _mm_store_pd(tempvalxx, tempxx); _mm_store_pd(tempvalxy, tempxy); _mm_store_pd(tempvalxz, tempxz);
    _mm_store_pd(tempvalyx, tempyx); _mm_store_pd(tempvalyy, tempyy); _mm_store_pd(tempvalyz, tempyz);
    _mm_store_pd(tempvalzx, tempzx); _mm_store_pd(tempvalzy, tempzy); _mm_store_pd(tempvalzz, tempzz);
    for (int k = 0; k < SIMD_LEN; k++) {
      trgVal[i*9  ] += tempvalxx[k];
      trgVal[i*9+1] += tempvalxy[k];
      trgVal[i*9+2] += tempvalxz[k];
      trgVal[i*9+3] += tempvalyx[k];
      trgVal[i*9+4] += tempvalyy[k];
      trgVal[i*9+5] += tempvalyz[k];
      trgVal[i*9+6] += tempvalzx[k];
      trgVal[i*9+7] += tempvalzy[k];
      trgVal[i*9+8] += tempvalzz[k];
    }
    // Scalar remainder loop (also the readable reference implementation).
    for (; j < ns; j++) {
      double x = tx[i] - sx[j];
      double y = ty[i] - sy[j];
      double z = tz[i] - sz[j];
      double r2 = x*x + y*y + z*z;
      double r = sqrt(r2);
      double invdr;
      if (r == 0)
        invdr = 0;
      else
        invdr = 1/r;
      double invdr2=invdr*invdr;
      double dot = (x*srcDen[j*3] + y*srcDen[j*3+1] + z*srcDen[j*3+2]) * invdr2 * invdr2 * invdr;
      double denx = dot*x;
      double deny = dot*y;
      double denz = dot*z;
      trgVal[i*9  ] += denx*x*TOFP;
      trgVal[i*9+1] += denx*y*TOFP;
      trgVal[i*9+2] += denx*z*TOFP;
      trgVal[i*9+3] += deny*x*TOFP;
      trgVal[i*9+4] += deny*y*TOFP;
      trgVal[i*9+5] += deny*z*TOFP;
      trgVal[i*9+6] += denz*x*TOFP;
      trgVal[i*9+7] += denz*y*TOFP;
      trgVal[i*9+8] += denz*z*TOFP;
    }
  }
  return;
}
void stokesGradSSE(
    const int ns,
    const int nt,
    const double *sx,
    const double *sy,
    const double *sz,
    const double *tx,
    const double *ty,
    const double *tz,
    const double *srcDen,
    double *trgVal,
    const double cof )
{
  // Direct (P2P) evaluation of the gradient of the Stokeslet velocity
  // field with SSE2 double-precision intrinsics.  `cof` is the fluid
  // viscosity mu; the overall scale is 1/(8*pi*mu).
  //
  // For each target i and source j, with r = trg_i - src_j and density
  // f = srcDen[3j..3j+2], the nine outputs trgVal[9i..9i+8] accumulate the
  // expression spelled out in the scalar remainder loop below, e.g. the
  // (x,x) entry:
  //   OOEP/mu * ( x*f_x - f_x*x + (r.f)*(1 - 3 x^2/|r|^2) ) / |r|^3
  // (the first two products cancel on the diagonal; they are written out
  // so all nine components share the same code pattern).
  // The self-interaction (r == 0) contributes zero.
  //
  // Preconditions: sx/sy/sz must be IDEAL_ALIGNMENT-aligned (callers go
  // through stokesGradSSEShuffle); srcDen is interleaved (fx,fy,fz);
  // trgVal is accumulated into, not overwritten.
  if ( size_t(sx)%IDEAL_ALIGNMENT || size_t(sy)%IDEAL_ALIGNMENT || size_t(sz)%IDEAL_ALIGNMENT )
    abort();
  double mu = cof;
  double OOEP = 1.0/(8.0*const_pi<double>()); // 1/(8*pi)
  // Nine vector accumulators, one per gradient component.
  __m128d tempxx; __m128d tempxy; __m128d tempxz;
  __m128d tempyx; __m128d tempyy; __m128d tempyz;
  __m128d tempzx; __m128d tempzy; __m128d tempzz;
  double oomeu = 1/mu; // 1/mu, used by the scalar remainder loop
  // Scratch buffer for spilling the accumulators; one spare element allows
  // a one-double shift to reach alignment.
  double aux_arr[9*SIMD_LEN+1];
  double *tempvalxx, *tempvalxy, *tempvalxz;
  double *tempvalyx, *tempvalyy, *tempvalyz;
  double *tempvalzx, *tempvalzy, *tempvalzz;
  if (size_t(aux_arr)%IDEAL_ALIGNMENT) // if aux_arr is misaligned
  {
    tempvalxx = aux_arr + 1;
    if (size_t(tempvalxx)%IDEAL_ALIGNMENT)
      abort();
  }
  else
    tempvalxx = aux_arr;
  // Slice the aligned scratch buffer into nine SIMD_LEN-sized segments.
  tempvalxy=tempvalxx+SIMD_LEN;
  tempvalxz=tempvalxy+SIMD_LEN;
  tempvalyx=tempvalxz+SIMD_LEN;
  tempvalyy=tempvalyx+SIMD_LEN;
  tempvalyz=tempvalyy+SIMD_LEN;
  tempvalzx=tempvalyz+SIMD_LEN;
  tempvalzy=tempvalzx+SIMD_LEN;
  tempvalzz=tempvalzy+SIMD_LEN;
  /*! Broadcast constants: 1/(8*pi), the Newton-step coefficients 0.5/1.5,
      the literal 3, and the combined prefactor 1/(8*pi*mu). */
  __m128d ooep = _mm_set1_pd (OOEP);
  __m128d half = _mm_set1_pd (0.5);
  __m128d opf = _mm_set1_pd (1.5);
  __m128d three = _mm_set1_pd (3.0);
  __m128d zero = _mm_setzero_pd ();
  __m128d oomu = _mm_set1_pd (1/mu);
  __m128d ooepmu = _mm_mul_pd(ooep,oomu);
  // loop over targets; each target is broadcast against SIMD_LEN sources
  int i = 0;
  for (; i < nt; i++) {
    tempxx = _mm_setzero_pd(); tempxy = _mm_setzero_pd(); tempxz = _mm_setzero_pd();
    tempyx = _mm_setzero_pd(); tempyy = _mm_setzero_pd(); tempyz = _mm_setzero_pd();
    tempzx = _mm_setzero_pd(); tempzy = _mm_setzero_pd(); tempzz = _mm_setzero_pd();
    __m128d txi = _mm_load1_pd (&tx[i]);
    __m128d tyi = _mm_load1_pd (&ty[i]);
    __m128d tzi = _mm_load1_pd (&tz[i]);
    int j = 0;
    // Load and calculate in groups of SIMD_LEN
    for (; j + SIMD_LEN <= ns; j+=SIMD_LEN) {
      __m128d sxj = _mm_load_pd (&sx[j]);
      __m128d syj = _mm_load_pd (&sy[j]);
      __m128d szj = _mm_load_pd (&sz[j]);
      // Gather the interleaved densities of sources j and j+1 into lanes.
      __m128d sdenx = _mm_set_pd (srcDen[(j+1)*3], srcDen[j*3]);
      __m128d sdeny = _mm_set_pd (srcDen[(j+1)*3+1], srcDen[j*3+1]);
      __m128d sdenz = _mm_set_pd (srcDen[(j+1)*3+2], srcDen[j*3+2]);
      __m128d dX, dY, dZ;
      __m128d dR2;
      __m128d S;
      __m128d S2;
      __m128d S3;
      dX = _mm_sub_pd(txi , sxj);
      dY = _mm_sub_pd(tyi , syj);
      dZ = _mm_sub_pd(tzi , szj);
      // Coordinate registers are dead now and reused for the squares.
      sxj = _mm_mul_pd(dX, dX);
      syj = _mm_mul_pd(dY, dY);
      szj = _mm_mul_pd(dZ, dZ);
      dR2 = _mm_add_pd(sxj, syj);
      dR2 = _mm_add_pd(szj, dR2);              // dR2 = |r|^2
      __m128d temp = _mm_cmpeq_pd (dR2, zero); // lane mask: src == trg
      __m128d xhalf = _mm_mul_pd (half, dR2);
      // 1/sqrt via single-precision rsqrt estimate widened back to double.
      __m128 dR2_s = _mm_cvtpd_ps(dR2);
      __m128 S_s = _mm_rsqrt_ps(dR2_s);
      __m128d S_d = _mm_cvtps_pd(S_s);
      // To handle the condition when src and trg coincide
      S_d = _mm_andnot_pd (temp, S_d);
      // One Newton-Raphson refinement: S = S_d * (1.5 - 0.5*dR2*S_d^2),
      // then S2 = 1/|r|^2 and S3 = 1/|r|^3.
      S = _mm_mul_pd (S_d, S_d);
      S = _mm_mul_pd (S, xhalf);
      S = _mm_sub_pd (opf, S);
      S = _mm_mul_pd (S, S_d);
      S2 = _mm_mul_pd (S, S);
      S3 = _mm_mul_pd (S2, S);
      // dot_sum = (r . f) / |r|^2
      __m128d dotx = _mm_mul_pd (dX, sdenx);
      __m128d doty = _mm_mul_pd (dY, sdeny);
      __m128d dotz = _mm_mul_pd (dZ, sdenz);
      __m128d dot_sum = _mm_add_pd (dotx, doty);
      dot_sum = _mm_add_pd (dot_sum, dotz);
      dot_sum = _mm_mul_pd (dot_sum, S2);
      // temp_ab += S3 * ( r_a f_b - f_a r_b + dot_sum*(dR2*delta_ab - 3 r_a r_b) )
      // where dR2 appears only on the diagonal (delta_ab) entries.
      tempxx = _mm_add_pd(_mm_mul_pd(S3,_mm_add_pd(_mm_sub_pd(_mm_mul_pd(dX, sdenx), _mm_mul_pd(sdenx, dX)), _mm_mul_pd(dot_sum, _mm_sub_pd(dR2 , _mm_mul_pd(three, _mm_mul_pd(dX, dX)))))),tempxx);
      tempxy = _mm_add_pd(_mm_mul_pd(S3,_mm_add_pd(_mm_sub_pd(_mm_mul_pd(dY, sdenx), _mm_mul_pd(sdeny, dX)), _mm_mul_pd(dot_sum, _mm_sub_pd(zero, _mm_mul_pd(three, _mm_mul_pd(dY, dX)))))),tempxy);
      tempxz = _mm_add_pd(_mm_mul_pd(S3,_mm_add_pd(_mm_sub_pd(_mm_mul_pd(dZ, sdenx), _mm_mul_pd(sdenz, dX)), _mm_mul_pd(dot_sum, _mm_sub_pd(zero, _mm_mul_pd(three, _mm_mul_pd(dZ, dX)))))),tempxz);
      tempyx = _mm_add_pd(_mm_mul_pd(S3,_mm_add_pd(_mm_sub_pd(_mm_mul_pd(dX, sdeny), _mm_mul_pd(sdenx, dY)), _mm_mul_pd(dot_sum, _mm_sub_pd(zero, _mm_mul_pd(three, _mm_mul_pd(dX, dY)))))),tempyx);
      tempyy = _mm_add_pd(_mm_mul_pd(S3,_mm_add_pd(_mm_sub_pd(_mm_mul_pd(dY, sdeny), _mm_mul_pd(sdeny, dY)), _mm_mul_pd(dot_sum, _mm_sub_pd(dR2 , _mm_mul_pd(three, _mm_mul_pd(dY, dY)))))),tempyy);
      tempyz = _mm_add_pd(_mm_mul_pd(S3,_mm_add_pd(_mm_sub_pd(_mm_mul_pd(dZ, sdeny), _mm_mul_pd(sdenz, dY)), _mm_mul_pd(dot_sum, _mm_sub_pd(zero, _mm_mul_pd(three, _mm_mul_pd(dZ, dY)))))),tempyz);
      tempzx = _mm_add_pd(_mm_mul_pd(S3,_mm_add_pd(_mm_sub_pd(_mm_mul_pd(dX, sdenz), _mm_mul_pd(sdenx, dZ)), _mm_mul_pd(dot_sum, _mm_sub_pd(zero, _mm_mul_pd(three, _mm_mul_pd(dX, dZ)))))),tempzx);
      tempzy = _mm_add_pd(_mm_mul_pd(S3,_mm_add_pd(_mm_sub_pd(_mm_mul_pd(dY, sdenz), _mm_mul_pd(sdeny, dZ)), _mm_mul_pd(dot_sum, _mm_sub_pd(zero, _mm_mul_pd(three, _mm_mul_pd(dY, dZ)))))),tempzy);
      tempzz = _mm_add_pd(_mm_mul_pd(S3,_mm_add_pd(_mm_sub_pd(_mm_mul_pd(dZ, sdenz), _mm_mul_pd(sdenz, dZ)), _mm_mul_pd(dot_sum, _mm_sub_pd(dR2 , _mm_mul_pd(three, _mm_mul_pd(dZ, dZ)))))),tempzz);
    }
    // Apply the 1/(8*pi*mu) prefactor, spill the vector accumulators, then
    // reduce the SIMD lanes into the nine scalar outputs of target i.
    tempxx = _mm_mul_pd (tempxx, ooepmu);
    tempxy = _mm_mul_pd (tempxy, ooepmu);
    tempxz = _mm_mul_pd (tempxz, ooepmu);
    tempyx = _mm_mul_pd (tempyx, ooepmu);
    tempyy = _mm_mul_pd (tempyy, ooepmu);
    tempyz = _mm_mul_pd (tempyz, ooepmu);
    tempzx = _mm_mul_pd (tempzx, ooepmu);
    tempzy = _mm_mul_pd (tempzy, ooepmu);
    tempzz = _mm_mul_pd (tempzz, ooepmu);
    _mm_store_pd(tempvalxx, tempxx); _mm_store_pd(tempvalxy, tempxy); _mm_store_pd(tempvalxz, tempxz);
    _mm_store_pd(tempvalyx, tempyx); _mm_store_pd(tempvalyy, tempyy); _mm_store_pd(tempvalyz, tempyz);
    _mm_store_pd(tempvalzx, tempzx); _mm_store_pd(tempvalzy, tempzy); _mm_store_pd(tempvalzz, tempzz);
    for (int k = 0; k < SIMD_LEN; k++) {
      trgVal[i*9  ] += tempvalxx[k];
      trgVal[i*9+1] += tempvalxy[k];
      trgVal[i*9+2] += tempvalxz[k];
      trgVal[i*9+3] += tempvalyx[k];
      trgVal[i*9+4] += tempvalyy[k];
      trgVal[i*9+5] += tempvalyz[k];
      trgVal[i*9+6] += tempvalzx[k];
      trgVal[i*9+7] += tempvalzy[k];
      trgVal[i*9+8] += tempvalzz[k];
    }
    // Scalar remainder loop (also the readable reference implementation).
    for (; j < ns; j++) {
      double x = tx[i] - sx[j];
      double y = ty[i] - sy[j];
      double z = tz[i] - sz[j];
      double r2 = x*x + y*y + z*z;
      double r = sqrt(r2);
      double invdr;
      if (r == 0)
        invdr = 0;
      else
        invdr = 1/r;
      double invdr2=invdr*invdr;
      double invdr3=invdr2*invdr;
      double dot = (x*srcDen[j*3] + y*srcDen[j*3+1] + z*srcDen[j*3+2]);
      trgVal[i*9  ] += OOEP*oomeu*invdr3*( x*srcDen[j*3  ] - srcDen[j*3  ]*x + dot*(1-3*x*x*invdr2) );
      trgVal[i*9+1] += OOEP*oomeu*invdr3*( y*srcDen[j*3  ] - srcDen[j*3+1]*x + dot*(0-3*y*x*invdr2) );
      trgVal[i*9+2] += OOEP*oomeu*invdr3*( z*srcDen[j*3  ] - srcDen[j*3+2]*x + dot*(0-3*z*x*invdr2) );
      trgVal[i*9+3] += OOEP*oomeu*invdr3*( x*srcDen[j*3+1] - srcDen[j*3  ]*y + dot*(0-3*x*y*invdr2) );
      trgVal[i*9+4] += OOEP*oomeu*invdr3*( y*srcDen[j*3+1] - srcDen[j*3+1]*y + dot*(1-3*y*y*invdr2) );
      trgVal[i*9+5] += OOEP*oomeu*invdr3*( z*srcDen[j*3+1] - srcDen[j*3+2]*y + dot*(0-3*z*y*invdr2) );
      trgVal[i*9+6] += OOEP*oomeu*invdr3*( x*srcDen[j*3+2] - srcDen[j*3  ]*z + dot*(0-3*x*z*invdr2) );
      trgVal[i*9+7] += OOEP*oomeu*invdr3*( y*srcDen[j*3+2] - srcDen[j*3+1]*z + dot*(0-3*y*z*invdr2) );
      trgVal[i*9+8] += OOEP*oomeu*invdr3*( z*srcDen[j*3+2] - srcDen[j*3+2]*z + dot*(1-3*z*z*invdr2) );
    }
  }
  return;
}
  1791. #undef SIMD_LEN
  1792. #define X(s,k) (s)[(k)*COORD_DIM]
  1793. #define Y(s,k) (s)[(k)*COORD_DIM+1]
  1794. #define Z(s,k) (s)[(k)*COORD_DIM+2]
  1795. void stokesPressureSSEShuffle(const int ns, const int nt, double const src[], double const trg[], double const den[], double pot[], mem::MemoryManager* mem_mgr=NULL)
  1796. {
  1797. std::vector<double> xs(ns+1); std::vector<double> xt(nt);
  1798. std::vector<double> ys(ns+1); std::vector<double> yt(nt);
  1799. std::vector<double> zs(ns+1); std::vector<double> zt(nt);
  1800. int x_shift = size_t(&xs[0]) % IDEAL_ALIGNMENT ? 1:0;
  1801. int y_shift = size_t(&ys[0]) % IDEAL_ALIGNMENT ? 1:0;
  1802. int z_shift = size_t(&zs[0]) % IDEAL_ALIGNMENT ? 1:0;
  1803. //1. reshuffle memory
  1804. for (int k =0;k<ns;k++){
  1805. xs[k+x_shift]=X(src,k);
  1806. ys[k+y_shift]=Y(src,k);
  1807. zs[k+z_shift]=Z(src,k);
  1808. }
  1809. for (int k=0;k<nt;k++){
  1810. xt[k]=X(trg,k);
  1811. yt[k]=Y(trg,k);
  1812. zt[k]=Z(trg,k);
  1813. }
  1814. //2. perform caclulation
  1815. stokesPressureSSE(ns,nt,&xs[x_shift],&ys[y_shift],&zs[z_shift],&xt[0],&yt[0],&zt[0],den,pot);
  1816. return;
  1817. }
  1818. void stokesStressSSEShuffle(const int ns, const int nt, double const src[], double const trg[], double const den[], double pot[], mem::MemoryManager* mem_mgr=NULL)
  1819. {
  1820. std::vector<double> xs(ns+1); std::vector<double> xt(nt);
  1821. std::vector<double> ys(ns+1); std::vector<double> yt(nt);
  1822. std::vector<double> zs(ns+1); std::vector<double> zt(nt);
  1823. int x_shift = size_t(&xs[0]) % IDEAL_ALIGNMENT ? 1:0;
  1824. int y_shift = size_t(&ys[0]) % IDEAL_ALIGNMENT ? 1:0;
  1825. int z_shift = size_t(&zs[0]) % IDEAL_ALIGNMENT ? 1:0;
  1826. //1. reshuffle memory
  1827. for (int k =0;k<ns;k++){
  1828. xs[k+x_shift]=X(src,k);
  1829. ys[k+y_shift]=Y(src,k);
  1830. zs[k+z_shift]=Z(src,k);
  1831. }
  1832. for (int k=0;k<nt;k++){
  1833. xt[k]=X(trg,k);
  1834. yt[k]=Y(trg,k);
  1835. zt[k]=Z(trg,k);
  1836. }
  1837. //2. perform caclulation
  1838. stokesStressSSE(ns,nt,&xs[x_shift],&ys[y_shift],&zs[z_shift],&xt[0],&yt[0],&zt[0],den,pot);
  1839. return;
  1840. }
  1841. void stokesGradSSEShuffle(const int ns, const int nt, double const src[], double const trg[], double const den[], double pot[], const double kernel_coef, mem::MemoryManager* mem_mgr=NULL)
  1842. {
  1843. std::vector<double> xs(ns+1); std::vector<double> xt(nt);
  1844. std::vector<double> ys(ns+1); std::vector<double> yt(nt);
  1845. std::vector<double> zs(ns+1); std::vector<double> zt(nt);
  1846. int x_shift = size_t(&xs[0]) % IDEAL_ALIGNMENT ? 1:0;
  1847. int y_shift = size_t(&ys[0]) % IDEAL_ALIGNMENT ? 1:0;
  1848. int z_shift = size_t(&zs[0]) % IDEAL_ALIGNMENT ? 1:0;
  1849. //1. reshuffle memory
  1850. for (int k =0;k<ns;k++){
  1851. xs[k+x_shift]=X(src,k);
  1852. ys[k+y_shift]=Y(src,k);
  1853. zs[k+z_shift]=Z(src,k);
  1854. }
  1855. for (int k=0;k<nt;k++){
  1856. xt[k]=X(trg,k);
  1857. yt[k]=Y(trg,k);
  1858. zt[k]=Z(trg,k);
  1859. }
  1860. //2. perform caclulation
  1861. stokesGradSSE(ns,nt,&xs[x_shift],&ys[y_shift],&zs[z_shift],&xt[0],&yt[0],&zt[0],den,pot,kernel_coef);
  1862. return;
  1863. }
  1864. #undef X
  1865. #undef Y
  1866. #undef Z
  1867. #undef IDEAL_ALIGNMENT
  1868. #undef DECL_SIMD_ALIGNED
  1869. }
  1870. template <>
  1871. void stokes_press<double>(double* r_src, int src_cnt, double* v_src_, int dof, double* r_trg, int trg_cnt, double* k_out, mem::MemoryManager* mem_mgr){
  1872. Profile::Add_FLOP((long long)trg_cnt*(long long)src_cnt*(17*dof));
  1873. stokesPressureSSEShuffle(src_cnt, trg_cnt, r_src, r_trg, v_src_, k_out, mem_mgr);
  1874. return;
  1875. }
  1876. template <>
  1877. void stokes_stress<double>(double* r_src, int src_cnt, double* v_src_, int dof, double* r_trg, int trg_cnt, double* k_out, mem::MemoryManager* mem_mgr){
  1878. Profile::Add_FLOP((long long)trg_cnt*(long long)src_cnt*(45*dof));
  1879. stokesStressSSEShuffle(src_cnt, trg_cnt, r_src, r_trg, v_src_, k_out, mem_mgr);
  1880. }
  1881. template <>
  1882. void stokes_grad<double>(double* r_src, int src_cnt, double* v_src_, int dof, double* r_trg, int trg_cnt, double* k_out, mem::MemoryManager* mem_mgr){
  1883. Profile::Add_FLOP((long long)trg_cnt*(long long)src_cnt*(89*dof));
  1884. const double mu=1.0;
  1885. stokesGradSSEShuffle(src_cnt, trg_cnt, r_src, r_trg, v_src_, k_out, mu, mem_mgr);
  1886. }
  1887. #endif
  1888. #endif
// Stokes single-layer velocity kernel (3D, 3x3 tensor); the Kernel object
// is built once and cached in a function-local static.
template<class T> const Kernel<T>& StokesKernel<T>::velocity(){
  static Kernel<T> ker=BuildKernel<T, stokes_vel<T,1>, stokes_sym_dip>("stokes_vel" , 3, std::pair<int,int>(3,3));
  return ker;
}
// Stokes pressure kernel (3D, 3 inputs -> 1 output); built once and cached.
template<class T> const Kernel<T>& StokesKernel<T>::pressure(){
  static Kernel<T> ker=BuildKernel<T, stokes_press >("stokes_press" , 3, std::pair<int,int>(3,1));
  return ker;
}
// Stokes stress kernel (3D, 3 inputs -> 9 outputs); built once and cached.
template<class T> const Kernel<T>& StokesKernel<T>::stress(){
  static Kernel<T> ker=BuildKernel<T, stokes_stress >("stokes_stress", 3, std::pair<int,int>(3,9));
  return ker;
}
// Stokes velocity-gradient kernel (3D, 3 inputs -> 9 outputs); built once
// and cached.
template<class T> const Kernel<T>& StokesKernel<T>::vel_grad(){
  static Kernel<T> ker=BuildKernel<T, stokes_grad >("stokes_grad" , 3, std::pair<int,int>(3,9));
  return ker;
}
// Double-precision specialization: uses stokes_vel<T,2> (two Newton
// refinement iterations) instead of the generic template's single one.
template<> const Kernel<double>& StokesKernel<double>::velocity(){
  typedef double T;
  static Kernel<T> ker=BuildKernel<T, stokes_vel<T,2>, stokes_sym_dip>("stokes_vel" , 3, std::pair<int,int>(3,3));
  return ker;
}
  1910. ////////////////////////////////////////////////////////////////////////////////
  1911. //////// BIOT-SAVART KERNEL ////////
  1912. ////////////////////////////////////////////////////////////////////////////////
  1913. template <class Real_t, class Vec_t=Real_t, Vec_t (*RINV_INTRIN)(Vec_t)=rinv_intrin0<Vec_t> >
  1914. void biot_savart_uKernel(Matrix<Real_t>& src_coord, Matrix<Real_t>& src_value, Matrix<Real_t>& trg_coord, Matrix<Real_t>& trg_value){
  1915. #define SRC_BLK 500
  1916. size_t VecLen=sizeof(Vec_t)/sizeof(Real_t);
  1917. //// Number of newton iterations
  1918. size_t NWTN_ITER=0;
  1919. if(RINV_INTRIN==(Vec_t (*)(Vec_t))rinv_intrin0<Vec_t,Real_t>) NWTN_ITER=0;
  1920. if(RINV_INTRIN==(Vec_t (*)(Vec_t))rinv_intrin1<Vec_t,Real_t>) NWTN_ITER=1;
  1921. if(RINV_INTRIN==(Vec_t (*)(Vec_t))rinv_intrin2<Vec_t,Real_t>) NWTN_ITER=2;
  1922. if(RINV_INTRIN==(Vec_t (*)(Vec_t))rinv_intrin3<Vec_t,Real_t>) NWTN_ITER=3;
  1923. Real_t nwtn_scal=1; // scaling factor for newton iterations
  1924. for(int i=0;i<NWTN_ITER;i++){
  1925. nwtn_scal=2*nwtn_scal*nwtn_scal*nwtn_scal;
  1926. }
  1927. const Real_t OOFP = 1.0/(4*nwtn_scal*nwtn_scal*nwtn_scal*const_pi<Real_t>());
  1928. size_t src_cnt_=src_coord.Dim(1);
  1929. size_t trg_cnt_=trg_coord.Dim(1);
  1930. for(size_t sblk=0;sblk<src_cnt_;sblk+=SRC_BLK){
  1931. size_t src_cnt=src_cnt_-sblk;
  1932. if(src_cnt>SRC_BLK) src_cnt=SRC_BLK;
  1933. for(size_t t=0;t<trg_cnt_;t+=VecLen){
  1934. Vec_t tx=load_intrin<Vec_t>(&trg_coord[0][t]);
  1935. Vec_t ty=load_intrin<Vec_t>(&trg_coord[1][t]);
  1936. Vec_t tz=load_intrin<Vec_t>(&trg_coord[2][t]);
  1937. Vec_t tvx=zero_intrin<Vec_t>();
  1938. Vec_t tvy=zero_intrin<Vec_t>();
  1939. Vec_t tvz=zero_intrin<Vec_t>();
  1940. for(size_t s=sblk;s<sblk+src_cnt;s++){
  1941. Vec_t dx=sub_intrin(tx,bcast_intrin<Vec_t>(&src_coord[0][s]));
  1942. Vec_t dy=sub_intrin(ty,bcast_intrin<Vec_t>(&src_coord[1][s]));
  1943. Vec_t dz=sub_intrin(tz,bcast_intrin<Vec_t>(&src_coord[2][s]));
  1944. Vec_t svx= bcast_intrin<Vec_t>(&src_value[0][s]) ;
  1945. Vec_t svy= bcast_intrin<Vec_t>(&src_value[1][s]) ;
  1946. Vec_t svz= bcast_intrin<Vec_t>(&src_value[2][s]) ;
  1947. Vec_t r2= mul_intrin(dx,dx) ;
  1948. r2=add_intrin(r2,mul_intrin(dy,dy));
  1949. r2=add_intrin(r2,mul_intrin(dz,dz));
  1950. Vec_t rinv=RINV_INTRIN(r2);
  1951. Vec_t rinv3=mul_intrin(mul_intrin(rinv,rinv),rinv);
  1952. tvx=sub_intrin(tvx,mul_intrin(rinv3,sub_intrin(mul_intrin(svy,dz),mul_intrin(svz,dy))));
  1953. tvy=sub_intrin(tvy,mul_intrin(rinv3,sub_intrin(mul_intrin(svz,dx),mul_intrin(svx,dz))));
  1954. tvz=sub_intrin(tvz,mul_intrin(rinv3,sub_intrin(mul_intrin(svx,dy),mul_intrin(svy,dx))));
  1955. }
  1956. Vec_t oofp=set_intrin<Vec_t,Real_t>(OOFP);
  1957. tvx=add_intrin(mul_intrin(tvx,oofp),load_intrin<Vec_t>(&trg_value[0][t]));
  1958. tvy=add_intrin(mul_intrin(tvy,oofp),load_intrin<Vec_t>(&trg_value[1][t]));
  1959. tvz=add_intrin(mul_intrin(tvz,oofp),load_intrin<Vec_t>(&trg_value[2][t]));
  1960. store_intrin(&trg_value[0][t],tvx);
  1961. store_intrin(&trg_value[1][t],tvy);
  1962. store_intrin(&trg_value[2][t],tvz);
  1963. }
  1964. }
  1965. { // Add FLOPS
  1966. #ifndef __MIC__
  1967. Profile::Add_FLOP((long long)trg_cnt_*(long long)src_cnt_*(29+4*(NWTN_ITER)));
  1968. #endif
  1969. }
  1970. #undef SRC_BLK
  1971. }
// Precision/ISA dispatch wrapper for the Biot-Savart kernel.  Selects the
// SIMD vector type Vec_t that matches T and the enabled instruction set
// (scalar on __MIC__ or when neither AVX nor SSE3 is available), then
// instantiates biot_savart_uKernel with the compile-time newton_iter count
// (0..3 Newton refinements of the approximate rsqrt).
template <class T, int newton_iter=0>
void biot_savart(T* r_src, int src_cnt, T* v_src, int dof, T* r_trg, int trg_cnt, T* v_trg, mem::MemoryManager* mem_mgr){
  // Expands to a guarded call of generic_kernel for one newton_iter value;
  // exactly one of the four BS_KER_NWTN branches is taken at run time.
#define BS_KER_NWTN(nwtn) if(newton_iter==nwtn) \
generic_kernel<Real_t, 3, 3, biot_savart_uKernel<Real_t,Vec_t, rinv_intrin##nwtn<Vec_t,Real_t> > > \
((Real_t*)r_src, src_cnt, (Real_t*)v_src, dof, (Real_t*)r_trg, trg_cnt, (Real_t*)v_trg, mem_mgr)
#define BIOTSAVART_KERNEL BS_KER_NWTN(0); BS_KER_NWTN(1); BS_KER_NWTN(2); BS_KER_NWTN(3);
  if(mem::TypeTraits<T>::ID()==mem::TypeTraits<float>::ID()){
    typedef float Real_t;
#if defined __MIC__
#define Vec_t Real_t
#elif defined __AVX__
#define Vec_t __m256
#elif defined __SSE3__
#define Vec_t __m128
#else
#define Vec_t Real_t
#endif
    BIOTSAVART_KERNEL;
#undef Vec_t
  }else if(mem::TypeTraits<T>::ID()==mem::TypeTraits<double>::ID()){
    typedef double Real_t;
#if defined __MIC__
#define Vec_t Real_t
#elif defined __AVX__
#define Vec_t __m256d
#elif defined __SSE3__
#define Vec_t __m128d
#else
#define Vec_t Real_t
#endif
    BIOTSAVART_KERNEL;
#undef Vec_t
  }else{
    // Fallback for other scalar types: no vectorization.
    typedef T Real_t;
#define Vec_t Real_t
    BIOTSAVART_KERNEL;
#undef Vec_t
  }
#undef BS_KER_NWTN
#undef BIOTSAVART_KERNEL
}
// Biot-Savart kernel (3D, 3x3 tensor), one Newton refinement; built once
// and cached.
template<class T> const Kernel<T>& BiotSavartKernel<T>::potential(){
  static Kernel<T> ker=BuildKernel<T, biot_savart<T,1> >("biot_savart", 3, std::pair<int,int>(3,3));
  return ker;
}
// Double-precision specialization: two Newton refinements for the extra
// accuracy double precision requires.
template<> const Kernel<double>& BiotSavartKernel<double>::potential(){
  typedef double T;
  static Kernel<T> ker=BuildKernel<T, biot_savart<T,2> >("biot_savart", 3, std::pair<int,int>(3,3));
  return ker;
}
  2022. ////////////////////////////////////////////////////////////////////////////////
  2023. //////// HELMHOLTZ KERNEL ////////
  2024. ////////////////////////////////////////////////////////////////////////////////
  2025. /**
  2026. * \brief Green's function for the Helmholtz's equation. Kernel tensor
  2027. * dimension = 2x2.
  2028. */
  2029. template <class Real_t, class Vec_t=Real_t, Vec_t (*RINV_INTRIN)(Vec_t)=rinv_intrin0<Vec_t> >
  2030. void helmholtz_poten_uKernel(Matrix<Real_t>& src_coord, Matrix<Real_t>& src_value, Matrix<Real_t>& trg_coord, Matrix<Real_t>& trg_value){
  2031. #define SRC_BLK 500
  2032. size_t VecLen=sizeof(Vec_t)/sizeof(Real_t);
  2033. //// Number of newton iterations
  2034. size_t NWTN_ITER=0;
  2035. if(RINV_INTRIN==(Vec_t (*)(Vec_t))rinv_intrin0<Vec_t,Real_t>) NWTN_ITER=0;
  2036. if(RINV_INTRIN==(Vec_t (*)(Vec_t))rinv_intrin1<Vec_t,Real_t>) NWTN_ITER=1;
  2037. if(RINV_INTRIN==(Vec_t (*)(Vec_t))rinv_intrin2<Vec_t,Real_t>) NWTN_ITER=2;
  2038. if(RINV_INTRIN==(Vec_t (*)(Vec_t))rinv_intrin3<Vec_t,Real_t>) NWTN_ITER=3;
  2039. Real_t nwtn_scal=1; // scaling factor for newton iterations
  2040. for(int i=0;i<NWTN_ITER;i++){
  2041. nwtn_scal=2*nwtn_scal*nwtn_scal*nwtn_scal;
  2042. }
  2043. const Real_t OOFP = 1.0/(4*nwtn_scal*const_pi<Real_t>());
  2044. const Vec_t mu = set_intrin<Vec_t,Real_t>(20.0*const_pi<Real_t>()/nwtn_scal);
  2045. size_t src_cnt_=src_coord.Dim(1);
  2046. size_t trg_cnt_=trg_coord.Dim(1);
  2047. for(size_t sblk=0;sblk<src_cnt_;sblk+=SRC_BLK){
  2048. size_t src_cnt=src_cnt_-sblk;
  2049. if(src_cnt>SRC_BLK) src_cnt=SRC_BLK;
  2050. for(size_t t=0;t<trg_cnt_;t+=VecLen){
  2051. Vec_t tx=load_intrin<Vec_t>(&trg_coord[0][t]);
  2052. Vec_t ty=load_intrin<Vec_t>(&trg_coord[1][t]);
  2053. Vec_t tz=load_intrin<Vec_t>(&trg_coord[2][t]);
  2054. Vec_t tvx=zero_intrin<Vec_t>();
  2055. Vec_t tvy=zero_intrin<Vec_t>();
  2056. for(size_t s=sblk;s<sblk+src_cnt;s++){
  2057. Vec_t dx=sub_intrin(tx,bcast_intrin<Vec_t>(&src_coord[0][s]));
  2058. Vec_t dy=sub_intrin(ty,bcast_intrin<Vec_t>(&src_coord[1][s]));
  2059. Vec_t dz=sub_intrin(tz,bcast_intrin<Vec_t>(&src_coord[2][s]));
  2060. Vec_t svx= bcast_intrin<Vec_t>(&src_value[0][s]) ;
  2061. Vec_t svy= bcast_intrin<Vec_t>(&src_value[1][s]) ;
  2062. Vec_t r2= mul_intrin(dx,dx) ;
  2063. r2=add_intrin(r2,mul_intrin(dy,dy));
  2064. r2=add_intrin(r2,mul_intrin(dz,dz));
  2065. Vec_t rinv=RINV_INTRIN(r2);
  2066. Vec_t mu_r=mul_intrin(mu,mul_intrin(r2,rinv));
  2067. Vec_t G0=mul_intrin(cos_intrin(mu_r),rinv);
  2068. Vec_t G1=mul_intrin(sin_intrin(mu_r),rinv);
  2069. tvx=add_intrin(tvx,sub_intrin(mul_intrin(svx,G0),mul_intrin(svy,G1)));
  2070. tvy=add_intrin(tvy,add_intrin(mul_intrin(svx,G1),mul_intrin(svy,G0)));
  2071. }
  2072. Vec_t oofp=set_intrin<Vec_t,Real_t>(OOFP);
  2073. tvx=add_intrin(mul_intrin(tvx,oofp),load_intrin<Vec_t>(&trg_value[0][t]));
  2074. tvy=add_intrin(mul_intrin(tvy,oofp),load_intrin<Vec_t>(&trg_value[1][t]));
  2075. store_intrin(&trg_value[0][t],tvx);
  2076. store_intrin(&trg_value[1][t],tvy);
  2077. }
  2078. }
  2079. { // Add FLOPS
  2080. #ifndef __MIC__
  2081. Profile::Add_FLOP((long long)trg_cnt_*(long long)src_cnt_*(24+4*(NWTN_ITER)));
  2082. #endif
  2083. }
  2084. #undef SRC_BLK
  2085. }
// Precision/ISA dispatch wrapper for the Helmholtz kernel.  Selects the
// SIMD vector type Vec_t matching T and the enabled instruction set
// (scalar on __MIC__ or when neither AVX nor SSE3 is available), then
// instantiates helmholtz_poten_uKernel with the compile-time newton_iter
// count (0..3 Newton refinements of the approximate rsqrt).
template <class T, int newton_iter=0>
void helmholtz_poten(T* r_src, int src_cnt, T* v_src, int dof, T* r_trg, int trg_cnt, T* v_trg, mem::MemoryManager* mem_mgr){
  // Expands to a guarded call of generic_kernel for one newton_iter value;
  // exactly one of the four HELM_KER_NWTN branches is taken at run time.
#define HELM_KER_NWTN(nwtn) if(newton_iter==nwtn) \
generic_kernel<Real_t, 2, 2, helmholtz_poten_uKernel<Real_t,Vec_t, rinv_intrin##nwtn<Vec_t,Real_t> > > \
((Real_t*)r_src, src_cnt, (Real_t*)v_src, dof, (Real_t*)r_trg, trg_cnt, (Real_t*)v_trg, mem_mgr)
#define HELMHOLTZ_KERNEL HELM_KER_NWTN(0); HELM_KER_NWTN(1); HELM_KER_NWTN(2); HELM_KER_NWTN(3);
  if(mem::TypeTraits<T>::ID()==mem::TypeTraits<float>::ID()){
    typedef float Real_t;
#if defined __MIC__
#define Vec_t Real_t
#elif defined __AVX__
#define Vec_t __m256
#elif defined __SSE3__
#define Vec_t __m128
#else
#define Vec_t Real_t
#endif
    HELMHOLTZ_KERNEL;
#undef Vec_t
  }else if(mem::TypeTraits<T>::ID()==mem::TypeTraits<double>::ID()){
    typedef double Real_t;
#if defined __MIC__
#define Vec_t Real_t
#elif defined __AVX__
#define Vec_t __m256d
#elif defined __SSE3__
#define Vec_t __m128d
#else
#define Vec_t Real_t
#endif
    HELMHOLTZ_KERNEL;
#undef Vec_t
  }else{
    // Fallback for other scalar types: no vectorization.
    typedef T Real_t;
#define Vec_t Real_t
    HELMHOLTZ_KERNEL;
#undef Vec_t
  }
#undef HELM_KER_NWTN
#undef HELMHOLTZ_KERNEL
}
// Helmholtz kernel (3D, 2 inputs -> 2 outputs: real/imag), one Newton
// refinement; built once and cached.
template<class T> const Kernel<T>& HelmholtzKernel<T>::potential(){
  static Kernel<T> ker=BuildKernel<T, helmholtz_poten<T,1> >("helmholtz" , 3, std::pair<int,int>(2,2));
  return ker;
}
// Double-precision specialization: three Newton refinements for the extra
// accuracy double precision requires.
template<> const Kernel<double>& HelmholtzKernel<double>::potential(){
  typedef double T;
  static Kernel<T> ker=BuildKernel<T, helmholtz_poten<T,3> >("helmholtz" , 3, std::pair<int,int>(2,2));
  return ker;
}
  2136. }//end namespace