fmm_pts.txx 132 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327
732783279328032813282328332843285328632873288328932903291329232933294329532963297329832993300330133023303330433053306330733083309331033113312331333143315331633173318331933203321332233233324332533263327332833293330333133323333333433353336333733383339334033413342334333443345334633473348334933503351335233533354335533563357335833593360336133623363336433653366336733683369337033713372337333743375337633773378337933803381338233833384338533863387338833893390339133923393
  1. /**
  2. * \file fmm_pts.txx
  3. * \author Dhairya Malhotra, dhairya.malhotra@gmail.com
  4. * \date 3-07-2011
  5. * \brief This file contains the implementation of the FMM_Pts class.
  6. */
#include <mpi.h>

#include <iostream>
#include <set>
#include <sstream>

#include <fft_wrapper.hpp>
#include <mat_utils.hpp>
  12. #ifdef PVFMM_HAVE_SYS_STAT_H
  13. #include <sys/stat.h>
  14. #endif
  15. #ifdef __SSE__
  16. #include <xmmintrin.h>
  17. #endif
  18. #ifdef __SSE3__
  19. #include <pmmintrin.h>
  20. #endif
  21. #ifdef __AVX__
  22. #include <immintrin.h>
  23. #endif
  24. #ifdef __INTEL_OFFLOAD
  25. #include <immintrin.h>
  26. #endif
  27. #ifdef __INTEL_OFFLOAD0
  28. #pragma offload_attribute(push,target(mic))
  29. #endif
  30. namespace pvfmm{
  31. /**
  32. * \brief Returns the coordinates of points on the surface of a cube.
  33. * \param[in] p Number of points on an edge of the cube is (n+1)
  34. * \param[in] c Coordinates to the centre of the cube (3D array).
  35. * \param[in] alpha Scaling factor for the size of the cube.
  36. * \param[in] depth Depth of the cube in the octree.
  37. * \return Vector with coordinates of points on the surface of the cube in the
  38. * format [x0 y0 z0 x1 y1 z1 .... ].
  39. */
  40. template <class Real_t>
  41. std::vector<Real_t> surface(int p, Real_t* c, Real_t alpha, int depth){
  42. size_t n_=(6*(p-1)*(p-1)+2); //Total number of points.
  43. std::vector<Real_t> coord(n_*3);
  44. coord[0]=coord[1]=coord[2]=-1.0;
  45. size_t cnt=1;
  46. for(int i=0;i<p-1;i++)
  47. for(int j=0;j<p-1;j++){
  48. coord[cnt*3 ]=-1.0;
  49. coord[cnt*3+1]=(2.0*(i+1)-p+1)/(p-1);
  50. coord[cnt*3+2]=(2.0*j-p+1)/(p-1);
  51. cnt++;
  52. }
  53. for(int i=0;i<p-1;i++)
  54. for(int j=0;j<p-1;j++){
  55. coord[cnt*3 ]=(2.0*i-p+1)/(p-1);
  56. coord[cnt*3+1]=-1.0;
  57. coord[cnt*3+2]=(2.0*(j+1)-p+1)/(p-1);
  58. cnt++;
  59. }
  60. for(int i=0;i<p-1;i++)
  61. for(int j=0;j<p-1;j++){
  62. coord[cnt*3 ]=(2.0*(i+1)-p+1)/(p-1);
  63. coord[cnt*3+1]=(2.0*j-p+1)/(p-1);
  64. coord[cnt*3+2]=-1.0;
  65. cnt++;
  66. }
  67. for(size_t i=0;i<(n_/2)*3;i++)
  68. coord[cnt*3+i]=-coord[i];
  69. Real_t r = 0.5*pow(0.5,depth);
  70. Real_t b = alpha*r;
  71. for(size_t i=0;i<n_;i++){
  72. coord[i*3+0]=(coord[i*3+0]+1.0)*b+c[0];
  73. coord[i*3+1]=(coord[i*3+1]+1.0)*b+c[1];
  74. coord[i*3+2]=(coord[i*3+2]+1.0)*b+c[2];
  75. }
  76. return coord;
  77. }
  78. /**
  79. * \brief Returns the coordinates of points on the upward check surface of cube.
  80. * \see surface()
  81. */
  82. template <class Real_t>
  83. std::vector<Real_t> u_check_surf(int p, Real_t* c, int depth){
  84. Real_t r=0.5*pow(0.5,depth);
  85. Real_t coord[3]={c[0]-r*(RAD1-1.0),c[1]-r*(RAD1-1.0),c[2]-r*(RAD1-1.0)};
  86. return surface(p,coord,(Real_t)RAD1,depth);
  87. }
  88. /**
  89. * \brief Returns the coordinates of points on the upward equivalent surface of cube.
  90. * \see surface()
  91. */
  92. template <class Real_t>
  93. std::vector<Real_t> u_equiv_surf(int p, Real_t* c, int depth){
  94. Real_t r=0.5*pow(0.5,depth);
  95. Real_t coord[3]={c[0]-r*(RAD0-1.0),c[1]-r*(RAD0-1.0),c[2]-r*(RAD0-1.0)};
  96. return surface(p,coord,(Real_t)RAD0,depth);
  97. }
  98. /**
  99. * \brief Returns the coordinates of points on the downward check surface of cube.
  100. * \see surface()
  101. */
  102. template <class Real_t>
  103. std::vector<Real_t> d_check_surf(int p, Real_t* c, int depth){
  104. Real_t r=0.5*pow(0.5,depth);
  105. Real_t coord[3]={c[0]-r*(RAD0-1.0),c[1]-r*(RAD0-1.0),c[2]-r*(RAD0-1.0)};
  106. return surface(p,coord,(Real_t)RAD0,depth);
  107. }
  108. /**
  109. * \brief Returns the coordinates of points on the downward equivalent surface of cube.
  110. * \see surface()
  111. */
  112. template <class Real_t>
  113. std::vector<Real_t> d_equiv_surf(int p, Real_t* c, int depth){
  114. Real_t r=0.5*pow(0.5,depth);
  115. Real_t coord[3]={c[0]-r*(RAD1-1.0),c[1]-r*(RAD1-1.0),c[2]-r*(RAD1-1.0)};
  116. return surface(p,coord,(Real_t)RAD1,depth);
  117. }
  118. /**
  119. * \brief Defines the 3D grid for convolution in FFT acceleration of V-list.
  120. * \see surface()
  121. */
  122. template <class Real_t>
  123. std::vector<Real_t> conv_grid(int p, Real_t* c, int depth){
  124. Real_t r=pow(0.5,depth);
  125. Real_t a=r*RAD0;
  126. Real_t coord[3]={c[0],c[1],c[2]};
  127. int n1=p*2;
  128. int n2=(int)pow((Real_t)n1,2);
  129. int n3=(int)pow((Real_t)n1,3);
  130. std::vector<Real_t> grid(n3*3);
  131. for(int i=0;i<n1;i++)
  132. for(int j=0;j<n1;j++)
  133. for(int k=0;k<n1;k++){
  134. grid[(i+n1*j+n2*k)*3+0]=(i-p)*a/(p-1)+coord[0];
  135. grid[(i+n1*j+n2*k)*3+1]=(j-p)*a/(p-1)+coord[1];
  136. grid[(i+n1*j+n2*k)*3+2]=(k-p)*a/(p-1)+coord[2];
  137. }
  138. return grid;
  139. }
  140. #ifdef __INTEL_OFFLOAD0
  141. #pragma offload_attribute(pop)
  142. #endif
  143. template <class Real_t>
  144. void FMM_Data<Real_t>::Clear(){
  145. upward_equiv.Resize(0);
  146. }
  147. template <class Real_t>
  148. PackedData FMM_Data<Real_t>::PackMultipole(void* buff_ptr){
  149. PackedData p0; p0.data=buff_ptr;
  150. p0.length=upward_equiv.Dim()*sizeof(Real_t);
  151. if(p0.length==0) return p0;
  152. if(p0.data==NULL) p0.data=(char*)&upward_equiv[0];
  153. else mem::memcopy(p0.data,&upward_equiv[0],p0.length);
  154. return p0;
  155. }
  156. template <class Real_t>
  157. void FMM_Data<Real_t>::AddMultipole(PackedData p0){
  158. Real_t* data=(Real_t*)p0.data;
  159. size_t n=p0.length/sizeof(Real_t);
  160. assert(upward_equiv.Dim()==n);
  161. Matrix<Real_t> v0(1,n,&upward_equiv[0],false);
  162. Matrix<Real_t> v1(1,n,data,false);
  163. v0+=v1;
  164. }
  165. template <class Real_t>
  166. void FMM_Data<Real_t>::InitMultipole(PackedData p0, bool own_data){
  167. Real_t* data=(Real_t*)p0.data;
  168. size_t n=p0.length/sizeof(Real_t);
  169. if(own_data){
  170. if(n>0) upward_equiv=Vector<Real_t>(n, &data[0], false);
  171. }else{
  172. if(n>0) upward_equiv.ReInit(n, &data[0], false);
  173. }
  174. }
  175. template <class FMMNode>
  176. FMM_Pts<FMMNode>::~FMM_Pts() {
  177. if(mat!=NULL){
  178. // int rank;
  179. // MPI_Comm_rank(comm,&rank);
  180. // if(rank==0) mat->Save2File("Precomp.data");
  181. delete mat;
  182. mat=NULL;
  183. }
  184. if(vprecomp_fft_flag) FFTW_t<Real_t>::fft_destroy_plan(vprecomp_fftplan);
  185. #ifdef __INTEL_OFFLOAD0
  186. #pragma offload target(mic:0)
  187. #endif
  188. {
  189. if(vlist_fft_flag ) FFTW_t<Real_t>::fft_destroy_plan(vlist_fftplan );
  190. if(vlist_ifft_flag) FFTW_t<Real_t>::fft_destroy_plan(vlist_ifftplan);
  191. vlist_fft_flag =false;
  192. vlist_ifft_flag=false;
  193. }
  194. }
  195. template <class FMMNode>
  196. void FMM_Pts<FMMNode>::Initialize(int mult_order, const MPI_Comm& comm_, const Kernel<Real_t>* kernel_, const Kernel<Real_t>* aux_kernel_){
  197. Profile::Tic("InitFMM_Pts",&comm_,true);{
  198. multipole_order=mult_order;
  199. comm=comm_;
  200. kernel=*kernel_;
  201. aux_kernel=*(aux_kernel_?aux_kernel_:kernel_);
  202. assert(kernel.ker_dim[0]==aux_kernel.ker_dim[0]);
  203. mat=new PrecompMat<Real_t>(Homogen(), MAX_DEPTH+1);
  204. if(this->mat_fname.size()==0){
  205. std::stringstream st;
  206. st<<PVFMM_PRECOMP_DATA_PATH;
  207. if(!st.str().size()){ // look in PVFMM_DIR
  208. char* pvfmm_dir = getenv ("PVFMM_DIR");
  209. if(pvfmm_dir) st<<pvfmm_dir<<'/';
  210. }
  211. #ifndef STAT_MACROS_BROKEN
  212. if(st.str().size()){ // check if the path is a directory
  213. struct stat stat_buff;
  214. if(stat(st.str().c_str(), &stat_buff) || !S_ISDIR(stat_buff.st_mode)){
  215. std::cout<<"error: path not found: "<<st.str()<<'\n';
  216. exit(0);
  217. }
  218. }
  219. #endif
  220. st<<"Precomp_"<<kernel.ker_name.c_str()<<"_m"<<mult_order<<(typeid(Real_t)==typeid(float)?"_f":"")<<".data";
  221. this->mat_fname=st.str();
  222. }
  223. this->mat->LoadFile(mat_fname.c_str(), this->comm);
  224. interac_list.Initialize(COORD_DIM, this->mat);
  225. Profile::Tic("PrecompUC2UE",&comm,false,4);
  226. this->PrecompAll(UC2UE_Type);
  227. Profile::Toc();
  228. Profile::Tic("PrecompDC2DE",&comm,false,4);
  229. this->PrecompAll(DC2DE_Type);
  230. Profile::Toc();
  231. Profile::Tic("PrecompBC",&comm,false,4);
  232. { /*
  233. int type=BC_Type;
  234. for(int l=0;l<MAX_DEPTH;l++)
  235. for(size_t indx=0;indx<this->interac_list.ListCount((Mat_Type)type);indx++){
  236. Matrix<Real_t>& M=this->mat->Mat(l, (Mat_Type)type, indx);
  237. M.Resize(0,0);
  238. } // */
  239. }
  240. this->PrecompAll(BC_Type,0);
  241. Profile::Toc();
  242. Profile::Tic("PrecompU2U",&comm,false,4);
  243. this->PrecompAll(U2U_Type);
  244. Profile::Toc();
  245. Profile::Tic("PrecompD2D",&comm,false,4);
  246. this->PrecompAll(D2D_Type);
  247. Profile::Toc();
  248. Profile::Tic("PrecompV",&comm,false,4);
  249. this->PrecompAll(V_Type);
  250. Profile::Toc();
  251. Profile::Tic("PrecompV1",&comm,false,4);
  252. this->PrecompAll(V1_Type);
  253. Profile::Toc();
  254. }Profile::Toc();
  255. }
  256. template <class FMMNode>
  257. Permutation<typename FMMNode::Real_t>& FMM_Pts<FMMNode>::PrecompPerm(Mat_Type type, Perm_Type perm_indx){
  258. //Check if the matrix already exists.
  259. Permutation<Real_t>& P_ = mat->Perm((Mat_Type)type, perm_indx);
  260. if(P_.Dim()!=0) return P_;
  261. Matrix<size_t> swap_xy(10,9);
  262. Matrix<size_t> swap_xz(10,9);
  263. {
  264. for(int i=0;i<9;i++)
  265. for(int j=0;j<9;j++){
  266. swap_xy[i][j]=j;
  267. swap_xz[i][j]=j;
  268. }
  269. swap_xy[3][0]=1; swap_xy[3][1]=0; swap_xy[3][2]=2;
  270. swap_xz[3][0]=2; swap_xz[3][1]=1; swap_xz[3][2]=0;
  271. swap_xy[6][0]=1; swap_xy[6][1]=0; swap_xy[6][2]=2;
  272. swap_xy[6][3]=4; swap_xy[6][4]=3; swap_xy[6][5]=5;
  273. swap_xz[6][0]=2; swap_xz[6][1]=1; swap_xz[6][2]=0;
  274. swap_xz[6][3]=5; swap_xz[6][4]=4; swap_xz[6][5]=3;
  275. swap_xy[9][0]=4; swap_xy[9][1]=3; swap_xy[9][2]=5;
  276. swap_xy[9][3]=1; swap_xy[9][4]=0; swap_xy[9][5]=2;
  277. swap_xy[9][6]=7; swap_xy[9][7]=6; swap_xy[9][8]=8;
  278. swap_xz[9][0]=8; swap_xz[9][1]=7; swap_xz[9][2]=6;
  279. swap_xz[9][3]=5; swap_xz[9][4]=4; swap_xz[9][5]=3;
  280. swap_xz[9][6]=2; swap_xz[9][7]=1; swap_xz[9][8]=0;
  281. }
  282. //Compute the matrix.
  283. Permutation<Real_t> P;
  284. switch (type){
  285. case UC2UE_Type:
  286. {
  287. break;
  288. }
  289. case DC2DE_Type:
  290. {
  291. break;
  292. }
  293. case S2U_Type:
  294. {
  295. break;
  296. }
  297. case U2U_Type:
  298. {
  299. break;
  300. }
  301. case D2D_Type:
  302. {
  303. Real_t eps=1e-10;
  304. int dof=kernel.ker_dim[0];
  305. size_t p_indx=perm_indx % C_Perm;
  306. Real_t c[3]={-0.5,-0.5,-0.5};
  307. std::vector<Real_t> trg_coord=d_check_surf(this->MultipoleOrder(),c,0);
  308. int n_trg=trg_coord.size()/3;
  309. P=Permutation<Real_t>(n_trg*dof);
  310. if(p_indx==ReflecX || p_indx==ReflecY || p_indx==ReflecZ){
  311. for(int i=0;i<n_trg;i++)
  312. for(int j=0;j<n_trg;j++){
  313. if(fabs(trg_coord[i*3+0]-trg_coord[j*3+0]*(p_indx==ReflecX?-1.0:1.0))<eps)
  314. if(fabs(trg_coord[i*3+1]-trg_coord[j*3+1]*(p_indx==ReflecY?-1.0:1.0))<eps)
  315. if(fabs(trg_coord[i*3+2]-trg_coord[j*3+2]*(p_indx==ReflecZ?-1.0:1.0))<eps){
  316. for(int k=0;k<dof;k++){
  317. P.perm[j*dof+k]=i*dof+k;
  318. }
  319. }
  320. }
  321. if(dof==3) //stokes_vel (and like kernels)
  322. for(int j=0;j<n_trg;j++)
  323. P.scal[j*dof+(int)p_indx]*=-1.0;
  324. }else if(p_indx==SwapXY || p_indx==SwapXZ)
  325. for(int i=0;i<n_trg;i++)
  326. for(int j=0;j<n_trg;j++){
  327. if(fabs(trg_coord[i*3+0]-trg_coord[j*3+(p_indx==SwapXY?1:2)])<eps)
  328. if(fabs(trg_coord[i*3+1]-trg_coord[j*3+(p_indx==SwapXY?0:1)])<eps)
  329. if(fabs(trg_coord[i*3+2]-trg_coord[j*3+(p_indx==SwapXY?2:0)])<eps){
  330. for(int k=0;k<dof;k++){
  331. P.perm[j*dof+k]=i*dof+(p_indx==SwapXY?swap_xy[dof][k]:swap_xz[dof][k]);
  332. }
  333. }
  334. }
  335. break;
  336. }
  337. case D2T_Type:
  338. {
  339. break;
  340. }
  341. case U0_Type:
  342. {
  343. break;
  344. }
  345. case U1_Type:
  346. {
  347. break;
  348. }
  349. case U2_Type:
  350. {
  351. break;
  352. }
  353. case V_Type:
  354. {
  355. break;
  356. }
  357. case V1_Type:
  358. {
  359. break;
  360. }
  361. case W_Type:
  362. {
  363. break;
  364. }
  365. case X_Type:
  366. {
  367. break;
  368. }
  369. case BC_Type:
  370. {
  371. break;
  372. }
  373. default:
  374. break;
  375. }
  376. //Save the matrix for future use.
  377. #pragma omp critical (PRECOMP_MATRIX_PTS)
  378. {
  379. if(P_.Dim()==0) P_=P;
  380. }
  381. return P_;
  382. }
  383. template <class FMMNode>
  384. Matrix<typename FMMNode::Real_t>& FMM_Pts<FMMNode>::Precomp(int level, Mat_Type type, size_t mat_indx){
  385. if(this->Homogen()) level=0;
  386. //Check if the matrix already exists.
  387. Matrix<Real_t>& M_ = this->mat->Mat(level, type, mat_indx);
  388. if(M_.Dim(0)!=0 && M_.Dim(1)!=0) return M_;
  389. else{ //Compute matrix from symmetry class (if possible).
  390. size_t class_indx = this->interac_list.InteracClass(type, mat_indx);
  391. if(class_indx!=mat_indx){
  392. Matrix<Real_t>& M0 = this->Precomp(level, type, class_indx);
  393. Permutation<Real_t>& Pr = this->interac_list.Perm_R(level, type, mat_indx);
  394. Permutation<Real_t>& Pc = this->interac_list.Perm_C(level, type, mat_indx);
  395. if(Pr.Dim()>0 && Pc.Dim()>0 && M0.Dim(0)>0 && M0.Dim(1)>0) return M_;
  396. }
  397. }
  398. //Compute the matrix.
  399. Matrix<Real_t> M;
  400. int* ker_dim=kernel.ker_dim;
  401. int* aux_ker_dim=aux_kernel.ker_dim;
  402. //int omp_p=omp_get_max_threads();
  403. switch (type){
  404. case UC2UE_Type:
  405. {
  406. // Coord of upward check surface
  407. Real_t c[3]={0,0,0};
  408. std::vector<Real_t> uc_coord=u_check_surf(MultipoleOrder(),c,level);
  409. size_t n_uc=uc_coord.size()/3;
  410. // Coord of upward equivalent surface
  411. std::vector<Real_t> ue_coord=u_equiv_surf(MultipoleOrder(),c,level);
  412. size_t n_ue=ue_coord.size()/3;
  413. // Evaluate potential at check surface due to equivalent surface.
  414. Matrix<Real_t> M_e2c(n_ue*aux_ker_dim[0],n_uc*aux_ker_dim[1]);
  415. Kernel<Real_t>::Eval(&ue_coord[0], n_ue,
  416. &uc_coord[0], n_uc, &(M_e2c[0][0]),
  417. aux_kernel.ker_poten, aux_ker_dim);
  418. M=M_e2c.pinv(); //check 2 equivalent
  419. break;
  420. }
  421. case DC2DE_Type:
  422. {
  423. // Coord of downward check surface
  424. Real_t c[3]={0,0,0};
  425. std::vector<Real_t> check_surf=d_check_surf(MultipoleOrder(),c,level);
  426. size_t n_ch=check_surf.size()/3;
  427. // Coord of downward equivalent surface
  428. std::vector<Real_t> equiv_surf=d_equiv_surf(MultipoleOrder(),c,level);
  429. size_t n_eq=equiv_surf.size()/3;
  430. // Evaluate potential at check surface due to equivalent surface.
  431. Matrix<Real_t> M_e2c(n_eq*aux_ker_dim[0],n_ch*aux_ker_dim[1]);
  432. Kernel<Real_t>::Eval(&equiv_surf[0], n_eq,
  433. &check_surf[0], n_ch, &(M_e2c[0][0]),
  434. aux_kernel.ker_poten,aux_ker_dim);
  435. M=M_e2c.pinv(); //check 2 equivalent
  436. break;
  437. }
  438. case S2U_Type:
  439. {
  440. break;
  441. }
  442. case U2U_Type:
  443. {
  444. // Coord of upward check surface
  445. Real_t c[3]={0,0,0};
  446. std::vector<Real_t> check_surf=u_check_surf(MultipoleOrder(),c,level);
  447. size_t n_uc=check_surf.size()/3;
  448. // Coord of child's upward equivalent surface
  449. Real_t s=pow(0.5,(level+2));
  450. int* coord=interac_list.RelativeCoord(type,mat_indx);
  451. Real_t child_coord[3]={(coord[0]+1)*s,(coord[1]+1)*s,(coord[2]+1)*s};
  452. std::vector<Real_t> equiv_surf=u_equiv_surf(MultipoleOrder(),child_coord,level+1);
  453. size_t n_ue=equiv_surf.size()/3;
  454. // Evaluate potential at check surface due to equivalent surface.
  455. Matrix<Real_t> M_ce2c(n_ue*aux_ker_dim[0],n_uc*aux_ker_dim[1]);
  456. Kernel<Real_t>::Eval(&equiv_surf[0], n_ue,
  457. &check_surf[0], n_uc, &(M_ce2c[0][0]),
  458. aux_kernel.ker_poten, aux_ker_dim);
  459. Matrix<Real_t>& M_c2e = Precomp(level, UC2UE_Type, 0);
  460. M=M_ce2c*M_c2e;
  461. break;
  462. }
  463. case D2D_Type:
  464. {
  465. // Coord of downward check surface
  466. Real_t s=pow(0.5,level+1);
  467. int* coord=interac_list.RelativeCoord(type,mat_indx);
  468. Real_t c[3]={(coord[0]+1)*s,(coord[1]+1)*s,(coord[2]+1)*s};
  469. std::vector<Real_t> check_surf=d_check_surf(MultipoleOrder(),c,level);
  470. size_t n_dc=check_surf.size()/3;
  471. // Coord of parent's downward equivalent surface
  472. Real_t parent_coord[3]={0,0,0};
  473. std::vector<Real_t> equiv_surf=d_equiv_surf(MultipoleOrder(),parent_coord,level-1);
  474. size_t n_de=equiv_surf.size()/3;
  475. // Evaluate potential at check surface due to equivalent surface.
  476. Matrix<Real_t> M_pe2c(n_de*aux_ker_dim[0],n_dc*aux_ker_dim[1]);
  477. Kernel<Real_t>::Eval(&equiv_surf[0], n_de,
  478. &check_surf[0], n_dc, &(M_pe2c[0][0]),
  479. aux_kernel.ker_poten,aux_ker_dim);
  480. Matrix<Real_t>& M_c2e=Precomp(level,DC2DE_Type,0);
  481. M=M_pe2c*M_c2e;
  482. break;
  483. }
  484. case D2T_Type:
  485. {
  486. std::vector<Real_t>& rel_trg_coord=mat->RelativeTrgCoord();
  487. // Coord of target points
  488. Real_t r=pow(0.5,level);
  489. size_t n_trg=rel_trg_coord.size()/3;
  490. std::vector<Real_t> trg_coord(n_trg*3);
  491. for(size_t i=0;i<n_trg*COORD_DIM;i++) trg_coord[i]=rel_trg_coord[i]*r;
  492. // Coord of downward equivalent surface
  493. Real_t c[3]={0,0,0};
  494. std::vector<Real_t> equiv_surf=d_equiv_surf(MultipoleOrder(),c,level);
  495. size_t n_eq=equiv_surf.size()/3;
  496. // Evaluate potential at target points due to equivalent surface.
  497. {
  498. M .Resize(n_eq*ker_dim [0], n_trg*ker_dim [1]);
  499. kernel.BuildMatrix(&equiv_surf[0], n_eq, &trg_coord[0], n_trg, &(M [0][0]));
  500. }
  501. break;
  502. }
  503. case U0_Type:
  504. {
  505. break;
  506. }
  507. case U1_Type:
  508. {
  509. break;
  510. }
  511. case U2_Type:
  512. {
  513. break;
  514. }
  515. case V_Type:
  516. {
  517. int n1=MultipoleOrder()*2;
  518. int n3 =n1*n1*n1;
  519. int n3_=n1*n1*(n1/2+1);
  520. //Compute the matrix.
  521. Real_t s=pow(0.5,level);
  522. int* coord2=interac_list.RelativeCoord(type,mat_indx);
  523. Real_t coord_diff[3]={coord2[0]*s,coord2[1]*s,coord2[2]*s};
  524. //Evaluate potential.
  525. std::vector<Real_t> r_trg(COORD_DIM,0.0);
  526. std::vector<Real_t> conv_poten(n3*aux_ker_dim[0]*aux_ker_dim[1]);
  527. std::vector<Real_t> conv_coord=conv_grid(MultipoleOrder(),coord_diff,level);
  528. Kernel<Real_t>::Eval(&conv_coord[0],n3,&r_trg[0],1,&conv_poten[0],aux_kernel.ker_poten,aux_ker_dim);
  529. //Rearrange data.
  530. Matrix<Real_t> M_conv(n3,aux_ker_dim[0]*aux_ker_dim[1],&conv_poten[0],false);
  531. M_conv=M_conv.Transpose();
  532. //Compute FFTW plan.
  533. int nnn[3]={n1,n1,n1};
  534. void *fftw_in, *fftw_out;
  535. fftw_in = fftw_malloc( n3 *aux_ker_dim[0]*aux_ker_dim[1]*sizeof(Real_t));
  536. fftw_out = fftw_malloc(2*n3_*aux_ker_dim[0]*aux_ker_dim[1]*sizeof(Real_t));
  537. #pragma omp critical (FFTW_PLAN)
  538. {
  539. if (!vprecomp_fft_flag){
  540. vprecomp_fftplan = FFTW_t<Real_t>::fft_plan_many_dft_r2c(COORD_DIM, nnn, aux_ker_dim[0]*aux_ker_dim[1],
  541. (Real_t*)fftw_in, NULL, 1, n3, (typename FFTW_t<Real_t>::cplx*) fftw_out, NULL, 1, n3_, FFTW_ESTIMATE);
  542. vprecomp_fft_flag=true;
  543. }
  544. }
  545. //Compute FFT.
  546. mem::memcopy(fftw_in, &conv_poten[0], n3*aux_ker_dim[0]*aux_ker_dim[1]*sizeof(Real_t));
  547. FFTW_t<Real_t>::fft_execute_dft_r2c(vprecomp_fftplan, (Real_t*)fftw_in, (typename FFTW_t<Real_t>::cplx*)(fftw_out));
  548. Matrix<Real_t> M_(2*n3_*aux_ker_dim[0]*aux_ker_dim[1],1,(Real_t*)fftw_out,false);
  549. M=M_;
  550. //Free memory.
  551. fftw_free(fftw_in);
  552. fftw_free(fftw_out);
  553. break;
  554. }
  555. case V1_Type:
  556. {
  557. size_t mat_cnt =interac_list.ListCount( V_Type);
  558. for(size_t k=0;k<mat_cnt;k++) Precomp(level, V_Type, k);
  559. const size_t chld_cnt=1UL<<COORD_DIM;
  560. size_t n1=MultipoleOrder()*2;
  561. size_t M_dim=n1*n1*(n1/2+1);
  562. size_t n3=n1*n1*n1;
  563. Vector<Real_t> zero_vec(M_dim*aux_ker_dim[0]*aux_ker_dim[1]*2);
  564. zero_vec.SetZero();
  565. Vector<Real_t*> M_ptr(chld_cnt*chld_cnt);
  566. for(size_t i=0;i<chld_cnt*chld_cnt;i++) M_ptr[i]=&zero_vec[0];
  567. int* rel_coord_=interac_list.RelativeCoord(V1_Type, mat_indx);
  568. for(int j1=0;j1<chld_cnt;j1++)
  569. for(int j2=0;j2<chld_cnt;j2++){
  570. int rel_coord[3]={rel_coord_[0]*2-(j1/1)%2+(j2/1)%2,
  571. rel_coord_[1]*2-(j1/2)%2+(j2/2)%2,
  572. rel_coord_[2]*2-(j1/4)%2+(j2/4)%2};
  573. for(size_t k=0;k<mat_cnt;k++){
  574. int* ref_coord=interac_list.RelativeCoord(V_Type, k);
  575. if(ref_coord[0]==rel_coord[0] &&
  576. ref_coord[1]==rel_coord[1] &&
  577. ref_coord[2]==rel_coord[2]){
  578. Matrix<Real_t>& M = this->mat->Mat(level, V_Type, k);
  579. M_ptr[j2*chld_cnt+j1]=&M[0][0];
  580. break;
  581. }
  582. }
  583. }
  584. // Build matrix aux_ker_dim0 x aux_ker_dim1 x M_dim x 8 x 8
  585. M.Resize(aux_ker_dim[0]*aux_ker_dim[1]*M_dim, 2*chld_cnt*chld_cnt);
  586. for(int j=0;j<aux_ker_dim[0]*aux_ker_dim[1]*M_dim;j++){
  587. for(size_t k=0;k<chld_cnt*chld_cnt;k++){
  588. M[j][k*2+0]=M_ptr[k][j*2+0]/n3;
  589. M[j][k*2+1]=M_ptr[k][j*2+1]/n3;
  590. }
  591. }
  592. break;
  593. }
  594. case W_Type:
  595. {
  596. std::vector<Real_t>& rel_trg_coord=mat->RelativeTrgCoord();
  597. // Coord of target points
  598. Real_t s=pow(0.5,level);
  599. size_t n_trg=rel_trg_coord.size()/3;
  600. std::vector<Real_t> trg_coord(n_trg*3);
  601. for(size_t j=0;j<n_trg*COORD_DIM;j++) trg_coord[j]=rel_trg_coord[j]*s;
  602. // Coord of downward equivalent surface
  603. int* coord2=interac_list.RelativeCoord(type,mat_indx);
  604. Real_t c[3]={(coord2[0]+1)*s*0.25,(coord2[1]+1)*s*0.25,(coord2[2]+1)*s*0.25};
  605. std::vector<Real_t> equiv_surf=u_equiv_surf(MultipoleOrder(),c,level+1);
  606. size_t n_eq=equiv_surf.size()/3;
  607. // Evaluate potential at target points due to equivalent surface.
  608. {
  609. M .Resize(n_eq*ker_dim [0],n_trg*ker_dim [1]);
  610. kernel.BuildMatrix(&equiv_surf[0], n_eq, &trg_coord[0], n_trg, &(M [0][0]));
  611. }
  612. break;
  613. }
  614. case X_Type:
  615. {
  616. break;
  617. }
  618. case BC_Type:
  619. {
  620. size_t mat_cnt_m2m=interac_list.ListCount(U2U_Type);
  621. size_t n_surf=(6*(MultipoleOrder()-1)*(MultipoleOrder()-1)+2); //Total number of points.
  622. if((M.Dim(0)!=n_surf*aux_ker_dim[0] || M.Dim(1)!=n_surf*aux_ker_dim[1]) && level==0){
  623. Matrix<Real_t> M_m2m[BC_LEVELS+1];
  624. Matrix<Real_t> M_m2l[BC_LEVELS+1];
  625. Matrix<Real_t> M_l2l[BC_LEVELS+1];
  626. Matrix<Real_t> M_zero_avg(n_surf*aux_ker_dim[0],n_surf*aux_ker_dim[0]);
  627. { // Set average multipole charge to zero. (improves stability for large BC_LEVELS)
  628. M_zero_avg.SetZero();
  629. for(size_t i=0;i<n_surf*aux_ker_dim[0];i++)
  630. M_zero_avg[i][i]+=1;
  631. for(size_t i=0;i<n_surf;i++)
  632. for(size_t j=0;j<n_surf;j++)
  633. for(size_t k=0;k<aux_ker_dim[0];k++)
  634. M_zero_avg[i*aux_ker_dim[0]+k][j*aux_ker_dim[0]+k]-=1.0/n_surf;
  635. }
  636. for(int level=0; level>-BC_LEVELS; level--){
  637. M_l2l[-level] = this->Precomp(level, D2D_Type, 0);
  638. if(M_l2l[-level].Dim(0)==0 || M_l2l[-level].Dim(1)==0){
  639. Matrix<Real_t>& M0 = interac_list.ClassMat(level, D2D_Type, 0);
  640. Permutation<Real_t>& Pr = this->interac_list.Perm_R(level, D2D_Type, 0);
  641. Permutation<Real_t>& Pc = this->interac_list.Perm_C(level, D2D_Type, 0);
  642. M_l2l[-level] = Pr*M0*Pc;
  643. }
  644. M_m2m[-level] = M_zero_avg*this->Precomp(level, U2U_Type, 0);
  645. for(size_t mat_indx=1; mat_indx<mat_cnt_m2m; mat_indx++)
  646. M_m2m[-level] += M_zero_avg*this->Precomp(level, U2U_Type, mat_indx);
  647. }
  648. for(int level=-BC_LEVELS;level<=0;level++){
  649. if(!Homogen() || level==-BC_LEVELS){
  650. Real_t s=(1UL<<(-level));
  651. Real_t ue_coord[3]={0,0,0};
  652. Real_t dc_coord[3]={0,0,0};
  653. std::vector<Real_t> src_coord=u_equiv_surf(MultipoleOrder(), ue_coord, level);
  654. std::vector<Real_t> trg_coord=d_check_surf(MultipoleOrder(), dc_coord, level);
  655. Matrix<Real_t> M_ue2dc(n_surf*aux_ker_dim[0], n_surf*aux_ker_dim[1]);
  656. M_ue2dc.SetZero();
  657. for(int x0=-2;x0<4;x0++)
  658. for(int x1=-2;x1<4;x1++)
  659. for(int x2=-2;x2<4;x2++)
  660. if(abs(x0)>1 || abs(x1)>1 || abs(x2)>1){
  661. ue_coord[0]=x0*s; ue_coord[1]=x1*s; ue_coord[2]=x2*s;
  662. std::vector<Real_t> src_coord=u_equiv_surf(MultipoleOrder(), ue_coord, level);
  663. Matrix<Real_t> M_tmp(n_surf*aux_ker_dim[0], n_surf*aux_ker_dim[1]);
  664. Kernel<Real_t>::Eval(&src_coord[0], n_surf,
  665. &trg_coord[0], n_surf, &(M_tmp[0][0]),
  666. aux_kernel.ker_poten, aux_ker_dim);
  667. M_ue2dc+=M_tmp;
  668. }
  669. // Shift by constant.
  670. Real_t scale_adj=(Homogen()?pow(0.5, level*aux_kernel.poten_scale):1);
  671. for(size_t i=0;i<M_ue2dc.Dim(0);i++){
  672. std::vector<Real_t> avg(aux_ker_dim[1],0);
  673. for(size_t j=0; j<M_ue2dc.Dim(1); j+=aux_ker_dim[1])
  674. for(int k=0; k<aux_ker_dim[1]; k++) avg[k]+=M_ue2dc[i][j+k];
  675. for(int k=0; k<aux_ker_dim[1]; k++) avg[k]/=n_surf;
  676. for(size_t j=0; j<M_ue2dc.Dim(1); j+=aux_ker_dim[1])
  677. for(int k=0; k<aux_ker_dim[1]; k++) M_ue2dc[i][j+k]=(M_ue2dc[i][j+k]-avg[k])*scale_adj;
  678. }
  679. Matrix<Real_t>& M_dc2de = Precomp(level, DC2DE_Type, 0);
  680. M_m2l[-level]=M_ue2dc*M_dc2de;
  681. }else M_m2l[-level]=M_m2l[1-level];
  682. if(level==-BC_LEVELS) M = M_m2l[-level];
  683. else M = M_m2l[-level] + M_m2m[-level]*M*M_l2l[-level];
  684. { // Shift by constant. (improves stability for large BC_LEVELS)
  685. Matrix<Real_t> M_de2dc(n_surf*aux_ker_dim[0], n_surf*aux_ker_dim[1]);
  686. { // M_de2dc TODO: For homogeneous kernels, compute only once.
  687. // Coord of downward check surface
  688. Real_t c[3]={0,0,0};
  689. std::vector<Real_t> check_surf=d_check_surf(MultipoleOrder(),c,0);
  690. size_t n_ch=check_surf.size()/3;
  691. // Coord of downward equivalent surface
  692. std::vector<Real_t> equiv_surf=d_equiv_surf(MultipoleOrder(),c,0);
  693. size_t n_eq=equiv_surf.size()/3;
  694. // Evaluate potential at check surface due to equivalent surface.
  695. Kernel<Real_t>::Eval(&equiv_surf[0], n_eq,
  696. &check_surf[0], n_ch, &(M_de2dc[0][0]),
  697. aux_kernel.ker_poten,aux_ker_dim);
  698. }
  699. Matrix<Real_t> M_ue2dc=M*M_de2dc;
  700. for(size_t i=0;i<M_ue2dc.Dim(0);i++){
  701. std::vector<Real_t> avg(aux_ker_dim[1],0);
  702. for(size_t j=0; j<M_ue2dc.Dim(1); j+=aux_ker_dim[1])
  703. for(int k=0; k<aux_ker_dim[1]; k++) avg[k]+=M_ue2dc[i][j+k];
  704. for(int k=0; k<aux_ker_dim[1]; k++) avg[k]/=n_surf;
  705. for(size_t j=0; j<M_ue2dc.Dim(1); j+=aux_ker_dim[1])
  706. for(int k=0; k<aux_ker_dim[1]; k++) M_ue2dc[i][j+k]-=avg[k];
  707. }
  708. Matrix<Real_t>& M_dc2de = Precomp(level, DC2DE_Type, 0);
  709. M=M_ue2dc*M_dc2de;
  710. }
  711. }
  712. { // ax+by+cz+d correction.
  713. std::vector<Real_t> corner_pts;
  714. corner_pts.push_back(0); corner_pts.push_back(0); corner_pts.push_back(0);
  715. corner_pts.push_back(1); corner_pts.push_back(0); corner_pts.push_back(0);
  716. corner_pts.push_back(0); corner_pts.push_back(1); corner_pts.push_back(0);
  717. corner_pts.push_back(0); corner_pts.push_back(0); corner_pts.push_back(1);
  718. size_t n_corner=corner_pts.size()/3;
  719. // Coord of downward equivalent surface
  720. Real_t c[3]={0,0,0};
  721. std::vector<Real_t> up_equiv_surf=u_equiv_surf(MultipoleOrder(),c,0);
  722. std::vector<Real_t> dn_equiv_surf=d_equiv_surf(MultipoleOrder(),c,0);
  723. std::vector<Real_t> dn_check_surf=d_check_surf(MultipoleOrder(),c,0);
  724. Matrix<Real_t> M_err;
  725. { // Evaluate potential at corner due to upward and dnward equivalent surface.
  726. { // Error from local expansion.
  727. Matrix<Real_t> M_e2pt(n_surf*aux_ker_dim[0],n_corner*aux_ker_dim[1]);
  728. Kernel<Real_t>::Eval(&dn_equiv_surf[0], n_surf,
  729. &corner_pts[0], n_corner, &(M_e2pt[0][0]),
  730. aux_kernel.ker_poten,aux_ker_dim);
  731. M_err=M*M_e2pt;
  732. }
  733. for(size_t k=0;k<4;k++){ // Error from colleagues of root.
  734. for(int j0=-1;j0<=1;j0++)
  735. for(int j1=-1;j1<=1;j1++)
  736. for(int j2=-1;j2<=1;j2++){
  737. Real_t pt_coord[3]={corner_pts[k*COORD_DIM+0]-j0,
  738. corner_pts[k*COORD_DIM+1]-j1,
  739. corner_pts[k*COORD_DIM+2]-j2};
  740. if(fabs(pt_coord[0]-0.5)>1.0 || fabs(pt_coord[1]-0.5)>1.0 || fabs(pt_coord[2]-0.5)>1.0){
  741. Matrix<Real_t> M_e2pt(n_surf*aux_ker_dim[0],aux_ker_dim[1]);
  742. Kernel<Real_t>::Eval(&up_equiv_surf[0], n_surf,
  743. &pt_coord[0], 1, &(M_e2pt[0][0]),
  744. aux_kernel.ker_poten,aux_ker_dim);
  745. for(size_t i=0;i<M_e2pt.Dim(0);i++)
  746. for(size_t j=0;j<M_e2pt.Dim(1);j++)
  747. M_err[i][k*aux_ker_dim[1]+j]+=M_e2pt[i][j];
  748. }
  749. }
  750. }
  751. }
  752. Matrix<Real_t> M_grad(M_err.Dim(0),n_surf*aux_ker_dim[1]);
  753. for(size_t i=0;i<M_err.Dim(0);i++)
  754. for(size_t k=0;k<aux_ker_dim[1];k++)
  755. for(size_t j=0;j<n_surf;j++){
  756. M_grad[i][j*aux_ker_dim[1]+k]=(M_err[i][0*aux_ker_dim[1]+k] )*1.0 +
  757. (M_err[i][1*aux_ker_dim[1]+k]-M_err[i][0*aux_ker_dim[1]+k])*dn_check_surf[j*COORD_DIM+0]+
  758. (M_err[i][2*aux_ker_dim[1]+k]-M_err[i][0*aux_ker_dim[1]+k])*dn_check_surf[j*COORD_DIM+1]+
  759. (M_err[i][3*aux_ker_dim[1]+k]-M_err[i][0*aux_ker_dim[1]+k])*dn_check_surf[j*COORD_DIM+2];
  760. }
  761. Matrix<Real_t>& M_dc2de = Precomp(0, DC2DE_Type, 0);
  762. M-=M_grad*M_dc2de;
  763. }
  764. { // Free memory
  765. Mat_Type type=D2D_Type;
  766. for(int l=-BC_LEVELS;l<0;l++)
  767. for(size_t indx=0;indx<this->interac_list.ListCount(type);indx++){
  768. Matrix<Real_t>& M=this->mat->Mat(l, type, indx);
  769. M.Resize(0,0);
  770. }
  771. type=U2U_Type;
  772. for(int l=-BC_LEVELS;l<0;l++)
  773. for(size_t indx=0;indx<this->interac_list.ListCount(type);indx++){
  774. Matrix<Real_t>& M=this->mat->Mat(l, type, indx);
  775. M.Resize(0,0);
  776. }
  777. type=DC2DE_Type;
  778. for(int l=-BC_LEVELS;l<0;l++)
  779. for(size_t indx=0;indx<this->interac_list.ListCount(type);indx++){
  780. Matrix<Real_t>& M=this->mat->Mat(l, type, indx);
  781. M.Resize(0,0);
  782. }
  783. type=UC2UE_Type;
  784. for(int l=-BC_LEVELS;l<0;l++)
  785. for(size_t indx=0;indx<this->interac_list.ListCount(type);indx++){
  786. Matrix<Real_t>& M=this->mat->Mat(l, type, indx);
  787. M.Resize(0,0);
  788. }
  789. }
  790. }
  791. break;
  792. }
  793. default:
  794. break;
  795. }
  796. //Save the matrix for future use.
  797. #pragma omp critical (PRECOMP_MATRIX_PTS)
  798. if(M_.Dim(0)==0 && M_.Dim(1)==0){
  799. M_=M;
  800. /*
  801. M_.Resize(M.Dim(0),M.Dim(1));
  802. int dof=aux_ker_dim[0]*aux_ker_dim[1];
  803. for(int j=0;j<dof;j++){
  804. size_t a=(M.Dim(0)*M.Dim(1)* j )/dof;
  805. size_t b=(M.Dim(0)*M.Dim(1)*(j+1))/dof;
  806. #pragma omp parallel for // NUMA
  807. for(int tid=0;tid<omp_p;tid++){
  808. size_t a_=a+((b-a)* tid )/omp_p;
  809. size_t b_=a+((b-a)*(tid+1))/omp_p;
  810. mem::memcopy(&M_[0][a_], &M[0][a_], (b_-a_)*sizeof(Real_t));
  811. }
  812. }
  813. */
  814. }
  815. return M_;
  816. }
  817. template <class FMMNode>
  818. void FMM_Pts<FMMNode>::PrecompAll(Mat_Type type, int level){
  819. int depth=(Homogen()?1:MAX_DEPTH);
  820. if(level==-1){
  821. for(int l=0;l<depth;l++){
  822. std::stringstream level_str;
  823. level_str<<"level="<<l;
  824. PrecompAll(type, l);
  825. }
  826. return;
  827. }
  828. //Compute basic permutations.
  829. for(size_t i=0;i<Perm_Count;i++)
  830. this->PrecompPerm(type, (Perm_Type) i);
  831. {
  832. //Allocate matrices.
  833. size_t mat_cnt=interac_list.ListCount((Mat_Type)type);
  834. mat->Mat(level, (Mat_Type)type, mat_cnt-1);
  835. { // Compute InteracClass matrices.
  836. std::vector<size_t> indx_lst;
  837. for(size_t i=0; i<mat_cnt; i++){
  838. if(interac_list.InteracClass((Mat_Type)type,i)==i)
  839. indx_lst.push_back(i);
  840. }
  841. //Compute Transformations.
  842. //#pragma omp parallel for //lets use fine grained parallelism
  843. for(size_t i=0; i<indx_lst.size(); i++){
  844. Precomp(level, (Mat_Type)type, indx_lst[i]);
  845. }
  846. }
  847. //#pragma omp parallel for //lets use fine grained parallelism
  848. for(size_t mat_indx=0;mat_indx<mat_cnt;mat_indx++){
  849. Matrix<Real_t>& M0=interac_list.ClassMat(level,(Mat_Type)type,mat_indx);
  850. assert(M0.Dim(0)>0 && M0.Dim(1)>0);
  851. Permutation<Real_t>& pr=interac_list.Perm_R(level, (Mat_Type)type, mat_indx);
  852. Permutation<Real_t>& pc=interac_list.Perm_C(level, (Mat_Type)type, mat_indx);
  853. if(pr.Dim()!=M0.Dim(0) || pc.Dim()!=M0.Dim(1)) Precomp(level, (Mat_Type)type, mat_indx);
  854. }
  855. }
  856. }
template <class FMMNode>
void FMM_Pts<FMMNode>::CollectNodeData(std::vector<FMMNode*>& node, std::vector<Matrix<Real_t> >& buff, std::vector<Vector<FMMNode_t*> >& n_list, std::vector<size_t> extra_size){
  // Gather per-node FMM vectors into seven contiguous buffers (buff[0..6]) and
  // record in n_list[0..6] the nodes whose data live in each buffer.  After
  // this call, each node's vectors (upward_equiv, dnward_equiv, src_value,
  // surf_value, trg_value, *_coord) are non-owning views into the
  // corresponding buffer (ReInit with own_data=false — TODO confirm the
  // ReInit flag semantics against the Vector class).
  //
  // @param node        All tree nodes to process.
  // @param buff        Output: backing storage, one matrix per data category.
  // @param n_list      Output: per category, the nodes sharing that buffer.
  // @param extra_size  Optional per-category extra bytes/elements reserved at
  //                    the end of each buffer.
  if(  buff.size()<7)   buff.resize(7);
  if(n_list.size()<7) n_list.resize(7);
  if(node.size()==0) return;
  {// 0. upward_equiv
    int indx=0;
    // Vector length: columns of the (level-0) UC2UE class matrix.
    Matrix<Real_t>& M_uc2ue = this->interac_list.ClassMat(0, UC2UE_Type, 0);
    size_t vec_sz=M_uc2ue.Dim(1);
    std::vector< FMMNode* > node_lst;
    {
      // Collect the children of every non-leaf node, ordered by the parent's
      // depth (shallowest first); append the root node itself at the end.
      std::vector<std::vector< FMMNode* > > node_lst_(MAX_DEPTH+1);
      FMMNode_t* r_node=NULL;
      for(size_t i=0;i<node.size();i++){
        if(!node[i]->IsLeaf())
          node_lst_[node[i]->Depth()].push_back(node[i]);
        if(node[i]->Depth()==0) r_node=node[i];
      }
      size_t chld_cnt=1UL<<COORD_DIM;
      for(int i=0;i<=MAX_DEPTH;i++)
        for(size_t j=0;j<node_lst_[i].size();j++)
          for(size_t k=0;k<chld_cnt;k++)
            node_lst.push_back((FMMNode_t*)node_lst_[i][j]->Child(k));
      if(r_node!=NULL) node_lst.push_back(r_node);
    }
    n_list[indx]=node_lst;
    size_t buff_size=node_lst.size()*vec_sz;
    buff_size+=(extra_size.size()>indx?extra_size[indx]:0);
    buff[indx].Resize(1,buff_size);
    #pragma omp parallel for
    for(size_t i=0;i<node_lst.size();i++){
      // Re-point each node's upward_equiv vector into the shared buffer and
      // zero it.
      Vector<Real_t>& upward_equiv=node_lst[i]->FMMData()->upward_equiv;
      upward_equiv.ReInit(vec_sz, buff[indx][0]+i*vec_sz, false);
      upward_equiv.SetZero();
    }
    buff[indx].AllocDevice(true);
  }
  {// 1. dnward_equiv
    int indx=1;
    // Same layout as category 0, but for downward-equivalent data; ghost
    // nodes are excluded from the non-leaf collection here.
    Matrix<Real_t>& M_dc2de = this->interac_list.ClassMat(0, DC2DE_Type, 0);
    size_t vec_sz=M_dc2de.Dim(1);
    std::vector< FMMNode* > node_lst;
    {
      std::vector<std::vector< FMMNode* > > node_lst_(MAX_DEPTH+1);
      FMMNode_t* r_node=NULL;
      for(size_t i=0;i<node.size();i++){
        if(!node[i]->IsLeaf() && !node[i]->IsGhost())
          node_lst_[node[i]->Depth()].push_back(node[i]);
        if(node[i]->Depth()==0) r_node=node[i];
      }
      size_t chld_cnt=1UL<<COORD_DIM;
      for(int i=0;i<=MAX_DEPTH;i++)
        for(size_t j=0;j<node_lst_[i].size();j++)
          for(size_t k=0;k<chld_cnt;k++)
            node_lst.push_back((FMMNode_t*)node_lst_[i][j]->Child(k));
      if(r_node!=NULL) node_lst.push_back(r_node);
    }
    n_list[indx]=node_lst;
    size_t buff_size=node_lst.size()*vec_sz;
    buff_size+=(extra_size.size()>indx?extra_size[indx]:0);
    buff[indx].Resize(1,buff_size);
    #pragma omp parallel for
    for(size_t i=0;i<node_lst.size();i++){
      Vector<Real_t>& dnward_equiv=node_lst[i]->FMMData()->dnward_equiv;
      dnward_equiv.ReInit(vec_sz, buff[indx][0]+i*vec_sz, false);
      dnward_equiv.SetZero();
    }
    buff[indx].AllocDevice(true);
  }
  {// 2. upward_equiv_fft
    // Only the node list is built here; the buffer itself is sized elsewhere.
    int indx=2;
    std::vector< FMMNode* > node_lst;
    {
      // Non-leaf nodes, ordered by depth.
      std::vector<std::vector< FMMNode* > > node_lst_(MAX_DEPTH+1);
      for(size_t i=0;i<node.size();i++)
        if(!node[i]->IsLeaf())
          node_lst_[node[i]->Depth()].push_back(node[i]);
      for(int i=0;i<=MAX_DEPTH;i++)
        for(size_t j=0;j<node_lst_[i].size();j++)
          node_lst.push_back(node_lst_[i][j]);
    }
    n_list[indx]=node_lst;
    buff[indx].AllocDevice(true);
  }
  {// 3. dnward_check_fft
    // As category 2, but excluding ghost nodes.
    int indx=3;
    std::vector< FMMNode* > node_lst;
    {
      std::vector<std::vector< FMMNode* > > node_lst_(MAX_DEPTH+1);
      for(size_t i=0;i<node.size();i++)
        if(!node[i]->IsLeaf() && !node[i]->IsGhost())
          node_lst_[node[i]->Depth()].push_back(node[i]);
      for(int i=0;i<=MAX_DEPTH;i++)
        for(size_t j=0;j<node_lst_[i].size();j++)
          node_lst.push_back(node_lst_[i][j]);
    }
    n_list[indx]=node_lst;
    buff[indx].AllocDevice(true);
  }
  {// 4. src_val
    int indx=4;
    int src_dof=kernel.ker_dim[0];
    // Surface data carries COORD_DIM extra components per point in addition
    // to the source dof (presumably normals — TODO confirm against callers).
    int surf_dof=COORD_DIM+src_dof;
    std::vector< FMMNode* > node_lst;
    size_t buff_size=0;
    for(size_t i=0;i<node.size();i++)
      if(node[i]->IsLeaf()){
        node_lst.push_back(node[i]);
        buff_size+=(node[i]->src_coord.Dim()/COORD_DIM)*src_dof;
        buff_size+=(node[i]->surf_coord.Dim()/COORD_DIM)*surf_dof;
      }
    n_list[indx]=node_lst;
    #pragma omp parallel for
    for(size_t i=0;i<node_lst.size();i++){ // Move data before resizing buff[indx]
      // The node vectors may currently alias buff[indx]'s old storage, which
      // Resize below may free/move — copy each vector out to fresh storage
      // first.  This ordering is load-bearing; do not reorder.
      { // src_value
        Vector<Real_t>& src_value=node_lst[i]->src_value;
        Vector<Real_t> new_buff=src_value;
        src_value.ReInit(new_buff.Dim(), &new_buff[0]);
      }
      { // surf_value
        Vector<Real_t>& surf_value=node_lst[i]->surf_value;
        Vector<Real_t> new_buff=surf_value;
        surf_value.ReInit(new_buff.Dim(), &new_buff[0]);
      }
    }
    buff[indx].Resize(1,buff_size+(extra_size.size()>indx?extra_size[indx]:0));
    Real_t* buff_ptr=&buff[indx][0][0];
    for(size_t i=0;i<node_lst.size();i++){
      // Copy each vector into the new buffer and re-point it there.
      FMMData* fmm_data=((FMMData*)node_lst[i]->FMMData());
      { // src_value
        Vector<Real_t>& src_value=node_lst[i]->src_value;
        mem::memcopy(buff_ptr,&src_value[0],src_value.Dim()*sizeof(Real_t));
        src_value.ReInit((node_lst[i]->src_coord.Dim()/COORD_DIM)*src_dof, buff_ptr, false);
        buff_ptr+=(node_lst[i]->src_coord.Dim()/COORD_DIM)*src_dof;
      }
      { // surf_value
        Vector<Real_t>& surf_value=node_lst[i]->surf_value;
        mem::memcopy(buff_ptr,&surf_value[0],surf_value.Dim()*sizeof(Real_t));
        surf_value.ReInit((node_lst[i]->surf_coord.Dim()/COORD_DIM)*surf_dof, buff_ptr, false);
        buff_ptr+=(node_lst[i]->surf_coord.Dim()/COORD_DIM)*surf_dof;
      }
    }
    buff[indx].AllocDevice(true);
  }
  {// 5. trg_val
    int indx=5;
    int trg_dof=kernel.ker_dim[1];
    std::vector< FMMNode* > node_lst;
    size_t buff_size=0;
    for(size_t i=0;i<node.size();i++)
      if(node[i]->IsLeaf() && !node[i]->IsGhost()){
        node_lst.push_back(node[i]);
        FMMData* fmm_data=((FMMData*)node[i]->FMMData());
        buff_size+=(node[i]->trg_coord.Dim()/COORD_DIM)*trg_dof;
      }
    n_list[indx]=node_lst;
    buff[indx].Resize(1,buff_size+(extra_size.size()>indx?extra_size[indx]:0));
    Real_t* buff_ptr=&buff[indx][0][0];
    for(size_t i=0;i<node_lst.size();i++){
      // Target values are outputs only, so no copy-out/copy-back is needed:
      // just point each trg_value into the buffer.
      FMMData* fmm_data=((FMMData*)node_lst[i]->FMMData());
      { // trg_value
        Vector<Real_t>& trg_value=node_lst[i]->trg_value;
        trg_value.ReInit((node_lst[i]->trg_coord.Dim()/COORD_DIM)*trg_dof, buff_ptr, false);
        buff_ptr+=(node_lst[i]->trg_coord.Dim()/COORD_DIM)*trg_dof;
      }
    }
    #pragma omp parallel for
    for(size_t i=0;i<node_lst.size();i++){
      FMMData* fmm_data=((FMMData*)node_lst[i]->FMMData());
      Vector<Real_t>& trg_value=node_lst[i]->trg_value;
      trg_value.SetZero();
    }
    buff[indx].AllocDevice(true);
  }
  {// 6. pts_coord
    int indx=6;
    size_t m=MultipoleOrder();
    std::vector< FMMNode* > node_lst;
    size_t buff_size=0;
    for(size_t i=0;i<node.size();i++)
      if(node[i]->IsLeaf()){
        node_lst.push_back(node[i]);
        FMMData* fmm_data=((FMMData*)node[i]->FMMData());
        buff_size+=node[i]->src_coord.Dim();
        buff_size+=node[i]->surf_coord.Dim();
        buff_size+=node[i]->trg_coord.Dim();
      }
    n_list[indx]=node_lst;
    #pragma omp parallel for
    for(size_t i=0;i<node_lst.size();i++){ // Move data before resizing buff[indx]
      // Same aliasing precaution as in category 4: copy coordinates out
      // before the buffer is resized.
      FMMData* fmm_data=((FMMData*)node_lst[i]->FMMData());
      { // src_coord
        Vector<Real_t>& src_coord=node_lst[i]->src_coord;
        Vector<Real_t> new_buff=src_coord;
        src_coord.ReInit(new_buff.Dim(), &new_buff[0]);
      }
      { // surf_coord
        Vector<Real_t>& surf_coord=node_lst[i]->surf_coord;
        Vector<Real_t> new_buff=surf_coord;
        surf_coord.ReInit(new_buff.Dim(), &new_buff[0]);
      }
      { // trg_coord
        Vector<Real_t>& trg_coord=node_lst[i]->trg_coord;
        Vector<Real_t> new_buff=trg_coord;
        trg_coord.ReInit(new_buff.Dim(), &new_buff[0]);
      }
    }
    buff_size+=(extra_size.size()>indx?extra_size[indx]:0);
    // Also reserve room for the four check/equiv surfaces at every depth:
    // each surface has (6*(m-1)^2+2) points times COORD_DIM coordinates.
    buff_size+=4*MAX_DEPTH*(6*(m-1)*(m-1)+2)*COORD_DIM;
    buff[indx].Resize(1,buff_size);
    Real_t* buff_ptr=&buff[indx][0][0];
    for(size_t i=0;i<node_lst.size();i++){
      FMMData* fmm_data=((FMMData*)node_lst[i]->FMMData());
      { // src_coord
        Vector<Real_t>& src_coord=node_lst[i]->src_coord;
        mem::memcopy(buff_ptr,&src_coord[0],src_coord.Dim()*sizeof(Real_t));
        src_coord.ReInit(node_lst[i]->src_coord.Dim(), buff_ptr, false);
        buff_ptr+=node_lst[i]->src_coord.Dim();
      }
      { // surf_coord
        Vector<Real_t>& surf_coord=node_lst[i]->surf_coord;
        mem::memcopy(buff_ptr,&surf_coord[0],surf_coord.Dim()*sizeof(Real_t));
        surf_coord.ReInit(node_lst[i]->surf_coord.Dim(), buff_ptr, false);
        buff_ptr+=node_lst[i]->surf_coord.Dim();
      }
      { // trg_coord
        Vector<Real_t>& trg_coord=node_lst[i]->trg_coord;
        mem::memcopy(buff_ptr,&trg_coord[0],trg_coord.Dim()*sizeof(Real_t));
        trg_coord.ReInit(node_lst[i]->trg_coord.Dim(), buff_ptr, false);
        buff_ptr+=node_lst[i]->trg_coord.Dim();
      }
    }
    { // check and equiv surfaces.
      // Store the upward/downward check and equivalent surfaces for every
      // depth at the tail of the coordinate buffer, centered at the origin.
      upwd_check_surf.resize(MAX_DEPTH);
      upwd_equiv_surf.resize(MAX_DEPTH);
      dnwd_check_surf.resize(MAX_DEPTH);
      dnwd_equiv_surf.resize(MAX_DEPTH);
      for(size_t depth=0;depth<MAX_DEPTH;depth++){
        Real_t c[3]={0.0,0.0,0.0};
        upwd_check_surf[depth].ReInit((6*(m-1)*(m-1)+2)*COORD_DIM, buff_ptr, false); buff_ptr+=(6*(m-1)*(m-1)+2)*COORD_DIM;
        upwd_equiv_surf[depth].ReInit((6*(m-1)*(m-1)+2)*COORD_DIM, buff_ptr, false); buff_ptr+=(6*(m-1)*(m-1)+2)*COORD_DIM;
        dnwd_check_surf[depth].ReInit((6*(m-1)*(m-1)+2)*COORD_DIM, buff_ptr, false); buff_ptr+=(6*(m-1)*(m-1)+2)*COORD_DIM;
        dnwd_equiv_surf[depth].ReInit((6*(m-1)*(m-1)+2)*COORD_DIM, buff_ptr, false); buff_ptr+=(6*(m-1)*(m-1)+2)*COORD_DIM;
        upwd_check_surf[depth]=u_check_surf(m,c,depth);
        upwd_equiv_surf[depth]=u_equiv_surf(m,c,depth);
        dnwd_check_surf[depth]=d_check_surf(m,c,depth);
        dnwd_equiv_surf[depth]=d_equiv_surf(m,c,depth);
      }
    }
    buff[indx].AllocDevice(true);
  }
}
  1109. template <class FMMNode>
  1110. void FMM_Pts<FMMNode>::SetupPrecomp(SetupData<Real_t>& setup_data, bool device){
  1111. if(setup_data.precomp_data==NULL) return;
  1112. Profile::Tic("SetupPrecomp",&this->comm,true,25);
  1113. { // Build precomp_data
  1114. size_t precomp_offset=0;
  1115. int level=setup_data.level;
  1116. Matrix<char>& precomp_data=*setup_data.precomp_data;
  1117. std::vector<Mat_Type>& interac_type_lst=setup_data.interac_type;
  1118. for(size_t type_indx=0; type_indx<interac_type_lst.size(); type_indx++){
  1119. Mat_Type& interac_type=interac_type_lst[type_indx];
  1120. this->PrecompAll(interac_type, level); // Compute matrices.
  1121. precomp_offset=this->mat->CompactData(level, interac_type, precomp_data, precomp_offset);
  1122. }
  1123. }
  1124. Profile::Toc();
  1125. if(device){ // Host2Device
  1126. Profile::Tic("Host2Device",&this->comm,false,25);
  1127. setup_data.precomp_data->AllocDevice(true);
  1128. Profile::Toc();
  1129. }
  1130. }
  1131. template <class FMMNode>
  1132. void FMM_Pts<FMMNode>::SetupInterac(SetupData<Real_t>& setup_data, bool device){
  1133. int level=setup_data.level;
  1134. std::vector<Mat_Type>& interac_type_lst=setup_data.interac_type;
  1135. std::vector<void*>& nodes_in =setup_data.nodes_in ;
  1136. std::vector<void*>& nodes_out=setup_data.nodes_out;
  1137. Matrix<Real_t>& input_data=*setup_data. input_data;
  1138. Matrix<Real_t>& output_data=*setup_data.output_data;
  1139. std::vector<Vector<Real_t>*>& input_vector=setup_data. input_vector;
  1140. std::vector<Vector<Real_t>*>& output_vector=setup_data.output_vector;
  1141. size_t n_in =nodes_in .size();
  1142. size_t n_out=nodes_out.size();
  1143. // Setup precomputed data.
  1144. SetupPrecomp(setup_data,device);
  1145. // Build interac_data
  1146. Profile::Tic("Interac-Data",&this->comm,true,25);
  1147. Matrix<char>& interac_data=setup_data.interac_data;
  1148. if(n_out>0 && n_in >0){ // Build precomp_data, interac_data
  1149. std::vector<size_t> interac_mat;
  1150. std::vector<size_t> interac_cnt;
  1151. std::vector<size_t> interac_blk;
  1152. std::vector<size_t> input_perm;
  1153. std::vector<size_t> output_perm;
  1154. std::vector<Real_t> scaling;
  1155. size_t dof=0, M_dim0=0, M_dim1=0;
  1156. size_t precomp_offset=0;
  1157. size_t buff_size=DEVICE_BUFFER_SIZE*1024l*1024l;
  1158. for(size_t type_indx=0; type_indx<interac_type_lst.size(); type_indx++){
  1159. Mat_Type& interac_type=interac_type_lst[type_indx];
  1160. size_t mat_cnt=this->interac_list.ListCount(interac_type);
  1161. Vector<size_t> precomp_data_offset;
  1162. { // Load precomp_data for interac_type.
  1163. Matrix<char>& precomp_data=*setup_data.precomp_data;
  1164. char* indx_ptr=precomp_data[0]+precomp_offset;
  1165. size_t total_size=((size_t*)indx_ptr)[0]; indx_ptr+=sizeof(size_t);
  1166. /*size_t mat_cnt_ =((size_t*)indx_ptr)[0];*/ indx_ptr+=sizeof(size_t);
  1167. precomp_data_offset.ReInit((1+2+2)*mat_cnt, (size_t*)indx_ptr, false);
  1168. precomp_offset+=total_size;
  1169. }
  1170. Matrix<FMMNode*> src_interac_list(n_in ,mat_cnt); src_interac_list.SetZero();
  1171. Matrix<FMMNode*> trg_interac_list(n_out,mat_cnt); trg_interac_list.SetZero();
  1172. { // Build trg_interac_list
  1173. for(size_t i=0;i<n_out;i++){
  1174. if(!((FMMNode*)nodes_out[i])->IsGhost() && (level==-1 || ((FMMNode*)nodes_out[i])->Depth()==level)){
  1175. std::vector<FMMNode*>& lst=((FMMNode*)nodes_out[i])->interac_list[interac_type];
  1176. mem::memcopy(&trg_interac_list[i][0], &lst[0], lst.size()*sizeof(FMMNode*));
  1177. assert(lst.size()==mat_cnt);
  1178. }
  1179. }
  1180. }
  1181. { // Build src_interac_list
  1182. for(size_t i=0;i<n_in ;i++) ((FMMNode*)nodes_in [i])->node_id=i;
  1183. for(size_t i=0;i<n_out;i++){
  1184. for(size_t j=0;j<mat_cnt;j++)
  1185. if(trg_interac_list[i][j]!=NULL){
  1186. src_interac_list[trg_interac_list[i][j]->node_id][j]=(FMMNode*)nodes_out[i];
  1187. }
  1188. }
  1189. }
  1190. Matrix<size_t> interac_dsp(n_out,mat_cnt);
  1191. std::vector<size_t> interac_blk_dsp(1,0);
  1192. { // Determine dof, M_dim0, M_dim1
  1193. dof=1;
  1194. Matrix<Real_t>& M0 = this->interac_list.ClassMat(level, interac_type_lst[0], 0);
  1195. M_dim0=M0.Dim(0); M_dim1=M0.Dim(1);
  1196. }
  1197. { // Determine interaction blocks which fit in memory.
  1198. size_t vec_size=(M_dim0+M_dim1)*sizeof(Real_t)*dof;
  1199. for(size_t j=0;j<mat_cnt;j++){// Determine minimum buff_size
  1200. size_t vec_cnt=0;
  1201. for(size_t i=0;i<n_out;i++){
  1202. if(trg_interac_list[i][j]!=NULL) vec_cnt++;
  1203. }
  1204. if(buff_size<vec_cnt*vec_size)
  1205. buff_size=vec_cnt*vec_size;
  1206. }
  1207. size_t interac_dsp_=0;
  1208. for(size_t j=0;j<mat_cnt;j++){
  1209. for(size_t i=0;i<n_out;i++){
  1210. interac_dsp[i][j]=interac_dsp_;
  1211. if(trg_interac_list[i][j]!=NULL) interac_dsp_++;
  1212. }
  1213. if(interac_dsp_*vec_size>buff_size){
  1214. interac_blk.push_back(j-interac_blk_dsp.back());
  1215. interac_blk_dsp.push_back(j);
  1216. size_t offset=interac_dsp[0][j];
  1217. for(size_t i=0;i<n_out;i++) interac_dsp[i][j]-=offset;
  1218. interac_dsp_-=offset;
  1219. assert(interac_dsp_*vec_size<=buff_size); // Problem too big for buff_size.
  1220. }
  1221. interac_mat.push_back(precomp_data_offset[5*this->interac_list.InteracClass(interac_type,j)+0]);
  1222. interac_cnt.push_back(interac_dsp_-interac_dsp[0][j]);
  1223. }
  1224. interac_blk.push_back(mat_cnt-interac_blk_dsp.back());
  1225. interac_blk_dsp.push_back(mat_cnt);
  1226. }
  1227. { // Determine input_perm.
  1228. size_t vec_size=M_dim0*dof;
  1229. for(size_t i=0;i<n_out;i++) ((FMMNode*)nodes_out[i])->node_id=i;
  1230. for(size_t k=1;k<interac_blk_dsp.size();k++){
  1231. for(size_t i=0;i<n_in ;i++){
  1232. for(size_t j=interac_blk_dsp[k-1];j<interac_blk_dsp[k];j++){
  1233. FMMNode_t* trg_node=src_interac_list[i][j];
  1234. if(trg_node!=NULL){
  1235. input_perm .push_back(precomp_data_offset[5*j+1]); // prem
  1236. input_perm .push_back(precomp_data_offset[5*j+2]); // scal
  1237. input_perm .push_back(interac_dsp[trg_node->node_id][j]*vec_size*sizeof(Real_t)); // trg_ptr
  1238. input_perm .push_back((size_t)(& input_vector[i][0][0]- input_data[0])); // src_ptr
  1239. assert(input_vector[i]->Dim()==vec_size);
  1240. }
  1241. }
  1242. }
  1243. }
  1244. }
  1245. { // Determine scaling and output_perm
  1246. size_t vec_size=M_dim1*dof;
  1247. for(size_t k=1;k<interac_blk_dsp.size();k++){
  1248. for(size_t i=0;i<n_out;i++){
  1249. Real_t scaling_=0.0;
  1250. if(!this->Homogen()) scaling_=1.0;
  1251. else if(interac_type==D2D_Type) scaling_=1.0;
  1252. else if(interac_type==D2T_Type) scaling_=pow(0.5, -setup_data.kernel->poten_scale *((FMMNode*)nodes_out[i])->Depth());
  1253. else if(interac_type== U0_Type) scaling_=pow(0.5,(COORD_DIM-setup_data.kernel->poten_scale)*((FMMNode*)nodes_out[i])->Depth());
  1254. else if(interac_type== U1_Type) scaling_=pow(0.5,(COORD_DIM-setup_data.kernel->poten_scale)*((FMMNode*)nodes_out[i])->Depth());
  1255. else if(interac_type== U2_Type) scaling_=pow(0.5,(COORD_DIM-setup_data.kernel->poten_scale)*((FMMNode*)nodes_out[i])->Depth());
  1256. else if(interac_type== W_Type) scaling_=pow(0.5, -setup_data.kernel->poten_scale *((FMMNode*)nodes_out[i])->Depth());
  1257. else if(interac_type== X_Type) scaling_=pow(0.5, COORD_DIM *((FMMNode*)nodes_out[i])->Depth());
  1258. for(size_t j=interac_blk_dsp[k-1];j<interac_blk_dsp[k];j++){
  1259. if(trg_interac_list[i][j]!=NULL){
  1260. scaling.push_back(scaling_); // scaling
  1261. output_perm.push_back(precomp_data_offset[5*j+3]); // prem
  1262. output_perm.push_back(precomp_data_offset[5*j+4]); // scal
  1263. output_perm.push_back(interac_dsp[ i ][j]*vec_size*sizeof(Real_t)); // src_ptr
  1264. output_perm.push_back((size_t)(&output_vector[i][0][0]-output_data[0])); // trg_ptr
  1265. assert(output_vector[i]->Dim()==vec_size);
  1266. }
  1267. }
  1268. }
  1269. }
  1270. }
  1271. }
  1272. if(this->dev_buffer.Dim()<buff_size) this->dev_buffer.Resize(buff_size);
  1273. if(this->cpu_buffer.Dim()<buff_size) this->cpu_buffer.Resize(buff_size);
  1274. { // Set interac_data.
  1275. size_t data_size=sizeof(size_t)*4;
  1276. data_size+=sizeof(size_t)+interac_blk.size()*sizeof(size_t);
  1277. data_size+=sizeof(size_t)+interac_cnt.size()*sizeof(size_t);
  1278. data_size+=sizeof(size_t)+interac_mat.size()*sizeof(size_t);
  1279. data_size+=sizeof(size_t)+ input_perm.size()*sizeof(size_t);
  1280. data_size+=sizeof(size_t)+output_perm.size()*sizeof(size_t);
  1281. data_size+=sizeof(size_t)+scaling.size()*sizeof(Real_t);
  1282. if(interac_data.Dim(0)*interac_data.Dim(1)<sizeof(size_t)){
  1283. data_size+=sizeof(size_t);
  1284. interac_data.Resize(1,data_size);
  1285. ((size_t*)&interac_data[0][0])[0]=sizeof(size_t);
  1286. }else{
  1287. size_t pts_data_size=*((size_t*)&interac_data[0][0]);
  1288. assert(interac_data.Dim(0)*interac_data.Dim(1)>=pts_data_size);
  1289. data_size+=pts_data_size;
  1290. if(data_size>interac_data.Dim(0)*interac_data.Dim(1)){ //Resize and copy interac_data.
  1291. Matrix< char> pts_interac_data=interac_data;
  1292. interac_data.Resize(1,data_size);
  1293. mem::memcopy(&interac_data[0][0],&pts_interac_data[0][0],pts_data_size);
  1294. }
  1295. }
  1296. char* data_ptr=&interac_data[0][0];
  1297. data_ptr+=((size_t*)data_ptr)[0];
  1298. ((size_t*)data_ptr)[0]=data_size; data_ptr+=sizeof(size_t);
  1299. ((size_t*)data_ptr)[0]= M_dim0; data_ptr+=sizeof(size_t);
  1300. ((size_t*)data_ptr)[0]= M_dim1; data_ptr+=sizeof(size_t);
  1301. ((size_t*)data_ptr)[0]= dof; data_ptr+=sizeof(size_t);
  1302. ((size_t*)data_ptr)[0]=interac_blk.size(); data_ptr+=sizeof(size_t);
  1303. mem::memcopy(data_ptr, &interac_blk[0], interac_blk.size()*sizeof(size_t));
  1304. data_ptr+=interac_blk.size()*sizeof(size_t);
  1305. ((size_t*)data_ptr)[0]=interac_cnt.size(); data_ptr+=sizeof(size_t);
  1306. mem::memcopy(data_ptr, &interac_cnt[0], interac_cnt.size()*sizeof(size_t));
  1307. data_ptr+=interac_cnt.size()*sizeof(size_t);
  1308. ((size_t*)data_ptr)[0]=interac_mat.size(); data_ptr+=sizeof(size_t);
  1309. mem::memcopy(data_ptr, &interac_mat[0], interac_mat.size()*sizeof(size_t));
  1310. data_ptr+=interac_mat.size()*sizeof(size_t);
  1311. ((size_t*)data_ptr)[0]= input_perm.size(); data_ptr+=sizeof(size_t);
  1312. mem::memcopy(data_ptr, & input_perm[0], input_perm.size()*sizeof(size_t));
  1313. data_ptr+= input_perm.size()*sizeof(size_t);
  1314. ((size_t*)data_ptr)[0]=output_perm.size(); data_ptr+=sizeof(size_t);
  1315. mem::memcopy(data_ptr, &output_perm[0], output_perm.size()*sizeof(size_t));
  1316. data_ptr+=output_perm.size()*sizeof(size_t);
  1317. ((size_t*)data_ptr)[0]=scaling.size(); data_ptr+=sizeof(size_t);
  1318. mem::memcopy(data_ptr, &scaling[0], scaling.size()*sizeof(Real_t));
  1319. data_ptr+=scaling.size()*sizeof(Real_t);
  1320. }
  1321. }
  1322. Profile::Toc();
  1323. if(device){ // Host2Device
  1324. Profile::Tic("Host2Device",&this->comm,false,25);
  1325. setup_data.interac_data .AllocDevice(true);
  1326. Profile::Toc();
  1327. }
  1328. }
// Evaluate one precomputed interaction list (built earlier into
// setup_data.interac_data): for each block of interactions, gather source
// vectors via the input permutation, multiply by the precomputed matrices
// (DGEMM), and scatter into the targets via the output permutation with a
// per-interaction scaling factor. Runs on the host (device==false) or is
// offloaded to MIC (device==true, under __INTEL_OFFLOAD).
template <class FMMNode>
void FMM_Pts<FMMNode>::EvalList(SetupData<Real_t>& setup_data, bool device){
if(setup_data.interac_data.Dim(0)==0 || setup_data.interac_data.Dim(1)==0){
// Empty interaction list: still open/close the profiling sections so
// timing output stays balanced across ranks.
Profile::Tic("Host2Device",&this->comm,false,25);
Profile::Toc();
Profile::Tic("DeviceComp",&this->comm,false,20);
Profile::Toc();
return;
}
Profile::Tic("Host2Device",&this->comm,false,25);
// Select device-side or host-side views of the buffers; no data is copied
// here for the host path (the Device wrappers alias host memory).
typename Vector<char>::Device buff;
typename Matrix<char>::Device precomp_data;
typename Matrix<char>::Device interac_data;
typename Matrix<Real_t>::Device input_data;
typename Matrix<Real_t>::Device output_data;
if(device){
buff = this-> dev_buffer. AllocDevice(false);
precomp_data= setup_data.precomp_data->AllocDevice(false);
interac_data= setup_data.interac_data. AllocDevice(false);
input_data  = setup_data.  input_data->AllocDevice(false);
output_data = setup_data. output_data->AllocDevice(false);
}else{
buff        = this-> cpu_buffer;
precomp_data=*setup_data.precomp_data;
interac_data= setup_data.interac_data;
input_data  =*setup_data.  input_data;
output_data =*setup_data. output_data;
}
Profile::Toc();
Profile::Tic("DeviceComp",&this->comm,false,20);
#ifdef __INTEL_OFFLOAD
// Asynchronous offload: wait for the previous offloaded kernel, then
// signal our own lock so a later call can wait on us.
int lock_idx=-1;
int wait_lock_idx=-1;
if(device) wait_lock_idx=MIC_Lock::curr_lock();
if(device) lock_idx=MIC_Lock::get_lock();
#pragma offload if(device) target(mic:0) signal(&MIC_Lock::lock_vec[device?lock_idx:0])
#endif
{ // Offloaded computation.
// Deserialize interac_data. The fields are read in exactly the order the
// setup phase wrote them: [header skip][data_size][M_dim0][M_dim1][dof]
// then the length-prefixed arrays interac_blk, interac_cnt, interac_mat,
// input_perm, output_perm, scaling. ReInit(...,false) aliases the blob
// in place (no copy, no ownership).
size_t data_size, M_dim0, M_dim1, dof;
Vector<size_t> interac_blk;
Vector<size_t> interac_cnt;
Vector<size_t> interac_mat;
Vector<size_t>  input_perm;
Vector<size_t> output_perm;
Vector<Real_t> scaling;
{ // Set interac_data.
char* data_ptr=&interac_data[0][0];
// First word is the offset of this structure within the blob; skip it.
data_size=((size_t*)data_ptr)[0]; data_ptr+=data_size;
data_size=((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t);
M_dim0   =((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t);
M_dim1   =((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t);
dof      =((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t);
interac_blk.ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false);
data_ptr+=sizeof(size_t)+interac_blk.Dim()*sizeof(size_t);
interac_cnt.ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false);
data_ptr+=sizeof(size_t)+interac_cnt.Dim()*sizeof(size_t);
interac_mat.ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false);
data_ptr+=sizeof(size_t)+interac_mat.Dim()*sizeof(size_t);
input_perm .ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false);
data_ptr+=sizeof(size_t)+ input_perm.Dim()*sizeof(size_t);
output_perm.ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false);
data_ptr+=sizeof(size_t)+output_perm.Dim()*sizeof(size_t);
scaling.ReInit(((size_t*)data_ptr)[0],(Real_t*)(data_ptr+sizeof(size_t)),false);
data_ptr+=sizeof(size_t)+scaling.Dim()*sizeof(Real_t);
}
#ifdef __INTEL_OFFLOAD
if(device) MIC_Lock::wait_lock(wait_lock_idx);
#endif
//Compute interaction from Chebyshev source density.
{ // interactions
int omp_p=omp_get_max_threads();
size_t interac_indx=0;    // running index into the per-interaction arrays
size_t interac_blk_dsp=0; // displacement into interac_cnt/interac_mat for block k
for(size_t k=0;k<interac_blk.Dim();k++){
// Total number of interaction vectors in this block.
size_t vec_cnt=0;
for(size_t j=interac_blk_dsp;j<interac_blk_dsp+interac_blk[k];j++) vec_cnt+=interac_cnt[j];
// Scratch layout: gathered inputs first, DGEMM outputs after them.
char* buff_in =&buff[0];
char* buff_out=&buff[vec_cnt*dof*M_dim0*sizeof(Real_t)];
// Input permutation. Each entry of input_perm is a quadruple:
// [0]=permutation-table offset, [1]=scale-vector offset (both into
// precomp_data), [2]=destination offset into buff_in, [3]=source
// offset into input_data.
#pragma omp parallel for
for(int tid=0;tid<omp_p;tid++){
size_t a=( tid   *vec_cnt)/omp_p;
size_t b=((tid+1)*vec_cnt)/omp_p;
for(size_t i=a;i<b;i++){
const PERM_INT_T*  perm=(PERM_INT_T*)(precomp_data[0]+input_perm[(interac_indx+i)*4+0]);
const     Real_t*  scal=(    Real_t*)(precomp_data[0]+input_perm[(interac_indx+i)*4+1]);
const     Real_t* v_in =(    Real_t*)(  input_data[0]+input_perm[(interac_indx+i)*4+3]);
Real_t*           v_out=(    Real_t*)(     buff_in   +input_perm[(interac_indx+i)*4+2]);
// TODO: Fix for dof>1
#ifdef __MIC__
{ // Vectorized gather: scalar head/tail around an aligned 8-wide body.
__m512d v8;
size_t j_start=(((uintptr_t)(v_out       ) + (uintptr_t)(MEM_ALIGN-1)) & ~ (uintptr_t)(MEM_ALIGN-1))-((uintptr_t)v_out);
size_t j_end  =(((uintptr_t)(v_out+M_dim0)                           ) & ~ (uintptr_t)(MEM_ALIGN-1))-((uintptr_t)v_out);
j_start/=sizeof(Real_t);
j_end  /=sizeof(Real_t);
assert(((uintptr_t)(v_out))%sizeof(Real_t)==0);
assert(((uintptr_t)(v_out+j_start))%64==0);
assert(((uintptr_t)(v_out+j_end  ))%64==0);
size_t j=0;
for(;j<j_start;j++ ){
v_out[j]=v_in[perm[j]]*scal[j];
}
for(;j<j_end  ;j+=8){
v8=_mm512_setr_pd(
v_in[perm[j+0]]*scal[j+0],
v_in[perm[j+1]]*scal[j+1],
v_in[perm[j+2]]*scal[j+2],
v_in[perm[j+3]]*scal[j+3],
v_in[perm[j+4]]*scal[j+4],
v_in[perm[j+5]]*scal[j+5],
v_in[perm[j+6]]*scal[j+6],
v_in[perm[j+7]]*scal[j+7]);
// Non-temporal store: the gathered block is consumed by DGEMM, not re-read here.
_mm512_storenrngo_pd(v_out+j,v8);
}
for(;j<M_dim0 ;j++ ){
v_out[j]=v_in[perm[j]]*scal[j];
}
}
#else
for(size_t j=0;j<M_dim0;j++ ){
v_out[j]=v_in[perm[j]]*scal[j];
}
#endif
}
}
// Apply the precomputed matrices: interactions sharing the same matrix
// (same interac_mat offset) are batched into one DGEMM.
size_t vec_cnt0=0;
for(size_t j=interac_blk_dsp;j<interac_blk_dsp+interac_blk[k];){
size_t vec_cnt1=0;
size_t interac_mat0=interac_mat[j];
for(;j<interac_blk_dsp+interac_blk[k] && interac_mat[j]==interac_mat0;j++) vec_cnt1+=interac_cnt[j];
Matrix<Real_t> M(M_dim0, M_dim1, (Real_t*)(precomp_data[0]+interac_mat0), false);
#ifdef __MIC__
{ // One big DGEMM on MIC (MKL threads internally).
Matrix<Real_t> Ms(dof*vec_cnt1, M_dim0, (Real_t*)(buff_in +M_dim0*vec_cnt0*dof*sizeof(Real_t)), false);
Matrix<Real_t> Mt(dof*vec_cnt1, M_dim1, (Real_t*)(buff_out+M_dim1*vec_cnt0*dof*sizeof(Real_t)), false);
Matrix<Real_t>::DGEMM(Mt,Ms,M);
}
#else
// Split the row range of the batched multiply across OpenMP threads.
#pragma omp parallel for
for(int tid=0;tid<omp_p;tid++){
size_t a=(dof*vec_cnt1*(tid  ))/omp_p;
size_t b=(dof*vec_cnt1*(tid+1))/omp_p;
Matrix<Real_t> Ms(b-a, M_dim0, (Real_t*)(buff_in +M_dim0*vec_cnt0*dof*sizeof(Real_t))+M_dim0*a, false);
Matrix<Real_t> Mt(b-a, M_dim1, (Real_t*)(buff_out+M_dim1*vec_cnt0*dof*sizeof(Real_t))+M_dim1*a, false);
Matrix<Real_t>::DGEMM(Mt,Ms,M);
}
#endif
vec_cnt0+=vec_cnt1;
}
// Output permutation. output_perm quadruples mirror input_perm:
// [0]=perm offset, [1]=scale offset, [2]=source offset in buff_out,
// [3]=destination offset in output_data. Entries with the same [3]
// accumulate into the same target vector, so each thread's [a,b) range
// is widened to a boundary where [3] changes — no two threads ever
// write the same output vector.
#pragma omp parallel for
for(int tid=0;tid<omp_p;tid++){
size_t a=( tid   *vec_cnt)/omp_p;
size_t b=((tid+1)*vec_cnt)/omp_p;
if(tid>      0 && a<vec_cnt){ // Find 'a' independent of other threads.
size_t out_ptr=output_perm[(interac_indx+a)*4+3];
if(tid>      0) while(a<vec_cnt && out_ptr==output_perm[(interac_indx+a)*4+3]) a++;
}
if(tid<omp_p-1 && b<vec_cnt){ // Find 'b' independent of other threads.
size_t out_ptr=output_perm[(interac_indx+b)*4+3];
if(tid<omp_p-1) while(b<vec_cnt && out_ptr==output_perm[(interac_indx+b)*4+3]) b++;
}
for(size_t i=a;i<b;i++){ // Compute permutations.
Real_t scaling_factor=scaling[interac_indx+i];
const PERM_INT_T*  perm=(PERM_INT_T*)(precomp_data[0]+output_perm[(interac_indx+i)*4+0]);
const     Real_t*  scal=(    Real_t*)(precomp_data[0]+output_perm[(interac_indx+i)*4+1]);
const     Real_t* v_in =(    Real_t*)(    buff_out   +output_perm[(interac_indx+i)*4+2]);
Real_t*           v_out=(    Real_t*)( output_data[0]+output_perm[(interac_indx+i)*4+3]);
// TODO: Fix for dof>1
#ifdef __MIC__
{ // Vectorized accumulate-scatter; same aligned head/body/tail split as above.
__m512d v8;
__m512d v_old;
size_t j_start=(((uintptr_t)(v_out       ) + (uintptr_t)(MEM_ALIGN-1)) & ~ (uintptr_t)(MEM_ALIGN-1))-((uintptr_t)v_out);
size_t j_end  =(((uintptr_t)(v_out+M_dim1)                           ) & ~ (uintptr_t)(MEM_ALIGN-1))-((uintptr_t)v_out);
j_start/=sizeof(Real_t);
j_end  /=sizeof(Real_t);
assert(((uintptr_t)(v_out))%sizeof(Real_t)==0);
assert(((uintptr_t)(v_out+j_start))%64==0);
assert(((uintptr_t)(v_out+j_end  ))%64==0);
size_t j=0;
for(;j<j_start;j++ ){
v_out[j]+=v_in[perm[j]]*scal[j]*scaling_factor;
}
for(;j<j_end  ;j+=8){
// Read-modify-write: accumulate (+=) into the existing output.
v_old=_mm512_load_pd(v_out+j);
v8=_mm512_setr_pd(
v_in[perm[j+0]]*scal[j+0]*scaling_factor,
v_in[perm[j+1]]*scal[j+1]*scaling_factor,
v_in[perm[j+2]]*scal[j+2]*scaling_factor,
v_in[perm[j+3]]*scal[j+3]*scaling_factor,
v_in[perm[j+4]]*scal[j+4]*scaling_factor,
v_in[perm[j+5]]*scal[j+5]*scaling_factor,
v_in[perm[j+6]]*scal[j+6]*scaling_factor,
v_in[perm[j+7]]*scal[j+7]*scaling_factor);
v_old=_mm512_add_pd(v_old, v8);
_mm512_storenrngo_pd(v_out+j,v_old);
}
for(;j<M_dim1 ;j++ ){
v_out[j]+=v_in[perm[j]]*scal[j]*scaling_factor;
}
}
#else
for(size_t j=0;j<M_dim1;j++ ){
v_out[j]+=v_in[perm[j]]*scal[j]*scaling_factor;
}
#endif
}
}
interac_indx+=vec_cnt;
interac_blk_dsp+=interac_blk[k];
}
}
#ifdef __INTEL_OFFLOAD
if(device) MIC_Lock::release_lock(lock_idx);
#endif
}
#ifndef __MIC_ASYNCH__
// Synchronous mode: block the host until the offloaded kernel finishes.
#ifdef __INTEL_OFFLOAD
#pragma offload if(device) target(mic:0)
{if(device) MIC_Lock::wait_lock(lock_idx);}
#endif
#endif
Profile::Toc();
}
// Initialize the multipole (upward equivalent) expansion of n leaf nodes at
// the given level from their point sources (single layer) and surface
// densities (double layer): evaluate the potential on the upward-check
// surface, then apply the UC2UE (check-to-equivalent) matrix.
template <class FMMNode>
void FMM_Pts<FMMNode>::InitMultipole(FMMNode** node, size_t n, int level){
if(n==0) return;
int dof=1;
//Get the upward-check surface (precomputed once per depth, centered at origin).
std::vector<Real_t> uc_coord[MAX_DEPTH+1];
size_t uc_cnt;
{
Real_t c[3]={0,0,0};
for(int i=0;i<=MAX_DEPTH;i++)
uc_coord[i]=u_check_surf(MultipoleOrder(),c,i);
uc_cnt=uc_coord[0].size()/COORD_DIM;
}
//Read matrices.
Matrix<Real_t>& M_uc2ue = this->mat->Mat(level, UC2UE_Type, 0);
//Compute multipole expansion. Nodes are statically partitioned across threads.
#pragma omp parallel
{
int omp_p=omp_get_num_threads();
int pid = omp_get_thread_num();
size_t a=(pid*n)/omp_p;
size_t b=((pid+1)*n)/omp_p;
Matrix<Real_t> u_check(dof, M_uc2ue.Dim(0)); // per-thread check-potential scratch
for (size_t j=a;j<b;j++){
Vector<Real_t>& upward_equiv=node[j]->FMMData()->upward_equiv;
assert(upward_equiv.Dim()==M_uc2ue.Dim(1)*dof);
// Alias the node's equivalent density as a (dof x M_uc2ue.Dim(1)) matrix (no copy).
Matrix<Real_t> u_equiv(dof,M_uc2ue.Dim(1),&upward_equiv[0],false);
u_equiv.SetZero(); // TODO: Do in setup
//Compute multipole expansion from point sources.
size_t pt_cnt=node[j]->src_coord.Dim()/COORD_DIM;
size_t surf_pt_cnt=node[j]->surf_coord.Dim()/COORD_DIM;
if(pt_cnt+surf_pt_cnt>0){
//Translate the depth-indexed check surface to this node's corner coordinate.
Real_t* c=node[j]->Coord();
std::vector<Real_t> uc_coord1(uc_cnt*COORD_DIM);
for(size_t i=0;i<uc_cnt;i++) for(int k=0;k<COORD_DIM;k++)
uc_coord1[i*COORD_DIM+k]=uc_coord[node[j]->Depth()][i*COORD_DIM+k]+c[k];
Profile::Add_FLOP((long long)uc_cnt*(long long)COORD_DIM);
//Compute upward check potential: single-layer from sources, double-layer from surfaces.
u_check.SetZero();
if(pt_cnt>0)
aux_kernel.ker_poten(&node[j]->src_coord[0], pt_cnt,
&node[j]->src_value[0], dof,
&uc_coord1[0], uc_cnt, u_check[0]);
if(surf_pt_cnt>0)
aux_kernel.dbl_layer_poten(&node[j]->surf_coord[0], surf_pt_cnt,
&node[j]->surf_value[0], dof,
&uc_coord1[0], uc_cnt, u_check[0]);
//Rearrange data: transpose in place from kernel-dimension-major to the
//point-major layout expected by the UC2UE matrix.
{
Matrix<Real_t> M_tmp(M_uc2ue.Dim(0)/aux_kernel.ker_dim[1],aux_kernel.ker_dim[1]*dof, &u_check[0][0], false);
M_tmp=M_tmp.Transpose();
for(int i=0;i<dof;i++){
Matrix<Real_t> M_tmp1(aux_kernel.ker_dim[1], M_uc2ue.Dim(0)/aux_kernel.ker_dim[1], &u_check[i][0], false);
M_tmp1=M_tmp1.Transpose();
}
}
//Compute UC2UE (vector-matrix multiply): u_equiv += u_check * M_uc2ue.
Matrix<Real_t>::DGEMM(u_equiv,u_check,M_uc2ue,1.0);
}
//Adjusting for scale: for homogeneous kernels rescale by the depth-dependent factor.
if(Homogen()){
Real_t scale_factor=pow(0.5, node[j]->Depth()*aux_kernel.poten_scale);
for(size_t i=0;i<upward_equiv.Dim();i++)
upward_equiv[i]*=scale_factor;
}
}
}
}
// Upward pass (M2M): accumulate each child's upward equivalent expansion into
// its parent's, using one precomputed U2U translation matrix per child octant
// (mat_indx). Gather all matching children into one matrix, do a single
// batched DGEMM, then scatter-add the results back to the parents.
template <class FMMNode>
void FMM_Pts<FMMNode>::Up2Up(FMMNode** nodes, size_t n, int level){
if(n==0) return;
int dof=1;
size_t n_eq=this->interac_list.ClassMat(level,U2U_Type,0).Dim(1);
size_t mat_cnt=interac_list.ListCount(U2U_Type);
//For all non-leaf, non-ghost nodes. Nodes are statically partitioned across threads.
#pragma omp parallel
{
int omp_p=omp_get_num_threads();
int pid = omp_get_thread_num();
size_t a=(pid*n)/omp_p;
size_t b=((pid+1)*n)/omp_p;
size_t cnt;
//Initialize this node's upward equivalent data
for(size_t i=a;i<b;i++){
Vector<Real_t>& upward_equiv=nodes[i]->FMMData()->upward_equiv;
assert(upward_equiv.Dim()==dof*n_eq);
upward_equiv.SetZero(); // TODO: Do in setup
UNUSED(upward_equiv);
UNUSED(n_eq);
}
for(size_t mat_indx=0;mat_indx<mat_cnt;mat_indx++){
Matrix<Real_t>& M = this->mat->Mat(level, U2U_Type, mat_indx);
if(M.Dim(0)!=0 && M.Dim(1)!=0){
// Pass 1: count contributing children (child mat_indx exists with data).
cnt=0;
for(size_t i=a;i<b;i++)
if(/*!nodes[i]->IsGhost() && */!nodes[i]->IsLeaf()) if(((FMMNode*)nodes[i]->Child(mat_indx))->FMMData()->upward_equiv.Dim()>0) cnt++;
Matrix<Real_t> src_vec(cnt*dof,M.Dim(0));
Matrix<Real_t> trg_vec(cnt*dof,M.Dim(1));
//Get child's multipole expansion. Pass 2: gather (must visit nodes in the
//same order as pass 1 so row indices line up).
cnt=0;
for(size_t i=a;i<b;i++)
if(/*!nodes[i]->IsGhost() && */!nodes[i]->IsLeaf()) if(((FMMNode*)nodes[i]->Child(mat_indx))->FMMData()->upward_equiv.Dim()>0){
Vector<Real_t>& child_equiv=static_cast<FMMNode*>(nodes[i]->Child(mat_indx))->FMMData()->upward_equiv;
mem::memcopy(&(src_vec[cnt*dof][0]), &(child_equiv[0]), dof*M.Dim(0)*sizeof(Real_t));
cnt++;
}
// One batched multiply for all children of this octant: trg = src * M.
Matrix<Real_t>::DGEMM(trg_vec,src_vec,M);
// Pass 3: scatter-add each result row into the parent's expansion (same order again).
cnt=0;
size_t vec_len=M.Dim(1)*dof;
for(size_t i=a;i<b;i++)
if(/*!nodes[i]->IsGhost() && */!nodes[i]->IsLeaf()) if(((FMMNode*)nodes[i]->Child(mat_indx))->FMMData()->upward_equiv.Dim()>0){
Vector<Real_t>& upward_equiv=nodes[i]->FMMData()->upward_equiv;
assert(upward_equiv.Dim()==vec_len);
Matrix<Real_t> equiv  (1,vec_len,&upward_equiv[0],false);
Matrix<Real_t> equiv_ (1,vec_len,trg_vec[cnt*dof],false);
equiv+=equiv_;
cnt++;
}
}
}
}
}
  1679. template <class FMMNode>
  1680. void FMM_Pts<FMMNode>::PeriodicBC(FMMNode* node){
  1681. Matrix<Real_t>& M = Precomp(0, BC_Type, 0);
  1682. assert(node->FMMData()->upward_equiv.Dim()>0);
  1683. int dof=1;
  1684. Vector<Real_t>& upward_equiv=node->FMMData()->upward_equiv;
  1685. Vector<Real_t>& dnward_equiv=node->FMMData()->dnward_equiv;
  1686. assert(upward_equiv.Dim()==M.Dim(0)*dof);
  1687. assert(dnward_equiv.Dim()==M.Dim(1)*dof);
  1688. Matrix<Real_t> d_equiv(dof,M.Dim(0),&dnward_equiv[0],false);
  1689. Matrix<Real_t> u_equiv(dof,M.Dim(1),&upward_equiv[0],false);
  1690. Matrix<Real_t>::DGEMM(d_equiv,u_equiv,M);
  1691. }
// V-list preprocessing: embed each parent node's eight children's upward
// equivalent surface data into a regular (2m)^3 grid, run a real-to-complex
// 3-D FFT over all children at once, and write the interleaved frequency-
// domain result (child index innermost) to output_data for the Hadamard stage.
// buffer_ provides fftsize_in scratch per OpenMP thread.
template <class FMMNode>
void FMM_Pts<FMMNode>::FFT_UpEquiv(size_t dof, size_t m, size_t ker_dim0, Vector<size_t>& fft_vec,
Vector<Real_t>& input_data, Vector<Real_t>& output_data, Vector<Real_t>& buffer_){
size_t n1=m*2;           // grid points per dimension
size_t n2=n1*n1;
size_t n3=n1*n2;         // real grid size
size_t n3_=n2*(n1/2+1);  // complex grid size of the r2c transform
size_t chld_cnt=1UL<<COORD_DIM; // 8 children per node
size_t fftsize_in =2*n3_*chld_cnt*ker_dim0*dof;
int omp_p=omp_get_max_threads();
//Load permutation map: for each of the n equivalent-surface points, its
//linear index in the (2m)^3 grid. Cached in a function-local static and
//rebuilt only when the surface size changes.
// NOTE(review): the static cache is not synchronized — assumes this routine
// is not entered concurrently with a different m; confirm at call sites.
size_t n=6*(m-1)*(m-1)+2;
static Vector<size_t> map;
{ // Build map to reorder upward_equiv
size_t n_old=map.Dim();
if(n_old!=n){
Real_t c[3]={0,0,0};
Vector<Real_t> surf=surface(m, c, (Real_t)(m-1), 0);
map.Resize(surf.Dim()/COORD_DIM);
for(size_t i=0;i<map.Dim();i++)
map[i]=((size_t)(m-1-surf[i*3]+0.5))+((size_t)(m-1-surf[i*3+1]+0.5))*n1+((size_t)(m-1-surf[i*3+2]+0.5))*n2;
}
}
{ // Build FFTW plan (once, guarded by vlist_fft_flag); plan does
  // ker_dim0*chld_cnt transforms in one batch. The temporary arrays exist
  // only for planning and are freed immediately.
if(!vlist_fft_flag){
int nnn[3]={(int)n1,(int)n1,(int)n1};
void *fftw_in, *fftw_out;
fftw_in  = mem::aligned_malloc<Real_t>(  n3 *ker_dim0*chld_cnt);
fftw_out = mem::aligned_malloc<Real_t>(2*n3_*ker_dim0*chld_cnt);
vlist_fftplan = FFTW_t<Real_t>::fft_plan_many_dft_r2c(COORD_DIM,nnn,ker_dim0*chld_cnt,
(Real_t*)fftw_in, NULL, 1, n3, (typename FFTW_t<Real_t>::cplx*)(fftw_out),NULL, 1, n3_, FFTW_ESTIMATE);
mem::aligned_free<Real_t>((Real_t*)fftw_in );
mem::aligned_free<Real_t>((Real_t*)fftw_out);
vlist_fft_flag=true;
}
}
{ // Offload section
size_t n_in = fft_vec.Dim();
#pragma omp parallel for
for(int pid=0; pid<omp_p; pid++){
size_t node_start=(n_in*(pid  ))/omp_p;
size_t node_end  =(n_in*(pid+1))/omp_p;
Vector<Real_t> buffer(fftsize_in, &buffer_[fftsize_in*pid], false); // per-thread scratch
for(size_t node_idx=node_start; node_idx<node_end; node_idx++){
// Pointers to the eight children's equivalent data within input_data.
Vector<Real_t*> upward_equiv(chld_cnt);
for(size_t i=0;i<chld_cnt;i++) upward_equiv[i]=&input_data[0] + fft_vec[node_idx] + n*ker_dim0*dof*i;
Vector<Real_t> upward_equiv_fft(fftsize_in, &output_data[fftsize_in *node_idx], false);
upward_equiv_fft.SetZero();
// Rearrange upward equivalent data: scatter surface values into the
// (2m)^3 grid at the precomputed map[] positions (grid interior stays zero).
for(size_t k=0;k<n;k++){
size_t idx=map[k];
for(int j1=0;j1<dof;j1++)
for(int j0=0;j0<(int)chld_cnt;j0++)
for(int i=0;i<ker_dim0;i++)
upward_equiv_fft[idx+(j0+(i+j1*ker_dim0)*chld_cnt)*n3]=upward_equiv[j0][ker_dim0*(n*j1+k)+i];
}
// Compute FFT (batched r2c into the thread-local scratch buffer).
for(int i=0;i<dof;i++)
FFTW_t<Real_t>::fft_execute_dft_r2c(vlist_fftplan, (Real_t*)&upward_equiv_fft[i*  n3 *ker_dim0*chld_cnt],
(typename FFTW_t<Real_t>::cplx*)&buffer          [i*2*n3_*ker_dim0*chld_cnt]);
//Compute flops.
#ifndef FFTW3_MKL
double add, mul, fma;
fftw_flops(vlist_fftplan, &add, &mul, &fma);
#ifndef __INTEL_OFFLOAD0
Profile::Add_FLOP((long long)(add+mul+2*fma));
#endif
#endif
// Transpose the layout from frequency-major (per child) to child-major
// (per frequency), which is what the Hadamard-product stage consumes.
for(int i=0;i<ker_dim0*dof;i++)
for(size_t j=0;j<n3_;j++)
for(size_t k=0;k<chld_cnt;k++){
upward_equiv_fft[2*(chld_cnt*(n3_*i+j)+k)+0]=buffer[2*(n3_*(chld_cnt*i+k)+j)+0];
upward_equiv_fft[2*(chld_cnt*(n3_*i+j)+k)+1]=buffer[2*(n3_*(chld_cnt*i+k)+j)+1];
}
}
}
}
}
// V-list postprocessing (inverse of FFT_UpEquiv): de-interleave the
// frequency-domain downward-check data, run a complex-to-real inverse 3-D FFT
// per node, sample the (2m)^3 grid back onto the check surface via the
// precomputed map, and apply the check-to-equivalent matrix M for each child.
// buffer_ provides fftsize_out scratch per OpenMP thread.
template <class FMMNode>
void FMM_Pts<FMMNode>::FFT_Check2Equiv(size_t dof, size_t m, size_t ker_dim1, Vector<size_t>& ifft_vec,
Vector<Real_t>& input_data, Vector<Real_t>& output_data, Vector<Real_t>& buffer_, Matrix<Real_t>& M){
size_t n1=m*2;           // grid points per dimension
size_t n2=n1*n1;
size_t n3=n1*n2;         // real grid size
size_t n3_=n2*(n1/2+1);  // complex grid size of the c2r transform
size_t chld_cnt=1UL<<COORD_DIM; // 8 children per node
size_t fftsize_out=2*n3_*dof*ker_dim1*chld_cnt;
int omp_p=omp_get_max_threads();
//Load permutation map: grid index of each downward-check surface point.
//Cached in a function-local static, rebuilt only when the size changes.
// NOTE(review): unsynchronized static cache — same caveat as in FFT_UpEquiv.
size_t n=6*(m-1)*(m-1)+2;
static Vector<size_t> map;
{ // Build map to reorder dnward_check
size_t n_old=map.Dim();
if(n_old!=n){
Real_t c[3]={0,0,0};
Vector<Real_t> surf=surface(m, c, (Real_t)(m-1), 0);
map.Resize(surf.Dim()/COORD_DIM);
for(size_t i=0;i<map.Dim();i++)
map[i]=((size_t)(m*2-0.5-surf[i*3]))+((size_t)(m*2-0.5-surf[i*3+1]))*n1+((size_t)(m*2-0.5-surf[i*3+2]))*n2;
//map;//.AllocDevice(true);
}
}
{ // Build FFTW plan (once, guarded by vlist_ifft_flag); batched c2r over
  // ker_dim1*chld_cnt transforms. Planning arrays are freed immediately.
if(!vlist_ifft_flag){
//Build FFTW plan.
int nnn[3]={(int)n1,(int)n1,(int)n1};
void *fftw_in, *fftw_out;
fftw_in  = fftw_malloc(2*n3_*ker_dim1*sizeof(Real_t)*chld_cnt);
fftw_out = fftw_malloc(  n3 *ker_dim1*sizeof(Real_t)*chld_cnt);
vlist_ifftplan = FFTW_t<Real_t>::fft_plan_many_dft_c2r(COORD_DIM,nnn,ker_dim1*chld_cnt,
(typename FFTW_t<Real_t>::cplx*)fftw_in, NULL, 1, n3_, (Real_t*)(fftw_out),NULL, 1, n3, FFTW_ESTIMATE);
fftw_free(fftw_in);
fftw_free(fftw_out);
vlist_ifft_flag=true;
}
}
{ // Offload section
size_t n_out=ifft_vec.Dim();
#pragma omp parallel for
for(int pid=0; pid<omp_p; pid++){
size_t node_start=(n_out*(pid  ))/omp_p;
size_t node_end  =(n_out*(pid+1))/omp_p;
Vector<Real_t> buffer(fftsize_out, &buffer_[fftsize_out*pid], false); // per-thread scratch
for(size_t node_idx=node_start; node_idx<node_end; node_idx++){
Vector<Real_t> dnward_check_fft(fftsize_out, &input_data[fftsize_out*node_idx], false);
//De-interleave data: child-major (Hadamard output) back to
//frequency-major layout expected by the batched c2r plan.
for(int i=0;i<ker_dim1*dof;i++)
for(size_t j=0;j<n3_;j++)
for(size_t k=0;k<chld_cnt;k++){
buffer[2*(n3_*(ker_dim1*dof*k+i)+j)+0]=dnward_check_fft[2*(chld_cnt*(n3_*i+j)+k)+0];
buffer[2*(n3_*(ker_dim1*dof*k+i)+j)+1]=dnward_check_fft[2*(chld_cnt*(n3_*i+j)+k)+1];
}
// Compute FFT (inverse transform back into dnward_check_fft, reused as scratch).
for(int i=0;i<dof;i++)
FFTW_t<Real_t>::fft_execute_dft_c2r(vlist_ifftplan, (typename FFTW_t<Real_t>::cplx*)&buffer          [i*2*n3_*ker_dim1*chld_cnt],
(Real_t*)&dnward_check_fft[i*  n3 *ker_dim1*chld_cnt]);
//Compute flops.
#ifndef FFTW3_MKL
double add, mul, fma;
fftw_flops(vlist_ifftplan, &add, &mul, &fma);
#ifndef __INTEL_OFFLOAD0
Profile::Add_FLOP((long long)(add+mul+2*fma));
#endif
#endif
// Rearrange downward check data: gather grid values at the map[]
// positions back into per-child surface vectors (in buffer).
for(size_t k=0;k<n;k++){
size_t idx=map[k];
for(int j1=0;j1<dof;j1++)
for(int j0=0;j0<(int)chld_cnt;j0++)
for(int i=0;i<ker_dim1;i++)
buffer[ker_dim1*(n*(dof*j0+j1)+k)+i]=dnward_check_fft[idx+(j1+(i+j0*ker_dim1)*dof)*n3];
}
// Compute check to equiv: d_equiv += d_check * M for each child, accumulated
// into the node's downward equivalent data at ifft_vec[node_idx].
for(size_t j=0;j<chld_cnt;j++){
Matrix<Real_t> d_check(dof,M.Dim(0),&buffer[n*ker_dim1*dof*j],false);
Matrix<Real_t> d_equiv(dof,M.Dim(1),&output_data[0] + ifft_vec[node_idx] + M.Dim(1)*dof*j,false);
Matrix<Real_t>::DGEMM(d_equiv,d_check,M,1.0);
}
}
}
}
}
  1854. template <class Real_t>
  1855. void VListHadamard(size_t dof, size_t M_dim, size_t ker_dim0, size_t ker_dim1, Vector<size_t>& interac_dsp,
  1856. Vector<size_t>& interac_vec, Vector<Real_t*>& precomp_mat, Vector<Real_t>& fft_in, Vector<Real_t>& fft_out){
  1857. size_t chld_cnt=1UL<<COORD_DIM;
  1858. size_t fftsize_in =M_dim*ker_dim0*chld_cnt*2;
  1859. size_t fftsize_out=M_dim*ker_dim1*chld_cnt*2;
  1860. Real_t* zero_vec0=mem::aligned_malloc<Real_t>(fftsize_in );
  1861. Real_t* zero_vec1=mem::aligned_malloc<Real_t>(fftsize_out);
  1862. size_t n_out=fft_out.Dim()/fftsize_out;
  1863. // Set buff_out to zero.
  1864. #pragma omp parallel for
  1865. for(size_t k=0;k<n_out;k++){
  1866. Vector<Real_t> dnward_check_fft(fftsize_out, &fft_out[k*fftsize_out], false);
  1867. dnward_check_fft.SetZero();
  1868. }
  1869. // Build list of interaction pairs (in, out vectors).
  1870. size_t mat_cnt=precomp_mat.Dim();
  1871. size_t blk1_cnt=interac_dsp.Dim()/mat_cnt;
  1872. Real_t** IN_ =new Real_t*[2*V_BLK_SIZE*blk1_cnt*mat_cnt];
  1873. Real_t** OUT_=new Real_t*[2*V_BLK_SIZE*blk1_cnt*mat_cnt];
  1874. #pragma omp parallel for
  1875. for(size_t interac_blk1=0; interac_blk1<blk1_cnt*mat_cnt; interac_blk1++){
  1876. size_t interac_dsp0 = (interac_blk1==0?0:interac_dsp[interac_blk1-1]);
  1877. size_t interac_dsp1 = interac_dsp[interac_blk1 ] ;
  1878. size_t interac_cnt = interac_dsp1-interac_dsp0;
  1879. for(size_t j=0;j<interac_cnt;j++){
  1880. IN_ [2*V_BLK_SIZE*interac_blk1 +j]=&fft_in [interac_vec[(interac_dsp0+j)*2+0]];
  1881. OUT_[2*V_BLK_SIZE*interac_blk1 +j]=&fft_out[interac_vec[(interac_dsp0+j)*2+1]];
  1882. }
  1883. IN_ [2*V_BLK_SIZE*interac_blk1 +interac_cnt]=zero_vec0;
  1884. OUT_[2*V_BLK_SIZE*interac_blk1 +interac_cnt]=zero_vec1;
  1885. }
  1886. int omp_p=omp_get_max_threads();
  1887. #pragma omp parallel for
  1888. for(int pid=0; pid<omp_p; pid++){
  1889. size_t a=( pid *M_dim)/omp_p;
  1890. size_t b=((pid+1)*M_dim)/omp_p;
  1891. for(size_t blk1=0; blk1<blk1_cnt; blk1++)
  1892. for(size_t k=a; k< b; k++)
  1893. for(size_t mat_indx=0; mat_indx< mat_cnt;mat_indx++){
  1894. size_t interac_blk1 = blk1*mat_cnt+mat_indx;
  1895. size_t interac_dsp0 = (interac_blk1==0?0:interac_dsp[interac_blk1-1]);
  1896. size_t interac_dsp1 = interac_dsp[interac_blk1 ] ;
  1897. size_t interac_cnt = interac_dsp1-interac_dsp0;
  1898. Real_t** IN = IN_ + 2*V_BLK_SIZE*interac_blk1;
  1899. Real_t** OUT= OUT_+ 2*V_BLK_SIZE*interac_blk1;
  1900. Real_t* M = precomp_mat[mat_indx] + k*chld_cnt*chld_cnt*2;
  1901. #ifdef __SSE__
  1902. if (mat_indx +1 < mat_cnt){ // Prefetch
  1903. _mm_prefetch(((char *)(precomp_mat[mat_indx+1] + k*chld_cnt*chld_cnt*2)), _MM_HINT_T0);
  1904. _mm_prefetch(((char *)(precomp_mat[mat_indx+1] + k*chld_cnt*chld_cnt*2) + 64), _MM_HINT_T0);
  1905. }
  1906. #endif
  1907. for(int in_dim=0;in_dim<ker_dim0;in_dim++)
  1908. for(int ot_dim=0;ot_dim<ker_dim1;ot_dim++){
  1909. for(size_t j=0;j<interac_cnt;j+=2){
  1910. Real_t* M_ = M;
  1911. Real_t* IN0 = IN [j+0] + (in_dim*M_dim+k)*chld_cnt*2;
  1912. Real_t* IN1 = IN [j+1] + (in_dim*M_dim+k)*chld_cnt*2;
  1913. Real_t* OUT0 = OUT[j+0] + (ot_dim*M_dim+k)*chld_cnt*2;
  1914. Real_t* OUT1 = OUT[j+1] + (ot_dim*M_dim+k)*chld_cnt*2;
  1915. #ifdef __SSE__
  1916. if (j+2 < interac_cnt) { // Prefetch
  1917. _mm_prefetch(((char *)(IN[j+2] + (in_dim*M_dim+k)*chld_cnt*2)), _MM_HINT_T0);
  1918. _mm_prefetch(((char *)(IN[j+2] + (in_dim*M_dim+k)*chld_cnt*2) + 64), _MM_HINT_T0);
  1919. _mm_prefetch(((char *)(IN[j+3] + (in_dim*M_dim+k)*chld_cnt*2)), _MM_HINT_T0);
  1920. _mm_prefetch(((char *)(IN[j+3] + (in_dim*M_dim+k)*chld_cnt*2) + 64), _MM_HINT_T0);
  1921. _mm_prefetch(((char *)(OUT[j+2] + (ot_dim*M_dim+k)*chld_cnt*2)), _MM_HINT_T0);
  1922. _mm_prefetch(((char *)(OUT[j+2] + (ot_dim*M_dim+k)*chld_cnt*2) + 64), _MM_HINT_T0);
  1923. _mm_prefetch(((char *)(OUT[j+3] + (ot_dim*M_dim+k)*chld_cnt*2)), _MM_HINT_T0);
  1924. _mm_prefetch(((char *)(OUT[j+3] + (ot_dim*M_dim+k)*chld_cnt*2) + 64), _MM_HINT_T0);
  1925. }
  1926. #endif
  1927. #ifdef __AVX__ //AVX code.
  1928. __m256d out00,out01,out10,out11;
  1929. __m256d out20,out21,out30,out31;
  1930. double* in0__ = IN0;
  1931. double* in1__ = IN1;
  1932. out00 = _mm256_load_pd(OUT0);
  1933. out01 = _mm256_load_pd(OUT1);
  1934. out10 = _mm256_load_pd(OUT0+4);
  1935. out11 = _mm256_load_pd(OUT1+4);
  1936. out20 = _mm256_load_pd(OUT0+8);
  1937. out21 = _mm256_load_pd(OUT1+8);
  1938. out30 = _mm256_load_pd(OUT0+12);
  1939. out31 = _mm256_load_pd(OUT1+12);
  1940. for(int i2=0;i2<8;i2+=2){
  1941. __m256d m00;
  1942. __m256d ot00;
  1943. __m256d mt0,mtt0;
  1944. __m256d in00,in00_r,in01,in01_r;
  1945. in00 = _mm256_broadcast_pd((const __m128d*)in0__);
  1946. in00_r = _mm256_permute_pd(in00,5);
  1947. in01 = _mm256_broadcast_pd((const __m128d*)in1__);
  1948. in01_r = _mm256_permute_pd(in01,5);
  1949. m00 = _mm256_load_pd(M_);
  1950. mt0 = _mm256_unpacklo_pd(m00,m00);
  1951. ot00 = _mm256_mul_pd(mt0,in00);
  1952. mtt0 = _mm256_unpackhi_pd(m00,m00);
  1953. out00 = _mm256_add_pd(out00,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in00_r)));
  1954. ot00 = _mm256_mul_pd(mt0,in01);
  1955. out01 = _mm256_add_pd(out01,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in01_r)));
  1956. m00 = _mm256_load_pd(M_+4);
  1957. mt0 = _mm256_unpacklo_pd(m00,m00);
  1958. ot00 = _mm256_mul_pd(mt0,in00);
  1959. mtt0 = _mm256_unpackhi_pd(m00,m00);
  1960. out10 = _mm256_add_pd(out10,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in00_r)));
  1961. ot00 = _mm256_mul_pd(mt0,in01);
  1962. out11 = _mm256_add_pd(out11,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in01_r)));
  1963. m00 = _mm256_load_pd(M_+8);
  1964. mt0 = _mm256_unpacklo_pd(m00,m00);
  1965. ot00 = _mm256_mul_pd(mt0,in00);
  1966. mtt0 = _mm256_unpackhi_pd(m00,m00);
  1967. out20 = _mm256_add_pd(out20,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in00_r)));
  1968. ot00 = _mm256_mul_pd(mt0,in01);
  1969. out21 = _mm256_add_pd(out21,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in01_r)));
  1970. m00 = _mm256_load_pd(M_+12);
  1971. mt0 = _mm256_unpacklo_pd(m00,m00);
  1972. ot00 = _mm256_mul_pd(mt0,in00);
  1973. mtt0 = _mm256_unpackhi_pd(m00,m00);
  1974. out30 = _mm256_add_pd(out30,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in00_r)));
  1975. ot00 = _mm256_mul_pd(mt0,in01);
  1976. out31 = _mm256_add_pd(out31,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in01_r)));
  1977. in00 = _mm256_broadcast_pd((const __m128d*) (in0__+2));
  1978. in00_r = _mm256_permute_pd(in00,5);
  1979. in01 = _mm256_broadcast_pd((const __m128d*) (in1__+2));
  1980. in01_r = _mm256_permute_pd(in01,5);
  1981. m00 = _mm256_load_pd(M_+16);
  1982. mt0 = _mm256_unpacklo_pd(m00,m00);
  1983. ot00 = _mm256_mul_pd(mt0,in00);
  1984. mtt0 = _mm256_unpackhi_pd(m00,m00);
  1985. out00 = _mm256_add_pd(out00,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in00_r)));
  1986. ot00 = _mm256_mul_pd(mt0,in01);
  1987. out01 = _mm256_add_pd(out01,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in01_r)));
  1988. m00 = _mm256_load_pd(M_+20);
  1989. mt0 = _mm256_unpacklo_pd(m00,m00);
  1990. ot00 = _mm256_mul_pd(mt0,in00);
  1991. mtt0 = _mm256_unpackhi_pd(m00,m00);
  1992. out10 = _mm256_add_pd(out10,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in00_r)));
  1993. ot00 = _mm256_mul_pd(mt0,in01);
  1994. out11 = _mm256_add_pd(out11,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in01_r)));
  1995. m00 = _mm256_load_pd(M_+24);
  1996. mt0 = _mm256_unpacklo_pd(m00,m00);
  1997. ot00 = _mm256_mul_pd(mt0,in00);
  1998. mtt0 = _mm256_unpackhi_pd(m00,m00);
  1999. out20 = _mm256_add_pd(out20,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in00_r)));
  2000. ot00 = _mm256_mul_pd(mt0,in01);
  2001. out21 = _mm256_add_pd(out21,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in01_r)));
  2002. m00 = _mm256_load_pd(M_+28);
  2003. mt0 = _mm256_unpacklo_pd(m00,m00);
  2004. ot00 = _mm256_mul_pd(mt0,in00);
  2005. mtt0 = _mm256_unpackhi_pd(m00,m00);
  2006. out30 = _mm256_add_pd(out30,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in00_r)));
  2007. ot00 = _mm256_mul_pd(mt0,in01);
  2008. out31 = _mm256_add_pd(out31,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in01_r)));
  2009. M_ += 32;
  2010. in0__ += 4;
  2011. in1__ += 4;
  2012. }
  2013. _mm256_store_pd(OUT0,out00);
  2014. _mm256_store_pd(OUT1,out01);
  2015. _mm256_store_pd(OUT0+4,out10);
  2016. _mm256_store_pd(OUT1+4,out11);
  2017. _mm256_store_pd(OUT0+8,out20);
  2018. _mm256_store_pd(OUT1+8,out21);
  2019. _mm256_store_pd(OUT0+12,out30);
  2020. _mm256_store_pd(OUT1+12,out31);
  2021. #elif defined __SSE3__ // SSE code.
  2022. __m128d out00, out01, out10, out11;
  2023. __m128d in00, in01, in10, in11;
  2024. __m128d m00, m01, m10, m11;
  2025. //#pragma unroll
  2026. for(int i1=0;i1<8;i1+=2){
  2027. double* IN0_=IN0;
  2028. double* IN1_=IN1;
  2029. out00 =_mm_load_pd (OUT0 );
  2030. out10 =_mm_load_pd (OUT0+2);
  2031. out01 =_mm_load_pd (OUT1 );
  2032. out11 =_mm_load_pd (OUT1+2);
  2033. //#pragma unroll
  2034. for(int i2=0;i2<8;i2+=2){
  2035. m00 =_mm_load1_pd (M_ );
  2036. m10 =_mm_load1_pd (M_+ 2);
  2037. m01 =_mm_load1_pd (M_+16);
  2038. m11 =_mm_load1_pd (M_+18);
  2039. in00 =_mm_load_pd (IN0_ );
  2040. in10 =_mm_load_pd (IN0_+2);
  2041. in01 =_mm_load_pd (IN1_ );
  2042. in11 =_mm_load_pd (IN1_+2);
  2043. out00 = _mm_add_pd (out00, _mm_mul_pd(m00 , in00 ));
  2044. out00 = _mm_add_pd (out00, _mm_mul_pd(m01 , in10 ));
  2045. out01 = _mm_add_pd (out01, _mm_mul_pd(m00 , in01 ));
  2046. out01 = _mm_add_pd (out01, _mm_mul_pd(m01 , in11 ));
  2047. out10 = _mm_add_pd (out10, _mm_mul_pd(m10 , in00 ));
  2048. out10 = _mm_add_pd (out10, _mm_mul_pd(m11 , in10 ));
  2049. out11 = _mm_add_pd (out11, _mm_mul_pd(m10 , in01 ));
  2050. out11 = _mm_add_pd (out11, _mm_mul_pd(m11 , in11 ));
  2051. m00 =_mm_load1_pd (M_+ 1);
  2052. m10 =_mm_load1_pd (M_+ 2+1);
  2053. m01 =_mm_load1_pd (M_+16+1);
  2054. m11 =_mm_load1_pd (M_+18+1);
  2055. in00 =_mm_shuffle_pd (in00,in00,_MM_SHUFFLE2(0,1));
  2056. in01 =_mm_shuffle_pd (in01,in01,_MM_SHUFFLE2(0,1));
  2057. in10 =_mm_shuffle_pd (in10,in10,_MM_SHUFFLE2(0,1));
  2058. in11 =_mm_shuffle_pd (in11,in11,_MM_SHUFFLE2(0,1));
  2059. out00 = _mm_addsub_pd(out00, _mm_mul_pd(m00, in00));
  2060. out00 = _mm_addsub_pd(out00, _mm_mul_pd(m01, in10));
  2061. out01 = _mm_addsub_pd(out01, _mm_mul_pd(m00, in01));
  2062. out01 = _mm_addsub_pd(out01, _mm_mul_pd(m01, in11));
  2063. out10 = _mm_addsub_pd(out10, _mm_mul_pd(m10, in00));
  2064. out10 = _mm_addsub_pd(out10, _mm_mul_pd(m11, in10));
  2065. out11 = _mm_addsub_pd(out11, _mm_mul_pd(m10, in01));
  2066. out11 = _mm_addsub_pd(out11, _mm_mul_pd(m11, in11));
  2067. M_+=32; // Jump to (column+2).
  2068. IN0_+=4;
  2069. IN1_+=4;
  2070. }
  2071. _mm_store_pd (OUT0 ,out00);
  2072. _mm_store_pd (OUT0+2,out10);
  2073. _mm_store_pd (OUT1 ,out01);
  2074. _mm_store_pd (OUT1+2,out11);
  2075. M_+=4-64*2; // Jump back to first column (row+2).
  2076. OUT0+=4;
  2077. OUT1+=4;
  2078. }
  2079. #else // Generic code.
  2080. Real_t out_reg000, out_reg001, out_reg010, out_reg011;
  2081. Real_t out_reg100, out_reg101, out_reg110, out_reg111;
  2082. Real_t in_reg000, in_reg001, in_reg010, in_reg011;
  2083. Real_t in_reg100, in_reg101, in_reg110, in_reg111;
  2084. Real_t m_reg000, m_reg001, m_reg010, m_reg011;
  2085. Real_t m_reg100, m_reg101, m_reg110, m_reg111;
  2086. //#pragma unroll
  2087. for(int i1=0;i1<8;i1+=2){
  2088. Real_t* IN0_=IN0;
  2089. Real_t* IN1_=IN1;
  2090. out_reg000=OUT0[ 0]; out_reg001=OUT0[ 1];
  2091. out_reg010=OUT0[ 2]; out_reg011=OUT0[ 3];
  2092. out_reg100=OUT1[ 0]; out_reg101=OUT1[ 1];
  2093. out_reg110=OUT1[ 2]; out_reg111=OUT1[ 3];
  2094. //#pragma unroll
  2095. for(int i2=0;i2<8;i2+=2){
  2096. m_reg000=M_[ 0]; m_reg001=M_[ 1];
  2097. m_reg010=M_[ 2]; m_reg011=M_[ 3];
  2098. m_reg100=M_[16]; m_reg101=M_[17];
  2099. m_reg110=M_[18]; m_reg111=M_[19];
  2100. in_reg000=IN0_[0]; in_reg001=IN0_[1];
  2101. in_reg010=IN0_[2]; in_reg011=IN0_[3];
  2102. in_reg100=IN1_[0]; in_reg101=IN1_[1];
  2103. in_reg110=IN1_[2]; in_reg111=IN1_[3];
  2104. out_reg000 += m_reg000*in_reg000 - m_reg001*in_reg001;
  2105. out_reg001 += m_reg000*in_reg001 + m_reg001*in_reg000;
  2106. out_reg010 += m_reg010*in_reg000 - m_reg011*in_reg001;
  2107. out_reg011 += m_reg010*in_reg001 + m_reg011*in_reg000;
  2108. out_reg000 += m_reg100*in_reg010 - m_reg101*in_reg011;
  2109. out_reg001 += m_reg100*in_reg011 + m_reg101*in_reg010;
  2110. out_reg010 += m_reg110*in_reg010 - m_reg111*in_reg011;
  2111. out_reg011 += m_reg110*in_reg011 + m_reg111*in_reg010;
  2112. out_reg100 += m_reg000*in_reg100 - m_reg001*in_reg101;
  2113. out_reg101 += m_reg000*in_reg101 + m_reg001*in_reg100;
  2114. out_reg110 += m_reg010*in_reg100 - m_reg011*in_reg101;
  2115. out_reg111 += m_reg010*in_reg101 + m_reg011*in_reg100;
  2116. out_reg100 += m_reg100*in_reg110 - m_reg101*in_reg111;
  2117. out_reg101 += m_reg100*in_reg111 + m_reg101*in_reg110;
  2118. out_reg110 += m_reg110*in_reg110 - m_reg111*in_reg111;
  2119. out_reg111 += m_reg110*in_reg111 + m_reg111*in_reg110;
  2120. M_+=32; // Jump to (column+2).
  2121. IN0_+=4;
  2122. IN1_+=4;
  2123. }
  2124. OUT0[ 0]=out_reg000; OUT0[ 1]=out_reg001;
  2125. OUT0[ 2]=out_reg010; OUT0[ 3]=out_reg011;
  2126. OUT1[ 0]=out_reg100; OUT1[ 1]=out_reg101;
  2127. OUT1[ 2]=out_reg110; OUT1[ 3]=out_reg111;
  2128. M_+=4-64*2; // Jump back to first column (row+2).
  2129. OUT0+=4;
  2130. OUT1+=4;
  2131. }
  2132. #endif
  2133. }
  2134. M += M_dim*128;
  2135. }
  2136. }
  2137. }
  2138. // Compute flops.
  2139. {
  2140. Profile::Add_FLOP(8*8*8*(interac_vec.Dim()/2)*M_dim*ker_dim0*ker_dim1*dof);
  2141. }
  2142. // Free memory
  2143. delete[] IN_ ;
  2144. delete[] OUT_;
  2145. mem::aligned_free<Real_t>(zero_vec0);
  2146. mem::aligned_free<Real_t>(zero_vec1);
  2147. }
  2148. template <class FMMNode>
  2149. void FMM_Pts<FMMNode>::V_ListSetup(SetupData<Real_t>& setup_data, std::vector<Matrix<Real_t> >& buff, std::vector<Vector<FMMNode_t*> >& n_list, int level, bool device){
  2150. if(level==0) return;
  2151. { // Set setup_data
  2152. setup_data.level=level;
  2153. setup_data.kernel=&aux_kernel;
  2154. setup_data.interac_type.resize(1);
  2155. setup_data.interac_type[0]=V1_Type;
  2156. setup_data. input_data=&buff[0];
  2157. setup_data.output_data=&buff[1];
  2158. Vector<FMMNode_t*>& nodes_in =n_list[2];
  2159. Vector<FMMNode_t*>& nodes_out=n_list[3];
  2160. setup_data.nodes_in .clear();
  2161. setup_data.nodes_out.clear();
  2162. for(size_t i=0;i<nodes_in .Dim();i++) if(nodes_in [i]->Depth()==level-1 || level==-1) setup_data.nodes_in .push_back(nodes_in [i]);
  2163. for(size_t i=0;i<nodes_out.Dim();i++) if(nodes_out[i]->Depth()==level-1 || level==-1) setup_data.nodes_out.push_back(nodes_out[i]);
  2164. }
  2165. std::vector<void*>& nodes_in =setup_data.nodes_in ;
  2166. std::vector<void*>& nodes_out=setup_data.nodes_out;
  2167. std::vector<Vector<Real_t>*>& input_vector=setup_data. input_vector; input_vector.clear();
  2168. std::vector<Vector<Real_t>*>& output_vector=setup_data.output_vector; output_vector.clear();
  2169. for(size_t i=0;i<nodes_in .size();i++) input_vector.push_back(&((FMMData*)((FMMNode*)((FMMNode*)nodes_in [i])->Child(0))->FMMData())->upward_equiv);
  2170. for(size_t i=0;i<nodes_out.size();i++) output_vector.push_back(&((FMMData*)((FMMNode*)((FMMNode*)nodes_out[i])->Child(0))->FMMData())->dnward_equiv);
  2171. /////////////////////////////////////////////////////////////////////////////
  2172. size_t n_in =nodes_in .size();
  2173. size_t n_out=nodes_out.size();
  2174. // Setup precomputed data.
  2175. SetupPrecomp(setup_data,device);
  2176. // Build interac_data
  2177. Profile::Tic("Interac-Data",&this->comm,true,25);
  2178. Matrix<char>& interac_data=setup_data.interac_data;
  2179. if(n_out>0 && n_in >0){ // Build precomp_data, interac_data
  2180. size_t precomp_offset=0;
  2181. Mat_Type& interac_type=setup_data.interac_type[0];
  2182. size_t mat_cnt=this->interac_list.ListCount(interac_type);
  2183. Vector<size_t> precomp_data_offset;
  2184. std::vector<size_t> interac_mat;
  2185. { // Load precomp_data for interac_type.
  2186. Matrix<char>& precomp_data=*setup_data.precomp_data;
  2187. char* indx_ptr=precomp_data[0]+precomp_offset;
  2188. size_t total_size=((size_t*)indx_ptr)[0]; indx_ptr+=sizeof(size_t);
  2189. /*size_t mat_cnt_ =((size_t*)indx_ptr)[0];*/ indx_ptr+=sizeof(size_t);
  2190. precomp_data_offset.ReInit((1+2+2)*mat_cnt, (size_t*)indx_ptr, false);
  2191. precomp_offset+=total_size;
  2192. for(size_t mat_id=0;mat_id<mat_cnt;mat_id++){
  2193. Matrix<Real_t>& M0 = this->mat->Mat(level, interac_type, mat_id);
  2194. assert(M0.Dim(0)>0 && M0.Dim(1)>0); UNUSED(M0);
  2195. interac_mat.push_back(precomp_data_offset[5*mat_id]);
  2196. }
  2197. }
  2198. size_t dof;
  2199. size_t m=MultipoleOrder();
  2200. size_t ker_dim0=setup_data.kernel->ker_dim[0];
  2201. size_t ker_dim1=setup_data.kernel->ker_dim[1];
  2202. size_t fftsize;
  2203. {
  2204. size_t n1=m*2;
  2205. size_t n2=n1*n1;
  2206. size_t n3_=n2*(n1/2+1);
  2207. size_t chld_cnt=1UL<<COORD_DIM;
  2208. fftsize=2*n3_*chld_cnt;
  2209. dof=1;
  2210. }
  2211. int omp_p=omp_get_max_threads();
  2212. size_t buff_size=DEVICE_BUFFER_SIZE*1024l*1024l;
  2213. size_t n_blk0=2*fftsize*dof*(ker_dim0*n_in +ker_dim1*n_out)*sizeof(Real_t)/buff_size;
  2214. if(n_blk0==0) n_blk0=1;
  2215. std::vector<std::vector<size_t> > fft_vec(n_blk0);
  2216. std::vector<std::vector<size_t> > ifft_vec(n_blk0);
  2217. std::vector<std::vector<size_t> > interac_vec(n_blk0);
  2218. std::vector<std::vector<size_t> > interac_dsp(n_blk0);
  2219. {
  2220. Matrix<Real_t>& input_data=*setup_data. input_data;
  2221. Matrix<Real_t>& output_data=*setup_data.output_data;
  2222. std::vector<std::vector<FMMNode*> > nodes_blk_in (n_blk0);
  2223. std::vector<std::vector<FMMNode*> > nodes_blk_out(n_blk0);
  2224. for(size_t i=0;i<n_in;i++) ((FMMNode*)nodes_in[i])->node_id=i;
  2225. for(size_t blk0=0;blk0<n_blk0;blk0++){
  2226. size_t blk0_start=(n_out* blk0 )/n_blk0;
  2227. size_t blk0_end =(n_out*(blk0+1))/n_blk0;
  2228. std::vector<FMMNode*>& nodes_in_ =nodes_blk_in [blk0];
  2229. std::vector<FMMNode*>& nodes_out_=nodes_blk_out[blk0];
  2230. { // Build node list for blk0.
  2231. std::set<void*> nodes_in;
  2232. for(size_t i=blk0_start;i<blk0_end;i++){
  2233. nodes_out_.push_back((FMMNode*)nodes_out[i]);
  2234. std::vector<FMMNode*>& lst=((FMMNode*)nodes_out[i])->interac_list[interac_type];
  2235. for(size_t k=0;k<mat_cnt;k++) if(lst[k]!=NULL) nodes_in.insert(lst[k]);
  2236. }
  2237. for(std::set<void*>::iterator node=nodes_in.begin(); node != nodes_in.end(); node++){
  2238. nodes_in_.push_back((FMMNode*)*node);
  2239. }
  2240. size_t input_dim=nodes_in_ .size()*ker_dim0*dof*fftsize;
  2241. size_t output_dim=nodes_out_.size()*ker_dim1*dof*fftsize;
  2242. size_t buffer_dim=(ker_dim0+ker_dim1)*dof*fftsize*omp_p;
  2243. if(buff_size<(input_dim + output_dim + buffer_dim)*sizeof(Real_t))
  2244. buff_size=(input_dim + output_dim + buffer_dim)*sizeof(Real_t);
  2245. }
  2246. { // Set fft vectors.
  2247. for(size_t i=0;i<nodes_in_ .size();i++) fft_vec[blk0].push_back((size_t)(& input_vector[nodes_in_[i]->node_id][0][0]- input_data[0]));
  2248. for(size_t i=0;i<nodes_out_.size();i++)ifft_vec[blk0].push_back((size_t)(&output_vector[blk0_start + i ][0][0]-output_data[0]));
  2249. }
  2250. }
  2251. for(size_t blk0=0;blk0<n_blk0;blk0++){ // Hadamard interactions.
  2252. std::vector<FMMNode*>& nodes_in_ =nodes_blk_in [blk0];
  2253. std::vector<FMMNode*>& nodes_out_=nodes_blk_out[blk0];
  2254. for(size_t i=0;i<nodes_in_.size();i++) nodes_in_[i]->node_id=i;
  2255. { // Next blocking level.
  2256. size_t n_blk1=nodes_out_.size()*(ker_dim0+ker_dim1)/V_BLK_SIZE; //64 vectors should fit in L1.
  2257. if(n_blk1==0) n_blk1=1;
  2258. size_t interac_dsp_=0;
  2259. for(size_t blk1=0;blk1<n_blk1;blk1++){
  2260. size_t blk1_start=(nodes_out_.size()* blk1 )/n_blk1;
  2261. size_t blk1_end =(nodes_out_.size()*(blk1+1))/n_blk1;
  2262. for(size_t k=0;k<mat_cnt;k++){
  2263. for(size_t i=blk1_start;i<blk1_end;i++){
  2264. std::vector<FMMNode*>& lst=((FMMNode*)nodes_out_[i])->interac_list[interac_type];
  2265. if(lst[k]!=NULL){
  2266. interac_vec[blk0].push_back(lst[k]->node_id*fftsize*ker_dim0*dof);
  2267. interac_vec[blk0].push_back( i *fftsize*ker_dim1*dof);
  2268. interac_dsp_++;
  2269. }
  2270. }
  2271. interac_dsp[blk0].push_back(interac_dsp_);
  2272. }
  2273. }
  2274. }
  2275. }
  2276. }
  2277. { // Set interac_data.
  2278. size_t data_size=sizeof(size_t)*5; // m, dof, ker_dim0, ker_dim1, n_blk0
  2279. for(size_t blk0=0;blk0<n_blk0;blk0++){
  2280. data_size+=sizeof(size_t)+ fft_vec[blk0].size()*sizeof(size_t);
  2281. data_size+=sizeof(size_t)+ ifft_vec[blk0].size()*sizeof(size_t);
  2282. data_size+=sizeof(size_t)+interac_vec[blk0].size()*sizeof(size_t);
  2283. data_size+=sizeof(size_t)+interac_dsp[blk0].size()*sizeof(size_t);
  2284. }
  2285. data_size+=sizeof(size_t)+interac_mat.size()*sizeof(size_t);
  2286. if(data_size>interac_data.Dim(0)*interac_data.Dim(1))
  2287. interac_data.Resize(1,data_size);
  2288. char* data_ptr=&interac_data[0][0];
  2289. ((size_t*)data_ptr)[0]=buff_size; data_ptr+=sizeof(size_t);
  2290. ((size_t*)data_ptr)[0]= m; data_ptr+=sizeof(size_t);
  2291. ((size_t*)data_ptr)[0]= dof; data_ptr+=sizeof(size_t);
  2292. ((size_t*)data_ptr)[0]= ker_dim0; data_ptr+=sizeof(size_t);
  2293. ((size_t*)data_ptr)[0]= ker_dim1; data_ptr+=sizeof(size_t);
  2294. ((size_t*)data_ptr)[0]= n_blk0; data_ptr+=sizeof(size_t);
  2295. ((size_t*)data_ptr)[0]= interac_mat.size(); data_ptr+=sizeof(size_t);
  2296. mem::memcopy(data_ptr, &interac_mat[0], interac_mat.size()*sizeof(size_t));
  2297. data_ptr+=interac_mat.size()*sizeof(size_t);
  2298. for(size_t blk0=0;blk0<n_blk0;blk0++){
  2299. ((size_t*)data_ptr)[0]= fft_vec[blk0].size(); data_ptr+=sizeof(size_t);
  2300. mem::memcopy(data_ptr, & fft_vec[blk0][0], fft_vec[blk0].size()*sizeof(size_t));
  2301. data_ptr+= fft_vec[blk0].size()*sizeof(size_t);
  2302. ((size_t*)data_ptr)[0]=ifft_vec[blk0].size(); data_ptr+=sizeof(size_t);
  2303. mem::memcopy(data_ptr, &ifft_vec[blk0][0], ifft_vec[blk0].size()*sizeof(size_t));
  2304. data_ptr+=ifft_vec[blk0].size()*sizeof(size_t);
  2305. ((size_t*)data_ptr)[0]=interac_vec[blk0].size(); data_ptr+=sizeof(size_t);
  2306. mem::memcopy(data_ptr, &interac_vec[blk0][0], interac_vec[blk0].size()*sizeof(size_t));
  2307. data_ptr+=interac_vec[blk0].size()*sizeof(size_t);
  2308. ((size_t*)data_ptr)[0]=interac_dsp[blk0].size(); data_ptr+=sizeof(size_t);
  2309. mem::memcopy(data_ptr, &interac_dsp[blk0][0], interac_dsp[blk0].size()*sizeof(size_t));
  2310. data_ptr+=interac_dsp[blk0].size()*sizeof(size_t);
  2311. }
  2312. }
  2313. }
  2314. Profile::Toc();
  2315. Profile::Tic("Host2Device",&this->comm,false,25);
  2316. if(device){ // Host2Device
  2317. setup_data.interac_data. AllocDevice(true);
  2318. }
  2319. Profile::Toc();
  2320. }
// Execute the V-list phase (multipole-to-local translation): deserialize the
// index data packed by V_ListSetup, then for each block run forward FFT of the
// upward-equivalent data, the Hadamard-product interactions, and inverse FFT
// into the downward-check/equivalent data.
// setup_data : the structure filled by V_ListSetup (interac_data byte layout
//              must match exactly what V_ListSetup wrote).
// device     : must be false (asserted); accelerator path not implemented.
template <class FMMNode>
void FMM_Pts<FMMNode>::V_List (SetupData<Real_t>& setup_data, bool device){
assert(!device); //Can not run on accelerator yet.
if(setup_data.interac_data.Dim(0)==0 || setup_data.interac_data.Dim(1)==0){
// Nothing to do at this level; still emit the profiling sections so that
// the Tic/Toc nesting stays balanced across all ranks/levels.
Profile::Tic("Host2Device",&this->comm,false,25);
Profile::Toc();
Profile::Tic("FFT",&comm,false,100);
Profile::Toc();
Profile::Tic("HadamardProduct",&comm,false,100);
Profile::Toc();
Profile::Tic("IFFT",&comm,false,100);
Profile::Toc();
return;
}
Profile::Tic("Host2Device",&this->comm,false,25);
int level=setup_data.level;
// First size_t of interac_data is the required scratch-buffer size in bytes.
size_t buff_size=*((size_t*)&setup_data.interac_data[0][0]);
typename Matrix<Real_t>::Device M_d;
typename Vector<char>::Device buff;
typename Matrix<char>::Device precomp_data;
typename Matrix<char>::Device interac_data;
typename Matrix<Real_t>::Device input_data;
typename Matrix<Real_t>::Device output_data;
// Downward check-to-equivalent matrix, used by FFT_Check2Equiv below.
Matrix<Real_t>& M = this->mat->Mat(level, DC2DE_Type, 0);
if(device){
// Device path (currently unreachable because of the assert above).
if(this->dev_buffer.Dim()<buff_size) this->dev_buffer.Resize(buff_size);
M_d = M. AllocDevice(false);
buff = this-> dev_buffer. AllocDevice(false);
precomp_data= setup_data.precomp_data->AllocDevice(false);
interac_data= setup_data.interac_data. AllocDevice(false);
input_data = setup_data. input_data->AllocDevice(false);
output_data = setup_data. output_data->AllocDevice(false);
}else{
// Host path: Device views alias the host-side storage directly.
if(this->cpu_buffer.Dim()<buff_size) this->cpu_buffer.Resize(buff_size);
M_d = M;
buff = this-> cpu_buffer;
precomp_data=*setup_data.precomp_data;
interac_data= setup_data.interac_data;
input_data =*setup_data. input_data;
output_data =*setup_data. output_data;
}
Profile::Toc();
{ // Offloaded computation.
// Set interac_data.
size_t m, dof, ker_dim0, ker_dim1, n_blk0;
std::vector<Vector<size_t> > fft_vec;
std::vector<Vector<size_t> > ifft_vec;
std::vector<Vector<size_t> > interac_vec;
std::vector<Vector<size_t> > interac_dsp;
Vector<Real_t*> precomp_mat;
{ // Set interac_data.
// Walk the serialized layout written by V_ListSetup:
// buff_size, m, dof, ker_dim0, ker_dim1, n_blk0, [interac_mat],
// then per block: [fft_vec], [ifft_vec], [interac_vec], [interac_dsp].
// All ReInit(...,false) calls create non-owning views into interac_data.
char* data_ptr=&interac_data[0][0];
buff_size=((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t);
m =((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t);
dof =((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t);
ker_dim0 =((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t);
ker_dim1 =((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t);
n_blk0 =((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t);
fft_vec .resize(n_blk0);
ifft_vec.resize(n_blk0);
interac_vec.resize(n_blk0);
interac_dsp.resize(n_blk0);
Vector<size_t> interac_mat;
interac_mat.ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false);
data_ptr+=sizeof(size_t)+interac_mat.Dim()*sizeof(size_t);
// Resolve precomputed-matrix byte offsets to actual pointers.
precomp_mat.Resize(interac_mat.Dim());
for(size_t i=0;i<interac_mat.Dim();i++){
precomp_mat[i]=(Real_t*)(precomp_data[0]+interac_mat[i]);
}
for(size_t blk0=0;blk0<n_blk0;blk0++){
fft_vec[blk0].ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false);
data_ptr+=sizeof(size_t)+fft_vec[blk0].Dim()*sizeof(size_t);
ifft_vec[blk0].ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false);
data_ptr+=sizeof(size_t)+ifft_vec[blk0].Dim()*sizeof(size_t);
interac_vec[blk0].ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false);
data_ptr+=sizeof(size_t)+interac_vec[blk0].Dim()*sizeof(size_t);
interac_dsp[blk0].ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false);
data_ptr+=sizeof(size_t)+interac_dsp[blk0].Dim()*sizeof(size_t);
}
}
int omp_p=omp_get_max_threads();
size_t M_dim, fftsize;
{
// Same FFT-grid geometry as in V_ListSetup: (2m)^3 grid, r2c transform,
// 8 children per node, 2 reals per complex value.
size_t n1=m*2;
size_t n2=n1*n1;
size_t n3_=n2*(n1/2+1);
size_t chld_cnt=1UL<<COORD_DIM;
fftsize=2*n3_*chld_cnt;
M_dim=n3_;
}
for(size_t blk0=0;blk0<n_blk0;blk0++){ // interactions
size_t n_in = fft_vec[blk0].Dim();
size_t n_out=ifft_vec[blk0].Dim();
size_t input_dim=n_in *ker_dim0*dof*fftsize;
size_t output_dim=n_out*ker_dim1*dof*fftsize;
size_t buffer_dim=(ker_dim0+ker_dim1)*dof*fftsize*omp_p;
// Carve fft_in / fft_out / per-thread scratch out of the single buffer.
Vector<Real_t> fft_in ( input_dim, (Real_t*)&buff[ 0 ],false);
Vector<Real_t> fft_out(output_dim, (Real_t*)&buff[ input_dim *sizeof(Real_t)],false);
Vector<Real_t> buffer(buffer_dim, (Real_t*)&buff[(input_dim+output_dim)*sizeof(Real_t)],false);
{ // FFT
Profile::Tic("FFT",&comm,false,100);
Vector<Real_t> input_data_( input_data.dim[0]* input_data.dim[1], input_data[0], false);
FFT_UpEquiv(dof, m, ker_dim0, fft_vec[blk0], input_data_, fft_in, buffer);
Profile::Toc();
}
{ // Hadamard
#ifdef PVFMM_HAVE_PAPI
#ifdef __VERBOSE__
// Optional PAPI hardware-counter instrumentation around the hot kernel.
std::cout << "Starting counters new\n";
if (PAPI_start(EventSet) != PAPI_OK) std::cout << "handle_error3" << std::endl;
#endif
#endif
Profile::Tic("HadamardProduct",&comm,false,100);
VListHadamard<Real_t>(dof, M_dim, ker_dim0, ker_dim1, interac_dsp[blk0], interac_vec[blk0], precomp_mat, fft_in, fft_out);
Profile::Toc();
#ifdef PVFMM_HAVE_PAPI
#ifdef __VERBOSE__
if (PAPI_stop(EventSet, values) != PAPI_OK) std::cout << "handle_error4" << std::endl;
std::cout << "Stopping counters\n";
#endif
#endif
}
{ // IFFT
Profile::Tic("IFFT",&comm,false,100);
Matrix<Real_t> M(M_d.dim[0],M_d.dim[1],M_d[0],false);
Vector<Real_t> output_data_(output_data.dim[0]*output_data.dim[1], output_data[0], false);
FFT_Check2Equiv(dof, m, ker_dim1, ifft_vec[blk0], fft_out, output_data_, buffer, M);
Profile::Toc();
}
}
}
}
  2453. template <class FMMNode>
  2454. void FMM_Pts<FMMNode>::Down2DownSetup(SetupData<Real_t>& setup_data, std::vector<Matrix<Real_t> >& buff, std::vector<Vector<FMMNode_t*> >& n_list, int level, bool device){
  2455. { // Set setup_data
  2456. setup_data.level=level;
  2457. setup_data.kernel=&aux_kernel;
  2458. setup_data.interac_type.resize(1);
  2459. setup_data.interac_type[0]=D2D_Type;
  2460. setup_data. input_data=&buff[1];
  2461. setup_data.output_data=&buff[1];
  2462. Vector<FMMNode_t*>& nodes_in =n_list[1];
  2463. Vector<FMMNode_t*>& nodes_out=n_list[1];
  2464. setup_data.nodes_in .clear();
  2465. setup_data.nodes_out.clear();
  2466. for(size_t i=0;i<nodes_in .Dim();i++) if(nodes_in [i]->Depth()==level-1) setup_data.nodes_in .push_back(nodes_in [i]);
  2467. for(size_t i=0;i<nodes_out.Dim();i++) if(nodes_out[i]->Depth()==level ) setup_data.nodes_out.push_back(nodes_out[i]);
  2468. }
  2469. std::vector<void*>& nodes_in =setup_data.nodes_in ;
  2470. std::vector<void*>& nodes_out=setup_data.nodes_out;
  2471. std::vector<Vector<Real_t>*>& input_vector=setup_data. input_vector; input_vector.clear();
  2472. std::vector<Vector<Real_t>*>& output_vector=setup_data.output_vector; output_vector.clear();
  2473. for(size_t i=0;i<nodes_in .size();i++) input_vector.push_back(&((FMMData*)((FMMNode*)nodes_in [i])->FMMData())->dnward_equiv);
  2474. for(size_t i=0;i<nodes_out.size();i++) output_vector.push_back(&((FMMData*)((FMMNode*)nodes_out[i])->FMMData())->dnward_equiv);
  2475. SetupInterac(setup_data,device);
  2476. }
// Execute the Down2Down (D2D) translations prepared by Down2DownSetup:
// shift downward-equivalent expansions from parents to children.
// device selects the CPU or accelerator evaluation path inside EvalList.
template <class FMMNode>
void FMM_Pts<FMMNode>::Down2Down (SetupData<Real_t>& setup_data, bool device){
//Add Down2Down contribution.
// Thin wrapper: all work is delegated to the generic interaction evaluator.
EvalList(setup_data, device);
}
  2482. template <class FMMNode>
  2483. void FMM_Pts<FMMNode>::SetupInteracPts(SetupData<Real_t>& setup_data, bool shift_src, bool shift_trg, Matrix<Real_t>* M, bool device){
  2484. int level=setup_data.level;
  2485. std::vector<Mat_Type>& interac_type_lst=setup_data.interac_type;
  2486. std::vector<void*>& nodes_in =setup_data.nodes_in ;
  2487. std::vector<void*>& nodes_out=setup_data.nodes_out;
  2488. Matrix<Real_t>& output_data=*setup_data.output_data;
  2489. Matrix<Real_t>& input_data=*setup_data. input_data;
  2490. Matrix<Real_t>& coord_data=*setup_data. coord_data;
  2491. std::vector<Vector<Real_t>*>& input_vector=setup_data. input_vector;
  2492. std::vector<Vector<Real_t>*>& output_vector=setup_data.output_vector;
  2493. size_t n_in =nodes_in .size();
  2494. size_t n_out=nodes_out.size();
  2495. //setup_data.precomp_data=NULL;
  2496. // Build interac_data
  2497. Profile::Tic("Interac-Data",&this->comm,true,25);
  2498. Matrix<char>& interac_data=setup_data.interac_data;
  2499. if(n_out>0 && n_in >0){
  2500. size_t ker_dim0=setup_data.kernel->ker_dim[0];
  2501. size_t ker_dim1=setup_data.kernel->ker_dim[1];
  2502. size_t dof=1;
  2503. for(size_t i=0;i<n_in ;i++) ((FMMNode*)nodes_in [i])->node_id=i;
  2504. std::vector<size_t> trg_interac_cnt(n_out,0);
  2505. std::vector<size_t> trg_coord(n_out);
  2506. std::vector<size_t> trg_value(n_out);
  2507. std::vector<size_t> trg_cnt(n_out);
  2508. std::vector<Real_t> scaling(n_out,0);
  2509. { // Set trg data
  2510. Mat_Type& interac_type=interac_type_lst[0];
  2511. for(size_t i=0;i<n_out;i++){
  2512. if(!((FMMNode*)nodes_out[i])->IsGhost() && (level==-1 || ((FMMNode*)nodes_out[i])->Depth()==level)){
  2513. trg_cnt [i]=output_vector[i*2+0]->Dim()/COORD_DIM;
  2514. trg_coord[i]=(size_t)(&output_vector[i*2+0][0][0]- coord_data[0]);
  2515. trg_value[i]=(size_t)(&output_vector[i*2+1][0][0]-output_data[0]);
  2516. if(!this->Homogen()) scaling[i]=1.0;
  2517. else if(interac_type==X_Type) scaling[i]=pow(0.5, setup_data.kernel->poten_scale *((FMMNode*)nodes_out[i])->Depth());
  2518. }
  2519. }
  2520. }
  2521. std::vector<std::vector<size_t> > src_cnt(n_out);
  2522. std::vector<std::vector<size_t> > src_coord(n_out);
  2523. std::vector<std::vector<size_t> > src_value(n_out);
  2524. std::vector<std::vector<Real_t> > shift_coord(n_out);
  2525. for(size_t type_indx=0; type_indx<interac_type_lst.size(); type_indx++){
  2526. Mat_Type& interac_type=interac_type_lst[type_indx];
  2527. size_t mat_cnt=this->interac_list.ListCount(interac_type);
  2528. for(size_t i=0;i<n_out;i++){ // For each target node.
  2529. if(!((FMMNode*)nodes_out[i])->IsGhost() && (level==-1 || ((FMMNode*)nodes_out[i])->Depth()==level)){
  2530. std::vector<FMMNode*>& lst=((FMMNode*)nodes_out[i])->interac_list[interac_type];
  2531. for(size_t mat_indx=0;mat_indx<mat_cnt;mat_indx++) if(lst[mat_indx]!=NULL){ // For each direction.
  2532. size_t j=lst[mat_indx]->node_id;
  2533. if(input_vector[j*4+0]->Dim()>0 || input_vector[j*4+2]->Dim()>0){
  2534. trg_interac_cnt[i]++;
  2535. { // Determine shift for periodic boundary condition
  2536. Real_t* sc=lst[mat_indx]->Coord();
  2537. Real_t* tc=((FMMNode*)nodes_out[i])->Coord();
  2538. int* rel_coord=this->interac_list.RelativeCoord(interac_type, mat_indx);
  2539. shift_coord[i].push_back((tc[0]>sc[0] && rel_coord[0]>0? 1.0:
  2540. (tc[0]<sc[0] && rel_coord[0]<0?-1.0:0.0)) +
  2541. (shift_src?sc[0]:0) - (shift_trg?tc[0]:0) );
  2542. shift_coord[i].push_back((tc[1]>sc[1] && rel_coord[1]>0? 1.0:
  2543. (tc[1]<sc[1] && rel_coord[1]<0?-1.0:0.0)) +
  2544. (shift_src?sc[1]:0) - (shift_trg?tc[1]:0) );
  2545. shift_coord[i].push_back((tc[2]>sc[2] && rel_coord[2]>0? 1.0:
  2546. (tc[2]<sc[2] && rel_coord[2]<0?-1.0:0.0)) +
  2547. (shift_src?sc[2]:0) - (shift_trg?tc[2]:0) );
  2548. }
  2549. { // Set src data
  2550. if(input_vector[j*4+0]!=NULL){
  2551. src_cnt [i].push_back(input_vector[j*4+0]->Dim()/COORD_DIM);
  2552. src_coord[i].push_back((size_t)(& input_vector[j*4+0][0][0]- coord_data[0]));
  2553. src_value[i].push_back((size_t)(& input_vector[j*4+1][0][0]- input_data[0]));
  2554. }else{
  2555. src_cnt [i].push_back(0);
  2556. src_coord[i].push_back(0);
  2557. src_value[i].push_back(0);
  2558. }
  2559. if(input_vector[j*4+2]!=NULL){
  2560. src_cnt [i].push_back(input_vector[j*4+2]->Dim()/COORD_DIM);
  2561. src_coord[i].push_back((size_t)(& input_vector[j*4+2][0][0]- coord_data[0]));
  2562. src_value[i].push_back((size_t)(& input_vector[j*4+3][0][0]- input_data[0]));
  2563. }else{
  2564. src_cnt [i].push_back(0);
  2565. src_coord[i].push_back(0);
  2566. src_value[i].push_back(0);
  2567. }
  2568. }
  2569. }
  2570. }
  2571. }
  2572. }
  2573. }
  2574. { // Set interac_data.
  2575. size_t data_size=sizeof(size_t)*4;
  2576. data_size+=sizeof(size_t)+trg_interac_cnt.size()*sizeof(size_t);
  2577. data_size+=sizeof(size_t)+trg_coord.size()*sizeof(size_t);
  2578. data_size+=sizeof(size_t)+trg_value.size()*sizeof(size_t);
  2579. data_size+=sizeof(size_t)+trg_cnt .size()*sizeof(size_t);
  2580. data_size+=sizeof(size_t)+scaling .size()*sizeof(Real_t);
  2581. data_size+=sizeof(size_t)*2+(M!=NULL?M->Dim(0)*M->Dim(1)*sizeof(Real_t):0);
  2582. for(size_t i=0;i<n_out;i++){
  2583. data_size+=sizeof(size_t)+src_cnt [i].size()*sizeof(size_t);
  2584. data_size+=sizeof(size_t)+src_coord[i].size()*sizeof(size_t);
  2585. data_size+=sizeof(size_t)+src_value[i].size()*sizeof(size_t);
  2586. data_size+=sizeof(size_t)+shift_coord[i].size()*sizeof(Real_t);
  2587. }
  2588. if(data_size>interac_data.Dim(0)*interac_data.Dim(1))
  2589. interac_data.Resize(1,data_size);
  2590. char* data_ptr=&interac_data[0][0];
  2591. ((size_t*)data_ptr)[0]=data_size; data_ptr+=sizeof(size_t);
  2592. ((size_t*)data_ptr)[0]= ker_dim0; data_ptr+=sizeof(size_t);
  2593. ((size_t*)data_ptr)[0]= ker_dim1; data_ptr+=sizeof(size_t);
  2594. ((size_t*)data_ptr)[0]= dof; data_ptr+=sizeof(size_t);
  2595. ((size_t*)data_ptr)[0]=trg_interac_cnt.size(); data_ptr+=sizeof(size_t);
  2596. mem::memcopy(data_ptr, &trg_interac_cnt[0], trg_interac_cnt.size()*sizeof(size_t));
  2597. data_ptr+=trg_interac_cnt.size()*sizeof(size_t);
  2598. ((size_t*)data_ptr)[0]=trg_coord.size(); data_ptr+=sizeof(size_t);
  2599. mem::memcopy(data_ptr, &trg_coord[0], trg_coord.size()*sizeof(size_t));
  2600. data_ptr+=trg_coord.size()*sizeof(size_t);
  2601. ((size_t*)data_ptr)[0]=trg_value.size(); data_ptr+=sizeof(size_t);
  2602. mem::memcopy(data_ptr, &trg_value[0], trg_value.size()*sizeof(size_t));
  2603. data_ptr+=trg_value.size()*sizeof(size_t);
  2604. ((size_t*)data_ptr)[0]=trg_cnt.size(); data_ptr+=sizeof(size_t);
  2605. mem::memcopy(data_ptr, &trg_cnt[0], trg_cnt.size()*sizeof(size_t));
  2606. data_ptr+=trg_cnt.size()*sizeof(size_t);
  2607. ((size_t*)data_ptr)[0]=scaling.size(); data_ptr+=sizeof(size_t);
  2608. mem::memcopy(data_ptr, &scaling[0], scaling.size()*sizeof(Real_t));
  2609. data_ptr+=scaling.size()*sizeof(Real_t);
  2610. if(M!=NULL){
  2611. ((size_t*)data_ptr)[0]=M->Dim(0); data_ptr+=sizeof(size_t);
  2612. ((size_t*)data_ptr)[0]=M->Dim(1); data_ptr+=sizeof(size_t);
  2613. mem::memcopy(data_ptr, M[0][0], M->Dim(0)*M->Dim(1)*sizeof(Real_t));
  2614. data_ptr+=M->Dim(0)*M->Dim(1)*sizeof(Real_t);
  2615. }else{
  2616. ((size_t*)data_ptr)[0]=0; data_ptr+=sizeof(size_t);
  2617. ((size_t*)data_ptr)[0]=0; data_ptr+=sizeof(size_t);
  2618. }
  2619. for(size_t i=0;i<n_out;i++){
  2620. ((size_t*)data_ptr)[0]=src_cnt[i].size(); data_ptr+=sizeof(size_t);
  2621. mem::memcopy(data_ptr, &src_cnt[i][0], src_cnt[i].size()*sizeof(size_t));
  2622. data_ptr+=src_cnt[i].size()*sizeof(size_t);
  2623. ((size_t*)data_ptr)[0]=src_coord[i].size(); data_ptr+=sizeof(size_t);
  2624. mem::memcopy(data_ptr, &src_coord[i][0], src_coord[i].size()*sizeof(size_t));
  2625. data_ptr+=src_coord[i].size()*sizeof(size_t);
  2626. ((size_t*)data_ptr)[0]=src_value[i].size(); data_ptr+=sizeof(size_t);
  2627. mem::memcopy(data_ptr, &src_value[i][0], src_value[i].size()*sizeof(size_t));
  2628. data_ptr+=src_value[i].size()*sizeof(size_t);
  2629. ((size_t*)data_ptr)[0]=shift_coord[i].size(); data_ptr+=sizeof(size_t);
  2630. mem::memcopy(data_ptr, &shift_coord[i][0], shift_coord[i].size()*sizeof(Real_t));
  2631. data_ptr+=shift_coord[i].size()*sizeof(Real_t);
  2632. }
  2633. }
  2634. size_t buff_size=DEVICE_BUFFER_SIZE*1024l*1024l;
  2635. if(this->dev_buffer.Dim()<buff_size) this->dev_buffer.Resize(buff_size);
  2636. if(this->cpu_buffer.Dim()<buff_size) this->cpu_buffer.Resize(buff_size);
  2637. }
  2638. Profile::Toc();
  2639. Profile::Tic("Host2Device",&this->comm,false,25);
  2640. if(device){ // Host2Device
  2641. setup_data.interac_data .AllocDevice(true);
  2642. }
  2643. Profile::Toc();
  2644. }
template <class FMMNode>
void FMM_Pts<FMMNode>::EvalListPts(SetupData<Real_t>& setup_data, bool device){
  // Evaluate direct (point-to-point) interactions described by the packed
  // byte array setup_data.interac_data, accumulating results into
  // setup_data.output_data. When `device` is true and Intel offload is
  // compiled in, the computation is shipped to the MIC coprocessor.
  if(setup_data.interac_data.Dim(0)==0 || setup_data.interac_data.Dim(1)==0){
    // Nothing to do; emit the empty profiler sections so Tic/Toc stay balanced.
    Profile::Tic("Host2Device",&this->comm,false,25);
    Profile::Toc();
    Profile::Tic("DeviceComp",&this->comm,false,20);
    Profile::Toc();
    return;
  }
  Profile::Tic("Host2Device",&this->comm,false,25);
  typename Vector<char>::Device buff;
  //typename Matrix<char>::Device precomp_data;
  typename Matrix<char>::Device interac_data;
  typename Matrix<Real_t>::Device coord_data;
  typename Matrix<Real_t>::Device input_data;
  typename Matrix<Real_t>::Device output_data;
  if(device){ // Use device-side views of the buffers.
    buff        = this-> dev_buffer. AllocDevice(false);
    interac_data= setup_data.interac_data. AllocDevice(false);
    //if(setup_data.precomp_data!=NULL) precomp_data= setup_data.precomp_data->AllocDevice(false);
    if(setup_data. coord_data!=NULL) coord_data  = setup_data. coord_data->AllocDevice(false);
    if(setup_data. input_data!=NULL) input_data  = setup_data. input_data->AllocDevice(false);
    if(setup_data.output_data!=NULL) output_data = setup_data.output_data->AllocDevice(false);
  }else{ // Use host-side views.
    buff        = this-> cpu_buffer;
    interac_data= setup_data.interac_data;
    //if(setup_data.precomp_data!=NULL) precomp_data=*setup_data.precomp_data;
    if(setup_data. coord_data!=NULL) coord_data  =*setup_data. coord_data;
    if(setup_data. input_data!=NULL) input_data  =*setup_data. input_data;
    if(setup_data.output_data!=NULL) output_data =*setup_data.output_data;
  }
  Profile::Toc();
  // Kernel function pointers are carried as integers so they can be swapped
  // for device-side pointers inside the offload section.
  size_t ptr_single_layer_kernel=(size_t)setup_data.kernel->ker_poten;
  size_t ptr_double_layer_kernel=(size_t)setup_data.kernel->dbl_layer_poten;
  Profile::Tic("DeviceComp",&this->comm,false,20);
#ifdef __INTEL_OFFLOAD
  int lock_idx=-1;
  int wait_lock_idx=-1;
  if(device) wait_lock_idx=MIC_Lock::curr_lock();
  if(device) lock_idx=MIC_Lock::get_lock();
  if(device) ptr_single_layer_kernel=setup_data.kernel->dev_ker_poten;
  if(device) ptr_double_layer_kernel=setup_data.kernel->dev_dbl_layer_poten;
#pragma offload if(device) target(mic:0) signal(&MIC_Lock::lock_vec[device?lock_idx:0])
#endif
  { // Offloaded computation.
    // Deserialize interac_data. The read order below must mirror exactly the
    // write order used by the corresponding setup routine.
    //size_t data_size;
    //size_t ker_dim0;
    size_t ker_dim1;
    size_t dof, n_out;
    Vector<size_t> trg_interac_cnt;
    Vector<size_t> trg_coord;
    Vector<size_t> trg_value;
    Vector<size_t> trg_cnt;
    Vector<Real_t> scaling;
    Matrix<Real_t> M;
    Vector< Vector<size_t> > src_cnt;
    Vector< Vector<size_t> > src_coord;
    Vector< Vector<size_t> > src_value;
    Vector< Vector<Real_t> > shift_coord;
    { // Set interac_data.
      // ReInit(...,false) creates non-owning views directly into the byte array.
      char* data_ptr=&interac_data[0][0];
      /*data_size=((size_t*)data_ptr)[0];*/ data_ptr+=sizeof(size_t);
      /*ker_dim0=((size_t*)data_ptr)[0];*/ data_ptr+=sizeof(size_t);
      ker_dim1=((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t);
      dof     =((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t);
      trg_interac_cnt.ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false);
      data_ptr+=sizeof(size_t)+trg_interac_cnt.Dim()*sizeof(size_t);
      n_out=trg_interac_cnt.Dim(); // One entry per output (target) node.
      trg_coord.ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false);
      data_ptr+=sizeof(size_t)+trg_coord.Dim()*sizeof(size_t);
      trg_value.ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false);
      data_ptr+=sizeof(size_t)+trg_value.Dim()*sizeof(size_t);
      trg_cnt.ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false);
      data_ptr+=sizeof(size_t)+trg_cnt.Dim()*sizeof(size_t);
      scaling.ReInit(((size_t*)data_ptr)[0],(Real_t*)(data_ptr+sizeof(size_t)),false);
      data_ptr+=sizeof(size_t)+scaling.Dim()*sizeof(Real_t);
      // Optional post-multiplication matrix; dims are (0,0) when absent.
      M.ReInit(((size_t*)data_ptr)[0],((size_t*)data_ptr)[1],(Real_t*)(data_ptr+2*sizeof(size_t)),false);
      data_ptr+=sizeof(size_t)*2+M.Dim(0)*M.Dim(1)*sizeof(Real_t);
      src_cnt.Resize(n_out);
      src_coord.Resize(n_out);
      src_value.Resize(n_out);
      shift_coord.Resize(n_out);
      for(size_t i=0;i<n_out;i++){
        src_cnt[i].ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false);
        data_ptr+=sizeof(size_t)+src_cnt[i].Dim()*sizeof(size_t);
        src_coord[i].ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false);
        data_ptr+=sizeof(size_t)+src_coord[i].Dim()*sizeof(size_t);
        src_value[i].ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false);
        data_ptr+=sizeof(size_t)+src_value[i].Dim()*sizeof(size_t);
        shift_coord[i].ReInit(((size_t*)data_ptr)[0],(Real_t*)(data_ptr+sizeof(size_t)),false);
        data_ptr+=sizeof(size_t)+shift_coord[i].Dim()*sizeof(Real_t);
      }
    }
#ifdef __INTEL_OFFLOAD
    if(device) MIC_Lock::wait_lock(wait_lock_idx);
#endif
    //Compute interaction from point sources.
    { // interactions
      typename Kernel<Real_t>::Ker_t single_layer_kernel=(typename Kernel<Real_t>::Ker_t)ptr_single_layer_kernel;
      typename Kernel<Real_t>::Ker_t double_layer_kernel=(typename Kernel<Real_t>::Ker_t)ptr_double_layer_kernel;
      int omp_p=omp_get_max_threads();
      // Partition the scratch buffer evenly among the OpenMP threads.
      Vector<Real_t*> thread_buff(omp_p);
      size_t thread_buff_size=buff.dim/sizeof(Real_t)/omp_p;
      for(int i=0;i<omp_p;i++) thread_buff[i]=(Real_t*)&buff[i*thread_buff_size*sizeof(Real_t)];
#pragma omp parallel for
      for(size_t i=0;i<n_out;i++)
      if(trg_interac_cnt[i]>0 && trg_cnt[i]>0){
        int thread_id=omp_get_thread_num();
        Real_t* s_coord=thread_buff[thread_id];
        Real_t* t_value=output_data[0]+trg_value[i];
        if(M.Dim(0)>0 && M.Dim(1)>0){
          // A post-multiplication matrix is present (e.g. check-to-equivalent):
          // accumulate into the thread buffer first and place the shifted
          // source coordinates after that accumulator region.
          s_coord+=dof*M.Dim(0);
          t_value=thread_buff[thread_id];
          for(size_t j=0;j<dof*M.Dim(0);j++) t_value[j]=0;
        }
        size_t interac_cnt=0;
        for(size_t j=0;j<trg_interac_cnt[i];j++){
          if(ptr_single_layer_kernel!=(size_t)NULL){// Single layer kernel
            // Even slots ([2*j+0]) of src_cnt/src_coord/src_value hold the
            // single-layer source data for interaction j.
            Real_t* src_coord_=coord_data[0]+src_coord[i][2*j+0];
            assert(thread_buff_size>=dof*M.Dim(0)+src_cnt[i][2*j+0]*COORD_DIM);
            for(size_t k1=0;k1<src_cnt[i][2*j+0];k1++){ // Compute shifted source coordinates.
              for(size_t k0=0;k0<COORD_DIM;k0++){
                s_coord[k1*COORD_DIM+k0]=src_coord_[k1*COORD_DIM+k0]+shift_coord[i][j*COORD_DIM+k0];
              }
            }
            single_layer_kernel( s_coord                  , src_cnt[i][2*j+0], input_data[0]+src_value[i][2*j+0], dof,
                                 coord_data[0]+trg_coord[i], trg_cnt[i]       , t_value);
            interac_cnt+=src_cnt[i][2*j+0]*trg_cnt[i];
          }
          if(ptr_double_layer_kernel!=(size_t)NULL){// Double layer kernel
            // Odd slots ([2*j+1]) hold the double-layer (surface) source data.
            Real_t* src_coord_=coord_data[0]+src_coord[i][2*j+1];
            assert(thread_buff_size>=dof*M.Dim(0)+src_cnt[i][2*j+1]*COORD_DIM);
            for(size_t k1=0;k1<src_cnt[i][2*j+1];k1++){ // Compute shifted source coordinates.
              for(size_t k0=0;k0<COORD_DIM;k0++){
                s_coord[k1*COORD_DIM+k0]=src_coord_[k1*COORD_DIM+k0]+shift_coord[i][j*COORD_DIM+k0];
              }
            }
            double_layer_kernel( s_coord                  , src_cnt[i][2*j+1], input_data[0]+src_value[i][2*j+1], dof,
                                 coord_data[0]+trg_coord[i], trg_cnt[i]       , t_value);
            interac_cnt+=src_cnt[i][2*j+1]*trg_cnt[i];
          }
        }
        if(M.Dim(0)>0 && M.Dim(1)>0 && interac_cnt>0){
          // Scale the buffered result and apply M, accumulating (beta=1.0)
          // into the real output location.
          assert(trg_cnt[i]*ker_dim1==M.Dim(0)); UNUSED(ker_dim1);
          for(size_t j=0;j<dof*M.Dim(0);j++) t_value[j]*=scaling[i];
          Matrix<Real_t>  in_vec(dof, M.Dim(0), t_value                    , false);
          Matrix<Real_t> out_vec(dof, M.Dim(1), output_data[0]+trg_value[i], false);
          Matrix<Real_t>::DGEMM(out_vec, in_vec, M, 1.0);
        }
      }
    }
#ifdef __INTEL_OFFLOAD
    if(device) MIC_Lock::release_lock(lock_idx);
#endif
  }
#ifndef __MIC_ASYNCH__
#ifdef __INTEL_OFFLOAD
#pragma offload if(device) target(mic:0)
  {if(device) MIC_Lock::wait_lock(lock_idx);}
#endif
#endif
  Profile::Toc();
}
  2809. template <class FMMNode>
  2810. void FMM_Pts<FMMNode>::X_ListSetup(SetupData<Real_t>& setup_data, std::vector<Matrix<Real_t> >& buff, std::vector<Vector<FMMNode_t*> >& n_list, int level, bool device){
  2811. { // Set setup_data
  2812. setup_data.level=level;
  2813. setup_data.kernel=&aux_kernel;
  2814. setup_data.interac_type.resize(1);
  2815. setup_data.interac_type[0]=X_Type;
  2816. setup_data. input_data=&buff[4];
  2817. setup_data.output_data=&buff[1];
  2818. setup_data. coord_data=&buff[6];
  2819. Vector<FMMNode_t*>& nodes_in =n_list[4];
  2820. Vector<FMMNode_t*>& nodes_out=n_list[1];
  2821. setup_data.nodes_in .clear();
  2822. setup_data.nodes_out.clear();
  2823. for(size_t i=0;i<nodes_in .Dim();i++) if(nodes_in [i]->Depth()==level-1 || level==-1) setup_data.nodes_in .push_back(nodes_in [i]);
  2824. for(size_t i=0;i<nodes_out.Dim();i++) if(nodes_out[i]->Depth()==level || level==-1) setup_data.nodes_out.push_back(nodes_out[i]);
  2825. }
  2826. std::vector<void*>& nodes_in =setup_data.nodes_in ;
  2827. std::vector<void*>& nodes_out=setup_data.nodes_out;
  2828. std::vector<Vector<Real_t>*>& input_vector=setup_data. input_vector; input_vector.clear();
  2829. std::vector<Vector<Real_t>*>& output_vector=setup_data.output_vector; output_vector.clear();
  2830. for(size_t i=0;i<nodes_in .size();i++){
  2831. input_vector .push_back(&((FMMNode*)nodes_in [i])->src_coord);
  2832. input_vector .push_back(&((FMMNode*)nodes_in [i])->src_value);
  2833. input_vector .push_back(&((FMMNode*)nodes_in [i])->surf_coord);
  2834. input_vector .push_back(&((FMMNode*)nodes_in [i])->surf_value);
  2835. }
  2836. for(size_t i=0;i<nodes_out.size();i++){
  2837. output_vector.push_back(&dnwd_check_surf[((FMMNode*)nodes_out[i])->Depth()]);
  2838. output_vector.push_back(&((FMMData*)((FMMNode*)nodes_out[i])->FMMData())->dnward_equiv);
  2839. }
  2840. //Downward check to downward equivalent matrix.
  2841. Matrix<Real_t>& M_dc2de = this->mat->Mat(level, DC2DE_Type, 0);
  2842. this->SetupInteracPts(setup_data, false, true, &M_dc2de,device);
  2843. { // Resize device buffer
  2844. size_t n=setup_data.output_data->Dim(0)*setup_data.output_data->Dim(1)*sizeof(Real_t);
  2845. if(this->dev_buffer.Dim()<n) this->dev_buffer.Resize(n);
  2846. }
  2847. }
template <class FMMNode>
// Evaluate the X-list interactions prepared by X_ListSetup.
void FMM_Pts<FMMNode>::X_List     (SetupData<Real_t>&  setup_data, bool device){
  //Add X_List contribution.
  this->EvalListPts(setup_data, device);
}
  2853. template <class FMMNode>
  2854. void FMM_Pts<FMMNode>::W_ListSetup(SetupData<Real_t>& setup_data, std::vector<Matrix<Real_t> >& buff, std::vector<Vector<FMMNode_t*> >& n_list, int level, bool device){
  2855. { // Set setup_data
  2856. setup_data.level=level;
  2857. setup_data.kernel=&kernel;
  2858. setup_data.interac_type.resize(1);
  2859. setup_data.interac_type[0]=W_Type;
  2860. setup_data. input_data=&buff[0];
  2861. setup_data.output_data=&buff[5];
  2862. setup_data. coord_data=&buff[6];
  2863. Vector<FMMNode_t*>& nodes_in =n_list[0];
  2864. Vector<FMMNode_t*>& nodes_out=n_list[5];
  2865. setup_data.nodes_in .clear();
  2866. setup_data.nodes_out.clear();
  2867. for(size_t i=0;i<nodes_in .Dim();i++) if(nodes_in [i]->Depth()==level+1 || level==-1) setup_data.nodes_in .push_back(nodes_in [i]);
  2868. for(size_t i=0;i<nodes_out.Dim();i++) if(nodes_out[i]->Depth()==level || level==-1) setup_data.nodes_out.push_back(nodes_out[i]);
  2869. }
  2870. std::vector<void*>& nodes_in =setup_data.nodes_in ;
  2871. std::vector<void*>& nodes_out=setup_data.nodes_out;
  2872. std::vector<Vector<Real_t>*>& input_vector=setup_data. input_vector; input_vector.clear();
  2873. std::vector<Vector<Real_t>*>& output_vector=setup_data.output_vector; output_vector.clear();
  2874. for(size_t i=0;i<nodes_in .size();i++){
  2875. input_vector .push_back(&upwd_equiv_surf[((FMMNode*)nodes_in [i])->Depth()]);
  2876. input_vector .push_back(&((FMMData*)((FMMNode*)nodes_in [i])->FMMData())->upward_equiv);
  2877. input_vector .push_back(NULL);
  2878. input_vector .push_back(NULL);
  2879. }
  2880. for(size_t i=0;i<nodes_out.size();i++){
  2881. output_vector.push_back(&((FMMNode*)nodes_out[i])->trg_coord);
  2882. output_vector.push_back(&((FMMNode*)nodes_out[i])->trg_value);
  2883. }
  2884. this->SetupInteracPts(setup_data, true, false, NULL, device);
  2885. { // Resize device buffer
  2886. size_t n=setup_data.output_data->Dim(0)*setup_data.output_data->Dim(1)*sizeof(Real_t);
  2887. if(this->dev_buffer.Dim()<n) this->dev_buffer.Resize(n);
  2888. }
  2889. }
template <class FMMNode>
// Evaluate the W-list interactions prepared by W_ListSetup.
void FMM_Pts<FMMNode>::W_List     (SetupData<Real_t>&  setup_data, bool device){
  //Add W_List contribution.
  this->EvalListPts(setup_data, device);
}
  2895. template <class FMMNode>
  2896. void FMM_Pts<FMMNode>::U_ListSetup(SetupData<Real_t>& setup_data, std::vector<Matrix<Real_t> >& buff, std::vector<Vector<FMMNode_t*> >& n_list, int level, bool device){
  2897. { // Set setup_data
  2898. setup_data.level=level;
  2899. setup_data.kernel=&kernel;
  2900. setup_data.interac_type.resize(3);
  2901. setup_data.interac_type[0]=U0_Type;
  2902. setup_data.interac_type[1]=U1_Type;
  2903. setup_data.interac_type[2]=U2_Type;
  2904. setup_data. input_data=&buff[4];
  2905. setup_data.output_data=&buff[5];
  2906. setup_data. coord_data=&buff[6];
  2907. Vector<FMMNode_t*>& nodes_in =n_list[4];
  2908. Vector<FMMNode_t*>& nodes_out=n_list[5];
  2909. setup_data.nodes_in .clear();
  2910. setup_data.nodes_out.clear();
  2911. for(size_t i=0;i<nodes_in .Dim();i++) if((level-1<=nodes_in [i]->Depth() && nodes_in [i]->Depth()<=level+1) || level==-1) setup_data.nodes_in .push_back(nodes_in [i]);
  2912. for(size_t i=0;i<nodes_out.Dim();i++) if(( nodes_out[i]->Depth()==level ) || level==-1) setup_data.nodes_out.push_back(nodes_out[i]);
  2913. }
  2914. std::vector<void*>& nodes_in =setup_data.nodes_in ;
  2915. std::vector<void*>& nodes_out=setup_data.nodes_out;
  2916. std::vector<Vector<Real_t>*>& input_vector=setup_data. input_vector; input_vector.clear();
  2917. std::vector<Vector<Real_t>*>& output_vector=setup_data.output_vector; output_vector.clear();
  2918. for(size_t i=0;i<nodes_in .size();i++){
  2919. input_vector .push_back(&((FMMNode*)nodes_in [i])->src_coord);
  2920. input_vector .push_back(&((FMMNode*)nodes_in [i])->src_value);
  2921. input_vector .push_back(&((FMMNode*)nodes_in [i])->surf_coord);
  2922. input_vector .push_back(&((FMMNode*)nodes_in [i])->surf_value);
  2923. }
  2924. for(size_t i=0;i<nodes_out.size();i++){
  2925. output_vector.push_back(&((FMMNode*)nodes_out[i])->trg_coord);
  2926. output_vector.push_back(&((FMMNode*)nodes_out[i])->trg_value);
  2927. }
  2928. this->SetupInteracPts(setup_data, false, false, NULL, device);
  2929. { // Resize device buffer
  2930. size_t n=setup_data.output_data->Dim(0)*setup_data.output_data->Dim(1)*sizeof(Real_t);
  2931. if(this->dev_buffer.Dim()<n) this->dev_buffer.Resize(n);
  2932. }
  2933. }
template <class FMMNode>
// Evaluate the U-list (near-field) interactions prepared by U_ListSetup.
void FMM_Pts<FMMNode>::U_List     (SetupData<Real_t>&  setup_data, bool device){
  //Add U_List contribution.
  this->EvalListPts(setup_data, device);
}
  2939. template <class FMMNode>
  2940. void FMM_Pts<FMMNode>::Down2TargetSetup(SetupData<Real_t>& setup_data, std::vector<Matrix<Real_t> >& buff, std::vector<Vector<FMMNode_t*> >& n_list, int level, bool device){
  2941. { // Set setup_data
  2942. setup_data.level=level;
  2943. setup_data.kernel=&kernel;
  2944. setup_data.interac_type.resize(1);
  2945. setup_data.interac_type[0]=D2T_Type;
  2946. setup_data. input_data=&buff[1];
  2947. setup_data.output_data=&buff[5];
  2948. setup_data. coord_data=&buff[6];
  2949. Vector<FMMNode_t*>& nodes_in =n_list[1];
  2950. Vector<FMMNode_t*>& nodes_out=n_list[5];
  2951. setup_data.nodes_in .clear();
  2952. setup_data.nodes_out.clear();
  2953. for(size_t i=0;i<nodes_in .Dim();i++) if(nodes_in [i]->Depth()==level || level==-1) setup_data.nodes_in .push_back(nodes_in [i]);
  2954. for(size_t i=0;i<nodes_out.Dim();i++) if(nodes_out[i]->Depth()==level || level==-1) setup_data.nodes_out.push_back(nodes_out[i]);
  2955. }
  2956. std::vector<void*>& nodes_in =setup_data.nodes_in ;
  2957. std::vector<void*>& nodes_out=setup_data.nodes_out;
  2958. std::vector<Vector<Real_t>*>& input_vector=setup_data. input_vector; input_vector.clear();
  2959. std::vector<Vector<Real_t>*>& output_vector=setup_data.output_vector; output_vector.clear();
  2960. for(size_t i=0;i<nodes_in .size();i++){
  2961. input_vector .push_back(&dnwd_equiv_surf[((FMMNode*)nodes_in [i])->Depth()]);
  2962. input_vector .push_back(&((FMMData*)((FMMNode*)nodes_in [i])->FMMData())->dnward_equiv);
  2963. input_vector .push_back(NULL);
  2964. input_vector .push_back(NULL);
  2965. }
  2966. for(size_t i=0;i<nodes_out.size();i++){
  2967. output_vector.push_back(&((FMMNode*)nodes_out[i])->trg_coord);
  2968. output_vector.push_back(&((FMMNode*)nodes_out[i])->trg_value);
  2969. }
  2970. this->SetupInteracPts(setup_data, true, false, NULL, device);
  2971. { // Resize device buffer
  2972. size_t n=setup_data.output_data->Dim(0)*setup_data.output_data->Dim(1)*sizeof(Real_t);
  2973. if(this->dev_buffer.Dim()<n) this->dev_buffer.Resize(n);
  2974. }
  2975. }
template <class FMMNode>
// Evaluate the downward-equivalent-to-target interactions prepared by Down2TargetSetup.
void FMM_Pts<FMMNode>::Down2Target(SetupData<Real_t>&  setup_data, bool device){
  //Add Down2Target contribution.
  this->EvalListPts(setup_data, device);
}
template <class FMMNode>
// Intentionally empty: no post-processing is required for FMM_Pts.
// NOTE(review): presumably a hook overridden by derived solvers — confirm.
void FMM_Pts<FMMNode>::PostProcessing(std::vector<FMMNode_t*>& nodes){
}
template <class FMMNode>
// Intentionally a no-op for FMM_Pts: results are already written in place by
// the evaluation routines. The disabled loop below shows the intended
// per-node traversal should copying ever be needed.
void FMM_Pts<FMMNode>::CopyOutput(FMMNode** nodes, size_t n){
//  for(size_t i=0;i<n;i++){
//    FMMData* fmm_data=((FMMData*)nodes[i]->FMMData());
//  }
}
  2990. }//end namespace