fmm_pts.txx 145 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327
73278327932803281328232833284328532863287328832893290329132923293329432953296329732983299330033013302330333043305330633073308330933103311331233133314331533163317331833193320332133223323332433253326332733283329333033313332333333343335333633373338333933403341334233433344334533463347334833493350335133523353335433553356335733583359336033613362336333643365336633673368336933703371337233733374337533763377337833793380338133823383338433853386338733883389339033913392339333943395339633973398339934003401340234033404340534063407340834093410341134123413341434153416341734183419342034213422342334243425342634273428342934303431343234333434343534363437343834393440344134423443344434453446344734483449345034513452345334543455345634573458345934603461346234633464346534663467346834693470347134723473347434753476347734783479348034813482348334843485348634873488348934903491349234933494349534963497349834993500350135023503350435053506350735083509351035113512351335143515351635173518351935203521352235233524352535263527352835293530353135323533353435353536353735383539354035413542354335443545354635473548354935503551355235533554355535563557355835593560356135623563356435653566356735683569357035713572357335743575357635773578357935803581358235833584358535863587358835893590359135923593359435953596359735983599360036013602360336043605360636073608360936103611361236133614361536163617361836193620362136223623362436253626362736283629363036313632363336343635363636373638363936403641364236433644364536463647364836493650365136523653365436553656365736583659366036613662366336643665366636673668366936703671367236733674367536763677367836793680368136823683368436853686368736883689369036913692369336943695369636973698369937003701370237033704370537063707370837093710371137123713371437153716371737183719372037213722372337243725372637273728372937303731373237333734373537363737373837393740374137423743374437453746374737483749375037513752375337543755375637573758375937603761376237633764376537663767376837693770377137723773377437753776377
73778377937803781378237833784378537863787378837893790379137923793379437953796379737983799380038013802380338043805380638073808380938103811381238133814381538163817381838193820382138223823382438253826382738283829383038313832383338343835383638373838383938403841
  1. /**
  2. * \file fmm_pts.txx
  3. * \author Dhairya Malhotra, dhairya.malhotra@gmail.com
  4. * \date 3-07-2011
  5. * \brief This file contains the implementation of the FMM_Pts class.
  6. */
  7. #include <omp.h>
  8. #include <cmath>
  9. #include <cstdlib>
  10. #include <cassert>
  11. #include <sstream>
  12. #include <iostream>
  13. #include <stdint.h>
  14. #include <set>
  15. #ifdef PVFMM_HAVE_SYS_STAT_H
  16. #include <sys/stat.h>
  17. #endif
  18. #ifdef __SSE__
  19. #include <xmmintrin.h>
  20. #endif
  21. #ifdef __SSE2__
  22. #include <emmintrin.h>
  23. #endif
  24. #ifdef __SSE3__
  25. #include <pmmintrin.h>
  26. #endif
  27. #ifdef __AVX__
  28. #include <immintrin.h>
  29. #endif
  30. #if defined(__MIC__)
  31. #include <immintrin.h>
  32. #endif
  33. #include <profile.hpp>
  34. namespace pvfmm{
  35. /**
  36. * \brief Returns the coordinates of points on the surface of a cube.
  37. * \param[in] p Number of points on an edge of the cube is (n+1)
  38. * \param[in] c Coordinates to the centre of the cube (3D array).
  39. * \param[in] alpha Scaling factor for the size of the cube.
  40. * \param[in] depth Depth of the cube in the octree.
  41. * \return Vector with coordinates of points on the surface of the cube in the
  42. * format [x0 y0 z0 x1 y1 z1 .... ].
  43. */
  44. template <class Real_t>
  45. std::vector<Real_t> surface(int p, Real_t* c, Real_t alpha, int depth){
  46. size_t n_=(6*(p-1)*(p-1)+2); //Total number of points.
  47. std::vector<Real_t> coord(n_*3);
  48. coord[0]=coord[1]=coord[2]=-1.0;
  49. size_t cnt=1;
  50. for(int i=0;i<p-1;i++)
  51. for(int j=0;j<p-1;j++){
  52. coord[cnt*3 ]=-1.0;
  53. coord[cnt*3+1]=(2.0*(i+1)-p+1)/(p-1);
  54. coord[cnt*3+2]=(2.0*j-p+1)/(p-1);
  55. cnt++;
  56. }
  57. for(int i=0;i<p-1;i++)
  58. for(int j=0;j<p-1;j++){
  59. coord[cnt*3 ]=(2.0*i-p+1)/(p-1);
  60. coord[cnt*3+1]=-1.0;
  61. coord[cnt*3+2]=(2.0*(j+1)-p+1)/(p-1);
  62. cnt++;
  63. }
  64. for(int i=0;i<p-1;i++)
  65. for(int j=0;j<p-1;j++){
  66. coord[cnt*3 ]=(2.0*(i+1)-p+1)/(p-1);
  67. coord[cnt*3+1]=(2.0*j-p+1)/(p-1);
  68. coord[cnt*3+2]=-1.0;
  69. cnt++;
  70. }
  71. for(size_t i=0;i<(n_/2)*3;i++)
  72. coord[cnt*3+i]=-coord[i];
  73. Real_t r = 0.5*pow(0.5,depth);
  74. Real_t b = alpha*r;
  75. for(size_t i=0;i<n_;i++){
  76. coord[i*3+0]=(coord[i*3+0]+1.0)*b+c[0];
  77. coord[i*3+1]=(coord[i*3+1]+1.0)*b+c[1];
  78. coord[i*3+2]=(coord[i*3+2]+1.0)*b+c[2];
  79. }
  80. return coord;
  81. }
  82. /**
  83. * \brief Returns the coordinates of points on the upward check surface of cube.
  84. * \see surface()
  85. */
  86. template <class Real_t>
  87. std::vector<Real_t> u_check_surf(int p, Real_t* c, int depth){
  88. Real_t r=0.5*pow(0.5,depth);
  89. Real_t coord[3]={c[0]-r*(RAD1-1.0),c[1]-r*(RAD1-1.0),c[2]-r*(RAD1-1.0)};
  90. return surface(p,coord,(Real_t)RAD1,depth);
  91. }
  92. /**
  93. * \brief Returns the coordinates of points on the upward equivalent surface of cube.
  94. * \see surface()
  95. */
  96. template <class Real_t>
  97. std::vector<Real_t> u_equiv_surf(int p, Real_t* c, int depth){
  98. Real_t r=0.5*pow(0.5,depth);
  99. Real_t coord[3]={c[0]-r*(RAD0-1.0),c[1]-r*(RAD0-1.0),c[2]-r*(RAD0-1.0)};
  100. return surface(p,coord,(Real_t)RAD0,depth);
  101. }
  102. /**
  103. * \brief Returns the coordinates of points on the downward check surface of cube.
  104. * \see surface()
  105. */
  106. template <class Real_t>
  107. std::vector<Real_t> d_check_surf(int p, Real_t* c, int depth){
  108. Real_t r=0.5*pow(0.5,depth);
  109. Real_t coord[3]={c[0]-r*(RAD0-1.0),c[1]-r*(RAD0-1.0),c[2]-r*(RAD0-1.0)};
  110. return surface(p,coord,(Real_t)RAD0,depth);
  111. }
  112. /**
  113. * \brief Returns the coordinates of points on the downward equivalent surface of cube.
  114. * \see surface()
  115. */
  116. template <class Real_t>
  117. std::vector<Real_t> d_equiv_surf(int p, Real_t* c, int depth){
  118. Real_t r=0.5*pow(0.5,depth);
  119. Real_t coord[3]={c[0]-r*(RAD1-1.0),c[1]-r*(RAD1-1.0),c[2]-r*(RAD1-1.0)};
  120. return surface(p,coord,(Real_t)RAD1,depth);
  121. }
  122. /**
  123. * \brief Defines the 3D grid for convolution in FFT acceleration of V-list.
  124. * \see surface()
  125. */
  126. template <class Real_t>
  127. std::vector<Real_t> conv_grid(int p, Real_t* c, int depth){
  128. Real_t r=pow(0.5,depth);
  129. Real_t a=r*RAD0;
  130. Real_t coord[3]={c[0],c[1],c[2]};
  131. int n1=p*2;
  132. int n2=(int)pow((Real_t)n1,2);
  133. int n3=(int)pow((Real_t)n1,3);
  134. std::vector<Real_t> grid(n3*3);
  135. for(int i=0;i<n1;i++)
  136. for(int j=0;j<n1;j++)
  137. for(int k=0;k<n1;k++){
  138. grid[(i+n1*j+n2*k)*3+0]=(i-p)*a/(p-1)+coord[0];
  139. grid[(i+n1*j+n2*k)*3+1]=(j-p)*a/(p-1)+coord[1];
  140. grid[(i+n1*j+n2*k)*3+2]=(k-p)*a/(p-1)+coord[2];
  141. }
  142. return grid;
  143. }
template <class Real_t>
void FMM_Data<Real_t>::Clear(){
  // Drop the stored upward-equivalent (multipole) expansion coefficients.
  upward_equiv.Resize(0);
}
  148. template <class Real_t>
  149. PackedData FMM_Data<Real_t>::PackMultipole(void* buff_ptr){
  150. PackedData p0; p0.data=buff_ptr;
  151. p0.length=upward_equiv.Dim()*sizeof(Real_t);
  152. if(p0.length==0) return p0;
  153. if(p0.data==NULL) p0.data=(char*)&upward_equiv[0];
  154. else mem::memcopy(p0.data,&upward_equiv[0],p0.length);
  155. return p0;
  156. }
  157. template <class Real_t>
  158. void FMM_Data<Real_t>::AddMultipole(PackedData p0){
  159. Real_t* data=(Real_t*)p0.data;
  160. size_t n=p0.length/sizeof(Real_t);
  161. assert(upward_equiv.Dim()==n);
  162. Matrix<Real_t> v0(1,n,&upward_equiv[0],false);
  163. Matrix<Real_t> v1(1,n,data,false);
  164. v0+=v1;
  165. }
  166. template <class Real_t>
  167. void FMM_Data<Real_t>::InitMultipole(PackedData p0, bool own_data){
  168. Real_t* data=(Real_t*)p0.data;
  169. size_t n=p0.length/sizeof(Real_t);
  170. if(n==0) return;
  171. if(own_data){
  172. upward_equiv=Vector<Real_t>(n, &data[0], false);
  173. }else{
  174. upward_equiv.ReInit(n, &data[0], false);
  175. }
  176. }
template <class FMMNode>
FMM_Pts<FMMNode>::~FMM_Pts() {
  // Free the table of precomputed matrices.
  if(mat!=NULL){
    // int rank;
    // MPI_Comm_rank(comm,&rank);
    // if(rank==0) mat->Save2File("Precomp.data");
    delete mat;
    mat=NULL;
  }
  // Destroy the FFT plan used for V-list precomputation, if one was created.
  if(vprecomp_fft_flag) FFTW_t<Real_t>::fft_destroy_plan(vprecomp_fftplan);
#ifdef __INTEL_OFFLOAD0
#pragma offload target(mic:0)
#endif
  { // NOTE(review): with __INTEL_OFFLOAD0 defined this block runs on the MIC
    // coprocessor, presumably where the V-list plans were created — confirm.
    if(vlist_fft_flag ) FFTW_t<Real_t>::fft_destroy_plan(vlist_fftplan );
    if(vlist_ifft_flag) FFTW_t<Real_t>::fft_destroy_plan(vlist_ifftplan);
    vlist_fft_flag =false;
    vlist_ifft_flag=false;
  }
}
  197. template <class FMMNode>
  198. void FMM_Pts<FMMNode>::Initialize(int mult_order, const MPI_Comm& comm_, const Kernel<Real_t>* kernel_){
  199. Profile::Tic("InitFMM_Pts",&comm_,true);{
  200. bool verbose=false;
  201. #ifndef NDEBUG
  202. #ifdef __VERBOSE__
  203. int rank;
  204. MPI_Comm_rank(comm_,&rank);
  205. if(!rank) verbose=true;
  206. #endif
  207. #endif
  208. if(kernel_) kernel_->Initialize(verbose);
  209. multipole_order=mult_order;
  210. comm=comm_;
  211. kernel=kernel_;
  212. assert(kernel!=NULL);
  213. mat=new PrecompMat<Real_t>(Homogen(), MAX_DEPTH+1);
  214. if(this->mat_fname.size()==0){
  215. std::stringstream st;
  216. st<<PVFMM_PRECOMP_DATA_PATH;
  217. if(!st.str().size()){ // look in PVFMM_DIR
  218. char* pvfmm_dir = getenv ("PVFMM_DIR");
  219. if(pvfmm_dir) st<<pvfmm_dir<<'/';
  220. }
  221. #ifndef STAT_MACROS_BROKEN
  222. if(st.str().size()){ // check if the path is a directory
  223. struct stat stat_buff;
  224. if(stat(st.str().c_str(), &stat_buff) || !S_ISDIR(stat_buff.st_mode)){
  225. std::cout<<"error: path not found: "<<st.str()<<'\n';
  226. exit(0);
  227. }
  228. }
  229. #endif
  230. st<<"Precomp_"<<kernel->ker_name.c_str()<<"_m"<<mult_order;
  231. if(sizeof(Real_t)==8) st<<"";
  232. else if(sizeof(Real_t)==4) st<<"_f";
  233. else st<<"_t"<<sizeof(Real_t);
  234. st<<".data";
  235. this->mat_fname=st.str();
  236. }
  237. this->mat->LoadFile(mat_fname.c_str(), this->comm);
  238. interac_list.Initialize(COORD_DIM, this->mat);
  239. Profile::Tic("PrecompUC2UE",&comm,false,4);
  240. this->PrecompAll(UC2UE_Type);
  241. Profile::Toc();
  242. Profile::Tic("PrecompDC2DE",&comm,false,4);
  243. this->PrecompAll(DC2DE_Type);
  244. Profile::Toc();
  245. Profile::Tic("PrecompBC",&comm,false,4);
  246. { /*
  247. int type=BC_Type;
  248. for(int l=0;l<MAX_DEPTH;l++)
  249. for(size_t indx=0;indx<this->interac_list.ListCount((Mat_Type)type);indx++){
  250. Matrix<Real_t>& M=this->mat->Mat(l, (Mat_Type)type, indx);
  251. M.Resize(0,0);
  252. } // */
  253. }
  254. this->PrecompAll(BC_Type,0);
  255. Profile::Toc();
  256. Profile::Tic("PrecompU2U",&comm,false,4);
  257. this->PrecompAll(U2U_Type);
  258. Profile::Toc();
  259. Profile::Tic("PrecompD2D",&comm,false,4);
  260. this->PrecompAll(D2D_Type);
  261. Profile::Toc();
  262. Profile::Tic("PrecompV",&comm,false,4);
  263. this->PrecompAll(V_Type);
  264. Profile::Toc();
  265. Profile::Tic("PrecompV1",&comm,false,4);
  266. this->PrecompAll(V1_Type);
  267. Profile::Toc();
  268. }Profile::Toc();
  269. }
/**
 * \brief Build the permutation (and per-entry scaling) that maps data sampled
 * on the equivalent/check surface under one symmetry transform of the cube:
 * an axis reflection, an axis swap, or level scaling.
 * \param[in] m Surface order; points are those of d_check_surf(m,...).
 * \param[in] p_indx Symmetry index (ReflecX/Y/Z, SwapXY, SwapXZ, or Scaling).
 * \param[in] ker_perm The kernel's own per-component permutation and scaling
 * for this symmetry.
 * \param[in] scal_exp Optional per-component scaling exponents; applied only
 * when p_indx==Scaling.
 * \return Permutation of size n_trg*dof combining the surface-point map with
 * the kernel-component permutation.
 */
template <class Real_t>
Permutation<Real_t> equiv_surf_perm(size_t m, size_t p_indx, const Permutation<Real_t>& ker_perm, const Vector<Real_t>* scal_exp=NULL){
  Real_t eps=1e-10; // tolerance for matching transformed surface points
  int dof=ker_perm.Dim(); // kernel components per surface point
  // Reference surface: downward check surface of the level-0 cube at the origin.
  Real_t c[3]={-0.5,-0.5,-0.5};
  std::vector<Real_t> trg_coord=d_check_surf(m,c,0);
  int n_trg=trg_coord.size()/3;
  Permutation<Real_t> P=Permutation<Real_t>(n_trg*dof);
  if(p_indx==ReflecX || p_indx==ReflecY || p_indx==ReflecZ){ // Set P.perm
    // Point j maps to the point i whose coordinates equal j's with the
    // reflected axis negated; within a point, components follow ker_perm.
    for(int i=0;i<n_trg;i++)
    for(int j=0;j<n_trg;j++){
      if(fabs(trg_coord[i*3+0]-trg_coord[j*3+0]*(p_indx==ReflecX?-1.0:1.0))<eps)
      if(fabs(trg_coord[i*3+1]-trg_coord[j*3+1]*(p_indx==ReflecY?-1.0:1.0))<eps)
      if(fabs(trg_coord[i*3+2]-trg_coord[j*3+2]*(p_indx==ReflecZ?-1.0:1.0))<eps){
        for(int k=0;k<dof;k++){
          P.perm[j*dof+k]=i*dof+ker_perm.perm[k];
        }
      }
    }
  }else if(p_indx==SwapXY || p_indx==SwapXZ){
    // Match each point j against the point i obtained by swapping the two axes.
    for(int i=0;i<n_trg;i++)
    for(int j=0;j<n_trg;j++){
      if(fabs(trg_coord[i*3+0]-trg_coord[j*3+(p_indx==SwapXY?1:2)])<eps)
      if(fabs(trg_coord[i*3+1]-trg_coord[j*3+(p_indx==SwapXY?0:1)])<eps)
      if(fabs(trg_coord[i*3+2]-trg_coord[j*3+(p_indx==SwapXY?2:0)])<eps){
        for(int k=0;k<dof;k++){
          P.perm[j*dof+k]=i*dof+ker_perm.perm[k];
        }
      }
    }
  }else{
    // Identity point map; only the kernel-component permutation applies.
    for(int j=0;j<n_trg;j++){
      for(int k=0;k<dof;k++){
        P.perm[j*dof+k]=j*dof+ker_perm.perm[k];
      }
    }
  }
  if(scal_exp && p_indx==Scaling){ // Set level-by-level scaling
    assert(dof==scal_exp->Dim());
    Vector<Real_t> scal(scal_exp->Dim());
    for(size_t i=0;i<scal.Dim();i++){
      scal[i]=pow(2.0,(*scal_exp)[i]); // per-component factor 2^exponent
    }
    for(int j=0;j<n_trg;j++){
      for(int i=0;i<dof;i++){
        P.scal[j*dof+i]*=scal[i];
      }
    }
  }
  { // Set P.scal: fold in the kernel's own per-component scale factors
    for(int j=0;j<n_trg;j++){
      for(int i=0;i<dof;i++){
        P.scal[j*dof+i]*=ker_perm.scal[i];
      }
    }
  }
  return P;
}
  328. template <class FMMNode>
  329. Permutation<typename FMMNode::Real_t>& FMM_Pts<FMMNode>::PrecompPerm(Mat_Type type, Perm_Type perm_indx){
  330. //Check if the matrix already exists.
  331. Permutation<Real_t>& P_ = mat->Perm((Mat_Type)type, perm_indx);
  332. if(P_.Dim()!=0) return P_;
  333. size_t m=this->MultipoleOrder();
  334. size_t p_indx=perm_indx % C_Perm;
  335. //Compute the matrix.
  336. Permutation<Real_t> P;
  337. switch (type){
  338. case UC2UE_Type:
  339. {
  340. break;
  341. }
  342. case DC2DE_Type:
  343. {
  344. break;
  345. }
  346. case S2U_Type:
  347. {
  348. break;
  349. }
  350. case U2U_Type:
  351. {
  352. Vector<Real_t> scal_exp;
  353. Permutation<Real_t> ker_perm;
  354. if(perm_indx<C_Perm){ // Source permutation
  355. ker_perm=kernel->k_m2m->perm_vec[0 +p_indx];
  356. scal_exp=kernel->k_m2m->src_scal;
  357. }else{ // Target permutation
  358. ker_perm=kernel->k_m2m->perm_vec[0 +p_indx];
  359. scal_exp=kernel->k_m2m->src_scal;
  360. for(size_t i=0;i<scal_exp.Dim();i++) scal_exp[i]=-scal_exp[i];
  361. }
  362. P=equiv_surf_perm(m, p_indx, ker_perm, (this->Homogen()?&scal_exp:NULL));
  363. break;
  364. }
  365. case D2D_Type:
  366. {
  367. Vector<Real_t> scal_exp;
  368. Permutation<Real_t> ker_perm;
  369. if(perm_indx<C_Perm){ // Source permutation
  370. ker_perm=kernel->k_l2l->perm_vec[0 +p_indx];
  371. scal_exp=kernel->k_l2l->src_scal;
  372. }else{ // Target permutation
  373. ker_perm=kernel->k_l2l->perm_vec[0 +p_indx];
  374. scal_exp=kernel->k_l2l->src_scal;
  375. for(size_t i=0;i<scal_exp.Dim();i++) scal_exp[i]=-scal_exp[i];
  376. }
  377. P=equiv_surf_perm(m, p_indx, ker_perm, (this->Homogen()?&scal_exp:NULL));
  378. break;
  379. }
  380. case D2T_Type:
  381. {
  382. break;
  383. }
  384. case U0_Type:
  385. {
  386. break;
  387. }
  388. case U1_Type:
  389. {
  390. break;
  391. }
  392. case U2_Type:
  393. {
  394. break;
  395. }
  396. case V_Type:
  397. {
  398. break;
  399. }
  400. case V1_Type:
  401. {
  402. break;
  403. }
  404. case W_Type:
  405. {
  406. break;
  407. }
  408. case X_Type:
  409. {
  410. break;
  411. }
  412. case BC_Type:
  413. {
  414. break;
  415. }
  416. default:
  417. break;
  418. }
  419. //Save the matrix for future use.
  420. #pragma omp critical (PRECOMP_MATRIX_PTS)
  421. {
  422. if(P_.Dim()==0) P_=P;
  423. }
  424. return P_;
  425. }
  426. template <class FMMNode>
  427. Matrix<typename FMMNode::Real_t>& FMM_Pts<FMMNode>::Precomp(int level, Mat_Type type, size_t mat_indx){
  428. if(this->Homogen()) level=0;
  429. //Check if the matrix already exists.
  430. Matrix<Real_t>& M_ = this->mat->Mat(level, type, mat_indx);
  431. if(M_.Dim(0)!=0 && M_.Dim(1)!=0) return M_;
  432. else{ //Compute matrix from symmetry class (if possible).
  433. size_t class_indx = this->interac_list.InteracClass(type, mat_indx);
  434. if(class_indx!=mat_indx){
  435. Matrix<Real_t>& M0 = this->Precomp(level, type, class_indx);
  436. if(M0.Dim(0)==0 || M0.Dim(1)==0) return M_;
  437. for(size_t i=0;i<Perm_Count;i++) this->PrecompPerm(type, (Perm_Type) i);
  438. Permutation<Real_t>& Pr = this->interac_list.Perm_R(level, type, mat_indx);
  439. Permutation<Real_t>& Pc = this->interac_list.Perm_C(level, type, mat_indx);
  440. if(Pr.Dim()>0 && Pc.Dim()>0 && M0.Dim(0)>0 && M0.Dim(1)>0) return M_;
  441. }
  442. }
  443. //Compute the matrix.
  444. Matrix<Real_t> M;
  445. //int omp_p=omp_get_max_threads();
  446. switch (type){
  447. case UC2UE_Type:
  448. {
  449. if(MultipoleOrder()==0) break;
  450. const int* ker_dim=kernel->k_m2m->ker_dim;
  451. // Coord of upward check surface
  452. Real_t c[3]={0,0,0};
  453. std::vector<Real_t> uc_coord=u_check_surf(MultipoleOrder(),c,level);
  454. size_t n_uc=uc_coord.size()/3;
  455. // Coord of upward equivalent surface
  456. std::vector<Real_t> ue_coord=u_equiv_surf(MultipoleOrder(),c,level);
  457. size_t n_ue=ue_coord.size()/3;
  458. // Evaluate potential at check surface due to equivalent surface.
  459. Matrix<Real_t> M_e2c(n_ue*ker_dim[0],n_uc*ker_dim[1]);
  460. kernel->k_m2m->BuildMatrix(&ue_coord[0], n_ue,
  461. &uc_coord[0], n_uc, &(M_e2c[0][0]));
  462. Real_t eps=1.0;
  463. while(eps+(Real_t)1.0>1.0) eps*=0.5;
  464. M=M_e2c.pinv(sqrt(eps)); //check 2 equivalent
  465. break;
  466. }
  467. case DC2DE_Type:
  468. {
  469. if(MultipoleOrder()==0) break;
  470. const int* ker_dim=kernel->k_l2l->ker_dim;
  471. // Coord of downward check surface
  472. Real_t c[3]={0,0,0};
  473. std::vector<Real_t> check_surf=d_check_surf(MultipoleOrder(),c,level);
  474. size_t n_ch=check_surf.size()/3;
  475. // Coord of downward equivalent surface
  476. std::vector<Real_t> equiv_surf=d_equiv_surf(MultipoleOrder(),c,level);
  477. size_t n_eq=equiv_surf.size()/3;
  478. // Evaluate potential at check surface due to equivalent surface.
  479. Matrix<Real_t> M_e2c(n_eq*ker_dim[0],n_ch*ker_dim[1]);
  480. kernel->k_l2l->BuildMatrix(&equiv_surf[0], n_eq,
  481. &check_surf[0], n_ch, &(M_e2c[0][0]));
  482. Real_t eps=1.0;
  483. while(eps+(Real_t)1.0>1.0) eps*=0.5;
  484. M=M_e2c.pinv(sqrt(eps)); //check 2 equivalent
  485. break;
  486. }
  487. case S2U_Type:
  488. {
  489. break;
  490. }
  491. case U2U_Type:
  492. {
  493. if(MultipoleOrder()==0) break;
  494. const int* ker_dim=kernel->k_m2m->ker_dim;
  495. // Coord of upward check surface
  496. Real_t c[3]={0,0,0};
  497. std::vector<Real_t> check_surf=u_check_surf(MultipoleOrder(),c,level);
  498. size_t n_uc=check_surf.size()/3;
  499. // Coord of child's upward equivalent surface
  500. Real_t s=pow(0.5,(level+2));
  501. int* coord=interac_list.RelativeCoord(type,mat_indx);
  502. Real_t child_coord[3]={(coord[0]+1)*s,(coord[1]+1)*s,(coord[2]+1)*s};
  503. std::vector<Real_t> equiv_surf=u_equiv_surf(MultipoleOrder(),child_coord,level+1);
  504. size_t n_ue=equiv_surf.size()/3;
  505. // Evaluate potential at check surface due to equivalent surface.
  506. Matrix<Real_t> M_ce2c(n_ue*ker_dim[0],n_uc*ker_dim[1]);
  507. kernel->k_m2m->BuildMatrix(&equiv_surf[0], n_ue,
  508. &check_surf[0], n_uc, &(M_ce2c[0][0]));
  509. Matrix<Real_t>& M_c2e = Precomp(level, UC2UE_Type, 0);
  510. M=M_ce2c*M_c2e;
  511. break;
  512. }
  513. case D2D_Type:
  514. {
  515. if(MultipoleOrder()==0) break;
  516. const int* ker_dim=kernel->k_l2l->ker_dim;
  517. // Coord of downward check surface
  518. Real_t s=pow(0.5,level+1);
  519. int* coord=interac_list.RelativeCoord(type,mat_indx);
  520. Real_t c[3]={(coord[0]+1)*s,(coord[1]+1)*s,(coord[2]+1)*s};
  521. std::vector<Real_t> check_surf=d_check_surf(MultipoleOrder(),c,level);
  522. size_t n_dc=check_surf.size()/3;
  523. // Coord of parent's downward equivalent surface
  524. Real_t parent_coord[3]={0,0,0};
  525. std::vector<Real_t> equiv_surf=d_equiv_surf(MultipoleOrder(),parent_coord,level-1);
  526. size_t n_de=equiv_surf.size()/3;
  527. // Evaluate potential at check surface due to equivalent surface.
  528. Matrix<Real_t> M_pe2c(n_de*ker_dim[0],n_dc*ker_dim[1]);
  529. kernel->k_l2l->BuildMatrix(&equiv_surf[0], n_de,
  530. &check_surf[0], n_dc, &(M_pe2c[0][0]));
  531. Matrix<Real_t>& M_c2e=Precomp(level,DC2DE_Type,0);
  532. M=M_pe2c*M_c2e;
  533. break;
  534. }
  535. case D2T_Type:
  536. {
  537. if(MultipoleOrder()==0) break;
  538. const int* ker_dim=kernel->k_l2t->ker_dim;
  539. std::vector<Real_t>& rel_trg_coord=mat->RelativeTrgCoord();
  540. // Coord of target points
  541. Real_t r=pow(0.5,level);
  542. size_t n_trg=rel_trg_coord.size()/3;
  543. std::vector<Real_t> trg_coord(n_trg*3);
  544. for(size_t i=0;i<n_trg*COORD_DIM;i++) trg_coord[i]=rel_trg_coord[i]*r;
  545. // Coord of downward equivalent surface
  546. Real_t c[3]={0,0,0};
  547. std::vector<Real_t> equiv_surf=d_equiv_surf(MultipoleOrder(),c,level);
  548. size_t n_eq=equiv_surf.size()/3;
  549. // Evaluate potential at target points due to equivalent surface.
  550. {
  551. M .Resize(n_eq*ker_dim [0], n_trg*ker_dim [1]);
  552. kernel->k_l2t->BuildMatrix(&equiv_surf[0], n_eq, &trg_coord[0], n_trg, &(M [0][0]));
  553. }
  554. break;
  555. }
  556. case U0_Type:
  557. {
  558. break;
  559. }
  560. case U1_Type:
  561. {
  562. break;
  563. }
  564. case U2_Type:
  565. {
  566. break;
  567. }
  568. case V_Type:
  569. {
  570. if(MultipoleOrder()==0) break;
  571. const int* ker_dim=kernel->k_m2l->ker_dim;
  572. int n1=MultipoleOrder()*2;
  573. int n3 =n1*n1*n1;
  574. int n3_=n1*n1*(n1/2+1);
  575. //Compute the matrix.
  576. Real_t s=pow(0.5,level);
  577. int* coord2=interac_list.RelativeCoord(type,mat_indx);
  578. Real_t coord_diff[3]={coord2[0]*s,coord2[1]*s,coord2[2]*s};
  579. //Evaluate potential.
  580. std::vector<Real_t> r_trg(COORD_DIM,0.0);
  581. std::vector<Real_t> conv_poten(n3*ker_dim[0]*ker_dim[1]);
  582. std::vector<Real_t> conv_coord=conv_grid(MultipoleOrder(),coord_diff,level);
  583. kernel->k_m2l->BuildMatrix(&conv_coord[0],n3,&r_trg[0],1,&conv_poten[0]);
  584. //Rearrange data.
  585. Matrix<Real_t> M_conv(n3,ker_dim[0]*ker_dim[1],&conv_poten[0],false);
  586. M_conv=M_conv.Transpose();
  587. //Compute FFTW plan.
  588. int nnn[3]={n1,n1,n1};
  589. Real_t *fftw_in, *fftw_out;
  590. fftw_in = mem::aligned_new<Real_t>( n3 *ker_dim[0]*ker_dim[1]*sizeof(Real_t));
  591. fftw_out = mem::aligned_new<Real_t>(2*n3_*ker_dim[0]*ker_dim[1]*sizeof(Real_t));
  592. #pragma omp critical (FFTW_PLAN)
  593. {
  594. if (!vprecomp_fft_flag){
  595. vprecomp_fftplan = FFTW_t<Real_t>::fft_plan_many_dft_r2c(COORD_DIM, nnn, ker_dim[0]*ker_dim[1],
  596. (Real_t*)fftw_in, NULL, 1, n3, (typename FFTW_t<Real_t>::cplx*) fftw_out, NULL, 1, n3_, FFTW_ESTIMATE);
  597. vprecomp_fft_flag=true;
  598. }
  599. }
  600. //Compute FFT.
  601. mem::memcopy(fftw_in, &conv_poten[0], n3*ker_dim[0]*ker_dim[1]*sizeof(Real_t));
  602. FFTW_t<Real_t>::fft_execute_dft_r2c(vprecomp_fftplan, (Real_t*)fftw_in, (typename FFTW_t<Real_t>::cplx*)(fftw_out));
  603. Matrix<Real_t> M_(2*n3_*ker_dim[0]*ker_dim[1],1,(Real_t*)fftw_out,false);
  604. M=M_;
  605. //Free memory.
  606. mem::aligned_delete<Real_t>(fftw_in);
  607. mem::aligned_delete<Real_t>(fftw_out);
  608. break;
  609. }
  610. case V1_Type:
  611. {
  612. if(MultipoleOrder()==0) break;
  613. const int* ker_dim=kernel->k_m2l->ker_dim;
  614. size_t mat_cnt =interac_list.ListCount( V_Type);
  615. for(size_t k=0;k<mat_cnt;k++) Precomp(level, V_Type, k);
  616. const size_t chld_cnt=1UL<<COORD_DIM;
  617. size_t n1=MultipoleOrder()*2;
  618. size_t M_dim=n1*n1*(n1/2+1);
  619. size_t n3=n1*n1*n1;
  620. Vector<Real_t> zero_vec(M_dim*ker_dim[0]*ker_dim[1]*2);
  621. zero_vec.SetZero();
  622. Vector<Real_t*> M_ptr(chld_cnt*chld_cnt);
  623. for(size_t i=0;i<chld_cnt*chld_cnt;i++) M_ptr[i]=&zero_vec[0];
  624. int* rel_coord_=interac_list.RelativeCoord(V1_Type, mat_indx);
  625. for(int j1=0;j1<chld_cnt;j1++)
  626. for(int j2=0;j2<chld_cnt;j2++){
  627. int rel_coord[3]={rel_coord_[0]*2-(j1/1)%2+(j2/1)%2,
  628. rel_coord_[1]*2-(j1/2)%2+(j2/2)%2,
  629. rel_coord_[2]*2-(j1/4)%2+(j2/4)%2};
  630. for(size_t k=0;k<mat_cnt;k++){
  631. int* ref_coord=interac_list.RelativeCoord(V_Type, k);
  632. if(ref_coord[0]==rel_coord[0] &&
  633. ref_coord[1]==rel_coord[1] &&
  634. ref_coord[2]==rel_coord[2]){
  635. Matrix<Real_t>& M = this->mat->Mat(level, V_Type, k);
  636. M_ptr[j2*chld_cnt+j1]=&M[0][0];
  637. break;
  638. }
  639. }
  640. }
  641. // Build matrix ker_dim0 x ker_dim1 x M_dim x 8 x 8
  642. M.Resize(ker_dim[0]*ker_dim[1]*M_dim, 2*chld_cnt*chld_cnt);
  643. for(int j=0;j<ker_dim[0]*ker_dim[1]*M_dim;j++){
  644. for(size_t k=0;k<chld_cnt*chld_cnt;k++){
  645. M[j][k*2+0]=M_ptr[k][j*2+0]/n3;
  646. M[j][k*2+1]=M_ptr[k][j*2+1]/n3;
  647. }
  648. }
  649. break;
  650. }
  651. case W_Type:
  652. {
  653. if(MultipoleOrder()==0) break;
  654. const int* ker_dim=kernel->k_m2t->ker_dim;
  655. std::vector<Real_t>& rel_trg_coord=mat->RelativeTrgCoord();
  656. // Coord of target points
  657. Real_t s=pow(0.5,level);
  658. size_t n_trg=rel_trg_coord.size()/3;
  659. std::vector<Real_t> trg_coord(n_trg*3);
  660. for(size_t j=0;j<n_trg*COORD_DIM;j++) trg_coord[j]=rel_trg_coord[j]*s;
  661. // Coord of downward equivalent surface
  662. int* coord2=interac_list.RelativeCoord(type,mat_indx);
  663. Real_t c[3]={(coord2[0]+1)*s*0.25,(coord2[1]+1)*s*0.25,(coord2[2]+1)*s*0.25};
  664. std::vector<Real_t> equiv_surf=u_equiv_surf(MultipoleOrder(),c,level+1);
  665. size_t n_eq=equiv_surf.size()/3;
  666. // Evaluate potential at target points due to equivalent surface.
  667. {
  668. M .Resize(n_eq*ker_dim [0],n_trg*ker_dim [1]);
  669. kernel->k_m2t->BuildMatrix(&equiv_surf[0], n_eq, &trg_coord[0], n_trg, &(M [0][0]));
  670. }
  671. break;
  672. }
  673. case X_Type:
  674. {
  675. break;
  676. }
  677. case BC_Type:
  678. {
  679. if(!this->Homogen() || MultipoleOrder()==0) break;
  680. const int* ker_dim=kernel->k_m2l->ker_dim;
  681. size_t mat_cnt_m2m=interac_list.ListCount(U2U_Type);
  682. size_t n_surf=(6*(MultipoleOrder()-1)*(MultipoleOrder()-1)+2); //Total number of points.
  683. if((M.Dim(0)!=n_surf*ker_dim[0] || M.Dim(1)!=n_surf*ker_dim[1]) && level==0){
  684. Matrix<Real_t> M_m2m[BC_LEVELS+1];
  685. Matrix<Real_t> M_m2l[BC_LEVELS+1];
  686. Matrix<Real_t> M_l2l[BC_LEVELS+1];
  687. Matrix<Real_t> M_zero_avg(n_surf*ker_dim[0],n_surf*ker_dim[0]);
  688. { // Set average multipole charge to zero. (improves stability for large BC_LEVELS)
  689. M_zero_avg.SetZero();
  690. for(size_t i=0;i<n_surf*ker_dim[0];i++)
  691. M_zero_avg[i][i]+=1;
  692. for(size_t i=0;i<n_surf;i++)
  693. for(size_t j=0;j<n_surf;j++)
  694. for(size_t k=0;k<ker_dim[0];k++)
  695. M_zero_avg[i*ker_dim[0]+k][j*ker_dim[0]+k]-=1.0/n_surf;
  696. }
  697. for(int level=0; level>=-BC_LEVELS; level--){
  698. // Compute M_l2l
  699. {
  700. this->Precomp(level, D2D_Type, 0);
  701. Permutation<Real_t>& Pr = this->interac_list.Perm_R(level, D2D_Type, 0);
  702. Permutation<Real_t>& Pc = this->interac_list.Perm_C(level, D2D_Type, 0);
  703. M_l2l[-level] = Pr * this->Precomp(level, D2D_Type, this->interac_list.InteracClass(D2D_Type, 0)) * Pc;
  704. assert(M_l2l[-level].Dim(0)>0 && M_l2l[-level].Dim(1)>0);
  705. }
  706. // Compute M_m2m
  707. for(size_t mat_indx=0; mat_indx<mat_cnt_m2m; mat_indx++){
  708. this->Precomp(level, U2U_Type, mat_indx);
  709. Permutation<Real_t>& Pr = this->interac_list.Perm_R(level, U2U_Type, mat_indx);
  710. Permutation<Real_t>& Pc = this->interac_list.Perm_C(level, U2U_Type, mat_indx);
  711. Matrix<Real_t> M = Pr * this->Precomp(level, U2U_Type, this->interac_list.InteracClass(U2U_Type, mat_indx)) * Pc;
  712. assert(M.Dim(0)>0 && M.Dim(1)>0);
  713. if(mat_indx==0) M_m2m[-level] = M_zero_avg*M;
  714. else M_m2m[-level] += M_zero_avg*M;
  715. }
  716. // Compute M_m2l
  717. if(!Homogen() || level==0){
  718. Real_t s=(1UL<<(-level));
  719. Real_t ue_coord[3]={0,0,0};
  720. Real_t dc_coord[3]={0,0,0};
  721. std::vector<Real_t> src_coord=u_equiv_surf(MultipoleOrder(), ue_coord, level);
  722. std::vector<Real_t> trg_coord=d_check_surf(MultipoleOrder(), dc_coord, level);
  723. Matrix<Real_t> M_ue2dc(n_surf*ker_dim[0], n_surf*ker_dim[1]);
  724. M_ue2dc.SetZero();
  725. for(int x0=-2;x0<4;x0++)
  726. for(int x1=-2;x1<4;x1++)
  727. for(int x2=-2;x2<4;x2++)
  728. if(abs(x0)>1 || abs(x1)>1 || abs(x2)>1){
  729. ue_coord[0]=x0*s; ue_coord[1]=x1*s; ue_coord[2]=x2*s;
  730. std::vector<Real_t> src_coord=u_equiv_surf(MultipoleOrder(), ue_coord, level);
  731. Matrix<Real_t> M_tmp(n_surf*ker_dim[0], n_surf*ker_dim[1]);
  732. kernel->k_m2l->BuildMatrix(&src_coord[0], n_surf,
  733. &trg_coord[0], n_surf, &(M_tmp[0][0]));
  734. M_ue2dc+=M_tmp;
  735. }
  736. // Shift by constant.
  737. for(size_t i=0;i<M_ue2dc.Dim(0);i++){
  738. std::vector<Real_t> avg(ker_dim[1],0);
  739. for(size_t j=0; j<M_ue2dc.Dim(1); j+=ker_dim[1])
  740. for(int k=0; k<ker_dim[1]; k++) avg[k]+=M_ue2dc[i][j+k];
  741. for(int k=0; k<ker_dim[1]; k++) avg[k]/=n_surf;
  742. for(size_t j=0; j<M_ue2dc.Dim(1); j+=ker_dim[1])
  743. for(int k=0; k<ker_dim[1]; k++) M_ue2dc[i][j+k]-=avg[k];
  744. }
  745. Matrix<Real_t>& M_dc2de = Precomp(level, DC2DE_Type, 0);
  746. M_m2l[-level]=M_ue2dc*M_dc2de;
  747. }else M_m2l[-level]=M_m2l[-level-1];
  748. }
  749. for(int level=-BC_LEVELS;level<=0;level++){
  750. if(level==-BC_LEVELS) M = M_m2l[-level];
  751. else M = M_m2l[-level] + M_m2m[-level]*M*M_l2l[-level];
  752. { // Shift by constant. (improves stability for large BC_LEVELS)
  753. Matrix<Real_t> M_de2dc(n_surf*ker_dim[0], n_surf*ker_dim[1]);
  754. { // M_de2dc TODO: For homogeneous kernels, compute only once.
  755. // Coord of downward check surface
  756. Real_t c[3]={0,0,0};
  757. int level_=(Homogen()?0:level);
  758. std::vector<Real_t> check_surf=d_check_surf(MultipoleOrder(),c,level_);
  759. size_t n_ch=check_surf.size()/3;
  760. // Coord of downward equivalent surface
  761. std::vector<Real_t> equiv_surf=d_equiv_surf(MultipoleOrder(),c,level_);
  762. size_t n_eq=equiv_surf.size()/3;
  763. // Evaluate potential at check surface due to equivalent surface.
  764. kernel->k_m2l->BuildMatrix(&equiv_surf[0], n_eq,
  765. &check_surf[0], n_ch, &(M_de2dc[0][0]));
  766. }
  767. Matrix<Real_t> M_ue2dc=M*M_de2dc;
  768. for(size_t i=0;i<M_ue2dc.Dim(0);i++){
  769. std::vector<Real_t> avg(ker_dim[1],0);
  770. for(size_t j=0; j<M_ue2dc.Dim(1); j+=ker_dim[1])
  771. for(int k=0; k<ker_dim[1]; k++) avg[k]+=M_ue2dc[i][j+k];
  772. for(int k=0; k<ker_dim[1]; k++) avg[k]/=n_surf;
  773. for(size_t j=0; j<M_ue2dc.Dim(1); j+=ker_dim[1])
  774. for(int k=0; k<ker_dim[1]; k++) M_ue2dc[i][j+k]-=avg[k];
  775. }
  776. Matrix<Real_t>& M_dc2de = Precomp(level, DC2DE_Type, 0);
  777. M=M_ue2dc*M_dc2de;
  778. }
  779. }
  780. { // ax+by+cz+d correction.
  781. std::vector<Real_t> corner_pts;
  782. corner_pts.push_back(0); corner_pts.push_back(0); corner_pts.push_back(0);
  783. corner_pts.push_back(1); corner_pts.push_back(0); corner_pts.push_back(0);
  784. corner_pts.push_back(0); corner_pts.push_back(1); corner_pts.push_back(0);
  785. corner_pts.push_back(0); corner_pts.push_back(0); corner_pts.push_back(1);
  786. size_t n_corner=corner_pts.size()/3;
  787. // Coord of downward equivalent surface
  788. Real_t c[3]={0,0,0};
  789. std::vector<Real_t> up_equiv_surf=u_equiv_surf(MultipoleOrder(),c,0);
  790. std::vector<Real_t> dn_equiv_surf=d_equiv_surf(MultipoleOrder(),c,0);
  791. std::vector<Real_t> dn_check_surf=d_check_surf(MultipoleOrder(),c,0);
  792. Matrix<Real_t> M_err;
  793. { // Evaluate potential at corner due to upward and dnward equivalent surface.
  794. { // Error from local expansion.
  795. Matrix<Real_t> M_e2pt(n_surf*ker_dim[0],n_corner*ker_dim[1]);
  796. kernel->k_m2l->BuildMatrix(&dn_equiv_surf[0], n_surf,
  797. &corner_pts[0], n_corner, &(M_e2pt[0][0]));
  798. M_err=M*M_e2pt;
  799. }
  800. for(size_t k=0;k<4;k++){ // Error from colleagues of root.
  801. for(int j0=-1;j0<=1;j0++)
  802. for(int j1=-1;j1<=1;j1++)
  803. for(int j2=-1;j2<=1;j2++){
  804. Real_t pt_coord[3]={corner_pts[k*COORD_DIM+0]-j0,
  805. corner_pts[k*COORD_DIM+1]-j1,
  806. corner_pts[k*COORD_DIM+2]-j2};
  807. if(fabs(pt_coord[0]-0.5)>1.0 || fabs(pt_coord[1]-0.5)>1.0 || fabs(pt_coord[2]-0.5)>1.0){
  808. Matrix<Real_t> M_e2pt(n_surf*ker_dim[0],ker_dim[1]);
  809. kernel->k_m2l->BuildMatrix(&up_equiv_surf[0], n_surf,
  810. &pt_coord[0], 1, &(M_e2pt[0][0]));
  811. for(size_t i=0;i<M_e2pt.Dim(0);i++)
  812. for(size_t j=0;j<M_e2pt.Dim(1);j++)
  813. M_err[i][k*ker_dim[1]+j]+=M_e2pt[i][j];
  814. }
  815. }
  816. }
  817. }
  818. Matrix<Real_t> M_grad(M_err.Dim(0),n_surf*ker_dim[1]);
  819. for(size_t i=0;i<M_err.Dim(0);i++)
  820. for(size_t k=0;k<ker_dim[1];k++)
  821. for(size_t j=0;j<n_surf;j++){
  822. M_grad[i][j*ker_dim[1]+k]=(M_err[i][0*ker_dim[1]+k] )*1.0 +
  823. (M_err[i][1*ker_dim[1]+k]-M_err[i][0*ker_dim[1]+k])*dn_check_surf[j*COORD_DIM+0]+
  824. (M_err[i][2*ker_dim[1]+k]-M_err[i][0*ker_dim[1]+k])*dn_check_surf[j*COORD_DIM+1]+
  825. (M_err[i][3*ker_dim[1]+k]-M_err[i][0*ker_dim[1]+k])*dn_check_surf[j*COORD_DIM+2];
  826. }
  827. Matrix<Real_t>& M_dc2de = Precomp(0, DC2DE_Type, 0);
  828. M-=M_grad*M_dc2de;
  829. }
  830. { // Free memory
  831. Mat_Type type=D2D_Type;
  832. for(int l=-BC_LEVELS;l<0;l++)
  833. for(size_t indx=0;indx<this->interac_list.ListCount(type);indx++){
  834. Matrix<Real_t>& M=this->mat->Mat(l, type, indx);
  835. M.Resize(0,0);
  836. }
  837. type=U2U_Type;
  838. for(int l=-BC_LEVELS;l<0;l++)
  839. for(size_t indx=0;indx<this->interac_list.ListCount(type);indx++){
  840. Matrix<Real_t>& M=this->mat->Mat(l, type, indx);
  841. M.Resize(0,0);
  842. }
  843. type=DC2DE_Type;
  844. for(int l=-BC_LEVELS;l<0;l++)
  845. for(size_t indx=0;indx<this->interac_list.ListCount(type);indx++){
  846. Matrix<Real_t>& M=this->mat->Mat(l, type, indx);
  847. M.Resize(0,0);
  848. }
  849. type=UC2UE_Type;
  850. for(int l=-BC_LEVELS;l<0;l++)
  851. for(size_t indx=0;indx<this->interac_list.ListCount(type);indx++){
  852. Matrix<Real_t>& M=this->mat->Mat(l, type, indx);
  853. M.Resize(0,0);
  854. }
  855. }
  856. }
  857. break;
  858. }
  859. default:
  860. break;
  861. }
  862. //Save the matrix for future use.
  863. #pragma omp critical (PRECOMP_MATRIX_PTS)
  864. if(M_.Dim(0)==0 && M_.Dim(1)==0){
  865. M_=M;
  866. /*
  867. M_.Resize(M.Dim(0),M.Dim(1));
  868. int dof=ker_dim[0]*ker_dim[1];
  869. for(int j=0;j<dof;j++){
  870. size_t a=(M.Dim(0)*M.Dim(1)* j )/dof;
  871. size_t b=(M.Dim(0)*M.Dim(1)*(j+1))/dof;
  872. #pragma omp parallel for // NUMA
  873. for(int tid=0;tid<omp_p;tid++){
  874. size_t a_=a+((b-a)* tid )/omp_p;
  875. size_t b_=a+((b-a)*(tid+1))/omp_p;
  876. mem::memcopy(&M_[0][a_], &M[0][a_], (b_-a_)*sizeof(Real_t));
  877. }
  878. }
  879. */
  880. }
  881. return M_;
  882. }
  883. template <class FMMNode>
  884. void FMM_Pts<FMMNode>::PrecompAll(Mat_Type type, int level){
  885. if(level==-1){
  886. for(int l=0;l<MAX_DEPTH;l++){
  887. PrecompAll(type, l);
  888. }
  889. return;
  890. }
  891. //Compute basic permutations.
  892. for(size_t i=0;i<Perm_Count;i++)
  893. this->PrecompPerm(type, (Perm_Type) i);
  894. {
  895. //Allocate matrices.
  896. size_t mat_cnt=interac_list.ListCount((Mat_Type)type);
  897. mat->Mat(level, (Mat_Type)type, mat_cnt-1);
  898. { // Compute InteracClass matrices.
  899. std::vector<size_t> indx_lst;
  900. for(size_t i=0; i<mat_cnt; i++){
  901. if(interac_list.InteracClass((Mat_Type)type,i)==i)
  902. indx_lst.push_back(i);
  903. }
  904. //Compute Transformations.
  905. //#pragma omp parallel for //lets use fine grained parallelism
  906. for(size_t i=0; i<indx_lst.size(); i++){
  907. Precomp(level, (Mat_Type)type, indx_lst[i]);
  908. }
  909. }
  910. //#pragma omp parallel for //lets use fine grained parallelism
  911. for(size_t mat_indx=0;mat_indx<mat_cnt;mat_indx++){
  912. Matrix<Real_t>& M0=interac_list.ClassMat(level,(Mat_Type)type,mat_indx);
  913. Permutation<Real_t>& pr=interac_list.Perm_R(level, (Mat_Type)type, mat_indx);
  914. Permutation<Real_t>& pc=interac_list.Perm_C(level, (Mat_Type)type, mat_indx);
  915. if(pr.Dim()!=M0.Dim(0) || pc.Dim()!=M0.Dim(1)) Precomp(level, (Mat_Type)type, mat_indx);
  916. }
  917. }
  918. }
template <class FMMNode>
// Gather per-node data vectors into contiguous buffers.
// For each of 7 data categories (upward_equiv, dnward_equiv, upward_equiv_fft,
// dnward_check_fft, src_val, trg_val, pts_coord) this builds the list of
// participating nodes (n_list) and the list of their data vectors (vec_list),
// then repacks those vectors back-to-back inside a single Matrix buffer
// (buff_list) so later kernels can operate on contiguous memory.
// NOTE(review): vec_list is taken by value (unlike buff_list/n_list); the
// repacking still takes effect through the Vector pointers it holds, but the
// caller does not see the populated lists — confirm this is intentional.
void FMM_Pts<FMMNode>::CollectNodeData(std::vector<FMMNode*>& node, std::vector<Matrix<Real_t> >& buff_list, std::vector<Vector<FMMNode_t*> >& n_list, std::vector<std::vector<Vector<Real_t>* > > vec_list){
  // Ensure one slot per data category.
  if(buff_list.size()<7) buff_list.resize(7);
  if(   n_list.size()<7)    n_list.resize(7);
  if( vec_list.size()<7)  vec_list.resize(7);
  int omp_p=omp_get_max_threads();
  if(node.size()==0) return;
  {// 0. upward_equiv
    int indx=0;
    size_t vec_sz;
    { // Set vec_sz from the column count of the UC2UE class matrix
      // (number of upward-equivalent coefficients per node).
      Matrix<Real_t>& M_uc2ue = this->interac_list.ClassMat(0, UC2UE_Type, 0);
      vec_sz=M_uc2ue.Dim(1);
    }
    std::vector< FMMNode* > node_lst;
    {// Construct node_lst: children of all non-leaf nodes, ordered by depth,
     // followed by the root node (if present).
      node_lst.clear();
      std::vector<std::vector< FMMNode* > > node_lst_(MAX_DEPTH+1);
      FMMNode_t* r_node=NULL;
      for(size_t i=0;i<node.size();i++){
        if(!node[i]->IsLeaf())
          node_lst_[node[i]->Depth()].push_back(node[i]);
        if(node[i]->Depth()==0) r_node=node[i];
      }
      size_t chld_cnt=1UL<<COORD_DIM; // 8 children per node in 3D.
      for(int i=0;i<=MAX_DEPTH;i++){
        for(size_t j=0;j<node_lst_[i].size();j++){
          for(size_t k=0;k<chld_cnt;k++){
            // Local 'node' intentionally shadows the parameter here.
            FMMNode_t* node=(FMMNode_t*)node_lst_[i][j]->Child(k);
            node_lst.push_back(node);
          }
        }
      }
      if(r_node!=NULL) node_lst.push_back(r_node);
      n_list[indx]=node_lst;
    }
    std::vector<Vector<Real_t>*>& vec_lst=vec_list[indx];
    for(size_t i=0;i<node_lst.size();i++){ // Construct vec_lst
      FMMNode_t* node=node_lst[i];
      Vector<Real_t>& data_vec=node->FMMData()->upward_equiv;
      if(data_vec.Dim()!=vec_sz) data_vec.ReInit(vec_sz);
      vec_lst.push_back(&data_vec);
    }
  }
  {// 1. dnward_equiv  (mirrors category 0, using DC2DE_Type / dnward_equiv)
    int indx=1;
    size_t vec_sz;
    { // Set vec_sz (downward-equivalent coefficients per node).
      Matrix<Real_t>& M_dc2de = this->interac_list.ClassMat(0, DC2DE_Type, 0);
      vec_sz=M_dc2de.Dim(1);
    }
    std::vector< FMMNode* > node_lst;
    {// Construct node_lst (same ordering as category 0).
      node_lst.clear();
      std::vector<std::vector< FMMNode* > > node_lst_(MAX_DEPTH+1);
      FMMNode_t* r_node=NULL;
      for(size_t i=0;i<node.size();i++){
        if(!node[i]->IsLeaf())
          node_lst_[node[i]->Depth()].push_back(node[i]);
        if(node[i]->Depth()==0) r_node=node[i];
      }
      size_t chld_cnt=1UL<<COORD_DIM;
      for(int i=0;i<=MAX_DEPTH;i++){
        for(size_t j=0;j<node_lst_[i].size();j++){
          for(size_t k=0;k<chld_cnt;k++){
            FMMNode_t* node=(FMMNode_t*)node_lst_[i][j]->Child(k);
            node_lst.push_back(node);
          }
        }
      }
      if(r_node!=NULL) node_lst.push_back(r_node);
      n_list[indx]=node_lst;
    }
    std::vector<Vector<Real_t>*>& vec_lst=vec_list[indx];
    for(size_t i=0;i<node_lst.size();i++){ // Construct vec_lst
      FMMNode_t* node=node_lst[i];
      Vector<Real_t>& data_vec=node->FMMData()->dnward_equiv;
      if(data_vec.Dim()!=vec_sz) data_vec.ReInit(vec_sz);
      vec_lst.push_back(&data_vec);
    }
  }
  {// 2. upward_equiv_fft: non-leaf nodes ordered by depth (node list only;
   //    no per-node vectors are registered for this category).
    int indx=2;
    std::vector< FMMNode* > node_lst;
    {
      std::vector<std::vector< FMMNode* > > node_lst_(MAX_DEPTH+1);
      for(size_t i=0;i<node.size();i++)
        if(!node[i]->IsLeaf())
          node_lst_[node[i]->Depth()].push_back(node[i]);
      for(int i=0;i<=MAX_DEPTH;i++)
        for(size_t j=0;j<node_lst_[i].size();j++)
          node_lst.push_back(node_lst_[i][j]);
    }
    n_list[indx]=node_lst;
  }
  {// 3. dnward_check_fft: like category 2, but ghost nodes are excluded.
    int indx=3;
    std::vector< FMMNode* > node_lst;
    {
      std::vector<std::vector< FMMNode* > > node_lst_(MAX_DEPTH+1);
      for(size_t i=0;i<node.size();i++)
        if(!node[i]->IsLeaf() && !node[i]->IsGhost())
          node_lst_[node[i]->Depth()].push_back(node[i]);
      for(int i=0;i<=MAX_DEPTH;i++)
        for(size_t j=0;j<node_lst_[i].size();j++)
          node_lst.push_back(node_lst_[i][j]);
    }
    n_list[indx]=node_lst;
  }
  {// 4. src_val: source and surface densities of all leaf nodes.
    int indx=4;
    int src_dof=kernel->ker_dim[0];
    // Surface densities carry COORD_DIM normal components + src_dof values.
    int surf_dof=COORD_DIM+src_dof;
    std::vector< FMMNode* > node_lst;
    for(size_t i=0;i<node.size();i++){// Construct node_lst
      if(node[i]->IsLeaf()){
        node_lst.push_back(node[i]);
      }
    }
    n_list[indx]=node_lst;
    std::vector<Vector<Real_t>*>& vec_lst=vec_list[indx];
    for(size_t i=0;i<node_lst.size();i++){ // Construct vec_lst
      FMMNode_t* node=node_lst[i];
      { // src_value: sized from point count times source degrees of freedom.
        Vector<Real_t>& data_vec=node->src_value;
        size_t vec_sz=(node->src_coord.Dim()/COORD_DIM)*src_dof;
        if(data_vec.Dim()!=vec_sz) data_vec.ReInit(vec_sz);
        vec_lst.push_back(&data_vec);
      }
      { // surf_value
        Vector<Real_t>& data_vec=node->surf_value;
        size_t vec_sz=(node->surf_coord.Dim()/COORD_DIM)*surf_dof;
        if(data_vec.Dim()!=vec_sz) data_vec.ReInit(vec_sz);
        vec_lst.push_back(&data_vec);
      }
    }
  }
  {// 5. trg_val: target potentials of non-ghost leaf nodes.
    int indx=5;
    int trg_dof=kernel->ker_dim[1];
    std::vector< FMMNode* > node_lst;
    for(size_t i=0;i<node.size();i++){// Construct node_lst
      if(node[i]->IsLeaf() && !node[i]->IsGhost()){
        node_lst.push_back(node[i]);
      }
    }
    n_list[indx]=node_lst;
    std::vector<Vector<Real_t>*>& vec_lst=vec_list[indx];
    for(size_t i=0;i<node_lst.size();i++){ // Construct vec_lst
      FMMNode_t* node=node_lst[i];
      { // trg_value
        Vector<Real_t>& data_vec=node->trg_value;
        size_t vec_sz=(node->trg_coord.Dim()/COORD_DIM)*trg_dof;
        if(data_vec.Dim()!=vec_sz) data_vec.ReInit(vec_sz);
        vec_lst.push_back(&data_vec);
      }
    }
  }
  {// 6. pts_coord: point coordinates of all leaf nodes, plus the
   //    precomputed check/equivalent surfaces for every depth.
    int indx=6;
    std::vector< FMMNode* > node_lst;
    for(size_t i=0;i<node.size();i++){// Construct node_lst
      if(node[i]->IsLeaf()){
        node_lst.push_back(node[i]);
      }
    }
    n_list[indx]=node_lst;
    std::vector<Vector<Real_t>*>& vec_lst=vec_list[indx];
    for(size_t i=0;i<node_lst.size();i++){ // Construct vec_lst
      FMMNode_t* node=node_lst[i];
      { // src_coord
        Vector<Real_t>& data_vec=node->src_coord;
        vec_lst.push_back(&data_vec);
      }
      { // surf_coord
        Vector<Real_t>& data_vec=node->surf_coord;
        vec_lst.push_back(&data_vec);
      }
      { // trg_coord
        Vector<Real_t>& data_vec=node->trg_coord;
        vec_lst.push_back(&data_vec);
      }
    }
    { // check and equiv surfaces.
      // Built lazily once (6*(m-1)^2+2 points per surface) and then
      // registered for relocation into the shared buffer like node data.
      if(upwd_check_surf.size()==0){
        size_t m=MultipoleOrder();
        upwd_check_surf.resize(MAX_DEPTH);
        upwd_equiv_surf.resize(MAX_DEPTH);
        dnwd_check_surf.resize(MAX_DEPTH);
        dnwd_equiv_surf.resize(MAX_DEPTH);
        for(size_t depth=0;depth<MAX_DEPTH;depth++){
          Real_t c[3]={0.0,0.0,0.0};
          upwd_check_surf[depth].ReInit((6*(m-1)*(m-1)+2)*COORD_DIM);
          upwd_equiv_surf[depth].ReInit((6*(m-1)*(m-1)+2)*COORD_DIM);
          dnwd_check_surf[depth].ReInit((6*(m-1)*(m-1)+2)*COORD_DIM);
          dnwd_equiv_surf[depth].ReInit((6*(m-1)*(m-1)+2)*COORD_DIM);
          upwd_check_surf[depth]=u_check_surf(m,c,depth);
          upwd_equiv_surf[depth]=u_equiv_surf(m,c,depth);
          dnwd_check_surf[depth]=d_check_surf(m,c,depth);
          dnwd_equiv_surf[depth]=d_equiv_surf(m,c,depth);
        }
      }
      for(size_t depth=0;depth<MAX_DEPTH;depth++){
        vec_lst.push_back(&upwd_check_surf[depth]);
        vec_lst.push_back(&upwd_equiv_surf[depth]);
        vec_lst.push_back(&dnwd_check_surf[depth]);
        vec_lst.push_back(&dnwd_equiv_surf[depth]);
      }
    }
  }
  // Create extra auxiliary buffer.
  if(buff_list.size()<=vec_list.size()) buff_list.resize(vec_list.size()+1);
  for(size_t indx=0;indx<vec_list.size();indx++){ // Resize buffer
    // aux_buff (the extra slot) is scratch space used to preserve data
    // while the main buffer is reallocated.
    Matrix<Real_t>& aux_buff=buff_list[vec_list.size()];
    Matrix<Real_t>& buff=buff_list[indx];
    std::vector<Vector<Real_t>*>& vec_lst= vec_list[indx];
    // src_val (4) and pts_coord (6) hold user data that must survive
    // the repacking; the other categories can be overwritten.
    bool keep_data=(indx==4 || indx==6);
    size_t n_vec=vec_lst.size();
    { // Continue if nothing to be done.
      if(!n_vec) continue;
      if(buff.Dim(0)*buff.Dim(1)>0){
        // If every vector already points inside this buffer, the layout is
        // up to date and no repacking is needed.
        bool init_buff=false;
        Real_t* buff_start=&buff[0][0];
        Real_t* buff_end=&buff[0][0]+buff.Dim(0)*buff.Dim(1);
        #pragma omp parallel for reduction(||:init_buff)
        for(size_t i=0;i<n_vec;i++){
          if(&(*vec_lst[i])[0]<buff_start || &(*vec_lst[i])[0]>=buff_end){
            init_buff=true;
          }
        }
        if(!init_buff) continue;
      }
    }
    std::vector<size_t> vec_size(n_vec);
    std::vector<size_t> vec_disp(n_vec);
    if(n_vec){ // Set vec_size and vec_disp
      #pragma omp parallel for
      for(size_t i=0;i<n_vec;i++){ // Set vec_size
        vec_size[i]=vec_lst[i]->Dim();
      }
      // Exclusive prefix sum gives each vector's offset in the buffer.
      vec_disp[0]=0;
      omp_par::scan(&vec_size[0],&vec_disp[0],n_vec);
    }
    size_t buff_size=vec_size[n_vec-1]+vec_disp[n_vec-1];
    if(keep_data){ // Copy to aux_buff
      if(aux_buff.Dim(0)*aux_buff.Dim(1)<buff_size){ // Resize aux_buff
        // 5% head-room to reduce future reallocations.
        aux_buff.ReInit(1,buff_size*1.05);
      }
      #pragma omp parallel for schedule(dynamic)
      for(size_t i=0;i<n_vec;i++){
        mem::memcopy(&aux_buff[0][0]+vec_disp[i],&(*vec_lst[i])[0],vec_size[i]*sizeof(Real_t));
      }
    }
    if(buff.Dim(0)*buff.Dim(1)<buff_size){ // Resize buff
      buff.ReInit(1,buff_size*1.05);
    }
    if(keep_data){ // Copy to buff (from aux_buff)
      #pragma omp parallel for
      for(size_t tid=0;tid<omp_p;tid++){
        size_t a=(buff_size*(tid+0))/omp_p;
        size_t b=(buff_size*(tid+1))/omp_p;
        mem::memcopy(&buff[0][0]+a,&aux_buff[0][0]+a,(b-a)*sizeof(Real_t));
      }
    }
    #pragma omp parallel for
    for(size_t i=0;i<n_vec;i++){ // ReInit vectors
      // Re-point each vector at its slot inside buff; 'false' means the
      // vector does not own (and will not free) this memory.
      vec_lst[i]->ReInit(vec_size[i],&buff[0][0]+vec_disp[i],false);
    }
  }
}
  1189. template <class FMMNode>
  1190. void FMM_Pts<FMMNode>::SetupPrecomp(SetupData<Real_t>& setup_data, bool device){
  1191. if(setup_data.precomp_data==NULL || setup_data.level>MAX_DEPTH) return;
  1192. Profile::Tic("SetupPrecomp",&this->comm,true,25);
  1193. { // Build precomp_data
  1194. size_t precomp_offset=0;
  1195. int level=setup_data.level;
  1196. Matrix<char>& precomp_data=*setup_data.precomp_data;
  1197. std::vector<Mat_Type>& interac_type_lst=setup_data.interac_type;
  1198. for(size_t type_indx=0; type_indx<interac_type_lst.size(); type_indx++){
  1199. Mat_Type& interac_type=interac_type_lst[type_indx];
  1200. this->PrecompAll(interac_type, level); // Compute matrices.
  1201. precomp_offset=this->mat->CompactData(level, interac_type, precomp_data, precomp_offset);
  1202. }
  1203. }
  1204. Profile::Toc();
  1205. if(device){ // Host2Device
  1206. Profile::Tic("Host2Device",&this->comm,false,25);
  1207. setup_data.precomp_data->AllocDevice(true);
  1208. Profile::Toc();
  1209. }
  1210. }
  1211. template <class FMMNode>
  1212. void FMM_Pts<FMMNode>::SetupInterac(SetupData<Real_t>& setup_data, bool device){
  1213. int level=setup_data.level;
  1214. std::vector<Mat_Type>& interac_type_lst=setup_data.interac_type;
  1215. std::vector<void*>& nodes_in =setup_data.nodes_in ;
  1216. std::vector<void*>& nodes_out=setup_data.nodes_out;
  1217. Matrix<Real_t>& input_data=*setup_data. input_data;
  1218. Matrix<Real_t>& output_data=*setup_data.output_data;
  1219. std::vector<Vector<Real_t>*>& input_vector=setup_data. input_vector;
  1220. std::vector<Vector<Real_t>*>& output_vector=setup_data.output_vector;
  1221. size_t n_in =nodes_in .size();
  1222. size_t n_out=nodes_out.size();
  1223. // Setup precomputed data.
  1224. if(setup_data.precomp_data->Dim(0)*setup_data.precomp_data->Dim(1)==0) SetupPrecomp(setup_data,device);
  1225. // Build interac_data
  1226. Profile::Tic("Interac-Data",&this->comm,true,25);
  1227. Matrix<char>& interac_data=setup_data.interac_data;
  1228. if(n_out>0 && n_in >0){ // Build precomp_data, interac_data
  1229. std::vector<size_t> interac_mat;
  1230. std::vector<size_t> interac_cnt;
  1231. std::vector<size_t> interac_blk;
  1232. std::vector<size_t> input_perm;
  1233. std::vector<size_t> output_perm;
  1234. size_t dof=0, M_dim0=0, M_dim1=0;
  1235. size_t precomp_offset=0;
  1236. size_t buff_size=DEVICE_BUFFER_SIZE*1024l*1024l;
  1237. for(size_t type_indx=0; type_indx<interac_type_lst.size(); type_indx++){
  1238. Mat_Type& interac_type=interac_type_lst[type_indx];
  1239. size_t mat_cnt=this->interac_list.ListCount(interac_type);
  1240. Matrix<size_t> precomp_data_offset;
  1241. { // Load precomp_data for interac_type.
  1242. struct HeaderData{
  1243. size_t total_size;
  1244. size_t level;
  1245. size_t mat_cnt ;
  1246. size_t max_depth;
  1247. };
  1248. Matrix<char>& precomp_data=*setup_data.precomp_data;
  1249. char* indx_ptr=precomp_data[0]+precomp_offset;
  1250. HeaderData& header=*(HeaderData*)indx_ptr;indx_ptr+=sizeof(HeaderData);
  1251. precomp_data_offset.ReInit(header.mat_cnt,(1+(2+2)*header.max_depth), (size_t*)indx_ptr, false);
  1252. precomp_offset+=header.total_size;
  1253. }
  1254. Matrix<FMMNode*> src_interac_list(n_in ,mat_cnt); src_interac_list.SetZero();
  1255. Matrix<FMMNode*> trg_interac_list(n_out,mat_cnt); trg_interac_list.SetZero();
  1256. { // Build trg_interac_list
  1257. #pragma omp parallel for
  1258. for(size_t i=0;i<n_out;i++){
  1259. if(!((FMMNode*)nodes_out[i])->IsGhost() && (level==-1 || ((FMMNode*)nodes_out[i])->Depth()==level)){
  1260. std::vector<FMMNode*>& lst=((FMMNode*)nodes_out[i])->interac_list[interac_type];
  1261. mem::memcopy(&trg_interac_list[i][0], &lst[0], lst.size()*sizeof(FMMNode*));
  1262. assert(lst.size()==mat_cnt);
  1263. }
  1264. }
  1265. }
  1266. { // Build src_interac_list
  1267. #pragma omp parallel for
  1268. for(size_t i=0;i<n_in ;i++) ((FMMNode*)nodes_in [i])->node_id=i;
  1269. #pragma omp parallel for
  1270. for(size_t i=0;i<n_out;i++){
  1271. for(size_t j=0;j<mat_cnt;j++)
  1272. if(trg_interac_list[i][j]!=NULL){
  1273. src_interac_list[trg_interac_list[i][j]->node_id][j]=(FMMNode*)nodes_out[i];
  1274. }
  1275. }
  1276. }
  1277. Matrix<size_t> interac_dsp(n_out,mat_cnt);
  1278. std::vector<size_t> interac_blk_dsp(1,0);
  1279. { // Determine dof, M_dim0, M_dim1
  1280. dof=1;
  1281. Matrix<Real_t>& M0 = this->interac_list.ClassMat(level, interac_type_lst[0], 0);
  1282. M_dim0=M0.Dim(0); M_dim1=M0.Dim(1);
  1283. }
  1284. { // Determine interaction blocks which fit in memory.
  1285. size_t vec_size=(M_dim0+M_dim1)*sizeof(Real_t)*dof;
  1286. for(size_t j=0;j<mat_cnt;j++){// Determine minimum buff_size
  1287. size_t vec_cnt=0;
  1288. for(size_t i=0;i<n_out;i++){
  1289. if(trg_interac_list[i][j]!=NULL) vec_cnt++;
  1290. }
  1291. if(buff_size<vec_cnt*vec_size)
  1292. buff_size=vec_cnt*vec_size;
  1293. }
  1294. size_t interac_dsp_=0;
  1295. for(size_t j=0;j<mat_cnt;j++){
  1296. for(size_t i=0;i<n_out;i++){
  1297. interac_dsp[i][j]=interac_dsp_;
  1298. if(trg_interac_list[i][j]!=NULL) interac_dsp_++;
  1299. }
  1300. if(interac_dsp_*vec_size>buff_size) // Comment to disable symmetries.
  1301. {
  1302. interac_blk.push_back(j-interac_blk_dsp.back());
  1303. interac_blk_dsp.push_back(j);
  1304. size_t offset=interac_dsp[0][j];
  1305. for(size_t i=0;i<n_out;i++) interac_dsp[i][j]-=offset;
  1306. interac_dsp_-=offset;
  1307. assert(interac_dsp_*vec_size<=buff_size); // Problem too big for buff_size.
  1308. }
  1309. interac_mat.push_back(precomp_data_offset[this->interac_list.InteracClass(interac_type,j)][0]);
  1310. interac_cnt.push_back(interac_dsp_-interac_dsp[0][j]);
  1311. }
  1312. interac_blk.push_back(mat_cnt-interac_blk_dsp.back());
  1313. interac_blk_dsp.push_back(mat_cnt);
  1314. }
  1315. { // Determine input_perm.
  1316. size_t vec_size=M_dim0*dof;
  1317. for(size_t i=0;i<n_out;i++) ((FMMNode*)nodes_out[i])->node_id=i;
  1318. for(size_t k=1;k<interac_blk_dsp.size();k++){
  1319. for(size_t i=0;i<n_in ;i++){
  1320. for(size_t j=interac_blk_dsp[k-1];j<interac_blk_dsp[k];j++){
  1321. FMMNode_t* trg_node=src_interac_list[i][j];
  1322. if(trg_node!=NULL){
  1323. size_t depth=(this->Homogen()?trg_node->Depth():0);
  1324. input_perm .push_back(precomp_data_offset[j][1+4*depth+0]); // prem
  1325. input_perm .push_back(precomp_data_offset[j][1+4*depth+1]); // scal
  1326. input_perm .push_back(interac_dsp[trg_node->node_id][j]*vec_size*sizeof(Real_t)); // trg_ptr
  1327. input_perm .push_back((size_t)(& input_vector[i][0][0]- input_data[0])); // src_ptr
  1328. assert(input_vector[i]->Dim()==vec_size);
  1329. }
  1330. }
  1331. }
  1332. }
  1333. }
  1334. { // Determine output_perm
  1335. size_t vec_size=M_dim1*dof;
  1336. for(size_t k=1;k<interac_blk_dsp.size();k++){
  1337. for(size_t i=0;i<n_out;i++){
  1338. for(size_t j=interac_blk_dsp[k-1];j<interac_blk_dsp[k];j++){
  1339. if(trg_interac_list[i][j]!=NULL){
  1340. size_t depth=(this->Homogen()?((FMMNode*)nodes_out[i])->Depth():0);
  1341. output_perm.push_back(precomp_data_offset[j][1+4*depth+2]); // prem
  1342. output_perm.push_back(precomp_data_offset[j][1+4*depth+3]); // scal
  1343. output_perm.push_back(interac_dsp[ i ][j]*vec_size*sizeof(Real_t)); // src_ptr
  1344. output_perm.push_back((size_t)(&output_vector[i][0][0]-output_data[0])); // trg_ptr
  1345. assert(output_vector[i]->Dim()==vec_size);
  1346. }
  1347. }
  1348. }
  1349. }
  1350. }
  1351. }
  1352. if(this->dev_buffer.Dim()<buff_size) this->dev_buffer.ReInit(buff_size);
  1353. if(this->cpu_buffer.Dim()<buff_size) this->cpu_buffer.ReInit(buff_size);
  1354. { // Set interac_data.
  1355. size_t data_size=sizeof(size_t)*4;
  1356. data_size+=sizeof(size_t)+interac_blk.size()*sizeof(size_t);
  1357. data_size+=sizeof(size_t)+interac_cnt.size()*sizeof(size_t);
  1358. data_size+=sizeof(size_t)+interac_mat.size()*sizeof(size_t);
  1359. data_size+=sizeof(size_t)+ input_perm.size()*sizeof(size_t);
  1360. data_size+=sizeof(size_t)+output_perm.size()*sizeof(size_t);
  1361. if(interac_data.Dim(0)*interac_data.Dim(1)<sizeof(size_t)){
  1362. data_size+=sizeof(size_t);
  1363. interac_data.ReInit(1,data_size);
  1364. ((size_t*)&interac_data[0][0])[0]=sizeof(size_t);
  1365. }else{
  1366. size_t pts_data_size=*((size_t*)&interac_data[0][0]);
  1367. assert(interac_data.Dim(0)*interac_data.Dim(1)>=pts_data_size);
  1368. data_size+=pts_data_size;
  1369. if(data_size>interac_data.Dim(0)*interac_data.Dim(1)){ //Resize and copy interac_data.
  1370. Matrix< char> pts_interac_data=interac_data;
  1371. interac_data.ReInit(1,data_size);
  1372. mem::memcopy(&interac_data[0][0],&pts_interac_data[0][0],pts_data_size);
  1373. }
  1374. }
  1375. char* data_ptr=&interac_data[0][0];
  1376. data_ptr+=((size_t*)data_ptr)[0];
  1377. ((size_t*)data_ptr)[0]=data_size; data_ptr+=sizeof(size_t);
  1378. ((size_t*)data_ptr)[0]= M_dim0; data_ptr+=sizeof(size_t);
  1379. ((size_t*)data_ptr)[0]= M_dim1; data_ptr+=sizeof(size_t);
  1380. ((size_t*)data_ptr)[0]= dof; data_ptr+=sizeof(size_t);
  1381. ((size_t*)data_ptr)[0]=interac_blk.size(); data_ptr+=sizeof(size_t);
  1382. mem::memcopy(data_ptr, &interac_blk[0], interac_blk.size()*sizeof(size_t));
  1383. data_ptr+=interac_blk.size()*sizeof(size_t);
  1384. ((size_t*)data_ptr)[0]=interac_cnt.size(); data_ptr+=sizeof(size_t);
  1385. mem::memcopy(data_ptr, &interac_cnt[0], interac_cnt.size()*sizeof(size_t));
  1386. data_ptr+=interac_cnt.size()*sizeof(size_t);
  1387. ((size_t*)data_ptr)[0]=interac_mat.size(); data_ptr+=sizeof(size_t);
  1388. mem::memcopy(data_ptr, &interac_mat[0], interac_mat.size()*sizeof(size_t));
  1389. data_ptr+=interac_mat.size()*sizeof(size_t);
  1390. ((size_t*)data_ptr)[0]= input_perm.size(); data_ptr+=sizeof(size_t);
  1391. mem::memcopy(data_ptr, & input_perm[0], input_perm.size()*sizeof(size_t));
  1392. data_ptr+= input_perm.size()*sizeof(size_t);
  1393. ((size_t*)data_ptr)[0]=output_perm.size(); data_ptr+=sizeof(size_t);
  1394. mem::memcopy(data_ptr, &output_perm[0], output_perm.size()*sizeof(size_t));
  1395. data_ptr+=output_perm.size()*sizeof(size_t);
  1396. }
  1397. }
  1398. Profile::Toc();
  1399. if(device){ // Host2Device
  1400. Profile::Tic("Host2Device",&this->comm,false,25);
  1401. setup_data.interac_data .AllocDevice(true);
  1402. Profile::Toc();
  1403. }
  1404. }
#if defined(PVFMM_HAVE_CUDA)
#include <fmm_pts_gpu.hpp>
/**
 * CUDA path for EvalList: applies one precomputed-matrix interaction list
 * entirely on the device, in three phases per interaction block:
 *   (1) gather/permute/scale inputs into a device buffer (in_perm_gpu),
 *   (2) batched dense GEMMs against the precomputed matrices (cuBLAS),
 *   (3) scatter/permute/scale results into the output data (out_perm_gpu).
 * The schedule (block sizes, counts, matrix offsets, permutation tuples) is
 * read from the serialized byte stream in setup_data.interac_data, which was
 * written by the corresponding setup routine.
 *
 * @param setup_data  interaction schedule plus host/device data matrices.
 * @param dev_buffer  scratch buffer; its device allocation holds the
 *                    permuted input and output vectors for one block.
 * @param comm        communicator used only for profiling labels.
 */
template <class Real_t, int SYNC>
void EvalListGPU(SetupData<Real_t>& setup_data, Vector<char>& dev_buffer, MPI_Comm& comm) {
cudaStream_t* stream = pvfmm::CUDA_Lock::acquire_stream();
Profile::Tic("Host2Device",&comm,false,25);
typename Matrix<char>::Device interac_data;
typename Vector<char>::Device buff;
typename Matrix<char>::Device precomp_data_d;
typename Matrix<char>::Device interac_data_d;
typename Matrix<Real_t>::Device input_data_d;
typename Matrix<Real_t>::Device output_data_d;
// interac_data is read on the HOST (schedule); the *_d handles are the
// device-side mirrors used for the actual computation.
interac_data = setup_data.interac_data;
buff = dev_buffer. AllocDevice(false);
precomp_data_d= setup_data.precomp_data->AllocDevice(false);
interac_data_d= setup_data.interac_data. AllocDevice(false);
input_data_d = setup_data. input_data->AllocDevice(false);
output_data_d = setup_data. output_data->AllocDevice(false);
Profile::Toc();
Profile::Tic("DeviceComp",&comm,false,20);
{ // Offloaded computation.
size_t data_size, M_dim0, M_dim1, dof;
Vector<size_t> interac_blk;
Vector<size_t> interac_cnt;
Vector<size_t> interac_mat;
Vector<size_t> input_perm_d;
Vector<size_t> output_perm_d;
{ // Set interac_data.
// Walk the serialized stream with two cursors kept in lockstep: data_ptr
// over the host copy (to read sizes/counts) and dev_ptr over the device
// copy (so the permutation arrays can point at device memory).
char* data_ptr=&interac_data [0][0];
char* dev_ptr=&interac_data_d[0][0];
// First word is the size of a leading section to skip over.
data_size=((size_t*)data_ptr)[0]; data_ptr+=data_size; dev_ptr += data_size;
data_size=((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t); dev_ptr += sizeof(size_t);
M_dim0 =((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t); dev_ptr += sizeof(size_t);
M_dim1 =((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t); dev_ptr += sizeof(size_t);
dof =((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t); dev_ptr += sizeof(size_t);
// Each array is stored as [count][count elements]. interac_blk/cnt/mat are
// consumed on the host, so they view data_ptr; the two permutation arrays
// are consumed by the GPU kernels, so they view dev_ptr.
interac_blk.ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false);
data_ptr += sizeof(size_t) + sizeof(size_t)*interac_blk.Dim();
dev_ptr += sizeof(size_t) + sizeof(size_t)*interac_blk.Dim();
interac_cnt.ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false);
data_ptr += sizeof(size_t) + sizeof(size_t)*interac_cnt.Dim();
dev_ptr += sizeof(size_t) + sizeof(size_t)*interac_cnt.Dim();
interac_mat.ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false);
data_ptr += sizeof(size_t) + sizeof(size_t)*interac_mat.Dim();
dev_ptr += sizeof(size_t) + sizeof(size_t)*interac_mat.Dim();
input_perm_d.ReInit(((size_t*)data_ptr)[0],(size_t*)(dev_ptr+sizeof(size_t)),false);
data_ptr += sizeof(size_t) + sizeof(size_t)*input_perm_d.Dim();
dev_ptr += sizeof(size_t) + sizeof(size_t)*input_perm_d.Dim();
output_perm_d.ReInit(((size_t*)data_ptr)[0],(size_t*)(dev_ptr+sizeof(size_t)),false);
data_ptr += sizeof(size_t) + sizeof(size_t)*output_perm_d.Dim();
dev_ptr += sizeof(size_t) + sizeof(size_t)*output_perm_d.Dim();
}
{ // interactions
size_t interac_indx = 0;
size_t interac_blk_dsp = 0;
cudaError_t error;
for (size_t k = 0; k < interac_blk.Dim(); k++) {
// Total number of interaction vectors in this block.
size_t vec_cnt=0;
for(size_t j=interac_blk_dsp;j<interac_blk_dsp+interac_blk[k];j++) vec_cnt+=interac_cnt[j];
if(vec_cnt==0){
//interac_indx += vec_cnt;
interac_blk_dsp += interac_blk[k];
continue;
}
// Scratch layout: inputs first, outputs after vec_cnt input vectors.
char *buff_in_d =&buff[0];
char *buff_out_d =&buff[vec_cnt*dof*M_dim0*sizeof(Real_t)];
{ // Input permutation.
// Each interaction contributes a 4-tuple (perm, scal, buffer offset,
// data offset) — same layout as the CPU path's input_perm.
in_perm_gpu<Real_t>(&precomp_data_d[0][0], &input_data_d[0][0], buff_in_d,
&input_perm_d[interac_indx*4], vec_cnt, M_dim0, stream);
}
size_t vec_cnt0 = 0;
// Group consecutive interactions that share a precomputed matrix into
// one batched GEMM.
for (size_t j = interac_blk_dsp; j < interac_blk_dsp + interac_blk[k];) {
size_t vec_cnt1 = 0;
size_t interac_mat0 = interac_mat[j];
for (; j < interac_blk_dsp + interac_blk[k] && interac_mat[j] == interac_mat0; j++) vec_cnt1 += interac_cnt[j];
Matrix<Real_t> M_d(M_dim0, M_dim1, (Real_t*)(precomp_data_d.dev_ptr + interac_mat0), false);
Matrix<Real_t> Ms_d(dof*vec_cnt1, M_dim0, (Real_t*)(buff_in_d + M_dim0*vec_cnt0*dof*sizeof(Real_t)), false);
Matrix<Real_t> Mt_d(dof*vec_cnt1, M_dim1, (Real_t*)(buff_out_d + M_dim1*vec_cnt0*dof*sizeof(Real_t)), false);
Matrix<Real_t>::CUBLASGEMM(Mt_d, Ms_d, M_d);
vec_cnt0 += vec_cnt1;
}
{ // Output permutation.
out_perm_gpu<Real_t>(&precomp_data_d[0][0], &output_data_d[0][0], buff_out_d,
&output_perm_d[interac_indx*4], vec_cnt, M_dim1, stream);
}
interac_indx += vec_cnt;
interac_blk_dsp += interac_blk[k];
}
}
}
Profile::Toc();
if(SYNC) CUDA_Lock::wait();
}
#endif
/**
 * Evaluate one interaction list described by setup_data, on CPU or
 * (optionally) an accelerator. For each interaction block:
 *   (1) gather inputs via permutation+scaling into a contiguous buffer,
 *   (2) multiply by the shared precomputed matrices (GEMM),
 *   (3) scatter-accumulate results into the output data via a second
 *       permutation+scaling.
 * The schedule is decoded from the serialized stream in
 * setup_data.interac_data (written by the matching setup routine).
 *
 * @param setup_data  interaction schedule and input/output/precomp matrices.
 * @param device      if true, run on the accelerator (CUDA build dispatches
 *                    to EvalListGPU; Intel-offload build uses mic:0).
 */
template <class FMMNode>
template <int SYNC>
void FMM_Pts<FMMNode>::EvalList(SetupData<Real_t>& setup_data, bool device){
// Empty schedule: still emit the profiling sections so timers stay balanced.
if(setup_data.interac_data.Dim(0)==0 || setup_data.interac_data.Dim(1)==0){
Profile::Tic("Host2Device",&this->comm,false,25);
Profile::Toc();
Profile::Tic("DeviceComp",&this->comm,false,20);
Profile::Toc();
return;
}
#if defined(PVFMM_HAVE_CUDA)
if (device) {
EvalListGPU<Real_t, SYNC>(setup_data, this->dev_buffer, this->comm);
return;
}
#endif
Profile::Tic("Host2Device",&this->comm,false,25);
typename Vector<char>::Device buff;
typename Matrix<char>::Device precomp_data;
typename Matrix<char>::Device interac_data;
typename Matrix<Real_t>::Device input_data;
typename Matrix<Real_t>::Device output_data;
// Bind either device views or plain host views of the same data.
if(device){
buff = this-> dev_buffer. AllocDevice(false);
precomp_data= setup_data.precomp_data->AllocDevice(false);
interac_data= setup_data.interac_data. AllocDevice(false);
input_data = setup_data. input_data->AllocDevice(false);
output_data = setup_data. output_data->AllocDevice(false);
}else{
buff = this-> cpu_buffer;
precomp_data=*setup_data.precomp_data;
interac_data= setup_data.interac_data;
input_data =*setup_data. input_data;
output_data =*setup_data. output_data;
}
Profile::Toc();
Profile::Tic("DeviceComp",&this->comm,false,20);
// MIC_Lock tickets serialize offloaded sections: wait for the previous one,
// signal ours on completion.
int lock_idx=-1;
int wait_lock_idx=-1;
if(device) wait_lock_idx=MIC_Lock::curr_lock();
if(device) lock_idx=MIC_Lock::get_lock();
#ifdef __INTEL_OFFLOAD
#pragma offload if(device) target(mic:0) signal(&MIC_Lock::lock_vec[device?lock_idx:0])
#endif
{ // Offloaded computation.
// Set interac_data.
size_t data_size, M_dim0, M_dim1, dof;
Vector<size_t> interac_blk;
Vector<size_t> interac_cnt;
Vector<size_t> interac_mat;
Vector<size_t> input_perm;
Vector<size_t> output_perm;
{ // Set interac_data.
// Deserialize the schedule: skip the leading section (its size is the
// first word), then read header scalars and the five size_t arrays,
// each stored as [count][elements]. The vectors alias the stream
// (ReInit with own_data=false) — no copies are made.
char* data_ptr=&interac_data[0][0];
data_size=((size_t*)data_ptr)[0]; data_ptr+=data_size;
data_size=((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t);
M_dim0 =((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t);
M_dim1 =((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t);
dof =((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t);
interac_blk.ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false);
data_ptr+=sizeof(size_t)+interac_blk.Dim()*sizeof(size_t);
interac_cnt.ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false);
data_ptr+=sizeof(size_t)+interac_cnt.Dim()*sizeof(size_t);
interac_mat.ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false);
data_ptr+=sizeof(size_t)+interac_mat.Dim()*sizeof(size_t);
input_perm .ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false);
data_ptr+=sizeof(size_t)+ input_perm.Dim()*sizeof(size_t);
output_perm.ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false);
data_ptr+=sizeof(size_t)+output_perm.Dim()*sizeof(size_t);
}
if(device) MIC_Lock::wait_lock(wait_lock_idx);
//Compute interaction from Chebyshev source density.
{ // interactions
int omp_p=omp_get_max_threads();
size_t interac_indx=0;
size_t interac_blk_dsp=0;
for(size_t k=0;k<interac_blk.Dim();k++){
// Total number of interaction vectors in this block.
size_t vec_cnt=0;
for(size_t j=interac_blk_dsp;j<interac_blk_dsp+interac_blk[k];j++) vec_cnt+=interac_cnt[j];
if(vec_cnt==0){
//interac_indx += vec_cnt;
interac_blk_dsp += interac_blk[k];
continue;
}
// Scratch layout: gathered inputs first, GEMM outputs after them.
char* buff_in =&buff[0];
char* buff_out=&buff[vec_cnt*dof*M_dim0*sizeof(Real_t)];
// Input permutation.
#pragma omp parallel for
for(int tid=0;tid<omp_p;tid++){
// Static partition of the vec_cnt interactions across threads.
size_t a=( tid *vec_cnt)/omp_p;
size_t b=((tid+1)*vec_cnt)/omp_p;
for(size_t i=a;i<b;i++){
// input_perm stores 4 words per interaction:
// [0] permutation offset, [1] scaling offset (both into precomp_data),
// [2] destination offset in buff_in, [3] source offset in input_data.
const PERM_INT_T* perm=(PERM_INT_T*)(precomp_data[0]+input_perm[(interac_indx+i)*4+0]);
const Real_t* scal=( Real_t*)(precomp_data[0]+input_perm[(interac_indx+i)*4+1]);
const Real_t* v_in =( Real_t*)( input_data[0]+input_perm[(interac_indx+i)*4+3]);
Real_t* v_out=( Real_t*)( buff_in +input_perm[(interac_indx+i)*4+2]);
// TODO: Fix for dof>1
#ifdef __MIC__
{
// Vectorized gather: handle unaligned head/tail scalar-wise, do the
// MEM_ALIGN-aligned middle in 8-wide AVX-512 chunks with
// non-temporal stores.
__m512d v8;
size_t j_start=(((uintptr_t)(v_out ) + (uintptr_t)(MEM_ALIGN-1)) & ~ (uintptr_t)(MEM_ALIGN-1))-((uintptr_t)v_out);
size_t j_end =(((uintptr_t)(v_out+M_dim0) ) & ~ (uintptr_t)(MEM_ALIGN-1))-((uintptr_t)v_out);
j_start/=sizeof(Real_t);
j_end /=sizeof(Real_t);
assert(((uintptr_t)(v_out))%sizeof(Real_t)==0);
assert(((uintptr_t)(v_out+j_start))%64==0);
assert(((uintptr_t)(v_out+j_end ))%64==0);
size_t j=0;
for(;j<j_start;j++ ){
v_out[j]=v_in[perm[j]]*scal[j];
}
for(;j<j_end ;j+=8){
v8=_mm512_setr_pd(
v_in[perm[j+0]]*scal[j+0],
v_in[perm[j+1]]*scal[j+1],
v_in[perm[j+2]]*scal[j+2],
v_in[perm[j+3]]*scal[j+3],
v_in[perm[j+4]]*scal[j+4],
v_in[perm[j+5]]*scal[j+5],
v_in[perm[j+6]]*scal[j+6],
v_in[perm[j+7]]*scal[j+7]);
_mm512_storenrngo_pd(v_out+j,v8);
}
for(;j<M_dim0 ;j++ ){
v_out[j]=v_in[perm[j]]*scal[j];
}
}
#else
// Scalar gather: v_out[j] = v_in[perm[j]] * scal[j].
for(size_t j=0;j<M_dim0;j++ ){
v_out[j]=v_in[perm[j]]*scal[j];
}
#endif
}
}
// Batched GEMM: consecutive interactions sharing the same precomputed
// matrix offset are multiplied together in one call.
size_t vec_cnt0=0;
for(size_t j=interac_blk_dsp;j<interac_blk_dsp+interac_blk[k];){
size_t vec_cnt1=0;
size_t interac_mat0=interac_mat[j];
for(;j<interac_blk_dsp+interac_blk[k] && interac_mat[j]==interac_mat0;j++) vec_cnt1+=interac_cnt[j];
Matrix<Real_t> M(M_dim0, M_dim1, (Real_t*)(precomp_data[0]+interac_mat0), false);
#ifdef __MIC__
{
Matrix<Real_t> Ms(dof*vec_cnt1, M_dim0, (Real_t*)(buff_in +M_dim0*vec_cnt0*dof*sizeof(Real_t)), false);
Matrix<Real_t> Mt(dof*vec_cnt1, M_dim1, (Real_t*)(buff_out+M_dim1*vec_cnt0*dof*sizeof(Real_t)), false);
Matrix<Real_t>::GEMM(Mt,Ms,M);
}
#else
// Split the row range of the batched multiply across threads.
#pragma omp parallel for
for(int tid=0;tid<omp_p;tid++){
size_t a=(dof*vec_cnt1*(tid ))/omp_p;
size_t b=(dof*vec_cnt1*(tid+1))/omp_p;
Matrix<Real_t> Ms(b-a, M_dim0, (Real_t*)(buff_in +M_dim0*vec_cnt0*dof*sizeof(Real_t))+M_dim0*a, false);
Matrix<Real_t> Mt(b-a, M_dim1, (Real_t*)(buff_out+M_dim1*vec_cnt0*dof*sizeof(Real_t))+M_dim1*a, false);
Matrix<Real_t>::GEMM(Mt,Ms,M);
}
#endif
vec_cnt0+=vec_cnt1;
}
// Output permutation.
#pragma omp parallel for
for(int tid=0;tid<omp_p;tid++){
size_t a=( tid *vec_cnt)/omp_p;
size_t b=((tid+1)*vec_cnt)/omp_p;
// Shift each thread's range boundaries so that all interactions that
// accumulate into the same output vector (same word [3]) stay on one
// thread — the += scatter below would otherwise race.
if(tid> 0 && a<vec_cnt){ // Find 'a' independent of other threads.
size_t out_ptr=output_perm[(interac_indx+a)*4+3];
if(tid> 0) while(a<vec_cnt && out_ptr==output_perm[(interac_indx+a)*4+3]) a++;
}
if(tid<omp_p-1 && b<vec_cnt){ // Find 'b' independent of other threads.
size_t out_ptr=output_perm[(interac_indx+b)*4+3];
if(tid<omp_p-1) while(b<vec_cnt && out_ptr==output_perm[(interac_indx+b)*4+3]) b++;
}
for(size_t i=a;i<b;i++){ // Compute permutations.
// output_perm stores 4 words per interaction:
// [0] permutation offset, [1] scaling offset (into precomp_data),
// [2] source offset in buff_out, [3] destination offset in output_data.
const PERM_INT_T* perm=(PERM_INT_T*)(precomp_data[0]+output_perm[(interac_indx+i)*4+0]);
const Real_t* scal=( Real_t*)(precomp_data[0]+output_perm[(interac_indx+i)*4+1]);
const Real_t* v_in =( Real_t*)( buff_out +output_perm[(interac_indx+i)*4+2]);
Real_t* v_out=( Real_t*)( output_data[0]+output_perm[(interac_indx+i)*4+3]);
// TODO: Fix for dof>1
#ifdef __MIC__
{
// Vectorized scatter-accumulate, same head/middle/tail structure as
// the input gather but with read-modify-write (+=).
__m512d v8;
__m512d v_old;
size_t j_start=(((uintptr_t)(v_out ) + (uintptr_t)(MEM_ALIGN-1)) & ~ (uintptr_t)(MEM_ALIGN-1))-((uintptr_t)v_out);
size_t j_end =(((uintptr_t)(v_out+M_dim1) ) & ~ (uintptr_t)(MEM_ALIGN-1))-((uintptr_t)v_out);
j_start/=sizeof(Real_t);
j_end /=sizeof(Real_t);
assert(((uintptr_t)(v_out))%sizeof(Real_t)==0);
assert(((uintptr_t)(v_out+j_start))%64==0);
assert(((uintptr_t)(v_out+j_end ))%64==0);
size_t j=0;
for(;j<j_start;j++ ){
v_out[j]+=v_in[perm[j]]*scal[j];
}
for(;j<j_end ;j+=8){
v_old=_mm512_load_pd(v_out+j);
v8=_mm512_setr_pd(
v_in[perm[j+0]]*scal[j+0],
v_in[perm[j+1]]*scal[j+1],
v_in[perm[j+2]]*scal[j+2],
v_in[perm[j+3]]*scal[j+3],
v_in[perm[j+4]]*scal[j+4],
v_in[perm[j+5]]*scal[j+5],
v_in[perm[j+6]]*scal[j+6],
v_in[perm[j+7]]*scal[j+7]);
v_old=_mm512_add_pd(v_old, v8);
_mm512_storenrngo_pd(v_out+j,v_old);
}
for(;j<M_dim1 ;j++ ){
v_out[j]+=v_in[perm[j]]*scal[j];
}
}
#else
// Scalar scatter-accumulate: v_out[j] += v_in[perm[j]] * scal[j].
for(size_t j=0;j<M_dim1;j++ ){
v_out[j]+=v_in[perm[j]]*scal[j];
}
#endif
}
}
interac_indx+=vec_cnt;
interac_blk_dsp+=interac_blk[k];
}
}
if(device) MIC_Lock::release_lock(lock_idx);
}
#ifdef __INTEL_OFFLOAD
// Optionally block until the offloaded section has signalled completion.
if(SYNC){
#pragma offload if(device) target(mic:0)
{if(device) MIC_Lock::wait_lock(lock_idx);}
}
#endif
Profile::Toc();
}
  1729. template <class FMMNode>
  1730. void FMM_Pts<FMMNode>::Source2UpSetup(SetupData<Real_t>& setup_data, std::vector<Matrix<Real_t> >& buff, std::vector<Vector<FMMNode_t*> >& n_list, int level, bool device){
  1731. if(this->MultipoleOrder()==0) return;
  1732. { // Set setup_data
  1733. setup_data.level=level;
  1734. setup_data.kernel=kernel->k_s2m;
  1735. setup_data.interac_type.resize(1);
  1736. setup_data.interac_type[0]=S2U_Type;
  1737. setup_data. input_data=&buff[4];
  1738. setup_data.output_data=&buff[0];
  1739. setup_data. coord_data=&buff[6];
  1740. Vector<FMMNode_t*>& nodes_in =n_list[4];
  1741. Vector<FMMNode_t*>& nodes_out=n_list[0];
  1742. setup_data.nodes_in .clear();
  1743. setup_data.nodes_out.clear();
  1744. for(size_t i=0;i<nodes_in .Dim();i++) if(nodes_in [i]->Depth()==level || level==-1) setup_data.nodes_in .push_back(nodes_in [i]);
  1745. for(size_t i=0;i<nodes_out.Dim();i++) if(nodes_out[i]->Depth()==level || level==-1) setup_data.nodes_out.push_back(nodes_out[i]);
  1746. }
  1747. std::vector<void*>& nodes_in =setup_data.nodes_in ;
  1748. std::vector<void*>& nodes_out=setup_data.nodes_out;
  1749. std::vector<Vector<Real_t>*>& input_vector=setup_data. input_vector; input_vector.clear();
  1750. std::vector<Vector<Real_t>*>& output_vector=setup_data.output_vector; output_vector.clear();
  1751. for(size_t i=0;i<nodes_in .size();i++){
  1752. input_vector .push_back(&((FMMNode*)nodes_in [i])->src_coord);
  1753. input_vector .push_back(&((FMMNode*)nodes_in [i])->src_value);
  1754. input_vector .push_back(&((FMMNode*)nodes_in [i])->surf_coord);
  1755. input_vector .push_back(&((FMMNode*)nodes_in [i])->surf_value);
  1756. }
  1757. for(size_t i=0;i<nodes_out.size();i++){
  1758. output_vector.push_back(&upwd_check_surf[((FMMNode*)nodes_out[i])->Depth()]);
  1759. output_vector.push_back(&((FMMData*)((FMMNode*)nodes_out[i])->FMMData())->upward_equiv);
  1760. }
  1761. //Upward check to upward equivalent matrix.
  1762. Matrix<Real_t>& M_uc2ue = this->mat->Mat(level, UC2UE_Type, 0);
  1763. this->SetupInteracPts(setup_data, false, true, &M_uc2ue,device);
  1764. { // Resize device buffer
  1765. size_t n=setup_data.output_data->Dim(0)*setup_data.output_data->Dim(1)*sizeof(Real_t);
  1766. if(this->dev_buffer.Dim()<n) this->dev_buffer.ReInit(n);
  1767. }
  1768. }
/// Evaluate the source-to-upward (S2U) interaction prepared by
/// Source2UpSetup; simply forwards to the generic point-list evaluator.
template <class FMMNode>
void FMM_Pts<FMMNode>::Source2Up(SetupData<Real_t>& setup_data, bool device){
//Add Source2Up contribution.
this->EvalListPts(setup_data, device);
}
  1774. template <class FMMNode>
  1775. void FMM_Pts<FMMNode>::Up2UpSetup(SetupData<Real_t>& setup_data, std::vector<Matrix<Real_t> >& buff, std::vector<Vector<FMMNode_t*> >& n_list, int level, bool device){
  1776. if(this->MultipoleOrder()==0) return;
  1777. { // Set setup_data
  1778. setup_data.level=level;
  1779. setup_data.kernel=kernel->k_m2m;
  1780. setup_data.interac_type.resize(1);
  1781. setup_data.interac_type[0]=U2U_Type;
  1782. setup_data. input_data=&buff[0];
  1783. setup_data.output_data=&buff[0];
  1784. Vector<FMMNode_t*>& nodes_in =n_list[0];
  1785. Vector<FMMNode_t*>& nodes_out=n_list[0];
  1786. setup_data.nodes_in .clear();
  1787. setup_data.nodes_out.clear();
  1788. for(size_t i=0;i<nodes_in .Dim();i++) if(nodes_in [i]->Depth()==level+1) setup_data.nodes_in .push_back(nodes_in [i]);
  1789. for(size_t i=0;i<nodes_out.Dim();i++) if(nodes_out[i]->Depth()==level ) setup_data.nodes_out.push_back(nodes_out[i]);
  1790. }
  1791. std::vector<void*>& nodes_in =setup_data.nodes_in ;
  1792. std::vector<void*>& nodes_out=setup_data.nodes_out;
  1793. std::vector<Vector<Real_t>*>& input_vector=setup_data. input_vector; input_vector.clear();
  1794. std::vector<Vector<Real_t>*>& output_vector=setup_data.output_vector; output_vector.clear();
  1795. for(size_t i=0;i<nodes_in .size();i++) input_vector.push_back(&((FMMData*)((FMMNode*)nodes_in [i])->FMMData())->upward_equiv);
  1796. for(size_t i=0;i<nodes_out.size();i++) output_vector.push_back(&((FMMData*)((FMMNode*)nodes_out[i])->FMMData())->upward_equiv);
  1797. SetupInterac(setup_data,device);
  1798. }
/// Evaluate the upward-to-upward (U2U / M2M) translation prepared by
/// Up2UpSetup; simply forwards to the generic list evaluator.
template <class FMMNode>
void FMM_Pts<FMMNode>::Up2Up     (SetupData<Real_t>& setup_data, bool device){
//Add Up2Up contribution.
EvalList(setup_data, device);
}
  1804. template <class FMMNode>
  1805. void FMM_Pts<FMMNode>::PeriodicBC(FMMNode* node){
  1806. if(!this->Homogen() || this->MultipoleOrder()==0) return;
  1807. Matrix<Real_t>& M = Precomp(0, BC_Type, 0);
  1808. assert(node->FMMData()->upward_equiv.Dim()>0);
  1809. int dof=1;
  1810. Vector<Real_t>& upward_equiv=node->FMMData()->upward_equiv;
  1811. Vector<Real_t>& dnward_equiv=node->FMMData()->dnward_equiv;
  1812. assert(upward_equiv.Dim()==M.Dim(0)*dof);
  1813. assert(dnward_equiv.Dim()==M.Dim(1)*dof);
  1814. Matrix<Real_t> d_equiv(dof,M.Dim(0),&dnward_equiv[0],false);
  1815. Matrix<Real_t> u_equiv(dof,M.Dim(1),&upward_equiv[0],false);
  1816. Matrix<Real_t>::GEMM(d_equiv,u_equiv,M);
  1817. }
/**
 * Forward-FFT stage of the V-list (M2L) evaluation: for each octant family,
 * scatter the eight children's upward-equivalent surface densities onto a
 * regular (2m)^3 grid, run a real-to-complex 3-D FFT, and store the spectrum
 * transposed so the innermost index is the child octant (the layout the
 * Hadamard/convolution stage consumes).
 *
 * @param dof         vector degrees of freedom per point.
 * @param m           multipole order; the grid is n1=2m per dimension.
 * @param ker_dim0    kernel source dimension.
 * @param fft_vec     per-family offsets into input_data for child 0's data.
 * @param fft_scal    per-family scaling factors (ker_dim0 values each).
 * @param input_data  packed upward-equivalent densities.
 * @param output_data destination for the transposed FFT output
 *                    (fftsize_in reals per family).
 * @param buffer_     per-thread scratch, at least fftsize_in*omp_p reals.
 */
template <class FMMNode>
void FMM_Pts<FMMNode>::FFT_UpEquiv(size_t dof, size_t m, size_t ker_dim0, Vector<size_t>& fft_vec, Vector<Real_t>& fft_scal,
Vector<Real_t>& input_data, Vector<Real_t>& output_data, Vector<Real_t>& buffer_){
// Grid sizes: n3 real points, n3_ complex coefficients (r2c Hermitian symmetry).
size_t n1=m*2;
size_t n2=n1*n1;
size_t n3=n1*n2;
size_t n3_=n2*(n1/2+1);
size_t chld_cnt=1UL<<COORD_DIM;
size_t fftsize_in =2*n3_*chld_cnt*ker_dim0*dof;
int omp_p=omp_get_max_threads();
//Load permutation map.
// n = number of points on the equivalent surface.
size_t n=6*(m-1)*(m-1)+2;
static Vector<size_t> map;
{ // Build map to reorder upward_equiv
// map[i] is the linear grid index of the i-th surface point; cached in a
// function-local static and rebuilt only when m changes.
size_t n_old=map.Dim();
if(n_old!=n){
Real_t c[3]={0,0,0};
Vector<Real_t> surf=surface(m, c, (Real_t)(m-1), 0);
map.Resize(surf.Dim()/COORD_DIM);
for(size_t i=0;i<map.Dim();i++)
map[i]=((size_t)(m-1-surf[i*3]+0.5))+((size_t)(m-1-surf[i*3+1]+0.5))*n1+((size_t)(m-1-surf[i*3+2]+0.5))*n2;
}
}
{ // Build FFTW plan.
// Plan a batched r2c transform (ker_dim0*chld_cnt transforms of size n1^3);
// created once and cached via the vlist_fft_flag member.
if(!vlist_fft_flag){
int nnn[3]={(int)n1,(int)n1,(int)n1};
void *fftw_in, *fftw_out;
fftw_in = mem::aligned_new<Real_t>( n3 *ker_dim0*chld_cnt);
fftw_out = mem::aligned_new<Real_t>(2*n3_*ker_dim0*chld_cnt);
vlist_fftplan = FFTW_t<Real_t>::fft_plan_many_dft_r2c(COORD_DIM,nnn,ker_dim0*chld_cnt,
(Real_t*)fftw_in, NULL, 1, n3, (typename FFTW_t<Real_t>::cplx*)(fftw_out),NULL, 1, n3_, FFTW_ESTIMATE);
mem::aligned_delete<Real_t>((Real_t*)fftw_in );
mem::aligned_delete<Real_t>((Real_t*)fftw_out);
vlist_fft_flag=true;
}
}
{ // Offload section
size_t n_in = fft_vec.Dim();
// Families are split evenly across threads; each thread uses its own slice
// of buffer_ as FFT scratch.
#pragma omp parallel for
for(int pid=0; pid<omp_p; pid++){
size_t node_start=(n_in*(pid ))/omp_p;
size_t node_end =(n_in*(pid+1))/omp_p;
Vector<Real_t> buffer(fftsize_in, &buffer_[fftsize_in*pid], false);
for(size_t node_idx=node_start; node_idx<node_end; node_idx++){
// Pointers to the 8 children's equivalent densities (n*ker_dim0*dof apart).
Vector<Real_t*> upward_equiv(chld_cnt);
for(size_t i=0;i<chld_cnt;i++) upward_equiv[i]=&input_data[0] + fft_vec[node_idx] + n*ker_dim0*dof*i;
Vector<Real_t> upward_equiv_fft(fftsize_in, &output_data[fftsize_in *node_idx], false);
upward_equiv_fft.SetZero();
// Rearrange upward equivalent data.
// Scatter surface values (scaled by fft_scal) onto the regular grid via map.
for(size_t k=0;k<n;k++){
size_t idx=map[k];
for(int j1=0;j1<dof;j1++)
for(int j0=0;j0<(int)chld_cnt;j0++)
for(int i=0;i<ker_dim0;i++)
upward_equiv_fft[idx+(j0+(i+j1*ker_dim0)*chld_cnt)*n3]=upward_equiv[j0][ker_dim0*(n*j1+k)+i]*fft_scal[ker_dim0*node_idx+i];
}
// Compute FFT.
for(int i=0;i<dof;i++)
FFTW_t<Real_t>::fft_execute_dft_r2c(vlist_fftplan, (Real_t*)&upward_equiv_fft[i* n3 *ker_dim0*chld_cnt],
(typename FFTW_t<Real_t>::cplx*)&buffer [i*2*n3_*ker_dim0*chld_cnt]);
//Compute flops.
#ifndef FFTW3_MKL
double add, mul, fma;
FFTW_t<Real_t>::fftw_flops(vlist_fftplan, &add, &mul, &fma);
#ifndef __INTEL_OFFLOAD0
Profile::Add_FLOP((long long)(add+mul+2*fma));
#endif
#endif
// Transpose the complex spectrum from (transform-major, frequency) to
// (frequency-major, child) order expected by the convolution stage.
for(int i=0;i<ker_dim0*dof;i++)
for(size_t j=0;j<n3_;j++)
for(size_t k=0;k<chld_cnt;k++){
upward_equiv_fft[2*(chld_cnt*(n3_*i+j)+k)+0]=buffer[2*(n3_*(chld_cnt*i+k)+j)+0];
upward_equiv_fft[2*(chld_cnt*(n3_*i+j)+k)+1]=buffer[2*(n3_*(chld_cnt*i+k)+j)+1];
}
}
}
}
}
/**
 * Inverse-FFT stage of the V-list (M2L) evaluation: for each octant family,
 * de-interleave the accumulated downward-check spectrum, run a complex-to-real
 * 3-D inverse FFT, gather the check-surface values off the regular grid, then
 * convert check to equivalent densities with the supplied matrix M (scaled by
 * ifft_scal) and accumulate into output_data.
 *
 * @param dof         vector degrees of freedom per point.
 * @param m           multipole order; the grid is n1=2m per dimension.
 * @param ker_dim1    kernel target dimension.
 * @param ifft_vec    per-family offsets into output_data for child 0's data.
 * @param ifft_scal   per-family scaling factors (ker_dim0 values each).
 * @param input_data  transposed spectra, fftsize_out reals per family.
 * @param output_data packed downward-equivalent densities (accumulated into).
 * @param buffer_     per-thread scratch, >= (fftsize_out+M.Dim(1)*dof)*omp_p.
 * @param M           check-to-equivalent (DC2DE) conversion matrix.
 */
template <class FMMNode>
void FMM_Pts<FMMNode>::FFT_Check2Equiv(size_t dof, size_t m, size_t ker_dim1, Vector<size_t>& ifft_vec, Vector<Real_t>& ifft_scal,
Vector<Real_t>& input_data, Vector<Real_t>& output_data, Vector<Real_t>& buffer_, Matrix<Real_t>& M){
// Grid sizes: n3 real points, n3_ complex coefficients (c2r Hermitian symmetry).
size_t n1=m*2;
size_t n2=n1*n1;
size_t n3=n1*n2;
size_t n3_=n2*(n1/2+1);
size_t chld_cnt=1UL<<COORD_DIM;
size_t fftsize_out=2*n3_*dof*ker_dim1*chld_cnt;
// Recover the kernel source dimension from M's shape.
size_t ker_dim0=M.Dim(1)/(M.Dim(0)/ker_dim1);
int omp_p=omp_get_max_threads();
//Load permutation map.
// n = number of points on the check surface.
size_t n=6*(m-1)*(m-1)+2;
static Vector<size_t> map;
{ // Build map to reorder dnward_check
// map[i] is the linear grid index of the i-th surface point; cached in a
// function-local static and rebuilt only when m changes. (Different
// index formula than FFT_UpEquiv: the check surface sits at the far
// corner of the grid.)
size_t n_old=map.Dim();
if(n_old!=n){
Real_t c[3]={0,0,0};
Vector<Real_t> surf=surface(m, c, (Real_t)(m-1), 0);
map.Resize(surf.Dim()/COORD_DIM);
for(size_t i=0;i<map.Dim();i++)
map[i]=((size_t)(m*2-0.5-surf[i*3]))+((size_t)(m*2-0.5-surf[i*3+1]))*n1+((size_t)(m*2-0.5-surf[i*3+2]))*n2;
//map;//.AllocDevice(true);
}
}
{ // Build FFTW plan.
//Build FFTW plan.
// Plan a batched c2r transform (ker_dim1*chld_cnt transforms of size n1^3);
// created once and cached via the vlist_ifft_flag member.
if(!vlist_ifft_flag){
int nnn[3]={(int)n1,(int)n1,(int)n1};
Real_t *fftw_in, *fftw_out;
fftw_in = mem::aligned_new<Real_t>(2*n3_*ker_dim1*chld_cnt);
fftw_out = mem::aligned_new<Real_t>( n3 *ker_dim1*chld_cnt);
vlist_ifftplan = FFTW_t<Real_t>::fft_plan_many_dft_c2r(COORD_DIM,nnn,ker_dim1*chld_cnt,
(typename FFTW_t<Real_t>::cplx*)fftw_in, NULL, 1, n3_, (Real_t*)(fftw_out),NULL, 1, n3, FFTW_ESTIMATE);
mem::aligned_delete<Real_t>(fftw_in);
mem::aligned_delete<Real_t>(fftw_out);
vlist_ifft_flag=true;
}
}
{ // Offload section
assert(buffer_.Dim()>=(fftsize_out+M.Dim(1)*dof)*omp_p);
size_t n_out=ifft_vec.Dim();
// Families are split evenly across threads; each thread's buffer slice holds
// the de-interleaved spectrum plus one d_equiv work vector at the end.
#pragma omp parallel for
for(int pid=0; pid<omp_p; pid++){
size_t node_start=(n_out*(pid ))/omp_p;
size_t node_end =(n_out*(pid+1))/omp_p;
Vector<Real_t> buffer(fftsize_out+M.Dim(1)*dof, &buffer_[(fftsize_out+M.Dim(1)*dof)*pid], false);
for(size_t node_idx=node_start; node_idx<node_end; node_idx++){
Vector<Real_t> dnward_check_fft(fftsize_out, &input_data[fftsize_out*node_idx], false);
//De-interleave data.
// Inverse of the transpose done in FFT_UpEquiv: back to transform-major order.
for(int i=0;i<ker_dim1*dof;i++)
for(size_t j=0;j<n3_;j++)
for(size_t k=0;k<chld_cnt;k++){
buffer[2*(n3_*(ker_dim1*dof*k+i)+j)+0]=dnward_check_fft[2*(chld_cnt*(n3_*i+j)+k)+0];
buffer[2*(n3_*(ker_dim1*dof*k+i)+j)+1]=dnward_check_fft[2*(chld_cnt*(n3_*i+j)+k)+1];
}
// Compute FFT.
// In-place reuse: dnward_check_fft now receives the real-space grid values.
for(int i=0;i<dof;i++)
FFTW_t<Real_t>::fft_execute_dft_c2r(vlist_ifftplan, (typename FFTW_t<Real_t>::cplx*)&buffer [i*2*n3_*ker_dim1*chld_cnt],
(Real_t*)&dnward_check_fft[i* n3 *ker_dim1*chld_cnt]);
//Compute flops.
#ifndef FFTW3_MKL
double add, mul, fma;
FFTW_t<Real_t>::fftw_flops(vlist_ifftplan, &add, &mul, &fma);
#ifndef __INTEL_OFFLOAD0
Profile::Add_FLOP((long long)(add+mul+2*fma));
#endif
#endif
// Rearrange downward check data.
// Gather check-surface values off the regular grid via map.
for(size_t k=0;k<n;k++){
size_t idx=map[k];
for(int j1=0;j1<dof;j1++)
for(int j0=0;j0<(int)chld_cnt;j0++)
for(int i=0;i<ker_dim1;i++)
buffer[ker_dim1*(n*(dof*j0+j1)+k)+i]=dnward_check_fft[idx+(j1+(i+j0*ker_dim1)*dof)*n3];
}
// Compute check to equiv.
// Per child: d_equiv = d_check * M, scale by ifft_scal, then accumulate
// into the node's downward-equivalent density.
for(size_t j=0;j<chld_cnt;j++){
Matrix<Real_t> d_check(dof,M.Dim(0),&buffer[n*ker_dim1*dof*j],false);
Matrix<Real_t> d_equiv(dof,M.Dim(1),&buffer[       fftsize_out],false);
Matrix<Real_t>::GEMM(d_equiv,d_check,M,0.0);
for(size_t i=0;i<dof*M.Dim(1);i+=ker_dim0){
for(size_t j=0;j<ker_dim0;j++){
d_equiv[0][i+j]*=ifft_scal[ker_dim0*node_idx+j];
}
}
{ // Add to equiv density
Matrix<Real_t> d_equiv_(dof,M.Dim(1),&output_data[0] + ifft_vec[node_idx] + M.Dim(1)*dof*j,false);
d_equiv_+=d_equiv;
}
}
}
}
}
}
  1991. template<class Real_t>
  1992. inline void matmult_8x8x2(Real_t*& M_, Real_t*& IN0, Real_t*& IN1, Real_t*& OUT0, Real_t*& OUT1){
  1993. // Generic code.
  1994. Real_t out_reg000, out_reg001, out_reg010, out_reg011;
  1995. Real_t out_reg100, out_reg101, out_reg110, out_reg111;
  1996. Real_t in_reg000, in_reg001, in_reg010, in_reg011;
  1997. Real_t in_reg100, in_reg101, in_reg110, in_reg111;
  1998. Real_t m_reg000, m_reg001, m_reg010, m_reg011;
  1999. Real_t m_reg100, m_reg101, m_reg110, m_reg111;
  2000. //#pragma unroll
  2001. for(int i1=0;i1<8;i1+=2){
  2002. Real_t* IN0_=IN0;
  2003. Real_t* IN1_=IN1;
  2004. out_reg000=OUT0[ 0]; out_reg001=OUT0[ 1];
  2005. out_reg010=OUT0[ 2]; out_reg011=OUT0[ 3];
  2006. out_reg100=OUT1[ 0]; out_reg101=OUT1[ 1];
  2007. out_reg110=OUT1[ 2]; out_reg111=OUT1[ 3];
  2008. //#pragma unroll
  2009. for(int i2=0;i2<8;i2+=2){
  2010. m_reg000=M_[ 0]; m_reg001=M_[ 1];
  2011. m_reg010=M_[ 2]; m_reg011=M_[ 3];
  2012. m_reg100=M_[16]; m_reg101=M_[17];
  2013. m_reg110=M_[18]; m_reg111=M_[19];
  2014. in_reg000=IN0_[0]; in_reg001=IN0_[1];
  2015. in_reg010=IN0_[2]; in_reg011=IN0_[3];
  2016. in_reg100=IN1_[0]; in_reg101=IN1_[1];
  2017. in_reg110=IN1_[2]; in_reg111=IN1_[3];
  2018. out_reg000 += m_reg000*in_reg000 - m_reg001*in_reg001;
  2019. out_reg001 += m_reg000*in_reg001 + m_reg001*in_reg000;
  2020. out_reg010 += m_reg010*in_reg000 - m_reg011*in_reg001;
  2021. out_reg011 += m_reg010*in_reg001 + m_reg011*in_reg000;
  2022. out_reg000 += m_reg100*in_reg010 - m_reg101*in_reg011;
  2023. out_reg001 += m_reg100*in_reg011 + m_reg101*in_reg010;
  2024. out_reg010 += m_reg110*in_reg010 - m_reg111*in_reg011;
  2025. out_reg011 += m_reg110*in_reg011 + m_reg111*in_reg010;
  2026. out_reg100 += m_reg000*in_reg100 - m_reg001*in_reg101;
  2027. out_reg101 += m_reg000*in_reg101 + m_reg001*in_reg100;
  2028. out_reg110 += m_reg010*in_reg100 - m_reg011*in_reg101;
  2029. out_reg111 += m_reg010*in_reg101 + m_reg011*in_reg100;
  2030. out_reg100 += m_reg100*in_reg110 - m_reg101*in_reg111;
  2031. out_reg101 += m_reg100*in_reg111 + m_reg101*in_reg110;
  2032. out_reg110 += m_reg110*in_reg110 - m_reg111*in_reg111;
  2033. out_reg111 += m_reg110*in_reg111 + m_reg111*in_reg110;
  2034. M_+=32; // Jump to (column+2).
  2035. IN0_+=4;
  2036. IN1_+=4;
  2037. }
  2038. OUT0[ 0]=out_reg000; OUT0[ 1]=out_reg001;
  2039. OUT0[ 2]=out_reg010; OUT0[ 3]=out_reg011;
  2040. OUT1[ 0]=out_reg100; OUT1[ 1]=out_reg101;
  2041. OUT1[ 2]=out_reg110; OUT1[ 3]=out_reg111;
  2042. M_+=4-64*2; // Jump back to first column (row+2).
  2043. OUT0+=4;
  2044. OUT1+=4;
  2045. }
  2046. }
#if defined(__AVX__) || defined(__SSE3__)
// Double-precision specialization of matmult_8x8x2: OUT0 += M*IN0 and
// OUT1 += M*IN1 for an 8x8 complex matrix M stored as interleaved (re,im)
// pairs in column-major order (16 doubles per column); IN0/IN1/OUT0/OUT1
// each hold 8 complex values (16 doubles).  Complex multiply-accumulate is
// built from unpack/permute + addsub:
//   re += m_re*x_re - m_im*x_im;  im += m_re*x_im + m_im*x_re.
// NOTE(review): the aligned load/store intrinsics require suitably aligned
// buffers -- presumably guaranteed by the aligned allocations in the
// callers; confirm before changing allocation paths.
// The by-reference pointer arguments are advanced as a side effect (by
// different amounts in the AVX and SSE branches); the caller (VListHadamard)
// passes fresh pointer copies on every call, so the final values are unused.
template<>
inline void matmult_8x8x2<double>(double*& M_, double*& IN0, double*& IN1, double*& OUT0, double*& OUT1){
#ifdef __AVX__ //AVX code.
// All 8 complex entries of both output vectors stay in registers.
__m256d out00,out01,out10,out11;
__m256d out20,out21,out30,out31;
double* in0__ = IN0;
double* in1__ = IN1;
out00 = _mm256_load_pd(OUT0);
out01 = _mm256_load_pd(OUT1);
out10 = _mm256_load_pd(OUT0+4);
out11 = _mm256_load_pd(OUT1+4);
out20 = _mm256_load_pd(OUT0+8);
out21 = _mm256_load_pd(OUT1+8);
out30 = _mm256_load_pd(OUT0+12);
out31 = _mm256_load_pd(OUT1+12);
for(int i2=0;i2<8;i2+=2){ // Two matrix columns per iteration.
__m256d m00;
__m256d ot00;
__m256d mt0,mtt0;
__m256d in00,in00_r,in01,in01_r;
// Broadcast IN0[i2]/IN1[i2] to both 128-bit lanes; *_r has re/im swapped.
in00 = _mm256_broadcast_pd((const __m128d*)in0__);
in00_r = _mm256_permute_pd(in00,5);
in01 = _mm256_broadcast_pd((const __m128d*)in1__);
in01_r = _mm256_permute_pd(in01,5);
// Column i2, rows 0-1.
m00 = _mm256_load_pd(M_);
mt0 = _mm256_unpacklo_pd(m00,m00);  // duplicated real parts of M entries
ot00 = _mm256_mul_pd(mt0,in00);
mtt0 = _mm256_unpackhi_pd(m00,m00); // duplicated imaginary parts
out00 = _mm256_add_pd(out00,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in00_r)));
ot00 = _mm256_mul_pd(mt0,in01);
out01 = _mm256_add_pd(out01,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in01_r)));
// Column i2, rows 2-3.
m00 = _mm256_load_pd(M_+4);
mt0 = _mm256_unpacklo_pd(m00,m00);
ot00 = _mm256_mul_pd(mt0,in00);
mtt0 = _mm256_unpackhi_pd(m00,m00);
out10 = _mm256_add_pd(out10,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in00_r)));
ot00 = _mm256_mul_pd(mt0,in01);
out11 = _mm256_add_pd(out11,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in01_r)));
// Column i2, rows 4-5.
m00 = _mm256_load_pd(M_+8);
mt0 = _mm256_unpacklo_pd(m00,m00);
ot00 = _mm256_mul_pd(mt0,in00);
mtt0 = _mm256_unpackhi_pd(m00,m00);
out20 = _mm256_add_pd(out20,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in00_r)));
ot00 = _mm256_mul_pd(mt0,in01);
out21 = _mm256_add_pd(out21,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in01_r)));
// Column i2, rows 6-7.
m00 = _mm256_load_pd(M_+12);
mt0 = _mm256_unpacklo_pd(m00,m00);
ot00 = _mm256_mul_pd(mt0,in00);
mtt0 = _mm256_unpackhi_pd(m00,m00);
out30 = _mm256_add_pd(out30,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in00_r)));
ot00 = _mm256_mul_pd(mt0,in01);
out31 = _mm256_add_pd(out31,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in01_r)));
// Inputs IN0[i2+1]/IN1[i2+1] for the second column of this iteration.
in00 = _mm256_broadcast_pd((const __m128d*) (in0__+2));
in00_r = _mm256_permute_pd(in00,5);
in01 = _mm256_broadcast_pd((const __m128d*) (in1__+2));
in01_r = _mm256_permute_pd(in01,5);
// Column i2+1, rows 0-1.
m00 = _mm256_load_pd(M_+16);
mt0 = _mm256_unpacklo_pd(m00,m00);
ot00 = _mm256_mul_pd(mt0,in00);
mtt0 = _mm256_unpackhi_pd(m00,m00);
out00 = _mm256_add_pd(out00,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in00_r)));
ot00 = _mm256_mul_pd(mt0,in01);
out01 = _mm256_add_pd(out01,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in01_r)));
// Column i2+1, rows 2-3.
m00 = _mm256_load_pd(M_+20);
mt0 = _mm256_unpacklo_pd(m00,m00);
ot00 = _mm256_mul_pd(mt0,in00);
mtt0 = _mm256_unpackhi_pd(m00,m00);
out10 = _mm256_add_pd(out10,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in00_r)));
ot00 = _mm256_mul_pd(mt0,in01);
out11 = _mm256_add_pd(out11,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in01_r)));
// Column i2+1, rows 4-5.
m00 = _mm256_load_pd(M_+24);
mt0 = _mm256_unpacklo_pd(m00,m00);
ot00 = _mm256_mul_pd(mt0,in00);
mtt0 = _mm256_unpackhi_pd(m00,m00);
out20 = _mm256_add_pd(out20,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in00_r)));
ot00 = _mm256_mul_pd(mt0,in01);
out21 = _mm256_add_pd(out21,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in01_r)));
// Column i2+1, rows 6-7.
m00 = _mm256_load_pd(M_+28);
mt0 = _mm256_unpacklo_pd(m00,m00);
ot00 = _mm256_mul_pd(mt0,in00);
mtt0 = _mm256_unpackhi_pd(m00,m00);
out30 = _mm256_add_pd(out30,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in00_r)));
ot00 = _mm256_mul_pd(mt0,in01);
out31 = _mm256_add_pd(out31,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in01_r)));
M_ += 32;
in0__ += 4;
in1__ += 4;
}
// Write back the accumulated outputs.
_mm256_store_pd(OUT0,out00);
_mm256_store_pd(OUT1,out01);
_mm256_store_pd(OUT0+4,out10);
_mm256_store_pd(OUT1+4,out11);
_mm256_store_pd(OUT0+8,out20);
_mm256_store_pd(OUT1+8,out21);
_mm256_store_pd(OUT0+12,out30);
_mm256_store_pd(OUT1+12,out31);
#elif defined __SSE3__ // SSE code.
// SSE3 variant: processes one output row pair per outer iteration.
__m128d out00, out01, out10, out11;
__m128d in00, in01, in10, in11;
__m128d m00, m01, m10, m11;
//#pragma unroll
for(int i1=0;i1<8;i1+=2){ // Output rows i1 and i1+1.
double* IN0_=IN0;
double* IN1_=IN1;
out00 =_mm_load_pd (OUT0 );
out10 =_mm_load_pd (OUT0+2);
out01 =_mm_load_pd (OUT1 );
out11 =_mm_load_pd (OUT1+2);
//#pragma unroll
for(int i2=0;i2<8;i2+=2){ // Columns i2 and i2+1.
// Real parts of the four M entries, applied with plain adds ...
m00 =_mm_load1_pd (M_ );
m10 =_mm_load1_pd (M_+ 2);
m01 =_mm_load1_pd (M_+16);
m11 =_mm_load1_pd (M_+18);
in00 =_mm_load_pd (IN0_ );
in10 =_mm_load_pd (IN0_+2);
in01 =_mm_load_pd (IN1_ );
in11 =_mm_load_pd (IN1_+2);
out00 = _mm_add_pd (out00, _mm_mul_pd(m00 , in00 ));
out00 = _mm_add_pd (out00, _mm_mul_pd(m01 , in10 ));
out01 = _mm_add_pd (out01, _mm_mul_pd(m00 , in01 ));
out01 = _mm_add_pd (out01, _mm_mul_pd(m01 , in11 ));
out10 = _mm_add_pd (out10, _mm_mul_pd(m10 , in00 ));
out10 = _mm_add_pd (out10, _mm_mul_pd(m11 , in10 ));
out11 = _mm_add_pd (out11, _mm_mul_pd(m10 , in01 ));
out11 = _mm_add_pd (out11, _mm_mul_pd(m11 , in11 ));
// ... then imaginary parts against re/im-swapped inputs via addsub.
m00 =_mm_load1_pd (M_+ 1);
m10 =_mm_load1_pd (M_+ 2+1);
m01 =_mm_load1_pd (M_+16+1);
m11 =_mm_load1_pd (M_+18+1);
in00 =_mm_shuffle_pd (in00,in00,_MM_SHUFFLE2(0,1));
in01 =_mm_shuffle_pd (in01,in01,_MM_SHUFFLE2(0,1));
in10 =_mm_shuffle_pd (in10,in10,_MM_SHUFFLE2(0,1));
in11 =_mm_shuffle_pd (in11,in11,_MM_SHUFFLE2(0,1));
out00 = _mm_addsub_pd(out00, _mm_mul_pd(m00, in00));
out00 = _mm_addsub_pd(out00, _mm_mul_pd(m01, in10));
out01 = _mm_addsub_pd(out01, _mm_mul_pd(m00, in01));
out01 = _mm_addsub_pd(out01, _mm_mul_pd(m01, in11));
out10 = _mm_addsub_pd(out10, _mm_mul_pd(m10, in00));
out10 = _mm_addsub_pd(out10, _mm_mul_pd(m11, in10));
out11 = _mm_addsub_pd(out11, _mm_mul_pd(m10, in01));
out11 = _mm_addsub_pd(out11, _mm_mul_pd(m11, in11));
M_+=32; // Jump to (column+2).
IN0_+=4;
IN1_+=4;
}
_mm_store_pd (OUT0 ,out00);
_mm_store_pd (OUT0+2,out10);
_mm_store_pd (OUT1 ,out01);
_mm_store_pd (OUT1+2,out11);
M_+=4-64*2; // Jump back to first column (row+2).
OUT0+=4;
OUT1+=4;
}
#endif
}
#endif
#if defined(__SSE3__)
// Single-precision specialization of matmult_8x8x2 (SSE3): OUT0 += M*IN0
// and OUT1 += M*IN1 for an 8x8 complex matrix M stored as interleaved
// (re,im) float pairs in column-major order (16 floats per column).
// Each __m128 holds two complex floats; one complex input value is
// broadcast to both halves with a 64-bit load, and the complex product is
// formed with shuffle (re-duplicate / im-duplicate / re-im swap) + addsub,
// mirroring the double/AVX specialization above.
// NOTE(review): aligned loads/stores assume 16-byte-aligned buffers --
// presumably guaranteed by the aligned allocations in the callers; confirm.
// M_ is advanced as a side effect; callers pass fresh pointer copies.
template<>
inline void matmult_8x8x2<float>(float*& M_, float*& IN0, float*& IN1, float*& OUT0, float*& OUT1){
#if defined __SSE3__ // SSE code.
// All 8 complex entries of both output vectors stay in registers.
__m128 out00,out01,out10,out11;
__m128 out20,out21,out30,out31;
float* in0__ = IN0;
float* in1__ = IN1;
out00 = _mm_load_ps(OUT0);
out01 = _mm_load_ps(OUT1);
out10 = _mm_load_ps(OUT0+4);
out11 = _mm_load_ps(OUT1+4);
out20 = _mm_load_ps(OUT0+8);
out21 = _mm_load_ps(OUT1+8);
out30 = _mm_load_ps(OUT0+12);
out31 = _mm_load_ps(OUT1+12);
for(int i2=0;i2<8;i2+=2){ // Two matrix columns per iteration.
__m128 m00;
__m128 ot00;
__m128 mt0,mtt0;
__m128 in00,in00_r,in01,in01_r;
// Broadcast the complex value IN0[i2]/IN1[i2] (one 64-bit pair) to both
// halves of the register; *_r has re/im swapped within each pair.
in00 = _mm_castpd_ps(_mm_load_pd1((const double*)in0__));
in00_r = _mm_shuffle_ps(in00,in00,_MM_SHUFFLE(2,3,0,1));
in01 = _mm_castpd_ps(_mm_load_pd1((const double*)in1__));
in01_r = _mm_shuffle_ps(in01,in01,_MM_SHUFFLE(2,3,0,1));
// Column i2, rows 0-1.
m00 = _mm_load_ps(M_);
mt0 = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(2,2,0,0)); // duplicated real parts
out00= _mm_add_ps (out00,_mm_mul_ps( mt0,in00 ));
mtt0 = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(3,3,1,1)); // duplicated imag parts
out00= _mm_addsub_ps(out00,_mm_mul_ps(mtt0,in00_r));
out01 = _mm_add_ps (out01,_mm_mul_ps( mt0,in01 ));
out01 = _mm_addsub_ps(out01,_mm_mul_ps(mtt0,in01_r));
// Column i2, rows 2-3.
m00 = _mm_load_ps(M_+4);
mt0 = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(2,2,0,0));
out10= _mm_add_ps (out10,_mm_mul_ps( mt0,in00 ));
mtt0 = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(3,3,1,1));
out10= _mm_addsub_ps(out10,_mm_mul_ps(mtt0,in00_r));
out11 = _mm_add_ps (out11,_mm_mul_ps( mt0,in01 ));
out11 = _mm_addsub_ps(out11,_mm_mul_ps(mtt0,in01_r));
// Column i2, rows 4-5.
m00 = _mm_load_ps(M_+8);
mt0 = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(2,2,0,0));
out20= _mm_add_ps (out20,_mm_mul_ps( mt0,in00 ));
mtt0 = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(3,3,1,1));
out20= _mm_addsub_ps(out20,_mm_mul_ps(mtt0,in00_r));
out21 = _mm_add_ps (out21,_mm_mul_ps( mt0,in01 ));
out21 = _mm_addsub_ps(out21,_mm_mul_ps(mtt0,in01_r));
// Column i2, rows 6-7.
m00 = _mm_load_ps(M_+12);
mt0 = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(2,2,0,0));
out30= _mm_add_ps (out30,_mm_mul_ps( mt0, in00));
mtt0 = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(3,3,1,1));
out30= _mm_addsub_ps(out30,_mm_mul_ps(mtt0,in00_r));
out31 = _mm_add_ps (out31,_mm_mul_ps( mt0,in01 ));
out31 = _mm_addsub_ps(out31,_mm_mul_ps(mtt0,in01_r));
// Inputs IN0[i2+1]/IN1[i2+1] for the second column of this iteration.
in00 = _mm_castpd_ps(_mm_load_pd1((const double*) (in0__+2)));
in00_r = _mm_shuffle_ps(in00,in00,_MM_SHUFFLE(2,3,0,1));
in01 = _mm_castpd_ps(_mm_load_pd1((const double*) (in1__+2)));
in01_r = _mm_shuffle_ps(in01,in01,_MM_SHUFFLE(2,3,0,1));
// Column i2+1, rows 0-1.
m00 = _mm_load_ps(M_+16);
mt0 = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(2,2,0,0));
out00= _mm_add_ps (out00,_mm_mul_ps( mt0,in00 ));
mtt0 = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(3,3,1,1));
out00= _mm_addsub_ps(out00,_mm_mul_ps(mtt0,in00_r));
out01 = _mm_add_ps (out01,_mm_mul_ps( mt0,in01 ));
out01 = _mm_addsub_ps(out01,_mm_mul_ps(mtt0,in01_r));
// Column i2+1, rows 2-3.
m00 = _mm_load_ps(M_+20);
mt0 = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(2,2,0,0));
out10= _mm_add_ps (out10,_mm_mul_ps( mt0,in00 ));
mtt0 = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(3,3,1,1));
out10= _mm_addsub_ps(out10,_mm_mul_ps(mtt0,in00_r));
out11 = _mm_add_ps (out11,_mm_mul_ps( mt0,in01 ));
out11 = _mm_addsub_ps(out11,_mm_mul_ps(mtt0,in01_r));
// Column i2+1, rows 4-5.
m00 = _mm_load_ps(M_+24);
mt0 = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(2,2,0,0));
out20= _mm_add_ps (out20,_mm_mul_ps( mt0,in00 ));
mtt0 = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(3,3,1,1));
out20= _mm_addsub_ps(out20,_mm_mul_ps(mtt0,in00_r));
out21 = _mm_add_ps (out21,_mm_mul_ps( mt0,in01 ));
out21 = _mm_addsub_ps(out21,_mm_mul_ps(mtt0,in01_r));
// Column i2+1, rows 6-7.
m00 = _mm_load_ps(M_+28);
mt0 = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(2,2,0,0));
out30= _mm_add_ps (out30,_mm_mul_ps( mt0,in00 ));
mtt0 = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(3,3,1,1));
out30= _mm_addsub_ps(out30,_mm_mul_ps(mtt0,in00_r));
out31 = _mm_add_ps (out31,_mm_mul_ps( mt0,in01 ));
out31 = _mm_addsub_ps(out31,_mm_mul_ps(mtt0,in01_r));
M_ += 32;
in0__ += 4;
in1__ += 4;
}
// Write back the accumulated outputs.
_mm_store_ps(OUT0,out00);
_mm_store_ps(OUT1,out01);
_mm_store_ps(OUT0+4,out10);
_mm_store_ps(OUT1+4,out11);
_mm_store_ps(OUT0+8,out20);
_mm_store_ps(OUT1+8,out21);
_mm_store_ps(OUT0+12,out30);
_mm_store_ps(OUT1+12,out31);
#endif
}
#endif
// Evaluate all V-list (M2L) interactions in the frequency domain as blocked
// 8x8 complex Hadamard-style matrix products: for every (input,output) pair
// listed in interac_vec, fft_out += T * fft_in via matmult_8x8x2.
//   dof        : only used in the FLOP accounting below.
//   M_dim      : number of frequency components per child octant.
//   ker_dim0/1 : kernel source/target dimensions.
//   interac_dsp: per-(cache block, matrix) prefix counts delimiting entries
//                of interac_vec.
//   interac_vec: flattened (input offset, output offset) pairs into
//                fft_in/fft_out.
//   precomp_mat: precomputed frequency-domain translation operators.
//   fft_in/out : forward-transformed input data / accumulated output data
//                (fft_out is zeroed here before accumulation).
template <class Real_t>
void VListHadamard(size_t dof, size_t M_dim, size_t ker_dim0, size_t ker_dim1, Vector<size_t>& interac_dsp,
Vector<size_t>& interac_vec, Vector<Real_t*>& precomp_mat, Vector<Real_t>& fft_in, Vector<Real_t>& fft_out){
size_t chld_cnt=1UL<<COORD_DIM;
size_t fftsize_in =M_dim*ker_dim0*chld_cnt*2;
size_t fftsize_out=M_dim*ker_dim1*chld_cnt*2;
// Dummy input/output vectors: the kernel always consumes interactions in
// pairs, so an odd interaction count is padded with these below.
Real_t* zero_vec0=mem::aligned_new<Real_t>(fftsize_in );
Real_t* zero_vec1=mem::aligned_new<Real_t>(fftsize_out);
size_t n_out=fft_out.Dim()/fftsize_out;
// Set buff_out to zero.
#pragma omp parallel for
for(size_t k=0;k<n_out;k++){
Vector<Real_t> dnward_check_fft(fftsize_out, &fft_out[k*fftsize_out], false);
dnward_check_fft.SetZero();
}
// Build list of interaction pairs (in, out vectors).
size_t mat_cnt=precomp_mat.Dim();
size_t blk1_cnt=interac_dsp.Dim()/mat_cnt;
const size_t V_BLK_SIZE=V_BLK_CACHE*64/sizeof(Real_t);
Real_t** IN_ =mem::aligned_new<Real_t*>(2*V_BLK_SIZE*blk1_cnt*mat_cnt);
Real_t** OUT_=mem::aligned_new<Real_t*>(2*V_BLK_SIZE*blk1_cnt*mat_cnt);
#pragma omp parallel for
for(size_t interac_blk1=0; interac_blk1<blk1_cnt*mat_cnt; interac_blk1++){
size_t interac_dsp0 = (interac_blk1==0?0:interac_dsp[interac_blk1-1]);
size_t interac_dsp1 = interac_dsp[interac_blk1 ] ;
size_t interac_cnt = interac_dsp1-interac_dsp0;
for(size_t j=0;j<interac_cnt;j++){
IN_ [2*V_BLK_SIZE*interac_blk1 +j]=&fft_in [interac_vec[(interac_dsp0+j)*2+0]];
OUT_[2*V_BLK_SIZE*interac_blk1 +j]=&fft_out[interac_vec[(interac_dsp0+j)*2+1]];
}
// Padding entry so that IN[j+1]/OUT[j+1] stay valid when interac_cnt is odd.
IN_ [2*V_BLK_SIZE*interac_blk1 +interac_cnt]=zero_vec0;
OUT_[2*V_BLK_SIZE*interac_blk1 +interac_cnt]=zero_vec1;
}
int omp_p=omp_get_max_threads();
// Each thread owns the contiguous frequency range [a,b); the loop nest
// keeps the per-frequency 8x8 matrix blocks hot in cache across blk1.
#pragma omp parallel for
for(int pid=0; pid<omp_p; pid++){
size_t a=( pid *M_dim)/omp_p;
size_t b=((pid+1)*M_dim)/omp_p;
for(int in_dim=0;in_dim<ker_dim0;in_dim++)
for(int ot_dim=0;ot_dim<ker_dim1;ot_dim++)
for(size_t blk1=0; blk1<blk1_cnt; blk1++)
for(size_t k=a; k< b; k++)
for(size_t mat_indx=0; mat_indx< mat_cnt;mat_indx++){
size_t interac_blk1 = blk1*mat_cnt+mat_indx;
size_t interac_dsp0 = (interac_blk1==0?0:interac_dsp[interac_blk1-1]);
size_t interac_dsp1 = interac_dsp[interac_blk1 ] ;
size_t interac_cnt = interac_dsp1-interac_dsp0;
Real_t** IN = IN_ + 2*V_BLK_SIZE*interac_blk1;
Real_t** OUT= OUT_+ 2*V_BLK_SIZE*interac_blk1;
// 8x8 complex block (chld_cnt*chld_cnt*2 = 128 reals) for frequency k
// and the (in_dim,ot_dim) kernel component.
Real_t* M = precomp_mat[mat_indx] + k*chld_cnt*chld_cnt*2 + (ot_dim+in_dim*ker_dim1)*M_dim*128;
{
for(size_t j=0;j<interac_cnt;j+=2){ // Two interactions per call (padded above).
Real_t* M_ = M;
Real_t* IN0 = IN [j+0] + (in_dim*M_dim+k)*chld_cnt*2;
Real_t* IN1 = IN [j+1] + (in_dim*M_dim+k)*chld_cnt*2;
Real_t* OUT0 = OUT[j+0] + (ot_dim*M_dim+k)*chld_cnt*2;
Real_t* OUT1 = OUT[j+1] + (ot_dim*M_dim+k)*chld_cnt*2;
#ifdef __SSE__
if (j+2 < interac_cnt) { // Prefetch
_mm_prefetch(((char *)(IN[j+2] + (in_dim*M_dim+k)*chld_cnt*2)), _MM_HINT_T0);
_mm_prefetch(((char *)(IN[j+2] + (in_dim*M_dim+k)*chld_cnt*2) + 64), _MM_HINT_T0);
_mm_prefetch(((char *)(IN[j+3] + (in_dim*M_dim+k)*chld_cnt*2)), _MM_HINT_T0);
_mm_prefetch(((char *)(IN[j+3] + (in_dim*M_dim+k)*chld_cnt*2) + 64), _MM_HINT_T0);
_mm_prefetch(((char *)(OUT[j+2] + (ot_dim*M_dim+k)*chld_cnt*2)), _MM_HINT_T0);
_mm_prefetch(((char *)(OUT[j+2] + (ot_dim*M_dim+k)*chld_cnt*2) + 64), _MM_HINT_T0);
_mm_prefetch(((char *)(OUT[j+3] + (ot_dim*M_dim+k)*chld_cnt*2)), _MM_HINT_T0);
_mm_prefetch(((char *)(OUT[j+3] + (ot_dim*M_dim+k)*chld_cnt*2) + 64), _MM_HINT_T0);
}
#endif
matmult_8x8x2(M_, IN0, IN1, OUT0, OUT1);
}
}
}
}
// Compute flops.
{
Profile::Add_FLOP(8*8*8*(interac_vec.Dim()/2)*M_dim*ker_dim0*ker_dim1*dof);
}
// Free memory
mem::aligned_delete<Real_t*>(IN_ );
mem::aligned_delete<Real_t*>(OUT_);
mem::aligned_delete<Real_t>(zero_vec0);
mem::aligned_delete<Real_t>(zero_vec1);
}
  2389. template <class FMMNode>
  2390. void FMM_Pts<FMMNode>::V_ListSetup(SetupData<Real_t>& setup_data, std::vector<Matrix<Real_t> >& buff, std::vector<Vector<FMMNode_t*> >& n_list, int level, bool device){
  2391. if(this->MultipoleOrder()==0) return;
  2392. if(level==0) return;
  2393. { // Set setup_data
  2394. setup_data.level=level;
  2395. setup_data.kernel=kernel->k_m2l;
  2396. setup_data.interac_type.resize(1);
  2397. setup_data.interac_type[0]=V1_Type;
  2398. setup_data. input_data=&buff[0];
  2399. setup_data.output_data=&buff[1];
  2400. Vector<FMMNode_t*>& nodes_in =n_list[2];
  2401. Vector<FMMNode_t*>& nodes_out=n_list[3];
  2402. setup_data.nodes_in .clear();
  2403. setup_data.nodes_out.clear();
  2404. for(size_t i=0;i<nodes_in .Dim();i++) if(nodes_in [i]->Depth()==level-1 || level==-1) setup_data.nodes_in .push_back(nodes_in [i]);
  2405. for(size_t i=0;i<nodes_out.Dim();i++) if(nodes_out[i]->Depth()==level-1 || level==-1) setup_data.nodes_out.push_back(nodes_out[i]);
  2406. }
  2407. std::vector<void*>& nodes_in =setup_data.nodes_in ;
  2408. std::vector<void*>& nodes_out=setup_data.nodes_out;
  2409. std::vector<Vector<Real_t>*>& input_vector=setup_data. input_vector; input_vector.clear();
  2410. std::vector<Vector<Real_t>*>& output_vector=setup_data.output_vector; output_vector.clear();
  2411. for(size_t i=0;i<nodes_in .size();i++) input_vector.push_back(&((FMMData*)((FMMNode*)((FMMNode*)nodes_in [i])->Child(0))->FMMData())->upward_equiv);
  2412. for(size_t i=0;i<nodes_out.size();i++) output_vector.push_back(&((FMMData*)((FMMNode*)((FMMNode*)nodes_out[i])->Child(0))->FMMData())->dnward_equiv);
  2413. /////////////////////////////////////////////////////////////////////////////
  2414. Real_t eps=1e-10;
  2415. size_t n_in =nodes_in .size();
  2416. size_t n_out=nodes_out.size();
  2417. // Setup precomputed data.
  2418. if(setup_data.precomp_data->Dim(0)*setup_data.precomp_data->Dim(1)==0) SetupPrecomp(setup_data,device);
  2419. // Build interac_data
  2420. Profile::Tic("Interac-Data",&this->comm,true,25);
  2421. Matrix<char>& interac_data=setup_data.interac_data;
  2422. if(n_out>0 && n_in >0){ // Build precomp_data, interac_data
  2423. size_t precomp_offset=0;
  2424. Mat_Type& interac_type=setup_data.interac_type[0];
  2425. size_t mat_cnt=this->interac_list.ListCount(interac_type);
  2426. Matrix<size_t> precomp_data_offset;
  2427. std::vector<size_t> interac_mat;
  2428. { // Load precomp_data for interac_type.
  2429. struct HeaderData{
  2430. size_t total_size;
  2431. size_t level;
  2432. size_t mat_cnt ;
  2433. size_t max_depth;
  2434. };
  2435. Matrix<char>& precomp_data=*setup_data.precomp_data;
  2436. char* indx_ptr=precomp_data[0]+precomp_offset;
  2437. HeaderData& header=*(HeaderData*)indx_ptr;indx_ptr+=sizeof(HeaderData);
  2438. precomp_data_offset.ReInit(header.mat_cnt,1+(2+2)*header.max_depth, (size_t*)indx_ptr, false);
  2439. precomp_offset+=header.total_size;
  2440. for(size_t mat_id=0;mat_id<mat_cnt;mat_id++){
  2441. Matrix<Real_t>& M0 = this->mat->Mat(level, interac_type, mat_id);
  2442. assert(M0.Dim(0)>0 && M0.Dim(1)>0); UNUSED(M0);
  2443. interac_mat.push_back(precomp_data_offset[mat_id][0]);
  2444. }
  2445. }
  2446. size_t dof;
  2447. size_t m=MultipoleOrder();
  2448. size_t ker_dim0=setup_data.kernel->ker_dim[0];
  2449. size_t ker_dim1=setup_data.kernel->ker_dim[1];
  2450. size_t fftsize;
  2451. {
  2452. size_t n1=m*2;
  2453. size_t n2=n1*n1;
  2454. size_t n3_=n2*(n1/2+1);
  2455. size_t chld_cnt=1UL<<COORD_DIM;
  2456. fftsize=2*n3_*chld_cnt;
  2457. dof=1;
  2458. }
  2459. int omp_p=omp_get_max_threads();
  2460. size_t buff_size=DEVICE_BUFFER_SIZE*1024l*1024l;
  2461. size_t n_blk0=2*fftsize*dof*(ker_dim0*n_in +ker_dim1*n_out)*sizeof(Real_t)/buff_size;
  2462. if(n_blk0==0) n_blk0=1;
  2463. std::vector<std::vector<size_t> > fft_vec(n_blk0);
  2464. std::vector<std::vector<size_t> > ifft_vec(n_blk0);
  2465. std::vector<std::vector<Real_t> > fft_scl(n_blk0);
  2466. std::vector<std::vector<Real_t> > ifft_scl(n_blk0);
  2467. std::vector<std::vector<size_t> > interac_vec(n_blk0);
  2468. std::vector<std::vector<size_t> > interac_dsp(n_blk0);
  2469. {
  2470. Matrix<Real_t>& input_data=*setup_data. input_data;
  2471. Matrix<Real_t>& output_data=*setup_data.output_data;
  2472. std::vector<std::vector<FMMNode*> > nodes_blk_in (n_blk0);
  2473. std::vector<std::vector<FMMNode*> > nodes_blk_out(n_blk0);
  2474. Vector<Real_t> src_scal;
  2475. Vector<Real_t> trg_scal;
  2476. { // Set src_scal and trg_scal.
  2477. Vector<Real_t>& src_scal_m2l=this->kernel->k_m2l->src_scal;
  2478. Vector<Real_t>& trg_scal_m2l=this->kernel->k_m2l->trg_scal;
  2479. Vector<Real_t>& src_scal_l2l=this->kernel->k_l2l->src_scal;
  2480. Vector<Real_t>& trg_scal_l2l=this->kernel->k_l2l->trg_scal;
  2481. src_scal=src_scal_m2l;
  2482. trg_scal=src_scal_l2l;
  2483. size_t scal_dim0=src_scal.Dim();
  2484. size_t scal_dim1=trg_scal.Dim();
  2485. Real_t scal_diff=0;
  2486. assert(trg_scal_m2l.Dim()==trg_scal_l2l.Dim());
  2487. if(trg_scal_m2l.Dim()){
  2488. scal_diff=(trg_scal_m2l[0]-trg_scal_l2l[0]);
  2489. for(size_t i=1;i<trg_scal_m2l.Dim();i++){
  2490. assert(fabs(scal_diff-(trg_scal_m2l[1]-trg_scal_l2l[1]))<eps);
  2491. }
  2492. }
  2493. for(size_t i=0;i<trg_scal.Dim();i++){
  2494. trg_scal[i]=scal_diff-trg_scal[i];
  2495. }
  2496. }
  2497. for(size_t i=0;i<n_in;i++) ((FMMNode*)nodes_in[i])->node_id=i;
  2498. for(size_t blk0=0;blk0<n_blk0;blk0++){
  2499. size_t blk0_start=(n_out* blk0 )/n_blk0;
  2500. size_t blk0_end =(n_out*(blk0+1))/n_blk0;
  2501. std::vector<FMMNode*>& nodes_in_ =nodes_blk_in [blk0];
  2502. std::vector<FMMNode*>& nodes_out_=nodes_blk_out[blk0];
  2503. { // Build node list for blk0.
  2504. std::set<void*> nodes_in;
  2505. for(size_t i=blk0_start;i<blk0_end;i++){
  2506. nodes_out_.push_back((FMMNode*)nodes_out[i]);
  2507. std::vector<FMMNode*>& lst=((FMMNode*)nodes_out[i])->interac_list[interac_type];
  2508. for(size_t k=0;k<mat_cnt;k++) if(lst[k]!=NULL) nodes_in.insert(lst[k]);
  2509. }
  2510. for(std::set<void*>::iterator node=nodes_in.begin(); node != nodes_in.end(); node++){
  2511. nodes_in_.push_back((FMMNode*)*node);
  2512. }
  2513. size_t input_dim=nodes_in_ .size()*ker_dim0*dof*fftsize;
  2514. size_t output_dim=nodes_out_.size()*ker_dim1*dof*fftsize;
  2515. size_t buffer_dim=(ker_dim0+ker_dim1)*dof*fftsize*omp_p;
  2516. if(buff_size<(input_dim + output_dim + buffer_dim)*sizeof(Real_t))
  2517. buff_size=(input_dim + output_dim + buffer_dim)*sizeof(Real_t);
  2518. }
  2519. { // Set fft vectors.
  2520. for(size_t i=0;i<nodes_in_ .size();i++) fft_vec[blk0].push_back((size_t)(& input_vector[nodes_in_[i]->node_id][0][0]- input_data[0]));
  2521. for(size_t i=0;i<nodes_out_.size();i++)ifft_vec[blk0].push_back((size_t)(&output_vector[blk0_start + i ][0][0]-output_data[0]));
  2522. size_t scal_dim0=src_scal.Dim();
  2523. size_t scal_dim1=trg_scal.Dim();
  2524. fft_scl [blk0].resize(nodes_in_ .size()*scal_dim0);
  2525. ifft_scl[blk0].resize(nodes_out_.size()*scal_dim1);
  2526. for(size_t i=0;i<nodes_in_ .size();i++){
  2527. size_t depth=nodes_in_[i]->Depth()+1;
  2528. for(size_t j=0;j<scal_dim0;j++){
  2529. fft_scl[blk0][i*scal_dim0+j]=pow(2.0, src_scal[j]*depth);
  2530. }
  2531. }
  2532. for(size_t i=0;i<nodes_out_.size();i++){
  2533. size_t depth=nodes_out_[i]->Depth()+1;
  2534. for(size_t j=0;j<scal_dim1;j++){
  2535. ifft_scl[blk0][i*scal_dim1+j]=pow(2.0, trg_scal[j]*depth);
  2536. }
  2537. }
  2538. }
  2539. }
  2540. for(size_t blk0=0;blk0<n_blk0;blk0++){ // Hadamard interactions.
  2541. std::vector<FMMNode*>& nodes_in_ =nodes_blk_in [blk0];
  2542. std::vector<FMMNode*>& nodes_out_=nodes_blk_out[blk0];
  2543. for(size_t i=0;i<nodes_in_.size();i++) nodes_in_[i]->node_id=i;
  2544. { // Next blocking level.
  2545. size_t n_blk1=nodes_out_.size()*(2)*sizeof(Real_t)/(64*V_BLK_CACHE);
  2546. if(n_blk1==0) n_blk1=1;
  2547. size_t interac_dsp_=0;
  2548. for(size_t blk1=0;blk1<n_blk1;blk1++){
  2549. size_t blk1_start=(nodes_out_.size()* blk1 )/n_blk1;
  2550. size_t blk1_end =(nodes_out_.size()*(blk1+1))/n_blk1;
  2551. for(size_t k=0;k<mat_cnt;k++){
  2552. for(size_t i=blk1_start;i<blk1_end;i++){
  2553. std::vector<FMMNode*>& lst=((FMMNode*)nodes_out_[i])->interac_list[interac_type];
  2554. if(lst[k]!=NULL){
  2555. interac_vec[blk0].push_back(lst[k]->node_id*fftsize*ker_dim0*dof);
  2556. interac_vec[blk0].push_back( i *fftsize*ker_dim1*dof);
  2557. interac_dsp_++;
  2558. }
  2559. }
  2560. interac_dsp[blk0].push_back(interac_dsp_);
  2561. }
  2562. }
  2563. }
  2564. }
  2565. }
  2566. { // Set interac_data.
  2567. size_t data_size=sizeof(size_t)*6; // buff_size, m, dof, ker_dim0, ker_dim1, n_blk0
  2568. for(size_t blk0=0;blk0<n_blk0;blk0++){
  2569. data_size+=sizeof(size_t)+ fft_vec[blk0].size()*sizeof(size_t);
  2570. data_size+=sizeof(size_t)+ ifft_vec[blk0].size()*sizeof(size_t);
  2571. data_size+=sizeof(size_t)+ fft_scl[blk0].size()*sizeof(Real_t);
  2572. data_size+=sizeof(size_t)+ ifft_scl[blk0].size()*sizeof(Real_t);
  2573. data_size+=sizeof(size_t)+interac_vec[blk0].size()*sizeof(size_t);
  2574. data_size+=sizeof(size_t)+interac_dsp[blk0].size()*sizeof(size_t);
  2575. }
  2576. data_size+=sizeof(size_t)+interac_mat.size()*sizeof(size_t);
  2577. if(data_size>interac_data.Dim(0)*interac_data.Dim(1))
  2578. interac_data.ReInit(1,data_size);
  2579. char* data_ptr=&interac_data[0][0];
  2580. ((size_t*)data_ptr)[0]=buff_size; data_ptr+=sizeof(size_t);
  2581. ((size_t*)data_ptr)[0]= m; data_ptr+=sizeof(size_t);
  2582. ((size_t*)data_ptr)[0]= dof; data_ptr+=sizeof(size_t);
  2583. ((size_t*)data_ptr)[0]= ker_dim0; data_ptr+=sizeof(size_t);
  2584. ((size_t*)data_ptr)[0]= ker_dim1; data_ptr+=sizeof(size_t);
  2585. ((size_t*)data_ptr)[0]= n_blk0; data_ptr+=sizeof(size_t);
  2586. ((size_t*)data_ptr)[0]= interac_mat.size(); data_ptr+=sizeof(size_t);
  2587. mem::memcopy(data_ptr, &interac_mat[0], interac_mat.size()*sizeof(size_t));
  2588. data_ptr+=interac_mat.size()*sizeof(size_t);
  2589. for(size_t blk0=0;blk0<n_blk0;blk0++){
  2590. ((size_t*)data_ptr)[0]= fft_vec[blk0].size(); data_ptr+=sizeof(size_t);
  2591. mem::memcopy(data_ptr, & fft_vec[blk0][0], fft_vec[blk0].size()*sizeof(size_t));
  2592. data_ptr+= fft_vec[blk0].size()*sizeof(size_t);
  2593. ((size_t*)data_ptr)[0]=ifft_vec[blk0].size(); data_ptr+=sizeof(size_t);
  2594. mem::memcopy(data_ptr, &ifft_vec[blk0][0], ifft_vec[blk0].size()*sizeof(size_t));
  2595. data_ptr+=ifft_vec[blk0].size()*sizeof(size_t);
  2596. ((size_t*)data_ptr)[0]= fft_scl[blk0].size(); data_ptr+=sizeof(size_t);
  2597. mem::memcopy(data_ptr, & fft_scl[blk0][0], fft_scl[blk0].size()*sizeof(Real_t));
  2598. data_ptr+= fft_scl[blk0].size()*sizeof(Real_t);
  2599. ((size_t*)data_ptr)[0]=ifft_scl[blk0].size(); data_ptr+=sizeof(size_t);
  2600. mem::memcopy(data_ptr, &ifft_scl[blk0][0], ifft_scl[blk0].size()*sizeof(Real_t));
  2601. data_ptr+=ifft_scl[blk0].size()*sizeof(Real_t);
  2602. ((size_t*)data_ptr)[0]=interac_vec[blk0].size(); data_ptr+=sizeof(size_t);
  2603. mem::memcopy(data_ptr, &interac_vec[blk0][0], interac_vec[blk0].size()*sizeof(size_t));
  2604. data_ptr+=interac_vec[blk0].size()*sizeof(size_t);
  2605. ((size_t*)data_ptr)[0]=interac_dsp[blk0].size(); data_ptr+=sizeof(size_t);
  2606. mem::memcopy(data_ptr, &interac_dsp[blk0][0], interac_dsp[blk0].size()*sizeof(size_t));
  2607. data_ptr+=interac_dsp[blk0].size()*sizeof(size_t);
  2608. }
  2609. }
  2610. }
  2611. Profile::Toc();
  2612. if(device){ // Host2Device
  2613. Profile::Tic("Host2Device",&this->comm,false,25);
  2614. setup_data.interac_data. AllocDevice(true);
  2615. Profile::Toc();
  2616. }
  2617. }
template <class FMMNode>
// V-list (far-field, FFT-accelerated) interactions: for each precomputed block,
// FFT the source data, apply the Hadamard products against precomputed
// frequency-domain matrices, then IFFT into the target data.  All metadata is
// deserialized from the flat byte stream packed by the matching setup routine,
// so the read order below must mirror the write order there exactly.
void FMM_Pts<FMMNode>::V_List (SetupData<Real_t>& setup_data, bool device){
assert(!device); //Can not run on accelerator yet.
int np;
MPI_Comm_size(comm,&np);
if(setup_data.interac_data.Dim(0)==0 || setup_data.interac_data.Dim(1)==0){
// Nothing to do; still emit the profiling pair so timer nesting stays balanced.
if(np>1) Profile::Tic("Host2Device",&this->comm,false,25);
if(np>1) Profile::Toc();
return;
}
Profile::Tic("Host2Device",&this->comm,false,25);
int level=setup_data.level;
// First size_t in the packed stream is the required scratch-buffer size.
size_t buff_size=*((size_t*)&setup_data.interac_data[0][0]);
typename Matrix<Real_t>::Device M_d;
typename Vector<char>::Device buff;
typename Matrix<char>::Device precomp_data;
typename Matrix<char>::Device interac_data;
typename Matrix<Real_t>::Device input_data;
typename Matrix<Real_t>::Device output_data;
Matrix<Real_t>& M = this->mat->Mat(level, DC2DE_Type, 0);
if(device){ // Bind device-side views of all buffers.
if(this->dev_buffer.Dim()<buff_size) this->dev_buffer.ReInit(buff_size);
M_d = M. AllocDevice(false);
buff = this-> dev_buffer. AllocDevice(false);
precomp_data= setup_data.precomp_data->AllocDevice(false);
interac_data= setup_data.interac_data. AllocDevice(false);
input_data = setup_data. input_data->AllocDevice(false);
output_data = setup_data. output_data->AllocDevice(false);
}else{ // Host execution: alias the host buffers directly.
if(this->cpu_buffer.Dim()<buff_size) this->cpu_buffer.ReInit(buff_size);
M_d = M;
buff = this-> cpu_buffer;
precomp_data=*setup_data.precomp_data;
interac_data= setup_data.interac_data;
input_data =*setup_data. input_data;
output_data =*setup_data. output_data;
}
Profile::Toc();
{ // Offloaded computation.
// Set interac_data.
size_t m, dof, ker_dim0, ker_dim1, n_blk0;
std::vector<Vector<size_t> > fft_vec;
std::vector<Vector<size_t> > ifft_vec;
std::vector<Vector<Real_t> > fft_scl;
std::vector<Vector<Real_t> > ifft_scl;
std::vector<Vector<size_t> > interac_vec;
std::vector<Vector<size_t> > interac_dsp;
Vector<Real_t*> precomp_mat;
{ // Set interac_data.
// Scalar header fields, read in the exact order they were packed.
char* data_ptr=&interac_data[0][0];
buff_size=((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t);
m =((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t);
dof =((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t);
ker_dim0 =((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t);
ker_dim1 =((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t);
n_blk0 =((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t);
fft_vec .resize(n_blk0);
ifft_vec.resize(n_blk0);
fft_scl .resize(n_blk0);
ifft_scl.resize(n_blk0);
interac_vec.resize(n_blk0);
interac_dsp.resize(n_blk0);
// interac_mat holds byte offsets of precomputed matrices inside precomp_data.
// ReInit(...,false) creates non-owning views straight into the packed stream.
Vector<size_t> interac_mat;
interac_mat.ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false);
data_ptr+=sizeof(size_t)+interac_mat.Dim()*sizeof(size_t);
precomp_mat.Resize(interac_mat.Dim());
for(size_t i=0;i<interac_mat.Dim();i++){
precomp_mat[i]=(Real_t*)(precomp_data[0]+interac_mat[i]);
}
for(size_t blk0=0;blk0<n_blk0;blk0++){ // Per-block (length, payload) pairs.
fft_vec[blk0].ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false);
data_ptr+=sizeof(size_t)+fft_vec[blk0].Dim()*sizeof(size_t);
ifft_vec[blk0].ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false);
data_ptr+=sizeof(size_t)+ifft_vec[blk0].Dim()*sizeof(size_t);
fft_scl[blk0].ReInit(((size_t*)data_ptr)[0],(Real_t*)(data_ptr+sizeof(size_t)),false);
data_ptr+=sizeof(size_t)+fft_scl[blk0].Dim()*sizeof(Real_t);
ifft_scl[blk0].ReInit(((size_t*)data_ptr)[0],(Real_t*)(data_ptr+sizeof(size_t)),false);
data_ptr+=sizeof(size_t)+ifft_scl[blk0].Dim()*sizeof(Real_t);
interac_vec[blk0].ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false);
data_ptr+=sizeof(size_t)+interac_vec[blk0].Dim()*sizeof(size_t);
interac_dsp[blk0].ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false);
data_ptr+=sizeof(size_t)+interac_dsp[blk0].Dim()*sizeof(size_t);
}
}
int omp_p=omp_get_max_threads();
size_t M_dim, fftsize;
{ // FFT grid sizes derived from the multipole order m (n1 = 2m per dimension;
// n3_ is the r2c-reduced grid size; factor 2 in fftsize is for complex pairs).
size_t n1=m*2;
size_t n2=n1*n1;
size_t n3_=n2*(n1/2+1);
size_t chld_cnt=1UL<<COORD_DIM;
fftsize=2*n3_*chld_cnt;
M_dim=n3_;
}
for(size_t blk0=0;blk0<n_blk0;blk0++){ // interactions
size_t n_in = fft_vec[blk0].Dim();
size_t n_out=ifft_vec[blk0].Dim();
size_t input_dim=n_in *ker_dim0*dof*fftsize;
size_t output_dim=n_out*ker_dim1*dof*fftsize;
size_t buffer_dim=(ker_dim0+ker_dim1)*dof*fftsize*omp_p;
// Carve fft_in / fft_out / per-thread work space out of the one scratch buffer.
Vector<Real_t> fft_in ( input_dim, (Real_t*)&buff[ 0 ],false);
Vector<Real_t> fft_out(output_dim, (Real_t*)&buff[ input_dim *sizeof(Real_t)],false);
Vector<Real_t> buffer(buffer_dim, (Real_t*)&buff[(input_dim+output_dim)*sizeof(Real_t)],false);
{ // FFT
// Per-rank sub-timers only when running on a single process, to avoid skew.
if(np==1) Profile::Tic("FFT",&comm,false,100);
Vector<Real_t> input_data_( input_data.dim[0]* input_data.dim[1], input_data[0], false);
FFT_UpEquiv(dof, m, ker_dim0, fft_vec[blk0], fft_scl[blk0], input_data_, fft_in, buffer);
if(np==1) Profile::Toc();
}
{ // Hadamard
#ifdef PVFMM_HAVE_PAPI
#ifdef __VERBOSE__
std::cout << "Starting counters new\n";
if (PAPI_start(EventSet) != PAPI_OK) std::cout << "handle_error3" << std::endl;
#endif
#endif
if(np==1) Profile::Tic("HadamardProduct",&comm,false,100);
VListHadamard<Real_t>(dof, M_dim, ker_dim0, ker_dim1, interac_dsp[blk0], interac_vec[blk0], precomp_mat, fft_in, fft_out);
if(np==1) Profile::Toc();
#ifdef PVFMM_HAVE_PAPI
#ifdef __VERBOSE__
if (PAPI_stop(EventSet, values) != PAPI_OK) std::cout << "handle_error4" << std::endl;
std::cout << "Stopping counters\n";
#endif
#endif
}
{ // IFFT
if(np==1) Profile::Tic("IFFT",&comm,false,100);
Matrix<Real_t> M(M_d.dim[0],M_d.dim[1],M_d[0],false);
Vector<Real_t> output_data_(output_data.dim[0]*output_data.dim[1], output_data[0], false);
FFT_Check2Equiv(dof, m, ker_dim1, ifft_vec[blk0], ifft_scl[blk0], fft_out, output_data_, buffer, M);
if(np==1) Profile::Toc();
}
}
}
}
  2754. template <class FMMNode>
  2755. void FMM_Pts<FMMNode>::Down2DownSetup(SetupData<Real_t>& setup_data, std::vector<Matrix<Real_t> >& buff, std::vector<Vector<FMMNode_t*> >& n_list, int level, bool device){
  2756. if(this->MultipoleOrder()==0) return;
  2757. { // Set setup_data
  2758. setup_data.level=level;
  2759. setup_data.kernel=kernel->k_l2l;
  2760. setup_data.interac_type.resize(1);
  2761. setup_data.interac_type[0]=D2D_Type;
  2762. setup_data. input_data=&buff[1];
  2763. setup_data.output_data=&buff[1];
  2764. Vector<FMMNode_t*>& nodes_in =n_list[1];
  2765. Vector<FMMNode_t*>& nodes_out=n_list[1];
  2766. setup_data.nodes_in .clear();
  2767. setup_data.nodes_out.clear();
  2768. for(size_t i=0;i<nodes_in .Dim();i++) if(nodes_in [i]->Depth()==level-1) setup_data.nodes_in .push_back(nodes_in [i]);
  2769. for(size_t i=0;i<nodes_out.Dim();i++) if(nodes_out[i]->Depth()==level ) setup_data.nodes_out.push_back(nodes_out[i]);
  2770. }
  2771. std::vector<void*>& nodes_in =setup_data.nodes_in ;
  2772. std::vector<void*>& nodes_out=setup_data.nodes_out;
  2773. std::vector<Vector<Real_t>*>& input_vector=setup_data. input_vector; input_vector.clear();
  2774. std::vector<Vector<Real_t>*>& output_vector=setup_data.output_vector; output_vector.clear();
  2775. for(size_t i=0;i<nodes_in .size();i++) input_vector.push_back(&((FMMData*)((FMMNode*)nodes_in [i])->FMMData())->dnward_equiv);
  2776. for(size_t i=0;i<nodes_out.size();i++) output_vector.push_back(&((FMMData*)((FMMNode*)nodes_out[i])->FMMData())->dnward_equiv);
  2777. SetupInterac(setup_data,device);
  2778. }
  2779. template <class FMMNode>
  2780. void FMM_Pts<FMMNode>::Down2Down (SetupData<Real_t>& setup_data, bool device){
  2781. //Add Down2Down contribution.
  2782. EvalList(setup_data, device);
  2783. }
template <class FMMNode>
// Build the flat byte stream describing point-to-point interactions for the
// S2U / X / W / U style lists: per-target interaction counts, element offsets
// into the coord/input/output matrices, periodic-shift vectors, per-depth
// scaling factors, and an optional check-to-equivalent matrix M.  The packed
// layout written here is deserialized field-for-field by EvalListPts(), so the
// write order below must stay in sync with the read order there.
// shift_src/shift_trg control whether source/target coordinates are expressed
// relative to the source/target box (used for W and X lists respectively).
void FMM_Pts<FMMNode>::SetupInteracPts(SetupData<Real_t>& setup_data, bool shift_src, bool shift_trg, Matrix<Real_t>* M, bool device){
int level=setup_data.level;
std::vector<Mat_Type>& interac_type_lst=setup_data.interac_type;
std::vector<void*>& nodes_in =setup_data.nodes_in ;
std::vector<void*>& nodes_out=setup_data.nodes_out;
Matrix<Real_t>& output_data=*setup_data.output_data;
Matrix<Real_t>& input_data=*setup_data. input_data;
Matrix<Real_t>& coord_data=*setup_data. coord_data;
std::vector<Vector<Real_t>*>& input_vector=setup_data. input_vector;
std::vector<Vector<Real_t>*>& output_vector=setup_data.output_vector;
size_t n_in =nodes_in .size();
size_t n_out=nodes_out.size();
//setup_data.precomp_data=NULL;
// Build interac_data
Profile::Tic("Interac-Data",&this->comm,true,25);
Matrix<char>& interac_data=setup_data.interac_data;
if(n_out>0 && n_in >0){
size_t ker_dim0=setup_data.kernel->ker_dim[0];
size_t ker_dim1=setup_data.kernel->ker_dim[1];
size_t dof=1;
// Give every source node its index so targets can refer back to it below.
#pragma omp parallel for
for(size_t i=0;i<n_in ;i++) ((FMMNode*)nodes_in [i])->node_id=i;
std::vector<size_t> trg_interac_cnt(n_out,0);
std::vector<size_t> trg_coord(n_out);
std::vector<size_t> trg_value(n_out);
std::vector<size_t> trg_cnt(n_out);
size_t scal_dim0=0;
size_t scal_dim1=0;
Vector<Real_t> scal_exp0;
Vector<Real_t> scal_exp1;
{ // Set src_scal_exp, trg_scal_exp
// Scaling exponents apply only to S2U and X lists (the check-to-equivalent
// paths); other list types leave both vectors empty (scal_dim = 0).
Mat_Type& interac_type=interac_type_lst[0];
if(interac_type==S2U_Type) scal_exp0=this->kernel->k_m2m->trg_scal;
if(interac_type==S2U_Type) scal_exp1=this->kernel->k_m2m->src_scal;
if(interac_type== X_Type) scal_exp0=this->kernel->k_l2l->trg_scal;
if(interac_type== X_Type) scal_exp1=this->kernel->k_l2l->src_scal;
scal_dim0=scal_exp0.Dim();
scal_dim1=scal_exp1.Dim();
}
std::vector<Real_t> scal0(n_out*scal_dim0,0);
std::vector<Real_t> scal1(n_out*scal_dim1,0);
{ // Set trg data
Mat_Type& interac_type=interac_type_lst[0];
#pragma omp parallel for
for(size_t i=0;i<n_out;i++){
// Only non-ghost nodes at the requested level (level==-1 means all levels).
if(!((FMMNode*)nodes_out[i])->IsGhost() && (level==-1 || ((FMMNode*)nodes_out[i])->Depth()==level)){
// Offsets are stored as Real_t-element distances from the matrix bases,
// so they stay valid on the device side after a host-to-device copy.
trg_cnt [i]=output_vector[i*2+0]->Dim()/COORD_DIM;
trg_coord[i]=(size_t)(&output_vector[i*2+0][0][0]- coord_data[0]);
trg_value[i]=(size_t)(&output_vector[i*2+1][0][0]-output_data[0]);
size_t depth=((FMMNode*)nodes_out[i])->Depth();
Real_t* scal0_=&scal0[i*scal_dim0];
Real_t* scal1_=&scal1[i*scal_dim1];
// Homogeneous kernels scale by 0.5^(exp*depth); otherwise scale is 1.
for(size_t j=0;j<scal_dim0;j++){
if(!this->Homogen()) scal0_[j]=1.0;
else if(interac_type==S2U_Type) scal0_[j]=pow(0.5, scal_exp0[j]*depth);
else if(interac_type== X_Type) scal0_[j]=pow(0.5, scal_exp0[j]*depth);
}
for(size_t j=0;j<scal_dim1;j++){
if(!this->Homogen()) scal1_[j]=1.0;
else if(interac_type==S2U_Type) scal1_[j]=pow(0.5, scal_exp1[j]*depth);
else if(interac_type== X_Type) scal1_[j]=pow(0.5, scal_exp1[j]*depth);
}
}
}
}
std::vector<std::vector<size_t> > src_cnt(n_out);
std::vector<std::vector<size_t> > src_coord(n_out);
std::vector<std::vector<size_t> > src_value(n_out);
std::vector<std::vector<Real_t> > shift_coord(n_out);
for(size_t type_indx=0; type_indx<interac_type_lst.size(); type_indx++){
Mat_Type& interac_type=interac_type_lst[type_indx];
size_t mat_cnt=this->interac_list.ListCount(interac_type);
#pragma omp parallel for
for(size_t i=0;i<n_out;i++){ // For each target node.
if(!((FMMNode*)nodes_out[i])->IsGhost() && (level==-1 || ((FMMNode*)nodes_out[i])->Depth()==level)){
std::vector<FMMNode*>& lst=((FMMNode*)nodes_out[i])->interac_list[interac_type];
for(size_t mat_indx=0;mat_indx<mat_cnt;mat_indx++) if(lst[mat_indx]!=NULL){ // For each direction.
size_t j=lst[mat_indx]->node_id;
// NOTE(review): entries [j*4+2]/[j*4+3] may be NULL for some list setups
// (see W_ListSetup) — this ->Dim() call assumes non-NULL; verify.
if(input_vector[j*4+0]->Dim()>0 || input_vector[j*4+2]->Dim()>0){
trg_interac_cnt[i]++;
{ // Determine shift for periodic boundary condition
// If the relative direction says the interaction wraps around the unit
// box, shift the source by +/-1 per coordinate; optionally re-center on
// the source or target box (shift_src/shift_trg).
Real_t* sc=lst[mat_indx]->Coord();
Real_t* tc=((FMMNode*)nodes_out[i])->Coord();
int* rel_coord=this->interac_list.RelativeCoord(interac_type, mat_indx);
shift_coord[i].push_back((tc[0]>sc[0] && rel_coord[0]>0? 1.0:
(tc[0]<sc[0] && rel_coord[0]<0?-1.0:0.0)) +
(shift_src?sc[0]:0) - (shift_trg?tc[0]:0) );
shift_coord[i].push_back((tc[1]>sc[1] && rel_coord[1]>0? 1.0:
(tc[1]<sc[1] && rel_coord[1]<0?-1.0:0.0)) +
(shift_src?sc[1]:0) - (shift_trg?tc[1]:0) );
shift_coord[i].push_back((tc[2]>sc[2] && rel_coord[2]>0? 1.0:
(tc[2]<sc[2] && rel_coord[2]<0?-1.0:0.0)) +
(shift_src?sc[2]:0) - (shift_trg?tc[2]:0) );
}
{ // Set src data
// Two entries per interaction: single-layer (slots 0/1) then
// double-layer (slots 2/3); NULL slots contribute zero counts.
if(input_vector[j*4+0]!=NULL){
src_cnt [i].push_back(input_vector[j*4+0]->Dim()/COORD_DIM);
src_coord[i].push_back((size_t)(& input_vector[j*4+0][0][0]- coord_data[0]));
src_value[i].push_back((size_t)(& input_vector[j*4+1][0][0]- input_data[0]));
}else{
src_cnt [i].push_back(0);
src_coord[i].push_back(0);
src_value[i].push_back(0);
}
if(input_vector[j*4+2]!=NULL){
src_cnt [i].push_back(input_vector[j*4+2]->Dim()/COORD_DIM);
src_coord[i].push_back((size_t)(& input_vector[j*4+2][0][0]- coord_data[0]));
src_value[i].push_back((size_t)(& input_vector[j*4+3][0][0]- input_data[0]));
}else{
src_cnt [i].push_back(0);
src_coord[i].push_back(0);
src_value[i].push_back(0);
}
}
}
}
}
}
}
{ // Set interac_data.
// First pass: compute the exact packed size (each array is preceded by its
// length as one size_t); grow the buffer only if needed.
size_t data_size=sizeof(size_t)*6;
data_size+=sizeof(size_t)+trg_interac_cnt.size()*sizeof(size_t);
data_size+=sizeof(size_t)+trg_coord.size()*sizeof(size_t);
data_size+=sizeof(size_t)+trg_value.size()*sizeof(size_t);
data_size+=sizeof(size_t)+trg_cnt .size()*sizeof(size_t);
data_size+=sizeof(size_t)+scal0 .size()*sizeof(Real_t);
data_size+=sizeof(size_t)+scal1 .size()*sizeof(Real_t);
data_size+=sizeof(size_t)*2+(M!=NULL?M->Dim(0)*M->Dim(1)*sizeof(Real_t):0);
for(size_t i=0;i<n_out;i++){
data_size+=sizeof(size_t)+src_cnt [i].size()*sizeof(size_t);
data_size+=sizeof(size_t)+src_coord[i].size()*sizeof(size_t);
data_size+=sizeof(size_t)+src_value[i].size()*sizeof(size_t);
data_size+=sizeof(size_t)+shift_coord[i].size()*sizeof(Real_t);
}
if(data_size>interac_data.Dim(0)*interac_data.Dim(1))
interac_data.ReInit(1,data_size);
// Second pass: serialize in the exact order EvalListPts() reads back.
char* data_ptr=&interac_data[0][0];
((size_t*)data_ptr)[0]=data_size; data_ptr+=sizeof(size_t);
((size_t*)data_ptr)[0]= ker_dim0; data_ptr+=sizeof(size_t);
((size_t*)data_ptr)[0]= ker_dim1; data_ptr+=sizeof(size_t);
((size_t*)data_ptr)[0]= dof; data_ptr+=sizeof(size_t);
((size_t*)data_ptr)[0]=scal_dim0; data_ptr+=sizeof(size_t);
((size_t*)data_ptr)[0]=scal_dim1; data_ptr+=sizeof(size_t);
((size_t*)data_ptr)[0]=trg_interac_cnt.size(); data_ptr+=sizeof(size_t);
mem::memcopy(data_ptr, &trg_interac_cnt[0], trg_interac_cnt.size()*sizeof(size_t));
data_ptr+=trg_interac_cnt.size()*sizeof(size_t);
((size_t*)data_ptr)[0]=trg_coord.size(); data_ptr+=sizeof(size_t);
mem::memcopy(data_ptr, &trg_coord[0], trg_coord.size()*sizeof(size_t));
data_ptr+=trg_coord.size()*sizeof(size_t);
((size_t*)data_ptr)[0]=trg_value.size(); data_ptr+=sizeof(size_t);
mem::memcopy(data_ptr, &trg_value[0], trg_value.size()*sizeof(size_t));
data_ptr+=trg_value.size()*sizeof(size_t);
((size_t*)data_ptr)[0]=trg_cnt.size(); data_ptr+=sizeof(size_t);
mem::memcopy(data_ptr, &trg_cnt[0], trg_cnt.size()*sizeof(size_t));
data_ptr+=trg_cnt.size()*sizeof(size_t);
((size_t*)data_ptr)[0]=scal0.size(); data_ptr+=sizeof(size_t);
mem::memcopy(data_ptr, &scal0[0], scal0.size()*sizeof(Real_t));
data_ptr+=scal0.size()*sizeof(Real_t);
((size_t*)data_ptr)[0]=scal1.size(); data_ptr+=sizeof(size_t);
mem::memcopy(data_ptr, &scal1[0], scal1.size()*sizeof(Real_t));
data_ptr+=scal1.size()*sizeof(Real_t);
if(M!=NULL){
// Optional dense matrix: (rows, cols, row-major payload).
((size_t*)data_ptr)[0]=M->Dim(0); data_ptr+=sizeof(size_t);
((size_t*)data_ptr)[0]=M->Dim(1); data_ptr+=sizeof(size_t);
mem::memcopy(data_ptr, M[0][0], M->Dim(0)*M->Dim(1)*sizeof(Real_t));
data_ptr+=M->Dim(0)*M->Dim(1)*sizeof(Real_t);
}else{
// No matrix: write 0x0 dimensions so the reader skips the payload.
((size_t*)data_ptr)[0]=0; data_ptr+=sizeof(size_t);
((size_t*)data_ptr)[0]=0; data_ptr+=sizeof(size_t);
}
for(size_t i=0;i<n_out;i++){
((size_t*)data_ptr)[0]=src_cnt[i].size(); data_ptr+=sizeof(size_t);
mem::memcopy(data_ptr, &src_cnt[i][0], src_cnt[i].size()*sizeof(size_t));
data_ptr+=src_cnt[i].size()*sizeof(size_t);
((size_t*)data_ptr)[0]=src_coord[i].size(); data_ptr+=sizeof(size_t);
mem::memcopy(data_ptr, &src_coord[i][0], src_coord[i].size()*sizeof(size_t));
data_ptr+=src_coord[i].size()*sizeof(size_t);
((size_t*)data_ptr)[0]=src_value[i].size(); data_ptr+=sizeof(size_t);
mem::memcopy(data_ptr, &src_value[i][0], src_value[i].size()*sizeof(size_t));
data_ptr+=src_value[i].size()*sizeof(size_t);
((size_t*)data_ptr)[0]=shift_coord[i].size(); data_ptr+=sizeof(size_t);
mem::memcopy(data_ptr, &shift_coord[i][0], shift_coord[i].size()*sizeof(Real_t));
data_ptr+=shift_coord[i].size()*sizeof(Real_t);
}
}
// Ensure both scratch buffers exist at the fixed device-buffer size.
size_t buff_size=DEVICE_BUFFER_SIZE*1024l*1024l;
if(this->dev_buffer.Dim()<buff_size) this->dev_buffer.ReInit(buff_size);
if(this->cpu_buffer.Dim()<buff_size) this->cpu_buffer.ReInit(buff_size);
}
Profile::Toc();
if(device){ // Host2Device
Profile::Tic("Host2Device",&this->comm,false,25);
setup_data.interac_data .AllocDevice(true);
Profile::Toc();
}
}
template <class FMMNode>
template <int SYNC>
// Evaluate the point-interaction lists packed by SetupInteracPts(): for each
// target node, call the single- and/or double-layer kernels against every
// listed source (with periodic shifts applied to source coordinates), then
// optionally apply the scaled check-to-equivalent matrix M.  SYNC controls
// whether an Intel-offload run waits for device completion before returning.
void FMM_Pts<FMMNode>::EvalListPts(SetupData<Real_t>& setup_data, bool device){
if(setup_data.kernel->ker_dim[0]*setup_data.kernel->ker_dim[1]==0) return;
if(setup_data.interac_data.Dim(0)==0 || setup_data.interac_data.Dim(1)==0){
// Nothing to do; still emit the profiling pairs so timer nesting stays balanced.
Profile::Tic("Host2Device",&this->comm,false,25);
Profile::Toc();
Profile::Tic("DeviceComp",&this->comm,false,20);
Profile::Toc();
return;
}
bool have_gpu=false;
#if defined(PVFMM_HAVE_CUDA)
have_gpu=true;
#endif
Profile::Tic("Host2Device",&this->comm,false,25);
typename Vector<char>::Device buff;
//typename Matrix<char>::Device precomp_data;
typename Matrix<char>::Device interac_data;
typename Matrix<Real_t>::Device coord_data;
typename Matrix<Real_t>::Device input_data;
typename Matrix<Real_t>::Device output_data;
if(device && !have_gpu){ // MIC-offload path: bind device views.
buff = this-> dev_buffer. AllocDevice(false);
interac_data= setup_data.interac_data. AllocDevice(false);
//if(setup_data.precomp_data!=NULL) precomp_data= setup_data.precomp_data->AllocDevice(false);
if(setup_data. coord_data!=NULL) coord_data = setup_data. coord_data->AllocDevice(false);
if(setup_data. input_data!=NULL) input_data = setup_data. input_data->AllocDevice(false);
if(setup_data. output_data!=NULL) output_data = setup_data. output_data->AllocDevice(false);
}else{ // Host (or CUDA) path: alias the host buffers.
buff = this-> cpu_buffer;
interac_data= setup_data.interac_data;
//if(setup_data.precomp_data!=NULL) precomp_data=*setup_data.precomp_data;
if(setup_data. coord_data!=NULL) coord_data =*setup_data. coord_data;
if(setup_data. input_data!=NULL) input_data =*setup_data. input_data;
if(setup_data. output_data!=NULL) output_data =*setup_data. output_data;
}
Profile::Toc();
// Kernel function pointers are passed through size_t so the offload pragma
// can transport them; cast back to Ker_t inside the offloaded region.
size_t ptr_single_layer_kernel=(size_t)setup_data.kernel->ker_poten;
size_t ptr_double_layer_kernel=(size_t)setup_data.kernel->dbl_layer_poten;
Profile::Tic("DeviceComp",&this->comm,false,20);
int lock_idx=-1;
int wait_lock_idx=-1;
if(device) wait_lock_idx=MIC_Lock::curr_lock();
if(device) lock_idx=MIC_Lock::get_lock();
#ifdef __INTEL_OFFLOAD
if(device) ptr_single_layer_kernel=setup_data.kernel->dev_ker_poten;
if(device) ptr_double_layer_kernel=setup_data.kernel->dev_dbl_layer_poten;
#pragma offload if(device) target(mic:0) signal(&MIC_Lock::lock_vec[device?lock_idx:0])
#endif
{ // Offloaded computation.
// Set interac_data.
size_t data_size;
size_t ker_dim0;
size_t ker_dim1;
size_t dof, n_out;
size_t scal_dim0;
size_t scal_dim1;
Vector<size_t> trg_interac_cnt;
Vector<size_t> trg_coord;
Vector<size_t> trg_value;
Vector<size_t> trg_cnt;
Vector<Real_t> scal0;
Vector<Real_t> scal1;
Matrix<Real_t> M;
Vector< Vector<size_t> > src_cnt;
Vector< Vector<size_t> > src_coord;
Vector< Vector<size_t> > src_value;
Vector< Vector<Real_t> > shift_coord;
{ // Set interac_data.
// Deserialize in the exact order SetupInteracPts() packed; all ReInit
// calls with 'false' create non-owning views into the packed stream.
char* data_ptr=&interac_data[0][0];
data_size=((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t);
ker_dim0=((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t);
ker_dim1=((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t);
dof =((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t);
scal_dim0=((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t);
scal_dim1=((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t);
trg_interac_cnt.ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false);
data_ptr+=sizeof(size_t)+trg_interac_cnt.Dim()*sizeof(size_t);
n_out=trg_interac_cnt.Dim();
trg_coord.ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false);
data_ptr+=sizeof(size_t)+trg_coord.Dim()*sizeof(size_t);
trg_value.ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false);
data_ptr+=sizeof(size_t)+trg_value.Dim()*sizeof(size_t);
trg_cnt.ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false);
data_ptr+=sizeof(size_t)+trg_cnt.Dim()*sizeof(size_t);
scal0.ReInit(((size_t*)data_ptr)[0],(Real_t*)(data_ptr+sizeof(size_t)),false);
data_ptr+=sizeof(size_t)+scal0.Dim()*sizeof(Real_t);
scal1.ReInit(((size_t*)data_ptr)[0],(Real_t*)(data_ptr+sizeof(size_t)),false);
data_ptr+=sizeof(size_t)+scal1.Dim()*sizeof(Real_t);
// Optional matrix: 0x0 dimensions mean "no matrix".
M.ReInit(((size_t*)data_ptr)[0],((size_t*)data_ptr)[1],(Real_t*)(data_ptr+2*sizeof(size_t)),false);
data_ptr+=sizeof(size_t)*2+M.Dim(0)*M.Dim(1)*sizeof(Real_t);
src_cnt.Resize(n_out);
src_coord.Resize(n_out);
src_value.Resize(n_out);
shift_coord.Resize(n_out);
for(size_t i=0;i<n_out;i++){
src_cnt[i].ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false);
data_ptr+=sizeof(size_t)+src_cnt[i].Dim()*sizeof(size_t);
src_coord[i].ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false);
data_ptr+=sizeof(size_t)+src_coord[i].Dim()*sizeof(size_t);
src_value[i].ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false);
data_ptr+=sizeof(size_t)+src_value[i].Dim()*sizeof(size_t);
shift_coord[i].ReInit(((size_t*)data_ptr)[0],(Real_t*)(data_ptr+sizeof(size_t)),false);
data_ptr+=sizeof(size_t)+shift_coord[i].Dim()*sizeof(Real_t);
}
}
if(device) MIC_Lock::wait_lock(wait_lock_idx);
//Compute interaction from point sources.
{ // interactions
typename Kernel<Real_t>::Ker_t single_layer_kernel=(typename Kernel<Real_t>::Ker_t)ptr_single_layer_kernel;
typename Kernel<Real_t>::Ker_t double_layer_kernel=(typename Kernel<Real_t>::Ker_t)ptr_double_layer_kernel;
int omp_p=omp_get_max_threads();
// Split the scratch buffer evenly between threads.
Vector<Real_t*> thread_buff(omp_p);
size_t thread_buff_size=buff.dim/sizeof(Real_t)/omp_p;
for(int i=0;i<omp_p;i++) thread_buff[i]=(Real_t*)&buff[i*thread_buff_size*sizeof(Real_t)];
#pragma omp parallel for schedule(dynamic)
for(size_t i=0;i<n_out;i++)
if(trg_interac_cnt[i]>0 && trg_cnt[i]>0){
int thread_id=omp_get_thread_num();
Real_t* s_coord=thread_buff[thread_id];
Real_t* t_value=output_data[0]+trg_value[i];
if(M.Dim(0)>0 && M.Dim(1)>0){
// With a post-multiply matrix, accumulate kernel output into the thread
// buffer first; shifted coordinates go after that accumulator region.
s_coord+=dof*M.Dim(0);
t_value=thread_buff[thread_id];
for(size_t j=0;j<dof*M.Dim(0);j++) t_value[j]=0;
}
size_t interac_cnt=0;
for(size_t j=0;j<trg_interac_cnt[i];j++){
if(ptr_single_layer_kernel!=(size_t)NULL){// Single layer kernel
Real_t* src_coord_=coord_data[0]+src_coord[i][2*j+0];
assert(thread_buff_size>=dof*M.Dim(0)+dof*M.Dim(1)+src_cnt[i][2*j+0]*COORD_DIM);
for(size_t k1=0;k1<src_cnt[i][2*j+0];k1++){ // Compute shifted source coordinates.
for(size_t k0=0;k0<COORD_DIM;k0++){
s_coord[k1*COORD_DIM+k0]=src_coord_[k1*COORD_DIM+k0]+shift_coord[i][j*COORD_DIM+k0];
}
}
single_layer_kernel( s_coord , src_cnt[i][2*j+0], input_data[0]+src_value[i][2*j+0], dof,
coord_data[0]+trg_coord[i], trg_cnt[i] , t_value, NULL);
interac_cnt+=src_cnt[i][2*j+0]*trg_cnt[i];
}else if(src_cnt[i][2*j+0]!=0 && trg_cnt[i]!=0){
assert(ptr_single_layer_kernel); // Single-layer kernel not implemented
}
if(ptr_double_layer_kernel!=(size_t)NULL){// Double layer kernel
Real_t* src_coord_=coord_data[0]+src_coord[i][2*j+1];
assert(thread_buff_size>=dof*M.Dim(0)+dof*M.Dim(1)+src_cnt[i][2*j+1]*COORD_DIM);
for(size_t k1=0;k1<src_cnt[i][2*j+1];k1++){ // Compute shifted source coordinates.
for(size_t k0=0;k0<COORD_DIM;k0++){
s_coord[k1*COORD_DIM+k0]=src_coord_[k1*COORD_DIM+k0]+shift_coord[i][j*COORD_DIM+k0];
}
}
double_layer_kernel( s_coord , src_cnt[i][2*j+1], input_data[0]+src_value[i][2*j+1], dof,
coord_data[0]+trg_coord[i], trg_cnt[i] , t_value, NULL);
interac_cnt+=src_cnt[i][2*j+1]*trg_cnt[i];
}else if(src_cnt[i][2*j+1]!=0 && trg_cnt[i]!=0){
assert(ptr_double_layer_kernel); // Double-layer kernel not implemented
}
}
if(M.Dim(0)>0 && M.Dim(1)>0 && interac_cnt>0){
assert(trg_cnt[i]*scal_dim0==M.Dim(0));
assert(trg_cnt[i]*scal_dim1==M.Dim(1));
{// Scaling (scal_dim0)
// Pre-scale the accumulated check values before applying M.
Real_t* s=&scal0[i*scal_dim0];
for(size_t j=0;j<dof*M.Dim(0);j+=scal_dim0){
for(size_t k=0;k<scal_dim0;k++){
t_value[j+k]*=s[k];
}
}
}
// tmp_vec lives directly after in_vec inside the thread buffer.
Matrix<Real_t> in_vec(dof, M.Dim(0), t_value, false);
Matrix<Real_t> tmp_vec(dof, M.Dim(1),dof*M.Dim(0)+t_value, false);
Matrix<Real_t>::GEMM(tmp_vec, in_vec, M, 0.0);
Matrix<Real_t> out_vec(dof, M.Dim(1), output_data[0]+trg_value[i], false);
{// Scaling (scal_dim1)
// Post-scale and accumulate into the real output.
Real_t* s=&scal1[i*scal_dim1];
for(size_t j=0;j<dof*M.Dim(1);j+=scal_dim1){
for(size_t k=0;k<scal_dim1;k++){
out_vec[0][j+k]+=tmp_vec[0][j+k]*s[k];
}
}
}
}
}
}
if(device) MIC_Lock::release_lock(lock_idx);
}
#ifdef __INTEL_OFFLOAD
if(SYNC){
#pragma offload if(device) target(mic:0)
{if(device) MIC_Lock::wait_lock(lock_idx);}
}
#endif
Profile::Toc();
}
  3175. template <class FMMNode>
  3176. void FMM_Pts<FMMNode>::X_ListSetup(SetupData<Real_t>& setup_data, std::vector<Matrix<Real_t> >& buff, std::vector<Vector<FMMNode_t*> >& n_list, int level, bool device){
  3177. if(this->MultipoleOrder()==0) return;
  3178. { // Set setup_data
  3179. setup_data.level=level;
  3180. setup_data.kernel=kernel->k_s2l;
  3181. setup_data.interac_type.resize(1);
  3182. setup_data.interac_type[0]=X_Type;
  3183. setup_data. input_data=&buff[4];
  3184. setup_data.output_data=&buff[1];
  3185. setup_data. coord_data=&buff[6];
  3186. Vector<FMMNode_t*>& nodes_in =n_list[4];
  3187. Vector<FMMNode_t*>& nodes_out=n_list[1];
  3188. setup_data.nodes_in .clear();
  3189. setup_data.nodes_out.clear();
  3190. for(size_t i=0;i<nodes_in .Dim();i++) if(nodes_in [i]->Depth()==level-1 || level==-1) setup_data.nodes_in .push_back(nodes_in [i]);
  3191. for(size_t i=0;i<nodes_out.Dim();i++) if(nodes_out[i]->Depth()==level || level==-1) setup_data.nodes_out.push_back(nodes_out[i]);
  3192. }
  3193. std::vector<void*>& nodes_in =setup_data.nodes_in ;
  3194. std::vector<void*>& nodes_out=setup_data.nodes_out;
  3195. std::vector<Vector<Real_t>*>& input_vector=setup_data. input_vector; input_vector.clear();
  3196. std::vector<Vector<Real_t>*>& output_vector=setup_data.output_vector; output_vector.clear();
  3197. for(size_t i=0;i<nodes_in .size();i++){
  3198. input_vector .push_back(&((FMMNode*)nodes_in [i])->src_coord);
  3199. input_vector .push_back(&((FMMNode*)nodes_in [i])->src_value);
  3200. input_vector .push_back(&((FMMNode*)nodes_in [i])->surf_coord);
  3201. input_vector .push_back(&((FMMNode*)nodes_in [i])->surf_value);
  3202. }
  3203. for(size_t i=0;i<nodes_out.size();i++){
  3204. output_vector.push_back(&dnwd_check_surf[((FMMNode*)nodes_out[i])->Depth()]);
  3205. output_vector.push_back(&((FMMData*)((FMMNode*)nodes_out[i])->FMMData())->dnward_equiv);
  3206. }
  3207. //Downward check to downward equivalent matrix.
  3208. Matrix<Real_t>& M_dc2de = this->mat->Mat(level, DC2DE_Type, 0);
  3209. this->SetupInteracPts(setup_data, false, true, &M_dc2de,device);
  3210. { // Resize device buffer
  3211. size_t n=setup_data.output_data->Dim(0)*setup_data.output_data->Dim(1)*sizeof(Real_t);
  3212. if(this->dev_buffer.Dim()<n) this->dev_buffer.ReInit(n);
  3213. }
  3214. }
  3215. template <class FMMNode>
  3216. void FMM_Pts<FMMNode>::X_List (SetupData<Real_t>& setup_data, bool device){
  3217. //Add X_List contribution.
  3218. this->EvalListPts(setup_data, device);
  3219. }
  3220. template <class FMMNode>
  3221. void FMM_Pts<FMMNode>::W_ListSetup(SetupData<Real_t>& setup_data, std::vector<Matrix<Real_t> >& buff, std::vector<Vector<FMMNode_t*> >& n_list, int level, bool device){
  3222. if(this->MultipoleOrder()==0) return;
  3223. { // Set setup_data
  3224. setup_data.level=level;
  3225. setup_data.kernel=kernel->k_m2t;
  3226. setup_data.interac_type.resize(1);
  3227. setup_data.interac_type[0]=W_Type;
  3228. setup_data. input_data=&buff[0];
  3229. setup_data.output_data=&buff[5];
  3230. setup_data. coord_data=&buff[6];
  3231. Vector<FMMNode_t*>& nodes_in =n_list[0];
  3232. Vector<FMMNode_t*>& nodes_out=n_list[5];
  3233. setup_data.nodes_in .clear();
  3234. setup_data.nodes_out.clear();
  3235. for(size_t i=0;i<nodes_in .Dim();i++) if(nodes_in [i]->Depth()==level+1 || level==-1) setup_data.nodes_in .push_back(nodes_in [i]);
  3236. for(size_t i=0;i<nodes_out.Dim();i++) if(nodes_out[i]->Depth()==level || level==-1) setup_data.nodes_out.push_back(nodes_out[i]);
  3237. }
  3238. std::vector<void*>& nodes_in =setup_data.nodes_in ;
  3239. std::vector<void*>& nodes_out=setup_data.nodes_out;
  3240. std::vector<Vector<Real_t>*>& input_vector=setup_data. input_vector; input_vector.clear();
  3241. std::vector<Vector<Real_t>*>& output_vector=setup_data.output_vector; output_vector.clear();
  3242. for(size_t i=0;i<nodes_in .size();i++){
  3243. input_vector .push_back(&upwd_equiv_surf[((FMMNode*)nodes_in [i])->Depth()]);
  3244. input_vector .push_back(&((FMMData*)((FMMNode*)nodes_in [i])->FMMData())->upward_equiv);
  3245. input_vector .push_back(NULL);
  3246. input_vector .push_back(NULL);
  3247. }
  3248. for(size_t i=0;i<nodes_out.size();i++){
  3249. output_vector.push_back(&((FMMNode*)nodes_out[i])->trg_coord);
  3250. output_vector.push_back(&((FMMNode*)nodes_out[i])->trg_value);
  3251. }
  3252. this->SetupInteracPts(setup_data, true, false, NULL, device);
  3253. { // Resize device buffer
  3254. size_t n=setup_data.output_data->Dim(0)*setup_data.output_data->Dim(1)*sizeof(Real_t);
  3255. if(this->dev_buffer.Dim()<n) this->dev_buffer.ReInit(n);
  3256. }
  3257. }
template <class FMMNode>
// Evaluate W-list interactions (upward-equivalent densities of finer-level
// boxes applied directly to target points) using the data prepared by
// W_ListSetup. `device` is forwarded to EvalListPts (presumably selects
// device/accelerator execution -- confirm in EvalListPts).
void FMM_Pts<FMMNode>::W_List     (SetupData<Real_t>&  setup_data, bool device){
  //Add W_List contribution.
  this->EvalListPts(setup_data, device);
}
  3263. template <class FMMNode>
  3264. void FMM_Pts<FMMNode>::U_ListSetup(SetupData<Real_t>& setup_data, std::vector<Matrix<Real_t> >& buff, std::vector<Vector<FMMNode_t*> >& n_list, int level, bool device){
  3265. { // Set setup_data
  3266. setup_data.level=level;
  3267. setup_data.kernel=kernel->k_s2t;
  3268. setup_data.interac_type.resize(3);
  3269. setup_data.interac_type[0]=U0_Type;
  3270. setup_data.interac_type[1]=U1_Type;
  3271. setup_data.interac_type[2]=U2_Type;
  3272. setup_data. input_data=&buff[4];
  3273. setup_data.output_data=&buff[5];
  3274. setup_data. coord_data=&buff[6];
  3275. Vector<FMMNode_t*>& nodes_in =n_list[4];
  3276. Vector<FMMNode_t*>& nodes_out=n_list[5];
  3277. setup_data.nodes_in .clear();
  3278. setup_data.nodes_out.clear();
  3279. for(size_t i=0;i<nodes_in .Dim();i++) if((level-1<=nodes_in [i]->Depth() && nodes_in [i]->Depth()<=level+1) || level==-1) setup_data.nodes_in .push_back(nodes_in [i]);
  3280. for(size_t i=0;i<nodes_out.Dim();i++) if(( nodes_out[i]->Depth()==level ) || level==-1) setup_data.nodes_out.push_back(nodes_out[i]);
  3281. }
  3282. std::vector<void*>& nodes_in =setup_data.nodes_in ;
  3283. std::vector<void*>& nodes_out=setup_data.nodes_out;
  3284. std::vector<Vector<Real_t>*>& input_vector=setup_data. input_vector; input_vector.clear();
  3285. std::vector<Vector<Real_t>*>& output_vector=setup_data.output_vector; output_vector.clear();
  3286. for(size_t i=0;i<nodes_in .size();i++){
  3287. input_vector .push_back(&((FMMNode*)nodes_in [i])->src_coord);
  3288. input_vector .push_back(&((FMMNode*)nodes_in [i])->src_value);
  3289. input_vector .push_back(&((FMMNode*)nodes_in [i])->surf_coord);
  3290. input_vector .push_back(&((FMMNode*)nodes_in [i])->surf_value);
  3291. }
  3292. for(size_t i=0;i<nodes_out.size();i++){
  3293. output_vector.push_back(&((FMMNode*)nodes_out[i])->trg_coord);
  3294. output_vector.push_back(&((FMMNode*)nodes_out[i])->trg_value);
  3295. }
  3296. this->SetupInteracPts(setup_data, false, false, NULL, device);
  3297. { // Resize device buffer
  3298. size_t n=setup_data.output_data->Dim(0)*setup_data.output_data->Dim(1)*sizeof(Real_t);
  3299. if(this->dev_buffer.Dim()<n) this->dev_buffer.ReInit(n);
  3300. }
  3301. }
template <class FMMNode>
// Evaluate U-list (near-field, direct source-to-target) interactions using the
// data prepared by U_ListSetup. `device` is forwarded to EvalListPts
// (presumably selects device/accelerator execution -- confirm in EvalListPts).
void FMM_Pts<FMMNode>::U_List     (SetupData<Real_t>&  setup_data, bool device){
  //Add U_List contribution.
  this->EvalListPts(setup_data, device);
}
  3307. template <class FMMNode>
  3308. void FMM_Pts<FMMNode>::Down2TargetSetup(SetupData<Real_t>& setup_data, std::vector<Matrix<Real_t> >& buff, std::vector<Vector<FMMNode_t*> >& n_list, int level, bool device){
  3309. if(this->MultipoleOrder()==0) return;
  3310. { // Set setup_data
  3311. setup_data.level=level;
  3312. setup_data.kernel=kernel->k_l2t;
  3313. setup_data.interac_type.resize(1);
  3314. setup_data.interac_type[0]=D2T_Type;
  3315. setup_data. input_data=&buff[1];
  3316. setup_data.output_data=&buff[5];
  3317. setup_data. coord_data=&buff[6];
  3318. Vector<FMMNode_t*>& nodes_in =n_list[1];
  3319. Vector<FMMNode_t*>& nodes_out=n_list[5];
  3320. setup_data.nodes_in .clear();
  3321. setup_data.nodes_out.clear();
  3322. for(size_t i=0;i<nodes_in .Dim();i++) if(nodes_in [i]->Depth()==level || level==-1) setup_data.nodes_in .push_back(nodes_in [i]);
  3323. for(size_t i=0;i<nodes_out.Dim();i++) if(nodes_out[i]->Depth()==level || level==-1) setup_data.nodes_out.push_back(nodes_out[i]);
  3324. }
  3325. std::vector<void*>& nodes_in =setup_data.nodes_in ;
  3326. std::vector<void*>& nodes_out=setup_data.nodes_out;
  3327. std::vector<Vector<Real_t>*>& input_vector=setup_data. input_vector; input_vector.clear();
  3328. std::vector<Vector<Real_t>*>& output_vector=setup_data.output_vector; output_vector.clear();
  3329. for(size_t i=0;i<nodes_in .size();i++){
  3330. input_vector .push_back(&dnwd_equiv_surf[((FMMNode*)nodes_in [i])->Depth()]);
  3331. input_vector .push_back(&((FMMData*)((FMMNode*)nodes_in [i])->FMMData())->dnward_equiv);
  3332. input_vector .push_back(NULL);
  3333. input_vector .push_back(NULL);
  3334. }
  3335. for(size_t i=0;i<nodes_out.size();i++){
  3336. output_vector.push_back(&((FMMNode*)nodes_out[i])->trg_coord);
  3337. output_vector.push_back(&((FMMNode*)nodes_out[i])->trg_value);
  3338. }
  3339. this->SetupInteracPts(setup_data, true, false, NULL, device);
  3340. { // Resize device buffer
  3341. size_t n=setup_data.output_data->Dim(0)*setup_data.output_data->Dim(1)*sizeof(Real_t);
  3342. if(this->dev_buffer.Dim()<n) this->dev_buffer.ReInit(n);
  3343. }
  3344. }
template <class FMMNode>
// Evaluate local-to-target (D2T) interactions using the data prepared by
// Down2TargetSetup. `device` is forwarded to EvalListPts (presumably selects
// device/accelerator execution -- confirm in EvalListPts).
void FMM_Pts<FMMNode>::Down2Target(SetupData<Real_t>&  setup_data, bool device){
  //Add Down2Target contribution.
  this->EvalListPts(setup_data, device);
}
template <class FMMNode>
// Intentionally empty: FMM_Pts performs no per-node post-processing after the
// downward pass. NOTE(review): appears to be a customization hook that derived
// FMM classes may implement differently -- confirm against subclasses.
void FMM_Pts<FMMNode>::PostProcessing(std::vector<FMMNode_t*>& nodes){
}
template <class FMMNode>
// Intentionally empty: no copy-back of results is needed here (target values
// are written in place by the evaluation routines). NOTE(review): looks like a
// hook for derived classes that stage output elsewhere -- confirm.
void FMM_Pts<FMMNode>::CopyOutput(FMMNode** nodes, size_t n){
}
  3356. }//end namespace