fmm_pts.txx 141 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761277127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487148814891490149114921493149414951496149714981499150015011502150315041505150615071508150915101511151215131514151515161517151815191520152115221523152415251526152715281529153015311532153315341535153615371538153915401541154215431544154515461547154815491550155115521553155415551556155715581559156015611562156315641565156615671568156915701571157215731574157515761577157815791580158115821583158415851586158715881589159015911592159315941595159615971598159916001601160216031604160516061607160816091610161116121613161416151616161716181619162016211622162316241625162616271628162916301631163216331634163516361637163816391640164116421643164416451646164716481649165016511652165316541655165616571658165916601661166216631664166516661667166816691670167116721673167416751676167716781679168016811682168316841685168616871688168916901691169216931694169516961697169816991700170117021703170417051706170717081709171017111712171317141715171617171718171917201721172217231724172517261727172817291730173117321733173417351736173717381739174017411742174317441745174617471748174917501751175217531754175517561757175817591760176117621763176417651766176717681769177017711772177317741775177617771778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227722782279228022812282228322842285228622872288228922902291229222932294229522962297229822992300230123022303230423052306230723082309231023112312231323142315231623172318231923202321232223232324232523262327232823292330233123322333233423352336233723382339234023412342234323442345234623472348234923502351235223532354235523562357235823592360236123622363236423652366236723682369237023712372237323742375237623772378237923802381238223832384238523862387238823892390239123922393239423952396239723982399240024012402240324042405240624072408240924102411241224132414241524162417241824192420242124222423242424252426242724282429243024312432243324342435243624372438243924402441244224432444244524462447244824492450245124522453245424552456245724582459246024612462246324642465246624672468246924702471247224732474247524762477247824792480248124822483248424852486248724882489249024912492249324942495249624972498249925002501250225032504250525062507250825092510251125122513251425152516251725182519252025212522252325242525252625272528252925302531253225332534253525362537253825392540254125422543254425452546254725482549255025512552255325542555255625572558255925602561256225632564256525662567256825692570257125722573257425752576257725782579258025812582258325842585258625872588258925902591259225932594259525962597259825992600260126022603260426052606260726082609261026112612261326142615261626172618261926202621262226232624262526262627262826292630263126322633263426352636263726382639264026412642264326442645264626472648264926502651265226532654265526562657265826592660266126622663266426652666266726682669267026712672267326742675267626772678267926802681268226832684268526862687268826892690269126922693269426952696269726982699270027012702270327042705270627072708270927102711271227132714271527162717271827192720272127222723272427252726272727282729273027312732273327342735273627372738273927402741274227432744274527462747274827492750275127522753275427552756275727582759276027612762276327642765276627672768276927702771277227732774277527762777277827792780278127822783278427852786278727882789279027912792279327942795279627972798279928002801280228032804280528062807280828092810281128122813281428152816281728182819282028212822282328242825282628272828282928302831283228332834283528362837283828392840284128422843284428452846284728482849285028512852285328542855285628572858285928602861286228632864286528662867286828692870287128722873287428752876287728782879288028812882288328842885288628872888288928902891289228932894289528962897289828992900290129022903290429052906290729082909291029112912291329142915291629172918291929202921292229232924292529262927292829292930293129322933293429352936293729382939294029412942294329442945294629472948294929502951295229532954295529562957295829592960296129622963296429652966296729682969297029712972297329742975297629772978297929802981298229832984298529862987298829892990299129922993299429952996299729982999300030013002300330043005300630073008300930103011301230133014301530163017301830193020302130223023302430253026302730283029303030313032303330343035303630373038303930403041304230433044304530463047304830493050305130523053305430553056305730583059306030613062306330643065306630673068306930703071307230733074307530763077307830793080308130823083308430853086308730883089309030913092309330943095309630973098309931003101310231033104310531063107310831093110311131123113311431153116311731183119312031213122312331243125312631273128312931303131313231333134313531363137313831393140314131423143314431453146314731483149315031513152315331543155315631573158315931603161316231633164316531663167316831693170317131723173317431753176317731783179318031813182318331843185318631873188318931903191319231933194319531963197319831993200320132023203320432053206320732083209321032113212321332143215321632173218321932203221322232233224322532263227322832293230323132323233323432353236323732383239324032413242324332443245324632473248324932503251325232533254325532563257325832593260326132623263326432653266326732683269327032713272327332743275327632773278327932803281328232833284328532863287328832893290329132923293329432953296329732983299330033013302330333043305330633073308330933103311331233133314331533163317331833193320332133223323332433253326332733283329333033313332333333343335333633373338333933403341334233433344334533463347334833493350335133523353335433553356335733583359336033613362336333643365336633673368336933703371337233733374337533763377337833793380338133823383338433853386338733883389339033913392339333943395339633973398339934003401340234033404340534063407340834093410341134123413341434153416341734183419342034213422342334243425342634273428342934303431343234333434343534363437343834393440344134423443344434453446344734483449345034513452345334543455345634573458345934603461346234633464346534663467346834693470347134723473347434753476347734783479348034813482348334843485348634873488348934903491349234933494349534963497349834993500350135023503350435053506350735083509351035113512351335143515351635173518351935203521352235233524352535263527352835293530353135323533353435353536353735383539354035413542354335443545354635473548354935503551355235533554355535563557355835593560356135623563356435653566356735683569357035713572357335743575357635773578357935803581358235833584358535863587358835893590359135923593359435953596359735983599360036013602360336043605360636073608360936103611361236133614361536163617361836193620362136223623362436253626362736283629363036313632363336343635363636373638363936403641364236433644364536463647364836493650365136523653365436553656365736583659366036613662366336643665366636673668366936703671367236733674367536763677367836793680368136823683368436853686368736883689369036913692369336943695369636973698369937003701370237033704370537063707370837093710371137123713371437153716371737183719372037213722372337243725372637273728372937303731373237333734373537363737373837393740374137423743374437453746374737483749375037513752
  1. /**
  2. * \file fmm_pts.txx
  3. * \author Dhairya Malhotra, dhairya.malhotra@gmail.com
  4. * \date 3-07-2011
  5. * \brief This file contains the implementation of the FMM_Pts class.
  6. */
  7. #include <omp.h>
  8. #include <cmath>
  9. #include <cstdlib>
  10. #include <cassert>
  11. #include <sstream>
  12. #include <iostream>
  13. #include <stdint.h>
  14. #include <set>
  15. #ifdef PVFMM_HAVE_SYS_STAT_H
  16. #include <sys/stat.h>
  17. #endif
  18. #ifdef __SSE__
  19. #include <xmmintrin.h>
  20. #endif
  21. #ifdef __SSE2__
  22. #include <emmintrin.h>
  23. #endif
  24. #ifdef __SSE3__
  25. #include <pmmintrin.h>
  26. #endif
  27. #ifdef __AVX__
  28. #include <immintrin.h>
  29. #endif
  30. #if defined(__MIC__)
  31. #include <immintrin.h>
  32. #endif
  33. #include <profile.hpp>
  34. namespace pvfmm{
  35. /**
  36. * \brief Returns the coordinates of points on the surface of a cube.
  37. * \param[in] p Number of points on an edge of the cube is (n+1)
  38. * \param[in] c Coordinates to the centre of the cube (3D array).
  39. * \param[in] alpha Scaling factor for the size of the cube.
  40. * \param[in] depth Depth of the cube in the octree.
  41. * \return Vector with coordinates of points on the surface of the cube in the
  42. * format [x0 y0 z0 x1 y1 z1 .... ].
  43. */
  44. template <class Real_t>
  45. std::vector<Real_t> surface(int p, Real_t* c, Real_t alpha, int depth){
  46. size_t n_=(6*(p-1)*(p-1)+2); //Total number of points.
  47. std::vector<Real_t> coord(n_*3);
  48. coord[0]=coord[1]=coord[2]=-1.0;
  49. size_t cnt=1;
  50. for(int i=0;i<p-1;i++)
  51. for(int j=0;j<p-1;j++){
  52. coord[cnt*3 ]=-1.0;
  53. coord[cnt*3+1]=(2.0*(i+1)-p+1)/(p-1);
  54. coord[cnt*3+2]=(2.0*j-p+1)/(p-1);
  55. cnt++;
  56. }
  57. for(int i=0;i<p-1;i++)
  58. for(int j=0;j<p-1;j++){
  59. coord[cnt*3 ]=(2.0*i-p+1)/(p-1);
  60. coord[cnt*3+1]=-1.0;
  61. coord[cnt*3+2]=(2.0*(j+1)-p+1)/(p-1);
  62. cnt++;
  63. }
  64. for(int i=0;i<p-1;i++)
  65. for(int j=0;j<p-1;j++){
  66. coord[cnt*3 ]=(2.0*(i+1)-p+1)/(p-1);
  67. coord[cnt*3+1]=(2.0*j-p+1)/(p-1);
  68. coord[cnt*3+2]=-1.0;
  69. cnt++;
  70. }
  71. for(size_t i=0;i<(n_/2)*3;i++)
  72. coord[cnt*3+i]=-coord[i];
  73. Real_t r = 0.5*pow(0.5,depth);
  74. Real_t b = alpha*r;
  75. for(size_t i=0;i<n_;i++){
  76. coord[i*3+0]=(coord[i*3+0]+1.0)*b+c[0];
  77. coord[i*3+1]=(coord[i*3+1]+1.0)*b+c[1];
  78. coord[i*3+2]=(coord[i*3+2]+1.0)*b+c[2];
  79. }
  80. return coord;
  81. }
  82. /**
  83. * \brief Returns the coordinates of points on the upward check surface of cube.
  84. * \see surface()
  85. */
  86. template <class Real_t>
  87. std::vector<Real_t> u_check_surf(int p, Real_t* c, int depth){
  88. Real_t r=0.5*pow(0.5,depth);
  89. Real_t coord[3]={c[0]-r*(RAD1-1.0),c[1]-r*(RAD1-1.0),c[2]-r*(RAD1-1.0)};
  90. return surface(p,coord,(Real_t)RAD1,depth);
  91. }
  92. /**
  93. * \brief Returns the coordinates of points on the upward equivalent surface of cube.
  94. * \see surface()
  95. */
  96. template <class Real_t>
  97. std::vector<Real_t> u_equiv_surf(int p, Real_t* c, int depth){
  98. Real_t r=0.5*pow(0.5,depth);
  99. Real_t coord[3]={c[0]-r*(RAD0-1.0),c[1]-r*(RAD0-1.0),c[2]-r*(RAD0-1.0)};
  100. return surface(p,coord,(Real_t)RAD0,depth);
  101. }
  102. /**
  103. * \brief Returns the coordinates of points on the downward check surface of cube.
  104. * \see surface()
  105. */
  106. template <class Real_t>
  107. std::vector<Real_t> d_check_surf(int p, Real_t* c, int depth){
  108. Real_t r=0.5*pow(0.5,depth);
  109. Real_t coord[3]={c[0]-r*(RAD0-1.0),c[1]-r*(RAD0-1.0),c[2]-r*(RAD0-1.0)};
  110. return surface(p,coord,(Real_t)RAD0,depth);
  111. }
  112. /**
  113. * \brief Returns the coordinates of points on the downward equivalent surface of cube.
  114. * \see surface()
  115. */
  116. template <class Real_t>
  117. std::vector<Real_t> d_equiv_surf(int p, Real_t* c, int depth){
  118. Real_t r=0.5*pow(0.5,depth);
  119. Real_t coord[3]={c[0]-r*(RAD1-1.0),c[1]-r*(RAD1-1.0),c[2]-r*(RAD1-1.0)};
  120. return surface(p,coord,(Real_t)RAD1,depth);
  121. }
  122. /**
  123. * \brief Defines the 3D grid for convolution in FFT acceleration of V-list.
  124. * \see surface()
  125. */
  126. template <class Real_t>
  127. std::vector<Real_t> conv_grid(int p, Real_t* c, int depth){
  128. Real_t r=pow(0.5,depth);
  129. Real_t a=r*RAD0;
  130. Real_t coord[3]={c[0],c[1],c[2]};
  131. int n1=p*2;
  132. int n2=(int)pow((Real_t)n1,2);
  133. int n3=(int)pow((Real_t)n1,3);
  134. std::vector<Real_t> grid(n3*3);
  135. for(int i=0;i<n1;i++)
  136. for(int j=0;j<n1;j++)
  137. for(int k=0;k<n1;k++){
  138. grid[(i+n1*j+n2*k)*3+0]=(i-p)*a/(p-1)+coord[0];
  139. grid[(i+n1*j+n2*k)*3+1]=(j-p)*a/(p-1)+coord[1];
  140. grid[(i+n1*j+n2*k)*3+2]=(k-p)*a/(p-1)+coord[2];
  141. }
  142. return grid;
  143. }
  144. template <class Real_t>
  145. void FMM_Data<Real_t>::Clear(){
  146. upward_equiv.Resize(0);
  147. }
  148. template <class Real_t>
  149. PackedData FMM_Data<Real_t>::PackMultipole(void* buff_ptr){
  150. PackedData p0; p0.data=buff_ptr;
  151. p0.length=upward_equiv.Dim()*sizeof(Real_t);
  152. if(p0.length==0) return p0;
  153. if(p0.data==NULL) p0.data=(char*)&upward_equiv[0];
  154. else mem::memcopy(p0.data,&upward_equiv[0],p0.length);
  155. return p0;
  156. }
  157. template <class Real_t>
  158. void FMM_Data<Real_t>::AddMultipole(PackedData p0){
  159. Real_t* data=(Real_t*)p0.data;
  160. size_t n=p0.length/sizeof(Real_t);
  161. assert(upward_equiv.Dim()==n);
  162. Matrix<Real_t> v0(1,n,&upward_equiv[0],false);
  163. Matrix<Real_t> v1(1,n,data,false);
  164. v0+=v1;
  165. }
  166. template <class Real_t>
  167. void FMM_Data<Real_t>::InitMultipole(PackedData p0, bool own_data){
  168. Real_t* data=(Real_t*)p0.data;
  169. size_t n=p0.length/sizeof(Real_t);
  170. if(n==0) return;
  171. if(own_data){
  172. upward_equiv=Vector<Real_t>(n, &data[0], false);
  173. }else{
  174. upward_equiv.ReInit(n, &data[0], false);
  175. }
  176. }
  177. template <class FMMNode>
  178. FMM_Pts<FMMNode>::~FMM_Pts() {
  179. if(mat!=NULL){
  180. // int rank;
  181. // MPI_Comm_rank(comm,&rank);
  182. // if(rank==0) mat->Save2File("Precomp.data");
  183. delete mat;
  184. mat=NULL;
  185. }
  186. if(vprecomp_fft_flag) FFTW_t<Real_t>::fft_destroy_plan(vprecomp_fftplan);
  187. #ifdef __INTEL_OFFLOAD0
  188. #pragma offload target(mic:0)
  189. #endif
  190. {
  191. if(vlist_fft_flag ) FFTW_t<Real_t>::fft_destroy_plan(vlist_fftplan );
  192. if(vlist_ifft_flag) FFTW_t<Real_t>::fft_destroy_plan(vlist_ifftplan);
  193. vlist_fft_flag =false;
  194. vlist_ifft_flag=false;
  195. }
  196. }
  197. template <class FMMNode>
  198. void FMM_Pts<FMMNode>::Initialize(int mult_order, const MPI_Comm& comm_, const Kernel<Real_t>* kernel_){
  199. Profile::Tic("InitFMM_Pts",&comm_,true);{
  200. bool verbose=false;
  201. #ifndef NDEBUG
  202. #ifdef __VERBOSE__
  203. int rank;
  204. MPI_Comm_rank(comm_,&rank);
  205. if(!rank) verbose=true;
  206. #endif
  207. #endif
  208. if(kernel_) kernel_->Initialize(verbose);
  209. multipole_order=mult_order;
  210. comm=comm_;
  211. kernel=kernel_;
  212. assert(kernel!=NULL);
  213. mat=new PrecompMat<Real_t>(Homogen(), MAX_DEPTH+1);
  214. if(this->mat_fname.size()==0){
  215. std::stringstream st;
  216. st<<PVFMM_PRECOMP_DATA_PATH;
  217. if(!st.str().size()){ // look in PVFMM_DIR
  218. char* pvfmm_dir = getenv ("PVFMM_DIR");
  219. if(pvfmm_dir) st<<pvfmm_dir<<'/';
  220. }
  221. #ifndef STAT_MACROS_BROKEN
  222. if(st.str().size()){ // check if the path is a directory
  223. struct stat stat_buff;
  224. if(stat(st.str().c_str(), &stat_buff) || !S_ISDIR(stat_buff.st_mode)){
  225. std::cout<<"error: path not found: "<<st.str()<<'\n';
  226. exit(0);
  227. }
  228. }
  229. #endif
  230. st<<"Precomp_"<<kernel->ker_name.c_str()<<"_m"<<mult_order;
  231. if(sizeof(Real_t)==8) st<<"";
  232. else if(sizeof(Real_t)==4) st<<"_f";
  233. else st<<"_t"<<sizeof(Real_t);
  234. st<<".data";
  235. this->mat_fname=st.str();
  236. }
  237. this->mat->LoadFile(mat_fname.c_str(), this->comm);
  238. interac_list.Initialize(COORD_DIM, this->mat);
  239. Profile::Tic("PrecompUC2UE",&comm,false,4);
  240. this->PrecompAll(UC2UE_Type);
  241. Profile::Toc();
  242. Profile::Tic("PrecompDC2DE",&comm,false,4);
  243. this->PrecompAll(DC2DE_Type);
  244. Profile::Toc();
  245. Profile::Tic("PrecompBC",&comm,false,4);
  246. { /*
  247. int type=BC_Type;
  248. for(int l=0;l<MAX_DEPTH;l++)
  249. for(size_t indx=0;indx<this->interac_list.ListCount((Mat_Type)type);indx++){
  250. Matrix<Real_t>& M=this->mat->Mat(l, (Mat_Type)type, indx);
  251. M.Resize(0,0);
  252. } // */
  253. }
  254. this->PrecompAll(BC_Type,0);
  255. Profile::Toc();
  256. Profile::Tic("PrecompU2U",&comm,false,4);
  257. this->PrecompAll(U2U_Type);
  258. Profile::Toc();
  259. Profile::Tic("PrecompD2D",&comm,false,4);
  260. this->PrecompAll(D2D_Type);
  261. Profile::Toc();
  262. Profile::Tic("PrecompV",&comm,false,4);
  263. this->PrecompAll(V_Type);
  264. Profile::Toc();
  265. Profile::Tic("PrecompV1",&comm,false,4);
  266. this->PrecompAll(V1_Type);
  267. Profile::Toc();
  268. }Profile::Toc();
  269. }
  270. template <class FMMNode>
  271. Permutation<typename FMMNode::Real_t>& FMM_Pts<FMMNode>::PrecompPerm(Mat_Type type, Perm_Type perm_indx){
  272. //Check if the matrix already exists.
  273. Permutation<Real_t>& P_ = mat->Perm((Mat_Type)type, perm_indx);
  274. if(P_.Dim()!=0) return P_;
  275. //Compute the matrix.
  276. Permutation<Real_t> P;
  277. switch (type){
  278. case UC2UE_Type:
  279. {
  280. break;
  281. }
  282. case DC2DE_Type:
  283. {
  284. break;
  285. }
  286. case S2U_Type:
  287. {
  288. break;
  289. }
  290. case U2U_Type:
  291. {
  292. Real_t eps=1e-10;
  293. int dof=kernel->k_m2m->ker_dim[0];
  294. size_t p_indx=perm_indx % C_Perm;
  295. Permutation<Real_t>& ker_perm=kernel->k_m2m->perm_vec[p_indx];
  296. assert(dof==ker_perm.Dim());
  297. Real_t c[3]={-0.5,-0.5,-0.5};
  298. std::vector<Real_t> trg_coord=d_check_surf(this->MultipoleOrder(),c,0);
  299. int n_trg=trg_coord.size()/3;
  300. P=Permutation<Real_t>(n_trg*dof);
  301. if(p_indx==ReflecX || p_indx==ReflecY || p_indx==ReflecZ){ // Set P.perm
  302. for(int i=0;i<n_trg;i++)
  303. for(int j=0;j<n_trg;j++){
  304. if(fabs(trg_coord[i*3+0]-trg_coord[j*3+0]*(p_indx==ReflecX?-1.0:1.0))<eps)
  305. if(fabs(trg_coord[i*3+1]-trg_coord[j*3+1]*(p_indx==ReflecY?-1.0:1.0))<eps)
  306. if(fabs(trg_coord[i*3+2]-trg_coord[j*3+2]*(p_indx==ReflecZ?-1.0:1.0))<eps){
  307. for(int k=0;k<dof;k++){
  308. P.perm[j*dof+k]=i*dof+ker_perm.perm[k];
  309. }
  310. }
  311. }
  312. }else if(p_indx==SwapXY || p_indx==SwapXZ){
  313. for(int i=0;i<n_trg;i++)
  314. for(int j=0;j<n_trg;j++){
  315. if(fabs(trg_coord[i*3+0]-trg_coord[j*3+(p_indx==SwapXY?1:2)])<eps)
  316. if(fabs(trg_coord[i*3+1]-trg_coord[j*3+(p_indx==SwapXY?0:1)])<eps)
  317. if(fabs(trg_coord[i*3+2]-trg_coord[j*3+(p_indx==SwapXY?2:0)])<eps){
  318. for(int k=0;k<dof;k++){
  319. P.perm[j*dof+k]=i*dof+ker_perm.perm[k];
  320. }
  321. }
  322. }
  323. }else{
  324. for(int j=0;j<n_trg;j++){
  325. for(int k=0;k<dof;k++){
  326. P.perm[j*dof+k]=j*dof+ker_perm.perm[k];
  327. }
  328. }
  329. }
  330. if(p_indx==Scaling && this->Homogen()){ // Set level-by-level scaling
  331. const Vector<Real_t>& scal_exp=kernel->k_m2m->src_scal;
  332. assert(dof==scal_exp.Dim());
  333. Vector<Real_t> scal(scal_exp.Dim());
  334. for(size_t i=0;i<scal_exp.Dim();i++){
  335. scal[i]=pow(2.0,(perm_indx<C_Perm?1.0:-1.0)*scal_exp[i]);
  336. }
  337. for(int j=0;j<n_trg;j++){
  338. for(int i=0;i<dof;i++){
  339. P.scal[j*dof+i]*=scal[i];
  340. }
  341. }
  342. }
  343. { // Set P.scal
  344. for(int j=0;j<n_trg;j++){
  345. for(int i=0;i<dof;i++){
  346. P.scal[j*dof+i]*=ker_perm.scal[i];
  347. }
  348. }
  349. }
  350. break;
  351. }
  352. case D2D_Type:
  353. {
  354. Real_t eps=1e-10;
  355. int dof=kernel->k_l2l->ker_dim[0];
  356. size_t p_indx=perm_indx % C_Perm;
  357. Permutation<Real_t>& ker_perm=kernel->k_l2l->perm_vec[p_indx];
  358. assert(dof==ker_perm.Dim());
  359. Real_t c[3]={-0.5,-0.5,-0.5};
  360. std::vector<Real_t> trg_coord=d_check_surf(this->MultipoleOrder(),c,0);
  361. int n_trg=trg_coord.size()/3;
  362. P=Permutation<Real_t>(n_trg*dof);
  363. if(p_indx==ReflecX || p_indx==ReflecY || p_indx==ReflecZ){ // Set P.perm
  364. for(int i=0;i<n_trg;i++)
  365. for(int j=0;j<n_trg;j++){
  366. if(fabs(trg_coord[i*3+0]-trg_coord[j*3+0]*(p_indx==ReflecX?-1.0:1.0))<eps)
  367. if(fabs(trg_coord[i*3+1]-trg_coord[j*3+1]*(p_indx==ReflecY?-1.0:1.0))<eps)
  368. if(fabs(trg_coord[i*3+2]-trg_coord[j*3+2]*(p_indx==ReflecZ?-1.0:1.0))<eps){
  369. for(int k=0;k<dof;k++){
  370. P.perm[j*dof+k]=i*dof+ker_perm.perm[k];
  371. }
  372. }
  373. }
  374. }else if(p_indx==SwapXY || p_indx==SwapXZ){
  375. for(int i=0;i<n_trg;i++)
  376. for(int j=0;j<n_trg;j++){
  377. if(fabs(trg_coord[i*3+0]-trg_coord[j*3+(p_indx==SwapXY?1:2)])<eps)
  378. if(fabs(trg_coord[i*3+1]-trg_coord[j*3+(p_indx==SwapXY?0:1)])<eps)
  379. if(fabs(trg_coord[i*3+2]-trg_coord[j*3+(p_indx==SwapXY?2:0)])<eps){
  380. for(int k=0;k<dof;k++){
  381. P.perm[j*dof+k]=i*dof+ker_perm.perm[k];
  382. }
  383. }
  384. }
  385. }else{
  386. for(int j=0;j<n_trg;j++){
  387. for(int k=0;k<dof;k++){
  388. P.perm[j*dof+k]=j*dof+ker_perm.perm[k];
  389. }
  390. }
  391. }
  392. if(p_indx==Scaling && this->Homogen()){ // Set level-by-level scaling
  393. const Vector<Real_t>& scal_exp=kernel->k_l2l->src_scal;
  394. assert(dof==scal_exp.Dim());
  395. Vector<Real_t> scal(scal_exp.Dim());
  396. for(size_t i=0;i<scal_exp.Dim();i++){
  397. scal[i]=pow(2.0,(perm_indx<C_Perm?1.0:-1.0)*scal_exp[i]);
  398. }
  399. for(int j=0;j<n_trg;j++){
  400. for(int i=0;i<dof;i++){
  401. P.scal[j*dof+i]*=scal[i];
  402. }
  403. }
  404. }
  405. { // Set P.scal
  406. for(int j=0;j<n_trg;j++){
  407. for(int i=0;i<dof;i++){
  408. P.scal[j*dof+i]*=ker_perm.scal[i];
  409. }
  410. }
  411. }
  412. break;
  413. }
  414. case D2T_Type:
  415. {
  416. break;
  417. }
  418. case U0_Type:
  419. {
  420. break;
  421. }
  422. case U1_Type:
  423. {
  424. break;
  425. }
  426. case U2_Type:
  427. {
  428. break;
  429. }
  430. case V_Type:
  431. {
  432. break;
  433. }
  434. case V1_Type:
  435. {
  436. break;
  437. }
  438. case W_Type:
  439. {
  440. break;
  441. }
  442. case X_Type:
  443. {
  444. break;
  445. }
  446. case BC_Type:
  447. {
  448. break;
  449. }
  450. default:
  451. break;
  452. }
  453. //Save the matrix for future use.
  454. #pragma omp critical (PRECOMP_MATRIX_PTS)
  455. {
  456. if(P_.Dim()==0) P_=P;
  457. }
  458. return P_;
  459. }
  460. template <class FMMNode>
  461. Matrix<typename FMMNode::Real_t>& FMM_Pts<FMMNode>::Precomp(int level, Mat_Type type, size_t mat_indx){
  462. if(this->Homogen()) level=0;
  463. //Check if the matrix already exists.
  464. Matrix<Real_t>& M_ = this->mat->Mat(level, type, mat_indx);
  465. if(M_.Dim(0)!=0 && M_.Dim(1)!=0) return M_;
  466. else{ //Compute matrix from symmetry class (if possible).
  467. size_t class_indx = this->interac_list.InteracClass(type, mat_indx);
  468. if(class_indx!=mat_indx){
  469. Matrix<Real_t>& M0 = this->Precomp(level, type, class_indx);
  470. Permutation<Real_t>& Pr = this->interac_list.Perm_R(level, type, mat_indx);
  471. Permutation<Real_t>& Pc = this->interac_list.Perm_C(level, type, mat_indx);
  472. if(Pr.Dim()>0 && Pc.Dim()>0 && M0.Dim(0)>0 && M0.Dim(1)>0) return M_;
  473. }
  474. }
  475. //Compute the matrix.
  476. Matrix<Real_t> M;
  477. //int omp_p=omp_get_max_threads();
  478. switch (type){
  479. case UC2UE_Type:
  480. {
  481. if(MultipoleOrder()==0) break;
  482. const int* ker_dim=kernel->k_m2m->ker_dim;
  483. // Coord of upward check surface
  484. Real_t c[3]={0,0,0};
  485. std::vector<Real_t> uc_coord=u_check_surf(MultipoleOrder(),c,level);
  486. size_t n_uc=uc_coord.size()/3;
  487. // Coord of upward equivalent surface
  488. std::vector<Real_t> ue_coord=u_equiv_surf(MultipoleOrder(),c,level);
  489. size_t n_ue=ue_coord.size()/3;
  490. // Evaluate potential at check surface due to equivalent surface.
  491. Matrix<Real_t> M_e2c(n_ue*ker_dim[0],n_uc*ker_dim[1]);
  492. kernel->k_m2m->BuildMatrix(&ue_coord[0], n_ue,
  493. &uc_coord[0], n_uc, &(M_e2c[0][0]));
  494. Real_t eps=1.0;
  495. while(eps+(Real_t)1.0>1.0) eps*=0.5;
  496. M=M_e2c.pinv(sqrt(eps)); //check 2 equivalent
  497. break;
  498. }
  499. case DC2DE_Type:
  500. {
  501. if(MultipoleOrder()==0) break;
  502. const int* ker_dim=kernel->k_l2l->ker_dim;
  503. // Coord of downward check surface
  504. Real_t c[3]={0,0,0};
  505. std::vector<Real_t> check_surf=d_check_surf(MultipoleOrder(),c,level);
  506. size_t n_ch=check_surf.size()/3;
  507. // Coord of downward equivalent surface
  508. std::vector<Real_t> equiv_surf=d_equiv_surf(MultipoleOrder(),c,level);
  509. size_t n_eq=equiv_surf.size()/3;
  510. // Evaluate potential at check surface due to equivalent surface.
  511. Matrix<Real_t> M_e2c(n_eq*ker_dim[0],n_ch*ker_dim[1]);
  512. kernel->k_l2l->BuildMatrix(&equiv_surf[0], n_eq,
  513. &check_surf[0], n_ch, &(M_e2c[0][0]));
  514. Real_t eps=1.0;
  515. while(eps+(Real_t)1.0>1.0) eps*=0.5;
  516. M=M_e2c.pinv(sqrt(eps)); //check 2 equivalent
  517. break;
  518. }
  519. case S2U_Type:
  520. {
  521. break;
  522. }
  523. case U2U_Type:
  524. {
  525. if(MultipoleOrder()==0) break;
  526. const int* ker_dim=kernel->k_m2m->ker_dim;
  527. // Coord of upward check surface
  528. Real_t c[3]={0,0,0};
  529. std::vector<Real_t> check_surf=u_check_surf(MultipoleOrder(),c,level);
  530. size_t n_uc=check_surf.size()/3;
  531. // Coord of child's upward equivalent surface
  532. Real_t s=pow(0.5,(level+2));
  533. int* coord=interac_list.RelativeCoord(type,mat_indx);
  534. Real_t child_coord[3]={(coord[0]+1)*s,(coord[1]+1)*s,(coord[2]+1)*s};
  535. std::vector<Real_t> equiv_surf=u_equiv_surf(MultipoleOrder(),child_coord,level+1);
  536. size_t n_ue=equiv_surf.size()/3;
  537. // Evaluate potential at check surface due to equivalent surface.
  538. Matrix<Real_t> M_ce2c(n_ue*ker_dim[0],n_uc*ker_dim[1]);
  539. kernel->k_m2m->BuildMatrix(&equiv_surf[0], n_ue,
  540. &check_surf[0], n_uc, &(M_ce2c[0][0]));
  541. Matrix<Real_t>& M_c2e = Precomp(level, UC2UE_Type, 0);
  542. M=M_ce2c*M_c2e;
  543. break;
  544. }
  545. case D2D_Type:
  546. {
  547. if(MultipoleOrder()==0) break;
  548. const int* ker_dim=kernel->k_l2l->ker_dim;
  549. // Coord of downward check surface
  550. Real_t s=pow(0.5,level+1);
  551. int* coord=interac_list.RelativeCoord(type,mat_indx);
  552. Real_t c[3]={(coord[0]+1)*s,(coord[1]+1)*s,(coord[2]+1)*s};
  553. std::vector<Real_t> check_surf=d_check_surf(MultipoleOrder(),c,level);
  554. size_t n_dc=check_surf.size()/3;
  555. // Coord of parent's downward equivalent surface
  556. Real_t parent_coord[3]={0,0,0};
  557. std::vector<Real_t> equiv_surf=d_equiv_surf(MultipoleOrder(),parent_coord,level-1);
  558. size_t n_de=equiv_surf.size()/3;
  559. // Evaluate potential at check surface due to equivalent surface.
  560. Matrix<Real_t> M_pe2c(n_de*ker_dim[0],n_dc*ker_dim[1]);
  561. kernel->k_l2l->BuildMatrix(&equiv_surf[0], n_de,
  562. &check_surf[0], n_dc, &(M_pe2c[0][0]));
  563. Matrix<Real_t>& M_c2e=Precomp(level,DC2DE_Type,0);
  564. M=M_pe2c*M_c2e;
  565. break;
  566. }
  567. case D2T_Type:
  568. {
  569. if(MultipoleOrder()==0) break;
  570. const int* ker_dim=kernel->k_l2t->ker_dim;
  571. std::vector<Real_t>& rel_trg_coord=mat->RelativeTrgCoord();
  572. // Coord of target points
  573. Real_t r=pow(0.5,level);
  574. size_t n_trg=rel_trg_coord.size()/3;
  575. std::vector<Real_t> trg_coord(n_trg*3);
  576. for(size_t i=0;i<n_trg*COORD_DIM;i++) trg_coord[i]=rel_trg_coord[i]*r;
  577. // Coord of downward equivalent surface
  578. Real_t c[3]={0,0,0};
  579. std::vector<Real_t> equiv_surf=d_equiv_surf(MultipoleOrder(),c,level);
  580. size_t n_eq=equiv_surf.size()/3;
  581. // Evaluate potential at target points due to equivalent surface.
  582. {
  583. M .Resize(n_eq*ker_dim [0], n_trg*ker_dim [1]);
  584. kernel->k_l2t->BuildMatrix(&equiv_surf[0], n_eq, &trg_coord[0], n_trg, &(M [0][0]));
  585. }
  586. break;
  587. }
  588. case U0_Type:
  589. {
  590. break;
  591. }
  592. case U1_Type:
  593. {
  594. break;
  595. }
  596. case U2_Type:
  597. {
  598. break;
  599. }
  600. case V_Type:
  601. {
  602. if(MultipoleOrder()==0) break;
  603. const int* ker_dim=kernel->k_m2l->ker_dim;
  604. int n1=MultipoleOrder()*2;
  605. int n3 =n1*n1*n1;
  606. int n3_=n1*n1*(n1/2+1);
  607. //Compute the matrix.
  608. Real_t s=pow(0.5,level);
  609. int* coord2=interac_list.RelativeCoord(type,mat_indx);
  610. Real_t coord_diff[3]={coord2[0]*s,coord2[1]*s,coord2[2]*s};
  611. //Evaluate potential.
  612. std::vector<Real_t> r_trg(COORD_DIM,0.0);
  613. std::vector<Real_t> conv_poten(n3*ker_dim[0]*ker_dim[1]);
  614. std::vector<Real_t> conv_coord=conv_grid(MultipoleOrder(),coord_diff,level);
  615. kernel->k_m2l->BuildMatrix(&conv_coord[0],n3,&r_trg[0],1,&conv_poten[0]);
  616. //Rearrange data.
  617. Matrix<Real_t> M_conv(n3,ker_dim[0]*ker_dim[1],&conv_poten[0],false);
  618. M_conv=M_conv.Transpose();
  619. //Compute FFTW plan.
  620. int nnn[3]={n1,n1,n1};
  621. Real_t *fftw_in, *fftw_out;
  622. fftw_in = mem::aligned_new<Real_t>( n3 *ker_dim[0]*ker_dim[1]*sizeof(Real_t));
  623. fftw_out = mem::aligned_new<Real_t>(2*n3_*ker_dim[0]*ker_dim[1]*sizeof(Real_t));
  624. #pragma omp critical (FFTW_PLAN)
  625. {
  626. if (!vprecomp_fft_flag){
  627. vprecomp_fftplan = FFTW_t<Real_t>::fft_plan_many_dft_r2c(COORD_DIM, nnn, ker_dim[0]*ker_dim[1],
  628. (Real_t*)fftw_in, NULL, 1, n3, (typename FFTW_t<Real_t>::cplx*) fftw_out, NULL, 1, n3_, FFTW_ESTIMATE);
  629. vprecomp_fft_flag=true;
  630. }
  631. }
  632. //Compute FFT.
  633. mem::memcopy(fftw_in, &conv_poten[0], n3*ker_dim[0]*ker_dim[1]*sizeof(Real_t));
  634. FFTW_t<Real_t>::fft_execute_dft_r2c(vprecomp_fftplan, (Real_t*)fftw_in, (typename FFTW_t<Real_t>::cplx*)(fftw_out));
  635. Matrix<Real_t> M_(2*n3_*ker_dim[0]*ker_dim[1],1,(Real_t*)fftw_out,false);
  636. M=M_;
  637. //Free memory.
  638. mem::aligned_delete<Real_t>(fftw_in);
  639. mem::aligned_delete<Real_t>(fftw_out);
  640. break;
  641. }
  642. case V1_Type:
  643. {
  644. if(MultipoleOrder()==0) break;
  645. const int* ker_dim=kernel->k_m2l->ker_dim;
  646. size_t mat_cnt =interac_list.ListCount( V_Type);
  647. for(size_t k=0;k<mat_cnt;k++) Precomp(level, V_Type, k);
  648. const size_t chld_cnt=1UL<<COORD_DIM;
  649. size_t n1=MultipoleOrder()*2;
  650. size_t M_dim=n1*n1*(n1/2+1);
  651. size_t n3=n1*n1*n1;
  652. Vector<Real_t> zero_vec(M_dim*ker_dim[0]*ker_dim[1]*2);
  653. zero_vec.SetZero();
  654. Vector<Real_t*> M_ptr(chld_cnt*chld_cnt);
  655. for(size_t i=0;i<chld_cnt*chld_cnt;i++) M_ptr[i]=&zero_vec[0];
  656. int* rel_coord_=interac_list.RelativeCoord(V1_Type, mat_indx);
  657. for(int j1=0;j1<chld_cnt;j1++)
  658. for(int j2=0;j2<chld_cnt;j2++){
  659. int rel_coord[3]={rel_coord_[0]*2-(j1/1)%2+(j2/1)%2,
  660. rel_coord_[1]*2-(j1/2)%2+(j2/2)%2,
  661. rel_coord_[2]*2-(j1/4)%2+(j2/4)%2};
  662. for(size_t k=0;k<mat_cnt;k++){
  663. int* ref_coord=interac_list.RelativeCoord(V_Type, k);
  664. if(ref_coord[0]==rel_coord[0] &&
  665. ref_coord[1]==rel_coord[1] &&
  666. ref_coord[2]==rel_coord[2]){
  667. Matrix<Real_t>& M = this->mat->Mat(level, V_Type, k);
  668. M_ptr[j2*chld_cnt+j1]=&M[0][0];
  669. break;
  670. }
  671. }
  672. }
  673. // Build matrix ker_dim0 x ker_dim1 x M_dim x 8 x 8
  674. M.Resize(ker_dim[0]*ker_dim[1]*M_dim, 2*chld_cnt*chld_cnt);
  675. for(int j=0;j<ker_dim[0]*ker_dim[1]*M_dim;j++){
  676. for(size_t k=0;k<chld_cnt*chld_cnt;k++){
  677. M[j][k*2+0]=M_ptr[k][j*2+0]/n3;
  678. M[j][k*2+1]=M_ptr[k][j*2+1]/n3;
  679. }
  680. }
  681. break;
  682. }
  683. case W_Type:
  684. {
  685. if(MultipoleOrder()==0) break;
  686. const int* ker_dim=kernel->k_m2t->ker_dim;
  687. std::vector<Real_t>& rel_trg_coord=mat->RelativeTrgCoord();
  688. // Coord of target points
  689. Real_t s=pow(0.5,level);
  690. size_t n_trg=rel_trg_coord.size()/3;
  691. std::vector<Real_t> trg_coord(n_trg*3);
  692. for(size_t j=0;j<n_trg*COORD_DIM;j++) trg_coord[j]=rel_trg_coord[j]*s;
  693. // Coord of downward equivalent surface
  694. int* coord2=interac_list.RelativeCoord(type,mat_indx);
  695. Real_t c[3]={(coord2[0]+1)*s*0.25,(coord2[1]+1)*s*0.25,(coord2[2]+1)*s*0.25};
  696. std::vector<Real_t> equiv_surf=u_equiv_surf(MultipoleOrder(),c,level+1);
  697. size_t n_eq=equiv_surf.size()/3;
  698. // Evaluate potential at target points due to equivalent surface.
  699. {
  700. M .Resize(n_eq*ker_dim [0],n_trg*ker_dim [1]);
  701. kernel->k_m2t->BuildMatrix(&equiv_surf[0], n_eq, &trg_coord[0], n_trg, &(M [0][0]));
  702. }
  703. break;
  704. }
  705. case X_Type:
  706. {
  707. break;
  708. }
  709. case BC_Type:
  710. {
  711. if(MultipoleOrder()==0) break;
  712. const int* ker_dim=kernel->k_m2l->ker_dim;
  713. size_t mat_cnt_m2m=interac_list.ListCount(U2U_Type);
  714. size_t n_surf=(6*(MultipoleOrder()-1)*(MultipoleOrder()-1)+2); //Total number of points.
  715. if((M.Dim(0)!=n_surf*ker_dim[0] || M.Dim(1)!=n_surf*ker_dim[1]) && level==0){
  716. Matrix<Real_t> M_m2m[BC_LEVELS+1];
  717. Matrix<Real_t> M_m2l[BC_LEVELS+1];
  718. Matrix<Real_t> M_l2l[BC_LEVELS+1];
  719. Matrix<Real_t> M_zero_avg(n_surf*ker_dim[0],n_surf*ker_dim[0]);
  720. { // Set average multipole charge to zero. (improves stability for large BC_LEVELS)
  721. M_zero_avg.SetZero();
  722. for(size_t i=0;i<n_surf*ker_dim[0];i++)
  723. M_zero_avg[i][i]+=1;
  724. for(size_t i=0;i<n_surf;i++)
  725. for(size_t j=0;j<n_surf;j++)
  726. for(size_t k=0;k<ker_dim[0];k++)
  727. M_zero_avg[i*ker_dim[0]+k][j*ker_dim[0]+k]-=1.0/n_surf;
  728. }
  729. for(int level=0; level>=-BC_LEVELS; level--){
  730. // Compute M_l2l
  731. M_l2l[-level] = this->Precomp(level, D2D_Type, 0);
  732. assert(M_l2l[-level].Dim(0)>0 && M_l2l[-level].Dim(1)>0);
  733. // Compute M_m2m
  734. for(size_t mat_indx=0; mat_indx<mat_cnt_m2m; mat_indx++){
  735. Matrix<Real_t> M=this->Precomp(level, U2U_Type, mat_indx);
  736. assert(M.Dim(0)>0 && M_m2m>0);
  737. if(mat_indx==0) M_m2m[-level] = M_zero_avg*M;
  738. else M_m2m[-level] += M_zero_avg*M;
  739. }
  740. // Compute M_m2l
  741. if(!Homogen() || level==0){
  742. Real_t s=(1UL<<(-level));
  743. Real_t ue_coord[3]={0,0,0};
  744. Real_t dc_coord[3]={0,0,0};
  745. std::vector<Real_t> src_coord=u_equiv_surf(MultipoleOrder(), ue_coord, level);
  746. std::vector<Real_t> trg_coord=d_check_surf(MultipoleOrder(), dc_coord, level);
  747. Matrix<Real_t> M_ue2dc(n_surf*ker_dim[0], n_surf*ker_dim[1]);
  748. M_ue2dc.SetZero();
  749. for(int x0=-2;x0<4;x0++)
  750. for(int x1=-2;x1<4;x1++)
  751. for(int x2=-2;x2<4;x2++)
  752. if(abs(x0)>1 || abs(x1)>1 || abs(x2)>1){
  753. ue_coord[0]=x0*s; ue_coord[1]=x1*s; ue_coord[2]=x2*s;
  754. std::vector<Real_t> src_coord=u_equiv_surf(MultipoleOrder(), ue_coord, level);
  755. Matrix<Real_t> M_tmp(n_surf*ker_dim[0], n_surf*ker_dim[1]);
  756. kernel->k_m2l->BuildMatrix(&src_coord[0], n_surf,
  757. &trg_coord[0], n_surf, &(M_tmp[0][0]));
  758. M_ue2dc+=M_tmp;
  759. }
  760. // Shift by constant.
  761. for(size_t i=0;i<M_ue2dc.Dim(0);i++){
  762. std::vector<Real_t> avg(ker_dim[1],0);
  763. for(size_t j=0; j<M_ue2dc.Dim(1); j+=ker_dim[1])
  764. for(int k=0; k<ker_dim[1]; k++) avg[k]+=M_ue2dc[i][j+k];
  765. for(int k=0; k<ker_dim[1]; k++) avg[k]/=n_surf;
  766. for(size_t j=0; j<M_ue2dc.Dim(1); j+=ker_dim[1])
  767. for(int k=0; k<ker_dim[1]; k++) M_ue2dc[i][j+k]-=avg[k];
  768. }
  769. Matrix<Real_t>& M_dc2de = Precomp(level, DC2DE_Type, 0);
  770. M_m2l[-level]=M_ue2dc*M_dc2de;
  771. }else M_m2l[-level]=M_m2l[-level-1];
  772. }
  773. for(int level=-BC_LEVELS;level<=0;level++){
  774. if(level==-BC_LEVELS) M = M_m2l[-level];
  775. else M = M_m2l[-level] + M_m2m[-level]*M*M_l2l[-level];
  776. { // Shift by constant. (improves stability for large BC_LEVELS)
  777. Matrix<Real_t> M_de2dc(n_surf*ker_dim[0], n_surf*ker_dim[1]);
  778. { // M_de2dc TODO: For homogeneous kernels, compute only once.
  779. // Coord of downward check surface
  780. Real_t c[3]={0,0,0};
  781. int level_=(Homogen()?0:level);
  782. std::vector<Real_t> check_surf=d_check_surf(MultipoleOrder(),c,level_);
  783. size_t n_ch=check_surf.size()/3;
  784. // Coord of downward equivalent surface
  785. std::vector<Real_t> equiv_surf=d_equiv_surf(MultipoleOrder(),c,level_);
  786. size_t n_eq=equiv_surf.size()/3;
  787. // Evaluate potential at check surface due to equivalent surface.
  788. kernel->k_m2l->BuildMatrix(&equiv_surf[0], n_eq,
  789. &check_surf[0], n_ch, &(M_de2dc[0][0]));
  790. }
  791. Matrix<Real_t> M_ue2dc=M*M_de2dc;
  792. for(size_t i=0;i<M_ue2dc.Dim(0);i++){
  793. std::vector<Real_t> avg(ker_dim[1],0);
  794. for(size_t j=0; j<M_ue2dc.Dim(1); j+=ker_dim[1])
  795. for(int k=0; k<ker_dim[1]; k++) avg[k]+=M_ue2dc[i][j+k];
  796. for(int k=0; k<ker_dim[1]; k++) avg[k]/=n_surf;
  797. for(size_t j=0; j<M_ue2dc.Dim(1); j+=ker_dim[1])
  798. for(int k=0; k<ker_dim[1]; k++) M_ue2dc[i][j+k]-=avg[k];
  799. }
  800. Matrix<Real_t>& M_dc2de = Precomp(level, DC2DE_Type, 0);
  801. M=M_ue2dc*M_dc2de;
  802. }
  803. }
  804. { // ax+by+cz+d correction.
  805. std::vector<Real_t> corner_pts;
  806. corner_pts.push_back(0); corner_pts.push_back(0); corner_pts.push_back(0);
  807. corner_pts.push_back(1); corner_pts.push_back(0); corner_pts.push_back(0);
  808. corner_pts.push_back(0); corner_pts.push_back(1); corner_pts.push_back(0);
  809. corner_pts.push_back(0); corner_pts.push_back(0); corner_pts.push_back(1);
  810. size_t n_corner=corner_pts.size()/3;
  811. // Coord of downward equivalent surface
  812. Real_t c[3]={0,0,0};
  813. std::vector<Real_t> up_equiv_surf=u_equiv_surf(MultipoleOrder(),c,0);
  814. std::vector<Real_t> dn_equiv_surf=d_equiv_surf(MultipoleOrder(),c,0);
  815. std::vector<Real_t> dn_check_surf=d_check_surf(MultipoleOrder(),c,0);
  816. Matrix<Real_t> M_err;
  817. { // Evaluate potential at corner due to upward and dnward equivalent surface.
  818. { // Error from local expansion.
  819. Matrix<Real_t> M_e2pt(n_surf*ker_dim[0],n_corner*ker_dim[1]);
  820. kernel->k_m2l->BuildMatrix(&dn_equiv_surf[0], n_surf,
  821. &corner_pts[0], n_corner, &(M_e2pt[0][0]));
  822. M_err=M*M_e2pt;
  823. }
  824. for(size_t k=0;k<4;k++){ // Error from colleagues of root.
  825. for(int j0=-1;j0<=1;j0++)
  826. for(int j1=-1;j1<=1;j1++)
  827. for(int j2=-1;j2<=1;j2++){
  828. Real_t pt_coord[3]={corner_pts[k*COORD_DIM+0]-j0,
  829. corner_pts[k*COORD_DIM+1]-j1,
  830. corner_pts[k*COORD_DIM+2]-j2};
  831. if(fabs(pt_coord[0]-0.5)>1.0 || fabs(pt_coord[1]-0.5)>1.0 || fabs(pt_coord[2]-0.5)>1.0){
  832. Matrix<Real_t> M_e2pt(n_surf*ker_dim[0],ker_dim[1]);
  833. kernel->k_m2l->BuildMatrix(&up_equiv_surf[0], n_surf,
  834. &pt_coord[0], 1, &(M_e2pt[0][0]));
  835. for(size_t i=0;i<M_e2pt.Dim(0);i++)
  836. for(size_t j=0;j<M_e2pt.Dim(1);j++)
  837. M_err[i][k*ker_dim[1]+j]+=M_e2pt[i][j];
  838. }
  839. }
  840. }
  841. }
  842. Matrix<Real_t> M_grad(M_err.Dim(0),n_surf*ker_dim[1]);
  843. for(size_t i=0;i<M_err.Dim(0);i++)
  844. for(size_t k=0;k<ker_dim[1];k++)
  845. for(size_t j=0;j<n_surf;j++){
  846. M_grad[i][j*ker_dim[1]+k]=(M_err[i][0*ker_dim[1]+k] )*1.0 +
  847. (M_err[i][1*ker_dim[1]+k]-M_err[i][0*ker_dim[1]+k])*dn_check_surf[j*COORD_DIM+0]+
  848. (M_err[i][2*ker_dim[1]+k]-M_err[i][0*ker_dim[1]+k])*dn_check_surf[j*COORD_DIM+1]+
  849. (M_err[i][3*ker_dim[1]+k]-M_err[i][0*ker_dim[1]+k])*dn_check_surf[j*COORD_DIM+2];
  850. }
  851. Matrix<Real_t>& M_dc2de = Precomp(0, DC2DE_Type, 0);
  852. M-=M_grad*M_dc2de;
  853. }
  854. { // Free memory
  855. Mat_Type type=D2D_Type;
  856. for(int l=-BC_LEVELS;l<0;l++)
  857. for(size_t indx=0;indx<this->interac_list.ListCount(type);indx++){
  858. Matrix<Real_t>& M=this->mat->Mat(l, type, indx);
  859. M.Resize(0,0);
  860. }
  861. type=U2U_Type;
  862. for(int l=-BC_LEVELS;l<0;l++)
  863. for(size_t indx=0;indx<this->interac_list.ListCount(type);indx++){
  864. Matrix<Real_t>& M=this->mat->Mat(l, type, indx);
  865. M.Resize(0,0);
  866. }
  867. type=DC2DE_Type;
  868. for(int l=-BC_LEVELS;l<0;l++)
  869. for(size_t indx=0;indx<this->interac_list.ListCount(type);indx++){
  870. Matrix<Real_t>& M=this->mat->Mat(l, type, indx);
  871. M.Resize(0,0);
  872. }
  873. type=UC2UE_Type;
  874. for(int l=-BC_LEVELS;l<0;l++)
  875. for(size_t indx=0;indx<this->interac_list.ListCount(type);indx++){
  876. Matrix<Real_t>& M=this->mat->Mat(l, type, indx);
  877. M.Resize(0,0);
  878. }
  879. }
  880. }
  881. break;
  882. }
  883. default:
  884. break;
  885. }
  886. //Save the matrix for future use.
  887. #pragma omp critical (PRECOMP_MATRIX_PTS)
  888. if(M_.Dim(0)==0 && M_.Dim(1)==0){
  889. M_=M;
  890. /*
  891. M_.Resize(M.Dim(0),M.Dim(1));
  892. int dof=ker_dim[0]*ker_dim[1];
  893. for(int j=0;j<dof;j++){
  894. size_t a=(M.Dim(0)*M.Dim(1)* j )/dof;
  895. size_t b=(M.Dim(0)*M.Dim(1)*(j+1))/dof;
  896. #pragma omp parallel for // NUMA
  897. for(int tid=0;tid<omp_p;tid++){
  898. size_t a_=a+((b-a)* tid )/omp_p;
  899. size_t b_=a+((b-a)*(tid+1))/omp_p;
  900. mem::memcopy(&M_[0][a_], &M[0][a_], (b_-a_)*sizeof(Real_t));
  901. }
  902. }
  903. */
  904. }
  905. return M_;
  906. }
  907. template <class FMMNode>
  908. void FMM_Pts<FMMNode>::PrecompAll(Mat_Type type, int level){
  909. if(level==-1){
  910. for(int l=0;l<MAX_DEPTH;l++){
  911. PrecompAll(type, l);
  912. }
  913. return;
  914. }
  915. //Compute basic permutations.
  916. for(size_t i=0;i<Perm_Count;i++)
  917. this->PrecompPerm(type, (Perm_Type) i);
  918. {
  919. //Allocate matrices.
  920. size_t mat_cnt=interac_list.ListCount((Mat_Type)type);
  921. mat->Mat(level, (Mat_Type)type, mat_cnt-1);
  922. { // Compute InteracClass matrices.
  923. std::vector<size_t> indx_lst;
  924. for(size_t i=0; i<mat_cnt; i++){
  925. if(interac_list.InteracClass((Mat_Type)type,i)==i)
  926. indx_lst.push_back(i);
  927. }
  928. //Compute Transformations.
  929. //#pragma omp parallel for //lets use fine grained parallelism
  930. for(size_t i=0; i<indx_lst.size(); i++){
  931. Precomp(level, (Mat_Type)type, indx_lst[i]);
  932. }
  933. }
  934. //#pragma omp parallel for //lets use fine grained parallelism
  935. for(size_t mat_indx=0;mat_indx<mat_cnt;mat_indx++){
  936. Matrix<Real_t>& M0=interac_list.ClassMat(level,(Mat_Type)type,mat_indx);
  937. Permutation<Real_t>& pr=interac_list.Perm_R(level, (Mat_Type)type, mat_indx);
  938. Permutation<Real_t>& pc=interac_list.Perm_C(level, (Mat_Type)type, mat_indx);
  939. if(pr.Dim()!=M0.Dim(0) || pc.Dim()!=M0.Dim(1)) Precomp(level, (Mat_Type)type, mat_indx);
  940. }
  941. }
  942. }
  943. template <class FMMNode>
  944. void FMM_Pts<FMMNode>::CollectNodeData(std::vector<FMMNode*>& node, std::vector<Matrix<Real_t> >& buff_list, std::vector<Vector<FMMNode_t*> >& n_list, std::vector<std::vector<Vector<Real_t>* > > vec_list){
  945. if(buff_list.size()<7) buff_list.resize(7);
  946. if( n_list.size()<7) n_list.resize(7);
  947. if( vec_list.size()<7) vec_list.resize(7);
  948. int omp_p=omp_get_max_threads();
  949. if(node.size()==0) return;
  950. {// 0. upward_equiv
  951. int indx=0;
  952. size_t vec_sz;
  953. { // Set vec_sz
  954. Matrix<Real_t>& M_uc2ue = this->interac_list.ClassMat(0, UC2UE_Type, 0);
  955. vec_sz=M_uc2ue.Dim(1);
  956. }
  957. std::vector< FMMNode* > node_lst;
  958. {// Construct node_lst
  959. node_lst.clear();
  960. std::vector<std::vector< FMMNode* > > node_lst_(MAX_DEPTH+1);
  961. FMMNode_t* r_node=NULL;
  962. for(size_t i=0;i<node.size();i++){
  963. if(!node[i]->IsLeaf())
  964. node_lst_[node[i]->Depth()].push_back(node[i]);
  965. if(node[i]->Depth()==0) r_node=node[i];
  966. }
  967. size_t chld_cnt=1UL<<COORD_DIM;
  968. for(int i=0;i<=MAX_DEPTH;i++){
  969. for(size_t j=0;j<node_lst_[i].size();j++){
  970. for(size_t k=0;k<chld_cnt;k++){
  971. FMMNode_t* node=(FMMNode_t*)node_lst_[i][j]->Child(k);
  972. node_lst.push_back(node);
  973. }
  974. }
  975. }
  976. if(r_node!=NULL) node_lst.push_back(r_node);
  977. n_list[indx]=node_lst;
  978. }
  979. std::vector<Vector<Real_t>*>& vec_lst=vec_list[indx];
  980. for(size_t i=0;i<node_lst.size();i++){ // Construct vec_lst
  981. FMMNode_t* node=node_lst[i];
  982. Vector<Real_t>& data_vec=node->FMMData()->upward_equiv;
  983. if(data_vec.Dim()!=vec_sz) data_vec.ReInit(vec_sz);
  984. vec_lst.push_back(&data_vec);
  985. }
  986. }
  987. {// 1. dnward_equiv
  988. int indx=1;
  989. size_t vec_sz;
  990. { // Set vec_sz
  991. Matrix<Real_t>& M_dc2de = this->interac_list.ClassMat(0, DC2DE_Type, 0);
  992. vec_sz=M_dc2de.Dim(1);
  993. }
  994. std::vector< FMMNode* > node_lst;
  995. {// Construct node_lst
  996. node_lst.clear();
  997. std::vector<std::vector< FMMNode* > > node_lst_(MAX_DEPTH+1);
  998. FMMNode_t* r_node=NULL;
  999. for(size_t i=0;i<node.size();i++){
  1000. if(!node[i]->IsLeaf())
  1001. node_lst_[node[i]->Depth()].push_back(node[i]);
  1002. if(node[i]->Depth()==0) r_node=node[i];
  1003. }
  1004. size_t chld_cnt=1UL<<COORD_DIM;
  1005. for(int i=0;i<=MAX_DEPTH;i++){
  1006. for(size_t j=0;j<node_lst_[i].size();j++){
  1007. for(size_t k=0;k<chld_cnt;k++){
  1008. FMMNode_t* node=(FMMNode_t*)node_lst_[i][j]->Child(k);
  1009. node_lst.push_back(node);
  1010. }
  1011. }
  1012. }
  1013. if(r_node!=NULL) node_lst.push_back(r_node);
  1014. n_list[indx]=node_lst;
  1015. }
  1016. std::vector<Vector<Real_t>*>& vec_lst=vec_list[indx];
  1017. for(size_t i=0;i<node_lst.size();i++){ // Construct vec_lst
  1018. FMMNode_t* node=node_lst[i];
  1019. Vector<Real_t>& data_vec=node->FMMData()->dnward_equiv;
  1020. if(data_vec.Dim()!=vec_sz) data_vec.ReInit(vec_sz);
  1021. vec_lst.push_back(&data_vec);
  1022. }
  1023. }
  1024. {// 2. upward_equiv_fft
  1025. int indx=2;
  1026. std::vector< FMMNode* > node_lst;
  1027. {
  1028. std::vector<std::vector< FMMNode* > > node_lst_(MAX_DEPTH+1);
  1029. for(size_t i=0;i<node.size();i++)
  1030. if(!node[i]->IsLeaf())
  1031. node_lst_[node[i]->Depth()].push_back(node[i]);
  1032. for(int i=0;i<=MAX_DEPTH;i++)
  1033. for(size_t j=0;j<node_lst_[i].size();j++)
  1034. node_lst.push_back(node_lst_[i][j]);
  1035. }
  1036. n_list[indx]=node_lst;
  1037. }
  1038. {// 3. dnward_check_fft
  1039. int indx=3;
  1040. std::vector< FMMNode* > node_lst;
  1041. {
  1042. std::vector<std::vector< FMMNode* > > node_lst_(MAX_DEPTH+1);
  1043. for(size_t i=0;i<node.size();i++)
  1044. if(!node[i]->IsLeaf() && !node[i]->IsGhost())
  1045. node_lst_[node[i]->Depth()].push_back(node[i]);
  1046. for(int i=0;i<=MAX_DEPTH;i++)
  1047. for(size_t j=0;j<node_lst_[i].size();j++)
  1048. node_lst.push_back(node_lst_[i][j]);
  1049. }
  1050. n_list[indx]=node_lst;
  1051. }
  1052. {// 4. src_val
  1053. int indx=4;
  1054. int src_dof=kernel->ker_dim[0];
  1055. int surf_dof=COORD_DIM+src_dof;
  1056. std::vector< FMMNode* > node_lst;
  1057. for(size_t i=0;i<node.size();i++){// Construct node_lst
  1058. if(node[i]->IsLeaf()){
  1059. node_lst.push_back(node[i]);
  1060. }
  1061. }
  1062. n_list[indx]=node_lst;
  1063. std::vector<Vector<Real_t>*>& vec_lst=vec_list[indx];
  1064. for(size_t i=0;i<node_lst.size();i++){ // Construct vec_lst
  1065. FMMNode_t* node=node_lst[i];
  1066. { // src_value
  1067. Vector<Real_t>& data_vec=node->src_value;
  1068. size_t vec_sz=(node->src_coord.Dim()/COORD_DIM)*src_dof;
  1069. if(data_vec.Dim()!=vec_sz) data_vec.ReInit(vec_sz);
  1070. vec_lst.push_back(&data_vec);
  1071. }
  1072. { // surf_value
  1073. Vector<Real_t>& data_vec=node->surf_value;
  1074. size_t vec_sz=(node->surf_coord.Dim()/COORD_DIM)*surf_dof;
  1075. if(data_vec.Dim()!=vec_sz) data_vec.ReInit(vec_sz);
  1076. vec_lst.push_back(&data_vec);
  1077. }
  1078. }
  1079. }
  1080. {// 5. trg_val
  1081. int indx=5;
  1082. int trg_dof=kernel->ker_dim[1];
  1083. std::vector< FMMNode* > node_lst;
  1084. for(size_t i=0;i<node.size();i++){// Construct node_lst
  1085. if(node[i]->IsLeaf() && !node[i]->IsGhost()){
  1086. node_lst.push_back(node[i]);
  1087. }
  1088. }
  1089. n_list[indx]=node_lst;
  1090. std::vector<Vector<Real_t>*>& vec_lst=vec_list[indx];
  1091. for(size_t i=0;i<node_lst.size();i++){ // Construct vec_lst
  1092. FMMNode_t* node=node_lst[i];
  1093. { // trg_value
  1094. Vector<Real_t>& data_vec=node->trg_value;
  1095. size_t vec_sz=(node->trg_coord.Dim()/COORD_DIM)*trg_dof;
  1096. if(data_vec.Dim()!=vec_sz) data_vec.ReInit(vec_sz);
  1097. vec_lst.push_back(&data_vec);
  1098. }
  1099. }
  1100. }
  1101. {// 6. pts_coord
  1102. int indx=6;
  1103. std::vector< FMMNode* > node_lst;
  1104. for(size_t i=0;i<node.size();i++){// Construct node_lst
  1105. if(node[i]->IsLeaf()){
  1106. node_lst.push_back(node[i]);
  1107. }
  1108. }
  1109. n_list[indx]=node_lst;
  1110. std::vector<Vector<Real_t>*>& vec_lst=vec_list[indx];
  1111. for(size_t i=0;i<node_lst.size();i++){ // Construct vec_lst
  1112. FMMNode_t* node=node_lst[i];
  1113. { // src_coord
  1114. Vector<Real_t>& data_vec=node->src_coord;
  1115. vec_lst.push_back(&data_vec);
  1116. }
  1117. { // surf_coord
  1118. Vector<Real_t>& data_vec=node->surf_coord;
  1119. vec_lst.push_back(&data_vec);
  1120. }
  1121. { // trg_coord
  1122. Vector<Real_t>& data_vec=node->trg_coord;
  1123. vec_lst.push_back(&data_vec);
  1124. }
  1125. }
  1126. { // check and equiv surfaces.
  1127. if(upwd_check_surf.size()==0){
  1128. size_t m=MultipoleOrder();
  1129. upwd_check_surf.resize(MAX_DEPTH);
  1130. upwd_equiv_surf.resize(MAX_DEPTH);
  1131. dnwd_check_surf.resize(MAX_DEPTH);
  1132. dnwd_equiv_surf.resize(MAX_DEPTH);
  1133. for(size_t depth=0;depth<MAX_DEPTH;depth++){
  1134. Real_t c[3]={0.0,0.0,0.0};
  1135. upwd_check_surf[depth].ReInit((6*(m-1)*(m-1)+2)*COORD_DIM);
  1136. upwd_equiv_surf[depth].ReInit((6*(m-1)*(m-1)+2)*COORD_DIM);
  1137. dnwd_check_surf[depth].ReInit((6*(m-1)*(m-1)+2)*COORD_DIM);
  1138. dnwd_equiv_surf[depth].ReInit((6*(m-1)*(m-1)+2)*COORD_DIM);
  1139. upwd_check_surf[depth]=u_check_surf(m,c,depth);
  1140. upwd_equiv_surf[depth]=u_equiv_surf(m,c,depth);
  1141. dnwd_check_surf[depth]=d_check_surf(m,c,depth);
  1142. dnwd_equiv_surf[depth]=d_equiv_surf(m,c,depth);
  1143. }
  1144. }
  1145. for(size_t depth=0;depth<MAX_DEPTH;depth++){
  1146. vec_lst.push_back(&upwd_check_surf[depth]);
  1147. vec_lst.push_back(&upwd_equiv_surf[depth]);
  1148. vec_lst.push_back(&dnwd_check_surf[depth]);
  1149. vec_lst.push_back(&dnwd_equiv_surf[depth]);
  1150. }
  1151. }
  1152. }
  1153. // Create extra auxiliary buffer.
  1154. if(buff_list.size()<=vec_list.size()) buff_list.resize(vec_list.size()+1);
  1155. for(size_t indx=0;indx<vec_list.size();indx++){ // Resize buffer
  1156. Matrix<Real_t>& aux_buff=buff_list[vec_list.size()];
  1157. Matrix<Real_t>& buff=buff_list[indx];
  1158. std::vector<Vector<Real_t>*>& vec_lst= vec_list[indx];
  1159. bool keep_data=(indx==4 || indx==6);
  1160. size_t n_vec=vec_lst.size();
  1161. { // Continue if nothing to be done.
  1162. if(!n_vec) continue;
  1163. if(buff.Dim(0)*buff.Dim(1)>0){
  1164. bool init_buff=false;
  1165. Real_t* buff_start=&buff[0][0];
  1166. Real_t* buff_end=&buff[0][0]+buff.Dim(0)*buff.Dim(1);
  1167. #pragma omp parallel for reduction(||:init_buff)
  1168. for(size_t i=0;i<n_vec;i++){
  1169. if(&(*vec_lst[i])[0]<buff_start || &(*vec_lst[i])[0]>=buff_end){
  1170. init_buff=true;
  1171. }
  1172. }
  1173. if(!init_buff) continue;
  1174. }
  1175. }
  1176. std::vector<size_t> vec_size(n_vec);
  1177. std::vector<size_t> vec_disp(n_vec);
  1178. if(n_vec){ // Set vec_size and vec_disp
  1179. #pragma omp parallel for
  1180. for(size_t i=0;i<n_vec;i++){ // Set vec_size
  1181. vec_size[i]=vec_lst[i]->Dim();
  1182. }
  1183. vec_disp[0]=0;
  1184. omp_par::scan(&vec_size[0],&vec_disp[0],n_vec);
  1185. }
  1186. size_t buff_size=vec_size[n_vec-1]+vec_disp[n_vec-1];
  1187. if(keep_data){ // Copy to aux_buff
  1188. if(aux_buff.Dim(0)*aux_buff.Dim(1)<buff_size){ // Resize aux_buff
  1189. aux_buff.ReInit(1,buff_size*1.05);
  1190. }
  1191. #pragma omp parallel for schedule(dynamic)
  1192. for(size_t i=0;i<n_vec;i++){
  1193. mem::memcopy(&aux_buff[0][0]+vec_disp[i],&(*vec_lst[i])[0],vec_size[i]*sizeof(Real_t));
  1194. }
  1195. }
  1196. if(buff.Dim(0)*buff.Dim(1)<buff_size){ // Resize buff
  1197. buff.ReInit(1,buff_size*1.05);
  1198. }
  1199. if(keep_data){ // Copy to buff (from aux_buff)
  1200. #pragma omp parallel for
  1201. for(size_t tid=0;tid<omp_p;tid++){
  1202. size_t a=(buff_size*(tid+0))/omp_p;
  1203. size_t b=(buff_size*(tid+1))/omp_p;
  1204. mem::memcopy(&buff[0][0]+a,&aux_buff[0][0]+a,(b-a)*sizeof(Real_t));
  1205. }
  1206. }
  1207. #pragma omp parallel for
  1208. for(size_t i=0;i<n_vec;i++){ // ReInit vectors
  1209. vec_lst[i]->ReInit(vec_size[i],&buff[0][0]+vec_disp[i],false);
  1210. }
  1211. }
  1212. }
  1213. template <class FMMNode>
  1214. void FMM_Pts<FMMNode>::SetupPrecomp(SetupData<Real_t>& setup_data, bool device){
  1215. if(setup_data.precomp_data==NULL || setup_data.level>MAX_DEPTH) return;
  1216. Profile::Tic("SetupPrecomp",&this->comm,true,25);
  1217. { // Build precomp_data
  1218. size_t precomp_offset=0;
  1219. int level=setup_data.level;
  1220. Matrix<char>& precomp_data=*setup_data.precomp_data;
  1221. std::vector<Mat_Type>& interac_type_lst=setup_data.interac_type;
  1222. for(size_t type_indx=0; type_indx<interac_type_lst.size(); type_indx++){
  1223. Mat_Type& interac_type=interac_type_lst[type_indx];
  1224. this->PrecompAll(interac_type, level); // Compute matrices.
  1225. precomp_offset=this->mat->CompactData(level, interac_type, precomp_data, precomp_offset);
  1226. }
  1227. }
  1228. Profile::Toc();
  1229. if(device){ // Host2Device
  1230. Profile::Tic("Host2Device",&this->comm,false,25);
  1231. setup_data.precomp_data->AllocDevice(true);
  1232. Profile::Toc();
  1233. }
  1234. }
  1235. template <class FMMNode>
  1236. void FMM_Pts<FMMNode>::SetupInterac(SetupData<Real_t>& setup_data, bool device){
  1237. int level=setup_data.level;
  1238. std::vector<Mat_Type>& interac_type_lst=setup_data.interac_type;
  1239. std::vector<void*>& nodes_in =setup_data.nodes_in ;
  1240. std::vector<void*>& nodes_out=setup_data.nodes_out;
  1241. Matrix<Real_t>& input_data=*setup_data. input_data;
  1242. Matrix<Real_t>& output_data=*setup_data.output_data;
  1243. std::vector<Vector<Real_t>*>& input_vector=setup_data. input_vector;
  1244. std::vector<Vector<Real_t>*>& output_vector=setup_data.output_vector;
  1245. size_t n_in =nodes_in .size();
  1246. size_t n_out=nodes_out.size();
  1247. // Setup precomputed data.
  1248. if(setup_data.precomp_data->Dim(0)*setup_data.precomp_data->Dim(1)==0) SetupPrecomp(setup_data,device);
  1249. // Build interac_data
  1250. Profile::Tic("Interac-Data",&this->comm,true,25);
  1251. Matrix<char>& interac_data=setup_data.interac_data;
  1252. if(n_out>0 && n_in >0){ // Build precomp_data, interac_data
  1253. std::vector<size_t> interac_mat;
  1254. std::vector<size_t> interac_cnt;
  1255. std::vector<size_t> interac_blk;
  1256. std::vector<size_t> input_perm;
  1257. std::vector<size_t> output_perm;
  1258. size_t dof=0, M_dim0=0, M_dim1=0;
  1259. size_t precomp_offset=0;
  1260. size_t buff_size=DEVICE_BUFFER_SIZE*1024l*1024l;
  1261. for(size_t type_indx=0; type_indx<interac_type_lst.size(); type_indx++){
  1262. Mat_Type& interac_type=interac_type_lst[type_indx];
  1263. size_t mat_cnt=this->interac_list.ListCount(interac_type);
  1264. Matrix<size_t> precomp_data_offset;
  1265. { // Load precomp_data for interac_type.
  1266. struct HeaderData{
  1267. size_t total_size;
  1268. size_t level;
  1269. size_t mat_cnt ;
  1270. size_t max_depth;
  1271. };
  1272. Matrix<char>& precomp_data=*setup_data.precomp_data;
  1273. char* indx_ptr=precomp_data[0]+precomp_offset;
  1274. HeaderData& header=*(HeaderData*)indx_ptr;indx_ptr+=sizeof(HeaderData);
  1275. precomp_data_offset.ReInit(header.mat_cnt,(1+(2+2)*header.max_depth), (size_t*)indx_ptr, false);
  1276. precomp_offset+=header.total_size;
  1277. }
  1278. Matrix<FMMNode*> src_interac_list(n_in ,mat_cnt); src_interac_list.SetZero();
  1279. Matrix<FMMNode*> trg_interac_list(n_out,mat_cnt); trg_interac_list.SetZero();
  1280. { // Build trg_interac_list
  1281. #pragma omp parallel for
  1282. for(size_t i=0;i<n_out;i++){
  1283. if(!((FMMNode*)nodes_out[i])->IsGhost() && (level==-1 || ((FMMNode*)nodes_out[i])->Depth()==level)){
  1284. std::vector<FMMNode*>& lst=((FMMNode*)nodes_out[i])->interac_list[interac_type];
  1285. mem::memcopy(&trg_interac_list[i][0], &lst[0], lst.size()*sizeof(FMMNode*));
  1286. assert(lst.size()==mat_cnt);
  1287. }
  1288. }
  1289. }
  1290. { // Build src_interac_list
  1291. #pragma omp parallel for
  1292. for(size_t i=0;i<n_in ;i++) ((FMMNode*)nodes_in [i])->node_id=i;
  1293. #pragma omp parallel for
  1294. for(size_t i=0;i<n_out;i++){
  1295. for(size_t j=0;j<mat_cnt;j++)
  1296. if(trg_interac_list[i][j]!=NULL){
  1297. src_interac_list[trg_interac_list[i][j]->node_id][j]=(FMMNode*)nodes_out[i];
  1298. }
  1299. }
  1300. }
  1301. Matrix<size_t> interac_dsp(n_out,mat_cnt);
  1302. std::vector<size_t> interac_blk_dsp(1,0);
  1303. { // Determine dof, M_dim0, M_dim1
  1304. dof=1;
  1305. Matrix<Real_t>& M0 = this->interac_list.ClassMat(level, interac_type_lst[0], 0);
  1306. M_dim0=M0.Dim(0); M_dim1=M0.Dim(1);
  1307. }
  1308. { // Determine interaction blocks which fit in memory.
  1309. size_t vec_size=(M_dim0+M_dim1)*sizeof(Real_t)*dof;
  1310. for(size_t j=0;j<mat_cnt;j++){// Determine minimum buff_size
  1311. size_t vec_cnt=0;
  1312. for(size_t i=0;i<n_out;i++){
  1313. if(trg_interac_list[i][j]!=NULL) vec_cnt++;
  1314. }
  1315. if(buff_size<vec_cnt*vec_size)
  1316. buff_size=vec_cnt*vec_size;
  1317. }
  1318. size_t interac_dsp_=0;
  1319. for(size_t j=0;j<mat_cnt;j++){
  1320. for(size_t i=0;i<n_out;i++){
  1321. interac_dsp[i][j]=interac_dsp_;
  1322. if(trg_interac_list[i][j]!=NULL) interac_dsp_++;
  1323. }
  1324. if(interac_dsp_*vec_size>buff_size) // Comment to disable symmetries.
  1325. {
  1326. interac_blk.push_back(j-interac_blk_dsp.back());
  1327. interac_blk_dsp.push_back(j);
  1328. size_t offset=interac_dsp[0][j];
  1329. for(size_t i=0;i<n_out;i++) interac_dsp[i][j]-=offset;
  1330. interac_dsp_-=offset;
  1331. assert(interac_dsp_*vec_size<=buff_size); // Problem too big for buff_size.
  1332. }
  1333. interac_mat.push_back(precomp_data_offset[this->interac_list.InteracClass(interac_type,j)][0]);
  1334. interac_cnt.push_back(interac_dsp_-interac_dsp[0][j]);
  1335. }
  1336. interac_blk.push_back(mat_cnt-interac_blk_dsp.back());
  1337. interac_blk_dsp.push_back(mat_cnt);
  1338. }
  1339. { // Determine input_perm.
  1340. size_t vec_size=M_dim0*dof;
  1341. for(size_t i=0;i<n_out;i++) ((FMMNode*)nodes_out[i])->node_id=i;
  1342. for(size_t k=1;k<interac_blk_dsp.size();k++){
  1343. for(size_t i=0;i<n_in ;i++){
  1344. for(size_t j=interac_blk_dsp[k-1];j<interac_blk_dsp[k];j++){
  1345. FMMNode_t* trg_node=src_interac_list[i][j];
  1346. if(trg_node!=NULL){
  1347. size_t depth=(this->Homogen()?trg_node->Depth():0);
  1348. input_perm .push_back(precomp_data_offset[j][1+4*depth+0]); // prem
  1349. input_perm .push_back(precomp_data_offset[j][1+4*depth+1]); // scal
  1350. input_perm .push_back(interac_dsp[trg_node->node_id][j]*vec_size*sizeof(Real_t)); // trg_ptr
  1351. input_perm .push_back((size_t)(& input_vector[i][0][0]- input_data[0])); // src_ptr
  1352. assert(input_vector[i]->Dim()==vec_size);
  1353. }
  1354. }
  1355. }
  1356. }
  1357. }
  1358. { // Determine output_perm
  1359. size_t vec_size=M_dim1*dof;
  1360. for(size_t k=1;k<interac_blk_dsp.size();k++){
  1361. for(size_t i=0;i<n_out;i++){
  1362. for(size_t j=interac_blk_dsp[k-1];j<interac_blk_dsp[k];j++){
  1363. if(trg_interac_list[i][j]!=NULL){
  1364. size_t depth=(this->Homogen()?((FMMNode*)nodes_out[i])->Depth():0);
  1365. output_perm.push_back(precomp_data_offset[j][1+4*depth+2]); // prem
  1366. output_perm.push_back(precomp_data_offset[j][1+4*depth+3]); // scal
  1367. output_perm.push_back(interac_dsp[ i ][j]*vec_size*sizeof(Real_t)); // src_ptr
  1368. output_perm.push_back((size_t)(&output_vector[i][0][0]-output_data[0])); // trg_ptr
  1369. assert(output_vector[i]->Dim()==vec_size);
  1370. }
  1371. }
  1372. }
  1373. }
  1374. }
  1375. }
  1376. if(this->dev_buffer.Dim()<buff_size) this->dev_buffer.ReInit(buff_size);
  1377. if(this->cpu_buffer.Dim()<buff_size) this->cpu_buffer.ReInit(buff_size);
  1378. { // Set interac_data.
  1379. size_t data_size=sizeof(size_t)*4;
  1380. data_size+=sizeof(size_t)+interac_blk.size()*sizeof(size_t);
  1381. data_size+=sizeof(size_t)+interac_cnt.size()*sizeof(size_t);
  1382. data_size+=sizeof(size_t)+interac_mat.size()*sizeof(size_t);
  1383. data_size+=sizeof(size_t)+ input_perm.size()*sizeof(size_t);
  1384. data_size+=sizeof(size_t)+output_perm.size()*sizeof(size_t);
  1385. if(interac_data.Dim(0)*interac_data.Dim(1)<sizeof(size_t)){
  1386. data_size+=sizeof(size_t);
  1387. interac_data.ReInit(1,data_size);
  1388. ((size_t*)&interac_data[0][0])[0]=sizeof(size_t);
  1389. }else{
  1390. size_t pts_data_size=*((size_t*)&interac_data[0][0]);
  1391. assert(interac_data.Dim(0)*interac_data.Dim(1)>=pts_data_size);
  1392. data_size+=pts_data_size;
  1393. if(data_size>interac_data.Dim(0)*interac_data.Dim(1)){ //Resize and copy interac_data.
  1394. Matrix< char> pts_interac_data=interac_data;
  1395. interac_data.ReInit(1,data_size);
  1396. mem::memcopy(&interac_data[0][0],&pts_interac_data[0][0],pts_data_size);
  1397. }
  1398. }
  1399. char* data_ptr=&interac_data[0][0];
  1400. data_ptr+=((size_t*)data_ptr)[0];
  1401. ((size_t*)data_ptr)[0]=data_size; data_ptr+=sizeof(size_t);
  1402. ((size_t*)data_ptr)[0]= M_dim0; data_ptr+=sizeof(size_t);
  1403. ((size_t*)data_ptr)[0]= M_dim1; data_ptr+=sizeof(size_t);
  1404. ((size_t*)data_ptr)[0]= dof; data_ptr+=sizeof(size_t);
  1405. ((size_t*)data_ptr)[0]=interac_blk.size(); data_ptr+=sizeof(size_t);
  1406. mem::memcopy(data_ptr, &interac_blk[0], interac_blk.size()*sizeof(size_t));
  1407. data_ptr+=interac_blk.size()*sizeof(size_t);
  1408. ((size_t*)data_ptr)[0]=interac_cnt.size(); data_ptr+=sizeof(size_t);
  1409. mem::memcopy(data_ptr, &interac_cnt[0], interac_cnt.size()*sizeof(size_t));
  1410. data_ptr+=interac_cnt.size()*sizeof(size_t);
  1411. ((size_t*)data_ptr)[0]=interac_mat.size(); data_ptr+=sizeof(size_t);
  1412. mem::memcopy(data_ptr, &interac_mat[0], interac_mat.size()*sizeof(size_t));
  1413. data_ptr+=interac_mat.size()*sizeof(size_t);
  1414. ((size_t*)data_ptr)[0]= input_perm.size(); data_ptr+=sizeof(size_t);
  1415. mem::memcopy(data_ptr, & input_perm[0], input_perm.size()*sizeof(size_t));
  1416. data_ptr+= input_perm.size()*sizeof(size_t);
  1417. ((size_t*)data_ptr)[0]=output_perm.size(); data_ptr+=sizeof(size_t);
  1418. mem::memcopy(data_ptr, &output_perm[0], output_perm.size()*sizeof(size_t));
  1419. data_ptr+=output_perm.size()*sizeof(size_t);
  1420. }
  1421. }
  1422. Profile::Toc();
  1423. if(device){ // Host2Device
  1424. Profile::Tic("Host2Device",&this->comm,false,25);
  1425. setup_data.interac_data .AllocDevice(true);
  1426. Profile::Toc();
  1427. }
  1428. }
  1429. #if defined(PVFMM_HAVE_CUDA)
  1430. #include <fmm_pts_gpu.hpp>
  1431. template <class Real_t, int SYNC>
  1432. void EvalListGPU(SetupData<Real_t>& setup_data, Vector<char>& dev_buffer, MPI_Comm& comm) {
  1433. cudaStream_t* stream = pvfmm::CUDA_Lock::acquire_stream();
  1434. Profile::Tic("Host2Device",&comm,false,25);
  1435. typename Matrix<char>::Device interac_data;
  1436. typename Vector<char>::Device buff;
  1437. typename Matrix<char>::Device precomp_data_d;
  1438. typename Matrix<char>::Device interac_data_d;
  1439. typename Matrix<Real_t>::Device input_data_d;
  1440. typename Matrix<Real_t>::Device output_data_d;
  1441. interac_data = setup_data.interac_data;
  1442. buff = dev_buffer. AllocDevice(false);
  1443. precomp_data_d= setup_data.precomp_data->AllocDevice(false);
  1444. interac_data_d= setup_data.interac_data. AllocDevice(false);
  1445. input_data_d = setup_data. input_data->AllocDevice(false);
  1446. output_data_d = setup_data. output_data->AllocDevice(false);
  1447. Profile::Toc();
  1448. Profile::Tic("DeviceComp",&comm,false,20);
  1449. { // Offloaded computation.
  1450. size_t data_size, M_dim0, M_dim1, dof;
  1451. Vector<size_t> interac_blk;
  1452. Vector<size_t> interac_cnt;
  1453. Vector<size_t> interac_mat;
  1454. Vector<size_t> input_perm_d;
  1455. Vector<size_t> output_perm_d;
  1456. { // Set interac_data.
  1457. char* data_ptr=&interac_data [0][0];
  1458. char* dev_ptr=&interac_data_d[0][0];
  1459. data_size=((size_t*)data_ptr)[0]; data_ptr+=data_size; dev_ptr += data_size;
  1460. data_size=((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t); dev_ptr += sizeof(size_t);
  1461. M_dim0 =((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t); dev_ptr += sizeof(size_t);
  1462. M_dim1 =((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t); dev_ptr += sizeof(size_t);
  1463. dof =((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t); dev_ptr += sizeof(size_t);
  1464. interac_blk.ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false);
  1465. data_ptr += sizeof(size_t) + sizeof(size_t)*interac_blk.Dim();
  1466. dev_ptr += sizeof(size_t) + sizeof(size_t)*interac_blk.Dim();
  1467. interac_cnt.ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false);
  1468. data_ptr += sizeof(size_t) + sizeof(size_t)*interac_cnt.Dim();
  1469. dev_ptr += sizeof(size_t) + sizeof(size_t)*interac_cnt.Dim();
  1470. interac_mat.ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false);
  1471. data_ptr += sizeof(size_t) + sizeof(size_t)*interac_mat.Dim();
  1472. dev_ptr += sizeof(size_t) + sizeof(size_t)*interac_mat.Dim();
  1473. input_perm_d.ReInit(((size_t*)data_ptr)[0],(size_t*)(dev_ptr+sizeof(size_t)),false);
  1474. data_ptr += sizeof(size_t) + sizeof(size_t)*input_perm_d.Dim();
  1475. dev_ptr += sizeof(size_t) + sizeof(size_t)*input_perm_d.Dim();
  1476. output_perm_d.ReInit(((size_t*)data_ptr)[0],(size_t*)(dev_ptr+sizeof(size_t)),false);
  1477. data_ptr += sizeof(size_t) + sizeof(size_t)*output_perm_d.Dim();
  1478. dev_ptr += sizeof(size_t) + sizeof(size_t)*output_perm_d.Dim();
  1479. }
  1480. { // interactions
  1481. size_t interac_indx = 0;
  1482. size_t interac_blk_dsp = 0;
  1483. cudaError_t error;
  1484. for (size_t k = 0; k < interac_blk.Dim(); k++) {
  1485. size_t vec_cnt=0;
  1486. for(size_t j=interac_blk_dsp;j<interac_blk_dsp+interac_blk[k];j++) vec_cnt+=interac_cnt[j];
  1487. if(vec_cnt==0){
  1488. //interac_indx += vec_cnt;
  1489. interac_blk_dsp += interac_blk[k];
  1490. continue;
  1491. }
  1492. char *buff_in_d =&buff[0];
  1493. char *buff_out_d =&buff[vec_cnt*dof*M_dim0*sizeof(Real_t)];
  1494. { // Input permutation.
  1495. in_perm_gpu<Real_t>(&precomp_data_d[0][0], &input_data_d[0][0], buff_in_d,
  1496. &input_perm_d[interac_indx*4], vec_cnt, M_dim0, stream);
  1497. }
  1498. size_t vec_cnt0 = 0;
  1499. for (size_t j = interac_blk_dsp; j < interac_blk_dsp + interac_blk[k];) {
  1500. size_t vec_cnt1 = 0;
  1501. size_t interac_mat0 = interac_mat[j];
  1502. for (; j < interac_blk_dsp + interac_blk[k] && interac_mat[j] == interac_mat0; j++) vec_cnt1 += interac_cnt[j];
  1503. Matrix<Real_t> M_d(M_dim0, M_dim1, (Real_t*)(precomp_data_d.dev_ptr + interac_mat0), false);
  1504. Matrix<Real_t> Ms_d(dof*vec_cnt1, M_dim0, (Real_t*)(buff_in_d + M_dim0*vec_cnt0*dof*sizeof(Real_t)), false);
  1505. Matrix<Real_t> Mt_d(dof*vec_cnt1, M_dim1, (Real_t*)(buff_out_d + M_dim1*vec_cnt0*dof*sizeof(Real_t)), false);
  1506. Matrix<Real_t>::CUBLASGEMM(Mt_d, Ms_d, M_d);
  1507. vec_cnt0 += vec_cnt1;
  1508. }
  1509. { // Output permutation.
  1510. out_perm_gpu<Real_t>(&precomp_data_d[0][0], &output_data_d[0][0], buff_out_d,
  1511. &output_perm_d[interac_indx*4], vec_cnt, M_dim1, stream);
  1512. }
  1513. interac_indx += vec_cnt;
  1514. interac_blk_dsp += interac_blk[k];
  1515. }
  1516. }
  1517. }
  1518. Profile::Toc();
  1519. if(SYNC) CUDA_Lock::wait();
  1520. }
  1521. #endif
  1522. template <class FMMNode>
  1523. template <int SYNC>
  1524. void FMM_Pts<FMMNode>::EvalList(SetupData<Real_t>& setup_data, bool device){
  1525. if(setup_data.interac_data.Dim(0)==0 || setup_data.interac_data.Dim(1)==0){
  1526. Profile::Tic("Host2Device",&this->comm,false,25);
  1527. Profile::Toc();
  1528. Profile::Tic("DeviceComp",&this->comm,false,20);
  1529. Profile::Toc();
  1530. return;
  1531. }
  1532. #if defined(PVFMM_HAVE_CUDA)
  1533. if (device) {
  1534. EvalListGPU<Real_t, SYNC>(setup_data, this->dev_buffer, this->comm);
  1535. return;
  1536. }
  1537. #endif
  1538. Profile::Tic("Host2Device",&this->comm,false,25);
  1539. typename Vector<char>::Device buff;
  1540. typename Matrix<char>::Device precomp_data;
  1541. typename Matrix<char>::Device interac_data;
  1542. typename Matrix<Real_t>::Device input_data;
  1543. typename Matrix<Real_t>::Device output_data;
  1544. if(device){
  1545. buff = this-> dev_buffer. AllocDevice(false);
  1546. precomp_data= setup_data.precomp_data->AllocDevice(false);
  1547. interac_data= setup_data.interac_data. AllocDevice(false);
  1548. input_data = setup_data. input_data->AllocDevice(false);
  1549. output_data = setup_data. output_data->AllocDevice(false);
  1550. }else{
  1551. buff = this-> cpu_buffer;
  1552. precomp_data=*setup_data.precomp_data;
  1553. interac_data= setup_data.interac_data;
  1554. input_data =*setup_data. input_data;
  1555. output_data =*setup_data. output_data;
  1556. }
  1557. Profile::Toc();
  1558. Profile::Tic("DeviceComp",&this->comm,false,20);
  1559. int lock_idx=-1;
  1560. int wait_lock_idx=-1;
  1561. if(device) wait_lock_idx=MIC_Lock::curr_lock();
  1562. if(device) lock_idx=MIC_Lock::get_lock();
  1563. #ifdef __INTEL_OFFLOAD
  1564. #pragma offload if(device) target(mic:0) signal(&MIC_Lock::lock_vec[device?lock_idx:0])
  1565. #endif
  1566. { // Offloaded computation.
  1567. // Set interac_data.
  1568. size_t data_size, M_dim0, M_dim1, dof;
  1569. Vector<size_t> interac_blk;
  1570. Vector<size_t> interac_cnt;
  1571. Vector<size_t> interac_mat;
  1572. Vector<size_t> input_perm;
  1573. Vector<size_t> output_perm;
  1574. { // Set interac_data.
  1575. char* data_ptr=&interac_data[0][0];
  1576. data_size=((size_t*)data_ptr)[0]; data_ptr+=data_size;
  1577. data_size=((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t);
  1578. M_dim0 =((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t);
  1579. M_dim1 =((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t);
  1580. dof =((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t);
  1581. interac_blk.ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false);
  1582. data_ptr+=sizeof(size_t)+interac_blk.Dim()*sizeof(size_t);
  1583. interac_cnt.ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false);
  1584. data_ptr+=sizeof(size_t)+interac_cnt.Dim()*sizeof(size_t);
  1585. interac_mat.ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false);
  1586. data_ptr+=sizeof(size_t)+interac_mat.Dim()*sizeof(size_t);
  1587. input_perm .ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false);
  1588. data_ptr+=sizeof(size_t)+ input_perm.Dim()*sizeof(size_t);
  1589. output_perm.ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false);
  1590. data_ptr+=sizeof(size_t)+output_perm.Dim()*sizeof(size_t);
  1591. }
  1592. if(device) MIC_Lock::wait_lock(wait_lock_idx);
  1593. //Compute interaction from Chebyshev source density.
  1594. { // interactions
  1595. int omp_p=omp_get_max_threads();
  1596. size_t interac_indx=0;
  1597. size_t interac_blk_dsp=0;
  1598. for(size_t k=0;k<interac_blk.Dim();k++){
  1599. size_t vec_cnt=0;
  1600. for(size_t j=interac_blk_dsp;j<interac_blk_dsp+interac_blk[k];j++) vec_cnt+=interac_cnt[j];
  1601. if(vec_cnt==0){
  1602. //interac_indx += vec_cnt;
  1603. interac_blk_dsp += interac_blk[k];
  1604. continue;
  1605. }
  1606. char* buff_in =&buff[0];
  1607. char* buff_out=&buff[vec_cnt*dof*M_dim0*sizeof(Real_t)];
  1608. // Input permutation.
  1609. #pragma omp parallel for
  1610. for(int tid=0;tid<omp_p;tid++){
  1611. size_t a=( tid *vec_cnt)/omp_p;
  1612. size_t b=((tid+1)*vec_cnt)/omp_p;
  1613. for(size_t i=a;i<b;i++){
  1614. const PERM_INT_T* perm=(PERM_INT_T*)(precomp_data[0]+input_perm[(interac_indx+i)*4+0]);
  1615. const Real_t* scal=( Real_t*)(precomp_data[0]+input_perm[(interac_indx+i)*4+1]);
  1616. const Real_t* v_in =( Real_t*)( input_data[0]+input_perm[(interac_indx+i)*4+3]);
  1617. Real_t* v_out=( Real_t*)( buff_in +input_perm[(interac_indx+i)*4+2]);
  1618. // TODO: Fix for dof>1
  1619. #ifdef __MIC__
  1620. {
  1621. __m512d v8;
  1622. size_t j_start=(((uintptr_t)(v_out ) + (uintptr_t)(MEM_ALIGN-1)) & ~ (uintptr_t)(MEM_ALIGN-1))-((uintptr_t)v_out);
  1623. size_t j_end =(((uintptr_t)(v_out+M_dim0) ) & ~ (uintptr_t)(MEM_ALIGN-1))-((uintptr_t)v_out);
  1624. j_start/=sizeof(Real_t);
  1625. j_end /=sizeof(Real_t);
  1626. assert(((uintptr_t)(v_out))%sizeof(Real_t)==0);
  1627. assert(((uintptr_t)(v_out+j_start))%64==0);
  1628. assert(((uintptr_t)(v_out+j_end ))%64==0);
  1629. size_t j=0;
  1630. for(;j<j_start;j++ ){
  1631. v_out[j]=v_in[perm[j]]*scal[j];
  1632. }
  1633. for(;j<j_end ;j+=8){
  1634. v8=_mm512_setr_pd(
  1635. v_in[perm[j+0]]*scal[j+0],
  1636. v_in[perm[j+1]]*scal[j+1],
  1637. v_in[perm[j+2]]*scal[j+2],
  1638. v_in[perm[j+3]]*scal[j+3],
  1639. v_in[perm[j+4]]*scal[j+4],
  1640. v_in[perm[j+5]]*scal[j+5],
  1641. v_in[perm[j+6]]*scal[j+6],
  1642. v_in[perm[j+7]]*scal[j+7]);
  1643. _mm512_storenrngo_pd(v_out+j,v8);
  1644. }
  1645. for(;j<M_dim0 ;j++ ){
  1646. v_out[j]=v_in[perm[j]]*scal[j];
  1647. }
  1648. }
  1649. #else
  1650. for(size_t j=0;j<M_dim0;j++ ){
  1651. v_out[j]=v_in[perm[j]]*scal[j];
  1652. }
  1653. #endif
  1654. }
  1655. }
  1656. size_t vec_cnt0=0;
  1657. for(size_t j=interac_blk_dsp;j<interac_blk_dsp+interac_blk[k];){
  1658. size_t vec_cnt1=0;
  1659. size_t interac_mat0=interac_mat[j];
  1660. for(;j<interac_blk_dsp+interac_blk[k] && interac_mat[j]==interac_mat0;j++) vec_cnt1+=interac_cnt[j];
  1661. Matrix<Real_t> M(M_dim0, M_dim1, (Real_t*)(precomp_data[0]+interac_mat0), false);
  1662. #ifdef __MIC__
  1663. {
  1664. Matrix<Real_t> Ms(dof*vec_cnt1, M_dim0, (Real_t*)(buff_in +M_dim0*vec_cnt0*dof*sizeof(Real_t)), false);
  1665. Matrix<Real_t> Mt(dof*vec_cnt1, M_dim1, (Real_t*)(buff_out+M_dim1*vec_cnt0*dof*sizeof(Real_t)), false);
  1666. Matrix<Real_t>::GEMM(Mt,Ms,M);
  1667. }
  1668. #else
  1669. #pragma omp parallel for
  1670. for(int tid=0;tid<omp_p;tid++){
  1671. size_t a=(dof*vec_cnt1*(tid ))/omp_p;
  1672. size_t b=(dof*vec_cnt1*(tid+1))/omp_p;
  1673. Matrix<Real_t> Ms(b-a, M_dim0, (Real_t*)(buff_in +M_dim0*vec_cnt0*dof*sizeof(Real_t))+M_dim0*a, false);
  1674. Matrix<Real_t> Mt(b-a, M_dim1, (Real_t*)(buff_out+M_dim1*vec_cnt0*dof*sizeof(Real_t))+M_dim1*a, false);
  1675. Matrix<Real_t>::GEMM(Mt,Ms,M);
  1676. }
  1677. #endif
  1678. vec_cnt0+=vec_cnt1;
  1679. }
  1680. // Output permutation.
  1681. #pragma omp parallel for
  1682. for(int tid=0;tid<omp_p;tid++){
  1683. size_t a=( tid *vec_cnt)/omp_p;
  1684. size_t b=((tid+1)*vec_cnt)/omp_p;
  1685. if(tid> 0 && a<vec_cnt){ // Find 'a' independent of other threads.
  1686. size_t out_ptr=output_perm[(interac_indx+a)*4+3];
  1687. if(tid> 0) while(a<vec_cnt && out_ptr==output_perm[(interac_indx+a)*4+3]) a++;
  1688. }
  1689. if(tid<omp_p-1 && b<vec_cnt){ // Find 'b' independent of other threads.
  1690. size_t out_ptr=output_perm[(interac_indx+b)*4+3];
  1691. if(tid<omp_p-1) while(b<vec_cnt && out_ptr==output_perm[(interac_indx+b)*4+3]) b++;
  1692. }
  1693. for(size_t i=a;i<b;i++){ // Compute permutations.
  1694. const PERM_INT_T* perm=(PERM_INT_T*)(precomp_data[0]+output_perm[(interac_indx+i)*4+0]);
  1695. const Real_t* scal=( Real_t*)(precomp_data[0]+output_perm[(interac_indx+i)*4+1]);
  1696. const Real_t* v_in =( Real_t*)( buff_out +output_perm[(interac_indx+i)*4+2]);
  1697. Real_t* v_out=( Real_t*)( output_data[0]+output_perm[(interac_indx+i)*4+3]);
  1698. // TODO: Fix for dof>1
  1699. #ifdef __MIC__
  1700. {
  1701. __m512d v8;
  1702. __m512d v_old;
  1703. size_t j_start=(((uintptr_t)(v_out ) + (uintptr_t)(MEM_ALIGN-1)) & ~ (uintptr_t)(MEM_ALIGN-1))-((uintptr_t)v_out);
  1704. size_t j_end =(((uintptr_t)(v_out+M_dim1) ) & ~ (uintptr_t)(MEM_ALIGN-1))-((uintptr_t)v_out);
  1705. j_start/=sizeof(Real_t);
  1706. j_end /=sizeof(Real_t);
  1707. assert(((uintptr_t)(v_out))%sizeof(Real_t)==0);
  1708. assert(((uintptr_t)(v_out+j_start))%64==0);
  1709. assert(((uintptr_t)(v_out+j_end ))%64==0);
  1710. size_t j=0;
  1711. for(;j<j_start;j++ ){
  1712. v_out[j]+=v_in[perm[j]]*scal[j];
  1713. }
  1714. for(;j<j_end ;j+=8){
  1715. v_old=_mm512_load_pd(v_out+j);
  1716. v8=_mm512_setr_pd(
  1717. v_in[perm[j+0]]*scal[j+0],
  1718. v_in[perm[j+1]]*scal[j+1],
  1719. v_in[perm[j+2]]*scal[j+2],
  1720. v_in[perm[j+3]]*scal[j+3],
  1721. v_in[perm[j+4]]*scal[j+4],
  1722. v_in[perm[j+5]]*scal[j+5],
  1723. v_in[perm[j+6]]*scal[j+6],
  1724. v_in[perm[j+7]]*scal[j+7]);
  1725. v_old=_mm512_add_pd(v_old, v8);
  1726. _mm512_storenrngo_pd(v_out+j,v_old);
  1727. }
  1728. for(;j<M_dim1 ;j++ ){
  1729. v_out[j]+=v_in[perm[j]]*scal[j];
  1730. }
  1731. }
  1732. #else
  1733. for(size_t j=0;j<M_dim1;j++ ){
  1734. v_out[j]+=v_in[perm[j]]*scal[j];
  1735. }
  1736. #endif
  1737. }
  1738. }
  1739. interac_indx+=vec_cnt;
  1740. interac_blk_dsp+=interac_blk[k];
  1741. }
  1742. }
  1743. if(device) MIC_Lock::release_lock(lock_idx);
  1744. }
  1745. #ifdef __INTEL_OFFLOAD
  1746. if(SYNC){
  1747. #pragma offload if(device) target(mic:0)
  1748. {if(device) MIC_Lock::wait_lock(lock_idx);}
  1749. }
  1750. #endif
  1751. Profile::Toc();
  1752. }
  1753. template <class FMMNode>
  1754. void FMM_Pts<FMMNode>::Source2UpSetup(SetupData<Real_t>& setup_data, std::vector<Matrix<Real_t> >& buff, std::vector<Vector<FMMNode_t*> >& n_list, int level, bool device){
  1755. if(this->MultipoleOrder()==0) return;
  1756. { // Set setup_data
  1757. setup_data.level=level;
  1758. setup_data.kernel=kernel->k_s2m;
  1759. setup_data.interac_type.resize(1);
  1760. setup_data.interac_type[0]=S2U_Type;
  1761. setup_data. input_data=&buff[4];
  1762. setup_data.output_data=&buff[0];
  1763. setup_data. coord_data=&buff[6];
  1764. Vector<FMMNode_t*>& nodes_in =n_list[4];
  1765. Vector<FMMNode_t*>& nodes_out=n_list[0];
  1766. setup_data.nodes_in .clear();
  1767. setup_data.nodes_out.clear();
  1768. for(size_t i=0;i<nodes_in .Dim();i++) if(nodes_in [i]->Depth()==level || level==-1) setup_data.nodes_in .push_back(nodes_in [i]);
  1769. for(size_t i=0;i<nodes_out.Dim();i++) if(nodes_out[i]->Depth()==level || level==-1) setup_data.nodes_out.push_back(nodes_out[i]);
  1770. }
  1771. std::vector<void*>& nodes_in =setup_data.nodes_in ;
  1772. std::vector<void*>& nodes_out=setup_data.nodes_out;
  1773. std::vector<Vector<Real_t>*>& input_vector=setup_data. input_vector; input_vector.clear();
  1774. std::vector<Vector<Real_t>*>& output_vector=setup_data.output_vector; output_vector.clear();
  1775. for(size_t i=0;i<nodes_in .size();i++){
  1776. input_vector .push_back(&((FMMNode*)nodes_in [i])->src_coord);
  1777. input_vector .push_back(&((FMMNode*)nodes_in [i])->src_value);
  1778. input_vector .push_back(&((FMMNode*)nodes_in [i])->surf_coord);
  1779. input_vector .push_back(&((FMMNode*)nodes_in [i])->surf_value);
  1780. }
  1781. for(size_t i=0;i<nodes_out.size();i++){
  1782. output_vector.push_back(&upwd_check_surf[((FMMNode*)nodes_out[i])->Depth()]);
  1783. output_vector.push_back(&((FMMData*)((FMMNode*)nodes_out[i])->FMMData())->upward_equiv);
  1784. }
  1785. //Upward check to upward equivalent matrix.
  1786. Matrix<Real_t>& M_uc2ue = this->mat->Mat(level, UC2UE_Type, 0);
  1787. this->SetupInteracPts(setup_data, false, true, &M_uc2ue,device);
  1788. { // Resize device buffer
  1789. size_t n=setup_data.output_data->Dim(0)*setup_data.output_data->Dim(1)*sizeof(Real_t);
  1790. if(this->dev_buffer.Dim()<n) this->dev_buffer.ReInit(n);
  1791. }
  1792. }
  1793. template <class FMMNode>
  1794. void FMM_Pts<FMMNode>::Source2Up(SetupData<Real_t>& setup_data, bool device){
  1795. //Add Source2Up contribution.
  1796. this->EvalListPts(setup_data, device);
  1797. }
  1798. template <class FMMNode>
  1799. void FMM_Pts<FMMNode>::Up2UpSetup(SetupData<Real_t>& setup_data, std::vector<Matrix<Real_t> >& buff, std::vector<Vector<FMMNode_t*> >& n_list, int level, bool device){
  1800. if(this->MultipoleOrder()==0) return;
  1801. { // Set setup_data
  1802. setup_data.level=level;
  1803. setup_data.kernel=kernel->k_m2m;
  1804. setup_data.interac_type.resize(1);
  1805. setup_data.interac_type[0]=U2U_Type;
  1806. setup_data. input_data=&buff[0];
  1807. setup_data.output_data=&buff[0];
  1808. Vector<FMMNode_t*>& nodes_in =n_list[0];
  1809. Vector<FMMNode_t*>& nodes_out=n_list[0];
  1810. setup_data.nodes_in .clear();
  1811. setup_data.nodes_out.clear();
  1812. for(size_t i=0;i<nodes_in .Dim();i++) if(nodes_in [i]->Depth()==level+1) setup_data.nodes_in .push_back(nodes_in [i]);
  1813. for(size_t i=0;i<nodes_out.Dim();i++) if(nodes_out[i]->Depth()==level ) setup_data.nodes_out.push_back(nodes_out[i]);
  1814. }
  1815. std::vector<void*>& nodes_in =setup_data.nodes_in ;
  1816. std::vector<void*>& nodes_out=setup_data.nodes_out;
  1817. std::vector<Vector<Real_t>*>& input_vector=setup_data. input_vector; input_vector.clear();
  1818. std::vector<Vector<Real_t>*>& output_vector=setup_data.output_vector; output_vector.clear();
  1819. for(size_t i=0;i<nodes_in .size();i++) input_vector.push_back(&((FMMData*)((FMMNode*)nodes_in [i])->FMMData())->upward_equiv);
  1820. for(size_t i=0;i<nodes_out.size();i++) output_vector.push_back(&((FMMData*)((FMMNode*)nodes_out[i])->FMMData())->upward_equiv);
  1821. SetupInterac(setup_data,device);
  1822. }
  1823. template <class FMMNode>
  1824. void FMM_Pts<FMMNode>::Up2Up (SetupData<Real_t>& setup_data, bool device){
  1825. //Add Up2Up contribution.
  1826. EvalList(setup_data, device);
  1827. }
  1828. template <class FMMNode>
  1829. void FMM_Pts<FMMNode>::PeriodicBC(FMMNode* node){
  1830. if(this->MultipoleOrder()==0) return;
  1831. Matrix<Real_t>& M = Precomp(0, BC_Type, 0);
  1832. assert(node->FMMData()->upward_equiv.Dim()>0);
  1833. int dof=1;
  1834. Vector<Real_t>& upward_equiv=node->FMMData()->upward_equiv;
  1835. Vector<Real_t>& dnward_equiv=node->FMMData()->dnward_equiv;
  1836. assert(upward_equiv.Dim()==M.Dim(0)*dof);
  1837. assert(dnward_equiv.Dim()==M.Dim(1)*dof);
  1838. Matrix<Real_t> d_equiv(dof,M.Dim(0),&dnward_equiv[0],false);
  1839. Matrix<Real_t> u_equiv(dof,M.Dim(1),&upward_equiv[0],false);
  1840. Matrix<Real_t>::GEMM(d_equiv,u_equiv,M);
  1841. }
  1842. template <class FMMNode>
  1843. void FMM_Pts<FMMNode>::FFT_UpEquiv(size_t dof, size_t m, size_t ker_dim0, Vector<size_t>& fft_vec,
  1844. Vector<Real_t>& input_data, Vector<Real_t>& output_data, Vector<Real_t>& buffer_){
  1845. size_t n1=m*2;
  1846. size_t n2=n1*n1;
  1847. size_t n3=n1*n2;
  1848. size_t n3_=n2*(n1/2+1);
  1849. size_t chld_cnt=1UL<<COORD_DIM;
  1850. size_t fftsize_in =2*n3_*chld_cnt*ker_dim0*dof;
  1851. int omp_p=omp_get_max_threads();
  1852. //Load permutation map.
  1853. size_t n=6*(m-1)*(m-1)+2;
  1854. static Vector<size_t> map;
  1855. { // Build map to reorder upward_equiv
  1856. size_t n_old=map.Dim();
  1857. if(n_old!=n){
  1858. Real_t c[3]={0,0,0};
  1859. Vector<Real_t> surf=surface(m, c, (Real_t)(m-1), 0);
  1860. map.Resize(surf.Dim()/COORD_DIM);
  1861. for(size_t i=0;i<map.Dim();i++)
  1862. map[i]=((size_t)(m-1-surf[i*3]+0.5))+((size_t)(m-1-surf[i*3+1]+0.5))*n1+((size_t)(m-1-surf[i*3+2]+0.5))*n2;
  1863. }
  1864. }
  1865. { // Build FFTW plan.
  1866. if(!vlist_fft_flag){
  1867. int nnn[3]={(int)n1,(int)n1,(int)n1};
  1868. void *fftw_in, *fftw_out;
  1869. fftw_in = mem::aligned_new<Real_t>( n3 *ker_dim0*chld_cnt);
  1870. fftw_out = mem::aligned_new<Real_t>(2*n3_*ker_dim0*chld_cnt);
  1871. vlist_fftplan = FFTW_t<Real_t>::fft_plan_many_dft_r2c(COORD_DIM,nnn,ker_dim0*chld_cnt,
  1872. (Real_t*)fftw_in, NULL, 1, n3, (typename FFTW_t<Real_t>::cplx*)(fftw_out),NULL, 1, n3_, FFTW_ESTIMATE);
  1873. mem::aligned_delete<Real_t>((Real_t*)fftw_in );
  1874. mem::aligned_delete<Real_t>((Real_t*)fftw_out);
  1875. vlist_fft_flag=true;
  1876. }
  1877. }
  1878. { // Offload section
  1879. size_t n_in = fft_vec.Dim();
  1880. #pragma omp parallel for
  1881. for(int pid=0; pid<omp_p; pid++){
  1882. size_t node_start=(n_in*(pid ))/omp_p;
  1883. size_t node_end =(n_in*(pid+1))/omp_p;
  1884. Vector<Real_t> buffer(fftsize_in, &buffer_[fftsize_in*pid], false);
  1885. for(size_t node_idx=node_start; node_idx<node_end; node_idx++){
  1886. Vector<Real_t*> upward_equiv(chld_cnt);
  1887. for(size_t i=0;i<chld_cnt;i++) upward_equiv[i]=&input_data[0] + fft_vec[node_idx] + n*ker_dim0*dof*i;
  1888. Vector<Real_t> upward_equiv_fft(fftsize_in, &output_data[fftsize_in *node_idx], false);
  1889. upward_equiv_fft.SetZero();
  1890. // Rearrange upward equivalent data.
  1891. for(size_t k=0;k<n;k++){
  1892. size_t idx=map[k];
  1893. for(int j1=0;j1<dof;j1++)
  1894. for(int j0=0;j0<(int)chld_cnt;j0++)
  1895. for(int i=0;i<ker_dim0;i++)
  1896. upward_equiv_fft[idx+(j0+(i+j1*ker_dim0)*chld_cnt)*n3]=upward_equiv[j0][ker_dim0*(n*j1+k)+i];
  1897. }
  1898. // Compute FFT.
  1899. for(int i=0;i<dof;i++)
  1900. FFTW_t<Real_t>::fft_execute_dft_r2c(vlist_fftplan, (Real_t*)&upward_equiv_fft[i* n3 *ker_dim0*chld_cnt],
  1901. (typename FFTW_t<Real_t>::cplx*)&buffer [i*2*n3_*ker_dim0*chld_cnt]);
  1902. //Compute flops.
  1903. #ifndef FFTW3_MKL
  1904. double add, mul, fma;
  1905. FFTW_t<Real_t>::fftw_flops(vlist_fftplan, &add, &mul, &fma);
  1906. #ifndef __INTEL_OFFLOAD0
  1907. Profile::Add_FLOP((long long)(add+mul+2*fma));
  1908. #endif
  1909. #endif
  1910. for(int i=0;i<ker_dim0*dof;i++)
  1911. for(size_t j=0;j<n3_;j++)
  1912. for(size_t k=0;k<chld_cnt;k++){
  1913. upward_equiv_fft[2*(chld_cnt*(n3_*i+j)+k)+0]=buffer[2*(n3_*(chld_cnt*i+k)+j)+0];
  1914. upward_equiv_fft[2*(chld_cnt*(n3_*i+j)+k)+1]=buffer[2*(n3_*(chld_cnt*i+k)+j)+1];
  1915. }
  1916. }
  1917. }
  1918. }
  1919. }
  1920. template <class FMMNode>
  1921. void FMM_Pts<FMMNode>::FFT_Check2Equiv(size_t dof, size_t m, size_t ker_dim1, Vector<size_t>& ifft_vec,
  1922. Vector<Real_t>& input_data, Vector<Real_t>& output_data, Vector<Real_t>& buffer_, Matrix<Real_t>& M){
  1923. size_t n1=m*2;
  1924. size_t n2=n1*n1;
  1925. size_t n3=n1*n2;
  1926. size_t n3_=n2*(n1/2+1);
  1927. size_t chld_cnt=1UL<<COORD_DIM;
  1928. size_t fftsize_out=2*n3_*dof*ker_dim1*chld_cnt;
  1929. int omp_p=omp_get_max_threads();
  1930. //Load permutation map.
  1931. size_t n=6*(m-1)*(m-1)+2;
  1932. static Vector<size_t> map;
  1933. { // Build map to reorder dnward_check
  1934. size_t n_old=map.Dim();
  1935. if(n_old!=n){
  1936. Real_t c[3]={0,0,0};
  1937. Vector<Real_t> surf=surface(m, c, (Real_t)(m-1), 0);
  1938. map.Resize(surf.Dim()/COORD_DIM);
  1939. for(size_t i=0;i<map.Dim();i++)
  1940. map[i]=((size_t)(m*2-0.5-surf[i*3]))+((size_t)(m*2-0.5-surf[i*3+1]))*n1+((size_t)(m*2-0.5-surf[i*3+2]))*n2;
  1941. //map;//.AllocDevice(true);
  1942. }
  1943. }
  1944. { // Build FFTW plan.
  1945. if(!vlist_ifft_flag){
  1946. //Build FFTW plan.
  1947. int nnn[3]={(int)n1,(int)n1,(int)n1};
  1948. Real_t *fftw_in, *fftw_out;
  1949. fftw_in = mem::aligned_new<Real_t>(2*n3_*ker_dim1*chld_cnt);
  1950. fftw_out = mem::aligned_new<Real_t>( n3 *ker_dim1*chld_cnt);
  1951. vlist_ifftplan = FFTW_t<Real_t>::fft_plan_many_dft_c2r(COORD_DIM,nnn,ker_dim1*chld_cnt,
  1952. (typename FFTW_t<Real_t>::cplx*)fftw_in, NULL, 1, n3_, (Real_t*)(fftw_out),NULL, 1, n3, FFTW_ESTIMATE);
  1953. mem::aligned_delete<Real_t>(fftw_in);
  1954. mem::aligned_delete<Real_t>(fftw_out);
  1955. vlist_ifft_flag=true;
  1956. }
  1957. }
  1958. { // Offload section
  1959. size_t n_out=ifft_vec.Dim();
  1960. #pragma omp parallel for
  1961. for(int pid=0; pid<omp_p; pid++){
  1962. size_t node_start=(n_out*(pid ))/omp_p;
  1963. size_t node_end =(n_out*(pid+1))/omp_p;
  1964. Vector<Real_t> buffer(fftsize_out, &buffer_[fftsize_out*pid], false);
  1965. for(size_t node_idx=node_start; node_idx<node_end; node_idx++){
  1966. Vector<Real_t> dnward_check_fft(fftsize_out, &input_data[fftsize_out*node_idx], false);
  1967. //De-interleave data.
  1968. for(int i=0;i<ker_dim1*dof;i++)
  1969. for(size_t j=0;j<n3_;j++)
  1970. for(size_t k=0;k<chld_cnt;k++){
  1971. buffer[2*(n3_*(ker_dim1*dof*k+i)+j)+0]=dnward_check_fft[2*(chld_cnt*(n3_*i+j)+k)+0];
  1972. buffer[2*(n3_*(ker_dim1*dof*k+i)+j)+1]=dnward_check_fft[2*(chld_cnt*(n3_*i+j)+k)+1];
  1973. }
  1974. // Compute FFT.
  1975. for(int i=0;i<dof;i++)
  1976. FFTW_t<Real_t>::fft_execute_dft_c2r(vlist_ifftplan, (typename FFTW_t<Real_t>::cplx*)&buffer [i*2*n3_*ker_dim1*chld_cnt],
  1977. (Real_t*)&dnward_check_fft[i* n3 *ker_dim1*chld_cnt]);
  1978. //Compute flops.
  1979. #ifndef FFTW3_MKL
  1980. double add, mul, fma;
  1981. FFTW_t<Real_t>::fftw_flops(vlist_ifftplan, &add, &mul, &fma);
  1982. #ifndef __INTEL_OFFLOAD0
  1983. Profile::Add_FLOP((long long)(add+mul+2*fma));
  1984. #endif
  1985. #endif
  1986. // Rearrange downward check data.
  1987. for(size_t k=0;k<n;k++){
  1988. size_t idx=map[k];
  1989. for(int j1=0;j1<dof;j1++)
  1990. for(int j0=0;j0<(int)chld_cnt;j0++)
  1991. for(int i=0;i<ker_dim1;i++)
  1992. buffer[ker_dim1*(n*(dof*j0+j1)+k)+i]=dnward_check_fft[idx+(j1+(i+j0*ker_dim1)*dof)*n3];
  1993. }
  1994. // Compute check to equiv.
  1995. for(size_t j=0;j<chld_cnt;j++){
  1996. Matrix<Real_t> d_check(dof,M.Dim(0),&buffer[n*ker_dim1*dof*j],false);
  1997. Matrix<Real_t> d_equiv(dof,M.Dim(1),&output_data[0] + ifft_vec[node_idx] + M.Dim(1)*dof*j,false);
  1998. Matrix<Real_t>::GEMM(d_equiv,d_check,M,1.0);
  1999. }
  2000. }
  2001. }
  2002. }
  2003. }
  2004. template<class Real_t>
  2005. inline void matmult_8x8x2(Real_t*& M_, Real_t*& IN0, Real_t*& IN1, Real_t*& OUT0, Real_t*& OUT1){
  2006. // Generic code.
  2007. Real_t out_reg000, out_reg001, out_reg010, out_reg011;
  2008. Real_t out_reg100, out_reg101, out_reg110, out_reg111;
  2009. Real_t in_reg000, in_reg001, in_reg010, in_reg011;
  2010. Real_t in_reg100, in_reg101, in_reg110, in_reg111;
  2011. Real_t m_reg000, m_reg001, m_reg010, m_reg011;
  2012. Real_t m_reg100, m_reg101, m_reg110, m_reg111;
  2013. //#pragma unroll
  2014. for(int i1=0;i1<8;i1+=2){
  2015. Real_t* IN0_=IN0;
  2016. Real_t* IN1_=IN1;
  2017. out_reg000=OUT0[ 0]; out_reg001=OUT0[ 1];
  2018. out_reg010=OUT0[ 2]; out_reg011=OUT0[ 3];
  2019. out_reg100=OUT1[ 0]; out_reg101=OUT1[ 1];
  2020. out_reg110=OUT1[ 2]; out_reg111=OUT1[ 3];
  2021. //#pragma unroll
  2022. for(int i2=0;i2<8;i2+=2){
  2023. m_reg000=M_[ 0]; m_reg001=M_[ 1];
  2024. m_reg010=M_[ 2]; m_reg011=M_[ 3];
  2025. m_reg100=M_[16]; m_reg101=M_[17];
  2026. m_reg110=M_[18]; m_reg111=M_[19];
  2027. in_reg000=IN0_[0]; in_reg001=IN0_[1];
  2028. in_reg010=IN0_[2]; in_reg011=IN0_[3];
  2029. in_reg100=IN1_[0]; in_reg101=IN1_[1];
  2030. in_reg110=IN1_[2]; in_reg111=IN1_[3];
  2031. out_reg000 += m_reg000*in_reg000 - m_reg001*in_reg001;
  2032. out_reg001 += m_reg000*in_reg001 + m_reg001*in_reg000;
  2033. out_reg010 += m_reg010*in_reg000 - m_reg011*in_reg001;
  2034. out_reg011 += m_reg010*in_reg001 + m_reg011*in_reg000;
  2035. out_reg000 += m_reg100*in_reg010 - m_reg101*in_reg011;
  2036. out_reg001 += m_reg100*in_reg011 + m_reg101*in_reg010;
  2037. out_reg010 += m_reg110*in_reg010 - m_reg111*in_reg011;
  2038. out_reg011 += m_reg110*in_reg011 + m_reg111*in_reg010;
  2039. out_reg100 += m_reg000*in_reg100 - m_reg001*in_reg101;
  2040. out_reg101 += m_reg000*in_reg101 + m_reg001*in_reg100;
  2041. out_reg110 += m_reg010*in_reg100 - m_reg011*in_reg101;
  2042. out_reg111 += m_reg010*in_reg101 + m_reg011*in_reg100;
  2043. out_reg100 += m_reg100*in_reg110 - m_reg101*in_reg111;
  2044. out_reg101 += m_reg100*in_reg111 + m_reg101*in_reg110;
  2045. out_reg110 += m_reg110*in_reg110 - m_reg111*in_reg111;
  2046. out_reg111 += m_reg110*in_reg111 + m_reg111*in_reg110;
  2047. M_+=32; // Jump to (column+2).
  2048. IN0_+=4;
  2049. IN1_+=4;
  2050. }
  2051. OUT0[ 0]=out_reg000; OUT0[ 1]=out_reg001;
  2052. OUT0[ 2]=out_reg010; OUT0[ 3]=out_reg011;
  2053. OUT1[ 0]=out_reg100; OUT1[ 1]=out_reg101;
  2054. OUT1[ 2]=out_reg110; OUT1[ 3]=out_reg111;
  2055. M_+=4-64*2; // Jump back to first column (row+2).
  2056. OUT0+=4;
  2057. OUT1+=4;
  2058. }
  2059. }
  2060. #if defined(__AVX__) || defined(__SSE3__)
  2061. template<>
  2062. inline void matmult_8x8x2<double>(double*& M_, double*& IN0, double*& IN1, double*& OUT0, double*& OUT1){
  2063. #ifdef __AVX__ //AVX code.
  2064. __m256d out00,out01,out10,out11;
  2065. __m256d out20,out21,out30,out31;
  2066. double* in0__ = IN0;
  2067. double* in1__ = IN1;
  2068. out00 = _mm256_load_pd(OUT0);
  2069. out01 = _mm256_load_pd(OUT1);
  2070. out10 = _mm256_load_pd(OUT0+4);
  2071. out11 = _mm256_load_pd(OUT1+4);
  2072. out20 = _mm256_load_pd(OUT0+8);
  2073. out21 = _mm256_load_pd(OUT1+8);
  2074. out30 = _mm256_load_pd(OUT0+12);
  2075. out31 = _mm256_load_pd(OUT1+12);
  2076. for(int i2=0;i2<8;i2+=2){
  2077. __m256d m00;
  2078. __m256d ot00;
  2079. __m256d mt0,mtt0;
  2080. __m256d in00,in00_r,in01,in01_r;
  2081. in00 = _mm256_broadcast_pd((const __m128d*)in0__);
  2082. in00_r = _mm256_permute_pd(in00,5);
  2083. in01 = _mm256_broadcast_pd((const __m128d*)in1__);
  2084. in01_r = _mm256_permute_pd(in01,5);
  2085. m00 = _mm256_load_pd(M_);
  2086. mt0 = _mm256_unpacklo_pd(m00,m00);
  2087. ot00 = _mm256_mul_pd(mt0,in00);
  2088. mtt0 = _mm256_unpackhi_pd(m00,m00);
  2089. out00 = _mm256_add_pd(out00,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in00_r)));
  2090. ot00 = _mm256_mul_pd(mt0,in01);
  2091. out01 = _mm256_add_pd(out01,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in01_r)));
  2092. m00 = _mm256_load_pd(M_+4);
  2093. mt0 = _mm256_unpacklo_pd(m00,m00);
  2094. ot00 = _mm256_mul_pd(mt0,in00);
  2095. mtt0 = _mm256_unpackhi_pd(m00,m00);
  2096. out10 = _mm256_add_pd(out10,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in00_r)));
  2097. ot00 = _mm256_mul_pd(mt0,in01);
  2098. out11 = _mm256_add_pd(out11,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in01_r)));
  2099. m00 = _mm256_load_pd(M_+8);
  2100. mt0 = _mm256_unpacklo_pd(m00,m00);
  2101. ot00 = _mm256_mul_pd(mt0,in00);
  2102. mtt0 = _mm256_unpackhi_pd(m00,m00);
  2103. out20 = _mm256_add_pd(out20,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in00_r)));
  2104. ot00 = _mm256_mul_pd(mt0,in01);
  2105. out21 = _mm256_add_pd(out21,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in01_r)));
  2106. m00 = _mm256_load_pd(M_+12);
  2107. mt0 = _mm256_unpacklo_pd(m00,m00);
  2108. ot00 = _mm256_mul_pd(mt0,in00);
  2109. mtt0 = _mm256_unpackhi_pd(m00,m00);
  2110. out30 = _mm256_add_pd(out30,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in00_r)));
  2111. ot00 = _mm256_mul_pd(mt0,in01);
  2112. out31 = _mm256_add_pd(out31,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in01_r)));
  2113. in00 = _mm256_broadcast_pd((const __m128d*) (in0__+2));
  2114. in00_r = _mm256_permute_pd(in00,5);
  2115. in01 = _mm256_broadcast_pd((const __m128d*) (in1__+2));
  2116. in01_r = _mm256_permute_pd(in01,5);
  2117. m00 = _mm256_load_pd(M_+16);
  2118. mt0 = _mm256_unpacklo_pd(m00,m00);
  2119. ot00 = _mm256_mul_pd(mt0,in00);
  2120. mtt0 = _mm256_unpackhi_pd(m00,m00);
  2121. out00 = _mm256_add_pd(out00,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in00_r)));
  2122. ot00 = _mm256_mul_pd(mt0,in01);
  2123. out01 = _mm256_add_pd(out01,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in01_r)));
  2124. m00 = _mm256_load_pd(M_+20);
  2125. mt0 = _mm256_unpacklo_pd(m00,m00);
  2126. ot00 = _mm256_mul_pd(mt0,in00);
  2127. mtt0 = _mm256_unpackhi_pd(m00,m00);
  2128. out10 = _mm256_add_pd(out10,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in00_r)));
  2129. ot00 = _mm256_mul_pd(mt0,in01);
  2130. out11 = _mm256_add_pd(out11,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in01_r)));
  2131. m00 = _mm256_load_pd(M_+24);
  2132. mt0 = _mm256_unpacklo_pd(m00,m00);
  2133. ot00 = _mm256_mul_pd(mt0,in00);
  2134. mtt0 = _mm256_unpackhi_pd(m00,m00);
  2135. out20 = _mm256_add_pd(out20,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in00_r)));
  2136. ot00 = _mm256_mul_pd(mt0,in01);
  2137. out21 = _mm256_add_pd(out21,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in01_r)));
  2138. m00 = _mm256_load_pd(M_+28);
  2139. mt0 = _mm256_unpacklo_pd(m00,m00);
  2140. ot00 = _mm256_mul_pd(mt0,in00);
  2141. mtt0 = _mm256_unpackhi_pd(m00,m00);
  2142. out30 = _mm256_add_pd(out30,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in00_r)));
  2143. ot00 = _mm256_mul_pd(mt0,in01);
  2144. out31 = _mm256_add_pd(out31,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in01_r)));
  2145. M_ += 32;
  2146. in0__ += 4;
  2147. in1__ += 4;
  2148. }
  2149. _mm256_store_pd(OUT0,out00);
  2150. _mm256_store_pd(OUT1,out01);
  2151. _mm256_store_pd(OUT0+4,out10);
  2152. _mm256_store_pd(OUT1+4,out11);
  2153. _mm256_store_pd(OUT0+8,out20);
  2154. _mm256_store_pd(OUT1+8,out21);
  2155. _mm256_store_pd(OUT0+12,out30);
  2156. _mm256_store_pd(OUT1+12,out31);
  2157. #elif defined __SSE3__ // SSE code.
  2158. __m128d out00, out01, out10, out11;
  2159. __m128d in00, in01, in10, in11;
  2160. __m128d m00, m01, m10, m11;
  2161. //#pragma unroll
  2162. for(int i1=0;i1<8;i1+=2){
  2163. double* IN0_=IN0;
  2164. double* IN1_=IN1;
  2165. out00 =_mm_load_pd (OUT0 );
  2166. out10 =_mm_load_pd (OUT0+2);
  2167. out01 =_mm_load_pd (OUT1 );
  2168. out11 =_mm_load_pd (OUT1+2);
  2169. //#pragma unroll
  2170. for(int i2=0;i2<8;i2+=2){
  2171. m00 =_mm_load1_pd (M_ );
  2172. m10 =_mm_load1_pd (M_+ 2);
  2173. m01 =_mm_load1_pd (M_+16);
  2174. m11 =_mm_load1_pd (M_+18);
  2175. in00 =_mm_load_pd (IN0_ );
  2176. in10 =_mm_load_pd (IN0_+2);
  2177. in01 =_mm_load_pd (IN1_ );
  2178. in11 =_mm_load_pd (IN1_+2);
  2179. out00 = _mm_add_pd (out00, _mm_mul_pd(m00 , in00 ));
  2180. out00 = _mm_add_pd (out00, _mm_mul_pd(m01 , in10 ));
  2181. out01 = _mm_add_pd (out01, _mm_mul_pd(m00 , in01 ));
  2182. out01 = _mm_add_pd (out01, _mm_mul_pd(m01 , in11 ));
  2183. out10 = _mm_add_pd (out10, _mm_mul_pd(m10 , in00 ));
  2184. out10 = _mm_add_pd (out10, _mm_mul_pd(m11 , in10 ));
  2185. out11 = _mm_add_pd (out11, _mm_mul_pd(m10 , in01 ));
  2186. out11 = _mm_add_pd (out11, _mm_mul_pd(m11 , in11 ));
  2187. m00 =_mm_load1_pd (M_+ 1);
  2188. m10 =_mm_load1_pd (M_+ 2+1);
  2189. m01 =_mm_load1_pd (M_+16+1);
  2190. m11 =_mm_load1_pd (M_+18+1);
  2191. in00 =_mm_shuffle_pd (in00,in00,_MM_SHUFFLE2(0,1));
  2192. in01 =_mm_shuffle_pd (in01,in01,_MM_SHUFFLE2(0,1));
  2193. in10 =_mm_shuffle_pd (in10,in10,_MM_SHUFFLE2(0,1));
  2194. in11 =_mm_shuffle_pd (in11,in11,_MM_SHUFFLE2(0,1));
  2195. out00 = _mm_addsub_pd(out00, _mm_mul_pd(m00, in00));
  2196. out00 = _mm_addsub_pd(out00, _mm_mul_pd(m01, in10));
  2197. out01 = _mm_addsub_pd(out01, _mm_mul_pd(m00, in01));
  2198. out01 = _mm_addsub_pd(out01, _mm_mul_pd(m01, in11));
  2199. out10 = _mm_addsub_pd(out10, _mm_mul_pd(m10, in00));
  2200. out10 = _mm_addsub_pd(out10, _mm_mul_pd(m11, in10));
  2201. out11 = _mm_addsub_pd(out11, _mm_mul_pd(m10, in01));
  2202. out11 = _mm_addsub_pd(out11, _mm_mul_pd(m11, in11));
  2203. M_+=32; // Jump to (column+2).
  2204. IN0_+=4;
  2205. IN1_+=4;
  2206. }
  2207. _mm_store_pd (OUT0 ,out00);
  2208. _mm_store_pd (OUT0+2,out10);
  2209. _mm_store_pd (OUT1 ,out01);
  2210. _mm_store_pd (OUT1+2,out11);
  2211. M_+=4-64*2; // Jump back to first column (row+2).
  2212. OUT0+=4;
  2213. OUT1+=4;
  2214. }
  2215. #endif
  2216. }
  2217. #endif
  2218. #if defined(__SSE3__)
  2219. template<>
  2220. inline void matmult_8x8x2<float>(float*& M_, float*& IN0, float*& IN1, float*& OUT0, float*& OUT1){
  2221. #if defined __SSE3__ // SSE code.
  2222. __m128 out00,out01,out10,out11;
  2223. __m128 out20,out21,out30,out31;
  2224. float* in0__ = IN0;
  2225. float* in1__ = IN1;
  2226. out00 = _mm_load_ps(OUT0);
  2227. out01 = _mm_load_ps(OUT1);
  2228. out10 = _mm_load_ps(OUT0+4);
  2229. out11 = _mm_load_ps(OUT1+4);
  2230. out20 = _mm_load_ps(OUT0+8);
  2231. out21 = _mm_load_ps(OUT1+8);
  2232. out30 = _mm_load_ps(OUT0+12);
  2233. out31 = _mm_load_ps(OUT1+12);
  2234. for(int i2=0;i2<8;i2+=2){
  2235. __m128 m00;
  2236. __m128 ot00;
  2237. __m128 mt0,mtt0;
  2238. __m128 in00,in00_r,in01,in01_r;
  2239. in00 = _mm_castpd_ps(_mm_load_pd1((const double*)in0__));
  2240. in00_r = _mm_shuffle_ps(in00,in00,_MM_SHUFFLE(2,3,0,1));
  2241. in01 = _mm_castpd_ps(_mm_load_pd1((const double*)in1__));
  2242. in01_r = _mm_shuffle_ps(in01,in01,_MM_SHUFFLE(2,3,0,1));
  2243. m00 = _mm_load_ps(M_);
  2244. mt0 = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(2,2,0,0));
  2245. out00= _mm_add_ps (out00,_mm_mul_ps( mt0,in00 ));
  2246. mtt0 = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(3,3,1,1));
  2247. out00= _mm_addsub_ps(out00,_mm_mul_ps(mtt0,in00_r));
  2248. out01 = _mm_add_ps (out01,_mm_mul_ps( mt0,in01 ));
  2249. out01 = _mm_addsub_ps(out01,_mm_mul_ps(mtt0,in01_r));
  2250. m00 = _mm_load_ps(M_+4);
  2251. mt0 = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(2,2,0,0));
  2252. out10= _mm_add_ps (out10,_mm_mul_ps( mt0,in00 ));
  2253. mtt0 = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(3,3,1,1));
  2254. out10= _mm_addsub_ps(out10,_mm_mul_ps(mtt0,in00_r));
  2255. out11 = _mm_add_ps (out11,_mm_mul_ps( mt0,in01 ));
  2256. out11 = _mm_addsub_ps(out11,_mm_mul_ps(mtt0,in01_r));
  2257. m00 = _mm_load_ps(M_+8);
  2258. mt0 = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(2,2,0,0));
  2259. out20= _mm_add_ps (out20,_mm_mul_ps( mt0,in00 ));
  2260. mtt0 = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(3,3,1,1));
  2261. out20= _mm_addsub_ps(out20,_mm_mul_ps(mtt0,in00_r));
  2262. out21 = _mm_add_ps (out21,_mm_mul_ps( mt0,in01 ));
  2263. out21 = _mm_addsub_ps(out21,_mm_mul_ps(mtt0,in01_r));
  2264. m00 = _mm_load_ps(M_+12);
  2265. mt0 = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(2,2,0,0));
  2266. out30= _mm_add_ps (out30,_mm_mul_ps( mt0, in00));
  2267. mtt0 = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(3,3,1,1));
  2268. out30= _mm_addsub_ps(out30,_mm_mul_ps(mtt0,in00_r));
  2269. out31 = _mm_add_ps (out31,_mm_mul_ps( mt0,in01 ));
  2270. out31 = _mm_addsub_ps(out31,_mm_mul_ps(mtt0,in01_r));
  2271. in00 = _mm_castpd_ps(_mm_load_pd1((const double*) (in0__+2)));
  2272. in00_r = _mm_shuffle_ps(in00,in00,_MM_SHUFFLE(2,3,0,1));
  2273. in01 = _mm_castpd_ps(_mm_load_pd1((const double*) (in1__+2)));
  2274. in01_r = _mm_shuffle_ps(in01,in01,_MM_SHUFFLE(2,3,0,1));
  2275. m00 = _mm_load_ps(M_+16);
  2276. mt0 = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(2,2,0,0));
  2277. out00= _mm_add_ps (out00,_mm_mul_ps( mt0,in00 ));
  2278. mtt0 = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(3,3,1,1));
  2279. out00= _mm_addsub_ps(out00,_mm_mul_ps(mtt0,in00_r));
  2280. out01 = _mm_add_ps (out01,_mm_mul_ps( mt0,in01 ));
  2281. out01 = _mm_addsub_ps(out01,_mm_mul_ps(mtt0,in01_r));
  2282. m00 = _mm_load_ps(M_+20);
  2283. mt0 = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(2,2,0,0));
  2284. out10= _mm_add_ps (out10,_mm_mul_ps( mt0,in00 ));
  2285. mtt0 = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(3,3,1,1));
  2286. out10= _mm_addsub_ps(out10,_mm_mul_ps(mtt0,in00_r));
  2287. out11 = _mm_add_ps (out11,_mm_mul_ps( mt0,in01 ));
  2288. out11 = _mm_addsub_ps(out11,_mm_mul_ps(mtt0,in01_r));
  2289. m00 = _mm_load_ps(M_+24);
  2290. mt0 = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(2,2,0,0));
  2291. out20= _mm_add_ps (out20,_mm_mul_ps( mt0,in00 ));
  2292. mtt0 = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(3,3,1,1));
  2293. out20= _mm_addsub_ps(out20,_mm_mul_ps(mtt0,in00_r));
  2294. out21 = _mm_add_ps (out21,_mm_mul_ps( mt0,in01 ));
  2295. out21 = _mm_addsub_ps(out21,_mm_mul_ps(mtt0,in01_r));
  2296. m00 = _mm_load_ps(M_+28);
  2297. mt0 = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(2,2,0,0));
  2298. out30= _mm_add_ps (out30,_mm_mul_ps( mt0,in00 ));
  2299. mtt0 = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(3,3,1,1));
  2300. out30= _mm_addsub_ps(out30,_mm_mul_ps(mtt0,in00_r));
  2301. out31 = _mm_add_ps (out31,_mm_mul_ps( mt0,in01 ));
  2302. out31 = _mm_addsub_ps(out31,_mm_mul_ps(mtt0,in01_r));
  2303. M_ += 32;
  2304. in0__ += 4;
  2305. in1__ += 4;
  2306. }
  2307. _mm_store_ps(OUT0,out00);
  2308. _mm_store_ps(OUT1,out01);
  2309. _mm_store_ps(OUT0+4,out10);
  2310. _mm_store_ps(OUT1+4,out11);
  2311. _mm_store_ps(OUT0+8,out20);
  2312. _mm_store_ps(OUT1+8,out21);
  2313. _mm_store_ps(OUT0+12,out30);
  2314. _mm_store_ps(OUT1+12,out31);
  2315. #endif
  2316. }
  2317. #endif
  2318. template <class Real_t>
  2319. void VListHadamard(size_t dof, size_t M_dim, size_t ker_dim0, size_t ker_dim1, Vector<size_t>& interac_dsp,
  2320. Vector<size_t>& interac_vec, Vector<Real_t*>& precomp_mat, Vector<Real_t>& fft_in, Vector<Real_t>& fft_out){
  2321. size_t chld_cnt=1UL<<COORD_DIM;
  2322. size_t fftsize_in =M_dim*ker_dim0*chld_cnt*2;
  2323. size_t fftsize_out=M_dim*ker_dim1*chld_cnt*2;
  2324. Real_t* zero_vec0=mem::aligned_new<Real_t>(fftsize_in );
  2325. Real_t* zero_vec1=mem::aligned_new<Real_t>(fftsize_out);
  2326. size_t n_out=fft_out.Dim()/fftsize_out;
  2327. // Set buff_out to zero.
  2328. #pragma omp parallel for
  2329. for(size_t k=0;k<n_out;k++){
  2330. Vector<Real_t> dnward_check_fft(fftsize_out, &fft_out[k*fftsize_out], false);
  2331. dnward_check_fft.SetZero();
  2332. }
  2333. // Build list of interaction pairs (in, out vectors).
  2334. size_t mat_cnt=precomp_mat.Dim();
  2335. size_t blk1_cnt=interac_dsp.Dim()/mat_cnt;
  2336. const size_t V_BLK_SIZE=V_BLK_CACHE*64/sizeof(Real_t);
  2337. Real_t** IN_ =mem::aligned_new<Real_t*>(2*V_BLK_SIZE*blk1_cnt*mat_cnt);
  2338. Real_t** OUT_=mem::aligned_new<Real_t*>(2*V_BLK_SIZE*blk1_cnt*mat_cnt);
  2339. #pragma omp parallel for
  2340. for(size_t interac_blk1=0; interac_blk1<blk1_cnt*mat_cnt; interac_blk1++){
  2341. size_t interac_dsp0 = (interac_blk1==0?0:interac_dsp[interac_blk1-1]);
  2342. size_t interac_dsp1 = interac_dsp[interac_blk1 ] ;
  2343. size_t interac_cnt = interac_dsp1-interac_dsp0;
  2344. for(size_t j=0;j<interac_cnt;j++){
  2345. IN_ [2*V_BLK_SIZE*interac_blk1 +j]=&fft_in [interac_vec[(interac_dsp0+j)*2+0]];
  2346. OUT_[2*V_BLK_SIZE*interac_blk1 +j]=&fft_out[interac_vec[(interac_dsp0+j)*2+1]];
  2347. }
  2348. IN_ [2*V_BLK_SIZE*interac_blk1 +interac_cnt]=zero_vec0;
  2349. OUT_[2*V_BLK_SIZE*interac_blk1 +interac_cnt]=zero_vec1;
  2350. }
  2351. int omp_p=omp_get_max_threads();
  2352. #pragma omp parallel for
  2353. for(int pid=0; pid<omp_p; pid++){
  2354. size_t a=( pid *M_dim)/omp_p;
  2355. size_t b=((pid+1)*M_dim)/omp_p;
  2356. for(int in_dim=0;in_dim<ker_dim0;in_dim++)
  2357. for(int ot_dim=0;ot_dim<ker_dim1;ot_dim++)
  2358. for(size_t blk1=0; blk1<blk1_cnt; blk1++)
  2359. for(size_t k=a; k< b; k++)
  2360. for(size_t mat_indx=0; mat_indx< mat_cnt;mat_indx++){
  2361. size_t interac_blk1 = blk1*mat_cnt+mat_indx;
  2362. size_t interac_dsp0 = (interac_blk1==0?0:interac_dsp[interac_blk1-1]);
  2363. size_t interac_dsp1 = interac_dsp[interac_blk1 ] ;
  2364. size_t interac_cnt = interac_dsp1-interac_dsp0;
  2365. Real_t** IN = IN_ + 2*V_BLK_SIZE*interac_blk1;
  2366. Real_t** OUT= OUT_+ 2*V_BLK_SIZE*interac_blk1;
  2367. Real_t* M = precomp_mat[mat_indx] + k*chld_cnt*chld_cnt*2 + (ot_dim+in_dim*ker_dim1)*M_dim*128;
  2368. {
  2369. for(size_t j=0;j<interac_cnt;j+=2){
  2370. Real_t* M_ = M;
  2371. Real_t* IN0 = IN [j+0] + (in_dim*M_dim+k)*chld_cnt*2;
  2372. Real_t* IN1 = IN [j+1] + (in_dim*M_dim+k)*chld_cnt*2;
  2373. Real_t* OUT0 = OUT[j+0] + (ot_dim*M_dim+k)*chld_cnt*2;
  2374. Real_t* OUT1 = OUT[j+1] + (ot_dim*M_dim+k)*chld_cnt*2;
  2375. #ifdef __SSE__
  2376. if (j+2 < interac_cnt) { // Prefetch
  2377. _mm_prefetch(((char *)(IN[j+2] + (in_dim*M_dim+k)*chld_cnt*2)), _MM_HINT_T0);
  2378. _mm_prefetch(((char *)(IN[j+2] + (in_dim*M_dim+k)*chld_cnt*2) + 64), _MM_HINT_T0);
  2379. _mm_prefetch(((char *)(IN[j+3] + (in_dim*M_dim+k)*chld_cnt*2)), _MM_HINT_T0);
  2380. _mm_prefetch(((char *)(IN[j+3] + (in_dim*M_dim+k)*chld_cnt*2) + 64), _MM_HINT_T0);
  2381. _mm_prefetch(((char *)(OUT[j+2] + (ot_dim*M_dim+k)*chld_cnt*2)), _MM_HINT_T0);
  2382. _mm_prefetch(((char *)(OUT[j+2] + (ot_dim*M_dim+k)*chld_cnt*2) + 64), _MM_HINT_T0);
  2383. _mm_prefetch(((char *)(OUT[j+3] + (ot_dim*M_dim+k)*chld_cnt*2)), _MM_HINT_T0);
  2384. _mm_prefetch(((char *)(OUT[j+3] + (ot_dim*M_dim+k)*chld_cnt*2) + 64), _MM_HINT_T0);
  2385. }
  2386. #endif
  2387. matmult_8x8x2(M_, IN0, IN1, OUT0, OUT1);
  2388. }
  2389. }
  2390. }
  2391. }
  2392. // Compute flops.
  2393. {
  2394. Profile::Add_FLOP(8*8*8*(interac_vec.Dim()/2)*M_dim*ker_dim0*ker_dim1*dof);
  2395. }
  2396. // Free memory
  2397. mem::aligned_delete<Real_t*>(IN_ );
  2398. mem::aligned_delete<Real_t*>(OUT_);
  2399. mem::aligned_delete<Real_t>(zero_vec0);
  2400. mem::aligned_delete<Real_t>(zero_vec1);
  2401. }
  2402. template <class FMMNode>
  2403. void FMM_Pts<FMMNode>::V_ListSetup(SetupData<Real_t>& setup_data, std::vector<Matrix<Real_t> >& buff, std::vector<Vector<FMMNode_t*> >& n_list, int level, bool device){
  2404. if(this->MultipoleOrder()==0) return;
  2405. if(level==0) return;
  2406. { // Set setup_data
  2407. setup_data.level=level;
  2408. setup_data.kernel=kernel->k_m2l;
  2409. setup_data.interac_type.resize(1);
  2410. setup_data.interac_type[0]=V1_Type;
  2411. setup_data. input_data=&buff[0];
  2412. setup_data.output_data=&buff[1];
  2413. Vector<FMMNode_t*>& nodes_in =n_list[2];
  2414. Vector<FMMNode_t*>& nodes_out=n_list[3];
  2415. setup_data.nodes_in .clear();
  2416. setup_data.nodes_out.clear();
  2417. for(size_t i=0;i<nodes_in .Dim();i++) if(nodes_in [i]->Depth()==level-1 || level==-1) setup_data.nodes_in .push_back(nodes_in [i]);
  2418. for(size_t i=0;i<nodes_out.Dim();i++) if(nodes_out[i]->Depth()==level-1 || level==-1) setup_data.nodes_out.push_back(nodes_out[i]);
  2419. }
  2420. std::vector<void*>& nodes_in =setup_data.nodes_in ;
  2421. std::vector<void*>& nodes_out=setup_data.nodes_out;
  2422. std::vector<Vector<Real_t>*>& input_vector=setup_data. input_vector; input_vector.clear();
  2423. std::vector<Vector<Real_t>*>& output_vector=setup_data.output_vector; output_vector.clear();
  2424. for(size_t i=0;i<nodes_in .size();i++) input_vector.push_back(&((FMMData*)((FMMNode*)((FMMNode*)nodes_in [i])->Child(0))->FMMData())->upward_equiv);
  2425. for(size_t i=0;i<nodes_out.size();i++) output_vector.push_back(&((FMMData*)((FMMNode*)((FMMNode*)nodes_out[i])->Child(0))->FMMData())->dnward_equiv);
  2426. /////////////////////////////////////////////////////////////////////////////
  2427. size_t n_in =nodes_in .size();
  2428. size_t n_out=nodes_out.size();
  2429. // Setup precomputed data.
  2430. if(setup_data.precomp_data->Dim(0)*setup_data.precomp_data->Dim(1)==0) SetupPrecomp(setup_data,device);
  2431. // Build interac_data
  2432. Profile::Tic("Interac-Data",&this->comm,true,25);
  2433. Matrix<char>& interac_data=setup_data.interac_data;
  2434. if(n_out>0 && n_in >0){ // Build precomp_data, interac_data
  2435. size_t precomp_offset=0;
  2436. Mat_Type& interac_type=setup_data.interac_type[0];
  2437. size_t mat_cnt=this->interac_list.ListCount(interac_type);
  2438. Matrix<size_t> precomp_data_offset;
  2439. std::vector<size_t> interac_mat;
  2440. { // Load precomp_data for interac_type.
  2441. struct HeaderData{
  2442. size_t total_size;
  2443. size_t level;
  2444. size_t mat_cnt ;
  2445. size_t max_depth;
  2446. };
  2447. Matrix<char>& precomp_data=*setup_data.precomp_data;
  2448. char* indx_ptr=precomp_data[0]+precomp_offset;
  2449. HeaderData& header=*(HeaderData*)indx_ptr;indx_ptr+=sizeof(HeaderData);
  2450. precomp_data_offset.ReInit(header.mat_cnt,1+(2+2)*header.max_depth, (size_t*)indx_ptr, false);
  2451. precomp_offset+=header.total_size;
  2452. for(size_t mat_id=0;mat_id<mat_cnt;mat_id++){
  2453. Matrix<Real_t>& M0 = this->mat->Mat(level, interac_type, mat_id);
  2454. assert(M0.Dim(0)>0 && M0.Dim(1)>0); UNUSED(M0);
  2455. interac_mat.push_back(precomp_data_offset[mat_id][0]);
  2456. }
  2457. }
  2458. size_t dof;
  2459. size_t m=MultipoleOrder();
  2460. size_t ker_dim0=setup_data.kernel->ker_dim[0];
  2461. size_t ker_dim1=setup_data.kernel->ker_dim[1];
  2462. size_t fftsize;
  2463. {
  2464. size_t n1=m*2;
  2465. size_t n2=n1*n1;
  2466. size_t n3_=n2*(n1/2+1);
  2467. size_t chld_cnt=1UL<<COORD_DIM;
  2468. fftsize=2*n3_*chld_cnt;
  2469. dof=1;
  2470. }
  2471. int omp_p=omp_get_max_threads();
  2472. size_t buff_size=DEVICE_BUFFER_SIZE*1024l*1024l;
  2473. size_t n_blk0=2*fftsize*dof*(ker_dim0*n_in +ker_dim1*n_out)*sizeof(Real_t)/buff_size;
  2474. if(n_blk0==0) n_blk0=1;
  2475. std::vector<std::vector<size_t> > fft_vec(n_blk0);
  2476. std::vector<std::vector<size_t> > ifft_vec(n_blk0);
  2477. std::vector<std::vector<size_t> > interac_vec(n_blk0);
  2478. std::vector<std::vector<size_t> > interac_dsp(n_blk0);
  2479. {
  2480. Matrix<Real_t>& input_data=*setup_data. input_data;
  2481. Matrix<Real_t>& output_data=*setup_data.output_data;
  2482. std::vector<std::vector<FMMNode*> > nodes_blk_in (n_blk0);
  2483. std::vector<std::vector<FMMNode*> > nodes_blk_out(n_blk0);
  2484. for(size_t i=0;i<n_in;i++) ((FMMNode*)nodes_in[i])->node_id=i;
  2485. for(size_t blk0=0;blk0<n_blk0;blk0++){
  2486. size_t blk0_start=(n_out* blk0 )/n_blk0;
  2487. size_t blk0_end =(n_out*(blk0+1))/n_blk0;
  2488. std::vector<FMMNode*>& nodes_in_ =nodes_blk_in [blk0];
  2489. std::vector<FMMNode*>& nodes_out_=nodes_blk_out[blk0];
  2490. { // Build node list for blk0.
  2491. std::set<void*> nodes_in;
  2492. for(size_t i=blk0_start;i<blk0_end;i++){
  2493. nodes_out_.push_back((FMMNode*)nodes_out[i]);
  2494. std::vector<FMMNode*>& lst=((FMMNode*)nodes_out[i])->interac_list[interac_type];
  2495. for(size_t k=0;k<mat_cnt;k++) if(lst[k]!=NULL) nodes_in.insert(lst[k]);
  2496. }
  2497. for(std::set<void*>::iterator node=nodes_in.begin(); node != nodes_in.end(); node++){
  2498. nodes_in_.push_back((FMMNode*)*node);
  2499. }
  2500. size_t input_dim=nodes_in_ .size()*ker_dim0*dof*fftsize;
  2501. size_t output_dim=nodes_out_.size()*ker_dim1*dof*fftsize;
  2502. size_t buffer_dim=(ker_dim0+ker_dim1)*dof*fftsize*omp_p;
  2503. if(buff_size<(input_dim + output_dim + buffer_dim)*sizeof(Real_t))
  2504. buff_size=(input_dim + output_dim + buffer_dim)*sizeof(Real_t);
  2505. }
  2506. { // Set fft vectors.
  2507. for(size_t i=0;i<nodes_in_ .size();i++) fft_vec[blk0].push_back((size_t)(& input_vector[nodes_in_[i]->node_id][0][0]- input_data[0]));
  2508. for(size_t i=0;i<nodes_out_.size();i++)ifft_vec[blk0].push_back((size_t)(&output_vector[blk0_start + i ][0][0]-output_data[0]));
  2509. }
  2510. }
  2511. for(size_t blk0=0;blk0<n_blk0;blk0++){ // Hadamard interactions.
  2512. std::vector<FMMNode*>& nodes_in_ =nodes_blk_in [blk0];
  2513. std::vector<FMMNode*>& nodes_out_=nodes_blk_out[blk0];
  2514. for(size_t i=0;i<nodes_in_.size();i++) nodes_in_[i]->node_id=i;
  2515. { // Next blocking level.
  2516. size_t n_blk1=nodes_out_.size()*(2)*sizeof(Real_t)/(64*V_BLK_CACHE);
  2517. if(n_blk1==0) n_blk1=1;
  2518. size_t interac_dsp_=0;
  2519. for(size_t blk1=0;blk1<n_blk1;blk1++){
  2520. size_t blk1_start=(nodes_out_.size()* blk1 )/n_blk1;
  2521. size_t blk1_end =(nodes_out_.size()*(blk1+1))/n_blk1;
  2522. for(size_t k=0;k<mat_cnt;k++){
  2523. for(size_t i=blk1_start;i<blk1_end;i++){
  2524. std::vector<FMMNode*>& lst=((FMMNode*)nodes_out_[i])->interac_list[interac_type];
  2525. if(lst[k]!=NULL){
  2526. interac_vec[blk0].push_back(lst[k]->node_id*fftsize*ker_dim0*dof);
  2527. interac_vec[blk0].push_back( i *fftsize*ker_dim1*dof);
  2528. interac_dsp_++;
  2529. }
  2530. }
  2531. interac_dsp[blk0].push_back(interac_dsp_);
  2532. }
  2533. }
  2534. }
  2535. }
  2536. }
  2537. { // Set interac_data.
  2538. size_t data_size=sizeof(size_t)*6; // buff_size, m, dof, ker_dim0, ker_dim1, n_blk0
  2539. for(size_t blk0=0;blk0<n_blk0;blk0++){
  2540. data_size+=sizeof(size_t)+ fft_vec[blk0].size()*sizeof(size_t);
  2541. data_size+=sizeof(size_t)+ ifft_vec[blk0].size()*sizeof(size_t);
  2542. data_size+=sizeof(size_t)+interac_vec[blk0].size()*sizeof(size_t);
  2543. data_size+=sizeof(size_t)+interac_dsp[blk0].size()*sizeof(size_t);
  2544. }
  2545. data_size+=sizeof(size_t)+interac_mat.size()*sizeof(size_t);
  2546. if(data_size>interac_data.Dim(0)*interac_data.Dim(1))
  2547. interac_data.ReInit(1,data_size);
  2548. char* data_ptr=&interac_data[0][0];
  2549. ((size_t*)data_ptr)[0]=buff_size; data_ptr+=sizeof(size_t);
  2550. ((size_t*)data_ptr)[0]= m; data_ptr+=sizeof(size_t);
  2551. ((size_t*)data_ptr)[0]= dof; data_ptr+=sizeof(size_t);
  2552. ((size_t*)data_ptr)[0]= ker_dim0; data_ptr+=sizeof(size_t);
  2553. ((size_t*)data_ptr)[0]= ker_dim1; data_ptr+=sizeof(size_t);
  2554. ((size_t*)data_ptr)[0]= n_blk0; data_ptr+=sizeof(size_t);
  2555. ((size_t*)data_ptr)[0]= interac_mat.size(); data_ptr+=sizeof(size_t);
  2556. mem::memcopy(data_ptr, &interac_mat[0], interac_mat.size()*sizeof(size_t));
  2557. data_ptr+=interac_mat.size()*sizeof(size_t);
  2558. for(size_t blk0=0;blk0<n_blk0;blk0++){
  2559. ((size_t*)data_ptr)[0]= fft_vec[blk0].size(); data_ptr+=sizeof(size_t);
  2560. mem::memcopy(data_ptr, & fft_vec[blk0][0], fft_vec[blk0].size()*sizeof(size_t));
  2561. data_ptr+= fft_vec[blk0].size()*sizeof(size_t);
  2562. ((size_t*)data_ptr)[0]=ifft_vec[blk0].size(); data_ptr+=sizeof(size_t);
  2563. mem::memcopy(data_ptr, &ifft_vec[blk0][0], ifft_vec[blk0].size()*sizeof(size_t));
  2564. data_ptr+=ifft_vec[blk0].size()*sizeof(size_t);
  2565. ((size_t*)data_ptr)[0]=interac_vec[blk0].size(); data_ptr+=sizeof(size_t);
  2566. mem::memcopy(data_ptr, &interac_vec[blk0][0], interac_vec[blk0].size()*sizeof(size_t));
  2567. data_ptr+=interac_vec[blk0].size()*sizeof(size_t);
  2568. ((size_t*)data_ptr)[0]=interac_dsp[blk0].size(); data_ptr+=sizeof(size_t);
  2569. mem::memcopy(data_ptr, &interac_dsp[blk0][0], interac_dsp[blk0].size()*sizeof(size_t));
  2570. data_ptr+=interac_dsp[blk0].size()*sizeof(size_t);
  2571. }
  2572. }
  2573. }
  2574. Profile::Toc();
  2575. if(device){ // Host2Device
  2576. Profile::Tic("Host2Device",&this->comm,false,25);
  2577. setup_data.interac_data. AllocDevice(true);
  2578. Profile::Toc();
  2579. }
  2580. }
  2581. template <class FMMNode>
  2582. void FMM_Pts<FMMNode>::V_List (SetupData<Real_t>& setup_data, bool device){
  2583. assert(!device); //Can not run on accelerator yet.
  2584. int np;
  2585. MPI_Comm_size(comm,&np);
  2586. if(setup_data.interac_data.Dim(0)==0 || setup_data.interac_data.Dim(1)==0){
  2587. if(np>1) Profile::Tic("Host2Device",&this->comm,false,25);
  2588. if(np>1) Profile::Toc();
  2589. return;
  2590. }
  2591. Profile::Tic("Host2Device",&this->comm,false,25);
  2592. int level=setup_data.level;
  2593. size_t buff_size=*((size_t*)&setup_data.interac_data[0][0]);
  2594. typename Matrix<Real_t>::Device M_d;
  2595. typename Vector<char>::Device buff;
  2596. typename Matrix<char>::Device precomp_data;
  2597. typename Matrix<char>::Device interac_data;
  2598. typename Matrix<Real_t>::Device input_data;
  2599. typename Matrix<Real_t>::Device output_data;
  2600. Matrix<Real_t>& M = this->mat->Mat(level, DC2DE_Type, 0);
  2601. if(device){
  2602. if(this->dev_buffer.Dim()<buff_size) this->dev_buffer.ReInit(buff_size);
  2603. M_d = M. AllocDevice(false);
  2604. buff = this-> dev_buffer. AllocDevice(false);
  2605. precomp_data= setup_data.precomp_data->AllocDevice(false);
  2606. interac_data= setup_data.interac_data. AllocDevice(false);
  2607. input_data = setup_data. input_data->AllocDevice(false);
  2608. output_data = setup_data. output_data->AllocDevice(false);
  2609. }else{
  2610. if(this->cpu_buffer.Dim()<buff_size) this->cpu_buffer.ReInit(buff_size);
  2611. M_d = M;
  2612. buff = this-> cpu_buffer;
  2613. precomp_data=*setup_data.precomp_data;
  2614. interac_data= setup_data.interac_data;
  2615. input_data =*setup_data. input_data;
  2616. output_data =*setup_data. output_data;
  2617. }
  2618. Profile::Toc();
  2619. { // Offloaded computation.
  2620. // Set interac_data.
  2621. size_t m, dof, ker_dim0, ker_dim1, n_blk0;
  2622. std::vector<Vector<size_t> > fft_vec;
  2623. std::vector<Vector<size_t> > ifft_vec;
  2624. std::vector<Vector<size_t> > interac_vec;
  2625. std::vector<Vector<size_t> > interac_dsp;
  2626. Vector<Real_t*> precomp_mat;
  2627. { // Set interac_data.
  2628. char* data_ptr=&interac_data[0][0];
  2629. buff_size=((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t);
  2630. m =((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t);
  2631. dof =((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t);
  2632. ker_dim0 =((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t);
  2633. ker_dim1 =((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t);
  2634. n_blk0 =((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t);
  2635. fft_vec .resize(n_blk0);
  2636. ifft_vec.resize(n_blk0);
  2637. interac_vec.resize(n_blk0);
  2638. interac_dsp.resize(n_blk0);
  2639. Vector<size_t> interac_mat;
  2640. interac_mat.ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false);
  2641. data_ptr+=sizeof(size_t)+interac_mat.Dim()*sizeof(size_t);
  2642. precomp_mat.Resize(interac_mat.Dim());
  2643. for(size_t i=0;i<interac_mat.Dim();i++){
  2644. precomp_mat[i]=(Real_t*)(precomp_data[0]+interac_mat[i]);
  2645. }
  2646. for(size_t blk0=0;blk0<n_blk0;blk0++){
  2647. fft_vec[blk0].ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false);
  2648. data_ptr+=sizeof(size_t)+fft_vec[blk0].Dim()*sizeof(size_t);
  2649. ifft_vec[blk0].ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false);
  2650. data_ptr+=sizeof(size_t)+ifft_vec[blk0].Dim()*sizeof(size_t);
  2651. interac_vec[blk0].ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false);
  2652. data_ptr+=sizeof(size_t)+interac_vec[blk0].Dim()*sizeof(size_t);
  2653. interac_dsp[blk0].ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false);
  2654. data_ptr+=sizeof(size_t)+interac_dsp[blk0].Dim()*sizeof(size_t);
  2655. }
  2656. }
  2657. int omp_p=omp_get_max_threads();
  2658. size_t M_dim, fftsize;
  2659. {
  2660. size_t n1=m*2;
  2661. size_t n2=n1*n1;
  2662. size_t n3_=n2*(n1/2+1);
  2663. size_t chld_cnt=1UL<<COORD_DIM;
  2664. fftsize=2*n3_*chld_cnt;
  2665. M_dim=n3_;
  2666. }
  2667. for(size_t blk0=0;blk0<n_blk0;blk0++){ // interactions
  2668. size_t n_in = fft_vec[blk0].Dim();
  2669. size_t n_out=ifft_vec[blk0].Dim();
  2670. size_t input_dim=n_in *ker_dim0*dof*fftsize;
  2671. size_t output_dim=n_out*ker_dim1*dof*fftsize;
  2672. size_t buffer_dim=(ker_dim0+ker_dim1)*dof*fftsize*omp_p;
  2673. Vector<Real_t> fft_in ( input_dim, (Real_t*)&buff[ 0 ],false);
  2674. Vector<Real_t> fft_out(output_dim, (Real_t*)&buff[ input_dim *sizeof(Real_t)],false);
  2675. Vector<Real_t> buffer(buffer_dim, (Real_t*)&buff[(input_dim+output_dim)*sizeof(Real_t)],false);
  2676. { // FFT
  2677. if(np==1) Profile::Tic("FFT",&comm,false,100);
  2678. Vector<Real_t> input_data_( input_data.dim[0]* input_data.dim[1], input_data[0], false);
  2679. FFT_UpEquiv(dof, m, ker_dim0, fft_vec[blk0], input_data_, fft_in, buffer);
  2680. if(np==1) Profile::Toc();
  2681. }
  2682. { // Hadamard
  2683. #ifdef PVFMM_HAVE_PAPI
  2684. #ifdef __VERBOSE__
  2685. std::cout << "Starting counters new\n";
  2686. if (PAPI_start(EventSet) != PAPI_OK) std::cout << "handle_error3" << std::endl;
  2687. #endif
  2688. #endif
  2689. if(np==1) Profile::Tic("HadamardProduct",&comm,false,100);
  2690. VListHadamard<Real_t>(dof, M_dim, ker_dim0, ker_dim1, interac_dsp[blk0], interac_vec[blk0], precomp_mat, fft_in, fft_out);
  2691. if(np==1) Profile::Toc();
  2692. #ifdef PVFMM_HAVE_PAPI
  2693. #ifdef __VERBOSE__
  2694. if (PAPI_stop(EventSet, values) != PAPI_OK) std::cout << "handle_error4" << std::endl;
  2695. std::cout << "Stopping counters\n";
  2696. #endif
  2697. #endif
  2698. }
  2699. { // IFFT
  2700. if(np==1) Profile::Tic("IFFT",&comm,false,100);
  2701. Matrix<Real_t> M(M_d.dim[0],M_d.dim[1],M_d[0],false);
  2702. Vector<Real_t> output_data_(output_data.dim[0]*output_data.dim[1], output_data[0], false);
  2703. FFT_Check2Equiv(dof, m, ker_dim1, ifft_vec[blk0], fft_out, output_data_, buffer, M);
  2704. if(np==1) Profile::Toc();
  2705. }
  2706. }
  2707. }
  2708. }
  2709. template <class FMMNode>
  2710. void FMM_Pts<FMMNode>::Down2DownSetup(SetupData<Real_t>& setup_data, std::vector<Matrix<Real_t> >& buff, std::vector<Vector<FMMNode_t*> >& n_list, int level, bool device){
  2711. if(this->MultipoleOrder()==0) return;
  2712. { // Set setup_data
  2713. setup_data.level=level;
  2714. setup_data.kernel=kernel->k_l2l;
  2715. setup_data.interac_type.resize(1);
  2716. setup_data.interac_type[0]=D2D_Type;
  2717. setup_data. input_data=&buff[1];
  2718. setup_data.output_data=&buff[1];
  2719. Vector<FMMNode_t*>& nodes_in =n_list[1];
  2720. Vector<FMMNode_t*>& nodes_out=n_list[1];
  2721. setup_data.nodes_in .clear();
  2722. setup_data.nodes_out.clear();
  2723. for(size_t i=0;i<nodes_in .Dim();i++) if(nodes_in [i]->Depth()==level-1) setup_data.nodes_in .push_back(nodes_in [i]);
  2724. for(size_t i=0;i<nodes_out.Dim();i++) if(nodes_out[i]->Depth()==level ) setup_data.nodes_out.push_back(nodes_out[i]);
  2725. }
  2726. std::vector<void*>& nodes_in =setup_data.nodes_in ;
  2727. std::vector<void*>& nodes_out=setup_data.nodes_out;
  2728. std::vector<Vector<Real_t>*>& input_vector=setup_data. input_vector; input_vector.clear();
  2729. std::vector<Vector<Real_t>*>& output_vector=setup_data.output_vector; output_vector.clear();
  2730. for(size_t i=0;i<nodes_in .size();i++) input_vector.push_back(&((FMMData*)((FMMNode*)nodes_in [i])->FMMData())->dnward_equiv);
  2731. for(size_t i=0;i<nodes_out.size();i++) output_vector.push_back(&((FMMData*)((FMMNode*)nodes_out[i])->FMMData())->dnward_equiv);
  2732. SetupInterac(setup_data,device);
  2733. }
  2734. template <class FMMNode>
  2735. void FMM_Pts<FMMNode>::Down2Down (SetupData<Real_t>& setup_data, bool device){
  2736. //Add Down2Down contribution.
  2737. EvalList(setup_data, device);
  2738. }
  2739. template <class FMMNode>
  2740. void FMM_Pts<FMMNode>::SetupInteracPts(SetupData<Real_t>& setup_data, bool shift_src, bool shift_trg, Matrix<Real_t>* M, bool device){
  2741. int level=setup_data.level;
  2742. std::vector<Mat_Type>& interac_type_lst=setup_data.interac_type;
  2743. std::vector<void*>& nodes_in =setup_data.nodes_in ;
  2744. std::vector<void*>& nodes_out=setup_data.nodes_out;
  2745. Matrix<Real_t>& output_data=*setup_data.output_data;
  2746. Matrix<Real_t>& input_data=*setup_data. input_data;
  2747. Matrix<Real_t>& coord_data=*setup_data. coord_data;
  2748. std::vector<Vector<Real_t>*>& input_vector=setup_data. input_vector;
  2749. std::vector<Vector<Real_t>*>& output_vector=setup_data.output_vector;
  2750. size_t n_in =nodes_in .size();
  2751. size_t n_out=nodes_out.size();
  2752. //setup_data.precomp_data=NULL;
  2753. // Build interac_data
  2754. Profile::Tic("Interac-Data",&this->comm,true,25);
  2755. Matrix<char>& interac_data=setup_data.interac_data;
  2756. if(n_out>0 && n_in >0){
  2757. size_t ker_dim0=setup_data.kernel->ker_dim[0];
  2758. size_t ker_dim1=setup_data.kernel->ker_dim[1];
  2759. size_t dof=1;
  2760. #pragma omp parallel for
  2761. for(size_t i=0;i<n_in ;i++) ((FMMNode*)nodes_in [i])->node_id=i;
  2762. std::vector<size_t> trg_interac_cnt(n_out,0);
  2763. std::vector<size_t> trg_coord(n_out);
  2764. std::vector<size_t> trg_value(n_out);
  2765. std::vector<size_t> trg_cnt(n_out);
  2766. std::vector<Real_t> scaling(n_out*(ker_dim0+ker_dim1),0);
  2767. { // Set trg data
  2768. Mat_Type& interac_type=interac_type_lst[0];
  2769. #pragma omp parallel for
  2770. for(size_t i=0;i<n_out;i++){
  2771. if(!((FMMNode*)nodes_out[i])->IsGhost() && (level==-1 || ((FMMNode*)nodes_out[i])->Depth()==level)){
  2772. trg_cnt [i]=output_vector[i*2+0]->Dim()/COORD_DIM;
  2773. trg_coord[i]=(size_t)(&output_vector[i*2+0][0][0]- coord_data[0]);
  2774. trg_value[i]=(size_t)(&output_vector[i*2+1][0][0]-output_data[0]);
  2775. Real_t* s=&scaling[i*(ker_dim0+ker_dim1)];
  2776. for(size_t j=0;j<ker_dim1;j++){
  2777. if(!this->Homogen()) s[j]=1.0;
  2778. else if(interac_type==S2U_Type) s[ j]=pow(0.5, setup_data.kernel->trg_scal[j]*((FMMNode*)nodes_out[i])->Depth());
  2779. else if(interac_type== X_Type) s[ j]=pow(0.5, setup_data.kernel->trg_scal[j]*((FMMNode*)nodes_out[i])->Depth());
  2780. }
  2781. for(size_t j=0;j<ker_dim0;j++){
  2782. if(!this->Homogen()) s[ker_dim1+j]=1.0;
  2783. else if(interac_type==S2U_Type) s[ker_dim1+j]=pow(0.5, setup_data.kernel->src_scal[j]*((FMMNode*)nodes_out[i])->Depth());
  2784. else if(interac_type== X_Type) s[ker_dim1+j]=pow(0.5, setup_data.kernel->src_scal[j]*((FMMNode*)nodes_out[i])->Depth());
  2785. }
  2786. }
  2787. }
  2788. }
  2789. std::vector<std::vector<size_t> > src_cnt(n_out);
  2790. std::vector<std::vector<size_t> > src_coord(n_out);
  2791. std::vector<std::vector<size_t> > src_value(n_out);
  2792. std::vector<std::vector<Real_t> > shift_coord(n_out);
  2793. for(size_t type_indx=0; type_indx<interac_type_lst.size(); type_indx++){
  2794. Mat_Type& interac_type=interac_type_lst[type_indx];
  2795. size_t mat_cnt=this->interac_list.ListCount(interac_type);
  2796. #pragma omp parallel for
  2797. for(size_t i=0;i<n_out;i++){ // For each target node.
  2798. if(!((FMMNode*)nodes_out[i])->IsGhost() && (level==-1 || ((FMMNode*)nodes_out[i])->Depth()==level)){
  2799. std::vector<FMMNode*>& lst=((FMMNode*)nodes_out[i])->interac_list[interac_type];
  2800. for(size_t mat_indx=0;mat_indx<mat_cnt;mat_indx++) if(lst[mat_indx]!=NULL){ // For each direction.
  2801. size_t j=lst[mat_indx]->node_id;
  2802. if(input_vector[j*4+0]->Dim()>0 || input_vector[j*4+2]->Dim()>0){
  2803. trg_interac_cnt[i]++;
  2804. { // Determine shift for periodic boundary condition
  2805. Real_t* sc=lst[mat_indx]->Coord();
  2806. Real_t* tc=((FMMNode*)nodes_out[i])->Coord();
  2807. int* rel_coord=this->interac_list.RelativeCoord(interac_type, mat_indx);
  2808. shift_coord[i].push_back((tc[0]>sc[0] && rel_coord[0]>0? 1.0:
  2809. (tc[0]<sc[0] && rel_coord[0]<0?-1.0:0.0)) +
  2810. (shift_src?sc[0]:0) - (shift_trg?tc[0]:0) );
  2811. shift_coord[i].push_back((tc[1]>sc[1] && rel_coord[1]>0? 1.0:
  2812. (tc[1]<sc[1] && rel_coord[1]<0?-1.0:0.0)) +
  2813. (shift_src?sc[1]:0) - (shift_trg?tc[1]:0) );
  2814. shift_coord[i].push_back((tc[2]>sc[2] && rel_coord[2]>0? 1.0:
  2815. (tc[2]<sc[2] && rel_coord[2]<0?-1.0:0.0)) +
  2816. (shift_src?sc[2]:0) - (shift_trg?tc[2]:0) );
  2817. }
  2818. { // Set src data
  2819. if(input_vector[j*4+0]!=NULL){
  2820. src_cnt [i].push_back(input_vector[j*4+0]->Dim()/COORD_DIM);
  2821. src_coord[i].push_back((size_t)(& input_vector[j*4+0][0][0]- coord_data[0]));
  2822. src_value[i].push_back((size_t)(& input_vector[j*4+1][0][0]- input_data[0]));
  2823. }else{
  2824. src_cnt [i].push_back(0);
  2825. src_coord[i].push_back(0);
  2826. src_value[i].push_back(0);
  2827. }
  2828. if(input_vector[j*4+2]!=NULL){
  2829. src_cnt [i].push_back(input_vector[j*4+2]->Dim()/COORD_DIM);
  2830. src_coord[i].push_back((size_t)(& input_vector[j*4+2][0][0]- coord_data[0]));
  2831. src_value[i].push_back((size_t)(& input_vector[j*4+3][0][0]- input_data[0]));
  2832. }else{
  2833. src_cnt [i].push_back(0);
  2834. src_coord[i].push_back(0);
  2835. src_value[i].push_back(0);
  2836. }
  2837. }
  2838. }
  2839. }
  2840. }
  2841. }
  2842. }
  2843. { // Set interac_data.
  2844. size_t data_size=sizeof(size_t)*4;
  2845. data_size+=sizeof(size_t)+trg_interac_cnt.size()*sizeof(size_t);
  2846. data_size+=sizeof(size_t)+trg_coord.size()*sizeof(size_t);
  2847. data_size+=sizeof(size_t)+trg_value.size()*sizeof(size_t);
  2848. data_size+=sizeof(size_t)+trg_cnt .size()*sizeof(size_t);
  2849. data_size+=sizeof(size_t)+scaling .size()*sizeof(Real_t);
  2850. data_size+=sizeof(size_t)*2+(M!=NULL?M->Dim(0)*M->Dim(1)*sizeof(Real_t):0);
  2851. for(size_t i=0;i<n_out;i++){
  2852. data_size+=sizeof(size_t)+src_cnt [i].size()*sizeof(size_t);
  2853. data_size+=sizeof(size_t)+src_coord[i].size()*sizeof(size_t);
  2854. data_size+=sizeof(size_t)+src_value[i].size()*sizeof(size_t);
  2855. data_size+=sizeof(size_t)+shift_coord[i].size()*sizeof(Real_t);
  2856. }
  2857. if(data_size>interac_data.Dim(0)*interac_data.Dim(1))
  2858. interac_data.ReInit(1,data_size);
  2859. char* data_ptr=&interac_data[0][0];
  2860. ((size_t*)data_ptr)[0]=data_size; data_ptr+=sizeof(size_t);
  2861. ((size_t*)data_ptr)[0]= ker_dim0; data_ptr+=sizeof(size_t);
  2862. ((size_t*)data_ptr)[0]= ker_dim1; data_ptr+=sizeof(size_t);
  2863. ((size_t*)data_ptr)[0]= dof; data_ptr+=sizeof(size_t);
  2864. ((size_t*)data_ptr)[0]=trg_interac_cnt.size(); data_ptr+=sizeof(size_t);
  2865. mem::memcopy(data_ptr, &trg_interac_cnt[0], trg_interac_cnt.size()*sizeof(size_t));
  2866. data_ptr+=trg_interac_cnt.size()*sizeof(size_t);
  2867. ((size_t*)data_ptr)[0]=trg_coord.size(); data_ptr+=sizeof(size_t);
  2868. mem::memcopy(data_ptr, &trg_coord[0], trg_coord.size()*sizeof(size_t));
  2869. data_ptr+=trg_coord.size()*sizeof(size_t);
  2870. ((size_t*)data_ptr)[0]=trg_value.size(); data_ptr+=sizeof(size_t);
  2871. mem::memcopy(data_ptr, &trg_value[0], trg_value.size()*sizeof(size_t));
  2872. data_ptr+=trg_value.size()*sizeof(size_t);
  2873. ((size_t*)data_ptr)[0]=trg_cnt.size(); data_ptr+=sizeof(size_t);
  2874. mem::memcopy(data_ptr, &trg_cnt[0], trg_cnt.size()*sizeof(size_t));
  2875. data_ptr+=trg_cnt.size()*sizeof(size_t);
  2876. ((size_t*)data_ptr)[0]=scaling.size(); data_ptr+=sizeof(size_t);
  2877. mem::memcopy(data_ptr, &scaling[0], scaling.size()*sizeof(Real_t));
  2878. data_ptr+=scaling.size()*sizeof(Real_t);
  2879. if(M!=NULL){
  2880. ((size_t*)data_ptr)[0]=M->Dim(0); data_ptr+=sizeof(size_t);
  2881. ((size_t*)data_ptr)[0]=M->Dim(1); data_ptr+=sizeof(size_t);
  2882. mem::memcopy(data_ptr, M[0][0], M->Dim(0)*M->Dim(1)*sizeof(Real_t));
  2883. data_ptr+=M->Dim(0)*M->Dim(1)*sizeof(Real_t);
  2884. }else{
  2885. ((size_t*)data_ptr)[0]=0; data_ptr+=sizeof(size_t);
  2886. ((size_t*)data_ptr)[0]=0; data_ptr+=sizeof(size_t);
  2887. }
  2888. for(size_t i=0;i<n_out;i++){
  2889. ((size_t*)data_ptr)[0]=src_cnt[i].size(); data_ptr+=sizeof(size_t);
  2890. mem::memcopy(data_ptr, &src_cnt[i][0], src_cnt[i].size()*sizeof(size_t));
  2891. data_ptr+=src_cnt[i].size()*sizeof(size_t);
  2892. ((size_t*)data_ptr)[0]=src_coord[i].size(); data_ptr+=sizeof(size_t);
  2893. mem::memcopy(data_ptr, &src_coord[i][0], src_coord[i].size()*sizeof(size_t));
  2894. data_ptr+=src_coord[i].size()*sizeof(size_t);
  2895. ((size_t*)data_ptr)[0]=src_value[i].size(); data_ptr+=sizeof(size_t);
  2896. mem::memcopy(data_ptr, &src_value[i][0], src_value[i].size()*sizeof(size_t));
  2897. data_ptr+=src_value[i].size()*sizeof(size_t);
  2898. ((size_t*)data_ptr)[0]=shift_coord[i].size(); data_ptr+=sizeof(size_t);
  2899. mem::memcopy(data_ptr, &shift_coord[i][0], shift_coord[i].size()*sizeof(Real_t));
  2900. data_ptr+=shift_coord[i].size()*sizeof(Real_t);
  2901. }
  2902. }
  2903. size_t buff_size=DEVICE_BUFFER_SIZE*1024l*1024l;
  2904. if(this->dev_buffer.Dim()<buff_size) this->dev_buffer.ReInit(buff_size);
  2905. if(this->cpu_buffer.Dim()<buff_size) this->cpu_buffer.ReInit(buff_size);
  2906. }
  2907. Profile::Toc();
  2908. if(device){ // Host2Device
  2909. Profile::Tic("Host2Device",&this->comm,false,25);
  2910. setup_data.interac_data .AllocDevice(true);
  2911. Profile::Toc();
  2912. }
  2913. }
  2914. template <class FMMNode>
  2915. template <int SYNC>
  2916. void FMM_Pts<FMMNode>::EvalListPts(SetupData<Real_t>& setup_data, bool device){
  2917. if(setup_data.interac_data.Dim(0)==0 || setup_data.interac_data.Dim(1)==0){
  2918. Profile::Tic("Host2Device",&this->comm,false,25);
  2919. Profile::Toc();
  2920. Profile::Tic("DeviceComp",&this->comm,false,20);
  2921. Profile::Toc();
  2922. return;
  2923. }
  2924. bool have_gpu=false;
  2925. #if defined(PVFMM_HAVE_CUDA)
  2926. have_gpu=true;
  2927. #endif
  2928. Profile::Tic("Host2Device",&this->comm,false,25);
  2929. typename Vector<char>::Device buff;
  2930. //typename Matrix<char>::Device precomp_data;
  2931. typename Matrix<char>::Device interac_data;
  2932. typename Matrix<Real_t>::Device coord_data;
  2933. typename Matrix<Real_t>::Device input_data;
  2934. typename Matrix<Real_t>::Device output_data;
  2935. if(device && !have_gpu){
  2936. buff = this-> dev_buffer. AllocDevice(false);
  2937. interac_data= setup_data.interac_data. AllocDevice(false);
  2938. //if(setup_data.precomp_data!=NULL) precomp_data= setup_data.precomp_data->AllocDevice(false);
  2939. if(setup_data. coord_data!=NULL) coord_data = setup_data. coord_data->AllocDevice(false);
  2940. if(setup_data. input_data!=NULL) input_data = setup_data. input_data->AllocDevice(false);
  2941. if(setup_data. output_data!=NULL) output_data = setup_data. output_data->AllocDevice(false);
  2942. }else{
  2943. buff = this-> cpu_buffer;
  2944. interac_data= setup_data.interac_data;
  2945. //if(setup_data.precomp_data!=NULL) precomp_data=*setup_data.precomp_data;
  2946. if(setup_data. coord_data!=NULL) coord_data =*setup_data. coord_data;
  2947. if(setup_data. input_data!=NULL) input_data =*setup_data. input_data;
  2948. if(setup_data. output_data!=NULL) output_data =*setup_data. output_data;
  2949. }
  2950. Profile::Toc();
  2951. size_t ptr_single_layer_kernel=(size_t)setup_data.kernel->ker_poten;
  2952. size_t ptr_double_layer_kernel=(size_t)setup_data.kernel->dbl_layer_poten;
  2953. Profile::Tic("DeviceComp",&this->comm,false,20);
  2954. int lock_idx=-1;
  2955. int wait_lock_idx=-1;
  2956. if(device) wait_lock_idx=MIC_Lock::curr_lock();
  2957. if(device) lock_idx=MIC_Lock::get_lock();
  2958. #ifdef __INTEL_OFFLOAD
  2959. if(device) ptr_single_layer_kernel=setup_data.kernel->dev_ker_poten;
  2960. if(device) ptr_double_layer_kernel=setup_data.kernel->dev_dbl_layer_poten;
  2961. #pragma offload if(device) target(mic:0) signal(&MIC_Lock::lock_vec[device?lock_idx:0])
  2962. #endif
  2963. { // Offloaded computation.
  2964. // Set interac_data.
  2965. size_t data_size;
  2966. size_t ker_dim0;
  2967. size_t ker_dim1;
  2968. size_t dof, n_out;
  2969. Vector<size_t> trg_interac_cnt;
  2970. Vector<size_t> trg_coord;
  2971. Vector<size_t> trg_value;
  2972. Vector<size_t> trg_cnt;
  2973. Vector<Real_t> scaling;
  2974. Matrix<Real_t> M;
  2975. Vector< Vector<size_t> > src_cnt;
  2976. Vector< Vector<size_t> > src_coord;
  2977. Vector< Vector<size_t> > src_value;
  2978. Vector< Vector<Real_t> > shift_coord;
  2979. { // Set interac_data.
  2980. char* data_ptr=&interac_data[0][0];
  2981. data_size=((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t);
  2982. ker_dim0=((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t);
  2983. ker_dim1=((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t);
  2984. dof =((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t);
  2985. trg_interac_cnt.ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false);
  2986. data_ptr+=sizeof(size_t)+trg_interac_cnt.Dim()*sizeof(size_t);
  2987. n_out=trg_interac_cnt.Dim();
  2988. trg_coord.ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false);
  2989. data_ptr+=sizeof(size_t)+trg_coord.Dim()*sizeof(size_t);
  2990. trg_value.ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false);
  2991. data_ptr+=sizeof(size_t)+trg_value.Dim()*sizeof(size_t);
  2992. trg_cnt.ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false);
  2993. data_ptr+=sizeof(size_t)+trg_cnt.Dim()*sizeof(size_t);
  2994. scaling.ReInit(((size_t*)data_ptr)[0],(Real_t*)(data_ptr+sizeof(size_t)),false);
  2995. data_ptr+=sizeof(size_t)+scaling.Dim()*sizeof(Real_t);
  2996. M.ReInit(((size_t*)data_ptr)[0],((size_t*)data_ptr)[1],(Real_t*)(data_ptr+2*sizeof(size_t)),false);
  2997. data_ptr+=sizeof(size_t)*2+M.Dim(0)*M.Dim(1)*sizeof(Real_t);
  2998. src_cnt.Resize(n_out);
  2999. src_coord.Resize(n_out);
  3000. src_value.Resize(n_out);
  3001. shift_coord.Resize(n_out);
  3002. for(size_t i=0;i<n_out;i++){
  3003. src_cnt[i].ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false);
  3004. data_ptr+=sizeof(size_t)+src_cnt[i].Dim()*sizeof(size_t);
  3005. src_coord[i].ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false);
  3006. data_ptr+=sizeof(size_t)+src_coord[i].Dim()*sizeof(size_t);
  3007. src_value[i].ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false);
  3008. data_ptr+=sizeof(size_t)+src_value[i].Dim()*sizeof(size_t);
  3009. shift_coord[i].ReInit(((size_t*)data_ptr)[0],(Real_t*)(data_ptr+sizeof(size_t)),false);
  3010. data_ptr+=sizeof(size_t)+shift_coord[i].Dim()*sizeof(Real_t);
  3011. }
  3012. }
  3013. if(device) MIC_Lock::wait_lock(wait_lock_idx);
  3014. //Compute interaction from point sources.
  3015. { // interactions
  3016. typename Kernel<Real_t>::Ker_t single_layer_kernel=(typename Kernel<Real_t>::Ker_t)ptr_single_layer_kernel;
  3017. typename Kernel<Real_t>::Ker_t double_layer_kernel=(typename Kernel<Real_t>::Ker_t)ptr_double_layer_kernel;
  3018. int omp_p=omp_get_max_threads();
  3019. Vector<Real_t*> thread_buff(omp_p);
  3020. size_t thread_buff_size=buff.dim/sizeof(Real_t)/omp_p;
  3021. for(int i=0;i<omp_p;i++) thread_buff[i]=(Real_t*)&buff[i*thread_buff_size*sizeof(Real_t)];
  3022. #pragma omp parallel for schedule(dynamic)
  3023. for(size_t i=0;i<n_out;i++)
  3024. if(trg_interac_cnt[i]>0 && trg_cnt[i]>0){
  3025. int thread_id=omp_get_thread_num();
  3026. Real_t* s_coord=thread_buff[thread_id];
  3027. Real_t* t_value=output_data[0]+trg_value[i];
  3028. if(M.Dim(0)>0 && M.Dim(1)>0){
  3029. s_coord+=dof*M.Dim(0);
  3030. t_value=thread_buff[thread_id];
  3031. for(size_t j=0;j<dof*M.Dim(0);j++) t_value[j]=0;
  3032. }
  3033. size_t interac_cnt=0;
  3034. for(size_t j=0;j<trg_interac_cnt[i];j++){
  3035. if(ptr_single_layer_kernel!=(size_t)NULL){// Single layer kernel
  3036. Real_t* src_coord_=coord_data[0]+src_coord[i][2*j+0];
  3037. assert(thread_buff_size>=dof*M.Dim(0)+dof*M.Dim(1)+src_cnt[i][2*j+0]*COORD_DIM);
  3038. for(size_t k1=0;k1<src_cnt[i][2*j+0];k1++){ // Compute shifted source coordinates.
  3039. for(size_t k0=0;k0<COORD_DIM;k0++){
  3040. s_coord[k1*COORD_DIM+k0]=src_coord_[k1*COORD_DIM+k0]+shift_coord[i][j*COORD_DIM+k0];
  3041. }
  3042. }
  3043. single_layer_kernel( s_coord , src_cnt[i][2*j+0], input_data[0]+src_value[i][2*j+0], dof,
  3044. coord_data[0]+trg_coord[i], trg_cnt[i] , t_value, NULL);
  3045. interac_cnt+=src_cnt[i][2*j+0]*trg_cnt[i];
  3046. }else if(src_cnt[i][2*j+0]!=0 && trg_cnt[i]!=0){
  3047. assert(ptr_single_layer_kernel); // Single-layer kernel not implemented
  3048. }
  3049. if(ptr_double_layer_kernel!=(size_t)NULL){// Double layer kernel
  3050. Real_t* src_coord_=coord_data[0]+src_coord[i][2*j+1];
  3051. assert(thread_buff_size>=dof*M.Dim(0)+dof*M.Dim(1)+src_cnt[i][2*j+1]*COORD_DIM);
  3052. for(size_t k1=0;k1<src_cnt[i][2*j+1];k1++){ // Compute shifted source coordinates.
  3053. for(size_t k0=0;k0<COORD_DIM;k0++){
  3054. s_coord[k1*COORD_DIM+k0]=src_coord_[k1*COORD_DIM+k0]+shift_coord[i][j*COORD_DIM+k0];
  3055. }
  3056. }
  3057. double_layer_kernel( s_coord , src_cnt[i][2*j+1], input_data[0]+src_value[i][2*j+1], dof,
  3058. coord_data[0]+trg_coord[i], trg_cnt[i] , t_value, NULL);
  3059. interac_cnt+=src_cnt[i][2*j+1]*trg_cnt[i];
  3060. }else if(src_cnt[i][2*j+1]!=0 && trg_cnt[i]!=0){
  3061. assert(ptr_double_layer_kernel); // Double-layer kernel not implemented
  3062. }
  3063. }
  3064. if(M.Dim(0)>0 && M.Dim(1)>0 && interac_cnt>0){
  3065. assert(trg_cnt[i]*ker_dim1==M.Dim(0));
  3066. assert(trg_cnt[i]*ker_dim0==M.Dim(1));
  3067. {// Scaling (ker_dim1)
  3068. Real_t* s=&scaling[i*(ker_dim0+ker_dim1)];
  3069. for(size_t j=0;j<dof*M.Dim(0);j+=ker_dim1){
  3070. for(size_t k=0;k<ker_dim1;k++){
  3071. t_value[j+k]*=s[k];
  3072. }
  3073. }
  3074. }
  3075. Matrix<Real_t> in_vec(dof, M.Dim(0), t_value, false);
  3076. Matrix<Real_t> tmp_vec(dof, M.Dim(1),dof*M.Dim(0)+t_value, false);
  3077. Matrix<Real_t>::GEMM(tmp_vec, in_vec, M, 0.0);
  3078. Matrix<Real_t> out_vec(dof, M.Dim(1), output_data[0]+trg_value[i], false);
  3079. {// Scaling (ker_dim0)
  3080. Real_t* s=&scaling[i*(ker_dim0+ker_dim1)+ker_dim1];
  3081. for(size_t j=0;j<dof*M.Dim(1);j+=ker_dim0){
  3082. for(size_t k=0;k<ker_dim0;k++){
  3083. out_vec[0][j+k]+=tmp_vec[0][j+k]*s[k];
  3084. }
  3085. }
  3086. }
  3087. }
  3088. }
  3089. }
  3090. if(device) MIC_Lock::release_lock(lock_idx);
  3091. }
  3092. #ifdef __INTEL_OFFLOAD
  3093. if(SYNC){
  3094. #pragma offload if(device) target(mic:0)
  3095. {if(device) MIC_Lock::wait_lock(lock_idx);}
  3096. }
  3097. #endif
  3098. Profile::Toc();
  3099. }
  3100. template <class FMMNode>
  3101. void FMM_Pts<FMMNode>::X_ListSetup(SetupData<Real_t>& setup_data, std::vector<Matrix<Real_t> >& buff, std::vector<Vector<FMMNode_t*> >& n_list, int level, bool device){
  3102. if(this->MultipoleOrder()==0) return;
  3103. { // Set setup_data
  3104. setup_data.level=level;
  3105. setup_data.kernel=kernel->k_s2l;
  3106. setup_data.interac_type.resize(1);
  3107. setup_data.interac_type[0]=X_Type;
  3108. setup_data. input_data=&buff[4];
  3109. setup_data.output_data=&buff[1];
  3110. setup_data. coord_data=&buff[6];
  3111. Vector<FMMNode_t*>& nodes_in =n_list[4];
  3112. Vector<FMMNode_t*>& nodes_out=n_list[1];
  3113. setup_data.nodes_in .clear();
  3114. setup_data.nodes_out.clear();
  3115. for(size_t i=0;i<nodes_in .Dim();i++) if(nodes_in [i]->Depth()==level-1 || level==-1) setup_data.nodes_in .push_back(nodes_in [i]);
  3116. for(size_t i=0;i<nodes_out.Dim();i++) if(nodes_out[i]->Depth()==level || level==-1) setup_data.nodes_out.push_back(nodes_out[i]);
  3117. }
  3118. std::vector<void*>& nodes_in =setup_data.nodes_in ;
  3119. std::vector<void*>& nodes_out=setup_data.nodes_out;
  3120. std::vector<Vector<Real_t>*>& input_vector=setup_data. input_vector; input_vector.clear();
  3121. std::vector<Vector<Real_t>*>& output_vector=setup_data.output_vector; output_vector.clear();
  3122. for(size_t i=0;i<nodes_in .size();i++){
  3123. input_vector .push_back(&((FMMNode*)nodes_in [i])->src_coord);
  3124. input_vector .push_back(&((FMMNode*)nodes_in [i])->src_value);
  3125. input_vector .push_back(&((FMMNode*)nodes_in [i])->surf_coord);
  3126. input_vector .push_back(&((FMMNode*)nodes_in [i])->surf_value);
  3127. }
  3128. for(size_t i=0;i<nodes_out.size();i++){
  3129. output_vector.push_back(&dnwd_check_surf[((FMMNode*)nodes_out[i])->Depth()]);
  3130. output_vector.push_back(&((FMMData*)((FMMNode*)nodes_out[i])->FMMData())->dnward_equiv);
  3131. }
  3132. //Downward check to downward equivalent matrix.
  3133. Matrix<Real_t>& M_dc2de = this->mat->Mat(level, DC2DE_Type, 0);
  3134. this->SetupInteracPts(setup_data, false, true, &M_dc2de,device);
  3135. { // Resize device buffer
  3136. size_t n=setup_data.output_data->Dim(0)*setup_data.output_data->Dim(1)*sizeof(Real_t);
  3137. if(this->dev_buffer.Dim()<n) this->dev_buffer.ReInit(n);
  3138. }
  3139. }
  3140. template <class FMMNode>
  3141. void FMM_Pts<FMMNode>::X_List (SetupData<Real_t>& setup_data, bool device){
  3142. //Add X_List contribution.
  3143. this->EvalListPts(setup_data, device);
  3144. }
  3145. template <class FMMNode>
  3146. void FMM_Pts<FMMNode>::W_ListSetup(SetupData<Real_t>& setup_data, std::vector<Matrix<Real_t> >& buff, std::vector<Vector<FMMNode_t*> >& n_list, int level, bool device){
  3147. if(this->MultipoleOrder()==0) return;
  3148. { // Set setup_data
  3149. setup_data.level=level;
  3150. setup_data.kernel=kernel->k_m2t;
  3151. setup_data.interac_type.resize(1);
  3152. setup_data.interac_type[0]=W_Type;
  3153. setup_data. input_data=&buff[0];
  3154. setup_data.output_data=&buff[5];
  3155. setup_data. coord_data=&buff[6];
  3156. Vector<FMMNode_t*>& nodes_in =n_list[0];
  3157. Vector<FMMNode_t*>& nodes_out=n_list[5];
  3158. setup_data.nodes_in .clear();
  3159. setup_data.nodes_out.clear();
  3160. for(size_t i=0;i<nodes_in .Dim();i++) if(nodes_in [i]->Depth()==level+1 || level==-1) setup_data.nodes_in .push_back(nodes_in [i]);
  3161. for(size_t i=0;i<nodes_out.Dim();i++) if(nodes_out[i]->Depth()==level || level==-1) setup_data.nodes_out.push_back(nodes_out[i]);
  3162. }
  3163. std::vector<void*>& nodes_in =setup_data.nodes_in ;
  3164. std::vector<void*>& nodes_out=setup_data.nodes_out;
  3165. std::vector<Vector<Real_t>*>& input_vector=setup_data. input_vector; input_vector.clear();
  3166. std::vector<Vector<Real_t>*>& output_vector=setup_data.output_vector; output_vector.clear();
  3167. for(size_t i=0;i<nodes_in .size();i++){
  3168. input_vector .push_back(&upwd_equiv_surf[((FMMNode*)nodes_in [i])->Depth()]);
  3169. input_vector .push_back(&((FMMData*)((FMMNode*)nodes_in [i])->FMMData())->upward_equiv);
  3170. input_vector .push_back(NULL);
  3171. input_vector .push_back(NULL);
  3172. }
  3173. for(size_t i=0;i<nodes_out.size();i++){
  3174. output_vector.push_back(&((FMMNode*)nodes_out[i])->trg_coord);
  3175. output_vector.push_back(&((FMMNode*)nodes_out[i])->trg_value);
  3176. }
  3177. this->SetupInteracPts(setup_data, true, false, NULL, device);
  3178. { // Resize device buffer
  3179. size_t n=setup_data.output_data->Dim(0)*setup_data.output_data->Dim(1)*sizeof(Real_t);
  3180. if(this->dev_buffer.Dim()<n) this->dev_buffer.ReInit(n);
  3181. }
  3182. }
  3183. template <class FMMNode>
  3184. void FMM_Pts<FMMNode>::W_List (SetupData<Real_t>& setup_data, bool device){
  3185. //Add W_List contribution.
  3186. this->EvalListPts(setup_data, device);
  3187. }
  3188. template <class FMMNode>
  3189. void FMM_Pts<FMMNode>::U_ListSetup(SetupData<Real_t>& setup_data, std::vector<Matrix<Real_t> >& buff, std::vector<Vector<FMMNode_t*> >& n_list, int level, bool device){
  3190. { // Set setup_data
  3191. setup_data.level=level;
  3192. setup_data.kernel=kernel->k_s2t;
  3193. setup_data.interac_type.resize(3);
  3194. setup_data.interac_type[0]=U0_Type;
  3195. setup_data.interac_type[1]=U1_Type;
  3196. setup_data.interac_type[2]=U2_Type;
  3197. setup_data. input_data=&buff[4];
  3198. setup_data.output_data=&buff[5];
  3199. setup_data. coord_data=&buff[6];
  3200. Vector<FMMNode_t*>& nodes_in =n_list[4];
  3201. Vector<FMMNode_t*>& nodes_out=n_list[5];
  3202. setup_data.nodes_in .clear();
  3203. setup_data.nodes_out.clear();
  3204. for(size_t i=0;i<nodes_in .Dim();i++) if((level-1<=nodes_in [i]->Depth() && nodes_in [i]->Depth()<=level+1) || level==-1) setup_data.nodes_in .push_back(nodes_in [i]);
  3205. for(size_t i=0;i<nodes_out.Dim();i++) if(( nodes_out[i]->Depth()==level ) || level==-1) setup_data.nodes_out.push_back(nodes_out[i]);
  3206. }
  3207. std::vector<void*>& nodes_in =setup_data.nodes_in ;
  3208. std::vector<void*>& nodes_out=setup_data.nodes_out;
  3209. std::vector<Vector<Real_t>*>& input_vector=setup_data. input_vector; input_vector.clear();
  3210. std::vector<Vector<Real_t>*>& output_vector=setup_data.output_vector; output_vector.clear();
  3211. for(size_t i=0;i<nodes_in .size();i++){
  3212. input_vector .push_back(&((FMMNode*)nodes_in [i])->src_coord);
  3213. input_vector .push_back(&((FMMNode*)nodes_in [i])->src_value);
  3214. input_vector .push_back(&((FMMNode*)nodes_in [i])->surf_coord);
  3215. input_vector .push_back(&((FMMNode*)nodes_in [i])->surf_value);
  3216. }
  3217. for(size_t i=0;i<nodes_out.size();i++){
  3218. output_vector.push_back(&((FMMNode*)nodes_out[i])->trg_coord);
  3219. output_vector.push_back(&((FMMNode*)nodes_out[i])->trg_value);
  3220. }
  3221. this->SetupInteracPts(setup_data, false, false, NULL, device);
  3222. { // Resize device buffer
  3223. size_t n=setup_data.output_data->Dim(0)*setup_data.output_data->Dim(1)*sizeof(Real_t);
  3224. if(this->dev_buffer.Dim()<n) this->dev_buffer.ReInit(n);
  3225. }
  3226. }
  3227. template <class FMMNode>
  3228. void FMM_Pts<FMMNode>::U_List (SetupData<Real_t>& setup_data, bool device){
  3229. //Add U_List contribution.
  3230. this->EvalListPts(setup_data, device);
  3231. }
  3232. template <class FMMNode>
  3233. void FMM_Pts<FMMNode>::Down2TargetSetup(SetupData<Real_t>& setup_data, std::vector<Matrix<Real_t> >& buff, std::vector<Vector<FMMNode_t*> >& n_list, int level, bool device){
  3234. if(this->MultipoleOrder()==0) return;
  3235. { // Set setup_data
  3236. setup_data.level=level;
  3237. setup_data.kernel=kernel->k_l2t;
  3238. setup_data.interac_type.resize(1);
  3239. setup_data.interac_type[0]=D2T_Type;
  3240. setup_data. input_data=&buff[1];
  3241. setup_data.output_data=&buff[5];
  3242. setup_data. coord_data=&buff[6];
  3243. Vector<FMMNode_t*>& nodes_in =n_list[1];
  3244. Vector<FMMNode_t*>& nodes_out=n_list[5];
  3245. setup_data.nodes_in .clear();
  3246. setup_data.nodes_out.clear();
  3247. for(size_t i=0;i<nodes_in .Dim();i++) if(nodes_in [i]->Depth()==level || level==-1) setup_data.nodes_in .push_back(nodes_in [i]);
  3248. for(size_t i=0;i<nodes_out.Dim();i++) if(nodes_out[i]->Depth()==level || level==-1) setup_data.nodes_out.push_back(nodes_out[i]);
  3249. }
  3250. std::vector<void*>& nodes_in =setup_data.nodes_in ;
  3251. std::vector<void*>& nodes_out=setup_data.nodes_out;
  3252. std::vector<Vector<Real_t>*>& input_vector=setup_data. input_vector; input_vector.clear();
  3253. std::vector<Vector<Real_t>*>& output_vector=setup_data.output_vector; output_vector.clear();
  3254. for(size_t i=0;i<nodes_in .size();i++){
  3255. input_vector .push_back(&dnwd_equiv_surf[((FMMNode*)nodes_in [i])->Depth()]);
  3256. input_vector .push_back(&((FMMData*)((FMMNode*)nodes_in [i])->FMMData())->dnward_equiv);
  3257. input_vector .push_back(NULL);
  3258. input_vector .push_back(NULL);
  3259. }
  3260. for(size_t i=0;i<nodes_out.size();i++){
  3261. output_vector.push_back(&((FMMNode*)nodes_out[i])->trg_coord);
  3262. output_vector.push_back(&((FMMNode*)nodes_out[i])->trg_value);
  3263. }
  3264. this->SetupInteracPts(setup_data, true, false, NULL, device);
  3265. { // Resize device buffer
  3266. size_t n=setup_data.output_data->Dim(0)*setup_data.output_data->Dim(1)*sizeof(Real_t);
  3267. if(this->dev_buffer.Dim()<n) this->dev_buffer.ReInit(n);
  3268. }
  3269. }
  3270. template <class FMMNode>
  3271. void FMM_Pts<FMMNode>::Down2Target(SetupData<Real_t>& setup_data, bool device){
  3272. //Add Down2Target contribution.
  3273. this->EvalListPts(setup_data, device);
  3274. }
  3275. template <class FMMNode>
  3276. void FMM_Pts<FMMNode>::PostProcessing(std::vector<FMMNode_t*>& nodes){
  3277. }
  3278. template <class FMMNode>
  3279. void FMM_Pts<FMMNode>::CopyOutput(FMMNode** nodes, size_t n){
  3280. }
  3281. }//end namespace