fmm_pts.txx 198 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327
73278327932803281328232833284328532863287328832893290329132923293329432953296329732983299330033013302330333043305330633073308330933103311331233133314331533163317331833193320332133223323332433253326332733283329333033313332333333343335333633373338333933403341334233433344334533463347334833493350335133523353335433553356335733583359336033613362336333643365336633673368336933703371337233733374337533763377337833793380338133823383338433853386338733883389339033913392339333943395339633973398339934003401340234033404340534063407340834093410341134123413341434153416341734183419342034213422342334243425342634273428342934303431343234333434343534363437343834393440344134423443344434453446344734483449345034513452345334543455345634573458345934603461346234633464346534663467346834693470347134723473347434753476347734783479348034813482348334843485348634873488348934903491349234933494349534963497349834993500350135023503350435053506350735083509351035113512351335143515351635173518351935203521352235233524352535263527352835293530353135323533353435353536353735383539354035413542354335443545354635473548354935503551355235533554355535563557355835593560356135623563356435653566356735683569357035713572357335743575357635773578357935803581358235833584358535863587358835893590359135923593359435953596359735983599360036013602360336043605360636073608360936103611361236133614361536163617361836193620362136223623362436253626362736283629363036313632363336343635363636373638363936403641364236433644364536463647364836493650365136523653365436553656365736583659366036613662366336643665366636673668366936703671367236733674367536763677367836793680368136823683368436853686368736883689369036913692369336943695369636973698369937003701370237033704370537063707370837093710371137123713371437153716371737183719372037213722372337243725372637273728372937303731373237333734373537363737373837393740374137423743374437453746374737483749375037513752375337543755375637573758375937603761376237633764376537663767376837693770377137723773377437753776377
73778377937803781378237833784378537863787378837893790379137923793379437953796379737983799380038013802380338043805380638073808380938103811381238133814381538163817381838193820382138223823382438253826382738283829383038313832383338343835383638373838383938403841384238433844384538463847384838493850385138523853385438553856385738583859386038613862386338643865386638673868386938703871387238733874387538763877387838793880388138823883388438853886388738883889389038913892389338943895389638973898389939003901390239033904390539063907390839093910391139123913391439153916391739183919392039213922392339243925392639273928392939303931393239333934393539363937393839393940394139423943394439453946394739483949395039513952395339543955395639573958395939603961396239633964396539663967396839693970397139723973397439753976397739783979398039813982398339843985398639873988398939903991399239933994399539963997399839994000400140024003400440054006400740084009401040114012401340144015401640174018401940204021402240234024402540264027402840294030403140324033403440354036403740384039404040414042404340444045404640474048404940504051405240534054405540564057405840594060406140624063406440654066406740684069407040714072407340744075407640774078407940804081408240834084408540864087408840894090409140924093409440954096409740984099410041014102410341044105410641074108410941104111411241134114411541164117411841194120412141224123412441254126412741284129413041314132413341344135413641374138413941404141414241434144414541464147414841494150415141524153415441554156415741584159416041614162416341644165416641674168416941704171417241734174417541764177417841794180418141824183418441854186418741884189419041914192419341944195419641974198419942004201420242034204420542064207420842094210421142124213421442154216421742184219422042214222422342244225422642274228422942304231423242334234423542364237423842394240424142424243424442454246424742484249425042514252425342544255425642574258425942604261426242634264426542664267426842694270427142724273427442754276427
74278427942804281428242834284428542864287428842894290429142924293429442954296429742984299430043014302430343044305430643074308430943104311431243134314431543164317431843194320432143224323432443254326432743284329433043314332433343344335433643374338433943404341434243434344434543464347434843494350435143524353435443554356435743584359436043614362436343644365436643674368436943704371437243734374437543764377437843794380438143824383438443854386438743884389439043914392439343944395439643974398439944004401440244034404440544064407440844094410441144124413441444154416441744184419442044214422442344244425442644274428442944304431443244334434443544364437443844394440444144424443444444454446444744484449445044514452445344544455445644574458445944604461446244634464446544664467446844694470447144724473447444754476447744784479448044814482448344844485448644874488448944904491449244934494449544964497449844994500450145024503450445054506450745084509451045114512451345144515451645174518451945204521452245234524452545264527452845294530453145324533453445354536453745384539454045414542454345444545454645474548454945504551455245534554455545564557455845594560456145624563456445654566456745684569457045714572457345744575457645774578457945804581458245834584458545864587458845894590459145924593459445954596459745984599460046014602460346044605460646074608460946104611461246134614461546164617461846194620462146224623462446254626462746284629463046314632463346344635463646374638463946404641464246434644464546464647464846494650465146524653465446554656465746584659466046614662466346644665466646674668466946704671467246734674467546764677467846794680468146824683468446854686468746884689469046914692469346944695469646974698469947004701470247034704470547064707470847094710471147124713471447154716471747184719472047214722472347244725472647274728472947304731473247334734473547364737473847394740474147424743474447454746474747484749475047514752475347544755475647574758475947604761476247634764476547664767476847694770477147724773477447754776477
747784779478047814782478347844785478647874788478947904791479247934794479547964797479847994800480148024803480448054806480748084809481048114812481348144815481648174818481948204821482248234824482548264827482848294830483148324833483448354836483748384839484048414842484348444845484648474848484948504851485248534854485548564857485848594860486148624863486448654866486748684869487048714872487348744875487648774878487948804881488248834884488548864887488848894890489148924893489448954896489748984899490049014902490349044905490649074908490949104911491249134914491549164917491849194920492149224923492449254926492749284929493049314932493349344935493649374938493949404941494249434944494549464947494849494950495149524953495449554956495749584959496049614962496349644965496649674968496949704971497249734974497549764977497849794980498149824983498449854986498749884989499049914992499349944995499649974998499950005001500250035004500550065007500850095010501150125013501450155016501750185019502050215022502350245025502650275028502950305031503250335034503550365037503850395040504150425043504450455046504750485049505050515052505350545055505650575058505950605061506250635064506550665067506850695070507150725073507450755076507750785079508050815082508350845085508650875088508950905091509250935094509550965097509850995100510151025103510451055106510751085109511051115112511351145115511651175118511951205121512251235124512551265127512851295130513151325133513451355136513751385139514051415142514351445145514651475148514951505151515251535154515551565157515851595160516151625163516451655166516751685169517051715172517351745175517651775178517951805181518251835184518551865187518851895190519151925193519451955196519751985199520052015202520352045205520652075208520952105211521252135214521552165217521852195220522152225223522452255226522752285229523052315232523352345235
  1. /**
  2. * \file fmm_pts.txx
  3. * \author Dhairya Malhotra, dhairya.malhotra@gmail.com
  4. * \date 3-07-2011
  5. * \brief This file contains the implementation of the FMM_Pts class.
  6. */
  7. #include <omp.h>
  8. #include <cmath>
  9. #include <cstdlib>
  10. #include <cassert>
  11. #include <sstream>
  12. #include <iostream>
  13. #include <stdint.h>
  14. #include <set>
  15. #ifdef PVFMM_HAVE_SYS_STAT_H
  16. #include <sys/stat.h>
  17. #endif
  18. #ifdef __SSE__
  19. #include <xmmintrin.h>
  20. #endif
  21. #ifdef __SSE2__
  22. #include <emmintrin.h>
  23. #endif
  24. #ifdef __SSE3__
  25. #include <pmmintrin.h>
  26. #endif
  27. #ifdef __AVX__
  28. #include <immintrin.h>
  29. #endif
  30. #if defined(__MIC__)
  31. #include <immintrin.h>
  32. #endif
  33. #include <profile.hpp>
  34. namespace pvfmm{
  35. /**
  36. * \brief Returns the coordinates of points on the surface of a cube.
  37. * \param[in] p Number of points on an edge of the cube is (n+1)
  38. * \param[in] c Coordinates to the centre of the cube (3D array).
  39. * \param[in] alpha Scaling factor for the size of the cube.
  40. * \param[in] depth Depth of the cube in the octree.
  41. * \return Vector with coordinates of points on the surface of the cube in the
  42. * format [x0 y0 z0 x1 y1 z1 .... ].
  43. */
  44. template <class Real_t>
  45. std::vector<Real_t> surface(int p, Real_t* c, Real_t alpha, int depth){
  46. size_t n_=(6*(p-1)*(p-1)+2); //Total number of points.
  47. std::vector<Real_t> coord(n_*3);
  48. coord[0]=coord[1]=coord[2]=-1.0;
  49. size_t cnt=1;
  50. for(int i=0;i<p-1;i++)
  51. for(int j=0;j<p-1;j++){
  52. coord[cnt*3 ]=-1.0;
  53. coord[cnt*3+1]=(2.0*(i+1)-p+1)/(p-1);
  54. coord[cnt*3+2]=(2.0*j-p+1)/(p-1);
  55. cnt++;
  56. }
  57. for(int i=0;i<p-1;i++)
  58. for(int j=0;j<p-1;j++){
  59. coord[cnt*3 ]=(2.0*i-p+1)/(p-1);
  60. coord[cnt*3+1]=-1.0;
  61. coord[cnt*3+2]=(2.0*(j+1)-p+1)/(p-1);
  62. cnt++;
  63. }
  64. for(int i=0;i<p-1;i++)
  65. for(int j=0;j<p-1;j++){
  66. coord[cnt*3 ]=(2.0*(i+1)-p+1)/(p-1);
  67. coord[cnt*3+1]=(2.0*j-p+1)/(p-1);
  68. coord[cnt*3+2]=-1.0;
  69. cnt++;
  70. }
  71. for(size_t i=0;i<(n_/2)*3;i++)
  72. coord[cnt*3+i]=-coord[i];
  73. Real_t r = 0.5*pow(0.5,depth);
  74. Real_t b = alpha*r;
  75. for(size_t i=0;i<n_;i++){
  76. coord[i*3+0]=(coord[i*3+0]+1.0)*b+c[0];
  77. coord[i*3+1]=(coord[i*3+1]+1.0)*b+c[1];
  78. coord[i*3+2]=(coord[i*3+2]+1.0)*b+c[2];
  79. }
  80. return coord;
  81. }
  82. /**
  83. * \brief Returns the coordinates of points on the upward check surface of cube.
  84. * \see surface()
  85. */
  86. template <class Real_t>
  87. std::vector<Real_t> u_check_surf(int p, Real_t* c, int depth){
  88. Real_t r=0.5*pow(0.5,depth);
  89. Real_t coord[3]={(Real_t)(c[0]-r*(RAD1-1.0)),(Real_t)(c[1]-r*(RAD1-1.0)),(Real_t)(c[2]-r*(RAD1-1.0))};
  90. return surface(p,coord,(Real_t)RAD1,depth);
  91. }
  92. /**
  93. * \brief Returns the coordinates of points on the upward equivalent surface of cube.
  94. * \see surface()
  95. */
  96. template <class Real_t>
  97. std::vector<Real_t> u_equiv_surf(int p, Real_t* c, int depth){
  98. Real_t r=0.5*pow(0.5,depth);
  99. Real_t coord[3]={(Real_t)(c[0]-r*(RAD0-1.0)),(Real_t)(c[1]-r*(RAD0-1.0)),(Real_t)(c[2]-r*(RAD0-1.0))};
  100. return surface(p,coord,(Real_t)RAD0,depth);
  101. }
  102. /**
  103. * \brief Returns the coordinates of points on the downward check surface of cube.
  104. * \see surface()
  105. */
  106. template <class Real_t>
  107. std::vector<Real_t> d_check_surf(int p, Real_t* c, int depth){
  108. Real_t r=0.5*pow(0.5,depth);
  109. Real_t coord[3]={(Real_t)(c[0]-r*(RAD0-1.0)),(Real_t)(c[1]-r*(RAD0-1.0)),(Real_t)(c[2]-r*(RAD0-1.0))};
  110. return surface(p,coord,(Real_t)RAD0,depth);
  111. }
  112. /**
  113. * \brief Returns the coordinates of points on the downward equivalent surface of cube.
  114. * \see surface()
  115. */
  116. template <class Real_t>
  117. std::vector<Real_t> d_equiv_surf(int p, Real_t* c, int depth){
  118. Real_t r=0.5*pow(0.5,depth);
  119. Real_t coord[3]={(Real_t)(c[0]-r*(RAD1-1.0)),(Real_t)(c[1]-r*(RAD1-1.0)),(Real_t)(c[2]-r*(RAD1-1.0))};
  120. return surface(p,coord,(Real_t)RAD1,depth);
  121. }
  122. /**
  123. * \brief Defines the 3D grid for convolution in FFT acceleration of V-list.
  124. * \see surface()
  125. */
  126. template <class Real_t>
  127. std::vector<Real_t> conv_grid(int p, Real_t* c, int depth){
  128. Real_t r=pow(0.5,depth);
  129. Real_t a=r*RAD0;
  130. Real_t coord[3]={c[0],c[1],c[2]};
  131. int n1=p*2;
  132. int n2=(int)pow((Real_t)n1,2);
  133. int n3=(int)pow((Real_t)n1,3);
  134. std::vector<Real_t> grid(n3*3);
  135. for(int i=0;i<n1;i++)
  136. for(int j=0;j<n1;j++)
  137. for(int k=0;k<n1;k++){
  138. grid[(i+n1*j+n2*k)*3+0]=(i-p)*a/(p-1)+coord[0];
  139. grid[(i+n1*j+n2*k)*3+1]=(j-p)*a/(p-1)+coord[1];
  140. grid[(i+n1*j+n2*k)*3+2]=(k-p)*a/(p-1)+coord[2];
  141. }
  142. return grid;
  143. }
/**
 * \brief Discard the stored multipole expansion by resizing the
 * upward-equivalent coefficient vector to zero.
 */
template <class Real_t>
void FMM_Data<Real_t>::Clear(){
upward_equiv.Resize(0);
}
  148. template <class Real_t>
  149. PackedData FMM_Data<Real_t>::PackMultipole(void* buff_ptr){
  150. PackedData p0; p0.data=buff_ptr;
  151. p0.length=upward_equiv.Dim()*sizeof(Real_t);
  152. if(p0.length==0) return p0;
  153. if(p0.data==NULL) p0.data=(char*)&upward_equiv[0];
  154. else mem::memcopy(p0.data,&upward_equiv[0],p0.length);
  155. return p0;
  156. }
  157. template <class Real_t>
  158. void FMM_Data<Real_t>::AddMultipole(PackedData p0){
  159. Real_t* data=(Real_t*)p0.data;
  160. size_t n=p0.length/sizeof(Real_t);
  161. assert(upward_equiv.Dim()==n);
  162. Matrix<Real_t> v0(1,n,&upward_equiv[0],false);
  163. Matrix<Real_t> v1(1,n,data,false);
  164. v0+=v1;
  165. }
  166. template <class Real_t>
  167. void FMM_Data<Real_t>::InitMultipole(PackedData p0, bool own_data){
  168. Real_t* data=(Real_t*)p0.data;
  169. size_t n=p0.length/sizeof(Real_t);
  170. if(n==0) return;
  171. if(own_data){
  172. upward_equiv=Vector<Real_t>(n, &data[0], false);
  173. }else{
  174. upward_equiv.ReInit(n, &data[0], false);
  175. }
  176. }
/**
 * \brief Destructor: releases the precomputed-matrix store and destroys any
 * FFTW plans that were created (guarded by their corresponding flags).
 */
template <class FMMNode>
FMM_Pts<FMMNode>::~FMM_Pts() {
// Free the precomputed-matrix store, if one was allocated by Initialize().
if(mat!=NULL){
// int rank;
// MPI_Comm_rank(comm,&rank);
// if(rank==0) mat->Save2File("Precomp.data");
delete mat;
mat=NULL;
}
// Destroy the V-list precomputation FFT plan only if it was created.
if(vprecomp_fft_flag) FFTW_t<Real_t>::fft_destroy_plan(vprecomp_fftplan);
// NOTE(review): macro name ends in '0' — the offload block appears
// deliberately disabled; confirm before re-enabling.
#ifdef __INTEL_OFFLOAD0
#pragma offload target(mic:0)
#endif
{
// Destroy the forward/inverse V-list evaluation plans and reset the flags
// so a later setup can re-create them.
if(vlist_fft_flag ) FFTW_t<Real_t>::fft_destroy_plan(vlist_fftplan );
if(vlist_ifft_flag) FFTW_t<Real_t>::fft_destroy_plan(vlist_ifftplan);
vlist_fft_flag =false;
vlist_ifft_flag=false;
}
}
  197. template <class FMMNode>
  198. void FMM_Pts<FMMNode>::Initialize(int mult_order, const MPI_Comm& comm_, const Kernel<Real_t>* kernel_){
  199. Profile::Tic("InitFMM_Pts",&comm_,true);{
  200. bool verbose=false;
  201. #ifndef NDEBUG
  202. #ifdef __VERBOSE__
  203. int rank;
  204. MPI_Comm_rank(comm_,&rank);
  205. if(!rank) verbose=true;
  206. #endif
  207. #endif
  208. if(kernel_) kernel_->Initialize(verbose);
  209. multipole_order=mult_order;
  210. comm=comm_;
  211. kernel=kernel_;
  212. assert(kernel!=NULL);
  213. mat=new PrecompMat<Real_t>(ScaleInvar(), MAX_DEPTH+1);
  214. if(this->mat_fname.size()==0){
  215. std::stringstream st;
  216. st<<PVFMM_PRECOMP_DATA_PATH;
  217. if(!st.str().size()){ // look in PVFMM_DIR
  218. char* pvfmm_dir = getenv ("PVFMM_DIR");
  219. if(pvfmm_dir) st<<pvfmm_dir<<'/';
  220. }
  221. #ifndef STAT_MACROS_BROKEN
  222. if(st.str().size()){ // check if the path is a directory
  223. struct stat stat_buff;
  224. if(stat(st.str().c_str(), &stat_buff) || !S_ISDIR(stat_buff.st_mode)){
  225. std::cout<<"error: path not found: "<<st.str()<<'\n';
  226. exit(0);
  227. }
  228. }
  229. #endif
  230. st<<"Precomp_"<<kernel->ker_name.c_str()<<"_m"<<mult_order;
  231. if(sizeof(Real_t)==8) st<<"";
  232. else if(sizeof(Real_t)==4) st<<"_f";
  233. else st<<"_t"<<sizeof(Real_t);
  234. st<<".data";
  235. this->mat_fname=st.str();
  236. }
  237. this->mat->LoadFile(mat_fname.c_str(), this->comm);
  238. interac_list.Initialize(COORD_DIM, this->mat);
  239. Profile::Tic("PrecompUC2UE",&comm,false,4);
  240. this->PrecompAll(UC2UE0_Type);
  241. this->PrecompAll(UC2UE1_Type);
  242. Profile::Toc();
  243. Profile::Tic("PrecompDC2DE",&comm,false,4);
  244. this->PrecompAll(DC2DE0_Type);
  245. this->PrecompAll(DC2DE1_Type);
  246. Profile::Toc();
  247. Profile::Tic("PrecompBC",&comm,false,4);
  248. { /*
  249. int type=BC_Type;
  250. for(int l=0;l<MAX_DEPTH;l++)
  251. for(size_t indx=0;indx<this->interac_list.ListCount((Mat_Type)type);indx++){
  252. Matrix<Real_t>& M=this->mat->Mat(l, (Mat_Type)type, indx);
  253. M.Resize(0,0);
  254. } // */
  255. }
  256. this->PrecompAll(BC_Type,0);
  257. Profile::Toc();
  258. Profile::Tic("PrecompU2U",&comm,false,4);
  259. this->PrecompAll(U2U_Type);
  260. Profile::Toc();
  261. Profile::Tic("PrecompD2D",&comm,false,4);
  262. this->PrecompAll(D2D_Type);
  263. Profile::Toc();
  264. Profile::Tic("PrecompV",&comm,false,4);
  265. this->PrecompAll(V_Type);
  266. Profile::Toc();
  267. Profile::Tic("PrecompV1",&comm,false,4);
  268. this->PrecompAll(V1_Type);
  269. Profile::Toc();
  270. }Profile::Toc();
  271. }
/**
 * \brief Build the permutation that maps equivalent-surface coefficients under
 * a symmetry operation (axis reflection, coordinate swap, or scaling).
 * \param[in] m Multipole order used to discretize the reference surface.
 * \param[in] p_indx Symmetry operation index (ReflecX/Y/Z, SwapXY, SwapXZ, or
 * Scaling; any other value yields the identity point map).
 * \param[in] ker_perm Kernel's permutation of its dof components (with scaling
 * factors), applied within each surface point.
 * \param[in] scal_exp Optional per-component scaling exponents; used only when
 * p_indx==Scaling, applied as factors of 2^exp.
 * \return Permutation of dimension n_trg*dof.
 */
template <class Real_t>
Permutation<Real_t> equiv_surf_perm(size_t m, size_t p_indx, const Permutation<Real_t>& ker_perm, const Vector<Real_t>* scal_exp=NULL){
Real_t eps=1e-10; // Tolerance for matching transformed surface points.
int dof=ker_perm.Dim();
// Reference surface: root-level downward-check surface anchored at -0.5 in
// each axis, so the resulting cube is symmetric about the origin.
Real_t c[3]={-0.5,-0.5,-0.5};
std::vector<Real_t> trg_coord=d_check_surf(m,c,0);
int n_trg=trg_coord.size()/3;
Permutation<Real_t> P=Permutation<Real_t>(n_trg*dof);
if(p_indx==ReflecX || p_indx==ReflecY || p_indx==ReflecZ){ // Set P.perm
// Reflection: point j maps to the point i whose coordinate (negated along
// the reflected axis) coincides with j's, within eps.
for(int i=0;i<n_trg;i++)
for(int j=0;j<n_trg;j++){
if(fabs(trg_coord[i*3+0]-trg_coord[j*3+0]*(p_indx==ReflecX?-1.0:1.0))<eps)
if(fabs(trg_coord[i*3+1]-trg_coord[j*3+1]*(p_indx==ReflecY?-1.0:1.0))<eps)
if(fabs(trg_coord[i*3+2]-trg_coord[j*3+2]*(p_indx==ReflecZ?-1.0:1.0))<eps){
for(int k=0;k<dof;k++){
P.perm[j*dof+k]=i*dof+ker_perm.perm[k];
}
}
}
}else if(p_indx==SwapXY || p_indx==SwapXZ){
// Coordinate swap: match point i against point j with the corresponding
// pair of axes exchanged.
for(int i=0;i<n_trg;i++)
for(int j=0;j<n_trg;j++){
if(fabs(trg_coord[i*3+0]-trg_coord[j*3+(p_indx==SwapXY?1:2)])<eps)
if(fabs(trg_coord[i*3+1]-trg_coord[j*3+(p_indx==SwapXY?0:1)])<eps)
if(fabs(trg_coord[i*3+2]-trg_coord[j*3+(p_indx==SwapXY?2:0)])<eps){
for(int k=0;k<dof;k++){
P.perm[j*dof+k]=i*dof+ker_perm.perm[k];
}
}
}
}else{
// No geometric transform: identity point map, kernel dof permutation only.
for(int j=0;j<n_trg;j++){
for(int k=0;k<dof;k++){
P.perm[j*dof+k]=j*dof+ker_perm.perm[k];
}
}
}
if(scal_exp && p_indx==Scaling){ // Set level-by-level scaling
assert(dof==scal_exp->Dim());
Vector<Real_t> scal(scal_exp->Dim());
for(size_t i=0;i<scal.Dim();i++){
scal[i]=pow(2.0,(*scal_exp)[i]); // Factor 2^exp per dof component.
}
for(int j=0;j<n_trg;j++){
for(int i=0;i<dof;i++){
P.scal[j*dof+i]*=scal[i];
}
}
}
{ // Set P.scal
// Always fold in the kernel's own per-component scaling factors (applied
// after any level scaling above; the factors multiply, so order is benign).
for(int j=0;j<n_trg;j++){
for(int i=0;i<dof;i++){
P.scal[j*dof+i]*=ker_perm.scal[i];
}
}
}
return P;
}
/**
 * \brief Return (building and caching on first use) the permutation operator
 * for a given interaction type and permutation index.
 * \param[in] type Interaction matrix type; only U2U_Type and D2D_Type produce
 * a non-trivial permutation here.
 * \param[in] perm_indx Permutation index; values < C_Perm are source-side
 * permutations, values >= C_Perm are target-side.
 * \return Reference to the cached permutation stored in this->mat.
 */
template <class FMMNode>
Permutation<typename FMMNode::Real_t>& FMM_Pts<FMMNode>::PrecompPerm(Mat_Type type, Perm_Type perm_indx){
//Check if the matrix already exists.
Permutation<Real_t>& P_ = mat->Perm((Mat_Type)type, perm_indx);
if(P_.Dim()!=0) return P_;
size_t m=this->MultipoleOrder();
// Underlying symmetry operation (reflection/swap/scaling), independent of
// whether this is the source- or target-side permutation.
size_t p_indx=perm_indx % C_Perm;
//Compute the matrix.
Permutation<Real_t> P;
switch (type){
case U2U_Type:
{
Vector<Real_t> scal_exp;
Permutation<Real_t> ker_perm;
if(perm_indx<C_Perm){ // Source permutation
ker_perm=kernel->k_m2m->perm_vec[0 +p_indx];
scal_exp=kernel->k_m2m->src_scal;
}else{ // Target permutation
ker_perm=kernel->k_m2m->perm_vec[0 +p_indx];
scal_exp=kernel->k_m2m->src_scal;
// Target side applies the inverse scaling: negate the exponents.
for(size_t i=0;i<scal_exp.Dim();i++) scal_exp[i]=-scal_exp[i];
}
// Level scaling only matters for scale-invariant kernels.
P=equiv_surf_perm(m, p_indx, ker_perm, (this->ScaleInvar()?&scal_exp:NULL));
break;
}
case D2D_Type:
{
Vector<Real_t> scal_exp;
Permutation<Real_t> ker_perm;
if(perm_indx<C_Perm){ // Source permutation
ker_perm=kernel->k_l2l->perm_vec[C_Perm+p_indx];
scal_exp=kernel->k_l2l->trg_scal;
// Source side applies the inverse scaling: negate the exponents.
for(size_t i=0;i<scal_exp.Dim();i++) scal_exp[i]=-scal_exp[i];
}else{ // Target permutation
ker_perm=kernel->k_l2l->perm_vec[C_Perm+p_indx];
scal_exp=kernel->k_l2l->trg_scal;
}
P=equiv_surf_perm(m, p_indx, ker_perm, (this->ScaleInvar()?&scal_exp:NULL));
break;
}
default:
break;
}
//Save the matrix for future use.
#pragma omp critical (PRECOMP_MATRIX_PTS)
{
// Another thread may have filled the cache meanwhile; first writer wins.
if(P_.Dim()==0) P_=P;
}
return P_;
}
// Compute (or fetch from the cache) the precomputed operator matrix of the
// given interaction `type` and interaction index `mat_indx` at tree `level`.
// For scale-invariant kernels a single level-0 matrix serves all levels.
// Returns a reference to the cached matrix; the reference may be an empty
// (0x0) matrix when the matrix is not needed (e.g. MultipoleOrder()==0) or
// when it can be reconstructed from its symmetry-class representative.
template <class FMMNode>
Matrix<typename FMMNode::Real_t>& FMM_Pts<FMMNode>::Precomp(int level, Mat_Type type, size_t mat_indx){
  if(this->ScaleInvar()) level=0; // one matrix per type/index for scale-invariant kernels
  //Check if the matrix already exists.
  Matrix<Real_t>& M_ = this->mat->Mat(level, type, mat_indx);
  if(M_.Dim(0)!=0 && M_.Dim(1)!=0) return M_;
  else{ //Compute matrix from symmetry class (if possible).
    size_t class_indx = this->interac_list.InteracClass(type, mat_indx);
    if(class_indx!=mat_indx){
      // Ensure the class representative matrix and all permutations exist.
      Matrix<Real_t>& M0 = this->Precomp(level, type, class_indx);
      if(M0.Dim(0)==0 || M0.Dim(1)==0) return M_;
      for(size_t i=0;i<Perm_Count;i++) this->PrecompPerm(type, (Perm_Type) i);
      Permutation<Real_t>& Pr = this->interac_list.Perm_R(level, type, mat_indx);
      Permutation<Real_t>& Pc = this->interac_list.Perm_C(level, type, mat_indx);
      // NOTE(review): when Pr/Pc and the class matrix are all available, the
      // (still empty) cache entry is returned here — presumably callers apply
      // Pr*M0*Pc on the fly instead of materializing this matrix; confirm.
      if(Pr.Dim()>0 && Pc.Dim()>0 && M0.Dim(0)>0 && M0.Dim(1)>0) return M_;
    }
  }
  //Compute the matrix.
  Matrix<Real_t> M;
  //int omp_p=omp_get_max_threads();
  switch (type){
    case UC2UE0_Type: // Upward check-to-equivalent, first pseudo-inverse factor: V*S^{-1}
    {
      if(MultipoleOrder()==0) break;
      const int* ker_dim=kernel->k_m2m->ker_dim;
      // Coord of upward check surface
      Real_t c[3]={0,0,0};
      std::vector<Real_t> uc_coord=u_check_surf(MultipoleOrder(),c,level);
      size_t n_uc=uc_coord.size()/3;
      // Coord of upward equivalent surface
      std::vector<Real_t> ue_coord=u_equiv_surf(MultipoleOrder(),c,level);
      size_t n_ue=ue_coord.size()/3;
      // Evaluate potential at check surface due to equivalent surface.
      Matrix<Real_t> M_e2c(n_ue*ker_dim[0],n_uc*ker_dim[1]);
      kernel->k_m2m->BuildMatrix(&ue_coord[0], n_ue,
                                 &uc_coord[0], n_uc, &(M_e2c[0][0]));
      // Truncated-SVD pseudo-inverse of M_e2c, split into two factors:
      // this case stores V*S^{-1}; UC2UE1_Type stores U^T.
      Matrix<Real_t> U,S,V;
      M_e2c.SVD(U,S,V);
      Real_t eps=1, max_S=0;
      while(eps*(Real_t)0.5+(Real_t)1.0>1.0) eps*=0.5; // machine epsilon for Real_t
      for(size_t i=0;i<std::min(S.Dim(0),S.Dim(1));i++){
        if(fabs(S[i][i])>max_S) max_S=fabs(S[i][i]);
      }
      // Invert singular values above the 4*eps*max_S threshold; zero the rest.
      for(size_t i=0;i<S.Dim(0);i++) S[i][i]=(S[i][i]>eps*max_S*4?1.0/S[i][i]:0.0);
      M=V.Transpose()*S;//*U.Transpose();
      break;
    }
    case UC2UE1_Type: // Upward check-to-equivalent, second pseudo-inverse factor: U^T
    {
      if(MultipoleOrder()==0) break;
      const int* ker_dim=kernel->k_m2m->ker_dim;
      // Coord of upward check surface
      Real_t c[3]={0,0,0};
      std::vector<Real_t> uc_coord=u_check_surf(MultipoleOrder(),c,level);
      size_t n_uc=uc_coord.size()/3;
      // Coord of upward equivalent surface
      std::vector<Real_t> ue_coord=u_equiv_surf(MultipoleOrder(),c,level);
      size_t n_ue=ue_coord.size()/3;
      // Evaluate potential at check surface due to equivalent surface.
      Matrix<Real_t> M_e2c(n_ue*ker_dim[0],n_uc*ker_dim[1]);
      kernel->k_m2m->BuildMatrix(&ue_coord[0], n_ue,
                                 &uc_coord[0], n_uc, &(M_e2c[0][0]));
      Matrix<Real_t> U,S,V;
      M_e2c.SVD(U,S,V);
      M=U.Transpose();
      break;
    }
    case DC2DE0_Type: // Downward check-to-equivalent, first pseudo-inverse factor: V*S^{-1}
    {
      if(MultipoleOrder()==0) break;
      const int* ker_dim=kernel->k_l2l->ker_dim;
      // Coord of downward check surface
      Real_t c[3]={0,0,0};
      std::vector<Real_t> check_surf=d_check_surf(MultipoleOrder(),c,level);
      size_t n_ch=check_surf.size()/3;
      // Coord of downward equivalent surface
      std::vector<Real_t> equiv_surf=d_equiv_surf(MultipoleOrder(),c,level);
      size_t n_eq=equiv_surf.size()/3;
      // Evaluate potential at check surface due to equivalent surface.
      Matrix<Real_t> M_e2c(n_eq*ker_dim[0],n_ch*ker_dim[1]);
      kernel->k_l2l->BuildMatrix(&equiv_surf[0], n_eq,
                                 &check_surf[0], n_ch, &(M_e2c[0][0]));
      // Same truncated-SVD pseudo-inverse split as UC2UE0/UC2UE1 above.
      Matrix<Real_t> U,S,V;
      M_e2c.SVD(U,S,V);
      Real_t eps=1, max_S=0;
      while(eps*(Real_t)0.5+(Real_t)1.0>1.0) eps*=0.5; // machine epsilon for Real_t
      for(size_t i=0;i<std::min(S.Dim(0),S.Dim(1));i++){
        if(fabs(S[i][i])>max_S) max_S=fabs(S[i][i]);
      }
      for(size_t i=0;i<S.Dim(0);i++) S[i][i]=(S[i][i]>eps*max_S*4?1.0/S[i][i]:0.0);
      M=V.Transpose()*S;//*U.Transpose();
      break;
    }
    case DC2DE1_Type: // Downward check-to-equivalent, second pseudo-inverse factor: U^T
    {
      if(MultipoleOrder()==0) break;
      const int* ker_dim=kernel->k_l2l->ker_dim;
      // Coord of downward check surface
      Real_t c[3]={0,0,0};
      std::vector<Real_t> check_surf=d_check_surf(MultipoleOrder(),c,level);
      size_t n_ch=check_surf.size()/3;
      // Coord of downward equivalent surface
      std::vector<Real_t> equiv_surf=d_equiv_surf(MultipoleOrder(),c,level);
      size_t n_eq=equiv_surf.size()/3;
      // Evaluate potential at check surface due to equivalent surface.
      Matrix<Real_t> M_e2c(n_eq*ker_dim[0],n_ch*ker_dim[1]);
      kernel->k_l2l->BuildMatrix(&equiv_surf[0], n_eq,
                                 &check_surf[0], n_ch, &(M_e2c[0][0]));
      Matrix<Real_t> U,S,V;
      M_e2c.SVD(U,S,V);
      M=U.Transpose();
      break;
    }
    case U2U_Type: // Multipole-to-multipole: child equivalent -> parent equivalent
    {
      if(MultipoleOrder()==0) break;
      const int* ker_dim=kernel->k_m2m->ker_dim;
      // Coord of upward check surface
      Real_t c[3]={0,0,0};
      std::vector<Real_t> check_surf=u_check_surf(MultipoleOrder(),c,level);
      size_t n_uc=check_surf.size()/3;
      // Coord of child's upward equivalent surface
      Real_t s=pow(0.5,(level+2));
      int* coord=interac_list.RelativeCoord(type,mat_indx);
      Real_t child_coord[3]={(coord[0]+1)*s,(coord[1]+1)*s,(coord[2]+1)*s};
      std::vector<Real_t> equiv_surf=u_equiv_surf(MultipoleOrder(),child_coord,level+1);
      size_t n_ue=equiv_surf.size()/3;
      // Evaluate potential at check surface due to equivalent surface.
      Matrix<Real_t> M_ce2c(n_ue*ker_dim[0],n_uc*ker_dim[1]);
      kernel->k_m2m->BuildMatrix(&equiv_surf[0], n_ue,
                                 &check_surf[0], n_uc, &(M_ce2c[0][0]));
      // Compose with the parent's check-to-equivalent pseudo-inverse factors.
      Matrix<Real_t>& M_c2e0 = Precomp(level, UC2UE0_Type, 0);
      Matrix<Real_t>& M_c2e1 = Precomp(level, UC2UE1_Type, 0);
      M=(M_ce2c*M_c2e0)*M_c2e1;
      break;
    }
    case D2D_Type: // Local-to-local: parent equivalent -> child equivalent
    {
      if(MultipoleOrder()==0) break;
      const int* ker_dim=kernel->k_l2l->ker_dim;
      // Coord of downward check surface
      Real_t s=pow(0.5,level+1);
      int* coord=interac_list.RelativeCoord(type,mat_indx);
      Real_t c[3]={(coord[0]+1)*s,(coord[1]+1)*s,(coord[2]+1)*s};
      std::vector<Real_t> check_surf=d_check_surf(MultipoleOrder(),c,level);
      size_t n_dc=check_surf.size()/3;
      // Coord of parent's downward equivalent surface
      Real_t parent_coord[3]={0,0,0};
      std::vector<Real_t> equiv_surf=d_equiv_surf(MultipoleOrder(),parent_coord,level-1);
      size_t n_de=equiv_surf.size()/3;
      // Evaluate potential at check surface due to equivalent surface.
      Matrix<Real_t> M_pe2c(n_de*ker_dim[0],n_dc*ker_dim[1]);
      kernel->k_l2l->BuildMatrix(&equiv_surf[0], n_de,
                                 &check_surf[0], n_dc, &(M_pe2c[0][0]));
      // Pseudo-inverse factors are computed at level-1 (copies, not
      // references, since they may be rescaled below).
      Matrix<Real_t> M_c2e0=Precomp(level-1,DC2DE0_Type,0);
      Matrix<Real_t> M_c2e1=Precomp(level-1,DC2DE1_Type,0);
      if(ScaleInvar()){ // Scale M_c2e0 for level-1
        Permutation<Real_t> ker_perm=this->kernel->k_l2l->perm_vec[C_Perm+Scaling];
        Vector<Real_t> scal_exp=this->kernel->k_l2l->trg_scal;
        Permutation<Real_t> P=equiv_surf_perm(MultipoleOrder(), Scaling, ker_perm, &scal_exp);
        M_c2e0=P*M_c2e0;
      }
      if(ScaleInvar()){ // Scale M_c2e1 for level-1
        Permutation<Real_t> ker_perm=this->kernel->k_l2l->perm_vec[0 +Scaling];
        Vector<Real_t> scal_exp=this->kernel->k_l2l->src_scal;
        Permutation<Real_t> P=equiv_surf_perm(MultipoleOrder(), Scaling, ker_perm, &scal_exp);
        M_c2e1=M_c2e1*P;
      }
      M=M_c2e0*(M_c2e1*M_pe2c);
      break;
    }
    case D2T_Type: // Local-to-target: downward equivalent surface -> target points
    {
      if(MultipoleOrder()==0) break;
      const int* ker_dim=kernel->k_l2t->ker_dim;
      std::vector<Real_t>& rel_trg_coord=mat->RelativeTrgCoord();
      // Coord of target points
      Real_t r=pow(0.5,level);
      size_t n_trg=rel_trg_coord.size()/3;
      std::vector<Real_t> trg_coord(n_trg*3);
      for(size_t i=0;i<n_trg*COORD_DIM;i++) trg_coord[i]=rel_trg_coord[i]*r;
      // Coord of downward equivalent surface
      Real_t c[3]={0,0,0};
      std::vector<Real_t> equiv_surf=d_equiv_surf(MultipoleOrder(),c,level);
      size_t n_eq=equiv_surf.size()/3;
      // Evaluate potential at target points due to equivalent surface.
      {
        M .Resize(n_eq*ker_dim [0], n_trg*ker_dim [1]);
        kernel->k_l2t->BuildMatrix(&equiv_surf[0], n_eq, &trg_coord[0], n_trg, &(M [0][0]));
      }
      // Pre-multiply by the downward check-to-equivalent pseudo-inverse.
      Matrix<Real_t>& M_c2e0=Precomp(level,DC2DE0_Type,0);
      Matrix<Real_t>& M_c2e1=Precomp(level,DC2DE1_Type,0);
      M=M_c2e0*(M_c2e1*M);
      break;
    }
    case V_Type: // M2L translation stencil in Fourier space (FFT of kernel samples on the convolution grid)
    {
      if(MultipoleOrder()==0) break;
      const int* ker_dim=kernel->k_m2l->ker_dim;
      int n1=MultipoleOrder()*2;
      int n3 =n1*n1*n1;        // real grid size
      int n3_=n1*n1*(n1/2+1);  // r2c complex output size (Hermitian symmetry)
      //Compute the matrix.
      Real_t s=pow(0.5,level);
      int* coord2=interac_list.RelativeCoord(type,mat_indx);
      Real_t coord_diff[3]={coord2[0]*s,coord2[1]*s,coord2[2]*s};
      //Evaluate potential.
      std::vector<Real_t> r_trg(COORD_DIM,0.0);
      std::vector<Real_t> conv_poten(n3*ker_dim[0]*ker_dim[1]);
      std::vector<Real_t> conv_coord=conv_grid(MultipoleOrder(),coord_diff,level);
      kernel->k_m2l->BuildMatrix(&conv_coord[0],n3,&r_trg[0],1,&conv_poten[0]);
      //Rearrange data.
      Matrix<Real_t> M_conv(n3,ker_dim[0]*ker_dim[1],&conv_poten[0],false);
      M_conv=M_conv.Transpose();
      //Compute FFTW plan.
      int nnn[3]={n1,n1,n1};
      Real_t *fftw_in, *fftw_out;
      fftw_in = mem::aligned_new<Real_t>( n3 *ker_dim[0]*ker_dim[1]*sizeof(Real_t));
      fftw_out = mem::aligned_new<Real_t>(2*n3_*ker_dim[0]*ker_dim[1]*sizeof(Real_t));
      #pragma omp critical (FFTW_PLAN)
      { // FFTW planning is not thread-safe; plan is created once and reused.
        if (!vprecomp_fft_flag){
          vprecomp_fftplan = FFTW_t<Real_t>::fft_plan_many_dft_r2c(COORD_DIM, nnn, ker_dim[0]*ker_dim[1],
              (Real_t*)fftw_in, NULL, 1, n3, (typename FFTW_t<Real_t>::cplx*) fftw_out, NULL, 1, n3_);
          vprecomp_fft_flag=true;
        }
      }
      //Compute FFT.
      mem::memcopy(fftw_in, &conv_poten[0], n3*ker_dim[0]*ker_dim[1]*sizeof(Real_t));
      FFTW_t<Real_t>::fft_execute_dft_r2c(vprecomp_fftplan, (Real_t*)fftw_in, (typename FFTW_t<Real_t>::cplx*)(fftw_out));
      // NOTE: this local M_ deliberately shadows the outer cache reference M_;
      // it is a non-owning view over fftw_out, copied into M below.
      Matrix<Real_t> M_(2*n3_*ker_dim[0]*ker_dim[1],1,(Real_t*)fftw_out,false);
      M=M_;
      //Free memory.
      mem::aligned_delete<Real_t>(fftw_in);
      mem::aligned_delete<Real_t>(fftw_out);
      break;
    }
    case V1_Type: // Blocked M2L: gather the 8x8 child-pair V-matrices for Hadamard products
    {
      if(MultipoleOrder()==0) break;
      const int* ker_dim=kernel->k_m2l->ker_dim;
      size_t mat_cnt =interac_list.ListCount( V_Type);
      for(size_t k=0;k<mat_cnt;k++) Precomp(level, V_Type, k);
      const size_t chld_cnt=1UL<<COORD_DIM;
      size_t n1=MultipoleOrder()*2;
      size_t M_dim=n1*n1*(n1/2+1);
      size_t n3=n1*n1*n1;
      // Default every child-pair entry to a zero matrix; overwritten below
      // when a matching V-matrix exists.
      Vector<Real_t> zero_vec(M_dim*ker_dim[0]*ker_dim[1]*2);
      zero_vec.SetZero();
      Vector<Real_t*> M_ptr(chld_cnt*chld_cnt);
      for(size_t i=0;i<chld_cnt*chld_cnt;i++) M_ptr[i]=&zero_vec[0];
      int* rel_coord_=interac_list.RelativeCoord(V1_Type, mat_indx);
      for(int j1=0;j1<chld_cnt;j1++)
      for(int j2=0;j2<chld_cnt;j2++){
        // Relative coordinate of child j2 of the source w.r.t. child j1 of
        // the target, at the child level.
        int rel_coord[3]={rel_coord_[0]*2-(j1/1)%2+(j2/1)%2,
                          rel_coord_[1]*2-(j1/2)%2+(j2/2)%2,
                          rel_coord_[2]*2-(j1/4)%2+(j2/4)%2};
        for(size_t k=0;k<mat_cnt;k++){
          int* ref_coord=interac_list.RelativeCoord(V_Type, k);
          if(ref_coord[0]==rel_coord[0] &&
             ref_coord[1]==rel_coord[1] &&
             ref_coord[2]==rel_coord[2]){
            Matrix<Real_t>& M = this->mat->Mat(level, V_Type, k); // shadows outer M inside this scope
            M_ptr[j2*chld_cnt+j1]=&M[0][0];
            break;
          }
        }
      }
      // Build matrix ker_dim0 x ker_dim1 x M_dim x 8 x 8
      // Interleaved (re,im) layout; division by n3 folds in the FFT normalization.
      M.Resize(ker_dim[0]*ker_dim[1]*M_dim, 2*chld_cnt*chld_cnt);
      for(int j=0;j<ker_dim[0]*ker_dim[1]*M_dim;j++){
        for(size_t k=0;k<chld_cnt*chld_cnt;k++){
          M[j][k*2+0]=M_ptr[k][j*2+0]/n3;
          M[j][k*2+1]=M_ptr[k][j*2+1]/n3;
        }
      }
      break;
    }
    case W_Type: // Multipole-to-target: child upward equivalent surface -> target points
    {
      if(MultipoleOrder()==0) break;
      const int* ker_dim=kernel->k_m2t->ker_dim;
      std::vector<Real_t>& rel_trg_coord=mat->RelativeTrgCoord();
      // Coord of target points
      Real_t s=pow(0.5,level);
      size_t n_trg=rel_trg_coord.size()/3;
      std::vector<Real_t> trg_coord(n_trg*3);
      for(size_t j=0;j<n_trg*COORD_DIM;j++) trg_coord[j]=rel_trg_coord[j]*s;
      // Coord of downward equivalent surface
      int* coord2=interac_list.RelativeCoord(type,mat_indx);
      Real_t c[3]={(coord2[0]+1)*s*0.25,(coord2[1]+1)*s*0.25,(coord2[2]+1)*s*0.25};
      std::vector<Real_t> equiv_surf=u_equiv_surf(MultipoleOrder(),c,level+1);
      size_t n_eq=equiv_surf.size()/3;
      // Evaluate potential at target points due to equivalent surface.
      {
        M .Resize(n_eq*ker_dim [0],n_trg*ker_dim [1]);
        kernel->k_m2t->BuildMatrix(&equiv_surf[0], n_eq, &trg_coord[0], n_trg, &(M [0][0]));
      }
      break;
    }
    case BC_Type: // Periodic boundary-condition operator, built by recursing over BC_LEVELS coarser levels
    {
      // Only defined for scale-invariant kernels with compatible dimensions.
      if(!this->ScaleInvar() || MultipoleOrder()==0) break;
      if(kernel->k_m2l->ker_dim[0]!=kernel->k_m2m->ker_dim[0]) break;
      if(kernel->k_m2l->ker_dim[1]!=kernel->k_l2l->ker_dim[1]) break;
      const int* ker_dim=kernel->k_m2l->ker_dim;
      size_t mat_cnt_m2m=interac_list.ListCount(U2U_Type);
      size_t n_surf=(6*(MultipoleOrder()-1)*(MultipoleOrder()-1)+2); //Total number of points.
      if((M.Dim(0)!=n_surf*ker_dim[0] || M.Dim(1)!=n_surf*ker_dim[1]) && level==0){
        Matrix<Real_t> M_m2m[BC_LEVELS+1];
        Matrix<Real_t> M_m2l[BC_LEVELS+1];
        Matrix<Real_t> M_l2l[BC_LEVELS+1];
        // Projectors that subtract the per-component average over surface points.
        Matrix<Real_t> M_equiv_zero_avg(n_surf*ker_dim[0],n_surf*ker_dim[0]);
        Matrix<Real_t> M_check_zero_avg(n_surf*ker_dim[1],n_surf*ker_dim[1]);
        { // Set average multipole charge to zero. (improves stability for large BC_LEVELS)
          M_equiv_zero_avg.SetZero();
          for(size_t i=0;i<n_surf*ker_dim[0];i++)
            M_equiv_zero_avg[i][i]+=1;
          for(size_t i=0;i<n_surf;i++)
          for(size_t j=0;j<n_surf;j++)
          for(size_t k=0;k<ker_dim[0];k++)
            M_equiv_zero_avg[i*ker_dim[0]+k][j*ker_dim[0]+k]-=1.0/n_surf;
        }
        { // Set average check potential to zero. (improves stability for large BC_LEVELS)
          M_check_zero_avg.SetZero();
          for(size_t i=0;i<n_surf*ker_dim[1];i++)
            M_check_zero_avg[i][i]+=1;
          for(size_t i=0;i<n_surf;i++)
          for(size_t j=0;j<n_surf;j++)
          for(size_t k=0;k<ker_dim[1];k++)
            M_check_zero_avg[i*ker_dim[1]+k][j*ker_dim[1]+k]-=1.0/n_surf;
        }
        // Build the per-level translation operators at negative (coarser) levels.
        for(int level=0; level>=-BC_LEVELS; level--){
          { // Compute M_l2l
            this->Precomp(level, D2D_Type, 0);
            Permutation<Real_t>& Pr = this->interac_list.Perm_R(level, D2D_Type, 0);
            Permutation<Real_t>& Pc = this->interac_list.Perm_C(level, D2D_Type, 0);
            M_l2l[-level] = M_check_zero_avg * Pr * this->Precomp(level, D2D_Type, this->interac_list.InteracClass(D2D_Type, 0)) * Pc * M_check_zero_avg;
            assert(M_l2l[-level].Dim(0)>0 && M_l2l[-level].Dim(1)>0);
          }
          // Compute M_m2m (sum over all eight child-to-parent translations)
          for(size_t mat_indx=0; mat_indx<mat_cnt_m2m; mat_indx++){
            this->Precomp(level, U2U_Type, mat_indx);
            Permutation<Real_t>& Pr = this->interac_list.Perm_R(level, U2U_Type, mat_indx);
            Permutation<Real_t>& Pc = this->interac_list.Perm_C(level, U2U_Type, mat_indx);
            Matrix<Real_t> M = Pr * this->Precomp(level, U2U_Type, this->interac_list.InteracClass(U2U_Type, mat_indx)) * Pc; // shadows outer M in this scope
            assert(M.Dim(0)>0 && M.Dim(1)>0);
            if(mat_indx==0) M_m2m[-level] = M_equiv_zero_avg*M*M_equiv_zero_avg;
            else M_m2m[-level] += M_equiv_zero_avg*M*M_equiv_zero_avg;
          }
          // Compute M_m2l
          if(!ScaleInvar() || level==0){
            // Direct summation over the 189 well-separated boxes in [-2,3]^3.
            Real_t s=(1UL<<(-level));
            Real_t dc_coord[3]={0,0,0};
            std::vector<Real_t> trg_coord=d_check_surf(MultipoleOrder(), dc_coord, level);
            Matrix<Real_t> M_ue2dc(n_surf*ker_dim[0], n_surf*ker_dim[1]); M_ue2dc.SetZero();
            for(int x0=-2;x0<4;x0++)
            for(int x1=-2;x1<4;x1++)
            for(int x2=-2;x2<4;x2++)
            if(abs(x0)>1 || abs(x1)>1 || abs(x2)>1){
              Real_t ue_coord[3]={x0*s, x1*s, x2*s};
              std::vector<Real_t> src_coord=u_equiv_surf(MultipoleOrder(), ue_coord, level);
              Matrix<Real_t> M_tmp(n_surf*ker_dim[0], n_surf*ker_dim[1]);
              kernel->k_m2l->BuildMatrix(&src_coord[0], n_surf,
                                         &trg_coord[0], n_surf, &(M_tmp[0][0]));
              M_ue2dc+=M_tmp;
            }
            M_m2l[-level]=M_check_zero_avg*M_ue2dc * M_check_zero_avg;
          }else{
            // Scale-invariant kernels: reuse the previous level's matrix and rescale.
            M_m2l[-level]=M_equiv_zero_avg * M_m2l[-level-1] * M_check_zero_avg;
            if(ScaleInvar()){ // Scale M_m2l
              Permutation<Real_t> ker_perm=this->kernel->k_m2l->perm_vec[0 +Scaling];
              Vector<Real_t> scal_exp=this->kernel->k_m2l->src_scal;
              for(size_t i=0;i<scal_exp.Dim();i++) scal_exp[i]=-scal_exp[i];
              Permutation<Real_t> P=equiv_surf_perm(MultipoleOrder(), Scaling, ker_perm, &scal_exp);
              M_m2l[-level]=P*M_m2l[-level];
            }
            if(ScaleInvar()){ // Scale M_m2l
              Permutation<Real_t> ker_perm=this->kernel->k_m2l->perm_vec[C_Perm+Scaling];
              Vector<Real_t> scal_exp=this->kernel->k_m2l->trg_scal;
              for(size_t i=0;i<scal_exp.Dim();i++) scal_exp[i]=-scal_exp[i];
              Permutation<Real_t> P=equiv_surf_perm(MultipoleOrder(), Scaling, ker_perm, &scal_exp);
              M_m2l[-level]=M_m2l[-level]*P;
            }
          }
        }
        // Accumulate the BC operator from the coarsest level down to level 0.
        for(int level=-BC_LEVELS;level<=0;level++){
          if(level==-BC_LEVELS) M = M_m2l[-level];
          else M = M_equiv_zero_avg * (M_m2l[-level] + M_m2m[-level]*M*M_l2l[-level]) * M_equiv_zero_avg;
        }
        { // ax+by+cz+d correction.
          // Remove the residual affine field by sampling the error at the
          // origin and the three unit corners, then subtracting its linear
          // interpolant over the downward check surface.
          std::vector<Real_t> corner_pts;
          corner_pts.push_back(0); corner_pts.push_back(0); corner_pts.push_back(0);
          corner_pts.push_back(1); corner_pts.push_back(0); corner_pts.push_back(0);
          corner_pts.push_back(0); corner_pts.push_back(1); corner_pts.push_back(0);
          corner_pts.push_back(0); corner_pts.push_back(0); corner_pts.push_back(1);
          size_t n_corner=corner_pts.size()/COORD_DIM;
          // Coord of downward equivalent surface
          Real_t c[3]={0,0,0};
          std::vector<Real_t> up_equiv_surf=u_equiv_surf(MultipoleOrder(),c,0);
          std::vector<Real_t> dn_equiv_surf=d_equiv_surf(MultipoleOrder(),c,0);
          std::vector<Real_t> dn_check_surf=d_check_surf(MultipoleOrder(),c,0);
          Matrix<Real_t> M_err;
          { // Evaluate potential at corner due to upward and dnward equivalent surface.
            { // Error from local expansion.
              Matrix<Real_t> M_e2pt(n_surf*ker_dim[0],n_corner*ker_dim[1]);
              kernel->k_m2l->BuildMatrix(&dn_equiv_surf[0], n_surf,
                                         &corner_pts[0], n_corner, &(M_e2pt[0][0]));
              Matrix<Real_t>& M_dc2de0 = Precomp(0, DC2DE0_Type, 0);
              Matrix<Real_t>& M_dc2de1 = Precomp(0, DC2DE1_Type, 0);
              M_err=(M*M_dc2de0)*(M_dc2de1*M_e2pt);
            }
            for(size_t k=0;k<4;k++){ // Error from colleagues of root.
              for(int j0=-1;j0<=1;j0++)
              for(int j1=-1;j1<=1;j1++)
              for(int j2=-1;j2<=1;j2++){
                Real_t pt_coord[3]={corner_pts[k*COORD_DIM+0]-j0,
                                    corner_pts[k*COORD_DIM+1]-j1,
                                    corner_pts[k*COORD_DIM+2]-j2};
                if(fabs(pt_coord[0]-0.5)>1.0 || fabs(pt_coord[1]-0.5)>1.0 || fabs(pt_coord[2]-0.5)>1.0){
                  Matrix<Real_t> M_e2pt(n_surf*ker_dim[0],ker_dim[1]);
                  kernel->k_m2l->BuildMatrix(&up_equiv_surf[0], n_surf,
                                             &pt_coord[0], 1, &(M_e2pt[0][0]));
                  for(size_t i=0;i<M_e2pt.Dim(0);i++)
                  for(size_t j=0;j<M_e2pt.Dim(1);j++)
                    M_err[i][k*ker_dim[1]+j]+=M_e2pt[i][j];
                }
              }
            }
          }
          // Linear interpolant of the corner errors over the check surface.
          Matrix<Real_t> M_grad(M_err.Dim(0),n_surf*ker_dim[1]);
          for(size_t i=0;i<M_err.Dim(0);i++)
          for(size_t k=0;k<ker_dim[1];k++)
          for(size_t j=0;j<n_surf;j++){
            M_grad[i][j*ker_dim[1]+k]=(M_err[i][0*ker_dim[1]+k] )*1.0 +
                                      (M_err[i][1*ker_dim[1]+k]-M_err[i][0*ker_dim[1]+k])*dn_check_surf[j*COORD_DIM+0]+
                                      (M_err[i][2*ker_dim[1]+k]-M_err[i][0*ker_dim[1]+k])*dn_check_surf[j*COORD_DIM+1]+
                                      (M_err[i][3*ker_dim[1]+k]-M_err[i][0*ker_dim[1]+k])*dn_check_surf[j*COORD_DIM+2];
          }
          M-=M_grad;
        }
        if(!this->ScaleInvar()){ // Free memory
          // NOTE(review): this branch is dead under the ScaleInvar() guard at
          // the top of this case — presumably kept for a non-scale-invariant
          // BC path; confirm before removing.
          Mat_Type type=D2D_Type;
          for(int l=-BC_LEVELS;l<0;l++)
          for(size_t indx=0;indx<this->interac_list.ListCount(type);indx++){
            Matrix<Real_t>& M=this->mat->Mat(l, type, indx);
            M.Resize(0,0);
          }
          type=U2U_Type;
          for(int l=-BC_LEVELS;l<0;l++)
          for(size_t indx=0;indx<this->interac_list.ListCount(type);indx++){
            Matrix<Real_t>& M=this->mat->Mat(l, type, indx);
            M.Resize(0,0);
          }
          type=DC2DE0_Type;
          for(int l=-BC_LEVELS;l<0;l++)
          for(size_t indx=0;indx<this->interac_list.ListCount(type);indx++){
            Matrix<Real_t>& M=this->mat->Mat(l, type, indx);
            M.Resize(0,0);
          }
          type=DC2DE1_Type;
          for(int l=-BC_LEVELS;l<0;l++)
          for(size_t indx=0;indx<this->interac_list.ListCount(type);indx++){
            Matrix<Real_t>& M=this->mat->Mat(l, type, indx);
            M.Resize(0,0);
          }
          type=UC2UE0_Type;
          for(int l=-BC_LEVELS;l<0;l++)
          for(size_t indx=0;indx<this->interac_list.ListCount(type);indx++){
            Matrix<Real_t>& M=this->mat->Mat(l, type, indx);
            M.Resize(0,0);
          }
          type=UC2UE1_Type;
          for(int l=-BC_LEVELS;l<0;l++)
          for(size_t indx=0;indx<this->interac_list.ListCount(type);indx++){
            Matrix<Real_t>& M=this->mat->Mat(l, type, indx);
            M.Resize(0,0);
          }
        }
      }
      break;
    }
    default:
      break;
  }
  //Save the matrix for future use.
  #pragma omp critical (PRECOMP_MATRIX_PTS)
  if(M_.Dim(0)==0 && M_.Dim(1)==0){
    M_=M;
    /*
    M_.Resize(M.Dim(0),M.Dim(1));
    int dof=ker_dim[0]*ker_dim[1];
    for(int j=0;j<dof;j++){
      size_t a=(M.Dim(0)*M.Dim(1)* j   )/dof;
      size_t b=(M.Dim(0)*M.Dim(1)*(j+1))/dof;
      #pragma omp parallel for // NUMA
      for(int tid=0;tid<omp_p;tid++){
        size_t a_=a+((b-a)* tid   )/omp_p;
        size_t b_=a+((b-a)*(tid+1))/omp_p;
        mem::memcopy(&M_[0][a_], &M[0][a_], (b_-a_)*sizeof(Real_t));
      }
    }
    */
  }
  return M_;
}
  886. template <class FMMNode>
  887. void FMM_Pts<FMMNode>::PrecompAll(Mat_Type type, int level){
  888. if(level==-1){
  889. for(int l=0;l<MAX_DEPTH;l++){
  890. PrecompAll(type, l);
  891. }
  892. return;
  893. }
  894. //Compute basic permutations.
  895. for(size_t i=0;i<Perm_Count;i++)
  896. this->PrecompPerm(type, (Perm_Type) i);
  897. {
  898. //Allocate matrices.
  899. size_t mat_cnt=interac_list.ListCount((Mat_Type)type);
  900. mat->Mat(level, (Mat_Type)type, mat_cnt-1);
  901. { // Compute InteracClass matrices.
  902. std::vector<size_t> indx_lst;
  903. for(size_t i=0; i<mat_cnt; i++){
  904. if(interac_list.InteracClass((Mat_Type)type,i)==i)
  905. indx_lst.push_back(i);
  906. }
  907. //Compute Transformations.
  908. //#pragma omp parallel for //lets use fine grained parallelism
  909. for(size_t i=0; i<indx_lst.size(); i++){
  910. Precomp(level, (Mat_Type)type, indx_lst[i]);
  911. }
  912. }
  913. //#pragma omp parallel for //lets use fine grained parallelism
  914. for(size_t mat_indx=0;mat_indx<mat_cnt;mat_indx++){
  915. Matrix<Real_t>& M0=interac_list.ClassMat(level,(Mat_Type)type,mat_indx);
  916. Permutation<Real_t>& pr=interac_list.Perm_R(level, (Mat_Type)type, mat_indx);
  917. Permutation<Real_t>& pc=interac_list.Perm_C(level, (Mat_Type)type, mat_indx);
  918. if(pr.Dim()!=M0.Dim(0) || pc.Dim()!=M0.Dim(1)) Precomp(level, (Mat_Type)type, mat_indx);
  919. }
  920. }
  921. }
  922. template <class FMMNode>
  923. void FMM_Pts<FMMNode>::CollectNodeData(FMMTree_t* tree, std::vector<FMMNode*>& node, std::vector<Matrix<Real_t> >& buff_list, std::vector<Vector<FMMNode_t*> >& n_list, std::vector<std::vector<Vector<Real_t>* > > vec_list){
  924. if(buff_list.size()<7) buff_list.resize(7);
  925. if( n_list.size()<7) n_list.resize(7);
  926. if( vec_list.size()<7) vec_list.resize(7);
  927. int omp_p=omp_get_max_threads();
  928. if(node.size()==0) return;
  929. {// 0. upward_equiv
  930. int indx=0;
  931. size_t vec_sz;
  932. { // Set vec_sz
  933. Matrix<Real_t>& M_uc2ue = this->interac_list.ClassMat(0, UC2UE1_Type, 0);
  934. vec_sz=M_uc2ue.Dim(1);
  935. }
  936. std::vector< FMMNode* > node_lst;
  937. {// Construct node_lst
  938. node_lst.clear();
  939. std::vector<std::vector< FMMNode* > > node_lst_(MAX_DEPTH+1);
  940. FMMNode_t* r_node=NULL;
  941. for(size_t i=0;i<node.size();i++){
  942. if(!node[i]->IsLeaf()){
  943. node[i]->pt_cnt[0] =0;
  944. node_lst_[node[i]->Depth()].push_back(node[i]);
  945. }else{
  946. node[i]->pt_cnt[0] =node[i]-> src_coord.Dim()/COORD_DIM;
  947. node[i]->pt_cnt[0]+=node[i]->surf_coord.Dim()/COORD_DIM;
  948. }
  949. if(node[i]->Depth()==0) r_node=node[i];
  950. }
  951. size_t chld_cnt=1UL<<COORD_DIM;
  952. for(int i=MAX_DEPTH;i>=0;i--){
  953. for(size_t j=0;j<node_lst_[i].size();j++){
  954. for(size_t k=0;k<chld_cnt;k++){
  955. FMMNode_t* node=(FMMNode_t*)node_lst_[i][j]->Child(k);
  956. node_lst_[i][j]->pt_cnt[0]+=node->pt_cnt[0];
  957. }
  958. }
  959. }
  960. for(int i=0;i<=MAX_DEPTH;i++){
  961. for(size_t j=0;j<node_lst_[i].size();j++){
  962. for(size_t k=0;k<chld_cnt;k++){
  963. FMMNode_t* node=(FMMNode_t*)node_lst_[i][j]->Child(k);
  964. node_lst.push_back(node);
  965. }
  966. }
  967. }
  968. if(r_node!=NULL) node_lst.push_back(r_node);
  969. n_list[indx]=node_lst;
  970. }
  971. std::vector<Vector<Real_t>*>& vec_lst=vec_list[indx];
  972. for(size_t i=0;i<node_lst.size();i++){ // Construct vec_lst
  973. FMMNode_t* node=node_lst[i];
  974. Vector<Real_t>& data_vec=node->FMMData()->upward_equiv;
  975. data_vec.ReInit(vec_sz,NULL,false);
  976. vec_lst.push_back(&data_vec);
  977. }
  978. }
  979. {// 1. dnward_equiv
  980. int indx=1;
  981. size_t vec_sz;
  982. { // Set vec_sz
  983. Matrix<Real_t>& M_dc2de0 = this->interac_list.ClassMat(0, DC2DE0_Type, 0);
  984. vec_sz=M_dc2de0.Dim(0);
  985. }
  986. std::vector< FMMNode* > node_lst;
  987. {// Construct node_lst
  988. node_lst.clear();
  989. std::vector<std::vector< FMMNode* > > node_lst_(MAX_DEPTH+1);
  990. FMMNode_t* r_node=NULL;
  991. for(size_t i=0;i<node.size();i++){
  992. if(!node[i]->IsLeaf()){
  993. node[i]->pt_cnt[1]=0;
  994. node_lst_[node[i]->Depth()].push_back(node[i]);
  995. }else{
  996. node[i]->pt_cnt[1]=node[i]->trg_coord.Dim()/COORD_DIM;
  997. }
  998. if(node[i]->Depth()==0) r_node=node[i];
  999. }
  1000. size_t chld_cnt=1UL<<COORD_DIM;
  1001. for(int i=MAX_DEPTH;i>=0;i--){
  1002. for(size_t j=0;j<node_lst_[i].size();j++){
  1003. for(size_t k=0;k<chld_cnt;k++){
  1004. FMMNode_t* node=(FMMNode_t*)node_lst_[i][j]->Child(k);
  1005. node_lst_[i][j]->pt_cnt[1]+=node->pt_cnt[1];
  1006. }
  1007. }
  1008. }
  1009. for(int i=0;i<=MAX_DEPTH;i++){
  1010. for(size_t j=0;j<node_lst_[i].size();j++){
  1011. for(size_t k=0;k<chld_cnt;k++){
  1012. FMMNode_t* node=(FMMNode_t*)node_lst_[i][j]->Child(k);
  1013. node_lst.push_back(node);
  1014. }
  1015. }
  1016. }
  1017. if(r_node!=NULL) node_lst.push_back(r_node);
  1018. n_list[indx]=node_lst;
  1019. }
  1020. std::vector<Vector<Real_t>*>& vec_lst=vec_list[indx];
  1021. for(size_t i=0;i<node_lst.size();i++){ // Construct vec_lst
  1022. FMMNode_t* node=node_lst[i];
  1023. Vector<Real_t>& data_vec=node->FMMData()->dnward_equiv;
  1024. data_vec.ReInit(vec_sz,NULL,false);
  1025. vec_lst.push_back(&data_vec);
  1026. }
  1027. }
  1028. {// 2. upward_equiv_fft
  1029. int indx=2;
  1030. std::vector< FMMNode* > node_lst;
  1031. {
  1032. std::vector<std::vector< FMMNode* > > node_lst_(MAX_DEPTH+1);
  1033. for(size_t i=0;i<node.size();i++)
  1034. if(!node[i]->IsLeaf())
  1035. node_lst_[node[i]->Depth()].push_back(node[i]);
  1036. for(int i=0;i<=MAX_DEPTH;i++)
  1037. for(size_t j=0;j<node_lst_[i].size();j++)
  1038. node_lst.push_back(node_lst_[i][j]);
  1039. }
  1040. n_list[indx]=node_lst;
  1041. }
  1042. {// 3. dnward_check_fft
  1043. int indx=3;
  1044. std::vector< FMMNode* > node_lst;
  1045. {
  1046. std::vector<std::vector< FMMNode* > > node_lst_(MAX_DEPTH+1);
  1047. for(size_t i=0;i<node.size();i++)
  1048. if(!node[i]->IsLeaf() && !node[i]->IsGhost())
  1049. node_lst_[node[i]->Depth()].push_back(node[i]);
  1050. for(int i=0;i<=MAX_DEPTH;i++)
  1051. for(size_t j=0;j<node_lst_[i].size();j++)
  1052. node_lst.push_back(node_lst_[i][j]);
  1053. }
  1054. n_list[indx]=node_lst;
  1055. }
  1056. {// 4. src_val
  1057. int indx=4;
  1058. int src_dof=kernel->ker_dim[0];
  1059. int surf_dof=COORD_DIM+src_dof;
  1060. std::vector< FMMNode* > node_lst;
  1061. for(size_t i=0;i<node.size();i++){// Construct node_lst
  1062. if(node[i]->IsLeaf()){
  1063. node_lst.push_back(node[i]);
  1064. }
  1065. }
  1066. n_list[indx]=node_lst;
  1067. std::vector<Vector<Real_t>*>& vec_lst=vec_list[indx];
  1068. for(size_t i=0;i<node_lst.size();i++){ // Construct vec_lst
  1069. FMMNode_t* node=node_lst[i];
  1070. { // src_value
  1071. Vector<Real_t>& data_vec=node->src_value;
  1072. size_t vec_sz=(node->src_coord.Dim()/COORD_DIM)*src_dof;
  1073. if(data_vec.Dim()!=vec_sz) data_vec.ReInit(vec_sz,NULL,false);
  1074. vec_lst.push_back(&data_vec);
  1075. }
  1076. { // surf_value
  1077. Vector<Real_t>& data_vec=node->surf_value;
  1078. size_t vec_sz=(node->surf_coord.Dim()/COORD_DIM)*surf_dof;
  1079. if(data_vec.Dim()!=vec_sz) data_vec.ReInit(vec_sz,NULL,false);
  1080. vec_lst.push_back(&data_vec);
  1081. }
  1082. }
  1083. }
  1084. {// 5. trg_val
  1085. int indx=5;
  1086. int trg_dof=kernel->ker_dim[1];
  1087. std::vector< FMMNode* > node_lst;
  1088. for(size_t i=0;i<node.size();i++){// Construct node_lst
  1089. if(node[i]->IsLeaf() && !node[i]->IsGhost()){
  1090. node_lst.push_back(node[i]);
  1091. }
  1092. }
  1093. n_list[indx]=node_lst;
  1094. std::vector<Vector<Real_t>*>& vec_lst=vec_list[indx];
  1095. for(size_t i=0;i<node_lst.size();i++){ // Construct vec_lst
  1096. FMMNode_t* node=node_lst[i];
  1097. { // trg_value
  1098. Vector<Real_t>& data_vec=node->trg_value;
  1099. size_t vec_sz=(node->trg_coord.Dim()/COORD_DIM)*trg_dof;
  1100. data_vec.ReInit(vec_sz,NULL,false);
  1101. vec_lst.push_back(&data_vec);
  1102. }
  1103. }
  1104. }
  1105. {// 6. pts_coord
  1106. int indx=6;
  1107. std::vector< FMMNode* > node_lst;
  1108. for(size_t i=0;i<node.size();i++){// Construct node_lst
  1109. if(node[i]->IsLeaf()){
  1110. node_lst.push_back(node[i]);
  1111. }
  1112. }
  1113. n_list[indx]=node_lst;
  1114. std::vector<Vector<Real_t>*>& vec_lst=vec_list[indx];
  1115. for(size_t i=0;i<node_lst.size();i++){ // Construct vec_lst
  1116. FMMNode_t* node=node_lst[i];
  1117. { // src_coord
  1118. Vector<Real_t>& data_vec=node->src_coord;
  1119. vec_lst.push_back(&data_vec);
  1120. }
  1121. { // surf_coord
  1122. Vector<Real_t>& data_vec=node->surf_coord;
  1123. vec_lst.push_back(&data_vec);
  1124. }
  1125. { // trg_coord
  1126. Vector<Real_t>& data_vec=node->trg_coord;
  1127. vec_lst.push_back(&data_vec);
  1128. }
  1129. }
  1130. { // check and equiv surfaces.
  1131. if(tree->upwd_check_surf.size()==0){
  1132. size_t m=MultipoleOrder();
  1133. tree->upwd_check_surf.resize(MAX_DEPTH);
  1134. tree->upwd_equiv_surf.resize(MAX_DEPTH);
  1135. tree->dnwd_check_surf.resize(MAX_DEPTH);
  1136. tree->dnwd_equiv_surf.resize(MAX_DEPTH);
  1137. for(size_t depth=0;depth<MAX_DEPTH;depth++){
  1138. Real_t c[3]={0.0,0.0,0.0};
  1139. tree->upwd_check_surf[depth].ReInit((6*(m-1)*(m-1)+2)*COORD_DIM);
  1140. tree->upwd_equiv_surf[depth].ReInit((6*(m-1)*(m-1)+2)*COORD_DIM);
  1141. tree->dnwd_check_surf[depth].ReInit((6*(m-1)*(m-1)+2)*COORD_DIM);
  1142. tree->dnwd_equiv_surf[depth].ReInit((6*(m-1)*(m-1)+2)*COORD_DIM);
  1143. tree->upwd_check_surf[depth]=u_check_surf(m,c,depth);
  1144. tree->upwd_equiv_surf[depth]=u_equiv_surf(m,c,depth);
  1145. tree->dnwd_check_surf[depth]=d_check_surf(m,c,depth);
  1146. tree->dnwd_equiv_surf[depth]=d_equiv_surf(m,c,depth);
  1147. }
  1148. }
  1149. for(size_t depth=0;depth<MAX_DEPTH;depth++){
  1150. vec_lst.push_back(&tree->upwd_check_surf[depth]);
  1151. vec_lst.push_back(&tree->upwd_equiv_surf[depth]);
  1152. vec_lst.push_back(&tree->dnwd_check_surf[depth]);
  1153. vec_lst.push_back(&tree->dnwd_equiv_surf[depth]);
  1154. }
  1155. }
  1156. }
  1157. // Create extra auxiliary buffer.
  1158. if(buff_list.size()<=vec_list.size()) buff_list.resize(vec_list.size()+1);
  1159. for(size_t indx=0;indx<vec_list.size();indx++){ // Resize buffer
  1160. Matrix<Real_t>& aux_buff=buff_list[vec_list.size()];
  1161. Matrix<Real_t>& buff=buff_list[indx];
  1162. std::vector<Vector<Real_t>*>& vec_lst= vec_list[indx];
  1163. bool keep_data=(indx==4 || indx==6);
  1164. size_t n_vec=vec_lst.size();
  1165. { // Continue if nothing to be done.
  1166. if(!n_vec) continue;
  1167. if(buff.Dim(0)*buff.Dim(1)>0){
  1168. bool init_buff=false;
  1169. Real_t* buff_start=&buff[0][0];
  1170. Real_t* buff_end=&buff[0][0]+buff.Dim(0)*buff.Dim(1);
  1171. #pragma omp parallel for reduction(||:init_buff)
  1172. for(size_t i=0;i<n_vec;i++){
  1173. if(vec_lst[i]->Dim() && (&(*vec_lst[i])[0]<buff_start || &(*vec_lst[i])[0]>=buff_end)){
  1174. init_buff=true;
  1175. }
  1176. }
  1177. if(!init_buff) continue;
  1178. }
  1179. }
  1180. std::vector<size_t> vec_size(n_vec);
  1181. std::vector<size_t> vec_disp(n_vec);
  1182. if(n_vec){ // Set vec_size and vec_disp
  1183. #pragma omp parallel for
  1184. for(size_t i=0;i<n_vec;i++){ // Set vec_size
  1185. vec_size[i]=vec_lst[i]->Dim();
  1186. }
  1187. vec_disp[0]=0;
  1188. omp_par::scan(&vec_size[0],&vec_disp[0],n_vec);
  1189. }
  1190. size_t buff_size=vec_size[n_vec-1]+vec_disp[n_vec-1];
  1191. if(!buff_size) continue;
  1192. if(keep_data){ // Copy to aux_buff
  1193. if(aux_buff.Dim(0)*aux_buff.Dim(1)<buff_size){ // Resize aux_buff
  1194. aux_buff.ReInit(1,buff_size*1.05);
  1195. }
  1196. #pragma omp parallel for
  1197. for(size_t i=0;i<n_vec;i++){
  1198. if(&(*vec_lst[i])[0]){
  1199. mem::memcopy(&aux_buff[0][0]+vec_disp[i],&(*vec_lst[i])[0],vec_size[i]*sizeof(Real_t));
  1200. }
  1201. }
  1202. }
  1203. if(buff.Dim(0)*buff.Dim(1)<buff_size){ // Resize buff
  1204. buff.ReInit(1,buff_size*1.05);
  1205. }
  1206. if(keep_data){ // Copy to buff (from aux_buff)
  1207. #pragma omp parallel for
  1208. for(size_t tid=0;tid<omp_p;tid++){
  1209. size_t a=(buff_size*(tid+0))/omp_p;
  1210. size_t b=(buff_size*(tid+1))/omp_p;
  1211. mem::memcopy(&buff[0][0]+a,&aux_buff[0][0]+a,(b-a)*sizeof(Real_t));
  1212. }
  1213. }
  1214. #pragma omp parallel for
  1215. for(size_t i=0;i<n_vec;i++){ // ReInit vectors
  1216. vec_lst[i]->ReInit(vec_size[i],&buff[0][0]+vec_disp[i],false);
  1217. }
  1218. }
  1219. }
  1220. template <class FMMNode>
  1221. void FMM_Pts<FMMNode>::SetupPrecomp(SetupData<Real_t>& setup_data, bool device){
  1222. if(setup_data.precomp_data==NULL || setup_data.level>MAX_DEPTH) return;
  1223. Profile::Tic("SetupPrecomp",&this->comm,true,25);
  1224. { // Build precomp_data
  1225. size_t precomp_offset=0;
  1226. int level=setup_data.level;
  1227. Matrix<char>& precomp_data=*setup_data.precomp_data;
  1228. std::vector<Mat_Type>& interac_type_lst=setup_data.interac_type;
  1229. for(size_t type_indx=0; type_indx<interac_type_lst.size(); type_indx++){
  1230. Mat_Type& interac_type=interac_type_lst[type_indx];
  1231. this->PrecompAll(interac_type, level); // Compute matrices.
  1232. precomp_offset=this->mat->CompactData(level, interac_type, precomp_data, precomp_offset);
  1233. }
  1234. }
  1235. Profile::Toc();
  1236. if(device){ // Host2Device
  1237. Profile::Tic("Host2Device",&this->comm,false,25);
  1238. setup_data.precomp_data->AllocDevice(true);
  1239. Profile::Toc();
  1240. }
  1241. }
// Build the interaction-evaluation plan (setup_data.interac_data) for the
// configured interaction types: pairs each target node with its source
// nodes, partitions the interactions into blocks whose scratch vectors fit
// in the device buffer, and serializes matrix offsets plus input/output
// permutation records into one flat byte array that EvalList later decodes
// field-by-field in the same order.
template <class FMMNode>
void FMM_Pts<FMMNode>::SetupInterac(SetupData<Real_t>& setup_data, bool device){
  int level=setup_data.level;
  std::vector<Mat_Type>& interac_type_lst=setup_data.interac_type;
  std::vector<void*>& nodes_in =setup_data.nodes_in ;
  std::vector<void*>& nodes_out=setup_data.nodes_out;
  Matrix<Real_t>& input_data=*setup_data. input_data;
  Matrix<Real_t>& output_data=*setup_data.output_data;
  std::vector<Vector<Real_t>*>& input_vector=setup_data. input_vector;
  std::vector<Vector<Real_t>*>& output_vector=setup_data.output_vector;
  size_t n_in =nodes_in .size();
  size_t n_out=nodes_out.size();
  // Setup precomputed data.
  if(setup_data.precomp_data->Dim(0)*setup_data.precomp_data->Dim(1)==0) SetupPrecomp(setup_data,device);
  // Build interac_data
  Profile::Tic("Interac-Data",&this->comm,true,25);
  Matrix<char>& interac_data=setup_data.interac_data;
  { // Build precomp_data, interac_data
    std::vector<size_t> interac_mat; // byte offset of the precomputed matrix per (type,relative-position) entry
    std::vector<size_t> interac_cnt; // number of interactions per (type,relative-position) entry
    std::vector<size_t> interac_blk; // number of entries grouped into each buffer-sized block
    std::vector<size_t> input_perm;  // 4-tuples per interaction: perm offset, scal offset, buffer offset, source offset
    std::vector<size_t> output_perm; // 4-tuples per interaction: perm offset, scal offset, buffer offset, target offset
    size_t dof=0, M_dim0=0, M_dim1=0;
    size_t precomp_offset=0;
    size_t buff_size=DEVICE_BUFFER_SIZE*1024l*1024l;
    if(n_out && n_in) for(size_t type_indx=0; type_indx<interac_type_lst.size(); type_indx++){
      Mat_Type& interac_type=interac_type_lst[type_indx];
      size_t mat_cnt=this->interac_list.ListCount(interac_type);
      Matrix<size_t> precomp_data_offset;
      { // Load precomp_data for interac_type.
        // Header layout must match what CompactData wrote in SetupPrecomp.
        struct HeaderData{
          size_t total_size;
          size_t level;
          size_t mat_cnt ;
          size_t max_depth;
        };
        Matrix<char>& precomp_data=*setup_data.precomp_data;
        char* indx_ptr=precomp_data[0]+precomp_offset;
        HeaderData& header=*(HeaderData*)indx_ptr;indx_ptr+=sizeof(HeaderData);
        // Each row: [matrix offset, then per-depth (perm,scal) offsets for input and output].
        precomp_data_offset.ReInit(header.mat_cnt,(1+(2+2)*header.max_depth), (size_t*)indx_ptr, false);
        precomp_offset+=header.total_size;
      }
      Matrix<FMMNode*> src_interac_list(n_in ,mat_cnt); src_interac_list.SetZero();
      Matrix<FMMNode*> trg_interac_list(n_out,mat_cnt); trg_interac_list.SetZero();
      { // Build trg_interac_list
        #pragma omp parallel for
        for(size_t i=0;i<n_out;i++){
          if(!((FMMNode*)nodes_out[i])->IsGhost() && (level==-1 || ((FMMNode*)nodes_out[i])->Depth()==level)){
            Vector<FMMNode*>& lst=((FMMNode*)nodes_out[i])->interac_list[interac_type];
            mem::memcopy(&trg_interac_list[i][0], &lst[0], lst.Dim()*sizeof(FMMNode*));
            assert(lst.Dim()==mat_cnt);
          }
        }
      }
      { // Build src_interac_list
        // Mark every referenced node with the sentinel n_in, then overwrite
        // with real indices for nodes actually present in nodes_in; nodes
        // still holding the sentinel afterwards are not inputs and get NULLed.
        #pragma omp parallel for
        for(size_t i=0;i<n_out;i++){
          for(size_t j=0;j<mat_cnt;j++)
          if(trg_interac_list[i][j]!=NULL){
            trg_interac_list[i][j]->node_id=n_in;
          }
        }
        #pragma omp parallel for
        for(size_t i=0;i<n_in ;i++) ((FMMNode*)nodes_in [i])->node_id=i;
        #pragma omp parallel for
        for(size_t i=0;i<n_out;i++){
          for(size_t j=0;j<mat_cnt;j++){
            if(trg_interac_list[i][j]!=NULL){
              if(trg_interac_list[i][j]->node_id==n_in){
                trg_interac_list[i][j]=NULL;
              }else{
                src_interac_list[trg_interac_list[i][j]->node_id][j]=(FMMNode*)nodes_out[i];
              }
            }
          }
        }
      }
      Matrix<size_t> interac_dsp(n_out,mat_cnt);
      std::vector<size_t> interac_blk_dsp(1,0);
      { // Determine dof, M_dim0, M_dim1
        dof=1;
        Matrix<Real_t>& M0 = this->interac_list.ClassMat(level, interac_type_lst[0], 0);
        M_dim0=M0.Dim(0); M_dim1=M0.Dim(1);
      }
      { // Determine interaction blocks which fit in memory.
        size_t vec_size=(M_dim0+M_dim1)*sizeof(Real_t)*dof;
        for(size_t j=0;j<mat_cnt;j++){// Determine minimum buff_size
          size_t vec_cnt=0;
          for(size_t i=0;i<n_out;i++){
            if(trg_interac_list[i][j]!=NULL) vec_cnt++;
          }
          // Grow buff_size so at least one matrix index fits in a block.
          if(buff_size<vec_cnt*vec_size)
            buff_size=vec_cnt*vec_size;
        }
        size_t interac_dsp_=0;
        for(size_t j=0;j<mat_cnt;j++){
          for(size_t i=0;i<n_out;i++){
            interac_dsp[i][j]=interac_dsp_;
            if(trg_interac_list[i][j]!=NULL) interac_dsp_++;
          }
          if(interac_dsp_*vec_size>buff_size) // Comment to disable symmetries.
          {
            // Close the current block at index j and rebase displacements so
            // the next block starts at offset zero in the scratch buffer.
            interac_blk.push_back(j-interac_blk_dsp.back());
            interac_blk_dsp.push_back(j);
            size_t offset=interac_dsp[0][j];
            for(size_t i=0;i<n_out;i++) interac_dsp[i][j]-=offset;
            interac_dsp_-=offset;
            assert(interac_dsp_*vec_size<=buff_size); // Problem too big for buff_size.
          }
          interac_mat.push_back(precomp_data_offset[this->interac_list.InteracClass(interac_type,j)][0]);
          interac_cnt.push_back(interac_dsp_-interac_dsp[0][j]);
        }
        interac_blk.push_back(mat_cnt-interac_blk_dsp.back());
        interac_blk_dsp.push_back(mat_cnt);
      }
      { // Determine input_perm.
        size_t vec_size=M_dim0*dof;
        for(size_t i=0;i<n_out;i++) ((FMMNode*)nodes_out[i])->node_id=i;
        for(size_t k=1;k<interac_blk_dsp.size();k++){
          for(size_t i=0;i<n_in ;i++){
            for(size_t j=interac_blk_dsp[k-1];j<interac_blk_dsp[k];j++){
              FMMNode_t* trg_node=src_interac_list[i][j];
              if(trg_node!=NULL && trg_node->node_id<n_out){
                // Scale-invariant kernels use one permutation per depth;
                // otherwise only the depth-0 entry applies.
                size_t depth=(this->ScaleInvar()?trg_node->Depth():0);
                input_perm .push_back(precomp_data_offset[j][1+4*depth+0]); // prem
                input_perm .push_back(precomp_data_offset[j][1+4*depth+1]); // scal
                input_perm .push_back(interac_dsp[trg_node->node_id][j]*vec_size*sizeof(Real_t)); // trg_ptr
                input_perm .push_back((size_t)(& input_vector[i][0][0]- input_data[0])); // src_ptr
                assert(input_vector[i]->Dim()==vec_size);
              }
            }
          }
        }
      }
      { // Determine output_perm
        size_t vec_size=M_dim1*dof;
        for(size_t k=1;k<interac_blk_dsp.size();k++){
          for(size_t i=0;i<n_out;i++){
            for(size_t j=interac_blk_dsp[k-1];j<interac_blk_dsp[k];j++){
              if(trg_interac_list[i][j]!=NULL){
                size_t depth=(this->ScaleInvar()?((FMMNode*)nodes_out[i])->Depth():0);
                output_perm.push_back(precomp_data_offset[j][1+4*depth+2]); // prem
                output_perm.push_back(precomp_data_offset[j][1+4*depth+3]); // scal
                output_perm.push_back(interac_dsp[ i ][j]*vec_size*sizeof(Real_t)); // src_ptr
                output_perm.push_back((size_t)(&output_vector[i][0][0]-output_data[0])); // trg_ptr
                assert(output_vector[i]->Dim()==vec_size);
              }
            }
          }
        }
      }
    }
    // Make sure the scratch buffers can hold the largest block.
    if(this->dev_buffer.Dim()<buff_size) this->dev_buffer.ReInit(buff_size);
    if(this->cpu_buffer.Dim()<buff_size) this->cpu_buffer.ReInit(buff_size);
    { // Set interac_data.
      // Serialized layout (decoded in the same order by EvalList):
      // [existing-data size][data_size][M_dim0][M_dim1][dof]
      // then each vector as [count][count elements].
      size_t data_size=sizeof(size_t)*4;
      data_size+=sizeof(size_t)+interac_blk.size()*sizeof(size_t);
      data_size+=sizeof(size_t)+interac_cnt.size()*sizeof(size_t);
      data_size+=sizeof(size_t)+interac_mat.size()*sizeof(size_t);
      data_size+=sizeof(size_t)+ input_perm.size()*sizeof(size_t);
      data_size+=sizeof(size_t)+output_perm.size()*sizeof(size_t);
      if(interac_data.Dim(0)*interac_data.Dim(1)<sizeof(size_t)){
        data_size+=sizeof(size_t);
        interac_data.ReInit(1,data_size);
        ((size_t*)&interac_data[0][0])[0]=sizeof(size_t);
      }else{
        // Append after any data already stored in interac_data.
        size_t pts_data_size=*((size_t*)&interac_data[0][0]);
        assert(interac_data.Dim(0)*interac_data.Dim(1)>=pts_data_size);
        data_size+=pts_data_size;
        if(data_size>interac_data.Dim(0)*interac_data.Dim(1)){ //Resize and copy interac_data.
          Matrix< char> pts_interac_data=interac_data;
          interac_data.ReInit(1,data_size);
          mem::memcopy(&interac_data[0][0],&pts_interac_data[0][0],pts_data_size);
        }
      }
      char* data_ptr=&interac_data[0][0];
      data_ptr+=((size_t*)data_ptr)[0]; // Skip past pre-existing data.
      ((size_t*)data_ptr)[0]=data_size; data_ptr+=sizeof(size_t);
      ((size_t*)data_ptr)[0]= M_dim0; data_ptr+=sizeof(size_t);
      ((size_t*)data_ptr)[0]= M_dim1; data_ptr+=sizeof(size_t);
      ((size_t*)data_ptr)[0]= dof; data_ptr+=sizeof(size_t);
      ((size_t*)data_ptr)[0]=interac_blk.size(); data_ptr+=sizeof(size_t);
      mem::memcopy(data_ptr, &interac_blk[0], interac_blk.size()*sizeof(size_t));
      data_ptr+=interac_blk.size()*sizeof(size_t);
      ((size_t*)data_ptr)[0]=interac_cnt.size(); data_ptr+=sizeof(size_t);
      mem::memcopy(data_ptr, &interac_cnt[0], interac_cnt.size()*sizeof(size_t));
      data_ptr+=interac_cnt.size()*sizeof(size_t);
      ((size_t*)data_ptr)[0]=interac_mat.size(); data_ptr+=sizeof(size_t);
      mem::memcopy(data_ptr, &interac_mat[0], interac_mat.size()*sizeof(size_t));
      data_ptr+=interac_mat.size()*sizeof(size_t);
      ((size_t*)data_ptr)[0]= input_perm.size(); data_ptr+=sizeof(size_t);
      mem::memcopy(data_ptr, & input_perm[0], input_perm.size()*sizeof(size_t));
      data_ptr+= input_perm.size()*sizeof(size_t);
      ((size_t*)data_ptr)[0]=output_perm.size(); data_ptr+=sizeof(size_t);
      mem::memcopy(data_ptr, &output_perm[0], output_perm.size()*sizeof(size_t));
      data_ptr+=output_perm.size()*sizeof(size_t);
    }
  }
  Profile::Toc();
  if(device){ // Host2Device
    Profile::Tic("Host2Device",&this->comm,false,25);
    setup_data.interac_data .AllocDevice(true);
    Profile::Toc();
  }
}
  1448. #if defined(PVFMM_HAVE_CUDA)
  1449. #include <fmm_pts_gpu.hpp>
  1450. template <class Real_t, int SYNC>
  1451. void EvalListGPU(SetupData<Real_t>& setup_data, Vector<char>& dev_buffer, MPI_Comm& comm) {
  1452. cudaStream_t* stream = pvfmm::CUDA_Lock::acquire_stream();
  1453. Profile::Tic("Host2Device",&comm,false,25);
  1454. typename Matrix<char>::Device interac_data;
  1455. typename Vector<char>::Device buff;
  1456. typename Matrix<char>::Device precomp_data_d;
  1457. typename Matrix<char>::Device interac_data_d;
  1458. typename Matrix<Real_t>::Device input_data_d;
  1459. typename Matrix<Real_t>::Device output_data_d;
  1460. interac_data = setup_data.interac_data;
  1461. buff = dev_buffer. AllocDevice(false);
  1462. precomp_data_d= setup_data.precomp_data->AllocDevice(false);
  1463. interac_data_d= setup_data.interac_data. AllocDevice(false);
  1464. input_data_d = setup_data. input_data->AllocDevice(false);
  1465. output_data_d = setup_data. output_data->AllocDevice(false);
  1466. Profile::Toc();
  1467. Profile::Tic("DeviceComp",&comm,false,20);
  1468. { // Offloaded computation.
  1469. size_t data_size, M_dim0, M_dim1, dof;
  1470. Vector<size_t> interac_blk;
  1471. Vector<size_t> interac_cnt;
  1472. Vector<size_t> interac_mat;
  1473. Vector<size_t> input_perm_d;
  1474. Vector<size_t> output_perm_d;
  1475. { // Set interac_data.
  1476. char* data_ptr=&interac_data [0][0];
  1477. char* dev_ptr=&interac_data_d[0][0];
  1478. data_size=((size_t*)data_ptr)[0]; data_ptr+=data_size; dev_ptr += data_size;
  1479. data_size=((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t); dev_ptr += sizeof(size_t);
  1480. M_dim0 =((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t); dev_ptr += sizeof(size_t);
  1481. M_dim1 =((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t); dev_ptr += sizeof(size_t);
  1482. dof =((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t); dev_ptr += sizeof(size_t);
  1483. interac_blk.ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false);
  1484. data_ptr += sizeof(size_t) + sizeof(size_t)*interac_blk.Dim();
  1485. dev_ptr += sizeof(size_t) + sizeof(size_t)*interac_blk.Dim();
  1486. interac_cnt.ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false);
  1487. data_ptr += sizeof(size_t) + sizeof(size_t)*interac_cnt.Dim();
  1488. dev_ptr += sizeof(size_t) + sizeof(size_t)*interac_cnt.Dim();
  1489. interac_mat.ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false);
  1490. data_ptr += sizeof(size_t) + sizeof(size_t)*interac_mat.Dim();
  1491. dev_ptr += sizeof(size_t) + sizeof(size_t)*interac_mat.Dim();
  1492. input_perm_d.ReInit(((size_t*)data_ptr)[0],(size_t*)(dev_ptr+sizeof(size_t)),false);
  1493. data_ptr += sizeof(size_t) + sizeof(size_t)*input_perm_d.Dim();
  1494. dev_ptr += sizeof(size_t) + sizeof(size_t)*input_perm_d.Dim();
  1495. output_perm_d.ReInit(((size_t*)data_ptr)[0],(size_t*)(dev_ptr+sizeof(size_t)),false);
  1496. data_ptr += sizeof(size_t) + sizeof(size_t)*output_perm_d.Dim();
  1497. dev_ptr += sizeof(size_t) + sizeof(size_t)*output_perm_d.Dim();
  1498. }
  1499. { // interactions
  1500. size_t interac_indx = 0;
  1501. size_t interac_blk_dsp = 0;
  1502. cudaError_t error;
  1503. for (size_t k = 0; k < interac_blk.Dim(); k++) {
  1504. size_t vec_cnt=0;
  1505. for(size_t j=interac_blk_dsp;j<interac_blk_dsp+interac_blk[k];j++) vec_cnt+=interac_cnt[j];
  1506. if(vec_cnt==0){
  1507. //interac_indx += vec_cnt;
  1508. interac_blk_dsp += interac_blk[k];
  1509. continue;
  1510. }
  1511. char *buff_in_d =&buff[0];
  1512. char *buff_out_d =&buff[vec_cnt*dof*M_dim0*sizeof(Real_t)];
  1513. { // Input permutation.
  1514. in_perm_gpu<Real_t>(&precomp_data_d[0][0], &input_data_d[0][0], buff_in_d,
  1515. &input_perm_d[interac_indx*4], vec_cnt, M_dim0, stream);
  1516. }
  1517. size_t vec_cnt0 = 0;
  1518. for (size_t j = interac_blk_dsp; j < interac_blk_dsp + interac_blk[k];) {
  1519. size_t vec_cnt1 = 0;
  1520. size_t interac_mat0 = interac_mat[j];
  1521. for (; j < interac_blk_dsp + interac_blk[k] && interac_mat[j] == interac_mat0; j++) vec_cnt1 += interac_cnt[j];
  1522. Matrix<Real_t> M_d(M_dim0, M_dim1, (Real_t*)(precomp_data_d.dev_ptr + interac_mat0), false);
  1523. Matrix<Real_t> Ms_d(dof*vec_cnt1, M_dim0, (Real_t*)(buff_in_d + M_dim0*vec_cnt0*dof*sizeof(Real_t)), false);
  1524. Matrix<Real_t> Mt_d(dof*vec_cnt1, M_dim1, (Real_t*)(buff_out_d + M_dim1*vec_cnt0*dof*sizeof(Real_t)), false);
  1525. Matrix<Real_t>::CUBLASGEMM(Mt_d, Ms_d, M_d);
  1526. vec_cnt0 += vec_cnt1;
  1527. }
  1528. { // Output permutation.
  1529. out_perm_gpu<Real_t>(&precomp_data_d[0][0], &output_data_d[0][0], buff_out_d,
  1530. &output_perm_d[interac_indx*4], vec_cnt, M_dim1, stream);
  1531. }
  1532. interac_indx += vec_cnt;
  1533. interac_blk_dsp += interac_blk[k];
  1534. }
  1535. }
  1536. }
  1537. Profile::Toc();
  1538. if(SYNC) CUDA_Lock::wait();
  1539. }
  1540. #endif
// Evaluate a precomputed interaction plan on the host (or via Intel MIC
// offload when `device` is set): decodes the flat byte array written by
// SetupInterac, then for each interaction block performs an input
// permutation/scaling gather, one GEMM per run of entries sharing a matrix,
// and a scatter-accumulate output permutation.
template <class FMMNode>
template <int SYNC>
void FMM_Pts<FMMNode>::EvalList(SetupData<Real_t>& setup_data, bool device){
  if(setup_data.interac_data.Dim(0)==0 || setup_data.interac_data.Dim(1)==0){
    // Nothing to evaluate; emit empty profile sections to keep timing
    // output balanced with the non-empty path.
    Profile::Tic("Host2Device",&this->comm,false,25);
    Profile::Toc();
    Profile::Tic("DeviceComp",&this->comm,false,20);
    Profile::Toc();
    return;
  }
  #if defined(PVFMM_HAVE_CUDA)
  if (device) {
    EvalListGPU<Real_t, SYNC>(setup_data, this->dev_buffer, this->comm);
    return;
  }
  #endif
  Profile::Tic("Host2Device",&this->comm,false,25);
  typename Vector<char>::Device buff;
  typename Matrix<char>::Device precomp_data;
  typename Matrix<char>::Device interac_data;
  typename Matrix<Real_t>::Device input_data;
  typename Matrix<Real_t>::Device output_data;
  if(device){ // Use device-side mirrors of all buffers.
    buff = this-> dev_buffer. AllocDevice(false);
    precomp_data= setup_data.precomp_data->AllocDevice(false);
    interac_data= setup_data.interac_data. AllocDevice(false);
    input_data = setup_data. input_data->AllocDevice(false);
    output_data = setup_data. output_data->AllocDevice(false);
  }else{ // Wrap the host buffers directly.
    buff = this-> cpu_buffer;
    precomp_data=*setup_data.precomp_data;
    interac_data= setup_data.interac_data;
    input_data =*setup_data. input_data;
    output_data =*setup_data. output_data;
  }
  Profile::Toc();
  Profile::Tic("DeviceComp",&this->comm,false,20);
  // MIC lock protocol: wait for the previously issued offload to finish
  // before starting, and signal our own completion afterwards.
  int lock_idx=-1;
  int wait_lock_idx=-1;
  if(device) wait_lock_idx=MIC_Lock::curr_lock();
  if(device) lock_idx=MIC_Lock::get_lock();
  #ifdef __INTEL_OFFLOAD
  #pragma offload if(device) target(mic:0) signal(&MIC_Lock::lock_vec[device?lock_idx:0])
  #endif
  { // Offloaded computation.
    // Set interac_data.
    size_t data_size, M_dim0, M_dim1, dof;
    Vector<size_t> interac_blk;
    Vector<size_t> interac_cnt;
    Vector<size_t> interac_mat;
    Vector<size_t> input_perm;
    Vector<size_t> output_perm;
    { // Set interac_data.
      // Decode fields in exactly the order SetupInterac serialized them;
      // vectors alias the byte buffer (no copy).
      char* data_ptr=&interac_data[0][0];
      data_size=((size_t*)data_ptr)[0]; data_ptr+=data_size;
      data_size=((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t);
      M_dim0 =((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t);
      M_dim1 =((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t);
      dof =((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t);
      interac_blk.ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false);
      data_ptr+=sizeof(size_t)+interac_blk.Dim()*sizeof(size_t);
      interac_cnt.ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false);
      data_ptr+=sizeof(size_t)+interac_cnt.Dim()*sizeof(size_t);
      interac_mat.ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false);
      data_ptr+=sizeof(size_t)+interac_mat.Dim()*sizeof(size_t);
      input_perm .ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false);
      data_ptr+=sizeof(size_t)+ input_perm.Dim()*sizeof(size_t);
      output_perm.ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false);
      data_ptr+=sizeof(size_t)+output_perm.Dim()*sizeof(size_t);
    }
    if(device) MIC_Lock::wait_lock(wait_lock_idx);
    //Compute interaction from Chebyshev source density.
    { // interactions
      int omp_p=omp_get_max_threads();
      size_t interac_indx=0;
      size_t interac_blk_dsp=0;
      for(size_t k=0;k<interac_blk.Dim();k++){
        size_t vec_cnt=0;
        for(size_t j=interac_blk_dsp;j<interac_blk_dsp+interac_blk[k];j++) vec_cnt+=interac_cnt[j];
        if(vec_cnt==0){
          //interac_indx += vec_cnt;
          interac_blk_dsp += interac_blk[k];
          continue;
        }
        // Scratch buffer layout: input vectors first, output vectors after.
        char* buff_in =&buff[0];
        char* buff_out=&buff[vec_cnt*dof*M_dim0*sizeof(Real_t)];
        // Input permutation.
        // Each interaction record is 4 size_t's: perm offset, scal offset,
        // destination offset in buff_in, source offset in input_data.
        #pragma omp parallel for
        for(int tid=0;tid<omp_p;tid++){
          size_t a=( tid *vec_cnt)/omp_p;
          size_t b=((tid+1)*vec_cnt)/omp_p;
          for(size_t i=a;i<b;i++){
            const PERM_INT_T* perm=(PERM_INT_T*)(precomp_data[0]+input_perm[(interac_indx+i)*4+0]);
            const Real_t* scal=( Real_t*)(precomp_data[0]+input_perm[(interac_indx+i)*4+1]);
            const Real_t* v_in =( Real_t*)( input_data[0]+input_perm[(interac_indx+i)*4+3]);
            Real_t* v_out=( Real_t*)( buff_in +input_perm[(interac_indx+i)*4+2]);
            // TODO: Fix for dof>1
            #ifdef __MIC__
            { // Vectorized gather: scalar head/tail around an aligned
              // 8-wide middle section using non-temporal stores.
              __m512d v8;
              size_t j_start=(((uintptr_t)(v_out ) + (uintptr_t)(MEM_ALIGN-1)) & ~ (uintptr_t)(MEM_ALIGN-1))-((uintptr_t)v_out);
              size_t j_end =(((uintptr_t)(v_out+M_dim0) ) & ~ (uintptr_t)(MEM_ALIGN-1))-((uintptr_t)v_out);
              j_start/=sizeof(Real_t);
              j_end /=sizeof(Real_t);
              assert(((uintptr_t)(v_out))%sizeof(Real_t)==0);
              assert(((uintptr_t)(v_out+j_start))%64==0);
              assert(((uintptr_t)(v_out+j_end ))%64==0);
              size_t j=0;
              for(;j<j_start;j++ ){
                v_out[j]=v_in[perm[j]]*scal[j];
              }
              for(;j<j_end ;j+=8){
                v8=_mm512_setr_pd(
                    v_in[perm[j+0]]*scal[j+0],
                    v_in[perm[j+1]]*scal[j+1],
                    v_in[perm[j+2]]*scal[j+2],
                    v_in[perm[j+3]]*scal[j+3],
                    v_in[perm[j+4]]*scal[j+4],
                    v_in[perm[j+5]]*scal[j+5],
                    v_in[perm[j+6]]*scal[j+6],
                    v_in[perm[j+7]]*scal[j+7]);
                _mm512_storenrngo_pd(v_out+j,v8);
              }
              for(;j<M_dim0 ;j++ ){
                v_out[j]=v_in[perm[j]]*scal[j];
              }
            }
            #else
            for(size_t j=0;j<M_dim0;j++ ){
              v_out[j]=v_in[perm[j]]*scal[j];
            }
            #endif
          }
        }
        // One GEMM per run of consecutive entries sharing the same matrix.
        size_t vec_cnt0=0;
        for(size_t j=interac_blk_dsp;j<interac_blk_dsp+interac_blk[k];){
          size_t vec_cnt1=0;
          size_t interac_mat0=interac_mat[j];
          for(;j<interac_blk_dsp+interac_blk[k] && interac_mat[j]==interac_mat0;j++) vec_cnt1+=interac_cnt[j];
          Matrix<Real_t> M(M_dim0, M_dim1, (Real_t*)(precomp_data[0]+interac_mat0), false);
          #ifdef __MIC__
          { // MIC GEMM is already parallel internally.
            Matrix<Real_t> Ms(dof*vec_cnt1, M_dim0, (Real_t*)(buff_in +M_dim0*vec_cnt0*dof*sizeof(Real_t)), false);
            Matrix<Real_t> Mt(dof*vec_cnt1, M_dim1, (Real_t*)(buff_out+M_dim1*vec_cnt0*dof*sizeof(Real_t)), false);
            Matrix<Real_t>::GEMM(Mt,Ms,M);
          }
          #else
          // Split the row range across threads; each does a sub-GEMM.
          #pragma omp parallel for
          for(int tid=0;tid<omp_p;tid++){
            size_t a=(dof*vec_cnt1*(tid ))/omp_p;
            size_t b=(dof*vec_cnt1*(tid+1))/omp_p;
            Matrix<Real_t> Ms(b-a, M_dim0, (Real_t*)(buff_in +M_dim0*vec_cnt0*dof*sizeof(Real_t))+M_dim0*a, false);
            Matrix<Real_t> Mt(b-a, M_dim1, (Real_t*)(buff_out+M_dim1*vec_cnt0*dof*sizeof(Real_t))+M_dim1*a, false);
            Matrix<Real_t>::GEMM(Mt,Ms,M);
          }
          #endif
          vec_cnt0+=vec_cnt1;
        }
        // Output permutation.
        // Accumulates (+=) into output_data; thread range boundaries are
        // shifted so records writing to the same target stay on one thread,
        // avoiding write conflicts without locks.
        #pragma omp parallel for
        for(int tid=0;tid<omp_p;tid++){
          size_t a=( tid *vec_cnt)/omp_p;
          size_t b=((tid+1)*vec_cnt)/omp_p;
          if(tid> 0 && a<vec_cnt){ // Find 'a' independent of other threads.
            size_t out_ptr=output_perm[(interac_indx+a)*4+3];
            if(tid> 0) while(a<vec_cnt && out_ptr==output_perm[(interac_indx+a)*4+3]) a++;
          }
          if(tid<omp_p-1 && b<vec_cnt){ // Find 'b' independent of other threads.
            size_t out_ptr=output_perm[(interac_indx+b)*4+3];
            if(tid<omp_p-1) while(b<vec_cnt && out_ptr==output_perm[(interac_indx+b)*4+3]) b++;
          }
          for(size_t i=a;i<b;i++){ // Compute permutations.
            const PERM_INT_T* perm=(PERM_INT_T*)(precomp_data[0]+output_perm[(interac_indx+i)*4+0]);
            const Real_t* scal=( Real_t*)(precomp_data[0]+output_perm[(interac_indx+i)*4+1]);
            const Real_t* v_in =( Real_t*)( buff_out +output_perm[(interac_indx+i)*4+2]);
            Real_t* v_out=( Real_t*)( output_data[0]+output_perm[(interac_indx+i)*4+3]);
            // TODO: Fix for dof>1
            #ifdef __MIC__
            { // Vectorized scatter-accumulate, same alignment scheme as the
              // input permutation but with load-add-store.
              __m512d v8;
              __m512d v_old;
              size_t j_start=(((uintptr_t)(v_out ) + (uintptr_t)(MEM_ALIGN-1)) & ~ (uintptr_t)(MEM_ALIGN-1))-((uintptr_t)v_out);
              size_t j_end =(((uintptr_t)(v_out+M_dim1) ) & ~ (uintptr_t)(MEM_ALIGN-1))-((uintptr_t)v_out);
              j_start/=sizeof(Real_t);
              j_end /=sizeof(Real_t);
              assert(((uintptr_t)(v_out))%sizeof(Real_t)==0);
              assert(((uintptr_t)(v_out+j_start))%64==0);
              assert(((uintptr_t)(v_out+j_end ))%64==0);
              size_t j=0;
              for(;j<j_start;j++ ){
                v_out[j]+=v_in[perm[j]]*scal[j];
              }
              for(;j<j_end ;j+=8){
                v_old=_mm512_load_pd(v_out+j);
                v8=_mm512_setr_pd(
                    v_in[perm[j+0]]*scal[j+0],
                    v_in[perm[j+1]]*scal[j+1],
                    v_in[perm[j+2]]*scal[j+2],
                    v_in[perm[j+3]]*scal[j+3],
                    v_in[perm[j+4]]*scal[j+4],
                    v_in[perm[j+5]]*scal[j+5],
                    v_in[perm[j+6]]*scal[j+6],
                    v_in[perm[j+7]]*scal[j+7]);
                v_old=_mm512_add_pd(v_old, v8);
                _mm512_storenrngo_pd(v_out+j,v_old);
              }
              for(;j<M_dim1 ;j++ ){
                v_out[j]+=v_in[perm[j]]*scal[j];
              }
            }
            #else
            for(size_t j=0;j<M_dim1;j++ ){
              v_out[j]+=v_in[perm[j]]*scal[j];
            }
            #endif
          }
        }
        interac_indx+=vec_cnt;
        interac_blk_dsp+=interac_blk[k];
      }
    }
    if(device) MIC_Lock::release_lock(lock_idx);
  }
  #ifdef __INTEL_OFFLOAD
  if(SYNC){ // Optionally block until the offloaded work has completed.
    #pragma offload if(device) target(mic:0)
    {if(device) MIC_Lock::wait_lock(lock_idx);}
  }
  #endif
  Profile::Toc();
}
// Build the point-interaction setup for S2U (source points -> upward
// equivalent densities): select the participating nodes for this level,
// record where each node's point data lives inside the shared matrix
// buffers, and assemble the flattened per-interaction lists that
// EvalListPts consumes after PtSetup packages them.
template <class FMMNode>
void FMM_Pts<FMMNode>::Source2UpSetup(SetupData<Real_t>& setup_data, FMMTree_t* tree, std::vector<Matrix<Real_t> >& buff, std::vector<Vector<FMMNode_t*> >& n_list, int level, bool device){
if(!this->MultipoleOrder()) return; // multipole order 0 => far-field expansion disabled
{ // Set setup_data
setup_data. level=level;
setup_data.kernel=kernel->k_s2m;
setup_data. input_data=&buff[4]; // source densities buffer
setup_data.output_data=&buff[0]; // upward equivalent densities buffer
setup_data. coord_data=&buff[6]; // point coordinates buffer
Vector<FMMNode_t*>& nodes_in =n_list[4];
Vector<FMMNode_t*>& nodes_out=n_list[0];
setup_data.nodes_in .clear();
setup_data.nodes_out.clear();
// Only non-ghost leaves carrying source points participate; level==-1 selects all levels.
for(size_t i=0;i<nodes_in .Dim();i++) if((nodes_in [i]->Depth()==level || level==-1) && nodes_in [i]->pt_cnt[0] && nodes_in [i]->IsLeaf() && !nodes_in [i]->IsGhost()) setup_data.nodes_in .push_back(nodes_in [i]);
for(size_t i=0;i<nodes_out.Dim();i++) if((nodes_out[i]->Depth()==level || level==-1) && nodes_out[i]->pt_cnt[0] && nodes_out[i]->IsLeaf() && !nodes_out[i]->IsGhost()) setup_data.nodes_out.push_back(nodes_out[i]);
}
// View of one logical array packed inside a shared Matrix buffer:
// per-node element counts (cnt) and offsets (dsp) relative to ptr[0][0].
struct PackedData{
size_t len;
Matrix<Real_t>* ptr;
Vector<size_t> cnt;
Vector<size_t> dsp;
};
// Flattened interaction lists plus per-depth scaling vectors and the
// check-to-equivalent operator matrices (M[2], M[3] below).
struct InteracData{
Vector<size_t> in_node;
Vector<size_t> scal_idx;
Vector<Real_t> coord_shift;
Vector<size_t> interac_cnt;
Vector<size_t> interac_dsp;
Vector<Real_t> scal[4*MAX_DEPTH];
Matrix<Real_t> M[4];
};
// Everything PtSetup needs to drive the point-kernel evaluation.
struct ptSetupData{
int level;
const Kernel<Real_t>* kernel;
PackedData src_coord; // Src coord
PackedData src_value; // Src density
PackedData srf_coord; // Srf coord
PackedData srf_value; // Srf density
PackedData trg_coord; // Trg coord
PackedData trg_value; // Trg potential
InteracData interac_data;
};
ptSetupData data;
data. level=setup_data. level;
data.kernel=setup_data.kernel;
std::vector<void*>& nodes_in =setup_data.nodes_in ;
std::vector<void*>& nodes_out=setup_data.nodes_out;
{ // Set src data: record offsets of each input node's source points/densities.
std::vector<void*>& nodes=nodes_in;
PackedData& coord=data.src_coord;
PackedData& value=data.src_value;
coord.ptr=setup_data. coord_data;
value.ptr=setup_data. input_data;
coord.len=coord.ptr->Dim(0)*coord.ptr->Dim(1);
value.len=value.ptr->Dim(0)*value.ptr->Dim(1);
coord.cnt.ReInit(nodes.size());
coord.dsp.ReInit(nodes.size());
value.cnt.ReInit(nodes.size());
value.dsp.ReInit(nodes.size());
#pragma omp parallel for
for(size_t i=0;i<nodes.size();i++){
((FMMNode_t*)nodes[i])->node_id=i; // used below to validate interaction-list entries
Vector<Real_t>& coord_vec=((FMMNode*)nodes[i])->src_coord;
Vector<Real_t>& value_vec=((FMMNode*)nodes[i])->src_value;
if(coord_vec.Dim()){
coord.dsp[i]=&coord_vec[0]-coord.ptr[0][0]; // offset inside the shared buffer
assert(coord.dsp[i]<coord.len);
coord.cnt[i]=coord_vec.Dim();
}else{
coord.dsp[i]=0;
coord.cnt[i]=0;
}
if(value_vec.Dim()){
value.dsp[i]=&value_vec[0]-value.ptr[0][0];
assert(value.dsp[i]<value.len);
value.cnt[i]=value_vec.Dim();
}else{
value.dsp[i]=0;
value.cnt[i]=0;
}
}
}
{ // Set srf data: same bookkeeping for surface points/densities.
std::vector<void*>& nodes=nodes_in;
PackedData& coord=data.srf_coord;
PackedData& value=data.srf_value;
coord.ptr=setup_data. coord_data;
value.ptr=setup_data. input_data;
coord.len=coord.ptr->Dim(0)*coord.ptr->Dim(1);
value.len=value.ptr->Dim(0)*value.ptr->Dim(1);
coord.cnt.ReInit(nodes.size());
coord.dsp.ReInit(nodes.size());
value.cnt.ReInit(nodes.size());
value.dsp.ReInit(nodes.size());
#pragma omp parallel for
for(size_t i=0;i<nodes.size();i++){
Vector<Real_t>& coord_vec=((FMMNode*)nodes[i])->surf_coord;
Vector<Real_t>& value_vec=((FMMNode*)nodes[i])->surf_value;
if(coord_vec.Dim()){
coord.dsp[i]=&coord_vec[0]-coord.ptr[0][0];
assert(coord.dsp[i]<coord.len);
coord.cnt[i]=coord_vec.Dim();
}else{
coord.dsp[i]=0;
coord.cnt[i]=0;
}
if(value_vec.Dim()){
value.dsp[i]=&value_vec[0]-value.ptr[0][0];
assert(value.dsp[i]<value.len);
value.cnt[i]=value_vec.Dim();
}else{
value.dsp[i]=0;
value.cnt[i]=0;
}
}
}
{ // Set trg data: targets are the upward-check surface points; output is upward_equiv.
std::vector<void*>& nodes=nodes_out;
PackedData& coord=data.trg_coord;
PackedData& value=data.trg_value;
coord.ptr=setup_data. coord_data;
value.ptr=setup_data.output_data;
coord.len=coord.ptr->Dim(0)*coord.ptr->Dim(1);
value.len=value.ptr->Dim(0)*value.ptr->Dim(1);
coord.cnt.ReInit(nodes.size());
coord.dsp.ReInit(nodes.size());
value.cnt.ReInit(nodes.size());
value.dsp.ReInit(nodes.size());
#pragma omp parallel for
for(size_t i=0;i<nodes.size();i++){
// Check surface is shared per depth; equivalent density is per node.
Vector<Real_t>& coord_vec=tree->upwd_check_surf[((FMMNode*)nodes[i])->Depth()];
Vector<Real_t>& value_vec=((FMMData*)((FMMNode*)nodes[i])->FMMData())->upward_equiv;
if(coord_vec.Dim()){
coord.dsp[i]=&coord_vec[0]-coord.ptr[0][0];
assert(coord.dsp[i]<coord.len);
coord.cnt[i]=coord_vec.Dim();
}else{
coord.dsp[i]=0;
coord.cnt[i]=0;
}
if(value_vec.Dim()){
value.dsp[i]=&value_vec[0]-value.ptr[0][0];
assert(value.dsp[i]<value.len);
value.cnt[i]=value_vec.Dim();
}else{
value.dsp[i]=0;
value.cnt[i]=0;
}
}
}
{ // Set interac_data: per-thread interaction lists, then merged below.
int omp_p=omp_get_max_threads();
std::vector<std::vector<size_t> > in_node_(omp_p);
std::vector<std::vector<size_t> > scal_idx_(omp_p);
std::vector<std::vector<Real_t> > coord_shift_(omp_p);
std::vector<std::vector<size_t> > interac_cnt_(omp_p);
// Check-to-equivalent operators (two factors of the pseudo-inverse).
data.interac_data.M[2]=this->mat->Mat(level, UC2UE0_Type, 0);
data.interac_data.M[3]=this->mat->Mat(level, UC2UE1_Type, 0);
if(this->ScaleInvar()){ // Set scal: per-depth kernel scaling 2^(-exp*l)
const Kernel<Real_t>* ker=kernel->k_m2m;
for(size_t l=0;l<MAX_DEPTH;l++){ // scal[l*4+2]
Vector<Real_t>& scal=data.interac_data.scal[l*4+2];
Vector<Real_t>& scal_exp=ker->trg_scal;
scal.ReInit(scal_exp.Dim());
for(size_t i=0;i<scal.Dim();i++){
scal[i]=std::pow(2.0,-scal_exp[i]*l);
}
}
for(size_t l=0;l<MAX_DEPTH;l++){ // scal[l*4+3]
Vector<Real_t>& scal=data.interac_data.scal[l*4+3];
Vector<Real_t>& scal_exp=ker->src_scal;
scal.ReInit(scal_exp.Dim());
for(size_t i=0;i<scal.Dim();i++){
scal[i]=std::pow(2.0,-scal_exp[i]*l);
}
}
}
#pragma omp parallel for
for(size_t tid=0;tid<omp_p;tid++){
std::vector<size_t>& in_node    =in_node_[tid]    ;
std::vector<size_t>& scal_idx   =scal_idx_[tid]   ;
std::vector<Real_t>& coord_shift=coord_shift_[tid];
std::vector<size_t>& interac_cnt=interac_cnt_[tid];
size_t a=(nodes_out.size()*(tid+0))/omp_p;
size_t b=(nodes_out.size()*(tid+1))/omp_p;
for(size_t i=a;i<b;i++){
FMMNode_t* tnode=(FMMNode_t*)nodes_out[i];
Real_t s=std::pow(0.5,tnode->Depth()); // box size at this depth
size_t interac_cnt_=0; // NOTE: shadows the outer vector-of-vectors of the same name
{ // S2U_Type
Mat_Type type=S2U_Type;
Vector<FMMNode_t*>& intlst=tnode->interac_list[type];
for(size_t j=0;j<intlst.Dim();j++) if(intlst[j]){
FMMNode_t* snode=intlst[j];
size_t snode_id=snode->node_id;
// Skip sources that were filtered out of nodes_in (node_id set in the src section above).
if(snode_id>=nodes_in.size() || nodes_in[snode_id]!=snode) continue;
in_node.push_back(snode_id);
scal_idx.push_back(snode->Depth());
{ // set coord_shift: translate source coords into the target's reference frame
const int* rel_coord=interac_list.RelativeCoord(type,j);
const Real_t* scoord=snode->Coord();
const Real_t* tcoord=tnode->Coord(); // NOTE(review): tcoord is unused here (S2U targets the node's own check surface)
Real_t shift[COORD_DIM];
shift[0]=rel_coord[0]*0.5*s-(scoord[0]+0.5*s)+(0+0.5*s);
shift[1]=rel_coord[1]*0.5*s-(scoord[1]+0.5*s)+(0+0.5*s);
shift[2]=rel_coord[2]*0.5*s-(scoord[2]+0.5*s)+(0+0.5*s);
coord_shift.push_back(shift[0]);
coord_shift.push_back(shift[1]);
coord_shift.push_back(shift[2]);
}
interac_cnt_++;
}
}
interac_cnt.push_back(interac_cnt_);
}
}
{ // Combine interac data: concatenate the per-thread vectors in tid order.
InteracData& interac_data=data.interac_data;
{ // in_node
typedef size_t ElemType;
std::vector<std::vector<ElemType> >& vec_=in_node_;
pvfmm::Vector<ElemType>& vec=interac_data.in_node;
std::vector<size_t> vec_dsp(omp_p+1,0);
for(size_t tid=0;tid<omp_p;tid++){
vec_dsp[tid+1]=vec_dsp[tid]+vec_[tid].size();
}
vec.ReInit(vec_dsp[omp_p]);
// NOTE(review): &vec_[tid][0] indexes a possibly-empty std::vector (UB in
// theory, size-0 memcpy in practice) — consider guarding on vec_[tid].size().
// The same pattern repeats in the three sections below.
#pragma omp parallel for
for(size_t tid=0;tid<omp_p;tid++){
memcpy(&vec[0]+vec_dsp[tid],&vec_[tid][0],vec_[tid].size()*sizeof(ElemType));
}
}
{ // scal_idx
typedef size_t ElemType;
std::vector<std::vector<ElemType> >& vec_=scal_idx_;
pvfmm::Vector<ElemType>& vec=interac_data.scal_idx;
std::vector<size_t> vec_dsp(omp_p+1,0);
for(size_t tid=0;tid<omp_p;tid++){
vec_dsp[tid+1]=vec_dsp[tid]+vec_[tid].size();
}
vec.ReInit(vec_dsp[omp_p]);
#pragma omp parallel for
for(size_t tid=0;tid<omp_p;tid++){
memcpy(&vec[0]+vec_dsp[tid],&vec_[tid][0],vec_[tid].size()*sizeof(ElemType));
}
}
{ // coord_shift
typedef Real_t ElemType;
std::vector<std::vector<ElemType> >& vec_=coord_shift_;
pvfmm::Vector<ElemType>& vec=interac_data.coord_shift;
std::vector<size_t> vec_dsp(omp_p+1,0);
for(size_t tid=0;tid<omp_p;tid++){
vec_dsp[tid+1]=vec_dsp[tid]+vec_[tid].size();
}
vec.ReInit(vec_dsp[omp_p]);
#pragma omp parallel for
for(size_t tid=0;tid<omp_p;tid++){
memcpy(&vec[0]+vec_dsp[tid],&vec_[tid][0],vec_[tid].size()*sizeof(ElemType));
}
}
{ // interac_cnt
typedef size_t ElemType;
std::vector<std::vector<ElemType> >& vec_=interac_cnt_;
pvfmm::Vector<ElemType>& vec=interac_data.interac_cnt;
std::vector<size_t> vec_dsp(omp_p+1,0);
for(size_t tid=0;tid<omp_p;tid++){
vec_dsp[tid+1]=vec_dsp[tid]+vec_[tid].size();
}
vec.ReInit(vec_dsp[omp_p]);
#pragma omp parallel for
for(size_t tid=0;tid<omp_p;tid++){
memcpy(&vec[0]+vec_dsp[tid],&vec_[tid][0],vec_[tid].size()*sizeof(ElemType));
}
}
{ // interac_dsp: exclusive prefix sum of interac_cnt.
pvfmm::Vector<size_t>& cnt=interac_data.interac_cnt;
pvfmm::Vector<size_t>& dsp=interac_data.interac_dsp;
dsp.ReInit(cnt.Dim()); if(dsp.Dim()) dsp[0]=0;
omp_par::scan(&cnt[0],&dsp[0],dsp.Dim());
}
}
}
PtSetup(setup_data, &data);
}
  2056. template <class FMMNode>
  2057. void FMM_Pts<FMMNode>::Source2Up(SetupData<Real_t>& setup_data, bool device){
  2058. if(!this->MultipoleOrder()) return;
  2059. //Add Source2Up contribution.
  2060. this->EvalListPts(setup_data, device);
  2061. }
  2062. template <class FMMNode>
  2063. void FMM_Pts<FMMNode>::Up2UpSetup(SetupData<Real_t>& setup_data, FMMTree_t* tree, std::vector<Matrix<Real_t> >& buff, std::vector<Vector<FMMNode_t*> >& n_list, int level, bool device){
  2064. if(!this->MultipoleOrder()) return;
  2065. { // Set setup_data
  2066. setup_data.level=level;
  2067. setup_data.kernel=kernel->k_m2m;
  2068. setup_data.interac_type.resize(1);
  2069. setup_data.interac_type[0]=U2U_Type;
  2070. setup_data. input_data=&buff[0];
  2071. setup_data.output_data=&buff[0];
  2072. Vector<FMMNode_t*>& nodes_in =n_list[0];
  2073. Vector<FMMNode_t*>& nodes_out=n_list[0];
  2074. setup_data.nodes_in .clear();
  2075. setup_data.nodes_out.clear();
  2076. for(size_t i=0;i<nodes_in .Dim();i++) if((nodes_in [i]->Depth()==level+1) && nodes_in [i]->pt_cnt[0]) setup_data.nodes_in .push_back(nodes_in [i]);
  2077. for(size_t i=0;i<nodes_out.Dim();i++) if((nodes_out[i]->Depth()==level ) && nodes_out[i]->pt_cnt[0]) setup_data.nodes_out.push_back(nodes_out[i]);
  2078. }
  2079. std::vector<void*>& nodes_in =setup_data.nodes_in ;
  2080. std::vector<void*>& nodes_out=setup_data.nodes_out;
  2081. std::vector<Vector<Real_t>*>& input_vector=setup_data. input_vector; input_vector.clear();
  2082. std::vector<Vector<Real_t>*>& output_vector=setup_data.output_vector; output_vector.clear();
  2083. for(size_t i=0;i<nodes_in .size();i++) input_vector.push_back(&((FMMData*)((FMMNode*)nodes_in [i])->FMMData())->upward_equiv);
  2084. for(size_t i=0;i<nodes_out.size();i++) output_vector.push_back(&((FMMData*)((FMMNode*)nodes_out[i])->FMMData())->upward_equiv);
  2085. SetupInterac(setup_data,device);
  2086. }
  2087. template <class FMMNode>
  2088. void FMM_Pts<FMMNode>::Up2Up (SetupData<Real_t>& setup_data, bool device){
  2089. if(!this->MultipoleOrder()) return;
  2090. //Add Up2Up contribution.
  2091. EvalList(setup_data, device);
  2092. }
  2093. template <class FMMNode>
  2094. void FMM_Pts<FMMNode>::PeriodicBC(FMMNode* node){
  2095. if(!this->ScaleInvar() || this->MultipoleOrder()==0) return;
  2096. Matrix<Real_t>& M = Precomp(0, BC_Type, 0);
  2097. assert(node->FMMData()->upward_equiv.Dim()>0);
  2098. int dof=1;
  2099. Vector<Real_t>& upward_equiv=node->FMMData()->upward_equiv;
  2100. Vector<Real_t>& dnward_equiv=node->FMMData()->dnward_equiv;
  2101. assert(upward_equiv.Dim()==M.Dim(0)*dof);
  2102. assert(dnward_equiv.Dim()==M.Dim(1)*dof);
  2103. Matrix<Real_t> d_equiv(dof,M.Dim(0),&dnward_equiv[0],false);
  2104. Matrix<Real_t> u_equiv(dof,M.Dim(1),&upward_equiv[0],false);
  2105. Matrix<Real_t>::GEMM(d_equiv,u_equiv,M);
  2106. }
// Forward FFT of the upward-equivalent densities of each group of 8 sibling
// octants, for the V-list (M2L) Hadamard stage. For every entry of fft_vec:
// read the 8 children's equivalent densities from input_data, scatter them
// (via `map`) onto an n1^3 grid, run a real-to-complex FFT, and write the
// result to output_data transposed so the 8 children are interleaved per
// frequency. fft_scal supplies a per-node, per-component scaling factor.
template <class FMMNode>
void FMM_Pts<FMMNode>::FFT_UpEquiv(size_t dof, size_t m, size_t ker_dim0, Vector<size_t>& fft_vec, Vector<Real_t>& fft_scal,
Vector<Real_t>& input_data, Vector<Real_t>& output_data, Vector<Real_t>& buffer_){
size_t n1=m*2;        // grid points per dimension
size_t n2=n1*n1;
size_t n3=n1*n2;      // real-space grid size
size_t n3_=n2*(n1/2+1); // r2c complex output size (Hermitian symmetry)
size_t chld_cnt=1UL<<COORD_DIM; // 8 children per octant group
size_t fftsize_in =2*n3_*chld_cnt*ker_dim0*dof;
int omp_p=omp_get_max_threads();
//Load permutation map.
size_t n=6*(m-1)*(m-1)+2; // number of points on the equivalent surface
static Vector<size_t> map; // NOTE(review): lazily (re)built; not guarded against concurrent first use
{ // Build map to reorder upward_equiv
size_t n_old=map.Dim();
if(n_old!=n){
Real_t c[3]={0,0,0};
Vector<Real_t> surf=surface(m, c, (Real_t)(m-1), 0);
map.Resize(surf.Dim()/COORD_DIM);
// Linear grid index of each equivalent-surface point inside the n1^3 grid.
for(size_t i=0;i<map.Dim();i++)
map[i]=((size_t)(m-1-surf[i*3]+0.5))+((size_t)(m-1-surf[i*3+1]+0.5))*n1+((size_t)(m-1-surf[i*3+2]+0.5))*n2;
}
}
{ // Build FFTW plan.
if(!vlist_fft_flag){
int nnn[3]={(int)n1,(int)n1,(int)n1};
void *fftw_in, *fftw_out;
// Temporary buffers used only for planning; freed immediately after.
fftw_in  = mem::aligned_new<Real_t>(  n3 *ker_dim0*chld_cnt);
fftw_out = mem::aligned_new<Real_t>(2*n3_*ker_dim0*chld_cnt);
vlist_fftplan = FFTW_t<Real_t>::fft_plan_many_dft_r2c(COORD_DIM,nnn,ker_dim0*chld_cnt,
(Real_t*)fftw_in, NULL, 1, n3, (typename FFTW_t<Real_t>::cplx*)(fftw_out),NULL, 1, n3_);
mem::aligned_delete<Real_t>((Real_t*)fftw_in );
mem::aligned_delete<Real_t>((Real_t*)fftw_out);
vlist_fft_flag=true;
}
}
{ // Offload section
size_t n_in = fft_vec.Dim();
#pragma omp parallel for
for(int pid=0; pid<omp_p; pid++){
size_t node_start=(n_in*(pid  ))/omp_p;
size_t node_end  =(n_in*(pid+1))/omp_p;
Vector<Real_t> buffer(fftsize_in, &buffer_[fftsize_in*pid], false); // per-thread scratch
for(size_t node_idx=node_start; node_idx<node_end; node_idx++){
Matrix<Real_t>  upward_equiv(chld_cnt,n*ker_dim0*dof,&input_data[0] + fft_vec[node_idx],false);
Vector<Real_t> upward_equiv_fft(fftsize_in, &output_data[fftsize_in *node_idx], false);
upward_equiv_fft.SetZero();
// Rearrange upward equivalent data.
for(size_t k=0;k<n;k++){
size_t idx=map[k]; // grid position of surface point k
for(int j1=0;j1<dof;j1++)
for(int j0=0;j0<(int)chld_cnt;j0++)
for(int i=0;i<ker_dim0;i++)
upward_equiv_fft[idx+(j0+(i+j1*ker_dim0)*chld_cnt)*n3]=upward_equiv[j0][ker_dim0*(n*j1+k)+i]*fft_scal[ker_dim0*node_idx+i];
}
// Compute FFT.
for(int i=0;i<dof;i++)
FFTW_t<Real_t>::fft_execute_dft_r2c(vlist_fftplan, (Real_t*)&upward_equiv_fft[i*  n3 *ker_dim0*chld_cnt],
(typename FFTW_t<Real_t>::cplx*)&buffer          [i*2*n3_*ker_dim0*chld_cnt]);
//Compute flops.
#ifndef FFTW3_MKL
double add, mul, fma;
FFTW_t<Real_t>::fftw_flops(vlist_fftplan, &add, &mul, &fma);
#ifndef __INTEL_OFFLOAD0
Profile::Add_FLOP((long long)(add+mul+2*fma));
#endif
#endif
// Transpose: (component, freq, child) -> (component, child-per-freq),
// so the 8 siblings are adjacent for each frequency in the output.
for(int i=0;i<ker_dim0*dof;i++)
for(size_t j=0;j<n3_;j++)
for(size_t k=0;k<chld_cnt;k++){
upward_equiv_fft[2*(chld_cnt*(n3_*i+j)+k)+0]=buffer[2*(n3_*(chld_cnt*i+k)+j)+0];
upward_equiv_fft[2*(chld_cnt*(n3_*i+j)+k)+1]=buffer[2*(n3_*(chld_cnt*i+k)+j)+1];
}
}
}
}
}
// Inverse of FFT_UpEquiv for the downward pass: for each entry of ifft_vec,
// de-interleave the frequency-domain downward-check data of 8 siblings from
// input_data, run a complex-to-real inverse FFT, and accumulate the scaled
// grid values (gathered via `map`) into the children's downward equivalent
// densities in output_data.
template <class FMMNode>
void FMM_Pts<FMMNode>::FFT_Check2Equiv(size_t dof, size_t m, size_t ker_dim1, Vector<size_t>& ifft_vec, Vector<Real_t>& ifft_scal,
Vector<Real_t>& input_data, Vector<Real_t>& output_data, Vector<Real_t>& buffer_){
size_t n1=m*2;        // grid points per dimension
size_t n2=n1*n1;
size_t n3=n1*n2;      // real-space grid size
size_t n3_=n2*(n1/2+1); // c2r complex input size (Hermitian symmetry)
size_t chld_cnt=1UL<<COORD_DIM; // 8 children per octant group
size_t fftsize_out=2*n3_*dof*ker_dim1*chld_cnt;
int omp_p=omp_get_max_threads();
//Load permutation map.
size_t n=6*(m-1)*(m-1)+2; // number of points on the check surface
static Vector<size_t> map; // NOTE(review): lazily (re)built; not guarded against concurrent first use
{ // Build map to reorder dnward_check
size_t n_old=map.Dim();
if(n_old!=n){
Real_t c[3]={0,0,0};
Vector<Real_t> surf=surface(m, c, (Real_t)(m-1), 0);
map.Resize(surf.Dim()/COORD_DIM);
// Linear grid index of each check-surface point (mirrored w.r.t. FFT_UpEquiv's map).
for(size_t i=0;i<map.Dim();i++)
map[i]=((size_t)(m*2-0.5-surf[i*3]))+((size_t)(m*2-0.5-surf[i*3+1]))*n1+((size_t)(m*2-0.5-surf[i*3+2]))*n2;
//map;//.AllocDevice(true);
}
}
{ // Build FFTW plan.
if(!vlist_ifft_flag){
//Build FFTW plan.
int nnn[3]={(int)n1,(int)n1,(int)n1};
Real_t *fftw_in, *fftw_out;
// Temporary buffers used only for planning; freed immediately after.
fftw_in  = mem::aligned_new<Real_t>(2*n3_*ker_dim1*chld_cnt);
fftw_out = mem::aligned_new<Real_t>(  n3 *ker_dim1*chld_cnt);
vlist_ifftplan = FFTW_t<Real_t>::fft_plan_many_dft_c2r(COORD_DIM,nnn,ker_dim1*chld_cnt,
(typename FFTW_t<Real_t>::cplx*)fftw_in, NULL, 1, n3_, (Real_t*)(fftw_out),NULL, 1, n3);
mem::aligned_delete<Real_t>(fftw_in);
mem::aligned_delete<Real_t>(fftw_out);
vlist_ifft_flag=true;
}
}
{ // Offload section
assert(buffer_.Dim()>=2*fftsize_out*omp_p); // two scratch buffers per thread
size_t n_out=ifft_vec.Dim();
#pragma omp parallel for
for(int pid=0; pid<omp_p; pid++){
size_t node_start=(n_out*(pid  ))/omp_p;
size_t node_end  =(n_out*(pid+1))/omp_p;
Vector<Real_t> buffer0(fftsize_out, &buffer_[fftsize_out*(2*pid+0)], false); // FFT input scratch
Vector<Real_t> buffer1(fftsize_out, &buffer_[fftsize_out*(2*pid+1)], false); // FFT output scratch
for(size_t node_idx=node_start; node_idx<node_end; node_idx++){
Vector<Real_t> dnward_check_fft(fftsize_out, &input_data[fftsize_out*node_idx], false);
Vector<Real_t> dnward_equiv(ker_dim1*n*dof*chld_cnt,&output_data[0] + ifft_vec[node_idx],false);
//De-interleave data: (component, child-per-freq) -> (child, component, freq).
for(int i=0;i<ker_dim1*dof;i++)
for(size_t j=0;j<n3_;j++)
for(size_t k=0;k<chld_cnt;k++){
buffer0[2*(n3_*(ker_dim1*dof*k+i)+j)+0]=dnward_check_fft[2*(chld_cnt*(n3_*i+j)+k)+0];
buffer0[2*(n3_*(ker_dim1*dof*k+i)+j)+1]=dnward_check_fft[2*(chld_cnt*(n3_*i+j)+k)+1];
}
// Compute FFT.
for(int i=0;i<dof;i++)
FFTW_t<Real_t>::fft_execute_dft_c2r(vlist_ifftplan, (typename FFTW_t<Real_t>::cplx*)&buffer0[i*2*n3_*ker_dim1*chld_cnt],
(Real_t*)&buffer1[i*  n3 *ker_dim1*chld_cnt]);
//Compute flops.
#ifndef FFTW3_MKL
double add, mul, fma;
FFTW_t<Real_t>::fftw_flops(vlist_ifftplan, &add, &mul, &fma);
#ifndef __INTEL_OFFLOAD0
Profile::Add_FLOP((long long)(add+mul+2*fma)*dof);
#endif
#endif
// Rearrange downward check data: gather surface points from the grid,
// scale, and accumulate into the equivalent densities.
for(size_t k=0;k<n;k++){
size_t idx=map[k]; // grid position of surface point k
for(int j1=0;j1<dof;j1++)
for(int j0=0;j0<(int)chld_cnt;j0++)
for(int i=0;i<ker_dim1;i++)
dnward_equiv[ker_dim1*(n*(dof*j0+j1)+k)+i]+=buffer1[idx+(i+(j1+j0*dof)*ker_dim1)*n3]*ifft_scal[ker_dim1*node_idx+i];
}
}
}
}
}
  2265. template<class Real_t>
  2266. inline void matmult_8x8x2(Real_t*& M_, Real_t*& IN0, Real_t*& IN1, Real_t*& OUT0, Real_t*& OUT1){
  2267. // Generic code.
  2268. Real_t out_reg000, out_reg001, out_reg010, out_reg011;
  2269. Real_t out_reg100, out_reg101, out_reg110, out_reg111;
  2270. Real_t in_reg000, in_reg001, in_reg010, in_reg011;
  2271. Real_t in_reg100, in_reg101, in_reg110, in_reg111;
  2272. Real_t m_reg000, m_reg001, m_reg010, m_reg011;
  2273. Real_t m_reg100, m_reg101, m_reg110, m_reg111;
  2274. //#pragma unroll
  2275. for(int i1=0;i1<8;i1+=2){
  2276. Real_t* IN0_=IN0;
  2277. Real_t* IN1_=IN1;
  2278. out_reg000=OUT0[ 0]; out_reg001=OUT0[ 1];
  2279. out_reg010=OUT0[ 2]; out_reg011=OUT0[ 3];
  2280. out_reg100=OUT1[ 0]; out_reg101=OUT1[ 1];
  2281. out_reg110=OUT1[ 2]; out_reg111=OUT1[ 3];
  2282. //#pragma unroll
  2283. for(int i2=0;i2<8;i2+=2){
  2284. m_reg000=M_[ 0]; m_reg001=M_[ 1];
  2285. m_reg010=M_[ 2]; m_reg011=M_[ 3];
  2286. m_reg100=M_[16]; m_reg101=M_[17];
  2287. m_reg110=M_[18]; m_reg111=M_[19];
  2288. in_reg000=IN0_[0]; in_reg001=IN0_[1];
  2289. in_reg010=IN0_[2]; in_reg011=IN0_[3];
  2290. in_reg100=IN1_[0]; in_reg101=IN1_[1];
  2291. in_reg110=IN1_[2]; in_reg111=IN1_[3];
  2292. out_reg000 += m_reg000*in_reg000 - m_reg001*in_reg001;
  2293. out_reg001 += m_reg000*in_reg001 + m_reg001*in_reg000;
  2294. out_reg010 += m_reg010*in_reg000 - m_reg011*in_reg001;
  2295. out_reg011 += m_reg010*in_reg001 + m_reg011*in_reg000;
  2296. out_reg000 += m_reg100*in_reg010 - m_reg101*in_reg011;
  2297. out_reg001 += m_reg100*in_reg011 + m_reg101*in_reg010;
  2298. out_reg010 += m_reg110*in_reg010 - m_reg111*in_reg011;
  2299. out_reg011 += m_reg110*in_reg011 + m_reg111*in_reg010;
  2300. out_reg100 += m_reg000*in_reg100 - m_reg001*in_reg101;
  2301. out_reg101 += m_reg000*in_reg101 + m_reg001*in_reg100;
  2302. out_reg110 += m_reg010*in_reg100 - m_reg011*in_reg101;
  2303. out_reg111 += m_reg010*in_reg101 + m_reg011*in_reg100;
  2304. out_reg100 += m_reg100*in_reg110 - m_reg101*in_reg111;
  2305. out_reg101 += m_reg100*in_reg111 + m_reg101*in_reg110;
  2306. out_reg110 += m_reg110*in_reg110 - m_reg111*in_reg111;
  2307. out_reg111 += m_reg110*in_reg111 + m_reg111*in_reg110;
  2308. M_+=32; // Jump to (column+2).
  2309. IN0_+=4;
  2310. IN1_+=4;
  2311. }
  2312. OUT0[ 0]=out_reg000; OUT0[ 1]=out_reg001;
  2313. OUT0[ 2]=out_reg010; OUT0[ 3]=out_reg011;
  2314. OUT1[ 0]=out_reg100; OUT1[ 1]=out_reg101;
  2315. OUT1[ 2]=out_reg110; OUT1[ 3]=out_reg111;
  2316. M_+=4-64*2; // Jump back to first column (row+2).
  2317. OUT0+=4;
  2318. OUT1+=4;
  2319. }
  2320. }
#if defined(__AVX__) || defined(__SSE3__)
// SIMD specialization of matmult_8x8x2 for double: same contract as the
// generic template (OUT += M*IN for two interleaved complex 8-vectors;
// advances M_, OUT0, OUT1). All loads/stores use aligned intrinsics, so
// the pointers must be suitably aligned (16B for SSE, 32B for AVX).
template<>
inline void matmult_8x8x2<double>(double*& M_, double*& IN0, double*& IN1, double*& OUT0, double*& OUT1){
#ifdef __AVX__ //AVX code.
// Eight accumulators: all 8 complex output rows of OUT0 and OUT1 (two
// complex doubles per __m256d), updated across the whole column loop.
__m256d out00,out01,out10,out11;
__m256d out20,out21,out30,out31;
double* in0__ = IN0;
double* in1__ = IN1;
out00 = _mm256_load_pd(OUT0);
out01 = _mm256_load_pd(OUT1);
out10 = _mm256_load_pd(OUT0+4);
out11 = _mm256_load_pd(OUT1+4);
out20 = _mm256_load_pd(OUT0+8);
out21 = _mm256_load_pd(OUT1+8);
out30 = _mm256_load_pd(OUT0+12);
out31 = _mm256_load_pd(OUT1+12);
for(int i2=0;i2<8;i2+=2){ // two matrix columns per iteration
__m256d m00;
__m256d ot00;
__m256d mt0,mtt0;
__m256d in00,in00_r,in01,in01_r;
// Broadcast one complex input and its swapped (im,re) copy; complex
// multiply is then unpacklo/hi + mul + addsub per 4-row block of M.
in00 = _mm256_broadcast_pd((const __m128d*)in0__);
in00_r = _mm256_permute_pd(in00,5);
in01 = _mm256_broadcast_pd((const __m128d*)in1__);
in01_r = _mm256_permute_pd(in01,5);
m00 = _mm256_load_pd(M_);
mt0 = _mm256_unpacklo_pd(m00,m00);
ot00 = _mm256_mul_pd(mt0,in00);
mtt0 = _mm256_unpackhi_pd(m00,m00);
out00 = _mm256_add_pd(out00,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in00_r)));
ot00 = _mm256_mul_pd(mt0,in01);
out01 = _mm256_add_pd(out01,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in01_r)));
m00 = _mm256_load_pd(M_+4);
mt0 = _mm256_unpacklo_pd(m00,m00);
ot00 = _mm256_mul_pd(mt0,in00);
mtt0 = _mm256_unpackhi_pd(m00,m00);
out10 = _mm256_add_pd(out10,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in00_r)));
ot00 = _mm256_mul_pd(mt0,in01);
out11 = _mm256_add_pd(out11,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in01_r)));
m00 = _mm256_load_pd(M_+8);
mt0 = _mm256_unpacklo_pd(m00,m00);
ot00 = _mm256_mul_pd(mt0,in00);
mtt0 = _mm256_unpackhi_pd(m00,m00);
out20 = _mm256_add_pd(out20,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in00_r)));
ot00 = _mm256_mul_pd(mt0,in01);
out21 = _mm256_add_pd(out21,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in01_r)));
m00 = _mm256_load_pd(M_+12);
mt0 = _mm256_unpacklo_pd(m00,m00);
ot00 = _mm256_mul_pd(mt0,in00);
mtt0 = _mm256_unpackhi_pd(m00,m00);
out30 = _mm256_add_pd(out30,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in00_r)));
ot00 = _mm256_mul_pd(mt0,in01);
out31 = _mm256_add_pd(out31,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in01_r)));
// Second column of this pair (M_+16..31), next complex input element.
in00 = _mm256_broadcast_pd((const __m128d*) (in0__+2));
in00_r = _mm256_permute_pd(in00,5);
in01 = _mm256_broadcast_pd((const __m128d*) (in1__+2));
in01_r = _mm256_permute_pd(in01,5);
m00 = _mm256_load_pd(M_+16);
mt0 = _mm256_unpacklo_pd(m00,m00);
ot00 = _mm256_mul_pd(mt0,in00);
mtt0 = _mm256_unpackhi_pd(m00,m00);
out00 = _mm256_add_pd(out00,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in00_r)));
ot00 = _mm256_mul_pd(mt0,in01);
out01 = _mm256_add_pd(out01,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in01_r)));
m00 = _mm256_load_pd(M_+20);
mt0 = _mm256_unpacklo_pd(m00,m00);
ot00 = _mm256_mul_pd(mt0,in00);
mtt0 = _mm256_unpackhi_pd(m00,m00);
out10 = _mm256_add_pd(out10,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in00_r)));
ot00 = _mm256_mul_pd(mt0,in01);
out11 = _mm256_add_pd(out11,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in01_r)));
m00 = _mm256_load_pd(M_+24);
mt0 = _mm256_unpacklo_pd(m00,m00);
ot00 = _mm256_mul_pd(mt0,in00);
mtt0 = _mm256_unpackhi_pd(m00,m00);
out20 = _mm256_add_pd(out20,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in00_r)));
ot00 = _mm256_mul_pd(mt0,in01);
out21 = _mm256_add_pd(out21,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in01_r)));
m00 = _mm256_load_pd(M_+28);
mt0 = _mm256_unpacklo_pd(m00,m00);
ot00 = _mm256_mul_pd(mt0,in00);
mtt0 = _mm256_unpackhi_pd(m00,m00);
out30 = _mm256_add_pd(out30,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in00_r)));
ot00 = _mm256_mul_pd(mt0,in01);
out31 = _mm256_add_pd(out31,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in01_r)));
M_ += 32;
in0__ += 4;
in1__ += 4;
}
_mm256_store_pd(OUT0,out00);
_mm256_store_pd(OUT1,out01);
_mm256_store_pd(OUT0+4,out10);
_mm256_store_pd(OUT1+4,out11);
_mm256_store_pd(OUT0+8,out20);
_mm256_store_pd(OUT1+8,out21);
_mm256_store_pd(OUT0+12,out30);
_mm256_store_pd(OUT1+12,out31);
#elif defined __SSE3__ // SSE code.
// Same structure as the generic template: two output rows per outer
// iteration, one complex value per __m128d; re*in then addsub with the
// swapped (im,re) input implements the complex multiply.
__m128d out00, out01, out10, out11;
__m128d in00, in01, in10, in11;
__m128d m00, m01, m10, m11;
//#pragma unroll
for(int i1=0;i1<8;i1+=2){
double* IN0_=IN0;
double* IN1_=IN1;
out00 =_mm_load_pd (OUT0  );
out10 =_mm_load_pd (OUT0+2);
out01 =_mm_load_pd (OUT1  );
out11 =_mm_load_pd (OUT1+2);
//#pragma unroll
for(int i2=0;i2<8;i2+=2){
m00 =_mm_load1_pd (M_   );
m10 =_mm_load1_pd (M_+ 2);
m01 =_mm_load1_pd (M_+16);
m11 =_mm_load1_pd (M_+18);
in00 =_mm_load_pd (IN0_  );
in10 =_mm_load_pd (IN0_+2);
in01 =_mm_load_pd (IN1_  );
in11 =_mm_load_pd (IN1_+2);
out00 = _mm_add_pd (out00, _mm_mul_pd(m00 , in00 ));
out00 = _mm_add_pd (out00, _mm_mul_pd(m01 , in10 ));
out01 = _mm_add_pd (out01, _mm_mul_pd(m00 , in01 ));
out01 = _mm_add_pd (out01, _mm_mul_pd(m01 , in11 ));
out10 = _mm_add_pd (out10, _mm_mul_pd(m10 , in00 ));
out10 = _mm_add_pd (out10, _mm_mul_pd(m11 , in10 ));
out11 = _mm_add_pd (out11, _mm_mul_pd(m10 , in01 ));
out11 = _mm_add_pd (out11, _mm_mul_pd(m11 , in11 ));
m00 =_mm_load1_pd (M_+   1);
m10 =_mm_load1_pd (M_+ 2+1);
m01 =_mm_load1_pd (M_+16+1);
m11 =_mm_load1_pd (M_+18+1);
in00 =_mm_shuffle_pd (in00,in00,_MM_SHUFFLE2(0,1));
in01 =_mm_shuffle_pd (in01,in01,_MM_SHUFFLE2(0,1));
in10 =_mm_shuffle_pd (in10,in10,_MM_SHUFFLE2(0,1));
in11 =_mm_shuffle_pd (in11,in11,_MM_SHUFFLE2(0,1));
out00 = _mm_addsub_pd(out00, _mm_mul_pd(m00, in00));
out00 = _mm_addsub_pd(out00, _mm_mul_pd(m01, in10));
out01 = _mm_addsub_pd(out01, _mm_mul_pd(m00, in01));
out01 = _mm_addsub_pd(out01, _mm_mul_pd(m01, in11));
out10 = _mm_addsub_pd(out10, _mm_mul_pd(m10, in00));
out10 = _mm_addsub_pd(out10, _mm_mul_pd(m11, in10));
out11 = _mm_addsub_pd(out11, _mm_mul_pd(m10, in01));
out11 = _mm_addsub_pd(out11, _mm_mul_pd(m11, in11));
M_+=32; // Jump to (column+2).
IN0_+=4;
IN1_+=4;
}
_mm_store_pd (OUT0  ,out00);
_mm_store_pd (OUT0+2,out10);
_mm_store_pd (OUT1  ,out01);
_mm_store_pd (OUT1+2,out11);
M_+=4-64*2; // Jump back to first column (row+2).
OUT0+=4;
OUT1+=4;
}
#endif
}
#endif
#if defined(__SSE3__)
// SSE3 specialization of matmult_8x8x2 for float: same contract as the
// generic template (OUT += M*IN for two interleaved complex 8-vectors;
// advances M_, OUT0, OUT1). Each __m128 holds two complex floats; loads
// and stores are aligned, so pointers must be 16-byte aligned.
template<>
inline void matmult_8x8x2<float>(float*& M_, float*& IN0, float*& IN1, float*& OUT0, float*& OUT1){
#if defined __SSE3__ // SSE code.
// Eight accumulators: all 8 complex output rows of OUT0 and OUT1.
__m128 out00,out01,out10,out11;
__m128 out20,out21,out30,out31;
float* in0__ = IN0;
float* in1__ = IN1;
out00 = _mm_load_ps(OUT0);
out01 = _mm_load_ps(OUT1);
out10 = _mm_load_ps(OUT0+4);
out11 = _mm_load_ps(OUT1+4);
out20 = _mm_load_ps(OUT0+8);
out21 = _mm_load_ps(OUT1+8);
out30 = _mm_load_ps(OUT0+12);
out31 = _mm_load_ps(OUT1+12);
for(int i2=0;i2<8;i2+=2){ // two matrix columns per iteration
__m128 m00;
__m128 mt0,mtt0;
__m128 in00,in00_r,in01,in01_r;
// Broadcast one complex input (as a 64-bit pair) and its swapped (im,re)
// copy; complex multiply = shuffled re/im parts + mul + addsub.
in00 = _mm_castpd_ps(_mm_load_pd1((const double*)in0__));
in00_r = _mm_shuffle_ps(in00,in00,_MM_SHUFFLE(2,3,0,1));
in01 = _mm_castpd_ps(_mm_load_pd1((const double*)in1__));
in01_r = _mm_shuffle_ps(in01,in01,_MM_SHUFFLE(2,3,0,1));
m00 = _mm_load_ps(M_);
mt0 = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(2,2,0,0));
out00= _mm_add_ps   (out00,_mm_mul_ps( mt0,in00  ));
mtt0 = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(3,3,1,1));
out00= _mm_addsub_ps(out00,_mm_mul_ps(mtt0,in00_r));
out01 = _mm_add_ps   (out01,_mm_mul_ps( mt0,in01  ));
out01 = _mm_addsub_ps(out01,_mm_mul_ps(mtt0,in01_r));
m00 = _mm_load_ps(M_+4);
mt0 = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(2,2,0,0));
out10= _mm_add_ps   (out10,_mm_mul_ps( mt0,in00  ));
mtt0 = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(3,3,1,1));
out10= _mm_addsub_ps(out10,_mm_mul_ps(mtt0,in00_r));
out11 = _mm_add_ps   (out11,_mm_mul_ps( mt0,in01  ));
out11 = _mm_addsub_ps(out11,_mm_mul_ps(mtt0,in01_r));
m00 = _mm_load_ps(M_+8);
mt0 = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(2,2,0,0));
out20= _mm_add_ps   (out20,_mm_mul_ps( mt0,in00  ));
mtt0 = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(3,3,1,1));
out20= _mm_addsub_ps(out20,_mm_mul_ps(mtt0,in00_r));
out21 = _mm_add_ps   (out21,_mm_mul_ps( mt0,in01  ));
out21 = _mm_addsub_ps(out21,_mm_mul_ps(mtt0,in01_r));
m00 = _mm_load_ps(M_+12);
mt0 = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(2,2,0,0));
out30= _mm_add_ps   (out30,_mm_mul_ps( mt0, in00));
mtt0 = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(3,3,1,1));
out30= _mm_addsub_ps(out30,_mm_mul_ps(mtt0,in00_r));
out31 = _mm_add_ps   (out31,_mm_mul_ps( mt0,in01  ));
out31 = _mm_addsub_ps(out31,_mm_mul_ps(mtt0,in01_r));
// Second column of this pair (M_+16..31), next complex input element.
in00 = _mm_castpd_ps(_mm_load_pd1((const double*) (in0__+2)));
in00_r = _mm_shuffle_ps(in00,in00,_MM_SHUFFLE(2,3,0,1));
in01 = _mm_castpd_ps(_mm_load_pd1((const double*) (in1__+2)));
in01_r = _mm_shuffle_ps(in01,in01,_MM_SHUFFLE(2,3,0,1));
m00 = _mm_load_ps(M_+16);
mt0 = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(2,2,0,0));
out00= _mm_add_ps   (out00,_mm_mul_ps( mt0,in00  ));
mtt0 = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(3,3,1,1));
out00= _mm_addsub_ps(out00,_mm_mul_ps(mtt0,in00_r));
out01 = _mm_add_ps   (out01,_mm_mul_ps( mt0,in01  ));
out01 = _mm_addsub_ps(out01,_mm_mul_ps(mtt0,in01_r));
m00 = _mm_load_ps(M_+20);
mt0 = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(2,2,0,0));
out10= _mm_add_ps   (out10,_mm_mul_ps( mt0,in00  ));
mtt0 = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(3,3,1,1));
out10= _mm_addsub_ps(out10,_mm_mul_ps(mtt0,in00_r));
out11 = _mm_add_ps   (out11,_mm_mul_ps( mt0,in01  ));
out11 = _mm_addsub_ps(out11,_mm_mul_ps(mtt0,in01_r));
m00 = _mm_load_ps(M_+24);
mt0 = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(2,2,0,0));
out20= _mm_add_ps   (out20,_mm_mul_ps( mt0,in00  ));
mtt0 = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(3,3,1,1));
out20= _mm_addsub_ps(out20,_mm_mul_ps(mtt0,in00_r));
out21 = _mm_add_ps   (out21,_mm_mul_ps( mt0,in01  ));
out21 = _mm_addsub_ps(out21,_mm_mul_ps(mtt0,in01_r));
m00 = _mm_load_ps(M_+28);
mt0 = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(2,2,0,0));
out30= _mm_add_ps   (out30,_mm_mul_ps( mt0,in00  ));
mtt0 = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(3,3,1,1));
out30= _mm_addsub_ps(out30,_mm_mul_ps(mtt0,in00_r));
out31 = _mm_add_ps   (out31,_mm_mul_ps( mt0,in01  ));
out31 = _mm_addsub_ps(out31,_mm_mul_ps(mtt0,in01_r));
M_ += 32;
in0__ += 4;
in1__ += 4;
}
_mm_store_ps(OUT0,out00);
_mm_store_ps(OUT1,out01);
_mm_store_ps(OUT0+4,out10);
_mm_store_ps(OUT1+4,out11);
_mm_store_ps(OUT0+8,out20);
_mm_store_ps(OUT1+8,out21);
_mm_store_ps(OUT0+12,out30);
_mm_store_ps(OUT1+12,out31);
#endif
}
#endif
  2578. template <class Real_t>
  2579. void VListHadamard(size_t dof, size_t M_dim, size_t ker_dim0, size_t ker_dim1, Vector<size_t>& interac_dsp,
  2580. Vector<size_t>& interac_vec, Vector<Real_t*>& precomp_mat, Vector<Real_t>& fft_in, Vector<Real_t>& fft_out){
  2581. size_t chld_cnt=1UL<<COORD_DIM;
  2582. size_t fftsize_in =M_dim*ker_dim0*chld_cnt*2;
  2583. size_t fftsize_out=M_dim*ker_dim1*chld_cnt*2;
  2584. Real_t* zero_vec0=mem::aligned_new<Real_t>(fftsize_in );
  2585. Real_t* zero_vec1=mem::aligned_new<Real_t>(fftsize_out);
  2586. size_t n_out=fft_out.Dim()/fftsize_out;
  2587. // Set buff_out to zero.
  2588. #pragma omp parallel for
  2589. for(size_t k=0;k<n_out;k++){
  2590. Vector<Real_t> dnward_check_fft(fftsize_out, &fft_out[k*fftsize_out], false);
  2591. dnward_check_fft.SetZero();
  2592. }
  2593. // Build list of interaction pairs (in, out vectors).
  2594. size_t mat_cnt=precomp_mat.Dim();
  2595. size_t blk1_cnt=interac_dsp.Dim()/mat_cnt;
  2596. const size_t V_BLK_SIZE=V_BLK_CACHE*64/sizeof(Real_t);
  2597. Real_t** IN_ =mem::aligned_new<Real_t*>(2*V_BLK_SIZE*blk1_cnt*mat_cnt);
  2598. Real_t** OUT_=mem::aligned_new<Real_t*>(2*V_BLK_SIZE*blk1_cnt*mat_cnt);
  2599. #pragma omp parallel for
  2600. for(size_t interac_blk1=0; interac_blk1<blk1_cnt*mat_cnt; interac_blk1++){
  2601. size_t interac_dsp0 = (interac_blk1==0?0:interac_dsp[interac_blk1-1]);
  2602. size_t interac_dsp1 = interac_dsp[interac_blk1 ] ;
  2603. size_t interac_cnt = interac_dsp1-interac_dsp0;
  2604. for(size_t j=0;j<interac_cnt;j++){
  2605. IN_ [2*V_BLK_SIZE*interac_blk1 +j]=&fft_in [interac_vec[(interac_dsp0+j)*2+0]];
  2606. OUT_[2*V_BLK_SIZE*interac_blk1 +j]=&fft_out[interac_vec[(interac_dsp0+j)*2+1]];
  2607. }
  2608. IN_ [2*V_BLK_SIZE*interac_blk1 +interac_cnt]=zero_vec0;
  2609. OUT_[2*V_BLK_SIZE*interac_blk1 +interac_cnt]=zero_vec1;
  2610. }
  2611. int omp_p=omp_get_max_threads();
  2612. #pragma omp parallel for
  2613. for(int pid=0; pid<omp_p; pid++){
  2614. size_t a=( pid *M_dim)/omp_p;
  2615. size_t b=((pid+1)*M_dim)/omp_p;
  2616. for(int in_dim=0;in_dim<ker_dim0;in_dim++)
  2617. for(int ot_dim=0;ot_dim<ker_dim1;ot_dim++)
  2618. for(size_t blk1=0; blk1<blk1_cnt; blk1++)
  2619. for(size_t k=a; k< b; k++)
  2620. for(size_t mat_indx=0; mat_indx< mat_cnt;mat_indx++){
  2621. size_t interac_blk1 = blk1*mat_cnt+mat_indx;
  2622. size_t interac_dsp0 = (interac_blk1==0?0:interac_dsp[interac_blk1-1]);
  2623. size_t interac_dsp1 = interac_dsp[interac_blk1 ] ;
  2624. size_t interac_cnt = interac_dsp1-interac_dsp0;
  2625. Real_t** IN = IN_ + 2*V_BLK_SIZE*interac_blk1;
  2626. Real_t** OUT= OUT_+ 2*V_BLK_SIZE*interac_blk1;
  2627. Real_t* M = precomp_mat[mat_indx] + k*chld_cnt*chld_cnt*2 + (ot_dim+in_dim*ker_dim1)*M_dim*128;
  2628. {
  2629. for(size_t j=0;j<interac_cnt;j+=2){
  2630. Real_t* M_ = M;
  2631. Real_t* IN0 = IN [j+0] + (in_dim*M_dim+k)*chld_cnt*2;
  2632. Real_t* IN1 = IN [j+1] + (in_dim*M_dim+k)*chld_cnt*2;
  2633. Real_t* OUT0 = OUT[j+0] + (ot_dim*M_dim+k)*chld_cnt*2;
  2634. Real_t* OUT1 = OUT[j+1] + (ot_dim*M_dim+k)*chld_cnt*2;
  2635. #ifdef __SSE__
  2636. if (j+2 < interac_cnt) { // Prefetch
  2637. _mm_prefetch(((char *)(IN[j+2] + (in_dim*M_dim+k)*chld_cnt*2)), _MM_HINT_T0);
  2638. _mm_prefetch(((char *)(IN[j+2] + (in_dim*M_dim+k)*chld_cnt*2) + 64), _MM_HINT_T0);
  2639. _mm_prefetch(((char *)(IN[j+3] + (in_dim*M_dim+k)*chld_cnt*2)), _MM_HINT_T0);
  2640. _mm_prefetch(((char *)(IN[j+3] + (in_dim*M_dim+k)*chld_cnt*2) + 64), _MM_HINT_T0);
  2641. _mm_prefetch(((char *)(OUT[j+2] + (ot_dim*M_dim+k)*chld_cnt*2)), _MM_HINT_T0);
  2642. _mm_prefetch(((char *)(OUT[j+2] + (ot_dim*M_dim+k)*chld_cnt*2) + 64), _MM_HINT_T0);
  2643. _mm_prefetch(((char *)(OUT[j+3] + (ot_dim*M_dim+k)*chld_cnt*2)), _MM_HINT_T0);
  2644. _mm_prefetch(((char *)(OUT[j+3] + (ot_dim*M_dim+k)*chld_cnt*2) + 64), _MM_HINT_T0);
  2645. }
  2646. #endif
  2647. matmult_8x8x2(M_, IN0, IN1, OUT0, OUT1);
  2648. }
  2649. }
  2650. }
  2651. }
  2652. // Compute flops.
  2653. {
  2654. Profile::Add_FLOP(8*8*8*(interac_vec.Dim()/2)*M_dim*ker_dim0*ker_dim1*dof);
  2655. }
  2656. // Free memory
  2657. mem::aligned_delete<Real_t*>(IN_ );
  2658. mem::aligned_delete<Real_t*>(OUT_);
  2659. mem::aligned_delete<Real_t>(zero_vec0);
  2660. mem::aligned_delete<Real_t>(zero_vec1);
  2661. }
  2662. template <class FMMNode>
  2663. void FMM_Pts<FMMNode>::V_ListSetup(SetupData<Real_t>& setup_data, FMMTree_t* tree, std::vector<Matrix<Real_t> >& buff, std::vector<Vector<FMMNode_t*> >& n_list, int level, bool device){
  2664. if(!this->MultipoleOrder()) return;
  2665. if(level==0) return;
  2666. { // Set setup_data
  2667. setup_data.level=level;
  2668. setup_data.kernel=kernel->k_m2l;
  2669. setup_data.interac_type.resize(1);
  2670. setup_data.interac_type[0]=V1_Type;
  2671. setup_data. input_data=&buff[0];
  2672. setup_data.output_data=&buff[1];
  2673. Vector<FMMNode_t*>& nodes_in =n_list[2];
  2674. Vector<FMMNode_t*>& nodes_out=n_list[3];
  2675. setup_data.nodes_in .clear();
  2676. setup_data.nodes_out.clear();
  2677. for(size_t i=0;i<nodes_in .Dim();i++) if((nodes_in [i]->Depth()==level-1 || level==-1) && nodes_in [i]->pt_cnt[0]) setup_data.nodes_in .push_back(nodes_in [i]);
  2678. for(size_t i=0;i<nodes_out.Dim();i++) if((nodes_out[i]->Depth()==level-1 || level==-1) && nodes_out[i]->pt_cnt[1]) setup_data.nodes_out.push_back(nodes_out[i]);
  2679. }
  2680. std::vector<void*>& nodes_in =setup_data.nodes_in ;
  2681. std::vector<void*>& nodes_out=setup_data.nodes_out;
  2682. std::vector<Vector<Real_t>*>& input_vector=setup_data. input_vector; input_vector.clear();
  2683. std::vector<Vector<Real_t>*>& output_vector=setup_data.output_vector; output_vector.clear();
  2684. for(size_t i=0;i<nodes_in .size();i++) input_vector.push_back(&((FMMData*)((FMMNode*)((FMMNode*)nodes_in [i])->Child(0))->FMMData())->upward_equiv);
  2685. for(size_t i=0;i<nodes_out.size();i++) output_vector.push_back(&((FMMData*)((FMMNode*)((FMMNode*)nodes_out[i])->Child(0))->FMMData())->dnward_equiv);
  2686. /////////////////////////////////////////////////////////////////////////////
  2687. Real_t eps=1e-10;
  2688. size_t n_in =nodes_in .size();
  2689. size_t n_out=nodes_out.size();
  2690. // Setup precomputed data.
  2691. if(setup_data.precomp_data->Dim(0)*setup_data.precomp_data->Dim(1)==0) SetupPrecomp(setup_data,device);
  2692. // Build interac_data
  2693. Profile::Tic("Interac-Data",&this->comm,true,25);
  2694. Matrix<char>& interac_data=setup_data.interac_data;
  2695. if(n_out>0 && n_in >0){ // Build precomp_data, interac_data
  2696. size_t precomp_offset=0;
  2697. Mat_Type& interac_type=setup_data.interac_type[0];
  2698. size_t mat_cnt=this->interac_list.ListCount(interac_type);
  2699. Matrix<size_t> precomp_data_offset;
  2700. std::vector<size_t> interac_mat;
  2701. { // Load precomp_data for interac_type.
  2702. struct HeaderData{
  2703. size_t total_size;
  2704. size_t level;
  2705. size_t mat_cnt ;
  2706. size_t max_depth;
  2707. };
  2708. Matrix<char>& precomp_data=*setup_data.precomp_data;
  2709. char* indx_ptr=precomp_data[0]+precomp_offset;
  2710. HeaderData& header=*(HeaderData*)indx_ptr;indx_ptr+=sizeof(HeaderData);
  2711. precomp_data_offset.ReInit(header.mat_cnt,1+(2+2)*header.max_depth, (size_t*)indx_ptr, false);
  2712. precomp_offset+=header.total_size;
  2713. for(size_t mat_id=0;mat_id<mat_cnt;mat_id++){
  2714. Matrix<Real_t>& M0 = this->mat->Mat(level, interac_type, mat_id);
  2715. assert(M0.Dim(0)>0 && M0.Dim(1)>0); UNUSED(M0);
  2716. interac_mat.push_back(precomp_data_offset[mat_id][0]);
  2717. }
  2718. }
  2719. size_t dof;
  2720. size_t m=MultipoleOrder();
  2721. size_t ker_dim0=setup_data.kernel->ker_dim[0];
  2722. size_t ker_dim1=setup_data.kernel->ker_dim[1];
  2723. size_t fftsize;
  2724. {
  2725. size_t n1=m*2;
  2726. size_t n2=n1*n1;
  2727. size_t n3_=n2*(n1/2+1);
  2728. size_t chld_cnt=1UL<<COORD_DIM;
  2729. fftsize=2*n3_*chld_cnt;
  2730. dof=1;
  2731. }
  2732. int omp_p=omp_get_max_threads();
  2733. size_t buff_size=DEVICE_BUFFER_SIZE*1024l*1024l;
  2734. size_t n_blk0=2*fftsize*dof*(ker_dim0*n_in +ker_dim1*n_out)*sizeof(Real_t)/buff_size;
  2735. if(n_blk0==0) n_blk0=1;
  2736. std::vector<std::vector<size_t> > fft_vec(n_blk0);
  2737. std::vector<std::vector<size_t> > ifft_vec(n_blk0);
  2738. std::vector<std::vector<Real_t> > fft_scl(n_blk0);
  2739. std::vector<std::vector<Real_t> > ifft_scl(n_blk0);
  2740. std::vector<std::vector<size_t> > interac_vec(n_blk0);
  2741. std::vector<std::vector<size_t> > interac_dsp(n_blk0);
  2742. {
  2743. Matrix<Real_t>& input_data=*setup_data. input_data;
  2744. Matrix<Real_t>& output_data=*setup_data.output_data;
  2745. std::vector<std::vector<FMMNode*> > nodes_blk_in (n_blk0);
  2746. std::vector<std::vector<FMMNode*> > nodes_blk_out(n_blk0);
  2747. Vector<Real_t> src_scal=this->kernel->k_m2l->src_scal;
  2748. Vector<Real_t> trg_scal=this->kernel->k_m2l->trg_scal;
  2749. for(size_t i=0;i<n_in;i++) ((FMMNode*)nodes_in[i])->node_id=i;
  2750. for(size_t blk0=0;blk0<n_blk0;blk0++){
  2751. size_t blk0_start=(n_out* blk0 )/n_blk0;
  2752. size_t blk0_end =(n_out*(blk0+1))/n_blk0;
  2753. std::vector<FMMNode*>& nodes_in_ =nodes_blk_in [blk0];
  2754. std::vector<FMMNode*>& nodes_out_=nodes_blk_out[blk0];
  2755. { // Build node list for blk0.
  2756. std::set<void*> nodes_in;
  2757. for(size_t i=blk0_start;i<blk0_end;i++){
  2758. nodes_out_.push_back((FMMNode*)nodes_out[i]);
  2759. Vector<FMMNode*>& lst=((FMMNode*)nodes_out[i])->interac_list[interac_type];
  2760. for(size_t k=0;k<mat_cnt;k++) if(lst[k]!=NULL && lst[k]->pt_cnt[0]) nodes_in.insert(lst[k]);
  2761. }
  2762. for(std::set<void*>::iterator node=nodes_in.begin(); node != nodes_in.end(); node++){
  2763. nodes_in_.push_back((FMMNode*)*node);
  2764. }
  2765. size_t input_dim=nodes_in_ .size()*ker_dim0*dof*fftsize;
  2766. size_t output_dim=nodes_out_.size()*ker_dim1*dof*fftsize;
  2767. size_t buffer_dim=2*(ker_dim0+ker_dim1)*dof*fftsize*omp_p;
  2768. if(buff_size<(input_dim + output_dim + buffer_dim)*sizeof(Real_t))
  2769. buff_size=(input_dim + output_dim + buffer_dim)*sizeof(Real_t);
  2770. }
  2771. { // Set fft vectors.
  2772. for(size_t i=0;i<nodes_in_ .size();i++) fft_vec[blk0].push_back((size_t)(& input_vector[nodes_in_[i]->node_id][0][0]- input_data[0]));
  2773. for(size_t i=0;i<nodes_out_.size();i++)ifft_vec[blk0].push_back((size_t)(&output_vector[blk0_start + i ][0][0]-output_data[0]));
  2774. size_t scal_dim0=src_scal.Dim();
  2775. size_t scal_dim1=trg_scal.Dim();
  2776. fft_scl [blk0].resize(nodes_in_ .size()*scal_dim0);
  2777. ifft_scl[blk0].resize(nodes_out_.size()*scal_dim1);
  2778. for(size_t i=0;i<nodes_in_ .size();i++){
  2779. size_t depth=nodes_in_[i]->Depth()+1;
  2780. for(size_t j=0;j<scal_dim0;j++){
  2781. fft_scl[blk0][i*scal_dim0+j]=pow(2.0, src_scal[j]*depth);
  2782. }
  2783. }
  2784. for(size_t i=0;i<nodes_out_.size();i++){
  2785. size_t depth=nodes_out_[i]->Depth()+1;
  2786. for(size_t j=0;j<scal_dim1;j++){
  2787. ifft_scl[blk0][i*scal_dim1+j]=pow(2.0, trg_scal[j]*depth);
  2788. }
  2789. }
  2790. }
  2791. }
  2792. for(size_t blk0=0;blk0<n_blk0;blk0++){ // Hadamard interactions.
  2793. std::vector<FMMNode*>& nodes_in_ =nodes_blk_in [blk0];
  2794. std::vector<FMMNode*>& nodes_out_=nodes_blk_out[blk0];
  2795. for(size_t i=0;i<nodes_in_.size();i++) nodes_in_[i]->node_id=i;
  2796. { // Next blocking level.
  2797. size_t n_blk1=nodes_out_.size()*(2)*sizeof(Real_t)/(64*V_BLK_CACHE);
  2798. if(n_blk1==0) n_blk1=1;
  2799. size_t interac_dsp_=0;
  2800. for(size_t blk1=0;blk1<n_blk1;blk1++){
  2801. size_t blk1_start=(nodes_out_.size()* blk1 )/n_blk1;
  2802. size_t blk1_end =(nodes_out_.size()*(blk1+1))/n_blk1;
  2803. for(size_t k=0;k<mat_cnt;k++){
  2804. for(size_t i=blk1_start;i<blk1_end;i++){
  2805. Vector<FMMNode*>& lst=((FMMNode*)nodes_out_[i])->interac_list[interac_type];
  2806. if(lst[k]!=NULL && lst[k]->pt_cnt[0]){
  2807. interac_vec[blk0].push_back(lst[k]->node_id*fftsize*ker_dim0*dof);
  2808. interac_vec[blk0].push_back( i *fftsize*ker_dim1*dof);
  2809. interac_dsp_++;
  2810. }
  2811. }
  2812. interac_dsp[blk0].push_back(interac_dsp_);
  2813. }
  2814. }
  2815. }
  2816. }
  2817. }
  2818. { // Set interac_data.
  2819. size_t data_size=sizeof(size_t)*6; // buff_size, m, dof, ker_dim0, ker_dim1, n_blk0
  2820. for(size_t blk0=0;blk0<n_blk0;blk0++){
  2821. data_size+=sizeof(size_t)+ fft_vec[blk0].size()*sizeof(size_t);
  2822. data_size+=sizeof(size_t)+ ifft_vec[blk0].size()*sizeof(size_t);
  2823. data_size+=sizeof(size_t)+ fft_scl[blk0].size()*sizeof(Real_t);
  2824. data_size+=sizeof(size_t)+ ifft_scl[blk0].size()*sizeof(Real_t);
  2825. data_size+=sizeof(size_t)+interac_vec[blk0].size()*sizeof(size_t);
  2826. data_size+=sizeof(size_t)+interac_dsp[blk0].size()*sizeof(size_t);
  2827. }
  2828. data_size+=sizeof(size_t)+interac_mat.size()*sizeof(size_t);
  2829. if(data_size>interac_data.Dim(0)*interac_data.Dim(1))
  2830. interac_data.ReInit(1,data_size);
  2831. char* data_ptr=&interac_data[0][0];
  2832. ((size_t*)data_ptr)[0]=buff_size; data_ptr+=sizeof(size_t);
  2833. ((size_t*)data_ptr)[0]= m; data_ptr+=sizeof(size_t);
  2834. ((size_t*)data_ptr)[0]= dof; data_ptr+=sizeof(size_t);
  2835. ((size_t*)data_ptr)[0]= ker_dim0; data_ptr+=sizeof(size_t);
  2836. ((size_t*)data_ptr)[0]= ker_dim1; data_ptr+=sizeof(size_t);
  2837. ((size_t*)data_ptr)[0]= n_blk0; data_ptr+=sizeof(size_t);
  2838. ((size_t*)data_ptr)[0]= interac_mat.size(); data_ptr+=sizeof(size_t);
  2839. mem::memcopy(data_ptr, &interac_mat[0], interac_mat.size()*sizeof(size_t));
  2840. data_ptr+=interac_mat.size()*sizeof(size_t);
  2841. for(size_t blk0=0;blk0<n_blk0;blk0++){
  2842. ((size_t*)data_ptr)[0]= fft_vec[blk0].size(); data_ptr+=sizeof(size_t);
  2843. mem::memcopy(data_ptr, & fft_vec[blk0][0], fft_vec[blk0].size()*sizeof(size_t));
  2844. data_ptr+= fft_vec[blk0].size()*sizeof(size_t);
  2845. ((size_t*)data_ptr)[0]=ifft_vec[blk0].size(); data_ptr+=sizeof(size_t);
  2846. mem::memcopy(data_ptr, &ifft_vec[blk0][0], ifft_vec[blk0].size()*sizeof(size_t));
  2847. data_ptr+=ifft_vec[blk0].size()*sizeof(size_t);
  2848. ((size_t*)data_ptr)[0]= fft_scl[blk0].size(); data_ptr+=sizeof(size_t);
  2849. mem::memcopy(data_ptr, & fft_scl[blk0][0], fft_scl[blk0].size()*sizeof(Real_t));
  2850. data_ptr+= fft_scl[blk0].size()*sizeof(Real_t);
  2851. ((size_t*)data_ptr)[0]=ifft_scl[blk0].size(); data_ptr+=sizeof(size_t);
  2852. mem::memcopy(data_ptr, &ifft_scl[blk0][0], ifft_scl[blk0].size()*sizeof(Real_t));
  2853. data_ptr+=ifft_scl[blk0].size()*sizeof(Real_t);
  2854. ((size_t*)data_ptr)[0]=interac_vec[blk0].size(); data_ptr+=sizeof(size_t);
  2855. mem::memcopy(data_ptr, &interac_vec[blk0][0], interac_vec[blk0].size()*sizeof(size_t));
  2856. data_ptr+=interac_vec[blk0].size()*sizeof(size_t);
  2857. ((size_t*)data_ptr)[0]=interac_dsp[blk0].size(); data_ptr+=sizeof(size_t);
  2858. mem::memcopy(data_ptr, &interac_dsp[blk0][0], interac_dsp[blk0].size()*sizeof(size_t));
  2859. data_ptr+=interac_dsp[blk0].size()*sizeof(size_t);
  2860. }
  2861. }
  2862. }
  2863. Profile::Toc();
  2864. if(device){ // Host2Device
  2865. Profile::Tic("Host2Device",&this->comm,false,25);
  2866. setup_data.interac_data. AllocDevice(true);
  2867. Profile::Toc();
  2868. }
  2869. }
  2870. template <class FMMNode>
  2871. void FMM_Pts<FMMNode>::V_List (SetupData<Real_t>& setup_data, bool device){
  2872. if(!this->MultipoleOrder()) return;
  2873. assert(!device); //Can not run on accelerator yet.
  2874. int np;
  2875. MPI_Comm_size(comm,&np);
  2876. if(setup_data.interac_data.Dim(0)==0 || setup_data.interac_data.Dim(1)==0){
  2877. if(np>1) Profile::Tic("Host2Device",&this->comm,false,25);
  2878. if(np>1) Profile::Toc();
  2879. return;
  2880. }
  2881. Profile::Tic("Host2Device",&this->comm,false,25);
  2882. int level=setup_data.level;
  2883. size_t buff_size=*((size_t*)&setup_data.interac_data[0][0]);
  2884. typename Vector<char>::Device buff;
  2885. typename Matrix<char>::Device precomp_data;
  2886. typename Matrix<char>::Device interac_data;
  2887. typename Matrix<Real_t>::Device input_data;
  2888. typename Matrix<Real_t>::Device output_data;
  2889. if(device){
  2890. if(this->dev_buffer.Dim()<buff_size) this->dev_buffer.ReInit(buff_size);
  2891. buff = this-> dev_buffer. AllocDevice(false);
  2892. precomp_data= setup_data.precomp_data->AllocDevice(false);
  2893. interac_data= setup_data.interac_data. AllocDevice(false);
  2894. input_data = setup_data. input_data->AllocDevice(false);
  2895. output_data = setup_data. output_data->AllocDevice(false);
  2896. }else{
  2897. if(this->cpu_buffer.Dim()<buff_size) this->cpu_buffer.ReInit(buff_size);
  2898. buff = this-> cpu_buffer;
  2899. precomp_data=*setup_data.precomp_data;
  2900. interac_data= setup_data.interac_data;
  2901. input_data =*setup_data. input_data;
  2902. output_data =*setup_data. output_data;
  2903. }
  2904. Profile::Toc();
  2905. { // Offloaded computation.
  2906. // Set interac_data.
  2907. size_t m, dof, ker_dim0, ker_dim1, n_blk0;
  2908. std::vector<Vector<size_t> > fft_vec;
  2909. std::vector<Vector<size_t> > ifft_vec;
  2910. std::vector<Vector<Real_t> > fft_scl;
  2911. std::vector<Vector<Real_t> > ifft_scl;
  2912. std::vector<Vector<size_t> > interac_vec;
  2913. std::vector<Vector<size_t> > interac_dsp;
  2914. Vector<Real_t*> precomp_mat;
  2915. { // Set interac_data.
  2916. char* data_ptr=&interac_data[0][0];
  2917. buff_size=((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t);
  2918. m =((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t);
  2919. dof =((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t);
  2920. ker_dim0 =((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t);
  2921. ker_dim1 =((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t);
  2922. n_blk0 =((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t);
  2923. fft_vec .resize(n_blk0);
  2924. ifft_vec.resize(n_blk0);
  2925. fft_scl .resize(n_blk0);
  2926. ifft_scl.resize(n_blk0);
  2927. interac_vec.resize(n_blk0);
  2928. interac_dsp.resize(n_blk0);
  2929. Vector<size_t> interac_mat;
  2930. interac_mat.ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false);
  2931. data_ptr+=sizeof(size_t)+interac_mat.Dim()*sizeof(size_t);
  2932. precomp_mat.Resize(interac_mat.Dim());
  2933. for(size_t i=0;i<interac_mat.Dim();i++){
  2934. precomp_mat[i]=(Real_t*)(precomp_data[0]+interac_mat[i]);
  2935. }
  2936. for(size_t blk0=0;blk0<n_blk0;blk0++){
  2937. fft_vec[blk0].ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false);
  2938. data_ptr+=sizeof(size_t)+fft_vec[blk0].Dim()*sizeof(size_t);
  2939. ifft_vec[blk0].ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false);
  2940. data_ptr+=sizeof(size_t)+ifft_vec[blk0].Dim()*sizeof(size_t);
  2941. fft_scl[blk0].ReInit(((size_t*)data_ptr)[0],(Real_t*)(data_ptr+sizeof(size_t)),false);
  2942. data_ptr+=sizeof(size_t)+fft_scl[blk0].Dim()*sizeof(Real_t);
  2943. ifft_scl[blk0].ReInit(((size_t*)data_ptr)[0],(Real_t*)(data_ptr+sizeof(size_t)),false);
  2944. data_ptr+=sizeof(size_t)+ifft_scl[blk0].Dim()*sizeof(Real_t);
  2945. interac_vec[blk0].ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false);
  2946. data_ptr+=sizeof(size_t)+interac_vec[blk0].Dim()*sizeof(size_t);
  2947. interac_dsp[blk0].ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false);
  2948. data_ptr+=sizeof(size_t)+interac_dsp[blk0].Dim()*sizeof(size_t);
  2949. }
  2950. }
  2951. int omp_p=omp_get_max_threads();
  2952. size_t M_dim, fftsize;
  2953. {
  2954. size_t n1=m*2;
  2955. size_t n2=n1*n1;
  2956. size_t n3_=n2*(n1/2+1);
  2957. size_t chld_cnt=1UL<<COORD_DIM;
  2958. fftsize=2*n3_*chld_cnt;
  2959. M_dim=n3_;
  2960. }
  2961. for(size_t blk0=0;blk0<n_blk0;blk0++){ // interactions
  2962. size_t n_in = fft_vec[blk0].Dim();
  2963. size_t n_out=ifft_vec[blk0].Dim();
  2964. size_t input_dim=n_in *ker_dim0*dof*fftsize;
  2965. size_t output_dim=n_out*ker_dim1*dof*fftsize;
  2966. size_t buffer_dim=2*(ker_dim0+ker_dim1)*dof*fftsize*omp_p;
  2967. Vector<Real_t> fft_in ( input_dim, (Real_t*)&buff[ 0 ],false);
  2968. Vector<Real_t> fft_out(output_dim, (Real_t*)&buff[ input_dim *sizeof(Real_t)],false);
  2969. Vector<Real_t> buffer(buffer_dim, (Real_t*)&buff[(input_dim+output_dim)*sizeof(Real_t)],false);
  2970. { // FFT
  2971. if(np==1) Profile::Tic("FFT",&comm,false,100);
  2972. Vector<Real_t> input_data_( input_data.dim[0]* input_data.dim[1], input_data[0], false);
  2973. FFT_UpEquiv(dof, m, ker_dim0, fft_vec[blk0], fft_scl[blk0], input_data_, fft_in, buffer);
  2974. if(np==1) Profile::Toc();
  2975. }
  2976. { // Hadamard
  2977. #ifdef PVFMM_HAVE_PAPI
  2978. #ifdef __VERBOSE__
  2979. std::cout << "Starting counters new\n";
  2980. if (PAPI_start(EventSet) != PAPI_OK) std::cout << "handle_error3" << std::endl;
  2981. #endif
  2982. #endif
  2983. if(np==1) Profile::Tic("HadamardProduct",&comm,false,100);
  2984. VListHadamard<Real_t>(dof, M_dim, ker_dim0, ker_dim1, interac_dsp[blk0], interac_vec[blk0], precomp_mat, fft_in, fft_out);
  2985. if(np==1) Profile::Toc();
  2986. #ifdef PVFMM_HAVE_PAPI
  2987. #ifdef __VERBOSE__
  2988. if (PAPI_stop(EventSet, values) != PAPI_OK) std::cout << "handle_error4" << std::endl;
  2989. std::cout << "Stopping counters\n";
  2990. #endif
  2991. #endif
  2992. }
  2993. { // IFFT
  2994. if(np==1) Profile::Tic("IFFT",&comm,false,100);
  2995. Vector<Real_t> output_data_(output_data.dim[0]*output_data.dim[1], output_data[0], false);
  2996. FFT_Check2Equiv(dof, m, ker_dim1, ifft_vec[blk0], ifft_scl[blk0], fft_out, output_data_, buffer);
  2997. if(np==1) Profile::Toc();
  2998. }
  2999. }
  3000. }
  3001. }
  3002. template <class FMMNode>
  3003. void FMM_Pts<FMMNode>::Down2DownSetup(SetupData<Real_t>& setup_data, FMMTree_t* tree, std::vector<Matrix<Real_t> >& buff, std::vector<Vector<FMMNode_t*> >& n_list, int level, bool device){
  3004. if(!this->MultipoleOrder()) return;
  3005. { // Set setup_data
  3006. setup_data.level=level;
  3007. setup_data.kernel=kernel->k_l2l;
  3008. setup_data.interac_type.resize(1);
  3009. setup_data.interac_type[0]=D2D_Type;
  3010. setup_data. input_data=&buff[1];
  3011. setup_data.output_data=&buff[1];
  3012. Vector<FMMNode_t*>& nodes_in =n_list[1];
  3013. Vector<FMMNode_t*>& nodes_out=n_list[1];
  3014. setup_data.nodes_in .clear();
  3015. setup_data.nodes_out.clear();
  3016. for(size_t i=0;i<nodes_in .Dim();i++) if((nodes_in [i]->Depth()==level-1) && nodes_in [i]->pt_cnt[1]) setup_data.nodes_in .push_back(nodes_in [i]);
  3017. for(size_t i=0;i<nodes_out.Dim();i++) if((nodes_out[i]->Depth()==level ) && nodes_out[i]->pt_cnt[1]) setup_data.nodes_out.push_back(nodes_out[i]);
  3018. }
  3019. std::vector<void*>& nodes_in =setup_data.nodes_in ;
  3020. std::vector<void*>& nodes_out=setup_data.nodes_out;
  3021. std::vector<Vector<Real_t>*>& input_vector=setup_data. input_vector; input_vector.clear();
  3022. std::vector<Vector<Real_t>*>& output_vector=setup_data.output_vector; output_vector.clear();
  3023. for(size_t i=0;i<nodes_in .size();i++) input_vector.push_back(&((FMMData*)((FMMNode*)nodes_in [i])->FMMData())->dnward_equiv);
  3024. for(size_t i=0;i<nodes_out.size();i++) output_vector.push_back(&((FMMData*)((FMMNode*)nodes_out[i])->FMMData())->dnward_equiv);
  3025. SetupInterac(setup_data,device);
  3026. }
  3027. template <class FMMNode>
  3028. void FMM_Pts<FMMNode>::Down2Down (SetupData<Real_t>& setup_data, bool device){
  3029. if(!this->MultipoleOrder()) return;
  3030. //Add Down2Down contribution.
  3031. EvalList(setup_data, device);
  3032. }
  3033. template <class FMMNode>
  3034. void FMM_Pts<FMMNode>::PtSetup(SetupData<Real_t>& setup_data, void* data_){
  3035. struct PackedData{
  3036. size_t len;
  3037. Matrix<Real_t>* ptr;
  3038. Vector<size_t> cnt;
  3039. Vector<size_t> dsp;
  3040. };
  3041. struct InteracData{
  3042. Vector<size_t> in_node;
  3043. Vector<size_t> scal_idx;
  3044. Vector<Real_t> coord_shift;
  3045. Vector<size_t> interac_cnt;
  3046. Vector<size_t> interac_dsp;
  3047. Vector<Real_t> scal[4*MAX_DEPTH];
  3048. Matrix<Real_t> M[4];
  3049. };
  3050. struct ptSetupData{
  3051. int level;
  3052. const Kernel<Real_t>* kernel;
  3053. PackedData src_coord; // Src coord
  3054. PackedData src_value; // Src density
  3055. PackedData srf_coord; // Srf coord
  3056. PackedData srf_value; // Srf density
  3057. PackedData trg_coord; // Trg coord
  3058. PackedData trg_value; // Trg potential
  3059. InteracData interac_data;
  3060. };
  3061. ptSetupData& data=*(ptSetupData*)data_;
  3062. { // pack data
  3063. struct PackedSetupData{
  3064. size_t size;
  3065. int level;
  3066. const Kernel<Real_t>* kernel;
  3067. Matrix<Real_t>* src_coord; // Src coord
  3068. Matrix<Real_t>* src_value; // Src density
  3069. Matrix<Real_t>* srf_coord; // Srf coord
  3070. Matrix<Real_t>* srf_value; // Srf density
  3071. Matrix<Real_t>* trg_coord; // Trg coord
  3072. Matrix<Real_t>* trg_value; // Trg potential
  3073. size_t src_coord_cnt_size; size_t src_coord_cnt_offset;
  3074. size_t src_coord_dsp_size; size_t src_coord_dsp_offset;
  3075. size_t src_value_cnt_size; size_t src_value_cnt_offset;
  3076. size_t src_value_dsp_size; size_t src_value_dsp_offset;
  3077. size_t srf_coord_cnt_size; size_t srf_coord_cnt_offset;
  3078. size_t srf_coord_dsp_size; size_t srf_coord_dsp_offset;
  3079. size_t srf_value_cnt_size; size_t srf_value_cnt_offset;
  3080. size_t srf_value_dsp_size; size_t srf_value_dsp_offset;
  3081. size_t trg_coord_cnt_size; size_t trg_coord_cnt_offset;
  3082. size_t trg_coord_dsp_size; size_t trg_coord_dsp_offset;
  3083. size_t trg_value_cnt_size; size_t trg_value_cnt_offset;
  3084. size_t trg_value_dsp_size; size_t trg_value_dsp_offset;
  3085. // interac_data
  3086. size_t in_node_size; size_t in_node_offset;
  3087. size_t scal_idx_size; size_t scal_idx_offset;
  3088. size_t coord_shift_size; size_t coord_shift_offset;
  3089. size_t interac_cnt_size; size_t interac_cnt_offset;
  3090. size_t interac_dsp_size; size_t interac_dsp_offset;
  3091. size_t scal_dim[4*MAX_DEPTH]; size_t scal_offset[4*MAX_DEPTH];
  3092. size_t Mdim[4][2]; size_t M_offset[4];
  3093. };
  3094. PackedSetupData pkd_data;
  3095. { // Set pkd_data
  3096. size_t offset=mem::align_ptr(sizeof(PackedSetupData));
  3097. pkd_data. level=data. level;
  3098. pkd_data.kernel=data.kernel;
  3099. pkd_data.src_coord=data.src_coord.ptr;
  3100. pkd_data.src_value=data.src_value.ptr;
  3101. pkd_data.srf_coord=data.srf_coord.ptr;
  3102. pkd_data.srf_value=data.srf_value.ptr;
  3103. pkd_data.trg_coord=data.trg_coord.ptr;
  3104. pkd_data.trg_value=data.trg_value.ptr;
  3105. pkd_data.src_coord_cnt_offset=offset; pkd_data.src_coord_cnt_size=data.src_coord.cnt.Dim(); offset+=mem::align_ptr(sizeof(size_t)*pkd_data.src_coord_cnt_size);
  3106. pkd_data.src_coord_dsp_offset=offset; pkd_data.src_coord_dsp_size=data.src_coord.dsp.Dim(); offset+=mem::align_ptr(sizeof(size_t)*pkd_data.src_coord_dsp_size);
  3107. pkd_data.src_value_cnt_offset=offset; pkd_data.src_value_cnt_size=data.src_value.cnt.Dim(); offset+=mem::align_ptr(sizeof(size_t)*pkd_data.src_value_cnt_size);
  3108. pkd_data.src_value_dsp_offset=offset; pkd_data.src_value_dsp_size=data.src_value.dsp.Dim(); offset+=mem::align_ptr(sizeof(size_t)*pkd_data.src_value_dsp_size);
  3109. pkd_data.srf_coord_cnt_offset=offset; pkd_data.srf_coord_cnt_size=data.srf_coord.cnt.Dim(); offset+=mem::align_ptr(sizeof(size_t)*pkd_data.srf_coord_cnt_size);
  3110. pkd_data.srf_coord_dsp_offset=offset; pkd_data.srf_coord_dsp_size=data.srf_coord.dsp.Dim(); offset+=mem::align_ptr(sizeof(size_t)*pkd_data.srf_coord_dsp_size);
  3111. pkd_data.srf_value_cnt_offset=offset; pkd_data.srf_value_cnt_size=data.srf_value.cnt.Dim(); offset+=mem::align_ptr(sizeof(size_t)*pkd_data.srf_value_cnt_size);
  3112. pkd_data.srf_value_dsp_offset=offset; pkd_data.srf_value_dsp_size=data.srf_value.dsp.Dim(); offset+=mem::align_ptr(sizeof(size_t)*pkd_data.srf_value_dsp_size);
  3113. pkd_data.trg_coord_cnt_offset=offset; pkd_data.trg_coord_cnt_size=data.trg_coord.cnt.Dim(); offset+=mem::align_ptr(sizeof(size_t)*pkd_data.trg_coord_cnt_size);
  3114. pkd_data.trg_coord_dsp_offset=offset; pkd_data.trg_coord_dsp_size=data.trg_coord.dsp.Dim(); offset+=mem::align_ptr(sizeof(size_t)*pkd_data.trg_coord_dsp_size);
  3115. pkd_data.trg_value_cnt_offset=offset; pkd_data.trg_value_cnt_size=data.trg_value.cnt.Dim(); offset+=mem::align_ptr(sizeof(size_t)*pkd_data.trg_value_cnt_size);
  3116. pkd_data.trg_value_dsp_offset=offset; pkd_data.trg_value_dsp_size=data.trg_value.dsp.Dim(); offset+=mem::align_ptr(sizeof(size_t)*pkd_data.trg_value_dsp_size);
  3117. InteracData& intdata=data.interac_data;
  3118. pkd_data. in_node_offset=offset; pkd_data. in_node_size=intdata. in_node.Dim(); offset+=mem::align_ptr(sizeof(size_t)*pkd_data. in_node_size);
  3119. pkd_data. scal_idx_offset=offset; pkd_data. scal_idx_size=intdata. scal_idx.Dim(); offset+=mem::align_ptr(sizeof(size_t)*pkd_data. scal_idx_size);
  3120. pkd_data.coord_shift_offset=offset; pkd_data.coord_shift_size=intdata.coord_shift.Dim(); offset+=mem::align_ptr(sizeof(Real_t)*pkd_data.coord_shift_size);
  3121. pkd_data.interac_cnt_offset=offset; pkd_data.interac_cnt_size=intdata.interac_cnt.Dim(); offset+=mem::align_ptr(sizeof(size_t)*pkd_data.interac_cnt_size);
  3122. pkd_data.interac_dsp_offset=offset; pkd_data.interac_dsp_size=intdata.interac_dsp.Dim(); offset+=mem::align_ptr(sizeof(size_t)*pkd_data.interac_dsp_size);
  3123. for(size_t i=0;i<4*MAX_DEPTH;i++){
  3124. pkd_data.scal_offset[i]=offset; pkd_data.scal_dim[i]=intdata.scal[i].Dim(); offset+=mem::align_ptr(sizeof(Real_t)*pkd_data.scal_dim[i]);
  3125. }
  3126. for(size_t i=0;i<4;i++){
  3127. size_t& Mdim0=pkd_data.Mdim[i][0];
  3128. size_t& Mdim1=pkd_data.Mdim[i][1];
  3129. pkd_data.M_offset[i]=offset; Mdim0=intdata.M[i].Dim(0); Mdim1=intdata.M[i].Dim(1); offset+=mem::align_ptr(sizeof(Real_t)*Mdim0*Mdim1);
  3130. }
  3131. pkd_data.size=offset;
  3132. }
  3133. { // Set setup_data.interac_data
  3134. Matrix<char>& buff=setup_data.interac_data;
  3135. if(pkd_data.size>buff.Dim(0)*buff.Dim(1)){
  3136. buff.ReInit(1,pkd_data.size);
  3137. }
  3138. ((PackedSetupData*)buff[0])[0]=pkd_data;
  3139. if(pkd_data.src_coord_cnt_size) memcpy(&buff[0][pkd_data.src_coord_cnt_offset], &data.src_coord.cnt[0], pkd_data.src_coord_cnt_size*sizeof(size_t));
  3140. if(pkd_data.src_coord_dsp_size) memcpy(&buff[0][pkd_data.src_coord_dsp_offset], &data.src_coord.dsp[0], pkd_data.src_coord_dsp_size*sizeof(size_t));
  3141. if(pkd_data.src_value_cnt_size) memcpy(&buff[0][pkd_data.src_value_cnt_offset], &data.src_value.cnt[0], pkd_data.src_value_cnt_size*sizeof(size_t));
  3142. if(pkd_data.src_value_dsp_size) memcpy(&buff[0][pkd_data.src_value_dsp_offset], &data.src_value.dsp[0], pkd_data.src_value_dsp_size*sizeof(size_t));
  3143. if(pkd_data.srf_coord_cnt_size) memcpy(&buff[0][pkd_data.srf_coord_cnt_offset], &data.srf_coord.cnt[0], pkd_data.srf_coord_cnt_size*sizeof(size_t));
  3144. if(pkd_data.srf_coord_dsp_size) memcpy(&buff[0][pkd_data.srf_coord_dsp_offset], &data.srf_coord.dsp[0], pkd_data.srf_coord_dsp_size*sizeof(size_t));
  3145. if(pkd_data.srf_value_cnt_size) memcpy(&buff[0][pkd_data.srf_value_cnt_offset], &data.srf_value.cnt[0], pkd_data.srf_value_cnt_size*sizeof(size_t));
  3146. if(pkd_data.srf_value_dsp_size) memcpy(&buff[0][pkd_data.srf_value_dsp_offset], &data.srf_value.dsp[0], pkd_data.srf_value_dsp_size*sizeof(size_t));
  3147. if(pkd_data.trg_coord_cnt_size) memcpy(&buff[0][pkd_data.trg_coord_cnt_offset], &data.trg_coord.cnt[0], pkd_data.trg_coord_cnt_size*sizeof(size_t));
  3148. if(pkd_data.trg_coord_dsp_size) memcpy(&buff[0][pkd_data.trg_coord_dsp_offset], &data.trg_coord.dsp[0], pkd_data.trg_coord_dsp_size*sizeof(size_t));
  3149. if(pkd_data.trg_value_cnt_size) memcpy(&buff[0][pkd_data.trg_value_cnt_offset], &data.trg_value.cnt[0], pkd_data.trg_value_cnt_size*sizeof(size_t));
  3150. if(pkd_data.trg_value_dsp_size) memcpy(&buff[0][pkd_data.trg_value_dsp_offset], &data.trg_value.dsp[0], pkd_data.trg_value_dsp_size*sizeof(size_t));
  3151. InteracData& intdata=data.interac_data;
  3152. if(pkd_data. in_node_size) memcpy(&buff[0][pkd_data. in_node_offset], &intdata. in_node[0], pkd_data. in_node_size*sizeof(size_t));
  3153. if(pkd_data. scal_idx_size) memcpy(&buff[0][pkd_data. scal_idx_offset], &intdata. scal_idx[0], pkd_data. scal_idx_size*sizeof(size_t));
  3154. if(pkd_data.coord_shift_size) memcpy(&buff[0][pkd_data.coord_shift_offset], &intdata.coord_shift[0], pkd_data.coord_shift_size*sizeof(Real_t));
  3155. if(pkd_data.interac_cnt_size) memcpy(&buff[0][pkd_data.interac_cnt_offset], &intdata.interac_cnt[0], pkd_data.interac_cnt_size*sizeof(size_t));
  3156. if(pkd_data.interac_dsp_size) memcpy(&buff[0][pkd_data.interac_dsp_offset], &intdata.interac_dsp[0], pkd_data.interac_dsp_size*sizeof(size_t));
  3157. for(size_t i=0;i<4*MAX_DEPTH;i++){
  3158. if(intdata.scal[i].Dim()) memcpy(&buff[0][pkd_data.scal_offset[i]], &intdata.scal[i][0], intdata.scal[i].Dim()*sizeof(Real_t));
  3159. }
  3160. for(size_t i=0;i<4;i++){
  3161. if(intdata.M[i].Dim(0)*intdata.M[i].Dim(1)) memcpy(&buff[0][pkd_data.M_offset[i]], &intdata.M[i][0][0], intdata.M[i].Dim(0)*intdata.M[i].Dim(1)*sizeof(Real_t));
  3162. }
  3163. }
  3164. }
  3165. { // Resize device buffer
  3166. size_t n=setup_data.output_data->Dim(0)*setup_data.output_data->Dim(1)*sizeof(Real_t);
  3167. if(this->dev_buffer.Dim()<n) this->dev_buffer.ReInit(n);
  3168. }
  3169. }
  3170. template <class FMMNode>
  3171. template <int SYNC>
  3172. void FMM_Pts<FMMNode>::EvalListPts(SetupData<Real_t>& setup_data, bool device){
  3173. if(setup_data.kernel->ker_dim[0]*setup_data.kernel->ker_dim[1]==0) return;
  3174. if(setup_data.interac_data.Dim(0)==0 || setup_data.interac_data.Dim(1)==0){
  3175. Profile::Tic("Host2Device",&this->comm,false,25);
  3176. Profile::Toc();
  3177. Profile::Tic("DeviceComp",&this->comm,false,20);
  3178. Profile::Toc();
  3179. return;
  3180. }
  3181. bool have_gpu=false;
  3182. #if defined(PVFMM_HAVE_CUDA)
  3183. have_gpu=true;
  3184. #endif
  3185. Profile::Tic("Host2Device",&this->comm,false,25);
  3186. typename Vector<char>::Device buff;
  3187. typename Matrix<char>::Device interac_data;
  3188. typename Matrix<Real_t>::Device coord_data;
  3189. typename Matrix<Real_t>::Device input_data;
  3190. typename Matrix<Real_t>::Device output_data;
  3191. size_t ptr_single_layer_kernel=(size_t)NULL;
  3192. size_t ptr_double_layer_kernel=(size_t)NULL;
  3193. if(device && !have_gpu){
  3194. buff = this-> dev_buffer. AllocDevice(false);
  3195. interac_data= setup_data.interac_data. AllocDevice(false);
  3196. if(setup_data. coord_data!=NULL) coord_data = setup_data. coord_data->AllocDevice(false);
  3197. if(setup_data. input_data!=NULL) input_data = setup_data. input_data->AllocDevice(false);
  3198. if(setup_data. output_data!=NULL) output_data = setup_data. output_data->AllocDevice(false);
  3199. ptr_single_layer_kernel=setup_data.kernel->dev_ker_poten;
  3200. ptr_double_layer_kernel=setup_data.kernel->dev_dbl_layer_poten;
  3201. }else{
  3202. buff = this-> cpu_buffer;
  3203. interac_data= setup_data.interac_data;
  3204. if(setup_data. coord_data!=NULL) coord_data =*setup_data. coord_data;
  3205. if(setup_data. input_data!=NULL) input_data =*setup_data. input_data;
  3206. if(setup_data. output_data!=NULL) output_data =*setup_data. output_data;
  3207. ptr_single_layer_kernel=(size_t)setup_data.kernel->ker_poten;
  3208. ptr_double_layer_kernel=(size_t)setup_data.kernel->dbl_layer_poten;
  3209. }
  3210. Profile::Toc();
  3211. Profile::Tic("DeviceComp",&this->comm,false,20);
  3212. int lock_idx=-1;
  3213. int wait_lock_idx=-1;
  3214. if(device) wait_lock_idx=MIC_Lock::curr_lock();
  3215. if(device) lock_idx=MIC_Lock::get_lock();
  3216. #ifdef __INTEL_OFFLOAD
  3217. #pragma offload if(device) target(mic:0) signal(&MIC_Lock::lock_vec[device?lock_idx:0])
  3218. #endif
  3219. { // Offloaded computation.
  3220. struct PackedData{
  3221. size_t len;
  3222. Matrix<Real_t>* ptr;
  3223. Vector<size_t> cnt;
  3224. Vector<size_t> dsp;
  3225. };
  3226. struct InteracData{
  3227. Vector<size_t> in_node;
  3228. Vector<size_t> scal_idx;
  3229. Vector<Real_t> coord_shift;
  3230. Vector<size_t> interac_cnt;
  3231. Vector<size_t> interac_dsp;
  3232. Vector<Real_t> scal[4*MAX_DEPTH];
  3233. Matrix<Real_t> M[4];
  3234. };
  3235. struct ptSetupData{
  3236. int level;
  3237. const Kernel<Real_t>* kernel;
  3238. PackedData src_coord; // Src coord
  3239. PackedData src_value; // Src density
  3240. PackedData srf_coord; // Srf coord
  3241. PackedData srf_value; // Srf density
  3242. PackedData trg_coord; // Trg coord
  3243. PackedData trg_value; // Trg potential
  3244. InteracData interac_data;
  3245. };
  3246. ptSetupData data;
  3247. { // Initialize data
  3248. struct PackedSetupData{
  3249. size_t size;
  3250. int level;
  3251. const Kernel<Real_t>* kernel;
  3252. Matrix<Real_t>* src_coord; // Src coord
  3253. Matrix<Real_t>* src_value; // Src density
  3254. Matrix<Real_t>* srf_coord; // Srf coord
  3255. Matrix<Real_t>* srf_value; // Srf density
  3256. Matrix<Real_t>* trg_coord; // Trg coord
  3257. Matrix<Real_t>* trg_value; // Trg potential
  3258. size_t src_coord_cnt_size; size_t src_coord_cnt_offset;
  3259. size_t src_coord_dsp_size; size_t src_coord_dsp_offset;
  3260. size_t src_value_cnt_size; size_t src_value_cnt_offset;
  3261. size_t src_value_dsp_size; size_t src_value_dsp_offset;
  3262. size_t srf_coord_cnt_size; size_t srf_coord_cnt_offset;
  3263. size_t srf_coord_dsp_size; size_t srf_coord_dsp_offset;
  3264. size_t srf_value_cnt_size; size_t srf_value_cnt_offset;
  3265. size_t srf_value_dsp_size; size_t srf_value_dsp_offset;
  3266. size_t trg_coord_cnt_size; size_t trg_coord_cnt_offset;
  3267. size_t trg_coord_dsp_size; size_t trg_coord_dsp_offset;
  3268. size_t trg_value_cnt_size; size_t trg_value_cnt_offset;
  3269. size_t trg_value_dsp_size; size_t trg_value_dsp_offset;
  3270. // interac_data
  3271. size_t in_node_size; size_t in_node_offset;
  3272. size_t scal_idx_size; size_t scal_idx_offset;
  3273. size_t coord_shift_size; size_t coord_shift_offset;
  3274. size_t interac_cnt_size; size_t interac_cnt_offset;
  3275. size_t interac_dsp_size; size_t interac_dsp_offset;
  3276. size_t scal_dim[4*MAX_DEPTH]; size_t scal_offset[4*MAX_DEPTH];
  3277. size_t Mdim[4][2]; size_t M_offset[4];
  3278. };
  3279. typename Matrix<char>::Device& setupdata=interac_data;
  3280. PackedSetupData& pkd_data=*((PackedSetupData*)setupdata[0]);
  3281. data. level=pkd_data. level;
  3282. data.kernel=pkd_data.kernel;
  3283. data.src_coord.ptr=pkd_data.src_coord;
  3284. data.src_value.ptr=pkd_data.src_value;
  3285. data.srf_coord.ptr=pkd_data.srf_coord;
  3286. data.srf_value.ptr=pkd_data.srf_value;
  3287. data.trg_coord.ptr=pkd_data.trg_coord;
  3288. data.trg_value.ptr=pkd_data.trg_value;
  3289. data.src_coord.cnt.ReInit(pkd_data.src_coord_cnt_size, (size_t*)&setupdata[0][pkd_data.src_coord_cnt_offset], false);
  3290. data.src_coord.dsp.ReInit(pkd_data.src_coord_dsp_size, (size_t*)&setupdata[0][pkd_data.src_coord_dsp_offset], false);
  3291. data.src_value.cnt.ReInit(pkd_data.src_value_cnt_size, (size_t*)&setupdata[0][pkd_data.src_value_cnt_offset], false);
  3292. data.src_value.dsp.ReInit(pkd_data.src_value_dsp_size, (size_t*)&setupdata[0][pkd_data.src_value_dsp_offset], false);
  3293. data.srf_coord.cnt.ReInit(pkd_data.srf_coord_cnt_size, (size_t*)&setupdata[0][pkd_data.srf_coord_cnt_offset], false);
  3294. data.srf_coord.dsp.ReInit(pkd_data.srf_coord_dsp_size, (size_t*)&setupdata[0][pkd_data.srf_coord_dsp_offset], false);
  3295. data.srf_value.cnt.ReInit(pkd_data.srf_value_cnt_size, (size_t*)&setupdata[0][pkd_data.srf_value_cnt_offset], false);
  3296. data.srf_value.dsp.ReInit(pkd_data.srf_value_dsp_size, (size_t*)&setupdata[0][pkd_data.srf_value_dsp_offset], false);
  3297. data.trg_coord.cnt.ReInit(pkd_data.trg_coord_cnt_size, (size_t*)&setupdata[0][pkd_data.trg_coord_cnt_offset], false);
  3298. data.trg_coord.dsp.ReInit(pkd_data.trg_coord_dsp_size, (size_t*)&setupdata[0][pkd_data.trg_coord_dsp_offset], false);
  3299. data.trg_value.cnt.ReInit(pkd_data.trg_value_cnt_size, (size_t*)&setupdata[0][pkd_data.trg_value_cnt_offset], false);
  3300. data.trg_value.dsp.ReInit(pkd_data.trg_value_dsp_size, (size_t*)&setupdata[0][pkd_data.trg_value_dsp_offset], false);
  3301. InteracData& intdata=data.interac_data;
  3302. intdata. in_node.ReInit(pkd_data. in_node_size, (size_t*)&setupdata[0][pkd_data. in_node_offset],false);
  3303. intdata. scal_idx.ReInit(pkd_data. scal_idx_size, (size_t*)&setupdata[0][pkd_data. scal_idx_offset],false);
  3304. intdata.coord_shift.ReInit(pkd_data.coord_shift_size, (Real_t*)&setupdata[0][pkd_data.coord_shift_offset],false);
  3305. intdata.interac_cnt.ReInit(pkd_data.interac_cnt_size, (size_t*)&setupdata[0][pkd_data.interac_cnt_offset],false);
  3306. intdata.interac_dsp.ReInit(pkd_data.interac_dsp_size, (size_t*)&setupdata[0][pkd_data.interac_dsp_offset],false);
  3307. for(size_t i=0;i<4*MAX_DEPTH;i++){
  3308. intdata.scal[i].ReInit(pkd_data.scal_dim[i], (Real_t*)&setupdata[0][pkd_data.scal_offset[i]],false);
  3309. }
  3310. for(size_t i=0;i<4;i++){
  3311. intdata.M[i].ReInit(pkd_data.Mdim[i][0], pkd_data.Mdim[i][1], (Real_t*)&setupdata[0][pkd_data.M_offset[i]],false);
  3312. }
  3313. }
  3314. if(device) MIC_Lock::wait_lock(wait_lock_idx);
  3315. { // Compute interactions
  3316. InteracData& intdata=data.interac_data;
  3317. typename Kernel<Real_t>::Ker_t single_layer_kernel=(typename Kernel<Real_t>::Ker_t)ptr_single_layer_kernel;
  3318. typename Kernel<Real_t>::Ker_t double_layer_kernel=(typename Kernel<Real_t>::Ker_t)ptr_double_layer_kernel;
  3319. int omp_p=omp_get_max_threads();
  3320. std::vector<Vector<Real_t> > thread_buff(omp_p);
  3321. { // Initialize thread_buff
  3322. size_t thread_buff_size=buff.dim/sizeof(Real_t)/omp_p;
  3323. for(int i=0;i<omp_p;i++) thread_buff[i].ReInit(thread_buff_size, (Real_t*)&buff[i*thread_buff_size*sizeof(Real_t)], false);
  3324. }
  3325. //#if !defined(__INTEL_COMPILER) || __INTEL_COMPILER!=1210 // bug in icpc-12.1.0
  3326. #pragma omp parallel for schedule(dynamic)
  3327. //#endif
  3328. for(size_t trg=0;trg<intdata.interac_cnt.Dim();trg++){
  3329. assert(trg<intdata.interac_cnt.Dim()); // assertion fails due to bug in icpc-12.1.0
  3330. int tid=omp_get_thread_num();
  3331. Matrix<Real_t> src_coord, src_value;
  3332. Matrix<Real_t> srf_coord, srf_value;
  3333. Matrix<Real_t> trg_coord, trg_value;
  3334. trg_coord.ReInit(1, data.trg_coord.cnt[trg], &data.trg_coord.ptr[0][0][data.trg_coord.dsp[trg]], false);
  3335. trg_value.ReInit(1, data.trg_value.cnt[trg], &data.trg_value.ptr[0][0][data.trg_value.dsp[trg]], false);
  3336. Vector<Real_t> buff;
  3337. Matrix<Real_t> vbuff[6];
  3338. buff.ReInit(thread_buff[tid].Dim(), &thread_buff[tid][0], false);
  3339. for(size_t indx=0;indx<6;indx++){ // init vbuff[0:5]
  3340. size_t vdim=0;
  3341. switch(indx){
  3342. case 0:
  3343. vdim=intdata.M[0].Dim(0); break;
  3344. case 1:
  3345. assert(intdata.M[0].Dim(1)==intdata.M[1].Dim(0));
  3346. vdim=intdata.M[0].Dim(1); break;
  3347. case 2:
  3348. vdim=intdata.M[1].Dim(1); break;
  3349. case 3:
  3350. vdim=intdata.M[2].Dim(0); break;
  3351. case 4:
  3352. assert(intdata.M[2].Dim(1)==intdata.M[3].Dim(0));
  3353. vdim=intdata.M[2].Dim(1); break;
  3354. case 5:
  3355. vdim=intdata.M[3].Dim(1); break;
  3356. default:
  3357. vdim=0; break;
  3358. }
  3359. vbuff[indx].ReInit(1,vdim,&buff[0],false);
  3360. assert(buff.Dim()>=vdim); // Thread buffer is too small
  3361. buff.ReInit(buff.Dim()-vdim, &buff[vdim], false);
  3362. }
  3363. for(size_t i=0;i<intdata.interac_cnt[trg];i++){
  3364. size_t int_id=intdata.interac_dsp[trg]+i;
  3365. size_t src=intdata.in_node[int_id];
  3366. src_coord.ReInit(1, data.src_coord.cnt[src], &data.src_coord.ptr[0][0][data.src_coord.dsp[src]], false);
  3367. src_value.ReInit(1, data.src_value.cnt[src], &data.src_value.ptr[0][0][data.src_value.dsp[src]], false);
  3368. srf_coord.ReInit(1, data.srf_coord.cnt[src], &data.srf_coord.ptr[0][0][data.srf_coord.dsp[src]], false);
  3369. srf_value.ReInit(1, data.srf_value.cnt[src], &data.srf_value.ptr[0][0][data.srf_value.dsp[src]], false);
  3370. { // coord_shift
  3371. Real_t* shift=&intdata.coord_shift[int_id*COORD_DIM];
  3372. if(shift[0]!=0 || shift[1]!=0 || shift[2]!=0){
  3373. size_t vdim=src_coord.Dim(1);
  3374. Vector<Real_t> new_coord(vdim, &buff[0], false);
  3375. assert(buff.Dim()>=vdim); // Thread buffer is too small
  3376. buff.ReInit(buff.Dim()-vdim, &buff[vdim], false);
  3377. for(size_t j=0;j<vdim;j+=COORD_DIM){
  3378. for(size_t k=0;k<COORD_DIM;k++){
  3379. new_coord[j+k]=src_coord[0][j+k]+shift[k];
  3380. }
  3381. }
  3382. src_coord.ReInit(1, vdim, &new_coord[0], false);
  3383. }
  3384. }
  3385. { // src mat-vec
  3386. if(intdata.M[0].Dim(0) && intdata.M[0].Dim(1) && intdata.M[1].Dim(0) && intdata.M[1].Dim(1)){
  3387. // Copy src_value to vbuff[0]
  3388. size_t vdim=vbuff[0].Dim(0)*vbuff[0].Dim(1);
  3389. assert(src_value.Dim(0)*src_value.Dim(1)==vdim);
  3390. for(size_t j=0;j<vdim;j++) vbuff[0][0][j]=src_value[0][j];
  3391. size_t scal_idx=intdata.scal_idx[int_id];
  3392. { // scaling
  3393. Matrix<Real_t>& vec=vbuff[0];
  3394. Vector<Real_t>& scal=intdata.scal[scal_idx*4+0];
  3395. size_t scal_dim=scal.Dim();
  3396. if(scal_dim){
  3397. size_t vdim=vec.Dim(0)*vec.Dim(1);
  3398. for(size_t j=0;j<vdim;j+=scal_dim){
  3399. for(size_t k=0;k<scal_dim;k++){
  3400. vec[0][j+k]*=scal[k];
  3401. }
  3402. }
  3403. }
  3404. }
  3405. Matrix<Real_t>::GEMM(vbuff[1],vbuff[0],intdata.M[0]);
  3406. Matrix<Real_t>::GEMM(vbuff[2],vbuff[1],intdata.M[1]);
  3407. { // scaling
  3408. Matrix<Real_t>& vec=vbuff[2];
  3409. Vector<Real_t>& scal=intdata.scal[scal_idx*4+1];
  3410. size_t scal_dim=scal.Dim();
  3411. if(scal_dim){
  3412. size_t vdim=vec.Dim(0)*vec.Dim(1);
  3413. for(size_t j=0;j<vdim;j+=scal_dim){
  3414. for(size_t k=0;k<scal_dim;k++){
  3415. vec[0][j+k]*=scal[k];
  3416. }
  3417. }
  3418. }
  3419. }
  3420. }else{
  3421. vbuff[2].ReInit(src_value.Dim(0), src_value.Dim(1), src_value[0], false);
  3422. }
  3423. }
  3424. { // init vbuff[3]
  3425. if(intdata.M[2].Dim(0) && intdata.M[2].Dim(1) && intdata.M[3].Dim(0) && intdata.M[3].Dim(1)){
  3426. size_t vdim=vbuff[3].Dim(0)*vbuff[3].Dim(1);
  3427. for(size_t i=0;i<vdim;i++) vbuff[0][0][i]=0;
  3428. }else{
  3429. vbuff[3].ReInit(trg_value.Dim(0), trg_value.Dim(1), trg_value[0], false);
  3430. }
  3431. }
  3432. if(src_coord.Dim(1)){
  3433. assert(ptr_single_layer_kernel); // assert(Single-layer kernel is implemented)
  3434. single_layer_kernel(src_coord[0], src_coord.Dim(1)/COORD_DIM, vbuff[2][0], 1,
  3435. trg_coord[0], trg_coord.Dim(1)/COORD_DIM, vbuff[3][0], NULL);
  3436. }
  3437. if(srf_coord.Dim(1)){
  3438. assert(ptr_double_layer_kernel); // assert(Double-layer kernel is implemented)
  3439. double_layer_kernel(srf_coord[0], srf_coord.Dim(1)/COORD_DIM, srf_value[0], 1,
  3440. trg_coord[0], trg_coord.Dim(1)/COORD_DIM, trg_value[0], NULL);
  3441. }
  3442. {// trg mat-vec
  3443. if(intdata.M[2].Dim(0) && intdata.M[2].Dim(1) && intdata.M[3].Dim(0) && intdata.M[3].Dim(1)){
  3444. size_t scal_idx=intdata.scal_idx[int_id];
  3445. { // scaling
  3446. Matrix<Real_t>& vec=vbuff[3];
  3447. Vector<Real_t>& scal=intdata.scal[scal_idx*4+2];
  3448. size_t scal_dim=scal.Dim();
  3449. if(scal_dim){
  3450. size_t vdim=vec.Dim(0)*vec.Dim(1);
  3451. for(size_t j=0;j<vdim;j+=scal_dim){
  3452. for(size_t k=0;k<scal_dim;k++){
  3453. vec[0][j+k]*=scal[k];
  3454. }
  3455. }
  3456. }
  3457. }
  3458. Matrix<Real_t>::GEMM(vbuff[4],vbuff[3],intdata.M[2]);
  3459. Matrix<Real_t>::GEMM(vbuff[5],vbuff[4],intdata.M[3]);
  3460. { // scaling
  3461. Matrix<Real_t>& vec=vbuff[5];
  3462. Vector<Real_t>& scal=intdata.scal[scal_idx*4+3];
  3463. size_t scal_dim=scal.Dim();
  3464. if(scal_dim){
  3465. size_t vdim=vec.Dim(0)*vec.Dim(1);
  3466. for(size_t j=0;j<vdim;j+=scal_dim){
  3467. for(size_t k=0;k<scal_dim;k++){
  3468. vec[0][j+k]*=scal[k];
  3469. }
  3470. }
  3471. }
  3472. }
  3473. // Add vbuff[5] to trg_value
  3474. size_t vdim=vbuff[5].Dim(0)*vbuff[5].Dim(1);
  3475. assert(trg_value.Dim(0)*trg_value.Dim(1)==vdim);
  3476. for(size_t i=0;i<vdim;i++) trg_value[0][i]+=vbuff[5][0][i];
  3477. }
  3478. }
  3479. }
  3480. }
  3481. }
  3482. if(device) MIC_Lock::release_lock(lock_idx);
  3483. }
  3484. #ifdef __INTEL_OFFLOAD
  3485. if(SYNC){
  3486. #pragma offload if(device) target(mic:0)
  3487. {if(device) MIC_Lock::wait_lock(lock_idx);}
  3488. }
  3489. #endif
  3490. Profile::Toc();
  3491. }
template <class FMMNode>
void FMM_Pts<FMMNode>::X_ListSetup(SetupData<Real_t>& setup_data, FMMTree_t* tree, std::vector<Matrix<Real_t> >& buff, std::vector<Vector<FMMNode_t*> >& n_list, int level, bool device){
// Build the point-interaction setup for the X-list (source leaves whose
// particles contribute to a target's downward-check surface, kernel k_s2l).
// Populates a ptSetupData with per-node coordinate/value views and the
// flattened interaction lists, then hands it to PtSetup for packing.
if(!this->MultipoleOrder()) return;
{ // Set setup_data
setup_data. level=level;
setup_data.kernel=kernel->k_s2l;
setup_data. input_data=&buff[4];
setup_data.output_data=&buff[1];
setup_data. coord_data=&buff[6];
Vector<FMMNode_t*>& nodes_in =n_list[4];
Vector<FMMNode_t*>& nodes_out=n_list[1];
setup_data.nodes_in .clear();
setup_data.nodes_out.clear();
// Sources: leaves with source points; targets: non-ghost nodes with target points.
for(size_t i=0;i<nodes_in .Dim();i++) if((level==0 || level==-1) && nodes_in [i]->pt_cnt[0] && nodes_in [i]->IsLeaf() ) setup_data.nodes_in .push_back(nodes_in [i]);
for(size_t i=0;i<nodes_out.Dim();i++) if((level==0 || level==-1) && nodes_out[i]->pt_cnt[1] && !nodes_out[i]->IsGhost()) setup_data.nodes_out.push_back(nodes_out[i]);
}
// Local mirrors of the structs PtSetup/EvalListPts use; layouts must agree.
struct PackedData{
size_t len;
Matrix<Real_t>* ptr;
Vector<size_t> cnt;
Vector<size_t> dsp;
};
struct InteracData{
Vector<size_t> in_node;
Vector<size_t> scal_idx;
Vector<Real_t> coord_shift;
Vector<size_t> interac_cnt;
Vector<size_t> interac_dsp;
Vector<Real_t> scal[4*MAX_DEPTH];
Matrix<Real_t> M[4];
};
struct ptSetupData{
int level;
const Kernel<Real_t>* kernel;
PackedData src_coord; // Src coord
PackedData src_value; // Src density
PackedData srf_coord; // Srf coord
PackedData srf_value; // Srf density
PackedData trg_coord; // Trg coord
PackedData trg_value; // Trg potential
InteracData interac_data;
};
ptSetupData data;
data. level=setup_data. level;
data.kernel=setup_data.kernel;
std::vector<void*>& nodes_in =setup_data.nodes_in ;
std::vector<void*>& nodes_out=setup_data.nodes_out;
{ // Set src data
// cnt/dsp views index into the global coord/value matrices; dsp is the
// pointer offset of each node's data within those matrices.
std::vector<void*>& nodes=nodes_in;
PackedData& coord=data.src_coord;
PackedData& value=data.src_value;
coord.ptr=setup_data. coord_data;
value.ptr=setup_data. input_data;
coord.len=coord.ptr->Dim(0)*coord.ptr->Dim(1);
value.len=value.ptr->Dim(0)*value.ptr->Dim(1);
coord.cnt.ReInit(nodes.size());
coord.dsp.ReInit(nodes.size());
value.cnt.ReInit(nodes.size());
value.dsp.ReInit(nodes.size());
#pragma omp parallel for
for(size_t i=0;i<nodes.size();i++){
// node_id records each source's index for the interaction-list pass below.
((FMMNode_t*)nodes[i])->node_id=i;
Vector<Real_t>& coord_vec=((FMMNode_t*)nodes[i])->src_coord;
Vector<Real_t>& value_vec=((FMMNode_t*)nodes[i])->src_value;
if(coord_vec.Dim()){
coord.dsp[i]=&coord_vec[0]-coord.ptr[0][0];
assert(coord.dsp[i]<coord.len);
coord.cnt[i]=coord_vec.Dim();
}else{
coord.dsp[i]=0;
coord.cnt[i]=0;
}
if(value_vec.Dim()){
value.dsp[i]=&value_vec[0]-value.ptr[0][0];
assert(value.dsp[i]<value.len);
value.cnt[i]=value_vec.Dim();
}else{
value.dsp[i]=0;
value.cnt[i]=0;
}
}
}
{ // Set srf data
// Same bookkeeping as src data, but for surface (double-layer) points.
std::vector<void*>& nodes=nodes_in;
PackedData& coord=data.srf_coord;
PackedData& value=data.srf_value;
coord.ptr=setup_data. coord_data;
value.ptr=setup_data. input_data;
coord.len=coord.ptr->Dim(0)*coord.ptr->Dim(1);
value.len=value.ptr->Dim(0)*value.ptr->Dim(1);
coord.cnt.ReInit(nodes.size());
coord.dsp.ReInit(nodes.size());
value.cnt.ReInit(nodes.size());
value.dsp.ReInit(nodes.size());
#pragma omp parallel for
for(size_t i=0;i<nodes.size();i++){
Vector<Real_t>& coord_vec=((FMMNode_t*)nodes[i])->surf_coord;
Vector<Real_t>& value_vec=((FMMNode_t*)nodes[i])->surf_value;
if(coord_vec.Dim()){
coord.dsp[i]=&coord_vec[0]-coord.ptr[0][0];
assert(coord.dsp[i]<coord.len);
coord.cnt[i]=coord_vec.Dim();
}else{
coord.dsp[i]=0;
coord.cnt[i]=0;
}
if(value_vec.Dim()){
value.dsp[i]=&value_vec[0]-value.ptr[0][0];
assert(value.dsp[i]<value.len);
value.cnt[i]=value_vec.Dim();
}else{
value.dsp[i]=0;
value.cnt[i]=0;
}
}
}
{ // Set trg data
// Targets are the per-depth downward-check surfaces; the output values are
// the nodes' downward-equivalent densities.
std::vector<void*>& nodes=nodes_out;
PackedData& coord=data.trg_coord;
PackedData& value=data.trg_value;
coord.ptr=setup_data. coord_data;
value.ptr=setup_data.output_data;
coord.len=coord.ptr->Dim(0)*coord.ptr->Dim(1);
value.len=value.ptr->Dim(0)*value.ptr->Dim(1);
coord.cnt.ReInit(nodes.size());
coord.dsp.ReInit(nodes.size());
value.cnt.ReInit(nodes.size());
value.dsp.ReInit(nodes.size());
#pragma omp parallel for
for(size_t i=0;i<nodes.size();i++){
Vector<Real_t>& coord_vec=tree->dnwd_check_surf[((FMMNode*)nodes[i])->Depth()];
Vector<Real_t>& value_vec=((FMMData*)((FMMNode*)nodes[i])->FMMData())->dnward_equiv;
if(coord_vec.Dim()){
coord.dsp[i]=&coord_vec[0]-coord.ptr[0][0];
assert(coord.dsp[i]<coord.len);
coord.cnt[i]=coord_vec.Dim();
}else{
coord.dsp[i]=0;
coord.cnt[i]=0;
}
if(value_vec.Dim()){
value.dsp[i]=&value_vec[0]-value.ptr[0][0];
assert(value.dsp[i]<value.len);
value.cnt[i]=value_vec.Dim();
}else{
value.dsp[i]=0;
value.cnt[i]=0;
}
}
}
{ // Set interac_data
// Each thread collects its slice of targets into private vectors; the
// per-thread results are merged afterwards in thread order.
int omp_p=omp_get_max_threads();
std::vector<std::vector<size_t> > in_node_(omp_p);
std::vector<std::vector<size_t> > scal_idx_(omp_p);
std::vector<std::vector<Real_t> > coord_shift_(omp_p);
std::vector<std::vector<size_t> > interac_cnt_(omp_p);
size_t m=this->MultipoleOrder();
// Nsrf: coord count of an equivalent surface; small leaves are cheaper via U-list.
size_t Nsrf=(6*(m-1)*(m-1)+2)*COORD_DIM;
#pragma omp parallel for
for(size_t tid=0;tid<omp_p;tid++){
std::vector<size_t>& in_node =in_node_[tid] ;
std::vector<size_t>& scal_idx =scal_idx_[tid] ;
std::vector<Real_t>& coord_shift=coord_shift_[tid];
std::vector<size_t>& interac_cnt=interac_cnt_[tid] ;
size_t a=(nodes_out.size()*(tid+0))/omp_p;
size_t b=(nodes_out.size()*(tid+1))/omp_p;
for(size_t i=a;i<b;i++){
FMMNode_t* tnode=(FMMNode_t*)nodes_out[i];
if(tnode->IsLeaf() && data.trg_coord.cnt[i]<=Nsrf){ // skip: handled in U-list
interac_cnt.push_back(0);
continue;
}
Real_t s=std::pow(0.5,tnode->Depth());
size_t interac_cnt_=0;
{ // X_Type
Mat_Type type=X_Type;
Vector<FMMNode_t*>& intlst=tnode->interac_list[type];
for(size_t j=0;j<intlst.Dim();j++) if(intlst[j]){
FMMNode_t* snode=intlst[j];
size_t snode_id=snode->node_id;
// Skip sources that were filtered out of nodes_in (stale node_id).
if(snode_id>=nodes_in.size() || nodes_in[snode_id]!=snode) continue;
in_node.push_back(snode_id);
scal_idx.push_back(snode->Depth());
{ // set coord_shift
const int* rel_coord=interac_list.RelativeCoord(type,j);
const Real_t* scoord=snode->Coord();
const Real_t* tcoord=tnode->Coord();
Real_t shift[COORD_DIM];
// NOTE(review): tcoord is declared but unused; the (0+0.5*s) terms
// presumably place the target at the origin because dnwd_check_surf
// templates are origin-centered — confirm against the W/U-list setups.
shift[0]=rel_coord[0]*0.5*s-(scoord[0]+1.0*s)+(0+0.5*s);
shift[1]=rel_coord[1]*0.5*s-(scoord[1]+1.0*s)+(0+0.5*s);
shift[2]=rel_coord[2]*0.5*s-(scoord[2]+1.0*s)+(0+0.5*s);
coord_shift.push_back(shift[0]);
coord_shift.push_back(shift[1]);
coord_shift.push_back(shift[2]);
}
interac_cnt_++;
}
}
interac_cnt.push_back(interac_cnt_);
}
}
{ // Combine interac data
// Concatenate per-thread vectors into the shared InteracData arrays.
InteracData& interac_data=data.interac_data;
{ // in_node
typedef size_t ElemType;
std::vector<std::vector<ElemType> >& vec_=in_node_;
pvfmm::Vector<ElemType>& vec=interac_data.in_node;
std::vector<size_t> vec_dsp(omp_p+1,0);
for(size_t tid=0;tid<omp_p;tid++){
vec_dsp[tid+1]=vec_dsp[tid]+vec_[tid].size();
}
vec.ReInit(vec_dsp[omp_p]);
#pragma omp parallel for
for(size_t tid=0;tid<omp_p;tid++){
memcpy(&vec[0]+vec_dsp[tid],&vec_[tid][0],vec_[tid].size()*sizeof(ElemType));
}
}
{ // scal_idx
typedef size_t ElemType;
std::vector<std::vector<ElemType> >& vec_=scal_idx_;
pvfmm::Vector<ElemType>& vec=interac_data.scal_idx;
std::vector<size_t> vec_dsp(omp_p+1,0);
for(size_t tid=0;tid<omp_p;tid++){
vec_dsp[tid+1]=vec_dsp[tid]+vec_[tid].size();
}
vec.ReInit(vec_dsp[omp_p]);
#pragma omp parallel for
for(size_t tid=0;tid<omp_p;tid++){
memcpy(&vec[0]+vec_dsp[tid],&vec_[tid][0],vec_[tid].size()*sizeof(ElemType));
}
}
{ // coord_shift
typedef Real_t ElemType;
std::vector<std::vector<ElemType> >& vec_=coord_shift_;
pvfmm::Vector<ElemType>& vec=interac_data.coord_shift;
std::vector<size_t> vec_dsp(omp_p+1,0);
for(size_t tid=0;tid<omp_p;tid++){
vec_dsp[tid+1]=vec_dsp[tid]+vec_[tid].size();
}
vec.ReInit(vec_dsp[omp_p]);
#pragma omp parallel for
for(size_t tid=0;tid<omp_p;tid++){
memcpy(&vec[0]+vec_dsp[tid],&vec_[tid][0],vec_[tid].size()*sizeof(ElemType));
}
}
{ // interac_cnt
typedef size_t ElemType;
std::vector<std::vector<ElemType> >& vec_=interac_cnt_;
pvfmm::Vector<ElemType>& vec=interac_data.interac_cnt;
std::vector<size_t> vec_dsp(omp_p+1,0);
for(size_t tid=0;tid<omp_p;tid++){
vec_dsp[tid+1]=vec_dsp[tid]+vec_[tid].size();
}
vec.ReInit(vec_dsp[omp_p]);
#pragma omp parallel for
for(size_t tid=0;tid<omp_p;tid++){
memcpy(&vec[0]+vec_dsp[tid],&vec_[tid][0],vec_[tid].size()*sizeof(ElemType));
}
}
{ // interac_dsp: exclusive prefix sum of interac_cnt.
pvfmm::Vector<size_t>& cnt=interac_data.interac_cnt;
pvfmm::Vector<size_t>& dsp=interac_data.interac_dsp;
dsp.ReInit(cnt.Dim()); if(dsp.Dim()) dsp[0]=0;
omp_par::scan(&cnt[0],&dsp[0],dsp.Dim());
}
}
}
// Pack the assembled data into setup_data.interac_data for EvalListPts.
PtSetup(setup_data, &data);
}
  3761. template <class FMMNode>
  3762. void FMM_Pts<FMMNode>::X_List (SetupData<Real_t>& setup_data, bool device){
  3763. if(!this->MultipoleOrder()) return;
  3764. //Add X_List contribution.
  3765. this->EvalListPts(setup_data, device);
  3766. }
// Set up the W-list interactions (multipole-to-target, kernel k_m2t): for each
// non-ghost target leaf, the upward (multipole) equivalent densities of the
// nodes in its W-interaction list are evaluated directly at its target points.
// This routine only gathers metadata -- element offsets into the shared matrix
// buffers, per-interaction coordinate shifts, interaction counts -- into a
// ptSetupData record and forwards it to PtSetup; the actual evaluation happens
// later in W_List / EvalListPts.
//
// setup_data : in/out descriptor for this phase (kernel, buffers, node lists).
// tree       : FMM tree; provides the per-depth upward equivalent surfaces.
// buff       : shared matrix buffers; [0]=multipole densities (input),
//              [5]=target potentials (output), [6]=point coordinates.
// n_list     : candidate node lists; [0]=sources, [5]=targets.
// level      : level to set up (only level==0 or level==-1 selects nodes here).
// device     : forwarded to PtSetup (device/GPU path).
template <class FMMNode>
void FMM_Pts<FMMNode>::W_ListSetup(SetupData<Real_t>& setup_data, FMMTree_t* tree, std::vector<Matrix<Real_t> >& buff, std::vector<Vector<FMMNode_t*> >& n_list, int level, bool device){
if(!this->MultipoleOrder()) return; // no multipole expansion => no W-list work
{ // Set setup_data
setup_data. level=level;
setup_data.kernel=kernel->k_m2t;
setup_data. input_data=&buff[0];
setup_data.output_data=&buff[5];
setup_data. coord_data=&buff[6];
Vector<FMMNode_t*>& nodes_in =n_list[0];
Vector<FMMNode_t*>& nodes_out=n_list[5];
setup_data.nodes_in .clear();
setup_data.nodes_out.clear();
for(size_t i=0;i<nodes_in .Dim();i++) if((level==0 || level==-1) && nodes_in [i]->pt_cnt[0] ) setup_data.nodes_in .push_back(nodes_in [i]);
for(size_t i=0;i<nodes_out.Dim();i++) if((level==0 || level==-1) && nodes_out[i]->pt_cnt[1] && nodes_out[i]->IsLeaf() && !nodes_out[i]->IsGhost()) setup_data.nodes_out.push_back(nodes_out[i]);
}
// NOTE(review): these local struct definitions must stay layout-compatible
// with the ones PtSetup/EvalListPts use to reinterpret the packed data --
// keep them in sync with the other *Setup routines in this file.
struct PackedData{
size_t len;            // total element count of the backing matrix
Matrix<Real_t>* ptr;   // shared backing matrix
Vector<size_t> cnt;    // per-node element count
Vector<size_t> dsp;    // per-node element offset into *ptr
};
struct InteracData{
Vector<size_t> in_node;     // source-node index per interaction
Vector<size_t> scal_idx;    // scaling index (source depth) per interaction
Vector<Real_t> coord_shift; // 3 reals per interaction: source-coord shift
Vector<size_t> interac_cnt; // interactions per target node
Vector<size_t> interac_dsp; // exclusive prefix sum of interac_cnt
Vector<Real_t> scal[4*MAX_DEPTH];
Matrix<Real_t> M[4];
};
struct ptSetupData{
int level;
const Kernel<Real_t>* kernel;
PackedData src_coord; // Src coord
PackedData src_value; // Src density
PackedData srf_coord; // Srf coord
PackedData srf_value; // Srf density
PackedData trg_coord; // Trg coord
PackedData trg_value; // Trg potential
InteracData interac_data;
};
ptSetupData data;
data. level=setup_data. level;
data.kernel=setup_data.kernel;
std::vector<void*>& nodes_in =setup_data.nodes_in ;
std::vector<void*>& nodes_out=setup_data.nodes_out;
{ // Set src data: multipole equivalent surface (per depth) and upward densities.
std::vector<void*>& nodes=nodes_in;
PackedData& coord=data.src_coord;
PackedData& value=data.src_value;
coord.ptr=setup_data. coord_data;
value.ptr=setup_data. input_data;
coord.len=coord.ptr->Dim(0)*coord.ptr->Dim(1);
value.len=value.ptr->Dim(0)*value.ptr->Dim(1);
coord.cnt.ReInit(nodes.size());
coord.dsp.ReInit(nodes.size());
value.cnt.ReInit(nodes.size());
value.dsp.ReInit(nodes.size());
#pragma omp parallel for
for(size_t i=0;i<nodes.size();i++){
((FMMNode_t*)nodes[i])->node_id=i; // record index; used when scanning interaction lists below
Vector<Real_t>& coord_vec=tree->upwd_equiv_surf[((FMMNode*)nodes[i])->Depth()];
Vector<Real_t>& value_vec=((FMMData*)((FMMNode*)nodes[i])->FMMData())->upward_equiv;
// dsp is the element offset of this node's data inside the shared matrix
// (pointer difference relative to the matrix's first element).
if(coord_vec.Dim()){
coord.dsp[i]=&coord_vec[0]-coord.ptr[0][0];
assert(coord.dsp[i]<coord.len);
coord.cnt[i]=coord_vec.Dim();
}else{
coord.dsp[i]=0;
coord.cnt[i]=0;
}
if(value_vec.Dim()){
value.dsp[i]=&value_vec[0]-value.ptr[0][0];
assert(value.dsp[i]<value.len);
value.cnt[i]=value_vec.Dim();
}else{
value.dsp[i]=0;
value.cnt[i]=0;
}
}
}
{ // Set srf data: the W-list has no surface-correction sources; all counts stay zero.
std::vector<void*>& nodes=nodes_in;
PackedData& coord=data.srf_coord;
PackedData& value=data.srf_value;
coord.ptr=setup_data. coord_data;
value.ptr=setup_data. input_data;
coord.len=coord.ptr->Dim(0)*coord.ptr->Dim(1);
value.len=value.ptr->Dim(0)*value.ptr->Dim(1);
coord.cnt.ReInit(nodes.size());
coord.dsp.ReInit(nodes.size());
value.cnt.ReInit(nodes.size());
value.dsp.ReInit(nodes.size());
#pragma omp parallel for
for(size_t i=0;i<nodes.size();i++){
coord.dsp[i]=0;
coord.cnt[i]=0;
value.dsp[i]=0;
value.cnt[i]=0;
}
}
{ // Set trg data: target point coordinates and output potential slots.
std::vector<void*>& nodes=nodes_out;
PackedData& coord=data.trg_coord;
PackedData& value=data.trg_value;
coord.ptr=setup_data. coord_data;
value.ptr=setup_data.output_data;
coord.len=coord.ptr->Dim(0)*coord.ptr->Dim(1);
value.len=value.ptr->Dim(0)*value.ptr->Dim(1);
coord.cnt.ReInit(nodes.size());
coord.dsp.ReInit(nodes.size());
value.cnt.ReInit(nodes.size());
value.dsp.ReInit(nodes.size());
#pragma omp parallel for
for(size_t i=0;i<nodes.size();i++){
Vector<Real_t>& coord_vec=((FMMNode_t*)nodes[i])->trg_coord;
Vector<Real_t>& value_vec=((FMMNode_t*)nodes[i])->trg_value;
if(coord_vec.Dim()){
coord.dsp[i]=&coord_vec[0]-coord.ptr[0][0];
assert(coord.dsp[i]<coord.len);
coord.cnt[i]=coord_vec.Dim();
}else{
coord.dsp[i]=0;
coord.cnt[i]=0;
}
if(value_vec.Dim()){
value.dsp[i]=&value_vec[0]-value.ptr[0][0];
assert(value.dsp[i]<value.len);
value.cnt[i]=value_vec.Dim();
}else{
value.dsp[i]=0;
value.cnt[i]=0;
}
}
}
{ // Set interac_data: per-target interaction lists, built thread-locally then merged.
int omp_p=omp_get_max_threads();
std::vector<std::vector<size_t> > in_node_(omp_p);
std::vector<std::vector<size_t> > scal_idx_(omp_p);
std::vector<std::vector<Real_t> > coord_shift_(omp_p);
std::vector<std::vector<size_t> > interac_cnt_(omp_p);
size_t m=this->MultipoleOrder();
// Nsrf = element count of the order-m equivalent surface: (6(m-1)^2+2) points x COORD_DIM.
// Leaf sources with at most this many actual point-coordinates are cheaper
// to handle directly and are deferred to the U-list (see skip below).
size_t Nsrf=(6*(m-1)*(m-1)+2)*COORD_DIM;
#pragma omp parallel for
for(size_t tid=0;tid<omp_p;tid++){
std::vector<size_t>& in_node =in_node_[tid] ;
std::vector<size_t>& scal_idx =scal_idx_[tid] ;
std::vector<Real_t>& coord_shift=coord_shift_[tid];
std::vector<size_t>& interac_cnt=interac_cnt_[tid] ;
size_t a=(nodes_out.size()*(tid+0))/omp_p; // static partition of targets over threads
size_t b=(nodes_out.size()*(tid+1))/omp_p;
for(size_t i=a;i<b;i++){
FMMNode_t* tnode=(FMMNode_t*)nodes_out[i];
Real_t s=std::pow(0.5,tnode->Depth()); // side length of the target node's box
size_t interac_cnt_=0;
{ // W_Type
Mat_Type type=W_Type;
Vector<FMMNode_t*>& intlst=tnode->interac_list[type];
for(size_t j=0;j<intlst.Dim();j++) if(intlst[j]){
FMMNode_t* snode=intlst[j];
size_t snode_id=snode->node_id;
if(snode_id>=nodes_in.size() || nodes_in[snode_id]!=snode) continue; // source not in this phase's input set
if(snode->IsLeaf() && data.src_coord.cnt[snode_id]+data.srf_coord.cnt[snode_id]<=Nsrf) continue; // skip: handled in U-list
in_node.push_back(snode_id);
scal_idx.push_back(snode->Depth());
{ // set coord_shift
const int* rel_coord=interac_list.RelativeCoord(type,j);
// scoord is fetched for symmetry with the other setups but is not used
// here: the source coords are the per-depth template equivalent surface,
// presumably anchored at the origin -- hence 0 in place of scoord[k].
const Real_t* scoord=snode->Coord();
const Real_t* tcoord=tnode->Coord();
Real_t shift[COORD_DIM];
shift[0]=rel_coord[0]*0.25*s-(0+0.25*s)+(tcoord[0]+0.5*s);
shift[1]=rel_coord[1]*0.25*s-(0+0.25*s)+(tcoord[1]+0.5*s);
shift[2]=rel_coord[2]*0.25*s-(0+0.25*s)+(tcoord[2]+0.5*s);
coord_shift.push_back(shift[0]);
coord_shift.push_back(shift[1]);
coord_shift.push_back(shift[2]);
}
interac_cnt_++;
}
}
interac_cnt.push_back(interac_cnt_);
}
}
{ // Combine interac data: concatenate per-thread vectors into contiguous arrays.
InteracData& interac_data=data.interac_data;
{ // in_node
typedef size_t ElemType;
std::vector<std::vector<ElemType> >& vec_=in_node_;
pvfmm::Vector<ElemType>& vec=interac_data.in_node;
std::vector<size_t> vec_dsp(omp_p+1,0);
for(size_t tid=0;tid<omp_p;tid++){
vec_dsp[tid+1]=vec_dsp[tid]+vec_[tid].size();
}
vec.ReInit(vec_dsp[omp_p]);
#pragma omp parallel for
for(size_t tid=0;tid<omp_p;tid++){
memcpy(&vec[0]+vec_dsp[tid],&vec_[tid][0],vec_[tid].size()*sizeof(ElemType));
}
}
{ // scal_idx
typedef size_t ElemType;
std::vector<std::vector<ElemType> >& vec_=scal_idx_;
pvfmm::Vector<ElemType>& vec=interac_data.scal_idx;
std::vector<size_t> vec_dsp(omp_p+1,0);
for(size_t tid=0;tid<omp_p;tid++){
vec_dsp[tid+1]=vec_dsp[tid]+vec_[tid].size();
}
vec.ReInit(vec_dsp[omp_p]);
#pragma omp parallel for
for(size_t tid=0;tid<omp_p;tid++){
memcpy(&vec[0]+vec_dsp[tid],&vec_[tid][0],vec_[tid].size()*sizeof(ElemType));
}
}
{ // coord_shift
typedef Real_t ElemType;
std::vector<std::vector<ElemType> >& vec_=coord_shift_;
pvfmm::Vector<ElemType>& vec=interac_data.coord_shift;
std::vector<size_t> vec_dsp(omp_p+1,0);
for(size_t tid=0;tid<omp_p;tid++){
vec_dsp[tid+1]=vec_dsp[tid]+vec_[tid].size();
}
vec.ReInit(vec_dsp[omp_p]);
#pragma omp parallel for
for(size_t tid=0;tid<omp_p;tid++){
memcpy(&vec[0]+vec_dsp[tid],&vec_[tid][0],vec_[tid].size()*sizeof(ElemType));
}
}
{ // interac_cnt
typedef size_t ElemType;
std::vector<std::vector<ElemType> >& vec_=interac_cnt_;
pvfmm::Vector<ElemType>& vec=interac_data.interac_cnt;
std::vector<size_t> vec_dsp(omp_p+1,0);
for(size_t tid=0;tid<omp_p;tid++){
vec_dsp[tid+1]=vec_dsp[tid]+vec_[tid].size();
}
vec.ReInit(vec_dsp[omp_p]);
#pragma omp parallel for
for(size_t tid=0;tid<omp_p;tid++){
memcpy(&vec[0]+vec_dsp[tid],&vec_[tid][0],vec_[tid].size()*sizeof(ElemType));
}
}
{ // interac_dsp: exclusive prefix sum of interac_cnt.
pvfmm::Vector<size_t>& cnt=interac_data.interac_cnt;
pvfmm::Vector<size_t>& dsp=interac_data.interac_dsp;
dsp.ReInit(cnt.Dim()); if(dsp.Dim()) dsp[0]=0;
omp_par::scan(&cnt[0],&dsp[0],dsp.Dim());
}
}
}
PtSetup(setup_data, &data);
}
  4019. template <class FMMNode>
  4020. void FMM_Pts<FMMNode>::W_List (SetupData<Real_t>& setup_data, bool device){
  4021. if(!this->MultipoleOrder()) return;
  4022. //Add W_List contribution.
  4023. this->EvalListPts(setup_data, device);
  4024. }
// Set up the U-list (direct particle-to-particle, kernel k_s2t) interactions:
// for each non-ghost target leaf, the actual source points of nearby leaves
// act directly on its target points. Besides the adjacent-neighbor lists
// (U0 = coarser, U1 = same level, U2 = finer), two special cases are folded
// in because direct evaluation is cheaper than going through expansions:
// X-list sources when the target has few points (<= Nsrf), and W-list sources
// when the source has few points (<= Nsrf; the complement of the skip in
// W_ListSetup). Only metadata (buffer offsets, shifts, counts) is gathered
// here and passed to PtSetup; evaluation happens later in U_List/EvalListPts.
//
// setup_data : in/out descriptor for this phase (kernel, buffers, node lists).
// tree       : FMM tree (unused directly here; kept for a uniform signature).
// buff       : shared matrix buffers; [4]=source/surface densities (input),
//              [5]=target potentials (output), [6]=point coordinates.
// n_list     : candidate node lists; [4]=source leaves, [5]=target leaves.
// level      : level to set up (only level==0 or level==-1 selects nodes here).
// device     : forwarded to PtSetup (device/GPU path).
template <class FMMNode>
void FMM_Pts<FMMNode>::U_ListSetup(SetupData<Real_t>& setup_data, FMMTree_t* tree, std::vector<Matrix<Real_t> >& buff, std::vector<Vector<FMMNode_t*> >& n_list, int level, bool device){
{ // Set setup_data
setup_data. level=level;
setup_data.kernel=kernel->k_s2t;
setup_data. input_data=&buff[4];
setup_data.output_data=&buff[5];
setup_data. coord_data=&buff[6];
Vector<FMMNode_t*>& nodes_in =n_list[4];
Vector<FMMNode_t*>& nodes_out=n_list[5];
setup_data.nodes_in .clear();
setup_data.nodes_out.clear();
for(size_t i=0;i<nodes_in .Dim();i++) if((level==0 || level==-1) && nodes_in [i]->pt_cnt[0] && nodes_in [i]->IsLeaf() ) setup_data.nodes_in .push_back(nodes_in [i]);
for(size_t i=0;i<nodes_out.Dim();i++) if((level==0 || level==-1) && nodes_out[i]->pt_cnt[1] && nodes_out[i]->IsLeaf() && !nodes_out[i]->IsGhost()) setup_data.nodes_out.push_back(nodes_out[i]);
}
// NOTE(review): these local struct definitions must stay layout-compatible
// with the ones PtSetup/EvalListPts use to reinterpret the packed data --
// keep them in sync with the other *Setup routines in this file.
struct PackedData{
size_t len;            // total element count of the backing matrix
Matrix<Real_t>* ptr;   // shared backing matrix
Vector<size_t> cnt;    // per-node element count
Vector<size_t> dsp;    // per-node element offset into *ptr
};
struct InteracData{
Vector<size_t> in_node;     // source-node index per interaction
Vector<size_t> scal_idx;    // scaling index (source depth) per interaction
Vector<Real_t> coord_shift; // 3 reals per interaction: source-coord shift
Vector<size_t> interac_cnt; // interactions per target node
Vector<size_t> interac_dsp; // exclusive prefix sum of interac_cnt
Vector<Real_t> scal[4*MAX_DEPTH];
Matrix<Real_t> M[4];
};
struct ptSetupData{
int level;
const Kernel<Real_t>* kernel;
PackedData src_coord; // Src coord
PackedData src_value; // Src density
PackedData srf_coord; // Srf coord
PackedData srf_value; // Srf density
PackedData trg_coord; // Trg coord
PackedData trg_value; // Trg potential
InteracData interac_data;
};
ptSetupData data;
data. level=setup_data. level;
data.kernel=setup_data.kernel;
std::vector<void*>& nodes_in =setup_data.nodes_in ;
std::vector<void*>& nodes_out=setup_data.nodes_out;
{ // Set src data: actual source points and densities of each input leaf.
std::vector<void*>& nodes=nodes_in;
PackedData& coord=data.src_coord;
PackedData& value=data.src_value;
coord.ptr=setup_data. coord_data;
value.ptr=setup_data. input_data;
coord.len=coord.ptr->Dim(0)*coord.ptr->Dim(1);
value.len=value.ptr->Dim(0)*value.ptr->Dim(1);
coord.cnt.ReInit(nodes.size());
coord.dsp.ReInit(nodes.size());
value.cnt.ReInit(nodes.size());
value.dsp.ReInit(nodes.size());
#pragma omp parallel for
for(size_t i=0;i<nodes.size();i++){
((FMMNode_t*)nodes[i])->node_id=i; // record index; used when scanning interaction lists below
Vector<Real_t>& coord_vec=((FMMNode_t*)nodes[i])->src_coord;
Vector<Real_t>& value_vec=((FMMNode_t*)nodes[i])->src_value;
// dsp is the element offset of this node's data inside the shared matrix
// (pointer difference relative to the matrix's first element).
if(coord_vec.Dim()){
coord.dsp[i]=&coord_vec[0]-coord.ptr[0][0];
assert(coord.dsp[i]<coord.len);
coord.cnt[i]=coord_vec.Dim();
}else{
coord.dsp[i]=0;
coord.cnt[i]=0;
}
if(value_vec.Dim()){
value.dsp[i]=&value_vec[0]-value.ptr[0][0];
assert(value.dsp[i]<value.len);
value.cnt[i]=value_vec.Dim();
}else{
value.dsp[i]=0;
value.cnt[i]=0;
}
}
}
{ // Set srf data: surface (single/double-layer) points and densities of each input leaf.
std::vector<void*>& nodes=nodes_in;
PackedData& coord=data.srf_coord;
PackedData& value=data.srf_value;
coord.ptr=setup_data. coord_data;
value.ptr=setup_data. input_data;
coord.len=coord.ptr->Dim(0)*coord.ptr->Dim(1);
value.len=value.ptr->Dim(0)*value.ptr->Dim(1);
coord.cnt.ReInit(nodes.size());
coord.dsp.ReInit(nodes.size());
value.cnt.ReInit(nodes.size());
value.dsp.ReInit(nodes.size());
#pragma omp parallel for
for(size_t i=0;i<nodes.size();i++){
Vector<Real_t>& coord_vec=((FMMNode_t*)nodes[i])->surf_coord;
Vector<Real_t>& value_vec=((FMMNode_t*)nodes[i])->surf_value;
if(coord_vec.Dim()){
coord.dsp[i]=&coord_vec[0]-coord.ptr[0][0];
assert(coord.dsp[i]<coord.len);
coord.cnt[i]=coord_vec.Dim();
}else{
coord.dsp[i]=0;
coord.cnt[i]=0;
}
if(value_vec.Dim()){
value.dsp[i]=&value_vec[0]-value.ptr[0][0];
assert(value.dsp[i]<value.len);
value.cnt[i]=value_vec.Dim();
}else{
value.dsp[i]=0;
value.cnt[i]=0;
}
}
}
{ // Set trg data: target point coordinates and output potential slots.
std::vector<void*>& nodes=nodes_out;
PackedData& coord=data.trg_coord;
PackedData& value=data.trg_value;
coord.ptr=setup_data. coord_data;
value.ptr=setup_data.output_data;
coord.len=coord.ptr->Dim(0)*coord.ptr->Dim(1);
value.len=value.ptr->Dim(0)*value.ptr->Dim(1);
coord.cnt.ReInit(nodes.size());
coord.dsp.ReInit(nodes.size());
value.cnt.ReInit(nodes.size());
value.dsp.ReInit(nodes.size());
#pragma omp parallel for
for(size_t i=0;i<nodes.size();i++){
Vector<Real_t>& coord_vec=((FMMNode_t*)nodes[i])->trg_coord;
Vector<Real_t>& value_vec=((FMMNode_t*)nodes[i])->trg_value;
if(coord_vec.Dim()){
coord.dsp[i]=&coord_vec[0]-coord.ptr[0][0];
assert(coord.dsp[i]<coord.len);
coord.cnt[i]=coord_vec.Dim();
}else{
coord.dsp[i]=0;
coord.cnt[i]=0;
}
if(value_vec.Dim()){
value.dsp[i]=&value_vec[0]-value.ptr[0][0];
assert(value.dsp[i]<value.len);
value.cnt[i]=value_vec.Dim();
}else{
value.dsp[i]=0;
value.cnt[i]=0;
}
}
}
{ // Set interac_data: per-target interaction lists, built thread-locally then merged.
int omp_p=omp_get_max_threads();
std::vector<std::vector<size_t> > in_node_(omp_p);
std::vector<std::vector<size_t> > scal_idx_(omp_p);
std::vector<std::vector<Real_t> > coord_shift_(omp_p);
std::vector<std::vector<size_t> > interac_cnt_(omp_p);
size_t m=this->MultipoleOrder();
// Nsrf = element count of the order-m equivalent surface: (6(m-1)^2+2) points x COORD_DIM.
// Used as the crossover threshold between direct evaluation and expansions.
// NOTE(review): when m==0, (m-1) underflows (size_t) and Nsrf becomes huge,
// so ALL X/W interactions are handled directly here -- consistent with the
// early returns in the expansion-based setups; confirm this is intentional.
size_t Nsrf=(6*(m-1)*(m-1)+2)*COORD_DIM;
#pragma omp parallel for
for(size_t tid=0;tid<omp_p;tid++){
std::vector<size_t>& in_node =in_node_[tid] ;
std::vector<size_t>& scal_idx =scal_idx_[tid] ;
std::vector<Real_t>& coord_shift=coord_shift_[tid];
std::vector<size_t>& interac_cnt=interac_cnt_[tid] ;
size_t a=(nodes_out.size()*(tid+0))/omp_p; // static partition of targets over threads
size_t b=(nodes_out.size()*(tid+1))/omp_p;
for(size_t i=a;i<b;i++){
FMMNode_t* tnode=(FMMNode_t*)nodes_out[i];
Real_t s=std::pow(0.5,tnode->Depth()); // side length of the target node's box
size_t interac_cnt_=0;
{ // U0_Type: neighbors one level coarser than the target.
Mat_Type type=U0_Type;
Vector<FMMNode_t*>& intlst=tnode->interac_list[type];
for(size_t j=0;j<intlst.Dim();j++) if(intlst[j]){
FMMNode_t* snode=intlst[j];
size_t snode_id=snode->node_id;
if(snode_id>=nodes_in.size() || nodes_in[snode_id]!=snode) continue; // source not in this phase's input set
in_node.push_back(snode_id);
scal_idx.push_back(snode->Depth());
{ // set coord_shift
const int* rel_coord=interac_list.RelativeCoord(type,j);
const Real_t* scoord=snode->Coord();
const Real_t* tcoord=tnode->Coord();
Real_t shift[COORD_DIM];
shift[0]=rel_coord[0]*0.5*s-(scoord[0]+1.0*s)+(tcoord[0]+0.5*s);
shift[1]=rel_coord[1]*0.5*s-(scoord[1]+1.0*s)+(tcoord[1]+0.5*s);
shift[2]=rel_coord[2]*0.5*s-(scoord[2]+1.0*s)+(tcoord[2]+0.5*s);
coord_shift.push_back(shift[0]);
coord_shift.push_back(shift[1]);
coord_shift.push_back(shift[2]);
}
interac_cnt_++;
}
}
{ // U1_Type: same-level neighbors.
Mat_Type type=U1_Type;
Vector<FMMNode_t*>& intlst=tnode->interac_list[type];
for(size_t j=0;j<intlst.Dim();j++) if(intlst[j]){
FMMNode_t* snode=intlst[j];
size_t snode_id=snode->node_id;
if(snode_id>=nodes_in.size() || nodes_in[snode_id]!=snode) continue;
in_node.push_back(snode_id);
scal_idx.push_back(snode->Depth());
{ // set coord_shift
const int* rel_coord=interac_list.RelativeCoord(type,j);
const Real_t* scoord=snode->Coord();
const Real_t* tcoord=tnode->Coord();
Real_t shift[COORD_DIM];
shift[0]=rel_coord[0]*1.0*s-(scoord[0]+0.5*s)+(tcoord[0]+0.5*s);
shift[1]=rel_coord[1]*1.0*s-(scoord[1]+0.5*s)+(tcoord[1]+0.5*s);
shift[2]=rel_coord[2]*1.0*s-(scoord[2]+0.5*s)+(tcoord[2]+0.5*s);
coord_shift.push_back(shift[0]);
coord_shift.push_back(shift[1]);
coord_shift.push_back(shift[2]);
}
interac_cnt_++;
}
}
{ // U2_Type: neighbors one level finer than the target.
Mat_Type type=U2_Type;
Vector<FMMNode_t*>& intlst=tnode->interac_list[type];
for(size_t j=0;j<intlst.Dim();j++) if(intlst[j]){
FMMNode_t* snode=intlst[j];
size_t snode_id=snode->node_id;
if(snode_id>=nodes_in.size() || nodes_in[snode_id]!=snode) continue;
in_node.push_back(snode_id);
scal_idx.push_back(snode->Depth());
{ // set coord_shift
const int* rel_coord=interac_list.RelativeCoord(type,j);
const Real_t* scoord=snode->Coord();
const Real_t* tcoord=tnode->Coord();
Real_t shift[COORD_DIM];
shift[0]=rel_coord[0]*0.25*s-(scoord[0]+0.25*s)+(tcoord[0]+0.5*s);
shift[1]=rel_coord[1]*0.25*s-(scoord[1]+0.25*s)+(tcoord[1]+0.5*s);
shift[2]=rel_coord[2]*0.25*s-(scoord[2]+0.25*s)+(tcoord[2]+0.5*s);
coord_shift.push_back(shift[0]);
coord_shift.push_back(shift[1]);
coord_shift.push_back(shift[2]);
}
interac_cnt_++;
}
}
{ // X_Type: done directly only when the target has few points (<= Nsrf);
  // otherwise the X-list goes through the downward-equivalent path.
Mat_Type type=X_Type;
Vector<FMMNode_t*>& intlst=tnode->interac_list[type];
if(data.trg_coord.cnt[i]<=Nsrf)
for(size_t j=0;j<intlst.Dim();j++) if(intlst[j]){
FMMNode_t* snode=intlst[j];
size_t snode_id=snode->node_id;
if(snode_id>=nodes_in.size() || nodes_in[snode_id]!=snode) continue;
in_node.push_back(snode_id);
scal_idx.push_back(snode->Depth());
{ // set coord_shift
const int* rel_coord=interac_list.RelativeCoord(type,j);
const Real_t* scoord=snode->Coord();
const Real_t* tcoord=tnode->Coord();
Real_t shift[COORD_DIM];
shift[0]=rel_coord[0]*0.5*s-(scoord[0]+1.0*s)+(tcoord[0]+0.5*s);
shift[1]=rel_coord[1]*0.5*s-(scoord[1]+1.0*s)+(tcoord[1]+0.5*s);
shift[2]=rel_coord[2]*0.5*s-(scoord[2]+1.0*s)+(tcoord[2]+0.5*s);
coord_shift.push_back(shift[0]);
coord_shift.push_back(shift[1]);
coord_shift.push_back(shift[2]);
}
interac_cnt_++;
}
}
{ // W_Type: done directly only when the source has few points (<= Nsrf);
  // the complementary (> Nsrf) case is handled via multipoles in W_ListSetup.
Mat_Type type=W_Type;
Vector<FMMNode_t*>& intlst=tnode->interac_list[type];
for(size_t j=0;j<intlst.Dim();j++) if(intlst[j]){
FMMNode_t* snode=intlst[j];
size_t snode_id=snode->node_id;
if(snode_id>=nodes_in.size() || nodes_in[snode_id]!=snode) continue;
if(data.src_coord.cnt[snode_id]+data.srf_coord.cnt[snode_id]> Nsrf) continue;
in_node.push_back(snode_id);
scal_idx.push_back(snode->Depth());
{ // set coord_shift
const int* rel_coord=interac_list.RelativeCoord(type,j);
const Real_t* scoord=snode->Coord();
const Real_t* tcoord=tnode->Coord();
Real_t shift[COORD_DIM];
shift[0]=rel_coord[0]*0.25*s-(scoord[0]+0.25*s)+(tcoord[0]+0.5*s);
shift[1]=rel_coord[1]*0.25*s-(scoord[1]+0.25*s)+(tcoord[1]+0.5*s);
shift[2]=rel_coord[2]*0.25*s-(scoord[2]+0.25*s)+(tcoord[2]+0.5*s);
coord_shift.push_back(shift[0]);
coord_shift.push_back(shift[1]);
coord_shift.push_back(shift[2]);
}
interac_cnt_++;
}
}
interac_cnt.push_back(interac_cnt_);
}
}
{ // Combine interac data: concatenate per-thread vectors into contiguous arrays.
InteracData& interac_data=data.interac_data;
{ // in_node
typedef size_t ElemType;
std::vector<std::vector<ElemType> >& vec_=in_node_;
pvfmm::Vector<ElemType>& vec=interac_data.in_node;
std::vector<size_t> vec_dsp(omp_p+1,0);
for(size_t tid=0;tid<omp_p;tid++){
vec_dsp[tid+1]=vec_dsp[tid]+vec_[tid].size();
}
vec.ReInit(vec_dsp[omp_p]);
#pragma omp parallel for
for(size_t tid=0;tid<omp_p;tid++){
memcpy(&vec[0]+vec_dsp[tid],&vec_[tid][0],vec_[tid].size()*sizeof(ElemType));
}
}
{ // scal_idx
typedef size_t ElemType;
std::vector<std::vector<ElemType> >& vec_=scal_idx_;
pvfmm::Vector<ElemType>& vec=interac_data.scal_idx;
std::vector<size_t> vec_dsp(omp_p+1,0);
for(size_t tid=0;tid<omp_p;tid++){
vec_dsp[tid+1]=vec_dsp[tid]+vec_[tid].size();
}
vec.ReInit(vec_dsp[omp_p]);
#pragma omp parallel for
for(size_t tid=0;tid<omp_p;tid++){
memcpy(&vec[0]+vec_dsp[tid],&vec_[tid][0],vec_[tid].size()*sizeof(ElemType));
}
}
{ // coord_shift
typedef Real_t ElemType;
std::vector<std::vector<ElemType> >& vec_=coord_shift_;
pvfmm::Vector<ElemType>& vec=interac_data.coord_shift;
std::vector<size_t> vec_dsp(omp_p+1,0);
for(size_t tid=0;tid<omp_p;tid++){
vec_dsp[tid+1]=vec_dsp[tid]+vec_[tid].size();
}
vec.ReInit(vec_dsp[omp_p]);
#pragma omp parallel for
for(size_t tid=0;tid<omp_p;tid++){
memcpy(&vec[0]+vec_dsp[tid],&vec_[tid][0],vec_[tid].size()*sizeof(ElemType));
}
}
{ // interac_cnt
typedef size_t ElemType;
std::vector<std::vector<ElemType> >& vec_=interac_cnt_;
pvfmm::Vector<ElemType>& vec=interac_data.interac_cnt;
std::vector<size_t> vec_dsp(omp_p+1,0);
for(size_t tid=0;tid<omp_p;tid++){
vec_dsp[tid+1]=vec_dsp[tid]+vec_[tid].size();
}
vec.ReInit(vec_dsp[omp_p]);
#pragma omp parallel for
for(size_t tid=0;tid<omp_p;tid++){
memcpy(&vec[0]+vec_dsp[tid],&vec_[tid][0],vec_[tid].size()*sizeof(ElemType));
}
}
{ // interac_dsp: exclusive prefix sum of interac_cnt.
pvfmm::Vector<size_t>& cnt=interac_data.interac_cnt;
pvfmm::Vector<size_t>& dsp=interac_data.interac_dsp;
dsp.ReInit(cnt.Dim()); if(dsp.Dim()) dsp[0]=0;
omp_par::scan(&cnt[0],&dsp[0],dsp.Dim());
}
}
}
PtSetup(setup_data, &data);
}
  4387. template <class FMMNode>
  4388. void FMM_Pts<FMMNode>::U_List (SetupData<Real_t>& setup_data, bool device){
  4389. //Add U_List contribution.
  4390. this->EvalListPts(setup_data, device);
  4391. }
  4392. template <class FMMNode>
  4393. void FMM_Pts<FMMNode>::Down2TargetSetup(SetupData<Real_t>& setup_data, FMMTree_t* tree, std::vector<Matrix<Real_t> >& buff, std::vector<Vector<FMMNode_t*> >& n_list, int level, bool device){
  4394. if(!this->MultipoleOrder()) return;
  4395. { // Set setup_data
  4396. setup_data. level=level;
  4397. setup_data.kernel=kernel->k_l2t;
  4398. setup_data. input_data=&buff[1];
  4399. setup_data.output_data=&buff[5];
  4400. setup_data. coord_data=&buff[6];
  4401. Vector<FMMNode_t*>& nodes_in =n_list[1];
  4402. Vector<FMMNode_t*>& nodes_out=n_list[5];
  4403. setup_data.nodes_in .clear();
  4404. setup_data.nodes_out.clear();
  4405. for(size_t i=0;i<nodes_in .Dim();i++) if((nodes_in [i]->Depth()==level || level==-1) && nodes_in [i]->pt_cnt[1] && nodes_in [i]->IsLeaf() && !nodes_in [i]->IsGhost()) setup_data.nodes_in .push_back(nodes_in [i]);
  4406. for(size_t i=0;i<nodes_out.Dim();i++) if((nodes_out[i]->Depth()==level || level==-1) && nodes_out[i]->pt_cnt[1] && nodes_out[i]->IsLeaf() && !nodes_out[i]->IsGhost()) setup_data.nodes_out.push_back(nodes_out[i]);
  4407. }
  4408. struct PackedData{
  4409. size_t len;
  4410. Matrix<Real_t>* ptr;
  4411. Vector<size_t> cnt;
  4412. Vector<size_t> dsp;
  4413. };
  4414. struct InteracData{
  4415. Vector<size_t> in_node;
  4416. Vector<size_t> scal_idx;
  4417. Vector<Real_t> coord_shift;
  4418. Vector<size_t> interac_cnt;
  4419. Vector<size_t> interac_dsp;
  4420. Vector<Real_t> scal[4*MAX_DEPTH];
  4421. Matrix<Real_t> M[4];
  4422. };
  4423. struct ptSetupData{
  4424. int level;
  4425. const Kernel<Real_t>* kernel;
  4426. PackedData src_coord; // Src coord
  4427. PackedData src_value; // Src density
  4428. PackedData srf_coord; // Srf coord
  4429. PackedData srf_value; // Srf density
  4430. PackedData trg_coord; // Trg coord
  4431. PackedData trg_value; // Trg potential
  4432. InteracData interac_data;
  4433. };
  4434. ptSetupData data;
  4435. data. level=setup_data. level;
  4436. data.kernel=setup_data.kernel;
  4437. std::vector<void*>& nodes_in =setup_data.nodes_in ;
  4438. std::vector<void*>& nodes_out=setup_data.nodes_out;
  4439. { // Set src data
  4440. std::vector<void*>& nodes=nodes_in;
  4441. PackedData& coord=data.src_coord;
  4442. PackedData& value=data.src_value;
  4443. coord.ptr=setup_data. coord_data;
  4444. value.ptr=setup_data. input_data;
  4445. coord.len=coord.ptr->Dim(0)*coord.ptr->Dim(1);
  4446. value.len=value.ptr->Dim(0)*value.ptr->Dim(1);
  4447. coord.cnt.ReInit(nodes.size());
  4448. coord.dsp.ReInit(nodes.size());
  4449. value.cnt.ReInit(nodes.size());
  4450. value.dsp.ReInit(nodes.size());
  4451. #pragma omp parallel for
  4452. for(size_t i=0;i<nodes.size();i++){
  4453. ((FMMNode_t*)nodes[i])->node_id=i;
  4454. Vector<Real_t>& coord_vec=tree->dnwd_equiv_surf[((FMMNode*)nodes[i])->Depth()];
  4455. Vector<Real_t>& value_vec=((FMMData*)((FMMNode*)nodes[i])->FMMData())->dnward_equiv;
  4456. if(coord_vec.Dim()){
  4457. coord.dsp[i]=&coord_vec[0]-coord.ptr[0][0];
  4458. assert(coord.dsp[i]<coord.len);
  4459. coord.cnt[i]=coord_vec.Dim();
  4460. }else{
  4461. coord.dsp[i]=0;
  4462. coord.cnt[i]=0;
  4463. }
  4464. if(value_vec.Dim()){
  4465. value.dsp[i]=&value_vec[0]-value.ptr[0][0];
  4466. assert(value.dsp[i]<value.len);
  4467. value.cnt[i]=value_vec.Dim();
  4468. }else{
  4469. value.dsp[i]=0;
  4470. value.cnt[i]=0;
  4471. }
  4472. }
  4473. }
  4474. { // Set srf data
  4475. std::vector<void*>& nodes=nodes_in;
  4476. PackedData& coord=data.srf_coord;
  4477. PackedData& value=data.srf_value;
  4478. coord.ptr=setup_data. coord_data;
  4479. value.ptr=setup_data. input_data;
  4480. coord.len=coord.ptr->Dim(0)*coord.ptr->Dim(1);
  4481. value.len=value.ptr->Dim(0)*value.ptr->Dim(1);
  4482. coord.cnt.ReInit(nodes.size());
  4483. coord.dsp.ReInit(nodes.size());
  4484. value.cnt.ReInit(nodes.size());
  4485. value.dsp.ReInit(nodes.size());
  4486. #pragma omp parallel for
  4487. for(size_t i=0;i<nodes.size();i++){
  4488. coord.dsp[i]=0;
  4489. coord.cnt[i]=0;
  4490. value.dsp[i]=0;
  4491. value.cnt[i]=0;
  4492. }
  4493. }
  4494. { // Set trg data
  4495. std::vector<void*>& nodes=nodes_out;
  4496. PackedData& coord=data.trg_coord;
  4497. PackedData& value=data.trg_value;
  4498. coord.ptr=setup_data. coord_data;
  4499. value.ptr=setup_data.output_data;
  4500. coord.len=coord.ptr->Dim(0)*coord.ptr->Dim(1);
  4501. value.len=value.ptr->Dim(0)*value.ptr->Dim(1);
  4502. coord.cnt.ReInit(nodes.size());
  4503. coord.dsp.ReInit(nodes.size());
  4504. value.cnt.ReInit(nodes.size());
  4505. value.dsp.ReInit(nodes.size());
  4506. #pragma omp parallel for
  4507. for(size_t i=0;i<nodes.size();i++){
  4508. Vector<Real_t>& coord_vec=((FMMNode_t*)nodes[i])->trg_coord;
  4509. Vector<Real_t>& value_vec=((FMMNode_t*)nodes[i])->trg_value;
  4510. if(coord_vec.Dim()){
  4511. coord.dsp[i]=&coord_vec[0]-coord.ptr[0][0];
  4512. assert(coord.dsp[i]<coord.len);
  4513. coord.cnt[i]=coord_vec.Dim();
  4514. }else{
  4515. coord.dsp[i]=0;
  4516. coord.cnt[i]=0;
  4517. }
  4518. if(value_vec.Dim()){
  4519. value.dsp[i]=&value_vec[0]-value.ptr[0][0];
  4520. assert(value.dsp[i]<value.len);
  4521. value.cnt[i]=value_vec.Dim();
  4522. }else{
  4523. value.dsp[i]=0;
  4524. value.cnt[i]=0;
  4525. }
  4526. }
  4527. }
  4528. { // Set interac_data
  4529. int omp_p=omp_get_max_threads();
  4530. std::vector<std::vector<size_t> > in_node_(omp_p);
  4531. std::vector<std::vector<size_t> > scal_idx_(omp_p);
  4532. std::vector<std::vector<Real_t> > coord_shift_(omp_p);
  4533. std::vector<std::vector<size_t> > interac_cnt_(omp_p);
  4534. data.interac_data.M[0]=this->mat->Mat(level, DC2DE0_Type, 0);
  4535. data.interac_data.M[1]=this->mat->Mat(level, DC2DE1_Type, 0);
  4536. if(this->ScaleInvar()){ // Set scal
  4537. const Kernel<Real_t>* ker=kernel->k_l2l;
  4538. for(size_t l=0;l<MAX_DEPTH;l++){ // scal[l*4+0]
  4539. Vector<Real_t>& scal=data.interac_data.scal[l*4+0];
  4540. Vector<Real_t>& scal_exp=ker->trg_scal;
  4541. scal.ReInit(scal_exp.Dim());
  4542. for(size_t i=0;i<scal.Dim();i++){
  4543. scal[i]=std::pow(2.0,-scal_exp[i]*l);
  4544. }
  4545. }
  4546. for(size_t l=0;l<MAX_DEPTH;l++){ // scal[l*4+1]
  4547. Vector<Real_t>& scal=data.interac_data.scal[l*4+1];
  4548. Vector<Real_t>& scal_exp=ker->src_scal;
  4549. scal.ReInit(scal_exp.Dim());
  4550. for(size_t i=0;i<scal.Dim();i++){
  4551. scal[i]=std::pow(2.0,-scal_exp[i]*l);
  4552. }
  4553. }
  4554. }
  4555. #pragma omp parallel for
  4556. for(size_t tid=0;tid<omp_p;tid++){
  4557. std::vector<size_t>& in_node =in_node_[tid] ;
  4558. std::vector<size_t>& scal_idx =scal_idx_[tid] ;
  4559. std::vector<Real_t>& coord_shift=coord_shift_[tid];
  4560. std::vector<size_t>& interac_cnt=interac_cnt_[tid];
  4561. size_t a=(nodes_out.size()*(tid+0))/omp_p;
  4562. size_t b=(nodes_out.size()*(tid+1))/omp_p;
  4563. for(size_t i=a;i<b;i++){
  4564. FMMNode_t* tnode=(FMMNode_t*)nodes_out[i];
  4565. Real_t s=std::pow(0.5,tnode->Depth());
  4566. size_t interac_cnt_=0;
  4567. { // D2T_Type
  4568. Mat_Type type=D2T_Type;
  4569. Vector<FMMNode_t*>& intlst=tnode->interac_list[type];
  4570. for(size_t j=0;j<intlst.Dim();j++) if(intlst[j]){
  4571. FMMNode_t* snode=intlst[j];
  4572. size_t snode_id=snode->node_id;
  4573. if(snode_id>=nodes_in.size() || nodes_in[snode_id]!=snode) continue;
  4574. in_node.push_back(snode_id);
  4575. scal_idx.push_back(snode->Depth());
  4576. { // set coord_shift
  4577. const int* rel_coord=interac_list.RelativeCoord(type,j);
  4578. const Real_t* scoord=snode->Coord();
  4579. const Real_t* tcoord=tnode->Coord();
  4580. Real_t shift[COORD_DIM];
  4581. shift[0]=rel_coord[0]*0.5*s-(0+0.5*s)+(tcoord[0]+0.5*s);
  4582. shift[1]=rel_coord[1]*0.5*s-(0+0.5*s)+(tcoord[1]+0.5*s);
  4583. shift[2]=rel_coord[2]*0.5*s-(0+0.5*s)+(tcoord[2]+0.5*s);
  4584. coord_shift.push_back(shift[0]);
  4585. coord_shift.push_back(shift[1]);
  4586. coord_shift.push_back(shift[2]);
  4587. }
  4588. interac_cnt_++;
  4589. }
  4590. }
  4591. interac_cnt.push_back(interac_cnt_);
  4592. }
  4593. }
  4594. { // Combine interac data
  4595. InteracData& interac_data=data.interac_data;
  4596. { // in_node
  4597. typedef size_t ElemType;
  4598. std::vector<std::vector<ElemType> >& vec_=in_node_;
  4599. pvfmm::Vector<ElemType>& vec=interac_data.in_node;
  4600. std::vector<size_t> vec_dsp(omp_p+1,0);
  4601. for(size_t tid=0;tid<omp_p;tid++){
  4602. vec_dsp[tid+1]=vec_dsp[tid]+vec_[tid].size();
  4603. }
  4604. vec.ReInit(vec_dsp[omp_p]);
  4605. #pragma omp parallel for
  4606. for(size_t tid=0;tid<omp_p;tid++){
  4607. memcpy(&vec[0]+vec_dsp[tid],&vec_[tid][0],vec_[tid].size()*sizeof(ElemType));
  4608. }
  4609. }
  4610. { // scal_idx
  4611. typedef size_t ElemType;
  4612. std::vector<std::vector<ElemType> >& vec_=scal_idx_;
  4613. pvfmm::Vector<ElemType>& vec=interac_data.scal_idx;
  4614. std::vector<size_t> vec_dsp(omp_p+1,0);
  4615. for(size_t tid=0;tid<omp_p;tid++){
  4616. vec_dsp[tid+1]=vec_dsp[tid]+vec_[tid].size();
  4617. }
  4618. vec.ReInit(vec_dsp[omp_p]);
  4619. #pragma omp parallel for
  4620. for(size_t tid=0;tid<omp_p;tid++){
  4621. memcpy(&vec[0]+vec_dsp[tid],&vec_[tid][0],vec_[tid].size()*sizeof(ElemType));
  4622. }
  4623. }
  4624. { // coord_shift
  4625. typedef Real_t ElemType;
  4626. std::vector<std::vector<ElemType> >& vec_=coord_shift_;
  4627. pvfmm::Vector<ElemType>& vec=interac_data.coord_shift;
  4628. std::vector<size_t> vec_dsp(omp_p+1,0);
  4629. for(size_t tid=0;tid<omp_p;tid++){
  4630. vec_dsp[tid+1]=vec_dsp[tid]+vec_[tid].size();
  4631. }
  4632. vec.ReInit(vec_dsp[omp_p]);
  4633. #pragma omp parallel for
  4634. for(size_t tid=0;tid<omp_p;tid++){
  4635. memcpy(&vec[0]+vec_dsp[tid],&vec_[tid][0],vec_[tid].size()*sizeof(ElemType));
  4636. }
  4637. }
  4638. { // interac_cnt
  4639. typedef size_t ElemType;
  4640. std::vector<std::vector<ElemType> >& vec_=interac_cnt_;
  4641. pvfmm::Vector<ElemType>& vec=interac_data.interac_cnt;
  4642. std::vector<size_t> vec_dsp(omp_p+1,0);
  4643. for(size_t tid=0;tid<omp_p;tid++){
  4644. vec_dsp[tid+1]=vec_dsp[tid]+vec_[tid].size();
  4645. }
  4646. vec.ReInit(vec_dsp[omp_p]);
  4647. #pragma omp parallel for
  4648. for(size_t tid=0;tid<omp_p;tid++){
  4649. memcpy(&vec[0]+vec_dsp[tid],&vec_[tid][0],vec_[tid].size()*sizeof(ElemType));
  4650. }
  4651. }
  4652. { // interac_dsp
  4653. pvfmm::Vector<size_t>& cnt=interac_data.interac_cnt;
  4654. pvfmm::Vector<size_t>& dsp=interac_data.interac_dsp;
  4655. dsp.ReInit(cnt.Dim()); if(dsp.Dim()) dsp[0]=0;
  4656. omp_par::scan(&cnt[0],&dsp[0],dsp.Dim());
  4657. }
  4658. }
  4659. }
  4660. PtSetup(setup_data, &data);
  4661. }
  4662. template <class FMMNode>
  4663. void FMM_Pts<FMMNode>::Down2Target(SetupData<Real_t>& setup_data, bool device){
  4664. if(!this->MultipoleOrder()) return;
  4665. //Add Down2Target contribution.
  4666. this->EvalListPts(setup_data, device);
  4667. }
template <class FMMNode>
void FMM_Pts<FMMNode>::PostProcessing(std::vector<FMMNode_t*>& nodes){
  // Intentionally a no-op for FMM_Pts: no per-node post-processing is needed
  // here. NOTE(review): presumably derived kernels override or specialize this
  // hook when output requires finalization — confirm against subclasses.
}
template <class FMMNode>
void FMM_Pts<FMMNode>::CopyOutput(FMMNode** nodes, size_t n){
  // Intentionally a no-op for FMM_Pts: results are not copied out here.
  // NOTE(review): likely a hook for derived/derived-device implementations
  // that stage output buffers — confirm against the rest of the hierarchy.
}
  4674. }//end namespace