fmm_pts.txx 213 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327
73278327932803281328232833284328532863287328832893290329132923293329432953296329732983299330033013302330333043305330633073308330933103311331233133314331533163317331833193320332133223323332433253326332733283329333033313332333333343335333633373338333933403341334233433344334533463347334833493350335133523353335433553356335733583359336033613362336333643365336633673368336933703371337233733374337533763377337833793380338133823383338433853386338733883389339033913392339333943395339633973398339934003401340234033404340534063407340834093410341134123413341434153416341734183419342034213422342334243425342634273428342934303431343234333434343534363437343834393440344134423443344434453446344734483449345034513452345334543455345634573458345934603461346234633464346534663467346834693470347134723473347434753476347734783479348034813482348334843485348634873488348934903491349234933494349534963497349834993500350135023503350435053506350735083509351035113512351335143515351635173518351935203521352235233524352535263527352835293530353135323533353435353536353735383539354035413542354335443545354635473548354935503551355235533554355535563557355835593560356135623563356435653566356735683569357035713572357335743575357635773578357935803581358235833584358535863587358835893590359135923593359435953596359735983599360036013602360336043605360636073608360936103611361236133614361536163617361836193620362136223623362436253626362736283629363036313632363336343635363636373638363936403641364236433644364536463647364836493650365136523653365436553656365736583659366036613662366336643665366636673668366936703671367236733674367536763677367836793680368136823683368436853686368736883689369036913692369336943695369636973698369937003701370237033704370537063707370837093710371137123713371437153716371737183719372037213722372337243725372637273728372937303731373237333734373537363737373837393740374137423743374437453746374737483749375037513752375337543755375637573758375937603761376237633764376537663767376837693770377137723773377437753776377
73778377937803781378237833784378537863787378837893790379137923793379437953796379737983799380038013802380338043805380638073808380938103811381238133814381538163817381838193820382138223823382438253826382738283829383038313832383338343835383638373838383938403841384238433844384538463847384838493850385138523853385438553856385738583859386038613862386338643865386638673868386938703871387238733874387538763877387838793880388138823883388438853886388738883889389038913892389338943895389638973898389939003901390239033904390539063907390839093910391139123913391439153916391739183919392039213922392339243925392639273928392939303931393239333934393539363937393839393940394139423943394439453946394739483949395039513952395339543955395639573958395939603961396239633964396539663967396839693970397139723973397439753976397739783979398039813982398339843985398639873988398939903991399239933994399539963997399839994000400140024003400440054006400740084009401040114012401340144015401640174018401940204021402240234024402540264027402840294030403140324033403440354036403740384039404040414042404340444045404640474048404940504051405240534054405540564057405840594060406140624063406440654066406740684069407040714072407340744075407640774078407940804081408240834084408540864087408840894090409140924093409440954096409740984099410041014102410341044105410641074108410941104111411241134114411541164117411841194120412141224123412441254126412741284129413041314132413341344135413641374138413941404141414241434144414541464147414841494150415141524153415441554156415741584159416041614162416341644165416641674168416941704171417241734174417541764177417841794180418141824183418441854186418741884189419041914192419341944195419641974198419942004201420242034204420542064207420842094210421142124213421442154216421742184219422042214222422342244225422642274228422942304231423242334234423542364237423842394240424142424243424442454246424742484249425042514252425342544255425642574258425942604261426242634264426542664267426842694270427142724273427442754276427
74278427942804281428242834284428542864287428842894290429142924293429442954296429742984299430043014302430343044305430643074308430943104311431243134314431543164317431843194320432143224323432443254326432743284329433043314332433343344335433643374338433943404341434243434344434543464347434843494350435143524353435443554356435743584359436043614362436343644365436643674368436943704371437243734374437543764377437843794380438143824383438443854386438743884389439043914392439343944395439643974398439944004401440244034404440544064407440844094410441144124413441444154416441744184419442044214422442344244425442644274428442944304431443244334434443544364437443844394440444144424443444444454446444744484449445044514452445344544455445644574458445944604461446244634464446544664467446844694470447144724473447444754476447744784479448044814482448344844485448644874488448944904491449244934494449544964497449844994500450145024503450445054506450745084509451045114512451345144515451645174518451945204521452245234524452545264527452845294530453145324533453445354536453745384539454045414542454345444545454645474548454945504551455245534554455545564557455845594560456145624563456445654566456745684569457045714572457345744575457645774578457945804581458245834584458545864587458845894590459145924593459445954596459745984599460046014602460346044605460646074608460946104611461246134614461546164617461846194620462146224623462446254626462746284629463046314632463346344635463646374638463946404641464246434644464546464647464846494650465146524653465446554656465746584659466046614662466346644665466646674668466946704671467246734674467546764677467846794680468146824683468446854686468746884689469046914692469346944695469646974698469947004701470247034704470547064707470847094710471147124713471447154716471747184719472047214722472347244725472647274728472947304731473247334734473547364737473847394740474147424743474447454746474747484749475047514752475347544755475647574758475947604761476247634764476547664767476847694770477147724773477447754776477
74778477947804781478247834784478547864787478847894790479147924793479447954796479747984799480048014802480348044805480648074808480948104811481248134814481548164817481848194820482148224823482448254826482748284829483048314832483348344835483648374838483948404841484248434844484548464847484848494850485148524853485448554856485748584859486048614862486348644865486648674868486948704871487248734874487548764877487848794880488148824883488448854886488748884889489048914892489348944895489648974898489949004901490249034904490549064907490849094910491149124913491449154916491749184919492049214922492349244925492649274928492949304931493249334934493549364937493849394940494149424943494449454946494749484949495049514952495349544955495649574958495949604961496249634964496549664967496849694970497149724973497449754976497749784979498049814982498349844985498649874988498949904991499249934994499549964997499849995000500150025003500450055006500750085009501050115012501350145015501650175018501950205021502250235024502550265027502850295030503150325033503450355036503750385039504050415042504350445045504650475048504950505051505250535054505550565057505850595060506150625063506450655066506750685069507050715072507350745075507650775078507950805081508250835084508550865087508850895090509150925093509450955096509750985099510051015102510351045105510651075108510951105111511251135114511551165117511851195120512151225123512451255126512751285129513051315132513351345135513651375138513951405141514251435144514551465147514851495150515151525153515451555156515751585159516051615162516351645165516651675168516951705171517251735174517551765177517851795180518151825183518451855186518751885189519051915192519351945195519651975198519952005201520252035204520552065207520852095210521152125213521452155216521752185219522052215222522352245225522652275228522952305231523252335234523552365237523852395240524152425243524452455246524752485249525052515252525352545255525652575258525952605261526252635264526552665267526852695270527152725273527452755276527
752785279528052815282528352845285528652875288528952905291529252935294529552965297529852995300530153025303530453055306530753085309531053115312531353145315531653175318531953205321532253235324532553265327532853295330533153325333533453355336533753385339534053415342534353445345534653475348534953505351535253535354535553565357535853595360536153625363536453655366536753685369537053715372537353745375537653775378537953805381538253835384538553865387538853895390539153925393539453955396539753985399540054015402540354045405540654075408540954105411541254135414541554165417541854195420542154225423542454255426542754285429543054315432543354345435543654375438543954405441544254435444544554465447544854495450545154525453545454555456545754585459546054615462546354645465546654675468546954705471547254735474547554765477547854795480548154825483548454855486548754885489549054915492549354945495549654975498549955005501550255035504550555065507550855095510551155125513551455155516551755185519552055215522552355245525552655275528552955305531553255335534553555365537553855395540554155425543554455455546554755485549555055515552555355545555555655575558555955605561556255635564
  1. /**
  2. * \file fmm_pts.txx
  3. * \author Dhairya Malhotra, dhairya.malhotra@gmail.com
  4. * \date 3-07-2011
  5. * \brief This file contains the implementation of the FMM_Pts class.
  6. */
  7. #include <omp.h>
  8. #include <cmath>
  9. #include <cstdlib>
  10. #include <cassert>
  11. #include <sstream>
  12. #include <iostream>
  13. #include <stdint.h>
  14. #include <set>
  15. #ifdef PVFMM_HAVE_SYS_STAT_H
  16. #include <sys/stat.h>
  17. #endif
  18. #ifdef __SSE__
  19. #include <xmmintrin.h>
  20. #endif
  21. #ifdef __SSE2__
  22. #include <emmintrin.h>
  23. #endif
  24. #ifdef __SSE3__
  25. #include <pmmintrin.h>
  26. #endif
  27. #ifdef __AVX__
  28. #include <immintrin.h>
  29. #endif
  30. #if defined(__MIC__)
  31. #include <immintrin.h>
  32. #endif
  33. #include <profile.hpp>
  34. #include <cheb_utils.hpp>
  35. namespace pvfmm{
  36. /**
  37. * \brief Returns the coordinates of points on the surface of a cube.
  38. * \param[in] p Number of points on an edge of the cube is (n+1)
  39. * \param[in] c Coordinates to the centre of the cube (3D array).
  40. * \param[in] alpha Scaling factor for the size of the cube.
  41. * \param[in] depth Depth of the cube in the octree.
  42. * \return Vector with coordinates of points on the surface of the cube in the
  43. * format [x0 y0 z0 x1 y1 z1 .... ].
  44. */
  45. template <class Real_t>
  46. std::vector<Real_t> surface(int p, Real_t* c, Real_t alpha, int depth){
  47. size_t n_=(6*(p-1)*(p-1)+2); //Total number of points.
  48. std::vector<Real_t> coord(n_*3);
  49. coord[0]=coord[1]=coord[2]=-1.0;
  50. size_t cnt=1;
  51. for(int i=0;i<p-1;i++)
  52. for(int j=0;j<p-1;j++){
  53. coord[cnt*3 ]=-1.0;
  54. coord[cnt*3+1]=(2.0*(i+1)-p+1)/(p-1);
  55. coord[cnt*3+2]=(2.0*j-p+1)/(p-1);
  56. cnt++;
  57. }
  58. for(int i=0;i<p-1;i++)
  59. for(int j=0;j<p-1;j++){
  60. coord[cnt*3 ]=(2.0*i-p+1)/(p-1);
  61. coord[cnt*3+1]=-1.0;
  62. coord[cnt*3+2]=(2.0*(j+1)-p+1)/(p-1);
  63. cnt++;
  64. }
  65. for(int i=0;i<p-1;i++)
  66. for(int j=0;j<p-1;j++){
  67. coord[cnt*3 ]=(2.0*(i+1)-p+1)/(p-1);
  68. coord[cnt*3+1]=(2.0*j-p+1)/(p-1);
  69. coord[cnt*3+2]=-1.0;
  70. cnt++;
  71. }
  72. for(size_t i=0;i<(n_/2)*3;i++)
  73. coord[cnt*3+i]=-coord[i];
  74. Real_t r = 0.5*pvfmm::pow<Real_t>(0.5,depth);
  75. Real_t b = alpha*r;
  76. for(size_t i=0;i<n_;i++){
  77. coord[i*3+0]=(coord[i*3+0]+1.0)*b+c[0];
  78. coord[i*3+1]=(coord[i*3+1]+1.0)*b+c[1];
  79. coord[i*3+2]=(coord[i*3+2]+1.0)*b+c[2];
  80. }
  81. return coord;
  82. }
  83. /**
  84. * \brief Returns the coordinates of points on the upward check surface of cube.
  85. * \see surface()
  86. */
  87. template <class Real_t>
  88. std::vector<Real_t> u_check_surf(int p, Real_t* c, int depth){
  89. Real_t r=0.5*pvfmm::pow<Real_t>(0.5,depth);
  90. Real_t coord[3]={(Real_t)(c[0]-r*(RAD1-1.0)),(Real_t)(c[1]-r*(RAD1-1.0)),(Real_t)(c[2]-r*(RAD1-1.0))};
  91. return surface(p,coord,(Real_t)RAD1,depth);
  92. }
  93. /**
  94. * \brief Returns the coordinates of points on the upward equivalent surface of cube.
  95. * \see surface()
  96. */
  97. template <class Real_t>
  98. std::vector<Real_t> u_equiv_surf(int p, Real_t* c, int depth){
  99. Real_t r=0.5*pvfmm::pow<Real_t>(0.5,depth);
  100. Real_t coord[3]={(Real_t)(c[0]-r*(RAD0-1.0)),(Real_t)(c[1]-r*(RAD0-1.0)),(Real_t)(c[2]-r*(RAD0-1.0))};
  101. return surface(p,coord,(Real_t)RAD0,depth);
  102. }
  103. /**
  104. * \brief Returns the coordinates of points on the downward check surface of cube.
  105. * \see surface()
  106. */
  107. template <class Real_t>
  108. std::vector<Real_t> d_check_surf(int p, Real_t* c, int depth){
  109. Real_t r=0.5*pvfmm::pow<Real_t>(0.5,depth);
  110. Real_t coord[3]={(Real_t)(c[0]-r*(RAD0-1.0)),(Real_t)(c[1]-r*(RAD0-1.0)),(Real_t)(c[2]-r*(RAD0-1.0))};
  111. return surface(p,coord,(Real_t)RAD0,depth);
  112. }
  113. /**
  114. * \brief Returns the coordinates of points on the downward equivalent surface of cube.
  115. * \see surface()
  116. */
  117. template <class Real_t>
  118. std::vector<Real_t> d_equiv_surf(int p, Real_t* c, int depth){
  119. Real_t r=0.5*pvfmm::pow<Real_t>(0.5,depth);
  120. Real_t coord[3]={(Real_t)(c[0]-r*(RAD1-1.0)),(Real_t)(c[1]-r*(RAD1-1.0)),(Real_t)(c[2]-r*(RAD1-1.0))};
  121. return surface(p,coord,(Real_t)RAD1,depth);
  122. }
  123. /**
  124. * \brief Defines the 3D grid for convolution in FFT acceleration of V-list.
  125. * \see surface()
  126. */
  127. template <class Real_t>
  128. std::vector<Real_t> conv_grid(int p, Real_t* c, int depth){
  129. Real_t r=pvfmm::pow<Real_t>(0.5,depth);
  130. Real_t a=r*RAD0;
  131. Real_t coord[3]={c[0],c[1],c[2]};
  132. int n1=p*2;
  133. int n2=pvfmm::pow<int>((Real_t)n1,2);
  134. int n3=pvfmm::pow<int>((Real_t)n1,3);
  135. std::vector<Real_t> grid(n3*3);
  136. for(int i=0;i<n1;i++)
  137. for(int j=0;j<n1;j++)
  138. for(int k=0;k<n1;k++){
  139. grid[(i+n1*j+n2*k)*3+0]=(i-p)*a/(p-1)+coord[0];
  140. grid[(i+n1*j+n2*k)*3+1]=(j-p)*a/(p-1)+coord[1];
  141. grid[(i+n1*j+n2*k)*3+2]=(k-p)*a/(p-1)+coord[2];
  142. }
  143. return grid;
  144. }
template <class Real_t>
void FMM_Data<Real_t>::Clear(){
  // Discard the stored upward-equivalent (multipole) expansion coefficients.
  upward_equiv.Resize(0);
}
  149. template <class Real_t>
  150. PackedData FMM_Data<Real_t>::PackMultipole(void* buff_ptr){
  151. PackedData p0; p0.data=buff_ptr;
  152. p0.length=upward_equiv.Dim()*sizeof(Real_t);
  153. if(p0.length==0) return p0;
  154. if(p0.data==NULL) p0.data=(char*)&upward_equiv[0];
  155. else mem::memcopy(p0.data,&upward_equiv[0],p0.length);
  156. return p0;
  157. }
  158. template <class Real_t>
  159. void FMM_Data<Real_t>::AddMultipole(PackedData p0){
  160. Real_t* data=(Real_t*)p0.data;
  161. size_t n=p0.length/sizeof(Real_t);
  162. assert(upward_equiv.Dim()==n);
  163. Matrix<Real_t> v0(1,n,&upward_equiv[0],false);
  164. Matrix<Real_t> v1(1,n,data,false);
  165. v0+=v1;
  166. }
  167. template <class Real_t>
  168. void FMM_Data<Real_t>::InitMultipole(PackedData p0, bool own_data){
  169. Real_t* data=(Real_t*)p0.data;
  170. size_t n=p0.length/sizeof(Real_t);
  171. if(n==0) return;
  172. if(own_data){
  173. upward_equiv=Vector<Real_t>(n, &data[0], false);
  174. }else{
  175. upward_equiv.ReInit(n, &data[0], false);
  176. }
  177. }
template <class FMMNode>
FMM_Pts<FMMNode>::~FMM_Pts() {
  // Release the precomputed-matrix store.
  if(mat!=NULL){
    // int rank;
    // MPI_Comm_rank(comm,&rank);
    // if(rank==0) mat->Save2File("Precomp.data");
    delete mat;
    mat=NULL;
  }
  // Destroy the FFTW plan used for V-list precomputation, if it was created.
  if(vprecomp_fft_flag) FFTW_t<Real_t>::fft_destroy_plan(vprecomp_fftplan);
#ifdef __INTEL_OFFLOAD0
  // When offload is enabled the plans below live on the MIC device, so the
  // destruction block itself is offloaded; the braces delimit its scope.
#pragma offload target(mic:0)
#endif
  {
    // Destroy the forward/inverse V-list evaluation plans and clear the flags.
    if(vlist_fft_flag ) FFTW_t<Real_t>::fft_destroy_plan(vlist_fftplan );
    if(vlist_ifft_flag) FFTW_t<Real_t>::fft_destroy_plan(vlist_ifftplan);
    vlist_fft_flag =false;
    vlist_ifft_flag=false;
  }
}
  198. template <class FMMNode>
  199. void FMM_Pts<FMMNode>::Initialize(int mult_order, const MPI_Comm& comm_, const Kernel<Real_t>* kernel_){
  200. Profile::Tic("InitFMM_Pts",&comm_,true);{
  201. int rank;
  202. MPI_Comm_rank(comm_,&rank);
  203. bool verbose=false;
  204. #ifndef NDEBUG
  205. #ifdef __VERBOSE__
  206. if(!rank) verbose=true;
  207. #endif
  208. #endif
  209. if(kernel_) kernel_->Initialize(verbose);
  210. multipole_order=mult_order;
  211. comm=comm_;
  212. kernel=kernel_;
  213. assert(kernel!=NULL);
  214. bool save_precomp=false;
  215. mat=new PrecompMat<Real_t>(ScaleInvar());
  216. if(this->mat_fname.size()==0){// && !this->ScaleInvar()){
  217. std::stringstream st;
  218. st<<PVFMM_PRECOMP_DATA_PATH;
  219. if(!st.str().size()){ // look in PVFMM_DIR
  220. char* pvfmm_dir = getenv ("PVFMM_DIR");
  221. if(pvfmm_dir) st<<pvfmm_dir;
  222. }
  223. #ifndef STAT_MACROS_BROKEN
  224. if(st.str().size()){ // check if the path is a directory
  225. struct stat stat_buff;
  226. if(stat(st.str().c_str(), &stat_buff) || !S_ISDIR(stat_buff.st_mode)){
  227. std::cout<<"error: path not found: "<<st.str()<<'\n';
  228. exit(0);
  229. }
  230. }
  231. #endif
  232. if(st.str().size()) st<<'/';
  233. st<<"Precomp_"<<kernel->ker_name.c_str()<<"_m"<<mult_order;
  234. if(sizeof(Real_t)==8) st<<"";
  235. else if(sizeof(Real_t)==4) st<<"_f";
  236. else st<<"_t"<<sizeof(Real_t);
  237. st<<".data";
  238. this->mat_fname=st.str();
  239. save_precomp=true;
  240. }
  241. this->mat->LoadFile(mat_fname.c_str(), this->comm);
  242. interac_list.Initialize(COORD_DIM, this->mat);
  243. Profile::Tic("PrecompUC2UE",&comm,false,4);
  244. this->PrecompAll(UC2UE0_Type);
  245. this->PrecompAll(UC2UE1_Type);
  246. Profile::Toc();
  247. Profile::Tic("PrecompDC2DE",&comm,false,4);
  248. this->PrecompAll(DC2DE0_Type);
  249. this->PrecompAll(DC2DE1_Type);
  250. Profile::Toc();
  251. Profile::Tic("PrecompBC",&comm,false,4);
  252. { /*
  253. int type=BC_Type;
  254. for(int l=0;l<MAX_DEPTH;l++)
  255. for(size_t indx=0;indx<this->interac_list.ListCount((Mat_Type)type);indx++){
  256. Matrix<Real_t>& M=this->mat->Mat(l, (Mat_Type)type, indx);
  257. M.Resize(0,0);
  258. } // */
  259. }
  260. this->PrecompAll(BC_Type,0);
  261. Profile::Toc();
  262. Profile::Tic("PrecompU2U",&comm,false,4);
  263. this->PrecompAll(U2U_Type);
  264. Profile::Toc();
  265. Profile::Tic("PrecompD2D",&comm,false,4);
  266. this->PrecompAll(D2D_Type);
  267. Profile::Toc();
  268. if(save_precomp){
  269. Profile::Tic("Save2File",&this->comm,false,4);
  270. if(!rank){
  271. FILE* f=fopen(this->mat_fname.c_str(),"r");
  272. if(f==NULL) { //File does not exists.
  273. this->mat->Save2File(this->mat_fname.c_str());
  274. }else fclose(f);
  275. }
  276. Profile::Toc();
  277. }
  278. Profile::Tic("PrecompV",&comm,false,4);
  279. this->PrecompAll(V_Type);
  280. Profile::Toc();
  281. Profile::Tic("PrecompV1",&comm,false,4);
  282. this->PrecompAll(V1_Type);
  283. Profile::Toc();
  284. }Profile::Toc();
  285. }
/**
 * Builds the permutation (with per-entry scaling) of surface-point values
 * induced by a symmetry operation on the reference check surface.
 * \param m        Surface order, as passed to d_check_surf().
 * \param p_indx   Symmetry index: ReflecX/Y/Z, SwapXY/SwapXZ, Scaling,
 *                 or anything else for the identity point-map.
 * \param ker_perm Kernel's permutation/scaling of the dof components.
 * \param scal_exp Optional per-component scaling exponents; applied as
 *                 2^exponent only when p_indx==Scaling.
 * \return Permutation of size n_trg*dof combining the point symmetry with
 *         the kernel's component permutation and scaling.
 */
template <class Real_t>
Permutation<Real_t> equiv_surf_perm(size_t m, size_t p_indx, const Permutation<Real_t>& ker_perm, const Vector<Real_t>* scal_exp=NULL){
  Real_t eps=1e-10; // tolerance for matching transformed surface points
  int dof=ker_perm.Dim();
  // Reference surface: depth-0 check surface of the cube cornered at -0.5.
  Real_t c[3]={-0.5,-0.5,-0.5};
  std::vector<Real_t> trg_coord=d_check_surf(m,c,0);
  int n_trg=trg_coord.size()/3;
  Permutation<Real_t> P=Permutation<Real_t>(n_trg*dof);
  if(p_indx==ReflecX || p_indx==ReflecY || p_indx==ReflecZ){ // Set P.perm
    // Point j maps to the point i whose coordinates equal j's with the
    // reflected axis negated; components follow ker_perm within each point.
    for(int i=0;i<n_trg;i++)
    for(int j=0;j<n_trg;j++){
      if(pvfmm::fabs<Real_t>(trg_coord[i*3+0]-trg_coord[j*3+0]*(p_indx==ReflecX?-1.0:1.0))<eps)
      if(pvfmm::fabs<Real_t>(trg_coord[i*3+1]-trg_coord[j*3+1]*(p_indx==ReflecY?-1.0:1.0))<eps)
      if(pvfmm::fabs<Real_t>(trg_coord[i*3+2]-trg_coord[j*3+2]*(p_indx==ReflecZ?-1.0:1.0))<eps){
        for(int k=0;k<dof;k++){
          P.perm[j*dof+k]=i*dof+ker_perm.perm[k];
        }
      }
    }
  }else if(p_indx==SwapXY || p_indx==SwapXZ){
    // Point j maps to the point i whose coordinates are j's with the two
    // named axes exchanged (x<->y or x<->z).
    for(int i=0;i<n_trg;i++)
    for(int j=0;j<n_trg;j++){
      if(pvfmm::fabs<Real_t>(trg_coord[i*3+0]-trg_coord[j*3+(p_indx==SwapXY?1:2)])<eps)
      if(pvfmm::fabs<Real_t>(trg_coord[i*3+1]-trg_coord[j*3+(p_indx==SwapXY?0:1)])<eps)
      if(pvfmm::fabs<Real_t>(trg_coord[i*3+2]-trg_coord[j*3+(p_indx==SwapXY?2:0)])<eps){
        for(int k=0;k<dof;k++){
          P.perm[j*dof+k]=i*dof+ker_perm.perm[k];
        }
      }
    }
  }else{
    // Identity on points; only the component permutation is applied.
    for(int j=0;j<n_trg;j++){
      for(int k=0;k<dof;k++){
        P.perm[j*dof+k]=j*dof+ker_perm.perm[k];
      }
    }
  }
  if(scal_exp && p_indx==Scaling){ // Set level-by-level scaling
    assert(dof==scal_exp->Dim());
    Vector<Real_t> scal(scal_exp->Dim());
    for(size_t i=0;i<scal.Dim();i++){
      scal[i]=pvfmm::pow<Real_t>(2.0,(*scal_exp)[i]);
    }
    for(int j=0;j<n_trg;j++){
      for(int i=0;i<dof;i++){
        P.scal[j*dof+i]*=scal[i];
      }
    }
  }
  { // Set P.scal: fold in the kernel's per-component scaling for every point.
    for(int j=0;j<n_trg;j++){
      for(int i=0;i<dof;i++){
        P.scal[j*dof+i]*=ker_perm.scal[i];
      }
    }
  }
  return P;
}
  344. template <class FMMNode>
  345. Permutation<typename FMMNode::Real_t>& FMM_Pts<FMMNode>::PrecompPerm(Mat_Type type, Perm_Type perm_indx){
  346. //Check if the matrix already exists.
  347. Permutation<Real_t>& P_ = mat->Perm((Mat_Type)type, perm_indx);
  348. if(P_.Dim()!=0) return P_;
  349. size_t m=this->MultipoleOrder();
  350. size_t p_indx=perm_indx % C_Perm;
  351. //Compute the matrix.
  352. Permutation<Real_t> P;
  353. switch (type){
  354. case U2U_Type:
  355. {
  356. Vector<Real_t> scal_exp;
  357. Permutation<Real_t> ker_perm;
  358. if(perm_indx<C_Perm){ // Source permutation
  359. ker_perm=kernel->k_m2m->perm_vec[0 +p_indx];
  360. scal_exp=kernel->k_m2m->src_scal;
  361. }else{ // Target permutation
  362. ker_perm=kernel->k_m2m->perm_vec[0 +p_indx];
  363. scal_exp=kernel->k_m2m->src_scal;
  364. for(size_t i=0;i<scal_exp.Dim();i++) scal_exp[i]=-scal_exp[i];
  365. }
  366. P=equiv_surf_perm(m, p_indx, ker_perm, (this->ScaleInvar()?&scal_exp:NULL));
  367. break;
  368. }
  369. case D2D_Type:
  370. {
  371. Vector<Real_t> scal_exp;
  372. Permutation<Real_t> ker_perm;
  373. if(perm_indx<C_Perm){ // Source permutation
  374. ker_perm=kernel->k_l2l->perm_vec[C_Perm+p_indx];
  375. scal_exp=kernel->k_l2l->trg_scal;
  376. for(size_t i=0;i<scal_exp.Dim();i++) scal_exp[i]=-scal_exp[i];
  377. }else{ // Target permutation
  378. ker_perm=kernel->k_l2l->perm_vec[C_Perm+p_indx];
  379. scal_exp=kernel->k_l2l->trg_scal;
  380. }
  381. P=equiv_surf_perm(m, p_indx, ker_perm, (this->ScaleInvar()?&scal_exp:NULL));
  382. break;
  383. }
  384. default:
  385. break;
  386. }
  387. //Save the matrix for future use.
  388. #pragma omp critical (PRECOMP_MATRIX_PTS)
  389. {
  390. if(P_.Dim()==0) P_=P;
  391. }
  392. return P_;
  393. }
/**
 * @brief Return (computing and caching on first use) the precomputed matrix
 * for interaction (type, mat_indx) at the given tree level.
 *
 * For scale-invariant kernels all levels share one matrix (level forced to 0).
 * If this interaction is related by symmetry to its interaction-class
 * representative, the returned matrix may intentionally stay empty: the class
 * matrix together with the row/column permutations (Perm_R/Perm_C) represents
 * it implicitly. Negative levels are used while building the periodic
 * boundary-condition (BC_Type) matrix.
 *
 * @param level    tree level (may be negative for BC precomputation).
 * @param type     interaction/operator type (UC2UE0..BC).
 * @param mat_indx index of the relative interaction direction.
 * @return reference to the cached matrix (possibly empty, see above).
 */
template <class FMMNode>
Matrix<typename FMMNode::Real_t>& FMM_Pts<FMMNode>::Precomp(int level, Mat_Type type, size_t mat_indx){
if(this->ScaleInvar()) level=0; // scale-invariant kernel: one matrix for all levels
//Check if the matrix already exists.
Matrix<Real_t>& M_ = this->mat->Mat(level, type, mat_indx);
if(M_.Dim(0)!=0 && M_.Dim(1)!=0) return M_;
else{ //Compute matrix from symmetry class (if possible).
size_t class_indx = this->interac_list.InteracClass(type, mat_indx);
if(class_indx!=mat_indx){
Matrix<Real_t>& M0 = this->Precomp(level, type, class_indx);
if(M0.Dim(0)==0 || M0.Dim(1)==0) return M_;
for(size_t i=0;i<Perm_Count;i++) this->PrecompPerm(type, (Perm_Type) i);
Permutation<Real_t>& Pr = this->interac_list.Perm_R(abs(level), type, mat_indx);
Permutation<Real_t>& Pc = this->interac_list.Perm_C(abs(level), type, mat_indx);
// Valid permutations exist: return the (empty) M_; callers use
// Pr * M0 * Pc instead of an explicitly stored matrix.
if(Pr.Dim()>0 && Pc.Dim()>0 && M0.Dim(0)>0 && M0.Dim(1)>0) return M_;
}
}
//Compute the matrix.
Matrix<Real_t> M;
//int omp_p=omp_get_max_threads();
switch (type){
case UC2UE0_Type: // first factor of the pseudo-inverse of the upward check-to-equivalent operator
{
if(MultipoleOrder()==0) break;
const int* ker_dim=kernel->k_m2m->ker_dim;
// Coord of upward check surface
Real_t c[3]={0,0,0};
std::vector<Real_t> uc_coord=u_check_surf(MultipoleOrder(),c,level);
size_t n_uc=uc_coord.size()/3;
// Coord of upward equivalent surface
std::vector<Real_t> ue_coord=u_equiv_surf(MultipoleOrder(),c,level);
size_t n_ue=ue_coord.size()/3;
// Evaluate potential at check surface due to equivalent surface.
Matrix<Real_t> M_e2c(n_ue*ker_dim[0],n_uc*ker_dim[1]);
kernel->k_m2m->BuildMatrix(&ue_coord[0], n_ue,
&uc_coord[0], n_uc, &(M_e2c[0][0]));
Matrix<Real_t> U,S,V;
M_e2c.SVD(U,S,V);
// Compute machine epsilon by successive halving.
Real_t eps=1, max_S=0;
while(eps*(Real_t)0.5+(Real_t)1.0>1.0) eps*=0.5;
for(size_t i=0;i<std::min(S.Dim(0),S.Dim(1));i++){
if(pvfmm::fabs<Real_t>(S[i][i])>max_S) max_S=pvfmm::fabs<Real_t>(S[i][i]);
}
// Regularized pseudo-inverse: invert singular values above 4*eps*max_S, zero the rest.
for(size_t i=0;i<S.Dim(0);i++) S[i][i]=(S[i][i]>eps*max_S*4?1.0/S[i][i]:0.0);
M=V.Transpose()*S;//*U.Transpose();
break;
}
case UC2UE1_Type: // second factor (U^T) of the same pseudo-inverse
{
if(MultipoleOrder()==0) break;
const int* ker_dim=kernel->k_m2m->ker_dim;
// Coord of upward check surface
Real_t c[3]={0,0,0};
std::vector<Real_t> uc_coord=u_check_surf(MultipoleOrder(),c,level);
size_t n_uc=uc_coord.size()/3;
// Coord of upward equivalent surface
std::vector<Real_t> ue_coord=u_equiv_surf(MultipoleOrder(),c,level);
size_t n_ue=ue_coord.size()/3;
// Evaluate potential at check surface due to equivalent surface.
Matrix<Real_t> M_e2c(n_ue*ker_dim[0],n_uc*ker_dim[1]);
kernel->k_m2m->BuildMatrix(&ue_coord[0], n_ue,
&uc_coord[0], n_uc, &(M_e2c[0][0]));
Matrix<Real_t> U,S,V;
M_e2c.SVD(U,S,V);
M=U.Transpose();
break;
}
case DC2DE0_Type: // downward analogue of UC2UE0 (same pseudo-inverse construction)
{
if(MultipoleOrder()==0) break;
const int* ker_dim=kernel->k_l2l->ker_dim;
// Coord of downward check surface
Real_t c[3]={0,0,0};
std::vector<Real_t> check_surf=d_check_surf(MultipoleOrder(),c,level);
size_t n_ch=check_surf.size()/3;
// Coord of downward equivalent surface
std::vector<Real_t> equiv_surf=d_equiv_surf(MultipoleOrder(),c,level);
size_t n_eq=equiv_surf.size()/3;
// Evaluate potential at check surface due to equivalent surface.
Matrix<Real_t> M_e2c(n_eq*ker_dim[0],n_ch*ker_dim[1]);
kernel->k_l2l->BuildMatrix(&equiv_surf[0], n_eq,
&check_surf[0], n_ch, &(M_e2c[0][0]));
Matrix<Real_t> U,S,V;
M_e2c.SVD(U,S,V);
// Compute machine epsilon by successive halving.
Real_t eps=1, max_S=0;
while(eps*(Real_t)0.5+(Real_t)1.0>1.0) eps*=0.5;
for(size_t i=0;i<std::min(S.Dim(0),S.Dim(1));i++){
if(pvfmm::fabs<Real_t>(S[i][i])>max_S) max_S=pvfmm::fabs<Real_t>(S[i][i]);
}
for(size_t i=0;i<S.Dim(0);i++) S[i][i]=(S[i][i]>eps*max_S*4?1.0/S[i][i]:0.0);
M=V.Transpose()*S;//*U.Transpose();
break;
}
case DC2DE1_Type: // downward analogue of UC2UE1
{
if(MultipoleOrder()==0) break;
const int* ker_dim=kernel->k_l2l->ker_dim;
// Coord of downward check surface
Real_t c[3]={0,0,0};
std::vector<Real_t> check_surf=d_check_surf(MultipoleOrder(),c,level);
size_t n_ch=check_surf.size()/3;
// Coord of downward equivalent surface
std::vector<Real_t> equiv_surf=d_equiv_surf(MultipoleOrder(),c,level);
size_t n_eq=equiv_surf.size()/3;
// Evaluate potential at check surface due to equivalent surface.
Matrix<Real_t> M_e2c(n_eq*ker_dim[0],n_ch*ker_dim[1]);
kernel->k_l2l->BuildMatrix(&equiv_surf[0], n_eq,
&check_surf[0], n_ch, &(M_e2c[0][0]));
Matrix<Real_t> U,S,V;
M_e2c.SVD(U,S,V);
M=U.Transpose();
break;
}
case U2U_Type: // child-to-parent multipole translation
{
if(MultipoleOrder()==0) break;
const int* ker_dim=kernel->k_m2m->ker_dim;
// Coord of upward check surface
Real_t c[3]={0,0,0};
std::vector<Real_t> check_surf=u_check_surf(MultipoleOrder(),c,level);
size_t n_uc=check_surf.size()/3;
// Coord of child's upward equivalent surface
Real_t s=pvfmm::pow<Real_t>(0.5,(level+2));
int* coord=interac_list.RelativeCoord(type,mat_indx);
Real_t child_coord[3]={(coord[0]+1)*s,(coord[1]+1)*s,(coord[2]+1)*s};
std::vector<Real_t> equiv_surf=u_equiv_surf(MultipoleOrder(),child_coord,level+1);
size_t n_ue=equiv_surf.size()/3;
// Evaluate potential at check surface due to equivalent surface.
Matrix<Real_t> M_ce2c(n_ue*ker_dim[0],n_uc*ker_dim[1]);
kernel->k_m2m->BuildMatrix(&equiv_surf[0], n_ue,
&check_surf[0], n_uc, &(M_ce2c[0][0]));
// Convert the check-surface potential back to an equivalent density.
Matrix<Real_t>& M_c2e0 = Precomp(level, UC2UE0_Type, 0);
Matrix<Real_t>& M_c2e1 = Precomp(level, UC2UE1_Type, 0);
M=(M_ce2c*M_c2e0)*M_c2e1;
break;
}
case D2D_Type: // parent-to-child local translation
{
if(MultipoleOrder()==0) break;
const int* ker_dim=kernel->k_l2l->ker_dim;
// Coord of downward check surface
Real_t s=pvfmm::pow<Real_t>(0.5,level+1);
int* coord=interac_list.RelativeCoord(type,mat_indx);
Real_t c[3]={(coord[0]+1)*s,(coord[1]+1)*s,(coord[2]+1)*s};
std::vector<Real_t> check_surf=d_check_surf(MultipoleOrder(),c,level);
size_t n_dc=check_surf.size()/3;
// Coord of parent's downward equivalent surface
Real_t parent_coord[3]={0,0,0};
std::vector<Real_t> equiv_surf=d_equiv_surf(MultipoleOrder(),parent_coord,level-1);
size_t n_de=equiv_surf.size()/3;
// Evaluate potential at check surface due to equivalent surface.
Matrix<Real_t> M_pe2c(n_de*ker_dim[0],n_dc*ker_dim[1]);
kernel->k_l2l->BuildMatrix(&equiv_surf[0], n_de,
&check_surf[0], n_dc, &(M_pe2c[0][0]));
Matrix<Real_t> M_c2e0=Precomp(level-1,DC2DE0_Type,0);
Matrix<Real_t> M_c2e1=Precomp(level-1,DC2DE1_Type,0);
if(ScaleInvar()){ // Scale M_c2e0 for level-1
Permutation<Real_t> ker_perm=this->kernel->k_l2l->perm_vec[C_Perm+Scaling];
Vector<Real_t> scal_exp=this->kernel->k_l2l->trg_scal;
Permutation<Real_t> P=equiv_surf_perm(MultipoleOrder(), Scaling, ker_perm, &scal_exp);
M_c2e0=P*M_c2e0;
}
if(ScaleInvar()){ // Scale M_c2e1 for level-1
Permutation<Real_t> ker_perm=this->kernel->k_l2l->perm_vec[0 +Scaling];
Vector<Real_t> scal_exp=this->kernel->k_l2l->src_scal;
Permutation<Real_t> P=equiv_surf_perm(MultipoleOrder(), Scaling, ker_perm, &scal_exp);
M_c2e1=M_c2e1*P;
}
M=M_c2e0*(M_c2e1*M_pe2c);
break;
}
case D2T_Type: // local expansion to target points
{
if(MultipoleOrder()==0) break;
const int* ker_dim=kernel->k_l2t->ker_dim;
std::vector<Real_t>& rel_trg_coord=mat->RelativeTrgCoord();
// Coord of target points
Real_t r=pvfmm::pow<Real_t>(0.5,level);
size_t n_trg=rel_trg_coord.size()/3;
std::vector<Real_t> trg_coord(n_trg*3);
for(size_t i=0;i<n_trg*COORD_DIM;i++) trg_coord[i]=rel_trg_coord[i]*r;
// Coord of downward equivalent surface
Real_t c[3]={0,0,0};
std::vector<Real_t> equiv_surf=d_equiv_surf(MultipoleOrder(),c,level);
size_t n_eq=equiv_surf.size()/3;
// Evaluate potential at target points due to equivalent surface.
{
M .Resize(n_eq*ker_dim [0], n_trg*ker_dim [1]);
kernel->k_l2t->BuildMatrix(&equiv_surf[0], n_eq, &trg_coord[0], n_trg, &(M [0][0]));
}
Matrix<Real_t>& M_c2e0=Precomp(level,DC2DE0_Type,0);
Matrix<Real_t>& M_c2e1=Precomp(level,DC2DE1_Type,0);
M=M_c2e0*(M_c2e1*M);
break;
}
case V_Type: // M2L: store the r2c FFT of the kernel sampled on the convolution grid
{
if(MultipoleOrder()==0) break;
const int* ker_dim=kernel->k_m2l->ker_dim;
int n1=MultipoleOrder()*2;
int n3 =n1*n1*n1;       // real grid size
int n3_=n1*n1*(n1/2+1); // complex (r2c) grid size
//Compute the matrix.
Real_t s=pvfmm::pow<Real_t>(0.5,level);
int* coord2=interac_list.RelativeCoord(type,mat_indx);
Real_t coord_diff[3]={coord2[0]*s,coord2[1]*s,coord2[2]*s};
//Evaluate potential.
std::vector<Real_t> r_trg(COORD_DIM,0.0);
std::vector<Real_t> conv_poten(n3*ker_dim[0]*ker_dim[1]);
std::vector<Real_t> conv_coord=conv_grid(MultipoleOrder(),coord_diff,level);
kernel->k_m2l->BuildMatrix(&conv_coord[0],n3,&r_trg[0],1,&conv_poten[0]);
//Rearrange data.
Matrix<Real_t> M_conv(n3,ker_dim[0]*ker_dim[1],&conv_poten[0],false);
M_conv=M_conv.Transpose();
//Compute FFTW plan.
int nnn[3]={n1,n1,n1};
Real_t *fftw_in, *fftw_out;
fftw_in = mem::aligned_new<Real_t>( n3 *ker_dim[0]*ker_dim[1]*sizeof(Real_t));
fftw_out = mem::aligned_new<Real_t>(2*n3_*ker_dim[0]*ker_dim[1]*sizeof(Real_t));
// The FFTW plan is created once and shared across threads; guard creation.
#pragma omp critical (FFTW_PLAN)
{
if (!vprecomp_fft_flag){
vprecomp_fftplan = FFTW_t<Real_t>::fft_plan_many_dft_r2c(COORD_DIM, nnn, ker_dim[0]*ker_dim[1],
(Real_t*)fftw_in, NULL, 1, n3, (typename FFTW_t<Real_t>::cplx*) fftw_out, NULL, 1, n3_);
vprecomp_fft_flag=true;
}
}
//Compute FFT.
mem::memcopy(fftw_in, &conv_poten[0], n3*ker_dim[0]*ker_dim[1]*sizeof(Real_t));
FFTW_t<Real_t>::fft_execute_dft_r2c(vprecomp_fftplan, (Real_t*)fftw_in, (typename FFTW_t<Real_t>::cplx*)(fftw_out));
Matrix<Real_t> M_(2*n3_*ker_dim[0]*ker_dim[1],1,(Real_t*)fftw_out,false);
M=M_;
//Free memory.
mem::aligned_delete<Real_t>(fftw_in);
mem::aligned_delete<Real_t>(fftw_out);
break;
}
case V1_Type: // gather the 8x8 child-pair V-interaction FFTs into one interleaved matrix
{
if(MultipoleOrder()==0) break;
const int* ker_dim=kernel->k_m2l->ker_dim;
size_t mat_cnt =interac_list.ListCount( V_Type);
for(size_t k=0;k<mat_cnt;k++) Precomp(level, V_Type, k);
const size_t chld_cnt=1UL<<COORD_DIM;
size_t n1=MultipoleOrder()*2;
size_t M_dim=n1*n1*(n1/2+1);
size_t n3=n1*n1*n1;
// Child pairs with no V-interaction point at this shared zero block.
Vector<Real_t> zero_vec(M_dim*ker_dim[0]*ker_dim[1]*2);
zero_vec.SetZero();
Vector<Real_t*> M_ptr(chld_cnt*chld_cnt);
for(size_t i=0;i<chld_cnt*chld_cnt;i++) M_ptr[i]=&zero_vec[0];
int* rel_coord_=interac_list.RelativeCoord(V1_Type, mat_indx);
for(int j1=0;j1<chld_cnt;j1++)
for(int j2=0;j2<chld_cnt;j2++){
// Relative coordinate between child j2 of the source box and child j1 of the target box.
int rel_coord[3]={rel_coord_[0]*2-(j1/1)%2+(j2/1)%2,
rel_coord_[1]*2-(j1/2)%2+(j2/2)%2,
rel_coord_[2]*2-(j1/4)%2+(j2/4)%2};
for(size_t k=0;k<mat_cnt;k++){
int* ref_coord=interac_list.RelativeCoord(V_Type, k);
if(ref_coord[0]==rel_coord[0] &&
ref_coord[1]==rel_coord[1] &&
ref_coord[2]==rel_coord[2]){
Matrix<Real_t>& M = this->mat->Mat(level, V_Type, k);
M_ptr[j2*chld_cnt+j1]=&M[0][0];
break;
}
}
}
// Build matrix ker_dim0 x ker_dim1 x M_dim x 8 x 8
// Entries are divided by n3 (presumably the FFT normalization factor -- confirm against V_Type/FFT usage).
M.Resize(ker_dim[0]*ker_dim[1]*M_dim, 2*chld_cnt*chld_cnt);
for(int j=0;j<ker_dim[0]*ker_dim[1]*M_dim;j++){
for(size_t k=0;k<chld_cnt*chld_cnt;k++){
M[j][k*2+0]=M_ptr[k][j*2+0]/n3;
M[j][k*2+1]=M_ptr[k][j*2+1]/n3;
}
}
break;
}
case W_Type: // multipole expansion of a source child box to target points
{
if(MultipoleOrder()==0) break;
const int* ker_dim=kernel->k_m2t->ker_dim;
std::vector<Real_t>& rel_trg_coord=mat->RelativeTrgCoord();
// Coord of target points
Real_t s=pvfmm::pow<Real_t>(0.5,level);
size_t n_trg=rel_trg_coord.size()/3;
std::vector<Real_t> trg_coord(n_trg*3);
for(size_t j=0;j<n_trg*COORD_DIM;j++) trg_coord[j]=rel_trg_coord[j]*s;
// Coord of the source box's upward equivalent surface (at level+1)
int* coord2=interac_list.RelativeCoord(type,mat_indx);
Real_t c[3]={(Real_t)((coord2[0]+1)*s*0.25),(Real_t)((coord2[1]+1)*s*0.25),(Real_t)((coord2[2]+1)*s*0.25)};
std::vector<Real_t> equiv_surf=u_equiv_surf(MultipoleOrder(),c,level+1);
size_t n_eq=equiv_surf.size()/3;
// Evaluate potential at target points due to equivalent surface.
{
M .Resize(n_eq*ker_dim [0],n_trg*ker_dim [1]);
kernel->k_m2t->BuildMatrix(&equiv_surf[0], n_eq, &trg_coord[0], n_trg, &(M [0][0]));
}
break;
}
case BC_Type: // periodic boundary-condition operator, built by a telescoping sum over BC_LEVELS coarser levels
{
if(!this->ScaleInvar() || MultipoleOrder()==0) break;
if(kernel->k_m2l->ker_dim[0]!=kernel->k_m2m->ker_dim[0]) break;
if(kernel->k_m2l->ker_dim[1]!=kernel->k_l2l->ker_dim[1]) break;
int ker_dim[2]={kernel->k_m2l->ker_dim[0],kernel->k_m2l->ker_dim[1]};
size_t mat_cnt_m2m=interac_list.ListCount(U2U_Type);
size_t n_surf=(6*(MultipoleOrder()-1)*(MultipoleOrder()-1)+2); //Total number of points.
if((M.Dim(0)!=n_surf*ker_dim[0] || M.Dim(1)!=n_surf*ker_dim[1]) && level==0){
Matrix<Real_t> M_m2m[BC_LEVELS+1];
Matrix<Real_t> M_m2l[BC_LEVELS+1];
Matrix<Real_t> M_l2l[BC_LEVELS+1];
Matrix<Real_t> M_equiv_zero_avg(n_surf*ker_dim[0],n_surf*ker_dim[0]);
Matrix<Real_t> M_check_zero_avg(n_surf*ker_dim[1],n_surf*ker_dim[1]);
{ // Set average multipole charge to zero (projection for non-zero total source density)
Matrix<Real_t> M_s2c;
{ // Compute M_s2c
int ker_dim[2]={kernel->k_m2m->ker_dim[0],kernel->k_m2m->ker_dim[1]};
M_s2c.ReInit(ker_dim[0],n_surf*ker_dim[1]);
std::vector<Real_t> uc_coord;
{ // Coord of upward check surface
Real_t c[3]={0,0,0};
uc_coord=u_check_surf(MultipoleOrder(),c,0);
}
#pragma omp parallel for schedule(dynamic)
for(size_t i=0;i<n_surf;i++){
// NOTE: this local M_ shadows the function-level cache reference M_.
std::vector<Real_t> M_=cheb_integ<Real_t>(0, &uc_coord[i*3], 1.0, *kernel->k_m2m);
for(size_t j=0; j<ker_dim[0]; j++)
for(int k=0; k<ker_dim[1]; k++)
M_s2c[j][i*ker_dim[1]+k] = M_[j+k*ker_dim[0]];
}
}
Matrix<Real_t>& M_c2e0 = Precomp(level, UC2UE0_Type, 0);
Matrix<Real_t>& M_c2e1 = Precomp(level, UC2UE1_Type, 0);
Matrix<Real_t> M_s2e=(M_s2c*M_c2e0)*M_c2e1;
for(size_t i=0;i<M_s2e.Dim(0);i++){ // Normalize each row to 1
Real_t s=0;
for(size_t j=0;j<M_s2e.Dim(1);j++) s+=M_s2e[i][j];
s=1.0/s;
for(size_t j=0;j<M_s2e.Dim(1);j++) M_s2e[i][j]*=s;
}
assert(M_equiv_zero_avg.Dim(0)==M_s2e.Dim(1));
assert(M_equiv_zero_avg.Dim(1)==M_s2e.Dim(1));
M_equiv_zero_avg.SetZero();
for(size_t i=0;i<n_surf*ker_dim[0];i++)
M_equiv_zero_avg[i][i]=1;
for(size_t i=0;i<n_surf;i++)
for(size_t k=0;k<ker_dim[0];k++)
for(size_t j=0;j<n_surf*ker_dim[0];j++)
M_equiv_zero_avg[i*ker_dim[0]+k][j]-=M_s2e[k][j];
}
{ // Set average check potential to zero. (improves stability for large BC_LEVELS)
M_check_zero_avg.SetZero();
for(size_t i=0;i<n_surf*ker_dim[1];i++)
M_check_zero_avg[i][i]+=1;
for(size_t i=0;i<n_surf;i++)
for(size_t j=0;j<n_surf;j++)
for(size_t k=0;k<ker_dim[1];k++)
M_check_zero_avg[i*ker_dim[1]+k][j*ker_dim[1]+k]-=1.0/n_surf;
}
// NOTE: this loop variable shadows the function parameter `level`;
// it walks coarser (negative) levels 0, -1, ..., -BC_LEVELS.
for(int level=0; level>=-BC_LEVELS; level--){
{ // Compute M_l2l
this->Precomp(level, D2D_Type, 0);
Permutation<Real_t> Pr = this->interac_list.Perm_R(abs(level), D2D_Type, 0);
Permutation<Real_t> Pc = this->interac_list.Perm_C(abs(level), D2D_Type, 0);
{ // Invert scaling because level<0
for(long i=0;i<Pr.Dim();i++) Pr.scal[i]=1.0/Pr.scal[i];
for(long i=0;i<Pc.Dim();i++) Pc.scal[i]=1.0/Pc.scal[i];
}
M_l2l[-level] = M_check_zero_avg * Pr * this->Precomp(level, D2D_Type, this->interac_list.InteracClass(D2D_Type, 0)) * Pc * M_check_zero_avg;
assert(M_l2l[-level].Dim(0)>0 && M_l2l[-level].Dim(1)>0);
}
// Compute M_m2m (sum of all eight child-to-parent translations)
for(size_t mat_indx=0; mat_indx<mat_cnt_m2m; mat_indx++){
this->Precomp(level-1, U2U_Type, mat_indx);
Permutation<Real_t> Pr = this->interac_list.Perm_R(abs(level-1), U2U_Type, mat_indx);
Permutation<Real_t> Pc = this->interac_list.Perm_C(abs(level-1), U2U_Type, mat_indx);
for(long i=0;i<Pr.Dim();i++) Pr.scal[i]=1.0/Pr.scal[i];
for(long i=0;i<Pc.Dim();i++) Pc.scal[i]=1.0/Pc.scal[i];
Matrix<Real_t> M = Pr * this->Precomp(level-1, U2U_Type, this->interac_list.InteracClass(U2U_Type, mat_indx)) * Pc;
assert(M.Dim(0)>0 && M.Dim(1)>0);
if(mat_indx==0) M_m2m[-level] = M_equiv_zero_avg*M*M_equiv_zero_avg;
else M_m2m[-level] += M_equiv_zero_avg*M*M_equiv_zero_avg;
}
// Compute M_m2l
if(!ScaleInvar() || level==0){
// Direct evaluation: sum over the 6x6x6 neighborhood excluding the near 3x3x3 boxes.
Real_t s=(1UL<<(-level));
Real_t dc_coord[3]={0,0,0};
std::vector<Real_t> trg_coord=d_check_surf(MultipoleOrder(), dc_coord, level);
Matrix<Real_t> M_ue2dc(n_surf*ker_dim[0], n_surf*ker_dim[1]); M_ue2dc.SetZero();
for(int x0=-2;x0<4;x0++)
for(int x1=-2;x1<4;x1++)
for(int x2=-2;x2<4;x2++)
if(abs(x0)>1 || abs(x1)>1 || abs(x2)>1){
Real_t ue_coord[3]={x0*s, x1*s, x2*s};
std::vector<Real_t> src_coord=u_equiv_surf(MultipoleOrder(), ue_coord, level);
Matrix<Real_t> M_tmp(n_surf*ker_dim[0], n_surf*ker_dim[1]);
kernel->k_m2l->BuildMatrix(&src_coord[0], n_surf,
&trg_coord[0], n_surf, &(M_tmp[0][0]));
M_ue2dc+=M_tmp;
}
M_m2l[-level]=M_equiv_zero_avg*M_ue2dc * M_check_zero_avg;
}else{
// Scale-invariant kernel: reuse the previous level's matrix, rescaled.
M_m2l[-level]=M_equiv_zero_avg * M_m2l[-level-1] * M_check_zero_avg;
if(ScaleInvar()){ // Scale M_m2l
Permutation<Real_t> ker_perm=this->kernel->k_m2l->perm_vec[0 +Scaling];
Vector<Real_t> scal_exp=this->kernel->k_m2l->src_scal;
for(size_t i=0;i<scal_exp.Dim();i++) scal_exp[i]=-scal_exp[i];
Permutation<Real_t> P=equiv_surf_perm(MultipoleOrder(), Scaling, ker_perm, &scal_exp);
M_m2l[-level]=P*M_m2l[-level];
}
if(ScaleInvar()){ // Scale M_m2l
Permutation<Real_t> ker_perm=this->kernel->k_m2l->perm_vec[C_Perm+Scaling];
Vector<Real_t> scal_exp=this->kernel->k_m2l->trg_scal;
for(size_t i=0;i<scal_exp.Dim();i++) scal_exp[i]=-scal_exp[i];
Permutation<Real_t> P=equiv_surf_perm(MultipoleOrder(), Scaling, ker_perm, &scal_exp);
M_m2l[-level]=M_m2l[-level]*P;
}
}
}
// Accumulate the telescoping sum from the coarsest level back to level 0.
for(int level=-BC_LEVELS;level<=0;level++){
if(level==-BC_LEVELS) M = M_m2l[-level];
else M = M_equiv_zero_avg * (M_m2l[-level] + M_m2m[-level]*M*M_l2l[-level]) * M_check_zero_avg;
}
if(kernel->k_m2l->vol_poten){ // Correction for far-field of analytical volume potential
Matrix<Real_t> M_far;
{ // Compute M_far
// kernel->k_m2l->vol_poten is the analtical particular solution for uniform source density=1
// We already corrected far-field above with M_equiv_zero_avg, so we don't need the far field of the analytical solutions.
// We take the analytical solution and subtract the near interaction (3x3x3 boxes) from it to get the far-field
// Then, we add the far-field correction for the analytical solution to be subtracted later.
std::vector<Real_t> dc_coord;
{ // Coord of upward check surface
Real_t c[3]={1.0,1.0,1.0};
dc_coord=d_check_surf(MultipoleOrder(),c,0);
}
Matrix<Real_t> M_near(ker_dim[0],n_surf*ker_dim[1]);
#pragma omp parallel for schedule(dynamic)
for(size_t i=0;i<n_surf;i++){ // Compute near-interaction part
std::vector<Real_t> M_=cheb_integ<Real_t>(0, &dc_coord[i*3], 3.0, *kernel->k_m2l);
for(size_t j=0; j<ker_dim[0]; j++)
for(int k=0; k<ker_dim[1]; k++)
M_near[j][i*ker_dim[1]+k] = M_[j+k*ker_dim[0]];
}
{ // M_far = M_analytic - M_near
Matrix<Real_t> M_analytic(ker_dim[0],n_surf*ker_dim[1]); M_analytic.SetZero();
kernel->k_m2l->vol_poten(&dc_coord[0],n_surf,&M_analytic[0][0]);
M_far=M_analytic-M_near;
}
}
{ // Add far-field corection to M
for(size_t i=0;i<n_surf;i++)
for(size_t k=0;k<ker_dim[0];k++)
for(size_t j=0;j<n_surf*ker_dim[1];j++)
M[i*ker_dim[0]+k][j]+=M_far[k][j];
}
}
{ // a + bx + cy + dz + exy + fxz + gyz correction.
// Sample the residual error at the 8 corners of the unit box and subtract
// its trilinear interpolant on the check surface.
std::vector<Real_t> corner_pts;
corner_pts.push_back(0); corner_pts.push_back(0); corner_pts.push_back(0);
corner_pts.push_back(1); corner_pts.push_back(0); corner_pts.push_back(0);
corner_pts.push_back(0); corner_pts.push_back(1); corner_pts.push_back(0);
corner_pts.push_back(0); corner_pts.push_back(0); corner_pts.push_back(1);
corner_pts.push_back(0); corner_pts.push_back(1); corner_pts.push_back(1);
corner_pts.push_back(1); corner_pts.push_back(0); corner_pts.push_back(1);
corner_pts.push_back(1); corner_pts.push_back(1); corner_pts.push_back(0);
corner_pts.push_back(1); corner_pts.push_back(1); corner_pts.push_back(1);
size_t n_corner=corner_pts.size()/COORD_DIM;
// Coord of downward equivalent surface
Real_t c[3]={0,0,0};
std::vector<Real_t> up_equiv_surf=u_equiv_surf(MultipoleOrder(),c,0);
std::vector<Real_t> dn_equiv_surf=d_equiv_surf(MultipoleOrder(),c,0);
std::vector<Real_t> dn_check_surf=d_check_surf(MultipoleOrder(),c,0);
Matrix<Real_t> M_err;
{ // Evaluate potential at corner due to upward and dnward equivalent surface.
{ // Error from local expansion.
Matrix<Real_t> M_e2pt(n_surf*kernel->k_l2l->ker_dim[0],n_corner*kernel->k_l2l->ker_dim[1]);
kernel->k_l2l->BuildMatrix(&dn_equiv_surf[0], n_surf,
&corner_pts[0], n_corner, &(M_e2pt[0][0]));
Matrix<Real_t>& M_dc2de0 = Precomp(0, DC2DE0_Type, 0);
Matrix<Real_t>& M_dc2de1 = Precomp(0, DC2DE1_Type, 0);
M_err=(M*M_dc2de0)*(M_dc2de1*M_e2pt);
}
for(size_t k=0;k<n_corner;k++){ // Error from colleagues of root.
for(int j0=-1;j0<=1;j0++)
for(int j1=-1;j1<=1;j1++)
for(int j2=-1;j2<=1;j2++){
Real_t pt_coord[3]={corner_pts[k*COORD_DIM+0]-j0,
corner_pts[k*COORD_DIM+1]-j1,
corner_pts[k*COORD_DIM+2]-j2};
if(pvfmm::fabs<Real_t>(pt_coord[0]-0.5)>1.0 || pvfmm::fabs<Real_t>(pt_coord[1]-0.5)>1.0 || pvfmm::fabs<Real_t>(pt_coord[2]-0.5)>1.0){
Matrix<Real_t> M_e2pt(n_surf*ker_dim[0],ker_dim[1]);
kernel->k_m2l->BuildMatrix(&up_equiv_surf[0], n_surf,
&pt_coord[0], 1, &(M_e2pt[0][0]));
for(size_t i=0;i<M_e2pt.Dim(0);i++)
for(size_t j=0;j<M_e2pt.Dim(1);j++)
M_err[i][k*ker_dim[1]+j]+=M_e2pt[i][j];
}
}
}
if(kernel->k_m2l->vol_poten){ // Error from analytical volume potential
Matrix<Real_t> M_analytic(ker_dim[0],n_corner*ker_dim[1]); M_analytic.SetZero();
kernel->k_m2l->vol_poten(&corner_pts[0],n_corner,&M_analytic[0][0]);
for(size_t j=0;j<n_surf;j++)
for(size_t k=0;k<ker_dim[0];k++)
for(size_t i=0;i<M_err.Dim(1);i++){
M_err[j*ker_dim[0]+k][i]-=M_analytic[k][i];
}
}
}
// Trilinear interpolation of the corner errors over the check surface.
Matrix<Real_t> M_grad(M_err.Dim(0),n_surf*ker_dim[1]);
for(size_t i=0;i<M_err.Dim(0);i++)
for(size_t k=0;k<ker_dim[1];k++)
for(size_t j=0;j<n_surf;j++){
M_grad[i][j*ker_dim[1]+k]= M_err[i][0*ker_dim[1]+k]
+(M_err[i][1*ker_dim[1]+k]-M_err[i][0*ker_dim[1]+k])*dn_check_surf[j*COORD_DIM+0]
+(M_err[i][2*ker_dim[1]+k]-M_err[i][0*ker_dim[1]+k])*dn_check_surf[j*COORD_DIM+1]
+(M_err[i][3*ker_dim[1]+k]-M_err[i][0*ker_dim[1]+k])*dn_check_surf[j*COORD_DIM+2]
+(M_err[i][4*ker_dim[1]+k]+M_err[i][0*ker_dim[1]+k]-M_err[i][2*ker_dim[1]+k]-M_err[i][3*ker_dim[1]+k])*dn_check_surf[j*COORD_DIM+1]*dn_check_surf[j*COORD_DIM+2]
+(M_err[i][5*ker_dim[1]+k]+M_err[i][0*ker_dim[1]+k]-M_err[i][1*ker_dim[1]+k]-M_err[i][3*ker_dim[1]+k])*dn_check_surf[j*COORD_DIM+2]*dn_check_surf[j*COORD_DIM+0]
+(M_err[i][6*ker_dim[1]+k]+M_err[i][0*ker_dim[1]+k]-M_err[i][1*ker_dim[1]+k]-M_err[i][2*ker_dim[1]+k])*dn_check_surf[j*COORD_DIM+0]*dn_check_surf[j*COORD_DIM+1]
+(M_err[i][7*ker_dim[1]+k]+M_err[i][1*ker_dim[1]+k]+M_err[i][2*ker_dim[1]+k]+M_err[i][3*ker_dim[1]+k]-M_err[i][0*ker_dim[1]+k]-M_err[i][4*ker_dim[1]+k]-M_err[i][5*ker_dim[1]+k]-M_err[i][6*ker_dim[1]+k])*dn_check_surf[j*COORD_DIM+0]*dn_check_surf[j*COORD_DIM+1]*dn_check_surf[j*COORD_DIM+2];
}
M-=M_grad;
}
if(!this->ScaleInvar()){ // Free memory
// Drop the negative-level matrices that were only needed while building M.
Mat_Type type=D2D_Type;
for(int l=-BC_LEVELS;l<0;l++)
for(size_t indx=0;indx<this->interac_list.ListCount(type);indx++){
Matrix<Real_t>& M=this->mat->Mat(l, type, indx);
M.Resize(0,0);
}
type=U2U_Type;
for(int l=-BC_LEVELS;l<0;l++)
for(size_t indx=0;indx<this->interac_list.ListCount(type);indx++){
Matrix<Real_t>& M=this->mat->Mat(l, type, indx);
M.Resize(0,0);
}
type=DC2DE0_Type;
for(int l=-BC_LEVELS;l<0;l++)
for(size_t indx=0;indx<this->interac_list.ListCount(type);indx++){
Matrix<Real_t>& M=this->mat->Mat(l, type, indx);
M.Resize(0,0);
}
type=DC2DE1_Type;
for(int l=-BC_LEVELS;l<0;l++)
for(size_t indx=0;indx<this->interac_list.ListCount(type);indx++){
Matrix<Real_t>& M=this->mat->Mat(l, type, indx);
M.Resize(0,0);
}
type=UC2UE0_Type;
for(int l=-BC_LEVELS;l<0;l++)
for(size_t indx=0;indx<this->interac_list.ListCount(type);indx++){
Matrix<Real_t>& M=this->mat->Mat(l, type, indx);
M.Resize(0,0);
}
type=UC2UE1_Type;
for(int l=-BC_LEVELS;l<0;l++)
for(size_t indx=0;indx<this->interac_list.ListCount(type);indx++){
Matrix<Real_t>& M=this->mat->Mat(l, type, indx);
M.Resize(0,0);
}
}
}
break;
}
default:
break;
}
//Save the matrix for future use.
#pragma omp critical (PRECOMP_MATRIX_PTS)
if(M_.Dim(0)==0 && M_.Dim(1)==0){
M_=M;
/*
M_.Resize(M.Dim(0),M.Dim(1));
int dof=ker_dim[0]*ker_dim[1];
for(int j=0;j<dof;j++){
size_t a=(M.Dim(0)*M.Dim(1)* j )/dof;
size_t b=(M.Dim(0)*M.Dim(1)*(j+1))/dof;
#pragma omp parallel for // NUMA
for(int tid=0;tid<omp_p;tid++){
size_t a_=a+((b-a)* tid )/omp_p;
size_t b_=a+((b-a)*(tid+1))/omp_p;
mem::memcopy(&M_[0][a_], &M[0][a_], (b_-a_)*sizeof(Real_t));
}
}
*/
}
return M_;
}
  984. template <class FMMNode>
  985. void FMM_Pts<FMMNode>::PrecompAll(Mat_Type type, int level){
  986. if(level==-1){
  987. for(int l=0;l<MAX_DEPTH;l++){
  988. PrecompAll(type, l);
  989. }
  990. return;
  991. }
  992. //Compute basic permutations.
  993. for(size_t i=0;i<Perm_Count;i++)
  994. this->PrecompPerm(type, (Perm_Type) i);
  995. {
  996. //Allocate matrices.
  997. size_t mat_cnt=interac_list.ListCount((Mat_Type)type);
  998. mat->Mat(level, (Mat_Type)type, mat_cnt-1);
  999. { // Compute InteracClass matrices.
  1000. std::vector<size_t> indx_lst;
  1001. for(size_t i=0; i<mat_cnt; i++){
  1002. if(interac_list.InteracClass((Mat_Type)type,i)==i)
  1003. indx_lst.push_back(i);
  1004. }
  1005. //Compute Transformations.
  1006. //#pragma omp parallel for //lets use fine grained parallelism
  1007. for(size_t i=0; i<indx_lst.size(); i++){
  1008. Precomp(level, (Mat_Type)type, indx_lst[i]);
  1009. }
  1010. }
  1011. //#pragma omp parallel for //lets use fine grained parallelism
  1012. for(size_t mat_indx=0;mat_indx<mat_cnt;mat_indx++){
  1013. Matrix<Real_t>& M0=interac_list.ClassMat(level,(Mat_Type)type,mat_indx);
  1014. Permutation<Real_t>& pr=interac_list.Perm_R(abs(level), (Mat_Type)type, mat_indx);
  1015. Permutation<Real_t>& pc=interac_list.Perm_C(abs(level), (Mat_Type)type, mat_indx);
  1016. if(pr.Dim()!=M0.Dim(0) || pc.Dim()!=M0.Dim(1)) Precomp(level, (Mat_Type)type, mat_indx);
  1017. }
  1018. }
  1019. }
/// Collect per-node FMM vectors into seven categories and repack each
/// category into one large contiguous buffer (buff_list[indx]), re-pointing
/// every member vector into that buffer via ReInit.
///
/// Categories: 0 upward_equiv, 1 dnward_equiv, 2 upward_equiv_fft,
/// 3 dnward_check_fft, 4 src_val, 5 trg_val, 6 pts_coord.
///
/// @param tree       Tree whose check/equiv surfaces are (re)built and packed.
/// @param node       All nodes to process.
/// @param buff_list  Per-category contiguous storage; resized to at least 7.
/// @param n_list     Per-category node lists; resized to at least 7.
/// @param vec_list   Per-category lists of vectors to pack.
///                   NOTE(review): taken BY VALUE (unlike buff_list/n_list),
///                   so the caller's copy is not updated; the lasting effect
///                   is the ReInit of each node's vectors so that they point
///                   into buff_list -- confirm this asymmetry is intended.
template <class FMMNode>
void FMM_Pts<FMMNode>::CollectNodeData(FMMTree_t* tree, std::vector<FMMNode*>& node, std::vector<Matrix<Real_t> >& buff_list, std::vector<Vector<FMMNode_t*> >& n_list, std::vector<std::vector<Vector<Real_t>* > > vec_list){
if(buff_list.size()<7) buff_list.resize(7);
if( n_list.size()<7) n_list.resize(7);
if( vec_list.size()<7) vec_list.resize(7);
int omp_p=omp_get_max_threads();
if(node.size()==0) return;
{// 0. upward_equiv
int indx=0;
size_t vec_sz;
{ // Set vec_sz = number of columns of the UC2UE1 class matrix, i.e. the
  // length of one upward-equivalent density vector.
Matrix<Real_t>& M_uc2ue = this->interac_list.ClassMat(0, UC2UE1_Type, 0);
vec_sz=M_uc2ue.Dim(1);
}
std::vector< FMMNode* > node_lst;
{// Construct node_lst
node_lst.clear();
// Bucket non-leaf nodes by depth; leaves accumulate their source point
// counts into pt_cnt[0].
std::vector<std::vector< FMMNode* > > node_lst_(MAX_DEPTH+1);
FMMNode_t* r_node=NULL;
for(size_t i=0;i<node.size();i++){
if(!node[i]->IsLeaf()){
node_lst_[node[i]->Depth()].push_back(node[i]);
}else{
node[i]->pt_cnt[0]+=node[i]-> src_coord.Dim()/COORD_DIM;
node[i]->pt_cnt[0]+=node[i]->surf_coord.Dim()/COORD_DIM;
if(node[i]->IsGhost()) node[i]->pt_cnt[0]++; // TODO: temporary fix, pt_cnt not known for ghost nodes
}
if(node[i]->Depth()==0) r_node=node[i];
}
size_t chld_cnt=1UL<<COORD_DIM;
// Bottom-up pass: add each child's pt_cnt[0] into its (non-leaf) parent.
for(int i=MAX_DEPTH;i>=0;i--){
for(size_t j=0;j<node_lst_[i].size();j++){
for(size_t k=0;k<chld_cnt;k++){
FMMNode_t* node=(FMMNode_t*)node_lst_[i][j]->Child(k);
node_lst_[i][j]->pt_cnt[0]+=node->pt_cnt[0];
}
}
}
// Top-down pass: for every non-leaf with a nonzero point count, list all
// of its children (depth-ordered).
for(int i=0;i<=MAX_DEPTH;i++){
for(size_t j=0;j<node_lst_[i].size();j++){
if(node_lst_[i][j]->pt_cnt[0])
for(size_t k=0;k<chld_cnt;k++){
FMMNode_t* node=(FMMNode_t*)node_lst_[i][j]->Child(k);
node_lst.push_back(node);
}
}
}
if(r_node!=NULL) node_lst.push_back(r_node); // Root goes last.
n_list[indx]=node_lst;
}
std::vector<Vector<Real_t>*>& vec_lst=vec_list[indx];
for(size_t i=0;i<node_lst.size();i++){ // Construct vec_lst
FMMNode_t* node=node_lst[i];
Vector<Real_t>& data_vec=node->FMMData()->upward_equiv;
// ReInit with NULL data: set the size now; storage is assigned in the
// packing loop at the end of this function.
data_vec.ReInit(vec_sz,NULL,false);
vec_lst.push_back(&data_vec);
}
}
{// 1. dnward_equiv (same structure as category 0, using pt_cnt[1],
 // target points, and the DC2DE0 matrix row count as vector size)
int indx=1;
size_t vec_sz;
{ // Set vec_sz
Matrix<Real_t>& M_dc2de0 = this->interac_list.ClassMat(0, DC2DE0_Type, 0);
vec_sz=M_dc2de0.Dim(0);
}
std::vector< FMMNode* > node_lst;
{// Construct node_lst
node_lst.clear();
std::vector<std::vector< FMMNode* > > node_lst_(MAX_DEPTH+1);
FMMNode_t* r_node=NULL;
for(size_t i=0;i<node.size();i++){
if(!node[i]->IsLeaf()){
node_lst_[node[i]->Depth()].push_back(node[i]);
}else{
node[i]->pt_cnt[1]+=node[i]->trg_coord.Dim()/COORD_DIM;
}
if(node[i]->Depth()==0) r_node=node[i];
}
size_t chld_cnt=1UL<<COORD_DIM;
// Bottom-up accumulation of pt_cnt[1] into parents.
for(int i=MAX_DEPTH;i>=0;i--){
for(size_t j=0;j<node_lst_[i].size();j++){
for(size_t k=0;k<chld_cnt;k++){
FMMNode_t* node=(FMMNode_t*)node_lst_[i][j]->Child(k);
node_lst_[i][j]->pt_cnt[1]+=node->pt_cnt[1];
}
}
}
// Top-down: children of every non-leaf with targets.
for(int i=0;i<=MAX_DEPTH;i++){
for(size_t j=0;j<node_lst_[i].size();j++){
if(node_lst_[i][j]->pt_cnt[1])
for(size_t k=0;k<chld_cnt;k++){
FMMNode_t* node=(FMMNode_t*)node_lst_[i][j]->Child(k);
node_lst.push_back(node);
}
}
}
if(r_node!=NULL) node_lst.push_back(r_node);
n_list[indx]=node_lst;
}
std::vector<Vector<Real_t>*>& vec_lst=vec_list[indx];
for(size_t i=0;i<node_lst.size();i++){ // Construct vec_lst
FMMNode_t* node=node_lst[i];
Vector<Real_t>& data_vec=node->FMMData()->dnward_equiv;
data_vec.ReInit(vec_sz,NULL,false);
vec_lst.push_back(&data_vec);
}
}
{// 2. upward_equiv_fft: all non-leaf nodes, ordered by depth.
int indx=2;
std::vector< FMMNode* > node_lst;
{
std::vector<std::vector< FMMNode* > > node_lst_(MAX_DEPTH+1);
for(size_t i=0;i<node.size();i++)
if(!node[i]->IsLeaf())
node_lst_[node[i]->Depth()].push_back(node[i]);
for(int i=0;i<=MAX_DEPTH;i++)
for(size_t j=0;j<node_lst_[i].size();j++)
node_lst.push_back(node_lst_[i][j]);
}
n_list[indx]=node_lst;
}
{// 3. dnward_check_fft: non-leaf, non-ghost nodes, ordered by depth.
int indx=3;
std::vector< FMMNode* > node_lst;
{
std::vector<std::vector< FMMNode* > > node_lst_(MAX_DEPTH+1);
for(size_t i=0;i<node.size();i++)
if(!node[i]->IsLeaf() && !node[i]->IsGhost())
node_lst_[node[i]->Depth()].push_back(node[i]);
for(int i=0;i<=MAX_DEPTH;i++)
for(size_t j=0;j<node_lst_[i].size();j++)
node_lst.push_back(node_lst_[i][j]);
}
n_list[indx]=node_lst;
}
{// 4. src_val: source/surface density values on leaf nodes.
int indx=4;
int src_dof=kernel->ker_dim[0];
// Each surface point carries COORD_DIM extra components in addition to the
// src_dof density components -- presumably a normal/orientation vector;
// TODO(review): confirm against the surface-value layout used elsewhere.
int surf_dof=COORD_DIM+src_dof;
std::vector< FMMNode* > node_lst;
for(size_t i=0;i<node.size();i++){// Construct node_lst
if(node[i]->IsLeaf()){
node_lst.push_back(node[i]);
}else{
// Non-leaf nodes carry no source values; release their storage.
node[i]->src_value.ReInit(0);
node[i]->surf_value.ReInit(0);
}
}
n_list[indx]=node_lst;
std::vector<Vector<Real_t>*>& vec_lst=vec_list[indx];
for(size_t i=0;i<node_lst.size();i++){ // Construct vec_lst
FMMNode_t* node=node_lst[i];
{ // src_value
Vector<Real_t>& data_vec=node->src_value;
size_t vec_sz=(node->src_coord.Dim()/COORD_DIM)*src_dof;
if(data_vec.Dim()!=vec_sz) data_vec.ReInit(vec_sz,NULL,false);
vec_lst.push_back(&data_vec);
}
{ // surf_value
Vector<Real_t>& data_vec=node->surf_value;
size_t vec_sz=(node->surf_coord.Dim()/COORD_DIM)*surf_dof;
if(data_vec.Dim()!=vec_sz) data_vec.ReInit(vec_sz,NULL,false);
vec_lst.push_back(&data_vec);
}
}
}
{// 5. trg_val: target potential values on local (non-ghost) leaf nodes.
int indx=5;
int trg_dof=kernel->ker_dim[1];
std::vector< FMMNode* > node_lst;
for(size_t i=0;i<node.size();i++){// Construct node_lst
if(node[i]->IsLeaf() && !node[i]->IsGhost()){
node_lst.push_back(node[i]);
}else{
node[i]->trg_value.ReInit(0);
}
}
n_list[indx]=node_lst;
std::vector<Vector<Real_t>*>& vec_lst=vec_list[indx];
for(size_t i=0;i<node_lst.size();i++){ // Construct vec_lst
FMMNode_t* node=node_lst[i];
{ // trg_value
Vector<Real_t>& data_vec=node->trg_value;
size_t vec_sz=(node->trg_coord.Dim()/COORD_DIM)*trg_dof;
data_vec.ReInit(vec_sz,NULL,false);
vec_lst.push_back(&data_vec);
}
}
}
{// 6. pts_coord: point coordinates on leaf nodes, plus the per-depth
 // check/equivalent surfaces of the tree.
int indx=6;
std::vector< FMMNode* > node_lst;
for(size_t i=0;i<node.size();i++){// Construct node_lst
if(node[i]->IsLeaf()){
node_lst.push_back(node[i]);
}else{
node[i]->src_coord.ReInit(0);
node[i]->surf_coord.ReInit(0);
node[i]->trg_coord.ReInit(0);
}
}
n_list[indx]=node_lst;
std::vector<Vector<Real_t>*>& vec_lst=vec_list[indx];
for(size_t i=0;i<node_lst.size();i++){ // Construct vec_lst
FMMNode_t* node=node_lst[i];
{ // src_coord
Vector<Real_t>& data_vec=node->src_coord;
vec_lst.push_back(&data_vec);
}
{ // surf_coord
Vector<Real_t>& data_vec=node->surf_coord;
vec_lst.push_back(&data_vec);
}
{ // trg_coord
Vector<Real_t>& data_vec=node->trg_coord;
vec_lst.push_back(&data_vec);
}
}
{ // check and equiv surfaces.
// Built lazily (only when empty), for a cube centered at the origin at
// each depth; 6*(m-1)^2+2 points per surface.
if(tree->upwd_check_surf.size()==0){
size_t m=MultipoleOrder();
tree->upwd_check_surf.resize(MAX_DEPTH);
tree->upwd_equiv_surf.resize(MAX_DEPTH);
tree->dnwd_check_surf.resize(MAX_DEPTH);
tree->dnwd_equiv_surf.resize(MAX_DEPTH);
for(size_t depth=0;depth<MAX_DEPTH;depth++){
Real_t c[3]={0.0,0.0,0.0};
tree->upwd_check_surf[depth].ReInit((6*(m-1)*(m-1)+2)*COORD_DIM);
tree->upwd_equiv_surf[depth].ReInit((6*(m-1)*(m-1)+2)*COORD_DIM);
tree->dnwd_check_surf[depth].ReInit((6*(m-1)*(m-1)+2)*COORD_DIM);
tree->dnwd_equiv_surf[depth].ReInit((6*(m-1)*(m-1)+2)*COORD_DIM);
tree->upwd_check_surf[depth]=u_check_surf(m,c,depth);
tree->upwd_equiv_surf[depth]=u_equiv_surf(m,c,depth);
tree->dnwd_check_surf[depth]=d_check_surf(m,c,depth);
tree->dnwd_equiv_surf[depth]=d_equiv_surf(m,c,depth);
}
}
for(size_t depth=0;depth<MAX_DEPTH;depth++){
vec_lst.push_back(&tree->upwd_check_surf[depth]);
vec_lst.push_back(&tree->upwd_equiv_surf[depth]);
vec_lst.push_back(&tree->dnwd_check_surf[depth]);
vec_lst.push_back(&tree->dnwd_equiv_surf[depth]);
}
}
}
// Create extra auxiliary buffer.
if(buff_list.size()<=vec_list.size()) buff_list.resize(vec_list.size()+1);
// Pack every category's vectors end-to-end into buff_list[indx].
for(size_t indx=0;indx<vec_list.size();indx++){ // Resize buffer
Matrix<Real_t>& buff=buff_list[indx];
std::vector<Vector<Real_t>*>& vec_lst= vec_list[indx];
// Categories 4 (src_val) and 6 (pts_coord) hold user data that must be
// preserved across the repack; all others are scratch and may be dropped.
bool keep_data=(indx==4 || indx==6);
size_t n_vec=vec_lst.size();
{ // Continue if nothing to be done.
if(!n_vec) continue;
if(buff.Dim(0)*buff.Dim(1)>0){
// If every non-empty vector already lives inside buff, skip this category.
bool init_buff=false;
Real_t* buff_start=&buff[0][0];
Real_t* buff_end=&buff[0][0]+buff.Dim(0)*buff.Dim(1);
#pragma omp parallel for reduction(||:init_buff)
for(size_t i=0;i<n_vec;i++){
if(vec_lst[i]->Dim() && (&(*vec_lst[i])[0]<buff_start || &(*vec_lst[i])[0]>=buff_end)){
init_buff=true;
}
}
if(!init_buff) continue;
}
}
std::vector<size_t> vec_size(n_vec);
std::vector<size_t> vec_disp(n_vec);
if(n_vec){ // Set vec_size and vec_disp
#pragma omp parallel for
for(size_t i=0;i<n_vec;i++){ // Set vec_size
vec_size[i]=vec_lst[i]->Dim();
}
// Exclusive prefix sum: vec_disp[i] = sum of vec_size[0..i-1].
vec_disp[0]=0;
omp_par::scan(&vec_size[0],&vec_disp[0],n_vec);
}
size_t buff_size=vec_size[n_vec-1]+vec_disp[n_vec-1];
if(!buff_size) continue;
if(keep_data){ // Copy to dev_buffer
// Stage the old contents in dev_buffer before buff is resized, since the
// source vectors may alias the buffer being replaced.
if(dev_buffer.Dim()<buff_size*sizeof(Real_t)){ // Resize dev_buffer
dev_buffer.ReInit(buff_size*sizeof(Real_t)*1.05);
}
#pragma omp parallel for
for(size_t i=0;i<n_vec;i++){
if(&(*vec_lst[i])[0]){
mem::memcopy(((Real_t*)&dev_buffer[0])+vec_disp[i],&(*vec_lst[i])[0],vec_size[i]*sizeof(Real_t));
}
}
}
if(buff.Dim(0)*buff.Dim(1)<buff_size){ // Resize buff
buff.ReInit(1,buff_size*1.05); // 5% slack to reduce future reallocations.
}
if(keep_data){ // Copy to buff (from dev_buffer)
#pragma omp parallel for
for(size_t tid=0;tid<omp_p;tid++){
size_t a=(buff_size*(tid+0))/omp_p;
size_t b=(buff_size*(tid+1))/omp_p;
mem::memcopy(&buff[0][0]+a,((Real_t*)&dev_buffer[0])+a,(b-a)*sizeof(Real_t));
}
}
// Re-point every vector (non-owning) at its slice of the packed buffer.
#pragma omp parallel for
for(size_t i=0;i<n_vec;i++){ // ReInit vectors
vec_lst[i]->ReInit(vec_size[i],&buff[0][0]+vec_disp[i],false);
}
}
}
  1327. template <class FMMNode>
  1328. void FMM_Pts<FMMNode>::SetupPrecomp(SetupData<Real_t>& setup_data, bool device){
  1329. if(setup_data.precomp_data==NULL || setup_data.level>MAX_DEPTH) return;
  1330. Profile::Tic("SetupPrecomp",&this->comm,true,25);
  1331. { // Build precomp_data
  1332. size_t precomp_offset=0;
  1333. int level=setup_data.level;
  1334. Matrix<char>& precomp_data=*setup_data.precomp_data;
  1335. std::vector<Mat_Type>& interac_type_lst=setup_data.interac_type;
  1336. for(size_t type_indx=0; type_indx<interac_type_lst.size(); type_indx++){
  1337. Mat_Type& interac_type=interac_type_lst[type_indx];
  1338. this->PrecompAll(interac_type, level); // Compute matrices.
  1339. precomp_offset=this->mat->CompactData(level, interac_type, precomp_data, precomp_offset);
  1340. }
  1341. }
  1342. Profile::Toc();
  1343. if(device){ // Host2Device
  1344. Profile::Tic("Host2Device",&this->comm,false,25);
  1345. setup_data.precomp_data->AllocDevice(true);
  1346. Profile::Toc();
  1347. }
  1348. }
/// Build the interaction data for one FMM phase: for every interaction type
/// in setup_data, gather source/target interaction pairs, partition them into
/// blocks that fit in the device buffer, compute the input/output permutation
/// tables, and serialize everything into setup_data.interac_data.
///
/// Serialized interac_data layout (all fields size_t, in order):
///   [record offset], data_size, M_dim0, M_dim1, dof,
///   len + interac_blk[], len + interac_cnt[], len + interac_mat[],
///   len + input_perm[], len + output_perm[]
/// EvalList/EvalListGPU parse exactly this layout.
///
/// @param setup_data  In/out: node lists, data matrices and the serialized
///                    interac_data produced here.
/// @param device      When true, also mirror interac_data (and a zeroed
///                    staging buffer) on the device.
template <class FMMNode>
void FMM_Pts<FMMNode>::SetupInterac(SetupData<Real_t>& setup_data, bool device){
int level=setup_data.level;
std::vector<Mat_Type>& interac_type_lst=setup_data.interac_type;
std::vector<void*>& nodes_in =setup_data.nodes_in ;
std::vector<void*>& nodes_out=setup_data.nodes_out;
Matrix<Real_t>& input_data=*setup_data. input_data;
Matrix<Real_t>& output_data=*setup_data.output_data;
std::vector<Vector<Real_t>*>& input_vector=setup_data. input_vector;
std::vector<Vector<Real_t>*>& output_vector=setup_data.output_vector;
size_t n_in =nodes_in .size();
size_t n_out=nodes_out.size();
// Setup precomputed data (only if not already built).
if(setup_data.precomp_data->Dim(0)*setup_data.precomp_data->Dim(1)==0) SetupPrecomp(setup_data,device);
// Build interac_data
Profile::Tic("Interac-Data",&this->comm,true,25);
Matrix<char>& interac_data=setup_data.interac_data;
{ // Build precomp_data, interac_data
std::vector<size_t> interac_mat; // Precomp-matrix offset per relative position.
std::vector<size_t> interac_cnt; // Interaction count per relative position.
std::vector<size_t> interac_blk; // Number of relative positions per block.
std::vector<size_t> input_perm;  // 4 entries per interaction: perm, scal, trg, src.
std::vector<size_t> output_perm; // 4 entries per interaction: perm, scal, src, trg.
size_t dof=0, M_dim0=0, M_dim1=0;
size_t precomp_offset=0;
size_t buff_size=DEVICE_BUFFER_SIZE*1024l*1024l;
if(n_out && n_in) for(size_t type_indx=0; type_indx<interac_type_lst.size(); type_indx++){
Mat_Type& interac_type=interac_type_lst[type_indx];
size_t mat_cnt=this->interac_list.ListCount(interac_type);
Matrix<size_t> precomp_data_offset;
{ // Load precomp_data for interac_type.
// Header written by MatrixStore::CompactData; the offset table that
// follows has one row per matrix: [mat_offset, then per-depth
// (in_perm, in_scal, out_perm, out_scal) offsets].
struct HeaderData{
size_t total_size;
size_t level;
size_t mat_cnt ;
size_t max_depth;
};
Matrix<char>& precomp_data=*setup_data.precomp_data;
char* indx_ptr=precomp_data[0]+precomp_offset;
HeaderData& header=*(HeaderData*)indx_ptr;indx_ptr+=sizeof(HeaderData);
precomp_data_offset.ReInit(header.mat_cnt,(1+(2+2)*header.max_depth), (size_t*)indx_ptr, false);
precomp_offset+=header.total_size;
}
Matrix<FMMNode*> src_interac_list(n_in ,mat_cnt); src_interac_list.SetZero();
Matrix<FMMNode*> trg_interac_list(n_out,mat_cnt); trg_interac_list.SetZero();
{ // Build trg_interac_list
// For each local target node at the right level, copy its per-position
// interaction list.
#pragma omp parallel for
for(size_t i=0;i<n_out;i++){
if(!((FMMNode*)nodes_out[i])->IsGhost() && (level==-1 || ((FMMNode*)nodes_out[i])->Depth()==level)){
Vector<FMMNode*>& lst=((FMMNode*)nodes_out[i])->interac_list[interac_type];
mem::memcopy(&trg_interac_list[i][0], &lst[0], lst.Dim()*sizeof(FMMNode*));
assert(lst.Dim()==mat_cnt);
}
}
}
{ // Build src_interac_list
// Trick: first mark every referenced source node with the sentinel id
// n_in (all threads write the same value), then re-id the nodes that are
// actually in nodes_in. Any node still carrying the sentinel is not in
// nodes_in and its interaction is dropped.
#pragma omp parallel for
for(size_t i=0;i<n_out;i++){
for(size_t j=0;j<mat_cnt;j++)
if(trg_interac_list[i][j]!=NULL){
trg_interac_list[i][j]->node_id=n_in;
}
}
#pragma omp parallel for
for(size_t i=0;i<n_in ;i++) ((FMMNode*)nodes_in [i])->node_id=i;
#pragma omp parallel for
for(size_t i=0;i<n_out;i++){
for(size_t j=0;j<mat_cnt;j++){
if(trg_interac_list[i][j]!=NULL){
if(trg_interac_list[i][j]->node_id==n_in){
trg_interac_list[i][j]=NULL;
}else{
// Record the inverse mapping: source node -> target node.
src_interac_list[trg_interac_list[i][j]->node_id][j]=(FMMNode*)nodes_out[i];
}
}
}
}
}
Matrix<size_t> interac_dsp(n_out,mat_cnt); // Slot index of each interaction within its block.
std::vector<size_t> interac_blk_dsp(1,0);
{ // Determine dof, M_dim0, M_dim1
dof=1;
Matrix<Real_t>& M0 = this->interac_list.ClassMat(level, interac_type_lst[0], 0);
M_dim0=M0.Dim(0); M_dim1=M0.Dim(1);
}
{ // Determine interaction blocks which fit in memory.
size_t vec_size=(M_dim0+M_dim1)*sizeof(Real_t)*dof; // in+out bytes per interaction
for(size_t j=0;j<mat_cnt;j++){// Determine minimum buff_size
size_t vec_cnt=0;
for(size_t i=0;i<n_out;i++){
if(trg_interac_list[i][j]!=NULL) vec_cnt++;
}
if(buff_size<vec_cnt*vec_size)
buff_size=vec_cnt*vec_size;
}
size_t interac_dsp_=0;
for(size_t j=0;j<mat_cnt;j++){
for(size_t i=0;i<n_out;i++){
interac_dsp[i][j]=interac_dsp_;
if(trg_interac_list[i][j]!=NULL) interac_dsp_++;
}
// Close the current block once its accumulated vectors exceed
// buff_size, and rebase the displacements for the next block.
if(interac_dsp_*vec_size>buff_size) // Comment to disable symmetries.
{
interac_blk.push_back(j-interac_blk_dsp.back());
interac_blk_dsp.push_back(j);
size_t offset=interac_dsp[0][j];
for(size_t i=0;i<n_out;i++) interac_dsp[i][j]-=offset;
interac_dsp_-=offset;
assert(interac_dsp_*vec_size<=buff_size); // Problem too big for buff_size.
}
// All positions sharing an interaction class use the class matrix.
interac_mat.push_back(precomp_data_offset[this->interac_list.InteracClass(interac_type,j)][0]);
interac_cnt.push_back(interac_dsp_-interac_dsp[0][j]);
}
interac_blk.push_back(mat_cnt-interac_blk_dsp.back());
interac_blk_dsp.push_back(mat_cnt);
}
{ // Determine input_perm.
size_t vec_size=M_dim0*dof;
for(size_t i=0;i<n_out;i++) ((FMMNode*)nodes_out[i])->node_id=i;
for(size_t k=1;k<interac_blk_dsp.size();k++){
for(size_t i=0;i<n_in ;i++){
for(size_t j=interac_blk_dsp[k-1];j<interac_blk_dsp[k];j++){
FMMNode_t* trg_node=src_interac_list[i][j];
if(trg_node!=NULL && trg_node->node_id<n_out){
// Per-depth scaling applies only for scale-invariant kernels.
size_t depth=(this->ScaleInvar()?trg_node->Depth():0);
input_perm .push_back(precomp_data_offset[j][1+4*depth+0]); // prem
input_perm .push_back(precomp_data_offset[j][1+4*depth+1]); // scal
input_perm .push_back(interac_dsp[trg_node->node_id][j]*vec_size*sizeof(Real_t)); // trg_ptr
input_perm .push_back((size_t)(& input_vector[i][0][0]- input_data[0])); // src_ptr
assert(input_vector[i]->Dim()==vec_size);
}
}
}
}
}
{ // Determine output_perm
size_t vec_size=M_dim1*dof;
for(size_t k=1;k<interac_blk_dsp.size();k++){
for(size_t i=0;i<n_out;i++){
for(size_t j=interac_blk_dsp[k-1];j<interac_blk_dsp[k];j++){
if(trg_interac_list[i][j]!=NULL){
size_t depth=(this->ScaleInvar()?((FMMNode*)nodes_out[i])->Depth():0);
output_perm.push_back(precomp_data_offset[j][1+4*depth+2]); // prem
output_perm.push_back(precomp_data_offset[j][1+4*depth+3]); // scal
output_perm.push_back(interac_dsp[ i ][j]*vec_size*sizeof(Real_t)); // src_ptr
output_perm.push_back((size_t)(&output_vector[i][0][0]-output_data[0])); // trg_ptr
assert(output_vector[i]->Dim()==vec_size);
}
}
}
}
}
}
if(this->dev_buffer.Dim()<buff_size) this->dev_buffer.ReInit(buff_size);
{ // Set interac_data.
// Serialize the vectors computed above, appending after any existing
// record (the first size_t in the buffer is the offset of this record).
size_t data_size=sizeof(size_t)*4;
data_size+=sizeof(size_t)+interac_blk.size()*sizeof(size_t);
data_size+=sizeof(size_t)+interac_cnt.size()*sizeof(size_t);
data_size+=sizeof(size_t)+interac_mat.size()*sizeof(size_t);
data_size+=sizeof(size_t)+ input_perm.size()*sizeof(size_t);
data_size+=sizeof(size_t)+output_perm.size()*sizeof(size_t);
if(interac_data.Dim(0)*interac_data.Dim(1)<sizeof(size_t)){
data_size+=sizeof(size_t);
interac_data.ReInit(1,data_size);
((size_t*)&interac_data[0][0])[0]=sizeof(size_t);
}else{
size_t pts_data_size=*((size_t*)&interac_data[0][0]);
assert(interac_data.Dim(0)*interac_data.Dim(1)>=pts_data_size);
data_size+=pts_data_size;
if(data_size>interac_data.Dim(0)*interac_data.Dim(1)){ //Resize and copy interac_data.
Matrix< char> pts_interac_data=interac_data;
interac_data.ReInit(1,data_size);
mem::memcopy(&interac_data[0][0],&pts_interac_data[0][0],pts_data_size);
}
}
char* data_ptr=&interac_data[0][0];
data_ptr+=((size_t*)data_ptr)[0]; // Skip to the start of this record.
((size_t*)data_ptr)[0]=data_size; data_ptr+=sizeof(size_t);
((size_t*)data_ptr)[0]= M_dim0; data_ptr+=sizeof(size_t);
((size_t*)data_ptr)[0]= M_dim1; data_ptr+=sizeof(size_t);
((size_t*)data_ptr)[0]= dof; data_ptr+=sizeof(size_t);
((size_t*)data_ptr)[0]=interac_blk.size(); data_ptr+=sizeof(size_t);
mem::memcopy(data_ptr, &interac_blk[0], interac_blk.size()*sizeof(size_t));
data_ptr+=interac_blk.size()*sizeof(size_t);
((size_t*)data_ptr)[0]=interac_cnt.size(); data_ptr+=sizeof(size_t);
mem::memcopy(data_ptr, &interac_cnt[0], interac_cnt.size()*sizeof(size_t));
data_ptr+=interac_cnt.size()*sizeof(size_t);
((size_t*)data_ptr)[0]=interac_mat.size(); data_ptr+=sizeof(size_t);
mem::memcopy(data_ptr, &interac_mat[0], interac_mat.size()*sizeof(size_t));
data_ptr+=interac_mat.size()*sizeof(size_t);
((size_t*)data_ptr)[0]= input_perm.size(); data_ptr+=sizeof(size_t);
mem::memcopy(data_ptr, & input_perm[0], input_perm.size()*sizeof(size_t));
data_ptr+= input_perm.size()*sizeof(size_t);
((size_t*)data_ptr)[0]=output_perm.size(); data_ptr+=sizeof(size_t);
mem::memcopy(data_ptr, &output_perm[0], output_perm.size()*sizeof(size_t));
data_ptr+=output_perm.size()*sizeof(size_t);
}
}
Profile::Toc();
if(device){ // Host2Device
Profile::Tic("Host2Device",&this->comm,false,25);
setup_data.interac_data .AllocDevice(true);
// Zero-initialized staging buffer sized to hold the full output matrix.
if(staging_buffer.Dim()<sizeof(Real_t)*output_data.Dim(0)*output_data.Dim(1)){
staging_buffer.ReInit(sizeof(Real_t)*output_data.Dim(0)*output_data.Dim(1));
staging_buffer.SetZero();
staging_buffer.AllocDevice(true);
}
Profile::Toc();
}
}
#if defined(PVFMM_HAVE_CUDA)
#include <fmm_pts_gpu.hpp>
/// GPU evaluation of one interaction list: parse the serialized interac_data
/// (layout written by SetupInterac), then for each interaction block run the
/// input permutation kernel, a cuBLAS GEMM per distinct precomputed matrix,
/// and the output permutation kernel.
///
/// @param setup_data  Interaction data and host/device data matrices.
/// @param dev_buffer  Scratch buffer holding permuted inputs and raw outputs.
/// @param comm        Communicator, used only for profiling here.
/// @tparam SYNC       Nonzero: wait for the CUDA stream before returning.
template <class Real_t, int SYNC>
void EvalListGPU(SetupData<Real_t>& setup_data, Vector<char>& dev_buffer, MPI_Comm& comm) {
cudaStream_t* stream = pvfmm::CUDA_Lock::acquire_stream();
Profile::Tic("Host2Device",&comm,false,25);
typename Matrix<char>::Device interac_data;
typename Vector<char>::Device buff;
typename Matrix<char>::Device precomp_data_d;
typename Matrix<char>::Device interac_data_d;
typename Matrix<Real_t>::Device input_data_d;
typename Matrix<Real_t>::Device output_data_d;
interac_data = setup_data.interac_data;
buff = dev_buffer. AllocDevice(false);
precomp_data_d= setup_data.precomp_data->AllocDevice(false);
interac_data_d= setup_data.interac_data. AllocDevice(false);
input_data_d = setup_data. input_data->AllocDevice(false);
output_data_d = setup_data. output_data->AllocDevice(false);
Profile::Toc();
Profile::Tic("DeviceComp",&comm,false,20);
{ // Offloaded computation.
size_t data_size, M_dim0, M_dim1, dof;
Vector<size_t> interac_blk;
Vector<size_t> interac_cnt;
Vector<size_t> interac_mat;
Vector<size_t> input_perm_d;
Vector<size_t> output_perm_d;
{ // Set interac_data.
// Walk the host copy (data_ptr) to read counts; advance dev_ptr in
// lock-step so the permutation tables can reference DEVICE addresses
// (they are consumed by the GPU kernels below) while the block/cnt/mat
// vectors reference host memory.
char* data_ptr=&interac_data [0][0];
char* dev_ptr=&interac_data_d[0][0];
data_size=((size_t*)data_ptr)[0]; data_ptr+=data_size; dev_ptr += data_size; // Skip to this record.
data_size=((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t); dev_ptr += sizeof(size_t);
M_dim0 =((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t); dev_ptr += sizeof(size_t);
M_dim1 =((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t); dev_ptr += sizeof(size_t);
dof =((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t); dev_ptr += sizeof(size_t);
interac_blk.ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false);
data_ptr += sizeof(size_t) + sizeof(size_t)*interac_blk.Dim();
dev_ptr += sizeof(size_t) + sizeof(size_t)*interac_blk.Dim();
interac_cnt.ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false);
data_ptr += sizeof(size_t) + sizeof(size_t)*interac_cnt.Dim();
dev_ptr += sizeof(size_t) + sizeof(size_t)*interac_cnt.Dim();
interac_mat.ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false);
data_ptr += sizeof(size_t) + sizeof(size_t)*interac_mat.Dim();
dev_ptr += sizeof(size_t) + sizeof(size_t)*interac_mat.Dim();
input_perm_d.ReInit(((size_t*)data_ptr)[0],(size_t*)(dev_ptr+sizeof(size_t)),false); // device addr
data_ptr += sizeof(size_t) + sizeof(size_t)*input_perm_d.Dim();
dev_ptr += sizeof(size_t) + sizeof(size_t)*input_perm_d.Dim();
output_perm_d.ReInit(((size_t*)data_ptr)[0],(size_t*)(dev_ptr+sizeof(size_t)),false); // device addr
data_ptr += sizeof(size_t) + sizeof(size_t)*output_perm_d.Dim();
dev_ptr += sizeof(size_t) + sizeof(size_t)*output_perm_d.Dim();
}
{ // interactions
size_t interac_indx = 0;
size_t interac_blk_dsp = 0;
cudaError_t error; // NOTE(review): declared but never used.
for (size_t k = 0; k < interac_blk.Dim(); k++) {
// Total interactions in this block.
size_t vec_cnt=0;
for(size_t j=interac_blk_dsp;j<interac_blk_dsp+interac_blk[k];j++) vec_cnt+=interac_cnt[j];
if(vec_cnt==0){
//interac_indx += vec_cnt;
interac_blk_dsp += interac_blk[k];
continue;
}
// Scratch layout: [permuted inputs | raw outputs].
char *buff_in_d =&buff[0];
char *buff_out_d =&buff[vec_cnt*dof*M_dim0*sizeof(Real_t)];
{ // Input permutation.
in_perm_gpu<Real_t>(&precomp_data_d[0][0], &input_data_d[0][0], buff_in_d,
&input_perm_d[interac_indx*4], vec_cnt, M_dim0, stream);
}
// One GEMM per run of consecutive interactions sharing the same matrix.
size_t vec_cnt0 = 0;
for (size_t j = interac_blk_dsp; j < interac_blk_dsp + interac_blk[k];) {
size_t vec_cnt1 = 0;
size_t interac_mat0 = interac_mat[j];
for (; j < interac_blk_dsp + interac_blk[k] && interac_mat[j] == interac_mat0; j++) vec_cnt1 += interac_cnt[j];
Matrix<Real_t> M_d(M_dim0, M_dim1, (Real_t*)(precomp_data_d.dev_ptr + interac_mat0), false);
Matrix<Real_t> Ms_d(dof*vec_cnt1, M_dim0, (Real_t*)(buff_in_d + M_dim0*vec_cnt0*dof*sizeof(Real_t)), false);
Matrix<Real_t> Mt_d(dof*vec_cnt1, M_dim1, (Real_t*)(buff_out_d + M_dim1*vec_cnt0*dof*sizeof(Real_t)), false);
Matrix<Real_t>::CUBLASGEMM(Mt_d, Ms_d, M_d);
vec_cnt0 += vec_cnt1;
}
{ // Output permutation.
out_perm_gpu<Real_t>(&precomp_data_d[0][0], &output_data_d[0][0], buff_out_d,
&output_perm_d[interac_indx*4], vec_cnt, M_dim1, stream);
}
interac_indx += vec_cnt;
interac_blk_dsp += interac_blk[k];
}
}
}
Profile::Toc();
if(SYNC) CUDA_Lock::wait();
}
#endif
  1652. template <class FMMNode>
  1653. template <int SYNC>
  1654. void FMM_Pts<FMMNode>::EvalList(SetupData<Real_t>& setup_data, bool device){
  1655. if(setup_data.interac_data.Dim(0)==0 || setup_data.interac_data.Dim(1)==0){
  1656. Profile::Tic("Host2Device",&this->comm,false,25);
  1657. Profile::Toc();
  1658. Profile::Tic("DeviceComp",&this->comm,false,20);
  1659. Profile::Toc();
  1660. return;
  1661. }
  1662. #if defined(PVFMM_HAVE_CUDA)
  1663. if (device) {
  1664. EvalListGPU<Real_t, SYNC>(setup_data, this->dev_buffer, this->comm);
  1665. return;
  1666. }
  1667. #endif
  1668. Profile::Tic("Host2Device",&this->comm,false,25);
  1669. typename Vector<char>::Device buff;
  1670. typename Matrix<char>::Device precomp_data;
  1671. typename Matrix<char>::Device interac_data;
  1672. typename Matrix<Real_t>::Device input_data;
  1673. typename Matrix<Real_t>::Device output_data;
  1674. if(device){
  1675. buff = this-> dev_buffer. AllocDevice(false);
  1676. precomp_data= setup_data.precomp_data->AllocDevice(false);
  1677. interac_data= setup_data.interac_data. AllocDevice(false);
  1678. input_data = setup_data. input_data->AllocDevice(false);
  1679. output_data = setup_data. output_data->AllocDevice(false);
  1680. }else{
  1681. buff = this-> dev_buffer;
  1682. precomp_data=*setup_data.precomp_data;
  1683. interac_data= setup_data.interac_data;
  1684. input_data =*setup_data. input_data;
  1685. output_data =*setup_data. output_data;
  1686. }
  1687. Profile::Toc();
  1688. Profile::Tic("DeviceComp",&this->comm,false,20);
  1689. int lock_idx=-1;
  1690. int wait_lock_idx=-1;
  1691. if(device) wait_lock_idx=MIC_Lock::curr_lock();
  1692. if(device) lock_idx=MIC_Lock::get_lock();
  1693. #ifdef __INTEL_OFFLOAD
  1694. #pragma offload if(device) target(mic:0) signal(&MIC_Lock::lock_vec[device?lock_idx:0])
  1695. #endif
  1696. { // Offloaded computation.
  1697. // Set interac_data.
  1698. size_t data_size, M_dim0, M_dim1, dof;
  1699. Vector<size_t> interac_blk;
  1700. Vector<size_t> interac_cnt;
  1701. Vector<size_t> interac_mat;
  1702. Vector<size_t> input_perm;
  1703. Vector<size_t> output_perm;
  1704. { // Set interac_data.
  1705. char* data_ptr=&interac_data[0][0];
  1706. data_size=((size_t*)data_ptr)[0]; data_ptr+=data_size;
  1707. data_size=((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t);
  1708. M_dim0 =((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t);
  1709. M_dim1 =((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t);
  1710. dof =((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t);
  1711. interac_blk.ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false);
  1712. data_ptr+=sizeof(size_t)+interac_blk.Dim()*sizeof(size_t);
  1713. interac_cnt.ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false);
  1714. data_ptr+=sizeof(size_t)+interac_cnt.Dim()*sizeof(size_t);
  1715. interac_mat.ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false);
  1716. data_ptr+=sizeof(size_t)+interac_mat.Dim()*sizeof(size_t);
  1717. input_perm .ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false);
  1718. data_ptr+=sizeof(size_t)+ input_perm.Dim()*sizeof(size_t);
  1719. output_perm.ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false);
  1720. data_ptr+=sizeof(size_t)+output_perm.Dim()*sizeof(size_t);
  1721. }
  1722. if(device) MIC_Lock::wait_lock(wait_lock_idx);
  1723. //Compute interaction from Chebyshev source density.
  1724. { // interactions
  1725. int omp_p=omp_get_max_threads();
  1726. size_t interac_indx=0;
  1727. size_t interac_blk_dsp=0;
  1728. for(size_t k=0;k<interac_blk.Dim();k++){
  1729. size_t vec_cnt=0;
  1730. for(size_t j=interac_blk_dsp;j<interac_blk_dsp+interac_blk[k];j++) vec_cnt+=interac_cnt[j];
  1731. if(vec_cnt==0){
  1732. //interac_indx += vec_cnt;
  1733. interac_blk_dsp += interac_blk[k];
  1734. continue;
  1735. }
  1736. char* buff_in =&buff[0];
  1737. char* buff_out=&buff[vec_cnt*dof*M_dim0*sizeof(Real_t)];
  1738. // Input permutation.
  1739. #pragma omp parallel for
  1740. for(int tid=0;tid<omp_p;tid++){
  1741. size_t a=( tid *vec_cnt)/omp_p;
  1742. size_t b=((tid+1)*vec_cnt)/omp_p;
  1743. for(size_t i=a;i<b;i++){
  1744. const PERM_INT_T* perm=(PERM_INT_T*)(precomp_data[0]+input_perm[(interac_indx+i)*4+0]);
  1745. const Real_t* scal=( Real_t*)(precomp_data[0]+input_perm[(interac_indx+i)*4+1]);
  1746. const Real_t* v_in =( Real_t*)( input_data[0]+input_perm[(interac_indx+i)*4+3]);
  1747. Real_t* v_out=( Real_t*)( buff_in +input_perm[(interac_indx+i)*4+2]);
  1748. // TODO: Fix for dof>1
  1749. #ifdef __MIC__
  1750. {
  1751. __m512d v8;
  1752. size_t j_start=(((uintptr_t)(v_out ) + (uintptr_t)(MEM_ALIGN-1)) & ~ (uintptr_t)(MEM_ALIGN-1))-((uintptr_t)v_out);
  1753. size_t j_end =(((uintptr_t)(v_out+M_dim0) ) & ~ (uintptr_t)(MEM_ALIGN-1))-((uintptr_t)v_out);
  1754. j_start/=sizeof(Real_t);
  1755. j_end /=sizeof(Real_t);
  1756. assert(((uintptr_t)(v_out))%sizeof(Real_t)==0);
  1757. assert(((uintptr_t)(v_out+j_start))%64==0);
  1758. assert(((uintptr_t)(v_out+j_end ))%64==0);
  1759. size_t j=0;
  1760. for(;j<j_start;j++ ){
  1761. v_out[j]=v_in[perm[j]]*scal[j];
  1762. }
  1763. for(;j<j_end ;j+=8){
  1764. v8=_mm512_setr_pd(
  1765. v_in[perm[j+0]]*scal[j+0],
  1766. v_in[perm[j+1]]*scal[j+1],
  1767. v_in[perm[j+2]]*scal[j+2],
  1768. v_in[perm[j+3]]*scal[j+3],
  1769. v_in[perm[j+4]]*scal[j+4],
  1770. v_in[perm[j+5]]*scal[j+5],
  1771. v_in[perm[j+6]]*scal[j+6],
  1772. v_in[perm[j+7]]*scal[j+7]);
  1773. _mm512_storenrngo_pd(v_out+j,v8);
  1774. }
  1775. for(;j<M_dim0 ;j++ ){
  1776. v_out[j]=v_in[perm[j]]*scal[j];
  1777. }
  1778. }
  1779. #else
  1780. for(size_t j=0;j<M_dim0;j++ ){
  1781. v_out[j]=v_in[perm[j]]*scal[j];
  1782. }
  1783. #endif
  1784. }
  1785. }
  1786. size_t vec_cnt0=0;
  1787. for(size_t j=interac_blk_dsp;j<interac_blk_dsp+interac_blk[k];){
  1788. size_t vec_cnt1=0;
  1789. size_t interac_mat0=interac_mat[j];
  1790. for(;j<interac_blk_dsp+interac_blk[k] && interac_mat[j]==interac_mat0;j++) vec_cnt1+=interac_cnt[j];
  1791. Matrix<Real_t> M(M_dim0, M_dim1, (Real_t*)(precomp_data[0]+interac_mat0), false);
  1792. #ifdef __MIC__
  1793. {
  1794. Matrix<Real_t> Ms(dof*vec_cnt1, M_dim0, (Real_t*)(buff_in +M_dim0*vec_cnt0*dof*sizeof(Real_t)), false);
  1795. Matrix<Real_t> Mt(dof*vec_cnt1, M_dim1, (Real_t*)(buff_out+M_dim1*vec_cnt0*dof*sizeof(Real_t)), false);
  1796. Matrix<Real_t>::GEMM(Mt,Ms,M);
  1797. }
  1798. #else
  1799. #pragma omp parallel for
  1800. for(int tid=0;tid<omp_p;tid++){
  1801. size_t a=(dof*vec_cnt1*(tid ))/omp_p;
  1802. size_t b=(dof*vec_cnt1*(tid+1))/omp_p;
  1803. Matrix<Real_t> Ms(b-a, M_dim0, (Real_t*)(buff_in +M_dim0*vec_cnt0*dof*sizeof(Real_t))+M_dim0*a, false);
  1804. Matrix<Real_t> Mt(b-a, M_dim1, (Real_t*)(buff_out+M_dim1*vec_cnt0*dof*sizeof(Real_t))+M_dim1*a, false);
  1805. Matrix<Real_t>::GEMM(Mt,Ms,M);
  1806. }
  1807. #endif
  1808. vec_cnt0+=vec_cnt1;
  1809. }
  1810. // Output permutation.
  1811. #pragma omp parallel for
  1812. for(int tid=0;tid<omp_p;tid++){
  1813. size_t a=( tid *vec_cnt)/omp_p;
  1814. size_t b=((tid+1)*vec_cnt)/omp_p;
  1815. if(tid> 0 && a<vec_cnt){ // Find 'a' independent of other threads.
  1816. size_t out_ptr=output_perm[(interac_indx+a)*4+3];
  1817. if(tid> 0) while(a<vec_cnt && out_ptr==output_perm[(interac_indx+a)*4+3]) a++;
  1818. }
  1819. if(tid<omp_p-1 && b<vec_cnt){ // Find 'b' independent of other threads.
  1820. size_t out_ptr=output_perm[(interac_indx+b)*4+3];
  1821. if(tid<omp_p-1) while(b<vec_cnt && out_ptr==output_perm[(interac_indx+b)*4+3]) b++;
  1822. }
  1823. for(size_t i=a;i<b;i++){ // Compute permutations.
  1824. const PERM_INT_T* perm=(PERM_INT_T*)(precomp_data[0]+output_perm[(interac_indx+i)*4+0]);
  1825. const Real_t* scal=( Real_t*)(precomp_data[0]+output_perm[(interac_indx+i)*4+1]);
  1826. const Real_t* v_in =( Real_t*)( buff_out +output_perm[(interac_indx+i)*4+2]);
  1827. Real_t* v_out=( Real_t*)( output_data[0]+output_perm[(interac_indx+i)*4+3]);
  1828. // TODO: Fix for dof>1
  1829. #ifdef __MIC__
  1830. {
  1831. __m512d v8;
  1832. __m512d v_old;
  1833. size_t j_start=(((uintptr_t)(v_out ) + (uintptr_t)(MEM_ALIGN-1)) & ~ (uintptr_t)(MEM_ALIGN-1))-((uintptr_t)v_out);
  1834. size_t j_end =(((uintptr_t)(v_out+M_dim1) ) & ~ (uintptr_t)(MEM_ALIGN-1))-((uintptr_t)v_out);
  1835. j_start/=sizeof(Real_t);
  1836. j_end /=sizeof(Real_t);
  1837. assert(((uintptr_t)(v_out))%sizeof(Real_t)==0);
  1838. assert(((uintptr_t)(v_out+j_start))%64==0);
  1839. assert(((uintptr_t)(v_out+j_end ))%64==0);
  1840. size_t j=0;
  1841. for(;j<j_start;j++ ){
  1842. v_out[j]+=v_in[perm[j]]*scal[j];
  1843. }
  1844. for(;j<j_end ;j+=8){
  1845. v_old=_mm512_load_pd(v_out+j);
  1846. v8=_mm512_setr_pd(
  1847. v_in[perm[j+0]]*scal[j+0],
  1848. v_in[perm[j+1]]*scal[j+1],
  1849. v_in[perm[j+2]]*scal[j+2],
  1850. v_in[perm[j+3]]*scal[j+3],
  1851. v_in[perm[j+4]]*scal[j+4],
  1852. v_in[perm[j+5]]*scal[j+5],
  1853. v_in[perm[j+6]]*scal[j+6],
  1854. v_in[perm[j+7]]*scal[j+7]);
  1855. v_old=_mm512_add_pd(v_old, v8);
  1856. _mm512_storenrngo_pd(v_out+j,v_old);
  1857. }
  1858. for(;j<M_dim1 ;j++ ){
  1859. v_out[j]+=v_in[perm[j]]*scal[j];
  1860. }
  1861. }
  1862. #else
  1863. for(size_t j=0;j<M_dim1;j++ ){
  1864. v_out[j]+=v_in[perm[j]]*scal[j];
  1865. }
  1866. #endif
  1867. }
  1868. }
  1869. interac_indx+=vec_cnt;
  1870. interac_blk_dsp+=interac_blk[k];
  1871. }
  1872. }
  1873. if(device) MIC_Lock::release_lock(lock_idx);
  1874. }
  1875. #ifdef __INTEL_OFFLOAD
  1876. if(SYNC){
  1877. #pragma offload if(device) target(mic:0)
  1878. {if(device) MIC_Lock::wait_lock(lock_idx);}
  1879. }
  1880. #endif
  1881. Profile::Toc();
  1882. }
// Build the point-setup data for S2U (source-to-upward/multipole) interactions:
// for each selected leaf node, record where its source/surface points and its
// upward-equivalent output live inside the shared buffers, enumerate its S2U
// interaction list, and pack everything into a ptSetupData blob for PtSetup().
//
// Parameters:
//   setup_data - filled in: level, kernel (k_s2m), node lists, data buffers.
//   tree       - provides upwd_check_surf (upward-check surface coords per depth).
//   buff       - shared matrices: buff[4]=input densities, buff[0]=output
//                (upward equiv), buff[6]=point coordinates.
//   n_list     - candidate node lists: n_list[4]=inputs, n_list[0]=outputs.
//   level      - tree level to process; -1 selects all levels.
//   device     - not used in this setup path (kept for interface uniformity).
template <class FMMNode>
void FMM_Pts<FMMNode>::Source2UpSetup(SetupData<Real_t>& setup_data, FMMTree_t* tree, std::vector<Matrix<Real_t> >& buff, std::vector<Vector<FMMNode_t*> >& n_list, int level, bool device){
if(!this->MultipoleOrder()) return; // multipoles disabled => nothing to set up
{ // Set setup_data
setup_data. level=level;
setup_data.kernel=kernel->k_s2m;
setup_data. input_data=&buff[4];
setup_data.output_data=&buff[0];
setup_data. coord_data=&buff[6];
Vector<FMMNode_t*>& nodes_in =n_list[4];
Vector<FMMNode_t*>& nodes_out=n_list[0];
setup_data.nodes_in .clear();
setup_data.nodes_out.clear();
// Keep only non-ghost leaves at the requested level (any level if level==-1)
// that actually carry source or surface points.
for(size_t i=0;i<nodes_in .Dim();i++) if((nodes_in [i]->Depth()==level || level==-1) && (nodes_in [i]->src_coord.Dim() || nodes_in [i]->surf_coord.Dim()) && nodes_in [i]->IsLeaf() && !nodes_in [i]->IsGhost()) setup_data.nodes_in .push_back(nodes_in [i]);
for(size_t i=0;i<nodes_out.Dim();i++) if((nodes_out[i]->Depth()==level || level==-1) && (nodes_out[i]->src_coord.Dim() || nodes_out[i]->surf_coord.Dim()) && nodes_out[i]->IsLeaf() && !nodes_out[i]->IsGhost()) setup_data.nodes_out.push_back(nodes_out[i]);
}
// Local mirror of the packed-data layout consumed by PtSetup()/EvalListPts().
struct PackedData{
size_t len;          // total element count of the backing matrix
Matrix<Real_t>* ptr; // backing storage (one of the shared buffers)
Vector<size_t> cnt;  // per-node element count
Vector<size_t> dsp;  // per-node offset into *ptr
};
struct InteracData{
Vector<size_t> in_node;     // source-node index for each interaction
Vector<size_t> scal_idx;    // scaling index (source depth) per interaction
Vector<Real_t> coord_shift; // xyz shift applied to source coords per interaction
Vector<size_t> interac_cnt; // number of interactions per target node
Vector<size_t> interac_dsp; // exclusive scan of interac_cnt
Vector<size_t> interac_cst; // interaction cost (not populated here)
Vector<Real_t> scal[4*MAX_DEPTH]; // per-depth kernel scaling factors
Matrix<Real_t> M[4];        // correction matrices; M[2]/M[3] = UC2UE0/UC2UE1
};
struct ptSetupData{
int level;
const Kernel<Real_t>* kernel;
PackedData src_coord; // Src coord
PackedData src_value; // Src density
PackedData srf_coord; // Srf coord
PackedData srf_value; // Srf density
PackedData trg_coord; // Trg coord
PackedData trg_value; // Trg potential
InteracData interac_data;
};
ptSetupData data;
data. level=setup_data. level;
data.kernel=setup_data.kernel;
std::vector<void*>& nodes_in =setup_data.nodes_in ;
std::vector<void*>& nodes_out=setup_data.nodes_out;
{ // Set src data
// Record, for every input node, the offset/length of its source points and
// densities inside the shared coord/input buffers.
std::vector<void*>& nodes=nodes_in;
PackedData& coord=data.src_coord;
PackedData& value=data.src_value;
coord.ptr=setup_data. coord_data;
value.ptr=setup_data. input_data;
coord.len=coord.ptr->Dim(0)*coord.ptr->Dim(1);
value.len=value.ptr->Dim(0)*value.ptr->Dim(1);
coord.cnt.ReInit(nodes.size());
coord.dsp.ReInit(nodes.size());
value.cnt.ReInit(nodes.size());
value.dsp.ReInit(nodes.size());
#pragma omp parallel for
for(size_t i=0;i<nodes.size();i++){
// node_id lets the S2U pass below map an interaction-list node back to its
// position in nodes_in.
((FMMNode_t*)nodes[i])->node_id=i;
Vector<Real_t>& coord_vec=((FMMNode*)nodes[i])->src_coord;
Vector<Real_t>& value_vec=((FMMNode*)nodes[i])->src_value;
if(coord_vec.Dim()){
coord.dsp[i]=&coord_vec[0]-coord.ptr[0][0]; // offset within the shared buffer
assert(coord.dsp[i]<coord.len);
coord.cnt[i]=coord_vec.Dim();
}else{
coord.dsp[i]=0;
coord.cnt[i]=0;
}
if(value_vec.Dim()){
value.dsp[i]=&value_vec[0]-value.ptr[0][0];
assert(value.dsp[i]<value.len);
value.cnt[i]=value_vec.Dim();
}else{
value.dsp[i]=0;
value.cnt[i]=0;
}
}
}
{ // Set srf data
// Same bookkeeping for surface (single-layer) points of the input nodes.
std::vector<void*>& nodes=nodes_in;
PackedData& coord=data.srf_coord;
PackedData& value=data.srf_value;
coord.ptr=setup_data. coord_data;
value.ptr=setup_data. input_data;
coord.len=coord.ptr->Dim(0)*coord.ptr->Dim(1);
value.len=value.ptr->Dim(0)*value.ptr->Dim(1);
coord.cnt.ReInit(nodes.size());
coord.dsp.ReInit(nodes.size());
value.cnt.ReInit(nodes.size());
value.dsp.ReInit(nodes.size());
#pragma omp parallel for
for(size_t i=0;i<nodes.size();i++){
Vector<Real_t>& coord_vec=((FMMNode*)nodes[i])->surf_coord;
Vector<Real_t>& value_vec=((FMMNode*)nodes[i])->surf_value;
if(coord_vec.Dim()){
coord.dsp[i]=&coord_vec[0]-coord.ptr[0][0];
assert(coord.dsp[i]<coord.len);
coord.cnt[i]=coord_vec.Dim();
}else{
coord.dsp[i]=0;
coord.cnt[i]=0;
}
if(value_vec.Dim()){
value.dsp[i]=&value_vec[0]-value.ptr[0][0];
assert(value.dsp[i]<value.len);
value.cnt[i]=value_vec.Dim();
}else{
value.dsp[i]=0;
value.cnt[i]=0;
}
}
}
{ // Set trg data
// Targets: the upward-check surface of each output node (coords from the
// tree, one shared surface per depth) and its upward-equivalent density.
std::vector<void*>& nodes=nodes_out;
PackedData& coord=data.trg_coord;
PackedData& value=data.trg_value;
coord.ptr=setup_data. coord_data;
value.ptr=setup_data.output_data;
coord.len=coord.ptr->Dim(0)*coord.ptr->Dim(1);
value.len=value.ptr->Dim(0)*value.ptr->Dim(1);
coord.cnt.ReInit(nodes.size());
coord.dsp.ReInit(nodes.size());
value.cnt.ReInit(nodes.size());
value.dsp.ReInit(nodes.size());
#pragma omp parallel for
for(size_t i=0;i<nodes.size();i++){
Vector<Real_t>& coord_vec=tree->upwd_check_surf[((FMMNode*)nodes[i])->Depth()];
Vector<Real_t>& value_vec=((FMMData*)((FMMNode*)nodes[i])->FMMData())->upward_equiv;
if(coord_vec.Dim()){
coord.dsp[i]=&coord_vec[0]-coord.ptr[0][0];
assert(coord.dsp[i]<coord.len);
coord.cnt[i]=coord_vec.Dim();
}else{
coord.dsp[i]=0;
coord.cnt[i]=0;
}
if(value_vec.Dim()){
value.dsp[i]=&value_vec[0]-value.ptr[0][0];
assert(value.dsp[i]<value.len);
value.cnt[i]=value_vec.Dim();
}else{
value.dsp[i]=0;
value.cnt[i]=0;
}
}
}
{ // Set interac_data
int omp_p=omp_get_max_threads();
// Per-thread accumulation buffers, merged after the parallel region.
std::vector<std::vector<size_t> > in_node_(omp_p);
std::vector<std::vector<size_t> > scal_idx_(omp_p);
std::vector<std::vector<Real_t> > coord_shift_(omp_p);
std::vector<std::vector<size_t> > interac_cnt_(omp_p);
if(this->ScaleInvar()){ // Set scal
// For scale-invariant kernels, precompute 2^(-exp*depth) scalings for
// every depth: slot l*4+2 from the M2M target scaling, l*4+3 from the
// source scaling.
const Kernel<Real_t>* ker=kernel->k_m2m;
for(size_t l=0;l<MAX_DEPTH;l++){ // scal[l*4+2]
Vector<Real_t>& scal=data.interac_data.scal[l*4+2];
Vector<Real_t>& scal_exp=ker->trg_scal;
scal.ReInit(scal_exp.Dim());
for(size_t i=0;i<scal.Dim();i++){
scal[i]=pvfmm::pow<Real_t>(2.0,-scal_exp[i]*l);
}
}
for(size_t l=0;l<MAX_DEPTH;l++){ // scal[l*4+3]
Vector<Real_t>& scal=data.interac_data.scal[l*4+3];
Vector<Real_t>& scal_exp=ker->src_scal;
scal.ReInit(scal_exp.Dim());
for(size_t i=0;i<scal.Dim();i++){
scal[i]=pvfmm::pow<Real_t>(2.0,-scal_exp[i]*l);
}
}
}
#pragma omp parallel for
for(size_t tid=0;tid<omp_p;tid++){
std::vector<size_t>& in_node    =in_node_[tid]    ;
std::vector<size_t>& scal_idx   =scal_idx_[tid]   ;
std::vector<Real_t>& coord_shift=coord_shift_[tid];
std::vector<size_t>& interac_cnt=interac_cnt_[tid];
// Each thread handles a contiguous slice of the output nodes.
size_t a=(nodes_out.size()*(tid+0))/omp_p;
size_t b=(nodes_out.size()*(tid+1))/omp_p;
for(size_t i=a;i<b;i++){
FMMNode_t* tnode=(FMMNode_t*)nodes_out[i];
Real_t s=pvfmm::pow<Real_t>(0.5,tnode->Depth()); // box size at this depth
size_t interac_cnt_=0; // local counter (shadows the outer vector-of-vectors)
{ // S2U_Type
Mat_Type type=S2U_Type;
Vector<FMMNode_t*>& intlst=tnode->interac_list[type];
for(size_t j=0;j<intlst.Dim();j++) if(intlst[j]){
FMMNode_t* snode=intlst[j];
size_t snode_id=snode->node_id;
// Skip sources that are not part of the selected input set (node_id was
// stamped in the src-data pass above).
if(snode_id>=nodes_in.size() || nodes_in[snode_id]!=snode) continue;
in_node.push_back(snode_id);
scal_idx.push_back(snode->Depth());
{ // set coord_shift
// Shift translating source coordinates into the target box frame
// for this relative position.
const int* rel_coord=interac_list.RelativeCoord(type,j);
const Real_t* scoord=snode->Coord();
const Real_t* tcoord=tnode->Coord(); // (unused here)
Real_t shift[COORD_DIM];
shift[0]=rel_coord[0]*0.5*s-(scoord[0]+0.5*s)+(0+0.5*s);
shift[1]=rel_coord[1]*0.5*s-(scoord[1]+0.5*s)+(0+0.5*s);
shift[2]=rel_coord[2]*0.5*s-(scoord[2]+0.5*s)+(0+0.5*s);
coord_shift.push_back(shift[0]);
coord_shift.push_back(shift[1]);
coord_shift.push_back(shift[2]);
}
interac_cnt_++;
}
}
interac_cnt.push_back(interac_cnt_);
}
}
{ // Combine interac data
// Concatenate the per-thread vectors into the flat interac_data arrays.
// NOTE(review): &vec_[tid][0] on an empty per-thread vector is technically
// UB; the zero-length memcpy makes it benign in practice.
InteracData& interac_data=data.interac_data;
{ // in_node
typedef size_t ElemType;
std::vector<std::vector<ElemType> >& vec_=in_node_;
pvfmm::Vector<ElemType>& vec=interac_data.in_node;
std::vector<size_t> vec_dsp(omp_p+1,0);
for(size_t tid=0;tid<omp_p;tid++){
vec_dsp[tid+1]=vec_dsp[tid]+vec_[tid].size();
}
vec.ReInit(vec_dsp[omp_p]);
#pragma omp parallel for
for(size_t tid=0;tid<omp_p;tid++){
memcpy(&vec[0]+vec_dsp[tid],&vec_[tid][0],vec_[tid].size()*sizeof(ElemType));
}
}
{ // scal_idx
typedef size_t ElemType;
std::vector<std::vector<ElemType> >& vec_=scal_idx_;
pvfmm::Vector<ElemType>& vec=interac_data.scal_idx;
std::vector<size_t> vec_dsp(omp_p+1,0);
for(size_t tid=0;tid<omp_p;tid++){
vec_dsp[tid+1]=vec_dsp[tid]+vec_[tid].size();
}
vec.ReInit(vec_dsp[omp_p]);
#pragma omp parallel for
for(size_t tid=0;tid<omp_p;tid++){
memcpy(&vec[0]+vec_dsp[tid],&vec_[tid][0],vec_[tid].size()*sizeof(ElemType));
}
}
{ // coord_shift
typedef Real_t ElemType;
std::vector<std::vector<ElemType> >& vec_=coord_shift_;
pvfmm::Vector<ElemType>& vec=interac_data.coord_shift;
std::vector<size_t> vec_dsp(omp_p+1,0);
for(size_t tid=0;tid<omp_p;tid++){
vec_dsp[tid+1]=vec_dsp[tid]+vec_[tid].size();
}
vec.ReInit(vec_dsp[omp_p]);
#pragma omp parallel for
for(size_t tid=0;tid<omp_p;tid++){
memcpy(&vec[0]+vec_dsp[tid],&vec_[tid][0],vec_[tid].size()*sizeof(ElemType));
}
}
{ // interac_cnt
typedef size_t ElemType;
std::vector<std::vector<ElemType> >& vec_=interac_cnt_;
pvfmm::Vector<ElemType>& vec=interac_data.interac_cnt;
std::vector<size_t> vec_dsp(omp_p+1,0);
for(size_t tid=0;tid<omp_p;tid++){
vec_dsp[tid+1]=vec_dsp[tid]+vec_[tid].size();
}
vec.ReInit(vec_dsp[omp_p]);
#pragma omp parallel for
for(size_t tid=0;tid<omp_p;tid++){
memcpy(&vec[0]+vec_dsp[tid],&vec_[tid][0],vec_[tid].size()*sizeof(ElemType));
}
}
{ // interac_dsp
// Exclusive prefix sum of the per-node interaction counts.
pvfmm::Vector<size_t>& cnt=interac_data.interac_cnt;
pvfmm::Vector<size_t>& dsp=interac_data.interac_dsp;
dsp.ReInit(cnt.Dim()); if(dsp.Dim()) dsp[0]=0;
omp_par::scan(&cnt[0],&dsp[0],dsp.Dim());
}
}
{ // Set M[2], M[3]
// Load the upward check-to-equivalent correction matrices only if there is
// at least one interaction (last count + last displacement > 0).
InteracData& interac_data=data.interac_data;
pvfmm::Vector<size_t>& cnt=interac_data.interac_cnt;
pvfmm::Vector<size_t>& dsp=interac_data.interac_dsp;
if(cnt.Dim() && cnt[cnt.Dim()-1]+dsp[dsp.Dim()-1]){
data.interac_data.M[2]=this->mat->Mat(level, UC2UE0_Type, 0);
data.interac_data.M[3]=this->mat->Mat(level, UC2UE1_Type, 0);
}else{
data.interac_data.M[2].ReInit(0,0);
data.interac_data.M[3].ReInit(0,0);
}
}
PtSetup(setup_data, &data);
}
  2178. template <class FMMNode>
  2179. void FMM_Pts<FMMNode>::Source2Up(SetupData<Real_t>& setup_data, bool device){
  2180. if(!this->MultipoleOrder()) return;
  2181. //Add Source2Up contribution.
  2182. this->EvalListPts(setup_data, device);
  2183. }
  2184. template <class FMMNode>
  2185. void FMM_Pts<FMMNode>::Up2UpSetup(SetupData<Real_t>& setup_data, FMMTree_t* tree, std::vector<Matrix<Real_t> >& buff, std::vector<Vector<FMMNode_t*> >& n_list, int level, bool device){
  2186. if(!this->MultipoleOrder()) return;
  2187. { // Set setup_data
  2188. setup_data.level=level;
  2189. setup_data.kernel=kernel->k_m2m;
  2190. setup_data.interac_type.resize(1);
  2191. setup_data.interac_type[0]=U2U_Type;
  2192. setup_data. input_data=&buff[0];
  2193. setup_data.output_data=&buff[0];
  2194. Vector<FMMNode_t*>& nodes_in =n_list[0];
  2195. Vector<FMMNode_t*>& nodes_out=n_list[0];
  2196. setup_data.nodes_in .clear();
  2197. setup_data.nodes_out.clear();
  2198. for(size_t i=0;i<nodes_in .Dim();i++) if((nodes_in [i]->Depth()==level+1) && nodes_in [i]->pt_cnt[0]) setup_data.nodes_in .push_back(nodes_in [i]);
  2199. for(size_t i=0;i<nodes_out.Dim();i++) if((nodes_out[i]->Depth()==level ) && nodes_out[i]->pt_cnt[0]) setup_data.nodes_out.push_back(nodes_out[i]);
  2200. }
  2201. std::vector<void*>& nodes_in =setup_data.nodes_in ;
  2202. std::vector<void*>& nodes_out=setup_data.nodes_out;
  2203. std::vector<Vector<Real_t>*>& input_vector=setup_data. input_vector; input_vector.clear();
  2204. std::vector<Vector<Real_t>*>& output_vector=setup_data.output_vector; output_vector.clear();
  2205. for(size_t i=0;i<nodes_in .size();i++) input_vector.push_back(&((FMMData*)((FMMNode*)nodes_in [i])->FMMData())->upward_equiv);
  2206. for(size_t i=0;i<nodes_out.size();i++) output_vector.push_back(&((FMMData*)((FMMNode*)nodes_out[i])->FMMData())->upward_equiv);
  2207. SetupInterac(setup_data,device);
  2208. }
  2209. template <class FMMNode>
  2210. void FMM_Pts<FMMNode>::Up2Up (SetupData<Real_t>& setup_data, bool device){
  2211. if(!this->MultipoleOrder()) return;
  2212. //Add Up2Up contribution.
  2213. EvalList(setup_data, device);
  2214. }
  2215. template <class FMMNode>
  2216. void FMM_Pts<FMMNode>::PeriodicBC(FMMNode* node){
  2217. if(!this->ScaleInvar() || this->MultipoleOrder()==0) return;
  2218. Matrix<Real_t>& M = Precomp(0, BC_Type, 0);
  2219. assert(node->FMMData()->upward_equiv.Dim()>0);
  2220. int dof=1;
  2221. Vector<Real_t>& upward_equiv=node->FMMData()->upward_equiv;
  2222. Vector<Real_t>& dnward_equiv=node->FMMData()->dnward_equiv;
  2223. assert(upward_equiv.Dim()==M.Dim(0)*dof);
  2224. assert(dnward_equiv.Dim()==M.Dim(1)*dof);
  2225. Matrix<Real_t> d_equiv(dof,M.Dim(1),&dnward_equiv[0],false);
  2226. Matrix<Real_t> u_equiv(dof,M.Dim(0),&upward_equiv[0],false);
  2227. Matrix<Real_t>::GEMM(d_equiv,u_equiv,M);
  2228. }
// Scatter each node's eight children's upward-equivalent densities onto a
// regular n1 x n1 x n1 grid (n1=2m), scale them, run a batched real-to-complex
// FFT, and transpose the result so the child index is innermost per frequency.
//
// Parameters:
//   dof         - degrees of freedom per point.
//   m           - multipole order; the equivalent surface has n=6(m-1)^2+2 points.
//   ker_dim0    - kernel source dimension.
//   fft_vec     - per-node offset of the children's data inside input_data.
//   fft_scal    - per-node, per-kernel-dim scaling applied while scattering.
//   input_data  - packed upward-equivalent densities (8 children per node).
//   output_data - receives fftsize_in reals per node (FFT output).
//   buffer_     - scratch; at least fftsize_in reals per thread.
template <class FMMNode>
void FMM_Pts<FMMNode>::FFT_UpEquiv(size_t dof, size_t m, size_t ker_dim0, Vector<size_t>& fft_vec, Vector<Real_t>& fft_scal,
Vector<Real_t>& input_data, Vector<Real_t>& output_data, Vector<Real_t>& buffer_){
size_t n1=m*2; // grid size per dimension
size_t n2=n1*n1;
size_t n3=n1*n2; // real grid points per (child, ker-dim)
size_t n3_=n2*(n1/2+1); // complex outputs of the r2c transform per grid
size_t chld_cnt=1UL<<COORD_DIM; // 8 children in 3-D
size_t fftsize_in =2*n3_*chld_cnt*ker_dim0*dof; // reals per node after FFT
int omp_p=omp_get_max_threads();
//Load permutation map.
size_t n=6*(m-1)*(m-1)+2; // points on the equivalent surface
static Vector<size_t> map;
{ // Build map to reorder upward_equiv
// map[i] = linear grid index of surface point i. Rebuilt lazily when m
// changes. NOTE(review): the static map (and the plan below) are built
// without synchronization — presumably this is first called serially;
// confirm before calling concurrently.
size_t n_old=map.Dim();
if(n_old!=n){
Real_t c[3]={0,0,0};
// surface() presumably returns points spanning [0,2(m-1)]^3 here; the
// rounding below converts them to integer grid coordinates — confirm
// against surface()'s definition.
Vector<Real_t> surf=surface(m, c, (Real_t)(m-1), 0);
map.Resize(surf.Dim()/COORD_DIM);
for(size_t i=0;i<map.Dim();i++)
map[i]=((size_t)(m-1-surf[i*3]+0.5))+((size_t)(m-1-surf[i*3+1]+0.5))*n1+((size_t)(m-1-surf[i*3+2]+0.5))*n2;
}
}
{ // Build FFTW plan.
// Lazily create one batched r2c plan (ker_dim0*chld_cnt transforms of size
// n1^3), using throwaway aligned buffers for planning.
if(!vlist_fft_flag){
int nnn[3]={(int)n1,(int)n1,(int)n1};
void *fftw_in, *fftw_out;
fftw_in = mem::aligned_new<Real_t>( n3 *ker_dim0*chld_cnt);
fftw_out = mem::aligned_new<Real_t>(2*n3_*ker_dim0*chld_cnt);
vlist_fftplan = FFTW_t<Real_t>::fft_plan_many_dft_r2c(COORD_DIM,nnn,ker_dim0*chld_cnt,
(Real_t*)fftw_in, NULL, 1, n3, (typename FFTW_t<Real_t>::cplx*)(fftw_out),NULL, 1, n3_);
mem::aligned_delete<Real_t>((Real_t*)fftw_in );
mem::aligned_delete<Real_t>((Real_t*)fftw_out);
vlist_fft_flag=true;
}
}
{ // Offload section
size_t n_in = fft_vec.Dim(); // number of nodes to transform
#pragma omp parallel for
for(int pid=0; pid<omp_p; pid++){
// Static partition of nodes over threads; each thread owns one scratch
// buffer slice.
size_t node_start=(n_in*(pid ))/omp_p;
size_t node_end =(n_in*(pid+1))/omp_p;
Vector<Real_t> buffer(fftsize_in, &buffer_[fftsize_in*pid], false);
for(size_t node_idx=node_start; node_idx<node_end; node_idx++){
Matrix<Real_t> upward_equiv(chld_cnt,n*ker_dim0*dof,&input_data[0] + fft_vec[node_idx],false);
Vector<Real_t> upward_equiv_fft(fftsize_in, &output_data[fftsize_in *node_idx], false);
upward_equiv_fft.SetZero(); // grid points not on the surface stay zero
// Rearrange upward equivalent data.
// Scatter surface point k of each child/ker-dim onto its grid slot,
// applying the per-node scaling.
for(size_t k=0;k<n;k++){
size_t idx=map[k];
for(int j1=0;j1<dof;j1++)
for(int j0=0;j0<(int)chld_cnt;j0++)
for(int i=0;i<ker_dim0;i++)
upward_equiv_fft[idx+(j0+(i+j1*ker_dim0)*chld_cnt)*n3]=upward_equiv[j0][ker_dim0*(n*j1+k)+i]*fft_scal[ker_dim0*node_idx+i];
}
// Compute FFT.
for(int i=0;i<dof;i++)
FFTW_t<Real_t>::fft_execute_dft_r2c(vlist_fftplan, (Real_t*)&upward_equiv_fft[i* n3 *ker_dim0*chld_cnt],
(typename FFTW_t<Real_t>::cplx*)&buffer [i*2*n3_*ker_dim0*chld_cnt]);
//Compute flops.
#ifndef FFTW3_MKL
double add, mul, fma;
FFTW_t<Real_t>::fftw_flops(vlist_fftplan, &add, &mul, &fma);
#ifndef __INTEL_OFFLOAD0
Profile::Add_FLOP((long long)(add+mul+2*fma));
#endif
#endif
// Transpose from buffer back into the output so that the child index k
// is innermost (contiguous) for each frequency j — presumably the layout
// required by the subsequent Hadamard-product stage.
for(int i=0;i<ker_dim0*dof;i++)
for(size_t j=0;j<n3_;j++)
for(size_t k=0;k<chld_cnt;k++){
upward_equiv_fft[2*(chld_cnt*(n3_*i+j)+k)+0]=buffer[2*(n3_*(chld_cnt*i+k)+j)+0];
upward_equiv_fft[2*(chld_cnt*(n3_*i+j)+k)+1]=buffer[2*(n3_*(chld_cnt*i+k)+j)+1];
}
}
}
}
}
// Inverse of FFT_UpEquiv for the downward pass: de-interleave the per-child
// frequency data, run a batched complex-to-real inverse FFT, then gather the
// surface points off the n1 x n1 x n1 grid (n1=2m), scale, and accumulate into
// the children's downward-equivalent densities.
//
// Parameters:
//   dof         - degrees of freedom per point.
//   m           - multipole order; the surface has n=6(m-1)^2+2 points.
//   ker_dim1    - kernel target dimension.
//   ifft_vec    - per-node offset of the children's output inside output_data.
//   ifft_scal   - per-node, per-kernel-dim scaling applied while gathering.
//   input_data  - fftsize_out reals per node (frequency-domain check data).
//   output_data - downward-equivalent densities, accumulated (+=).
//   buffer_     - scratch; at least 2*fftsize_out reals per thread (asserted).
template <class FMMNode>
void FMM_Pts<FMMNode>::FFT_Check2Equiv(size_t dof, size_t m, size_t ker_dim1, Vector<size_t>& ifft_vec, Vector<Real_t>& ifft_scal,
Vector<Real_t>& input_data, Vector<Real_t>& output_data, Vector<Real_t>& buffer_){
size_t n1=m*2; // grid size per dimension
size_t n2=n1*n1;
size_t n3=n1*n2; // real grid points per (child, ker-dim)
size_t n3_=n2*(n1/2+1); // complex values per grid (c2r input size)
size_t chld_cnt=1UL<<COORD_DIM; // 8 children in 3-D
size_t fftsize_out=2*n3_*dof*ker_dim1*chld_cnt; // reals per node (input side)
int omp_p=omp_get_max_threads();
//Load permutation map.
size_t n=6*(m-1)*(m-1)+2; // points on the check surface
static Vector<size_t> map;
{ // Build map to reorder dnward_check
// map[i] = linear grid index of surface point i (mirrored indexing
// relative to FFT_UpEquiv's map). Rebuilt lazily when m changes.
// NOTE(review): unsynchronized static init — presumably first called
// serially; confirm before calling concurrently.
size_t n_old=map.Dim();
if(n_old!=n){
Real_t c[3]={0,0,0};
Vector<Real_t> surf=surface(m, c, (Real_t)(m-1), 0);
map.Resize(surf.Dim()/COORD_DIM);
for(size_t i=0;i<map.Dim();i++)
map[i]=((size_t)(m*2-0.5-surf[i*3]))+((size_t)(m*2-0.5-surf[i*3+1]))*n1+((size_t)(m*2-0.5-surf[i*3+2]))*n2;
//map;//.AllocDevice(true);
}
}
{ // Build FFTW plan.
// Lazily create one batched c2r plan (ker_dim1*chld_cnt transforms of size
// n1^3), using throwaway aligned buffers for planning.
if(!vlist_ifft_flag){
//Build FFTW plan.
int nnn[3]={(int)n1,(int)n1,(int)n1};
Real_t *fftw_in, *fftw_out;
fftw_in = mem::aligned_new<Real_t>(2*n3_*ker_dim1*chld_cnt);
fftw_out = mem::aligned_new<Real_t>( n3 *ker_dim1*chld_cnt);
vlist_ifftplan = FFTW_t<Real_t>::fft_plan_many_dft_c2r(COORD_DIM,nnn,ker_dim1*chld_cnt,
(typename FFTW_t<Real_t>::cplx*)fftw_in, NULL, 1, n3_, (Real_t*)(fftw_out),NULL, 1, n3);
mem::aligned_delete<Real_t>(fftw_in);
mem::aligned_delete<Real_t>(fftw_out);
vlist_ifft_flag=true;
}
}
{ // Offload section
assert(buffer_.Dim()>=2*fftsize_out*omp_p); // two scratch slices per thread
size_t n_out=ifft_vec.Dim(); // number of nodes to transform
#pragma omp parallel for
for(int pid=0; pid<omp_p; pid++){
// Static partition of nodes over threads; buffer0 holds the
// de-interleaved frequency data, buffer1 the inverse-FFT result.
size_t node_start=(n_out*(pid ))/omp_p;
size_t node_end =(n_out*(pid+1))/omp_p;
Vector<Real_t> buffer0(fftsize_out, &buffer_[fftsize_out*(2*pid+0)], false);
Vector<Real_t> buffer1(fftsize_out, &buffer_[fftsize_out*(2*pid+1)], false);
for(size_t node_idx=node_start; node_idx<node_end; node_idx++){
Vector<Real_t> dnward_check_fft(fftsize_out, &input_data[fftsize_out*node_idx], false);
Vector<Real_t> dnward_equiv(ker_dim1*n*dof*chld_cnt,&output_data[0] + ifft_vec[node_idx],false);
//De-interleave data.
// Undo the child-innermost interleaving produced by the V-list stage so
// each (child, ker-dim) grid is contiguous for the batched c2r FFT.
for(int i=0;i<ker_dim1*dof;i++)
for(size_t j=0;j<n3_;j++)
for(size_t k=0;k<chld_cnt;k++){
buffer0[2*(n3_*(ker_dim1*dof*k+i)+j)+0]=dnward_check_fft[2*(chld_cnt*(n3_*i+j)+k)+0];
buffer0[2*(n3_*(ker_dim1*dof*k+i)+j)+1]=dnward_check_fft[2*(chld_cnt*(n3_*i+j)+k)+1];
}
// Compute FFT.
for(int i=0;i<dof;i++)
FFTW_t<Real_t>::fft_execute_dft_c2r(vlist_ifftplan, (typename FFTW_t<Real_t>::cplx*)&buffer0[i*2*n3_*ker_dim1*chld_cnt],
(Real_t*)&buffer1[i* n3 *ker_dim1*chld_cnt]);
//Compute flops.
#ifndef FFTW3_MKL
double add, mul, fma;
FFTW_t<Real_t>::fftw_flops(vlist_ifftplan, &add, &mul, &fma);
#ifndef __INTEL_OFFLOAD0
Profile::Add_FLOP((long long)(add+mul+2*fma)*dof);
#endif
#endif
// Rearrange downward check data.
// Gather surface point k of each child/ker-dim from the grid, apply the
// per-node scaling, and accumulate into the downward-equivalent vector.
for(size_t k=0;k<n;k++){
size_t idx=map[k];
for(int j1=0;j1<dof;j1++)
for(int j0=0;j0<(int)chld_cnt;j0++)
for(int i=0;i<ker_dim1;i++)
dnward_equiv[ker_dim1*(n*(dof*j0+j1)+k)+i]+=buffer1[idx+(i+(j1+j0*dof)*ker_dim1)*n3]*ifft_scal[ker_dim1*node_idx+i];
}
}
}
}
}
  2387. template<class Real_t>
  2388. inline void matmult_8x8x2(Real_t*& M_, Real_t*& IN0, Real_t*& IN1, Real_t*& OUT0, Real_t*& OUT1){
  2389. // Generic code.
  2390. Real_t out_reg000, out_reg001, out_reg010, out_reg011;
  2391. Real_t out_reg100, out_reg101, out_reg110, out_reg111;
  2392. Real_t in_reg000, in_reg001, in_reg010, in_reg011;
  2393. Real_t in_reg100, in_reg101, in_reg110, in_reg111;
  2394. Real_t m_reg000, m_reg001, m_reg010, m_reg011;
  2395. Real_t m_reg100, m_reg101, m_reg110, m_reg111;
  2396. //#pragma unroll
  2397. for(int i1=0;i1<8;i1+=2){
  2398. Real_t* IN0_=IN0;
  2399. Real_t* IN1_=IN1;
  2400. out_reg000=OUT0[ 0]; out_reg001=OUT0[ 1];
  2401. out_reg010=OUT0[ 2]; out_reg011=OUT0[ 3];
  2402. out_reg100=OUT1[ 0]; out_reg101=OUT1[ 1];
  2403. out_reg110=OUT1[ 2]; out_reg111=OUT1[ 3];
  2404. //#pragma unroll
  2405. for(int i2=0;i2<8;i2+=2){
  2406. m_reg000=M_[ 0]; m_reg001=M_[ 1];
  2407. m_reg010=M_[ 2]; m_reg011=M_[ 3];
  2408. m_reg100=M_[16]; m_reg101=M_[17];
  2409. m_reg110=M_[18]; m_reg111=M_[19];
  2410. in_reg000=IN0_[0]; in_reg001=IN0_[1];
  2411. in_reg010=IN0_[2]; in_reg011=IN0_[3];
  2412. in_reg100=IN1_[0]; in_reg101=IN1_[1];
  2413. in_reg110=IN1_[2]; in_reg111=IN1_[3];
  2414. out_reg000 += m_reg000*in_reg000 - m_reg001*in_reg001;
  2415. out_reg001 += m_reg000*in_reg001 + m_reg001*in_reg000;
  2416. out_reg010 += m_reg010*in_reg000 - m_reg011*in_reg001;
  2417. out_reg011 += m_reg010*in_reg001 + m_reg011*in_reg000;
  2418. out_reg000 += m_reg100*in_reg010 - m_reg101*in_reg011;
  2419. out_reg001 += m_reg100*in_reg011 + m_reg101*in_reg010;
  2420. out_reg010 += m_reg110*in_reg010 - m_reg111*in_reg011;
  2421. out_reg011 += m_reg110*in_reg011 + m_reg111*in_reg010;
  2422. out_reg100 += m_reg000*in_reg100 - m_reg001*in_reg101;
  2423. out_reg101 += m_reg000*in_reg101 + m_reg001*in_reg100;
  2424. out_reg110 += m_reg010*in_reg100 - m_reg011*in_reg101;
  2425. out_reg111 += m_reg010*in_reg101 + m_reg011*in_reg100;
  2426. out_reg100 += m_reg100*in_reg110 - m_reg101*in_reg111;
  2427. out_reg101 += m_reg100*in_reg111 + m_reg101*in_reg110;
  2428. out_reg110 += m_reg110*in_reg110 - m_reg111*in_reg111;
  2429. out_reg111 += m_reg110*in_reg111 + m_reg111*in_reg110;
  2430. M_+=32; // Jump to (column+2).
  2431. IN0_+=4;
  2432. IN1_+=4;
  2433. }
  2434. OUT0[ 0]=out_reg000; OUT0[ 1]=out_reg001;
  2435. OUT0[ 2]=out_reg010; OUT0[ 3]=out_reg011;
  2436. OUT1[ 0]=out_reg100; OUT1[ 1]=out_reg101;
  2437. OUT1[ 2]=out_reg110; OUT1[ 3]=out_reg111;
  2438. M_+=4-64*2; // Jump back to first column (row+2).
  2439. OUT0+=4;
  2440. OUT1+=4;
  2441. }
  2442. }
// Specialization of matmult_8x8x2 for double precision.
// Accumulates OUT += M * IN for TWO right-hand sides at once (IN0/OUT0 and
// IN1/OUT1), where M is an 8x8 block of complex values stored as interleaved
// (re,im) doubles. The caller advances M_ through consecutive 8x8 blocks;
// IN0/IN1/OUT0/OUT1 are assumed suitably aligned for the vector loads below.
2443. #if defined(__AVX__) || defined(__SSE3__)
2444. template<>
2445. inline void matmult_8x8x2<double>(double*& M_, double*& IN0, double*& IN1, double*& OUT0, double*& OUT1){
2446. #ifdef __AVX__ //AVX code.
// Eight accumulators: four 256-bit registers per output stream
// (out00..out30 for OUT0, out01..out31 for OUT1) cover all 16 doubles
// (= 8 complex values) of each output column.
2447. __m256d out00,out01,out10,out11;
2448. __m256d out20,out21,out30,out31;
2449. double* in0__ = IN0;
2450. double* in1__ = IN1;
// Load existing output so the kernel accumulates rather than overwrites.
2451. out00 = _mm256_load_pd(OUT0);
2452. out01 = _mm256_load_pd(OUT1);
2453. out10 = _mm256_load_pd(OUT0+4);
2454. out11 = _mm256_load_pd(OUT1+4);
2455. out20 = _mm256_load_pd(OUT0+8);
2456. out21 = _mm256_load_pd(OUT1+8);
2457. out30 = _mm256_load_pd(OUT0+12);
2458. out31 = _mm256_load_pd(OUT1+12);
// Walk the 8 input complex values two at a time; each iteration consumes
// two columns of M (8 x __m256d = 32 doubles, see M_ += 32 below).
2459. for(int i2=0;i2<8;i2+=2){
2460. __m256d m00;
2461. __m256d ot00;
2462. __m256d mt0,mtt0;
2463. __m256d in00,in00_r,in01,in01_r;
// Broadcast one complex input (re,im) into both lanes; the permute with
// mask 5 swaps re/im within each 128-bit lane, giving the (im,re) order
// needed by the addsub-based complex multiply below.
2464. in00 = _mm256_broadcast_pd((const __m128d*)in0__);
2465. in00_r = _mm256_permute_pd(in00,5);
2466. in01 = _mm256_broadcast_pd((const __m128d*)in1__);
2467. in01_r = _mm256_permute_pd(in01,5);
// unpacklo duplicates the real parts of M, unpackhi the imaginary parts;
// add_pd(acc, addsub(re*in, im*in_swapped)) is the standard interleaved
// complex multiply-accumulate idiom.
2468. m00 = _mm256_load_pd(M_);
2469. mt0 = _mm256_unpacklo_pd(m00,m00);
2470. ot00 = _mm256_mul_pd(mt0,in00);
2471. mtt0 = _mm256_unpackhi_pd(m00,m00);
2472. out00 = _mm256_add_pd(out00,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in00_r)));
2473. ot00 = _mm256_mul_pd(mt0,in01);
2474. out01 = _mm256_add_pd(out01,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in01_r)));
2475. m00 = _mm256_load_pd(M_+4);
2476. mt0 = _mm256_unpacklo_pd(m00,m00);
2477. ot00 = _mm256_mul_pd(mt0,in00);
2478. mtt0 = _mm256_unpackhi_pd(m00,m00);
2479. out10 = _mm256_add_pd(out10,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in00_r)));
2480. ot00 = _mm256_mul_pd(mt0,in01);
2481. out11 = _mm256_add_pd(out11,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in01_r)));
2482. m00 = _mm256_load_pd(M_+8);
2483. mt0 = _mm256_unpacklo_pd(m00,m00);
2484. ot00 = _mm256_mul_pd(mt0,in00);
2485. mtt0 = _mm256_unpackhi_pd(m00,m00);
2486. out20 = _mm256_add_pd(out20,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in00_r)));
2487. ot00 = _mm256_mul_pd(mt0,in01);
2488. out21 = _mm256_add_pd(out21,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in01_r)));
2489. m00 = _mm256_load_pd(M_+12);
2490. mt0 = _mm256_unpacklo_pd(m00,m00);
2491. ot00 = _mm256_mul_pd(mt0,in00);
2492. mtt0 = _mm256_unpackhi_pd(m00,m00);
2493. out30 = _mm256_add_pd(out30,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in00_r)));
2494. ot00 = _mm256_mul_pd(mt0,in01);
2495. out31 = _mm256_add_pd(out31,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in01_r)));
// Second input complex value of this iteration (offset +2 doubles),
// multiplied against the next column of M (offsets +16..+28).
2496. in00 = _mm256_broadcast_pd((const __m128d*) (in0__+2));
2497. in00_r = _mm256_permute_pd(in00,5);
2498. in01 = _mm256_broadcast_pd((const __m128d*) (in1__+2));
2499. in01_r = _mm256_permute_pd(in01,5);
2500. m00 = _mm256_load_pd(M_+16);
2501. mt0 = _mm256_unpacklo_pd(m00,m00);
2502. ot00 = _mm256_mul_pd(mt0,in00);
2503. mtt0 = _mm256_unpackhi_pd(m00,m00);
2504. out00 = _mm256_add_pd(out00,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in00_r)));
2505. ot00 = _mm256_mul_pd(mt0,in01);
2506. out01 = _mm256_add_pd(out01,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in01_r)));
2507. m00 = _mm256_load_pd(M_+20);
2508. mt0 = _mm256_unpacklo_pd(m00,m00);
2509. ot00 = _mm256_mul_pd(mt0,in00);
2510. mtt0 = _mm256_unpackhi_pd(m00,m00);
2511. out10 = _mm256_add_pd(out10,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in00_r)));
2512. ot00 = _mm256_mul_pd(mt0,in01);
2513. out11 = _mm256_add_pd(out11,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in01_r)));
2514. m00 = _mm256_load_pd(M_+24);
2515. mt0 = _mm256_unpacklo_pd(m00,m00);
2516. ot00 = _mm256_mul_pd(mt0,in00);
2517. mtt0 = _mm256_unpackhi_pd(m00,m00);
2518. out20 = _mm256_add_pd(out20,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in00_r)));
2519. ot00 = _mm256_mul_pd(mt0,in01);
2520. out21 = _mm256_add_pd(out21,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in01_r)));
2521. m00 = _mm256_load_pd(M_+28);
2522. mt0 = _mm256_unpacklo_pd(m00,m00);
2523. ot00 = _mm256_mul_pd(mt0,in00);
2524. mtt0 = _mm256_unpackhi_pd(m00,m00);
2525. out30 = _mm256_add_pd(out30,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in00_r)));
2526. ot00 = _mm256_mul_pd(mt0,in01);
2527. out31 = _mm256_add_pd(out31,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in01_r)));
// Advance to the next pair of matrix columns / inputs.
2528. M_ += 32;
2529. in0__ += 4;
2530. in1__ += 4;
2531. }
// Write the accumulated results back.
2532. _mm256_store_pd(OUT0,out00);
2533. _mm256_store_pd(OUT1,out01);
2534. _mm256_store_pd(OUT0+4,out10);
2535. _mm256_store_pd(OUT1+4,out11);
2536. _mm256_store_pd(OUT0+8,out20);
2537. _mm256_store_pd(OUT1+8,out21);
2538. _mm256_store_pd(OUT0+12,out30);
2539. _mm256_store_pd(OUT1+12,out31);
2540. #elif defined __SSE3__ // SSE code.
// SSE3 fallback: processes the 8x8 complex block as 2x2 output tiles.
// Outer loop walks output rows (two at a time); inner loop walks input
// columns. load1 duplicates a single real (then imaginary) matrix entry,
// and addsub after a re/im shuffle completes the complex product.
2541. __m128d out00, out01, out10, out11;
2542. __m128d in00, in01, in10, in11;
2543. __m128d m00, m01, m10, m11;
2544. //#pragma unroll
2545. for(int i1=0;i1<8;i1+=2){
2546. double* IN0_=IN0;
2547. double* IN1_=IN1;
2548. out00 =_mm_load_pd (OUT0 );
2549. out10 =_mm_load_pd (OUT0+2);
2550. out01 =_mm_load_pd (OUT1 );
2551. out11 =_mm_load_pd (OUT1+2);
2552. //#pragma unroll
2553. for(int i2=0;i2<8;i2+=2){
// Real parts of the four matrix entries of this 2x2 tile.
2554. m00 =_mm_load1_pd (M_ );
2555. m10 =_mm_load1_pd (M_+ 2);
2556. m01 =_mm_load1_pd (M_+16);
2557. m11 =_mm_load1_pd (M_+18);
2558. in00 =_mm_load_pd (IN0_ );
2559. in10 =_mm_load_pd (IN0_+2);
2560. in01 =_mm_load_pd (IN1_ );
2561. in11 =_mm_load_pd (IN1_+2);
2562. out00 = _mm_add_pd (out00, _mm_mul_pd(m00 , in00 ));
2563. out00 = _mm_add_pd (out00, _mm_mul_pd(m01 , in10 ));
2564. out01 = _mm_add_pd (out01, _mm_mul_pd(m00 , in01 ));
2565. out01 = _mm_add_pd (out01, _mm_mul_pd(m01 , in11 ));
2566. out10 = _mm_add_pd (out10, _mm_mul_pd(m10 , in00 ));
2567. out10 = _mm_add_pd (out10, _mm_mul_pd(m11 , in10 ));
2568. out11 = _mm_add_pd (out11, _mm_mul_pd(m10 , in01 ));
2569. out11 = _mm_add_pd (out11, _mm_mul_pd(m11 , in11 ));
// Imaginary parts (+1 offsets); inputs are re/im-swapped so addsub
// produces the cross terms of the complex product.
2570. m00 =_mm_load1_pd (M_+ 1);
2571. m10 =_mm_load1_pd (M_+ 2+1);
2572. m01 =_mm_load1_pd (M_+16+1);
2573. m11 =_mm_load1_pd (M_+18+1);
2574. in00 =_mm_shuffle_pd (in00,in00,_MM_SHUFFLE2(0,1));
2575. in01 =_mm_shuffle_pd (in01,in01,_MM_SHUFFLE2(0,1));
2576. in10 =_mm_shuffle_pd (in10,in10,_MM_SHUFFLE2(0,1));
2577. in11 =_mm_shuffle_pd (in11,in11,_MM_SHUFFLE2(0,1));
2578. out00 = _mm_addsub_pd(out00, _mm_mul_pd(m00, in00));
2579. out00 = _mm_addsub_pd(out00, _mm_mul_pd(m01, in10));
2580. out01 = _mm_addsub_pd(out01, _mm_mul_pd(m00, in01));
2581. out01 = _mm_addsub_pd(out01, _mm_mul_pd(m01, in11));
2582. out10 = _mm_addsub_pd(out10, _mm_mul_pd(m10, in00));
2583. out10 = _mm_addsub_pd(out10, _mm_mul_pd(m11, in10));
2584. out11 = _mm_addsub_pd(out11, _mm_mul_pd(m10, in01));
2585. out11 = _mm_addsub_pd(out11, _mm_mul_pd(m11, in11));
2586. M_+=32; // Jump to (column+2).
2587. IN0_+=4;
2588. IN1_+=4;
2589. }
2590. _mm_store_pd (OUT0 ,out00);
2591. _mm_store_pd (OUT0+2,out10);
2592. _mm_store_pd (OUT1 ,out01);
2593. _mm_store_pd (OUT1+2,out11);
2594. M_+=4-64*2; // Jump back to first column (row+2).
2595. OUT0+=4;
2596. OUT1+=4;
2597. }
2598. #endif
2599. }
2600. #endif
// Specialization of matmult_8x8x2 for single precision (SSE3 only).
// Same contract as the double version: OUT += M * IN for two right-hand
// sides, with complex values stored as interleaved (re,im) floats. One
// __m128 holds two complex floats, so four registers per stream cover the
// 16 floats of each output column.
2601. #if defined(__SSE3__)
2602. template<>
2603. inline void matmult_8x8x2<float>(float*& M_, float*& IN0, float*& IN1, float*& OUT0, float*& OUT1){
2604. #if defined __SSE3__ // SSE code.
2605. __m128 out00,out01,out10,out11;
2606. __m128 out20,out21,out30,out31;
2607. float* in0__ = IN0;
2608. float* in1__ = IN1;
// Load current output so the kernel accumulates.
2609. out00 = _mm_load_ps(OUT0);
2610. out01 = _mm_load_ps(OUT1);
2611. out10 = _mm_load_ps(OUT0+4);
2612. out11 = _mm_load_ps(OUT1+4);
2613. out20 = _mm_load_ps(OUT0+8);
2614. out21 = _mm_load_ps(OUT1+8);
2615. out30 = _mm_load_ps(OUT0+12);
2616. out31 = _mm_load_ps(OUT1+12);
// Consume the 8 complex inputs two at a time; each iteration uses two
// columns of M (32 floats, see M_ += 32 below).
2617. for(int i2=0;i2<8;i2+=2){
2618. __m128 m00;
2619. __m128 mt0,mtt0;
2620. __m128 in00,in00_r,in01,in01_r;
// load_pd1 via the double cast broadcasts one complex float pair (64 bits)
// into both halves of the register; the shuffle gives the (im,re) order
// needed by the addsub complex-multiply idiom.
2621. in00 = _mm_castpd_ps(_mm_load_pd1((const double*)in0__));
2622. in00_r = _mm_shuffle_ps(in00,in00,_MM_SHUFFLE(2,3,0,1));
2623. in01 = _mm_castpd_ps(_mm_load_pd1((const double*)in1__));
2624. in01_r = _mm_shuffle_ps(in01,in01,_MM_SHUFFLE(2,3,0,1));
// (2,2,0,0) duplicates real parts of M, (3,3,1,1) the imaginary parts;
// add then addsub accumulates the complex product into each tile.
2625. m00 = _mm_load_ps(M_);
2626. mt0 = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(2,2,0,0));
2627. out00= _mm_add_ps (out00,_mm_mul_ps( mt0,in00 ));
2628. mtt0 = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(3,3,1,1));
2629. out00= _mm_addsub_ps(out00,_mm_mul_ps(mtt0,in00_r));
2630. out01 = _mm_add_ps (out01,_mm_mul_ps( mt0,in01 ));
2631. out01 = _mm_addsub_ps(out01,_mm_mul_ps(mtt0,in01_r));
2632. m00 = _mm_load_ps(M_+4);
2633. mt0 = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(2,2,0,0));
2634. out10= _mm_add_ps (out10,_mm_mul_ps( mt0,in00 ));
2635. mtt0 = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(3,3,1,1));
2636. out10= _mm_addsub_ps(out10,_mm_mul_ps(mtt0,in00_r));
2637. out11 = _mm_add_ps (out11,_mm_mul_ps( mt0,in01 ));
2638. out11 = _mm_addsub_ps(out11,_mm_mul_ps(mtt0,in01_r));
2639. m00 = _mm_load_ps(M_+8);
2640. mt0 = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(2,2,0,0));
2641. out20= _mm_add_ps (out20,_mm_mul_ps( mt0,in00 ));
2642. mtt0 = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(3,3,1,1));
2643. out20= _mm_addsub_ps(out20,_mm_mul_ps(mtt0,in00_r));
2644. out21 = _mm_add_ps (out21,_mm_mul_ps( mt0,in01 ));
2645. out21 = _mm_addsub_ps(out21,_mm_mul_ps(mtt0,in01_r));
2646. m00 = _mm_load_ps(M_+12);
2647. mt0 = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(2,2,0,0));
2648. out30= _mm_add_ps (out30,_mm_mul_ps( mt0, in00));
2649. mtt0 = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(3,3,1,1));
2650. out30= _mm_addsub_ps(out30,_mm_mul_ps(mtt0,in00_r));
2651. out31 = _mm_add_ps (out31,_mm_mul_ps( mt0,in01 ));
2652. out31 = _mm_addsub_ps(out31,_mm_mul_ps(mtt0,in01_r));
// Second input complex value of this iteration, against the next matrix
// column (offsets +16..+28).
2653. in00 = _mm_castpd_ps(_mm_load_pd1((const double*) (in0__+2)));
2654. in00_r = _mm_shuffle_ps(in00,in00,_MM_SHUFFLE(2,3,0,1));
2655. in01 = _mm_castpd_ps(_mm_load_pd1((const double*) (in1__+2)));
2656. in01_r = _mm_shuffle_ps(in01,in01,_MM_SHUFFLE(2,3,0,1));
2657. m00 = _mm_load_ps(M_+16);
2658. mt0 = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(2,2,0,0));
2659. out00= _mm_add_ps (out00,_mm_mul_ps( mt0,in00 ));
2660. mtt0 = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(3,3,1,1));
2661. out00= _mm_addsub_ps(out00,_mm_mul_ps(mtt0,in00_r));
2662. out01 = _mm_add_ps (out01,_mm_mul_ps( mt0,in01 ));
2663. out01 = _mm_addsub_ps(out01,_mm_mul_ps(mtt0,in01_r));
2664. m00 = _mm_load_ps(M_+20);
2665. mt0 = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(2,2,0,0));
2666. out10= _mm_add_ps (out10,_mm_mul_ps( mt0,in00 ));
2667. mtt0 = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(3,3,1,1));
2668. out10= _mm_addsub_ps(out10,_mm_mul_ps(mtt0,in00_r));
2669. out11 = _mm_add_ps (out11,_mm_mul_ps( mt0,in01 ));
2670. out11 = _mm_addsub_ps(out11,_mm_mul_ps(mtt0,in01_r));
2671. m00 = _mm_load_ps(M_+24);
2672. mt0 = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(2,2,0,0));
2673. out20= _mm_add_ps (out20,_mm_mul_ps( mt0,in00 ));
2674. mtt0 = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(3,3,1,1));
2675. out20= _mm_addsub_ps(out20,_mm_mul_ps(mtt0,in00_r));
2676. out21 = _mm_add_ps (out21,_mm_mul_ps( mt0,in01 ));
2677. out21 = _mm_addsub_ps(out21,_mm_mul_ps(mtt0,in01_r));
2678. m00 = _mm_load_ps(M_+28);
2679. mt0 = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(2,2,0,0));
2680. out30= _mm_add_ps (out30,_mm_mul_ps( mt0,in00 ));
2681. mtt0 = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(3,3,1,1));
2682. out30= _mm_addsub_ps(out30,_mm_mul_ps(mtt0,in00_r));
2683. out31 = _mm_add_ps (out31,_mm_mul_ps( mt0,in01 ));
2684. out31 = _mm_addsub_ps(out31,_mm_mul_ps(mtt0,in01_r));
// Advance to the next pair of matrix columns / inputs.
2685. M_ += 32;
2686. in0__ += 4;
2687. in1__ += 4;
2688. }
// Write the accumulated results back.
2689. _mm_store_ps(OUT0,out00);
2690. _mm_store_ps(OUT1,out01);
2691. _mm_store_ps(OUT0+4,out10);
2692. _mm_store_ps(OUT1+4,out11);
2693. _mm_store_ps(OUT0+8,out20);
2694. _mm_store_ps(OUT1+8,out21);
2695. _mm_store_ps(OUT0+12,out30);
2696. _mm_store_ps(OUT1+12,out31);
2697. #endif
2698. }
2699. #endif
// Applies the frequency-domain (Hadamard) stage of the V-list / M2L pass:
// for every interaction pair listed in interac_vec, accumulates
// fft_out += precomp_mat * fft_in one frequency k at a time, using the
// matmult_8x8x2 kernel (8 children x 8 children, 2 = interleaved re/im).
//
// Parameters:
//   dof                  - degrees of freedom per value (used only in the
//                          FLOP accounting below).
//   M_dim                - number of frequencies per child vector.
//   ker_dim0 / ker_dim1  - kernel input/output dimensions.
//   interac_dsp          - prefix-sum displacements delimiting, per
//                          (block, matrix) pair, the range of entries in
//                          interac_vec.
//   interac_vec          - flat list of (input offset, output offset) pairs
//                          into fft_in / fft_out.
//   precomp_mat          - per-interaction-direction precomputed matrices.
//   fft_in / fft_out     - FFT'd upward-equivalent data in, accumulated
//                          downward-check data out.
2700. template <class Real_t>
2701. void VListHadamard(size_t dof, size_t M_dim, size_t ker_dim0, size_t ker_dim1, Vector<size_t>& interac_dsp,
2702. Vector<size_t>& interac_vec, Vector<Real_t*>& precomp_mat, Vector<Real_t>& fft_in, Vector<Real_t>& fft_out){
2703. size_t chld_cnt=1UL<<COORD_DIM;
2704. size_t fftsize_in =M_dim*ker_dim0*chld_cnt*2;
2705. size_t fftsize_out=M_dim*ker_dim1*chld_cnt*2;
// Scratch vectors used to pad the pointer lists below, so the pairwise
// (j, j+1) kernel calls always have a valid second operand.
2706. Real_t* zero_vec0=mem::aligned_new<Real_t>(fftsize_in );
2707. Real_t* zero_vec1=mem::aligned_new<Real_t>(fftsize_out);
2708. size_t n_out=fft_out.Dim()/fftsize_out;
2709. // Set buff_out to zero.
2710. #pragma omp parallel for
2711. for(size_t k=0;k<n_out;k++){
2712. Vector<Real_t> dnward_check_fft(fftsize_out, &fft_out[k*fftsize_out], false);
2713. dnward_check_fft.SetZero();
2714. }
2715. // Build list of interaction pairs (in, out vectors).
2716. size_t mat_cnt=precomp_mat.Dim();
2717. size_t blk1_cnt=interac_dsp.Dim()/mat_cnt;
// V_BLK_SIZE: number of pointer slots reserved per (block, matrix) pair,
// derived from the cache-blocking constant V_BLK_CACHE (64-byte lines).
2718. const size_t V_BLK_SIZE=V_BLK_CACHE*64/sizeof(Real_t);
2719. Real_t** IN_ =mem::aligned_new<Real_t*>(2*V_BLK_SIZE*blk1_cnt*mat_cnt);
2720. Real_t** OUT_=mem::aligned_new<Real_t*>(2*V_BLK_SIZE*blk1_cnt*mat_cnt);
2721. #pragma omp parallel for
2722. for(size_t interac_blk1=0; interac_blk1<blk1_cnt*mat_cnt; interac_blk1++){
2723. size_t interac_dsp0 = (interac_blk1==0?0:interac_dsp[interac_blk1-1]);
2724. size_t interac_dsp1 = interac_dsp[interac_blk1 ] ;
2725. size_t interac_cnt = interac_dsp1-interac_dsp0;
2726. for(size_t j=0;j<interac_cnt;j++){
2727. IN_ [2*V_BLK_SIZE*interac_blk1 +j]=&fft_in [interac_vec[(interac_dsp0+j)*2+0]];
2728. OUT_[2*V_BLK_SIZE*interac_blk1 +j]=&fft_out[interac_vec[(interac_dsp0+j)*2+1]];
2729. }
// Pad with the zero scratch vectors: when interac_cnt is odd the j+1
// access in the pairwise loop below lands here instead of reading junk.
2730. IN_ [2*V_BLK_SIZE*interac_blk1 +interac_cnt]=zero_vec0;
2731. OUT_[2*V_BLK_SIZE*interac_blk1 +interac_cnt]=zero_vec1;
2732. }
2733. int omp_p=omp_get_max_threads();
// Threads partition the frequency range [0, M_dim); loop nest order
// (dims, cache block, frequency, matrix) is chosen for cache reuse of the
// precomputed matrices and in/out vectors.
2734. #pragma omp parallel for
2735. for(int pid=0; pid<omp_p; pid++){
2736. size_t a=( pid *M_dim)/omp_p;
2737. size_t b=((pid+1)*M_dim)/omp_p;
2738. for(int in_dim=0;in_dim<ker_dim0;in_dim++)
2739. for(int ot_dim=0;ot_dim<ker_dim1;ot_dim++)
2740. for(size_t blk1=0; blk1<blk1_cnt; blk1++)
2741. for(size_t k=a; k< b; k++)
2742. for(size_t mat_indx=0; mat_indx< mat_cnt;mat_indx++){
2743. size_t interac_blk1 = blk1*mat_cnt+mat_indx;
2744. size_t interac_dsp0 = (interac_blk1==0?0:interac_dsp[interac_blk1-1]);
2745. size_t interac_dsp1 = interac_dsp[interac_blk1 ] ;
2746. size_t interac_cnt = interac_dsp1-interac_dsp0;
2747. Real_t** IN = IN_ + 2*V_BLK_SIZE*interac_blk1;
2748. Real_t** OUT= OUT_+ 2*V_BLK_SIZE*interac_blk1;
// Matrix block for frequency k and (in_dim, ot_dim) component pair;
// layout must match what matmult_8x8x2 consumes.
2749. Real_t* M = precomp_mat[mat_indx] + k*chld_cnt*chld_cnt*2 + (ot_dim+in_dim*ker_dim1)*M_dim*128;
2750. {
// Process interactions two at a time (the kernel takes two RHS).
2751. for(size_t j=0;j<interac_cnt;j+=2){
2752. Real_t* M_ = M;
2753. Real_t* IN0 = IN [j+0] + (in_dim*M_dim+k)*chld_cnt*2;
2754. Real_t* IN1 = IN [j+1] + (in_dim*M_dim+k)*chld_cnt*2;
2755. Real_t* OUT0 = OUT[j+0] + (ot_dim*M_dim+k)*chld_cnt*2;
2756. Real_t* OUT1 = OUT[j+1] + (ot_dim*M_dim+k)*chld_cnt*2;
2757. #ifdef __SSE__
// Warm the cache for the next pair of interactions; j+3 is safe
// because the lists are padded with zero_vec entries above.
2758. if (j+2 < interac_cnt) { // Prefetch
2759. _mm_prefetch(((char *)(IN[j+2] + (in_dim*M_dim+k)*chld_cnt*2)), _MM_HINT_T0);
2760. _mm_prefetch(((char *)(IN[j+2] + (in_dim*M_dim+k)*chld_cnt*2) + 64), _MM_HINT_T0);
2761. _mm_prefetch(((char *)(IN[j+3] + (in_dim*M_dim+k)*chld_cnt*2)), _MM_HINT_T0);
2762. _mm_prefetch(((char *)(IN[j+3] + (in_dim*M_dim+k)*chld_cnt*2) + 64), _MM_HINT_T0);
2763. _mm_prefetch(((char *)(OUT[j+2] + (ot_dim*M_dim+k)*chld_cnt*2)), _MM_HINT_T0);
2764. _mm_prefetch(((char *)(OUT[j+2] + (ot_dim*M_dim+k)*chld_cnt*2) + 64), _MM_HINT_T0);
2765. _mm_prefetch(((char *)(OUT[j+3] + (ot_dim*M_dim+k)*chld_cnt*2)), _MM_HINT_T0);
2766. _mm_prefetch(((char *)(OUT[j+3] + (ot_dim*M_dim+k)*chld_cnt*2) + 64), _MM_HINT_T0);
2767. }
2768. #endif
2769. matmult_8x8x2(M_, IN0, IN1, OUT0, OUT1);
2770. }
2771. }
2772. }
2773. }
2774. // Compute flops.
2775. {
2776. Profile::Add_FLOP(8*8*8*(interac_vec.Dim()/2)*M_dim*ker_dim0*ker_dim1*dof);
2777. }
2778. // Free memory
2779. mem::aligned_delete<Real_t*>(IN_ );
2780. mem::aligned_delete<Real_t*>(OUT_);
2781. mem::aligned_delete<Real_t>(zero_vec0);
2782. mem::aligned_delete<Real_t>(zero_vec1);
2783. }
// Builds the setup/interaction metadata for the V-list (M2L) pass at the
// given tree level and serializes it into setup_data.interac_data.
// The byte layout written here must stay in exact lockstep with the
// deserialization in V_List() below.
//
// Steps:
//   1. Select input/output nodes at this level and their equivalent-density
//      vectors (children's upward_equiv in, children's dnward_equiv out).
//   2. Partition outputs into n_blk0 blocks sized to fit the device buffer.
//   3. Per block: record FFT/IFFT offsets and level-dependent scaling
//      factors, then build interaction (input, output) offset pairs with
//      displacement counts for cache blocking.
//   4. Flatten everything into one contiguous char buffer.
2784. template <class FMMNode>
2785. void FMM_Pts<FMMNode>::V_ListSetup(SetupData<Real_t>& setup_data, FMMTree_t* tree, std::vector<Matrix<Real_t> >& buff, std::vector<Vector<FMMNode_t*> >& n_list, int level, bool device){
2786. if(!this->MultipoleOrder()) return;
2787. if(level==0) return;
2788. { // Set setup_data
2789. setup_data.level=level;
2790. setup_data.kernel=kernel->k_m2l;
2791. setup_data.interac_type.resize(1);
2792. setup_data.interac_type[0]=V1_Type;
2793. setup_data. input_data=&buff[0];
2794. setup_data.output_data=&buff[1];
2795. Vector<FMMNode_t*>& nodes_in =n_list[2];
2796. Vector<FMMNode_t*>& nodes_out=n_list[3];
2797. setup_data.nodes_in .clear();
2798. setup_data.nodes_out.clear();
// Keep nodes at (level-1): interactions are between parents whose children
// carry the equivalent data (see Child(0) below).
2799. for(size_t i=0;i<nodes_in .Dim();i++) if((nodes_in [i]->Depth()==level-1 || level==-1) && nodes_in [i]->pt_cnt[0]) setup_data.nodes_in .push_back(nodes_in [i]);
2800. for(size_t i=0;i<nodes_out.Dim();i++) if((nodes_out[i]->Depth()==level-1 || level==-1) && nodes_out[i]->pt_cnt[1]) setup_data.nodes_out.push_back(nodes_out[i]);
2801. }
2802. std::vector<void*>& nodes_in =setup_data.nodes_in ;
2803. std::vector<void*>& nodes_out=setup_data.nodes_out;
2804. std::vector<Vector<Real_t>*>& input_vector=setup_data. input_vector; input_vector.clear();
2805. std::vector<Vector<Real_t>*>& output_vector=setup_data.output_vector; output_vector.clear();
// The per-node data lives on the first child: upward equivalent densities
// in, downward equivalent densities out.
2806. for(size_t i=0;i<nodes_in .size();i++) input_vector.push_back(&((FMMData*)((FMMNode*)((FMMNode*)nodes_in [i])->Child(0))->FMMData())->upward_equiv);
2807. for(size_t i=0;i<nodes_out.size();i++) output_vector.push_back(&((FMMData*)((FMMNode*)((FMMNode*)nodes_out[i])->Child(0))->FMMData())->dnward_equiv);
2808. /////////////////////////////////////////////////////////////////////////////
// NOTE(review): eps appears unused in this function — candidate for removal.
2809. Real_t eps=1e-10;
2810. size_t n_in =nodes_in .size();
2811. size_t n_out=nodes_out.size();
2812. // Setup precomputed data.
2813. //if(setup_data.precomp_data->Dim(0)*setup_data.precomp_data->Dim(1)==0) SetupPrecomp(setup_data,device);
2814. // Build interac_data
2815. Profile::Tic("Interac-Data",&this->comm,true,25);
2816. Matrix<char>& interac_data=setup_data.interac_data;
2817. if(n_out>0 && n_in >0){ // Build precomp_data, interac_data
2818. size_t precomp_offset=0;
2819. Mat_Type& interac_type=setup_data.interac_type[0];
2820. size_t mat_cnt=this->interac_list.ListCount(interac_type);
2821. Matrix<size_t> precomp_data_offset;
2822. std::vector<size_t> interac_mat;
2823. std::vector<Real_t*> interac_mat_ptr;
2824. #if 0 // Since we skip SetupPrecomp for V-list
2825. { // Load precomp_data for interac_type.
2826. struct HeaderData{
2827. size_t total_size;
2828. size_t level;
2829. size_t mat_cnt ;
2830. size_t max_depth;
2831. };
2832. Matrix<char>& precomp_data=*setup_data.precomp_data;
2833. char* indx_ptr=precomp_data[0]+precomp_offset;
2834. HeaderData& header=*(HeaderData*)indx_ptr;indx_ptr+=sizeof(HeaderData);
2835. precomp_data_offset.ReInit(header.mat_cnt,1+(2+2)*header.max_depth, (size_t*)indx_ptr, false);
2836. precomp_offset+=header.total_size;
2837. for(size_t mat_id=0;mat_id<mat_cnt;mat_id++){
2838. Matrix<Real_t>& M0 = this->mat->Mat(level, interac_type, mat_id);
2839. assert(M0.Dim(0)>0 && M0.Dim(1)>0); UNUSED(M0);
2840. interac_mat.push_back(precomp_data_offset[mat_id][0]);
2841. }
2842. }
2843. #else
// Active path: collect raw pointers to the precomputed M2L matrices.
2844. {
2845. for(size_t mat_id=0;mat_id<mat_cnt;mat_id++){
2846. Matrix<Real_t>& M = this->mat->Mat(level, interac_type, mat_id);
2847. interac_mat_ptr.push_back(&M[0][0]);
2848. }
2849. }
2850. #endif
2851. size_t dof;
2852. size_t m=MultipoleOrder();
2853. size_t ker_dim0=setup_data.kernel->ker_dim[0];
2854. size_t ker_dim1=setup_data.kernel->ker_dim[1];
2855. size_t fftsize;
// fftsize: real/imag-interleaved spectrum size for the 2m-point 3D FFT of
// all 2^COORD_DIM children (n3_ = n1*n1*(n1/2+1) is the r2c spectrum).
2856. {
2857. size_t n1=m*2;
2858. size_t n2=n1*n1;
2859. size_t n3_=n2*(n1/2+1);
2860. size_t chld_cnt=1UL<<COORD_DIM;
2861. fftsize=2*n3_*chld_cnt;
2862. dof=1;
2863. }
2864. int omp_p=omp_get_max_threads();
2865. size_t buff_size=DEVICE_BUFFER_SIZE*1024l*1024l;
// Choose n_blk0 so one block's FFT data fits in the device buffer.
2866. size_t n_blk0=2*fftsize*dof*(ker_dim0*n_in +ker_dim1*n_out)*sizeof(Real_t)/buff_size;
2867. if(n_blk0==0) n_blk0=1;
2868. std::vector<std::vector<size_t> > fft_vec(n_blk0);
2869. std::vector<std::vector<size_t> > ifft_vec(n_blk0);
2870. std::vector<std::vector<Real_t> > fft_scl(n_blk0);
2871. std::vector<std::vector<Real_t> > ifft_scl(n_blk0);
2872. std::vector<std::vector<size_t> > interac_vec(n_blk0);
2873. std::vector<std::vector<size_t> > interac_dsp(n_blk0);
2874. {
2875. Matrix<Real_t>& input_data=*setup_data. input_data;
2876. Matrix<Real_t>& output_data=*setup_data.output_data;
2877. std::vector<std::vector<FMMNode*> > nodes_blk_in (n_blk0);
2878. std::vector<std::vector<FMMNode*> > nodes_blk_out(n_blk0);
2879. Vector<Real_t> src_scal=this->kernel->k_m2l->src_scal;
2880. Vector<Real_t> trg_scal=this->kernel->k_m2l->trg_scal;
2881. for(size_t i=0;i<n_in;i++) ((FMMNode*)nodes_in[i])->node_id=i;
2882. for(size_t blk0=0;blk0<n_blk0;blk0++){
2883. size_t blk0_start=(n_out* blk0 )/n_blk0;
2884. size_t blk0_end =(n_out*(blk0+1))/n_blk0;
2885. std::vector<FMMNode*>& nodes_in_ =nodes_blk_in [blk0];
2886. std::vector<FMMNode*>& nodes_out_=nodes_blk_out[blk0];
2887. { // Build node list for blk0.
// Inputs of a block = union of interaction-list sources of its outputs.
2888. std::set<void*> nodes_in;
2889. for(size_t i=blk0_start;i<blk0_end;i++){
2890. nodes_out_.push_back((FMMNode*)nodes_out[i]);
2891. Vector<FMMNode*>& lst=((FMMNode*)nodes_out[i])->interac_list[interac_type];
2892. for(size_t k=0;k<mat_cnt;k++) if(lst[k]!=NULL && lst[k]->pt_cnt[0]) nodes_in.insert(lst[k]);
2893. }
2894. for(std::set<void*>::iterator node=nodes_in.begin(); node != nodes_in.end(); node++){
2895. nodes_in_.push_back((FMMNode*)*node);
2896. }
// Grow buff_size if this block needs more than the initial estimate.
2897. size_t input_dim=nodes_in_ .size()*ker_dim0*dof*fftsize;
2898. size_t output_dim=nodes_out_.size()*ker_dim1*dof*fftsize;
2899. size_t buffer_dim=2*(ker_dim0+ker_dim1)*dof*fftsize*omp_p;
2900. if(buff_size<(input_dim + output_dim + buffer_dim)*sizeof(Real_t))
2901. buff_size=(input_dim + output_dim + buffer_dim)*sizeof(Real_t);
2902. }
2903. { // Set fft vectors.
// Offsets (in Real_t) of each node's data inside the input/output matrices.
2904. for(size_t i=0;i<nodes_in_ .size();i++) fft_vec[blk0].push_back((size_t)(& input_vector[nodes_in_[i]->node_id][0][0]- input_data[0]));
2905. for(size_t i=0;i<nodes_out_.size();i++)ifft_vec[blk0].push_back((size_t)(&output_vector[blk0_start + i ][0][0]-output_data[0]));
2906. size_t scal_dim0=src_scal.Dim();
2907. size_t scal_dim1=trg_scal.Dim();
2908. fft_scl [blk0].resize(nodes_in_ .size()*scal_dim0);
2909. ifft_scl[blk0].resize(nodes_out_.size()*scal_dim1);
// Level-dependent kernel scaling: 2^(scal*depth) per node and component.
2910. for(size_t i=0;i<nodes_in_ .size();i++){
2911. size_t depth=nodes_in_[i]->Depth()+1;
2912. for(size_t j=0;j<scal_dim0;j++){
2913. fft_scl[blk0][i*scal_dim0+j]=pvfmm::pow<Real_t>(2.0, src_scal[j]*depth);
2914. }
2915. }
2916. for(size_t i=0;i<nodes_out_.size();i++){
2917. size_t depth=nodes_out_[i]->Depth()+1;
2918. for(size_t j=0;j<scal_dim1;j++){
2919. ifft_scl[blk0][i*scal_dim1+j]=pvfmm::pow<Real_t>(2.0, trg_scal[j]*depth);
2920. }
2921. }
2922. }
2923. }
2924. for(size_t blk0=0;blk0<n_blk0;blk0++){ // Hadamard interactions.
2925. std::vector<FMMNode*>& nodes_in_ =nodes_blk_in [blk0];
2926. std::vector<FMMNode*>& nodes_out_=nodes_blk_out[blk0];
// Re-index inputs within this block (node_id becomes block-local).
2927. for(size_t i=0;i<nodes_in_.size();i++) nodes_in_[i]->node_id=i;
2928. { // Next blocking level.
2929. size_t n_blk1=nodes_out_.size()*(2)*sizeof(Real_t)/(64*V_BLK_CACHE);
2930. if(n_blk1==0) n_blk1=1;
2931. size_t interac_dsp_=0;
2932. for(size_t blk1=0;blk1<n_blk1;blk1++){
2933. size_t blk1_start=(nodes_out_.size()* blk1 )/n_blk1;
2934. size_t blk1_end =(nodes_out_.size()*(blk1+1))/n_blk1;
2935. for(size_t k=0;k<mat_cnt;k++){
2936. for(size_t i=blk1_start;i<blk1_end;i++){
2937. Vector<FMMNode*>& lst=((FMMNode*)nodes_out_[i])->interac_list[interac_type];
2938. if(lst[k]!=NULL && lst[k]->pt_cnt[0]){
// Record (input offset, output offset) pair for this interaction.
2939. interac_vec[blk0].push_back(lst[k]->node_id*fftsize*ker_dim0*dof);
2940. interac_vec[blk0].push_back( i *fftsize*ker_dim1*dof);
2941. interac_dsp_++;
2942. }
2943. }
// One displacement per (blk1, k) — the running prefix sum consumed by
// VListHadamard.
2944. interac_dsp[blk0].push_back(interac_dsp_);
2945. }
2946. }
2947. }
2948. }
2949. }
2950. { // Set interac_data.
// Serialize: header scalars, interac_mat (sizes/offsets), interac_mat_ptr,
// then per block the six arrays, each preceded by its length.
// This layout is mirrored exactly by the reader in V_List().
2951. size_t data_size=sizeof(size_t)*6; // buff_size, m, dof, ker_dim0, ker_dim1, n_blk0
2952. for(size_t blk0=0;blk0<n_blk0;blk0++){
2953. data_size+=sizeof(size_t)+ fft_vec[blk0].size()*sizeof(size_t);
2954. data_size+=sizeof(size_t)+ ifft_vec[blk0].size()*sizeof(size_t);
2955. data_size+=sizeof(size_t)+ fft_scl[blk0].size()*sizeof(Real_t);
2956. data_size+=sizeof(size_t)+ ifft_scl[blk0].size()*sizeof(Real_t);
2957. data_size+=sizeof(size_t)+interac_vec[blk0].size()*sizeof(size_t);
2958. data_size+=sizeof(size_t)+interac_dsp[blk0].size()*sizeof(size_t);
2959. }
2960. data_size+=sizeof(size_t)+interac_mat.size()*sizeof(size_t);
2961. data_size+=sizeof(size_t)+interac_mat_ptr.size()*sizeof(Real_t*);
2962. if(data_size>interac_data.Dim(0)*interac_data.Dim(1))
2963. interac_data.ReInit(1,data_size);
2964. char* data_ptr=&interac_data[0][0];
2965. ((size_t*)data_ptr)[0]=buff_size; data_ptr+=sizeof(size_t);
2966. ((size_t*)data_ptr)[0]= m; data_ptr+=sizeof(size_t);
2967. ((size_t*)data_ptr)[0]= dof; data_ptr+=sizeof(size_t);
2968. ((size_t*)data_ptr)[0]= ker_dim0; data_ptr+=sizeof(size_t);
2969. ((size_t*)data_ptr)[0]= ker_dim1; data_ptr+=sizeof(size_t);
2970. ((size_t*)data_ptr)[0]= n_blk0; data_ptr+=sizeof(size_t);
2971. ((size_t*)data_ptr)[0]= interac_mat.size(); data_ptr+=sizeof(size_t);
2972. mem::memcopy(data_ptr, &interac_mat[0], interac_mat.size()*sizeof(size_t));
2973. data_ptr+=interac_mat.size()*sizeof(size_t);
2974. ((size_t*)data_ptr)[0]= interac_mat_ptr.size(); data_ptr+=sizeof(size_t);
2975. mem::memcopy(data_ptr, &interac_mat_ptr[0], interac_mat_ptr.size()*sizeof(Real_t*));
2976. data_ptr+=interac_mat_ptr.size()*sizeof(Real_t*);
2977. for(size_t blk0=0;blk0<n_blk0;blk0++){
2978. ((size_t*)data_ptr)[0]= fft_vec[blk0].size(); data_ptr+=sizeof(size_t);
2979. mem::memcopy(data_ptr, & fft_vec[blk0][0], fft_vec[blk0].size()*sizeof(size_t));
2980. data_ptr+= fft_vec[blk0].size()*sizeof(size_t);
2981. ((size_t*)data_ptr)[0]=ifft_vec[blk0].size(); data_ptr+=sizeof(size_t);
2982. mem::memcopy(data_ptr, &ifft_vec[blk0][0], ifft_vec[blk0].size()*sizeof(size_t));
2983. data_ptr+=ifft_vec[blk0].size()*sizeof(size_t);
2984. ((size_t*)data_ptr)[0]= fft_scl[blk0].size(); data_ptr+=sizeof(size_t);
2985. mem::memcopy(data_ptr, & fft_scl[blk0][0], fft_scl[blk0].size()*sizeof(Real_t));
2986. data_ptr+= fft_scl[blk0].size()*sizeof(Real_t);
2987. ((size_t*)data_ptr)[0]=ifft_scl[blk0].size(); data_ptr+=sizeof(size_t);
2988. mem::memcopy(data_ptr, &ifft_scl[blk0][0], ifft_scl[blk0].size()*sizeof(Real_t));
2989. data_ptr+=ifft_scl[blk0].size()*sizeof(Real_t);
2990. ((size_t*)data_ptr)[0]=interac_vec[blk0].size(); data_ptr+=sizeof(size_t);
2991. mem::memcopy(data_ptr, &interac_vec[blk0][0], interac_vec[blk0].size()*sizeof(size_t));
2992. data_ptr+=interac_vec[blk0].size()*sizeof(size_t);
2993. ((size_t*)data_ptr)[0]=interac_dsp[blk0].size(); data_ptr+=sizeof(size_t);
2994. mem::memcopy(data_ptr, &interac_dsp[blk0][0], interac_dsp[blk0].size()*sizeof(size_t));
2995. data_ptr+=interac_dsp[blk0].size()*sizeof(size_t);
2996. }
2997. }
2998. }
2999. Profile::Toc();
3000. if(device){ // Host2Device
3001. Profile::Tic("Host2Device",&this->comm,false,25);
3002. setup_data.interac_data. AllocDevice(true);
3003. Profile::Toc();
3004. }
3005. }
// Executes the V-list (M2L) pass using the metadata serialized by
// V_ListSetup(): for each block, (1) FFT the upward-equivalent densities
// into fft_in, (2) apply the precomputed frequency-domain matrices via
// VListHadamard, (3) inverse-FFT the result into the downward-equivalent
// densities. The deserialization here must mirror V_ListSetup() exactly.
// device==true is not supported yet (asserted below).
3006. template <class FMMNode>
3007. void FMM_Pts<FMMNode>::V_List (SetupData<Real_t>& setup_data, bool device){
3008. if(!this->MultipoleOrder()) return;
3009. assert(!device); //Can not run on accelerator yet.
3010. int np;
3011. MPI_Comm_size(comm,&np);
// Nothing to do at this level; still emit the profiling markers so all
// MPI ranks stay in step with ranks that do have work.
3012. if(setup_data.interac_data.Dim(0)==0 || setup_data.interac_data.Dim(1)==0){
3013. if(np>1) Profile::Tic("Host2Device",&this->comm,false,25);
3014. if(np>1) Profile::Toc();
3015. return;
3016. }
3017. Profile::Tic("Host2Device",&this->comm,false,25);
3018. int level=setup_data.level;
// First size_t of the serialized buffer is the required scratch size.
3019. size_t buff_size=*((size_t*)&setup_data.interac_data[0][0]);
3020. typename Vector<char>::Device buff;
3021. //typename Matrix<char>::Device precomp_data;
3022. typename Matrix<char>::Device interac_data;
3023. typename Matrix<Real_t>::Device input_data;
3024. typename Matrix<Real_t>::Device output_data;
3025. if(device){
3026. if(this->dev_buffer.Dim()<buff_size) this->dev_buffer.ReInit(buff_size);
3027. buff = this-> dev_buffer. AllocDevice(false);
3028. //precomp_data= setup_data.precomp_data->AllocDevice(false);
3029. interac_data= setup_data.interac_data. AllocDevice(false);
3030. input_data = setup_data. input_data->AllocDevice(false);
3031. output_data = setup_data. output_data->AllocDevice(false);
3032. }else{
3033. if(this->dev_buffer.Dim()<buff_size) this->dev_buffer.ReInit(buff_size);
3034. buff = this-> dev_buffer;
3035. //precomp_data=*setup_data.precomp_data;
3036. interac_data= setup_data.interac_data;
3037. input_data =*setup_data. input_data;
3038. output_data =*setup_data. output_data;
3039. }
3040. Profile::Toc();
3041. { // Offloaded computation.
3042. // Set interac_data.
3043. size_t m, dof, ker_dim0, ker_dim1, n_blk0;
3044. std::vector<Vector<size_t> > fft_vec;
3045. std::vector<Vector<size_t> > ifft_vec;
3046. std::vector<Vector<Real_t> > fft_scl;
3047. std::vector<Vector<Real_t> > ifft_scl;
3048. std::vector<Vector<size_t> > interac_vec;
3049. std::vector<Vector<size_t> > interac_dsp;
3050. Vector<Real_t*> precomp_mat;
3051. { // Set interac_data.
// Walk the serialized buffer in the exact order V_ListSetup() wrote it;
// all Vector::ReInit(...,false) calls alias the buffer without copying.
3052. char* data_ptr=&interac_data[0][0];
3053. buff_size=((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t);
3054. m =((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t);
3055. dof =((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t);
3056. ker_dim0 =((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t);
3057. ker_dim1 =((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t);
3058. n_blk0 =((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t);
3059. fft_vec .resize(n_blk0);
3060. ifft_vec.resize(n_blk0);
3061. fft_scl .resize(n_blk0);
3062. ifft_scl.resize(n_blk0);
3063. interac_vec.resize(n_blk0);
3064. interac_dsp.resize(n_blk0);
3065. Vector<size_t> interac_mat;
3066. interac_mat.ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false);
3067. data_ptr+=sizeof(size_t)+interac_mat.Dim()*sizeof(size_t);
3068. Vector<Real_t*> interac_mat_ptr;
3069. interac_mat_ptr.ReInit(((size_t*)data_ptr)[0],(Real_t**)(data_ptr+sizeof(size_t)),false);
3070. data_ptr+=sizeof(size_t)+interac_mat_ptr.Dim()*sizeof(Real_t*);
3071. #if 0 // Since we skip SetupPrecomp for V-list
3072. precomp_mat.Resize(interac_mat.Dim());
3073. for(size_t i=0;i<interac_mat.Dim();i++){
3074. precomp_mat[i]=(Real_t*)(precomp_data[0]+interac_mat[i]);
3075. }
3076. #else
// Active path: matrices are addressed directly by pointer.
3077. precomp_mat.Resize(interac_mat_ptr.Dim());
3078. for(size_t i=0;i<interac_mat_ptr.Dim();i++){
3079. precomp_mat[i]=interac_mat_ptr[i];
3080. }
3081. #endif
3082. for(size_t blk0=0;blk0<n_blk0;blk0++){
3083. fft_vec[blk0].ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false);
3084. data_ptr+=sizeof(size_t)+fft_vec[blk0].Dim()*sizeof(size_t);
3085. ifft_vec[blk0].ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false);
3086. data_ptr+=sizeof(size_t)+ifft_vec[blk0].Dim()*sizeof(size_t);
3087. fft_scl[blk0].ReInit(((size_t*)data_ptr)[0],(Real_t*)(data_ptr+sizeof(size_t)),false);
3088. data_ptr+=sizeof(size_t)+fft_scl[blk0].Dim()*sizeof(Real_t);
3089. ifft_scl[blk0].ReInit(((size_t*)data_ptr)[0],(Real_t*)(data_ptr+sizeof(size_t)),false);
3090. data_ptr+=sizeof(size_t)+ifft_scl[blk0].Dim()*sizeof(Real_t);
3091. interac_vec[blk0].ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false);
3092. data_ptr+=sizeof(size_t)+interac_vec[blk0].Dim()*sizeof(size_t);
3093. interac_dsp[blk0].ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false);
3094. data_ptr+=sizeof(size_t)+interac_dsp[blk0].Dim()*sizeof(size_t);
3095. }
3096. }
3097. int omp_p=omp_get_max_threads();
// Recompute FFT sizes from m (must match the formulas in V_ListSetup).
3098. size_t M_dim, fftsize;
3099. {
3100. size_t n1=m*2;
3101. size_t n2=n1*n1;
3102. size_t n3_=n2*(n1/2+1);
3103. size_t chld_cnt=1UL<<COORD_DIM;
3104. fftsize=2*n3_*chld_cnt;
3105. M_dim=n3_;
3106. }
3107. for(size_t blk0=0;blk0<n_blk0;blk0++){ // interactions
3108. size_t n_in = fft_vec[blk0].Dim();
3109. size_t n_out=ifft_vec[blk0].Dim();
3110. size_t input_dim=n_in *ker_dim0*dof*fftsize;
3111. size_t output_dim=n_out*ker_dim1*dof*fftsize;
3112. size_t buffer_dim=2*(ker_dim0+ker_dim1)*dof*fftsize*omp_p;
// Carve fft_in / fft_out / per-thread buffer out of the scratch buffer.
3113. Vector<Real_t> fft_in ( input_dim, (Real_t*)&buff[ 0 ],false);
3114. Vector<Real_t> fft_out(output_dim, (Real_t*)&buff[ input_dim *sizeof(Real_t)],false);
3115. Vector<Real_t> buffer(buffer_dim, (Real_t*)&buff[(input_dim+output_dim)*sizeof(Real_t)],false);
3116. { // FFT
3117. if(np==1) Profile::Tic("FFT",&comm,false,100);
3118. Vector<Real_t> input_data_( input_data.dim[0]* input_data.dim[1], input_data[0], false);
3119. FFT_UpEquiv(dof, m, ker_dim0, fft_vec[blk0], fft_scl[blk0], input_data_, fft_in, buffer);
3120. if(np==1) Profile::Toc();
3121. }
3122. { // Hadamard
3123. #ifdef PVFMM_HAVE_PAPI
3124. #ifdef __VERBOSE__
3125. std::cout << "Starting counters new\n";
3126. if (PAPI_start(EventSet) != PAPI_OK) std::cout << "handle_error3" << std::endl;
3127. #endif
3128. #endif
3129. if(np==1) Profile::Tic("HadamardProduct",&comm,false,100);
3130. VListHadamard<Real_t>(dof, M_dim, ker_dim0, ker_dim1, interac_dsp[blk0], interac_vec[blk0], precomp_mat, fft_in, fft_out);
3131. if(np==1) Profile::Toc();
3132. #ifdef PVFMM_HAVE_PAPI
3133. #ifdef __VERBOSE__
3134. if (PAPI_stop(EventSet, values) != PAPI_OK) std::cout << "handle_error4" << std::endl;
3135. std::cout << "Stopping counters\n";
3136. #endif
3137. #endif
3138. }
3139. { // IFFT
3140. if(np==1) Profile::Tic("IFFT",&comm,false,100);
3141. Vector<Real_t> output_data_(output_data.dim[0]*output_data.dim[1], output_data[0], false);
3142. FFT_Check2Equiv(dof, m, ker_dim1, ifft_vec[blk0], ifft_scl[blk0], fft_out, output_data_, buffer);
3143. if(np==1) Profile::Toc();
3144. }
3145. }
3146. }
3147. }
  3148. template <class FMMNode>
  3149. void FMM_Pts<FMMNode>::Down2DownSetup(SetupData<Real_t>& setup_data, FMMTree_t* tree, std::vector<Matrix<Real_t> >& buff, std::vector<Vector<FMMNode_t*> >& n_list, int level, bool device){
  3150. if(!this->MultipoleOrder()) return;
  3151. { // Set setup_data
  3152. setup_data.level=level;
  3153. setup_data.kernel=kernel->k_l2l;
  3154. setup_data.interac_type.resize(1);
  3155. setup_data.interac_type[0]=D2D_Type;
  3156. setup_data. input_data=&buff[1];
  3157. setup_data.output_data=&buff[1];
  3158. Vector<FMMNode_t*>& nodes_in =n_list[1];
  3159. Vector<FMMNode_t*>& nodes_out=n_list[1];
  3160. setup_data.nodes_in .clear();
  3161. setup_data.nodes_out.clear();
  3162. for(size_t i=0;i<nodes_in .Dim();i++) if((nodes_in [i]->Depth()==level-1) && nodes_in [i]->pt_cnt[1]) setup_data.nodes_in .push_back(nodes_in [i]);
  3163. for(size_t i=0;i<nodes_out.Dim();i++) if((nodes_out[i]->Depth()==level ) && nodes_out[i]->pt_cnt[1]) setup_data.nodes_out.push_back(nodes_out[i]);
  3164. }
  3165. std::vector<void*>& nodes_in =setup_data.nodes_in ;
  3166. std::vector<void*>& nodes_out=setup_data.nodes_out;
  3167. std::vector<Vector<Real_t>*>& input_vector=setup_data. input_vector; input_vector.clear();
  3168. std::vector<Vector<Real_t>*>& output_vector=setup_data.output_vector; output_vector.clear();
  3169. for(size_t i=0;i<nodes_in .size();i++) input_vector.push_back(&((FMMData*)((FMMNode*)nodes_in [i])->FMMData())->dnward_equiv);
  3170. for(size_t i=0;i<nodes_out.size();i++) output_vector.push_back(&((FMMData*)((FMMNode*)nodes_out[i])->FMMData())->dnward_equiv);
  3171. SetupInterac(setup_data,device);
  3172. }
  3173. template <class FMMNode>
  3174. void FMM_Pts<FMMNode>::Down2Down (SetupData<Real_t>& setup_data, bool device){
  3175. if(!this->MultipoleOrder()) return;
  3176. //Add Down2Down contribution.
  3177. EvalList(setup_data, device);
  3178. }
// PtSetup: serialize a ptSetupData descriptor (per-node coordinate/value
// counts and displacements, interaction lists, scaling vectors and the four
// interpolation matrices) into the single flat buffer
// setup_data.interac_data, so the whole setup can be moved to a device as
// one contiguous allocation. EvalListPts unpacks this exact layout — the
// PackedData/InteracData/ptSetupData/PackedSetupData structs here and there
// must stay field-for-field identical.
template <class FMMNode>
void FMM_Pts<FMMNode>::PtSetup(SetupData<Real_t>& setup_data, void* data_){
// Non-owning views over one point array: cnt[i]/dsp[i] give the length and
// offset of node i's data inside the matrix *ptr.
struct PackedData{
size_t len;
Matrix<Real_t>* ptr;
Vector<size_t> cnt;
Vector<size_t> dsp;
};
// Per-target interaction lists: for target trg, entries
// interac_dsp[trg] .. interac_dsp[trg]+interac_cnt[trg]-1 of in_node /
// scal_idx / coord_shift describe its source nodes.
struct InteracData{
Vector<size_t> in_node;
Vector<size_t> scal_idx;
Vector<Real_t> coord_shift;
Vector<size_t> interac_cnt;
Vector<size_t> interac_dsp;
Vector<size_t> interac_cst;
Vector<Real_t> scal[4*MAX_DEPTH];
Matrix<Real_t> M[4];
};
struct ptSetupData{
int level;
const Kernel<Real_t>* kernel;
PackedData src_coord; // Src coord
PackedData src_value; // Src density
PackedData srf_coord; // Srf coord
PackedData srf_value; // Srf density
PackedData trg_coord; // Trg coord
PackedData trg_value; // Trg potential
InteracData interac_data;
};
ptSetupData& data=*(ptSetupData*)data_;
if(data.interac_data.interac_cnt.Dim()){ // Set data.interac_data.interac_cst
// Estimate per-target work as (#source points + #surface points) x
// (#target points) summed over the target's interactions; the prefix sum
// (interac_cst) is later used by EvalListPts to load-balance targets
// across threads.
InteracData& intdata=data.interac_data;
Vector<size_t> cnt;
Vector<size_t>& dsp=intdata.interac_cst;
cnt.ReInit(intdata.interac_cnt.Dim());
dsp.ReInit(intdata.interac_dsp.Dim());
#pragma omp parallel for
for(size_t trg=0;trg<cnt.Dim();trg++){
size_t trg_cnt=data.trg_coord.cnt[trg];
cnt[trg]=0;
for(size_t i=0;i<intdata.interac_cnt[trg];i++){
size_t int_id=intdata.interac_dsp[trg]+i;
size_t src=intdata.in_node[int_id];
size_t src_cnt=data.src_coord.cnt[src];
size_t srf_cnt=data.srf_coord.cnt[src];
cnt[trg]+=(src_cnt+srf_cnt)*trg_cnt;
}
}
// NOTE(review): dsp[0] is written before the scan; if omp_par::scan
// overwrites dsp[0] this first store is redundant — confirm against
// omp_par::scan's exclusive/inclusive semantics.
dsp[0]=cnt[0];
omp_par::scan(&cnt[0],&dsp[0],dsp.Dim());
}
{ // pack data
// Header placed at the start of the flat buffer: raw matrix pointers plus
// (size, offset) pairs locating each serialized array within the buffer.
struct PackedSetupData{
size_t size;
int level;
const Kernel<Real_t>* kernel;
Matrix<Real_t>* src_coord; // Src coord
Matrix<Real_t>* src_value; // Src density
Matrix<Real_t>* srf_coord; // Srf coord
Matrix<Real_t>* srf_value; // Srf density
Matrix<Real_t>* trg_coord; // Trg coord
Matrix<Real_t>* trg_value; // Trg potential
size_t src_coord_cnt_size; size_t src_coord_cnt_offset;
size_t src_coord_dsp_size; size_t src_coord_dsp_offset;
size_t src_value_cnt_size; size_t src_value_cnt_offset;
size_t src_value_dsp_size; size_t src_value_dsp_offset;
size_t srf_coord_cnt_size; size_t srf_coord_cnt_offset;
size_t srf_coord_dsp_size; size_t srf_coord_dsp_offset;
size_t srf_value_cnt_size; size_t srf_value_cnt_offset;
size_t srf_value_dsp_size; size_t srf_value_dsp_offset;
size_t trg_coord_cnt_size; size_t trg_coord_cnt_offset;
size_t trg_coord_dsp_size; size_t trg_coord_dsp_offset;
size_t trg_value_cnt_size; size_t trg_value_cnt_offset;
size_t trg_value_dsp_size; size_t trg_value_dsp_offset;
// interac_data
size_t in_node_size; size_t in_node_offset;
size_t scal_idx_size; size_t scal_idx_offset;
size_t coord_shift_size; size_t coord_shift_offset;
size_t interac_cnt_size; size_t interac_cnt_offset;
size_t interac_dsp_size; size_t interac_dsp_offset;
size_t interac_cst_size; size_t interac_cst_offset;
size_t scal_dim[4*MAX_DEPTH]; size_t scal_offset[4*MAX_DEPTH];
size_t Mdim[4][2]; size_t M_offset[4];
};
PackedSetupData pkd_data;
{ // Set pkd_data
// First pass: compute aligned offsets for every array; `offset` ends up
// as the total buffer size required.
size_t offset=mem::align_ptr(sizeof(PackedSetupData));
pkd_data. level=data. level;
pkd_data.kernel=data.kernel;
pkd_data.src_coord=data.src_coord.ptr;
pkd_data.src_value=data.src_value.ptr;
pkd_data.srf_coord=data.srf_coord.ptr;
pkd_data.srf_value=data.srf_value.ptr;
pkd_data.trg_coord=data.trg_coord.ptr;
pkd_data.trg_value=data.trg_value.ptr;
pkd_data.src_coord_cnt_offset=offset; pkd_data.src_coord_cnt_size=data.src_coord.cnt.Dim(); offset+=mem::align_ptr(sizeof(size_t)*pkd_data.src_coord_cnt_size);
pkd_data.src_coord_dsp_offset=offset; pkd_data.src_coord_dsp_size=data.src_coord.dsp.Dim(); offset+=mem::align_ptr(sizeof(size_t)*pkd_data.src_coord_dsp_size);
pkd_data.src_value_cnt_offset=offset; pkd_data.src_value_cnt_size=data.src_value.cnt.Dim(); offset+=mem::align_ptr(sizeof(size_t)*pkd_data.src_value_cnt_size);
pkd_data.src_value_dsp_offset=offset; pkd_data.src_value_dsp_size=data.src_value.dsp.Dim(); offset+=mem::align_ptr(sizeof(size_t)*pkd_data.src_value_dsp_size);
pkd_data.srf_coord_cnt_offset=offset; pkd_data.srf_coord_cnt_size=data.srf_coord.cnt.Dim(); offset+=mem::align_ptr(sizeof(size_t)*pkd_data.srf_coord_cnt_size);
pkd_data.srf_coord_dsp_offset=offset; pkd_data.srf_coord_dsp_size=data.srf_coord.dsp.Dim(); offset+=mem::align_ptr(sizeof(size_t)*pkd_data.srf_coord_dsp_size);
pkd_data.srf_value_cnt_offset=offset; pkd_data.srf_value_cnt_size=data.srf_value.cnt.Dim(); offset+=mem::align_ptr(sizeof(size_t)*pkd_data.srf_value_cnt_size);
pkd_data.srf_value_dsp_offset=offset; pkd_data.srf_value_dsp_size=data.srf_value.dsp.Dim(); offset+=mem::align_ptr(sizeof(size_t)*pkd_data.srf_value_dsp_size);
pkd_data.trg_coord_cnt_offset=offset; pkd_data.trg_coord_cnt_size=data.trg_coord.cnt.Dim(); offset+=mem::align_ptr(sizeof(size_t)*pkd_data.trg_coord_cnt_size);
pkd_data.trg_coord_dsp_offset=offset; pkd_data.trg_coord_dsp_size=data.trg_coord.dsp.Dim(); offset+=mem::align_ptr(sizeof(size_t)*pkd_data.trg_coord_dsp_size);
pkd_data.trg_value_cnt_offset=offset; pkd_data.trg_value_cnt_size=data.trg_value.cnt.Dim(); offset+=mem::align_ptr(sizeof(size_t)*pkd_data.trg_value_cnt_size);
pkd_data.trg_value_dsp_offset=offset; pkd_data.trg_value_dsp_size=data.trg_value.dsp.Dim(); offset+=mem::align_ptr(sizeof(size_t)*pkd_data.trg_value_dsp_size);
InteracData& intdata=data.interac_data;
pkd_data. in_node_offset=offset; pkd_data. in_node_size=intdata. in_node.Dim(); offset+=mem::align_ptr(sizeof(size_t)*pkd_data. in_node_size);
pkd_data. scal_idx_offset=offset; pkd_data. scal_idx_size=intdata. scal_idx.Dim(); offset+=mem::align_ptr(sizeof(size_t)*pkd_data. scal_idx_size);
pkd_data.coord_shift_offset=offset; pkd_data.coord_shift_size=intdata.coord_shift.Dim(); offset+=mem::align_ptr(sizeof(Real_t)*pkd_data.coord_shift_size);
pkd_data.interac_cnt_offset=offset; pkd_data.interac_cnt_size=intdata.interac_cnt.Dim(); offset+=mem::align_ptr(sizeof(size_t)*pkd_data.interac_cnt_size);
pkd_data.interac_dsp_offset=offset; pkd_data.interac_dsp_size=intdata.interac_dsp.Dim(); offset+=mem::align_ptr(sizeof(size_t)*pkd_data.interac_dsp_size);
pkd_data.interac_cst_offset=offset; pkd_data.interac_cst_size=intdata.interac_cst.Dim(); offset+=mem::align_ptr(sizeof(size_t)*pkd_data.interac_cst_size);
for(size_t i=0;i<4*MAX_DEPTH;i++){
pkd_data.scal_offset[i]=offset; pkd_data.scal_dim[i]=intdata.scal[i].Dim(); offset+=mem::align_ptr(sizeof(Real_t)*pkd_data.scal_dim[i]);
}
for(size_t i=0;i<4;i++){
size_t& Mdim0=pkd_data.Mdim[i][0];
size_t& Mdim1=pkd_data.Mdim[i][1];
pkd_data.M_offset[i]=offset; Mdim0=intdata.M[i].Dim(0); Mdim1=intdata.M[i].Dim(1); offset+=mem::align_ptr(sizeof(Real_t)*Mdim0*Mdim1);
}
pkd_data.size=offset;
}
{ // Set setup_data.interac_data
// Second pass: grow the buffer if needed (never shrink), write the
// header, then copy each array to its precomputed offset.
Matrix<char>& buff=setup_data.interac_data;
if(pkd_data.size>buff.Dim(0)*buff.Dim(1)){
buff.ReInit(1,pkd_data.size);
}
((PackedSetupData*)buff[0])[0]=pkd_data;
if(pkd_data.src_coord_cnt_size) memcpy(&buff[0][pkd_data.src_coord_cnt_offset], &data.src_coord.cnt[0], pkd_data.src_coord_cnt_size*sizeof(size_t));
if(pkd_data.src_coord_dsp_size) memcpy(&buff[0][pkd_data.src_coord_dsp_offset], &data.src_coord.dsp[0], pkd_data.src_coord_dsp_size*sizeof(size_t));
if(pkd_data.src_value_cnt_size) memcpy(&buff[0][pkd_data.src_value_cnt_offset], &data.src_value.cnt[0], pkd_data.src_value_cnt_size*sizeof(size_t));
if(pkd_data.src_value_dsp_size) memcpy(&buff[0][pkd_data.src_value_dsp_offset], &data.src_value.dsp[0], pkd_data.src_value_dsp_size*sizeof(size_t));
if(pkd_data.srf_coord_cnt_size) memcpy(&buff[0][pkd_data.srf_coord_cnt_offset], &data.srf_coord.cnt[0], pkd_data.srf_coord_cnt_size*sizeof(size_t));
if(pkd_data.srf_coord_dsp_size) memcpy(&buff[0][pkd_data.srf_coord_dsp_offset], &data.srf_coord.dsp[0], pkd_data.srf_coord_dsp_size*sizeof(size_t));
if(pkd_data.srf_value_cnt_size) memcpy(&buff[0][pkd_data.srf_value_cnt_offset], &data.srf_value.cnt[0], pkd_data.srf_value_cnt_size*sizeof(size_t));
if(pkd_data.srf_value_dsp_size) memcpy(&buff[0][pkd_data.srf_value_dsp_offset], &data.srf_value.dsp[0], pkd_data.srf_value_dsp_size*sizeof(size_t));
if(pkd_data.trg_coord_cnt_size) memcpy(&buff[0][pkd_data.trg_coord_cnt_offset], &data.trg_coord.cnt[0], pkd_data.trg_coord_cnt_size*sizeof(size_t));
if(pkd_data.trg_coord_dsp_size) memcpy(&buff[0][pkd_data.trg_coord_dsp_offset], &data.trg_coord.dsp[0], pkd_data.trg_coord_dsp_size*sizeof(size_t));
if(pkd_data.trg_value_cnt_size) memcpy(&buff[0][pkd_data.trg_value_cnt_offset], &data.trg_value.cnt[0], pkd_data.trg_value_cnt_size*sizeof(size_t));
if(pkd_data.trg_value_dsp_size) memcpy(&buff[0][pkd_data.trg_value_dsp_offset], &data.trg_value.dsp[0], pkd_data.trg_value_dsp_size*sizeof(size_t));
InteracData& intdata=data.interac_data;
if(pkd_data. in_node_size) memcpy(&buff[0][pkd_data. in_node_offset], &intdata. in_node[0], pkd_data. in_node_size*sizeof(size_t));
if(pkd_data. scal_idx_size) memcpy(&buff[0][pkd_data. scal_idx_offset], &intdata. scal_idx[0], pkd_data. scal_idx_size*sizeof(size_t));
if(pkd_data.coord_shift_size) memcpy(&buff[0][pkd_data.coord_shift_offset], &intdata.coord_shift[0], pkd_data.coord_shift_size*sizeof(Real_t));
if(pkd_data.interac_cnt_size) memcpy(&buff[0][pkd_data.interac_cnt_offset], &intdata.interac_cnt[0], pkd_data.interac_cnt_size*sizeof(size_t));
if(pkd_data.interac_dsp_size) memcpy(&buff[0][pkd_data.interac_dsp_offset], &intdata.interac_dsp[0], pkd_data.interac_dsp_size*sizeof(size_t));
if(pkd_data.interac_cst_size) memcpy(&buff[0][pkd_data.interac_cst_offset], &intdata.interac_cst[0], pkd_data.interac_cst_size*sizeof(size_t));
for(size_t i=0;i<4*MAX_DEPTH;i++){
if(intdata.scal[i].Dim()) memcpy(&buff[0][pkd_data.scal_offset[i]], &intdata.scal[i][0], intdata.scal[i].Dim()*sizeof(Real_t));
}
for(size_t i=0;i<4;i++){
if(intdata.M[i].Dim(0)*intdata.M[i].Dim(1)) memcpy(&buff[0][pkd_data.M_offset[i]], &intdata.M[i][0][0], intdata.M[i].Dim(0)*intdata.M[i].Dim(1)*sizeof(Real_t));
}
}
}
{ // Resize device buffer
// EvalListPts carves dev_buffer into per-thread scratch; reserve at least
// the size of the output data.
size_t n=setup_data.output_data->Dim(0)*setup_data.output_data->Dim(1)*sizeof(Real_t);
if(this->dev_buffer.Dim()<n) this->dev_buffer.ReInit(n);
}
}
// EvalListPts: evaluate the point-to-point interactions serialized by
// PtSetup into setup_data.interac_data. For each target node it optionally
// applies a pre-multiplication of the source densities (M[0],M[1] with
// per-level scaling), evaluates the single-layer and/or double-layer kernel,
// and optionally post-multiplies the results (M[2],M[3]) before accumulating
// into the target values. With Intel offload enabled, SYNC!=0 makes the call
// block until the offloaded computation has finished.
template <class FMMNode>
template <int SYNC>
void FMM_Pts<FMMNode>::EvalListPts(SetupData<Real_t>& setup_data, bool device){
// Degenerate kernel (zero input or output dimension): nothing to compute.
if(setup_data.kernel->ker_dim[0]*setup_data.kernel->ker_dim[1]==0) return;
// No packed interaction data: still emit the two profiling sections so the
// timing output stays balanced, then return.
if(setup_data.interac_data.Dim(0)==0 || setup_data.interac_data.Dim(1)==0){
Profile::Tic("Host2Device",&this->comm,false,25);
Profile::Toc();
Profile::Tic("DeviceComp",&this->comm,false,20);
Profile::Toc();
return;
}
bool have_gpu=false;
#if defined(PVFMM_HAVE_CUDA)
have_gpu=true;
#endif
Profile::Tic("Host2Device",&this->comm,false,25);
typename Vector<char>::Device dev_buff;
typename Matrix<char>::Device interac_data;
typename Matrix<Real_t>::Device coord_data;
typename Matrix<Real_t>::Device input_data;
typename Matrix<Real_t>::Device output_data;
// Kernel entry points are passed around as integers so the same code path
// can hold either a host or a device function address.
size_t ptr_single_layer_kernel=(size_t)NULL;
size_t ptr_double_layer_kernel=(size_t)NULL;
// Select host or device views of the buffers and the matching kernel
// function pointers.
if(device && !have_gpu){
dev_buff = this-> dev_buffer. AllocDevice(false);
interac_data= setup_data.interac_data. AllocDevice(false);
if(setup_data. coord_data!=NULL) coord_data = setup_data. coord_data->AllocDevice(false);
if(setup_data. input_data!=NULL) input_data = setup_data. input_data->AllocDevice(false);
if(setup_data. output_data!=NULL) output_data = setup_data. output_data->AllocDevice(false);
ptr_single_layer_kernel=setup_data.kernel->dev_ker_poten;
ptr_double_layer_kernel=setup_data.kernel->dev_dbl_layer_poten;
}else{
dev_buff = this-> dev_buffer;
interac_data= setup_data.interac_data;
if(setup_data. coord_data!=NULL) coord_data =*setup_data. coord_data;
if(setup_data. input_data!=NULL) input_data =*setup_data. input_data;
if(setup_data. output_data!=NULL) output_data =*setup_data. output_data;
ptr_single_layer_kernel=(size_t)setup_data.kernel->ker_poten;
ptr_double_layer_kernel=(size_t)setup_data.kernel->dbl_layer_poten;
}
Profile::Toc();
Profile::Tic("DeviceComp",&this->comm,false,20);
int lock_idx=-1;
int wait_lock_idx=-1;
if(device) wait_lock_idx=MIC_Lock::curr_lock();
if(device) lock_idx=MIC_Lock::get_lock();
#ifdef __INTEL_OFFLOAD
#pragma offload if(device) target(mic:0) signal(&MIC_Lock::lock_vec[device?lock_idx:0])
#endif
{ // Offloaded computation.
// These struct definitions mirror the ones in PtSetup byte-for-byte; they
// are redeclared locally so the offloaded region is self-contained.
struct PackedData{
size_t len;
Matrix<Real_t>* ptr;
Vector<size_t> cnt;
Vector<size_t> dsp;
};
struct InteracData{
Vector<size_t> in_node;
Vector<size_t> scal_idx;
Vector<Real_t> coord_shift;
Vector<size_t> interac_cnt;
Vector<size_t> interac_dsp;
Vector<size_t> interac_cst;
Vector<Real_t> scal[4*MAX_DEPTH];
Matrix<Real_t> M[4];
};
struct ptSetupData{
int level;
const Kernel<Real_t>* kernel;
PackedData src_coord; // Src coord
PackedData src_value; // Src density
PackedData srf_coord; // Srf coord
PackedData srf_value; // Srf density
PackedData trg_coord; // Trg coord
PackedData trg_value; // Trg potential
InteracData interac_data;
};
ptSetupData data;
{ // Initialize data
// Unpack the flat buffer written by PtSetup: read the header, then build
// non-owning Vector/Matrix views (ReInit with own_data=false) directly
// over the serialized arrays — no copies are made.
struct PackedSetupData{
size_t size;
int level;
const Kernel<Real_t>* kernel;
Matrix<Real_t>* src_coord; // Src coord
Matrix<Real_t>* src_value; // Src density
Matrix<Real_t>* srf_coord; // Srf coord
Matrix<Real_t>* srf_value; // Srf density
Matrix<Real_t>* trg_coord; // Trg coord
Matrix<Real_t>* trg_value; // Trg potential
size_t src_coord_cnt_size; size_t src_coord_cnt_offset;
size_t src_coord_dsp_size; size_t src_coord_dsp_offset;
size_t src_value_cnt_size; size_t src_value_cnt_offset;
size_t src_value_dsp_size; size_t src_value_dsp_offset;
size_t srf_coord_cnt_size; size_t srf_coord_cnt_offset;
size_t srf_coord_dsp_size; size_t srf_coord_dsp_offset;
size_t srf_value_cnt_size; size_t srf_value_cnt_offset;
size_t srf_value_dsp_size; size_t srf_value_dsp_offset;
size_t trg_coord_cnt_size; size_t trg_coord_cnt_offset;
size_t trg_coord_dsp_size; size_t trg_coord_dsp_offset;
size_t trg_value_cnt_size; size_t trg_value_cnt_offset;
size_t trg_value_dsp_size; size_t trg_value_dsp_offset;
// interac_data
size_t in_node_size; size_t in_node_offset;
size_t scal_idx_size; size_t scal_idx_offset;
size_t coord_shift_size; size_t coord_shift_offset;
size_t interac_cnt_size; size_t interac_cnt_offset;
size_t interac_dsp_size; size_t interac_dsp_offset;
size_t interac_cst_size; size_t interac_cst_offset;
size_t scal_dim[4*MAX_DEPTH]; size_t scal_offset[4*MAX_DEPTH];
size_t Mdim[4][2]; size_t M_offset[4];
};
typename Matrix<char>::Device& setupdata=interac_data;
PackedSetupData& pkd_data=*((PackedSetupData*)setupdata[0]);
data. level=pkd_data. level;
data.kernel=pkd_data.kernel;
data.src_coord.ptr=pkd_data.src_coord;
data.src_value.ptr=pkd_data.src_value;
data.srf_coord.ptr=pkd_data.srf_coord;
data.srf_value.ptr=pkd_data.srf_value;
data.trg_coord.ptr=pkd_data.trg_coord;
data.trg_value.ptr=pkd_data.trg_value;
data.src_coord.cnt.ReInit(pkd_data.src_coord_cnt_size, (size_t*)&setupdata[0][pkd_data.src_coord_cnt_offset], false);
data.src_coord.dsp.ReInit(pkd_data.src_coord_dsp_size, (size_t*)&setupdata[0][pkd_data.src_coord_dsp_offset], false);
data.src_value.cnt.ReInit(pkd_data.src_value_cnt_size, (size_t*)&setupdata[0][pkd_data.src_value_cnt_offset], false);
data.src_value.dsp.ReInit(pkd_data.src_value_dsp_size, (size_t*)&setupdata[0][pkd_data.src_value_dsp_offset], false);
data.srf_coord.cnt.ReInit(pkd_data.srf_coord_cnt_size, (size_t*)&setupdata[0][pkd_data.srf_coord_cnt_offset], false);
data.srf_coord.dsp.ReInit(pkd_data.srf_coord_dsp_size, (size_t*)&setupdata[0][pkd_data.srf_coord_dsp_offset], false);
data.srf_value.cnt.ReInit(pkd_data.srf_value_cnt_size, (size_t*)&setupdata[0][pkd_data.srf_value_cnt_offset], false);
data.srf_value.dsp.ReInit(pkd_data.srf_value_dsp_size, (size_t*)&setupdata[0][pkd_data.srf_value_dsp_offset], false);
data.trg_coord.cnt.ReInit(pkd_data.trg_coord_cnt_size, (size_t*)&setupdata[0][pkd_data.trg_coord_cnt_offset], false);
data.trg_coord.dsp.ReInit(pkd_data.trg_coord_dsp_size, (size_t*)&setupdata[0][pkd_data.trg_coord_dsp_offset], false);
data.trg_value.cnt.ReInit(pkd_data.trg_value_cnt_size, (size_t*)&setupdata[0][pkd_data.trg_value_cnt_offset], false);
data.trg_value.dsp.ReInit(pkd_data.trg_value_dsp_size, (size_t*)&setupdata[0][pkd_data.trg_value_dsp_offset], false);
InteracData& intdata=data.interac_data;
intdata. in_node.ReInit(pkd_data. in_node_size, (size_t*)&setupdata[0][pkd_data. in_node_offset],false);
intdata. scal_idx.ReInit(pkd_data. scal_idx_size, (size_t*)&setupdata[0][pkd_data. scal_idx_offset],false);
intdata.coord_shift.ReInit(pkd_data.coord_shift_size, (Real_t*)&setupdata[0][pkd_data.coord_shift_offset],false);
intdata.interac_cnt.ReInit(pkd_data.interac_cnt_size, (size_t*)&setupdata[0][pkd_data.interac_cnt_offset],false);
intdata.interac_dsp.ReInit(pkd_data.interac_dsp_size, (size_t*)&setupdata[0][pkd_data.interac_dsp_offset],false);
intdata.interac_cst.ReInit(pkd_data.interac_cst_size, (size_t*)&setupdata[0][pkd_data.interac_cst_offset],false);
for(size_t i=0;i<4*MAX_DEPTH;i++){
intdata.scal[i].ReInit(pkd_data.scal_dim[i], (Real_t*)&setupdata[0][pkd_data.scal_offset[i]],false);
}
for(size_t i=0;i<4;i++){
intdata.M[i].ReInit(pkd_data.Mdim[i][0], pkd_data.Mdim[i][1], (Real_t*)&setupdata[0][pkd_data.M_offset[i]],false);
}
}
if(device) MIC_Lock::wait_lock(wait_lock_idx);
{ // Compute interactions
InteracData& intdata=data.interac_data;
typename Kernel<Real_t>::Ker_t single_layer_kernel=(typename Kernel<Real_t>::Ker_t)ptr_single_layer_kernel;
typename Kernel<Real_t>::Ker_t double_layer_kernel=(typename Kernel<Real_t>::Ker_t)ptr_double_layer_kernel;
int omp_p=omp_get_max_threads();
#pragma omp parallel for
for(size_t tid=0;tid<omp_p;tid++){
Matrix<Real_t> src_coord, src_value;
Matrix<Real_t> srf_coord, srf_value;
Matrix<Real_t> trg_coord, trg_value;
Vector<Real_t> buff;
{ // init buff
// Each thread gets an equal, disjoint slice of dev_buff as scratch.
size_t thread_buff_size=dev_buff.dim/sizeof(Real_t)/omp_p;
buff.ReInit(thread_buff_size, (Real_t*)&dev_buff[tid*thread_buff_size*sizeof(Real_t)], false);
}
size_t vcnt=0;
std::vector<Matrix<Real_t> > vbuff(6);
{ // init vbuff[0:5]
// vbuff[0..2] stage the source mat-vec (widths from M[0],M[1]);
// vbuff[3..5] stage the target mat-vec (widths from M[2],M[3]).
// vcnt = how many interactions fit per batch in half the scratch.
size_t vdim_=0, vdim[6];
for(size_t indx=0;indx<6;indx++){
vdim[indx]=0;
switch(indx){
case 0:
vdim[indx]=intdata.M[0].Dim(0); break;
case 1:
assert(intdata.M[0].Dim(1)==intdata.M[1].Dim(0));
vdim[indx]=intdata.M[0].Dim(1); break;
case 2:
vdim[indx]=intdata.M[1].Dim(1); break;
case 3:
vdim[indx]=intdata.M[2].Dim(0); break;
case 4:
assert(intdata.M[2].Dim(1)==intdata.M[3].Dim(0));
vdim[indx]=intdata.M[2].Dim(1); break;
case 5:
vdim[indx]=intdata.M[3].Dim(1); break;
default:
vdim[indx]=0; break;
}
vdim_+=vdim[indx];
}
if(vdim_){
vcnt=buff.Dim()/vdim_/2;
assert(vcnt>0); // Thread buffer is too small
}
for(size_t indx=0;indx<6;indx++){ // init vbuff[0:5]
// Carve each staging matrix off the front of the remaining scratch.
vbuff[indx].ReInit(vcnt,vdim[indx],&buff[0],false);
buff.ReInit(buff.Dim()-vdim[indx]*vcnt, &buff[vdim[indx]*vcnt], false);
}
}
size_t trg_a=0, trg_b=0;
if(intdata.interac_cst.Dim()){ // Determine trg_a, trg_b
// Cost-balanced partition of targets across threads: binary-search the
// cumulative-cost array (built in PtSetup) for this thread's share.
//trg_a=((tid+0)*intdata.interac_cnt.Dim())/omp_p;
//trg_b=((tid+1)*intdata.interac_cnt.Dim())/omp_p;
Vector<size_t>& interac_cst=intdata.interac_cst;
size_t cost=interac_cst[interac_cst.Dim()-1];
trg_a=std::lower_bound(&interac_cst[0],&interac_cst[interac_cst.Dim()-1],(cost*(tid+0))/omp_p)-&interac_cst[0]+1;
trg_b=std::lower_bound(&interac_cst[0],&interac_cst[interac_cst.Dim()-1],(cost*(tid+1))/omp_p)-&interac_cst[0]+1;
if(tid==omp_p-1) trg_b=interac_cst.Dim();
if(tid==0) trg_a=0;
}
// Process targets in batches [trg0, trg0+trg1_max) sized so that the
// batch's total interaction count fits in the staging buffers.
for(size_t trg0=trg_a;trg0<trg_b;){
size_t trg1_max=1;
if(vcnt){ // Find trg1_max
size_t interac_cnt=intdata.interac_cnt[trg0];
while(trg0+trg1_max<trg_b){
interac_cnt+=intdata.interac_cnt[trg0+trg1_max];
if(interac_cnt>vcnt){
interac_cnt-=intdata.interac_cnt[trg0+trg1_max];
break;
}
trg1_max++;
}
assert(interac_cnt<=vcnt);
for(size_t k=0;k<6;k++){
if(vbuff[k].Dim(0)*vbuff[k].Dim(1)){
vbuff[k].ReInit(interac_cnt,vbuff[k].Dim(1),vbuff[k][0],false);
}
}
}else{
trg1_max=trg_b-trg0;
}
if(intdata.M[0].Dim(0) && intdata.M[0].Dim(1) && intdata.M[1].Dim(0) && intdata.M[1].Dim(1)){ // src mat-vec
// Pre-multiply source densities: gather into vbuff[0], scale with
// scal[4*idx+0], apply M[0] then M[1], scale with scal[4*idx+1].
size_t interac_idx=0;
for(size_t trg1=0;trg1<trg1_max;trg1++){ // Copy src_value to vbuff[0]
size_t trg=trg0+trg1;
for(size_t i=0;i<intdata.interac_cnt[trg];i++){
size_t int_id=intdata.interac_dsp[trg]+i;
size_t src=intdata.in_node[int_id];
src_value.ReInit(1, data.src_value.cnt[src], &data.src_value.ptr[0][0][data.src_value.dsp[src]], false);
{ // Copy src_value to vbuff[0]
size_t vdim=vbuff[0].Dim(1);
assert(src_value.Dim(1)==vdim);
for(size_t j=0;j<vdim;j++) vbuff[0][interac_idx][j]=src_value[0][j];
}
size_t scal_idx=intdata.scal_idx[int_id];
{ // scaling
// Scaling vector is tiled over the row (period scal_dim).
Matrix<Real_t>& vec=vbuff[0];
Vector<Real_t>& scal=intdata.scal[scal_idx*4+0];
size_t scal_dim=scal.Dim();
if(scal_dim){
size_t vdim=vec.Dim(1);
for(size_t j=0;j<vdim;j+=scal_dim){
for(size_t k=0;k<scal_dim;k++){
vec[interac_idx][j+k]*=scal[k];
}
}
}
}
interac_idx++;
}
}
Matrix<Real_t>::GEMM(vbuff[1],vbuff[0],intdata.M[0]);
Matrix<Real_t>::GEMM(vbuff[2],vbuff[1],intdata.M[1]);
interac_idx=0;
for(size_t trg1=0;trg1<trg1_max;trg1++){
size_t trg=trg0+trg1;
for(size_t i=0;i<intdata.interac_cnt[trg];i++){
size_t int_id=intdata.interac_dsp[trg]+i;
size_t scal_idx=intdata.scal_idx[int_id];
{ // scaling
Matrix<Real_t>& vec=vbuff[2];
Vector<Real_t>& scal=intdata.scal[scal_idx*4+1];
size_t scal_dim=scal.Dim();
if(scal_dim){
size_t vdim=vec.Dim(1);
for(size_t j=0;j<vdim;j+=scal_dim){
for(size_t k=0;k<scal_dim;k++){
vec[interac_idx][j+k]*=scal[k];
}
}
}
}
interac_idx++;
}
}
}
if(intdata.M[2].Dim(0) && intdata.M[2].Dim(1) && intdata.M[3].Dim(0) && intdata.M[3].Dim(1)){ // init vbuff[3]
// Zero the accumulation buffer for the kernel outputs.
size_t vdim=vbuff[3].Dim(0)*vbuff[3].Dim(1);
for(size_t i=0;i<vdim;i++) vbuff[3][0][i]=0;
}
{ // Evaluate kernel functions
// Inputs come from vbuff[2] when the src mat-vec ran, otherwise
// directly from src_value; outputs go to vbuff[3] when the trg
// mat-vec will run, otherwise directly into trg_value.
size_t interac_idx=0;
for(size_t trg1=0;trg1<trg1_max;trg1++){
size_t trg=trg0+trg1;
trg_coord.ReInit(1, data.trg_coord.cnt[trg], &data.trg_coord.ptr[0][0][data.trg_coord.dsp[trg]], false);
trg_value.ReInit(1, data.trg_value.cnt[trg], &data.trg_value.ptr[0][0][data.trg_value.dsp[trg]], false);
for(size_t i=0;i<intdata.interac_cnt[trg];i++){
size_t int_id=intdata.interac_dsp[trg]+i;
size_t src=intdata.in_node[int_id];
src_coord.ReInit(1, data.src_coord.cnt[src], &data.src_coord.ptr[0][0][data.src_coord.dsp[src]], false);
src_value.ReInit(1, data.src_value.cnt[src], &data.src_value.ptr[0][0][data.src_value.dsp[src]], false);
srf_coord.ReInit(1, data.srf_coord.cnt[src], &data.srf_coord.ptr[0][0][data.srf_coord.dsp[src]], false);
srf_value.ReInit(1, data.srf_value.cnt[src], &data.srf_value.ptr[0][0][data.srf_value.dsp[src]], false);
Real_t* vbuff2_ptr=(vbuff[2].Dim(0)*vbuff[2].Dim(1)?vbuff[2][interac_idx]:src_value[0]);
Real_t* vbuff3_ptr=(vbuff[3].Dim(0)*vbuff[3].Dim(1)?vbuff[3][interac_idx]:trg_value[0]);
if(src_coord.Dim(1)){
{ // coord_shift
// Apply the periodic-image shift (if nonzero) by building a
// shifted copy of the source coordinates in scratch.
Real_t* shift=&intdata.coord_shift[int_id*COORD_DIM];
if(shift[0]!=0 || shift[1]!=0 || shift[2]!=0){
size_t vdim=src_coord.Dim(1);
Vector<Real_t> new_coord(vdim, &buff[0], false);
assert(buff.Dim()>=vdim); // Thread buffer is too small
//buff.ReInit(buff.Dim()-vdim, &buff[vdim], false);
for(size_t j=0;j<vdim;j+=COORD_DIM){
for(size_t k=0;k<COORD_DIM;k++){
new_coord[j+k]=src_coord[0][j+k]+shift[k];
}
}
src_coord.ReInit(1, vdim, &new_coord[0], false);
}
}
assert(ptr_single_layer_kernel); // assert(Single-layer kernel is implemented)
single_layer_kernel(src_coord[0], src_coord.Dim(1)/COORD_DIM, vbuff2_ptr, 1,
trg_coord[0], trg_coord.Dim(1)/COORD_DIM, vbuff3_ptr, NULL);
}
if(srf_coord.Dim(1)){
{ // coord_shift
Real_t* shift=&intdata.coord_shift[int_id*COORD_DIM];
if(shift[0]!=0 || shift[1]!=0 || shift[2]!=0){
size_t vdim=srf_coord.Dim(1);
Vector<Real_t> new_coord(vdim, &buff[0], false);
assert(buff.Dim()>=vdim); // Thread buffer is too small
//buff.ReInit(buff.Dim()-vdim, &buff[vdim], false);
for(size_t j=0;j<vdim;j+=COORD_DIM){
for(size_t k=0;k<COORD_DIM;k++){
new_coord[j+k]=srf_coord[0][j+k]+shift[k];
}
}
srf_coord.ReInit(1, vdim, &new_coord[0], false);
}
}
assert(ptr_double_layer_kernel); // assert(Double-layer kernel is implemented)
double_layer_kernel(srf_coord[0], srf_coord.Dim(1)/COORD_DIM, srf_value[0], 1,
trg_coord[0], trg_coord.Dim(1)/COORD_DIM, vbuff3_ptr, NULL);
}
interac_idx++;
}
}
}
if(intdata.M[2].Dim(0) && intdata.M[2].Dim(1) && intdata.M[3].Dim(0) && intdata.M[3].Dim(1)){ // trg mat-vec
// Post-multiply kernel outputs: scale with scal[4*idx+2], apply M[2]
// then M[3], scale with scal[4*idx+3], accumulate into trg_value.
size_t interac_idx=0;
for(size_t trg1=0;trg1<trg1_max;trg1++){
size_t trg=trg0+trg1;
for(size_t i=0;i<intdata.interac_cnt[trg];i++){
size_t int_id=intdata.interac_dsp[trg]+i;
size_t scal_idx=intdata.scal_idx[int_id];
{ // scaling
Matrix<Real_t>& vec=vbuff[3];
Vector<Real_t>& scal=intdata.scal[scal_idx*4+2];
size_t scal_dim=scal.Dim();
if(scal_dim){
size_t vdim=vec.Dim(1);
for(size_t j=0;j<vdim;j+=scal_dim){
for(size_t k=0;k<scal_dim;k++){
vec[interac_idx][j+k]*=scal[k];
}
}
}
}
interac_idx++;
}
}
Matrix<Real_t>::GEMM(vbuff[4],vbuff[3],intdata.M[2]);
Matrix<Real_t>::GEMM(vbuff[5],vbuff[4],intdata.M[3]);
interac_idx=0;
for(size_t trg1=0;trg1<trg1_max;trg1++){
size_t trg=trg0+trg1;
trg_value.ReInit(1, data.trg_value.cnt[trg], &data.trg_value.ptr[0][0][data.trg_value.dsp[trg]], false);
for(size_t i=0;i<intdata.interac_cnt[trg];i++){
size_t int_id=intdata.interac_dsp[trg]+i;
size_t scal_idx=intdata.scal_idx[int_id];
{ // scaling
Matrix<Real_t>& vec=vbuff[5];
Vector<Real_t>& scal=intdata.scal[scal_idx*4+3];
size_t scal_dim=scal.Dim();
if(scal_dim){
size_t vdim=vec.Dim(1);
for(size_t j=0;j<vdim;j+=scal_dim){
for(size_t k=0;k<scal_dim;k++){
vec[interac_idx][j+k]*=scal[k];
}
}
}
}
{ // Add vbuff[5] to trg_value
size_t vdim=vbuff[5].Dim(1);
assert(trg_value.Dim(1)==vdim);
for(size_t i=0;i<vdim;i++) trg_value[0][i]+=vbuff[5][interac_idx][i];
}
interac_idx++;
}
}
}
trg0+=trg1_max;
}
}
}
if(device) MIC_Lock::release_lock(lock_idx);
}
#ifdef __INTEL_OFFLOAD
// Optional synchronization: wait for the offloaded signal before returning.
if(SYNC){
#pragma offload if(device) target(mic:0)
{if(device) MIC_Lock::wait_lock(lock_idx);}
}
#endif
Profile::Toc();
}
template <class FMMNode>
void FMM_Pts<FMMNode>::X_ListSetup(SetupData<Real_t>& setup_data, FMMTree_t* tree, std::vector<Matrix<Real_t> >& buff, std::vector<Vector<FMMNode_t*> >& n_list, int level, bool device){
  // Precompute the interaction data for the X-list (source-to-local, S2L):
  // source/surface points of adjacent coarser leaves contribute directly to a
  // target node's downward-check surface. The packed ptSetupData built here is
  // handed to PtSetup() and later evaluated by EvalListPts().
  //
  // NOTE(review): the meaning of the buff[]/n_list[] slots (4=source input,
  // 1=downward-equivalent output, 6=coordinates) is fixed by setup code
  // outside this view — confirm against CollectNodeData.
  if(!this->MultipoleOrder()) return;
  { // Set setup_data
    setup_data. level=level;
    setup_data.kernel=kernel->k_s2l;     // source-to-local kernel
    setup_data. input_data=&buff[4];
    setup_data.output_data=&buff[1];
    setup_data. coord_data=&buff[6];
    Vector<FMMNode_t*>& nodes_in =n_list[4];
    Vector<FMMNode_t*>& nodes_out=n_list[1];
    setup_data.nodes_in .clear();
    setup_data.nodes_out.clear();
    // Inputs: leaf nodes that actually carry source or surface points.
    // Outputs: non-ghost nodes with target points (pt_cnt[1]).
    for(size_t i=0;i<nodes_in .Dim();i++) if((level==0 || level==-1) && (nodes_in [i]->src_coord.Dim() || nodes_in [i]->surf_coord.Dim()) && nodes_in [i]->IsLeaf ()) setup_data.nodes_in .push_back(nodes_in [i]);
    for(size_t i=0;i<nodes_out.Dim();i++) if((level==0 || level==-1) && nodes_out[i]->pt_cnt[1] && !nodes_out[i]->IsGhost()) setup_data.nodes_out.push_back(nodes_out[i]);
  }
  // Local mirror of the packed layout consumed by PtSetup()/EvalListPts().
  struct PackedData{
    size_t len;            // total element count of the backing matrix
    Matrix<Real_t>* ptr;   // backing storage (not owned)
    Vector<size_t> cnt;    // per-node element count
    Vector<size_t> dsp;    // per-node offset into *ptr
  };
  struct InteracData{
    Vector<size_t> in_node;      // source-node index for each interaction
    Vector<size_t> scal_idx;     // scaling (source depth) index per interaction
    Vector<Real_t> coord_shift;  // (x,y,z) shift applied to source coords
    Vector<size_t> interac_cnt;  // number of interactions per target node
    Vector<size_t> interac_dsp;  // exclusive prefix sum of interac_cnt
    Vector<size_t> interac_cst;
    Vector<Real_t> scal[4*MAX_DEPTH];
    Matrix<Real_t> M[4];
  };
  struct ptSetupData{
    int level;
    const Kernel<Real_t>* kernel;
    PackedData src_coord; // Src coord
    PackedData src_value; // Src density
    PackedData srf_coord; // Srf coord
    PackedData srf_value; // Srf density
    PackedData trg_coord; // Trg coord
    PackedData trg_value; // Trg potential
    InteracData interac_data;
  };
  ptSetupData data;
  data. level=setup_data. level;
  data.kernel=setup_data.kernel;
  std::vector<void*>& nodes_in =setup_data.nodes_in ;
  std::vector<void*>& nodes_out=setup_data.nodes_out;
  { // Set src data
    // Record, for every input node, the offset/length of its source points
    // inside the shared coordinate/value matrices (offsets are computed by
    // pointer subtraction against the matrix base).
    std::vector<void*>& nodes=nodes_in;
    PackedData& coord=data.src_coord;
    PackedData& value=data.src_value;
    coord.ptr=setup_data. coord_data;
    value.ptr=setup_data. input_data;
    coord.len=coord.ptr->Dim(0)*coord.ptr->Dim(1);
    value.len=value.ptr->Dim(0)*value.ptr->Dim(1);
    coord.cnt.ReInit(nodes.size());
    coord.dsp.ReInit(nodes.size());
    value.cnt.ReInit(nodes.size());
    value.dsp.ReInit(nodes.size());
    #pragma omp parallel for
    for(size_t i=0;i<nodes.size();i++){
      ((FMMNode_t*)nodes[i])->node_id=i;  // position of this node in nodes_in; used below to resolve interaction lists
      Vector<Real_t>& coord_vec=((FMMNode_t*)nodes[i])->src_coord;
      Vector<Real_t>& value_vec=((FMMNode_t*)nodes[i])->src_value;
      if(coord_vec.Dim()){
        coord.dsp[i]=&coord_vec[0]-coord.ptr[0][0];
        assert(coord.dsp[i]<coord.len);
        coord.cnt[i]=coord_vec.Dim();
      }else{
        coord.dsp[i]=0;
        coord.cnt[i]=0;
      }
      if(value_vec.Dim()){
        value.dsp[i]=&value_vec[0]-value.ptr[0][0];
        assert(value.dsp[i]<value.len);
        value.cnt[i]=value_vec.Dim();
      }else{
        value.dsp[i]=0;
        value.cnt[i]=0;
      }
    }
  }
  { // Set srf data
    // Same packing for the surface (double-layer) points of the input nodes.
    std::vector<void*>& nodes=nodes_in;
    PackedData& coord=data.srf_coord;
    PackedData& value=data.srf_value;
    coord.ptr=setup_data. coord_data;
    value.ptr=setup_data. input_data;
    coord.len=coord.ptr->Dim(0)*coord.ptr->Dim(1);
    value.len=value.ptr->Dim(0)*value.ptr->Dim(1);
    coord.cnt.ReInit(nodes.size());
    coord.dsp.ReInit(nodes.size());
    value.cnt.ReInit(nodes.size());
    value.dsp.ReInit(nodes.size());
    #pragma omp parallel for
    for(size_t i=0;i<nodes.size();i++){
      Vector<Real_t>& coord_vec=((FMMNode_t*)nodes[i])->surf_coord;
      Vector<Real_t>& value_vec=((FMMNode_t*)nodes[i])->surf_value;
      if(coord_vec.Dim()){
        coord.dsp[i]=&coord_vec[0]-coord.ptr[0][0];
        assert(coord.dsp[i]<coord.len);
        coord.cnt[i]=coord_vec.Dim();
      }else{
        coord.dsp[i]=0;
        coord.cnt[i]=0;
      }
      if(value_vec.Dim()){
        value.dsp[i]=&value_vec[0]-value.ptr[0][0];
        assert(value.dsp[i]<value.len);
        value.cnt[i]=value_vec.Dim();
      }else{
        value.dsp[i]=0;
        value.cnt[i]=0;
      }
    }
  }
  { // Set trg data
    // Targets of the X-list are the per-depth downward-check surfaces (shared
    // by all nodes at the same depth); values accumulate into dnward_equiv.
    std::vector<void*>& nodes=nodes_out;
    PackedData& coord=data.trg_coord;
    PackedData& value=data.trg_value;
    coord.ptr=setup_data. coord_data;
    value.ptr=setup_data.output_data;
    coord.len=coord.ptr->Dim(0)*coord.ptr->Dim(1);
    value.len=value.ptr->Dim(0)*value.ptr->Dim(1);
    coord.cnt.ReInit(nodes.size());
    coord.dsp.ReInit(nodes.size());
    value.cnt.ReInit(nodes.size());
    value.dsp.ReInit(nodes.size());
    #pragma omp parallel for
    for(size_t i=0;i<nodes.size();i++){
      Vector<Real_t>& coord_vec=tree->dnwd_check_surf[((FMMNode*)nodes[i])->Depth()];
      Vector<Real_t>& value_vec=((FMMData*)((FMMNode*)nodes[i])->FMMData())->dnward_equiv;
      if(coord_vec.Dim()){
        coord.dsp[i]=&coord_vec[0]-coord.ptr[0][0];
        assert(coord.dsp[i]<coord.len);
        coord.cnt[i]=coord_vec.Dim();
      }else{
        coord.dsp[i]=0;
        coord.cnt[i]=0;
      }
      if(value_vec.Dim()){
        value.dsp[i]=&value_vec[0]-value.ptr[0][0];
        assert(value.dsp[i]<value.len);
        value.cnt[i]=value_vec.Dim();
      }else{
        value.dsp[i]=0;
        value.cnt[i]=0;
      }
    }
  }
  { // Set interac_data
    // Each thread gathers its share of (target, source) interaction pairs
    // into private vectors; they are concatenated afterwards.
    int omp_p=omp_get_max_threads();
    std::vector<std::vector<size_t> > in_node_(omp_p);
    std::vector<std::vector<size_t> > scal_idx_(omp_p);
    std::vector<std::vector<Real_t> > coord_shift_(omp_p);
    std::vector<std::vector<size_t> > interac_cnt_(omp_p);
    size_t m=this->MultipoleOrder();
    size_t Nsrf=(6*(m-1)*(m-1)+2);  // number of points on an equivalent surface
    #pragma omp parallel for
    for(size_t tid=0;tid<omp_p;tid++){
      std::vector<size_t>& in_node    =in_node_[tid]    ;
      std::vector<size_t>& scal_idx   =scal_idx_[tid]   ;
      std::vector<Real_t>& coord_shift=coord_shift_[tid];
      std::vector<size_t>& interac_cnt=interac_cnt_[tid]        ;
      size_t a=(nodes_out.size()*(tid+0))/omp_p;
      size_t b=(nodes_out.size()*(tid+1))/omp_p;
      for(size_t i=a;i<b;i++){
        FMMNode_t* tnode=(FMMNode_t*)nodes_out[i];
        if(tnode->IsLeaf() && tnode->pt_cnt[1]<=Nsrf){ // skip: handled in U-list
          interac_cnt.push_back(0);
          continue;
        }
        Real_t s=pvfmm::pow<Real_t>(0.5,tnode->Depth());  // box size at this depth
        size_t interac_cnt_=0;
        { // X_Type
          Mat_Type type=X_Type;
          Vector<FMMNode_t*>& intlst=tnode->interac_list[type];
          for(size_t j=0;j<intlst.Dim();j++) if(intlst[j]){
            FMMNode_t* snode=intlst[j];
            size_t snode_id=snode->node_id;
            // Skip sources that were filtered out of nodes_in (stale node_id).
            if(snode_id>=nodes_in.size() || nodes_in[snode_id]!=snode) continue;
            in_node.push_back(snode_id);
            scal_idx.push_back(snode->Depth());
            { // set coord_shift
              // Translate source coords relative to the target's check
              // surface. tcoord is unused here because dnwd_check_surf
              // coords are stored centered at the origin — hence (0+0.5*s).
              const int* rel_coord=interac_list.RelativeCoord(type,j);
              const Real_t* scoord=snode->Coord();
              const Real_t* tcoord=tnode->Coord();
              Real_t shift[COORD_DIM];
              shift[0]=rel_coord[0]*0.5*s-(scoord[0]+1.0*s)+(0+0.5*s);
              shift[1]=rel_coord[1]*0.5*s-(scoord[1]+1.0*s)+(0+0.5*s);
              shift[2]=rel_coord[2]*0.5*s-(scoord[2]+1.0*s)+(0+0.5*s);
              coord_shift.push_back(shift[0]);
              coord_shift.push_back(shift[1]);
              coord_shift.push_back(shift[2]);
            }
            interac_cnt_++;
          }
        }
        interac_cnt.push_back(interac_cnt_);
      }
    }
    { // Combine interac data
      // Concatenate the per-thread vectors into the flat interac_data arrays
      // (prefix-sum of sizes gives each thread's destination offset).
      InteracData& interac_data=data.interac_data;
      { // in_node
        typedef size_t ElemType;
        std::vector<std::vector<ElemType> >& vec_=in_node_;
        pvfmm::Vector<ElemType>& vec=interac_data.in_node;
        std::vector<size_t> vec_dsp(omp_p+1,0);
        for(size_t tid=0;tid<omp_p;tid++){
          vec_dsp[tid+1]=vec_dsp[tid]+vec_[tid].size();
        }
        vec.ReInit(vec_dsp[omp_p]);
        #pragma omp parallel for
        for(size_t tid=0;tid<omp_p;tid++){
          memcpy(&vec[0]+vec_dsp[tid],&vec_[tid][0],vec_[tid].size()*sizeof(ElemType));
        }
      }
      { // scal_idx
        typedef size_t ElemType;
        std::vector<std::vector<ElemType> >& vec_=scal_idx_;
        pvfmm::Vector<ElemType>& vec=interac_data.scal_idx;
        std::vector<size_t> vec_dsp(omp_p+1,0);
        for(size_t tid=0;tid<omp_p;tid++){
          vec_dsp[tid+1]=vec_dsp[tid]+vec_[tid].size();
        }
        vec.ReInit(vec_dsp[omp_p]);
        #pragma omp parallel for
        for(size_t tid=0;tid<omp_p;tid++){
          memcpy(&vec[0]+vec_dsp[tid],&vec_[tid][0],vec_[tid].size()*sizeof(ElemType));
        }
      }
      { // coord_shift
        typedef Real_t ElemType;
        std::vector<std::vector<ElemType> >& vec_=coord_shift_;
        pvfmm::Vector<ElemType>& vec=interac_data.coord_shift;
        std::vector<size_t> vec_dsp(omp_p+1,0);
        for(size_t tid=0;tid<omp_p;tid++){
          vec_dsp[tid+1]=vec_dsp[tid]+vec_[tid].size();
        }
        vec.ReInit(vec_dsp[omp_p]);
        #pragma omp parallel for
        for(size_t tid=0;tid<omp_p;tid++){
          memcpy(&vec[0]+vec_dsp[tid],&vec_[tid][0],vec_[tid].size()*sizeof(ElemType));
        }
      }
      { // interac_cnt
        typedef size_t ElemType;
        std::vector<std::vector<ElemType> >& vec_=interac_cnt_;
        pvfmm::Vector<ElemType>& vec=interac_data.interac_cnt;
        std::vector<size_t> vec_dsp(omp_p+1,0);
        for(size_t tid=0;tid<omp_p;tid++){
          vec_dsp[tid+1]=vec_dsp[tid]+vec_[tid].size();
        }
        vec.ReInit(vec_dsp[omp_p]);
        #pragma omp parallel for
        for(size_t tid=0;tid<omp_p;tid++){
          memcpy(&vec[0]+vec_dsp[tid],&vec_[tid][0],vec_[tid].size()*sizeof(ElemType));
        }
      }
      { // interac_dsp
        // Exclusive scan: interac_dsp[i] = sum of interac_cnt[0..i-1].
        pvfmm::Vector<size_t>& cnt=interac_data.interac_cnt;
        pvfmm::Vector<size_t>& dsp=interac_data.interac_dsp;
        dsp.ReInit(cnt.Dim()); if(dsp.Dim()) dsp[0]=0;
        omp_par::scan(&cnt[0],&dsp[0],dsp.Dim());
      }
    }
  }
  PtSetup(setup_data, &data);
}
  4027. template <class FMMNode>
  4028. void FMM_Pts<FMMNode>::X_List (SetupData<Real_t>& setup_data, bool device){
  4029. if(!this->MultipoleOrder()) return;
  4030. //Add X_List contribution.
  4031. this->EvalListPts(setup_data, device);
  4032. }
template <class FMMNode>
void FMM_Pts<FMMNode>::W_ListSetup(SetupData<Real_t>& setup_data, FMMTree_t* tree, std::vector<Matrix<Real_t> >& buff, std::vector<Vector<FMMNode_t*> >& n_list, int level, bool device){
  // Precompute the interaction data for the W-list (multipole-to-target,
  // M2T): the upward-equivalent densities of well-separated finer source
  // nodes are evaluated directly at a leaf's target points. The packed
  // ptSetupData is handed to PtSetup() and evaluated by EvalListPts().
  //
  // NOTE(review): buff[]/n_list[] slot meanings (0=upward-equivalent input,
  // 5=target output, 6=coordinates) are fixed outside this view — confirm
  // against CollectNodeData.
  if(!this->MultipoleOrder()) return;
  { // Set setup_data
    setup_data. level=level;
    setup_data.kernel=kernel->k_m2t;     // multipole-to-target kernel
    setup_data. input_data=&buff[0];
    setup_data.output_data=&buff[5];
    setup_data. coord_data=&buff[6];
    Vector<FMMNode_t*>& nodes_in =n_list[0];
    Vector<FMMNode_t*>& nodes_out=n_list[5];
    setup_data.nodes_in .clear();
    setup_data.nodes_out.clear();
    // Inputs: nodes with source points (pt_cnt[0]). Outputs: non-ghost
    // leaves that own target points.
    for(size_t i=0;i<nodes_in .Dim();i++) if((level==0 || level==-1) && nodes_in [i]->pt_cnt[0] ) setup_data.nodes_in .push_back(nodes_in [i]);
    for(size_t i=0;i<nodes_out.Dim();i++) if((level==0 || level==-1) && nodes_out[i]->trg_coord.Dim() && nodes_out[i]->IsLeaf() && !nodes_out[i]->IsGhost()) setup_data.nodes_out.push_back(nodes_out[i]);
  }
  // Local mirror of the packed layout consumed by PtSetup()/EvalListPts().
  struct PackedData{
    size_t len;            // total element count of the backing matrix
    Matrix<Real_t>* ptr;   // backing storage (not owned)
    Vector<size_t> cnt;    // per-node element count
    Vector<size_t> dsp;    // per-node offset into *ptr
  };
  struct InteracData{
    Vector<size_t> in_node;      // source-node index for each interaction
    Vector<size_t> scal_idx;     // scaling (source depth) index per interaction
    Vector<Real_t> coord_shift;  // (x,y,z) shift applied to source coords
    Vector<size_t> interac_cnt;  // number of interactions per target node
    Vector<size_t> interac_dsp;  // exclusive prefix sum of interac_cnt
    Vector<size_t> interac_cst;
    Vector<Real_t> scal[4*MAX_DEPTH];
    Matrix<Real_t> M[4];
  };
  struct ptSetupData{
    int level;
    const Kernel<Real_t>* kernel;
    PackedData src_coord; // Src coord
    PackedData src_value; // Src density
    PackedData srf_coord; // Srf coord
    PackedData srf_value; // Srf density
    PackedData trg_coord; // Trg coord
    PackedData trg_value; // Trg potential
    InteracData interac_data;
  };
  ptSetupData data;
  data. level=setup_data. level;
  data.kernel=setup_data.kernel;
  std::vector<void*>& nodes_in =setup_data.nodes_in ;
  std::vector<void*>& nodes_out=setup_data.nodes_out;
  { // Set src data
    // Sources here are the per-depth upward-equivalent surfaces (shared by
    // all nodes at the same depth) with each node's upward_equiv densities.
    std::vector<void*>& nodes=nodes_in;
    PackedData& coord=data.src_coord;
    PackedData& value=data.src_value;
    coord.ptr=setup_data. coord_data;
    value.ptr=setup_data. input_data;
    coord.len=coord.ptr->Dim(0)*coord.ptr->Dim(1);
    value.len=value.ptr->Dim(0)*value.ptr->Dim(1);
    coord.cnt.ReInit(nodes.size());
    coord.dsp.ReInit(nodes.size());
    value.cnt.ReInit(nodes.size());
    value.dsp.ReInit(nodes.size());
    #pragma omp parallel for
    for(size_t i=0;i<nodes.size();i++){
      ((FMMNode_t*)nodes[i])->node_id=i;  // position in nodes_in; used below to resolve interaction lists
      Vector<Real_t>& coord_vec=tree->upwd_equiv_surf[((FMMNode*)nodes[i])->Depth()];
      Vector<Real_t>& value_vec=((FMMData*)((FMMNode*)nodes[i])->FMMData())->upward_equiv;
      if(coord_vec.Dim()){
        coord.dsp[i]=&coord_vec[0]-coord.ptr[0][0];
        assert(coord.dsp[i]<coord.len);
        coord.cnt[i]=coord_vec.Dim();
      }else{
        coord.dsp[i]=0;
        coord.cnt[i]=0;
      }
      if(value_vec.Dim()){
        value.dsp[i]=&value_vec[0]-value.ptr[0][0];
        assert(value.dsp[i]<value.len);
        value.cnt[i]=value_vec.Dim();
      }else{
        value.dsp[i]=0;
        value.cnt[i]=0;
      }
    }
  }
  { // Set srf data
    // The W-list has no surface (double-layer) sources: all counts/offsets
    // are zeroed.
    std::vector<void*>& nodes=nodes_in;
    PackedData& coord=data.srf_coord;
    PackedData& value=data.srf_value;
    coord.ptr=setup_data. coord_data;
    value.ptr=setup_data. input_data;
    coord.len=coord.ptr->Dim(0)*coord.ptr->Dim(1);
    value.len=value.ptr->Dim(0)*value.ptr->Dim(1);
    coord.cnt.ReInit(nodes.size());
    coord.dsp.ReInit(nodes.size());
    value.cnt.ReInit(nodes.size());
    value.dsp.ReInit(nodes.size());
    #pragma omp parallel for
    for(size_t i=0;i<nodes.size();i++){
      coord.dsp[i]=0;
      coord.cnt[i]=0;
      value.dsp[i]=0;
      value.cnt[i]=0;
    }
  }
  { // Set trg data
    // Targets are the leaf's own target points; results go to trg_value.
    std::vector<void*>& nodes=nodes_out;
    PackedData& coord=data.trg_coord;
    PackedData& value=data.trg_value;
    coord.ptr=setup_data. coord_data;
    value.ptr=setup_data.output_data;
    coord.len=coord.ptr->Dim(0)*coord.ptr->Dim(1);
    value.len=value.ptr->Dim(0)*value.ptr->Dim(1);
    coord.cnt.ReInit(nodes.size());
    coord.dsp.ReInit(nodes.size());
    value.cnt.ReInit(nodes.size());
    value.dsp.ReInit(nodes.size());
    #pragma omp parallel for
    for(size_t i=0;i<nodes.size();i++){
      Vector<Real_t>& coord_vec=((FMMNode_t*)nodes[i])->trg_coord;
      Vector<Real_t>& value_vec=((FMMNode_t*)nodes[i])->trg_value;
      if(coord_vec.Dim()){
        coord.dsp[i]=&coord_vec[0]-coord.ptr[0][0];
        assert(coord.dsp[i]<coord.len);
        coord.cnt[i]=coord_vec.Dim();
      }else{
        coord.dsp[i]=0;
        coord.cnt[i]=0;
      }
      if(value_vec.Dim()){
        value.dsp[i]=&value_vec[0]-value.ptr[0][0];
        assert(value.dsp[i]<value.len);
        value.cnt[i]=value_vec.Dim();
      }else{
        value.dsp[i]=0;
        value.cnt[i]=0;
      }
    }
  }
  { // Set interac_data
    // Each thread gathers its share of (target, source) interaction pairs
    // into private vectors; they are concatenated afterwards.
    int omp_p=omp_get_max_threads();
    std::vector<std::vector<size_t> > in_node_(omp_p);
    std::vector<std::vector<size_t> > scal_idx_(omp_p);
    std::vector<std::vector<Real_t> > coord_shift_(omp_p);
    std::vector<std::vector<size_t> > interac_cnt_(omp_p);
    size_t m=this->MultipoleOrder();
    size_t Nsrf=(6*(m-1)*(m-1)+2);  // number of points on an equivalent surface
    #pragma omp parallel for
    for(size_t tid=0;tid<omp_p;tid++){
      std::vector<size_t>& in_node    =in_node_[tid]    ;
      std::vector<size_t>& scal_idx   =scal_idx_[tid]   ;
      std::vector<Real_t>& coord_shift=coord_shift_[tid];
      std::vector<size_t>& interac_cnt=interac_cnt_[tid]        ;
      size_t a=(nodes_out.size()*(tid+0))/omp_p;
      size_t b=(nodes_out.size()*(tid+1))/omp_p;
      for(size_t i=a;i<b;i++){
        FMMNode_t* tnode=(FMMNode_t*)nodes_out[i];
        Real_t s=pvfmm::pow<Real_t>(0.5,tnode->Depth());  // box size at this depth
        size_t interac_cnt_=0;
        { // W_Type
          Mat_Type type=W_Type;
          Vector<FMMNode_t*>& intlst=tnode->interac_list[type];
          for(size_t j=0;j<intlst.Dim();j++) if(intlst[j]){
            FMMNode_t* snode=intlst[j];
            size_t snode_id=snode->node_id;
            // Skip sources that were filtered out of nodes_in (stale node_id).
            if(snode_id>=nodes_in.size() || nodes_in[snode_id]!=snode) continue;
            // Non-leaf ghost nodes have no point data, so we must use their
            // multipole expansion here; otherwise small leaves are deferred
            // to the U-list (direct evaluation).
            if(snode->IsGhost() && snode->src_coord.Dim()+snode->surf_coord.Dim()==0){ // Is non-leaf ghost node
            }else if(snode->IsLeaf() && snode->pt_cnt[0]<=Nsrf) continue; // skip: handled in U-list
            in_node.push_back(snode_id);
            scal_idx.push_back(snode->Depth());
            { // set coord_shift
              // Translate source coords to the target frame. scoord is not
              // needed in the sum: upwd_equiv_surf coords are stored
              // centered at the origin — hence the (0+0.25*s) term.
              const int* rel_coord=interac_list.RelativeCoord(type,j);
              const Real_t* scoord=snode->Coord();
              const Real_t* tcoord=tnode->Coord();
              Real_t shift[COORD_DIM];
              shift[0]=rel_coord[0]*0.25*s-(0+0.25*s)+(tcoord[0]+0.5*s);
              shift[1]=rel_coord[1]*0.25*s-(0+0.25*s)+(tcoord[1]+0.5*s);
              shift[2]=rel_coord[2]*0.25*s-(0+0.25*s)+(tcoord[2]+0.5*s);
              coord_shift.push_back(shift[0]);
              coord_shift.push_back(shift[1]);
              coord_shift.push_back(shift[2]);
            }
            interac_cnt_++;
          }
        }
        interac_cnt.push_back(interac_cnt_);
      }
    }
    { // Combine interac data
      // Concatenate the per-thread vectors into the flat interac_data arrays
      // (prefix-sum of sizes gives each thread's destination offset).
      InteracData& interac_data=data.interac_data;
      { // in_node
        typedef size_t ElemType;
        std::vector<std::vector<ElemType> >& vec_=in_node_;
        pvfmm::Vector<ElemType>& vec=interac_data.in_node;
        std::vector<size_t> vec_dsp(omp_p+1,0);
        for(size_t tid=0;tid<omp_p;tid++){
          vec_dsp[tid+1]=vec_dsp[tid]+vec_[tid].size();
        }
        vec.ReInit(vec_dsp[omp_p]);
        #pragma omp parallel for
        for(size_t tid=0;tid<omp_p;tid++){
          memcpy(&vec[0]+vec_dsp[tid],&vec_[tid][0],vec_[tid].size()*sizeof(ElemType));
        }
      }
      { // scal_idx
        typedef size_t ElemType;
        std::vector<std::vector<ElemType> >& vec_=scal_idx_;
        pvfmm::Vector<ElemType>& vec=interac_data.scal_idx;
        std::vector<size_t> vec_dsp(omp_p+1,0);
        for(size_t tid=0;tid<omp_p;tid++){
          vec_dsp[tid+1]=vec_dsp[tid]+vec_[tid].size();
        }
        vec.ReInit(vec_dsp[omp_p]);
        #pragma omp parallel for
        for(size_t tid=0;tid<omp_p;tid++){
          memcpy(&vec[0]+vec_dsp[tid],&vec_[tid][0],vec_[tid].size()*sizeof(ElemType));
        }
      }
      { // coord_shift
        typedef Real_t ElemType;
        std::vector<std::vector<ElemType> >& vec_=coord_shift_;
        pvfmm::Vector<ElemType>& vec=interac_data.coord_shift;
        std::vector<size_t> vec_dsp(omp_p+1,0);
        for(size_t tid=0;tid<omp_p;tid++){
          vec_dsp[tid+1]=vec_dsp[tid]+vec_[tid].size();
        }
        vec.ReInit(vec_dsp[omp_p]);
        #pragma omp parallel for
        for(size_t tid=0;tid<omp_p;tid++){
          memcpy(&vec[0]+vec_dsp[tid],&vec_[tid][0],vec_[tid].size()*sizeof(ElemType));
        }
      }
      { // interac_cnt
        typedef size_t ElemType;
        std::vector<std::vector<ElemType> >& vec_=interac_cnt_;
        pvfmm::Vector<ElemType>& vec=interac_data.interac_cnt;
        std::vector<size_t> vec_dsp(omp_p+1,0);
        for(size_t tid=0;tid<omp_p;tid++){
          vec_dsp[tid+1]=vec_dsp[tid]+vec_[tid].size();
        }
        vec.ReInit(vec_dsp[omp_p]);
        #pragma omp parallel for
        for(size_t tid=0;tid<omp_p;tid++){
          memcpy(&vec[0]+vec_dsp[tid],&vec_[tid][0],vec_[tid].size()*sizeof(ElemType));
        }
      }
      { // interac_dsp
        // Exclusive scan: interac_dsp[i] = sum of interac_cnt[0..i-1].
        pvfmm::Vector<size_t>& cnt=interac_data.interac_cnt;
        pvfmm::Vector<size_t>& dsp=interac_data.interac_dsp;
        dsp.ReInit(cnt.Dim()); if(dsp.Dim()) dsp[0]=0;
        omp_par::scan(&cnt[0],&dsp[0],dsp.Dim());
      }
    }
  }
  PtSetup(setup_data, &data);
}
  4287. template <class FMMNode>
  4288. void FMM_Pts<FMMNode>::W_List (SetupData<Real_t>& setup_data, bool device){
  4289. if(!this->MultipoleOrder()) return;
  4290. //Add W_List contribution.
  4291. this->EvalListPts(setup_data, device);
  4292. }
  4293. template <class FMMNode>
  4294. void FMM_Pts<FMMNode>::U_ListSetup(SetupData<Real_t>& setup_data, FMMTree_t* tree, std::vector<Matrix<Real_t> >& buff, std::vector<Vector<FMMNode_t*> >& n_list, int level, bool device){
  4295. { // Set setup_data
  4296. setup_data. level=level;
  4297. setup_data.kernel=kernel->k_s2t;
  4298. setup_data. input_data=&buff[4];
  4299. setup_data.output_data=&buff[5];
  4300. setup_data. coord_data=&buff[6];
  4301. Vector<FMMNode_t*>& nodes_in =n_list[4];
  4302. Vector<FMMNode_t*>& nodes_out=n_list[5];
  4303. setup_data.nodes_in .clear();
  4304. setup_data.nodes_out.clear();
  4305. for(size_t i=0;i<nodes_in .Dim();i++) if((level==0 || level==-1) && (nodes_in [i]->src_coord.Dim() || nodes_in [i]->surf_coord.Dim()) && nodes_in [i]->IsLeaf() ) setup_data.nodes_in .push_back(nodes_in [i]);
  4306. for(size_t i=0;i<nodes_out.Dim();i++) if((level==0 || level==-1) && (nodes_out[i]->trg_coord.Dim() ) && nodes_out[i]->IsLeaf() && !nodes_out[i]->IsGhost()) setup_data.nodes_out.push_back(nodes_out[i]);
  4307. }
  4308. struct PackedData{
  4309. size_t len;
  4310. Matrix<Real_t>* ptr;
  4311. Vector<size_t> cnt;
  4312. Vector<size_t> dsp;
  4313. };
  4314. struct InteracData{
  4315. Vector<size_t> in_node;
  4316. Vector<size_t> scal_idx;
  4317. Vector<Real_t> coord_shift;
  4318. Vector<size_t> interac_cnt;
  4319. Vector<size_t> interac_dsp;
  4320. Vector<size_t> interac_cst;
  4321. Vector<Real_t> scal[4*MAX_DEPTH];
  4322. Matrix<Real_t> M[4];
  4323. };
  4324. struct ptSetupData{
  4325. int level;
  4326. const Kernel<Real_t>* kernel;
  4327. PackedData src_coord; // Src coord
  4328. PackedData src_value; // Src density
  4329. PackedData srf_coord; // Srf coord
  4330. PackedData srf_value; // Srf density
  4331. PackedData trg_coord; // Trg coord
  4332. PackedData trg_value; // Trg potential
  4333. InteracData interac_data;
  4334. };
  4335. ptSetupData data;
  4336. data. level=setup_data. level;
  4337. data.kernel=setup_data.kernel;
  4338. std::vector<void*>& nodes_in =setup_data.nodes_in ;
  4339. std::vector<void*>& nodes_out=setup_data.nodes_out;
  4340. { // Set src data
  4341. std::vector<void*>& nodes=nodes_in;
  4342. PackedData& coord=data.src_coord;
  4343. PackedData& value=data.src_value;
  4344. coord.ptr=setup_data. coord_data;
  4345. value.ptr=setup_data. input_data;
  4346. coord.len=coord.ptr->Dim(0)*coord.ptr->Dim(1);
  4347. value.len=value.ptr->Dim(0)*value.ptr->Dim(1);
  4348. coord.cnt.ReInit(nodes.size());
  4349. coord.dsp.ReInit(nodes.size());
  4350. value.cnt.ReInit(nodes.size());
  4351. value.dsp.ReInit(nodes.size());
  4352. #pragma omp parallel for
  4353. for(size_t i=0;i<nodes.size();i++){
  4354. ((FMMNode_t*)nodes[i])->node_id=i;
  4355. Vector<Real_t>& coord_vec=((FMMNode_t*)nodes[i])->src_coord;
  4356. Vector<Real_t>& value_vec=((FMMNode_t*)nodes[i])->src_value;
  4357. if(coord_vec.Dim()){
  4358. coord.dsp[i]=&coord_vec[0]-coord.ptr[0][0];
  4359. assert(coord.dsp[i]<coord.len);
  4360. coord.cnt[i]=coord_vec.Dim();
  4361. }else{
  4362. coord.dsp[i]=0;
  4363. coord.cnt[i]=0;
  4364. }
  4365. if(value_vec.Dim()){
  4366. value.dsp[i]=&value_vec[0]-value.ptr[0][0];
  4367. assert(value.dsp[i]<value.len);
  4368. value.cnt[i]=value_vec.Dim();
  4369. }else{
  4370. value.dsp[i]=0;
  4371. value.cnt[i]=0;
  4372. }
  4373. }
  4374. }
  4375. { // Set srf data
  4376. std::vector<void*>& nodes=nodes_in;
  4377. PackedData& coord=data.srf_coord;
  4378. PackedData& value=data.srf_value;
  4379. coord.ptr=setup_data. coord_data;
  4380. value.ptr=setup_data. input_data;
  4381. coord.len=coord.ptr->Dim(0)*coord.ptr->Dim(1);
  4382. value.len=value.ptr->Dim(0)*value.ptr->Dim(1);
  4383. coord.cnt.ReInit(nodes.size());
  4384. coord.dsp.ReInit(nodes.size());
  4385. value.cnt.ReInit(nodes.size());
  4386. value.dsp.ReInit(nodes.size());
  4387. #pragma omp parallel for
  4388. for(size_t i=0;i<nodes.size();i++){
  4389. Vector<Real_t>& coord_vec=((FMMNode_t*)nodes[i])->surf_coord;
  4390. Vector<Real_t>& value_vec=((FMMNode_t*)nodes[i])->surf_value;
  4391. if(coord_vec.Dim()){
  4392. coord.dsp[i]=&coord_vec[0]-coord.ptr[0][0];
  4393. assert(coord.dsp[i]<coord.len);
  4394. coord.cnt[i]=coord_vec.Dim();
  4395. }else{
  4396. coord.dsp[i]=0;
  4397. coord.cnt[i]=0;
  4398. }
  4399. if(value_vec.Dim()){
  4400. value.dsp[i]=&value_vec[0]-value.ptr[0][0];
  4401. assert(value.dsp[i]<value.len);
  4402. value.cnt[i]=value_vec.Dim();
  4403. }else{
  4404. value.dsp[i]=0;
  4405. value.cnt[i]=0;
  4406. }
  4407. }
  4408. }
  4409. { // Set trg data
  4410. std::vector<void*>& nodes=nodes_out;
  4411. PackedData& coord=data.trg_coord;
  4412. PackedData& value=data.trg_value;
  4413. coord.ptr=setup_data. coord_data;
  4414. value.ptr=setup_data.output_data;
  4415. coord.len=coord.ptr->Dim(0)*coord.ptr->Dim(1);
  4416. value.len=value.ptr->Dim(0)*value.ptr->Dim(1);
  4417. coord.cnt.ReInit(nodes.size());
  4418. coord.dsp.ReInit(nodes.size());
  4419. value.cnt.ReInit(nodes.size());
  4420. value.dsp.ReInit(nodes.size());
  4421. #pragma omp parallel for
  4422. for(size_t i=0;i<nodes.size();i++){
  4423. Vector<Real_t>& coord_vec=((FMMNode_t*)nodes[i])->trg_coord;
  4424. Vector<Real_t>& value_vec=((FMMNode_t*)nodes[i])->trg_value;
  4425. if(coord_vec.Dim()){
  4426. coord.dsp[i]=&coord_vec[0]-coord.ptr[0][0];
  4427. assert(coord.dsp[i]<coord.len);
  4428. coord.cnt[i]=coord_vec.Dim();
  4429. }else{
  4430. coord.dsp[i]=0;
  4431. coord.cnt[i]=0;
  4432. }
  4433. if(value_vec.Dim()){
  4434. value.dsp[i]=&value_vec[0]-value.ptr[0][0];
  4435. assert(value.dsp[i]<value.len);
  4436. value.cnt[i]=value_vec.Dim();
  4437. }else{
  4438. value.dsp[i]=0;
  4439. value.cnt[i]=0;
  4440. }
  4441. }
  4442. }
  4443. { // Set interac_data
  4444. int omp_p=omp_get_max_threads();
  4445. std::vector<std::vector<size_t> > in_node_(omp_p);
  4446. std::vector<std::vector<size_t> > scal_idx_(omp_p);
  4447. std::vector<std::vector<Real_t> > coord_shift_(omp_p);
  4448. std::vector<std::vector<size_t> > interac_cnt_(omp_p);
  4449. size_t m=this->MultipoleOrder();
  4450. size_t Nsrf=(6*(m-1)*(m-1)+2);
  4451. #pragma omp parallel for
  4452. for(size_t tid=0;tid<omp_p;tid++){
  4453. std::vector<size_t>& in_node =in_node_[tid] ;
  4454. std::vector<size_t>& scal_idx =scal_idx_[tid] ;
  4455. std::vector<Real_t>& coord_shift=coord_shift_[tid];
  4456. std::vector<size_t>& interac_cnt=interac_cnt_[tid] ;
  4457. size_t a=(nodes_out.size()*(tid+0))/omp_p;
  4458. size_t b=(nodes_out.size()*(tid+1))/omp_p;
  4459. for(size_t i=a;i<b;i++){
  4460. FMMNode_t* tnode=(FMMNode_t*)nodes_out[i];
  4461. Real_t s=pvfmm::pow<Real_t>(0.5,tnode->Depth());
  4462. size_t interac_cnt_=0;
  4463. { // U0_Type
  4464. Mat_Type type=U0_Type;
  4465. Vector<FMMNode_t*>& intlst=tnode->interac_list[type];
  4466. for(size_t j=0;j<intlst.Dim();j++) if(intlst[j]){
  4467. FMMNode_t* snode=intlst[j];
  4468. size_t snode_id=snode->node_id;
  4469. if(snode_id>=nodes_in.size() || nodes_in[snode_id]!=snode) continue;
  4470. in_node.push_back(snode_id);
  4471. scal_idx.push_back(snode->Depth());
  4472. { // set coord_shift
  4473. const int* rel_coord=interac_list.RelativeCoord(type,j);
  4474. const Real_t* scoord=snode->Coord();
  4475. const Real_t* tcoord=tnode->Coord();
  4476. Real_t shift[COORD_DIM];
  4477. shift[0]=rel_coord[0]*0.5*s-(scoord[0]+1.0*s)+(tcoord[0]+0.5*s);
  4478. shift[1]=rel_coord[1]*0.5*s-(scoord[1]+1.0*s)+(tcoord[1]+0.5*s);
  4479. shift[2]=rel_coord[2]*0.5*s-(scoord[2]+1.0*s)+(tcoord[2]+0.5*s);
  4480. coord_shift.push_back(shift[0]);
  4481. coord_shift.push_back(shift[1]);
  4482. coord_shift.push_back(shift[2]);
  4483. }
  4484. interac_cnt_++;
  4485. }
  4486. }
  4487. { // U1_Type
  4488. Mat_Type type=U1_Type;
  4489. Vector<FMMNode_t*>& intlst=tnode->interac_list[type];
  4490. for(size_t j=0;j<intlst.Dim();j++) if(intlst[j]){
  4491. FMMNode_t* snode=intlst[j];
  4492. size_t snode_id=snode->node_id;
  4493. if(snode_id>=nodes_in.size() || nodes_in[snode_id]!=snode) continue;
  4494. in_node.push_back(snode_id);
  4495. scal_idx.push_back(snode->Depth());
  4496. { // set coord_shift
  4497. const int* rel_coord=interac_list.RelativeCoord(type,j);
  4498. const Real_t* scoord=snode->Coord();
  4499. const Real_t* tcoord=tnode->Coord();
  4500. Real_t shift[COORD_DIM];
  4501. shift[0]=rel_coord[0]*1.0*s-(scoord[0]+0.5*s)+(tcoord[0]+0.5*s);
  4502. shift[1]=rel_coord[1]*1.0*s-(scoord[1]+0.5*s)+(tcoord[1]+0.5*s);
  4503. shift[2]=rel_coord[2]*1.0*s-(scoord[2]+0.5*s)+(tcoord[2]+0.5*s);
  4504. coord_shift.push_back(shift[0]);
  4505. coord_shift.push_back(shift[1]);
  4506. coord_shift.push_back(shift[2]);
  4507. }
  4508. interac_cnt_++;
  4509. }
  4510. }
  4511. { // U2_Type
  4512. Mat_Type type=U2_Type;
  4513. Vector<FMMNode_t*>& intlst=tnode->interac_list[type];
  4514. for(size_t j=0;j<intlst.Dim();j++) if(intlst[j]){
  4515. FMMNode_t* snode=intlst[j];
  4516. size_t snode_id=snode->node_id;
  4517. if(snode_id>=nodes_in.size() || nodes_in[snode_id]!=snode) continue;
  4518. in_node.push_back(snode_id);
  4519. scal_idx.push_back(snode->Depth());
  4520. { // set coord_shift
  4521. const int* rel_coord=interac_list.RelativeCoord(type,j);
  4522. const Real_t* scoord=snode->Coord();
  4523. const Real_t* tcoord=tnode->Coord();
  4524. Real_t shift[COORD_DIM];
  4525. shift[0]=rel_coord[0]*0.25*s-(scoord[0]+0.25*s)+(tcoord[0]+0.5*s);
  4526. shift[1]=rel_coord[1]*0.25*s-(scoord[1]+0.25*s)+(tcoord[1]+0.5*s);
  4527. shift[2]=rel_coord[2]*0.25*s-(scoord[2]+0.25*s)+(tcoord[2]+0.5*s);
  4528. coord_shift.push_back(shift[0]);
  4529. coord_shift.push_back(shift[1]);
  4530. coord_shift.push_back(shift[2]);
  4531. }
  4532. interac_cnt_++;
  4533. }
  4534. }
  4535. { // X_Type
  4536. Mat_Type type=X_Type;
  4537. Vector<FMMNode_t*>& intlst=tnode->interac_list[type];
  4538. if(tnode->pt_cnt[1]<=Nsrf)
  4539. for(size_t j=0;j<intlst.Dim();j++) if(intlst[j]){
  4540. FMMNode_t* snode=intlst[j];
  4541. size_t snode_id=snode->node_id;
  4542. if(snode_id>=nodes_in.size() || nodes_in[snode_id]!=snode) continue;
  4543. in_node.push_back(snode_id);
  4544. scal_idx.push_back(snode->Depth());
  4545. { // set coord_shift
  4546. const int* rel_coord=interac_list.RelativeCoord(type,j);
  4547. const Real_t* scoord=snode->Coord();
  4548. const Real_t* tcoord=tnode->Coord();
  4549. Real_t shift[COORD_DIM];
  4550. shift[0]=rel_coord[0]*0.5*s-(scoord[0]+1.0*s)+(tcoord[0]+0.5*s);
  4551. shift[1]=rel_coord[1]*0.5*s-(scoord[1]+1.0*s)+(tcoord[1]+0.5*s);
  4552. shift[2]=rel_coord[2]*0.5*s-(scoord[2]+1.0*s)+(tcoord[2]+0.5*s);
  4553. coord_shift.push_back(shift[0]);
  4554. coord_shift.push_back(shift[1]);
  4555. coord_shift.push_back(shift[2]);
  4556. }
  4557. interac_cnt_++;
  4558. }
  4559. }
  4560. { // W_Type
  4561. Mat_Type type=W_Type;
  4562. Vector<FMMNode_t*>& intlst=tnode->interac_list[type];
  4563. for(size_t j=0;j<intlst.Dim();j++) if(intlst[j]){
  4564. FMMNode_t* snode=intlst[j];
  4565. size_t snode_id=snode->node_id;
  4566. if(snode_id>=nodes_in.size() || nodes_in[snode_id]!=snode) continue;
  4567. if(snode->IsGhost() && snode->src_coord.Dim()+snode->surf_coord.Dim()==0) continue; // Is non-leaf ghost node
  4568. if(snode->pt_cnt[0]> Nsrf) continue;
  4569. in_node.push_back(snode_id);
  4570. scal_idx.push_back(snode->Depth());
  4571. { // set coord_shift
  4572. const int* rel_coord=interac_list.RelativeCoord(type,j);
  4573. const Real_t* scoord=snode->Coord();
  4574. const Real_t* tcoord=tnode->Coord();
  4575. Real_t shift[COORD_DIM];
  4576. shift[0]=rel_coord[0]*0.25*s-(scoord[0]+0.25*s)+(tcoord[0]+0.5*s);
  4577. shift[1]=rel_coord[1]*0.25*s-(scoord[1]+0.25*s)+(tcoord[1]+0.5*s);
  4578. shift[2]=rel_coord[2]*0.25*s-(scoord[2]+0.25*s)+(tcoord[2]+0.5*s);
  4579. coord_shift.push_back(shift[0]);
  4580. coord_shift.push_back(shift[1]);
  4581. coord_shift.push_back(shift[2]);
  4582. }
  4583. interac_cnt_++;
  4584. }
  4585. }
  4586. interac_cnt.push_back(interac_cnt_);
  4587. }
  4588. }
  4589. { // Combine interac data
  4590. InteracData& interac_data=data.interac_data;
  4591. { // in_node
  4592. typedef size_t ElemType;
  4593. std::vector<std::vector<ElemType> >& vec_=in_node_;
  4594. pvfmm::Vector<ElemType>& vec=interac_data.in_node;
  4595. std::vector<size_t> vec_dsp(omp_p+1,0);
  4596. for(size_t tid=0;tid<omp_p;tid++){
  4597. vec_dsp[tid+1]=vec_dsp[tid]+vec_[tid].size();
  4598. }
  4599. vec.ReInit(vec_dsp[omp_p]);
  4600. #pragma omp parallel for
  4601. for(size_t tid=0;tid<omp_p;tid++){
  4602. memcpy(&vec[0]+vec_dsp[tid],&vec_[tid][0],vec_[tid].size()*sizeof(ElemType));
  4603. }
  4604. }
  4605. { // scal_idx
  4606. typedef size_t ElemType;
  4607. std::vector<std::vector<ElemType> >& vec_=scal_idx_;
  4608. pvfmm::Vector<ElemType>& vec=interac_data.scal_idx;
  4609. std::vector<size_t> vec_dsp(omp_p+1,0);
  4610. for(size_t tid=0;tid<omp_p;tid++){
  4611. vec_dsp[tid+1]=vec_dsp[tid]+vec_[tid].size();
  4612. }
  4613. vec.ReInit(vec_dsp[omp_p]);
  4614. #pragma omp parallel for
  4615. for(size_t tid=0;tid<omp_p;tid++){
  4616. memcpy(&vec[0]+vec_dsp[tid],&vec_[tid][0],vec_[tid].size()*sizeof(ElemType));
  4617. }
  4618. }
  4619. { // coord_shift
  4620. typedef Real_t ElemType;
  4621. std::vector<std::vector<ElemType> >& vec_=coord_shift_;
  4622. pvfmm::Vector<ElemType>& vec=interac_data.coord_shift;
  4623. std::vector<size_t> vec_dsp(omp_p+1,0);
  4624. for(size_t tid=0;tid<omp_p;tid++){
  4625. vec_dsp[tid+1]=vec_dsp[tid]+vec_[tid].size();
  4626. }
  4627. vec.ReInit(vec_dsp[omp_p]);
  4628. #pragma omp parallel for
  4629. for(size_t tid=0;tid<omp_p;tid++){
  4630. memcpy(&vec[0]+vec_dsp[tid],&vec_[tid][0],vec_[tid].size()*sizeof(ElemType));
  4631. }
  4632. }
  4633. { // interac_cnt
  4634. typedef size_t ElemType;
  4635. std::vector<std::vector<ElemType> >& vec_=interac_cnt_;
  4636. pvfmm::Vector<ElemType>& vec=interac_data.interac_cnt;
  4637. std::vector<size_t> vec_dsp(omp_p+1,0);
  4638. for(size_t tid=0;tid<omp_p;tid++){
  4639. vec_dsp[tid+1]=vec_dsp[tid]+vec_[tid].size();
  4640. }
  4641. vec.ReInit(vec_dsp[omp_p]);
  4642. #pragma omp parallel for
  4643. for(size_t tid=0;tid<omp_p;tid++){
  4644. memcpy(&vec[0]+vec_dsp[tid],&vec_[tid][0],vec_[tid].size()*sizeof(ElemType));
  4645. }
  4646. }
  4647. { // interac_dsp
  4648. pvfmm::Vector<size_t>& cnt=interac_data.interac_cnt;
  4649. pvfmm::Vector<size_t>& dsp=interac_data.interac_dsp;
  4650. dsp.ReInit(cnt.Dim()); if(dsp.Dim()) dsp[0]=0;
  4651. omp_par::scan(&cnt[0],&dsp[0],dsp.Dim());
  4652. }
  4653. }
  4654. }
  4655. PtSetup(setup_data, &data);
  4656. }
/// Evaluate the U-list (near-field direct) interactions using the interaction
/// data prepared earlier in setup_data; `device` selects the evaluation path
/// passed through to EvalListPts.
template <class FMMNode>
void FMM_Pts<FMMNode>::U_List (SetupData<Real_t>& setup_data, bool device){
//Add U_List contribution.
this->EvalListPts(setup_data, device);
}
  4662. template <class FMMNode>
  4663. void FMM_Pts<FMMNode>::Down2TargetSetup(SetupData<Real_t>& setup_data, FMMTree_t* tree, std::vector<Matrix<Real_t> >& buff, std::vector<Vector<FMMNode_t*> >& n_list, int level, bool device){
  4664. if(!this->MultipoleOrder()) return;
  4665. { // Set setup_data
  4666. setup_data. level=level;
  4667. setup_data.kernel=kernel->k_l2t;
  4668. setup_data. input_data=&buff[1];
  4669. setup_data.output_data=&buff[5];
  4670. setup_data. coord_data=&buff[6];
  4671. Vector<FMMNode_t*>& nodes_in =n_list[1];
  4672. Vector<FMMNode_t*>& nodes_out=n_list[5];
  4673. setup_data.nodes_in .clear();
  4674. setup_data.nodes_out.clear();
  4675. for(size_t i=0;i<nodes_in .Dim();i++) if((nodes_in [i]->Depth()==level || level==-1) && nodes_in [i]->trg_coord.Dim() && nodes_in [i]->IsLeaf() && !nodes_in [i]->IsGhost()) setup_data.nodes_in .push_back(nodes_in [i]);
  4676. for(size_t i=0;i<nodes_out.Dim();i++) if((nodes_out[i]->Depth()==level || level==-1) && nodes_out[i]->trg_coord.Dim() && nodes_out[i]->IsLeaf() && !nodes_out[i]->IsGhost()) setup_data.nodes_out.push_back(nodes_out[i]);
  4677. }
  4678. struct PackedData{
  4679. size_t len;
  4680. Matrix<Real_t>* ptr;
  4681. Vector<size_t> cnt;
  4682. Vector<size_t> dsp;
  4683. };
  4684. struct InteracData{
  4685. Vector<size_t> in_node;
  4686. Vector<size_t> scal_idx;
  4687. Vector<Real_t> coord_shift;
  4688. Vector<size_t> interac_cnt;
  4689. Vector<size_t> interac_dsp;
  4690. Vector<size_t> interac_cst;
  4691. Vector<Real_t> scal[4*MAX_DEPTH];
  4692. Matrix<Real_t> M[4];
  4693. };
  4694. struct ptSetupData{
  4695. int level;
  4696. const Kernel<Real_t>* kernel;
  4697. PackedData src_coord; // Src coord
  4698. PackedData src_value; // Src density
  4699. PackedData srf_coord; // Srf coord
  4700. PackedData srf_value; // Srf density
  4701. PackedData trg_coord; // Trg coord
  4702. PackedData trg_value; // Trg potential
  4703. InteracData interac_data;
  4704. };
  4705. ptSetupData data;
  4706. data. level=setup_data. level;
  4707. data.kernel=setup_data.kernel;
  4708. std::vector<void*>& nodes_in =setup_data.nodes_in ;
  4709. std::vector<void*>& nodes_out=setup_data.nodes_out;
  4710. { // Set src data
  4711. std::vector<void*>& nodes=nodes_in;
  4712. PackedData& coord=data.src_coord;
  4713. PackedData& value=data.src_value;
  4714. coord.ptr=setup_data. coord_data;
  4715. value.ptr=setup_data. input_data;
  4716. coord.len=coord.ptr->Dim(0)*coord.ptr->Dim(1);
  4717. value.len=value.ptr->Dim(0)*value.ptr->Dim(1);
  4718. coord.cnt.ReInit(nodes.size());
  4719. coord.dsp.ReInit(nodes.size());
  4720. value.cnt.ReInit(nodes.size());
  4721. value.dsp.ReInit(nodes.size());
  4722. #pragma omp parallel for
  4723. for(size_t i=0;i<nodes.size();i++){
  4724. ((FMMNode_t*)nodes[i])->node_id=i;
  4725. Vector<Real_t>& coord_vec=tree->dnwd_equiv_surf[((FMMNode*)nodes[i])->Depth()];
  4726. Vector<Real_t>& value_vec=((FMMData*)((FMMNode*)nodes[i])->FMMData())->dnward_equiv;
  4727. if(coord_vec.Dim()){
  4728. coord.dsp[i]=&coord_vec[0]-coord.ptr[0][0];
  4729. assert(coord.dsp[i]<coord.len);
  4730. coord.cnt[i]=coord_vec.Dim();
  4731. }else{
  4732. coord.dsp[i]=0;
  4733. coord.cnt[i]=0;
  4734. }
  4735. if(value_vec.Dim()){
  4736. value.dsp[i]=&value_vec[0]-value.ptr[0][0];
  4737. assert(value.dsp[i]<value.len);
  4738. value.cnt[i]=value_vec.Dim();
  4739. }else{
  4740. value.dsp[i]=0;
  4741. value.cnt[i]=0;
  4742. }
  4743. }
  4744. }
  4745. { // Set srf data
  4746. std::vector<void*>& nodes=nodes_in;
  4747. PackedData& coord=data.srf_coord;
  4748. PackedData& value=data.srf_value;
  4749. coord.ptr=setup_data. coord_data;
  4750. value.ptr=setup_data. input_data;
  4751. coord.len=coord.ptr->Dim(0)*coord.ptr->Dim(1);
  4752. value.len=value.ptr->Dim(0)*value.ptr->Dim(1);
  4753. coord.cnt.ReInit(nodes.size());
  4754. coord.dsp.ReInit(nodes.size());
  4755. value.cnt.ReInit(nodes.size());
  4756. value.dsp.ReInit(nodes.size());
  4757. #pragma omp parallel for
  4758. for(size_t i=0;i<nodes.size();i++){
  4759. coord.dsp[i]=0;
  4760. coord.cnt[i]=0;
  4761. value.dsp[i]=0;
  4762. value.cnt[i]=0;
  4763. }
  4764. }
  4765. { // Set trg data
  4766. std::vector<void*>& nodes=nodes_out;
  4767. PackedData& coord=data.trg_coord;
  4768. PackedData& value=data.trg_value;
  4769. coord.ptr=setup_data. coord_data;
  4770. value.ptr=setup_data.output_data;
  4771. coord.len=coord.ptr->Dim(0)*coord.ptr->Dim(1);
  4772. value.len=value.ptr->Dim(0)*value.ptr->Dim(1);
  4773. coord.cnt.ReInit(nodes.size());
  4774. coord.dsp.ReInit(nodes.size());
  4775. value.cnt.ReInit(nodes.size());
  4776. value.dsp.ReInit(nodes.size());
  4777. #pragma omp parallel for
  4778. for(size_t i=0;i<nodes.size();i++){
  4779. Vector<Real_t>& coord_vec=((FMMNode_t*)nodes[i])->trg_coord;
  4780. Vector<Real_t>& value_vec=((FMMNode_t*)nodes[i])->trg_value;
  4781. if(coord_vec.Dim()){
  4782. coord.dsp[i]=&coord_vec[0]-coord.ptr[0][0];
  4783. assert(coord.dsp[i]<coord.len);
  4784. coord.cnt[i]=coord_vec.Dim();
  4785. }else{
  4786. coord.dsp[i]=0;
  4787. coord.cnt[i]=0;
  4788. }
  4789. if(value_vec.Dim()){
  4790. value.dsp[i]=&value_vec[0]-value.ptr[0][0];
  4791. assert(value.dsp[i]<value.len);
  4792. value.cnt[i]=value_vec.Dim();
  4793. }else{
  4794. value.dsp[i]=0;
  4795. value.cnt[i]=0;
  4796. }
  4797. }
  4798. }
  4799. { // Set interac_data
  4800. int omp_p=omp_get_max_threads();
  4801. std::vector<std::vector<size_t> > in_node_(omp_p);
  4802. std::vector<std::vector<size_t> > scal_idx_(omp_p);
  4803. std::vector<std::vector<Real_t> > coord_shift_(omp_p);
  4804. std::vector<std::vector<size_t> > interac_cnt_(omp_p);
  4805. if(this->ScaleInvar()){ // Set scal
  4806. const Kernel<Real_t>* ker=kernel->k_l2l;
  4807. for(size_t l=0;l<MAX_DEPTH;l++){ // scal[l*4+0]
  4808. Vector<Real_t>& scal=data.interac_data.scal[l*4+0];
  4809. Vector<Real_t>& scal_exp=ker->trg_scal;
  4810. scal.ReInit(scal_exp.Dim());
  4811. for(size_t i=0;i<scal.Dim();i++){
  4812. scal[i]=pvfmm::pow<Real_t>(2.0,-scal_exp[i]*l);
  4813. }
  4814. }
  4815. for(size_t l=0;l<MAX_DEPTH;l++){ // scal[l*4+1]
  4816. Vector<Real_t>& scal=data.interac_data.scal[l*4+1];
  4817. Vector<Real_t>& scal_exp=ker->src_scal;
  4818. scal.ReInit(scal_exp.Dim());
  4819. for(size_t i=0;i<scal.Dim();i++){
  4820. scal[i]=pvfmm::pow<Real_t>(2.0,-scal_exp[i]*l);
  4821. }
  4822. }
  4823. }
  4824. #pragma omp parallel for
  4825. for(size_t tid=0;tid<omp_p;tid++){
  4826. std::vector<size_t>& in_node =in_node_[tid] ;
  4827. std::vector<size_t>& scal_idx =scal_idx_[tid] ;
  4828. std::vector<Real_t>& coord_shift=coord_shift_[tid];
  4829. std::vector<size_t>& interac_cnt=interac_cnt_[tid];
  4830. size_t a=(nodes_out.size()*(tid+0))/omp_p;
  4831. size_t b=(nodes_out.size()*(tid+1))/omp_p;
  4832. for(size_t i=a;i<b;i++){
  4833. FMMNode_t* tnode=(FMMNode_t*)nodes_out[i];
  4834. Real_t s=pvfmm::pow<Real_t>(0.5,tnode->Depth());
  4835. size_t interac_cnt_=0;
  4836. { // D2T_Type
  4837. Mat_Type type=D2T_Type;
  4838. Vector<FMMNode_t*>& intlst=tnode->interac_list[type];
  4839. for(size_t j=0;j<intlst.Dim();j++) if(intlst[j]){
  4840. FMMNode_t* snode=intlst[j];
  4841. size_t snode_id=snode->node_id;
  4842. if(snode_id>=nodes_in.size() || nodes_in[snode_id]!=snode) continue;
  4843. in_node.push_back(snode_id);
  4844. scal_idx.push_back(snode->Depth());
  4845. { // set coord_shift
  4846. const int* rel_coord=interac_list.RelativeCoord(type,j);
  4847. const Real_t* scoord=snode->Coord();
  4848. const Real_t* tcoord=tnode->Coord();
  4849. Real_t shift[COORD_DIM];
  4850. shift[0]=rel_coord[0]*0.5*s-(0+0.5*s)+(tcoord[0]+0.5*s);
  4851. shift[1]=rel_coord[1]*0.5*s-(0+0.5*s)+(tcoord[1]+0.5*s);
  4852. shift[2]=rel_coord[2]*0.5*s-(0+0.5*s)+(tcoord[2]+0.5*s);
  4853. coord_shift.push_back(shift[0]);
  4854. coord_shift.push_back(shift[1]);
  4855. coord_shift.push_back(shift[2]);
  4856. }
  4857. interac_cnt_++;
  4858. }
  4859. }
  4860. interac_cnt.push_back(interac_cnt_);
  4861. }
  4862. }
  4863. { // Combine interac data
  4864. InteracData& interac_data=data.interac_data;
  4865. { // in_node
  4866. typedef size_t ElemType;
  4867. std::vector<std::vector<ElemType> >& vec_=in_node_;
  4868. pvfmm::Vector<ElemType>& vec=interac_data.in_node;
  4869. std::vector<size_t> vec_dsp(omp_p+1,0);
  4870. for(size_t tid=0;tid<omp_p;tid++){
  4871. vec_dsp[tid+1]=vec_dsp[tid]+vec_[tid].size();
  4872. }
  4873. vec.ReInit(vec_dsp[omp_p]);
  4874. #pragma omp parallel for
  4875. for(size_t tid=0;tid<omp_p;tid++){
  4876. memcpy(&vec[0]+vec_dsp[tid],&vec_[tid][0],vec_[tid].size()*sizeof(ElemType));
  4877. }
  4878. }
  4879. { // scal_idx
  4880. typedef size_t ElemType;
  4881. std::vector<std::vector<ElemType> >& vec_=scal_idx_;
  4882. pvfmm::Vector<ElemType>& vec=interac_data.scal_idx;
  4883. std::vector<size_t> vec_dsp(omp_p+1,0);
  4884. for(size_t tid=0;tid<omp_p;tid++){
  4885. vec_dsp[tid+1]=vec_dsp[tid]+vec_[tid].size();
  4886. }
  4887. vec.ReInit(vec_dsp[omp_p]);
  4888. #pragma omp parallel for
  4889. for(size_t tid=0;tid<omp_p;tid++){
  4890. memcpy(&vec[0]+vec_dsp[tid],&vec_[tid][0],vec_[tid].size()*sizeof(ElemType));
  4891. }
  4892. }
  4893. { // coord_shift
  4894. typedef Real_t ElemType;
  4895. std::vector<std::vector<ElemType> >& vec_=coord_shift_;
  4896. pvfmm::Vector<ElemType>& vec=interac_data.coord_shift;
  4897. std::vector<size_t> vec_dsp(omp_p+1,0);
  4898. for(size_t tid=0;tid<omp_p;tid++){
  4899. vec_dsp[tid+1]=vec_dsp[tid]+vec_[tid].size();
  4900. }
  4901. vec.ReInit(vec_dsp[omp_p]);
  4902. #pragma omp parallel for
  4903. for(size_t tid=0;tid<omp_p;tid++){
  4904. memcpy(&vec[0]+vec_dsp[tid],&vec_[tid][0],vec_[tid].size()*sizeof(ElemType));
  4905. }
  4906. }
  4907. { // interac_cnt
  4908. typedef size_t ElemType;
  4909. std::vector<std::vector<ElemType> >& vec_=interac_cnt_;
  4910. pvfmm::Vector<ElemType>& vec=interac_data.interac_cnt;
  4911. std::vector<size_t> vec_dsp(omp_p+1,0);
  4912. for(size_t tid=0;tid<omp_p;tid++){
  4913. vec_dsp[tid+1]=vec_dsp[tid]+vec_[tid].size();
  4914. }
  4915. vec.ReInit(vec_dsp[omp_p]);
  4916. #pragma omp parallel for
  4917. for(size_t tid=0;tid<omp_p;tid++){
  4918. memcpy(&vec[0]+vec_dsp[tid],&vec_[tid][0],vec_[tid].size()*sizeof(ElemType));
  4919. }
  4920. }
  4921. { // interac_dsp
  4922. pvfmm::Vector<size_t>& cnt=interac_data.interac_cnt;
  4923. pvfmm::Vector<size_t>& dsp=interac_data.interac_dsp;
  4924. dsp.ReInit(cnt.Dim()); if(dsp.Dim()) dsp[0]=0;
  4925. omp_par::scan(&cnt[0],&dsp[0],dsp.Dim());
  4926. }
  4927. }
  4928. { // Set M[0], M[1]
  4929. InteracData& interac_data=data.interac_data;
  4930. pvfmm::Vector<size_t>& cnt=interac_data.interac_cnt;
  4931. pvfmm::Vector<size_t>& dsp=interac_data.interac_dsp;
  4932. if(cnt.Dim() && cnt[cnt.Dim()-1]+dsp[dsp.Dim()-1]){
  4933. data.interac_data.M[0]=this->mat->Mat(level, DC2DE0_Type, 0);
  4934. data.interac_data.M[1]=this->mat->Mat(level, DC2DE1_Type, 0);
  4935. }else{
  4936. data.interac_data.M[0].ReInit(0,0);
  4937. data.interac_data.M[1].ReInit(0,0);
  4938. }
  4939. }
  4940. }
  4941. PtSetup(setup_data, &data);
  4942. }
/// Evaluate the Down2Target (L2T) interactions prepared by Down2TargetSetup.
/// No-op when the multipole order is zero (no equivalent surfaces exist).
template <class FMMNode>
void FMM_Pts<FMMNode>::Down2Target(SetupData<Real_t>& setup_data, bool device){
if(!this->MultipoleOrder()) return;
//Add Down2Target contribution.
this->EvalListPts(setup_data, device);
}
  4949. template <class FMMNode>
  4950. void FMM_Pts<FMMNode>::PostProcessing(FMMTree_t* tree, std::vector<FMMNode_t*>& nodes, BoundaryType bndry){
  4951. if(kernel->k_m2l->vol_poten && bndry==Periodic){ // Add analytical near-field to target potential
  4952. const Kernel<Real_t>& k_m2t=*kernel->k_m2t;
  4953. int ker_dim[2]={k_m2t.ker_dim[0],k_m2t.ker_dim[1]};
  4954. Vector<Real_t>& up_equiv=((FMMData*)tree->RootNode()->FMMData())->upward_equiv;
  4955. Matrix<Real_t> avg_density(1,ker_dim[0]); avg_density.SetZero();
  4956. for(size_t i0=0;i0<up_equiv.Dim();i0+=ker_dim[0]){
  4957. for(size_t i1=0;i1<ker_dim[0];i1++){
  4958. avg_density[0][i1]+=up_equiv[i0+i1];
  4959. }
  4960. }
  4961. int omp_p=omp_get_max_threads();
  4962. std::vector<Matrix<Real_t> > M_tmp(omp_p);
  4963. #pragma omp parallel for
  4964. for(size_t i=0;i<nodes.size();i++)
  4965. if(nodes[i]->IsLeaf() && !nodes[i]->IsGhost()){
  4966. Vector<Real_t>& trg_coord=nodes[i]->trg_coord;
  4967. Vector<Real_t>& trg_value=nodes[i]->trg_value;
  4968. size_t n_trg=trg_coord.Dim()/COORD_DIM;
  4969. Matrix<Real_t>& M_vol=M_tmp[omp_get_thread_num()];
  4970. M_vol.ReInit(ker_dim[0],n_trg*ker_dim[1]); M_vol.SetZero();
  4971. k_m2t.vol_poten(&trg_coord[0],n_trg,&M_vol[0][0]);
  4972. Matrix<Real_t> M_trg(1,n_trg*ker_dim[1],&trg_value[0],false);
  4973. M_trg-=avg_density*M_vol;
  4974. }
  4975. }
  4976. }
/// Intentionally empty in this implementation: FMM_Pts performs no separate
/// output copy-back step for the given nodes.
template <class FMMNode>
void FMM_Pts<FMMNode>::CopyOutput(FMMNode** nodes, size_t n){
}
  4980. }//end namespace