fmm_pts.txx 214 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327
73278327932803281328232833284328532863287328832893290329132923293329432953296329732983299330033013302330333043305330633073308330933103311331233133314331533163317331833193320332133223323332433253326332733283329333033313332333333343335333633373338333933403341334233433344334533463347334833493350335133523353335433553356335733583359336033613362336333643365336633673368336933703371337233733374337533763377337833793380338133823383338433853386338733883389339033913392339333943395339633973398339934003401340234033404340534063407340834093410341134123413341434153416341734183419342034213422342334243425342634273428342934303431343234333434343534363437343834393440344134423443344434453446344734483449345034513452345334543455345634573458345934603461346234633464346534663467346834693470347134723473347434753476347734783479348034813482348334843485348634873488348934903491349234933494349534963497349834993500350135023503350435053506350735083509351035113512351335143515351635173518351935203521352235233524352535263527352835293530353135323533353435353536353735383539354035413542354335443545354635473548354935503551355235533554355535563557355835593560356135623563356435653566356735683569357035713572357335743575357635773578357935803581358235833584358535863587358835893590359135923593359435953596359735983599360036013602360336043605360636073608360936103611361236133614361536163617361836193620362136223623362436253626362736283629363036313632363336343635363636373638363936403641364236433644364536463647364836493650365136523653365436553656365736583659366036613662366336643665366636673668366936703671367236733674367536763677367836793680368136823683368436853686368736883689369036913692369336943695369636973698369937003701370237033704370537063707370837093710371137123713371437153716371737183719372037213722372337243725372637273728372937303731373237333734373537363737373837393740374137423743374437453746374737483749375037513752375337543755375637573758375937603761376237633764376537663767376837693770377137723773377437753776377
73778377937803781378237833784378537863787378837893790379137923793379437953796379737983799380038013802380338043805380638073808380938103811381238133814381538163817381838193820382138223823382438253826382738283829383038313832383338343835383638373838383938403841384238433844384538463847384838493850385138523853385438553856385738583859386038613862386338643865386638673868386938703871387238733874387538763877387838793880388138823883388438853886388738883889389038913892389338943895389638973898389939003901390239033904390539063907390839093910391139123913391439153916391739183919392039213922392339243925392639273928392939303931393239333934393539363937393839393940394139423943394439453946394739483949395039513952395339543955395639573958395939603961396239633964396539663967396839693970397139723973397439753976397739783979398039813982398339843985398639873988398939903991399239933994399539963997399839994000400140024003400440054006400740084009401040114012401340144015401640174018401940204021402240234024402540264027402840294030403140324033403440354036403740384039404040414042404340444045404640474048404940504051405240534054405540564057405840594060406140624063406440654066406740684069407040714072407340744075407640774078407940804081408240834084408540864087408840894090409140924093409440954096409740984099410041014102410341044105410641074108410941104111411241134114411541164117411841194120412141224123412441254126412741284129413041314132413341344135413641374138413941404141414241434144414541464147414841494150415141524153415441554156415741584159416041614162416341644165416641674168416941704171417241734174417541764177417841794180418141824183418441854186418741884189419041914192419341944195419641974198419942004201420242034204420542064207420842094210421142124213421442154216421742184219422042214222422342244225422642274228422942304231423242334234423542364237423842394240424142424243424442454246424742484249425042514252425342544255425642574258425942604261426242634264426542664267426842694270427142724273427442754276427
74278427942804281428242834284428542864287428842894290429142924293429442954296429742984299430043014302430343044305430643074308430943104311431243134314431543164317431843194320432143224323432443254326432743284329433043314332433343344335433643374338433943404341434243434344434543464347434843494350435143524353435443554356435743584359436043614362436343644365436643674368436943704371437243734374437543764377437843794380438143824383438443854386438743884389439043914392439343944395439643974398439944004401440244034404440544064407440844094410441144124413441444154416441744184419442044214422442344244425442644274428442944304431443244334434443544364437443844394440444144424443444444454446444744484449445044514452445344544455445644574458445944604461446244634464446544664467446844694470447144724473447444754476447744784479448044814482448344844485448644874488448944904491449244934494449544964497449844994500450145024503450445054506450745084509451045114512451345144515451645174518451945204521452245234524452545264527452845294530453145324533453445354536453745384539454045414542454345444545454645474548454945504551455245534554455545564557455845594560456145624563456445654566456745684569457045714572457345744575457645774578457945804581458245834584458545864587458845894590459145924593459445954596459745984599460046014602460346044605460646074608460946104611461246134614461546164617461846194620462146224623462446254626462746284629463046314632463346344635463646374638463946404641464246434644464546464647464846494650465146524653465446554656465746584659466046614662466346644665466646674668466946704671467246734674467546764677467846794680468146824683468446854686468746884689469046914692469346944695469646974698469947004701470247034704470547064707470847094710471147124713471447154716471747184719472047214722472347244725472647274728472947304731473247334734473547364737473847394740474147424743474447454746474747484749475047514752475347544755475647574758475947604761476247634764476547664767476847694770477147724773477447754776477
74778477947804781478247834784478547864787478847894790479147924793479447954796479747984799480048014802480348044805480648074808480948104811481248134814481548164817481848194820482148224823482448254826482748284829483048314832483348344835483648374838483948404841484248434844484548464847484848494850485148524853485448554856485748584859486048614862486348644865486648674868486948704871487248734874487548764877487848794880488148824883488448854886488748884889489048914892489348944895489648974898489949004901490249034904490549064907490849094910491149124913491449154916491749184919492049214922492349244925492649274928492949304931493249334934493549364937493849394940494149424943494449454946494749484949495049514952495349544955495649574958495949604961496249634964496549664967496849694970497149724973497449754976497749784979498049814982498349844985498649874988498949904991499249934994499549964997499849995000500150025003500450055006500750085009501050115012501350145015501650175018501950205021502250235024502550265027502850295030503150325033503450355036503750385039504050415042504350445045504650475048504950505051505250535054505550565057505850595060506150625063506450655066506750685069507050715072507350745075507650775078507950805081508250835084508550865087508850895090509150925093509450955096509750985099510051015102510351045105510651075108510951105111511251135114511551165117511851195120512151225123512451255126512751285129513051315132513351345135513651375138513951405141514251435144514551465147514851495150515151525153515451555156515751585159516051615162516351645165516651675168516951705171517251735174517551765177517851795180518151825183518451855186518751885189519051915192519351945195519651975198519952005201520252035204520552065207520852095210521152125213521452155216521752185219522052215222522352245225522652275228522952305231523252335234523552365237523852395240524152425243524452455246524752485249525052515252525352545255525652575258525952605261526252635264526552665267526852695270527152725273527452755276527
75278527952805281528252835284528552865287528852895290529152925293529452955296529752985299530053015302530353045305530653075308530953105311531253135314531553165317531853195320532153225323532453255326532753285329533053315332533353345335533653375338533953405341534253435344534553465347534853495350535153525353535453555356535753585359536053615362536353645365536653675368536953705371537253735374537553765377537853795380538153825383538453855386538753885389539053915392539353945395539653975398539954005401540254035404540554065407540854095410541154125413541454155416541754185419542054215422542354245425542654275428542954305431543254335434543554365437543854395440544154425443544454455446544754485449545054515452545354545455545654575458545954605461546254635464546554665467546854695470547154725473547454755476547754785479548054815482548354845485548654875488548954905491549254935494549554965497549854995500550155025503550455055506550755085509551055115512551355145515551655175518551955205521552255235524552555265527552855295530553155325533553455355536553755385539554055415542554355445545554655475548554955505551555255535554555555565557555855595560556155625563556455655566556755685569557055715572557355745575557655775578557955805581
  1. /**
  2. * \file fmm_pts.txx
  3. * \author Dhairya Malhotra, dhairya.malhotra@gmail.com
  4. * \date 3-07-2011
  5. * \brief This file contains the implementation of the FMM_Pts class.
  6. */
  7. #include <omp.h>
  8. #include <cmath>
  9. #include <cstdlib>
  10. #include <cassert>
  11. #include <sstream>
  12. #include <iostream>
  13. #include <stdint.h>
  14. #include <set>
  15. #ifdef PVFMM_HAVE_SYS_STAT_H
  16. #include <sys/stat.h>
  17. #endif
  18. #ifdef __SSE__
  19. #include <xmmintrin.h>
  20. #endif
  21. #ifdef __SSE2__
  22. #include <emmintrin.h>
  23. #endif
  24. #ifdef __SSE3__
  25. #include <pmmintrin.h>
  26. #endif
  27. #ifdef __AVX__
  28. #include <immintrin.h>
  29. #endif
  30. #if defined(__MIC__)
  31. #include <immintrin.h>
  32. #endif
  33. #include <profile.hpp>
  34. #include <cheb_utils.hpp>
  35. namespace pvfmm{
  36. /**
  37. * \brief Returns the coordinates of points on the surface of a cube.
  38. * \param[in] p Number of points on an edge of the cube is (n+1)
  39. * \param[in] c Coordinates to the centre of the cube (3D array).
  40. * \param[in] alpha Scaling factor for the size of the cube.
  41. * \param[in] depth Depth of the cube in the octree.
  42. * \return Vector with coordinates of points on the surface of the cube in the
  43. * format [x0 y0 z0 x1 y1 z1 .... ].
  44. */
  45. template <class Real_t>
  46. std::vector<Real_t> surface(int p, Real_t* c, Real_t alpha, int depth){
  47. size_t n_=(6*(p-1)*(p-1)+2); //Total number of points.
  48. std::vector<Real_t> coord(n_*3);
  49. coord[0]=coord[1]=coord[2]=-1.0;
  50. size_t cnt=1;
  51. for(int i=0;i<p-1;i++)
  52. for(int j=0;j<p-1;j++){
  53. coord[cnt*3 ]=-1.0;
  54. coord[cnt*3+1]=(2.0*(i+1)-p+1)/(p-1);
  55. coord[cnt*3+2]=(2.0*j-p+1)/(p-1);
  56. cnt++;
  57. }
  58. for(int i=0;i<p-1;i++)
  59. for(int j=0;j<p-1;j++){
  60. coord[cnt*3 ]=(2.0*i-p+1)/(p-1);
  61. coord[cnt*3+1]=-1.0;
  62. coord[cnt*3+2]=(2.0*(j+1)-p+1)/(p-1);
  63. cnt++;
  64. }
  65. for(int i=0;i<p-1;i++)
  66. for(int j=0;j<p-1;j++){
  67. coord[cnt*3 ]=(2.0*(i+1)-p+1)/(p-1);
  68. coord[cnt*3+1]=(2.0*j-p+1)/(p-1);
  69. coord[cnt*3+2]=-1.0;
  70. cnt++;
  71. }
  72. for(size_t i=0;i<(n_/2)*3;i++)
  73. coord[cnt*3+i]=-coord[i];
  74. Real_t r = 0.5*pvfmm::pow<Real_t>(0.5,depth);
  75. Real_t b = alpha*r;
  76. for(size_t i=0;i<n_;i++){
  77. coord[i*3+0]=(coord[i*3+0]+1.0)*b+c[0];
  78. coord[i*3+1]=(coord[i*3+1]+1.0)*b+c[1];
  79. coord[i*3+2]=(coord[i*3+2]+1.0)*b+c[2];
  80. }
  81. return coord;
  82. }
  83. /**
  84. * \brief Returns the coordinates of points on the upward check surface of cube.
  85. * \see surface()
  86. */
  87. template <class Real_t>
  88. std::vector<Real_t> u_check_surf(int p, Real_t* c, int depth){
  89. Real_t r=0.5*pvfmm::pow<Real_t>(0.5,depth);
  90. Real_t coord[3]={(Real_t)(c[0]-r*(RAD1-1.0)),(Real_t)(c[1]-r*(RAD1-1.0)),(Real_t)(c[2]-r*(RAD1-1.0))};
  91. return surface(p,coord,(Real_t)RAD1,depth);
  92. }
  93. /**
  94. * \brief Returns the coordinates of points on the upward equivalent surface of cube.
  95. * \see surface()
  96. */
  97. template <class Real_t>
  98. std::vector<Real_t> u_equiv_surf(int p, Real_t* c, int depth){
  99. Real_t r=0.5*pvfmm::pow<Real_t>(0.5,depth);
  100. Real_t coord[3]={(Real_t)(c[0]-r*(RAD0-1.0)),(Real_t)(c[1]-r*(RAD0-1.0)),(Real_t)(c[2]-r*(RAD0-1.0))};
  101. return surface(p,coord,(Real_t)RAD0,depth);
  102. }
  103. /**
  104. * \brief Returns the coordinates of points on the downward check surface of cube.
  105. * \see surface()
  106. */
  107. template <class Real_t>
  108. std::vector<Real_t> d_check_surf(int p, Real_t* c, int depth){
  109. Real_t r=0.5*pvfmm::pow<Real_t>(0.5,depth);
  110. Real_t coord[3]={(Real_t)(c[0]-r*(RAD0-1.0)),(Real_t)(c[1]-r*(RAD0-1.0)),(Real_t)(c[2]-r*(RAD0-1.0))};
  111. return surface(p,coord,(Real_t)RAD0,depth);
  112. }
  113. /**
  114. * \brief Returns the coordinates of points on the downward equivalent surface of cube.
  115. * \see surface()
  116. */
  117. template <class Real_t>
  118. std::vector<Real_t> d_equiv_surf(int p, Real_t* c, int depth){
  119. Real_t r=0.5*pvfmm::pow<Real_t>(0.5,depth);
  120. Real_t coord[3]={(Real_t)(c[0]-r*(RAD1-1.0)),(Real_t)(c[1]-r*(RAD1-1.0)),(Real_t)(c[2]-r*(RAD1-1.0))};
  121. return surface(p,coord,(Real_t)RAD1,depth);
  122. }
  123. /**
  124. * \brief Defines the 3D grid for convolution in FFT acceleration of V-list.
  125. * \see surface()
  126. */
  127. template <class Real_t>
  128. std::vector<Real_t> conv_grid(int p, Real_t* c, int depth){
  129. Real_t r=pvfmm::pow<Real_t>(0.5,depth);
  130. Real_t a=r*RAD0;
  131. Real_t coord[3]={c[0],c[1],c[2]};
  132. int n1=p*2;
  133. int n2=pvfmm::pow<int>((Real_t)n1,2);
  134. int n3=pvfmm::pow<int>((Real_t)n1,3);
  135. std::vector<Real_t> grid(n3*3);
  136. for(int i=0;i<n1;i++)
  137. for(int j=0;j<n1;j++)
  138. for(int k=0;k<n1;k++){
  139. grid[(i+n1*j+n2*k)*3+0]=(i-p)*a/(p-1)+coord[0];
  140. grid[(i+n1*j+n2*k)*3+1]=(j-p)*a/(p-1)+coord[1];
  141. grid[(i+n1*j+n2*k)*3+2]=(k-p)*a/(p-1)+coord[2];
  142. }
  143. return grid;
  144. }
/// \brief Releases the stored upward-equivalent (multipole) expansion data.
template <class Real_t>
void FMM_Data<Real_t>::Clear(){
  upward_equiv.Resize(0);
}
  149. template <class Real_t>
  150. PackedData FMM_Data<Real_t>::PackMultipole(void* buff_ptr){
  151. PackedData p0; p0.data=buff_ptr;
  152. p0.length=upward_equiv.Dim()*sizeof(Real_t);
  153. if(p0.length==0) return p0;
  154. if(p0.data==NULL) p0.data=(char*)&upward_equiv[0];
  155. else mem::memcopy(p0.data,&upward_equiv[0],p0.length);
  156. return p0;
  157. }
  158. template <class Real_t>
  159. void FMM_Data<Real_t>::AddMultipole(PackedData p0){
  160. Real_t* data=(Real_t*)p0.data;
  161. size_t n=p0.length/sizeof(Real_t);
  162. assert(upward_equiv.Dim()==n);
  163. Matrix<Real_t> v0(1,n,&upward_equiv[0],false);
  164. Matrix<Real_t> v1(1,n,data,false);
  165. v0+=v1;
  166. }
  167. template <class Real_t>
  168. void FMM_Data<Real_t>::InitMultipole(PackedData p0, bool own_data){
  169. Real_t* data=(Real_t*)p0.data;
  170. size_t n=p0.length/sizeof(Real_t);
  171. if(n==0) return;
  172. if(own_data){
  173. upward_equiv=Vector<Real_t>(n, &data[0], false);
  174. }else{
  175. upward_equiv.ReInit(n, &data[0], false);
  176. }
  177. }
/// \brief Destructor: frees the precomputed-matrix store and destroys all
/// FFTW plans that were created (V-list precompute, forward and inverse
/// V-list transforms).
template <class FMMNode>
FMM_Pts<FMMNode>::~FMM_Pts() {
  // Free the precomputed-matrix store, if it was ever created.
  if(mat!=NULL){
    // int rank;
    // MPI_Comm_rank(comm,&rank);
    // if(rank==0) mat->Save2File("Precomp.data");
    delete mat;
    mat=NULL;
  }
  // NOTE(review): vprecomp_fft_flag is not reset to false after the plan is
  // destroyed — harmless in a destructor, but inconsistent with the vlist
  // flags below.
  if(vprecomp_fft_flag) FFTW_t<Real_t>::fft_destroy_plan(vprecomp_fftplan);
#ifdef __INTEL_OFFLOAD0
#pragma offload target(mic:0)
#endif
  { // Destroy V-list FFT/IFFT plans; with offload enabled this block runs on
    // the MIC device where the plans were created.
    if(vlist_fft_flag ) FFTW_t<Real_t>::fft_destroy_plan(vlist_fftplan );
    if(vlist_ifft_flag) FFTW_t<Real_t>::fft_destroy_plan(vlist_ifftplan);
    vlist_fft_flag =false;
    vlist_ifft_flag=false;
  }
}
/// \brief Initializes the FMM translation operators: resolves the path of the
/// precomputed-matrix file, loads it (or schedules it to be saved after
/// precomputation), and precomputes all operator matrices (UC2UE, DC2DE, BC,
/// U2U, D2D, V, V1).
/// \param[in] mult_order Multipole expansion order.
/// \param[in] comm_ MPI communicator used for profiling and file I/O rank checks.
/// \param[in] kernel_ Kernel function object; must be non-NULL.
template <class FMMNode>
void FMM_Pts<FMMNode>::Initialize(int mult_order, const MPI_Comm& comm_, const Kernel<Real_t>* kernel_){
  Profile::Tic("InitFMM_Pts",&comm_,true);{
  int rank;
  MPI_Comm_rank(comm_,&rank);
  bool verbose=false;
  #ifndef NDEBUG
  #ifdef __VERBOSE__
  if(!rank) verbose=true; // only rank 0 reports, and only in verbose debug builds
  #endif
  #endif
  if(kernel_) kernel_->Initialize(verbose);
  multipole_order=mult_order;
  comm=comm_;
  kernel=kernel_;
  assert(kernel!=NULL);
  bool save_precomp=false;
  mat=new PrecompMat<Real_t>(ScaleInvar());
  // Build a default matrix-file name when the caller did not set one:
  // <PVFMM_PRECOMP_DATA_PATH or $PVFMM_DIR>/Precomp_<kernel>_m<order>[_f|_t<N>].data
  if(this->mat_fname.size()==0){// && !this->ScaleInvar()){
  std::stringstream st;
  st<<PVFMM_PRECOMP_DATA_PATH;
  if(!st.str().size()){ // look in PVFMM_DIR
  char* pvfmm_dir = getenv ("PVFMM_DIR");
  if(pvfmm_dir) st<<pvfmm_dir;
  }
  #ifndef STAT_MACROS_BROKEN
  if(st.str().size()){ // check if the path is a directory
  struct stat stat_buff;
  if(stat(st.str().c_str(), &stat_buff) || !S_ISDIR(stat_buff.st_mode)){
  std::cout<<"error: path not found: "<<st.str()<<'\n';
  // NOTE(review): this is an error path but exits with status 0 (success);
  // exit(1) (or throwing) would be more appropriate — confirm before changing.
  exit(0);
  }
  }
  #endif
  if(st.str().size()) st<<'/';
  st<<"Precomp_"<<kernel->ker_name.c_str()<<"_m"<<mult_order;
  // Encode the floating-point precision in the file name: none for double,
  // "_f" for float, "_t<size>" otherwise.
  if(sizeof(Real_t)==8) st<<"";
  else if(sizeof(Real_t)==4) st<<"_f";
  else st<<"_t"<<sizeof(Real_t);
  st<<".data";
  this->mat_fname=st.str();
  save_precomp=true; // auto-named file: save results if it does not exist yet
  }
  this->mat->LoadFile(mat_fname.c_str(), this->comm);
  interac_list.Initialize(COORD_DIM, this->mat);
  // Precompute each operator family; matrices already present in the loaded
  // file are reused by PrecompAll.
  Profile::Tic("PrecompUC2UE",&comm,false,4);
  this->PrecompAll(UC2UE0_Type);
  this->PrecompAll(UC2UE1_Type);
  Profile::Toc();
  Profile::Tic("PrecompDC2DE",&comm,false,4);
  this->PrecompAll(DC2DE0_Type);
  this->PrecompAll(DC2DE1_Type);
  Profile::Toc();
  Profile::Tic("PrecompBC",&comm,false,4);
  { /*
  int type=BC_Type;
  for(int l=0;l<MAX_DEPTH;l++)
  for(size_t indx=0;indx<this->interac_list.ListCount((Mat_Type)type);indx++){
  Matrix<Real_t>& M=this->mat->Mat(l, (Mat_Type)type, indx);
  M.Resize(0,0);
  } // */
  }
  this->PrecompAll(BC_Type,0);
  Profile::Toc();
  Profile::Tic("PrecompU2U",&comm,false,4);
  this->PrecompAll(U2U_Type);
  Profile::Toc();
  Profile::Tic("PrecompD2D",&comm,false,4);
  this->PrecompAll(D2D_Type);
  Profile::Toc();
  if(save_precomp){
  Profile::Tic("Save2File",&this->comm,false,4);
  // Only rank 0 writes, and only when the file is absent.
  // NOTE(review): existence is probed with fopen(...,"r") — racy if several
  // jobs start simultaneously, but at worst the file is rewritten.
  if(!rank){
  FILE* f=fopen(this->mat_fname.c_str(),"r");
  if(f==NULL) { //File does not exists.
  this->mat->Save2File(this->mat_fname.c_str());
  }else fclose(f);
  }
  Profile::Toc();
  }
  Profile::Tic("PrecompV",&comm,false,4);
  this->PrecompAll(V_Type);
  Profile::Toc();
  Profile::Tic("PrecompV1",&comm,false,4);
  this->PrecompAll(V1_Type);
  Profile::Toc();
  }Profile::Toc();
}
/**
 * \brief Builds the permutation (and per-entry scaling) that maps equivalent
 * surface data under a symmetry transform of the cube.
 * \param[in] m Surface discretization order (passed as p to d_check_surf).
 * \param[in] p_indx Symmetry type: ReflecX/Y/Z, SwapXY, SwapXZ, or anything
 * else for the identity point-map (e.g. Scaling).
 * \param[in] ker_perm The kernel's permutation/scaling over its dof components.
 * \param[in] scal_exp Optional per-component scaling exponents, applied only
 * when p_indx==Scaling.
 * \return Permutation of size n_trg*dof combining the geometric point map
 * with the kernel's component permutation and scaling.
 */
template <class Real_t>
Permutation<Real_t> equiv_surf_perm(size_t m, size_t p_indx, const Permutation<Real_t>& ker_perm, const Vector<Real_t>* scal_exp=NULL){
  Real_t eps=1e-10; // tolerance for matching transformed surface points
  int dof=ker_perm.Dim();
  Real_t c[3]={-0.5,-0.5,-0.5};
  // Reference surface at depth 0; the point map is found by brute-force
  // O(n_trg^2) matching of transformed coordinates below.
  std::vector<Real_t> trg_coord=d_check_surf(m,c,0);
  int n_trg=trg_coord.size()/3;
  Permutation<Real_t> P=Permutation<Real_t>(n_trg*dof);
  if(p_indx==ReflecX || p_indx==ReflecY || p_indx==ReflecZ){ // Set P.perm
    // Match each point i to the point j whose reflected coordinate equals it.
    for(int i=0;i<n_trg;i++)
    for(int j=0;j<n_trg;j++){
      if(pvfmm::fabs<Real_t>(trg_coord[i*3+0]-trg_coord[j*3+0]*(p_indx==ReflecX?-1.0:1.0))<eps)
      if(pvfmm::fabs<Real_t>(trg_coord[i*3+1]-trg_coord[j*3+1]*(p_indx==ReflecY?-1.0:1.0))<eps)
      if(pvfmm::fabs<Real_t>(trg_coord[i*3+2]-trg_coord[j*3+2]*(p_indx==ReflecZ?-1.0:1.0))<eps){
        for(int k=0;k<dof;k++){
          P.perm[j*dof+k]=i*dof+ker_perm.perm[k];
        }
      }
    }
  }else if(p_indx==SwapXY || p_indx==SwapXZ){
    // Match each point i to the point j with the corresponding axes swapped.
    for(int i=0;i<n_trg;i++)
    for(int j=0;j<n_trg;j++){
      if(pvfmm::fabs<Real_t>(trg_coord[i*3+0]-trg_coord[j*3+(p_indx==SwapXY?1:2)])<eps)
      if(pvfmm::fabs<Real_t>(trg_coord[i*3+1]-trg_coord[j*3+(p_indx==SwapXY?0:1)])<eps)
      if(pvfmm::fabs<Real_t>(trg_coord[i*3+2]-trg_coord[j*3+(p_indx==SwapXY?2:0)])<eps){
        for(int k=0;k<dof;k++){
          P.perm[j*dof+k]=i*dof+ker_perm.perm[k];
        }
      }
    }
  }else{
    // Identity point map: only the kernel's component permutation applies.
    for(int j=0;j<n_trg;j++){
      for(int k=0;k<dof;k++){
        P.perm[j*dof+k]=j*dof+ker_perm.perm[k];
      }
    }
  }
  if(scal_exp && p_indx==Scaling){ // Set level-by-level scaling
    assert(dof==scal_exp->Dim());
    Vector<Real_t> scal(scal_exp->Dim());
    for(size_t i=0;i<scal.Dim();i++){
      scal[i]=pvfmm::pow<Real_t>(2.0,(*scal_exp)[i]); // 2^exponent per component
    }
    for(int j=0;j<n_trg;j++){
      for(int i=0;i<dof;i++){
        P.scal[j*dof+i]*=scal[i];
      }
    }
  }
  { // Set P.scal: fold in the kernel's per-component scaling for every point.
    for(int j=0;j<n_trg;j++){
      for(int i=0;i<dof;i++){
        P.scal[j*dof+i]*=ker_perm.scal[i];
      }
    }
  }
  return P;
}
  344. template <class FMMNode>
  345. Permutation<typename FMMNode::Real_t>& FMM_Pts<FMMNode>::PrecompPerm(Mat_Type type, Perm_Type perm_indx){
  346. //Check if the matrix already exists.
  347. Permutation<Real_t>& P_ = mat->Perm((Mat_Type)type, perm_indx);
  348. if(P_.Dim()!=0) return P_;
  349. size_t m=this->MultipoleOrder();
  350. size_t p_indx=perm_indx % C_Perm;
  351. //Compute the matrix.
  352. Permutation<Real_t> P;
  353. switch (type){
  354. case U2U_Type:
  355. {
  356. Vector<Real_t> scal_exp;
  357. Permutation<Real_t> ker_perm;
  358. if(perm_indx<C_Perm){ // Source permutation
  359. ker_perm=kernel->k_m2m->perm_vec[0 +p_indx];
  360. scal_exp=kernel->k_m2m->src_scal;
  361. }else{ // Target permutation
  362. ker_perm=kernel->k_m2m->perm_vec[0 +p_indx];
  363. scal_exp=kernel->k_m2m->src_scal;
  364. for(size_t i=0;i<scal_exp.Dim();i++) scal_exp[i]=-scal_exp[i];
  365. }
  366. P=equiv_surf_perm(m, p_indx, ker_perm, (this->ScaleInvar()?&scal_exp:NULL));
  367. break;
  368. }
  369. case D2D_Type:
  370. {
  371. Vector<Real_t> scal_exp;
  372. Permutation<Real_t> ker_perm;
  373. if(perm_indx<C_Perm){ // Source permutation
  374. ker_perm=kernel->k_l2l->perm_vec[C_Perm+p_indx];
  375. scal_exp=kernel->k_l2l->trg_scal;
  376. for(size_t i=0;i<scal_exp.Dim();i++) scal_exp[i]=-scal_exp[i];
  377. }else{ // Target permutation
  378. ker_perm=kernel->k_l2l->perm_vec[C_Perm+p_indx];
  379. scal_exp=kernel->k_l2l->trg_scal;
  380. }
  381. P=equiv_surf_perm(m, p_indx, ker_perm, (this->ScaleInvar()?&scal_exp:NULL));
  382. break;
  383. }
  384. default:
  385. break;
  386. }
  387. //Save the matrix for future use.
  388. #pragma omp critical (PRECOMP_MATRIX_PTS)
  389. {
  390. if(P_.Dim()==0) P_=P;
  391. }
  392. return P_;
  393. }
/**
 * \brief Compute (or fetch from the matrix cache) the precomputed translation
 * operator for interaction `type`, relative-position index `mat_indx`, at the
 * given tree `level`.
 *
 * For scale-invariant kernels a single matrix serves every level (level is
 * forced to 0). When the requested entry can be recovered at run time by
 * permuting its symmetry-class representative, the entry is deliberately left
 * empty and the (possibly empty) cached reference is returned.
 */
template <class FMMNode>
Matrix<typename FMMNode::Real_t>& FMM_Pts<FMMNode>::Precomp(int level, Mat_Type type, size_t mat_indx){
if(this->ScaleInvar()) level=0; // scale-invariant kernel: one matrix for all levels
//Check if the matrix already exists.
Matrix<Real_t>& M_ = this->mat->Mat(level, type, mat_indx);
if(M_.Dim(0)!=0 && M_.Dim(1)!=0) return M_;
else{ //Compute matrix from symmetry class (if possible).
size_t class_indx = this->interac_list.InteracClass(type, mat_indx);
if(class_indx!=mat_indx){
// Make sure the class representative itself is available.
Matrix<Real_t>& M0 = this->Precomp(level, type, class_indx);
if(M0.Dim(0)==0 || M0.Dim(1)==0) return M_;
for(size_t i=0;i<Perm_Count;i++) this->PrecompPerm(type, (Perm_Type) i);
Permutation<Real_t>& Pr = this->interac_list.Perm_R(abs(level), type, mat_indx);
Permutation<Real_t>& Pc = this->interac_list.Perm_C(abs(level), type, mat_indx);
// Valid row/column permutations exist: this entry is representable as
// Pr * M0 * Pc, so it is never stored; return the (empty) cache slot.
if(Pr.Dim()>0 && Pc.Dim()>0 && M0.Dim(0)>0 && M0.Dim(1)>0) return M_;
}
}
//Compute the matrix.
Matrix<Real_t> M;
//int omp_p=omp_get_max_threads();
switch (type){
case UC2UE0_Type: // first factor (V*S^+) of the upward check-to-equivalent pseudo-inverse
{
if(MultipoleOrder()==0) break;
const int* ker_dim=kernel->k_m2m->ker_dim;
// Coord of upward check surface
Real_t c[3]={0,0,0};
std::vector<Real_t> uc_coord=u_check_surf(MultipoleOrder(),c,level);
size_t n_uc=uc_coord.size()/3;
// Coord of upward equivalent surface
std::vector<Real_t> ue_coord=u_equiv_surf(MultipoleOrder(),c,level);
size_t n_ue=ue_coord.size()/3;
// Evaluate potential at check surface due to equivalent surface.
Matrix<Real_t> M_e2c(n_ue*ker_dim[0],n_uc*ker_dim[1]);
kernel->k_m2m->BuildMatrix(&ue_coord[0], n_ue,
&uc_coord[0], n_uc, &(M_e2c[0][0]));
Matrix<Real_t> U,S,V;
M_e2c.SVD(U,S,V);
// Compute machine epsilon for Real_t (halve until 1+eps rounds to 1).
Real_t eps=1, max_S=0;
while(eps*(Real_t)0.5+(Real_t)1.0>1.0) eps*=0.5;
for(size_t i=0;i<std::min(S.Dim(0),S.Dim(1));i++){
if(pvfmm::fabs<Real_t>(S[i][i])>max_S) max_S=pvfmm::fabs<Real_t>(S[i][i]);
}
// Truncated pseudo-inverse: drop singular values below 4*eps*max_S.
for(size_t i=0;i<S.Dim(0);i++) S[i][i]=(S[i][i]>eps*max_S*4?1.0/S[i][i]:0.0);
M=V.Transpose()*S;//*U.Transpose();
break;
}
case UC2UE1_Type: // second factor (U^T) of the upward check-to-equivalent pseudo-inverse
{
if(MultipoleOrder()==0) break;
const int* ker_dim=kernel->k_m2m->ker_dim;
// Coord of upward check surface
Real_t c[3]={0,0,0};
std::vector<Real_t> uc_coord=u_check_surf(MultipoleOrder(),c,level);
size_t n_uc=uc_coord.size()/3;
// Coord of upward equivalent surface
std::vector<Real_t> ue_coord=u_equiv_surf(MultipoleOrder(),c,level);
size_t n_ue=ue_coord.size()/3;
// Evaluate potential at check surface due to equivalent surface.
Matrix<Real_t> M_e2c(n_ue*ker_dim[0],n_uc*ker_dim[1]);
kernel->k_m2m->BuildMatrix(&ue_coord[0], n_ue,
&uc_coord[0], n_uc, &(M_e2c[0][0]));
Matrix<Real_t> U,S,V;
M_e2c.SVD(U,S,V);
M=U.Transpose();
break;
}
case DC2DE0_Type: // first factor of the downward check-to-equivalent pseudo-inverse
{
if(MultipoleOrder()==0) break;
const int* ker_dim=kernel->k_l2l->ker_dim;
// Coord of downward check surface
Real_t c[3]={0,0,0};
std::vector<Real_t> check_surf=d_check_surf(MultipoleOrder(),c,level);
size_t n_ch=check_surf.size()/3;
// Coord of downward equivalent surface
std::vector<Real_t> equiv_surf=d_equiv_surf(MultipoleOrder(),c,level);
size_t n_eq=equiv_surf.size()/3;
// Evaluate potential at check surface due to equivalent surface.
Matrix<Real_t> M_e2c(n_eq*ker_dim[0],n_ch*ker_dim[1]);
kernel->k_l2l->BuildMatrix(&equiv_surf[0], n_eq,
&check_surf[0], n_ch, &(M_e2c[0][0]));
Matrix<Real_t> U,S,V;
M_e2c.SVD(U,S,V);
// Machine epsilon, then truncated inversion of singular values (as above).
Real_t eps=1, max_S=0;
while(eps*(Real_t)0.5+(Real_t)1.0>1.0) eps*=0.5;
for(size_t i=0;i<std::min(S.Dim(0),S.Dim(1));i++){
if(pvfmm::fabs<Real_t>(S[i][i])>max_S) max_S=pvfmm::fabs<Real_t>(S[i][i]);
}
for(size_t i=0;i<S.Dim(0);i++) S[i][i]=(S[i][i]>eps*max_S*4?1.0/S[i][i]:0.0);
M=V.Transpose()*S;//*U.Transpose();
break;
}
case DC2DE1_Type: // second factor of the downward check-to-equivalent pseudo-inverse
{
if(MultipoleOrder()==0) break;
const int* ker_dim=kernel->k_l2l->ker_dim;
// Coord of downward check surface
Real_t c[3]={0,0,0};
std::vector<Real_t> check_surf=d_check_surf(MultipoleOrder(),c,level);
size_t n_ch=check_surf.size()/3;
// Coord of downward equivalent surface
std::vector<Real_t> equiv_surf=d_equiv_surf(MultipoleOrder(),c,level);
size_t n_eq=equiv_surf.size()/3;
// Evaluate potential at check surface due to equivalent surface.
Matrix<Real_t> M_e2c(n_eq*ker_dim[0],n_ch*ker_dim[1]);
kernel->k_l2l->BuildMatrix(&equiv_surf[0], n_eq,
&check_surf[0], n_ch, &(M_e2c[0][0]));
Matrix<Real_t> U,S,V;
M_e2c.SVD(U,S,V);
M=U.Transpose();
break;
}
case U2U_Type: // multipole-to-multipole (child equivalent -> parent equivalent)
{
if(MultipoleOrder()==0) break;
const int* ker_dim=kernel->k_m2m->ker_dim;
// Coord of upward check surface
Real_t c[3]={0,0,0};
std::vector<Real_t> check_surf=u_check_surf(MultipoleOrder(),c,level);
size_t n_uc=check_surf.size()/3;
// Coord of child's upward equivalent surface
Real_t s=pvfmm::pow<Real_t>(0.5,(level+2));
int* coord=interac_list.RelativeCoord(type,mat_indx);
Real_t child_coord[3]={(coord[0]+1)*s,(coord[1]+1)*s,(coord[2]+1)*s};
std::vector<Real_t> equiv_surf=u_equiv_surf(MultipoleOrder(),child_coord,level+1);
size_t n_ue=equiv_surf.size()/3;
// Evaluate potential at check surface due to equivalent surface.
Matrix<Real_t> M_ce2c(n_ue*ker_dim[0],n_uc*ker_dim[1]);
kernel->k_m2m->BuildMatrix(&equiv_surf[0], n_ue,
&check_surf[0], n_uc, &(M_ce2c[0][0]));
// Compose with the check-to-equivalent pseudo-inverse factors.
Matrix<Real_t>& M_c2e0 = Precomp(level, UC2UE0_Type, 0);
Matrix<Real_t>& M_c2e1 = Precomp(level, UC2UE1_Type, 0);
M=(M_ce2c*M_c2e0)*M_c2e1;
break;
}
case D2D_Type: // local-to-local (parent equivalent -> child equivalent)
{
if(MultipoleOrder()==0) break;
const int* ker_dim=kernel->k_l2l->ker_dim;
// Coord of downward check surface
Real_t s=pvfmm::pow<Real_t>(0.5,level+1);
int* coord=interac_list.RelativeCoord(type,mat_indx);
Real_t c[3]={(coord[0]+1)*s,(coord[1]+1)*s,(coord[2]+1)*s};
std::vector<Real_t> check_surf=d_check_surf(MultipoleOrder(),c,level);
size_t n_dc=check_surf.size()/3;
// Coord of parent's downward equivalent surface
Real_t parent_coord[3]={0,0,0};
std::vector<Real_t> equiv_surf=d_equiv_surf(MultipoleOrder(),parent_coord,level-1);
size_t n_de=equiv_surf.size()/3;
// Evaluate potential at check surface due to equivalent surface.
Matrix<Real_t> M_pe2c(n_de*ker_dim[0],n_dc*ker_dim[1]);
kernel->k_l2l->BuildMatrix(&equiv_surf[0], n_de,
&check_surf[0], n_dc, &(M_pe2c[0][0]));
// Note: copies (not references) because they are rescaled in place below.
Matrix<Real_t> M_c2e0=Precomp(level-1,DC2DE0_Type,0);
Matrix<Real_t> M_c2e1=Precomp(level-1,DC2DE1_Type,0);
if(ScaleInvar()){ // Scale M_c2e0 for level-1
Permutation<Real_t> ker_perm=this->kernel->k_l2l->perm_vec[C_Perm+Scaling];
Vector<Real_t> scal_exp=this->kernel->k_l2l->trg_scal;
Permutation<Real_t> P=equiv_surf_perm(MultipoleOrder(), Scaling, ker_perm, &scal_exp);
M_c2e0=P*M_c2e0;
}
if(ScaleInvar()){ // Scale M_c2e1 for level-1
Permutation<Real_t> ker_perm=this->kernel->k_l2l->perm_vec[0 +Scaling];
Vector<Real_t> scal_exp=this->kernel->k_l2l->src_scal;
Permutation<Real_t> P=equiv_surf_perm(MultipoleOrder(), Scaling, ker_perm, &scal_exp);
M_c2e1=M_c2e1*P;
}
M=M_c2e0*(M_c2e1*M_pe2c);
break;
}
case D2T_Type: // local expansion -> target points
{
if(MultipoleOrder()==0) break;
const int* ker_dim=kernel->k_l2t->ker_dim;
std::vector<Real_t>& rel_trg_coord=mat->RelativeTrgCoord();
// Coord of target points
Real_t r=pvfmm::pow<Real_t>(0.5,level);
size_t n_trg=rel_trg_coord.size()/3;
std::vector<Real_t> trg_coord(n_trg*3);
for(size_t i=0;i<n_trg*COORD_DIM;i++) trg_coord[i]=rel_trg_coord[i]*r;
// Coord of downward equivalent surface
Real_t c[3]={0,0,0};
std::vector<Real_t> equiv_surf=d_equiv_surf(MultipoleOrder(),c,level);
size_t n_eq=equiv_surf.size()/3;
// Evaluate potential at target points due to equivalent surface.
{
M .Resize(n_eq*ker_dim [0], n_trg*ker_dim [1]);
kernel->k_l2t->BuildMatrix(&equiv_surf[0], n_eq, &trg_coord[0], n_trg, &(M [0][0]));
}
Matrix<Real_t>& M_c2e0=Precomp(level,DC2DE0_Type,0);
Matrix<Real_t>& M_c2e1=Precomp(level,DC2DE1_Type,0);
M=M_c2e0*(M_c2e1*M);
break;
}
case V_Type: // multipole-to-local: FFT of the convolution kernel on the grid
{
if(MultipoleOrder()==0) break;
const int* ker_dim=kernel->k_m2l->ker_dim;
int n1=MultipoleOrder()*2;
int n3 =n1*n1*n1;        // full grid size
int n3_=n1*n1*(n1/2+1);  // r2c half-spectrum size
//Compute the matrix.
Real_t s=pvfmm::pow<Real_t>(0.5,level);
int* coord2=interac_list.RelativeCoord(type,mat_indx);
Real_t coord_diff[3]={coord2[0]*s,coord2[1]*s,coord2[2]*s};
//Evaluate potential.
std::vector<Real_t> r_trg(COORD_DIM,0.0);
std::vector<Real_t> conv_poten(n3*ker_dim[0]*ker_dim[1]);
std::vector<Real_t> conv_coord=conv_grid(MultipoleOrder(),coord_diff,level);
kernel->k_m2l->BuildMatrix(&conv_coord[0],n3,&r_trg[0],1,&conv_poten[0]);
//Rearrange data.
Matrix<Real_t> M_conv(n3,ker_dim[0]*ker_dim[1],&conv_poten[0],false);
M_conv=M_conv.Transpose();
//Compute FFTW plan.
int nnn[3]={n1,n1,n1};
Real_t *fftw_in, *fftw_out;
fftw_in = mem::aligned_new<Real_t>( n3 *ker_dim[0]*ker_dim[1]*sizeof(Real_t));
fftw_out = mem::aligned_new<Real_t>(2*n3_*ker_dim[0]*ker_dim[1]*sizeof(Real_t));
// FFTW planning is not thread-safe; create the shared plan exactly once.
#pragma omp critical (FFTW_PLAN)
{
if (!vprecomp_fft_flag){
vprecomp_fftplan = FFTW_t<Real_t>::fft_plan_many_dft_r2c(COORD_DIM, nnn, ker_dim[0]*ker_dim[1],
(Real_t*)fftw_in, NULL, 1, n3, (typename FFTW_t<Real_t>::cplx*) fftw_out, NULL, 1, n3_);
vprecomp_fft_flag=true;
}
}
//Compute FFT.
mem::memcopy(fftw_in, &conv_poten[0], n3*ker_dim[0]*ker_dim[1]*sizeof(Real_t));
FFTW_t<Real_t>::fft_execute_dft_r2c(vprecomp_fftplan, (Real_t*)fftw_in, (typename FFTW_t<Real_t>::cplx*)(fftw_out));
// Store the complex spectrum as a single real column vector.
Matrix<Real_t> M_(2*n3_*ker_dim[0]*ker_dim[1],1,(Real_t*)fftw_out,false);
M=M_;
//Free memory.
mem::aligned_delete<Real_t>(fftw_in);
mem::aligned_delete<Real_t>(fftw_out);
break;
}
case V1_Type: // blocked V-list operator: 8x8 child-pair interleaving of V_Type spectra
{
if(MultipoleOrder()==0) break;
const int* ker_dim=kernel->k_m2l->ker_dim;
size_t mat_cnt =interac_list.ListCount( V_Type);
for(size_t k=0;k<mat_cnt;k++) Precomp(level, V_Type, k);
const size_t chld_cnt=1UL<<COORD_DIM;
size_t n1=MultipoleOrder()*2;
size_t M_dim=n1*n1*(n1/2+1);
size_t n3=n1*n1*n1;
// Missing child-pair interactions point at this shared zero block.
Vector<Real_t> zero_vec(M_dim*ker_dim[0]*ker_dim[1]*2);
zero_vec.SetZero();
Vector<Real_t*> M_ptr(chld_cnt*chld_cnt);
for(size_t i=0;i<chld_cnt*chld_cnt;i++) M_ptr[i]=&zero_vec[0];
int* rel_coord_=interac_list.RelativeCoord(V1_Type, mat_indx);
for(int j1=0;j1<chld_cnt;j1++)
for(int j2=0;j2<chld_cnt;j2++){
// Relative coordinate between child j2 of the source and child j1 of the target.
int rel_coord[3]={rel_coord_[0]*2-(j1/1)%2+(j2/1)%2,
rel_coord_[1]*2-(j1/2)%2+(j2/2)%2,
rel_coord_[2]*2-(j1/4)%2+(j2/4)%2};
for(size_t k=0;k<mat_cnt;k++){
int* ref_coord=interac_list.RelativeCoord(V_Type, k);
if(ref_coord[0]==rel_coord[0] &&
ref_coord[1]==rel_coord[1] &&
ref_coord[2]==rel_coord[2]){
Matrix<Real_t>& M = this->mat->Mat(level, V_Type, k);
M_ptr[j2*chld_cnt+j1]=&M[0][0];
break;
}
}
}
// Build matrix ker_dim0 x ker_dim1 x M_dim x 8 x 8
// (division by n3 applies the inverse-FFT normalization).
M.Resize(ker_dim[0]*ker_dim[1]*M_dim, 2*chld_cnt*chld_cnt);
for(int j=0;j<ker_dim[0]*ker_dim[1]*M_dim;j++){
for(size_t k=0;k<chld_cnt*chld_cnt;k++){
M[j][k*2+0]=M_ptr[k][j*2+0]/n3;
M[j][k*2+1]=M_ptr[k][j*2+1]/n3;
}
}
break;
}
case W_Type: // multipole expansion -> target points (W-list)
{
if(MultipoleOrder()==0) break;
const int* ker_dim=kernel->k_m2t->ker_dim;
std::vector<Real_t>& rel_trg_coord=mat->RelativeTrgCoord();
// Coord of target points
Real_t s=pvfmm::pow<Real_t>(0.5,level);
size_t n_trg=rel_trg_coord.size()/3;
std::vector<Real_t> trg_coord(n_trg*3);
for(size_t j=0;j<n_trg*COORD_DIM;j++) trg_coord[j]=rel_trg_coord[j]*s;
// Coord of downward equivalent surface
int* coord2=interac_list.RelativeCoord(type,mat_indx);
Real_t c[3]={(Real_t)((coord2[0]+1)*s*0.25),(Real_t)((coord2[1]+1)*s*0.25),(Real_t)((coord2[2]+1)*s*0.25)};
std::vector<Real_t> equiv_surf=u_equiv_surf(MultipoleOrder(),c,level+1);
size_t n_eq=equiv_surf.size()/3;
// Evaluate potential at target points due to equivalent surface.
{
M .Resize(n_eq*ker_dim [0],n_trg*ker_dim [1]);
kernel->k_m2t->BuildMatrix(&equiv_surf[0], n_eq, &trg_coord[0], n_trg, &(M [0][0]));
}
break;
}
case BC_Type: // boundary-condition operator: contribution of far periodic images,
              // built by recursing BC_LEVELS levels of coarser (negative) levels
{
// Only defined for scale-invariant kernels with compatible M2M/M2L/L2L dims.
if(!this->ScaleInvar() || MultipoleOrder()==0) break;
if(kernel->k_m2l->ker_dim[0]!=kernel->k_m2m->ker_dim[0]) break;
if(kernel->k_m2l->ker_dim[1]!=kernel->k_l2l->ker_dim[1]) break;
int ker_dim[2]={kernel->k_m2l->ker_dim[0],kernel->k_m2l->ker_dim[1]};
size_t mat_cnt_m2m=interac_list.ListCount(U2U_Type);
size_t n_surf=(6*(MultipoleOrder()-1)*(MultipoleOrder()-1)+2); //Total number of points.
if((M.Dim(0)!=n_surf*ker_dim[0] || M.Dim(1)!=n_surf*ker_dim[1]) && level==0){
if(BC_LEVELS==0){ // Set M=0 and break;
M.ReInit(n_surf*ker_dim[0],n_surf*ker_dim[1]);
M.SetZero();
break;
}
Matrix<Real_t> M_m2m[BC_LEVELS+1];
Matrix<Real_t> M_m2l[BC_LEVELS+1];
Matrix<Real_t> M_l2l[BC_LEVELS+1];
Matrix<Real_t> M_equiv_zero_avg(n_surf*ker_dim[0],n_surf*ker_dim[0]);
Matrix<Real_t> M_check_zero_avg(n_surf*ker_dim[1],n_surf*ker_dim[1]);
{ // Set average multipole charge to zero (projection for non-zero total source density)
Matrix<Real_t> M_s2c;
{ // Compute M_s2c
int ker_dim[2]={kernel->k_m2m->ker_dim[0],kernel->k_m2m->ker_dim[1]};
M_s2c.ReInit(ker_dim[0],n_surf*ker_dim[1]);
std::vector<Real_t> uc_coord;
{ // Coord of upward check surface
Real_t c[3]={0,0,0};
uc_coord=u_check_surf(MultipoleOrder(),c,0);
}
#pragma omp parallel for schedule(dynamic)
for(size_t i=0;i<n_surf;i++){
std::vector<Real_t> M_=cheb_integ<Real_t>(0, &uc_coord[i*3], 1.0, *kernel->k_m2m);
for(size_t j=0; j<ker_dim[0]; j++)
for(int k=0; k<ker_dim[1]; k++)
M_s2c[j][i*ker_dim[1]+k] = M_[j+k*ker_dim[0]];
}
}
Matrix<Real_t>& M_c2e0 = Precomp(level, UC2UE0_Type, 0);
Matrix<Real_t>& M_c2e1 = Precomp(level, UC2UE1_Type, 0);
Matrix<Real_t> M_s2e=(M_s2c*M_c2e0)*M_c2e1;
for(size_t i=0;i<M_s2e.Dim(0);i++){ // Normalize each row to 1
Real_t s=0;
for(size_t j=0;j<M_s2e.Dim(1);j++) s+=M_s2e[i][j];
s=1.0/s;
for(size_t j=0;j<M_s2e.Dim(1);j++) M_s2e[i][j]*=s;
}
assert(M_equiv_zero_avg.Dim(0)==M_s2e.Dim(1));
assert(M_equiv_zero_avg.Dim(1)==M_s2e.Dim(1));
// M_equiv_zero_avg = I - (row-normalized) M_s2e, i.e. subtract the mean.
M_equiv_zero_avg.SetZero();
for(size_t i=0;i<n_surf*ker_dim[0];i++)
M_equiv_zero_avg[i][i]=1;
for(size_t i=0;i<n_surf;i++)
for(size_t k=0;k<ker_dim[0];k++)
for(size_t j=0;j<n_surf*ker_dim[0];j++)
M_equiv_zero_avg[i*ker_dim[0]+k][j]-=M_s2e[k][j];
}
{ // Set average check potential to zero. (improves stability for large BC_LEVELS)
M_check_zero_avg.SetZero();
for(size_t i=0;i<n_surf*ker_dim[1];i++)
M_check_zero_avg[i][i]+=1;
for(size_t i=0;i<n_surf;i++)
for(size_t j=0;j<n_surf;j++)
for(size_t k=0;k<ker_dim[1];k++)
M_check_zero_avg[i*ker_dim[1]+k][j*ker_dim[1]+k]-=1.0/n_surf;
}
// NOTE: the loop variable shadows the function parameter `level`;
// negative levels index the coarser-than-root periodic hierarchy.
for(int level=0; level>=-BC_LEVELS; level--){
{ // Compute M_l2l
this->Precomp(level, D2D_Type, 0);
Permutation<Real_t> Pr = this->interac_list.Perm_R(abs(level), D2D_Type, 0);
Permutation<Real_t> Pc = this->interac_list.Perm_C(abs(level), D2D_Type, 0);
{ // Invert scaling because level<0
for(long i=0;i<Pr.Dim();i++) Pr.scal[i]=1.0/Pr.scal[i];
for(long i=0;i<Pc.Dim();i++) Pc.scal[i]=1.0/Pc.scal[i];
}
M_l2l[-level] = M_check_zero_avg * Pr * this->Precomp(level, D2D_Type, this->interac_list.InteracClass(D2D_Type, 0)) * Pc * M_check_zero_avg;
assert(M_l2l[-level].Dim(0)>0 && M_l2l[-level].Dim(1)>0);
}
// Compute M_m2m (sum over all eight children; `mat_indx` shadows the parameter)
for(size_t mat_indx=0; mat_indx<mat_cnt_m2m; mat_indx++){
this->Precomp(level-1, U2U_Type, mat_indx);
Permutation<Real_t> Pr = this->interac_list.Perm_R(abs(level-1), U2U_Type, mat_indx);
Permutation<Real_t> Pc = this->interac_list.Perm_C(abs(level-1), U2U_Type, mat_indx);
for(long i=0;i<Pr.Dim();i++) Pr.scal[i]=1.0/Pr.scal[i];
for(long i=0;i<Pc.Dim();i++) Pc.scal[i]=1.0/Pc.scal[i];
Matrix<Real_t> M = Pr * this->Precomp(level-1, U2U_Type, this->interac_list.InteracClass(U2U_Type, mat_indx)) * Pc;
assert(M.Dim(0)>0 && M.Dim(1)>0);
if(mat_indx==0) M_m2m[-level] = M_equiv_zero_avg*M*M_equiv_zero_avg;
else M_m2m[-level] += M_equiv_zero_avg*M*M_equiv_zero_avg;
}
// Compute M_m2l
if(!ScaleInvar() || level==0){
Real_t s=(1UL<<(-level));
Real_t dc_coord[3]={0,0,0};
std::vector<Real_t> trg_coord=d_check_surf(MultipoleOrder(), dc_coord, level);
Matrix<Real_t> M_ue2dc(n_surf*ker_dim[0], n_surf*ker_dim[1]); M_ue2dc.SetZero();
// Sum direct interactions from all image boxes outside the near region.
for(int x0=-2;x0<4;x0++)
for(int x1=-2;x1<4;x1++)
for(int x2=-2;x2<4;x2++)
if(abs(x0)>1 || abs(x1)>1 || abs(x2)>1){
Real_t ue_coord[3]={x0*s, x1*s, x2*s};
std::vector<Real_t> src_coord=u_equiv_surf(MultipoleOrder(), ue_coord, level);
Matrix<Real_t> M_tmp(n_surf*ker_dim[0], n_surf*ker_dim[1]);
kernel->k_m2l->BuildMatrix(&src_coord[0], n_surf,
&trg_coord[0], n_surf, &(M_tmp[0][0]));
M_ue2dc+=M_tmp;
}
M_m2l[-level]=M_equiv_zero_avg*M_ue2dc * M_check_zero_avg;
}else{
// Scale-invariant case: reuse the previous level's operator, rescaled.
M_m2l[-level]=M_equiv_zero_avg * M_m2l[-level-1] * M_check_zero_avg;
if(ScaleInvar()){ // Scale M_m2l
Permutation<Real_t> ker_perm=this->kernel->k_m2l->perm_vec[0 +Scaling];
Vector<Real_t> scal_exp=this->kernel->k_m2l->src_scal;
for(size_t i=0;i<scal_exp.Dim();i++) scal_exp[i]=-scal_exp[i];
Permutation<Real_t> P=equiv_surf_perm(MultipoleOrder(), Scaling, ker_perm, &scal_exp);
M_m2l[-level]=P*M_m2l[-level];
}
if(ScaleInvar()){ // Scale M_m2l
Permutation<Real_t> ker_perm=this->kernel->k_m2l->perm_vec[C_Perm+Scaling];
Vector<Real_t> scal_exp=this->kernel->k_m2l->trg_scal;
for(size_t i=0;i<scal_exp.Dim();i++) scal_exp[i]=-scal_exp[i];
Permutation<Real_t> P=equiv_surf_perm(MultipoleOrder(), Scaling, ker_perm, &scal_exp);
M_m2l[-level]=M_m2l[-level]*P;
}
}
}
// Telescope the per-level operators from the coarsest level back to level 0.
for(int level=-BC_LEVELS;level<=0;level++){
if(level==-BC_LEVELS) M = M_m2l[-level];
else M = M_equiv_zero_avg * (M_m2l[-level] + M_m2m[-level]*M*M_l2l[-level]) * M_check_zero_avg;
}
if(kernel->k_m2l->vol_poten){ // Correction for far-field of analytical volume potential
Matrix<Real_t> M_far;
{ // Compute M_far
// kernel->k_m2l->vol_poten is the analytical particular solution for uniform source density=1
// We already corrected far-field above with M_equiv_zero_avg, so we don't need the far field of the analytical solutions.
// We take the analytical solution and subtract the near interaction (3x3x3 boxes) from it to get the far-field
// Then, we add the far-field correction for the analytical solution to be subtracted later.
std::vector<Real_t> dc_coord;
{ // Coord of upward check surface
Real_t c[3]={1.0,1.0,1.0};
dc_coord=d_check_surf(MultipoleOrder(),c,0);
}
Matrix<Real_t> M_near(ker_dim[0],n_surf*ker_dim[1]);
#pragma omp parallel for schedule(dynamic)
for(size_t i=0;i<n_surf;i++){ // Compute near-interaction part
std::vector<Real_t> M_=cheb_integ<Real_t>(0, &dc_coord[i*3], 3.0, *kernel->k_m2l);
for(size_t j=0; j<ker_dim[0]; j++)
for(int k=0; k<ker_dim[1]; k++)
M_near[j][i*ker_dim[1]+k] = M_[j+k*ker_dim[0]];
}
{ // M_far = M_analytic - M_near
Matrix<Real_t> M_analytic(ker_dim[0],n_surf*ker_dim[1]); M_analytic.SetZero();
kernel->k_m2l->vol_poten(&dc_coord[0],n_surf,&M_analytic[0][0]);
M_far=M_analytic-M_near;
}
}
{ // Add far-field correction to M
for(size_t i=0;i<n_surf;i++)
for(size_t k=0;k<ker_dim[0];k++)
for(size_t j=0;j<n_surf*ker_dim[1];j++)
M[i*ker_dim[0]+k][j]+=M_far[k][j];
}
}
{ // a + bx + cy + dz + exy + fxz + gyz correction.
// Evaluate the residual error at the 8 cube corners and subtract its
// trilinear interpolant over the check surface.
std::vector<Real_t> corner_pts;
corner_pts.push_back(0); corner_pts.push_back(0); corner_pts.push_back(0);
corner_pts.push_back(1); corner_pts.push_back(0); corner_pts.push_back(0);
corner_pts.push_back(0); corner_pts.push_back(1); corner_pts.push_back(0);
corner_pts.push_back(0); corner_pts.push_back(0); corner_pts.push_back(1);
corner_pts.push_back(0); corner_pts.push_back(1); corner_pts.push_back(1);
corner_pts.push_back(1); corner_pts.push_back(0); corner_pts.push_back(1);
corner_pts.push_back(1); corner_pts.push_back(1); corner_pts.push_back(0);
corner_pts.push_back(1); corner_pts.push_back(1); corner_pts.push_back(1);
size_t n_corner=corner_pts.size()/COORD_DIM;
// Coord of downward equivalent surface
Real_t c[3]={0,0,0};
std::vector<Real_t> up_equiv_surf=u_equiv_surf(MultipoleOrder(),c,0);
std::vector<Real_t> dn_equiv_surf=d_equiv_surf(MultipoleOrder(),c,0);
std::vector<Real_t> dn_check_surf=d_check_surf(MultipoleOrder(),c,0);
Matrix<Real_t> M_err;
{ // Evaluate potential at corner due to upward and dnward equivalent surface.
{ // Error from local expansion.
Matrix<Real_t> M_e2pt(n_surf*kernel->k_l2l->ker_dim[0],n_corner*kernel->k_l2l->ker_dim[1]);
kernel->k_l2l->BuildMatrix(&dn_equiv_surf[0], n_surf,
&corner_pts[0], n_corner, &(M_e2pt[0][0]));
Matrix<Real_t>& M_dc2de0 = Precomp(0, DC2DE0_Type, 0);
Matrix<Real_t>& M_dc2de1 = Precomp(0, DC2DE1_Type, 0);
M_err=(M*M_dc2de0)*(M_dc2de1*M_e2pt);
}
for(size_t k=0;k<n_corner;k++){ // Error from colleagues of root.
for(int j0=-1;j0<=1;j0++)
for(int j1=-1;j1<=1;j1++)
for(int j2=-1;j2<=1;j2++){
Real_t pt_coord[3]={corner_pts[k*COORD_DIM+0]-j0,
corner_pts[k*COORD_DIM+1]-j1,
corner_pts[k*COORD_DIM+2]-j2};
if(pvfmm::fabs<Real_t>(pt_coord[0]-0.5)>1.0 || pvfmm::fabs<Real_t>(pt_coord[1]-0.5)>1.0 || pvfmm::fabs<Real_t>(pt_coord[2]-0.5)>1.0){
Matrix<Real_t> M_e2pt(n_surf*ker_dim[0],ker_dim[1]);
kernel->k_m2l->BuildMatrix(&up_equiv_surf[0], n_surf,
&pt_coord[0], 1, &(M_e2pt[0][0]));
for(size_t i=0;i<M_e2pt.Dim(0);i++)
for(size_t j=0;j<M_e2pt.Dim(1);j++)
M_err[i][k*ker_dim[1]+j]+=M_e2pt[i][j];
}
}
}
if(kernel->k_m2l->vol_poten){ // Error from analytical volume potential
Matrix<Real_t> M_analytic(ker_dim[0],n_corner*ker_dim[1]); M_analytic.SetZero();
kernel->k_m2l->vol_poten(&corner_pts[0],n_corner,&M_analytic[0][0]);
for(size_t j=0;j<n_surf;j++)
for(size_t k=0;k<ker_dim[0];k++)
for(size_t i=0;i<M_err.Dim(1);i++){
M_err[j*ker_dim[0]+k][i]-=M_analytic[k][i];
}
}
}
// Trilinear interpolation of the corner errors onto the check surface.
Matrix<Real_t> M_grad(M_err.Dim(0),n_surf*ker_dim[1]);
for(size_t i=0;i<M_err.Dim(0);i++)
for(size_t k=0;k<ker_dim[1];k++)
for(size_t j=0;j<n_surf;j++){
M_grad[i][j*ker_dim[1]+k]= M_err[i][0*ker_dim[1]+k]
+(M_err[i][1*ker_dim[1]+k]-M_err[i][0*ker_dim[1]+k])*dn_check_surf[j*COORD_DIM+0]
+(M_err[i][2*ker_dim[1]+k]-M_err[i][0*ker_dim[1]+k])*dn_check_surf[j*COORD_DIM+1]
+(M_err[i][3*ker_dim[1]+k]-M_err[i][0*ker_dim[1]+k])*dn_check_surf[j*COORD_DIM+2]
+(M_err[i][4*ker_dim[1]+k]+M_err[i][0*ker_dim[1]+k]-M_err[i][2*ker_dim[1]+k]-M_err[i][3*ker_dim[1]+k])*dn_check_surf[j*COORD_DIM+1]*dn_check_surf[j*COORD_DIM+2]
+(M_err[i][5*ker_dim[1]+k]+M_err[i][0*ker_dim[1]+k]-M_err[i][1*ker_dim[1]+k]-M_err[i][3*ker_dim[1]+k])*dn_check_surf[j*COORD_DIM+2]*dn_check_surf[j*COORD_DIM+0]
+(M_err[i][6*ker_dim[1]+k]+M_err[i][0*ker_dim[1]+k]-M_err[i][1*ker_dim[1]+k]-M_err[i][2*ker_dim[1]+k])*dn_check_surf[j*COORD_DIM+0]*dn_check_surf[j*COORD_DIM+1]
+(M_err[i][7*ker_dim[1]+k]+M_err[i][1*ker_dim[1]+k]+M_err[i][2*ker_dim[1]+k]+M_err[i][3*ker_dim[1]+k]-M_err[i][0*ker_dim[1]+k]-M_err[i][4*ker_dim[1]+k]-M_err[i][5*ker_dim[1]+k]-M_err[i][6*ker_dim[1]+k])*dn_check_surf[j*COORD_DIM+0]*dn_check_surf[j*COORD_DIM+1]*dn_check_surf[j*COORD_DIM+2];
}
M-=M_grad;
}
if(!this->ScaleInvar()){ // Free memory
// Discard the negative-level matrices cached only for building this operator.
Mat_Type type=D2D_Type;
for(int l=-BC_LEVELS;l<0;l++)
for(size_t indx=0;indx<this->interac_list.ListCount(type);indx++){
Matrix<Real_t>& M=this->mat->Mat(l, type, indx);
M.Resize(0,0);
}
type=U2U_Type;
for(int l=-BC_LEVELS;l<0;l++)
for(size_t indx=0;indx<this->interac_list.ListCount(type);indx++){
Matrix<Real_t>& M=this->mat->Mat(l, type, indx);
M.Resize(0,0);
}
type=DC2DE0_Type;
for(int l=-BC_LEVELS;l<0;l++)
for(size_t indx=0;indx<this->interac_list.ListCount(type);indx++){
Matrix<Real_t>& M=this->mat->Mat(l, type, indx);
M.Resize(0,0);
}
type=DC2DE1_Type;
for(int l=-BC_LEVELS;l<0;l++)
for(size_t indx=0;indx<this->interac_list.ListCount(type);indx++){
Matrix<Real_t>& M=this->mat->Mat(l, type, indx);
M.Resize(0,0);
}
type=UC2UE0_Type;
for(int l=-BC_LEVELS;l<0;l++)
for(size_t indx=0;indx<this->interac_list.ListCount(type);indx++){
Matrix<Real_t>& M=this->mat->Mat(l, type, indx);
M.Resize(0,0);
}
type=UC2UE1_Type;
for(int l=-BC_LEVELS;l<0;l++)
for(size_t indx=0;indx<this->interac_list.ListCount(type);indx++){
Matrix<Real_t>& M=this->mat->Mat(l, type, indx);
M.Resize(0,0);
}
}
}
break;
}
default:
break;
}
//Save the matrix for future use.
#pragma omp critical (PRECOMP_MATRIX_PTS)
if(M_.Dim(0)==0 && M_.Dim(1)==0){
M_=M;
/*
M_.Resize(M.Dim(0),M.Dim(1));
int dof=ker_dim[0]*ker_dim[1];
for(int j=0;j<dof;j++){
size_t a=(M.Dim(0)*M.Dim(1)* j )/dof;
size_t b=(M.Dim(0)*M.Dim(1)*(j+1))/dof;
#pragma omp parallel for // NUMA
for(int tid=0;tid<omp_p;tid++){
size_t a_=a+((b-a)* tid )/omp_p;
size_t b_=a+((b-a)*(tid+1))/omp_p;
mem::memcopy(&M_[0][a_], &M[0][a_], (b_-a_)*sizeof(Real_t));
}
}
*/
}
return M_;
}
  989. template <class FMMNode>
  990. void FMM_Pts<FMMNode>::PrecompAll(Mat_Type type, int level){
  991. if(level==-1){
  992. for(int l=0;l<MAX_DEPTH;l++){
  993. PrecompAll(type, l);
  994. }
  995. return;
  996. }
  997. //Compute basic permutations.
  998. for(size_t i=0;i<Perm_Count;i++)
  999. this->PrecompPerm(type, (Perm_Type) i);
  1000. {
  1001. //Allocate matrices.
  1002. size_t mat_cnt=interac_list.ListCount((Mat_Type)type);
  1003. mat->Mat(level, (Mat_Type)type, mat_cnt-1);
  1004. { // Compute InteracClass matrices.
  1005. std::vector<size_t> indx_lst;
  1006. for(size_t i=0; i<mat_cnt; i++){
  1007. if(interac_list.InteracClass((Mat_Type)type,i)==i)
  1008. indx_lst.push_back(i);
  1009. }
  1010. //Compute Transformations.
  1011. //#pragma omp parallel for //lets use fine grained parallelism
  1012. for(size_t i=0; i<indx_lst.size(); i++){
  1013. Precomp(level, (Mat_Type)type, indx_lst[i]);
  1014. }
  1015. }
  1016. //#pragma omp parallel for //lets use fine grained parallelism
  1017. for(size_t mat_indx=0;mat_indx<mat_cnt;mat_indx++){
  1018. Matrix<Real_t>& M0=interac_list.ClassMat(level,(Mat_Type)type,mat_indx);
  1019. Permutation<Real_t>& pr=interac_list.Perm_R(abs(level), (Mat_Type)type, mat_indx);
  1020. Permutation<Real_t>& pc=interac_list.Perm_C(abs(level), (Mat_Type)type, mat_indx);
  1021. if(pr.Dim()!=M0.Dim(0) || pc.Dim()!=M0.Dim(1)) Precomp(level, (Mat_Type)type, mat_indx);
  1022. }
  1023. }
  1024. }
// Collect, for each of the seven interaction stages, the participating tree
// nodes (n_list) and the per-node data vectors touched (vec_list), then
// repack each vector group into one large contiguous buffer (buff_list) so
// that later compute kernels operate on contiguous memory.  Stage indices
// used throughout:
//   0: upward_equiv       1: dnward_equiv      2: upward_equiv_fft
//   3: dnward_check_fft   4: src_val           5: trg_val
//   6: pts_coord (plus the shared check/equiv surfaces)
// NOTE(review): vec_list is received by value; only the Vector objects it
// points to are mutated, so the copy appears intentional — confirm callers.
template <class FMMNode>
void FMM_Pts<FMMNode>::CollectNodeData(FMMTree_t* tree, std::vector<FMMNode*>& node, std::vector<Matrix<Real_t> >& buff_list, std::vector<Vector<FMMNode_t*> >& n_list, std::vector<std::vector<Vector<Real_t>* > > vec_list){
  if(buff_list.size()<7) buff_list.resize(7);
  if( n_list.size()<7) n_list.resize(7);
  if( vec_list.size()<7) vec_list.resize(7);
  int omp_p=omp_get_max_threads();
  if(node.size()==0) return;
  {// 0. upward_equiv
    int indx=0;
    size_t vec_sz;
    { // Set vec_sz: column count of the UC2UE1 class matrix = length of one
      // upward-equivalent density vector.
      Matrix<Real_t>& M_uc2ue = this->interac_list.ClassMat(0, UC2UE1_Type, 0);
      vec_sz=M_uc2ue.Dim(1);
    }
    std::vector< FMMNode* > node_lst;
    {// Construct node_lst
      node_lst.clear();
      // Bucket non-leaf nodes by depth; leaves contribute their point counts.
      std::vector<std::vector< FMMNode* > > node_lst_(MAX_DEPTH+1);
      FMMNode_t* r_node=NULL;
      for(size_t i=0;i<node.size();i++){
        if(!node[i]->IsLeaf()){
          node_lst_[node[i]->Depth()].push_back(node[i]);
        }else{
          // Leaf: count its own source and surface points.
          node[i]->pt_cnt[0]+=node[i]-> src_coord.Dim()/COORD_DIM;
          node[i]->pt_cnt[0]+=node[i]->surf_coord.Dim()/COORD_DIM;
          if(node[i]->IsGhost()) node[i]->pt_cnt[0]++; // TODO: temporary fix, pt_cnt not known for ghost nodes
        }
        if(node[i]->Depth()==0) r_node=node[i];
      }
      size_t chld_cnt=1UL<<COORD_DIM;
      // Bottom-up pass: accumulate each child's point count into its parent.
      for(int i=MAX_DEPTH;i>=0;i--){
        for(size_t j=0;j<node_lst_[i].size();j++){
          for(size_t k=0;k<chld_cnt;k++){
            FMMNode_t* node=(FMMNode_t*)node_lst_[i][j]->Child(k);
            node_lst_[i][j]->pt_cnt[0]+=node->pt_cnt[0];
          }
        }
      }
      // Top-down pass: children of populated subtrees join node_lst; children
      // of empty subtrees get their upward_equiv storage released.
      for(int i=0;i<=MAX_DEPTH;i++){
        for(size_t j=0;j<node_lst_[i].size();j++){
          if(node_lst_[i][j]->pt_cnt[0]){
            for(size_t k=0;k<chld_cnt;k++){
              FMMNode_t* node=(FMMNode_t*)node_lst_[i][j]->Child(k);
              node_lst.push_back(node);
            }
          }else{
            for(size_t k=0;k<chld_cnt;k++){
              FMMNode_t* node=(FMMNode_t*)node_lst_[i][j]->Child(k);
              node->FMMData()->upward_equiv.ReInit(0);
            }
          }
        }
      }
      if(r_node!=NULL) node_lst.push_back(r_node);
      n_list[indx]=node_lst;
    }
    std::vector<Vector<Real_t>*>& vec_lst=vec_list[indx];
    for(size_t i=0;i<node_lst.size();i++){ // Construct vec_lst
      FMMNode_t* node=node_lst[i];
      Vector<Real_t>& data_vec=node->FMMData()->upward_equiv;
      data_vec.ReInit(vec_sz,NULL,false); // storage bound to buff below
      vec_lst.push_back(&data_vec);
    }
  }
  {// 1. dnward_equiv
    int indx=1;
    size_t vec_sz;
    { // Set vec_sz: row count of the DC2DE0 class matrix = length of one
      // downward-equivalent density vector.
      Matrix<Real_t>& M_dc2de0 = this->interac_list.ClassMat(0, DC2DE0_Type, 0);
      vec_sz=M_dc2de0.Dim(0);
    }
    std::vector< FMMNode* > node_lst;
    {// Construct node_lst (same two-pass scheme as stage 0, but counting
      // target points into pt_cnt[1]).
      node_lst.clear();
      std::vector<std::vector< FMMNode* > > node_lst_(MAX_DEPTH+1);
      FMMNode_t* r_node=NULL;
      for(size_t i=0;i<node.size();i++){
        if(!node[i]->IsLeaf()){
          node_lst_[node[i]->Depth()].push_back(node[i]);
        }else{
          node[i]->pt_cnt[1]+=node[i]->trg_coord.Dim()/COORD_DIM;
        }
        if(node[i]->Depth()==0) r_node=node[i];
      }
      size_t chld_cnt=1UL<<COORD_DIM;
      // Bottom-up accumulation of target point counts.
      for(int i=MAX_DEPTH;i>=0;i--){
        for(size_t j=0;j<node_lst_[i].size();j++){
          for(size_t k=0;k<chld_cnt;k++){
            FMMNode_t* node=(FMMNode_t*)node_lst_[i][j]->Child(k);
            node_lst_[i][j]->pt_cnt[1]+=node->pt_cnt[1];
          }
        }
      }
      // Keep children of populated subtrees; release dnward_equiv elsewhere.
      for(int i=0;i<=MAX_DEPTH;i++){
        for(size_t j=0;j<node_lst_[i].size();j++){
          if(node_lst_[i][j]->pt_cnt[1]){
            for(size_t k=0;k<chld_cnt;k++){
              FMMNode_t* node=(FMMNode_t*)node_lst_[i][j]->Child(k);
              node_lst.push_back(node);
            }
          }else{
            for(size_t k=0;k<chld_cnt;k++){
              FMMNode_t* node=(FMMNode_t*)node_lst_[i][j]->Child(k);
              node->FMMData()->dnward_equiv.ReInit(0);
            }
          }
        }
      }
      if(r_node!=NULL) node_lst.push_back(r_node);
      n_list[indx]=node_lst;
    }
    std::vector<Vector<Real_t>*>& vec_lst=vec_list[indx];
    for(size_t i=0;i<node_lst.size();i++){ // Construct vec_lst
      FMMNode_t* node=node_lst[i];
      Vector<Real_t>& data_vec=node->FMMData()->dnward_equiv;
      data_vec.ReInit(vec_sz,NULL,false); // storage bound to buff below
      vec_lst.push_back(&data_vec);
    }
  }
  {// 2. upward_equiv_fft
    // Non-leaf nodes, ordered by depth.  No data vectors for this stage.
    int indx=2;
    std::vector< FMMNode* > node_lst;
    {
      std::vector<std::vector< FMMNode* > > node_lst_(MAX_DEPTH+1);
      for(size_t i=0;i<node.size();i++)
        if(!node[i]->IsLeaf())
          node_lst_[node[i]->Depth()].push_back(node[i]);
      for(int i=0;i<=MAX_DEPTH;i++)
        for(size_t j=0;j<node_lst_[i].size();j++)
          node_lst.push_back(node_lst_[i][j]);
    }
    n_list[indx]=node_lst;
  }
  {// 3. dnward_check_fft
    // Non-leaf, non-ghost nodes, ordered by depth.  No data vectors either.
    int indx=3;
    std::vector< FMMNode* > node_lst;
    {
      std::vector<std::vector< FMMNode* > > node_lst_(MAX_DEPTH+1);
      for(size_t i=0;i<node.size();i++)
        if(!node[i]->IsLeaf() && !node[i]->IsGhost())
          node_lst_[node[i]->Depth()].push_back(node[i]);
      for(int i=0;i<=MAX_DEPTH;i++)
        for(size_t j=0;j<node_lst_[i].size();j++)
          node_lst.push_back(node_lst_[i][j]);
    }
    n_list[indx]=node_lst;
  }
  {// 4. src_val
    int indx=4;
    int src_dof=kernel->ker_dim[0];
    int surf_dof=COORD_DIM+src_dof; // surface density carries normal + density
    std::vector< FMMNode* > node_lst;
    for(size_t i=0;i<node.size();i++){// Construct node_lst
      if(node[i]->IsLeaf()){
        node_lst.push_back(node[i]);
      }else{
        // Only leaves hold source data; release it on non-leaves.
        node[i]->src_value.ReInit(0);
        node[i]->surf_value.ReInit(0);
      }
    }
    n_list[indx]=node_lst;
    std::vector<Vector<Real_t>*>& vec_lst=vec_list[indx];
    for(size_t i=0;i<node_lst.size();i++){ // Construct vec_lst
      FMMNode_t* node=node_lst[i];
      { // src_value
        Vector<Real_t>& data_vec=node->src_value;
        size_t vec_sz=(node->src_coord.Dim()/COORD_DIM)*src_dof;
        if(data_vec.Dim()!=vec_sz) data_vec.ReInit(vec_sz,NULL,false);
        vec_lst.push_back(&data_vec);
      }
      { // surf_value
        Vector<Real_t>& data_vec=node->surf_value;
        size_t vec_sz=(node->surf_coord.Dim()/COORD_DIM)*surf_dof;
        if(data_vec.Dim()!=vec_sz) data_vec.ReInit(vec_sz,NULL,false);
        vec_lst.push_back(&data_vec);
      }
    }
  }
  {// 5. trg_val
    int indx=5;
    int trg_dof=kernel->ker_dim[1];
    std::vector< FMMNode* > node_lst;
    for(size_t i=0;i<node.size();i++){// Construct node_lst
      if(node[i]->IsLeaf() && !node[i]->IsGhost()){
        node_lst.push_back(node[i]);
      }else{
        node[i]->trg_value.ReInit(0);
      }
    }
    n_list[indx]=node_lst;
    std::vector<Vector<Real_t>*>& vec_lst=vec_list[indx];
    for(size_t i=0;i<node_lst.size();i++){ // Construct vec_lst
      FMMNode_t* node=node_lst[i];
      { // trg_value
        Vector<Real_t>& data_vec=node->trg_value;
        size_t vec_sz=(node->trg_coord.Dim()/COORD_DIM)*trg_dof;
        data_vec.ReInit(vec_sz,NULL,false);
        vec_lst.push_back(&data_vec);
      }
    }
  }
  {// 6. pts_coord
    int indx=6;
    std::vector< FMMNode* > node_lst;
    for(size_t i=0;i<node.size();i++){// Construct node_lst
      if(node[i]->IsLeaf()){
        node_lst.push_back(node[i]);
      }else{
        // Coordinates only live on leaves; release them on non-leaves.
        node[i]->src_coord.ReInit(0);
        node[i]->surf_coord.ReInit(0);
        node[i]->trg_coord.ReInit(0);
      }
    }
    n_list[indx]=node_lst;
    std::vector<Vector<Real_t>*>& vec_lst=vec_list[indx];
    for(size_t i=0;i<node_lst.size();i++){ // Construct vec_lst
      FMMNode_t* node=node_lst[i];
      { // src_coord
        Vector<Real_t>& data_vec=node->src_coord;
        vec_lst.push_back(&data_vec);
      }
      { // surf_coord
        Vector<Real_t>& data_vec=node->surf_coord;
        vec_lst.push_back(&data_vec);
      }
      { // trg_coord
        Vector<Real_t>& data_vec=node->trg_coord;
        vec_lst.push_back(&data_vec);
      }
    }
    { // check and equiv surfaces.
      if(tree->upwd_check_surf.size()==0){
        // First call: build the per-depth check/equivalent surfaces, all
        // centered at the origin.  Each surface has 6*(m-1)^2+2 points.
        size_t m=MultipoleOrder();
        tree->upwd_check_surf.resize(MAX_DEPTH);
        tree->upwd_equiv_surf.resize(MAX_DEPTH);
        tree->dnwd_check_surf.resize(MAX_DEPTH);
        tree->dnwd_equiv_surf.resize(MAX_DEPTH);
        for(size_t depth=0;depth<MAX_DEPTH;depth++){
          Real_t c[3]={0.0,0.0,0.0};
          tree->upwd_check_surf[depth].ReInit((6*(m-1)*(m-1)+2)*COORD_DIM);
          tree->upwd_equiv_surf[depth].ReInit((6*(m-1)*(m-1)+2)*COORD_DIM);
          tree->dnwd_check_surf[depth].ReInit((6*(m-1)*(m-1)+2)*COORD_DIM);
          tree->dnwd_equiv_surf[depth].ReInit((6*(m-1)*(m-1)+2)*COORD_DIM);
          tree->upwd_check_surf[depth]=u_check_surf(m,c,depth);
          tree->upwd_equiv_surf[depth]=u_equiv_surf(m,c,depth);
          tree->dnwd_check_surf[depth]=d_check_surf(m,c,depth);
          tree->dnwd_equiv_surf[depth]=d_equiv_surf(m,c,depth);
        }
      }
      // The surfaces are repacked into the same buffer as the coordinates.
      for(size_t depth=0;depth<MAX_DEPTH;depth++){
        vec_lst.push_back(&tree->upwd_check_surf[depth]);
        vec_lst.push_back(&tree->upwd_equiv_surf[depth]);
        vec_lst.push_back(&tree->dnwd_check_surf[depth]);
        vec_lst.push_back(&tree->dnwd_equiv_surf[depth]);
      }
    }
  }
  // Create extra auxiliary buffer.
  if(buff_list.size()<=vec_list.size()) buff_list.resize(vec_list.size()+1);
  for(size_t indx=0;indx<vec_list.size();indx++){ // Resize buffer
    Matrix<Real_t>& buff=buff_list[indx];
    std::vector<Vector<Real_t>*>& vec_lst= vec_list[indx];
    // Stages 4 (src_val) and 6 (pts_coord) hold user data that must survive
    // the repack; other stages are scratch and may be dropped.
    bool keep_data=(indx==4 || indx==6);
    size_t n_vec=vec_lst.size();
    { // Continue if nothing to be done.
      if(!n_vec) continue;
      if(buff.Dim(0)*buff.Dim(1)>0){
        // If every non-empty vector already points inside buff, skip repack.
        bool init_buff=false;
        Real_t* buff_start=buff.Begin();
        Real_t* buff_end=buff.Begin()+buff.Dim(0)*buff.Dim(1);
        #pragma omp parallel for reduction(||:init_buff)
        for(size_t i=0;i<n_vec;i++){
          if(vec_lst[i]->Dim() && (vec_lst[i]->Begin()<buff_start || vec_lst[i]->Begin()>=buff_end)){
            init_buff=true;
          }
        }
        if(!init_buff) continue;
      }
    }
    std::vector<size_t> vec_size(n_vec);
    std::vector<size_t> vec_disp(n_vec);
    if(n_vec){ // Set vec_size and vec_disp
      #pragma omp parallel for
      for(size_t i=0;i<n_vec;i++){ // Set vec_size
        vec_size[i]=vec_lst[i]->Dim();
      }
      vec_disp[0]=0;
      // Exclusive prefix sum gives each vector's offset inside buff.
      omp_par::scan(&vec_size[0],&vec_disp[0],n_vec);
    }
    size_t buff_size=vec_size[n_vec-1]+vec_disp[n_vec-1];
    if(!buff_size) continue;
    if(keep_data){ // Copy to dev_buffer
      // Stage current contents in dev_buffer before buff is reallocated,
      // since the old vectors may alias the buffer being resized.
      if(dev_buffer.Dim()<buff_size*sizeof(Real_t)){ // Resize dev_buffer
        dev_buffer.ReInit(buff_size*sizeof(Real_t)*1.05); // 5% headroom
      }
      #pragma omp parallel for
      for(size_t i=0;i<n_vec;i++){
        if(vec_lst[i]->Begin()){
          mem::memcopy(((Real_t*)dev_buffer.Begin())+vec_disp[i],vec_lst[i]->Begin(),vec_size[i]*sizeof(Real_t));
        }
      }
    }
    if(buff.Dim(0)*buff.Dim(1)<buff_size){ // Resize buff
      buff.ReInit(1,buff_size*1.05);
    }
    if(keep_data){ // Copy to buff (from dev_buffer)
      #pragma omp parallel for
      for(size_t tid=0;tid<omp_p;tid++){ // NOTE(review): size_t vs int omp_p compare
        size_t a=(buff_size*(tid+0))/omp_p;
        size_t b=(buff_size*(tid+1))/omp_p;
        mem::memcopy(buff.Begin()+a,((Real_t*)dev_buffer.Begin())+a,(b-a)*sizeof(Real_t));
      }
    }
    #pragma omp parallel for
    for(size_t i=0;i<n_vec;i++){ // ReInit vectors to view into buff
      vec_lst[i]->ReInit(vec_size[i],buff.Begin()+vec_disp[i],false);
    }
  }
}
  1344. template <class FMMNode>
  1345. void FMM_Pts<FMMNode>::SetupPrecomp(SetupData<Real_t>& setup_data, bool device){
  1346. if(setup_data.precomp_data==NULL || setup_data.level>MAX_DEPTH) return;
  1347. Profile::Tic("SetupPrecomp",&this->comm,true,25);
  1348. { // Build precomp_data
  1349. size_t precomp_offset=0;
  1350. int level=setup_data.level;
  1351. Matrix<char>& precomp_data=*setup_data.precomp_data;
  1352. std::vector<Mat_Type>& interac_type_lst=setup_data.interac_type;
  1353. for(size_t type_indx=0; type_indx<interac_type_lst.size(); type_indx++){
  1354. Mat_Type& interac_type=interac_type_lst[type_indx];
  1355. this->PrecompAll(interac_type, level); // Compute matrices.
  1356. precomp_offset=this->mat->CompactData(level, interac_type, precomp_data, precomp_offset);
  1357. }
  1358. }
  1359. Profile::Toc();
  1360. if(device){ // Host2Device
  1361. Profile::Tic("Host2Device",&this->comm,false,25);
  1362. setup_data.precomp_data->AllocDevice(true);
  1363. Profile::Toc();
  1364. }
  1365. }
// Build the flattened interaction data for one interaction stage: per-target
// interaction lists, a block partition sized to fit the device buffer, matrix
// offsets into precomp_data, and the input/output permutation tables.  The
// result is appended to setup_data.interac_data with layout
//   [data_size][M_dim0][M_dim1][dof]
//   [len][interac_blk...][len][interac_cnt...][len][interac_mat...]
//   [len][input_perm...][len][output_perm...]
// which is exactly what EvalList / EvalListGPU deserialize.
template <class FMMNode>
void FMM_Pts<FMMNode>::SetupInterac(SetupData<Real_t>& setup_data, bool device){
  int level=setup_data.level;
  std::vector<Mat_Type>& interac_type_lst=setup_data.interac_type;
  std::vector<void*>& nodes_in =setup_data.nodes_in ;
  std::vector<void*>& nodes_out=setup_data.nodes_out;
  Matrix<Real_t>& input_data=*setup_data. input_data;
  Matrix<Real_t>& output_data=*setup_data.output_data;
  std::vector<Vector<Real_t>*>& input_vector=setup_data. input_vector;
  std::vector<Vector<Real_t>*>& output_vector=setup_data.output_vector;
  size_t n_in =nodes_in .size();
  size_t n_out=nodes_out.size();
  // Setup precomputed data (only if not already present).
  if(setup_data.precomp_data->Dim(0)*setup_data.precomp_data->Dim(1)==0) SetupPrecomp(setup_data,device);
  // Build interac_data
  Profile::Tic("Interac-Data",&this->comm,true,25);
  Matrix<char>& interac_data=setup_data.interac_data;
  { // Build precomp_data, interac_data
    std::vector<size_t> interac_mat; // class-matrix offset in precomp_data, per matrix index
    std::vector<size_t> interac_cnt; // number of interactions per matrix index
    std::vector<size_t> interac_blk; // number of matrix indices per buffer-sized block
    std::vector<size_t> input_perm;  // quadruples: (perm, scal, trg_ptr, src_ptr)
    std::vector<size_t> output_perm; // quadruples: (perm, scal, src_ptr, trg_ptr)
    size_t dof=0, M_dim0=0, M_dim1=0;
    size_t precomp_offset=0;
    size_t buff_size=DEVICE_BUFFER_SIZE*1024l*1024l;
    if(n_out && n_in) for(size_t type_indx=0; type_indx<interac_type_lst.size(); type_indx++){
      Mat_Type& interac_type=interac_type_lst[type_indx];
      size_t mat_cnt=this->interac_list.ListCount(interac_type);
      Matrix<size_t> precomp_data_offset;
      { // Load precomp_data for interac_type.
        // Header layout written by mat->CompactData.
        struct HeaderData{
          size_t total_size;
          size_t level;
          size_t mat_cnt ;
          size_t max_depth;
        };
        Matrix<char>& precomp_data=*setup_data.precomp_data;
        char* indx_ptr=precomp_data[0]+precomp_offset;
        HeaderData& header=*(HeaderData*)indx_ptr;indx_ptr+=sizeof(HeaderData);
        // Row per matrix: [matrix offset] then, per depth,
        // [in-perm, in-scal, out-perm, out-scal] offsets.
        precomp_data_offset.ReInit(header.mat_cnt,(1+(2+2)*header.max_depth), (size_t*)indx_ptr, false);
        precomp_offset+=header.total_size;
      }
      Matrix<FMMNode*> src_interac_list(n_in ,mat_cnt); src_interac_list.SetZero();
      Matrix<FMMNode*> trg_interac_list(n_out,mat_cnt); trg_interac_list.SetZero();
      { // Build trg_interac_list
        #pragma omp parallel for
        for(size_t i=0;i<n_out;i++){
          if(!((FMMNode*)nodes_out[i])->IsGhost() && (level==-1 || ((FMMNode*)nodes_out[i])->Depth()==level)){
            Vector<FMMNode*>& lst=((FMMNode*)nodes_out[i])->interac_list[interac_type];
            mem::memcopy(trg_interac_list[i], lst.Begin(), lst.Dim()*sizeof(FMMNode*));
            assert(lst.Dim()==mat_cnt);
          }
        }
      }
      { // Build src_interac_list
        // Mark every referenced source with the out-of-range id n_in ...
        #pragma omp parallel for
        for(size_t i=0;i<n_out;i++){
          for(size_t j=0;j<mat_cnt;j++)
            if(trg_interac_list[i][j]!=NULL){
              trg_interac_list[i][j]->node_id=n_in;
            }
        }
        // ... then restore true ids for nodes_in; sources outside nodes_in
        // keep n_in and their interactions get dropped below.
        #pragma omp parallel for
        for(size_t i=0;i<n_in ;i++) ((FMMNode*)nodes_in [i])->node_id=i;
        #pragma omp parallel for
        for(size_t i=0;i<n_out;i++){
          for(size_t j=0;j<mat_cnt;j++){
            if(trg_interac_list[i][j]!=NULL){
              if(trg_interac_list[i][j]->node_id==n_in){
                trg_interac_list[i][j]=NULL; // source unavailable; drop interaction
              }else{
                // Invert the mapping: source node -> target node.
                src_interac_list[trg_interac_list[i][j]->node_id][j]=(FMMNode*)nodes_out[i];
              }
            }
          }
        }
      }
      Matrix<size_t> interac_dsp(n_out,mat_cnt); // per-(target,matrix) slot in the block buffer
      std::vector<size_t> interac_blk_dsp(1,0);
      { // Determine dof, M_dim0, M_dim1
        dof=1;
        Matrix<Real_t>& M0 = this->interac_list.ClassMat(level, interac_type_lst[0], 0);
        M_dim0=M0.Dim(0); M_dim1=M0.Dim(1);
      }
      { // Determine interaction blocks which fit in memory.
        size_t vec_size=(M_dim0+M_dim1)*sizeof(Real_t)*dof;
        for(size_t j=0;j<mat_cnt;j++){// Determine minimum buff_size
          size_t vec_cnt=0;
          for(size_t i=0;i<n_out;i++){
            if(trg_interac_list[i][j]!=NULL) vec_cnt++;
          }
          if(buff_size<vec_cnt*vec_size)
            buff_size=vec_cnt*vec_size;
        }
        size_t interac_dsp_=0;
        for(size_t j=0;j<mat_cnt;j++){
          for(size_t i=0;i<n_out;i++){
            interac_dsp[i][j]=interac_dsp_;
            if(trg_interac_list[i][j]!=NULL) interac_dsp_++;
          }
          if(interac_dsp_*vec_size>buff_size) // Comment to disable symmetries.
          {
            // Close the current block and restart displacements at zero.
            interac_blk.push_back(j-interac_blk_dsp.back());
            interac_blk_dsp.push_back(j);
            size_t offset=interac_dsp[0][j];
            for(size_t i=0;i<n_out;i++) interac_dsp[i][j]-=offset;
            interac_dsp_-=offset;
            assert(interac_dsp_*vec_size<=buff_size); // Problem too big for buff_size.
          }
          interac_mat.push_back(precomp_data_offset[this->interac_list.InteracClass(interac_type,j)][0]);
          interac_cnt.push_back(interac_dsp_-interac_dsp[0][j]);
        }
        interac_blk.push_back(mat_cnt-interac_blk_dsp.back());
        interac_blk_dsp.push_back(mat_cnt);
      }
      { // Determine input_perm.
        size_t vec_size=M_dim0*dof;
        for(size_t i=0;i<n_out;i++) ((FMMNode*)nodes_out[i])->node_id=i;
        for(size_t k=1;k<interac_blk_dsp.size();k++){
          for(size_t i=0;i<n_in ;i++){
            for(size_t j=interac_blk_dsp[k-1];j<interac_blk_dsp[k];j++){
              FMMNode_t* trg_node=src_interac_list[i][j];
              if(trg_node!=NULL && trg_node->node_id<n_out){
                // Scale-invariant kernels reuse depth-0 operators.
                size_t depth=(this->ScaleInvar()?trg_node->Depth():0);
                input_perm .push_back(precomp_data_offset[j][1+4*depth+0]); // prem
                input_perm .push_back(precomp_data_offset[j][1+4*depth+1]); // scal
                input_perm .push_back(interac_dsp[trg_node->node_id][j]*vec_size*sizeof(Real_t)); // trg_ptr
                input_perm .push_back((size_t)(input_vector[i]->Begin()- input_data[0])); // src_ptr
                assert(input_vector[i]->Dim()==vec_size);
              }
            }
          }
        }
      }
      { // Determine output_perm
        size_t vec_size=M_dim1*dof;
        for(size_t k=1;k<interac_blk_dsp.size();k++){
          for(size_t i=0;i<n_out;i++){
            for(size_t j=interac_blk_dsp[k-1];j<interac_blk_dsp[k];j++){
              if(trg_interac_list[i][j]!=NULL){
                size_t depth=(this->ScaleInvar()?((FMMNode*)nodes_out[i])->Depth():0);
                output_perm.push_back(precomp_data_offset[j][1+4*depth+2]); // prem
                output_perm.push_back(precomp_data_offset[j][1+4*depth+3]); // scal
                output_perm.push_back(interac_dsp[ i ][j]*vec_size*sizeof(Real_t)); // src_ptr
                output_perm.push_back((size_t)(output_vector[i]->Begin()-output_data[0])); // trg_ptr
                assert(output_vector[i]->Dim()==vec_size);
              }
            }
          }
        }
      }
    }
    if(this->dev_buffer.Dim()<buff_size) this->dev_buffer.ReInit(buff_size);
    { // Set interac_data: serialize everything computed above.
      size_t data_size=sizeof(size_t)*4;
      data_size+=sizeof(size_t)+interac_blk.size()*sizeof(size_t);
      data_size+=sizeof(size_t)+interac_cnt.size()*sizeof(size_t);
      data_size+=sizeof(size_t)+interac_mat.size()*sizeof(size_t);
      data_size+=sizeof(size_t)+ input_perm.size()*sizeof(size_t);
      data_size+=sizeof(size_t)+output_perm.size()*sizeof(size_t);
      if(interac_data.Dim(0)*interac_data.Dim(1)<sizeof(size_t)){
        // Fresh buffer: reserve the leading size_t for the pts-data size.
        data_size+=sizeof(size_t);
        interac_data.ReInit(1,data_size);
        ((size_t*)interac_data.Begin())[0]=sizeof(size_t);
      }else{
        // Existing point-interaction data must be preserved in front.
        size_t pts_data_size=*((size_t*)interac_data.Begin());
        assert(interac_data.Dim(0)*interac_data.Dim(1)>=pts_data_size);
        data_size+=pts_data_size;
        if(data_size>interac_data.Dim(0)*interac_data.Dim(1)){ //Resize and copy interac_data.
          Matrix< char> pts_interac_data=interac_data;
          interac_data.ReInit(1,data_size);
          mem::memcopy(interac_data.Begin(),pts_interac_data.Begin(),pts_data_size);
        }
      }
      char* data_ptr=interac_data.Begin();
      data_ptr+=((size_t*)data_ptr)[0]; // skip existing point-interaction data
      ((size_t*)data_ptr)[0]=data_size; data_ptr+=sizeof(size_t);
      ((size_t*)data_ptr)[0]= M_dim0; data_ptr+=sizeof(size_t);
      ((size_t*)data_ptr)[0]= M_dim1; data_ptr+=sizeof(size_t);
      ((size_t*)data_ptr)[0]= dof; data_ptr+=sizeof(size_t);
      ((size_t*)data_ptr)[0]=interac_blk.size(); data_ptr+=sizeof(size_t);
      mem::memcopy(data_ptr, &interac_blk[0], interac_blk.size()*sizeof(size_t));
      data_ptr+=interac_blk.size()*sizeof(size_t);
      ((size_t*)data_ptr)[0]=interac_cnt.size(); data_ptr+=sizeof(size_t);
      mem::memcopy(data_ptr, &interac_cnt[0], interac_cnt.size()*sizeof(size_t));
      data_ptr+=interac_cnt.size()*sizeof(size_t);
      ((size_t*)data_ptr)[0]=interac_mat.size(); data_ptr+=sizeof(size_t);
      mem::memcopy(data_ptr, &interac_mat[0], interac_mat.size()*sizeof(size_t));
      data_ptr+=interac_mat.size()*sizeof(size_t);
      ((size_t*)data_ptr)[0]= input_perm.size(); data_ptr+=sizeof(size_t);
      mem::memcopy(data_ptr, &input_perm[0], input_perm.size()*sizeof(size_t));
      data_ptr+= input_perm.size()*sizeof(size_t);
      ((size_t*)data_ptr)[0]=output_perm.size(); data_ptr+=sizeof(size_t);
      mem::memcopy(data_ptr, &output_perm[0], output_perm.size()*sizeof(size_t));
      data_ptr+=output_perm.size()*sizeof(size_t);
    }
  }
  Profile::Toc();
  if(device){ // Host2Device
    Profile::Tic("Host2Device",&this->comm,false,25);
    setup_data.interac_data .AllocDevice(true);
    if(staging_buffer.Dim()<sizeof(Real_t)*output_data.Dim(0)*output_data.Dim(1)){
      staging_buffer.ReInit(sizeof(Real_t)*output_data.Dim(0)*output_data.Dim(1));
      staging_buffer.SetZero();
      staging_buffer.AllocDevice(true);
    }
    Profile::Toc();
  }
}
  1576. #if defined(PVFMM_HAVE_CUDA)
  1577. #include <fmm_pts_gpu.hpp>
  1578. template <class Real_t, int SYNC>
  1579. void EvalListGPU(SetupData<Real_t>& setup_data, Vector<char>& dev_buffer, MPI_Comm& comm) {
  1580. cudaStream_t* stream = pvfmm::CUDA_Lock::acquire_stream();
  1581. Profile::Tic("Host2Device",&comm,false,25);
  1582. typename Matrix<char>::Device interac_data;
  1583. typename Vector<char>::Device buff;
  1584. typename Matrix<char>::Device precomp_data_d;
  1585. typename Matrix<char>::Device interac_data_d;
  1586. typename Matrix<Real_t>::Device input_data_d;
  1587. typename Matrix<Real_t>::Device output_data_d;
  1588. interac_data = setup_data.interac_data;
  1589. buff = dev_buffer. AllocDevice(false);
  1590. precomp_data_d= setup_data.precomp_data->AllocDevice(false);
  1591. interac_data_d= setup_data.interac_data. AllocDevice(false);
  1592. input_data_d = setup_data. input_data->AllocDevice(false);
  1593. output_data_d = setup_data. output_data->AllocDevice(false);
  1594. Profile::Toc();
  1595. Profile::Tic("DeviceComp",&comm,false,20);
  1596. { // Offloaded computation.
  1597. size_t data_size, M_dim0, M_dim1, dof;
  1598. Vector<size_t> interac_blk;
  1599. Vector<size_t> interac_cnt;
  1600. Vector<size_t> interac_mat;
  1601. Vector<size_t> input_perm_d;
  1602. Vector<size_t> output_perm_d;
  1603. { // Set interac_data.
  1604. char* data_ptr=&interac_data [0][0];
  1605. char* dev_ptr=&interac_data_d[0][0];
  1606. data_size=((size_t*)data_ptr)[0]; data_ptr+=data_size; dev_ptr += data_size;
  1607. data_size=((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t); dev_ptr += sizeof(size_t);
  1608. M_dim0 =((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t); dev_ptr += sizeof(size_t);
  1609. M_dim1 =((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t); dev_ptr += sizeof(size_t);
  1610. dof =((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t); dev_ptr += sizeof(size_t);
  1611. interac_blk.ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false);
  1612. data_ptr += sizeof(size_t) + sizeof(size_t)*interac_blk.Dim();
  1613. dev_ptr += sizeof(size_t) + sizeof(size_t)*interac_blk.Dim();
  1614. interac_cnt.ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false);
  1615. data_ptr += sizeof(size_t) + sizeof(size_t)*interac_cnt.Dim();
  1616. dev_ptr += sizeof(size_t) + sizeof(size_t)*interac_cnt.Dim();
  1617. interac_mat.ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false);
  1618. data_ptr += sizeof(size_t) + sizeof(size_t)*interac_mat.Dim();
  1619. dev_ptr += sizeof(size_t) + sizeof(size_t)*interac_mat.Dim();
  1620. input_perm_d.ReInit(((size_t*)data_ptr)[0],(size_t*)(dev_ptr+sizeof(size_t)),false);
  1621. data_ptr += sizeof(size_t) + sizeof(size_t)*input_perm_d.Dim();
  1622. dev_ptr += sizeof(size_t) + sizeof(size_t)*input_perm_d.Dim();
  1623. output_perm_d.ReInit(((size_t*)data_ptr)[0],(size_t*)(dev_ptr+sizeof(size_t)),false);
  1624. data_ptr += sizeof(size_t) + sizeof(size_t)*output_perm_d.Dim();
  1625. dev_ptr += sizeof(size_t) + sizeof(size_t)*output_perm_d.Dim();
  1626. }
  1627. { // interactions
  1628. size_t interac_indx = 0;
  1629. size_t interac_blk_dsp = 0;
  1630. cudaError_t error;
  1631. for (size_t k = 0; k < interac_blk.Dim(); k++) {
  1632. size_t vec_cnt=0;
  1633. for(size_t j=interac_blk_dsp;j<interac_blk_dsp+interac_blk[k];j++) vec_cnt+=interac_cnt[j];
  1634. if(vec_cnt==0){
  1635. //interac_indx += vec_cnt;
  1636. interac_blk_dsp += interac_blk[k];
  1637. continue;
  1638. }
  1639. char *buff_in_d =&buff[0];
  1640. char *buff_out_d =&buff[vec_cnt*dof*M_dim0*sizeof(Real_t)];
  1641. { // Input permutation.
  1642. in_perm_gpu<Real_t>(&precomp_data_d[0][0], &input_data_d[0][0], buff_in_d,
  1643. &input_perm_d[interac_indx*4], vec_cnt, M_dim0, stream);
  1644. }
  1645. size_t vec_cnt0 = 0;
  1646. for (size_t j = interac_blk_dsp; j < interac_blk_dsp + interac_blk[k];) {
  1647. size_t vec_cnt1 = 0;
  1648. size_t interac_mat0 = interac_mat[j];
  1649. for (; j < interac_blk_dsp + interac_blk[k] && interac_mat[j] == interac_mat0; j++) vec_cnt1 += interac_cnt[j];
  1650. Matrix<Real_t> M_d(M_dim0, M_dim1, (Real_t*)(precomp_data_d.dev_ptr + interac_mat0), false);
  1651. Matrix<Real_t> Ms_d(dof*vec_cnt1, M_dim0, (Real_t*)(buff_in_d + M_dim0*vec_cnt0*dof*sizeof(Real_t)), false);
  1652. Matrix<Real_t> Mt_d(dof*vec_cnt1, M_dim1, (Real_t*)(buff_out_d + M_dim1*vec_cnt0*dof*sizeof(Real_t)), false);
  1653. Matrix<Real_t>::CUBLASGEMM(Mt_d, Ms_d, M_d);
  1654. vec_cnt0 += vec_cnt1;
  1655. }
  1656. { // Output permutation.
  1657. out_perm_gpu<Real_t>(&precomp_data_d[0][0], &output_data_d[0][0], buff_out_d,
  1658. &output_perm_d[interac_indx*4], vec_cnt, M_dim1, stream);
  1659. }
  1660. interac_indx += vec_cnt;
  1661. interac_blk_dsp += interac_blk[k];
  1662. }
  1663. }
  1664. }
  1665. Profile::Toc();
  1666. if(SYNC) CUDA_Lock::wait();
  1667. }
  1668. #endif
  1669. template <class FMMNode>
  1670. template <int SYNC>
  1671. void FMM_Pts<FMMNode>::EvalList(SetupData<Real_t>& setup_data, bool device){
  1672. if(setup_data.interac_data.Dim(0)==0 || setup_data.interac_data.Dim(1)==0){
  1673. Profile::Tic("Host2Device",&this->comm,false,25);
  1674. Profile::Toc();
  1675. Profile::Tic("DeviceComp",&this->comm,false,20);
  1676. Profile::Toc();
  1677. return;
  1678. }
  1679. #if defined(PVFMM_HAVE_CUDA)
  1680. if (device) {
  1681. EvalListGPU<Real_t, SYNC>(setup_data, this->dev_buffer, this->comm);
  1682. return;
  1683. }
  1684. #endif
  1685. Profile::Tic("Host2Device",&this->comm,false,25);
  1686. typename Vector<char>::Device buff;
  1687. typename Matrix<char>::Device precomp_data;
  1688. typename Matrix<char>::Device interac_data;
  1689. typename Matrix<Real_t>::Device input_data;
  1690. typename Matrix<Real_t>::Device output_data;
  1691. if(device){
  1692. buff = this-> dev_buffer. AllocDevice(false);
  1693. precomp_data= setup_data.precomp_data->AllocDevice(false);
  1694. interac_data= setup_data.interac_data. AllocDevice(false);
  1695. input_data = setup_data. input_data->AllocDevice(false);
  1696. output_data = setup_data. output_data->AllocDevice(false);
  1697. }else{
  1698. buff = this-> dev_buffer;
  1699. precomp_data=*setup_data.precomp_data;
  1700. interac_data= setup_data.interac_data;
  1701. input_data =*setup_data. input_data;
  1702. output_data =*setup_data. output_data;
  1703. }
  1704. Profile::Toc();
  1705. Profile::Tic("DeviceComp",&this->comm,false,20);
  1706. int lock_idx=-1;
  1707. int wait_lock_idx=-1;
  1708. if(device) wait_lock_idx=MIC_Lock::curr_lock();
  1709. if(device) lock_idx=MIC_Lock::get_lock();
  1710. #ifdef __INTEL_OFFLOAD
  1711. #pragma offload if(device) target(mic:0) signal(&MIC_Lock::lock_vec[device?lock_idx:0])
  1712. #endif
  1713. { // Offloaded computation.
  1714. // Set interac_data.
  1715. size_t data_size, M_dim0, M_dim1, dof;
  1716. Vector<size_t> interac_blk;
  1717. Vector<size_t> interac_cnt;
  1718. Vector<size_t> interac_mat;
  1719. Vector<size_t> input_perm;
  1720. Vector<size_t> output_perm;
  1721. { // Set interac_data.
  1722. char* data_ptr=&interac_data[0][0];
  1723. data_size=((size_t*)data_ptr)[0]; data_ptr+=data_size;
  1724. data_size=((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t);
  1725. M_dim0 =((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t);
  1726. M_dim1 =((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t);
  1727. dof =((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t);
  1728. interac_blk.ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false);
  1729. data_ptr+=sizeof(size_t)+interac_blk.Dim()*sizeof(size_t);
  1730. interac_cnt.ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false);
  1731. data_ptr+=sizeof(size_t)+interac_cnt.Dim()*sizeof(size_t);
  1732. interac_mat.ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false);
  1733. data_ptr+=sizeof(size_t)+interac_mat.Dim()*sizeof(size_t);
  1734. input_perm .ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false);
  1735. data_ptr+=sizeof(size_t)+ input_perm.Dim()*sizeof(size_t);
  1736. output_perm.ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false);
  1737. data_ptr+=sizeof(size_t)+output_perm.Dim()*sizeof(size_t);
  1738. }
  1739. if(device) MIC_Lock::wait_lock(wait_lock_idx);
  1740. //Compute interaction from Chebyshev source density.
  1741. { // interactions
  1742. int omp_p=omp_get_max_threads();
  1743. size_t interac_indx=0;
  1744. size_t interac_blk_dsp=0;
  1745. for(size_t k=0;k<interac_blk.Dim();k++){
  1746. size_t vec_cnt=0;
  1747. for(size_t j=interac_blk_dsp;j<interac_blk_dsp+interac_blk[k];j++) vec_cnt+=interac_cnt[j];
  1748. if(vec_cnt==0){
  1749. //interac_indx += vec_cnt;
  1750. interac_blk_dsp += interac_blk[k];
  1751. continue;
  1752. }
  1753. char* buff_in =&buff[0];
  1754. char* buff_out=&buff[vec_cnt*dof*M_dim0*sizeof(Real_t)];
  1755. // Input permutation.
  1756. #pragma omp parallel for
  1757. for(int tid=0;tid<omp_p;tid++){
  1758. size_t a=( tid *vec_cnt)/omp_p;
  1759. size_t b=((tid+1)*vec_cnt)/omp_p;
  1760. for(size_t i=a;i<b;i++){
  1761. const PERM_INT_T* perm=(PERM_INT_T*)(precomp_data[0]+input_perm[(interac_indx+i)*4+0]);
  1762. const Real_t* scal=( Real_t*)(precomp_data[0]+input_perm[(interac_indx+i)*4+1]);
  1763. const Real_t* v_in =( Real_t*)( input_data[0]+input_perm[(interac_indx+i)*4+3]);
  1764. Real_t* v_out=( Real_t*)( buff_in +input_perm[(interac_indx+i)*4+2]);
  1765. // TODO: Fix for dof>1
  1766. #ifdef __MIC__
  1767. {
  1768. __m512d v8;
  1769. size_t j_start=(((uintptr_t)(v_out ) + (uintptr_t)(MEM_ALIGN-1)) & ~ (uintptr_t)(MEM_ALIGN-1))-((uintptr_t)v_out);
  1770. size_t j_end =(((uintptr_t)(v_out+M_dim0) ) & ~ (uintptr_t)(MEM_ALIGN-1))-((uintptr_t)v_out);
  1771. j_start/=sizeof(Real_t);
  1772. j_end /=sizeof(Real_t);
  1773. assert(((uintptr_t)(v_out))%sizeof(Real_t)==0);
  1774. assert(((uintptr_t)(v_out+j_start))%64==0);
  1775. assert(((uintptr_t)(v_out+j_end ))%64==0);
  1776. size_t j=0;
  1777. for(;j<j_start;j++ ){
  1778. v_out[j]=v_in[perm[j]]*scal[j];
  1779. }
  1780. for(;j<j_end ;j+=8){
  1781. v8=_mm512_setr_pd(
  1782. v_in[perm[j+0]]*scal[j+0],
  1783. v_in[perm[j+1]]*scal[j+1],
  1784. v_in[perm[j+2]]*scal[j+2],
  1785. v_in[perm[j+3]]*scal[j+3],
  1786. v_in[perm[j+4]]*scal[j+4],
  1787. v_in[perm[j+5]]*scal[j+5],
  1788. v_in[perm[j+6]]*scal[j+6],
  1789. v_in[perm[j+7]]*scal[j+7]);
  1790. _mm512_storenrngo_pd(v_out+j,v8);
  1791. }
  1792. for(;j<M_dim0 ;j++ ){
  1793. v_out[j]=v_in[perm[j]]*scal[j];
  1794. }
  1795. }
  1796. #else
  1797. for(size_t j=0;j<M_dim0;j++ ){
  1798. v_out[j]=v_in[perm[j]]*scal[j];
  1799. }
  1800. #endif
  1801. }
  1802. }
  1803. size_t vec_cnt0=0;
  1804. for(size_t j=interac_blk_dsp;j<interac_blk_dsp+interac_blk[k];){
  1805. size_t vec_cnt1=0;
  1806. size_t interac_mat0=interac_mat[j];
  1807. for(;j<interac_blk_dsp+interac_blk[k] && interac_mat[j]==interac_mat0;j++) vec_cnt1+=interac_cnt[j];
  1808. Matrix<Real_t> M(M_dim0, M_dim1, (Real_t*)(precomp_data[0]+interac_mat0), false);
  1809. #ifdef __MIC__
  1810. {
  1811. Matrix<Real_t> Ms(dof*vec_cnt1, M_dim0, (Real_t*)(buff_in +M_dim0*vec_cnt0*dof*sizeof(Real_t)), false);
  1812. Matrix<Real_t> Mt(dof*vec_cnt1, M_dim1, (Real_t*)(buff_out+M_dim1*vec_cnt0*dof*sizeof(Real_t)), false);
  1813. Matrix<Real_t>::GEMM(Mt,Ms,M);
  1814. }
  1815. #else
  1816. #pragma omp parallel for
  1817. for(int tid=0;tid<omp_p;tid++){
  1818. size_t a=(dof*vec_cnt1*(tid ))/omp_p;
  1819. size_t b=(dof*vec_cnt1*(tid+1))/omp_p;
  1820. Matrix<Real_t> Ms(b-a, M_dim0, (Real_t*)(buff_in +M_dim0*vec_cnt0*dof*sizeof(Real_t))+M_dim0*a, false);
  1821. Matrix<Real_t> Mt(b-a, M_dim1, (Real_t*)(buff_out+M_dim1*vec_cnt0*dof*sizeof(Real_t))+M_dim1*a, false);
  1822. Matrix<Real_t>::GEMM(Mt,Ms,M);
  1823. }
  1824. #endif
  1825. vec_cnt0+=vec_cnt1;
  1826. }
  1827. // Output permutation.
  1828. #pragma omp parallel for
  1829. for(int tid=0;tid<omp_p;tid++){
  1830. size_t a=( tid *vec_cnt)/omp_p;
  1831. size_t b=((tid+1)*vec_cnt)/omp_p;
  1832. if(tid> 0 && a<vec_cnt){ // Find 'a' independent of other threads.
  1833. size_t out_ptr=output_perm[(interac_indx+a)*4+3];
  1834. if(tid> 0) while(a<vec_cnt && out_ptr==output_perm[(interac_indx+a)*4+3]) a++;
  1835. }
  1836. if(tid<omp_p-1 && b<vec_cnt){ // Find 'b' independent of other threads.
  1837. size_t out_ptr=output_perm[(interac_indx+b)*4+3];
  1838. if(tid<omp_p-1) while(b<vec_cnt && out_ptr==output_perm[(interac_indx+b)*4+3]) b++;
  1839. }
  1840. for(size_t i=a;i<b;i++){ // Compute permutations.
  1841. const PERM_INT_T* perm=(PERM_INT_T*)(precomp_data[0]+output_perm[(interac_indx+i)*4+0]);
  1842. const Real_t* scal=( Real_t*)(precomp_data[0]+output_perm[(interac_indx+i)*4+1]);
  1843. const Real_t* v_in =( Real_t*)( buff_out +output_perm[(interac_indx+i)*4+2]);
  1844. Real_t* v_out=( Real_t*)( output_data[0]+output_perm[(interac_indx+i)*4+3]);
  1845. // TODO: Fix for dof>1
  1846. #ifdef __MIC__
  1847. {
  1848. __m512d v8;
  1849. __m512d v_old;
  1850. size_t j_start=(((uintptr_t)(v_out ) + (uintptr_t)(MEM_ALIGN-1)) & ~ (uintptr_t)(MEM_ALIGN-1))-((uintptr_t)v_out);
  1851. size_t j_end =(((uintptr_t)(v_out+M_dim1) ) & ~ (uintptr_t)(MEM_ALIGN-1))-((uintptr_t)v_out);
  1852. j_start/=sizeof(Real_t);
  1853. j_end /=sizeof(Real_t);
  1854. assert(((uintptr_t)(v_out))%sizeof(Real_t)==0);
  1855. assert(((uintptr_t)(v_out+j_start))%64==0);
  1856. assert(((uintptr_t)(v_out+j_end ))%64==0);
  1857. size_t j=0;
  1858. for(;j<j_start;j++ ){
  1859. v_out[j]+=v_in[perm[j]]*scal[j];
  1860. }
  1861. for(;j<j_end ;j+=8){
  1862. v_old=_mm512_load_pd(v_out+j);
  1863. v8=_mm512_setr_pd(
  1864. v_in[perm[j+0]]*scal[j+0],
  1865. v_in[perm[j+1]]*scal[j+1],
  1866. v_in[perm[j+2]]*scal[j+2],
  1867. v_in[perm[j+3]]*scal[j+3],
  1868. v_in[perm[j+4]]*scal[j+4],
  1869. v_in[perm[j+5]]*scal[j+5],
  1870. v_in[perm[j+6]]*scal[j+6],
  1871. v_in[perm[j+7]]*scal[j+7]);
  1872. v_old=_mm512_add_pd(v_old, v8);
  1873. _mm512_storenrngo_pd(v_out+j,v_old);
  1874. }
  1875. for(;j<M_dim1 ;j++ ){
  1876. v_out[j]+=v_in[perm[j]]*scal[j];
  1877. }
  1878. }
  1879. #else
  1880. for(size_t j=0;j<M_dim1;j++ ){
  1881. v_out[j]+=v_in[perm[j]]*scal[j];
  1882. }
  1883. #endif
  1884. }
  1885. }
  1886. interac_indx+=vec_cnt;
  1887. interac_blk_dsp+=interac_blk[k];
  1888. }
  1889. }
  1890. if(device) MIC_Lock::release_lock(lock_idx);
  1891. }
  1892. #ifdef __INTEL_OFFLOAD
  1893. if(SYNC){
  1894. #pragma offload if(device) target(mic:0)
  1895. {if(device) MIC_Lock::wait_lock(lock_idx);}
  1896. }
  1897. #endif
  1898. Profile::Toc();
  1899. }
// Prepare the S2M (source point -> upward multipole) stage: select the
// participating leaf nodes, record flat offsets of each node's point
// coordinates/densities inside the shared backing matrices, build per-target
// interaction lists, and hand everything to PtSetup() for packing.
template <class FMMNode>
void FMM_Pts<FMMNode>::Source2UpSetup(SetupData<Real_t>& setup_data, FMMTree_t* tree, std::vector<Matrix<Real_t> >& buff, std::vector<Vector<FMMNode_t*> >& n_list, int level, bool device){
if(!this->MultipoleOrder()) return; // Expansions disabled; nothing to set up.
{ // Set setup_data
setup_data. level=level;
setup_data.kernel=kernel->k_s2m;
setup_data. input_data=&buff[4]; // point densities (src/srf values) live here
setup_data.output_data=&buff[0]; // upward-equivalent coefficients accumulate here
setup_data. coord_data=&buff[6]; // point coordinates
Vector<FMMNode_t*>& nodes_in =n_list[4];
Vector<FMMNode_t*>& nodes_out=n_list[0];
setup_data.nodes_in .clear();
setup_data.nodes_out.clear();
// Keep non-ghost leaves at the requested depth (level==-1 => all depths)
// that actually own source and/or surface points.
for(size_t i=0;i<nodes_in .Dim();i++) if((nodes_in [i]->Depth()==level || level==-1) && (nodes_in [i]->src_coord.Dim() || nodes_in [i]->surf_coord.Dim()) && nodes_in [i]->IsLeaf() && !nodes_in [i]->IsGhost()) setup_data.nodes_in .push_back(nodes_in [i]);
for(size_t i=0;i<nodes_out.Dim();i++) if((nodes_out[i]->Depth()==level || level==-1) && (nodes_out[i]->src_coord.Dim() || nodes_out[i]->surf_coord.Dim()) && nodes_out[i]->IsLeaf() && !nodes_out[i]->IsGhost()) setup_data.nodes_out.push_back(nodes_out[i]);
}
// Local helper structs describing data packed into flat arrays.
struct PackedData{
size_t len; // total element count of the backing matrix
Matrix<Real_t>* ptr; // backing matrix holding all nodes' data
Vector<size_t> cnt; // per-node element count
Vector<size_t> dsp; // per-node displacement (offset) into *ptr
};
struct InteracData{
Vector<size_t> in_node; // source node index (into nodes_in) per interaction
Vector<size_t> scal_idx; // scaling index (source node depth) per interaction
Vector<Real_t> coord_shift; // 3 Real_t per interaction: source->target frame shift
Vector<size_t> interac_cnt; // number of interactions per target node
Vector<size_t> interac_dsp; // exclusive scan of interac_cnt
Vector<size_t> interac_cst; // NOTE(review): not populated here; presumably filled downstream (PtSetup/EvalListPts) — confirm.
Vector<Real_t> scal[4*MAX_DEPTH]; // per-depth scaling tables, 4 slots per depth
Matrix<Real_t> M[4]; // optional matrices applied to the result (M[2],M[3] set below)
};
struct ptSetupData{
int level;
const Kernel<Real_t>* kernel;
PackedData src_coord; // Src coord
PackedData src_value; // Src density
PackedData srf_coord; // Srf coord
PackedData srf_value; // Srf density
PackedData trg_coord; // Trg coord
PackedData trg_value; // Trg potential
InteracData interac_data;
};
ptSetupData data;
data. level=setup_data. level;
data.kernel=setup_data.kernel;
std::vector<void*>& nodes_in =setup_data.nodes_in ;
std::vector<void*>& nodes_out=setup_data.nodes_out;
{ // Set src data
// Record, for every input node, where its source points/densities sit inside
// the shared matrices (offsets computed by pointer difference).
std::vector<void*>& nodes=nodes_in;
PackedData& coord=data.src_coord;
PackedData& value=data.src_value;
coord.ptr=setup_data. coord_data;
value.ptr=setup_data. input_data;
coord.len=coord.ptr->Dim(0)*coord.ptr->Dim(1);
value.len=value.ptr->Dim(0)*value.ptr->Dim(1);
coord.cnt.ReInit(nodes.size());
coord.dsp.ReInit(nodes.size());
value.cnt.ReInit(nodes.size());
value.dsp.ReInit(nodes.size());
#pragma omp parallel for
for(size_t i=0;i<nodes.size();i++){
((FMMNode_t*)nodes[i])->node_id=i; // remember index so interaction lists can refer back to nodes_in
Vector<Real_t>& coord_vec=((FMMNode*)nodes[i])->src_coord;
Vector<Real_t>& value_vec=((FMMNode*)nodes[i])->src_value;
if(coord_vec.Dim()){
coord.dsp[i]=&coord_vec[0]-coord.ptr[0][0]; // offset of this node's data in the backing matrix
assert(coord.dsp[i]<coord.len);
coord.cnt[i]=coord_vec.Dim();
}else{
coord.dsp[i]=0;
coord.cnt[i]=0;
}
if(value_vec.Dim()){
value.dsp[i]=&value_vec[0]-value.ptr[0][0];
assert(value.dsp[i]<value.len);
value.cnt[i]=value_vec.Dim();
}else{
value.dsp[i]=0;
value.cnt[i]=0;
}
}
}
{ // Set srf data
// Same as above, for surface (double-layer) points of the input nodes.
std::vector<void*>& nodes=nodes_in;
PackedData& coord=data.srf_coord;
PackedData& value=data.srf_value;
coord.ptr=setup_data. coord_data;
value.ptr=setup_data. input_data;
coord.len=coord.ptr->Dim(0)*coord.ptr->Dim(1);
value.len=value.ptr->Dim(0)*value.ptr->Dim(1);
coord.cnt.ReInit(nodes.size());
coord.dsp.ReInit(nodes.size());
value.cnt.ReInit(nodes.size());
value.dsp.ReInit(nodes.size());
#pragma omp parallel for
for(size_t i=0;i<nodes.size();i++){
Vector<Real_t>& coord_vec=((FMMNode*)nodes[i])->surf_coord;
Vector<Real_t>& value_vec=((FMMNode*)nodes[i])->surf_value;
if(coord_vec.Dim()){
coord.dsp[i]=&coord_vec[0]-coord.ptr[0][0];
assert(coord.dsp[i]<coord.len);
coord.cnt[i]=coord_vec.Dim();
}else{
coord.dsp[i]=0;
coord.cnt[i]=0;
}
if(value_vec.Dim()){
value.dsp[i]=&value_vec[0]-value.ptr[0][0];
assert(value.dsp[i]<value.len);
value.cnt[i]=value_vec.Dim();
}else{
value.dsp[i]=0;
value.cnt[i]=0;
}
}
}
{ // Set trg data
// Targets: the "coordinates" are the shared upward-check surface for the
// node's depth, and the "values" are the node's upward-equivalent density.
std::vector<void*>& nodes=nodes_out;
PackedData& coord=data.trg_coord;
PackedData& value=data.trg_value;
coord.ptr=setup_data. coord_data;
value.ptr=setup_data.output_data;
coord.len=coord.ptr->Dim(0)*coord.ptr->Dim(1);
value.len=value.ptr->Dim(0)*value.ptr->Dim(1);
coord.cnt.ReInit(nodes.size());
coord.dsp.ReInit(nodes.size());
value.cnt.ReInit(nodes.size());
value.dsp.ReInit(nodes.size());
#pragma omp parallel for
for(size_t i=0;i<nodes.size();i++){
Vector<Real_t>& coord_vec=tree->upwd_check_surf[((FMMNode*)nodes[i])->Depth()];
Vector<Real_t>& value_vec=((FMMData*)((FMMNode*)nodes[i])->FMMData())->upward_equiv;
if(coord_vec.Dim()){
coord.dsp[i]=&coord_vec[0]-coord.ptr[0][0];
assert(coord.dsp[i]<coord.len);
coord.cnt[i]=coord_vec.Dim();
}else{
coord.dsp[i]=0;
coord.cnt[i]=0;
}
if(value_vec.Dim()){
value.dsp[i]=&value_vec[0]-value.ptr[0][0];
assert(value.dsp[i]<value.len);
value.cnt[i]=value_vec.Dim();
}else{
value.dsp[i]=0;
value.cnt[i]=0;
}
}
}
{ // Set interac_data
int omp_p=omp_get_max_threads();
// Per-thread staging vectors; merged into single flat arrays below.
std::vector<std::vector<size_t> > in_node_(omp_p);
std::vector<std::vector<size_t> > scal_idx_(omp_p);
std::vector<std::vector<Real_t> > coord_shift_(omp_p);
std::vector<std::vector<size_t> > interac_cnt_(omp_p);
if(this->ScaleInvar()){ // Set scal
// For scale-invariant kernels precompute per-depth scaling factors
// 2^(-scal_exp*l) for targets (slot 2) and sources (slot 3).
const Kernel<Real_t>* ker=kernel->k_m2m;
for(size_t l=0;l<MAX_DEPTH;l++){ // scal[l*4+2]
Vector<Real_t>& scal=data.interac_data.scal[l*4+2];
Vector<Real_t>& scal_exp=ker->trg_scal;
scal.ReInit(scal_exp.Dim());
for(size_t i=0;i<scal.Dim();i++){
scal[i]=pvfmm::pow<Real_t>(2.0,-scal_exp[i]*l);
}
}
for(size_t l=0;l<MAX_DEPTH;l++){ // scal[l*4+3]
Vector<Real_t>& scal=data.interac_data.scal[l*4+3];
Vector<Real_t>& scal_exp=ker->src_scal;
scal.ReInit(scal_exp.Dim());
for(size_t i=0;i<scal.Dim();i++){
scal[i]=pvfmm::pow<Real_t>(2.0,-scal_exp[i]*l);
}
}
}
#pragma omp parallel for
for(size_t tid=0;tid<omp_p;tid++){
// Each thread handles a contiguous slice of target nodes and records its
// interactions into its private staging vectors.
std::vector<size_t>& in_node =in_node_[tid] ;
std::vector<size_t>& scal_idx =scal_idx_[tid] ;
std::vector<Real_t>& coord_shift=coord_shift_[tid];
std::vector<size_t>& interac_cnt=interac_cnt_[tid];
size_t a=(nodes_out.size()*(tid+0))/omp_p;
size_t b=(nodes_out.size()*(tid+1))/omp_p;
for(size_t i=a;i<b;i++){
FMMNode_t* tnode=(FMMNode_t*)nodes_out[i];
Real_t s=pvfmm::pow<Real_t>(0.5,tnode->Depth()); // box size at this depth
size_t interac_cnt_=0;
{ // S2U_Type
Mat_Type type=S2U_Type;
Vector<FMMNode_t*>& intlst=tnode->interac_list[type];
for(size_t j=0;j<intlst.Dim();j++) if(intlst[j]){
FMMNode_t* snode=intlst[j];
size_t snode_id=snode->node_id;
// Skip sources that were not selected into nodes_in (node_id stale or mismatched).
if(snode_id>=nodes_in.size() || nodes_in[snode_id]!=snode) continue;
in_node.push_back(snode_id);
scal_idx.push_back(snode->Depth());
{ // set coord_shift
// Translation that moves the source node's points into the
// target's local frame, using the relative octant coordinates.
const int* rel_coord=interac_list.RelativeCoord(type,j);
const Real_t* scoord=snode->Coord();
const Real_t* tcoord=tnode->Coord();
Real_t shift[COORD_DIM];
shift[0]=rel_coord[0]*0.5*s-(scoord[0]+0.5*s)+(0+0.5*s);
shift[1]=rel_coord[1]*0.5*s-(scoord[1]+0.5*s)+(0+0.5*s);
shift[2]=rel_coord[2]*0.5*s-(scoord[2]+0.5*s)+(0+0.5*s);
coord_shift.push_back(shift[0]);
coord_shift.push_back(shift[1]);
coord_shift.push_back(shift[2]);
}
interac_cnt_++;
}
}
interac_cnt.push_back(interac_cnt_);
}
}
{ // Combine interac data
// Concatenate the per-thread staging vectors into the flat arrays of
// interac_data (same displacement-scan + parallel memcpy pattern for each).
InteracData& interac_data=data.interac_data;
{ // in_node
typedef size_t ElemType;
std::vector<std::vector<ElemType> >& vec_=in_node_;
pvfmm::Vector<ElemType>& vec=interac_data.in_node;
std::vector<size_t> vec_dsp(omp_p+1,0);
for(size_t tid=0;tid<omp_p;tid++){
vec_dsp[tid+1]=vec_dsp[tid]+vec_[tid].size();
}
vec.ReInit(vec_dsp[omp_p]);
#pragma omp parallel for
for(size_t tid=0;tid<omp_p;tid++){
memcpy(&vec[0]+vec_dsp[tid],&vec_[tid][0],vec_[tid].size()*sizeof(ElemType));
}
}
{ // scal_idx
typedef size_t ElemType;
std::vector<std::vector<ElemType> >& vec_=scal_idx_;
pvfmm::Vector<ElemType>& vec=interac_data.scal_idx;
std::vector<size_t> vec_dsp(omp_p+1,0);
for(size_t tid=0;tid<omp_p;tid++){
vec_dsp[tid+1]=vec_dsp[tid]+vec_[tid].size();
}
vec.ReInit(vec_dsp[omp_p]);
#pragma omp parallel for
for(size_t tid=0;tid<omp_p;tid++){
memcpy(&vec[0]+vec_dsp[tid],&vec_[tid][0],vec_[tid].size()*sizeof(ElemType));
}
}
{ // coord_shift
typedef Real_t ElemType;
std::vector<std::vector<ElemType> >& vec_=coord_shift_;
pvfmm::Vector<ElemType>& vec=interac_data.coord_shift;
std::vector<size_t> vec_dsp(omp_p+1,0);
for(size_t tid=0;tid<omp_p;tid++){
vec_dsp[tid+1]=vec_dsp[tid]+vec_[tid].size();
}
vec.ReInit(vec_dsp[omp_p]);
#pragma omp parallel for
for(size_t tid=0;tid<omp_p;tid++){
memcpy(&vec[0]+vec_dsp[tid],&vec_[tid][0],vec_[tid].size()*sizeof(ElemType));
}
}
{ // interac_cnt
typedef size_t ElemType;
std::vector<std::vector<ElemType> >& vec_=interac_cnt_;
pvfmm::Vector<ElemType>& vec=interac_data.interac_cnt;
std::vector<size_t> vec_dsp(omp_p+1,0);
for(size_t tid=0;tid<omp_p;tid++){
vec_dsp[tid+1]=vec_dsp[tid]+vec_[tid].size();
}
vec.ReInit(vec_dsp[omp_p]);
#pragma omp parallel for
for(size_t tid=0;tid<omp_p;tid++){
memcpy(&vec[0]+vec_dsp[tid],&vec_[tid][0],vec_[tid].size()*sizeof(ElemType));
}
}
{ // interac_dsp
// Exclusive prefix sum of interaction counts -> per-target displacement.
pvfmm::Vector<size_t>& cnt=interac_data.interac_cnt;
pvfmm::Vector<size_t>& dsp=interac_data.interac_dsp;
dsp.ReInit(cnt.Dim()); if(dsp.Dim()) dsp[0]=0;
omp_par::scan(&cnt[0],&dsp[0],dsp.Dim());
}
}
{ // Set M[2], M[3]
// Attach the precomputed UC2UE0/UC2UE1 matrices (upward-check to
// upward-equivalent conversion) only when at least one interaction exists.
InteracData& interac_data=data.interac_data;
pvfmm::Vector<size_t>& cnt=interac_data.interac_cnt;
pvfmm::Vector<size_t>& dsp=interac_data.interac_dsp;
if(cnt.Dim() && cnt[cnt.Dim()-1]+dsp[dsp.Dim()-1]){
data.interac_data.M[2]=this->mat->Mat(level, UC2UE0_Type, 0);
data.interac_data.M[3]=this->mat->Mat(level, UC2UE1_Type, 0);
}else{
data.interac_data.M[2].ReInit(0,0);
data.interac_data.M[3].ReInit(0,0);
}
}
}
PtSetup(setup_data, &data);
}
  2195. template <class FMMNode>
  2196. void FMM_Pts<FMMNode>::Source2Up(SetupData<Real_t>& setup_data, bool device){
  2197. if(!this->MultipoleOrder()) return;
  2198. //Add Source2Up contribution.
  2199. this->EvalListPts(setup_data, device);
  2200. }
  2201. template <class FMMNode>
  2202. void FMM_Pts<FMMNode>::Up2UpSetup(SetupData<Real_t>& setup_data, FMMTree_t* tree, std::vector<Matrix<Real_t> >& buff, std::vector<Vector<FMMNode_t*> >& n_list, int level, bool device){
  2203. if(!this->MultipoleOrder()) return;
  2204. { // Set setup_data
  2205. setup_data.level=level;
  2206. setup_data.kernel=kernel->k_m2m;
  2207. setup_data.interac_type.resize(1);
  2208. setup_data.interac_type[0]=U2U_Type;
  2209. setup_data. input_data=&buff[0];
  2210. setup_data.output_data=&buff[0];
  2211. Vector<FMMNode_t*>& nodes_in =n_list[0];
  2212. Vector<FMMNode_t*>& nodes_out=n_list[0];
  2213. setup_data.nodes_in .clear();
  2214. setup_data.nodes_out.clear();
  2215. for(size_t i=0;i<nodes_in .Dim();i++) if((nodes_in [i]->Depth()==level+1) && nodes_in [i]->pt_cnt[0]) setup_data.nodes_in .push_back(nodes_in [i]);
  2216. for(size_t i=0;i<nodes_out.Dim();i++) if((nodes_out[i]->Depth()==level ) && nodes_out[i]->pt_cnt[0]) setup_data.nodes_out.push_back(nodes_out[i]);
  2217. }
  2218. std::vector<void*>& nodes_in =setup_data.nodes_in ;
  2219. std::vector<void*>& nodes_out=setup_data.nodes_out;
  2220. std::vector<Vector<Real_t>*>& input_vector=setup_data. input_vector; input_vector.clear();
  2221. std::vector<Vector<Real_t>*>& output_vector=setup_data.output_vector; output_vector.clear();
  2222. for(size_t i=0;i<nodes_in .size();i++) input_vector.push_back(&((FMMData*)((FMMNode*)nodes_in [i])->FMMData())->upward_equiv);
  2223. for(size_t i=0;i<nodes_out.size();i++) output_vector.push_back(&((FMMData*)((FMMNode*)nodes_out[i])->FMMData())->upward_equiv);
  2224. SetupInterac(setup_data,device);
  2225. }
  2226. template <class FMMNode>
  2227. void FMM_Pts<FMMNode>::Up2Up (SetupData<Real_t>& setup_data, bool device){
  2228. if(!this->MultipoleOrder()) return;
  2229. //Add Up2Up contribution.
  2230. EvalList(setup_data, device);
  2231. }
  2232. template <class FMMNode>
  2233. void FMM_Pts<FMMNode>::PeriodicBC(FMMNode* node){
  2234. if(!this->ScaleInvar() || this->MultipoleOrder()==0) return;
  2235. Matrix<Real_t>& M = Precomp(0, BC_Type, 0);
  2236. assert(node->FMMData()->upward_equiv.Dim()>0);
  2237. int dof=1;
  2238. Vector<Real_t>& upward_equiv=node->FMMData()->upward_equiv;
  2239. Vector<Real_t>& dnward_equiv=node->FMMData()->dnward_equiv;
  2240. assert(upward_equiv.Dim()==M.Dim(0)*dof);
  2241. assert(dnward_equiv.Dim()==M.Dim(1)*dof);
  2242. Matrix<Real_t> d_equiv(dof,M.Dim(1),&dnward_equiv[0],false);
  2243. Matrix<Real_t> u_equiv(dof,M.Dim(0),&upward_equiv[0],false);
  2244. Matrix<Real_t>::GEMM(d_equiv,u_equiv,M);
  2245. }
// Scatter each node's upward-equivalent surface data onto a regular
// (2m)^3 grid (one grid per child octant) and compute its forward
// real-to-complex FFT — the frequency-domain representation used by the
// V-list (M2L) convolution.
template <class FMMNode>
void FMM_Pts<FMMNode>::FFT_UpEquiv(size_t dof, size_t m, size_t ker_dim0, Vector<size_t>& fft_vec, Vector<Real_t>& fft_scal,
Vector<Real_t>& input_data, Vector<Real_t>& output_data, Vector<Real_t>& buffer_){
size_t n1=m*2; // grid points per dimension
size_t n2=n1*n1;
size_t n3=n1*n2; // real-space grid size
size_t n3_=n2*(n1/2+1); // r2c complex grid size (last dim halved+1)
size_t chld_cnt=1UL<<COORD_DIM; // 8 child octants
size_t fftsize_in =2*n3_*chld_cnt*ker_dim0*dof; // per-node output footprint in Real_t's
int omp_p=omp_get_max_threads();
//Load permutation map.
size_t n=6*(m-1)*(m-1)+2; // number of points on the equivalent surface
static Vector<size_t> map;
{ // Build map to reorder upward_equiv
// map[i] is the linear grid index of equivalent-surface point i. Lazily
// (re)built when m changes. NOTE(review): the static vector is not guarded
// by a lock — presumably this routine is never entered concurrently with a
// different m; confirm before calling from multiple threads.
size_t n_old=map.Dim();
if(n_old!=n){
Real_t c[3]={0,0,0};
Vector<Real_t> surf=surface(m, c, (Real_t)(m-1), 0);
map.Resize(surf.Dim()/COORD_DIM);
for(size_t i=0;i<map.Dim();i++)
map[i]=((size_t)(m-1-surf[i*3]+0.5))+((size_t)(m-1-surf[i*3+1]+0.5))*n1+((size_t)(m-1-surf[i*3+2]+0.5))*n2;
}
}
{ // Build FFTW plan.
// One-time plan creation using throwaway aligned buffers; guarded only by
// vlist_fft_flag (same lazy-init caveat as the map above).
if(!vlist_fft_flag){
int nnn[3]={(int)n1,(int)n1,(int)n1};
void *fftw_in, *fftw_out;
fftw_in = mem::aligned_new<Real_t>( n3 *ker_dim0*chld_cnt);
fftw_out = mem::aligned_new<Real_t>(2*n3_*ker_dim0*chld_cnt);
vlist_fftplan = FFTW_t<Real_t>::fft_plan_many_dft_r2c(COORD_DIM,nnn,ker_dim0*chld_cnt,
(Real_t*)fftw_in, NULL, 1, n3, (typename FFTW_t<Real_t>::cplx*)(fftw_out),NULL, 1, n3_);
mem::aligned_delete<Real_t>((Real_t*)fftw_in );
mem::aligned_delete<Real_t>((Real_t*)fftw_out);
vlist_fft_flag=true;
}
}
{ // Offload section
// NOTE(review): unlike FFT_Check2Equiv, buffer_'s size is not asserted
// here — presumably guaranteed by the caller (needs fftsize_in per thread).
size_t n_in = fft_vec.Dim();
#pragma omp parallel for
for(int pid=0; pid<omp_p; pid++){
// Each thread processes a contiguous slice of nodes with its own scratch buffer.
size_t node_start=(n_in*(pid ))/omp_p;
size_t node_end =(n_in*(pid+1))/omp_p;
Vector<Real_t> buffer(fftsize_in, &buffer_[fftsize_in*pid], false);
for(size_t node_idx=node_start; node_idx<node_end; node_idx++){
Matrix<Real_t> upward_equiv(chld_cnt,n*ker_dim0*dof,&input_data[0] + fft_vec[node_idx],false);
Vector<Real_t> upward_equiv_fft(fftsize_in, &output_data[fftsize_in *node_idx], false);
upward_equiv_fft.SetZero(); // grid points not on the surface stay zero
// Rearrange upward equivalent data.
// Scatter surface values to grid positions (map[k]), applying per-node scaling.
for(size_t k=0;k<n;k++){
size_t idx=map[k];
for(int j1=0;j1<dof;j1++)
for(int j0=0;j0<(int)chld_cnt;j0++)
for(int i=0;i<ker_dim0;i++)
upward_equiv_fft[idx+(j0+(i+j1*ker_dim0)*chld_cnt)*n3]=upward_equiv[j0][ker_dim0*(n*j1+k)+i]*fft_scal[ker_dim0*node_idx+i];
}
// Compute FFT.
for(int i=0;i<dof;i++)
FFTW_t<Real_t>::fft_execute_dft_r2c(vlist_fftplan, (Real_t*)&upward_equiv_fft[i* n3 *ker_dim0*chld_cnt],
(typename FFTW_t<Real_t>::cplx*)&buffer [i*2*n3_*ker_dim0*chld_cnt]);
//Compute flops.
#ifndef FFTW3_MKL
double add, mul, fma;
FFTW_t<Real_t>::fftw_flops(vlist_fftplan, &add, &mul, &fma);
#ifndef __INTEL_OFFLOAD0
Profile::Add_FLOP((long long)(add+mul+2*fma));
#endif
#endif
// Transpose back into output: the child index k becomes the innermost
// (fastest) dimension so the V-list kernel can process 8 children at once.
for(int i=0;i<ker_dim0*dof;i++)
for(size_t j=0;j<n3_;j++)
for(size_t k=0;k<chld_cnt;k++){
upward_equiv_fft[2*(chld_cnt*(n3_*i+j)+k)+0]=buffer[2*(n3_*(chld_cnt*i+k)+j)+0];
upward_equiv_fft[2*(chld_cnt*(n3_*i+j)+k)+1]=buffer[2*(n3_*(chld_cnt*i+k)+j)+1];
}
}
}
}
}
// Inverse of FFT_UpEquiv: take the frequency-domain downward-check data
// produced by the V-list convolution, inverse-FFT it back to the (2m)^3
// grid, and gather (accumulate) the check-surface values into each node's
// downward-equivalent vector with per-node scaling.
template <class FMMNode>
void FMM_Pts<FMMNode>::FFT_Check2Equiv(size_t dof, size_t m, size_t ker_dim1, Vector<size_t>& ifft_vec, Vector<Real_t>& ifft_scal,
Vector<Real_t>& input_data, Vector<Real_t>& output_data, Vector<Real_t>& buffer_){
size_t n1=m*2; // grid points per dimension
size_t n2=n1*n1;
size_t n3=n1*n2; // real-space grid size
size_t n3_=n2*(n1/2+1); // c2r complex grid size
size_t chld_cnt=1UL<<COORD_DIM; // 8 child octants
size_t fftsize_out=2*n3_*dof*ker_dim1*chld_cnt; // per-node input footprint in Real_t's
int omp_p=omp_get_max_threads();
//Load permutation map.
size_t n=6*(m-1)*(m-1)+2; // number of points on the check surface
static Vector<size_t> map;
{ // Build map to reorder dnward_check
// map[i] is the grid index of check-surface point i (mirrored indexing vs
// FFT_UpEquiv). Lazily rebuilt when m changes; same unguarded-static
// caveat: presumably never entered concurrently with a different m.
size_t n_old=map.Dim();
if(n_old!=n){
Real_t c[3]={0,0,0};
Vector<Real_t> surf=surface(m, c, (Real_t)(m-1), 0);
map.Resize(surf.Dim()/COORD_DIM);
for(size_t i=0;i<map.Dim();i++)
map[i]=((size_t)(m*2-0.5-surf[i*3]))+((size_t)(m*2-0.5-surf[i*3+1]))*n1+((size_t)(m*2-0.5-surf[i*3+2]))*n2;
//map;//.AllocDevice(true);
}
}
{ // Build FFTW plan.
//Build FFTW plan.
// One-time c2r plan creation, guarded only by vlist_ifft_flag.
int nnn[3]={(int)n1,(int)n1,(int)n1};
Real_t *fftw_in, *fftw_out;
if(!vlist_ifft_flag){
fftw_in = mem::aligned_new<Real_t>(2*n3_*ker_dim1*chld_cnt);
fftw_out = mem::aligned_new<Real_t>( n3 *ker_dim1*chld_cnt);
vlist_ifftplan = FFTW_t<Real_t>::fft_plan_many_dft_c2r(COORD_DIM,nnn,ker_dim1*chld_cnt,
(typename FFTW_t<Real_t>::cplx*)fftw_in, NULL, 1, n3_, (Real_t*)(fftw_out),NULL, 1, n3);
mem::aligned_delete<Real_t>(fftw_in);
mem::aligned_delete<Real_t>(fftw_out);
vlist_ifft_flag=true;
}
}
{ // Offload section
assert(buffer_.Dim()>=2*fftsize_out*omp_p); // two scratch buffers per thread
size_t n_out=ifft_vec.Dim();
#pragma omp parallel for
for(int pid=0; pid<omp_p; pid++){
// Each thread processes a contiguous slice of nodes with two private buffers.
size_t node_start=(n_out*(pid ))/omp_p;
size_t node_end =(n_out*(pid+1))/omp_p;
Vector<Real_t> buffer0(fftsize_out, &buffer_[fftsize_out*(2*pid+0)], false);
Vector<Real_t> buffer1(fftsize_out, &buffer_[fftsize_out*(2*pid+1)], false);
for(size_t node_idx=node_start; node_idx<node_end; node_idx++){
Vector<Real_t> dnward_check_fft(fftsize_out, &input_data[fftsize_out*node_idx], false);
Vector<Real_t> dnward_equiv(ker_dim1*n*dof*chld_cnt,&output_data[0] + ifft_vec[node_idx],false);
//De-interleave data.
// Undo the child-innermost interleaving (inverse of the transpose in FFT_UpEquiv).
for(int i=0;i<ker_dim1*dof;i++)
for(size_t j=0;j<n3_;j++)
for(size_t k=0;k<chld_cnt;k++){
buffer0[2*(n3_*(ker_dim1*dof*k+i)+j)+0]=dnward_check_fft[2*(chld_cnt*(n3_*i+j)+k)+0];
buffer0[2*(n3_*(ker_dim1*dof*k+i)+j)+1]=dnward_check_fft[2*(chld_cnt*(n3_*i+j)+k)+1];
}
// Compute FFT.
for(int i=0;i<dof;i++)
FFTW_t<Real_t>::fft_execute_dft_c2r(vlist_ifftplan, (typename FFTW_t<Real_t>::cplx*)&buffer0[i*2*n3_*ker_dim1*chld_cnt],
(Real_t*)&buffer1[i* n3 *ker_dim1*chld_cnt]);
//Compute flops.
#ifndef FFTW3_MKL
double add, mul, fma;
FFTW_t<Real_t>::fftw_flops(vlist_ifftplan, &add, &mul, &fma);
#ifndef __INTEL_OFFLOAD0
Profile::Add_FLOP((long long)(add+mul+2*fma)*dof);
#endif
#endif
// Rearrange downward check data.
// Gather the grid values at surface points (map[k]) and accumulate into
// the node's downward-equivalent vector with per-node scaling.
for(size_t k=0;k<n;k++){
size_t idx=map[k];
for(int j1=0;j1<dof;j1++)
for(int j0=0;j0<(int)chld_cnt;j0++)
for(int i=0;i<ker_dim1;i++)
dnward_equiv[ker_dim1*(n*(dof*j0+j1)+k)+i]+=buffer1[idx+(i+(j1+j0*dof)*ker_dim1)*n3]*ifft_scal[ker_dim1*node_idx+i];
}
}
}
}
}
  2404. template<class Real_t>
  2405. inline void matmult_8x8x2(Real_t*& M_, Real_t*& IN0, Real_t*& IN1, Real_t*& OUT0, Real_t*& OUT1){
  2406. // Generic code.
  2407. Real_t out_reg000, out_reg001, out_reg010, out_reg011;
  2408. Real_t out_reg100, out_reg101, out_reg110, out_reg111;
  2409. Real_t in_reg000, in_reg001, in_reg010, in_reg011;
  2410. Real_t in_reg100, in_reg101, in_reg110, in_reg111;
  2411. Real_t m_reg000, m_reg001, m_reg010, m_reg011;
  2412. Real_t m_reg100, m_reg101, m_reg110, m_reg111;
  2413. //#pragma unroll
  2414. for(int i1=0;i1<8;i1+=2){
  2415. Real_t* IN0_=IN0;
  2416. Real_t* IN1_=IN1;
  2417. out_reg000=OUT0[ 0]; out_reg001=OUT0[ 1];
  2418. out_reg010=OUT0[ 2]; out_reg011=OUT0[ 3];
  2419. out_reg100=OUT1[ 0]; out_reg101=OUT1[ 1];
  2420. out_reg110=OUT1[ 2]; out_reg111=OUT1[ 3];
  2421. //#pragma unroll
  2422. for(int i2=0;i2<8;i2+=2){
  2423. m_reg000=M_[ 0]; m_reg001=M_[ 1];
  2424. m_reg010=M_[ 2]; m_reg011=M_[ 3];
  2425. m_reg100=M_[16]; m_reg101=M_[17];
  2426. m_reg110=M_[18]; m_reg111=M_[19];
  2427. in_reg000=IN0_[0]; in_reg001=IN0_[1];
  2428. in_reg010=IN0_[2]; in_reg011=IN0_[3];
  2429. in_reg100=IN1_[0]; in_reg101=IN1_[1];
  2430. in_reg110=IN1_[2]; in_reg111=IN1_[3];
  2431. out_reg000 += m_reg000*in_reg000 - m_reg001*in_reg001;
  2432. out_reg001 += m_reg000*in_reg001 + m_reg001*in_reg000;
  2433. out_reg010 += m_reg010*in_reg000 - m_reg011*in_reg001;
  2434. out_reg011 += m_reg010*in_reg001 + m_reg011*in_reg000;
  2435. out_reg000 += m_reg100*in_reg010 - m_reg101*in_reg011;
  2436. out_reg001 += m_reg100*in_reg011 + m_reg101*in_reg010;
  2437. out_reg010 += m_reg110*in_reg010 - m_reg111*in_reg011;
  2438. out_reg011 += m_reg110*in_reg011 + m_reg111*in_reg010;
  2439. out_reg100 += m_reg000*in_reg100 - m_reg001*in_reg101;
  2440. out_reg101 += m_reg000*in_reg101 + m_reg001*in_reg100;
  2441. out_reg110 += m_reg010*in_reg100 - m_reg011*in_reg101;
  2442. out_reg111 += m_reg010*in_reg101 + m_reg011*in_reg100;
  2443. out_reg100 += m_reg100*in_reg110 - m_reg101*in_reg111;
  2444. out_reg101 += m_reg100*in_reg111 + m_reg101*in_reg110;
  2445. out_reg110 += m_reg110*in_reg110 - m_reg111*in_reg111;
  2446. out_reg111 += m_reg110*in_reg111 + m_reg111*in_reg110;
  2447. M_+=32; // Jump to (column+2).
  2448. IN0_+=4;
  2449. IN1_+=4;
  2450. }
  2451. OUT0[ 0]=out_reg000; OUT0[ 1]=out_reg001;
  2452. OUT0[ 2]=out_reg010; OUT0[ 3]=out_reg011;
  2453. OUT1[ 0]=out_reg100; OUT1[ 1]=out_reg101;
  2454. OUT1[ 2]=out_reg110; OUT1[ 3]=out_reg111;
  2455. M_+=4-64*2; // Jump back to first column (row+2).
  2456. OUT0+=4;
  2457. OUT1+=4;
  2458. }
  2459. }
  2460. #if defined(__AVX__) || defined(__SSE3__)
  2461. template<>
  2462. inline void matmult_8x8x2<double>(double*& M_, double*& IN0, double*& IN1, double*& OUT0, double*& OUT1){
  2463. #ifdef __AVX__ //AVX code.
  2464. __m256d out00,out01,out10,out11;
  2465. __m256d out20,out21,out30,out31;
  2466. double* in0__ = IN0;
  2467. double* in1__ = IN1;
  2468. out00 = _mm256_load_pd(OUT0);
  2469. out01 = _mm256_load_pd(OUT1);
  2470. out10 = _mm256_load_pd(OUT0+4);
  2471. out11 = _mm256_load_pd(OUT1+4);
  2472. out20 = _mm256_load_pd(OUT0+8);
  2473. out21 = _mm256_load_pd(OUT1+8);
  2474. out30 = _mm256_load_pd(OUT0+12);
  2475. out31 = _mm256_load_pd(OUT1+12);
  2476. for(int i2=0;i2<8;i2+=2){
  2477. __m256d m00;
  2478. __m256d ot00;
  2479. __m256d mt0,mtt0;
  2480. __m256d in00,in00_r,in01,in01_r;
  2481. in00 = _mm256_broadcast_pd((const __m128d*)in0__);
  2482. in00_r = _mm256_permute_pd(in00,5);
  2483. in01 = _mm256_broadcast_pd((const __m128d*)in1__);
  2484. in01_r = _mm256_permute_pd(in01,5);
  2485. m00 = _mm256_load_pd(M_);
  2486. mt0 = _mm256_unpacklo_pd(m00,m00);
  2487. ot00 = _mm256_mul_pd(mt0,in00);
  2488. mtt0 = _mm256_unpackhi_pd(m00,m00);
  2489. out00 = _mm256_add_pd(out00,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in00_r)));
  2490. ot00 = _mm256_mul_pd(mt0,in01);
  2491. out01 = _mm256_add_pd(out01,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in01_r)));
  2492. m00 = _mm256_load_pd(M_+4);
  2493. mt0 = _mm256_unpacklo_pd(m00,m00);
  2494. ot00 = _mm256_mul_pd(mt0,in00);
  2495. mtt0 = _mm256_unpackhi_pd(m00,m00);
  2496. out10 = _mm256_add_pd(out10,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in00_r)));
  2497. ot00 = _mm256_mul_pd(mt0,in01);
  2498. out11 = _mm256_add_pd(out11,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in01_r)));
  2499. m00 = _mm256_load_pd(M_+8);
  2500. mt0 = _mm256_unpacklo_pd(m00,m00);
  2501. ot00 = _mm256_mul_pd(mt0,in00);
  2502. mtt0 = _mm256_unpackhi_pd(m00,m00);
  2503. out20 = _mm256_add_pd(out20,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in00_r)));
  2504. ot00 = _mm256_mul_pd(mt0,in01);
  2505. out21 = _mm256_add_pd(out21,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in01_r)));
  2506. m00 = _mm256_load_pd(M_+12);
  2507. mt0 = _mm256_unpacklo_pd(m00,m00);
  2508. ot00 = _mm256_mul_pd(mt0,in00);
  2509. mtt0 = _mm256_unpackhi_pd(m00,m00);
  2510. out30 = _mm256_add_pd(out30,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in00_r)));
  2511. ot00 = _mm256_mul_pd(mt0,in01);
  2512. out31 = _mm256_add_pd(out31,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in01_r)));
  2513. in00 = _mm256_broadcast_pd((const __m128d*) (in0__+2));
  2514. in00_r = _mm256_permute_pd(in00,5);
  2515. in01 = _mm256_broadcast_pd((const __m128d*) (in1__+2));
  2516. in01_r = _mm256_permute_pd(in01,5);
  2517. m00 = _mm256_load_pd(M_+16);
  2518. mt0 = _mm256_unpacklo_pd(m00,m00);
  2519. ot00 = _mm256_mul_pd(mt0,in00);
  2520. mtt0 = _mm256_unpackhi_pd(m00,m00);
  2521. out00 = _mm256_add_pd(out00,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in00_r)));
  2522. ot00 = _mm256_mul_pd(mt0,in01);
  2523. out01 = _mm256_add_pd(out01,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in01_r)));
  2524. m00 = _mm256_load_pd(M_+20);
  2525. mt0 = _mm256_unpacklo_pd(m00,m00);
  2526. ot00 = _mm256_mul_pd(mt0,in00);
  2527. mtt0 = _mm256_unpackhi_pd(m00,m00);
  2528. out10 = _mm256_add_pd(out10,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in00_r)));
  2529. ot00 = _mm256_mul_pd(mt0,in01);
  2530. out11 = _mm256_add_pd(out11,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in01_r)));
  2531. m00 = _mm256_load_pd(M_+24);
  2532. mt0 = _mm256_unpacklo_pd(m00,m00);
  2533. ot00 = _mm256_mul_pd(mt0,in00);
  2534. mtt0 = _mm256_unpackhi_pd(m00,m00);
  2535. out20 = _mm256_add_pd(out20,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in00_r)));
  2536. ot00 = _mm256_mul_pd(mt0,in01);
  2537. out21 = _mm256_add_pd(out21,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in01_r)));
  2538. m00 = _mm256_load_pd(M_+28);
  2539. mt0 = _mm256_unpacklo_pd(m00,m00);
  2540. ot00 = _mm256_mul_pd(mt0,in00);
  2541. mtt0 = _mm256_unpackhi_pd(m00,m00);
  2542. out30 = _mm256_add_pd(out30,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in00_r)));
  2543. ot00 = _mm256_mul_pd(mt0,in01);
  2544. out31 = _mm256_add_pd(out31,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in01_r)));
  2545. M_ += 32;
  2546. in0__ += 4;
  2547. in1__ += 4;
  2548. }
  2549. _mm256_store_pd(OUT0,out00);
  2550. _mm256_store_pd(OUT1,out01);
  2551. _mm256_store_pd(OUT0+4,out10);
  2552. _mm256_store_pd(OUT1+4,out11);
  2553. _mm256_store_pd(OUT0+8,out20);
  2554. _mm256_store_pd(OUT1+8,out21);
  2555. _mm256_store_pd(OUT0+12,out30);
  2556. _mm256_store_pd(OUT1+12,out31);
  2557. #elif defined __SSE3__ // SSE code.
  2558. __m128d out00, out01, out10, out11;
  2559. __m128d in00, in01, in10, in11;
  2560. __m128d m00, m01, m10, m11;
  2561. //#pragma unroll
  2562. for(int i1=0;i1<8;i1+=2){
  2563. double* IN0_=IN0;
  2564. double* IN1_=IN1;
  2565. out00 =_mm_load_pd (OUT0 );
  2566. out10 =_mm_load_pd (OUT0+2);
  2567. out01 =_mm_load_pd (OUT1 );
  2568. out11 =_mm_load_pd (OUT1+2);
  2569. //#pragma unroll
  2570. for(int i2=0;i2<8;i2+=2){
  2571. m00 =_mm_load1_pd (M_ );
  2572. m10 =_mm_load1_pd (M_+ 2);
  2573. m01 =_mm_load1_pd (M_+16);
  2574. m11 =_mm_load1_pd (M_+18);
  2575. in00 =_mm_load_pd (IN0_ );
  2576. in10 =_mm_load_pd (IN0_+2);
  2577. in01 =_mm_load_pd (IN1_ );
  2578. in11 =_mm_load_pd (IN1_+2);
  2579. out00 = _mm_add_pd (out00, _mm_mul_pd(m00 , in00 ));
  2580. out00 = _mm_add_pd (out00, _mm_mul_pd(m01 , in10 ));
  2581. out01 = _mm_add_pd (out01, _mm_mul_pd(m00 , in01 ));
  2582. out01 = _mm_add_pd (out01, _mm_mul_pd(m01 , in11 ));
  2583. out10 = _mm_add_pd (out10, _mm_mul_pd(m10 , in00 ));
  2584. out10 = _mm_add_pd (out10, _mm_mul_pd(m11 , in10 ));
  2585. out11 = _mm_add_pd (out11, _mm_mul_pd(m10 , in01 ));
  2586. out11 = _mm_add_pd (out11, _mm_mul_pd(m11 , in11 ));
  2587. m00 =_mm_load1_pd (M_+ 1);
  2588. m10 =_mm_load1_pd (M_+ 2+1);
  2589. m01 =_mm_load1_pd (M_+16+1);
  2590. m11 =_mm_load1_pd (M_+18+1);
  2591. in00 =_mm_shuffle_pd (in00,in00,_MM_SHUFFLE2(0,1));
  2592. in01 =_mm_shuffle_pd (in01,in01,_MM_SHUFFLE2(0,1));
  2593. in10 =_mm_shuffle_pd (in10,in10,_MM_SHUFFLE2(0,1));
  2594. in11 =_mm_shuffle_pd (in11,in11,_MM_SHUFFLE2(0,1));
  2595. out00 = _mm_addsub_pd(out00, _mm_mul_pd(m00, in00));
  2596. out00 = _mm_addsub_pd(out00, _mm_mul_pd(m01, in10));
  2597. out01 = _mm_addsub_pd(out01, _mm_mul_pd(m00, in01));
  2598. out01 = _mm_addsub_pd(out01, _mm_mul_pd(m01, in11));
  2599. out10 = _mm_addsub_pd(out10, _mm_mul_pd(m10, in00));
  2600. out10 = _mm_addsub_pd(out10, _mm_mul_pd(m11, in10));
  2601. out11 = _mm_addsub_pd(out11, _mm_mul_pd(m10, in01));
  2602. out11 = _mm_addsub_pd(out11, _mm_mul_pd(m11, in11));
  2603. M_+=32; // Jump to (column+2).
  2604. IN0_+=4;
  2605. IN1_+=4;
  2606. }
  2607. _mm_store_pd (OUT0 ,out00);
  2608. _mm_store_pd (OUT0+2,out10);
  2609. _mm_store_pd (OUT1 ,out01);
  2610. _mm_store_pd (OUT1+2,out11);
  2611. M_+=4-64*2; // Jump back to first column (row+2).
  2612. OUT0+=4;
  2613. OUT1+=4;
  2614. }
  2615. #endif
  2616. }
  2617. #endif
  2618. #if defined(__SSE3__)
  2619. template<>
  2620. inline void matmult_8x8x2<float>(float*& M_, float*& IN0, float*& IN1, float*& OUT0, float*& OUT1){
  2621. #if defined __SSE3__ // SSE code.
  2622. __m128 out00,out01,out10,out11;
  2623. __m128 out20,out21,out30,out31;
  2624. float* in0__ = IN0;
  2625. float* in1__ = IN1;
  2626. out00 = _mm_load_ps(OUT0);
  2627. out01 = _mm_load_ps(OUT1);
  2628. out10 = _mm_load_ps(OUT0+4);
  2629. out11 = _mm_load_ps(OUT1+4);
  2630. out20 = _mm_load_ps(OUT0+8);
  2631. out21 = _mm_load_ps(OUT1+8);
  2632. out30 = _mm_load_ps(OUT0+12);
  2633. out31 = _mm_load_ps(OUT1+12);
  2634. for(int i2=0;i2<8;i2+=2){
  2635. __m128 m00;
  2636. __m128 mt0,mtt0;
  2637. __m128 in00,in00_r,in01,in01_r;
  2638. in00 = _mm_castpd_ps(_mm_load_pd1((const double*)in0__));
  2639. in00_r = _mm_shuffle_ps(in00,in00,_MM_SHUFFLE(2,3,0,1));
  2640. in01 = _mm_castpd_ps(_mm_load_pd1((const double*)in1__));
  2641. in01_r = _mm_shuffle_ps(in01,in01,_MM_SHUFFLE(2,3,0,1));
  2642. m00 = _mm_load_ps(M_);
  2643. mt0 = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(2,2,0,0));
  2644. out00= _mm_add_ps (out00,_mm_mul_ps( mt0,in00 ));
  2645. mtt0 = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(3,3,1,1));
  2646. out00= _mm_addsub_ps(out00,_mm_mul_ps(mtt0,in00_r));
  2647. out01 = _mm_add_ps (out01,_mm_mul_ps( mt0,in01 ));
  2648. out01 = _mm_addsub_ps(out01,_mm_mul_ps(mtt0,in01_r));
  2649. m00 = _mm_load_ps(M_+4);
  2650. mt0 = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(2,2,0,0));
  2651. out10= _mm_add_ps (out10,_mm_mul_ps( mt0,in00 ));
  2652. mtt0 = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(3,3,1,1));
  2653. out10= _mm_addsub_ps(out10,_mm_mul_ps(mtt0,in00_r));
  2654. out11 = _mm_add_ps (out11,_mm_mul_ps( mt0,in01 ));
  2655. out11 = _mm_addsub_ps(out11,_mm_mul_ps(mtt0,in01_r));
  2656. m00 = _mm_load_ps(M_+8);
  2657. mt0 = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(2,2,0,0));
  2658. out20= _mm_add_ps (out20,_mm_mul_ps( mt0,in00 ));
  2659. mtt0 = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(3,3,1,1));
  2660. out20= _mm_addsub_ps(out20,_mm_mul_ps(mtt0,in00_r));
  2661. out21 = _mm_add_ps (out21,_mm_mul_ps( mt0,in01 ));
  2662. out21 = _mm_addsub_ps(out21,_mm_mul_ps(mtt0,in01_r));
  2663. m00 = _mm_load_ps(M_+12);
  2664. mt0 = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(2,2,0,0));
  2665. out30= _mm_add_ps (out30,_mm_mul_ps( mt0, in00));
  2666. mtt0 = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(3,3,1,1));
  2667. out30= _mm_addsub_ps(out30,_mm_mul_ps(mtt0,in00_r));
  2668. out31 = _mm_add_ps (out31,_mm_mul_ps( mt0,in01 ));
  2669. out31 = _mm_addsub_ps(out31,_mm_mul_ps(mtt0,in01_r));
  2670. in00 = _mm_castpd_ps(_mm_load_pd1((const double*) (in0__+2)));
  2671. in00_r = _mm_shuffle_ps(in00,in00,_MM_SHUFFLE(2,3,0,1));
  2672. in01 = _mm_castpd_ps(_mm_load_pd1((const double*) (in1__+2)));
  2673. in01_r = _mm_shuffle_ps(in01,in01,_MM_SHUFFLE(2,3,0,1));
  2674. m00 = _mm_load_ps(M_+16);
  2675. mt0 = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(2,2,0,0));
  2676. out00= _mm_add_ps (out00,_mm_mul_ps( mt0,in00 ));
  2677. mtt0 = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(3,3,1,1));
  2678. out00= _mm_addsub_ps(out00,_mm_mul_ps(mtt0,in00_r));
  2679. out01 = _mm_add_ps (out01,_mm_mul_ps( mt0,in01 ));
  2680. out01 = _mm_addsub_ps(out01,_mm_mul_ps(mtt0,in01_r));
  2681. m00 = _mm_load_ps(M_+20);
  2682. mt0 = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(2,2,0,0));
  2683. out10= _mm_add_ps (out10,_mm_mul_ps( mt0,in00 ));
  2684. mtt0 = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(3,3,1,1));
  2685. out10= _mm_addsub_ps(out10,_mm_mul_ps(mtt0,in00_r));
  2686. out11 = _mm_add_ps (out11,_mm_mul_ps( mt0,in01 ));
  2687. out11 = _mm_addsub_ps(out11,_mm_mul_ps(mtt0,in01_r));
  2688. m00 = _mm_load_ps(M_+24);
  2689. mt0 = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(2,2,0,0));
  2690. out20= _mm_add_ps (out20,_mm_mul_ps( mt0,in00 ));
  2691. mtt0 = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(3,3,1,1));
  2692. out20= _mm_addsub_ps(out20,_mm_mul_ps(mtt0,in00_r));
  2693. out21 = _mm_add_ps (out21,_mm_mul_ps( mt0,in01 ));
  2694. out21 = _mm_addsub_ps(out21,_mm_mul_ps(mtt0,in01_r));
  2695. m00 = _mm_load_ps(M_+28);
  2696. mt0 = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(2,2,0,0));
  2697. out30= _mm_add_ps (out30,_mm_mul_ps( mt0,in00 ));
  2698. mtt0 = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(3,3,1,1));
  2699. out30= _mm_addsub_ps(out30,_mm_mul_ps(mtt0,in00_r));
  2700. out31 = _mm_add_ps (out31,_mm_mul_ps( mt0,in01 ));
  2701. out31 = _mm_addsub_ps(out31,_mm_mul_ps(mtt0,in01_r));
  2702. M_ += 32;
  2703. in0__ += 4;
  2704. in1__ += 4;
  2705. }
  2706. _mm_store_ps(OUT0,out00);
  2707. _mm_store_ps(OUT1,out01);
  2708. _mm_store_ps(OUT0+4,out10);
  2709. _mm_store_ps(OUT1+4,out11);
  2710. _mm_store_ps(OUT0+8,out20);
  2711. _mm_store_ps(OUT1+8,out21);
  2712. _mm_store_ps(OUT0+12,out30);
  2713. _mm_store_ps(OUT1+12,out31);
  2714. #endif
  2715. }
  2716. #endif
  2717. template <class Real_t>
  2718. void VListHadamard(size_t dof, size_t M_dim, size_t ker_dim0, size_t ker_dim1, Vector<size_t>& interac_dsp,
  2719. Vector<size_t>& interac_vec, Vector<Real_t*>& precomp_mat, Vector<Real_t>& fft_in, Vector<Real_t>& fft_out){
  2720. size_t chld_cnt=1UL<<COORD_DIM;
  2721. size_t fftsize_in =M_dim*ker_dim0*chld_cnt*2;
  2722. size_t fftsize_out=M_dim*ker_dim1*chld_cnt*2;
  2723. Real_t* zero_vec0=mem::aligned_new<Real_t>(fftsize_in );
  2724. Real_t* zero_vec1=mem::aligned_new<Real_t>(fftsize_out);
  2725. size_t n_out=fft_out.Dim()/fftsize_out;
  2726. // Set buff_out to zero.
  2727. #pragma omp parallel for
  2728. for(size_t k=0;k<n_out;k++){
  2729. Vector<Real_t> dnward_check_fft(fftsize_out, &fft_out[k*fftsize_out], false);
  2730. dnward_check_fft.SetZero();
  2731. }
  2732. // Build list of interaction pairs (in, out vectors).
  2733. size_t mat_cnt=precomp_mat.Dim();
  2734. size_t blk1_cnt=interac_dsp.Dim()/mat_cnt;
  2735. const size_t V_BLK_SIZE=V_BLK_CACHE*64/sizeof(Real_t);
  2736. Real_t** IN_ =mem::aligned_new<Real_t*>(2*V_BLK_SIZE*blk1_cnt*mat_cnt);
  2737. Real_t** OUT_=mem::aligned_new<Real_t*>(2*V_BLK_SIZE*blk1_cnt*mat_cnt);
  2738. #pragma omp parallel for
  2739. for(size_t interac_blk1=0; interac_blk1<blk1_cnt*mat_cnt; interac_blk1++){
  2740. size_t interac_dsp0 = (interac_blk1==0?0:interac_dsp[interac_blk1-1]);
  2741. size_t interac_dsp1 = interac_dsp[interac_blk1 ] ;
  2742. size_t interac_cnt = interac_dsp1-interac_dsp0;
  2743. for(size_t j=0;j<interac_cnt;j++){
  2744. IN_ [2*V_BLK_SIZE*interac_blk1 +j]=&fft_in [interac_vec[(interac_dsp0+j)*2+0]];
  2745. OUT_[2*V_BLK_SIZE*interac_blk1 +j]=&fft_out[interac_vec[(interac_dsp0+j)*2+1]];
  2746. }
  2747. IN_ [2*V_BLK_SIZE*interac_blk1 +interac_cnt]=zero_vec0;
  2748. OUT_[2*V_BLK_SIZE*interac_blk1 +interac_cnt]=zero_vec1;
  2749. }
  2750. int omp_p=omp_get_max_threads();
  2751. #pragma omp parallel for
  2752. for(int pid=0; pid<omp_p; pid++){
  2753. size_t a=( pid *M_dim)/omp_p;
  2754. size_t b=((pid+1)*M_dim)/omp_p;
  2755. for(int in_dim=0;in_dim<ker_dim0;in_dim++)
  2756. for(int ot_dim=0;ot_dim<ker_dim1;ot_dim++)
  2757. for(size_t blk1=0; blk1<blk1_cnt; blk1++)
  2758. for(size_t k=a; k< b; k++)
  2759. for(size_t mat_indx=0; mat_indx< mat_cnt;mat_indx++){
  2760. size_t interac_blk1 = blk1*mat_cnt+mat_indx;
  2761. size_t interac_dsp0 = (interac_blk1==0?0:interac_dsp[interac_blk1-1]);
  2762. size_t interac_dsp1 = interac_dsp[interac_blk1 ] ;
  2763. size_t interac_cnt = interac_dsp1-interac_dsp0;
  2764. Real_t** IN = IN_ + 2*V_BLK_SIZE*interac_blk1;
  2765. Real_t** OUT= OUT_+ 2*V_BLK_SIZE*interac_blk1;
  2766. Real_t* M = precomp_mat[mat_indx] + k*chld_cnt*chld_cnt*2 + (ot_dim+in_dim*ker_dim1)*M_dim*128;
  2767. {
  2768. for(size_t j=0;j<interac_cnt;j+=2){
  2769. Real_t* M_ = M;
  2770. Real_t* IN0 = IN [j+0] + (in_dim*M_dim+k)*chld_cnt*2;
  2771. Real_t* IN1 = IN [j+1] + (in_dim*M_dim+k)*chld_cnt*2;
  2772. Real_t* OUT0 = OUT[j+0] + (ot_dim*M_dim+k)*chld_cnt*2;
  2773. Real_t* OUT1 = OUT[j+1] + (ot_dim*M_dim+k)*chld_cnt*2;
  2774. #ifdef __SSE__
  2775. if (j+2 < interac_cnt) { // Prefetch
  2776. _mm_prefetch(((char *)(IN[j+2] + (in_dim*M_dim+k)*chld_cnt*2)), _MM_HINT_T0);
  2777. _mm_prefetch(((char *)(IN[j+2] + (in_dim*M_dim+k)*chld_cnt*2) + 64), _MM_HINT_T0);
  2778. _mm_prefetch(((char *)(IN[j+3] + (in_dim*M_dim+k)*chld_cnt*2)), _MM_HINT_T0);
  2779. _mm_prefetch(((char *)(IN[j+3] + (in_dim*M_dim+k)*chld_cnt*2) + 64), _MM_HINT_T0);
  2780. _mm_prefetch(((char *)(OUT[j+2] + (ot_dim*M_dim+k)*chld_cnt*2)), _MM_HINT_T0);
  2781. _mm_prefetch(((char *)(OUT[j+2] + (ot_dim*M_dim+k)*chld_cnt*2) + 64), _MM_HINT_T0);
  2782. _mm_prefetch(((char *)(OUT[j+3] + (ot_dim*M_dim+k)*chld_cnt*2)), _MM_HINT_T0);
  2783. _mm_prefetch(((char *)(OUT[j+3] + (ot_dim*M_dim+k)*chld_cnt*2) + 64), _MM_HINT_T0);
  2784. }
  2785. #endif
  2786. matmult_8x8x2(M_, IN0, IN1, OUT0, OUT1);
  2787. }
  2788. }
  2789. }
  2790. }
  2791. // Compute flops.
  2792. {
  2793. Profile::Add_FLOP(8*8*8*(interac_vec.Dim()/2)*M_dim*ker_dim0*ker_dim1*dof);
  2794. }
  2795. // Free memory
  2796. mem::aligned_delete<Real_t*>(IN_ );
  2797. mem::aligned_delete<Real_t*>(OUT_);
  2798. mem::aligned_delete<Real_t>(zero_vec0);
  2799. mem::aligned_delete<Real_t>(zero_vec1);
  2800. }
  2801. template <class FMMNode>
  2802. void FMM_Pts<FMMNode>::V_ListSetup(SetupData<Real_t>& setup_data, FMMTree_t* tree, std::vector<Matrix<Real_t> >& buff, std::vector<Vector<FMMNode_t*> >& n_list, int level, bool device){
  2803. if(!this->MultipoleOrder()) return;
  2804. if(level==0) return;
  2805. { // Set setup_data
  2806. setup_data.level=level;
  2807. setup_data.kernel=kernel->k_m2l;
  2808. setup_data.interac_type.resize(1);
  2809. setup_data.interac_type[0]=V1_Type;
  2810. setup_data. input_data=&buff[0];
  2811. setup_data.output_data=&buff[1];
  2812. Vector<FMMNode_t*>& nodes_in =n_list[2];
  2813. Vector<FMMNode_t*>& nodes_out=n_list[3];
  2814. setup_data.nodes_in .clear();
  2815. setup_data.nodes_out.clear();
  2816. for(size_t i=0;i<nodes_in .Dim();i++) if((nodes_in [i]->Depth()==level-1 || level==-1) && nodes_in [i]->pt_cnt[0]) setup_data.nodes_in .push_back(nodes_in [i]);
  2817. for(size_t i=0;i<nodes_out.Dim();i++) if((nodes_out[i]->Depth()==level-1 || level==-1) && nodes_out[i]->pt_cnt[1]) setup_data.nodes_out.push_back(nodes_out[i]);
  2818. }
  2819. std::vector<void*>& nodes_in =setup_data.nodes_in ;
  2820. std::vector<void*>& nodes_out=setup_data.nodes_out;
  2821. std::vector<Vector<Real_t>*>& input_vector=setup_data. input_vector; input_vector.clear();
  2822. std::vector<Vector<Real_t>*>& output_vector=setup_data.output_vector; output_vector.clear();
  2823. for(size_t i=0;i<nodes_in .size();i++) input_vector.push_back(&((FMMData*)((FMMNode*)((FMMNode*)nodes_in [i])->Child(0))->FMMData())->upward_equiv);
  2824. for(size_t i=0;i<nodes_out.size();i++) output_vector.push_back(&((FMMData*)((FMMNode*)((FMMNode*)nodes_out[i])->Child(0))->FMMData())->dnward_equiv);
  2825. /////////////////////////////////////////////////////////////////////////////
  2826. Real_t eps=1e-10;
  2827. size_t n_in =nodes_in .size();
  2828. size_t n_out=nodes_out.size();
  2829. // Setup precomputed data.
  2830. //if(setup_data.precomp_data->Dim(0)*setup_data.precomp_data->Dim(1)==0) SetupPrecomp(setup_data,device);
  2831. // Build interac_data
  2832. Profile::Tic("Interac-Data",&this->comm,true,25);
  2833. Matrix<char>& interac_data=setup_data.interac_data;
  2834. if(n_out>0 && n_in >0){ // Build precomp_data, interac_data
  2835. size_t precomp_offset=0;
  2836. Mat_Type& interac_type=setup_data.interac_type[0];
  2837. size_t mat_cnt=this->interac_list.ListCount(interac_type);
  2838. Matrix<size_t> precomp_data_offset;
  2839. std::vector<size_t> interac_mat;
  2840. std::vector<Real_t*> interac_mat_ptr;
  2841. #if 0 // Since we skip SetupPrecomp for V-list
  2842. { // Load precomp_data for interac_type.
  2843. struct HeaderData{
  2844. size_t total_size;
  2845. size_t level;
  2846. size_t mat_cnt ;
  2847. size_t max_depth;
  2848. };
  2849. Matrix<char>& precomp_data=*setup_data.precomp_data;
  2850. char* indx_ptr=precomp_data[0]+precomp_offset;
  2851. HeaderData& header=*(HeaderData*)indx_ptr;indx_ptr+=sizeof(HeaderData);
  2852. precomp_data_offset.ReInit(header.mat_cnt,1+(2+2)*header.max_depth, (size_t*)indx_ptr, false);
  2853. precomp_offset+=header.total_size;
  2854. for(size_t mat_id=0;mat_id<mat_cnt;mat_id++){
  2855. Matrix<Real_t>& M0 = this->mat->Mat(level, interac_type, mat_id);
  2856. assert(M0.Dim(0)>0 && M0.Dim(1)>0); UNUSED(M0);
  2857. interac_mat.push_back(precomp_data_offset[mat_id][0]);
  2858. }
  2859. }
  2860. #else
  2861. {
  2862. for(size_t mat_id=0;mat_id<mat_cnt;mat_id++){
  2863. Matrix<Real_t>& M = this->mat->Mat(level, interac_type, mat_id);
  2864. interac_mat_ptr.push_back(&M[0][0]);
  2865. }
  2866. }
  2867. #endif
  2868. size_t dof;
  2869. size_t m=MultipoleOrder();
  2870. size_t ker_dim0=setup_data.kernel->ker_dim[0];
  2871. size_t ker_dim1=setup_data.kernel->ker_dim[1];
  2872. size_t fftsize;
  2873. {
  2874. size_t n1=m*2;
  2875. size_t n2=n1*n1;
  2876. size_t n3_=n2*(n1/2+1);
  2877. size_t chld_cnt=1UL<<COORD_DIM;
  2878. fftsize=2*n3_*chld_cnt;
  2879. dof=1;
  2880. }
  2881. int omp_p=omp_get_max_threads();
  2882. size_t buff_size=DEVICE_BUFFER_SIZE*1024l*1024l;
  2883. size_t n_blk0=2*fftsize*dof*(ker_dim0*n_in +ker_dim1*n_out)*sizeof(Real_t)/buff_size;
  2884. if(n_blk0==0) n_blk0=1;
  2885. std::vector<std::vector<size_t> > fft_vec(n_blk0);
  2886. std::vector<std::vector<size_t> > ifft_vec(n_blk0);
  2887. std::vector<std::vector<Real_t> > fft_scl(n_blk0);
  2888. std::vector<std::vector<Real_t> > ifft_scl(n_blk0);
  2889. std::vector<std::vector<size_t> > interac_vec(n_blk0);
  2890. std::vector<std::vector<size_t> > interac_dsp(n_blk0);
  2891. {
  2892. Matrix<Real_t>& input_data=*setup_data. input_data;
  2893. Matrix<Real_t>& output_data=*setup_data.output_data;
  2894. std::vector<std::vector<FMMNode*> > nodes_blk_in (n_blk0);
  2895. std::vector<std::vector<FMMNode*> > nodes_blk_out(n_blk0);
  2896. Vector<Real_t> src_scal=this->kernel->k_m2l->src_scal;
  2897. Vector<Real_t> trg_scal=this->kernel->k_m2l->trg_scal;
  2898. for(size_t i=0;i<n_in;i++) ((FMMNode*)nodes_in[i])->node_id=i;
  2899. for(size_t blk0=0;blk0<n_blk0;blk0++){
  2900. size_t blk0_start=(n_out* blk0 )/n_blk0;
  2901. size_t blk0_end =(n_out*(blk0+1))/n_blk0;
  2902. std::vector<FMMNode*>& nodes_in_ =nodes_blk_in [blk0];
  2903. std::vector<FMMNode*>& nodes_out_=nodes_blk_out[blk0];
  2904. { // Build node list for blk0.
  2905. std::set<void*> nodes_in;
  2906. for(size_t i=blk0_start;i<blk0_end;i++){
  2907. nodes_out_.push_back((FMMNode*)nodes_out[i]);
  2908. Vector<FMMNode*>& lst=((FMMNode*)nodes_out[i])->interac_list[interac_type];
  2909. for(size_t k=0;k<mat_cnt;k++) if(lst[k]!=NULL && lst[k]->pt_cnt[0]) nodes_in.insert(lst[k]);
  2910. }
  2911. for(std::set<void*>::iterator node=nodes_in.begin(); node != nodes_in.end(); node++){
  2912. nodes_in_.push_back((FMMNode*)*node);
  2913. }
  2914. size_t input_dim=nodes_in_ .size()*ker_dim0*dof*fftsize;
  2915. size_t output_dim=nodes_out_.size()*ker_dim1*dof*fftsize;
  2916. size_t buffer_dim=2*(ker_dim0+ker_dim1)*dof*fftsize*omp_p;
  2917. if(buff_size<(input_dim + output_dim + buffer_dim)*sizeof(Real_t))
  2918. buff_size=(input_dim + output_dim + buffer_dim)*sizeof(Real_t);
  2919. }
  2920. { // Set fft vectors.
  2921. for(size_t i=0;i<nodes_in_ .size();i++) fft_vec[blk0].push_back((size_t)(& input_vector[nodes_in_[i]->node_id][0][0]- input_data[0]));
  2922. for(size_t i=0;i<nodes_out_.size();i++)ifft_vec[blk0].push_back((size_t)(&output_vector[blk0_start + i ][0][0]-output_data[0]));
  2923. size_t scal_dim0=src_scal.Dim();
  2924. size_t scal_dim1=trg_scal.Dim();
  2925. fft_scl [blk0].resize(nodes_in_ .size()*scal_dim0);
  2926. ifft_scl[blk0].resize(nodes_out_.size()*scal_dim1);
  2927. for(size_t i=0;i<nodes_in_ .size();i++){
  2928. size_t depth=nodes_in_[i]->Depth()+1;
  2929. for(size_t j=0;j<scal_dim0;j++){
  2930. fft_scl[blk0][i*scal_dim0+j]=pvfmm::pow<Real_t>(2.0, src_scal[j]*depth);
  2931. }
  2932. }
  2933. for(size_t i=0;i<nodes_out_.size();i++){
  2934. size_t depth=nodes_out_[i]->Depth()+1;
  2935. for(size_t j=0;j<scal_dim1;j++){
  2936. ifft_scl[blk0][i*scal_dim1+j]=pvfmm::pow<Real_t>(2.0, trg_scal[j]*depth);
  2937. }
  2938. }
  2939. }
  2940. }
  2941. for(size_t blk0=0;blk0<n_blk0;blk0++){ // Hadamard interactions.
  2942. std::vector<FMMNode*>& nodes_in_ =nodes_blk_in [blk0];
  2943. std::vector<FMMNode*>& nodes_out_=nodes_blk_out[blk0];
  2944. for(size_t i=0;i<nodes_in_.size();i++) nodes_in_[i]->node_id=i;
  2945. { // Next blocking level.
  2946. size_t n_blk1=nodes_out_.size()*(2)*sizeof(Real_t)/(64*V_BLK_CACHE);
  2947. if(n_blk1==0) n_blk1=1;
  2948. size_t interac_dsp_=0;
  2949. for(size_t blk1=0;blk1<n_blk1;blk1++){
  2950. size_t blk1_start=(nodes_out_.size()* blk1 )/n_blk1;
  2951. size_t blk1_end =(nodes_out_.size()*(blk1+1))/n_blk1;
  2952. for(size_t k=0;k<mat_cnt;k++){
  2953. for(size_t i=blk1_start;i<blk1_end;i++){
  2954. Vector<FMMNode*>& lst=((FMMNode*)nodes_out_[i])->interac_list[interac_type];
  2955. if(lst[k]!=NULL && lst[k]->pt_cnt[0]){
  2956. interac_vec[blk0].push_back(lst[k]->node_id*fftsize*ker_dim0*dof);
  2957. interac_vec[blk0].push_back( i *fftsize*ker_dim1*dof);
  2958. interac_dsp_++;
  2959. }
  2960. }
  2961. interac_dsp[blk0].push_back(interac_dsp_);
  2962. }
  2963. }
  2964. }
  2965. }
  2966. }
  2967. { // Set interac_data.
  2968. size_t data_size=sizeof(size_t)*6; // buff_size, m, dof, ker_dim0, ker_dim1, n_blk0
  2969. for(size_t blk0=0;blk0<n_blk0;blk0++){
  2970. data_size+=sizeof(size_t)+ fft_vec[blk0].size()*sizeof(size_t);
  2971. data_size+=sizeof(size_t)+ ifft_vec[blk0].size()*sizeof(size_t);
  2972. data_size+=sizeof(size_t)+ fft_scl[blk0].size()*sizeof(Real_t);
  2973. data_size+=sizeof(size_t)+ ifft_scl[blk0].size()*sizeof(Real_t);
  2974. data_size+=sizeof(size_t)+interac_vec[blk0].size()*sizeof(size_t);
  2975. data_size+=sizeof(size_t)+interac_dsp[blk0].size()*sizeof(size_t);
  2976. }
  2977. data_size+=sizeof(size_t)+interac_mat.size()*sizeof(size_t);
  2978. data_size+=sizeof(size_t)+interac_mat_ptr.size()*sizeof(Real_t*);
  2979. if(data_size>interac_data.Dim(0)*interac_data.Dim(1))
  2980. interac_data.ReInit(1,data_size);
  2981. char* data_ptr=&interac_data[0][0];
  2982. ((size_t*)data_ptr)[0]=buff_size; data_ptr+=sizeof(size_t);
  2983. ((size_t*)data_ptr)[0]= m; data_ptr+=sizeof(size_t);
  2984. ((size_t*)data_ptr)[0]= dof; data_ptr+=sizeof(size_t);
  2985. ((size_t*)data_ptr)[0]= ker_dim0; data_ptr+=sizeof(size_t);
  2986. ((size_t*)data_ptr)[0]= ker_dim1; data_ptr+=sizeof(size_t);
  2987. ((size_t*)data_ptr)[0]= n_blk0; data_ptr+=sizeof(size_t);
  2988. ((size_t*)data_ptr)[0]= interac_mat.size(); data_ptr+=sizeof(size_t);
  2989. mem::memcopy(data_ptr, &interac_mat[0], interac_mat.size()*sizeof(size_t));
  2990. data_ptr+=interac_mat.size()*sizeof(size_t);
  2991. ((size_t*)data_ptr)[0]= interac_mat_ptr.size(); data_ptr+=sizeof(size_t);
  2992. mem::memcopy(data_ptr, &interac_mat_ptr[0], interac_mat_ptr.size()*sizeof(Real_t*));
  2993. data_ptr+=interac_mat_ptr.size()*sizeof(Real_t*);
  2994. for(size_t blk0=0;blk0<n_blk0;blk0++){
  2995. ((size_t*)data_ptr)[0]= fft_vec[blk0].size(); data_ptr+=sizeof(size_t);
  2996. mem::memcopy(data_ptr, & fft_vec[blk0][0], fft_vec[blk0].size()*sizeof(size_t));
  2997. data_ptr+= fft_vec[blk0].size()*sizeof(size_t);
  2998. ((size_t*)data_ptr)[0]=ifft_vec[blk0].size(); data_ptr+=sizeof(size_t);
  2999. mem::memcopy(data_ptr, &ifft_vec[blk0][0], ifft_vec[blk0].size()*sizeof(size_t));
  3000. data_ptr+=ifft_vec[blk0].size()*sizeof(size_t);
  3001. ((size_t*)data_ptr)[0]= fft_scl[blk0].size(); data_ptr+=sizeof(size_t);
  3002. mem::memcopy(data_ptr, & fft_scl[blk0][0], fft_scl[blk0].size()*sizeof(Real_t));
  3003. data_ptr+= fft_scl[blk0].size()*sizeof(Real_t);
  3004. ((size_t*)data_ptr)[0]=ifft_scl[blk0].size(); data_ptr+=sizeof(size_t);
  3005. mem::memcopy(data_ptr, &ifft_scl[blk0][0], ifft_scl[blk0].size()*sizeof(Real_t));
  3006. data_ptr+=ifft_scl[blk0].size()*sizeof(Real_t);
  3007. ((size_t*)data_ptr)[0]=interac_vec[blk0].size(); data_ptr+=sizeof(size_t);
  3008. mem::memcopy(data_ptr, &interac_vec[blk0][0], interac_vec[blk0].size()*sizeof(size_t));
  3009. data_ptr+=interac_vec[blk0].size()*sizeof(size_t);
  3010. ((size_t*)data_ptr)[0]=interac_dsp[blk0].size(); data_ptr+=sizeof(size_t);
  3011. mem::memcopy(data_ptr, &interac_dsp[blk0][0], interac_dsp[blk0].size()*sizeof(size_t));
  3012. data_ptr+=interac_dsp[blk0].size()*sizeof(size_t);
  3013. }
  3014. }
  3015. }
  3016. Profile::Toc();
  3017. if(device){ // Host2Device
  3018. Profile::Tic("Host2Device",&this->comm,false,25);
  3019. setup_data.interac_data. AllocDevice(true);
  3020. Profile::Toc();
  3021. }
  3022. }
  3023. template <class FMMNode>
  3024. void FMM_Pts<FMMNode>::V_List (SetupData<Real_t>& setup_data, bool device){
  3025. if(!this->MultipoleOrder()) return;
  3026. assert(!device); //Can not run on accelerator yet.
  3027. int np;
  3028. MPI_Comm_size(comm,&np);
  3029. if(setup_data.interac_data.Dim(0)==0 || setup_data.interac_data.Dim(1)==0){
  3030. if(np>1) Profile::Tic("Host2Device",&this->comm,false,25);
  3031. if(np>1) Profile::Toc();
  3032. return;
  3033. }
  3034. Profile::Tic("Host2Device",&this->comm,false,25);
  3035. int level=setup_data.level;
  3036. size_t buff_size=*((size_t*)&setup_data.interac_data[0][0]);
  3037. typename Vector<char>::Device buff;
  3038. //typename Matrix<char>::Device precomp_data;
  3039. typename Matrix<char>::Device interac_data;
  3040. typename Matrix<Real_t>::Device input_data;
  3041. typename Matrix<Real_t>::Device output_data;
  3042. if(device){
  3043. if(this->dev_buffer.Dim()<buff_size) this->dev_buffer.ReInit(buff_size);
  3044. buff = this-> dev_buffer. AllocDevice(false);
  3045. //precomp_data= setup_data.precomp_data->AllocDevice(false);
  3046. interac_data= setup_data.interac_data. AllocDevice(false);
  3047. input_data = setup_data. input_data->AllocDevice(false);
  3048. output_data = setup_data. output_data->AllocDevice(false);
  3049. }else{
  3050. if(this->dev_buffer.Dim()<buff_size) this->dev_buffer.ReInit(buff_size);
  3051. buff = this-> dev_buffer;
  3052. //precomp_data=*setup_data.precomp_data;
  3053. interac_data= setup_data.interac_data;
  3054. input_data =*setup_data. input_data;
  3055. output_data =*setup_data. output_data;
  3056. }
  3057. Profile::Toc();
  3058. { // Offloaded computation.
  3059. // Set interac_data.
  3060. size_t m, dof, ker_dim0, ker_dim1, n_blk0;
  3061. std::vector<Vector<size_t> > fft_vec;
  3062. std::vector<Vector<size_t> > ifft_vec;
  3063. std::vector<Vector<Real_t> > fft_scl;
  3064. std::vector<Vector<Real_t> > ifft_scl;
  3065. std::vector<Vector<size_t> > interac_vec;
  3066. std::vector<Vector<size_t> > interac_dsp;
  3067. Vector<Real_t*> precomp_mat;
  3068. { // Set interac_data.
  3069. char* data_ptr=&interac_data[0][0];
  3070. buff_size=((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t);
  3071. m =((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t);
  3072. dof =((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t);
  3073. ker_dim0 =((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t);
  3074. ker_dim1 =((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t);
  3075. n_blk0 =((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t);
  3076. fft_vec .resize(n_blk0);
  3077. ifft_vec.resize(n_blk0);
  3078. fft_scl .resize(n_blk0);
  3079. ifft_scl.resize(n_blk0);
  3080. interac_vec.resize(n_blk0);
  3081. interac_dsp.resize(n_blk0);
  3082. Vector<size_t> interac_mat;
  3083. interac_mat.ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false);
  3084. data_ptr+=sizeof(size_t)+interac_mat.Dim()*sizeof(size_t);
  3085. Vector<Real_t*> interac_mat_ptr;
  3086. interac_mat_ptr.ReInit(((size_t*)data_ptr)[0],(Real_t**)(data_ptr+sizeof(size_t)),false);
  3087. data_ptr+=sizeof(size_t)+interac_mat_ptr.Dim()*sizeof(Real_t*);
  3088. #if 0 // Since we skip SetupPrecomp for V-list
  3089. precomp_mat.Resize(interac_mat.Dim());
  3090. for(size_t i=0;i<interac_mat.Dim();i++){
  3091. precomp_mat[i]=(Real_t*)(precomp_data[0]+interac_mat[i]);
  3092. }
  3093. #else
  3094. precomp_mat.Resize(interac_mat_ptr.Dim());
  3095. for(size_t i=0;i<interac_mat_ptr.Dim();i++){
  3096. precomp_mat[i]=interac_mat_ptr[i];
  3097. }
  3098. #endif
  3099. for(size_t blk0=0;blk0<n_blk0;blk0++){
  3100. fft_vec[blk0].ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false);
  3101. data_ptr+=sizeof(size_t)+fft_vec[blk0].Dim()*sizeof(size_t);
  3102. ifft_vec[blk0].ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false);
  3103. data_ptr+=sizeof(size_t)+ifft_vec[blk0].Dim()*sizeof(size_t);
  3104. fft_scl[blk0].ReInit(((size_t*)data_ptr)[0],(Real_t*)(data_ptr+sizeof(size_t)),false);
  3105. data_ptr+=sizeof(size_t)+fft_scl[blk0].Dim()*sizeof(Real_t);
  3106. ifft_scl[blk0].ReInit(((size_t*)data_ptr)[0],(Real_t*)(data_ptr+sizeof(size_t)),false);
  3107. data_ptr+=sizeof(size_t)+ifft_scl[blk0].Dim()*sizeof(Real_t);
  3108. interac_vec[blk0].ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false);
  3109. data_ptr+=sizeof(size_t)+interac_vec[blk0].Dim()*sizeof(size_t);
  3110. interac_dsp[blk0].ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false);
  3111. data_ptr+=sizeof(size_t)+interac_dsp[blk0].Dim()*sizeof(size_t);
  3112. }
  3113. }
  3114. int omp_p=omp_get_max_threads();
  3115. size_t M_dim, fftsize;
  3116. {
  3117. size_t n1=m*2;
  3118. size_t n2=n1*n1;
  3119. size_t n3_=n2*(n1/2+1);
  3120. size_t chld_cnt=1UL<<COORD_DIM;
  3121. fftsize=2*n3_*chld_cnt;
  3122. M_dim=n3_;
  3123. }
  3124. for(size_t blk0=0;blk0<n_blk0;blk0++){ // interactions
  3125. size_t n_in = fft_vec[blk0].Dim();
  3126. size_t n_out=ifft_vec[blk0].Dim();
  3127. size_t input_dim=n_in *ker_dim0*dof*fftsize;
  3128. size_t output_dim=n_out*ker_dim1*dof*fftsize;
  3129. size_t buffer_dim=2*(ker_dim0+ker_dim1)*dof*fftsize*omp_p;
  3130. Vector<Real_t> fft_in ( input_dim, (Real_t*)&buff[ 0 ],false);
  3131. Vector<Real_t> fft_out(output_dim, (Real_t*)&buff[ input_dim *sizeof(Real_t)],false);
  3132. Vector<Real_t> buffer(buffer_dim, (Real_t*)&buff[(input_dim+output_dim)*sizeof(Real_t)],false);
  3133. { // FFT
  3134. if(np==1) Profile::Tic("FFT",&comm,false,100);
  3135. Vector<Real_t> input_data_( input_data.dim[0]* input_data.dim[1], input_data[0], false);
  3136. FFT_UpEquiv(dof, m, ker_dim0, fft_vec[blk0], fft_scl[blk0], input_data_, fft_in, buffer);
  3137. if(np==1) Profile::Toc();
  3138. }
  3139. { // Hadamard
  3140. #ifdef PVFMM_HAVE_PAPI
  3141. #ifdef __VERBOSE__
  3142. std::cout << "Starting counters new\n";
  3143. if (PAPI_start(EventSet) != PAPI_OK) std::cout << "handle_error3" << std::endl;
  3144. #endif
  3145. #endif
  3146. if(np==1) Profile::Tic("HadamardProduct",&comm,false,100);
  3147. VListHadamard<Real_t>(dof, M_dim, ker_dim0, ker_dim1, interac_dsp[blk0], interac_vec[blk0], precomp_mat, fft_in, fft_out);
  3148. if(np==1) Profile::Toc();
  3149. #ifdef PVFMM_HAVE_PAPI
  3150. #ifdef __VERBOSE__
  3151. if (PAPI_stop(EventSet, values) != PAPI_OK) std::cout << "handle_error4" << std::endl;
  3152. std::cout << "Stopping counters\n";
  3153. #endif
  3154. #endif
  3155. }
  3156. { // IFFT
  3157. if(np==1) Profile::Tic("IFFT",&comm,false,100);
  3158. Vector<Real_t> output_data_(output_data.dim[0]*output_data.dim[1], output_data[0], false);
  3159. FFT_Check2Equiv(dof, m, ker_dim1, ifft_vec[blk0], ifft_scl[blk0], fft_out, output_data_, buffer);
  3160. if(np==1) Profile::Toc();
  3161. }
  3162. }
  3163. }
  3164. }
  3165. template <class FMMNode>
  3166. void FMM_Pts<FMMNode>::Down2DownSetup(SetupData<Real_t>& setup_data, FMMTree_t* tree, std::vector<Matrix<Real_t> >& buff, std::vector<Vector<FMMNode_t*> >& n_list, int level, bool device){
  3167. if(!this->MultipoleOrder()) return;
  3168. { // Set setup_data
  3169. setup_data.level=level;
  3170. setup_data.kernel=kernel->k_l2l;
  3171. setup_data.interac_type.resize(1);
  3172. setup_data.interac_type[0]=D2D_Type;
  3173. setup_data. input_data=&buff[1];
  3174. setup_data.output_data=&buff[1];
  3175. Vector<FMMNode_t*>& nodes_in =n_list[1];
  3176. Vector<FMMNode_t*>& nodes_out=n_list[1];
  3177. setup_data.nodes_in .clear();
  3178. setup_data.nodes_out.clear();
  3179. for(size_t i=0;i<nodes_in .Dim();i++) if((nodes_in [i]->Depth()==level-1) && nodes_in [i]->pt_cnt[1]) setup_data.nodes_in .push_back(nodes_in [i]);
  3180. for(size_t i=0;i<nodes_out.Dim();i++) if((nodes_out[i]->Depth()==level ) && nodes_out[i]->pt_cnt[1]) setup_data.nodes_out.push_back(nodes_out[i]);
  3181. }
  3182. std::vector<void*>& nodes_in =setup_data.nodes_in ;
  3183. std::vector<void*>& nodes_out=setup_data.nodes_out;
  3184. std::vector<Vector<Real_t>*>& input_vector=setup_data. input_vector; input_vector.clear();
  3185. std::vector<Vector<Real_t>*>& output_vector=setup_data.output_vector; output_vector.clear();
  3186. for(size_t i=0;i<nodes_in .size();i++) input_vector.push_back(&((FMMData*)((FMMNode*)nodes_in [i])->FMMData())->dnward_equiv);
  3187. for(size_t i=0;i<nodes_out.size();i++) output_vector.push_back(&((FMMData*)((FMMNode*)nodes_out[i])->FMMData())->dnward_equiv);
  3188. SetupInterac(setup_data,device);
  3189. }
  3190. template <class FMMNode>
  3191. void FMM_Pts<FMMNode>::Down2Down (SetupData<Real_t>& setup_data, bool device){
  3192. if(!this->MultipoleOrder()) return;
  3193. //Add Down2Down contribution.
  3194. EvalList(setup_data, device);
  3195. }
// Serialize point-interaction setup data into the flat byte buffer
// setup_data.interac_data, so that EvalListPts can reconstruct it later
// (and so it can be moved to a device as one contiguous allocation).
// data_ must point to a ptSetupData laid out exactly as the local struct
// definitions below; these definitions are duplicated in EvalListPts and
// must stay in sync with it.
template <class FMMNode>
void FMM_Pts<FMMNode>::PtSetup(SetupData<Real_t>& setup_data, void* data_){
  struct PackedData{
    size_t len;           // total number of values in *ptr
    Matrix<Real_t>* ptr;  // backing matrix holding the actual data
    Vector<size_t> cnt;   // per-node value count
    Vector<size_t> dsp;   // per-node displacement into *ptr
  };
  struct InteracData{ // flattened per-target interaction lists
    Vector<size_t> in_node;      // source node index of each interaction
    Vector<size_t> scal_idx;     // scaling-table index of each interaction
    Vector<Real_t> coord_shift;  // per-interaction coordinate shift
    Vector<size_t> interac_cnt;  // number of interactions per target node
    Vector<size_t> interac_dsp;  // offset of each target's interactions in the flat lists
    Vector<size_t> interac_cst;  // cumulative interaction cost (computed below)
    Vector<Real_t> scal[4*MAX_DEPTH]; // scaling vectors, 4 stages per depth
    Matrix<Real_t> M[4];         // optional pre/post matrices for mat-vec stages
  };
  struct ptSetupData{
    int level;
    const Kernel<Real_t>* kernel;
    PackedData src_coord; // Src coord
    PackedData src_value; // Src density
    PackedData srf_coord; // Srf coord
    PackedData srf_value; // Srf density
    PackedData trg_coord; // Trg coord
    PackedData trg_value; // Trg potential
    InteracData interac_data;
  };
  ptSetupData& data=*(ptSetupData*)data_;
  if(data.interac_data.interac_cnt.Dim()){ // Set data.interac_data.interac_cst
    // Cost of target trg = sum over its interactions of
    // (#src points + #srf points) * (#trg points); used later for
    // load-balanced thread partitioning (via lower_bound on interac_cst).
    InteracData& intdata=data.interac_data;
    Vector<size_t> cnt;
    Vector<size_t>& dsp=intdata.interac_cst;
    cnt.ReInit(intdata.interac_cnt.Dim());
    dsp.ReInit(intdata.interac_dsp.Dim());
    #pragma omp parallel for
    for(size_t trg=0;trg<cnt.Dim();trg++){
      size_t trg_cnt=data.trg_coord.cnt[trg];
      cnt[trg]=0;
      for(size_t i=0;i<intdata.interac_cnt[trg];i++){
        size_t int_id=intdata.interac_dsp[trg]+i;
        size_t src=intdata.in_node[int_id];
        size_t src_cnt=data.src_coord.cnt[src];
        size_t srf_cnt=data.srf_coord.cnt[src];
        cnt[trg]+=(src_cnt+srf_cnt)*trg_cnt;
      }
    }
    // Prefix-sum the per-target costs into dsp (== interac_cst).
    // NOTE(review): dsp[0] is pre-set before the scan; whether the scan
    // overwrites it depends on omp_par::scan's (in/ex)clusive semantics,
    // which are not visible here — confirm against ompUtils.
    dsp[0]=cnt[0];
    omp_par::scan(&cnt[0],&dsp[0],dsp.Dim());
  }
  { // pack data
    // Byte layout: one PackedSetupData header, followed by each array at an
    // aligned offset recorded in the header. EvalListPts unpacks by reading
    // the header and wrapping each region with non-owning ReInit views.
    struct PackedSetupData{
      size_t size; // total packed size in bytes
      int level;
      const Kernel<Real_t>* kernel;
      // Raw pointers to the backing matrices (valid on the host side only).
      Matrix<Real_t>* src_coord; // Src coord
      Matrix<Real_t>* src_value; // Src density
      Matrix<Real_t>* srf_coord; // Srf coord
      Matrix<Real_t>* srf_value; // Srf density
      Matrix<Real_t>* trg_coord; // Trg coord
      Matrix<Real_t>* trg_value; // Trg potential
      // (size, byte-offset) pairs for each packed array:
      size_t src_coord_cnt_size; size_t src_coord_cnt_offset;
      size_t src_coord_dsp_size; size_t src_coord_dsp_offset;
      size_t src_value_cnt_size; size_t src_value_cnt_offset;
      size_t src_value_dsp_size; size_t src_value_dsp_offset;
      size_t srf_coord_cnt_size; size_t srf_coord_cnt_offset;
      size_t srf_coord_dsp_size; size_t srf_coord_dsp_offset;
      size_t srf_value_cnt_size; size_t srf_value_cnt_offset;
      size_t srf_value_dsp_size; size_t srf_value_dsp_offset;
      size_t trg_coord_cnt_size; size_t trg_coord_cnt_offset;
      size_t trg_coord_dsp_size; size_t trg_coord_dsp_offset;
      size_t trg_value_cnt_size; size_t trg_value_cnt_offset;
      size_t trg_value_dsp_size; size_t trg_value_dsp_offset;
      // interac_data
      size_t in_node_size; size_t in_node_offset;
      size_t scal_idx_size; size_t scal_idx_offset;
      size_t coord_shift_size; size_t coord_shift_offset;
      size_t interac_cnt_size; size_t interac_cnt_offset;
      size_t interac_dsp_size; size_t interac_dsp_offset;
      size_t interac_cst_size; size_t interac_cst_offset;
      size_t scal_dim[4*MAX_DEPTH]; size_t scal_offset[4*MAX_DEPTH];
      size_t Mdim[4][2]; size_t M_offset[4];
    };
    PackedSetupData pkd_data;
    { // Set pkd_data
      // First pass: compute aligned offsets and record sizes; no copying yet.
      size_t offset=mem::align_ptr(sizeof(PackedSetupData));
      pkd_data. level=data. level;
      pkd_data.kernel=data.kernel;
      pkd_data.src_coord=data.src_coord.ptr;
      pkd_data.src_value=data.src_value.ptr;
      pkd_data.srf_coord=data.srf_coord.ptr;
      pkd_data.srf_value=data.srf_value.ptr;
      pkd_data.trg_coord=data.trg_coord.ptr;
      pkd_data.trg_value=data.trg_value.ptr;
      pkd_data.src_coord_cnt_offset=offset; pkd_data.src_coord_cnt_size=data.src_coord.cnt.Dim(); offset+=mem::align_ptr(sizeof(size_t)*pkd_data.src_coord_cnt_size);
      pkd_data.src_coord_dsp_offset=offset; pkd_data.src_coord_dsp_size=data.src_coord.dsp.Dim(); offset+=mem::align_ptr(sizeof(size_t)*pkd_data.src_coord_dsp_size);
      pkd_data.src_value_cnt_offset=offset; pkd_data.src_value_cnt_size=data.src_value.cnt.Dim(); offset+=mem::align_ptr(sizeof(size_t)*pkd_data.src_value_cnt_size);
      pkd_data.src_value_dsp_offset=offset; pkd_data.src_value_dsp_size=data.src_value.dsp.Dim(); offset+=mem::align_ptr(sizeof(size_t)*pkd_data.src_value_dsp_size);
      pkd_data.srf_coord_cnt_offset=offset; pkd_data.srf_coord_cnt_size=data.srf_coord.cnt.Dim(); offset+=mem::align_ptr(sizeof(size_t)*pkd_data.srf_coord_cnt_size);
      pkd_data.srf_coord_dsp_offset=offset; pkd_data.srf_coord_dsp_size=data.srf_coord.dsp.Dim(); offset+=mem::align_ptr(sizeof(size_t)*pkd_data.srf_coord_dsp_size);
      pkd_data.srf_value_cnt_offset=offset; pkd_data.srf_value_cnt_size=data.srf_value.cnt.Dim(); offset+=mem::align_ptr(sizeof(size_t)*pkd_data.srf_value_cnt_size);
      pkd_data.srf_value_dsp_offset=offset; pkd_data.srf_value_dsp_size=data.srf_value.dsp.Dim(); offset+=mem::align_ptr(sizeof(size_t)*pkd_data.srf_value_dsp_size);
      pkd_data.trg_coord_cnt_offset=offset; pkd_data.trg_coord_cnt_size=data.trg_coord.cnt.Dim(); offset+=mem::align_ptr(sizeof(size_t)*pkd_data.trg_coord_cnt_size);
      pkd_data.trg_coord_dsp_offset=offset; pkd_data.trg_coord_dsp_size=data.trg_coord.dsp.Dim(); offset+=mem::align_ptr(sizeof(size_t)*pkd_data.trg_coord_dsp_size);
      pkd_data.trg_value_cnt_offset=offset; pkd_data.trg_value_cnt_size=data.trg_value.cnt.Dim(); offset+=mem::align_ptr(sizeof(size_t)*pkd_data.trg_value_cnt_size);
      pkd_data.trg_value_dsp_offset=offset; pkd_data.trg_value_dsp_size=data.trg_value.dsp.Dim(); offset+=mem::align_ptr(sizeof(size_t)*pkd_data.trg_value_dsp_size);
      InteracData& intdata=data.interac_data;
      pkd_data. in_node_offset=offset; pkd_data. in_node_size=intdata. in_node.Dim(); offset+=mem::align_ptr(sizeof(size_t)*pkd_data. in_node_size);
      pkd_data. scal_idx_offset=offset; pkd_data. scal_idx_size=intdata. scal_idx.Dim(); offset+=mem::align_ptr(sizeof(size_t)*pkd_data. scal_idx_size);
      pkd_data.coord_shift_offset=offset; pkd_data.coord_shift_size=intdata.coord_shift.Dim(); offset+=mem::align_ptr(sizeof(Real_t)*pkd_data.coord_shift_size);
      pkd_data.interac_cnt_offset=offset; pkd_data.interac_cnt_size=intdata.interac_cnt.Dim(); offset+=mem::align_ptr(sizeof(size_t)*pkd_data.interac_cnt_size);
      pkd_data.interac_dsp_offset=offset; pkd_data.interac_dsp_size=intdata.interac_dsp.Dim(); offset+=mem::align_ptr(sizeof(size_t)*pkd_data.interac_dsp_size);
      pkd_data.interac_cst_offset=offset; pkd_data.interac_cst_size=intdata.interac_cst.Dim(); offset+=mem::align_ptr(sizeof(size_t)*pkd_data.interac_cst_size);
      for(size_t i=0;i<4*MAX_DEPTH;i++){
        pkd_data.scal_offset[i]=offset; pkd_data.scal_dim[i]=intdata.scal[i].Dim(); offset+=mem::align_ptr(sizeof(Real_t)*pkd_data.scal_dim[i]);
      }
      for(size_t i=0;i<4;i++){
        size_t& Mdim0=pkd_data.Mdim[i][0];
        size_t& Mdim1=pkd_data.Mdim[i][1];
        pkd_data.M_offset[i]=offset; Mdim0=intdata.M[i].Dim(0); Mdim1=intdata.M[i].Dim(1); offset+=mem::align_ptr(sizeof(Real_t)*Mdim0*Mdim1);
      }
      pkd_data.size=offset; // total packed size, including the header
    }
    { // Set setup_data.interac_data
      // Second pass: grow the buffer only if needed (never shrink), write
      // the header, then copy each non-empty array to its recorded offset.
      Matrix<char>& buff=setup_data.interac_data;
      if(pkd_data.size>buff.Dim(0)*buff.Dim(1)){
        buff.ReInit(1,pkd_data.size);
      }
      ((PackedSetupData*)buff[0])[0]=pkd_data;
      if(pkd_data.src_coord_cnt_size) memcpy(&buff[0][pkd_data.src_coord_cnt_offset], &data.src_coord.cnt[0], pkd_data.src_coord_cnt_size*sizeof(size_t));
      if(pkd_data.src_coord_dsp_size) memcpy(&buff[0][pkd_data.src_coord_dsp_offset], &data.src_coord.dsp[0], pkd_data.src_coord_dsp_size*sizeof(size_t));
      if(pkd_data.src_value_cnt_size) memcpy(&buff[0][pkd_data.src_value_cnt_offset], &data.src_value.cnt[0], pkd_data.src_value_cnt_size*sizeof(size_t));
      if(pkd_data.src_value_dsp_size) memcpy(&buff[0][pkd_data.src_value_dsp_offset], &data.src_value.dsp[0], pkd_data.src_value_dsp_size*sizeof(size_t));
      if(pkd_data.srf_coord_cnt_size) memcpy(&buff[0][pkd_data.srf_coord_cnt_offset], &data.srf_coord.cnt[0], pkd_data.srf_coord_cnt_size*sizeof(size_t));
      if(pkd_data.srf_coord_dsp_size) memcpy(&buff[0][pkd_data.srf_coord_dsp_offset], &data.srf_coord.dsp[0], pkd_data.srf_coord_dsp_size*sizeof(size_t));
      if(pkd_data.srf_value_cnt_size) memcpy(&buff[0][pkd_data.srf_value_cnt_offset], &data.srf_value.cnt[0], pkd_data.srf_value_cnt_size*sizeof(size_t));
      if(pkd_data.srf_value_dsp_size) memcpy(&buff[0][pkd_data.srf_value_dsp_offset], &data.srf_value.dsp[0], pkd_data.srf_value_dsp_size*sizeof(size_t));
      if(pkd_data.trg_coord_cnt_size) memcpy(&buff[0][pkd_data.trg_coord_cnt_offset], &data.trg_coord.cnt[0], pkd_data.trg_coord_cnt_size*sizeof(size_t));
      if(pkd_data.trg_coord_dsp_size) memcpy(&buff[0][pkd_data.trg_coord_dsp_offset], &data.trg_coord.dsp[0], pkd_data.trg_coord_dsp_size*sizeof(size_t));
      if(pkd_data.trg_value_cnt_size) memcpy(&buff[0][pkd_data.trg_value_cnt_offset], &data.trg_value.cnt[0], pkd_data.trg_value_cnt_size*sizeof(size_t));
      if(pkd_data.trg_value_dsp_size) memcpy(&buff[0][pkd_data.trg_value_dsp_offset], &data.trg_value.dsp[0], pkd_data.trg_value_dsp_size*sizeof(size_t));
      InteracData& intdata=data.interac_data;
      if(pkd_data. in_node_size) memcpy(&buff[0][pkd_data. in_node_offset], &intdata. in_node[0], pkd_data. in_node_size*sizeof(size_t));
      if(pkd_data. scal_idx_size) memcpy(&buff[0][pkd_data. scal_idx_offset], &intdata. scal_idx[0], pkd_data. scal_idx_size*sizeof(size_t));
      if(pkd_data.coord_shift_size) memcpy(&buff[0][pkd_data.coord_shift_offset], &intdata.coord_shift[0], pkd_data.coord_shift_size*sizeof(Real_t));
      if(pkd_data.interac_cnt_size) memcpy(&buff[0][pkd_data.interac_cnt_offset], &intdata.interac_cnt[0], pkd_data.interac_cnt_size*sizeof(size_t));
      if(pkd_data.interac_dsp_size) memcpy(&buff[0][pkd_data.interac_dsp_offset], &intdata.interac_dsp[0], pkd_data.interac_dsp_size*sizeof(size_t));
      if(pkd_data.interac_cst_size) memcpy(&buff[0][pkd_data.interac_cst_offset], &intdata.interac_cst[0], pkd_data.interac_cst_size*sizeof(size_t));
      for(size_t i=0;i<4*MAX_DEPTH;i++){
        if(intdata.scal[i].Dim()) memcpy(&buff[0][pkd_data.scal_offset[i]], &intdata.scal[i][0], intdata.scal[i].Dim()*sizeof(Real_t));
      }
      for(size_t i=0;i<4;i++){
        if(intdata.M[i].Dim(0)*intdata.M[i].Dim(1)) memcpy(&buff[0][pkd_data.M_offset[i]], &intdata.M[i][0][0], intdata.M[i].Dim(0)*intdata.M[i].Dim(1)*sizeof(Real_t));
      }
    }
  }
  { // Resize device buffer
    // Ensure dev_buffer can hold one Real_t per output entry; it is used
    // as scratch space by EvalListPts (grow-only, like interac_data above).
    size_t n=setup_data.output_data->Dim(0)*setup_data.output_data->Dim(1)*sizeof(Real_t);
    if(this->dev_buffer.Dim()<n) this->dev_buffer.ReInit(n);
  }
}
  3358. template <class FMMNode>
  3359. template <int SYNC>
  3360. void FMM_Pts<FMMNode>::EvalListPts(SetupData<Real_t>& setup_data, bool device){
  3361. if(setup_data.kernel->ker_dim[0]*setup_data.kernel->ker_dim[1]==0) return;
  3362. if(setup_data.interac_data.Dim(0)==0 || setup_data.interac_data.Dim(1)==0){
  3363. Profile::Tic("Host2Device",&this->comm,false,25);
  3364. Profile::Toc();
  3365. Profile::Tic("DeviceComp",&this->comm,false,20);
  3366. Profile::Toc();
  3367. return;
  3368. }
  3369. bool have_gpu=false;
  3370. #if defined(PVFMM_HAVE_CUDA)
  3371. have_gpu=true;
  3372. #endif
  3373. Profile::Tic("Host2Device",&this->comm,false,25);
  3374. typename Vector<char>::Device dev_buff;
  3375. typename Matrix<char>::Device interac_data;
  3376. typename Matrix<Real_t>::Device coord_data;
  3377. typename Matrix<Real_t>::Device input_data;
  3378. typename Matrix<Real_t>::Device output_data;
  3379. size_t ptr_single_layer_kernel=(size_t)NULL;
  3380. size_t ptr_double_layer_kernel=(size_t)NULL;
  3381. if(device && !have_gpu){
  3382. dev_buff = this-> dev_buffer. AllocDevice(false);
  3383. interac_data= setup_data.interac_data. AllocDevice(false);
  3384. if(setup_data. coord_data!=NULL) coord_data = setup_data. coord_data->AllocDevice(false);
  3385. if(setup_data. input_data!=NULL) input_data = setup_data. input_data->AllocDevice(false);
  3386. if(setup_data. output_data!=NULL) output_data = setup_data. output_data->AllocDevice(false);
  3387. ptr_single_layer_kernel=setup_data.kernel->dev_ker_poten;
  3388. ptr_double_layer_kernel=setup_data.kernel->dev_dbl_layer_poten;
  3389. }else{
  3390. dev_buff = this-> dev_buffer;
  3391. interac_data= setup_data.interac_data;
  3392. if(setup_data. coord_data!=NULL) coord_data =*setup_data. coord_data;
  3393. if(setup_data. input_data!=NULL) input_data =*setup_data. input_data;
  3394. if(setup_data. output_data!=NULL) output_data =*setup_data. output_data;
  3395. ptr_single_layer_kernel=(size_t)setup_data.kernel->ker_poten;
  3396. ptr_double_layer_kernel=(size_t)setup_data.kernel->dbl_layer_poten;
  3397. }
  3398. Profile::Toc();
  3399. Profile::Tic("DeviceComp",&this->comm,false,20);
  3400. int lock_idx=-1;
  3401. int wait_lock_idx=-1;
  3402. if(device) wait_lock_idx=MIC_Lock::curr_lock();
  3403. if(device) lock_idx=MIC_Lock::get_lock();
  3404. #ifdef __INTEL_OFFLOAD
  3405. #pragma offload if(device) target(mic:0) signal(&MIC_Lock::lock_vec[device?lock_idx:0])
  3406. #endif
  3407. { // Offloaded computation.
  3408. struct PackedData{
  3409. size_t len;
  3410. Matrix<Real_t>* ptr;
  3411. Vector<size_t> cnt;
  3412. Vector<size_t> dsp;
  3413. };
  3414. struct InteracData{
  3415. Vector<size_t> in_node;
  3416. Vector<size_t> scal_idx;
  3417. Vector<Real_t> coord_shift;
  3418. Vector<size_t> interac_cnt;
  3419. Vector<size_t> interac_dsp;
  3420. Vector<size_t> interac_cst;
  3421. Vector<Real_t> scal[4*MAX_DEPTH];
  3422. Matrix<Real_t> M[4];
  3423. };
  3424. struct ptSetupData{
  3425. int level;
  3426. const Kernel<Real_t>* kernel;
  3427. PackedData src_coord; // Src coord
  3428. PackedData src_value; // Src density
  3429. PackedData srf_coord; // Srf coord
  3430. PackedData srf_value; // Srf density
  3431. PackedData trg_coord; // Trg coord
  3432. PackedData trg_value; // Trg potential
  3433. InteracData interac_data;
  3434. };
  3435. ptSetupData data;
  3436. { // Initialize data
  3437. struct PackedSetupData{
  3438. size_t size;
  3439. int level;
  3440. const Kernel<Real_t>* kernel;
  3441. Matrix<Real_t>* src_coord; // Src coord
  3442. Matrix<Real_t>* src_value; // Src density
  3443. Matrix<Real_t>* srf_coord; // Srf coord
  3444. Matrix<Real_t>* srf_value; // Srf density
  3445. Matrix<Real_t>* trg_coord; // Trg coord
  3446. Matrix<Real_t>* trg_value; // Trg potential
  3447. size_t src_coord_cnt_size; size_t src_coord_cnt_offset;
  3448. size_t src_coord_dsp_size; size_t src_coord_dsp_offset;
  3449. size_t src_value_cnt_size; size_t src_value_cnt_offset;
  3450. size_t src_value_dsp_size; size_t src_value_dsp_offset;
  3451. size_t srf_coord_cnt_size; size_t srf_coord_cnt_offset;
  3452. size_t srf_coord_dsp_size; size_t srf_coord_dsp_offset;
  3453. size_t srf_value_cnt_size; size_t srf_value_cnt_offset;
  3454. size_t srf_value_dsp_size; size_t srf_value_dsp_offset;
  3455. size_t trg_coord_cnt_size; size_t trg_coord_cnt_offset;
  3456. size_t trg_coord_dsp_size; size_t trg_coord_dsp_offset;
  3457. size_t trg_value_cnt_size; size_t trg_value_cnt_offset;
  3458. size_t trg_value_dsp_size; size_t trg_value_dsp_offset;
  3459. // interac_data
  3460. size_t in_node_size; size_t in_node_offset;
  3461. size_t scal_idx_size; size_t scal_idx_offset;
  3462. size_t coord_shift_size; size_t coord_shift_offset;
  3463. size_t interac_cnt_size; size_t interac_cnt_offset;
  3464. size_t interac_dsp_size; size_t interac_dsp_offset;
  3465. size_t interac_cst_size; size_t interac_cst_offset;
  3466. size_t scal_dim[4*MAX_DEPTH]; size_t scal_offset[4*MAX_DEPTH];
  3467. size_t Mdim[4][2]; size_t M_offset[4];
  3468. };
  3469. typename Matrix<char>::Device& setupdata=interac_data;
  3470. PackedSetupData& pkd_data=*((PackedSetupData*)setupdata[0]);
  3471. data. level=pkd_data. level;
  3472. data.kernel=pkd_data.kernel;
  3473. data.src_coord.ptr=pkd_data.src_coord;
  3474. data.src_value.ptr=pkd_data.src_value;
  3475. data.srf_coord.ptr=pkd_data.srf_coord;
  3476. data.srf_value.ptr=pkd_data.srf_value;
  3477. data.trg_coord.ptr=pkd_data.trg_coord;
  3478. data.trg_value.ptr=pkd_data.trg_value;
  3479. data.src_coord.cnt.ReInit(pkd_data.src_coord_cnt_size, (size_t*)&setupdata[0][pkd_data.src_coord_cnt_offset], false);
  3480. data.src_coord.dsp.ReInit(pkd_data.src_coord_dsp_size, (size_t*)&setupdata[0][pkd_data.src_coord_dsp_offset], false);
  3481. data.src_value.cnt.ReInit(pkd_data.src_value_cnt_size, (size_t*)&setupdata[0][pkd_data.src_value_cnt_offset], false);
  3482. data.src_value.dsp.ReInit(pkd_data.src_value_dsp_size, (size_t*)&setupdata[0][pkd_data.src_value_dsp_offset], false);
  3483. data.srf_coord.cnt.ReInit(pkd_data.srf_coord_cnt_size, (size_t*)&setupdata[0][pkd_data.srf_coord_cnt_offset], false);
  3484. data.srf_coord.dsp.ReInit(pkd_data.srf_coord_dsp_size, (size_t*)&setupdata[0][pkd_data.srf_coord_dsp_offset], false);
  3485. data.srf_value.cnt.ReInit(pkd_data.srf_value_cnt_size, (size_t*)&setupdata[0][pkd_data.srf_value_cnt_offset], false);
  3486. data.srf_value.dsp.ReInit(pkd_data.srf_value_dsp_size, (size_t*)&setupdata[0][pkd_data.srf_value_dsp_offset], false);
  3487. data.trg_coord.cnt.ReInit(pkd_data.trg_coord_cnt_size, (size_t*)&setupdata[0][pkd_data.trg_coord_cnt_offset], false);
  3488. data.trg_coord.dsp.ReInit(pkd_data.trg_coord_dsp_size, (size_t*)&setupdata[0][pkd_data.trg_coord_dsp_offset], false);
  3489. data.trg_value.cnt.ReInit(pkd_data.trg_value_cnt_size, (size_t*)&setupdata[0][pkd_data.trg_value_cnt_offset], false);
  3490. data.trg_value.dsp.ReInit(pkd_data.trg_value_dsp_size, (size_t*)&setupdata[0][pkd_data.trg_value_dsp_offset], false);
  3491. InteracData& intdata=data.interac_data;
  3492. intdata. in_node.ReInit(pkd_data. in_node_size, (size_t*)&setupdata[0][pkd_data. in_node_offset],false);
  3493. intdata. scal_idx.ReInit(pkd_data. scal_idx_size, (size_t*)&setupdata[0][pkd_data. scal_idx_offset],false);
  3494. intdata.coord_shift.ReInit(pkd_data.coord_shift_size, (Real_t*)&setupdata[0][pkd_data.coord_shift_offset],false);
  3495. intdata.interac_cnt.ReInit(pkd_data.interac_cnt_size, (size_t*)&setupdata[0][pkd_data.interac_cnt_offset],false);
  3496. intdata.interac_dsp.ReInit(pkd_data.interac_dsp_size, (size_t*)&setupdata[0][pkd_data.interac_dsp_offset],false);
  3497. intdata.interac_cst.ReInit(pkd_data.interac_cst_size, (size_t*)&setupdata[0][pkd_data.interac_cst_offset],false);
  3498. for(size_t i=0;i<4*MAX_DEPTH;i++){
  3499. intdata.scal[i].ReInit(pkd_data.scal_dim[i], (Real_t*)&setupdata[0][pkd_data.scal_offset[i]],false);
  3500. }
  3501. for(size_t i=0;i<4;i++){
  3502. intdata.M[i].ReInit(pkd_data.Mdim[i][0], pkd_data.Mdim[i][1], (Real_t*)&setupdata[0][pkd_data.M_offset[i]],false);
  3503. }
  3504. }
  3505. if(device) MIC_Lock::wait_lock(wait_lock_idx);
  3506. { // Compute interactions
  3507. InteracData& intdata=data.interac_data;
  3508. typename Kernel<Real_t>::Ker_t single_layer_kernel=(typename Kernel<Real_t>::Ker_t)ptr_single_layer_kernel;
  3509. typename Kernel<Real_t>::Ker_t double_layer_kernel=(typename Kernel<Real_t>::Ker_t)ptr_double_layer_kernel;
  3510. int omp_p=omp_get_max_threads();
  3511. #pragma omp parallel for
  3512. for(size_t tid=0;tid<omp_p;tid++){
  3513. Matrix<Real_t> src_coord, src_value;
  3514. Matrix<Real_t> srf_coord, srf_value;
  3515. Matrix<Real_t> trg_coord, trg_value;
  3516. Vector<Real_t> buff;
  3517. { // init buff
  3518. size_t thread_buff_size=dev_buff.dim/sizeof(Real_t)/omp_p;
  3519. buff.ReInit(thread_buff_size, (Real_t*)&dev_buff[tid*thread_buff_size*sizeof(Real_t)], false);
  3520. }
  3521. size_t vcnt=0;
  3522. std::vector<Matrix<Real_t> > vbuff(6);
  3523. { // init vbuff[0:5]
  3524. size_t vdim_=0, vdim[6];
  3525. for(size_t indx=0;indx<6;indx++){
  3526. vdim[indx]=0;
  3527. switch(indx){
  3528. case 0:
  3529. vdim[indx]=intdata.M[0].Dim(0); break;
  3530. case 1:
  3531. assert(intdata.M[0].Dim(1)==intdata.M[1].Dim(0));
  3532. vdim[indx]=intdata.M[0].Dim(1); break;
  3533. case 2:
  3534. vdim[indx]=intdata.M[1].Dim(1); break;
  3535. case 3:
  3536. vdim[indx]=intdata.M[2].Dim(0); break;
  3537. case 4:
  3538. assert(intdata.M[2].Dim(1)==intdata.M[3].Dim(0));
  3539. vdim[indx]=intdata.M[2].Dim(1); break;
  3540. case 5:
  3541. vdim[indx]=intdata.M[3].Dim(1); break;
  3542. default:
  3543. vdim[indx]=0; break;
  3544. }
  3545. vdim_+=vdim[indx];
  3546. }
  3547. if(vdim_){
  3548. vcnt=buff.Dim()/vdim_/2;
  3549. assert(vcnt>0); // Thread buffer is too small
  3550. }
  3551. for(size_t indx=0;indx<6;indx++){ // init vbuff[0:5]
  3552. vbuff[indx].ReInit(vcnt,vdim[indx],&buff[0],false);
  3553. buff.ReInit(buff.Dim()-vdim[indx]*vcnt, &buff[vdim[indx]*vcnt], false);
  3554. }
  3555. }
  3556. size_t trg_a=0, trg_b=0;
  3557. if(intdata.interac_cst.Dim()){ // Determine trg_a, trg_b
  3558. //trg_a=((tid+0)*intdata.interac_cnt.Dim())/omp_p;
  3559. //trg_b=((tid+1)*intdata.interac_cnt.Dim())/omp_p;
  3560. Vector<size_t>& interac_cst=intdata.interac_cst;
  3561. size_t cost=interac_cst[interac_cst.Dim()-1];
  3562. trg_a=std::lower_bound(&interac_cst[0],&interac_cst[interac_cst.Dim()-1],(cost*(tid+0))/omp_p)-&interac_cst[0]+1;
  3563. trg_b=std::lower_bound(&interac_cst[0],&interac_cst[interac_cst.Dim()-1],(cost*(tid+1))/omp_p)-&interac_cst[0]+1;
  3564. if(tid==omp_p-1) trg_b=interac_cst.Dim();
  3565. if(tid==0) trg_a=0;
  3566. }
  3567. for(size_t trg0=trg_a;trg0<trg_b;){
  3568. size_t trg1_max=1;
  3569. if(vcnt){ // Find trg1_max
  3570. size_t interac_cnt=intdata.interac_cnt[trg0];
  3571. while(trg0+trg1_max<trg_b){
  3572. interac_cnt+=intdata.interac_cnt[trg0+trg1_max];
  3573. if(interac_cnt>vcnt){
  3574. interac_cnt-=intdata.interac_cnt[trg0+trg1_max];
  3575. break;
  3576. }
  3577. trg1_max++;
  3578. }
  3579. assert(interac_cnt<=vcnt);
  3580. for(size_t k=0;k<6;k++){
  3581. if(vbuff[k].Dim(0)*vbuff[k].Dim(1)){
  3582. vbuff[k].ReInit(interac_cnt,vbuff[k].Dim(1),vbuff[k][0],false);
  3583. }
  3584. }
  3585. }else{
  3586. trg1_max=trg_b-trg0;
  3587. }
  3588. if(intdata.M[0].Dim(0) && intdata.M[0].Dim(1) && intdata.M[1].Dim(0) && intdata.M[1].Dim(1)){ // src mat-vec
  3589. size_t interac_idx=0;
  3590. for(size_t trg1=0;trg1<trg1_max;trg1++){ // Copy src_value to vbuff[0]
  3591. size_t trg=trg0+trg1;
  3592. for(size_t i=0;i<intdata.interac_cnt[trg];i++){
  3593. size_t int_id=intdata.interac_dsp[trg]+i;
  3594. size_t src=intdata.in_node[int_id];
  3595. src_value.ReInit(1, data.src_value.cnt[src], &data.src_value.ptr[0][0][data.src_value.dsp[src]], false);
  3596. { // Copy src_value to vbuff[0]
  3597. size_t vdim=vbuff[0].Dim(1);
  3598. assert(src_value.Dim(1)==vdim);
  3599. for(size_t j=0;j<vdim;j++) vbuff[0][interac_idx][j]=src_value[0][j];
  3600. }
  3601. size_t scal_idx=intdata.scal_idx[int_id];
  3602. { // scaling
  3603. Matrix<Real_t>& vec=vbuff[0];
  3604. Vector<Real_t>& scal=intdata.scal[scal_idx*4+0];
  3605. size_t scal_dim=scal.Dim();
  3606. if(scal_dim){
  3607. size_t vdim=vec.Dim(1);
  3608. for(size_t j=0;j<vdim;j+=scal_dim){
  3609. for(size_t k=0;k<scal_dim;k++){
  3610. vec[interac_idx][j+k]*=scal[k];
  3611. }
  3612. }
  3613. }
  3614. }
  3615. interac_idx++;
  3616. }
  3617. }
  3618. Matrix<Real_t>::GEMM(vbuff[1],vbuff[0],intdata.M[0]);
  3619. Matrix<Real_t>::GEMM(vbuff[2],vbuff[1],intdata.M[1]);
  3620. interac_idx=0;
  3621. for(size_t trg1=0;trg1<trg1_max;trg1++){
  3622. size_t trg=trg0+trg1;
  3623. for(size_t i=0;i<intdata.interac_cnt[trg];i++){
  3624. size_t int_id=intdata.interac_dsp[trg]+i;
  3625. size_t scal_idx=intdata.scal_idx[int_id];
  3626. { // scaling
  3627. Matrix<Real_t>& vec=vbuff[2];
  3628. Vector<Real_t>& scal=intdata.scal[scal_idx*4+1];
  3629. size_t scal_dim=scal.Dim();
  3630. if(scal_dim){
  3631. size_t vdim=vec.Dim(1);
  3632. for(size_t j=0;j<vdim;j+=scal_dim){
  3633. for(size_t k=0;k<scal_dim;k++){
  3634. vec[interac_idx][j+k]*=scal[k];
  3635. }
  3636. }
  3637. }
  3638. }
  3639. interac_idx++;
  3640. }
  3641. }
  3642. }
  3643. if(intdata.M[2].Dim(0) && intdata.M[2].Dim(1) && intdata.M[3].Dim(0) && intdata.M[3].Dim(1)){ // init vbuff[3]
  3644. size_t vdim=vbuff[3].Dim(0)*vbuff[3].Dim(1);
  3645. for(size_t i=0;i<vdim;i++) vbuff[3][0][i]=0;
  3646. }
  3647. { // Evaluate kernel functions
  3648. size_t interac_idx=0;
  3649. for(size_t trg1=0;trg1<trg1_max;trg1++){
  3650. size_t trg=trg0+trg1;
  3651. trg_coord.ReInit(1, data.trg_coord.cnt[trg], &data.trg_coord.ptr[0][0][data.trg_coord.dsp[trg]], false);
  3652. trg_value.ReInit(1, data.trg_value.cnt[trg], &data.trg_value.ptr[0][0][data.trg_value.dsp[trg]], false);
  3653. for(size_t i=0;i<intdata.interac_cnt[trg];i++){
  3654. size_t int_id=intdata.interac_dsp[trg]+i;
  3655. size_t src=intdata.in_node[int_id];
  3656. src_coord.ReInit(1, data.src_coord.cnt[src], &data.src_coord.ptr[0][0][data.src_coord.dsp[src]], false);
  3657. src_value.ReInit(1, data.src_value.cnt[src], &data.src_value.ptr[0][0][data.src_value.dsp[src]], false);
  3658. srf_coord.ReInit(1, data.srf_coord.cnt[src], &data.srf_coord.ptr[0][0][data.srf_coord.dsp[src]], false);
  3659. srf_value.ReInit(1, data.srf_value.cnt[src], &data.srf_value.ptr[0][0][data.srf_value.dsp[src]], false);
  3660. Real_t* vbuff2_ptr=(vbuff[2].Dim(0)*vbuff[2].Dim(1)?vbuff[2][interac_idx]:src_value[0]);
  3661. Real_t* vbuff3_ptr=(vbuff[3].Dim(0)*vbuff[3].Dim(1)?vbuff[3][interac_idx]:trg_value[0]);
  3662. if(src_coord.Dim(1)){
  3663. { // coord_shift
  3664. Real_t* shift=&intdata.coord_shift[int_id*COORD_DIM];
  3665. if(shift[0]!=0 || shift[1]!=0 || shift[2]!=0){
  3666. size_t vdim=src_coord.Dim(1);
  3667. Vector<Real_t> new_coord(vdim, &buff[0], false);
  3668. assert(buff.Dim()>=vdim); // Thread buffer is too small
  3669. //buff.ReInit(buff.Dim()-vdim, &buff[vdim], false);
  3670. for(size_t j=0;j<vdim;j+=COORD_DIM){
  3671. for(size_t k=0;k<COORD_DIM;k++){
  3672. new_coord[j+k]=src_coord[0][j+k]+shift[k];
  3673. }
  3674. }
  3675. src_coord.ReInit(1, vdim, &new_coord[0], false);
  3676. }
  3677. }
  3678. assert(ptr_single_layer_kernel); // assert(Single-layer kernel is implemented)
  3679. single_layer_kernel(src_coord[0], src_coord.Dim(1)/COORD_DIM, vbuff2_ptr, 1,
  3680. trg_coord[0], trg_coord.Dim(1)/COORD_DIM, vbuff3_ptr, NULL);
  3681. }
  3682. if(srf_coord.Dim(1)){
  3683. { // coord_shift
  3684. Real_t* shift=&intdata.coord_shift[int_id*COORD_DIM];
  3685. if(shift[0]!=0 || shift[1]!=0 || shift[2]!=0){
  3686. size_t vdim=srf_coord.Dim(1);
  3687. Vector<Real_t> new_coord(vdim, &buff[0], false);
  3688. assert(buff.Dim()>=vdim); // Thread buffer is too small
  3689. //buff.ReInit(buff.Dim()-vdim, &buff[vdim], false);
  3690. for(size_t j=0;j<vdim;j+=COORD_DIM){
  3691. for(size_t k=0;k<COORD_DIM;k++){
  3692. new_coord[j+k]=srf_coord[0][j+k]+shift[k];
  3693. }
  3694. }
  3695. srf_coord.ReInit(1, vdim, &new_coord[0], false);
  3696. }
  3697. }
  3698. assert(ptr_double_layer_kernel); // assert(Double-layer kernel is implemented)
  3699. double_layer_kernel(srf_coord[0], srf_coord.Dim(1)/COORD_DIM, srf_value[0], 1,
  3700. trg_coord[0], trg_coord.Dim(1)/COORD_DIM, vbuff3_ptr, NULL);
  3701. }
  3702. interac_idx++;
  3703. }
  3704. }
  3705. }
  3706. if(intdata.M[2].Dim(0) && intdata.M[2].Dim(1) && intdata.M[3].Dim(0) && intdata.M[3].Dim(1)){ // trg mat-vec
  3707. size_t interac_idx=0;
  3708. for(size_t trg1=0;trg1<trg1_max;trg1++){
  3709. size_t trg=trg0+trg1;
  3710. for(size_t i=0;i<intdata.interac_cnt[trg];i++){
  3711. size_t int_id=intdata.interac_dsp[trg]+i;
  3712. size_t scal_idx=intdata.scal_idx[int_id];
  3713. { // scaling
  3714. Matrix<Real_t>& vec=vbuff[3];
  3715. Vector<Real_t>& scal=intdata.scal[scal_idx*4+2];
  3716. size_t scal_dim=scal.Dim();
  3717. if(scal_dim){
  3718. size_t vdim=vec.Dim(1);
  3719. for(size_t j=0;j<vdim;j+=scal_dim){
  3720. for(size_t k=0;k<scal_dim;k++){
  3721. vec[interac_idx][j+k]*=scal[k];
  3722. }
  3723. }
  3724. }
  3725. }
  3726. interac_idx++;
  3727. }
  3728. }
  3729. Matrix<Real_t>::GEMM(vbuff[4],vbuff[3],intdata.M[2]);
  3730. Matrix<Real_t>::GEMM(vbuff[5],vbuff[4],intdata.M[3]);
  3731. interac_idx=0;
  3732. for(size_t trg1=0;trg1<trg1_max;trg1++){
  3733. size_t trg=trg0+trg1;
  3734. trg_value.ReInit(1, data.trg_value.cnt[trg], &data.trg_value.ptr[0][0][data.trg_value.dsp[trg]], false);
  3735. for(size_t i=0;i<intdata.interac_cnt[trg];i++){
  3736. size_t int_id=intdata.interac_dsp[trg]+i;
  3737. size_t scal_idx=intdata.scal_idx[int_id];
  3738. { // scaling
  3739. Matrix<Real_t>& vec=vbuff[5];
  3740. Vector<Real_t>& scal=intdata.scal[scal_idx*4+3];
  3741. size_t scal_dim=scal.Dim();
  3742. if(scal_dim){
  3743. size_t vdim=vec.Dim(1);
  3744. for(size_t j=0;j<vdim;j+=scal_dim){
  3745. for(size_t k=0;k<scal_dim;k++){
  3746. vec[interac_idx][j+k]*=scal[k];
  3747. }
  3748. }
  3749. }
  3750. }
  3751. { // Add vbuff[5] to trg_value
  3752. size_t vdim=vbuff[5].Dim(1);
  3753. assert(trg_value.Dim(1)==vdim);
  3754. for(size_t i=0;i<vdim;i++) trg_value[0][i]+=vbuff[5][interac_idx][i];
  3755. }
  3756. interac_idx++;
  3757. }
  3758. }
  3759. }
  3760. trg0+=trg1_max;
  3761. }
  3762. }
  3763. }
  3764. if(device) MIC_Lock::release_lock(lock_idx);
  3765. }
  3766. #ifdef __INTEL_OFFLOAD
  3767. if(SYNC){
  3768. #pragma offload if(device) target(mic:0)
  3769. {if(device) MIC_Lock::wait_lock(lock_idx);}
  3770. }
  3771. #endif
  3772. Profile::Toc();
  3773. }
  3774. template <class FMMNode>
  3775. void FMM_Pts<FMMNode>::X_ListSetup(SetupData<Real_t>& setup_data, FMMTree_t* tree, std::vector<Matrix<Real_t> >& buff, std::vector<Vector<FMMNode_t*> >& n_list, int level, bool device){
  3776. if(!this->MultipoleOrder()) return;
  3777. { // Set setup_data
  3778. setup_data. level=level;
  3779. setup_data.kernel=kernel->k_s2l;
  3780. setup_data. input_data=&buff[4];
  3781. setup_data.output_data=&buff[1];
  3782. setup_data. coord_data=&buff[6];
  3783. Vector<FMMNode_t*>& nodes_in =n_list[4];
  3784. Vector<FMMNode_t*>& nodes_out=n_list[1];
  3785. setup_data.nodes_in .clear();
  3786. setup_data.nodes_out.clear();
  3787. for(size_t i=0;i<nodes_in .Dim();i++) if((level==0 || level==-1) && (nodes_in [i]->src_coord.Dim() || nodes_in [i]->surf_coord.Dim()) && nodes_in [i]->IsLeaf ()) setup_data.nodes_in .push_back(nodes_in [i]);
  3788. for(size_t i=0;i<nodes_out.Dim();i++) if((level==0 || level==-1) && nodes_out[i]->pt_cnt[1] && !nodes_out[i]->IsGhost()) setup_data.nodes_out.push_back(nodes_out[i]);
  3789. }
  3790. struct PackedData{
  3791. size_t len;
  3792. Matrix<Real_t>* ptr;
  3793. Vector<size_t> cnt;
  3794. Vector<size_t> dsp;
  3795. };
  3796. struct InteracData{
  3797. Vector<size_t> in_node;
  3798. Vector<size_t> scal_idx;
  3799. Vector<Real_t> coord_shift;
  3800. Vector<size_t> interac_cnt;
  3801. Vector<size_t> interac_dsp;
  3802. Vector<size_t> interac_cst;
  3803. Vector<Real_t> scal[4*MAX_DEPTH];
  3804. Matrix<Real_t> M[4];
  3805. };
  3806. struct ptSetupData{
  3807. int level;
  3808. const Kernel<Real_t>* kernel;
  3809. PackedData src_coord; // Src coord
  3810. PackedData src_value; // Src density
  3811. PackedData srf_coord; // Srf coord
  3812. PackedData srf_value; // Srf density
  3813. PackedData trg_coord; // Trg coord
  3814. PackedData trg_value; // Trg potential
  3815. InteracData interac_data;
  3816. };
  3817. ptSetupData data;
  3818. data. level=setup_data. level;
  3819. data.kernel=setup_data.kernel;
  3820. std::vector<void*>& nodes_in =setup_data.nodes_in ;
  3821. std::vector<void*>& nodes_out=setup_data.nodes_out;
  3822. { // Set src data
  3823. std::vector<void*>& nodes=nodes_in;
  3824. PackedData& coord=data.src_coord;
  3825. PackedData& value=data.src_value;
  3826. coord.ptr=setup_data. coord_data;
  3827. value.ptr=setup_data. input_data;
  3828. coord.len=coord.ptr->Dim(0)*coord.ptr->Dim(1);
  3829. value.len=value.ptr->Dim(0)*value.ptr->Dim(1);
  3830. coord.cnt.ReInit(nodes.size());
  3831. coord.dsp.ReInit(nodes.size());
  3832. value.cnt.ReInit(nodes.size());
  3833. value.dsp.ReInit(nodes.size());
  3834. #pragma omp parallel for
  3835. for(size_t i=0;i<nodes.size();i++){
  3836. ((FMMNode_t*)nodes[i])->node_id=i;
  3837. Vector<Real_t>& coord_vec=((FMMNode_t*)nodes[i])->src_coord;
  3838. Vector<Real_t>& value_vec=((FMMNode_t*)nodes[i])->src_value;
  3839. if(coord_vec.Dim()){
  3840. coord.dsp[i]=&coord_vec[0]-coord.ptr[0][0];
  3841. assert(coord.dsp[i]<coord.len);
  3842. coord.cnt[i]=coord_vec.Dim();
  3843. }else{
  3844. coord.dsp[i]=0;
  3845. coord.cnt[i]=0;
  3846. }
  3847. if(value_vec.Dim()){
  3848. value.dsp[i]=&value_vec[0]-value.ptr[0][0];
  3849. assert(value.dsp[i]<value.len);
  3850. value.cnt[i]=value_vec.Dim();
  3851. }else{
  3852. value.dsp[i]=0;
  3853. value.cnt[i]=0;
  3854. }
  3855. }
  3856. }
  3857. { // Set srf data
  3858. std::vector<void*>& nodes=nodes_in;
  3859. PackedData& coord=data.srf_coord;
  3860. PackedData& value=data.srf_value;
  3861. coord.ptr=setup_data. coord_data;
  3862. value.ptr=setup_data. input_data;
  3863. coord.len=coord.ptr->Dim(0)*coord.ptr->Dim(1);
  3864. value.len=value.ptr->Dim(0)*value.ptr->Dim(1);
  3865. coord.cnt.ReInit(nodes.size());
  3866. coord.dsp.ReInit(nodes.size());
  3867. value.cnt.ReInit(nodes.size());
  3868. value.dsp.ReInit(nodes.size());
  3869. #pragma omp parallel for
  3870. for(size_t i=0;i<nodes.size();i++){
  3871. Vector<Real_t>& coord_vec=((FMMNode_t*)nodes[i])->surf_coord;
  3872. Vector<Real_t>& value_vec=((FMMNode_t*)nodes[i])->surf_value;
  3873. if(coord_vec.Dim()){
  3874. coord.dsp[i]=&coord_vec[0]-coord.ptr[0][0];
  3875. assert(coord.dsp[i]<coord.len);
  3876. coord.cnt[i]=coord_vec.Dim();
  3877. }else{
  3878. coord.dsp[i]=0;
  3879. coord.cnt[i]=0;
  3880. }
  3881. if(value_vec.Dim()){
  3882. value.dsp[i]=&value_vec[0]-value.ptr[0][0];
  3883. assert(value.dsp[i]<value.len);
  3884. value.cnt[i]=value_vec.Dim();
  3885. }else{
  3886. value.dsp[i]=0;
  3887. value.cnt[i]=0;
  3888. }
  3889. }
  3890. }
  3891. { // Set trg data
  3892. std::vector<void*>& nodes=nodes_out;
  3893. PackedData& coord=data.trg_coord;
  3894. PackedData& value=data.trg_value;
  3895. coord.ptr=setup_data. coord_data;
  3896. value.ptr=setup_data.output_data;
  3897. coord.len=coord.ptr->Dim(0)*coord.ptr->Dim(1);
  3898. value.len=value.ptr->Dim(0)*value.ptr->Dim(1);
  3899. coord.cnt.ReInit(nodes.size());
  3900. coord.dsp.ReInit(nodes.size());
  3901. value.cnt.ReInit(nodes.size());
  3902. value.dsp.ReInit(nodes.size());
  3903. #pragma omp parallel for
  3904. for(size_t i=0;i<nodes.size();i++){
  3905. Vector<Real_t>& coord_vec=tree->dnwd_check_surf[((FMMNode*)nodes[i])->Depth()];
  3906. Vector<Real_t>& value_vec=((FMMData*)((FMMNode*)nodes[i])->FMMData())->dnward_equiv;
  3907. if(coord_vec.Dim()){
  3908. coord.dsp[i]=&coord_vec[0]-coord.ptr[0][0];
  3909. assert(coord.dsp[i]<coord.len);
  3910. coord.cnt[i]=coord_vec.Dim();
  3911. }else{
  3912. coord.dsp[i]=0;
  3913. coord.cnt[i]=0;
  3914. }
  3915. if(value_vec.Dim()){
  3916. value.dsp[i]=&value_vec[0]-value.ptr[0][0];
  3917. assert(value.dsp[i]<value.len);
  3918. value.cnt[i]=value_vec.Dim();
  3919. }else{
  3920. value.dsp[i]=0;
  3921. value.cnt[i]=0;
  3922. }
  3923. }
  3924. }
  3925. { // Set interac_data
  3926. int omp_p=omp_get_max_threads();
  3927. std::vector<std::vector<size_t> > in_node_(omp_p);
  3928. std::vector<std::vector<size_t> > scal_idx_(omp_p);
  3929. std::vector<std::vector<Real_t> > coord_shift_(omp_p);
  3930. std::vector<std::vector<size_t> > interac_cnt_(omp_p);
  3931. size_t m=this->MultipoleOrder();
  3932. size_t Nsrf=(6*(m-1)*(m-1)+2);
  3933. #pragma omp parallel for
  3934. for(size_t tid=0;tid<omp_p;tid++){
  3935. std::vector<size_t>& in_node =in_node_[tid] ;
  3936. std::vector<size_t>& scal_idx =scal_idx_[tid] ;
  3937. std::vector<Real_t>& coord_shift=coord_shift_[tid];
  3938. std::vector<size_t>& interac_cnt=interac_cnt_[tid] ;
  3939. size_t a=(nodes_out.size()*(tid+0))/omp_p;
  3940. size_t b=(nodes_out.size()*(tid+1))/omp_p;
  3941. for(size_t i=a;i<b;i++){
  3942. FMMNode_t* tnode=(FMMNode_t*)nodes_out[i];
  3943. if(tnode->IsLeaf() && tnode->pt_cnt[1]<=Nsrf){ // skip: handled in U-list
  3944. interac_cnt.push_back(0);
  3945. continue;
  3946. }
  3947. Real_t s=pvfmm::pow<Real_t>(0.5,tnode->Depth());
  3948. size_t interac_cnt_=0;
  3949. { // X_Type
  3950. Mat_Type type=X_Type;
  3951. Vector<FMMNode_t*>& intlst=tnode->interac_list[type];
  3952. for(size_t j=0;j<intlst.Dim();j++) if(intlst[j]){
  3953. FMMNode_t* snode=intlst[j];
  3954. size_t snode_id=snode->node_id;
  3955. if(snode_id>=nodes_in.size() || nodes_in[snode_id]!=snode) continue;
  3956. in_node.push_back(snode_id);
  3957. scal_idx.push_back(snode->Depth());
  3958. { // set coord_shift
  3959. const int* rel_coord=interac_list.RelativeCoord(type,j);
  3960. const Real_t* scoord=snode->Coord();
  3961. const Real_t* tcoord=tnode->Coord();
  3962. Real_t shift[COORD_DIM];
  3963. shift[0]=rel_coord[0]*0.5*s-(scoord[0]+1.0*s)+(0+0.5*s);
  3964. shift[1]=rel_coord[1]*0.5*s-(scoord[1]+1.0*s)+(0+0.5*s);
  3965. shift[2]=rel_coord[2]*0.5*s-(scoord[2]+1.0*s)+(0+0.5*s);
  3966. coord_shift.push_back(shift[0]);
  3967. coord_shift.push_back(shift[1]);
  3968. coord_shift.push_back(shift[2]);
  3969. }
  3970. interac_cnt_++;
  3971. }
  3972. }
  3973. interac_cnt.push_back(interac_cnt_);
  3974. }
  3975. }
  3976. { // Combine interac data
  3977. InteracData& interac_data=data.interac_data;
  3978. { // in_node
  3979. typedef size_t ElemType;
  3980. std::vector<std::vector<ElemType> >& vec_=in_node_;
  3981. pvfmm::Vector<ElemType>& vec=interac_data.in_node;
  3982. std::vector<size_t> vec_dsp(omp_p+1,0);
  3983. for(size_t tid=0;tid<omp_p;tid++){
  3984. vec_dsp[tid+1]=vec_dsp[tid]+vec_[tid].size();
  3985. }
  3986. vec.ReInit(vec_dsp[omp_p]);
  3987. #pragma omp parallel for
  3988. for(size_t tid=0;tid<omp_p;tid++){
  3989. memcpy(&vec[0]+vec_dsp[tid],&vec_[tid][0],vec_[tid].size()*sizeof(ElemType));
  3990. }
  3991. }
  3992. { // scal_idx
  3993. typedef size_t ElemType;
  3994. std::vector<std::vector<ElemType> >& vec_=scal_idx_;
  3995. pvfmm::Vector<ElemType>& vec=interac_data.scal_idx;
  3996. std::vector<size_t> vec_dsp(omp_p+1,0);
  3997. for(size_t tid=0;tid<omp_p;tid++){
  3998. vec_dsp[tid+1]=vec_dsp[tid]+vec_[tid].size();
  3999. }
  4000. vec.ReInit(vec_dsp[omp_p]);
  4001. #pragma omp parallel for
  4002. for(size_t tid=0;tid<omp_p;tid++){
  4003. memcpy(&vec[0]+vec_dsp[tid],&vec_[tid][0],vec_[tid].size()*sizeof(ElemType));
  4004. }
  4005. }
  4006. { // coord_shift
  4007. typedef Real_t ElemType;
  4008. std::vector<std::vector<ElemType> >& vec_=coord_shift_;
  4009. pvfmm::Vector<ElemType>& vec=interac_data.coord_shift;
  4010. std::vector<size_t> vec_dsp(omp_p+1,0);
  4011. for(size_t tid=0;tid<omp_p;tid++){
  4012. vec_dsp[tid+1]=vec_dsp[tid]+vec_[tid].size();
  4013. }
  4014. vec.ReInit(vec_dsp[omp_p]);
  4015. #pragma omp parallel for
  4016. for(size_t tid=0;tid<omp_p;tid++){
  4017. memcpy(&vec[0]+vec_dsp[tid],&vec_[tid][0],vec_[tid].size()*sizeof(ElemType));
  4018. }
  4019. }
  4020. { // interac_cnt
  4021. typedef size_t ElemType;
  4022. std::vector<std::vector<ElemType> >& vec_=interac_cnt_;
  4023. pvfmm::Vector<ElemType>& vec=interac_data.interac_cnt;
  4024. std::vector<size_t> vec_dsp(omp_p+1,0);
  4025. for(size_t tid=0;tid<omp_p;tid++){
  4026. vec_dsp[tid+1]=vec_dsp[tid]+vec_[tid].size();
  4027. }
  4028. vec.ReInit(vec_dsp[omp_p]);
  4029. #pragma omp parallel for
  4030. for(size_t tid=0;tid<omp_p;tid++){
  4031. memcpy(&vec[0]+vec_dsp[tid],&vec_[tid][0],vec_[tid].size()*sizeof(ElemType));
  4032. }
  4033. }
  4034. { // interac_dsp
  4035. pvfmm::Vector<size_t>& cnt=interac_data.interac_cnt;
  4036. pvfmm::Vector<size_t>& dsp=interac_data.interac_dsp;
  4037. dsp.ReInit(cnt.Dim()); if(dsp.Dim()) dsp[0]=0;
  4038. omp_par::scan(&cnt[0],&dsp[0],dsp.Dim());
  4039. }
  4040. }
  4041. }
  4042. PtSetup(setup_data, &data);
  4043. }
  4044. template <class FMMNode>
  4045. void FMM_Pts<FMMNode>::X_List (SetupData<Real_t>& setup_data, bool device){
  4046. if(!this->MultipoleOrder()) return;
  4047. //Add X_List contribution.
  4048. this->EvalListPts(setup_data, device);
  4049. }
  4050. template <class FMMNode>
  4051. void FMM_Pts<FMMNode>::W_ListSetup(SetupData<Real_t>& setup_data, FMMTree_t* tree, std::vector<Matrix<Real_t> >& buff, std::vector<Vector<FMMNode_t*> >& n_list, int level, bool device){
  4052. if(!this->MultipoleOrder()) return;
  4053. { // Set setup_data
  4054. setup_data. level=level;
  4055. setup_data.kernel=kernel->k_m2t;
  4056. setup_data. input_data=&buff[0];
  4057. setup_data.output_data=&buff[5];
  4058. setup_data. coord_data=&buff[6];
  4059. Vector<FMMNode_t*>& nodes_in =n_list[0];
  4060. Vector<FMMNode_t*>& nodes_out=n_list[5];
  4061. setup_data.nodes_in .clear();
  4062. setup_data.nodes_out.clear();
  4063. for(size_t i=0;i<nodes_in .Dim();i++) if((level==0 || level==-1) && nodes_in [i]->pt_cnt[0] ) setup_data.nodes_in .push_back(nodes_in [i]);
  4064. for(size_t i=0;i<nodes_out.Dim();i++) if((level==0 || level==-1) && nodes_out[i]->trg_coord.Dim() && nodes_out[i]->IsLeaf() && !nodes_out[i]->IsGhost()) setup_data.nodes_out.push_back(nodes_out[i]);
  4065. }
  4066. struct PackedData{
  4067. size_t len;
  4068. Matrix<Real_t>* ptr;
  4069. Vector<size_t> cnt;
  4070. Vector<size_t> dsp;
  4071. };
  4072. struct InteracData{
  4073. Vector<size_t> in_node;
  4074. Vector<size_t> scal_idx;
  4075. Vector<Real_t> coord_shift;
  4076. Vector<size_t> interac_cnt;
  4077. Vector<size_t> interac_dsp;
  4078. Vector<size_t> interac_cst;
  4079. Vector<Real_t> scal[4*MAX_DEPTH];
  4080. Matrix<Real_t> M[4];
  4081. };
  4082. struct ptSetupData{
  4083. int level;
  4084. const Kernel<Real_t>* kernel;
  4085. PackedData src_coord; // Src coord
  4086. PackedData src_value; // Src density
  4087. PackedData srf_coord; // Srf coord
  4088. PackedData srf_value; // Srf density
  4089. PackedData trg_coord; // Trg coord
  4090. PackedData trg_value; // Trg potential
  4091. InteracData interac_data;
  4092. };
  4093. ptSetupData data;
  4094. data. level=setup_data. level;
  4095. data.kernel=setup_data.kernel;
  4096. std::vector<void*>& nodes_in =setup_data.nodes_in ;
  4097. std::vector<void*>& nodes_out=setup_data.nodes_out;
  4098. { // Set src data
  4099. std::vector<void*>& nodes=nodes_in;
  4100. PackedData& coord=data.src_coord;
  4101. PackedData& value=data.src_value;
  4102. coord.ptr=setup_data. coord_data;
  4103. value.ptr=setup_data. input_data;
  4104. coord.len=coord.ptr->Dim(0)*coord.ptr->Dim(1);
  4105. value.len=value.ptr->Dim(0)*value.ptr->Dim(1);
  4106. coord.cnt.ReInit(nodes.size());
  4107. coord.dsp.ReInit(nodes.size());
  4108. value.cnt.ReInit(nodes.size());
  4109. value.dsp.ReInit(nodes.size());
  4110. #pragma omp parallel for
  4111. for(size_t i=0;i<nodes.size();i++){
  4112. ((FMMNode_t*)nodes[i])->node_id=i;
  4113. Vector<Real_t>& coord_vec=tree->upwd_equiv_surf[((FMMNode*)nodes[i])->Depth()];
  4114. Vector<Real_t>& value_vec=((FMMData*)((FMMNode*)nodes[i])->FMMData())->upward_equiv;
  4115. if(coord_vec.Dim()){
  4116. coord.dsp[i]=&coord_vec[0]-coord.ptr[0][0];
  4117. assert(coord.dsp[i]<coord.len);
  4118. coord.cnt[i]=coord_vec.Dim();
  4119. }else{
  4120. coord.dsp[i]=0;
  4121. coord.cnt[i]=0;
  4122. }
  4123. if(value_vec.Dim()){
  4124. value.dsp[i]=&value_vec[0]-value.ptr[0][0];
  4125. assert(value.dsp[i]<value.len);
  4126. value.cnt[i]=value_vec.Dim();
  4127. }else{
  4128. value.dsp[i]=0;
  4129. value.cnt[i]=0;
  4130. }
  4131. }
  4132. }
  4133. { // Set srf data
  4134. std::vector<void*>& nodes=nodes_in;
  4135. PackedData& coord=data.srf_coord;
  4136. PackedData& value=data.srf_value;
  4137. coord.ptr=setup_data. coord_data;
  4138. value.ptr=setup_data. input_data;
  4139. coord.len=coord.ptr->Dim(0)*coord.ptr->Dim(1);
  4140. value.len=value.ptr->Dim(0)*value.ptr->Dim(1);
  4141. coord.cnt.ReInit(nodes.size());
  4142. coord.dsp.ReInit(nodes.size());
  4143. value.cnt.ReInit(nodes.size());
  4144. value.dsp.ReInit(nodes.size());
  4145. #pragma omp parallel for
  4146. for(size_t i=0;i<nodes.size();i++){
  4147. coord.dsp[i]=0;
  4148. coord.cnt[i]=0;
  4149. value.dsp[i]=0;
  4150. value.cnt[i]=0;
  4151. }
  4152. }
  4153. { // Set trg data
  4154. std::vector<void*>& nodes=nodes_out;
  4155. PackedData& coord=data.trg_coord;
  4156. PackedData& value=data.trg_value;
  4157. coord.ptr=setup_data. coord_data;
  4158. value.ptr=setup_data.output_data;
  4159. coord.len=coord.ptr->Dim(0)*coord.ptr->Dim(1);
  4160. value.len=value.ptr->Dim(0)*value.ptr->Dim(1);
  4161. coord.cnt.ReInit(nodes.size());
  4162. coord.dsp.ReInit(nodes.size());
  4163. value.cnt.ReInit(nodes.size());
  4164. value.dsp.ReInit(nodes.size());
  4165. #pragma omp parallel for
  4166. for(size_t i=0;i<nodes.size();i++){
  4167. Vector<Real_t>& coord_vec=((FMMNode_t*)nodes[i])->trg_coord;
  4168. Vector<Real_t>& value_vec=((FMMNode_t*)nodes[i])->trg_value;
  4169. if(coord_vec.Dim()){
  4170. coord.dsp[i]=&coord_vec[0]-coord.ptr[0][0];
  4171. assert(coord.dsp[i]<coord.len);
  4172. coord.cnt[i]=coord_vec.Dim();
  4173. }else{
  4174. coord.dsp[i]=0;
  4175. coord.cnt[i]=0;
  4176. }
  4177. if(value_vec.Dim()){
  4178. value.dsp[i]=&value_vec[0]-value.ptr[0][0];
  4179. assert(value.dsp[i]<value.len);
  4180. value.cnt[i]=value_vec.Dim();
  4181. }else{
  4182. value.dsp[i]=0;
  4183. value.cnt[i]=0;
  4184. }
  4185. }
  4186. }
  4187. { // Set interac_data
  4188. int omp_p=omp_get_max_threads();
  4189. std::vector<std::vector<size_t> > in_node_(omp_p);
  4190. std::vector<std::vector<size_t> > scal_idx_(omp_p);
  4191. std::vector<std::vector<Real_t> > coord_shift_(omp_p);
  4192. std::vector<std::vector<size_t> > interac_cnt_(omp_p);
  4193. size_t m=this->MultipoleOrder();
  4194. size_t Nsrf=(6*(m-1)*(m-1)+2);
  4195. #pragma omp parallel for
  4196. for(size_t tid=0;tid<omp_p;tid++){
  4197. std::vector<size_t>& in_node =in_node_[tid] ;
  4198. std::vector<size_t>& scal_idx =scal_idx_[tid] ;
  4199. std::vector<Real_t>& coord_shift=coord_shift_[tid];
  4200. std::vector<size_t>& interac_cnt=interac_cnt_[tid] ;
  4201. size_t a=(nodes_out.size()*(tid+0))/omp_p;
  4202. size_t b=(nodes_out.size()*(tid+1))/omp_p;
  4203. for(size_t i=a;i<b;i++){
  4204. FMMNode_t* tnode=(FMMNode_t*)nodes_out[i];
  4205. Real_t s=pvfmm::pow<Real_t>(0.5,tnode->Depth());
  4206. size_t interac_cnt_=0;
  4207. { // W_Type
  4208. Mat_Type type=W_Type;
  4209. Vector<FMMNode_t*>& intlst=tnode->interac_list[type];
  4210. for(size_t j=0;j<intlst.Dim();j++) if(intlst[j]){
  4211. FMMNode_t* snode=intlst[j];
  4212. size_t snode_id=snode->node_id;
  4213. if(snode_id>=nodes_in.size() || nodes_in[snode_id]!=snode) continue;
  4214. if(snode->IsGhost() && snode->src_coord.Dim()+snode->surf_coord.Dim()==0){ // Is non-leaf ghost node
  4215. }else if(snode->IsLeaf() && snode->pt_cnt[0]<=Nsrf) continue; // skip: handled in U-list
  4216. in_node.push_back(snode_id);
  4217. scal_idx.push_back(snode->Depth());
  4218. { // set coord_shift
  4219. const int* rel_coord=interac_list.RelativeCoord(type,j);
  4220. const Real_t* scoord=snode->Coord();
  4221. const Real_t* tcoord=tnode->Coord();
  4222. Real_t shift[COORD_DIM];
  4223. shift[0]=rel_coord[0]*0.25*s-(0+0.25*s)+(tcoord[0]+0.5*s);
  4224. shift[1]=rel_coord[1]*0.25*s-(0+0.25*s)+(tcoord[1]+0.5*s);
  4225. shift[2]=rel_coord[2]*0.25*s-(0+0.25*s)+(tcoord[2]+0.5*s);
  4226. coord_shift.push_back(shift[0]);
  4227. coord_shift.push_back(shift[1]);
  4228. coord_shift.push_back(shift[2]);
  4229. }
  4230. interac_cnt_++;
  4231. }
  4232. }
  4233. interac_cnt.push_back(interac_cnt_);
  4234. }
  4235. }
  4236. { // Combine interac data
  4237. InteracData& interac_data=data.interac_data;
  4238. { // in_node
  4239. typedef size_t ElemType;
  4240. std::vector<std::vector<ElemType> >& vec_=in_node_;
  4241. pvfmm::Vector<ElemType>& vec=interac_data.in_node;
  4242. std::vector<size_t> vec_dsp(omp_p+1,0);
  4243. for(size_t tid=0;tid<omp_p;tid++){
  4244. vec_dsp[tid+1]=vec_dsp[tid]+vec_[tid].size();
  4245. }
  4246. vec.ReInit(vec_dsp[omp_p]);
  4247. #pragma omp parallel for
  4248. for(size_t tid=0;tid<omp_p;tid++){
  4249. memcpy(&vec[0]+vec_dsp[tid],&vec_[tid][0],vec_[tid].size()*sizeof(ElemType));
  4250. }
  4251. }
  4252. { // scal_idx
  4253. typedef size_t ElemType;
  4254. std::vector<std::vector<ElemType> >& vec_=scal_idx_;
  4255. pvfmm::Vector<ElemType>& vec=interac_data.scal_idx;
  4256. std::vector<size_t> vec_dsp(omp_p+1,0);
  4257. for(size_t tid=0;tid<omp_p;tid++){
  4258. vec_dsp[tid+1]=vec_dsp[tid]+vec_[tid].size();
  4259. }
  4260. vec.ReInit(vec_dsp[omp_p]);
  4261. #pragma omp parallel for
  4262. for(size_t tid=0;tid<omp_p;tid++){
  4263. memcpy(&vec[0]+vec_dsp[tid],&vec_[tid][0],vec_[tid].size()*sizeof(ElemType));
  4264. }
  4265. }
  4266. { // coord_shift
  4267. typedef Real_t ElemType;
  4268. std::vector<std::vector<ElemType> >& vec_=coord_shift_;
  4269. pvfmm::Vector<ElemType>& vec=interac_data.coord_shift;
  4270. std::vector<size_t> vec_dsp(omp_p+1,0);
  4271. for(size_t tid=0;tid<omp_p;tid++){
  4272. vec_dsp[tid+1]=vec_dsp[tid]+vec_[tid].size();
  4273. }
  4274. vec.ReInit(vec_dsp[omp_p]);
  4275. #pragma omp parallel for
  4276. for(size_t tid=0;tid<omp_p;tid++){
  4277. memcpy(&vec[0]+vec_dsp[tid],&vec_[tid][0],vec_[tid].size()*sizeof(ElemType));
  4278. }
  4279. }
  4280. { // interac_cnt
  4281. typedef size_t ElemType;
  4282. std::vector<std::vector<ElemType> >& vec_=interac_cnt_;
  4283. pvfmm::Vector<ElemType>& vec=interac_data.interac_cnt;
  4284. std::vector<size_t> vec_dsp(omp_p+1,0);
  4285. for(size_t tid=0;tid<omp_p;tid++){
  4286. vec_dsp[tid+1]=vec_dsp[tid]+vec_[tid].size();
  4287. }
  4288. vec.ReInit(vec_dsp[omp_p]);
  4289. #pragma omp parallel for
  4290. for(size_t tid=0;tid<omp_p;tid++){
  4291. memcpy(&vec[0]+vec_dsp[tid],&vec_[tid][0],vec_[tid].size()*sizeof(ElemType));
  4292. }
  4293. }
  4294. { // interac_dsp
  4295. pvfmm::Vector<size_t>& cnt=interac_data.interac_cnt;
  4296. pvfmm::Vector<size_t>& dsp=interac_data.interac_dsp;
  4297. dsp.ReInit(cnt.Dim()); if(dsp.Dim()) dsp[0]=0;
  4298. omp_par::scan(&cnt[0],&dsp[0],dsp.Dim());
  4299. }
  4300. }
  4301. }
  4302. PtSetup(setup_data, &data);
  4303. }
  4304. template <class FMMNode>
  4305. void FMM_Pts<FMMNode>::W_List (SetupData<Real_t>& setup_data, bool device){
  4306. if(!this->MultipoleOrder()) return;
  4307. //Add W_List contribution.
  4308. this->EvalListPts(setup_data, device);
  4309. }
  4310. template <class FMMNode>
  4311. void FMM_Pts<FMMNode>::U_ListSetup(SetupData<Real_t>& setup_data, FMMTree_t* tree, std::vector<Matrix<Real_t> >& buff, std::vector<Vector<FMMNode_t*> >& n_list, int level, bool device){
  4312. { // Set setup_data
  4313. setup_data. level=level;
  4314. setup_data.kernel=kernel->k_s2t;
  4315. setup_data. input_data=&buff[4];
  4316. setup_data.output_data=&buff[5];
  4317. setup_data. coord_data=&buff[6];
  4318. Vector<FMMNode_t*>& nodes_in =n_list[4];
  4319. Vector<FMMNode_t*>& nodes_out=n_list[5];
  4320. setup_data.nodes_in .clear();
  4321. setup_data.nodes_out.clear();
  4322. for(size_t i=0;i<nodes_in .Dim();i++) if((level==0 || level==-1) && (nodes_in [i]->src_coord.Dim() || nodes_in [i]->surf_coord.Dim()) && nodes_in [i]->IsLeaf() ) setup_data.nodes_in .push_back(nodes_in [i]);
  4323. for(size_t i=0;i<nodes_out.Dim();i++) if((level==0 || level==-1) && (nodes_out[i]->trg_coord.Dim() ) && nodes_out[i]->IsLeaf() && !nodes_out[i]->IsGhost()) setup_data.nodes_out.push_back(nodes_out[i]);
  4324. }
  4325. struct PackedData{
  4326. size_t len;
  4327. Matrix<Real_t>* ptr;
  4328. Vector<size_t> cnt;
  4329. Vector<size_t> dsp;
  4330. };
  4331. struct InteracData{
  4332. Vector<size_t> in_node;
  4333. Vector<size_t> scal_idx;
  4334. Vector<Real_t> coord_shift;
  4335. Vector<size_t> interac_cnt;
  4336. Vector<size_t> interac_dsp;
  4337. Vector<size_t> interac_cst;
  4338. Vector<Real_t> scal[4*MAX_DEPTH];
  4339. Matrix<Real_t> M[4];
  4340. };
  4341. struct ptSetupData{
  4342. int level;
  4343. const Kernel<Real_t>* kernel;
  4344. PackedData src_coord; // Src coord
  4345. PackedData src_value; // Src density
  4346. PackedData srf_coord; // Srf coord
  4347. PackedData srf_value; // Srf density
  4348. PackedData trg_coord; // Trg coord
  4349. PackedData trg_value; // Trg potential
  4350. InteracData interac_data;
  4351. };
  4352. ptSetupData data;
  4353. data. level=setup_data. level;
  4354. data.kernel=setup_data.kernel;
  4355. std::vector<void*>& nodes_in =setup_data.nodes_in ;
  4356. std::vector<void*>& nodes_out=setup_data.nodes_out;
  4357. { // Set src data
  4358. std::vector<void*>& nodes=nodes_in;
  4359. PackedData& coord=data.src_coord;
  4360. PackedData& value=data.src_value;
  4361. coord.ptr=setup_data. coord_data;
  4362. value.ptr=setup_data. input_data;
  4363. coord.len=coord.ptr->Dim(0)*coord.ptr->Dim(1);
  4364. value.len=value.ptr->Dim(0)*value.ptr->Dim(1);
  4365. coord.cnt.ReInit(nodes.size());
  4366. coord.dsp.ReInit(nodes.size());
  4367. value.cnt.ReInit(nodes.size());
  4368. value.dsp.ReInit(nodes.size());
  4369. #pragma omp parallel for
  4370. for(size_t i=0;i<nodes.size();i++){
  4371. ((FMMNode_t*)nodes[i])->node_id=i;
  4372. Vector<Real_t>& coord_vec=((FMMNode_t*)nodes[i])->src_coord;
  4373. Vector<Real_t>& value_vec=((FMMNode_t*)nodes[i])->src_value;
  4374. if(coord_vec.Dim()){
  4375. coord.dsp[i]=&coord_vec[0]-coord.ptr[0][0];
  4376. assert(coord.dsp[i]<coord.len);
  4377. coord.cnt[i]=coord_vec.Dim();
  4378. }else{
  4379. coord.dsp[i]=0;
  4380. coord.cnt[i]=0;
  4381. }
  4382. if(value_vec.Dim()){
  4383. value.dsp[i]=&value_vec[0]-value.ptr[0][0];
  4384. assert(value.dsp[i]<value.len);
  4385. value.cnt[i]=value_vec.Dim();
  4386. }else{
  4387. value.dsp[i]=0;
  4388. value.cnt[i]=0;
  4389. }
  4390. }
  4391. }
  4392. { // Set srf data
  4393. std::vector<void*>& nodes=nodes_in;
  4394. PackedData& coord=data.srf_coord;
  4395. PackedData& value=data.srf_value;
  4396. coord.ptr=setup_data. coord_data;
  4397. value.ptr=setup_data. input_data;
  4398. coord.len=coord.ptr->Dim(0)*coord.ptr->Dim(1);
  4399. value.len=value.ptr->Dim(0)*value.ptr->Dim(1);
  4400. coord.cnt.ReInit(nodes.size());
  4401. coord.dsp.ReInit(nodes.size());
  4402. value.cnt.ReInit(nodes.size());
  4403. value.dsp.ReInit(nodes.size());
  4404. #pragma omp parallel for
  4405. for(size_t i=0;i<nodes.size();i++){
  4406. Vector<Real_t>& coord_vec=((FMMNode_t*)nodes[i])->surf_coord;
  4407. Vector<Real_t>& value_vec=((FMMNode_t*)nodes[i])->surf_value;
  4408. if(coord_vec.Dim()){
  4409. coord.dsp[i]=&coord_vec[0]-coord.ptr[0][0];
  4410. assert(coord.dsp[i]<coord.len);
  4411. coord.cnt[i]=coord_vec.Dim();
  4412. }else{
  4413. coord.dsp[i]=0;
  4414. coord.cnt[i]=0;
  4415. }
  4416. if(value_vec.Dim()){
  4417. value.dsp[i]=&value_vec[0]-value.ptr[0][0];
  4418. assert(value.dsp[i]<value.len);
  4419. value.cnt[i]=value_vec.Dim();
  4420. }else{
  4421. value.dsp[i]=0;
  4422. value.cnt[i]=0;
  4423. }
  4424. }
  4425. }
  4426. { // Set trg data
  4427. std::vector<void*>& nodes=nodes_out;
  4428. PackedData& coord=data.trg_coord;
  4429. PackedData& value=data.trg_value;
  4430. coord.ptr=setup_data. coord_data;
  4431. value.ptr=setup_data.output_data;
  4432. coord.len=coord.ptr->Dim(0)*coord.ptr->Dim(1);
  4433. value.len=value.ptr->Dim(0)*value.ptr->Dim(1);
  4434. coord.cnt.ReInit(nodes.size());
  4435. coord.dsp.ReInit(nodes.size());
  4436. value.cnt.ReInit(nodes.size());
  4437. value.dsp.ReInit(nodes.size());
  4438. #pragma omp parallel for
  4439. for(size_t i=0;i<nodes.size();i++){
  4440. Vector<Real_t>& coord_vec=((FMMNode_t*)nodes[i])->trg_coord;
  4441. Vector<Real_t>& value_vec=((FMMNode_t*)nodes[i])->trg_value;
  4442. if(coord_vec.Dim()){
  4443. coord.dsp[i]=&coord_vec[0]-coord.ptr[0][0];
  4444. assert(coord.dsp[i]<coord.len);
  4445. coord.cnt[i]=coord_vec.Dim();
  4446. }else{
  4447. coord.dsp[i]=0;
  4448. coord.cnt[i]=0;
  4449. }
  4450. if(value_vec.Dim()){
  4451. value.dsp[i]=&value_vec[0]-value.ptr[0][0];
  4452. assert(value.dsp[i]<value.len);
  4453. value.cnt[i]=value_vec.Dim();
  4454. }else{
  4455. value.dsp[i]=0;
  4456. value.cnt[i]=0;
  4457. }
  4458. }
  4459. }
  4460. { // Set interac_data
  4461. int omp_p=omp_get_max_threads();
  4462. std::vector<std::vector<size_t> > in_node_(omp_p);
  4463. std::vector<std::vector<size_t> > scal_idx_(omp_p);
  4464. std::vector<std::vector<Real_t> > coord_shift_(omp_p);
  4465. std::vector<std::vector<size_t> > interac_cnt_(omp_p);
  4466. size_t m=this->MultipoleOrder();
  4467. size_t Nsrf=(6*(m-1)*(m-1)+2);
  4468. #pragma omp parallel for
  4469. for(size_t tid=0;tid<omp_p;tid++){
  4470. std::vector<size_t>& in_node =in_node_[tid] ;
  4471. std::vector<size_t>& scal_idx =scal_idx_[tid] ;
  4472. std::vector<Real_t>& coord_shift=coord_shift_[tid];
  4473. std::vector<size_t>& interac_cnt=interac_cnt_[tid] ;
  4474. size_t a=(nodes_out.size()*(tid+0))/omp_p;
  4475. size_t b=(nodes_out.size()*(tid+1))/omp_p;
  4476. for(size_t i=a;i<b;i++){
  4477. FMMNode_t* tnode=(FMMNode_t*)nodes_out[i];
  4478. Real_t s=pvfmm::pow<Real_t>(0.5,tnode->Depth());
  4479. size_t interac_cnt_=0;
  4480. { // U0_Type
  4481. Mat_Type type=U0_Type;
  4482. Vector<FMMNode_t*>& intlst=tnode->interac_list[type];
  4483. for(size_t j=0;j<intlst.Dim();j++) if(intlst[j]){
  4484. FMMNode_t* snode=intlst[j];
  4485. size_t snode_id=snode->node_id;
  4486. if(snode_id>=nodes_in.size() || nodes_in[snode_id]!=snode) continue;
  4487. in_node.push_back(snode_id);
  4488. scal_idx.push_back(snode->Depth());
  4489. { // set coord_shift
  4490. const int* rel_coord=interac_list.RelativeCoord(type,j);
  4491. const Real_t* scoord=snode->Coord();
  4492. const Real_t* tcoord=tnode->Coord();
  4493. Real_t shift[COORD_DIM];
  4494. shift[0]=rel_coord[0]*0.5*s-(scoord[0]+1.0*s)+(tcoord[0]+0.5*s);
  4495. shift[1]=rel_coord[1]*0.5*s-(scoord[1]+1.0*s)+(tcoord[1]+0.5*s);
  4496. shift[2]=rel_coord[2]*0.5*s-(scoord[2]+1.0*s)+(tcoord[2]+0.5*s);
  4497. coord_shift.push_back(shift[0]);
  4498. coord_shift.push_back(shift[1]);
  4499. coord_shift.push_back(shift[2]);
  4500. }
  4501. interac_cnt_++;
  4502. }
  4503. }
  4504. { // U1_Type
  4505. Mat_Type type=U1_Type;
  4506. Vector<FMMNode_t*>& intlst=tnode->interac_list[type];
  4507. for(size_t j=0;j<intlst.Dim();j++) if(intlst[j]){
  4508. FMMNode_t* snode=intlst[j];
  4509. size_t snode_id=snode->node_id;
  4510. if(snode_id>=nodes_in.size() || nodes_in[snode_id]!=snode) continue;
  4511. in_node.push_back(snode_id);
  4512. scal_idx.push_back(snode->Depth());
  4513. { // set coord_shift
  4514. const int* rel_coord=interac_list.RelativeCoord(type,j);
  4515. const Real_t* scoord=snode->Coord();
  4516. const Real_t* tcoord=tnode->Coord();
  4517. Real_t shift[COORD_DIM];
  4518. shift[0]=rel_coord[0]*1.0*s-(scoord[0]+0.5*s)+(tcoord[0]+0.5*s);
  4519. shift[1]=rel_coord[1]*1.0*s-(scoord[1]+0.5*s)+(tcoord[1]+0.5*s);
  4520. shift[2]=rel_coord[2]*1.0*s-(scoord[2]+0.5*s)+(tcoord[2]+0.5*s);
  4521. coord_shift.push_back(shift[0]);
  4522. coord_shift.push_back(shift[1]);
  4523. coord_shift.push_back(shift[2]);
  4524. }
  4525. interac_cnt_++;
  4526. }
  4527. }
  4528. { // U2_Type
  4529. Mat_Type type=U2_Type;
  4530. Vector<FMMNode_t*>& intlst=tnode->interac_list[type];
  4531. for(size_t j=0;j<intlst.Dim();j++) if(intlst[j]){
  4532. FMMNode_t* snode=intlst[j];
  4533. size_t snode_id=snode->node_id;
  4534. if(snode_id>=nodes_in.size() || nodes_in[snode_id]!=snode) continue;
  4535. in_node.push_back(snode_id);
  4536. scal_idx.push_back(snode->Depth());
  4537. { // set coord_shift
  4538. const int* rel_coord=interac_list.RelativeCoord(type,j);
  4539. const Real_t* scoord=snode->Coord();
  4540. const Real_t* tcoord=tnode->Coord();
  4541. Real_t shift[COORD_DIM];
  4542. shift[0]=rel_coord[0]*0.25*s-(scoord[0]+0.25*s)+(tcoord[0]+0.5*s);
  4543. shift[1]=rel_coord[1]*0.25*s-(scoord[1]+0.25*s)+(tcoord[1]+0.5*s);
  4544. shift[2]=rel_coord[2]*0.25*s-(scoord[2]+0.25*s)+(tcoord[2]+0.5*s);
  4545. coord_shift.push_back(shift[0]);
  4546. coord_shift.push_back(shift[1]);
  4547. coord_shift.push_back(shift[2]);
  4548. }
  4549. interac_cnt_++;
  4550. }
  4551. }
  4552. { // X_Type
  4553. Mat_Type type=X_Type;
  4554. Vector<FMMNode_t*>& intlst=tnode->interac_list[type];
  4555. if(tnode->pt_cnt[1]<=Nsrf)
  4556. for(size_t j=0;j<intlst.Dim();j++) if(intlst[j]){
  4557. FMMNode_t* snode=intlst[j];
  4558. size_t snode_id=snode->node_id;
  4559. if(snode_id>=nodes_in.size() || nodes_in[snode_id]!=snode) continue;
  4560. in_node.push_back(snode_id);
  4561. scal_idx.push_back(snode->Depth());
  4562. { // set coord_shift
  4563. const int* rel_coord=interac_list.RelativeCoord(type,j);
  4564. const Real_t* scoord=snode->Coord();
  4565. const Real_t* tcoord=tnode->Coord();
  4566. Real_t shift[COORD_DIM];
  4567. shift[0]=rel_coord[0]*0.5*s-(scoord[0]+1.0*s)+(tcoord[0]+0.5*s);
  4568. shift[1]=rel_coord[1]*0.5*s-(scoord[1]+1.0*s)+(tcoord[1]+0.5*s);
  4569. shift[2]=rel_coord[2]*0.5*s-(scoord[2]+1.0*s)+(tcoord[2]+0.5*s);
  4570. coord_shift.push_back(shift[0]);
  4571. coord_shift.push_back(shift[1]);
  4572. coord_shift.push_back(shift[2]);
  4573. }
  4574. interac_cnt_++;
  4575. }
  4576. }
  4577. { // W_Type
  4578. Mat_Type type=W_Type;
  4579. Vector<FMMNode_t*>& intlst=tnode->interac_list[type];
  4580. for(size_t j=0;j<intlst.Dim();j++) if(intlst[j]){
  4581. FMMNode_t* snode=intlst[j];
  4582. size_t snode_id=snode->node_id;
  4583. if(snode_id>=nodes_in.size() || nodes_in[snode_id]!=snode) continue;
  4584. if(snode->IsGhost() && snode->src_coord.Dim()+snode->surf_coord.Dim()==0) continue; // Is non-leaf ghost node
  4585. if(snode->pt_cnt[0]> Nsrf) continue;
  4586. in_node.push_back(snode_id);
  4587. scal_idx.push_back(snode->Depth());
  4588. { // set coord_shift
  4589. const int* rel_coord=interac_list.RelativeCoord(type,j);
  4590. const Real_t* scoord=snode->Coord();
  4591. const Real_t* tcoord=tnode->Coord();
  4592. Real_t shift[COORD_DIM];
  4593. shift[0]=rel_coord[0]*0.25*s-(scoord[0]+0.25*s)+(tcoord[0]+0.5*s);
  4594. shift[1]=rel_coord[1]*0.25*s-(scoord[1]+0.25*s)+(tcoord[1]+0.5*s);
  4595. shift[2]=rel_coord[2]*0.25*s-(scoord[2]+0.25*s)+(tcoord[2]+0.5*s);
  4596. coord_shift.push_back(shift[0]);
  4597. coord_shift.push_back(shift[1]);
  4598. coord_shift.push_back(shift[2]);
  4599. }
  4600. interac_cnt_++;
  4601. }
  4602. }
  4603. interac_cnt.push_back(interac_cnt_);
  4604. }
  4605. }
  4606. { // Combine interac data
  4607. InteracData& interac_data=data.interac_data;
  4608. { // in_node
  4609. typedef size_t ElemType;
  4610. std::vector<std::vector<ElemType> >& vec_=in_node_;
  4611. pvfmm::Vector<ElemType>& vec=interac_data.in_node;
  4612. std::vector<size_t> vec_dsp(omp_p+1,0);
  4613. for(size_t tid=0;tid<omp_p;tid++){
  4614. vec_dsp[tid+1]=vec_dsp[tid]+vec_[tid].size();
  4615. }
  4616. vec.ReInit(vec_dsp[omp_p]);
  4617. #pragma omp parallel for
  4618. for(size_t tid=0;tid<omp_p;tid++){
  4619. memcpy(&vec[0]+vec_dsp[tid],&vec_[tid][0],vec_[tid].size()*sizeof(ElemType));
  4620. }
  4621. }
  4622. { // scal_idx
  4623. typedef size_t ElemType;
  4624. std::vector<std::vector<ElemType> >& vec_=scal_idx_;
  4625. pvfmm::Vector<ElemType>& vec=interac_data.scal_idx;
  4626. std::vector<size_t> vec_dsp(omp_p+1,0);
  4627. for(size_t tid=0;tid<omp_p;tid++){
  4628. vec_dsp[tid+1]=vec_dsp[tid]+vec_[tid].size();
  4629. }
  4630. vec.ReInit(vec_dsp[omp_p]);
  4631. #pragma omp parallel for
  4632. for(size_t tid=0;tid<omp_p;tid++){
  4633. memcpy(&vec[0]+vec_dsp[tid],&vec_[tid][0],vec_[tid].size()*sizeof(ElemType));
  4634. }
  4635. }
  4636. { // coord_shift
  4637. typedef Real_t ElemType;
  4638. std::vector<std::vector<ElemType> >& vec_=coord_shift_;
  4639. pvfmm::Vector<ElemType>& vec=interac_data.coord_shift;
  4640. std::vector<size_t> vec_dsp(omp_p+1,0);
  4641. for(size_t tid=0;tid<omp_p;tid++){
  4642. vec_dsp[tid+1]=vec_dsp[tid]+vec_[tid].size();
  4643. }
  4644. vec.ReInit(vec_dsp[omp_p]);
  4645. #pragma omp parallel for
  4646. for(size_t tid=0;tid<omp_p;tid++){
  4647. memcpy(&vec[0]+vec_dsp[tid],&vec_[tid][0],vec_[tid].size()*sizeof(ElemType));
  4648. }
  4649. }
  4650. { // interac_cnt
  4651. typedef size_t ElemType;
  4652. std::vector<std::vector<ElemType> >& vec_=interac_cnt_;
  4653. pvfmm::Vector<ElemType>& vec=interac_data.interac_cnt;
  4654. std::vector<size_t> vec_dsp(omp_p+1,0);
  4655. for(size_t tid=0;tid<omp_p;tid++){
  4656. vec_dsp[tid+1]=vec_dsp[tid]+vec_[tid].size();
  4657. }
  4658. vec.ReInit(vec_dsp[omp_p]);
  4659. #pragma omp parallel for
  4660. for(size_t tid=0;tid<omp_p;tid++){
  4661. memcpy(&vec[0]+vec_dsp[tid],&vec_[tid][0],vec_[tid].size()*sizeof(ElemType));
  4662. }
  4663. }
  4664. { // interac_dsp
  4665. pvfmm::Vector<size_t>& cnt=interac_data.interac_cnt;
  4666. pvfmm::Vector<size_t>& dsp=interac_data.interac_dsp;
  4667. dsp.ReInit(cnt.Dim()); if(dsp.Dim()) dsp[0]=0;
  4668. omp_par::scan(&cnt[0],&dsp[0],dsp.Dim());
  4669. }
  4670. }
  4671. }
  4672. PtSetup(setup_data, &data);
  4673. }
  4674. template <class FMMNode>
  4675. void FMM_Pts<FMMNode>::U_List (SetupData<Real_t>& setup_data, bool device){
  4676. //Add U_List contribution.
  4677. this->EvalListPts(setup_data, device);
  4678. }
template <class FMMNode>
void FMM_Pts<FMMNode>::Down2TargetSetup(SetupData<Real_t>& setup_data, FMMTree_t* tree, std::vector<Matrix<Real_t> >& buff, std::vector<Vector<FMMNode_t*> >& n_list, int level, bool device){
// Build the interaction data for the L2T (downward-equivalent to target)
// translation: for every non-ghost leaf with target points, pair it with the
// nodes in its D2T interaction list, record per-pair coordinate shifts and
// depth (scaling) indices, and hand the packed result to PtSetup so that
// Down2Target/EvalListPts can evaluate it.
//
// Parameters:
//   setup_data - filled in here: kernel, node lists, buffers, packed data.
//   tree       - provides the precomputed per-depth downward-equivalent surfaces.
//   buff       - memory pools; buff[1]=input, buff[5]=output, buff[6]=coords
//                (same buffer-index convention as the other *Setup methods).
//   n_list     - node lists matching the buffers above.
//   level      - restrict to nodes at this depth, or -1 for all depths.
//   device     - part of the common *Setup signature; not used directly here.
if(!this->MultipoleOrder()) return; // no expansions => nothing to set up
{ // Set setup_data
setup_data. level=level;
setup_data.kernel=kernel->k_l2t;
setup_data. input_data=&buff[1];
setup_data.output_data=&buff[5];
setup_data. coord_data=&buff[6];
Vector<FMMNode_t*>& nodes_in =n_list[1];
Vector<FMMNode_t*>& nodes_out=n_list[5];
setup_data.nodes_in .clear();
setup_data.nodes_out.clear();
// Keep only non-ghost leaves that actually have target points (and match
// the requested level unless level==-1).
for(size_t i=0;i<nodes_in .Dim();i++) if((nodes_in [i]->Depth()==level || level==-1) && nodes_in [i]->trg_coord.Dim() && nodes_in [i]->IsLeaf() && !nodes_in [i]->IsGhost()) setup_data.nodes_in .push_back(nodes_in [i]);
for(size_t i=0;i<nodes_out.Dim();i++) if((nodes_out[i]->Depth()==level || level==-1) && nodes_out[i]->trg_coord.Dim() && nodes_out[i]->IsLeaf() && !nodes_out[i]->IsGhost()) setup_data.nodes_out.push_back(nodes_out[i]);
}
// Local descriptors for packed point data and per-interaction metadata.
// The same layout is repeated in the sibling *Setup methods of this file
// and is what PtSetup/EvalListPts consume.
struct PackedData{
size_t len;          // total element count of the backing matrix
Matrix<Real_t>* ptr; // backing storage (one of the buff matrices)
Vector<size_t> cnt;  // per-node element count
Vector<size_t> dsp;  // per-node element offset into *ptr
};
struct InteracData{
Vector<size_t> in_node;     // source-node index for each interaction
Vector<size_t> scal_idx;    // source-node depth (selects a scal[] entry)
Vector<Real_t> coord_shift; // 3D shift applied to source coords per interaction
Vector<size_t> interac_cnt; // number of interactions per target node
Vector<size_t> interac_dsp; // exclusive prefix sum of interac_cnt
Vector<size_t> interac_cst;
Vector<Real_t> scal[4*MAX_DEPTH]; // per-depth scaling factors (scale-invariant kernels)
Matrix<Real_t> M[4];              // optional matrices (here: DC2DE0/DC2DE1)
};
struct ptSetupData{
int level;
const Kernel<Real_t>* kernel;
PackedData src_coord; // Src coord
PackedData src_value; // Src density
PackedData srf_coord; // Srf coord
PackedData srf_value; // Srf density
PackedData trg_coord; // Trg coord
PackedData trg_value; // Trg potential
InteracData interac_data;
};
ptSetupData data;
data. level=setup_data. level;
data.kernel=setup_data.kernel;
std::vector<void*>& nodes_in =setup_data.nodes_in ;
std::vector<void*>& nodes_out=setup_data.nodes_out;
{ // Set src data
// For L2T the "sources" are the downward-equivalent surfaces: coordinates
// come from the per-depth surface table, densities from each node's
// dnward_equiv vector. dsp[] is an element offset into the backing matrix
// computed by pointer arithmetic, so the vectors must alias that pool.
std::vector<void*>& nodes=nodes_in;
PackedData& coord=data.src_coord;
PackedData& value=data.src_value;
coord.ptr=setup_data. coord_data;
value.ptr=setup_data. input_data;
coord.len=coord.ptr->Dim(0)*coord.ptr->Dim(1);
value.len=value.ptr->Dim(0)*value.ptr->Dim(1);
coord.cnt.ReInit(nodes.size());
coord.dsp.ReInit(nodes.size());
value.cnt.ReInit(nodes.size());
value.dsp.ReInit(nodes.size());
#pragma omp parallel for
for(size_t i=0;i<nodes.size();i++){
((FMMNode_t*)nodes[i])->node_id=i; // remember position so interaction lists can refer back
Vector<Real_t>& coord_vec=tree->dnwd_equiv_surf[((FMMNode*)nodes[i])->Depth()];
Vector<Real_t>& value_vec=((FMMData*)((FMMNode*)nodes[i])->FMMData())->dnward_equiv;
if(coord_vec.Dim()){
coord.dsp[i]=&coord_vec[0]-coord.ptr[0][0]; // element offset into coord pool
assert(coord.dsp[i]<coord.len);
coord.cnt[i]=coord_vec.Dim();
}else{
coord.dsp[i]=0;
coord.cnt[i]=0;
}
if(value_vec.Dim()){
value.dsp[i]=&value_vec[0]-value.ptr[0][0]; // element offset into value pool
assert(value.dsp[i]<value.len);
value.cnt[i]=value_vec.Dim();
}else{
value.dsp[i]=0;
value.cnt[i]=0;
}
}
}
{ // Set srf data
// L2T has no surface sources; every count/offset is zero.
std::vector<void*>& nodes=nodes_in;
PackedData& coord=data.srf_coord;
PackedData& value=data.srf_value;
coord.ptr=setup_data. coord_data;
value.ptr=setup_data. input_data;
coord.len=coord.ptr->Dim(0)*coord.ptr->Dim(1);
value.len=value.ptr->Dim(0)*value.ptr->Dim(1);
coord.cnt.ReInit(nodes.size());
coord.dsp.ReInit(nodes.size());
value.cnt.ReInit(nodes.size());
value.dsp.ReInit(nodes.size());
#pragma omp parallel for
for(size_t i=0;i<nodes.size();i++){
coord.dsp[i]=0;
coord.cnt[i]=0;
value.dsp[i]=0;
value.cnt[i]=0;
}
}
{ // Set trg data
// Targets: each output node's target coordinates and (accumulated) values.
std::vector<void*>& nodes=nodes_out;
PackedData& coord=data.trg_coord;
PackedData& value=data.trg_value;
coord.ptr=setup_data. coord_data;
value.ptr=setup_data.output_data;
coord.len=coord.ptr->Dim(0)*coord.ptr->Dim(1);
value.len=value.ptr->Dim(0)*value.ptr->Dim(1);
coord.cnt.ReInit(nodes.size());
coord.dsp.ReInit(nodes.size());
value.cnt.ReInit(nodes.size());
value.dsp.ReInit(nodes.size());
#pragma omp parallel for
for(size_t i=0;i<nodes.size();i++){
Vector<Real_t>& coord_vec=((FMMNode_t*)nodes[i])->trg_coord;
Vector<Real_t>& value_vec=((FMMNode_t*)nodes[i])->trg_value;
if(coord_vec.Dim()){
coord.dsp[i]=&coord_vec[0]-coord.ptr[0][0];
assert(coord.dsp[i]<coord.len);
coord.cnt[i]=coord_vec.Dim();
}else{
coord.dsp[i]=0;
coord.cnt[i]=0;
}
if(value_vec.Dim()){
value.dsp[i]=&value_vec[0]-value.ptr[0][0];
assert(value.dsp[i]<value.len);
value.cnt[i]=value_vec.Dim();
}else{
value.dsp[i]=0;
value.cnt[i]=0;
}
}
}
{ // Set interac_data
int omp_p=omp_get_max_threads();
// Per-thread staging vectors; concatenated into interac_data below.
std::vector<std::vector<size_t> > in_node_(omp_p);
std::vector<std::vector<size_t> > scal_idx_(omp_p);
std::vector<std::vector<Real_t> > coord_shift_(omp_p);
std::vector<std::vector<size_t> > interac_cnt_(omp_p);
if(this->ScaleInvar()){ // Set scal
// For scale-invariant kernels, precompute per-depth factors 2^(-e*l)
// from the L2L kernel's target/source scaling exponents.
const Kernel<Real_t>* ker=kernel->k_l2l;
for(size_t l=0;l<MAX_DEPTH;l++){ // scal[l*4+0]
Vector<Real_t>& scal=data.interac_data.scal[l*4+0];
Vector<Real_t>& scal_exp=ker->trg_scal;
scal.ReInit(scal_exp.Dim());
for(size_t i=0;i<scal.Dim();i++){
scal[i]=pvfmm::pow<Real_t>(2.0,-scal_exp[i]*l);
}
}
for(size_t l=0;l<MAX_DEPTH;l++){ // scal[l*4+1]
Vector<Real_t>& scal=data.interac_data.scal[l*4+1];
Vector<Real_t>& scal_exp=ker->src_scal;
scal.ReInit(scal_exp.Dim());
for(size_t i=0;i<scal.Dim();i++){
scal[i]=pvfmm::pow<Real_t>(2.0,-scal_exp[i]*l);
}
}
}
#pragma omp parallel for
for(size_t tid=0;tid<omp_p;tid++){
std::vector<size_t>& in_node =in_node_[tid] ;
std::vector<size_t>& scal_idx =scal_idx_[tid] ;
std::vector<Real_t>& coord_shift=coord_shift_[tid];
std::vector<size_t>& interac_cnt=interac_cnt_[tid];
// Static partition of the target nodes across threads.
size_t a=(nodes_out.size()*(tid+0))/omp_p;
size_t b=(nodes_out.size()*(tid+1))/omp_p;
for(size_t i=a;i<b;i++){
FMMNode_t* tnode=(FMMNode_t*)nodes_out[i];
Real_t s=pvfmm::pow<Real_t>(0.5,tnode->Depth()); // box size at this depth
size_t interac_cnt_=0;
{ // D2T_Type
Mat_Type type=D2T_Type;
Vector<FMMNode_t*>& intlst=tnode->interac_list[type];
for(size_t j=0;j<intlst.Dim();j++) if(intlst[j]){
FMMNode_t* snode=intlst[j];
size_t snode_id=snode->node_id;
// Skip sources that were filtered out of nodes_in.
if(snode_id>=nodes_in.size() || nodes_in[snode_id]!=snode) continue;
in_node.push_back(snode_id);
scal_idx.push_back(snode->Depth());
{ // set coord_shift
const int* rel_coord=interac_list.RelativeCoord(type,j);
const Real_t* scoord=snode->Coord(); // NOTE(review): unused below; the source corner is taken as 0, presumably because dnwd_equiv_surf coords are already depth-relative — confirm.
const Real_t* tcoord=tnode->Coord();
Real_t shift[COORD_DIM];
shift[0]=rel_coord[0]*0.5*s-(0+0.5*s)+(tcoord[0]+0.5*s);
shift[1]=rel_coord[1]*0.5*s-(0+0.5*s)+(tcoord[1]+0.5*s);
shift[2]=rel_coord[2]*0.5*s-(0+0.5*s)+(tcoord[2]+0.5*s);
coord_shift.push_back(shift[0]);
coord_shift.push_back(shift[1]);
coord_shift.push_back(shift[2]);
}
interac_cnt_++;
}
}
interac_cnt.push_back(interac_cnt_);
}
}
{ // Combine interac data
// Concatenate the per-thread staging vectors into the final flat vectors
// using prefix offsets (one parallel memcpy per thread).
InteracData& interac_data=data.interac_data;
{ // in_node
typedef size_t ElemType;
std::vector<std::vector<ElemType> >& vec_=in_node_;
pvfmm::Vector<ElemType>& vec=interac_data.in_node;
std::vector<size_t> vec_dsp(omp_p+1,0);
for(size_t tid=0;tid<omp_p;tid++){
vec_dsp[tid+1]=vec_dsp[tid]+vec_[tid].size();
}
vec.ReInit(vec_dsp[omp_p]);
#pragma omp parallel for
for(size_t tid=0;tid<omp_p;tid++){
memcpy(&vec[0]+vec_dsp[tid],&vec_[tid][0],vec_[tid].size()*sizeof(ElemType));
}
}
{ // scal_idx
typedef size_t ElemType;
std::vector<std::vector<ElemType> >& vec_=scal_idx_;
pvfmm::Vector<ElemType>& vec=interac_data.scal_idx;
std::vector<size_t> vec_dsp(omp_p+1,0);
for(size_t tid=0;tid<omp_p;tid++){
vec_dsp[tid+1]=vec_dsp[tid]+vec_[tid].size();
}
vec.ReInit(vec_dsp[omp_p]);
#pragma omp parallel for
for(size_t tid=0;tid<omp_p;tid++){
memcpy(&vec[0]+vec_dsp[tid],&vec_[tid][0],vec_[tid].size()*sizeof(ElemType));
}
}
{ // coord_shift
typedef Real_t ElemType;
std::vector<std::vector<ElemType> >& vec_=coord_shift_;
pvfmm::Vector<ElemType>& vec=interac_data.coord_shift;
std::vector<size_t> vec_dsp(omp_p+1,0);
for(size_t tid=0;tid<omp_p;tid++){
vec_dsp[tid+1]=vec_dsp[tid]+vec_[tid].size();
}
vec.ReInit(vec_dsp[omp_p]);
#pragma omp parallel for
for(size_t tid=0;tid<omp_p;tid++){
memcpy(&vec[0]+vec_dsp[tid],&vec_[tid][0],vec_[tid].size()*sizeof(ElemType));
}
}
{ // interac_cnt
typedef size_t ElemType;
std::vector<std::vector<ElemType> >& vec_=interac_cnt_;
pvfmm::Vector<ElemType>& vec=interac_data.interac_cnt;
std::vector<size_t> vec_dsp(omp_p+1,0);
for(size_t tid=0;tid<omp_p;tid++){
vec_dsp[tid+1]=vec_dsp[tid]+vec_[tid].size();
}
vec.ReInit(vec_dsp[omp_p]);
#pragma omp parallel for
for(size_t tid=0;tid<omp_p;tid++){
memcpy(&vec[0]+vec_dsp[tid],&vec_[tid][0],vec_[tid].size()*sizeof(ElemType));
}
}
{ // interac_dsp
// Exclusive prefix sum of the interaction counts.
pvfmm::Vector<size_t>& cnt=interac_data.interac_cnt;
pvfmm::Vector<size_t>& dsp=interac_data.interac_dsp;
dsp.ReInit(cnt.Dim()); if(dsp.Dim()) dsp[0]=0;
omp_par::scan(&cnt[0],&dsp[0],dsp.Dim());
}
}
{ // Set M[0], M[1]
// Attach the downward-check-to-downward-equivalent matrices, but only if
// at least one interaction exists (last count + last offset > 0).
InteracData& interac_data=data.interac_data;
pvfmm::Vector<size_t>& cnt=interac_data.interac_cnt;
pvfmm::Vector<size_t>& dsp=interac_data.interac_dsp;
if(cnt.Dim() && cnt[cnt.Dim()-1]+dsp[dsp.Dim()-1]){
data.interac_data.M[0]=this->mat->Mat(level, DC2DE0_Type, 0);
data.interac_data.M[1]=this->mat->Mat(level, DC2DE1_Type, 0);
}else{
data.interac_data.M[0].ReInit(0,0);
data.interac_data.M[1].ReInit(0,0);
}
}
}
PtSetup(setup_data, &data);
}
  4960. template <class FMMNode>
  4961. void FMM_Pts<FMMNode>::Down2Target(SetupData<Real_t>& setup_data, bool device){
  4962. if(!this->MultipoleOrder()) return;
  4963. //Add Down2Target contribution.
  4964. this->EvalListPts(setup_data, device);
  4965. }
  4966. template <class FMMNode>
  4967. void FMM_Pts<FMMNode>::PostProcessing(FMMTree_t* tree, std::vector<FMMNode_t*>& nodes, BoundaryType bndry){
  4968. if(kernel->k_m2l->vol_poten && bndry==Periodic && BC_LEVELS>0){ // Add analytical near-field to target potential
  4969. const Kernel<Real_t>& k_m2t=*kernel->k_m2t;
  4970. int ker_dim[2]={k_m2t.ker_dim[0],k_m2t.ker_dim[1]};
  4971. Vector<Real_t>& up_equiv=((FMMData*)tree->RootNode()->FMMData())->upward_equiv;
  4972. Matrix<Real_t> avg_density(1,ker_dim[0]); avg_density.SetZero();
  4973. for(size_t i0=0;i0<up_equiv.Dim();i0+=ker_dim[0]){
  4974. for(size_t i1=0;i1<ker_dim[0];i1++){
  4975. avg_density[0][i1]+=up_equiv[i0+i1];
  4976. }
  4977. }
  4978. int omp_p=omp_get_max_threads();
  4979. std::vector<Matrix<Real_t> > M_tmp(omp_p);
  4980. #pragma omp parallel for
  4981. for(size_t i=0;i<nodes.size();i++)
  4982. if(nodes[i]->IsLeaf() && !nodes[i]->IsGhost()){
  4983. Vector<Real_t>& trg_coord=nodes[i]->trg_coord;
  4984. Vector<Real_t>& trg_value=nodes[i]->trg_value;
  4985. size_t n_trg=trg_coord.Dim()/COORD_DIM;
  4986. Matrix<Real_t>& M_vol=M_tmp[omp_get_thread_num()];
  4987. M_vol.ReInit(ker_dim[0],n_trg*ker_dim[1]); M_vol.SetZero();
  4988. k_m2t.vol_poten(&trg_coord[0],n_trg,&M_vol[0][0]);
  4989. Matrix<Real_t> M_trg(1,n_trg*ker_dim[1],&trg_value[0],false);
  4990. M_trg-=avg_density*M_vol;
  4991. }
  4992. }
  4993. }
template <class FMMNode>
void FMM_Pts<FMMNode>::CopyOutput(FMMNode** nodes, size_t n){
// Intentionally a no-op for FMM_Pts: target values are written in place
// through the matrix views set up by the *Setup methods, so there is no
// separate copy-back step. Kept to satisfy the interface this class implements.
}
  4997. }//end namespace