fmm_pts.txx 205 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327
73278327932803281328232833284328532863287328832893290329132923293329432953296329732983299330033013302330333043305330633073308330933103311331233133314331533163317331833193320332133223323332433253326332733283329333033313332333333343335333633373338333933403341334233433344334533463347334833493350335133523353335433553356335733583359336033613362336333643365336633673368336933703371337233733374337533763377337833793380338133823383338433853386338733883389339033913392339333943395339633973398339934003401340234033404340534063407340834093410341134123413341434153416341734183419342034213422342334243425342634273428342934303431343234333434343534363437343834393440344134423443344434453446344734483449345034513452345334543455345634573458345934603461346234633464346534663467346834693470347134723473347434753476347734783479348034813482348334843485348634873488348934903491349234933494349534963497349834993500350135023503350435053506350735083509351035113512351335143515351635173518351935203521352235233524352535263527352835293530353135323533353435353536353735383539354035413542354335443545354635473548354935503551355235533554355535563557355835593560356135623563356435653566356735683569357035713572357335743575357635773578357935803581358235833584358535863587358835893590359135923593359435953596359735983599360036013602360336043605360636073608360936103611361236133614361536163617361836193620362136223623362436253626362736283629363036313632363336343635363636373638363936403641364236433644364536463647364836493650365136523653365436553656365736583659366036613662366336643665366636673668366936703671367236733674367536763677367836793680368136823683368436853686368736883689369036913692369336943695369636973698369937003701370237033704370537063707370837093710371137123713371437153716371737183719372037213722372337243725372637273728372937303731373237333734373537363737373837393740374137423743374437453746374737483749375037513752375337543755375637573758375937603761376237633764376537663767376837693770377137723773377437753776377
73778377937803781378237833784378537863787378837893790379137923793379437953796379737983799380038013802380338043805380638073808380938103811381238133814381538163817381838193820382138223823382438253826382738283829383038313832383338343835383638373838383938403841384238433844384538463847384838493850385138523853385438553856385738583859386038613862386338643865386638673868386938703871387238733874387538763877387838793880388138823883388438853886388738883889389038913892389338943895389638973898389939003901390239033904390539063907390839093910391139123913391439153916391739183919392039213922392339243925392639273928392939303931393239333934393539363937393839393940394139423943394439453946394739483949395039513952395339543955395639573958395939603961396239633964396539663967396839693970397139723973397439753976397739783979398039813982398339843985398639873988398939903991399239933994399539963997399839994000400140024003400440054006400740084009401040114012401340144015401640174018401940204021402240234024402540264027402840294030403140324033403440354036403740384039404040414042404340444045404640474048404940504051405240534054405540564057405840594060406140624063406440654066406740684069407040714072407340744075407640774078407940804081408240834084408540864087408840894090409140924093409440954096409740984099410041014102410341044105410641074108410941104111411241134114411541164117411841194120412141224123412441254126412741284129413041314132413341344135413641374138413941404141414241434144414541464147414841494150415141524153415441554156415741584159416041614162416341644165416641674168416941704171417241734174417541764177417841794180418141824183418441854186418741884189419041914192419341944195419641974198419942004201420242034204420542064207420842094210421142124213421442154216421742184219422042214222422342244225422642274228422942304231423242334234423542364237423842394240424142424243424442454246424742484249425042514252425342544255425642574258425942604261426242634264426542664267426842694270427142724273427442754276427
74278427942804281428242834284428542864287428842894290429142924293429442954296429742984299430043014302430343044305430643074308430943104311431243134314431543164317431843194320432143224323432443254326432743284329433043314332433343344335433643374338433943404341434243434344434543464347434843494350435143524353435443554356435743584359436043614362436343644365436643674368436943704371437243734374437543764377437843794380438143824383438443854386438743884389439043914392439343944395439643974398439944004401440244034404440544064407440844094410441144124413441444154416441744184419442044214422442344244425442644274428442944304431443244334434443544364437443844394440444144424443444444454446444744484449445044514452445344544455445644574458445944604461446244634464446544664467446844694470447144724473447444754476447744784479448044814482448344844485448644874488448944904491449244934494449544964497449844994500450145024503450445054506450745084509451045114512451345144515451645174518451945204521452245234524452545264527452845294530453145324533453445354536453745384539454045414542454345444545454645474548454945504551455245534554455545564557455845594560456145624563456445654566456745684569457045714572457345744575457645774578457945804581458245834584458545864587458845894590459145924593459445954596459745984599460046014602460346044605460646074608460946104611461246134614461546164617461846194620462146224623462446254626462746284629463046314632463346344635463646374638463946404641464246434644464546464647464846494650465146524653465446554656465746584659466046614662466346644665466646674668466946704671467246734674467546764677467846794680468146824683468446854686468746884689469046914692469346944695469646974698469947004701470247034704470547064707470847094710471147124713471447154716471747184719472047214722472347244725472647274728472947304731473247334734473547364737473847394740474147424743474447454746474747484749475047514752475347544755475647574758475947604761476247634764476547664767476847694770477147724773477447754776477
74778477947804781478247834784478547864787478847894790479147924793479447954796479747984799480048014802480348044805480648074808480948104811481248134814481548164817481848194820482148224823482448254826482748284829483048314832483348344835483648374838483948404841484248434844484548464847484848494850485148524853485448554856485748584859486048614862486348644865486648674868486948704871487248734874487548764877487848794880488148824883488448854886488748884889489048914892489348944895489648974898489949004901490249034904490549064907490849094910491149124913491449154916491749184919492049214922492349244925492649274928492949304931493249334934493549364937493849394940494149424943494449454946494749484949495049514952495349544955495649574958495949604961496249634964496549664967496849694970497149724973497449754976497749784979498049814982498349844985498649874988498949904991499249934994499549964997499849995000500150025003500450055006500750085009501050115012501350145015501650175018501950205021502250235024502550265027502850295030503150325033503450355036503750385039504050415042504350445045504650475048504950505051505250535054505550565057505850595060506150625063506450655066506750685069507050715072507350745075507650775078507950805081508250835084508550865087508850895090509150925093509450955096509750985099510051015102510351045105510651075108510951105111511251135114511551165117511851195120512151225123512451255126512751285129513051315132513351345135513651375138513951405141514251435144514551465147514851495150515151525153515451555156515751585159516051615162516351645165516651675168516951705171517251735174517551765177517851795180518151825183518451855186518751885189519051915192519351945195519651975198519952005201520252035204520552065207520852095210521152125213521452155216521752185219522052215222522352245225522652275228522952305231523252335234523552365237523852395240524152425243524452455246524752485249525052515252525352545255525652575258525952605261526252635264526552665267526852695270527152725273527452755276527
75278527952805281528252835284528552865287528852895290529152925293529452955296529752985299530053015302530353045305530653075308530953105311531253135314531553165317531853195320532153225323532453255326532753285329533053315332533353345335533653375338533953405341534253435344534553465347534853495350535153525353535453555356535753585359536053615362536353645365536653675368536953705371537253735374537553765377537853795380538153825383538453855386538753885389539053915392539353945395539653975398539954005401540254035404540554065407540854095410541154125413541454155416
  1. /**
  2. * \file fmm_pts.txx
  3. * \author Dhairya Malhotra, dhairya.malhotra@gmail.com
  4. * \date 3-07-2011
  5. * \brief This file contains the implementation of the FMM_Pts class.
  6. */
  7. #include <omp.h>
  8. #include <cmath>
  9. #include <cstdlib>
  10. #include <cassert>
  11. #include <sstream>
  12. #include <iostream>
  13. #include <stdint.h>
  14. #include <set>
  15. #ifdef PVFMM_HAVE_SYS_STAT_H
  16. #include <sys/stat.h>
  17. #endif
  18. #ifdef __SSE__
  19. #include <xmmintrin.h>
  20. #endif
  21. #ifdef __SSE2__
  22. #include <emmintrin.h>
  23. #endif
  24. #ifdef __SSE3__
  25. #include <pmmintrin.h>
  26. #endif
  27. #ifdef __AVX__
  28. #include <immintrin.h>
  29. #endif
  30. #if defined(__MIC__)
  31. #include <immintrin.h>
  32. #endif
  33. #include <profile.hpp>
  34. namespace pvfmm{
  35. /**
  36. * \brief Returns the coordinates of points on the surface of a cube.
  37. * \param[in] p Number of points on an edge of the cube is (n+1)
  38. * \param[in] c Coordinates to the centre of the cube (3D array).
  39. * \param[in] alpha Scaling factor for the size of the cube.
  40. * \param[in] depth Depth of the cube in the octree.
  41. * \return Vector with coordinates of points on the surface of the cube in the
  42. * format [x0 y0 z0 x1 y1 z1 .... ].
  43. */
  44. template <class Real_t>
  45. std::vector<Real_t> surface(int p, Real_t* c, Real_t alpha, int depth){
  46. size_t n_=(6*(p-1)*(p-1)+2); //Total number of points.
  47. std::vector<Real_t> coord(n_*3);
  48. coord[0]=coord[1]=coord[2]=-1.0;
  49. size_t cnt=1;
  50. for(int i=0;i<p-1;i++)
  51. for(int j=0;j<p-1;j++){
  52. coord[cnt*3 ]=-1.0;
  53. coord[cnt*3+1]=(2.0*(i+1)-p+1)/(p-1);
  54. coord[cnt*3+2]=(2.0*j-p+1)/(p-1);
  55. cnt++;
  56. }
  57. for(int i=0;i<p-1;i++)
  58. for(int j=0;j<p-1;j++){
  59. coord[cnt*3 ]=(2.0*i-p+1)/(p-1);
  60. coord[cnt*3+1]=-1.0;
  61. coord[cnt*3+2]=(2.0*(j+1)-p+1)/(p-1);
  62. cnt++;
  63. }
  64. for(int i=0;i<p-1;i++)
  65. for(int j=0;j<p-1;j++){
  66. coord[cnt*3 ]=(2.0*(i+1)-p+1)/(p-1);
  67. coord[cnt*3+1]=(2.0*j-p+1)/(p-1);
  68. coord[cnt*3+2]=-1.0;
  69. cnt++;
  70. }
  71. for(size_t i=0;i<(n_/2)*3;i++)
  72. coord[cnt*3+i]=-coord[i];
  73. Real_t r = 0.5*pow(0.5,depth);
  74. Real_t b = alpha*r;
  75. for(size_t i=0;i<n_;i++){
  76. coord[i*3+0]=(coord[i*3+0]+1.0)*b+c[0];
  77. coord[i*3+1]=(coord[i*3+1]+1.0)*b+c[1];
  78. coord[i*3+2]=(coord[i*3+2]+1.0)*b+c[2];
  79. }
  80. return coord;
  81. }
  82. /**
  83. * \brief Returns the coordinates of points on the upward check surface of cube.
  84. * \see surface()
  85. */
  86. template <class Real_t>
  87. std::vector<Real_t> u_check_surf(int p, Real_t* c, int depth){
  88. Real_t r=0.5*pow(0.5,depth);
  89. Real_t coord[3]={(Real_t)(c[0]-r*(RAD1-1.0)),(Real_t)(c[1]-r*(RAD1-1.0)),(Real_t)(c[2]-r*(RAD1-1.0))};
  90. return surface(p,coord,(Real_t)RAD1,depth);
  91. }
  92. /**
  93. * \brief Returns the coordinates of points on the upward equivalent surface of cube.
  94. * \see surface()
  95. */
  96. template <class Real_t>
  97. std::vector<Real_t> u_equiv_surf(int p, Real_t* c, int depth){
  98. Real_t r=0.5*pow(0.5,depth);
  99. Real_t coord[3]={(Real_t)(c[0]-r*(RAD0-1.0)),(Real_t)(c[1]-r*(RAD0-1.0)),(Real_t)(c[2]-r*(RAD0-1.0))};
  100. return surface(p,coord,(Real_t)RAD0,depth);
  101. }
  102. /**
  103. * \brief Returns the coordinates of points on the downward check surface of cube.
  104. * \see surface()
  105. */
  106. template <class Real_t>
  107. std::vector<Real_t> d_check_surf(int p, Real_t* c, int depth){
  108. Real_t r=0.5*pow(0.5,depth);
  109. Real_t coord[3]={(Real_t)(c[0]-r*(RAD0-1.0)),(Real_t)(c[1]-r*(RAD0-1.0)),(Real_t)(c[2]-r*(RAD0-1.0))};
  110. return surface(p,coord,(Real_t)RAD0,depth);
  111. }
  112. /**
  113. * \brief Returns the coordinates of points on the downward equivalent surface of cube.
  114. * \see surface()
  115. */
  116. template <class Real_t>
  117. std::vector<Real_t> d_equiv_surf(int p, Real_t* c, int depth){
  118. Real_t r=0.5*pow(0.5,depth);
  119. Real_t coord[3]={(Real_t)(c[0]-r*(RAD1-1.0)),(Real_t)(c[1]-r*(RAD1-1.0)),(Real_t)(c[2]-r*(RAD1-1.0))};
  120. return surface(p,coord,(Real_t)RAD1,depth);
  121. }
  122. /**
  123. * \brief Defines the 3D grid for convolution in FFT acceleration of V-list.
  124. * \see surface()
  125. */
  126. template <class Real_t>
  127. std::vector<Real_t> conv_grid(int p, Real_t* c, int depth){
  128. Real_t r=pow(0.5,depth);
  129. Real_t a=r*RAD0;
  130. Real_t coord[3]={c[0],c[1],c[2]};
  131. int n1=p*2;
  132. int n2=(int)pow((Real_t)n1,2);
  133. int n3=(int)pow((Real_t)n1,3);
  134. std::vector<Real_t> grid(n3*3);
  135. for(int i=0;i<n1;i++)
  136. for(int j=0;j<n1;j++)
  137. for(int k=0;k<n1;k++){
  138. grid[(i+n1*j+n2*k)*3+0]=(i-p)*a/(p-1)+coord[0];
  139. grid[(i+n1*j+n2*k)*3+1]=(j-p)*a/(p-1)+coord[1];
  140. grid[(i+n1*j+n2*k)*3+2]=(k-p)*a/(p-1)+coord[2];
  141. }
  142. return grid;
  143. }
// Discard the stored upward-equivalent (multipole) expansion data.
template <class Real_t>
void FMM_Data<Real_t>::Clear(){
upward_equiv.Resize(0);
}
  148. template <class Real_t>
  149. PackedData FMM_Data<Real_t>::PackMultipole(void* buff_ptr){
  150. PackedData p0; p0.data=buff_ptr;
  151. p0.length=upward_equiv.Dim()*sizeof(Real_t);
  152. if(p0.length==0) return p0;
  153. if(p0.data==NULL) p0.data=(char*)&upward_equiv[0];
  154. else mem::memcopy(p0.data,&upward_equiv[0],p0.length);
  155. return p0;
  156. }
  157. template <class Real_t>
  158. void FMM_Data<Real_t>::AddMultipole(PackedData p0){
  159. Real_t* data=(Real_t*)p0.data;
  160. size_t n=p0.length/sizeof(Real_t);
  161. assert(upward_equiv.Dim()==n);
  162. Matrix<Real_t> v0(1,n,&upward_equiv[0],false);
  163. Matrix<Real_t> v1(1,n,data,false);
  164. v0+=v1;
  165. }
// Initialize the multipole expansion from a packed buffer.
// \param p0 Packed data; p0.length is in bytes.
// \param own_data If true, this object keeps its own copy of the data;
//                 if false, upward_equiv aliases the caller's buffer,
//                 which must then outlive this object.
template <class Real_t>
void FMM_Data<Real_t>::InitMultipole(PackedData p0, bool own_data){
Real_t* data=(Real_t*)p0.data;
size_t n=p0.length/sizeof(Real_t);
if(n==0) return; // nothing to initialize
if(own_data){
// Assign from a temporary non-owning wrapper; presumably Vector's
// assignment deep-copies the data -- NOTE(review): confirm against
// the project's Vector implementation.
upward_equiv=Vector<Real_t>(n, &data[0], false);
}else{
// Re-point at the caller's buffer without taking ownership (own=false).
upward_equiv.ReInit(n, &data[0], false);
}
}
// Destructor: release the precomputed-matrix store and destroy the FFTW
// plans created for the V-list FFT acceleration.
template <class FMMNode>
FMM_Pts<FMMNode>::~FMM_Pts() {
if(mat!=NULL){
// int rank;
// MPI_Comm_rank(comm,&rank);
// if(rank==0) mat->Save2File("Precomp.data");
delete mat;
mat=NULL;
}
// Plan used during V-list precomputation.
if(vprecomp_fft_flag) FFTW_t<Real_t>::fft_destroy_plan(vprecomp_fftplan);
// When Intel offload is enabled, the evaluation plans live on the MIC
// coprocessor, so the cleanup block below is offloaded there.
#ifdef __INTEL_OFFLOAD0
#pragma offload target(mic:0)
#endif
{
if(vlist_fft_flag ) FFTW_t<Real_t>::fft_destroy_plan(vlist_fftplan );
if(vlist_ifft_flag) FFTW_t<Real_t>::fft_destroy_plan(vlist_ifftplan);
vlist_fft_flag =false;
vlist_ifft_flag=false;
}
}
  197. template <class FMMNode>
  198. void FMM_Pts<FMMNode>::Initialize(int mult_order, const MPI_Comm& comm_, const Kernel<Real_t>* kernel_){
  199. Profile::Tic("InitFMM_Pts",&comm_,true);{
  200. int rank;
  201. MPI_Comm_rank(comm_,&rank);
  202. bool verbose=false;
  203. #ifndef NDEBUG
  204. #ifdef __VERBOSE__
  205. if(!rank) verbose=true;
  206. #endif
  207. #endif
  208. if(kernel_) kernel_->Initialize(verbose);
  209. multipole_order=mult_order;
  210. comm=comm_;
  211. kernel=kernel_;
  212. assert(kernel!=NULL);
  213. bool save_precomp=false;
  214. mat=new PrecompMat<Real_t>(ScaleInvar());
  215. if(this->mat_fname.size()==0){// && !this->ScaleInvar()){
  216. std::stringstream st;
  217. st<<PVFMM_PRECOMP_DATA_PATH;
  218. if(!st.str().size()){ // look in PVFMM_DIR
  219. char* pvfmm_dir = getenv ("PVFMM_DIR");
  220. if(pvfmm_dir) st<<pvfmm_dir;
  221. }
  222. #ifndef STAT_MACROS_BROKEN
  223. if(st.str().size()){ // check if the path is a directory
  224. struct stat stat_buff;
  225. if(stat(st.str().c_str(), &stat_buff) || !S_ISDIR(stat_buff.st_mode)){
  226. std::cout<<"error: path not found: "<<st.str()<<'\n';
  227. exit(0);
  228. }
  229. }
  230. #endif
  231. if(st.str().size()) st<<'/';
  232. st<<"Precomp_"<<kernel->ker_name.c_str()<<"_m"<<mult_order;
  233. if(sizeof(Real_t)==8) st<<"";
  234. else if(sizeof(Real_t)==4) st<<"_f";
  235. else st<<"_t"<<sizeof(Real_t);
  236. st<<".data";
  237. this->mat_fname=st.str();
  238. save_precomp=true;
  239. }
  240. this->mat->LoadFile(mat_fname.c_str(), this->comm);
  241. interac_list.Initialize(COORD_DIM, this->mat);
  242. Profile::Tic("PrecompUC2UE",&comm,false,4);
  243. this->PrecompAll(UC2UE0_Type);
  244. this->PrecompAll(UC2UE1_Type);
  245. Profile::Toc();
  246. Profile::Tic("PrecompDC2DE",&comm,false,4);
  247. this->PrecompAll(DC2DE0_Type);
  248. this->PrecompAll(DC2DE1_Type);
  249. Profile::Toc();
  250. Profile::Tic("PrecompBC",&comm,false,4);
  251. { /*
  252. int type=BC_Type;
  253. for(int l=0;l<MAX_DEPTH;l++)
  254. for(size_t indx=0;indx<this->interac_list.ListCount((Mat_Type)type);indx++){
  255. Matrix<Real_t>& M=this->mat->Mat(l, (Mat_Type)type, indx);
  256. M.Resize(0,0);
  257. } // */
  258. }
  259. this->PrecompAll(BC_Type,0);
  260. Profile::Toc();
  261. Profile::Tic("PrecompU2U",&comm,false,4);
  262. this->PrecompAll(U2U_Type);
  263. Profile::Toc();
  264. Profile::Tic("PrecompD2D",&comm,false,4);
  265. this->PrecompAll(D2D_Type);
  266. Profile::Toc();
  267. if(save_precomp){
  268. Profile::Tic("Save2File",&this->comm,false,4);
  269. if(!rank){
  270. FILE* f=fopen(this->mat_fname.c_str(),"r");
  271. if(f==NULL) { //File does not exists.
  272. this->mat->Save2File(this->mat_fname.c_str());
  273. }else fclose(f);
  274. }
  275. Profile::Toc();
  276. }
  277. Profile::Tic("PrecompV",&comm,false,4);
  278. this->PrecompAll(V_Type);
  279. Profile::Toc();
  280. Profile::Tic("PrecompV1",&comm,false,4);
  281. this->PrecompAll(V1_Type);
  282. Profile::Toc();
  283. }Profile::Toc();
  284. }
  285. template <class Real_t>
  286. Permutation<Real_t> equiv_surf_perm(size_t m, size_t p_indx, const Permutation<Real_t>& ker_perm, const Vector<Real_t>* scal_exp=NULL){
  287. Real_t eps=1e-10;
  288. int dof=ker_perm.Dim();
  289. Real_t c[3]={-0.5,-0.5,-0.5};
  290. std::vector<Real_t> trg_coord=d_check_surf(m,c,0);
  291. int n_trg=trg_coord.size()/3;
  292. Permutation<Real_t> P=Permutation<Real_t>(n_trg*dof);
  293. if(p_indx==ReflecX || p_indx==ReflecY || p_indx==ReflecZ){ // Set P.perm
  294. for(int i=0;i<n_trg;i++)
  295. for(int j=0;j<n_trg;j++){
  296. if(fabs(trg_coord[i*3+0]-trg_coord[j*3+0]*(p_indx==ReflecX?-1.0:1.0))<eps)
  297. if(fabs(trg_coord[i*3+1]-trg_coord[j*3+1]*(p_indx==ReflecY?-1.0:1.0))<eps)
  298. if(fabs(trg_coord[i*3+2]-trg_coord[j*3+2]*(p_indx==ReflecZ?-1.0:1.0))<eps){
  299. for(int k=0;k<dof;k++){
  300. P.perm[j*dof+k]=i*dof+ker_perm.perm[k];
  301. }
  302. }
  303. }
  304. }else if(p_indx==SwapXY || p_indx==SwapXZ){
  305. for(int i=0;i<n_trg;i++)
  306. for(int j=0;j<n_trg;j++){
  307. if(fabs(trg_coord[i*3+0]-trg_coord[j*3+(p_indx==SwapXY?1:2)])<eps)
  308. if(fabs(trg_coord[i*3+1]-trg_coord[j*3+(p_indx==SwapXY?0:1)])<eps)
  309. if(fabs(trg_coord[i*3+2]-trg_coord[j*3+(p_indx==SwapXY?2:0)])<eps){
  310. for(int k=0;k<dof;k++){
  311. P.perm[j*dof+k]=i*dof+ker_perm.perm[k];
  312. }
  313. }
  314. }
  315. }else{
  316. for(int j=0;j<n_trg;j++){
  317. for(int k=0;k<dof;k++){
  318. P.perm[j*dof+k]=j*dof+ker_perm.perm[k];
  319. }
  320. }
  321. }
  322. if(scal_exp && p_indx==Scaling){ // Set level-by-level scaling
  323. assert(dof==scal_exp->Dim());
  324. Vector<Real_t> scal(scal_exp->Dim());
  325. for(size_t i=0;i<scal.Dim();i++){
  326. scal[i]=pow(2.0,(*scal_exp)[i]);
  327. }
  328. for(int j=0;j<n_trg;j++){
  329. for(int i=0;i<dof;i++){
  330. P.scal[j*dof+i]*=scal[i];
  331. }
  332. }
  333. }
  334. { // Set P.scal
  335. for(int j=0;j<n_trg;j++){
  336. for(int i=0;i<dof;i++){
  337. P.scal[j*dof+i]*=ker_perm.scal[i];
  338. }
  339. }
  340. }
  341. return P;
  342. }
  343. template <class FMMNode>
  344. Permutation<typename FMMNode::Real_t>& FMM_Pts<FMMNode>::PrecompPerm(Mat_Type type, Perm_Type perm_indx){
  345. //Check if the matrix already exists.
  346. Permutation<Real_t>& P_ = mat->Perm((Mat_Type)type, perm_indx);
  347. if(P_.Dim()!=0) return P_;
  348. size_t m=this->MultipoleOrder();
  349. size_t p_indx=perm_indx % C_Perm;
  350. //Compute the matrix.
  351. Permutation<Real_t> P;
  352. switch (type){
  353. case U2U_Type:
  354. {
  355. Vector<Real_t> scal_exp;
  356. Permutation<Real_t> ker_perm;
  357. if(perm_indx<C_Perm){ // Source permutation
  358. ker_perm=kernel->k_m2m->perm_vec[0 +p_indx];
  359. scal_exp=kernel->k_m2m->src_scal;
  360. }else{ // Target permutation
  361. ker_perm=kernel->k_m2m->perm_vec[0 +p_indx];
  362. scal_exp=kernel->k_m2m->src_scal;
  363. for(size_t i=0;i<scal_exp.Dim();i++) scal_exp[i]=-scal_exp[i];
  364. }
  365. P=equiv_surf_perm(m, p_indx, ker_perm, (this->ScaleInvar()?&scal_exp:NULL));
  366. break;
  367. }
  368. case D2D_Type:
  369. {
  370. Vector<Real_t> scal_exp;
  371. Permutation<Real_t> ker_perm;
  372. if(perm_indx<C_Perm){ // Source permutation
  373. ker_perm=kernel->k_l2l->perm_vec[C_Perm+p_indx];
  374. scal_exp=kernel->k_l2l->trg_scal;
  375. for(size_t i=0;i<scal_exp.Dim();i++) scal_exp[i]=-scal_exp[i];
  376. }else{ // Target permutation
  377. ker_perm=kernel->k_l2l->perm_vec[C_Perm+p_indx];
  378. scal_exp=kernel->k_l2l->trg_scal;
  379. }
  380. P=equiv_surf_perm(m, p_indx, ker_perm, (this->ScaleInvar()?&scal_exp:NULL));
  381. break;
  382. }
  383. default:
  384. break;
  385. }
  386. //Save the matrix for future use.
  387. #pragma omp critical (PRECOMP_MATRIX_PTS)
  388. {
  389. if(P_.Dim()==0) P_=P;
  390. }
  391. return P_;
  392. }
template <class FMMNode>
Matrix<typename FMMNode::Real_t>& FMM_Pts<FMMNode>::Precomp(int level, Mat_Type type, size_t mat_indx){
  // Compute (or return the cached) precomputed translation matrix of the given
  // type for relative-interaction index mat_indx at the given level.  The
  // result is stored in this->mat; a reference to the stored matrix is
  // returned.  For scale-invariant kernels one level-0 matrix serves all levels.
  if(this->ScaleInvar()) level=0;
  //Check if the matrix already exists.
  Matrix<Real_t>& M_ = this->mat->Mat(level, type, mat_indx);
  if(M_.Dim(0)!=0 && M_.Dim(1)!=0) return M_;
  else{ //Compute matrix from symmetry class (if possible).
    size_t class_indx = this->interac_list.InteracClass(type, mat_indx);
    if(class_indx!=mat_indx){
      Matrix<Real_t>& M0 = this->Precomp(level, type, class_indx);
      if(M0.Dim(0)==0 || M0.Dim(1)==0) return M_;
      for(size_t i=0;i<Perm_Count;i++) this->PrecompPerm(type, (Perm_Type) i);
      Permutation<Real_t>& Pr = this->interac_list.Perm_R(level, type, mat_indx);
      Permutation<Real_t>& Pc = this->interac_list.Perm_C(level, type, mat_indx);
      // NOTE(review): when the class matrix and both permutations exist, the
      // matrix is presumably recoverable as Pr*M0*Pc at apply time, so the
      // empty cache slot is returned as-is — confirm against callers.
      if(Pr.Dim()>0 && Pc.Dim()>0 && M0.Dim(0)>0 && M0.Dim(1)>0) return M_;
    }
  }
  //Compute the matrix.
  Matrix<Real_t> M;
  //int omp_p=omp_get_max_threads();
  switch (type){
    case UC2UE0_Type: // Upward check-to-equivalent, factor 0: V * pinv(S).
    {
      if(MultipoleOrder()==0) break;
      const int* ker_dim=kernel->k_m2m->ker_dim;
      // Coord of upward check surface
      Real_t c[3]={0,0,0};
      std::vector<Real_t> uc_coord=u_check_surf(MultipoleOrder(),c,level);
      size_t n_uc=uc_coord.size()/3;
      // Coord of upward equivalent surface
      std::vector<Real_t> ue_coord=u_equiv_surf(MultipoleOrder(),c,level);
      size_t n_ue=ue_coord.size()/3;
      // Evaluate potential at check surface due to equivalent surface.
      Matrix<Real_t> M_e2c(n_ue*ker_dim[0],n_uc*ker_dim[1]);
      kernel->k_m2m->BuildMatrix(&ue_coord[0], n_ue,
                                 &uc_coord[0], n_uc, &(M_e2c[0][0]));
      Matrix<Real_t> U,S,V;
      M_e2c.SVD(U,S,V);
      // Determine machine epsilon for Real_t by repeated halving.
      Real_t eps=1, max_S=0;
      while(eps*(Real_t)0.5+(Real_t)1.0>1.0) eps*=0.5;
      for(size_t i=0;i<std::min(S.Dim(0),S.Dim(1));i++){
        if(fabs(S[i][i])>max_S) max_S=fabs(S[i][i]);
      }
      // Invert singular values with a relative cutoff (regularized pseudo-inverse).
      for(size_t i=0;i<S.Dim(0);i++) S[i][i]=(S[i][i]>eps*max_S*4?1.0/S[i][i]:0.0);
      M=V.Transpose()*S;//*U.Transpose();
      break;
    }
    case UC2UE1_Type: // Upward check-to-equivalent, factor 1: U^T (completes the pinv).
    {
      if(MultipoleOrder()==0) break;
      const int* ker_dim=kernel->k_m2m->ker_dim;
      // Coord of upward check surface
      Real_t c[3]={0,0,0};
      std::vector<Real_t> uc_coord=u_check_surf(MultipoleOrder(),c,level);
      size_t n_uc=uc_coord.size()/3;
      // Coord of upward equivalent surface
      std::vector<Real_t> ue_coord=u_equiv_surf(MultipoleOrder(),c,level);
      size_t n_ue=ue_coord.size()/3;
      // Evaluate potential at check surface due to equivalent surface.
      Matrix<Real_t> M_e2c(n_ue*ker_dim[0],n_uc*ker_dim[1]);
      kernel->k_m2m->BuildMatrix(&ue_coord[0], n_ue,
                                 &uc_coord[0], n_uc, &(M_e2c[0][0]));
      Matrix<Real_t> U,S,V;
      M_e2c.SVD(U,S,V);
      M=U.Transpose();
      break;
    }
    case DC2DE0_Type: // Downward check-to-equivalent, factor 0 (same scheme as UC2UE0).
    {
      if(MultipoleOrder()==0) break;
      const int* ker_dim=kernel->k_l2l->ker_dim;
      // Coord of downward check surface
      Real_t c[3]={0,0,0};
      std::vector<Real_t> check_surf=d_check_surf(MultipoleOrder(),c,level);
      size_t n_ch=check_surf.size()/3;
      // Coord of downward equivalent surface
      std::vector<Real_t> equiv_surf=d_equiv_surf(MultipoleOrder(),c,level);
      size_t n_eq=equiv_surf.size()/3;
      // Evaluate potential at check surface due to equivalent surface.
      Matrix<Real_t> M_e2c(n_eq*ker_dim[0],n_ch*ker_dim[1]);
      kernel->k_l2l->BuildMatrix(&equiv_surf[0], n_eq,
                                 &check_surf[0], n_ch, &(M_e2c[0][0]));
      Matrix<Real_t> U,S,V;
      M_e2c.SVD(U,S,V);
      // Determine machine epsilon for Real_t by repeated halving.
      Real_t eps=1, max_S=0;
      while(eps*(Real_t)0.5+(Real_t)1.0>1.0) eps*=0.5;
      for(size_t i=0;i<std::min(S.Dim(0),S.Dim(1));i++){
        if(fabs(S[i][i])>max_S) max_S=fabs(S[i][i]);
      }
      // Invert singular values with a relative cutoff (regularized pseudo-inverse).
      for(size_t i=0;i<S.Dim(0);i++) S[i][i]=(S[i][i]>eps*max_S*4?1.0/S[i][i]:0.0);
      M=V.Transpose()*S;//*U.Transpose();
      break;
    }
    case DC2DE1_Type: // Downward check-to-equivalent, factor 1: U^T.
    {
      if(MultipoleOrder()==0) break;
      const int* ker_dim=kernel->k_l2l->ker_dim;
      // Coord of downward check surface
      Real_t c[3]={0,0,0};
      std::vector<Real_t> check_surf=d_check_surf(MultipoleOrder(),c,level);
      size_t n_ch=check_surf.size()/3;
      // Coord of downward equivalent surface
      std::vector<Real_t> equiv_surf=d_equiv_surf(MultipoleOrder(),c,level);
      size_t n_eq=equiv_surf.size()/3;
      // Evaluate potential at check surface due to equivalent surface.
      Matrix<Real_t> M_e2c(n_eq*ker_dim[0],n_ch*ker_dim[1]);
      kernel->k_l2l->BuildMatrix(&equiv_surf[0], n_eq,
                                 &check_surf[0], n_ch, &(M_e2c[0][0]));
      Matrix<Real_t> U,S,V;
      M_e2c.SVD(U,S,V);
      M=U.Transpose();
      break;
    }
    case U2U_Type: // Multipole-to-multipole (child equivalent -> parent equivalent).
    {
      if(MultipoleOrder()==0) break;
      const int* ker_dim=kernel->k_m2m->ker_dim;
      // Coord of upward check surface
      Real_t c[3]={0,0,0};
      std::vector<Real_t> check_surf=u_check_surf(MultipoleOrder(),c,level);
      size_t n_uc=check_surf.size()/3;
      // Coord of child's upward equivalent surface
      Real_t s=pow(0.5,(level+2));
      int* coord=interac_list.RelativeCoord(type,mat_indx);
      Real_t child_coord[3]={(coord[0]+1)*s,(coord[1]+1)*s,(coord[2]+1)*s};
      std::vector<Real_t> equiv_surf=u_equiv_surf(MultipoleOrder(),child_coord,level+1);
      size_t n_ue=equiv_surf.size()/3;
      // Evaluate potential at check surface due to equivalent surface.
      Matrix<Real_t> M_ce2c(n_ue*ker_dim[0],n_uc*ker_dim[1]);
      kernel->k_m2m->BuildMatrix(&equiv_surf[0], n_ue,
                                 &check_surf[0], n_uc, &(M_ce2c[0][0]));
      // Compose with the parent's check-to-equivalent factors.
      Matrix<Real_t>& M_c2e0 = Precomp(level, UC2UE0_Type, 0);
      Matrix<Real_t>& M_c2e1 = Precomp(level, UC2UE1_Type, 0);
      M=(M_ce2c*M_c2e0)*M_c2e1;
      break;
    }
    case D2D_Type: // Local-to-local (parent equivalent -> child equivalent).
    {
      if(MultipoleOrder()==0) break;
      const int* ker_dim=kernel->k_l2l->ker_dim;
      // Coord of downward check surface
      Real_t s=pow(0.5,level+1);
      int* coord=interac_list.RelativeCoord(type,mat_indx);
      Real_t c[3]={(coord[0]+1)*s,(coord[1]+1)*s,(coord[2]+1)*s};
      std::vector<Real_t> check_surf=d_check_surf(MultipoleOrder(),c,level);
      size_t n_dc=check_surf.size()/3;
      // Coord of parent's downward equivalent surface
      Real_t parent_coord[3]={0,0,0};
      std::vector<Real_t> equiv_surf=d_equiv_surf(MultipoleOrder(),parent_coord,level-1);
      size_t n_de=equiv_surf.size()/3;
      // Evaluate potential at check surface due to equivalent surface.
      Matrix<Real_t> M_pe2c(n_de*ker_dim[0],n_dc*ker_dim[1]);
      kernel->k_l2l->BuildMatrix(&equiv_surf[0], n_de,
                                 &check_surf[0], n_dc, &(M_pe2c[0][0]));
      // Copies (not references): they are rescaled in place below.
      Matrix<Real_t> M_c2e0=Precomp(level-1,DC2DE0_Type,0);
      Matrix<Real_t> M_c2e1=Precomp(level-1,DC2DE1_Type,0);
      if(ScaleInvar()){ // Scale M_c2e0 for level-1
        Permutation<Real_t> ker_perm=this->kernel->k_l2l->perm_vec[C_Perm+Scaling];
        Vector<Real_t> scal_exp=this->kernel->k_l2l->trg_scal;
        Permutation<Real_t> P=equiv_surf_perm(MultipoleOrder(), Scaling, ker_perm, &scal_exp);
        M_c2e0=P*M_c2e0;
      }
      if(ScaleInvar()){ // Scale M_c2e1 for level-1
        Permutation<Real_t> ker_perm=this->kernel->k_l2l->perm_vec[0 +Scaling];
        Vector<Real_t> scal_exp=this->kernel->k_l2l->src_scal;
        Permutation<Real_t> P=equiv_surf_perm(MultipoleOrder(), Scaling, ker_perm, &scal_exp);
        M_c2e1=M_c2e1*P;
      }
      M=M_c2e0*(M_c2e1*M_pe2c);
      break;
    }
    case D2T_Type: // Local-to-target: evaluate downward expansion at target points.
    {
      if(MultipoleOrder()==0) break;
      const int* ker_dim=kernel->k_l2t->ker_dim;
      std::vector<Real_t>& rel_trg_coord=mat->RelativeTrgCoord();
      // Coord of target points
      Real_t r=pow(0.5,level);
      size_t n_trg=rel_trg_coord.size()/3;
      std::vector<Real_t> trg_coord(n_trg*3);
      for(size_t i=0;i<n_trg*COORD_DIM;i++) trg_coord[i]=rel_trg_coord[i]*r;
      // Coord of downward equivalent surface
      Real_t c[3]={0,0,0};
      std::vector<Real_t> equiv_surf=d_equiv_surf(MultipoleOrder(),c,level);
      size_t n_eq=equiv_surf.size()/3;
      // Evaluate potential at target points due to equivalent surface.
      {
        M.Resize(n_eq*ker_dim[0], n_trg*ker_dim[1]);
        kernel->k_l2t->BuildMatrix(&equiv_surf[0], n_eq, &trg_coord[0], n_trg, &(M[0][0]));
      }
      // Compose with the downward check-to-equivalent factors.
      Matrix<Real_t>& M_c2e0=Precomp(level,DC2DE0_Type,0);
      Matrix<Real_t>& M_c2e1=Precomp(level,DC2DE1_Type,0);
      M=M_c2e0*(M_c2e1*M);
      break;
    }
    case V_Type: // M2L kernel values on the convolution grid, stored in Fourier space.
    {
      if(MultipoleOrder()==0) break;
      const int* ker_dim=kernel->k_m2l->ker_dim;
      int n1=MultipoleOrder()*2;
      int n3 =n1*n1*n1;          // Full (real) grid size.
      int n3_=n1*n1*(n1/2+1);    // r2c output size (Hermitian symmetry).
      //Compute the matrix.
      Real_t s=pow(0.5,level);
      int* coord2=interac_list.RelativeCoord(type,mat_indx);
      Real_t coord_diff[3]={coord2[0]*s,coord2[1]*s,coord2[2]*s};
      //Evaluate potential.
      std::vector<Real_t> r_trg(COORD_DIM,0.0);
      std::vector<Real_t> conv_poten(n3*ker_dim[0]*ker_dim[1]);
      std::vector<Real_t> conv_coord=conv_grid(MultipoleOrder(),coord_diff,level);
      kernel->k_m2l->BuildMatrix(&conv_coord[0],n3,&r_trg[0],1,&conv_poten[0]);
      //Rearrange data.
      Matrix<Real_t> M_conv(n3,ker_dim[0]*ker_dim[1],&conv_poten[0],false);
      M_conv=M_conv.Transpose();
      //Compute FFTW plan.
      int nnn[3]={n1,n1,n1};
      Real_t *fftw_in, *fftw_out;
      // NOTE(review): sizes include a *sizeof(Real_t) factor although
      // aligned_new appears to take an element count — presumably a deliberate
      // over-allocation kept for safety; confirm against mem::aligned_new.
      fftw_in = mem::aligned_new<Real_t>( n3 *ker_dim[0]*ker_dim[1]*sizeof(Real_t));
      fftw_out = mem::aligned_new<Real_t>(2*n3_*ker_dim[0]*ker_dim[1]*sizeof(Real_t));
      #pragma omp critical (FFTW_PLAN)
      { // FFTW planning is not thread-safe; plan is created once and reused.
        if (!vprecomp_fft_flag){
          vprecomp_fftplan = FFTW_t<Real_t>::fft_plan_many_dft_r2c(COORD_DIM, nnn, ker_dim[0]*ker_dim[1],
              (Real_t*)fftw_in, NULL, 1, n3, (typename FFTW_t<Real_t>::cplx*) fftw_out, NULL, 1, n3_);
          vprecomp_fft_flag=true;
        }
      }
      //Compute FFT.
      mem::memcopy(fftw_in, &conv_poten[0], n3*ker_dim[0]*ker_dim[1]*sizeof(Real_t));
      FFTW_t<Real_t>::fft_execute_dft_r2c(vprecomp_fftplan, (Real_t*)fftw_in, (typename FFTW_t<Real_t>::cplx*)(fftw_out));
      // Local M_ shadows the outer cache reference; it just wraps fftw_out.
      Matrix<Real_t> M_(2*n3_*ker_dim[0]*ker_dim[1],1,(Real_t*)fftw_out,false);
      M=M_;
      //Free memory.
      mem::aligned_delete<Real_t>(fftw_in);
      mem::aligned_delete<Real_t>(fftw_out);
      break;
    }
    case V1_Type: // Gather the 8x8 child-pair V-list matrices into one blocked matrix.
    {
      if(MultipoleOrder()==0) break;
      const int* ker_dim=kernel->k_m2l->ker_dim;
      size_t mat_cnt =interac_list.ListCount( V_Type);
      for(size_t k=0;k<mat_cnt;k++) Precomp(level, V_Type, k);
      const size_t chld_cnt=1UL<<COORD_DIM;
      size_t n1=MultipoleOrder()*2;
      size_t M_dim=n1*n1*(n1/2+1);
      size_t n3=n1*n1*n1;
      // Zero block used for child pairs with no V-list interaction.
      Vector<Real_t> zero_vec(M_dim*ker_dim[0]*ker_dim[1]*2);
      zero_vec.SetZero();
      Vector<Real_t*> M_ptr(chld_cnt*chld_cnt);
      for(size_t i=0;i<chld_cnt*chld_cnt;i++) M_ptr[i]=&zero_vec[0];
      int* rel_coord_=interac_list.RelativeCoord(V1_Type, mat_indx);
      for(int j1=0;j1<chld_cnt;j1++)
      for(int j2=0;j2<chld_cnt;j2++){
        // Child-level relative coordinate for source child j1, target child j2.
        int rel_coord[3]={rel_coord_[0]*2-(j1/1)%2+(j2/1)%2,
                          rel_coord_[1]*2-(j1/2)%2+(j2/2)%2,
                          rel_coord_[2]*2-(j1/4)%2+(j2/4)%2};
        for(size_t k=0;k<mat_cnt;k++){
          int* ref_coord=interac_list.RelativeCoord(V_Type, k);
          if(ref_coord[0]==rel_coord[0] &&
             ref_coord[1]==rel_coord[1] &&
             ref_coord[2]==rel_coord[2]){
            // Local M shadows the outer result matrix inside this scope.
            Matrix<Real_t>& M = this->mat->Mat(level, V_Type, k);
            M_ptr[j2*chld_cnt+j1]=&M[0][0];
            break;
          }
        }
      }
      // Build matrix ker_dim0 x ker_dim1 x M_dim x 8 x 8
      M.Resize(ker_dim[0]*ker_dim[1]*M_dim, 2*chld_cnt*chld_cnt);
      for(int j=0;j<ker_dim[0]*ker_dim[1]*M_dim;j++){
        for(size_t k=0;k<chld_cnt*chld_cnt;k++){
          // Interleaved real/imag parts; division by n3 normalizes the FFT.
          M[j][k*2+0]=M_ptr[k][j*2+0]/n3;
          M[j][k*2+1]=M_ptr[k][j*2+1]/n3;
        }
      }
      break;
    }
    case W_Type: // Multipole-to-target: child equivalent surface evaluated at targets.
    {
      if(MultipoleOrder()==0) break;
      const int* ker_dim=kernel->k_m2t->ker_dim;
      std::vector<Real_t>& rel_trg_coord=mat->RelativeTrgCoord();
      // Coord of target points
      Real_t s=pow(0.5,level);
      size_t n_trg=rel_trg_coord.size()/3;
      std::vector<Real_t> trg_coord(n_trg*3);
      for(size_t j=0;j<n_trg*COORD_DIM;j++) trg_coord[j]=rel_trg_coord[j]*s;
      // Coord of downward equivalent surface
      int* coord2=interac_list.RelativeCoord(type,mat_indx);
      Real_t c[3]={(Real_t)((coord2[0]+1)*s*0.25),(Real_t)((coord2[1]+1)*s*0.25),(Real_t)((coord2[2]+1)*s*0.25)};
      std::vector<Real_t> equiv_surf=u_equiv_surf(MultipoleOrder(),c,level+1);
      size_t n_eq=equiv_surf.size()/3;
      // Evaluate potential at target points due to equivalent surface.
      {
        M.Resize(n_eq*ker_dim[0],n_trg*ker_dim[1]);
        kernel->k_m2t->BuildMatrix(&equiv_surf[0], n_eq, &trg_coord[0], n_trg, &(M[0][0]));
      }
      break;
    }
    case BC_Type: // Periodic boundary-condition matrix (scale-invariant kernels only).
    {
      if(!this->ScaleInvar() || MultipoleOrder()==0) break;
      // Requires compatible kernel dimensions between m2l and m2m/l2l.
      if(kernel->k_m2l->ker_dim[0]!=kernel->k_m2m->ker_dim[0]) break;
      if(kernel->k_m2l->ker_dim[1]!=kernel->k_l2l->ker_dim[1]) break;
      const int* ker_dim=kernel->k_m2l->ker_dim;
      size_t mat_cnt_m2m=interac_list.ListCount(U2U_Type);
      size_t n_surf=(6*(MultipoleOrder()-1)*(MultipoleOrder()-1)+2); //Total number of points.
      if((M.Dim(0)!=n_surf*ker_dim[0] || M.Dim(1)!=n_surf*ker_dim[1]) && level==0){
        // Per-level operators accumulated over BC_LEVELS coarser levels.
        Matrix<Real_t> M_m2m[BC_LEVELS+1];
        Matrix<Real_t> M_m2l[BC_LEVELS+1];
        Matrix<Real_t> M_l2l[BC_LEVELS+1];
        // Projectors removing the mean component from equiv/check data.
        Matrix<Real_t> M_equiv_zero_avg(n_surf*ker_dim[0],n_surf*ker_dim[0]);
        Matrix<Real_t> M_check_zero_avg(n_surf*ker_dim[1],n_surf*ker_dim[1]);
        { // Set average multipole charge to zero. (improves stability for large BC_LEVELS)
          M_equiv_zero_avg.SetZero();
          for(size_t i=0;i<n_surf*ker_dim[0];i++)
            M_equiv_zero_avg[i][i]+=1;
          for(size_t i=0;i<n_surf;i++)
            for(size_t j=0;j<n_surf;j++)
              for(size_t k=0;k<ker_dim[0];k++)
                M_equiv_zero_avg[i*ker_dim[0]+k][j*ker_dim[0]+k]-=1.0/n_surf;
        }
        { // Set average check potential to zero. (improves stability for large BC_LEVELS)
          M_check_zero_avg.SetZero();
          for(size_t i=0;i<n_surf*ker_dim[1];i++)
            M_check_zero_avg[i][i]+=1;
          for(size_t i=0;i<n_surf;i++)
            for(size_t j=0;j<n_surf;j++)
              for(size_t k=0;k<ker_dim[1];k++)
                M_check_zero_avg[i*ker_dim[1]+k][j*ker_dim[1]+k]-=1.0/n_surf;
        }
        // Walk coarser (negative) levels; this `level` shadows the parameter.
        for(int level=0; level>=-BC_LEVELS; level--){
          { // Compute M_l2l
            this->Precomp(level, D2D_Type, 0);
            Permutation<Real_t>& Pr = this->interac_list.Perm_R(level, D2D_Type, 0);
            Permutation<Real_t>& Pc = this->interac_list.Perm_C(level, D2D_Type, 0);
            M_l2l[-level] = M_check_zero_avg * Pr * this->Precomp(level, D2D_Type, this->interac_list.InteracClass(D2D_Type, 0)) * Pc * M_check_zero_avg;
            assert(M_l2l[-level].Dim(0)>0 && M_l2l[-level].Dim(1)>0);
          }
          // Compute M_m2m (sum over all child-to-parent translations)
          for(size_t mat_indx=0; mat_indx<mat_cnt_m2m; mat_indx++){
            this->Precomp(level, U2U_Type, mat_indx);
            Permutation<Real_t>& Pr = this->interac_list.Perm_R(level, U2U_Type, mat_indx);
            Permutation<Real_t>& Pc = this->interac_list.Perm_C(level, U2U_Type, mat_indx);
            Matrix<Real_t> M = Pr * this->Precomp(level, U2U_Type, this->interac_list.InteracClass(U2U_Type, mat_indx)) * Pc;
            assert(M.Dim(0)>0 && M.Dim(1)>0);
            if(mat_indx==0) M_m2m[-level] = M_equiv_zero_avg*M*M_equiv_zero_avg;
            else M_m2m[-level] += M_equiv_zero_avg*M*M_equiv_zero_avg;
          }
          // Compute M_m2l
          if(!ScaleInvar() || level==0){
            // Direct evaluation: sum contributions of all far-image boxes.
            Real_t s=(1UL<<(-level));
            Real_t dc_coord[3]={0,0,0};
            std::vector<Real_t> trg_coord=d_check_surf(MultipoleOrder(), dc_coord, level);
            Matrix<Real_t> M_ue2dc(n_surf*ker_dim[0], n_surf*ker_dim[1]); M_ue2dc.SetZero();
            for(int x0=-2;x0<4;x0++)
            for(int x1=-2;x1<4;x1++)
            for(int x2=-2;x2<4;x2++)
            if(abs(x0)>1 || abs(x1)>1 || abs(x2)>1){ // Skip near-field neighbors.
              Real_t ue_coord[3]={x0*s, x1*s, x2*s};
              std::vector<Real_t> src_coord=u_equiv_surf(MultipoleOrder(), ue_coord, level);
              Matrix<Real_t> M_tmp(n_surf*ker_dim[0], n_surf*ker_dim[1]);
              kernel->k_m2l->BuildMatrix(&src_coord[0], n_surf,
                                         &trg_coord[0], n_surf, &(M_tmp[0][0]));
              M_ue2dc+=M_tmp;
            }
            M_m2l[-level]=M_check_zero_avg*M_ue2dc * M_check_zero_avg;
          }else{
            // Derive from the previous level and rescale.
            M_m2l[-level]=M_equiv_zero_avg * M_m2l[-level-1] * M_check_zero_avg;
            if(ScaleInvar()){ // Scale M_m2l
              Permutation<Real_t> ker_perm=this->kernel->k_m2l->perm_vec[0 +Scaling];
              Vector<Real_t> scal_exp=this->kernel->k_m2l->src_scal;
              for(size_t i=0;i<scal_exp.Dim();i++) scal_exp[i]=-scal_exp[i];
              Permutation<Real_t> P=equiv_surf_perm(MultipoleOrder(), Scaling, ker_perm, &scal_exp);
              M_m2l[-level]=P*M_m2l[-level];
            }
            if(ScaleInvar()){ // Scale M_m2l
              Permutation<Real_t> ker_perm=this->kernel->k_m2l->perm_vec[C_Perm+Scaling];
              Vector<Real_t> scal_exp=this->kernel->k_m2l->trg_scal;
              for(size_t i=0;i<scal_exp.Dim();i++) scal_exp[i]=-scal_exp[i];
              Permutation<Real_t> P=equiv_surf_perm(MultipoleOrder(), Scaling, ker_perm, &scal_exp);
              M_m2l[-level]=M_m2l[-level]*P;
            }
          }
        }
        // Telescope from the coarsest level back to level 0.
        for(int level=-BC_LEVELS;level<=0;level++){
          if(level==-BC_LEVELS) M = M_m2l[-level];
          else M = M_equiv_zero_avg * (M_m2l[-level] + M_m2m[-level]*M*M_l2l[-level]) * M_equiv_zero_avg;
        }
        { // ax+by+cz+d correction.
          // Four probe points (origin + unit offsets) determine an affine field.
          std::vector<Real_t> corner_pts;
          corner_pts.push_back(0); corner_pts.push_back(0); corner_pts.push_back(0);
          corner_pts.push_back(1); corner_pts.push_back(0); corner_pts.push_back(0);
          corner_pts.push_back(0); corner_pts.push_back(1); corner_pts.push_back(0);
          corner_pts.push_back(0); corner_pts.push_back(0); corner_pts.push_back(1);
          size_t n_corner=corner_pts.size()/COORD_DIM;
          // Coord of downward equivalent surface
          Real_t c[3]={0,0,0};
          std::vector<Real_t> up_equiv_surf=u_equiv_surf(MultipoleOrder(),c,0);
          std::vector<Real_t> dn_equiv_surf=d_equiv_surf(MultipoleOrder(),c,0);
          std::vector<Real_t> dn_check_surf=d_check_surf(MultipoleOrder(),c,0);
          Matrix<Real_t> M_err;
          { // Evaluate potential at corner due to upward and dnward equivalent surface.
            { // Error from local expansion.
              Matrix<Real_t> M_e2pt(n_surf*ker_dim[0],n_corner*ker_dim[1]);
              kernel->k_m2l->BuildMatrix(&dn_equiv_surf[0], n_surf,
                                         &corner_pts[0], n_corner, &(M_e2pt[0][0]));
              Matrix<Real_t>& M_dc2de0 = Precomp(0, DC2DE0_Type, 0);
              Matrix<Real_t>& M_dc2de1 = Precomp(0, DC2DE1_Type, 0);
              M_err=(M*M_dc2de0)*(M_dc2de1*M_e2pt);
            }
            for(size_t k=0;k<4;k++){ // Error from colleagues of root.
              for(int j0=-1;j0<=1;j0++)
              for(int j1=-1;j1<=1;j1++)
              for(int j2=-1;j2<=1;j2++){
                Real_t pt_coord[3]={corner_pts[k*COORD_DIM+0]-j0,
                                    corner_pts[k*COORD_DIM+1]-j1,
                                    corner_pts[k*COORD_DIM+2]-j2};
                if(fabs(pt_coord[0]-0.5)>1.0 || fabs(pt_coord[1]-0.5)>1.0 || fabs(pt_coord[2]-0.5)>1.0){
                  Matrix<Real_t> M_e2pt(n_surf*ker_dim[0],ker_dim[1]);
                  kernel->k_m2l->BuildMatrix(&up_equiv_surf[0], n_surf,
                                             &pt_coord[0], 1, &(M_e2pt[0][0]));
                  for(size_t i=0;i<M_e2pt.Dim(0);i++)
                    for(size_t j=0;j<M_e2pt.Dim(1);j++)
                      M_err[i][k*ker_dim[1]+j]+=M_e2pt[i][j];
                }
              }
            }
          }
          // Interpolate the affine error onto the check surface and subtract.
          Matrix<Real_t> M_grad(M_err.Dim(0),n_surf*ker_dim[1]);
          for(size_t i=0;i<M_err.Dim(0);i++)
            for(size_t k=0;k<ker_dim[1];k++)
              for(size_t j=0;j<n_surf;j++){
                M_grad[i][j*ker_dim[1]+k]=(M_err[i][0*ker_dim[1]+k] )*1.0 +
                    (M_err[i][1*ker_dim[1]+k]-M_err[i][0*ker_dim[1]+k])*dn_check_surf[j*COORD_DIM+0]+
                    (M_err[i][2*ker_dim[1]+k]-M_err[i][0*ker_dim[1]+k])*dn_check_surf[j*COORD_DIM+1]+
                    (M_err[i][3*ker_dim[1]+k]-M_err[i][0*ker_dim[1]+k])*dn_check_surf[j*COORD_DIM+2];
              }
          M-=M_grad;
        }
        if(!this->ScaleInvar()){ // Free memory
          // NOTE(review): this case breaks early unless ScaleInvar() is true,
          // so this cleanup branch appears unreachable here — confirm intent.
          Mat_Type type=D2D_Type;
          for(int l=-BC_LEVELS;l<0;l++)
          for(size_t indx=0;indx<this->interac_list.ListCount(type);indx++){
            Matrix<Real_t>& M=this->mat->Mat(l, type, indx);
            M.Resize(0,0);
          }
          type=U2U_Type;
          for(int l=-BC_LEVELS;l<0;l++)
          for(size_t indx=0;indx<this->interac_list.ListCount(type);indx++){
            Matrix<Real_t>& M=this->mat->Mat(l, type, indx);
            M.Resize(0,0);
          }
          type=DC2DE0_Type;
          for(int l=-BC_LEVELS;l<0;l++)
          for(size_t indx=0;indx<this->interac_list.ListCount(type);indx++){
            Matrix<Real_t>& M=this->mat->Mat(l, type, indx);
            M.Resize(0,0);
          }
          type=DC2DE1_Type;
          for(int l=-BC_LEVELS;l<0;l++)
          for(size_t indx=0;indx<this->interac_list.ListCount(type);indx++){
            Matrix<Real_t>& M=this->mat->Mat(l, type, indx);
            M.Resize(0,0);
          }
          type=UC2UE0_Type;
          for(int l=-BC_LEVELS;l<0;l++)
          for(size_t indx=0;indx<this->interac_list.ListCount(type);indx++){
            Matrix<Real_t>& M=this->mat->Mat(l, type, indx);
            M.Resize(0,0);
          }
          type=UC2UE1_Type;
          for(int l=-BC_LEVELS;l<0;l++)
          for(size_t indx=0;indx<this->interac_list.ListCount(type);indx++){
            Matrix<Real_t>& M=this->mat->Mat(l, type, indx);
            M.Resize(0,0);
          }
        }
      }
      break;
    }
    default:
      break;
  }
  //Save the matrix for future use.
  #pragma omp critical (PRECOMP_MATRIX_PTS)
  if(M_.Dim(0)==0 && M_.Dim(1)==0){
    M_=M;
    /*
    M_.Resize(M.Dim(0),M.Dim(1));
    int dof=ker_dim[0]*ker_dim[1];
    for(int j=0;j<dof;j++){
      size_t a=(M.Dim(0)*M.Dim(1)* j )/dof;
      size_t b=(M.Dim(0)*M.Dim(1)*(j+1))/dof;
      #pragma omp parallel for // NUMA
      for(int tid=0;tid<omp_p;tid++){
        size_t a_=a+((b-a)* tid )/omp_p;
        size_t b_=a+((b-a)*(tid+1))/omp_p;
        mem::memcopy(&M_[0][a_], &M[0][a_], (b_-a_)*sizeof(Real_t));
      }
    }
    */
  }
  return M_;
}
  899. template <class FMMNode>
  900. void FMM_Pts<FMMNode>::PrecompAll(Mat_Type type, int level){
  901. if(level==-1){
  902. for(int l=0;l<MAX_DEPTH;l++){
  903. PrecompAll(type, l);
  904. }
  905. return;
  906. }
  907. //Compute basic permutations.
  908. for(size_t i=0;i<Perm_Count;i++)
  909. this->PrecompPerm(type, (Perm_Type) i);
  910. {
  911. //Allocate matrices.
  912. size_t mat_cnt=interac_list.ListCount((Mat_Type)type);
  913. mat->Mat(level, (Mat_Type)type, mat_cnt-1);
  914. { // Compute InteracClass matrices.
  915. std::vector<size_t> indx_lst;
  916. for(size_t i=0; i<mat_cnt; i++){
  917. if(interac_list.InteracClass((Mat_Type)type,i)==i)
  918. indx_lst.push_back(i);
  919. }
  920. //Compute Transformations.
  921. //#pragma omp parallel for //lets use fine grained parallelism
  922. for(size_t i=0; i<indx_lst.size(); i++){
  923. Precomp(level, (Mat_Type)type, indx_lst[i]);
  924. }
  925. }
  926. //#pragma omp parallel for //lets use fine grained parallelism
  927. for(size_t mat_indx=0;mat_indx<mat_cnt;mat_indx++){
  928. Matrix<Real_t>& M0=interac_list.ClassMat(level,(Mat_Type)type,mat_indx);
  929. Permutation<Real_t>& pr=interac_list.Perm_R(level, (Mat_Type)type, mat_indx);
  930. Permutation<Real_t>& pc=interac_list.Perm_C(level, (Mat_Type)type, mat_indx);
  931. if(pr.Dim()!=M0.Dim(0) || pc.Dim()!=M0.Dim(1)) Precomp(level, (Mat_Type)type, mat_indx);
  932. }
  933. }
  934. }
  935. template <class FMMNode>
  936. void FMM_Pts<FMMNode>::CollectNodeData(FMMTree_t* tree, std::vector<FMMNode*>& node, std::vector<Matrix<Real_t> >& buff_list, std::vector<Vector<FMMNode_t*> >& n_list, std::vector<std::vector<Vector<Real_t>* > > vec_list){
  937. if(buff_list.size()<7) buff_list.resize(7);
  938. if( n_list.size()<7) n_list.resize(7);
  939. if( vec_list.size()<7) vec_list.resize(7);
  940. int omp_p=omp_get_max_threads();
  941. if(node.size()==0) return;
  942. {// 0. upward_equiv
  943. int indx=0;
  944. size_t vec_sz;
  945. { // Set vec_sz
  946. Matrix<Real_t>& M_uc2ue = this->interac_list.ClassMat(0, UC2UE1_Type, 0);
  947. vec_sz=M_uc2ue.Dim(1);
  948. }
  949. std::vector< FMMNode* > node_lst;
  950. {// Construct node_lst
  951. node_lst.clear();
  952. std::vector<std::vector< FMMNode* > > node_lst_(MAX_DEPTH+1);
  953. FMMNode_t* r_node=NULL;
  954. for(size_t i=0;i<node.size();i++){
  955. if(!node[i]->IsLeaf()){
  956. node[i]->pt_cnt[0] =0;
  957. node_lst_[node[i]->Depth()].push_back(node[i]);
  958. }else{
  959. node[i]->pt_cnt[0] =node[i]-> src_coord.Dim()/COORD_DIM;
  960. node[i]->pt_cnt[0]+=node[i]->surf_coord.Dim()/COORD_DIM;
  961. if(node[i]->IsGhost()) node[i]->pt_cnt[0]++; // TODO: temporary fix, pt_cnt not known for ghost nodes
  962. }
  963. if(node[i]->Depth()==0) r_node=node[i];
  964. }
  965. size_t chld_cnt=1UL<<COORD_DIM;
  966. for(int i=MAX_DEPTH;i>=0;i--){
  967. for(size_t j=0;j<node_lst_[i].size();j++){
  968. for(size_t k=0;k<chld_cnt;k++){
  969. FMMNode_t* node=(FMMNode_t*)node_lst_[i][j]->Child(k);
  970. node_lst_[i][j]->pt_cnt[0]+=node->pt_cnt[0];
  971. }
  972. }
  973. }
  974. for(int i=0;i<=MAX_DEPTH;i++){
  975. for(size_t j=0;j<node_lst_[i].size();j++){
  976. if(node_lst_[i][j]->pt_cnt[0])
  977. for(size_t k=0;k<chld_cnt;k++){
  978. FMMNode_t* node=(FMMNode_t*)node_lst_[i][j]->Child(k);
  979. node_lst.push_back(node);
  980. }
  981. }
  982. }
  983. if(r_node!=NULL) node_lst.push_back(r_node);
  984. n_list[indx]=node_lst;
  985. }
  986. std::vector<Vector<Real_t>*>& vec_lst=vec_list[indx];
  987. for(size_t i=0;i<node_lst.size();i++){ // Construct vec_lst
  988. FMMNode_t* node=node_lst[i];
  989. Vector<Real_t>& data_vec=node->FMMData()->upward_equiv;
  990. data_vec.ReInit(vec_sz,NULL,false);
  991. vec_lst.push_back(&data_vec);
  992. }
  993. }
  994. {// 1. dnward_equiv
  995. int indx=1;
  996. size_t vec_sz;
  997. { // Set vec_sz
  998. Matrix<Real_t>& M_dc2de0 = this->interac_list.ClassMat(0, DC2DE0_Type, 0);
  999. vec_sz=M_dc2de0.Dim(0);
  1000. }
  1001. std::vector< FMMNode* > node_lst;
  1002. {// Construct node_lst
  1003. node_lst.clear();
  1004. std::vector<std::vector< FMMNode* > > node_lst_(MAX_DEPTH+1);
  1005. FMMNode_t* r_node=NULL;
  1006. for(size_t i=0;i<node.size();i++){
  1007. if(!node[i]->IsLeaf()){
  1008. node[i]->pt_cnt[1]=0;
  1009. node_lst_[node[i]->Depth()].push_back(node[i]);
  1010. }else{
  1011. node[i]->pt_cnt[1]=node[i]->trg_coord.Dim()/COORD_DIM;
  1012. }
  1013. if(node[i]->Depth()==0) r_node=node[i];
  1014. }
  1015. size_t chld_cnt=1UL<<COORD_DIM;
  1016. for(int i=MAX_DEPTH;i>=0;i--){
  1017. for(size_t j=0;j<node_lst_[i].size();j++){
  1018. for(size_t k=0;k<chld_cnt;k++){
  1019. FMMNode_t* node=(FMMNode_t*)node_lst_[i][j]->Child(k);
  1020. node_lst_[i][j]->pt_cnt[1]+=node->pt_cnt[1];
  1021. }
  1022. }
  1023. }
  1024. for(int i=0;i<=MAX_DEPTH;i++){
  1025. for(size_t j=0;j<node_lst_[i].size();j++){
  1026. if(node_lst_[i][j]->pt_cnt[1])
  1027. for(size_t k=0;k<chld_cnt;k++){
  1028. FMMNode_t* node=(FMMNode_t*)node_lst_[i][j]->Child(k);
  1029. node_lst.push_back(node);
  1030. }
  1031. }
  1032. }
  1033. if(r_node!=NULL) node_lst.push_back(r_node);
  1034. n_list[indx]=node_lst;
  1035. }
  1036. std::vector<Vector<Real_t>*>& vec_lst=vec_list[indx];
  1037. for(size_t i=0;i<node_lst.size();i++){ // Construct vec_lst
  1038. FMMNode_t* node=node_lst[i];
  1039. Vector<Real_t>& data_vec=node->FMMData()->dnward_equiv;
  1040. data_vec.ReInit(vec_sz,NULL,false);
  1041. vec_lst.push_back(&data_vec);
  1042. }
  1043. }
  1044. {// 2. upward_equiv_fft
  1045. int indx=2;
  1046. std::vector< FMMNode* > node_lst;
  1047. {
  1048. std::vector<std::vector< FMMNode* > > node_lst_(MAX_DEPTH+1);
  1049. for(size_t i=0;i<node.size();i++)
  1050. if(!node[i]->IsLeaf())
  1051. node_lst_[node[i]->Depth()].push_back(node[i]);
  1052. for(int i=0;i<=MAX_DEPTH;i++)
  1053. for(size_t j=0;j<node_lst_[i].size();j++)
  1054. node_lst.push_back(node_lst_[i][j]);
  1055. }
  1056. n_list[indx]=node_lst;
  1057. }
  1058. {// 3. dnward_check_fft
  1059. int indx=3;
  1060. std::vector< FMMNode* > node_lst;
  1061. {
  1062. std::vector<std::vector< FMMNode* > > node_lst_(MAX_DEPTH+1);
  1063. for(size_t i=0;i<node.size();i++)
  1064. if(!node[i]->IsLeaf() && !node[i]->IsGhost())
  1065. node_lst_[node[i]->Depth()].push_back(node[i]);
  1066. for(int i=0;i<=MAX_DEPTH;i++)
  1067. for(size_t j=0;j<node_lst_[i].size();j++)
  1068. node_lst.push_back(node_lst_[i][j]);
  1069. }
  1070. n_list[indx]=node_lst;
  1071. }
  1072. {// 4. src_val
  1073. int indx=4;
  1074. int src_dof=kernel->ker_dim[0];
  1075. int surf_dof=COORD_DIM+src_dof;
  1076. std::vector< FMMNode* > node_lst;
  1077. for(size_t i=0;i<node.size();i++){// Construct node_lst
  1078. if(node[i]->IsLeaf()){
  1079. node_lst.push_back(node[i]);
  1080. }
  1081. }
  1082. n_list[indx]=node_lst;
  1083. std::vector<Vector<Real_t>*>& vec_lst=vec_list[indx];
  1084. for(size_t i=0;i<node_lst.size();i++){ // Construct vec_lst
  1085. FMMNode_t* node=node_lst[i];
  1086. { // src_value
  1087. Vector<Real_t>& data_vec=node->src_value;
  1088. size_t vec_sz=(node->src_coord.Dim()/COORD_DIM)*src_dof;
  1089. if(data_vec.Dim()!=vec_sz) data_vec.ReInit(vec_sz,NULL,false);
  1090. vec_lst.push_back(&data_vec);
  1091. }
  1092. { // surf_value
  1093. Vector<Real_t>& data_vec=node->surf_value;
  1094. size_t vec_sz=(node->surf_coord.Dim()/COORD_DIM)*surf_dof;
  1095. if(data_vec.Dim()!=vec_sz) data_vec.ReInit(vec_sz,NULL,false);
  1096. vec_lst.push_back(&data_vec);
  1097. }
  1098. }
  1099. }
  1100. {// 5. trg_val
  1101. int indx=5;
  1102. int trg_dof=kernel->ker_dim[1];
  1103. std::vector< FMMNode* > node_lst;
  1104. for(size_t i=0;i<node.size();i++){// Construct node_lst
  1105. if(node[i]->IsLeaf() && !node[i]->IsGhost()){
  1106. node_lst.push_back(node[i]);
  1107. }
  1108. }
  1109. n_list[indx]=node_lst;
  1110. std::vector<Vector<Real_t>*>& vec_lst=vec_list[indx];
  1111. for(size_t i=0;i<node_lst.size();i++){ // Construct vec_lst
  1112. FMMNode_t* node=node_lst[i];
  1113. { // trg_value
  1114. Vector<Real_t>& data_vec=node->trg_value;
  1115. size_t vec_sz=(node->trg_coord.Dim()/COORD_DIM)*trg_dof;
  1116. data_vec.ReInit(vec_sz,NULL,false);
  1117. vec_lst.push_back(&data_vec);
  1118. }
  1119. }
  1120. }
  1121. {// 6. pts_coord
  1122. int indx=6;
  1123. std::vector< FMMNode* > node_lst;
  1124. for(size_t i=0;i<node.size();i++){// Construct node_lst
  1125. if(node[i]->IsLeaf()){
  1126. node_lst.push_back(node[i]);
  1127. }
  1128. }
  1129. n_list[indx]=node_lst;
  1130. std::vector<Vector<Real_t>*>& vec_lst=vec_list[indx];
  1131. for(size_t i=0;i<node_lst.size();i++){ // Construct vec_lst
  1132. FMMNode_t* node=node_lst[i];
  1133. { // src_coord
  1134. Vector<Real_t>& data_vec=node->src_coord;
  1135. vec_lst.push_back(&data_vec);
  1136. }
  1137. { // surf_coord
  1138. Vector<Real_t>& data_vec=node->surf_coord;
  1139. vec_lst.push_back(&data_vec);
  1140. }
  1141. { // trg_coord
  1142. Vector<Real_t>& data_vec=node->trg_coord;
  1143. vec_lst.push_back(&data_vec);
  1144. }
  1145. }
  1146. { // check and equiv surfaces.
  1147. if(tree->upwd_check_surf.size()==0){
  1148. size_t m=MultipoleOrder();
  1149. tree->upwd_check_surf.resize(MAX_DEPTH);
  1150. tree->upwd_equiv_surf.resize(MAX_DEPTH);
  1151. tree->dnwd_check_surf.resize(MAX_DEPTH);
  1152. tree->dnwd_equiv_surf.resize(MAX_DEPTH);
  1153. for(size_t depth=0;depth<MAX_DEPTH;depth++){
  1154. Real_t c[3]={0.0,0.0,0.0};
  1155. tree->upwd_check_surf[depth].ReInit((6*(m-1)*(m-1)+2)*COORD_DIM);
  1156. tree->upwd_equiv_surf[depth].ReInit((6*(m-1)*(m-1)+2)*COORD_DIM);
  1157. tree->dnwd_check_surf[depth].ReInit((6*(m-1)*(m-1)+2)*COORD_DIM);
  1158. tree->dnwd_equiv_surf[depth].ReInit((6*(m-1)*(m-1)+2)*COORD_DIM);
  1159. tree->upwd_check_surf[depth]=u_check_surf(m,c,depth);
  1160. tree->upwd_equiv_surf[depth]=u_equiv_surf(m,c,depth);
  1161. tree->dnwd_check_surf[depth]=d_check_surf(m,c,depth);
  1162. tree->dnwd_equiv_surf[depth]=d_equiv_surf(m,c,depth);
  1163. }
  1164. }
  1165. for(size_t depth=0;depth<MAX_DEPTH;depth++){
  1166. vec_lst.push_back(&tree->upwd_check_surf[depth]);
  1167. vec_lst.push_back(&tree->upwd_equiv_surf[depth]);
  1168. vec_lst.push_back(&tree->dnwd_check_surf[depth]);
  1169. vec_lst.push_back(&tree->dnwd_equiv_surf[depth]);
  1170. }
  1171. }
  1172. }
  1173. // Create extra auxiliary buffer.
  1174. if(buff_list.size()<=vec_list.size()) buff_list.resize(vec_list.size()+1);
  1175. for(size_t indx=0;indx<vec_list.size();indx++){ // Resize buffer
  1176. Matrix<Real_t>& buff=buff_list[indx];
  1177. std::vector<Vector<Real_t>*>& vec_lst= vec_list[indx];
  1178. bool keep_data=(indx==4 || indx==6);
  1179. size_t n_vec=vec_lst.size();
  1180. { // Continue if nothing to be done.
  1181. if(!n_vec) continue;
  1182. if(buff.Dim(0)*buff.Dim(1)>0){
  1183. bool init_buff=false;
  1184. Real_t* buff_start=&buff[0][0];
  1185. Real_t* buff_end=&buff[0][0]+buff.Dim(0)*buff.Dim(1);
  1186. #pragma omp parallel for reduction(||:init_buff)
  1187. for(size_t i=0;i<n_vec;i++){
  1188. if(vec_lst[i]->Dim() && (&(*vec_lst[i])[0]<buff_start || &(*vec_lst[i])[0]>=buff_end)){
  1189. init_buff=true;
  1190. }
  1191. }
  1192. if(!init_buff) continue;
  1193. }
  1194. }
  1195. std::vector<size_t> vec_size(n_vec);
  1196. std::vector<size_t> vec_disp(n_vec);
  1197. if(n_vec){ // Set vec_size and vec_disp
  1198. #pragma omp parallel for
  1199. for(size_t i=0;i<n_vec;i++){ // Set vec_size
  1200. vec_size[i]=vec_lst[i]->Dim();
  1201. }
  1202. vec_disp[0]=0;
  1203. omp_par::scan(&vec_size[0],&vec_disp[0],n_vec);
  1204. }
  1205. size_t buff_size=vec_size[n_vec-1]+vec_disp[n_vec-1];
  1206. if(!buff_size) continue;
  1207. if(keep_data){ // Copy to dev_buffer
  1208. if(dev_buffer.Dim()<buff_size*sizeof(Real_t)){ // Resize dev_buffer
  1209. dev_buffer.ReInit(buff_size*sizeof(Real_t)*1.05);
  1210. }
  1211. #pragma omp parallel for
  1212. for(size_t i=0;i<n_vec;i++){
  1213. if(&(*vec_lst[i])[0]){
  1214. mem::memcopy(((Real_t*)&dev_buffer[0])+vec_disp[i],&(*vec_lst[i])[0],vec_size[i]*sizeof(Real_t));
  1215. }
  1216. }
  1217. }
  1218. if(buff.Dim(0)*buff.Dim(1)<buff_size){ // Resize buff
  1219. buff.ReInit(1,buff_size*1.05);
  1220. }
  1221. if(keep_data){ // Copy to buff (from dev_buffer)
  1222. #pragma omp parallel for
  1223. for(size_t tid=0;tid<omp_p;tid++){
  1224. size_t a=(buff_size*(tid+0))/omp_p;
  1225. size_t b=(buff_size*(tid+1))/omp_p;
  1226. mem::memcopy(&buff[0][0]+a,((Real_t*)&dev_buffer[0])+a,(b-a)*sizeof(Real_t));
  1227. }
  1228. }
  1229. #pragma omp parallel for
  1230. for(size_t i=0;i<n_vec;i++){ // ReInit vectors
  1231. vec_lst[i]->ReInit(vec_size[i],&buff[0][0]+vec_disp[i],false);
  1232. }
  1233. }
  1234. }
  1235. template <class FMMNode>
  1236. void FMM_Pts<FMMNode>::SetupPrecomp(SetupData<Real_t>& setup_data, bool device){
  1237. if(setup_data.precomp_data==NULL || setup_data.level>MAX_DEPTH) return;
  1238. Profile::Tic("SetupPrecomp",&this->comm,true,25);
  1239. { // Build precomp_data
  1240. size_t precomp_offset=0;
  1241. int level=setup_data.level;
  1242. Matrix<char>& precomp_data=*setup_data.precomp_data;
  1243. std::vector<Mat_Type>& interac_type_lst=setup_data.interac_type;
  1244. for(size_t type_indx=0; type_indx<interac_type_lst.size(); type_indx++){
  1245. Mat_Type& interac_type=interac_type_lst[type_indx];
  1246. this->PrecompAll(interac_type, level); // Compute matrices.
  1247. precomp_offset=this->mat->CompactData(level, interac_type, precomp_data, precomp_offset);
  1248. }
  1249. }
  1250. Profile::Toc();
  1251. if(device){ // Host2Device
  1252. Profile::Tic("Host2Device",&this->comm,false,25);
  1253. setup_data.precomp_data->AllocDevice(true);
  1254. Profile::Toc();
  1255. }
  1256. }
// Build setup_data.interac_data for a set of interaction types so the lists
// can later be evaluated (by EvalList) as permute / GEMM / permute-accumulate
// passes. Steps:
//  - ensure the compacted precomputed matrices exist (SetupPrecomp),
//  - pair each target node with its source nodes for every matrix index,
//  - partition the interactions into blocks whose scratch vectors fit in
//    a buffer of at most buff_size bytes,
//  - serialize block sizes, counts, matrix offsets and the input/output
//    permutation records into setup_data.interac_data.
template <class FMMNode>
void FMM_Pts<FMMNode>::SetupInterac(SetupData<Real_t>& setup_data, bool device){
int level=setup_data.level;
std::vector<Mat_Type>& interac_type_lst=setup_data.interac_type;
std::vector<void*>& nodes_in =setup_data.nodes_in ;
std::vector<void*>& nodes_out=setup_data.nodes_out;
Matrix<Real_t>& input_data=*setup_data. input_data;
Matrix<Real_t>& output_data=*setup_data.output_data;
std::vector<Vector<Real_t>*>& input_vector=setup_data. input_vector;
std::vector<Vector<Real_t>*>& output_vector=setup_data.output_vector;
size_t n_in =nodes_in .size();
size_t n_out=nodes_out.size();
// Setup precomputed data.
if(setup_data.precomp_data->Dim(0)*setup_data.precomp_data->Dim(1)==0) SetupPrecomp(setup_data,device);
// Build interac_data
Profile::Tic("Interac-Data",&this->comm,true,25);
Matrix<char>& interac_data=setup_data.interac_data;
{ // Build precomp_data, interac_data
std::vector<size_t> interac_mat;
std::vector<size_t> interac_cnt;
std::vector<size_t> interac_blk;
std::vector<size_t> input_perm;
std::vector<size_t> output_perm;
size_t dof=0, M_dim0=0, M_dim1=0;
size_t precomp_offset=0;
// Scratch-buffer budget in bytes; grown below if any single matrix index
// has more interactions than fit in this budget.
size_t buff_size=DEVICE_BUFFER_SIZE*1024l*1024l;
if(n_out && n_in) for(size_t type_indx=0; type_indx<interac_type_lst.size(); type_indx++){
Mat_Type& interac_type=interac_type_lst[type_indx];
size_t mat_cnt=this->interac_list.ListCount(interac_type);
Matrix<size_t> precomp_data_offset;
{ // Load precomp_data for interac_type.
// Header layout written by CompactData; the offset table that follows it
// is aliased in place (no copy).
struct HeaderData{
size_t total_size;
size_t level;
size_t mat_cnt ;
size_t max_depth;
};
Matrix<char>& precomp_data=*setup_data.precomp_data;
char* indx_ptr=precomp_data[0]+precomp_offset;
HeaderData& header=*(HeaderData*)indx_ptr;indx_ptr+=sizeof(HeaderData);
precomp_data_offset.ReInit(header.mat_cnt,(1+(2+2)*header.max_depth), (size_t*)indx_ptr, false);
precomp_offset+=header.total_size;
}
Matrix<FMMNode*> src_interac_list(n_in ,mat_cnt); src_interac_list.SetZero();
Matrix<FMMNode*> trg_interac_list(n_out,mat_cnt); trg_interac_list.SetZero();
{ // Build trg_interac_list
#pragma omp parallel for
for(size_t i=0;i<n_out;i++){
if(!((FMMNode*)nodes_out[i])->IsGhost() && (level==-1 || ((FMMNode*)nodes_out[i])->Depth()==level)){
Vector<FMMNode*>& lst=((FMMNode*)nodes_out[i])->interac_list[interac_type];
mem::memcopy(&trg_interac_list[i][0], &lst[0], lst.Dim()*sizeof(FMMNode*));
assert(lst.Dim()==mat_cnt);
}
}
}
{ // Build src_interac_list
// node_id trick: first mark every referenced node with the sentinel n_in,
// then renumber the nodes actually present in nodes_in. Entries whose
// sentinel survives are not in nodes_in and get dropped below.
#pragma omp parallel for
for(size_t i=0;i<n_out;i++){
for(size_t j=0;j<mat_cnt;j++)
if(trg_interac_list[i][j]!=NULL){
trg_interac_list[i][j]->node_id=n_in;
}
}
#pragma omp parallel for
for(size_t i=0;i<n_in ;i++) ((FMMNode*)nodes_in [i])->node_id=i;
#pragma omp parallel for
for(size_t i=0;i<n_out;i++){
for(size_t j=0;j<mat_cnt;j++){
if(trg_interac_list[i][j]!=NULL){
if(trg_interac_list[i][j]->node_id==n_in){
trg_interac_list[i][j]=NULL;
}else{
src_interac_list[trg_interac_list[i][j]->node_id][j]=(FMMNode*)nodes_out[i];
}
}
}
}
}
Matrix<size_t> interac_dsp(n_out,mat_cnt);
std::vector<size_t> interac_blk_dsp(1,0);
{ // Determine dof, M_dim0, M_dim1
dof=1;
Matrix<Real_t>& M0 = this->interac_list.ClassMat(level, interac_type_lst[0], 0);
M_dim0=M0.Dim(0); M_dim1=M0.Dim(1);
}
{ // Determine interaction blocks which fit in memory.
// interac_dsp[i][j] is the per-block slot index of interaction (i,j) in the
// scratch buffer; whenever running slots would overflow buff_size, close the
// current block and restart slot numbering.
size_t vec_size=(M_dim0+M_dim1)*sizeof(Real_t)*dof;
for(size_t j=0;j<mat_cnt;j++){// Determine minimum buff_size
size_t vec_cnt=0;
for(size_t i=0;i<n_out;i++){
if(trg_interac_list[i][j]!=NULL) vec_cnt++;
}
if(buff_size<vec_cnt*vec_size)
buff_size=vec_cnt*vec_size;
}
size_t interac_dsp_=0;
for(size_t j=0;j<mat_cnt;j++){
for(size_t i=0;i<n_out;i++){
interac_dsp[i][j]=interac_dsp_;
if(trg_interac_list[i][j]!=NULL) interac_dsp_++;
}
if(interac_dsp_*vec_size>buff_size) // Comment to disable symmetries.
{
interac_blk.push_back(j-interac_blk_dsp.back());
interac_blk_dsp.push_back(j);
size_t offset=interac_dsp[0][j];
for(size_t i=0;i<n_out;i++) interac_dsp[i][j]-=offset;
interac_dsp_-=offset;
assert(interac_dsp_*vec_size<=buff_size); // Problem too big for buff_size.
}
interac_mat.push_back(precomp_data_offset[this->interac_list.InteracClass(interac_type,j)][0]);
interac_cnt.push_back(interac_dsp_-interac_dsp[0][j]);
}
interac_blk.push_back(mat_cnt-interac_blk_dsp.back());
interac_blk_dsp.push_back(mat_cnt);
}
{ // Determine input_perm.
// One record of 4 size_t per interaction:
// [perm-vector offset, scaling-vector offset, scratch byte offset, input_data byte offset].
size_t vec_size=M_dim0*dof;
for(size_t i=0;i<n_out;i++) ((FMMNode*)nodes_out[i])->node_id=i;
for(size_t k=1;k<interac_blk_dsp.size();k++){
for(size_t i=0;i<n_in ;i++){
for(size_t j=interac_blk_dsp[k-1];j<interac_blk_dsp[k];j++){
FMMNode_t* trg_node=src_interac_list[i][j];
if(trg_node!=NULL && trg_node->node_id<n_out){
size_t depth=(this->ScaleInvar()?trg_node->Depth():0);
input_perm .push_back(precomp_data_offset[j][1+4*depth+0]); // prem
input_perm .push_back(precomp_data_offset[j][1+4*depth+1]); // scal
input_perm .push_back(interac_dsp[trg_node->node_id][j]*vec_size*sizeof(Real_t)); // trg_ptr
input_perm .push_back((size_t)(& input_vector[i][0][0]- input_data[0])); // src_ptr
assert(input_vector[i]->Dim()==vec_size);
}
}
}
}
}
{ // Determine output_perm
// Same 4-entry record shape as input_perm, but the buffer offset is the
// GEMM output slot and the last entry indexes into output_data.
size_t vec_size=M_dim1*dof;
for(size_t k=1;k<interac_blk_dsp.size();k++){
for(size_t i=0;i<n_out;i++){
for(size_t j=interac_blk_dsp[k-1];j<interac_blk_dsp[k];j++){
if(trg_interac_list[i][j]!=NULL){
size_t depth=(this->ScaleInvar()?((FMMNode*)nodes_out[i])->Depth():0);
output_perm.push_back(precomp_data_offset[j][1+4*depth+2]); // prem
output_perm.push_back(precomp_data_offset[j][1+4*depth+3]); // scal
output_perm.push_back(interac_dsp[ i ][j]*vec_size*sizeof(Real_t)); // src_ptr
output_perm.push_back((size_t)(&output_vector[i][0][0]-output_data[0])); // trg_ptr
assert(output_vector[i]->Dim()==vec_size);
}
}
}
}
}
}
if(this->dev_buffer.Dim()<buff_size) this->dev_buffer.ReInit(buff_size);
{ // Set interac_data.
// Serialize as: leading size_t (offset past any pre-existing payload),
// then header {data_size, M_dim0, M_dim1, dof}, then the five
// length-prefixed size_t arrays, appended after the existing contents.
size_t data_size=sizeof(size_t)*4;
data_size+=sizeof(size_t)+interac_blk.size()*sizeof(size_t);
data_size+=sizeof(size_t)+interac_cnt.size()*sizeof(size_t);
data_size+=sizeof(size_t)+interac_mat.size()*sizeof(size_t);
data_size+=sizeof(size_t)+ input_perm.size()*sizeof(size_t);
data_size+=sizeof(size_t)+output_perm.size()*sizeof(size_t);
if(interac_data.Dim(0)*interac_data.Dim(1)<sizeof(size_t)){
data_size+=sizeof(size_t);
interac_data.ReInit(1,data_size);
((size_t*)&interac_data[0][0])[0]=sizeof(size_t);
}else{
size_t pts_data_size=*((size_t*)&interac_data[0][0]);
assert(interac_data.Dim(0)*interac_data.Dim(1)>=pts_data_size);
data_size+=pts_data_size;
if(data_size>interac_data.Dim(0)*interac_data.Dim(1)){ //Resize and copy interac_data.
Matrix< char> pts_interac_data=interac_data;
interac_data.ReInit(1,data_size);
mem::memcopy(&interac_data[0][0],&pts_interac_data[0][0],pts_data_size);
}
}
char* data_ptr=&interac_data[0][0];
data_ptr+=((size_t*)data_ptr)[0];
((size_t*)data_ptr)[0]=data_size; data_ptr+=sizeof(size_t);
((size_t*)data_ptr)[0]= M_dim0; data_ptr+=sizeof(size_t);
((size_t*)data_ptr)[0]= M_dim1; data_ptr+=sizeof(size_t);
((size_t*)data_ptr)[0]= dof; data_ptr+=sizeof(size_t);
((size_t*)data_ptr)[0]=interac_blk.size(); data_ptr+=sizeof(size_t);
mem::memcopy(data_ptr, &interac_blk[0], interac_blk.size()*sizeof(size_t));
data_ptr+=interac_blk.size()*sizeof(size_t);
((size_t*)data_ptr)[0]=interac_cnt.size(); data_ptr+=sizeof(size_t);
mem::memcopy(data_ptr, &interac_cnt[0], interac_cnt.size()*sizeof(size_t));
data_ptr+=interac_cnt.size()*sizeof(size_t);
((size_t*)data_ptr)[0]=interac_mat.size(); data_ptr+=sizeof(size_t);
mem::memcopy(data_ptr, &interac_mat[0], interac_mat.size()*sizeof(size_t));
data_ptr+=interac_mat.size()*sizeof(size_t);
((size_t*)data_ptr)[0]= input_perm.size(); data_ptr+=sizeof(size_t);
mem::memcopy(data_ptr, & input_perm[0], input_perm.size()*sizeof(size_t));
data_ptr+= input_perm.size()*sizeof(size_t);
((size_t*)data_ptr)[0]=output_perm.size(); data_ptr+=sizeof(size_t);
mem::memcopy(data_ptr, &output_perm[0], output_perm.size()*sizeof(size_t));
data_ptr+=output_perm.size()*sizeof(size_t);
}
}
Profile::Toc();
if(device){ // Host2Device
Profile::Tic("Host2Device",&this->comm,false,25);
setup_data.interac_data .AllocDevice(true);
Profile::Toc();
}
}
  1462. #if defined(PVFMM_HAVE_CUDA)
  1463. #include <fmm_pts_gpu.hpp>
// GPU evaluation path for EvalList: unpacks the serialized interac_data
// (built by SetupInterac), then for each interaction block runs an input
// permutation kernel, a set of cuBLAS GEMMs (one per distinct precomputed
// matrix), and an output permutation kernel on the acquired CUDA stream.
template <class Real_t, int SYNC>
void EvalListGPU(SetupData<Real_t>& setup_data, Vector<char>& dev_buffer, MPI_Comm& comm) {
cudaStream_t* stream = pvfmm::CUDA_Lock::acquire_stream();
Profile::Tic("Host2Device",&comm,false,25);
typename Matrix<char>::Device interac_data;
typename Vector<char>::Device buff;
typename Matrix<char>::Device precomp_data_d;
typename Matrix<char>::Device interac_data_d;
typename Matrix<Real_t>::Device input_data_d;
typename Matrix<Real_t>::Device output_data_d;
interac_data = setup_data.interac_data;
buff = dev_buffer. AllocDevice(false);
precomp_data_d= setup_data.precomp_data->AllocDevice(false);
interac_data_d= setup_data.interac_data. AllocDevice(false);
input_data_d = setup_data. input_data->AllocDevice(false);
output_data_d = setup_data. output_data->AllocDevice(false);
Profile::Toc();
Profile::Tic("DeviceComp",&comm,false,20);
{ // Offloaded computation.
size_t data_size, M_dim0, M_dim1, dof;
Vector<size_t> interac_blk;
Vector<size_t> interac_cnt;
Vector<size_t> interac_mat;
Vector<size_t> input_perm_d;
Vector<size_t> output_perm_d;
{ // Set interac_data.
// Walk the host and device copies of interac_data in lockstep: the
// header and the blk/cnt/mat tables are read from the host copy, but
// the two permutation tables are aliased at the corresponding DEVICE
// addresses (dev_ptr) because they are consumed by the GPU kernels.
char* data_ptr=&interac_data [0][0];
char* dev_ptr=&interac_data_d[0][0];
// First size_t gives the byte offset past any preceding payload.
data_size=((size_t*)data_ptr)[0]; data_ptr+=data_size; dev_ptr += data_size;
data_size=((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t); dev_ptr += sizeof(size_t);
M_dim0 =((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t); dev_ptr += sizeof(size_t);
M_dim1 =((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t); dev_ptr += sizeof(size_t);
dof =((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t); dev_ptr += sizeof(size_t);
interac_blk.ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false);
data_ptr += sizeof(size_t) + sizeof(size_t)*interac_blk.Dim();
dev_ptr += sizeof(size_t) + sizeof(size_t)*interac_blk.Dim();
interac_cnt.ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false);
data_ptr += sizeof(size_t) + sizeof(size_t)*interac_cnt.Dim();
dev_ptr += sizeof(size_t) + sizeof(size_t)*interac_cnt.Dim();
interac_mat.ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false);
data_ptr += sizeof(size_t) + sizeof(size_t)*interac_mat.Dim();
dev_ptr += sizeof(size_t) + sizeof(size_t)*interac_mat.Dim();
input_perm_d.ReInit(((size_t*)data_ptr)[0],(size_t*)(dev_ptr+sizeof(size_t)),false);
data_ptr += sizeof(size_t) + sizeof(size_t)*input_perm_d.Dim();
dev_ptr += sizeof(size_t) + sizeof(size_t)*input_perm_d.Dim();
output_perm_d.ReInit(((size_t*)data_ptr)[0],(size_t*)(dev_ptr+sizeof(size_t)),false);
data_ptr += sizeof(size_t) + sizeof(size_t)*output_perm_d.Dim();
dev_ptr += sizeof(size_t) + sizeof(size_t)*output_perm_d.Dim();
}
{ // interactions
size_t interac_indx = 0;
size_t interac_blk_dsp = 0;
// NOTE(review): 'error' is declared but never used in this function.
cudaError_t error;
for (size_t k = 0; k < interac_blk.Dim(); k++) {
size_t vec_cnt=0;
for(size_t j=interac_blk_dsp;j<interac_blk_dsp+interac_blk[k];j++) vec_cnt+=interac_cnt[j];
if(vec_cnt==0){
//interac_indx += vec_cnt;
interac_blk_dsp += interac_blk[k];
continue;
}
// Scratch buffer is split: inputs at the front, GEMM outputs after them.
char *buff_in_d =&buff[0];
char *buff_out_d =&buff[vec_cnt*dof*M_dim0*sizeof(Real_t)];
{ // Input permutation.
in_perm_gpu<Real_t>(&precomp_data_d[0][0], &input_data_d[0][0], buff_in_d,
&input_perm_d[interac_indx*4], vec_cnt, M_dim0, stream);
}
// One GEMM per run of consecutive interactions sharing the same matrix.
size_t vec_cnt0 = 0;
for (size_t j = interac_blk_dsp; j < interac_blk_dsp + interac_blk[k];) {
size_t vec_cnt1 = 0;
size_t interac_mat0 = interac_mat[j];
for (; j < interac_blk_dsp + interac_blk[k] && interac_mat[j] == interac_mat0; j++) vec_cnt1 += interac_cnt[j];
Matrix<Real_t> M_d(M_dim0, M_dim1, (Real_t*)(precomp_data_d.dev_ptr + interac_mat0), false);
Matrix<Real_t> Ms_d(dof*vec_cnt1, M_dim0, (Real_t*)(buff_in_d + M_dim0*vec_cnt0*dof*sizeof(Real_t)), false);
Matrix<Real_t> Mt_d(dof*vec_cnt1, M_dim1, (Real_t*)(buff_out_d + M_dim1*vec_cnt0*dof*sizeof(Real_t)), false);
Matrix<Real_t>::CUBLASGEMM(Mt_d, Ms_d, M_d);
vec_cnt0 += vec_cnt1;
}
{ // Output permutation.
out_perm_gpu<Real_t>(&precomp_data_d[0][0], &output_data_d[0][0], buff_out_d,
&output_perm_d[interac_indx*4], vec_cnt, M_dim1, stream);
}
interac_indx += vec_cnt;
interac_blk_dsp += interac_blk[k];
}
}
}
Profile::Toc();
if(SYNC) CUDA_Lock::wait();
}
  1554. #endif
// Evaluate the interaction lists described by setup_data.interac_data
// (serialized by SetupInterac). For each interaction block: gather and scale
// input vectors into a scratch buffer (input permutation), apply the
// precomputed matrices with GEMMs grouped by identical matrix, then
// scatter-accumulate the results into output_data (output permutation).
// Runs on the host with OpenMP; dispatches to the CUDA path or offloads to
// MIC when device==true and the corresponding build flags are enabled.
template <class FMMNode>
template <int SYNC>
void FMM_Pts<FMMNode>::EvalList(SetupData<Real_t>& setup_data, bool device){
if(setup_data.interac_data.Dim(0)==0 || setup_data.interac_data.Dim(1)==0){
// Nothing to do; still emit the profiler sections so Tic/Toc stays balanced.
Profile::Tic("Host2Device",&this->comm,false,25);
Profile::Toc();
Profile::Tic("DeviceComp",&this->comm,false,20);
Profile::Toc();
return;
}
#if defined(PVFMM_HAVE_CUDA)
if (device) {
EvalListGPU<Real_t, SYNC>(setup_data, this->dev_buffer, this->comm);
return;
}
#endif
Profile::Tic("Host2Device",&this->comm,false,25);
typename Vector<char>::Device buff;
typename Matrix<char>::Device precomp_data;
typename Matrix<char>::Device interac_data;
typename Matrix<Real_t>::Device input_data;
typename Matrix<Real_t>::Device output_data;
if(device){
buff = this-> dev_buffer. AllocDevice(false);
precomp_data= setup_data.precomp_data->AllocDevice(false);
interac_data= setup_data.interac_data. AllocDevice(false);
input_data = setup_data. input_data->AllocDevice(false);
output_data = setup_data. output_data->AllocDevice(false);
}else{
buff = this-> dev_buffer;
precomp_data=*setup_data.precomp_data;
interac_data= setup_data.interac_data;
input_data =*setup_data. input_data;
output_data =*setup_data. output_data;
}
Profile::Toc();
Profile::Tic("DeviceComp",&this->comm,false,20);
int lock_idx=-1;
int wait_lock_idx=-1;
if(device) wait_lock_idx=MIC_Lock::curr_lock();
if(device) lock_idx=MIC_Lock::get_lock();
#ifdef __INTEL_OFFLOAD
#pragma offload if(device) target(mic:0) signal(&MIC_Lock::lock_vec[device?lock_idx:0])
#endif
{ // Offloaded computation.
// Set interac_data.
size_t data_size, M_dim0, M_dim1, dof;
Vector<size_t> interac_blk;
Vector<size_t> interac_cnt;
Vector<size_t> interac_mat;
Vector<size_t> input_perm;
Vector<size_t> output_perm;
{ // Set interac_data.
// Deserialize in place: the size_t arrays alias interac_data directly
// (ReInit with an external pointer and own_data=false), no copies.
// The first size_t is the byte offset past any preceding payload.
char* data_ptr=&interac_data[0][0];
data_size=((size_t*)data_ptr)[0]; data_ptr+=data_size;
data_size=((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t);
M_dim0 =((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t);
M_dim1 =((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t);
dof =((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t);
interac_blk.ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false);
data_ptr+=sizeof(size_t)+interac_blk.Dim()*sizeof(size_t);
interac_cnt.ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false);
data_ptr+=sizeof(size_t)+interac_cnt.Dim()*sizeof(size_t);
interac_mat.ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false);
data_ptr+=sizeof(size_t)+interac_mat.Dim()*sizeof(size_t);
input_perm .ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false);
data_ptr+=sizeof(size_t)+ input_perm.Dim()*sizeof(size_t);
output_perm.ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false);
data_ptr+=sizeof(size_t)+output_perm.Dim()*sizeof(size_t);
}
if(device) MIC_Lock::wait_lock(wait_lock_idx);
//Compute interaction from Chebyshev source density.
{ // interactions
int omp_p=omp_get_max_threads();
size_t interac_indx=0;
size_t interac_blk_dsp=0;
for(size_t k=0;k<interac_blk.Dim();k++){
size_t vec_cnt=0;
for(size_t j=interac_blk_dsp;j<interac_blk_dsp+interac_blk[k];j++) vec_cnt+=interac_cnt[j];
if(vec_cnt==0){
//interac_indx += vec_cnt;
interac_blk_dsp += interac_blk[k];
continue;
}
// Scratch buffer layout: gathered inputs first, GEMM outputs after them.
char* buff_in =&buff[0];
char* buff_out=&buff[vec_cnt*dof*M_dim0*sizeof(Real_t)];
// Input permutation.
// Each interaction record is 4 size_t entries:
// [perm-vector offset, scaling-vector offset, buffer offset, data offset].
#pragma omp parallel for
for(int tid=0;tid<omp_p;tid++){
size_t a=( tid *vec_cnt)/omp_p;
size_t b=((tid+1)*vec_cnt)/omp_p;
for(size_t i=a;i<b;i++){
const PERM_INT_T* perm=(PERM_INT_T*)(precomp_data[0]+input_perm[(interac_indx+i)*4+0]);
const Real_t* scal=( Real_t*)(precomp_data[0]+input_perm[(interac_indx+i)*4+1]);
const Real_t* v_in =( Real_t*)( input_data[0]+input_perm[(interac_indx+i)*4+3]);
Real_t* v_out=( Real_t*)( buff_in +input_perm[(interac_indx+i)*4+2]);
// TODO: Fix for dof>1
#ifdef __MIC__
{
// Vectorized gather/scale: scalar prologue/epilogue around an
// aligned 8-wide middle section.
__m512d v8;
size_t j_start=(((uintptr_t)(v_out ) + (uintptr_t)(MEM_ALIGN-1)) & ~ (uintptr_t)(MEM_ALIGN-1))-((uintptr_t)v_out);
size_t j_end =(((uintptr_t)(v_out+M_dim0) ) & ~ (uintptr_t)(MEM_ALIGN-1))-((uintptr_t)v_out);
j_start/=sizeof(Real_t);
j_end /=sizeof(Real_t);
assert(((uintptr_t)(v_out))%sizeof(Real_t)==0);
assert(((uintptr_t)(v_out+j_start))%64==0);
assert(((uintptr_t)(v_out+j_end ))%64==0);
size_t j=0;
for(;j<j_start;j++ ){
v_out[j]=v_in[perm[j]]*scal[j];
}
for(;j<j_end ;j+=8){
v8=_mm512_setr_pd(
v_in[perm[j+0]]*scal[j+0],
v_in[perm[j+1]]*scal[j+1],
v_in[perm[j+2]]*scal[j+2],
v_in[perm[j+3]]*scal[j+3],
v_in[perm[j+4]]*scal[j+4],
v_in[perm[j+5]]*scal[j+5],
v_in[perm[j+6]]*scal[j+6],
v_in[perm[j+7]]*scal[j+7]);
_mm512_storenrngo_pd(v_out+j,v8);
}
for(;j<M_dim0 ;j++ ){
v_out[j]=v_in[perm[j]]*scal[j];
}
}
#else
for(size_t j=0;j<M_dim0;j++ ){
v_out[j]=v_in[perm[j]]*scal[j];
}
#endif
}
}
// One GEMM per run of consecutive interactions sharing the same matrix.
size_t vec_cnt0=0;
for(size_t j=interac_blk_dsp;j<interac_blk_dsp+interac_blk[k];){
size_t vec_cnt1=0;
size_t interac_mat0=interac_mat[j];
for(;j<interac_blk_dsp+interac_blk[k] && interac_mat[j]==interac_mat0;j++) vec_cnt1+=interac_cnt[j];
Matrix<Real_t> M(M_dim0, M_dim1, (Real_t*)(precomp_data[0]+interac_mat0), false);
#ifdef __MIC__
{
Matrix<Real_t> Ms(dof*vec_cnt1, M_dim0, (Real_t*)(buff_in +M_dim0*vec_cnt0*dof*sizeof(Real_t)), false);
Matrix<Real_t> Mt(dof*vec_cnt1, M_dim1, (Real_t*)(buff_out+M_dim1*vec_cnt0*dof*sizeof(Real_t)), false);
Matrix<Real_t>::GEMM(Mt,Ms,M);
}
#else
// Split the GEMM's rows across threads.
#pragma omp parallel for
for(int tid=0;tid<omp_p;tid++){
size_t a=(dof*vec_cnt1*(tid ))/omp_p;
size_t b=(dof*vec_cnt1*(tid+1))/omp_p;
Matrix<Real_t> Ms(b-a, M_dim0, (Real_t*)(buff_in +M_dim0*vec_cnt0*dof*sizeof(Real_t))+M_dim0*a, false);
Matrix<Real_t> Mt(b-a, M_dim1, (Real_t*)(buff_out+M_dim1*vec_cnt0*dof*sizeof(Real_t))+M_dim1*a, false);
Matrix<Real_t>::GEMM(Mt,Ms,M);
}
#endif
vec_cnt0+=vec_cnt1;
}
// Output permutation.
// Thread ranges [a,b) are extended so that all interactions writing the
// same output vector (same *4+3 offset) stay on one thread; otherwise
// the += accumulation below would race across threads.
#pragma omp parallel for
for(int tid=0;tid<omp_p;tid++){
size_t a=( tid *vec_cnt)/omp_p;
size_t b=((tid+1)*vec_cnt)/omp_p;
if(tid> 0 && a<vec_cnt){ // Find 'a' independent of other threads.
size_t out_ptr=output_perm[(interac_indx+a)*4+3];
if(tid> 0) while(a<vec_cnt && out_ptr==output_perm[(interac_indx+a)*4+3]) a++;
}
if(tid<omp_p-1 && b<vec_cnt){ // Find 'b' independent of other threads.
size_t out_ptr=output_perm[(interac_indx+b)*4+3];
if(tid<omp_p-1) while(b<vec_cnt && out_ptr==output_perm[(interac_indx+b)*4+3]) b++;
}
for(size_t i=a;i<b;i++){ // Compute permutations.
const PERM_INT_T* perm=(PERM_INT_T*)(precomp_data[0]+output_perm[(interac_indx+i)*4+0]);
const Real_t* scal=( Real_t*)(precomp_data[0]+output_perm[(interac_indx+i)*4+1]);
const Real_t* v_in =( Real_t*)( buff_out +output_perm[(interac_indx+i)*4+2]);
Real_t* v_out=( Real_t*)( output_data[0]+output_perm[(interac_indx+i)*4+3]);
// TODO: Fix for dof>1
#ifdef __MIC__
{
// Vectorized scatter/scale with accumulation (read-modify-write).
__m512d v8;
__m512d v_old;
size_t j_start=(((uintptr_t)(v_out ) + (uintptr_t)(MEM_ALIGN-1)) & ~ (uintptr_t)(MEM_ALIGN-1))-((uintptr_t)v_out);
size_t j_end =(((uintptr_t)(v_out+M_dim1) ) & ~ (uintptr_t)(MEM_ALIGN-1))-((uintptr_t)v_out);
j_start/=sizeof(Real_t);
j_end /=sizeof(Real_t);
assert(((uintptr_t)(v_out))%sizeof(Real_t)==0);
assert(((uintptr_t)(v_out+j_start))%64==0);
assert(((uintptr_t)(v_out+j_end ))%64==0);
size_t j=0;
for(;j<j_start;j++ ){
v_out[j]+=v_in[perm[j]]*scal[j];
}
for(;j<j_end ;j+=8){
v_old=_mm512_load_pd(v_out+j);
v8=_mm512_setr_pd(
v_in[perm[j+0]]*scal[j+0],
v_in[perm[j+1]]*scal[j+1],
v_in[perm[j+2]]*scal[j+2],
v_in[perm[j+3]]*scal[j+3],
v_in[perm[j+4]]*scal[j+4],
v_in[perm[j+5]]*scal[j+5],
v_in[perm[j+6]]*scal[j+6],
v_in[perm[j+7]]*scal[j+7]);
v_old=_mm512_add_pd(v_old, v8);
_mm512_storenrngo_pd(v_out+j,v_old);
}
for(;j<M_dim1 ;j++ ){
v_out[j]+=v_in[perm[j]]*scal[j];
}
}
#else
for(size_t j=0;j<M_dim1;j++ ){
v_out[j]+=v_in[perm[j]]*scal[j];
}
#endif
}
}
interac_indx+=vec_cnt;
interac_blk_dsp+=interac_blk[k];
}
}
if(device) MIC_Lock::release_lock(lock_idx);
}
#ifdef __INTEL_OFFLOAD
if(SYNC){
#pragma offload if(device) target(mic:0)
{if(device) MIC_Lock::wait_lock(lock_idx);}
}
#endif
Profile::Toc();
}
  1786. template <class FMMNode>
  1787. void FMM_Pts<FMMNode>::Source2UpSetup(SetupData<Real_t>& setup_data, FMMTree_t* tree, std::vector<Matrix<Real_t> >& buff, std::vector<Vector<FMMNode_t*> >& n_list, int level, bool device){
  1788. if(!this->MultipoleOrder()) return;
  1789. { // Set setup_data
  1790. setup_data. level=level;
  1791. setup_data.kernel=kernel->k_s2m;
  1792. setup_data. input_data=&buff[4];
  1793. setup_data.output_data=&buff[0];
  1794. setup_data. coord_data=&buff[6];
  1795. Vector<FMMNode_t*>& nodes_in =n_list[4];
  1796. Vector<FMMNode_t*>& nodes_out=n_list[0];
  1797. setup_data.nodes_in .clear();
  1798. setup_data.nodes_out.clear();
  1799. for(size_t i=0;i<nodes_in .Dim();i++) if((nodes_in [i]->Depth()==level || level==-1) && nodes_in [i]->pt_cnt[0] && nodes_in [i]->IsLeaf() && !nodes_in [i]->IsGhost()) setup_data.nodes_in .push_back(nodes_in [i]);
  1800. for(size_t i=0;i<nodes_out.Dim();i++) if((nodes_out[i]->Depth()==level || level==-1) && nodes_out[i]->pt_cnt[0] && nodes_out[i]->IsLeaf() && !nodes_out[i]->IsGhost()) setup_data.nodes_out.push_back(nodes_out[i]);
  1801. }
  1802. struct PackedData{
  1803. size_t len;
  1804. Matrix<Real_t>* ptr;
  1805. Vector<size_t> cnt;
  1806. Vector<size_t> dsp;
  1807. };
  1808. struct InteracData{
  1809. Vector<size_t> in_node;
  1810. Vector<size_t> scal_idx;
  1811. Vector<Real_t> coord_shift;
  1812. Vector<size_t> interac_cnt;
  1813. Vector<size_t> interac_dsp;
  1814. Vector<size_t> interac_cst;
  1815. Vector<Real_t> scal[4*MAX_DEPTH];
  1816. Matrix<Real_t> M[4];
  1817. };
  1818. struct ptSetupData{
  1819. int level;
  1820. const Kernel<Real_t>* kernel;
  1821. PackedData src_coord; // Src coord
  1822. PackedData src_value; // Src density
  1823. PackedData srf_coord; // Srf coord
  1824. PackedData srf_value; // Srf density
  1825. PackedData trg_coord; // Trg coord
  1826. PackedData trg_value; // Trg potential
  1827. InteracData interac_data;
  1828. };
  1829. ptSetupData data;
  1830. data. level=setup_data. level;
  1831. data.kernel=setup_data.kernel;
  1832. std::vector<void*>& nodes_in =setup_data.nodes_in ;
  1833. std::vector<void*>& nodes_out=setup_data.nodes_out;
  1834. { // Set src data
  1835. std::vector<void*>& nodes=nodes_in;
  1836. PackedData& coord=data.src_coord;
  1837. PackedData& value=data.src_value;
  1838. coord.ptr=setup_data. coord_data;
  1839. value.ptr=setup_data. input_data;
  1840. coord.len=coord.ptr->Dim(0)*coord.ptr->Dim(1);
  1841. value.len=value.ptr->Dim(0)*value.ptr->Dim(1);
  1842. coord.cnt.ReInit(nodes.size());
  1843. coord.dsp.ReInit(nodes.size());
  1844. value.cnt.ReInit(nodes.size());
  1845. value.dsp.ReInit(nodes.size());
  1846. #pragma omp parallel for
  1847. for(size_t i=0;i<nodes.size();i++){
  1848. ((FMMNode_t*)nodes[i])->node_id=i;
  1849. Vector<Real_t>& coord_vec=((FMMNode*)nodes[i])->src_coord;
  1850. Vector<Real_t>& value_vec=((FMMNode*)nodes[i])->src_value;
  1851. if(coord_vec.Dim()){
  1852. coord.dsp[i]=&coord_vec[0]-coord.ptr[0][0];
  1853. assert(coord.dsp[i]<coord.len);
  1854. coord.cnt[i]=coord_vec.Dim();
  1855. }else{
  1856. coord.dsp[i]=0;
  1857. coord.cnt[i]=0;
  1858. }
  1859. if(value_vec.Dim()){
  1860. value.dsp[i]=&value_vec[0]-value.ptr[0][0];
  1861. assert(value.dsp[i]<value.len);
  1862. value.cnt[i]=value_vec.Dim();
  1863. }else{
  1864. value.dsp[i]=0;
  1865. value.cnt[i]=0;
  1866. }
  1867. }
  1868. }
  1869. { // Set srf data
  1870. std::vector<void*>& nodes=nodes_in;
  1871. PackedData& coord=data.srf_coord;
  1872. PackedData& value=data.srf_value;
  1873. coord.ptr=setup_data. coord_data;
  1874. value.ptr=setup_data. input_data;
  1875. coord.len=coord.ptr->Dim(0)*coord.ptr->Dim(1);
  1876. value.len=value.ptr->Dim(0)*value.ptr->Dim(1);
  1877. coord.cnt.ReInit(nodes.size());
  1878. coord.dsp.ReInit(nodes.size());
  1879. value.cnt.ReInit(nodes.size());
  1880. value.dsp.ReInit(nodes.size());
  1881. #pragma omp parallel for
  1882. for(size_t i=0;i<nodes.size();i++){
  1883. Vector<Real_t>& coord_vec=((FMMNode*)nodes[i])->surf_coord;
  1884. Vector<Real_t>& value_vec=((FMMNode*)nodes[i])->surf_value;
  1885. if(coord_vec.Dim()){
  1886. coord.dsp[i]=&coord_vec[0]-coord.ptr[0][0];
  1887. assert(coord.dsp[i]<coord.len);
  1888. coord.cnt[i]=coord_vec.Dim();
  1889. }else{
  1890. coord.dsp[i]=0;
  1891. coord.cnt[i]=0;
  1892. }
  1893. if(value_vec.Dim()){
  1894. value.dsp[i]=&value_vec[0]-value.ptr[0][0];
  1895. assert(value.dsp[i]<value.len);
  1896. value.cnt[i]=value_vec.Dim();
  1897. }else{
  1898. value.dsp[i]=0;
  1899. value.cnt[i]=0;
  1900. }
  1901. }
  1902. }
  1903. { // Set trg data
  1904. std::vector<void*>& nodes=nodes_out;
  1905. PackedData& coord=data.trg_coord;
  1906. PackedData& value=data.trg_value;
  1907. coord.ptr=setup_data. coord_data;
  1908. value.ptr=setup_data.output_data;
  1909. coord.len=coord.ptr->Dim(0)*coord.ptr->Dim(1);
  1910. value.len=value.ptr->Dim(0)*value.ptr->Dim(1);
  1911. coord.cnt.ReInit(nodes.size());
  1912. coord.dsp.ReInit(nodes.size());
  1913. value.cnt.ReInit(nodes.size());
  1914. value.dsp.ReInit(nodes.size());
  1915. #pragma omp parallel for
  1916. for(size_t i=0;i<nodes.size();i++){
  1917. Vector<Real_t>& coord_vec=tree->upwd_check_surf[((FMMNode*)nodes[i])->Depth()];
  1918. Vector<Real_t>& value_vec=((FMMData*)((FMMNode*)nodes[i])->FMMData())->upward_equiv;
  1919. if(coord_vec.Dim()){
  1920. coord.dsp[i]=&coord_vec[0]-coord.ptr[0][0];
  1921. assert(coord.dsp[i]<coord.len);
  1922. coord.cnt[i]=coord_vec.Dim();
  1923. }else{
  1924. coord.dsp[i]=0;
  1925. coord.cnt[i]=0;
  1926. }
  1927. if(value_vec.Dim()){
  1928. value.dsp[i]=&value_vec[0]-value.ptr[0][0];
  1929. assert(value.dsp[i]<value.len);
  1930. value.cnt[i]=value_vec.Dim();
  1931. }else{
  1932. value.dsp[i]=0;
  1933. value.cnt[i]=0;
  1934. }
  1935. }
  1936. }
  1937. { // Set interac_data
  1938. int omp_p=omp_get_max_threads();
  1939. std::vector<std::vector<size_t> > in_node_(omp_p);
  1940. std::vector<std::vector<size_t> > scal_idx_(omp_p);
  1941. std::vector<std::vector<Real_t> > coord_shift_(omp_p);
  1942. std::vector<std::vector<size_t> > interac_cnt_(omp_p);
  1943. if(this->ScaleInvar()){ // Set scal
  1944. const Kernel<Real_t>* ker=kernel->k_m2m;
  1945. for(size_t l=0;l<MAX_DEPTH;l++){ // scal[l*4+2]
  1946. Vector<Real_t>& scal=data.interac_data.scal[l*4+2];
  1947. Vector<Real_t>& scal_exp=ker->trg_scal;
  1948. scal.ReInit(scal_exp.Dim());
  1949. for(size_t i=0;i<scal.Dim();i++){
  1950. scal[i]=std::pow(2.0,-scal_exp[i]*l);
  1951. }
  1952. }
  1953. for(size_t l=0;l<MAX_DEPTH;l++){ // scal[l*4+3]
  1954. Vector<Real_t>& scal=data.interac_data.scal[l*4+3];
  1955. Vector<Real_t>& scal_exp=ker->src_scal;
  1956. scal.ReInit(scal_exp.Dim());
  1957. for(size_t i=0;i<scal.Dim();i++){
  1958. scal[i]=std::pow(2.0,-scal_exp[i]*l);
  1959. }
  1960. }
  1961. }
  1962. #pragma omp parallel for
  1963. for(size_t tid=0;tid<omp_p;tid++){
  1964. std::vector<size_t>& in_node =in_node_[tid] ;
  1965. std::vector<size_t>& scal_idx =scal_idx_[tid] ;
  1966. std::vector<Real_t>& coord_shift=coord_shift_[tid];
  1967. std::vector<size_t>& interac_cnt=interac_cnt_[tid];
  1968. size_t a=(nodes_out.size()*(tid+0))/omp_p;
  1969. size_t b=(nodes_out.size()*(tid+1))/omp_p;
  1970. for(size_t i=a;i<b;i++){
  1971. FMMNode_t* tnode=(FMMNode_t*)nodes_out[i];
  1972. Real_t s=std::pow(0.5,tnode->Depth());
  1973. size_t interac_cnt_=0;
  1974. { // S2U_Type
  1975. Mat_Type type=S2U_Type;
  1976. Vector<FMMNode_t*>& intlst=tnode->interac_list[type];
  1977. for(size_t j=0;j<intlst.Dim();j++) if(intlst[j]){
  1978. FMMNode_t* snode=intlst[j];
  1979. size_t snode_id=snode->node_id;
  1980. if(snode_id>=nodes_in.size() || nodes_in[snode_id]!=snode) continue;
  1981. in_node.push_back(snode_id);
  1982. scal_idx.push_back(snode->Depth());
  1983. { // set coord_shift
  1984. const int* rel_coord=interac_list.RelativeCoord(type,j);
  1985. const Real_t* scoord=snode->Coord();
  1986. const Real_t* tcoord=tnode->Coord();
  1987. Real_t shift[COORD_DIM];
  1988. shift[0]=rel_coord[0]*0.5*s-(scoord[0]+0.5*s)+(0+0.5*s);
  1989. shift[1]=rel_coord[1]*0.5*s-(scoord[1]+0.5*s)+(0+0.5*s);
  1990. shift[2]=rel_coord[2]*0.5*s-(scoord[2]+0.5*s)+(0+0.5*s);
  1991. coord_shift.push_back(shift[0]);
  1992. coord_shift.push_back(shift[1]);
  1993. coord_shift.push_back(shift[2]);
  1994. }
  1995. interac_cnt_++;
  1996. }
  1997. }
  1998. interac_cnt.push_back(interac_cnt_);
  1999. }
  2000. }
  2001. { // Combine interac data
  2002. InteracData& interac_data=data.interac_data;
  2003. { // in_node
  2004. typedef size_t ElemType;
  2005. std::vector<std::vector<ElemType> >& vec_=in_node_;
  2006. pvfmm::Vector<ElemType>& vec=interac_data.in_node;
  2007. std::vector<size_t> vec_dsp(omp_p+1,0);
  2008. for(size_t tid=0;tid<omp_p;tid++){
  2009. vec_dsp[tid+1]=vec_dsp[tid]+vec_[tid].size();
  2010. }
  2011. vec.ReInit(vec_dsp[omp_p]);
  2012. #pragma omp parallel for
  2013. for(size_t tid=0;tid<omp_p;tid++){
  2014. memcpy(&vec[0]+vec_dsp[tid],&vec_[tid][0],vec_[tid].size()*sizeof(ElemType));
  2015. }
  2016. }
  2017. { // scal_idx
  2018. typedef size_t ElemType;
  2019. std::vector<std::vector<ElemType> >& vec_=scal_idx_;
  2020. pvfmm::Vector<ElemType>& vec=interac_data.scal_idx;
  2021. std::vector<size_t> vec_dsp(omp_p+1,0);
  2022. for(size_t tid=0;tid<omp_p;tid++){
  2023. vec_dsp[tid+1]=vec_dsp[tid]+vec_[tid].size();
  2024. }
  2025. vec.ReInit(vec_dsp[omp_p]);
  2026. #pragma omp parallel for
  2027. for(size_t tid=0;tid<omp_p;tid++){
  2028. memcpy(&vec[0]+vec_dsp[tid],&vec_[tid][0],vec_[tid].size()*sizeof(ElemType));
  2029. }
  2030. }
  2031. { // coord_shift
  2032. typedef Real_t ElemType;
  2033. std::vector<std::vector<ElemType> >& vec_=coord_shift_;
  2034. pvfmm::Vector<ElemType>& vec=interac_data.coord_shift;
  2035. std::vector<size_t> vec_dsp(omp_p+1,0);
  2036. for(size_t tid=0;tid<omp_p;tid++){
  2037. vec_dsp[tid+1]=vec_dsp[tid]+vec_[tid].size();
  2038. }
  2039. vec.ReInit(vec_dsp[omp_p]);
  2040. #pragma omp parallel for
  2041. for(size_t tid=0;tid<omp_p;tid++){
  2042. memcpy(&vec[0]+vec_dsp[tid],&vec_[tid][0],vec_[tid].size()*sizeof(ElemType));
  2043. }
  2044. }
  2045. { // interac_cnt
  2046. typedef size_t ElemType;
  2047. std::vector<std::vector<ElemType> >& vec_=interac_cnt_;
  2048. pvfmm::Vector<ElemType>& vec=interac_data.interac_cnt;
  2049. std::vector<size_t> vec_dsp(omp_p+1,0);
  2050. for(size_t tid=0;tid<omp_p;tid++){
  2051. vec_dsp[tid+1]=vec_dsp[tid]+vec_[tid].size();
  2052. }
  2053. vec.ReInit(vec_dsp[omp_p]);
  2054. #pragma omp parallel for
  2055. for(size_t tid=0;tid<omp_p;tid++){
  2056. memcpy(&vec[0]+vec_dsp[tid],&vec_[tid][0],vec_[tid].size()*sizeof(ElemType));
  2057. }
  2058. }
  2059. { // interac_dsp
  2060. pvfmm::Vector<size_t>& cnt=interac_data.interac_cnt;
  2061. pvfmm::Vector<size_t>& dsp=interac_data.interac_dsp;
  2062. dsp.ReInit(cnt.Dim()); if(dsp.Dim()) dsp[0]=0;
  2063. omp_par::scan(&cnt[0],&dsp[0],dsp.Dim());
  2064. }
  2065. }
  2066. { // Set M[2], M[3]
  2067. InteracData& interac_data=data.interac_data;
  2068. pvfmm::Vector<size_t>& cnt=interac_data.interac_cnt;
  2069. pvfmm::Vector<size_t>& dsp=interac_data.interac_dsp;
  2070. if(cnt.Dim() && cnt[cnt.Dim()-1]+dsp[dsp.Dim()-1]){
  2071. data.interac_data.M[2]=this->mat->Mat(level, UC2UE0_Type, 0);
  2072. data.interac_data.M[3]=this->mat->Mat(level, UC2UE1_Type, 0);
  2073. }else{
  2074. data.interac_data.M[2].ReInit(0,0);
  2075. data.interac_data.M[3].ReInit(0,0);
  2076. }
  2077. }
  2078. }
  2079. PtSetup(setup_data, &data);
  2080. }
  2081. template <class FMMNode>
  2082. void FMM_Pts<FMMNode>::Source2Up(SetupData<Real_t>& setup_data, bool device){
  2083. if(!this->MultipoleOrder()) return;
  2084. //Add Source2Up contribution.
  2085. this->EvalListPts(setup_data, device);
  2086. }
  2087. template <class FMMNode>
  2088. void FMM_Pts<FMMNode>::Up2UpSetup(SetupData<Real_t>& setup_data, FMMTree_t* tree, std::vector<Matrix<Real_t> >& buff, std::vector<Vector<FMMNode_t*> >& n_list, int level, bool device){
  2089. if(!this->MultipoleOrder()) return;
  2090. { // Set setup_data
  2091. setup_data.level=level;
  2092. setup_data.kernel=kernel->k_m2m;
  2093. setup_data.interac_type.resize(1);
  2094. setup_data.interac_type[0]=U2U_Type;
  2095. setup_data. input_data=&buff[0];
  2096. setup_data.output_data=&buff[0];
  2097. Vector<FMMNode_t*>& nodes_in =n_list[0];
  2098. Vector<FMMNode_t*>& nodes_out=n_list[0];
  2099. setup_data.nodes_in .clear();
  2100. setup_data.nodes_out.clear();
  2101. for(size_t i=0;i<nodes_in .Dim();i++) if((nodes_in [i]->Depth()==level+1) && nodes_in [i]->pt_cnt[0]) setup_data.nodes_in .push_back(nodes_in [i]);
  2102. for(size_t i=0;i<nodes_out.Dim();i++) if((nodes_out[i]->Depth()==level ) && nodes_out[i]->pt_cnt[0]) setup_data.nodes_out.push_back(nodes_out[i]);
  2103. }
  2104. std::vector<void*>& nodes_in =setup_data.nodes_in ;
  2105. std::vector<void*>& nodes_out=setup_data.nodes_out;
  2106. std::vector<Vector<Real_t>*>& input_vector=setup_data. input_vector; input_vector.clear();
  2107. std::vector<Vector<Real_t>*>& output_vector=setup_data.output_vector; output_vector.clear();
  2108. for(size_t i=0;i<nodes_in .size();i++) input_vector.push_back(&((FMMData*)((FMMNode*)nodes_in [i])->FMMData())->upward_equiv);
  2109. for(size_t i=0;i<nodes_out.size();i++) output_vector.push_back(&((FMMData*)((FMMNode*)nodes_out[i])->FMMData())->upward_equiv);
  2110. SetupInterac(setup_data,device);
  2111. }
  2112. template <class FMMNode>
  2113. void FMM_Pts<FMMNode>::Up2Up (SetupData<Real_t>& setup_data, bool device){
  2114. if(!this->MultipoleOrder()) return;
  2115. //Add Up2Up contribution.
  2116. EvalList(setup_data, device);
  2117. }
  2118. template <class FMMNode>
  2119. void FMM_Pts<FMMNode>::PeriodicBC(FMMNode* node){
  2120. if(!this->ScaleInvar() || this->MultipoleOrder()==0) return;
  2121. Matrix<Real_t>& M = Precomp(0, BC_Type, 0);
  2122. assert(node->FMMData()->upward_equiv.Dim()>0);
  2123. int dof=1;
  2124. Vector<Real_t>& upward_equiv=node->FMMData()->upward_equiv;
  2125. Vector<Real_t>& dnward_equiv=node->FMMData()->dnward_equiv;
  2126. assert(upward_equiv.Dim()==M.Dim(0)*dof);
  2127. assert(dnward_equiv.Dim()==M.Dim(1)*dof);
  2128. Matrix<Real_t> d_equiv(dof,M.Dim(0),&dnward_equiv[0],false);
  2129. Matrix<Real_t> u_equiv(dof,M.Dim(1),&upward_equiv[0],false);
  2130. Matrix<Real_t>::GEMM(d_equiv,u_equiv,M);
  2131. }
template <class FMMNode>
// Forward FFT of the upward-equivalent densities for the V-list (M2L) stage.
// For each node listed in fft_vec: the 8 children's equivalent-surface values
// (n points each) are scattered onto a regular n1^3 grid via the cached `map`,
// scaled by fft_scal, transformed with a real-to-complex 3-D FFT (all children
// in one batched plan), and finally transposed in output_data so that the
// child index is innermost (stride-1) for the Hadamard-product kernels.
void FMM_Pts<FMMNode>::FFT_UpEquiv(size_t dof, size_t m, size_t ker_dim0, Vector<size_t>& fft_vec, Vector<Real_t>& fft_scal,
Vector<Real_t>& input_data, Vector<Real_t>& output_data, Vector<Real_t>& buffer_){
// Grid dimensions: n1^3 real grid; n3_ complex values per r2c transform.
size_t n1=m*2;
size_t n2=n1*n1;
size_t n3=n1*n2;
size_t n3_=n2*(n1/2+1);
size_t chld_cnt=1UL<<COORD_DIM; // 8 children per node
size_t fftsize_in =2*n3_*chld_cnt*ker_dim0*dof; // complex output size (in reals) per node
int omp_p=omp_get_max_threads();
//Load permutation map.
size_t n=6*(m-1)*(m-1)+2; // number of points on the equivalent surface
static Vector<size_t> map;
{ // Build map to reorder upward_equiv
// map[i] is the linear index on the n1^3 grid of surface point i.
// NOTE(review): lazy init of this static map (and of vlist_fftplan below) is
// not guarded by a lock — assumes the first call happens single-threaded.
size_t n_old=map.Dim();
if(n_old!=n){
Real_t c[3]={0,0,0};
Vector<Real_t> surf=surface(m, c, (Real_t)(m-1), 0);
map.Resize(surf.Dim()/COORD_DIM);
for(size_t i=0;i<map.Dim();i++)
map[i]=((size_t)(m-1-surf[i*3]+0.5))+((size_t)(m-1-surf[i*3+1]+0.5))*n1+((size_t)(m-1-surf[i*3+2]+0.5))*n2;
}
}
{ // Build FFTW plan.
// One batched r2c plan for ker_dim0*chld_cnt transforms; scratch buffers are
// only needed during planning and are freed immediately.
if(!vlist_fft_flag){
int nnn[3]={(int)n1,(int)n1,(int)n1};
void *fftw_in, *fftw_out;
fftw_in  = mem::aligned_new<Real_t>(  n3 *ker_dim0*chld_cnt);
fftw_out = mem::aligned_new<Real_t>(2*n3_*ker_dim0*chld_cnt);
vlist_fftplan = FFTW_t<Real_t>::fft_plan_many_dft_r2c(COORD_DIM,nnn,ker_dim0*chld_cnt,
(Real_t*)fftw_in, NULL, 1, n3, (typename FFTW_t<Real_t>::cplx*)(fftw_out),NULL, 1, n3_);
mem::aligned_delete<Real_t>((Real_t*)fftw_in );
mem::aligned_delete<Real_t>((Real_t*)fftw_out);
vlist_fft_flag=true;
}
}
{ // Offload section
size_t n_in = fft_vec.Dim();
// Static partition of the nodes across threads; each thread uses its own
// fftsize_in-sized slice of buffer_ as FFT scratch.
#pragma omp parallel for
for(int pid=0; pid<omp_p; pid++){
size_t node_start=(n_in*(pid  ))/omp_p;
size_t node_end  =(n_in*(pid+1))/omp_p;
Vector<Real_t> buffer(fftsize_in, &buffer_[fftsize_in*pid], false);
for(size_t node_idx=node_start; node_idx<node_end; node_idx++){
// fft_vec[node_idx] is the offset of this node's child data in input_data.
Matrix<Real_t>  upward_equiv(chld_cnt,n*ker_dim0*dof,&input_data[0] + fft_vec[node_idx],false);
Vector<Real_t> upward_equiv_fft(fftsize_in, &output_data[fftsize_in *node_idx], false);
upward_equiv_fft.SetZero();
// Rearrange upward equivalent data.
// Scatter surface values onto the grid (zero elsewhere), applying fft_scal.
for(size_t k=0;k<n;k++){
size_t idx=map[k];
for(int j1=0;j1<dof;j1++)
for(int j0=0;j0<(int)chld_cnt;j0++)
for(int i=0;i<ker_dim0;i++)
upward_equiv_fft[idx+(j0+(i+j1*ker_dim0)*chld_cnt)*n3]=upward_equiv[j0][ker_dim0*(n*j1+k)+i]*fft_scal[ker_dim0*node_idx+i];
}
// Compute FFT.
for(int i=0;i<dof;i++)
FFTW_t<Real_t>::fft_execute_dft_r2c(vlist_fftplan, (Real_t*)&upward_equiv_fft[i*  n3 *ker_dim0*chld_cnt],
(typename FFTW_t<Real_t>::cplx*)&buffer          [i*2*n3_*ker_dim0*chld_cnt]);
//Compute flops.
#ifndef FFTW3_MKL
double add, mul, fma;
FFTW_t<Real_t>::fftw_flops(vlist_fftplan, &add, &mul, &fma);
#ifndef __INTEL_OFFLOAD0
Profile::Add_FLOP((long long)(add+mul+2*fma));
#endif
#endif
// Transpose: (i,k,j) -> (i,j,k) so the child index k becomes innermost.
for(int i=0;i<ker_dim0*dof;i++)
for(size_t j=0;j<n3_;j++)
for(size_t k=0;k<chld_cnt;k++){
upward_equiv_fft[2*(chld_cnt*(n3_*i+j)+k)+0]=buffer[2*(n3_*(chld_cnt*i+k)+j)+0];
upward_equiv_fft[2*(chld_cnt*(n3_*i+j)+k)+1]=buffer[2*(n3_*(chld_cnt*i+k)+j)+1];
}
}
}
}
}
template <class FMMNode>
// Inverse FFT of the downward-check potentials for the V-list (M2L) stage;
// the mirror image of FFT_UpEquiv. For each node in ifft_vec: de-interleave
// the child-innermost frequency data, run a batched complex-to-real inverse
// 3-D FFT, then gather the grid values back onto the check-surface points
// (via the cached `map`), scale by ifft_scal, and accumulate into the nodes'
// downward-equivalent data in output_data.
void FMM_Pts<FMMNode>::FFT_Check2Equiv(size_t dof, size_t m, size_t ker_dim1, Vector<size_t>& ifft_vec, Vector<Real_t>& ifft_scal,
Vector<Real_t>& input_data, Vector<Real_t>& output_data, Vector<Real_t>& buffer_){
// Grid dimensions: n1^3 real grid; n3_ complex values per c2r transform.
size_t n1=m*2;
size_t n2=n1*n1;
size_t n3=n1*n2;
size_t n3_=n2*(n1/2+1);
size_t chld_cnt=1UL<<COORD_DIM; // 8 children per node
size_t fftsize_out=2*n3_*dof*ker_dim1*chld_cnt; // complex input size (in reals) per node
int omp_p=omp_get_max_threads();
//Load permutation map.
size_t n=6*(m-1)*(m-1)+2; // number of points on the check surface
static Vector<size_t> map;
{ // Build map to reorder dnward_check
// map[i] is the linear grid index of check-surface point i (mirrored
// relative to the map in FFT_UpEquiv).
// NOTE(review): lazy init of this static map (and of vlist_ifftplan below)
// is unguarded — assumes the first call happens single-threaded.
size_t n_old=map.Dim();
if(n_old!=n){
Real_t c[3]={0,0,0};
Vector<Real_t> surf=surface(m, c, (Real_t)(m-1), 0);
map.Resize(surf.Dim()/COORD_DIM);
for(size_t i=0;i<map.Dim();i++)
map[i]=((size_t)(m*2-0.5-surf[i*3]))+((size_t)(m*2-0.5-surf[i*3+1]))*n1+((size_t)(m*2-0.5-surf[i*3+2]))*n2;
//map;//.AllocDevice(true);
}
}
{ // Build FFTW plan.
//Build FFTW plan.
// One batched c2r plan for ker_dim1*chld_cnt transforms; scratch buffers are
// only needed during planning and are freed immediately.
int nnn[3]={(int)n1,(int)n1,(int)n1};
Real_t *fftw_in, *fftw_out;
fftw_in  = mem::aligned_new<Real_t>(2*n3_*ker_dim1*chld_cnt);
fftw_out = mem::aligned_new<Real_t>(  n3 *ker_dim1*chld_cnt);
vlist_ifftplan = FFTW_t<Real_t>::fft_plan_many_dft_c2r(COORD_DIM,nnn,ker_dim1*chld_cnt,
(typename FFTW_t<Real_t>::cplx*)fftw_in, NULL, 1, n3_, (Real_t*)(fftw_out),NULL, 1, n3);
mem::aligned_delete<Real_t>(fftw_in);
mem::aligned_delete<Real_t>(fftw_out);
vlist_ifft_flag=true;
}
}
{ // Offload section
// Each thread needs two fftsize_out-sized scratch slices (de-interleave + IFFT output).
assert(buffer_.Dim()>=2*fftsize_out*omp_p);
size_t n_out=ifft_vec.Dim();
#pragma omp parallel for
for(int pid=0; pid<omp_p; pid++){
size_t node_start=(n_out*(pid  ))/omp_p;
size_t node_end  =(n_out*(pid+1))/omp_p;
Vector<Real_t> buffer0(fftsize_out, &buffer_[fftsize_out*(2*pid+0)], false);
Vector<Real_t> buffer1(fftsize_out, &buffer_[fftsize_out*(2*pid+1)], false);
for(size_t node_idx=node_start; node_idx<node_end; node_idx++){
// ifft_vec[node_idx] is the offset of this node's child data in output_data.
Vector<Real_t> dnward_check_fft(fftsize_out, &input_data[fftsize_out*node_idx], false);
Vector<Real_t> dnward_equiv(ker_dim1*n*dof*chld_cnt,&output_data[0] + ifft_vec[node_idx],false);
//De-interleave data.
// Transpose (i,j,k) -> (k,i,j): undo the child-innermost layout.
for(int i=0;i<ker_dim1*dof;i++)
for(size_t j=0;j<n3_;j++)
for(size_t k=0;k<chld_cnt;k++){
buffer0[2*(n3_*(ker_dim1*dof*k+i)+j)+0]=dnward_check_fft[2*(chld_cnt*(n3_*i+j)+k)+0];
buffer0[2*(n3_*(ker_dim1*dof*k+i)+j)+1]=dnward_check_fft[2*(chld_cnt*(n3_*i+j)+k)+1];
}
// Compute FFT.
for(int i=0;i<dof;i++)
FFTW_t<Real_t>::fft_execute_dft_c2r(vlist_ifftplan, (typename FFTW_t<Real_t>::cplx*)&buffer0[i*2*n3_*ker_dim1*chld_cnt],
(Real_t*)&buffer1[i*  n3 *ker_dim1*chld_cnt]);
//Compute flops.
#ifndef FFTW3_MKL
double add, mul, fma;
FFTW_t<Real_t>::fftw_flops(vlist_ifftplan, &add, &mul, &fma);
#ifndef __INTEL_OFFLOAD0
Profile::Add_FLOP((long long)(add+mul+2*fma)*dof);
#endif
#endif
// Rearrange downward check data.
// Gather grid values at the check-surface points, scale, and accumulate.
for(size_t k=0;k<n;k++){
size_t idx=map[k];
for(int j1=0;j1<dof;j1++)
for(int j0=0;j0<(int)chld_cnt;j0++)
for(int i=0;i<ker_dim1;i++)
dnward_equiv[ker_dim1*(n*(dof*j0+j1)+k)+i]+=buffer1[idx+(i+(j1+j0*dof)*ker_dim1)*n3]*ifft_scal[ker_dim1*node_idx+i];
}
}
}
}
}
  2290. template<class Real_t>
  2291. inline void matmult_8x8x2(Real_t*& M_, Real_t*& IN0, Real_t*& IN1, Real_t*& OUT0, Real_t*& OUT1){
  2292. // Generic code.
  2293. Real_t out_reg000, out_reg001, out_reg010, out_reg011;
  2294. Real_t out_reg100, out_reg101, out_reg110, out_reg111;
  2295. Real_t in_reg000, in_reg001, in_reg010, in_reg011;
  2296. Real_t in_reg100, in_reg101, in_reg110, in_reg111;
  2297. Real_t m_reg000, m_reg001, m_reg010, m_reg011;
  2298. Real_t m_reg100, m_reg101, m_reg110, m_reg111;
  2299. //#pragma unroll
  2300. for(int i1=0;i1<8;i1+=2){
  2301. Real_t* IN0_=IN0;
  2302. Real_t* IN1_=IN1;
  2303. out_reg000=OUT0[ 0]; out_reg001=OUT0[ 1];
  2304. out_reg010=OUT0[ 2]; out_reg011=OUT0[ 3];
  2305. out_reg100=OUT1[ 0]; out_reg101=OUT1[ 1];
  2306. out_reg110=OUT1[ 2]; out_reg111=OUT1[ 3];
  2307. //#pragma unroll
  2308. for(int i2=0;i2<8;i2+=2){
  2309. m_reg000=M_[ 0]; m_reg001=M_[ 1];
  2310. m_reg010=M_[ 2]; m_reg011=M_[ 3];
  2311. m_reg100=M_[16]; m_reg101=M_[17];
  2312. m_reg110=M_[18]; m_reg111=M_[19];
  2313. in_reg000=IN0_[0]; in_reg001=IN0_[1];
  2314. in_reg010=IN0_[2]; in_reg011=IN0_[3];
  2315. in_reg100=IN1_[0]; in_reg101=IN1_[1];
  2316. in_reg110=IN1_[2]; in_reg111=IN1_[3];
  2317. out_reg000 += m_reg000*in_reg000 - m_reg001*in_reg001;
  2318. out_reg001 += m_reg000*in_reg001 + m_reg001*in_reg000;
  2319. out_reg010 += m_reg010*in_reg000 - m_reg011*in_reg001;
  2320. out_reg011 += m_reg010*in_reg001 + m_reg011*in_reg000;
  2321. out_reg000 += m_reg100*in_reg010 - m_reg101*in_reg011;
  2322. out_reg001 += m_reg100*in_reg011 + m_reg101*in_reg010;
  2323. out_reg010 += m_reg110*in_reg010 - m_reg111*in_reg011;
  2324. out_reg011 += m_reg110*in_reg011 + m_reg111*in_reg010;
  2325. out_reg100 += m_reg000*in_reg100 - m_reg001*in_reg101;
  2326. out_reg101 += m_reg000*in_reg101 + m_reg001*in_reg100;
  2327. out_reg110 += m_reg010*in_reg100 - m_reg011*in_reg101;
  2328. out_reg111 += m_reg010*in_reg101 + m_reg011*in_reg100;
  2329. out_reg100 += m_reg100*in_reg110 - m_reg101*in_reg111;
  2330. out_reg101 += m_reg100*in_reg111 + m_reg101*in_reg110;
  2331. out_reg110 += m_reg110*in_reg110 - m_reg111*in_reg111;
  2332. out_reg111 += m_reg110*in_reg111 + m_reg111*in_reg110;
  2333. M_+=32; // Jump to (column+2).
  2334. IN0_+=4;
  2335. IN1_+=4;
  2336. }
  2337. OUT0[ 0]=out_reg000; OUT0[ 1]=out_reg001;
  2338. OUT0[ 2]=out_reg010; OUT0[ 3]=out_reg011;
  2339. OUT1[ 0]=out_reg100; OUT1[ 1]=out_reg101;
  2340. OUT1[ 2]=out_reg110; OUT1[ 3]=out_reg111;
  2341. M_+=4-64*2; // Jump back to first column (row+2).
  2342. OUT0+=4;
  2343. OUT1+=4;
  2344. }
  2345. }
#if defined(__AVX__) || defined(__SSE3__)
// Double-precision specialization of matmult_8x8x2: same contract as the
// generic template (OUT0/OUT1 += M^T * IN0/IN1 for an 8x8 complex matrix and
// two 8-element complex vectors; M_ advances by 16 reals, OUT0/OUT1 by 16,
// IN0/IN1 unchanged), implemented with AVX or SSE3 intrinsics.
// All loads/stores on M_/OUT0/OUT1 use aligned intrinsics — callers must pass
// suitably aligned pointers.
template<>
inline void matmult_8x8x2<double>(double*& M_, double*& IN0, double*& IN1, double*& OUT0, double*& OUT1){
#ifdef __AVX__ //AVX code.
// Keep all 16 output complex values (8 per OUT pointer) in eight ymm
// accumulators for the whole kernel; loop only over input column pairs.
__m256d out00,out01,out10,out11;
__m256d out20,out21,out30,out31;
double* in0__ = IN0;
double* in1__ = IN1;
out00 = _mm256_load_pd(OUT0);
out01 = _mm256_load_pd(OUT1);
out10 = _mm256_load_pd(OUT0+4);
out11 = _mm256_load_pd(OUT1+4);
out20 = _mm256_load_pd(OUT0+8);
out21 = _mm256_load_pd(OUT1+8);
out30 = _mm256_load_pd(OUT0+12);
out31 = _mm256_load_pd(OUT1+12);
for(int i2=0;i2<8;i2+=2){
__m256d m00;
__m256d ot00;
__m256d mt0,mtt0;
__m256d in00,in00_r,in01,in01_r;
// Broadcast one complex input value to both lanes; *_r holds the swapped
// (imag,real) pair used for the cross terms of the complex product.
in00 = _mm256_broadcast_pd((const __m128d*)in0__);
in00_r = _mm256_permute_pd(in00,5);
in01 = _mm256_broadcast_pd((const __m128d*)in1__);
in01_r = _mm256_permute_pd(in01,5);
// Each block below: complex multiply-accumulate of one M row-chunk into one
// accumulator, via unpacklo/unpackhi (duplicate real/imag parts) + addsub.
m00 = _mm256_load_pd(M_);
mt0 = _mm256_unpacklo_pd(m00,m00);
ot00 = _mm256_mul_pd(mt0,in00);
mtt0 = _mm256_unpackhi_pd(m00,m00);
out00 = _mm256_add_pd(out00,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in00_r)));
ot00 = _mm256_mul_pd(mt0,in01);
out01 = _mm256_add_pd(out01,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in01_r)));
m00 = _mm256_load_pd(M_+4);
mt0 = _mm256_unpacklo_pd(m00,m00);
ot00 = _mm256_mul_pd(mt0,in00);
mtt0 = _mm256_unpackhi_pd(m00,m00);
out10 = _mm256_add_pd(out10,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in00_r)));
ot00 = _mm256_mul_pd(mt0,in01);
out11 = _mm256_add_pd(out11,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in01_r)));
m00 = _mm256_load_pd(M_+8);
mt0 = _mm256_unpacklo_pd(m00,m00);
ot00 = _mm256_mul_pd(mt0,in00);
mtt0 = _mm256_unpackhi_pd(m00,m00);
out20 = _mm256_add_pd(out20,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in00_r)));
ot00 = _mm256_mul_pd(mt0,in01);
out21 = _mm256_add_pd(out21,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in01_r)));
m00 = _mm256_load_pd(M_+12);
mt0 = _mm256_unpacklo_pd(m00,m00);
ot00 = _mm256_mul_pd(mt0,in00);
mtt0 = _mm256_unpackhi_pd(m00,m00);
out30 = _mm256_add_pd(out30,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in00_r)));
ot00 = _mm256_mul_pd(mt0,in01);
out31 = _mm256_add_pd(out31,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in01_r)));
// Second input complex value of this column pair (next matrix row, M_+16..31).
in00 = _mm256_broadcast_pd((const __m128d*) (in0__+2));
in00_r = _mm256_permute_pd(in00,5);
in01 = _mm256_broadcast_pd((const __m128d*) (in1__+2));
in01_r = _mm256_permute_pd(in01,5);
m00 = _mm256_load_pd(M_+16);
mt0 = _mm256_unpacklo_pd(m00,m00);
ot00 = _mm256_mul_pd(mt0,in00);
mtt0 = _mm256_unpackhi_pd(m00,m00);
out00 = _mm256_add_pd(out00,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in00_r)));
ot00 = _mm256_mul_pd(mt0,in01);
out01 = _mm256_add_pd(out01,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in01_r)));
m00 = _mm256_load_pd(M_+20);
mt0 = _mm256_unpacklo_pd(m00,m00);
ot00 = _mm256_mul_pd(mt0,in00);
mtt0 = _mm256_unpackhi_pd(m00,m00);
out10 = _mm256_add_pd(out10,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in00_r)));
ot00 = _mm256_mul_pd(mt0,in01);
out11 = _mm256_add_pd(out11,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in01_r)));
m00 = _mm256_load_pd(M_+24);
mt0 = _mm256_unpacklo_pd(m00,m00);
ot00 = _mm256_mul_pd(mt0,in00);
mtt0 = _mm256_unpackhi_pd(m00,m00);
out20 = _mm256_add_pd(out20,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in00_r)));
ot00 = _mm256_mul_pd(mt0,in01);
out21 = _mm256_add_pd(out21,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in01_r)));
m00 = _mm256_load_pd(M_+28);
mt0 = _mm256_unpacklo_pd(m00,m00);
ot00 = _mm256_mul_pd(mt0,in00);
mtt0 = _mm256_unpackhi_pd(m00,m00);
out30 = _mm256_add_pd(out30,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in00_r)));
ot00 = _mm256_mul_pd(mt0,in01);
out31 = _mm256_add_pd(out31,_mm256_addsub_pd(ot00,_mm256_mul_pd(mtt0,in01_r)));
M_ += 32;    // advance two matrix rows (2 x 16 reals)
in0__ += 4;  // advance two complex inputs
in1__ += 4;
}
// Write the accumulators back and leave OUT0/OUT1 un-advanced here; unlike
// the generic path the pointers are bumped by the final stores' offsets only.
_mm256_store_pd(OUT0,out00);
_mm256_store_pd(OUT1,out01);
_mm256_store_pd(OUT0+4,out10);
_mm256_store_pd(OUT1+4,out11);
_mm256_store_pd(OUT0+8,out20);
_mm256_store_pd(OUT1+8,out21);
_mm256_store_pd(OUT0+12,out30);
_mm256_store_pd(OUT1+12,out31);
#elif defined __SSE3__ // SSE code.
// SSE3 path: two complex outputs per OUT pointer per outer iteration, using
// mul/add for the real parts and shuffle+addsub for the imaginary cross terms.
__m128d out00, out01, out10, out11;
__m128d in00, in01, in10, in11;
__m128d m00, m01, m10, m11;
//#pragma unroll
for(int i1=0;i1<8;i1+=2){
double* IN0_=IN0;
double* IN1_=IN1;
out00 =_mm_load_pd (OUT0  );
out10 =_mm_load_pd (OUT0+2);
out01 =_mm_load_pd (OUT1  );
out11 =_mm_load_pd (OUT1+2);
//#pragma unroll
for(int i2=0;i2<8;i2+=2){
// Real parts of the four matrix entries (two rows x two cols).
m00 =_mm_load1_pd (M_   );
m10 =_mm_load1_pd (M_+ 2);
m01 =_mm_load1_pd (M_+16);
m11 =_mm_load1_pd (M_+18);
in00 =_mm_load_pd (IN0_  );
in10 =_mm_load_pd (IN0_+2);
in01 =_mm_load_pd (IN1_  );
in11 =_mm_load_pd (IN1_+2);
out00 = _mm_add_pd (out00, _mm_mul_pd(m00 , in00 ));
out00 = _mm_add_pd (out00, _mm_mul_pd(m01 , in10 ));
out01 = _mm_add_pd (out01, _mm_mul_pd(m00 , in01 ));
out01 = _mm_add_pd (out01, _mm_mul_pd(m01 , in11 ));
out10 = _mm_add_pd (out10, _mm_mul_pd(m10 , in00 ));
out10 = _mm_add_pd (out10, _mm_mul_pd(m11 , in10 ));
out11 = _mm_add_pd (out11, _mm_mul_pd(m10 , in01 ));
out11 = _mm_add_pd (out11, _mm_mul_pd(m11 , in11 ));
// Imaginary parts: swap (re,im) of the inputs and use addsub to apply the
// -/+ signs of the complex cross terms.
m00 =_mm_load1_pd (M_+   1);
m10 =_mm_load1_pd (M_+ 2+1);
m01 =_mm_load1_pd (M_+16+1);
m11 =_mm_load1_pd (M_+18+1);
in00 =_mm_shuffle_pd (in00,in00,_MM_SHUFFLE2(0,1));
in01 =_mm_shuffle_pd (in01,in01,_MM_SHUFFLE2(0,1));
in10 =_mm_shuffle_pd (in10,in10,_MM_SHUFFLE2(0,1));
in11 =_mm_shuffle_pd (in11,in11,_MM_SHUFFLE2(0,1));
out00 = _mm_addsub_pd(out00, _mm_mul_pd(m00, in00));
out00 = _mm_addsub_pd(out00, _mm_mul_pd(m01, in10));
out01 = _mm_addsub_pd(out01, _mm_mul_pd(m00, in01));
out01 = _mm_addsub_pd(out01, _mm_mul_pd(m01, in11));
out10 = _mm_addsub_pd(out10, _mm_mul_pd(m10, in00));
out10 = _mm_addsub_pd(out10, _mm_mul_pd(m11, in10));
out11 = _mm_addsub_pd(out11, _mm_mul_pd(m10, in01));
out11 = _mm_addsub_pd(out11, _mm_mul_pd(m11, in11));
M_+=32; // Jump to (column+2).
IN0_+=4;
IN1_+=4;
}
_mm_store_pd (OUT0  ,out00);
_mm_store_pd (OUT0+2,out10);
_mm_store_pd (OUT1  ,out01);
_mm_store_pd (OUT1+2,out11);
M_+=4-64*2; // Jump back to first column (row+2).
OUT0+=4;
OUT1+=4;
}
#endif
}
#endif
#if defined(__SSE3__)
// Single-precision specialization of matmult_8x8x2 (SSE3).
// Accumulates two 8x8 complex<float> matrix-vector products at once:
//   OUT0 += M * IN0   and   OUT1 += M * IN1
// where each of OUT0/OUT1 holds 8 interleaved complex values (16 floats) and
// M_ points at 128 floats (8x8 complex entries) that are fully consumed; on
// return M_ has been advanced past them, matching the pointer-advancing
// contract of the generic matmult_8x8x2.
template<>
inline void matmult_8x8x2<float>(float*& M_, float*& IN0, float*& IN1, float*& OUT0, float*& OUT1){
#if defined __SSE3__ // SSE code.
// out{r}{v}: accumulator for output rows 2r..2r+1 (2 complex = 4 floats) of
// output vector v (v==0 -> OUT0, v==1 -> OUT1).
__m128 out00,out01,out10,out11;
__m128 out20,out21,out30,out31;
float* in0__ = IN0;
float* in1__ = IN1;
// Load the current contents of both outputs; results are accumulated.
out00 = _mm_load_ps(OUT0);
out01 = _mm_load_ps(OUT1);
out10 = _mm_load_ps(OUT0+4);
out11 = _mm_load_ps(OUT1+4);
out20 = _mm_load_ps(OUT0+8);
out21 = _mm_load_ps(OUT1+8);
out30 = _mm_load_ps(OUT0+12);
out31 = _mm_load_ps(OUT1+12);
// Each iteration consumes two complex inputs from each of IN0/IN1 and two
// 16-float blocks of M_ (32 floats total); 4 iterations cover all 8 inputs.
for(int i2=0;i2<8;i2+=2){
__m128 m00;
__m128 mt0,mtt0;
__m128 in00,in00_r,in01,in01_r;
// _mm_load_pd1 broadcasts one 64-bit (re,im) float pair into both halves
// of the register; *_r is the (im,re)-swapped copy used by the addsub step.
in00 = _mm_castpd_ps(_mm_load_pd1((const double*)in0__));
in00_r = _mm_shuffle_ps(in00,in00,_MM_SHUFFLE(2,3,0,1));
in01 = _mm_castpd_ps(_mm_load_pd1((const double*)in1__));
in01_r = _mm_shuffle_ps(in01,in01,_MM_SHUFFLE(2,3,0,1));
// Complex multiply-accumulate: mt0 duplicates the real parts of two matrix
// entries, mtt0 their imaginary parts.  add_ps contributes (re*re, re*im);
// addsub_ps contributes (-im*im, +im*re), yielding full complex products.
m00 = _mm_load_ps(M_);
mt0 = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(2,2,0,0));
out00= _mm_add_ps (out00,_mm_mul_ps( mt0,in00 ));
mtt0 = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(3,3,1,1));
out00= _mm_addsub_ps(out00,_mm_mul_ps(mtt0,in00_r));
out01 = _mm_add_ps (out01,_mm_mul_ps( mt0,in01 ));
out01 = _mm_addsub_ps(out01,_mm_mul_ps(mtt0,in01_r));
// Same pattern for output rows 2..3, 4..5, 6..7 (M_+4, M_+8, M_+12).
m00 = _mm_load_ps(M_+4);
mt0 = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(2,2,0,0));
out10= _mm_add_ps (out10,_mm_mul_ps( mt0,in00 ));
mtt0 = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(3,3,1,1));
out10= _mm_addsub_ps(out10,_mm_mul_ps(mtt0,in00_r));
out11 = _mm_add_ps (out11,_mm_mul_ps( mt0,in01 ));
out11 = _mm_addsub_ps(out11,_mm_mul_ps(mtt0,in01_r));
m00 = _mm_load_ps(M_+8);
mt0 = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(2,2,0,0));
out20= _mm_add_ps (out20,_mm_mul_ps( mt0,in00 ));
mtt0 = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(3,3,1,1));
out20= _mm_addsub_ps(out20,_mm_mul_ps(mtt0,in00_r));
out21 = _mm_add_ps (out21,_mm_mul_ps( mt0,in01 ));
out21 = _mm_addsub_ps(out21,_mm_mul_ps(mtt0,in01_r));
m00 = _mm_load_ps(M_+12);
mt0 = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(2,2,0,0));
out30= _mm_add_ps (out30,_mm_mul_ps( mt0, in00));
mtt0 = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(3,3,1,1));
out30= _mm_addsub_ps(out30,_mm_mul_ps(mtt0,in00_r));
out31 = _mm_add_ps (out31,_mm_mul_ps( mt0,in01 ));
out31 = _mm_addsub_ps(out31,_mm_mul_ps(mtt0,in01_r));
// Second input element of this iteration (in0__+2 / in1__+2), applied
// against the next 16-float block of M_ (M_+16 .. M_+28).
in00 = _mm_castpd_ps(_mm_load_pd1((const double*) (in0__+2)));
in00_r = _mm_shuffle_ps(in00,in00,_MM_SHUFFLE(2,3,0,1));
in01 = _mm_castpd_ps(_mm_load_pd1((const double*) (in1__+2)));
in01_r = _mm_shuffle_ps(in01,in01,_MM_SHUFFLE(2,3,0,1));
m00 = _mm_load_ps(M_+16);
mt0 = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(2,2,0,0));
out00= _mm_add_ps (out00,_mm_mul_ps( mt0,in00 ));
mtt0 = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(3,3,1,1));
out00= _mm_addsub_ps(out00,_mm_mul_ps(mtt0,in00_r));
out01 = _mm_add_ps (out01,_mm_mul_ps( mt0,in01 ));
out01 = _mm_addsub_ps(out01,_mm_mul_ps(mtt0,in01_r));
m00 = _mm_load_ps(M_+20);
mt0 = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(2,2,0,0));
out10= _mm_add_ps (out10,_mm_mul_ps( mt0,in00 ));
mtt0 = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(3,3,1,1));
out10= _mm_addsub_ps(out10,_mm_mul_ps(mtt0,in00_r));
out11 = _mm_add_ps (out11,_mm_mul_ps( mt0,in01 ));
out11 = _mm_addsub_ps(out11,_mm_mul_ps(mtt0,in01_r));
m00 = _mm_load_ps(M_+24);
mt0 = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(2,2,0,0));
out20= _mm_add_ps (out20,_mm_mul_ps( mt0,in00 ));
mtt0 = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(3,3,1,1));
out20= _mm_addsub_ps(out20,_mm_mul_ps(mtt0,in00_r));
out21 = _mm_add_ps (out21,_mm_mul_ps( mt0,in01 ));
out21 = _mm_addsub_ps(out21,_mm_mul_ps(mtt0,in01_r));
m00 = _mm_load_ps(M_+28);
mt0 = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(2,2,0,0));
out30= _mm_add_ps (out30,_mm_mul_ps( mt0,in00 ));
mtt0 = _mm_shuffle_ps(m00,m00,_MM_SHUFFLE(3,3,1,1));
out30= _mm_addsub_ps(out30,_mm_mul_ps(mtt0,in00_r));
out31 = _mm_add_ps (out31,_mm_mul_ps( mt0,in01 ));
out31 = _mm_addsub_ps(out31,_mm_mul_ps(mtt0,in01_r));
M_ += 32;
in0__ += 4;
in1__ += 4;
}
// Write the accumulated results back.
_mm_store_ps(OUT0,out00);
_mm_store_ps(OUT1,out01);
_mm_store_ps(OUT0+4,out10);
_mm_store_ps(OUT1+4,out11);
_mm_store_ps(OUT0+8,out20);
_mm_store_ps(OUT1+8,out21);
_mm_store_ps(OUT0+12,out30);
_mm_store_ps(OUT1+12,out31);
#endif
}
#endif
// Frequency-domain (Hadamard-product) stage of the V-list / M2L interactions.
// For every (source,target) pair recorded in interac_vec, accumulates
//   fft_out[target] += precomp_mat[mat] * fft_in[source]
// frequency by frequency, where each per-frequency product is an 8x8
// complex matrix-vector multiply over the 2^COORD_DIM child octants
// (performed two interactions at a time by matmult_8x8x2).
//
// dof         : degrees of freedom (used only in the FLOP count here).
// M_dim       : number of frequencies per child-octant block.
// ker_dim0/1  : kernel source/target dimensions.
// interac_dsp : per (cache-block, matrix) prefix counts delimiting slices of
//               interac_vec.
// interac_vec : flattened (input-offset, output-offset) pairs indexing into
//               fft_in / fft_out.
// precomp_mat : pointers to the precomputed frequency-domain M2L matrices.
template <class Real_t>
void VListHadamard(size_t dof, size_t M_dim, size_t ker_dim0, size_t ker_dim1, Vector<size_t>& interac_dsp,
Vector<size_t>& interac_vec, Vector<Real_t*>& precomp_mat, Vector<Real_t>& fft_in, Vector<Real_t>& fft_out){
size_t chld_cnt=1UL<<COORD_DIM;
size_t fftsize_in =M_dim*ker_dim0*chld_cnt*2; // *2: interleaved complex.
size_t fftsize_out=M_dim*ker_dim1*chld_cnt*2;
// Dummy input / scratch output used to pad odd interaction counts so the
// pairwise (j+=2) loop below never reads or writes out of bounds.
Real_t* zero_vec0=mem::aligned_new<Real_t>(fftsize_in );
Real_t* zero_vec1=mem::aligned_new<Real_t>(fftsize_out);
size_t n_out=fft_out.Dim()/fftsize_out;
// Set buff_out to zero.
#pragma omp parallel for
for(size_t k=0;k<n_out;k++){
Vector<Real_t> dnward_check_fft(fftsize_out, &fft_out[k*fftsize_out], false);
dnward_check_fft.SetZero();
}
// Build list of interaction pairs (in, out vectors).
size_t mat_cnt=precomp_mat.Dim();
size_t blk1_cnt=interac_dsp.Dim()/mat_cnt;
const size_t V_BLK_SIZE=V_BLK_CACHE*64/sizeof(Real_t);
Real_t** IN_ =mem::aligned_new<Real_t*>(2*V_BLK_SIZE*blk1_cnt*mat_cnt);
Real_t** OUT_=mem::aligned_new<Real_t*>(2*V_BLK_SIZE*blk1_cnt*mat_cnt);
#pragma omp parallel for
for(size_t interac_blk1=0; interac_blk1<blk1_cnt*mat_cnt; interac_blk1++){
size_t interac_dsp0 = (interac_blk1==0?0:interac_dsp[interac_blk1-1]);
size_t interac_dsp1 = interac_dsp[interac_blk1 ] ;
size_t interac_cnt = interac_dsp1-interac_dsp0;
for(size_t j=0;j<interac_cnt;j++){
IN_ [2*V_BLK_SIZE*interac_blk1 +j]=&fft_in [interac_vec[(interac_dsp0+j)*2+0]];
OUT_[2*V_BLK_SIZE*interac_blk1 +j]=&fft_out[interac_vec[(interac_dsp0+j)*2+1]];
}
// Sentinel pair: when interac_cnt is odd, the pairwise loop reads entry
// [interac_cnt]; point it at the zero input / scratch output buffers.
IN_ [2*V_BLK_SIZE*interac_blk1 +interac_cnt]=zero_vec0;
OUT_[2*V_BLK_SIZE*interac_blk1 +interac_cnt]=zero_vec1;
}
int omp_p=omp_get_max_threads();
#pragma omp parallel for
for(int pid=0; pid<omp_p; pid++){
// Each thread owns the frequency range [a,b); threads never share an
// output frequency, so no synchronization is needed on fft_out.
size_t a=( pid *M_dim)/omp_p;
size_t b=((pid+1)*M_dim)/omp_p;
for(int in_dim=0;in_dim<ker_dim0;in_dim++)
for(int ot_dim=0;ot_dim<ker_dim1;ot_dim++)
for(size_t blk1=0; blk1<blk1_cnt; blk1++)
for(size_t k=a; k< b; k++)
for(size_t mat_indx=0; mat_indx< mat_cnt;mat_indx++){
size_t interac_blk1 = blk1*mat_cnt+mat_indx;
size_t interac_dsp0 = (interac_blk1==0?0:interac_dsp[interac_blk1-1]);
size_t interac_dsp1 = interac_dsp[interac_blk1 ] ;
size_t interac_cnt = interac_dsp1-interac_dsp0;
Real_t** IN = IN_ + 2*V_BLK_SIZE*interac_blk1;
Real_t** OUT= OUT_+ 2*V_BLK_SIZE*interac_blk1;
// 128 = chld_cnt*chld_cnt*2 (8x8 complex entries) per frequency; select
// the matrix for frequency k and kernel component (in_dim, ot_dim).
Real_t* M = precomp_mat[mat_indx] + k*chld_cnt*chld_cnt*2 + (ot_dim+in_dim*ker_dim1)*M_dim*128;
{
// Interactions are processed two at a time; the sentinel entries set
// above make an odd interac_cnt safe.
for(size_t j=0;j<interac_cnt;j+=2){
Real_t* M_ = M;
Real_t* IN0 = IN [j+0] + (in_dim*M_dim+k)*chld_cnt*2;
Real_t* IN1 = IN [j+1] + (in_dim*M_dim+k)*chld_cnt*2;
Real_t* OUT0 = OUT[j+0] + (ot_dim*M_dim+k)*chld_cnt*2;
Real_t* OUT1 = OUT[j+1] + (ot_dim*M_dim+k)*chld_cnt*2;
#ifdef __SSE__
if (j+2 < interac_cnt) { // Prefetch the next pair's inputs and outputs.
_mm_prefetch(((char *)(IN[j+2] + (in_dim*M_dim+k)*chld_cnt*2)), _MM_HINT_T0);
_mm_prefetch(((char *)(IN[j+2] + (in_dim*M_dim+k)*chld_cnt*2) + 64), _MM_HINT_T0);
_mm_prefetch(((char *)(IN[j+3] + (in_dim*M_dim+k)*chld_cnt*2)), _MM_HINT_T0);
_mm_prefetch(((char *)(IN[j+3] + (in_dim*M_dim+k)*chld_cnt*2) + 64), _MM_HINT_T0);
_mm_prefetch(((char *)(OUT[j+2] + (ot_dim*M_dim+k)*chld_cnt*2)), _MM_HINT_T0);
_mm_prefetch(((char *)(OUT[j+2] + (ot_dim*M_dim+k)*chld_cnt*2) + 64), _MM_HINT_T0);
_mm_prefetch(((char *)(OUT[j+3] + (ot_dim*M_dim+k)*chld_cnt*2)), _MM_HINT_T0);
_mm_prefetch(((char *)(OUT[j+3] + (ot_dim*M_dim+k)*chld_cnt*2) + 64), _MM_HINT_T0);
}
#endif
matmult_8x8x2(M_, IN0, IN1, OUT0, OUT1);
}
}
}
}
// Compute flops.
{
Profile::Add_FLOP(8*8*8*(interac_vec.Dim()/2)*M_dim*ker_dim0*ker_dim1*dof);
}
// Free memory
mem::aligned_delete<Real_t*>(IN_ );
mem::aligned_delete<Real_t*>(OUT_);
mem::aligned_delete<Real_t>(zero_vec0);
mem::aligned_delete<Real_t>(zero_vec1);
}
// Prepares all data needed by V_List(): selects the participating nodes,
// blocks them to fit the device buffer, computes FFT index/scaling vectors
// and interaction lists, and serializes everything into
// setup_data.interac_data as a hand-packed byte stream.
// NOTE: the packed layout written at the end of this routine must stay in
// exact sync with the unpacking code at the top of V_List().
template <class FMMNode>
void FMM_Pts<FMMNode>::V_ListSetup(SetupData<Real_t>& setup_data, FMMTree_t* tree, std::vector<Matrix<Real_t> >& buff, std::vector<Vector<FMMNode_t*> >& n_list, int level, bool device){
if(!this->MultipoleOrder()) return;
if(level==0) return; // No V-list interactions at the root level.
{ // Set setup_data
setup_data.level=level;
setup_data.kernel=kernel->k_m2l;
setup_data.interac_type.resize(1);
setup_data.interac_type[0]=V1_Type;
setup_data. input_data=&buff[0];
setup_data.output_data=&buff[1];
Vector<FMMNode_t*>& nodes_in =n_list[2];
Vector<FMMNode_t*>& nodes_out=n_list[3];
setup_data.nodes_in .clear();
setup_data.nodes_out.clear();
// Keep nodes at depth level-1 (V1_Type acts on parents) that carry source
// (pt_cnt[0]) resp. target (pt_cnt[1]) points; level==-1 selects all depths.
for(size_t i=0;i<nodes_in .Dim();i++) if((nodes_in [i]->Depth()==level-1 || level==-1) && nodes_in [i]->pt_cnt[0]) setup_data.nodes_in .push_back(nodes_in [i]);
for(size_t i=0;i<nodes_out.Dim();i++) if((nodes_out[i]->Depth()==level-1 || level==-1) && nodes_out[i]->pt_cnt[1]) setup_data.nodes_out.push_back(nodes_out[i]);
}
std::vector<void*>& nodes_in =setup_data.nodes_in ;
std::vector<void*>& nodes_out=setup_data.nodes_out;
std::vector<Vector<Real_t>*>& input_vector=setup_data. input_vector; input_vector.clear();
std::vector<Vector<Real_t>*>& output_vector=setup_data.output_vector; output_vector.clear();
// Equivalent densities live on the first child (all 8 children are stored
// contiguously starting there).
for(size_t i=0;i<nodes_in .size();i++) input_vector.push_back(&((FMMData*)((FMMNode*)((FMMNode*)nodes_in [i])->Child(0))->FMMData())->upward_equiv);
for(size_t i=0;i<nodes_out.size();i++) output_vector.push_back(&((FMMData*)((FMMNode*)((FMMNode*)nodes_out[i])->Child(0))->FMMData())->dnward_equiv);
/////////////////////////////////////////////////////////////////////////////
Real_t eps=1e-10; // NOTE(review): appears unused in this routine.
size_t n_in =nodes_in .size();
size_t n_out=nodes_out.size();
// Setup precomputed data.
//if(setup_data.precomp_data->Dim(0)*setup_data.precomp_data->Dim(1)==0) SetupPrecomp(setup_data,device);
// Build interac_data
Profile::Tic("Interac-Data",&this->comm,true,25);
Matrix<char>& interac_data=setup_data.interac_data;
if(n_out>0 && n_in >0){ // Build precomp_data, interac_data
size_t precomp_offset=0;
Mat_Type& interac_type=setup_data.interac_type[0];
size_t mat_cnt=this->interac_list.ListCount(interac_type);
Matrix<size_t> precomp_data_offset;
std::vector<size_t> interac_mat;
std::vector<Real_t*> interac_mat_ptr;
#if 0 // Since we skip SetupPrecomp for V-list
{ // Load precomp_data for interac_type.
struct HeaderData{
size_t total_size;
size_t level;
size_t mat_cnt ;
size_t max_depth;
};
Matrix<char>& precomp_data=*setup_data.precomp_data;
char* indx_ptr=precomp_data[0]+precomp_offset;
HeaderData& header=*(HeaderData*)indx_ptr;indx_ptr+=sizeof(HeaderData);
precomp_data_offset.ReInit(header.mat_cnt,1+(2+2)*header.max_depth, (size_t*)indx_ptr, false);
precomp_offset+=header.total_size;
for(size_t mat_id=0;mat_id<mat_cnt;mat_id++){
Matrix<Real_t>& M0 = this->mat->Mat(level, interac_type, mat_id);
assert(M0.Dim(0)>0 && M0.Dim(1)>0); UNUSED(M0);
interac_mat.push_back(precomp_data_offset[mat_id][0]);
}
}
#else
{ // Collect raw pointers to the precomputed V-list matrices instead.
for(size_t mat_id=0;mat_id<mat_cnt;mat_id++){
Matrix<Real_t>& M = this->mat->Mat(level, interac_type, mat_id);
interac_mat_ptr.push_back(&M[0][0]);
}
}
#endif
size_t dof;
size_t m=MultipoleOrder();
size_t ker_dim0=setup_data.kernel->ker_dim[0];
size_t ker_dim1=setup_data.kernel->ker_dim[1];
size_t fftsize;
{ // fftsize = complex values per node: 2 * n2*(n1/2+1) * children.
size_t n1=m*2;
size_t n2=n1*n1;
size_t n3_=n2*(n1/2+1); // r2c transform keeps n1/2+1 in last dimension.
size_t chld_cnt=1UL<<COORD_DIM;
fftsize=2*n3_*chld_cnt;
dof=1;
}
int omp_p=omp_get_max_threads();
size_t buff_size=DEVICE_BUFFER_SIZE*1024l*1024l;
// Split the work into n_blk0 blocks so FFT input+output fit in buff_size.
size_t n_blk0=2*fftsize*dof*(ker_dim0*n_in +ker_dim1*n_out)*sizeof(Real_t)/buff_size;
if(n_blk0==0) n_blk0=1;
std::vector<std::vector<size_t> > fft_vec(n_blk0);
std::vector<std::vector<size_t> > ifft_vec(n_blk0);
std::vector<std::vector<Real_t> > fft_scl(n_blk0);
std::vector<std::vector<Real_t> > ifft_scl(n_blk0);
std::vector<std::vector<size_t> > interac_vec(n_blk0);
std::vector<std::vector<size_t> > interac_dsp(n_blk0);
{
Matrix<Real_t>& input_data=*setup_data. input_data;
Matrix<Real_t>& output_data=*setup_data.output_data;
std::vector<std::vector<FMMNode*> > nodes_blk_in (n_blk0);
std::vector<std::vector<FMMNode*> > nodes_blk_out(n_blk0);
Vector<Real_t> src_scal=this->kernel->k_m2l->src_scal;
Vector<Real_t> trg_scal=this->kernel->k_m2l->trg_scal;
for(size_t i=0;i<n_in;i++) ((FMMNode*)nodes_in[i])->node_id=i;
for(size_t blk0=0;blk0<n_blk0;blk0++){
size_t blk0_start=(n_out* blk0 )/n_blk0;
size_t blk0_end =(n_out*(blk0+1))/n_blk0;
std::vector<FMMNode*>& nodes_in_ =nodes_blk_in [blk0];
std::vector<FMMNode*>& nodes_out_=nodes_blk_out[blk0];
{ // Build node list for blk0.
// Collect the union of source nodes referenced by this block's targets
// (std::set de-duplicates and gives a deterministic order).
std::set<void*> nodes_in;
for(size_t i=blk0_start;i<blk0_end;i++){
nodes_out_.push_back((FMMNode*)nodes_out[i]);
Vector<FMMNode*>& lst=((FMMNode*)nodes_out[i])->interac_list[interac_type];
for(size_t k=0;k<mat_cnt;k++) if(lst[k]!=NULL && lst[k]->pt_cnt[0]) nodes_in.insert(lst[k]);
}
for(std::set<void*>::iterator node=nodes_in.begin(); node != nodes_in.end(); node++){
nodes_in_.push_back((FMMNode*)*node);
}
// Grow buff_size if even a single block does not fit.
size_t input_dim=nodes_in_ .size()*ker_dim0*dof*fftsize;
size_t output_dim=nodes_out_.size()*ker_dim1*dof*fftsize;
size_t buffer_dim=2*(ker_dim0+ker_dim1)*dof*fftsize*omp_p;
if(buff_size<(input_dim + output_dim + buffer_dim)*sizeof(Real_t))
buff_size=(input_dim + output_dim + buffer_dim)*sizeof(Real_t);
}
{ // Set fft vectors.
// fft_vec/ifft_vec: element offsets of each node's equivalent-density
// vector relative to the start of the input/output data matrix.
for(size_t i=0;i<nodes_in_ .size();i++) fft_vec[blk0].push_back((size_t)(& input_vector[nodes_in_[i]->node_id][0][0]- input_data[0]));
for(size_t i=0;i<nodes_out_.size();i++)ifft_vec[blk0].push_back((size_t)(&output_vector[blk0_start + i ][0][0]-output_data[0]));
size_t scal_dim0=src_scal.Dim();
size_t scal_dim1=trg_scal.Dim();
fft_scl [blk0].resize(nodes_in_ .size()*scal_dim0);
ifft_scl[blk0].resize(nodes_out_.size()*scal_dim1);
// Per-node, per-component depth scaling: 2^(scal*depth) with depth
// taken at the children (hence +1).
for(size_t i=0;i<nodes_in_ .size();i++){
size_t depth=nodes_in_[i]->Depth()+1;
for(size_t j=0;j<scal_dim0;j++){
fft_scl[blk0][i*scal_dim0+j]=pow(2.0, src_scal[j]*depth);
}
}
for(size_t i=0;i<nodes_out_.size();i++){
size_t depth=nodes_out_[i]->Depth()+1;
for(size_t j=0;j<scal_dim1;j++){
ifft_scl[blk0][i*scal_dim1+j]=pow(2.0, trg_scal[j]*depth);
}
}
}
}
for(size_t blk0=0;blk0<n_blk0;blk0++){ // Hadamard interactions.
std::vector<FMMNode*>& nodes_in_ =nodes_blk_in [blk0];
std::vector<FMMNode*>& nodes_out_=nodes_blk_out[blk0];
// Re-number sources within the block; indices feed interac_vec below.
for(size_t i=0;i<nodes_in_.size();i++) nodes_in_[i]->node_id=i;
{ // Next blocking level.
// Cache-level blocking: interactions are grouped per (blk1, matrix) so
// VListHadamard can iterate with a running prefix count (interac_dsp).
size_t n_blk1=nodes_out_.size()*(2)*sizeof(Real_t)/(64*V_BLK_CACHE);
if(n_blk1==0) n_blk1=1;
size_t interac_dsp_=0;
for(size_t blk1=0;blk1<n_blk1;blk1++){
size_t blk1_start=(nodes_out_.size()* blk1 )/n_blk1;
size_t blk1_end =(nodes_out_.size()*(blk1+1))/n_blk1;
for(size_t k=0;k<mat_cnt;k++){
for(size_t i=blk1_start;i<blk1_end;i++){
Vector<FMMNode*>& lst=((FMMNode*)nodes_out_[i])->interac_list[interac_type];
if(lst[k]!=NULL && lst[k]->pt_cnt[0]){
interac_vec[blk0].push_back(lst[k]->node_id*fftsize*ker_dim0*dof);
interac_vec[blk0].push_back( i *fftsize*ker_dim1*dof);
interac_dsp_++;
}
}
interac_dsp[blk0].push_back(interac_dsp_);
}
}
}
}
}
{ // Set interac_data.
// Serialize everything into one byte stream; each vector is written as
// [count][payload].  V_List() unpacks in exactly this order.
size_t data_size=sizeof(size_t)*6; // buff_size, m, dof, ker_dim0, ker_dim1, n_blk0
for(size_t blk0=0;blk0<n_blk0;blk0++){
data_size+=sizeof(size_t)+ fft_vec[blk0].size()*sizeof(size_t);
data_size+=sizeof(size_t)+ ifft_vec[blk0].size()*sizeof(size_t);
data_size+=sizeof(size_t)+ fft_scl[blk0].size()*sizeof(Real_t);
data_size+=sizeof(size_t)+ ifft_scl[blk0].size()*sizeof(Real_t);
data_size+=sizeof(size_t)+interac_vec[blk0].size()*sizeof(size_t);
data_size+=sizeof(size_t)+interac_dsp[blk0].size()*sizeof(size_t);
}
data_size+=sizeof(size_t)+interac_mat.size()*sizeof(size_t);
data_size+=sizeof(size_t)+interac_mat_ptr.size()*sizeof(Real_t*);
if(data_size>interac_data.Dim(0)*interac_data.Dim(1))
interac_data.ReInit(1,data_size);
char* data_ptr=&interac_data[0][0];
((size_t*)data_ptr)[0]=buff_size; data_ptr+=sizeof(size_t);
((size_t*)data_ptr)[0]= m; data_ptr+=sizeof(size_t);
((size_t*)data_ptr)[0]= dof; data_ptr+=sizeof(size_t);
((size_t*)data_ptr)[0]= ker_dim0; data_ptr+=sizeof(size_t);
((size_t*)data_ptr)[0]= ker_dim1; data_ptr+=sizeof(size_t);
((size_t*)data_ptr)[0]= n_blk0; data_ptr+=sizeof(size_t);
((size_t*)data_ptr)[0]= interac_mat.size(); data_ptr+=sizeof(size_t);
mem::memcopy(data_ptr, &interac_mat[0], interac_mat.size()*sizeof(size_t));
data_ptr+=interac_mat.size()*sizeof(size_t);
((size_t*)data_ptr)[0]= interac_mat_ptr.size(); data_ptr+=sizeof(size_t);
mem::memcopy(data_ptr, &interac_mat_ptr[0], interac_mat_ptr.size()*sizeof(Real_t*));
data_ptr+=interac_mat_ptr.size()*sizeof(Real_t*);
for(size_t blk0=0;blk0<n_blk0;blk0++){
((size_t*)data_ptr)[0]= fft_vec[blk0].size(); data_ptr+=sizeof(size_t);
mem::memcopy(data_ptr, & fft_vec[blk0][0], fft_vec[blk0].size()*sizeof(size_t));
data_ptr+= fft_vec[blk0].size()*sizeof(size_t);
((size_t*)data_ptr)[0]=ifft_vec[blk0].size(); data_ptr+=sizeof(size_t);
mem::memcopy(data_ptr, &ifft_vec[blk0][0], ifft_vec[blk0].size()*sizeof(size_t));
data_ptr+=ifft_vec[blk0].size()*sizeof(size_t);
((size_t*)data_ptr)[0]= fft_scl[blk0].size(); data_ptr+=sizeof(size_t);
mem::memcopy(data_ptr, & fft_scl[blk0][0], fft_scl[blk0].size()*sizeof(Real_t));
data_ptr+= fft_scl[blk0].size()*sizeof(Real_t);
((size_t*)data_ptr)[0]=ifft_scl[blk0].size(); data_ptr+=sizeof(size_t);
mem::memcopy(data_ptr, &ifft_scl[blk0][0], ifft_scl[blk0].size()*sizeof(Real_t));
data_ptr+=ifft_scl[blk0].size()*sizeof(Real_t);
((size_t*)data_ptr)[0]=interac_vec[blk0].size(); data_ptr+=sizeof(size_t);
mem::memcopy(data_ptr, &interac_vec[blk0][0], interac_vec[blk0].size()*sizeof(size_t));
data_ptr+=interac_vec[blk0].size()*sizeof(size_t);
((size_t*)data_ptr)[0]=interac_dsp[blk0].size(); data_ptr+=sizeof(size_t);
mem::memcopy(data_ptr, &interac_dsp[blk0][0], interac_dsp[blk0].size()*sizeof(size_t));
data_ptr+=interac_dsp[blk0].size()*sizeof(size_t);
}
}
}
Profile::Toc();
if(device){ // Host2Device
Profile::Tic("Host2Device",&this->comm,false,25);
setup_data.interac_data. AllocDevice(true);
Profile::Toc();
}
}
// Executes the V-list (M2L) interactions prepared by V_ListSetup():
// per block, (1) forward-FFT the upward-equivalent densities, (2) apply the
// precomputed frequency-domain matrices via VListHadamard, (3) inverse-FFT
// into the downward-check/equivalent densities.
// NOTE: the unpacking below must read fields in exactly the order
// V_ListSetup() wrote them.
template <class FMMNode>
void FMM_Pts<FMMNode>::V_List (SetupData<Real_t>& setup_data, bool device){
if(!this->MultipoleOrder()) return;
assert(!device); //Can not run on accelerator yet.
int np;
MPI_Comm_size(comm,&np);
if(setup_data.interac_data.Dim(0)==0 || setup_data.interac_data.Dim(1)==0){
// Nothing to do; keep profiling sections balanced across MPI ranks.
if(np>1) Profile::Tic("Host2Device",&this->comm,false,25);
if(np>1) Profile::Toc();
return;
}
Profile::Tic("Host2Device",&this->comm,false,25);
int level=setup_data.level; // NOTE(review): appears unused below.
size_t buff_size=*((size_t*)&setup_data.interac_data[0][0]);
typename Vector<char>::Device buff;
//typename Matrix<char>::Device precomp_data;
typename Matrix<char>::Device interac_data;
typename Matrix<Real_t>::Device input_data;
typename Matrix<Real_t>::Device output_data;
if(device){
if(this->dev_buffer.Dim()<buff_size) this->dev_buffer.ReInit(buff_size);
buff = this-> dev_buffer. AllocDevice(false);
//precomp_data= setup_data.precomp_data->AllocDevice(false);
interac_data= setup_data.interac_data. AllocDevice(false);
input_data = setup_data. input_data->AllocDevice(false);
output_data = setup_data. output_data->AllocDevice(false);
}else{
if(this->dev_buffer.Dim()<buff_size) this->dev_buffer.ReInit(buff_size);
buff = this-> dev_buffer;
//precomp_data=*setup_data.precomp_data;
interac_data= setup_data.interac_data;
input_data =*setup_data. input_data;
output_data =*setup_data. output_data;
}
Profile::Toc();
{ // Offloaded computation.
// Set interac_data.
size_t m, dof, ker_dim0, ker_dim1, n_blk0;
std::vector<Vector<size_t> > fft_vec;
std::vector<Vector<size_t> > ifft_vec;
std::vector<Vector<Real_t> > fft_scl;
std::vector<Vector<Real_t> > ifft_scl;
std::vector<Vector<size_t> > interac_vec;
std::vector<Vector<size_t> > interac_dsp;
Vector<Real_t*> precomp_mat;
{ // Set interac_data.
// Unpack the byte stream; all ReInit(...,false) calls create non-owning
// views directly into interac_data (no copies).
char* data_ptr=&interac_data[0][0];
buff_size=((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t);
m =((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t);
dof =((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t);
ker_dim0 =((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t);
ker_dim1 =((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t);
n_blk0 =((size_t*)data_ptr)[0]; data_ptr+=sizeof(size_t);
fft_vec .resize(n_blk0);
ifft_vec.resize(n_blk0);
fft_scl .resize(n_blk0);
ifft_scl.resize(n_blk0);
interac_vec.resize(n_blk0);
interac_dsp.resize(n_blk0);
Vector<size_t> interac_mat;
interac_mat.ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false);
data_ptr+=sizeof(size_t)+interac_mat.Dim()*sizeof(size_t);
Vector<Real_t*> interac_mat_ptr;
interac_mat_ptr.ReInit(((size_t*)data_ptr)[0],(Real_t**)(data_ptr+sizeof(size_t)),false);
data_ptr+=sizeof(size_t)+interac_mat_ptr.Dim()*sizeof(Real_t*);
#if 0 // Since we skip SetupPrecomp for V-list
precomp_mat.Resize(interac_mat.Dim());
for(size_t i=0;i<interac_mat.Dim();i++){
precomp_mat[i]=(Real_t*)(precomp_data[0]+interac_mat[i]);
}
#else
precomp_mat.Resize(interac_mat_ptr.Dim());
for(size_t i=0;i<interac_mat_ptr.Dim();i++){
precomp_mat[i]=interac_mat_ptr[i];
}
#endif
for(size_t blk0=0;blk0<n_blk0;blk0++){
fft_vec[blk0].ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false);
data_ptr+=sizeof(size_t)+fft_vec[blk0].Dim()*sizeof(size_t);
ifft_vec[blk0].ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false);
data_ptr+=sizeof(size_t)+ifft_vec[blk0].Dim()*sizeof(size_t);
fft_scl[blk0].ReInit(((size_t*)data_ptr)[0],(Real_t*)(data_ptr+sizeof(size_t)),false);
data_ptr+=sizeof(size_t)+fft_scl[blk0].Dim()*sizeof(Real_t);
ifft_scl[blk0].ReInit(((size_t*)data_ptr)[0],(Real_t*)(data_ptr+sizeof(size_t)),false);
data_ptr+=sizeof(size_t)+ifft_scl[blk0].Dim()*sizeof(Real_t);
interac_vec[blk0].ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false);
data_ptr+=sizeof(size_t)+interac_vec[blk0].Dim()*sizeof(size_t);
interac_dsp[blk0].ReInit(((size_t*)data_ptr)[0],(size_t*)(data_ptr+sizeof(size_t)),false);
data_ptr+=sizeof(size_t)+interac_dsp[blk0].Dim()*sizeof(size_t);
}
}
int omp_p=omp_get_max_threads();
size_t M_dim, fftsize;
{ // Derived FFT sizes (must match V_ListSetup).
size_t n1=m*2;
size_t n2=n1*n1;
size_t n3_=n2*(n1/2+1);
size_t chld_cnt=1UL<<COORD_DIM;
fftsize=2*n3_*chld_cnt;
M_dim=n3_;
}
for(size_t blk0=0;blk0<n_blk0;blk0++){ // interactions
size_t n_in = fft_vec[blk0].Dim();
size_t n_out=ifft_vec[blk0].Dim();
size_t input_dim=n_in *ker_dim0*dof*fftsize;
size_t output_dim=n_out*ker_dim1*dof*fftsize;
size_t buffer_dim=2*(ker_dim0+ker_dim1)*dof*fftsize*omp_p;
// Carve fft_in / fft_out / scratch out of the single device buffer.
Vector<Real_t> fft_in ( input_dim, (Real_t*)&buff[ 0 ],false);
Vector<Real_t> fft_out(output_dim, (Real_t*)&buff[ input_dim *sizeof(Real_t)],false);
Vector<Real_t> buffer(buffer_dim, (Real_t*)&buff[(input_dim+output_dim)*sizeof(Real_t)],false);
{ // FFT
if(np==1) Profile::Tic("FFT",&comm,false,100);
Vector<Real_t> input_data_( input_data.dim[0]* input_data.dim[1], input_data[0], false);
FFT_UpEquiv(dof, m, ker_dim0, fft_vec[blk0], fft_scl[blk0], input_data_, fft_in, buffer);
if(np==1) Profile::Toc();
}
{ // Hadamard
#ifdef PVFMM_HAVE_PAPI
#ifdef __VERBOSE__
std::cout << "Starting counters new\n";
if (PAPI_start(EventSet) != PAPI_OK) std::cout << "handle_error3" << std::endl;
#endif
#endif
if(np==1) Profile::Tic("HadamardProduct",&comm,false,100);
VListHadamard<Real_t>(dof, M_dim, ker_dim0, ker_dim1, interac_dsp[blk0], interac_vec[blk0], precomp_mat, fft_in, fft_out);
if(np==1) Profile::Toc();
#ifdef PVFMM_HAVE_PAPI
#ifdef __VERBOSE__
if (PAPI_stop(EventSet, values) != PAPI_OK) std::cout << "handle_error4" << std::endl;
std::cout << "Stopping counters\n";
#endif
#endif
}
{ // IFFT
if(np==1) Profile::Tic("IFFT",&comm,false,100);
Vector<Real_t> output_data_(output_data.dim[0]*output_data.dim[1], output_data[0], false);
FFT_Check2Equiv(dof, m, ker_dim1, ifft_vec[blk0], ifft_scl[blk0], fft_out, output_data_, buffer);
if(np==1) Profile::Toc();
}
}
}
}
  3051. template <class FMMNode>
  3052. void FMM_Pts<FMMNode>::Down2DownSetup(SetupData<Real_t>& setup_data, FMMTree_t* tree, std::vector<Matrix<Real_t> >& buff, std::vector<Vector<FMMNode_t*> >& n_list, int level, bool device){
  3053. if(!this->MultipoleOrder()) return;
  3054. { // Set setup_data
  3055. setup_data.level=level;
  3056. setup_data.kernel=kernel->k_l2l;
  3057. setup_data.interac_type.resize(1);
  3058. setup_data.interac_type[0]=D2D_Type;
  3059. setup_data. input_data=&buff[1];
  3060. setup_data.output_data=&buff[1];
  3061. Vector<FMMNode_t*>& nodes_in =n_list[1];
  3062. Vector<FMMNode_t*>& nodes_out=n_list[1];
  3063. setup_data.nodes_in .clear();
  3064. setup_data.nodes_out.clear();
  3065. for(size_t i=0;i<nodes_in .Dim();i++) if((nodes_in [i]->Depth()==level-1) && nodes_in [i]->pt_cnt[1]) setup_data.nodes_in .push_back(nodes_in [i]);
  3066. for(size_t i=0;i<nodes_out.Dim();i++) if((nodes_out[i]->Depth()==level ) && nodes_out[i]->pt_cnt[1]) setup_data.nodes_out.push_back(nodes_out[i]);
  3067. }
  3068. std::vector<void*>& nodes_in =setup_data.nodes_in ;
  3069. std::vector<void*>& nodes_out=setup_data.nodes_out;
  3070. std::vector<Vector<Real_t>*>& input_vector=setup_data. input_vector; input_vector.clear();
  3071. std::vector<Vector<Real_t>*>& output_vector=setup_data.output_vector; output_vector.clear();
  3072. for(size_t i=0;i<nodes_in .size();i++) input_vector.push_back(&((FMMData*)((FMMNode*)nodes_in [i])->FMMData())->dnward_equiv);
  3073. for(size_t i=0;i<nodes_out.size();i++) output_vector.push_back(&((FMMData*)((FMMNode*)nodes_out[i])->FMMData())->dnward_equiv);
  3074. SetupInterac(setup_data,device);
  3075. }
  3076. template <class FMMNode>
  3077. void FMM_Pts<FMMNode>::Down2Down (SetupData<Real_t>& setup_data, bool device){
  3078. if(!this->MultipoleOrder()) return;
  3079. //Add Down2Down contribution.
  3080. EvalList(setup_data, device);
  3081. }
  3082. template <class FMMNode>
  3083. void FMM_Pts<FMMNode>::PtSetup(SetupData<Real_t>& setup_data, void* data_){
  3084. struct PackedData{
  3085. size_t len;
  3086. Matrix<Real_t>* ptr;
  3087. Vector<size_t> cnt;
  3088. Vector<size_t> dsp;
  3089. };
  3090. struct InteracData{
  3091. Vector<size_t> in_node;
  3092. Vector<size_t> scal_idx;
  3093. Vector<Real_t> coord_shift;
  3094. Vector<size_t> interac_cnt;
  3095. Vector<size_t> interac_dsp;
  3096. Vector<size_t> interac_cst;
  3097. Vector<Real_t> scal[4*MAX_DEPTH];
  3098. Matrix<Real_t> M[4];
  3099. };
  3100. struct ptSetupData{
  3101. int level;
  3102. const Kernel<Real_t>* kernel;
  3103. PackedData src_coord; // Src coord
  3104. PackedData src_value; // Src density
  3105. PackedData srf_coord; // Srf coord
  3106. PackedData srf_value; // Srf density
  3107. PackedData trg_coord; // Trg coord
  3108. PackedData trg_value; // Trg potential
  3109. InteracData interac_data;
  3110. };
  3111. ptSetupData& data=*(ptSetupData*)data_;
  3112. if(data.interac_data.interac_cnt.Dim()){ // Set data.interac_data.interac_cst
  3113. InteracData& intdata=data.interac_data;
  3114. Vector<size_t> cnt;
  3115. Vector<size_t>& dsp=intdata.interac_cst;
  3116. cnt.ReInit(intdata.interac_cnt.Dim());
  3117. dsp.ReInit(intdata.interac_dsp.Dim());
  3118. for(size_t trg=0;trg<cnt.Dim();trg++){
  3119. size_t trg_cnt=data.trg_coord.cnt[trg];
  3120. cnt[trg]=0;
  3121. for(size_t i=0;i<intdata.interac_cnt[trg];i++){
  3122. size_t int_id=intdata.interac_dsp[trg]+i;
  3123. size_t src=intdata.in_node[int_id];
  3124. size_t src_cnt=data.src_coord.cnt[src];
  3125. size_t srf_cnt=data.srf_coord.cnt[src];
  3126. cnt[trg]+=(src_cnt+srf_cnt)*trg_cnt;
  3127. }
  3128. }
  3129. dsp[0]=cnt[0];
  3130. omp_par::scan(&cnt[0],&dsp[0],dsp.Dim());
  3131. }
  3132. { // pack data
  3133. struct PackedSetupData{
  3134. size_t size;
  3135. int level;
  3136. const Kernel<Real_t>* kernel;
  3137. Matrix<Real_t>* src_coord; // Src coord
  3138. Matrix<Real_t>* src_value; // Src density
  3139. Matrix<Real_t>* srf_coord; // Srf coord
  3140. Matrix<Real_t>* srf_value; // Srf density
  3141. Matrix<Real_t>* trg_coord; // Trg coord
  3142. Matrix<Real_t>* trg_value; // Trg potential
  3143. size_t src_coord_cnt_size; size_t src_coord_cnt_offset;
  3144. size_t src_coord_dsp_size; size_t src_coord_dsp_offset;
  3145. size_t src_value_cnt_size; size_t src_value_cnt_offset;
  3146. size_t src_value_dsp_size; size_t src_value_dsp_offset;
  3147. size_t srf_coord_cnt_size; size_t srf_coord_cnt_offset;
  3148. size_t srf_coord_dsp_size; size_t srf_coord_dsp_offset;
  3149. size_t srf_value_cnt_size; size_t srf_value_cnt_offset;
  3150. size_t srf_value_dsp_size; size_t srf_value_dsp_offset;
  3151. size_t trg_coord_cnt_size; size_t trg_coord_cnt_offset;
  3152. size_t trg_coord_dsp_size; size_t trg_coord_dsp_offset;
  3153. size_t trg_value_cnt_size; size_t trg_value_cnt_offset;
  3154. size_t trg_value_dsp_size; size_t trg_value_dsp_offset;
  3155. // interac_data
  3156. size_t in_node_size; size_t in_node_offset;
  3157. size_t scal_idx_size; size_t scal_idx_offset;
  3158. size_t coord_shift_size; size_t coord_shift_offset;
  3159. size_t interac_cnt_size; size_t interac_cnt_offset;
  3160. size_t interac_dsp_size; size_t interac_dsp_offset;
  3161. size_t interac_cst_size; size_t interac_cst_offset;
  3162. size_t scal_dim[4*MAX_DEPTH]; size_t scal_offset[4*MAX_DEPTH];
  3163. size_t Mdim[4][2]; size_t M_offset[4];
  3164. };
  3165. PackedSetupData pkd_data;
  3166. { // Set pkd_data
  3167. size_t offset=mem::align_ptr(sizeof(PackedSetupData));
  3168. pkd_data. level=data. level;
  3169. pkd_data.kernel=data.kernel;
  3170. pkd_data.src_coord=data.src_coord.ptr;
  3171. pkd_data.src_value=data.src_value.ptr;
  3172. pkd_data.srf_coord=data.srf_coord.ptr;
  3173. pkd_data.srf_value=data.srf_value.ptr;
  3174. pkd_data.trg_coord=data.trg_coord.ptr;
  3175. pkd_data.trg_value=data.trg_value.ptr;
  3176. pkd_data.src_coord_cnt_offset=offset; pkd_data.src_coord_cnt_size=data.src_coord.cnt.Dim(); offset+=mem::align_ptr(sizeof(size_t)*pkd_data.src_coord_cnt_size);
  3177. pkd_data.src_coord_dsp_offset=offset; pkd_data.src_coord_dsp_size=data.src_coord.dsp.Dim(); offset+=mem::align_ptr(sizeof(size_t)*pkd_data.src_coord_dsp_size);
  3178. pkd_data.src_value_cnt_offset=offset; pkd_data.src_value_cnt_size=data.src_value.cnt.Dim(); offset+=mem::align_ptr(sizeof(size_t)*pkd_data.src_value_cnt_size);
  3179. pkd_data.src_value_dsp_offset=offset; pkd_data.src_value_dsp_size=data.src_value.dsp.Dim(); offset+=mem::align_ptr(sizeof(size_t)*pkd_data.src_value_dsp_size);
  3180. pkd_data.srf_coord_cnt_offset=offset; pkd_data.srf_coord_cnt_size=data.srf_coord.cnt.Dim(); offset+=mem::align_ptr(sizeof(size_t)*pkd_data.srf_coord_cnt_size);
  3181. pkd_data.srf_coord_dsp_offset=offset; pkd_data.srf_coord_dsp_size=data.srf_coord.dsp.Dim(); offset+=mem::align_ptr(sizeof(size_t)*pkd_data.srf_coord_dsp_size);
  3182. pkd_data.srf_value_cnt_offset=offset; pkd_data.srf_value_cnt_size=data.srf_value.cnt.Dim(); offset+=mem::align_ptr(sizeof(size_t)*pkd_data.srf_value_cnt_size);
  3183. pkd_data.srf_value_dsp_offset=offset; pkd_data.srf_value_dsp_size=data.srf_value.dsp.Dim(); offset+=mem::align_ptr(sizeof(size_t)*pkd_data.srf_value_dsp_size);
  3184. pkd_data.trg_coord_cnt_offset=offset; pkd_data.trg_coord_cnt_size=data.trg_coord.cnt.Dim(); offset+=mem::align_ptr(sizeof(size_t)*pkd_data.trg_coord_cnt_size);
  3185. pkd_data.trg_coord_dsp_offset=offset; pkd_data.trg_coord_dsp_size=data.trg_coord.dsp.Dim(); offset+=mem::align_ptr(sizeof(size_t)*pkd_data.trg_coord_dsp_size);
  3186. pkd_data.trg_value_cnt_offset=offset; pkd_data.trg_value_cnt_size=data.trg_value.cnt.Dim(); offset+=mem::align_ptr(sizeof(size_t)*pkd_data.trg_value_cnt_size);
  3187. pkd_data.trg_value_dsp_offset=offset; pkd_data.trg_value_dsp_size=data.trg_value.dsp.Dim(); offset+=mem::align_ptr(sizeof(size_t)*pkd_data.trg_value_dsp_size);
  3188. InteracData& intdata=data.interac_data;
  3189. pkd_data. in_node_offset=offset; pkd_data. in_node_size=intdata. in_node.Dim(); offset+=mem::align_ptr(sizeof(size_t)*pkd_data. in_node_size);
  3190. pkd_data. scal_idx_offset=offset; pkd_data. scal_idx_size=intdata. scal_idx.Dim(); offset+=mem::align_ptr(sizeof(size_t)*pkd_data. scal_idx_size);
  3191. pkd_data.coord_shift_offset=offset; pkd_data.coord_shift_size=intdata.coord_shift.Dim(); offset+=mem::align_ptr(sizeof(Real_t)*pkd_data.coord_shift_size);
  3192. pkd_data.interac_cnt_offset=offset; pkd_data.interac_cnt_size=intdata.interac_cnt.Dim(); offset+=mem::align_ptr(sizeof(size_t)*pkd_data.interac_cnt_size);
  3193. pkd_data.interac_dsp_offset=offset; pkd_data.interac_dsp_size=intdata.interac_dsp.Dim(); offset+=mem::align_ptr(sizeof(size_t)*pkd_data.interac_dsp_size);
  3194. pkd_data.interac_cst_offset=offset; pkd_data.interac_cst_size=intdata.interac_cst.Dim(); offset+=mem::align_ptr(sizeof(size_t)*pkd_data.interac_cst_size);
  3195. for(size_t i=0;i<4*MAX_DEPTH;i++){
  3196. pkd_data.scal_offset[i]=offset; pkd_data.scal_dim[i]=intdata.scal[i].Dim(); offset+=mem::align_ptr(sizeof(Real_t)*pkd_data.scal_dim[i]);
  3197. }
  3198. for(size_t i=0;i<4;i++){
  3199. size_t& Mdim0=pkd_data.Mdim[i][0];
  3200. size_t& Mdim1=pkd_data.Mdim[i][1];
  3201. pkd_data.M_offset[i]=offset; Mdim0=intdata.M[i].Dim(0); Mdim1=intdata.M[i].Dim(1); offset+=mem::align_ptr(sizeof(Real_t)*Mdim0*Mdim1);
  3202. }
  3203. pkd_data.size=offset;
  3204. }
  3205. { // Set setup_data.interac_data
  3206. Matrix<char>& buff=setup_data.interac_data;
  3207. if(pkd_data.size>buff.Dim(0)*buff.Dim(1)){
  3208. buff.ReInit(1,pkd_data.size);
  3209. }
  3210. ((PackedSetupData*)buff[0])[0]=pkd_data;
  3211. if(pkd_data.src_coord_cnt_size) memcpy(&buff[0][pkd_data.src_coord_cnt_offset], &data.src_coord.cnt[0], pkd_data.src_coord_cnt_size*sizeof(size_t));
  3212. if(pkd_data.src_coord_dsp_size) memcpy(&buff[0][pkd_data.src_coord_dsp_offset], &data.src_coord.dsp[0], pkd_data.src_coord_dsp_size*sizeof(size_t));
  3213. if(pkd_data.src_value_cnt_size) memcpy(&buff[0][pkd_data.src_value_cnt_offset], &data.src_value.cnt[0], pkd_data.src_value_cnt_size*sizeof(size_t));
  3214. if(pkd_data.src_value_dsp_size) memcpy(&buff[0][pkd_data.src_value_dsp_offset], &data.src_value.dsp[0], pkd_data.src_value_dsp_size*sizeof(size_t));
  3215. if(pkd_data.srf_coord_cnt_size) memcpy(&buff[0][pkd_data.srf_coord_cnt_offset], &data.srf_coord.cnt[0], pkd_data.srf_coord_cnt_size*sizeof(size_t));
  3216. if(pkd_data.srf_coord_dsp_size) memcpy(&buff[0][pkd_data.srf_coord_dsp_offset], &data.srf_coord.dsp[0], pkd_data.srf_coord_dsp_size*sizeof(size_t));
  3217. if(pkd_data.srf_value_cnt_size) memcpy(&buff[0][pkd_data.srf_value_cnt_offset], &data.srf_value.cnt[0], pkd_data.srf_value_cnt_size*sizeof(size_t));
  3218. if(pkd_data.srf_value_dsp_size) memcpy(&buff[0][pkd_data.srf_value_dsp_offset], &data.srf_value.dsp[0], pkd_data.srf_value_dsp_size*sizeof(size_t));
  3219. if(pkd_data.trg_coord_cnt_size) memcpy(&buff[0][pkd_data.trg_coord_cnt_offset], &data.trg_coord.cnt[0], pkd_data.trg_coord_cnt_size*sizeof(size_t));
  3220. if(pkd_data.trg_coord_dsp_size) memcpy(&buff[0][pkd_data.trg_coord_dsp_offset], &data.trg_coord.dsp[0], pkd_data.trg_coord_dsp_size*sizeof(size_t));
  3221. if(pkd_data.trg_value_cnt_size) memcpy(&buff[0][pkd_data.trg_value_cnt_offset], &data.trg_value.cnt[0], pkd_data.trg_value_cnt_size*sizeof(size_t));
  3222. if(pkd_data.trg_value_dsp_size) memcpy(&buff[0][pkd_data.trg_value_dsp_offset], &data.trg_value.dsp[0], pkd_data.trg_value_dsp_size*sizeof(size_t));
  3223. InteracData& intdata=data.interac_data;
  3224. if(pkd_data. in_node_size) memcpy(&buff[0][pkd_data. in_node_offset], &intdata. in_node[0], pkd_data. in_node_size*sizeof(size_t));
  3225. if(pkd_data. scal_idx_size) memcpy(&buff[0][pkd_data. scal_idx_offset], &intdata. scal_idx[0], pkd_data. scal_idx_size*sizeof(size_t));
  3226. if(pkd_data.coord_shift_size) memcpy(&buff[0][pkd_data.coord_shift_offset], &intdata.coord_shift[0], pkd_data.coord_shift_size*sizeof(Real_t));
  3227. if(pkd_data.interac_cnt_size) memcpy(&buff[0][pkd_data.interac_cnt_offset], &intdata.interac_cnt[0], pkd_data.interac_cnt_size*sizeof(size_t));
  3228. if(pkd_data.interac_dsp_size) memcpy(&buff[0][pkd_data.interac_dsp_offset], &intdata.interac_dsp[0], pkd_data.interac_dsp_size*sizeof(size_t));
  3229. if(pkd_data.interac_cst_size) memcpy(&buff[0][pkd_data.interac_cst_offset], &intdata.interac_cst[0], pkd_data.interac_cst_size*sizeof(size_t));
  3230. for(size_t i=0;i<4*MAX_DEPTH;i++){
  3231. if(intdata.scal[i].Dim()) memcpy(&buff[0][pkd_data.scal_offset[i]], &intdata.scal[i][0], intdata.scal[i].Dim()*sizeof(Real_t));
  3232. }
  3233. for(size_t i=0;i<4;i++){
  3234. if(intdata.M[i].Dim(0)*intdata.M[i].Dim(1)) memcpy(&buff[0][pkd_data.M_offset[i]], &intdata.M[i][0][0], intdata.M[i].Dim(0)*intdata.M[i].Dim(1)*sizeof(Real_t));
  3235. }
  3236. }
  3237. }
  3238. { // Resize device buffer
  3239. size_t n=setup_data.output_data->Dim(0)*setup_data.output_data->Dim(1)*sizeof(Real_t);
  3240. if(this->dev_buffer.Dim()<n) this->dev_buffer.ReInit(n);
  3241. }
  3242. }
/**
 * Evaluate the point (direct) kernel interactions that were packed into
 * setup_data.interac_data by the matching *Setup() routine.
 *
 * setup_data.interac_data holds a serialized PackedSetupData header followed
 * by the cnt/dsp index arrays, the interaction lists, per-level scaling
 * vectors (4 per depth level) and up to four transform matrices M[0..3].
 * This function reinterprets that blob in place (no copies), then for each
 * target node evaluates the single-layer and/or double-layer kernel against
 * its interacting source nodes, with optional pre-multiplication of source
 * densities by M[0]*M[1] and post-multiplication of target values by
 * M[2]*M[3], each surrounded by elementwise scalings from intdata.scal.
 *
 * @param setup_data interaction data prepared by the corresponding setup
 *                   routine (kernel, coord/input/output matrices, packed
 *                   interac_data).
 * @param device     if true, run through the Intel MIC offload path; when
 *                   the build has CUDA (have_gpu), the host path is used
 *                   instead.
 *
 * Template parameter SYNC: when nonzero, block at the end until the
 * offloaded computation has completed (offload builds only).
 */
template <class FMMNode>
template <int SYNC>
void FMM_Pts<FMMNode>::EvalListPts(SetupData<Real_t>& setup_data, bool device){
  // Null kernel (zero-dimensional) => nothing to evaluate.
  if(setup_data.kernel->ker_dim[0]*setup_data.kernel->ker_dim[1]==0) return;
  if(setup_data.interac_data.Dim(0)==0 || setup_data.interac_data.Dim(1)==0){
    // No packed interactions: still open/close the profiling sections so the
    // timing report keeps the same structure on every rank and every call.
    Profile::Tic("Host2Device",&this->comm,false,25);
    Profile::Toc();
    Profile::Tic("DeviceComp",&this->comm,false,20);
    Profile::Toc();
    return;
  }
  bool have_gpu=false;
#if defined(PVFMM_HAVE_CUDA)
  have_gpu=true;
#endif
  Profile::Tic("Host2Device",&this->comm,false,25);
  typename Vector<char>::Device          dev_buff;
  typename Matrix<char>::Device  interac_data;
  typename Matrix<Real_t>::Device  coord_data;
  typename Matrix<Real_t>::Device  input_data;
  typename Matrix<Real_t>::Device output_data;
  // Kernel entry points are carried as integers so they can cross the
  // offload boundary; (size_t)NULL marks "kernel not available".
  size_t ptr_single_layer_kernel=(size_t)NULL;
  size_t ptr_double_layer_kernel=(size_t)NULL;
  if(device && !have_gpu){ // MIC offload path: use device-side mirrors.
    dev_buff    =       this-> dev_buffer. AllocDevice(false);
    interac_data= setup_data.interac_data. AllocDevice(false);
    if(setup_data.  coord_data!=NULL) coord_data  = setup_data.  coord_data->AllocDevice(false);
    if(setup_data.  input_data!=NULL) input_data  = setup_data.  input_data->AllocDevice(false);
    if(setup_data. output_data!=NULL) output_data = setup_data. output_data->AllocDevice(false);
    ptr_single_layer_kernel=setup_data.kernel->dev_ker_poten;
    ptr_double_layer_kernel=setup_data.kernel->dev_dbl_layer_poten;
  }else{ // Host path: alias the host buffers directly (no transfer).
    dev_buff    =       this-> dev_buffer;
    interac_data= setup_data.interac_data;
    if(setup_data.  coord_data!=NULL) coord_data  =*setup_data.  coord_data;
    if(setup_data.  input_data!=NULL) input_data  =*setup_data.  input_data;
    if(setup_data. output_data!=NULL) output_data =*setup_data. output_data;
    ptr_single_layer_kernel=(size_t)setup_data.kernel->ker_poten;
    ptr_double_layer_kernel=(size_t)setup_data.kernel->dbl_layer_poten;
  }
  Profile::Toc();
  Profile::Tic("DeviceComp",&this->comm,false,20);
  int lock_idx=-1;
  int wait_lock_idx=-1;
  // Chain offloaded computations: wait on the previous lock, signal ours.
  if(device) wait_lock_idx=MIC_Lock::curr_lock();
  if(device) lock_idx=MIC_Lock::get_lock();
#ifdef __INTEL_OFFLOAD
#pragma offload if(device) target(mic:0) signal(&MIC_Lock::lock_vec[device?lock_idx:0])
#endif
  { // Offloaded computation.
    // Local mirrors of the structures used by the packing side. These must
    // remain layout-compatible (same fields, same order) with the structs
    // used by the setup routine, because the packed header is recovered
    // below with a raw pointer cast.
    struct PackedData{
      size_t len;
      Matrix<Real_t>* ptr;
      Vector<size_t> cnt;  // per-node element count
      Vector<size_t> dsp;  // per-node displacement into *ptr
    };
    struct InteracData{
      Vector<size_t> in_node;       // source node id for each interaction
      Vector<size_t> scal_idx;      // scaling (depth) index per interaction
      Vector<Real_t> coord_shift;   // COORD_DIM shift per interaction
      Vector<size_t> interac_cnt;   // interactions per target node
      Vector<size_t> interac_dsp;   // displacement per target node
      Vector<size_t> interac_cst;   // cumulative cost (for load balancing)
      Vector<Real_t> scal[4*MAX_DEPTH]; // 4 scaling vectors per depth
      Matrix<Real_t> M[4];          // optional pre/post transform matrices
    };
    struct ptSetupData{
      int level;
      const Kernel<Real_t>* kernel;
      PackedData src_coord; // Src coord
      PackedData src_value; // Src density
      PackedData srf_coord; // Srf coord
      PackedData srf_value; // Srf density
      PackedData trg_coord; // Trg coord
      PackedData trg_value; // Trg potential
      InteracData interac_data;
    };
    ptSetupData data;
    { // Initialize data
      // Serialized header layout; must match the packer's PackedSetupData
      // field-for-field.
      struct PackedSetupData{
        size_t size;
        int level;
        const Kernel<Real_t>* kernel;
        Matrix<Real_t>* src_coord; // Src coord
        Matrix<Real_t>* src_value; // Src density
        Matrix<Real_t>* srf_coord; // Srf coord
        Matrix<Real_t>* srf_value; // Srf density
        Matrix<Real_t>* trg_coord; // Trg coord
        Matrix<Real_t>* trg_value; // Trg potential
        size_t src_coord_cnt_size; size_t src_coord_cnt_offset;
        size_t src_coord_dsp_size; size_t src_coord_dsp_offset;
        size_t src_value_cnt_size; size_t src_value_cnt_offset;
        size_t src_value_dsp_size; size_t src_value_dsp_offset;
        size_t srf_coord_cnt_size; size_t srf_coord_cnt_offset;
        size_t srf_coord_dsp_size; size_t srf_coord_dsp_offset;
        size_t srf_value_cnt_size; size_t srf_value_cnt_offset;
        size_t srf_value_dsp_size; size_t srf_value_dsp_offset;
        size_t trg_coord_cnt_size; size_t trg_coord_cnt_offset;
        size_t trg_coord_dsp_size; size_t trg_coord_dsp_offset;
        size_t trg_value_cnt_size; size_t trg_value_cnt_offset;
        size_t trg_value_dsp_size; size_t trg_value_dsp_offset;
        // interac_data
        size_t in_node_size; size_t in_node_offset;
        size_t scal_idx_size; size_t scal_idx_offset;
        size_t coord_shift_size; size_t coord_shift_offset;
        size_t interac_cnt_size; size_t interac_cnt_offset;
        size_t interac_dsp_size; size_t interac_dsp_offset;
        size_t interac_cst_size; size_t interac_cst_offset;
        size_t scal_dim[4*MAX_DEPTH]; size_t scal_offset[4*MAX_DEPTH];
        size_t Mdim[4][2]; size_t M_offset[4];
      };
      typename Matrix<char>::Device& setupdata=interac_data;
      // Reinterpret the front of the buffer as the header.
      PackedSetupData& pkd_data=*((PackedSetupData*)setupdata[0]);
      data. level=pkd_data. level;
      data.kernel=pkd_data.kernel;
      data.src_coord.ptr=pkd_data.src_coord;
      data.src_value.ptr=pkd_data.src_value;
      data.srf_coord.ptr=pkd_data.srf_coord;
      data.srf_value.ptr=pkd_data.srf_value;
      data.trg_coord.ptr=pkd_data.trg_coord;
      data.trg_value.ptr=pkd_data.trg_value;
      // All ReInit(...,false) calls below create non-owning views into the
      // packed buffer: no copies, no deallocation on destruction.
      data.src_coord.cnt.ReInit(pkd_data.src_coord_cnt_size, (size_t*)&setupdata[0][pkd_data.src_coord_cnt_offset], false);
      data.src_coord.dsp.ReInit(pkd_data.src_coord_dsp_size, (size_t*)&setupdata[0][pkd_data.src_coord_dsp_offset], false);
      data.src_value.cnt.ReInit(pkd_data.src_value_cnt_size, (size_t*)&setupdata[0][pkd_data.src_value_cnt_offset], false);
      data.src_value.dsp.ReInit(pkd_data.src_value_dsp_size, (size_t*)&setupdata[0][pkd_data.src_value_dsp_offset], false);
      data.srf_coord.cnt.ReInit(pkd_data.srf_coord_cnt_size, (size_t*)&setupdata[0][pkd_data.srf_coord_cnt_offset], false);
      data.srf_coord.dsp.ReInit(pkd_data.srf_coord_dsp_size, (size_t*)&setupdata[0][pkd_data.srf_coord_dsp_offset], false);
      data.srf_value.cnt.ReInit(pkd_data.srf_value_cnt_size, (size_t*)&setupdata[0][pkd_data.srf_value_cnt_offset], false);
      data.srf_value.dsp.ReInit(pkd_data.srf_value_dsp_size, (size_t*)&setupdata[0][pkd_data.srf_value_dsp_offset], false);
      data.trg_coord.cnt.ReInit(pkd_data.trg_coord_cnt_size, (size_t*)&setupdata[0][pkd_data.trg_coord_cnt_offset], false);
      data.trg_coord.dsp.ReInit(pkd_data.trg_coord_dsp_size, (size_t*)&setupdata[0][pkd_data.trg_coord_dsp_offset], false);
      data.trg_value.cnt.ReInit(pkd_data.trg_value_cnt_size, (size_t*)&setupdata[0][pkd_data.trg_value_cnt_offset], false);
      data.trg_value.dsp.ReInit(pkd_data.trg_value_dsp_size, (size_t*)&setupdata[0][pkd_data.trg_value_dsp_offset], false);
      InteracData& intdata=data.interac_data;
      intdata. in_node.ReInit(pkd_data. in_node_size, (size_t*)&setupdata[0][pkd_data. in_node_offset],false);
      intdata. scal_idx.ReInit(pkd_data. scal_idx_size, (size_t*)&setupdata[0][pkd_data. scal_idx_offset],false);
      intdata.coord_shift.ReInit(pkd_data.coord_shift_size, (Real_t*)&setupdata[0][pkd_data.coord_shift_offset],false);
      intdata.interac_cnt.ReInit(pkd_data.interac_cnt_size, (size_t*)&setupdata[0][pkd_data.interac_cnt_offset],false);
      intdata.interac_dsp.ReInit(pkd_data.interac_dsp_size, (size_t*)&setupdata[0][pkd_data.interac_dsp_offset],false);
      intdata.interac_cst.ReInit(pkd_data.interac_cst_size, (size_t*)&setupdata[0][pkd_data.interac_cst_offset],false);
      for(size_t i=0;i<4*MAX_DEPTH;i++){
        intdata.scal[i].ReInit(pkd_data.scal_dim[i], (Real_t*)&setupdata[0][pkd_data.scal_offset[i]],false);
      }
      for(size_t i=0;i<4;i++){
        intdata.M[i].ReInit(pkd_data.Mdim[i][0], pkd_data.Mdim[i][1], (Real_t*)&setupdata[0][pkd_data.M_offset[i]],false);
      }
    }
    // Wait for the previous offloaded stage to finish before touching data.
    if(device) MIC_Lock::wait_lock(wait_lock_idx);
    { // Compute interactions
      InteracData& intdata=data.interac_data;
      // Recover callable kernel functions from the integer-carried pointers.
      typename Kernel<Real_t>::Ker_t single_layer_kernel=(typename Kernel<Real_t>::Ker_t)ptr_single_layer_kernel;
      typename Kernel<Real_t>::Ker_t double_layer_kernel=(typename Kernel<Real_t>::Ker_t)ptr_double_layer_kernel;
      int omp_p=omp_get_max_threads();
#pragma omp parallel for
      for(size_t tid=0;tid<omp_p;tid++){
        Matrix<Real_t> src_coord, src_value;
        Matrix<Real_t> srf_coord, srf_value;
        Matrix<Real_t> trg_coord, trg_value;
        Vector<Real_t> buff;
        { // init buff
          // Carve an equal per-thread slice out of the shared device buffer.
          size_t thread_buff_size=dev_buff.dim/sizeof(Real_t)/omp_p;
          buff.ReInit(thread_buff_size, (Real_t*)&dev_buff[tid*thread_buff_size*sizeof(Real_t)], false);
        }
        size_t vcnt=0;
        std::vector<Matrix<Real_t> > vbuff(6);
        { // init vbuff[0:5]
          // Row widths of the six scratch matrices, derived from the shapes
          // of the transform matrices:
          //   vbuff[0]*M[0] -> vbuff[1],  vbuff[1]*M[1] -> vbuff[2]  (source side)
          //   vbuff[3]*M[2] -> vbuff[4],  vbuff[4]*M[3] -> vbuff[5]  (target side)
          size_t vdim_=0, vdim[6];
          for(size_t indx=0;indx<6;indx++){
            vdim[indx]=0;
            switch(indx){
              case 0:
                vdim[indx]=intdata.M[0].Dim(0); break;
              case 1:
                assert(intdata.M[0].Dim(1)==intdata.M[1].Dim(0));
                vdim[indx]=intdata.M[0].Dim(1); break;
              case 2:
                vdim[indx]=intdata.M[1].Dim(1); break;
              case 3:
                vdim[indx]=intdata.M[2].Dim(0); break;
              case 4:
                assert(intdata.M[2].Dim(1)==intdata.M[3].Dim(0));
                vdim[indx]=intdata.M[2].Dim(1); break;
              case 5:
                vdim[indx]=intdata.M[3].Dim(1); break;
              default:
                vdim[indx]=0; break;
            }
            vdim_+=vdim[indx];
          }
          if(vdim_){
            // Half the thread buffer for vbuff rows; the rest stays free
            // (used later for shifted coordinates).
            vcnt=buff.Dim()/vdim_/2;
            assert(vcnt>0); // Thread buffer is too small
          }
          for(size_t indx=0;indx<6;indx++){ // init vbuff[0:5]
            // Place vbuff[indx] at the head of buff, then shrink buff past it.
            vbuff[indx].ReInit(vcnt,vdim[indx],&buff[0],false);
            buff.ReInit(buff.Dim()-vdim[indx]*vcnt, &buff[vdim[indx]*vcnt], false);
          }
        }
        size_t trg_a, trg_b;
        { // Determine trg_a, trg_b
          //trg_a=((tid+0)*intdata.interac_cnt.Dim())/omp_p;
          //trg_b=((tid+1)*intdata.interac_cnt.Dim())/omp_p;
          // Cost-balanced partition: split target nodes so each thread gets
          // roughly equal cumulative interaction cost (interac_cst is a
          // prefix sum built by the setup phase).
          Vector<size_t>& interac_cst=intdata.interac_cst;
          size_t cost=interac_cst[interac_cst.Dim()-1];
          trg_a=std::lower_bound(&interac_cst[0],&interac_cst[interac_cst.Dim()-1],(cost*(tid+0))/omp_p)-&interac_cst[0]+1;
          trg_b=std::lower_bound(&interac_cst[0],&interac_cst[interac_cst.Dim()-1],(cost*(tid+1))/omp_p)-&interac_cst[0]+1;
          if(tid==0) trg_a=0;
        }
        // Process targets in batches of trg1_max nodes whose combined
        // interaction count fits in the vcnt rows of the scratch matrices.
        for(size_t trg0=trg_a;trg0<trg_b;){
          size_t trg1_max=1;
          if(vcnt){ // Find trg1_max
            size_t interac_cnt=intdata.interac_cnt[trg0];
            while(trg0+trg1_max<trg_b){
              interac_cnt+=intdata.interac_cnt[trg0+trg1_max];
              if(interac_cnt>vcnt){
                interac_cnt-=intdata.interac_cnt[trg0+trg1_max];
                break;
              }
              trg1_max++;
            }
            assert(interac_cnt<=vcnt);
            // Shrink the scratch matrices to the rows actually used.
            for(size_t k=0;k<6;k++){
              if(vbuff[k].Dim(0)*vbuff[k].Dim(1)){
                vbuff[k].ReInit(interac_cnt,vbuff[k].Dim(1),vbuff[k][0]);
              }
            }
          }else{
            trg1_max=trg_b-trg0;
          }
          if(intdata.M[0].Dim(0) && intdata.M[0].Dim(1) && intdata.M[1].Dim(0) && intdata.M[1].Dim(1)){ // src mat-vec
            // Transform source densities: scale, multiply by M[0] then M[1],
            // scale again. Result lands in vbuff[2].
            size_t interac_idx=0;
            for(size_t trg1=0;trg1<trg1_max;trg1++){ // Copy src_value to vbuff[0]
              size_t trg=trg0+trg1;
              for(size_t i=0;i<intdata.interac_cnt[trg];i++){
                size_t int_id=intdata.interac_dsp[trg]+i;
                size_t src=intdata.in_node[int_id];
                src_value.ReInit(1, data.src_value.cnt[src], &data.src_value.ptr[0][0][data.src_value.dsp[src]], false);
                { // Copy src_value to vbuff[0]
                  size_t vdim=vbuff[0].Dim(1);
                  assert(src_value.Dim(1)==vdim);
                  for(size_t j=0;j<vdim;j++) vbuff[0][interac_idx][j]=src_value[0][j];
                }
                size_t scal_idx=intdata.scal_idx[int_id];
                { // scaling
                  // Apply per-depth scaling vector 0; scal repeats with
                  // period scal_dim across the row.
                  Matrix<Real_t>& vec=vbuff[0];
                  Vector<Real_t>& scal=intdata.scal[scal_idx*4+0];
                  size_t scal_dim=scal.Dim();
                  if(scal_dim){
                    size_t vdim=vec.Dim(1);
                    for(size_t j=0;j<vdim;j+=scal_dim){
                      for(size_t k=0;k<scal_dim;k++){
                        vec[interac_idx][j+k]*=scal[k];
                      }
                    }
                  }
                }
                interac_idx++;
              }
            }
            Matrix<Real_t>::GEMM(vbuff[1],vbuff[0],intdata.M[0]);
            Matrix<Real_t>::GEMM(vbuff[2],vbuff[1],intdata.M[1]);
            interac_idx=0;
            for(size_t trg1=0;trg1<trg1_max;trg1++){
              size_t trg=trg0+trg1;
              for(size_t i=0;i<intdata.interac_cnt[trg];i++){
                size_t int_id=intdata.interac_dsp[trg]+i;
                size_t scal_idx=intdata.scal_idx[int_id];
                { // scaling
                  // Post-GEMM scaling with vector 1.
                  Matrix<Real_t>& vec=vbuff[2];
                  Vector<Real_t>& scal=intdata.scal[scal_idx*4+1];
                  size_t scal_dim=scal.Dim();
                  if(scal_dim){
                    size_t vdim=vec.Dim(1);
                    for(size_t j=0;j<vdim;j+=scal_dim){
                      for(size_t k=0;k<scal_dim;k++){
                        vec[interac_idx][j+k]*=scal[k];
                      }
                    }
                  }
                }
                interac_idx++;
              }
            }
          }
          if(intdata.M[2].Dim(0) && intdata.M[2].Dim(1) && intdata.M[3].Dim(0) && intdata.M[3].Dim(1)){ // init vbuff[3]
            // Kernel results will be accumulated into vbuff[3]; zero it first.
            size_t vdim=vbuff[3].Dim(0)*vbuff[3].Dim(1);
            for(size_t i=0;i<vdim;i++) vbuff[3][0][i]=0;
          }
          { // Evaluate kernel functions
            size_t interac_idx=0;
            for(size_t trg1=0;trg1<trg1_max;trg1++){
              size_t trg=trg0+trg1;
              trg_coord.ReInit(1, data.trg_coord.cnt[trg], &data.trg_coord.ptr[0][0][data.trg_coord.dsp[trg]], false);
              trg_value.ReInit(1, data.trg_value.cnt[trg], &data.trg_value.ptr[0][0][data.trg_value.dsp[trg]], false);
              for(size_t i=0;i<intdata.interac_cnt[trg];i++){
                size_t int_id=intdata.interac_dsp[trg]+i;
                size_t src=intdata.in_node[int_id];
                src_coord.ReInit(1, data.src_coord.cnt[src], &data.src_coord.ptr[0][0][data.src_coord.dsp[src]], false);
                src_value.ReInit(1, data.src_value.cnt[src], &data.src_value.ptr[0][0][data.src_value.dsp[src]], false);
                srf_coord.ReInit(1, data.srf_coord.cnt[src], &data.srf_coord.ptr[0][0][data.srf_coord.dsp[src]], false);
                srf_value.ReInit(1, data.srf_value.cnt[src], &data.srf_value.ptr[0][0][data.srf_value.dsp[src]], false);
                // If a source transform was done, read densities from
                // vbuff[2]; if a target transform follows, write into
                // vbuff[3]; otherwise use the raw src/trg arrays directly.
                Real_t* vbuff2_ptr=(vbuff[2].Dim(0)*vbuff[2].Dim(1)?vbuff[2][interac_idx]:src_value[0]);
                Real_t* vbuff3_ptr=(vbuff[3].Dim(0)*vbuff[3].Dim(1)?vbuff[3][interac_idx]:trg_value[0]);
                { // coord_shift
                  // Apply periodic/relative shift to source coords, using
                  // the free tail of the thread buffer as scratch.
                  Real_t* shift=&intdata.coord_shift[int_id*COORD_DIM];
                  if(shift[0]!=0 || shift[1]!=0 || shift[2]!=0){
                    size_t vdim=src_coord.Dim(1);
                    Vector<Real_t> new_coord(vdim, &buff[0], false);
                    assert(buff.Dim()>=vdim); // Thread buffer is too small
                    //buff.ReInit(buff.Dim()-vdim, &buff[vdim], false);
                    for(size_t j=0;j<vdim;j+=COORD_DIM){
                      for(size_t k=0;k<COORD_DIM;k++){
                        new_coord[j+k]=src_coord[0][j+k]+shift[k];
                      }
                    }
                    src_coord.ReInit(1, vdim, &new_coord[0], false);
                  }
                }
                if(src_coord.Dim(1)){
                  assert(ptr_single_layer_kernel); // assert(Single-layer kernel is implemented)
                  single_layer_kernel(src_coord[0], src_coord.Dim(1)/COORD_DIM, vbuff2_ptr, 1,
                                      trg_coord[0], trg_coord.Dim(1)/COORD_DIM, vbuff3_ptr, NULL);
                }
                if(srf_coord.Dim(1)){
                  assert(ptr_double_layer_kernel); // assert(Double-layer kernel is implemented)
                  // NOTE(review): unlike the single-layer call above, this
                  // writes straight to trg_value[0] and reads srf_value[0],
                  // bypassing vbuff[2]/vbuff[3] (so no M[2]*M[3] post
                  // transform or scaling applies to the double-layer part).
                  // Looks intentional for surface densities — confirm
                  // against the setup routines that populate srf_*.
                  double_layer_kernel(srf_coord[0], srf_coord.Dim(1)/COORD_DIM, srf_value[0], 1,
                                      trg_coord[0], trg_coord.Dim(1)/COORD_DIM, trg_value[0], NULL);
                }
                interac_idx++;
              }
            }
          }
          if(intdata.M[2].Dim(0) && intdata.M[2].Dim(1) && intdata.M[3].Dim(0) && intdata.M[3].Dim(1)){ // trg mat-vec
            // Transform accumulated kernel output: scale, multiply by M[2]
            // then M[3], scale again, and add into the target values.
            size_t interac_idx=0;
            for(size_t trg1=0;trg1<trg1_max;trg1++){
              size_t trg=trg0+trg1;
              for(size_t i=0;i<intdata.interac_cnt[trg];i++){
                size_t int_id=intdata.interac_dsp[trg]+i;
                size_t scal_idx=intdata.scal_idx[int_id];
                { // scaling
                  // Pre-GEMM scaling with vector 2.
                  Matrix<Real_t>& vec=vbuff[3];
                  Vector<Real_t>& scal=intdata.scal[scal_idx*4+2];
                  size_t scal_dim=scal.Dim();
                  if(scal_dim){
                    size_t vdim=vec.Dim(1);
                    for(size_t j=0;j<vdim;j+=scal_dim){
                      for(size_t k=0;k<scal_dim;k++){
                        vec[interac_idx][j+k]*=scal[k];
                      }
                    }
                  }
                }
                interac_idx++;
              }
            }
            Matrix<Real_t>::GEMM(vbuff[4],vbuff[3],intdata.M[2]);
            Matrix<Real_t>::GEMM(vbuff[5],vbuff[4],intdata.M[3]);
            interac_idx=0;
            for(size_t trg1=0;trg1<trg1_max;trg1++){
              size_t trg=trg0+trg1;
              trg_value.ReInit(1, data.trg_value.cnt[trg], &data.trg_value.ptr[0][0][data.trg_value.dsp[trg]], false);
              for(size_t i=0;i<intdata.interac_cnt[trg];i++){
                size_t int_id=intdata.interac_dsp[trg]+i;
                size_t scal_idx=intdata.scal_idx[int_id];
                { // scaling
                  // Post-GEMM scaling with vector 3.
                  Matrix<Real_t>& vec=vbuff[5];
                  Vector<Real_t>& scal=intdata.scal[scal_idx*4+3];
                  size_t scal_dim=scal.Dim();
                  if(scal_dim){
                    size_t vdim=vec.Dim(1);
                    for(size_t j=0;j<vdim;j+=scal_dim){
                      for(size_t k=0;k<scal_dim;k++){
                        vec[interac_idx][j+k]*=scal[k];
                      }
                    }
                  }
                }
                { // Add vbuff[5] to trg_value
                  size_t vdim=vbuff[5].Dim(1);
                  assert(trg_value.Dim(1)==vdim);
                  for(size_t i=0;i<vdim;i++) trg_value[0][i]+=vbuff[5][interac_idx][i];
                }
                interac_idx++;
              }
            }
          }
          trg0+=trg1_max;
        }
      }
    }
    // Mark this offloaded stage complete so the next one may proceed.
    if(device) MIC_Lock::release_lock(lock_idx);
  }
#ifdef __INTEL_OFFLOAD
  if(SYNC){
    // Synchronous mode: block until the offloaded computation has finished.
#pragma offload if(device) target(mic:0)
    {if(device) MIC_Lock::wait_lock(lock_idx);}
  }
#endif
  Profile::Toc();
}
  3643. template <class FMMNode>
  3644. void FMM_Pts<FMMNode>::X_ListSetup(SetupData<Real_t>& setup_data, FMMTree_t* tree, std::vector<Matrix<Real_t> >& buff, std::vector<Vector<FMMNode_t*> >& n_list, int level, bool device){
  3645. if(!this->MultipoleOrder()) return;
  3646. { // Set setup_data
  3647. setup_data. level=level;
  3648. setup_data.kernel=kernel->k_s2l;
  3649. setup_data. input_data=&buff[4];
  3650. setup_data.output_data=&buff[1];
  3651. setup_data. coord_data=&buff[6];
  3652. Vector<FMMNode_t*>& nodes_in =n_list[4];
  3653. Vector<FMMNode_t*>& nodes_out=n_list[1];
  3654. setup_data.nodes_in .clear();
  3655. setup_data.nodes_out.clear();
  3656. for(size_t i=0;i<nodes_in .Dim();i++) if((level==0 || level==-1) && nodes_in [i]->pt_cnt[0] && nodes_in [i]->IsLeaf() ) setup_data.nodes_in .push_back(nodes_in [i]);
  3657. for(size_t i=0;i<nodes_out.Dim();i++) if((level==0 || level==-1) && nodes_out[i]->pt_cnt[1] && !nodes_out[i]->IsGhost()) setup_data.nodes_out.push_back(nodes_out[i]);
  3658. }
  3659. struct PackedData{
  3660. size_t len;
  3661. Matrix<Real_t>* ptr;
  3662. Vector<size_t> cnt;
  3663. Vector<size_t> dsp;
  3664. };
  3665. struct InteracData{
  3666. Vector<size_t> in_node;
  3667. Vector<size_t> scal_idx;
  3668. Vector<Real_t> coord_shift;
  3669. Vector<size_t> interac_cnt;
  3670. Vector<size_t> interac_dsp;
  3671. Vector<size_t> interac_cst;
  3672. Vector<Real_t> scal[4*MAX_DEPTH];
  3673. Matrix<Real_t> M[4];
  3674. };
  3675. struct ptSetupData{
  3676. int level;
  3677. const Kernel<Real_t>* kernel;
  3678. PackedData src_coord; // Src coord
  3679. PackedData src_value; // Src density
  3680. PackedData srf_coord; // Srf coord
  3681. PackedData srf_value; // Srf density
  3682. PackedData trg_coord; // Trg coord
  3683. PackedData trg_value; // Trg potential
  3684. InteracData interac_data;
  3685. };
  3686. ptSetupData data;
  3687. data. level=setup_data. level;
  3688. data.kernel=setup_data.kernel;
  3689. std::vector<void*>& nodes_in =setup_data.nodes_in ;
  3690. std::vector<void*>& nodes_out=setup_data.nodes_out;
  3691. { // Set src data
  3692. std::vector<void*>& nodes=nodes_in;
  3693. PackedData& coord=data.src_coord;
  3694. PackedData& value=data.src_value;
  3695. coord.ptr=setup_data. coord_data;
  3696. value.ptr=setup_data. input_data;
  3697. coord.len=coord.ptr->Dim(0)*coord.ptr->Dim(1);
  3698. value.len=value.ptr->Dim(0)*value.ptr->Dim(1);
  3699. coord.cnt.ReInit(nodes.size());
  3700. coord.dsp.ReInit(nodes.size());
  3701. value.cnt.ReInit(nodes.size());
  3702. value.dsp.ReInit(nodes.size());
  3703. #pragma omp parallel for
  3704. for(size_t i=0;i<nodes.size();i++){
  3705. ((FMMNode_t*)nodes[i])->node_id=i;
  3706. Vector<Real_t>& coord_vec=((FMMNode_t*)nodes[i])->src_coord;
  3707. Vector<Real_t>& value_vec=((FMMNode_t*)nodes[i])->src_value;
  3708. if(coord_vec.Dim()){
  3709. coord.dsp[i]=&coord_vec[0]-coord.ptr[0][0];
  3710. assert(coord.dsp[i]<coord.len);
  3711. coord.cnt[i]=coord_vec.Dim();
  3712. }else{
  3713. coord.dsp[i]=0;
  3714. coord.cnt[i]=0;
  3715. }
  3716. if(value_vec.Dim()){
  3717. value.dsp[i]=&value_vec[0]-value.ptr[0][0];
  3718. assert(value.dsp[i]<value.len);
  3719. value.cnt[i]=value_vec.Dim();
  3720. }else{
  3721. value.dsp[i]=0;
  3722. value.cnt[i]=0;
  3723. }
  3724. }
  3725. }
  3726. { // Set srf data
  3727. std::vector<void*>& nodes=nodes_in;
  3728. PackedData& coord=data.srf_coord;
  3729. PackedData& value=data.srf_value;
  3730. coord.ptr=setup_data. coord_data;
  3731. value.ptr=setup_data. input_data;
  3732. coord.len=coord.ptr->Dim(0)*coord.ptr->Dim(1);
  3733. value.len=value.ptr->Dim(0)*value.ptr->Dim(1);
  3734. coord.cnt.ReInit(nodes.size());
  3735. coord.dsp.ReInit(nodes.size());
  3736. value.cnt.ReInit(nodes.size());
  3737. value.dsp.ReInit(nodes.size());
  3738. #pragma omp parallel for
  3739. for(size_t i=0;i<nodes.size();i++){
  3740. Vector<Real_t>& coord_vec=((FMMNode_t*)nodes[i])->surf_coord;
  3741. Vector<Real_t>& value_vec=((FMMNode_t*)nodes[i])->surf_value;
  3742. if(coord_vec.Dim()){
  3743. coord.dsp[i]=&coord_vec[0]-coord.ptr[0][0];
  3744. assert(coord.dsp[i]<coord.len);
  3745. coord.cnt[i]=coord_vec.Dim();
  3746. }else{
  3747. coord.dsp[i]=0;
  3748. coord.cnt[i]=0;
  3749. }
  3750. if(value_vec.Dim()){
  3751. value.dsp[i]=&value_vec[0]-value.ptr[0][0];
  3752. assert(value.dsp[i]<value.len);
  3753. value.cnt[i]=value_vec.Dim();
  3754. }else{
  3755. value.dsp[i]=0;
  3756. value.cnt[i]=0;
  3757. }
  3758. }
  3759. }
  3760. { // Set trg data
  3761. std::vector<void*>& nodes=nodes_out;
  3762. PackedData& coord=data.trg_coord;
  3763. PackedData& value=data.trg_value;
  3764. coord.ptr=setup_data. coord_data;
  3765. value.ptr=setup_data.output_data;
  3766. coord.len=coord.ptr->Dim(0)*coord.ptr->Dim(1);
  3767. value.len=value.ptr->Dim(0)*value.ptr->Dim(1);
  3768. coord.cnt.ReInit(nodes.size());
  3769. coord.dsp.ReInit(nodes.size());
  3770. value.cnt.ReInit(nodes.size());
  3771. value.dsp.ReInit(nodes.size());
  3772. #pragma omp parallel for
  3773. for(size_t i=0;i<nodes.size();i++){
  3774. Vector<Real_t>& coord_vec=tree->dnwd_check_surf[((FMMNode*)nodes[i])->Depth()];
  3775. Vector<Real_t>& value_vec=((FMMData*)((FMMNode*)nodes[i])->FMMData())->dnward_equiv;
  3776. if(coord_vec.Dim()){
  3777. coord.dsp[i]=&coord_vec[0]-coord.ptr[0][0];
  3778. assert(coord.dsp[i]<coord.len);
  3779. coord.cnt[i]=coord_vec.Dim();
  3780. }else{
  3781. coord.dsp[i]=0;
  3782. coord.cnt[i]=0;
  3783. }
  3784. if(value_vec.Dim()){
  3785. value.dsp[i]=&value_vec[0]-value.ptr[0][0];
  3786. assert(value.dsp[i]<value.len);
  3787. value.cnt[i]=value_vec.Dim();
  3788. }else{
  3789. value.dsp[i]=0;
  3790. value.cnt[i]=0;
  3791. }
  3792. }
  3793. }
  3794. { // Set interac_data
  3795. int omp_p=omp_get_max_threads();
  3796. std::vector<std::vector<size_t> > in_node_(omp_p);
  3797. std::vector<std::vector<size_t> > scal_idx_(omp_p);
  3798. std::vector<std::vector<Real_t> > coord_shift_(omp_p);
  3799. std::vector<std::vector<size_t> > interac_cnt_(omp_p);
  3800. size_t m=this->MultipoleOrder();
  3801. size_t Nsrf=(6*(m-1)*(m-1)+2);
  3802. #pragma omp parallel for
  3803. for(size_t tid=0;tid<omp_p;tid++){
  3804. std::vector<size_t>& in_node =in_node_[tid] ;
  3805. std::vector<size_t>& scal_idx =scal_idx_[tid] ;
  3806. std::vector<Real_t>& coord_shift=coord_shift_[tid];
  3807. std::vector<size_t>& interac_cnt=interac_cnt_[tid] ;
  3808. size_t a=(nodes_out.size()*(tid+0))/omp_p;
  3809. size_t b=(nodes_out.size()*(tid+1))/omp_p;
  3810. for(size_t i=a;i<b;i++){
  3811. FMMNode_t* tnode=(FMMNode_t*)nodes_out[i];
  3812. if(tnode->IsLeaf() && tnode->pt_cnt[1]<=Nsrf){ // skip: handled in U-list
  3813. interac_cnt.push_back(0);
  3814. continue;
  3815. }
  3816. Real_t s=std::pow(0.5,tnode->Depth());
  3817. size_t interac_cnt_=0;
  3818. { // X_Type
  3819. Mat_Type type=X_Type;
  3820. Vector<FMMNode_t*>& intlst=tnode->interac_list[type];
  3821. for(size_t j=0;j<intlst.Dim();j++) if(intlst[j]){
  3822. FMMNode_t* snode=intlst[j];
  3823. size_t snode_id=snode->node_id;
  3824. if(snode_id>=nodes_in.size() || nodes_in[snode_id]!=snode) continue;
  3825. in_node.push_back(snode_id);
  3826. scal_idx.push_back(snode->Depth());
  3827. { // set coord_shift
  3828. const int* rel_coord=interac_list.RelativeCoord(type,j);
  3829. const Real_t* scoord=snode->Coord();
  3830. const Real_t* tcoord=tnode->Coord();
  3831. Real_t shift[COORD_DIM];
  3832. shift[0]=rel_coord[0]*0.5*s-(scoord[0]+1.0*s)+(0+0.5*s);
  3833. shift[1]=rel_coord[1]*0.5*s-(scoord[1]+1.0*s)+(0+0.5*s);
  3834. shift[2]=rel_coord[2]*0.5*s-(scoord[2]+1.0*s)+(0+0.5*s);
  3835. coord_shift.push_back(shift[0]);
  3836. coord_shift.push_back(shift[1]);
  3837. coord_shift.push_back(shift[2]);
  3838. }
  3839. interac_cnt_++;
  3840. }
  3841. }
  3842. interac_cnt.push_back(interac_cnt_);
  3843. }
  3844. }
  3845. { // Combine interac data
  3846. InteracData& interac_data=data.interac_data;
  3847. { // in_node
  3848. typedef size_t ElemType;
  3849. std::vector<std::vector<ElemType> >& vec_=in_node_;
  3850. pvfmm::Vector<ElemType>& vec=interac_data.in_node;
  3851. std::vector<size_t> vec_dsp(omp_p+1,0);
  3852. for(size_t tid=0;tid<omp_p;tid++){
  3853. vec_dsp[tid+1]=vec_dsp[tid]+vec_[tid].size();
  3854. }
  3855. vec.ReInit(vec_dsp[omp_p]);
  3856. #pragma omp parallel for
  3857. for(size_t tid=0;tid<omp_p;tid++){
  3858. memcpy(&vec[0]+vec_dsp[tid],&vec_[tid][0],vec_[tid].size()*sizeof(ElemType));
  3859. }
  3860. }
  3861. { // scal_idx
  3862. typedef size_t ElemType;
  3863. std::vector<std::vector<ElemType> >& vec_=scal_idx_;
  3864. pvfmm::Vector<ElemType>& vec=interac_data.scal_idx;
  3865. std::vector<size_t> vec_dsp(omp_p+1,0);
  3866. for(size_t tid=0;tid<omp_p;tid++){
  3867. vec_dsp[tid+1]=vec_dsp[tid]+vec_[tid].size();
  3868. }
  3869. vec.ReInit(vec_dsp[omp_p]);
  3870. #pragma omp parallel for
  3871. for(size_t tid=0;tid<omp_p;tid++){
  3872. memcpy(&vec[0]+vec_dsp[tid],&vec_[tid][0],vec_[tid].size()*sizeof(ElemType));
  3873. }
  3874. }
  3875. { // coord_shift
  3876. typedef Real_t ElemType;
  3877. std::vector<std::vector<ElemType> >& vec_=coord_shift_;
  3878. pvfmm::Vector<ElemType>& vec=interac_data.coord_shift;
  3879. std::vector<size_t> vec_dsp(omp_p+1,0);
  3880. for(size_t tid=0;tid<omp_p;tid++){
  3881. vec_dsp[tid+1]=vec_dsp[tid]+vec_[tid].size();
  3882. }
  3883. vec.ReInit(vec_dsp[omp_p]);
  3884. #pragma omp parallel for
  3885. for(size_t tid=0;tid<omp_p;tid++){
  3886. memcpy(&vec[0]+vec_dsp[tid],&vec_[tid][0],vec_[tid].size()*sizeof(ElemType));
  3887. }
  3888. }
  3889. { // interac_cnt
  3890. typedef size_t ElemType;
  3891. std::vector<std::vector<ElemType> >& vec_=interac_cnt_;
  3892. pvfmm::Vector<ElemType>& vec=interac_data.interac_cnt;
  3893. std::vector<size_t> vec_dsp(omp_p+1,0);
  3894. for(size_t tid=0;tid<omp_p;tid++){
  3895. vec_dsp[tid+1]=vec_dsp[tid]+vec_[tid].size();
  3896. }
  3897. vec.ReInit(vec_dsp[omp_p]);
  3898. #pragma omp parallel for
  3899. for(size_t tid=0;tid<omp_p;tid++){
  3900. memcpy(&vec[0]+vec_dsp[tid],&vec_[tid][0],vec_[tid].size()*sizeof(ElemType));
  3901. }
  3902. }
  3903. { // interac_dsp
  3904. pvfmm::Vector<size_t>& cnt=interac_data.interac_cnt;
  3905. pvfmm::Vector<size_t>& dsp=interac_data.interac_dsp;
  3906. dsp.ReInit(cnt.Dim()); if(dsp.Dim()) dsp[0]=0;
  3907. omp_par::scan(&cnt[0],&dsp[0],dsp.Dim());
  3908. }
  3909. }
  3910. }
  3911. PtSetup(setup_data, &data);
  3912. }
  3913. template <class FMMNode>
  3914. void FMM_Pts<FMMNode>::X_List (SetupData<Real_t>& setup_data, bool device){
  3915. if(!this->MultipoleOrder()) return;
  3916. //Add X_List contribution.
  3917. this->EvalListPts(setup_data, device);
  3918. }
// W_ListSetup: build the point-interaction setup data for the W-list pass,
// where upward-equivalent "sources" are evaluated directly at leaf target
// points with the M2T kernel.  For every participating node it records a
// packed (cnt,dsp) view into the shared flat matrix buffers, enumerates the
// W_Type source/target pairs together with their coordinate shifts (staged
// per OpenMP thread, then concatenated), and finally hands everything to
// PtSetup() for evaluation.
template <class FMMNode>
void FMM_Pts<FMMNode>::W_ListSetup(SetupData<Real_t>& setup_data, FMMTree_t* tree, std::vector<Matrix<Real_t> >& buff, std::vector<Vector<FMMNode_t*> >& n_list, int level, bool device){
if(!this->MultipoleOrder()) return; // far-field expansions disabled => no W-list work
{ // Set setup_data
setup_data. level=level;
setup_data.kernel=kernel->k_m2t; // multipole-to-target kernel
setup_data. input_data=&buff[0]; // upward-equivalent densities
setup_data.output_data=&buff[5]; // target potentials
setup_data. coord_data=&buff[6]; // flat buffer holding all coordinates
Vector<FMMNode_t*>& nodes_in =n_list[0];
Vector<FMMNode_t*>& nodes_out=n_list[5];
setup_data.nodes_in .clear();
setup_data.nodes_out.clear();
// Sources: nodes carrying source points; targets: non-ghost leaves with target points.
for(size_t i=0;i<nodes_in .Dim();i++) if((level==0 || level==-1) && nodes_in [i]->pt_cnt[0] ) setup_data.nodes_in .push_back(nodes_in [i]);
for(size_t i=0;i<nodes_out.Dim();i++) if((level==0 || level==-1) && nodes_out[i]->pt_cnt[1] && nodes_out[i]->IsLeaf() && !nodes_out[i]->IsGhost()) setup_data.nodes_out.push_back(nodes_out[i]);
}
// Local descriptor structs; the layout must mirror what PtSetup expects
// (the same structs are redeclared in the sibling *_ListSetup routines).
struct PackedData{
size_t len; // total length of the underlying flat buffer
Matrix<Real_t>* ptr; // matrix whose storage holds all nodes' data back-to-back
Vector<size_t> cnt; // per-node element count
Vector<size_t> dsp; // per-node offset into *ptr
};
struct InteracData{
Vector<size_t> in_node; // source-node index for each interaction
Vector<size_t> scal_idx; // source-node depth (used as scaling index)
Vector<Real_t> coord_shift; // per-interaction 3D coordinate shift
Vector<size_t> interac_cnt; // number of interactions per target node
Vector<size_t> interac_dsp; // exclusive prefix sum of interac_cnt
Vector<size_t> interac_cst;
Vector<Real_t> scal[4*MAX_DEPTH];
Matrix<Real_t> M[4];
};
struct ptSetupData{
int level;
const Kernel<Real_t>* kernel;
PackedData src_coord; // Src coord
PackedData src_value; // Src density
PackedData srf_coord; // Srf coord
PackedData srf_value; // Srf density
PackedData trg_coord; // Trg coord
PackedData trg_value; // Trg potential
InteracData interac_data;
};
ptSetupData data;
data. level=setup_data. level;
data.kernel=setup_data.kernel;
std::vector<void*>& nodes_in =setup_data.nodes_in ;
std::vector<void*>& nodes_out=setup_data.nodes_out;
{ // Set src data
std::vector<void*>& nodes=nodes_in;
PackedData& coord=data.src_coord;
PackedData& value=data.src_value;
coord.ptr=setup_data. coord_data;
value.ptr=setup_data. input_data;
coord.len=coord.ptr->Dim(0)*coord.ptr->Dim(1);
value.len=value.ptr->Dim(0)*value.ptr->Dim(1);
coord.cnt.ReInit(nodes.size());
coord.dsp.ReInit(nodes.size());
value.cnt.ReInit(nodes.size());
value.dsp.ReInit(nodes.size());
#pragma omp parallel for
for(size_t i=0;i<nodes.size();i++){
((FMMNode_t*)nodes[i])->node_id=i; // record index so the target loop can locate sources
// W-list "source points" are the node's upward-equivalent surface and densities.
Vector<Real_t>& coord_vec=tree->upwd_equiv_surf[((FMMNode*)nodes[i])->Depth()];
Vector<Real_t>& value_vec=((FMMData*)((FMMNode*)nodes[i])->FMMData())->upward_equiv;
if(coord_vec.Dim()){
// Offsets are pointer differences into the flat buffer; the vector's
// storage is assumed to live inside *coord.ptr (checked by the assert).
coord.dsp[i]=&coord_vec[0]-coord.ptr[0][0];
assert(coord.dsp[i]<coord.len);
coord.cnt[i]=coord_vec.Dim();
}else{
coord.dsp[i]=0;
coord.cnt[i]=0;
}
if(value_vec.Dim()){
value.dsp[i]=&value_vec[0]-value.ptr[0][0];
assert(value.dsp[i]<value.len);
value.cnt[i]=value_vec.Dim();
}else{
value.dsp[i]=0;
value.cnt[i]=0;
}
}
}
{ // Set srf data
// The W-list pass uses no surface densities: all counts/offsets are zeroed.
std::vector<void*>& nodes=nodes_in;
PackedData& coord=data.srf_coord;
PackedData& value=data.srf_value;
coord.ptr=setup_data. coord_data;
value.ptr=setup_data. input_data;
coord.len=coord.ptr->Dim(0)*coord.ptr->Dim(1);
value.len=value.ptr->Dim(0)*value.ptr->Dim(1);
coord.cnt.ReInit(nodes.size());
coord.dsp.ReInit(nodes.size());
value.cnt.ReInit(nodes.size());
value.dsp.ReInit(nodes.size());
#pragma omp parallel for
for(size_t i=0;i<nodes.size();i++){
coord.dsp[i]=0;
coord.cnt[i]=0;
value.dsp[i]=0;
value.cnt[i]=0;
}
}
{ // Set trg data
std::vector<void*>& nodes=nodes_out;
PackedData& coord=data.trg_coord;
PackedData& value=data.trg_value;
coord.ptr=setup_data. coord_data;
value.ptr=setup_data.output_data;
coord.len=coord.ptr->Dim(0)*coord.ptr->Dim(1);
value.len=value.ptr->Dim(0)*value.ptr->Dim(1);
coord.cnt.ReInit(nodes.size());
coord.dsp.ReInit(nodes.size());
value.cnt.ReInit(nodes.size());
value.dsp.ReInit(nodes.size());
#pragma omp parallel for
for(size_t i=0;i<nodes.size();i++){
// Targets are the actual target points/potentials stored on each leaf.
Vector<Real_t>& coord_vec=((FMMNode_t*)nodes[i])->trg_coord;
Vector<Real_t>& value_vec=((FMMNode_t*)nodes[i])->trg_value;
if(coord_vec.Dim()){
coord.dsp[i]=&coord_vec[0]-coord.ptr[0][0];
assert(coord.dsp[i]<coord.len);
coord.cnt[i]=coord_vec.Dim();
}else{
coord.dsp[i]=0;
coord.cnt[i]=0;
}
if(value_vec.Dim()){
value.dsp[i]=&value_vec[0]-value.ptr[0][0];
assert(value.dsp[i]<value.len);
value.cnt[i]=value_vec.Dim();
}else{
value.dsp[i]=0;
value.cnt[i]=0;
}
}
}
{ // Set interac_data
int omp_p=omp_get_max_threads();
// Per-thread staging vectors; concatenated in "Combine interac data" below.
std::vector<std::vector<size_t> > in_node_(omp_p);
std::vector<std::vector<size_t> > scal_idx_(omp_p);
std::vector<std::vector<Real_t> > coord_shift_(omp_p);
std::vector<std::vector<size_t> > interac_cnt_(omp_p);
size_t m=this->MultipoleOrder();
size_t Nsrf=(6*(m-1)*(m-1)+2); // point count of an order-m equivalent surface
#pragma omp parallel for
for(size_t tid=0;tid<omp_p;tid++){
std::vector<size_t>& in_node =in_node_[tid] ;
std::vector<size_t>& scal_idx =scal_idx_[tid] ;
std::vector<Real_t>& coord_shift=coord_shift_[tid];
std::vector<size_t>& interac_cnt=interac_cnt_[tid] ;
// Static block partition of the target nodes across threads.
size_t a=(nodes_out.size()*(tid+0))/omp_p;
size_t b=(nodes_out.size()*(tid+1))/omp_p;
for(size_t i=a;i<b;i++){
FMMNode_t* tnode=(FMMNode_t*)nodes_out[i];
Real_t s=std::pow(0.5,tnode->Depth()); // box edge length at this depth
size_t interac_cnt_=0;
{ // W_Type
Mat_Type type=W_Type;
Vector<FMMNode_t*>& intlst=tnode->interac_list[type];
for(size_t j=0;j<intlst.Dim();j++) if(intlst[j]){
FMMNode_t* snode=intlst[j];
size_t snode_id=snode->node_id;
// Skip sources that were not selected into nodes_in.
if(snode_id>=nodes_in.size() || nodes_in[snode_id]!=snode) continue;
// Empty then-branch is intentional: non-leaf ghost nodes (no local point
// data) must be kept here; leaves with few source points are skipped
// because the U-list evaluates them directly.
if(snode->IsGhost() && snode->src_coord.Dim()+snode->surf_coord.Dim()==0){ // Is non-leaf ghost node
}else if(snode->IsLeaf() && snode->pt_cnt[0]<=Nsrf) continue; // skip: handled in U-list
in_node.push_back(snode_id);
scal_idx.push_back(snode->Depth());
{ // set coord_shift
// Translate the source's equivalent surface into the target's frame;
// rel_coord comes from the precomputed interaction-list tables.  The
// 0.25*s scaling matches a source box half the target size (W-list).
const int* rel_coord=interac_list.RelativeCoord(type,j);
const Real_t* scoord=snode->Coord();
const Real_t* tcoord=tnode->Coord();
Real_t shift[COORD_DIM];
shift[0]=rel_coord[0]*0.25*s-(0+0.25*s)+(tcoord[0]+0.5*s);
shift[1]=rel_coord[1]*0.25*s-(0+0.25*s)+(tcoord[1]+0.5*s);
shift[2]=rel_coord[2]*0.25*s-(0+0.25*s)+(tcoord[2]+0.5*s);
coord_shift.push_back(shift[0]);
coord_shift.push_back(shift[1]);
coord_shift.push_back(shift[2]);
}
interac_cnt_++;
}
}
interac_cnt.push_back(interac_cnt_);
}
}
{ // Combine interac data
// Concatenate the per-thread staging vectors into flat arrays.
InteracData& interac_data=data.interac_data;
{ // in_node
typedef size_t ElemType;
std::vector<std::vector<ElemType> >& vec_=in_node_;
pvfmm::Vector<ElemType>& vec=interac_data.in_node;
std::vector<size_t> vec_dsp(omp_p+1,0);
for(size_t tid=0;tid<omp_p;tid++){
vec_dsp[tid+1]=vec_dsp[tid]+vec_[tid].size();
}
vec.ReInit(vec_dsp[omp_p]);
#pragma omp parallel for
for(size_t tid=0;tid<omp_p;tid++){
memcpy(&vec[0]+vec_dsp[tid],&vec_[tid][0],vec_[tid].size()*sizeof(ElemType));
}
}
{ // scal_idx
typedef size_t ElemType;
std::vector<std::vector<ElemType> >& vec_=scal_idx_;
pvfmm::Vector<ElemType>& vec=interac_data.scal_idx;
std::vector<size_t> vec_dsp(omp_p+1,0);
for(size_t tid=0;tid<omp_p;tid++){
vec_dsp[tid+1]=vec_dsp[tid]+vec_[tid].size();
}
vec.ReInit(vec_dsp[omp_p]);
#pragma omp parallel for
for(size_t tid=0;tid<omp_p;tid++){
memcpy(&vec[0]+vec_dsp[tid],&vec_[tid][0],vec_[tid].size()*sizeof(ElemType));
}
}
{ // coord_shift
typedef Real_t ElemType;
std::vector<std::vector<ElemType> >& vec_=coord_shift_;
pvfmm::Vector<ElemType>& vec=interac_data.coord_shift;
std::vector<size_t> vec_dsp(omp_p+1,0);
for(size_t tid=0;tid<omp_p;tid++){
vec_dsp[tid+1]=vec_dsp[tid]+vec_[tid].size();
}
vec.ReInit(vec_dsp[omp_p]);
#pragma omp parallel for
for(size_t tid=0;tid<omp_p;tid++){
memcpy(&vec[0]+vec_dsp[tid],&vec_[tid][0],vec_[tid].size()*sizeof(ElemType));
}
}
{ // interac_cnt
typedef size_t ElemType;
std::vector<std::vector<ElemType> >& vec_=interac_cnt_;
pvfmm::Vector<ElemType>& vec=interac_data.interac_cnt;
std::vector<size_t> vec_dsp(omp_p+1,0);
for(size_t tid=0;tid<omp_p;tid++){
vec_dsp[tid+1]=vec_dsp[tid]+vec_[tid].size();
}
vec.ReInit(vec_dsp[omp_p]);
#pragma omp parallel for
for(size_t tid=0;tid<omp_p;tid++){
memcpy(&vec[0]+vec_dsp[tid],&vec_[tid][0],vec_[tid].size()*sizeof(ElemType));
}
}
{ // interac_dsp
// Exclusive prefix sum of interac_cnt gives each target node's offset
// into the flattened interaction arrays.
pvfmm::Vector<size_t>& cnt=interac_data.interac_cnt;
pvfmm::Vector<size_t>& dsp=interac_data.interac_dsp;
dsp.ReInit(cnt.Dim()); if(dsp.Dim()) dsp[0]=0;
omp_par::scan(&cnt[0],&dsp[0],dsp.Dim());
}
}
}
PtSetup(setup_data, &data);
}
  4173. template <class FMMNode>
  4174. void FMM_Pts<FMMNode>::W_List (SetupData<Real_t>& setup_data, bool device){
  4175. if(!this->MultipoleOrder()) return;
  4176. //Add W_List contribution.
  4177. this->EvalListPts(setup_data, device);
  4178. }
// U_ListSetup: build the point-interaction data for the near-field (U-list)
// pass, i.e. direct source-to-target evaluation with the S2T kernel.  Packs
// per-node source/surface/target data as (cnt,dsp) views into the shared
// flat buffers, then enumerates interactions of types U0/U1/U2 (adjacent
// boxes at coarser/equal/finer scale, per the shift scalings below) plus
// X-list and W-list fallbacks for nodes with few points, and hands the
// result to PtSetup().
template <class FMMNode>
void FMM_Pts<FMMNode>::U_ListSetup(SetupData<Real_t>& setup_data, FMMTree_t* tree, std::vector<Matrix<Real_t> >& buff, std::vector<Vector<FMMNode_t*> >& n_list, int level, bool device){
{ // Set setup_data
setup_data. level=level;
setup_data.kernel=kernel->k_s2t; // source-to-target (direct) kernel
setup_data. input_data=&buff[4]; // source densities
setup_data.output_data=&buff[5]; // target potentials
setup_data. coord_data=&buff[6]; // flat buffer holding all coordinates
Vector<FMMNode_t*>& nodes_in =n_list[4];
Vector<FMMNode_t*>& nodes_out=n_list[5];
setup_data.nodes_in .clear();
setup_data.nodes_out.clear();
// Sources: leaves with source points (ghosts included); targets: non-ghost leaves with target points.
for(size_t i=0;i<nodes_in .Dim();i++) if((level==0 || level==-1) && nodes_in [i]->pt_cnt[0] && nodes_in [i]->IsLeaf() ) setup_data.nodes_in .push_back(nodes_in [i]);
for(size_t i=0;i<nodes_out.Dim();i++) if((level==0 || level==-1) && nodes_out[i]->pt_cnt[1] && nodes_out[i]->IsLeaf() && !nodes_out[i]->IsGhost()) setup_data.nodes_out.push_back(nodes_out[i]);
}
// Local descriptor structs; layout mirrors the other *_ListSetup routines
// and what PtSetup expects.
struct PackedData{
size_t len; // total length of the underlying flat buffer
Matrix<Real_t>* ptr; // matrix whose storage holds all nodes' data back-to-back
Vector<size_t> cnt; // per-node element count
Vector<size_t> dsp; // per-node offset into *ptr
};
struct InteracData{
Vector<size_t> in_node; // source-node index for each interaction
Vector<size_t> scal_idx; // source-node depth (used as scaling index)
Vector<Real_t> coord_shift; // per-interaction 3D coordinate shift
Vector<size_t> interac_cnt; // number of interactions per target node
Vector<size_t> interac_dsp; // exclusive prefix sum of interac_cnt
Vector<size_t> interac_cst;
Vector<Real_t> scal[4*MAX_DEPTH];
Matrix<Real_t> M[4];
};
struct ptSetupData{
int level;
const Kernel<Real_t>* kernel;
PackedData src_coord; // Src coord
PackedData src_value; // Src density
PackedData srf_coord; // Srf coord
PackedData srf_value; // Srf density
PackedData trg_coord; // Trg coord
PackedData trg_value; // Trg potential
InteracData interac_data;
};
ptSetupData data;
data. level=setup_data. level;
data.kernel=setup_data.kernel;
std::vector<void*>& nodes_in =setup_data.nodes_in ;
std::vector<void*>& nodes_out=setup_data.nodes_out;
{ // Set src data
std::vector<void*>& nodes=nodes_in;
PackedData& coord=data.src_coord;
PackedData& value=data.src_value;
coord.ptr=setup_data. coord_data;
value.ptr=setup_data. input_data;
coord.len=coord.ptr->Dim(0)*coord.ptr->Dim(1);
value.len=value.ptr->Dim(0)*value.ptr->Dim(1);
coord.cnt.ReInit(nodes.size());
coord.dsp.ReInit(nodes.size());
value.cnt.ReInit(nodes.size());
value.dsp.ReInit(nodes.size());
#pragma omp parallel for
for(size_t i=0;i<nodes.size();i++){
((FMMNode_t*)nodes[i])->node_id=i; // record index so the target loop can locate sources
// Sources here are the leaf's actual source points and densities.
Vector<Real_t>& coord_vec=((FMMNode_t*)nodes[i])->src_coord;
Vector<Real_t>& value_vec=((FMMNode_t*)nodes[i])->src_value;
if(coord_vec.Dim()){
// Offsets are pointer differences into the flat buffer; the vector's
// storage is assumed to live inside *coord.ptr (checked by the assert).
coord.dsp[i]=&coord_vec[0]-coord.ptr[0][0];
assert(coord.dsp[i]<coord.len);
coord.cnt[i]=coord_vec.Dim();
}else{
coord.dsp[i]=0;
coord.cnt[i]=0;
}
if(value_vec.Dim()){
value.dsp[i]=&value_vec[0]-value.ptr[0][0];
assert(value.dsp[i]<value.len);
value.cnt[i]=value_vec.Dim();
}else{
value.dsp[i]=0;
value.cnt[i]=0;
}
}
}
{ // Set srf data
// Surface discretization (e.g. for boundary densities) stored on the leaf.
std::vector<void*>& nodes=nodes_in;
PackedData& coord=data.srf_coord;
PackedData& value=data.srf_value;
coord.ptr=setup_data. coord_data;
value.ptr=setup_data. input_data;
coord.len=coord.ptr->Dim(0)*coord.ptr->Dim(1);
value.len=value.ptr->Dim(0)*value.ptr->Dim(1);
coord.cnt.ReInit(nodes.size());
coord.dsp.ReInit(nodes.size());
value.cnt.ReInit(nodes.size());
value.dsp.ReInit(nodes.size());
#pragma omp parallel for
for(size_t i=0;i<nodes.size();i++){
Vector<Real_t>& coord_vec=((FMMNode_t*)nodes[i])->surf_coord;
Vector<Real_t>& value_vec=((FMMNode_t*)nodes[i])->surf_value;
if(coord_vec.Dim()){
coord.dsp[i]=&coord_vec[0]-coord.ptr[0][0];
assert(coord.dsp[i]<coord.len);
coord.cnt[i]=coord_vec.Dim();
}else{
coord.dsp[i]=0;
coord.cnt[i]=0;
}
if(value_vec.Dim()){
value.dsp[i]=&value_vec[0]-value.ptr[0][0];
assert(value.dsp[i]<value.len);
value.cnt[i]=value_vec.Dim();
}else{
value.dsp[i]=0;
value.cnt[i]=0;
}
}
}
{ // Set trg data
std::vector<void*>& nodes=nodes_out;
PackedData& coord=data.trg_coord;
PackedData& value=data.trg_value;
coord.ptr=setup_data. coord_data;
value.ptr=setup_data.output_data;
coord.len=coord.ptr->Dim(0)*coord.ptr->Dim(1);
value.len=value.ptr->Dim(0)*value.ptr->Dim(1);
coord.cnt.ReInit(nodes.size());
coord.dsp.ReInit(nodes.size());
value.cnt.ReInit(nodes.size());
value.dsp.ReInit(nodes.size());
#pragma omp parallel for
for(size_t i=0;i<nodes.size();i++){
// Targets are the leaf's target points and output potentials.
Vector<Real_t>& coord_vec=((FMMNode_t*)nodes[i])->trg_coord;
Vector<Real_t>& value_vec=((FMMNode_t*)nodes[i])->trg_value;
if(coord_vec.Dim()){
coord.dsp[i]=&coord_vec[0]-coord.ptr[0][0];
assert(coord.dsp[i]<coord.len);
coord.cnt[i]=coord_vec.Dim();
}else{
coord.dsp[i]=0;
coord.cnt[i]=0;
}
if(value_vec.Dim()){
value.dsp[i]=&value_vec[0]-value.ptr[0][0];
assert(value.dsp[i]<value.len);
value.cnt[i]=value_vec.Dim();
}else{
value.dsp[i]=0;
value.cnt[i]=0;
}
}
}
{ // Set interac_data
int omp_p=omp_get_max_threads();
// Per-thread staging vectors; concatenated in "Combine interac data" below.
std::vector<std::vector<size_t> > in_node_(omp_p);
std::vector<std::vector<size_t> > scal_idx_(omp_p);
std::vector<std::vector<Real_t> > coord_shift_(omp_p);
std::vector<std::vector<size_t> > interac_cnt_(omp_p);
size_t m=this->MultipoleOrder();
size_t Nsrf=(6*(m-1)*(m-1)+2); // point count of an order-m equivalent surface
#pragma omp parallel for
for(size_t tid=0;tid<omp_p;tid++){
std::vector<size_t>& in_node =in_node_[tid] ;
std::vector<size_t>& scal_idx =scal_idx_[tid] ;
std::vector<Real_t>& coord_shift=coord_shift_[tid];
std::vector<size_t>& interac_cnt=interac_cnt_[tid] ;
// Static block partition of the target nodes across threads.
size_t a=(nodes_out.size()*(tid+0))/omp_p;
size_t b=(nodes_out.size()*(tid+1))/omp_p;
for(size_t i=a;i<b;i++){
FMMNode_t* tnode=(FMMNode_t*)nodes_out[i];
Real_t s=std::pow(0.5,tnode->Depth()); // box edge length at this depth
size_t interac_cnt_=0;
{ // U0_Type
// Adjacent source box one level coarser (scoord scaled by 1.0*s: twice the target size).
Mat_Type type=U0_Type;
Vector<FMMNode_t*>& intlst=tnode->interac_list[type];
for(size_t j=0;j<intlst.Dim();j++) if(intlst[j]){
FMMNode_t* snode=intlst[j];
size_t snode_id=snode->node_id;
// Skip sources that were not selected into nodes_in.
if(snode_id>=nodes_in.size() || nodes_in[snode_id]!=snode) continue;
in_node.push_back(snode_id);
scal_idx.push_back(snode->Depth());
{ // set coord_shift
const int* rel_coord=interac_list.RelativeCoord(type,j);
const Real_t* scoord=snode->Coord();
const Real_t* tcoord=tnode->Coord();
Real_t shift[COORD_DIM];
shift[0]=rel_coord[0]*0.5*s-(scoord[0]+1.0*s)+(tcoord[0]+0.5*s);
shift[1]=rel_coord[1]*0.5*s-(scoord[1]+1.0*s)+(tcoord[1]+0.5*s);
shift[2]=rel_coord[2]*0.5*s-(scoord[2]+1.0*s)+(tcoord[2]+0.5*s);
coord_shift.push_back(shift[0]);
coord_shift.push_back(shift[1]);
coord_shift.push_back(shift[2]);
}
interac_cnt_++;
}
}
{ // U1_Type
// Adjacent source box at the same level (0.5*s offsets: equal box sizes).
Mat_Type type=U1_Type;
Vector<FMMNode_t*>& intlst=tnode->interac_list[type];
for(size_t j=0;j<intlst.Dim();j++) if(intlst[j]){
FMMNode_t* snode=intlst[j];
size_t snode_id=snode->node_id;
if(snode_id>=nodes_in.size() || nodes_in[snode_id]!=snode) continue;
in_node.push_back(snode_id);
scal_idx.push_back(snode->Depth());
{ // set coord_shift
const int* rel_coord=interac_list.RelativeCoord(type,j);
const Real_t* scoord=snode->Coord();
const Real_t* tcoord=tnode->Coord();
Real_t shift[COORD_DIM];
shift[0]=rel_coord[0]*1.0*s-(scoord[0]+0.5*s)+(tcoord[0]+0.5*s);
shift[1]=rel_coord[1]*1.0*s-(scoord[1]+0.5*s)+(tcoord[1]+0.5*s);
shift[2]=rel_coord[2]*1.0*s-(scoord[2]+0.5*s)+(tcoord[2]+0.5*s);
coord_shift.push_back(shift[0]);
coord_shift.push_back(shift[1]);
coord_shift.push_back(shift[2]);
}
interac_cnt_++;
}
}
{ // U2_Type
// Adjacent source box one level finer (0.25*s scaling: half the target size).
Mat_Type type=U2_Type;
Vector<FMMNode_t*>& intlst=tnode->interac_list[type];
for(size_t j=0;j<intlst.Dim();j++) if(intlst[j]){
FMMNode_t* snode=intlst[j];
size_t snode_id=snode->node_id;
if(snode_id>=nodes_in.size() || nodes_in[snode_id]!=snode) continue;
in_node.push_back(snode_id);
scal_idx.push_back(snode->Depth());
{ // set coord_shift
const int* rel_coord=interac_list.RelativeCoord(type,j);
const Real_t* scoord=snode->Coord();
const Real_t* tcoord=tnode->Coord();
Real_t shift[COORD_DIM];
shift[0]=rel_coord[0]*0.25*s-(scoord[0]+0.25*s)+(tcoord[0]+0.5*s);
shift[1]=rel_coord[1]*0.25*s-(scoord[1]+0.25*s)+(tcoord[1]+0.5*s);
shift[2]=rel_coord[2]*0.25*s-(scoord[2]+0.25*s)+(tcoord[2]+0.5*s);
coord_shift.push_back(shift[0]);
coord_shift.push_back(shift[1]);
coord_shift.push_back(shift[2]);
}
interac_cnt_++;
}
}
{ // X_Type
// X-list fallback: when this target has few points (<=Nsrf), its X-list
// sources are evaluated directly here instead of via the downward-check
// surface (mirrors the skip in X_ListSetup).
Mat_Type type=X_Type;
Vector<FMMNode_t*>& intlst=tnode->interac_list[type];
if(tnode->pt_cnt[1]<=Nsrf)
for(size_t j=0;j<intlst.Dim();j++) if(intlst[j]){
FMMNode_t* snode=intlst[j];
size_t snode_id=snode->node_id;
if(snode_id>=nodes_in.size() || nodes_in[snode_id]!=snode) continue;
in_node.push_back(snode_id);
scal_idx.push_back(snode->Depth());
{ // set coord_shift
const int* rel_coord=interac_list.RelativeCoord(type,j);
const Real_t* scoord=snode->Coord();
const Real_t* tcoord=tnode->Coord();
Real_t shift[COORD_DIM];
shift[0]=rel_coord[0]*0.5*s-(scoord[0]+1.0*s)+(tcoord[0]+0.5*s);
shift[1]=rel_coord[1]*0.5*s-(scoord[1]+1.0*s)+(tcoord[1]+0.5*s);
shift[2]=rel_coord[2]*0.5*s-(scoord[2]+1.0*s)+(tcoord[2]+0.5*s);
coord_shift.push_back(shift[0]);
coord_shift.push_back(shift[1]);
coord_shift.push_back(shift[2]);
}
interac_cnt_++;
}
}
{ // W_Type
// W-list fallback: sources with few points (<=Nsrf) are evaluated
// directly here instead of via their multipole expansion (mirrors the
// skip in W_ListSetup).  Non-leaf ghost nodes have no point data locally
// and are excluded.
Mat_Type type=W_Type;
Vector<FMMNode_t*>& intlst=tnode->interac_list[type];
for(size_t j=0;j<intlst.Dim();j++) if(intlst[j]){
FMMNode_t* snode=intlst[j];
size_t snode_id=snode->node_id;
if(snode_id>=nodes_in.size() || nodes_in[snode_id]!=snode) continue;
if(snode->IsGhost() && snode->src_coord.Dim()+snode->surf_coord.Dim()==0) continue; // Is non-leaf ghost node
if(snode->pt_cnt[0]> Nsrf) continue;
in_node.push_back(snode_id);
scal_idx.push_back(snode->Depth());
{ // set coord_shift
const int* rel_coord=interac_list.RelativeCoord(type,j);
const Real_t* scoord=snode->Coord();
const Real_t* tcoord=tnode->Coord();
Real_t shift[COORD_DIM];
shift[0]=rel_coord[0]*0.25*s-(scoord[0]+0.25*s)+(tcoord[0]+0.5*s);
shift[1]=rel_coord[1]*0.25*s-(scoord[1]+0.25*s)+(tcoord[1]+0.5*s);
shift[2]=rel_coord[2]*0.25*s-(scoord[2]+0.25*s)+(tcoord[2]+0.5*s);
coord_shift.push_back(shift[0]);
coord_shift.push_back(shift[1]);
coord_shift.push_back(shift[2]);
}
interac_cnt_++;
}
}
interac_cnt.push_back(interac_cnt_);
}
}
{ // Combine interac data
// Concatenate the per-thread staging vectors into flat arrays.
InteracData& interac_data=data.interac_data;
{ // in_node
typedef size_t ElemType;
std::vector<std::vector<ElemType> >& vec_=in_node_;
pvfmm::Vector<ElemType>& vec=interac_data.in_node;
std::vector<size_t> vec_dsp(omp_p+1,0);
for(size_t tid=0;tid<omp_p;tid++){
vec_dsp[tid+1]=vec_dsp[tid]+vec_[tid].size();
}
vec.ReInit(vec_dsp[omp_p]);
#pragma omp parallel for
for(size_t tid=0;tid<omp_p;tid++){
memcpy(&vec[0]+vec_dsp[tid],&vec_[tid][0],vec_[tid].size()*sizeof(ElemType));
}
}
{ // scal_idx
typedef size_t ElemType;
std::vector<std::vector<ElemType> >& vec_=scal_idx_;
pvfmm::Vector<ElemType>& vec=interac_data.scal_idx;
std::vector<size_t> vec_dsp(omp_p+1,0);
for(size_t tid=0;tid<omp_p;tid++){
vec_dsp[tid+1]=vec_dsp[tid]+vec_[tid].size();
}
vec.ReInit(vec_dsp[omp_p]);
#pragma omp parallel for
for(size_t tid=0;tid<omp_p;tid++){
memcpy(&vec[0]+vec_dsp[tid],&vec_[tid][0],vec_[tid].size()*sizeof(ElemType));
}
}
{ // coord_shift
typedef Real_t ElemType;
std::vector<std::vector<ElemType> >& vec_=coord_shift_;
pvfmm::Vector<ElemType>& vec=interac_data.coord_shift;
std::vector<size_t> vec_dsp(omp_p+1,0);
for(size_t tid=0;tid<omp_p;tid++){
vec_dsp[tid+1]=vec_dsp[tid]+vec_[tid].size();
}
vec.ReInit(vec_dsp[omp_p]);
#pragma omp parallel for
for(size_t tid=0;tid<omp_p;tid++){
memcpy(&vec[0]+vec_dsp[tid],&vec_[tid][0],vec_[tid].size()*sizeof(ElemType));
}
}
{ // interac_cnt
typedef size_t ElemType;
std::vector<std::vector<ElemType> >& vec_=interac_cnt_;
pvfmm::Vector<ElemType>& vec=interac_data.interac_cnt;
std::vector<size_t> vec_dsp(omp_p+1,0);
for(size_t tid=0;tid<omp_p;tid++){
vec_dsp[tid+1]=vec_dsp[tid]+vec_[tid].size();
}
vec.ReInit(vec_dsp[omp_p]);
#pragma omp parallel for
for(size_t tid=0;tid<omp_p;tid++){
memcpy(&vec[0]+vec_dsp[tid],&vec_[tid][0],vec_[tid].size()*sizeof(ElemType));
}
}
{ // interac_dsp
// Exclusive prefix sum of interac_cnt gives each target node's offset
// into the flattened interaction arrays.
pvfmm::Vector<size_t>& cnt=interac_data.interac_cnt;
pvfmm::Vector<size_t>& dsp=interac_data.interac_dsp;
dsp.ReInit(cnt.Dim()); if(dsp.Dim()) dsp[0]=0;
omp_par::scan(&cnt[0],&dsp[0],dsp.Dim());
}
}
}
PtSetup(setup_data, &data);
}
  4543. template <class FMMNode>
  4544. void FMM_Pts<FMMNode>::U_List (SetupData<Real_t>& setup_data, bool device){
  4545. //Add U_List contribution.
  4546. this->EvalListPts(setup_data, device);
  4547. }
template <class FMMNode>
void FMM_Pts<FMMNode>::Down2TargetSetup(SetupData<Real_t>& setup_data, FMMTree_t* tree, std::vector<Matrix<Real_t> >& buff, std::vector<Vector<FMMNode_t*> >& n_list, int level, bool device){
// Build the interaction data for the Down2Target (L2T) phase: evaluating each
// leaf's downward-equivalent (local) expansion at its own target points.
// Inputs are read from buff[1] (downward-equivalent densities) and buff[6]
// (packed coordinates); results accumulate into buff[5] (target potentials).
// The assembled ptSetupData is serialized by PtSetup() at the end and later
// consumed by Down2Target()/EvalListPts().
if(!this->MultipoleOrder()) return; // expansions disabled; nothing to set up
{ // Set setup_data
setup_data. level=level;
setup_data.kernel=kernel->k_l2t; // local-to-target kernel
setup_data. input_data=&buff[1];
setup_data.output_data=&buff[5];
setup_data. coord_data=&buff[6];
Vector<FMMNode_t*>& nodes_in =n_list[1];
Vector<FMMNode_t*>& nodes_out=n_list[5];
setup_data.nodes_in .clear();
setup_data.nodes_out.clear();
// Keep only non-ghost leaves at the requested level (level==-1 means all
// levels) with nonzero pt_cnt[1] (presumably the target-point count — the
// same predicate is used for both in- and out-lists; confirm against the
// definition of pt_cnt).
for(size_t i=0;i<nodes_in .Dim();i++) if((nodes_in [i]->Depth()==level || level==-1) && nodes_in [i]->pt_cnt[1] && nodes_in [i]->IsLeaf() && !nodes_in [i]->IsGhost()) setup_data.nodes_in .push_back(nodes_in [i]);
for(size_t i=0;i<nodes_out.Dim();i++) if((nodes_out[i]->Depth()==level || level==-1) && nodes_out[i]->pt_cnt[1] && nodes_out[i]->IsLeaf() && !nodes_out[i]->IsGhost()) setup_data.nodes_out.push_back(nodes_out[i]);
}
// Local mirrors of the layout consumed by PtSetup(): per-node point data is
// addressed as (count, displacement) pairs into one flat backing matrix.
struct PackedData{
size_t len;            // total scalar length of the backing matrix (rows*cols)
Matrix<Real_t>* ptr;   // backing storage; dsp[] are offsets from ptr[0][0]
Vector<size_t> cnt;    // per-node scalar count
Vector<size_t> dsp;    // per-node offset into *ptr
};
struct InteracData{
Vector<size_t> in_node;      // source-node index for each interaction
Vector<size_t> scal_idx;     // depth used to select the scaling vectors below
Vector<Real_t> coord_shift;  // per-interaction 3D source->target coordinate shift
Vector<size_t> interac_cnt;  // interactions per output node
Vector<size_t> interac_dsp;  // exclusive prefix sum of interac_cnt
Vector<size_t> interac_cst;  // not populated in this function
Vector<Real_t> scal[4*MAX_DEPTH]; // per-depth scaling; only slots l*4+0 and l*4+1 are filled here
Matrix<Real_t> M[4];         // only M[0], M[1] are set here (DC2DE0/DC2DE1)
};
struct ptSetupData{
int level;
const Kernel<Real_t>* kernel;
PackedData src_coord; // Src coord
PackedData src_value; // Src density
PackedData srf_coord; // Srf coord
PackedData srf_value; // Srf density
PackedData trg_coord; // Trg coord
PackedData trg_value; // Trg potential
InteracData interac_data;
};
ptSetupData data;
data. level=setup_data. level;
data.kernel=setup_data.kernel;
std::vector<void*>& nodes_in =setup_data.nodes_in ;
std::vector<void*>& nodes_out=setup_data.nodes_out;
{ // Set src data
// "Source" here is each node's downward-equivalent surface: coordinates come
// from the per-depth dnwd_equiv_surf table, densities from dnward_equiv.
std::vector<void*>& nodes=nodes_in;
PackedData& coord=data.src_coord;
PackedData& value=data.src_value;
coord.ptr=setup_data. coord_data;
value.ptr=setup_data. input_data;
coord.len=coord.ptr->Dim(0)*coord.ptr->Dim(1);
value.len=value.ptr->Dim(0)*value.ptr->Dim(1);
coord.cnt.ReInit(nodes.size());
coord.dsp.ReInit(nodes.size());
value.cnt.ReInit(nodes.size());
value.dsp.ReInit(nodes.size());
#pragma omp parallel for
for(size_t i=0;i<nodes.size();i++){
((FMMNode_t*)nodes[i])->node_id=i; // record position so the D2T loop below can validate interaction-list entries
Vector<Real_t>& coord_vec=tree->dnwd_equiv_surf[((FMMNode*)nodes[i])->Depth()];
Vector<Real_t>& value_vec=((FMMData*)((FMMNode*)nodes[i])->FMMData())->dnward_equiv;
if(coord_vec.Dim()){
// Displacement is the raw pointer offset of this vector inside the backing
// matrix; assumes coord_vec aliases memory owned by coord.ptr.
coord.dsp[i]=&coord_vec[0]-coord.ptr[0][0];
assert(coord.dsp[i]<coord.len);
coord.cnt[i]=coord_vec.Dim();
}else{
coord.dsp[i]=0;
coord.cnt[i]=0;
}
if(value_vec.Dim()){
value.dsp[i]=&value_vec[0]-value.ptr[0][0];
assert(value.dsp[i]<value.len);
value.cnt[i]=value_vec.Dim();
}else{
value.dsp[i]=0;
value.cnt[i]=0;
}
}
}
{ // Set srf data
// No surface-correction points participate in D2T: all counts/offsets zero.
std::vector<void*>& nodes=nodes_in;
PackedData& coord=data.srf_coord;
PackedData& value=data.srf_value;
coord.ptr=setup_data. coord_data;
value.ptr=setup_data. input_data;
coord.len=coord.ptr->Dim(0)*coord.ptr->Dim(1);
value.len=value.ptr->Dim(0)*value.ptr->Dim(1);
coord.cnt.ReInit(nodes.size());
coord.dsp.ReInit(nodes.size());
value.cnt.ReInit(nodes.size());
value.dsp.ReInit(nodes.size());
#pragma omp parallel for
for(size_t i=0;i<nodes.size();i++){
coord.dsp[i]=0;
coord.cnt[i]=0;
value.dsp[i]=0;
value.cnt[i]=0;
}
}
{ // Set trg data
// Targets are the leaves' own points: trg_coord / trg_value per node.
std::vector<void*>& nodes=nodes_out;
PackedData& coord=data.trg_coord;
PackedData& value=data.trg_value;
coord.ptr=setup_data. coord_data;
value.ptr=setup_data.output_data;
coord.len=coord.ptr->Dim(0)*coord.ptr->Dim(1);
value.len=value.ptr->Dim(0)*value.ptr->Dim(1);
coord.cnt.ReInit(nodes.size());
coord.dsp.ReInit(nodes.size());
value.cnt.ReInit(nodes.size());
value.dsp.ReInit(nodes.size());
#pragma omp parallel for
for(size_t i=0;i<nodes.size();i++){
Vector<Real_t>& coord_vec=((FMMNode_t*)nodes[i])->trg_coord;
Vector<Real_t>& value_vec=((FMMNode_t*)nodes[i])->trg_value;
if(coord_vec.Dim()){
coord.dsp[i]=&coord_vec[0]-coord.ptr[0][0];
assert(coord.dsp[i]<coord.len);
coord.cnt[i]=coord_vec.Dim();
}else{
coord.dsp[i]=0;
coord.cnt[i]=0;
}
if(value_vec.Dim()){
value.dsp[i]=&value_vec[0]-value.ptr[0][0];
assert(value.dsp[i]<value.len);
value.cnt[i]=value_vec.Dim();
}else{
value.dsp[i]=0;
value.cnt[i]=0;
}
}
}
{ // Set interac_data
int omp_p=omp_get_max_threads();
// Per-thread staging buffers; merged into flat vectors further below.
std::vector<std::vector<size_t> > in_node_(omp_p);
std::vector<std::vector<size_t> > scal_idx_(omp_p);
std::vector<std::vector<Real_t> > coord_shift_(omp_p);
std::vector<std::vector<size_t> > interac_cnt_(omp_p);
if(this->ScaleInvar()){ // Set scal
// For scale-invariant kernels, precompute per-depth scaling factors
// 2^(-e*l) from the L2L kernel's scaling exponents.
const Kernel<Real_t>* ker=kernel->k_l2l;
for(size_t l=0;l<MAX_DEPTH;l++){ // scal[l*4+0]
// Slot l*4+0: target scaling at depth l.
Vector<Real_t>& scal=data.interac_data.scal[l*4+0];
Vector<Real_t>& scal_exp=ker->trg_scal;
scal.ReInit(scal_exp.Dim());
for(size_t i=0;i<scal.Dim();i++){
scal[i]=std::pow(2.0,-scal_exp[i]*l);
}
}
for(size_t l=0;l<MAX_DEPTH;l++){ // scal[l*4+1]
// Slot l*4+1: source scaling at depth l.
Vector<Real_t>& scal=data.interac_data.scal[l*4+1];
Vector<Real_t>& scal_exp=ker->src_scal;
scal.ReInit(scal_exp.Dim());
for(size_t i=0;i<scal.Dim();i++){
scal[i]=std::pow(2.0,-scal_exp[i]*l);
}
}
}
// NOTE(review): size_t loop index compared against int omp_p — valid OpenMP
// canonical form (unsigned indices allowed since OpenMP 3.0) but mixes
// signedness.
#pragma omp parallel for
for(size_t tid=0;tid<omp_p;tid++){
std::vector<size_t>& in_node =in_node_[tid] ;
std::vector<size_t>& scal_idx =scal_idx_[tid] ;
std::vector<Real_t>& coord_shift=coord_shift_[tid];
std::vector<size_t>& interac_cnt=interac_cnt_[tid];
// Static partition of output nodes across threads.
size_t a=(nodes_out.size()*(tid+0))/omp_p;
size_t b=(nodes_out.size()*(tid+1))/omp_p;
for(size_t i=a;i<b;i++){
FMMNode_t* tnode=(FMMNode_t*)nodes_out[i];
Real_t s=std::pow(0.5,tnode->Depth()); // node edge length at this depth
// NOTE(review): this local shadows the per-thread vector interac_cnt_
// declared above the parallel region — confusing but harmless here.
size_t interac_cnt_=0;
{ // D2T_Type
Mat_Type type=D2T_Type;
Vector<FMMNode_t*>& intlst=tnode->interac_list[type];
for(size_t j=0;j<intlst.Dim();j++) if(intlst[j]){
FMMNode_t* snode=intlst[j];
size_t snode_id=snode->node_id;
// Skip sources that were filtered out of nodes_in (node_id is only
// valid if the entry at that index is still this node).
if(snode_id>=nodes_in.size() || nodes_in[snode_id]!=snode) continue;
in_node.push_back(snode_id);
scal_idx.push_back(snode->Depth());
{ // set coord_shift
// Translation applied to the source equivalent surface so it lands
// relative to the target node; the -(0+0.5*s) and +0.5*s terms cancel,
// leaving rel_coord*0.5*s + tcoord (kept in the original form for
// symmetry with the other *Setup routines).
const int* rel_coord=interac_list.RelativeCoord(type,j);
const Real_t* scoord=snode->Coord();
const Real_t* tcoord=tnode->Coord();
Real_t shift[COORD_DIM];
shift[0]=rel_coord[0]*0.5*s-(0+0.5*s)+(tcoord[0]+0.5*s);
shift[1]=rel_coord[1]*0.5*s-(0+0.5*s)+(tcoord[1]+0.5*s);
shift[2]=rel_coord[2]*0.5*s-(0+0.5*s)+(tcoord[2]+0.5*s);
coord_shift.push_back(shift[0]);
coord_shift.push_back(shift[1]);
coord_shift.push_back(shift[2]);
}
interac_cnt_++;
}
}
interac_cnt.push_back(interac_cnt_);
}
}
{ // Combine interac data
// Concatenate the per-thread staging vectors into the flat pvfmm::Vectors
// of interac_data (same copy pattern for all four arrays).
InteracData& interac_data=data.interac_data;
{ // in_node
typedef size_t ElemType;
std::vector<std::vector<ElemType> >& vec_=in_node_;
pvfmm::Vector<ElemType>& vec=interac_data.in_node;
std::vector<size_t> vec_dsp(omp_p+1,0);
for(size_t tid=0;tid<omp_p;tid++){
vec_dsp[tid+1]=vec_dsp[tid]+vec_[tid].size();
}
vec.ReInit(vec_dsp[omp_p]);
#pragma omp parallel for
for(size_t tid=0;tid<omp_p;tid++){
// NOTE(review): &vec_[tid][0] indexes an empty vector when a thread
// produced nothing; the copy size is 0 then, but the expression itself
// is technically UB — std::vector::data() would be the safe spelling.
memcpy(&vec[0]+vec_dsp[tid],&vec_[tid][0],vec_[tid].size()*sizeof(ElemType));
}
}
{ // scal_idx
typedef size_t ElemType;
std::vector<std::vector<ElemType> >& vec_=scal_idx_;
pvfmm::Vector<ElemType>& vec=interac_data.scal_idx;
std::vector<size_t> vec_dsp(omp_p+1,0);
for(size_t tid=0;tid<omp_p;tid++){
vec_dsp[tid+1]=vec_dsp[tid]+vec_[tid].size();
}
vec.ReInit(vec_dsp[omp_p]);
#pragma omp parallel for
for(size_t tid=0;tid<omp_p;tid++){
memcpy(&vec[0]+vec_dsp[tid],&vec_[tid][0],vec_[tid].size()*sizeof(ElemType));
}
}
{ // coord_shift
typedef Real_t ElemType;
std::vector<std::vector<ElemType> >& vec_=coord_shift_;
pvfmm::Vector<ElemType>& vec=interac_data.coord_shift;
std::vector<size_t> vec_dsp(omp_p+1,0);
for(size_t tid=0;tid<omp_p;tid++){
vec_dsp[tid+1]=vec_dsp[tid]+vec_[tid].size();
}
vec.ReInit(vec_dsp[omp_p]);
#pragma omp parallel for
for(size_t tid=0;tid<omp_p;tid++){
memcpy(&vec[0]+vec_dsp[tid],&vec_[tid][0],vec_[tid].size()*sizeof(ElemType));
}
}
{ // interac_cnt
typedef size_t ElemType;
std::vector<std::vector<ElemType> >& vec_=interac_cnt_;
pvfmm::Vector<ElemType>& vec=interac_data.interac_cnt;
std::vector<size_t> vec_dsp(omp_p+1,0);
for(size_t tid=0;tid<omp_p;tid++){
vec_dsp[tid+1]=vec_dsp[tid]+vec_[tid].size();
}
vec.ReInit(vec_dsp[omp_p]);
#pragma omp parallel for
for(size_t tid=0;tid<omp_p;tid++){
memcpy(&vec[0]+vec_dsp[tid],&vec_[tid][0],vec_[tid].size()*sizeof(ElemType));
}
}
{ // interac_dsp
// Exclusive prefix sum of interaction counts -> per-node start offsets.
pvfmm::Vector<size_t>& cnt=interac_data.interac_cnt;
pvfmm::Vector<size_t>& dsp=interac_data.interac_dsp;
dsp.ReInit(cnt.Dim()); if(dsp.Dim()) dsp[0]=0;
omp_par::scan(&cnt[0],&dsp[0],dsp.Dim());
}
}
{ // Set M[0], M[1]
// Attach the downward check-to-equivalent matrices only if at least one
// interaction exists (last dsp + last cnt == total interaction count).
InteracData& interac_data=data.interac_data;
pvfmm::Vector<size_t>& cnt=interac_data.interac_cnt;
pvfmm::Vector<size_t>& dsp=interac_data.interac_dsp;
if(cnt.Dim() && cnt[cnt.Dim()-1]+dsp[dsp.Dim()-1]){
data.interac_data.M[0]=this->mat->Mat(level, DC2DE0_Type, 0);
data.interac_data.M[1]=this->mat->Mat(level, DC2DE1_Type, 0);
}else{
data.interac_data.M[0].ReInit(0,0);
data.interac_data.M[1].ReInit(0,0);
}
}
}
// Serialize `data` into setup_data for the evaluation phase.
PtSetup(setup_data, &data);
}
  4829. template <class FMMNode>
  4830. void FMM_Pts<FMMNode>::Down2Target(SetupData<Real_t>& setup_data, bool device){
  4831. if(!this->MultipoleOrder()) return;
  4832. //Add Down2Target contribution.
  4833. this->EvalListPts(setup_data, device);
  4834. }
// Post-evaluation hook. Intentionally a no-op for this class; the parameter
// is unused here. NOTE(review): derived FMM variants presumably override or
// specialize this to rescale/finalize outputs — confirm against callers.
template <class FMMNode>
void FMM_Pts<FMMNode>::PostProcessing(std::vector<FMMNode_t*>& nodes){
}
// Output-copy hook. Intentionally a no-op for this class; both parameters
// are unused here. NOTE(review): presumably overridden/specialized where
// results must be copied back to user-visible storage — confirm at call sites.
template <class FMMNode>
void FMM_Pts<FMMNode>::CopyOutput(FMMNode** nodes, size_t n){
}
  4841. }//end namespace