fmm_tree.txx 24 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748
  1. /**
  2. * \file fmm_tree.txx
  3. * \author Dhairya Malhotra, dhairya.malhotra@gmail.com
  4. * \date 12-11-2010
  5. * \brief This file contains the implementation of the class FMM_Tree.
  6. */
  7. #include <omp.h>
  8. #include <sstream>
  9. #include <iomanip>
  10. #include <cassert>
  11. #include <cstdlib>
  12. #include <mpi_node.hpp>
  13. #include <fmm_node.hpp>
  14. #include <mem_mgr.hpp>
  15. #include <mortonid.hpp>
  16. #include <profile.hpp>
  17. #include <vector.hpp>
  18. namespace pvfmm{
  19. template <class FMM_Mat_t>
  20. void FMM_Tree<FMM_Mat_t>::Initialize(typename Node_t::NodeData* init_data) {
  21. Profile::Tic("InitTree",this->Comm(),true);{
  22. //Build octree from points.
  23. MPI_Tree<Node_t>::Initialize(init_data);
  24. Profile::Tic("InitFMMData",this->Comm(),true,5);
  25. { //Initialize FMM data.
  26. std::vector<Node_t*>& nodes=this->GetNodeList();
  27. #pragma omp parallel for
  28. for(size_t i=0;i<nodes.size();i++){
  29. if(nodes[i]->FMMData()==NULL) nodes[i]->FMMData()=mem::aligned_new<typename FMM_Mat_t::FMMData>();
  30. }
  31. }
  32. Profile::Toc();
  33. }Profile::Toc();
  34. }
  35. template <class FMM_Mat_t>
  36. void FMM_Tree<FMM_Mat_t>::InitFMM_Tree(bool refine, BoundaryType bndry_) {
  37. Profile::Tic("InitFMM_Tree",this->Comm(),true);{
  38. interac_list.Initialize(this->Dim());
  39. bndry=bndry_;
  40. if(refine){
  41. //RefineTree
  42. Profile::Tic("RefineTree",this->Comm(),true,5);
  43. this->RefineTree();
  44. Profile::Toc();
  45. }
  46. //2:1 Balancing
  47. Profile::Tic("2:1Balance",this->Comm(),true,5);
  48. this->Balance21(bndry);
  49. Profile::Toc();
  50. //Redistribute nodes.
  51. Profile::Tic("Redistribute",this->Comm(),true,5);
  52. this->RedistNodes();
  53. Profile::Toc();
  54. }Profile::Toc();
  55. }
  56. template <class FMM_Mat_t>
  57. void FMM_Tree<FMM_Mat_t>::SetupFMM(FMM_Mat_t* fmm_mat_) {
  58. Profile::Tic("SetupFMM",this->Comm(),true);{
  59. typedef typename FMM_Mat_t::FMMTree_t MatTree_t;
  60. bool device=true;
  61. #ifdef __INTEL_OFFLOAD
  62. Profile::Tic("InitLocks",this->Comm(),false,3);
  63. MIC_Lock::init();
  64. Profile::Toc();
  65. #endif
  66. //int omp_p=omp_get_max_threads();
  67. if(fmm_mat!=fmm_mat_){ // Clear previous setup
  68. setup_data.clear();
  69. precomp_lst.clear();
  70. fmm_mat=fmm_mat_;
  71. }
  72. //Construct LET
  73. Profile::Tic("ConstructLET",this->Comm(),false,2);
  74. this->ConstructLET(bndry);
  75. Profile::Toc();
  76. //Set Colleagues (Needed to build U, V, W and X lists.)
  77. Profile::Tic("SetColleagues",this->Comm(),false,3);
  78. this->SetColleagues(bndry);
  79. Profile::Toc();
  80. Profile::Tic("CollectNodeData",this->Comm(),false,3);
  81. //Build node list.
  82. Node_t* n=dynamic_cast<Node_t*>(this->PostorderFirst());
  83. std::vector<Node_t*> all_nodes;
  84. while(n!=NULL){
  85. n->pt_cnt[0]=0;
  86. n->pt_cnt[1]=0;
  87. all_nodes.push_back(n);
  88. n=static_cast<Node_t*>(this->PostorderNxt(n));
  89. }
  90. //Collect node data into continuous array.
  91. std::vector<Vector<Node_t*> > node_lists; // TODO: Remove this parameter, not really needed
  92. fmm_mat->CollectNodeData((MatTree_t*)this,all_nodes, node_data_buff, node_lists);
  93. Profile::Toc();
  94. Profile::Tic("BuildLists",this->Comm(),false,3);
  95. BuildInteracLists();
  96. Profile::Toc();
  97. setup_data.resize(8*MAX_DEPTH);
  98. precomp_lst.resize(8);
  99. Profile::Tic("UListSetup",this->Comm(),false,3);
  100. for(size_t i=0;i<MAX_DEPTH;i++){
  101. setup_data[i+MAX_DEPTH*0].precomp_data=&precomp_lst[0];
  102. fmm_mat->U_ListSetup(setup_data[i+MAX_DEPTH*0],(MatTree_t*)this,node_data_buff,node_lists,fmm_mat->ScaleInvar()?(i==0?-1:MAX_DEPTH+1):i, device);
  103. }
  104. Profile::Toc();
  105. Profile::Tic("WListSetup",this->Comm(),false,3);
  106. for(size_t i=0;i<MAX_DEPTH;i++){
  107. setup_data[i+MAX_DEPTH*1].precomp_data=&precomp_lst[1];
  108. fmm_mat->W_ListSetup(setup_data[i+MAX_DEPTH*1],(MatTree_t*)this,node_data_buff,node_lists,fmm_mat->ScaleInvar()?(i==0?-1:MAX_DEPTH+1):i, device);
  109. }
  110. Profile::Toc();
  111. Profile::Tic("XListSetup",this->Comm(),false,3);
  112. for(size_t i=0;i<MAX_DEPTH;i++){
  113. setup_data[i+MAX_DEPTH*2].precomp_data=&precomp_lst[2];
  114. fmm_mat->X_ListSetup(setup_data[i+MAX_DEPTH*2],(MatTree_t*)this,node_data_buff,node_lists,fmm_mat->ScaleInvar()?(i==0?-1:MAX_DEPTH+1):i, device);
  115. }
  116. Profile::Toc();
  117. Profile::Tic("VListSetup",this->Comm(),false,3);
  118. for(size_t i=0;i<MAX_DEPTH;i++){
  119. setup_data[i+MAX_DEPTH*3].precomp_data=&precomp_lst[3];
  120. fmm_mat->V_ListSetup(setup_data[i+MAX_DEPTH*3],(MatTree_t*)this,node_data_buff,node_lists,fmm_mat->ScaleInvar()?(i==0?-1:MAX_DEPTH+1):i, /*device*/ false);
  121. }
  122. Profile::Toc();
  123. Profile::Tic("D2DSetup",this->Comm(),false,3);
  124. for(size_t i=0;i<MAX_DEPTH;i++){
  125. setup_data[i+MAX_DEPTH*4].precomp_data=&precomp_lst[4];
  126. fmm_mat->Down2DownSetup(setup_data[i+MAX_DEPTH*4],(MatTree_t*)this,node_data_buff,node_lists,i, /*device*/ false);
  127. }
  128. Profile::Toc();
  129. Profile::Tic("D2TSetup",this->Comm(),false,3);
  130. for(size_t i=0;i<MAX_DEPTH;i++){
  131. setup_data[i+MAX_DEPTH*5].precomp_data=&precomp_lst[5];
  132. fmm_mat->Down2TargetSetup(setup_data[i+MAX_DEPTH*5],(MatTree_t*)this,node_data_buff,node_lists,fmm_mat->ScaleInvar()?(i==0?-1:MAX_DEPTH+1):i, /*device*/ false);
  133. }
  134. Profile::Toc();
  135. Profile::Tic("S2USetup",this->Comm(),false,3);
  136. for(size_t i=0;i<MAX_DEPTH;i++){
  137. setup_data[i+MAX_DEPTH*6].precomp_data=&precomp_lst[6];
  138. fmm_mat->Source2UpSetup(setup_data[i+MAX_DEPTH*6],(MatTree_t*)this,node_data_buff,node_lists,fmm_mat->ScaleInvar()?(i==0?-1:MAX_DEPTH+1):i, /*device*/ false);
  139. }
  140. Profile::Toc();
  141. Profile::Tic("U2USetup",this->Comm(),false,3);
  142. for(size_t i=0;i<MAX_DEPTH;i++){
  143. setup_data[i+MAX_DEPTH*7].precomp_data=&precomp_lst[7];
  144. fmm_mat->Up2UpSetup(setup_data[i+MAX_DEPTH*7],(MatTree_t*)this,node_data_buff,node_lists,i, /*device*/ false);
  145. }
  146. Profile::Toc();
  147. #ifdef __INTEL_OFFLOAD
  148. int wait_lock_idx=-1;
  149. wait_lock_idx=MIC_Lock::curr_lock();
  150. #pragma offload target(mic:0)
  151. {MIC_Lock::wait_lock(wait_lock_idx);}
  152. #endif
  153. ClearFMMData();
  154. }Profile::Toc();
  155. }
  156. template <class FMM_Mat_t>
  157. void FMM_Tree<FMM_Mat_t>::ClearFMMData() {
  158. Profile::Tic("ClearFMMData",this->Comm(),true);{
  159. bool device=true;
  160. int omp_p=omp_get_max_threads();
  161. #pragma omp parallel for
  162. for(int j=0;j<omp_p;j++){
  163. Matrix<Real_t>* mat;
  164. mat=setup_data[0+MAX_DEPTH*1]. input_data;
  165. if(mat!=NULL){
  166. size_t a=(mat->Dim(0)*mat->Dim(1)*(j+0))/omp_p;
  167. size_t b=(mat->Dim(0)*mat->Dim(1)*(j+1))/omp_p;
  168. memset(&(*mat)[0][a],0,(b-a)*sizeof(Real_t));
  169. }
  170. mat=setup_data[0+MAX_DEPTH*2].output_data;
  171. if(mat!=NULL){
  172. size_t a=(mat->Dim(0)*mat->Dim(1)*(j+0))/omp_p;
  173. size_t b=(mat->Dim(0)*mat->Dim(1)*(j+1))/omp_p;
  174. memset(&(*mat)[0][a],0,(b-a)*sizeof(Real_t));
  175. }
  176. mat=setup_data[0+MAX_DEPTH*0].output_data;
  177. if(mat!=NULL){
  178. size_t a=(mat->Dim(0)*mat->Dim(1)*(j+0))/omp_p;
  179. size_t b=(mat->Dim(0)*mat->Dim(1)*(j+1))/omp_p;
  180. memset(&(*mat)[0][a],0,(b-a)*sizeof(Real_t));
  181. }
  182. }
  183. if(device){ // Host2Device
  184. if(setup_data[0+MAX_DEPTH*1]. input_data!=NULL) setup_data[0+MAX_DEPTH*1]. input_data->AllocDevice(true);
  185. if(setup_data[0+MAX_DEPTH*2].output_data!=NULL) setup_data[0+MAX_DEPTH*2].output_data->AllocDevice(true);
  186. if(setup_data[0+MAX_DEPTH*0].output_data!=NULL) setup_data[0+MAX_DEPTH*0].output_data->AllocDevice(true);
  187. #ifdef __INTEL_OFFLOAD
  188. if(!fmm_mat->ScaleInvar()){ // Wait
  189. int wait_lock_idx=-1;
  190. wait_lock_idx=MIC_Lock::curr_lock();
  191. #pragma offload target(mic:0)
  192. {MIC_Lock::wait_lock(wait_lock_idx);}
  193. }
  194. MIC_Lock::init();
  195. #endif
  196. }
  197. }Profile::Toc();
  198. }
  199. template <class FMM_Mat_t>
  200. void FMM_Tree<FMM_Mat_t>::RunFMM() {
  201. Profile::Tic("RunFMM",this->Comm(),true);{
  202. //Upward Pass
  203. Profile::Tic("UpwardPass",this->Comm(),false,2);
  204. UpwardPass();
  205. Profile::Toc();
  206. //Multipole Reduce Broadcast.
  207. Profile::Tic("ReduceBcast",this->Comm(),true,2);
  208. MultipoleReduceBcast();
  209. Profile::Toc();
  210. //Local 2:1 Balancing.
  211. //This can cause load imbalance, always use global 2:1 balance instead.
  212. //Profile::Tic("2:1Balance(local)",this->Comm(),false,3);
  213. //this->Balance21_local(bndry);
  214. //UpwardPass(true);
  215. //Profile::Toc();
  216. //Downward Pass
  217. Profile::Tic("DownwardPass",this->Comm(),true,2);
  218. DownwardPass();
  219. Profile::Toc();
  220. }Profile::Toc();
  221. }
  222. template <class FMM_Mat_t>
  223. void FMM_Tree<FMM_Mat_t>::UpwardPass() {
  224. //bool device=true;
  225. int max_depth=0;
  226. { // Get max_depth
  227. int max_depth_loc=0;
  228. std::vector<Node_t*>& nodes=this->GetNodeList();
  229. for(size_t i=0;i<nodes.size();i++){
  230. Node_t* n=nodes[i];
  231. if(n->Depth()>max_depth_loc) max_depth_loc=n->Depth();
  232. }
  233. MPI_Allreduce(&max_depth_loc, &max_depth, 1, MPI_INT, MPI_MAX, *this->Comm());
  234. }
  235. //Upward Pass (initialize all leaf nodes)
  236. Profile::Tic("S2U",this->Comm(),false,5);
  237. for(int i=0; i<=(fmm_mat->ScaleInvar()?0:max_depth); i++){ // Source2Up
  238. if(!fmm_mat->ScaleInvar()) fmm_mat->SetupPrecomp(setup_data[i+MAX_DEPTH*6],/*device*/ false);
  239. fmm_mat->Source2Up(setup_data[i+MAX_DEPTH*6]);
  240. }
  241. Profile::Toc();
  242. //Upward Pass (level by level)
  243. Profile::Tic("U2U",this->Comm(),false,5);
  244. for(int i=max_depth-1; i>=0; i--){ // Up2Up
  245. if(!fmm_mat->ScaleInvar()) fmm_mat->SetupPrecomp(setup_data[i+MAX_DEPTH*7],/*device*/ false);
  246. fmm_mat->Up2Up(setup_data[i+MAX_DEPTH*7]);
  247. }
  248. Profile::Toc();
  249. }
  250. template <class FMM_Mat_t>
  251. void FMM_Tree<FMM_Mat_t>::BuildInteracLists() {
  252. std::vector<Node_t*> n_list;
  253. { // Build n_list
  254. std::vector<Node_t*>& nodes=this->GetNodeList();
  255. for(size_t i=0;i<nodes.size();i++){
  256. if(!nodes[i]->IsGhost() && nodes[i]->pt_cnt[1]){
  257. n_list.push_back(nodes[i]);
  258. }
  259. }
  260. }
  261. size_t node_cnt=n_list.size();
  262. std::vector<Mat_Type> type_lst;
  263. type_lst.push_back(S2U_Type);
  264. type_lst.push_back(U2U_Type);
  265. type_lst.push_back(D2D_Type);
  266. type_lst.push_back(D2T_Type);
  267. type_lst.push_back(U0_Type );
  268. type_lst.push_back(U1_Type );
  269. type_lst.push_back(U2_Type );
  270. type_lst.push_back(W_Type );
  271. type_lst.push_back(X_Type );
  272. type_lst.push_back(V1_Type );
  273. size_t all_interac_cnt=0;
  274. pvfmm::Vector<size_t> interac_cnt(type_lst.size());
  275. for(size_t i=0;i<type_lst.size();i++){
  276. interac_cnt[i]=interac_list.ListCount(type_lst[i]);
  277. all_interac_cnt+=interac_cnt[i];
  278. }
  279. node_interac_lst.ReInit(node_cnt,all_interac_cnt);
  280. // Build interaction lists.
  281. int omp_p=omp_get_max_threads();
  282. #pragma omp parallel for
  283. for(int j=0;j<omp_p;j++){
  284. size_t a=(node_cnt*(j ))/omp_p;
  285. size_t b=(node_cnt*(j+1))/omp_p;
  286. for(size_t i=a;i<b;i++){
  287. size_t offset=0;
  288. Node_t* n=n_list[i];
  289. for(size_t k=0;k<type_lst.size();k++){
  290. n->interac_list[type_lst[k]].ReInit(interac_cnt[k],&node_interac_lst[i][offset],false);
  291. interac_list.BuildList(n,type_lst[k]);
  292. offset+=interac_cnt[k];
  293. }
  294. }
  295. }
  296. }
  297. template <class FMM_Mat_t>
  298. void FMM_Tree<FMM_Mat_t>::MultipoleReduceBcast() {
  299. int num_p,rank;
  300. MPI_Comm_size(*this->Comm(),&num_p);
  301. MPI_Comm_rank(*this->Comm(),&rank );
  302. if(num_p==1) return;
  303. Profile::Tic("Reduce",this->Comm(),true,3);
  304. std::vector<MortonId> mins=this->GetMins();
  305. size_t bit_mask=1;
  306. size_t max_child=(1UL<<this->Dim());
  307. //Initialize initial send nodes.
  308. std::vector<Node_t*> send_nodes[2];
  309. //Initialize send_node[0]
  310. Node_t* tmp_node=static_cast<Node_t*>(this->RootNode());
  311. assert(!tmp_node->IsGhost());
  312. while(!tmp_node->IsLeaf()){
  313. Node_t* tmp_node_=NULL;
  314. for(size_t i=0;i<max_child;i++){
  315. tmp_node_=static_cast<Node_t*>(tmp_node->Child(i));
  316. if(tmp_node_!=NULL) if(!tmp_node_->IsGhost()) break;
  317. }
  318. tmp_node=tmp_node_; assert(tmp_node!=NULL);
  319. }
  320. int n[2];
  321. n[0]=tmp_node->Depth()+1;
  322. send_nodes[0].resize(n[0]);
  323. send_nodes[0][n[0]-1]=tmp_node;
  324. for(int i=n[0]-1;i>0;i--)
  325. send_nodes[0][i-1]=static_cast<Node_t*>(send_nodes[0][i]->Parent());
  326. //Initialize send_node[1]
  327. tmp_node=static_cast<Node_t*>(this->RootNode());
  328. while(!tmp_node->IsLeaf()){
  329. Node_t* tmp_node_=NULL;
  330. for(int i=max_child-1;i>=0;i--){
  331. tmp_node_=static_cast<Node_t*>(tmp_node->Child(i));
  332. if(tmp_node_!=NULL) if(!tmp_node_->IsGhost()) break;
  333. }
  334. tmp_node=tmp_node_; assert(tmp_node!=NULL);
  335. }
  336. n[1]=tmp_node->Depth()+1;
  337. send_nodes[1].resize(n[1]);
  338. send_nodes[1][n[1]-1]=tmp_node;
  339. for(int i=n[1]-1;i>0;i--)
  340. send_nodes[1][i-1]=static_cast<Node_t*>(send_nodes[1][i]->Parent());
  341. //Hypercube reduction.
  342. while(bit_mask<(size_t)num_p){
  343. size_t partner=rank^bit_mask; //Partner process id
  344. int merge_indx=((bit_mask & rank)==0?1:0);
  345. bit_mask=bit_mask<<1;
  346. //if(rank >= num_p - (num_p % bit_mask)) break;
  347. //Initialize send data.
  348. size_t s_node_cnt[2]={send_nodes[0].size(),send_nodes[1].size()};
  349. int send_size=2*sizeof(size_t)+(s_node_cnt[0]+s_node_cnt[1])*sizeof(MortonId);
  350. std::vector<PackedData> send_data(s_node_cnt[0]+s_node_cnt[1]);
  351. size_t s_iter=0;
  352. for(int i=0;i<2;i++)
  353. for(size_t j=0;j<s_node_cnt[i];j++){
  354. assert(send_nodes[i][j]!=NULL);
  355. send_data[s_iter]=send_nodes[i][j]->PackMultipole();
  356. send_size+=send_data[s_iter].length+sizeof(size_t);
  357. s_iter++;
  358. }
  359. char* send_buff=mem::aligned_new<char>(send_size);
  360. char* buff_iter=send_buff;
  361. ((size_t*)buff_iter)[0]=s_node_cnt[0];
  362. ((size_t*)buff_iter)[1]=s_node_cnt[1];
  363. buff_iter+=2*sizeof(size_t);
  364. s_iter=0;
  365. for(int i=0;i<2;i++)
  366. for(size_t j=0;j<s_node_cnt[i];j++){
  367. ((MortonId*)buff_iter)[0]=send_nodes[i][j]->GetMortonId();
  368. buff_iter+=sizeof(MortonId);
  369. ((size_t*)buff_iter)[0]=send_data[s_iter].length;
  370. buff_iter+=sizeof(size_t);
  371. mem::memcopy((void*)buff_iter,send_data[s_iter].data,send_data[s_iter].length);
  372. buff_iter+=send_data[s_iter].length;
  373. s_iter++;
  374. }
  375. //Exchange send and recv sizes
  376. int recv_size=0;
  377. MPI_Status status;
  378. char* recv_buff=NULL;
  379. if(partner<(size_t)num_p){
  380. MPI_Sendrecv(&send_size, 1, MPI_INT, partner, 0, &recv_size, 1, MPI_INT, partner, 0, *this->Comm(), &status);
  381. recv_buff=mem::aligned_new<char>(recv_size);
  382. MPI_Sendrecv(send_buff, send_size, MPI_BYTE, partner, 0, recv_buff, recv_size, MPI_BYTE, partner, 0, *this->Comm(), &status);
  383. }
  384. //Need an extra broadcast for incomplete hypercubes.
  385. size_t p0_start=num_p - (num_p % (bit_mask ));
  386. size_t p0_end =num_p - (num_p % (bit_mask>>1));
  387. if(((size_t)rank >= p0_start) && ((size_t)num_p>p0_end) && ((size_t)rank < p0_end) ){
  388. size_t bit_mask0=1;
  389. size_t num_p0=p0_end-p0_start;
  390. while( bit_mask0 < num_p0 ){
  391. if( (bit_mask0<<1) > (num_p - p0_end) ){
  392. size_t partner0=rank^bit_mask0;
  393. if( rank-p0_start < bit_mask0 ){
  394. //Send
  395. MPI_Send(&recv_size, 1, MPI_INT , partner0, 0, *this->Comm());
  396. MPI_Send( recv_buff, recv_size, MPI_BYTE, partner0, 0, *this->Comm());
  397. }else if( rank-p0_start < (bit_mask0<<1) ){
  398. //Receive
  399. if(recv_size>0) mem::aligned_delete<char>(recv_buff);
  400. MPI_Recv(&recv_size, 1, MPI_INT , partner0, 0, *this->Comm(), &status);
  401. recv_buff=mem::aligned_new<char>(recv_size);
  402. MPI_Recv( recv_buff, recv_size, MPI_BYTE, partner0, 0, *this->Comm(), &status);
  403. }
  404. }
  405. bit_mask0=bit_mask0<<1;
  406. }
  407. }
  408. //Construct nodes from received data.
  409. if(recv_size>0){
  410. buff_iter=recv_buff;
  411. size_t r_node_cnt[2]={((size_t*)buff_iter)[0],((size_t*)buff_iter)[1]};
  412. buff_iter+=2*sizeof(size_t);
  413. std::vector<MortonId> r_mid[2];
  414. r_mid[0].resize(r_node_cnt[0]);
  415. r_mid[1].resize(r_node_cnt[1]);
  416. std::vector<Node_t*> recv_nodes[2];
  417. recv_nodes[0].resize(r_node_cnt[0]);
  418. recv_nodes[1].resize(r_node_cnt[1]);
  419. std::vector<PackedData> recv_data[2];
  420. recv_data[0].resize(r_node_cnt[0]);
  421. recv_data[1].resize(r_node_cnt[1]);
  422. for(int i=0;i<2;i++)
  423. for(size_t j=0;j<r_node_cnt[i];j++){
  424. r_mid[i][j]=((MortonId*)buff_iter)[0];
  425. buff_iter+=sizeof(MortonId);
  426. recv_data[i][j].length=((size_t*)buff_iter)[0];
  427. buff_iter+=sizeof(size_t);
  428. recv_data[i][j].data=(void*)buff_iter;
  429. buff_iter+=recv_data[i][j].length;
  430. }
  431. // Add multipole expansion to existing nodes.
  432. for(size_t i=0;i<r_node_cnt[1-merge_indx];i++){
  433. if(i<send_nodes[merge_indx].size()){
  434. if(r_mid[1-merge_indx][i]==send_nodes[merge_indx][i]->GetMortonId()){
  435. send_nodes[merge_indx][i]->AddMultipole(recv_data[1-merge_indx][i]);
  436. }else break;
  437. }else break;
  438. }
  439. bool new_branch=false;
  440. for(size_t i=0;i<r_node_cnt[merge_indx];i++){
  441. if(i<send_nodes[merge_indx].size() && !new_branch){
  442. if(r_mid[merge_indx][i]==send_nodes[merge_indx][i]->GetMortonId()){
  443. recv_nodes[merge_indx][i]=send_nodes[merge_indx][i];
  444. }else{
  445. new_branch=true;
  446. size_t n_=(i<(size_t)n[merge_indx]?n[merge_indx]:i);
  447. for(size_t j=n_;j<send_nodes[merge_indx].size();j++)
  448. mem::aligned_delete(send_nodes[merge_indx][j]);
  449. if(i<(size_t)n[merge_indx]) n[merge_indx]=i;
  450. }
  451. }
  452. if(i>=send_nodes[merge_indx].size() || new_branch){
  453. recv_nodes[merge_indx][i]=static_cast<Node_t*>(this->NewNode());
  454. recv_nodes[merge_indx][i]->SetCoord(r_mid[merge_indx][i]);
  455. recv_nodes[merge_indx][i]->InitMultipole(recv_data[merge_indx][i]);
  456. }
  457. }
  458. send_nodes[merge_indx]=recv_nodes[merge_indx];
  459. }
  460. mem::aligned_delete<char>(send_buff);
  461. mem::aligned_delete<char>(recv_buff);
  462. }
  463. for(int i=0;i<2;i++)
  464. for(size_t j=n[i];j<send_nodes[i].size();j++)
  465. mem::aligned_delete(send_nodes[i][j]);
  466. Profile::Toc();
  467. //Now Broadcast nodes to build LET.
  468. Profile::Tic("Broadcast",this->Comm(),true,4);
  469. this->ConstructLET(bndry);
  470. Profile::Toc();
  471. }
  472. template <class FMM_Mat_t>
  473. void FMM_Tree<FMM_Mat_t>::DownwardPass() {
  474. bool device=true;
  475. Profile::Tic("Setup",this->Comm(),true,3);
  476. std::vector<Node_t*> leaf_nodes;
  477. int max_depth=0;
  478. { // Build leaf node list
  479. int max_depth_loc=0;
  480. std::vector<Node_t*>& nodes=this->GetNodeList();
  481. for(size_t i=0;i<nodes.size();i++){
  482. Node_t* n=nodes[i];
  483. if(!n->IsGhost() && n->IsLeaf()) leaf_nodes.push_back(n);
  484. if(n->Depth()>max_depth_loc) max_depth_loc=n->Depth();
  485. }
  486. MPI_Allreduce(&max_depth_loc, &max_depth, 1, MPI_INT, MPI_MAX, *this->Comm());
  487. }
  488. Profile::Toc();
  489. #if defined(__INTEL_OFFLOAD) || defined(PVFMM_HAVE_CUDA)
  490. if(device){ // Host2Device:Src
  491. Profile::Tic("Host2Device:Src",this->Comm(),false,5);
  492. if(setup_data[0+MAX_DEPTH*2]. coord_data!=NULL) setup_data[0+MAX_DEPTH*2]. coord_data->AllocDevice(true);
  493. if(setup_data[0+MAX_DEPTH*2]. input_data!=NULL) setup_data[0+MAX_DEPTH*2]. input_data->AllocDevice(true);
  494. Profile::Toc();
  495. }
  496. #endif
  497. if(bndry==Periodic){ //Add contribution from periodic infinite tiling.
  498. Profile::Tic("BoundaryCondition",this->Comm(),false,5);
  499. fmm_mat->PeriodicBC(dynamic_cast<Node_t*>(this->RootNode()));
  500. Profile::Toc();
  501. }
  502. for(size_t i=0; i<=(fmm_mat->ScaleInvar()?0:max_depth); i++){ // U,V,W,X-lists
  503. if(!fmm_mat->ScaleInvar()){ // Precomp
  504. std::stringstream level_str;
  505. level_str<<"Level-"<<std::setfill('0')<<std::setw(2)<<i<<"\0";
  506. Profile::Tic(level_str.str().c_str(),this->Comm(),false,5);
  507. Profile::Tic("Precomp",this->Comm(),false,5);
  508. {// Precomp U
  509. Profile::Tic("Precomp-U",this->Comm(),false,10);
  510. fmm_mat->SetupPrecomp(setup_data[i+MAX_DEPTH*0],device);
  511. Profile::Toc();
  512. }
  513. {// Precomp W
  514. Profile::Tic("Precomp-W",this->Comm(),false,10);
  515. fmm_mat->SetupPrecomp(setup_data[i+MAX_DEPTH*1],device);
  516. Profile::Toc();
  517. }
  518. {// Precomp X
  519. Profile::Tic("Precomp-X",this->Comm(),false,10);
  520. fmm_mat->SetupPrecomp(setup_data[i+MAX_DEPTH*2],device);
  521. Profile::Toc();
  522. }
  523. if(0){// Precomp V
  524. Profile::Tic("Precomp-V",this->Comm(),false,10);
  525. fmm_mat->SetupPrecomp(setup_data[i+MAX_DEPTH*3], /*device*/ false);
  526. Profile::Toc();
  527. }
  528. Profile::Toc();
  529. }
  530. {// X-List
  531. Profile::Tic("X-List",this->Comm(),false,5);
  532. fmm_mat->X_List(setup_data[i+MAX_DEPTH*2], device);
  533. Profile::Toc();
  534. }
  535. #if defined(__INTEL_OFFLOAD) || defined(PVFMM_HAVE_CUDA)
  536. if(i==0 && device){ // Host2Device:Mult
  537. Profile::Tic("Host2Device:Mult",this->Comm(),false,5);
  538. if(setup_data[0+MAX_DEPTH*1]. input_data!=NULL) setup_data[0+MAX_DEPTH*1]. input_data->AllocDevice(true);
  539. Profile::Toc();
  540. }
  541. if(device) if(i==(fmm_mat->ScaleInvar()?0:max_depth)){ // Device2Host: LocalExp
  542. Profile::Tic("Device2Host:LocExp",this->Comm(),false,5);
  543. if(setup_data[0+MAX_DEPTH*2].output_data!=NULL){
  544. Matrix<Real_t>& output_data=*setup_data[0+MAX_DEPTH*2].output_data;
  545. assert(fmm_mat->dev_buffer.Dim()>=output_data.Dim(0)*output_data.Dim(1));
  546. output_data.Device2Host((Real_t*)&fmm_mat->dev_buffer[0]);
  547. }
  548. Profile::Toc();
  549. }
  550. #endif
  551. {// W-List
  552. Profile::Tic("W-List",this->Comm(),false,5);
  553. fmm_mat->W_List(setup_data[i+MAX_DEPTH*1], device);
  554. Profile::Toc();
  555. }
  556. {// U-List
  557. Profile::Tic("U-List",this->Comm(),false,5);
  558. fmm_mat->U_List(setup_data[i+MAX_DEPTH*0], device);
  559. Profile::Toc();
  560. }
  561. {// V-List
  562. Profile::Tic("V-List",this->Comm(),false,5);
  563. fmm_mat->V_List(setup_data[i+MAX_DEPTH*3], /*device*/ false);
  564. Profile::Toc();
  565. }
  566. if(!fmm_mat->ScaleInvar()){ // Wait
  567. #ifdef __INTEL_OFFLOAD
  568. int wait_lock_idx=-1;
  569. if(device) wait_lock_idx=MIC_Lock::curr_lock();
  570. #pragma offload if(device) target(mic:0)
  571. {if(device) MIC_Lock::wait_lock(wait_lock_idx);}
  572. #endif
  573. Profile::Toc();
  574. }
  575. }
  576. #if defined(__INTEL_OFFLOAD) || defined(PVFMM_HAVE_CUDA)
  577. Profile::Tic("D2H_Wait:LocExp",this->Comm(),false,5);
  578. if(device) if(setup_data[0+MAX_DEPTH*2].output_data!=NULL){
  579. Real_t* dev_ptr=(Real_t*)&fmm_mat->dev_buffer[0];
  580. Matrix<Real_t>& output_data=*setup_data[0+MAX_DEPTH*2].output_data;
  581. size_t n=output_data.Dim(0)*output_data.Dim(1);
  582. Real_t* host_ptr=output_data[0];
  583. output_data.Device2HostWait();
  584. #pragma omp parallel for
  585. for(size_t i=0;i<n;i++){
  586. host_ptr[i]+=dev_ptr[i];
  587. }
  588. }
  589. Profile::Toc();
  590. Profile::Tic("Device2Host:Trg",this->Comm(),false,5);
  591. if(device) if(setup_data[0+MAX_DEPTH*0].output_data!=NULL){ // Device2Host: Target
  592. Matrix<Real_t>& output_data=*setup_data[0+MAX_DEPTH*0].output_data;
  593. assert(fmm_mat->dev_buffer.Dim()>=output_data.Dim(0)*output_data.Dim(1));
  594. output_data.Device2Host((Real_t*)&fmm_mat->dev_buffer[0]);
  595. }
  596. Profile::Toc();
  597. #endif
  598. Profile::Tic("D2D",this->Comm(),false,5);
  599. for(size_t i=0; i<=max_depth; i++){ // Down2Down
  600. if(!fmm_mat->ScaleInvar()) fmm_mat->SetupPrecomp(setup_data[i+MAX_DEPTH*4],/*device*/ false);
  601. fmm_mat->Down2Down(setup_data[i+MAX_DEPTH*4]);
  602. }
  603. Profile::Toc();
  604. Profile::Tic("D2T",this->Comm(),false,5);
  605. for(int i=0; i<=(fmm_mat->ScaleInvar()?0:max_depth); i++){ // Down2Target
  606. if(!fmm_mat->ScaleInvar()) fmm_mat->SetupPrecomp(setup_data[i+MAX_DEPTH*5],/*device*/ false);
  607. fmm_mat->Down2Target(setup_data[i+MAX_DEPTH*5]);
  608. }
  609. Profile::Toc();
  610. #if defined(__INTEL_OFFLOAD) || defined(PVFMM_HAVE_CUDA)
  611. Profile::Tic("D2H_Wait:Trg",this->Comm(),false,5);
  612. if(device) if(setup_data[0+MAX_DEPTH*0].output_data!=NULL){
  613. Real_t* dev_ptr=(Real_t*)&fmm_mat->dev_buffer[0];
  614. Matrix<Real_t>& output_data=*setup_data[0+MAX_DEPTH*0].output_data;
  615. size_t n=output_data.Dim(0)*output_data.Dim(1);
  616. Real_t* host_ptr=output_data[0];
  617. output_data.Device2HostWait();
  618. #pragma omp parallel for
  619. for(size_t i=0;i<n;i++){
  620. host_ptr[i]+=dev_ptr[i];
  621. }
  622. }
  623. Profile::Toc();
  624. #endif
  625. Profile::Tic("PostProc",this->Comm(),false,5);
  626. fmm_mat->PostProcessing(leaf_nodes);
  627. Profile::Toc();
  628. }
  629. template <class FMM_Mat_t>
  630. void FMM_Tree<FMM_Mat_t>::Copy_FMMOutput() {
  631. std::vector<Node_t*>& all_nodes=this->GetNodeList();
  632. int omp_p=omp_get_max_threads();
  633. // Copy output to the tree.
  634. {
  635. size_t k=all_nodes.size();
  636. #pragma omp parallel for
  637. for(int j=0;j<omp_p;j++){
  638. size_t a=(k*j)/omp_p;
  639. size_t b=(k*(j+1))/omp_p;
  640. fmm_mat->CopyOutput(&(all_nodes[a]),b-a);
  641. }
  642. }
  643. }
  644. }//end namespace