fmm_tree.txx 21 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674
  1. /**
  2. * \file fmm_tree.txx
  3. * \author Dhairya Malhotra, dhairya.malhotra@gmail.com
  4. * \date 12-11-2010
  5. * \brief This file contains the implementation of the class FMM_Tree.
  6. */
  7. #include <assert.h>
  8. #include <fmm_node.hpp>
  9. #include <profile.hpp>
  10. namespace pvfmm{
  11. template <class FMM_Mat_t>
  12. void FMM_Tree<FMM_Mat_t>::Initialize(typename FMM_Node_t::NodeData* init_data) {
  13. Profile::Tic("InitTree",this->Comm(),true);{
  14. //Build octree from points.
  15. MPI_Tree<FMM_Node_t>::Initialize(init_data);
  16. Profile::Tic("InitFMMData",this->Comm(),true,5);
  17. { //Initialize FMM data.
  18. std::vector<FMM_Node_t*>& nodes=this->GetNodeList();
  19. #pragma omp parallel for
  20. for(size_t i=0;i<nodes.size();i++){
  21. if(nodes[i]->FMMData()==NULL) nodes[i]->FMMData()=new typename FMM_Mat_t::FMMData;
  22. }
  23. }
  24. Profile::Toc();
  25. }Profile::Toc();
  26. }
  27. template <class FMM_Mat_t>
  28. void FMM_Tree<FMM_Mat_t>::InitFMM_Tree(bool refine, BoundaryType bndry_) {
  29. Profile::Tic("InitFMM_Tree",this->Comm(),true);{
  30. interac_list.Initialize(this->Dim());
  31. bndry=bndry_;
  32. if(refine){
  33. //RefineTree
  34. Profile::Tic("RefineTree",this->Comm(),true,2);
  35. this->RefineTree();
  36. Profile::Toc();
  37. }
  38. //2:1 Balancing
  39. Profile::Tic("2:1Balance",this->Comm(),true,2);
  40. this->Balance21(bndry);
  41. Profile::Toc();
  42. //Redistribute nodes.
  43. Profile::Tic("Redistribute",this->Comm(),true,3);
  44. this->RedistNodes();
  45. Profile::Toc();
  46. }Profile::Toc();
  47. }
  48. template <class FMM_Mat_t>
  49. void FMM_Tree<FMM_Mat_t>::SetupFMM(FMM_Mat_t* fmm_mat_) {
  50. Profile::Tic("SetupFMM",this->Comm(),true);{
  51. bool device=true;
  52. #ifdef __INTEL_OFFLOAD
  53. Profile::Tic("InitLocks",this->Comm(),false,3);
  54. MIC_Lock::init();
  55. Profile::Toc();
  56. #endif
  57. int omp_p=omp_get_max_threads();
  58. fmm_mat=fmm_mat_;
  59. //Construct LET
  60. Profile::Tic("ConstructLET",this->Comm(),false,2);
  61. this->ConstructLET(bndry);
  62. Profile::Toc();
  63. //Set Colleagues (Needed to build U, V, W and X lists.)
  64. Profile::Tic("SetColleagues",this->Comm(),false,3);
  65. this->SetColleagues(bndry);
  66. Profile::Toc();
  67. Profile::Tic("BuildLists",this->Comm(),false,3);
  68. BuildInteracLists();
  69. Profile::Toc();
  70. Profile::Tic("CollectNodeData",this->Comm(),false,3);
  71. //Build node list.
  72. FMM_Node_t* n=dynamic_cast<FMM_Node_t*>(this->PostorderFirst());
  73. std::vector<FMM_Node_t*> all_nodes;
  74. while(n!=NULL){
  75. all_nodes.push_back(n);
  76. n=static_cast<FMM_Node_t*>(this->PostorderNxt(n));
  77. }
  78. //Collect node data into continuous array.
  79. std::vector<Vector<FMM_Node_t*> > node_lists; // TODO: Remove this parameter, not really needed
  80. fmm_mat->CollectNodeData(all_nodes, node_data_buff, node_lists);
  81. Profile::Toc();
  82. setup_data.resize(6*MAX_DEPTH);
  83. precomp_lst.resize(6);
  84. Profile::Tic("UListSetup",this->Comm(),false,3);
  85. for(size_t i=0;i<MAX_DEPTH;i++){
  86. setup_data[i+MAX_DEPTH*0].precomp_data=&precomp_lst[0];
  87. fmm_mat->U_ListSetup(setup_data[i+MAX_DEPTH*0],node_data_buff,node_lists,fmm_mat->Homogen()?(i==0?-1:MAX_DEPTH+1):i, device);
  88. }
  89. Profile::Toc();
  90. Profile::Tic("WListSetup",this->Comm(),false,3);
  91. for(size_t i=0;i<MAX_DEPTH;i++){
  92. setup_data[i+MAX_DEPTH*1].precomp_data=&precomp_lst[1];
  93. fmm_mat->W_ListSetup(setup_data[i+MAX_DEPTH*1],node_data_buff,node_lists,fmm_mat->Homogen()?(i==0?-1:MAX_DEPTH+1):i, device);
  94. }
  95. Profile::Toc();
  96. Profile::Tic("XListSetup",this->Comm(),false,3);
  97. for(size_t i=0;i<MAX_DEPTH;i++){
  98. setup_data[i+MAX_DEPTH*2].precomp_data=&precomp_lst[2];
  99. fmm_mat->X_ListSetup(setup_data[i+MAX_DEPTH*2],node_data_buff,node_lists,fmm_mat->Homogen()?(i==0?-1:MAX_DEPTH+1):i, device);
  100. }
  101. Profile::Toc();
  102. Profile::Tic("VListSetup",this->Comm(),false,3);
  103. for(size_t i=0;i<MAX_DEPTH;i++){
  104. setup_data[i+MAX_DEPTH*3].precomp_data=&precomp_lst[3];
  105. fmm_mat->V_ListSetup(setup_data[i+MAX_DEPTH*3],node_data_buff,node_lists,fmm_mat->Homogen()?(i==0?-1:MAX_DEPTH+1):i, /*device*/ false);
  106. }
  107. Profile::Toc();
  108. Profile::Tic("D2DSetup",this->Comm(),false,3);
  109. for(size_t i=0;i<MAX_DEPTH;i++){
  110. setup_data[i+MAX_DEPTH*4].precomp_data=&precomp_lst[4];
  111. fmm_mat->Down2DownSetup(setup_data[i+MAX_DEPTH*4],node_data_buff,node_lists,i, device);
  112. }
  113. Profile::Toc();
  114. Profile::Tic("D2TSetup",this->Comm(),false,3);
  115. for(size_t i=0;i<MAX_DEPTH;i++){
  116. setup_data[i+MAX_DEPTH*5].precomp_data=&precomp_lst[5];
  117. fmm_mat->Down2TargetSetup(setup_data[i+MAX_DEPTH*5],node_data_buff,node_lists,fmm_mat->Homogen()?(i==0?-1:MAX_DEPTH+1):i, device);
  118. }
  119. Profile::Toc();
  120. #ifdef __INTEL_OFFLOAD
  121. int wait_lock_idx=-1;
  122. wait_lock_idx=MIC_Lock::curr_lock();
  123. #pragma offload target(mic:0)
  124. {MIC_Lock::wait_lock(wait_lock_idx);}
  125. #endif
  126. }Profile::Toc();
  127. }
  128. template <class FMM_Mat_t>
  129. void FMM_Tree<FMM_Mat_t>::ClearFMMData() {
  130. Profile::Tic("ClearFMMData",this->Comm(),true);{
  131. bool device=true;
  132. if(setup_data[0+MAX_DEPTH*1]. input_data!=NULL) setup_data[0+MAX_DEPTH*1]. input_data->SetZero();
  133. if(setup_data[0+MAX_DEPTH*2].output_data!=NULL) setup_data[0+MAX_DEPTH*2].output_data->SetZero();
  134. if(setup_data[0+MAX_DEPTH*0].output_data!=NULL) setup_data[0+MAX_DEPTH*0].output_data->SetZero();
  135. if(device){ // Host2Device
  136. if(setup_data[0+MAX_DEPTH*1]. input_data!=NULL) setup_data[0+MAX_DEPTH*1]. input_data->AllocDevice(true);
  137. if(setup_data[0+MAX_DEPTH*2].output_data!=NULL) setup_data[0+MAX_DEPTH*2].output_data->AllocDevice(true);
  138. if(setup_data[0+MAX_DEPTH*0].output_data!=NULL) setup_data[0+MAX_DEPTH*0].output_data->AllocDevice(true);
  139. #ifdef __INTEL_OFFLOAD
  140. if(!fmm_mat->Homogen()){ // Wait
  141. int wait_lock_idx=-1;
  142. wait_lock_idx=MIC_Lock::curr_lock();
  143. #pragma offload target(mic:0)
  144. {MIC_Lock::wait_lock(wait_lock_idx);}
  145. }
  146. #endif
  147. MIC_Lock::init();
  148. }
  149. }Profile::Toc();
  150. }
  151. template <class FMM_Mat_t>
  152. void FMM_Tree<FMM_Mat_t>::RunFMM() {
  153. Profile::Tic("RunFMM",this->Comm(),true);{
  154. //Upward Pass
  155. Profile::Tic("UpwardPass",this->Comm(),false,2);
  156. UpwardPass();
  157. Profile::Toc();
  158. //Multipole Reduce Broadcast.
  159. Profile::Tic("ReduceBcast",this->Comm(),true,2);
  160. MultipoleReduceBcast();
  161. Profile::Toc();
  162. //Local 2:1 Balancing.
  163. //This can cause load imbalance, always use global 2:1 balance instead.
  164. //Profile::Tic("2:1Balance(local)",this->Comm(),false,3);
  165. //this->Balance21_local(bndry);
  166. //UpwardPass(true);
  167. //Profile::Toc();
  168. //Downward Pass
  169. Profile::Tic("DownwardPass",this->Comm(),true,2);
  170. DownwardPass();
  171. Profile::Toc();
  172. }Profile::Toc();
  173. }
  174. template <class FMM_Mat_t>
  175. void FMM_Tree<FMM_Mat_t>::UpwardPass() {
  176. std::vector<FMM_Node_t*> all_leaf_nodes;
  177. std::vector<std::vector<FMM_Node_t*> > nodes(MAX_DEPTH+1);
  178. std::vector<std::vector<FMM_Node_t*> > leaf_nodes(MAX_DEPTH+1);
  179. std::vector<FMM_Node_t*> all_nodes=this->GetNodeList();
  180. for(size_t i=0;i<all_nodes.size();i++){
  181. FMM_Node_t* n=all_nodes[i];
  182. if(!n->IsGhost()){
  183. if(!n->IsLeaf())
  184. nodes[n->Depth()].push_back(n);
  185. else{
  186. all_leaf_nodes.push_back(n);
  187. leaf_nodes[n->Depth()].push_back(n);
  188. }
  189. }
  190. }
  191. //Upward Pass (initialize all leaf nodes)
  192. if(fmm_mat->Homogen()){
  193. size_t k=all_leaf_nodes.size();
  194. fmm_mat->InitMultipole(&(all_leaf_nodes[0]),k,0);
  195. }else{ //Level by level
  196. for(int i=MAX_DEPTH;i>=0;i--){
  197. size_t k=leaf_nodes[i].size();
  198. fmm_mat->InitMultipole(&(leaf_nodes[i][0]),k,i);
  199. }
  200. }
  201. //Upward Pass (level by level)
  202. for(int i=MAX_DEPTH;i>=0;i--){
  203. size_t k=nodes[i].size();
  204. fmm_mat->Up2Up(&(nodes[i][0]),k,i);
  205. }
  206. }
  207. template <class FMM_Mat_t>
  208. void FMM_Tree<FMM_Mat_t>::BuildInteracLists() {
  209. std::vector<FMM_Node_t*>& n_list=this->GetNodeList();
  210. // Build interaction lists.
  211. int omp_p=omp_get_max_threads();
  212. {
  213. size_t k=n_list.size();
  214. #pragma omp parallel for
  215. for(int j=0;j<omp_p;j++){
  216. size_t a=(k*j)/omp_p;
  217. size_t b=(k*(j+1))/omp_p;
  218. for(size_t i=a;i<b;i++){
  219. FMM_Node_t* n=n_list[i];
  220. n->interac_list.resize(Type_Count);
  221. n->interac_list[D2D_Type]=interac_list.BuildList(n,D2D_Type);
  222. n->interac_list[D2T_Type]=interac_list.BuildList(n,D2T_Type);
  223. n->interac_list[U0_Type]=interac_list.BuildList(n,U0_Type);
  224. n->interac_list[U1_Type]=interac_list.BuildList(n,U1_Type);
  225. n->interac_list[U2_Type]=interac_list.BuildList(n,U2_Type);
  226. n->interac_list[V_Type]=interac_list.BuildList(n,V_Type);
  227. n->interac_list[V1_Type]=interac_list.BuildList(n,V1_Type);
  228. n->interac_list[W_Type]=interac_list.BuildList(n,W_Type);
  229. n->interac_list[X_Type]=interac_list.BuildList(n,X_Type);
  230. }
  231. }
  232. }
  233. }
  234. template <class FMM_Mat_t>
  235. void FMM_Tree<FMM_Mat_t>::MultipoleReduceBcast() {
  236. int num_p,rank;
  237. MPI_Comm_size(*this->Comm(),&num_p);
  238. MPI_Comm_rank(*this->Comm(),&rank );
  239. if(num_p==1) return;
  240. Profile::Tic("Reduce",this->Comm(),true,3);
  241. std::vector<MortonId> mins=this->GetMins();
  242. size_t bit_mask=1;
  243. size_t max_child=(1UL<<this->Dim());
  244. //Initialize initial send nodes.
  245. std::vector<FMM_Node_t*> send_nodes[2];
  246. //Initialize send_node[0]
  247. FMM_Node_t* tmp_node=static_cast<FMM_Node_t*>(this->RootNode());
  248. assert(!tmp_node->IsGhost());
  249. while(!tmp_node->IsLeaf()){
  250. FMM_Node_t* tmp_node_=NULL;
  251. for(size_t i=0;i<max_child;i++){
  252. tmp_node_=static_cast<FMM_Node_t*>(tmp_node->Child(i));
  253. if(tmp_node_!=NULL) if(!tmp_node_->IsGhost()) break;
  254. }
  255. tmp_node=tmp_node_; assert(tmp_node!=NULL);
  256. }
  257. int n[2];
  258. n[0]=tmp_node->Depth()+1;
  259. send_nodes[0].resize(n[0]);
  260. send_nodes[0][n[0]-1]=tmp_node;
  261. for(int i=n[0]-1;i>0;i--)
  262. send_nodes[0][i-1]=static_cast<FMM_Node_t*>(send_nodes[0][i]->Parent());
  263. //Initialize send_node[1]
  264. tmp_node=static_cast<FMM_Node_t*>(this->RootNode());
  265. while(!tmp_node->IsLeaf()){
  266. FMM_Node_t* tmp_node_=NULL;
  267. for(int i=max_child-1;i>=0;i--){
  268. tmp_node_=static_cast<FMM_Node_t*>(tmp_node->Child(i));
  269. if(tmp_node_!=NULL) if(!tmp_node_->IsGhost()) break;
  270. }
  271. tmp_node=tmp_node_; assert(tmp_node!=NULL);
  272. }
  273. n[1]=tmp_node->Depth()+1;
  274. send_nodes[1].resize(n[1]);
  275. send_nodes[1][n[1]-1]=tmp_node;
  276. for(int i=n[1]-1;i>0;i--)
  277. send_nodes[1][i-1]=static_cast<FMM_Node_t*>(send_nodes[1][i]->Parent());
  278. //Hypercube reduction.
  279. while(bit_mask<(size_t)num_p){
  280. size_t partner=rank^bit_mask; //Partner process id
  281. int merge_indx=((bit_mask & rank)==0?1:0);
  282. bit_mask=bit_mask<<1;
  283. //if(rank >= num_p - (num_p % bit_mask)) break;
  284. //Initialize send data.
  285. size_t s_node_cnt[2]={send_nodes[0].size(),send_nodes[1].size()};
  286. int send_size=2*sizeof(size_t)+(s_node_cnt[0]+s_node_cnt[1])*sizeof(MortonId);
  287. std::vector<PackedData> send_data(s_node_cnt[0]+s_node_cnt[1]);
  288. size_t s_iter=0;
  289. for(int i=0;i<2;i++)
  290. for(size_t j=0;j<s_node_cnt[i];j++){
  291. assert(send_nodes[i][j]!=NULL);
  292. send_data[s_iter]=send_nodes[i][j]->PackMultipole();
  293. send_size+=send_data[s_iter].length+sizeof(size_t);
  294. s_iter++;
  295. }
  296. char* send_buff=new char[send_size];
  297. char* buff_iter=send_buff;
  298. ((size_t*)buff_iter)[0]=s_node_cnt[0];
  299. ((size_t*)buff_iter)[1]=s_node_cnt[1];
  300. buff_iter+=2*sizeof(size_t);
  301. s_iter=0;
  302. for(int i=0;i<2;i++)
  303. for(size_t j=0;j<s_node_cnt[i];j++){
  304. ((MortonId*)buff_iter)[0]=send_nodes[i][j]->GetMortonId();
  305. buff_iter+=sizeof(MortonId);
  306. ((size_t*)buff_iter)[0]=send_data[s_iter].length;
  307. buff_iter+=sizeof(size_t);
  308. mem::memcopy((void*)buff_iter,send_data[s_iter].data,send_data[s_iter].length);
  309. buff_iter+=send_data[s_iter].length;
  310. s_iter++;
  311. }
  312. //Exchange send and recv sizes
  313. int recv_size=0;
  314. MPI_Status status;
  315. char* recv_buff=NULL;
  316. if(partner<(size_t)num_p){
  317. MPI_Sendrecv(&send_size, 1, MPI_INT, partner, 0, &recv_size, 1, MPI_INT, partner, 0, *this->Comm(), &status);
  318. recv_buff=new char[recv_size];
  319. MPI_Sendrecv(send_buff, send_size, MPI_BYTE, partner, 0, recv_buff, recv_size, MPI_BYTE, partner, 0, *this->Comm(), &status);
  320. }
  321. //Need an extra broadcast for incomplete hypercubes.
  322. size_t p0_start=num_p - (num_p % (bit_mask ));
  323. size_t p0_end =num_p - (num_p % (bit_mask>>1));
  324. if(((size_t)rank >= p0_start) && ((size_t)num_p>p0_end) && ((size_t)rank < p0_end) ){
  325. size_t bit_mask0=1;
  326. size_t num_p0=p0_end-p0_start;
  327. while( bit_mask0 < num_p0 ){
  328. if( (bit_mask0<<1) > (num_p - p0_end) ){
  329. size_t partner0=rank^bit_mask0;
  330. if( rank-p0_start < bit_mask0 ){
  331. //Send
  332. MPI_Send(&recv_size, 1, MPI_INT , partner0, 0, *this->Comm());
  333. MPI_Send( recv_buff, recv_size, MPI_BYTE, partner0, 0, *this->Comm());
  334. }else if( rank-p0_start < (bit_mask0<<1) ){
  335. //Receive
  336. if(recv_size>0) delete[] recv_buff;
  337. MPI_Recv(&recv_size, 1, MPI_INT , partner0, 0, *this->Comm(), &status);
  338. recv_buff=new char[recv_size];
  339. MPI_Recv( recv_buff, recv_size, MPI_BYTE, partner0, 0, *this->Comm(), &status);
  340. }
  341. }
  342. bit_mask0=bit_mask0<<1;
  343. }
  344. }
  345. //Construct nodes from received data.
  346. if(recv_size>0){
  347. buff_iter=recv_buff;
  348. size_t r_node_cnt[2]={((size_t*)buff_iter)[0],((size_t*)buff_iter)[1]};
  349. buff_iter+=2*sizeof(size_t);
  350. std::vector<MortonId> r_mid[2];
  351. r_mid[0].resize(r_node_cnt[0]);
  352. r_mid[1].resize(r_node_cnt[1]);
  353. std::vector<FMM_Node_t*> recv_nodes[2];
  354. recv_nodes[0].resize(r_node_cnt[0]);
  355. recv_nodes[1].resize(r_node_cnt[1]);
  356. std::vector<PackedData> recv_data[2];
  357. recv_data[0].resize(r_node_cnt[0]);
  358. recv_data[1].resize(r_node_cnt[1]);
  359. for(int i=0;i<2;i++)
  360. for(size_t j=0;j<r_node_cnt[i];j++){
  361. r_mid[i][j]=((MortonId*)buff_iter)[0];
  362. buff_iter+=sizeof(MortonId);
  363. recv_data[i][j].length=((size_t*)buff_iter)[0];
  364. buff_iter+=sizeof(size_t);
  365. recv_data[i][j].data=(void*)buff_iter;
  366. buff_iter+=recv_data[i][j].length;
  367. }
  368. // Add multipole expansion to existing nodes.
  369. for(size_t i=0;i<r_node_cnt[1-merge_indx];i++){
  370. if(i<send_nodes[merge_indx].size()){
  371. if(r_mid[1-merge_indx][i]==send_nodes[merge_indx][i]->GetMortonId()){
  372. send_nodes[merge_indx][i]->AddMultipole(recv_data[1-merge_indx][i]);
  373. }else break;
  374. }else break;
  375. }
  376. bool new_branch=false;
  377. for(size_t i=0;i<r_node_cnt[merge_indx];i++){
  378. if(i<send_nodes[merge_indx].size() && !new_branch){
  379. if(r_mid[merge_indx][i]==send_nodes[merge_indx][i]->GetMortonId()){
  380. recv_nodes[merge_indx][i]=send_nodes[merge_indx][i];
  381. }else{
  382. new_branch=true;
  383. size_t n_=(i<(size_t)n[merge_indx]?n[merge_indx]:i);
  384. for(size_t j=n_;j<send_nodes[merge_indx].size();j++)
  385. delete send_nodes[merge_indx][j];
  386. if(i<(size_t)n[merge_indx]) n[merge_indx]=i;
  387. }
  388. }
  389. if(i>=send_nodes[merge_indx].size() || new_branch){
  390. recv_nodes[merge_indx][i]=static_cast<FMM_Node_t*>(this->NewNode());
  391. recv_nodes[merge_indx][i]->SetCoord(r_mid[merge_indx][i]);
  392. recv_nodes[merge_indx][i]->InitMultipole(recv_data[merge_indx][i]);
  393. }
  394. }
  395. send_nodes[merge_indx]=recv_nodes[merge_indx];
  396. }
  397. delete[] send_buff;
  398. delete[] recv_buff;
  399. }
  400. for(int i=0;i<2;i++)
  401. for(size_t j=n[i];j<send_nodes[i].size();j++)
  402. delete send_nodes[i][j];
  403. Profile::Toc();
  404. //Now Broadcast nodes to build LET.
  405. Profile::Tic("Broadcast",this->Comm(),true,4);
  406. this->ConstructLET(bndry);
  407. Profile::Toc();
  408. }
  409. template <class FMM_Mat_t>
  410. void FMM_Tree<FMM_Mat_t>::DownwardPass() {
  411. bool device=true;
  412. Profile::Tic("Setup",this->Comm(),true,3);
  413. std::vector<FMM_Node_t*> leaf_nodes;
  414. int max_depth=0;
  415. { // Build leaf node list
  416. std::vector<FMM_Node_t*>& nodes=this->GetNodeList();
  417. for(size_t i=0;i<nodes.size();i++){
  418. FMM_Node_t* n=nodes[i];
  419. if(!n->IsGhost() && n->IsLeaf()) leaf_nodes.push_back(n);
  420. if(n->Depth()>max_depth) max_depth=n->Depth();
  421. }
  422. }
  423. Profile::Toc();
  424. #ifdef __INTEL_OFFLOAD
  425. if(device){ // Host2Device:Src
  426. Profile::Tic("Host2Device:Src",this->Comm(),false,5);
  427. if(setup_data[0+MAX_DEPTH*2]. coord_data!=NULL) setup_data[0+MAX_DEPTH*2]. coord_data->AllocDevice(true);
  428. if(setup_data[0+MAX_DEPTH*2]. input_data!=NULL) setup_data[0+MAX_DEPTH*2]. input_data->AllocDevice(true);
  429. Profile::Toc();
  430. }
  431. #endif
  432. if(bndry==Periodic){ //Add contribution from periodic infinite tiling.
  433. Profile::Tic("BoundaryCondition",this->Comm(),false,5);
  434. fmm_mat->PeriodicBC(dynamic_cast<FMM_Node_t*>(this->RootNode()));
  435. Profile::Toc();
  436. }
  437. for(size_t i=0; i<=(fmm_mat->Homogen()?0:max_depth); i++){ // U,V,W,X-lists
  438. if(!fmm_mat->Homogen()){ // Precomp
  439. std::stringstream level_str;
  440. level_str<<"Level-"<<std::setfill('0')<<std::setw(2)<<i<<"\0";
  441. Profile::Tic(level_str.str().c_str(),this->Comm(),false,5);
  442. Profile::Tic("Precomp",this->Comm(),false,5);
  443. {// Precomp U
  444. Profile::Tic("Precomp-U",this->Comm(),false,10);
  445. fmm_mat->SetupPrecomp(setup_data[i+MAX_DEPTH*0],device);
  446. Profile::Toc();
  447. }
  448. {// Precomp W
  449. Profile::Tic("Precomp-W",this->Comm(),false,10);
  450. fmm_mat->SetupPrecomp(setup_data[i+MAX_DEPTH*1],device);
  451. Profile::Toc();
  452. }
  453. {// Precomp X
  454. Profile::Tic("Precomp-X",this->Comm(),false,10);
  455. fmm_mat->SetupPrecomp(setup_data[i+MAX_DEPTH*2],device);
  456. Profile::Toc();
  457. }
  458. {// Precomp V
  459. Profile::Tic("Precomp-V",this->Comm(),false,10);
  460. fmm_mat->SetupPrecomp(setup_data[i+MAX_DEPTH*3], /*device*/ false);
  461. Profile::Toc();
  462. }
  463. Profile::Toc();
  464. }
  465. {// X-List
  466. Profile::Tic("X-List",this->Comm(),false,5);
  467. fmm_mat->X_List(setup_data[i+MAX_DEPTH*2], device);
  468. Profile::Toc();
  469. }
  470. #ifdef __INTEL_OFFLOAD
  471. if(i==0 && device){ // Host2Device:Mult
  472. Profile::Tic("Host2Device:Mult",this->Comm(),false,5);
  473. if(setup_data[0+MAX_DEPTH*1]. input_data!=NULL) setup_data[0+MAX_DEPTH*1]. input_data->AllocDevice(true);
  474. Profile::Toc();
  475. }
  476. if(device) if(i==(fmm_mat->Homogen()?0:max_depth)){ // Device2Host: LocalExp
  477. Profile::Tic("Device2Host:LocExp",this->Comm(),false,5);
  478. if(setup_data[0+MAX_DEPTH*2].output_data!=NULL){
  479. Matrix<Real_t>& output_data=*setup_data[0+MAX_DEPTH*2].output_data;
  480. assert(fmm_mat->dev_buffer.Dim()>=output_data.Dim(0)*output_data.Dim(1));
  481. output_data.Device2Host((Real_t*)&fmm_mat->dev_buffer[0]);
  482. }
  483. Profile::Toc();
  484. }
  485. #endif
  486. {// W-List
  487. Profile::Tic("W-List",this->Comm(),false,5);
  488. fmm_mat->W_List(setup_data[i+MAX_DEPTH*1], device);
  489. Profile::Toc();
  490. }
  491. {// U-List
  492. Profile::Tic("U-List",this->Comm(),false,5);
  493. fmm_mat->U_List(setup_data[i+MAX_DEPTH*0], device);
  494. Profile::Toc();
  495. }
  496. {// V-List
  497. Profile::Tic("V-List",this->Comm(),false,5);
  498. fmm_mat->V_List(setup_data[i+MAX_DEPTH*3], /*device*/ false);
  499. Profile::Toc();
  500. }
  501. if(!fmm_mat->Homogen()){ // Wait
  502. #ifdef __INTEL_OFFLOAD
  503. int wait_lock_idx=-1;
  504. if(device) wait_lock_idx=MIC_Lock::curr_lock();
  505. #pragma offload if(device) target(mic:0)
  506. {if(device) MIC_Lock::wait_lock(wait_lock_idx);}
  507. #endif
  508. Profile::Toc();
  509. }
  510. }
  511. #ifdef __INTEL_OFFLOAD
  512. Profile::Tic("D2H_Wait:LocExp",this->Comm(),false,5);
  513. if(device) if(setup_data[0+MAX_DEPTH*2].output_data!=NULL){
  514. Real_t* dev_ptr=(Real_t*)&fmm_mat->dev_buffer[0];
  515. Matrix<Real_t>& output_data=*setup_data[0+MAX_DEPTH*2].output_data;
  516. size_t n=output_data.Dim(0)*output_data.Dim(1);
  517. Real_t* host_ptr=output_data[0];
  518. output_data.Device2HostWait();
  519. #pragma omp parallel for
  520. for(size_t i=0;i<n;i++){
  521. host_ptr[i]+=dev_ptr[i];
  522. }
  523. }
  524. Profile::Toc();
  525. Profile::Tic("Device2Host:Trg",this->Comm(),false,5);
  526. if(device) if(setup_data[0+MAX_DEPTH*0].output_data!=NULL){ // Device2Host: Target
  527. Matrix<Real_t>& output_data=*setup_data[0+MAX_DEPTH*0].output_data;
  528. assert(fmm_mat->dev_buffer.Dim()>=output_data.Dim(0)*output_data.Dim(1));
  529. output_data.Device2Host((Real_t*)&fmm_mat->dev_buffer[0]);
  530. }
  531. Profile::Toc();
  532. #endif
  533. Profile::Tic("D2D",this->Comm(),false,5);
  534. for(size_t i=0; i<=max_depth; i++){ // Down2Down
  535. if(!fmm_mat->Homogen()) fmm_mat->SetupPrecomp(setup_data[i+MAX_DEPTH*4],device);
  536. fmm_mat->Down2Down(setup_data[i+MAX_DEPTH*4]);
  537. }
  538. Profile::Toc();
  539. Profile::Tic("D2T",this->Comm(),false,5);
  540. for(int i=0; i<=(fmm_mat->Homogen()?0:max_depth); i++){ // Down2Target
  541. if(!fmm_mat->Homogen()) fmm_mat->SetupPrecomp(setup_data[i+MAX_DEPTH*5],device);
  542. fmm_mat->Down2Target(setup_data[i+MAX_DEPTH*5]);
  543. }
  544. Profile::Toc();
  545. #ifdef __INTEL_OFFLOAD
  546. Profile::Tic("D2H_Wait:Trg",this->Comm(),false,5);
  547. if(device) if(setup_data[0+MAX_DEPTH*0].output_data!=NULL){
  548. Real_t* dev_ptr=(Real_t*)&fmm_mat->dev_buffer[0];
  549. Matrix<Real_t>& output_data=*setup_data[0+MAX_DEPTH*0].output_data;
  550. size_t n=output_data.Dim(0)*output_data.Dim(1);
  551. Real_t* host_ptr=output_data[0];
  552. output_data.Device2HostWait();
  553. #pragma omp parallel for
  554. for(size_t i=0;i<n;i++){
  555. host_ptr[i]+=dev_ptr[i];
  556. }
  557. }
  558. Profile::Toc();
  559. #endif
  560. Profile::Tic("PostProc",this->Comm(),false,5);
  561. fmm_mat->PostProcessing(leaf_nodes);
  562. Profile::Toc();
  563. }
  564. template <class FMM_Mat_t>
  565. void FMM_Tree<FMM_Mat_t>::Copy_FMMOutput() {
  566. std::vector<FMM_Node_t*>& all_nodes=this->GetNodeList();
  567. int omp_p=omp_get_max_threads();
  568. // Copy output to the tree.
  569. {
  570. size_t k=all_nodes.size();
  571. #pragma omp parallel for
  572. for(int j=0;j<omp_p;j++){
  573. size_t a=(k*j)/omp_p;
  574. size_t b=(k*(j+1))/omp_p;
  575. fmm_mat->CopyOutput(&(all_nodes[a]),b-a);
  576. }
  577. }
  578. }
  579. }//end namespace