mpi_tree.txx 69 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929193019311932193319341935193619371938193919401941194219431944194519461947194819491950195119521953195419551956195719581959196019611962196319641965196619671968196919701971197219731974197519761977197819791980198119821983198419851986198719881989199019911992199319941995199619971998199920002001200220032004200520062007200820092010201120122013201420152016201720182019202020212022202320242025202620272028202920302031203220332034203520362037203820392040204120422043204420452046
  1. /**
  2. * \file mpi_tree.txx
  3. * \author Dhairya Malhotra, dhairya.malhotra@gmail.com
  4. * \date 12-11-2010
  5. * \brief This file contains the implementation of the class MPI_Tree.
  6. */
  7. #include <assert.h>
  8. #include <typeinfo>
  9. #include <cstring>
  10. #include <fstream>
  11. #include <list>
  12. #include <set>
  13. #include <parUtils.h>
  14. #include <ompUtils.h>
  15. #include <profile.hpp>
  16. #include <mem_mgr.hpp>
  17. namespace pvfmm{
/**
 * \brief Construct a local linear octree (list of leaf octants) over a
 * sorted list of point MortonIds, so that each leaf holds at most
 * maxNumPts points. The limit is exceeded only when an octant is already
 * at depth maxDepth-1 and cannot be refined further.
 *
 * @author Dhairya Malhotra, dhairya.malhotra@gmail.com
 * @date 08 Feb 2011
 *
 * \param[in]     nodes     Sorted MortonIds of the local points.
 * \param[in,out] leaves    On input: octant range [leaves[0], leaves[last]]
 *                          bounding the local domain; on output: the leaf
 *                          octants of the local linear tree.
 * \param[in]     maxNumPts Target maximum number of points per octant.
 * \param[in]     maxDepth  Maximum tree depth (must be <= MAX_DEPTH).
 * \param[in]     complete  If true, also append empty octants to complete
 *                          the region up to the end of the input range
 *                          (used by the last processor).
 * \return 0 on success.
 */
inline int p2oLocal(Vector<MortonId> & nodes, Vector<MortonId>& leaves,
    unsigned int maxNumPts, unsigned int maxDepth, bool complete) {
  assert(maxDepth<=MAX_DEPTH);
  std::vector<MortonId> leaves_lst;
  unsigned int init_size=leaves.Dim();
  unsigned int num_pts=nodes.Dim();
  MortonId curr_node=leaves[0];                    // First octant to cover.
  MortonId last_node=leaves[init_size-1].NextId(); // One past the last octant.
  MortonId next_node;
  unsigned int curr_pt=0;                 // First point inside curr_node.
  unsigned int next_pt=curr_pt+maxNumPts; // Candidate first point of the next leaf.
  while(next_pt <= num_pts){
    next_node = curr_node.NextId();
    // Refine curr_node until it contains at most maxNumPts points, or the
    // maximum refinement depth is reached.
    while( next_pt < num_pts && next_node > nodes[next_pt] && curr_node.GetDepth() < maxDepth-1 ){
      curr_node = curr_node.getDFD(curr_node.GetDepth()+1);
      next_node = curr_node.NextId();
    }
    leaves_lst.push_back(curr_node);
    curr_node = next_node;
    unsigned int inc=maxNumPts;
    // Skip ahead with a doubling stride past points that still precede
    // curr_node.
    while(next_pt < num_pts && curr_node > nodes[next_pt]){
      // We have more than maxNumPts points per octant because the node can
      // not be refined any further.
      inc=inc<<1;
      next_pt+=inc;
      if(next_pt > num_pts){
        next_pt = num_pts;
        break;
      }
    }
    // Locate the first point that belongs to curr_node.
    curr_pt = std::lower_bound(&nodes[0]+curr_pt,&nodes[0]+next_pt,curr_node,std::less<MortonId>())-&nodes[0];
    if(curr_pt >= num_pts) break;
    next_pt = curr_pt + maxNumPts;
    if(next_pt > num_pts) next_pt = num_pts;
  }
#ifndef NDEBUG
  // Sanity check: every leaf holds <= maxNumPts points (unless it is at the
  // deepest refinable level), and together the leaves cover all points.
  for(size_t i=0;i<leaves_lst.size();i++){
    size_t a=std::lower_bound(&nodes[0],&nodes[0]+nodes.Dim(),leaves_lst[i],std::less<MortonId>())-&nodes[0];
    size_t b=std::lower_bound(&nodes[0],&nodes[0]+nodes.Dim(),leaves_lst[i].NextId(),std::less<MortonId>())-&nodes[0];
    assert(b-a<=maxNumPts || leaves_lst[i].GetDepth()==maxDepth-1);
    if(i==leaves_lst.size()-1) assert(b==nodes.Dim() && a<nodes.Dim());
    if(i==0) assert(a==0);
  }
#endif
  if(complete)
  // Fill the (point-free) gap between the last leaf and last_node with the
  // coarsest octants that fit.
  while(curr_node<last_node){
    while( curr_node.NextId() > last_node && curr_node.GetDepth() < maxDepth-1 )
      curr_node = curr_node.getDFD(curr_node.GetDepth()+1);
    leaves_lst.push_back(curr_node);
    curr_node = curr_node.NextId();
  }
  leaves=leaves_lst;
  return 0;
}
/**
 * \brief Construct a complete, distributed linear octree from the MortonIds
 * of a distributed point set, with approximately maxNumPts points per
 * octant, and repartition the octants across processors.
 *
 * \param[in]  pt_mid    MortonIds of the local points (need not be sorted).
 * \param[out] nodes     Leaf octants owned by this process after partitioning.
 * \param[in]  maxDepth  Maximum tree depth.
 * \param[in]  maxNumPts Target maximum number of points per octant.
 * \param[in]  comm      MPI communicator.
 * \return 0 on success.
 */
inline int points2Octree(const Vector<MortonId>& pt_mid, Vector<MortonId>& nodes,
    unsigned int maxDepth, unsigned int maxNumPts, const MPI_Comm& comm ) {
  int myrank, np;
  MPI_Comm_rank(comm, &myrank);
  MPI_Comm_size(comm, &np);
  // Sort morton id of points.
  Profile::Tic("SortMortonId", &comm, true, 5);
  Vector<MortonId> pt_sorted;
  //par::partitionW<MortonId>(pt_mid, NULL, comm);
  par::HyperQuickSort(pt_mid, pt_sorted, comm);
  size_t pt_cnt=pt_sorted.Dim();
  Profile::Toc();
  // Add last few points from next process, to get the boundary octant right.
  Profile::Tic("Comm", &comm, true, 5);
  {
    { // Adjust maxNumPts so it does not exceed the average per-process count.
      size_t glb_pt_cnt=0;
      MPI_Allreduce(&pt_cnt, &glb_pt_cnt, 1, par::Mpi_datatype<size_t>::value(), MPI_SUM, comm);
      if(glb_pt_cnt<maxNumPts*np) maxNumPts=glb_pt_cnt/np;
    }
    size_t recv_size=0;
    size_t send_size=(2*maxNumPts<pt_cnt?2*maxNumPts:pt_cnt);
    { // Exchange transfer sizes with the neighboring ranks.
      MPI_Request recvRequest;
      MPI_Request sendRequest;
      MPI_Status statusWait;
      if(myrank < (np-1)) MPI_Irecv (&recv_size, 1, par::Mpi_datatype<size_t>::value(), myrank+1, 1, comm, &recvRequest);
      if(myrank > 0 ) MPI_Issend(&send_size, 1, par::Mpi_datatype<size_t>::value(), myrank-1, 1, comm, &sendRequest);
      if(myrank < (np-1)) MPI_Wait(&recvRequest, &statusWait);
      if(myrank > 0 ) MPI_Wait(&sendRequest, &statusWait); //This can be done later.
    }
    if(recv_size>0){// Resize pt_sorted to make room for the received points.
      Vector<MortonId> pt_sorted_(pt_cnt+recv_size);
      mem::memcopy(&pt_sorted_[0], &pt_sorted[0], pt_cnt*sizeof(MortonId));
      pt_sorted.Swap(pt_sorted_);
    }
    {// Exchange data: append the next rank's first send_size points.
      MPI_Request recvRequest;
      MPI_Request sendRequest;
      MPI_Status statusWait;
      if(myrank < (np-1)) MPI_Irecv (&pt_sorted[0]+pt_cnt, recv_size, par::Mpi_datatype<MortonId>::value(), myrank+1, 1, comm, &recvRequest);
      if(myrank > 0 ) MPI_Issend(&pt_sorted[0] , send_size, par::Mpi_datatype<MortonId>::value(), myrank-1, 1, comm, &sendRequest);
      if(myrank < (np-1)) MPI_Wait(&recvRequest, &statusWait);
      if(myrank > 0 ) MPI_Wait(&sendRequest, &statusWait); //This can be done later.
    }
  }
  Profile::Toc();
  // Construct local octree.
  Profile::Tic("p2o_local", &comm, false, 5);
  Vector<MortonId> nodes_local(1); nodes_local[0]=MortonId();
  p2oLocal(pt_sorted, nodes_local, maxNumPts, maxDepth, myrank==np-1);
  Profile::Toc();
  // Remove duplicate nodes on adjacent processors.
  Profile::Tic("RemoveDuplicates", &comm, true, 5);
  {
    size_t node_cnt=nodes_local.Dim();
    MortonId first_node;
    MortonId last_node=nodes_local[node_cnt-1];
    { // Send last_node to next process and get first_node from previous process.
      // NOTE(review): the request variable names are swapped here (Issend
      // uses recvRequest, Irecv uses sendRequest); the Wait pairing is still
      // consistent, so this is confusing but not incorrect.
      MPI_Request recvRequest;
      MPI_Request sendRequest;
      MPI_Status statusWait;
      if(myrank < (np-1)) MPI_Issend(& last_node, 1, par::Mpi_datatype<MortonId>::value(), myrank+1, 1, comm, &recvRequest);
      if(myrank > 0 ) MPI_Irecv (&first_node, 1, par::Mpi_datatype<MortonId>::value(), myrank-1, 1, comm, &sendRequest);
      if(myrank < (np-1)) MPI_Wait(&recvRequest, &statusWait);
      if(myrank > 0 ) MPI_Wait(&sendRequest, &statusWait); //This can be done later.
    }
    size_t i=0;
    std::vector<MortonId> node_lst;
    if(myrank){
      // Drop local octants that overlap the previous process's range, then
      // fill the gap [first_node, last_node) with complete octants.
      while(i<node_cnt && nodes_local[i].getDFD(maxDepth)<first_node) i++; assert(i);
      last_node=nodes_local[i>0?i-1:0].NextId(); // Next MortonId in the tree after first_node.
      while(first_node<last_node){ // Complete nodes between first_node and last_node.
        while(first_node.isAncestor(last_node))
          first_node=first_node.getDFD(first_node.GetDepth()+1);
        if(first_node==last_node) break;
        node_lst.push_back(first_node);
        first_node=first_node.NextId();
      }
    }
    // Keep the remaining local octants; drop the last one except on the last
    // rank (the next rank reconstructs it).
    for(;i<node_cnt-(myrank==np-1?0:1);i++) node_lst.push_back(nodes_local[i]);
    nodes=node_lst;
  }
  Profile::Toc();
  // Repartition nodes.
  Profile::Tic("partitionW", &comm, false, 5);
  par::partitionW<MortonId>(nodes, NULL , comm);
  Profile::Toc();
  return 0;
}
/**
 * \brief Initialize the distributed tree from point data: build a linear
 * octree from the root node's point coordinates, scatter point coordinates
 * and values to the owning processes, then construct the local
 * pointer-based tree matching the linear tree.
 *
 * \param[in] init_data Node initialization data; init_data->max_pts is the
 *                      maximum number of points per octant.
 */
template <class TreeNode>
void MPI_Tree<TreeNode>::Initialize(typename Node_t::NodeData* init_data){
  //Initialize root node.
  Profile::Tic("InitRoot",Comm(),false,3);
  Tree<TreeNode>::Initialize(init_data);
  TreeNode* rnode=this->RootNode();
  assert(this->dim==COORD_DIM);
  Profile::Toc();
  Profile::Tic("Points2Octee",Comm(),true,3);
  Vector<MortonId> lin_oct;
  { //Get the linear tree.
    // Compute MortonId from pt_coord.
    Vector<MortonId> pt_mid;
    Vector<Real_t>& pt_coord=rnode->pt_coord;
    size_t pt_cnt=pt_coord.Dim()/this->dim;
    pt_mid.Resize(pt_cnt);
    #pragma omp parallel for
    for(size_t i=0;i<pt_cnt;i++){
      pt_mid[i]=MortonId(pt_coord[i*COORD_DIM+0],pt_coord[i*COORD_DIM+1],pt_coord[i*COORD_DIM+2],this->max_depth);
    }
    //Get the linear tree.
    points2Octree(pt_mid,lin_oct,this->max_depth,init_data->max_pts,*Comm());
  }
  Profile::Toc();
  Profile::Tic("ScatterPoints",Comm(),true,3);
  { // Sort and partition point coordinates and values.
    std::vector<Vector<Real_t>*> coord_lst;
    std::vector<Vector<Real_t>*> value_lst;
    std::vector<Vector<size_t>*> scatter_lst;
    rnode->NodeDataVec(coord_lst, value_lst, scatter_lst);
    assert(coord_lst.size()==value_lst.size());
    assert(coord_lst.size()==scatter_lst.size());
    Vector<MortonId> pt_mid;
    Vector<size_t> scatter_index;
    for(size_t i=0;i<coord_lst.size();i++){
      if(!coord_lst[i]) continue;
      Vector<Real_t>& pt_coord=*coord_lst[i];
      { // Compute MortonId from pt_coord.
        size_t pt_cnt=pt_coord.Dim()/this->dim;
        pt_mid.Resize(pt_cnt);
        #pragma omp parallel for
        for(size_t i=0;i<pt_cnt;i++){ // NOTE(review): shadows the outer index i.
          pt_mid[i]=MortonId(pt_coord[i*COORD_DIM+0],pt_coord[i*COORD_DIM+1],pt_coord[i*COORD_DIM+2],this->max_depth);
        }
      }
      // NOTE(review): `comm` is not declared in this function; presumably a
      // member of MPI_Tree holding the same communicator as *Comm() -- verify.
      par::SortScatterIndex(pt_mid , scatter_index, comm, &lin_oct[0]);
      par::ScatterForward (pt_coord, scatter_index, comm);
      if(value_lst[i]!=NULL){
        Vector<Real_t>& pt_value=*value_lst[i];
        par::ScatterForward(pt_value, scatter_index, comm);
      }
      if(scatter_lst[i]!=NULL){
        Vector<size_t>& pt_scatter=*scatter_lst[i];
        pt_scatter=scatter_index;
      }
    }
  }
  Profile::Toc();
  //Initialize the pointer based tree from the linear tree.
  Profile::Tic("PointerTree",Comm(),false,3);
  { // Construct the pointer tree from lin_oct
    int omp_p=omp_get_max_threads();
    // Partition nodes between threads: pre-create each thread's starting
    // node serially so the parallel traversals below are independent.
    rnode->SetGhost(false);
    for(int i=0;i<omp_p;i++){
      size_t idx=(lin_oct.Dim()*i)/omp_p;
      Node_t* n=FindNode(lin_oct[idx], true);
      assert(n->GetMortonId()==lin_oct[idx]);
      UNUSED(n);
    }
    #pragma omp parallel for
    for(int i=0;i<omp_p;i++){
      // Thread i shapes the tree over octants [a,b) of lin_oct.
      size_t a=(lin_oct.Dim()* i )/omp_p;
      size_t b=(lin_oct.Dim()*(i+1))/omp_p;
      size_t idx=a;
      Node_t* n=FindNode(lin_oct[idx], false);
      if(a==0) n=rnode;
      while(n!=NULL && (idx<b || i==omp_p-1)){
        n->SetGhost(false);
        MortonId dn=n->GetMortonId();
        if(idx<b && dn.isAncestor(lin_oct[idx])){
          // Ancestor of an owned octant: ensure it is subdivided.
          if(n->IsLeaf()) n->Subdivide();
        }else if(idx<b && dn==lin_oct[idx]){
          // Exactly an owned octant: make it a leaf and advance.
          if(!n->IsLeaf()) n->Truncate();
          assert(n->IsLeaf());
          idx++;
        }else{
          // Not owned by this process: truncate and mark as ghost.
          n->Truncate();
          n->SetGhost(true);
        }
        n=this->PreorderNxt(n);
      }
      assert(idx==b);
    }
  }
  Profile::Toc();
#ifndef NDEBUG
  CheckTree();
#endif
}
/**
 * \brief Coarsen the tree: truncate subdivided nodes that no longer satisfy
 * SubdivCond(). Nodes are first redistributed so that octant families which
 * might merge are owned by a single process, then coarsening runs in
 * parallel over OpenMP threads (splitter ancestors handled serially).
 */
template <class TreeNode>
void MPI_Tree<TreeNode>::CoarsenTree(){
  int myrank;
  MPI_Comm_rank(*Comm(),&myrank);
  //Redistribute: move this process's lower boundary so sibling groups that
  //could merge are not split across the processor boundary.
  {
    // Find the first non-ghost leaf.
    Node_t* n=this->PostorderFirst();
    while(n){
      if(n->IsLeaf() && !n->IsGhost()) break;
      n=this->PostorderNxt(n);
    }
    // Walk the boundary up while the previous owner's leaves are deeper.
    while(myrank){
      Node_t* n_parent=(Node_t*)n->Parent();
      Node_t* n_ = n_parent;
      while(n_ && !n_->IsLeaf()){
        n_=this->PostorderNxt(n_);
        if(!n_) break;
      }
      if(!n_ || n_->IsGhost()) break;
      if(n->Depth()<=n_->Depth()) break;
      if(n_->Depth()<=1) break;
      n=n_;
    }
    MortonId loc_min=n->GetMortonId();
    RedistNodes(&loc_min);
  }
  //Truncate ghost nodes and build node list
  std::vector<Node_t*> leaf_nodes;
  {
    // Clear the leading ghost region...
    Node_t* n=this->PostorderFirst();
    while(n!=NULL){
      if(n->IsLeaf() && !n->IsGhost()) break;
      n->Truncate();
      n->SetGhost(true);
      n->ClearData();
      n=this->PostorderNxt(n);
    }
    // ...collect the locally-owned leaves...
    while(n!=NULL){
      if(n->IsLeaf() && n->IsGhost()) break;
      if(n->IsLeaf()) leaf_nodes.push_back(n);
      n=this->PreorderNxt(n);
    }
    // ...and clear the trailing ghost region.
    while(n!=NULL){
      n->Truncate();
      n->SetGhost(true);
      n->ClearData();
      n=this->PreorderNxt(n);
    }
  }
  size_t node_cnt=leaf_nodes.size();
  //Partition nodes between OpenMP threads.
  int omp_p=omp_get_max_threads();
  std::vector<MortonId> mid(omp_p);       // Splitter MortonId per thread.
  std::vector<MortonId> split_key(omp_p); // NOTE(review): appears unused here.
  #pragma omp parallel for
  for(int i=0;i<omp_p;i++){
    mid[i]=leaf_nodes[(i*node_cnt)/omp_p]->GetMortonId();
  }
  //Coarsen Tree: each thread truncates failing interior nodes in its range,
  //skipping ancestors of the splitters (handled serially afterwards).
  #pragma omp parallel for
  for(int i=0;i<omp_p;i++){
    Node_t* n_=leaf_nodes[i*node_cnt/omp_p];
    if(i*node_cnt/omp_p<(i+1)*node_cnt/omp_p)
    while(n_!=NULL){
      MortonId n_mid=n_->GetMortonId();
      if(!n_->IsLeaf() && !n_mid.isAncestor(mid[i].getDFD()))
        if(i<omp_p-1? !n_mid.isAncestor(mid[i+1].getDFD()):true)
          if(!n_->SubdivCond()) n_->Truncate();
      if(i<omp_p-1? n_mid==mid[i+1]: false) break;
      n_=this->PostorderNxt(n_);
    }
  }
  //Truncate nodes along ancestors of splitters.
  for(int i=0;i<omp_p;i++){
    Node_t* n_=FindNode(mid[i], false, this->RootNode());
    while(n_->Depth()>0){
      n_=(Node_t*)n_->Parent();
      if(!n_->SubdivCond()) n_->Truncate();
      else break;
    }
  }
}
  348. template <class TreeNode>
  349. void MPI_Tree<TreeNode>::RefineTree(){
  350. int np, myrank;
  351. MPI_Comm_size(*Comm(),&np);
  352. MPI_Comm_rank(*Comm(),&myrank);
  353. int omp_p=omp_get_max_threads();
  354. int n_child=1UL<<this->Dim();
  355. //Coarsen tree.
  356. MPI_Tree<TreeNode>::CoarsenTree();
  357. //Build node list.
  358. std::vector<Node_t*> leaf_nodes;
  359. {
  360. Node_t* n=this->PostorderFirst();
  361. while(n!=NULL){
  362. if(n->IsLeaf() && !n->IsGhost())
  363. leaf_nodes.push_back(n);
  364. n=this->PostorderNxt(n);
  365. }
  366. }
  367. size_t tree_node_cnt=leaf_nodes.size();
  368. //Adaptive subdivision of leaf nodes with load balancing.
  369. for(int l=0;l<this->max_depth;l++){
  370. //Subdivide nodes.
  371. std::vector<std::vector<Node_t*> > leaf_nodes_(omp_p);
  372. #pragma omp parallel for
  373. for(int i=0;i<omp_p;i++){
  374. size_t a=(leaf_nodes.size()* i )/omp_p;
  375. size_t b=(leaf_nodes.size()*(i+1))/omp_p;
  376. for(size_t j=a;j<b;j++){
  377. if(leaf_nodes[j]->IsLeaf() && !leaf_nodes[j]->IsGhost()){
  378. if(leaf_nodes[j]->SubdivCond()) leaf_nodes[j]->Subdivide();
  379. if(!leaf_nodes[j]->IsLeaf())
  380. for(int k=0;k<n_child;k++)
  381. leaf_nodes_[i].push_back((Node_t*)leaf_nodes[j]->Child(k));
  382. }
  383. }
  384. }
  385. for(int i=0;i<omp_p;i++)
  386. tree_node_cnt+=(leaf_nodes_[i].size()/n_child)*(n_child-1);
  387. //Determine load imbalance.
  388. int global_max, global_sum;
  389. MPI_Allreduce(&tree_node_cnt, &global_max, 1, MPI_INT, MPI_MAX, *Comm());
  390. MPI_Allreduce(&tree_node_cnt, &global_sum, 1, MPI_INT, MPI_SUM, *Comm());
  391. //RedistNodes if needed.
  392. if(global_max*np>4*global_sum){
  393. #ifndef NDEBUG
  394. Profile::Tic("RedistNodes",Comm(),true,4);
  395. #endif
  396. RedistNodes();
  397. #ifndef NDEBUG
  398. Profile::Toc();
  399. #endif
  400. //Rebuild node list.
  401. leaf_nodes.clear();
  402. Node_t* n=this->PostorderFirst();
  403. while(n!=NULL){
  404. if(n->IsLeaf() && !n->IsGhost())
  405. leaf_nodes.push_back(n);
  406. n=this->PostorderNxt(n);
  407. }
  408. tree_node_cnt=leaf_nodes.size();
  409. }else{
  410. //Combine partial list of nodes.
  411. int node_cnt=0;
  412. for(int j=0;j<omp_p;j++)
  413. node_cnt+=leaf_nodes_[j].size();
  414. leaf_nodes.resize(node_cnt);
  415. #pragma omp parallel for
  416. for(int i=0;i<omp_p;i++){
  417. int offset=0;
  418. for(int j=0;j<i;j++)
  419. offset+=leaf_nodes_[j].size();
  420. for(size_t j=0;j<leaf_nodes_[i].size();j++)
  421. leaf_nodes[offset+j]=leaf_nodes_[i][j];
  422. }
  423. }
  424. }
  425. RedistNodes();
  426. MPI_Tree<TreeNode>::CoarsenTree();
  427. RedistNodes();
  428. MPI_Tree<TreeNode>::CoarsenTree();
  429. RedistNodes();
  430. }
/**
 * \brief Repartition the leaf octants (with their packed data) across
 * processors and reshape the local pointer tree to match.
 *
 * \param[in] loc_min If non-NULL, the first MortonId this process should own
 *                    (gathered from all ranks); otherwise a weighted
 *                    partition of the leaves is computed with par::partitionW.
 */
template <class TreeNode>
void MPI_Tree<TreeNode>::RedistNodes(MortonId* loc_min) {
  int np, myrank;
  MPI_Comm_size(*Comm(),&np);
  MPI_Comm_rank(*Comm(),&myrank);
  if(np==1)return;
  //Create a linear tree in dendro format.
  Node_t* curr_node=this->PreorderFirst();
  std::vector<MortonId> in;        // MortonIds of local non-ghost leaves.
  std::vector<Node_t*> node_lst;   // Matching node pointers.
  while(curr_node!=NULL){
    if(curr_node->IsLeaf() && !curr_node->IsGhost()){
      node_lst.push_back(curr_node);
      in.push_back(curr_node->GetMortonId());
      //in.back().setWeight(curr_node->NodeCost()); //Using default weights.
    }
    curr_node=this->PreorderNxt(curr_node);
  }
  size_t leaf_cnt=in.size();
  //Get new mins: the first octant each process owns after the exchange.
  std::vector<MortonId> new_mins(np);
  if(loc_min==NULL){
    //Partition vector of MortonIds using par::partitionW
    std::vector<MortonId> out=in;
    par::partitionW<MortonId>(out,NULL,*Comm());
    MPI_Allgather(&out[0] , 1, par::Mpi_datatype<MortonId>::value(),
        &new_mins[0], 1, par::Mpi_datatype<MortonId>::value(), *Comm());
  }else{
    MPI_Allgather(loc_min , 1, par::Mpi_datatype<MortonId>::value(),
        &new_mins[0], 1, par::Mpi_datatype<MortonId>::value(), *Comm());
  }
  //Now exchange nodes according to new mins: compute per-destination counts
  //and byte sizes, packing each leaf with Pack().
  std::vector<PackedData> data(leaf_cnt);
  std::vector<int> send_cnts; send_cnts.assign(np,0);
  std::vector<int> send_size; send_size.assign(np,0);
  size_t sbuff_size=0;
  int omp_p=omp_get_max_threads();
  #pragma omp parallel for reduction(+:sbuff_size)
  for(int i=0;i<omp_p;i++){
    // Thread i handles destination ranks [a,b).
    size_t a=( i *np)/omp_p;
    size_t b=((i+1)*np)/omp_p;
    if(b>a){
      size_t p_iter=a;
      size_t node_iter=std::lower_bound(&in[0], &in[in.size()], new_mins[a])-&in[0];
      for( ;node_iter<node_lst.size();node_iter++){
        // Advance p_iter to the destination rank owning this octant.
        while(p_iter+1u<(size_t)np? in[node_iter]>=new_mins[p_iter+1]: false) p_iter++;
        if(p_iter>=b) break;
        send_cnts[p_iter]++;
        data[node_iter]=node_lst[node_iter]->Pack();
        // Each packed node is prefixed by its MortonId and payload length.
        send_size[p_iter]+=data[node_iter].length+sizeof(size_t)+sizeof(MortonId);
        sbuff_size +=data[node_iter].length+sizeof(size_t)+sizeof(MortonId);
      }
    }
  }
  // Exchange counts and byte sizes.
  std::vector<int> recv_cnts(np);
  std::vector<int> recv_size(np);
  MPI_Alltoall(&send_cnts[0], 1, par::Mpi_datatype<int>::value(),
      &recv_cnts[0], 1, par::Mpi_datatype<int>::value(), *Comm());
  MPI_Alltoall(&send_size[0], 1, par::Mpi_datatype<int>::value(),
      &recv_size[0], 1, par::Mpi_datatype<int>::value(), *Comm());
  size_t recv_cnt=0;
  #pragma omp parallel for reduction(+:recv_cnt)
  for(int i=0;i<np;i++) recv_cnt+=recv_cnts[i];
  std::vector<MortonId> out(recv_cnt);
  std::vector<int> sdisp; sdisp.assign(np,0);
  std::vector<int> rdisp; rdisp.assign(np,0);
  omp_par::scan(&send_size[0],&sdisp[0],np); //TODO Don't need to do a full scan
  omp_par::scan(&recv_size[0],&rdisp[0],np); // as most entries will be 0.
  size_t rbuff_size=rdisp[np-1]+recv_size[np-1];
  char* send_buff=new char[sbuff_size];
  char* recv_buff=new char[rbuff_size];
  std::vector<char*> data_ptr(leaf_cnt);
  // Serialize headers [MortonId | length | payload] and remember each
  // payload's destination so the copies can run in parallel below.
  char* s_ptr=send_buff;
  for(size_t i=0;i<leaf_cnt;i++){
    *((MortonId*)s_ptr)=in [i] ; s_ptr+=sizeof(MortonId);
    *(( size_t*)s_ptr)=data[i].length; s_ptr+=sizeof(size_t);
    data_ptr[i]=s_ptr ; s_ptr+=data[i].length;
  }
  #pragma omp parallel for
  for(int p=0;p<omp_p;p++){
    size_t a=( p *leaf_cnt)/omp_p;
    size_t b=((p+1)*leaf_cnt)/omp_p;
    for(size_t i=a;i<b;i++)
      mem::memcopy(data_ptr[i], data[i].data, data[i].length);
  }
  par::Mpi_Alltoallv_sparse<char>(&send_buff[0], &send_size[0], &sdisp[0],
      &recv_buff[0], &recv_size[0], &rdisp[0], *Comm());
  // Deserialize the received stream into (MortonId, PackedData) pairs; the
  // payloads stay in recv_buff, so it must outlive the Unpack() calls below.
  char* r_ptr=recv_buff;
  std::vector<PackedData> r_data(recv_cnt);
  for(size_t i=0;i<recv_cnt;i++){
    out [i] =*(MortonId*)r_ptr; r_ptr+=sizeof(MortonId);
    r_data[i].length=*( size_t*)r_ptr; r_ptr+=sizeof(size_t);
    r_data[i].data = r_ptr; r_ptr+=r_data[i].length;
  }
  //Initialize all new nodes: walk the local tree in preorder and shape it to
  //match the received octant list (subdivide ancestors, truncate strangers).
  int nchld=1UL<<this->Dim();
  size_t node_iter=0;
  MortonId dn;
  node_lst.resize(recv_cnt);
  Node_t* n=this->PreorderFirst();
  while(n!=NULL && node_iter<recv_cnt){
    n->SetGhost(false);
    dn=n->GetMortonId();
    if(dn.isAncestor(out[node_iter]) && dn!=out[node_iter]){
      // Strict ancestor of a received octant: make sure it is subdivided.
      if(n->IsLeaf()){
        {
          n->SetGhost(true);
          n->Subdivide();
          n->SetGhost(false);
          for(int j=0;j<nchld;j++){
            Node_t* ch_node=(Node_t*)n->Child(j);
            ch_node->SetGhost(false);
          }
        }
      }
    }else if(dn==out[node_iter]){
      // Exactly a received octant: make it a leaf and record it for Unpack.
      if(!n->IsLeaf()){
        n->Truncate();
        n->SetGhost(false);
      }
      node_lst[node_iter]=n;
      node_iter++;
    }else{
      n->Truncate(); //This node does not belong to this process.
      n->SetGhost(true);
    }
    n=this->PreorderNxt(n);
  }
  // Everything past the last received octant is not ours either.
  while(n!=NULL){
    n->Truncate();
    n->SetGhost(true);
    n=this->PreorderNxt(n);
  }
  // Unpack node data in parallel.
  #pragma omp parallel for
  for(int p=0;p<omp_p;p++){
    size_t a=( p *recv_cnt)/omp_p;
    size_t b=((p+1)*recv_cnt)/omp_p;
    for(size_t i=a;i<b;i++)
      node_lst[i]->Unpack(r_data[i]);
  }
  //Free memory buffers.
  delete[] recv_buff;
  delete[] send_buff;
}
  575. template <class TreeNode>
  576. TreeNode* MPI_Tree<TreeNode>::FindNode(MortonId& key, bool subdiv, TreeNode* start){
  577. int num_child=1UL<<this->Dim();
  578. Node_t* n=start;
  579. if(n==NULL) n=this->RootNode();
  580. while(n->GetMortonId()<key && (!n->IsLeaf()||subdiv)){
  581. if(n->IsLeaf() && !n->IsGhost()) n->Subdivide();
  582. if(n->IsLeaf()) break;
  583. for(int j=0;j<num_child;j++){
  584. if(((Node_t*)n->Child(j))->GetMortonId().NextId()>key){
  585. n=(Node_t*)n->Child(j);
  586. break;
  587. }
  588. }
  589. }
  590. assert(!subdiv || n->IsGhost() || n->GetMortonId()==key);
  591. return n;
  592. }
  593. //list must be sorted.
  594. inline int lineariseList(std::vector<MortonId> & list, MPI_Comm comm) {
  595. int rank,size;
  596. MPI_Comm_rank(comm,&rank);
  597. MPI_Comm_size(comm,&size);
  598. //Remove empty processors...
  599. int new_rank, new_size;
  600. MPI_Comm new_comm;
  601. MPI_Comm_split(comm, (list.empty()?0:1), rank, &new_comm);
  602. MPI_Comm_rank (new_comm, &new_rank);
  603. MPI_Comm_size (new_comm, &new_size);
  604. if(!list.empty()) {
  605. //Send the last octant to the next processor.
  606. MortonId lastOctant = list[list.size()-1];
  607. MortonId lastOnPrev;
  608. MPI_Request recvRequest;
  609. MPI_Request sendRequest;
  610. if(new_rank > 0) {
  611. MPI_Irecv(&lastOnPrev, 1, par::Mpi_datatype<MortonId>::value(), new_rank-1, 1, new_comm, &recvRequest);
  612. }
  613. if(new_rank < (new_size-1)) {
  614. MPI_Issend( &lastOctant, 1, par::Mpi_datatype<MortonId>::value(), new_rank+1, 1, new_comm, &sendRequest);
  615. }
  616. if(new_rank > 0) {
  617. std::vector<MortonId> tmp(list.size()+1);
  618. for(size_t i = 0; i < list.size(); i++) {
  619. tmp[i+1] = list[i];
  620. }
  621. MPI_Status statusWait;
  622. MPI_Wait(&recvRequest, &statusWait);
  623. tmp[0] = lastOnPrev;
  624. list = tmp;
  625. tmp.clear();
  626. }
  627. {// Remove duplicates and ancestors.
  628. std::vector<MortonId> tmp;
  629. if(!(list.empty())) {
  630. for(unsigned int i = 0; i < (list.size()-1); i++) {
  631. if( (!(list[i].isAncestor(list[i+1]))) && (list[i] != list[i+1]) ) {
  632. tmp.push_back(list[i]);
  633. }
  634. }
  635. if(new_rank == (new_size-1)) {
  636. tmp.push_back(list[list.size()-1]);
  637. }
  638. }
  639. list = tmp;
  640. tmp.clear();
  641. }
  642. if(new_rank < (new_size-1)) {
  643. MPI_Status statusWait;
  644. MPI_Wait(&sendRequest, &statusWait);
  645. }
  646. }//not empty procs only
  647. return 1;
  648. }//end fn.
  649. inline unsigned int balance_wt(const MortonId* n){
  650. return n->GetDepth();
  651. }
/**
 * \brief Enforce the 2:1 balance constraint on a distributed linear octree.
 *
 * \param[in,out] in       Input octants (modified: redistributed, then
 *                         overwritten with locally generated balancing octants).
 * \param[out]    out      Balanced, sorted, linearised, load-partitioned octree.
 * \param[in]     dim      Spatial dimension (2^dim children per octant).
 * \param[in]     maxDepth Maximum tree depth.
 * \param[in]     periodic Whether neighbor lists wrap around the domain.
 * \param[in]     comm     MPI communicator.
 * \return 0 on success.
 */
inline int balanceOctree (std::vector<MortonId > &in, std::vector<MortonId > &out,
    unsigned int dim, unsigned int maxDepth, bool periodic, MPI_Comm comm) {
  int omp_p=omp_get_max_threads();
  int rank, size;
  MPI_Comm_size(comm,&size);
  MPI_Comm_rank(comm,&rank);
  #ifdef __VERBOSE__
  long long locInSize = in.size();
  #endif
  //////////////////////////////////////////////////////////////////////////////////////////////////
  //Redistribute.
  // Depth-weighted partition (balance_wt): deeper octants cost more below.
  par::partitionW<MortonId>(in, balance_wt, comm);
  //Build level-by-level set of nodes.
  // One std::set per (level, thread) pair; thread p owns slots
  // [p*(maxDepth+1), (p+1)*(maxDepth+1)). Sets keep each level sorted/unique.
  std::vector<std::set<MortonId> > nodes((maxDepth+1)*omp_p);
  #pragma omp parallel for
  for(int p=0;p<omp_p;p++){
    size_t a=( p   *in.size())/omp_p;
    size_t b=((p+1)*in.size())/omp_p;
    for(size_t i=a;i<b;i++)
      nodes[in[i].GetDepth()+(maxDepth+1)*p].insert(in[i]);
    //Add new nodes level-by-level.
    std::vector<MortonId> nbrs;
    unsigned int num_chld=1UL<<dim;
    // Sweep from the finest level upward: octants at level l force their
    // coarse neighbors' children (level l) to exist, seeding level l-1 parents.
    for(unsigned int l=maxDepth;l>1;l--){
      //Build set of parents of balancing nodes.
      std::set<MortonId> nbrs_parent;
      std::set<MortonId>::iterator start=nodes[l+(maxDepth+1)*p].begin();
      std::set<MortonId>::iterator end  =nodes[l+(maxDepth+1)*p].end();
      for(std::set<MortonId>::iterator node=start; node != end;){
        node->NbrList(nbrs, l-1, periodic);
        int nbr_cnt=nbrs.size();
        for(int i=0;i<nbr_cnt;i++)
          nbrs_parent.insert(nbrs[i].getAncestor(nbrs[i].GetDepth()-1));
        //node++; //Optimize this by skipping siblings.
        // Siblings share a parent and hence the same coarse neighbor list,
        // so advance past the whole sibling group at once.
        MortonId pnode=node->getAncestor(node->GetDepth()-1);
        while(node != end && pnode==node->getAncestor(node->GetDepth()-1)) node++;
      }
      //Get the balancing nodes.
      // Insert all children of each required parent into level l-1... these
      // become the octants that guarantee 2:1 balance at the next sweep step.
      std::set<MortonId>& ancestor_nodes=nodes[l-1+(maxDepth+1)*p];
      start=nbrs_parent.begin();
      end  =nbrs_parent.end();
      for(std::set<MortonId>::iterator node=start; node != end; node++){
        std::vector<MortonId> children;
        children=node->Children();
        for(unsigned int j=0;j<num_chld;j++)
          ancestor_nodes.insert(children[j]);
      }
    }
    //Remove non-leaf nodes.
    // Any octant whose child exists at level l cannot itself be a leaf;
    // erase such parents from level l-1.
    for(unsigned int l=1;l<=maxDepth;l++){
      std::set<MortonId>::iterator start=nodes[l  +(maxDepth+1)*p].begin();
      std::set<MortonId>::iterator end  =nodes[l  +(maxDepth+1)*p].end();
      std::set<MortonId>& ancestor_nodes=nodes[l-1+(maxDepth+1)*p];
      for(std::set<MortonId>::iterator node=start; node != end; node++){
        MortonId parent=node->getAncestor(node->GetDepth()-1);
        ancestor_nodes.erase(parent);
      }
    }
  }
  //Resize in.
  // Exclusive scan of per-thread counts gives each thread's write offset.
  std::vector<size_t> node_cnt(omp_p,0);
  std::vector<size_t> node_dsp(omp_p,0);
  #pragma omp parallel for
  for(int i=0;i<omp_p;i++){
    for(unsigned int j=0;j<=maxDepth;j++)
      node_cnt[i]+=nodes[j+i*(maxDepth+1)].size();
  }
  omp_par::scan(&node_cnt[0],&node_dsp[0], omp_p);
  in.resize(node_cnt[omp_p-1]+node_dsp[omp_p-1]);
  //Copy leaf nodes to in.
  #pragma omp parallel for
  for(int p=0;p<omp_p;p++){
    size_t node_iter=node_dsp[p];
    for(unsigned int l=maxDepth;l>=1;l--){
      std::set<MortonId>::iterator start=nodes[l  +(maxDepth+1)*p].begin();
      std::set<MortonId>::iterator end  =nodes[l  +(maxDepth+1)*p].end();
      for(std::set<MortonId>::iterator node=start; node != end; node++)
        in[node_iter++]=*node;
    }
  }
  #ifdef __VERBOSE__
  //Local size before removing duplicates and ancestors (linearise).
  long long locTmpSize = in.size();
  #endif
  //Sort, Linearise, Redistribute.
  //TODO Sort and linearize non-leaf nodes and then add leaf nodes.
  //TODO The following might work better as it reduces the comm bandwidth:
  //Split comm into sqrt(np) processes and sort, linearise for each comm group.
  //Then do the global sort, linearise with the original comm.
  par::HyperQuickSort(in, out, comm);
  lineariseList(out, comm);
  par::partitionW<MortonId>(out, NULL , comm);
  //////////////////////////////////////////////////////////////////////////////////////////////////
  #ifdef __VERBOSE__
  long long locOutSize = out.size();
  long long globInSize, globTmpSize, globOutSize;
  MPI_Allreduce(&locInSize , &globInSize , 1, par::Mpi_datatype<long long>::value(), MPI_SUM, comm);
  MPI_Allreduce(&locTmpSize, &globTmpSize, 1, par::Mpi_datatype<long long>::value(), MPI_SUM, comm);
  MPI_Allreduce(&locOutSize, &globOutSize, 1, par::Mpi_datatype<long long>::value(), MPI_SUM, comm);
  if(!rank) std::cout<<"Balance Octree. inpSize: "<<globInSize
                     <<" tmpSize: "<<globTmpSize
                     <<" outSize: "<<globOutSize
                     <<" activeNpes: "<<size<<std::endl;
  #endif
  return 0;
}//end function
  758. template <class TreeNode>
  759. void MPI_Tree<TreeNode>::Balance21(BoundaryType bndry) {
  760. int num_proc,myrank;
  761. MPI_Comm_rank(*Comm(),&myrank);
  762. MPI_Comm_size(*Comm(),&num_proc);
  763. //Using Dendro for balancing
  764. //Create a linear tree in dendro format.
  765. Node_t* curr_node=this->PreorderFirst();
  766. std::vector<MortonId> in;
  767. while(curr_node!=NULL){
  768. if(curr_node->IsLeaf() && !curr_node->IsGhost()){
  769. in.push_back(curr_node->GetMortonId());
  770. }
  771. curr_node=this->PreorderNxt(curr_node);
  772. }
  773. //2:1 balance
  774. Profile::Tic("ot::balanceOctree",Comm(),true,3);
  775. std::vector<MortonId> out;
  776. balanceOctree(in, out, this->Dim(), this->max_depth, (bndry==Periodic), *Comm());
  777. Profile::Toc();
  778. //Get new_mins.
  779. std::vector<MortonId> new_mins(num_proc);
  780. MPI_Allgather(&out[0] , 1, par::Mpi_datatype<MortonId>::value(),
  781. &new_mins[0], 1, par::Mpi_datatype<MortonId>::value(), *Comm());
  782. // Refine to new_mins in my range of octants
  783. // or else RedistNodes(...) will not work correctly.
  784. {
  785. int i=0;
  786. std::vector<MortonId> mins=GetMins();
  787. while(new_mins[i]<mins[myrank] && i<num_proc) i++; //TODO: Use binary search.
  788. for(;i<num_proc;i++){
  789. Node_t* n=FindNode(new_mins[i], true);
  790. if(n->IsGhost()) break;
  791. else assert(n->GetMortonId()==new_mins[i]);
  792. }
  793. }
  794. //Redist nodes using new_mins.
  795. Profile::Tic("RedistNodes",Comm(),true,3);
  796. RedistNodes(&out[0]);
  797. #ifndef NDEBUG
  798. std::vector<MortonId> mins=GetMins();
  799. assert(mins[myrank].getDFD()==out[0].getDFD());
  800. #endif
  801. Profile::Toc();
  802. //Now subdivide the current tree as necessary to make it balanced.
  803. Profile::Tic("LocalSubdivide",Comm(),false,3);
  804. int omp_p=omp_get_max_threads();
  805. for(int i=0;i<omp_p;i++){
  806. size_t a=(out.size()*i)/omp_p;
  807. Node_t* n=FindNode(out[a], true);
  808. assert(n->GetMortonId()==out[a]);
  809. UNUSED(n);
  810. }
  811. #pragma omp parallel for
  812. for(int i=0;i<omp_p;i++){
  813. size_t a=(out.size()* i )/omp_p;
  814. size_t b=(out.size()*(i+1))/omp_p;
  815. MortonId dn;
  816. size_t node_iter=a;
  817. Node_t* n=FindNode(out[node_iter], false);
  818. while(n!=NULL && node_iter<b){
  819. n->SetGhost(false);
  820. dn=n->GetMortonId();
  821. if(dn.isAncestor(out[node_iter]) && dn!=out[node_iter]){
  822. if(n->IsLeaf()) n->Subdivide();
  823. }else if(dn==out[node_iter]){
  824. assert(n->IsLeaf());
  825. //if(!n->IsLeaf()){ //This should never happen
  826. // n->Truncate();
  827. // n->SetGhost(false);
  828. //}
  829. assert(n->IsLeaf());
  830. node_iter++;
  831. }else{
  832. n->Truncate(); //This node does not belong to this process.
  833. n->SetGhost(true);
  834. }
  835. n=this->PreorderNxt(n);
  836. }
  837. if(i==omp_p-1){
  838. while(n!=NULL){
  839. n->Truncate();
  840. n->SetGhost(true);
  841. n=this->PreorderNxt(n);
  842. }
  843. }
  844. }
  845. Profile::Toc();
  846. }
/**
 * \brief Sequential (single-process) 2:1 balancing of the local tree.
 *
 * Buckets nodes by depth, then sweeps from the finest level upward: for every
 * non-leaf whose colleague (same-level neighbor) is missing, it descends from
 * the root toward that neighbor's position, subdividing leaves along the way
 * so the neighbor comes to exist at the required level.
 *
 * \param bndry Periodic wraps neighbor coordinates into [0,1).
 */
template <class TreeNode>
void MPI_Tree<TreeNode>::Balance21_local(BoundaryType bndry){
  //SetColleagues(bndry);
  // node_lst[d] = all current nodes at depth d (grown as we subdivide).
  std::vector<std::vector<Node_t*> > node_lst(this->max_depth+1);
  Node_t* curr_node=this->PreorderFirst();
  while(curr_node!=NULL){
    node_lst[curr_node->Depth()].push_back(curr_node);
    curr_node=this->PreorderNxt(curr_node);
  }
  int n1=pow(3.0,this->Dim()); // number of colleagues (3^dim)
  int n2=pow(2.0,this->Dim()); // number of children   (2^dim)
  for(int i=this->max_depth;i>0;i--){
    Real_t s=pow(0.5,i); // octant edge length at depth i
    for(size_t j=0;j<node_lst[i].size();j++){
      curr_node=node_lst[i][j];
      Real_t* coord=curr_node->Coord();
      if(!curr_node->IsLeaf()) for(int k=0;k<n1;k++){
        if(curr_node->Colleague(k)==NULL){
          // Center of the missing colleague: decode k as a base-3 offset
          // (-1,0,+1) per axis relative to this octant's corner.
          Real_t c0[3]={coord[0]+((k/1)%3-1)*s+s*0.5,
                        coord[1]+((k/3)%3-1)*s+s*0.5,
                        coord[2]+((k/9)%3-1)*s+s*0.5};
          if(bndry==Periodic){
            c0[0]=c0[0]-floor(c0[0]);
            c0[1]=c0[1]-floor(c0[1]);
            c0[2]=c0[2]-floor(c0[2]);
          }
          // Only chase neighbors that fall inside the unit domain.
          if(c0[0]>0 && c0[0]<1)
          if(c0[1]>0 && c0[1]<1)
          if(c0[2]>0 && c0[2]<1){
            // Descend from the root to depth i, subdividing leaves so the
            // missing colleague gets created.
            Node_t* node=this->RootNode();
            while(node->Depth()<i){
              if(node->IsLeaf()){
                node->Subdivide();
                for(int l=0;l<n2;l++){
                  // Newly created children must also be processed at their
                  // own level in a later (coarser) iteration.
                  node_lst[node->Depth()+1].push_back((Node_t*)node->Child(l));
                  /*
                  SetColleagues(bndry,(Node_t*)node->Child(l));
                  for(int i_=0;i_<n1;i_++){
                    Node_t* coll=(Node_t*)((Node_t*)node->Child(l))->Colleague(i_);
                    if(coll!=NULL) SetColleagues(bndry,coll);
                  }// */
                }
              }
              // Pick the child octant containing the target point c0.
              Real_t s1=pow(0.5,node->Depth()+1);
              Real_t* c1=node->Coord();
              int c_id=((c0[0]-c1[0])>s1?1:0)+
                       ((c0[1]-c1[1])>s1?2:0)+
                       ((c0[2]-c1[2])>s1?4:0);
              node=(Node_t*)node->Child(c_id);
              /*if(node->Depth()==i){
                c1=node->Coord();
                std::cout<<(c0[0]-c1[0])-s1/2<<' '
                std::cout<<(c0[1]-c1[1])-s1/2<<' '
                std::cout<<(c0[2]-c1[2])-s1/2<<'\n';
              }// */
            }
          }
        }
      }
    }
  }
}
/**
 * \brief Set the colleague (same-depth adjacent neighbor) pointers of a node,
 * or of the entire tree.
 *
 * \param bndry Periodic treats the root as its own colleague in every
 *              direction; otherwise only the center slot points to itself.
 * \param node  If NULL, initializes the root and then processes all nodes
 *              level by level (parents before children, each level in
 *              parallel). Otherwise sets colleagues for this single node by
 *              scanning the children of its parent's colleagues.
 */
template <class TreeNode>
void MPI_Tree<TreeNode>::SetColleagues(BoundaryType bndry, Node_t* node){
  int n1=(int)pow(3.0,this->Dim()); // colleague count (3^dim)
  int n2=(int)pow(2.0,this->Dim()); // child count     (2^dim)
  if(node==NULL){
    // Whole-tree pass: seed the root's colleagues, then do each depth level
    // in parallel (a node's parent must already have its colleagues set).
    Node_t* curr_node=this->PreorderFirst();
    if(curr_node!=NULL){
      if(bndry==Periodic){
        for(int i=0;i<n1;i++)
          curr_node->SetColleague(curr_node,i);
      }else{
        curr_node->SetColleague(curr_node,(n1-1)/2);
      }
      curr_node=this->PreorderNxt(curr_node);
    }
    Vector<std::vector<Node_t*> > nodes(MAX_DEPTH);
    while(curr_node!=NULL){
      nodes[curr_node->Depth()].push_back(curr_node);
      curr_node=this->PreorderNxt(curr_node);
    }
    for(size_t i=0;i<MAX_DEPTH;i++){
      size_t j0=nodes[i].size();
      Node_t** nodes_=&nodes[i][0];
      #pragma omp parallel for
      for(size_t j=0;j<j0;j++){
        SetColleagues(bndry, nodes_[j]);
      }
    }
  }else{
    /* //This is slower
    Node_t* root_node=this->RootNode();
    int d=node->Depth();
    Real_t* c0=node->Coord();
    Real_t s=pow(0.5,d);
    Real_t c[COORD_DIM];
    int idx=0;
    for(int i=-1;i<=1;i++)
    for(int j=-1;j<=1;j++)
    for(int k=-1;k<=1;k++){
      c[0]=c0[0]+s*0.5+s*k;
      c[1]=c0[1]+s*0.5+s*j;
      c[2]=c0[2]+s*0.5+s*i;
      if(bndry==Periodic){
        if(c[0]<0.0) c[0]+=1.0;
        if(c[0]>1.0) c[0]-=1.0;
        if(c[1]<1.0) c[1]+=1.0;
        if(c[1]>1.0) c[1]-=1.0;
        if(c[2]<1.0) c[2]+=1.0;
        if(c[2]>1.0) c[2]-=1.0;
      }
      node->SetColleague(NULL,idx);
      if(c[0]<1.0 && c[0]>0.0)
      if(c[1]<1.0 && c[1]>0.0)
      if(c[2]<1.0 && c[2]>0.0){
        MortonId m(c,d);
        Node_t* nbr=FindNode(m,false,root_node);
        while(nbr->Depth()>d) nbr=(Node_t*)nbr->Parent();
        if(nbr->Depth()==d) node->SetColleague(nbr,idx);
      }
      idx++;
    }
    /*/
    // Active path: derive this node's colleagues from the children of its
    // parent's colleagues (the parent has been processed already).
    Node_t* parent_node;
    Node_t* tmp_node1;
    Node_t* tmp_node2;
    for(int i=0;i<n1;i++)node->SetColleague(NULL,i);
    parent_node=(Node_t*)node->Parent();
    if(parent_node==NULL) return; // root: colleagues already seeded above
    int l=node->Path2Node(); // this node's child index within its parent
    for(int i=0;i<n1;i++){ //For each coll of the parent
      tmp_node1=(Node_t*)parent_node->Colleague(i);
      if(tmp_node1!=NULL)
      if(!tmp_node1->IsLeaf()){
        for(int j=0;j<n2;j++){ //For each child
          tmp_node2=(Node_t*)tmp_node1->Child(j);
          if(tmp_node2!=NULL){
            bool flag=true;
            int a=1,b=1,new_indx=0;
            // Per dimension: offset of candidate child j (inside parent's
            // colleague i) relative to child l; any offset outside {-1,0,1}
            // means it is not adjacent.
            for(int k=0;k<this->Dim();k++){
              int indx_diff=(((i/b)%3)-1)*2+((j/a)%2)-((l/a)%2);
              if(-1>indx_diff || indx_diff>1) flag=false;
              new_indx+=(indx_diff+1)*b;
              a*=2;b*=3;
            }
            if(flag){
              node->SetColleague(tmp_node2,new_indx);
            }
          }
        }
      }
    }// */
  }
}
  1002. template <class TreeNode>
  1003. bool MPI_Tree<TreeNode>::CheckTree(){
  1004. int myrank,np;
  1005. MPI_Comm_rank(*Comm(),&myrank);
  1006. MPI_Comm_size(*Comm(),&np);
  1007. std::vector<MortonId> mins=GetMins();
  1008. std::stringstream st;
  1009. st<<"PID_"<<myrank<<" : ";
  1010. std::string str;
  1011. Node_t* n=this->PostorderFirst();
  1012. while(n!=NULL){
  1013. if(myrank<np-1) if(n->GetMortonId().getDFD()>=mins[myrank+1])break;
  1014. if(n->GetMortonId()>=mins[myrank] && n->IsLeaf() && n->IsGhost()){
  1015. std::cout<<n->GetMortonId()<<'\n';
  1016. std::cout<<mins[myrank]<<'\n';
  1017. if(myrank+1<np) std::cout<<mins[myrank+1]<<'\n';
  1018. std::cout<<myrank<<'\n';
  1019. assert(false);
  1020. }
  1021. if(n->GetMortonId()<mins[myrank] && n->IsLeaf() && !n->IsGhost()){
  1022. assert(false);
  1023. }
  1024. if(!n->IsGhost() && n->Depth()>0)
  1025. assert(!((Node_t*)n->Parent())->IsGhost());
  1026. n=this->PostorderNxt(n);
  1027. }
  1028. while(n!=NULL){
  1029. if(n->IsLeaf() && !n->IsGhost()){
  1030. st<<"non-ghost leaf node "<<n->GetMortonId()<<"; after last node.";
  1031. str=st.str(); ASSERT_WITH_MSG(false,str.c_str());
  1032. }
  1033. n=this->PostorderNxt(n);
  1034. }
  1035. return true;
  1036. };
  1037. /**
  1038. * \brief Determines if node is used in the region between Morton Ids m1 and m2
  1039. * ( m1 <= m2 ).
  1040. */
  1041. template <class TreeNode>
  1042. void IsShared(std::vector<TreeNode*>& nodes, MortonId* m1, MortonId* m2, BoundaryType bndry, std::vector<char>& shared_flag){
  1043. MortonId mm1, mm2;
  1044. if(m1!=NULL) mm1=m1->getDFD();
  1045. if(m2!=NULL) mm2=m2->getDFD();
  1046. shared_flag.resize(nodes.size());
  1047. int omp_p=omp_get_max_threads();
  1048. #pragma omp parallel for
  1049. for(int j=0;j<omp_p;j++){
  1050. size_t a=((j )*nodes.size())/omp_p;
  1051. size_t b=((j+1)*nodes.size())/omp_p;
  1052. std::vector<MortonId> nbr_lst;
  1053. for(size_t i=a;i<b;i++){
  1054. shared_flag[i]=false;
  1055. TreeNode* node=nodes[i];
  1056. assert(node!=NULL);
  1057. if(node->Depth()<2){
  1058. shared_flag[i]=true;
  1059. continue;
  1060. }
  1061. node->GetMortonId().NbrList(nbr_lst, node->Depth()-1, bndry==Periodic);
  1062. for(size_t k=0;k<nbr_lst.size();k++){
  1063. MortonId n1=nbr_lst[k] .getDFD();
  1064. MortonId n2=nbr_lst[k].NextId().getDFD();
  1065. if(m1==NULL || n2>mm1)
  1066. if(m2==NULL || n1<mm2){
  1067. shared_flag[i]=true;
  1068. break;
  1069. }
  1070. }
  1071. }
  1072. }
  1073. }
  1074. inline void IsShared(std::vector<PackedData>& nodes, MortonId* m1, MortonId* m2, BoundaryType bndry, std::vector<char>& shared_flag){
  1075. MortonId mm1, mm2;
  1076. if(m1!=NULL) mm1=m1->getDFD();
  1077. if(m2!=NULL) mm2=m2->getDFD();
  1078. shared_flag.resize(nodes.size());
  1079. int omp_p=omp_get_max_threads();
  1080. #pragma omp parallel for
  1081. for(int j=0;j<omp_p;j++){
  1082. size_t a=((j )*nodes.size())/omp_p;
  1083. size_t b=((j+1)*nodes.size())/omp_p;
  1084. std::vector<MortonId> nbr_lst;
  1085. for(size_t i=a;i<b;i++){
  1086. shared_flag[i]=false;
  1087. MortonId* node=(MortonId*)nodes[i].data;
  1088. assert(node!=NULL);
  1089. if(node->GetDepth()<2){
  1090. shared_flag[i]=true;
  1091. continue;
  1092. }
  1093. node->NbrList(nbr_lst, node->GetDepth()-1, bndry==Periodic);
  1094. for(size_t k=0;k<nbr_lst.size();k++){
  1095. MortonId n1=nbr_lst[k] .getDFD();
  1096. MortonId n2=nbr_lst[k].NextId().getDFD();
  1097. if(m1==NULL || n2>mm1)
  1098. if(m2==NULL || n1<mm2){
  1099. shared_flag[i]=true;
  1100. break;
  1101. }
  1102. }
  1103. }
  1104. }
  1105. }
  1106. /**
  1107. * \brief Construct Locally Essential Tree by exchanging Ghost octants.
  1108. */
/**
 * \brief Build the Locally Essential Tree: run the hypercube exchange first,
 * then the sparse point-to-point exchange, each under its own profiler scope.
 * In debug builds the resulting tree is validated with CheckTree().
 */
template <class TreeNode>
void MPI_Tree<TreeNode>::ConstructLET(BoundaryType bndry){
  Profile::Tic("LET_Hypercube", &comm, true, 5);
  ConstructLET_Hypercube(bndry);
  Profile::Toc();
  Profile::Tic("LET_Sparse", &comm, true, 5);
  ConstructLET_Sparse(bndry);
  Profile::Toc();
  #ifndef NDEBUG
  CheckTree();
  #endif
}
  1121. /**
  1122. * \brief Hypercube based scheme to exchange Ghost octants.
  1123. */
  1124. //#define PREFETCH_T0(addr,nrOfBytesAhead) _mm_prefetch(((char *)(addr))+nrOfBytesAhead,_MM_HINT_T0)
/**
 * \brief Exchange ghost octants using a hypercube (recursive-halving)
 * communication scheme.
 *
 * Phases: (1) collect the local octants that are "shared" (needed by a
 * process outside this one's Morton range), (2) pack each with its MortonId
 * prepended, (3) log2(p) rounds of pairwise MPI_Sendrecv, each round halving
 * the process range and forwarding only the data still relevant to the other
 * half, (4) insert/unpack the received octants into the local tree as ghosts.
 *
 * NOTE(review): uses large function-static buffers (shrd/send/recv), so this
 * is not reentrant and assumes single concurrent use per process.
 */
template <class TreeNode>
void MPI_Tree<TreeNode>::ConstructLET_Hypercube(BoundaryType bndry){
  int num_p,rank;
  MPI_Comm_size(*Comm(),&num_p);
  MPI_Comm_rank(*Comm(),&rank );
  if(num_p==1) return; // nothing to exchange
  int omp_p=omp_get_max_threads();
  std::vector<MortonId> mins=GetMins();
  // Build list of shared nodes.
  std::vector<Node_t*> shared_nodes; shared_nodes.clear();
  std::vector<Node_t*> node_lst; node_lst.clear();
  // Skip ghosts before this process's range, then gather the contiguous run
  // of owned (non-ghost) nodes.
  Node_t* curr_node=this->PreorderFirst();
  while(curr_node!=NULL){
    if(curr_node->GetMortonId().getDFD()>=mins[rank]) break;
    curr_node=this->PreorderNxt(curr_node);
  }
  while(curr_node!=NULL){
    if(curr_node->IsGhost()) break;
    node_lst.push_back(curr_node);
    curr_node=this->PreorderNxt(curr_node);
  }
  // flag0: shared with a lower-rank region; flag1: shared with a higher one.
  std::vector<char> node_flag0; node_flag0.clear();
  std::vector<char> node_flag1; node_flag1.clear();
  IsShared(node_lst,&mins[0],&mins[rank],bndry,node_flag0);
  if(rank<num_p-1) IsShared(node_lst,&mins[rank+1],NULL,bndry,node_flag1);
  for(size_t i=0;i<node_lst.size();i++){
    if(node_flag0[i] || (rank<num_p-1 && node_flag1[i]))
      shared_nodes.push_back(node_lst[i]);
  }
  //std::cout<<"Shared = "<<shared_nodes.size()<<'\n';
  // Pack shared nodes.
  // Static scratch buffers (fixed capacity; overflow only asserts -- see
  // TODOs below). vec0: locally packed data; vec1: slots for received data.
  static std::vector<char> shrd_buff_vec0(omp_p*64l*1024l*1024l);
  static std::vector<char> shrd_buff_vec1(omp_p*128l*1024l*1024l);
  static std::vector<char> send_buff_vec(omp_p*64l*1024l*1024l); char* send_buff;
  static std::vector<char> recv_buff_vec(omp_p*64l*1024l*1024l); char* recv_buff;
  std::vector<PackedData> shrd_data;
  size_t max_data_size=0;
  {
    // Pack each shared node with its MortonId stored in the first
    // sizeof(MortonId) bytes; track the global max packed size (used to
    // size the fixed-width receive slots in shrd_buff_vec1).
    long max_data_size_lcl=0;
    long max_data_size_glb=0;
    char* data_ptr=&shrd_buff_vec0[0];
    for(size_t i=0;i<shared_nodes.size();i++){
      PackedData p=shared_nodes[i]->Pack(true,data_ptr,sizeof(MortonId));
      ((MortonId*)data_ptr)[0]=shared_nodes[i]->GetMortonId();
      p.length+=sizeof(MortonId);
      shrd_data.push_back(p);
      data_ptr+=p.length;
      if(max_data_size_lcl<(long)p.length) max_data_size_lcl=p.length;
      assert(data_ptr<=&(*shrd_buff_vec0.end())); //TODO: resize if needed.
    }
    MPI_Allreduce(&max_data_size_lcl, &max_data_size_glb, 1, MPI_LONG, MPI_MAX, *Comm());
    max_data_size=max_data_size_glb;
  }
  // Memory slots for storing node data.
  // mem_set holds freed slots in shrd_buff_vec1 for reuse.
  std::set<void*> mem_set;
  size_t mem_set_size=0;
  size_t range[2]={0,(size_t)num_p-1};
  // Hypercube rounds: split [range0,range1] at split_p; exchange with the
  // mirrored partner in the other half, then recurse into own half.
  while(range[1]-range[0]>0){
    size_t split_p=(range[0]+range[1])/2;
    size_t new_range[2]={(size_t)rank<=split_p?range[0]:split_p+1,(size_t)rank<=split_p?split_p:range[1]};
    size_t com_range[2]={(size_t)rank> split_p?range[0]:split_p+1,(size_t)rank> split_p?split_p:range[1]};
    size_t partner=rank-new_range[0]+com_range[0];
    if(partner>range[1]) partner--;
    // For odd-sized ranges the last rank additionally pairs with split_p.
    bool extra_partner=((size_t)rank==range[1] && ((range[1]-range[0])%2)==0);
    int send_length=0;
    std::vector<PackedData> shrd_data_new;
    // flag0: needed by the partner's half (send); flag1: still needed by our
    // own half in later rounds (keep).
    IsShared(shrd_data, &mins[com_range[0]], (com_range[1]==(size_t)num_p-1?NULL:&mins[com_range[1]+1]),bndry, node_flag0);
    IsShared(shrd_data, &mins[new_range[0]], (new_range[1]==(size_t)num_p-1?NULL:&mins[new_range[1]+1]),bndry, node_flag1);
    {
      // Stage outgoing copies (length-prefixed records) and recycle slots of
      // data nobody in our half needs anymore; bulk-copy in parallel below.
      std::vector<void*> srctrg_ptr;
      std::vector<size_t> mem_size;
      for(size_t i=0;i<shrd_data.size();i++){
        PackedData& p=shrd_data[i];
        if( node_flag0[i]){ // Copy data to send buffer.
          char* data_ptr=(char*)&send_buff_vec[send_length];
          ((size_t*)data_ptr)[0]=p.length; data_ptr+=sizeof(size_t);
          //mem::memcopy(data_ptr,p.data,p.length);
          mem_size.push_back(p.length);
          srctrg_ptr.push_back(p.data);
          srctrg_ptr.push_back(data_ptr);
          send_length+=p.length+sizeof(size_t);
          assert((size_t)send_length<=send_buff_vec.size()); //TODO: resize if needed.
        }
        if(!node_flag1[i]){ // Free memory slot.
          //assert(node_flag0[0]);
          if(p.data>=&shrd_buff_vec1[0] && p.data<&shrd_buff_vec1[0]+shrd_buff_vec1.size())
            mem_set.insert(p.data);
        } else shrd_data_new.push_back(p);
      }
      shrd_data=shrd_data_new;
      #pragma omp parallel for
      for(int k=0;k<omp_p;k++){
        size_t i0=((k+0)*mem_size.size())/omp_p;
        size_t i1=((k+1)*mem_size.size())/omp_p;
        for(size_t i=i0;i<i1;i++){
          mem::memcopy(srctrg_ptr[2*i+1],srctrg_ptr[2*i+0],mem_size[i]);
        }
      }
    }
    //Exchange send size.
    int recv_length=0;
    int extra_recv_length=0;
    int extra_send_length=0; // extra partner only ever receives from us
    MPI_Status status;
    MPI_Sendrecv                  (& send_length,1,MPI_INT,partner,0,      &recv_length,1,MPI_INT,partner,0,*Comm(),&status);
    if(extra_partner) MPI_Sendrecv(&extra_send_length,1,MPI_INT,split_p,0,&extra_recv_length,1,MPI_INT,split_p,0,*Comm(),&status);
    //SendRecv data.
    assert((size_t)send_length                  <=send_buff_vec.size()); send_buff=&send_buff_vec[0];
    assert((size_t)recv_length+extra_recv_length<=recv_buff_vec.size()); recv_buff=&recv_buff_vec[0];
    MPI_Sendrecv                  (send_buff,send_length,MPI_BYTE,partner,0,            recv_buff       ,      recv_length,MPI_BYTE,partner,0,*Comm(),&status);
    if(extra_partner) MPI_Sendrecv(     NULL,          0,MPI_BYTE,split_p,0,&recv_buff[recv_length],extra_recv_length,MPI_BYTE,split_p,0,*Comm(),&status);
    //Get nodes from received data.
    {
      // Walk the length-prefixed records, assign each a fixed-width slot in
      // shrd_buff_vec1 (reusing freed slots first), then parallel bulk-copy.
      std::vector<void*> srctrg_ptr;
      std::vector<size_t> mem_size;
      int buff_length=0;
      while(buff_length<recv_length+extra_recv_length){
        PackedData p0,p1;
        p0.length=((size_t*)&recv_buff_vec[buff_length])[0];
        p0.data=(char*)&recv_buff_vec[buff_length]+sizeof(size_t);
        buff_length+=p0.length+sizeof(size_t);
        p1.length=p0.length;
        if(mem_set.size()==0){
          assert(mem_set_size*max_data_size<shrd_buff_vec1.size());
          p1.data=&shrd_buff_vec1[mem_set_size*max_data_size];
          mem_set_size++;
        }else{
          p1.data=*mem_set.begin();
          mem_set.erase(mem_set.begin());
        }
        //mem::memcopy(p1.data,p0.data,p0.length);
        mem_size.push_back(p0.length);
        srctrg_ptr.push_back(p0.data);
        srctrg_ptr.push_back(p1.data);
        shrd_data.push_back(p1);
      }
      #pragma omp parallel for
      for(int k=0;k<omp_p;k++){
        size_t i0=((k+0)*mem_size.size())/omp_p;
        size_t i1=((k+1)*mem_size.size())/omp_p;
        for(size_t i=i0;i<i1;i++){
          mem::memcopy(srctrg_ptr[2*i+1],srctrg_ptr[2*i+0],mem_size[i]);
        }
      }
    }
    range[0]=new_range[0];
    range[1]=new_range[1];
  }
  //Add shared_nodes to the tree.
  //std::cout<<"Number of Ghost Nodes = "<<shrd_data.size()<<'\n';
  int nchld=(1UL<<this->Dim()); // Number of children.
  std::vector<Node_t*> shrd_nodes(shrd_data.size());
  for(size_t i=0;i<shrd_data.size();i++){ // Find shared nodes.
    // Descend from the root toward the received MortonId, subdividing
    // (as ghost) wherever the path does not yet exist. Serial: mutates tree.
    MortonId& mid=*(MortonId*)shrd_data[i].data;
    Node_t* srch_node=this->RootNode();
    while(srch_node->GetMortonId()!=mid){
      Node_t* ch_node;
      if(srch_node->IsLeaf()){
        srch_node->SetGhost(true);
        srch_node->Subdivide();
      }
      // Pick the last child whose MortonId does not exceed the target.
      for(int j=nchld-1;j>=0;j--){
        ch_node=(Node_t*)srch_node->Child(j);
        if(ch_node->GetMortonId()<=mid){
          srch_node=ch_node;
          break;
        }
      }
    }
    shrd_nodes[i]=srch_node;
  }
  #pragma omp parallel for
  for(size_t i=0;i<shrd_data.size();i++){
    if(shrd_nodes[i]->IsGhost()) { // Initialize ghost node.
      PackedData p=shrd_data[i];
      p.data=((char*)p.data)+sizeof(MortonId); // skip the MortonId header
      p.length-=sizeof(MortonId);
      shrd_nodes[i]->Unpack(p);
    }
  }
  //Now LET is complete.
}
  1307. /**
  1308. * \brief Sparse communication scheme to exchange Ghost octants.
  1309. */
  1310. template <class TreeNode>
  1311. void MPI_Tree<TreeNode>::ConstructLET_Sparse(BoundaryType bndry){
  1312. typedef int MPI_size_t;
  1313. struct CommData{
  1314. MortonId mid;
  1315. TreeNode* node;
  1316. size_t pkd_length;
  1317. size_t usr_cnt;
  1318. MortonId usr_mid[COLLEAGUE_COUNT];
  1319. size_t usr_pid[COLLEAGUE_COUNT];
  1320. };
  1321. int num_p,rank;
  1322. MPI_Comm_size(*Comm(),&num_p);
  1323. MPI_Comm_rank(*Comm(),&rank );
  1324. if(num_p==1) return;
  1325. int omp_p=omp_get_max_threads();
  1326. std::vector<MortonId> mins=GetMins();
  1327. // Allocate Memory.
  1328. #define BUFFER_SIZE 1024
  1329. static mem::MemoryManager memgr(BUFFER_SIZE*1024l*1024l);
  1330. static std::vector<char> shrd_buff_vec1(16*64l*1024l*1024l); // TODO: Build memory manager for such allocations.
  1331. static std::vector<char> send_buff;
  1332. static std::vector<char> recv_buff;
  1333. //Profile::Tic("SharedNodes", &comm, false, 5);
  1334. CommData* node_comm_data=NULL; // CommData for all nodes.
  1335. std::vector<void*> shared_data; // CommData for shared nodes.
  1336. std::vector<par::SortPair<size_t,size_t> > pid_node_pair; // <pid, shared_data index> list
  1337. { // Set node_comm_data
  1338. MortonId mins_r0=mins[ rank+0 ].getDFD();
  1339. MortonId mins_r1=mins[std::min(rank+1,num_p-1)].getDFD();
  1340. std::vector<TreeNode*> nodes=this->GetNodeList();
  1341. node_comm_data=(CommData*)memgr.malloc(sizeof(CommData)*nodes.size());
  1342. #pragma omp parallel for
  1343. for(size_t tid=0;tid<omp_p;tid++){
  1344. std::vector<MortonId> nbr_lst;
  1345. size_t a=(nodes.size()* tid )/omp_p;
  1346. size_t b=(nodes.size()*(tid+1))/omp_p;
  1347. for(size_t i=a;i<b;i++){
  1348. bool shared=false;
  1349. CommData& comm_data=node_comm_data[i];
  1350. comm_data.node=nodes[i];
  1351. comm_data.mid=comm_data.node->GetMortonId();
  1352. comm_data.usr_cnt=0;
  1353. if(comm_data.node->IsGhost()) continue;
  1354. if(comm_data.node->Depth()==0) continue;
  1355. if(comm_data.mid.getDFD()<mins_r0) continue;
  1356. MortonId mid0=comm_data.mid. getDFD();
  1357. MortonId mid1=comm_data.mid.NextId().getDFD();
  1358. comm_data.mid.NbrList(nbr_lst,comm_data.node->Depth()-1, bndry==Periodic);
  1359. comm_data.usr_cnt=nbr_lst.size();
  1360. for(size_t j=0;j<nbr_lst.size();j++){
  1361. MortonId usr_mid=nbr_lst[j];
  1362. MortonId usr_mid_dfd=usr_mid.getDFD();
  1363. comm_data.usr_mid[j]=usr_mid;
  1364. comm_data.usr_pid[j]=std::upper_bound(&mins[0],&mins[num_p],usr_mid_dfd)-&mins[0]-1;
  1365. // if(usr_mid_dfd<mins_r0 || (rank+1<num_p && usr_mid_dfd>=mins_r1)){ // Find the user pid.
  1366. // size_t usr_pid=std::upper_bound(&mins[0],&mins[num_p],usr_mid_dfd)-&mins[0]-1;
  1367. // comm_data.usr_pid[j]=usr_pid;
  1368. // }else comm_data.usr_pid[j]=rank;
  1369. if(!shared){ // Check if this node needs to be transferred during broadcast.
  1370. if(comm_data.usr_pid[j]!=rank || (rank+1<num_p && usr_mid.NextId()>mins_r1) ){
  1371. shared=true;
  1372. }
  1373. }
  1374. }
  1375. #pragma omp critical (ADD_SHARED)
  1376. if(shared){
  1377. for(size_t j=0;j<comm_data.usr_cnt;j++)
  1378. if(comm_data.usr_pid[j]!=rank){
  1379. bool unique_pid=true;
  1380. for(size_t k=0;k<j;k++){
  1381. if(comm_data.usr_pid[j]==comm_data.usr_pid[k]){
  1382. unique_pid=false;
  1383. break;
  1384. }
  1385. }
  1386. if(unique_pid){
  1387. par::SortPair<size_t,size_t> p;
  1388. p.key=comm_data.usr_pid[j];
  1389. p.data=shared_data.size();
  1390. pid_node_pair.push_back(p);
  1391. }
  1392. }
  1393. shared_data.push_back(&comm_data);
  1394. }
  1395. }
  1396. }
  1397. omp_par::merge_sort(&pid_node_pair[0], &pid_node_pair[pid_node_pair.size()]);
  1398. //std::cout<<rank<<' '<<shared_data.size()<<' '<<pid_node_pair.size()<<'\n';
  1399. }
  1400. //Profile::Toc();
  1401. //Profile::Tic("PackNodes", &comm, false, 5);
  1402. { // Pack shared nodes.
  1403. #pragma omp parallel for
  1404. for(size_t tid=0;tid<omp_p;tid++){
  1405. size_t buff_length=10l*1024l*1024l; // 1MB buffer per thread.
  1406. char* buff=(char*)memgr.malloc(buff_length);
  1407. size_t a=( tid *shared_data.size())/omp_p;
  1408. size_t b=((tid+1)*shared_data.size())/omp_p;
  1409. for(size_t i=a;i<b;i++){
  1410. CommData& comm_data=*(CommData*)shared_data[i];
  1411. PackedData p0=comm_data.node->Pack(true,buff);
  1412. assert(p0.length<buff_length);
  1413. shared_data[i]=memgr.malloc(sizeof(CommData)+p0.length);
  1414. CommData& new_comm_data=*(CommData*)shared_data[i];
  1415. new_comm_data=comm_data;
  1416. new_comm_data.pkd_length=sizeof(CommData)+p0.length;
  1417. mem::memcopy(((char*)shared_data[i])+sizeof(CommData),buff,p0.length);
  1418. }
  1419. memgr.free(buff);
  1420. }
  1421. // now CommData is stored in shared_data
  1422. memgr.free(node_comm_data);
  1423. node_comm_data=NULL;
  1424. }
  1425. //Profile::Toc();
  1426. //Profile::Tic("SendBuff", &comm, false, 5);
  1427. std::vector<MPI_size_t> send_size(num_p,0);
  1428. std::vector<MPI_size_t> send_disp(num_p,0);
  1429. if(pid_node_pair.size()){ // Build send_buff.
  1430. std::vector<size_t> size(pid_node_pair.size(),0);
  1431. std::vector<size_t> disp(pid_node_pair.size(),0);
  1432. #pragma omp parallel for
  1433. for(size_t i=0;i<pid_node_pair.size();i++){
  1434. size[i]=((CommData*)shared_data[pid_node_pair[i].data])->pkd_length;
  1435. }
  1436. omp_par::scan(&size[0],&disp[0],pid_node_pair.size());
  1437. // Resize send_buff.
  1438. if(send_buff.size()<size[pid_node_pair.size()-1]+disp[pid_node_pair.size()-1]){
  1439. send_buff.resize(size[pid_node_pair.size()-1]+disp[pid_node_pair.size()-1]);
  1440. }
  1441. // Copy data to send_buff.
  1442. #pragma omp parallel for
  1443. for(size_t i=0;i<pid_node_pair.size();i++){
  1444. size_t shrd_idx=pid_node_pair[i].data;
  1445. mem::memcopy(&send_buff[disp[i]], shared_data[shrd_idx], size[i]);
  1446. }
  1447. // Compute send_size, send_disp.
  1448. {
  1449. // Compute send_size.
  1450. #pragma omp parallel for
  1451. for(size_t tid=0;tid<omp_p;tid++){
  1452. size_t a=(pid_node_pair.size()* tid )/omp_p;
  1453. size_t b=(pid_node_pair.size()*(tid+1))/omp_p;
  1454. if(a> 0){
  1455. size_t p0=pid_node_pair[a].key;
  1456. while(a<pid_node_pair.size() && p0==pid_node_pair[a].key) a++;
  1457. }
  1458. if(b<pid_node_pair.size()){
  1459. size_t p1=pid_node_pair[b].key;
  1460. while(b<pid_node_pair.size() && p1==pid_node_pair[b].key) b++;
  1461. }
  1462. for(size_t i=a;i<b;i++){
  1463. send_size[pid_node_pair[i].key]+=size[i];
  1464. }
  1465. }
  1466. // Compute send_disp.
  1467. omp_par::scan(&send_size[0],&send_disp[0],num_p);
  1468. }
  1469. }
  1470. //Profile::Toc();
  1471. //Profile::Tic("A2A_Sparse", &comm, true, 5);
  1472. size_t recv_length=0;
  1473. { // Allocate recv_buff.
  1474. std::vector<MPI_size_t> recv_size(num_p,0);
  1475. std::vector<MPI_size_t> recv_disp(num_p,0);
  1476. MPI_Alltoall(&send_size[0], 1, par::Mpi_datatype<MPI_size_t>::value(),
  1477. &recv_size[0], 1, par::Mpi_datatype<MPI_size_t>::value(), *Comm());
  1478. omp_par::scan(&recv_size[0],&recv_disp[0],num_p);
  1479. recv_length=recv_size[num_p-1]+recv_disp[num_p-1];
  1480. if(recv_buff.size()<recv_length){
  1481. recv_buff.resize(recv_length);
  1482. }
  1483. par::Mpi_Alltoallv_sparse(&send_buff[0], &send_size[0], &send_disp[0],
  1484. &recv_buff[0], &recv_size[0], &recv_disp[0], *Comm());
  1485. }
  1486. //Profile::Toc();
  1487. //Profile::Tic("Unpack", &comm, false, 5);
  1488. std::vector<void*> recv_data; // CommData for received nodes.
  1489. { // Unpack received octants.
  1490. for(size_t i=0; i<recv_length;){
  1491. CommData& comm_data=*(CommData*)&recv_buff[i];
  1492. recv_data.push_back(&comm_data);
  1493. i+=comm_data.pkd_length;
  1494. assert(comm_data.pkd_length>0);
  1495. }
  1496. int nchld=(1UL<<this->Dim()); // Number of children.
  1497. std::vector<Node_t*> recv_nodes(recv_data.size());
  1498. for(size_t i=0;i<recv_data.size();i++){ // Find shared nodes.
  1499. CommData& comm_data=*(CommData*)recv_data[i];
  1500. MortonId& mid=comm_data.mid;
  1501. Node_t* srch_node=this->RootNode();
  1502. while(srch_node->GetMortonId()!=mid){
  1503. Node_t* ch_node;
  1504. if(srch_node->IsLeaf()){
  1505. srch_node->SetGhost(true);
  1506. srch_node->Subdivide();
  1507. }
  1508. for(int j=nchld-1;j>=0;j--){
  1509. ch_node=(Node_t*)srch_node->Child(j);
  1510. if(ch_node->GetMortonId()<=mid){
  1511. srch_node=ch_node;
  1512. break;
  1513. }
  1514. }
  1515. }
  1516. recv_nodes[i]=srch_node;
  1517. }
  1518. #pragma omp parallel for
  1519. for(size_t i=0;i<recv_data.size();i++){
  1520. if(!recv_nodes[i]->IsGhost()) continue;
  1521. assert(recv_nodes[i]->IsGhost());
  1522. CommData& comm_data=*(CommData*)recv_data[i];
  1523. PackedData p;
  1524. p.data=((char*)recv_data[i])+sizeof(CommData);
  1525. p.length=comm_data.pkd_length-sizeof(CommData);
  1526. recv_nodes[i]->Unpack(p);
  1527. }
  1528. }
  1529. //Profile::Toc();
  1530. //Profile::Tic("Broadcast", &comm, true, 5);
  1531. { // Broadcast octants.
  1532. std::vector<MortonId> shrd_mid;
  1533. if(rank+1<num_p){ // Set shrd_mid.
  1534. MortonId m=mins[rank+1];
  1535. while(m.GetDepth()>0 && m.getDFD()>=mins[rank+1]){
  1536. m=m.getAncestor(m.GetDepth()-1);
  1537. }
  1538. size_t d=m.GetDepth()+1;
  1539. shrd_mid.resize(d);
  1540. for(size_t i=0;i<d;i++){
  1541. shrd_mid[i]=m.getAncestor(i);
  1542. }
  1543. }
  1544. std::vector<void*> shrd_data; // CommData for shared nodes.
  1545. char* data_ptr=&shrd_buff_vec1[0];
  1546. { // Copy data to shrd_buff_vec1 and Set shrd_data
  1547. for(size_t i=0;i<shared_data.size();i++){
  1548. CommData& comm_data=*(CommData*)shared_data[i];
  1549. assert(comm_data.mid.GetDepth()>0);
  1550. size_t d=comm_data.mid.GetDepth()-1;
  1551. if(d<shrd_mid.size() && shrd_mid[d].getDFD()>=mins[rank])
  1552. for(size_t j=0;j<comm_data.usr_cnt;j++){
  1553. if(comm_data.usr_mid[j]==shrd_mid[d]){
  1554. assert(data_ptr+comm_data.pkd_length<=&(*shrd_buff_vec1.end())); //TODO: resize if needed.
  1555. mem::memcopy(data_ptr, &comm_data, comm_data.pkd_length);
  1556. shrd_data.push_back(data_ptr);
  1557. data_ptr+=comm_data.pkd_length;
  1558. break;
  1559. }
  1560. }
  1561. }
  1562. for(size_t i=0;i<recv_data.size();i++){
  1563. CommData& comm_data=*(CommData*)recv_data[i];
  1564. assert(comm_data.mid.GetDepth()>0);
  1565. size_t d=comm_data.mid.GetDepth()-1;
  1566. if(d<shrd_mid.size() && shrd_mid[d].getDFD()>=mins[rank])
  1567. for(size_t j=0;j<comm_data.usr_cnt;j++){
  1568. if(comm_data.usr_mid[j]==shrd_mid[d]){
  1569. assert(data_ptr+comm_data.pkd_length<=&(*shrd_buff_vec1.end())); //TODO: resize if needed.
  1570. mem::memcopy(data_ptr, &comm_data, comm_data.pkd_length);
  1571. shrd_data.push_back(data_ptr);
  1572. data_ptr+=comm_data.pkd_length;
  1573. break;
  1574. }
  1575. }
  1576. }
  1577. }
  1578. size_t pid_shift=1;
  1579. while(pid_shift<num_p){
  1580. MPI_size_t recv_pid=(rank>=pid_shift?rank-pid_shift:rank);
  1581. MPI_size_t send_pid=(rank+pid_shift<num_p?rank+pid_shift:rank);
  1582. MPI_size_t send_length=0;
  1583. if(send_pid!=rank){ // Send data for send_pid
  1584. std::vector<void*> send_data;
  1585. std::vector<size_t> send_size;
  1586. for(size_t i=0; i<shrd_data.size();i++){
  1587. CommData& comm_data=*(CommData*)shrd_data[i];
  1588. size_t d=comm_data.mid.GetDepth()-1;
  1589. bool shared=(d<shrd_mid.size() && shrd_mid[d].NextId().getDFD()>mins[send_pid].getDFD());
  1590. if(shared) for(size_t j=0;j<comm_data.usr_cnt;j++){ // if send_pid already has this node then skip
  1591. if(comm_data.usr_pid[j]==send_pid){
  1592. shared=false;
  1593. break;
  1594. }
  1595. }
  1596. if(!shared) continue;
  1597. send_data.push_back(&comm_data);
  1598. send_size.push_back(comm_data.pkd_length);
  1599. }
  1600. std::vector<size_t> send_disp(send_data.size(),0);
  1601. omp_par::scan(&send_size[0],&send_disp[0],send_data.size());
  1602. if(send_data.size()>0) send_length=send_size.back()+send_disp.back();
  1603. // Resize send_buff.
  1604. if(send_buff.size()<send_length){
  1605. send_buff.resize(send_length);
  1606. }
  1607. // Copy data to send_buff.
  1608. #pragma omp parallel for
  1609. for(size_t i=0;i<send_data.size();i++){
  1610. CommData& comm_data=*(CommData*)send_data[i];
  1611. mem::memcopy(&send_buff[send_disp[i]], &comm_data, comm_data.pkd_length);
  1612. }
  1613. }
  1614. MPI_size_t recv_length=0;
  1615. { // Send-Recv data
  1616. MPI_Request request;
  1617. MPI_Status status;
  1618. if(recv_pid!=rank) MPI_Irecv(&recv_length, 1, par::Mpi_datatype<MPI_size_t>::value(),recv_pid, 1, *Comm(), &request);
  1619. if(send_pid!=rank) MPI_Send (&send_length, 1, par::Mpi_datatype<MPI_size_t>::value(),send_pid, 1, *Comm());
  1620. if(recv_pid!=rank) MPI_Wait(&request, &status);
  1621. // Resize recv_buff
  1622. if(recv_buff.size()<recv_length){
  1623. recv_buff.resize(recv_length);
  1624. }
  1625. if(recv_length>0) MPI_Irecv(&recv_buff[0], recv_length, par::Mpi_datatype<char>::value(),recv_pid, 1, *Comm(), &request);
  1626. if(send_length>0) MPI_Send (&send_buff[0], send_length, par::Mpi_datatype<char>::value(),send_pid, 1, *Comm());
  1627. if(recv_length>0) MPI_Wait(&request, &status);
  1628. }
  1629. std::vector<void*> recv_data; // CommData for received nodes.
  1630. { // Unpack received octants.
  1631. for(size_t i=0; i<recv_length;){
  1632. CommData& comm_data=*(CommData*)&recv_buff[i];
  1633. recv_data.push_back(&comm_data);
  1634. i+=comm_data.pkd_length;
  1635. assert(comm_data.pkd_length>0);
  1636. }
  1637. int nchld=(1UL<<this->Dim()); // Number of children.
  1638. std::vector<Node_t*> recv_nodes(recv_data.size());
  1639. for(size_t i=0;i<recv_data.size();i++){ // Find received octants in tree.
  1640. CommData& comm_data=*(CommData*)recv_data[i];
  1641. MortonId& mid=comm_data.mid;
  1642. Node_t* srch_node=this->RootNode();
  1643. while(srch_node->GetMortonId()!=mid){
  1644. Node_t* ch_node;
  1645. if(srch_node->IsLeaf()){
  1646. srch_node->SetGhost(true);
  1647. srch_node->Subdivide();
  1648. }
  1649. for(int j=nchld-1;j>=0;j--){
  1650. ch_node=(Node_t*)srch_node->Child(j);
  1651. if(ch_node->GetMortonId()<=mid){
  1652. srch_node=ch_node;
  1653. break;
  1654. }
  1655. }
  1656. }
  1657. recv_nodes[i]=srch_node;
  1658. }
  1659. #pragma omp parallel for
  1660. for(size_t i=0;i<recv_data.size();i++){
  1661. if(!recv_nodes[i]->IsGhost()) continue;
  1662. assert(recv_nodes[i]->IsGhost());
  1663. CommData& comm_data=*(CommData*)recv_data[i];
  1664. PackedData p;
  1665. p.data=((char*)recv_data[i])+sizeof(CommData);
  1666. p.length=comm_data.pkd_length-sizeof(CommData);
  1667. recv_nodes[i]->Unpack(p);
  1668. }
  1669. }
  1670. pid_shift<<=1;
  1671. send_pid=(rank+pid_shift<num_p?rank+pid_shift:rank);
  1672. if(send_pid!=rank){ // Copy data to shrd_buff_vec1 and Set shrd_data
  1673. for(size_t i=0;i<recv_data.size();i++){
  1674. CommData& comm_data=*(CommData*)recv_data[i];
  1675. //{ // Skip if this node already exists.
  1676. // bool skip=false;
  1677. // for(size_t k=0;k<shrd_data.size();k++){
  1678. // CommData& comm_data_=*(CommData*)shrd_data[k];
  1679. // if(comm_data_.mid==comm_data.mid){
  1680. // assert(false);
  1681. // skip=true;
  1682. // break;
  1683. // }
  1684. // }
  1685. // if(skip) continue;
  1686. //}
  1687. assert(comm_data.mid.GetDepth()>0);
  1688. size_t d=comm_data.mid.GetDepth()-1;
  1689. if(d<shrd_mid.size() && shrd_mid[d].isAncestor(mins[rank]) && shrd_mid[d].NextId().getDFD()>mins[send_pid].getDFD())
  1690. for(size_t j=0;j<comm_data.usr_cnt;j++){
  1691. if(comm_data.usr_mid[j]==shrd_mid[d]){
  1692. assert(data_ptr+comm_data.pkd_length<=&(*shrd_buff_vec1.end())); //TODO: resize if needed.
  1693. mem::memcopy(data_ptr, &comm_data, comm_data.pkd_length);
  1694. shrd_data.push_back(data_ptr);
  1695. data_ptr+=comm_data.pkd_length;
  1696. break;
  1697. }
  1698. }
  1699. }
  1700. }
  1701. }
  1702. }
  1703. //Profile::Toc();
  1704. // Free data
  1705. for(size_t i=0;i<shared_data.size();i++) memgr.free(shared_data[i]);
  1706. }
  1707. inline bool isLittleEndian(){
  1708. uint16_t number = 0x1;
  1709. uint8_t *numPtr = (uint8_t*)&number;
  1710. return (numPtr[0] == 1);
  1711. }
  1712. template <class TreeNode>
  1713. void MPI_Tree<TreeNode>::Write2File(const char* fname, int lod){
  1714. int myrank, np;
  1715. MPI_Comm_size(*Comm(),&np);
  1716. MPI_Comm_rank(*Comm(),&myrank);
  1717. std::vector<Real_t> coord; //Coordinates of octant corners.
  1718. std::vector<Real_t> value; //Data value at points.
  1719. std::vector<int32_t> mpi_rank; //MPI_Rank at points.
  1720. std::vector<int32_t> connect; //Cell connectivity.
  1721. std::vector<int32_t> offset ; //Cell offset.
  1722. std::vector<uint8_t> types ; //Cell types.
  1723. //Build list of octant corner points.
  1724. Node_t* n=this->PreorderFirst();
  1725. while(n!=NULL){
  1726. if(!n->IsGhost() && n->IsLeaf())
  1727. n->VTU_Data(coord, value, connect, offset, types, lod);
  1728. n=this->PreorderNxt(n);
  1729. }
  1730. int pt_cnt=coord.size()/COORD_DIM;
  1731. int dof=(pt_cnt?value.size()/pt_cnt:0);
  1732. assert(value.size()==(size_t)pt_cnt*dof);
  1733. int cell_cnt=types.size();
  1734. mpi_rank.resize(pt_cnt);
  1735. int new_myrank=myrank;//rand();
  1736. for(int i=0;i<pt_cnt;i++) mpi_rank[i]=new_myrank;
  1737. //Open file for writing.
  1738. std::stringstream vtufname;
  1739. vtufname<<fname<<std::setfill('0')<<std::setw(6)<<myrank<<".vtu";
  1740. std::ofstream vtufile;
  1741. vtufile.open(vtufname.str().c_str());
  1742. if(vtufile.fail()) return;
  1743. //Proceed to write to file.
  1744. size_t data_size=0;
  1745. vtufile<<"<?xml version=\"1.0\"?>\n";
  1746. if(isLittleEndian()) vtufile<<"<VTKFile type=\"UnstructuredGrid\" version=\"0.1\" byte_order=\"LittleEndian\">\n";
  1747. else vtufile<<"<VTKFile type=\"UnstructuredGrid\" version=\"0.1\" byte_order=\"BigEndian\">\n";
  1748. //===========================================================================
  1749. vtufile<<" <UnstructuredGrid>\n";
  1750. vtufile<<" <Piece NumberOfPoints=\""<<pt_cnt<<"\" NumberOfCells=\""<<cell_cnt<<"\">\n";
  1751. //---------------------------------------------------------------------------
  1752. vtufile<<" <Points>\n";
  1753. vtufile<<" <DataArray type=\"Float"<<sizeof(Real_t)*8<<"\" NumberOfComponents=\""<<COORD_DIM<<"\" Name=\"Position\" format=\"appended\" offset=\""<<data_size<<"\" />\n";
  1754. data_size+=sizeof(uint32_t)+coord.size()*sizeof(Real_t);
  1755. vtufile<<" </Points>\n";
  1756. //---------------------------------------------------------------------------
  1757. vtufile<<" <PointData>\n";
  1758. vtufile<<" <DataArray type=\"Float"<<sizeof(Real_t)*8<<"\" NumberOfComponents=\""<<dof<<"\" Name=\"value\" format=\"appended\" offset=\""<<data_size<<"\" />\n";
  1759. data_size+=sizeof(uint32_t)+value .size()*sizeof( Real_t);
  1760. vtufile<<" <DataArray type=\"Int32\" NumberOfComponents=\"1\" Name=\"mpi_rank\" format=\"appended\" offset=\""<<data_size<<"\" />\n";
  1761. data_size+=sizeof(uint32_t)+mpi_rank.size()*sizeof(int32_t);
  1762. vtufile<<" </PointData>\n";
  1763. //---------------------------------------------------------------------------
  1764. //---------------------------------------------------------------------------
  1765. vtufile<<" <Cells>\n";
  1766. vtufile<<" <DataArray type=\"Int32\" Name=\"connectivity\" format=\"appended\" offset=\""<<data_size<<"\" />\n";
  1767. data_size+=sizeof(uint32_t)+connect.size()*sizeof(int32_t);
  1768. vtufile<<" <DataArray type=\"Int32\" Name=\"offsets\" format=\"appended\" offset=\""<<data_size<<"\" />\n";
  1769. data_size+=sizeof(uint32_t)+offset.size() *sizeof(int32_t);
  1770. vtufile<<" <DataArray type=\"UInt8\" Name=\"types\" format=\"appended\" offset=\""<<data_size<<"\" />\n";
  1771. data_size+=sizeof(uint32_t)+types.size() *sizeof(uint8_t);
  1772. vtufile<<" </Cells>\n";
  1773. //---------------------------------------------------------------------------
  1774. //vtufile<<" <CellData>\n";
  1775. //vtufile<<" <DataArray type=\"Float"<<sizeof(Real_t)*8<<"\" Name=\"Velocity\" format=\"appended\" offset=\""<<data_size<<"\" />\n";
  1776. //vtufile<<" </CellData>\n";
  1777. //---------------------------------------------------------------------------
  1778. vtufile<<" </Piece>\n";
  1779. vtufile<<" </UnstructuredGrid>\n";
  1780. //===========================================================================
  1781. vtufile<<" <AppendedData encoding=\"raw\">\n";
  1782. vtufile<<" _";
  1783. int32_t block_size;
  1784. block_size=coord .size()*sizeof( Real_t); vtufile.write((char*)&block_size, sizeof(int32_t)); vtufile.write((char*)&coord [0], coord .size()*sizeof( Real_t));
  1785. block_size=value .size()*sizeof( Real_t); vtufile.write((char*)&block_size, sizeof(int32_t)); vtufile.write((char*)&value [0], value .size()*sizeof( Real_t));
  1786. block_size=mpi_rank.size()*sizeof(int32_t); vtufile.write((char*)&block_size, sizeof(int32_t)); vtufile.write((char*)&mpi_rank[0], mpi_rank.size()*sizeof(int32_t));
  1787. block_size=connect.size()*sizeof(int32_t); vtufile.write((char*)&block_size, sizeof(int32_t)); vtufile.write((char*)&connect[0], connect.size()*sizeof(int32_t));
  1788. block_size=offset .size()*sizeof(int32_t); vtufile.write((char*)&block_size, sizeof(int32_t)); vtufile.write((char*)&offset [0], offset .size()*sizeof(int32_t));
  1789. block_size=types .size()*sizeof(uint8_t); vtufile.write((char*)&block_size, sizeof(int32_t)); vtufile.write((char*)&types [0], types .size()*sizeof(uint8_t));
  1790. vtufile<<"\n";
  1791. vtufile<<" </AppendedData>\n";
  1792. //===========================================================================
  1793. vtufile<<"</VTKFile>\n";
  1794. vtufile.close();
  1795. if(myrank) return;
  1796. std::stringstream pvtufname;
  1797. pvtufname<<fname<<".pvtu";
  1798. std::ofstream pvtufile;
  1799. pvtufile.open(pvtufname.str().c_str());
  1800. if(pvtufile.fail()) return;
  1801. pvtufile<<"<?xml version=\"1.0\"?>\n";
  1802. pvtufile<<"<VTKFile type=\"PUnstructuredGrid\">\n";
  1803. pvtufile<<" <PUnstructuredGrid GhostLevel=\"0\">\n";
  1804. pvtufile<<" <PPoints>\n";
  1805. pvtufile<<" <PDataArray type=\"Float"<<sizeof(Real_t)*8<<"\" NumberOfComponents=\""<<COORD_DIM<<"\" Name=\"Position\"/>\n";
  1806. pvtufile<<" </PPoints>\n";
  1807. pvtufile<<" <PPointData>\n";
  1808. pvtufile<<" <PDataArray type=\"Float"<<sizeof(Real_t)*8<<"\" NumberOfComponents=\""<<dof<<"\" Name=\"value\"/>\n";
  1809. pvtufile<<" <PDataArray type=\"Int32\" NumberOfComponents=\"1\" Name=\"mpi_rank\"/>\n";
  1810. pvtufile<<" </PPointData>\n";
  1811. {
  1812. // Extract filename from path.
  1813. std::stringstream vtupath;
  1814. vtupath<<'/'<<fname<<'\0';
  1815. char *fname_ = (char*)strrchr(vtupath.str().c_str(), '/') + 1;
  1816. //std::string fname_ = boost::filesystem::path(fname).filename().string().
  1817. for(int i=0;i<np;i++) pvtufile<<" <Piece Source=\""<<fname_<<std::setfill('0')<<std::setw(6)<<i<<".vtu\"/>\n";
  1818. }
  1819. pvtufile<<" </PUnstructuredGrid>\n";
  1820. pvtufile<<"</VTKFile>\n";
  1821. pvtufile.close();
  1822. }
  1823. template <class TreeNode>
  1824. const std::vector<MortonId>& MPI_Tree<TreeNode>::GetMins(){
  1825. Node_t* n=this->PreorderFirst();
  1826. while(n!=NULL){
  1827. if(!n->IsGhost() && n->IsLeaf()) break;
  1828. n=this->PreorderNxt(n);
  1829. }
  1830. ASSERT_WITH_MSG(n!=NULL,"No non-ghost nodes found on this process.");
  1831. MortonId my_min;
  1832. my_min=n->GetMortonId();
  1833. int np;
  1834. MPI_Comm_size(*Comm(),&np);
  1835. mins.resize(np);
  1836. MPI_Allgather(&my_min , 1, par::Mpi_datatype<MortonId>::value(),
  1837. &mins[0], 1, par::Mpi_datatype<MortonId>::value(), *Comm());
  1838. return mins;
  1839. }
  1840. }//end namespace