mpi_tree.txx 75 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
7177817791780178117821783178417851786178717881789179017911792179317941795179617971798179918001801180218031804180518061807180818091810181118121813181418151816181718181819182018211822182318241825182618271828182918301831183218331834183518361837183818391840184118421843184418451846184718481849185018511852185318541855185618571858185918601861186218631864186518661867186818691870187118721873187418751876187718781879188018811882188318841885188618871888188918901891189218931894189518961897189818991900190119021903190419051906190719081909191019111912191319141915191619171918191919201921192219231924192519261927192819291930193119321933193419351936193719381939194019411942194319441945194619471948194919501951195219531954195519561957195819591960196119621963196419651966196719681969197019711972197319741975197619771978197919801981198219831984198519861987198819891990199119921993199419951996199719981999200020012002200320042005200620072008200920102011201220132014201520162017201820192020202120222023202420252026202720282029203020312032203320342035203620372038203920402041204220432044204520462047204820492050205120522053205420552056205720582059206020612062206320642065206620672068206920702071207220732074207520762077207820792080208120822083208420852086208720882089209020912092209320942095209620972098209921002101210221032104210521062107210821092110211121122113211421152116211721182119212021212122212321242125212621272128212921302131213221332134213521362137213821392140214121422143214421452146214721482149215021512152215321542155215621572158215921602161216221632164216521662167216821692170217121722173217421752176217721782179218021812182218321842185218621872188218921902191219221932194219521962197219821992200220122022203220422052206220722082209221022112212221322142215221622172218221922202221222222232224222522262227222822292230
  1. /**
  2. * \file mpi_tree.txx
  3. * \author Dhairya Malhotra, dhairya.malhotra@gmail.com
  4. * \date 12-11-2010
  5. * \brief This file contains the implementation of the class MPI_Tree.
  6. */
  7. #include <assert.h>
  8. #include <cstring>
  9. #include <fstream>
  10. #include <list>
  11. #include <set>
  12. #include <parUtils.h>
  13. #include <ompUtils.h>
  14. #include <profile.hpp>
  15. namespace pvfmm{
  16. /**
  17. * @author Dhairya Malhotra, dhairya.malhotra@gmail.com
  18. * @date 08 Feb 2011
  19. */
/**
 * \brief Build the local part of a linear octree: given globally sorted point
 * MortonIds (`nodes`) and an initial octant range (`leaves`), emit the list of
 * leaf octants covering [leaves[0], leaves[last].NextId()) such that each leaf
 * holds at most `maxNumPts` points, unless the leaf is already at depth
 * maxDepth-1 and cannot be refined further.
 *
 * \param[in]     nodes     sorted MortonIds of the local points.
 * \param[in,out] leaves    in: octant range to cover; out: resulting leaves.
 * \param[in]     maxNumPts maximum number of points per octant.
 * \param[in]     maxDepth  maximum tree depth (must be <= MAX_DEPTH).
 * \param[in]     complete  if true, also emit empty octants after the last
 *                          point so the output covers the whole input range.
 * \return 0 on success.
 */
inline int p2oLocal(Vector<MortonId> & nodes, Vector<MortonId>& leaves,
    unsigned int maxNumPts, unsigned int maxDepth, bool complete) {
  assert(maxDepth<=MAX_DEPTH);
  std::vector<MortonId> leaves_lst;
  unsigned int init_size=leaves.Dim();
  unsigned int num_pts=nodes.Dim();
  MortonId curr_node=leaves[0];
  MortonId last_node=leaves[init_size-1].NextId();
  MortonId next_node;
  unsigned int curr_pt=0;
  unsigned int next_pt=curr_pt+maxNumPts;
  while(next_pt <= num_pts){
    // Refine curr_node until nodes[next_pt] falls outside it (so it holds at
    // most maxNumPts points) or until depth maxDepth-1 is reached.
    next_node = curr_node.NextId();
    while( next_pt < num_pts && next_node > nodes[next_pt] && curr_node.GetDepth() < maxDepth-1 ){
      curr_node = curr_node.getDFD(curr_node.GetDepth()+1);
      next_node = curr_node.NextId();
    }
    leaves_lst.push_back(curr_node);
    curr_node = next_node;
    unsigned int inc=maxNumPts;
    while(next_pt < num_pts && curr_node > nodes[next_pt]){
      // We have more than maxNumPts points per octant because the node can
      // not be refined any further.  Advance next_pt geometrically to find
      // the first point past curr_node without scanning one-by-one.
      inc=inc<<1;
      next_pt+=inc;
      if(next_pt > num_pts){
        next_pt = num_pts;
        break;
      }
    }
    // Binary search for the first point belonging to (the new) curr_node.
    curr_pt = std::lower_bound(&nodes[0]+curr_pt,&nodes[0]+next_pt,curr_node,std::less<MortonId>())-&nodes[0];
    if(curr_pt >= num_pts) break;
    next_pt = curr_pt + maxNumPts;
    if(next_pt > num_pts) next_pt = num_pts;
  }
#ifndef NDEBUG
  // Sanity check: each emitted leaf holds <= maxNumPts points (or is at
  // depth maxDepth-1), and together the leaves cover all points exactly once.
  for(size_t i=0;i<leaves_lst.size();i++){
    size_t a=std::lower_bound(&nodes[0],&nodes[0]+nodes.Dim(),leaves_lst[i],std::less<MortonId>())-&nodes[0];
    size_t b=std::lower_bound(&nodes[0],&nodes[0]+nodes.Dim(),leaves_lst[i].NextId(),std::less<MortonId>())-&nodes[0];
    assert(b-a<=maxNumPts || leaves_lst[i].GetDepth()==maxDepth-1);
    if(i==leaves_lst.size()-1) assert(b==nodes.Dim() && a<nodes.Dim());
    if(i==0) assert(a==0);
  }
#endif
  if(complete)
  // Fill the gap between the last emitted leaf and last_node with the
  // coarsest empty octants that do not overshoot last_node.
  while(curr_node<last_node){
    while( curr_node.NextId() > last_node && curr_node.GetDepth() < maxDepth-1 )
      curr_node = curr_node.getDFD(curr_node.GetDepth()+1);
    leaves_lst.push_back(curr_node);
    curr_node = curr_node.NextId();
  }
  leaves=leaves_lst;
  return 0;
}
/**
 * \brief Construct a distributed linear octree from point MortonIds:
 * globally sort the points, fetch a few boundary points from the next rank,
 * build the local tree with p2oLocal, remove octants duplicated across
 * processor boundaries, and finally repartition the octants.
 *
 * \param[in]  pt_mid    MortonIds of the local points (need not be sorted).
 * \param[out] nodes     local leaf octants of the resulting linear tree.
 * \param[in]  maxDepth  maximum tree depth.
 * \param[in]  maxNumPts maximum number of points per octant.
 * \param[in]  comm      MPI communicator.
 * \return 0 on success.
 */
inline int points2Octree(const Vector<MortonId>& pt_mid, Vector<MortonId>& nodes,
    unsigned int maxDepth, unsigned int maxNumPts, const MPI_Comm& comm ) {
  int myrank, np;
  MPI_Comm_rank(comm, &myrank);
  MPI_Comm_size(comm, &np);
  // Sort morton id of points.
  Profile::Tic("SortMortonId", &comm, true, 5);
  Vector<MortonId> pt_sorted;
  //par::partitionW<MortonId>(pt_mid, NULL, comm);
  par::HyperQuickSort(pt_mid, pt_sorted, comm);
  size_t pt_cnt=pt_sorted.Dim();
  Profile::Toc();
  // Add last few points from next process, to get the boundary octant right.
  Profile::Tic("Comm", &comm, true, 5);
  {
    { // Adjust maxNumPts
      // If there are fewer than maxNumPts*np points globally, shrink
      // maxNumPts so that every process still receives some octants.
      size_t glb_pt_cnt=0;
      MPI_Allreduce(&pt_cnt, &glb_pt_cnt, 1, par::Mpi_datatype<size_t>::value(), par::Mpi_datatype<size_t>::sum(), comm);
      if(glb_pt_cnt<maxNumPts*np) maxNumPts=glb_pt_cnt/np;
    }
    size_t recv_size=0;
    size_t send_size=(2*maxNumPts<pt_cnt?2*maxNumPts:pt_cnt);
    { // Exchange the transfer size with the neighboring ranks first.
      MPI_Request recvRequest;
      MPI_Request sendRequest;
      MPI_Status statusWait;
      if(myrank < (np-1)) MPI_Irecv (&recv_size, 1, par::Mpi_datatype<size_t>::value(), myrank+1, 1, comm, &recvRequest);
      if(myrank > 0 ) MPI_Issend(&send_size, 1, par::Mpi_datatype<size_t>::value(), myrank-1, 1, comm, &sendRequest);
      if(myrank < (np-1)) MPI_Wait(&recvRequest, &statusWait);
      if(myrank > 0 ) MPI_Wait(&sendRequest, &statusWait); //This can be done later.
    }
    if(recv_size>0){// Resize pt_sorted.
      Vector<MortonId> pt_sorted_(pt_cnt+recv_size);
      mem::memcopy(&pt_sorted_[0], &pt_sorted[0], pt_cnt*sizeof(MortonId));
      pt_sorted.Swap(pt_sorted_);
    }
    {// Exchange data.
      // Send the first send_size points to the previous rank; receive
      // recv_size boundary points from the next rank (appended at pt_cnt).
      MPI_Request recvRequest;
      MPI_Request sendRequest;
      MPI_Status statusWait;
      if(myrank < (np-1)) MPI_Irecv (&pt_sorted[0]+pt_cnt, recv_size, par::Mpi_datatype<MortonId>::value(), myrank+1, 1, comm, &recvRequest);
      if(myrank > 0 ) MPI_Issend(&pt_sorted[0] , send_size, par::Mpi_datatype<MortonId>::value(), myrank-1, 1, comm, &sendRequest);
      if(myrank < (np-1)) MPI_Wait(&recvRequest, &statusWait);
      if(myrank > 0 ) MPI_Wait(&sendRequest, &statusWait); //This can be done later.
    }
  }
  Profile::Toc();
  // Construct local octree.
  Profile::Tic("p2o_local", &comm, false, 5);
  Vector<MortonId> nodes_local(1); nodes_local[0]=MortonId();
  p2oLocal(pt_sorted, nodes_local, maxNumPts, maxDepth, myrank==np-1);
  Profile::Toc();
  // Remove duplicate nodes on adjacent processors.
  Profile::Tic("RemoveDuplicates", &comm, true, 5);
  {
    size_t node_cnt=nodes_local.Dim();
    MortonId first_node;
    MortonId last_node=nodes_local[node_cnt-1];
    { // Send last_node to next process and get first_node from previous process.
      // NOTE(review): the request variable names are swapped here (the Issend
      // uses recvRequest, the Irecv uses sendRequest); the matching Waits use
      // the same swapped names, so behavior is still correct.
      MPI_Request recvRequest;
      MPI_Request sendRequest;
      MPI_Status statusWait;
      if(myrank < (np-1)) MPI_Issend(& last_node, 1, par::Mpi_datatype<MortonId>::value(), myrank+1, 1, comm, &recvRequest);
      if(myrank > 0 ) MPI_Irecv (&first_node, 1, par::Mpi_datatype<MortonId>::value(), myrank-1, 1, comm, &sendRequest);
      if(myrank < (np-1)) MPI_Wait(&recvRequest, &statusWait);
      if(myrank > 0 ) MPI_Wait(&sendRequest, &statusWait); //This can be done later.
    }
    size_t i=0;
    std::vector<MortonId> node_lst;
    if(myrank){
      // Skip local octants that lie before the previous rank's last octant,
      // then complete the gap between first_node and the first kept octant.
      while(i<node_cnt && nodes_local[i].getDFD(maxDepth)<first_node) i++; assert(i);
      last_node=nodes_local[i>0?i-1:0].NextId(); // Next MortonId in the tree after first_node.
      while(first_node<last_node){ // Complete nodes between first_node and last_node.
        while(first_node.isAncestor(last_node))
          first_node=first_node.getDFD(first_node.GetDepth()+1);
        if(first_node==last_node) break;
        node_lst.push_back(first_node);
        first_node=first_node.NextId();
      }
    }
    // Keep the remaining local octants; drop the last one unless this is the
    // last rank (the next rank reconstructs it from first_node).
    for(;i<node_cnt-(myrank==np-1?0:1);i++) node_lst.push_back(nodes_local[i]);
    nodes=node_lst;
  }
  Profile::Toc();
  // Repartition nodes.
  Profile::Tic("partitionW", &comm, false, 5);
  par::partitionW<MortonId>(nodes, NULL , comm);
  Profile::Toc();
  return 0;
}
/**
 * \brief Initialize the distributed tree from point data: build a linear
 * octree from the point MortonIds, scatter point coordinates/values to match
 * that partition, then construct the pointer-based tree from the linear tree.
 *
 * \param[in] init_data node initialization data; init_data->max_pts is the
 *            maximum number of points per octant.
 */
template <class TreeNode>
void MPI_Tree<TreeNode>::Initialize(typename Node_t::NodeData* init_data){
  //Initialize root node.
  Profile::Tic("InitRoot",Comm(),false,3);
  Tree<TreeNode>::Initialize(init_data);
  TreeNode* rnode=this->RootNode();
  assert(this->dim==COORD_DIM);
  Profile::Toc();
  Profile::Tic("Points2Octee",Comm(),true,3);
  Vector<MortonId> lin_oct;
  { //Get the linear tree.
    // Compute MortonId from pt_coord.
    Vector<MortonId> pt_mid;
    Vector<Real_t>& pt_coord=rnode->pt_coord;
    size_t pt_cnt=pt_coord.Dim()/this->dim;
    pt_mid.Resize(pt_cnt);
    #pragma omp parallel for
    for(size_t i=0;i<pt_cnt;i++){
      pt_mid[i]=MortonId(pt_coord[i*COORD_DIM+0],pt_coord[i*COORD_DIM+1],pt_coord[i*COORD_DIM+2],this->max_depth);
    }
    //Get the linear tree.
    points2Octree(pt_mid,lin_oct,this->max_depth,init_data->max_pts,*Comm());
  }
  Profile::Toc();
  Profile::Tic("ScatterPoints",Comm(),true,3);
  { // Sort and partition point coordinates and values.
    std::vector<Vector<Real_t>*> coord_lst;
    std::vector<Vector<Real_t>*> value_lst;
    std::vector<Vector<size_t>*> scatter_lst;
    rnode->NodeDataVec(coord_lst, value_lst, scatter_lst);
    assert(coord_lst.size()==value_lst.size());
    assert(coord_lst.size()==scatter_lst.size());
    Vector<MortonId> pt_mid;
    Vector<size_t> scatter_index;
    for(size_t i=0;i<coord_lst.size();i++){
      if(!coord_lst[i]) continue;
      Vector<Real_t>& pt_coord=*coord_lst[i];
      { // Compute MortonId from pt_coord.
        size_t pt_cnt=pt_coord.Dim()/this->dim;
        pt_mid.Resize(pt_cnt);
        #pragma omp parallel for
        // NOTE(review): this inner `i` shadows the outer loop index `i`.
        for(size_t i=0;i<pt_cnt;i++){
          pt_mid[i]=MortonId(pt_coord[i*COORD_DIM+0],pt_coord[i*COORD_DIM+1],pt_coord[i*COORD_DIM+2],this->max_depth);
        }
      }
      // Scatter coordinates (and optional values) to the octant partition of
      // lin_oct. `comm` is not declared locally here; presumably it is an
      // MPI_Comm member of MPI_Tree — confirm against the class declaration.
      par::SortScatterIndex(pt_mid , scatter_index, comm, &lin_oct[0]);
      par::ScatterForward (pt_coord, scatter_index, comm);
      if(value_lst[i]!=NULL){
        Vector<Real_t>& pt_value=*value_lst[i];
        par::ScatterForward(pt_value, scatter_index, comm);
      }
      if(scatter_lst[i]!=NULL){
        // Save the scatter index so callers can invert the permutation later.
        Vector<size_t>& pt_scatter=*scatter_lst[i];
        pt_scatter=scatter_index;
      }
    }
  }
  Profile::Toc();
  //Initialize the pointer based tree from the linear tree.
  Profile::Tic("PointerTree",Comm(),false,3);
  { // Construct the pointer tree from lin_oct
    int omp_p=omp_get_max_threads();
    // Partition nodes between threads: create the octants at the thread
    // boundaries serially first, so the parallel loop below never needs to
    // Subdivide across a thread boundary.
    rnode->SetGhost(false);
    for(int i=0;i<omp_p;i++){
      size_t idx=(lin_oct.Dim()*i)/omp_p;
      Node_t* n=FindNode(lin_oct[idx], true);
      assert(n->GetMortonId()==lin_oct[idx]);
      UNUSED(n);
    }
    #pragma omp parallel for
    for(int i=0;i<omp_p;i++){
      size_t a=(lin_oct.Dim()* i )/omp_p;
      size_t b=(lin_oct.Dim()*(i+1))/omp_p;
      size_t idx=a;
      Node_t* n=FindNode(lin_oct[idx], false);
      if(a==0) n=rnode;
      // Preorder sweep over this thread's range [a,b) of linear octants.
      while(n!=NULL && (idx<b || i==omp_p-1)){
        n->SetGhost(false);
        MortonId dn=n->GetMortonId();
        if(idx<b && dn.isAncestor(lin_oct[idx])){
          // Ancestor of the next target leaf: subdivide to descend to it.
          if(n->IsLeaf()) n->Subdivide();
        }else if(idx<b && dn==lin_oct[idx]){
          // Reached the target leaf: drop any children and advance.
          if(!n->IsLeaf()) n->Truncate();
          assert(n->IsLeaf());
          idx++;
        }else{
          // Not part of the local linear tree: mark as ghost.
          n->Truncate();
          n->SetGhost(true);
        }
        n=this->PreorderNxt(n);
      }
      assert(idx==b);
    }
  }
  Profile::Toc();
#ifndef NDEBUG
  CheckTree();
#endif
}
/**
 * \brief Coarsen the distributed tree: repartition so each process owns whole
 * subtrees where possible, discard ghost data, then truncate octants whose
 * subdivision condition (SubdivCond) no longer holds.
 */
template <class TreeNode>
void MPI_Tree<TreeNode>::CoarsenTree(){
  int myrank;
  MPI_Comm_rank(*Comm(),&myrank);
  //Redistribute.
  {
    // Find the first non-ghost leaf in postorder.
    Node_t* n=this->PostorderFirst();
    while(n){
      if(n->IsLeaf() && !n->IsGhost()) break;
      n=this->PostorderNxt(n);
    }
    // On ranks other than 0, walk up toward an ancestor boundary so that the
    // new local minimum does not split a coarsenable subtree between ranks.
    // All exits from this loop are via break.
    while(myrank){
      Node_t* n_parent=(Node_t*)n->Parent();
      Node_t* n_ = n_parent;
      while(n_ && !n_->IsLeaf()){
        n_=this->PostorderNxt(n_);
        if(!n_) break;
      }
      if(!n_ || n_->IsGhost()) break;
      if(n->Depth()<=n_->Depth()) break;
      if(n_->Depth()<=1) break;
      n=n_;
    }
    MortonId loc_min=n->GetMortonId();
    RedistNodes(&loc_min);
  }
  //Truncate ghost nodes and build node list
  std::vector<Node_t*> leaf_nodes;
  {
    // Leading ghosts (before the first owned leaf): truncate and clear.
    Node_t* n=this->PostorderFirst();
    while(n!=NULL){
      if(n->IsLeaf() && !n->IsGhost()) break;
      n->Truncate();
      n->SetGhost(true);
      n->ClearData();
      n=this->PostorderNxt(n);
    }
    // Owned region: collect the leaves.
    while(n!=NULL){
      if(n->IsLeaf() && n->IsGhost()) break;
      if(n->IsLeaf()) leaf_nodes.push_back(n);
      n=this->PreorderNxt(n);
    }
    // Trailing ghosts: truncate and clear.
    while(n!=NULL){
      n->Truncate();
      n->SetGhost(true);
      n->ClearData();
      n=this->PreorderNxt(n);
    }
  }
  size_t node_cnt=leaf_nodes.size();
  //Partition nodes between OpenMP threads.
  int omp_p=omp_get_max_threads();
  std::vector<MortonId> mid(omp_p);
  std::vector<MortonId> split_key(omp_p); // NOTE(review): unused.
  #pragma omp parallel for
  for(int i=0;i<omp_p;i++){
    mid[i]=leaf_nodes[(i*node_cnt)/omp_p]->GetMortonId();
  }
  //Coarsen Tree.
  // Each thread sweeps its postorder range, truncating non-leaf octants that
  // fail SubdivCond, but never an ancestor of a thread-boundary splitter
  // (those are handled serially below).
  #pragma omp parallel for
  for(int i=0;i<omp_p;i++){
    Node_t* n_=leaf_nodes[i*node_cnt/omp_p];
    if(i*node_cnt/omp_p<(i+1)*node_cnt/omp_p)
    while(n_!=NULL){
      MortonId n_mid=n_->GetMortonId();
      if(!n_->IsLeaf() && !n_mid.isAncestor(mid[i].getDFD()))
        if(i<omp_p-1? !n_mid.isAncestor(mid[i+1].getDFD()):true)
          if(!n_->SubdivCond()) n_->Truncate();
      if(i<omp_p-1? n_mid==mid[i+1]: false) break;
      n_=this->PostorderNxt(n_);
    }
  }
  //Truncate nodes along ancestors of splitters.
  for(int i=0;i<omp_p;i++){
    Node_t* n_=FindNode(mid[i], false, this->RootNode());
    while(n_->Depth()>0){
      n_=(Node_t*)n_->Parent();
      if(!n_->SubdivCond()) n_->Truncate();
      else break;
    }
  }
}
/**
 * \brief Adaptively refine the distributed tree: repeatedly subdivide leaves
 * whose SubdivCond holds, level by level, redistributing nodes whenever the
 * load imbalance across processes exceeds a factor of four, and finish with
 * alternating coarsening/redistribution passes.
 */
template <class TreeNode>
void MPI_Tree<TreeNode>::RefineTree(){
  int np, myrank;
  MPI_Comm_size(*Comm(),&np);
  MPI_Comm_rank(*Comm(),&myrank);
  int omp_p=omp_get_max_threads();
  int n_child=1UL<<this->Dim();
  //Coarsen tree.
  MPI_Tree<TreeNode>::CoarsenTree();
  //Build node list.
  std::vector<Node_t*> leaf_nodes;
  {
    Node_t* n=this->PostorderFirst();
    while(n!=NULL){
      if(n->IsLeaf() && !n->IsGhost())
        leaf_nodes.push_back(n);
      n=this->PostorderNxt(n);
    }
  }
  size_t tree_node_cnt=leaf_nodes.size();
  //Adaptive subdivision of leaf nodes with load balancing.
  for(int l=0;l<this->max_depth;l++){
    //Subdivide nodes.
    // Each thread subdivides its share of the current leaves and records the
    // newly created children as candidates for the next level.
    std::vector<std::vector<Node_t*> > leaf_nodes_(omp_p);
    #pragma omp parallel for
    for(int i=0;i<omp_p;i++){
      size_t a=(leaf_nodes.size()* i )/omp_p;
      size_t b=(leaf_nodes.size()*(i+1))/omp_p;
      for(size_t j=a;j<b;j++){
        if(leaf_nodes[j]->IsLeaf() && !leaf_nodes[j]->IsGhost()){
          if(leaf_nodes[j]->SubdivCond()) leaf_nodes[j]->Subdivide();
          if(!leaf_nodes[j]->IsLeaf())
            for(int k=0;k<n_child;k++)
              leaf_nodes_[i].push_back((Node_t*)leaf_nodes[j]->Child(k));
        }
      }
    }
    // Each subdivision replaces one leaf with n_child leaves.
    for(int i=0;i<omp_p;i++)
      tree_node_cnt+=(leaf_nodes_[i].size()/n_child)*(n_child-1);
    //Determine load imbalance.
    size_t global_max, global_sum;
    MPI_Allreduce(&tree_node_cnt, &global_max, 1, par::Mpi_datatype<size_t>::value(), par::Mpi_datatype<size_t>::max(), *Comm());
    MPI_Allreduce(&tree_node_cnt, &global_sum, 1, par::Mpi_datatype<size_t>::value(), par::Mpi_datatype<size_t>::sum(), *Comm());
    //RedistNodes if needed.
    // Rebalance when the most loaded process has more than 4x the average.
    if(global_max*np>4*global_sum){
#ifndef NDEBUG
      Profile::Tic("RedistNodes",Comm(),true,4);
#endif
      RedistNodes();
#ifndef NDEBUG
      Profile::Toc();
#endif
      //Rebuild node list.
      leaf_nodes.clear();
      Node_t* n=this->PostorderFirst();
      while(n!=NULL){
        if(n->IsLeaf() && !n->IsGhost())
          leaf_nodes.push_back(n);
        n=this->PostorderNxt(n);
      }
      tree_node_cnt=leaf_nodes.size();
    }else{
      //Combine partial list of nodes.
      int node_cnt=0;
      for(int j=0;j<omp_p;j++)
        node_cnt+=leaf_nodes_[j].size();
      leaf_nodes.resize(node_cnt);
      #pragma omp parallel for
      for(int i=0;i<omp_p;i++){
        int offset=0;
        for(int j=0;j<i;j++)
          offset+=leaf_nodes_[j].size();
        for(size_t j=0;j<leaf_nodes_[i].size();j++)
          leaf_nodes[offset+j]=leaf_nodes_[i][j];
      }
    }
  }
  // Final cleanup: alternate redistribution and coarsening so the tree
  // settles into a balanced, coarsened state.
  RedistNodes();
  MPI_Tree<TreeNode>::CoarsenTree();
  RedistNodes();
  MPI_Tree<TreeNode>::CoarsenTree();
  RedistNodes();
}
/**
 * \brief Redistribute the leaf octants (and their packed data) across
 * processes. The new partition is defined by per-process minimum MortonIds:
 * either computed by weighted partitioning of the current leaves (when
 * loc_min==NULL) or taken from the caller-supplied local minimum.
 *
 * \param[in] loc_min optional local minimum MortonId of the desired
 *            partition; NULL means compute a NodeCost-weighted partition.
 */
template <class TreeNode>
void MPI_Tree<TreeNode>::RedistNodes(MortonId* loc_min) {
  int np, myrank;
  MPI_Comm_size(*Comm(),&np);
  MPI_Comm_rank(*Comm(),&myrank);
  if(np==1)return;
  //Create a linear tree in dendro format.
  Node_t* curr_node=this->PreorderFirst();
  std::vector<MortonId> in;
  std::vector<Node_t*> node_lst;
  while(curr_node!=NULL){
    if(curr_node->IsLeaf() && !curr_node->IsGhost()){
      node_lst.push_back(curr_node);
      in.push_back(curr_node->GetMortonId());
    }
    curr_node=this->PreorderNxt(curr_node);
  }
  size_t leaf_cnt=in.size();
  //Get new mins.
  // new_mins[p] is the smallest MortonId that rank p will own.
  std::vector<MortonId> new_mins(np);
  if(loc_min==NULL){
    //Partition vector of MortonIds using par::partitionW
    std::vector<MortonId> in_=in;
    std::vector<long long> wts(in_.size());
    #pragma omp parallel for
    for(size_t i=0;i<wts.size();i++){
      wts[i]=node_lst[i]->NodeCost();
    }
    par::partitionW<MortonId>(in_,&wts[0],*Comm());
    MPI_Allgather(&in_[0] , 1, par::Mpi_datatype<MortonId>::value(),
                  &new_mins[0], 1, par::Mpi_datatype<MortonId>::value(), *Comm());
  }else{
    MPI_Allgather(loc_min , 1, par::Mpi_datatype<MortonId>::value(),
                  &new_mins[0], 1, par::Mpi_datatype<MortonId>::value(), *Comm());
  }
  //Now exchange nodes according to new mins
  // Count and size (in bytes) of leaves destined for each process; each
  // packed leaf is [MortonId | length | payload].
  std::vector<PackedData> data(leaf_cnt);
  std::vector<int> send_cnts; send_cnts.assign(np,0);
  std::vector<int> send_size; send_size.assign(np,0);
  size_t sbuff_size=0;
  int omp_p=omp_get_max_threads();
  #pragma omp parallel for reduction(+:sbuff_size)
  for(int i=0;i<omp_p;i++){
    // Each thread handles destination ranks [a,b); leaves are sorted, so a
    // single forward sweep assigns each leaf to its destination.
    size_t a=( i *np)/omp_p;
    size_t b=((i+1)*np)/omp_p;
    if(b>a){
      size_t p_iter=a;
      size_t node_iter=std::lower_bound(&in[0], &in[in.size()], new_mins[a])-&in[0];
      for( ;node_iter<node_lst.size();node_iter++){
        while(p_iter+1u<(size_t)np? in[node_iter]>=new_mins[p_iter+1]: false) p_iter++;
        if(p_iter>=b) break;
        send_cnts[p_iter]++;
        data[node_iter]=node_lst[node_iter]->Pack();
        send_size[p_iter]+=data[node_iter].length+sizeof(size_t)+sizeof(MortonId);
        sbuff_size +=data[node_iter].length+sizeof(size_t)+sizeof(MortonId);
      }
    }
  }
  std::vector<int> recv_cnts(np);
  std::vector<int> recv_size(np);
  MPI_Alltoall(&send_cnts[0], 1, par::Mpi_datatype<int>::value(),
               &recv_cnts[0], 1, par::Mpi_datatype<int>::value(), *Comm());
  MPI_Alltoall(&send_size[0], 1, par::Mpi_datatype<int>::value(),
               &recv_size[0], 1, par::Mpi_datatype<int>::value(), *Comm());
  size_t recv_cnt=0;
  #pragma omp parallel for reduction(+:recv_cnt)
  for(int i=0;i<np;i++) recv_cnt+=recv_cnts[i];
  std::vector<MortonId> out(recv_cnt);
  std::vector<int> sdisp; sdisp.assign(np,0);
  std::vector<int> rdisp; rdisp.assign(np,0);
  omp_par::scan(&send_size[0],&sdisp[0],np); //TODO Don't need to do a full scan
  omp_par::scan(&recv_size[0],&rdisp[0],np); // as most entries will be 0.
  size_t rbuff_size=rdisp[np-1]+recv_size[np-1];
  char* send_buff=new char[sbuff_size];
  char* recv_buff=new char[rbuff_size];
  // Serialize: write headers sequentially, then copy payloads in parallel.
  std::vector<char*> data_ptr(leaf_cnt);
  char* s_ptr=send_buff;
  for(size_t i=0;i<leaf_cnt;i++){
    *((MortonId*)s_ptr)=in [i] ; s_ptr+=sizeof(MortonId);
    *(( size_t*)s_ptr)=data[i].length; s_ptr+=sizeof(size_t);
    data_ptr[i]=s_ptr ; s_ptr+=data[i].length;
  }
  #pragma omp parallel for
  for(int p=0;p<omp_p;p++){
    size_t a=( p *leaf_cnt)/omp_p;
    size_t b=((p+1)*leaf_cnt)/omp_p;
    for(size_t i=a;i<b;i++)
      mem::memcopy(data_ptr[i], data[i].data, data[i].length);
  }
  par::Mpi_Alltoallv_sparse<char>(&send_buff[0], &send_size[0], &sdisp[0],
                                  &recv_buff[0], &recv_size[0], &rdisp[0], *Comm());
  // Deserialize the received buffer back into MortonIds + packed payloads.
  char* r_ptr=recv_buff;
  std::vector<PackedData> r_data(recv_cnt);
  for(size_t i=0;i<recv_cnt;i++){
    out [i] =*(MortonId*)r_ptr; r_ptr+=sizeof(MortonId);
    r_data[i].length=*( size_t*)r_ptr; r_ptr+=sizeof(size_t);
    r_data[i].data = r_ptr; r_ptr+=r_data[i].length;
  }
  //Initialize all new nodes.
  // Preorder sweep: subdivide down to each received octant, truncate octants
  // not owned here, and remember the node for each received payload.
  int nchld=1UL<<this->Dim();
  size_t node_iter=0;
  MortonId dn;
  node_lst.resize(recv_cnt);
  Node_t* n=this->PreorderFirst();
  while(n!=NULL && node_iter<recv_cnt){
    n->SetGhost(false);
    dn=n->GetMortonId();
    if(dn.isAncestor(out[node_iter]) && dn!=out[node_iter]){
      if(n->IsLeaf()){
        {
          n->SetGhost(true);
          n->Subdivide();
          n->SetGhost(false);
          for(int j=0;j<nchld;j++){
            Node_t* ch_node=(Node_t*)n->Child(j);
            ch_node->SetGhost(false);
          }
        }
      }
    }else if(dn==out[node_iter]){
      if(!n->IsLeaf()){
        n->Truncate();
        n->SetGhost(false);
      }
      node_lst[node_iter]=n;
      node_iter++;
    }else{
      n->Truncate(); //This node does not belong to this process.
      n->SetGhost(true);
    }
    n=this->PreorderNxt(n);
  }
  // Everything after the last received octant is ghost.
  while(n!=NULL){
    n->Truncate();
    n->SetGhost(true);
    n=this->PreorderNxt(n);
  }
  // Unpack payloads into their nodes in parallel.
  #pragma omp parallel for
  for(int p=0;p<omp_p;p++){
    size_t a=( p *recv_cnt)/omp_p;
    size_t b=((p+1)*recv_cnt)/omp_p;
    for(size_t i=a;i<b;i++)
      node_lst[i]->Unpack(r_data[i]);
  }
  //Free memory buffers.
  delete[] recv_buff;
  delete[] send_buff;
}
  577. template <class TreeNode>
  578. TreeNode* MPI_Tree<TreeNode>::FindNode(MortonId& key, bool subdiv, TreeNode* start){
  579. int num_child=1UL<<this->Dim();
  580. Node_t* n=start;
  581. if(n==NULL) n=this->RootNode();
  582. while(n->GetMortonId()<key && (!n->IsLeaf()||subdiv)){
  583. if(n->IsLeaf() && !n->IsGhost()) n->Subdivide();
  584. if(n->IsLeaf()) break;
  585. for(int j=0;j<num_child;j++){
  586. if(((Node_t*)n->Child(j))->GetMortonId().NextId()>key){
  587. n=(Node_t*)n->Child(j);
  588. break;
  589. }
  590. }
  591. }
  592. assert(!subdiv || n->IsGhost() || n->GetMortonId()==key);
  593. return n;
  594. }
  595. //list must be sorted.
  596. inline int lineariseList(std::vector<MortonId> & list, MPI_Comm comm) {
  597. int rank,size;
  598. MPI_Comm_rank(comm,&rank);
  599. MPI_Comm_size(comm,&size);
  600. //Remove empty processors...
  601. int new_rank, new_size;
  602. MPI_Comm new_comm;
  603. MPI_Comm_split(comm, (list.empty()?0:1), rank, &new_comm);
  604. MPI_Comm_rank (new_comm, &new_rank);
  605. MPI_Comm_size (new_comm, &new_size);
  606. if(!list.empty()) {
  607. //Send the last octant to the next processor.
  608. MortonId lastOctant = list[list.size()-1];
  609. MortonId lastOnPrev;
  610. MPI_Request recvRequest;
  611. MPI_Request sendRequest;
  612. if(new_rank > 0) {
  613. MPI_Irecv(&lastOnPrev, 1, par::Mpi_datatype<MortonId>::value(), new_rank-1, 1, new_comm, &recvRequest);
  614. }
  615. if(new_rank < (new_size-1)) {
  616. MPI_Issend( &lastOctant, 1, par::Mpi_datatype<MortonId>::value(), new_rank+1, 1, new_comm, &sendRequest);
  617. }
  618. if(new_rank > 0) {
  619. std::vector<MortonId> tmp(list.size()+1);
  620. for(size_t i = 0; i < list.size(); i++) {
  621. tmp[i+1] = list[i];
  622. }
  623. MPI_Status statusWait;
  624. MPI_Wait(&recvRequest, &statusWait);
  625. tmp[0] = lastOnPrev;
  626. list.swap(tmp);
  627. }
  628. {// Remove duplicates and ancestors.
  629. std::vector<MortonId> tmp;
  630. if(!(list.empty())) {
  631. for(unsigned int i = 0; i < (list.size()-1); i++) {
  632. if( (!(list[i].isAncestor(list[i+1]))) && (list[i] != list[i+1]) ) {
  633. tmp.push_back(list[i]);
  634. }
  635. }
  636. if(new_rank == (new_size-1)) {
  637. tmp.push_back(list[list.size()-1]);
  638. }
  639. }
  640. list.swap(tmp);
  641. }
  642. if(new_rank < (new_size-1)) {
  643. MPI_Status statusWait;
  644. MPI_Wait(&sendRequest, &statusWait);
  645. }
  646. }//not empty procs only
  647. return 1;
  648. }//end fn.
  649. inline int balanceOctree (std::vector<MortonId > &in, std::vector<MortonId > &out,
  650. unsigned int dim, unsigned int maxDepth, bool periodic, MPI_Comm comm) {
  651. int omp_p=omp_get_max_threads();
  652. int rank, size;
  653. MPI_Comm_size(comm,&size);
  654. MPI_Comm_rank(comm,&rank);
  655. if(size==1 && in.size()==1){
  656. out=in;
  657. return 0;
  658. }
  659. #ifdef __VERBOSE__
  660. long long locInSize = in.size();
  661. #endif
  662. //////////////////////////////////////////////////////////////////////////////////////////////////
  663. { //Redistribute.
  664. //Vector<long long> balance_wt(size);
  665. //#pragma omp parallel for
  666. //for(size_t i=0;i<size;i++){
  667. // balance_wt[i]=in[i].GetDepth();
  668. //}
  669. //par::partitionW<MortonId>(in, &balance_wt[0], comm);
  670. par::partitionW<MortonId>(in, NULL, comm);
  671. }
  672. //Build level-by-level set of nodes.
  673. std::vector<std::set<MortonId> > nodes((maxDepth+1)*omp_p);
  674. #pragma omp parallel for
  675. for(int p=0;p<omp_p;p++){
  676. size_t a=( p *in.size())/omp_p;
  677. size_t b=((p+1)*in.size())/omp_p;
  678. for(size_t i=a;i<b;){
  679. size_t d=in[i].GetDepth();
  680. if(d==0){i++; continue;}
  681. MortonId pnode=in[i].getAncestor(d-1);
  682. nodes[d-1+(maxDepth+1)*p].insert(pnode);
  683. while(i<b && d==in[i].GetDepth() && pnode==in[i].getAncestor(d-1)) i++;
  684. }
  685. //Add new nodes level-by-level.
  686. std::vector<MortonId> nbrs;
  687. unsigned int num_chld=1UL<<dim;
  688. for(unsigned int l=maxDepth;l>=1;l--){
  689. //Build set of parents of balancing nodes.
  690. std::set<MortonId> nbrs_parent;
  691. std::set<MortonId>::iterator start=nodes[l+(maxDepth+1)*p].begin();
  692. std::set<MortonId>::iterator end =nodes[l+(maxDepth+1)*p].end();
  693. for(std::set<MortonId>::iterator node=start; node != end;){
  694. node->NbrList(nbrs, l, periodic);
  695. int nbr_cnt=nbrs.size();
  696. for(int i=0;i<nbr_cnt;i++)
  697. nbrs_parent.insert(nbrs[i].getAncestor(l-1));
  698. node++;
  699. }
  700. //Get the balancing nodes.
  701. std::set<MortonId>& ancestor_nodes=nodes[l-1+(maxDepth+1)*p];
  702. start=nbrs_parent.begin();
  703. end =nbrs_parent.end();
  704. ancestor_nodes.insert(start,end);
  705. }
  706. //Remove non-leaf nodes. (optional)
  707. for(unsigned int l=1;l<=maxDepth;l++){
  708. std::set<MortonId>::iterator start=nodes[l +(maxDepth+1)*p].begin();
  709. std::set<MortonId>::iterator end =nodes[l +(maxDepth+1)*p].end();
  710. std::set<MortonId>& ancestor_nodes=nodes[l-1+(maxDepth+1)*p];
  711. for(std::set<MortonId>::iterator node=start; node != end; node++){
  712. MortonId parent=node->getAncestor(node->GetDepth()-1);
  713. ancestor_nodes.erase(parent);
  714. }
  715. }
  716. }
  717. //Resize in.
  718. std::vector<size_t> node_cnt(omp_p,0);
  719. std::vector<size_t> node_dsp(omp_p,0);
  720. #pragma omp parallel for
  721. for(int i=0;i<omp_p;i++){
  722. for(unsigned int j=0;j<=maxDepth;j++)
  723. node_cnt[i]+=nodes[j+i*(maxDepth+1)].size();
  724. }
  725. omp_par::scan(&node_cnt[0],&node_dsp[0], omp_p);
  726. in.resize(node_cnt[omp_p-1]+node_dsp[omp_p-1]);
  727. //Copy leaf nodes to in.
  728. #pragma omp parallel for
  729. for(int p=0;p<omp_p;p++){
  730. size_t node_iter=node_dsp[p];
  731. for(unsigned int l=0;l<=maxDepth;l++){
  732. std::set<MortonId>::iterator start=nodes[l +(maxDepth+1)*p].begin();
  733. std::set<MortonId>::iterator end =nodes[l +(maxDepth+1)*p].end();
  734. for(std::set<MortonId>::iterator node=start; node != end; node++)
  735. in[node_iter++]=*node;
  736. }
  737. }
  738. #ifdef __VERBOSE__
  739. //Local size before removing duplicates and ancestors (linearise).
  740. long long locTmpSize = in.size();
  741. #endif
  742. //Sort, Linearise, Redistribute.
  743. //TODO Sort and linearize non-leaf nodes and then add leaf nodes.
  744. //TODO The following might work better as it reduces the comm bandwidth:
  745. //Split comm into sqrt(np) processes and sort, linearise for each comm group.
  746. //Then do the global sort, linearise with the original comm.
  747. par::HyperQuickSort(in, out, comm);
  748. lineariseList(out, comm);
  749. par::partitionW<MortonId>(out, NULL , comm);
  750. { // Add children
  751. //Remove empty processors...
  752. int new_rank, new_size;
  753. MPI_Comm new_comm;
  754. MPI_Comm_split(comm, (out.empty()?0:1), rank, &new_comm);
  755. MPI_Comm_rank (new_comm, &new_rank);
  756. MPI_Comm_size (new_comm, &new_size);
  757. if(!out.empty()) {
  758. MortonId nxt_mid(0,0,0,0);
  759. { // Get last octant from previous process.
  760. assert(out.size());
  761. //Send the last octant to the next processor.
  762. MortonId lastOctant = out.back();
  763. MortonId lastOnPrev;
  764. MPI_Request recvRequest;
  765. MPI_Request sendRequest;
  766. if(rank > 0) {
  767. MPI_Irecv(&lastOnPrev, 1, par::Mpi_datatype<MortonId>::value(), rank-1, 1, comm, &recvRequest);
  768. }
  769. if(rank < (size-1)) {
  770. MPI_Issend( &lastOctant, 1, par::Mpi_datatype<MortonId>::value(), rank+1, 1, comm, &sendRequest);
  771. }
  772. if(rank > 0) {
  773. MPI_Status statusWait;
  774. MPI_Wait(&recvRequest, &statusWait);
  775. nxt_mid = lastOnPrev.NextId();
  776. }
  777. if(rank < (size-1)) {
  778. MPI_Status statusWait;
  779. MPI_Wait(&sendRequest, &statusWait);
  780. }
  781. }
  782. std::vector<MortonId> out1;
  783. std::vector<MortonId> children;
  784. for(size_t i=0;i<out.size();i++){
  785. while(nxt_mid.getDFD()<out[i]){
  786. while(nxt_mid.isAncestor(out[i])){
  787. nxt_mid=nxt_mid.getAncestor(nxt_mid.GetDepth()+1);
  788. }
  789. out1.push_back(nxt_mid);
  790. nxt_mid=nxt_mid.NextId();
  791. }
  792. children=out[i].Children();
  793. for(size_t j=0;j<8;j++){
  794. out1.push_back(children[j]);
  795. }
  796. nxt_mid=out[i].NextId();
  797. }
  798. if(rank==size-1){
  799. while(nxt_mid.GetDepth()>0){
  800. out1.push_back(nxt_mid);
  801. nxt_mid=nxt_mid.NextId();
  802. }
  803. }
  804. out.swap(out1);
  805. }
  806. }
  807. //////////////////////////////////////////////////////////////////////////////////////////////////
  808. #ifdef __VERBOSE__
  809. long long locOutSize = out.size();
  810. long long globInSize, globTmpSize, globOutSize;
  811. MPI_Allreduce(&locInSize , &globInSize , 1, par::Mpi_datatype<long long>::value(), par::Mpi_datatype<long long>::sum(), comm);
  812. MPI_Allreduce(&locTmpSize, &globTmpSize, 1, par::Mpi_datatype<long long>::value(), par::Mpi_datatype<long long>::sum(), comm);
  813. MPI_Allreduce(&locOutSize, &globOutSize, 1, par::Mpi_datatype<long long>::value(), par::Mpi_datatype<long long>::sum(), comm);
  814. if(!rank) std::cout<<"Balance Octree. inpSize: "<<globInSize
  815. <<" tmpSize: "<<globTmpSize
  816. <<" outSize: "<<globOutSize
  817. <<" activeNpes: "<<size<<std::endl;
  818. #endif
  819. return 0;
  820. }//end function
  821. template <class TreeNode>
  822. void MPI_Tree<TreeNode>::Balance21(BoundaryType bndry) {
  823. int num_proc,myrank;
  824. MPI_Comm_rank(*Comm(),&myrank);
  825. MPI_Comm_size(*Comm(),&num_proc);
  826. //Using Dendro for balancing
  827. //Create a linear tree in dendro format.
  828. Node_t* curr_node=this->PreorderFirst();
  829. std::vector<MortonId> in;
  830. while(curr_node!=NULL){
  831. if(curr_node->IsLeaf() && !curr_node->IsGhost()){
  832. in.push_back(curr_node->GetMortonId());
  833. }
  834. curr_node=this->PreorderNxt(curr_node);
  835. }
  836. //2:1 balance
  837. Profile::Tic("ot::balanceOctree",Comm(),true,3);
  838. std::vector<MortonId> out;
  839. balanceOctree(in, out, this->Dim(), this->max_depth, (bndry==Periodic), *Comm());
  840. Profile::Toc();
  841. //Get new_mins.
  842. std::vector<MortonId> new_mins(num_proc);
  843. MPI_Allgather(&out[0] , 1, par::Mpi_datatype<MortonId>::value(),
  844. &new_mins[0], 1, par::Mpi_datatype<MortonId>::value(), *Comm());
  845. // Refine to new_mins in my range of octants
  846. // or else RedistNodes(...) will not work correctly.
  847. {
  848. int i=0;
  849. std::vector<MortonId> mins=GetMins();
  850. while(new_mins[i]<mins[myrank] && i<num_proc) i++; //TODO: Use binary search.
  851. for(;i<num_proc;i++){
  852. Node_t* n=FindNode(new_mins[i], true);
  853. if(n->IsGhost()) break;
  854. else assert(n->GetMortonId()==new_mins[i]);
  855. }
  856. }
  857. //Redist nodes using new_mins.
  858. Profile::Tic("RedistNodes",Comm(),true,3);
  859. RedistNodes(&out[0]);
  860. #ifndef NDEBUG
  861. std::vector<MortonId> mins=GetMins();
  862. assert(mins[myrank].getDFD()==out[0].getDFD());
  863. #endif
  864. Profile::Toc();
  865. //Now subdivide the current tree as necessary to make it balanced.
  866. Profile::Tic("LocalSubdivide",Comm(),false,3);
  867. int omp_p=omp_get_max_threads();
  868. for(int i=0;i<omp_p;i++){
  869. size_t a=(out.size()*i)/omp_p;
  870. Node_t* n=FindNode(out[a], true);
  871. assert(n->GetMortonId()==out[a]);
  872. UNUSED(n);
  873. }
  874. #pragma omp parallel for
  875. for(int i=0;i<omp_p;i++){
  876. size_t a=(out.size()* i )/omp_p;
  877. size_t b=(out.size()*(i+1))/omp_p;
  878. MortonId dn;
  879. size_t node_iter=a;
  880. Node_t* n=FindNode(out[node_iter], false);
  881. while(n!=NULL && node_iter<b){
  882. n->SetGhost(false);
  883. dn=n->GetMortonId();
  884. if(dn.isAncestor(out[node_iter]) && dn!=out[node_iter]){
  885. if(n->IsLeaf()) n->Subdivide();
  886. }else if(dn==out[node_iter]){
  887. assert(n->IsLeaf());
  888. //if(!n->IsLeaf()){ //This should never happen
  889. // n->Truncate();
  890. // n->SetGhost(false);
  891. //}
  892. assert(n->IsLeaf());
  893. node_iter++;
  894. }else{
  895. n->Truncate(); //This node does not belong to this process.
  896. n->SetGhost(true);
  897. }
  898. n=this->PreorderNxt(n);
  899. }
  900. if(i==omp_p-1){
  901. while(n!=NULL){
  902. n->Truncate();
  903. n->SetGhost(true);
  904. n=this->PreorderNxt(n);
  905. }
  906. }
  907. }
  908. Profile::Toc();
  909. }
/**
 * \brief Enforce 2:1 balance using only local subdivision (no communication):
 * scanning levels fine-to-coarse, any non-leaf node with a missing colleague
 * triggers refinement from the root down to that colleague's depth.
 * \param[in] bndry Boundary type; Periodic wraps colleague coordinates into [0,1).
 */
template <class TreeNode>
void MPI_Tree<TreeNode>::Balance21_local(BoundaryType bndry){
  //SetColleagues(bndry);
  // Bucket all nodes by depth so each level can be processed in order.
  std::vector<std::vector<Node_t*> > node_lst(this->max_depth+1);
  Node_t* curr_node=this->PreorderFirst();
  while(curr_node!=NULL){
    node_lst[curr_node->Depth()].push_back(curr_node);
    curr_node=this->PreorderNxt(curr_node);
  }
  int n1=pow(3.0,this->Dim()); // colleague count: 3^dim
  int n2=pow(2.0,this->Dim()); // child count: 2^dim
  for(int i=this->max_depth;i>0;i--){
    Real_t s=pow(0.5,i); // octant edge length at depth i
    for(size_t j=0;j<node_lst[i].size();j++){
      curr_node=node_lst[i][j];
      Real_t* coord=curr_node->Coord();
      if(!curr_node->IsLeaf()) for(int k=0;k<n1;k++){
        if(curr_node->Colleague(k)==NULL){
          // Center point of the missing colleague octant; k encodes the
          // offset {-1,0,1} per axis in base 3.
          Real_t c0[3]={coord[0]+((k/1)%3-1)*s+s*0.5,
                        coord[1]+((k/3)%3-1)*s+s*0.5,
                        coord[2]+((k/9)%3-1)*s+s*0.5};
          if(bndry==Periodic){
            // Wrap the point into the unit cube.
            c0[0]=c0[0]-floor(c0[0]);
            c0[1]=c0[1]-floor(c0[1]);
            c0[2]=c0[2]-floor(c0[2]);
          }
          // Only refine if the point lies strictly inside the domain.
          if(c0[0]>0 && c0[0]<1)
          if(c0[1]>0 && c0[1]<1)
          if(c0[2]>0 && c0[2]<1){
            // Descend from the root towards c0, subdividing leaves on the
            // way; newly created children join node_lst for later levels.
            Node_t* node=this->RootNode();
            while(node->Depth()<i){
              if(node->IsLeaf()){
                node->Subdivide();
                for(int l=0;l<n2;l++){
                  node_lst[node->Depth()+1].push_back((Node_t*)node->Child(l));
                  /*
                  SetColleagues(bndry,(Node_t*)node->Child(l));
                  for(int i_=0;i_<n1;i_++){
                    Node_t* coll=(Node_t*)((Node_t*)node->Child(l))->Colleague(i_);
                    if(coll!=NULL) SetColleagues(bndry,coll);
                  }// */
                }
              }
              // Pick the child octant containing the point c0.
              Real_t s1=pow(0.5,node->Depth()+1);
              Real_t* c1=node->Coord();
              int c_id=((c0[0]-c1[0])>s1?1:0)+
                       ((c0[1]-c1[1])>s1?2:0)+
                       ((c0[2]-c1[2])>s1?4:0);
              node=(Node_t*)node->Child(c_id);
              /*if(node->Depth()==i){
                c1=node->Coord();
                std::cout<<(c0[0]-c1[0])-s1/2<<' '
                std::cout<<(c0[1]-c1[1])-s1/2<<' '
                std::cout<<(c0[2]-c1[2])-s1/2<<'\n';
              }// */
            }
          }
        }
      }
    }
  }
}
/**
 * \brief Set the colleague (same-depth neighbour) pointers of a node.
 * When node==NULL, colleagues are set for the entire tree, level by level:
 * the root first, then each depth in parallel (a node's colleagues are
 * derived from its parent's colleagues, so parents must be done first).
 * \param[in] bndry Periodic makes the root its own colleague in every direction.
 * \param[in] node  Node to process, or NULL to process the whole tree.
 */
template <class TreeNode>
void MPI_Tree<TreeNode>::SetColleagues(BoundaryType bndry, Node_t* node){
  int n1=(int)pow(3.0,this->Dim()); // colleague slots: 3^dim
  int n2=(int)pow(2.0,this->Dim()); // children per node: 2^dim
  if(node==NULL){
    Node_t* curr_node=this->PreorderFirst();
    if(curr_node!=NULL){
      if(bndry==Periodic){
        // Periodic domain: the root neighbours itself in all directions.
        for(int i=0;i<n1;i++)
          curr_node->SetColleague(curr_node,i);
      }else{
        // Free-space: only the central slot refers back to the root.
        curr_node->SetColleague(curr_node,(n1-1)/2);
      }
      curr_node=this->PreorderNxt(curr_node);
    }
    // Bucket remaining nodes by depth, then process each level in parallel.
    Vector<std::vector<Node_t*> > nodes(MAX_DEPTH);
    while(curr_node!=NULL){
      nodes[curr_node->Depth()].push_back(curr_node);
      curr_node=this->PreorderNxt(curr_node);
    }
    for(size_t i=0;i<MAX_DEPTH;i++){
      size_t j0=nodes[i].size();
      Node_t** nodes_=&nodes[i][0];
      #pragma omp parallel for
      for(size_t j=0;j<j0;j++){
        SetColleagues(bndry, nodes_[j]);
      }
    }
  }else{
    /* //This is slower
    Node_t* root_node=this->RootNode();
    int d=node->Depth();
    Real_t* c0=node->Coord();
    Real_t s=pow(0.5,d);
    Real_t c[COORD_DIM];
    int idx=0;
    for(int i=-1;i<=1;i++)
    for(int j=-1;j<=1;j++)
    for(int k=-1;k<=1;k++){
      c[0]=c0[0]+s*0.5+s*k;
      c[1]=c0[1]+s*0.5+s*j;
      c[2]=c0[2]+s*0.5+s*i;
      if(bndry==Periodic){
        if(c[0]<0.0) c[0]+=1.0;
        if(c[0]>1.0) c[0]-=1.0;
        if(c[1]<1.0) c[1]+=1.0;
        if(c[1]>1.0) c[1]-=1.0;
        if(c[2]<1.0) c[2]+=1.0;
        if(c[2]>1.0) c[2]-=1.0;
      }
      node->SetColleague(NULL,idx);
      if(c[0]<1.0 && c[0]>0.0)
      if(c[1]<1.0 && c[1]>0.0)
      if(c[2]<1.0 && c[2]>0.0){
        MortonId m(c,d);
        Node_t* nbr=FindNode(m,false,root_node);
        while(nbr->Depth()>d) nbr=(Node_t*)nbr->Parent();
        if(nbr->Depth()==d) node->SetColleague(nbr,idx);
      }
      idx++;
    }
    /*/
    // Derive colleagues from the parent: every colleague of this node is a
    // child of one of the parent's colleagues.
    Node_t* parent_node;
    Node_t* tmp_node1;
    Node_t* tmp_node2;
    for(int i=0;i<n1;i++)node->SetColleague(NULL,i);
    parent_node=(Node_t*)node->Parent();
    if(parent_node==NULL) return;
    int l=node->Path2Node();
    for(int i=0;i<n1;i++){ //For each coll of the parent
      tmp_node1=(Node_t*)parent_node->Colleague(i);
      if(tmp_node1!=NULL)
      if(!tmp_node1->IsLeaf()){
        for(int j=0;j<n2;j++){ //For each child
          tmp_node2=(Node_t*)tmp_node1->Child(j);
          if(tmp_node2!=NULL){
            bool flag=true;
            int a=1,b=1,new_indx=0;
            for(int k=0;k<this->Dim();k++){
              // Per-dimension offset between candidate and this node; only
              // offsets in {-1,0,1} correspond to a valid colleague slot.
              int indx_diff=(((i/b)%3)-1)*2+((j/a)%2)-((l/a)%2);
              if(-1>indx_diff || indx_diff>1) flag=false;
              new_indx+=(indx_diff+1)*b;
              a*=2;b*=3;
            }
            if(flag){
              node->SetColleague(tmp_node2,new_indx);
            }
          }
        }
      }
    }// */
  }
}
  1065. template <class TreeNode>
  1066. bool MPI_Tree<TreeNode>::CheckTree(){
  1067. int myrank,np;
  1068. MPI_Comm_rank(*Comm(),&myrank);
  1069. MPI_Comm_size(*Comm(),&np);
  1070. std::vector<MortonId> mins=GetMins();
  1071. std::stringstream st;
  1072. st<<"PID_"<<myrank<<" : ";
  1073. std::string str;
  1074. Node_t* n=this->PostorderFirst();
  1075. while(n!=NULL){
  1076. if(myrank<np-1) if(n->GetMortonId().getDFD()>=mins[myrank+1])break;
  1077. if(n->GetMortonId()>=mins[myrank] && n->IsLeaf() && n->IsGhost()){
  1078. std::cout<<n->GetMortonId()<<'\n';
  1079. std::cout<<mins[myrank]<<'\n';
  1080. if(myrank+1<np) std::cout<<mins[myrank+1]<<'\n';
  1081. std::cout<<myrank<<'\n';
  1082. assert(false);
  1083. }
  1084. if(n->GetMortonId()<mins[myrank] && n->IsLeaf() && !n->IsGhost()){
  1085. assert(false);
  1086. }
  1087. if(!n->IsGhost() && n->Depth()>0)
  1088. assert(!((Node_t*)n->Parent())->IsGhost());
  1089. n=this->PostorderNxt(n);
  1090. }
  1091. while(n!=NULL){
  1092. if(n->IsLeaf() && !n->IsGhost()){
  1093. st<<"non-ghost leaf node "<<n->GetMortonId()<<"; after last node.";
  1094. str=st.str(); ASSERT_WITH_MSG(false,str.c_str());
  1095. }
  1096. n=this->PostorderNxt(n);
  1097. }
  1098. return true;
  1099. };
  1100. /**
  1101. * \brief Determines if node is used in the region between Morton Ids m1 and m2
  1102. * ( m1 <= m2 ).
  1103. */
  1104. template <class TreeNode>
  1105. void IsShared(std::vector<TreeNode*>& nodes, MortonId* m1, MortonId* m2, BoundaryType bndry, std::vector<char>& shared_flag){
  1106. MortonId mm1, mm2;
  1107. if(m1!=NULL) mm1=m1->getDFD();
  1108. if(m2!=NULL) mm2=m2->getDFD();
  1109. shared_flag.resize(nodes.size());
  1110. int omp_p=omp_get_max_threads();
  1111. #pragma omp parallel for
  1112. for(int j=0;j<omp_p;j++){
  1113. size_t a=((j )*nodes.size())/omp_p;
  1114. size_t b=((j+1)*nodes.size())/omp_p;
  1115. std::vector<MortonId> nbr_lst;
  1116. for(size_t i=a;i<b;i++){
  1117. shared_flag[i]=false;
  1118. TreeNode* node=nodes[i];
  1119. assert(node!=NULL);
  1120. if(node->Depth()<2){
  1121. shared_flag[i]=true;
  1122. continue;
  1123. }
  1124. node->GetMortonId().NbrList(nbr_lst, node->Depth()-1, bndry==Periodic);
  1125. for(size_t k=0;k<nbr_lst.size();k++){
  1126. MortonId n1=nbr_lst[k] .getDFD();
  1127. MortonId n2=nbr_lst[k].NextId().getDFD();
  1128. if(m1==NULL || n2>mm1)
  1129. if(m2==NULL || n1<mm2){
  1130. shared_flag[i]=true;
  1131. break;
  1132. }
  1133. }
  1134. }
  1135. }
  1136. }
  1137. inline void IsShared(std::vector<PackedData>& nodes, MortonId* m1, MortonId* m2, BoundaryType bndry, std::vector<char>& shared_flag){
  1138. MortonId mm1, mm2;
  1139. if(m1!=NULL) mm1=m1->getDFD();
  1140. if(m2!=NULL) mm2=m2->getDFD();
  1141. shared_flag.resize(nodes.size());
  1142. int omp_p=omp_get_max_threads();
  1143. #pragma omp parallel for
  1144. for(int j=0;j<omp_p;j++){
  1145. size_t a=((j )*nodes.size())/omp_p;
  1146. size_t b=((j+1)*nodes.size())/omp_p;
  1147. std::vector<MortonId> nbr_lst;
  1148. for(size_t i=a;i<b;i++){
  1149. shared_flag[i]=false;
  1150. MortonId* node=(MortonId*)nodes[i].data;
  1151. assert(node!=NULL);
  1152. if(node->GetDepth()<2){
  1153. shared_flag[i]=true;
  1154. continue;
  1155. }
  1156. node->NbrList(nbr_lst, node->GetDepth()-1, bndry==Periodic);
  1157. for(size_t k=0;k<nbr_lst.size();k++){
  1158. MortonId n1=nbr_lst[k] .getDFD();
  1159. MortonId n2=nbr_lst[k].NextId().getDFD();
  1160. if(m1==NULL || n2>mm1)
  1161. if(m2==NULL || n1<mm2){
  1162. shared_flag[i]=true;
  1163. break;
  1164. }
  1165. }
  1166. }
  1167. }
  1168. }
  1169. /**
  1170. * \brief Construct Locally Essential Tree by exchanging Ghost octants.
  1171. */
  1172. template <class TreeNode>
  1173. void MPI_Tree<TreeNode>::ConstructLET(BoundaryType bndry){
  1174. //Profile::Tic("LET_Hypercube", &comm, true, 5);
  1175. //ConstructLET_Hypercube(bndry);
  1176. //Profile::Toc();
  1177. //Profile::Tic("LET_Sparse", &comm, true, 5);
  1178. ConstructLET_Sparse(bndry);
  1179. //Profile::Toc();
  1180. #ifndef NDEBUG
  1181. CheckTree();
  1182. #endif
  1183. }
  1184. /**
  1185. * \brief Hypercube based scheme to exchange Ghost octants.
  1186. */
  1187. //#define PREFETCH_T0(addr,nrOfBytesAhead) _mm_prefetch(((char *)(addr))+nrOfBytesAhead,_MM_HINT_T0)
  1188. template <class TreeNode>
  1189. void MPI_Tree<TreeNode>::ConstructLET_Hypercube(BoundaryType bndry){
  1190. int num_p,rank;
  1191. MPI_Comm_size(*Comm(),&num_p);
  1192. MPI_Comm_rank(*Comm(),&rank );
  1193. if(num_p==1) return;
  1194. int omp_p=omp_get_max_threads();
  1195. std::vector<MortonId> mins=GetMins();
  1196. // Build list of shared nodes.
  1197. std::vector<Node_t*> shared_nodes; shared_nodes.clear();
  1198. std::vector<Node_t*> node_lst; node_lst.clear();
  1199. Node_t* curr_node=this->PreorderFirst();
  1200. while(curr_node!=NULL){
  1201. if(curr_node->GetMortonId().getDFD()>=mins[rank]) break;
  1202. curr_node=this->PreorderNxt(curr_node);
  1203. }
  1204. while(curr_node!=NULL){
  1205. if(curr_node->IsGhost()) break;
  1206. node_lst.push_back(curr_node);
  1207. curr_node=this->PreorderNxt(curr_node);
  1208. }
  1209. std::vector<char> node_flag0; node_flag0.clear();
  1210. std::vector<char> node_flag1; node_flag1.clear();
  1211. IsShared(node_lst,&mins[0],&mins[rank],bndry,node_flag0);
  1212. if(rank<num_p-1) IsShared(node_lst,&mins[rank+1],NULL,bndry,node_flag1);
  1213. for(size_t i=0;i<node_lst.size();i++){
  1214. if(node_flag0[i] || (rank<num_p-1 && node_flag1[i]))
  1215. shared_nodes.push_back(node_lst[i]);
  1216. }
  1217. //std::cout<<"Shared = "<<shared_nodes.size()<<'\n';
  1218. // Pack shared nodes.
  1219. static std::vector<char> shrd_buff_vec0(omp_p*64l*1024l*1024l);
  1220. static std::vector<char> shrd_buff_vec1(omp_p*128l*1024l*1024l);
  1221. static std::vector<char> send_buff_vec(omp_p*64l*1024l*1024l); char* send_buff;
  1222. static std::vector<char> recv_buff_vec(omp_p*64l*1024l*1024l); char* recv_buff;
  1223. std::vector<PackedData> shrd_data;
  1224. size_t max_data_size=0;
  1225. {
  1226. long max_data_size_lcl=0;
  1227. long max_data_size_glb=0;
  1228. char* data_ptr=&shrd_buff_vec0[0];
  1229. for(size_t i=0;i<shared_nodes.size();i++){
  1230. PackedData p=shared_nodes[i]->Pack(true,data_ptr,sizeof(MortonId));
  1231. ((MortonId*)data_ptr)[0]=shared_nodes[i]->GetMortonId();
  1232. p.length+=sizeof(MortonId);
  1233. shrd_data.push_back(p);
  1234. data_ptr+=p.length;
  1235. if(max_data_size_lcl<(long)p.length) max_data_size_lcl=p.length;
  1236. assert(data_ptr<=&(*shrd_buff_vec0.end())); //TODO: resize if needed.
  1237. }
  1238. MPI_Allreduce(&max_data_size_lcl, &max_data_size_glb, 1, MPI_LONG, MPI_MAX, *Comm());
  1239. max_data_size=max_data_size_glb;
  1240. }
  1241. // Memory slots for storing node data.
  1242. std::set<void*> mem_set;
  1243. size_t mem_set_size=0;
  1244. size_t range[2]={0,(size_t)num_p-1};
  1245. while(range[1]-range[0]>0){
  1246. size_t split_p=(range[0]+range[1])/2;
  1247. size_t new_range[2]={(size_t)rank<=split_p?range[0]:split_p+1,(size_t)rank<=split_p?split_p:range[1]};
  1248. size_t com_range[2]={(size_t)rank> split_p?range[0]:split_p+1,(size_t)rank> split_p?split_p:range[1]};
  1249. size_t partner=rank-new_range[0]+com_range[0];
  1250. if(partner>range[1]) partner--;
  1251. bool extra_partner=((size_t)rank==range[1] && ((range[1]-range[0])%2)==0);
  1252. int send_length=0;
  1253. std::vector<PackedData> shrd_data_new;
  1254. IsShared(shrd_data, &mins[com_range[0]], (com_range[1]==(size_t)num_p-1?NULL:&mins[com_range[1]+1]),bndry, node_flag0);
  1255. IsShared(shrd_data, &mins[new_range[0]], (new_range[1]==(size_t)num_p-1?NULL:&mins[new_range[1]+1]),bndry, node_flag1);
  1256. {
  1257. std::vector<void*> srctrg_ptr;
  1258. std::vector<size_t> mem_size;
  1259. for(size_t i=0;i<shrd_data.size();i++){
  1260. PackedData& p=shrd_data[i];
  1261. if( node_flag0[i]){ // Copy data to send buffer.
  1262. char* data_ptr=(char*)&send_buff_vec[send_length];
  1263. ((size_t*)data_ptr)[0]=p.length; data_ptr+=sizeof(size_t);
  1264. //mem::memcopy(data_ptr,p.data,p.length);
  1265. mem_size.push_back(p.length);
  1266. srctrg_ptr.push_back(p.data);
  1267. srctrg_ptr.push_back(data_ptr);
  1268. send_length+=p.length+sizeof(size_t);
  1269. assert((size_t)send_length<=send_buff_vec.size()); //TODO: resize if needed.
  1270. }
  1271. if(!node_flag1[i]){ // Free memory slot.
  1272. //assert(node_flag0[0]);
  1273. if(p.data>=&shrd_buff_vec1[0] && p.data<&shrd_buff_vec1[0]+shrd_buff_vec1.size())
  1274. mem_set.insert(p.data);
  1275. } else shrd_data_new.push_back(p);
  1276. }
  1277. shrd_data=shrd_data_new;
  1278. #pragma omp parallel for
  1279. for(int k=0;k<omp_p;k++){
  1280. size_t i0=((k+0)*mem_size.size())/omp_p;
  1281. size_t i1=((k+1)*mem_size.size())/omp_p;
  1282. for(size_t i=i0;i<i1;i++){
  1283. mem::memcopy(srctrg_ptr[2*i+1],srctrg_ptr[2*i+0],mem_size[i]);
  1284. }
  1285. }
  1286. }
  1287. //Exchange send size.
  1288. int recv_length=0;
  1289. int extra_recv_length=0;
  1290. int extra_send_length=0;
  1291. MPI_Status status;
  1292. MPI_Sendrecv (& send_length,1,MPI_INT,partner,0, &recv_length,1,MPI_INT,partner,0,*Comm(),&status);
  1293. if(extra_partner) MPI_Sendrecv(&extra_send_length,1,MPI_INT,split_p,0,&extra_recv_length,1,MPI_INT,split_p,0,*Comm(),&status);
  1294. //SendRecv data.
  1295. assert((size_t)send_length <=send_buff_vec.size()); send_buff=&send_buff_vec[0];
  1296. assert((size_t)recv_length+extra_recv_length<=recv_buff_vec.size()); recv_buff=&recv_buff_vec[0];
  1297. MPI_Sendrecv (send_buff,send_length,MPI_BYTE,partner,0, recv_buff , recv_length,MPI_BYTE,partner,0,*Comm(),&status);
  1298. if(extra_partner) MPI_Sendrecv( NULL, 0,MPI_BYTE,split_p,0,&recv_buff[recv_length],extra_recv_length,MPI_BYTE,split_p,0,*Comm(),&status);
  1299. //Get nodes from received data.
  1300. {
  1301. std::vector<void*> srctrg_ptr;
  1302. std::vector<size_t> mem_size;
  1303. int buff_length=0;
  1304. while(buff_length<recv_length+extra_recv_length){
  1305. PackedData p0,p1;
  1306. p0.length=((size_t*)&recv_buff_vec[buff_length])[0];
  1307. p0.data=(char*)&recv_buff_vec[buff_length]+sizeof(size_t);
  1308. buff_length+=p0.length+sizeof(size_t);
  1309. p1.length=p0.length;
  1310. if(mem_set.size()==0){
  1311. assert(mem_set_size*max_data_size<shrd_buff_vec1.size());
  1312. p1.data=&shrd_buff_vec1[mem_set_size*max_data_size];
  1313. mem_set_size++;
  1314. }else{
  1315. p1.data=*mem_set.begin();
  1316. mem_set.erase(mem_set.begin());
  1317. }
  1318. //mem::memcopy(p1.data,p0.data,p0.length);
  1319. mem_size.push_back(p0.length);
  1320. srctrg_ptr.push_back(p0.data);
  1321. srctrg_ptr.push_back(p1.data);
  1322. shrd_data.push_back(p1);
  1323. }
  1324. #pragma omp parallel for
  1325. for(int k=0;k<omp_p;k++){
  1326. size_t i0=((k+0)*mem_size.size())/omp_p;
  1327. size_t i1=((k+1)*mem_size.size())/omp_p;
  1328. for(size_t i=i0;i<i1;i++){
  1329. mem::memcopy(srctrg_ptr[2*i+1],srctrg_ptr[2*i+0],mem_size[i]);
  1330. }
  1331. }
  1332. }
  1333. range[0]=new_range[0];
  1334. range[1]=new_range[1];
  1335. }
  1336. //Add shared_nodes to the tree.
  1337. //std::cout<<"Number of Ghost Nodes = "<<shrd_data.size()<<'\n';
  1338. int nchld=(1UL<<this->Dim()); // Number of children.
  1339. std::vector<Node_t*> shrd_nodes(shrd_data.size());
  1340. for(size_t i=0;i<shrd_data.size();i++){ // Find shared nodes.
  1341. MortonId& mid=*(MortonId*)shrd_data[i].data;
  1342. Node_t* srch_node=this->RootNode();
  1343. while(srch_node->GetMortonId()!=mid){
  1344. Node_t* ch_node;
  1345. if(srch_node->IsLeaf()){
  1346. srch_node->SetGhost(true);
  1347. srch_node->Subdivide();
  1348. }
  1349. for(int j=nchld-1;j>=0;j--){
  1350. ch_node=(Node_t*)srch_node->Child(j);
  1351. if(ch_node->GetMortonId()<=mid){
  1352. srch_node=ch_node;
  1353. break;
  1354. }
  1355. }
  1356. }
  1357. shrd_nodes[i]=srch_node;
  1358. }
  1359. #pragma omp parallel for
  1360. for(size_t i=0;i<shrd_data.size();i++){
  1361. if(shrd_nodes[i]->IsGhost()) { // Initialize ghost node.
  1362. PackedData p=shrd_data[i];
  1363. p.data=((char*)p.data)+sizeof(MortonId);
  1364. p.length-=sizeof(MortonId);
  1365. shrd_nodes[i]->Unpack(p);
  1366. }
  1367. }
  1368. //Now LET is complete.
  1369. }
  1370. /**
  1371. * \brief Sparse communication scheme to exchange Ghost octants.
  1372. */
  1373. template <class TreeNode>
  1374. void MPI_Tree<TreeNode>::ConstructLET_Sparse(BoundaryType bndry){
  1375. typedef int MPI_size_t;
  1376. struct CommData{
  1377. MortonId mid;
  1378. TreeNode* node;
  1379. size_t pkd_length;
  1380. size_t usr_cnt;
  1381. MortonId usr_mid[COLLEAGUE_COUNT];
  1382. size_t usr_pid[COLLEAGUE_COUNT];
  1383. };
  1384. int num_p,rank;
  1385. MPI_Comm_size(*Comm(),&num_p);
  1386. MPI_Comm_rank(*Comm(),&rank );
  1387. if(num_p==1) return;
  1388. int omp_p=omp_get_max_threads();
  1389. std::vector<MortonId> mins=GetMins();
  1390. // Allocate Memory.
  1391. static std::vector<char> send_buff;
  1392. static std::vector<char> recv_buff;
  1393. //Profile::Tic("SharedNodes", &comm, false, 5);
  1394. CommData* node_comm_data=NULL; // CommData for all nodes.
  1395. std::vector<void*> shared_data; // CommData for shared nodes.
  1396. std::vector<par::SortPair<size_t,size_t> > pid_node_pair; // <pid, shared_data index> list
  1397. { // Set node_comm_data
  1398. MortonId mins_r0=mins[ rank+0 ].getDFD();
  1399. MortonId mins_r1=mins[std::min(rank+1,num_p-1)].getDFD();
  1400. std::vector<TreeNode*> nodes=this->GetNodeList();
  1401. node_comm_data=(CommData*)this->memgr.malloc(sizeof(CommData)*nodes.size());
  1402. #pragma omp parallel for
  1403. for(size_t tid=0;tid<omp_p;tid++){
  1404. std::vector<MortonId> nbr_lst;
  1405. size_t a=(nodes.size()* tid )/omp_p;
  1406. size_t b=(nodes.size()*(tid+1))/omp_p;
  1407. for(size_t i=a;i<b;i++){
  1408. bool shared=false;
  1409. CommData& comm_data=node_comm_data[i];
  1410. comm_data.node=nodes[i];
  1411. comm_data.mid=comm_data.node->GetMortonId();
  1412. comm_data.usr_cnt=0;
  1413. if(comm_data.node->IsGhost()) continue;
  1414. if(comm_data.node->Depth()==0) continue;
  1415. if(comm_data.mid.getDFD()<mins_r0) continue;
  1416. MortonId mid0=comm_data.mid. getDFD();
  1417. MortonId mid1=comm_data.mid.NextId().getDFD();
  1418. comm_data.mid.NbrList(nbr_lst,comm_data.node->Depth()-1, bndry==Periodic);
  1419. comm_data.usr_cnt=nbr_lst.size();
  1420. for(size_t j=0;j<nbr_lst.size();j++){
  1421. MortonId usr_mid=nbr_lst[j];
  1422. MortonId usr_mid_dfd=usr_mid.getDFD();
  1423. comm_data.usr_mid[j]=usr_mid;
  1424. comm_data.usr_pid[j]=std::upper_bound(&mins[0],&mins[num_p],usr_mid_dfd)-&mins[0]-1;
  1425. // if(usr_mid_dfd<mins_r0 || (rank+1<num_p && usr_mid_dfd>=mins_r1)){ // Find the user pid.
  1426. // size_t usr_pid=std::upper_bound(&mins[0],&mins[num_p],usr_mid_dfd)-&mins[0]-1;
  1427. // comm_data.usr_pid[j]=usr_pid;
  1428. // }else comm_data.usr_pid[j]=rank;
  1429. if(!shared){ // Check if this node needs to be transferred during broadcast.
  1430. if(comm_data.usr_pid[j]!=rank || (rank+1<num_p && usr_mid.NextId()>mins_r1) ){
  1431. shared=true;
  1432. }
  1433. }
  1434. }
  1435. #pragma omp critical (ADD_SHARED)
  1436. if(shared){
  1437. for(size_t j=0;j<comm_data.usr_cnt;j++)
  1438. if(comm_data.usr_pid[j]!=rank){
  1439. bool unique_pid=true;
  1440. for(size_t k=0;k<j;k++){
  1441. if(comm_data.usr_pid[j]==comm_data.usr_pid[k]){
  1442. unique_pid=false;
  1443. break;
  1444. }
  1445. }
  1446. if(unique_pid){
  1447. par::SortPair<size_t,size_t> p;
  1448. p.key=comm_data.usr_pid[j];
  1449. p.data=shared_data.size();
  1450. pid_node_pair.push_back(p);
  1451. }
  1452. }
  1453. shared_data.push_back(&comm_data);
  1454. }
  1455. }
  1456. }
  1457. omp_par::merge_sort(&pid_node_pair[0], &pid_node_pair[pid_node_pair.size()]);
  1458. //std::cout<<rank<<' '<<shared_data.size()<<' '<<pid_node_pair.size()<<'\n';
  1459. }
  1460. //Profile::Toc();
  1461. //Profile::Tic("PackNodes", &comm, false, 5);
  1462. { // Pack shared nodes.
  1463. #pragma omp parallel for
  1464. for(size_t tid=0;tid<omp_p;tid++){
  1465. size_t buff_length=10l*1024l*1024l; // 10MB buffer per thread.
  1466. char* buff=(char*)this->memgr.malloc(buff_length);
  1467. size_t a=( tid *shared_data.size())/omp_p;
  1468. size_t b=((tid+1)*shared_data.size())/omp_p;
  1469. for(size_t i=a;i<b;i++){
  1470. CommData& comm_data=*(CommData*)shared_data[i];
  1471. PackedData p0=comm_data.node->Pack(true,buff);
  1472. assert(p0.length<buff_length);
  1473. shared_data[i]=this->memgr.malloc(sizeof(CommData)+p0.length);
  1474. CommData& new_comm_data=*(CommData*)shared_data[i];
  1475. new_comm_data=comm_data;
  1476. new_comm_data.pkd_length=sizeof(CommData)+p0.length;
  1477. mem::memcopy(((char*)shared_data[i])+sizeof(CommData),buff,p0.length);
  1478. }
  1479. this->memgr.free(buff);
  1480. }
  1481. // now CommData is stored in shared_data
  1482. this->memgr.free(node_comm_data);
  1483. node_comm_data=NULL;
  1484. }
  1485. //Profile::Toc();
  1486. //Profile::Tic("SendBuff", &comm, false, 5);
  1487. std::vector<MPI_size_t> send_size(num_p,0);
  1488. std::vector<MPI_size_t> send_disp(num_p,0);
  1489. if(pid_node_pair.size()){ // Build send_buff.
  1490. std::vector<size_t> size(pid_node_pair.size(),0);
  1491. std::vector<size_t> disp(pid_node_pair.size(),0);
  1492. #pragma omp parallel for
  1493. for(size_t i=0;i<pid_node_pair.size();i++){
  1494. size[i]=((CommData*)shared_data[pid_node_pair[i].data])->pkd_length;
  1495. }
  1496. omp_par::scan(&size[0],&disp[0],pid_node_pair.size());
  1497. // Resize send_buff.
  1498. if(send_buff.size()<size[pid_node_pair.size()-1]+disp[pid_node_pair.size()-1]){
  1499. send_buff.resize(size[pid_node_pair.size()-1]+disp[pid_node_pair.size()-1]);
  1500. }
  1501. // Copy data to send_buff.
  1502. #pragma omp parallel for
  1503. for(size_t i=0;i<pid_node_pair.size();i++){
  1504. size_t shrd_idx=pid_node_pair[i].data;
  1505. mem::memcopy(&send_buff[disp[i]], shared_data[shrd_idx], size[i]);
  1506. }
  1507. // Compute send_size, send_disp.
  1508. {
  1509. // Compute send_size.
  1510. #pragma omp parallel for
  1511. for(size_t tid=0;tid<omp_p;tid++){
  1512. size_t a=(pid_node_pair.size()* tid )/omp_p;
  1513. size_t b=(pid_node_pair.size()*(tid+1))/omp_p;
  1514. if(a>0 && a<pid_node_pair.size()){
  1515. size_t p0=pid_node_pair[a].key;
  1516. while(a<pid_node_pair.size() && p0==pid_node_pair[a].key) a++;
  1517. }
  1518. if(b>0 && b<pid_node_pair.size()){
  1519. size_t p1=pid_node_pair[b].key;
  1520. while(b<pid_node_pair.size() && p1==pid_node_pair[b].key) b++;
  1521. }
  1522. for(size_t i=a;i<b;i++){
  1523. send_size[pid_node_pair[i].key]+=size[i];
  1524. }
  1525. }
  1526. // Compute send_disp.
  1527. omp_par::scan(&send_size[0],&send_disp[0],num_p);
  1528. }
  1529. }
  1530. //Profile::Toc();
  1531. //Profile::Tic("A2A_Sparse", &comm, true, 5);
  1532. size_t recv_length=0;
  1533. { // Allocate recv_buff.
  1534. std::vector<MPI_size_t> recv_size(num_p,0);
  1535. std::vector<MPI_size_t> recv_disp(num_p,0);
  1536. MPI_Alltoall(&send_size[0], 1, par::Mpi_datatype<MPI_size_t>::value(),
  1537. &recv_size[0], 1, par::Mpi_datatype<MPI_size_t>::value(), *Comm());
  1538. omp_par::scan(&recv_size[0],&recv_disp[0],num_p);
  1539. recv_length=recv_size[num_p-1]+recv_disp[num_p-1];
  1540. if(recv_buff.size()<recv_length){
  1541. recv_buff.resize(recv_length);
  1542. }
  1543. par::Mpi_Alltoallv_sparse(&send_buff[0], &send_size[0], &send_disp[0],
  1544. &recv_buff[0], &recv_size[0], &recv_disp[0], *Comm());
  1545. }
  1546. //Profile::Toc();
  1547. //Profile::Tic("Unpack", &comm, false, 5);
  1548. std::vector<void*> recv_data; // CommData for received nodes.
  1549. { // Unpack received octants.
  1550. std::vector<par::SortPair<MortonId,size_t> > mid_indx_pair;
  1551. for(size_t i=0; i<recv_length;){
  1552. CommData& comm_data=*(CommData*)&recv_buff[i];
  1553. recv_data.push_back(&comm_data);
  1554. { // Add mid_indx_pair
  1555. par::SortPair<MortonId,size_t> p;
  1556. p.key=comm_data.mid;
  1557. p.data=mid_indx_pair.size();
  1558. mid_indx_pair.push_back(p);
  1559. }
  1560. i+=comm_data.pkd_length;
  1561. assert(comm_data.pkd_length>0);
  1562. }
  1563. std::vector<Node_t*> recv_nodes(recv_data.size());
  1564. { // Find received octants in tree.
  1565. omp_par::merge_sort(&mid_indx_pair[0], &mid_indx_pair[0]+mid_indx_pair.size());
  1566. std::vector<size_t> indx(omp_p+1);
  1567. for(size_t i=0;i<=omp_p;i++){
  1568. size_t j=(mid_indx_pair.size()*i)/omp_p;
  1569. if(j>0) while(j<mid_indx_pair.size()-1){
  1570. if(mid_indx_pair[j+1].key.GetDepth()<=
  1571. mid_indx_pair[j].key.GetDepth()) break;
  1572. j++;
  1573. }
  1574. indx[i]=j;
  1575. }
  1576. int nchld=(1UL<<this->Dim()); // Number of children.
  1577. if(mid_indx_pair.size()>0)
  1578. for(size_t tid=1;tid<omp_p;tid++){
  1579. size_t j=indx[tid];
  1580. MortonId& mid=mid_indx_pair[j].key;
  1581. Node_t* srch_node=this->RootNode();
  1582. while(srch_node->GetMortonId()!=mid){
  1583. Node_t* ch_node;
  1584. if(srch_node->IsLeaf()){
  1585. srch_node->SetGhost(true);
  1586. srch_node->Subdivide();
  1587. }
  1588. for(int j=nchld-1;j>=0;j--){
  1589. ch_node=(Node_t*)srch_node->Child(j);
  1590. if(ch_node->GetMortonId()<=mid){
  1591. srch_node=ch_node;
  1592. break;
  1593. }
  1594. }
  1595. }
  1596. }
  1597. #pragma omp parallel for
  1598. for(size_t tid=0;tid<omp_p;tid++){
  1599. size_t a=indx[tid ];
  1600. size_t b=indx[tid+1];
  1601. for(size_t j=a;j<b;j++){ // Find shared nodes.
  1602. size_t i=mid_indx_pair[j].data;
  1603. MortonId& mid=mid_indx_pair[j].key;
  1604. Node_t* srch_node=this->RootNode();
  1605. while(srch_node->GetMortonId()!=mid){
  1606. Node_t* ch_node;
  1607. if(srch_node->IsLeaf()){
  1608. srch_node->SetGhost(true);
  1609. srch_node->Subdivide();
  1610. }
  1611. for(int j=nchld-1;j>=0;j--){
  1612. ch_node=(Node_t*)srch_node->Child(j);
  1613. if(ch_node->GetMortonId()<=mid){
  1614. srch_node=ch_node;
  1615. break;
  1616. }
  1617. }
  1618. }
  1619. recv_nodes[i]=srch_node;
  1620. }
  1621. }
  1622. }
  1623. #pragma omp parallel for
  1624. for(size_t i=0;i<recv_data.size();i++){ // Unpack
  1625. if(!recv_nodes[i]->IsGhost()) continue;
  1626. assert(recv_nodes[i]->IsGhost());
  1627. CommData& comm_data=*(CommData*)recv_data[i];
  1628. PackedData p;
  1629. p.data=((char*)recv_data[i])+sizeof(CommData);
  1630. p.length=comm_data.pkd_length-sizeof(CommData);
  1631. recv_nodes[i]->Unpack(p);
  1632. }
  1633. }
  1634. //Profile::Toc();
  1635. //Profile::Tic("Broadcast", &comm, true, 5);
  1636. { // Broadcast octants.
  1637. std::vector<MortonId> shrd_mid;
  1638. if(rank+1<num_p){ // Set shrd_mid.
  1639. MortonId m=mins[rank+1];
  1640. while(m.GetDepth()>0 && m.getDFD()>=mins[rank+1]){
  1641. m=m.getAncestor(m.GetDepth()-1);
  1642. }
  1643. size_t d=m.GetDepth()+1;
  1644. shrd_mid.resize(d);
  1645. for(size_t i=0;i<d;i++){
  1646. shrd_mid[i]=m.getAncestor(i);
  1647. }
  1648. }
  1649. std::vector<void*> shrd_data; // CommData for shared nodes.
  1650. { // Set shrd_data
  1651. for(size_t i=0;i<shared_data.size();i++){
  1652. CommData& comm_data=*(CommData*)shared_data[i];
  1653. assert(comm_data.mid.GetDepth()>0);
  1654. size_t d=comm_data.mid.GetDepth()-1;
  1655. if(d<shrd_mid.size() && shrd_mid[d].getDFD()>=mins[rank])
  1656. for(size_t j=0;j<comm_data.usr_cnt;j++){
  1657. if(comm_data.usr_mid[j]==shrd_mid[d]){
  1658. shrd_data.push_back(&comm_data);
  1659. break;
  1660. }
  1661. }
  1662. if(shrd_data.size()==0 || shrd_data.back()!=&comm_data) this->memgr.free(&comm_data);
  1663. }
  1664. for(size_t i=0;i<recv_data.size();i++){
  1665. CommData& comm_data=*(CommData*)recv_data[i];
  1666. assert(comm_data.mid.GetDepth()>0);
  1667. size_t d=comm_data.mid.GetDepth()-1;
  1668. if(d<shrd_mid.size() && shrd_mid[d].getDFD()>=mins[rank])
  1669. for(size_t j=0;j<comm_data.usr_cnt;j++){
  1670. if(comm_data.usr_mid[j]==shrd_mid[d]){
  1671. char* data_ptr=(char*)this->memgr.malloc(comm_data.pkd_length);
  1672. mem::memcopy(data_ptr, &comm_data, comm_data.pkd_length);
  1673. shrd_data.push_back(data_ptr);
  1674. break;
  1675. }
  1676. }
  1677. }
  1678. }
  1679. size_t pid_shift=1;
  1680. while(pid_shift<num_p){
  1681. MPI_size_t recv_pid=(rank>=pid_shift?rank-pid_shift:rank);
  1682. MPI_size_t send_pid=(rank+pid_shift<num_p?rank+pid_shift:rank);
  1683. MPI_size_t send_length=0;
  1684. if(send_pid!=rank){ // Send data for send_pid
  1685. std::vector<void*> send_data;
  1686. std::vector<size_t> send_size;
  1687. for(size_t i=0; i<shrd_data.size();i++){
  1688. CommData& comm_data=*(CommData*)shrd_data[i];
  1689. size_t d=comm_data.mid.GetDepth()-1;
  1690. bool shared=(d<shrd_mid.size() && shrd_mid[d].NextId().getDFD()>mins[send_pid].getDFD());
  1691. if(shared) for(size_t j=0;j<comm_data.usr_cnt;j++){ // if send_pid already has this node then skip
  1692. if(comm_data.usr_pid[j]==send_pid){
  1693. shared=false;
  1694. break;
  1695. }
  1696. }
  1697. if(!shared) continue;
  1698. send_data.push_back(&comm_data);
  1699. send_size.push_back(comm_data.pkd_length);
  1700. }
  1701. std::vector<size_t> send_disp(send_data.size(),0);
  1702. omp_par::scan(&send_size[0],&send_disp[0],send_data.size());
  1703. if(send_data.size()>0) send_length=send_size.back()+send_disp.back();
  1704. // Resize send_buff.
  1705. if(send_buff.size()<send_length){
  1706. send_buff.resize(send_length);
  1707. }
  1708. // Copy data to send_buff.
  1709. #pragma omp parallel for
  1710. for(size_t i=0;i<send_data.size();i++){
  1711. CommData& comm_data=*(CommData*)send_data[i];
  1712. mem::memcopy(&send_buff[send_disp[i]], &comm_data, comm_data.pkd_length);
  1713. }
  1714. }
  1715. MPI_size_t recv_length=0;
  1716. { // Send-Recv data
  1717. MPI_Request request;
  1718. MPI_Status status;
  1719. if(recv_pid!=rank) MPI_Irecv(&recv_length, 1, par::Mpi_datatype<MPI_size_t>::value(),recv_pid, 1, *Comm(), &request);
  1720. if(send_pid!=rank) MPI_Send (&send_length, 1, par::Mpi_datatype<MPI_size_t>::value(),send_pid, 1, *Comm());
  1721. if(recv_pid!=rank) MPI_Wait(&request, &status);
  1722. // Resize recv_buff
  1723. if(recv_buff.size()<recv_length){
  1724. recv_buff.resize(recv_length);
  1725. }
  1726. if(recv_length>0) MPI_Irecv(&recv_buff[0], recv_length, par::Mpi_datatype<char>::value(),recv_pid, 1, *Comm(), &request);
  1727. if(send_length>0) MPI_Send (&send_buff[0], send_length, par::Mpi_datatype<char>::value(),send_pid, 1, *Comm());
  1728. if(recv_length>0) MPI_Wait(&request, &status);
  1729. }
  1730. std::vector<void*> recv_data; // CommData for received nodes.
  1731. { // Unpack received octants.
  1732. std::vector<par::SortPair<MortonId,size_t> > mid_indx_pair;
  1733. for(size_t i=0; i<recv_length;){
  1734. CommData& comm_data=*(CommData*)&recv_buff[i];
  1735. recv_data.push_back(&comm_data);
  1736. { // Add mid_indx_pair
  1737. par::SortPair<MortonId,size_t> p;
  1738. p.key=comm_data.mid;
  1739. p.data=mid_indx_pair.size();
  1740. mid_indx_pair.push_back(p);
  1741. }
  1742. i+=comm_data.pkd_length;
  1743. assert(comm_data.pkd_length>0);
  1744. }
  1745. std::vector<Node_t*> recv_nodes(recv_data.size());
  1746. int nchld=(1UL<<this->Dim()); // Number of children.
  1747. // for(size_t i=0;i<recv_data.size();i++){ // Find received octants in tree.
  1748. // CommData& comm_data=*(CommData*)recv_data[i];
  1749. // MortonId& mid=comm_data.mid;
  1750. // Node_t* srch_node=this->RootNode();
  1751. // while(srch_node->GetMortonId()!=mid){
  1752. // Node_t* ch_node;
  1753. // if(srch_node->IsLeaf()){
  1754. // srch_node->SetGhost(true);
  1755. // srch_node->Subdivide();
  1756. // }
  1757. // for(int j=nchld-1;j>=0;j--){
  1758. // ch_node=(Node_t*)srch_node->Child(j);
  1759. // if(ch_node->GetMortonId()<=mid){
  1760. // srch_node=ch_node;
  1761. // break;
  1762. // }
  1763. // }
  1764. // }
  1765. // recv_nodes[i]=srch_node;
  1766. // }
  1767. { // Find received octants in tree.
  1768. omp_par::merge_sort(&mid_indx_pair[0], &mid_indx_pair[0]+mid_indx_pair.size());
  1769. std::vector<size_t> indx(omp_p+1);
  1770. for(size_t i=0;i<=omp_p;i++){
  1771. size_t j=(mid_indx_pair.size()*i)/omp_p;
  1772. if(j>0) while(j<mid_indx_pair.size()-1){
  1773. if(mid_indx_pair[j+1].key.GetDepth()<=
  1774. mid_indx_pair[j].key.GetDepth()) break;
  1775. j++;
  1776. }
  1777. indx[i]=j;
  1778. }
  1779. int nchld=(1UL<<this->Dim()); // Number of children.
  1780. if(mid_indx_pair.size()>0)
  1781. for(size_t tid=1;tid<omp_p;tid++){
  1782. size_t j=indx[tid];
  1783. MortonId& mid=mid_indx_pair[j].key;
  1784. Node_t* srch_node=this->RootNode();
  1785. while(srch_node->GetMortonId()!=mid){
  1786. Node_t* ch_node;
  1787. if(srch_node->IsLeaf()){
  1788. srch_node->SetGhost(true);
  1789. srch_node->Subdivide();
  1790. }
  1791. for(int j=nchld-1;j>=0;j--){
  1792. ch_node=(Node_t*)srch_node->Child(j);
  1793. if(ch_node->GetMortonId()<=mid){
  1794. srch_node=ch_node;
  1795. break;
  1796. }
  1797. }
  1798. }
  1799. }
  1800. #pragma omp parallel for
  1801. for(size_t tid=0;tid<omp_p;tid++){
  1802. size_t a=indx[tid ];
  1803. size_t b=indx[tid+1];
  1804. for(size_t j=a;j<b;j++){ // Find shared nodes.
  1805. size_t i=mid_indx_pair[j].data;
  1806. MortonId& mid=mid_indx_pair[j].key;
  1807. Node_t* srch_node=this->RootNode();
  1808. while(srch_node->GetMortonId()!=mid){
  1809. Node_t* ch_node;
  1810. if(srch_node->IsLeaf()){
  1811. srch_node->SetGhost(true);
  1812. srch_node->Subdivide();
  1813. }
  1814. for(int j=nchld-1;j>=0;j--){
  1815. ch_node=(Node_t*)srch_node->Child(j);
  1816. if(ch_node->GetMortonId()<=mid){
  1817. srch_node=ch_node;
  1818. break;
  1819. }
  1820. }
  1821. }
  1822. recv_nodes[i]=srch_node;
  1823. }
  1824. }
  1825. }
  1826. #pragma omp parallel for
  1827. for(size_t i=0;i<recv_data.size();i++){
  1828. if(!recv_nodes[i]->IsGhost()) continue;
  1829. assert(recv_nodes[i]->IsGhost());
  1830. CommData& comm_data=*(CommData*)recv_data[i];
  1831. PackedData p;
  1832. p.data=((char*)recv_data[i])+sizeof(CommData);
  1833. p.length=comm_data.pkd_length-sizeof(CommData);
  1834. recv_nodes[i]->Unpack(p);
  1835. }
  1836. }
  1837. pid_shift<<=1;
  1838. send_pid=(rank+pid_shift<num_p?rank+pid_shift:rank);
  1839. if(send_pid!=rank){ // Set shrd_data
  1840. for(size_t i=0;i<recv_data.size();i++){
  1841. CommData& comm_data=*(CommData*)recv_data[i];
  1842. //{ // Skip if this node already exists.
  1843. // bool skip=false;
  1844. // for(size_t k=0;k<shrd_data.size();k++){
  1845. // CommData& comm_data_=*(CommData*)shrd_data[k];
  1846. // if(comm_data_.mid==comm_data.mid){
  1847. // assert(false);
  1848. // skip=true;
  1849. // break;
  1850. // }
  1851. // }
  1852. // if(skip) continue;
  1853. //}
  1854. assert(comm_data.mid.GetDepth()>0);
  1855. size_t d=comm_data.mid.GetDepth()-1;
  1856. if(d<shrd_mid.size() && shrd_mid[d].isAncestor(mins[rank]) && shrd_mid[d].NextId().getDFD()>mins[send_pid].getDFD())
  1857. for(size_t j=0;j<comm_data.usr_cnt;j++){
  1858. if(comm_data.usr_mid[j]==shrd_mid[d]){
  1859. char* data_ptr=(char*)this->memgr.malloc(comm_data.pkd_length);
  1860. mem::memcopy(data_ptr, &comm_data, comm_data.pkd_length);
  1861. shrd_data.push_back(data_ptr);
  1862. break;
  1863. }
  1864. }
  1865. }
  1866. }
  1867. }
  1868. // Free data
  1869. //Profile::Tic("Free", &comm, false, 5);
  1870. for(size_t i=0;i<shrd_data.size();i++) this->memgr.free(shrd_data[i]);
  1871. //Profile::Toc();
  1872. }
  1873. //Profile::Toc();
  1874. }
  1875. inline bool isLittleEndian(){
  1876. uint16_t number = 0x1;
  1877. uint8_t *numPtr = (uint8_t*)&number;
  1878. return (numPtr[0] == 1);
  1879. }
  1880. template <class TreeNode>
  1881. void MPI_Tree<TreeNode>::Write2File(const char* fname, int lod){
  1882. typedef double VTKData_t;
  1883. int myrank, np;
  1884. MPI_Comm_size(*Comm(),&np);
  1885. MPI_Comm_rank(*Comm(),&myrank);
  1886. std::vector<Real_t> coord_; //Coordinates of octant corners.
  1887. std::vector<Real_t> value_; //Data value at points.
  1888. std::vector<VTKData_t> coord; //Coordinates of octant corners.
  1889. std::vector<VTKData_t> value; //Data value at points.
  1890. std::vector<int32_t> mpi_rank; //MPI_Rank at points.
  1891. std::vector<int32_t> connect; //Cell connectivity.
  1892. std::vector<int32_t> offset ; //Cell offset.
  1893. std::vector<uint8_t> types ; //Cell types.
  1894. //Build list of octant corner points.
  1895. Node_t* n=this->PreorderFirst();
  1896. while(n!=NULL){
  1897. if(!n->IsGhost() && n->IsLeaf())
  1898. n->VTU_Data(coord_, value_, connect, offset, types, lod);
  1899. n=this->PreorderNxt(n);
  1900. }
  1901. for(size_t i=0;i<coord_.size();i++) coord.push_back(coord_[i]);
  1902. for(size_t i=0;i<value_.size();i++) value.push_back(value_[i]);
  1903. int pt_cnt=coord.size()/COORD_DIM;
  1904. int dof=(pt_cnt?value.size()/pt_cnt:0);
  1905. assert(value.size()==(size_t)pt_cnt*dof);
  1906. int cell_cnt=types.size();
  1907. mpi_rank.resize(pt_cnt);
  1908. int new_myrank=myrank;//rand();
  1909. for(int i=0;i<pt_cnt;i++) mpi_rank[i]=new_myrank;
  1910. //Open file for writing.
  1911. std::stringstream vtufname;
  1912. vtufname<<fname<<std::setfill('0')<<std::setw(6)<<myrank<<".vtu";
  1913. std::ofstream vtufile;
  1914. vtufile.open(vtufname.str().c_str());
  1915. if(vtufile.fail()) return;
  1916. //Proceed to write to file.
  1917. size_t data_size=0;
  1918. vtufile<<"<?xml version=\"1.0\"?>\n";
  1919. if(isLittleEndian()) vtufile<<"<VTKFile type=\"UnstructuredGrid\" version=\"0.1\" byte_order=\"LittleEndian\">\n";
  1920. else vtufile<<"<VTKFile type=\"UnstructuredGrid\" version=\"0.1\" byte_order=\"BigEndian\">\n";
  1921. //===========================================================================
  1922. vtufile<<" <UnstructuredGrid>\n";
  1923. vtufile<<" <Piece NumberOfPoints=\""<<pt_cnt<<"\" NumberOfCells=\""<<cell_cnt<<"\">\n";
  1924. //---------------------------------------------------------------------------
  1925. vtufile<<" <Points>\n";
  1926. vtufile<<" <DataArray type=\"Float"<<sizeof(VTKData_t)*8<<"\" NumberOfComponents=\""<<COORD_DIM<<"\" Name=\"Position\" format=\"appended\" offset=\""<<data_size<<"\" />\n";
  1927. data_size+=sizeof(uint32_t)+coord.size()*sizeof(VTKData_t);
  1928. vtufile<<" </Points>\n";
  1929. //---------------------------------------------------------------------------
  1930. vtufile<<" <PointData>\n";
  1931. vtufile<<" <DataArray type=\"Float"<<sizeof(VTKData_t)*8<<"\" NumberOfComponents=\""<<dof<<"\" Name=\"value\" format=\"appended\" offset=\""<<data_size<<"\" />\n";
  1932. data_size+=sizeof(uint32_t)+value.size()*sizeof(VTKData_t);
  1933. vtufile<<" <DataArray type=\"Int32\" NumberOfComponents=\"1\" Name=\"mpi_rank\" format=\"appended\" offset=\""<<data_size<<"\" />\n";
  1934. data_size+=sizeof(uint32_t)+mpi_rank.size()*sizeof(int32_t);
  1935. vtufile<<" </PointData>\n";
  1936. //---------------------------------------------------------------------------
  1937. //---------------------------------------------------------------------------
  1938. vtufile<<" <Cells>\n";
  1939. vtufile<<" <DataArray type=\"Int32\" Name=\"connectivity\" format=\"appended\" offset=\""<<data_size<<"\" />\n";
  1940. data_size+=sizeof(uint32_t)+connect.size()*sizeof(int32_t);
  1941. vtufile<<" <DataArray type=\"Int32\" Name=\"offsets\" format=\"appended\" offset=\""<<data_size<<"\" />\n";
  1942. data_size+=sizeof(uint32_t)+offset.size() *sizeof(int32_t);
  1943. vtufile<<" <DataArray type=\"UInt8\" Name=\"types\" format=\"appended\" offset=\""<<data_size<<"\" />\n";
  1944. data_size+=sizeof(uint32_t)+types.size() *sizeof(uint8_t);
  1945. vtufile<<" </Cells>\n";
  1946. //---------------------------------------------------------------------------
  1947. //vtufile<<" <CellData>\n";
  1948. //vtufile<<" <DataArray type=\"Float"<<sizeof(VTKData_t)*8<<"\" Name=\"Velocity\" format=\"appended\" offset=\""<<data_size<<"\" />\n";
  1949. //vtufile<<" </CellData>\n";
  1950. //---------------------------------------------------------------------------
  1951. vtufile<<" </Piece>\n";
  1952. vtufile<<" </UnstructuredGrid>\n";
  1953. //===========================================================================
  1954. vtufile<<" <AppendedData encoding=\"raw\">\n";
  1955. vtufile<<" _";
  1956. int32_t block_size;
  1957. block_size=coord .size()*sizeof(VTKData_t); vtufile.write((char*)&block_size, sizeof(int32_t)); vtufile.write((char*)&coord [0], coord .size()*sizeof(VTKData_t));
  1958. block_size=value .size()*sizeof(VTKData_t); vtufile.write((char*)&block_size, sizeof(int32_t)); vtufile.write((char*)&value [0], value .size()*sizeof(VTKData_t));
  1959. block_size=mpi_rank.size()*sizeof( int32_t); vtufile.write((char*)&block_size, sizeof(int32_t)); vtufile.write((char*)&mpi_rank[0], mpi_rank.size()*sizeof( int32_t));
  1960. block_size=connect.size()*sizeof(int32_t); vtufile.write((char*)&block_size, sizeof(int32_t)); vtufile.write((char*)&connect[0], connect.size()*sizeof(int32_t));
  1961. block_size=offset .size()*sizeof(int32_t); vtufile.write((char*)&block_size, sizeof(int32_t)); vtufile.write((char*)&offset [0], offset .size()*sizeof(int32_t));
  1962. block_size=types .size()*sizeof(uint8_t); vtufile.write((char*)&block_size, sizeof(int32_t)); vtufile.write((char*)&types [0], types .size()*sizeof(uint8_t));
  1963. vtufile<<"\n";
  1964. vtufile<<" </AppendedData>\n";
  1965. //===========================================================================
  1966. vtufile<<"</VTKFile>\n";
  1967. vtufile.close();
  1968. if(myrank) return;
  1969. std::stringstream pvtufname;
  1970. pvtufname<<fname<<".pvtu";
  1971. std::ofstream pvtufile;
  1972. pvtufile.open(pvtufname.str().c_str());
  1973. if(pvtufile.fail()) return;
  1974. pvtufile<<"<?xml version=\"1.0\"?>\n";
  1975. pvtufile<<"<VTKFile type=\"PUnstructuredGrid\">\n";
  1976. pvtufile<<" <PUnstructuredGrid GhostLevel=\"0\">\n";
  1977. pvtufile<<" <PPoints>\n";
  1978. pvtufile<<" <PDataArray type=\"Float"<<sizeof(VTKData_t)*8<<"\" NumberOfComponents=\""<<COORD_DIM<<"\" Name=\"Position\"/>\n";
  1979. pvtufile<<" </PPoints>\n";
  1980. pvtufile<<" <PPointData>\n";
  1981. pvtufile<<" <PDataArray type=\"Float"<<sizeof(VTKData_t)*8<<"\" NumberOfComponents=\""<<dof<<"\" Name=\"value\"/>\n";
  1982. pvtufile<<" <PDataArray type=\"Int32\" NumberOfComponents=\"1\" Name=\"mpi_rank\"/>\n";
  1983. pvtufile<<" </PPointData>\n";
  1984. {
  1985. // Extract filename from path.
  1986. std::stringstream vtupath;
  1987. vtupath<<'/'<<fname<<'\0';
  1988. char *fname_ = (char*)strrchr(vtupath.str().c_str(), '/') + 1;
  1989. //std::string fname_ = boost::filesystem::path(fname).filename().string().
  1990. for(int i=0;i<np;i++) pvtufile<<" <Piece Source=\""<<fname_<<std::setfill('0')<<std::setw(6)<<i<<".vtu\"/>\n";
  1991. }
  1992. pvtufile<<" </PUnstructuredGrid>\n";
  1993. pvtufile<<"</VTKFile>\n";
  1994. pvtufile.close();
  1995. }
  1996. template <class TreeNode>
  1997. const std::vector<MortonId>& MPI_Tree<TreeNode>::GetMins(){
  1998. Node_t* n=this->PreorderFirst();
  1999. while(n!=NULL){
  2000. if(!n->IsGhost() && n->IsLeaf()) break;
  2001. n=this->PreorderNxt(n);
  2002. }
  2003. ASSERT_WITH_MSG(n!=NULL,"No non-ghost nodes found on this process.");
  2004. MortonId my_min;
  2005. my_min=n->GetMortonId();
  2006. int np;
  2007. MPI_Comm_size(*Comm(),&np);
  2008. mins.resize(np);
  2009. MPI_Allgather(&my_min , 1, par::Mpi_datatype<MortonId>::value(),
  2010. &mins[0], 1, par::Mpi_datatype<MortonId>::value(), *Comm());
  2011. return mins;
  2012. }
  2013. }//end namespace