mpi_tree.txx 75 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252
  1. /**
  2. * \file mpi_tree.txx
  3. * \author Dhairya Malhotra, dhairya.malhotra@gmail.com
  4. * \date 12-11-2010
  5. * \brief This file contains the implementation of the class MPI_Tree.
  6. */
  7. #include <omp.h>
  8. #include <cmath>
  9. #include <cstdlib>
  10. #include <cassert>
  11. #include <string>
  12. #include <sstream>
  13. #include <iostream>
  14. #include <iomanip>
  15. #include <fstream>
  16. #include <algorithm>
  17. #include <stdint.h>
  18. #include <set>
  19. #include <dtypes.h>
  20. #include <ompUtils.h>
  21. #include <parUtils.h>
  22. #include <mem_mgr.hpp>
  23. #include <mpi_node.hpp>
  24. #include <profile.hpp>
  25. namespace pvfmm{
  26. /**
  27. * @author Dhairya Malhotra, dhairya.malhotra@gmail.com
  28. * @date 08 Feb 2011
  29. */
/**
 * Build a local linear octree (list of leaf octants) over a sorted array of
 * point MortonIds such that each leaf contains at most maxNumPts points,
 * unless that leaf is already at depth maxDepth-1 and cannot be refined.
 *
 * \param[in]     nodes     sorted MortonIds of the local points.
 * \param[in,out] leaves    on input: [first,last] octants bounding the range
 *                          to cover; on output: the resulting leaf list.
 * \param[in]     maxNumPts maximum number of points per octant.
 * \param[in]     maxDepth  maximum tree depth (must be <= MAX_DEPTH).
 * \param[in]     complete  if true, also append the (point-free) octants
 *                          needed to complete the region up to last_node.
 * \return 0 on success.
 */
inline int p2oLocal(Vector<MortonId> & nodes, Vector<MortonId>& leaves,
    unsigned int maxNumPts, unsigned int maxDepth, bool complete) {
  assert(maxDepth<=MAX_DEPTH);
  std::vector<MortonId> leaves_lst;
  unsigned int init_size=leaves.Dim();
  unsigned int num_pts=nodes.Dim();
  MortonId curr_node=leaves[0];                    // octant currently being emitted
  MortonId last_node=leaves[init_size-1].NextId(); // one-past-the-end of the covered range
  MortonId next_node;
  unsigned int curr_pt=0;                 // index of the first point inside curr_node
  unsigned int next_pt=curr_pt+maxNumPts; // tentative one-past-the-last point of curr_node
  while(next_pt <= num_pts){
    next_node = curr_node.NextId();
    // Refine curr_node (descend to its deepest-first descendant) until it
    // holds no more than maxNumPts points or reaches depth maxDepth-1.
    while( next_pt < num_pts && next_node > nodes[next_pt] && curr_node.GetDepth() < maxDepth-1 ){
      curr_node = curr_node.getDFD(curr_node.GetDepth()+1);
      next_node = curr_node.NextId();
    }
    leaves_lst.push_back(curr_node);
    curr_node = next_node;
    unsigned int inc=maxNumPts;
    while(next_pt < num_pts && curr_node > nodes[next_pt]){
      // We have more than maxNumPts points per octant because the node can
      // not be refined any further.  Grow the step geometrically to find the
      // end of this octant's points quickly (a galloping search).
      inc=inc<<1;
      next_pt+=inc;
      if(next_pt > num_pts){
        next_pt = num_pts;
        break;
      }
    }
    // Binary-search for the first point that belongs to the new curr_node.
    curr_pt = std::lower_bound(&nodes[0]+curr_pt,&nodes[0]+next_pt,curr_node,std::less<MortonId>())-&nodes[0];
    if(curr_pt >= num_pts) break;
    next_pt = curr_pt + maxNumPts;
    if(next_pt > num_pts) next_pt = num_pts;
  }
#ifndef NDEBUG
  // Sanity checks: every leaf holds <= maxNumPts points (unless at maximum
  // refinement), and the leaf list exactly covers all points.
  for(size_t i=0;i<leaves_lst.size();i++){
    size_t a=std::lower_bound(&nodes[0],&nodes[0]+nodes.Dim(),leaves_lst[i],std::less<MortonId>())-&nodes[0];
    size_t b=std::lower_bound(&nodes[0],&nodes[0]+nodes.Dim(),leaves_lst[i].NextId(),std::less<MortonId>())-&nodes[0];
    assert(b-a<=maxNumPts || leaves_lst[i].GetDepth()==maxDepth-1);
    if(i==leaves_lst.size()-1) assert(b==nodes.Dim() && a<nodes.Dim());
    if(i==0) assert(a==0);
  }
#endif
  if(complete)
  // Append empty octants (as coarse as possible without overshooting
  // last_node) to complete the region after the last point-bearing leaf.
  while(curr_node<last_node){
    while( curr_node.NextId() > last_node && curr_node.GetDepth() < maxDepth-1 )
      curr_node = curr_node.getDFD(curr_node.GetDepth()+1);
    leaves_lst.push_back(curr_node);
    curr_node = curr_node.NextId();
  }
  leaves=leaves_lst;
  return 0;
}
/**
 * Construct a distributed linear octree from the MortonIds of a distributed
 * point set, with at most maxNumPts points per octant.
 *
 * Pipeline: (1) globally sort the point MortonIds, (2) replicate a few
 * boundary points from the next rank so boundary octants are built
 * consistently, (3) build the local linear octree, (4) remove octants
 * duplicated across adjacent ranks, (5) repartition the octants.
 *
 * \param[in]  pt_mid    MortonIds of the local points.
 * \param[out] nodes     resulting local portion of the linear octree.
 * \param[in]  maxDepth  maximum tree depth.
 * \param[in]  maxNumPts maximum number of points per octant.
 * \param[in]  comm      MPI communicator.
 * \return 0 on success.
 */
inline int points2Octree(const Vector<MortonId>& pt_mid, Vector<MortonId>& nodes,
    unsigned int maxDepth, unsigned int maxNumPts, const MPI_Comm& comm ) {
  int myrank, np;
  MPI_Comm_rank(comm, &myrank);
  MPI_Comm_size(comm, &np);
  // Sort morton id of points.
  Profile::Tic("SortMortonId", &comm, true, 10);
  Vector<MortonId> pt_sorted;
  //par::partitionW<MortonId>(pt_mid, NULL, comm);
  par::HyperQuickSort(pt_mid, pt_sorted, comm);
  size_t pt_cnt=pt_sorted.Dim();
  Profile::Toc();
  // Add last few points from next process, to get the boundary octant right.
  Profile::Tic("Comm", &comm, true, 10);
  {
    { // Adjust maxNumPts
      // If the global point count is below maxNumPts*np, shrink maxNumPts so
      // the octree still distributes work across all ranks.
      size_t glb_pt_cnt=0;
      MPI_Allreduce(&pt_cnt, &glb_pt_cnt, 1, par::Mpi_datatype<size_t>::value(), par::Mpi_datatype<size_t>::sum(), comm);
      if(glb_pt_cnt<maxNumPts*np) maxNumPts=glb_pt_cnt/np;
    }
    size_t recv_size=0;
    size_t send_size=(2*maxNumPts<pt_cnt?2*maxNumPts:pt_cnt);
    { // Exchange counts: receive from rank+1, send to rank-1.
      MPI_Request recvRequest;
      MPI_Request sendRequest;
      MPI_Status statusWait;
      if(myrank < (np-1)) MPI_Irecv (&recv_size, 1, par::Mpi_datatype<size_t>::value(), myrank+1, 1, comm, &recvRequest);
      if(myrank > 0 ) MPI_Issend(&send_size, 1, par::Mpi_datatype<size_t>::value(), myrank-1, 1, comm, &sendRequest);
      if(myrank < (np-1)) MPI_Wait(&recvRequest, &statusWait);
      if(myrank > 0 ) MPI_Wait(&sendRequest, &statusWait); //This can be done later.
    }
    if(recv_size>0){// Resize pt_sorted.
      Vector<MortonId> pt_sorted_(pt_cnt+recv_size);
      mem::memcopy(&pt_sorted_[0], &pt_sorted[0], pt_cnt*sizeof(MortonId));
      pt_sorted.Swap(pt_sorted_);
    }
    {// Exchange data: the first send_size sorted points go to rank-1, and
     // recv_size points from rank+1 are appended after the local points.
      MPI_Request recvRequest;
      MPI_Request sendRequest;
      MPI_Status statusWait;
      if(myrank < (np-1)) MPI_Irecv (&pt_sorted[0]+pt_cnt, recv_size, par::Mpi_datatype<MortonId>::value(), myrank+1, 1, comm, &recvRequest);
      if(myrank > 0 ) MPI_Issend(&pt_sorted[0] , send_size, par::Mpi_datatype<MortonId>::value(), myrank-1, 1, comm, &sendRequest);
      if(myrank < (np-1)) MPI_Wait(&recvRequest, &statusWait);
      if(myrank > 0 ) MPI_Wait(&sendRequest, &statusWait); //This can be done later.
    }
  }
  Profile::Toc();
  // Construct local octree.
  Profile::Tic("p2o_local", &comm, false, 10);
  Vector<MortonId> nodes_local(1); nodes_local[0]=MortonId(); // start from the root octant
  p2oLocal(pt_sorted, nodes_local, maxNumPts, maxDepth, myrank==np-1); // only the last rank completes the tail
  Profile::Toc();
  // Remove duplicate nodes on adjacent processors.
  Profile::Tic("RemoveDuplicates", &comm, true, 10);
  {
    size_t node_cnt=nodes_local.Dim();
    MortonId first_node;
    MortonId last_node=nodes_local[node_cnt-1];
    { // Send last_node to next process and get first_node from previous process.
      // NOTE(review): the request-handle names are swapped relative to their
      // roles here (Issend stores into recvRequest, Irecv into sendRequest),
      // but the MPI_Wait calls match the same guards/handles, so the
      // behavior is correct — only the naming is misleading.
      MPI_Request recvRequest;
      MPI_Request sendRequest;
      MPI_Status statusWait;
      if(myrank < (np-1)) MPI_Issend(& last_node, 1, par::Mpi_datatype<MortonId>::value(), myrank+1, 1, comm, &recvRequest);
      if(myrank > 0 ) MPI_Irecv (&first_node, 1, par::Mpi_datatype<MortonId>::value(), myrank-1, 1, comm, &sendRequest);
      if(myrank < (np-1)) MPI_Wait(&recvRequest, &statusWait);
      if(myrank > 0 ) MPI_Wait(&sendRequest, &statusWait); //This can be done later.
    }
    size_t i=0;
    std::vector<MortonId> node_lst;
    if(myrank){
      // Skip local octants whose deepest-first descendant precedes the
      // previous rank's last octant (those belong to the previous rank).
      while(i<node_cnt && nodes_local[i].getDFD(maxDepth)<first_node) i++; assert(i);
      last_node=nodes_local[i>0?i-1:0].NextId(); // Next MortonId in the tree after first_node.
      while(first_node<last_node){ // Complete nodes between first_node and last_node.
        while(first_node.isAncestor(last_node))
          first_node=first_node.getDFD(first_node.GetDepth()+1);
        if(first_node==last_node) break;
        node_lst.push_back(first_node);
        first_node=first_node.NextId();
      }
    }
    // Keep remaining local octants; every rank except the last drops its
    // final octant (it is reconstructed by the next rank).
    for(;i<node_cnt-(myrank==np-1?0:1);i++) node_lst.push_back(nodes_local[i]);
    nodes=node_lst;
  }
  Profile::Toc();
  // Repartition nodes.
  Profile::Tic("partitionW", &comm, false, 10);
  par::partitionW<MortonId>(nodes, NULL , comm);
  Profile::Toc();
  return 0;
}
/**
 * Initialize the distributed tree from point data: build a linear octree
 * from the point MortonIds, scatter point coordinates/values to the ranks
 * that own their octants, then construct the pointer-based tree matching
 * the linear tree.
 *
 * \param init_data node initialization data (point coordinates/values and
 *                  the max_pts-per-octant parameter).
 */
template <class TreeNode>
void MPI_Tree<TreeNode>::Initialize(typename Node_t::NodeData* init_data){
  //Initialize root node.
  Profile::Tic("InitRoot",Comm(),false,5);
  Tree<TreeNode>::Initialize(init_data);
  TreeNode* rnode=this->RootNode();
  assert(this->dim==COORD_DIM);
  Profile::Toc();
  Profile::Tic("Points2Octee",Comm(),true,5);
  Vector<MortonId> lin_oct;
  { //Get the linear tree.
    // Compute MortonId from pt_coord.
    Vector<MortonId> pt_mid;
    Vector<Real_t>& pt_coord=rnode->pt_coord;
    size_t pt_cnt=pt_coord.Dim()/this->dim;
    pt_mid.Resize(pt_cnt);
    #pragma omp parallel for
    for(size_t i=0;i<pt_cnt;i++){
      pt_mid[i]=MortonId(pt_coord[i*COORD_DIM+0],pt_coord[i*COORD_DIM+1],pt_coord[i*COORD_DIM+2],this->max_depth);
    }
    //Get the linear tree.
    points2Octree(pt_mid,lin_oct,this->max_depth,init_data->max_pts,*Comm());
  }
  Profile::Toc();
  Profile::Tic("ScatterPoints",Comm(),true,5);
  { // Sort and partition point coordinates and values.
    std::vector<Vector<Real_t>*> coord_lst;
    std::vector<Vector<Real_t>*> value_lst;
    std::vector<Vector<size_t>*> scatter_lst;
    rnode->NodeDataVec(coord_lst, value_lst, scatter_lst);
    assert(coord_lst.size()==value_lst.size());
    assert(coord_lst.size()==scatter_lst.size());
    Vector<MortonId> pt_mid;
    Vector<size_t> scatter_index;
    for(size_t i=0;i<coord_lst.size();i++){
      if(!coord_lst[i]) continue; // this data set has no coordinates
      Vector<Real_t>& pt_coord=*coord_lst[i];
      { // Compute MortonId from pt_coord.
        size_t pt_cnt=pt_coord.Dim()/this->dim;
        pt_mid.Resize(pt_cnt);
        #pragma omp parallel for
        for(size_t i=0;i<pt_cnt;i++){
          pt_mid[i]=MortonId(pt_coord[i*COORD_DIM+0],pt_coord[i*COORD_DIM+1],pt_coord[i*COORD_DIM+2],this->max_depth);
        }
      }
      // NOTE(review): `comm` is not declared anywhere in this function's
      // visible scope — presumably a member or an alias for *Comm(); confirm
      // against the class definition.
      par::SortScatterIndex(pt_mid , scatter_index, comm, &lin_oct[0]);
      par::ScatterForward (pt_coord, scatter_index, comm);
      if(value_lst[i]!=NULL){ // scatter the point values the same way
        Vector<Real_t>& pt_value=*value_lst[i];
        par::ScatterForward(pt_value, scatter_index, comm);
      }
      if(scatter_lst[i]!=NULL){ // keep the scatter mapping for later use
        Vector<size_t>& pt_scatter=*scatter_lst[i];
        pt_scatter=scatter_index;
      }
    }
  }
  Profile::Toc();
  //Initialize the pointer based tree from the linear tree.
  Profile::Tic("PointerTree",Comm(),false,5);
  { // Construct the pointer tree from lin_oct
    int omp_p=omp_get_max_threads();
    // Partition nodes between threads: serially pre-create the node at each
    // thread's starting octant so the parallel traversal below never has to
    // subdivide shared ancestors concurrently.
    rnode->SetGhost(false);
    for(int i=0;i<omp_p;i++){
      size_t idx=(lin_oct.Dim()*i)/omp_p;
      Node_t* n=FindNode(lin_oct[idx], true);
      assert(n->GetMortonId()==lin_oct[idx]);
      UNUSED(n);
    }
    #pragma omp parallel for
    for(int i=0;i<omp_p;i++){
      size_t a=(lin_oct.Dim()* i )/omp_p;
      size_t b=(lin_oct.Dim()*(i+1))/omp_p;
      size_t idx=a;
      Node_t* n=FindNode(lin_oct[idx], false);
      if(a==0) n=rnode;
      // Preorder sweep over this thread's range: subdivide ancestors of
      // target octants, keep exact matches as leaves, truncate and mark
      // everything else as ghost.
      while(n!=NULL && (idx<b || i==omp_p-1)){
        n->SetGhost(false);
        MortonId dn=n->GetMortonId();
        if(idx<b && dn.isAncestor(lin_oct[idx])){
          if(n->IsLeaf()) n->Subdivide();
        }else if(idx<b && dn==lin_oct[idx]){
          if(!n->IsLeaf()) n->Truncate();
          assert(n->IsLeaf());
          idx++;
        }else{
          n->Truncate();
          n->SetGhost(true);
        }
        n=this->PreorderNxt(n);
      }
      assert(idx==b);
    }
  }
  Profile::Toc();
#ifndef NDEBUG
  CheckTree();
#endif
}
/**
 * Coarsen the tree: first redistribute nodes so process boundaries do not
 * split sibling groups that might merge, then (in parallel) truncate
 * subtrees whose subdivision condition no longer holds.
 */
template <class TreeNode>
void MPI_Tree<TreeNode>::CoarsenTree(){
  int myrank;
  MPI_Comm_rank(*Comm(),&myrank);
  //Redistribute.
  {
    // Find the first owned (non-ghost) leaf in postorder.
    Node_t* n=this->PostorderFirst();
    while(n){
      if(n->IsLeaf() && !n->IsGhost()) break;
      n=this->PostorderNxt(n);
    }
    // On ranks > 0, walk the partition boundary upward toward a coarser
    // ancestor's leaf so a mergeable sibling group is not split across
    // processes; rank 0 keeps its boundary at the very first leaf.
    while(myrank){
      Node_t* n_parent=(Node_t*)n->Parent();
      Node_t* n_ = n_parent;
      while(n_ && !n_->IsLeaf()){
        n_=this->PostorderNxt(n_);
        if(!n_) break;
      }
      if(!n_ || n_->IsGhost()) break;    // stop at ghost region / end of tree
      if(n->Depth()<=n_->Depth()) break; // no longer moving to a coarser level
      if(n_->Depth()<=1) break;          // never move the boundary to the root's level
      n=n_;
    }
    MortonId loc_min=n->GetMortonId();
    RedistNodes(&loc_min);
  }
  //Truncate ghost nodes and build node list
  std::vector<Node_t*> leaf_nodes;
  {
    // Truncate the leading ghost region (postorder) ...
    Node_t* n=this->PostorderFirst();
    while(n!=NULL){
      if(n->IsLeaf() && !n->IsGhost()) break;
      n->Truncate();
      n->SetGhost(true);
      n->ClearData();
      n=this->PostorderNxt(n);
    }
    // ... collect the owned leaves (preorder) ...
    while(n!=NULL){
      if(n->IsLeaf() && n->IsGhost()) break;
      if(n->IsLeaf()) leaf_nodes.push_back(n);
      n=this->PreorderNxt(n);
    }
    // ... and truncate the trailing ghost region.
    while(n!=NULL){
      n->Truncate();
      n->SetGhost(true);
      n->ClearData();
      n=this->PreorderNxt(n);
    }
  }
  size_t node_cnt=leaf_nodes.size();
  //Partition nodes between OpenMP threads.
  int omp_p=omp_get_max_threads();
  std::vector<MortonId> mid(omp_p);       // each thread's starting octant (splitter)
  std::vector<MortonId> split_key(omp_p); // NOTE(review): declared but never used here
  #pragma omp parallel for
  for(int i=0;i<omp_p;i++){
    mid[i]=leaf_nodes[(i*node_cnt)/omp_p]->GetMortonId();
  }
  //Coarsen Tree.
  #pragma omp parallel for
  for(int i=0;i<omp_p;i++){
    Node_t* n_=leaf_nodes[i*node_cnt/omp_p];
    if(i*node_cnt/omp_p<(i+1)*node_cnt/omp_p)
    while(n_!=NULL){
      MortonId n_mid=n_->GetMortonId();
      // Truncate a non-leaf node whose subdivision condition fails, but skip
      // ancestors of the thread-range splitters: those are shared between
      // threads and are handled serially below to avoid races.
      if(!n_->IsLeaf() && !n_mid.isAncestor(mid[i].getDFD()))
      if(i<omp_p-1? !n_mid.isAncestor(mid[i+1].getDFD()):true)
      if(!n_->SubdivCond()) n_->Truncate();
      if(i<omp_p-1? n_mid==mid[i+1]: false) break; // reached the next thread's range
      n_=this->PostorderNxt(n_);
    }
  }
  //Truncate nodes along ancestors of splitters.
  for(int i=0;i<omp_p;i++){
    Node_t* n_=FindNode(mid[i], false, this->RootNode());
    // Walk up from each splitter, truncating while the subdivision
    // condition fails; stop at the first ancestor that must stay refined.
    while(n_->Depth()>0){
      n_=(Node_t*)n_->Parent();
      if(!n_->SubdivCond()) n_->Truncate();
      else break;
    }
  }
}
/**
 * Adaptively refine the tree: level by level, subdivide every owned leaf
 * whose subdivision condition holds, redistributing nodes across processes
 * whenever the load imbalance grows too large; finish with alternating
 * coarsen/redistribute passes to settle the partition.
 */
template <class TreeNode>
void MPI_Tree<TreeNode>::RefineTree(){
  int np, myrank;
  MPI_Comm_size(*Comm(),&np);
  MPI_Comm_rank(*Comm(),&myrank);
  int omp_p=omp_get_max_threads();
  int n_child=1UL<<this->Dim(); // children per node (8 in 3D)
  //Coarsen tree.
  MPI_Tree<TreeNode>::CoarsenTree();
  //Build node list.
  std::vector<Node_t*> leaf_nodes;
  {
    Node_t* n=this->PostorderFirst();
    while(n!=NULL){
      if(n->IsLeaf() && !n->IsGhost())
        leaf_nodes.push_back(n);
      n=this->PostorderNxt(n);
    }
  }
  size_t tree_node_cnt=leaf_nodes.size();
  //Adaptive subdivision of leaf nodes with load balancing.
  for(int l=0;l<this->max_depth;l++){ // one pass per possible refinement level
    //Subdivide nodes.
    std::vector<std::vector<Node_t*> > leaf_nodes_(omp_p); // newly created leaves, per thread
    #pragma omp parallel for
    for(int i=0;i<omp_p;i++){
      size_t a=(leaf_nodes.size()* i )/omp_p;
      size_t b=(leaf_nodes.size()*(i+1))/omp_p;
      for(size_t j=a;j<b;j++){
        if(leaf_nodes[j]->IsLeaf() && !leaf_nodes[j]->IsGhost()){
          if(leaf_nodes[j]->SubdivCond()) leaf_nodes[j]->Subdivide();
          if(!leaf_nodes[j]->IsLeaf())
            for(int k=0;k<n_child;k++)
              leaf_nodes_[i].push_back((Node_t*)leaf_nodes[j]->Child(k));
        }
      }
    }
    // Each subdivision replaces one leaf with n_child leaves.
    for(int i=0;i<omp_p;i++)
      tree_node_cnt+=(leaf_nodes_[i].size()/n_child)*(n_child-1);
    //Determine load imbalance.
    size_t global_max, global_sum;
    MPI_Allreduce(&tree_node_cnt, &global_max, 1, par::Mpi_datatype<size_t>::value(), par::Mpi_datatype<size_t>::max(), *Comm());
    MPI_Allreduce(&tree_node_cnt, &global_sum, 1, par::Mpi_datatype<size_t>::value(), par::Mpi_datatype<size_t>::sum(), *Comm());
    //RedistNodes if needed (heaviest rank holds > 4x the average load).
    if(global_max*np>4*global_sum){
#ifndef NDEBUG
      Profile::Tic("RedistNodes",Comm(),true,4);
#endif
      RedistNodes();
#ifndef NDEBUG
      Profile::Toc();
#endif
      //Rebuild node list.
      leaf_nodes.clear();
      Node_t* n=this->PostorderFirst();
      while(n!=NULL){
        if(n->IsLeaf() && !n->IsGhost())
          leaf_nodes.push_back(n);
        n=this->PostorderNxt(n);
      }
      tree_node_cnt=leaf_nodes.size();
    }else{
      //Combine partial list of nodes.
      int node_cnt=0;
      for(int j=0;j<omp_p;j++)
        node_cnt+=leaf_nodes_[j].size();
      leaf_nodes.resize(node_cnt);
      #pragma omp parallel for
      for(int i=0;i<omp_p;i++){
        int offset=0;
        for(int j=0;j<i;j++)
          offset+=leaf_nodes_[j].size();
        for(size_t j=0;j<leaf_nodes_[i].size();j++)
          leaf_nodes[offset+j]=leaf_nodes_[i][j];
      }
    }
  }
  // Final settling: alternate redistribution and coarsening so the
  // partition and the coarsened structure are mutually consistent.
  RedistNodes();
  MPI_Tree<TreeNode>::CoarsenTree();
  RedistNodes();
  MPI_Tree<TreeNode>::CoarsenTree();
  RedistNodes();
}
/**
 * Redistribute leaf nodes across processes according to new partition
 * minimums: packs each local leaf, exchanges the packed data with
 * MPI_Alltoallv, then rebuilds/unpacks the local pointer tree so each rank
 * owns exactly the leaves in [new_mins[rank], new_mins[rank+1]).
 *
 * \param loc_min if non-NULL, this rank's desired first octant; if NULL the
 *                partition is computed by weighted partitioning (NodeCost).
 */
template <class TreeNode>
void MPI_Tree<TreeNode>::RedistNodes(MortonId* loc_min) {
  int np, myrank;
  MPI_Comm_size(*Comm(),&np);
  MPI_Comm_rank(*Comm(),&myrank);
  if(np==1)return; // nothing to redistribute on a single rank
  //Create a linear tree in dendro format.
  Node_t* curr_node=this->PreorderFirst();
  std::vector<MortonId> in;      // MortonIds of the owned leaves
  std::vector<Node_t*> node_lst; // matching node pointers
  while(curr_node!=NULL){
    if(curr_node->IsLeaf() && !curr_node->IsGhost()){
      node_lst.push_back(curr_node);
      in.push_back(curr_node->GetMortonId());
    }
    curr_node=this->PreorderNxt(curr_node);
  }
  size_t leaf_cnt=in.size();
  //Get new mins.
  std::vector<MortonId> new_mins(np); // new_mins[p] = first octant owned by rank p
  if(loc_min==NULL){
    //Partition vector of MortonIds using par::partitionW
    std::vector<MortonId> in_=in;
    std::vector<long long> wts(in_.size());
    #pragma omp parallel for
    for(size_t i=0;i<wts.size();i++){
      wts[i]=node_lst[i]->NodeCost(); // weight each leaf by its cost
    }
    par::partitionW<MortonId>(in_,&wts[0],*Comm());
    MPI_Allgather(&in_[0] , 1, par::Mpi_datatype<MortonId>::value(),
                  &new_mins[0], 1, par::Mpi_datatype<MortonId>::value(), *Comm());
  }else{
    MPI_Allgather(loc_min , 1, par::Mpi_datatype<MortonId>::value(),
                  &new_mins[0], 1, par::Mpi_datatype<MortonId>::value(), *Comm());
  }
  //Now exchange nodes according to new mins
  std::vector<PackedData> data(leaf_cnt);
  std::vector<int> send_cnts; send_cnts.assign(np,0); // leaves sent to each rank
  std::vector<int> send_size; send_size.assign(np,0); // bytes sent to each rank
  size_t sbuff_size=0;
  int omp_p=omp_get_max_threads();
  // Each thread packs the leaves destined for its slice of the rank range
  // [a,b); destinations are disjoint across threads, so this is race-free.
  #pragma omp parallel for reduction(+:sbuff_size)
  for(int i=0;i<omp_p;i++){
    size_t a=( i *np)/omp_p;
    size_t b=((i+1)*np)/omp_p;
    if(b>a){
      size_t p_iter=a;
      size_t node_iter=std::lower_bound(&in[0], &in[in.size()], new_mins[a])-&in[0];
      for( ;node_iter<node_lst.size();node_iter++){
        // Advance the destination rank until it owns this leaf.
        while(p_iter+1u<(size_t)np? in[node_iter]>=new_mins[p_iter+1]: false) p_iter++;
        if(p_iter>=b) break; // beyond this thread's rank slice
        send_cnts[p_iter]++;
        data[node_iter]=node_lst[node_iter]->Pack();
        // Each packed leaf is prefixed by its MortonId and payload length.
        send_size[p_iter]+=data[node_iter].length+sizeof(size_t)+sizeof(MortonId);
        sbuff_size +=data[node_iter].length+sizeof(size_t)+sizeof(MortonId);
      }
    }
  }
  std::vector<int> recv_cnts(np);
  std::vector<int> recv_size(np);
  MPI_Alltoall(&send_cnts[0], 1, par::Mpi_datatype<int>::value(),
               &recv_cnts[0], 1, par::Mpi_datatype<int>::value(), *Comm());
  MPI_Alltoall(&send_size[0], 1, par::Mpi_datatype<int>::value(),
               &recv_size[0], 1, par::Mpi_datatype<int>::value(), *Comm());
  size_t recv_cnt=0;
  #pragma omp parallel for reduction(+:recv_cnt)
  for(int i=0;i<np;i++) recv_cnt+=recv_cnts[i];
  std::vector<MortonId> out(recv_cnt); // MortonIds of the received leaves
  std::vector<int> sdisp; sdisp.assign(np,0);
  std::vector<int> rdisp; rdisp.assign(np,0);
  omp_par::scan(&send_size[0],&sdisp[0],np); //TODO Don't need to do a full scan
  omp_par::scan(&recv_size[0],&rdisp[0],np); // as most entries will be 0.
  size_t rbuff_size=rdisp[np-1]+recv_size[np-1];
  char* send_buff=mem::aligned_new<char>(sbuff_size);
  char* recv_buff=mem::aligned_new<char>(rbuff_size);
  std::vector<char*> data_ptr(leaf_cnt);
  // Serially lay out the headers (MortonId + length) and record where each
  // leaf's payload goes; payloads are copied in parallel below.
  char* s_ptr=send_buff;
  for(size_t i=0;i<leaf_cnt;i++){
    *((MortonId*)s_ptr)=in [i] ; s_ptr+=sizeof(MortonId);
    *(( size_t*)s_ptr)=data[i].length; s_ptr+=sizeof(size_t);
    data_ptr[i]=s_ptr ; s_ptr+=data[i].length;
  }
  #pragma omp parallel for
  for(int p=0;p<omp_p;p++){
    size_t a=( p *leaf_cnt)/omp_p;
    size_t b=((p+1)*leaf_cnt)/omp_p;
    for(size_t i=a;i<b;i++)
      mem::memcopy(data_ptr[i], data[i].data, data[i].length);
  }
  par::Mpi_Alltoallv_sparse<char>(&send_buff[0], &send_size[0], &sdisp[0],
                                  &recv_buff[0], &recv_size[0], &rdisp[0], *Comm());
  // Parse the received stream back into (MortonId, length, payload) records.
  char* r_ptr=recv_buff;
  std::vector<PackedData> r_data(recv_cnt);
  for(size_t i=0;i<recv_cnt;i++){
    out [i] =*(MortonId*)r_ptr; r_ptr+=sizeof(MortonId);
    r_data[i].length=*( size_t*)r_ptr; r_ptr+=sizeof(size_t);
    r_data[i].data = r_ptr; r_ptr+=r_data[i].length;
  }
  //Initialize all new nodes.
  int nchld=1UL<<this->Dim();
  size_t node_iter=0;
  MortonId dn;
  node_lst.resize(recv_cnt);
  // Preorder sweep: subdivide down to each received octant, truncate
  // everything the rank no longer owns and mark it ghost.
  Node_t* n=this->PreorderFirst();
  while(n!=NULL && node_iter<recv_cnt){
    n->SetGhost(false);
    dn=n->GetMortonId();
    if(dn.isAncestor(out[node_iter]) && dn!=out[node_iter]){
      if(n->IsLeaf()){
        {
          // Temporarily mark ghost while subdividing so the new children
          // start from a clean state, then un-ghost them.
          n->SetGhost(true);
          n->Subdivide();
          n->SetGhost(false);
          for(int j=0;j<nchld;j++){
            Node_t* ch_node=(Node_t*)n->Child(j);
            ch_node->SetGhost(false);
          }
        }
      }
    }else if(dn==out[node_iter]){
      if(!n->IsLeaf()){
        n->Truncate();
        n->SetGhost(false);
      }
      node_lst[node_iter]=n; // remember the node to unpack into
      node_iter++;
    }else{
      n->Truncate(); //This node does not belong to this process.
      n->SetGhost(true);
    }
    n=this->PreorderNxt(n);
  }
  // Everything after the last received octant is also not owned here.
  while(n!=NULL){
    n->Truncate();
    n->SetGhost(true);
    n=this->PreorderNxt(n);
  }
  // Unpack the received payloads into the freshly created nodes in parallel.
  #pragma omp parallel for
  for(int p=0;p<omp_p;p++){
    size_t a=( p *recv_cnt)/omp_p;
    size_t b=((p+1)*recv_cnt)/omp_p;
    for(size_t i=a;i<b;i++)
      node_lst[i]->Unpack(r_data[i]);
  }
  //Free memory buffers.
  mem::aligned_delete<char>(recv_buff);
  mem::aligned_delete<char>(send_buff);
}
  587. template <class TreeNode>
  588. TreeNode* MPI_Tree<TreeNode>::FindNode(MortonId& key, bool subdiv, TreeNode* start){
  589. int num_child=1UL<<this->Dim();
  590. Node_t* n=start;
  591. if(n==NULL) n=this->RootNode();
  592. while(n->GetMortonId()<key && (!n->IsLeaf()||subdiv)){
  593. if(n->IsLeaf() && !n->IsGhost()) n->Subdivide();
  594. if(n->IsLeaf()) break;
  595. for(int j=0;j<num_child;j++){
  596. if(((Node_t*)n->Child(j))->GetMortonId().NextId()>key){
  597. n=(Node_t*)n->Child(j);
  598. break;
  599. }
  600. }
  601. }
  602. assert(!subdiv || n->IsGhost() || n->GetMortonId()==key);
  603. return n;
  604. }
/**
 * Linearise a distributed sorted list of octants: remove duplicates and
 * octants that are ancestors of the octant that follows them, across
 * process boundaries.  The input list must be globally sorted.
 *
 * \param[in,out] list sorted local octants; on return, duplicates/ancestors
 *                     are removed (the last rank keeps its final octant).
 * \param[in]     comm MPI communicator.
 * \return 1 (unconditionally).
 */
inline int lineariseList(std::vector<MortonId> & list, MPI_Comm comm) {
  int rank,size;
  MPI_Comm_rank(comm,&rank);
  MPI_Comm_size(comm,&size);
  //Remove empty processors...
  // Split off the ranks with non-empty lists so neighbor exchanges below
  // always have a valid partner.
  int new_rank, new_size;
  MPI_Comm new_comm;
  MPI_Comm_split(comm, (list.empty()?0:1), rank, &new_comm);
  MPI_Comm_rank (new_comm, &new_rank);
  MPI_Comm_size (new_comm, &new_size);
  if(!list.empty()) {
    //Send the last octant to the next processor.
    MortonId lastOctant = list[list.size()-1];
    MortonId lastOnPrev;
    MPI_Request recvRequest;
    MPI_Request sendRequest;
    if(new_rank > 0) {
      MPI_Irecv(&lastOnPrev, 1, par::Mpi_datatype<MortonId>::value(), new_rank-1, 1, new_comm, &recvRequest);
    }
    if(new_rank < (new_size-1)) {
      MPI_Issend( &lastOctant, 1, par::Mpi_datatype<MortonId>::value(), new_rank+1, 1, new_comm, &sendRequest);
    }
    if(new_rank > 0) {
      // Prepend the previous rank's last octant so the duplicate/ancestor
      // test below also covers the process boundary.
      std::vector<MortonId> tmp(list.size()+1);
      for(size_t i = 0; i < list.size(); i++) {
        tmp[i+1] = list[i];
      }
      MPI_Status statusWait;
      MPI_Wait(&recvRequest, &statusWait);
      tmp[0] = lastOnPrev;
      list.swap(tmp);
    }
    {// Remove duplicates and ancestors.
      std::vector<MortonId> tmp;
      if(!(list.empty())) {
        for(unsigned int i = 0; i < (list.size()-1); i++) {
          // Keep list[i] only if it is neither equal to nor an ancestor of
          // its successor.
          if( (!(list[i].isAncestor(list[i+1]))) && (list[i] != list[i+1]) ) {
            tmp.push_back(list[i]);
          }
        }
        // Only the last rank keeps its final octant; on other ranks it was
        // forwarded and is checked by the next rank.
        if(new_rank == (new_size-1)) {
          tmp.push_back(list[list.size()-1]);
        }
      }
      list.swap(tmp);
    }
    if(new_rank < (new_size-1)) {
      // Complete the synchronous send before leaving scope.
      MPI_Status statusWait;
      MPI_Wait(&sendRequest, &statusWait);
    }
  }//not empty procs only
  // Free new_comm
  MPI_Comm_free(&new_comm);
  return 1;
}//end fn.
// Construct a 2:1-balanced, complete, linear octree from the distributed
// sorted leaf list 'in', returning it in 'out': neighboring leaves in the
// result differ by at most one level. Collective over 'comm'; returns 0.
// NOTE(review): 'dim' is unused below and the child loop near the end
// hard-codes 8 children (3D case) -- confirm before reusing for other dims.
inline int balanceOctree (std::vector<MortonId > &in, std::vector<MortonId > &out,
unsigned int dim, unsigned int maxDepth, bool periodic, MPI_Comm comm) {
int omp_p=omp_get_max_threads();
int rank, size;
MPI_Comm_size(comm,&size);
MPI_Comm_rank(comm,&rank);
if(size==1 && in.size()==1){
// Single process with a single octant: already balanced.
out=in;
return 0;
}
#ifdef __VERBOSE__
long long locInSize = in.size();
#endif
//////////////////////////////////////////////////////////////////////////////////////////////////
{ //Redistribute.
//Vector<long long> balance_wt(size);
//#pragma omp parallel for
//for(size_t i=0;i<size;i++){
// balance_wt[i]=in[i].GetDepth();
//}
//par::partitionW<MortonId>(in, &balance_wt[0], comm);
par::partitionW<MortonId>(in, NULL, comm);
}
//Build level-by-level set of nodes.
// nodes[l + (maxDepth+1)*p] holds the level-l balancing octants produced by
// thread p; per-thread sets avoid any locking in the parallel loop below.
std::vector<std::set<MortonId> > nodes((maxDepth+1)*omp_p);
#pragma omp parallel for
for(int p=0;p<omp_p;p++){
size_t a=( p *in.size())/omp_p;
size_t b=((p+1)*in.size())/omp_p;
for(size_t i=a;i<b;){
size_t d=in[i].GetDepth();
if(d==0){i++; continue;}
// Insert the parent of each input leaf; a run of siblings at the same
// depth is consumed in one step since they share the same parent.
MortonId pnode=in[i].getAncestor(d-1);
nodes[d-1+(maxDepth+1)*p].insert(pnode);
while(i<b && d==in[i].GetDepth() && pnode==in[i].getAncestor(d-1)) i++;
}
//Add new nodes level-by-level.
std::vector<MortonId> nbrs;
for(unsigned int l=maxDepth;l>=1;l--){
//Build set of parents of balancing nodes.
// Every neighbor of a level-l balancing node must have its parent present
// at level l-1; this propagates the 2:1 constraint toward the root.
std::set<MortonId> nbrs_parent;
std::set<MortonId>::iterator start=nodes[l+(maxDepth+1)*p].begin();
std::set<MortonId>::iterator end =nodes[l+(maxDepth+1)*p].end();
for(std::set<MortonId>::iterator node=start; node != end;){
node->NbrList(nbrs, l, periodic);
int nbr_cnt=nbrs.size();
for(int i=0;i<nbr_cnt;i++)
nbrs_parent.insert(nbrs[i].getAncestor(l-1));
node++;
}
//Get the balancing nodes.
std::set<MortonId>& ancestor_nodes=nodes[l-1+(maxDepth+1)*p];
start=nbrs_parent.begin();
end =nbrs_parent.end();
ancestor_nodes.insert(start,end);
}
//Remove non-leaf nodes. (optional)
// A node that has a child at level l cannot itself be a leaf, so erase the
// parent of every level-l node from the level-(l-1) set.
for(unsigned int l=1;l<=maxDepth;l++){
std::set<MortonId>::iterator start=nodes[l +(maxDepth+1)*p].begin();
std::set<MortonId>::iterator end =nodes[l +(maxDepth+1)*p].end();
std::set<MortonId>& ancestor_nodes=nodes[l-1+(maxDepth+1)*p];
for(std::set<MortonId>::iterator node=start; node != end; node++){
MortonId parent=node->getAncestor(node->GetDepth()-1);
ancestor_nodes.erase(parent);
}
}
}
//Resize in.
// Count per-thread totals and scan for each thread's output offset.
std::vector<size_t> node_cnt(omp_p,0);
std::vector<size_t> node_dsp(omp_p,0);
#pragma omp parallel for
for(int i=0;i<omp_p;i++){
for(unsigned int j=0;j<=maxDepth;j++)
node_cnt[i]+=nodes[j+i*(maxDepth+1)].size();
}
omp_par::scan(&node_cnt[0],&node_dsp[0], omp_p);
in.resize(node_cnt[omp_p-1]+node_dsp[omp_p-1]);
//Copy leaf nodes to in.
#pragma omp parallel for
for(int p=0;p<omp_p;p++){
size_t node_iter=node_dsp[p];
for(unsigned int l=0;l<=maxDepth;l++){
std::set<MortonId>::iterator start=nodes[l +(maxDepth+1)*p].begin();
std::set<MortonId>::iterator end =nodes[l +(maxDepth+1)*p].end();
for(std::set<MortonId>::iterator node=start; node != end; node++)
in[node_iter++]=*node;
}
}
#ifdef __VERBOSE__
//Local size before removing duplicates and ancestors (linearise).
long long locTmpSize = in.size();
#endif
//Sort, Linearise, Redistribute.
//TODO The following might work better as it reduces the comm bandwidth:
//Split comm into sqrt(np) processes and sort, linearise for each comm group.
//Then do the global sort, linearise with the original comm.
par::HyperQuickSort(in, out, comm);
lineariseList(out, comm);
par::partitionW<MortonId>(out, NULL , comm);
{ // Add children
//Remove empty processors...
int new_rank, new_size;
MPI_Comm new_comm;
MPI_Comm_split(comm, (out.empty()?0:1), rank, &new_comm);
MPI_Comm_rank (new_comm, &new_rank);
MPI_Comm_size (new_comm, &new_size);
if(!out.empty()) {
MortonId nxt_mid(0,0,0,0);
{ // Get last octant from previous process.
assert(out.size());
//Send the last octant to the next processor.
MortonId lastOctant = out.back();
MortonId lastOnPrev;
MPI_Request recvRequest;
MPI_Request sendRequest;
if(new_rank > 0) {
MPI_Irecv(&lastOnPrev, 1, par::Mpi_datatype<MortonId>::value(), new_rank-1, 1, new_comm, &recvRequest);
}
if(new_rank < (new_size-1)) {
MPI_Issend( &lastOctant, 1, par::Mpi_datatype<MortonId>::value(), new_rank+1, 1, new_comm, &sendRequest);
}
if(new_rank > 0) {
MPI_Status statusWait;
MPI_Wait(&recvRequest, &statusWait);
// Resume filling the domain right after the previous rank's last
// octant so the global result is complete and disjoint.
nxt_mid = lastOnPrev.NextId();
}
if(new_rank < (new_size-1)) {
MPI_Status statusWait;
MPI_Wait(&sendRequest, &statusWait);
}
}
std::vector<MortonId> out1;
std::vector<MortonId> children;
for(size_t i=0;i<out.size();i++){
// Fill any gap before out[i] with the coarsest octants that do not
// overlap out[i].
while(nxt_mid.getDFD()<out[i]){
while(nxt_mid.isAncestor(out[i])){
nxt_mid=nxt_mid.getAncestor(nxt_mid.GetDepth()+1);
}
out1.push_back(nxt_mid);
nxt_mid=nxt_mid.NextId();
}
// Replace each balancing octant by its children.
// NOTE(review): hard-coded 8 children (3D octree).
children=out[i].Children();
for(size_t j=0;j<8;j++){
out1.push_back(children[j]);
}
nxt_mid=out[i].NextId();
}
if(new_rank==new_size-1){
// Last rank: pad with octants up to the end of the domain (depth 0
// marks that the whole unit cube has been covered).
while(nxt_mid.GetDepth()>0){
out1.push_back(nxt_mid);
nxt_mid=nxt_mid.NextId();
}
}
out.swap(out1);
}
if(new_size<size){
par::partitionW<MortonId>(out, NULL , comm);
}
// Free new_comm
MPI_Comm_free(&new_comm);
}
//////////////////////////////////////////////////////////////////////////////////////////////////
#ifdef __VERBOSE__
long long locOutSize = out.size();
long long globInSize, globTmpSize, globOutSize;
MPI_Allreduce(&locInSize , &globInSize , 1, par::Mpi_datatype<long long>::value(), par::Mpi_datatype<long long>::sum(), comm);
MPI_Allreduce(&locTmpSize, &globTmpSize, 1, par::Mpi_datatype<long long>::value(), par::Mpi_datatype<long long>::sum(), comm);
MPI_Allreduce(&locOutSize, &globOutSize, 1, par::Mpi_datatype<long long>::value(), par::Mpi_datatype<long long>::sum(), comm);
if(!rank) std::cout<<"Balance Octree. inpSize: "<<globInSize
<<" tmpSize: "<<globTmpSize
<<" outSize: "<<globOutSize
<<" activeNpes: "<<size<<std::endl;
#endif
return 0;
}//end function
  836. template <class TreeNode>
  837. void MPI_Tree<TreeNode>::Balance21(BoundaryType bndry) {
  838. int num_proc,myrank;
  839. MPI_Comm_rank(*Comm(),&myrank);
  840. MPI_Comm_size(*Comm(),&num_proc);
  841. //Using Dendro for balancing
  842. //Create a linear tree in dendro format.
  843. Node_t* curr_node=this->PreorderFirst();
  844. std::vector<MortonId> in;
  845. while(curr_node!=NULL){
  846. if(curr_node->IsLeaf() && !curr_node->IsGhost()){
  847. in.push_back(curr_node->GetMortonId());
  848. }
  849. curr_node=this->PreorderNxt(curr_node);
  850. }
  851. //2:1 balance
  852. Profile::Tic("ot::balanceOctree",Comm(),true,10);
  853. std::vector<MortonId> out;
  854. balanceOctree(in, out, this->Dim(), this->max_depth, (bndry==Periodic), *Comm());
  855. Profile::Toc();
  856. //Get new_mins.
  857. std::vector<MortonId> new_mins(num_proc);
  858. MPI_Allgather(&out[0] , 1, par::Mpi_datatype<MortonId>::value(),
  859. &new_mins[0], 1, par::Mpi_datatype<MortonId>::value(), *Comm());
  860. // Refine to new_mins in my range of octants
  861. // or else RedistNodes(...) will not work correctly.
  862. {
  863. int i=0;
  864. std::vector<MortonId> mins=GetMins();
  865. while(new_mins[i]<mins[myrank] && i<num_proc) i++; //TODO: Use binary search.
  866. for(;i<num_proc;i++){
  867. Node_t* n=FindNode(new_mins[i], true);
  868. if(n->IsGhost()) break;
  869. else assert(n->GetMortonId()==new_mins[i]);
  870. }
  871. }
  872. //Redist nodes using new_mins.
  873. Profile::Tic("RedistNodes",Comm(),true,10);
  874. RedistNodes(&out[0]);
  875. #ifndef NDEBUG
  876. std::vector<MortonId> mins=GetMins();
  877. assert(mins[myrank].getDFD()==out[0].getDFD());
  878. #endif
  879. Profile::Toc();
  880. //Now subdivide the current tree as necessary to make it balanced.
  881. Profile::Tic("LocalSubdivide",Comm(),false,10);
  882. int omp_p=omp_get_max_threads();
  883. for(int i=0;i<omp_p;i++){
  884. size_t a=(out.size()*i)/omp_p;
  885. Node_t* n=FindNode(out[a], true);
  886. assert(n->GetMortonId()==out[a]);
  887. UNUSED(n);
  888. }
  889. #pragma omp parallel for
  890. for(int i=0;i<omp_p;i++){
  891. size_t a=(out.size()* i )/omp_p;
  892. size_t b=(out.size()*(i+1))/omp_p;
  893. MortonId dn;
  894. size_t node_iter=a;
  895. Node_t* n=FindNode(out[node_iter], false);
  896. while(n!=NULL && node_iter<b){
  897. n->SetGhost(false);
  898. dn=n->GetMortonId();
  899. if(dn.isAncestor(out[node_iter]) && dn!=out[node_iter]){
  900. if(n->IsLeaf()) n->Subdivide();
  901. }else if(dn==out[node_iter]){
  902. assert(n->IsLeaf());
  903. //if(!n->IsLeaf()){ //This should never happen
  904. // n->Truncate();
  905. // n->SetGhost(false);
  906. //}
  907. assert(n->IsLeaf());
  908. node_iter++;
  909. }else{
  910. n->Truncate(); //This node does not belong to this process.
  911. n->SetGhost(true);
  912. }
  913. n=this->PreorderNxt(n);
  914. }
  915. if(i==omp_p-1){
  916. while(n!=NULL){
  917. n->Truncate();
  918. n->SetGhost(true);
  919. n=this->PreorderNxt(n);
  920. }
  921. }
  922. }
  923. Profile::Toc();
  924. }
template <class TreeNode>
void MPI_Tree<TreeNode>::Balance21_local(BoundaryType bndry){
// Local (single-process) 2:1 balance refinement: for every non-leaf octant,
// ensure all of its potential colleagues (same-level neighbors) exist,
// subdividing coarser octants on the path from the root as needed. Levels
// are processed fine-to-coarse and newly created nodes are queued so they
// are revisited in later (coarser) passes.
//SetColleagues(bndry);
std::vector<std::vector<Node_t*> > node_lst(this->max_depth+1);
Node_t* curr_node=this->PreorderFirst();
while(curr_node!=NULL){
node_lst[curr_node->Depth()].push_back(curr_node);
curr_node=this->PreorderNxt(curr_node);
}
int n1=pow(3.0,this->Dim()); // colleague slots (3^dim)
int n2=pow(2.0,this->Dim()); // children per node (2^dim)
for(int i=this->max_depth;i>0;i--){
Real_t s=pow(0.5,i); // octant edge length at level i
for(size_t j=0;j<node_lst[i].size();j++){
curr_node=node_lst[i][j];
Real_t* coord=curr_node->Coord();
if(!curr_node->IsLeaf()) for(int k=0;k<n1;k++){
if(curr_node->Colleague(k)==NULL){
// Center of the missing colleague k; (k/1)%3, (k/3)%3, (k/9)%3 decode
// its offset in {-1,0,1} along x, y, z respectively.
Real_t c0[3]={coord[0]+((k/1)%3-1)*s+s*0.5,
coord[1]+((k/3)%3-1)*s+s*0.5,
coord[2]+((k/9)%3-1)*s+s*0.5};
if(bndry==Periodic){
// Wrap the target center into [0,1) for periodic boundaries.
c0[0]=c0[0]-floor(c0[0]);
c0[1]=c0[1]-floor(c0[1]);
c0[2]=c0[2]-floor(c0[2]);
}
if(c0[0]>0 && c0[0]<1)
if(c0[1]>0 && c0[1]<1)
if(c0[2]>0 && c0[2]<1){
// Walk down from the root to level i toward c0, subdividing any leaf
// encountered; new children are queued for later passes.
Node_t* node=this->RootNode();
while(node->Depth()<i){
if(node->IsLeaf()){
node->Subdivide();
for(int l=0;l<n2;l++){
node_lst[node->Depth()+1].push_back((Node_t*)node->Child(l));
/*
SetColleagues(bndry,(Node_t*)node->Child(l));
for(int i_=0;i_<n1;i_++){
Node_t* coll=(Node_t*)((Node_t*)node->Child(l))->Colleague(i_);
if(coll!=NULL) SetColleagues(bndry,coll);
}// */
}
}
// Select the child octant whose box contains the target center c0.
Real_t s1=pow(0.5,node->Depth()+1);
Real_t* c1=node->Coord();
int c_id=((c0[0]-c1[0])>s1?1:0)+
((c0[1]-c1[1])>s1?2:0)+
((c0[2]-c1[2])>s1?4:0);
node=(Node_t*)node->Child(c_id);
/*if(node->Depth()==i){
c1=node->Coord();
std::cout<<(c0[0]-c1[0])-s1/2<<' '
std::cout<<(c0[1]-c1[1])-s1/2<<' '
std::cout<<(c0[2]-c1[2])-s1/2<<'\n';
}// */
}
}
}
}
}
}
}
template <class TreeNode>
void MPI_Tree<TreeNode>::SetColleagues(BoundaryType bndry, Node_t* node){
// Set the colleague (same-depth neighbor) pointers of 'node'. When node is
// NULL, colleagues are computed for every node in the tree level by level,
// so parents are always processed before their children.
int n1=(int)pow(3.0,this->Dim()); // colleague slots (3^dim)
int n2=(int)pow(2.0,this->Dim()); // children per node (2^dim)
if(node==NULL){
Node_t* curr_node=this->PreorderFirst();
if(curr_node!=NULL){
// Root: with periodic boundaries it is its own neighbor in every
// direction; otherwise only the central slot points to itself.
if(bndry==Periodic){
for(int i=0;i<n1;i++)
curr_node->SetColleague(curr_node,i);
}else{
curr_node->SetColleague(curr_node,(n1-1)/2);
}
curr_node=this->PreorderNxt(curr_node);
}
// Bucket the remaining nodes by depth; each level can then be processed
// in parallel once the coarser level is complete.
Vector<std::vector<Node_t*> > nodes(MAX_DEPTH);
while(curr_node!=NULL){
nodes[curr_node->Depth()].push_back(curr_node);
curr_node=this->PreorderNxt(curr_node);
}
for(size_t i=0;i<MAX_DEPTH;i++){
size_t j0=nodes[i].size();
Node_t** nodes_=&nodes[i][0];
#pragma omp parallel for
for(size_t j=0;j<j0;j++){
SetColleagues(bndry, nodes_[j]);
}
}
}else{
/* //This is slower
Node_t* root_node=this->RootNode();
int d=node->Depth();
Real_t* c0=node->Coord();
Real_t s=pow(0.5,d);
Real_t c[COORD_DIM];
int idx=0;
for(int i=-1;i<=1;i++)
for(int j=-1;j<=1;j++)
for(int k=-1;k<=1;k++){
c[0]=c0[0]+s*0.5+s*k;
c[1]=c0[1]+s*0.5+s*j;
c[2]=c0[2]+s*0.5+s*i;
if(bndry==Periodic){
if(c[0]<0.0) c[0]+=1.0;
if(c[0]>1.0) c[0]-=1.0;
if(c[1]<1.0) c[1]+=1.0;
if(c[1]>1.0) c[1]-=1.0;
if(c[2]<1.0) c[2]+=1.0;
if(c[2]>1.0) c[2]-=1.0;
}
node->SetColleague(NULL,idx);
if(c[0]<1.0 && c[0]>0.0)
if(c[1]<1.0 && c[1]>0.0)
if(c[2]<1.0 && c[2]>0.0){
MortonId m(c,d);
Node_t* nbr=FindNode(m,false,root_node);
while(nbr->Depth()>d) nbr=(Node_t*)nbr->Parent();
if(nbr->Depth()==d) node->SetColleague(nbr,idx);
}
idx++;
}
/*/
// Derive this node's colleagues from the parent's colleagues: every
// candidate is a child of one of the parent's 3^dim colleagues.
Node_t* parent_node;
Node_t* tmp_node1;
Node_t* tmp_node2;
for(int i=0;i<n1;i++)node->SetColleague(NULL,i);
parent_node=(Node_t*)node->Parent();
if(parent_node==NULL) return;
int l=node->Path2Node();
for(int i=0;i<n1;i++){ //For each coll of the parent
tmp_node1=(Node_t*)parent_node->Colleague(i);
if(tmp_node1!=NULL)
if(!tmp_node1->IsLeaf()){
for(int j=0;j<n2;j++){ //For each child
tmp_node2=(Node_t*)tmp_node1->Child(j);
if(tmp_node2!=NULL){
// Per-axis offset of candidate (parent-colleague i, child j)
// relative to this node (child index l); only offsets in {-1,0,1}
// along every axis make it a true colleague, stored at new_indx.
bool flag=true;
int a=1,b=1,new_indx=0;
for(int k=0;k<this->Dim();k++){
int indx_diff=(((i/b)%3)-1)*2+((j/a)%2)-((l/a)%2);
if(-1>indx_diff || indx_diff>1) flag=false;
new_indx+=(indx_diff+1)*b;
a*=2;b*=3;
}
if(flag){
node->SetColleague(tmp_node2,new_indx);
}
}
}
}
}// */
}
}
  1080. template <class TreeNode>
  1081. bool MPI_Tree<TreeNode>::CheckTree(){
  1082. int myrank,np;
  1083. MPI_Comm_rank(*Comm(),&myrank);
  1084. MPI_Comm_size(*Comm(),&np);
  1085. std::vector<MortonId> mins=GetMins();
  1086. std::stringstream st;
  1087. st<<"PID_"<<myrank<<" : ";
  1088. std::string str;
  1089. Node_t* n=this->PostorderFirst();
  1090. while(n!=NULL){
  1091. if(myrank<np-1) if(n->GetMortonId().getDFD()>=mins[myrank+1])break;
  1092. if(n->GetMortonId()>=mins[myrank] && n->IsLeaf() && n->IsGhost()){
  1093. std::cout<<n->GetMortonId()<<'\n';
  1094. std::cout<<mins[myrank]<<'\n';
  1095. if(myrank+1<np) std::cout<<mins[myrank+1]<<'\n';
  1096. std::cout<<myrank<<'\n';
  1097. assert(false);
  1098. }
  1099. if(n->GetMortonId()<mins[myrank] && n->IsLeaf() && !n->IsGhost()){
  1100. assert(false);
  1101. }
  1102. if(!n->IsGhost() && n->Depth()>0)
  1103. assert(!((Node_t*)n->Parent())->IsGhost());
  1104. n=this->PostorderNxt(n);
  1105. }
  1106. while(n!=NULL){
  1107. if(n->IsLeaf() && !n->IsGhost()){
  1108. st<<"non-ghost leaf node "<<n->GetMortonId()<<"; after last node.";
  1109. str=st.str(); ASSERT_WITH_MSG(false,str.c_str());
  1110. }
  1111. n=this->PostorderNxt(n);
  1112. }
  1113. return true;
  1114. };
  1115. /**
  1116. * \brief Determines if node is used in the region between Morton Ids m1 and m2
  1117. * ( m1 <= m2 ).
  1118. */
  1119. template <class TreeNode>
  1120. void IsShared(std::vector<TreeNode*>& nodes, MortonId* m1, MortonId* m2, BoundaryType bndry, std::vector<char>& shared_flag){
  1121. MortonId mm1, mm2;
  1122. if(m1!=NULL) mm1=m1->getDFD();
  1123. if(m2!=NULL) mm2=m2->getDFD();
  1124. shared_flag.resize(nodes.size());
  1125. int omp_p=omp_get_max_threads();
  1126. #pragma omp parallel for
  1127. for(int j=0;j<omp_p;j++){
  1128. size_t a=((j )*nodes.size())/omp_p;
  1129. size_t b=((j+1)*nodes.size())/omp_p;
  1130. std::vector<MortonId> nbr_lst;
  1131. for(size_t i=a;i<b;i++){
  1132. shared_flag[i]=false;
  1133. TreeNode* node=nodes[i];
  1134. assert(node!=NULL);
  1135. if(node->Depth()<2){
  1136. shared_flag[i]=true;
  1137. continue;
  1138. }
  1139. node->GetMortonId().NbrList(nbr_lst, node->Depth()-1, bndry==Periodic);
  1140. for(size_t k=0;k<nbr_lst.size();k++){
  1141. MortonId n1=nbr_lst[k] .getDFD();
  1142. MortonId n2=nbr_lst[k].NextId().getDFD();
  1143. if(m1==NULL || n2>mm1)
  1144. if(m2==NULL || n1<mm2){
  1145. shared_flag[i]=true;
  1146. break;
  1147. }
  1148. }
  1149. }
  1150. }
  1151. }
  1152. inline void IsShared(std::vector<PackedData>& nodes, MortonId* m1, MortonId* m2, BoundaryType bndry, std::vector<char>& shared_flag){
  1153. MortonId mm1, mm2;
  1154. if(m1!=NULL) mm1=m1->getDFD();
  1155. if(m2!=NULL) mm2=m2->getDFD();
  1156. shared_flag.resize(nodes.size());
  1157. int omp_p=omp_get_max_threads();
  1158. #pragma omp parallel for
  1159. for(int j=0;j<omp_p;j++){
  1160. size_t a=((j )*nodes.size())/omp_p;
  1161. size_t b=((j+1)*nodes.size())/omp_p;
  1162. std::vector<MortonId> nbr_lst;
  1163. for(size_t i=a;i<b;i++){
  1164. shared_flag[i]=false;
  1165. MortonId* node=(MortonId*)nodes[i].data;
  1166. assert(node!=NULL);
  1167. if(node->GetDepth()<2){
  1168. shared_flag[i]=true;
  1169. continue;
  1170. }
  1171. node->NbrList(nbr_lst, node->GetDepth()-1, bndry==Periodic);
  1172. for(size_t k=0;k<nbr_lst.size();k++){
  1173. MortonId n1=nbr_lst[k] .getDFD();
  1174. MortonId n2=nbr_lst[k].NextId().getDFD();
  1175. if(m1==NULL || n2>mm1)
  1176. if(m2==NULL || n1<mm2){
  1177. shared_flag[i]=true;
  1178. break;
  1179. }
  1180. }
  1181. }
  1182. }
  1183. }
  1184. /**
  1185. * \brief Construct Locally Essential Tree by exchanging Ghost octants.
  1186. */
  1187. template <class TreeNode>
  1188. void MPI_Tree<TreeNode>::ConstructLET(BoundaryType bndry){
  1189. //Profile::Tic("LET_Hypercube", &comm, true, 5);
  1190. //ConstructLET_Hypercube(bndry);
  1191. //Profile::Toc();
  1192. //Profile::Tic("LET_Sparse", &comm, true, 5);
  1193. ConstructLET_Sparse(bndry);
  1194. //Profile::Toc();
  1195. #ifndef NDEBUG
  1196. CheckTree();
  1197. #endif
  1198. }
  1199. /**
  1200. * \brief Hypercube based scheme to exchange Ghost octants.
  1201. */
  1202. //#define PREFETCH_T0(addr,nrOfBytesAhead) _mm_prefetch(((char *)(addr))+nrOfBytesAhead,_MM_HINT_T0)
template <class TreeNode>
void MPI_Tree<TreeNode>::ConstructLET_Hypercube(BoundaryType bndry){
// Exchange ghost octants with a hypercube-style recursive-halving scheme:
// in each round the active processor range is split in half, every rank
// exchanges the shared-node data relevant to the other half with a partner
// there, then recurses into its own half. Received octants are finally
// inserted into the local tree as ghosts.
int num_p,rank;
MPI_Comm_size(*Comm(),&num_p);
MPI_Comm_rank(*Comm(),&rank );
if(num_p==1) return;
int omp_p=omp_get_max_threads();
std::vector<MortonId> mins=GetMins();
// Build list of shared nodes.
// Local non-ghost nodes in this rank's range whose neighborhood touches
// another rank's range (checked against both partition boundaries).
std::vector<Node_t*> shared_nodes; shared_nodes.clear();
std::vector<Node_t*> node_lst; node_lst.clear();
Node_t* curr_node=this->PreorderFirst();
while(curr_node!=NULL){
if(curr_node->GetMortonId().getDFD()>=mins[rank]) break;
curr_node=this->PreorderNxt(curr_node);
}
while(curr_node!=NULL){
if(curr_node->IsGhost()) break;
node_lst.push_back(curr_node);
curr_node=this->PreorderNxt(curr_node);
}
std::vector<char> node_flag0; node_flag0.clear();
std::vector<char> node_flag1; node_flag1.clear();
IsShared(node_lst,&mins[0],&mins[rank],bndry,node_flag0);
if(rank<num_p-1) IsShared(node_lst,&mins[rank+1],NULL,bndry,node_flag1);
for(size_t i=0;i<node_lst.size();i++){
if(node_flag0[i] || (rank<num_p-1 && node_flag1[i]))
shared_nodes.push_back(node_lst[i]);
}
//std::cout<<"Shared = "<<shared_nodes.size()<<'\n';
// Pack shared nodes.
// shrd_buff_vec0: arena for locally packed nodes; shrd_buff_vec1: arena of
// fixed-size slots (max_data_size each) for received node data, recycled
// through mem_set. Buffers are static so they persist across calls.
static std::vector<char> shrd_buff_vec0(omp_p*64l*1024l*1024l);
static std::vector<char> shrd_buff_vec1(omp_p*128l*1024l*1024l);
static std::vector<char> send_buff_vec(omp_p*64l*1024l*1024l); char* send_buff;
static std::vector<char> recv_buff_vec(omp_p*64l*1024l*1024l); char* recv_buff;
std::vector<PackedData> shrd_data;
size_t max_data_size=0;
{
long max_data_size_lcl=0;
long max_data_size_glb=0;
char* data_ptr=&shrd_buff_vec0[0];
for(size_t i=0;i<shared_nodes.size();i++){
// Each packed record is prefixed with the node's MortonId.
PackedData p=shared_nodes[i]->Pack(true,data_ptr,sizeof(MortonId));
((MortonId*)data_ptr)[0]=shared_nodes[i]->GetMortonId();
p.length+=sizeof(MortonId);
shrd_data.push_back(p);
data_ptr+=p.length;
if(max_data_size_lcl<(long)p.length) max_data_size_lcl=p.length;
assert(data_ptr<=&(*shrd_buff_vec0.end())); //TODO: resize if needed.
}
// Global maximum record size defines the slot size in shrd_buff_vec1.
MPI_Allreduce(&max_data_size_lcl, &max_data_size_glb, 1, MPI_LONG, MPI_MAX, *Comm());
max_data_size=max_data_size_glb;
}
// Memory slots for storing node data.
std::set<void*> mem_set; // freed slots in shrd_buff_vec1, available for reuse
size_t mem_set_size=0;   // number of slots ever carved from shrd_buff_vec1
size_t range[2]={0,(size_t)num_p-1};
while(range[1]-range[0]>0){
// Split the current range at split_p; new_range is the half containing
// this rank, com_range is the other half (the communication targets).
size_t split_p=(range[0]+range[1])/2;
size_t new_range[2]={(size_t)rank<=split_p?range[0]:split_p+1,(size_t)rank<=split_p?split_p:range[1]};
size_t com_range[2]={(size_t)rank> split_p?range[0]:split_p+1,(size_t)rank> split_p?split_p:range[1]};
size_t partner=rank-new_range[0]+com_range[0];
if(partner>range[1]) partner--;
// When the halves differ in size, the last rank also serves split_p as an
// extra partner (receive-only on this side).
bool extra_partner=((size_t)rank==range[1] && ((range[1]-range[0])%2)==0);
int send_length=0;
std::vector<PackedData> shrd_data_new;
// node_flag0: record is needed by the partner's half (must be sent);
// node_flag1: record is still needed by our own half (must be kept).
IsShared(shrd_data, &mins[com_range[0]], (com_range[1]==(size_t)num_p-1?NULL:&mins[com_range[1]+1]),bndry, node_flag0);
IsShared(shrd_data, &mins[new_range[0]], (new_range[1]==(size_t)num_p-1?NULL:&mins[new_range[1]+1]),bndry, node_flag1);
{
std::vector<void*> srctrg_ptr;
std::vector<size_t> mem_size;
for(size_t i=0;i<shrd_data.size();i++){
PackedData& p=shrd_data[i];
if( node_flag0[i]){ // Copy data to send buffer.
// Layout: [size_t length][payload]; actual copying is deferred to the
// parallel loop below via (src,dst) pointer pairs.
char* data_ptr=(char*)&send_buff_vec[send_length];
((size_t*)data_ptr)[0]=p.length; data_ptr+=sizeof(size_t);
//mem::memcopy(data_ptr,p.data,p.length);
mem_size.push_back(p.length);
srctrg_ptr.push_back(p.data);
srctrg_ptr.push_back(data_ptr);
send_length+=p.length+sizeof(size_t);
assert((size_t)send_length<=send_buff_vec.size()); //TODO: resize if needed.
}
if(!node_flag1[i]){ // Free memory slot.
//assert(node_flag0[0]);
if(p.data>=&shrd_buff_vec1[0] && p.data<&shrd_buff_vec1[0]+shrd_buff_vec1.size())
mem_set.insert(p.data);
} else shrd_data_new.push_back(p);
}
shrd_data=shrd_data_new;
// Perform the deferred copies in parallel.
#pragma omp parallel for
for(int k=0;k<omp_p;k++){
size_t i0=((k+0)*mem_size.size())/omp_p;
size_t i1=((k+1)*mem_size.size())/omp_p;
for(size_t i=i0;i<i1;i++){
mem::memcopy(srctrg_ptr[2*i+1],srctrg_ptr[2*i+0],mem_size[i]);
}
}
}
//Exchange send size.
int recv_length=0;
int extra_recv_length=0;
int extra_send_length=0;
MPI_Status status;
MPI_Sendrecv (& send_length,1,MPI_INT,partner,0, &recv_length,1,MPI_INT,partner,0,*Comm(),&status);
if(extra_partner) MPI_Sendrecv(&extra_send_length,1,MPI_INT,split_p,0,&extra_recv_length,1,MPI_INT,split_p,0,*Comm(),&status);
//SendRecv data.
assert((size_t)send_length <=send_buff_vec.size()); send_buff=&send_buff_vec[0];
assert((size_t)recv_length+extra_recv_length<=recv_buff_vec.size()); recv_buff=&recv_buff_vec[0];
MPI_Sendrecv (send_buff,send_length,MPI_BYTE,partner,0, recv_buff , recv_length,MPI_BYTE,partner,0,*Comm(),&status);
if(extra_partner) MPI_Sendrecv( NULL, 0,MPI_BYTE,split_p,0,&recv_buff[recv_length],extra_recv_length,MPI_BYTE,split_p,0,*Comm(),&status);
//Get nodes from received data.
{
std::vector<void*> srctrg_ptr;
std::vector<size_t> mem_size;
int buff_length=0;
while(buff_length<recv_length+extra_recv_length){
// Walk the [length][payload] records in the receive buffer; each gets a
// slot in shrd_buff_vec1 (recycled from mem_set when available).
PackedData p0,p1;
p0.length=((size_t*)&recv_buff_vec[buff_length])[0];
p0.data=(char*)&recv_buff_vec[buff_length]+sizeof(size_t);
buff_length+=p0.length+sizeof(size_t);
p1.length=p0.length;
if(mem_set.size()==0){
assert(mem_set_size*max_data_size<shrd_buff_vec1.size());
p1.data=&shrd_buff_vec1[mem_set_size*max_data_size];
mem_set_size++;
}else{
p1.data=*mem_set.begin();
mem_set.erase(mem_set.begin());
}
//mem::memcopy(p1.data,p0.data,p0.length);
mem_size.push_back(p0.length);
srctrg_ptr.push_back(p0.data);
srctrg_ptr.push_back(p1.data);
shrd_data.push_back(p1);
}
// Perform the deferred copies in parallel.
#pragma omp parallel for
for(int k=0;k<omp_p;k++){
size_t i0=((k+0)*mem_size.size())/omp_p;
size_t i1=((k+1)*mem_size.size())/omp_p;
for(size_t i=i0;i<i1;i++){
mem::memcopy(srctrg_ptr[2*i+1],srctrg_ptr[2*i+0],mem_size[i]);
}
}
}
// Recurse into our own half.
range[0]=new_range[0];
range[1]=new_range[1];
}
//Add shared_nodes to the tree.
//std::cout<<"Number of Ghost Nodes = "<<shrd_data.size()<<'\n';
int nchld=(1UL<<this->Dim()); // Number of children.
std::vector<Node_t*> shrd_nodes(shrd_data.size());
for(size_t i=0;i<shrd_data.size();i++){ // Find shared nodes.
// Descend from the root to the octant with the record's MortonId,
// subdividing (as ghost) where the path does not yet exist.
MortonId& mid=*(MortonId*)shrd_data[i].data;
Node_t* srch_node=this->RootNode();
while(srch_node->GetMortonId()!=mid){
Node_t* ch_node;
if(srch_node->IsLeaf()){
srch_node->SetGhost(true);
srch_node->Subdivide();
}
for(int j=nchld-1;j>=0;j--){
ch_node=(Node_t*)srch_node->Child(j);
if(ch_node->GetMortonId()<=mid){
srch_node=ch_node;
break;
}
}
}
shrd_nodes[i]=srch_node;
}
#pragma omp parallel for
for(size_t i=0;i<shrd_data.size();i++){
if(shrd_nodes[i]->IsGhost()) { // Initialize ghost node.
// Skip the MortonId prefix and unpack the node payload.
PackedData p=shrd_data[i];
p.data=((char*)p.data)+sizeof(MortonId);
p.length-=sizeof(MortonId);
shrd_nodes[i]->Unpack(p);
}
}
//Now LET is complete.
}
  1385. /**
  1386. * \brief Sparse communication scheme to exchange Ghost octants.
  1387. */
  1388. template <class TreeNode>
  1389. void MPI_Tree<TreeNode>::ConstructLET_Sparse(BoundaryType bndry){
  1390. typedef int MPI_size_t;
  1391. struct CommData{
  1392. MortonId mid;
  1393. TreeNode* node;
  1394. size_t pkd_length;
  1395. size_t usr_cnt;
  1396. MortonId usr_mid[COLLEAGUE_COUNT];
  1397. size_t usr_pid[COLLEAGUE_COUNT];
  1398. };
  1399. int num_p,rank;
  1400. MPI_Comm_size(*Comm(),&num_p);
  1401. MPI_Comm_rank(*Comm(),&rank );
  1402. if(num_p==1) return;
  1403. int omp_p=omp_get_max_threads();
  1404. std::vector<MortonId> mins=GetMins();
  1405. // Allocate Memory.
  1406. static std::vector<char> send_buff;
  1407. static std::vector<char> recv_buff;
  1408. //Profile::Tic("SharedNodes", &comm, false, 5);
  1409. CommData* node_comm_data=NULL; // CommData for all nodes.
  1410. std::vector<void*> shared_data; // CommData for shared nodes.
  1411. std::vector<par::SortPair<size_t,size_t> > pid_node_pair; // <pid, shared_data index> list
  1412. { // Set node_comm_data
  1413. MortonId mins_r0=mins[ rank+0 ].getDFD();
  1414. MortonId mins_r1=mins[std::min(rank+1,num_p-1)].getDFD();
  1415. std::vector<TreeNode*> nodes=this->GetNodeList();
  1416. node_comm_data=(CommData*)this->memgr.malloc(sizeof(CommData)*nodes.size());
  1417. #pragma omp parallel for
  1418. for(size_t tid=0;tid<omp_p;tid++){
  1419. std::vector<MortonId> nbr_lst;
  1420. size_t a=(nodes.size()* tid )/omp_p;
  1421. size_t b=(nodes.size()*(tid+1))/omp_p;
  1422. for(size_t i=a;i<b;i++){
  1423. bool shared=false;
  1424. CommData& comm_data=node_comm_data[i];
  1425. comm_data.node=nodes[i];
  1426. comm_data.mid=comm_data.node->GetMortonId();
  1427. comm_data.usr_cnt=0;
  1428. if(comm_data.node->IsGhost()) continue;
  1429. if(comm_data.node->Depth()==0) continue;
  1430. if(comm_data.mid.getDFD()<mins_r0) continue;
  1431. //MortonId mid0=comm_data.mid. getDFD();
  1432. //MortonId mid1=comm_data.mid.NextId().getDFD();
  1433. comm_data.mid.NbrList(nbr_lst,comm_data.node->Depth()-1, bndry==Periodic);
  1434. comm_data.usr_cnt=nbr_lst.size();
  1435. for(size_t j=0;j<nbr_lst.size();j++){
  1436. MortonId usr_mid=nbr_lst[j];
  1437. MortonId usr_mid_dfd=usr_mid.getDFD();
  1438. comm_data.usr_mid[j]=usr_mid;
  1439. comm_data.usr_pid[j]=std::upper_bound(&mins[0],&mins[num_p],usr_mid_dfd)-&mins[0]-1;
  1440. // if(usr_mid_dfd<mins_r0 || (rank+1<num_p && usr_mid_dfd>=mins_r1)){ // Find the user pid.
  1441. // size_t usr_pid=std::upper_bound(&mins[0],&mins[num_p],usr_mid_dfd)-&mins[0]-1;
  1442. // comm_data.usr_pid[j]=usr_pid;
  1443. // }else comm_data.usr_pid[j]=rank;
  1444. if(!shared){ // Check if this node needs to be transferred during broadcast.
  1445. if(comm_data.usr_pid[j]!=rank || (rank+1<num_p && usr_mid.NextId()>mins_r1) ){
  1446. shared=true;
  1447. }
  1448. }
  1449. }
  1450. if(shared){
  1451. #pragma omp critical (ADD_SHARED)
  1452. {
  1453. for(size_t j=0;j<comm_data.usr_cnt;j++)
  1454. if(comm_data.usr_pid[j]!=rank){
  1455. bool unique_pid=true;
  1456. for(size_t k=0;k<j;k++){
  1457. if(comm_data.usr_pid[j]==comm_data.usr_pid[k]){
  1458. unique_pid=false;
  1459. break;
  1460. }
  1461. }
  1462. if(unique_pid){
  1463. par::SortPair<size_t,size_t> p;
  1464. p.key=comm_data.usr_pid[j];
  1465. p.data=shared_data.size();
  1466. pid_node_pair.push_back(p);
  1467. }
  1468. }
  1469. shared_data.push_back(&comm_data);
  1470. }
  1471. }
  1472. }
  1473. }
  1474. omp_par::merge_sort(&pid_node_pair[0], &pid_node_pair[pid_node_pair.size()]);
  1475. //std::cout<<rank<<' '<<shared_data.size()<<' '<<pid_node_pair.size()<<'\n';
  1476. }
  1477. //Profile::Toc();
  1478. //Profile::Tic("PackNodes", &comm, false, 5);
  1479. { // Pack shared nodes.
  1480. #pragma omp parallel for
  1481. for(size_t tid=0;tid<omp_p;tid++){
  1482. size_t buff_length=10l*1024l*1024l; // 10MB buffer per thread.
  1483. char* buff=(char*)this->memgr.malloc(buff_length);
  1484. size_t a=( tid *shared_data.size())/omp_p;
  1485. size_t b=((tid+1)*shared_data.size())/omp_p;
  1486. for(size_t i=a;i<b;i++){
  1487. CommData& comm_data=*(CommData*)shared_data[i];
  1488. PackedData p0=comm_data.node->Pack(true,buff);
  1489. assert(p0.length<buff_length);
  1490. shared_data[i]=this->memgr.malloc(sizeof(CommData)+p0.length);
  1491. CommData& new_comm_data=*(CommData*)shared_data[i];
  1492. new_comm_data=comm_data;
  1493. new_comm_data.pkd_length=sizeof(CommData)+p0.length;
  1494. mem::memcopy(((char*)shared_data[i])+sizeof(CommData),buff,p0.length);
  1495. }
  1496. this->memgr.free(buff);
  1497. }
  1498. // now CommData is stored in shared_data
  1499. this->memgr.free(node_comm_data);
  1500. node_comm_data=NULL;
  1501. }
  1502. //Profile::Toc();
  1503. //Profile::Tic("SendBuff", &comm, false, 5);
  1504. std::vector<MPI_size_t> send_size(num_p,0);
  1505. std::vector<MPI_size_t> send_disp(num_p,0);
  1506. if(pid_node_pair.size()){ // Build send_buff.
  1507. std::vector<size_t> size(pid_node_pair.size(),0);
  1508. std::vector<size_t> disp(pid_node_pair.size(),0);
  1509. #pragma omp parallel for
  1510. for(size_t i=0;i<pid_node_pair.size();i++){
  1511. size[i]=((CommData*)shared_data[pid_node_pair[i].data])->pkd_length;
  1512. }
  1513. omp_par::scan(&size[0],&disp[0],pid_node_pair.size());
  1514. // Resize send_buff.
  1515. if(send_buff.size()<size[pid_node_pair.size()-1]+disp[pid_node_pair.size()-1]){
  1516. send_buff.resize(size[pid_node_pair.size()-1]+disp[pid_node_pair.size()-1]);
  1517. }
  1518. // Copy data to send_buff.
  1519. #pragma omp parallel for
  1520. for(size_t i=0;i<pid_node_pair.size();i++){
  1521. size_t shrd_idx=pid_node_pair[i].data;
  1522. mem::memcopy(&send_buff[disp[i]], shared_data[shrd_idx], size[i]);
  1523. }
  1524. // Compute send_size, send_disp.
  1525. {
  1526. // Compute send_size.
  1527. #pragma omp parallel for
  1528. for(size_t tid=0;tid<omp_p;tid++){
  1529. size_t a=(pid_node_pair.size()* tid )/omp_p;
  1530. size_t b=(pid_node_pair.size()*(tid+1))/omp_p;
  1531. if(a>0 && a<pid_node_pair.size()){
  1532. size_t p0=pid_node_pair[a].key;
  1533. while(a<pid_node_pair.size() && p0==pid_node_pair[a].key) a++;
  1534. }
  1535. if(b>0 && b<pid_node_pair.size()){
  1536. size_t p1=pid_node_pair[b].key;
  1537. while(b<pid_node_pair.size() && p1==pid_node_pair[b].key) b++;
  1538. }
  1539. for(size_t i=a;i<b;i++){
  1540. send_size[pid_node_pair[i].key]+=size[i];
  1541. }
  1542. }
  1543. // Compute send_disp.
  1544. omp_par::scan(&send_size[0],&send_disp[0],num_p);
  1545. }
  1546. }
  1547. //Profile::Toc();
  1548. //Profile::Tic("A2A_Sparse", &comm, true, 5);
  1549. size_t recv_length=0;
  1550. { // Allocate recv_buff.
  1551. std::vector<MPI_size_t> recv_size(num_p,0);
  1552. std::vector<MPI_size_t> recv_disp(num_p,0);
  1553. MPI_Alltoall(&send_size[0], 1, par::Mpi_datatype<MPI_size_t>::value(),
  1554. &recv_size[0], 1, par::Mpi_datatype<MPI_size_t>::value(), *Comm());
  1555. omp_par::scan(&recv_size[0],&recv_disp[0],num_p);
  1556. recv_length=recv_size[num_p-1]+recv_disp[num_p-1];
  1557. if(recv_buff.size()<recv_length){
  1558. recv_buff.resize(recv_length);
  1559. }
  1560. par::Mpi_Alltoallv_sparse(&send_buff[0], &send_size[0], &send_disp[0],
  1561. &recv_buff[0], &recv_size[0], &recv_disp[0], *Comm());
  1562. }
  1563. //Profile::Toc();
  1564. //Profile::Tic("Unpack", &comm, false, 5);
  1565. std::vector<void*> recv_data; // CommData for received nodes.
  1566. { // Unpack received octants.
  1567. std::vector<par::SortPair<MortonId,size_t> > mid_indx_pair;
  1568. for(size_t i=0; i<recv_length;){
  1569. CommData& comm_data=*(CommData*)&recv_buff[i];
  1570. recv_data.push_back(&comm_data);
  1571. { // Add mid_indx_pair
  1572. par::SortPair<MortonId,size_t> p;
  1573. p.key=comm_data.mid;
  1574. p.data=mid_indx_pair.size();
  1575. mid_indx_pair.push_back(p);
  1576. }
  1577. i+=comm_data.pkd_length;
  1578. assert(comm_data.pkd_length>0);
  1579. }
  1580. std::vector<Node_t*> recv_nodes(recv_data.size());
  1581. { // Find received octants in tree.
  1582. omp_par::merge_sort(&mid_indx_pair[0], &mid_indx_pair[0]+mid_indx_pair.size());
  1583. std::vector<size_t> indx(omp_p+1);
  1584. for(size_t i=0;i<=omp_p;i++){
  1585. size_t j=(mid_indx_pair.size()*i)/omp_p;
  1586. if(j>0) while(j<mid_indx_pair.size()-1){
  1587. if(mid_indx_pair[j+1].key.GetDepth()<=
  1588. mid_indx_pair[j].key.GetDepth()) break;
  1589. j++;
  1590. }
  1591. indx[i]=j;
  1592. }
  1593. int nchld=(1UL<<this->Dim()); // Number of children.
  1594. if(mid_indx_pair.size()>0)
  1595. for(size_t tid=1;tid<omp_p;tid++){
  1596. size_t j=indx[tid];
  1597. MortonId& mid=mid_indx_pair[j].key;
  1598. Node_t* srch_node=this->RootNode();
  1599. while(srch_node->GetMortonId()!=mid){
  1600. Node_t* ch_node;
  1601. if(srch_node->IsLeaf()){
  1602. srch_node->SetGhost(true);
  1603. srch_node->Subdivide();
  1604. }
  1605. for(int j=nchld-1;j>=0;j--){
  1606. ch_node=(Node_t*)srch_node->Child(j);
  1607. if(ch_node->GetMortonId()<=mid){
  1608. srch_node=ch_node;
  1609. break;
  1610. }
  1611. }
  1612. }
  1613. }
  1614. #pragma omp parallel for
  1615. for(size_t tid=0;tid<omp_p;tid++){
  1616. size_t a=indx[tid ];
  1617. size_t b=indx[tid+1];
  1618. for(size_t j=a;j<b;j++){ // Find shared nodes.
  1619. size_t i=mid_indx_pair[j].data;
  1620. MortonId& mid=mid_indx_pair[j].key;
  1621. Node_t* srch_node=this->RootNode();
  1622. while(srch_node->GetMortonId()!=mid){
  1623. Node_t* ch_node;
  1624. if(srch_node->IsLeaf()){
  1625. srch_node->SetGhost(true);
  1626. srch_node->Subdivide();
  1627. }
  1628. for(int j=nchld-1;j>=0;j--){
  1629. ch_node=(Node_t*)srch_node->Child(j);
  1630. if(ch_node->GetMortonId()<=mid){
  1631. srch_node=ch_node;
  1632. break;
  1633. }
  1634. }
  1635. }
  1636. recv_nodes[i]=srch_node;
  1637. }
  1638. }
  1639. }
  1640. #pragma omp parallel for
  1641. for(size_t i=0;i<recv_data.size();i++){ // Unpack
  1642. if(!recv_nodes[i]->IsGhost()) continue;
  1643. assert(recv_nodes[i]->IsGhost());
  1644. CommData& comm_data=*(CommData*)recv_data[i];
  1645. PackedData p;
  1646. p.data=((char*)recv_data[i])+sizeof(CommData);
  1647. p.length=comm_data.pkd_length-sizeof(CommData);
  1648. recv_nodes[i]->Unpack(p);
  1649. }
  1650. }
  1651. //Profile::Toc();
  1652. //Profile::Tic("Broadcast", &comm, true, 5);
  1653. { // Broadcast octants.
  1654. std::vector<MortonId> shrd_mid;
  1655. if(rank+1<num_p){ // Set shrd_mid.
  1656. MortonId m=mins[rank+1];
  1657. while(m.GetDepth()>0 && m.getDFD()>=mins[rank+1]){
  1658. m=m.getAncestor(m.GetDepth()-1);
  1659. }
  1660. size_t d=m.GetDepth()+1;
  1661. shrd_mid.resize(d);
  1662. for(size_t i=0;i<d;i++){
  1663. shrd_mid[i]=m.getAncestor(i);
  1664. }
  1665. }
  1666. std::vector<void*> shrd_data; // CommData for shared nodes.
  1667. { // Set shrd_data
  1668. for(size_t i=0;i<shared_data.size();i++){
  1669. CommData& comm_data=*(CommData*)shared_data[i];
  1670. assert(comm_data.mid.GetDepth()>0);
  1671. size_t d=comm_data.mid.GetDepth()-1;
  1672. if(d<shrd_mid.size() && shrd_mid[d].getDFD()>=mins[rank])
  1673. for(size_t j=0;j<comm_data.usr_cnt;j++){
  1674. if(comm_data.usr_mid[j]==shrd_mid[d]){
  1675. shrd_data.push_back(&comm_data);
  1676. break;
  1677. }
  1678. }
  1679. if(shrd_data.size()==0 || shrd_data.back()!=&comm_data) this->memgr.free(&comm_data);
  1680. }
  1681. for(size_t i=0;i<recv_data.size();i++){
  1682. CommData& comm_data=*(CommData*)recv_data[i];
  1683. assert(comm_data.mid.GetDepth()>0);
  1684. size_t d=comm_data.mid.GetDepth()-1;
  1685. if(d<shrd_mid.size() && shrd_mid[d].getDFD()>=mins[rank])
  1686. for(size_t j=0;j<comm_data.usr_cnt;j++){
  1687. if(comm_data.usr_mid[j]==shrd_mid[d]){
  1688. char* data_ptr=(char*)this->memgr.malloc(comm_data.pkd_length);
  1689. mem::memcopy(data_ptr, &comm_data, comm_data.pkd_length);
  1690. shrd_data.push_back(data_ptr);
  1691. break;
  1692. }
  1693. }
  1694. }
  1695. }
  1696. size_t pid_shift=1;
  1697. while(pid_shift<num_p){
  1698. MPI_size_t recv_pid=(rank>=pid_shift?rank-pid_shift:rank);
  1699. MPI_size_t send_pid=(rank+pid_shift<num_p?rank+pid_shift:rank);
  1700. MPI_size_t send_length=0;
  1701. if(send_pid!=rank){ // Send data for send_pid
  1702. std::vector<void*> send_data;
  1703. std::vector<size_t> send_size;
  1704. for(size_t i=0; i<shrd_data.size();i++){
  1705. CommData& comm_data=*(CommData*)shrd_data[i];
  1706. size_t d=comm_data.mid.GetDepth()-1;
  1707. bool shared=(d<shrd_mid.size() && shrd_mid[d].NextId().getDFD()>mins[send_pid].getDFD());
  1708. if(shared) for(size_t j=0;j<comm_data.usr_cnt;j++){ // if send_pid already has this node then skip
  1709. if(comm_data.usr_pid[j]==send_pid){
  1710. shared=false;
  1711. break;
  1712. }
  1713. }
  1714. if(!shared) continue;
  1715. send_data.push_back(&comm_data);
  1716. send_size.push_back(comm_data.pkd_length);
  1717. }
  1718. std::vector<size_t> send_disp(send_data.size(),0);
  1719. omp_par::scan(&send_size[0],&send_disp[0],send_data.size());
  1720. if(send_data.size()>0) send_length=send_size.back()+send_disp.back();
  1721. // Resize send_buff.
  1722. if(send_buff.size()<send_length){
  1723. send_buff.resize(send_length);
  1724. }
  1725. // Copy data to send_buff.
  1726. #pragma omp parallel for
  1727. for(size_t i=0;i<send_data.size();i++){
  1728. CommData& comm_data=*(CommData*)send_data[i];
  1729. mem::memcopy(&send_buff[send_disp[i]], &comm_data, comm_data.pkd_length);
  1730. }
  1731. }
  1732. MPI_size_t recv_length=0;
  1733. { // Send-Recv data
  1734. MPI_Request request;
  1735. MPI_Status status;
  1736. if(recv_pid!=rank) MPI_Irecv(&recv_length, 1, par::Mpi_datatype<MPI_size_t>::value(),recv_pid, 1, *Comm(), &request);
  1737. if(send_pid!=rank) MPI_Send (&send_length, 1, par::Mpi_datatype<MPI_size_t>::value(),send_pid, 1, *Comm());
  1738. if(recv_pid!=rank) MPI_Wait(&request, &status);
  1739. // Resize recv_buff
  1740. if(recv_buff.size()<recv_length){
  1741. recv_buff.resize(recv_length);
  1742. }
  1743. if(recv_length>0) MPI_Irecv(&recv_buff[0], recv_length, par::Mpi_datatype<char>::value(),recv_pid, 1, *Comm(), &request);
  1744. if(send_length>0) MPI_Send (&send_buff[0], send_length, par::Mpi_datatype<char>::value(),send_pid, 1, *Comm());
  1745. if(recv_length>0) MPI_Wait(&request, &status);
  1746. }
  1747. std::vector<void*> recv_data; // CommData for received nodes.
  1748. { // Unpack received octants.
  1749. std::vector<par::SortPair<MortonId,size_t> > mid_indx_pair;
  1750. for(size_t i=0; i<recv_length;){
  1751. CommData& comm_data=*(CommData*)&recv_buff[i];
  1752. recv_data.push_back(&comm_data);
  1753. { // Add mid_indx_pair
  1754. par::SortPair<MortonId,size_t> p;
  1755. p.key=comm_data.mid;
  1756. p.data=mid_indx_pair.size();
  1757. mid_indx_pair.push_back(p);
  1758. }
  1759. i+=comm_data.pkd_length;
  1760. assert(comm_data.pkd_length>0);
  1761. }
  1762. std::vector<Node_t*> recv_nodes(recv_data.size());
  1763. // int nchld=(1UL<<this->Dim()); // Number of children.
  1764. // for(size_t i=0;i<recv_data.size();i++){ // Find received octants in tree.
  1765. // CommData& comm_data=*(CommData*)recv_data[i];
  1766. // MortonId& mid=comm_data.mid;
  1767. // Node_t* srch_node=this->RootNode();
  1768. // while(srch_node->GetMortonId()!=mid){
  1769. // Node_t* ch_node;
  1770. // if(srch_node->IsLeaf()){
  1771. // srch_node->SetGhost(true);
  1772. // srch_node->Subdivide();
  1773. // }
  1774. // for(int j=nchld-1;j>=0;j--){
  1775. // ch_node=(Node_t*)srch_node->Child(j);
  1776. // if(ch_node->GetMortonId()<=mid){
  1777. // srch_node=ch_node;
  1778. // break;
  1779. // }
  1780. // }
  1781. // }
  1782. // recv_nodes[i]=srch_node;
  1783. // }
  1784. { // Find received octants in tree.
  1785. omp_par::merge_sort(&mid_indx_pair[0], &mid_indx_pair[0]+mid_indx_pair.size());
  1786. std::vector<size_t> indx(omp_p+1);
  1787. for(size_t i=0;i<=omp_p;i++){
  1788. size_t j=(mid_indx_pair.size()*i)/omp_p;
  1789. if(j>0) while(j<mid_indx_pair.size()-1){
  1790. if(mid_indx_pair[j+1].key.GetDepth()<=
  1791. mid_indx_pair[j].key.GetDepth()) break;
  1792. j++;
  1793. }
  1794. indx[i]=j;
  1795. }
  1796. int nchld=(1UL<<this->Dim()); // Number of children.
  1797. if(mid_indx_pair.size()>0)
  1798. for(size_t tid=1;tid<omp_p;tid++){
  1799. size_t j=indx[tid];
  1800. MortonId& mid=mid_indx_pair[j].key;
  1801. Node_t* srch_node=this->RootNode();
  1802. while(srch_node->GetMortonId()!=mid){
  1803. Node_t* ch_node;
  1804. if(srch_node->IsLeaf()){
  1805. srch_node->SetGhost(true);
  1806. srch_node->Subdivide();
  1807. }
  1808. for(int j=nchld-1;j>=0;j--){
  1809. ch_node=(Node_t*)srch_node->Child(j);
  1810. if(ch_node->GetMortonId()<=mid){
  1811. srch_node=ch_node;
  1812. break;
  1813. }
  1814. }
  1815. }
  1816. }
  1817. #pragma omp parallel for
  1818. for(size_t tid=0;tid<omp_p;tid++){
  1819. size_t a=indx[tid ];
  1820. size_t b=indx[tid+1];
  1821. for(size_t j=a;j<b;j++){ // Find shared nodes.
  1822. size_t i=mid_indx_pair[j].data;
  1823. MortonId& mid=mid_indx_pair[j].key;
  1824. Node_t* srch_node=this->RootNode();
  1825. while(srch_node->GetMortonId()!=mid){
  1826. Node_t* ch_node;
  1827. if(srch_node->IsLeaf()){
  1828. srch_node->SetGhost(true);
  1829. srch_node->Subdivide();
  1830. }
  1831. for(int j=nchld-1;j>=0;j--){
  1832. ch_node=(Node_t*)srch_node->Child(j);
  1833. if(ch_node->GetMortonId()<=mid){
  1834. srch_node=ch_node;
  1835. break;
  1836. }
  1837. }
  1838. }
  1839. recv_nodes[i]=srch_node;
  1840. }
  1841. }
  1842. }
  1843. #pragma omp parallel for
  1844. for(size_t i=0;i<recv_data.size();i++){
  1845. if(!recv_nodes[i]->IsGhost()) continue;
  1846. assert(recv_nodes[i]->IsGhost());
  1847. CommData& comm_data=*(CommData*)recv_data[i];
  1848. PackedData p;
  1849. p.data=((char*)recv_data[i])+sizeof(CommData);
  1850. p.length=comm_data.pkd_length-sizeof(CommData);
  1851. recv_nodes[i]->Unpack(p);
  1852. }
  1853. }
  1854. pid_shift<<=1;
  1855. send_pid=(rank+pid_shift<num_p?rank+pid_shift:rank);
  1856. if(send_pid!=rank){ // Set shrd_data
  1857. for(size_t i=0;i<recv_data.size();i++){
  1858. CommData& comm_data=*(CommData*)recv_data[i];
  1859. //{ // Skip if this node already exists.
  1860. // bool skip=false;
  1861. // for(size_t k=0;k<shrd_data.size();k++){
  1862. // CommData& comm_data_=*(CommData*)shrd_data[k];
  1863. // if(comm_data_.mid==comm_data.mid){
  1864. // assert(false);
  1865. // skip=true;
  1866. // break;
  1867. // }
  1868. // }
  1869. // if(skip) continue;
  1870. //}
  1871. assert(comm_data.mid.GetDepth()>0);
  1872. size_t d=comm_data.mid.GetDepth()-1;
  1873. if(d<shrd_mid.size() && shrd_mid[d].isAncestor(mins[rank]) && shrd_mid[d].NextId().getDFD()>mins[send_pid].getDFD())
  1874. for(size_t j=0;j<comm_data.usr_cnt;j++){
  1875. if(comm_data.usr_mid[j]==shrd_mid[d]){
  1876. char* data_ptr=(char*)this->memgr.malloc(comm_data.pkd_length);
  1877. mem::memcopy(data_ptr, &comm_data, comm_data.pkd_length);
  1878. shrd_data.push_back(data_ptr);
  1879. break;
  1880. }
  1881. }
  1882. }
  1883. }
  1884. }
  1885. // Free data
  1886. //Profile::Tic("Free", &comm, false, 5);
  1887. for(size_t i=0;i<shrd_data.size();i++) this->memgr.free(shrd_data[i]);
  1888. //Profile::Toc();
  1889. }
  1890. //Profile::Toc();
  1891. }
  1892. inline bool isLittleEndian(){
  1893. uint16_t number = 0x1;
  1894. uint8_t *numPtr = (uint8_t*)&number;
  1895. return (numPtr[0] == 1);
  1896. }
  1897. template <class TreeNode>
  1898. void MPI_Tree<TreeNode>::Write2File(const char* fname, int lod){
  1899. typedef double VTKReal_t;
  1900. int myrank, np;
  1901. MPI_Comm_size(*Comm(),&np);
  1902. MPI_Comm_rank(*Comm(),&myrank);
  1903. VTUData_t<VTKReal_t> vtu_data;
  1904. TreeNode::VTU_Data(vtu_data, this->GetNodeList(), lod);
  1905. std::vector<VTKReal_t>& coord=vtu_data.coord;
  1906. std::vector<std::string>& name =vtu_data.name;
  1907. std::vector<std::vector<VTKReal_t> >& value=vtu_data.value;
  1908. std::vector<int32_t>& connect=vtu_data.connect;
  1909. std::vector<int32_t>& offset =vtu_data.offset;
  1910. std::vector<uint8_t>& types =vtu_data.types;
  1911. int pt_cnt=coord.size()/COORD_DIM;
  1912. int cell_cnt=types.size();
  1913. std::vector<int32_t> mpi_rank; //MPI_Rank at points.
  1914. int new_myrank=myrank;//rand();
  1915. mpi_rank.resize(pt_cnt,new_myrank);
  1916. //Open file for writing.
  1917. std::stringstream vtufname;
  1918. vtufname<<fname<<std::setfill('0')<<std::setw(6)<<myrank<<".vtu";
  1919. std::ofstream vtufile;
  1920. vtufile.open(vtufname.str().c_str());
  1921. if(vtufile.fail()) return;
  1922. //Proceed to write to file.
  1923. size_t data_size=0;
  1924. vtufile<<"<?xml version=\"1.0\"?>\n";
  1925. if(isLittleEndian()) vtufile<<"<VTKFile type=\"UnstructuredGrid\" version=\"0.1\" byte_order=\"LittleEndian\">\n";
  1926. else vtufile<<"<VTKFile type=\"UnstructuredGrid\" version=\"0.1\" byte_order=\"BigEndian\">\n";
  1927. //===========================================================================
  1928. vtufile<<" <UnstructuredGrid>\n";
  1929. vtufile<<" <Piece NumberOfPoints=\""<<pt_cnt<<"\" NumberOfCells=\""<<cell_cnt<<"\">\n";
  1930. //---------------------------------------------------------------------------
  1931. vtufile<<" <Points>\n";
  1932. vtufile<<" <DataArray type=\"Float"<<sizeof(VTKReal_t)*8<<"\" NumberOfComponents=\""<<COORD_DIM<<"\" Name=\"Position\" format=\"appended\" offset=\""<<data_size<<"\" />\n";
  1933. data_size+=sizeof(uint32_t)+coord.size()*sizeof(VTKReal_t);
  1934. vtufile<<" </Points>\n";
  1935. //---------------------------------------------------------------------------
  1936. vtufile<<" <PointData>\n";
  1937. for(size_t i=0;i<name.size();i++) if(value[i].size()){ // value
  1938. vtufile<<" <DataArray type=\"Float"<<sizeof(VTKReal_t)*8<<"\" NumberOfComponents=\""<<value[i].size()/pt_cnt<<"\" Name=\""<<name[i]<<"\" format=\"appended\" offset=\""<<data_size<<"\" />\n";
  1939. data_size+=sizeof(uint32_t)+value[i].size()*sizeof(VTKReal_t);
  1940. }
  1941. { // mpi_rank
  1942. vtufile<<" <DataArray type=\"Int32\" NumberOfComponents=\"1\" Name=\"mpi_rank\" format=\"appended\" offset=\""<<data_size<<"\" />\n";
  1943. data_size+=sizeof(uint32_t)+mpi_rank.size()*sizeof(int32_t);
  1944. }
  1945. vtufile<<" </PointData>\n";
  1946. //---------------------------------------------------------------------------
  1947. //---------------------------------------------------------------------------
  1948. vtufile<<" <Cells>\n";
  1949. vtufile<<" <DataArray type=\"Int32\" Name=\"connectivity\" format=\"appended\" offset=\""<<data_size<<"\" />\n";
  1950. data_size+=sizeof(uint32_t)+connect.size()*sizeof(int32_t);
  1951. vtufile<<" <DataArray type=\"Int32\" Name=\"offsets\" format=\"appended\" offset=\""<<data_size<<"\" />\n";
  1952. data_size+=sizeof(uint32_t)+offset.size() *sizeof(int32_t);
  1953. vtufile<<" <DataArray type=\"UInt8\" Name=\"types\" format=\"appended\" offset=\""<<data_size<<"\" />\n";
  1954. data_size+=sizeof(uint32_t)+types.size() *sizeof(uint8_t);
  1955. vtufile<<" </Cells>\n";
  1956. //---------------------------------------------------------------------------
  1957. //vtufile<<" <CellData>\n";
  1958. //vtufile<<" <DataArray type=\"Float"<<sizeof(VTKReal_t)*8<<"\" Name=\"Velocity\" format=\"appended\" offset=\""<<data_size<<"\" />\n";
  1959. //vtufile<<" </CellData>\n";
  1960. //---------------------------------------------------------------------------
  1961. vtufile<<" </Piece>\n";
  1962. vtufile<<" </UnstructuredGrid>\n";
  1963. //===========================================================================
  1964. vtufile<<" <AppendedData encoding=\"raw\">\n";
  1965. vtufile<<" _";
  1966. int32_t block_size;
  1967. block_size=coord .size()*sizeof(VTKReal_t); vtufile.write((char*)&block_size, sizeof(int32_t)); vtufile.write((char*)&coord [0], coord .size()*sizeof(VTKReal_t));
  1968. for(size_t i=0;i<name.size();i++) if(value[i].size()){ // value
  1969. block_size=value[i].size()*sizeof(VTKReal_t); vtufile.write((char*)&block_size, sizeof(int32_t)); vtufile.write((char*)&value[i][0], value[i].size()*sizeof(VTKReal_t));
  1970. }
  1971. block_size=mpi_rank.size()*sizeof( int32_t); vtufile.write((char*)&block_size, sizeof(int32_t)); vtufile.write((char*)&mpi_rank[0], mpi_rank.size()*sizeof( int32_t));
  1972. block_size=connect.size()*sizeof(int32_t); vtufile.write((char*)&block_size, sizeof(int32_t)); vtufile.write((char*)&connect[0], connect.size()*sizeof(int32_t));
  1973. block_size=offset .size()*sizeof(int32_t); vtufile.write((char*)&block_size, sizeof(int32_t)); vtufile.write((char*)&offset [0], offset .size()*sizeof(int32_t));
  1974. block_size=types .size()*sizeof(uint8_t); vtufile.write((char*)&block_size, sizeof(int32_t)); vtufile.write((char*)&types [0], types .size()*sizeof(uint8_t));
  1975. vtufile<<"\n";
  1976. vtufile<<" </AppendedData>\n";
  1977. //===========================================================================
  1978. vtufile<<"</VTKFile>\n";
  1979. vtufile.close();
  1980. if(myrank) return;
  1981. std::stringstream pvtufname;
  1982. pvtufname<<fname<<".pvtu";
  1983. std::ofstream pvtufile;
  1984. pvtufile.open(pvtufname.str().c_str());
  1985. if(pvtufile.fail()) return;
  1986. pvtufile<<"<?xml version=\"1.0\"?>\n";
  1987. pvtufile<<"<VTKFile type=\"PUnstructuredGrid\">\n";
  1988. pvtufile<<" <PUnstructuredGrid GhostLevel=\"0\">\n";
  1989. pvtufile<<" <PPoints>\n";
  1990. pvtufile<<" <PDataArray type=\"Float"<<sizeof(VTKReal_t)*8<<"\" NumberOfComponents=\""<<COORD_DIM<<"\" Name=\"Position\"/>\n";
  1991. pvtufile<<" </PPoints>\n";
  1992. pvtufile<<" <PPointData>\n";
  1993. for(size_t i=0;i<name.size();i++) if(value[i].size()){ // value
  1994. pvtufile<<" <PDataArray type=\"Float"<<sizeof(VTKReal_t)*8<<"\" NumberOfComponents=\""<<value[i].size()/pt_cnt<<"\" Name=\""<<name[i]<<"\"/>\n";
  1995. }
  1996. { // mpi_rank
  1997. pvtufile<<" <PDataArray type=\"Int32\" NumberOfComponents=\"1\" Name=\"mpi_rank\"/>\n";
  1998. }
  1999. pvtufile<<" </PPointData>\n";
  2000. {
  2001. // Extract filename from path.
  2002. std::stringstream vtupath;
  2003. vtupath<<'/'<<fname<<'\0';
  2004. char *fname_ = (char*)strrchr(vtupath.str().c_str(), '/') + 1;
  2005. //std::string fname_ = boost::filesystem::path(fname).filename().string().
  2006. for(int i=0;i<np;i++) pvtufile<<" <Piece Source=\""<<fname_<<std::setfill('0')<<std::setw(6)<<i<<".vtu\"/>\n";
  2007. }
  2008. pvtufile<<" </PUnstructuredGrid>\n";
  2009. pvtufile<<"</VTKFile>\n";
  2010. pvtufile.close();
  2011. }
  2012. template <class TreeNode>
  2013. const std::vector<MortonId>& MPI_Tree<TreeNode>::GetMins(){
  2014. Node_t* n=this->PreorderFirst();
  2015. while(n!=NULL){
  2016. if(!n->IsGhost() && n->IsLeaf()) break;
  2017. n=this->PreorderNxt(n);
  2018. }
  2019. ASSERT_WITH_MSG(n!=NULL,"No non-ghost nodes found on this process.");
  2020. MortonId my_min;
  2021. my_min=n->GetMortonId();
  2022. int np;
  2023. MPI_Comm_size(*Comm(),&np);
  2024. mins.resize(np);
  2025. MPI_Allgather(&my_min , 1, par::Mpi_datatype<MortonId>::value(),
  2026. &mins[0], 1, par::Mpi_datatype<MortonId>::value(), *Comm());
  2027. return mins;
  2028. }
  2029. }//end namespace