
Add memory manager.

Dhairya Malhotra, 11 years ago
Commit 6844053776
3 changed files with 347 additions and 42 deletions
  1. include/mem_mgr.hpp   +277 −0
  2. include/mpi_tree.hpp  +3 −1
  3. include/mpi_tree.txx  +67 −41

+ 277 - 0
include/mem_mgr.hpp

@@ -0,0 +1,277 @@
+/**
+ * \file mem_mgr.hpp
+ * \author Dhairya Malhotra, dhairya.malhotra@gmail.com
+ * \date 6-30-2014
+ * \brief This file defines a simple memory manager that serves allocations
+ * from a pre-allocated buffer whose size is specified in the constructor.
+ */
+
+#ifndef _PVFMM_MEMGR_HPP_
+#define _PVFMM_MEMGR_HPP_
+
+#include <map>
+#include <stack>
+#include <vector>
+#include <cassert>
+#include <cmath>
+#include <omp.h>
+
+namespace pvfmm{
+namespace mem{
+
+class MemoryManager{
+
+  public:
+
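+    /**
+     * \brief Allocate a buffer of N bytes and initialize the block list with
+     * a dummy head node followed by a single free block spanning the buffer.
+     */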
+    MemoryManager(size_t N){
+      buff_size=N;
+      buff=(char*)::malloc(buff_size); assert(buff);
+      n_dummy_indx=new_node();
+      size_t n_indx=new_node();
+      node& n_dummy=node_buff[n_dummy_indx-1];
+      node& n=node_buff[n_indx-1];
+
+      n_dummy.size=0;
+      n_dummy.free=false;
+      n_dummy.prev=NULL;
+      n_dummy.next=n_indx;
+      n_dummy.mem_ptr=&buff[0];
+      assert(n_indx);
+
+      n.size=N;
+      n.free=true;
+      n.prev=n_dummy_indx;
+      n.next=NULL;
+      n.mem_ptr=&buff[0];
+      n.it=free_map.insert(std::make_pair(N,n_indx));
+
+      omp_init_lock(&omp_lock);
+    }
+
+    ~MemoryManager(){
+      node* n=&node_buff[n_dummy_indx-1];
+      n=&node_buff[n->next-1];
+      if(n==NULL || !n->free || n->size!=buff_size ||
+          node_stack.size()!=node_buff.size()-2){
+        std::cout<<"\nWarning: memory leak detected.\n";
+      }
+
+      omp_destroy_lock(&omp_lock);
+      if(buff) ::free(buff);
+    }
+
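+    /**
+     * \brief Serve an allocation from the buffer; the owning node index is
+     * stored in a size_t header just before the returned pointer. Falls back
+     * to ::malloc when no free block is large enough.
+     */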
+    void* malloc(size_t size){
+      if(!size) return NULL;
+      size+=sizeof(size_t);
+      std::multimap<size_t, size_t>::iterator it;
+
+      omp_set_lock(&omp_lock);
+      it=free_map.lower_bound(size);
+      if(it==free_map.end()){
+        omp_unset_lock(&omp_lock);
+        return ::malloc(size);
+      }else if(it->first==size){
+        size_t n_indx=it->second;
+        node& n=node_buff[n_indx-1];
+        //assert(n.size==it->first);
+        //assert(n.it==it);
+        //assert(n.free);
+
+        n.free=false;
+        free_map.erase(it);
+        ((size_t*)n.mem_ptr)[0]=n_indx;
+        omp_unset_lock(&omp_lock);
+        return &((size_t*)n.mem_ptr)[1];
+      }else{
+        size_t n_indx=it->second;
+        size_t n_free_indx=new_node();
+        node& n_free=node_buff[n_free_indx-1];
+        node& n     =node_buff[n_indx-1];
+        //assert(n.size==it->first);
+        //assert(n.it==it);
+        //assert(n.free);
+
+        n_free=n;
+        n_free.size-=size;
+        n_free.mem_ptr=(char*)n_free.mem_ptr+size;
+        n_free.prev=n_indx;
+        if(n_free.next){
+          size_t n_next_indx=n_free.next;
+          node& n_next=node_buff[n_next_indx-1];
+          n_next.prev=n_free_indx;
+        }
+
+        n.free=false;
+        n.size=size;
+        n.next=n_free_indx;
+
+        free_map.erase(it);
+        n_free.it=free_map.insert(std::make_pair(n_free.size,n_free_indx));
+        ((size_t*)n.mem_ptr)[0]=n_indx;
+        omp_unset_lock(&omp_lock);
+        return &((size_t*)n.mem_ptr)[1];
+      }
+    }
+
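+    /**
+     * \brief Return a block to the buffer, merging it with adjacent free
+     * blocks; pointers outside the buffer are forwarded to ::free.
+     */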
+    void free(void* p){
+      if(p<&buff[0] || p>=&buff[buff_size]) return ::free(p);
+
+      size_t n_indx=((size_t*)p)[-1];
+      assert(n_indx>0 && n_indx<=node_buff.size());
+      ((size_t*)p)[-1]=0;
+
+      omp_set_lock(&omp_lock);
+      node& n=node_buff[n_indx-1];
+      assert(!n.free && n.size>0 && n.mem_ptr==&((size_t*)p)[-1]);
+      n.free=true;
+
+      if(n.prev!=NULL && node_buff[n.prev-1].free){
+        size_t n_prev_indx=n.prev;
+        node& n_prev=node_buff[n_prev_indx-1];
+        free_map.erase(n_prev.it);
+        n.size+=n_prev.size;
+        n.mem_ptr=n_prev.mem_ptr;
+        n.prev=n_prev.prev;
+        delete_node(n_prev_indx);
+
+        if(n.prev){
+          size_t n_prev_indx=n.prev;
+          node& n_prev=node_buff[n_prev_indx-1];
+          n_prev.next=n_indx;
+        }
+      }
+      if(n.next!=NULL && node_buff[n.next-1].free){
+        size_t n_next_indx=n.next;
+        node& n_next=node_buff[n_next_indx-1];
+        free_map.erase(n_next.it);
+        n.size+=n_next.size;
+        n.next=n_next.next;
+        delete_node(n_next_indx);
+
+        if(n.next){
+          size_t n_next_indx=n.next;
+          node& n_next=node_buff[n_next_indx-1];
+          n_next.prev=n_indx;
+        }
+      }
+      n.it=free_map.insert(std::make_pair(n.size,n_indx));
+      omp_unset_lock(&omp_lock);
+    }
+
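+    /**
+     * \brief Print a usage map of the buffer ('#' = allocated, ' ' = free)
+     * together with the allocated fraction and the largest free block.
+     */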
+    void print(){
+      if(!buff_size) return;
+      omp_set_lock(&omp_lock);
+
+      size_t size=0;
+      size_t largest_size=0;
+      node* n=&node_buff[n_dummy_indx-1];
+      std::cout<<"\n|";
+      while(n->next){
+        n=&node_buff[n->next-1];
+        if(n->free){
+          std::cout<<' ';
+          largest_size=std::max(largest_size,n->size);
+        }
+        else{
+          std::cout<<'#';
+          size+=n->size;
+        }
+      }
+      std::cout<<"|  allocated="<<round(size*1000.0/buff_size)/10<<"%";
+      std::cout<<"  largest_free="<<round(largest_size*1000.0/buff_size)/10<<"%\n";
+
+      omp_unset_lock(&omp_lock);
+    }
+
+    static void test(){
+      size_t M=2000000000;
+      { // With memory manager
+        size_t N=M*sizeof(double)*1.1;
+        double tt;
+        double* tmp;
+
+        std::cout<<"With memory manager: ";
+        MemoryManager memgr(N);
+
+        for(size_t j=0;j<3;j++){
+          tmp=(double*)memgr.malloc(M*sizeof(double)); assert(tmp);
+          tt=omp_get_wtime();
+          #pragma omp parallel for
+          for(size_t i=0;i<M;i++) tmp[i]=i;
+          tt=omp_get_wtime()-tt;
+          std::cout<<tt<<' ';
+          memgr.free(tmp);
+        }
+        std::cout<<'\n';
+      }
+      { // Without memory manager
+        double tt;
+        double* tmp;
+
+        //pvfmm::MemoryManager memgr(N);
+
+        std::cout<<"Without memory manager: ";
+        for(size_t j=0;j<3;j++){
+          tmp=(double*)::malloc(M*sizeof(double)); assert(tmp);
+          tt=omp_get_wtime();
+          #pragma omp parallel for
+          for(size_t i=0;i<M;i++) tmp[i]=i;
+          tt=omp_get_wtime()-tt;
+          std::cout<<tt<<' ';
+          ::free(tmp);
+        }
+        std::cout<<'\n';
+      }
+    }
+
+  private:
+
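+    // Block descriptor; prev/next are 1-based indices into node_buff (0 means none).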
+    struct node{
+      bool free;
+      size_t size;
+      void* mem_ptr;
+      size_t prev, next;
+      std::multimap<size_t, size_t>::iterator it;
+    };
+
+    MemoryManager();
+
+    MemoryManager(const MemoryManager& m);
+
+    size_t new_node(){
+      if(node_stack.empty()){
+        node_buff.resize(node_buff.size()+1);
+        node_stack.push(node_buff.size());
+      }
+
+      size_t indx=node_stack.top();
+      node_stack.pop();
+      assert(indx);
+      return indx;
+    }
+
+    void delete_node(size_t indx){
+      assert(indx);
+      assert(indx<=node_buff.size());
+      node& n=node_buff[indx-1];
+      n.size=0;
+      n.prev=0;
+      n.next=0;
+      n.mem_ptr=NULL;
+      node_stack.push(indx);
+    }
+
+    char* buff;
+    size_t buff_size;
+    std::vector<node> node_buff;
+    std::stack<size_t> node_stack;
+    std::multimap<size_t, size_t> free_map;
+    size_t n_dummy_indx;
+
+    omp_lock_t omp_lock;
+};
+
+}//end namespace
+}//end namespace
+
+#endif //_PVFMM_MEMGR_HPP_
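
A minimal usage sketch of the new allocator (not part of this commit; the 64 MB buffer size and the array length below are arbitrary, and the include path assumes the header above):

  #include <mem_mgr.hpp>

  int main(){
    // All subsequent malloc/free calls are served from this pre-allocated buffer.
    pvfmm::mem::MemoryManager memgr(64l*1024l*1024l);

    double* x=(double*)memgr.malloc(1000*sizeof(double));
    for(size_t i=0;i<1000;i++) x[i]=i;

    memgr.print();  // '#' = used blocks, ' ' = free blocks, plus usage percentages
    memgr.free(x);  // pointers outside the buffer would be forwarded to ::free
    return 0;
  }

Requests larger than the largest free block fall back to ::malloc, so the manager degrades gracefully instead of failing.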

+ 3 - 1
include/mpi_tree.hpp

@@ -103,7 +103,6 @@ class MPI_Tree: public Tree<TreeNode>{
    * \brief Construct the LET by exchanging ghost octants.
    */
   void ConstructLET(BoundaryType bndry=FreeSpace);
-  void ConstructLET_Sparse(BoundaryType bndry=FreeSpace);
 
   /**
    * \brief Write to a <fname>.vtu file.
@@ -125,6 +124,9 @@ class MPI_Tree: public Tree<TreeNode>{
 
  private:
 
+  void ConstructLET_Hypercube(BoundaryType bndry=FreeSpace);
+  void ConstructLET_Sparse(BoundaryType bndry=FreeSpace);
+
   MPI_Comm comm;
   std::vector<MortonId> mins;
 

+ 67 - 41
include/mpi_tree.txx

@@ -14,6 +14,7 @@
 #include <parUtils.h>
 #include <ompUtils.h>
 #include <profile.hpp>
+#include <mem_mgr.hpp>
 
 namespace pvfmm{
 
@@ -1228,14 +1229,30 @@ inline void IsShared(std::vector<PackedData>& nodes, MortonId* m1, MortonId* m2,
 }
 
 
+/**
+ * \brief Construct the Locally Essential Tree (LET) by exchanging ghost octants.
+ */
+template <class TreeNode>
+void MPI_Tree<TreeNode>::ConstructLET(BoundaryType bndry){
+  Profile::Tic("LET_Hypercube", &comm, true, 5);
+  ConstructLET_Hypercube(bndry);
+  Profile::Toc();
+
+  Profile::Tic("LET_Sparse", &comm, true, 5);
+  ConstructLET_Sparse(bndry);
+  Profile::Toc();
+
+#ifndef NDEBUG
+  CheckTree();
+#endif
+}
+
 /**
  * \brief Hypercube based scheme to exchange Ghost octants.
  */
 //#define PREFETCH_T0(addr,nrOfBytesAhead) _mm_prefetch(((char *)(addr))+nrOfBytesAhead,_MM_HINT_T0)
 template <class TreeNode>
-void MPI_Tree<TreeNode>::ConstructLET(BoundaryType bndry){
-#if 1
-  Profile::Tic("LET_Hypercube", &comm, true, 5); //----
+void MPI_Tree<TreeNode>::ConstructLET_Hypercube(BoundaryType bndry){
   int num_p,rank;
   MPI_Comm_size(*Comm(),&num_p);
   MPI_Comm_rank(*Comm(),&rank );
@@ -1427,21 +1444,12 @@ void MPI_Tree<TreeNode>::ConstructLET(BoundaryType bndry){
     }
   }
   //Now LET is complete.
-  Profile::Toc();
-#endif
-
-  Profile::Tic("LET_Sparse", &comm, true, 5);
-  ConstructLET_Sparse(bndry);
-  Profile::Toc();
 
-  Profile::Tic("LET_Sparse_", &comm, true, 5);
-  ConstructLET_Sparse_(bndry);
-  Profile::Toc();
-#ifndef NDEBUG
-  CheckTree();
-#endif
 }
 
+/**
+ * \brief Sparse communication scheme to exchange Ghost octants.
+ */
 template <class TreeNode>
 void MPI_Tree<TreeNode>::ConstructLET_Sparse(BoundaryType bndry){
   typedef int MPI_size_t;
@@ -1460,11 +1468,12 @@ void MPI_Tree<TreeNode>::ConstructLET_Sparse(BoundaryType bndry){
   MPI_Comm_rank(*Comm(),&rank );
   if(num_p==1) return;
 
-  int omp_p=omp_get_max_threads(); omp_p=1;
+  int omp_p=omp_get_max_threads();
   std::vector<MortonId> mins=GetMins();
 
   // Allocate Memory.
-  static std::vector<char> shrd_buff_vec0(16*64l*1024l*1024l); // TODO: Build memory manager for such allocations.
+  #define BUFFER_SIZE 1024 // Memory manager buffer size in MB.
+  static mem::MemoryManager memgr(BUFFER_SIZE*1024l*1024l);
   static std::vector<char> shrd_buff_vec1(16*64l*1024l*1024l); // TODO: Build memory manager for such allocations.
   static std::vector<char> send_buff;
   static std::vector<char> recv_buff;
@@ -1478,7 +1487,7 @@ void MPI_Tree<TreeNode>::ConstructLET_Sparse(BoundaryType bndry){
     MortonId mins_r1=mins[std::min(rank+1,num_p-1)].getDFD();
 
     std::vector<TreeNode*> nodes=this->GetNodeList();
-    node_comm_data=new CommData[nodes.size()];
+    node_comm_data=(CommData*)memgr.malloc(sizeof(CommData)*nodes.size());
     #pragma omp parallel for
     for(size_t tid=0;tid<omp_p;tid++){
       std::vector<MortonId> nbr_lst;
@@ -1505,6 +1514,10 @@ void MPI_Tree<TreeNode>::ConstructLET_Sparse(BoundaryType bndry){
           MortonId usr_mid_dfd=usr_mid.getDFD();
           comm_data.usr_mid[j]=usr_mid;
           comm_data.usr_pid[j]=std::upper_bound(&mins[0],&mins[num_p],usr_mid_dfd)-&mins[0]-1;
+//          if(usr_mid_dfd<mins_r0 || (rank+1<num_p && usr_mid_dfd>=mins_r1)){ // Find the user pid.
+//            size_t usr_pid=std::upper_bound(&mins[0],&mins[num_p],usr_mid_dfd)-&mins[0]-1;
+//            comm_data.usr_pid[j]=usr_pid;
+//          }else comm_data.usr_pid[j]=rank;
           if(!shared){ // Check if this node needs to be transferred during broadcast.
             if(comm_data.usr_pid[j]!=rank || (rank+1<num_p && usr_mid.NextId()>mins_r1) ){
               shared=true;
@@ -1539,25 +1552,31 @@ void MPI_Tree<TreeNode>::ConstructLET_Sparse(BoundaryType bndry){
   //Profile::Toc();
 
   //Profile::Tic("PackNodes", &comm, false, 5);
-  std::vector<PackedData> pkd_data(shared_data.size()); // PackedData for all shared nodes.
   { // Pack shared nodes.
-    char* data_ptr=&shrd_buff_vec0[0];
-    // TODO: Add OpenMP parallelism.
-    for(size_t i=0;i<shared_data.size();i++){
-      PackedData& p=pkd_data[i];
-      CommData& comm_data=*(CommData*)shared_data[i];
-      PackedData p0=comm_data.node->Pack(true,data_ptr,sizeof(CommData));
-      p.data=data_ptr; p.length=sizeof(CommData)+p0.length;
-      comm_data.pkd_length=p.length;
-
-      ((CommData*)data_ptr)[0]=comm_data;
-      shared_data[i]=data_ptr;
-      data_ptr+=p.length;
-      assert(data_ptr<=&(*shrd_buff_vec0.end())); //TODO: resize if needed.
+    #pragma omp parallel for
+    for(size_t tid=0;tid<omp_p;tid++){
+      size_t buff_length=10l*1024l*1024l; // 10MB packing buffer per thread.
+      char* buff=(char*)memgr.malloc(buff_length);
+
+      size_t a=( tid   *shared_data.size())/omp_p;
+      size_t b=((tid+1)*shared_data.size())/omp_p;
+      for(size_t i=a;i<b;i++){
+        CommData& comm_data=*(CommData*)shared_data[i];
+        PackedData p0=comm_data.node->Pack(true,buff);
+        assert(p0.length<buff_length);
+
+        shared_data[i]=memgr.malloc(sizeof(CommData)+p0.length);
+        CommData& new_comm_data=*(CommData*)shared_data[i];
+        new_comm_data=comm_data;
+
+        new_comm_data.pkd_length=sizeof(CommData)+p0.length;
+        mem::memcopy(((char*)shared_data[i])+sizeof(CommData),buff,p0.length);
+      }
+      memgr.free(buff);
     }
 
-    // now CommData is stored in shrd_buff_vec0.
-    delete[] node_comm_data;
+    // now CommData is stored in shared_data
+    memgr.free(node_comm_data);
     node_comm_data=NULL;
   }
   //Profile::Toc();
@@ -1570,7 +1589,7 @@ void MPI_Tree<TreeNode>::ConstructLET_Sparse(BoundaryType bndry){
     std::vector<size_t> disp(pid_node_pair.size(),0);
     #pragma omp parallel for
     for(size_t i=0;i<pid_node_pair.size();i++){
-      size[i]=pkd_data[pid_node_pair[i].data].length;
+      size[i]=((CommData*)shared_data[pid_node_pair[i].data])->pkd_length;
     }
     omp_par::scan(&size[0],&disp[0],pid_node_pair.size());
 
@@ -1582,8 +1601,8 @@ void MPI_Tree<TreeNode>::ConstructLET_Sparse(BoundaryType bndry){
     // Copy data to send_buff.
     #pragma omp parallel for
     for(size_t i=0;i<pid_node_pair.size();i++){
-      PackedData& p=pkd_data[pid_node_pair[i].data];
-      mem::memcopy(&send_buff[disp[i]], p.data, p.length);
+      size_t shrd_idx=pid_node_pair[i].data;
+      mem::memcopy(&send_buff[disp[i]], shared_data[shrd_idx], size[i]);
     }
 
     // Compute send_size, send_disp.
@@ -1593,10 +1612,14 @@ void MPI_Tree<TreeNode>::ConstructLET_Sparse(BoundaryType bndry){
       for(size_t tid=0;tid<omp_p;tid++){
         size_t a=(pid_node_pair.size()* tid   )/omp_p;
         size_t b=(pid_node_pair.size()*(tid+1))/omp_p;
-        size_t p0=pid_node_pair[a  ].key;
-        size_t p1=pid_node_pair[b-1].key;
-        if(tid>    0) while(a<pid_node_pair.size() && p0==pid_node_pair[a].key) a++;
-        if(tid<omp_p) while(b<pid_node_pair.size() && p1==pid_node_pair[b].key) b++;
+        if(a>                   0){
+          size_t p0=pid_node_pair[a].key;
+          while(a<pid_node_pair.size() && p0==pid_node_pair[a].key) a++;
+        }
+        if(b<pid_node_pair.size()){
+          size_t p1=pid_node_pair[b].key;
+          while(b<pid_node_pair.size() && p1==pid_node_pair[b].key) b++;
+        }
         for(size_t i=a;i<b;i++){
           send_size[pid_node_pair[i].key]+=size[i];
         }
@@ -1861,6 +1884,9 @@ void MPI_Tree<TreeNode>::ConstructLET_Sparse(BoundaryType bndry){
     }
   }
   //Profile::Toc();
+
+  // Free data
+  for(size_t i=0;i<shared_data.size();i++) memgr.free(shared_data[i]);
 }
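
For reference, the send_size computation above is race-free because each thread's [a,b) range is snapped to key boundaries, so every run of equal keys is summed by exactly one thread and the unguarded += needs no synchronization. A stand-alone sketch of that partitioning (illustrative names; pairs must be sorted by key and counts sized to cover every key):

  #include <cstddef>
  #include <utility>
  #include <vector>
  #include <omp.h>

  // Accumulate counts[key] += sz over a key-sorted vector of (key, sz) pairs.
  void accumulate(const std::vector<std::pair<size_t,size_t> >& pairs,
                  std::vector<size_t>& counts){
    int omp_p=omp_get_max_threads();
    #pragma omp parallel for
    for(int tid=0;tid<omp_p;tid++){
      size_t a=(pairs.size()* tid   )/omp_p;
      size_t b=(pairs.size()*(tid+1))/omp_p;
      if(a>0){ // an earlier thread finishes the run containing index a
        size_t k=pairs[a].first;
        while(a<pairs.size() && pairs[a].first==k) a++;
      }
      if(b<pairs.size()){ // extend b to the end of the run it falls in
        size_t k=pairs[b].first;
        while(b<pairs.size() && pairs[b].first==k) b++;
      }
      for(size_t i=a;i<b;i++) counts[pairs[i].first]+=pairs[i].second;
    }
  }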