| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200 | #include <cstdlib>#include <omp.h>#include <iterator>#include <vector>namespace pvfmm{template <class T,class StrictWeakOrdering>void omp_par::merge(T A_,T A_last,T B_,T B_last,T C_,int p,StrictWeakOrdering comp){  typedef typename std::iterator_traits<T>::difference_type _DiffType;  typedef typename std::iterator_traits<T>::value_type _ValType;  _DiffType N1=A_last-A_;  _DiffType N2=B_last-B_;  if(N1==0 && N2==0) return;  if(N1==0 || N2==0){    _ValType* A=(N1==0? &B_[0]: &A_[0]);    _DiffType N=(N1==0?  N2  :  N1   );    #pragma omp parallel for    for(int i=0;i<p;i++){      _DiffType indx1=( i   *N)/p;      _DiffType indx2=((i+1)*N)/p;      memcpy(&C_[indx1], &A[indx1], (indx2-indx1)*sizeof(_ValType));    }    return;  }  //Split both arrays ( A and B ) into n equal parts.  //Find the position of each split in the final merged array.  int n=10;  _ValType* split=new _ValType[p*n*2];  _DiffType* split_size=new _DiffType[p*n*2];  #pragma omp parallel for  for(int i=0;i<p;i++){    for(int j=0;j<n;j++){      int indx=i*n+j;      _DiffType indx1=(indx*N1)/(p*n);      split   [indx]=A_[indx1];      split_size[indx]=indx1+(std::lower_bound(B_,B_last,split[indx],comp)-B_);      indx1=(indx*N2)/(p*n);      indx+=p*n;      split   [indx]=B_[indx1];      split_size[indx]=indx1+(std::lower_bound(A_,A_last,split[indx],comp)-A_);    }  }  //Find the closest split position for each thread that will  //divide the final array equally between the threads.  _DiffType* split_indx_A=new _DiffType[p+1];  _DiffType* split_indx_B=new _DiffType[p+1];  split_indx_A[0]=0;  split_indx_B[0]=0;  split_indx_A[p]=N1;  split_indx_B[p]=N2;  #pragma omp parallel for  for(int i=1;i<p;i++){    _DiffType req_size=(i*(N1+N2))/p;    int j=std::lower_bound(&split_size[0],&split_size[p*n],req_size,std::less<_DiffType>())-&split_size[0];    if(j>=p*n)      j=p*n-1;    _ValType  split1     =split     [j];    _DiffType split_size1=split_size[j];    j=(std::lower_bound(&split_size[p*n],&split_size[p*n*2],req_size,std::less<_DiffType>())-&split_size[p*n])+p*n;    if(j>=2*p*n)      j=2*p*n-1;    if(abs(split_size[j]-req_size)<abs(split_size1-req_size)){      split1     =split   [j];      split_size1=split_size[j];    }    split_indx_A[i]=std::lower_bound(A_,A_last,split1,comp)-A_;    split_indx_B[i]=std::lower_bound(B_,B_last,split1,comp)-B_;  }  delete[] split;  delete[] split_size;  //Merge for each thread independently.  #pragma omp parallel for  for(int i=0;i<p;i++){    T C=C_+split_indx_A[i]+split_indx_B[i];    std::merge(A_+split_indx_A[i],A_+split_indx_A[i+1],B_+split_indx_B[i],B_+split_indx_B[i+1],C,comp);  }  delete[] split_indx_A;  delete[] split_indx_B;}template <class T,class StrictWeakOrdering>void omp_par::merge_sort(T A,T A_last,StrictWeakOrdering comp){  typedef typename std::iterator_traits<T>::difference_type _DiffType;  typedef typename std::iterator_traits<T>::value_type _ValType;  int p=omp_get_max_threads();  _DiffType N=A_last-A;  if(N<2*p){    std::sort(A,A_last,comp);    return;  }  //Split the array A into p equal parts.  _DiffType* split=new _DiffType[p+1];  split[p]=N;  #pragma omp parallel for  for(int id=0;id<p;id++){    split[id]=(id*N)/p;  }  //Sort each part independently.  #pragma omp parallel for  for(int id=0;id<p;id++){    std::sort(A+split[id],A+split[id+1],comp);  }  //Merge two parts at a time.  _ValType* B=new _ValType[N];  _ValType* A_=&A[0];  _ValType* B_=&B[0];  for(int j=1;j<p;j=j*2){    for(int i=0;i<p;i=i+2*j){      if(i+j<p){        omp_par::merge(A_+split[i],A_+split[i+j],A_+split[i+j],A_+split[(i+2*j<=p?i+2*j:p)],B_+split[i],p,comp);      }else{        #pragma omp parallel for        for(int k=split[i];k<split[p];k++)          B_[k]=A_[k];      }    }    _ValType* tmp_swap=A_;    A_=B_;    B_=tmp_swap;  }  //The final result should be in A.  if(A_!=&A[0]){    #pragma omp parallel for    for(int i=0;i<N;i++)      A[i]=A_[i];  }  //Free memory.  delete[] split;  delete[] B;}template <class T>void omp_par::merge_sort(T A,T A_last){  typedef typename std::iterator_traits<T>::value_type _ValType;  omp_par::merge_sort(A,A_last,std::less<_ValType>());}template <class T, class I>T omp_par::reduce(T* A, I cnt){  T sum=0;  #pragma omp parallel for reduction(+:sum)  for(I i = 0; i < cnt; i++)    sum+=A[i];  return sum;}template <class T, class I>void omp_par::scan(T* A, T* B,I cnt){  int p=omp_get_max_threads();  if(cnt<(I)100*p){    for(I i=1;i<cnt;i++)      B[i]=B[i-1]+A[i-1];    return;  }  I step_size=cnt/p;  #pragma omp parallel for  for(int i=0; i<p; i++){    int start=i*step_size;    int end=start+step_size;    if(i==p-1) end=cnt;    if(i!=0)B[start]=0;    for(I j=(I)start+1; j<(I)end; j++)      B[j]=B[j-1]+A[j-1];  }  T* sum=new T[p];  sum[0]=0;  for(int i=1;i<p;i++)    sum[i]=sum[i-1]+B[i*step_size-1]+A[i*step_size-1];  #pragma omp parallel for  for(int i=1; i<p; i++){    int start=i*step_size;    int end=start+step_size;    if(i==p-1) end=cnt;    T sum_=sum[i];    for(I j=(I)start; j<(I)end; j++)      B[j]+=sum_;  }  delete[] sum;}}//end namespace
 |