123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189 |
- #include <omp.h>
- #include <cstring>
- #include <algorithm>
- #include SCTL_INCLUDE(vector.hpp)
- #include SCTL_INCLUDE(mem_mgr.hpp)
- namespace SCTL_NAMESPACE {
- template <class ConstIter, class Iter, class Int, class StrictWeakOrdering> inline void omp_par::merge(ConstIter A_, ConstIter A_last, ConstIter B_, ConstIter B_last, Iter C_, Int p, StrictWeakOrdering comp) {
- typedef typename std::iterator_traits<Iter>::difference_type _DiffType;
- typedef typename std::iterator_traits<Iter>::value_type _ValType;
- _DiffType N1 = A_last - A_;
- _DiffType N2 = B_last - B_;
- if (N1 == 0 && N2 == 0) return;
- if (N1 == 0 || N2 == 0) {
- ConstIter A = (N1 == 0 ? B_ : A_);
- _DiffType N = (N1 == 0 ? N2 : N1);
- #pragma omp parallel for schedule(static)
- for (int i = 0; i < p; i++) {
- _DiffType indx1 = (i * N) / p;
- _DiffType indx2 = ((i + 1) * N) / p;
- if (indx2 - indx1 > 0) memcpy(&C_[indx1], &A[indx1], (indx2 - indx1) * sizeof(_ValType));
- }
- return;
- }
- // Split both arrays ( A and B ) into n equal parts.
- // Find the position of each split in the final merged array.
- int n = 10;
- Vector<_ValType> split;
- split.ReInit(p * n * 2);
- Vector<_DiffType> split_size;
- split_size.ReInit(p * n * 2);
- #pragma omp parallel for
- for (int i = 0; i < p; i++) {
- for (int j = 0; j < n; j++) {
- int indx = i * n + j;
- _DiffType indx1 = (indx * N1) / (p * n);
- split[indx] = A_[indx1];
- split_size[indx] = indx1 + (std::lower_bound(B_, B_last, split[indx], comp) - B_);
- indx1 = (indx * N2) / (p * n);
- indx += p * n;
- split[indx] = B_[indx1];
- split_size[indx] = indx1 + (std::lower_bound(A_, A_last, split[indx], comp) - A_);
- }
- }
- // Find the closest split position for each thread that will
- // divide the final array equally between the threads.
- Vector<_DiffType> split_indx_A;
- split_indx_A.ReInit(p + 1);
- Vector<_DiffType> split_indx_B;
- split_indx_B.ReInit(p + 1);
- split_indx_A[0] = 0;
- split_indx_B[0] = 0;
- split_indx_A[p] = N1;
- split_indx_B[p] = N2;
- #pragma omp parallel for schedule(static)
- for (int i = 1; i < p; i++) {
- _DiffType req_size = (i * (N1 + N2)) / p;
- int j = std::lower_bound(split_size.begin(), split_size.begin() + p * n, req_size, std::less<_DiffType>()) - split_size.begin();
- if (j >= p * n) j = p * n - 1;
- _ValType split1 = split[j];
- _DiffType split_size1 = split_size[j];
- j = (std::lower_bound(split_size.begin() + p * n, split_size.begin() + p * n * 2, req_size, std::less<_DiffType>()) - split_size.begin() + p * n) + p * n;
- if (j >= 2 * p * n) j = 2 * p * n - 1;
- if (abs(split_size[j] - req_size) < abs(split_size1 - req_size)) {
- split1 = split[j];
- split_size1 = split_size[j];
- }
- split_indx_A[i] = std::lower_bound(A_, A_last, split1, comp) - A_;
- split_indx_B[i] = std::lower_bound(B_, B_last, split1, comp) - B_;
- }
- // Merge for each thread independently.
- #pragma omp parallel for schedule(static)
- for (int i = 0; i < p; i++) {
- Iter C = C_ + split_indx_A[i] + split_indx_B[i];
- std::merge(A_ + split_indx_A[i], A_ + split_indx_A[i + 1], B_ + split_indx_B[i], B_ + split_indx_B[i + 1], C, comp);
- }
- }
- template <class T, class StrictWeakOrdering> inline void omp_par::merge_sort(T A, T A_last, StrictWeakOrdering comp) {
- typedef typename std::iterator_traits<T>::difference_type _DiffType;
- typedef typename std::iterator_traits<T>::value_type _ValType;
- int p = omp_get_max_threads();
- _DiffType N = A_last - A;
- if (N < 2 * p) {
- std::sort(A, A_last, comp);
- return;
- }
- SCTL_UNUSED(A[0]);
- SCTL_UNUSED(A[N - 1]);
- // Split the array A into p equal parts.
- Vector<_DiffType> split;
- split.ReInit(p + 1);
- split[p] = N;
- #pragma omp parallel for schedule(static)
- for (int id = 0; id < p; id++) {
- split[id] = (id * N) / p;
- }
- // Sort each part independently.
- #pragma omp parallel for schedule(static)
- for (int id = 0; id < p; id++) {
- std::sort(A + split[id], A + split[id + 1], comp);
- }
- // Merge two parts at a time.
- Vector<_ValType> B;
- B.ReInit(N);
- Iterator<_ValType> A_ = Ptr2Itr<_ValType>(&A[0], N);
- Iterator<_ValType> B_ = B.begin();
- for (int j = 1; j < p; j = j * 2) {
- for (int i = 0; i < p; i = i + 2 * j) {
- if (i + j < p) {
- omp_par::merge(A_ + split[i], A_ + split[i + j], A_ + split[i + j], A_ + split[(i + 2 * j <= p ? i + 2 * j : p)], B_ + split[i], p, comp);
- } else {
- #pragma omp parallel for
- for (int k = split[i]; k < split[p]; k++) B_[k] = A_[k];
- }
- }
- Iterator<_ValType> tmp_swap = A_;
- A_ = B_;
- B_ = tmp_swap;
- }
- // The final result should be in A.
- if (A_ != Ptr2Itr<_ValType>(&A[0], N)) {
- #pragma omp parallel for
- for (int i = 0; i < N; i++) A[i] = A_[i];
- }
- }
- template <class T> inline void omp_par::merge_sort(T A, T A_last) {
- typedef typename std::iterator_traits<T>::value_type _ValType;
- omp_par::merge_sort(A, A_last, std::less<_ValType>());
- }
- template <class ConstIter, class Int> typename std::iterator_traits<ConstIter>::value_type omp_par::reduce(ConstIter A, Int cnt) {
- typedef typename std::iterator_traits<ConstIter>::value_type ValueType;
- ValueType sum = 0;
- #pragma omp parallel for reduction(+ : sum)
- for (Int i = 0; i < cnt; i++) sum += A[i];
- return sum;
- }
- template <class ConstIter, class Iter, class Int> void omp_par::scan(ConstIter A, Iter B, Int cnt) {
- typedef typename std::iterator_traits<Iter>::value_type ValueType;
- Integer p = omp_get_max_threads();
- if (cnt < (Int)100 * p) {
- for (Int i = 1; i < cnt; i++) B[i] = B[i - 1] + A[i - 1];
- return;
- }
- Int step_size = cnt / p;
- #pragma omp parallel for schedule(static)
- for (Integer i = 0; i < p; i++) {
- Int start = i * step_size;
- Int end = start + step_size;
- if (i == p - 1) end = cnt;
- if (i != 0) B[start] = 0;
- for (Int j = (Int)start + 1; j < (Int)end; j++) B[j] = B[j - 1] + A[j - 1];
- }
- Vector<ValueType> sum(p);
- sum[0] = 0;
- for (Integer i = 1; i < p; i++) sum[i] = sum[i - 1] + B[i * step_size - 1] + A[i * step_size - 1];
- #pragma omp parallel for schedule(static)
- for (Integer i = 1; i < p; i++) {
- Int start = i * step_size;
- Int end = start + step_size;
- if (i == p - 1) end = cnt;
- ValueType sum_ = sum[i];
- for (Int j = (Int)start; j < (Int)end; j++) B[j] += sum_;
- }
- }
- } // end namespace
|