ompUtils.txx 6.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189
  1. #include <omp.h>
  2. #include <cstring>
  3. #include <algorithm>
  4. #include SCTL_INCLUDE(vector.hpp)
  5. #include SCTL_INCLUDE(mem_mgr.hpp)
  6. namespace SCTL_NAMESPACE {
  7. template <class ConstIter, class Iter, class Int, class StrictWeakOrdering> inline void omp_par::merge(ConstIter A_, ConstIter A_last, ConstIter B_, ConstIter B_last, Iter C_, Int p, StrictWeakOrdering comp) {
  8. typedef typename std::iterator_traits<Iter>::difference_type _DiffType;
  9. typedef typename std::iterator_traits<Iter>::value_type _ValType;
  10. _DiffType N1 = A_last - A_;
  11. _DiffType N2 = B_last - B_;
  12. if (N1 == 0 && N2 == 0) return;
  13. if (N1 == 0 || N2 == 0) {
  14. ConstIter A = (N1 == 0 ? B_ : A_);
  15. _DiffType N = (N1 == 0 ? N2 : N1);
  16. #pragma omp parallel for schedule(static)
  17. for (int i = 0; i < p; i++) {
  18. _DiffType indx1 = (i * N) / p;
  19. _DiffType indx2 = ((i + 1) * N) / p;
  20. if (indx2 - indx1 > 0) memcpy(&C_[indx1], &A[indx1], (indx2 - indx1) * sizeof(_ValType));
  21. }
  22. return;
  23. }
  24. // Split both arrays ( A and B ) into n equal parts.
  25. // Find the position of each split in the final merged array.
  26. int n = 10;
  27. Vector<_ValType> split;
  28. split.ReInit(p * n * 2);
  29. Vector<_DiffType> split_size;
  30. split_size.ReInit(p * n * 2);
  31. #pragma omp parallel for
  32. for (int i = 0; i < p; i++) {
  33. for (int j = 0; j < n; j++) {
  34. int indx = i * n + j;
  35. _DiffType indx1 = (indx * N1) / (p * n);
  36. split[indx] = A_[indx1];
  37. split_size[indx] = indx1 + (std::lower_bound(B_, B_last, split[indx], comp) - B_);
  38. indx1 = (indx * N2) / (p * n);
  39. indx += p * n;
  40. split[indx] = B_[indx1];
  41. split_size[indx] = indx1 + (std::lower_bound(A_, A_last, split[indx], comp) - A_);
  42. }
  43. }
  44. // Find the closest split position for each thread that will
  45. // divide the final array equally between the threads.
  46. Vector<_DiffType> split_indx_A;
  47. split_indx_A.ReInit(p + 1);
  48. Vector<_DiffType> split_indx_B;
  49. split_indx_B.ReInit(p + 1);
  50. split_indx_A[0] = 0;
  51. split_indx_B[0] = 0;
  52. split_indx_A[p] = N1;
  53. split_indx_B[p] = N2;
  54. #pragma omp parallel for schedule(static)
  55. for (int i = 1; i < p; i++) {
  56. _DiffType req_size = (i * (N1 + N2)) / p;
  57. int j = std::lower_bound(split_size.begin(), split_size.begin() + p * n, req_size, std::less<_DiffType>()) - split_size.begin();
  58. if (j >= p * n) j = p * n - 1;
  59. _ValType split1 = split[j];
  60. _DiffType split_size1 = split_size[j];
  61. j = (std::lower_bound(split_size.begin() + p * n, split_size.begin() + p * n * 2, req_size, std::less<_DiffType>()) - split_size.begin() + p * n) + p * n;
  62. if (j >= 2 * p * n) j = 2 * p * n - 1;
  63. if (abs(split_size[j] - req_size) < abs(split_size1 - req_size)) {
  64. split1 = split[j];
  65. split_size1 = split_size[j];
  66. }
  67. split_indx_A[i] = std::lower_bound(A_, A_last, split1, comp) - A_;
  68. split_indx_B[i] = std::lower_bound(B_, B_last, split1, comp) - B_;
  69. }
  70. // Merge for each thread independently.
  71. #pragma omp parallel for schedule(static)
  72. for (int i = 0; i < p; i++) {
  73. Iter C = C_ + split_indx_A[i] + split_indx_B[i];
  74. std::merge(A_ + split_indx_A[i], A_ + split_indx_A[i + 1], B_ + split_indx_B[i], B_ + split_indx_B[i + 1], C, comp);
  75. }
  76. }
  77. template <class T, class StrictWeakOrdering> inline void omp_par::merge_sort(T A, T A_last, StrictWeakOrdering comp) {
  78. typedef typename std::iterator_traits<T>::difference_type _DiffType;
  79. typedef typename std::iterator_traits<T>::value_type _ValType;
  80. int p = omp_get_max_threads();
  81. _DiffType N = A_last - A;
  82. if (N < 2 * p) {
  83. std::sort(A, A_last, comp);
  84. return;
  85. }
  86. SCTL_UNUSED(A[0]);
  87. SCTL_UNUSED(A[N - 1]);
  88. // Split the array A into p equal parts.
  89. Vector<_DiffType> split;
  90. split.ReInit(p + 1);
  91. split[p] = N;
  92. #pragma omp parallel for schedule(static)
  93. for (int id = 0; id < p; id++) {
  94. split[id] = (id * N) / p;
  95. }
  96. // Sort each part independently.
  97. #pragma omp parallel for schedule(static)
  98. for (int id = 0; id < p; id++) {
  99. std::sort(A + split[id], A + split[id + 1], comp);
  100. }
  101. // Merge two parts at a time.
  102. Vector<_ValType> B;
  103. B.ReInit(N);
  104. Iterator<_ValType> A_ = Ptr2Itr<_ValType>(&A[0], N);
  105. Iterator<_ValType> B_ = B.begin();
  106. for (int j = 1; j < p; j = j * 2) {
  107. for (int i = 0; i < p; i = i + 2 * j) {
  108. if (i + j < p) {
  109. omp_par::merge(A_ + split[i], A_ + split[i + j], A_ + split[i + j], A_ + split[(i + 2 * j <= p ? i + 2 * j : p)], B_ + split[i], p, comp);
  110. } else {
  111. #pragma omp parallel for
  112. for (int k = split[i]; k < split[p]; k++) B_[k] = A_[k];
  113. }
  114. }
  115. Iterator<_ValType> tmp_swap = A_;
  116. A_ = B_;
  117. B_ = tmp_swap;
  118. }
  119. // The final result should be in A.
  120. if (A_ != Ptr2Itr<_ValType>(&A[0], N)) {
  121. #pragma omp parallel for
  122. for (int i = 0; i < N; i++) A[i] = A_[i];
  123. }
  124. }
  125. template <class T> inline void omp_par::merge_sort(T A, T A_last) {
  126. typedef typename std::iterator_traits<T>::value_type _ValType;
  127. omp_par::merge_sort(A, A_last, std::less<_ValType>());
  128. }
  129. template <class ConstIter, class Int> typename std::iterator_traits<ConstIter>::value_type omp_par::reduce(ConstIter A, Int cnt) {
  130. typedef typename std::iterator_traits<ConstIter>::value_type ValueType;
  131. ValueType sum = 0;
  132. #pragma omp parallel for reduction(+ : sum)
  133. for (Int i = 0; i < cnt; i++) sum += A[i];
  134. return sum;
  135. }
  136. template <class ConstIter, class Iter, class Int> void omp_par::scan(ConstIter A, Iter B, Int cnt) {
  137. typedef typename std::iterator_traits<Iter>::value_type ValueType;
  138. Integer p = omp_get_max_threads();
  139. if (cnt < (Int)100 * p) {
  140. for (Int i = 1; i < cnt; i++) B[i] = B[i - 1] + A[i - 1];
  141. return;
  142. }
  143. Int step_size = cnt / p;
  144. #pragma omp parallel for schedule(static)
  145. for (Integer i = 0; i < p; i++) {
  146. Int start = i * step_size;
  147. Int end = start + step_size;
  148. if (i == p - 1) end = cnt;
  149. if (i != 0) B[start] = 0;
  150. for (Int j = (Int)start + 1; j < (Int)end; j++) B[j] = B[j - 1] + A[j - 1];
  151. }
  152. Vector<ValueType> sum(p);
  153. sum[0] = 0;
  154. for (Integer i = 1; i < p; i++) sum[i] = sum[i - 1] + B[i * step_size - 1] + A[i * step_size - 1];
  155. #pragma omp parallel for schedule(static)
  156. for (Integer i = 1; i < p; i++) {
  157. Int start = i * step_size;
  158. Int end = start + step_size;
  159. if (i == p - 1) end = cnt;
  160. ValueType sum_ = sum[i];
  161. for (Int j = (Int)start; j < (Int)end; j++) B[j] += sum_;
  162. }
  163. }
  164. } // end namespace