17 #ifndef HIGHWAY_HWY_CONTRIB_SORT_ALGO_INL_H_
18 #define HIGHWAY_HWY_CONTRIB_SORT_ALGO_INL_H_
31 #define HAVE_AVX2SORT 0
33 #define HAVE_PARALLEL_IPS4O (HAVE_IPS4O && 1)
34 #define HAVE_PDQSORT 0
35 #define HAVE_SORT512 0
42 #if HAVE_IPS4O || HAVE_PARALLEL_IPS4O
43 #include "third_party/ips4o/include/ips4o.hpp"
44 #include "third_party/ips4o/include/ips4o/thread_pool.hpp"
47 #include "third_party/boost/allowed/sort/sort.hpp"
79 sumf_ +=
static_cast<double>(value);
86 static_cast<int>(other.
count_));
91 double(other.
min_),
double(other.
max_));
98 const double mul = 1E-9;
99 const double err = std::abs(
sumf_ * mul - other.
sumf_ * mul);
110 T
min_ = hwy::HighestValue<T>();
111 T
max_ = hwy::LowestValue<T>();
123 #if HAVE_PARALLEL_IPS4O
147 #if HAVE_PARALLEL_IPS4O
148 case Algo::kParallelIPS4O:
166 return "unreachable";
173 #if defined(HIGHWAY_HWY_CONTRIB_SORT_ALGO_TOGGLE) == \
174 defined(HWY_TARGET_TOGGLE)
175 #ifdef HIGHWAY_HWY_CONTRIB_SORT_ALGO_TOGGLE
176 #undef HIGHWAY_HWY_CONTRIB_SORT_ALGO_TOGGLE
178 #define HIGHWAY_HWY_CONTRIB_SORT_ALGO_TOGGLE
192 z = (z ^ (z >> 30)) * 0xBF58476D1CE4E5B9ull;
193 z = (z ^ (z >> 27)) * 0x94D049BB133111EBull;
194 return z ^ (z >> 31);
200 template <
class DU64>
203 for (
size_t i = 1; i < 2 *
Lanes(du64); ++i) {
209 template <
class DU64>
216 s1 =
Xor(s1, ShiftLeft<23>(s1));
217 state1 =
Xor(s1,
Xor(s0,
Xor(ShiftRight<18>(s1), ShiftRight<5>(s0))));
222 template <
typename T,
class DU64, HWY_IF_NOT_FLOAT(T)>
226 return And(bits, mask);
231 template <
typename T,
class DU64, HWY_IF_FLOAT(T)>
232 Vec<DU64>
RandomValues(DU64 du64, Vec<DU64>& s0, Vec<DU64>& s1,
233 const Vec<DU64> mask) {
235 const Vec<DU64> values =
And(bits, mask);
236 #if HWY_TARGET == HWY_SCALAR
237 const RebindToSigned<DU64> di;
239 const Repartition<MakeSigned<T>, DU64> di;
243 const Vec<DU64> no_nan =
248 template <
class DU64>
253 : 0xFFFFFFFFFFFFFFFFull);
257 : 0xFFFFFFFFFFFFFFFFull);
261 : 0x00000000FFFFFFFFull);
268 template <
typename T>
271 using VU64 =
Vec<decltype(du64)>;
272 const size_t N64 =
Lanes(du64);
273 auto buf = hwy::AllocateAligned<uint64_t>(2 * N64);
275 auto s0 =
Load(du64, buf.get());
276 auto s1 =
Load(du64, buf.get() + N64);
278 const VU64 mask =
MaskForDist(du64, dist,
sizeof(T));
283 for (; i +
N <= num; i +=
N) {
284 const VU64 bits = RandomValues<T>(du64, s0, s1, mask);
287 StoreU(bits, du64, buf.get());
288 memcpy(
v + i, buf.get(), N64 *
sizeof(uint64_t));
290 StoreU(bits, du64,
reinterpret_cast<uint64_t*
>(
v + i));
294 const VU64 bits = RandomValues<T>(du64, s0, s1, mask);
295 StoreU(bits, du64, buf.get());
296 memcpy(
v + i, buf.get(), (num - i) *
sizeof(T));
300 for (
size_t i = 0; i < num; ++i) {
311 #if HAVE_PARALLEL_IPS4O
312 ips4o::StdThreadPool pool{
313 static_cast<int>(std::thread::hardware_concurrency() / 2)};
315 std::vector<ThreadLocal>
tls{1};
318 template <
class Order,
typename T>
328 return avx2::quicksort(inout,
static_cast<int>(num));
333 if (Order().IsAscending()) {
334 return ips4o::sort(inout, inout + num, std::less<T>());
336 return ips4o::sort(inout, inout + num, std::greater<T>());
340 #if HAVE_PARALLEL_IPS4O
341 case Algo::kParallelIPS4O:
342 if (Order().IsAscending()) {
343 return ips4o::parallel::sort(inout, inout + num, std::less<T>(),
346 return ips4o::parallel::sort(inout, inout + num, std::greater<T>(),
359 if (Order().IsAscending()) {
360 return boost::sort::pdqsort_branchless(inout, inout + num,
363 return boost::sort::pdqsort_branchless(inout, inout + num,
369 if (Order().IsAscending()) {
370 return std::sort(inout, inout + num, std::less<T>());
372 return std::sort(inout, inout + num, std::greater<T>());
376 return shared.
tls[thread].sorter(inout, num, Order());
380 if (Order().IsAscending()) {
381 const SharedTraits<TraitsLane<detail::OrderAscending>> st;
384 const SharedTraits<TraitsLane<detail::OrderDescending>> st;
#define HWY_RESTRICT
Definition: base.h:63
#define HWY_POP_ATTRIBUTES
Definition: base.h:116
#define HWY_ABORT(format,...)
Definition: base.h:143
#define HWY_INLINE
Definition: base.h:64
#define HWY_PUSH_ATTRIBUTES(targets_str)
Definition: base.h:115
#define HWY_ASSERT(condition)
Definition: base.h:147
Definition: algo-inl.h:190
static void GenerateSeeds(DU64 du64, TFromD< DU64 > *HWY_RESTRICT seeds)
Definition: algo-inl.h:201
static Vec< DU64 > RandomBits(DU64, Vec< DU64 > &state0, Vec< DU64 > &state1)
Definition: algo-inl.h:210
static HWY_INLINE uint64_t SplitMix64(uint64_t z)
Definition: algo-inl.h:191
void HeapSort(Traits st, T *HWY_RESTRICT keys, const size_t num)
Definition: vqsort-inl.h:71
d
Definition: rvv-inl.h:1656
void Run(Algo algo, T *HWY_RESTRICT inout, size_t num, SharedState &shared, size_t thread)
Definition: algo-inl.h:319
HWY_API Vec128< T, N > Load(Simd< T, N, 0 > d, const T *HWY_RESTRICT p)
Definition: arm_neon-inl.h:2205
HWY_API Vec128< T, N > Zero(Simd< T, N, 0 > d)
Definition: arm_neon-inl.h:733
HWY_API size_t Lanes(Simd< T, N, kPow2 > d)
Definition: arm_sve-inl.h:218
Rebind< MakeFloat< TFromD< D > >, D > RebindToFloat
Definition: ops/shared-inl.h:203
Vec< DU64 > MaskForDist(DU64 du64, const Dist dist, size_t sizeof_t)
Definition: algo-inl.h:249
HWY_API Vec128< float > ConvertTo(Full128< float >, const Vec128< int32_t > v)
Definition: arm_neon-inl.h:2788
HWY_API V Add(V a, V b)
Definition: arm_neon-inl.h:5217
HWY_API void StoreU(const Vec128< uint8_t > v, Full128< uint8_t >, uint8_t *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:2224
svuint16_t Set(Simd< bfloat16_t, N, kPow2 > d, bfloat16_t arg)
Definition: arm_sve-inl.h:282
HWY_API Vec128< T, N > And(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1440
HWY_API Vec128< T, N > BitCast(Simd< T, N, 0 > d, Vec128< FromT, N *sizeof(T)/sizeof(FromT)> v)
Definition: arm_neon-inl.h:710
HWY_API Vec128< T, N > Xor(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1489
InputStats< T > GenerateInput(const Dist dist, T *v, size_t num)
Definition: algo-inl.h:269
typename D::template Repartition< T > Repartition
Definition: ops/shared-inl.h:207
N
Definition: rvv-inl.h:1656
ScalableTag< T, -1 > SortTag
Definition: contrib/sort/shared-inl.h:112
Vec< DU64 > RandomValues(DU64 du64, Vec< DU64 > &s0, Vec< DU64 > &s1, const Vec< DU64 > mask)
Definition: algo-inl.h:223
const vfloat64m1_t v
Definition: rvv-inl.h:1656
typename D::T TFromD
Definition: ops/shared-inl.h:192
decltype(Zero(D())) Vec
Definition: generic_ops-inl.h:32
Definition: aligned_allocator.h:27
std::vector< Dist > AllDist()
Definition: algo-inl.h:57
const char * AlgoName(Algo algo)
Definition: algo-inl.h:137
Dist
Definition: algo-inl.h:55
const char * DistName(Dist dist)
Definition: algo-inl.h:61
constexpr T MantissaMask()
Definition: base.h:554
Algo
Definition: algo-inl.h:116
typename detail::Relations< T >::Unsigned MakeUnsigned
Definition: base.h:452
#define HWY_NAMESPACE
Definition: set_macros-inl.h:80
Definition: algo-inl.h:310
std::vector< ThreadLocal > tls
Definition: algo-inl.h:315
Definition: algo-inl.h:306
Sorter sorter
Definition: algo-inl.h:307
Definition: sorting_networks-inl.h:42
Definition: traits-inl.h:247