Grok  9.7.5
contrib/sort/shared-inl.h
Go to the documentation of this file.
1 // Copyright 2021 Google LLC
2 // SPDX-License-Identifier: Apache-2.0
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 // http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 
16 // Definitions shared between vqsort-inl and sorting_networks-inl.
17 
18 // Normal include guard for target-independent parts
19 #ifndef HIGHWAY_HWY_CONTRIB_SORT_SHARED_INL_H_
20 #define HIGHWAY_HWY_CONTRIB_SORT_SHARED_INL_H_
21 
22 #include "hwy/base.h"
23 
24 namespace hwy {
25 
26 // Internal constants - these are to avoid magic numbers/literals and cannot be
27 // changed without also changing the associated code.
28 struct SortConstants {
29 // SortingNetwork reshapes its input into a matrix. This is the maximum number
30 // of *keys* per vector.
31 #if HWY_COMPILER_MSVC
32  static constexpr size_t kMaxCols = 8; // avoids build timeout
33 #else
34  static constexpr size_t kMaxCols = 16; // enough for u32 in 512-bit vector
35 #endif
36 
37  // 16 rows is a compromise between using the 32 AVX-512/SVE/RVV registers,
38  // fitting within 16 AVX2 registers with only a few spills, keeping BaseCase
39  // code size reasonable (7 KiB for AVX-512 and 16 cols), and minimizing the
40  // extra logN factor for larger networks (for which only loose upper bounds
41  // on size are known).
42  static constexpr size_t kMaxRowsLog2 = 4;
43  static constexpr size_t kMaxRows = size_t{1} << kMaxRowsLog2;
44 
45  static constexpr HWY_INLINE size_t BaseCaseNum(size_t N) {
46  return kMaxRows * HWY_MIN(N, kMaxCols);
47  }
48 
49  // Unrolling is important (pipelining and amortizing branch mispredictions);
50  // 2x is sufficient to reach full memory bandwidth on SKX in Partition, but
51  // somewhat slower for sorting than 4x.
52  //
53  // To change, must also update left + 3 * N etc. in the loop.
54  static constexpr size_t kPartitionUnroll = 4;
55 
56  static constexpr HWY_INLINE size_t PartitionBufNum(size_t N) {
57  // The main loop reads kPartitionUnroll vectors, and first loads from
58  // both left and right beforehand, so it requires min = 2 *
59  // kPartitionUnroll vectors. To handle smaller amounts (only guaranteed
60  // >= BaseCaseNum), we partition the right side into a buffer. We need
61  // another vector at the end so CompressStore does not overwrite anything.
62  return (2 * kPartitionUnroll + 1) * N;
63  }
64 
65  // Chunk := group of keys loaded for sampling a pivot. Matches the typical
66  // cache line size of 64 bytes to get maximum benefit per L2 miss. If vectors
67  // are larger, use entire vectors to ensure we do not overrun the array.
68  static constexpr HWY_INLINE size_t LanesPerChunk(size_t sizeof_t, size_t N) {
69  return HWY_MAX(64 / sizeof_t, N);
70  }
71 
72  static constexpr HWY_INLINE size_t PivotBufNum(size_t sizeof_t, size_t N) {
73  // 3 chunks of medians, 1 chunk of median medians plus two padding vectors.
74  return (3 + 1) * LanesPerChunk(sizeof_t, N) + 2 * N;
75  }
76 
77  template <typename T>
78  static constexpr HWY_INLINE size_t BufNum(size_t N) {
79  // One extra for padding plus another for full-vector loads.
80  return HWY_MAX(BaseCaseNum(N) + 2 * N,
81  HWY_MAX(PartitionBufNum(N), PivotBufNum(sizeof(T), N)));
82  }
83 
84  template <typename T>
85  static constexpr HWY_INLINE size_t BufBytes(size_t vector_size) {
86  return sizeof(T) * BufNum<T>(vector_size / sizeof(T));
87  }
88 };
89 
90 } // namespace hwy
91 
92 #endif // HIGHWAY_HWY_CONTRIB_SORT_SHARED_INL_H_
93 
94 // Per-target
95 #if defined(HIGHWAY_HWY_CONTRIB_SORT_SHARED_TOGGLE) == \
96  defined(HWY_TARGET_TOGGLE)
97 #ifdef HIGHWAY_HWY_CONTRIB_SORT_SHARED_TOGGLE
98 #undef HIGHWAY_HWY_CONTRIB_SORT_SHARED_TOGGLE
99 #else
100 #define HIGHWAY_HWY_CONTRIB_SORT_SHARED_TOGGLE
101 #endif
102 
103 #include "hwy/highway.h"
104 
105 namespace hwy {
106 namespace HWY_NAMESPACE {
107 
108 // Default tag / vector width selector.
109 #if HWY_TARGET == HWY_RVV
110 // Use LMUL = 1/2; for SEW=64 this ends up emulated via vsetvl.
111 template <typename T>
112 using SortTag = ScalableTag<T, -1>;
113 #else
114 template <typename T>
115 using SortTag = ScalableTag<T>;
116 #endif
117 
118 // NOLINTNEXTLINE(google-readability-namespace-comments)
119 } // namespace HWY_NAMESPACE
120 } // namespace hwy
121 
122 #endif // HIGHWAY_HWY_CONTRIB_SORT_SHARED_TOGGLE
#define HWY_MAX(a, b)
Definition: base.h:128
#define HWY_MIN(a, b)
Definition: base.h:127
#define HWY_INLINE
Definition: base.h:64
typename detail::ScalableTagChecker< T, kPow2 >::type ScalableTag
Definition: ops/shared-inl.h:162
N
Definition: rvv-inl.h:1656
ScalableTag< T, -1 > SortTag
Definition: contrib/sort/shared-inl.h:112
Definition: aligned_allocator.h:27
#define HWY_NAMESPACE
Definition: set_macros-inl.h:80
Definition: contrib/sort/shared-inl.h:28
static constexpr HWY_INLINE size_t BufBytes(size_t vector_size)
Definition: contrib/sort/shared-inl.h:85
static constexpr size_t kMaxCols
Definition: contrib/sort/shared-inl.h:34
static constexpr HWY_INLINE size_t PartitionBufNum(size_t N)
Definition: contrib/sort/shared-inl.h:56
static constexpr HWY_INLINE size_t BufNum(size_t N)
Definition: contrib/sort/shared-inl.h:78
static constexpr HWY_INLINE size_t PivotBufNum(size_t sizeof_t, size_t N)
Definition: contrib/sort/shared-inl.h:72
static constexpr size_t kMaxRows
Definition: contrib/sort/shared-inl.h:43
static constexpr HWY_INLINE size_t BaseCaseNum(size_t N)
Definition: contrib/sort/shared-inl.h:45
static constexpr size_t kMaxRowsLog2
Definition: contrib/sort/shared-inl.h:42
static constexpr size_t kPartitionUnroll
Definition: contrib/sort/shared-inl.h:54
static constexpr HWY_INLINE size_t LanesPerChunk(size_t sizeof_t, size_t N)
Definition: contrib/sort/shared-inl.h:68