Grok  9.7.5
ops/shared-inl.h
Go to the documentation of this file.
1 // Copyright 2020 Google LLC
2 // SPDX-License-Identifier: Apache-2.0
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 // http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 
16 // Per-target definitions shared by ops/*.h and user code.
17 
18 #include <cmath>
19 
20 #include "hwy/base.h"
21 
22 // Separate header because foreach_target.h re-enables its include guard.
23 #include "hwy/ops/set_macros-inl.h"
24 
25 // Relies on the external include guard in highway.h.
27 namespace hwy {
28 namespace HWY_NAMESPACE {
29 
30 // Highway operations are implemented as overloaded functions selected using an
31 // internal-only tag type D := Simd<T, N, kPow2>. T is the lane type. kPow2 is a
32 // shift count applied to scalable vectors. Instead of referring to Simd<>
33 // directly, users create D via aliases ScalableTag<T[, kPow2]>() (defaults to a
34 // full vector, or fractions/groups if the argument is negative/positive),
35 // CappedTag<T, kLimit> or FixedTag<T, kNumLanes>. The actual number of lanes is
36 // Lanes(D()), a power of two. For scalable vectors, N is either HWY_LANES or a
37 // cap. For constexpr-size vectors, N is the actual number of lanes. This
38 // ensures Half<Full512<T>> is the same type as Full256<T>, as required by x86.
39 template <typename Lane, size_t N, int kPow2>
40 struct Simd {
41  constexpr Simd() = default;
42  using T = Lane;
43  static_assert((N & (N - 1)) == 0 && N != 0, "N must be a power of two");
44 
45  // Only for use by MaxLanes, required by MSVC. Cannot be enum because GCC
46  // warns when using enums and non-enums in the same expression. Cannot be
47  // static constexpr function (another MSVC limitation).
48  static constexpr size_t kPrivateN = N;
49  static constexpr int kPrivatePow2 = kPow2;
50 
51  template <typename NewT>
52  static constexpr size_t NewN() {
53  // Round up to correctly handle scalars with N=1.
54  return (N * sizeof(T) + sizeof(NewT) - 1) / sizeof(NewT);
55  }
56 
57 #if HWY_HAVE_SCALABLE
58  template <typename NewT>
59  static constexpr int Pow2Ratio() {
60  return (sizeof(NewT) > sizeof(T))
61  ? static_cast<int>(CeilLog2(sizeof(NewT) / sizeof(T)))
62  : -static_cast<int>(CeilLog2(sizeof(T) / sizeof(NewT)));
63  }
64 #endif
65 
66  // Widening/narrowing ops change the number of lanes and/or their type.
67  // To initialize such vectors, we need the corresponding tag types:
68 
69 // PromoteTo/DemoteTo() with another lane type, but same number of lanes.
70 #if HWY_HAVE_SCALABLE
71  template <typename NewT>
72  using Rebind = Simd<NewT, N, kPow2 + Pow2Ratio<NewT>()>;
73 #else
74  template <typename NewT>
76 #endif
77 
78  // Change lane type while keeping the same vector size, e.g. for MulEven.
79  template <typename NewT>
81 
82 // Half the lanes while keeping the same lane type, e.g. for LowerHalf.
83 // Round up to correctly handle scalars with N=1.
84 #if HWY_HAVE_SCALABLE
85  // Reducing the cap (N) is required for SVE - if N is the limiter for f32xN,
86  // then we expect Half<Rebind<u16>> to have N/2 lanes (rounded up).
87  using Half = Simd<T, (N + 1) / 2, kPow2 - 1>;
88 #else
89  using Half = Simd<T, (N + 1) / 2, kPow2>;
90 #endif
91 
92 // Twice the lanes while keeping the same lane type, e.g. for Combine.
93 #if HWY_HAVE_SCALABLE
95 #else
97 #endif
98 };
99 
100 namespace detail {
101 
102 #if HWY_HAVE_SCALABLE
103 
104 template <typename T, size_t N, int kPow2>
105 constexpr bool IsFull(Simd<T, N, kPow2> /* d */) {
106  return N == HWY_LANES(T) && kPow2 == 0;
107 }
108 
109 #endif
110 
// Returns the number of lanes (possibly zero) after applying a shift:
// - 0: no change;
// - [1,3]: a group of 2,4,8 [fractional] vectors;
// - [-3,-1]: a fraction of a vector from 1/8 to 1/2.
// Negative shifts divide by a power of two and round down, so the result is
// zero whenever N is smaller than that divisor.
constexpr size_t ScaleByPower(size_t N, int pow2) {
  return pow2 < 0 ? (N >> (-pow2)) : (N << pow2);
}
118 
119 // Struct wrappers enable validation of arguments via static_assert.
120 template <typename T, int kPow2>
122  static_assert(-3 <= kPow2 && kPow2 <= 3, "Fraction must be 1/8 to 8");
123 #if HWY_TARGET == HWY_RVV
124  // Only RVV supports register groups.
125  using type = Simd<T, HWY_LANES(T), kPow2>;
126 #elif HWY_HAVE_SCALABLE
127  // For SVE[2], only allow full or fractions.
128  using type = Simd<T, HWY_LANES(T), HWY_MIN(kPow2, 0)>;
129 #elif HWY_TARGET == HWY_SCALAR
130  using type = Simd<T, /*N=*/1, 0>;
131 #else
132  // Only allow full or fractions.
133  using type = Simd<T, ScaleByPower(HWY_LANES(T), HWY_MIN(kPow2, 0)), 0>;
134 #endif
135 };
136 
137 template <typename T, size_t kLimit>
139  static_assert(kLimit != 0, "Does not make sense to have zero lanes");
140  using type = Simd<T, HWY_MIN(kLimit, HWY_MAX_BYTES / sizeof(T)), 0>;
141 };
142 
143 template <typename T, size_t kNumLanes>
145  static_assert(kNumLanes != 0, "Does not make sense to have zero lanes");
146  static_assert(kNumLanes * sizeof(T) <= HWY_MAX_BYTES, "Too many lanes");
147 #if HWY_TARGET == HWY_SCALAR
148  // HWY_MAX_BYTES would still allow uint8x8, which is not supported.
149  static_assert(kNumLanes == 1, "Scalar only supports one lane");
150 #endif
152 };
153 
154 } // namespace detail
155 
156 // Alias for a tag describing a full vector (kPow2 == 0: the most common usage,
157 // e.g. 1D loops where the application does not care about the vector size) or a
158 // fraction/multiple of one. Multiples are the same as full vectors for all
159 // targets except RVV. Fractions (kPow2 < 0) are useful as the argument/return
160 // value of type promotion and demotion.
161 template <typename T, int kPow2 = 0>
163 
164 // Alias for a tag describing a vector with *up to* kLimit active lanes, even on
165 // targets with scalable vectors and HWY_SCALAR. The runtime lane count
166 // `Lanes(tag)` may be less than kLimit, and is 1 on HWY_SCALAR. This alias is
167 // typically used for 1D loops with a relatively low application-defined upper
168 // bound, e.g. for 8x8 DCTs. However, it is better if data structures are
169 // designed to be vector-length-agnostic (e.g. a hybrid SoA where there are
170 // chunks of `M >= MaxLanes(d)` DC components followed by M AC1, .., and M AC63;
171 // this would enable vector-length-agnostic loops using ScalableTag).
172 template <typename T, size_t kLimit>
174 
175 // Alias for a tag describing a vector with *exactly* kNumLanes active lanes,
176 // even on targets with scalable vectors. HWY_SCALAR only supports one lane.
177 // All other targets allow kNumLanes up to HWY_MAX_BYTES / sizeof(T).
178 //
179 // NOTE: if the application does not need to support HWY_SCALAR (+), use this
180 // instead of CappedTag to emphasize that there will be exactly kNumLanes lanes.
181 // This is useful for data structures that rely on exactly 128-bit SIMD, but
182 // these are discouraged because they cannot benefit from wider vectors.
183 // Instead, applications would ideally define a larger problem size and loop
184 // over it with the (unknown size) vectors from ScalableTag.
185 //
186 // + e.g. if the baseline is known to support SIMD, or the application requires
187 // ops such as TableLookupBytes not supported by HWY_SCALAR.
188 template <typename T, size_t kNumLanes>
190 
// Lane type of the tag D, i.e. its nested type `T`.
template <class D>
using TFromD = typename D::T;
193 
// Tag for the same number of lanes as D, but with the LaneType T.
// Forwards to the member alias template D::Rebind.
template <class T, class D>
using Rebind = typename D::template Rebind<T>;
197 
198 template <class D>
200 template <class D>
202 template <class D>
204 
// Tag for the same total size as D, but with the LaneType T.
// Forwards to the member alias template D::Repartition.
template <class T, class D>
using Repartition = typename D::template Repartition<T>;
208 
209 template <class D>
211 template <class D>
213 
// Tag for the same lane type as D, but half the lanes.
// Forwards to the member type D::Half.
template <class D>
using Half = typename D::Half;
217 
// Tag for the same lane type as D, but twice the lanes.
// Forwards to the member type D::Twice.
template <class D>
using Twice = typename D::Twice;
221 
// Same as base.h macros but with a Simd<T, N, kPow2> argument instead of T.
// Each one forwards the tag's lane type, TFromD<D>, to the macro of the same
// name without the _D suffix.
#define HWY_IF_UNSIGNED_D(D) HWY_IF_UNSIGNED(TFromD<D>)
#define HWY_IF_SIGNED_D(D) HWY_IF_SIGNED(TFromD<D>)
#define HWY_IF_FLOAT_D(D) HWY_IF_FLOAT(TFromD<D>)
#define HWY_IF_NOT_FLOAT_D(D) HWY_IF_NOT_FLOAT(TFromD<D>)
#define HWY_IF_LANE_SIZE_D(D, bytes) HWY_IF_LANE_SIZE(TFromD<D>, bytes)
#define HWY_IF_NOT_LANE_SIZE_D(D, bytes) HWY_IF_NOT_LANE_SIZE(TFromD<D>, bytes)
229 
// MSVC workaround: use PrivateN directly instead of MaxLanes.
// The condition compares kPrivateN * sizeof(lane) against 16 bytes (128 bits).
#define HWY_IF_LT128_D(D) \
  hwy::EnableIf<D::kPrivateN * sizeof(TFromD<D>) < 16>* = nullptr
#define HWY_IF_GE128_D(D) \
  hwy::EnableIf<D::kPrivateN * sizeof(TFromD<D>) >= 16>* = nullptr
235 
// Same, but with a vector argument: the lane type is obtained via TFromV<V>.
#define HWY_IF_UNSIGNED_V(V) HWY_IF_UNSIGNED(TFromV<V>)
#define HWY_IF_SIGNED_V(V) HWY_IF_SIGNED(TFromV<V>)
#define HWY_IF_FLOAT_V(V) HWY_IF_FLOAT(TFromV<V>)
#define HWY_IF_LANE_SIZE_V(V, bytes) HWY_IF_LANE_SIZE(TFromV<V>, bytes)
241 
// For implementing functions for a specific type.
// IsSame<...>() in template arguments is broken on MSVC2015, hence the
// IsSameT<...>::value form.
#define HWY_IF_LANES_ARE(T, V) EnableIf<IsSameT<T, TFromV<V>>::value>* = nullptr
245 
246 template <class D>
247 HWY_INLINE HWY_MAYBE_UNUSED constexpr int Pow2(D /* d */) {
248  return D::kPrivatePow2;
249 }
250 
// Enabled only if Pow2 of the tag D is at least MIN.
// MSVC requires the explicit <D>.
#define HWY_IF_POW2_GE(D, MIN) hwy::EnableIf<Pow2<D>(D()) >= (MIN)>* = nullptr
253 
254 #if HWY_HAVE_SCALABLE
255 
256 // Upper bound on the number of lanes. Intended for template arguments and
257 // reducing code size (e.g. for SSE4, we know at compile-time that vectors will
258 // not exceed 16 bytes). WARNING: this may be a loose bound, use Lanes() as the
259 // actual size for allocating storage. WARNING: MSVC might not be able to deduce
260 // arguments if this is used in EnableIf. See HWY_IF_LT128_D above.
261 template <class D>
262 HWY_INLINE HWY_MAYBE_UNUSED constexpr size_t MaxLanes(D) {
263  return detail::ScaleByPower(HWY_MIN(D::kPrivateN, HWY_LANES(TFromD<D>)),
264  D::kPrivatePow2);
265 }
266 
267 #else
268 // Workaround for MSVC 2017: T,N,kPow2 argument deduction fails, so returning N
269 // is not an option, nor does a member function work.
270 template <class D>
271 HWY_INLINE HWY_MAYBE_UNUSED constexpr size_t MaxLanes(D) {
272  return D::kPrivateN;
273 }
274 
275 // (Potentially) non-constant actual size of the vector at runtime, subject to
276 // the limit imposed by the Simd. Useful for advancing loop counters.
277 // Targets with scalable vectors define this themselves.
278 template <typename T, size_t N, int kPow2>
280  return N;
281 }
282 
283 #endif // !HWY_HAVE_SCALABLE
284 
// NOTE: GCC generates incorrect code for vector arguments to non-inlined
// functions in two situations:
// - on Windows and GCC 10.3, passing by value crashes due to unaligned loads:
//   https://gcc.gnu.org/bugzilla/show_bug.cgi?id=54412.
// - on ARM64 and GCC 9.3.0 or 11.2.1, passing by value causes many (but not
//   all) tests to fail.
//
// We therefore pass by const& only on GCC and (Windows or ARM64). This alias
// must be used for all vector/mask parameters of functions marked HWY_NOINLINE,
// and possibly also other functions that are not inlined.
#if HWY_COMPILER_GCC && !HWY_COMPILER_CLANG && \
    ((defined(_WIN32) || defined(_WIN64)) || HWY_ARCH_ARM_A64)
// Affected compiler/platform combination: pass by const reference.
template <class V>
using VecArg = const V&;
#else
// Everywhere else: pass by value.
template <class V>
using VecArg = V;
#endif
303 
304 // NOLINTNEXTLINE(google-readability-namespace-comments)
305 } // namespace HWY_NAMESPACE
306 } // namespace hwy
#define HWY_MIN(a, b)
Definition: base.h:127
#define HWY_INLINE
Definition: base.h:64
#define HWY_MAYBE_UNUSED
Definition: base.h:75
constexpr size_t ScaleByPower(size_t N, int pow2)
Definition: ops/shared-inl.h:115
V VecArg
Definition: ops/shared-inl.h:301
Repartition< MakeWide< TFromD< D > >, D > RepartitionToWide
Definition: ops/shared-inl.h:210
Rebind< MakeUnsigned< TFromD< D > >, D > RebindToUnsigned
Definition: ops/shared-inl.h:201
HWY_INLINE constexpr HWY_MAYBE_UNUSED int Pow2(D)
Definition: ops/shared-inl.h:247
typename detail::CappedTagChecker< T, kLimit >::type CappedTag
Definition: ops/shared-inl.h:173
HWY_API size_t Lanes(Simd< T, N, kPow2 > d)
Definition: arm_sve-inl.h:218
Rebind< MakeFloat< TFromD< D > >, D > RebindToFloat
Definition: ops/shared-inl.h:203
typename D::Twice Twice
Definition: ops/shared-inl.h:220
Rebind< MakeSigned< TFromD< D > >, D > RebindToSigned
Definition: ops/shared-inl.h:199
Repartition< MakeNarrow< TFromD< D > >, D > RepartitionToNarrow
Definition: ops/shared-inl.h:212
typename detail::ScalableTagChecker< T, kPow2 >::type ScalableTag
Definition: ops/shared-inl.h:162
typename D::template Rebind< T > Rebind
Definition: ops/shared-inl.h:196
HWY_INLINE constexpr HWY_MAYBE_UNUSED size_t MaxLanes(D)
Definition: ops/shared-inl.h:271
typename detail::FixedTagChecker< T, kNumLanes >::type FixedTag
Definition: ops/shared-inl.h:189
typename D::Half Half
Definition: ops/shared-inl.h:216
typename D::template Repartition< T > Repartition
Definition: ops/shared-inl.h:207
N
Definition: rvv-inl.h:1656
typename D::T TFromD
Definition: ops/shared-inl.h:192
Definition: aligned_allocator.h:27
constexpr size_t CeilLog2(TI x)
Definition: base.h:700
HWY_AFTER_NAMESPACE()
HWY_BEFORE_NAMESPACE()
#define HWY_MAX_BYTES
Definition: set_macros-inl.h:82
#define HWY_LANES(T)
Definition: set_macros-inl.h:83
#define HWY_NAMESPACE
Definition: set_macros-inl.h:80
Definition: ops/shared-inl.h:40
constexpr Simd()=default
Simd< NewT, N, kPow2 > Rebind
Definition: ops/shared-inl.h:75
static constexpr size_t NewN()
Definition: ops/shared-inl.h:52
static constexpr int kPrivatePow2
Definition: ops/shared-inl.h:49
static constexpr size_t kPrivateN
Definition: ops/shared-inl.h:48
Lane T
Definition: ops/shared-inl.h:42
Definition: ops/shared-inl.h:138
Definition: ops/shared-inl.h:144
Definition: ops/shared-inl.h:121