// Copyright 2021 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// ARM SVE[2] vectors (length not known at compile time).
// External include guard in highway.h - see comment there.

#include <stddef.h>
#include <stdint.h>

#include <arm_sve.h>

#include "hwy/base.h"
#include "hwy/ops/shared-inl.h"

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

template <class V>
struct DFromV_t {};  // specialized in macros
template <class V>
using DFromV = typename DFromV_t<RemoveConst<V>>::type;

template <class V>
using TFromV = TFromD<DFromV<V>>;

#define HWY_IF_UNSIGNED_V(V) HWY_IF_UNSIGNED(TFromV<V>)
#define HWY_IF_SIGNED_V(V) HWY_IF_SIGNED(TFromV<V>)
#define HWY_IF_FLOAT_V(V) HWY_IF_FLOAT(TFromV<V>)
#define HWY_IF_LANE_SIZE_V(V, bytes) HWY_IF_LANE_SIZE(TFromV<V>, bytes)

// ================================================== MACROS

// Generate specializations and function definitions using X macros. Although
// harder to read and debug, writing everything manually is too bulky.

namespace detail {  // for code folding

// Unsigned:
#define HWY_SVE_FOREACH_U08(X_MACRO, NAME, OP) X_MACRO(uint, u, 8, 8, NAME, OP)
#define HWY_SVE_FOREACH_U16(X_MACRO, NAME, OP) X_MACRO(uint, u, 16, 8, NAME, OP)
#define HWY_SVE_FOREACH_U32(X_MACRO, NAME, OP) \
  X_MACRO(uint, u, 32, 16, NAME, OP)
#define HWY_SVE_FOREACH_U64(X_MACRO, NAME, OP) \
  X_MACRO(uint, u, 64, 32, NAME, OP)

// Signed:
#define HWY_SVE_FOREACH_I08(X_MACRO, NAME, OP) X_MACRO(int, s, 8, 8, NAME, OP)
#define HWY_SVE_FOREACH_I16(X_MACRO, NAME, OP) X_MACRO(int, s, 16, 8, NAME, OP)
#define HWY_SVE_FOREACH_I32(X_MACRO, NAME, OP) X_MACRO(int, s, 32, 16, NAME, OP)
#define HWY_SVE_FOREACH_I64(X_MACRO, NAME, OP) X_MACRO(int, s, 64, 32, NAME, OP)

// Float:
#define HWY_SVE_FOREACH_F16(X_MACRO, NAME, OP) \
  X_MACRO(float, f, 16, 16, NAME, OP)
#define HWY_SVE_FOREACH_F32(X_MACRO, NAME, OP) \
  X_MACRO(float, f, 32, 16, NAME, OP)
#define HWY_SVE_FOREACH_F64(X_MACRO, NAME, OP) \
  X_MACRO(float, f, 64, 32, NAME, OP)

// For all element sizes:
#define HWY_SVE_FOREACH_U(X_MACRO, NAME, OP) \
  HWY_SVE_FOREACH_U08(X_MACRO, NAME, OP) \
  HWY_SVE_FOREACH_U16(X_MACRO, NAME, OP) \
  HWY_SVE_FOREACH_U32(X_MACRO, NAME, OP) \
  HWY_SVE_FOREACH_U64(X_MACRO, NAME, OP)

#define HWY_SVE_FOREACH_I(X_MACRO, NAME, OP) \
  HWY_SVE_FOREACH_I08(X_MACRO, NAME, OP) \
  HWY_SVE_FOREACH_I16(X_MACRO, NAME, OP) \
  HWY_SVE_FOREACH_I32(X_MACRO, NAME, OP) \
  HWY_SVE_FOREACH_I64(X_MACRO, NAME, OP)

#define HWY_SVE_FOREACH_F(X_MACRO, NAME, OP) \
  HWY_SVE_FOREACH_F16(X_MACRO, NAME, OP) \
  HWY_SVE_FOREACH_F32(X_MACRO, NAME, OP) \
  HWY_SVE_FOREACH_F64(X_MACRO, NAME, OP)

// Commonly used type categories for a given element size:
#define HWY_SVE_FOREACH_UI08(X_MACRO, NAME, OP) \
  HWY_SVE_FOREACH_U08(X_MACRO, NAME, OP) \
  HWY_SVE_FOREACH_I08(X_MACRO, NAME, OP)

#define HWY_SVE_FOREACH_UI16(X_MACRO, NAME, OP) \
  HWY_SVE_FOREACH_U16(X_MACRO, NAME, OP) \
  HWY_SVE_FOREACH_I16(X_MACRO, NAME, OP)

#define HWY_SVE_FOREACH_UI32(X_MACRO, NAME, OP) \
  HWY_SVE_FOREACH_U32(X_MACRO, NAME, OP) \
  HWY_SVE_FOREACH_I32(X_MACRO, NAME, OP)

#define HWY_SVE_FOREACH_UI64(X_MACRO, NAME, OP) \
  HWY_SVE_FOREACH_U64(X_MACRO, NAME, OP) \
  HWY_SVE_FOREACH_I64(X_MACRO, NAME, OP)

#define HWY_SVE_FOREACH_UIF3264(X_MACRO, NAME, OP) \
  HWY_SVE_FOREACH_UI32(X_MACRO, NAME, OP) \
  HWY_SVE_FOREACH_UI64(X_MACRO, NAME, OP) \
  HWY_SVE_FOREACH_F32(X_MACRO, NAME, OP) \
  HWY_SVE_FOREACH_F64(X_MACRO, NAME, OP)

// Commonly used type categories:
#define HWY_SVE_FOREACH_UI(X_MACRO, NAME, OP) \
  HWY_SVE_FOREACH_U(X_MACRO, NAME, OP) \
  HWY_SVE_FOREACH_I(X_MACRO, NAME, OP)

#define HWY_SVE_FOREACH_IF(X_MACRO, NAME, OP) \
  HWY_SVE_FOREACH_I(X_MACRO, NAME, OP) \
  HWY_SVE_FOREACH_F(X_MACRO, NAME, OP)

#define HWY_SVE_FOREACH(X_MACRO, NAME, OP) \
  HWY_SVE_FOREACH_U(X_MACRO, NAME, OP) \
  HWY_SVE_FOREACH_I(X_MACRO, NAME, OP) \
  HWY_SVE_FOREACH_F(X_MACRO, NAME, OP)

// Assemble types for use in x-macros
#define HWY_SVE_T(BASE, BITS) BASE##BITS##_t
#define HWY_SVE_D(BASE, BITS, N, POW2) Simd<HWY_SVE_T(BASE, BITS), N, POW2>
#define HWY_SVE_V(BASE, BITS) sv##BASE##BITS##_t

}  // namespace detail

#define HWY_SPECIALIZE(BASE, CHAR, BITS, HALF, NAME, OP) \
  template <> \
  struct DFromV_t<HWY_SVE_V(BASE, BITS)> { \
    using type = ScalableTag<HWY_SVE_T(BASE, BITS)>; \
  };

HWY_SVE_FOREACH(HWY_SPECIALIZE, _, _)
#undef HWY_SPECIALIZE
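
// Illustration (not part of the original file): for uint32_t, the x-macro
// arguments are BASE=uint, CHAR=u, BITS=32, HALF=16, so HWY_SPECIALIZE
// expands to roughly
//   template <>
//   struct DFromV_t<svuint32_t> {
//     using type = ScalableTag<uint32_t>;
//   };
// which is how DFromV<svuint32_t> recovers the descriptor ("d") type.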

// Note: _x (don't-care value for inactive lanes) avoids additional MOVPRFX
// instructions, and we only use it when the predicate is ptrue anyway.

// vector = f(vector), e.g. Not
#define HWY_SVE_RETV_ARGPV(BASE, CHAR, BITS, HALF, NAME, OP) \
  HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) v) { \
    return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), v); \
  }
#define HWY_SVE_RETV_ARGV(BASE, CHAR, BITS, HALF, NAME, OP) \
  HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) v) { \
    return sv##OP##_##CHAR##BITS(v); \
  }

// vector = f(vector, scalar), e.g. detail::AddN
#define HWY_SVE_RETV_ARGPVN(BASE, CHAR, BITS, HALF, NAME, OP) \
  HWY_API HWY_SVE_V(BASE, BITS) \
      NAME(HWY_SVE_V(BASE, BITS) a, HWY_SVE_T(BASE, BITS) b) { \
    return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), a, b); \
  }
#define HWY_SVE_RETV_ARGVN(BASE, CHAR, BITS, HALF, NAME, OP) \
  HWY_API HWY_SVE_V(BASE, BITS) \
      NAME(HWY_SVE_V(BASE, BITS) a, HWY_SVE_T(BASE, BITS) b) { \
    return sv##OP##_##CHAR##BITS(a, b); \
  }

// vector = f(vector, vector), e.g. Add
#define HWY_SVE_RETV_ARGPVV(BASE, CHAR, BITS, HALF, NAME, OP) \
  HWY_API HWY_SVE_V(BASE, BITS) \
      NAME(HWY_SVE_V(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b) { \
    return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), a, b); \
  }
#define HWY_SVE_RETV_ARGVV(BASE, CHAR, BITS, HALF, NAME, OP) \
  HWY_API HWY_SVE_V(BASE, BITS) \
      NAME(HWY_SVE_V(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b) { \
    return sv##OP##_##CHAR##BITS(a, b); \
  }

// ------------------------------ Lanes

namespace detail {

// Returns actual lanes of a hardware vector without rounding to a power of
// two.
HWY_INLINE size_t AllHardwareLanes(hwy::SizeTag<1> /* tag */) {
  return svcntb_pat(SV_ALL);
}
HWY_INLINE size_t AllHardwareLanes(hwy::SizeTag<2> /* tag */) {
  return svcnth_pat(SV_ALL);
}
HWY_INLINE size_t AllHardwareLanes(hwy::SizeTag<4> /* tag */) {
  return svcntw_pat(SV_ALL);
}
HWY_INLINE size_t AllHardwareLanes(hwy::SizeTag<8> /* tag */) {
  return svcntd_pat(SV_ALL);
}

// Returns actual lanes of a hardware vector, rounded down to a power of two.
HWY_INLINE size_t HardwareLanes(hwy::SizeTag<1> /* tag */) {
  return svcntb_pat(SV_POW2);
}
HWY_INLINE size_t HardwareLanes(hwy::SizeTag<2> /* tag */) {
  return svcnth_pat(SV_POW2);
}
HWY_INLINE size_t HardwareLanes(hwy::SizeTag<4> /* tag */) {
  return svcntw_pat(SV_POW2);
}
HWY_INLINE size_t HardwareLanes(hwy::SizeTag<8> /* tag */) {
  return svcntd_pat(SV_POW2);
}

}  // namespace detail

// Returns actual number of lanes after capping by N and shifting. May return 0
// (e.g. for "1/8th" of a u32x4 - would be 1 for 1/8th of u32x8).
template <typename T, size_t N, int kPow2>
HWY_API size_t Lanes(Simd<T, N, kPow2> d) {
  const size_t actual = detail::HardwareLanes(hwy::SizeTag<sizeof(T)>());
  // Common case of full vectors: avoid any extra instructions.
  if (detail::IsFull(d)) return actual;
  return HWY_MIN(detail::ScaleByPower(actual, kPow2), N);
}
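
// Usage sketch (illustrative, not part of the original file): process a
// runtime-sized array in vector-length-agnostic chunks.
//   const ScalableTag<float> d;   // full vector: kPow2 = 0
//   const size_t N = Lanes(d);    // e.g. 8 on a 256-bit SVE implementation
//   for (size_t i = 0; i + N <= size; i += N) {
//     // Load/compute/store N lanes at a time.
//   }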

// ================================================== MASK INIT

// One mask bit per byte; only the one belonging to the lowest byte is valid.

// ------------------------------ FirstN
#define HWY_SVE_FIRSTN(BASE, CHAR, BITS, HALF, NAME, OP) \
  template <size_t N, int kPow2> \
  HWY_API svbool_t NAME(HWY_SVE_D(BASE, BITS, N, kPow2) d, size_t count) { \
    const size_t limit = detail::IsFull(d) ? count : HWY_MIN(Lanes(d), count); \
    return sv##OP##_b##BITS##_u32(uint32_t{0}, static_cast<uint32_t>(limit)); \
  }
HWY_SVE_FOREACH(HWY_SVE_FIRSTN, FirstN, whilelt)
#undef HWY_SVE_FIRSTN
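
// Illustration (not part of the original file): FirstN maps to WHILELT, so
// FirstN(d, 3) activates the first three lanes (or fewer on shorter
// vectors); e.g. CountTrue(d, FirstN(d, 3)) == 3 when Lanes(d) >= 3.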

namespace detail {

// All-true mask from a macro
#define HWY_SVE_PTRUE(BITS) svptrue_pat_b##BITS(SV_POW2)

#define HWY_SVE_WRAP_PTRUE(BASE, CHAR, BITS, HALF, NAME, OP) \
  template <size_t N, int kPow2> \
  HWY_API svbool_t NAME(HWY_SVE_D(BASE, BITS, N, kPow2) /* d */) { \
    return HWY_SVE_PTRUE(BITS); \
  }

HWY_SVE_FOREACH(HWY_SVE_WRAP_PTRUE, PTrue, ptrue)  // return all-true
#undef HWY_SVE_WRAP_PTRUE

HWY_API svbool_t PFalse() { return svpfalse_b(); }

// Returns all-true if d is HWY_FULL or FirstN(N) after capping N.
//
// This is used in functions that load/store memory; other functions (e.g.
// arithmetic) can ignore d and use PTrue instead.
template <class D>
svbool_t MakeMask(D d) {
  return IsFull(d) ? PTrue(d) : FirstN(d, Lanes(d));
}

}  // namespace detail

// ================================================== INIT

// ------------------------------ Set
// vector = f(d, scalar), e.g. Set
#define HWY_SVE_SET(BASE, CHAR, BITS, HALF, NAME, OP) \
  template <size_t N, int kPow2> \
  HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_D(BASE, BITS, N, kPow2) /* d */, \
                                     HWY_SVE_T(BASE, BITS) arg) { \
    return sv##OP##_##CHAR##BITS(arg); \
  }

HWY_SVE_FOREACH(HWY_SVE_SET, Set, dup_n)
#undef HWY_SVE_SET

// Required for Zero and VFromD
template <size_t N, int kPow2>
HWY_API svuint16_t Set(Simd<bfloat16_t, N, kPow2> d, bfloat16_t arg) {
  return Set(RebindToUnsigned<decltype(d)>(), arg.bits);
}

template <class D>
using VFromD = decltype(Set(D(), TFromD<D>()));

// ------------------------------ Zero

template <class D>
HWY_API VFromD<D> Zero(D d) {
  return Set(d, 0);
}

// ------------------------------ Undefined

#define HWY_SVE_UNDEFINED(BASE, CHAR, BITS, HALF, NAME, OP) \
  template <size_t N, int kPow2> \
  HWY_API HWY_SVE_V(BASE, BITS) \
      NAME(HWY_SVE_D(BASE, BITS, N, kPow2) /* d */) { \
    return sv##OP##_##CHAR##BITS(); \
  }

HWY_SVE_FOREACH(HWY_SVE_UNDEFINED, Undefined, undef)

// ------------------------------ BitCast

namespace detail {

// u8: no change
#define HWY_SVE_CAST_NOP(BASE, CHAR, BITS, HALF, NAME, OP) \
  HWY_API HWY_SVE_V(BASE, BITS) BitCastToByte(HWY_SVE_V(BASE, BITS) v) { \
    return v; \
  } \
  template <size_t N, int kPow2> \
  HWY_API HWY_SVE_V(BASE, BITS) BitCastFromByte( \
      HWY_SVE_D(BASE, BITS, N, kPow2) /* d */, HWY_SVE_V(BASE, BITS) v) { \
    return v; \
  }

// All other types
#define HWY_SVE_CAST(BASE, CHAR, BITS, HALF, NAME, OP) \
  HWY_INLINE svuint8_t BitCastToByte(HWY_SVE_V(BASE, BITS) v) { \
    return sv##OP##_u8_##CHAR##BITS(v); \
  } \
  template <size_t N, int kPow2> \
  HWY_INLINE HWY_SVE_V(BASE, BITS) \
      BitCastFromByte(HWY_SVE_D(BASE, BITS, N, kPow2) /* d */, svuint8_t v) { \
    return sv##OP##_##CHAR##BITS##_u8(v); \
  }

HWY_SVE_FOREACH_U08(HWY_SVE_CAST_NOP, _, _)
HWY_SVE_FOREACH_I08(HWY_SVE_CAST, _, reinterpret)
HWY_SVE_FOREACH_UI16(HWY_SVE_CAST, _, reinterpret)
HWY_SVE_FOREACH_UI32(HWY_SVE_CAST, _, reinterpret)
HWY_SVE_FOREACH_UI64(HWY_SVE_CAST, _, reinterpret)
HWY_SVE_FOREACH_F(HWY_SVE_CAST, _, reinterpret)

#undef HWY_SVE_CAST_NOP
#undef HWY_SVE_CAST

template <size_t N, int kPow2>
HWY_INLINE svuint16_t BitCastFromByte(Simd<bfloat16_t, N, kPow2> /* d */,
                                      svuint8_t v) {
  return BitCastFromByte(Simd<uint16_t, N, kPow2>(), v);
}

}  // namespace detail

template <class D, class FromV>
HWY_API VFromD<D> BitCast(D d, FromV v) {
  return detail::BitCastFromByte(d, detail::BitCastToByte(v));
}
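
// Illustration (not part of the original file): routing every BitCast
// through u8 needs only two reinterprets per type instead of one per type
// pair.
//   const ScalableTag<float> df;
//   const RebindToUnsigned<decltype(df)> du;        // uint32_t descriptor
//   svuint32_t bits = BitCast(du, Set(df, 1.0f));   // 0x3F800000 per lane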

// ================================================== LOGICAL

// detail::*N() functions accept a scalar argument to avoid extra Set().

// ------------------------------ Not

HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPV, Not, not )  // NOLINT

// ------------------------------ And

namespace detail {
HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPVN, AndN, and_n)
}  // namespace detail

HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPVV, And, and)

template <class V, HWY_IF_FLOAT_V(V)>
HWY_API V And(const V a, const V b) {
  const DFromV<V> df;
  const RebindToUnsigned<decltype(df)> du;
  return BitCast(df, And(BitCast(du, a), BitCast(du, b)));
}

// ------------------------------ Or

HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPVV, Or, orr)

template <class V, HWY_IF_FLOAT_V(V)>
HWY_API V Or(const V a, const V b) {
  const DFromV<V> df;
  const RebindToUnsigned<decltype(df)> du;
  return BitCast(df, Or(BitCast(du, a), BitCast(du, b)));
}

// ------------------------------ Xor

namespace detail {
HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPVN, XorN, eor_n)
}  // namespace detail

HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPVV, Xor, eor)

template <class V, HWY_IF_FLOAT_V(V)>
HWY_API V Xor(const V a, const V b) {
  const DFromV<V> df;
  const RebindToUnsigned<decltype(df)> du;
  return BitCast(df, Xor(BitCast(du, a), BitCast(du, b)));
}

// ------------------------------ AndNot

namespace detail {
#define HWY_SVE_RETV_ARGPVN_SWAP(BASE, CHAR, BITS, HALF, NAME, OP) \
  HWY_API HWY_SVE_V(BASE, BITS) \
      NAME(HWY_SVE_T(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b) { \
    return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), b, a); \
  }

HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPVN_SWAP, AndNotN, bic_n)
#undef HWY_SVE_RETV_ARGPVN_SWAP
}  // namespace detail

#define HWY_SVE_RETV_ARGPVV_SWAP(BASE, CHAR, BITS, HALF, NAME, OP) \
  HWY_API HWY_SVE_V(BASE, BITS) \
      NAME(HWY_SVE_V(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b) { \
    return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), b, a); \
  }
HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPVV_SWAP, AndNot, bic)
#undef HWY_SVE_RETV_ARGPVV_SWAP

template <class V, HWY_IF_FLOAT_V(V)>
HWY_API V AndNot(const V a, const V b) {
  const DFromV<V> df;
  const RebindToUnsigned<decltype(df)> du;
  return BitCast(df, AndNot(BitCast(du, a), BitCast(du, b)));
}

// ------------------------------ OrAnd

template <class V>
HWY_API V OrAnd(const V o, const V a1, const V a2) {
  return Or(o, And(a1, a2));
}

// ------------------------------ PopulationCount

#ifdef HWY_NATIVE_POPCNT
#undef HWY_NATIVE_POPCNT
#else
#define HWY_NATIVE_POPCNT
#endif

// Need to return original type instead of unsigned.
#define HWY_SVE_POPCNT(BASE, CHAR, BITS, HALF, NAME, OP) \
  HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) v) { \
    return BitCast(DFromV<decltype(v)>(), \
                   sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), v)); \
  }
HWY_SVE_FOREACH_UI(HWY_SVE_POPCNT, PopulationCount, cnt)
#undef HWY_SVE_POPCNT

// ================================================== SIGN

// ------------------------------ Neg
HWY_SVE_FOREACH_IF(HWY_SVE_RETV_ARGPV, Neg, neg)

// ------------------------------ Abs
HWY_SVE_FOREACH_IF(HWY_SVE_RETV_ARGPV, Abs, abs)

// ------------------------------ CopySign[ToAbs]

template <class V>
HWY_API V CopySign(const V magn, const V sign) {
  const auto msb = SignBit(DFromV<V>());
  return Or(AndNot(msb, magn), And(msb, sign));
}

template <class V>
HWY_API V CopySignToAbs(const V abs, const V sign) {
  const auto msb = SignBit(DFromV<V>());
  return Or(abs, And(msb, sign));
}
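
// Illustration (not part of the original file): for f32, msb = SignBit(d) is
// 0x80000000 in each lane, so CopySign(-1.5f, +2.0f) clears the sign bit of
// the magnitude (AndNot) and ORs in the sign of the second operand,
// yielding +1.5f.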

// ================================================== ARITHMETIC

// ------------------------------ Add

namespace detail {
HWY_SVE_FOREACH(HWY_SVE_RETV_ARGPVN, AddN, add_n)
}  // namespace detail

HWY_SVE_FOREACH(HWY_SVE_RETV_ARGPVV, Add, add)

// ------------------------------ Sub

namespace detail {
// Can't use HWY_SVE_RETV_ARGPVN because caller wants to specify pg.
#define HWY_SVE_RETV_ARGPVN_MASK(BASE, CHAR, BITS, HALF, NAME, OP) \
  HWY_API HWY_SVE_V(BASE, BITS) \
      NAME(svbool_t pg, HWY_SVE_V(BASE, BITS) a, HWY_SVE_T(BASE, BITS) b) { \
    return sv##OP##_##CHAR##BITS##_z(pg, a, b); \
  }

HWY_SVE_FOREACH(HWY_SVE_RETV_ARGPVN_MASK, SubN, sub_n)
#undef HWY_SVE_RETV_ARGPVN_MASK
}  // namespace detail

HWY_SVE_FOREACH(HWY_SVE_RETV_ARGPVV, Sub, sub)

// ------------------------------ SumsOf8
HWY_API svuint64_t SumsOf8(const svuint8_t v) {
  const ScalableTag<uint32_t> du32;
  const ScalableTag<uint64_t> du64;
  const svbool_t pg = detail::PTrue(du64);

  const svuint32_t sums_of_4 = svdot_n_u32(Zero(du32), v, 1);
  // Compute pairwise sum of u32 and extend to u64.
  // TODO(janwas): on SVE2, we can instead use svaddp.
  const svuint64_t hi = svlsr_n_u64_x(pg, BitCast(du64, sums_of_4), 32);
  // Isolate the lower 32 bits (to be added to the upper 32 and zero-extended)
  const svuint64_t lo = svextw_u64_x(pg, BitCast(du64, sums_of_4));
  return Add(hi, lo);
}
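
// Illustration (not part of the original file): svdot_n_u32 with multiplier 1
// sums each group of four bytes into a u32 lane; the shift/sign-extend pair
// then adds adjacent u32 pairs into a u64. For input bytes 1..8, the dot
// product yields {10, 26} and SumsOf8 returns 36 in the first u64 lane.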

// ------------------------------ SaturatedAdd

HWY_SVE_FOREACH_UI08(HWY_SVE_RETV_ARGVV, SaturatedAdd, qadd)
HWY_SVE_FOREACH_UI16(HWY_SVE_RETV_ARGVV, SaturatedAdd, qadd)

// ------------------------------ SaturatedSub

HWY_SVE_FOREACH_UI08(HWY_SVE_RETV_ARGVV, SaturatedSub, qsub)
HWY_SVE_FOREACH_UI16(HWY_SVE_RETV_ARGVV, SaturatedSub, qsub)

// ------------------------------ AbsDiff
HWY_SVE_FOREACH_F(HWY_SVE_RETV_ARGPVV, AbsDiff, abd)

// ------------------------------ ShiftLeft[Same]

#define HWY_SVE_SHIFT_N(BASE, CHAR, BITS, HALF, NAME, OP) \
  template <int kBits> \
  HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) v) { \
    return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), v, kBits); \
  } \
  HWY_API HWY_SVE_V(BASE, BITS) \
      NAME##Same(HWY_SVE_V(BASE, BITS) v, HWY_SVE_T(uint, BITS) bits) { \
    return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), v, bits); \
  }

HWY_SVE_FOREACH_UI(HWY_SVE_SHIFT_N, ShiftLeft, lsl_n)

// ------------------------------ ShiftRight[Same]

HWY_SVE_FOREACH_U(HWY_SVE_SHIFT_N, ShiftRight, lsr_n)
HWY_SVE_FOREACH_I(HWY_SVE_SHIFT_N, ShiftRight, asr_n)

#undef HWY_SVE_SHIFT_N

// ------------------------------ RotateRight

// TODO(janwas): svxar on SVE2
template <int kBits, class V>
HWY_API V RotateRight(const V v) {
  constexpr size_t kSizeInBits = sizeof(TFromV<V>) * 8;
  static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count");
  if (kBits == 0) return v;
  return Or(ShiftRight<kBits>(v), ShiftLeft<kSizeInBits - kBits>(v));
}
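
// Illustration (not part of the original file): for u32 lanes,
// RotateRight<8>(0x12345678) = (0x12345678 >> 8) | (0x12345678 << 24)
//                            = 0x00123456 | 0x78000000 = 0x78123456.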

// ------------------------------ Shl/r

#define HWY_SVE_SHIFT(BASE, CHAR, BITS, HALF, NAME, OP) \
  HWY_API HWY_SVE_V(BASE, BITS) \
      NAME(HWY_SVE_V(BASE, BITS) v, HWY_SVE_V(BASE, BITS) bits) { \
    const RebindToUnsigned<DFromV<decltype(v)>> du; \
    return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), v, \
                                     BitCast(du, bits)); \
  }

HWY_SVE_FOREACH_UI(HWY_SVE_SHIFT, Shl, lsl)

HWY_SVE_FOREACH_U(HWY_SVE_SHIFT, Shr, lsr)
HWY_SVE_FOREACH_I(HWY_SVE_SHIFT, Shr, asr)

#undef HWY_SVE_SHIFT

// ------------------------------ Min/Max

HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPVV, Min, min)
HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPVV, Max, max)
HWY_SVE_FOREACH_F(HWY_SVE_RETV_ARGPVV, Min, minnm)
HWY_SVE_FOREACH_F(HWY_SVE_RETV_ARGPVV, Max, maxnm)

namespace detail {
HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPVN, MinN, min_n)
HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPVN, MaxN, max_n)
}  // namespace detail

// ------------------------------ Mul
HWY_SVE_FOREACH_UI16(HWY_SVE_RETV_ARGPVV, Mul, mul)
HWY_SVE_FOREACH_UIF3264(HWY_SVE_RETV_ARGPVV, Mul, mul)

// ------------------------------ MulHigh
HWY_SVE_FOREACH_UI16(HWY_SVE_RETV_ARGPVV, MulHigh, mulh)
namespace detail {
HWY_SVE_FOREACH_UI32(HWY_SVE_RETV_ARGPVV, MulHigh, mulh)
HWY_SVE_FOREACH_U64(HWY_SVE_RETV_ARGPVV, MulHigh, mulh)
}  // namespace detail

// ------------------------------ MulFixedPoint15
HWY_API svint16_t MulFixedPoint15(svint16_t a, svint16_t b) {
#if HWY_TARGET == HWY_SVE2
  return svqrdmulh_s16(a, b);
#else
  const DFromV<decltype(a)> d;
  const RebindToUnsigned<decltype(d)> du;

  const svuint16_t lo = BitCast(du, Mul(a, b));
  const svint16_t hi = MulHigh(a, b);
  // We want (lo + 0x4000) >> 15, but that can overflow, and if it does we
  // must carry that into the result. Instead isolate the top two bits
  // because only they can influence the result.
  const svuint16_t lo_top2 = ShiftRight<14>(lo);
  // Bits 11: add 2, 10: add 1, 01: add 1, 00: add 0.
  const svuint16_t rounding = ShiftRight<1>(detail::AddN(lo_top2, 1));
  return Add(Add(hi, hi), BitCast(d, rounding));
#endif
}
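
// Worked example (illustrative, not part of the original file): for
// a = b = 16384 (0.5 in Q15), the 32-bit product is 2^28, so hi = 4096 and
// lo = 0; rounding = (0 + 1) >> 1 = 0 and the result is 2*4096 = 8192,
// i.e. 0.25 in Q15, matching (a*b + 0x4000) >> 15.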

// ------------------------------ Div
HWY_SVE_FOREACH_F(HWY_SVE_RETV_ARGPVV, Div, div)

// ------------------------------ ApproximateReciprocal
HWY_SVE_FOREACH_F32(HWY_SVE_RETV_ARGV, ApproximateReciprocal, recpe)

// ------------------------------ Sqrt
HWY_SVE_FOREACH_F(HWY_SVE_RETV_ARGPV, Sqrt, sqrt)

// ------------------------------ ApproximateReciprocalSqrt
HWY_SVE_FOREACH_F32(HWY_SVE_RETV_ARGV, ApproximateReciprocalSqrt, rsqrte)

// ------------------------------ MulAdd
#define HWY_SVE_FMA(BASE, CHAR, BITS, HALF, NAME, OP) \
  HWY_API HWY_SVE_V(BASE, BITS) \
      NAME(HWY_SVE_V(BASE, BITS) mul, HWY_SVE_V(BASE, BITS) x, \
           HWY_SVE_V(BASE, BITS) add) { \
    return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), x, mul, add); \
  }

HWY_SVE_FOREACH_F(HWY_SVE_FMA, MulAdd, mad)

// ------------------------------ NegMulAdd
HWY_SVE_FOREACH_F(HWY_SVE_FMA, NegMulAdd, msb)

// ------------------------------ MulSub
HWY_SVE_FOREACH_F(HWY_SVE_FMA, MulSub, nmsb)

// ------------------------------ NegMulSub
HWY_SVE_FOREACH_F(HWY_SVE_FMA, NegMulSub, nmad)

#undef HWY_SVE_FMA

// ------------------------------ Round etc.

HWY_SVE_FOREACH_F(HWY_SVE_RETV_ARGPV, Round, rintn)
HWY_SVE_FOREACH_F(HWY_SVE_RETV_ARGPV, Floor, rintm)
HWY_SVE_FOREACH_F(HWY_SVE_RETV_ARGPV, Ceil, rintp)
HWY_SVE_FOREACH_F(HWY_SVE_RETV_ARGPV, Trunc, rintz)

// ================================================== MASK

// ------------------------------ RebindMask
template <class D, typename MFrom>
HWY_API svbool_t RebindMask(const D /*d*/, const MFrom mask) {
  return mask;
}

// ------------------------------ Mask logical

HWY_API svbool_t Not(svbool_t m) {
  // We don't know the lane type, so assume 8-bit. For larger types, this will
  // de-canonicalize the predicate, i.e. set bits to 1 even though they do not
  // correspond to the lowest byte in the lane. Per ARM, such bits are ignored.
  return svnot_b_z(HWY_SVE_PTRUE(8), m);
}
HWY_API svbool_t And(svbool_t a, svbool_t b) {
  return svand_b_z(b, b, a);  // same order as AndNot for consistency
}
HWY_API svbool_t AndNot(svbool_t a, svbool_t b) {
  return svbic_b_z(b, b, a);  // reversed order like NEON
}
HWY_API svbool_t Or(svbool_t a, svbool_t b) {
  return svsel_b(a, a, b);  // a ? true : b
}
HWY_API svbool_t Xor(svbool_t a, svbool_t b) {
  return svsel_b(a, svnand_b_z(a, a, b), b);  // a ? !(a & b) : b.
}

// ------------------------------ CountTrue

#define HWY_SVE_COUNT_TRUE(BASE, CHAR, BITS, HALF, NAME, OP) \
  template <size_t N, int kPow2> \
  HWY_API size_t NAME(HWY_SVE_D(BASE, BITS, N, kPow2) d, svbool_t m) { \
    return sv##OP##_b##BITS(detail::MakeMask(d), m); \
  }

HWY_SVE_FOREACH(HWY_SVE_COUNT_TRUE, CountTrue, cntp)
#undef HWY_SVE_COUNT_TRUE

// For 16-bit Compress: full vector, not limited to SV_POW2.
namespace detail {

#define HWY_SVE_COUNT_TRUE_FULL(BASE, CHAR, BITS, HALF, NAME, OP) \
  template <size_t N, int kPow2> \
  HWY_API size_t NAME(HWY_SVE_D(BASE, BITS, N, kPow2) /* d */, svbool_t m) { \
    return sv##OP##_b##BITS(svptrue_b##BITS(), m); \
  }

HWY_SVE_FOREACH(HWY_SVE_COUNT_TRUE_FULL, CountTrueFull, cntp)
#undef HWY_SVE_COUNT_TRUE_FULL

}  // namespace detail

// ------------------------------ AllFalse
template <class D>
HWY_API bool AllFalse(D d, svbool_t m) {
  return !svptest_any(detail::MakeMask(d), m);
}

// ------------------------------ AllTrue
template <class D>
HWY_API bool AllTrue(D d, svbool_t m) {
  return CountTrue(d, m) == Lanes(d);
}

// ------------------------------ FindFirstTrue
template <class D>
HWY_API intptr_t FindFirstTrue(D d, svbool_t m) {
  return AllFalse(d, m) ? intptr_t{-1}
                        : static_cast<intptr_t>(
                              CountTrue(d, svbrkb_b_z(detail::MakeMask(d), m)));
}
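
// Illustration (not part of the original file): BRKB activates the lanes
// before the first true lane and clears the rest, so counting its survivors
// gives the index. For m = {0, 0, 1, 0, ...}, svbrkb yields {1, 1, 0, 0, ...}
// and FindFirstTrue returns 2.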

// ------------------------------ IfThenElse
#define HWY_SVE_IF_THEN_ELSE(BASE, CHAR, BITS, HALF, NAME, OP) \
  HWY_API HWY_SVE_V(BASE, BITS) \
      NAME(svbool_t m, HWY_SVE_V(BASE, BITS) yes, HWY_SVE_V(BASE, BITS) no) { \
    return sv##OP##_##CHAR##BITS(m, yes, no); \
  }

HWY_SVE_FOREACH(HWY_SVE_IF_THEN_ELSE, IfThenElse, sel)
#undef HWY_SVE_IF_THEN_ELSE

// ------------------------------ IfThenElseZero
template <class M, class V>
HWY_API V IfThenElseZero(const M mask, const V yes) {
  return IfThenElse(mask, yes, Zero(DFromV<V>()));
}

// ------------------------------ IfThenZeroElse
template <class M, class V>
HWY_API V IfThenZeroElse(const M mask, const V no) {
  return IfThenElse(mask, Zero(DFromV<V>()), no);
}

// ================================================== COMPARE

// mask = f(vector, vector)
#define HWY_SVE_COMPARE(BASE, CHAR, BITS, HALF, NAME, OP) \
  HWY_API svbool_t NAME(HWY_SVE_V(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b) { \
    return sv##OP##_##CHAR##BITS(HWY_SVE_PTRUE(BITS), a, b); \
  }
#define HWY_SVE_COMPARE_N(BASE, CHAR, BITS, HALF, NAME, OP) \
  HWY_API svbool_t NAME(HWY_SVE_V(BASE, BITS) a, HWY_SVE_T(BASE, BITS) b) { \
    return sv##OP##_##CHAR##BITS(HWY_SVE_PTRUE(BITS), a, b); \
  }

// ------------------------------ Eq
HWY_SVE_FOREACH(HWY_SVE_COMPARE, Eq, cmpeq)
namespace detail {
HWY_SVE_FOREACH(HWY_SVE_COMPARE_N, EqN, cmpeq_n)
}  // namespace detail

// ------------------------------ Ne
HWY_SVE_FOREACH(HWY_SVE_COMPARE, Ne, cmpne)
namespace detail {
HWY_SVE_FOREACH(HWY_SVE_COMPARE_N, NeN, cmpne_n)
}  // namespace detail

// ------------------------------ Lt
HWY_SVE_FOREACH(HWY_SVE_COMPARE, Lt, cmplt)
namespace detail {
HWY_SVE_FOREACH(HWY_SVE_COMPARE_N, LtN, cmplt_n)
}  // namespace detail

// ------------------------------ Le
HWY_SVE_FOREACH_F(HWY_SVE_COMPARE, Le, cmple)

#undef HWY_SVE_COMPARE
#undef HWY_SVE_COMPARE_N

// ------------------------------ Gt/Ge (swapped order)

template <class V>
HWY_API svbool_t Gt(const V a, const V b) {
  return Lt(b, a);
}
template <class V>
HWY_API svbool_t Ge(const V a, const V b) {
  return Le(b, a);
}

// ------------------------------ TestBit
template <class V>
HWY_API svbool_t TestBit(const V a, const V bit) {
  return detail::NeN(And(a, bit), 0);
}

// ------------------------------ MaskFromVec (Ne)
template <class V>
HWY_API svbool_t MaskFromVec(const V v) {
  return detail::NeN(v, static_cast<TFromV<V>>(0));
}

// ------------------------------ VecFromMask

template <class D, HWY_IF_NOT_FLOAT_D(D)>
HWY_API VFromD<D> VecFromMask(const D d, svbool_t mask) {
  const auto v0 = Zero(RebindToSigned<decltype(d)>());
  return BitCast(d, detail::SubN(mask, v0, 1));
}

template <class D, HWY_IF_FLOAT_D(D)>
HWY_API VFromD<D> VecFromMask(const D d, svbool_t mask) {
  return BitCast(d, VecFromMask(RebindToUnsigned<D>(), mask));
}

// ------------------------------ IfVecThenElse (MaskFromVec, IfThenElse)

template <class V>
HWY_API V IfVecThenElse(const V mask, const V yes, const V no) {
  // TODO(janwas): use svbsl for SVE2
  return IfThenElse(MaskFromVec(mask), yes, no);
}

// ================================================== MEMORY

// ------------------------------ Load/MaskedLoad/LoadDup128/Store/Stream

#define HWY_SVE_LOAD(BASE, CHAR, BITS, HALF, NAME, OP) \
  template <size_t N, int kPow2> \
  HWY_API HWY_SVE_V(BASE, BITS) \
      NAME(HWY_SVE_D(BASE, BITS, N, kPow2) d, \
           const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT p) { \
    return sv##OP##_##CHAR##BITS(detail::MakeMask(d), p); \
  }

#define HWY_SVE_MASKED_LOAD(BASE, CHAR, BITS, HALF, NAME, OP) \
  template <size_t N, int kPow2> \
  HWY_API HWY_SVE_V(BASE, BITS) \
      NAME(svbool_t m, HWY_SVE_D(BASE, BITS, N, kPow2) /* d */, \
           const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT p) { \
    return sv##OP##_##CHAR##BITS(m, p); \
  }

#define HWY_SVE_LOAD_DUP128(BASE, CHAR, BITS, HALF, NAME, OP) \
  template <size_t N, int kPow2> \
  HWY_API HWY_SVE_V(BASE, BITS) \
      NAME(HWY_SVE_D(BASE, BITS, N, kPow2) /* d */, \
           const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT p) { \
    /* All-true predicate to load all 128 bits. */ \
    return sv##OP##_##CHAR##BITS(HWY_SVE_PTRUE(8), p); \
  }

#define HWY_SVE_STORE(BASE, CHAR, BITS, HALF, NAME, OP) \
  template <size_t N, int kPow2> \
  HWY_API void NAME(HWY_SVE_V(BASE, BITS) v, \
                    HWY_SVE_D(BASE, BITS, N, kPow2) d, \
                    HWY_SVE_T(BASE, BITS) * HWY_RESTRICT p) { \
    sv##OP##_##CHAR##BITS(detail::MakeMask(d), p, v); \
  }

#define HWY_SVE_BLENDED_STORE(BASE, CHAR, BITS, HALF, NAME, OP) \
  template <size_t N, int kPow2> \
  HWY_API void NAME(HWY_SVE_V(BASE, BITS) v, svbool_t m, \
                    HWY_SVE_D(BASE, BITS, N, kPow2) /* d */, \
                    HWY_SVE_T(BASE, BITS) * HWY_RESTRICT p) { \
    sv##OP##_##CHAR##BITS(m, p, v); \
  }

HWY_SVE_FOREACH(HWY_SVE_LOAD, Load, ld1)
HWY_SVE_FOREACH(HWY_SVE_MASKED_LOAD, MaskedLoad, ld1)
HWY_SVE_FOREACH(HWY_SVE_LOAD_DUP128, LoadDup128, ld1rq)
HWY_SVE_FOREACH(HWY_SVE_STORE, Store, st1)
HWY_SVE_FOREACH(HWY_SVE_STORE, Stream, stnt1)
HWY_SVE_FOREACH(HWY_SVE_BLENDED_STORE, BlendedStore, st1)

#undef HWY_SVE_LOAD
#undef HWY_SVE_MASKED_LOAD
#undef HWY_SVE_LOAD_DUP128
#undef HWY_SVE_STORE
#undef HWY_SVE_BLENDED_STORE

// BF16 is the same as svuint16_t because BF16 is optional before v8.6.
template <size_t N, int kPow2>
HWY_API svuint16_t Load(Simd<bfloat16_t, N, kPow2> d,
                        const bfloat16_t* HWY_RESTRICT p) {
  return Load(RebindToUnsigned<decltype(d)>(),
              reinterpret_cast<const uint16_t * HWY_RESTRICT>(p));
}

template <size_t N, int kPow2>
HWY_API void Store(svuint16_t v, Simd<bfloat16_t, N, kPow2> d,
                   bfloat16_t* HWY_RESTRICT p) {
  Store(v, RebindToUnsigned<decltype(d)>(),
        reinterpret_cast<uint16_t * HWY_RESTRICT>(p));
}

// ------------------------------ Load/StoreU

// SVE only requires lane alignment, not natural alignment of the entire
// vector.
template <class D>
HWY_API VFromD<D> LoadU(D d, const TFromD<D>* HWY_RESTRICT p) {
  return Load(d, p);
}

template <class V, class D>
HWY_API void StoreU(const V v, D d, TFromD<D>* HWY_RESTRICT p) {
  Store(v, d, p);
}
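
// Usage sketch (illustrative, not part of the original file): a VL-agnostic
// copy built from the ops above; the final partial vector is handled with
// FirstN plus MaskedLoad/BlendedStore.
//   const ScalableTag<float> d;
//   const size_t N = Lanes(d);
//   size_t i = 0;
//   for (; i + N <= count; i += N) StoreU(LoadU(d, from + i), d, to + i);
//   const svbool_t m = FirstN(d, count - i);
//   BlendedStore(MaskedLoad(m, d, from + i), m, d, to + i);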

// ------------------------------ ScatterOffset/Index

#define HWY_SVE_SCATTER_OFFSET(BASE, CHAR, BITS, HALF, NAME, OP) \
  template <size_t N, int kPow2> \
  HWY_API void NAME(HWY_SVE_V(BASE, BITS) v, \
                    HWY_SVE_D(BASE, BITS, N, kPow2) d, \
                    HWY_SVE_T(BASE, BITS) * HWY_RESTRICT base, \
                    HWY_SVE_V(int, BITS) offset) { \
    sv##OP##_s##BITS##offset_##CHAR##BITS(detail::MakeMask(d), base, offset, \
                                          v); \
  }

#define HWY_SVE_SCATTER_INDEX(BASE, CHAR, BITS, HALF, NAME, OP) \
  template <size_t N, int kPow2> \
  HWY_API void NAME( \
      HWY_SVE_V(BASE, BITS) v, HWY_SVE_D(BASE, BITS, N, kPow2) d, \
      HWY_SVE_T(BASE, BITS) * HWY_RESTRICT base, HWY_SVE_V(int, BITS) index) { \
    sv##OP##_s##BITS##index_##CHAR##BITS(detail::MakeMask(d), base, index, v); \
  }

HWY_SVE_FOREACH_UIF3264(HWY_SVE_SCATTER_OFFSET, ScatterOffset, st1_scatter)
HWY_SVE_FOREACH_UIF3264(HWY_SVE_SCATTER_INDEX, ScatterIndex, st1_scatter)
#undef HWY_SVE_SCATTER_OFFSET
#undef HWY_SVE_SCATTER_INDEX

// ------------------------------ GatherOffset/Index

#define HWY_SVE_GATHER_OFFSET(BASE, CHAR, BITS, HALF, NAME, OP) \
  template <size_t N, int kPow2> \
  HWY_API HWY_SVE_V(BASE, BITS) \
      NAME(HWY_SVE_D(BASE, BITS, N, kPow2) d, \
           const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT base, \
           HWY_SVE_V(int, BITS) offset) { \
    return sv##OP##_s##BITS##offset_##CHAR##BITS(detail::MakeMask(d), base, \
                                                 offset); \
  }
#define HWY_SVE_GATHER_INDEX(BASE, CHAR, BITS, HALF, NAME, OP) \
  template <size_t N, int kPow2> \
  HWY_API HWY_SVE_V(BASE, BITS) \
      NAME(HWY_SVE_D(BASE, BITS, N, kPow2) d, \
           const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT base, \
           HWY_SVE_V(int, BITS) index) { \
    return sv##OP##_s##BITS##index_##CHAR##BITS(detail::MakeMask(d), base, \
                                                index); \
  }

HWY_SVE_FOREACH_UIF3264(HWY_SVE_GATHER_OFFSET, GatherOffset, ld1_gather)
HWY_SVE_FOREACH_UIF3264(HWY_SVE_GATHER_INDEX, GatherIndex, ld1_gather)
#undef HWY_SVE_GATHER_OFFSET
#undef HWY_SVE_GATHER_INDEX

// ------------------------------ StoreInterleaved3

#define HWY_SVE_STORE3(BASE, CHAR, BITS, HALF, NAME, OP) \
  template <size_t N, int kPow2> \
  HWY_API void NAME(HWY_SVE_V(BASE, BITS) v0, HWY_SVE_V(BASE, BITS) v1, \
                    HWY_SVE_V(BASE, BITS) v2, \
                    HWY_SVE_D(BASE, BITS, N, kPow2) d, \
                    HWY_SVE_T(BASE, BITS) * HWY_RESTRICT unaligned) { \
    const sv##BASE##BITS##x3_t triple = svcreate3##_##CHAR##BITS(v0, v1, v2); \
    sv##OP##_##CHAR##BITS(detail::MakeMask(d), unaligned, triple); \
  }
HWY_SVE_FOREACH_U08(HWY_SVE_STORE3, StoreInterleaved3, st3)

#undef HWY_SVE_STORE3

// ------------------------------ StoreInterleaved4

#define HWY_SVE_STORE4(BASE, CHAR, BITS, HALF, NAME, OP) \
  template <size_t N, int kPow2> \
  HWY_API void NAME(HWY_SVE_V(BASE, BITS) v0, HWY_SVE_V(BASE, BITS) v1, \
                    HWY_SVE_V(BASE, BITS) v2, HWY_SVE_V(BASE, BITS) v3, \
                    HWY_SVE_D(BASE, BITS, N, kPow2) d, \
                    HWY_SVE_T(BASE, BITS) * HWY_RESTRICT unaligned) { \
    const sv##BASE##BITS##x4_t quad = \
        svcreate4##_##CHAR##BITS(v0, v1, v2, v3); \
    sv##OP##_##CHAR##BITS(detail::MakeMask(d), unaligned, quad); \
  }
HWY_SVE_FOREACH_U08(HWY_SVE_STORE4, StoreInterleaved4, st4)

#undef HWY_SVE_STORE4

// ================================================== CONVERT

// ------------------------------ PromoteTo

// Same sign
#define HWY_SVE_PROMOTE_TO(BASE, CHAR, BITS, HALF, NAME, OP) \
  template <size_t N, int kPow2> \
  HWY_API HWY_SVE_V(BASE, BITS) NAME( \
      HWY_SVE_D(BASE, BITS, N, kPow2) /* tag */, HWY_SVE_V(BASE, HALF) v) { \
    return sv##OP##_##CHAR##BITS(v); \
  }

HWY_SVE_FOREACH_UI16(HWY_SVE_PROMOTE_TO, PromoteTo, unpklo)
HWY_SVE_FOREACH_UI32(HWY_SVE_PROMOTE_TO, PromoteTo, unpklo)
HWY_SVE_FOREACH_UI64(HWY_SVE_PROMOTE_TO, PromoteTo, unpklo)

// 2x
template <size_t N, int kPow2>
HWY_API svuint32_t PromoteTo(Simd<uint32_t, N, kPow2> dto, svuint8_t vfrom) {
  const RepartitionToWide<DFromV<decltype(vfrom)>> d2;
  return PromoteTo(dto, PromoteTo(d2, vfrom));
}
template <size_t N, int kPow2>
HWY_API svint32_t PromoteTo(Simd<int32_t, N, kPow2> dto, svint8_t vfrom) {
  const RepartitionToWide<DFromV<decltype(vfrom)>> d2;
  return PromoteTo(dto, PromoteTo(d2, vfrom));
}

// Sign change
template <size_t N, int kPow2>
HWY_API svint16_t PromoteTo(Simd<int16_t, N, kPow2> dto, svuint8_t vfrom) {
  const RebindToUnsigned<decltype(dto)> du;
  return BitCast(dto, PromoteTo(du, vfrom));
}
template <size_t N, int kPow2>
HWY_API svint32_t PromoteTo(Simd<int32_t, N, kPow2> dto, svuint16_t vfrom) {
  const RebindToUnsigned<decltype(dto)> du;
  return BitCast(dto, PromoteTo(du, vfrom));
}
template <size_t N, int kPow2>
HWY_API svint32_t PromoteTo(Simd<int32_t, N, kPow2> dto, svuint8_t vfrom) {
  const Repartition<uint16_t, DFromV<decltype(vfrom)>> du16;
  const Repartition<int16_t, decltype(du16)> di16;
  return PromoteTo(dto, BitCast(di16, PromoteTo(du16, vfrom)));
}

// ------------------------------ PromoteTo F

// svcvt* expects inputs in even lanes, whereas Highway wants lower lanes, so
// first replicate each lane once.
namespace detail {
HWY_SVE_FOREACH(HWY_SVE_RETV_ARGVV, ZipLower, zip1)
// Do not use zip2 to implement PromoteUpperTo or similar because vectors may be
// non-powers of two, so getting the actual "upper half" requires MaskUpperHalf.
}  // namespace detail

template <size_t N, int kPow2>
HWY_API svfloat32_t PromoteTo(Simd<float32_t, N, kPow2> /* d */,
                              const svfloat16_t v) {
  const svfloat16_t vv = detail::ZipLower(v, v);
  return svcvt_f32_f16_x(detail::PTrue(Simd<float16_t, N, kPow2>()), vv);
}

template <size_t N, int kPow2>
HWY_API svfloat64_t PromoteTo(Simd<float64_t, N, kPow2> /* d */,
                              const svfloat32_t v) {
  const svfloat32_t vv = detail::ZipLower(v, v);
  return svcvt_f64_f32_x(detail::PTrue(Simd<float32_t, N, kPow2>()), vv);
}

template <size_t N, int kPow2>
HWY_API svfloat64_t PromoteTo(Simd<float64_t, N, kPow2> /* d */,
                              const svint32_t v) {
  const svint32_t vv = detail::ZipLower(v, v);
  return svcvt_f64_s32_x(detail::PTrue(Simd<int32_t, N, kPow2>()), vv);
}

// For 16-bit Compress
namespace detail {
HWY_SVE_FOREACH_UI32(HWY_SVE_PROMOTE_TO, PromoteUpperTo, unpkhi)
#undef HWY_SVE_PROMOTE_TO

template <size_t N, int kPow2>
HWY_API svfloat32_t PromoteUpperTo(Simd<float, N, kPow2> df, svfloat16_t v) {
  const RebindToUnsigned<decltype(df)> du;
  const RepartitionToNarrow<decltype(du)> dn;
  return BitCast(df, PromoteUpperTo(du, BitCast(dn, v)));
}

}  // namespace detail

// ------------------------------ DemoteTo U

namespace detail {

// Saturates unsigned vectors to half/quarter-width TN.
template <typename TN, class VU>
VU SaturateU(VU v) {
  return detail::MinN(v, static_cast<TFromV<VU>>(LimitsMax<TN>()));
}

// Saturates signed vectors to half/quarter-width TN.
template <typename TN, class VI>
VI SaturateI(VI v) {
  return detail::MinN(detail::MaxN(v, LimitsMin<TN>()), LimitsMax<TN>());
}

}  // namespace detail

template <size_t N, int kPow2>
HWY_API svuint8_t DemoteTo(Simd<uint8_t, N, kPow2> dn, const svint16_t v) {
  const DFromV<decltype(v)> di;
  const RebindToUnsigned<decltype(di)> du;
  using TN = TFromD<decltype(dn)>;
  // First clamp negative numbers to zero and cast to unsigned.
  const svuint16_t clamped = BitCast(du, detail::MaxN(v, 0));
  // Saturate to unsigned-max and halve the width.
  const svuint8_t vn = BitCast(dn, detail::SaturateU<TN>(clamped));
  return svuzp1_u8(vn, vn);
}

template <size_t N, int kPow2>
HWY_API svuint16_t DemoteTo(Simd<uint16_t, N, kPow2> dn, const svint32_t v) {
  const DFromV<decltype(v)> di;
  const RebindToUnsigned<decltype(di)> du;
  using TN = TFromD<decltype(dn)>;
  // First clamp negative numbers to zero and cast to unsigned.
  const svuint32_t clamped = BitCast(du, detail::MaxN(v, 0));
  // Saturate to unsigned-max and halve the width.
  const svuint16_t vn = BitCast(dn, detail::SaturateU<TN>(clamped));
  return svuzp1_u16(vn, vn);
}

template <size_t N, int kPow2>
HWY_API svuint8_t DemoteTo(Simd<uint8_t, N, kPow2> dn, const svint32_t v) {
  const DFromV<decltype(v)> di;
  const RebindToUnsigned<decltype(di)> du;
  const RepartitionToNarrow<decltype(du)> d2;
  using TN = TFromD<decltype(dn)>;
  // First clamp negative numbers to zero and cast to unsigned.
  const svuint32_t clamped = BitCast(du, detail::MaxN(v, 0));
  // Saturate to unsigned-max and quarter the width.
  const svuint16_t cast16 = BitCast(d2, detail::SaturateU<TN>(clamped));
  const svuint8_t x2 = BitCast(dn, svuzp1_u16(cast16, cast16));
  return svuzp1_u8(x2, x2);
}

HWY_API svuint8_t U8FromU32(const svuint32_t v) {
  const DFromV<svuint32_t> du32;
  const RepartitionToNarrow<decltype(du32)> du16;
  const RepartitionToNarrow<decltype(du16)> du8;

  const svuint16_t cast16 = BitCast(du16, v);
  const svuint16_t x2 = svuzp1_u16(cast16, cast16);
  const svuint8_t cast8 = BitCast(du8, x2);
  return svuzp1_u8(cast8, cast8);
}
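
// Illustration (not part of the original file): the demote path is clamp,
// saturate, then UZP1 to pack the even (narrow) halves. For example,
// DemoteTo(du8, svint16 lanes {-5, 300, 7, ...}) clamps to {0, 300, 7, ...},
// saturates to {0, 255, 7, ...} and packs those into u8 lanes.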

// ------------------------------ DemoteTo I

template <size_t N, int kPow2>
HWY_API svint8_t DemoteTo(Simd<int8_t, N, kPow2> dn, const svint16_t v) {
#if HWY_TARGET == HWY_SVE2
  const svint8_t vn = BitCast(dn, svqxtnb_s16(v));
#else
  using TN = TFromD<decltype(dn)>;
  const svint8_t vn = BitCast(dn, detail::SaturateI<TN>(v));
#endif
  return svuzp1_s8(vn, vn);
}

template <size_t N, int kPow2>
HWY_API svint16_t DemoteTo(Simd<int16_t, N, kPow2> dn, const svint32_t v) {
#if HWY_TARGET == HWY_SVE2
  const svint16_t vn = BitCast(dn, svqxtnb_s32(v));
#else
  using TN = TFromD<decltype(dn)>;
  const svint16_t vn = BitCast(dn, detail::SaturateI<TN>(v));
#endif
  return svuzp1_s16(vn, vn);
}

template <size_t N, int kPow2>
HWY_API svint8_t DemoteTo(Simd<int8_t, N, kPow2> dn, const svint32_t v) {
  const RepartitionToWide<decltype(dn)> d2;
#if HWY_TARGET == HWY_SVE2
  const svint16_t cast16 = BitCast(d2, svqxtnb_s16(svqxtnb_s32(v)));
#else
  using TN = TFromD<decltype(dn)>;
  const svint16_t cast16 = BitCast(d2, detail::SaturateI<TN>(v));
#endif
  const svint8_t v2 = BitCast(dn, svuzp1_s16(cast16, cast16));
  return BitCast(dn, svuzp1_s8(v2, v2));
}

// ------------------------------ ConcatEven/ConcatOdd

// WARNING: the upper half of these needs fixing up (uzp1/uzp2 use the
// full vector length, not rounded down to a power of two as we require).
namespace detail {

#define HWY_SVE_CONCAT_EVERY_SECOND(BASE, CHAR, BITS, HALF, NAME, OP) \
  HWY_INLINE HWY_SVE_V(BASE, BITS) \
      NAME(HWY_SVE_V(BASE, BITS) hi, HWY_SVE_V(BASE, BITS) lo) { \
    return sv##OP##_##CHAR##BITS(lo, hi); \
  }
HWY_SVE_FOREACH(HWY_SVE_CONCAT_EVERY_SECOND, ConcatEven, uzp1)
HWY_SVE_FOREACH(HWY_SVE_CONCAT_EVERY_SECOND, ConcatOdd, uzp2)
#undef HWY_SVE_CONCAT_EVERY_SECOND

// Used to slide up / shift whole register left; mask indicates which range
// to take from lo, and the rest is filled from hi starting at its lowest.
#define HWY_SVE_SPLICE(BASE, CHAR, BITS, HALF, NAME, OP) \
  HWY_API HWY_SVE_V(BASE, BITS) NAME( \
      HWY_SVE_V(BASE, BITS) hi, HWY_SVE_V(BASE, BITS) lo, svbool_t mask) { \
    return sv##OP##_##CHAR##BITS(mask, lo, hi); \
  }
HWY_SVE_FOREACH(HWY_SVE_SPLICE, Splice, splice)
#undef HWY_SVE_SPLICE

}  // namespace detail

template <class D>
HWY_API VFromD<D> ConcatOdd(D d, const VFromD<D> hi, const VFromD<D> lo) {
#if 0  // if we could assume VL is a power of two
  return detail::ConcatOdd(hi, lo);
#else
  const VFromD<D> hi_odd = detail::ConcatOdd(hi, hi);
  const VFromD<D> lo_odd = detail::ConcatOdd(lo, lo);
  return detail::Splice(hi_odd, lo_odd, FirstN(d, Lanes(d) / 2));
#endif
}

template <class D>
HWY_API VFromD<D> ConcatEven(D d, const VFromD<D> hi, const VFromD<D> lo) {
#if 0  // if we could assume VL is a power of two
  return detail::ConcatEven(hi, lo);
#else
  const VFromD<D> hi_odd = detail::ConcatEven(hi, hi);
  const VFromD<D> lo_odd = detail::ConcatEven(lo, lo);
  return detail::Splice(hi_odd, lo_odd, FirstN(d, Lanes(d) / 2));
#endif
}

// ------------------------------ DemoteTo F

template <size_t N, int kPow2>
HWY_API svfloat16_t DemoteTo(Simd<float16_t, N, kPow2> d, const svfloat32_t v) {
  const svfloat16_t in_even = svcvt_f16_f32_x(detail::PTrue(d), v);
  return detail::ConcatEven(in_even, in_even);  // only low 1/2 of result valid
}

template <size_t N, int kPow2>
HWY_API svuint16_t DemoteTo(Simd<bfloat16_t, N, kPow2> /* d */, svfloat32_t v) {
  const svuint16_t in_even = BitCast(ScalableTag<uint16_t>(), v);
  return detail::ConcatOdd(in_even, in_even);  // can ignore upper half of vec
}

template <size_t N, int kPow2>
HWY_API svfloat32_t DemoteTo(Simd<float32_t, N, kPow2> d, const svfloat64_t v) {
  const svfloat32_t in_even = svcvt_f32_f64_x(detail::PTrue(d), v);
  return detail::ConcatEven(in_even, in_even);  // only low 1/2 of result valid
}

template <size_t N, int kPow2>
HWY_API svint32_t DemoteTo(Simd<int32_t, N, kPow2> d, const svfloat64_t v) {
  const svint32_t in_even = svcvt_s32_f64_x(detail::PTrue(d), v);
  return detail::ConcatEven(in_even, in_even);  // only low 1/2 of result valid
}

// ------------------------------ ConvertTo F

#define HWY_SVE_CONVERT(BASE, CHAR, BITS, HALF, NAME, OP) \
  template <size_t N, int kPow2> \
  HWY_API HWY_SVE_V(BASE, BITS) \
      NAME(HWY_SVE_D(BASE, BITS, N, kPow2) /* d */, HWY_SVE_V(int, BITS) v) { \
    return sv##OP##_##CHAR##BITS##_s##BITS##_x(HWY_SVE_PTRUE(BITS), v); \
  } \
  /* Truncates (rounds toward zero). */ \
  template <size_t N, int kPow2> \
  HWY_API HWY_SVE_V(int, BITS) \
      NAME(HWY_SVE_D(int, BITS, N, kPow2) /* d */, HWY_SVE_V(BASE, BITS) v) { \
    return sv##OP##_s##BITS##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), v); \
  }

// API only requires f32 but we provide f64 for use by Iota.
HWY_SVE_FOREACH_F(HWY_SVE_CONVERT, ConvertTo, cvt)
#undef HWY_SVE_CONVERT

// ------------------------------ NearestInt (Round, ConvertTo)

template <class VF, class DI = RebindToSigned<DFromV<VF>>>
HWY_API VFromD<DI> NearestInt(VF v) {
  // No single instruction, round then truncate.
  return ConvertTo(DI(), Round(v));
}

// ------------------------------ Iota (Add, ConvertTo)

#define HWY_SVE_IOTA(BASE, CHAR, BITS, HALF, NAME, OP) \
  template <size_t N, int kPow2> \
  HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_D(BASE, BITS, N, kPow2) /* d */, \
                                     HWY_SVE_T(BASE, BITS) first) { \
    return sv##OP##_##CHAR##BITS(first, 1); \
  }

HWY_SVE_FOREACH_UI(HWY_SVE_IOTA, Iota, index)
#undef HWY_SVE_IOTA

template <class D, HWY_IF_FLOAT_D(D)>
HWY_API VFromD<D> Iota(const D d, TFromD<D> first) {
  const RebindToSigned<D> di;
  return detail::AddN(ConvertTo(d, Iota(di, 0)), first);
}
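
// Illustration (not part of the original file): integer Iota maps to the
// INDEX instruction with step 1, so Iota(du32, 5) is {5, 6, 7, ...}; the
// float overload converts Iota(di, 0) and then adds `first` via
// detail::AddN, e.g. Iota(df, 0.5f) is {0.5f, 1.5f, 2.5f, ...}.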

// ================================================== COMBINE

namespace detail {

template <class D>
svbool_t MaskLowerHalf(D d) {
  return FirstN(d, Lanes(d) / 2);
}
template <class D>
svbool_t MaskUpperHalf(D d) {
  // For Splice to work as intended, make sure bits above Lanes(d) are zero.
  return AndNot(MaskLowerHalf(d), detail::MakeMask(d));
}

// Right-shift vector pair by constexpr; can be used to slide down (=N) or up
// (=Lanes()-N).
#define HWY_SVE_EXT(BASE, CHAR, BITS, HALF, NAME, OP) \
  template <size_t kIndex> \
  HWY_API HWY_SVE_V(BASE, BITS) \
      NAME(HWY_SVE_V(BASE, BITS) hi, HWY_SVE_V(BASE, BITS) lo) { \
    return sv##OP##_##CHAR##BITS(lo, hi, kIndex); \
  }
HWY_SVE_FOREACH(HWY_SVE_EXT, Ext, ext)
#undef HWY_SVE_EXT

}  // namespace detail

// ------------------------------ ConcatUpperLower
template <class D, class V>
HWY_API V ConcatUpperLower(const D d, const V hi, const V lo) {
  return IfThenElse(detail::MaskLowerHalf(d), lo, hi);
}

// ------------------------------ ConcatLowerLower
template <class D, class V>
HWY_API V ConcatLowerLower(const D d, const V hi, const V lo) {
  return detail::Splice(hi, lo, detail::MaskLowerHalf(d));
}

// ------------------------------ ConcatLowerUpper
template <class D, class V>
HWY_API V ConcatLowerUpper(const D d, const V hi, const V lo) {
  return detail::Splice(hi, lo, detail::MaskUpperHalf(d));
}

// ------------------------------ ConcatUpperUpper
template <class D, class V>
HWY_API V ConcatUpperUpper(const D d, const V hi, const V lo) {
  const svbool_t mask_upper = detail::MaskUpperHalf(d);
  const V lo_upper = detail::Splice(lo, lo, mask_upper);
  return IfThenElse(mask_upper, hi, lo_upper);
}

// ------------------------------ Combine
template <class D, class V2>
HWY_API VFromD<D> Combine(const D d, const V2 hi, const V2 lo) {
  return ConcatLowerLower(d, hi, lo);
}

// ------------------------------ ZeroExtendVector

template <class D, class V>
HWY_API V ZeroExtendVector(const D d, const V lo) {
  return Combine(d, Zero(Half<D>()), lo);
}

// ------------------------------ Lower/UpperHalf

template <class D2, class V>
HWY_API V LowerHalf(D2 /* tag */, const V v) {
  return v;
}

template <class V>
HWY_API V LowerHalf(const V v) {
  return v;
}

template <class D2, class V>
HWY_API V UpperHalf(const D2 /* d2 */, const V v) {
  return detail::Splice(v, v, detail::MaskUpperHalf(Twice<D2>()));
}

// ================================================== SWIZZLE

// ------------------------------ GetLane

#define HWY_SVE_GET_LANE(BASE, CHAR, BITS, HALF, NAME, OP) \
  HWY_API HWY_SVE_T(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) v) { \
    return sv##OP##_##CHAR##BITS(detail::PFalse(), v); \
  }

HWY_SVE_FOREACH(HWY_SVE_GET_LANE, GetLane, lasta)
#undef HWY_SVE_GET_LANE

// ------------------------------ DupEven

namespace detail {
HWY_SVE_FOREACH(HWY_SVE_RETV_ARGVV, InterleaveEven, trn1)
}  // namespace detail

template <class V>
HWY_API V DupEven(const V v) {
  return detail::InterleaveEven(v, v);
}

// ------------------------------ DupOdd

namespace detail {
HWY_SVE_FOREACH(HWY_SVE_RETV_ARGVV, InterleaveOdd, trn2)
}  // namespace detail

template <class V>
HWY_API V DupOdd(const V v) {
  return detail::InterleaveOdd(v, v);
}

// ------------------------------ OddEven

namespace detail {
HWY_SVE_FOREACH(HWY_SVE_RETV_ARGVN, Insert, insr_n)
}  // namespace detail

template <class V>
HWY_API V OddEven(const V odd, const V even) {
  const auto even_in_odd = detail::Insert(even, 0);
  return detail::InterleaveOdd(even_in_odd, odd);
}

// ------------------------------ OddEvenBlocks
template <class V>
HWY_API V OddEvenBlocks(const V odd, const V even) {
  const RebindToUnsigned<DFromV<V>> du;
  using TU = TFromD<decltype(du)>;
  constexpr size_t kShift = CeilLog2(16 / sizeof(TU));
  const auto idx_block = ShiftRight<kShift>(Iota(du, 0));
  const auto lsb = detail::AndN(idx_block, static_cast<TU>(1));
  const svbool_t is_even = detail::EqN(lsb, static_cast<TU>(0));
  return IfThenElse(is_even, even, odd);
}

// ------------------------------ TableLookupLanes

template <class D, class VI>
HWY_API VFromD<RebindToUnsigned<D>> IndicesFromVec(D d, VI vec) {
  using TI = TFromV<VI>;
  static_assert(sizeof(TFromD<D>) == sizeof(TI), "Index/lane size mismatch");
  const RebindToUnsigned<D> du;
  const auto indices = BitCast(du, vec);
#if HWY_IS_DEBUG_BUILD
  HWY_DASSERT(AllTrue(du, detail::LtN(indices, static_cast<TI>(Lanes(d)))));
#else
  (void)d;
#endif
  return indices;
}

template <class D, typename TI>
HWY_API VFromD<RebindToUnsigned<D>> SetTableIndices(D d, const TI* idx) {
  static_assert(sizeof(TFromD<D>) == sizeof(TI), "Index size must match lane");
  return IndicesFromVec(d, LoadU(Rebind<TI, D>(), idx));
}

// <32bit are not part of Highway API, but used in Broadcast.
#define HWY_SVE_TABLE(BASE, CHAR, BITS, HALF, NAME, OP) \
  HWY_API HWY_SVE_V(BASE, BITS) \
      NAME(HWY_SVE_V(BASE, BITS) v, HWY_SVE_V(uint, BITS) idx) { \
    return sv##OP##_##CHAR##BITS(v, idx); \
  }

HWY_SVE_FOREACH(HWY_SVE_TABLE, TableLookupLanes, tbl)
#undef HWY_SVE_TABLE
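
// Usage sketch (illustrative, not part of the original file; assumes the
// vector has at least four u32 lanes and a correspondingly sized idx array):
//   const ScalableTag<uint32_t> d;
//   const uint32_t idx_array[4] = {3, 0, 1, 2};
//   const auto v = TableLookupLanes(Iota(d, 0), SetTableIndices(d, idx_array));
//   // First four lanes of v: {3, 0, 1, 2}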

// ------------------------------ SwapAdjacentBlocks (TableLookupLanes)

namespace detail {

template <typename T, size_t N, int kPow2>
constexpr size_t LanesPerBlock(Simd<T, N, kPow2> /* tag */) {
  // We might have a capped vector smaller than a block, so honor that.
  return HWY_MIN(16 / sizeof(T), detail::ScaleByPower(N, kPow2));
}

}  // namespace detail

template <class V>
HWY_API V SwapAdjacentBlocks(const V v) {
  const DFromV<V> d;
  const RebindToUnsigned<decltype(d)> du;
  constexpr auto kLanesPerBlock =
      static_cast<TFromV<V>>(detail::LanesPerBlock(d));
  const VFromD<decltype(du)> idx = detail::XorN(Iota(du, 0), kLanesPerBlock);
  return TableLookupLanes(v, idx);
}

// ------------------------------ Reverse

#if 0  // if we could assume VL is a power of two
#error "Update macro"
#endif
#define HWY_SVE_REVERSE(BASE, CHAR, BITS, HALF, NAME, OP) \
  template <size_t N, int kPow2> \
  HWY_API HWY_SVE_V(BASE, BITS) \
      NAME(HWY_SVE_D(BASE, BITS, N, kPow2) d, HWY_SVE_V(BASE, BITS) v) { \
    const auto reversed = sv##OP##_##CHAR##BITS(v); \
    /* Shift right to remove extra (non-pow2 and remainder) lanes. */ \
    const size_t all_lanes = \
        detail::AllHardwareLanes(hwy::SizeTag<BITS / 8>()); \
    /* TODO(janwas): on SVE2, use whilege. */ \
    /* Avoids FirstN truncating to the return vector size. */ \
    const ScalableTag<HWY_SVE_T(BASE, BITS)> dfull; \
    const svbool_t mask = Not(FirstN(dfull, all_lanes - Lanes(d))); \
    return detail::Splice(reversed, reversed, mask); \
  }

HWY_SVE_FOREACH(HWY_SVE_REVERSE, Reverse, rev)
#undef HWY_SVE_REVERSE

// ------------------------------ Reverse2

template <class D, HWY_IF_LANE_SIZE_D(D, 2)>
HWY_API VFromD<D> Reverse2(D d, const VFromD<D> v) {
  const RebindToUnsigned<decltype(d)> du;
  const RepartitionToWide<decltype(du)> dw;
  return BitCast(d, svrevh_u32_x(detail::PTrue(d), BitCast(dw, v)));
}

template <class D, HWY_IF_LANE_SIZE_D(D, 4)>
HWY_API VFromD<D> Reverse2(D d, const VFromD<D> v) {
  const RebindToUnsigned<decltype(d)> du;
  const RepartitionToWide<decltype(du)> dw;
  return BitCast(d, svrevw_u64_x(detail::PTrue(d), BitCast(dw, v)));
}

template <class D, HWY_IF_LANE_SIZE_D(D, 8)>
HWY_API VFromD<D> Reverse2(D /* tag */, const VFromD<D> v) {  // 3210
  const auto even_in_odd = detail::Insert(v, 0);              // 210z
  return detail::InterleaveOdd(v, even_in_odd);               // 2301
}

// ------------------------------ Reverse4 (TableLookupLanes)

// TODO(janwas): is this approach faster than Shuffle0123?
template <class D>
HWY_API VFromD<D> Reverse4(D d, const VFromD<D> v) {
  const RebindToUnsigned<decltype(d)> du;
  const auto idx = detail::XorN(Iota(du, 0), 3);
  return TableLookupLanes(v, idx);
}

// ------------------------------ Reverse8 (TableLookupLanes)

template <class D>
HWY_API VFromD<D> Reverse8(D d, const VFromD<D> v) {
  const RebindToUnsigned<decltype(d)> du;
  const auto idx = detail::XorN(Iota(du, 0), 7);
  return TableLookupLanes(v, idx);
}

// ------------------------------ Compress (PromoteTo)

template <typename T>
struct CompressIsPartition {
  enum { value = 0 };
};

#define HWY_SVE_COMPRESS(BASE, CHAR, BITS, HALF, NAME, OP) \
  HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) v, svbool_t mask) { \
    return sv##OP##_##CHAR##BITS(mask, v); \
  }

HWY_SVE_FOREACH_UIF3264(HWY_SVE_COMPRESS, Compress, compact)
#undef HWY_SVE_COMPRESS

template <class V, HWY_IF_LANE_SIZE_V(V, 2)>
HWY_API V Compress(V v, svbool_t mask16) {
  static_assert(!IsSame<V, svfloat16_t>(), "Must use overload");
  const DFromV<V> d16;

  // Promote vector and mask to 32-bit
  const RepartitionToWide<decltype(d16)> dw;
  const auto v32L = PromoteTo(dw, v);
  const auto v32H = detail::PromoteUpperTo(dw, v);
  const svbool_t mask32L = svunpklo_b(mask16);
  const svbool_t mask32H = svunpkhi_b(mask16);

  const auto compressedL = Compress(v32L, mask32L);
  const auto compressedH = Compress(v32H, mask32H);

  // Demote to 16-bit (already in range) - separately so we can splice
  const V evenL = BitCast(d16, compressedL);
  const V evenH = BitCast(d16, compressedH);
  const V v16L = detail::ConcatEven(evenL, evenL);  // only lower half needed
  const V v16H = detail::ConcatEven(evenH, evenH);

  // We need to combine two vectors of non-constexpr length, so the only option
  // is Splice, which requires us to synthesize a mask. NOTE: this function uses
  // full vectors (SV_ALL instead of SV_POW2), hence we need unmasked svcnt.
  const size_t countL = detail::CountTrueFull(dw, mask32L);
  const auto compressed_maskL = FirstN(d16, countL);
  return detail::Splice(v16H, v16L, compressed_maskL);
}

// Must treat float16_t as integers so we can ConcatEven.
HWY_API svfloat16_t Compress(svfloat16_t v, svbool_t mask16) {
  const DFromV<decltype(v)> df;
  const RebindToSigned<decltype(df)> di;
  return BitCast(df, Compress(BitCast(di, v), mask16));
}

// ------------------------------ CompressStore

template <class V, class M, class D>
HWY_API size_t CompressStore(const V v, const M mask, const D d,
                             TFromD<D>* HWY_RESTRICT unaligned) {
  StoreU(Compress(v, mask), d, unaligned);
  return CountTrue(d, mask);
}

// ------------------------------ CompressBlendedStore

template <class V, class M, class D>
HWY_API size_t CompressBlendedStore(const V v, const M mask, const D d,
                                    TFromD<D>* HWY_RESTRICT unaligned) {
  const size_t count = CountTrue(d, mask);
  const svbool_t store_mask = FirstN(d, count);
  BlendedStore(Compress(v, mask), store_mask, d, unaligned);
  return count;
}
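
// Usage sketch (illustrative, not part of the original file):
// stream-compact the positive lanes of one vector into an output buffer.
//   const ScalableTag<float> d;
//   const auto v = LoadU(d, in);
//   const size_t n = CompressBlendedStore(v, Gt(v, Zero(d)), d, out);
//   out += n;  // n = number of lanes written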
1654 
1655 // ================================================== BLOCKWISE
1656 
1657 // ------------------------------ CombineShiftRightBytes
1658 
1659 namespace detail {
1660 
1661 // For x86-compatible behaviour mandated by Highway API: TableLookupBytes
1662 // offsets are implicitly relative to the start of their 128-bit block.
1663 template <class D, class V>
1664 HWY_INLINE V OffsetsOf128BitBlocks(const D d, const V iota0) {
1665  using T = MakeUnsigned<TFromD<D>>;
1666  return detail::AndNotN(static_cast<T>(LanesPerBlock(d) - 1), iota0);
1667 }
1668 
1669 template <size_t kLanes, class D>
1670 svbool_t FirstNPerBlock(D d) {
1671  const RebindToSigned<decltype(d)> di;
1672  constexpr size_t kLanesPerBlock = detail::LanesPerBlock(di);
1673  const auto idx_mod = detail::AndN(Iota(di, 0), kLanesPerBlock - 1);
1674  return detail::LtN(BitCast(di, idx_mod), kLanes);
1675 }
1676 
1677 } // namespace detail
1678 
1679 template <size_t kBytes, class D, class V = VFromD<D>>
1680 HWY_API V CombineShiftRightBytes(const D d, const V hi, const V lo) {
1681  const Repartition<uint8_t, decltype(d)> d8;
1682  const auto hi8 = BitCast(d8, hi);
1683  const auto lo8 = BitCast(d8, lo);
1684  const auto hi_up = detail::Splice(hi8, hi8, FirstN(d8, 16 - kBytes));
1685  const auto lo_down = detail::Ext<kBytes>(lo8, lo8);
1686  const svbool_t is_lo = detail::FirstNPerBlock<16 - kBytes>(d8);
1687  return BitCast(d, IfThenElse(is_lo, lo_down, hi_up));
1688 }
1689 
1690 // ------------------------------ Shuffle2301
1691 
1692 template <class V>
1693 HWY_API V Shuffle2301(const V v) {
1694  const DFromV<V> d;
1695  static_assert(sizeof(TFromD<decltype(d)>) == 4, "Defined for 32-bit types");
1696  return Reverse2(d, v);
1697 }
1698 
1699 // ------------------------------ Shuffle2103
1700 template <class V>
1701 HWY_API V Shuffle2103(const V v) {
1702  const DFromV<V> d;
1703  const Repartition<uint8_t, decltype(d)> d8;
1704  static_assert(sizeof(TFromD<decltype(d)>) == 4, "Defined for 32-bit types");
1705  const svuint8_t v8 = BitCast(d8, v);
1706  return BitCast(d, CombineShiftRightBytes<12>(d8, v8, v8));
1707 }
1708 
1709 // ------------------------------ Shuffle0321
1710 template <class V>
1711 HWY_API V Shuffle0321(const V v) {
1712  const DFromV<V> d;
1713  const Repartition<uint8_t, decltype(d)> d8;
1714  static_assert(sizeof(TFromD<decltype(d)>) == 4, "Defined for 32-bit types");
1715  const svuint8_t v8 = BitCast(d8, v);
1716  return BitCast(d, CombineShiftRightBytes<4>(d8, v8, v8));
1717 }
1718 
1719 // ------------------------------ Shuffle1032
1720 template <class V>
1721 HWY_API V Shuffle1032(const V v) {
1722  const DFromV<V> d;
1723  const Repartition<uint8_t, decltype(d)> d8;
1724  static_assert(sizeof(TFromD<decltype(d)>) == 4, "Defined for 32-bit types");
1725  const svuint8_t v8 = BitCast(d8, v);
1726  return BitCast(d, CombineShiftRightBytes<8>(d8, v8, v8));
1727 }
1728 
1729 // ------------------------------ Shuffle01
1730 template <class V>
1731 HWY_API V Shuffle01(const V v) {
1732  const DFromV<V> d;
1733  const Repartition<uint8_t, decltype(d)> d8;
1734  static_assert(sizeof(TFromD<decltype(d)>) == 8, "Defined for 64-bit types");
1735  const svuint8_t v8 = BitCast(d8, v);
1736  return BitCast(d, CombineShiftRightBytes<8>(d8, v8, v8));
1737 }
1738 
1739 // ------------------------------ Shuffle0123
1740 template <class V>
1741 HWY_API V Shuffle0123(const V v) {
1742  return Shuffle2301(Shuffle1032(v));
1743 }
1744 
1745 // ------------------------------ ReverseBlocks (Reverse, Shuffle01)
1746 template <class D, class V = VFromD<D>>
1747 HWY_API V ReverseBlocks(D d, V v) {
1748  const Repartition<uint64_t, D> du64;
1749  return BitCast(d, Shuffle01(Reverse(du64, BitCast(du64, v))));
1750 }
1751 
1752 // ------------------------------ TableLookupBytes
1753 
1754 template <class V, class VI>
1755 HWY_API VI TableLookupBytes(const V v, const VI idx) {
1756  const DFromV<VI> d;
1757  const Repartition<uint8_t, decltype(d)> du8;
1758  const auto offsets128 = detail::OffsetsOf128BitBlocks(du8, Iota(du8, 0));
1759  const auto idx8 = Add(BitCast(du8, idx), offsets128);
1760  return BitCast(d, TableLookupLanes(BitCast(du8, v), idx8));
1761 }
1762 
1763 template <class V, class VI>
1764 HWY_API VI TableLookupBytesOr0(const V v, const VI idx) {
1765  const DFromV<VI> d;
1766  // Mask size must match vector type, so cast everything to this type.
1767  const Repartition<int8_t, decltype(d)> di8;
1768 
1769  auto idx8 = BitCast(di8, idx);
1770  const auto msb = detail::LtN(idx8, 0);
1771 
1772  const auto lookup = TableLookupBytes(BitCast(di8, v), idx8);
1773  return BitCast(d, IfThenZeroElse(msb, lookup));
1774 }
1775 
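Sketch, same assumptions as above: indices select bytes within each lane's own 128-bit block, and TableLookupBytesOr0 additionally zeroes any byte whose index has the sign bit set, matching x86 pshufb.

// Reverses the 16 bytes of every 128-bit block.
svuint8_t ReverseBytesInBlocks(svuint8_t v) {
  const hn::ScalableTag<uint8_t> d8;
  const svuint8_t mod16 = hn::And(hn::Iota(d8, 0), hn::Set(d8, 15));
  const svuint8_t idx = hn::Sub(hn::Set(d8, 15), mod16);  // 15, 14, ..., 0
  return hn::TableLookupBytes(v, idx);
}
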
1776 // ------------------------------ Broadcast
1777 
1778 template <int kLane, class V>
1779 HWY_API V Broadcast(const V v) {
1780  const DFromV<V> d;
1781  const RebindToUnsigned<decltype(d)> du;
1782  constexpr size_t kLanesPerBlock = detail::LanesPerBlock(du);
1783  static_assert(0 <= kLane && kLane < kLanesPerBlock, "Invalid lane");
1784  auto idx = detail::OffsetsOf128BitBlocks(du, Iota(du, 0));
1785  if (kLane != 0) {
1786  idx = detail::AddN(idx, kLane);
1787  }
1788  return TableLookupLanes(v, idx);
1789 }
1790 
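For example (same assumptions), broadcasting lane 1 of each 128-bit block:

// u32 lanes {0,1,2,3, 4,5,6,7} -> {1,1,1,1, 5,5,5,5}.
svuint32_t BroadcastLane1(svuint32_t v) { return hn::Broadcast<1>(v); }
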
1791 // ------------------------------ ShiftLeftLanes
1792 
1793 template <size_t kLanes, class D, class V = VFromD<D>>
1794 HWY_API V ShiftLeftLanes(D d, const V v) {
1795  const auto zero = Zero(d);
1796  const auto shifted = detail::Splice(v, zero, FirstN(d, kLanes));
1797  // Match x86 semantics by zeroing lower lanes in 128-bit blocks
1798  return IfThenElse(detail::FirstNPerBlock<kLanes>(d), zero, shifted);
1799 }
1800 
1801 template <size_t kLanes, class V>
1802 HWY_API V ShiftLeftLanes(const V v) {
1803  return ShiftLeftLanes<kLanes>(DFromV<V>(), v);
1804 }
1805 
1806 // ------------------------------ ShiftRightLanes
1807 template <size_t kLanes, class D, class V = VFromD<D>>
1808 HWY_API V ShiftRightLanes(D d, V v) {
1809  // For capped/fractional vectors, clear upper lanes so we shift in zeros.
1810  if (!detail::IsFull(d)) {
1811  v = IfThenElseZero(detail::MakeMask(d), v);
1812  }
1813 
1814  const auto shifted = detail::Ext<kLanes>(v, v);
1815  // Match x86 semantics by zeroing upper lanes in 128-bit blocks
1816  constexpr size_t kLanesPerBlock = detail::LanesPerBlock(d);
1817  const svbool_t mask = detail::FirstNPerBlock<kLanesPerBlock - kLanes>(d);
1818  return IfThenElseZero(mask, shifted);
1819 }
1820 
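Sketch, same assumptions: both directions insert zeros at the 128-bit block boundary instead of pulling lanes across from a neighboring block.

// Per block of u32: {a,b,c,d} -> {0,a,b,c} -> {a,b,c,0}; the net effect is
// to zero the last lane of each block.
svuint32_t ZeroTopLaneOfBlocks(svuint32_t v) {
  const hn::ScalableTag<uint32_t> d;
  return hn::ShiftRightLanes<1>(d, hn::ShiftLeftLanes<1>(d, v));
}
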
1821 // ------------------------------ ShiftLeftBytes
1822 
1823 template <int kBytes, class D, class V = VFromD<D>>
1824 HWY_API V ShiftLeftBytes(const D d, const V v) {
1825  const Repartition<uint8_t, decltype(d)> d8;
1826  return BitCast(d, ShiftLeftLanes<kBytes>(BitCast(d8, v)));
1827 }
1828 
1829 template <int kBytes, class V>
1830 HWY_API V ShiftLeftBytes(const V v) {
1831  return ShiftLeftBytes<kBytes>(DFromV<V>(), v);
1832 }
1833 
1834 // ------------------------------ ShiftRightBytes
1835 template <int kBytes, class D, class V = VFromD<D>>
1836 HWY_API V ShiftRightBytes(const D d, const V v) {
1837  const Repartition<uint8_t, decltype(d)> d8;
1838  return BitCast(d, ShiftRightLanes<kBytes>(d8, BitCast(d8, v)));
1839 }
1840 
1841 // ------------------------------ InterleaveLower
1842 
1843 template <class D, class V>
1844 HWY_API V InterleaveLower(D d, const V a, const V b) {
1845  static_assert(IsSame<TFromD<D>, TFromV<V>>(), "D/V mismatch");
1846  // Move lower halves of blocks to lower half of vector.
1847  const Repartition<uint64_t, decltype(d)> d64;
1848  const auto a64 = BitCast(d64, a);
1849  const auto b64 = BitCast(d64, b);
1850  const auto a_blocks = detail::ConcatEven(a64, a64); // only lower half needed
1851  const auto b_blocks = detail::ConcatEven(b64, b64);
1852 
1853  return detail::ZipLower(BitCast(d, a_blocks), BitCast(d, b_blocks));
1854 }
1855 
1856 template <class V>
1857 HWY_API V InterleaveLower(const V a, const V b) {
1858  return InterleaveLower(DFromV<V>(), a, b);
1859 }
1860 
1861 // ------------------------------ InterleaveUpper
1862 
1863 // Full vector: guaranteed to have at least one block
1864 template <class D, class V = VFromD<D>,
1865  hwy::EnableIf<detail::IsFull(D())>* = nullptr>
1866 HWY_API V InterleaveUpper(D d, const V a, const V b) {
1867  // Move upper halves of blocks to lower half of vector.
1868  const Repartition<uint64_t, decltype(d)> d64;
1869  const auto a64 = BitCast(d64, a);
1870  const auto b64 = BitCast(d64, b);
1871  const auto a_blocks = detail::ConcatOdd(a64, a64); // only lower half needed
1872  const auto b_blocks = detail::ConcatOdd(b64, b64);
1873  return detail::ZipLower(BitCast(d, a_blocks), BitCast(d, b_blocks));
1874 }
1875 
1876 // Capped/fractional: need runtime check
1877 template <class D, class V = VFromD<D>,
1878  hwy::EnableIf<!detail::IsFull(D())>* = nullptr>
1879 HWY_API V InterleaveUpper(D d, const V a, const V b) {
1880  // Less than one block: treat as capped
1881  if (Lanes(d) * sizeof(TFromD<D>) < 16) {
1882  const Half<decltype(d)> d2;
1883  return InterleaveLower(d, UpperHalf(d2, a), UpperHalf(d2, b));
1884  }
1885  return InterleaveUpper(DFromV<V>(), a, b);
1886 }
1887 
1888 // ------------------------------ ZipLower
1889 
1890 template <class V, class DW = RepartitionToWide<DFromV<V>>>
1891 HWY_API VFromD<DW> ZipLower(DW dw, V a, V b) {
1892  const RepartitionToNarrow<DW> dn;
1893  static_assert(IsSame<TFromD<decltype(dn)>, TFromV<V>>(), "D/V mismatch");
1894  return BitCast(dw, InterleaveLower(dn, a, b));
1895 }
1896 template <class V, class D = DFromV<V>, class DW = RepartitionToWide<D>>
1897 HWY_API VFromD<DW> ZipLower(const V a, const V b) {
1898  return BitCast(DW(), InterleaveLower(D(), a, b));
1899 }
1900 
1901 // ------------------------------ ZipUpper
1902 template <class V, class DW = RepartitionToWide<DFromV<V>>>
1903 HWY_API VFromD<DW> ZipUpper(DW dw, V a, V b) {
1904  const RepartitionToNarrow<DW> dn;
1905  static_assert(IsSame<TFromD<decltype(dn)>, TFromV<V>>(), "D/V mismatch");
1906  return BitCast(dw, InterleaveUpper(dn, a, b));
1907 }
1908 
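One common use, same assumptions: widening by zipping with zero. On this little-endian target the first (even-lane) input supplies the low half of each widened lane, so pairing v with zero zero-extends it.

// Zero-extends the lower-half u16 lanes of each block to u32.
svuint32_t ZeroExtendLowerU16(svuint16_t v) {
  const hn::ScalableTag<uint16_t> d16;
  const hn::RepartitionToWide<decltype(d16)> d32;
  return hn::ZipLower(d32, v, hn::Zero(d16));  // v -> low half, 0 -> high half
}
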
1909 // ================================================== REDUCE
1910 
1911 #define HWY_SVE_REDUCE(BASE, CHAR, BITS, HALF, NAME, OP) \
1912  template <size_t N, int kPow2> \
1913  HWY_API HWY_SVE_V(BASE, BITS) \
1914  NAME(HWY_SVE_D(BASE, BITS, N, kPow2) d, HWY_SVE_V(BASE, BITS) v) { \
1915  return Set(d, static_cast<HWY_SVE_T(BASE, BITS)>( \
1916  sv##OP##_##CHAR##BITS(detail::MakeMask(d), v))); \
1917  }
1918 
1919 HWY_SVE_FOREACH(HWY_SVE_REDUCE, SumOfLanes, addv)
1920 HWY_SVE_FOREACH_UI(HWY_SVE_REDUCE, MinOfLanes, minv)
1921 HWY_SVE_FOREACH_UI(HWY_SVE_REDUCE, MaxOfLanes, maxv)
1922 // NaN if all are
1923 HWY_SVE_FOREACH_F(HWY_SVE_REDUCE, MinOfLanes, minnmv)
1924 HWY_SVE_FOREACH_F(HWY_SVE_REDUCE, MaxOfLanes, maxnmv)
1925 
1926 #undef HWY_SVE_REDUCE
1927 
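The generated reductions broadcast their result to every lane; extracting a scalar is a GetLane away (sketch, same assumptions):

// Sums one vector's worth of floats; p must hold at least Lanes(d) values.
float SumChunk(const float* HWY_RESTRICT p) {
  const hn::ScalableTag<float> d;
  return hn::GetLane(hn::SumOfLanes(d, hn::LoadU(d, p)));
}
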
1928 // ================================================== Ops with dependencies
1929 
1930 // ------------------------------ PromoteTo bfloat16 (ZipLower)
1931 
1932 template <size_t N, int kPow2>
1933 HWY_API svfloat32_t PromoteTo(Simd<float, N, kPow2> df32,
1934  const svuint16_t v) {
1935  return BitCast(df32, detail::ZipLower(svdup_n_u16(0), v));
1936 }
1937 
1938 // ------------------------------ ReorderDemote2To (OddEven)
1939 
1940 template <size_t N, int kPow2>
1941 HWY_API svuint16_t ReorderDemote2To(Simd<bfloat16_t, N, kPow2> dbf16,
1942  svfloat32_t a, svfloat32_t b) {
1943  const RebindToUnsigned<decltype(dbf16)> du16;
1944  const Repartition<uint32_t, decltype(dbf16)> du32;
1945  const svuint32_t b_in_even = ShiftRight<16>(BitCast(du32, b));
1946  return BitCast(dbf16, OddEven(BitCast(du16, a), BitCast(du16, b_in_even)));
1947 }
1948 
1949 // ------------------------------ ZeroIfNegative (Lt, IfThenElse)
1950 template <class V>
1951 HWY_API V ZeroIfNegative(const V v) {
1952  return IfThenZeroElse(detail::LtN(v, 0), v);
1953 }
1954 
1955 // ------------------------------ BroadcastSignBit (ShiftRight)
1956 template <class V>
1957 HWY_API V BroadcastSignBit(const V v) {
1958  return ShiftRight<sizeof(TFromV<V>) * 8 - 1>(v);
1959 }
1960 
1961 // ------------------------------ IfNegativeThenElse (BroadcastSignBit)
1962 template <class V>
1963 HWY_API V IfNegativeThenElse(V v, V yes, V no) {
1964  static_assert(IsSigned<TFromV<V>>(), "Only works for signed/float");
1965  const DFromV<V> d;
1966  const RebindToSigned<decltype(d)> di;
1967 
1968  const svbool_t m = MaskFromVec(BitCast(d, BroadcastSignBit(BitCast(di, v))));
1969  return IfThenElse(m, yes, no);
1970 }
1971 
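Sketch, same assumptions: selecting on the sign of v without a separate comparison.

// Absolute value via the sign select (Abs also exists; this only
// illustrates the op).
svint32_t AbsViaSelect(svint32_t v) {
  return hn::IfNegativeThenElse(v, hn::Neg(v), v);
}
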
1972 // ------------------------------ AverageRound (ShiftRight)
1973 
1974 #if HWY_TARGET == HWY_SVE2
1975 HWY_SVE_FOREACH_U08(HWY_SVE_RETV_ARGPVV, AverageRound, rhadd)
1976 HWY_SVE_FOREACH_U16(HWY_SVE_RETV_ARGPVV, AverageRound, rhadd)
1977 #else
1978 template <class V>
1979 V AverageRound(const V a, const V b) {
1980  return ShiftRight<1>(detail::AddN(Add(a, b), 1));
1981 }
1982 #endif // HWY_TARGET == HWY_SVE2
1983 
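Both paths compute the average rounded half upward; the fallback makes the formula explicit. A scalar model of one unsigned lane (sketch; the vector op applies this lane-wise, and SVE2's rhadd produces it directly):

uint8_t AverageRoundScalar(uint8_t a, uint8_t b) {
  return static_cast<uint8_t>((a + b + 1) >> 1);  // e.g. (5 + 6 + 1) >> 1 = 6
}
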
1984 // ------------------------------ LoadMaskBits (TestBit)
1985 
1986 // `p` points to at least 8 readable bytes, not all of which need be valid.
1987 template <class D, HWY_IF_LANE_SIZE_D(D, 1)>
1988 HWY_INLINE svbool_t LoadMaskBits(D d, const uint8_t* HWY_RESTRICT bits) {
1989  const RebindToUnsigned<D> du;
1990  const svuint8_t iota = Iota(du, 0);
1991 
1992  // Load correct number of bytes (bits/8) with 7 zeros after each.
1993  const svuint8_t bytes = BitCast(du, svld1ub_u64(detail::PTrue(d), bits));
1994  // Replicate bytes 8x such that each byte contains the bit that governs it.
1995  const svuint8_t rep8 = svtbl_u8(bytes, detail::AndNotN(7, iota));
1996 
1997  // 1, 2, 4, 8, 16, 32, 64, 128, 1, 2 ..
1998  const svuint8_t bit = Shl(Set(du, 1), detail::AndN(iota, 7));
1999 
2000  return TestBit(rep8, bit);
2001 }
2002 
2003 template <class D, HWY_IF_LANE_SIZE_D(D, 2)>
2004 HWY_INLINE svbool_t LoadMaskBits(D /* tag */,
2005  const uint8_t* HWY_RESTRICT bits) {
2006  const RebindToUnsigned<D> du;
2007  const Repartition<uint8_t, D> du8;
2008 
2009  // There may be up to 128 bits; avoid reading past the end.
2010  const svuint8_t bytes = svld1(FirstN(du8, (Lanes(du) + 7) / 8), bits);
2011 
2012  // Replicate bytes 16x such that each lane contains the bit that governs it.
2013  const svuint8_t rep16 = svtbl_u8(bytes, ShiftRight<4>(Iota(du8, 0)));
2014 
2015  // 1, 2, 4, 8, 16, 32, 64, 128, 1, 2 ..
2016  const svuint16_t bit = Shl(Set(du, 1), detail::AndN(Iota(du, 0), 7));
2017 
2018  return TestBit(BitCast(du, rep16), bit);
2019 }
2020 
2021 template <class D, HWY_IF_LANE_SIZE_D(D, 4)>
2022 HWY_INLINE svbool_t LoadMaskBits(D /* tag */,
2023  const uint8_t* HWY_RESTRICT bits) {
2024  const RebindToUnsigned<D> du;
2025  const Repartition<uint8_t, D> du8;
2026 
2027  // Upper bound: 2048 bits / 32-bit lanes = 64 mask bits = 8 bytes. At
2028  // least 8 bytes are readable, so we can skip computing (Lanes(du)+7)/8.
2029  const svuint8_t bytes = svld1(FirstN(du8, 8), bits);
2030 
2031  // Replicate bytes 32x such that each lane contains the bit that governs it.
2032  const svuint8_t rep32 = svtbl_u8(bytes, ShiftRight<5>(Iota(du8, 0)));
2033 
2034  // 1, 2, 4, 8, 16, 32, 64, 128, 1, 2 ..
2035  const svuint32_t bit = Shl(Set(du, 1), detail::AndN(Iota(du, 0), 7));
2036 
2037  return TestBit(BitCast(du, rep32), bit);
2038 }
2039 
2040 template <class D, HWY_IF_LANE_SIZE_D(D, 8)>
2041 HWY_INLINE svbool_t LoadMaskBits(D /* tag */,
2042  const uint8_t* HWY_RESTRICT bits) {
2043  const RebindToUnsigned<D> du;
2044 
2045  // Max 2048 bits = 32 lanes = 32 input bits; replicate those into each lane.
2046  // The "at least 8 bytes" guarantee in quick_reference ensures this is safe.
2047  uint32_t mask_bits;
2048  CopyBytes<4>(bits, &mask_bits);
2049  const auto vbits = Set(du, mask_bits);
2050 
2051  // 2 ^ {0,1, .., 31}, will not have more lanes than that.
2052  const svuint64_t bit = Shl(Set(du, 1), Iota(du, 0));
2053 
2054  return TestBit(vbits, bit);
2055 }
2056 
2057 // ------------------------------ StoreMaskBits
2058 
2059 namespace detail {
2060 
2061 // For each mask lane (governing lane type T), store 1 or 0 in BYTE lanes.
2062 template <class T, HWY_IF_LANE_SIZE(T, 1)>
2063 HWY_INLINE svuint8_t BoolFromMask(svbool_t m) {
2064  return svdup_n_u8_z(m, 1);
2065 }
2066 template <class T, HWY_IF_LANE_SIZE(T, 2)>
2067 HWY_INLINE svuint8_t BoolFromMask(svbool_t m) {
2068  const ScalableTag<uint8_t> d8;
2069  const svuint8_t b16 = BitCast(d8, svdup_n_u16_z(m, 1));
2070  return detail::ConcatEven(b16, b16); // only lower half needed
2071 }
2072 template <class T, HWY_IF_LANE_SIZE(T, 4)>
2073 HWY_INLINE svuint8_t BoolFromMask(svbool_t m) {
2074  return U8FromU32(svdup_n_u32_z(m, 1));
2075 }
2076 template <class T, HWY_IF_LANE_SIZE(T, 8)>
2077 HWY_INLINE svuint8_t BoolFromMask(svbool_t m) {
2078  const ScalableTag<uint32_t> d32;
2079  const svuint32_t b64 = BitCast(d32, svdup_n_u64_z(m, 1));
2080  return U8FromU32(detail::ConcatEven(b64, b64)); // only lower half needed
2081 }
2082 
2083 // Compacts groups of 8 u8 into 8 contiguous bits in a 64-bit lane.
2084 HWY_INLINE svuint64_t BitsFromBool(svuint8_t x) {
2085  const ScalableTag<uint8_t> d8;
2086  const ScalableTag<uint16_t> d16;
2087  const ScalableTag<uint32_t> d32;
2088  const ScalableTag<uint64_t> d64;
2089  // TODO(janwas): could use SVE2 BDEP, but it's optional.
2090  x = Or(x, BitCast(d8, ShiftRight<7>(BitCast(d16, x))));
2091  x = Or(x, BitCast(d8, ShiftRight<14>(BitCast(d32, x))));
2092  x = Or(x, BitCast(d8, ShiftRight<28>(BitCast(d64, x))));
2093  return BitCast(d64, x);
2094 }
2095 
2096 } // namespace detail
2097 
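A scalar model of BitsFromBool's shift-or compaction (sketch; the vector version shifts within 16-, 32- and 64-bit lanes, which is where the shift counts 7, 14 and 28 come from):

// One u64 holds eight 0/1 bytes b0..b7 (b0 in the low byte); returns them
// packed into bits 0..7.
uint64_t BitsFromBoolScalar(uint64_t x) {
  x |= x >> 7;   // each byte pair -> 2 adjacent bits
  x |= x >> 14;  // 2+2 -> 4 adjacent bits
  x |= x >> 28;  // 4+4 -> 8 adjacent bits in the low byte
  return x & 0xFF;  // stray shifted copies land only at bit 8 or above
}
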
2098 // `p` points to at least 8 writable bytes.
2099 template <class D>
2100 HWY_API size_t StoreMaskBits(D d, svbool_t m, uint8_t* bits) {
2101  svuint64_t bits_in_u64 =
2102  detail::BitsFromBool(detail::BoolFromMask<TFromD<D>>(m));
2103 
2104  const size_t num_bits = Lanes(d);
2105  const size_t num_bytes = (num_bits + 8 - 1) / 8; // Round up, see below
2106 
2107  // Truncate each u64 to 8 bits and store to u8.
2108  svst1b_u64(FirstN(ScalableTag<uint64_t>(), num_bytes), bits, bits_in_u64);
2109 
2110  // Non-full byte, need to clear the undefined upper bits. Can happen for
2111  // capped/fractional vectors or large T and small hardware vectors.
2112  if (num_bits < 8) {
2113  const int mask = (1 << num_bits) - 1;
2114  bits[0] = static_cast<uint8_t>(bits[0] & mask);
2115  }
2116  // Else: we wrote full bytes because num_bits is a power of two >= 8.
2117 
2118  return num_bytes;
2119 }
2120 
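A round-trip sketch tying LoadMaskBits and StoreMaskBits together (same assumptions; an 8-byte buffer satisfies both functions' contracts):

bool MaskBitsRoundTrip() {
  const hn::ScalableTag<uint32_t> d;
  const size_t want = HWY_MIN(size_t{3}, hn::Lanes(d));
  const svbool_t m = hn::FirstN(d, want);
  uint8_t bits[8] = {0};
  (void)hn::StoreMaskBits(d, m, bits);
  return hn::CountTrue(d, hn::LoadMaskBits(d, bits)) == want;
}
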
2121 // ------------------------------ CompressBits, CompressBitsStore (LoadMaskBits)
2122 
2123 template <class V>
2124 HWY_INLINE V CompressBits(V v, const uint8_t* HWY_RESTRICT bits) {
2125  return Compress(v, LoadMaskBits(DFromV<V>(), bits));
2126 }
2127 
2128 template <class D>
2129 HWY_API size_t CompressBitsStore(VFromD<D> v, const uint8_t* HWY_RESTRICT bits,
2130  D d, TFromD<D>* HWY_RESTRICT unaligned) {
2131  return CompressStore(v, LoadMaskBits(d, bits), d, unaligned);
2132 }
2133 
2134 // ------------------------------ MulEven (InterleaveEven)
2135 
2136 #if HWY_TARGET == HWY_SVE2
2137 namespace detail {
2138 #define HWY_SVE_MUL_EVEN(BASE, CHAR, BITS, HALF, NAME, OP) \
2139  HWY_API HWY_SVE_V(BASE, BITS) \
2140  NAME(HWY_SVE_V(BASE, HALF) a, HWY_SVE_V(BASE, HALF) b) { \
2141  return sv##OP##_##CHAR##BITS(a, b); \
2142  }
2143 
2144 HWY_SVE_FOREACH_UI64(HWY_SVE_MUL_EVEN, MulEven, mullb)
2145 #undef HWY_SVE_MUL_EVEN
2146 } // namespace detail
2147 #endif
2148 
2149 template <class V, class DW = RepartitionToWide<DFromV<V>>>
2150 HWY_API VFromD<DW> MulEven(const V a, const V b) {
2151 #if HWY_TARGET == HWY_SVE2
2152  return BitCast(DW(), detail::MulEven(a, b));
2153 #else
2154  const auto lo = Mul(a, b);
2155  const auto hi = detail::MulHigh(a, b);
2156  return BitCast(DW(), detail::InterleaveEven(lo, hi));
2157 #endif
2158 }
2159 
2160 HWY_API svuint64_t MulEven(const svuint64_t a, const svuint64_t b) {
2161  const auto lo = Mul(a, b);
2162  const auto hi = detail::MulHigh(a, b);
2163  return detail::InterleaveEven(lo, hi);
2164 }
2165 
2166 HWY_API svuint64_t MulOdd(const svuint64_t a, const svuint64_t b) {
2167  const auto lo = Mul(a, b);
2168  const auto hi = detail::MulHigh(a, b);
2169  return detail::InterleaveOdd(lo, hi);
2170 }
2171 
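Sketch, same assumptions: lane i of the result is the widened product of input lanes 2*i (MulEven) or 2*i + 1 (MulOdd).

// 32x32 -> 64-bit products of the even-indexed lanes.
svuint64_t WideSquaresOfEvenLanes(svuint32_t v) { return hn::MulEven(v, v); }
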
2172 // ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)
2173 
2174 template <size_t N, int kPow2>
2175 HWY_API svfloat32_t ReorderWidenMulAccumulate(Simd<float, N, kPow2> df32,
2176  svuint16_t a, svuint16_t b,
2177  const svfloat32_t sum0,
2178  svfloat32_t& sum1) {
2179  // TODO(janwas): svbfmlalb_f32 if __ARM_FEATURE_SVE_BF16.
2180  const Repartition<uint16_t, decltype(df32)> du16;
2181  const RebindToUnsigned<decltype(df32)> du32;
2182  const svuint16_t zero = Zero(du16);
2183  const svuint32_t a0 = ZipLower(du32, zero, BitCast(du16, a));
2184  const svuint32_t a1 = ZipUpper(du32, zero, BitCast(du16, a));
2185  const svuint32_t b0 = ZipLower(du32, zero, BitCast(du16, b));
2186  const svuint32_t b1 = ZipUpper(du32, zero, BitCast(du16, b));
2187  sum1 = MulAdd(BitCast(df32, a1), BitCast(df32, b1), sum1);
2188  return MulAdd(BitCast(df32, a0), BitCast(df32, b0), sum0);
2189 }
2190 
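A dot-product sketch for one vector of bf16 inputs (same assumptions; on this target bf16 vectors are represented as svuint16_t). The lane order of the two partial sums is unspecified, so add them before reducing:

float Bf16DotOneVector(svuint16_t a, svuint16_t b) {
  const hn::ScalableTag<float> df32;
  svfloat32_t sum1 = hn::Zero(df32);
  const svfloat32_t sum0 =
      hn::ReorderWidenMulAccumulate(df32, a, b, hn::Zero(df32), sum1);
  return hn::GetLane(hn::SumOfLanes(df32, hn::Add(sum0, sum1)));
}
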
2191 // ------------------------------ AESRound / CLMul
2192 
2193 #if defined(__ARM_FEATURE_SVE2_AES)
2194 
2195 // Per-target flag to prevent generic_ops-inl.h from defining AESRound.
2196 #ifdef HWY_NATIVE_AES
2197 #undef HWY_NATIVE_AES
2198 #else
2199 #define HWY_NATIVE_AES
2200 #endif
2201 
2202 HWY_API svuint8_t AESRound(svuint8_t state, svuint8_t round_key) {
2203  // It is not clear whether E and MC fuse like they did on NEON.
2204  const svuint8_t zero = svdup_n_u8(0);
2205  return Xor(svaesmc_u8(svaese_u8(state, zero)), round_key);
2206 }
2207 
2208 HWY_API svuint8_t AESLastRound(svuint8_t state, svuint8_t round_key) {
2209  return Xor(svaese_u8(state, svdup_n_u8(0)), round_key);
2210 }
2211 
2212 HWY_API svuint64_t CLMulLower(const svuint64_t a, const svuint64_t b) {
2213  return svpmullb_pair(a, b);
2214 }
2215 
2216 HWY_API svuint64_t CLMulUpper(const svuint64_t a, const svuint64_t b) {
2217  return svpmullt_pair(a, b);
2218 }
2219 
2220 #endif // __ARM_FEATURE_SVE2_AES
2221 
2222 // ------------------------------ Lt128
2223 
2224 template <class D>
2225 HWY_INLINE svbool_t Lt128(D /* d */, const svuint64_t a, const svuint64_t b) {
2226  static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8, "Use u64");
2227  // Truth table of Eq and Compare for Hi and Lo u64.
2228  // (removed lines with (=H && cH) or (=L && cL) - cannot both be true)
2229  // =H =L cH cL | out = cH | (=H & cL) = IfThenElse(=H, cL, cH)
2230  // 0 0 0 0 | 0
2231  // 0 0 0 1 | 0
2232  // 0 0 1 0 | 1
2233  // 0 0 1 1 | 1
2234  // 0 1 0 0 | 0
2235  // 0 1 0 1 | 0
2236  // 0 1 1 0 | 1
2237  // 1 0 0 0 | 0
2238  // 1 0 0 1 | 1
2239  // 1 1 0 0 | 0
2240  const svbool_t eqHL = Eq(a, b);
2241  const svbool_t ltHL = Lt(a, b);
2242  // trn (interleave even/odd) allows us to move and copy masks across lanes.
2243  const svbool_t cmpLL = svtrn1_b64(ltHL, ltHL);
2244  const svbool_t outHx = svsel_b(eqHL, cmpLL, ltHL); // See truth table above.
2245  return svtrn2_b64(outHx, outHx); // replicate to HH
2246 }
2247 
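Sketch, same assumptions: each adjacent pair of u64 lanes forms one 128-bit number, the even lane holding its low half. The result mask is replicated to both lanes of each pair, so it can directly drive IfThenElse, as Min128/Max128 do below.

svbool_t Less128(svuint64_t a, svuint64_t b) {
  const hn::ScalableTag<uint64_t> d;
  return hn::Lt128(d, a, b);
}
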
2248 // ------------------------------ Min128, Max128 (Lt128)
2249 
2250 template <class D>
2251 HWY_INLINE svuint64_t Min128(D d, const svuint64_t a, const svuint64_t b) {
2252  return IfThenElse(Lt128(d, a, b), a, b);
2253 }
2254 
2255 template <class D>
2256 HWY_INLINE svuint64_t Max128(D d, const svuint64_t a, const svuint64_t b) {
2257  return IfThenElse(Lt128(d, a, b), b, a);
2258 }
2259 
2260 // ================================================== END MACROS
2261 namespace detail { // for code folding
2262 #undef HWY_IF_FLOAT_V
2263 #undef HWY_IF_LANE_SIZE_V
2264 #undef HWY_IF_SIGNED_V
2265 #undef HWY_IF_UNSIGNED_V
2266 #undef HWY_SVE_D
2267 #undef HWY_SVE_FOREACH
2268 #undef HWY_SVE_FOREACH_F
2269 #undef HWY_SVE_FOREACH_F16
2270 #undef HWY_SVE_FOREACH_F32
2271 #undef HWY_SVE_FOREACH_F64
2272 #undef HWY_SVE_FOREACH_I
2273 #undef HWY_SVE_FOREACH_I08
2274 #undef HWY_SVE_FOREACH_I16
2275 #undef HWY_SVE_FOREACH_I32
2276 #undef HWY_SVE_FOREACH_I64
2277 #undef HWY_SVE_FOREACH_IF
2278 #undef HWY_SVE_FOREACH_U
2279 #undef HWY_SVE_FOREACH_U08
2280 #undef HWY_SVE_FOREACH_U16
2281 #undef HWY_SVE_FOREACH_U32
2282 #undef HWY_SVE_FOREACH_U64
2283 #undef HWY_SVE_FOREACH_UI
2284 #undef HWY_SVE_FOREACH_UI08
2285 #undef HWY_SVE_FOREACH_UI16
2286 #undef HWY_SVE_FOREACH_UI32
2287 #undef HWY_SVE_FOREACH_UI64
2288 #undef HWY_SVE_FOREACH_UIF3264
2289 #undef HWY_SVE_PTRUE
2290 #undef HWY_SVE_RETV_ARGPV
2291 #undef HWY_SVE_RETV_ARGPVN
2292 #undef HWY_SVE_RETV_ARGPVV
2293 #undef HWY_SVE_RETV_ARGV
2294 #undef HWY_SVE_RETV_ARGVN
2295 #undef HWY_SVE_RETV_ARGVV
2296 #undef HWY_SVE_T
2297 #undef HWY_SVE_UNDEFINED
2298 #undef HWY_SVE_V
2299 
2300 } // namespace detail
2301 // NOLINTNEXTLINE(google-readability-namespace-comments)
2302 } // namespace HWY_NAMESPACE
2303 } // namespace hwy
2304 HWY_AFTER_NAMESPACE();