Grok  9.7.5
x86_128-inl.h
Go to the documentation of this file.
1 // Copyright 2019 Google LLC
2 // SPDX-License-Identifier: Apache-2.0
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 // http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 
16 // 128-bit vectors and SSE4 instructions, plus some AVX2 and AVX512-VL
17 // operations when compiling for those targets.
18 // External include guard in highway.h - see comment there.
19 
20 #include <emmintrin.h>
21 #include <stdio.h>
22 #if HWY_TARGET == HWY_SSSE3
23 #include <tmmintrin.h> // SSSE3
24 #else
25 #include <smmintrin.h> // SSE4
26 #include <wmmintrin.h> // CLMUL
27 #endif
28 #include <stddef.h>
29 #include <stdint.h>
30 
31 #include "hwy/base.h"
32 #include "hwy/ops/shared-inl.h"
33 
34 // Clang 3.9 generates VINSERTF128 instead of the desired VBROADCASTF128,
35 // which would free up port5. However, inline assembly isn't supported on
36 // MSVC, results in incorrect output on GCC 8.3, and raises "invalid output size
37 // for constraint" errors on Clang (https://gcc.godbolt.org/z/-Jt_-F), hence we
38 // disable it.
39 #ifndef HWY_LOADDUP_ASM
40 #define HWY_LOADDUP_ASM 0
41 #endif
42 
44 namespace hwy {
45 namespace HWY_NAMESPACE {
46 
47 template <typename T>
48 using Full32 = Simd<T, 4 / sizeof(T), 0>;
49 
50 template <typename T>
51 using Full64 = Simd<T, 8 / sizeof(T), 0>;
52 
53 template <typename T>
54 using Full128 = Simd<T, 16 / sizeof(T), 0>;
55 
56 #if HWY_TARGET <= HWY_AVX2
57 template <typename T>
58 using Full256 = Simd<T, 32 / sizeof(T), 0>;
59 #endif
60 
61 #if HWY_TARGET <= HWY_AVX3
62 template <typename T>
63 using Full512 = Simd<T, 64 / sizeof(T), 0>;
64 #endif
65 
66 namespace detail {
67 
68 template <typename T>
69 struct Raw128 {
70  using type = __m128i;
71 };
72 template <>
73 struct Raw128<float> {
74  using type = __m128;
75 };
76 template <>
77 struct Raw128<double> {
78  using type = __m128d;
79 };
80 
81 } // namespace detail
82 
83 template <typename T, size_t N = 16 / sizeof(T)>
84 class Vec128 {
85  using Raw = typename detail::Raw128<T>::type;
86 
87  public:
88  // Compound assignment. Only usable if there is a corresponding non-member
89  // binary operator overload. For example, only f32 and f64 support division.
91  return *this = (*this * other);
92  }
94  return *this = (*this / other);
95  }
97  return *this = (*this + other);
98  }
100  return *this = (*this - other);
101  }
103  return *this = (*this & other);
104  }
106  return *this = (*this | other);
107  }
109  return *this = (*this ^ other);
110  }
111 
112  Raw raw;
113 };
114 
115 template <typename T>
116 using Vec64 = Vec128<T, 8 / sizeof(T)>;
117 
118 #if HWY_TARGET <= HWY_AVX3
119 
120 // Forward-declare for use by DeduceD, see below.
121 template <typename T>
122 class Vec512;
123 
124 namespace detail {
125 
126 // Template arg: sizeof(lane type)
127 template <size_t size>
128 struct RawMask128 {};
129 template <>
131  using type = __mmask16;
132 };
133 template <>
135  using type = __mmask8;
136 };
137 template <>
139  using type = __mmask8;
140 };
141 template <>
143  using type = __mmask8;
144 };
145 
146 } // namespace detail
147 
148 template <typename T, size_t N>
149 struct Mask128 {
150  using Raw = typename detail::RawMask128<sizeof(T)>::type;
151 
152  static Mask128<T, N> FromBits(uint64_t mask_bits) {
153  return Mask128<T, N>{static_cast<Raw>(mask_bits)};
154  }
155 
156  Raw raw;
157 };
158 
159 #else // AVX2 or below
160 
161 // FF..FF or 0.
162 template <typename T, size_t N = 16 / sizeof(T)>
163 struct Mask128 {
164  typename detail::Raw128<T>::type raw;
165 };
166 
167 #endif // HWY_TARGET <= HWY_AVX3
168 
169 #if HWY_TARGET <= HWY_AVX2
170 // Forward-declare for use by DeduceD, see below.
171 template <typename T>
172 class Vec256;
173 #endif
174 
175 namespace detail {
176 
177 // Deduce Simd<T, N, 0> from Vec*<T, N> (pointers because Vec256/512 may be
178 // incomplete types at this point; this is simpler than avoiding multiple
179 // definitions of DFromV via #if)
180 struct DeduceD {
181  template <typename T, size_t N>
183  return Simd<T, N, 0>();
184  }
185 #if HWY_TARGET <= HWY_AVX2
186  template <typename T>
188  return Full256<T>();
189  }
190 #endif
191 #if HWY_TARGET <= HWY_AVX3
192  template <typename T>
194  return Full512<T>();
195  }
196 #endif
197 };
198 
199 // Workaround for MSVC v19.14: alias with a dependent type fails to specialize.
200 template <class V>
201 struct ExpandDFromV {
202  using type = decltype(DeduceD()(static_cast<V*>(nullptr)));
203 };
204 
205 } // namespace detail
206 
207 template <class V>
208 using DFromV = typename detail::ExpandDFromV<V>::type;
209 
210 template <class V>
211 using TFromV = TFromD<DFromV<V>>;
212 
213 // ------------------------------ BitCast
214 
215 namespace detail {
216 
217 HWY_INLINE __m128i BitCastToInteger(__m128i v) { return v; }
218 HWY_INLINE __m128i BitCastToInteger(__m128 v) { return _mm_castps_si128(v); }
219 HWY_INLINE __m128i BitCastToInteger(__m128d v) { return _mm_castpd_si128(v); }
220 
221 template <typename T, size_t N>
222 HWY_INLINE Vec128<uint8_t, N * sizeof(T)> BitCastToByte(Vec128<T, N> v) {
223  return Vec128<uint8_t, N * sizeof(T)>{BitCastToInteger(v.raw)};
224 }
225 
226 // Cannot rely on function overloading because return types differ.
227 template <typename T>
228 struct BitCastFromInteger128 {
229  HWY_INLINE __m128i operator()(__m128i v) { return v; }
230 };
231 template <>
232 struct BitCastFromInteger128<float> {
233  HWY_INLINE __m128 operator()(__m128i v) { return _mm_castsi128_ps(v); }
234 };
235 template <>
236 struct BitCastFromInteger128<double> {
237  HWY_INLINE __m128d operator()(__m128i v) { return _mm_castsi128_pd(v); }
238 };
239 
240 template <typename T, size_t N>
242  Vec128<uint8_t, N * sizeof(T)> v) {
243  return Vec128<T, N>{BitCastFromInteger128<T>()(v.raw)};
244 }
245 
246 } // namespace detail
247 
248 template <typename T, size_t N, typename FromT>
249 HWY_API Vec128<T, N> BitCast(Simd<T, N, 0> d,
250  Vec128<FromT, N * sizeof(T) / sizeof(FromT)> v) {
252 }
253 
254 // ------------------------------ Zero
255 
256 // Returns an all-zero vector/part.
257 template <typename T, size_t N, HWY_IF_LE128(T, N)>
258 HWY_API Vec128<T, N> Zero(Simd<T, N, 0> /* tag */) {
259  return Vec128<T, N>{_mm_setzero_si128()};
260 }
261 template <size_t N, HWY_IF_LE128(float, N)>
262 HWY_API Vec128<float, N> Zero(Simd<float, N, 0> /* tag */) {
263  return Vec128<float, N>{_mm_setzero_ps()};
264 }
265 template <size_t N, HWY_IF_LE128(double, N)>
267  return Vec128<double, N>{_mm_setzero_pd()};
268 }
269 
270 template <class D>
271 using VFromD = decltype(Zero(D()));
272 
273 // ------------------------------ Set
274 
275 // Returns a vector/part with all lanes set to "t".
276 template <size_t N, HWY_IF_LE128(uint8_t, N)>
277 HWY_API Vec128<uint8_t, N> Set(Simd<uint8_t, N, 0> /* tag */, const uint8_t t) {
278  return Vec128<uint8_t, N>{_mm_set1_epi8(static_cast<char>(t))}; // NOLINT
279 }
280 template <size_t N, HWY_IF_LE128(uint16_t, N)>
281 HWY_API Vec128<uint16_t, N> Set(Simd<uint16_t, N, 0> /* tag */,
282  const uint16_t t) {
283  return Vec128<uint16_t, N>{_mm_set1_epi16(static_cast<short>(t))}; // NOLINT
284 }
285 template <size_t N, HWY_IF_LE128(uint32_t, N)>
286 HWY_API Vec128<uint32_t, N> Set(Simd<uint32_t, N, 0> /* tag */,
287  const uint32_t t) {
288  return Vec128<uint32_t, N>{_mm_set1_epi32(static_cast<int>(t))};
289 }
290 template <size_t N, HWY_IF_LE128(uint64_t, N)>
291 HWY_API Vec128<uint64_t, N> Set(Simd<uint64_t, N, 0> /* tag */,
292  const uint64_t t) {
293  return Vec128<uint64_t, N>{
294  _mm_set1_epi64x(static_cast<long long>(t))}; // NOLINT
295 }
296 template <size_t N, HWY_IF_LE128(int8_t, N)>
297 HWY_API Vec128<int8_t, N> Set(Simd<int8_t, N, 0> /* tag */, const int8_t t) {
298  return Vec128<int8_t, N>{_mm_set1_epi8(static_cast<char>(t))}; // NOLINT
299 }
300 template <size_t N, HWY_IF_LE128(int16_t, N)>
301 HWY_API Vec128<int16_t, N> Set(Simd<int16_t, N, 0> /* tag */, const int16_t t) {
302  return Vec128<int16_t, N>{_mm_set1_epi16(static_cast<short>(t))}; // NOLINT
303 }
304 template <size_t N, HWY_IF_LE128(int32_t, N)>
305 HWY_API Vec128<int32_t, N> Set(Simd<int32_t, N, 0> /* tag */, const int32_t t) {
306  return Vec128<int32_t, N>{_mm_set1_epi32(t)};
307 }
308 template <size_t N, HWY_IF_LE128(int64_t, N)>
309 HWY_API Vec128<int64_t, N> Set(Simd<int64_t, N, 0> /* tag */, const int64_t t) {
310  return Vec128<int64_t, N>{
311  _mm_set1_epi64x(static_cast<long long>(t))}; // NOLINT
312 }
313 template <size_t N, HWY_IF_LE128(float, N)>
314 HWY_API Vec128<float, N> Set(Simd<float, N, 0> /* tag */, const float t) {
315  return Vec128<float, N>{_mm_set1_ps(t)};
316 }
317 template <size_t N, HWY_IF_LE128(double, N)>
318 HWY_API Vec128<double, N> Set(Simd<double, N, 0> /* tag */, const double t) {
319  return Vec128<double, N>{_mm_set1_pd(t)};
320 }
321 
322 HWY_DIAGNOSTICS(push)
323 HWY_DIAGNOSTICS_OFF(disable : 4700, ignored "-Wuninitialized")
324 
325 // Returns a vector with uninitialized elements.
326 template <typename T, size_t N, HWY_IF_LE128(T, N)>
327 HWY_API Vec128<T, N> Undefined(Simd<T, N, 0> /* tag */) {
328  // Available on Clang 6.0, GCC 6.2, ICC 16.03, MSVC 19.14. All but ICC
329  // generate an XOR instruction.
330  return Vec128<T, N>{_mm_undefined_si128()};
331 }
332 template <size_t N, HWY_IF_LE128(float, N)>
334  return Vec128<float, N>{_mm_undefined_ps()};
335 }
336 template <size_t N, HWY_IF_LE128(double, N)>
338  return Vec128<double, N>{_mm_undefined_pd()};
339 }
340 
341 HWY_DIAGNOSTICS(pop)
342 
343 // ------------------------------ GetLane
344 
345 // Gets the single value stored in a vector/part.
346 template <size_t N>
347 HWY_API uint8_t GetLane(const Vec128<uint8_t, N> v) {
348  return static_cast<uint8_t>(_mm_cvtsi128_si32(v.raw) & 0xFF);
349 }
350 template <size_t N>
351 HWY_API int8_t GetLane(const Vec128<int8_t, N> v) {
352  return static_cast<int8_t>(_mm_cvtsi128_si32(v.raw) & 0xFF);
353 }
354 template <size_t N>
355 HWY_API uint16_t GetLane(const Vec128<uint16_t, N> v) {
356  return static_cast<uint16_t>(_mm_cvtsi128_si32(v.raw) & 0xFFFF);
357 }
358 template <size_t N>
359 HWY_API int16_t GetLane(const Vec128<int16_t, N> v) {
360  return static_cast<int16_t>(_mm_cvtsi128_si32(v.raw) & 0xFFFF);
361 }
362 template <size_t N>
363 HWY_API uint32_t GetLane(const Vec128<uint32_t, N> v) {
364  return static_cast<uint32_t>(_mm_cvtsi128_si32(v.raw));
365 }
366 template <size_t N>
367 HWY_API int32_t GetLane(const Vec128<int32_t, N> v) {
368  return _mm_cvtsi128_si32(v.raw);
369 }
370 template <size_t N>
371 HWY_API float GetLane(const Vec128<float, N> v) {
372  return _mm_cvtss_f32(v.raw);
373 }
374 template <size_t N>
375 HWY_API uint64_t GetLane(const Vec128<uint64_t, N> v) {
376 #if HWY_ARCH_X86_32
377  alignas(16) uint64_t lanes[2];
378  Store(v, Simd<uint64_t, N, 0>(), lanes);
379  return lanes[0];
380 #else
381  return static_cast<uint64_t>(_mm_cvtsi128_si64(v.raw));
382 #endif
383 }
384 template <size_t N>
385 HWY_API int64_t GetLane(const Vec128<int64_t, N> v) {
386 #if HWY_ARCH_X86_32
387  alignas(16) int64_t lanes[2];
388  Store(v, Simd<int64_t, N, 0>(), lanes);
389  return lanes[0];
390 #else
391  return _mm_cvtsi128_si64(v.raw);
392 #endif
393 }
394 template <size_t N>
396  return _mm_cvtsd_f64(v.raw);
397 }
398 
399 // ================================================== LOGICAL
400 
401 // ------------------------------ And
402 
403 template <typename T, size_t N>
404 HWY_API Vec128<T, N> And(Vec128<T, N> a, Vec128<T, N> b) {
405  return Vec128<T, N>{_mm_and_si128(a.raw, b.raw)};
406 }
407 template <size_t N>
409  const Vec128<float, N> b) {
410  return Vec128<float, N>{_mm_and_ps(a.raw, b.raw)};
411 }
412 template <size_t N>
414  const Vec128<double, N> b) {
415  return Vec128<double, N>{_mm_and_pd(a.raw, b.raw)};
416 }
417 
418 // ------------------------------ AndNot
419 
420 // Returns ~not_mask & mask.
421 template <typename T, size_t N>
422 HWY_API Vec128<T, N> AndNot(Vec128<T, N> not_mask, Vec128<T, N> mask) {
423  return Vec128<T, N>{_mm_andnot_si128(not_mask.raw, mask.raw)};
424 }
425 template <size_t N>
427  const Vec128<float, N> mask) {
428  return Vec128<float, N>{_mm_andnot_ps(not_mask.raw, mask.raw)};
429 }
430 template <size_t N>
432  const Vec128<double, N> mask) {
433  return Vec128<double, N>{_mm_andnot_pd(not_mask.raw, mask.raw)};
434 }
435 
436 // ------------------------------ Or
437 
438 template <typename T, size_t N>
439 HWY_API Vec128<T, N> Or(Vec128<T, N> a, Vec128<T, N> b) {
440  return Vec128<T, N>{_mm_or_si128(a.raw, b.raw)};
441 }
442 
443 template <size_t N>
445  const Vec128<float, N> b) {
446  return Vec128<float, N>{_mm_or_ps(a.raw, b.raw)};
447 }
448 template <size_t N>
450  const Vec128<double, N> b) {
451  return Vec128<double, N>{_mm_or_pd(a.raw, b.raw)};
452 }
453 
454 // ------------------------------ Xor
455 
456 template <typename T, size_t N>
457 HWY_API Vec128<T, N> Xor(Vec128<T, N> a, Vec128<T, N> b) {
458  return Vec128<T, N>{_mm_xor_si128(a.raw, b.raw)};
459 }
460 
461 template <size_t N>
463  const Vec128<float, N> b) {
464  return Vec128<float, N>{_mm_xor_ps(a.raw, b.raw)};
465 }
466 template <size_t N>
468  const Vec128<double, N> b) {
469  return Vec128<double, N>{_mm_xor_pd(a.raw, b.raw)};
470 }
471 
472 // ------------------------------ Not
473 
474 template <typename T, size_t N>
475 HWY_API Vec128<T, N> Not(const Vec128<T, N> v) {
476  const DFromV<decltype(v)> d;
477  const RebindToUnsigned<decltype(d)> du;
478  using VU = VFromD<decltype(du)>;
479 #if HWY_TARGET <= HWY_AVX3
480  const __m128i vu = BitCast(du, v).raw;
481  return BitCast(d, VU{_mm_ternarylogic_epi32(vu, vu, vu, 0x55)});
482 #else
483  return Xor(v, BitCast(d, VU{_mm_set1_epi32(-1)}));
484 #endif
485 }
486 
487 // ------------------------------ OrAnd
488 
489 template <typename T, size_t N>
490 HWY_API Vec128<T, N> OrAnd(Vec128<T, N> o, Vec128<T, N> a1, Vec128<T, N> a2) {
491 #if HWY_TARGET <= HWY_AVX3
492  const DFromV<decltype(o)> d;
493  const RebindToUnsigned<decltype(d)> du;
494  using VU = VFromD<decltype(du)>;
495  const __m128i ret = _mm_ternarylogic_epi64(
496  BitCast(du, o).raw, BitCast(du, a1).raw, BitCast(du, a2).raw, 0xF8);
497  return BitCast(d, VU{ret});
498 #else
499  return Or(o, And(a1, a2));
500 #endif
501 }
502 
503 // ------------------------------ IfVecThenElse
504 
505 template <typename T, size_t N>
506 HWY_API Vec128<T, N> IfVecThenElse(Vec128<T, N> mask, Vec128<T, N> yes,
507  Vec128<T, N> no) {
508 #if HWY_TARGET <= HWY_AVX3
509  const DFromV<decltype(no)> d;
510  const RebindToUnsigned<decltype(d)> du;
511  using VU = VFromD<decltype(du)>;
512  return BitCast(
513  d, VU{_mm_ternarylogic_epi64(BitCast(du, mask).raw, BitCast(du, yes).raw,
514  BitCast(du, no).raw, 0xCA)});
515 #else
516  return IfThenElse(MaskFromVec(mask), yes, no);
517 #endif
518 }
519 
520 // ------------------------------ Operator overloads (internal-only if float)
521 
522 template <typename T, size_t N>
523 HWY_API Vec128<T, N> operator&(const Vec128<T, N> a, const Vec128<T, N> b) {
524  return And(a, b);
525 }
526 
527 template <typename T, size_t N>
528 HWY_API Vec128<T, N> operator|(const Vec128<T, N> a, const Vec128<T, N> b) {
529  return Or(a, b);
530 }
531 
532 template <typename T, size_t N>
533 HWY_API Vec128<T, N> operator^(const Vec128<T, N> a, const Vec128<T, N> b) {
534  return Xor(a, b);
535 }
536 
537 // ------------------------------ PopulationCount
538 
539 // 8/16 require BITALG, 32/64 require VPOPCNTDQ.
540 #if HWY_TARGET == HWY_AVX3_DL
541 
542 #ifdef HWY_NATIVE_POPCNT
543 #undef HWY_NATIVE_POPCNT
544 #else
545 #define HWY_NATIVE_POPCNT
546 #endif
547 
548 namespace detail {
549 
550 template <typename T, size_t N>
552  Vec128<T, N> v) {
553  return Vec128<T, N>{_mm_popcnt_epi8(v.raw)};
554 }
555 template <typename T, size_t N>
557  Vec128<T, N> v) {
558  return Vec128<T, N>{_mm_popcnt_epi16(v.raw)};
559 }
560 template <typename T, size_t N>
562  Vec128<T, N> v) {
563  return Vec128<T, N>{_mm_popcnt_epi32(v.raw)};
564 }
565 template <typename T, size_t N>
567  Vec128<T, N> v) {
568  return Vec128<T, N>{_mm_popcnt_epi64(v.raw)};
569 }
570 
571 } // namespace detail
572 
573 template <typename T, size_t N>
575  return detail::PopulationCount(hwy::SizeTag<sizeof(T)>(), v);
576 }
577 
578 #endif // HWY_TARGET == HWY_AVX3_DL
579 
580 // ================================================== SIGN
581 
582 // ------------------------------ Neg
583 
584 template <typename T, size_t N, HWY_IF_FLOAT(T)>
585 HWY_API Vec128<T, N> Neg(const Vec128<T, N> v) {
586  return Xor(v, SignBit(DFromV<decltype(v)>()));
587 }
588 
589 template <typename T, size_t N, HWY_IF_NOT_FLOAT(T)>
590 HWY_API Vec128<T, N> Neg(const Vec128<T, N> v) {
591  return Zero(DFromV<decltype(v)>()) - v;
592 }
593 
594 // ------------------------------ Abs
595 
596 // Returns absolute value, except that LimitsMin() maps to LimitsMax() + 1.
597 template <size_t N>
598 HWY_API Vec128<int8_t, N> Abs(const Vec128<int8_t, N> v) {
599 #if HWY_COMPILER_MSVC
600  // Workaround for incorrect codegen? (reaches breakpoint)
601  const auto zero = Zero(DFromV<decltype(v)>());
602  return Vec128<int8_t, N>{_mm_max_epi8(v.raw, (zero - v).raw)};
603 #else
604  return Vec128<int8_t, N>{_mm_abs_epi8(v.raw)};
605 #endif
606 }
607 template <size_t N>
608 HWY_API Vec128<int16_t, N> Abs(const Vec128<int16_t, N> v) {
609  return Vec128<int16_t, N>{_mm_abs_epi16(v.raw)};
610 }
611 template <size_t N>
612 HWY_API Vec128<int32_t, N> Abs(const Vec128<int32_t, N> v) {
613  return Vec128<int32_t, N>{_mm_abs_epi32(v.raw)};
614 }
615 // i64 is implemented after BroadcastSignBit.
616 template <size_t N>
617 HWY_API Vec128<float, N> Abs(const Vec128<float, N> v) {
618  const Vec128<int32_t, N> mask{_mm_set1_epi32(0x7FFFFFFF)};
619  return v & BitCast(DFromV<decltype(v)>(), mask);
620 }
621 template <size_t N>
623  const Vec128<int64_t, N> mask{_mm_set1_epi64x(0x7FFFFFFFFFFFFFFFLL)};
624  return v & BitCast(DFromV<decltype(v)>(), mask);
625 }
626 
627 // ------------------------------ CopySign
628 
629 template <typename T, size_t N>
630 HWY_API Vec128<T, N> CopySign(const Vec128<T, N> magn,
631  const Vec128<T, N> sign) {
632  static_assert(IsFloat<T>(), "Only makes sense for floating-point");
633 
634  const DFromV<decltype(magn)> d;
635  const auto msb = SignBit(d);
636 
637 #if HWY_TARGET <= HWY_AVX3
638  const RebindToUnsigned<decltype(d)> du;
639  // Truth table for msb, magn, sign | bitwise msb ? sign : mag
640  // 0 0 0 | 0
641  // 0 0 1 | 0
642  // 0 1 0 | 1
643  // 0 1 1 | 1
644  // 1 0 0 | 0
645  // 1 0 1 | 1
646  // 1 1 0 | 0
647  // 1 1 1 | 1
648  // The lane size does not matter because we are not using predication.
649  const __m128i out = _mm_ternarylogic_epi32(
650  BitCast(du, msb).raw, BitCast(du, magn).raw, BitCast(du, sign).raw, 0xAC);
651  return BitCast(d, VFromD<decltype(du)>{out});
652 #else
653  return Or(AndNot(msb, magn), And(msb, sign));
654 #endif
655 }
656 
657 template <typename T, size_t N>
658 HWY_API Vec128<T, N> CopySignToAbs(const Vec128<T, N> abs,
659  const Vec128<T, N> sign) {
660 #if HWY_TARGET <= HWY_AVX3
661  // AVX3 can also handle abs < 0, so no extra action needed.
662  return CopySign(abs, sign);
663 #else
664  return Or(abs, And(SignBit(DFromV<decltype(abs)>()), sign));
665 #endif
666 }
667 
668 // ================================================== MASK
669 
670 #if HWY_TARGET <= HWY_AVX3
671 
672 // ------------------------------ IfThenElse
673 
674 // Returns mask ? b : a.
675 
676 namespace detail {
677 
678 // Templates for signed/unsigned integer of a particular size.
679 template <typename T, size_t N>
681  Mask128<T, N> mask, Vec128<T, N> yes,
682  Vec128<T, N> no) {
683  return Vec128<T, N>{_mm_mask_mov_epi8(no.raw, mask.raw, yes.raw)};
684 }
685 template <typename T, size_t N>
687  Mask128<T, N> mask, Vec128<T, N> yes,
688  Vec128<T, N> no) {
689  return Vec128<T, N>{_mm_mask_mov_epi16(no.raw, mask.raw, yes.raw)};
690 }
691 template <typename T, size_t N>
693  Mask128<T, N> mask, Vec128<T, N> yes,
694  Vec128<T, N> no) {
695  return Vec128<T, N>{_mm_mask_mov_epi32(no.raw, mask.raw, yes.raw)};
696 }
697 template <typename T, size_t N>
699  Mask128<T, N> mask, Vec128<T, N> yes,
700  Vec128<T, N> no) {
701  return Vec128<T, N>{_mm_mask_mov_epi64(no.raw, mask.raw, yes.raw)};
702 }
703 
704 } // namespace detail
705 
706 template <typename T, size_t N>
707 HWY_API Vec128<T, N> IfThenElse(Mask128<T, N> mask, Vec128<T, N> yes,
708  Vec128<T, N> no) {
709  return detail::IfThenElse(hwy::SizeTag<sizeof(T)>(), mask, yes, no);
710 }
711 
712 template <size_t N>
715  return Vec128<float, N>{_mm_mask_mov_ps(no.raw, mask.raw, yes.raw)};
716 }
717 
718 template <size_t N>
720  Vec128<double, N> yes,
721  Vec128<double, N> no) {
722  return Vec128<double, N>{_mm_mask_mov_pd(no.raw, mask.raw, yes.raw)};
723 }
724 
725 namespace detail {
726 
727 template <typename T, size_t N>
729  Mask128<T, N> mask, Vec128<T, N> yes) {
730  return Vec128<T, N>{_mm_maskz_mov_epi8(mask.raw, yes.raw)};
731 }
732 template <typename T, size_t N>
734  Mask128<T, N> mask, Vec128<T, N> yes) {
735  return Vec128<T, N>{_mm_maskz_mov_epi16(mask.raw, yes.raw)};
736 }
737 template <typename T, size_t N>
739  Mask128<T, N> mask, Vec128<T, N> yes) {
740  return Vec128<T, N>{_mm_maskz_mov_epi32(mask.raw, yes.raw)};
741 }
742 template <typename T, size_t N>
744  Mask128<T, N> mask, Vec128<T, N> yes) {
745  return Vec128<T, N>{_mm_maskz_mov_epi64(mask.raw, yes.raw)};
746 }
747 
748 } // namespace detail
749 
750 template <typename T, size_t N>
751 HWY_API Vec128<T, N> IfThenElseZero(Mask128<T, N> mask, Vec128<T, N> yes) {
752  return detail::IfThenElseZero(hwy::SizeTag<sizeof(T)>(), mask, yes);
753 }
754 
755 template <size_t N>
757  Vec128<float, N> yes) {
758  return Vec128<float, N>{_mm_maskz_mov_ps(mask.raw, yes.raw)};
759 }
760 
761 template <size_t N>
763  Vec128<double, N> yes) {
764  return Vec128<double, N>{_mm_maskz_mov_pd(mask.raw, yes.raw)};
765 }
766 
767 namespace detail {
768 
769 template <typename T, size_t N>
771  Mask128<T, N> mask, Vec128<T, N> no) {
772  // xor_epi8/16 are missing, but we have sub, which is just as fast for u8/16.
773  return Vec128<T, N>{_mm_mask_sub_epi8(no.raw, mask.raw, no.raw, no.raw)};
774 }
775 template <typename T, size_t N>
777  Mask128<T, N> mask, Vec128<T, N> no) {
778  return Vec128<T, N>{_mm_mask_sub_epi16(no.raw, mask.raw, no.raw, no.raw)};
779 }
780 template <typename T, size_t N>
782  Mask128<T, N> mask, Vec128<T, N> no) {
783  return Vec128<T, N>{_mm_mask_xor_epi32(no.raw, mask.raw, no.raw, no.raw)};
784 }
785 template <typename T, size_t N>
787  Mask128<T, N> mask, Vec128<T, N> no) {
788  return Vec128<T, N>{_mm_mask_xor_epi64(no.raw, mask.raw, no.raw, no.raw)};
789 }
790 
791 } // namespace detail
792 
793 template <typename T, size_t N>
794 HWY_API Vec128<T, N> IfThenZeroElse(Mask128<T, N> mask, Vec128<T, N> no) {
795  return detail::IfThenZeroElse(hwy::SizeTag<sizeof(T)>(), mask, no);
796 }
797 
798 template <size_t N>
800  Vec128<float, N> no) {
801  return Vec128<float, N>{_mm_mask_xor_ps(no.raw, mask.raw, no.raw, no.raw)};
802 }
803 
804 template <size_t N>
806  Vec128<double, N> no) {
807  return Vec128<double, N>{_mm_mask_xor_pd(no.raw, mask.raw, no.raw, no.raw)};
808 }
809 
810 // ------------------------------ Mask logical
811 
812 // For Clang and GCC, mask intrinsics (KORTEST) weren't added until recently.
813 #if !defined(HWY_COMPILER_HAS_MASK_INTRINSICS)
814 #if HWY_COMPILER_MSVC != 0 || HWY_COMPILER_GCC >= 700 || \
815  HWY_COMPILER_CLANG >= 800
816 #define HWY_COMPILER_HAS_MASK_INTRINSICS 1
817 #else
818 #define HWY_COMPILER_HAS_MASK_INTRINSICS 0
819 #endif
820 #endif // HWY_COMPILER_HAS_MASK_INTRINSICS
821 
822 namespace detail {
823 
824 template <typename T, size_t N>
826  const Mask128<T, N> b) {
827 #if HWY_COMPILER_HAS_MASK_INTRINSICS
828  return Mask128<T, N>{_kand_mask16(a.raw, b.raw)};
829 #else
830  return Mask128<T, N>{static_cast<__mmask16>(a.raw & b.raw)};
831 #endif
832 }
833 template <typename T, size_t N>
835  const Mask128<T, N> b) {
836 #if HWY_COMPILER_HAS_MASK_INTRINSICS
837  return Mask128<T, N>{_kand_mask8(a.raw, b.raw)};
838 #else
839  return Mask128<T, N>{static_cast<__mmask8>(a.raw & b.raw)};
840 #endif
841 }
842 template <typename T, size_t N>
844  const Mask128<T, N> b) {
845 #if HWY_COMPILER_HAS_MASK_INTRINSICS
846  return Mask128<T, N>{_kand_mask8(a.raw, b.raw)};
847 #else
848  return Mask128<T, N>{static_cast<__mmask8>(a.raw & b.raw)};
849 #endif
850 }
851 template <typename T, size_t N>
853  const Mask128<T, N> b) {
854 #if HWY_COMPILER_HAS_MASK_INTRINSICS
855  return Mask128<T, N>{_kand_mask8(a.raw, b.raw)};
856 #else
857  return Mask128<T, N>{static_cast<__mmask8>(a.raw & b.raw)};
858 #endif
859 }
860 
861 template <typename T, size_t N>
863  const Mask128<T, N> b) {
864 #if HWY_COMPILER_HAS_MASK_INTRINSICS
865  return Mask128<T, N>{_kandn_mask16(a.raw, b.raw)};
866 #else
867  return Mask128<T, N>{static_cast<__mmask16>(~a.raw & b.raw)};
868 #endif
869 }
870 template <typename T, size_t N>
872  const Mask128<T, N> b) {
873 #if HWY_COMPILER_HAS_MASK_INTRINSICS
874  return Mask128<T, N>{_kandn_mask8(a.raw, b.raw)};
875 #else
876  return Mask128<T, N>{static_cast<__mmask8>(~a.raw & b.raw)};
877 #endif
878 }
879 template <typename T, size_t N>
881  const Mask128<T, N> b) {
882 #if HWY_COMPILER_HAS_MASK_INTRINSICS
883  return Mask128<T, N>{_kandn_mask8(a.raw, b.raw)};
884 #else
885  return Mask128<T, N>{static_cast<__mmask8>(~a.raw & b.raw)};
886 #endif
887 }
888 template <typename T, size_t N>
890  const Mask128<T, N> b) {
891 #if HWY_COMPILER_HAS_MASK_INTRINSICS
892  return Mask128<T, N>{_kandn_mask8(a.raw, b.raw)};
893 #else
894  return Mask128<T, N>{static_cast<__mmask8>(~a.raw & b.raw)};
895 #endif
896 }
897 
898 template <typename T, size_t N>
900  const Mask128<T, N> b) {
901 #if HWY_COMPILER_HAS_MASK_INTRINSICS
902  return Mask128<T, N>{_kor_mask16(a.raw, b.raw)};
903 #else
904  return Mask128<T, N>{static_cast<__mmask16>(a.raw | b.raw)};
905 #endif
906 }
907 template <typename T, size_t N>
909  const Mask128<T, N> b) {
910 #if HWY_COMPILER_HAS_MASK_INTRINSICS
911  return Mask128<T, N>{_kor_mask8(a.raw, b.raw)};
912 #else
913  return Mask128<T, N>{static_cast<__mmask8>(a.raw | b.raw)};
914 #endif
915 }
916 template <typename T, size_t N>
918  const Mask128<T, N> b) {
919 #if HWY_COMPILER_HAS_MASK_INTRINSICS
920  return Mask128<T, N>{_kor_mask8(a.raw, b.raw)};
921 #else
922  return Mask128<T, N>{static_cast<__mmask8>(a.raw | b.raw)};
923 #endif
924 }
925 template <typename T, size_t N>
927  const Mask128<T, N> b) {
928 #if HWY_COMPILER_HAS_MASK_INTRINSICS
929  return Mask128<T, N>{_kor_mask8(a.raw, b.raw)};
930 #else
931  return Mask128<T, N>{static_cast<__mmask8>(a.raw | b.raw)};
932 #endif
933 }
934 
935 template <typename T, size_t N>
937  const Mask128<T, N> b) {
938 #if HWY_COMPILER_HAS_MASK_INTRINSICS
939  return Mask128<T, N>{_kxor_mask16(a.raw, b.raw)};
940 #else
941  return Mask128<T, N>{static_cast<__mmask16>(a.raw ^ b.raw)};
942 #endif
943 }
944 template <typename T, size_t N>
946  const Mask128<T, N> b) {
947 #if HWY_COMPILER_HAS_MASK_INTRINSICS
948  return Mask128<T, N>{_kxor_mask8(a.raw, b.raw)};
949 #else
950  return Mask128<T, N>{static_cast<__mmask8>(a.raw ^ b.raw)};
951 #endif
952 }
953 template <typename T, size_t N>
955  const Mask128<T, N> b) {
956 #if HWY_COMPILER_HAS_MASK_INTRINSICS
957  return Mask128<T, N>{_kxor_mask8(a.raw, b.raw)};
958 #else
959  return Mask128<T, N>{static_cast<__mmask8>(a.raw ^ b.raw)};
960 #endif
961 }
962 template <typename T, size_t N>
964  const Mask128<T, N> b) {
965 #if HWY_COMPILER_HAS_MASK_INTRINSICS
966  return Mask128<T, N>{_kxor_mask8(a.raw, b.raw)};
967 #else
968  return Mask128<T, N>{static_cast<__mmask8>(a.raw ^ b.raw)};
969 #endif
970 }
971 
972 } // namespace detail
973 
974 template <typename T, size_t N>
975 HWY_API Mask128<T, N> And(const Mask128<T, N> a, Mask128<T, N> b) {
976  return detail::And(hwy::SizeTag<sizeof(T)>(), a, b);
977 }
978 
979 template <typename T, size_t N>
980 HWY_API Mask128<T, N> AndNot(const Mask128<T, N> a, Mask128<T, N> b) {
981  return detail::AndNot(hwy::SizeTag<sizeof(T)>(), a, b);
982 }
983 
984 template <typename T, size_t N>
985 HWY_API Mask128<T, N> Or(const Mask128<T, N> a, Mask128<T, N> b) {
986  return detail::Or(hwy::SizeTag<sizeof(T)>(), a, b);
987 }
988 
989 template <typename T, size_t N>
990 HWY_API Mask128<T, N> Xor(const Mask128<T, N> a, Mask128<T, N> b) {
991  return detail::Xor(hwy::SizeTag<sizeof(T)>(), a, b);
992 }
993 
994 template <typename T, size_t N>
995 HWY_API Mask128<T, N> Not(const Mask128<T, N> m) {
996  // Flip only the valid bits.
997  return Xor(m, Mask128<T, N>::FromBits((1ull << N) - 1));
998 }
999 
1000 #else // AVX2 or below
1001 
1002 // ------------------------------ Mask
1003 
1004 // Mask and Vec are the same (true = FF..FF).
1005 template <typename T, size_t N>
1006 HWY_API Mask128<T, N> MaskFromVec(const Vec128<T, N> v) {
1007  return Mask128<T, N>{v.raw};
1008 }
1009 
1010 template <typename T, size_t N>
1011 HWY_API Vec128<T, N> VecFromMask(const Mask128<T, N> v) {
1012  return Vec128<T, N>{v.raw};
1013 }
1014 
1015 template <typename T, size_t N>
1016 HWY_API Vec128<T, N> VecFromMask(const Simd<T, N, 0> /* tag */,
1017  const Mask128<T, N> v) {
1018  return Vec128<T, N>{v.raw};
1019 }
1020 
1021 #if HWY_TARGET == HWY_SSSE3
1022 
1023 // mask ? yes : no
1024 template <typename T, size_t N>
1025 HWY_API Vec128<T, N> IfThenElse(Mask128<T, N> mask, Vec128<T, N> yes,
1026  Vec128<T, N> no) {
1027  const auto vmask = VecFromMask(DFromV<decltype(no)>(), mask);
1028  return Or(And(vmask, yes), AndNot(vmask, no));
1029 }
1030 
1031 #else // HWY_TARGET == HWY_SSSE3
1032 
1033 // mask ? yes : no
1034 template <typename T, size_t N>
1035 HWY_API Vec128<T, N> IfThenElse(Mask128<T, N> mask, Vec128<T, N> yes,
1036  Vec128<T, N> no) {
1037  return Vec128<T, N>{_mm_blendv_epi8(no.raw, yes.raw, mask.raw)};
1038 }
1039 template <size_t N>
1040 HWY_API Vec128<float, N> IfThenElse(const Mask128<float, N> mask,
1041  const Vec128<float, N> yes,
1042  const Vec128<float, N> no) {
1043  return Vec128<float, N>{_mm_blendv_ps(no.raw, yes.raw, mask.raw)};
1044 }
1045 template <size_t N>
1046 HWY_API Vec128<double, N> IfThenElse(const Mask128<double, N> mask,
1047  const Vec128<double, N> yes,
1048  const Vec128<double, N> no) {
1049  return Vec128<double, N>{_mm_blendv_pd(no.raw, yes.raw, mask.raw)};
1050 }
1051 
1052 #endif // HWY_TARGET == HWY_SSSE3
1053 
1054 // mask ? yes : 0
1055 template <typename T, size_t N>
1056 HWY_API Vec128<T, N> IfThenElseZero(Mask128<T, N> mask, Vec128<T, N> yes) {
1057  return yes & VecFromMask(DFromV<decltype(yes)>(), mask);
1058 }
1059 
1060 // mask ? 0 : no
1061 template <typename T, size_t N>
1062 HWY_API Vec128<T, N> IfThenZeroElse(Mask128<T, N> mask, Vec128<T, N> no) {
1063  return AndNot(VecFromMask(DFromV<decltype(no)>(), mask), no);
1064 }
1065 
1066 // ------------------------------ Mask logical
1067 
1068 template <typename T, size_t N>
1069 HWY_API Mask128<T, N> Not(const Mask128<T, N> m) {
1070  return MaskFromVec(Not(VecFromMask(Simd<T, N, 0>(), m)));
1071 }
1072 
1073 template <typename T, size_t N>
1074 HWY_API Mask128<T, N> And(const Mask128<T, N> a, Mask128<T, N> b) {
1075  const Simd<T, N, 0> d;
1076  return MaskFromVec(And(VecFromMask(d, a), VecFromMask(d, b)));
1077 }
1078 
1079 template <typename T, size_t N>
1080 HWY_API Mask128<T, N> AndNot(const Mask128<T, N> a, Mask128<T, N> b) {
1081  const Simd<T, N, 0> d;
1082  return MaskFromVec(AndNot(VecFromMask(d, a), VecFromMask(d, b)));
1083 }
1084 
1085 template <typename T, size_t N>
1086 HWY_API Mask128<T, N> Or(const Mask128<T, N> a, Mask128<T, N> b) {
1087  const Simd<T, N, 0> d;
1088  return MaskFromVec(Or(VecFromMask(d, a), VecFromMask(d, b)));
1089 }
1090 
1091 template <typename T, size_t N>
1092 HWY_API Mask128<T, N> Xor(const Mask128<T, N> a, Mask128<T, N> b) {
1093  const Simd<T, N, 0> d;
1094  return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b)));
1095 }
1096 
1097 #endif // HWY_TARGET <= HWY_AVX3
1098 
1099 // ================================================== SWIZZLE (1)
1100 
1101 // ------------------------------ Hard-coded shuffles
1102 
1103 // Notation: let Vec128<int32_t> have lanes 3,2,1,0 (0 is least-significant).
1104 // Shuffle0321 rotates one lane to the right (the previous least-significant
1105 // lane is now most-significant). These could also be implemented via
1106 // CombineShiftRightBytes but the shuffle_abcd notation is more convenient.
1107 
1108 // Swap 32-bit halves in 64-bit halves.
1109 template <size_t N>
1111  static_assert(N == 2 || N == 4, "Does not make sense for N=1");
1112  return Vec128<uint32_t, N>{_mm_shuffle_epi32(v.raw, 0xB1)};
1113 }
1114 template <size_t N>
1116  static_assert(N == 2 || N == 4, "Does not make sense for N=1");
1117  return Vec128<int32_t, N>{_mm_shuffle_epi32(v.raw, 0xB1)};
1118 }
1119 template <size_t N>
1121  static_assert(N == 2 || N == 4, "Does not make sense for N=1");
1122  return Vec128<float, N>{_mm_shuffle_ps(v.raw, v.raw, 0xB1)};
1123 }
1124 
1125 // Swap 64-bit halves
1126 HWY_API Vec128<uint32_t> Shuffle1032(const Vec128<uint32_t> v) {
1127  return Vec128<uint32_t>{_mm_shuffle_epi32(v.raw, 0x4E)};
1128 }
1129 HWY_API Vec128<int32_t> Shuffle1032(const Vec128<int32_t> v) {
1130  return Vec128<int32_t>{_mm_shuffle_epi32(v.raw, 0x4E)};
1131 }
1132 HWY_API Vec128<float> Shuffle1032(const Vec128<float> v) {
1133  return Vec128<float>{_mm_shuffle_ps(v.raw, v.raw, 0x4E)};
1134 }
1136  return Vec128<uint64_t>{_mm_shuffle_epi32(v.raw, 0x4E)};
1137 }
1139  return Vec128<int64_t>{_mm_shuffle_epi32(v.raw, 0x4E)};
1140 }
1142  return Vec128<double>{_mm_shuffle_pd(v.raw, v.raw, 1)};
1143 }
1144 
1145 // Rotate right 32 bits
1146 HWY_API Vec128<uint32_t> Shuffle0321(const Vec128<uint32_t> v) {
1147  return Vec128<uint32_t>{_mm_shuffle_epi32(v.raw, 0x39)};
1148 }
1149 HWY_API Vec128<int32_t> Shuffle0321(const Vec128<int32_t> v) {
1150  return Vec128<int32_t>{_mm_shuffle_epi32(v.raw, 0x39)};
1151 }
1152 HWY_API Vec128<float> Shuffle0321(const Vec128<float> v) {
1153  return Vec128<float>{_mm_shuffle_ps(v.raw, v.raw, 0x39)};
1154 }
1155 // Rotate left 32 bits
1156 HWY_API Vec128<uint32_t> Shuffle2103(const Vec128<uint32_t> v) {
1157  return Vec128<uint32_t>{_mm_shuffle_epi32(v.raw, 0x93)};
1158 }
1159 HWY_API Vec128<int32_t> Shuffle2103(const Vec128<int32_t> v) {
1160  return Vec128<int32_t>{_mm_shuffle_epi32(v.raw, 0x93)};
1161 }
1162 HWY_API Vec128<float> Shuffle2103(const Vec128<float> v) {
1163  return Vec128<float>{_mm_shuffle_ps(v.raw, v.raw, 0x93)};
1164 }
1165 
1166 // Reverse
1167 HWY_API Vec128<uint32_t> Shuffle0123(const Vec128<uint32_t> v) {
1168  return Vec128<uint32_t>{_mm_shuffle_epi32(v.raw, 0x1B)};
1169 }
1170 HWY_API Vec128<int32_t> Shuffle0123(const Vec128<int32_t> v) {
1171  return Vec128<int32_t>{_mm_shuffle_epi32(v.raw, 0x1B)};
1172 }
1173 HWY_API Vec128<float> Shuffle0123(const Vec128<float> v) {
1174  return Vec128<float>{_mm_shuffle_ps(v.raw, v.raw, 0x1B)};
1175 }
1176 
1177 // ================================================== COMPARE
1178 
1179 #if HWY_TARGET <= HWY_AVX3
1180 
1181 // Comparisons set a mask bit to 1 if the condition is true, else 0.
1182 
1183 template <typename TFrom, size_t NFrom, typename TTo, size_t NTo>
1186  static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size");
1187  return Mask128<TTo, NTo>{m.raw};
1188 }
1189 
1190 namespace detail {
1191 
1192 template <typename T, size_t N>
1194  const Vec128<T, N> bit) {
1195  return Mask128<T, N>{_mm_test_epi8_mask(v.raw, bit.raw)};
1196 }
1197 template <typename T, size_t N>
1199  const Vec128<T, N> bit) {
1200  return Mask128<T, N>{_mm_test_epi16_mask(v.raw, bit.raw)};
1201 }
1202 template <typename T, size_t N>
1204  const Vec128<T, N> bit) {
1205  return Mask128<T, N>{_mm_test_epi32_mask(v.raw, bit.raw)};
1206 }
1207 template <typename T, size_t N>
1209  const Vec128<T, N> bit) {
1210  return Mask128<T, N>{_mm_test_epi64_mask(v.raw, bit.raw)};
1211 }
1212 
1213 } // namespace detail
1214 
1215 template <typename T, size_t N>
1216 HWY_API Mask128<T, N> TestBit(const Vec128<T, N> v, const Vec128<T, N> bit) {
1217  static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported");
1218  return detail::TestBit(hwy::SizeTag<sizeof(T)>(), v, bit);
1219 }
1220 
1221 // ------------------------------ Equality
1222 
1223 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
1225  return Mask128<T, N>{_mm_cmpeq_epi8_mask(a.raw, b.raw)};
1226 }
1227 
1228 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
1229 HWY_API Mask128<T, N> operator==(const Vec128<T, N> a, const Vec128<T, N> b) {
1230  return Mask128<T, N>{_mm_cmpeq_epi16_mask(a.raw, b.raw)};
1231 }
1232 
1233 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
1234 HWY_API Mask128<T, N> operator==(const Vec128<T, N> a, const Vec128<T, N> b) {
1235  return Mask128<T, N>{_mm_cmpeq_epi32_mask(a.raw, b.raw)};
1236 }
1237 
1238 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
1239 HWY_API Mask128<T, N> operator==(const Vec128<T, N> a, const Vec128<T, N> b) {
1240  return Mask128<T, N>{_mm_cmpeq_epi64_mask(a.raw, b.raw)};
1241 }
1242 
1243 template <size_t N>
1244 HWY_API Mask128<float, N> operator==(Vec128<float, N> a, Vec128<float, N> b) {
1245  return Mask128<float, N>{_mm_cmp_ps_mask(a.raw, b.raw, _CMP_EQ_OQ)};
1246 }
1247 
1248 template <size_t N>
1250  Vec128<double, N> b) {
1251  return Mask128<double, N>{_mm_cmp_pd_mask(a.raw, b.raw, _CMP_EQ_OQ)};
1252 }
1253 
1254 // ------------------------------ Inequality
1255 
1256 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
1258  return Mask128<T, N>{_mm_cmpneq_epi8_mask(a.raw, b.raw)};
1259 }
1260 
1261 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
1262 HWY_API Mask128<T, N> operator!=(const Vec128<T, N> a, const Vec128<T, N> b) {
1263  return Mask128<T, N>{_mm_cmpneq_epi16_mask(a.raw, b.raw)};
1264 }
1265 
1266 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
1267 HWY_API Mask128<T, N> operator!=(const Vec128<T, N> a, const Vec128<T, N> b) {
1268  return Mask128<T, N>{_mm_cmpneq_epi32_mask(a.raw, b.raw)};
1269 }
1270 
1271 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
1272 HWY_API Mask128<T, N> operator!=(const Vec128<T, N> a, const Vec128<T, N> b) {
1273  return Mask128<T, N>{_mm_cmpneq_epi64_mask(a.raw, b.raw)};
1274 }
1275 
1276 template <size_t N>
1277 HWY_API Mask128<float, N> operator!=(Vec128<float, N> a, Vec128<float, N> b) {
1278  return Mask128<float, N>{_mm_cmp_ps_mask(a.raw, b.raw, _CMP_NEQ_OQ)};
1279 }
1280 
1281 template <size_t N>
1283  Vec128<double, N> b) {
1284  return Mask128<double, N>{_mm_cmp_pd_mask(a.raw, b.raw, _CMP_NEQ_OQ)};
1285 }
1286 
1287 // ------------------------------ Strict inequality
1288 
1289 // Signed/float <
1290 template <size_t N>
1291 HWY_API Mask128<int8_t, N> operator>(Vec128<int8_t, N> a, Vec128<int8_t, N> b) {
1292  return Mask128<int8_t, N>{_mm_cmpgt_epi8_mask(a.raw, b.raw)};
1293 }
1294 template <size_t N>
1295 HWY_API Mask128<int16_t, N> operator>(Vec128<int16_t, N> a,
1296  Vec128<int16_t, N> b) {
1297  return Mask128<int16_t, N>{_mm_cmpgt_epi16_mask(a.raw, b.raw)};
1298 }
1299 template <size_t N>
1300 HWY_API Mask128<int32_t, N> operator>(Vec128<int32_t, N> a,
1301  Vec128<int32_t, N> b) {
1302  return Mask128<int32_t, N>{_mm_cmpgt_epi32_mask(a.raw, b.raw)};
1303 }
1304 template <size_t N>
1305 HWY_API Mask128<int64_t, N> operator>(Vec128<int64_t, N> a,
1306  Vec128<int64_t, N> b) {
1307  return Mask128<int64_t, N>{_mm_cmpgt_epi64_mask(a.raw, b.raw)};
1308 }
1309 
1310 template <size_t N>
1311 HWY_API Mask128<uint8_t, N> operator>(Vec128<uint8_t, N> a,
1312  Vec128<uint8_t, N> b) {
1313  return Mask128<uint8_t, N>{_mm_cmpgt_epu8_mask(a.raw, b.raw)};
1314 }
1315 template <size_t N>
1316 HWY_API Mask128<uint16_t, N> operator>(Vec128<uint16_t, N> a,
1317  Vec128<uint16_t, N> b) {
1318  return Mask128<uint16_t, N>{_mm_cmpgt_epu16_mask(a.raw, b.raw)};
1319 }
1320 template <size_t N>
1321 HWY_API Mask128<uint32_t, N> operator>(Vec128<uint32_t, N> a,
1322  Vec128<uint32_t, N> b) {
1323  return Mask128<uint32_t, N>{_mm_cmpgt_epu32_mask(a.raw, b.raw)};
1324 }
1325 template <size_t N>
1326 HWY_API Mask128<uint64_t, N> operator>(Vec128<uint64_t, N> a,
1327  Vec128<uint64_t, N> b) {
1328  return Mask128<uint64_t, N>{_mm_cmpgt_epu64_mask(a.raw, b.raw)};
1329 }
1330 
1331 template <size_t N>
1332 HWY_API Mask128<float, N> operator>(Vec128<float, N> a, Vec128<float, N> b) {
1333  return Mask128<float, N>{_mm_cmp_ps_mask(a.raw, b.raw, _CMP_GT_OQ)};
1334 }
1335 template <size_t N>
1337  return Mask128<double, N>{_mm_cmp_pd_mask(a.raw, b.raw, _CMP_GT_OQ)};
1338 }
1339 
1340 // ------------------------------ Weak inequality
1341 
1342 template <size_t N>
1343 HWY_API Mask128<float, N> operator>=(Vec128<float, N> a, Vec128<float, N> b) {
1344  return Mask128<float, N>{_mm_cmp_ps_mask(a.raw, b.raw, _CMP_GE_OQ)};
1345 }
1346 template <size_t N>
1348  Vec128<double, N> b) {
1349  return Mask128<double, N>{_mm_cmp_pd_mask(a.raw, b.raw, _CMP_GE_OQ)};
1350 }
1351 
1352 // ------------------------------ Mask
1353 
1354 namespace detail {
1355 
1356 template <typename T, size_t N>
1358  const Vec128<T, N> v) {
1359  return Mask128<T, N>{_mm_movepi8_mask(v.raw)};
1360 }
1361 template <typename T, size_t N>
1363  const Vec128<T, N> v) {
1364  return Mask128<T, N>{_mm_movepi16_mask(v.raw)};
1365 }
1366 template <typename T, size_t N>
1368  const Vec128<T, N> v) {
1369  return Mask128<T, N>{_mm_movepi32_mask(v.raw)};
1370 }
1371 template <typename T, size_t N>
1373  const Vec128<T, N> v) {
1374  return Mask128<T, N>{_mm_movepi64_mask(v.raw)};
1375 }
1376 
1377 } // namespace detail
1378 
1379 template <typename T, size_t N>
1380 HWY_API Mask128<T, N> MaskFromVec(const Vec128<T, N> v) {
1381  return detail::MaskFromVec(hwy::SizeTag<sizeof(T)>(), v);
1382 }
1383 // There do not seem to be native floating-point versions of these instructions.
1384 template <size_t N>
1386  const RebindToSigned<DFromV<decltype(v)>> di;
1387  return Mask128<float, N>{MaskFromVec(BitCast(di, v)).raw};
1388 }
1389 template <size_t N>
1391  const RebindToSigned<DFromV<decltype(v)>> di;
1392  return Mask128<double, N>{MaskFromVec(BitCast(di, v)).raw};
1393 }
1394 
1395 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
1397  return Vec128<T, N>{_mm_movm_epi8(v.raw)};
1398 }
1399 
1400 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
1401 HWY_API Vec128<T, N> VecFromMask(const Mask128<T, N> v) {
1402  return Vec128<T, N>{_mm_movm_epi16(v.raw)};
1403 }
1404 
1405 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
1406 HWY_API Vec128<T, N> VecFromMask(const Mask128<T, N> v) {
1407  return Vec128<T, N>{_mm_movm_epi32(v.raw)};
1408 }
1409 
1410 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
1411 HWY_API Vec128<T, N> VecFromMask(const Mask128<T, N> v) {
1412  return Vec128<T, N>{_mm_movm_epi64(v.raw)};
1413 }
1414 
1415 template <size_t N>
1417  return Vec128<float, N>{_mm_castsi128_ps(_mm_movm_epi32(v.raw))};
1418 }
1419 
1420 template <size_t N>
1422  return Vec128<double, N>{_mm_castsi128_pd(_mm_movm_epi64(v.raw))};
1423 }
1424 
1425 template <typename T, size_t N>
1426 HWY_API Vec128<T, N> VecFromMask(Simd<T, N, 0> /* tag */,
1427  const Mask128<T, N> v) {
1428  return VecFromMask(v);
1429 }
1430 
1431 #else // AVX2 or below
1432 
1433 // Comparisons fill a lane with 1-bits if the condition is true, else 0.
1434 
1435 template <typename TFrom, typename TTo, size_t N>
1436 HWY_API Mask128<TTo, N> RebindMask(Simd<TTo, N, 0> /*tag*/,
1437  Mask128<TFrom, N> m) {
1438  static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size");
1439  const Simd<TFrom, N, 0> d;
1440  return MaskFromVec(BitCast(Simd<TTo, N, 0>(), VecFromMask(d, m)));
1441 }
1442 
1443 template <typename T, size_t N>
1444 HWY_API Mask128<T, N> TestBit(Vec128<T, N> v, Vec128<T, N> bit) {
1445  static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported");
1446  return (v & bit) == bit;
1447 }
1448 
1449 // ------------------------------ Equality
1450 
1451 // Unsigned
1452 template <size_t N>
1453 HWY_API Mask128<uint8_t, N> operator==(const Vec128<uint8_t, N> a,
1454  const Vec128<uint8_t, N> b) {
1455  return Mask128<uint8_t, N>{_mm_cmpeq_epi8(a.raw, b.raw)};
1456 }
1457 template <size_t N>
1458 HWY_API Mask128<uint16_t, N> operator==(const Vec128<uint16_t, N> a,
1459  const Vec128<uint16_t, N> b) {
1460  return Mask128<uint16_t, N>{_mm_cmpeq_epi16(a.raw, b.raw)};
1461 }
1462 template <size_t N>
1463 HWY_API Mask128<uint32_t, N> operator==(const Vec128<uint32_t, N> a,
1464  const Vec128<uint32_t, N> b) {
1465  return Mask128<uint32_t, N>{_mm_cmpeq_epi32(a.raw, b.raw)};
1466 }
1467 template <size_t N>
1468 HWY_API Mask128<uint64_t, N> operator==(const Vec128<uint64_t, N> a,
1469  const Vec128<uint64_t, N> b) {
1470 #if HWY_TARGET == HWY_SSSE3
1471  const Simd<uint32_t, N * 2, 0> d32;
1472  const Simd<uint64_t, N, 0> d64;
1473  const auto cmp32 = VecFromMask(d32, Eq(BitCast(d32, a), BitCast(d32, b)));
1474  const auto cmp64 = cmp32 & Shuffle2301(cmp32);
1475  return MaskFromVec(BitCast(d64, cmp64));
1476 #else
1477  return Mask128<uint64_t, N>{_mm_cmpeq_epi64(a.raw, b.raw)};
1478 #endif
1479 }
1480 
1481 // Signed
1482 template <size_t N>
1483 HWY_API Mask128<int8_t, N> operator==(const Vec128<int8_t, N> a,
1484  const Vec128<int8_t, N> b) {
1485  return Mask128<int8_t, N>{_mm_cmpeq_epi8(a.raw, b.raw)};
1486 }
1487 template <size_t N>
1488 HWY_API Mask128<int16_t, N> operator==(Vec128<int16_t, N> a,
1489  Vec128<int16_t, N> b) {
1490  return Mask128<int16_t, N>{_mm_cmpeq_epi16(a.raw, b.raw)};
1491 }
1492 template <size_t N>
1493 HWY_API Mask128<int32_t, N> operator==(const Vec128<int32_t, N> a,
1494  const Vec128<int32_t, N> b) {
1495  return Mask128<int32_t, N>{_mm_cmpeq_epi32(a.raw, b.raw)};
1496 }
1497 template <size_t N>
1498 HWY_API Mask128<int64_t, N> operator==(const Vec128<int64_t, N> a,
1499  const Vec128<int64_t, N> b) {
1500  // Same as signed ==; avoid duplicating the SSSE3 version.
1501  const DFromV<decltype(a)> d;
1502  RebindToUnsigned<decltype(d)> du;
1503  return RebindMask(d, BitCast(du, a) == BitCast(du, b));
1504 }
1505 
1506 // Float
1507 template <size_t N>
1508 HWY_API Mask128<float, N> operator==(const Vec128<float, N> a,
1509  const Vec128<float, N> b) {
1510  return Mask128<float, N>{_mm_cmpeq_ps(a.raw, b.raw)};
1511 }
1512 template <size_t N>
1513 HWY_API Mask128<double, N> operator==(const Vec128<double, N> a,
1514  const Vec128<double, N> b) {
1515  return Mask128<double, N>{_mm_cmpeq_pd(a.raw, b.raw)};
1516 }
1517 
1518 // ------------------------------ Inequality
1519 
1520 template <typename T, size_t N, HWY_IF_NOT_FLOAT(T)>
1521 HWY_API Mask128<T, N> operator!=(const Vec128<T, N> a, const Vec128<T, N> b) {
1522  return Not(a == b);
1523 }
1524 
1525 template <size_t N>
1526 HWY_API Mask128<float, N> operator!=(const Vec128<float, N> a,
1527  const Vec128<float, N> b) {
1528  return Mask128<float, N>{_mm_cmpneq_ps(a.raw, b.raw)};
1529 }
1530 template <size_t N>
1531 HWY_API Mask128<double, N> operator!=(const Vec128<double, N> a,
1532  const Vec128<double, N> b) {
1533  return Mask128<double, N>{_mm_cmpneq_pd(a.raw, b.raw)};
1534 }
1535 
1536 // ------------------------------ Strict inequality
1537 
1538 // Signed/float <
1539 template <size_t N>
1540 HWY_API Mask128<int8_t, N> operator>(Vec128<int8_t, N> a, Vec128<int8_t, N> b) {
1541  return Mask128<int8_t, N>{_mm_cmpgt_epi8(a.raw, b.raw)};
1542 }
1543 template <size_t N>
1544 HWY_API Mask128<int16_t, N> operator>(Vec128<int16_t, N> a,
1545  Vec128<int16_t, N> b) {
1546  return Mask128<int16_t, N>{_mm_cmpgt_epi16(a.raw, b.raw)};
1547 }
1548 template <size_t N>
1549 HWY_API Mask128<int32_t, N> operator>(Vec128<int32_t, N> a,
1550  Vec128<int32_t, N> b) {
1551  return Mask128<int32_t, N>{_mm_cmpgt_epi32(a.raw, b.raw)};
1552 }
1553 
1554 template <typename T, size_t N, HWY_IF_UNSIGNED(T)>
1555 HWY_API Mask128<T, N> operator>(Vec128<T, N> a, Vec128<T, N> b) {
1556  const DFromV<decltype(a)> du;
1557  const RebindToSigned<decltype(du)> di;
1558  const Vec128<T, N> msb = Set(du, (LimitsMax<T>() >> 1) + 1);
1559  return RebindMask(du, BitCast(di, Xor(a, msb)) > BitCast(di, Xor(b, msb)));
1560 }
1561 
1562 template <size_t N>
1563 HWY_API Mask128<float, N> operator>(Vec128<float, N> a, Vec128<float, N> b) {
1564  return Mask128<float, N>{_mm_cmpgt_ps(a.raw, b.raw)};
1565 }
1566 template <size_t N>
1567 HWY_API Mask128<double, N> operator>(Vec128<double, N> a, Vec128<double, N> b) {
1568  return Mask128<double, N>{_mm_cmpgt_pd(a.raw, b.raw)};
1569 }
1570 
1571 template <size_t N>
1572 HWY_API Mask128<int64_t, N> operator>(const Vec128<int64_t, N> a,
1573  const Vec128<int64_t, N> b) {
1574 #if HWY_TARGET == HWY_SSSE3
1575  // If the upper half is less than or greater, this is the answer.
1576  const __m128i m_gt = _mm_cmpgt_epi32(a.raw, b.raw);
1577 
1578  // Otherwise, the lower half decides.
1579  const __m128i m_eq = _mm_cmpeq_epi32(a.raw, b.raw);
1580  const __m128i lo_in_hi = _mm_shuffle_epi32(m_gt, _MM_SHUFFLE(2, 2, 0, 0));
1581  const __m128i lo_gt = _mm_and_si128(m_eq, lo_in_hi);
1582 
1583  const __m128i gt = _mm_or_si128(lo_gt, m_gt);
1584  // Copy result in upper 32 bits to lower 32 bits.
1585  return Mask128<int64_t, N>{_mm_shuffle_epi32(gt, _MM_SHUFFLE(3, 3, 1, 1))};
1586 #else
1587  return Mask128<int64_t, N>{_mm_cmpgt_epi64(a.raw, b.raw)}; // SSE4.2
1588 #endif
1589 }
1590 
1591 // ------------------------------ Weak inequality
1592 
1593 template <size_t N>
1594 HWY_API Mask128<float, N> operator>=(const Vec128<float, N> a,
1595  const Vec128<float, N> b) {
1596  return Mask128<float, N>{_mm_cmpge_ps(a.raw, b.raw)};
1597 }
1598 template <size_t N>
1599 HWY_API Mask128<double, N> operator>=(const Vec128<double, N> a,
1600  const Vec128<double, N> b) {
1601  return Mask128<double, N>{_mm_cmpge_pd(a.raw, b.raw)};
1602 }
1603 
1604 #endif // HWY_TARGET <= HWY_AVX3
1605 
1606 // ------------------------------ Reversed comparisons
1607 
1608 template <typename T, size_t N>
1609 HWY_API Mask128<T, N> operator<(Vec128<T, N> a, Vec128<T, N> b) {
1610  return b > a;
1611 }
1612 
1613 template <typename T, size_t N>
1615  return b >= a;
1616 }
1617 
1618 // ------------------------------ FirstN (Iota, Lt)
1619 
1620 template <typename T, size_t N, HWY_IF_LE128(T, N)>
1622 #if HWY_TARGET <= HWY_AVX3
1623  (void)d;
1624  const uint64_t all = (1ull << N) - 1;
1625  // BZHI only looks at the lower 8 bits of num!
1626  const uint64_t bits = (num > 255) ? all : _bzhi_u64(all, num);
1627  return Mask128<T, N>::FromBits(bits);
1628 #else
1629  const RebindToSigned<decltype(d)> di; // Signed comparisons are cheaper.
1630  return RebindMask(d, Iota(di, 0) < Set(di, static_cast<MakeSigned<T>>(num)));
1631 #endif
1632 }
1633 
1634 template <class D>
1635 using MFromD = decltype(FirstN(D(), 0));
1636 
1637 // ================================================== MEMORY (1)
1638 
1639 // Clang static analysis claims the memory immediately after a partial vector
1640 // store is uninitialized, and also flags the input to partial loads (at least
1641 // for loadl_pd) as "garbage". This is a false alarm because msan does not
1642 // raise errors. We work around this by using CopyBytes instead of intrinsics,
1643 // but only for the analyzer to avoid potentially bad code generation.
1644 // Unfortunately __clang_analyzer__ was not defined for clang-tidy prior to v7.
1645 #ifndef HWY_SAFE_PARTIAL_LOAD_STORE
1646 #if defined(__clang_analyzer__) || \
1647  (HWY_COMPILER_CLANG != 0 && HWY_COMPILER_CLANG < 700)
1648 #define HWY_SAFE_PARTIAL_LOAD_STORE 1
1649 #else
1650 #define HWY_SAFE_PARTIAL_LOAD_STORE 0
1651 #endif
1652 #endif // HWY_SAFE_PARTIAL_LOAD_STORE
1653 
1654 // ------------------------------ Load
1655 
1656 template <typename T>
1657 HWY_API Vec128<T> Load(Full128<T> /* tag */, const T* HWY_RESTRICT aligned) {
1658  return Vec128<T>{_mm_load_si128(reinterpret_cast<const __m128i*>(aligned))};
1659 }
1661  const float* HWY_RESTRICT aligned) {
1662  return Vec128<float>{_mm_load_ps(aligned)};
1663 }
1665  const double* HWY_RESTRICT aligned) {
1666  return Vec128<double>{_mm_load_pd(aligned)};
1667 }
1668 
1669 template <typename T>
1671  return Vec128<T>{_mm_loadu_si128(reinterpret_cast<const __m128i*>(p))};
1672 }
1673 HWY_API Vec128<float> LoadU(Full128<float> /* tag */,
1674  const float* HWY_RESTRICT p) {
1675  return Vec128<float>{_mm_loadu_ps(p)};
1676 }
1678  const double* HWY_RESTRICT p) {
1679  return Vec128<double>{_mm_loadu_pd(p)};
1680 }
1681 
1682 template <typename T>
1683 HWY_API Vec64<T> Load(Full64<T> /* tag */, const T* HWY_RESTRICT p) {
1684 #if HWY_SAFE_PARTIAL_LOAD_STORE
1685  __m128i v = _mm_setzero_si128();
1686  CopyBytes<8>(p, &v);
1687  return Vec64<T>{v};
1688 #else
1689  return Vec64<T>{_mm_loadl_epi64(reinterpret_cast<const __m128i*>(p))};
1690 #endif
1691 }
1692 
1694  const float* HWY_RESTRICT p) {
1695 #if HWY_SAFE_PARTIAL_LOAD_STORE
1696  __m128 v = _mm_setzero_ps();
1697  CopyBytes<8>(p, &v);
1698  return Vec128<float, 2>{v};
1699 #else
1700  const __m128 hi = _mm_setzero_ps();
1701  return Vec128<float, 2>{_mm_loadl_pi(hi, reinterpret_cast<const __m64*>(p))};
1702 #endif
1703 }
1704 
1706  const double* HWY_RESTRICT p) {
1707 #if HWY_SAFE_PARTIAL_LOAD_STORE
1708  __m128d v = _mm_setzero_pd();
1709  CopyBytes<8>(p, &v);
1710  return Vec64<double>{v};
1711 #else
1712  return Vec64<double>{_mm_load_sd(p)};
1713 #endif
1714 }
1715 
1717  const float* HWY_RESTRICT p) {
1718 #if HWY_SAFE_PARTIAL_LOAD_STORE
1719  __m128 v = _mm_setzero_ps();
1720  CopyBytes<4>(p, &v);
1721  return Vec128<float, 1>{v};
1722 #else
1723  return Vec128<float, 1>{_mm_load_ss(p)};
1724 #endif
1725 }
1726 
1727 // Any <= 32 bit except <float, 1>
1728 template <typename T, size_t N, HWY_IF_LE32(T, N)>
1729 HWY_API Vec128<T, N> Load(Simd<T, N, 0> /* tag */, const T* HWY_RESTRICT p) {
1730  constexpr size_t kSize = sizeof(T) * N;
1731 #if HWY_SAFE_PARTIAL_LOAD_STORE
1732  __m128 v = _mm_setzero_ps();
1733  CopyBytes<kSize>(p, &v);
1734  return Vec128<T, N>{v};
1735 #else
1736  int32_t bits;
1737  CopyBytes<kSize>(p, &bits);
1738  return Vec128<T, N>{_mm_cvtsi32_si128(bits)};
1739 #endif
1740 }
1741 
1742 // For < 128 bit, LoadU == Load.
1743 template <typename T, size_t N, HWY_IF_LE64(T, N)>
1745  return Load(d, p);
1746 }
1747 
1748 // 128-bit SIMD => nothing to duplicate, same as an unaligned load.
1749 template <typename T, size_t N, HWY_IF_LE128(T, N)>
1750 HWY_API Vec128<T, N> LoadDup128(Simd<T, N, 0> d, const T* HWY_RESTRICT p) {
1751  return LoadU(d, p);
1752 }
1753 
1754 // Returns a vector with lane i=[0, N) set to "first" + i.
1755 template <typename T, size_t N, typename T2, HWY_IF_LE128(T, N)>
1756 HWY_API Vec128<T, N> Iota(const Simd<T, N, 0> d, const T2 first) {
1757  HWY_ALIGN T lanes[16 / sizeof(T)];
1758  for (size_t i = 0; i < 16 / sizeof(T); ++i) {
1759  lanes[i] = static_cast<T>(first + static_cast<T2>(i));
1760  }
1761  return Load(d, lanes);
1762 }
1763 
1764 // ------------------------------ MaskedLoad
1765 
1766 #if HWY_TARGET <= HWY_AVX3
1767 
1768 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
1770  const T* HWY_RESTRICT p) {
1771  return Vec128<T, N>{_mm_maskz_loadu_epi8(m.raw, p)};
1772 }
1773 
1774 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
1775 HWY_API Vec128<T, N> MaskedLoad(Mask128<T, N> m, Simd<T, N, 0> /* tag */,
1776  const T* HWY_RESTRICT p) {
1777  return Vec128<T, N>{_mm_maskz_loadu_epi16(m.raw, p)};
1778 }
1779 
1780 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
1781 HWY_API Vec128<T, N> MaskedLoad(Mask128<T, N> m, Simd<T, N, 0> /* tag */,
1782  const T* HWY_RESTRICT p) {
1783  return Vec128<T, N>{_mm_maskz_loadu_epi32(m.raw, p)};
1784 }
1785 
1786 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
1787 HWY_API Vec128<T, N> MaskedLoad(Mask128<T, N> m, Simd<T, N, 0> /* tag */,
1788  const T* HWY_RESTRICT p) {
1789  return Vec128<T, N>{_mm_maskz_loadu_epi64(m.raw, p)};
1790 }
1791 
1792 template <size_t N>
1794  Simd<float, N, 0> /* tag */,
1795  const float* HWY_RESTRICT p) {
1796  return Vec128<float, N>{_mm_maskz_loadu_ps(m.raw, p)};
1797 }
1798 
1799 template <size_t N>
1801  Simd<double, N, 0> /* tag */,
1802  const double* HWY_RESTRICT p) {
1803  return Vec128<double, N>{_mm_maskz_loadu_pd(m.raw, p)};
1804 }
1805 
1806 #elif HWY_TARGET == HWY_AVX2
1807 
1808 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
1809 HWY_API Vec128<T, N> MaskedLoad(Mask128<T, N> m, Simd<T, N, 0> /* tag */,
1810  const T* HWY_RESTRICT p) {
1811  auto p_p = reinterpret_cast<const int*>(p); // NOLINT
1812  return Vec128<T, N>{_mm_maskload_epi32(p_p, m.raw)};
1813 }
1814 
1815 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
1816 HWY_API Vec128<T, N> MaskedLoad(Mask128<T, N> m, Simd<T, N, 0> /* tag */,
1817  const T* HWY_RESTRICT p) {
1818  auto p_p = reinterpret_cast<const long long*>(p); // NOLINT
1819  return Vec128<T, N>{_mm_maskload_epi64(p_p, m.raw)};
1820 }
1821 
1822 template <size_t N>
1823 HWY_API Vec128<float, N> MaskedLoad(Mask128<float, N> m, Simd<float, N, 0> d,
1824  const float* HWY_RESTRICT p) {
1825  const Vec128<int32_t, N> mi =
1826  BitCast(RebindToSigned<decltype(d)>(), VecFromMask(d, m));
1827  return Vec128<float, N>{_mm_maskload_ps(p, mi.raw)};
1828 }
1829 
1830 template <size_t N>
1831 HWY_API Vec128<double, N> MaskedLoad(Mask128<double, N> m, Simd<double, N, 0> d,
1832  const double* HWY_RESTRICT p) {
1833  const Vec128<int64_t, N> mi =
1834  BitCast(RebindToSigned<decltype(d)>(), VecFromMask(d, m));
1835  return Vec128<double, N>{_mm_maskload_pd(p, mi.raw)};
1836 }
1837 
1838 // There is no maskload_epi8/16, so blend instead.
1839 template <typename T, size_t N, hwy::EnableIf<sizeof(T) <= 2>* = nullptr>
1840 HWY_API Vec128<T, N> MaskedLoad(Mask128<T, N> m, Simd<T, N, 0> d,
1841  const T* HWY_RESTRICT p) {
1842  return IfThenElseZero(m, Load(d, p));
1843 }
1844 
1845 #else // <= SSE4
1846 
1847 // Avoid maskmov* - its nontemporal 'hint' causes it to bypass caches (slow).
1848 template <typename T, size_t N>
1849 HWY_API Vec128<T, N> MaskedLoad(Mask128<T, N> m, Simd<T, N, 0> d,
1850  const T* HWY_RESTRICT p) {
1851  return IfThenElseZero(m, Load(d, p));
1852 }
1853 
1854 #endif
1855 
1856 // ------------------------------ Store
1857 
1858 template <typename T>
1859 HWY_API void Store(Vec128<T> v, Full128<T> /* tag */, T* HWY_RESTRICT aligned) {
1860  _mm_store_si128(reinterpret_cast<__m128i*>(aligned), v.raw);
1861 }
1863  float* HWY_RESTRICT aligned) {
1864  _mm_store_ps(aligned, v.raw);
1865 }
1867  double* HWY_RESTRICT aligned) {
1868  _mm_store_pd(aligned, v.raw);
1869 }
1870 
1871 template <typename T>
1873  _mm_storeu_si128(reinterpret_cast<__m128i*>(p), v.raw);
1874 }
1875 HWY_API void StoreU(const Vec128<float> v, Full128<float> /* tag */,
1876  float* HWY_RESTRICT p) {
1877  _mm_storeu_ps(p, v.raw);
1878 }
1880  double* HWY_RESTRICT p) {
1881  _mm_storeu_pd(p, v.raw);
1882 }
1883 
1884 template <typename T>
1885 HWY_API void Store(Vec64<T> v, Full64<T> /* tag */, T* HWY_RESTRICT p) {
1886 #if HWY_SAFE_PARTIAL_LOAD_STORE
1887  CopyBytes<8>(&v, p);
1888 #else
1889  _mm_storel_epi64(reinterpret_cast<__m128i*>(p), v.raw);
1890 #endif
1891 }
1893  float* HWY_RESTRICT p) {
1894 #if HWY_SAFE_PARTIAL_LOAD_STORE
1895  CopyBytes<8>(&v, p);
1896 #else
1897  _mm_storel_pi(reinterpret_cast<__m64*>(p), v.raw);
1898 #endif
1899 }
1901  double* HWY_RESTRICT p) {
1902 #if HWY_SAFE_PARTIAL_LOAD_STORE
1903  CopyBytes<8>(&v, p);
1904 #else
1905  _mm_storel_pd(p, v.raw);
1906 #endif
1907 }
1908 
1909 // Any <= 32 bit except <float, 1>
1910 template <typename T, size_t N, HWY_IF_LE32(T, N)>
1911 HWY_API void Store(Vec128<T, N> v, Simd<T, N, 0> /* tag */, T* HWY_RESTRICT p) {
1912  CopyBytes<sizeof(T) * N>(&v, p);
1913 }
1915  float* HWY_RESTRICT p) {
1916 #if HWY_SAFE_PARTIAL_LOAD_STORE
1917  CopyBytes<4>(&v, p);
1918 #else
1919  _mm_store_ss(p, v.raw);
1920 #endif
1921 }
1922 
1923 // For < 128 bit, StoreU == Store.
1924 template <typename T, size_t N, HWY_IF_LE64(T, N)>
1926  Store(v, d, p);
1927 }
1928 
1929 // ------------------------------ BlendedStore
1930 
1931 namespace detail {
1932 
1933 // There is no maskload_epi8/16 with which we could safely implement
1934 // BlendedStore. Manual blending is also unsafe because loading a full vector
1935 // that crosses the array end causes asan faults. Resort to scalar code; the
1936 // caller should instead use memcpy, assuming m is FirstN(d, n).
1937 template <typename T, size_t N>
1939  T* HWY_RESTRICT p) {
1940  const RebindToSigned<decltype(d)> di; // for testing mask if T=bfloat16_t.
1941  using TI = TFromD<decltype(di)>;
1942  alignas(16) TI buf[N];
1943  alignas(16) TI mask[N];
1944  Store(BitCast(di, v), di, buf);
1945  Store(BitCast(di, VecFromMask(d, m)), di, mask);
1946  for (size_t i = 0; i < N; ++i) {
1947  if (mask[i]) {
1948  CopyBytes<sizeof(T)>(buf + i, p + i);
1949  }
1950  }
1951 }
1952 } // namespace detail
1953 
1954 #if HWY_TARGET <= HWY_AVX3
1955 
1956 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
1958  Simd<T, N, 0> /* tag */, T* HWY_RESTRICT p) {
1959  _mm_mask_storeu_epi8(p, m.raw, v.raw);
1960 }
1961 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
1962 HWY_API void BlendedStore(Vec128<T, N> v, Mask128<T, N> m,
1963  Simd<T, N, 0> /* tag */, T* HWY_RESTRICT p) {
1964  _mm_mask_storeu_epi16(p, m.raw, v.raw);
1965 }
1966 
1967 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
1968 HWY_API void BlendedStore(Vec128<T, N> v, Mask128<T, N> m,
1969  Simd<T, N, 0> /* tag */, T* HWY_RESTRICT p) {
1970  auto pi = reinterpret_cast<int*>(p); // NOLINT
1971  _mm_mask_storeu_epi32(pi, m.raw, v.raw);
1972 }
1973 
1974 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
1975 HWY_API void BlendedStore(Vec128<T, N> v, Mask128<T, N> m,
1976  Simd<T, N, 0> /* tag */, T* HWY_RESTRICT p) {
1977  auto pi = reinterpret_cast<long long*>(p); // NOLINT
1978  _mm_mask_storeu_epi64(pi, m.raw, v.raw);
1979 }
1980 
1981 template <size_t N>
1983  Simd<float, N, 0>, float* HWY_RESTRICT p) {
1984  _mm_mask_storeu_ps(p, m.raw, v.raw);
1985 }
1986 
1987 template <size_t N>
1989  Simd<double, N, 0>, double* HWY_RESTRICT p) {
1990  _mm_mask_storeu_pd(p, m.raw, v.raw);
1991 }
1992 
1993 #elif HWY_TARGET == HWY_AVX2
1994 
1995 template <typename T, size_t N, hwy::EnableIf<sizeof(T) <= 2>* = nullptr>
1996 HWY_API void BlendedStore(Vec128<T, N> v, Mask128<T, N> m, Simd<T, N, 0> d,
1997  T* HWY_RESTRICT p) {
1998  detail::ScalarMaskedStore(v, m, d, p);
1999 }
2000 
2001 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
2002 HWY_API void BlendedStore(Vec128<T, N> v, Mask128<T, N> m, Simd<T, N, 0> d,
2003  T* HWY_RESTRICT p) {
2004 #if HWY_IS_ASAN
2005  // HACK: asan raises errors for partial vectors.
2006  if (N < 4) {
2007  detail::ScalarMaskedStore(v, m, d, p);
2008  return;
2009  }
2010 #endif
2011 
2012  const RebindToSigned<decltype(d)> di;
2013  auto pi = reinterpret_cast<int*>(p); // NOLINT
2014  const Vec128<int32_t, N> vi = BitCast(di, v);
2015  _mm_maskstore_epi32(pi, m.raw, vi.raw);
2016 }
2017 
2018 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
2019 HWY_API void BlendedStore(Vec128<T, N> v, Mask128<T, N> m, Simd<T, N, 0> d,
2020  T* HWY_RESTRICT p) {
2021 #if HWY_IS_ASAN
2022  // HACK: asan raises errors for partial vectors.
2023  if (N < 2) {
2024  detail::ScalarMaskedStore(v, m, d, p);
2025  return;
2026  }
2027 #endif
2028 
2029  auto pi = reinterpret_cast<long long*>(p); // NOLINT
2030  const Vec128<int64_t, N> vi = BitCast(RebindToSigned<decltype(d)>(), v);
2031  _mm_maskstore_epi64(pi, m.raw, vi.raw);
2032 }
2033 
2034 template <size_t N>
2035 HWY_API void BlendedStore(Vec128<float, N> v, Mask128<float, N> m,
2036  Simd<float, N, 0> d, float* HWY_RESTRICT p) {
2037 #if HWY_IS_ASAN
2038  // HACK: asan raises errors for partial vectors.
2039  if (N < 4) {
2040  detail::ScalarMaskedStore(v, m, d, p);
2041  return;
2042  }
2043 #endif
2044 
2045  const Vec128<int32_t, N> mi =
2046  BitCast(RebindToSigned<decltype(d)>(), VecFromMask(d, m));
2047  _mm_maskstore_ps(p, mi.raw, v.raw);
2048 }
2049 
2050 template <size_t N>
2051 HWY_API void BlendedStore(Vec128<double, N> v, Mask128<double, N> m,
2052  Simd<double, N, 0> d, double* HWY_RESTRICT p) {
2053 #if HWY_IS_ASAN
2054  // HACK: asan raises errors for partial vectors.
2055  if (N < 2) {
2056  detail::ScalarMaskedStore(v, m, d, p);
2057  return;
2058  }
2059 #endif
2060 
2061  const Vec128<int64_t, N> mi =
2062  BitCast(RebindToSigned<decltype(d)>(), VecFromMask(d, m));
2063  _mm_maskstore_pd(p, mi.raw, v.raw);
2064 }
2065 
2066 #else // <= SSE4
2067 
2068 template <typename T, size_t N>
2069 HWY_API void BlendedStore(Vec128<T, N> v, Mask128<T, N> m, Simd<T, N, 0> d,
2070  T* HWY_RESTRICT p) {
2071  // Avoid maskmov* - its nontemporal 'hint' causes it to bypass caches (slow).
2072  detail::ScalarMaskedStore(v, m, d, p);
2073 }
2074 
2075 #endif // SSE4
2076 
2077 // ================================================== ARITHMETIC
2078 
2079 // ------------------------------ Addition
2080 
2081 // Unsigned
2082 template <size_t N>
2083 HWY_API Vec128<uint8_t, N> operator+(const Vec128<uint8_t, N> a,
2084  const Vec128<uint8_t, N> b) {
2085  return Vec128<uint8_t, N>{_mm_add_epi8(a.raw, b.raw)};
2086 }
2087 template <size_t N>
2088 HWY_API Vec128<uint16_t, N> operator+(const Vec128<uint16_t, N> a,
2089  const Vec128<uint16_t, N> b) {
2090  return Vec128<uint16_t, N>{_mm_add_epi16(a.raw, b.raw)};
2091 }
2092 template <size_t N>
2093 HWY_API Vec128<uint32_t, N> operator+(const Vec128<uint32_t, N> a,
2094  const Vec128<uint32_t, N> b) {
2095  return Vec128<uint32_t, N>{_mm_add_epi32(a.raw, b.raw)};
2096 }
2097 template <size_t N>
2098 HWY_API Vec128<uint64_t, N> operator+(const Vec128<uint64_t, N> a,
2099  const Vec128<uint64_t, N> b) {
2100  return Vec128<uint64_t, N>{_mm_add_epi64(a.raw, b.raw)};
2101 }
2102 
2103 // Signed
2104 template <size_t N>
2105 HWY_API Vec128<int8_t, N> operator+(const Vec128<int8_t, N> a,
2106  const Vec128<int8_t, N> b) {
2107  return Vec128<int8_t, N>{_mm_add_epi8(a.raw, b.raw)};
2108 }
2109 template <size_t N>
2110 HWY_API Vec128<int16_t, N> operator+(const Vec128<int16_t, N> a,
2111  const Vec128<int16_t, N> b) {
2112  return Vec128<int16_t, N>{_mm_add_epi16(a.raw, b.raw)};
2113 }
2114 template <size_t N>
2115 HWY_API Vec128<int32_t, N> operator+(const Vec128<int32_t, N> a,
2116  const Vec128<int32_t, N> b) {
2117  return Vec128<int32_t, N>{_mm_add_epi32(a.raw, b.raw)};
2118 }
2119 template <size_t N>
2120 HWY_API Vec128<int64_t, N> operator+(const Vec128<int64_t, N> a,
2121  const Vec128<int64_t, N> b) {
2122  return Vec128<int64_t, N>{_mm_add_epi64(a.raw, b.raw)};
2123 }
2124 
2125 // Float
2126 template <size_t N>
2127 HWY_API Vec128<float, N> operator+(const Vec128<float, N> a,
2128  const Vec128<float, N> b) {
2129  return Vec128<float, N>{_mm_add_ps(a.raw, b.raw)};
2130 }
2131 template <size_t N>
2133  const Vec128<double, N> b) {
2134  return Vec128<double, N>{_mm_add_pd(a.raw, b.raw)};
2135 }
2136 
2137 // ------------------------------ Subtraction
2138 
2139 // Unsigned
2140 template <size_t N>
2141 HWY_API Vec128<uint8_t, N> operator-(const Vec128<uint8_t, N> a,
2142  const Vec128<uint8_t, N> b) {
2143  return Vec128<uint8_t, N>{_mm_sub_epi8(a.raw, b.raw)};
2144 }
2145 template <size_t N>
2146 HWY_API Vec128<uint16_t, N> operator-(Vec128<uint16_t, N> a,
2147  Vec128<uint16_t, N> b) {
2148  return Vec128<uint16_t, N>{_mm_sub_epi16(a.raw, b.raw)};
2149 }
2150 template <size_t N>
2151 HWY_API Vec128<uint32_t, N> operator-(const Vec128<uint32_t, N> a,
2152  const Vec128<uint32_t, N> b) {
2153  return Vec128<uint32_t, N>{_mm_sub_epi32(a.raw, b.raw)};
2154 }
2155 template <size_t N>
2156 HWY_API Vec128<uint64_t, N> operator-(const Vec128<uint64_t, N> a,
2157  const Vec128<uint64_t, N> b) {
2158  return Vec128<uint64_t, N>{_mm_sub_epi64(a.raw, b.raw)};
2159 }
2160 
2161 // Signed
2162 template <size_t N>
2163 HWY_API Vec128<int8_t, N> operator-(const Vec128<int8_t, N> a,
2164  const Vec128<int8_t, N> b) {
2165  return Vec128<int8_t, N>{_mm_sub_epi8(a.raw, b.raw)};
2166 }
2167 template <size_t N>
2168 HWY_API Vec128<int16_t, N> operator-(const Vec128<int16_t, N> a,
2169  const Vec128<int16_t, N> b) {
2170  return Vec128<int16_t, N>{_mm_sub_epi16(a.raw, b.raw)};
2171 }
2172 template <size_t N>
2173 HWY_API Vec128<int32_t, N> operator-(const Vec128<int32_t, N> a,
2174  const Vec128<int32_t, N> b) {
2175  return Vec128<int32_t, N>{_mm_sub_epi32(a.raw, b.raw)};
2176 }
2177 template <size_t N>
2178 HWY_API Vec128<int64_t, N> operator-(const Vec128<int64_t, N> a,
2179  const Vec128<int64_t, N> b) {
2180  return Vec128<int64_t, N>{_mm_sub_epi64(a.raw, b.raw)};
2181 }
2182 
2183 // Float
2184 template <size_t N>
2185 HWY_API Vec128<float, N> operator-(const Vec128<float, N> a,
2186  const Vec128<float, N> b) {
2187  return Vec128<float, N>{_mm_sub_ps(a.raw, b.raw)};
2188 }
2189 template <size_t N>
2191  const Vec128<double, N> b) {
2192  return Vec128<double, N>{_mm_sub_pd(a.raw, b.raw)};
2193 }
2194 
2195 // ------------------------------ SumsOf8
2196 template <size_t N>
2197 HWY_API Vec128<uint64_t, N / 8> SumsOf8(const Vec128<uint8_t, N> v) {
2198  return Vec128<uint64_t, N / 8>{_mm_sad_epu8(v.raw, _mm_setzero_si128())};
2199 }
2200 
2201 // ------------------------------ SaturatedAdd
2202 
2203 // Returns a + b clamped to the destination range.
2204 
2205 // Unsigned
2206 template <size_t N>
2207 HWY_API Vec128<uint8_t, N> SaturatedAdd(const Vec128<uint8_t, N> a,
2208  const Vec128<uint8_t, N> b) {
2209  return Vec128<uint8_t, N>{_mm_adds_epu8(a.raw, b.raw)};
2210 }
2211 template <size_t N>
2212 HWY_API Vec128<uint16_t, N> SaturatedAdd(const Vec128<uint16_t, N> a,
2213  const Vec128<uint16_t, N> b) {
2214  return Vec128<uint16_t, N>{_mm_adds_epu16(a.raw, b.raw)};
2215 }
2216 
2217 // Signed
2218 template <size_t N>
2219 HWY_API Vec128<int8_t, N> SaturatedAdd(const Vec128<int8_t, N> a,
2220  const Vec128<int8_t, N> b) {
2221  return Vec128<int8_t, N>{_mm_adds_epi8(a.raw, b.raw)};
2222 }
2223 template <size_t N>
2224 HWY_API Vec128<int16_t, N> SaturatedAdd(const Vec128<int16_t, N> a,
2225  const Vec128<int16_t, N> b) {
2226  return Vec128<int16_t, N>{_mm_adds_epi16(a.raw, b.raw)};
2227 }
2228 
2229 // ------------------------------ SaturatedSub
2230 
2231 // Returns a - b clamped to the destination range.
2232 
2233 // Unsigned
2234 template <size_t N>
2235 HWY_API Vec128<uint8_t, N> SaturatedSub(const Vec128<uint8_t, N> a,
2236  const Vec128<uint8_t, N> b) {
2237  return Vec128<uint8_t, N>{_mm_subs_epu8(a.raw, b.raw)};
2238 }
2239 template <size_t N>
2240 HWY_API Vec128<uint16_t, N> SaturatedSub(const Vec128<uint16_t, N> a,
2241  const Vec128<uint16_t, N> b) {
2242  return Vec128<uint16_t, N>{_mm_subs_epu16(a.raw, b.raw)};
2243 }
2244 
2245 // Signed
2246 template <size_t N>
2247 HWY_API Vec128<int8_t, N> SaturatedSub(const Vec128<int8_t, N> a,
2248  const Vec128<int8_t, N> b) {
2249  return Vec128<int8_t, N>{_mm_subs_epi8(a.raw, b.raw)};
2250 }
2251 template <size_t N>
2252 HWY_API Vec128<int16_t, N> SaturatedSub(const Vec128<int16_t, N> a,
2253  const Vec128<int16_t, N> b) {
2254  return Vec128<int16_t, N>{_mm_subs_epi16(a.raw, b.raw)};
2255 }
2256 
2257 // ------------------------------ AverageRound
2258 
2259 // Returns (a + b + 1) / 2
2260 
2261 // Unsigned
2262 template <size_t N>
2263 HWY_API Vec128<uint8_t, N> AverageRound(const Vec128<uint8_t, N> a,
2264  const Vec128<uint8_t, N> b) {
2265  return Vec128<uint8_t, N>{_mm_avg_epu8(a.raw, b.raw)};
2266 }
2267 template <size_t N>
2268 HWY_API Vec128<uint16_t, N> AverageRound(const Vec128<uint16_t, N> a,
2269  const Vec128<uint16_t, N> b) {
2270  return Vec128<uint16_t, N>{_mm_avg_epu16(a.raw, b.raw)};
2271 }
2272 
2273 // ------------------------------ Integer multiplication
2274 
2275 template <size_t N>
2276 HWY_API Vec128<uint16_t, N> operator*(const Vec128<uint16_t, N> a,
2277  const Vec128<uint16_t, N> b) {
2278  return Vec128<uint16_t, N>{_mm_mullo_epi16(a.raw, b.raw)};
2279 }
2280 template <size_t N>
2281 HWY_API Vec128<int16_t, N> operator*(const Vec128<int16_t, N> a,
2282  const Vec128<int16_t, N> b) {
2283  return Vec128<int16_t, N>{_mm_mullo_epi16(a.raw, b.raw)};
2284 }
2285 
2286 // Returns the upper 16 bits of a * b in each lane.
2287 template <size_t N>
2288 HWY_API Vec128<uint16_t, N> MulHigh(const Vec128<uint16_t, N> a,
2289  const Vec128<uint16_t, N> b) {
2290  return Vec128<uint16_t, N>{_mm_mulhi_epu16(a.raw, b.raw)};
2291 }
2292 template <size_t N>
2293 HWY_API Vec128<int16_t, N> MulHigh(const Vec128<int16_t, N> a,
2294  const Vec128<int16_t, N> b) {
2295  return Vec128<int16_t, N>{_mm_mulhi_epi16(a.raw, b.raw)};
2296 }
2297 
2298 template <size_t N>
2299 HWY_API Vec128<int16_t, N> MulFixedPoint15(const Vec128<int16_t, N> a,
2300  const Vec128<int16_t, N> b) {
2301  return Vec128<int16_t, N>{_mm_mulhrs_epi16(a.raw, b.raw)};
2302 }
2303 
2304 // Multiplies even lanes (0, 2 ..) and places the double-wide result into
2305 // even and the upper half into its odd neighbor lane.
2306 template <size_t N>
2307 HWY_API Vec128<uint64_t, (N + 1) / 2> MulEven(const Vec128<uint32_t, N> a,
2308  const Vec128<uint32_t, N> b) {
2309  return Vec128<uint64_t, (N + 1) / 2>{_mm_mul_epu32(a.raw, b.raw)};
2310 }
2311 
2312 #if HWY_TARGET == HWY_SSSE3
2313 
2314 template <size_t N, HWY_IF_LE64(int32_t, N)> // N=1 or 2
2315 HWY_API Vec128<int64_t, (N + 1) / 2> MulEven(const Vec128<int32_t, N> a,
2316  const Vec128<int32_t, N> b) {
2317  return Set(Simd<int64_t, (N + 1) / 2, 0>(), int64_t(GetLane(a)) * GetLane(b));
2318 }
2319 HWY_API Vec128<int64_t> MulEven(const Vec128<int32_t> a,
2320  const Vec128<int32_t> b) {
2321  alignas(16) int32_t a_lanes[4];
2322  alignas(16) int32_t b_lanes[4];
2323  const Full128<int32_t> di32;
2324  Store(a, di32, a_lanes);
2325  Store(b, di32, b_lanes);
2326  alignas(16) int64_t mul[2];
2327  mul[0] = int64_t(a_lanes[0]) * b_lanes[0];
2328  mul[1] = int64_t(a_lanes[2]) * b_lanes[2];
2329  return Load(Full128<int64_t>(), mul);
2330 }
2331 
2332 #else // HWY_TARGET == HWY_SSSE3
2333 
2334 template <size_t N>
2335 HWY_API Vec128<int64_t, (N + 1) / 2> MulEven(const Vec128<int32_t, N> a,
2336  const Vec128<int32_t, N> b) {
2337  return Vec128<int64_t, (N + 1) / 2>{_mm_mul_epi32(a.raw, b.raw)};
2338 }
2339 
2340 #endif // HWY_TARGET == HWY_SSSE3
2341 
2342 template <size_t N>
2343 HWY_API Vec128<uint32_t, N> operator*(const Vec128<uint32_t, N> a,
2344  const Vec128<uint32_t, N> b) {
2345 #if HWY_TARGET == HWY_SSSE3
2346  // Not as inefficient as it looks: _mm_mullo_epi32 has 10 cycle latency.
2347  // 64-bit right shift would also work but also needs port 5, so no benefit.
2348  // Notation: x=don't care, z=0.
2349  const __m128i a_x3x1 = _mm_shuffle_epi32(a.raw, _MM_SHUFFLE(3, 3, 1, 1));
2350  const auto mullo_x2x0 = MulEven(a, b);
2351  const __m128i b_x3x1 = _mm_shuffle_epi32(b.raw, _MM_SHUFFLE(3, 3, 1, 1));
2352  const auto mullo_x3x1 =
2353  MulEven(Vec128<uint32_t, N>{a_x3x1}, Vec128<uint32_t, N>{b_x3x1});
2354  // We could _mm_slli_epi64 by 32 to get 3z1z and OR with z2z0, but generating
2355  // the latter requires one more instruction or a constant.
2356  const __m128i mul_20 =
2357  _mm_shuffle_epi32(mullo_x2x0.raw, _MM_SHUFFLE(2, 0, 2, 0));
2358  const __m128i mul_31 =
2359  _mm_shuffle_epi32(mullo_x3x1.raw, _MM_SHUFFLE(2, 0, 2, 0));
2360  return Vec128<uint32_t, N>{_mm_unpacklo_epi32(mul_20, mul_31)};
2361 #else
2362  return Vec128<uint32_t, N>{_mm_mullo_epi32(a.raw, b.raw)};
2363 #endif
2364 }
2365 
2366 template <size_t N>
2367 HWY_API Vec128<int32_t, N> operator*(const Vec128<int32_t, N> a,
2368  const Vec128<int32_t, N> b) {
2369  // Same as unsigned; avoid duplicating the SSSE3 code.
2370  const DFromV<decltype(a)> d;
2371  const RebindToUnsigned<decltype(d)> du;
2372  return BitCast(d, BitCast(du, a) * BitCast(du, b));
2373 }
2374 
2375 // ------------------------------ ShiftLeft
2376 
2377 template <int kBits, size_t N>
2378 HWY_API Vec128<uint16_t, N> ShiftLeft(const Vec128<uint16_t, N> v) {
2379  return Vec128<uint16_t, N>{_mm_slli_epi16(v.raw, kBits)};
2380 }
2381 
2382 template <int kBits, size_t N>
2383 HWY_API Vec128<uint32_t, N> ShiftLeft(const Vec128<uint32_t, N> v) {
2384  return Vec128<uint32_t, N>{_mm_slli_epi32(v.raw, kBits)};
2385 }
2386 
2387 template <int kBits, size_t N>
2388 HWY_API Vec128<uint64_t, N> ShiftLeft(const Vec128<uint64_t, N> v) {
2389  return Vec128<uint64_t, N>{_mm_slli_epi64(v.raw, kBits)};
2390 }
2391 
2392 template <int kBits, size_t N>
2393 HWY_API Vec128<int16_t, N> ShiftLeft(const Vec128<int16_t, N> v) {
2394  return Vec128<int16_t, N>{_mm_slli_epi16(v.raw, kBits)};
2395 }
2396 template <int kBits, size_t N>
2397 HWY_API Vec128<int32_t, N> ShiftLeft(const Vec128<int32_t, N> v) {
2398  return Vec128<int32_t, N>{_mm_slli_epi32(v.raw, kBits)};
2399 }
2400 template <int kBits, size_t N>
2401 HWY_API Vec128<int64_t, N> ShiftLeft(const Vec128<int64_t, N> v) {
2402  return Vec128<int64_t, N>{_mm_slli_epi64(v.raw, kBits)};
2403 }
2404 
2405 template <int kBits, typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
2406 HWY_API Vec128<T, N> ShiftLeft(const Vec128<T, N> v) {
2407  const DFromV<decltype(v)> d8;
2408  // Use raw instead of BitCast to support N=1.
2409  const Vec128<T, N> shifted{ShiftLeft<kBits>(Vec128<MakeWide<T>>{v.raw}).raw};
2410  return kBits == 1
2411  ? (v + v)
2412  : (shifted & Set(d8, static_cast<T>((0xFF << kBits) & 0xFF)));
2413 }
2414 
2415 // ------------------------------ ShiftRight
2416 
2417 template <int kBits, size_t N>
2418 HWY_API Vec128<uint16_t, N> ShiftRight(const Vec128<uint16_t, N> v) {
2419  return Vec128<uint16_t, N>{_mm_srli_epi16(v.raw, kBits)};
2420 }
2421 template <int kBits, size_t N>
2422 HWY_API Vec128<uint32_t, N> ShiftRight(const Vec128<uint32_t, N> v) {
2423  return Vec128<uint32_t, N>{_mm_srli_epi32(v.raw, kBits)};
2424 }
2425 template <int kBits, size_t N>
2426 HWY_API Vec128<uint64_t, N> ShiftRight(const Vec128<uint64_t, N> v) {
2427  return Vec128<uint64_t, N>{_mm_srli_epi64(v.raw, kBits)};
2428 }
2429 
2430 template <int kBits, size_t N>
2431 HWY_API Vec128<uint8_t, N> ShiftRight(const Vec128<uint8_t, N> v) {
2432  const DFromV<decltype(v)> d8;
2433  // Use raw instead of BitCast to support N=1.
2434  const Vec128<uint8_t, N> shifted{
2435  ShiftRight<kBits>(Vec128<uint16_t>{v.raw}).raw};
2436  return shifted & Set(d8, 0xFF >> kBits);
2437 }
2438 
2439 template <int kBits, size_t N>
2440 HWY_API Vec128<int16_t, N> ShiftRight(const Vec128<int16_t, N> v) {
2441  return Vec128<int16_t, N>{_mm_srai_epi16(v.raw, kBits)};
2442 }
2443 template <int kBits, size_t N>
2444 HWY_API Vec128<int32_t, N> ShiftRight(const Vec128<int32_t, N> v) {
2445  return Vec128<int32_t, N>{_mm_srai_epi32(v.raw, kBits)};
2446 }
2447 
2448 template <int kBits, size_t N>
2449 HWY_API Vec128<int8_t, N> ShiftRight(const Vec128<int8_t, N> v) {
2450  const DFromV<decltype(v)> di;
2451  const RebindToUnsigned<decltype(di)> du;
2452  const auto shifted = BitCast(di, ShiftRight<kBits>(BitCast(du, v)));
2453  const auto shifted_sign = BitCast(di, Set(du, 0x80 >> kBits));
2454  return (shifted ^ shifted_sign) - shifted_sign;
2455 }
2456 
2457 // i64 is implemented after BroadcastSignBit.
2458 
2459 // ------------------------------ RotateRight (ShiftRight, Or)
2460 
2461 template <int kBits, size_t N>
2462 HWY_API Vec128<uint32_t, N> RotateRight(const Vec128<uint32_t, N> v) {
2463  static_assert(0 <= kBits && kBits < 32, "Invalid shift count");
2464 #if HWY_TARGET <= HWY_AVX3
2465  return Vec128<uint32_t, N>{_mm_ror_epi32(v.raw, kBits)};
2466 #else
2467  if (kBits == 0) return v;
2468  return Or(ShiftRight<kBits>(v), ShiftLeft<HWY_MIN(31, 32 - kBits)>(v));
2469 #endif
2470 }
2471 
2472 template <int kBits, size_t N>
2473 HWY_API Vec128<uint64_t, N> RotateRight(const Vec128<uint64_t, N> v) {
2474  static_assert(0 <= kBits && kBits < 64, "Invalid shift count");
2475 #if HWY_TARGET <= HWY_AVX3
2476  return Vec128<uint64_t, N>{_mm_ror_epi64(v.raw, kBits)};
2477 #else
2478  if (kBits == 0) return v;
2479  return Or(ShiftRight<kBits>(v), ShiftLeft<HWY_MIN(63, 64 - kBits)>(v));
2480 #endif
2481 }
2482 
2483 // ------------------------------ BroadcastSignBit (ShiftRight, compare, mask)
2484 
2485 template <size_t N>
2486 HWY_API Vec128<int8_t, N> BroadcastSignBit(const Vec128<int8_t, N> v) {
2487  const DFromV<decltype(v)> d;
2488  return VecFromMask(v < Zero(d));
2489 }
2490 
2491 template <size_t N>
2493  return ShiftRight<15>(v);
2494 }
2495 
2496 template <size_t N>
2498  return ShiftRight<31>(v);
2499 }
2500 
2501 template <size_t N>
2503  const DFromV<decltype(v)> d;
2504 #if HWY_TARGET <= HWY_AVX3
2505  (void)d;
2506  return Vec128<int64_t, N>{_mm_srai_epi64(v.raw, 63)};
2507 #elif HWY_TARGET == HWY_AVX2 || HWY_TARGET == HWY_SSE4
2508  return VecFromMask(v < Zero(d));
2509 #else
2510  // Efficient Lt() requires SSE4.2 and BLENDVPD requires SSE4.1. 32-bit shift
2511  // avoids generating a zero.
2512  const RepartitionToNarrow<decltype(d)> d32;
2513  const auto sign = ShiftRight<31>(BitCast(d32, v));
2514  return Vec128<int64_t, N>{
2515  _mm_shuffle_epi32(sign.raw, _MM_SHUFFLE(3, 3, 1, 1))};
2516 #endif
2517 }
2518 
2519 template <size_t N>
2520 HWY_API Vec128<int64_t, N> Abs(const Vec128<int64_t, N> v) {
2521 #if HWY_TARGET <= HWY_AVX3
2522  return Vec128<int64_t, N>{_mm_abs_epi64(v.raw)};
2523 #else
2524  const auto zero = Zero(DFromV<decltype(v)>());
2525  return IfThenElse(MaskFromVec(BroadcastSignBit(v)), zero - v, v);
2526 #endif
2527 }
2528 
2529 template <int kBits, size_t N>
2530 HWY_API Vec128<int64_t, N> ShiftRight(const Vec128<int64_t, N> v) {
2531 #if HWY_TARGET <= HWY_AVX3
2532  return Vec128<int64_t, N>{_mm_srai_epi64(v.raw, kBits)};
2533 #else
2534  const DFromV<decltype(v)> di;
2535  const RebindToUnsigned<decltype(di)> du;
2536  const auto right = BitCast(di, ShiftRight<kBits>(BitCast(du, v)));
2537  const auto sign = ShiftLeft<64 - kBits>(BroadcastSignBit(v));
2538  return right | sign;
2539 #endif
2540 }
2541 
2542 // ------------------------------ ZeroIfNegative (BroadcastSignBit)
2543 template <typename T, size_t N, HWY_IF_FLOAT(T)>
2544 HWY_API Vec128<T, N> ZeroIfNegative(Vec128<T, N> v) {
2545  const DFromV<decltype(v)> d;
2546 #if HWY_TARGET == HWY_SSSE3
2547  const RebindToSigned<decltype(d)> di;
2548  const auto mask = MaskFromVec(BitCast(d, BroadcastSignBit(BitCast(di, v))));
2549 #else
2550  const auto mask = MaskFromVec(v); // MSB is sufficient for BLENDVPS
2551 #endif
2552  return IfThenElse(mask, Zero(d), v);
2553 }
2554 
2555 // ------------------------------ IfNegativeThenElse
2556 template <size_t N>
2558  const Vec128<int8_t, N> yes,
2559  const Vec128<int8_t, N> no) {
2560  // int8: IfThenElse only looks at the MSB.
2561  return IfThenElse(MaskFromVec(v), yes, no);
2562 }
2563 
2564 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
2566  Vec128<T, N> no) {
2567  static_assert(IsSigned<T>(), "Only works for signed/float");
2568  const DFromV<decltype(v)> d;
2569  const RebindToSigned<decltype(d)> di;
2570 
2571  // 16-bit: no native blendv, so copy sign to lower byte's MSB.
2572  v = BitCast(d, BroadcastSignBit(BitCast(di, v)));
2573  return IfThenElse(MaskFromVec(v), yes, no);
2574 }
2575 
2576 template <typename T, size_t N, HWY_IF_NOT_LANE_SIZE(T, 2)>
2577 HWY_API Vec128<T, N> IfNegativeThenElse(Vec128<T, N> v, Vec128<T, N> yes,
2578  Vec128<T, N> no) {
2579  static_assert(IsSigned<T>(), "Only works for signed/float");
2580  const DFromV<decltype(v)> d;
2581  const RebindToFloat<decltype(d)> df;
2582 
2583  // 32/64-bit: use float IfThenElse, which only looks at the MSB.
2584  return BitCast(d, IfThenElse(MaskFromVec(BitCast(df, v)), BitCast(df, yes),
2585  BitCast(df, no)));
2586 }
2587 
2588 // ------------------------------ ShiftLeftSame
2589 
2590 template <size_t N>
2591 HWY_API Vec128<uint16_t, N> ShiftLeftSame(const Vec128<uint16_t, N> v,
2592  const int bits) {
2593  return Vec128<uint16_t, N>{_mm_sll_epi16(v.raw, _mm_cvtsi32_si128(bits))};
2594 }
2595 template <size_t N>
2596 HWY_API Vec128<uint32_t, N> ShiftLeftSame(const Vec128<uint32_t, N> v,
2597  const int bits) {
2598  return Vec128<uint32_t, N>{_mm_sll_epi32(v.raw, _mm_cvtsi32_si128(bits))};
2599 }
2600 template <size_t N>
2601 HWY_API Vec128<uint64_t, N> ShiftLeftSame(const Vec128<uint64_t, N> v,
2602  const int bits) {
2603  return Vec128<uint64_t, N>{_mm_sll_epi64(v.raw, _mm_cvtsi32_si128(bits))};
2604 }
2605 
2606 template <size_t N>
2607 HWY_API Vec128<int16_t, N> ShiftLeftSame(const Vec128<int16_t, N> v,
2608  const int bits) {
2609  return Vec128<int16_t, N>{_mm_sll_epi16(v.raw, _mm_cvtsi32_si128(bits))};
2610 }
2611 
2612 template <size_t N>
2613 HWY_API Vec128<int32_t, N> ShiftLeftSame(const Vec128<int32_t, N> v,
2614  const int bits) {
2615  return Vec128<int32_t, N>{_mm_sll_epi32(v.raw, _mm_cvtsi32_si128(bits))};
2616 }
2617 
2618 template <size_t N>
2619 HWY_API Vec128<int64_t, N> ShiftLeftSame(const Vec128<int64_t, N> v,
2620  const int bits) {
2621  return Vec128<int64_t, N>{_mm_sll_epi64(v.raw, _mm_cvtsi32_si128(bits))};
2622 }
2623 
2624 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
2625 HWY_API Vec128<T, N> ShiftLeftSame(const Vec128<T, N> v, const int bits) {
2626  const DFromV<decltype(v)> d8;
2627  // Use raw instead of BitCast to support N=1.
2628  const Vec128<T, N> shifted{
2629  ShiftLeftSame(Vec128<MakeWide<T>>{v.raw}, bits).raw};
2630  return shifted & Set(d8, static_cast<T>((0xFF << bits) & 0xFF));
2631 }
2632 
2633 // ------------------------------ ShiftRightSame (BroadcastSignBit)
2634 
2635 template <size_t N>
2636 HWY_API Vec128<uint16_t, N> ShiftRightSame(const Vec128<uint16_t, N> v,
2637  const int bits) {
2638  return Vec128<uint16_t, N>{_mm_srl_epi16(v.raw, _mm_cvtsi32_si128(bits))};
2639 }
2640 template <size_t N>
2641 HWY_API Vec128<uint32_t, N> ShiftRightSame(const Vec128<uint32_t, N> v,
2642  const int bits) {
2643  return Vec128<uint32_t, N>{_mm_srl_epi32(v.raw, _mm_cvtsi32_si128(bits))};
2644 }
2645 template <size_t N>
2646 HWY_API Vec128<uint64_t, N> ShiftRightSame(const Vec128<uint64_t, N> v,
2647  const int bits) {
2648  return Vec128<uint64_t, N>{_mm_srl_epi64(v.raw, _mm_cvtsi32_si128(bits))};
2649 }
2650 
2651 template <size_t N>
2652 HWY_API Vec128<uint8_t, N> ShiftRightSame(Vec128<uint8_t, N> v,
2653  const int bits) {
2654  const DFromV<decltype(v)> d8;
2655  // Use raw instead of BitCast to support N=1.
2656  const Vec128<uint8_t, N> shifted{
2657  ShiftRightSame(Vec128<uint16_t>{v.raw}, bits).raw};
2658  return shifted & Set(d8, static_cast<uint8_t>(0xFF >> bits));
2659 }
2660 
2661 template <size_t N>
2662 HWY_API Vec128<int16_t, N> ShiftRightSame(const Vec128<int16_t, N> v,
2663  const int bits) {
2664  return Vec128<int16_t, N>{_mm_sra_epi16(v.raw, _mm_cvtsi32_si128(bits))};
2665 }
2666 
2667 template <size_t N>
2668 HWY_API Vec128<int32_t, N> ShiftRightSame(const Vec128<int32_t, N> v,
2669  const int bits) {
2670  return Vec128<int32_t, N>{_mm_sra_epi32(v.raw, _mm_cvtsi32_si128(bits))};
2671 }
2672 template <size_t N>
2673 HWY_API Vec128<int64_t, N> ShiftRightSame(const Vec128<int64_t, N> v,
2674  const int bits) {
2675 #if HWY_TARGET <= HWY_AVX3
2676  return Vec128<int64_t, N>{_mm_sra_epi64(v.raw, _mm_cvtsi32_si128(bits))};
2677 #else
2678  const DFromV<decltype(v)> di;
2679  const RebindToUnsigned<decltype(di)> du;
2680  const auto right = BitCast(di, ShiftRightSame(BitCast(du, v), bits));
2681  const auto sign = ShiftLeftSame(BroadcastSignBit(v), 64 - bits);
2682  return right | sign;
2683 #endif
2684 }
2685 
2686 template <size_t N>
2687 HWY_API Vec128<int8_t, N> ShiftRightSame(Vec128<int8_t, N> v, const int bits) {
2688  const DFromV<decltype(v)> di;
2689  const RebindToUnsigned<decltype(di)> du;
2690  const auto shifted = BitCast(di, ShiftRightSame(BitCast(du, v), bits));
2691  const auto shifted_sign =
2692  BitCast(di, Set(du, static_cast<uint8_t>(0x80 >> bits)));
2693  return (shifted ^ shifted_sign) - shifted_sign;
2694 }
2695 
2696 // ------------------------------ Floating-point mul / div
2697 
2698 template <size_t N>
2699 HWY_API Vec128<float, N> operator*(Vec128<float, N> a, Vec128<float, N> b) {
2700  return Vec128<float, N>{_mm_mul_ps(a.raw, b.raw)};
2701 }
2703  const Vec128<float, 1> b) {
2704  return Vec128<float, 1>{_mm_mul_ss(a.raw, b.raw)};
2705 }
2706 template <size_t N>
2708  const Vec128<double, N> b) {
2709  return Vec128<double, N>{_mm_mul_pd(a.raw, b.raw)};
2710 }
2712  return Vec64<double>{_mm_mul_sd(a.raw, b.raw)};
2713 }
2714 
2715 template <size_t N>
2716 HWY_API Vec128<float, N> operator/(const Vec128<float, N> a,
2717  const Vec128<float, N> b) {
2718  return Vec128<float, N>{_mm_div_ps(a.raw, b.raw)};
2719 }
2721  const Vec128<float, 1> b) {
2722  return Vec128<float, 1>{_mm_div_ss(a.raw, b.raw)};
2723 }
2724 template <size_t N>
2726  const Vec128<double, N> b) {
2727  return Vec128<double, N>{_mm_div_pd(a.raw, b.raw)};
2728 }
2730  return Vec64<double>{_mm_div_sd(a.raw, b.raw)};
2731 }
2732 
2733 // Approximate reciprocal
2734 template <size_t N>
2735 HWY_API Vec128<float, N> ApproximateReciprocal(const Vec128<float, N> v) {
2736  return Vec128<float, N>{_mm_rcp_ps(v.raw)};
2737 }
2739  return Vec128<float, 1>{_mm_rcp_ss(v.raw)};
2740 }
2741 
2742 // Absolute value of difference.
2743 template <size_t N>
2744 HWY_API Vec128<float, N> AbsDiff(const Vec128<float, N> a,
2745  const Vec128<float, N> b) {
2746  return Abs(a - b);
2747 }
2748 
2749 // ------------------------------ Floating-point multiply-add variants
2750 
2751 // Returns mul * x + add
2752 template <size_t N>
2753 HWY_API Vec128<float, N> MulAdd(const Vec128<float, N> mul,
2754  const Vec128<float, N> x,
2755  const Vec128<float, N> add) {
2756 #if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
2757  return mul * x + add;
2758 #else
2759  return Vec128<float, N>{_mm_fmadd_ps(mul.raw, x.raw, add.raw)};
2760 #endif
2761 }
2762 template <size_t N>
2764  const Vec128<double, N> x,
2765  const Vec128<double, N> add) {
2766 #if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
2767  return mul * x + add;
2768 #else
2769  return Vec128<double, N>{_mm_fmadd_pd(mul.raw, x.raw, add.raw)};
2770 #endif
2771 }
2772 
2773 // Returns add - mul * x
2774 template <size_t N>
2775 HWY_API Vec128<float, N> NegMulAdd(const Vec128<float, N> mul,
2776  const Vec128<float, N> x,
2777  const Vec128<float, N> add) {
2778 #if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
2779  return add - mul * x;
2780 #else
2781  return Vec128<float, N>{_mm_fnmadd_ps(mul.raw, x.raw, add.raw)};
2782 #endif
2783 }
2784 template <size_t N>
2786  const Vec128<double, N> x,
2787  const Vec128<double, N> add) {
2788 #if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
2789  return add - mul * x;
2790 #else
2791  return Vec128<double, N>{_mm_fnmadd_pd(mul.raw, x.raw, add.raw)};
2792 #endif
2793 }
2794 
2795 // Returns mul * x - sub
2796 template <size_t N>
2797 HWY_API Vec128<float, N> MulSub(const Vec128<float, N> mul,
2798  const Vec128<float, N> x,
2799  const Vec128<float, N> sub) {
2800 #if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
2801  return mul * x - sub;
2802 #else
2803  return Vec128<float, N>{_mm_fmsub_ps(mul.raw, x.raw, sub.raw)};
2804 #endif
2805 }
2806 template <size_t N>
2808  const Vec128<double, N> x,
2809  const Vec128<double, N> sub) {
2810 #if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
2811  return mul * x - sub;
2812 #else
2813  return Vec128<double, N>{_mm_fmsub_pd(mul.raw, x.raw, sub.raw)};
2814 #endif
2815 }
2816 
2817 // Returns -mul * x - sub
2818 template <size_t N>
2819 HWY_API Vec128<float, N> NegMulSub(const Vec128<float, N> mul,
2820  const Vec128<float, N> x,
2821  const Vec128<float, N> sub) {
2822 #if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
2823  return Neg(mul) * x - sub;
2824 #else
2825  return Vec128<float, N>{_mm_fnmsub_ps(mul.raw, x.raw, sub.raw)};
2826 #endif
2827 }
2828 template <size_t N>
2830  const Vec128<double, N> x,
2831  const Vec128<double, N> sub) {
2832 #if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
2833  return Neg(mul) * x - sub;
2834 #else
2835  return Vec128<double, N>{_mm_fnmsub_pd(mul.raw, x.raw, sub.raw)};
2836 #endif
2837 }
2838 
2839 // ------------------------------ Floating-point square root
2840 
2841 // Full precision square root
2842 template <size_t N>
2843 HWY_API Vec128<float, N> Sqrt(const Vec128<float, N> v) {
2844  return Vec128<float, N>{_mm_sqrt_ps(v.raw)};
2845 }
2847  return Vec128<float, 1>{_mm_sqrt_ss(v.raw)};
2848 }
2849 template <size_t N>
2851  return Vec128<double, N>{_mm_sqrt_pd(v.raw)};
2852 }
2854  return Vec64<double>{_mm_sqrt_sd(_mm_setzero_pd(), v.raw)};
2855 }
2856 
2857 // Approximate reciprocal square root
2858 template <size_t N>
2859 HWY_API Vec128<float, N> ApproximateReciprocalSqrt(const Vec128<float, N> v) {
2860  return Vec128<float, N>{_mm_rsqrt_ps(v.raw)};
2861 }
2863  return Vec128<float, 1>{_mm_rsqrt_ss(v.raw)};
2864 }
2865 
2866 // ------------------------------ Min (Gt, IfThenElse)
2867 
2868 namespace detail {
2869 
2870 template <typename T, size_t N>
2872  const Vec128<T, N> b) {
2873  const DFromV<decltype(a)> d;
2874  const RebindToUnsigned<decltype(d)> du;
2875  const RebindToSigned<decltype(d)> di;
2876  const auto msb = Set(du, static_cast<T>(T(1) << (sizeof(T) * 8 - 1)));
2877  const auto gt = RebindMask(du, BitCast(di, a ^ msb) > BitCast(di, b ^ msb));
2878  return IfThenElse(gt, b, a);
2879 }
2880 
2881 } // namespace detail
2882 
2883 // Unsigned
2884 template <size_t N>
2885 HWY_API Vec128<uint8_t, N> Min(const Vec128<uint8_t, N> a,
2886  const Vec128<uint8_t, N> b) {
2887  return Vec128<uint8_t, N>{_mm_min_epu8(a.raw, b.raw)};
2888 }
2889 template <size_t N>
2890 HWY_API Vec128<uint16_t, N> Min(const Vec128<uint16_t, N> a,
2891  const Vec128<uint16_t, N> b) {
2892 #if HWY_TARGET == HWY_SSSE3
2893  return detail::MinU(a, b);
2894 #else
2895  return Vec128<uint16_t, N>{_mm_min_epu16(a.raw, b.raw)};
2896 #endif
2897 }
2898 template <size_t N>
2899 HWY_API Vec128<uint32_t, N> Min(const Vec128<uint32_t, N> a,
2900  const Vec128<uint32_t, N> b) {
2901 #if HWY_TARGET == HWY_SSSE3
2902  return detail::MinU(a, b);
2903 #else
2904  return Vec128<uint32_t, N>{_mm_min_epu32(a.raw, b.raw)};
2905 #endif
2906 }
2907 template <size_t N>
2908 HWY_API Vec128<uint64_t, N> Min(const Vec128<uint64_t, N> a,
2909  const Vec128<uint64_t, N> b) {
2910 #if HWY_TARGET <= HWY_AVX3
2911  return Vec128<uint64_t, N>{_mm_min_epu64(a.raw, b.raw)};
2912 #else
2913  return detail::MinU(a, b);
2914 #endif
2915 }
2916 
2917 // Signed
2918 template <size_t N>
2919 HWY_API Vec128<int8_t, N> Min(const Vec128<int8_t, N> a,
2920  const Vec128<int8_t, N> b) {
2921 #if HWY_TARGET == HWY_SSSE3
2922  return IfThenElse(a < b, a, b);
2923 #else
2924  return Vec128<int8_t, N>{_mm_min_epi8(a.raw, b.raw)};
2925 #endif
2926 }
2927 template <size_t N>
2928 HWY_API Vec128<int16_t, N> Min(const Vec128<int16_t, N> a,
2929  const Vec128<int16_t, N> b) {
2930  return Vec128<int16_t, N>{_mm_min_epi16(a.raw, b.raw)};
2931 }
2932 template <size_t N>
2933 HWY_API Vec128<int32_t, N> Min(const Vec128<int32_t, N> a,
2934  const Vec128<int32_t, N> b) {
2935 #if HWY_TARGET == HWY_SSSE3
2936  return IfThenElse(a < b, a, b);
2937 #else
2938  return Vec128<int32_t, N>{_mm_min_epi32(a.raw, b.raw)};
2939 #endif
2940 }
2941 template <size_t N>
2942 HWY_API Vec128<int64_t, N> Min(const Vec128<int64_t, N> a,
2943  const Vec128<int64_t, N> b) {
2944 #if HWY_TARGET <= HWY_AVX3
2945  return Vec128<int64_t, N>{_mm_min_epi64(a.raw, b.raw)};
2946 #else
2947  return IfThenElse(a < b, a, b);
2948 #endif
2949 }
2950 
2951 // Float
2952 template <size_t N>
2953 HWY_API Vec128<float, N> Min(const Vec128<float, N> a,
2954  const Vec128<float, N> b) {
2955  return Vec128<float, N>{_mm_min_ps(a.raw, b.raw)};
2956 }
2957 template <size_t N>
2959  const Vec128<double, N> b) {
2960  return Vec128<double, N>{_mm_min_pd(a.raw, b.raw)};
2961 }
2962 
2963 // ------------------------------ Max (Gt, IfThenElse)
2964 
2965 namespace detail {
2966 template <typename T, size_t N>
2968  const Vec128<T, N> b) {
2969  const DFromV<decltype(a)> d;
2970  const RebindToUnsigned<decltype(d)> du;
2971  const RebindToSigned<decltype(d)> di;
2972  const auto msb = Set(du, static_cast<T>(T(1) << (sizeof(T) * 8 - 1)));
2973  const auto gt = RebindMask(du, BitCast(di, a ^ msb) > BitCast(di, b ^ msb));
2974  return IfThenElse(gt, a, b);
2975 }
2976 
2977 } // namespace detail
2978 
2979 // Unsigned
2980 template <size_t N>
2981 HWY_API Vec128<uint8_t, N> Max(const Vec128<uint8_t, N> a,
2982  const Vec128<uint8_t, N> b) {
2983  return Vec128<uint8_t, N>{_mm_max_epu8(a.raw, b.raw)};
2984 }
2985 template <size_t N>
2986 HWY_API Vec128<uint16_t, N> Max(const Vec128<uint16_t, N> a,
2987  const Vec128<uint16_t, N> b) {
2988 #if HWY_TARGET == HWY_SSSE3
2989  return detail::MaxU(a, b);
2990 #else
2991  return Vec128<uint16_t, N>{_mm_max_epu16(a.raw, b.raw)};
2992 #endif
2993 }
2994 template <size_t N>
2995 HWY_API Vec128<uint32_t, N> Max(const Vec128<uint32_t, N> a,
2996  const Vec128<uint32_t, N> b) {
2997 #if HWY_TARGET == HWY_SSSE3
2998  return detail::MaxU(a, b);
2999 #else
3000  return Vec128<uint32_t, N>{_mm_max_epu32(a.raw, b.raw)};
3001 #endif
3002 }
3003 template <size_t N>
3004 HWY_API Vec128<uint64_t, N> Max(const Vec128<uint64_t, N> a,
3005  const Vec128<uint64_t, N> b) {
3006 #if HWY_TARGET <= HWY_AVX3
3007  return Vec128<uint64_t, N>{_mm_max_epu64(a.raw, b.raw)};
3008 #else
3009  return detail::MaxU(a, b);
3010 #endif
3011 }
3012 
3013 // Signed
3014 template <size_t N>
3015 HWY_API Vec128<int8_t, N> Max(const Vec128<int8_t, N> a,
3016  const Vec128<int8_t, N> b) {
3017 #if HWY_TARGET == HWY_SSSE3
3018  return IfThenElse(a < b, b, a);
3019 #else
3020  return Vec128<int8_t, N>{_mm_max_epi8(a.raw, b.raw)};
3021 #endif
3022 }
3023 template <size_t N>
3024 HWY_API Vec128<int16_t, N> Max(const Vec128<int16_t, N> a,
3025  const Vec128<int16_t, N> b) {
3026  return Vec128<int16_t, N>{_mm_max_epi16(a.raw, b.raw)};
3027 }
3028 template <size_t N>
3029 HWY_API Vec128<int32_t, N> Max(const Vec128<int32_t, N> a,
3030  const Vec128<int32_t, N> b) {
3031 #if HWY_TARGET == HWY_SSSE3
3032  return IfThenElse(a < b, b, a);
3033 #else
3034  return Vec128<int32_t, N>{_mm_max_epi32(a.raw, b.raw)};
3035 #endif
3036 }
3037 template <size_t N>
3038 HWY_API Vec128<int64_t, N> Max(const Vec128<int64_t, N> a,
3039  const Vec128<int64_t, N> b) {
3040 #if HWY_TARGET <= HWY_AVX3
3041  return Vec128<int64_t, N>{_mm_max_epi64(a.raw, b.raw)};
3042 #else
3043  return IfThenElse(a < b, b, a);
3044 #endif
3045 }
3046 
3047 // Float
3048 template <size_t N>
3049 HWY_API Vec128<float, N> Max(const Vec128<float, N> a,
3050  const Vec128<float, N> b) {
3051  return Vec128<float, N>{_mm_max_ps(a.raw, b.raw)};
3052 }
3053 template <size_t N>
3055  const Vec128<double, N> b) {
3056  return Vec128<double, N>{_mm_max_pd(a.raw, b.raw)};
3057 }
3058 
3059 // ================================================== MEMORY (2)
3060 
3061 // ------------------------------ Non-temporal stores
3062 
3063 // On clang6, we see incorrect code generated for _mm_stream_pi, so
3064 // round even partial vectors up to 16 bytes.
3065 template <typename T, size_t N>
3066 HWY_API void Stream(Vec128<T, N> v, Simd<T, N, 0> /* tag */,
3067  T* HWY_RESTRICT aligned) {
3068  _mm_stream_si128(reinterpret_cast<__m128i*>(aligned), v.raw);
3069 }
3070 template <size_t N>
3072  float* HWY_RESTRICT aligned) {
3073  _mm_stream_ps(aligned, v.raw);
3074 }
3075 template <size_t N>
3077  double* HWY_RESTRICT aligned) {
3078  _mm_stream_pd(aligned, v.raw);
3079 }
3080 
3081 // ------------------------------ Scatter
3082 
3083 // Work around warnings in the intrinsic definitions (passing -1 as a mask).
3084 HWY_DIAGNOSTICS(push)
3085 HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
3086 
3087 // Unfortunately the GCC/Clang intrinsics do not accept int64_t*.
3088 using GatherIndex64 = long long int; // NOLINT(google-runtime-int)
3089 static_assert(sizeof(GatherIndex64) == 8, "Must be 64-bit type");
3090 
3091 #if HWY_TARGET <= HWY_AVX3
3092 namespace detail {
3093 
3094 template <typename T, size_t N>
3096  Simd<T, N, 0> /* tag */, T* HWY_RESTRICT base,
3097  const Vec128<int32_t, N> offset) {
3098  if (N == 4) {
3099  _mm_i32scatter_epi32(base, offset.raw, v.raw, 1);
3100  } else {
3101  const __mmask8 mask = (1u << N) - 1;
3102  _mm_mask_i32scatter_epi32(base, mask, offset.raw, v.raw, 1);
3103  }
3104 }
3105 template <typename T, size_t N>
3107  Simd<T, N, 0> /* tag */, T* HWY_RESTRICT base,
3108  const Vec128<int32_t, N> index) {
3109  if (N == 4) {
3110  _mm_i32scatter_epi32(base, index.raw, v.raw, 4);
3111  } else {
3112  const __mmask8 mask = (1u << N) - 1;
3113  _mm_mask_i32scatter_epi32(base, mask, index.raw, v.raw, 4);
3114  }
3115 }
3116 
3117 template <typename T, size_t N>
3119  Simd<T, N, 0> /* tag */, T* HWY_RESTRICT base,
3120  const Vec128<int64_t, N> offset) {
3121  if (N == 2) {
3122  _mm_i64scatter_epi64(base, offset.raw, v.raw, 1);
3123  } else {
3124  const __mmask8 mask = (1u << N) - 1;
3125  _mm_mask_i64scatter_epi64(base, mask, offset.raw, v.raw, 1);
3126  }
3127 }
3128 template <typename T, size_t N>
3130  Simd<T, N, 0> /* tag */, T* HWY_RESTRICT base,
3131  const Vec128<int64_t, N> index) {
3132  if (N == 2) {
3133  _mm_i64scatter_epi64(base, index.raw, v.raw, 8);
3134  } else {
3135  const __mmask8 mask = (1u << N) - 1;
3136  _mm_mask_i64scatter_epi64(base, mask, index.raw, v.raw, 8);
3137  }
3138 }
3139 
3140 } // namespace detail
3141 
3142 template <typename T, size_t N, typename Offset>
3144  T* HWY_RESTRICT base,
3145  const Vec128<Offset, N> offset) {
3146  static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
3147  return detail::ScatterOffset(hwy::SizeTag<sizeof(T)>(), v, d, base, offset);
3148 }
3149 template <typename T, size_t N, typename Index>
3151  const Vec128<Index, N> index) {
3152  static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
3153  return detail::ScatterIndex(hwy::SizeTag<sizeof(T)>(), v, d, base, index);
3154 }
3155 
3156 template <size_t N>
3158  float* HWY_RESTRICT base,
3159  const Vec128<int32_t, N> offset) {
3160  if (N == 4) {
3161  _mm_i32scatter_ps(base, offset.raw, v.raw, 1);
3162  } else {
3163  const __mmask8 mask = (1u << N) - 1;
3164  _mm_mask_i32scatter_ps(base, mask, offset.raw, v.raw, 1);
3165  }
3166 }
3167 template <size_t N>
3169  float* HWY_RESTRICT base,
3170  const Vec128<int32_t, N> index) {
3171  if (N == 4) {
3172  _mm_i32scatter_ps(base, index.raw, v.raw, 4);
3173  } else {
3174  const __mmask8 mask = (1u << N) - 1;
3175  _mm_mask_i32scatter_ps(base, mask, index.raw, v.raw, 4);
3176  }
3177 }
3178 
3179 template <size_t N>
3181  double* HWY_RESTRICT base,
3182  const Vec128<int64_t, N> offset) {
3183  if (N == 2) {
3184  _mm_i64scatter_pd(base, offset.raw, v.raw, 1);
3185  } else {
3186  const __mmask8 mask = (1u << N) - 1;
3187  _mm_mask_i64scatter_pd(base, mask, offset.raw, v.raw, 1);
3188  }
3189 }
3190 template <size_t N>
3192  double* HWY_RESTRICT base,
3193  const Vec128<int64_t, N> index) {
3194  if (N == 2) {
3195  _mm_i64scatter_pd(base, index.raw, v.raw, 8);
3196  } else {
3197  const __mmask8 mask = (1u << N) - 1;
3198  _mm_mask_i64scatter_pd(base, mask, index.raw, v.raw, 8);
3199  }
3200 }
3201 #else // HWY_TARGET <= HWY_AVX3
3202 
3203 template <typename T, size_t N, typename Offset, HWY_IF_LE128(T, N)>
3204 HWY_API void ScatterOffset(Vec128<T, N> v, Simd<T, N, 0> d,
3205  T* HWY_RESTRICT base,
3206  const Vec128<Offset, N> offset) {
3207  static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
3208 
3209  alignas(16) T lanes[N];
3210  Store(v, d, lanes);
3211 
3212  alignas(16) Offset offset_lanes[N];
3213  Store(offset, Rebind<Offset, decltype(d)>(), offset_lanes);
3214 
3215  uint8_t* base_bytes = reinterpret_cast<uint8_t*>(base);
3216  for (size_t i = 0; i < N; ++i) {
3217  CopyBytes<sizeof(T)>(&lanes[i], base_bytes + offset_lanes[i]);
3218  }
3219 }
3220 
3221 template <typename T, size_t N, typename Index, HWY_IF_LE128(T, N)>
3222 HWY_API void ScatterIndex(Vec128<T, N> v, Simd<T, N, 0> d, T* HWY_RESTRICT base,
3223  const Vec128<Index, N> index) {
3224  static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
3225 
3226  alignas(16) T lanes[N];
3227  Store(v, d, lanes);
3228 
3229  alignas(16) Index index_lanes[N];
3230  Store(index, Rebind<Index, decltype(d)>(), index_lanes);
3231 
3232  for (size_t i = 0; i < N; ++i) {
3233  base[index_lanes[i]] = lanes[i];
3234  }
3235 }
3236 
3237 #endif
3238 
3239 // ------------------------------ Gather (Load/Store)
3240 
3241 #if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
3242 
3243 template <typename T, size_t N, typename Offset>
3244 HWY_API Vec128<T, N> GatherOffset(const Simd<T, N, 0> d,
3245  const T* HWY_RESTRICT base,
3246  const Vec128<Offset, N> offset) {
3247  static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
3248 
3249  alignas(16) Offset offset_lanes[N];
3250  Store(offset, Rebind<Offset, decltype(d)>(), offset_lanes);
3251 
3252  alignas(16) T lanes[N];
3253  const uint8_t* base_bytes = reinterpret_cast<const uint8_t*>(base);
3254  for (size_t i = 0; i < N; ++i) {
3255  CopyBytes<sizeof(T)>(base_bytes + offset_lanes[i], &lanes[i]);
3256  }
3257  return Load(d, lanes);
3258 }
3259 
3260 template <typename T, size_t N, typename Index>
3261 HWY_API Vec128<T, N> GatherIndex(const Simd<T, N, 0> d,
3262  const T* HWY_RESTRICT base,
3263  const Vec128<Index, N> index) {
3264  static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
3265 
3266  alignas(16) Index index_lanes[N];
3267  Store(index, Rebind<Index, decltype(d)>(), index_lanes);
3268 
3269  alignas(16) T lanes[N];
3270  for (size_t i = 0; i < N; ++i) {
3271  lanes[i] = base[index_lanes[i]];
3272  }
3273  return Load(d, lanes);
3274 }
3275 
3276 #else
3277 
3278 namespace detail {
3279 
3280 template <typename T, size_t N>
3281 HWY_INLINE Vec128<T, N> GatherOffset(hwy::SizeTag<4> /* tag */,
3282  Simd<T, N, 0> /* d */,
3283  const T* HWY_RESTRICT base,
3284  const Vec128<int32_t, N> offset) {
3285  return Vec128<T, N>{_mm_i32gather_epi32(
3286  reinterpret_cast<const int32_t*>(base), offset.raw, 1)};
3287 }
3288 template <typename T, size_t N>
3289 HWY_INLINE Vec128<T, N> GatherIndex(hwy::SizeTag<4> /* tag */,
3290  Simd<T, N, 0> /* d */,
3291  const T* HWY_RESTRICT base,
3292  const Vec128<int32_t, N> index) {
3293  return Vec128<T, N>{_mm_i32gather_epi32(
3294  reinterpret_cast<const int32_t*>(base), index.raw, 4)};
3295 }
3296 
3297 template <typename T, size_t N>
3298 HWY_INLINE Vec128<T, N> GatherOffset(hwy::SizeTag<8> /* tag */,
3299  Simd<T, N, 0> /* d */,
3300  const T* HWY_RESTRICT base,
3301  const Vec128<int64_t, N> offset) {
3302  return Vec128<T, N>{_mm_i64gather_epi64(
3303  reinterpret_cast<const GatherIndex64*>(base), offset.raw, 1)};
3304 }
3305 template <typename T, size_t N>
3306 HWY_INLINE Vec128<T, N> GatherIndex(hwy::SizeTag<8> /* tag */,
3307  Simd<T, N, 0> /* d */,
3308  const T* HWY_RESTRICT base,
3309  const Vec128<int64_t, N> index) {
3310  return Vec128<T, N>{_mm_i64gather_epi64(
3311  reinterpret_cast<const GatherIndex64*>(base), index.raw, 8)};
3312 }
3313 
3314 } // namespace detail
3315 
3316 template <typename T, size_t N, typename Offset>
3317 HWY_API Vec128<T, N> GatherOffset(Simd<T, N, 0> d, const T* HWY_RESTRICT base,
3318  const Vec128<Offset, N> offset) {
3319  return detail::GatherOffset(hwy::SizeTag<sizeof(T)>(), d, base, offset);
3320 }
3321 template <typename T, size_t N, typename Index>
3322 HWY_API Vec128<T, N> GatherIndex(Simd<T, N, 0> d, const T* HWY_RESTRICT base,
3323  const Vec128<Index, N> index) {
3324  return detail::GatherIndex(hwy::SizeTag<sizeof(T)>(), d, base, index);
3325 }
3326 
3327 template <size_t N>
3328 HWY_API Vec128<float, N> GatherOffset(Simd<float, N, 0> /* tag */,
3329  const float* HWY_RESTRICT base,
3330  const Vec128<int32_t, N> offset) {
3331  return Vec128<float, N>{_mm_i32gather_ps(base, offset.raw, 1)};
3332 }
3333 template <size_t N>
3334 HWY_API Vec128<float, N> GatherIndex(Simd<float, N, 0> /* tag */,
3335  const float* HWY_RESTRICT base,
3336  const Vec128<int32_t, N> index) {
3337  return Vec128<float, N>{_mm_i32gather_ps(base, index.raw, 4)};
3338 }
3339 
3340 template <size_t N>
3341 HWY_API Vec128<double, N> GatherOffset(Simd<double, N, 0> /* tag */,
3342  const double* HWY_RESTRICT base,
3343  const Vec128<int64_t, N> offset) {
3344  return Vec128<double, N>{_mm_i64gather_pd(base, offset.raw, 1)};
3345 }
3346 template <size_t N>
3347 HWY_API Vec128<double, N> GatherIndex(Simd<double, N, 0> /* tag */,
3348  const double* HWY_RESTRICT base,
3349  const Vec128<int64_t, N> index) {
3350  return Vec128<double, N>{_mm_i64gather_pd(base, index.raw, 8)};
3351 }
3352 
3353 #endif // HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
3354 
3355 HWY_DIAGNOSTICS(pop)
3356 
3357 // ================================================== SWIZZLE (2)
3358 
3359 // ------------------------------ LowerHalf
3360 
3361 // Returns upper/lower half of a vector.
3362 template <typename T, size_t N>
3363 HWY_API Vec128<T, N / 2> LowerHalf(Simd<T, N / 2, 0> /* tag */,
3364  Vec128<T, N> v) {
3365  return Vec128<T, N / 2>{v.raw};
3366 }
3367 
3368 template <typename T, size_t N>
3369 HWY_API Vec128<T, N / 2> LowerHalf(Vec128<T, N> v) {
3370  return LowerHalf(Simd<T, N / 2, 0>(), v);
3371 }
3372 
3373 // ------------------------------ ShiftLeftBytes
3374 
3375 template <int kBytes, typename T, size_t N>
3376 HWY_API Vec128<T, N> ShiftLeftBytes(Simd<T, N, 0> /* tag */, Vec128<T, N> v) {
3377  static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
3378  return Vec128<T, N>{_mm_slli_si128(v.raw, kBytes)};
3379 }
3380 
3381 template <int kBytes, typename T, size_t N>
3382 HWY_API Vec128<T, N> ShiftLeftBytes(const Vec128<T, N> v) {
3383  return ShiftLeftBytes<kBytes>(DFromV<decltype(v)>(), v);
3384 }
3385 
3386 // ------------------------------ ShiftLeftLanes
3387 
3388 template <int kLanes, typename T, size_t N>
3389 HWY_API Vec128<T, N> ShiftLeftLanes(Simd<T, N, 0> d, const Vec128<T, N> v) {
3390  const Repartition<uint8_t, decltype(d)> d8;
3391  return BitCast(d, ShiftLeftBytes<kLanes * sizeof(T)>(BitCast(d8, v)));
3392 }
3393 
3394 template <int kLanes, typename T, size_t N>
3395 HWY_API Vec128<T, N> ShiftLeftLanes(const Vec128<T, N> v) {
3396  return ShiftLeftLanes<kLanes>(DFromV<decltype(v)>(), v);
3397 }
3398 
3399 // ------------------------------ ShiftRightBytes
3400 template <int kBytes, typename T, size_t N>
3401 HWY_API Vec128<T, N> ShiftRightBytes(Simd<T, N, 0> /* tag */, Vec128<T, N> v) {
3402  static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
3403  // For partial vectors, clear upper lanes so we shift in zeros.
3404  if (N != 16 / sizeof(T)) {
3405  const Vec128<T> vfull{v.raw};
3406  v = Vec128<T, N>{IfThenElseZero(FirstN(Full128<T>(), N), vfull).raw};
3407  }
3408  return Vec128<T, N>{_mm_srli_si128(v.raw, kBytes)};
3409 }
3410 
3411 // ------------------------------ ShiftRightLanes
3412 template <int kLanes, typename T, size_t N>
3413 HWY_API Vec128<T, N> ShiftRightLanes(Simd<T, N, 0> d, const Vec128<T, N> v) {
3414  const Repartition<uint8_t, decltype(d)> d8;
3415  return BitCast(d, ShiftRightBytes<kLanes * sizeof(T)>(d8, BitCast(d8, v)));
3416 }
3417 
3418 // ------------------------------ UpperHalf (ShiftRightBytes)
3419 
3420 // Full input: copy hi into lo (smaller instruction encoding than shifts).
3421 template <typename T>
3423  return Vec64<T>{_mm_unpackhi_epi64(v.raw, v.raw)};
3424 }
3425 HWY_API Vec128<float, 2> UpperHalf(Full64<float> /* tag */, Vec128<float> v) {
3426  return Vec128<float, 2>{_mm_movehl_ps(v.raw, v.raw)};
3427 }
3429  return Vec64<double>{_mm_unpackhi_pd(v.raw, v.raw)};
3430 }
3431 
3432 // Partial
3433 template <typename T, size_t N, HWY_IF_LE64(T, N)>
3434 HWY_API Vec128<T, (N + 1) / 2> UpperHalf(Half<Simd<T, N, 0>> /* tag */,
3435  Vec128<T, N> v) {
3436  const DFromV<decltype(v)> d;
3437  const RebindToUnsigned<decltype(d)> du;
3438  const auto vu = BitCast(du, v);
3439  const auto upper = BitCast(d, ShiftRightBytes<N * sizeof(T) / 2>(du, vu));
3440  return Vec128<T, (N + 1) / 2>{upper.raw};
3441 }
3442 
3443 // ------------------------------ CombineShiftRightBytes
3444 
3445 template <int kBytes, typename T, class V = Vec128<T>>
3446 HWY_API V CombineShiftRightBytes(Full128<T> d, V hi, V lo) {
3447  const Repartition<uint8_t, decltype(d)> d8;
3448  return BitCast(d, Vec128<uint8_t>{_mm_alignr_epi8(
3449  BitCast(d8, hi).raw, BitCast(d8, lo).raw, kBytes)});
3450 }
3451 
3452 template <int kBytes, typename T, size_t N, HWY_IF_LE64(T, N),
3453  class V = Vec128<T, N>>
3454 HWY_API V CombineShiftRightBytes(Simd<T, N, 0> d, V hi, V lo) {
3455  constexpr size_t kSize = N * sizeof(T);
3456  static_assert(0 < kBytes && kBytes < kSize, "kBytes invalid");
3457  const Repartition<uint8_t, decltype(d)> d8;
3458  const Full128<uint8_t> d_full8;
3459  using V8 = VFromD<decltype(d_full8)>;
3460  const V8 hi8{BitCast(d8, hi).raw};
3461  // Move into most-significant bytes
3462  const V8 lo8 = ShiftLeftBytes<16 - kSize>(V8{BitCast(d8, lo).raw});
3463  const V8 r = CombineShiftRightBytes<16 - kSize + kBytes>(d_full8, hi8, lo8);
3464  return V{BitCast(Full128<T>(), r).raw};
3465 }
3466 
3467 // ------------------------------ Broadcast/splat any lane
3468 
3469 // Unsigned
3470 template <int kLane, size_t N>
3472  static_assert(0 <= kLane && kLane < N, "Invalid lane");
3473  if (kLane < 4) {
3474  const __m128i lo = _mm_shufflelo_epi16(v.raw, (0x55 * kLane) & 0xFF);
3475  return Vec128<uint16_t, N>{_mm_unpacklo_epi64(lo, lo)};
3476  } else {
3477  const __m128i hi = _mm_shufflehi_epi16(v.raw, (0x55 * (kLane - 4)) & 0xFF);
3478  return Vec128<uint16_t, N>{_mm_unpackhi_epi64(hi, hi)};
3479  }
3480 }
3481 template <int kLane, size_t N>
3483  static_assert(0 <= kLane && kLane < N, "Invalid lane");
3484  return Vec128<uint32_t, N>{_mm_shuffle_epi32(v.raw, 0x55 * kLane)};
3485 }
3486 template <int kLane, size_t N>
3488  static_assert(0 <= kLane && kLane < N, "Invalid lane");
3489  return Vec128<uint64_t, N>{_mm_shuffle_epi32(v.raw, kLane ? 0xEE : 0x44)};
3490 }
3491 
3492 // Signed
3493 template <int kLane, size_t N>
3495  static_assert(0 <= kLane && kLane < N, "Invalid lane");
3496  if (kLane < 4) {
3497  const __m128i lo = _mm_shufflelo_epi16(v.raw, (0x55 * kLane) & 0xFF);
3498  return Vec128<int16_t, N>{_mm_unpacklo_epi64(lo, lo)};
3499  } else {
3500  const __m128i hi = _mm_shufflehi_epi16(v.raw, (0x55 * (kLane - 4)) & 0xFF);
3501  return Vec128<int16_t, N>{_mm_unpackhi_epi64(hi, hi)};
3502  }
3503 }
3504 template <int kLane, size_t N>
3506  static_assert(0 <= kLane && kLane < N, "Invalid lane");
3507  return Vec128<int32_t, N>{_mm_shuffle_epi32(v.raw, 0x55 * kLane)};
3508 }
3509 template <int kLane, size_t N>
3511  static_assert(0 <= kLane && kLane < N, "Invalid lane");
3512  return Vec128<int64_t, N>{_mm_shuffle_epi32(v.raw, kLane ? 0xEE : 0x44)};
3513 }
3514 
3515 // Float
3516 template <int kLane, size_t N>
3518  static_assert(0 <= kLane && kLane < N, "Invalid lane");
3519  return Vec128<float, N>{_mm_shuffle_ps(v.raw, v.raw, 0x55 * kLane)};
3520 }
3521 template <int kLane, size_t N>
3523  static_assert(0 <= kLane && kLane < N, "Invalid lane");
3524  return Vec128<double, N>{_mm_shuffle_pd(v.raw, v.raw, 3 * kLane)};
3525 }
3526 
3527 // ------------------------------ TableLookupBytes
3528 template <typename T, size_t N, typename TI, size_t NI>
3529 HWY_API Vec128<TI, NI> TableLookupBytes(const Vec128<T, N> bytes,
3530  const Vec128<TI, NI> from) {
3531  return Vec128<TI, NI>{_mm_shuffle_epi8(bytes.raw, from.raw)};
3532 }
3533 
3534 // ------------------------------ TableLookupBytesOr0
3535 // For all vector widths; x86 anyway zeroes if >= 0x80.
3536 template <class V, class VI>
3537 HWY_API VI TableLookupBytesOr0(const V bytes, const VI from) {
3538  return TableLookupBytes(bytes, from);
3539 }
3540 
3541 // ------------------------------ TableLookupLanes (Shuffle01)
3542 
3543 // Returned by SetTableIndices/IndicesFromVec for use by TableLookupLanes.
3544 template <typename T, size_t N = 16 / sizeof(T)>
3545 struct Indices128 {
3546  __m128i raw;
3547 };
3548 
3549 template <typename T, size_t N, typename TI, HWY_IF_LE128(T, N),
3550  HWY_IF_LANE_SIZE(T, 4)>
3552  static_assert(sizeof(T) == sizeof(TI), "Index size must match lane");
3553 #if HWY_IS_DEBUG_BUILD
3554  const Rebind<TI, decltype(d)> di;
3555  HWY_DASSERT(AllFalse(di, Lt(vec, Zero(di))) &&
3556  AllTrue(di, Lt(vec, Set(di, N))));
3557 #endif
3558 
3559 #if HWY_TARGET <= HWY_AVX2
3560  (void)d;
3561  return Indices128<T, N>{vec.raw};
3562 #else
3563  const Repartition<uint8_t, decltype(d)> d8;
3564  using V8 = VFromD<decltype(d8)>;
3565  alignas(16) constexpr uint8_t kByteOffsets[16] = {0, 1, 2, 3, 0, 1, 2, 3,
3566  0, 1, 2, 3, 0, 1, 2, 3};
3567 
3568  // Broadcast each lane index to all 4 bytes of T
3569  alignas(16) constexpr uint8_t kBroadcastLaneBytes[16] = {
3570  0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12};
3571  const V8 lane_indices = TableLookupBytes(vec, Load(d8, kBroadcastLaneBytes));
3572 
3573  // Shift to bytes
3574  const Repartition<uint16_t, decltype(d)> d16;
3575  const V8 byte_indices = BitCast(d8, ShiftLeft<2>(BitCast(d16, lane_indices)));
3576 
3577  return Indices128<T, N>{Add(byte_indices, Load(d8, kByteOffsets)).raw};
3578 #endif
3579 }
3580 
3581 template <typename T, size_t N, typename TI, HWY_IF_LE128(T, N),
3582  HWY_IF_LANE_SIZE(T, 8)>
3583 HWY_API Indices128<T, N> IndicesFromVec(Simd<T, N, 0> d, Vec128<TI, N> vec) {
3584  static_assert(sizeof(T) == sizeof(TI), "Index size must match lane");
3585 #if HWY_IS_DEBUG_BUILD
3586  const Rebind<TI, decltype(d)> di;
3587  HWY_DASSERT(AllFalse(di, Lt(vec, Zero(di))) &&
3588  AllTrue(di, Lt(vec, Set(di, static_cast<TI>(N)))));
3589 #else
3590  (void)d;
3591 #endif
3592 
3593  // No change - even without AVX3, we can shuffle+blend.
3594  return Indices128<T, N>{vec.raw};
3595 }
3596 
3597 template <typename T, size_t N, typename TI, HWY_IF_LE128(T, N)>
3598 HWY_API Indices128<T, N> SetTableIndices(Simd<T, N, 0> d, const TI* idx) {
3599  const Rebind<TI, decltype(d)> di;
3600  return IndicesFromVec(d, LoadU(di, idx));
3601 }
3602 
3603 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
3605 #if HWY_TARGET <= HWY_AVX2
3606  const DFromV<decltype(v)> d;
3607  const RebindToFloat<decltype(d)> df;
3608  const Vec128<float, N> perm{_mm_permutevar_ps(BitCast(df, v).raw, idx.raw)};
3609  return BitCast(d, perm);
3610 #else
3611  return TableLookupBytes(v, Vec128<T, N>{idx.raw});
3612 #endif
3613 }
3614 
3615 template <size_t N, HWY_IF_GE64(float, N)>
3617  Indices128<float, N> idx) {
3618 #if HWY_TARGET <= HWY_AVX2
3619  return Vec128<float, N>{_mm_permutevar_ps(v.raw, idx.raw)};
3620 #else
3621  const DFromV<decltype(v)> df;
3622  const RebindToSigned<decltype(df)> di;
3623  return BitCast(df,
3625 #endif
3626 }
3627 
3628 // Single lane: no change
3629 template <typename T>
3631  Indices128<T, 1> /* idx */) {
3632  return v;
3633 }
3634 
3635 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
3637  const Full128<T> d;
3638  Vec128<int64_t> vidx{idx.raw};
3639 #if HWY_TARGET <= HWY_AVX2
3640  // There is no _mm_permute[x]var_epi64.
3641  vidx += vidx; // bit1 is the decider (unusual)
3642  const Full128<double> df;
3643  return BitCast(
3644  d, Vec128<double>{_mm_permutevar_pd(BitCast(df, v).raw, vidx.raw)});
3645 #else
3646  // Only 2 lanes: can swap+blend. Choose v if vidx == iota. To avoid a 64-bit
3647  // comparison (expensive on SSSE3), just invert the upper lane and subtract 1
3648  // to obtain an all-zero or all-one mask.
3649  const Full128<int64_t> di;
3650  const Vec128<int64_t> same = (vidx ^ Iota(di, 0)) - Set(di, 1);
3651  const Mask128<T> mask_same = RebindMask(d, MaskFromVec(same));
3652  return IfThenElse(mask_same, v, Shuffle01(v));
3653 #endif
3654 }
3655 
3657  Indices128<double> idx) {
3658  Vec128<int64_t> vidx{idx.raw};
3659 #if HWY_TARGET <= HWY_AVX2
3660  vidx += vidx; // bit1 is the decider (unusual)
3661  return Vec128<double>{_mm_permutevar_pd(v.raw, vidx.raw)};
3662 #else
3663  // Only 2 lanes: can swap+blend. Choose v if vidx == iota. To avoid a 64-bit
3664  // comparison (expensive on SSSE3), just invert the upper lane and subtract 1
3665  // to obtain an all-zero or all-one mask.
3666  const Full128<double> d;
3667  const Full128<int64_t> di;
3668  const Vec128<int64_t> same = (vidx ^ Iota(di, 0)) - Set(di, 1);
3669  const Mask128<double> mask_same = RebindMask(d, MaskFromVec(same));
3670  return IfThenElse(mask_same, v, Shuffle01(v));
3671 #endif
3672 }
3673 
3674 // ------------------------------ ReverseBlocks
3675 
3676 // Single block: no change
3677 template <typename T>
3678 HWY_API Vec128<T> ReverseBlocks(Full128<T> /* tag */, const Vec128<T> v) {
3679  return v;
3680 }
3681 
3682 // ------------------------------ Reverse (Shuffle0123, Shuffle2301)
3683 
3684 // Single lane: no change
3685 template <typename T>
3686 HWY_API Vec128<T, 1> Reverse(Simd<T, 1, 0> /* tag */, const Vec128<T, 1> v) {
3687  return v;
3688 }
3689 
3690 // Two lanes: shuffle
3691 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
3693  return Vec128<T, 2>{Shuffle2301(Vec128<T>{v.raw}).raw};
3694 }
3695 
3696 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
3697 HWY_API Vec128<T> Reverse(Full128<T> /* tag */, const Vec128<T> v) {
3698  return Shuffle01(v);
3699 }
3700 
3701 // Four lanes: shuffle
3702 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
3703 HWY_API Vec128<T> Reverse(Full128<T> /* tag */, const Vec128<T> v) {
3704  return Shuffle0123(v);
3705 }
3706 
3707 // 16-bit
3708 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
3709 HWY_API Vec128<T, N> Reverse(Simd<T, N, 0> d, const Vec128<T, N> v) {
3710 #if HWY_TARGET <= HWY_AVX3
3711  if (N == 1) return v;
3712  if (N == 2) {
3713  const Repartition<uint32_t, decltype(d)> du32;
3714  return BitCast(d, RotateRight<16>(BitCast(du32, v)));
3715  }
3716  const RebindToSigned<decltype(d)> di;
3717  alignas(16) constexpr int16_t kReverse[8] = {7, 6, 5, 4, 3, 2, 1, 0};
3718  const Vec128<int16_t, N> idx = Load(di, kReverse + (N == 8 ? 0 : 4));
3719  return BitCast(d, Vec128<int16_t, N>{
3720  _mm_permutexvar_epi16(idx.raw, BitCast(di, v).raw)});
3721 #else
3722  const RepartitionToWide<RebindToUnsigned<decltype(d)>> du32;
3723  return BitCast(d, RotateRight<16>(Reverse(du32, BitCast(du32, v))));
3724 #endif
3725 }
3726 
3727 // ------------------------------ Reverse2
3728 
3729 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
3730 HWY_API Vec128<T, N> Reverse2(Simd<T, N, 0> d, const Vec128<T, N> v) {
3731  const Repartition<uint32_t, decltype(d)> du32;
3732  return BitCast(d, RotateRight<16>(BitCast(du32, v)));
3733 }
3734 
3735 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
3736 HWY_API Vec128<T, N> Reverse2(Simd<T, N, 0> /* tag */, const Vec128<T, N> v) {
3737  return Shuffle2301(v);
3738 }
3739 
3740 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
3741 HWY_API Vec128<T, N> Reverse2(Simd<T, N, 0> /* tag */, const Vec128<T, N> v) {
3742  return Shuffle01(v);
3743 }
3744 
3745 // ------------------------------ Reverse4
3746 
3747 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
3748 HWY_API Vec128<T, N> Reverse4(Simd<T, N, 0> d, const Vec128<T, N> v) {
3749  const RebindToSigned<decltype(d)> di;
3750  // 4x 16-bit: a single shufflelo suffices.
3751  if (N == 4) {
3752  return BitCast(d, Vec128<int16_t, N>{_mm_shufflelo_epi16(
3753  BitCast(di, v).raw, _MM_SHUFFLE(0, 1, 2, 3))});
3754  }
3755 
3756 #if HWY_TARGET <= HWY_AVX3
3757  alignas(16) constexpr int16_t kReverse4[8] = {3, 2, 1, 0, 7, 6, 5, 4};
3758  const Vec128<int16_t, N> idx = Load(di, kReverse4);
3759  return BitCast(d, Vec128<int16_t, N>{
3760  _mm_permutexvar_epi16(idx.raw, BitCast(di, v).raw)});
3761 #else
3762  const RepartitionToWide<decltype(di)> dw;
3763  return Reverse2(d, BitCast(d, Shuffle2301(BitCast(dw, v))));
3764 #endif
3765 }
3766 
3767 // 4x 32-bit: use Shuffle0123
3768 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
3769 HWY_API Vec128<T> Reverse4(Full128<T> /* tag */, const Vec128<T> v) {
3770  return Shuffle0123(v);
3771 }
3772 
3773 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
3774 HWY_API Vec128<T, N> Reverse4(Simd<T, N, 0> /* tag */, Vec128<T, N> /* v */) {
3775  HWY_ASSERT(0); // don't have 4 u64 lanes
3776 }
3777 
3778 // ------------------------------ Reverse8
3779 
3780 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
3781 HWY_API Vec128<T, N> Reverse8(Simd<T, N, 0> d, const Vec128<T, N> v) {
3782 #if HWY_TARGET <= HWY_AVX3
3783  const RebindToSigned<decltype(d)> di;
3784  alignas(32) constexpr int16_t kReverse8[16] = {7, 6, 5, 4, 3, 2, 1, 0,
3785  15, 14, 13, 12, 11, 10, 9, 8};
3786  const Vec128<int16_t, N> idx = Load(di, kReverse8);
3787  return BitCast(d, Vec128<int16_t, N>{
3788  _mm_permutexvar_epi16(idx.raw, BitCast(di, v).raw)});
3789 #else
3790  const RepartitionToWide<decltype(d)> dw;
3791  return Reverse2(d, BitCast(d, Shuffle0123(BitCast(dw, v))));
3792 #endif
3793 }
3794 
3795 template <typename T, size_t N, HWY_IF_NOT_LANE_SIZE(T, 2)>
3796 HWY_API Vec128<T, N> Reverse8(Simd<T, N, 0> /* tag */, Vec128<T, N> /* v */) {
3797  HWY_ASSERT(0); // don't have 8 lanes unless 16-bit
3798 }
3799 
3800 // ------------------------------ InterleaveLower
3801 
3802 // Interleaves lanes from halves of the 128-bit blocks of "a" (which provides
3803 // the least-significant lane) and "b". To concatenate two half-width integers
3804 // into one, use ZipLower/Upper instead (also works with scalar).
3805 
3806 template <size_t N, HWY_IF_LE128(uint8_t, N)>
3808  const Vec128<uint8_t, N> b) {
3809  return Vec128<uint8_t, N>{_mm_unpacklo_epi8(a.raw, b.raw)};
3810 }
3811 template <size_t N, HWY_IF_LE128(uint16_t, N)>
3813  const Vec128<uint16_t, N> b) {
3814  return Vec128<uint16_t, N>{_mm_unpacklo_epi16(a.raw, b.raw)};
3815 }
3816 template <size_t N, HWY_IF_LE128(uint32_t, N)>
3818  const Vec128<uint32_t, N> b) {
3819  return Vec128<uint32_t, N>{_mm_unpacklo_epi32(a.raw, b.raw)};
3820 }
3821 template <size_t N, HWY_IF_LE128(uint64_t, N)>
3823  const Vec128<uint64_t, N> b) {
3824  return Vec128<uint64_t, N>{_mm_unpacklo_epi64(a.raw, b.raw)};
3825 }
3826 
3827 template <size_t N, HWY_IF_LE128(int8_t, N)>
3829  const Vec128<int8_t, N> b) {
3830  return Vec128<int8_t, N>{_mm_unpacklo_epi8(a.raw, b.raw)};
3831 }
3832 template <size_t N, HWY_IF_LE128(int16_t, N)>
3834  const Vec128<int16_t, N> b) {
3835  return Vec128<int16_t, N>{_mm_unpacklo_epi16(a.raw, b.raw)};
3836 }
3837 template <size_t N, HWY_IF_LE128(int32_t, N)>
3839  const Vec128<int32_t, N> b) {
3840  return Vec128<int32_t, N>{_mm_unpacklo_epi32(a.raw, b.raw)};
3841 }
3842 template <size_t N, HWY_IF_LE128(int64_t, N)>
3844  const Vec128<int64_t, N> b) {
3845  return Vec128<int64_t, N>{_mm_unpacklo_epi64(a.raw, b.raw)};
3846 }
3847 
3848 template <size_t N, HWY_IF_LE128(float, N)>
3849 HWY_API Vec128<float, N> InterleaveLower(const Vec128<float, N> a,
3850  const Vec128<float, N> b) {
3851  return Vec128<float, N>{_mm_unpacklo_ps(a.raw, b.raw)};
3852 }
3853 template <size_t N, HWY_IF_LE128(double, N)>
3855  const Vec128<double, N> b) {
3856  return Vec128<double, N>{_mm_unpacklo_pd(a.raw, b.raw)};
3857 }
3858 
3859 // Additional overload for the optional tag (also for 256/512).
3860 template <class V>
3861 HWY_API V InterleaveLower(DFromV<V> /* tag */, V a, V b) {
3862  return InterleaveLower(a, b);
3863 }
3864 
3865 // ------------------------------ InterleaveUpper (UpperHalf)
3866 
3867 // All functions inside detail lack the required D parameter.
3868 namespace detail {
3869 
3871  const Vec128<uint8_t> b) {
3872  return Vec128<uint8_t>{_mm_unpackhi_epi8(a.raw, b.raw)};
3873 }
3875  const Vec128<uint16_t> b) {
3876  return Vec128<uint16_t>{_mm_unpackhi_epi16(a.raw, b.raw)};
3877 }
3879  const Vec128<uint32_t> b) {
3880  return Vec128<uint32_t>{_mm_unpackhi_epi32(a.raw, b.raw)};
3881 }
3883  const Vec128<uint64_t> b) {
3884  return Vec128<uint64_t>{_mm_unpackhi_epi64(a.raw, b.raw)};
3885 }
3886 
3888  const Vec128<int8_t> b) {
3889  return Vec128<int8_t>{_mm_unpackhi_epi8(a.raw, b.raw)};
3890 }
3892  const Vec128<int16_t> b) {
3893  return Vec128<int16_t>{_mm_unpackhi_epi16(a.raw, b.raw)};
3894 }
3896  const Vec128<int32_t> b) {
3897  return Vec128<int32_t>{_mm_unpackhi_epi32(a.raw, b.raw)};
3898 }
3900  const Vec128<int64_t> b) {
3901  return Vec128<int64_t>{_mm_unpackhi_epi64(a.raw, b.raw)};
3902 }
3903 
3904 HWY_API Vec128<float> InterleaveUpper(const Vec128<float> a,
3905  const Vec128<float> b) {
3906  return Vec128<float>{_mm_unpackhi_ps(a.raw, b.raw)};
3907 }
3909  const Vec128<double> b) {
3910  return Vec128<double>{_mm_unpackhi_pd(a.raw, b.raw)};
3911 }
3912 
3913 } // namespace detail
3914 
3915 // Full
3916 template <typename T, class V = Vec128<T>>
3917 HWY_API V InterleaveUpper(Full128<T> /* tag */, V a, V b) {
3918  return detail::InterleaveUpper(a, b);
3919 }
3920 
3921 // Partial
3922 template <typename T, size_t N, HWY_IF_LE64(T, N), class V = Vec128<T, N>>
3923 HWY_API V InterleaveUpper(Simd<T, N, 0> d, V a, V b) {
3924  const Half<decltype(d)> d2;
3925  return InterleaveLower(d, V{UpperHalf(d2, a).raw}, V{UpperHalf(d2, b).raw});
3926 }
3927 
3928 // ------------------------------ ZipLower/ZipUpper (InterleaveLower)
3929 
3930 // Same as Interleave*, except that the return lanes are double-width integers;
3931 // this is necessary because the single-lane scalar cannot return two values.
3932 template <class V, class DW = RepartitionToWide<DFromV<V>>>
3933 HWY_API VFromD<DW> ZipLower(V a, V b) {
3934  return BitCast(DW(), InterleaveLower(a, b));
3935 }
3936 template <class V, class D = DFromV<V>, class DW = RepartitionToWide<D>>
3937 HWY_API VFromD<DW> ZipLower(DW dw, V a, V b) {
3938  return BitCast(dw, InterleaveLower(D(), a, b));
3939 }
3940 
3941 template <class V, class D = DFromV<V>, class DW = RepartitionToWide<D>>
3942 HWY_API VFromD<DW> ZipUpper(DW dw, V a, V b) {
3943  return BitCast(dw, InterleaveUpper(D(), a, b));
3944 }
3945 
3946 // ================================================== COMBINE
3947 
3948 // ------------------------------ Combine (InterleaveLower)
3949 
3950 // N = N/2 + N/2 (upper half undefined)
3951 template <typename T, size_t N, HWY_IF_LE128(T, N)>
3952 HWY_API Vec128<T, N> Combine(Simd<T, N, 0> d, Vec128<T, N / 2> hi_half,
3953  Vec128<T, N / 2> lo_half) {
3954  const Half<decltype(d)> d2;
3955  const RebindToUnsigned<decltype(d2)> du2;
3956  // Treat half-width input as one lane, and expand to two lanes.
3957  using VU = Vec128<UnsignedFromSize<N * sizeof(T) / 2>, 2>;
3958  const VU lo{BitCast(du2, lo_half).raw};
3959  const VU hi{BitCast(du2, hi_half).raw};
3960  return BitCast(d, InterleaveLower(lo, hi));
3961 }
3962 
3963 // ------------------------------ ZeroExtendVector (Combine, IfThenElseZero)
3964 
3965 template <typename T, HWY_IF_NOT_FLOAT(T)>
3967  return Vec128<T>{_mm_move_epi64(lo.raw)};
3968 }
3969 
3970 template <typename T, HWY_IF_FLOAT(T)>
3971 HWY_API Vec128<T> ZeroExtendVector(Full128<T> d, Vec64<T> lo) {
3972  const RebindToUnsigned<decltype(d)> du;
3973  return BitCast(d, ZeroExtendVector(du, BitCast(Half<decltype(du)>(), lo)));
3974 }
3975 
3976 template <typename T, size_t N, HWY_IF_LE64(T, N)>
3978  return IfThenElseZero(FirstN(d, N / 2), Vec128<T, N>{lo.raw});
3979 }
3980 
3981 // ------------------------------ Concat full (InterleaveLower)
3982 
3983 // hiH,hiL loH,loL |-> hiL,loL (= lower halves)
3984 template <typename T>
3985 HWY_API Vec128<T> ConcatLowerLower(Full128<T> d, Vec128<T> hi, Vec128<T> lo) {
3986  const Repartition<uint64_t, decltype(d)> d64;
3987  return BitCast(d, InterleaveLower(BitCast(d64, lo), BitCast(d64, hi)));
3988 }
3989 
3990 // hiH,hiL loH,loL |-> hiH,loH (= upper halves)
3991 template <typename T>
3992 HWY_API Vec128<T> ConcatUpperUpper(Full128<T> d, Vec128<T> hi, Vec128<T> lo) {
3993  const Repartition<uint64_t, decltype(d)> d64;
3994  return BitCast(d, InterleaveUpper(d64, BitCast(d64, lo), BitCast(d64, hi)));
3995 }
3996 
3997 // hiH,hiL loH,loL |-> hiL,loH (= inner halves)
3998 template <typename T>
3999 HWY_API Vec128<T> ConcatLowerUpper(Full128<T> d, const Vec128<T> hi,
4000  const Vec128<T> lo) {
4001  return CombineShiftRightBytes<8>(d, hi, lo);
4002 }
4003 
4004 // hiH,hiL loH,loL |-> hiH,loL (= outer halves)
4005 template <typename T>
4007 #if HWY_TARGET == HWY_SSSE3
4008  const Full128<double> dd;
4009  const __m128d concat = _mm_move_sd(BitCast(dd, hi).raw, BitCast(dd, lo).raw);
4010  return BitCast(d, Vec128<double>{concat});
4011 #else
4012  (void)d;
4013  return Vec128<T>{_mm_blend_epi16(hi.raw, lo.raw, 0x0F)};
4014 #endif
4015 }
4017  const Vec128<float> hi,
4018  const Vec128<float> lo) {
4019  return Vec128<float>{_mm_shuffle_ps(lo.raw, hi.raw, _MM_SHUFFLE(3, 2, 1, 0))};
4020 }
4022  const Vec128<double> hi,
4023  const Vec128<double> lo) {
4024  return Vec128<double>{_mm_shuffle_pd(lo.raw, hi.raw, _MM_SHUFFLE2(1, 0))};
4025 }
4026 
4027 // ------------------------------ Concat partial (Combine, LowerHalf)
4028 
4029 template <typename T, size_t N, HWY_IF_LE64(T, N)>
4030 HWY_API Vec128<T, N> ConcatLowerLower(Simd<T, N, 0> d, Vec128<T, N> hi,
4031  Vec128<T, N> lo) {
4032  const Half<decltype(d)> d2;
4033  return Combine(d, LowerHalf(d2, hi), LowerHalf(d2, lo));
4034 }
4035 
4036 template <typename T, size_t N, HWY_IF_LE64(T, N)>
4037 HWY_API Vec128<T, N> ConcatUpperUpper(Simd<T, N, 0> d, Vec128<T, N> hi,
4038  Vec128<T, N> lo) {
4039  const Half<decltype(d)> d2;
4040  return Combine(d, UpperHalf(d2, hi), UpperHalf(d2, lo));
4041 }
4042 
4043 template <typename T, size_t N, HWY_IF_LE64(T, N)>
4044 HWY_API Vec128<T, N> ConcatLowerUpper(Simd<T, N, 0> d, const Vec128<T, N> hi,
4045  const Vec128<T, N> lo) {
4046  const Half<decltype(d)> d2;
4047  return Combine(d, LowerHalf(d2, hi), UpperHalf(d2, lo));
4048 }
4049 
4050 template <typename T, size_t N, HWY_IF_LE64(T, N)>
4052  Vec128<T, N> lo) {
4053  const Half<decltype(d)> d2;
4054  return Combine(d, UpperHalf(d2, hi), LowerHalf(d2, lo));
4055 }
4056 
4057 // ------------------------------ ConcatOdd
4058 
4059 // 32-bit full
4060 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
4061 HWY_API Vec128<T> ConcatOdd(Full128<T> d, Vec128<T> hi, Vec128<T> lo) {
4062  const RebindToFloat<decltype(d)> df;
4063  return BitCast(
4064  d, Vec128<float>{_mm_shuffle_ps(BitCast(df, lo).raw, BitCast(df, hi).raw,
4065  _MM_SHUFFLE(3, 1, 3, 1))});
4066 }
4067 template <size_t N>
4068 HWY_API Vec128<float> ConcatOdd(Full128<float> /* tag */, Vec128<float> hi,
4069  Vec128<float> lo) {
4070  return Vec128<float>{_mm_shuffle_ps(lo.raw, hi.raw, _MM_SHUFFLE(3, 1, 3, 1))};
4071 }
4072 
4073 // 32-bit partial
4074 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
4076  return InterleaveUpper(d, lo, hi);
4077 }
4078 
4079 // 64-bit full - no partial because we need at least two inputs to have
4080 // even/odd.
4081 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
4082 HWY_API Vec128<T> ConcatOdd(Full128<T> d, Vec128<T> hi, Vec128<T> lo) {
4083  return InterleaveUpper(d, lo, hi);
4084 }
4085 
4086 // ------------------------------ ConcatEven (InterleaveLower)
4087 
4088 // 32-bit full
4089 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
4090 HWY_API Vec128<T> ConcatEven(Full128<T> d, Vec128<T> hi, Vec128<T> lo) {
4091  const RebindToFloat<decltype(d)> df;
4092  return BitCast(
4093  d, Vec128<float>{_mm_shuffle_ps(BitCast(df, lo).raw, BitCast(df, hi).raw,
4094  _MM_SHUFFLE(2, 0, 2, 0))});
4095 }
4096 template <size_t N>
4097 HWY_API Vec128<float> ConcatEven(Full128<float> /* tag */, Vec128<float> hi,
4098  Vec128<float> lo) {
4099  return Vec128<float>{_mm_shuffle_ps(lo.raw, hi.raw, _MM_SHUFFLE(2, 0, 2, 0))};
4100 }
4101 
4102 // 32-bit partial
4103 template <typename T, HWY_IF_LANE_SIZE(T, 4)>
4105  return InterleaveLower(d, lo, hi);
4106 }
4107 
4108 // 64-bit full - no partial because we need at least two inputs to have
4109 // even/odd.
4110 template <typename T, HWY_IF_LANE_SIZE(T, 8)>
4111 HWY_API Vec128<T> ConcatEven(Full128<T> d, Vec128<T> hi, Vec128<T> lo) {
4112  return InterleaveLower(d, lo, hi);
4113 }
4114 
4115 // ------------------------------ DupEven (InterleaveLower)
4116 
4117 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
4118 HWY_API Vec128<T, N> DupEven(Vec128<T, N> v) {
4119  return Vec128<T, N>{_mm_shuffle_epi32(v.raw, _MM_SHUFFLE(2, 2, 0, 0))};
4120 }
4121 template <size_t N>
4123  return Vec128<float, N>{
4124  _mm_shuffle_ps(v.raw, v.raw, _MM_SHUFFLE(2, 2, 0, 0))};
4125 }
4126 
4127 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
4128 HWY_API Vec128<T, N> DupEven(const Vec128<T, N> v) {
4129  return InterleaveLower(DFromV<decltype(v)>(), v, v);
4130 }
4131 
4132 // ------------------------------ DupOdd (InterleaveUpper)
4133 
4134 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
4135 HWY_API Vec128<T, N> DupOdd(Vec128<T, N> v) {
4136  return Vec128<T, N>{_mm_shuffle_epi32(v.raw, _MM_SHUFFLE(3, 3, 1, 1))};
4137 }
4138 template <size_t N>
4140  return Vec128<float, N>{
4141  _mm_shuffle_ps(v.raw, v.raw, _MM_SHUFFLE(3, 3, 1, 1))};
4142 }
4143 
4144 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
4145 HWY_API Vec128<T, N> DupOdd(const Vec128<T, N> v) {
4146  return InterleaveUpper(DFromV<decltype(v)>(), v, v);
4147 }
4148 
4149 // ------------------------------ OddEven (IfThenElse)
4150 
4151 namespace detail {
4152 
4153 template <typename T, size_t N>
4154 HWY_INLINE Vec128<T, N> OddEven(hwy::SizeTag<1> /* tag */, const Vec128<T, N> a,
4155  const Vec128<T, N> b) {
4156  const DFromV<decltype(a)> d;
4157  const Repartition<uint8_t, decltype(d)> d8;
4158  alignas(16) constexpr uint8_t mask[16] = {0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0,
4159  0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0};
4160  return IfThenElse(MaskFromVec(BitCast(d, Load(d8, mask))), b, a);
4161 }
4162 template <typename T, size_t N>
4163 HWY_INLINE Vec128<T, N> OddEven(hwy::SizeTag<2> /* tag */, const Vec128<T, N> a,
4164  const Vec128<T, N> b) {
4165 #if HWY_TARGET == HWY_SSSE3
4166  const DFromV<decltype(a)> d;
4167  const Repartition<uint8_t, decltype(d)> d8;
4168  alignas(16) constexpr uint8_t mask[16] = {0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0,
4169  0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0};
4170  return IfThenElse(MaskFromVec(BitCast(d, Load(d8, mask))), b, a);
4171 #else
4172  return Vec128<T, N>{_mm_blend_epi16(a.raw, b.raw, 0x55)};
4173 #endif
4174 }
4175 template <typename T, size_t N>
4176 HWY_INLINE Vec128<T, N> OddEven(hwy::SizeTag<4> /* tag */, const Vec128<T, N> a,
4177  const Vec128<T, N> b) {
4178 #if HWY_TARGET == HWY_SSSE3
4179  const __m128i odd = _mm_shuffle_epi32(a.raw, _MM_SHUFFLE(3, 1, 3, 1));
4180  const __m128i even = _mm_shuffle_epi32(b.raw, _MM_SHUFFLE(2, 0, 2, 0));
4181  return Vec128<T, N>{_mm_unpacklo_epi32(even, odd)};
4182 #else
4183  return Vec128<T, N>{_mm_blend_epi16(a.raw, b.raw, 0x33)};
4184 #endif
4185 }
4186 template <typename T, size_t N>
4187 HWY_INLINE Vec128<T, N> OddEven(hwy::SizeTag<8> /* tag */, const Vec128<T, N> a,
4188  const Vec128<T, N> b) {
4189 #if HWY_TARGET == HWY_SSSE3
4190  const Full128<double> dd;
4191  const __m128d concat = _mm_move_sd(BitCast(dd, a).raw, BitCast(dd, b).raw);
4192  return BitCast(Full128<T>(), Vec128<double>{concat});
4193 #else
4194  return Vec128<T, N>{_mm_blend_epi16(a.raw, b.raw, 0x0F)};
4195 #endif
4196 }
4197 
4198 } // namespace detail
4199 
4200 template <typename T, size_t N>
4201 HWY_API Vec128<T, N> OddEven(const Vec128<T, N> a, const Vec128<T, N> b) {
4202  return detail::OddEven(hwy::SizeTag<sizeof(T)>(), a, b);
4203 }
4204 template <size_t N>
4205 HWY_API Vec128<float, N> OddEven(const Vec128<float, N> a,
4206  const Vec128<float, N> b) {
4207 #if HWY_TARGET == HWY_SSSE3
4208  // SHUFPS must fill the lower half of the output from one register, so we
4209  // need another shuffle. Unpack avoids another immediate byte.
4210  const __m128 odd = _mm_shuffle_ps(a.raw, a.raw, _MM_SHUFFLE(3, 1, 3, 1));
4211  const __m128 even = _mm_shuffle_ps(b.raw, b.raw, _MM_SHUFFLE(2, 0, 2, 0));
4212  return Vec128<float, N>{_mm_unpacklo_ps(even, odd)};
4213 #else
4214  return Vec128<float, N>{_mm_blend_ps(a.raw, b.raw, 5)};
4215 #endif
4216 }
4217 
4218 template <size_t N>
4220  const Vec128<double, N> b) {
4221  return Vec128<double>{_mm_shuffle_pd(b.raw, a.raw, _MM_SHUFFLE2(1, 0))};
4222 }
4223 
4224 // ------------------------------ OddEvenBlocks
4225 template <typename T, size_t N>
4226 HWY_API Vec128<T, N> OddEvenBlocks(Vec128<T, N> /* odd */, Vec128<T, N> even) {
4227  return even;
4228 }
4229 
4230 // ------------------------------ SwapAdjacentBlocks
4231 
4232 template <typename T, size_t N>
4233 HWY_API Vec128<T, N> SwapAdjacentBlocks(Vec128<T, N> v) {
4234  return v;
4235 }
4236 
4237 // ------------------------------ Shl (ZipLower, Mul)
4238 
4239 // Use AVX2/3 variable shifts where available, otherwise multiply by powers of
4240 // two from loading float exponents, which is considerably faster (according
4241 // to LLVM-MCA) than scalar or testing bits: https://gcc.godbolt.org/z/9G7Y9v.
4242 
4243 #if HWY_TARGET > HWY_AVX3 // AVX2 or older
4244 namespace detail {
4245 
4246 // Returns 2^v for use as per-lane multipliers to emulate 16-bit shifts.
4247 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
4248 HWY_INLINE Vec128<MakeUnsigned<T>, N> Pow2(const Vec128<T, N> v) {
4249  const DFromV<decltype(v)> d;
4250  const RepartitionToWide<decltype(d)> dw;
4251  const Rebind<float, decltype(dw)> df;
4252  const auto zero = Zero(d);
4253  // Move into exponent (this u16 will become the upper half of an f32)
4254  const auto exp = ShiftLeft<23 - 16>(v);
4255  const auto upper = exp + Set(d, 0x3F80); // upper half of 1.0f
4256  // Insert 0 into lower halves for reinterpreting as binary32.
4257  const auto f0 = ZipLower(dw, zero, upper);
4258  const auto f1 = ZipUpper(dw, zero, upper);
4259  // See comment below.
4260  const Vec128<int32_t, N> bits0{_mm_cvtps_epi32(BitCast(df, f0).raw)};
4261  const Vec128<int32_t, N> bits1{_mm_cvtps_epi32(BitCast(df, f1).raw)};
4262  return Vec128<MakeUnsigned<T>, N>{_mm_packus_epi32(bits0.raw, bits1.raw)};
4263 }
4264 
4265 // Same, for 32-bit shifts.
4266 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
4267 HWY_INLINE Vec128<MakeUnsigned<T>, N> Pow2(const Vec128<T, N> v) {
4268  const DFromV<decltype(v)> d;
4269  const auto exp = ShiftLeft<23>(v);
4270  const auto f = exp + Set(d, 0x3F800000); // 1.0f
4271  // Do not use ConvertTo because we rely on the native 0x80..00 overflow
4272  // behavior. cvt instead of cvtt should be equivalent, but avoids test
4273  // failure under GCC 10.2.1.
4274  return Vec128<MakeUnsigned<T>, N>{_mm_cvtps_epi32(_mm_castsi128_ps(f.raw))};
4275 }
4276 
4277 } // namespace detail
4278 #endif // HWY_TARGET > HWY_AVX3
4279 
4280 template <size_t N>
4282  const Vec128<uint16_t, N> bits) {
4283 #if HWY_TARGET <= HWY_AVX3
4284  return Vec128<uint16_t, N>{_mm_sllv_epi16(v.raw, bits.raw)};
4285 #else
4286  return v * detail::Pow2(bits);
4287 #endif
4288 }
4290  const Vec128<uint16_t, 1> bits) {
4291  return Vec128<uint16_t, 1>{_mm_sll_epi16(v.raw, bits.raw)};
4292 }
4293 
4294 template <size_t N>
4296  const Vec128<uint32_t, N> bits) {
4297 #if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
4298  return v * detail::Pow2(bits);
4299 #else
4300  return Vec128<uint32_t, N>{_mm_sllv_epi32(v.raw, bits.raw)};
4301 #endif
4302 }
4304  const Vec128<uint32_t, 1> bits) {
4305  return Vec128<uint32_t, 1>{_mm_sll_epi32(v.raw, bits.raw)};
4306 }
4307 
4308 HWY_API Vec128<uint64_t> operator<<(const Vec128<uint64_t> v,
4309  const Vec128<uint64_t> bits) {
4310 #if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
4311  // Individual shifts and combine
4312  const Vec128<uint64_t> out0{_mm_sll_epi64(v.raw, bits.raw)};
4313  const __m128i bits1 = _mm_unpackhi_epi64(bits.raw, bits.raw);
4314  const Vec128<uint64_t> out1{_mm_sll_epi64(v.raw, bits1)};
4315  return ConcatUpperLower(Full128<uint64_t>(), out1, out0);
4316 #else
4317  return Vec128<uint64_t>{_mm_sllv_epi64(v.raw, bits.raw)};
4318 #endif
4319 }
4320 HWY_API Vec64<uint64_t> operator<<(const Vec64<uint64_t> v,
4321  const Vec64<uint64_t> bits) {
4322  return Vec64<uint64_t>{_mm_sll_epi64(v.raw, bits.raw)};
4323 }
4324 
4325 // Signed left shift is the same as unsigned.
4326 template <typename T, size_t N, HWY_IF_SIGNED(T)>
4327 HWY_API Vec128<T, N> operator<<(const Vec128<T, N> v, const Vec128<T, N> bits) {
4328  const DFromV<decltype(v)> di;
4329  const RebindToUnsigned<decltype(di)> du;
4330  return BitCast(di, BitCast(du, v) << BitCast(du, bits));
4331 }
4332 
4333 // ------------------------------ Shr (mul, mask, BroadcastSignBit)
4334 
4335 // Use AVX2+ variable shifts except for SSSE3/SSE4 or 16-bit. There, we use
4336 // widening multiplication by powers of two obtained by loading float exponents,
4337 // followed by a constant right-shift. This is still faster than a scalar or
4338 // bit-test approach: https://gcc.godbolt.org/z/9G7Y9v.
4339 
4340 template <size_t N>
4342  const Vec128<uint16_t, N> bits) {
4343 #if HWY_TARGET <= HWY_AVX3
4344  return Vec128<uint16_t, N>{_mm_srlv_epi16(in.raw, bits.raw)};
4345 #else
4346  const Simd<uint16_t, N, 0> d;
4347  // For bits=0, we cannot mul by 2^16, so fix the result later.
4348  const auto out = MulHigh(in, detail::Pow2(Set(d, 16) - bits));
4349  // Replace output with input where bits == 0.
4350  return IfThenElse(bits == Zero(d), in, out);
4351 #endif
4352 }
4354  const Vec128<uint16_t, 1> bits) {
4355  return Vec128<uint16_t, 1>{_mm_srl_epi16(in.raw, bits.raw)};
4356 }
4357 
4358 template <size_t N>
4360  const Vec128<uint32_t, N> bits) {
4361 #if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
4362  // 32x32 -> 64 bit mul, then shift right by 32.
4363  const Simd<uint32_t, N, 0> d32;
4364  // Move odd lanes into position for the second mul. Shuffle more gracefully
4365  // handles N=1 than repartitioning to u64 and shifting 32 bits right.
4366  const Vec128<uint32_t, N> in31{_mm_shuffle_epi32(in.raw, 0x31)};
4367  // For bits=0, we cannot mul by 2^32, so fix the result later.
4368  const auto mul = detail::Pow2(Set(d32, 32) - bits);
4369  const auto out20 = ShiftRight<32>(MulEven(in, mul)); // z 2 z 0
4370  const Vec128<uint32_t, N> mul31{_mm_shuffle_epi32(mul.raw, 0x31)};
4371  // No need to shift right, already in the correct position.
4372  const auto out31 = BitCast(d32, MulEven(in31, mul31)); // 3 ? 1 ?
4373  const Vec128<uint32_t, N> out = OddEven(out31, BitCast(d32, out20));
4374  // Replace output with input where bits == 0.
4375  return IfThenElse(bits == Zero(d32), in, out);
4376 #else
4377  return Vec128<uint32_t, N>{_mm_srlv_epi32(in.raw, bits.raw)};
4378 #endif
4379 }
4381  const Vec128<uint32_t, 1> bits) {
4382  return Vec128<uint32_t, 1>{_mm_srl_epi32(in.raw, bits.raw)};
4383 }
4384 
4385 HWY_API Vec128<uint64_t> operator>>(const Vec128<uint64_t> v,
4386  const Vec128<uint64_t> bits) {
4387 #if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
4388  // Individual shifts and combine
4389  const Vec128<uint64_t> out0{_mm_srl_epi64(v.raw, bits.raw)};
4390  const __m128i bits1 = _mm_unpackhi_epi64(bits.raw, bits.raw);
4391  const Vec128<uint64_t> out1{_mm_srl_epi64(v.raw, bits1)};
4392  return ConcatUpperLower(Full128<uint64_t>(), out1, out0);
4393 #else
4394  return Vec128<uint64_t>{_mm_srlv_epi64(v.raw, bits.raw)};
4395 #endif
4396 }
4397 HWY_API Vec64<uint64_t> operator>>(const Vec64<uint64_t> v,
4398  const Vec64<uint64_t> bits) {
4399  return Vec64<uint64_t>{_mm_srl_epi64(v.raw, bits.raw)};
4400 }
4401 
4402 #if HWY_TARGET > HWY_AVX3 // AVX2 or older
4403 namespace detail {
4404 
4405 // Also used in x86_256-inl.h.
4406 template <class DI, class V>
4407 HWY_INLINE V SignedShr(const DI di, const V v, const V count_i) {
4408  const RebindToUnsigned<DI> du;
4409  const auto count = BitCast(du, count_i); // same type as value to shift
4410  // Clear sign and restore afterwards. This is preferable to shifting the MSB
4411  // downwards because Shr is somewhat more expensive than Shl.
4412  const auto sign = BroadcastSignBit(v);
4413  const auto abs = BitCast(du, v ^ sign); // off by one, but fixed below
4414  return BitCast(di, abs >> count) ^ sign;
4415 }
4416 
4417 } // namespace detail
4418 #endif // HWY_TARGET > HWY_AVX3
4419 
4420 template <size_t N>
4422  const Vec128<int16_t, N> bits) {
4423 #if HWY_TARGET <= HWY_AVX3
4424  return Vec128<int16_t, N>{_mm_srav_epi16(v.raw, bits.raw)};
4425 #else
4426  return detail::SignedShr(Simd<int16_t, N, 0>(), v, bits);
4427 #endif
4428 }
4430  const Vec128<int16_t, 1> bits) {
4431  return Vec128<int16_t, 1>{_mm_sra_epi16(v.raw, bits.raw)};
4432 }
4433 
4434 template <size_t N>
4436  const Vec128<int32_t, N> bits) {
4437 #if HWY_TARGET <= HWY_AVX3
4438  return Vec128<int32_t, N>{_mm_srav_epi32(v.raw, bits.raw)};
4439 #else
4440  return detail::SignedShr(Simd<int32_t, N, 0>(), v, bits);
4441 #endif
4442 }
4444  const Vec128<int32_t, 1> bits) {
4445  return Vec128<int32_t, 1>{_mm_sra_epi32(v.raw, bits.raw)};
4446 }
4447 
4448 template <size_t N>
4450  const Vec128<int64_t, N> bits) {
4451 #if HWY_TARGET <= HWY_AVX3
4452  return Vec128<int64_t, N>{_mm_srav_epi64(v.raw, bits.raw)};
4453 #else
4454  return detail::SignedShr(Simd<int64_t, N, 0>(), v, bits);
4455 #endif
4456 }
4457 
4458 // ------------------------------ MulEven/Odd 64x64 (UpperHalf)
4459 
4460 HWY_INLINE Vec128<uint64_t> MulEven(const Vec128<uint64_t> a,
4461  const Vec128<uint64_t> b) {
4462  alignas(16) uint64_t mul[2];
4463  mul[0] = Mul128(GetLane(a), GetLane(b), &mul[1]);
4464  return Load(Full128<uint64_t>(), mul);
4465 }
4466 
4467 HWY_INLINE Vec128<uint64_t> MulOdd(const Vec128<uint64_t> a,
4468  const Vec128<uint64_t> b) {
4469  alignas(16) uint64_t mul[2];
4470  const Half<Full128<uint64_t>> d2;
4471  mul[0] =
4472  Mul128(GetLane(UpperHalf(d2, a)), GetLane(UpperHalf(d2, b)), &mul[1]);
4473  return Load(Full128<uint64_t>(), mul);
4474 }
4475 
4476 // ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)
4477 
4478 template <size_t N>
4479 HWY_API Vec128<float, N> ReorderWidenMulAccumulate(Simd<float, N, 0> df32,
4480  Vec128<bfloat16_t, 2 * N> a,
4481  Vec128<bfloat16_t, 2 * N> b,
4482  const Vec128<float, N> sum0,
4483  Vec128<float, N>& sum1) {
4484  // TODO(janwas): _mm_dpbf16_ps when available
4485  const Repartition<uint16_t, decltype(df32)> du16;
4486  const RebindToUnsigned<decltype(df32)> du32;
4487  const Vec128<uint16_t, 2 * N> zero = Zero(du16);
4488  // Lane order within sum0/1 is undefined, hence we can avoid the
4489  // longer-latency lane-crossing PromoteTo.
4490  const Vec128<uint32_t, N> a0 = ZipLower(du32, zero, BitCast(du16, a));
4491  const Vec128<uint32_t, N> a1 = ZipUpper(du32, zero, BitCast(du16, a));
4492  const Vec128<uint32_t, N> b0 = ZipLower(du32, zero, BitCast(du16, b));
4493  const Vec128<uint32_t, N> b1 = ZipUpper(du32, zero, BitCast(du16, b));
4494  sum1 = MulAdd(BitCast(df32, a1), BitCast(df32, b1), sum1);
4495  return MulAdd(BitCast(df32, a0), BitCast(df32, b0), sum0);
4496 }
4497 
4498 // ================================================== CONVERT
4499 
4500 // ------------------------------ Promotions (part w/ narrow lanes -> full)
4501 
4502 // Unsigned: zero-extend.
4503 template <size_t N>
4504 HWY_API Vec128<uint16_t, N> PromoteTo(Simd<uint16_t, N, 0> /* tag */,
4505  const Vec128<uint8_t, N> v) {
4506 #if HWY_TARGET == HWY_SSSE3
4507  const __m128i zero = _mm_setzero_si128();
4508  return Vec128<uint16_t, N>{_mm_unpacklo_epi8(v.raw, zero)};
4509 #else
4510  return Vec128<uint16_t, N>{_mm_cvtepu8_epi16(v.raw)};
4511 #endif
4512 }
4513 template <size_t N>
4514 HWY_API Vec128<uint32_t, N> PromoteTo(Simd<uint32_t, N, 0> /* tag */,
4515  const Vec128<uint16_t, N> v) {
4516 #if HWY_TARGET == HWY_SSSE3
4517  return Vec128<uint32_t, N>{_mm_unpacklo_epi16(v.raw, _mm_setzero_si128())};
4518 #else
4519  return Vec128<uint32_t, N>{_mm_cvtepu16_epi32(v.raw)};
4520 #endif
4521 }
4522 template <size_t N>
4523 HWY_API Vec128<uint64_t, N> PromoteTo(Simd<uint64_t, N, 0> /* tag */,
4524  const Vec128<uint32_t, N> v) {
4525 #if HWY_TARGET == HWY_SSSE3
4526  return Vec128<uint64_t, N>{_mm_unpacklo_epi32(v.raw, _mm_setzero_si128())};
4527 #else
4528  return Vec128<uint64_t, N>{_mm_cvtepu32_epi64(v.raw)};
4529 #endif
4530 }
4531 template <size_t N>
4532 HWY_API Vec128<uint32_t, N> PromoteTo(Simd<uint32_t, N, 0> /* tag */,
4533  const Vec128<uint8_t, N> v) {
4534 #if HWY_TARGET == HWY_SSSE3
4535  const __m128i zero = _mm_setzero_si128();
4536  const __m128i u16 = _mm_unpacklo_epi8(v.raw, zero);
4537  return Vec128<uint32_t, N>{_mm_unpacklo_epi16(u16, zero)};
4538 #else
4539  return Vec128<uint32_t, N>{_mm_cvtepu8_epi32(v.raw)};
4540 #endif
4541 }
4542 
4543 // Unsigned to signed: same plus cast.
4544 template <size_t N>
4545 HWY_API Vec128<int16_t, N> PromoteTo(Simd<int16_t, N, 0> di,
4546  const Vec128<uint8_t, N> v) {
4547  return BitCast(di, PromoteTo(Simd<uint16_t, N, 0>(), v));
4548 }
4549 template <size_t N>
4550 HWY_API Vec128<int32_t, N> PromoteTo(Simd<int32_t, N, 0> di,
4551  const Vec128<uint16_t, N> v) {
4552  return BitCast(di, PromoteTo(Simd<uint32_t, N, 0>(), v));
4553 }
4554 template <size_t N>
4555 HWY_API Vec128<int32_t, N> PromoteTo(Simd<int32_t, N, 0> di,
4556  const Vec128<uint8_t, N> v) {
4557  return BitCast(di, PromoteTo(Simd<uint32_t, N, 0>(), v));
4558 }
4559 
4560 // Signed: replicate sign bit.
4561 template <size_t N>
4562 HWY_API Vec128<int16_t, N> PromoteTo(Simd<int16_t, N, 0> /* tag */,
4563  const Vec128<int8_t, N> v) {
4564 #if HWY_TARGET == HWY_SSSE3
4565  return ShiftRight<8>(Vec128<int16_t, N>{_mm_unpacklo_epi8(v.raw, v.raw)});
4566 #else
4567  return Vec128<int16_t, N>{_mm_cvtepi8_epi16(v.raw)};
4568 #endif
4569 }
4570 template <size_t N>
4571 HWY_API Vec128<int32_t, N> PromoteTo(Simd<int32_t, N, 0> /* tag */,
4572  const Vec128<int16_t, N> v) {
4573 #if HWY_TARGET == HWY_SSSE3
4574  return ShiftRight<16>(Vec128<int32_t, N>{_mm_unpacklo_epi16(v.raw, v.raw)});
4575 #else
4576  return Vec128<int32_t, N>{_mm_cvtepi16_epi32(v.raw)};
4577 #endif
4578 }
4579 template <size_t N>
4580 HWY_API Vec128<int64_t, N> PromoteTo(Simd<int64_t, N, 0> /* tag */,
4581  const Vec128<int32_t, N> v) {
4582 #if HWY_TARGET == HWY_SSSE3
4583  return ShiftRight<32>(Vec128<int64_t, N>{_mm_unpacklo_epi32(v.raw, v.raw)});
4584 #else
4585  return Vec128<int64_t, N>{_mm_cvtepi32_epi64(v.raw)};
4586 #endif
4587 }
4588 template <size_t N>
4589 HWY_API Vec128<int32_t, N> PromoteTo(Simd<int32_t, N, 0> /* tag */,
4590  const Vec128<int8_t, N> v) {
4591 #if HWY_TARGET == HWY_SSSE3
4592  const __m128i x2 = _mm_unpacklo_epi8(v.raw, v.raw);
4593  const __m128i x4 = _mm_unpacklo_epi16(x2, x2);
4594  return ShiftRight<24>(Vec128<int32_t, N>{x4});
4595 #else
4596  return Vec128<int32_t, N>{_mm_cvtepi8_epi32(v.raw)};
4597 #endif
4598 }
4599 
4600 // Workaround for origin tracking bug in Clang msan prior to 11.0
4601 // (spurious "uninitialized memory" for TestF16 with "ORIGIN: invalid")
4602 #if HWY_IS_MSAN && (HWY_COMPILER_CLANG != 0 && HWY_COMPILER_CLANG < 1100)
4603 #define HWY_INLINE_F16 HWY_NOINLINE
4604 #else
4605 #define HWY_INLINE_F16 HWY_INLINE
4606 #endif
4607 template <size_t N>
4609  const Vec128<float16_t, N> v) {
4610 #if HWY_TARGET >= HWY_SSE4 || defined(HWY_DISABLE_F16C)
4611  const RebindToSigned<decltype(df32)> di32;
4612  const RebindToUnsigned<decltype(df32)> du32;
4613  // Expand to u32 so we can shift.
4614  const auto bits16 = PromoteTo(du32, Vec128<uint16_t, N>{v.raw});
4615  const auto sign = ShiftRight<15>(bits16);
4616  const auto biased_exp = ShiftRight<10>(bits16) & Set(du32, 0x1F);
4617  const auto mantissa = bits16 & Set(du32, 0x3FF);
4618  const auto subnormal =
4619  BitCast(du32, ConvertTo(df32, BitCast(di32, mantissa)) *
4620  Set(df32, 1.0f / 16384 / 1024));
4621 
4622  const auto biased_exp32 = biased_exp + Set(du32, 127 - 15);
4623  const auto mantissa32 = ShiftLeft<23 - 10>(mantissa);
4624  const auto normal = ShiftLeft<23>(biased_exp32) | mantissa32;
4625  const auto bits32 = IfThenElse(biased_exp == Zero(du32), subnormal, normal);
4626  return BitCast(df32, ShiftLeft<31>(sign) | bits32);
4627 #else
4628  (void)df32;
4629  return Vec128<float, N>{_mm_cvtph_ps(v.raw)};
4630 #endif
4631 }
4632 
4633 template <size_t N>
4634 HWY_API Vec128<float, N> PromoteTo(Simd<float, N, 0> df32,
4635  const Vec128<bfloat16_t, N> v) {
4636  const Rebind<uint16_t, decltype(df32)> du16;
4637  const RebindToSigned<decltype(df32)> di32;
4638  return BitCast(df32, ShiftLeft<16>(PromoteTo(di32, BitCast(du16, v))));
4639 }
4640 
4641 template <size_t N>
4643  const Vec128<float, N> v) {
4644  return Vec128<double, N>{_mm_cvtps_pd(v.raw)};
4645 }
4646 
4647 template <size_t N>
4648 HWY_API Vec128<double, N> PromoteTo(Simd<double, N, 0> /* tag */,
4649  const Vec128<int32_t, N> v) {
4650  return Vec128<double, N>{_mm_cvtepi32_pd(v.raw)};
4651 }
4652 
4653 // ------------------------------ Demotions (full -> part w/ narrow lanes)
4654 
4655 template <size_t N>
4656 HWY_API Vec128<uint16_t, N> DemoteTo(Simd<uint16_t, N, 0> /* tag */,
4657  const Vec128<int32_t, N> v) {
4658 #if HWY_TARGET == HWY_SSSE3
4659  const Simd<int32_t, N, 0> di32;
4660  const Simd<uint16_t, N * 2, 0> du16;
4661  const auto zero_if_neg = AndNot(ShiftRight<31>(v), v);
4662  const auto too_big = VecFromMask(di32, Gt(v, Set(di32, 0xFFFF)));
4663  const auto clamped = Or(zero_if_neg, too_big);
4664  // Lower 2 bytes from each 32-bit lane; same as return type for fewer casts.
4665  alignas(16) constexpr uint16_t kLower2Bytes[16] = {
4666  0x0100, 0x0504, 0x0908, 0x0D0C, 0x8080, 0x8080, 0x8080, 0x8080};
4667  const auto lo2 = Load(du16, kLower2Bytes);
4668  return Vec128<uint16_t, N>{TableLookupBytes(BitCast(du16, clamped), lo2).raw};
4669 #else
4670  return Vec128<uint16_t, N>{_mm_packus_epi32(v.raw, v.raw)};
4671 #endif
4672 }
4673 
4674 template <size_t N>
4675 HWY_API Vec128<int16_t, N> DemoteTo(Simd<int16_t, N, 0> /* tag */,
4676  const Vec128<int32_t, N> v) {
4677  return Vec128<int16_t, N>{_mm_packs_epi32(v.raw, v.raw)};
4678 }
4679 
4680 template <size_t N>
4681 HWY_API Vec128<uint8_t, N> DemoteTo(Simd<uint8_t, N, 0> /* tag */,
4682  const Vec128<int32_t, N> v) {
4683  const __m128i i16 = _mm_packs_epi32(v.raw, v.raw);
4684  return Vec128<uint8_t, N>{_mm_packus_epi16(i16, i16)};
4685 }
4686 
4687 template <size_t N>
4688 HWY_API Vec128<uint8_t, N> DemoteTo(Simd<uint8_t, N, 0> /* tag */,
4689  const Vec128<int16_t, N> v) {
4690  return Vec128<uint8_t, N>{_mm_packus_epi16(v.raw, v.raw)};
4691 }
4692 
4693 template <size_t N>
4694 HWY_API Vec128<int8_t, N> DemoteTo(Simd<int8_t, N, 0> /* tag */,
4695  const Vec128<int32_t, N> v) {
4696  const __m128i i16 = _mm_packs_epi32(v.raw, v.raw);
4697  return Vec128<int8_t, N>{_mm_packs_epi16(i16, i16)};
4698 }
4699 
4700 template <size_t N>
4701 HWY_API Vec128<int8_t, N> DemoteTo(Simd<int8_t, N, 0> /* tag */,
4702  const Vec128<int16_t, N> v) {
4703  return Vec128<int8_t, N>{_mm_packs_epi16(v.raw, v.raw)};
4704 }
4705 
4706 template <size_t N>
4707 HWY_API Vec128<float16_t, N> DemoteTo(Simd<float16_t, N, 0> df16,
4708  const Vec128<float, N> v) {
4709 #if HWY_TARGET >= HWY_SSE4 || defined(HWY_DISABLE_F16C)
4710  const RebindToUnsigned<decltype(df16)> du16;
4711  const Rebind<uint32_t, decltype(df16)> du;
4712  const RebindToSigned<decltype(du)> di;
4713  const auto bits32 = BitCast(du, v);
4714  const auto sign = ShiftRight<31>(bits32);
4715  const auto biased_exp32 = ShiftRight<23>(bits32) & Set(du, 0xFF);
4716  const auto mantissa32 = bits32 & Set(du, 0x7FFFFF);
4717 
4718  const auto k15 = Set(di, 15);
4719  const auto exp = Min(BitCast(di, biased_exp32) - Set(di, 127), k15);
4720  const auto is_tiny = exp < Set(di, -24);
4721 
4722  const auto is_subnormal = exp < Set(di, -14);
4723  const auto biased_exp16 =
4724  BitCast(du, IfThenZeroElse(is_subnormal, exp + k15));
4725  const auto sub_exp = BitCast(du, Set(di, -14) - exp); // [1, 11)
4726  const auto sub_m = (Set(du, 1) << (Set(du, 10) - sub_exp)) +
4727  (mantissa32 >> (Set(du, 13) + sub_exp));
4728  const auto mantissa16 = IfThenElse(RebindMask(du, is_subnormal), sub_m,
4729  ShiftRight<13>(mantissa32)); // <1024
4730 
4731  const auto sign16 = ShiftLeft<15>(sign);
4732  const auto normal16 = sign16 | ShiftLeft<10>(biased_exp16) | mantissa16;
4733  const auto bits16 = IfThenZeroElse(is_tiny, BitCast(di, normal16));
4734  return BitCast(df16, DemoteTo(du16, bits16));
4735 #else
4736  (void)df16;
4737  return Vec128<float16_t, N>{_mm_cvtps_ph(v.raw, _MM_FROUND_NO_EXC)};
4738 #endif
4739 }
4740 
4741 template <size_t N>
4742 HWY_API Vec128<bfloat16_t, N> DemoteTo(Simd<bfloat16_t, N, 0> dbf16,
4743  const Vec128<float, N> v) {
4744  // TODO(janwas): _mm_cvtneps_pbh once we have avx512bf16.
4745  const Rebind<int32_t, decltype(dbf16)> di32;
4746  const Rebind<uint32_t, decltype(dbf16)> du32; // for logical shift right
4747  const Rebind<uint16_t, decltype(dbf16)> du16;
4748  const auto bits_in_32 = BitCast(di32, ShiftRight<16>(BitCast(du32, v)));
4749  return BitCast(dbf16, DemoteTo(du16, bits_in_32));
4750 }
4751 
4752 template <size_t N>
4753 HWY_API Vec128<bfloat16_t, 2 * N> ReorderDemote2To(
4754  Simd<bfloat16_t, 2 * N, 0> dbf16, Vec128<float, N> a, Vec128<float, N> b) {
4755  // TODO(janwas): _mm_cvtne2ps_pbh once we have avx512bf16.
4756  const RebindToUnsigned<decltype(dbf16)> du16;
4757  const Repartition<uint32_t, decltype(dbf16)> du32;
4758  const Vec128<uint32_t, N> b_in_even = ShiftRight<16>(BitCast(du32, b));
4759  return BitCast(dbf16, OddEven(BitCast(du16, a), BitCast(du16, b_in_even)));
4760 }
4761 
4762 template <size_t N>
4764  const Vec128<double, N> v) {
4765  return Vec128<float, N>{_mm_cvtpd_ps(v.raw)};
4766 }
4767 
4768 namespace detail {
4769 
4770 // For well-defined float->int demotion in all x86_*-inl.h.
4771 
4772 template <size_t N>
4774  -> decltype(Zero(d)) {
4775  // The max can be exactly represented in binary64, so clamping beforehand
4776  // prevents x86 conversion from raising an exception and returning 80..00.
4777  return Min(v, Set(d, 2147483647.0));
4778 }
4779 
4780 // For ConvertTo float->int of same size, clamping before conversion would
4781 // change the result because the max integer value is not exactly representable.
4782 // Instead detect the overflow result after conversion and fix it.
4783 template <class DI, class DF = RebindToFloat<DI>>
4785  decltype(Zero(di).raw) converted_raw)
4786  -> VFromD<DI> {
4787  // Combinations of original and output sign:
4788  // --: normal <0 or -huge_val to 80..00: OK
4789  // -+: -0 to 0 : OK
4790  // +-: +huge_val to 80..00 : xor with FF..FF to get 7F..FF
4791  // ++: normal >0 : OK
4792  const auto converted = VFromD<DI>{converted_raw};
4793  const auto sign_wrong = AndNot(BitCast(di, original), converted);
4794 #if HWY_COMPILER_GCC && !HWY_COMPILER_CLANG
4795  // Critical GCC 11 compiler bug (possibly also GCC 10): omits the Xor; also
4796  // Add() if using that instead. Work around with one more instruction.
4797  const RebindToUnsigned<DI> du;
4798  const VFromD<DI> mask = BroadcastSignBit(sign_wrong);
4799  const VFromD<DI> max = BitCast(di, ShiftRight<1>(BitCast(du, mask)));
4800  return IfVecThenElse(mask, max, converted);
4801 #else
4802  return Xor(converted, BroadcastSignBit(sign_wrong));
4803 #endif
4804 }
4805 
4806 } // namespace detail
4807 
4808 template <size_t N>
4809 HWY_API Vec128<int32_t, N> DemoteTo(Simd<int32_t, N, 0> /* tag */,
4810  const Vec128<double, N> v) {
4811  const auto clamped = detail::ClampF64ToI32Max(Simd<double, N, 0>(), v);
4812  return Vec128<int32_t, N>{_mm_cvttpd_epi32(clamped.raw)};
4813 }
4814 
4815 // For already range-limited input [0, 255].
4816 template <size_t N>
4817 HWY_API Vec128<uint8_t, N> U8FromU32(const Vec128<uint32_t, N> v) {
4818  const Simd<uint32_t, N, 0> d32;
4819  const Simd<uint8_t, N * 4, 0> d8;
4820  alignas(16) static constexpr uint32_t k8From32[4] = {
4821  0x0C080400u, 0x0C080400u, 0x0C080400u, 0x0C080400u};
4822  // Also replicate bytes into all 32 bit lanes for safety.
4823  const auto quad = TableLookupBytes(v, Load(d32, k8From32));
4824  return LowerHalf(LowerHalf(BitCast(d8, quad)));
4825 }
4826 
4827 // ------------------------------ Integer <=> fp (ShiftRight, OddEven)
4828 
4829 template <size_t N>
4830 HWY_API Vec128<float, N> ConvertTo(Simd<float, N, 0> /* tag */,
4831  const Vec128<int32_t, N> v) {
4832  return Vec128<float, N>{_mm_cvtepi32_ps(v.raw)};
4833 }
4834 
4835 template <size_t N>
4837  const Vec128<int64_t, N> v) {
4838 #if HWY_TARGET <= HWY_AVX3
4839  (void)dd;
4840  return Vec128<double, N>{_mm_cvtepi64_pd(v.raw)};
4841 #else
4842  // Based on wim's approach (https://stackoverflow.com/questions/41144668/)
4843  const Repartition<uint32_t, decltype(dd)> d32;
4844  const Repartition<uint64_t, decltype(dd)> d64;
4845 
4846  // Toggle MSB of lower 32-bits and insert exponent for 2^84 + 2^63
4847  const auto k84_63 = Set(d64, 0x4530000080000000ULL);
4848  const auto v_upper = BitCast(dd, ShiftRight<32>(BitCast(d64, v)) ^ k84_63);
4849 
4850  // Exponent is 2^52, lower 32 bits from v (=> 32-bit OddEven)
4851  const auto k52 = Set(d32, 0x43300000);
4852  const auto v_lower = BitCast(dd, OddEven(k52, BitCast(d32, v)));
4853 
4854  const auto k84_63_52 = BitCast(dd, Set(d64, 0x4530000080100000ULL));
4855  return (v_upper - k84_63_52) + v_lower; // order matters!
4856 #endif
4857 }
4858 
4859 // Truncates (rounds toward zero).
4860 template <size_t N>
4861 HWY_API Vec128<int32_t, N> ConvertTo(const Simd<int32_t, N, 0> di,
4862  const Vec128<float, N> v) {
4863  return detail::FixConversionOverflow(di, v, _mm_cvttps_epi32(v.raw));
4864 }
4865 
4866 // Full (partial handled below)
4868 #if HWY_TARGET <= HWY_AVX3 && HWY_ARCH_X86_64
4869  return detail::FixConversionOverflow(di, v, _mm_cvttpd_epi64(v.raw));
4870 #elif HWY_ARCH_X86_64
4871  const __m128i i0 = _mm_cvtsi64_si128(_mm_cvttsd_si64(v.raw));
4872  const Half<Full128<double>> dd2;
4873  const __m128i i1 = _mm_cvtsi64_si128(_mm_cvttsd_si64(UpperHalf(dd2, v).raw));
4874  return detail::FixConversionOverflow(di, v, _mm_unpacklo_epi64(i0, i1));
4875 #else
4876  using VI = VFromD<decltype(di)>;
4877  const VI k0 = Zero(di);
4878  const VI k1 = Set(di, 1);
4879  const VI k51 = Set(di, 51);
4880 
4881  // Exponent indicates whether the number can be represented as int64_t.
4882  const VI biased_exp = ShiftRight<52>(BitCast(di, v)) & Set(di, 0x7FF);
4883  const VI exp = biased_exp - Set(di, 0x3FF);
4884  const auto in_range = exp < Set(di, 63);
4885 
4886  // If we were to cap the exponent at 51 and add 2^52, the number would be in
4887  // [2^52, 2^53) and mantissa bits could be read out directly. We need to
4888  // round-to-0 (truncate), but changing rounding mode in MXCSR hits a
4889  // compiler reordering bug: https://gcc.godbolt.org/z/4hKj6c6qc . We instead
4890  // manually shift the mantissa into place (we already have many of the
4891  // inputs anyway).
4892  const VI shift_mnt = Max(k51 - exp, k0);
4893  const VI shift_int = Max(exp - k51, k0);
4894  const VI mantissa = BitCast(di, v) & Set(di, (1ULL << 52) - 1);
4895  // Include implicit 1-bit; shift by one more to ensure it's in the mantissa.
4896  const VI int52 = (mantissa | Set(di, 1ULL << 52)) >> (shift_mnt + k1);
4897  // For inputs larger than 2^52, insert zeros at the bottom.
4898  const VI shifted = int52 << shift_int;
4899  // Restore the one bit lost when shifting in the implicit 1-bit.
4900  const VI restored = shifted | ((mantissa & k1) << (shift_int - k1));
4901 
4902  // Saturate to LimitsMin (unchanged when negating below) or LimitsMax.
4903  const VI sign_mask = BroadcastSignBit(BitCast(di, v));
4904  const VI limit = Set(di, LimitsMax<int64_t>()) - sign_mask;
4905  const VI magnitude = IfThenElse(in_range, restored, limit);
4906 
4907  // If the input was negative, negate the integer (two's complement).
4908  return (magnitude ^ sign_mask) - sign_mask;
4909 #endif
4910 }
4912  // Only need to specialize for non-AVX3, 64-bit (single scalar op)
4913 #if HWY_TARGET > HWY_AVX3 && HWY_ARCH_X86_64
4914  const Vec64<int64_t> i0{_mm_cvtsi64_si128(_mm_cvttsd_si64(v.raw))};
4915  return detail::FixConversionOverflow(di, v, i0.raw);
4916 #else
4917  (void)di;
4918  const auto full = ConvertTo(Full128<int64_t>(), Vec128<double>{v.raw});
4919  return Vec64<int64_t>{full.raw};
4920 #endif
4921 }
4922 
4923 template <size_t N>
4924 HWY_API Vec128<int32_t, N> NearestInt(const Vec128<float, N> v) {
4925  const Simd<int32_t, N, 0> di;
4926  return detail::FixConversionOverflow(di, v, _mm_cvtps_epi32(v.raw));
4927 }
4928 
4929 // ------------------------------ Floating-point rounding (ConvertTo)
4930 
4931 #if HWY_TARGET == HWY_SSSE3
4932 
4933 // Toward nearest integer, ties to even
4934 template <typename T, size_t N, HWY_IF_FLOAT(T)>
4936  // Rely on rounding after addition with a large value such that no mantissa
4937  // bits remain (assuming the current mode is nearest-even). We may need a
4938  // compiler flag for precise floating-point to prevent "optimizing" this out.
4939  const Simd<T, N, 0> df;
4940  const auto max = Set(df, MantissaEnd<T>());
4941  const auto large = CopySignToAbs(max, v);
4942  const auto added = large + v;
4943  const auto rounded = added - large;
4944  // Keep original if NaN or the magnitude is large (already an int).
4945  return IfThenElse(Abs(v) < max, rounded, v);
4946 }
4947 
4948 namespace detail {
4949 
4950 // Truncating to integer and converting back to float is correct except when the
4951 // input magnitude is large, in which case the input was already an integer
4952 // (because mantissa >> exponent is zero).
4953 template <typename T, size_t N, HWY_IF_FLOAT(T)>
4955  return Abs(v) < Set(Simd<T, N, 0>(), MantissaEnd<T>());
4956 }
4957 
4958 } // namespace detail
4959 
4960 // Toward zero, aka truncate
4961 template <typename T, size_t N, HWY_IF_FLOAT(T)>
4963  const Simd<T, N, 0> df;
4964  const RebindToSigned<decltype(df)> di;
4965 
4966  const auto integer = ConvertTo(di, v); // round toward 0
4967  const auto int_f = ConvertTo(df, integer);
4968 
4969  return IfThenElse(detail::UseInt(v), CopySign(int_f, v), v);
4970 }
4971 
4972 // Toward +infinity, aka ceiling
4973 template <typename T, size_t N, HWY_IF_FLOAT(T)>
4975  const Simd<T, N, 0> df;
4976  const RebindToSigned<decltype(df)> di;
4977 
4978  const auto integer = ConvertTo(di, v); // round toward 0
4979  const auto int_f = ConvertTo(df, integer);
4980 
4981  // Truncating a positive non-integer ends up smaller; if so, add 1.
4982  const auto neg1 = ConvertTo(df, VecFromMask(di, RebindMask(di, int_f < v)));
4983 
4984  return IfThenElse(detail::UseInt(v), int_f - neg1, v);
4985 }
4986 
4987 // Toward -infinity, aka floor
4988 template <typename T, size_t N, HWY_IF_FLOAT(T)>
4990  const Simd<T, N, 0> df;
4991  const RebindToSigned<decltype(df)> di;
4992 
4993  const auto integer = ConvertTo(di, v); // round toward 0
4994  const auto int_f = ConvertTo(df, integer);
4995 
4996  // Truncating a negative non-integer ends up larger; if so, subtract 1.
4997  const auto neg1 = ConvertTo(df, VecFromMask(di, RebindMask(di, int_f > v)));
4998 
4999  return IfThenElse(detail::UseInt(v), int_f + neg1, v);
5000 }
5001 
5002 #else
5003 
5004 // Toward nearest integer, ties to even
5005 template <size_t N>
5006 HWY_API Vec128<float, N> Round(const Vec128<float, N> v) {
5007  return Vec128<float, N>{
5008  _mm_round_ps(v.raw, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)};
5009 }
5010 template <size_t N>
5011 HWY_API Vec128<double, N> Round(const Vec128<double, N> v) {
5012  return Vec128<double, N>{
5013  _mm_round_pd(v.raw, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)};
5014 }
5015 
5016 // Toward zero, aka truncate
5017 template <size_t N>
5018 HWY_API Vec128<float, N> Trunc(const Vec128<float, N> v) {
5019  return Vec128<float, N>{
5020  _mm_round_ps(v.raw, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)};
5021 }
5022 template <size_t N>
5023 HWY_API Vec128<double, N> Trunc(const Vec128<double, N> v) {
5024  return Vec128<double, N>{
5025  _mm_round_pd(v.raw, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)};
5026 }
5027 
5028 // Toward +infinity, aka ceiling
5029 template <size_t N>
5030 HWY_API Vec128<float, N> Ceil(const Vec128<float, N> v) {
5031  return Vec128<float, N>{
5032  _mm_round_ps(v.raw, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)};
5033 }
5034 template <size_t N>
5035 HWY_API Vec128<double, N> Ceil(const Vec128<double, N> v) {
5036  return Vec128<double, N>{
5037  _mm_round_pd(v.raw, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)};
5038 }
5039 
5040 // Toward -infinity, aka floor
5041 template <size_t N>
5042 HWY_API Vec128<float, N> Floor(const Vec128<float, N> v) {
5043  return Vec128<float, N>{
5044  _mm_round_ps(v.raw, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)};
5045 }
5046 template <size_t N>
5047 HWY_API Vec128<double, N> Floor(const Vec128<double, N> v) {
5048  return Vec128<double, N>{
5049  _mm_round_pd(v.raw, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)};
5050 }
5051 
5052 #endif // !HWY_SSSE3
5053 
5054 // ================================================== CRYPTO
5055 
5056 #if !defined(HWY_DISABLE_PCLMUL_AES) && HWY_TARGET != HWY_SSSE3
5057 
5058 // Per-target flag to prevent generic_ops-inl.h from defining AESRound.
5059 #ifdef HWY_NATIVE_AES
5060 #undef HWY_NATIVE_AES
5061 #else
5062 #define HWY_NATIVE_AES
5063 #endif
5064 
5065 HWY_API Vec128<uint8_t> AESRound(Vec128<uint8_t> state,
5066  Vec128<uint8_t> round_key) {
5067  return Vec128<uint8_t>{_mm_aesenc_si128(state.raw, round_key.raw)};
5068 }
5069 
5070 HWY_API Vec128<uint8_t> AESLastRound(Vec128<uint8_t> state,
5071  Vec128<uint8_t> round_key) {
5072  return Vec128<uint8_t>{_mm_aesenclast_si128(state.raw, round_key.raw)};
5073 }
5074 
5075 template <size_t N, HWY_IF_LE128(uint64_t, N)>
5076 HWY_API Vec128<uint64_t, N> CLMulLower(Vec128<uint64_t, N> a,
5077  Vec128<uint64_t, N> b) {
5078  return Vec128<uint64_t, N>{_mm_clmulepi64_si128(a.raw, b.raw, 0x00)};
5079 }
5080 
5081 template <size_t N, HWY_IF_LE128(uint64_t, N)>
5082 HWY_API Vec128<uint64_t, N> CLMulUpper(Vec128<uint64_t, N> a,
5083  Vec128<uint64_t, N> b) {
5084  return Vec128<uint64_t, N>{_mm_clmulepi64_si128(a.raw, b.raw, 0x11)};
5085 }
5086 
5087 #endif // !defined(HWY_DISABLE_PCLMUL_AES) && HWY_TARGET != HWY_SSSE3
5088 
5089 // ================================================== MISC
5090 
5091 template <typename T>
5092 struct CompressIsPartition {
5093 #if HWY_TARGET <= HWY_AVX3
5094  // AVX3 supports native compress, but a table-based approach allows
5095  // 'partitioning' (also moving mask=false lanes to the top), which helps
5096  // vqsort. This is only feasible for eight or less lanes, i.e. sizeof(T) == 8
5097  // on AVX3. For simplicity, we only use tables for 64-bit lanes (not AVX3
5098  // u32x8 etc.).
5099  enum { value = (sizeof(T) == 8) };
5100 #else
5101  enum { value = 1 };
5102 #endif
5103 };
5104 
5105 #if HWY_TARGET <= HWY_AVX3
5106 
5107 // ------------------------------ LoadMaskBits
5108 
5109 // `p` points to at least 8 readable bytes, not all of which need be valid.
5110 template <typename T, size_t N, HWY_IF_LE128(T, N)>
5111 HWY_API Mask128<T, N> LoadMaskBits(Simd<T, N, 0> /* tag */,
5112  const uint8_t* HWY_RESTRICT bits) {
5113  uint64_t mask_bits = 0;
5114  constexpr size_t kNumBytes = (N + 7) / 8;
5115  CopyBytes<kNumBytes>(bits, &mask_bits);
5116  if (N < 8) {
5117  mask_bits &= (1ull << N) - 1;
5118  }
5119 
5120  return Mask128<T, N>::FromBits(mask_bits);
5121 }
5122 
5123 // ------------------------------ StoreMaskBits
5124 
5125 // `p` points to at least 8 writable bytes.
5126 template <typename T, size_t N>
5127 HWY_API size_t StoreMaskBits(const Simd<T, N, 0> /* tag */,
5128  const Mask128<T, N> mask, uint8_t* bits) {
5129  constexpr size_t kNumBytes = (N + 7) / 8;
5130  CopyBytes<kNumBytes>(&mask.raw, bits);
5131 
5132  // Non-full byte, need to clear the undefined upper bits.
5133  if (N < 8) {
5134  const int mask = (1 << N) - 1;
5135  bits[0] = static_cast<uint8_t>(bits[0] & mask);
5136  }
5137 
5138  return kNumBytes;
5139 }
5140 
5141 // ------------------------------ Mask testing
5142 
5143 // Beware: the suffix indicates the number of mask bits, not lane size!
5144 
5145 template <typename T, size_t N>
5146 HWY_API size_t CountTrue(const Simd<T, N, 0> /* tag */,
5147  const Mask128<T, N> mask) {
5148  const uint64_t mask_bits = static_cast<uint64_t>(mask.raw) & ((1u << N) - 1);
5149  return PopCount(mask_bits);
5150 }
5151 
5152 template <typename T, size_t N>
5153 HWY_API intptr_t FindFirstTrue(const Simd<T, N, 0> /* tag */,
5154  const Mask128<T, N> mask) {
5155  const uint32_t mask_bits = static_cast<uint32_t>(mask.raw) & ((1u << N) - 1);
5156  return mask.raw ? intptr_t(Num0BitsBelowLS1Bit_Nonzero32(mask_bits)) : -1;
5157 }
5158 
5159 template <typename T, size_t N>
5160 HWY_API bool AllFalse(const Simd<T, N, 0> /* tag */, const Mask128<T, N> mask) {
5161  const uint64_t mask_bits = static_cast<uint64_t>(mask.raw) & ((1u << N) - 1);
5162  return mask_bits == 0;
5163 }
5164 
5165 template <typename T, size_t N>
5166 HWY_API bool AllTrue(const Simd<T, N, 0> /* tag */, const Mask128<T, N> mask) {
5167  const uint64_t mask_bits = static_cast<uint64_t>(mask.raw) & ((1u << N) - 1);
5168  // Cannot use _kortestc because we may have less than 8 mask bits.
5169  return mask_bits == (1u << N) - 1;
5170 }
5171 
5172 // ------------------------------ Compress
5173 
5174 #if HWY_TARGET != HWY_AVX3_DL
5175 namespace detail {
5176 
5177 // Returns permutevar_epi16 indices for 16-bit Compress. Also used by x86_256.
5178 HWY_INLINE Vec128<uint16_t> IndicesForCompress16(uint64_t mask_bits) {
5179  Full128<uint16_t> du16;
5180  // Table of u16 indices packed into bytes to reduce L1 usage. Will be unpacked
5181  // to u16. Ideally we would broadcast 8*3 (half of the 8 bytes currently used)
5182  // bits into each lane and then varshift, but that does not fit in 16 bits.
5183  Rebind<uint8_t, decltype(du16)> du8;
5184  alignas(16) constexpr uint8_t tbl[2048] = {
5185  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
5186  1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 1, 2,
5187  0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0,
5188  0, 0, 0, 0, 0, 1, 3, 0, 0, 0, 0, 0, 0, 0, 1, 3, 0, 0, 0, 0, 0, 2, 3, 0, 0,
5189  0, 0, 0, 0, 0, 2, 3, 0, 0, 0, 0, 0, 1, 2, 3, 0, 0, 0, 0, 0, 0, 1, 2, 3, 0,
5190  0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 1, 4, 0, 0, 0, 0,
5191  0, 0, 0, 1, 4, 0, 0, 0, 0, 0, 2, 4, 0, 0, 0, 0, 0, 0, 0, 2, 4, 0, 0, 0, 0,
5192  0, 1, 2, 4, 0, 0, 0, 0, 0, 0, 1, 2, 4, 0, 0, 0, 0, 3, 4, 0, 0, 0, 0, 0, 0,
5193  0, 3, 4, 0, 0, 0, 0, 0, 1, 3, 4, 0, 0, 0, 0, 0, 0, 1, 3, 4, 0, 0, 0, 0, 2,
5194  3, 4, 0, 0, 0, 0, 0, 0, 2, 3, 4, 0, 0, 0, 0, 1, 2, 3, 4, 0, 0, 0, 0, 0, 1,
5195  2, 3, 4, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 1, 5, 0,
5196  0, 0, 0, 0, 0, 0, 1, 5, 0, 0, 0, 0, 0, 2, 5, 0, 0, 0, 0, 0, 0, 0, 2, 5, 0,
5197  0, 0, 0, 0, 1, 2, 5, 0, 0, 0, 0, 0, 0, 1, 2, 5, 0, 0, 0, 0, 3, 5, 0, 0, 0,
5198  0, 0, 0, 0, 3, 5, 0, 0, 0, 0, 0, 1, 3, 5, 0, 0, 0, 0, 0, 0, 1, 3, 5, 0, 0,
5199  0, 0, 2, 3, 5, 0, 0, 0, 0, 0, 0, 2, 3, 5, 0, 0, 0, 0, 1, 2, 3, 5, 0, 0, 0,
5200  0, 0, 1, 2, 3, 5, 0, 0, 0, 4, 5, 0, 0, 0, 0, 0, 0, 0, 4, 5, 0, 0, 0, 0, 0,
5201  1, 4, 5, 0, 0, 0, 0, 0, 0, 1, 4, 5, 0, 0, 0, 0, 2, 4, 5, 0, 0, 0, 0, 0, 0,
5202  2, 4, 5, 0, 0, 0, 0, 1, 2, 4, 5, 0, 0, 0, 0, 0, 1, 2, 4, 5, 0, 0, 0, 3, 4,
5203  5, 0, 0, 0, 0, 0, 0, 3, 4, 5, 0, 0, 0, 0, 1, 3, 4, 5, 0, 0, 0, 0, 0, 1, 3,
5204  4, 5, 0, 0, 0, 2, 3, 4, 5, 0, 0, 0, 0, 0, 2, 3, 4, 5, 0, 0, 0, 1, 2, 3, 4,
5205  5, 0, 0, 0, 0, 1, 2, 3, 4, 5, 0, 0, 6, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 0, 0,
5206  0, 0, 0, 1, 6, 0, 0, 0, 0, 0, 0, 0, 1, 6, 0, 0, 0, 0, 0, 2, 6, 0, 0, 0, 0,
5207  0, 0, 0, 2, 6, 0, 0, 0, 0, 0, 1, 2, 6, 0, 0, 0, 0, 0, 0, 1, 2, 6, 0, 0, 0,
5208  0, 3, 6, 0, 0, 0, 0, 0, 0, 0, 3, 6, 0, 0, 0, 0, 0, 1, 3, 6, 0, 0, 0, 0, 0,
5209  0, 1, 3, 6, 0, 0, 0, 0, 2, 3, 6, 0, 0, 0, 0, 0, 0, 2, 3, 6, 0, 0, 0, 0, 1,
5210  2, 3, 6, 0, 0, 0, 0, 0, 1, 2, 3, 6, 0, 0, 0, 4, 6, 0, 0, 0, 0, 0, 0, 0, 4,
5211  6, 0, 0, 0, 0, 0, 1, 4, 6, 0, 0, 0, 0, 0, 0, 1, 4, 6, 0, 0, 0, 0, 2, 4, 6,
5212  0, 0, 0, 0, 0, 0, 2, 4, 6, 0, 0, 0, 0, 1, 2, 4, 6, 0, 0, 0, 0, 0, 1, 2, 4,
5213  6, 0, 0, 0, 3, 4, 6, 0, 0, 0, 0, 0, 0, 3, 4, 6, 0, 0, 0, 0, 1, 3, 4, 6, 0,
5214  0, 0, 0, 0, 1, 3, 4, 6, 0, 0, 0, 2, 3, 4, 6, 0, 0, 0, 0, 0, 2, 3, 4, 6, 0,
5215  0, 0, 1, 2, 3, 4, 6, 0, 0, 0, 0, 1, 2, 3, 4, 6, 0, 0, 5, 6, 0, 0, 0, 0, 0,
5216  0, 0, 5, 6, 0, 0, 0, 0, 0, 1, 5, 6, 0, 0, 0, 0, 0, 0, 1, 5, 6, 0, 0, 0, 0,
5217  2, 5, 6, 0, 0, 0, 0, 0, 0, 2, 5, 6, 0, 0, 0, 0, 1, 2, 5, 6, 0, 0, 0, 0, 0,
5218  1, 2, 5, 6, 0, 0, 0, 3, 5, 6, 0, 0, 0, 0, 0, 0, 3, 5, 6, 0, 0, 0, 0, 1, 3,
5219  5, 6, 0, 0, 0, 0, 0, 1, 3, 5, 6, 0, 0, 0, 2, 3, 5, 6, 0, 0, 0, 0, 0, 2, 3,
5220  5, 6, 0, 0, 0, 1, 2, 3, 5, 6, 0, 0, 0, 0, 1, 2, 3, 5, 6, 0, 0, 4, 5, 6, 0,
5221  0, 0, 0, 0, 0, 4, 5, 6, 0, 0, 0, 0, 1, 4, 5, 6, 0, 0, 0, 0, 0, 1, 4, 5, 6,
5222  0, 0, 0, 2, 4, 5, 6, 0, 0, 0, 0, 0, 2, 4, 5, 6, 0, 0, 0, 1, 2, 4, 5, 6, 0,
5223  0, 0, 0, 1, 2, 4, 5, 6, 0, 0, 3, 4, 5, 6, 0, 0, 0, 0, 0, 3, 4, 5, 6, 0, 0,
5224  0, 1, 3, 4, 5, 6, 0, 0, 0, 0, 1, 3, 4, 5, 6, 0, 0, 2, 3, 4, 5, 6, 0, 0, 0,
5225  0, 2, 3, 4, 5, 6, 0, 0, 1, 2, 3, 4, 5, 6, 0, 0, 0, 1, 2, 3, 4, 5, 6, 0, 7,
5226  0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 1, 7, 0, 0, 0, 0, 0, 0, 0, 1,
5227  7, 0, 0, 0, 0, 0, 2, 7, 0, 0, 0, 0, 0, 0, 0, 2, 7, 0, 0, 0, 0, 0, 1, 2, 7,
5228  0, 0, 0, 0, 0, 0, 1, 2, 7, 0, 0, 0, 0, 3, 7, 0, 0, 0, 0, 0, 0, 0, 3, 7, 0,
5229  0, 0, 0, 0, 1, 3, 7, 0, 0, 0, 0, 0, 0, 1, 3, 7, 0, 0, 0, 0, 2, 3, 7, 0, 0,
5230  0, 0, 0, 0, 2, 3, 7, 0, 0, 0, 0, 1, 2, 3, 7, 0, 0, 0, 0, 0, 1, 2, 3, 7, 0,
5231  0, 0, 4, 7, 0, 0, 0, 0, 0, 0, 0, 4, 7, 0, 0, 0, 0, 0, 1, 4, 7, 0, 0, 0, 0,
5232  0, 0, 1, 4, 7, 0, 0, 0, 0, 2, 4, 7, 0, 0, 0, 0, 0, 0, 2, 4, 7, 0, 0, 0, 0,
5233  1, 2, 4, 7, 0, 0, 0, 0, 0, 1, 2, 4, 7, 0, 0, 0, 3, 4, 7, 0, 0, 0, 0, 0, 0,
5234  3, 4, 7, 0, 0, 0, 0, 1, 3, 4, 7, 0, 0, 0, 0, 0, 1, 3, 4, 7, 0, 0, 0, 2, 3,
5235  4, 7, 0, 0, 0, 0, 0, 2, 3, 4, 7, 0, 0, 0, 1, 2, 3, 4, 7, 0, 0, 0, 0, 1, 2,
5236  3, 4, 7, 0, 0, 5, 7, 0, 0, 0, 0, 0, 0, 0, 5, 7, 0, 0, 0, 0, 0, 1, 5, 7, 0,
5237  0, 0, 0, 0, 0, 1, 5, 7, 0, 0, 0, 0, 2, 5, 7, 0, 0, 0, 0, 0, 0, 2, 5, 7, 0,
5238  0, 0, 0, 1, 2, 5, 7, 0, 0, 0, 0, 0, 1, 2, 5, 7, 0, 0, 0, 3, 5, 7, 0, 0, 0,
5239  0, 0, 0, 3, 5, 7, 0, 0, 0, 0, 1, 3, 5, 7, 0, 0, 0, 0, 0, 1, 3, 5, 7, 0, 0,
5240  0, 2, 3, 5, 7, 0, 0, 0, 0, 0, 2, 3, 5, 7, 0, 0, 0, 1, 2, 3, 5, 7, 0, 0, 0,
5241  0, 1, 2, 3, 5, 7, 0, 0, 4, 5, 7, 0, 0, 0, 0, 0, 0, 4, 5, 7, 0, 0, 0, 0, 1,
5242  4, 5, 7, 0, 0, 0, 0, 0, 1, 4, 5, 7, 0, 0, 0, 2, 4, 5, 7, 0, 0, 0, 0, 0, 2,
5243  4, 5, 7, 0, 0, 0, 1, 2, 4, 5, 7, 0, 0, 0, 0, 1, 2, 4, 5, 7, 0, 0, 3, 4, 5,
5244  7, 0, 0, 0, 0, 0, 3, 4, 5, 7, 0, 0, 0, 1, 3, 4, 5, 7, 0, 0, 0, 0, 1, 3, 4,
5245  5, 7, 0, 0, 2, 3, 4, 5, 7, 0, 0, 0, 0, 2, 3, 4, 5, 7, 0, 0, 1, 2, 3, 4, 5,
5246  7, 0, 0, 0, 1, 2, 3, 4, 5, 7, 0, 6, 7, 0, 0, 0, 0, 0, 0, 0, 6, 7, 0, 0, 0,
5247  0, 0, 1, 6, 7, 0, 0, 0, 0, 0, 0, 1, 6, 7, 0, 0, 0, 0, 2, 6, 7, 0, 0, 0, 0,
5248  0, 0, 2, 6, 7, 0, 0, 0, 0, 1, 2, 6, 7, 0, 0, 0, 0, 0, 1, 2, 6, 7, 0, 0, 0,
5249  3, 6, 7, 0, 0, 0, 0, 0, 0, 3, 6, 7, 0, 0, 0, 0, 1, 3, 6, 7, 0, 0, 0, 0, 0,
5250  1, 3, 6, 7, 0, 0, 0, 2, 3, 6, 7, 0, 0, 0, 0, 0, 2, 3, 6, 7, 0, 0, 0, 1, 2,
5251  3, 6, 7, 0, 0, 0, 0, 1, 2, 3, 6, 7, 0, 0, 4, 6, 7, 0, 0, 0, 0, 0, 0, 4, 6,
5252  7, 0, 0, 0, 0, 1, 4, 6, 7, 0, 0, 0, 0, 0, 1, 4, 6, 7, 0, 0, 0, 2, 4, 6, 7,
5253  0, 0, 0, 0, 0, 2, 4, 6, 7, 0, 0, 0, 1, 2, 4, 6, 7, 0, 0, 0, 0, 1, 2, 4, 6,
5254  7, 0, 0, 3, 4, 6, 7, 0, 0, 0, 0, 0, 3, 4, 6, 7, 0, 0, 0, 1, 3, 4, 6, 7, 0,
5255  0, 0, 0, 1, 3, 4, 6, 7, 0, 0, 2, 3, 4, 6, 7, 0, 0, 0, 0, 2, 3, 4, 6, 7, 0,
5256  0, 1, 2, 3, 4, 6, 7, 0, 0, 0, 1, 2, 3, 4, 6, 7, 0, 5, 6, 7, 0, 0, 0, 0, 0,
5257  0, 5, 6, 7, 0, 0, 0, 0, 1, 5, 6, 7, 0, 0, 0, 0, 0, 1, 5, 6, 7, 0, 0, 0, 2,
5258  5, 6, 7, 0, 0, 0, 0, 0, 2, 5, 6, 7, 0, 0, 0, 1, 2, 5, 6, 7, 0, 0, 0, 0, 1,
5259  2, 5, 6, 7, 0, 0, 3, 5, 6, 7, 0, 0, 0, 0, 0, 3, 5, 6, 7, 0, 0, 0, 1, 3, 5,
5260  6, 7, 0, 0, 0, 0, 1, 3, 5, 6, 7, 0, 0, 2, 3, 5, 6, 7, 0, 0, 0, 0, 2, 3, 5,
5261  6, 7, 0, 0, 1, 2, 3, 5, 6, 7, 0, 0, 0, 1, 2, 3, 5, 6, 7, 0, 4, 5, 6, 7, 0,
5262  0, 0, 0, 0, 4, 5, 6, 7, 0, 0, 0, 1, 4, 5, 6, 7, 0, 0, 0, 0, 1, 4, 5, 6, 7,
5263  0, 0, 2, 4, 5, 6, 7, 0, 0, 0, 0, 2, 4, 5, 6, 7, 0, 0, 1, 2, 4, 5, 6, 7, 0,
5264  0, 0, 1, 2, 4, 5, 6, 7, 0, 3, 4, 5, 6, 7, 0, 0, 0, 0, 3, 4, 5, 6, 7, 0, 0,
5265  1, 3, 4, 5, 6, 7, 0, 0, 0, 1, 3, 4, 5, 6, 7, 0, 2, 3, 4, 5, 6, 7, 0, 0, 0,
5266  2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 0, 1, 2, 3, 4, 5, 6, 7};
5267  return PromoteTo(du16, Load(du8, tbl + mask_bits * 8));
5268 }
5269 
5270 } // namespace detail
5271 #endif // HWY_TARGET != HWY_AVX3_DL
5272 
5273 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
5275  const Simd<T, N, 0> d;
5276  const Rebind<uint16_t, decltype(d)> du;
5277  const auto vu = BitCast(du, v); // (required for float16_t inputs)
5278 
5279 #if HWY_TARGET == HWY_AVX3_DL // VBMI2
5280  const Vec128<uint16_t, N> cu{_mm_maskz_compress_epi16(mask.raw, vu.raw)};
5281 #else
5282  const auto idx = detail::IndicesForCompress16(uint64_t{mask.raw});
5283  const Vec128<uint16_t, N> cu{_mm_permutexvar_epi16(idx.raw, vu.raw)};
5284 #endif // HWY_TARGET != HWY_AVX3_DL
5285  return BitCast(d, cu);
5286 }
5287 
5288 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
5289 HWY_API Vec128<T, N> Compress(Vec128<T, N> v, Mask128<T, N> mask) {
5290  return Vec128<T, N>{_mm_maskz_compress_epi32(mask.raw, v.raw)};
5291 }
5292 
5293 template <size_t N>
5295  return Vec128<float, N>{_mm_maskz_compress_ps(mask.raw, v.raw)};
5296 }
5297 
5298 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
5299 HWY_API Vec128<T, N> Compress(Vec128<T, N> v, Mask128<T, N> mask) {
5300  HWY_DASSERT(mask.raw < 4);
5301 
5302  // There are only 2 lanes, so we can afford to load the index vector directly.
5303  alignas(16) constexpr uint8_t packed_array[64] = {
5304  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
5305  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
5306  8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7,
5307  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
5308 
5309  const Simd<T, N, 0> d;
5310  const Repartition<uint8_t, decltype(d)> d8;
5311  const auto index = Load(d8, packed_array + 16 * mask.raw);
5312  return BitCast(d, TableLookupBytes(BitCast(d8, v), index));
5313 }
5314 
5315 // ------------------------------ CompressBits (LoadMaskBits)
5316 
5317 template <typename T, size_t N>
5318 HWY_API Vec128<T, N> CompressBits(Vec128<T, N> v,
5319  const uint8_t* HWY_RESTRICT bits) {
5320  return Compress(v, LoadMaskBits(Simd<T, N, 0>(), bits));
5321 }
5322 
5323 // ------------------------------ CompressStore
5324 
5325 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
5327  Simd<T, N, 0> d, T* HWY_RESTRICT unaligned) {
5328  const Rebind<uint16_t, decltype(d)> du;
5329  const auto vu = BitCast(du, v); // (required for float16_t inputs)
5330 
5331  const uint64_t mask_bits{mask.raw};
5332 
5333 #if HWY_TARGET == HWY_AVX3_DL // VBMI2
5334  _mm_mask_compressstoreu_epi16(unaligned, mask.raw, vu.raw);
5335 #else
5336  const auto idx = detail::IndicesForCompress16(mask_bits);
5337  const Vec128<uint16_t, N> cu{_mm_permutexvar_epi16(idx.raw, vu.raw)};
5338  StoreU(BitCast(d, cu), d, unaligned);
5339 #endif // HWY_TARGET == HWY_AVX3_DL
5340 
5341  const size_t count = PopCount(mask_bits & ((1ull << N) - 1));
5342  // Workaround: as of 2022-02-23 MSAN does not mark the output as initialized.
5343 #if HWY_IS_MSAN
5344  __msan_unpoison(unaligned, count * sizeof(T));
5345 #endif
5346  return count;
5347 }
5348 
5349 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
5350 HWY_API size_t CompressStore(Vec128<T, N> v, Mask128<T, N> mask,
5351  Simd<T, N, 0> /* tag */,
5352  T* HWY_RESTRICT unaligned) {
5353  _mm_mask_compressstoreu_epi32(unaligned, mask.raw, v.raw);
5354  const size_t count = PopCount(uint64_t{mask.raw} & ((1ull << N) - 1));
5355  // Workaround: as of 2022-02-23 MSAN does not mark the output as initialized.
5356 #if HWY_IS_MSAN
5357  __msan_unpoison(unaligned, count * sizeof(T));
5358 #endif
5359  return count;
5360 }
5361 
5362 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
5363 HWY_API size_t CompressStore(Vec128<T, N> v, Mask128<T, N> mask,
5364  Simd<T, N, 0> /* tag */,
5365  T* HWY_RESTRICT unaligned) {
5366  _mm_mask_compressstoreu_epi64(unaligned, mask.raw, v.raw);
5367  const size_t count = PopCount(uint64_t{mask.raw} & ((1ull << N) - 1));
5368  // Workaround: as of 2022-02-23 MSAN does not mark the output as initialized.
5369 #if HWY_IS_MSAN
5370  __msan_unpoison(unaligned, count * sizeof(T));
5371 #endif
5372  return count;
5373 }
5374 
5375 template <size_t N, HWY_IF_LE128(float, N)>
5377  Simd<float, N, 0> /* tag */,
5378  float* HWY_RESTRICT unaligned) {
5379  _mm_mask_compressstoreu_ps(unaligned, mask.raw, v.raw);
5380  const size_t count = PopCount(uint64_t{mask.raw} & ((1ull << N) - 1));
5381  // Workaround: as of 2022-02-23 MSAN does not mark the output as initialized.
5382 #if HWY_IS_MSAN
5383  __msan_unpoison(unaligned, count * sizeof(float));
5384 #endif
5385  return count;
5386 }
5387 
5388 template <size_t N, HWY_IF_LE128(double, N)>
5390  Simd<double, N, 0> /* tag */,
5391  double* HWY_RESTRICT unaligned) {
5392  _mm_mask_compressstoreu_pd(unaligned, mask.raw, v.raw);
5393  const size_t count = PopCount(uint64_t{mask.raw} & ((1ull << N) - 1));
5394  // Workaround: as of 2022-02-23 MSAN does not mark the output as initialized.
5395 #if HWY_IS_MSAN
5396  __msan_unpoison(unaligned, count * sizeof(double));
5397 #endif
5398  return count;
5399 }
5400 
5401 // ------------------------------ CompressBlendedStore (CompressStore)
5402 template <typename T, size_t N>
5403 HWY_API size_t CompressBlendedStore(Vec128<T, N> v, Mask128<T, N> m,
5404  Simd<T, N, 0> d,
5405  T* HWY_RESTRICT unaligned) {
5406  // AVX-512 already does the blending at no extra cost (latency 11,
5407  // rthroughput 2 - same as compress plus store).
5408  if (HWY_TARGET == HWY_AVX3_DL || sizeof(T) != 2) {
5409  // We're relying on the mask to blend. Clear the undefined upper bits.
5410  if (N != 16 / sizeof(T)) {
5411  m = And(m, FirstN(d, N));
5412  }
5413  return CompressStore(v, m, d, unaligned);
5414  } else {
5415  const size_t count = CountTrue(d, m);
5416  const Vec128<T, N> compressed = Compress(v, m);
5417 #if HWY_MEM_OPS_MIGHT_FAULT
5418  // BlendedStore tests mask for each lane, but we know that the mask is
5419  // FirstN, so we can just copy.
5420  alignas(16) T buf[N];
5421  Store(compressed, d, buf);
5422  memcpy(unaligned, buf, count * sizeof(T));
5423 #else
5424  BlendedStore(compressed, FirstN(d, count), d, unaligned);
5425 #endif
5426  // Workaround: as of 2022-02-23 MSAN does not mark the output as
5427  // initialized.
5428 #if HWY_IS_MSAN
5429  __msan_unpoison(unaligned, count * sizeof(T));
5430 #endif
5431  return count;
5432  }
5433 }
5434 
5435 // ------------------------------ CompressBitsStore (LoadMaskBits)
5436 
5437 template <typename T, size_t N>
5438 HWY_API size_t CompressBitsStore(Vec128<T, N> v,
5439  const uint8_t* HWY_RESTRICT bits,
5440  Simd<T, N, 0> d, T* HWY_RESTRICT unaligned) {
5441  return CompressStore(v, LoadMaskBits(d, bits), d, unaligned);
5442 }
5443 
5444 #else // AVX2 or below
5445 
5446 // ------------------------------ LoadMaskBits (TestBit)
5447 
5448 namespace detail {
5449 
5450 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
5451 HWY_INLINE Mask128<T, N> LoadMaskBits(Simd<T, N, 0> d, uint64_t mask_bits) {
5452  const RebindToUnsigned<decltype(d)> du;
5453  // Easier than Set(), which would require an >8-bit type, which would not
5454  // compile for T=uint8_t, N=1.
5455  const Vec128<T, N> vbits{_mm_cvtsi32_si128(static_cast<int>(mask_bits))};
5456 
5457  // Replicate bytes 8x such that each byte contains the bit that governs it.
5458  alignas(16) constexpr uint8_t kRep8[16] = {0, 0, 0, 0, 0, 0, 0, 0,
5459  1, 1, 1, 1, 1, 1, 1, 1};
5460  const auto rep8 = TableLookupBytes(vbits, Load(du, kRep8));
5461 
5462  alignas(16) constexpr uint8_t kBit[16] = {1, 2, 4, 8, 16, 32, 64, 128,
5463  1, 2, 4, 8, 16, 32, 64, 128};
5464  return RebindMask(d, TestBit(rep8, LoadDup128(du, kBit)));
5465 }
5466 
5467 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
5468 HWY_INLINE Mask128<T, N> LoadMaskBits(Simd<T, N, 0> d, uint64_t mask_bits) {
5469  const RebindToUnsigned<decltype(d)> du;
5470  alignas(16) constexpr uint16_t kBit[8] = {1, 2, 4, 8, 16, 32, 64, 128};
5471  const auto vmask_bits = Set(du, static_cast<uint16_t>(mask_bits));
5472  return RebindMask(d, TestBit(vmask_bits, Load(du, kBit)));
5473 }
5474 
5475 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
5476 HWY_INLINE Mask128<T, N> LoadMaskBits(Simd<T, N, 0> d, uint64_t mask_bits) {
5477  const RebindToUnsigned<decltype(d)> du;
5478  alignas(16) constexpr uint32_t kBit[8] = {1, 2, 4, 8};
5479  const auto vmask_bits = Set(du, static_cast<uint32_t>(mask_bits));
5480  return RebindMask(d, TestBit(vmask_bits, Load(du, kBit)));
5481 }
5482 
5483 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
5484 HWY_INLINE Mask128<T, N> LoadMaskBits(Simd<T, N, 0> d, uint64_t mask_bits) {
5485  const RebindToUnsigned<decltype(d)> du;
5486  alignas(16) constexpr uint64_t kBit[8] = {1, 2};
5487  return RebindMask(d, TestBit(Set(du, mask_bits), Load(du, kBit)));
5488 }
5489 
5490 } // namespace detail
5491 
5492 // `p` points to at least 8 readable bytes, not all of which need be valid.
5493 template <typename T, size_t N, HWY_IF_LE128(T, N)>
5494 HWY_API Mask128<T, N> LoadMaskBits(Simd<T, N, 0> d,
5495  const uint8_t* HWY_RESTRICT bits) {
5496  uint64_t mask_bits = 0;
5497  constexpr size_t kNumBytes = (N + 7) / 8;
5498  CopyBytes<kNumBytes>(bits, &mask_bits);
5499  if (N < 8) {
5500  mask_bits &= (1ull << N) - 1;
5501  }
5502 
5503  return detail::LoadMaskBits(d, mask_bits);
5504 }
5505 
5506 // ------------------------------ StoreMaskBits
5507 
5508 namespace detail {
5509 
5510 constexpr HWY_INLINE uint64_t U64FromInt(int mask_bits) {
5511  return static_cast<uint64_t>(static_cast<unsigned>(mask_bits));
5512 }
5513 
5514 template <typename T, size_t N>
5515 HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/,
5516  const Mask128<T, N> mask) {
5517  const Simd<T, N, 0> d;
5518  const auto sign_bits = BitCast(d, VecFromMask(d, mask)).raw;
5519  return U64FromInt(_mm_movemask_epi8(sign_bits));
5520 }
5521 
5522 template <typename T, size_t N>
5523 HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<2> /*tag*/,
5524  const Mask128<T, N> mask) {
5525  // Remove useless lower half of each u16 while preserving the sign bit.
5526  const auto sign_bits = _mm_packs_epi16(mask.raw, _mm_setzero_si128());
5527  return U64FromInt(_mm_movemask_epi8(sign_bits));
5528 }
5529 
5530 template <typename T, size_t N>
5531 HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<4> /*tag*/,
5532  const Mask128<T, N> mask) {
5533  const Simd<T, N, 0> d;
5534  const Simd<float, N, 0> df;
5535  const auto sign_bits = BitCast(df, VecFromMask(d, mask));
5536  return U64FromInt(_mm_movemask_ps(sign_bits.raw));
5537 }
5538 
5539 template <typename T, size_t N>
5540 HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<8> /*tag*/,
5541  const Mask128<T, N> mask) {
5542  const Simd<T, N, 0> d;
5543  const Simd<double, N, 0> df;
5544  const auto sign_bits = BitCast(df, VecFromMask(d, mask));
5545  return U64FromInt(_mm_movemask_pd(sign_bits.raw));
5546 }
5547 
5548 // Returns the lowest N of the _mm_movemask* bits.
5549 template <typename T, size_t N>
5550 constexpr uint64_t OnlyActive(uint64_t mask_bits) {
5551  return ((N * sizeof(T)) == 16) ? mask_bits : mask_bits & ((1ull << N) - 1);
5552 }
5553 
5554 template <typename T, size_t N>
5555 HWY_INLINE uint64_t BitsFromMask(const Mask128<T, N> mask) {
5556  return OnlyActive<T, N>(BitsFromMask(hwy::SizeTag<sizeof(T)>(), mask));
5557 }
5558 
5559 } // namespace detail
5560 
5561 // `p` points to at least 8 writable bytes.
5562 template <typename T, size_t N>
5563 HWY_API size_t StoreMaskBits(const Simd<T, N, 0> /* tag */,
5564  const Mask128<T, N> mask, uint8_t* bits) {
5565  constexpr size_t kNumBytes = (N + 7) / 8;
5566  const uint64_t mask_bits = detail::BitsFromMask(mask);
5567  CopyBytes<kNumBytes>(&mask_bits, bits);
5568  return kNumBytes;
5569 }
5570 
5571 // ------------------------------ Mask testing
5572 
5573 template <typename T, size_t N>
5574 HWY_API bool AllFalse(const Simd<T, N, 0> /* tag */, const Mask128<T, N> mask) {
5575  // Cheaper than PTEST, which is 2 uop / 3L.
5576  return detail::BitsFromMask(mask) == 0;
5577 }
5578 
5579 template <typename T, size_t N>
5580 HWY_API bool AllTrue(const Simd<T, N, 0> /* tag */, const Mask128<T, N> mask) {
5581  constexpr uint64_t kAllBits =
5582  detail::OnlyActive<T, N>((1ull << (16 / sizeof(T))) - 1);
5583  return detail::BitsFromMask(mask) == kAllBits;
5584 }
5585 
5586 template <typename T, size_t N>
5587 HWY_API size_t CountTrue(const Simd<T, N, 0> /* tag */,
5588  const Mask128<T, N> mask) {
5589  return PopCount(detail::BitsFromMask(mask));
5590 }
5591 
5592 template <typename T, size_t N>
5593 HWY_API intptr_t FindFirstTrue(const Simd<T, N, 0> /* tag */,
5594  const Mask128<T, N> mask) {
5595  const uint64_t mask_bits = detail::BitsFromMask(mask);
5596  return mask_bits ? intptr_t(Num0BitsBelowLS1Bit_Nonzero64(mask_bits)) : -1;
5597 }
5598 
5599 // ------------------------------ Compress, CompressBits
5600 
5601 namespace detail {
5602 
5603 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
5604 HWY_INLINE Vec128<T, N> IndicesFromBits(Simd<T, N, 0> d, uint64_t mask_bits) {
5605  HWY_DASSERT(mask_bits < 256);
5606  const Rebind<uint8_t, decltype(d)> d8;
5607  const Simd<uint16_t, N, 0> du;
5608 
5609  // compress_epi16 requires VBMI2 and there is no permutevar_epi16, so we need
5610  // byte indices for PSHUFB (one vector's worth for each of 256 combinations of
5611  // 8 mask bits). Loading them directly would require 4 KiB. We can instead
5612  // store lane indices and convert to byte indices (2*lane + 0..1), with the
5613  // doubling baked into the table. AVX2 Compress32 stores eight 4-bit lane
5614  // indices (total 1 KiB), broadcasts them into each 32-bit lane and shifts.
5615  // Here, 16-bit lanes are too narrow to hold all bits, and unpacking nibbles
5616  // is likely more costly than the higher cache footprint from storing bytes.
5617  alignas(16) constexpr uint8_t table[2048] = {
5618  0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14, //
5619  2, 0, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14, //
5620  4, 0, 2, 6, 8, 10, 12, 14, 0, 4, 2, 6, 8, 10, 12, 14, //
5621  2, 4, 0, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14, //
5622  6, 0, 2, 4, 8, 10, 12, 14, 0, 6, 2, 4, 8, 10, 12, 14, //
5623  2, 6, 0, 4, 8, 10, 12, 14, 0, 2, 6, 4, 8, 10, 12, 14, //
5624  4, 6, 0, 2, 8, 10, 12, 14, 0, 4, 6, 2, 8, 10, 12, 14, //
5625  2, 4, 6, 0, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14, //
5626  8, 0, 2, 4, 6, 10, 12, 14, 0, 8, 2, 4, 6, 10, 12, 14, //
5627  2, 8, 0, 4, 6, 10, 12, 14, 0, 2, 8, 4, 6, 10, 12, 14, //
5628  4, 8, 0, 2, 6, 10, 12, 14, 0, 4, 8, 2, 6, 10, 12, 14, //
5629  2, 4, 8, 0, 6, 10, 12, 14, 0, 2, 4, 8, 6, 10, 12, 14, //
5630  6, 8, 0, 2, 4, 10, 12, 14, 0, 6, 8, 2, 4, 10, 12, 14, //
5631  2, 6, 8, 0, 4, 10, 12, 14, 0, 2, 6, 8, 4, 10, 12, 14, //
5632  4, 6, 8, 0, 2, 10, 12, 14, 0, 4, 6, 8, 2, 10, 12, 14, //
5633  2, 4, 6, 8, 0, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14, //
5634  10, 0, 2, 4, 6, 8, 12, 14, 0, 10, 2, 4, 6, 8, 12, 14, //
5635  2, 10, 0, 4, 6, 8, 12, 14, 0, 2, 10, 4, 6, 8, 12, 14, //
5636  4, 10, 0, 2, 6, 8, 12, 14, 0, 4, 10, 2, 6, 8, 12, 14, //
5637  2, 4, 10, 0, 6, 8, 12, 14, 0, 2, 4, 10, 6, 8, 12, 14, //
5638  6, 10, 0, 2, 4, 8, 12, 14, 0, 6, 10, 2, 4, 8, 12, 14, //
5639  2, 6, 10, 0, 4, 8, 12, 14, 0, 2, 6, 10, 4, 8, 12, 14, //
5640  4, 6, 10, 0, 2, 8, 12, 14, 0, 4, 6, 10, 2, 8, 12, 14, //
5641  2, 4, 6, 10, 0, 8, 12, 14, 0, 2, 4, 6, 10, 8, 12, 14, //
5642  8, 10, 0, 2, 4, 6, 12, 14, 0, 8, 10, 2, 4, 6, 12, 14, //
5643  2, 8, 10, 0, 4, 6, 12, 14, 0, 2, 8, 10, 4, 6, 12, 14, //
5644  4, 8, 10, 0, 2, 6, 12, 14, 0, 4, 8, 10, 2, 6, 12, 14, //
5645  2, 4, 8, 10, 0, 6, 12, 14, 0, 2, 4, 8, 10, 6, 12, 14, //
5646  6, 8, 10, 0, 2, 4, 12, 14, 0, 6, 8, 10, 2, 4, 12, 14, //
5647  2, 6, 8, 10, 0, 4, 12, 14, 0, 2, 6, 8, 10, 4, 12, 14, //
5648  4, 6, 8, 10, 0, 2, 12, 14, 0, 4, 6, 8, 10, 2, 12, 14, //
5649  2, 4, 6, 8, 10, 0, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14, //
5650  12, 0, 2, 4, 6, 8, 10, 14, 0, 12, 2, 4, 6, 8, 10, 14, //
5651  2, 12, 0, 4, 6, 8, 10, 14, 0, 2, 12, 4, 6, 8, 10, 14, //
5652  4, 12, 0, 2, 6, 8, 10, 14, 0, 4, 12, 2, 6, 8, 10, 14, //
5653  2, 4, 12, 0, 6, 8, 10, 14, 0, 2, 4, 12, 6, 8, 10, 14, //
5654  6, 12, 0, 2, 4, 8, 10, 14, 0, 6, 12, 2, 4, 8, 10, 14, //
5655  2, 6, 12, 0, 4, 8, 10, 14, 0, 2, 6, 12, 4, 8, 10, 14, //
5656  4, 6, 12, 0, 2, 8, 10, 14, 0, 4, 6, 12, 2, 8, 10, 14, //
5657  2, 4, 6, 12, 0, 8, 10, 14, 0, 2, 4, 6, 12, 8, 10, 14, //
5658  8, 12, 0, 2, 4, 6, 10, 14, 0, 8, 12, 2, 4, 6, 10, 14, //
5659  2, 8, 12, 0, 4, 6, 10, 14, 0, 2, 8, 12, 4, 6, 10, 14, //
5660  4, 8, 12, 0, 2, 6, 10, 14, 0, 4, 8, 12, 2, 6, 10, 14, //
5661  2, 4, 8, 12, 0, 6, 10, 14, 0, 2, 4, 8, 12, 6, 10, 14, //
5662  6, 8, 12, 0, 2, 4, 10, 14, 0, 6, 8, 12, 2, 4, 10, 14, //
5663  2, 6, 8, 12, 0, 4, 10, 14, 0, 2, 6, 8, 12, 4, 10, 14, //
5664  4, 6, 8, 12, 0, 2, 10, 14, 0, 4, 6, 8, 12, 2, 10, 14, //
5665  2, 4, 6, 8, 12, 0, 10, 14, 0, 2, 4, 6, 8, 12, 10, 14, //
5666  10, 12, 0, 2, 4, 6, 8, 14, 0, 10, 12, 2, 4, 6, 8, 14, //
5667  2, 10, 12, 0, 4, 6, 8, 14, 0, 2, 10, 12, 4, 6, 8, 14, //
5668  4, 10, 12, 0, 2, 6, 8, 14, 0, 4, 10, 12, 2, 6, 8, 14, //
5669  2, 4, 10, 12, 0, 6, 8, 14, 0, 2, 4, 10, 12, 6, 8, 14, //
5670  6, 10, 12, 0, 2, 4, 8, 14, 0, 6, 10, 12, 2, 4, 8, 14, //
5671  2, 6, 10, 12, 0, 4, 8, 14, 0, 2, 6, 10, 12, 4, 8, 14, //
5672  4, 6, 10, 12, 0, 2, 8, 14, 0, 4, 6, 10, 12, 2, 8, 14, //
5673  2, 4, 6, 10, 12, 0, 8, 14, 0, 2, 4, 6, 10, 12, 8, 14, //
5674  8, 10, 12, 0, 2, 4, 6, 14, 0, 8, 10, 12, 2, 4, 6, 14, //
5675  2, 8, 10, 12, 0, 4, 6, 14, 0, 2, 8, 10, 12, 4, 6, 14, //
5676  4, 8, 10, 12, 0, 2, 6, 14, 0, 4, 8, 10, 12, 2, 6, 14, //
5677  2, 4, 8, 10, 12, 0, 6, 14, 0, 2, 4, 8, 10, 12, 6, 14, //
5678  6, 8, 10, 12, 0, 2, 4, 14, 0, 6, 8, 10, 12, 2, 4, 14, //
5679  2, 6, 8, 10, 12, 0, 4, 14, 0, 2, 6, 8, 10, 12, 4, 14, //
5680  4, 6, 8, 10, 12, 0, 2, 14, 0, 4, 6, 8, 10, 12, 2, 14, //
5681  2, 4, 6, 8, 10, 12, 0, 14, 0, 2, 4, 6, 8, 10, 12, 14, //
5682  14, 0, 2, 4, 6, 8, 10, 12, 0, 14, 2, 4, 6, 8, 10, 12, //
5683  2, 14, 0, 4, 6, 8, 10, 12, 0, 2, 14, 4, 6, 8, 10, 12, //
5684  4, 14, 0, 2, 6, 8, 10, 12, 0, 4, 14, 2, 6, 8, 10, 12, //
5685  2, 4, 14, 0, 6, 8, 10, 12, 0, 2, 4, 14, 6, 8, 10, 12, //
5686  6, 14, 0, 2, 4, 8, 10, 12, 0, 6, 14, 2, 4, 8, 10, 12, //
5687  2, 6, 14, 0, 4, 8, 10, 12, 0, 2, 6, 14, 4, 8, 10, 12, //
5688  4, 6, 14, 0, 2, 8, 10, 12, 0, 4, 6, 14, 2, 8, 10, 12, //
5689  2, 4, 6, 14, 0, 8, 10, 12, 0, 2, 4, 6, 14, 8, 10, 12, //
5690  8, 14, 0, 2, 4, 6, 10, 12, 0, 8, 14, 2, 4, 6, 10, 12, //
5691  2, 8, 14, 0, 4, 6, 10, 12, 0, 2, 8, 14, 4, 6, 10, 12, //
5692  4, 8, 14, 0, 2, 6, 10, 12, 0, 4, 8, 14, 2, 6, 10, 12, //
5693  2, 4, 8, 14, 0, 6, 10, 12, 0, 2, 4, 8, 14, 6, 10, 12, //
5694  6, 8, 14, 0, 2, 4, 10, 12, 0, 6, 8, 14, 2, 4, 10, 12, //
5695  2, 6, 8, 14, 0, 4, 10, 12, 0, 2, 6, 8, 14, 4, 10, 12, //
5696  4, 6, 8, 14, 0, 2, 10, 12, 0, 4, 6, 8, 14, 2, 10, 12, //
5697  2, 4, 6, 8, 14, 0, 10, 12, 0, 2, 4, 6, 8, 14, 10, 12, //
5698  10, 14, 0, 2, 4, 6, 8, 12, 0, 10, 14, 2, 4, 6, 8, 12, //
5699  2, 10, 14, 0, 4, 6, 8, 12, 0, 2, 10, 14, 4, 6, 8, 12, //
5700  4, 10, 14, 0, 2, 6, 8, 12, 0, 4, 10, 14, 2, 6, 8, 12, //
5701  2, 4, 10, 14, 0, 6, 8, 12, 0, 2, 4, 10, 14, 6, 8, 12, //
5702  6, 10, 14, 0, 2, 4, 8, 12, 0, 6, 10, 14, 2, 4, 8, 12, //
5703  2, 6, 10, 14, 0, 4, 8, 12, 0, 2, 6, 10, 14, 4, 8, 12, //
5704  4, 6, 10, 14, 0, 2, 8, 12, 0, 4, 6, 10, 14, 2, 8, 12, //
5705  2, 4, 6, 10, 14, 0, 8, 12, 0, 2, 4, 6, 10, 14, 8, 12, //
5706  8, 10, 14, 0, 2, 4, 6, 12, 0, 8, 10, 14, 2, 4, 6, 12, //
5707  2, 8, 10, 14, 0, 4, 6, 12, 0, 2, 8, 10, 14, 4, 6, 12, //
5708  4, 8, 10, 14, 0, 2, 6, 12, 0, 4, 8, 10, 14, 2, 6, 12, //
5709  2, 4, 8, 10, 14, 0, 6, 12, 0, 2, 4, 8, 10, 14, 6, 12, //
5710  6, 8, 10, 14, 0, 2, 4, 12, 0, 6, 8, 10, 14, 2, 4, 12, //
5711  2, 6, 8, 10, 14, 0, 4, 12, 0, 2, 6, 8, 10, 14, 4, 12, //
5712  4, 6, 8, 10, 14, 0, 2, 12, 0, 4, 6, 8, 10, 14, 2, 12, //
5713  2, 4, 6, 8, 10, 14, 0, 12, 0, 2, 4, 6, 8, 10, 14, 12, //
5714  12, 14, 0, 2, 4, 6, 8, 10, 0, 12, 14, 2, 4, 6, 8, 10, //
5715  2, 12, 14, 0, 4, 6, 8, 10, 0, 2, 12, 14, 4, 6, 8, 10, //
5716  4, 12, 14, 0, 2, 6, 8, 10, 0, 4, 12, 14, 2, 6, 8, 10, //
5717  2, 4, 12, 14, 0, 6, 8, 10, 0, 2, 4, 12, 14, 6, 8, 10, //
5718  6, 12, 14, 0, 2, 4, 8, 10, 0, 6, 12, 14, 2, 4, 8, 10, //
5719  2, 6, 12, 14, 0, 4, 8, 10, 0, 2, 6, 12, 14, 4, 8, 10, //
5720  4, 6, 12, 14, 0, 2, 8, 10, 0, 4, 6, 12, 14, 2, 8, 10, //
5721  2, 4, 6, 12, 14, 0, 8, 10, 0, 2, 4, 6, 12, 14, 8, 10, //
5722  8, 12, 14, 0, 2, 4, 6, 10, 0, 8, 12, 14, 2, 4, 6, 10, //
5723  2, 8, 12, 14, 0, 4, 6, 10, 0, 2, 8, 12, 14, 4, 6, 10, //
5724  4, 8, 12, 14, 0, 2, 6, 10, 0, 4, 8, 12, 14, 2, 6, 10, //
5725  2, 4, 8, 12, 14, 0, 6, 10, 0, 2, 4, 8, 12, 14, 6, 10, //
5726  6, 8, 12, 14, 0, 2, 4, 10, 0, 6, 8, 12, 14, 2, 4, 10, //
5727  2, 6, 8, 12, 14, 0, 4, 10, 0, 2, 6, 8, 12, 14, 4, 10, //
5728  4, 6, 8, 12, 14, 0, 2, 10, 0, 4, 6, 8, 12, 14, 2, 10, //
5729  2, 4, 6, 8, 12, 14, 0, 10, 0, 2, 4, 6, 8, 12, 14, 10, //
5730  10, 12, 14, 0, 2, 4, 6, 8, 0, 10, 12, 14, 2, 4, 6, 8, //
5731  2, 10, 12, 14, 0, 4, 6, 8, 0, 2, 10, 12, 14, 4, 6, 8, //
5732  4, 10, 12, 14, 0, 2, 6, 8, 0, 4, 10, 12, 14, 2, 6, 8, //
5733  2, 4, 10, 12, 14, 0, 6, 8, 0, 2, 4, 10, 12, 14, 6, 8, //
5734  6, 10, 12, 14, 0, 2, 4, 8, 0, 6, 10, 12, 14, 2, 4, 8, //
5735  2, 6, 10, 12, 14, 0, 4, 8, 0, 2, 6, 10, 12, 14, 4, 8, //
5736  4, 6, 10, 12, 14, 0, 2, 8, 0, 4, 6, 10, 12, 14, 2, 8, //
5737  2, 4, 6, 10, 12, 14, 0, 8, 0, 2, 4, 6, 10, 12, 14, 8, //
5738  8, 10, 12, 14, 0, 2, 4, 6, 0, 8, 10, 12, 14, 2, 4, 6, //
5739  2, 8, 10, 12, 14, 0, 4, 6, 0, 2, 8, 10, 12, 14, 4, 6, //
5740  4, 8, 10, 12, 14, 0, 2, 6, 0, 4, 8, 10, 12, 14, 2, 6, //
5741  2, 4, 8, 10, 12, 14, 0, 6, 0, 2, 4, 8, 10, 12, 14, 6, //
5742  6, 8, 10, 12, 14, 0, 2, 4, 0, 6, 8, 10, 12, 14, 2, 4, //
5743  2, 6, 8, 10, 12, 14, 0, 4, 0, 2, 6, 8, 10, 12, 14, 4, //
5744  4, 6, 8, 10, 12, 14, 0, 2, 0, 4, 6, 8, 10, 12, 14, 2, //
5745  2, 4, 6, 8, 10, 12, 14, 0, 0, 2, 4, 6, 8, 10, 12, 14};
5746 
5747  const Vec128<uint8_t, 2 * N> byte_idx{Load(d8, table + mask_bits * 8).raw};
5748  const Vec128<uint16_t, N> pairs = ZipLower(byte_idx, byte_idx);
5749  return BitCast(d, pairs + Set(du, 0x0100));
5750 }
5751 
5752 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4), HWY_IF_LE128(T, N)>
5753 HWY_INLINE Vec128<T, N> IndicesFromBits(Simd<T, N, 0> d, uint64_t mask_bits) {
5754  HWY_DASSERT(mask_bits < 16);
5755 
5756  // There are only 4 lanes, so we can afford to load the index vector directly.
5757  alignas(16) constexpr uint8_t packed_array[256] = {
5758  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, //
5759  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, //
5760  4, 5, 6, 7, 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, //
5761  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, //
5762  8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, //
5763  0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, //
5764  4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, //
5765  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, //
5766  12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, //
5767  0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11, //
5768  4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 8, 9, 10, 11, //
5769  0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 8, 9, 10, 11, //
5770  8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, //
5771  0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, //
5772  4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, //
5773  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
5774 
5775  const Repartition<uint8_t, decltype(d)> d8;
5776  return BitCast(d, Load(d8, packed_array + 16 * mask_bits));
5777 }
5778 
5779 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8), HWY_IF_LE128(T, N)>
5780 HWY_INLINE Vec128<T, N> IndicesFromBits(Simd<T, N, 0> d, uint64_t mask_bits) {
5781  HWY_DASSERT(mask_bits < 4);
5782 
5783  // There are only 2 lanes, so we can afford to load the index vector directly.
5784  alignas(16) constexpr uint8_t packed_array[64] = {
5785  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
5786  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
5787  8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7,
5788  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
5789 
5790  const Repartition<uint8_t, decltype(d)> d8;
5791  return BitCast(d, Load(d8, packed_array + 16 * mask_bits));
5792 }
5793 
5794 template <typename T, size_t N>
5795 HWY_API Vec128<T, N> CompressBits(Vec128<T, N> v, uint64_t mask_bits) {
5796  const Simd<T, N, 0> d;
5797  const RebindToUnsigned<decltype(d)> du;
5798 
5799  HWY_DASSERT(mask_bits < (1ull << N));
5800  const auto indices = BitCast(du, detail::IndicesFromBits(d, mask_bits));
5801  return BitCast(d, TableLookupBytes(BitCast(du, v), indices));
5802 }
5803 
5804 } // namespace detail
5805 
5806 template <typename T, size_t N>
5807 HWY_API Vec128<T, N> Compress(Vec128<T, N> v, Mask128<T, N> m) {
5809 }
5810 
5811 template <typename T, size_t N>
5812 HWY_API Vec128<T, N> CompressBits(Vec128<T, N> v,
5813  const uint8_t* HWY_RESTRICT bits) {
5814  uint64_t mask_bits = 0;
5815  constexpr size_t kNumBytes = (N + 7) / 8;
5816  CopyBytes<kNumBytes>(bits, &mask_bits);
5817  if (N < 8) {
5818  mask_bits &= (1ull << N) - 1;
5819  }
5820 
5821  return detail::CompressBits(v, mask_bits);
5822 }
5823 
5824 // ------------------------------ CompressStore, CompressBitsStore
5825 
5826 template <typename T, size_t N>
5827 HWY_API size_t CompressStore(Vec128<T, N> v, Mask128<T, N> m, Simd<T, N, 0> d,
5828  T* HWY_RESTRICT unaligned) {
5829  const RebindToUnsigned<decltype(d)> du;
5830 
5831  const uint64_t mask_bits = detail::BitsFromMask(m);
5832  HWY_DASSERT(mask_bits < (1ull << N));
5833 
5834  // Avoid _mm_maskmoveu_si128 (>500 cycle latency because it bypasses caches).
5835  const auto indices = BitCast(du, detail::IndicesFromBits(d, mask_bits));
5836  const auto compressed = BitCast(d, TableLookupBytes(BitCast(du, v), indices));
5837  StoreU(compressed, d, unaligned);
5838  return PopCount(mask_bits);
5839 }
5840 
5841 template <typename T, size_t N>
5842 HWY_API size_t CompressBlendedStore(Vec128<T, N> v, Mask128<T, N> m,
5843  Simd<T, N, 0> d,
5844  T* HWY_RESTRICT unaligned) {
5845  const RebindToUnsigned<decltype(d)> du;
5846 
5847  const uint64_t mask_bits = detail::BitsFromMask(m);
5848  HWY_DASSERT(mask_bits < (1ull << N));
5849  const size_t count = PopCount(mask_bits);
5850 
5851  // Avoid _mm_maskmoveu_si128 (>500 cycle latency because it bypasses caches).
5852  const auto indices = BitCast(du, detail::IndicesFromBits(d, mask_bits));
5853  const auto compressed = BitCast(d, TableLookupBytes(BitCast(du, v), indices));
5854  BlendedStore(compressed, FirstN(d, count), d, unaligned);
5855  return count;
5856 }
5857 
5858 template <typename T, size_t N>
5859 HWY_API size_t CompressBitsStore(Vec128<T, N> v,
5860  const uint8_t* HWY_RESTRICT bits,
5861  Simd<T, N, 0> d, T* HWY_RESTRICT unaligned) {
5862  const RebindToUnsigned<decltype(d)> du;
5863 
5864  uint64_t mask_bits = 0;
5865  constexpr size_t kNumBytes = (N + 7) / 8;
5866  CopyBytes<kNumBytes>(bits, &mask_bits);
5867  if (N < 8) {
5868  mask_bits &= (1ull << N) - 1;
5869  }
5870 
5871  // Avoid _mm_maskmoveu_si128 (>500 cycle latency because it bypasses caches).
5872  const auto indices = BitCast(du, detail::IndicesFromBits(d, mask_bits));
5873  const auto compressed = BitCast(d, TableLookupBytes(BitCast(du, v), indices));
5874  StoreU(compressed, d, unaligned);
5875  return PopCount(mask_bits);
5876 }
5877 
5878 #endif // HWY_TARGET <= HWY_AVX3
5879 
5880 // ------------------------------ StoreInterleaved3 (CombineShiftRightBytes,
5881 // TableLookupBytes)
5882 
5883 // 128 bits
5884 HWY_API void StoreInterleaved3(const Vec128<uint8_t> v0,
5885  const Vec128<uint8_t> v1,
5886  const Vec128<uint8_t> v2, Full128<uint8_t> d,
5887  uint8_t* HWY_RESTRICT unaligned) {
5888  const auto k5 = Set(d, 5);
5889  const auto k6 = Set(d, 6);
5890 
5891  // Shuffle (v0,v1,v2) vector bytes to (MSB on left): r5, bgr[4:0].
5892  // 0x80 so lanes to be filled from other vectors are 0 for blending.
5893  alignas(16) static constexpr uint8_t tbl_r0[16] = {
5894  0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80, //
5895  3, 0x80, 0x80, 4, 0x80, 0x80, 5};
5896  alignas(16) static constexpr uint8_t tbl_g0[16] = {
5897  0x80, 0, 0x80, 0x80, 1, 0x80, //
5898  0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, 4, 0x80, 0x80};
5899  const auto shuf_r0 = Load(d, tbl_r0);
5900  const auto shuf_g0 = Load(d, tbl_g0); // cannot reuse r0 due to 5 in MSB
5901  const auto shuf_b0 = CombineShiftRightBytes<15>(d, shuf_g0, shuf_g0);
5902  const auto r0 = TableLookupBytes(v0, shuf_r0); // 5..4..3..2..1..0
5903  const auto g0 = TableLookupBytes(v1, shuf_g0); // ..4..3..2..1..0.
5904  const auto b0 = TableLookupBytes(v2, shuf_b0); // .4..3..2..1..0..
5905  const auto int0 = r0 | g0 | b0;
5906  StoreU(int0, d, unaligned + 0 * 16);
5907 
5908  // Second vector: g10,r10, bgr[9:6], b5,g5
5909  const auto shuf_r1 = shuf_b0 + k6; // .A..9..8..7..6..
5910  const auto shuf_g1 = shuf_r0 + k5; // A..9..8..7..6..5
5911  const auto shuf_b1 = shuf_g0 + k5; // ..9..8..7..6..5.
5912  const auto r1 = TableLookupBytes(v0, shuf_r1);
5913  const auto g1 = TableLookupBytes(v1, shuf_g1);
5914  const auto b1 = TableLookupBytes(v2, shuf_b1);
5915  const auto int1 = r1 | g1 | b1;
5916  StoreU(int1, d, unaligned + 1 * 16);
5917 
5918  // Third vector: bgr[15:11], b10
5919  const auto shuf_r2 = shuf_b1 + k6; // ..F..E..D..C..B.
5920  const auto shuf_g2 = shuf_r1 + k5; // .F..E..D..C..B..
5921  const auto shuf_b2 = shuf_g1 + k5; // F..E..D..C..B..A
5922  const auto r2 = TableLookupBytes(v0, shuf_r2);
5923  const auto g2 = TableLookupBytes(v1, shuf_g2);
5924  const auto b2 = TableLookupBytes(v2, shuf_b2);
5925  const auto int2 = r2 | g2 | b2;
5926  StoreU(int2, d, unaligned + 2 * 16);
5927 }
5928 
5929 // 64 bits
5930 HWY_API void StoreInterleaved3(const Vec64<uint8_t> v0, const Vec64<uint8_t> v1,
5931  const Vec64<uint8_t> v2, Full64<uint8_t> d,
5932  uint8_t* HWY_RESTRICT unaligned) {
5933  // Use full vectors for the shuffles and first result.
5934  const Full128<uint8_t> d_full;
5935  const auto k5 = Set(d_full, 5);
5936  const auto k6 = Set(d_full, 6);
5937 
5938  const Vec128<uint8_t> full_a{v0.raw};
5939  const Vec128<uint8_t> full_b{v1.raw};
5940  const Vec128<uint8_t> full_c{v2.raw};
5941 
5942  // Shuffle (v0,v1,v2) vector bytes to (MSB on left): r5, bgr[4:0].
5943  // 0x80 so lanes to be filled from other vectors are 0 for blending.
5944  alignas(16) static constexpr uint8_t tbl_r0[16] = {
5945  0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80, //
5946  3, 0x80, 0x80, 4, 0x80, 0x80, 5};
5947  alignas(16) static constexpr uint8_t tbl_g0[16] = {
5948  0x80, 0, 0x80, 0x80, 1, 0x80, //
5949  0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, 4, 0x80, 0x80};
5950  const auto shuf_r0 = Load(d_full, tbl_r0);
5951  const auto shuf_g0 = Load(d_full, tbl_g0); // cannot reuse r0 due to 5 in MSB
5952  const auto shuf_b0 = CombineShiftRightBytes<15>(d_full, shuf_g0, shuf_g0);
5953  const auto r0 = TableLookupBytes(full_a, shuf_r0); // 5..4..3..2..1..0
5954  const auto g0 = TableLookupBytes(full_b, shuf_g0); // ..4..3..2..1..0.
5955  const auto b0 = TableLookupBytes(full_c, shuf_b0); // .4..3..2..1..0..
5956  const auto int0 = r0 | g0 | b0;
5957  StoreU(int0, d_full, unaligned + 0 * 16);
5958 
5959  // Second (HALF) vector: bgr[7:6], b5,g5
5960  const auto shuf_r1 = shuf_b0 + k6; // ..7..6..
5961  const auto shuf_g1 = shuf_r0 + k5; // .7..6..5
5962  const auto shuf_b1 = shuf_g0 + k5; // 7..6..5.
5963  const auto r1 = TableLookupBytes(full_a, shuf_r1);
5964  const auto g1 = TableLookupBytes(full_b, shuf_g1);
5965  const auto b1 = TableLookupBytes(full_c, shuf_b1);
5966  const decltype(Zero(d)) int1{(r1 | g1 | b1).raw};
5967  StoreU(int1, d, unaligned + 1 * 16);
5968 }
5969 
5970 // <= 32 bits
5971 template <size_t N, HWY_IF_LE32(uint8_t, N)>
5972 HWY_API void StoreInterleaved3(const Vec128<uint8_t, N> v0,
5973  const Vec128<uint8_t, N> v1,
5974  const Vec128<uint8_t, N> v2,
5975  Simd<uint8_t, N, 0> /*tag*/,
5976  uint8_t* HWY_RESTRICT unaligned) {
5977  // Use full vectors for the shuffles and result.
5978  const Full128<uint8_t> d_full;
5979 
5980  const Vec128<uint8_t> full_a{v0.raw};
5981  const Vec128<uint8_t> full_b{v1.raw};
5982  const Vec128<uint8_t> full_c{v2.raw};
5983 
5984  // Shuffle (v0,v1,v2) vector bytes to bgr[3:0].
5985  // 0x80 so lanes to be filled from other vectors are 0 for blending.
5986  alignas(16) static constexpr uint8_t tbl_r0[16] = {
5987  0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, //
5988  0x80, 0x80, 0x80, 0x80};
5989  const auto shuf_r0 = Load(d_full, tbl_r0);
5990  const auto shuf_g0 = CombineShiftRightBytes<15>(d_full, shuf_r0, shuf_r0);
5991  const auto shuf_b0 = CombineShiftRightBytes<14>(d_full, shuf_r0, shuf_r0);
5992  const auto r0 = TableLookupBytes(full_a, shuf_r0); // ......3..2..1..0
5993  const auto g0 = TableLookupBytes(full_b, shuf_g0); // .....3..2..1..0.
5994  const auto b0 = TableLookupBytes(full_c, shuf_b0); // ....3..2..1..0..
5995  const auto int0 = r0 | g0 | b0;
5996  alignas(16) uint8_t buf[16];
5997  StoreU(int0, d_full, buf);
5998  CopyBytes<N * 3>(buf, unaligned);
5999 }
6000 
6001 // ------------------------------ StoreInterleaved4
6002 
6003 // 128 bits
6004 HWY_API void StoreInterleaved4(const Vec128<uint8_t> v0,
6005  const Vec128<uint8_t> v1,
6006  const Vec128<uint8_t> v2,
6007  const Vec128<uint8_t> v3, Full128<uint8_t> d8,
6008  uint8_t* HWY_RESTRICT unaligned) {
6009  const RepartitionToWide<decltype(d8)> d16;
6010  const RepartitionToWide<decltype(d16)> d32;
6011  // let a,b,c,d denote v0..3.
6012  const auto ba0 = ZipLower(d16, v0, v1); // b7 a7 .. b0 a0
6013  const auto dc0 = ZipLower(d16, v2, v3); // d7 c7 .. d0 c0
6014  const auto ba8 = ZipUpper(d16, v0, v1);
6015  const auto dc8 = ZipUpper(d16, v2, v3);
6016  const auto dcba_0 = ZipLower(d32, ba0, dc0); // d..a3 d..a0
6017  const auto dcba_4 = ZipUpper(d32, ba0, dc0); // d..a7 d..a4
6018  const auto dcba_8 = ZipLower(d32, ba8, dc8); // d..aB d..a8
6019  const auto dcba_C = ZipUpper(d32, ba8, dc8); // d..aF d..aC
6020  StoreU(BitCast(d8, dcba_0), d8, unaligned + 0 * 16);
6021  StoreU(BitCast(d8, dcba_4), d8, unaligned + 1 * 16);
6022  StoreU(BitCast(d8, dcba_8), d8, unaligned + 2 * 16);
6023  StoreU(BitCast(d8, dcba_C), d8, unaligned + 3 * 16);
6024 }
6025 
6026 // 64 bits
6027 HWY_API void StoreInterleaved4(const Vec64<uint8_t> in0,
6028  const Vec64<uint8_t> in1,
6029  const Vec64<uint8_t> in2,
6030  const Vec64<uint8_t> in3,
6031  Full64<uint8_t> /*tag*/,
6032  uint8_t* HWY_RESTRICT unaligned) {
6033  // Use full vectors to reduce the number of stores.
6034  const Full128<uint8_t> d_full8;
6035  const RepartitionToWide<decltype(d_full8)> d16;
6036  const RepartitionToWide<decltype(d16)> d32;
6037  const Vec128<uint8_t> v0{in0.raw};
6038  const Vec128<uint8_t> v1{in1.raw};
6039  const Vec128<uint8_t> v2{in2.raw};
6040  const Vec128<uint8_t> v3{in3.raw};
6041  // let a,b,c,d denote v0..3.
6042  const auto ba0 = ZipLower(d16, v0, v1); // b7 a7 .. b0 a0
6043  const auto dc0 = ZipLower(d16, v2, v3); // d7 c7 .. d0 c0
6044  const auto dcba_0 = ZipLower(d32, ba0, dc0); // d..a3 d..a0
6045  const auto dcba_4 = ZipUpper(d32, ba0, dc0); // d..a7 d..a4
6046  StoreU(BitCast(d_full8, dcba_0), d_full8, unaligned + 0 * 16);
6047  StoreU(BitCast(d_full8, dcba_4), d_full8, unaligned + 1 * 16);
6048 }
6049 
6050 // <= 32 bits
6051 template <size_t N, HWY_IF_LE32(uint8_t, N)>
6052 HWY_API void StoreInterleaved4(const Vec128<uint8_t, N> in0,
6053  const Vec128<uint8_t, N> in1,
6054  const Vec128<uint8_t, N> in2,
6055  const Vec128<uint8_t, N> in3,
6056  Simd<uint8_t, N, 0> /*tag*/,
6057  uint8_t* HWY_RESTRICT unaligned) {
6058  // Use full vectors to reduce the number of stores.
6059  const Full128<uint8_t> d_full8;
6060  const RepartitionToWide<decltype(d_full8)> d16;
6061  const RepartitionToWide<decltype(d16)> d32;
6062  const Vec128<uint8_t> v0{in0.raw};
6063  const Vec128<uint8_t> v1{in1.raw};
6064  const Vec128<uint8_t> v2{in2.raw};
6065  const Vec128<uint8_t> v3{in3.raw};
6066  // let a,b,c,d denote v0..3.
6067  const auto ba0 = ZipLower(d16, v0, v1); // b3 a3 .. b0 a0
6068  const auto dc0 = ZipLower(d16, v2, v3); // d3 c3 .. d0 c0
6069  const auto dcba_0 = ZipLower(d32, ba0, dc0); // d..a3 d..a0
6070  alignas(16) uint8_t buf[16];
6071  StoreU(BitCast(d_full8, dcba_0), d_full8, buf);
6072  CopyBytes<4 * N>(buf, unaligned);
6073 }
6074 
6075 // ------------------------------ Reductions
6076 
6077 namespace detail {
6078 
6079 // N=1 for any T: no-op
6080 template <typename T>
6081 HWY_INLINE Vec128<T, 1> SumOfLanes(hwy::SizeTag<sizeof(T)> /* tag */,
6082  const Vec128<T, 1> v) {
6083  return v;
6084 }
6085 template <typename T>
6086 HWY_INLINE Vec128<T, 1> MinOfLanes(hwy::SizeTag<sizeof(T)> /* tag */,
6087  const Vec128<T, 1> v) {
6088  return v;
6089 }
6090 template <typename T>
6091 HWY_INLINE Vec128<T, 1> MaxOfLanes(hwy::SizeTag<sizeof(T)> /* tag */,
6092  const Vec128<T, 1> v) {
6093  return v;
6094 }
6095 
6096 // u32/i32/f32:
6097 
6098 // N=2
6099 template <typename T>
6100 HWY_INLINE Vec128<T, 2> SumOfLanes(hwy::SizeTag<4> /* tag */,
6101  const Vec128<T, 2> v10) {
6102  return v10 + Shuffle2301(v10);
6103 }
6104 template <typename T>
6105 HWY_INLINE Vec128<T, 2> MinOfLanes(hwy::SizeTag<4> /* tag */,
6106  const Vec128<T, 2> v10) {
6107  return Min(v10, Shuffle2301(v10));
6108 }
6109 template <typename T>
6110 HWY_INLINE Vec128<T, 2> MaxOfLanes(hwy::SizeTag<4> /* tag */,
6111  const Vec128<T, 2> v10) {
6112  return Max(v10, Shuffle2301(v10));
6113 }
6114 
6115 // N=4 (full)
6116 template <typename T>
6117 HWY_INLINE Vec128<T> SumOfLanes(hwy::SizeTag<4> /* tag */,
6118  const Vec128<T> v3210) {
6119  const Vec128<T> v1032 = Shuffle1032(v3210);
6120  const Vec128<T> v31_20_31_20 = v3210 + v1032;
6121  const Vec128<T> v20_31_20_31 = Shuffle0321(v31_20_31_20);
6122  return v20_31_20_31 + v31_20_31_20;
6123 }
6124 template <typename T>
6125 HWY_INLINE Vec128<T> MinOfLanes(hwy::SizeTag<4> /* tag */,
6126  const Vec128<T> v3210) {
6127  const Vec128<T> v1032 = Shuffle1032(v3210);
6128  const Vec128<T> v31_20_31_20 = Min(v3210, v1032);
6129  const Vec128<T> v20_31_20_31 = Shuffle0321(v31_20_31_20);
6130  return Min(v20_31_20_31, v31_20_31_20);
6131 }
6132 template <typename T>
6133 HWY_INLINE Vec128<T> MaxOfLanes(hwy::SizeTag<4> /* tag */,
6134  const Vec128<T> v3210) {
6135  const Vec128<T> v1032 = Shuffle1032(v3210);
6136  const Vec128<T> v31_20_31_20 = Max(v3210, v1032);
6137  const Vec128<T> v20_31_20_31 = Shuffle0321(v31_20_31_20);
6138  return Max(v20_31_20_31, v31_20_31_20);
6139 }
6140 
6141 // u64/i64/f64:
6142 
6143 // N=2 (full)
6144 template <typename T>
6145 HWY_INLINE Vec128<T> SumOfLanes(hwy::SizeTag<8> /* tag */,
6146  const Vec128<T> v10) {
6147  const Vec128<T> v01 = Shuffle01(v10);
6148  return v10 + v01;
6149 }
6150 template <typename T>
6151 HWY_INLINE Vec128<T> MinOfLanes(hwy::SizeTag<8> /* tag */,
6152  const Vec128<T> v10) {
6153  const Vec128<T> v01 = Shuffle01(v10);
6154  return Min(v10, v01);
6155 }
6156 template <typename T>
6157 HWY_INLINE Vec128<T> MaxOfLanes(hwy::SizeTag<8> /* tag */,
6158  const Vec128<T> v10) {
6159  const Vec128<T> v01 = Shuffle01(v10);
6160  return Max(v10, v01);
6161 }
6162 
6163 // u16/i16
6164 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2), HWY_IF_GE32(T, N)>
6165 HWY_API Vec128<T, N> MinOfLanes(hwy::SizeTag<2> /* tag */, Vec128<T, N> v) {
6166  const Repartition<int32_t, Simd<T, N, 0>> d32;
6167  const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
6168  const auto odd = ShiftRight<16>(BitCast(d32, v));
6169  const auto min = MinOfLanes(d32, Min(even, odd));
6170  // Also broadcast into odd lanes.
6171  return BitCast(Simd<T, N, 0>(), Or(min, ShiftLeft<16>(min)));
6172 }
6173 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2), HWY_IF_GE32(T, N)>
6174 HWY_API Vec128<T, N> MaxOfLanes(hwy::SizeTag<2> /* tag */, Vec128<T, N> v) {
6175  const Repartition<int32_t, Simd<T, N, 0>> d32;
6176  const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
6177  const auto odd = ShiftRight<16>(BitCast(d32, v));
6178  const auto min = MaxOfLanes(d32, Max(even, odd));
6179  // Also broadcast into odd lanes.
6180  return BitCast(Simd<T, N, 0>(), Or(min, ShiftLeft<16>(min)));
6181 }
6182 
6183 } // namespace detail
6184 
6185 // Supported for u/i/f 32/64. Returns the same value in each lane.
6186 template <typename T, size_t N>
6187 HWY_API Vec128<T, N> SumOfLanes(Simd<T, N, 0> /* tag */, const Vec128<T, N> v) {
6188  return detail::SumOfLanes(hwy::SizeTag<sizeof(T)>(), v);
6189 }
6190 template <typename T, size_t N>
6191 HWY_API Vec128<T, N> MinOfLanes(Simd<T, N, 0> /* tag */, const Vec128<T, N> v) {
6192  return detail::MinOfLanes(hwy::SizeTag<sizeof(T)>(), v);
6193 }
6194 template <typename T, size_t N>
6195 HWY_API Vec128<T, N> MaxOfLanes(Simd<T, N, 0> /* tag */, const Vec128<T, N> v) {
6196  return detail::MaxOfLanes(hwy::SizeTag<sizeof(T)>(), v);
6197 }
6198 
6199 // ------------------------------ Lt128
6200 
6201 namespace detail {
6202 
6203 // Returns vector-mask for Lt128. Also used by x86_256/x86_512.
6204 template <class D, class V = VFromD<D>>
6205 HWY_INLINE V Lt128Vec(const D d, const V a, const V b) {
6206  static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8, "Use u64");
6207  // Truth table of Eq and Lt for Hi and Lo u64.
6208  // (removed lines with (=H && cH) or (=L && cL) - cannot both be true)
6209  // =H =L cH cL | out = cH | (=H & cL)
6210  // 0 0 0 0 | 0
6211  // 0 0 0 1 | 0
6212  // 0 0 1 0 | 1
6213  // 0 0 1 1 | 1
6214  // 0 1 0 0 | 0
6215  // 0 1 0 1 | 0
6216  // 0 1 1 0 | 1
6217  // 1 0 0 0 | 0
6218  // 1 0 0 1 | 1
6219  // 1 1 0 0 | 0
6220  const V eqHL = VecFromMask(d, Eq(a, b));
6221  const V ltHL = VecFromMask(d, Lt(a, b));
6222  const V ltLX = ShiftLeftLanes<1>(ltHL);
6223  const V vecHx = OrAnd(ltHL, eqHL, ltLX);
6224  return InterleaveUpper(d, vecHx, vecHx);
6225 }
6226 
6227 } // namespace detail
6228 
6229 template <class D, class V = VFromD<D>>
6230 HWY_API MFromD<D> Lt128(D d, const V a, const V b) {
6231  return MaskFromVec(detail::Lt128Vec(d, a, b));
6232 }
6233 
6234 // ------------------------------ Min128, Max128 (Lt128)
6235 
6236 // Avoids the extra MaskFromVec in Lt128.
6237 template <class D, class V = VFromD<D>>
6238 HWY_API V Min128(D d, const V a, const V b) {
6239  return IfVecThenElse(detail::Lt128Vec(d, a, b), a, b);
6240 }
6241 
6242 template <class D, class V = VFromD<D>>
6243 HWY_API V Max128(D d, const V a, const V b) {
6244  return IfVecThenElse(detail::Lt128Vec(d, a, b), b, a);
6245 }
6246 
6247 // ================================================== Operator wrapper
6248 
6249 // These apply to all x86_*-inl.h because there are no restrictions on V.
6250 
6251 template <class V>
6252 HWY_API V Add(V a, V b) {
6253  return a + b;
6254 }
6255 template <class V>
6256 HWY_API V Sub(V a, V b) {
6257  return a - b;
6258 }
6259 
6260 template <class V>
6261 HWY_API V Mul(V a, V b) {
6262  return a * b;
6263 }
6264 template <class V>
6265 HWY_API V Div(V a, V b) {
6266  return a / b;
6267 }
6268 
6269 template <class V>
6270 V Shl(V a, V b) {
6271  return a << b;
6272 }
6273 template <class V>
6274 V Shr(V a, V b) {
6275  return a >> b;
6276 }
6277 
6278 template <class V>
6279 HWY_API auto Eq(V a, V b) -> decltype(a == b) {
6280  return a == b;
6281 }
6282 template <class V>
6283 HWY_API auto Ne(V a, V b) -> decltype(a == b) {
6284  return a != b;
6285 }
6286 template <class V>
6287 HWY_API auto Lt(V a, V b) -> decltype(a == b) {
6288  return a < b;
6289 }
6290 
6291 template <class V>
6292 HWY_API auto Gt(V a, V b) -> decltype(a == b) {
6293  return a > b;
6294 }
6295 template <class V>
6296 HWY_API auto Ge(V a, V b) -> decltype(a == b) {
6297  return a >= b;
6298 }
6299 
6300 template <class V>
6301 HWY_API auto Le(V a, V b) -> decltype(a == b) {
6302  return a <= b;
6303 }
6304 
6305 // NOLINTNEXTLINE(google-readability-namespace-comments)
6306 } // namespace HWY_NAMESPACE
6307 } // namespace hwy
#define HWY_RESTRICT
Definition: base.h:63
#define HWY_IF_LANE_SIZE(T, bytes)
Definition: base.h:310
#define HWY_DIAGNOSTICS(tokens)
Definition: base.h:71
#define HWY_IF_LE64(T, N)
Definition: base.h:297
#define HWY_API
Definition: base.h:122
#define HWY_IF_LE128(T, N)
Definition: base.h:296
#define HWY_MIN(a, b)
Definition: base.h:127
#define HWY_INLINE
Definition: base.h:64
#define HWY_DIAGNOSTICS_OFF(msc, gcc)
Definition: base.h:72
#define HWY_DASSERT(condition)
Definition: base.h:193
#define HWY_MAYBE_UNUSED
Definition: base.h:75
#define HWY_ASSERT(condition)
Definition: base.h:147
Definition: arm_neon-inl.h:529
static Mask128< T, N > FromBits(uint64_t mask_bits)
Definition: x86_128-inl.h:152
Raw raw
Definition: arm_neon-inl.h:539
typename detail::Raw128< MakeUnsigned< T >, N >::type Raw
Definition: arm_neon-inl.h:531
Definition: arm_neon-inl.h:485
HWY_INLINE Vec128 & operator&=(const Vec128 other)
Definition: x86_128-inl.h:102
HWY_INLINE Vec128 & operator|=(const Vec128 other)
Definition: x86_128-inl.h:105
HWY_INLINE Vec128 & operator/=(const Vec128 other)
Definition: x86_128-inl.h:93
HWY_INLINE Vec128 & operator^=(const Vec128 other)
Definition: x86_128-inl.h:108
HWY_INLINE Vec128 & operator*=(const Vec128 other)
Definition: x86_128-inl.h:90
typename detail::Raw128< T, N >::type Raw
Definition: arm_neon-inl.h:486
Raw raw
Definition: arm_neon-inl.h:518
HWY_INLINE Vec128 & operator+=(const Vec128 other)
Definition: x86_128-inl.h:96
HWY_INLINE Vec128 & operator-=(const Vec128 other)
Definition: x86_128-inl.h:99
Definition: wasm_256-inl.h:39
Definition: x86_512-inl.h:103
#define HWY_AVX3_DL
Definition: detect_targets.h:58
#define HWY_TARGET
Definition: detect_targets.h:328
HWY_INLINE Vec128< T > PopulationCount(hwy::SizeTag< 1 >, Vec128< T > v)
Definition: arm_neon-inl.h:1538
HWY_INLINE Mask128< float, N > UseInt(const Vec128< float, N > v)
Definition: arm_neon-inl.h:2860
HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag< 1 >, const Mask128< T > mask)
Definition: arm_neon-inl.h:4522
HWY_INLINE auto FixConversionOverflow(DI di, VFromD< DF > original, decltype(Zero(di).raw) converted_raw) -> VFromD< DI >
Definition: x86_128-inl.h:4784
HWY_INLINE Mask128< T, N > Or(hwy::SizeTag< 1 >, const Mask128< T, N > a, const Mask128< T, N > b)
Definition: x86_128-inl.h:899
HWY_INLINE auto ClampF64ToI32Max(Simd< double, N, 0 > d, decltype(Zero(d)) v) -> decltype(Zero(d))
Definition: x86_128-inl.h:4773
HWY_API void ScalarMaskedStore(Vec128< T, N > v, Mask128< T, N > m, Simd< T, N, 0 > d, T *HWY_RESTRICT p)
Definition: x86_128-inl.h:1938
HWY_INLINE Mask128< T, N > MaskFromVec(hwy::SizeTag< 1 >, const Vec128< T, N > v)
Definition: x86_128-inl.h:1357
HWY_INLINE void ScatterIndex(hwy::SizeTag< 4 >, Vec128< T, N > v, Simd< T, N, 0 >, T *HWY_RESTRICT base, const Vec128< int32_t, N > index)
Definition: x86_128-inl.h:3106
HWY_INLINE Mask128< T, N > TestBit(hwy::SizeTag< 1 >, const Vec128< T, N > v, const Vec128< T, N > bit)
Definition: x86_128-inl.h:1193
HWY_INLINE Vec128< T, N > OddEven(hwy::SizeTag< 1 >, const Vec128< T, N > a, const Vec128< T, N > b)
Definition: wasm_128-inl.h:2568
HWY_INLINE void ScatterOffset(hwy::SizeTag< 4 >, Vec128< T, N > v, Simd< T, N, 0 >, T *HWY_RESTRICT base, const Vec128< int32_t, N > offset)
Definition: x86_128-inl.h:3095
HWY_INLINE __v128_u BitCastToInteger(__v128_u v)
Definition: wasm_128-inl.h:134
HWY_INLINE Vec128< T, 1 > MinOfLanes(hwy::SizeTag< sizeof(T)>, const Vec128< T, 1 > v)
Definition: arm_neon-inl.h:4309
HWY_INLINE Mask128< T, N > Xor(hwy::SizeTag< 1 >, const Mask128< T, N > a, const Mask128< T, N > b)
Definition: x86_128-inl.h:936
HWY_INLINE Vec128< T, N > IfThenElseZero(hwy::SizeTag< 1 >, Mask128< T, N > mask, Vec128< T, N > yes)
Definition: x86_128-inl.h:728
HWY_INLINE Mask128< T, N > And(hwy::SizeTag< 1 >, const Mask128< T, N > a, const Mask128< T, N > b)
Definition: x86_128-inl.h:825
HWY_INLINE Vec128< T, 1 > MaxOfLanes(hwy::SizeTag< sizeof(T)>, const Vec128< T, 1 > v)
Definition: arm_neon-inl.h:4314
HWY_INLINE HWY_MAYBE_UNUSED Vec128< T, N > MaxU(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: x86_128-inl.h:2967
HWY_INLINE HWY_MAYBE_UNUSED Vec128< T, N > MinU(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: x86_128-inl.h:2871
HWY_INLINE Vec256< T > GatherIndex(hwy::SizeTag< 4 >, Full256< T >, const T *HWY_RESTRICT base, const Vec256< int32_t > index)
Definition: x86_256-inl.h:2434
HWY_INLINE V Lt128Vec(const D d, const V a, const V b)
Definition: x86_128-inl.h:6205
constexpr uint64_t OnlyActive(uint64_t bits)
Definition: arm_neon-inl.h:4664
HWY_INLINE Vec128< uint8_t, N > BitCastFromByte(Simd< uint8_t, N, 0 >, Vec128< uint8_t, N > v)
Definition: arm_neon-inl.h:601
HWY_INLINE Vec128< uint8_t, N > BitCastToByte(Vec128< uint8_t, N > v)
Definition: arm_neon-inl.h:574
HWY_API Vec128< uint64_t > InterleaveUpper(const Vec128< uint64_t > a, const Vec128< uint64_t > b)
Definition: arm_neon-inl.h:3635
HWY_INLINE Mask128< T, N > LoadMaskBits(Simd< T, N, 0 > d, uint64_t mask_bits)
Definition: arm_neon-inl.h:4467
HWY_INLINE Vec128< T, N > IfThenZeroElse(hwy::SizeTag< 1 >, Mask128< T, N > mask, Vec128< T, N > no)
Definition: x86_128-inl.h:770
HWY_INLINE Vec256< T > GatherOffset(hwy::SizeTag< 4 >, Full256< T >, const T *HWY_RESTRICT base, const Vec256< int32_t > offset)
Definition: x86_256-inl.h:2426
HWY_INLINE Mask128< T, N > AndNot(hwy::SizeTag< 1 >, const Mask128< T, N > a, const Mask128< T, N > b)
Definition: x86_128-inl.h:862
HWY_INLINE Vec128< T, 1 > SumOfLanes(const Vec128< T, 1 > v)
Definition: arm_neon-inl.h:4305
HWY_INLINE Vec128< T, N > IfThenElse(hwy::SizeTag< 1 >, Mask128< T, N > mask, Vec128< T, N > yes, Vec128< T, N > no)
Definition: x86_128-inl.h:680
HWY_API Mask1< T > operator<=(const Vec1< T > a, const Vec1< T > b)
Definition: scalar-inl.h:842
d
Definition: rvv-inl.h:1656
HWY_API Vec128< T, N > CopySign(const Vec128< T, N > magn, const Vec128< T, N > sign)
Definition: arm_neon-inl.h:1648
HWY_API Vec128< T, N > OddEvenBlocks(Vec128< T, N >, Vec128< T, N > even)
Definition: arm_neon-inl.h:4038
decltype(MaskFromVec(Zero(D()))) MFromD
Definition: rvv-inl.h:1120
HWY_API Vec< D > SignBit(D d)
Definition: generic_ops-inl.h:61
HWY_API Vec128< uint8_t > Combine(Full128< uint8_t >, Vec64< uint8_t > hi, Vec64< uint8_t > lo)
Definition: arm_neon-inl.h:3709
HWY_API Mask128< TTo, N > RebindMask(Simd< TTo, N, 0 > dto, Mask128< TFrom, N > m)
Definition: arm_neon-inl.h:1688
HWY_API Vec128< T, N > DupOdd(Vec128< T, N > v)
Definition: arm_neon-inl.h:4003
HWY_API Vec128< float > ApproximateReciprocal(const Vec128< float > v)
Definition: arm_neon-inl.h:1225
HWY_API VFromD< DW > ZipLower(V a, V b)
Definition: arm_neon-inl.h:3672
HWY_API uint8_t GetLane(const Vec128< uint8_t, 16 > v)
Definition: arm_neon-inl.h:767
HWY_API Vec128< T, N > PopulationCount(Vec128< T, N > v)
Definition: arm_neon-inl.h:1595
HWY_API Vec128< T > Shuffle1032(const Vec128< T > v)
Definition: arm_neon-inl.h:3531
HWY_API auto Lt(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:5252
HWY_API Vec128< uint64_t > InterleaveLower(const Vec128< uint64_t > a, const Vec128< uint64_t > b)
Definition: arm_neon-inl.h:3581
HWY_API Vec128< T > Shuffle2103(const Vec128< T > v)
Definition: arm_neon-inl.h:3547
HWY_API Vec128< float, N > Round(const Vec128< float, N > v)
Definition: arm_neon-inl.h:2878
HWY_API Vec128< T, N > ZeroExtendVector(Simd< T, N, 0 > d, Vec128< T, N/2 > lo)
Definition: arm_neon-inl.h:3769
HWY_API auto Eq(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:5244
HWY_API Vec128< T, N > ShiftLeftSame(const Vec128< T, N > v, int bits)
Definition: arm_neon-inl.h:1122
HWY_API V128 CombineShiftRightBytes(Full128< T > d, V128 hi, V128 lo)
Definition: arm_neon-inl.h:2999
HWY_API auto Gt(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:5257
HWY_API Vec128< T, N > ShiftLeftLanes(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition: arm_neon-inl.h:3102
HWY_API Mask128< T, N > FirstN(const Simd< T, N, 0 > d, size_t num)
Definition: arm_neon-inl.h:1896
HWY_API size_t StoreMaskBits(Simd< T, N, 0 >, const Mask128< T, N > mask, uint8_t *bits)
Definition: arm_neon-inl.h:4761
HWY_API Vec128< float, N > MulAdd(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > add)
Definition: arm_neon-inl.h:1290
HWY_API void Stream(const Vec128< T, N > v, Simd< T, N, 0 > d, T *HWY_RESTRICT aligned)
Definition: arm_neon-inl.h:2416
HWY_API Vec128< int8_t > Abs(const Vec128< int8_t > v)
Definition: arm_neon-inl.h:1604
Repartition< MakeWide< TFromD< D > >, D > RepartitionToWide
Definition: ops/shared-inl.h:210
Simd< T, 4/sizeof(T), 0 > Full32
Definition: arm_neon-inl.h:40
HWY_API bool AllTrue(const Simd< T, N, 0 > d, const Mask128< T, N > m)
Definition: arm_neon-inl.h:4790
HWY_API Vec128< T, N > SumOfLanes(Simd< T, N, 0 >, const Vec128< T, N > v)
Definition: arm_neon-inl.h:4437
HWY_API Vec128< T, N > BroadcastSignBit(const Vec128< T, N > v)
Definition: arm_neon-inl.h:1665
V Shl(V a, V b)
Definition: arm_neon-inl.h:5235
HWY_API auto Ge(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:5261
HWY_API Vec128< float > AbsDiff(const Vec128< float > a, const Vec128< float > b)
Definition: arm_neon-inl.h:1264
StoreInterleaved3
Definition: rvv-inl.h:1405
HWY_API Vec128< uint32_t > ConcatOdd(Full128< uint32_t >, Vec128< uint32_t > hi, Vec128< uint32_t > lo)
Definition: arm_neon-inl.h:3903
HWY_API Vec128< uint64_t, N > Min(const Vec128< uint64_t, N > a, const Vec128< uint64_t, N > b)
Definition: arm_neon-inl.h:1957
HWY_API Vec256< uint64_t > CLMulLower(Vec256< uint64_t > a, Vec256< uint64_t > b)
Definition: x86_256-inl.h:3972
HWY_API Vec128< uint64_t, N > Max(const Vec128< uint64_t, N > a, const Vec128< uint64_t, N > b)
Definition: arm_neon-inl.h:1995
HWY_API Mask128< T, N > MaskFromVec(const Vec128< T, N > v)
Definition: arm_neon-inl.h:1675
HWY_API Vec128< T, N > ConcatUpperUpper(const Simd< T, N, 0 > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:3842
Rebind< MakeUnsigned< TFromD< D > >, D > RebindToUnsigned
Definition: ops/shared-inl.h:201
HWY_API Vec128< T, N > GatherIndex(const Simd< T, N, 0 > d, const T *HWY_RESTRICT base, const Vec128< Index, N > index)
Definition: arm_neon-inl.h:4284
HWY_INLINE constexpr HWY_MAYBE_UNUSED int Pow2(D)
Definition: ops/shared-inl.h:247
HWY_API Vec128< uint8_t > operator<<(const Vec128< uint8_t > v, const Vec128< uint8_t > bits)
Definition: arm_neon-inl.h:953
HWY_INLINE Vec128< uint64_t > MulOdd(Vec128< uint64_t > a, Vec128< uint64_t > b)
Definition: arm_neon-inl.h:4159
Simd< T, 8/sizeof(T), 0 > Full64
Definition: arm_neon-inl.h:37
HWY_API Mask128< T, N > operator>(Vec128< T, N > a, Vec128< T, N > b)
Definition: arm_neon-inl.h:1885
HWY_API Vec1< uint8_t > SaturatedAdd(const Vec1< uint8_t > a, const Vec1< uint8_t > b)
Definition: scalar-inl.h:457
sseg3 sseg3 StoreInterleaved4
Definition: rvv-inl.h:1428
HWY_API Vec128< T > Shuffle0321(const Vec128< T > v)
Definition: arm_neon-inl.h:3541
HWY_API Vec128< uint32_t > ConcatEven(Full128< uint32_t >, Vec128< uint32_t > hi, Vec128< uint32_t > lo)
Definition: arm_neon-inl.h:3945
HWY_API Vec128< T, N > Load(Simd< T, N, 0 > d, const T *HWY_RESTRICT p)
Definition: arm_neon-inl.h:2205
HWY_API Vec128< T, N > ConcatLowerUpper(const Simd< T, N, 0 > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:3869
HWY_API Vec128< int64_t > Neg(const Vec128< int64_t > v)
Definition: arm_neon-inl.h:904
HWY_API Vec128< T, N > Zero(Simd< T, N, 0 > d)
Definition: arm_neon-inl.h:733
HWY_API Vec256< uint8_t > AESRound(Vec256< uint8_t > state, Vec256< uint8_t > round_key)
Definition: x86_256-inl.h:3947
HWY_API Vec128< T, N > operator&(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1513
HWY_API Vec128< T, N > operator|(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1518
HWY_API Vec128< int64_t > MulEven(Vec128< int32_t > a, Vec128< int32_t > b)
Definition: arm_neon-inl.h:4119
HWY_API Vec128< bfloat16_t, 2 *N > ReorderDemote2To(Simd< bfloat16_t, 2 *N, 0 > dbf16, Vec128< float, N > a, Vec128< float, N > b)
Definition: arm_neon-inl.h:4060
Rebind< MakeFloat< TFromD< D > >, D > RebindToFloat
Definition: ops/shared-inl.h:203
HWY_API Vec128< T, N > MaskedLoad(Mask128< T, N > m, Simd< T, N, 0 > d, const T *HWY_RESTRICT aligned)
Definition: arm_neon-inl.h:2210
HWY_API Vec128< T, N/2 > LowerHalf(const Vec128< T, N > v)
Definition: arm_neon-inl.h:2952
Rebind< MakeSigned< TFromD< D > >, D > RebindToSigned
Definition: ops/shared-inl.h:199
HWY_API Mask1< T > operator==(const Vec1< T > a, const Vec1< T > b)
Definition: scalar-inl.h:817
HWY_API Vec32< uint8_t > U8FromU32(const Vec128< uint32_t > v)
Definition: arm_neon-inl.h:2748
HWY_API Vec128< float, N > ReorderWidenMulAccumulate(Simd< float, N, 0 > df32, Vec128< bfloat16_t, 2 *N > a, Vec128< bfloat16_t, 2 *N > b, const Vec128< float, N > sum0, Vec128< float, N > &sum1)
Definition: arm_neon-inl.h:3688
HWY_API Vec128< T, N > IfVecThenElse(Vec128< T, N > mask, Vec128< T, N > yes, Vec128< T, N > no)
Definition: arm_neon-inl.h:1505
HWY_API Vec128< T, N > operator^(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1523
HWY_API void BlendedStore(Vec128< T, N > v, Mask128< T, N > m, Simd< T, N, 0 > d, T *HWY_RESTRICT p)
Definition: arm_neon-inl.h:2402
HWY_API size_t CountTrue(Full128< T >, const Mask128< T > mask)
Definition: arm_neon-inl.h:4742
HWY_API Vec128< T, N > VecFromMask(Simd< T, N, 0 > d, const Mask128< T, N > v)
Definition: arm_neon-inl.h:1681
HWY_API intptr_t FindFirstTrue(const Simd< T, N, 0 >, const Mask128< T, N > mask)
Definition: arm_neon-inl.h:4753
HWY_API Vec128< float > ConvertTo(Full128< float >, const Vec128< int32_t > v)
Definition: arm_neon-inl.h:2788
HWY_API Vec1< T > operator+(Vec1< T > a, Vec1< T > b)
Definition: scalar-inl.h:421
HWY_API Vec128< T, N > DupEven(Vec128< T, N > v)
Definition: arm_neon-inl.h:3987
HWY_API Vec128< T, N > IfThenElseZero(const Mask128< T, N > mask, const Vec128< T, N > yes)
Definition: arm_neon-inl.h:1711
HWY_API V Add(V a, V b)
Definition: arm_neon-inl.h:5217
Simd< T, 32/sizeof(T), 0 > Full256
Definition: wasm_256-inl.h:32
HWY_API bool AllFalse(const Full128< T > d, const Mask128< T > m)
Definition: arm_neon-inl.h:4771
HWY_API Vec128< T, N > TableLookupLanes(Vec128< T, N > v, Indices128< T, N > idx)
Definition: arm_neon-inl.h:3419
HWY_API Vec128< T, N > Reverse4(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition: arm_neon-inl.h:3490
HWY_API Vec128< float, N > Floor(const Vec128< float, N > v)
Definition: arm_neon-inl.h:2909
HWY_API Vec128< float, N > MulSub(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > sub)
Definition: arm_neon-inl.h:1344
HWY_API Vec128< T, N > CopySignToAbs(const Vec128< T, N > abs, const Vec128< T, N > sign)
Definition: arm_neon-inl.h:1656
HWY_API Vec128< T, N > ZeroIfNegative(Vec128< T, N > v)
Definition: arm_neon-inl.h:1735
HWY_API void StoreU(const Vec128< uint8_t > v, Full128< uint8_t >, uint8_t *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:2224
HWY_API Mask128< T, N > operator!=(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1815
HWY_API Vec128< float, N > Ceil(const Vec128< float, N > v)
Definition: arm_neon-inl.h:2895
Repartition< MakeNarrow< TFromD< D > >, D > RepartitionToNarrow
Definition: ops/shared-inl.h:212
HWY_API Indices128< T, N > IndicesFromVec(Simd< T, N, 0 > d, Vec128< TI, N > vec)
Definition: arm_neon-inl.h:3373
HWY_API Vec128< T, N > SwapAdjacentBlocks(Vec128< T, N > v)
Definition: arm_neon-inl.h:4045
HWY_API Vec128< T, N > ShiftLeftBytes(Simd< T, N, 0 >, Vec128< T, N > v)
Definition: arm_neon-inl.h:3091
HWY_INLINE VFromD< D > Min128(D d, const VFromD< D > a, const VFromD< D > b)
Definition: arm_neon-inl.h:5203
HWY_API Vec128< T, N > Reverse2(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition: arm_neon-inl.h:3461
svuint16_t Set(Simd< bfloat16_t, N, kPow2 > d, bfloat16_t arg)
Definition: arm_sve-inl.h:282
HWY_API Vec128< T, N > Reverse8(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition: arm_neon-inl.h:3513
HWY_API Vec128< T, N > MaxOfLanes(Simd< T, N, 0 >, const Vec128< T, N > v)
Definition: arm_neon-inl.h:4445
HWY_API Mask1< T > operator<(const Vec1< T > a, const Vec1< T > b)
Definition: scalar-inl.h:833
Vec128< T, N > Iota(const Simd< T, N, 0 > d, const T2 first)
Definition: arm_neon-inl.h:757
HWY_API Vec1< uint8_t > AverageRound(const Vec1< uint8_t > a, const Vec1< uint8_t > b)
Definition: scalar-inl.h:510
HWY_API Vec1< T > ShiftRight(const Vec1< T > v)
Definition: scalar-inl.h:345
HWY_API Mask128< T, N > LoadMaskBits(Simd< T, N, 0 > d, const uint8_t *HWY_RESTRICT bits)
Definition: arm_neon-inl.h:4510
HWY_API Vec256< uint64_t > CLMulUpper(Vec256< uint64_t > a, Vec256< uint64_t > b)
Definition: x86_256-inl.h:3983
HWY_API Vec128< T > Shuffle01(const Vec128< T > v)
Definition: arm_neon-inl.h:3535
HWY_API Mask128< uint64_t, N > TestBit(Vec128< uint64_t, N > v, Vec128< uint64_t, N > bit)
Definition: arm_neon-inl.h:1917
HWY_API Vec256< uint8_t > AESLastRound(Vec256< uint8_t > state, Vec256< uint8_t > round_key)
Definition: x86_256-inl.h:3959
HWY_API Vec128< uint8_t > LoadU(Full128< uint8_t >, const uint8_t *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:2031
HWY_API Vec128< int16_t > MulHigh(const Vec128< int16_t > a, const Vec128< int16_t > b)
Definition: arm_neon-inl.h:1175
Simd< T, 16/sizeof(T), 0 > Full128
Definition: arm_neon-inl.h:34
HWY_API Vec1< uint8_t > SaturatedSub(const Vec1< uint8_t > a, const Vec1< uint8_t > b)
Definition: scalar-inl.h:484
HWY_INLINE Mask128< T, N > Lt128(Simd< T, N, 0 > d, Vec128< T, N > a, Vec128< T, N > b)
Definition: arm_neon-inl.h:5172
HWY_API Vec128< float, N > operator/(const Vec128< float, N > a, const Vec128< float, N > b)
Definition: arm_neon-inl.h:1252
HWY_API Vec128< T, N > OrAnd(Vec128< T, N > o, Vec128< T, N > a1, Vec128< T, N > a2)
Definition: arm_neon-inl.h:1498
HWY_API Vec128< T, N > IfNegativeThenElse(Vec128< T, N > v, Vec128< T, N > yes, Vec128< T, N > no)
Definition: arm_neon-inl.h:1724
HWY_API Vec128< T, N > ConcatUpperLower(Simd< T, N, 0 > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:3895
HWY_API Vec128< T, N > And(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1440
HWY_API Vec128< T, N > BitCast(Simd< T, N, 0 > d, Vec128< FromT, N *sizeof(T)/sizeof(FromT)> v)
Definition: arm_neon-inl.h:710
HWY_API Vec128< int16_t > MulFixedPoint15(Vec128< int16_t > a, Vec128< int16_t > b)
Definition: arm_neon-inl.h:1211
HWY_API void ScatterOffset(Vec128< T, N > v, Simd< T, N, 0 > d, T *HWY_RESTRICT base, const Vec128< Offset, N > offset)
Definition: arm_neon-inl.h:4231
HWY_API VFromD< DW > ZipUpper(DW dw, V a, V b)
Definition: arm_neon-inl.h:3681
HWY_API Vec128< T, N > ConcatLowerLower(const Simd< T, N, 0 > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:3777
HWY_API V Sub(V a, V b)
Definition: arm_neon-inl.h:5221
typename D::template Rebind< T > Rebind
Definition: ops/shared-inl.h:196
HWY_API Vec128< T, N > Undefined(Simd< T, N, 0 >)
Definition: arm_neon-inl.h:747
HWY_API Mask128< T, N > operator>=(Vec128< T, N > a, Vec128< T, N > b)
Definition: arm_neon-inl.h:1889
HWY_API size_t CompressBitsStore(Vec128< T, N > v, const uint8_t *HWY_RESTRICT bits, Simd< T, N, 0 > d, T *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:5077
HWY_API Vec128< uint16_t > operator*(const Vec128< uint16_t > a, const Vec128< uint16_t > b)
Definition: arm_neon-inl.h:1133
HWY_API V InterleaveUpper(Simd< T, N, 0 >, V a, V b)
Definition: arm_neon-inl.h:3656
HWY_API Vec128< T, N > GatherOffset(const Simd< T, N, 0 > d, const T *HWY_RESTRICT base, const Vec128< Offset, N > offset)
Definition: arm_neon-inl.h:4267
HWY_API size_t CompressBlendedStore(Vec128< T, N > v, Mask128< T, N > m, Simd< T, N, 0 > d, T *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:5061
HWY_API Vec128< T, N > IfThenZeroElse(const Mask128< T, N > mask, const Vec128< T, N > no)
Definition: arm_neon-inl.h:1718
HWY_API Vec128< T, N > Xor(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1489
HWY_INLINE VFromD< D > Max128(D d, const VFromD< D > a, const VFromD< D > b)
Definition: arm_neon-inl.h:5208
HWY_API Vec1< T > ShiftLeft(const Vec1< T > v)
Definition: scalar-inl.h:339
HWY_API auto Le(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:5266
HWY_API Vec128< T, N > ShiftRightSame(const Vec128< T, N > v, int bits)
Definition: arm_neon-inl.h:1126
decltype(detail::DeduceD()(V())) DFromV
Definition: arm_neon-inl.h:555
HWY_API Vec128< int32_t, N > NearestInt(const Vec128< float, N > v)
Definition: arm_neon-inl.h:2939
HWY_API Indices128< T, N > SetTableIndices(Simd< T, N, 0 > d, const TI *idx)
Definition: arm_neon-inl.h:3413
HWY_API void ScatterIndex(Vec128< T, N > v, Simd< T, N, 0 > d, T *HWY_RESTRICT base, const Vec128< Index, N > index)
Definition: arm_neon-inl.h:4249
HWY_API Vec128< T > Not(const Vec128< T > v)
Definition: arm_neon-inl.h:1422
HWY_API Vec128< float, N > NegMulAdd(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > add)
Definition: arm_neon-inl.h:1323
HWY_API Vec64< uint32_t > Shuffle2301(const Vec64< uint32_t > v)
Definition: arm_neon-inl.h:1778
V Shr(V a, V b)
Definition: arm_neon-inl.h:5239
decltype(Zero(D())) VFromD
Definition: arm_neon-inl.h:743
HWY_API Vec128< T, N > LoadDup128(Simd< T, N, 0 > d, const T *const HWY_RESTRICT p)
Definition: arm_neon-inl.h:2217
HWY_API Vec128< T, N > OddEven(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:4019
HWY_API Vec128< uint16_t > Broadcast(const Vec128< uint16_t > v)
Definition: arm_neon-inl.h:3285
HWY_INLINE Vec128< T, N > CompressBits(Vec128< T, N > v, const uint8_t *HWY_RESTRICT bits)
Definition: arm_neon-inl.h:5038
HWY_API Vec128< T > Shuffle0123(const Vec128< T > v)
Definition: arm_neon-inl.h:3553
HWY_API Vec128< float, N > Trunc(const Vec128< float, N > v)
Definition: arm_neon-inl.h:2867
typename D::Half Half
Definition: ops/shared-inl.h:216
HWY_API Vec128< T, N > MinOfLanes(Simd< T, N, 0 >, const Vec128< T, N > v)
Definition: arm_neon-inl.h:4441
HWY_API Vec128< T, N > ShiftRightBytes(Simd< T, N, 0 >, Vec128< T, N > v)
Definition: arm_neon-inl.h:3114
typename D::template Repartition< T > Repartition
Definition: ops/shared-inl.h:207
HWY_API auto Ne(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:5248
N
Definition: rvv-inl.h:1656
HWY_API Vec128< float, N > Sqrt(const Vec128< float, N > v)
Definition: arm_neon-inl.h:1404
HWY_API Vec64< uint16_t > DemoteTo(Full64< uint16_t >, const Vec128< int32_t > v)
Definition: arm_neon-inl.h:2606
HWY_API Vec128< TI > TableLookupBytes(const Vec128< T > bytes, const Vec128< TI > from)
Definition: arm_neon-inl.h:4169
HWY_API size_t CompressStore(Vec128< T, N > v, const Mask128< T, N > mask, Simd< T, N, 0 > d, T *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:5052
HWY_API Vec128< uint32_t, N > RotateRight(const Vec128< uint32_t, N > v)
Definition: arm_neon-inl.h:935
HWY_API Vec128< T, N > AndNot(const Vec128< T, N > not_mask, const Vec128< T, N > mask)
Definition: arm_neon-inl.h:1455
HWY_API Vec128< T > ReverseBlocks(Full128< T >, const Vec128< T > v)
Definition: arm_neon-inl.h:4053
HWY_API V Div(V a, V b)
Definition: arm_neon-inl.h:5230
HWY_API Vec128< uint8_t > operator>>(const Vec128< uint8_t > v, const Vec128< uint8_t > bits)
Definition: arm_neon-inl.h:1033
HWY_API Vec128< uint64_t > SumsOf8(const Vec128< uint8_t > v)
Definition: arm_neon-inl.h:852
HWY_API V Mul(V a, V b)
Definition: arm_neon-inl.h:5226
HWY_API Vec128< T, 1 > Reverse(Simd< T, 1, 0 >, const Vec128< T, 1 > v)
Definition: arm_neon-inl.h:3430
HWY_API void Store(Vec128< T, N > v, Simd< T, N, 0 > d, T *HWY_RESTRICT aligned)
Definition: arm_neon-inl.h:2397
HWY_API Vec128< uint16_t > PromoteTo(Full128< uint16_t >, const Vec64< uint8_t > v)
Definition: arm_neon-inl.h:2426
TFromD< DFromV< V > > TFromV
Definition: arm_neon-inl.h:558
Vec128< T, 8/sizeof(T)> Vec64
Definition: arm_neon-inl.h:522
HWY_API Vec64< uint8_t > UpperHalf(Full64< uint8_t >, const Vec128< uint8_t > v)
Definition: arm_neon-inl.h:3146
HWY_API Vec128< float > ApproximateReciprocalSqrt(const Vec128< float > v)
Definition: arm_neon-inl.h:1376
long long int GatherIndex64
Definition: x86_128-inl.h:3088
const vfloat64m1_t v
Definition: rvv-inl.h:1656
HWY_API Vec128< T, N > Compress(Vec128< T, N > v, const Mask128< T, N > mask)
Definition: arm_neon-inl.h:5031
HWY_API Vec1< T > operator-(Vec1< T > a, Vec1< T > b)
Definition: scalar-inl.h:434
HWY_API Vec128< T, N > ShiftRightLanes(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition: arm_neon-inl.h:3120
typename D::T TFromD
Definition: ops/shared-inl.h:192
HWY_API VI TableLookupBytesOr0(const V bytes, const VI from)
Definition: arm_neon-inl.h:4224
HWY_API Vec128< T, N > Or(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1477
HWY_API Vec128< float, N > NegMulSub(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > sub)
Definition: arm_neon-inl.h:1352
HWY_API Vec1< T > IfThenElse(const Mask1< T > mask, const Vec1< T > yes, const Vec1< T > no)
Definition: scalar-inl.h:278
Definition: aligned_allocator.h:27
HWY_API size_t Num0BitsBelowLS1Bit_Nonzero64(const uint64_t x)
Definition: base.h:608
HWY_API uint64_t Mul128(uint64_t a, uint64_t b, uint64_t *HWY_RESTRICT upper)
Definition: base.h:711
typename detail::TypeFromSize< N >::Unsigned UnsignedFromSize
Definition: base.h:466
HWY_API size_t PopCount(uint64_t x)
Definition: base.h:667
HWY_API size_t Num0BitsBelowLS1Bit_Nonzero32(const uint32_t x)
Definition: base.h:598
constexpr HWY_API bool IsSigned()
Definition: base.h:483
typename detail::Relations< T >::Signed MakeSigned
Definition: base.h:454
#define HWY_ALIGN
Definition: set_macros-inl.h:81
#define HWY_NAMESPACE
Definition: set_macros-inl.h:80
@ value
Definition: arm_neon-inl.h:4798
Definition: arm_neon-inl.h:3368
detail::Raw128< T, N >::type raw
Definition: arm_neon-inl.h:3369
__m128i raw
Definition: x86_128-inl.h:3546
Definition: ops/shared-inl.h:40
HWY_INLINE __m128d operator()(__m128i v)
Definition: x86_128-inl.h:237
HWY_INLINE __m128 operator()(__m128i v)
Definition: x86_128-inl.h:233
HWY_INLINE __m128i operator()(__m128i v)
Definition: x86_128-inl.h:229
Definition: arm_neon-inl.h:545
Full512< T > operator()(const hwy::HWY_NAMESPACE::Vec512< T > *) const
Definition: x86_128-inl.h:193
Simd< T, N, 0 > operator()(const Vec128< T, N > *) const
Definition: x86_128-inl.h:182
Full256< T > operator()(const hwy::HWY_NAMESPACE::Vec256< T > *) const
Definition: x86_128-inl.h:187
Definition: x86_128-inl.h:201
decltype(DeduceD()(static_cast< V * >(nullptr))) type
Definition: x86_128-inl.h:202
__m128d type
Definition: x86_128-inl.h:78
__f32x4 type
Definition: wasm_128-inl.h:66
Definition: x86_128-inl.h:69
__v128_u type
Definition: wasm_128-inl.h:62
Definition: x86_128-inl.h:130
__mmask16 type
Definition: x86_128-inl.h:131
Definition: x86_128-inl.h:134
__mmask8 type
Definition: x86_128-inl.h:135
Definition: x86_128-inl.h:138
__mmask8 type
Definition: x86_128-inl.h:139
Definition: x86_128-inl.h:142
__mmask8 type
Definition: x86_128-inl.h:143
Definition: x86_128-inl.h:128
Definition: base.h:317
HWY_AFTER_NAMESPACE()
#define HWY_INLINE_F16
Definition: x86_128-inl.h:4605
HWY_BEFORE_NAMESPACE()