#include <emmintrin.h>

#if HWY_TARGET == HWY_SSSE3
#include <tmmintrin.h>
#else
#include <smmintrin.h>
#include <wmmintrin.h>
#endif

#ifndef HWY_LOADDUP_ASM
#define HWY_LOADDUP_ASM 0
#endif

template <typename T>
using Full32 = Simd<T, 4 / sizeof(T), 0>;

template <typename T>
using Full64 = Simd<T, 8 / sizeof(T), 0>;

template <typename T>
using Full128 = Simd<T, 16 / sizeof(T), 0>;

#if HWY_TARGET <= HWY_AVX2
template <typename T>
using Full256 = Simd<T, 32 / sizeof(T), 0>;
#endif

#if HWY_TARGET <= HWY_AVX3
template <typename T, size_t N = 16 / sizeof(T)>
class Vec128 {
 public:
  // Compound assignment; only usable if the corresponding non-member binary
  // operator is defined for this lane type.
  HWY_INLINE Vec128& operator*=(const Vec128 other) {
    return *this = (*this * other);
  }
  HWY_INLINE Vec128& operator/=(const Vec128 other) {
    return *this = (*this / other);
  }
  HWY_INLINE Vec128& operator+=(const Vec128 other) {
    return *this = (*this + other);
  }
  HWY_INLINE Vec128& operator-=(const Vec128 other) {
    return *this = (*this - other);
  }
  HWY_INLINE Vec128& operator&=(const Vec128 other) {
    return *this = (*this & other);
  }
  HWY_INLINE Vec128& operator|=(const Vec128 other) {
    return *this = (*this | other);
  }
  HWY_INLINE Vec128& operator^=(const Vec128 other) {
    return *this = (*this ^ other);
  }

  typename detail::Raw128<T>::type raw;
};

template <typename T>
using Vec64 = Vec128<T, 8 / sizeof(T)>;
#if HWY_TARGET <= HWY_AVX3

template <typename T>

template <size_t size>

template <typename T, size_t N>

template <typename T, size_t N = 16 / sizeof(T)>
#if HWY_TARGET <= HWY_AVX2
template <typename T>

template <typename T, size_t N>

#if HWY_TARGET <= HWY_AVX2
template <typename T>

#if HWY_TARGET <= HWY_AVX3
template <typename T>

template <typename T, size_t N>

template <typename T>
struct BitCastFromInteger128 {

template <>
struct BitCastFromInteger128<float> {

template <typename T, size_t N>

template <typename T, size_t N, typename FromT>
HWY_API Vec128<T, N> BitCast(Simd<T, N, 0> d,
                             Vec128<FromT, N * sizeof(T) / sizeof(FromT)> v) {
template <typename T, size_t N, HWY_IF_LE128(T, N)>
HWY_API Vec128<T, N> Zero(Simd<T, N, 0> /* tag */) {
  return Vec128<T, N>{_mm_setzero_si128()};
}
template <size_t N, HWY_IF_LE128(float, N)>
HWY_API Vec128<float, N> Zero(Simd<float, N, 0> /* tag */) {
  return Vec128<float, N>{_mm_setzero_ps()};
}
template <size_t N, HWY_IF_LE128(double, N)>
template <size_t N, HWY_IF_LE128(uint8_t, N)>
HWY_API Vec128<uint8_t, N> Set(Simd<uint8_t, N, 0> /* tag */,
                               const uint8_t t) {
  return Vec128<uint8_t, N>{_mm_set1_epi8(static_cast<char>(t))};
}
template <size_t N, HWY_IF_LE128(uint16_t, N)>
HWY_API Vec128<uint16_t, N> Set(Simd<uint16_t, N, 0> /* tag */,
                                const uint16_t t) {
  return Vec128<uint16_t, N>{_mm_set1_epi16(static_cast<short>(t))};
}
template <size_t N, HWY_IF_LE128(uint32_t, N)>
HWY_API Vec128<uint32_t, N> Set(Simd<uint32_t, N, 0> /* tag */,
                                const uint32_t t) {
  return Vec128<uint32_t, N>{_mm_set1_epi32(static_cast<int>(t))};
}
template <size_t N, HWY_IF_LE128(uint64_t, N)>
HWY_API Vec128<uint64_t, N> Set(Simd<uint64_t, N, 0> /* tag */,
                                const uint64_t t) {
  return Vec128<uint64_t, N>{
      _mm_set1_epi64x(static_cast<long long>(t))};
}
template <size_t N, HWY_IF_LE128(int8_t, N)>
HWY_API Vec128<int8_t, N> Set(Simd<int8_t, N, 0> /* tag */, const int8_t t) {
  return Vec128<int8_t, N>{_mm_set1_epi8(static_cast<char>(t))};
}
template <size_t N, HWY_IF_LE128(int16_t, N)>
HWY_API Vec128<int16_t, N> Set(Simd<int16_t, N, 0> /* tag */,
                               const int16_t t) {
  return Vec128<int16_t, N>{_mm_set1_epi16(static_cast<short>(t))};
}
template <size_t N, HWY_IF_LE128(int32_t, N)>
HWY_API Vec128<int32_t, N> Set(Simd<int32_t, N, 0> /* tag */,
                               const int32_t t) {
  return Vec128<int32_t, N>{_mm_set1_epi32(t)};
}
template <size_t N, HWY_IF_LE128(int64_t, N)>
HWY_API Vec128<int64_t, N> Set(Simd<int64_t, N, 0> /* tag */,
                               const int64_t t) {
  return Vec128<int64_t, N>{
      _mm_set1_epi64x(static_cast<long long>(t))};
}
template <size_t N, HWY_IF_LE128(float, N)>
HWY_API Vec128<float, N> Set(Simd<float, N, 0> /* tag */, const float t) {
  return Vec128<float, N>{_mm_set1_ps(t)};
}
template <size_t N, HWY_IF_LE128(double, N)>
template <typename T, size_t N, HWY_IF_LE128(T, N)>
HWY_API Vec128<T, N> Undefined(Simd<T, N, 0> /* tag */) {
  return Vec128<T, N>{_mm_undefined_si128()};
}
template <size_t N, HWY_IF_LE128(float, N)>

template <size_t N, HWY_IF_LE128(double, N)>
// Returns the lowest lane.
template <size_t N>
HWY_API uint8_t GetLane(const Vec128<uint8_t, N> v) {
  return static_cast<uint8_t>(_mm_cvtsi128_si32(v.raw) & 0xFF);
}
template <size_t N>
HWY_API int8_t GetLane(const Vec128<int8_t, N> v) {
  return static_cast<int8_t>(_mm_cvtsi128_si32(v.raw) & 0xFF);
}
template <size_t N>
HWY_API uint16_t GetLane(const Vec128<uint16_t, N> v) {
  return static_cast<uint16_t>(_mm_cvtsi128_si32(v.raw) & 0xFFFF);
}
template <size_t N>
HWY_API int16_t GetLane(const Vec128<int16_t, N> v) {
  return static_cast<int16_t>(_mm_cvtsi128_si32(v.raw) & 0xFFFF);
}
template <size_t N>
HWY_API uint32_t GetLane(const Vec128<uint32_t, N> v) {
  return static_cast<uint32_t>(_mm_cvtsi128_si32(v.raw));
}
template <size_t N>
HWY_API int32_t GetLane(const Vec128<int32_t, N> v) {
  return _mm_cvtsi128_si32(v.raw);
}
template <size_t N>
HWY_API float GetLane(const Vec128<float, N> v) {
  return _mm_cvtss_f32(v.raw);
}
template <size_t N>
HWY_API uint64_t GetLane(const Vec128<uint64_t, N> v) {
#if HWY_ARCH_X86_32
  alignas(16) uint64_t lanes[2];
  Store(v, Simd<uint64_t, N, 0>(), lanes);
  return lanes[0];
#else
  return static_cast<uint64_t>(_mm_cvtsi128_si64(v.raw));
#endif
}
template <size_t N>
HWY_API int64_t GetLane(const Vec128<int64_t, N> v) {
#if HWY_ARCH_X86_32
  alignas(16) int64_t lanes[2];
  Store(v, Simd<int64_t, N, 0>(), lanes);
  return lanes[0];
#else
  return _mm_cvtsi128_si64(v.raw);
#endif
}
template <size_t N>
HWY_API double GetLane(const Vec128<double, N> v) {
  return _mm_cvtsd_f64(v.raw);
}
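// Usage sketch (illustrative only, not part of the original header): build a
// vector from a scalar and read back the first lane.
//   const Simd<int32_t, 4, 0> d;
//   const Vec128<int32_t, 4> v = Set(d, 5);  // {5, 5, 5, 5}
//   const int32_t x = GetLane(v);            // 5
//   const Vec128<int32_t, 4> z = Zero(d);    // {0, 0, 0, 0}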
template <typename T, size_t N>
HWY_API Vec128<T, N> And(Vec128<T, N> a, Vec128<T, N> b) {
  return Vec128<T, N>{_mm_and_si128(a.raw, b.raw)};
}

template <typename T, size_t N>
HWY_API Vec128<T, N> AndNot(Vec128<T, N> not_mask, Vec128<T, N> mask) {
  return Vec128<T, N>{_mm_andnot_si128(not_mask.raw, mask.raw)};
}

template <typename T, size_t N>
HWY_API Vec128<T, N> Or(Vec128<T, N> a, Vec128<T, N> b) {
  return Vec128<T, N>{_mm_or_si128(a.raw, b.raw)};
}

template <typename T, size_t N>
HWY_API Vec128<T, N> Xor(Vec128<T, N> a, Vec128<T, N> b) {
  return Vec128<T, N>{_mm_xor_si128(a.raw, b.raw)};
}
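// Note the argument order: AndNot(not_mask, mask) computes (~not_mask) & mask,
// matching the underlying _mm_andnot_si128. Illustrative sketch (d and v are
// hypothetical: a uint8_t descriptor and a vector of that type):
//   const auto low_nibbles = AndNot(Set(d, 0xF0), v);  // == v & 0x0F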
template <typename T, size_t N>
HWY_API Vec128<T, N> Not(const Vec128<T, N> v) {
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;
  using VU = VFromD<decltype(du)>;
#if HWY_TARGET <= HWY_AVX3
  const __m128i vu = BitCast(du, v).raw;
  return BitCast(d, VU{_mm_ternarylogic_epi32(vu, vu, vu, 0x55)});
template <typename T, size_t N>
HWY_API Vec128<T, N> OrAnd(Vec128<T, N> o, Vec128<T, N> a1, Vec128<T, N> a2) {
#if HWY_TARGET <= HWY_AVX3
  const DFromV<decltype(o)> d;
  const RebindToUnsigned<decltype(d)> du;
  using VU = VFromD<decltype(du)>;
  const __m128i ret = _mm_ternarylogic_epi64(
      BitCast(du, o).raw, BitCast(du, a1).raw, BitCast(du, a2).raw, 0xF8);
  return BitCast(d, VU{ret});
#else
  return Or(o, And(a1, a2));
#endif
}
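// The last argument of _mm_ternarylogic_epi64 is an 8-bit truth table indexed
// by the three input bits (a, b, c): bit (a*4 + b*2 + c) of the immediate is
// the output. For OrAnd, o | (a1 & a2) is true for inputs 111, 110, 101, 100
// and 011, i.e. bits 7..3, giving 0b11111000 = 0xF8. Likewise 0x55 (above) is
// ~a and 0xCA (below) is the ternary select a ? b : c.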
template <typename T, size_t N>
HWY_API Vec128<T, N> IfVecThenElse(Vec128<T, N> mask, Vec128<T, N> yes,
                                   Vec128<T, N> no) {
#if HWY_TARGET <= HWY_AVX3
  const Simd<T, N, 0> d;
  const RebindToUnsigned<decltype(d)> du;
  using VU = VFromD<decltype(du)>;
  return BitCast(
      d, VU{_mm_ternarylogic_epi64(BitCast(du, mask).raw, BitCast(du, yes).raw,
                                   BitCast(du, no).raw, 0xCA)});
template <typename T, size_t N>
HWY_API Vec128<T, N> operator&(const Vec128<T, N> a, const Vec128<T, N> b) {
  return And(a, b);
}

template <typename T, size_t N>
HWY_API Vec128<T, N> operator|(const Vec128<T, N> a, const Vec128<T, N> b) {
  return Or(a, b);
}

template <typename T, size_t N>
HWY_API Vec128<T, N> operator^(const Vec128<T, N> a, const Vec128<T, N> b) {
  return Xor(a, b);
}
#if HWY_TARGET == HWY_AVX3_DL

#ifdef HWY_NATIVE_POPCNT
#undef HWY_NATIVE_POPCNT
#else
#define HWY_NATIVE_POPCNT
#endif

template <typename T, size_t N>

template <typename T, size_t N>

template <typename T, size_t N>

template <typename T, size_t N>

template <typename T, size_t N>

template <typename T, size_t N, HWY_IF_FLOAT(T)>

template <typename T, size_t N, HWY_IF_NOT_FLOAT(T)>
template <size_t N>
HWY_API Vec128<int8_t, N> Abs(const Vec128<int8_t, N> v) {
#if HWY_COMPILER_MSVC
  // Workaround for incorrect codegen: compute max(v, 0 - v) instead.
  const auto zero = Zero(Simd<int8_t, N, 0>());
  return Vec128<int8_t, N>{_mm_max_epi8(v.raw, (zero - v).raw)};
#else
  return Vec128<int8_t, N>{_mm_abs_epi8(v.raw)};
#endif
}
template <size_t N>
HWY_API Vec128<int16_t, N> Abs(const Vec128<int16_t, N> v) {
  return Vec128<int16_t, N>{_mm_abs_epi16(v.raw)};
}
template <size_t N>
HWY_API Vec128<int32_t, N> Abs(const Vec128<int32_t, N> v) {
  return Vec128<int32_t, N>{_mm_abs_epi32(v.raw)};
}
template <size_t N>
HWY_API Vec128<float, N> Abs(const Vec128<float, N> v) {
  const Vec128<int32_t, N> mask{_mm_set1_epi32(0x7FFFFFFF)};
  return v & BitCast(DFromV<decltype(v)>(), mask);
}
template <typename T, size_t N>
HWY_API Vec128<T, N> CopySign(const Vec128<T, N> magn,
                              const Vec128<T, N> sign) {
  static_assert(IsFloat<T>(), "Only makes sense for floating-point");

  const DFromV<decltype(magn)> d;
#if HWY_TARGET <= HWY_AVX3
  const __m128i out = _mm_ternarylogic_epi32(

template <typename T, size_t N>
HWY_API Vec128<T, N> CopySignToAbs(const Vec128<T, N> abs,
                                   const Vec128<T, N> sign) {
#if HWY_TARGET <= HWY_AVX3
#if HWY_TARGET <= HWY_AVX3

template <typename T, size_t N>

template <typename T, size_t N>

template <typename T, size_t N>

template <typename T, size_t N>

template <typename T, size_t N>

template <typename T, size_t N>

template <typename T, size_t N>

template <typename T, size_t N>

template <typename T, size_t N>

template <typename T, size_t N>

template <typename T, size_t N>

template <typename T, size_t N>

template <typename T, size_t N>

template <typename T, size_t N>

template <typename T, size_t N>

#if !defined(HWY_COMPILER_HAS_MASK_INTRINSICS)
#if HWY_COMPILER_MSVC != 0 || HWY_COMPILER_GCC >= 700 || \
    HWY_COMPILER_CLANG >= 800
#define HWY_COMPILER_HAS_MASK_INTRINSICS 1
#else
#define HWY_COMPILER_HAS_MASK_INTRINSICS 0
#endif
#endif
824 template <
typename T,
size_t N>
827 #if HWY_COMPILER_HAS_MASK_INTRINSICS
833 template <
typename T,
size_t N>
836 #if HWY_COMPILER_HAS_MASK_INTRINSICS
842 template <
typename T,
size_t N>
845 #if HWY_COMPILER_HAS_MASK_INTRINSICS
851 template <
typename T,
size_t N>
854 #if HWY_COMPILER_HAS_MASK_INTRINSICS
861 template <
typename T,
size_t N>
864 #if HWY_COMPILER_HAS_MASK_INTRINSICS
870 template <
typename T,
size_t N>
873 #if HWY_COMPILER_HAS_MASK_INTRINSICS
879 template <
typename T,
size_t N>
882 #if HWY_COMPILER_HAS_MASK_INTRINSICS
888 template <
typename T,
size_t N>
891 #if HWY_COMPILER_HAS_MASK_INTRINSICS
898 template <
typename T,
size_t N>
901 #if HWY_COMPILER_HAS_MASK_INTRINSICS
907 template <
typename T,
size_t N>
910 #if HWY_COMPILER_HAS_MASK_INTRINSICS
916 template <
typename T,
size_t N>
919 #if HWY_COMPILER_HAS_MASK_INTRINSICS
925 template <
typename T,
size_t N>
928 #if HWY_COMPILER_HAS_MASK_INTRINSICS
935 template <
typename T,
size_t N>
938 #if HWY_COMPILER_HAS_MASK_INTRINSICS
944 template <
typename T,
size_t N>
947 #if HWY_COMPILER_HAS_MASK_INTRINSICS
953 template <
typename T,
size_t N>
956 #if HWY_COMPILER_HAS_MASK_INTRINSICS
962 template <
typename T,
size_t N>
965 #if HWY_COMPILER_HAS_MASK_INTRINSICS
template <typename T, size_t N>
HWY_API Mask128<T, N> And(const Mask128<T, N> a, Mask128<T, N> b) {

template <typename T, size_t N>
HWY_API Mask128<T, N> AndNot(const Mask128<T, N> a, Mask128<T, N> b) {

template <typename T, size_t N>
HWY_API Mask128<T, N> Or(const Mask128<T, N> a, Mask128<T, N> b) {

template <typename T, size_t N>
HWY_API Mask128<T, N> Xor(const Mask128<T, N> a, Mask128<T, N> b) {

template <typename T, size_t N>
HWY_API Mask128<T, N> Not(const Mask128<T, N> m) {

template <typename T, size_t N>
HWY_API Mask128<T, N> MaskFromVec(const Vec128<T, N> v) {
  return Mask128<T, N>{v.raw};
}

template <typename T, size_t N>
HWY_API Vec128<T, N> VecFromMask(const Mask128<T, N> v) {
  return Vec128<T, N>{v.raw};
}

template <typename T, size_t N>
HWY_API Vec128<T, N> VecFromMask(Simd<T, N, 0> /* tag */,
                                 const Mask128<T, N> v) {
  return Vec128<T, N>{v.raw};
}
#if HWY_TARGET == HWY_SSSE3

template <typename T, size_t N>

#else

template <typename T, size_t N>
HWY_API Vec128<T, N> IfThenElse(Mask128<T, N> mask, Vec128<T, N> yes,
                                Vec128<T, N> no) {
  return Vec128<T, N>{_mm_blendv_epi8(no.raw, yes.raw, mask.raw)};
}
template <size_t N>
HWY_API Vec128<float, N> IfThenElse(const Mask128<float, N> mask,
                                    const Vec128<float, N> yes,
                                    const Vec128<float, N> no) {
  return Vec128<float, N>{_mm_blendv_ps(no.raw, yes.raw, mask.raw)};
}
template <size_t N>
HWY_API Vec128<double, N> IfThenElse(const Mask128<double, N> mask,
                                     const Vec128<double, N> yes,
                                     const Vec128<double, N> no) {
  return Vec128<double, N>{_mm_blendv_pd(no.raw, yes.raw, mask.raw)};
}

#endif
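// Usage sketch (illustrative only; v and d are hypothetical, e.g. a signed or
// float vector and its descriptor): select per lane between two vectors.
//   const auto m = v > Zero(d);                       // Mask128
//   const auto clamped = IfThenElse(m, v, Zero(d));   // v where v > 0, else 0
// Note that _mm_blendv_* take (no, yes, mask), hence the swapped arguments.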
template <typename T, size_t N>

template <typename T, size_t N>

template <typename T, size_t N>
HWY_API Mask128<T, N> Not(const Mask128<T, N> m) {

template <typename T, size_t N>
HWY_API Mask128<T, N> And(const Mask128<T, N> a, Mask128<T, N> b) {
  const Simd<T, N, 0> d;

template <typename T, size_t N>
HWY_API Mask128<T, N> AndNot(const Mask128<T, N> a, Mask128<T, N> b) {
  const Simd<T, N, 0> d;

template <typename T, size_t N>
HWY_API Mask128<T, N> Or(const Mask128<T, N> a, Mask128<T, N> b) {
  const Simd<T, N, 0> d;

template <typename T, size_t N>
HWY_API Mask128<T, N> Xor(const Mask128<T, N> a, Mask128<T, N> b) {
  const Simd<T, N, 0> d;
  static_assert(N == 2 || N == 4, "Does not make sense for N=1");
  static_assert(N == 2 || N == 4, "Does not make sense for N=1");
  static_assert(N == 2 || N == 4, "Does not make sense for N=1");

// Swap 64-bit halves
HWY_API Vec128<uint32_t> Shuffle1032(const Vec128<uint32_t> v) {
  return Vec128<uint32_t>{_mm_shuffle_epi32(v.raw, 0x4E)};
}
HWY_API Vec128<int32_t> Shuffle1032(const Vec128<int32_t> v) {
  return Vec128<int32_t>{_mm_shuffle_epi32(v.raw, 0x4E)};
}
HWY_API Vec128<float> Shuffle1032(const Vec128<float> v) {
  return Vec128<float>{_mm_shuffle_ps(v.raw, v.raw, 0x4E)};
}

// Rotate right 32 bits
HWY_API Vec128<uint32_t> Shuffle0321(const Vec128<uint32_t> v) {
  return Vec128<uint32_t>{_mm_shuffle_epi32(v.raw, 0x39)};
}
HWY_API Vec128<int32_t> Shuffle0321(const Vec128<int32_t> v) {
  return Vec128<int32_t>{_mm_shuffle_epi32(v.raw, 0x39)};
}
HWY_API Vec128<float> Shuffle0321(const Vec128<float> v) {
  return Vec128<float>{_mm_shuffle_ps(v.raw, v.raw, 0x39)};
}

// Rotate left 32 bits
HWY_API Vec128<uint32_t> Shuffle2103(const Vec128<uint32_t> v) {
  return Vec128<uint32_t>{_mm_shuffle_epi32(v.raw, 0x93)};
}
HWY_API Vec128<int32_t> Shuffle2103(const Vec128<int32_t> v) {
  return Vec128<int32_t>{_mm_shuffle_epi32(v.raw, 0x93)};
}
HWY_API Vec128<float> Shuffle2103(const Vec128<float> v) {
  return Vec128<float>{_mm_shuffle_ps(v.raw, v.raw, 0x93)};
}

// Reverse the order of the 32-bit lanes
HWY_API Vec128<uint32_t> Shuffle0123(const Vec128<uint32_t> v) {
  return Vec128<uint32_t>{_mm_shuffle_epi32(v.raw, 0x1B)};
}
HWY_API Vec128<int32_t> Shuffle0123(const Vec128<int32_t> v) {
  return Vec128<int32_t>{_mm_shuffle_epi32(v.raw, 0x1B)};
}
HWY_API Vec128<float> Shuffle0123(const Vec128<float> v) {
  return Vec128<float>{_mm_shuffle_ps(v.raw, v.raw, 0x1B)};
}
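// The shuffle immediates above are _MM_SHUFFLE(z, y, x, w) constants written
// in hex: 0x4E == _MM_SHUFFLE(1, 0, 3, 2) swaps the 64-bit halves, 0x39 ==
// _MM_SHUFFLE(0, 3, 2, 1) rotates right by one 32-bit lane, 0x93 ==
// _MM_SHUFFLE(2, 1, 0, 3) rotates left, and 0x1B == _MM_SHUFFLE(0, 1, 2, 3)
// reverses all four lanes.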
1179 #if HWY_TARGET <= HWY_AVX3
1183 template <
typename TFrom,
size_t NFrom,
typename TTo,
size_t NTo>
1186 static_assert(
sizeof(TFrom) ==
sizeof(TTo),
"Must have same size");
1192 template <
typename T,
size_t N>
1197 template <
typename T,
size_t N>
1202 template <
typename T,
size_t N>
1207 template <
typename T,
size_t N>
1215 template <
typename T,
size_t N>
1216 HWY_API Mask128<T, N>
TestBit(
const Vec128<T, N>
v,
const Vec128<T, N> bit) {
1217 static_assert(!hwy::IsFloat<T>(),
"Only integer vectors supported");
1223 template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 1)>
1228 template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 2)>
1230 return Mask128<T, N>{_mm_cmpeq_epi16_mask(a.raw, b.raw)};
1233 template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 4)>
1235 return Mask128<T, N>{_mm_cmpeq_epi32_mask(a.raw, b.raw)};
1238 template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 8)>
1240 return Mask128<T, N>{_mm_cmpeq_epi64_mask(a.raw, b.raw)};
1245 return Mask128<float, N>{_mm_cmp_ps_mask(a.raw, b.raw, _CMP_EQ_OQ)};
1256 template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 1)>
1261 template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 2)>
1263 return Mask128<T, N>{_mm_cmpneq_epi16_mask(a.raw, b.raw)};
1266 template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 4)>
1268 return Mask128<T, N>{_mm_cmpneq_epi32_mask(a.raw, b.raw)};
1271 template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 8)>
1273 return Mask128<T, N>{_mm_cmpneq_epi64_mask(a.raw, b.raw)};
1278 return Mask128<float, N>{_mm_cmp_ps_mask(a.raw, b.raw, _CMP_NEQ_OQ)};
1291 HWY_API Mask128<int8_t, N>
operator>(Vec128<int8_t, N> a, Vec128<int8_t, N> b) {
1292 return Mask128<int8_t, N>{_mm_cmpgt_epi8_mask(a.raw, b.raw)};
1296 Vec128<int16_t, N> b) {
1297 return Mask128<int16_t, N>{_mm_cmpgt_epi16_mask(a.raw, b.raw)};
1301 Vec128<int32_t, N> b) {
1302 return Mask128<int32_t, N>{_mm_cmpgt_epi32_mask(a.raw, b.raw)};
1306 Vec128<int64_t, N> b) {
1307 return Mask128<int64_t, N>{_mm_cmpgt_epi64_mask(a.raw, b.raw)};
1312 Vec128<uint8_t, N> b) {
1313 return Mask128<uint8_t, N>{_mm_cmpgt_epu8_mask(a.raw, b.raw)};
1317 Vec128<uint16_t, N> b) {
1318 return Mask128<uint16_t, N>{_mm_cmpgt_epu16_mask(a.raw, b.raw)};
1322 Vec128<uint32_t, N> b) {
1323 return Mask128<uint32_t, N>{_mm_cmpgt_epu32_mask(a.raw, b.raw)};
1327 Vec128<uint64_t, N> b) {
1328 return Mask128<uint64_t, N>{_mm_cmpgt_epu64_mask(a.raw, b.raw)};
1332 HWY_API Mask128<float, N>
operator>(Vec128<float, N> a, Vec128<float, N> b) {
1333 return Mask128<float, N>{_mm_cmp_ps_mask(a.raw, b.raw, _CMP_GT_OQ)};
1344 return Mask128<float, N>{_mm_cmp_ps_mask(a.raw, b.raw, _CMP_GE_OQ)};
1356 template <
typename T,
size_t N>
1361 template <
typename T,
size_t N>
1366 template <
typename T,
size_t N>
1371 template <
typename T,
size_t N>
1379 template <
typename T,
size_t N>
1395 template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 1)>
1400 template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 2)>
1402 return Vec128<T, N>{_mm_movm_epi16(
v.raw)};
1405 template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 4)>
1407 return Vec128<T, N>{_mm_movm_epi32(
v.raw)};
1410 template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 8)>
1412 return Vec128<T, N>{_mm_movm_epi64(
v.raw)};
1425 template <
typename T,
size_t N>
1427 const Mask128<T, N>
v) {
1435 template <
typename TFrom,
typename TTo,
size_t N>
1437 Mask128<TFrom, N> m) {
1438 static_assert(
sizeof(TFrom) ==
sizeof(TTo),
"Must have same size");
1439 const Simd<TFrom, N, 0>
d;
1443 template <
typename T,
size_t N>
1445 static_assert(!hwy::IsFloat<T>(),
"Only integer vectors supported");
1446 return (
v & bit) == bit;
1454 const Vec128<uint8_t, N> b) {
1455 return Mask128<uint8_t, N>{_mm_cmpeq_epi8(a.raw, b.raw)};
1459 const Vec128<uint16_t, N> b) {
1460 return Mask128<uint16_t, N>{_mm_cmpeq_epi16(a.raw, b.raw)};
1464 const Vec128<uint32_t, N> b) {
1465 return Mask128<uint32_t, N>{_mm_cmpeq_epi32(a.raw, b.raw)};
1469 const Vec128<uint64_t, N> b) {
1470 #if HWY_TARGET == HWY_SSSE3
1471 const Simd<uint32_t, N * 2, 0> d32;
1472 const Simd<uint64_t, N, 0> d64;
1477 return Mask128<uint64_t, N>{_mm_cmpeq_epi64(a.raw, b.raw)};
1484 const Vec128<int8_t, N> b) {
1485 return Mask128<int8_t, N>{_mm_cmpeq_epi8(a.raw, b.raw)};
1489 Vec128<int16_t, N> b) {
1490 return Mask128<int16_t, N>{_mm_cmpeq_epi16(a.raw, b.raw)};
1494 const Vec128<int32_t, N> b) {
1495 return Mask128<int32_t, N>{_mm_cmpeq_epi32(a.raw, b.raw)};
1499 const Vec128<int64_t, N> b) {
1509 const Vec128<float, N> b) {
1510 return Mask128<float, N>{_mm_cmpeq_ps(a.raw, b.raw)};
1514 const Vec128<double, N> b) {
1515 return Mask128<double, N>{_mm_cmpeq_pd(a.raw, b.raw)};
1520 template <
typename T,
size_t N, HWY_IF_NOT_FLOAT(T)>
1527 const Vec128<float, N> b) {
1528 return Mask128<float, N>{_mm_cmpneq_ps(a.raw, b.raw)};
1532 const Vec128<double, N> b) {
1533 return Mask128<double, N>{_mm_cmpneq_pd(a.raw, b.raw)};
1540 HWY_API Mask128<int8_t, N>
operator>(Vec128<int8_t, N> a, Vec128<int8_t, N> b) {
1541 return Mask128<int8_t, N>{_mm_cmpgt_epi8(a.raw, b.raw)};
1545 Vec128<int16_t, N> b) {
1546 return Mask128<int16_t, N>{_mm_cmpgt_epi16(a.raw, b.raw)};
1550 Vec128<int32_t, N> b) {
1551 return Mask128<int32_t, N>{_mm_cmpgt_epi32(a.raw, b.raw)};
1554 template <
typename T,
size_t N, HWY_IF_UNSIGNED(T)>
1556 const DFromV<decltype(a)> du;
1558 const Vec128<T, N> msb =
Set(du, (LimitsMax<T>() >> 1) + 1);
1563 HWY_API Mask128<float, N>
operator>(Vec128<float, N> a, Vec128<float, N> b) {
1564 return Mask128<float, N>{_mm_cmpgt_ps(a.raw, b.raw)};
1567 HWY_API Mask128<double, N>
operator>(Vec128<double, N> a, Vec128<double, N> b) {
1568 return Mask128<double, N>{_mm_cmpgt_pd(a.raw, b.raw)};
template <size_t N>
HWY_API Mask128<int64_t, N> operator>(const Vec128<int64_t, N> a,
                                      const Vec128<int64_t, N> b) {
#if HWY_TARGET == HWY_SSSE3
  // Compare the 32-bit halves, then combine: greater if the upper halves are
  // greater, or if they are equal and the lower halves are greater.
  const __m128i m_gt = _mm_cmpgt_epi32(a.raw, b.raw);
  const __m128i m_eq = _mm_cmpeq_epi32(a.raw, b.raw);
  const __m128i lo_in_hi = _mm_shuffle_epi32(m_gt, _MM_SHUFFLE(2, 2, 0, 0));
  const __m128i lo_gt = _mm_and_si128(m_eq, lo_in_hi);
  const __m128i gt = _mm_or_si128(lo_gt, m_gt);
  // Broadcast the result from the upper 32 bits to the whole 64-bit lane.
  return Mask128<int64_t, N>{_mm_shuffle_epi32(gt, _MM_SHUFFLE(3, 3, 1, 1))};
#else
  return Mask128<int64_t, N>{_mm_cmpgt_epi64(a.raw, b.raw)};
#endif
}
1595 const Vec128<float, N> b) {
1596 return Mask128<float, N>{_mm_cmpge_ps(a.raw, b.raw)};
1600 const Vec128<double, N> b) {
1601 return Mask128<double, N>{_mm_cmpge_pd(a.raw, b.raw)};
1608 template <
typename T,
size_t N>
1613 template <
typename T,
size_t N>
template <typename T, size_t N, HWY_IF_LE128(T, N)>
HWY_API Mask128<T, N> FirstN(const Simd<T, N, 0> d, size_t num) {
#if HWY_TARGET <= HWY_AVX3
  const uint64_t all = (1ull << N) - 1;
  // BZHI only reads the lower 8 bits of its index, hence the explicit test.
  const uint64_t bits = (num > 255) ? all : _bzhi_u64(all, num);
#ifndef HWY_SAFE_PARTIAL_LOAD_STORE
#if defined(__clang_analyzer__) || \
    (HWY_COMPILER_CLANG != 0 && HWY_COMPILER_CLANG < 700)
#define HWY_SAFE_PARTIAL_LOAD_STORE 1
#else
#define HWY_SAFE_PARTIAL_LOAD_STORE 0
#endif
#endif

template <typename T>
HWY_API Vec128<T> Load(Full128<T> /* tag */, const T* HWY_RESTRICT aligned) {
  return Vec128<T>{_mm_load_si128(reinterpret_cast<const __m128i*>(aligned))};
}

template <typename T>
HWY_API Vec128<T> LoadU(Full128<T> /* tag */, const T* HWY_RESTRICT p) {
  return Vec128<T>{_mm_loadu_si128(reinterpret_cast<const __m128i*>(p))};
}
HWY_API Vec128<float> LoadU(Full128<float> /* tag */,
                            const float* HWY_RESTRICT p) {
  return Vec128<float>{_mm_loadu_ps(p)};
}
template <typename T>
HWY_API Vec64<T> Load(Full64<T> /* tag */, const T* HWY_RESTRICT p) {
#if HWY_SAFE_PARTIAL_LOAD_STORE
  // Copy 8 bytes into a zeroed register instead of a partial-load intrinsic.
  __m128i v = _mm_setzero_si128();
  CopyBytes<8>(p, &v);
  return Vec64<T>{v};
#else
  return Vec64<T>{_mm_loadl_epi64(reinterpret_cast<const __m128i*>(p))};
#endif
}

HWY_API Vec128<float, 2> Load(Simd<float, 2, 0> /* tag */,
                              const float* HWY_RESTRICT p) {
#if HWY_SAFE_PARTIAL_LOAD_STORE
  __m128 v = _mm_setzero_ps();
  CopyBytes<8>(p, &v);
  return Vec128<float, 2>{v};
#else
  const __m128 hi = _mm_setzero_ps();
  return Vec128<float, 2>{_mm_loadl_pi(hi, reinterpret_cast<const __m64*>(p))};
#endif
}

HWY_API Vec64<double> Load(Full64<double> /* tag */,
                           const double* HWY_RESTRICT p) {
#if HWY_SAFE_PARTIAL_LOAD_STORE
  __m128d v = _mm_setzero_pd();
  CopyBytes<8>(p, &v);
  return Vec64<double>{v};

HWY_API Vec128<float, 1> Load(Simd<float, 1, 0> /* tag */,
                              const float* HWY_RESTRICT p) {
#if HWY_SAFE_PARTIAL_LOAD_STORE
  __m128 v = _mm_setzero_ps();
  CopyBytes<4>(p, &v);
  return Vec128<float, 1>{v};

// Any T x1/x2: at most 4 bytes, loaded via a 32-bit integer register.
template <typename T, size_t N, HWY_IF_LE32(T, N)>
HWY_API Vec128<T, N> Load(Simd<T, N, 0> /* tag */, const T* HWY_RESTRICT p) {
  constexpr size_t kSize = sizeof(T) * N;
#if HWY_SAFE_PARTIAL_LOAD_STORE
  __m128 v = _mm_setzero_ps();
  CopyBytes<kSize>(p, &v);
  return Vec128<T, N>{v};
#else
  int32_t bits;
  CopyBytes<kSize>(p, &bits);
  return Vec128<T, N>{_mm_cvtsi32_si128(bits)};
#endif
}
template <typename T, size_t N, HWY_IF_LE64(T, N)>

template <typename T, size_t N, HWY_IF_LE128(T, N)>

// Returns a vector with lane i == first + i.
template <typename T, size_t N, typename T2, HWY_IF_LE128(T, N)>
HWY_API Vec128<T, N> Iota(const Simd<T, N, 0> d, const T2 first) {
  alignas(16) T lanes[16 / sizeof(T)];
  for (size_t i = 0; i < 16 / sizeof(T); ++i) {
    lanes[i] = static_cast<T>(first + static_cast<T2>(i));
  }
  return Load(d, lanes);
}
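// Usage sketch (illustrative only): Iota fills ascending lane values, which is
// convenient for generating indices or test data.
//   const Simd<int32_t, 4, 0> d;
//   const auto idx = Iota(d, 0);   // {0, 1, 2, 3}
//   const auto seq = Iota(d, 10);  // {10, 11, 12, 13}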
1766 #if HWY_TARGET <= HWY_AVX3
1768 template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 1)>
1774 template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 2)>
1777 return Vec128<T, N>{_mm_maskz_loadu_epi16(m.raw, p)};
1780 template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 4)>
1783 return Vec128<T, N>{_mm_maskz_loadu_epi32(m.raw, p)};
1786 template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 8)>
1789 return Vec128<T, N>{_mm_maskz_loadu_epi64(m.raw, p)};
1806 #elif HWY_TARGET == HWY_AVX2
1808 template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 4)>
1811 auto p_p =
reinterpret_cast<const int*
>(p);
1812 return Vec128<T, N>{_mm_maskload_epi32(p_p, m.raw)};
1815 template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 8)>
1818 auto p_p =
reinterpret_cast<const long long*
>(p);
1819 return Vec128<T, N>{_mm_maskload_epi64(p_p, m.raw)};
1825 const Vec128<int32_t, N> mi =
1827 return Vec128<float, N>{_mm_maskload_ps(p, mi.raw)};
1833 const Vec128<int64_t, N> mi =
1835 return Vec128<double, N>{_mm_maskload_pd(p, mi.raw)};
1839 template <
typename T,
size_t N, hwy::EnableIf<sizeof(T) <= 2>* =
nullptr>
1840 HWY_API Vec128<T, N> MaskedLoad(Mask128<T, N> m, Simd<T, N, 0> d,
1841 const T* HWY_RESTRICT p) {
1842 return IfThenElseZero(m, Load(d, p));
1848 template <
typename T,
size_t N>
1858 template <
typename T>
1860 _mm_store_si128(
reinterpret_cast<__m128i*
>(aligned),
v.raw);
1864 _mm_store_ps(aligned,
v.raw);
1868 _mm_store_pd(aligned,
v.raw);
1871 template <
typename T>
1873 _mm_storeu_si128(
reinterpret_cast<__m128i*
>(p),
v.raw);
1877 _mm_storeu_ps(p,
v.raw);
1881 _mm_storeu_pd(p,
v.raw);
1884 template <
typename T>
1886 #if HWY_SAFE_PARTIAL_LOAD_STORE
1887 CopyBytes<8>(&
v, p);
1889 _mm_storel_epi64(
reinterpret_cast<__m128i*
>(p),
v.raw);
1894 #if HWY_SAFE_PARTIAL_LOAD_STORE
1895 CopyBytes<8>(&
v, p);
1897 _mm_storel_pi(
reinterpret_cast<__m64*
>(p),
v.raw);
1902 #if HWY_SAFE_PARTIAL_LOAD_STORE
1903 CopyBytes<8>(&
v, p);
1905 _mm_storel_pd(p,
v.raw);
1910 template <
typename T,
size_t N, HWY_IF_LE32(T, N)>
1912 CopyBytes<sizeof(T) * N>(&
v, p);
1916 #if HWY_SAFE_PARTIAL_LOAD_STORE
1917 CopyBytes<4>(&
v, p);
1919 _mm_store_ss(p,
v.raw);
1924 template <
typename T,
size_t N, HWY_IF_LE64(T, N)>
1937 template <
typename T,
size_t N>
1941 using TI =
TFromD<decltype(di)>;
1942 alignas(16) TI buf[
N];
1943 alignas(16) TI mask[
N];
1946 for (
size_t i = 0; i <
N; ++i) {
1948 CopyBytes<sizeof(T)>(buf + i, p + i);
1954 #if HWY_TARGET <= HWY_AVX3
1956 template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 1)>
1959 _mm_mask_storeu_epi8(p, m.
raw,
v.raw);
1961 template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 2)>
1964 _mm_mask_storeu_epi16(p, m.raw,
v.raw);
1967 template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 4)>
1970 auto pi =
reinterpret_cast<int*
>(p);
1971 _mm_mask_storeu_epi32(pi, m.raw,
v.raw);
1974 template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 8)>
1977 auto pi =
reinterpret_cast<long long*
>(p);
1978 _mm_mask_storeu_epi64(pi, m.raw,
v.raw);
1984 _mm_mask_storeu_ps(p, m.
raw,
v.raw);
1990 _mm_mask_storeu_pd(p, m.
raw,
v.raw);
1993 #elif HWY_TARGET == HWY_AVX2
1995 template <
typename T,
size_t N, hwy::EnableIf<sizeof(T) <= 2>* =
nullptr>
1996 HWY_API
void BlendedStore(Vec128<T, N> v, Mask128<T, N> m, Simd<T, N, 0> d,
1997 T* HWY_RESTRICT p) {
1998 detail::ScalarMaskedStore(v, m, d, p);
2001 template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 4)>
2013 auto pi =
reinterpret_cast<int*
>(p);
2014 const Vec128<int32_t, N> vi =
BitCast(di,
v);
2015 _mm_maskstore_epi32(pi, m.raw, vi.raw);
2018 template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 8)>
2029 auto pi =
reinterpret_cast<long long*
>(p);
2031 _mm_maskstore_epi64(pi, m.raw, vi.raw);
2045 const Vec128<int32_t, N> mi =
2047 _mm_maskstore_ps(p, mi.raw,
v.raw);
2061 const Vec128<int64_t, N> mi =
2063 _mm_maskstore_pd(p, mi.raw,
v.raw);
2068 template <
typename T,
size_t N>
2084 const Vec128<uint8_t, N> b) {
2085 return Vec128<uint8_t, N>{_mm_add_epi8(a.raw, b.raw)};
2089 const Vec128<uint16_t, N> b) {
2090 return Vec128<uint16_t, N>{_mm_add_epi16(a.raw, b.raw)};
2094 const Vec128<uint32_t, N> b) {
2095 return Vec128<uint32_t, N>{_mm_add_epi32(a.raw, b.raw)};
2099 const Vec128<uint64_t, N> b) {
2100 return Vec128<uint64_t, N>{_mm_add_epi64(a.raw, b.raw)};
2106 const Vec128<int8_t, N> b) {
2107 return Vec128<int8_t, N>{_mm_add_epi8(a.raw, b.raw)};
2111 const Vec128<int16_t, N> b) {
2112 return Vec128<int16_t, N>{_mm_add_epi16(a.raw, b.raw)};
2116 const Vec128<int32_t, N> b) {
2117 return Vec128<int32_t, N>{_mm_add_epi32(a.raw, b.raw)};
2121 const Vec128<int64_t, N> b) {
2122 return Vec128<int64_t, N>{_mm_add_epi64(a.raw, b.raw)};
2128 const Vec128<float, N> b) {
2129 return Vec128<float, N>{_mm_add_ps(a.raw, b.raw)};
2142 const Vec128<uint8_t, N> b) {
2143 return Vec128<uint8_t, N>{_mm_sub_epi8(a.raw, b.raw)};
2147 Vec128<uint16_t, N> b) {
2148 return Vec128<uint16_t, N>{_mm_sub_epi16(a.raw, b.raw)};
2152 const Vec128<uint32_t, N> b) {
2153 return Vec128<uint32_t, N>{_mm_sub_epi32(a.raw, b.raw)};
2157 const Vec128<uint64_t, N> b) {
2158 return Vec128<uint64_t, N>{_mm_sub_epi64(a.raw, b.raw)};
2164 const Vec128<int8_t, N> b) {
2165 return Vec128<int8_t, N>{_mm_sub_epi8(a.raw, b.raw)};
2169 const Vec128<int16_t, N> b) {
2170 return Vec128<int16_t, N>{_mm_sub_epi16(a.raw, b.raw)};
2174 const Vec128<int32_t, N> b) {
2175 return Vec128<int32_t, N>{_mm_sub_epi32(a.raw, b.raw)};
2179 const Vec128<int64_t, N> b) {
2180 return Vec128<int64_t, N>{_mm_sub_epi64(a.raw, b.raw)};
2186 const Vec128<float, N> b) {
2187 return Vec128<float, N>{_mm_sub_ps(a.raw, b.raw)};
template <size_t N>
HWY_API Vec128<uint64_t, N / 8> SumsOf8(const Vec128<uint8_t, N> v) {
  return Vec128<uint64_t, N / 8>{_mm_sad_epu8(v.raw, _mm_setzero_si128())};
}
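// _mm_sad_epu8 against a zero vector computes, for each 64-bit half, the sum
// of absolute differences |x - 0| over its eight bytes, i.e. the plain sum of
// the bytes. This is the cheapest way to reduce u8 lanes to u64 partial sums.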
2208 const Vec128<uint8_t, N> b) {
2209 return Vec128<uint8_t, N>{_mm_adds_epu8(a.raw, b.raw)};
2213 const Vec128<uint16_t, N> b) {
2214 return Vec128<uint16_t, N>{_mm_adds_epu16(a.raw, b.raw)};
2220 const Vec128<int8_t, N> b) {
2221 return Vec128<int8_t, N>{_mm_adds_epi8(a.raw, b.raw)};
2225 const Vec128<int16_t, N> b) {
2226 return Vec128<int16_t, N>{_mm_adds_epi16(a.raw, b.raw)};
2236 const Vec128<uint8_t, N> b) {
2237 return Vec128<uint8_t, N>{_mm_subs_epu8(a.raw, b.raw)};
2241 const Vec128<uint16_t, N> b) {
2242 return Vec128<uint16_t, N>{_mm_subs_epu16(a.raw, b.raw)};
2248 const Vec128<int8_t, N> b) {
2249 return Vec128<int8_t, N>{_mm_subs_epi8(a.raw, b.raw)};
2253 const Vec128<int16_t, N> b) {
2254 return Vec128<int16_t, N>{_mm_subs_epi16(a.raw, b.raw)};
2264 const Vec128<uint8_t, N> b) {
2265 return Vec128<uint8_t, N>{_mm_avg_epu8(a.raw, b.raw)};
2269 const Vec128<uint16_t, N> b) {
2270 return Vec128<uint16_t, N>{_mm_avg_epu16(a.raw, b.raw)};
2277 const Vec128<uint16_t, N> b) {
2278 return Vec128<uint16_t, N>{_mm_mullo_epi16(a.raw, b.raw)};
2282 const Vec128<int16_t, N> b) {
2283 return Vec128<int16_t, N>{_mm_mullo_epi16(a.raw, b.raw)};
2289 const Vec128<uint16_t, N> b) {
2290 return Vec128<uint16_t, N>{_mm_mulhi_epu16(a.raw, b.raw)};
2294 const Vec128<int16_t, N> b) {
2295 return Vec128<int16_t, N>{_mm_mulhi_epi16(a.raw, b.raw)};
2300 const Vec128<int16_t, N> b) {
2301 return Vec128<int16_t, N>{_mm_mulhrs_epi16(a.raw, b.raw)};
2307 HWY_API Vec128<uint64_t, (
N + 1) / 2>
MulEven(
const Vec128<uint32_t, N> a,
2308 const Vec128<uint32_t, N> b) {
2309 return Vec128<uint64_t, (
N + 1) / 2>{_mm_mul_epu32(a.raw, b.raw)};
#if HWY_TARGET == HWY_SSSE3

// Partial (<= 64-bit) vectors.
template <size_t N, HWY_IF_LE64(int32_t, N)>

// Full vector: no signed 32x32 -> 64 multiply on SSSE3, so use scalar code.
HWY_API Vec128<int64_t> MulEven(const Vec128<int32_t> a,
                                const Vec128<int32_t> b) {
  alignas(16) int32_t a_lanes[4];
  alignas(16) int32_t b_lanes[4];
  const Full128<int32_t> di32;
  Store(a, di32, a_lanes);
  Store(b, di32, b_lanes);
  alignas(16) int64_t mul[2];
  mul[0] = int64_t(a_lanes[0]) * b_lanes[0];
  mul[1] = int64_t(a_lanes[2]) * b_lanes[2];
  return Load(Full128<int64_t>(), mul);
}

#else

template <size_t N>
HWY_API Vec128<int64_t, (N + 1) / 2> MulEven(const Vec128<int32_t, N> a,
                                             const Vec128<int32_t, N> b) {
  return Vec128<int64_t, (N + 1) / 2>{_mm_mul_epi32(a.raw, b.raw)};
}
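// MulEven multiplies the even-indexed 32-bit lanes (0 and 2) and returns the
// full 64-bit products. SSE4.1 provides _mm_mul_epi32 for signed inputs;
// SSSE3 only has the unsigned _mm_mul_epu32, hence the scalar fallback above.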
template <size_t N>
HWY_API Vec128<uint32_t, N> operator*(const Vec128<uint32_t, N> a,
                                      const Vec128<uint32_t, N> b) {
#if HWY_TARGET == HWY_SSSE3
  // 32x32 -> 32 multiply is not available; form two sets of 64-bit products
  // (even lanes, then odd lanes moved into even positions) and re-interleave
  // their low halves.
  const __m128i a_x3x1 = _mm_shuffle_epi32(a.raw, _MM_SHUFFLE(3, 3, 1, 1));
  const auto mullo_x2x0 = MulEven(a, b);
  const __m128i b_x3x1 = _mm_shuffle_epi32(b.raw, _MM_SHUFFLE(3, 3, 1, 1));
  const auto mullo_x3x1 =
      MulEven(Vec128<uint32_t, N>{a_x3x1}, Vec128<uint32_t, N>{b_x3x1});
  const __m128i mul_20 =
      _mm_shuffle_epi32(mullo_x2x0.raw, _MM_SHUFFLE(2, 0, 2, 0));
  const __m128i mul_31 =
      _mm_shuffle_epi32(mullo_x3x1.raw, _MM_SHUFFLE(2, 0, 2, 0));
  return Vec128<uint32_t, N>{_mm_unpacklo_epi32(mul_20, mul_31)};
#else
  return Vec128<uint32_t, N>{_mm_mullo_epi32(a.raw, b.raw)};
#endif
}
2368 const Vec128<int32_t, N> b) {
2377 template <
int kBits,
size_t N>
2379 return Vec128<uint16_t, N>{_mm_slli_epi16(
v.raw, kBits)};
2382 template <
int kBits,
size_t N>
2384 return Vec128<uint32_t, N>{_mm_slli_epi32(
v.raw, kBits)};
2387 template <
int kBits,
size_t N>
2389 return Vec128<uint64_t, N>{_mm_slli_epi64(
v.raw, kBits)};
2392 template <
int kBits,
size_t N>
2394 return Vec128<int16_t, N>{_mm_slli_epi16(
v.raw, kBits)};
2396 template <
int kBits,
size_t N>
2398 return Vec128<int32_t, N>{_mm_slli_epi32(
v.raw, kBits)};
2400 template <
int kBits,
size_t N>
2402 return Vec128<int64_t, N>{_mm_slli_epi64(
v.raw, kBits)};
2405 template <
int kBits,
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 1)>
2409 const Vec128<T, N> shifted{ShiftLeft<kBits>(Vec128<MakeWide<T>>{
v.raw}).raw};
2412 : (shifted &
Set(d8,
static_cast<T
>((0xFF << kBits) & 0xFF)));
2417 template <
int kBits,
size_t N>
2419 return Vec128<uint16_t, N>{_mm_srli_epi16(
v.raw, kBits)};
2421 template <
int kBits,
size_t N>
2423 return Vec128<uint32_t, N>{_mm_srli_epi32(
v.raw, kBits)};
2425 template <
int kBits,
size_t N>
2427 return Vec128<uint64_t, N>{_mm_srli_epi64(
v.raw, kBits)};
2430 template <
int kBits,
size_t N>
2434 const Vec128<uint8_t, N> shifted{
2435 ShiftRight<kBits>(Vec128<uint16_t>{
v.raw}).raw};
2436 return shifted &
Set(d8, 0xFF >> kBits);
2439 template <
int kBits,
size_t N>
2441 return Vec128<int16_t, N>{_mm_srai_epi16(
v.raw, kBits)};
2443 template <
int kBits,
size_t N>
2445 return Vec128<int32_t, N>{_mm_srai_epi32(
v.raw, kBits)};
template <int kBits, size_t N>
HWY_API Vec128<int8_t, N> ShiftRight(const Vec128<int8_t, N> v) {
  const DFromV<decltype(v)> di;
  const RebindToUnsigned<decltype(di)> du;
  const auto shifted = BitCast(di, ShiftRight<kBits>(BitCast(du, v)));
  const auto shifted_sign = BitCast(di, Set(du, 0x80 >> kBits));
  return (shifted ^ shifted_sign) - shifted_sign;
}
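// There is no 8-bit arithmetic shift instruction. The trick above shifts as
// unsigned and then sign-extends: with s = 0x80 >> kBits (the shifted sign
// bit), (x ^ s) - s leaves non-negative results unchanged and restores the
// high bits of negative ones. Example for int8, (-2) >> 1:
//   0xFE >>u 1 = 0x7F; s = 0x40; (0x7F ^ 0x40) - 0x40 = 0x3F - 0x40 = -1.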
2461 template <
int kBits,
size_t N>
2463 static_assert(0 <= kBits && kBits < 32,
"Invalid shift count");
2464 #if HWY_TARGET <= HWY_AVX3
2465 return Vec128<uint32_t, N>{_mm_ror_epi32(
v.raw, kBits)};
2467 if (kBits == 0)
return v;
2472 template <
int kBits,
size_t N>
2474 static_assert(0 <= kBits && kBits < 64,
"Invalid shift count");
2475 #if HWY_TARGET <= HWY_AVX3
2476 return Vec128<uint64_t, N>{_mm_ror_epi64(
v.raw, kBits)};
2478 if (kBits == 0)
return v;
2493 return ShiftRight<15>(
v);
2498 return ShiftRight<31>(
v);
2504 #if HWY_TARGET <= HWY_AVX3
2507 #elif HWY_TARGET == HWY_AVX2 || HWY_TARGET == HWY_SSE4
2513 const auto sign = ShiftRight<31>(
BitCast(d32,
v));
2515 _mm_shuffle_epi32(sign.raw, _MM_SHUFFLE(3, 3, 1, 1))};
2520 HWY_API Vec128<int64_t, N>
Abs(
const Vec128<int64_t, N>
v) {
2521 #if HWY_TARGET <= HWY_AVX3
2522 return Vec128<int64_t, N>{_mm_abs_epi64(
v.raw)};
2529 template <
int kBits,
size_t N>
2531 #if HWY_TARGET <= HWY_AVX3
2532 return Vec128<int64_t, N>{_mm_srai_epi64(
v.raw, kBits)};
2538 return right | sign;
2543 template <
typename T,
size_t N, HWY_IF_FLOAT(T)>
2546 #if HWY_TARGET == HWY_SSSE3
2564 template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 2)>
2567 static_assert(IsSigned<T>(),
"Only works for signed/float");
2576 template <
typename T,
size_t N, HWY_IF_NOT_LANE_SIZE(T, 2)>
2579 static_assert(IsSigned<T>(),
"Only works for signed/float");
2593 return Vec128<uint16_t, N>{_mm_sll_epi16(
v.raw, _mm_cvtsi32_si128(bits))};
2598 return Vec128<uint32_t, N>{_mm_sll_epi32(
v.raw, _mm_cvtsi32_si128(bits))};
2603 return Vec128<uint64_t, N>{_mm_sll_epi64(
v.raw, _mm_cvtsi32_si128(bits))};
2609 return Vec128<int16_t, N>{_mm_sll_epi16(
v.raw, _mm_cvtsi32_si128(bits))};
2615 return Vec128<int32_t, N>{_mm_sll_epi32(
v.raw, _mm_cvtsi32_si128(bits))};
2621 return Vec128<int64_t, N>{_mm_sll_epi64(
v.raw, _mm_cvtsi32_si128(bits))};
2624 template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 1)>
2628 const Vec128<T, N> shifted{
2630 return shifted &
Set(d8,
static_cast<T
>((0xFF << bits) & 0xFF));
2638 return Vec128<uint16_t, N>{_mm_srl_epi16(
v.raw, _mm_cvtsi32_si128(bits))};
2643 return Vec128<uint32_t, N>{_mm_srl_epi32(
v.raw, _mm_cvtsi32_si128(bits))};
2648 return Vec128<uint64_t, N>{_mm_srl_epi64(
v.raw, _mm_cvtsi32_si128(bits))};
2656 const Vec128<uint8_t, N> shifted{
2658 return shifted &
Set(d8,
static_cast<uint8_t
>(0xFF >> bits));
2664 return Vec128<int16_t, N>{_mm_sra_epi16(
v.raw, _mm_cvtsi32_si128(bits))};
2670 return Vec128<int32_t, N>{_mm_sra_epi32(
v.raw, _mm_cvtsi32_si128(bits))};
2675 #if HWY_TARGET <= HWY_AVX3
2676 return Vec128<int64_t, N>{_mm_sra_epi64(
v.raw, _mm_cvtsi32_si128(bits))};
2682 return right | sign;
2691 const auto shifted_sign =
2692 BitCast(di,
Set(du,
static_cast<uint8_t
>(0x80 >> bits)));
2693 return (shifted ^ shifted_sign) - shifted_sign;
2699 HWY_API Vec128<float, N>
operator*(Vec128<float, N> a, Vec128<float, N> b) {
2700 return Vec128<float, N>{_mm_mul_ps(a.raw, b.raw)};
2717 const Vec128<float, N> b) {
2718 return Vec128<float, N>{_mm_div_ps(a.raw, b.raw)};
2736 return Vec128<float, N>{_mm_rcp_ps(
v.raw)};
2745 const Vec128<float, N> b) {
2754 const Vec128<float, N> x,
2755 const Vec128<float, N> add) {
2756 #if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
2757 return mul * x + add;
2759 return Vec128<float, N>{_mm_fmadd_ps(mul.raw, x.raw, add.raw)};
2766 #if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
2767 return mul * x + add;
2776 const Vec128<float, N> x,
2777 const Vec128<float, N> add) {
2778 #if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
2779 return add - mul * x;
2781 return Vec128<float, N>{_mm_fnmadd_ps(mul.raw, x.raw, add.raw)};
2788 #if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
2789 return add - mul * x;
2798 const Vec128<float, N> x,
2799 const Vec128<float, N> sub) {
2800 #if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
2801 return mul * x - sub;
2803 return Vec128<float, N>{_mm_fmsub_ps(mul.raw, x.raw, sub.raw)};
2810 #if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
2811 return mul * x - sub;
2820 const Vec128<float, N> x,
2821 const Vec128<float, N> sub) {
2822 #if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
2823 return Neg(mul) * x - sub;
2825 return Vec128<float, N>{_mm_fnmsub_ps(mul.raw, x.raw, sub.raw)};
2832 #if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
2833 return Neg(mul) * x - sub;
2843 HWY_API Vec128<float, N>
Sqrt(
const Vec128<float, N>
v) {
2844 return Vec128<float, N>{_mm_sqrt_ps(
v.raw)};
2860 return Vec128<float, N>{_mm_rsqrt_ps(
v.raw)};
2870 template <
typename T,
size_t N>
2876 const auto msb =
Set(du,
static_cast<T
>(T(1) << (
sizeof(T) * 8 - 1)));
2885 HWY_API Vec128<uint8_t, N>
Min(
const Vec128<uint8_t, N> a,
2886 const Vec128<uint8_t, N> b) {
2887 return Vec128<uint8_t, N>{_mm_min_epu8(a.raw, b.raw)};
2890 HWY_API Vec128<uint16_t, N>
Min(
const Vec128<uint16_t, N> a,
2891 const Vec128<uint16_t, N> b) {
2892 #if HWY_TARGET == HWY_SSSE3
2895 return Vec128<uint16_t, N>{_mm_min_epu16(a.raw, b.raw)};
2899 HWY_API Vec128<uint32_t, N>
Min(
const Vec128<uint32_t, N> a,
2900 const Vec128<uint32_t, N> b) {
2901 #if HWY_TARGET == HWY_SSSE3
2904 return Vec128<uint32_t, N>{_mm_min_epu32(a.raw, b.raw)};
2908 HWY_API Vec128<uint64_t, N>
Min(
const Vec128<uint64_t, N> a,
2909 const Vec128<uint64_t, N> b) {
2910 #if HWY_TARGET <= HWY_AVX3
2911 return Vec128<uint64_t, N>{_mm_min_epu64(a.raw, b.raw)};
2919 HWY_API Vec128<int8_t, N>
Min(
const Vec128<int8_t, N> a,
2920 const Vec128<int8_t, N> b) {
2921 #if HWY_TARGET == HWY_SSSE3
2924 return Vec128<int8_t, N>{_mm_min_epi8(a.raw, b.raw)};
2928 HWY_API Vec128<int16_t, N>
Min(
const Vec128<int16_t, N> a,
2929 const Vec128<int16_t, N> b) {
2930 return Vec128<int16_t, N>{_mm_min_epi16(a.raw, b.raw)};
2933 HWY_API Vec128<int32_t, N>
Min(
const Vec128<int32_t, N> a,
2934 const Vec128<int32_t, N> b) {
2935 #if HWY_TARGET == HWY_SSSE3
2938 return Vec128<int32_t, N>{_mm_min_epi32(a.raw, b.raw)};
2942 HWY_API Vec128<int64_t, N>
Min(
const Vec128<int64_t, N> a,
2943 const Vec128<int64_t, N> b) {
2944 #if HWY_TARGET <= HWY_AVX3
2945 return Vec128<int64_t, N>{_mm_min_epi64(a.raw, b.raw)};
2953 HWY_API Vec128<float, N>
Min(
const Vec128<float, N> a,
2954 const Vec128<float, N> b) {
2955 return Vec128<float, N>{_mm_min_ps(a.raw, b.raw)};
2966 template <
typename T,
size_t N>
2972 const auto msb =
Set(du,
static_cast<T
>(T(1) << (
sizeof(T) * 8 - 1)));
2981 HWY_API Vec128<uint8_t, N>
Max(
const Vec128<uint8_t, N> a,
2982 const Vec128<uint8_t, N> b) {
2983 return Vec128<uint8_t, N>{_mm_max_epu8(a.raw, b.raw)};
2986 HWY_API Vec128<uint16_t, N>
Max(
const Vec128<uint16_t, N> a,
2987 const Vec128<uint16_t, N> b) {
2988 #if HWY_TARGET == HWY_SSSE3
2991 return Vec128<uint16_t, N>{_mm_max_epu16(a.raw, b.raw)};
2995 HWY_API Vec128<uint32_t, N>
Max(
const Vec128<uint32_t, N> a,
2996 const Vec128<uint32_t, N> b) {
2997 #if HWY_TARGET == HWY_SSSE3
3000 return Vec128<uint32_t, N>{_mm_max_epu32(a.raw, b.raw)};
3004 HWY_API Vec128<uint64_t, N>
Max(
const Vec128<uint64_t, N> a,
3005 const Vec128<uint64_t, N> b) {
3006 #if HWY_TARGET <= HWY_AVX3
3007 return Vec128<uint64_t, N>{_mm_max_epu64(a.raw, b.raw)};
3015 HWY_API Vec128<int8_t, N>
Max(
const Vec128<int8_t, N> a,
3016 const Vec128<int8_t, N> b) {
3017 #if HWY_TARGET == HWY_SSSE3
3020 return Vec128<int8_t, N>{_mm_max_epi8(a.raw, b.raw)};
3024 HWY_API Vec128<int16_t, N>
Max(
const Vec128<int16_t, N> a,
3025 const Vec128<int16_t, N> b) {
3026 return Vec128<int16_t, N>{_mm_max_epi16(a.raw, b.raw)};
3029 HWY_API Vec128<int32_t, N>
Max(
const Vec128<int32_t, N> a,
3030 const Vec128<int32_t, N> b) {
3031 #if HWY_TARGET == HWY_SSSE3
3034 return Vec128<int32_t, N>{_mm_max_epi32(a.raw, b.raw)};
3038 HWY_API Vec128<int64_t, N>
Max(
const Vec128<int64_t, N> a,
3039 const Vec128<int64_t, N> b) {
3040 #if HWY_TARGET <= HWY_AVX3
3041 return Vec128<int64_t, N>{_mm_max_epi64(a.raw, b.raw)};
3049 HWY_API Vec128<float, N>
Max(
const Vec128<float, N> a,
3050 const Vec128<float, N> b) {
3051 return Vec128<float, N>{_mm_max_ps(a.raw, b.raw)};
3065 template <
typename T,
size_t N>
3068 _mm_stream_si128(
reinterpret_cast<__m128i*
>(aligned),
v.raw);
3073 _mm_stream_ps(aligned,
v.raw);
3078 _mm_stream_pd(aligned,
v.raw);
3089 static_assert(sizeof(
GatherIndex64) == 8, "Must be 64-bit type");
3091 #if HWY_TARGET <= HWY_AVX3
3094 template <
typename T,
size_t N>
3099 _mm_i32scatter_epi32(base, offset.
raw,
v.raw, 1);
3101 const __mmask8 mask = (1u <<
N) - 1;
3102 _mm_mask_i32scatter_epi32(base, mask, offset.
raw,
v.raw, 1);
3105 template <
typename T,
size_t N>
3110 _mm_i32scatter_epi32(base, index.
raw,
v.raw, 4);
3112 const __mmask8 mask = (1u <<
N) - 1;
3113 _mm_mask_i32scatter_epi32(base, mask, index.
raw,
v.raw, 4);
3117 template <
typename T,
size_t N>
3122 _mm_i64scatter_epi64(base, offset.
raw,
v.raw, 1);
3124 const __mmask8 mask = (1u <<
N) - 1;
3125 _mm_mask_i64scatter_epi64(base, mask, offset.
raw,
v.raw, 1);
3128 template <
typename T,
size_t N>
3133 _mm_i64scatter_epi64(base, index.
raw,
v.raw, 8);
3135 const __mmask8 mask = (1u <<
N) - 1;
3136 _mm_mask_i64scatter_epi64(base, mask, index.
raw,
v.raw, 8);
3142 template <
typename T,
size_t N,
typename Offset>
3146 static_assert(
sizeof(T) ==
sizeof(Offset),
"Must match for portability");
3149 template <
typename T,
size_t N,
typename Index>
3152 static_assert(
sizeof(T) ==
sizeof(Index),
"Must match for portability");
3161 _mm_i32scatter_ps(base, offset.
raw,
v.raw, 1);
3163 const __mmask8 mask = (1u <<
N) - 1;
3164 _mm_mask_i32scatter_ps(base, mask, offset.
raw,
v.raw, 1);
3172 _mm_i32scatter_ps(base, index.
raw,
v.raw, 4);
3174 const __mmask8 mask = (1u <<
N) - 1;
3175 _mm_mask_i32scatter_ps(base, mask, index.
raw,
v.raw, 4);
3184 _mm_i64scatter_pd(base, offset.
raw,
v.raw, 1);
3186 const __mmask8 mask = (1u <<
N) - 1;
3187 _mm_mask_i64scatter_pd(base, mask, offset.
raw,
v.raw, 1);
3195 _mm_i64scatter_pd(base, index.
raw,
v.raw, 8);
3197 const __mmask8 mask = (1u <<
N) - 1;
3198 _mm_mask_i64scatter_pd(base, mask, index.
raw,
v.raw, 8);
// Emulation for targets without native scatter instructions.
template <typename T, size_t N, typename Offset, HWY_IF_LE128(T, N)>
HWY_API void ScatterOffset(Vec128<T, N> v, Simd<T, N, 0> d,
                           T* HWY_RESTRICT base,
                           const Vec128<Offset, N> offset) {
  static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");

  alignas(16) T lanes[N];
  Store(v, d, lanes);

  alignas(16) Offset offset_lanes[N];
  Store(offset, Rebind<Offset, decltype(d)>(), offset_lanes);

  uint8_t* base_bytes = reinterpret_cast<uint8_t*>(base);
  for (size_t i = 0; i < N; ++i) {
    CopyBytes<sizeof(T)>(&lanes[i], base_bytes + offset_lanes[i]);
  }
}

template <typename T, size_t N, typename Index, HWY_IF_LE128(T, N)>
HWY_API void ScatterIndex(Vec128<T, N> v, Simd<T, N, 0> d,
                          T* HWY_RESTRICT base,
                          const Vec128<Index, N> index) {
  static_assert(sizeof(T) == sizeof(Index), "Must match for portability");

  alignas(16) T lanes[N];
  Store(v, d, lanes);

  alignas(16) Index index_lanes[N];
  Store(index, Rebind<Index, decltype(d)>(), index_lanes);

  for (size_t i = 0; i < N; ++i) {
    base[index_lanes[i]] = lanes[i];
  }
}
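// Usage sketch (illustrative only): scatter four lanes to strided positions.
//   const Simd<int32_t, 4, 0> d;
//   int32_t out[100] = {};
//   const auto v = Iota(d, 1);                 // {1, 2, 3, 4}
//   const auto idx = Iota(d, 0) * Set(d, 10);  // {0, 10, 20, 30}
//   ScatterIndex(v, d, out, idx);  // out[0]=1, out[10]=2, out[20]=3, out[30]=4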
3241 #if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
3243 template <
typename T,
size_t N,
typename Offset>
3246 const Vec128<Offset, N> offset) {
3247 static_assert(
sizeof(T) ==
sizeof(Offset),
"Must match for portability");
3249 alignas(16) Offset offset_lanes[
N];
3250 Store(offset,
Rebind<Offset, decltype(
d)>(), offset_lanes);
3252 alignas(16) T lanes[
N];
3253 const uint8_t* base_bytes =
reinterpret_cast<const uint8_t*
>(base);
3254 for (
size_t i = 0; i <
N; ++i) {
3255 CopyBytes<sizeof(T)>(base_bytes + offset_lanes[i], &lanes[i]);
3257 return Load(
d, lanes);
3260 template <
typename T,
size_t N,
typename Index>
3263 const Vec128<Index, N> index) {
3264 static_assert(
sizeof(T) ==
sizeof(Index),
"Must match for portability");
3266 alignas(16) Index index_lanes[
N];
3267 Store(index,
Rebind<Index, decltype(
d)>(), index_lanes);
3269 alignas(16) T lanes[
N];
3270 for (
size_t i = 0; i <
N; ++i) {
3271 lanes[i] = base[index_lanes[i]];
3273 return Load(
d, lanes);
3280 template <
typename T,
size_t N>
3284 const Vec128<int32_t, N> offset) {
3285 return Vec128<T, N>{_mm_i32gather_epi32(
3286 reinterpret_cast<const int32_t*
>(base), offset.raw, 1)};
3288 template <
typename T,
size_t N>
3292 const Vec128<int32_t, N> index) {
3293 return Vec128<T, N>{_mm_i32gather_epi32(
3294 reinterpret_cast<const int32_t*
>(base), index.raw, 4)};
3297 template <
typename T,
size_t N>
3301 const Vec128<int64_t, N> offset) {
3302 return Vec128<T, N>{_mm_i64gather_epi64(
3303 reinterpret_cast<const GatherIndex64*
>(base), offset.raw, 1)};
3305 template <
typename T,
size_t N>
3309 const Vec128<int64_t, N> index) {
3310 return Vec128<T, N>{_mm_i64gather_epi64(
3311 reinterpret_cast<const GatherIndex64*
>(base), index.raw, 8)};
3316 template <
typename T,
size_t N,
typename Offset>
3318 const Vec128<Offset, N> offset) {
3321 template <
typename T,
size_t N,
typename Index>
3323 const Vec128<Index, N> index) {
3330 const Vec128<int32_t, N> offset) {
3331 return Vec128<float, N>{_mm_i32gather_ps(base, offset.raw, 1)};
3336 const Vec128<int32_t, N> index) {
3337 return Vec128<float, N>{_mm_i32gather_ps(base, index.raw, 4)};
3343 const Vec128<int64_t, N> offset) {
3344 return Vec128<double, N>{_mm_i64gather_pd(base, offset.raw, 1)};
3349 const Vec128<int64_t, N> index) {
3350 return Vec128<double, N>{_mm_i64gather_pd(base, index.raw, 8)};
3362 template <
typename T,
size_t N>
3365 return Vec128<T,
N / 2>{
v.raw};
3368 template <
typename T,
size_t N>
3375 template <
int kBytes,
typename T,
size_t N>
3377 static_assert(0 <= kBytes && kBytes <= 16,
"Invalid kBytes");
3378 return Vec128<T, N>{_mm_slli_si128(
v.raw, kBytes)};
3381 template <
int kBytes,
typename T,
size_t N>
3383 return ShiftLeftBytes<kBytes>(
DFromV<decltype(
v)>(),
v);
3388 template <
int kLanes,
typename T,
size_t N>
3394 template <
int kLanes,
typename T,
size_t N>
3396 return ShiftLeftLanes<kLanes>(
DFromV<decltype(
v)>(),
v);
3400 template <
int kBytes,
typename T,
size_t N>
3402 static_assert(0 <= kBytes && kBytes <= 16,
"Invalid kBytes");
3404 if (
N != 16 /
sizeof(T)) {
3405 const Vec128<T> vfull{
v.raw};
3408 return Vec128<T, N>{_mm_srli_si128(
v.raw, kBytes)};
3412 template <
int kLanes,
typename T,
size_t N>
3421 template <
typename T>
3423 return Vec64<T>{_mm_unpackhi_epi64(
v.raw,
v.raw)};
3426 return Vec128<float, 2>{_mm_movehl_ps(
v.raw,
v.raw)};
3433 template <
typename T,
size_t N, HWY_IF_LE64(T, N)>
3440 return Vec128<T, (
N + 1) / 2>{upper.raw};
3445 template <
int kBytes,
typename T,
class V = Vec128<T>>
3448 return BitCast(
d, Vec128<uint8_t>{_mm_alignr_epi8(
3452 template <
int kBytes,
typename T,
size_t N,
HWY_IF_LE64(T,
N),
3453 class V = Vec128<T, N>>
3455 constexpr
size_t kSize =
N *
sizeof(T);
3456 static_assert(0 < kBytes && kBytes < kSize,
"kBytes invalid");
3458 const Full128<uint8_t> d_full8;
3459 using V8 =
VFromD<decltype(d_full8)>;
3460 const V8 hi8{
BitCast(d8, hi).raw};
3464 return V{
BitCast(Full128<T>(), r).raw};
3470 template <
int kLane,
size_t N>
3472 static_assert(0 <= kLane && kLane <
N,
"Invalid lane");
3474 const __m128i lo = _mm_shufflelo_epi16(
v.raw, (0x55 * kLane) & 0xFF);
3477 const __m128i hi = _mm_shufflehi_epi16(
v.raw, (0x55 * (kLane - 4)) & 0xFF);
3481 template <
int kLane,
size_t N>
3483 static_assert(0 <= kLane && kLane <
N,
"Invalid lane");
3486 template <
int kLane,
size_t N>
3488 static_assert(0 <= kLane && kLane <
N,
"Invalid lane");
3493 template <
int kLane,
size_t N>
3495 static_assert(0 <= kLane && kLane <
N,
"Invalid lane");
3497 const __m128i lo = _mm_shufflelo_epi16(
v.raw, (0x55 * kLane) & 0xFF);
3500 const __m128i hi = _mm_shufflehi_epi16(
v.raw, (0x55 * (kLane - 4)) & 0xFF);
3504 template <
int kLane,
size_t N>
3506 static_assert(0 <= kLane && kLane <
N,
"Invalid lane");
3509 template <
int kLane,
size_t N>
3511 static_assert(0 <= kLane && kLane <
N,
"Invalid lane");
3516 template <
int kLane,
size_t N>
3518 static_assert(0 <= kLane && kLane <
N,
"Invalid lane");
3521 template <
int kLane,
size_t N>
3523 static_assert(0 <= kLane && kLane <
N,
"Invalid lane");
3528 template <
typename T,
size_t N,
typename TI,
size_t NI>
3530 const Vec128<TI, NI> from) {
3531 return Vec128<TI, NI>{_mm_shuffle_epi8(bytes.raw, from.raw)};
3536 template <
class V,
class VI>
3544 template <
typename T,
size_t N = 16 /
sizeof(T)>
3552 static_assert(
sizeof(T) ==
sizeof(TI),
"Index size must match lane");
3553 #if HWY_IS_DEBUG_BUILD
3554 const Rebind<TI, decltype(
d)> di;
3559 #if HWY_TARGET <= HWY_AVX2
3564 using V8 =
VFromD<decltype(d8)>;
3565 alignas(16) constexpr uint8_t kByteOffsets[16] = {0, 1, 2, 3, 0, 1, 2, 3,
3566 0, 1, 2, 3, 0, 1, 2, 3};
3569 alignas(16) constexpr uint8_t kBroadcastLaneBytes[16] = {
3570 0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12};
3575 const V8 byte_indices =
BitCast(d8, ShiftLeft<2>(
BitCast(d16, lane_indices)));
3584 static_assert(
sizeof(T) ==
sizeof(TI),
"Index size must match lane");
3585 #if HWY_IS_DEBUG_BUILD
3586 const Rebind<TI, decltype(
d)> di;
3594 return Indices128<T, N>{vec.raw};
3597 template <
typename T,
size_t N,
typename TI, HWY_IF_LE128(T, N)>
3599 const Rebind<TI, decltype(
d)> di;
3603 template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 4)>
3605 #if HWY_TARGET <= HWY_AVX2
3615 template <
size_t N, HWY_IF_GE64(
float, N)>
3618 #if HWY_TARGET <= HWY_AVX2
3629 template <
typename T>
3635 template <
typename T, HWY_IF_LANE_SIZE(T, 8)>
3639 #if HWY_TARGET <= HWY_AVX2
3659 #if HWY_TARGET <= HWY_AVX2
3677 template <
typename T>
3685 template <
typename T>
3691 template <
typename T, HWY_IF_LANE_SIZE(T, 4)>
3696 template <
typename T, HWY_IF_LANE_SIZE(T, 8)>
3702 template <
typename T, HWY_IF_LANE_SIZE(T, 4)>
3708 template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 2)>
3710 #if HWY_TARGET <= HWY_AVX3
3711 if (
N == 1)
return v;
3717 alignas(16) constexpr int16_t kReverse[8] = {7, 6, 5, 4, 3, 2, 1, 0};
3718 const Vec128<int16_t, N> idx =
Load(di, kReverse + (
N == 8 ? 0 : 4));
3719 return BitCast(
d, Vec128<int16_t, N>{
3720 _mm_permutexvar_epi16(idx.raw,
BitCast(di,
v).raw)});
3729 template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 2)>
3735 template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 4)>
3740 template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 8)>
3747 template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 2)>
3752 return BitCast(
d, Vec128<int16_t, N>{_mm_shufflelo_epi16(
3753 BitCast(di,
v).raw, _MM_SHUFFLE(0, 1, 2, 3))});
3756 #if HWY_TARGET <= HWY_AVX3
3757 alignas(16) constexpr int16_t kReverse4[8] = {3, 2, 1, 0, 7, 6, 5, 4};
3758 const Vec128<int16_t, N> idx =
Load(di, kReverse4);
3759 return BitCast(
d, Vec128<int16_t, N>{
3760 _mm_permutexvar_epi16(idx.raw,
BitCast(di,
v).raw)});
3768 template <
typename T, HWY_IF_LANE_SIZE(T, 4)>
3773 template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 8)>
3780 template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 2)>
3782 #if HWY_TARGET <= HWY_AVX3
3784 alignas(32) constexpr int16_t kReverse8[16] = {7, 6, 5, 4, 3, 2, 1, 0,
3785 15, 14, 13, 12, 11, 10, 9, 8};
3786 const Vec128<int16_t, N> idx =
Load(di, kReverse8);
3787 return BitCast(
d, Vec128<int16_t, N>{
3788 _mm_permutexvar_epi16(idx.raw,
BitCast(di,
v).raw)});
3795 template <
typename T,
size_t N, HWY_IF_NOT_LANE_SIZE(T, 2)>
3806 template <
size_t N, HWY_IF_LE128(u
int8_t, N)>
3811 template <
size_t N, HWY_IF_LE128(u
int16_t, N)>
3816 template <
size_t N, HWY_IF_LE128(u
int32_t, N)>
3821 template <
size_t N, HWY_IF_LE128(u
int64_t, N)>
3827 template <
size_t N, HWY_IF_LE128(
int8_t, N)>
3832 template <
size_t N, HWY_IF_LE128(
int16_t, N)>
3837 template <
size_t N, HWY_IF_LE128(
int32_t, N)>
3842 template <
size_t N, HWY_IF_LE128(
int64_t, N)>
3848 template <
size_t N, HWY_IF_LE128(
float, N)>
3850 const Vec128<float, N> b) {
3851 return Vec128<float, N>{_mm_unpacklo_ps(a.raw, b.raw)};
3853 template <
size_t N, HWY_IF_LE128(
double, N)>
3905 const Vec128<float> b) {
3906 return Vec128<float>{_mm_unpackhi_ps(a.raw, b.raw)};
3916 template <
typename T,
class V = Vec128<T>>
3922 template <
typename T,
size_t N, HWY_IF_LE64(T, N),
class V = Vec128<T, N>>
3924 const Half<decltype(
d)> d2;
3932 template <
class V,
class DW = RepartitionToW
ide<DFromV<V>>>
3936 template <
class V,
class D = DFromV<V>,
class DW = RepartitionToW
ide<D>>
3941 template <
class V,
class D = DFromV<V>,
class DW = RepartitionToW
ide<D>>
3951 template <
typename T,
size_t N, HWY_IF_LE128(T, N)>
3952 HWY_API Vec128<T, N>
Combine(Simd<T, N, 0>
d, Vec128<T, N / 2> hi_half,
3953 Vec128<T, N / 2> lo_half) {
3954 const Half<decltype(
d)> d2;
3958 const VU lo{
BitCast(du2, lo_half).raw};
3959 const VU hi{
BitCast(du2, hi_half).raw};
3965 template <
typename T, HWY_IF_NOT_FLOAT(T)>
3970 template <
typename T, HWY_IF_FLOAT(T)>
3976 template <
typename T,
size_t N, HWY_IF_LE64(T, N)>
3984 template <
typename T>
3991 template <
typename T>
3998 template <
typename T>
4000 const Vec128<T> lo) {
4001 return CombineShiftRightBytes<8>(
d, hi, lo);
4005 template <
typename T>
4007 #if HWY_TARGET == HWY_SSSE3
4009 const __m128d concat = _mm_move_sd(
BitCast(dd, hi).raw,
BitCast(dd, lo).raw);
4029 template <
typename T,
size_t N, HWY_IF_LE64(T, N)>
4032 const Half<decltype(
d)> d2;
4036 template <
typename T,
size_t N, HWY_IF_LE64(T, N)>
4039 const Half<decltype(
d)> d2;
4043 template <
typename T,
size_t N, HWY_IF_LE64(T, N)>
4045 const Vec128<T, N> lo) {
4046 const Half<decltype(
d)> d2;
4050 template <
typename T,
size_t N, HWY_IF_LE64(T, N)>
4053 const Half<decltype(
d)> d2;
4060 template <
typename T, HWY_IF_LANE_SIZE(T, 4)>
4064 d, Vec128<float>{_mm_shuffle_ps(
BitCast(df, lo).raw,
BitCast(df, hi).raw,
4065 _MM_SHUFFLE(3, 1, 3, 1))});
4070 return Vec128<float>{_mm_shuffle_ps(lo.raw, hi.raw, _MM_SHUFFLE(3, 1, 3, 1))};
4074 template <
typename T, HWY_IF_LANE_SIZE(T, 4)>
4081 template <
typename T, HWY_IF_LANE_SIZE(T, 8)>
4089 template <
typename T, HWY_IF_LANE_SIZE(T, 4)>
4093 d, Vec128<float>{_mm_shuffle_ps(
BitCast(df, lo).raw,
BitCast(df, hi).raw,
4094 _MM_SHUFFLE(2, 0, 2, 0))});
4099 return Vec128<float>{_mm_shuffle_ps(lo.raw, hi.raw, _MM_SHUFFLE(2, 0, 2, 0))};
4103 template <
typename T, HWY_IF_LANE_SIZE(T, 4)>
4110 template <
typename T, HWY_IF_LANE_SIZE(T, 8)>
4117 template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 4)>
4119 return Vec128<T, N>{_mm_shuffle_epi32(
v.raw, _MM_SHUFFLE(2, 2, 0, 0))};
4124 _mm_shuffle_ps(
v.raw,
v.raw, _MM_SHUFFLE(2, 2, 0, 0))};
4127 template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 8)>
4134 template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 4)>
4136 return Vec128<T, N>{_mm_shuffle_epi32(
v.raw, _MM_SHUFFLE(3, 3, 1, 1))};
4141 _mm_shuffle_ps(
v.raw,
v.raw, _MM_SHUFFLE(3, 3, 1, 1))};
4144 template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 8)>
4153 template <
typename T,
size_t N>
4155 const Vec128<T, N> b) {
4158 alignas(16) constexpr uint8_t mask[16] = {0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0,
4159 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0};
4162 template <
typename T,
size_t N>
4164 const Vec128<T, N> b) {
4165 #if HWY_TARGET == HWY_SSSE3
4168 alignas(16) constexpr uint8_t mask[16] = {0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0,
4169 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0};
4172 return Vec128<T, N>{_mm_blend_epi16(a.raw, b.raw, 0x55)};
4175 template <
typename T,
size_t N>
4177 const Vec128<T, N> b) {
4178 #if HWY_TARGET == HWY_SSSE3
4179 const __m128i odd = _mm_shuffle_epi32(a.raw, _MM_SHUFFLE(3, 1, 3, 1));
4180 const __m128i even = _mm_shuffle_epi32(b.raw, _MM_SHUFFLE(2, 0, 2, 0));
4181 return Vec128<T, N>{_mm_unpacklo_epi32(even, odd)};
4183 return Vec128<T, N>{_mm_blend_epi16(a.raw, b.raw, 0x33)};
4186 template <
typename T,
size_t N>
4188 const Vec128<T, N> b) {
4189 #if HWY_TARGET == HWY_SSSE3
4190 const Full128<double> dd;
4191 const __m128d concat = _mm_move_sd(
BitCast(dd, a).raw,
BitCast(dd, b).raw);
4192 return BitCast(Full128<T>(), Vec128<double>{concat});
4194 return Vec128<T, N>{_mm_blend_epi16(a.raw, b.raw, 0x0F)};
4200 template <
typename T,
size_t N>
4201 HWY_API Vec128<T, N>
OddEven(
const Vec128<T, N> a,
const Vec128<T, N> b) {
4206 const Vec128<float, N> b) {
4207 #if HWY_TARGET == HWY_SSSE3
4210 const __m128 odd = _mm_shuffle_ps(a.raw, a.raw, _MM_SHUFFLE(3, 1, 3, 1));
4211 const __m128 even = _mm_shuffle_ps(b.raw, b.raw, _MM_SHUFFLE(2, 0, 2, 0));
4212 return Vec128<float, N>{_mm_unpacklo_ps(even, odd)};
4214 return Vec128<float, N>{_mm_blend_ps(a.raw, b.raw, 5)};
4225 template <
typename T,
size_t N>
4232 template <
typename T,
size_t N>
4243 #if HWY_TARGET > HWY_AVX3
4247 template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 2)>
4251 const Rebind<float, decltype(dw)> df;
4252 const auto zero =
Zero(
d);
4255 const auto upper = exp +
Set(
d, 0x3F80);
4257 const auto f0 =
ZipLower(dw, zero, upper);
4258 const auto f1 =
ZipUpper(dw, zero, upper);
4260 const Vec128<int32_t, N> bits0{_mm_cvtps_epi32(
BitCast(df, f0).raw)};
4261 const Vec128<int32_t, N> bits1{_mm_cvtps_epi32(
BitCast(df, f1).raw)};
4262 return Vec128<MakeUnsigned<T>,
N>{_mm_packus_epi32(bits0.raw, bits1.raw)};
// Builds the float 2^v directly from its bit pattern, then converts back to
// integer; used as a per-lane multiplier to emulate variable 32-bit shifts.
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
HWY_INLINE Vec128<MakeUnsigned<T>, N> Pow2(const Vec128<T, N> v) {
  const DFromV<decltype(v)> d;
  const auto exp = ShiftLeft<23>(v);
  const auto f = exp + Set(d, 0x3F800000);  // bit pattern of 1.0f
  return Vec128<MakeUnsigned<T>, N>{_mm_cvtps_epi32(_mm_castsi128_ps(f.raw))};
}
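// How this works: for a lane value k in [0, 31), (k << 23) places k in the
// binary32 exponent field; adding 0x3F800000 (the bit pattern of 1.0f) yields
// the float 2^k, and converting back to integer produces 1 << k. Multiplying
// by this value emulates per-lane variable shifts on targets without
// _mm_sllv_epi32.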
4283 #if HWY_TARGET <= HWY_AVX3
4297 #if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
4309 const Vec128<uint64_t> bits) {
4310 #if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
4312 const Vec128<uint64_t> out0{_mm_sll_epi64(v.raw, bits.raw)};
4313 const __m128i bits1 = _mm_unpackhi_epi64(bits.raw, bits.raw);
4314 const Vec128<uint64_t> out1{_mm_sll_epi64(v.raw, bits1)};
4317 return Vec128<uint64_t>{_mm_sllv_epi64(v.raw, bits.raw)};
4321 const Vec64<uint64_t> bits) {
4322 return Vec64<uint64_t>{_mm_sll_epi64(v.raw, bits.raw)};
4326 template <typename T, size_t N, HWY_IF_SIGNED(T)>
4343 #if HWY_TARGET <= HWY_AVX3
4361 #if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
4369 const auto out20 = ShiftRight<32>(MulEven(in, mul));
4386 const Vec128<uint64_t> bits) {
4387 #if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
4389 const Vec128<uint64_t> out0{_mm_srl_epi64(v.raw, bits.raw)};
4390 const __m128i bits1 = _mm_unpackhi_epi64(bits.raw, bits.raw);
4391 const Vec128<uint64_t> out1{_mm_srl_epi64(v.raw, bits1)};
4394 return Vec128<uint64_t>{_mm_srlv_epi64(v.raw, bits.raw)};
4398 const Vec64<uint64_t> bits) {
4399 return Vec64<uint64_t>{_mm_srl_epi64(v.raw, bits.raw)};
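Before AVX2 there is no per-lane variable shift for 64-bit lanes, so the SSE2/SSE4 paths above call `_mm_sll_epi64`/`_mm_srl_epi64` twice (each shifts the whole vector by the count held in the low 64 bits of its second operand) and then recombine the two results; AVX2 replaces this with a single `_mm_sllv_epi64`/`_mm_srlv_epi64`. A sketch of the pre-AVX2 left shift; the recombining blend at the end is my own choice, since the listing omits that step:

#include <emmintrin.h>

// Shifts each 64-bit lane of v left by the count in the corresponding lane of bits.
static inline __m128i ShlVar64(__m128i v, __m128i bits) {
  const __m128i out0  = _mm_sll_epi64(v, bits);          // both lanes shifted by bits[0]
  const __m128i bits1 = _mm_unpackhi_epi64(bits, bits);  // broadcast bits[1]
  const __m128i out1  = _mm_sll_epi64(v, bits1);         // both lanes shifted by bits[1]
  // Keep lane 0 of out0 and lane 1 of out1 (one possible recombine).
  return _mm_castpd_si128(
      _mm_move_sd(_mm_castsi128_pd(out1), _mm_castsi128_pd(out0)));
}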
4402 #if HWY_TARGET > HWY_AVX3
4406 template <class DI, class V>
4407 HWY_INLINE V SignedShr(const DI di, const V v, const V count_i) {
4408 const RebindToUnsigned<DI> du;
4409 const auto count = BitCast(du, count_i);
4413 const auto abs = BitCast(du, v ^ sign);
4414 return BitCast(di, abs >> count) ^ sign;
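SignedShr emulates an arithmetic right shift with the unsigned one: XOR-ing with the broadcast sign mask complements negative lanes, the logical shift then behaves like the arithmetic one on the complemented value, and a second XOR restores the sign. The same idea for a single lane, as a scalar sketch:

#include <cstdint>

// Arithmetic right shift emulated with a logical shift (count < 32 assumed):
// flip the bits of negative inputs, shift, then flip back.
static inline int32_t SignedShrEmul(int32_t v, uint32_t count) {
  const uint32_t sign = (v < 0) ? 0xFFFFFFFFu : 0u;            // broadcast sign mask
  const uint32_t flipped = static_cast<uint32_t>(v) ^ sign;    // ~v for negative v, v otherwise
  return static_cast<int32_t>((flipped >> count) ^ sign);      // undo the flip after the logical shift
}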
4423 #if HWY_TARGET <= HWY_AVX3
4437 #if HWY_TARGET <= HWY_AVX3
4451 #if HWY_TARGET <= HWY_AVX3
4461 const Vec128<uint64_t> b) {
4462 alignas(16) uint64_t mul[2];
4464 return Load(Full128<uint64_t>(), mul);
4468 const Vec128<uint64_t> b) {
4469 alignas(16) uint64_t mul[2];
4470 const Half<Full128<uint64_t>> d2;
4473 return Load(Full128<uint64_t>(), mul);
4480 Vec128<bfloat16_t, 2 * N> a,
4481 Vec128<bfloat16_t, 2 * N> b,
4482 const Vec128<float, N> sum0,
4483 Vec128<float, N>& sum1) {
4487 const Vec128<uint16_t, 2 * N> zero = Zero(du16);
4490 const Vec128<uint32_t, N> a0 = ZipLower(du32, zero, BitCast(du16, a));
4491 const Vec128<uint32_t, N> a1 = ZipUpper(du32, zero, BitCast(du16, a));
4492 const Vec128<uint32_t, N> b0 = ZipLower(du32, zero, BitCast(du16, b));
4493 const Vec128<uint32_t, N> b1 = ZipUpper(du32, zero, BitCast(du16, b));
4505 const Vec128<uint8_t, N> v) {
4506 #if HWY_TARGET == HWY_SSSE3
4507 const __m128i zero = _mm_setzero_si128();
4508 return Vec128<uint16_t, N>{_mm_unpacklo_epi8(v.raw, zero)};
4510 return Vec128<uint16_t, N>{_mm_cvtepu8_epi16(v.raw)};
4515 const Vec128<uint16_t, N> v) {
4516 #if HWY_TARGET == HWY_SSSE3
4517 return Vec128<uint32_t, N>{_mm_unpacklo_epi16(v.raw, _mm_setzero_si128())};
4519 return Vec128<uint32_t, N>{_mm_cvtepu16_epi32(v.raw)};
4524 const Vec128<uint32_t, N> v) {
4525 #if HWY_TARGET == HWY_SSSE3
4526 return Vec128<uint64_t, N>{_mm_unpacklo_epi32(v.raw, _mm_setzero_si128())};
4528 return Vec128<uint64_t, N>{_mm_cvtepu32_epi64(v.raw)};
4533 const Vec128<uint8_t, N> v) {
4534 #if HWY_TARGET == HWY_SSSE3
4535 const __m128i zero = _mm_setzero_si128();
4536 const __m128i u16 = _mm_unpacklo_epi8(v.raw, zero);
4537 return Vec128<uint32_t, N>{_mm_unpacklo_epi16(u16, zero)};
4539 return Vec128<uint32_t, N>{_mm_cvtepu8_epi32(v.raw)};
4546 const Vec128<uint8_t, N> v) {
4551 const Vec128<uint16_t, N> v) {
4556 const Vec128<uint8_t, N> v) {
4563 const Vec128<int8_t, N> v) {
4564 #if HWY_TARGET == HWY_SSSE3
4565 return ShiftRight<8>(Vec128<int16_t, N>{_mm_unpacklo_epi8(v.raw, v.raw)});
4567 return Vec128<int16_t, N>{_mm_cvtepi8_epi16(v.raw)};
4572 const Vec128<int16_t, N> v) {
4573 #if HWY_TARGET == HWY_SSSE3
4574 return ShiftRight<16>(Vec128<int32_t, N>{_mm_unpacklo_epi16(v.raw, v.raw)});
4576 return Vec128<int32_t, N>{_mm_cvtepi16_epi32(v.raw)};
4581 const Vec128<int32_t, N> v) {
4582 #if HWY_TARGET == HWY_SSSE3
4583 return ShiftRight<32>(Vec128<int64_t, N>{_mm_unpacklo_epi32(v.raw, v.raw)});
4585 return Vec128<int64_t, N>{_mm_cvtepi32_epi64(v.raw)};
4590 const Vec128<int8_t, N> v) {
4591 #if HWY_TARGET == HWY_SSSE3
4592 const __m128i x2 = _mm_unpacklo_epi8(v.raw, v.raw);
4593 const __m128i x4 = _mm_unpacklo_epi16(x2, x2);
4594 return ShiftRight<24>(Vec128<int32_t, N>{x4});
4596 return Vec128<int32_t, N>{_mm_cvtepi8_epi32(v.raw)};
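Each PromoteTo overload above either uses SSE4.1's `_mm_cvtepu*`/`_mm_cvtepi*` extensions or, on SSSE3, rebuilds them from unpacks: interleave with zero for zero-extension, or interleave the value with itself and shift right arithmetically for sign-extension. A standalone sketch of the u8 -> u32 case (hypothetical helper name):

#include <emmintrin.h>
#include <smmintrin.h>

// Zero-extends the low four uint8 lanes of v to uint32.
static inline __m128i PromoteU8ToU32(__m128i v) {
#if defined(__SSE4_1__)
  return _mm_cvtepu8_epi32(v);
#else
  const __m128i zero = _mm_setzero_si128();
  const __m128i u16 = _mm_unpacklo_epi8(v, zero);   // u8 -> u16
  return _mm_unpacklo_epi16(u16, zero);             // u16 -> u32
#endif
}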
4602 #if HWY_IS_MSAN && (HWY_COMPILER_CLANG != 0 && HWY_COMPILER_CLANG < 1100)
4603 #define HWY_INLINE_F16 HWY_NOINLINE
4605 #define HWY_INLINE_F16 HWY_INLINE
4610 #if HWY_TARGET >= HWY_SSE4 || defined(HWY_DISABLE_F16C)
4615 const auto sign = ShiftRight<15>(bits16);
4616 const auto biased_exp = ShiftRight<10>(bits16) & Set(du32, 0x1F);
4617 const auto mantissa = bits16 & Set(du32, 0x3FF);
4618 const auto subnormal =
4620 Set(df32, 1.0f / 16384 / 1024));
4622 const auto biased_exp32 = biased_exp + Set(du32, 127 - 15);
4623 const auto mantissa32 = ShiftLeft<23 - 10>(mantissa);
4624 const auto normal = ShiftLeft<23>(biased_exp32) | mantissa32;
4625 const auto bits32 = IfThenElse(biased_exp == Zero(du32), subnormal, normal);
4626 return BitCast(df32, ShiftLeft<31>(sign) | bits32);
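Without F16C, the f16 -> f32 path above decodes the half-precision fields by hand: extract sign, exponent and mantissa, rescale subnormals with an exact small multiply (2^-24), and re-bias normals from 15 to 127. A scalar rendering of the same decomposition; like the vector code it does not special-case Inf/NaN:

#include <cstdint>
#include <cstring>

// Decodes an IEEE binary16 value to float, mirroring the non-F16C path.
static inline float F16ToF32(uint16_t bits16) {
  const uint32_t sign = bits16 >> 15;
  const uint32_t biased_exp = (bits16 >> 10) & 0x1F;
  const uint32_t mantissa = bits16 & 0x3FF;
  if (biased_exp == 0) {  // zero or subnormal: value is mantissa * 2^-24
    const float subnormal = static_cast<float>(mantissa) * (1.0f / 16384 / 1024);
    return sign ? -subnormal : subnormal;
  }
  const uint32_t bits32 =
      (sign << 31) | ((biased_exp + (127 - 15)) << 23) | (mantissa << 13);
  float f;
  std::memcpy(&f, &bits32, sizeof(f));
  return f;
}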
4635 const Vec128<bfloat16_t, N> v) {
4636 const Rebind<uint16_t, decltype(df32)> du16;
4649 const Vec128<int32_t, N> v) {
4650 return Vec128<double, N>{_mm_cvtepi32_pd(v.raw)};
4657 const Vec128<int32_t, N> v) {
4658 #if HWY_TARGET == HWY_SSSE3
4659 const Simd<int32_t, N, 0> di32;
4660 const Simd<uint16_t, N * 2, 0> du16;
4661 const auto zero_if_neg = AndNot(ShiftRight<31>(v), v);
4663 const auto clamped = Or(zero_if_neg, too_big);
4665 alignas(16) constexpr uint16_t kLower2Bytes[16] = {
4666 0x0100, 0x0504, 0x0908, 0x0D0C, 0x8080, 0x8080, 0x8080, 0x8080};
4667 const auto lo2 = Load(du16, kLower2Bytes);
4670 return Vec128<uint16_t, N>{_mm_packus_epi32(v.raw, v.raw)};
4676 const Vec128<int32_t, N> v) {
4677 return Vec128<int16_t, N>{_mm_packs_epi32(v.raw, v.raw)};
4682 const Vec128<int32_t, N> v) {
4683 const __m128i i16 = _mm_packs_epi32(v.raw, v.raw);
4684 return Vec128<uint8_t, N>{_mm_packus_epi16(i16, i16)};
4689 const Vec128<int16_t, N> v) {
4690 return Vec128<uint8_t, N>{_mm_packus_epi16(v.raw, v.raw)};
4695 const Vec128<int32_t, N> v) {
4696 const __m128i i16 = _mm_packs_epi32(v.raw, v.raw);
4697 return Vec128<int8_t, N>{_mm_packs_epi16(i16, i16)};
4702 const Vec128<int16_t, N> v) {
4703 return Vec128<int8_t, N>{_mm_packs_epi16(v.raw, v.raw)};
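The integer DemoteTo overloads above all lean on the saturating pack instructions; i32 -> u8, for example, packs to i16 with signed saturation and then to u8 with unsigned saturation, so out-of-range values clamp to [0, 255]. A sketch with a hypothetical helper name:

#include <emmintrin.h>

// Demotes four int32 lanes to uint8 with saturation, as the i32 -> u8 path does.
static inline __m128i DemoteI32ToU8(__m128i v) {
  const __m128i i16 = _mm_packs_epi32(v, v);    // signed saturation to int16
  return _mm_packus_epi16(i16, i16);            // unsigned saturation to uint8
}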
4708 const Vec128<float, N> v) {
4709 #if HWY_TARGET >= HWY_SSE4 || defined(HWY_DISABLE_F16C)
4711 const Rebind<uint32_t, decltype(df16)> du;
4713 const auto bits32 = BitCast(du, v);
4714 const auto sign = ShiftRight<31>(bits32);
4715 const auto biased_exp32 = ShiftRight<23>(bits32) & Set(du, 0xFF);
4716 const auto mantissa32 = bits32 & Set(du, 0x7FFFFF);
4718 const auto k15 = Set(di, 15);
4719 const auto exp = Min(BitCast(di, biased_exp32) - Set(di, 127), k15);
4720 const auto is_tiny = exp < Set(di, -24);
4722 const auto is_subnormal = exp < Set(di, -14);
4723 const auto biased_exp16 =
4725 const auto sub_exp = BitCast(du, Set(di, -14) - exp);
4726 const auto sub_m = (Set(du, 1) << (Set(du, 10) - sub_exp)) +
4727 (mantissa32 >> (Set(du, 13) + sub_exp));
4729 ShiftRight<13>(mantissa32));
4731 const auto sign16 = ShiftLeft<15>(sign);
4732 const auto normal16 = sign16 | ShiftLeft<10>(biased_exp16) | mantissa16;
4737 return Vec128<float16_t, N>{_mm_cvtps_ph(v.raw, _MM_FROUND_NO_EXC)};
4743 const Vec128<float, N> v) {
4745 const Rebind<int32_t, decltype(dbf16)> di32;
4746 const Rebind<uint32_t, decltype(dbf16)> du32;
4747 const Rebind<uint16_t, decltype(dbf16)> du16;
4748 const auto bits_in_32 = BitCast(di32, ShiftRight<16>(BitCast(du32, v)));
4754 Simd<bfloat16_t, 2 * N, 0> dbf16, Vec128<float, N> a, Vec128<float, N> b) {
4757 const Repartition<uint32_t, decltype(dbf16)> du32;
4758 const Vec128<uint32_t, N> b_in_even = ShiftRight<16>(BitCast(du32, b));
-> decltype(Zero(d)) {
4777 return Min(v, Set(d, 2147483647.0));
4783 template <class DI, class DF = RebindToFloat<DI>>
4785 decltype(Zero(di).raw) converted_raw)
4792 const auto converted = VFromD<DI>{converted_raw};
4793 const auto sign_wrong = AndNot(BitCast(di, original), converted);
4794 #if HWY_COMPILER_GCC && !HWY_COMPILER_CLANG
4810 const Vec128<double, N> v) {
4812 return Vec128<int32_t, N>{_mm_cvttpd_epi32(clamped.raw)};
4818 const Simd<uint32_t, N, 0> d32;
4819 const Simd<uint8_t, N * 4, 0> d8;
4820 alignas(16) static constexpr uint32_t k8From32[4] = {
4821 0x0C080400u, 0x0C080400u, 0x0C080400u, 0x0C080400u};
4831 const Vec128<int32_t, N> v) {
4832 return Vec128<float, N>{_mm_cvtepi32_ps(v.raw)};
4838 #if HWY_TARGET <= HWY_AVX3
4847 const auto k84_63 = Set(d64, 0x4530000080000000ULL);
4848 const auto v_upper = BitCast(dd, ShiftRight<32>(BitCast(d64, v)) ^ k84_63);
4851 const auto k52 = Set(d32, 0x43300000);
4854 const auto k84_63_52 = BitCast(dd, Set(d64, 0x4530000080100000ULL));
4855 return (v_upper - k84_63_52) + v_lower;
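Without AVX-512 there is no packed conversion from signed 64-bit integers to double, so the code above uses the classic magic-constant trick: the upper and lower 32-bit halves are planted in the mantissas of two large doubles (with exponents 2^84 and 2^52), and subtracting the combined bias recovers the value with a single rounding in the final addition. A scalar sketch of how I read those constants (an assumption on my part, not taken verbatim from the listing):

#include <cstdint>
#include <cstring>

static inline double BitsToDouble(uint64_t bits) {
  double d;
  std::memcpy(&d, &bits, sizeof(d));
  return d;
}

// int64 -> double via the magic-constant trick.
static inline double I64ToF64(int64_t v) {
  const uint64_t u = static_cast<uint64_t>(v);
  // Upper 32 bits (offset by 2^31 to handle the sign) below the 2^84 exponent.
  const double hi = BitsToDouble(0x4530000000000000ull | ((u >> 32) ^ 0x80000000ull));
  // Lower 32 bits below the 2^52 exponent.
  const double lo = BitsToDouble(0x4330000000000000ull | (u & 0xFFFFFFFFull));
  // Subtract the combined bias 2^84 + 2^63 + 2^52, then add the halves.
  return (hi - BitsToDouble(0x4530000080100000ull)) + lo;
}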
4862 const Vec128<float, N> v) {
4868 #if HWY_TARGET <= HWY_AVX3 && HWY_ARCH_X86_64
4870 #elif HWY_ARCH_X86_64
4871 const __m128i i0 = _mm_cvtsi64_si128(_mm_cvttsd_si64(v.raw));
4873 const __m128i i1 = _mm_cvtsi64_si128(_mm_cvttsd_si64(UpperHalf(dd2, v).raw));
4876 using VI = VFromD<decltype(di)>;
4877 const VI k0 = Zero(di);
4878 const VI k1 = Set(di, 1);
4879 const VI k51 = Set(di, 51);
4882 const VI biased_exp = ShiftRight<52>(BitCast(di, v)) & Set(di, 0x7FF);
4883 const VI exp = biased_exp - Set(di, 0x3FF);
4884 const auto in_range = exp < Set(di, 63);
4892 const VI shift_mnt = Max(k51 - exp, k0);
4893 const VI shift_int = Max(exp - k51, k0);
4894 const VI mantissa = BitCast(di, v) & Set(di, (1ULL << 52) - 1);
4896 const VI int52 = (mantissa | Set(di, 1ULL << 52)) >> (shift_mnt + k1);
4898 const VI shifted = int52 << shift_int;
4900 const VI restored = shifted | ((mantissa & k1) << (shift_int - k1));
4904 const VI limit = Set(di, LimitsMax<int64_t>()) - sign_mask;
4905 const VI magnitude = IfThenElse(in_range, restored, limit);
4908 return (magnitude ^ sign_mask) - sign_mask;
4913 #if HWY_TARGET > HWY_AVX3 && HWY_ARCH_X86_64
4925 const Simd<int32_t, N, 0> di;
4931 #if HWY_TARGET == HWY_SSSE3
4934 template <typename T, size_t N, HWY_IF_FLOAT(T)>
4940 const auto max = Set(df, MantissaEnd<T>());
4942 const auto added = large + v;
4943 const auto rounded = added - large;
4953 template <typename T, size_t N, HWY_IF_FLOAT(T)>
4961 template <typename T, size_t N, HWY_IF_FLOAT(T)>
4967 const auto int_f = ConvertTo(df, integer);
4973 template <typename T, size_t N, HWY_IF_FLOAT(T)>
4979 const auto int_f = ConvertTo(df, integer);
4988 template <typename T, size_t N, HWY_IF_FLOAT(T)>
4994 const auto int_f = ConvertTo(df, integer);
5007 return Vec128<float, N>{
5008 _mm_round_ps(v.raw, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)};
5011 HWY_API Vec128<double, N> Round(const Vec128<double, N> v) {
5012 return Vec128<double, N>{
5013 _mm_round_pd(v.raw, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)};
5019 return Vec128<float, N>{
5020 _mm_round_ps(v.raw, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)};
5023 HWY_API Vec128<double, N> Trunc(const Vec128<double, N> v) {
5024 return Vec128<double, N>{
5025 _mm_round_pd(v.raw, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)};
5030 HWY_API Vec128<float, N> Ceil(const Vec128<float, N> v) {
5031 return Vec128<float, N>{
5032 _mm_round_ps(v.raw, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)};
5035 HWY_API Vec128<double, N> Ceil(const Vec128<double, N> v) {
5036 return Vec128<double, N>{
5037 _mm_round_pd(v.raw, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)};
5043 return Vec128<float, N>{
5044 _mm_round_ps(v.raw, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)};
5047 HWY_API Vec128<double, N> Floor(const Vec128<double, N> v) {
5048 return Vec128<double, N>{
5049 _mm_round_pd(v.raw, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)};
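On SSE4.1+ the four rounding modes are single `_mm_round_ps`/`_mm_round_pd` instructions with the matching `_MM_FROUND_*` immediate; the SSSE3-only path further up instead adds and subtracts a large power of two (MantissaEnd) so that the FPU's round-to-nearest does the work. One scalar rendering of that fallback idea, assuming the default round-to-nearest-even mode is active:

#include <cmath>

// Round-to-nearest-even for |x| below 2^52: adding and subtracting a constant
// at the end of the mantissa range forces the FPU to discard the fraction.
static inline double RoundViaLargeConstant(double x) {
  const double end = 4503599627370496.0;                 // 2^52 == MantissaEnd<double>()
  if (std::fabs(x) >= end) return x;                     // already an integer
  const double large = std::copysign(end, x);            // match the sign so the trick works for negatives
  const double rounded = (x + large) - large;
  return std::copysign(rounded, x);                      // preserve -0.0 and values rounding to zero
}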
5056 #if !defined(HWY_DISABLE_PCLMUL_AES) && HWY_TARGET != HWY_SSSE3
5059 #ifdef HWY_NATIVE_AES
5060 #undef HWY_NATIVE_AES
5062 #define HWY_NATIVE_AES
5066 Vec128<uint8_t> round_key) {
5067 return Vec128<uint8_t>{_mm_aesenc_si128(state.raw, round_key.raw)};
5071 Vec128<uint8_t> round_key) {
5072 return Vec128<uint8_t>{_mm_aesenclast_si128(state.raw, round_key.raw)};
5075 template <size_t N, HWY_IF_LE128(uint64_t, N)>
5077 Vec128<uint64_t, N> b) {
5078 return Vec128<uint64_t, N>{_mm_clmulepi64_si128(a.raw, b.raw, 0x00)};
5081 template <size_t N, HWY_IF_LE128(uint64_t, N)>
5083 Vec128<uint64_t, N> b) {
5084 return Vec128<uint64_t, N>{_mm_clmulepi64_si128(a.raw, b.raw, 0x11)};
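CLMulLower/CLMulUpper wrap PCLMULQDQ; the immediate selects which 64-bit half of each operand enters the carry-less multiply (0x00 = both low halves, 0x11 = both high halves). A small usage sketch computing one 64x64 -> 128-bit carry-less product (the helper name is mine):

#include <wmmintrin.h>
#include <cstdint>

// Carry-less multiply of two 64-bit values; the 128-bit product spans both
// 64-bit lanes of the result. Requires PCLMUL support.
static inline __m128i ClMul64(uint64_t a, uint64_t b) {
  const __m128i va = _mm_set_epi64x(0, static_cast<long long>(a));
  const __m128i vb = _mm_set_epi64x(0, static_cast<long long>(b));
  return _mm_clmulepi64_si128(va, vb, 0x00);  // low half of va times low half of vb
}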
5091 template <typename T>
5092 struct CompressIsPartition {
5093 #if HWY_TARGET <= HWY_AVX3
5099 enum { value = (sizeof(T) == 8) };
5105 #if HWY_TARGET <= HWY_AVX3
5110 template <typename T, size_t N, HWY_IF_LE128(T, N)>
5113 uint64_t mask_bits = 0;
5114 constexpr size_t kNumBytes = (N + 7) / 8;
5115 CopyBytes<kNumBytes>(bits, &mask_bits);
5117 mask_bits &= (1ull << N) - 1;
5126 template <typename T, size_t N>
5128 const Mask128<T, N> mask, uint8_t* bits) {
5129 constexpr size_t kNumBytes = (N + 7) / 8;
5130 CopyBytes<kNumBytes>(&mask.raw, bits);
5134 const int mask = (1 << N) - 1;
5135 bits[0] = static_cast<uint8_t>(bits[0] & mask);
5145 template <typename T, size_t N>
5148 const uint64_t mask_bits = static_cast<uint64_t>(mask.raw) & ((1u << N) - 1);
5152 template <typename T, size_t N>
5154 const Mask128<T, N> mask) {
5155 const uint32_t mask_bits = static_cast<uint32_t>(mask.raw) & ((1u << N) - 1);
5159 template <typename T, size_t N>
5161 const uint64_t mask_bits = static_cast<uint64_t>(mask.raw) & ((1u << N) - 1);
5162 return mask_bits == 0;
5165 template <typename T, size_t N>
5166 HWY_API bool AllTrue(const Simd<T, N, 0> , const Mask128<T, N> mask) {
5167 const uint64_t mask_bits = static_cast<uint64_t>(mask.raw) & ((1u << N) - 1);
5169 return mask_bits == (1u << N) - 1;
5174 #if HWY_TARGET != HWY_AVX3_DL
5178 HWY_INLINE Vec128<uint16_t> IndicesForCompress16(uint64_t mask_bits) {
5179 Full128<uint16_t> du16;
5183 Rebind<uint8_t, decltype(du16)> du8;
5184 alignas(16) constexpr uint8_t tbl[2048] = {
5185 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
5186 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 1, 2,
5187 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0,
5188 0, 0, 0, 0, 0, 1, 3, 0, 0, 0, 0, 0, 0, 0, 1, 3, 0, 0, 0, 0, 0, 2, 3, 0, 0,
5189 0, 0, 0, 0, 0, 2, 3, 0, 0, 0, 0, 0, 1, 2, 3, 0, 0, 0, 0, 0, 0, 1, 2, 3, 0,
5190 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 1, 4, 0, 0, 0, 0,
5191 0, 0, 0, 1, 4, 0, 0, 0, 0, 0, 2, 4, 0, 0, 0, 0, 0, 0, 0, 2, 4, 0, 0, 0, 0,
5192 0, 1, 2, 4, 0, 0, 0, 0, 0, 0, 1, 2, 4, 0, 0, 0, 0, 3, 4, 0, 0, 0, 0, 0, 0,
5193 0, 3, 4, 0, 0, 0, 0, 0, 1, 3, 4, 0, 0, 0, 0, 0, 0, 1, 3, 4, 0, 0, 0, 0, 2,
5194 3, 4, 0, 0, 0, 0, 0, 0, 2, 3, 4, 0, 0, 0, 0, 1, 2, 3, 4, 0, 0, 0, 0, 0, 1,
5195 2, 3, 4, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 1, 5, 0,
5196 0, 0, 0, 0, 0, 0, 1, 5, 0, 0, 0, 0, 0, 2, 5, 0, 0, 0, 0, 0, 0, 0, 2, 5, 0,
5197 0, 0, 0, 0, 1, 2, 5, 0, 0, 0, 0, 0, 0, 1, 2, 5, 0, 0, 0, 0, 3, 5, 0, 0, 0,
5198 0, 0, 0, 0, 3, 5, 0, 0, 0, 0, 0, 1, 3, 5, 0, 0, 0, 0, 0, 0, 1, 3, 5, 0, 0,
5199 0, 0, 2, 3, 5, 0, 0, 0, 0, 0, 0, 2, 3, 5, 0, 0, 0, 0, 1, 2, 3, 5, 0, 0, 0,
5200 0, 0, 1, 2, 3, 5, 0, 0, 0, 4, 5, 0, 0, 0, 0, 0, 0, 0, 4, 5, 0, 0, 0, 0, 0,
5201 1, 4, 5, 0, 0, 0, 0, 0, 0, 1, 4, 5, 0, 0, 0, 0, 2, 4, 5, 0, 0, 0, 0, 0, 0,
5202 2, 4, 5, 0, 0, 0, 0, 1, 2, 4, 5, 0, 0, 0, 0, 0, 1, 2, 4, 5, 0, 0, 0, 3, 4,
5203 5, 0, 0, 0, 0, 0, 0, 3, 4, 5, 0, 0, 0, 0, 1, 3, 4, 5, 0, 0, 0, 0, 0, 1, 3,
5204 4, 5, 0, 0, 0, 2, 3, 4, 5, 0, 0, 0, 0, 0, 2, 3, 4, 5, 0, 0, 0, 1, 2, 3, 4,
5205 5, 0, 0, 0, 0, 1, 2, 3, 4, 5, 0, 0, 6, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 0, 0,
5206 0, 0, 0, 1, 6, 0, 0, 0, 0, 0, 0, 0, 1, 6, 0, 0, 0, 0, 0, 2, 6, 0, 0, 0, 0,
5207 0, 0, 0, 2, 6, 0, 0, 0, 0, 0, 1, 2, 6, 0, 0, 0, 0, 0, 0, 1, 2, 6, 0, 0, 0,
5208 0, 3, 6, 0, 0, 0, 0, 0, 0, 0, 3, 6, 0, 0, 0, 0, 0, 1, 3, 6, 0, 0, 0, 0, 0,
5209 0, 1, 3, 6, 0, 0, 0, 0, 2, 3, 6, 0, 0, 0, 0, 0, 0, 2, 3, 6, 0, 0, 0, 0, 1,
5210 2, 3, 6, 0, 0, 0, 0, 0, 1, 2, 3, 6, 0, 0, 0, 4, 6, 0, 0, 0, 0, 0, 0, 0, 4,
5211 6, 0, 0, 0, 0, 0, 1, 4, 6, 0, 0, 0, 0, 0, 0, 1, 4, 6, 0, 0, 0, 0, 2, 4, 6,
5212 0, 0, 0, 0, 0, 0, 2, 4, 6, 0, 0, 0, 0, 1, 2, 4, 6, 0, 0, 0, 0, 0, 1, 2, 4,
5213 6, 0, 0, 0, 3, 4, 6, 0, 0, 0, 0, 0, 0, 3, 4, 6, 0, 0, 0, 0, 1, 3, 4, 6, 0,
5214 0, 0, 0, 0, 1, 3, 4, 6, 0, 0, 0, 2, 3, 4, 6, 0, 0, 0, 0, 0, 2, 3, 4, 6, 0,
5215 0, 0, 1, 2, 3, 4, 6, 0, 0, 0, 0, 1, 2, 3, 4, 6, 0, 0, 5, 6, 0, 0, 0, 0, 0,
5216 0, 0, 5, 6, 0, 0, 0, 0, 0, 1, 5, 6, 0, 0, 0, 0, 0, 0, 1, 5, 6, 0, 0, 0, 0,
5217 2, 5, 6, 0, 0, 0, 0, 0, 0, 2, 5, 6, 0, 0, 0, 0, 1, 2, 5, 6, 0, 0, 0, 0, 0,
5218 1, 2, 5, 6, 0, 0, 0, 3, 5, 6, 0, 0, 0, 0, 0, 0, 3, 5, 6, 0, 0, 0, 0, 1, 3,
5219 5, 6, 0, 0, 0, 0, 0, 1, 3, 5, 6, 0, 0, 0, 2, 3, 5, 6, 0, 0, 0, 0, 0, 2, 3,
5220 5, 6, 0, 0, 0, 1, 2, 3, 5, 6, 0, 0, 0, 0, 1, 2, 3, 5, 6, 0, 0, 4, 5, 6, 0,
5221 0, 0, 0, 0, 0, 4, 5, 6, 0, 0, 0, 0, 1, 4, 5, 6, 0, 0, 0, 0, 0, 1, 4, 5, 6,
5222 0, 0, 0, 2, 4, 5, 6, 0, 0, 0, 0, 0, 2, 4, 5, 6, 0, 0, 0, 1, 2, 4, 5, 6, 0,
5223 0, 0, 0, 1, 2, 4, 5, 6, 0, 0, 3, 4, 5, 6, 0, 0, 0, 0, 0, 3, 4, 5, 6, 0, 0,
5224 0, 1, 3, 4, 5, 6, 0, 0, 0, 0, 1, 3, 4, 5, 6, 0, 0, 2, 3, 4, 5, 6, 0, 0, 0,
5225 0, 2, 3, 4, 5, 6, 0, 0, 1, 2, 3, 4, 5, 6, 0, 0, 0, 1, 2, 3, 4, 5, 6, 0, 7,
5226 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 1, 7, 0, 0, 0, 0, 0, 0, 0, 1,
5227 7, 0, 0, 0, 0, 0, 2, 7, 0, 0, 0, 0, 0, 0, 0, 2, 7, 0, 0, 0, 0, 0, 1, 2, 7,
5228 0, 0, 0, 0, 0, 0, 1, 2, 7, 0, 0, 0, 0, 3, 7, 0, 0, 0, 0, 0, 0, 0, 3, 7, 0,
5229 0, 0, 0, 0, 1, 3, 7, 0, 0, 0, 0, 0, 0, 1, 3, 7, 0, 0, 0, 0, 2, 3, 7, 0, 0,
5230 0, 0, 0, 0, 2, 3, 7, 0, 0, 0, 0, 1, 2, 3, 7, 0, 0, 0, 0, 0, 1, 2, 3, 7, 0,
5231 0, 0, 4, 7, 0, 0, 0, 0, 0, 0, 0, 4, 7, 0, 0, 0, 0, 0, 1, 4, 7, 0, 0, 0, 0,
5232 0, 0, 1, 4, 7, 0, 0, 0, 0, 2, 4, 7, 0, 0, 0, 0, 0, 0, 2, 4, 7, 0, 0, 0, 0,
5233 1, 2, 4, 7, 0, 0, 0, 0, 0, 1, 2, 4, 7, 0, 0, 0, 3, 4, 7, 0, 0, 0, 0, 0, 0,
5234 3, 4, 7, 0, 0, 0, 0, 1, 3, 4, 7, 0, 0, 0, 0, 0, 1, 3, 4, 7, 0, 0, 0, 2, 3,
5235 4, 7, 0, 0, 0, 0, 0, 2, 3, 4, 7, 0, 0, 0, 1, 2, 3, 4, 7, 0, 0, 0, 0, 1, 2,
5236 3, 4, 7, 0, 0, 5, 7, 0, 0, 0, 0, 0, 0, 0, 5, 7, 0, 0, 0, 0, 0, 1, 5, 7, 0,
5237 0, 0, 0, 0, 0, 1, 5, 7, 0, 0, 0, 0, 2, 5, 7, 0, 0, 0, 0, 0, 0, 2, 5, 7, 0,
5238 0, 0, 0, 1, 2, 5, 7, 0, 0, 0, 0, 0, 1, 2, 5, 7, 0, 0, 0, 3, 5, 7, 0, 0, 0,
5239 0, 0, 0, 3, 5, 7, 0, 0, 0, 0, 1, 3, 5, 7, 0, 0, 0, 0, 0, 1, 3, 5, 7, 0, 0,
5240 0, 2, 3, 5, 7, 0, 0, 0, 0, 0, 2, 3, 5, 7, 0, 0, 0, 1, 2, 3, 5, 7, 0, 0, 0,
5241 0, 1, 2, 3, 5, 7, 0, 0, 4, 5, 7, 0, 0, 0, 0, 0, 0, 4, 5, 7, 0, 0, 0, 0, 1,
5242 4, 5, 7, 0, 0, 0, 0, 0, 1, 4, 5, 7, 0, 0, 0, 2, 4, 5, 7, 0, 0, 0, 0, 0, 2,
5243 4, 5, 7, 0, 0, 0, 1, 2, 4, 5, 7, 0, 0, 0, 0, 1, 2, 4, 5, 7, 0, 0, 3, 4, 5,
5244 7, 0, 0, 0, 0, 0, 3, 4, 5, 7, 0, 0, 0, 1, 3, 4, 5, 7, 0, 0, 0, 0, 1, 3, 4,
5245 5, 7, 0, 0, 2, 3, 4, 5, 7, 0, 0, 0, 0, 2, 3, 4, 5, 7, 0, 0, 1, 2, 3, 4, 5,
5246 7, 0, 0, 0, 1, 2, 3, 4, 5, 7, 0, 6, 7, 0, 0, 0, 0, 0, 0, 0, 6, 7, 0, 0, 0,
5247 0, 0, 1, 6, 7, 0, 0, 0, 0, 0, 0, 1, 6, 7, 0, 0, 0, 0, 2, 6, 7, 0, 0, 0, 0,
5248 0, 0, 2, 6, 7, 0, 0, 0, 0, 1, 2, 6, 7, 0, 0, 0, 0, 0, 1, 2, 6, 7, 0, 0, 0,
5249 3, 6, 7, 0, 0, 0, 0, 0, 0, 3, 6, 7, 0, 0, 0, 0, 1, 3, 6, 7, 0, 0, 0, 0, 0,
5250 1, 3, 6, 7, 0, 0, 0, 2, 3, 6, 7, 0, 0, 0, 0, 0, 2, 3, 6, 7, 0, 0, 0, 1, 2,
5251 3, 6, 7, 0, 0, 0, 0, 1, 2, 3, 6, 7, 0, 0, 4, 6, 7, 0, 0, 0, 0, 0, 0, 4, 6,
5252 7, 0, 0, 0, 0, 1, 4, 6, 7, 0, 0, 0, 0, 0, 1, 4, 6, 7, 0, 0, 0, 2, 4, 6, 7,
5253 0, 0, 0, 0, 0, 2, 4, 6, 7, 0, 0, 0, 1, 2, 4, 6, 7, 0, 0, 0, 0, 1, 2, 4, 6,
5254 7, 0, 0, 3, 4, 6, 7, 0, 0, 0, 0, 0, 3, 4, 6, 7, 0, 0, 0, 1, 3, 4, 6, 7, 0,
5255 0, 0, 0, 1, 3, 4, 6, 7, 0, 0, 2, 3, 4, 6, 7, 0, 0, 0, 0, 2, 3, 4, 6, 7, 0,
5256 0, 1, 2, 3, 4, 6, 7, 0, 0, 0, 1, 2, 3, 4, 6, 7, 0, 5, 6, 7, 0, 0, 0, 0, 0,
5257 0, 5, 6, 7, 0, 0, 0, 0, 1, 5, 6, 7, 0, 0, 0, 0, 0, 1, 5, 6, 7, 0, 0, 0, 2,
5258 5, 6, 7, 0, 0, 0, 0, 0, 2, 5, 6, 7, 0, 0, 0, 1, 2, 5, 6, 7, 0, 0, 0, 0, 1,
5259 2, 5, 6, 7, 0, 0, 3, 5, 6, 7, 0, 0, 0, 0, 0, 3, 5, 6, 7, 0, 0, 0, 1, 3, 5,
5260 6, 7, 0, 0, 0, 0, 1, 3, 5, 6, 7, 0, 0, 2, 3, 5, 6, 7, 0, 0, 0, 0, 2, 3, 5,
5261 6, 7, 0, 0, 1, 2, 3, 5, 6, 7, 0, 0, 0, 1, 2, 3, 5, 6, 7, 0, 4, 5, 6, 7, 0,
5262 0, 0, 0, 0, 4, 5, 6, 7, 0, 0, 0, 1, 4, 5, 6, 7, 0, 0, 0, 0, 1, 4, 5, 6, 7,
5263 0, 0, 2, 4, 5, 6, 7, 0, 0, 0, 0, 2, 4, 5, 6, 7, 0, 0, 1, 2, 4, 5, 6, 7, 0,
5264 0, 0, 1, 2, 4, 5, 6, 7, 0, 3, 4, 5, 6, 7, 0, 0, 0, 0, 3, 4, 5, 6, 7, 0, 0,
5265 1, 3, 4, 5, 6, 7, 0, 0, 0, 1, 3, 4, 5, 6, 7, 0, 2, 3, 4, 5, 6, 7, 0, 0, 0,
5266 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 0, 1, 2, 3, 4, 5, 6, 7};
5273 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
5276 const Rebind<uint16_t, decltype(d)> du;
5279 #if HWY_TARGET == HWY_AVX3_DL
5282 const auto idx = detail::IndicesForCompress16(uint64_t{mask.raw});
5288 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
5290 return Vec128<T, N>{_mm_maskz_compress_epi32(mask.raw, v.raw)};
5298 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
5303 alignas(16) constexpr uint8_t packed_array[64] = {
5304 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
5305 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
5306 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7,
5307 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
5309 const Simd<T, N, 0> d;
5311 const auto index = Load(d8, packed_array + 16 * mask.raw);
5317 template <typename T, size_t N>
5325 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
5328 const Rebind<uint16_t, decltype(d)> du;
5331 const uint64_t mask_bits{mask.raw};
5333 #if HWY_TARGET == HWY_AVX3_DL
5334 _mm_mask_compressstoreu_epi16(unaligned, mask.raw, vu.raw);
5336 const auto idx = detail::IndicesForCompress16(mask_bits);
5341 const size_t count = PopCount(mask_bits & ((1ull << N) - 1));
5344 __msan_unpoison(unaligned, count * sizeof(T));
5349 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
5353 _mm_mask_compressstoreu_epi32(unaligned, mask.raw, v.raw);
5354 const size_t count = PopCount(uint64_t{mask.raw} & ((1ull << N) - 1));
5357 __msan_unpoison(unaligned, count * sizeof(T));
5362 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
5366 _mm_mask_compressstoreu_epi64(unaligned, mask.raw, v.raw);
5367 const size_t count = PopCount(uint64_t{mask.raw} & ((1ull << N) - 1));
5370 __msan_unpoison(unaligned, count * sizeof(T));
5375 template <size_t N, HWY_IF_LE128(float, N)>
5379 _mm_mask_compressstoreu_ps(unaligned, mask.raw, v.raw);
5380 const size_t count = PopCount(uint64_t{mask.raw} & ((1ull << N) - 1));
5383 __msan_unpoison(unaligned, count * sizeof(float));
5388 template <size_t N, HWY_IF_LE128(double, N)>
5392 _mm_mask_compressstoreu_pd(unaligned, mask.raw, v.raw);
5393 const size_t count = PopCount(uint64_t{mask.raw} & ((1ull << N) - 1));
5396 __msan_unpoison(unaligned, count * sizeof(double));
5402 template <typename T, size_t N>
5410 if (N != 16 / sizeof(T)) {
5416 const Vec128<T, N> compressed = Compress(v, m);
5417 #if HWY_MEM_OPS_MIGHT_FAULT
5420 alignas(16) T buf[N];
5421 Store(compressed, d, buf);
5422 memcpy(unaligned, buf, count * sizeof(T));
5429 __msan_unpoison(unaligned, count * sizeof(T));
5437 template <typename T, size_t N>
5450 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
5455 const Vec128<T, N> vbits{_mm_cvtsi32_si128(static_cast<int>(mask_bits))};
5458 alignas(16) constexpr uint8_t kRep8[16] = {0, 0, 0, 0, 0, 0, 0, 0,
5459 1, 1, 1, 1, 1, 1, 1, 1};
5462 alignas(16) constexpr uint8_t kBit[16] = {1, 2, 4, 8, 16, 32, 64, 128,
5463 1, 2, 4, 8, 16, 32, 64, 128};
5467 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
5470 alignas(16) constexpr uint16_t kBit[8] = {1, 2, 4, 8, 16, 32, 64, 128};
5471 const auto vmask_bits = Set(du, static_cast<uint16_t>(mask_bits));
5475 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
5478 alignas(16) constexpr uint32_t kBit[8] = {1, 2, 4, 8};
5479 const auto vmask_bits = Set(du, static_cast<uint32_t>(mask_bits));
5483 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
5486 alignas(16) constexpr uint64_t kBit[8] = {1, 2};
5493 template <typename T, size_t N, HWY_IF_LE128(T, N)>
5496 uint64_t mask_bits = 0;
5497 constexpr size_t kNumBytes = (N + 7) / 8;
5498 CopyBytes<kNumBytes>(bits, &mask_bits);
5500 mask_bits &= (1ull << N) - 1;
5510 constexpr HWY_INLINE uint64_t U64FromInt(int mask_bits) {
5511 return static_cast<uint64_t>(static_cast<unsigned>(mask_bits));
5514 template <typename T, size_t N>
5516 const Mask128<T, N> mask) {
5517 const Simd<T, N, 0> d;
5519 return U64FromInt(_mm_movemask_epi8(sign_bits));
5522 template <typename T, size_t N>
5524 const Mask128<T, N> mask) {
5526 const auto sign_bits = _mm_packs_epi16(mask.raw, _mm_setzero_si128());
5527 return U64FromInt(_mm_movemask_epi8(sign_bits));
5530 template <typename T, size_t N>
5532 const Mask128<T, N> mask) {
5533 const Simd<T, N, 0> d;
5534 const Simd<float, N, 0> df;
5536 return U64FromInt(_mm_movemask_ps(sign_bits.raw));
5539 template <typename T, size_t N>
5541 const Mask128<T, N> mask) {
5542 const Simd<T, N, 0> d;
5543 const Simd<double, N, 0> df;
5545 return U64FromInt(_mm_movemask_pd(sign_bits.raw));
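BitsFromMask gathers one bit per lane with the movemask family: byte lanes use `_mm_movemask_epi8` directly, 16-bit lanes are first narrowed with a saturating pack so each lane contributes exactly one byte, and 32/64-bit lanes reuse the float/double sign-bit movemasks. A standalone sketch of the 16-bit case:

#include <emmintrin.h>
#include <cstdint>

// Returns one bit per 16-bit lane of a comparison result (each lane is either
// all-zeros or all-ones), lowest lane in bit 0.
static inline uint64_t BitsFromMask16(__m128i mask) {
  // packs turns each 0xFFFF lane into 0xFF and each 0 lane into 0, so
  // movemask_epi8 then yields exactly one bit per original 16-bit lane.
  const __m128i bytes = _mm_packs_epi16(mask, _mm_setzero_si128());
  return static_cast<uint64_t>(static_cast<unsigned>(_mm_movemask_epi8(bytes)));
}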
5549 template <typename T, size_t N>
5550 constexpr uint64_t OnlyActive(uint64_t mask_bits) {
5551 return ((N * sizeof(T)) == 16) ? mask_bits : mask_bits & ((1ull << N) - 1);
5554 template <typename T, size_t N>
5562 template <typename T, size_t N>
5564 const Mask128<T, N> mask, uint8_t* bits) {
5565 constexpr size_t kNumBytes = (N + 7) / 8;
5567 CopyBytes<kNumBytes>(&mask_bits, bits);
5573 template <typename T, size_t N>
5574 HWY_API bool AllFalse(const Simd<T, N, 0> , const Mask128<T, N> mask) {
5579 template <typename T, size_t N>
5580 HWY_API bool AllTrue(const Simd<T, N, 0> , const Mask128<T, N> mask) {
5581 constexpr uint64_t kAllBits =
5582 detail::OnlyActive<T, N>((1ull << (16 / sizeof(T))) - 1);
5586 template <typename T, size_t N>
5588 const Mask128<T, N> mask) {
5592 template <typename T, size_t N>
5594 const Mask128<T, N> mask) {
5603 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
5604 HWY_INLINE Vec128<T, N> IndicesFromBits(Simd<T, N, 0> d, uint64_t mask_bits) {
5606 const Rebind<uint8_t, decltype(d)> d8;
5607 const Simd<uint16_t, N, 0> du;
5617 alignas(16) constexpr uint8_t table[2048] = {
5618 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14,
5619 2, 0, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14,
5620 4, 0, 2, 6, 8, 10, 12, 14, 0, 4, 2, 6, 8, 10, 12, 14,
5621 2, 4, 0, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14,
5622 6, 0, 2, 4, 8, 10, 12, 14, 0, 6, 2, 4, 8, 10, 12, 14,
5623 2, 6, 0, 4, 8, 10, 12, 14, 0, 2, 6, 4, 8, 10, 12, 14,
5624 4, 6, 0, 2, 8, 10, 12, 14, 0, 4, 6, 2, 8, 10, 12, 14,
5625 2, 4, 6, 0, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14,
5626 8, 0, 2, 4, 6, 10, 12, 14, 0, 8, 2, 4, 6, 10, 12, 14,
5627 2, 8, 0, 4, 6, 10, 12, 14, 0, 2, 8, 4, 6, 10, 12, 14,
5628 4, 8, 0, 2, 6, 10, 12, 14, 0, 4, 8, 2, 6, 10, 12, 14,
5629 2, 4, 8, 0, 6, 10, 12, 14, 0, 2, 4, 8, 6, 10, 12, 14,
5630 6, 8, 0, 2, 4, 10, 12, 14, 0, 6, 8, 2, 4, 10, 12, 14,
5631 2, 6, 8, 0, 4, 10, 12, 14, 0, 2, 6, 8, 4, 10, 12, 14,
5632 4, 6, 8, 0, 2, 10, 12, 14, 0, 4, 6, 8, 2, 10, 12, 14,
5633 2, 4, 6, 8, 0, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14,
5634 10, 0, 2, 4, 6, 8, 12, 14, 0, 10, 2, 4, 6, 8, 12, 14,
5635 2, 10, 0, 4, 6, 8, 12, 14, 0, 2, 10, 4, 6, 8, 12, 14,
5636 4, 10, 0, 2, 6, 8, 12, 14, 0, 4, 10, 2, 6, 8, 12, 14,
5637 2, 4, 10, 0, 6, 8, 12, 14, 0, 2, 4, 10, 6, 8, 12, 14,
5638 6, 10, 0, 2, 4, 8, 12, 14, 0, 6, 10, 2, 4, 8, 12, 14,
5639 2, 6, 10, 0, 4, 8, 12, 14, 0, 2, 6, 10, 4, 8, 12, 14,
5640 4, 6, 10, 0, 2, 8, 12, 14, 0, 4, 6, 10, 2, 8, 12, 14,
5641 2, 4, 6, 10, 0, 8, 12, 14, 0, 2, 4, 6, 10, 8, 12, 14,
5642 8, 10, 0, 2, 4, 6, 12, 14, 0, 8, 10, 2, 4, 6, 12, 14,
5643 2, 8, 10, 0, 4, 6, 12, 14, 0, 2, 8, 10, 4, 6, 12, 14,
5644 4, 8, 10, 0, 2, 6, 12, 14, 0, 4, 8, 10, 2, 6, 12, 14,
5645 2, 4, 8, 10, 0, 6, 12, 14, 0, 2, 4, 8, 10, 6, 12, 14,
5646 6, 8, 10, 0, 2, 4, 12, 14, 0, 6, 8, 10, 2, 4, 12, 14,
5647 2, 6, 8, 10, 0, 4, 12, 14, 0, 2, 6, 8, 10, 4, 12, 14,
5648 4, 6, 8, 10, 0, 2, 12, 14, 0, 4, 6, 8, 10, 2, 12, 14,
5649 2, 4, 6, 8, 10, 0, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14,
5650 12, 0, 2, 4, 6, 8, 10, 14, 0, 12, 2, 4, 6, 8, 10, 14,
5651 2, 12, 0, 4, 6, 8, 10, 14, 0, 2, 12, 4, 6, 8, 10, 14,
5652 4, 12, 0, 2, 6, 8, 10, 14, 0, 4, 12, 2, 6, 8, 10, 14,
5653 2, 4, 12, 0, 6, 8, 10, 14, 0, 2, 4, 12, 6, 8, 10, 14,
5654 6, 12, 0, 2, 4, 8, 10, 14, 0, 6, 12, 2, 4, 8, 10, 14,
5655 2, 6, 12, 0, 4, 8, 10, 14, 0, 2, 6, 12, 4, 8, 10, 14,
5656 4, 6, 12, 0, 2, 8, 10, 14, 0, 4, 6, 12, 2, 8, 10, 14,
5657 2, 4, 6, 12, 0, 8, 10, 14, 0, 2, 4, 6, 12, 8, 10, 14,
5658 8, 12, 0, 2, 4, 6, 10, 14, 0, 8, 12, 2, 4, 6, 10, 14,
5659 2, 8, 12, 0, 4, 6, 10, 14, 0, 2, 8, 12, 4, 6, 10, 14,
5660 4, 8, 12, 0, 2, 6, 10, 14, 0, 4, 8, 12, 2, 6, 10, 14,
5661 2, 4, 8, 12, 0, 6, 10, 14, 0, 2, 4, 8, 12, 6, 10, 14,
5662 6, 8, 12, 0, 2, 4, 10, 14, 0, 6, 8, 12, 2, 4, 10, 14,
5663 2, 6, 8, 12, 0, 4, 10, 14, 0, 2, 6, 8, 12, 4, 10, 14,
5664 4, 6, 8, 12, 0, 2, 10, 14, 0, 4, 6, 8, 12, 2, 10, 14,
5665 2, 4, 6, 8, 12, 0, 10, 14, 0, 2, 4, 6, 8, 12, 10, 14,
5666 10, 12, 0, 2, 4, 6, 8, 14, 0, 10, 12, 2, 4, 6, 8, 14,
5667 2, 10, 12, 0, 4, 6, 8, 14, 0, 2, 10, 12, 4, 6, 8, 14,
5668 4, 10, 12, 0, 2, 6, 8, 14, 0, 4, 10, 12, 2, 6, 8, 14,
5669 2, 4, 10, 12, 0, 6, 8, 14, 0, 2, 4, 10, 12, 6, 8, 14,
5670 6, 10, 12, 0, 2, 4, 8, 14, 0, 6, 10, 12, 2, 4, 8, 14,
5671 2, 6, 10, 12, 0, 4, 8, 14, 0, 2, 6, 10, 12, 4, 8, 14,
5672 4, 6, 10, 12, 0, 2, 8, 14, 0, 4, 6, 10, 12, 2, 8, 14,
5673 2, 4, 6, 10, 12, 0, 8, 14, 0, 2, 4, 6, 10, 12, 8, 14,
5674 8, 10, 12, 0, 2, 4, 6, 14, 0, 8, 10, 12, 2, 4, 6, 14,
5675 2, 8, 10, 12, 0, 4, 6, 14, 0, 2, 8, 10, 12, 4, 6, 14,
5676 4, 8, 10, 12, 0, 2, 6, 14, 0, 4, 8, 10, 12, 2, 6, 14,
5677 2, 4, 8, 10, 12, 0, 6, 14, 0, 2, 4, 8, 10, 12, 6, 14,
5678 6, 8, 10, 12, 0, 2, 4, 14, 0, 6, 8, 10, 12, 2, 4, 14,
5679 2, 6, 8, 10, 12, 0, 4, 14, 0, 2, 6, 8, 10, 12, 4, 14,
5680 4, 6, 8, 10, 12, 0, 2, 14, 0, 4, 6, 8, 10, 12, 2, 14,
5681 2, 4, 6, 8, 10, 12, 0, 14, 0, 2, 4, 6, 8, 10, 12, 14,
5682 14, 0, 2, 4, 6, 8, 10, 12, 0, 14, 2, 4, 6, 8, 10, 12,
5683 2, 14, 0, 4, 6, 8, 10, 12, 0, 2, 14, 4, 6, 8, 10, 12,
5684 4, 14, 0, 2, 6, 8, 10, 12, 0, 4, 14, 2, 6, 8, 10, 12,
5685 2, 4, 14, 0, 6, 8, 10, 12, 0, 2, 4, 14, 6, 8, 10, 12,
5686 6, 14, 0, 2, 4, 8, 10, 12, 0, 6, 14, 2, 4, 8, 10, 12,
5687 2, 6, 14, 0, 4, 8, 10, 12, 0, 2, 6, 14, 4, 8, 10, 12,
5688 4, 6, 14, 0, 2, 8, 10, 12, 0, 4, 6, 14, 2, 8, 10, 12,
5689 2, 4, 6, 14, 0, 8, 10, 12, 0, 2, 4, 6, 14, 8, 10, 12,
5690 8, 14, 0, 2, 4, 6, 10, 12, 0, 8, 14, 2, 4, 6, 10, 12,
5691 2, 8, 14, 0, 4, 6, 10, 12, 0, 2, 8, 14, 4, 6, 10, 12,
5692 4, 8, 14, 0, 2, 6, 10, 12, 0, 4, 8, 14, 2, 6, 10, 12,
5693 2, 4, 8, 14, 0, 6, 10, 12, 0, 2, 4, 8, 14, 6, 10, 12,
5694 6, 8, 14, 0, 2, 4, 10, 12, 0, 6, 8, 14, 2, 4, 10, 12,
5695 2, 6, 8, 14, 0, 4, 10, 12, 0, 2, 6, 8, 14, 4, 10, 12,
5696 4, 6, 8, 14, 0, 2, 10, 12, 0, 4, 6, 8, 14, 2, 10, 12,
5697 2, 4, 6, 8, 14, 0, 10, 12, 0, 2, 4, 6, 8, 14, 10, 12,
5698 10, 14, 0, 2, 4, 6, 8, 12, 0, 10, 14, 2, 4, 6, 8, 12,
5699 2, 10, 14, 0, 4, 6, 8, 12, 0, 2, 10, 14, 4, 6, 8, 12,
5700 4, 10, 14, 0, 2, 6, 8, 12, 0, 4, 10, 14, 2, 6, 8, 12,
5701 2, 4, 10, 14, 0, 6, 8, 12, 0, 2, 4, 10, 14, 6, 8, 12,
5702 6, 10, 14, 0, 2, 4, 8, 12, 0, 6, 10, 14, 2, 4, 8, 12,
5703 2, 6, 10, 14, 0, 4, 8, 12, 0, 2, 6, 10, 14, 4, 8, 12,
5704 4, 6, 10, 14, 0, 2, 8, 12, 0, 4, 6, 10, 14, 2, 8, 12,
5705 2, 4, 6, 10, 14, 0, 8, 12, 0, 2, 4, 6, 10, 14, 8, 12,
5706 8, 10, 14, 0, 2, 4, 6, 12, 0, 8, 10, 14, 2, 4, 6, 12,
5707 2, 8, 10, 14, 0, 4, 6, 12, 0, 2, 8, 10, 14, 4, 6, 12,
5708 4, 8, 10, 14, 0, 2, 6, 12, 0, 4, 8, 10, 14, 2, 6, 12,
5709 2, 4, 8, 10, 14, 0, 6, 12, 0, 2, 4, 8, 10, 14, 6, 12,
5710 6, 8, 10, 14, 0, 2, 4, 12, 0, 6, 8, 10, 14, 2, 4, 12,
5711 2, 6, 8, 10, 14, 0, 4, 12, 0, 2, 6, 8, 10, 14, 4, 12,
5712 4, 6, 8, 10, 14, 0, 2, 12, 0, 4, 6, 8, 10, 14, 2, 12,
5713 2, 4, 6, 8, 10, 14, 0, 12, 0, 2, 4, 6, 8, 10, 14, 12,
5714 12, 14, 0, 2, 4, 6, 8, 10, 0, 12, 14, 2, 4, 6, 8, 10,
5715 2, 12, 14, 0, 4, 6, 8, 10, 0, 2, 12, 14, 4, 6, 8, 10,
5716 4, 12, 14, 0, 2, 6, 8, 10, 0, 4, 12, 14, 2, 6, 8, 10,
5717 2, 4, 12, 14, 0, 6, 8, 10, 0, 2, 4, 12, 14, 6, 8, 10,
5718 6, 12, 14, 0, 2, 4, 8, 10, 0, 6, 12, 14, 2, 4, 8, 10,
5719 2, 6, 12, 14, 0, 4, 8, 10, 0, 2, 6, 12, 14, 4, 8, 10,
5720 4, 6, 12, 14, 0, 2, 8, 10, 0, 4, 6, 12, 14, 2, 8, 10,
5721 2, 4, 6, 12, 14, 0, 8, 10, 0, 2, 4, 6, 12, 14, 8, 10,
5722 8, 12, 14, 0, 2, 4, 6, 10, 0, 8, 12, 14, 2, 4, 6, 10,
5723 2, 8, 12, 14, 0, 4, 6, 10, 0, 2, 8, 12, 14, 4, 6, 10,
5724 4, 8, 12, 14, 0, 2, 6, 10, 0, 4, 8, 12, 14, 2, 6, 10,
5725 2, 4, 8, 12, 14, 0, 6, 10, 0, 2, 4, 8, 12, 14, 6, 10,
5726 6, 8, 12, 14, 0, 2, 4, 10, 0, 6, 8, 12, 14, 2, 4, 10,
5727 2, 6, 8, 12, 14, 0, 4, 10, 0, 2, 6, 8, 12, 14, 4, 10,
5728 4, 6, 8, 12, 14, 0, 2, 10, 0, 4, 6, 8, 12, 14, 2, 10,
5729 2, 4, 6, 8, 12, 14, 0, 10, 0, 2, 4, 6, 8, 12, 14, 10,
5730 10, 12, 14, 0, 2, 4, 6, 8, 0, 10, 12, 14, 2, 4, 6, 8,
5731 2, 10, 12, 14, 0, 4, 6, 8, 0, 2, 10, 12, 14, 4, 6, 8,
5732 4, 10, 12, 14, 0, 2, 6, 8, 0, 4, 10, 12, 14, 2, 6, 8,
5733 2, 4, 10, 12, 14, 0, 6, 8, 0, 2, 4, 10, 12, 14, 6, 8,
5734 6, 10, 12, 14, 0, 2, 4, 8, 0, 6, 10, 12, 14, 2, 4, 8,
5735 2, 6, 10, 12, 14, 0, 4, 8, 0, 2, 6, 10, 12, 14, 4, 8,
5736 4, 6, 10, 12, 14, 0, 2, 8, 0, 4, 6, 10, 12, 14, 2, 8,
5737 2, 4, 6, 10, 12, 14, 0, 8, 0, 2, 4, 6, 10, 12, 14, 8,
5738 8, 10, 12, 14, 0, 2, 4, 6, 0, 8, 10, 12, 14, 2, 4, 6,
5739 2, 8, 10, 12, 14, 0, 4, 6, 0, 2, 8, 10, 12, 14, 4, 6,
5740 4, 8, 10, 12, 14, 0, 2, 6, 0, 4, 8, 10, 12, 14, 2, 6,
5741 2, 4, 8, 10, 12, 14, 0, 6, 0, 2, 4, 8, 10, 12, 14, 6,
5742 6, 8, 10, 12, 14, 0, 2, 4, 0, 6, 8, 10, 12, 14, 2, 4,
5743 2, 6, 8, 10, 12, 14, 0, 4, 0, 2, 6, 8, 10, 12, 14, 4,
5744 4, 6, 8, 10, 12, 14, 0, 2, 0, 4, 6, 8, 10, 12, 14, 2,
5745 2, 4, 6, 8, 10, 12, 14, 0, 0, 2, 4, 6, 8, 10, 12, 14};
5747 const Vec128<uint8_t, 2 * N> byte_idx{Load(d8, table + mask_bits * 8).raw};
5748 const Vec128<uint16_t, N> pairs = ZipLower(byte_idx, byte_idx);
5752 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4), HWY_IF_LE128(T, N)>
5753 HWY_INLINE Vec128<T, N> IndicesFromBits(Simd<T, N, 0> d, uint64_t mask_bits) {
5757 alignas(16) constexpr uint8_t packed_array[256] = {
5758 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
5759 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
5760 4, 5, 6, 7, 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15,
5761 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
5762 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15,
5763 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15,
5764 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15,
5765 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
5766 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
5767 0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11,
5768 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 8, 9, 10, 11,
5769 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 8, 9, 10, 11,
5770 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7,
5771 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7,
5772 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3,
5773 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
5776 return BitCast(d, Load(d8, packed_array + 16 * mask_bits));
5779 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8), HWY_IF_LE128(T, N)>
5780 HWY_INLINE Vec128<T, N> IndicesFromBits(Simd<T, N, 0> d, uint64_t mask_bits) {
5784 alignas(16) constexpr uint8_t packed_array[64] = {
5785 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
5786 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
5787 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7,
5788 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
5791 return BitCast(d, Load(d8, packed_array + 16 * mask_bits));
5794 template <typename T, size_t N>
5796 const Simd<T, N, 0> d;
5800 const auto indices = BitCast(du, detail::IndicesFromBits(d, mask_bits));
5806 template <typename T, size_t N>
5811 template <typename T, size_t N>
5814 uint64_t mask_bits = 0;
5815 constexpr size_t kNumBytes = (N + 7) / 8;
5816 CopyBytes<kNumBytes>(bits, &mask_bits);
5818 mask_bits &= (1ull << N) - 1;
5826 template <typename T, size_t N>
5835 const auto indices = BitCast(du, detail::IndicesFromBits(d, mask_bits));
5837 StoreU(compressed, d, unaligned);
5841 template <typename T, size_t N>
5849 const size_t count = PopCount(mask_bits);
5852 const auto indices = BitCast(du, detail::IndicesFromBits(d, mask_bits));
5858 template <typename T, size_t N>
5864 uint64_t mask_bits = 0;
5865 constexpr size_t kNumBytes = (N + 7) / 8;
5866 CopyBytes<kNumBytes>(bits, &mask_bits);
5868 mask_bits &= (1ull << N) - 1;
5872 const auto indices = BitCast(du, detail::IndicesFromBits(d, mask_bits));
5874 StoreU(compressed, d, unaligned);
5885 const Vec128<uint8_t> v1,
5886 const Vec128<uint8_t> v2, Full128<uint8_t>
d,
5888 const auto k5 =
Set(
d, 5);
5889 const auto k6 =
Set(
d, 6);
5893 alignas(16)
static constexpr uint8_t tbl_r0[16] = {
5894 0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80,
5895 3, 0x80, 0x80, 4, 0x80, 0x80, 5};
5896 alignas(16)
static constexpr uint8_t tbl_g0[16] = {
5897 0x80, 0, 0x80, 0x80, 1, 0x80,
5898 0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, 4, 0x80, 0x80};
5899 const auto shuf_r0 =
Load(
d, tbl_r0);
5900 const auto shuf_g0 =
Load(
d, tbl_g0);
5901 const auto shuf_b0 = CombineShiftRightBytes<15>(
d, shuf_g0, shuf_g0);
5905 const auto int0 = r0 | g0 | b0;
5906 StoreU(int0,
d, unaligned + 0 * 16);
5909 const auto shuf_r1 = shuf_b0 + k6;
5910 const auto shuf_g1 = shuf_r0 + k5;
5911 const auto shuf_b1 = shuf_g0 + k5;
5915 const auto int1 = r1 | g1 | b1;
5916 StoreU(int1,
d, unaligned + 1 * 16);
5919 const auto shuf_r2 = shuf_b1 + k6;
5920 const auto shuf_g2 = shuf_r1 + k5;
5921 const auto shuf_b2 = shuf_g1 + k5;
5925 const auto int2 = r2 | g2 | b2;
5926 StoreU(int2,
d, unaligned + 2 * 16);
5931 const Vec64<uint8_t> v2, Full64<uint8_t>
d,
5934 const Full128<uint8_t> d_full;
5935 const auto k5 =
Set(d_full, 5);
5936 const auto k6 =
Set(d_full, 6);
5938 const Vec128<uint8_t> full_a{v0.raw};
5939 const Vec128<uint8_t> full_b{v1.raw};
5940 const Vec128<uint8_t> full_c{v2.raw};
5944 alignas(16)
static constexpr uint8_t tbl_r0[16] = {
5945 0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80,
5946 3, 0x80, 0x80, 4, 0x80, 0x80, 5};
5947 alignas(16)
static constexpr uint8_t tbl_g0[16] = {
5948 0x80, 0, 0x80, 0x80, 1, 0x80,
5949 0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, 4, 0x80, 0x80};
5950 const auto shuf_r0 =
Load(d_full, tbl_r0);
5951 const auto shuf_g0 =
Load(d_full, tbl_g0);
5952 const auto shuf_b0 = CombineShiftRightBytes<15>(d_full, shuf_g0, shuf_g0);
5956 const auto int0 = r0 | g0 | b0;
5957 StoreU(int0, d_full, unaligned + 0 * 16);
5960 const auto shuf_r1 = shuf_b0 + k6;
5961 const auto shuf_g1 = shuf_r0 + k5;
5962 const auto shuf_b1 = shuf_g0 + k5;
5966 const decltype(
Zero(
d)) int1{(r1 | g1 | b1).raw};
5967 StoreU(int1,
d, unaligned + 1 * 16);
5971 template <
size_t N, HWY_IF_LE32(u
int8_t, N)>
5973 const Vec128<uint8_t, N> v1,
5974 const Vec128<uint8_t, N> v2,
5975 Simd<uint8_t, N, 0> ,
5978 const Full128<uint8_t> d_full;
5980 const Vec128<uint8_t> full_a{v0.raw};
5981 const Vec128<uint8_t> full_b{v1.raw};
5982 const Vec128<uint8_t> full_c{v2.raw};
5986 alignas(16)
static constexpr uint8_t tbl_r0[16] = {
5987 0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80, 3, 0x80, 0x80,
5988 0x80, 0x80, 0x80, 0x80};
5989 const auto shuf_r0 =
Load(d_full, tbl_r0);
5990 const auto shuf_g0 = CombineShiftRightBytes<15>(d_full, shuf_r0, shuf_r0);
5991 const auto shuf_b0 = CombineShiftRightBytes<14>(d_full, shuf_r0, shuf_r0);
5995 const auto int0 = r0 | g0 | b0;
5996 alignas(16) uint8_t buf[16];
5997 StoreU(int0, d_full, buf);
5998 CopyBytes<N * 3>(buf, unaligned);
6005 const Vec128<uint8_t> v1,
6006 const Vec128<uint8_t> v2,
6007 const Vec128<uint8_t> v3, Full128<uint8_t> d8,
6012 const auto ba0 =
ZipLower(d16, v0, v1);
6013 const auto dc0 =
ZipLower(d16, v2, v3);
6014 const auto ba8 =
ZipUpper(d16, v0, v1);
6015 const auto dc8 =
ZipUpper(d16, v2, v3);
6016 const auto dcba_0 =
ZipLower(d32, ba0, dc0);
6017 const auto dcba_4 =
ZipUpper(d32, ba0, dc0);
6018 const auto dcba_8 =
ZipLower(d32, ba8, dc8);
6019 const auto dcba_C =
ZipUpper(d32, ba8, dc8);
6028 const Vec64<uint8_t> in1,
6029 const Vec64<uint8_t> in2,
6030 const Vec64<uint8_t> in3,
6034 const Full128<uint8_t> d_full8;
6037 const Vec128<uint8_t> v0{in0.raw};
6038 const Vec128<uint8_t> v1{in1.raw};
6039 const Vec128<uint8_t> v2{in2.raw};
6040 const Vec128<uint8_t> v3{in3.raw};
6042 const auto ba0 =
ZipLower(d16, v0, v1);
6043 const auto dc0 =
ZipLower(d16, v2, v3);
6044 const auto dcba_0 =
ZipLower(d32, ba0, dc0);
6045 const auto dcba_4 =
ZipUpper(d32, ba0, dc0);
6046 StoreU(
BitCast(d_full8, dcba_0), d_full8, unaligned + 0 * 16);
6047 StoreU(
BitCast(d_full8, dcba_4), d_full8, unaligned + 1 * 16);
6051 template <
size_t N, HWY_IF_LE32(u
int8_t, N)>
6053 const Vec128<uint8_t, N> in1,
6054 const Vec128<uint8_t, N> in2,
6055 const Vec128<uint8_t, N> in3,
6056 Simd<uint8_t, N, 0> ,
6059 const Full128<uint8_t> d_full8;
6062 const Vec128<uint8_t> v0{in0.raw};
6063 const Vec128<uint8_t> v1{in1.raw};
6064 const Vec128<uint8_t> v2{in2.raw};
6065 const Vec128<uint8_t> v3{in3.raw};
6067 const auto ba0 =
ZipLower(d16, v0, v1);
6068 const auto dc0 =
ZipLower(d16, v2, v3);
6069 const auto dcba_0 =
ZipLower(d32, ba0, dc0);
6070 alignas(16) uint8_t buf[16];
6072 CopyBytes<4 * N>(buf, unaligned);
6080 template <
typename T>
6082 const Vec128<T, 1>
v) {
6085 template <
typename T>
6087 const Vec128<T, 1>
v) {
6090 template <
typename T>
6092 const Vec128<T, 1>
v) {
6099 template <
typename T>
6101 const Vec128<T, 2> v10) {
6104 template <
typename T>
6106 const Vec128<T, 2> v10) {
6109 template <
typename T>
6111 const Vec128<T, 2> v10) {
6116 template <typename T>
6118 const Vec128<T> v3210) {
6120 const Vec128<T> v31_20_31_20 = v3210 + v1032;
6121 const Vec128<T> v20_31_20_31 = Shuffle0321(v31_20_31_20);
6122 return v20_31_20_31 + v31_20_31_20;
6124 template <typename T>
6126 const Vec128<T> v3210) {
6128 const Vec128<T> v31_20_31_20 = Min(v3210, v1032);
6129 const Vec128<T> v20_31_20_31 = Shuffle0321(v31_20_31_20);
6130 return Min(v20_31_20_31, v31_20_31_20);
6132 template <typename T>
6134 const Vec128<T> v3210) {
6136 const Vec128<T> v31_20_31_20 = Max(v3210, v1032);
6137 const Vec128<T> v20_31_20_31 = Shuffle0321(v31_20_31_20);
6138 return Max(v20_31_20_31, v31_20_31_20);
6144 template <typename T>
6146 const Vec128<T> v10) {
6150 template <typename T>
6152 const Vec128<T> v10) {
6154 return Min(v10, v01);
6156 template <typename T>
6158 const Vec128<T> v10) {
6160 return Max(v10, v01);
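The four-lane reductions above follow the usual log-step pattern: swap the two 64-bit halves and combine, then rotate by one lane (Shuffle0321) and combine again, so every lane ends up holding the reduction. The same pattern for a horizontal float sum, written directly with intrinsics:

#include <xmmintrin.h>

// Sums the four float lanes; every lane of the result holds the total.
static inline __m128 SumOfLanesF32(__m128 v3210) {
  const __m128 v1032 = _mm_shuffle_ps(v3210, v3210, _MM_SHUFFLE(1, 0, 3, 2));  // swap 64-bit halves
  const __m128 partial = _mm_add_ps(v3210, v1032);                             // {0+2, 1+3, 0+2, 1+3}
  const __m128 rotated = _mm_shuffle_ps(partial, partial, _MM_SHUFFLE(0, 3, 2, 1));  // rotate one lane
  return _mm_add_ps(partial, rotated);                                         // all lanes: 0+1+2+3
}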
6164 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2), HWY_IF_GE32(T, N)>
6166 const Repartition<int32_t, Simd<T, N, 0>> d32;
6168 const auto odd = ShiftRight<16>(BitCast(d32, v));
6171 return BitCast(Simd<T, N, 0>(), Or(min, ShiftLeft<16>(min)));
6173 template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2), HWY_IF_GE32(T, N)>
6175 const Repartition<int32_t, Simd<T, N, 0>> d32;
6177 const auto odd = ShiftRight<16>(BitCast(d32, v));
6180 return BitCast(Simd<T, N, 0>(), Or(min, ShiftLeft<16>(min)));
6186 template <typename T, size_t N>
6190 template <typename T, size_t N>
6194 template <typename T, size_t N>
6204 template <class D, class V = VFromD<D>>
6222 const V ltLX = ShiftLeftLanes<1>(ltHL);
6223 const V vecHx = OrAnd(ltHL, eqHL, ltLX);
6229 template <class D, class V = VFromD<D>>
6237 template <class D, class V = VFromD<D>>
6242 template <class D, class V = VFromD<D>>
6279 HWY_API auto Eq(V a, V b) -> decltype(a == b) {
6283 HWY_API auto Ne(V a, V b) -> decltype(a == b) {
6287 HWY_API auto Lt(V a, V b) -> decltype(a == b) {
6292 HWY_API auto Gt(V a, V b) -> decltype(a == b) {
6296 HWY_API auto Ge(V a, V b) -> decltype(a == b) {
6301 HWY_API auto Le(V a, V b) -> decltype(a == b) {
Definition: arm_neon-inl.h:1917
HWY_API Vec256< uint8_t > AESLastRound(Vec256< uint8_t > state, Vec256< uint8_t > round_key)
Definition: x86_256-inl.h:3959
HWY_API Vec128< uint8_t > LoadU(Full128< uint8_t >, const uint8_t *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:2031
HWY_API Vec128< int16_t > MulHigh(const Vec128< int16_t > a, const Vec128< int16_t > b)
Definition: arm_neon-inl.h:1175
Simd< T, 16/sizeof(T), 0 > Full128
Definition: arm_neon-inl.h:34
HWY_API Vec1< uint8_t > SaturatedSub(const Vec1< uint8_t > a, const Vec1< uint8_t > b)
Definition: scalar-inl.h:484
HWY_INLINE Mask128< T, N > Lt128(Simd< T, N, 0 > d, Vec128< T, N > a, Vec128< T, N > b)
Definition: arm_neon-inl.h:5172
HWY_API Vec128< float, N > operator/(const Vec128< float, N > a, const Vec128< float, N > b)
Definition: arm_neon-inl.h:1252
HWY_API Vec128< T, N > OrAnd(Vec128< T, N > o, Vec128< T, N > a1, Vec128< T, N > a2)
Definition: arm_neon-inl.h:1498
HWY_API Vec128< T, N > IfNegativeThenElse(Vec128< T, N > v, Vec128< T, N > yes, Vec128< T, N > no)
Definition: arm_neon-inl.h:1724
HWY_API Vec128< T, N > ConcatUpperLower(Simd< T, N, 0 > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:3895
HWY_API Vec128< T, N > And(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1440
HWY_API Vec128< T, N > BitCast(Simd< T, N, 0 > d, Vec128< FromT, N *sizeof(T)/sizeof(FromT)> v)
Definition: arm_neon-inl.h:710
HWY_API Vec128< int16_t > MulFixedPoint15(Vec128< int16_t > a, Vec128< int16_t > b)
Definition: arm_neon-inl.h:1211
HWY_API void ScatterOffset(Vec128< T, N > v, Simd< T, N, 0 > d, T *HWY_RESTRICT base, const Vec128< Offset, N > offset)
Definition: arm_neon-inl.h:4231
HWY_API VFromD< DW > ZipUpper(DW dw, V a, V b)
Definition: arm_neon-inl.h:3681
HWY_API Vec128< T, N > ConcatLowerLower(const Simd< T, N, 0 > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:3777
HWY_API V Sub(V a, V b)
Definition: arm_neon-inl.h:5221
typename D::template Rebind< T > Rebind
Definition: ops/shared-inl.h:196
HWY_API Vec128< T, N > Undefined(Simd< T, N, 0 >)
Definition: arm_neon-inl.h:747
HWY_API Mask128< T, N > operator>=(Vec128< T, N > a, Vec128< T, N > b)
Definition: arm_neon-inl.h:1889
HWY_API size_t CompressBitsStore(Vec128< T, N > v, const uint8_t *HWY_RESTRICT bits, Simd< T, N, 0 > d, T *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:5077
HWY_API Vec128< uint16_t > operator*(const Vec128< uint16_t > a, const Vec128< uint16_t > b)
Definition: arm_neon-inl.h:1133
HWY_API V InterleaveUpper(Simd< T, N, 0 >, V a, V b)
Definition: arm_neon-inl.h:3656
HWY_API Vec128< T, N > GatherOffset(const Simd< T, N, 0 > d, const T *HWY_RESTRICT base, const Vec128< Offset, N > offset)
Definition: arm_neon-inl.h:4267
HWY_API size_t CompressBlendedStore(Vec128< T, N > v, Mask128< T, N > m, Simd< T, N, 0 > d, T *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:5061
HWY_API Vec128< T, N > IfThenZeroElse(const Mask128< T, N > mask, const Vec128< T, N > no)
Definition: arm_neon-inl.h:1718
HWY_API Vec128< T, N > Xor(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1489
HWY_INLINE VFromD< D > Max128(D d, const VFromD< D > a, const VFromD< D > b)
Definition: arm_neon-inl.h:5208
HWY_API Vec1< T > ShiftLeft(const Vec1< T > v)
Definition: scalar-inl.h:339
HWY_API auto Le(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:5266
HWY_API Vec128< T, N > ShiftRightSame(const Vec128< T, N > v, int bits)
Definition: arm_neon-inl.h:1126
decltype(detail::DeduceD()(V())) DFromV
Definition: arm_neon-inl.h:555
HWY_API Vec128< int32_t, N > NearestInt(const Vec128< float, N > v)
Definition: arm_neon-inl.h:2939
HWY_API Indices128< T, N > SetTableIndices(Simd< T, N, 0 > d, const TI *idx)
Definition: arm_neon-inl.h:3413
HWY_API void ScatterIndex(Vec128< T, N > v, Simd< T, N, 0 > d, T *HWY_RESTRICT base, const Vec128< Index, N > index)
Definition: arm_neon-inl.h:4249
HWY_API Vec128< T > Not(const Vec128< T > v)
Definition: arm_neon-inl.h:1422
HWY_API Vec128< float, N > NegMulAdd(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > add)
Definition: arm_neon-inl.h:1323
HWY_API Vec64< uint32_t > Shuffle2301(const Vec64< uint32_t > v)
Definition: arm_neon-inl.h:1778
V Shr(V a, V b)
Definition: arm_neon-inl.h:5239
decltype(Zero(D())) VFromD
Definition: arm_neon-inl.h:743
HWY_API Vec128< T, N > LoadDup128(Simd< T, N, 0 > d, const T *const HWY_RESTRICT p)
Definition: arm_neon-inl.h:2217
HWY_API Vec128< T, N > OddEven(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:4019
HWY_API Vec128< uint16_t > Broadcast(const Vec128< uint16_t > v)
Definition: arm_neon-inl.h:3285
HWY_INLINE Vec128< T, N > CompressBits(Vec128< T, N > v, const uint8_t *HWY_RESTRICT bits)
Definition: arm_neon-inl.h:5038
HWY_API Vec128< T > Shuffle0123(const Vec128< T > v)
Definition: arm_neon-inl.h:3553
HWY_API Vec128< float, N > Trunc(const Vec128< float, N > v)
Definition: arm_neon-inl.h:2867
typename D::Half Half
Definition: ops/shared-inl.h:216
HWY_API Vec128< T, N > MinOfLanes(Simd< T, N, 0 >, const Vec128< T, N > v)
Definition: arm_neon-inl.h:4441
HWY_API Vec128< T, N > ShiftRightBytes(Simd< T, N, 0 >, Vec128< T, N > v)
Definition: arm_neon-inl.h:3114
typename D::template Repartition< T > Repartition
Definition: ops/shared-inl.h:207
HWY_API auto Ne(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:5248
N
Definition: rvv-inl.h:1656
HWY_API Vec128< float, N > Sqrt(const Vec128< float, N > v)
Definition: arm_neon-inl.h:1404
HWY_API Vec64< uint16_t > DemoteTo(Full64< uint16_t >, const Vec128< int32_t > v)
Definition: arm_neon-inl.h:2606
HWY_API Vec128< TI > TableLookupBytes(const Vec128< T > bytes, const Vec128< TI > from)
Definition: arm_neon-inl.h:4169
HWY_API size_t CompressStore(Vec128< T, N > v, const Mask128< T, N > mask, Simd< T, N, 0 > d, T *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:5052
HWY_API Vec128< uint32_t, N > RotateRight(const Vec128< uint32_t, N > v)
Definition: arm_neon-inl.h:935
HWY_API Vec128< T, N > AndNot(const Vec128< T, N > not_mask, const Vec128< T, N > mask)
Definition: arm_neon-inl.h:1455
HWY_API Vec128< T > ReverseBlocks(Full128< T >, const Vec128< T > v)
Definition: arm_neon-inl.h:4053
HWY_API V Div(V a, V b)
Definition: arm_neon-inl.h:5230
HWY_API Vec128< uint8_t > operator>>(const Vec128< uint8_t > v, const Vec128< uint8_t > bits)
Definition: arm_neon-inl.h:1033
HWY_API Vec128< uint64_t > SumsOf8(const Vec128< uint8_t > v)
Definition: arm_neon-inl.h:852
HWY_API V Mul(V a, V b)
Definition: arm_neon-inl.h:5226
HWY_API Vec128< T, 1 > Reverse(Simd< T, 1, 0 >, const Vec128< T, 1 > v)
Definition: arm_neon-inl.h:3430
HWY_API void Store(Vec128< T, N > v, Simd< T, N, 0 > d, T *HWY_RESTRICT aligned)
Definition: arm_neon-inl.h:2397
HWY_API Vec128< uint16_t > PromoteTo(Full128< uint16_t >, const Vec64< uint8_t > v)
Definition: arm_neon-inl.h:2426
TFromD< DFromV< V > > TFromV
Definition: arm_neon-inl.h:558
Vec128< T, 8/sizeof(T)> Vec64
Definition: arm_neon-inl.h:522
HWY_API Vec64< uint8_t > UpperHalf(Full64< uint8_t >, const Vec128< uint8_t > v)
Definition: arm_neon-inl.h:3146
HWY_API Vec128< float > ApproximateReciprocalSqrt(const Vec128< float > v)
Definition: arm_neon-inl.h:1376
long long int GatherIndex64
Definition: x86_128-inl.h:3088
const vfloat64m1_t v
Definition: rvv-inl.h:1656
HWY_API Vec128< T, N > Compress(Vec128< T, N > v, const Mask128< T, N > mask)
Definition: arm_neon-inl.h:5031
HWY_API Vec1< T > operator-(Vec1< T > a, Vec1< T > b)
Definition: scalar-inl.h:434
HWY_API Vec128< T, N > ShiftRightLanes(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition: arm_neon-inl.h:3120
typename D::T TFromD
Definition: ops/shared-inl.h:192
HWY_API VI TableLookupBytesOr0(const V bytes, const VI from)
Definition: arm_neon-inl.h:4224
HWY_API Vec128< T, N > Or(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1477
HWY_API Vec128< float, N > NegMulSub(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > sub)
Definition: arm_neon-inl.h:1352
HWY_API Vec1< T > IfThenElse(const Mask1< T > mask, const Vec1< T > yes, const Vec1< T > no)
Definition: scalar-inl.h:278
Definition: aligned_allocator.h:27
HWY_API size_t Num0BitsBelowLS1Bit_Nonzero64(const uint64_t x)
Definition: base.h:608
HWY_API uint64_t Mul128(uint64_t a, uint64_t b, uint64_t *HWY_RESTRICT upper)
Definition: base.h:711
typename detail::TypeFromSize< N >::Unsigned UnsignedFromSize
Definition: base.h:466
HWY_API size_t PopCount(uint64_t x)
Definition: base.h:667
HWY_API size_t Num0BitsBelowLS1Bit_Nonzero32(const uint32_t x)
Definition: base.h:598
constexpr HWY_API bool IsSigned()
Definition: base.h:483
typename detail::Relations< T >::Signed MakeSigned
Definition: base.h:454
#define HWY_ALIGN
Definition: set_macros-inl.h:81
#define HWY_NAMESPACE
Definition: set_macros-inl.h:80
@ value
Definition: arm_neon-inl.h:4798
Definition: arm_neon-inl.h:3368
detail::Raw128< T, N >::type raw
Definition: arm_neon-inl.h:3369
__m128i raw
Definition: x86_128-inl.h:3546
Definition: ops/shared-inl.h:40
HWY_INLINE __m128d operator()(__m128i v)
Definition: x86_128-inl.h:237
HWY_INLINE __m128 operator()(__m128i v)
Definition: x86_128-inl.h:233
Definition: wasm_128-inl.h:149
HWY_INLINE __m128i operator()(__m128i v)
Definition: x86_128-inl.h:229
Definition: arm_neon-inl.h:545
Full512< T > operator()(const hwy::HWY_NAMESPACE::Vec512< T > *) const
Definition: x86_128-inl.h:193
Simd< T, N, 0 > operator()(const Vec128< T, N > *) const
Definition: x86_128-inl.h:182
Full256< T > operator()(const hwy::HWY_NAMESPACE::Vec256< T > *) const
Definition: x86_128-inl.h:187
Definition: x86_128-inl.h:201
decltype(DeduceD()(static_cast< V * >(nullptr))) type
Definition: x86_128-inl.h:202
__m128d type
Definition: x86_128-inl.h:78
__f32x4 type
Definition: wasm_128-inl.h:66
Definition: x86_128-inl.h:69
__v128_u type
Definition: wasm_128-inl.h:62
Definition: x86_128-inl.h:130
__mmask16 type
Definition: x86_128-inl.h:131
Definition: x86_128-inl.h:134
__mmask8 type
Definition: x86_128-inl.h:135
Definition: x86_128-inl.h:138
__mmask8 type
Definition: x86_128-inl.h:139
Definition: x86_128-inl.h:142
__mmask8 type
Definition: x86_128-inl.h:143
Definition: x86_128-inl.h:128
#define HWY_INLINE_F16
Definition: x86_128-inl.h:4605