23 #include <immintrin.h>
26 #if defined(_MSC_VER) && defined(__clang__)
30 #include <avxintrin.h>
32 #include <avx2intrin.h>
33 #include <bmi2intrin.h>
34 #include <f16cintrin.h>
35 #include <fmaintrin.h>
36 #include <smmintrin.h>
73 return *
this = (*
this * other);
76 return *
this = (*
this / other);
79 return *
this = (*
this + other);
82 return *
this = (*
this - other);
85 return *
this = (*
this & other);
88 return *
this = (*
this | other);
91 return *
this = (*
this ^ other);
97 #if HWY_TARGET <= HWY_AVX3
102 template <
size_t size>
123 template <
typename T>
137 template <
typename T>
151 return _mm256_castpd_si256(
v);
154 template <
typename T>
160 template <
typename T>
173 template <
typename T>
180 template <
typename T,
typename FromT>
188 template <
typename T>
190 return Vec256<T>{_mm256_setzero_si256()};
193 return Vec256<float>{_mm256_setzero_ps()};
200 HWY_API Vec256<uint8_t>
Set(Full256<uint8_t> ,
const uint8_t t) {
201 return Vec256<uint8_t>{_mm256_set1_epi8(
static_cast<char>(t))};
203 HWY_API Vec256<uint16_t>
Set(Full256<uint16_t> ,
const uint16_t t) {
204 return Vec256<uint16_t>{_mm256_set1_epi16(
static_cast<short>(t))};
206 HWY_API Vec256<uint32_t>
Set(Full256<uint32_t> ,
const uint32_t t) {
207 return Vec256<uint32_t>{_mm256_set1_epi32(
static_cast<int>(t))};
209 HWY_API Vec256<uint64_t>
Set(Full256<uint64_t> ,
const uint64_t t) {
210 return Vec256<uint64_t>{
211 _mm256_set1_epi64x(
static_cast<long long>(t))};
213 HWY_API Vec256<int8_t>
Set(Full256<int8_t> ,
const int8_t t) {
214 return Vec256<int8_t>{_mm256_set1_epi8(
static_cast<char>(t))};
216 HWY_API Vec256<int16_t>
Set(Full256<int16_t> ,
const int16_t t) {
217 return Vec256<int16_t>{_mm256_set1_epi16(
static_cast<short>(t))};
219 HWY_API Vec256<int32_t>
Set(Full256<int32_t> ,
const int32_t t) {
220 return Vec256<int32_t>{_mm256_set1_epi32(t)};
222 HWY_API Vec256<int64_t>
Set(Full256<int64_t> ,
const int64_t t) {
223 return Vec256<int64_t>{
224 _mm256_set1_epi64x(
static_cast<long long>(t))};
226 HWY_API Vec256<float>
Set(Full256<float> ,
const float t) {
227 return Vec256<float>{_mm256_set1_ps(t)};
237 template <
typename T>
241 return Vec256<T>{_mm256_undefined_si256()};
256 template <
typename T>
257 HWY_API Vec256<T>
And(Vec256<T> a, Vec256<T> b) {
258 return Vec256<T>{_mm256_and_si256(a.raw, b.raw)};
271 template <
typename T>
272 HWY_API Vec256<T>
AndNot(Vec256<T> not_mask, Vec256<T> mask) {
273 return Vec256<T>{_mm256_andnot_si256(not_mask.raw, mask.raw)};
286 template <
typename T>
287 HWY_API Vec256<T>
Or(Vec256<T> a, Vec256<T> b) {
288 return Vec256<T>{_mm256_or_si256(a.raw, b.raw)};
300 template <
typename T>
301 HWY_API Vec256<T>
Xor(Vec256<T> a, Vec256<T> b) {
302 return Vec256<T>{_mm256_xor_si256(a.raw, b.raw)};
314 template <
typename T>
317 #if HWY_TARGET <= HWY_AVX3
318 const __m256i vu =
BitCast(Full256<TU>(),
v).raw;
320 Vec256<TU>{_mm256_ternarylogic_epi32(vu, vu, vu, 0x55)});
322 return Xor(
v,
BitCast(Full256<T>(), Vec256<TU>{_mm256_set1_epi32(-1)}));
328 template <
typename T>
329 HWY_API Vec256<T>
OrAnd(Vec256<T> o, Vec256<T> a1, Vec256<T> a2) {
330 #if HWY_TARGET <= HWY_AVX3
333 using VU =
VFromD<decltype(du)>;
334 const __m256i ret = _mm256_ternarylogic_epi64(
338 return Or(o,
And(a1, a2));
344 template <
typename T>
346 #if HWY_TARGET <= HWY_AVX3
349 using VU =
VFromD<decltype(du)>;
360 template <
typename T>
365 template <
typename T>
370 template <
typename T>
378 #if HWY_TARGET == HWY_AVX3_DL
380 #ifdef HWY_NATIVE_POPCNT
381 #undef HWY_NATIVE_POPCNT
383 #define HWY_NATIVE_POPCNT
388 template <
typename T>
392 template <
typename T>
396 template <
typename T>
400 template <
typename T>
407 template <
typename T>
418 template <
typename T>
419 HWY_API Vec256<T>
CopySign(
const Vec256<T> magn,
const Vec256<T> sign) {
420 static_assert(IsFloat<T>(),
"Only makes sense for floating-point");
425 #if HWY_TARGET <= HWY_AVX3
426 const Rebind<MakeUnsigned<T>, decltype(
d)> du;
437 const __m256i out = _mm256_ternarylogic_epi32(
445 template <
typename T>
447 #if HWY_TARGET <= HWY_AVX3
457 #if HWY_TARGET <= HWY_AVX3
466 template <
typename T>
471 template <
typename T>
476 template <
typename T>
481 template <
typename T>
489 template <
typename T>
504 template <
typename T>
509 template <
typename T>
514 template <
typename T>
519 template <
typename T>
527 template <
typename T>
541 template <
typename T>
547 template <
typename T>
552 template <
typename T>
557 template <
typename T>
565 template <
typename T>
576 template <
typename T, HWY_IF_FLOAT(T)>
586 template <
typename T>
589 #if HWY_COMPILER_HAS_MASK_INTRINSICS
595 template <
typename T>
598 #if HWY_COMPILER_HAS_MASK_INTRINSICS
604 template <
typename T>
607 #if HWY_COMPILER_HAS_MASK_INTRINSICS
613 template <
typename T>
616 #if HWY_COMPILER_HAS_MASK_INTRINSICS
623 template <
typename T>
626 #if HWY_COMPILER_HAS_MASK_INTRINSICS
632 template <
typename T>
635 #if HWY_COMPILER_HAS_MASK_INTRINSICS
641 template <
typename T>
644 #if HWY_COMPILER_HAS_MASK_INTRINSICS
650 template <
typename T>
653 #if HWY_COMPILER_HAS_MASK_INTRINSICS
660 template <
typename T>
663 #if HWY_COMPILER_HAS_MASK_INTRINSICS
669 template <
typename T>
672 #if HWY_COMPILER_HAS_MASK_INTRINSICS
678 template <
typename T>
681 #if HWY_COMPILER_HAS_MASK_INTRINSICS
687 template <
typename T>
690 #if HWY_COMPILER_HAS_MASK_INTRINSICS
697 template <
typename T>
700 #if HWY_COMPILER_HAS_MASK_INTRINSICS
706 template <
typename T>
709 #if HWY_COMPILER_HAS_MASK_INTRINSICS
715 template <
typename T>
718 #if HWY_COMPILER_HAS_MASK_INTRINSICS
724 template <
typename T>
727 #if HWY_COMPILER_HAS_MASK_INTRINSICS
736 template <
typename T>
741 template <
typename T>
746 template <
typename T>
751 template <
typename T>
756 template <
typename T>
759 constexpr
size_t N = 32 /
sizeof(T);
768 template <
typename T>
770 return Mask256<T>{
v.raw};
773 template <
typename T>
775 return Vec256<T>{
v.raw};
778 template <
typename T>
780 return Vec256<T>{
v.raw};
786 template <
typename T>
788 const Vec256<T> no) {
789 return Vec256<T>{_mm256_blendv_epi8(no.raw, yes.raw, mask.raw)};
792 const Vec256<float> yes,
793 const Vec256<float> no) {
794 return Vec256<float>{_mm256_blendv_ps(no.raw, yes.raw, mask.raw)};
797 const Vec256<double> yes,
798 const Vec256<double> no) {
799 return Vec256<double>{_mm256_blendv_pd(no.raw, yes.raw, mask.raw)};
803 template <
typename T>
809 template <
typename T>
814 template <
typename T, HWY_IF_FLOAT(T)>
816 const auto zero =
Zero(Full256<T>());
823 template <
typename T>
828 template <
typename T>
829 HWY_API Mask256<T>
And(
const Mask256<T> a, Mask256<T> b) {
834 template <
typename T>
835 HWY_API Mask256<T>
AndNot(
const Mask256<T> a, Mask256<T> b) {
840 template <
typename T>
841 HWY_API Mask256<T>
Or(
const Mask256<T> a, Mask256<T> b) {
846 template <
typename T>
847 HWY_API Mask256<T>
Xor(
const Mask256<T> a, Mask256<T> b) {
856 #if HWY_TARGET <= HWY_AVX3
860 template <
typename TFrom,
typename TTo>
862 static_assert(
sizeof(TFrom) ==
sizeof(TTo),
"Must have same size");
863 return Mask256<TTo>{m.raw};
868 template <
typename T>
873 template <
typename T>
878 template <
typename T>
883 template <
typename T>
891 template <
typename T>
893 static_assert(!hwy::IsFloat<T>(),
"Only integer vectors supported");
899 template <
typename T, HWY_IF_LANE_SIZE(T, 1)>
903 template <
typename T, HWY_IF_LANE_SIZE(T, 2)>
905 return Mask256<T>{_mm256_cmpeq_epi16_mask(a.raw, b.raw)};
907 template <
typename T, HWY_IF_LANE_SIZE(T, 4)>
909 return Mask256<T>{_mm256_cmpeq_epi32_mask(a.raw, b.raw)};
911 template <
typename T, HWY_IF_LANE_SIZE(T, 8)>
913 return Mask256<T>{_mm256_cmpeq_epi64_mask(a.raw, b.raw)};
917 return Mask256<float>{_mm256_cmp_ps_mask(a.raw, b.raw, _CMP_EQ_OQ)};
926 template <
typename T, HWY_IF_LANE_SIZE(T, 1)>
930 template <
typename T, HWY_IF_LANE_SIZE(T, 2)>
932 return Mask256<T>{_mm256_cmpneq_epi16_mask(a.raw, b.raw)};
934 template <
typename T, HWY_IF_LANE_SIZE(T, 4)>
936 return Mask256<T>{_mm256_cmpneq_epi32_mask(a.raw, b.raw)};
938 template <
typename T, HWY_IF_LANE_SIZE(T, 8)>
940 return Mask256<T>{_mm256_cmpneq_epi64_mask(a.raw, b.raw)};
944 return Mask256<float>{_mm256_cmp_ps_mask(a.raw, b.raw, _CMP_NEQ_OQ)};
954 return Mask256<int8_t>{_mm256_cmpgt_epi8_mask(a.raw, b.raw)};
957 return Mask256<int16_t>{_mm256_cmpgt_epi16_mask(a.raw, b.raw)};
960 return Mask256<int32_t>{_mm256_cmpgt_epi32_mask(a.raw, b.raw)};
963 return Mask256<int64_t>{_mm256_cmpgt_epi64_mask(a.raw, b.raw)};
983 return Mask256<float>{_mm256_cmp_ps_mask(a.raw, b.raw, _CMP_GT_OQ)};
992 return Mask256<float>{_mm256_cmp_ps_mask(a.raw, b.raw, _CMP_GE_OQ)};
1002 template <
typename T>
1006 template <
typename T>
1010 template <
typename T>
1014 template <
typename T>
1021 template <
typename T>
1033 template <
typename T, HWY_IF_LANE_SIZE(T, 1)>
1038 template <
typename T, HWY_IF_LANE_SIZE(T, 2)>
1040 return Vec256<T>{_mm256_movm_epi16(
v.raw)};
1043 template <
typename T, HWY_IF_LANE_SIZE(T, 4)>
1045 return Vec256<T>{_mm256_movm_epi32(
v.raw)};
1048 template <
typename T, HWY_IF_LANE_SIZE(T, 8)>
1050 return Vec256<T>{_mm256_movm_epi64(
v.raw)};
1054 return Vec256<float>{_mm256_castsi256_ps(_mm256_movm_epi32(
v.raw))};
1061 template <
typename T>
1070 template <
typename TFrom,
typename TTo>
1072 static_assert(
sizeof(TFrom) ==
sizeof(TTo),
"Must have same size");
1076 template <
typename T>
1078 static_assert(!hwy::IsFloat<T>(),
"Only integer vectors supported");
1079 return (
v & bit) == bit;
1084 template <
typename T, HWY_IF_LANE_SIZE(T, 1)>
1086 return Mask256<T>{_mm256_cmpeq_epi8(a.raw, b.raw)};
1089 template <
typename T, HWY_IF_LANE_SIZE(T, 2)>
1091 return Mask256<T>{_mm256_cmpeq_epi16(a.raw, b.raw)};
1094 template <
typename T, HWY_IF_LANE_SIZE(T, 4)>
1096 return Mask256<T>{_mm256_cmpeq_epi32(a.raw, b.raw)};
1099 template <
typename T, HWY_IF_LANE_SIZE(T, 8)>
1101 return Mask256<T>{_mm256_cmpeq_epi64(a.raw, b.raw)};
1105 const Vec256<float> b) {
1106 return Mask256<float>{_mm256_cmp_ps(a.raw, b.raw, _CMP_EQ_OQ)};
1110 const Vec256<double> b) {
1111 return Mask256<double>{_mm256_cmp_pd(a.raw, b.raw, _CMP_EQ_OQ)};
1116 template <
typename T, HWY_IF_NOT_FLOAT(T)>
1122 const Vec256<float> b) {
1123 return Mask256<float>{_mm256_cmp_ps(a.raw, b.raw, _CMP_NEQ_OQ)};
1126 const Vec256<double> b) {
1127 return Mask256<double>{_mm256_cmp_pd(a.raw, b.raw, _CMP_NEQ_OQ)};
1135 #if HWY_COMPILER_GCC != 0 && HWY_COMPILER_GCC < 930
1136 #define HWY_AVX2_GCC_CMPGT8_WORKAROUND 1
1138 #define HWY_AVX2_GCC_CMPGT8_WORKAROUND 0
1142 #if HWY_AVX2_GCC_CMPGT8_WORKAROUND
1143 using i8x32 =
signed char __attribute__((__vector_size__(32)));
1144 return Mask256<int8_t>{
static_cast<__m256i
>(
reinterpret_cast<i8x32
>(a.raw) >
1145 reinterpret_cast<i8x32
>(b.raw))};
1147 return Mask256<int8_t>{_mm256_cmpgt_epi8(a.raw, b.raw)};
1151 const Vec256<int16_t> b) {
1152 return Mask256<int16_t>{_mm256_cmpgt_epi16(a.raw, b.raw)};
1155 const Vec256<int32_t> b) {
1156 return Mask256<int32_t>{_mm256_cmpgt_epi32(a.raw, b.raw)};
1159 const Vec256<int64_t> b) {
1160 return Mask256<int64_t>{_mm256_cmpgt_epi64(a.raw, b.raw)};
1163 template <
typename T, HWY_IF_UNSIGNED(T)>
1165 const Full256<T> du;
1167 const Vec256<T> msb =
Set(du, (LimitsMax<T>() >> 1) + 1);
1171 HWY_API Mask256<float>
operator>(
const Vec256<float> a,
const Vec256<float> b) {
1172 return Mask256<float>{_mm256_cmp_ps(a.raw, b.raw, _CMP_GT_OQ)};
1175 return Mask256<double>{_mm256_cmp_pd(a.raw, b.raw, _CMP_GT_OQ)};
1181 const Vec256<float> b) {
1182 return Mask256<float>{_mm256_cmp_ps(a.raw, b.raw, _CMP_GE_OQ)};
1185 const Vec256<double> b) {
1186 return Mask256<double>{_mm256_cmp_pd(a.raw, b.raw, _CMP_GE_OQ)};
1193 template <
typename T>
1198 template <
typename T>
1206 HWY_API Vec256<uint8_t>
Min(
const Vec256<uint8_t> a,
const Vec256<uint8_t> b) {
1207 return Vec256<uint8_t>{_mm256_min_epu8(a.raw, b.raw)};
1209 HWY_API Vec256<uint16_t>
Min(
const Vec256<uint16_t> a,
1210 const Vec256<uint16_t> b) {
1211 return Vec256<uint16_t>{_mm256_min_epu16(a.raw, b.raw)};
1213 HWY_API Vec256<uint32_t>
Min(
const Vec256<uint32_t> a,
1214 const Vec256<uint32_t> b) {
1215 return Vec256<uint32_t>{_mm256_min_epu32(a.raw, b.raw)};
1217 HWY_API Vec256<uint64_t>
Min(
const Vec256<uint64_t> a,
1218 const Vec256<uint64_t> b) {
1219 #if HWY_TARGET <= HWY_AVX3
1220 return Vec256<uint64_t>{_mm256_min_epu64(a.raw, b.raw)};
1222 const Full256<uint64_t> du;
1223 const Full256<int64_t> di;
1224 const auto msb =
Set(du, 1ull << 63);
1231 HWY_API Vec256<int8_t>
Min(
const Vec256<int8_t> a,
const Vec256<int8_t> b) {
1232 return Vec256<int8_t>{_mm256_min_epi8(a.raw, b.raw)};
1234 HWY_API Vec256<int16_t>
Min(
const Vec256<int16_t> a,
const Vec256<int16_t> b) {
1235 return Vec256<int16_t>{_mm256_min_epi16(a.raw, b.raw)};
1237 HWY_API Vec256<int32_t>
Min(
const Vec256<int32_t> a,
const Vec256<int32_t> b) {
1238 return Vec256<int32_t>{_mm256_min_epi32(a.raw, b.raw)};
1240 HWY_API Vec256<int64_t>
Min(
const Vec256<int64_t> a,
const Vec256<int64_t> b) {
1241 #if HWY_TARGET <= HWY_AVX3
1242 return Vec256<int64_t>{_mm256_min_epi64(a.raw, b.raw)};
1249 HWY_API Vec256<float>
Min(
const Vec256<float> a,
const Vec256<float> b) {
1250 return Vec256<float>{_mm256_min_ps(a.raw, b.raw)};
1259 HWY_API Vec256<uint8_t>
Max(
const Vec256<uint8_t> a,
const Vec256<uint8_t> b) {
1260 return Vec256<uint8_t>{_mm256_max_epu8(a.raw, b.raw)};
1262 HWY_API Vec256<uint16_t>
Max(
const Vec256<uint16_t> a,
1263 const Vec256<uint16_t> b) {
1264 return Vec256<uint16_t>{_mm256_max_epu16(a.raw, b.raw)};
1266 HWY_API Vec256<uint32_t>
Max(
const Vec256<uint32_t> a,
1267 const Vec256<uint32_t> b) {
1268 return Vec256<uint32_t>{_mm256_max_epu32(a.raw, b.raw)};
1270 HWY_API Vec256<uint64_t>
Max(
const Vec256<uint64_t> a,
1271 const Vec256<uint64_t> b) {
1272 #if HWY_TARGET <= HWY_AVX3
1273 return Vec256<uint64_t>{_mm256_max_epu64(a.raw, b.raw)};
1275 const Full256<uint64_t> du;
1276 const Full256<int64_t> di;
1277 const auto msb =
Set(du, 1ull << 63);
1284 HWY_API Vec256<int8_t>
Max(
const Vec256<int8_t> a,
const Vec256<int8_t> b) {
1285 return Vec256<int8_t>{_mm256_max_epi8(a.raw, b.raw)};
1287 HWY_API Vec256<int16_t>
Max(
const Vec256<int16_t> a,
const Vec256<int16_t> b) {
1288 return Vec256<int16_t>{_mm256_max_epi16(a.raw, b.raw)};
1290 HWY_API Vec256<int32_t>
Max(
const Vec256<int32_t> a,
const Vec256<int32_t> b) {
1291 return Vec256<int32_t>{_mm256_max_epi32(a.raw, b.raw)};
1293 HWY_API Vec256<int64_t>
Max(
const Vec256<int64_t> a,
const Vec256<int64_t> b) {
1294 #if HWY_TARGET <= HWY_AVX3
1295 return Vec256<int64_t>{_mm256_max_epi64(a.raw, b.raw)};
1302 HWY_API Vec256<float>
Max(
const Vec256<float> a,
const Vec256<float> b) {
1303 return Vec256<float>{_mm256_max_ps(a.raw, b.raw)};
1311 template <
typename T>
1313 #if HWY_TARGET <= HWY_AVX3
1315 constexpr
size_t N = 32 /
sizeof(T);
1317 const uint64_t all = (1ull <<
N) - 1;
1321 const uint32_t all =
static_cast<uint32_t
>((1ull <<
N) - 1);
1324 (n > 255) ? all : _bzhi_u32(all,
static_cast<uint32_t
>(n)));
1338 const Vec256<uint8_t> b) {
1339 return Vec256<uint8_t>{_mm256_add_epi8(a.raw, b.raw)};
1342 const Vec256<uint16_t> b) {
1343 return Vec256<uint16_t>{_mm256_add_epi16(a.raw, b.raw)};
1346 const Vec256<uint32_t> b) {
1347 return Vec256<uint32_t>{_mm256_add_epi32(a.raw, b.raw)};
1356 const Vec256<int8_t> b) {
1357 return Vec256<int8_t>{_mm256_add_epi8(a.raw, b.raw)};
1360 const Vec256<int16_t> b) {
1361 return Vec256<int16_t>{_mm256_add_epi16(a.raw, b.raw)};
1364 const Vec256<int32_t> b) {
1365 return Vec256<int32_t>{_mm256_add_epi32(a.raw, b.raw)};
1373 HWY_API Vec256<float>
operator+(
const Vec256<float> a,
const Vec256<float> b) {
1374 return Vec256<float>{_mm256_add_ps(a.raw, b.raw)};
1385 const Vec256<uint8_t> b) {
1386 return Vec256<uint8_t>{_mm256_sub_epi8(a.raw, b.raw)};
1389 const Vec256<uint16_t> b) {
1390 return Vec256<uint16_t>{_mm256_sub_epi16(a.raw, b.raw)};
1393 const Vec256<uint32_t> b) {
1394 return Vec256<uint32_t>{_mm256_sub_epi32(a.raw, b.raw)};
1403 const Vec256<int8_t> b) {
1404 return Vec256<int8_t>{_mm256_sub_epi8(a.raw, b.raw)};
1407 const Vec256<int16_t> b) {
1408 return Vec256<int16_t>{_mm256_sub_epi16(a.raw, b.raw)};
1411 const Vec256<int32_t> b) {
1412 return Vec256<int32_t>{_mm256_sub_epi32(a.raw, b.raw)};
1420 HWY_API Vec256<float>
operator-(
const Vec256<float> a,
const Vec256<float> b) {
1421 return Vec256<float>{_mm256_sub_ps(a.raw, b.raw)};
1430 return Vec256<uint64_t>{_mm256_sad_epu8(
v.raw, _mm256_setzero_si256())};
1439 const Vec256<uint8_t> b) {
1440 return Vec256<uint8_t>{_mm256_adds_epu8(a.raw, b.raw)};
1443 const Vec256<uint16_t> b) {
1444 return Vec256<uint16_t>{_mm256_adds_epu16(a.raw, b.raw)};
1449 const Vec256<int8_t> b) {
1450 return Vec256<int8_t>{_mm256_adds_epi8(a.raw, b.raw)};
1453 const Vec256<int16_t> b) {
1454 return Vec256<int16_t>{_mm256_adds_epi16(a.raw, b.raw)};
1463 const Vec256<uint8_t> b) {
1464 return Vec256<uint8_t>{_mm256_subs_epu8(a.raw, b.raw)};
1467 const Vec256<uint16_t> b) {
1468 return Vec256<uint16_t>{_mm256_subs_epu16(a.raw, b.raw)};
1473 const Vec256<int8_t> b) {
1474 return Vec256<int8_t>{_mm256_subs_epi8(a.raw, b.raw)};
1477 const Vec256<int16_t> b) {
1478 return Vec256<int16_t>{_mm256_subs_epi16(a.raw, b.raw)};
1487 const Vec256<uint8_t> b) {
1488 return Vec256<uint8_t>{_mm256_avg_epu8(a.raw, b.raw)};
1491 const Vec256<uint16_t> b) {
1492 return Vec256<uint16_t>{_mm256_avg_epu16(a.raw, b.raw)};
1498 HWY_API Vec256<int8_t>
Abs(
const Vec256<int8_t>
v) {
1499 #if HWY_COMPILER_MSVC
1501 const auto zero =
Zero(Full256<int8_t>());
1502 return Vec256<int8_t>{_mm256_max_epi8(
v.raw, (zero -
v).raw)};
1504 return Vec256<int8_t>{_mm256_abs_epi8(
v.raw)};
1507 HWY_API Vec256<int16_t>
Abs(
const Vec256<int16_t>
v) {
1508 return Vec256<int16_t>{_mm256_abs_epi16(
v.raw)};
1510 HWY_API Vec256<int32_t>
Abs(
const Vec256<int32_t>
v) {
1511 return Vec256<int32_t>{_mm256_abs_epi32(
v.raw)};
1515 HWY_API Vec256<float>
Abs(
const Vec256<float>
v) {
1516 const Vec256<int32_t> mask{_mm256_set1_epi32(0x7FFFFFFF)};
1517 return v &
BitCast(Full256<float>(), mask);
1527 HWY_API Vec256<uint16_t>
operator*(Vec256<uint16_t> a, Vec256<uint16_t> b) {
1528 return Vec256<uint16_t>{_mm256_mullo_epi16(a.raw, b.raw)};
1530 HWY_API Vec256<uint32_t>
operator*(Vec256<uint32_t> a, Vec256<uint32_t> b) {
1531 return Vec256<uint32_t>{_mm256_mullo_epi32(a.raw, b.raw)};
1536 return Vec256<int16_t>{_mm256_mullo_epi16(a.raw, b.raw)};
1539 return Vec256<int32_t>{_mm256_mullo_epi32(a.raw, b.raw)};
1543 HWY_API Vec256<uint16_t>
MulHigh(Vec256<uint16_t> a, Vec256<uint16_t> b) {
1544 return Vec256<uint16_t>{_mm256_mulhi_epu16(a.raw, b.raw)};
1546 HWY_API Vec256<int16_t>
MulHigh(Vec256<int16_t> a, Vec256<int16_t> b) {
1547 return Vec256<int16_t>{_mm256_mulhi_epi16(a.raw, b.raw)};
1551 return Vec256<int16_t>{_mm256_mulhrs_epi16(a.raw, b.raw)};
1556 HWY_API Vec256<int64_t>
MulEven(Vec256<int32_t> a, Vec256<int32_t> b) {
1557 return Vec256<int64_t>{_mm256_mul_epi32(a.raw, b.raw)};
1559 HWY_API Vec256<uint64_t>
MulEven(Vec256<uint32_t> a, Vec256<uint32_t> b) {
1560 return Vec256<uint64_t>{_mm256_mul_epu32(a.raw, b.raw)};
1565 template <
int kBits>
1567 return Vec256<uint16_t>{_mm256_slli_epi16(
v.raw, kBits)};
1570 template <
int kBits>
1572 return Vec256<uint32_t>{_mm256_slli_epi32(
v.raw, kBits)};
1575 template <
int kBits>
1580 template <
int kBits>
1582 return Vec256<int16_t>{_mm256_slli_epi16(
v.raw, kBits)};
1585 template <
int kBits>
1587 return Vec256<int32_t>{_mm256_slli_epi32(
v.raw, kBits)};
1590 template <
int kBits>
1595 template <
int kBits,
typename T, HWY_IF_LANE_SIZE(T, 1)>
1597 const Full256<T> d8;
1602 : (shifted &
Set(d8,
static_cast<T
>((0xFF << kBits) & 0xFF)));
1607 template <
int kBits>
1609 return Vec256<uint16_t>{_mm256_srli_epi16(
v.raw, kBits)};
1612 template <
int kBits>
1614 return Vec256<uint32_t>{_mm256_srli_epi32(
v.raw, kBits)};
1617 template <
int kBits>
1622 template <
int kBits>
1624 const Full256<uint8_t> d8;
1626 const Vec256<uint8_t> shifted{ShiftRight<kBits>(Vec256<uint16_t>{
v.raw}).raw};
1627 return shifted &
Set(d8, 0xFF >> kBits);
1630 template <
int kBits>
1632 return Vec256<int16_t>{_mm256_srai_epi16(
v.raw, kBits)};
1635 template <
int kBits>
1637 return Vec256<int32_t>{_mm256_srai_epi32(
v.raw, kBits)};
1640 template <
int kBits>
1642 const Full256<int8_t> di;
1643 const Full256<uint8_t> du;
1645 const auto shifted_sign =
BitCast(di,
Set(du, 0x80 >> kBits));
1646 return (shifted ^ shifted_sign) - shifted_sign;
1653 template <
int kBits>
1655 static_assert(0 <= kBits && kBits < 32,
"Invalid shift count");
1656 #if HWY_TARGET <= HWY_AVX3
1659 if (kBits == 0)
return v;
1664 template <
int kBits>
1666 static_assert(0 <= kBits && kBits < 64,
"Invalid shift count");
1667 #if HWY_TARGET <= HWY_AVX3
1670 if (kBits == 0)
return v;
1682 return ShiftRight<15>(
v);
1686 return ShiftRight<31>(
v);
1690 #if HWY_TARGET == HWY_AVX2
1697 template <
int kBits>
1699 #if HWY_TARGET <= HWY_AVX3
1706 return right | sign;
1710 HWY_API Vec256<int64_t>
Abs(
const Vec256<int64_t>
v) {
1711 #if HWY_TARGET <= HWY_AVX3
1712 return Vec256<int64_t>{_mm256_abs_epi64(
v.raw)};
1714 const auto zero =
Zero(Full256<int64_t>());
1726 template <
typename T, HWY_IF_LANE_SIZE(T, 2)>
1728 static_assert(IsSigned<T>(),
"Only works for signed/float");
1737 template <
typename T, HWY_IF_NOT_LANE_SIZE(T, 2)>
1739 static_assert(IsSigned<T>(),
"Only works for signed/float");
1752 return Vec256<uint16_t>{_mm256_sll_epi16(
v.raw, _mm_cvtsi32_si128(bits))};
1756 return Vec256<uint32_t>{_mm256_sll_epi32(
v.raw, _mm_cvtsi32_si128(bits))};
1764 return Vec256<int16_t>{_mm256_sll_epi16(
v.raw, _mm_cvtsi32_si128(bits))};
1768 return Vec256<int32_t>{_mm256_sll_epi32(
v.raw, _mm_cvtsi32_si128(bits))};
1775 template <
typename T, HWY_IF_LANE_SIZE(T, 1)>
1777 const Full256<T> d8;
1780 return shifted &
Set(d8,
static_cast<T
>((0xFF << bits) & 0xFF));
1787 return Vec256<uint16_t>{_mm256_srl_epi16(
v.raw, _mm_cvtsi32_si128(bits))};
1791 return Vec256<uint32_t>{_mm256_srl_epi32(
v.raw, _mm_cvtsi32_si128(bits))};
1799 const Full256<uint8_t> d8;
1802 return shifted &
Set(d8,
static_cast<uint8_t
>(0xFF >> bits));
1807 return Vec256<int16_t>{_mm256_sra_epi16(
v.raw, _mm_cvtsi32_si128(bits))};
1812 return Vec256<int32_t>{_mm256_sra_epi32(
v.raw, _mm_cvtsi32_si128(bits))};
1816 #if HWY_TARGET <= HWY_AVX3
1823 return right | sign;
1828 const Full256<int8_t> di;
1829 const Full256<uint8_t> du;
1831 const auto shifted_sign =
1832 BitCast(di,
Set(du,
static_cast<uint8_t
>(0x80 >> bits)));
1833 return (shifted ^ shifted_sign) - shifted_sign;
1838 template <
typename T, HWY_IF_FLOAT(T)>
1843 template <
typename T, HWY_IF_NOT_FLOAT(T)>
1845 return Zero(Full256<T>()) -
v;
1850 HWY_API Vec256<float>
operator*(
const Vec256<float> a,
const Vec256<float> b) {
1851 return Vec256<float>{_mm256_mul_ps(a.raw, b.raw)};
1858 HWY_API Vec256<float>
operator/(
const Vec256<float> a,
const Vec256<float> b) {
1859 return Vec256<float>{_mm256_div_ps(a.raw, b.raw)};
1868 return Vec256<float>{_mm256_rcp_ps(
v.raw)};
1872 HWY_API Vec256<float>
AbsDiff(
const Vec256<float> a,
const Vec256<float> b) {
1879 HWY_API Vec256<float>
MulAdd(
const Vec256<float> mul,
const Vec256<float> x,
1880 const Vec256<float> add) {
1881 #ifdef HWY_DISABLE_BMI2_FMA
1882 return mul * x + add;
1884 return Vec256<float>{_mm256_fmadd_ps(mul.raw, x.raw, add.raw)};
1889 #ifdef HWY_DISABLE_BMI2_FMA
1890 return mul * x + add;
1897 HWY_API Vec256<float>
NegMulAdd(
const Vec256<float> mul,
const Vec256<float> x,
1898 const Vec256<float> add) {
1899 #ifdef HWY_DISABLE_BMI2_FMA
1900 return add - mul * x;
1902 return Vec256<float>{_mm256_fnmadd_ps(mul.raw, x.raw, add.raw)};
1908 #ifdef HWY_DISABLE_BMI2_FMA
1909 return add - mul * x;
1916 HWY_API Vec256<float>
MulSub(
const Vec256<float> mul,
const Vec256<float> x,
1917 const Vec256<float> sub) {
1918 #ifdef HWY_DISABLE_BMI2_FMA
1919 return mul * x - sub;
1921 return Vec256<float>{_mm256_fmsub_ps(mul.raw, x.raw, sub.raw)};
1926 #ifdef HWY_DISABLE_BMI2_FMA
1927 return mul * x - sub;
1934 HWY_API Vec256<float>
NegMulSub(
const Vec256<float> mul,
const Vec256<float> x,
1935 const Vec256<float> sub) {
1936 #ifdef HWY_DISABLE_BMI2_FMA
1937 return Neg(mul * x) - sub;
1939 return Vec256<float>{_mm256_fnmsub_ps(mul.raw, x.raw, sub.raw)};
1945 #ifdef HWY_DISABLE_BMI2_FMA
1946 return Neg(mul * x) - sub;
1956 return Vec256<float>{_mm256_sqrt_ps(
v.raw)};
1964 return Vec256<float>{_mm256_rsqrt_ps(
v.raw)};
1971 return Vec256<float>{
1972 _mm256_round_ps(
v.raw, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)};
1976 _mm256_round_pd(
v.raw, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)};
1981 return Vec256<float>{
1982 _mm256_round_ps(
v.raw, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)};
1986 _mm256_round_pd(
v.raw, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)};
1991 return Vec256<float>{
1992 _mm256_round_ps(
v.raw, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)};
1996 _mm256_round_pd(
v.raw, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)};
2001 return Vec256<float>{
2002 _mm256_round_ps(
v.raw, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)};
2006 _mm256_round_pd(
v.raw, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)};
2013 template <
typename T>
2016 _mm256_load_si256(
reinterpret_cast<const __m256i*
>(aligned))};
2027 template <
typename T>
2029 return Vec256<T>{_mm256_loadu_si256(
reinterpret_cast<const __m256i*
>(p))};
2042 #if HWY_TARGET <= HWY_AVX3
2044 template <
typename T, HWY_IF_LANE_SIZE(T, 1)>
2050 template <
typename T, HWY_IF_LANE_SIZE(T, 2)>
2053 return Vec256<T>{_mm256_maskz_loadu_epi16(m.raw, p)};
2056 template <
typename T, HWY_IF_LANE_SIZE(T, 4)>
2059 return Vec256<T>{_mm256_maskz_loadu_epi32(m.raw, p)};
2062 template <
typename T, HWY_IF_LANE_SIZE(T, 8)>
2065 return Vec256<T>{_mm256_maskz_loadu_epi64(m.raw, p)};
2081 template <
typename T, hwy::EnableIf<sizeof(T) <= 2>* =
nullptr>
2082 HWY_API Vec256<T> MaskedLoad(Mask256<T> m, Full256<T> d,
2083 const T* HWY_RESTRICT p) {
2084 return IfThenElseZero(m, LoadU(d, p));
2087 template <
typename T, HWY_IF_LANE_SIZE(T, 4)>
2090 auto pi =
reinterpret_cast<const int*
>(p);
2091 return Vec256<T>{_mm256_maskload_epi32(pi, m.raw)};
2094 template <
typename T, HWY_IF_LANE_SIZE(T, 8)>
2097 auto pi =
reinterpret_cast<const long long*
>(p);
2098 return Vec256<T>{_mm256_maskload_epi64(pi, m.raw)};
2103 const Vec256<int32_t> mi =
2105 return Vec256<float>{_mm256_maskload_ps(p, mi.raw)};
2110 const Vec256<int64_t> mi =
2112 return Vec256<double>{_mm256_maskload_pd(p, mi.raw)};
2121 template <
typename T>
2125 asm(
"vbroadcasti128 %1, %[reg]" : [ reg ]
"=x"(out) :
"m"(p[0]));
2126 return Vec256<T>{out};
2127 #elif HWY_COMPILER_MSVC && !HWY_COMPILER_CLANG
2131 const __m128i v128 =
LoadU(Full128<T>(), p).raw;
2133 _mm256_inserti128_si256(_mm256_castsi128_si256(v128), v128, 1)};
2135 return Vec256<T>{_mm256_broadcastsi128_si256(
LoadU(Full128<T>(), p).raw)};
2142 asm(
"vbroadcastf128 %1, %[reg]" : [ reg ]
"=x"(out) :
"m"(p[0]));
2144 #elif HWY_COMPILER_MSVC && !HWY_COMPILER_CLANG
2147 _mm256_insertf128_ps(_mm256_castps128_ps256(v128), v128, 1)};
2149 return Vec256<float>{_mm256_broadcast_ps(
reinterpret_cast<const __m128*
>(p))};
2156 asm(
"vbroadcastf128 %1, %[reg]" : [ reg ]
"=x"(out) :
"m"(p[0]));
2158 #elif HWY_COMPILER_MSVC && !HWY_COMPILER_CLANG
2161 _mm256_insertf128_pd(_mm256_castpd128_pd256(v128), v128, 1)};
2164 _mm256_broadcast_pd(
reinterpret_cast<const __m128d*
>(p))};
2170 template <
typename T>
2172 _mm256_store_si256(
reinterpret_cast<__m256i*
>(aligned),
v.raw);
2176 _mm256_store_ps(aligned,
v.raw);
2180 _mm256_store_pd(aligned,
v.raw);
2183 template <
typename T>
2185 _mm256_storeu_si256(
reinterpret_cast<__m256i*
>(p),
v.raw);
2189 _mm256_storeu_ps(p,
v.raw);
2193 _mm256_storeu_pd(p,
v.raw);
2198 #if HWY_TARGET <= HWY_AVX3
2200 template <
typename T, HWY_IF_LANE_SIZE(T, 1)>
2203 _mm256_mask_storeu_epi8(p, m.
raw,
v.raw);
2206 template <
typename T, HWY_IF_LANE_SIZE(T, 2)>
2209 _mm256_mask_storeu_epi16(p, m.raw,
v.raw);
2212 template <
typename T, HWY_IF_LANE_SIZE(T, 4)>
2215 _mm256_mask_storeu_epi32(p, m.raw,
v.raw);
2218 template <
typename T, HWY_IF_LANE_SIZE(T, 8)>
2221 _mm256_mask_storeu_epi64(p, m.raw,
v.raw);
2226 _mm256_mask_storeu_ps(p, m.
raw,
v.raw);
2231 _mm256_mask_storeu_pd(p, m.
raw,
v.raw);
2245 template <
typename T, hwy::EnableIf<sizeof(T) <= 2>* =
nullptr>
2246 HWY_API
void BlendedStore(Vec256<T> v, Mask256<T> m, Full256<T> d,
2247 T* HWY_RESTRICT p) {
2251 const RebindToUn
signed<decltype(d)> du;
2252 using TU = TFromD<decltype(du)>;
2253 alignas(32) TU buf[32 / sizeof(T)];
2254 alignas(32) TU mask[32 / sizeof(T)];
2255 Store(BitCast(du, v), du, buf);
2256 Store(BitCast(du, VecFromMask(d, m)), du, mask);
2257 for (
size_t i = 0; i < 32 / sizeof(T); ++i) {
2259 CopyBytes<sizeof(T)>(buf + i, p + i);
2264 template <
typename T, HWY_IF_LANE_SIZE(T, 4)>
2267 auto pi =
reinterpret_cast<int*
>(p);
2268 _mm256_maskstore_epi32(pi, m.raw,
v.raw);
2271 template <
typename T, HWY_IF_LANE_SIZE(T, 8)>
2274 auto pi =
reinterpret_cast<long long*
>(p);
2275 _mm256_maskstore_epi64(pi, m.raw,
v.raw);
2280 const Vec256<int32_t> mi =
2282 _mm256_maskstore_ps(p, mi.raw,
v.raw);
2287 const Vec256<int64_t> mi =
2289 _mm256_maskstore_pd(p, mi.raw,
v.raw);
2296 template <
typename T>
2299 _mm256_stream_si256(
reinterpret_cast<__m256i*
>(aligned),
v.raw);
2303 _mm256_stream_ps(aligned,
v.raw);
2307 _mm256_stream_pd(aligned,
v.raw);
2316 #if HWY_TARGET <= HWY_AVX3
2319 template <
typename T>
2323 _mm256_i32scatter_epi32(base, offset.
raw,
v.raw, 1);
2325 template <
typename T>
2329 _mm256_i32scatter_epi32(base, index.
raw,
v.raw, 4);
2332 template <
typename T>
2336 _mm256_i64scatter_epi64(base, offset.
raw,
v.raw, 1);
2338 template <
typename T>
2342 _mm256_i64scatter_epi64(base, index.
raw,
v.raw, 8);
2347 template <
typename T,
typename Offset>
2349 const Vec256<Offset> offset) {
2350 static_assert(
sizeof(T) ==
sizeof(Offset),
"Must match for portability");
2353 template <
typename T,
typename Index>
2355 const Vec256<Index> index) {
2356 static_assert(
sizeof(T) ==
sizeof(Index),
"Must match for portability");
2363 _mm256_i32scatter_ps(base, offset.
raw,
v.raw, 1);
2368 _mm256_i32scatter_ps(base, index.
raw,
v.raw, 4);
2374 _mm256_i64scatter_pd(base, offset.
raw,
v.raw, 1);
2379 _mm256_i64scatter_pd(base, index.
raw,
v.raw, 8);
2384 template <
typename T,
typename Offset>
2386 const Vec256<Offset> offset) {
2387 static_assert(
sizeof(T) ==
sizeof(Offset),
"Must match for portability");
2389 constexpr
size_t N = 32 /
sizeof(T);
2390 alignas(32) T lanes[
N];
2393 alignas(32) Offset offset_lanes[
N];
2394 Store(offset, Full256<Offset>(), offset_lanes);
2396 uint8_t* base_bytes =
reinterpret_cast<uint8_t*
>(base);
2397 for (
size_t i = 0; i <
N; ++i) {
2398 CopyBytes<sizeof(T)>(&lanes[i], base_bytes + offset_lanes[i]);
2402 template <
typename T,
typename Index>
2404 const Vec256<Index> index) {
2405 static_assert(
sizeof(T) ==
sizeof(Index),
"Must match for portability");
2407 constexpr
size_t N = 32 /
sizeof(T);
2408 alignas(32) T lanes[
N];
2411 alignas(32) Index index_lanes[
N];
2412 Store(index, Full256<Index>(), index_lanes);
2414 for (
size_t i = 0; i <
N; ++i) {
2415 base[index_lanes[i]] = lanes[i];
2425 template <
typename T>
2430 return Vec256<T>{_mm256_i32gather_epi32(
2431 reinterpret_cast<const int32_t*
>(base), offset.
raw, 1)};
2433 template <
typename T>
2438 return Vec256<T>{_mm256_i32gather_epi32(
2439 reinterpret_cast<const int32_t*
>(base), index.
raw, 4)};
2442 template <
typename T>
2447 return Vec256<T>{_mm256_i64gather_epi64(
2450 template <
typename T>
2455 return Vec256<T>{_mm256_i64gather_epi64(
2461 template <
typename T,
typename Offset>
2463 const Vec256<Offset> offset) {
2464 static_assert(
sizeof(T) ==
sizeof(Offset),
"Must match for portability");
2467 template <
typename T,
typename Index>
2469 const Vec256<Index> index) {
2470 static_assert(
sizeof(T) ==
sizeof(Index),
"Must match for portability");
2502 template <
typename T>
2504 return Vec128<T>{_mm256_castsi256_si128(
v.raw)};
2513 template <
typename T>
2520 template <
typename T>
2522 return Vec128<T>{_mm256_extracti128_si256(
v.raw, 1)};
2532 template <
typename T>
2550 template <
typename T>
2552 #if !HWY_COMPILER_CLANG && HWY_COMPILER_GCC && (HWY_COMPILER_GCC < 1000)
2553 return Vec256<T>{_mm256_inserti128_si256(_mm256_setzero_si256(), lo.raw, 0)};
2555 return Vec256<T>{_mm256_zextsi128_si256(lo.raw)};
2560 #if !HWY_COMPILER_CLANG && HWY_COMPILER_GCC && (HWY_COMPILER_GCC < 1000)
2561 return Vec256<float>{_mm256_insertf128_ps(_mm256_setzero_ps(), lo.
raw, 0)};
2568 #if !HWY_COMPILER_CLANG && HWY_COMPILER_GCC && (HWY_COMPILER_GCC < 1000)
2577 template <
typename T>
2580 return Vec256<T>{_mm256_inserti128_si256(lo256.raw, hi.raw, 1)};
2595 template <
int kBytes,
typename T>
2597 static_assert(0 <= kBytes && kBytes <= 16,
"Invalid kBytes");
2599 return Vec256<T>{_mm256_slli_si256(
v.raw, kBytes)};
2602 template <
int kBytes,
typename T>
2604 return ShiftLeftBytes<kBytes>(Full256<T>(),
v);
2609 template <
int kLanes,
typename T>
2615 template <
int kLanes,
typename T>
2617 return ShiftLeftLanes<kLanes>(Full256<T>(),
v);
2622 template <
int kBytes,
typename T>
2624 static_assert(0 <= kBytes && kBytes <= 16,
"Invalid kBytes");
2626 return Vec256<T>{_mm256_srli_si256(
v.raw, kBytes)};
2630 template <
int kLanes,
typename T>
2639 template <
int kBytes,
typename T,
class V = Vec256<T>>
2642 return BitCast(
d, Vec256<uint8_t>{_mm256_alignr_epi8(
2649 template <
int kLane>
2651 static_assert(0 <= kLane && kLane < 8,
"Invalid lane");
2653 const __m256i lo = _mm256_shufflelo_epi16(
v.raw, (0x55 * kLane) & 0xFF);
2654 return Vec256<uint16_t>{_mm256_unpacklo_epi64(lo, lo)};
2657 _mm256_shufflehi_epi16(
v.raw, (0x55 * (kLane - 4)) & 0xFF);
2658 return Vec256<uint16_t>{_mm256_unpackhi_epi64(hi, hi)};
2661 template <
int kLane>
2663 static_assert(0 <= kLane && kLane < 4,
"Invalid lane");
2664 return Vec256<uint32_t>{_mm256_shuffle_epi32(
v.raw, 0x55 * kLane)};
2666 template <
int kLane>
2668 static_assert(0 <= kLane && kLane < 2,
"Invalid lane");
2673 template <
int kLane>
2675 static_assert(0 <= kLane && kLane < 8,
"Invalid lane");
2677 const __m256i lo = _mm256_shufflelo_epi16(
v.raw, (0x55 * kLane) & 0xFF);
2678 return Vec256<int16_t>{_mm256_unpacklo_epi64(lo, lo)};
2681 _mm256_shufflehi_epi16(
v.raw, (0x55 * (kLane - 4)) & 0xFF);
2682 return Vec256<int16_t>{_mm256_unpackhi_epi64(hi, hi)};
2685 template <
int kLane>
2687 static_assert(0 <= kLane && kLane < 4,
"Invalid lane");
2688 return Vec256<int32_t>{_mm256_shuffle_epi32(
v.raw, 0x55 * kLane)};
2690 template <
int kLane>
2692 static_assert(0 <= kLane && kLane < 2,
"Invalid lane");
2697 template <
int kLane>
2699 static_assert(0 <= kLane && kLane < 4,
"Invalid lane");
2700 return Vec256<float>{_mm256_shuffle_ps(
v.raw,
v.raw, 0x55 * kLane)};
2702 template <
int kLane>
2704 static_assert(0 <= kLane && kLane < 2,
"Invalid lane");
2784 template <
typename T>
2790 template <
typename T,
typename TI, HWY_IF_LANE_SIZE(T, 4)>
2792 static_assert(
sizeof(T) ==
sizeof(TI),
"Index size must match lane");
2793 #if HWY_IS_DEBUG_BUILD
2796 AllTrue(di,
Lt(vec,
Set(di,
static_cast<TI
>(32 /
sizeof(T))))));
2802 template <
typename T,
typename TI, HWY_IF_LANE_SIZE(T, 8)>
2804 static_assert(
sizeof(T) ==
sizeof(TI),
"Index size must match lane");
2805 const Rebind<TI, decltype(
d)> di;
2807 #if HWY_IS_DEBUG_BUILD
2809 AllTrue(di,
Lt(idx64,
Set(di,
static_cast<TI
>(32 /
sizeof(T))))));
2812 #if HWY_TARGET <= HWY_AVX3
2814 return Indices256<T>{idx64.raw};
2818 const Vec256<TI> dup =
2819 BitCast(di, Vec256<float>{_mm256_moveldup_ps(
BitCast(df, idx64).raw)});
2821 const Vec256<TI> idx32 = dup + dup +
Set(di, TI(1) << 32);
2822 return Indices256<T>{idx32.raw};
2826 template <
typename T,
typename TI>
2828 const Rebind<TI, decltype(
d)> di;
2832 template <
typename T, HWY_IF_LANE_SIZE(T, 4)>
2834 return Vec256<T>{_mm256_permutevar8x32_epi32(
v.raw, idx.raw)};
2837 template <
typename T, HWY_IF_LANE_SIZE(T, 8)>
2839 #if HWY_TARGET <= HWY_AVX3
2840 return Vec256<T>{_mm256_permutexvar_epi64(idx.raw,
v.raw)};
2842 return Vec256<T>{_mm256_permutevar8x32_epi32(
v.raw, idx.raw)};
2853 #if HWY_TARGET <= HWY_AVX3
2865 template <
typename T, HWY_IF_LANE_SIZE(T, 4)>
2867 alignas(32) constexpr int32_t kReverse[8] = {7, 6, 5, 4, 3, 2, 1, 0};
2871 template <
typename T, HWY_IF_LANE_SIZE(T, 8)>
2873 alignas(32) constexpr int64_t kReverse[4] = {3, 2, 1, 0};
2877 template <
typename T, HWY_IF_LANE_SIZE(T, 2)>
2879 #if HWY_TARGET <= HWY_AVX3
2881 alignas(32) constexpr int16_t kReverse[16] = {15, 14, 13, 12, 11, 10, 9, 8,
2882 7, 6, 5, 4, 3, 2, 1, 0};
2883 const Vec256<int16_t> idx =
Load(di, kReverse);
2885 _mm256_permutexvar_epi16(idx.raw,
BitCast(di,
v).raw)});
2889 return BitCast(
d, RotateRight<16>(rev32));
2895 template <
typename T, HWY_IF_LANE_SIZE(T, 2)>
2901 template <
typename T, HWY_IF_LANE_SIZE(T, 4)>
2906 template <
typename T, HWY_IF_LANE_SIZE(T, 8)>
2913 template <
typename T, HWY_IF_LANE_SIZE(T, 2)>
2915 #if HWY_TARGET <= HWY_AVX3
2917 alignas(32) constexpr int16_t kReverse4[16] = {3, 2, 1, 0, 7, 6, 5, 4,
2918 11, 10, 9, 8, 15, 14, 13, 12};
2921 _mm256_permutexvar_epi16(idx.
raw,
BitCast(di,
v).raw)});
2928 template <
typename T, HWY_IF_LANE_SIZE(T, 4)>
2933 template <
typename T, HWY_IF_LANE_SIZE(T, 8)>
2935 return Vec256<T>{_mm256_permute4x64_epi64(
v.raw, _MM_SHUFFLE(0, 1, 2, 3))};
2938 return Vec256<double>{_mm256_permute4x64_pd(
v.raw, _MM_SHUFFLE(0, 1, 2, 3))};
2943 template <
typename T, HWY_IF_LANE_SIZE(T, 2)>
2945 #if HWY_TARGET <= HWY_AVX3
2947 alignas(32) constexpr int16_t kReverse8[16] = {7, 6, 5, 4, 3, 2, 1, 0,
2948 15, 14, 13, 12, 11, 10, 9, 8};
2951 _mm256_permutexvar_epi16(idx.
raw,
BitCast(di,
v).raw)});
2958 template <
typename T, HWY_IF_LANE_SIZE(T, 4)>
2963 template <
typename T, HWY_IF_LANE_SIZE(T, 8)>
2975 const Vec256<uint8_t> b) {
2976 return Vec256<uint8_t>{_mm256_unpacklo_epi8(a.raw, b.raw)};
2979 const Vec256<uint16_t> b) {
2980 return Vec256<uint16_t>{_mm256_unpacklo_epi16(a.raw, b.raw)};
2983 const Vec256<uint32_t> b) {
2984 return Vec256<uint32_t>{_mm256_unpacklo_epi32(a.raw, b.raw)};
2987 const Vec256<uint64_t> b) {
2988 return Vec256<uint64_t>{_mm256_unpacklo_epi64(a.raw, b.raw)};
2992 const Vec256<int8_t> b) {
2993 return Vec256<int8_t>{_mm256_unpacklo_epi8(a.raw, b.raw)};
2996 const Vec256<int16_t> b) {
2997 return Vec256<int16_t>{_mm256_unpacklo_epi16(a.raw, b.raw)};
3000 const Vec256<int32_t> b) {
3001 return Vec256<int32_t>{_mm256_unpacklo_epi32(a.raw, b.raw)};
3004 const Vec256<int64_t> b) {
3005 return Vec256<int64_t>{_mm256_unpacklo_epi64(a.raw, b.raw)};
3009 const Vec256<float> b) {
3010 return Vec256<float>{_mm256_unpacklo_ps(a.raw, b.raw)};
3023 const Vec256<uint8_t> b) {
3024 return Vec256<uint8_t>{_mm256_unpackhi_epi8(a.raw, b.raw)};
3027 const Vec256<uint16_t> b) {
3028 return Vec256<uint16_t>{_mm256_unpackhi_epi16(a.raw, b.raw)};
3031 const Vec256<uint32_t> b) {
3032 return Vec256<uint32_t>{_mm256_unpackhi_epi32(a.raw, b.raw)};
3035 const Vec256<uint64_t> b) {
3036 return Vec256<uint64_t>{_mm256_unpackhi_epi64(a.raw, b.raw)};
3040 const Vec256<int8_t> b) {
3041 return Vec256<int8_t>{_mm256_unpackhi_epi8(a.raw, b.raw)};
3044 const Vec256<int16_t> b) {
3045 return Vec256<int16_t>{_mm256_unpackhi_epi16(a.raw, b.raw)};
3048 const Vec256<int32_t> b) {
3049 return Vec256<int32_t>{_mm256_unpackhi_epi32(a.raw, b.raw)};
3052 const Vec256<int64_t> b) {
3053 return Vec256<int64_t>{_mm256_unpackhi_epi64(a.raw, b.raw)};
3057 const Vec256<float> b) {
3058 return Vec256<float>{_mm256_unpackhi_ps(a.raw, b.raw)};
3067 template <
typename T,
class V = Vec256<T>>
3076 template <
typename T,
typename TW = MakeW
ide<T>>
3080 template <
typename T,
typename TW = MakeW
ide<T>>
3085 template <
typename T,
typename TW = MakeW
ide<T>>
3097 template <
typename T>
3099 const Vec256<T> lo) {
3100 const Half<decltype(
d)> d2;
3101 return Vec256<T>{_mm256_inserti128_si256(lo.raw,
LowerHalf(d2, hi).raw, 1)};
3105 const Half<decltype(
d)> d2;
3111 const Half<decltype(
d)> d2;
3116 template <
typename T>
3118 const Vec256<T> lo) {
3119 return Vec256<T>{_mm256_permute2x128_si256(lo.raw, hi.raw, 0x21)};
3133 template <
typename T>
3135 const Vec256<T> lo) {
3136 return Vec256<T>{_mm256_blend_epi32(hi.raw, lo.raw, 0x0F)};
3150 template <
typename T>
3152 const Vec256<T> lo) {
3153 const Half<decltype(
d)> d2;
3159 template <
typename T, HWY_IF_LANE_SIZE(T, 4)>
3162 #if HWY_TARGET <= HWY_AVX3
3163 alignas(32) constexpr uint32_t kIdx[8] = {1, 3, 5, 7, 9, 11, 13, 15};
3164 return BitCast(
d, Vec256<uint32_t>{_mm256_mask2_permutex2var_epi32(
3165 BitCast(du, lo).raw,
Load(du, kIdx).raw, __mmask8{0xFF},
3169 const Vec256<float> v3131{_mm256_shuffle_ps(
3170 BitCast(df, lo).raw,
BitCast(df, hi).raw, _MM_SHUFFLE(3, 1, 3, 1))};
3171 return Vec256<T>{_mm256_permute4x64_epi64(
BitCast(du, v3131).raw,
3172 _MM_SHUFFLE(3, 1, 2, 0))};
3179 #if HWY_TARGET <= HWY_AVX3
3180 alignas(32) constexpr uint32_t kIdx[8] = {1, 3, 5, 7, 9, 11, 13, 15};
3182 __mmask8{0xFF}, hi.
raw)};
3185 _mm256_shuffle_ps(lo.
raw, hi.
raw, _MM_SHUFFLE(3, 1, 3, 1))};
3187 BitCast(du, v3131).raw, _MM_SHUFFLE(3, 1, 2, 0))});
3191 template <
typename T, HWY_IF_LANE_SIZE(T, 8)>
3194 #if HWY_TARGET <= HWY_AVX3
3195 alignas(64) constexpr uint64_t kIdx[4] = {1, 3, 5, 7};
3196 return BitCast(
d, Vec256<uint64_t>{_mm256_mask2_permutex2var_epi64(
3197 BitCast(du, lo).raw,
Load(du, kIdx).raw, __mmask8{0xFF},
3201 const Vec256<double> v31{
3202 _mm256_shuffle_pd(
BitCast(df, lo).raw,
BitCast(df, hi).raw, 15)};
3204 _mm256_permute4x64_epi64(
BitCast(du, v31).raw, _MM_SHUFFLE(3, 1, 2, 0))};
3210 #if HWY_TARGET <= HWY_AVX3
3212 alignas(64) constexpr uint64_t kIdx[4] = {1, 3, 5, 7};
3214 __mmask8{0xFF}, hi.
raw)};
3219 _mm256_permute4x64_pd(v31.raw, _MM_SHUFFLE(3, 1, 2, 0))};
3225 template <
typename T, HWY_IF_LANE_SIZE(T, 4)>
3228 #if HWY_TARGET <= HWY_AVX3
3229 alignas(64) constexpr uint32_t kIdx[8] = {0, 2, 4, 6, 8, 10, 12, 14};
3230 return BitCast(
d, Vec256<uint32_t>{_mm256_mask2_permutex2var_epi32(
3231 BitCast(du, lo).raw,
Load(du, kIdx).raw, __mmask8{0xFF},
3235 const Vec256<float> v2020{_mm256_shuffle_ps(
3236 BitCast(df, lo).raw,
BitCast(df, hi).raw, _MM_SHUFFLE(2, 0, 2, 0))};
3237 return Vec256<T>{_mm256_permute4x64_epi64(
BitCast(du, v2020).raw,
3238 _MM_SHUFFLE(3, 1, 2, 0))};
3246 #if HWY_TARGET <= HWY_AVX3
3247 alignas(64) constexpr uint32_t kIdx[8] = {0, 2, 4, 6, 8, 10, 12, 14};
3249 __mmask8{0xFF}, hi.
raw)};
3252 _mm256_shuffle_ps(lo.
raw, hi.
raw, _MM_SHUFFLE(2, 0, 2, 0))};
3254 BitCast(du, v2020).raw, _MM_SHUFFLE(3, 1, 2, 0))});
3259 template <
typename T, HWY_IF_LANE_SIZE(T, 8)>
3262 #if HWY_TARGET <= HWY_AVX3
3263 alignas(64) constexpr uint64_t kIdx[4] = {0, 2, 4, 6};
3264 return BitCast(
d, Vec256<uint64_t>{_mm256_mask2_permutex2var_epi64(
3265 BitCast(du, lo).raw,
Load(du, kIdx).raw, __mmask8{0xFF},
3269 const Vec256<double> v20{
3272 _mm256_permute4x64_epi64(
BitCast(du, v20).raw, _MM_SHUFFLE(3, 1, 2, 0))};
3279 #if HWY_TARGET <= HWY_AVX3
3281 alignas(64) constexpr uint64_t kIdx[4] = {0, 2, 4, 6};
3283 __mmask8{0xFF}, hi.
raw)};
3288 _mm256_permute4x64_pd(v20.raw, _MM_SHUFFLE(3, 1, 2, 0))};
3294 template <
typename T, HWY_IF_LANE_SIZE(T, 4)>
3296 return Vec256<T>{_mm256_shuffle_epi32(
v.raw, _MM_SHUFFLE(2, 2, 0, 0))};
3300 _mm256_shuffle_ps(
v.raw,
v.raw, _MM_SHUFFLE(2, 2, 0, 0))};
3303 template <
typename T, HWY_IF_LANE_SIZE(T, 8)>
3310 template <
typename T, HWY_IF_LANE_SIZE(T, 4)>
3312 return Vec256<T>{_mm256_shuffle_epi32(
v.raw, _MM_SHUFFLE(3, 3, 1, 1))};
3316 _mm256_shuffle_ps(
v.raw,
v.raw, _MM_SHUFFLE(3, 3, 1, 1))};
3319 template <
typename T, HWY_IF_LANE_SIZE(T, 8)>
3328 template <
typename T>
3330 const Vec256<T> b) {
3332 const Full256<uint8_t> d8;
3333 alignas(32) constexpr uint8_t mask[16] = {0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0,
3334 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0};
3337 template <
typename T>
3339 const Vec256<T> b) {
3340 return Vec256<T>{_mm256_blend_epi16(a.raw, b.raw, 0x55)};
3342 template <
typename T>
3344 const Vec256<T> b) {
3345 return Vec256<T>{_mm256_blend_epi32(a.raw, b.raw, 0x55)};
3347 template <
typename T>
3349 const Vec256<T> b) {
3350 return Vec256<T>{_mm256_blend_epi32(a.raw, b.raw, 0x33)};
3355 template <
typename T>
3356 HWY_API Vec256<T>
OddEven(
const Vec256<T> a,
const Vec256<T> b) {
3359 HWY_API Vec256<float>
OddEven(
const Vec256<float> a,
const Vec256<float> b) {
3360 return Vec256<float>{_mm256_blend_ps(a.raw, b.raw, 0x55)};
3369 template <
typename T>
3384 template <
typename T>
3386 return Vec256<T>{_mm256_permute4x64_epi64(
v.raw, _MM_SHUFFLE(1, 0, 3, 2))};
3394 BitCast(di,
v).raw, _MM_SHUFFLE(1, 0, 3, 2))});
3398 return Vec256<double>{_mm256_permute4x64_pd(
v.raw, _MM_SHUFFLE(1, 0, 3, 2))};
3403 template <
typename T>
3411 template <
typename T,
typename TI>
3413 const Vec256<TI> from) {
3414 return Vec256<TI>{_mm256_shuffle_epi8(bytes.raw, from.raw)};
3418 template <
typename T,
typename TI,
size_t NI>
3429 template <
typename T,
size_t N,
typename TI>
3441 #if HWY_TARGET > HWY_AVX3
3445 template <
typename T, HWY_IF_LANE_SIZE(T, 2)>
3449 const Rebind<float, decltype(dw)> df;
3450 const auto zero =
Zero(
d);
3453 const auto upper = exp +
Set(
d, 0x3F80);
3455 const auto f0 =
ZipLower(dw, zero, upper);
3456 const auto f1 =
ZipUpper(dw, zero, upper);
3459 const Vec256<int32_t> bits0{_mm256_cvttps_epi32(
BitCast(df, f0).raw)};
3460 const Vec256<int32_t> bits1{_mm256_cvttps_epi32(
BitCast(df, f1).raw)};
3461 return Vec256<MakeUnsigned<T>>{_mm256_packus_epi32(bits0.raw, bits1.raw)};
3469 #if HWY_TARGET <= HWY_AVX3
3487 template <
typename T, HWY_IF_SIGNED(T)>
3498 #if HWY_TARGET <= HWY_AVX3
3521 #if HWY_TARGET <= HWY_AVX3
3535 #if HWY_TARGET <= HWY_AVX3
3543 const Vec256<uint64_t> b) {
3544 const DFromV<decltype(a)> du64;
3546 const auto maskL =
Set(du64, 0xFFFFFFFFULL);
3547 const auto a32 =
BitCast(du32, a);
3548 const auto b32 =
BitCast(du32, b);
3556 const auto aLbL =
MulEven(a32, b32);
3557 const auto w3 = aLbL & maskL;
3559 const auto t2 =
MulEven(aH, b32) + ShiftRight<32>(aLbL);
3560 const auto w2 = t2 & maskL;
3561 const auto w1 = ShiftRight<32>(t2);
3563 const auto t =
MulEven(a32, bH) + w2;
3564 const auto k = ShiftRight<32>(t);
3566 const auto mulH =
MulEven(aH, bH) + w1 + k;
3567 const auto mulL = ShiftLeft<32>(t) + w3;
3572 const Vec256<uint64_t> b) {
3573 const DFromV<decltype(a)> du64;
3575 const auto maskL =
Set(du64, 0xFFFFFFFFULL);
3576 const auto a32 =
BitCast(du32, a);
3577 const auto b32 =
BitCast(du32, b);
3583 const auto aLbL =
MulEven(a32, b32);
3584 const auto w3 = aLbL & maskL;
3586 const auto t2 =
MulEven(aH, b32) + ShiftRight<32>(aLbL);
3587 const auto w2 = t2 & maskL;
3588 const auto w1 = ShiftRight<32>(t2);
3590 const auto t =
MulEven(a32, bH) + w2;
3591 const auto k = ShiftRight<32>(t);
3593 const auto mulH =
MulEven(aH, bH) + w1 + k;
3594 const auto mulL = ShiftLeft<32>(t) + w3;
3601 Vec256<bfloat16_t> a,
3602 Vec256<bfloat16_t> b,
3603 const Vec256<float> sum0,
3604 Vec256<float>& sum1) {
3608 const Vec256<uint16_t> zero =
Zero(du16);
3637 Vec128<uint8_t>
v) {
3638 return Vec256<uint16_t>{_mm256_cvtepu8_epi16(
v.raw)};
3645 Vec128<uint8_t>
v) {
3646 return Vec256<int16_t>{_mm256_cvtepu8_epi16(
v.raw)};
3653 Vec128<uint16_t>
v) {
3654 return Vec256<uint32_t>{_mm256_cvtepu16_epi32(
v.raw)};
3657 Vec128<uint16_t>
v) {
3658 return Vec256<int32_t>{_mm256_cvtepu16_epi32(
v.raw)};
3671 return Vec256<int16_t>{_mm256_cvtepi8_epi16(
v.raw)};
3678 Vec128<int16_t>
v) {
3679 return Vec256<int32_t>{_mm256_cvtepi16_epi32(
v.raw)};
3689 const Vec256<int32_t>
v) {
3690 const __m256i u16 = _mm256_packus_epi32(
v.raw,
v.raw);
3693 return Vec128<uint16_t>{
3694 _mm256_castsi256_si128(_mm256_permute4x64_epi64(u16, 0x88))};
3698 const Vec256<int32_t>
v) {
3699 const __m256i i16 = _mm256_packs_epi32(
v.raw,
v.raw);
3700 return Vec128<int16_t>{
3701 _mm256_castsi256_si128(_mm256_permute4x64_epi64(i16, 0x88))};
3706 const __m256i u16_blocks = _mm256_packus_epi32(
v.raw,
v.raw);
3708 const __m256i u16_concat = _mm256_permute4x64_epi64(u16_blocks, 0x88);
3709 const __m128i u16 = _mm256_castsi256_si128(u16_concat);
3712 const __m128i i16 = _mm_and_si128(u16, _mm_set1_epi16(0x7FFF));
3717 const Vec256<int16_t>
v) {
3718 const __m256i u8 = _mm256_packus_epi16(
v.raw,
v.raw);
3719 return Vec128<uint8_t>{
3720 _mm256_castsi256_si128(_mm256_permute4x64_epi64(u8, 0x88))};
3725 const __m256i i16_blocks = _mm256_packs_epi32(
v.raw,
v.raw);
3727 const __m256i i16_concat = _mm256_permute4x64_epi64(i16_blocks, 0x88);
3728 const __m128i i16 = _mm256_castsi256_si128(i16_concat);
3733 const Vec256<int16_t>
v) {
3734 const __m256i i8 = _mm256_packs_epi16(
v.raw,
v.raw);
3735 return Vec128<int8_t>{
3736 _mm256_castsi256_si128(_mm256_permute4x64_epi64(i8, 0x88))};
3745 const Vec256<float>
v) {
3746 #ifdef HWY_DISABLE_F16C
3748 const Rebind<uint32_t, decltype(df16)> du;
3750 const auto bits32 =
BitCast(du,
v);
3751 const auto sign = ShiftRight<31>(bits32);
3752 const auto biased_exp32 = ShiftRight<23>(bits32) &
Set(du, 0xFF);
3753 const auto mantissa32 = bits32 &
Set(du, 0x7FFFFF);
3755 const auto k15 =
Set(di, 15);
3756 const auto exp =
Min(
BitCast(di, biased_exp32) -
Set(di, 127), k15);
3757 const auto is_tiny = exp <
Set(di, -24);
3759 const auto is_subnormal = exp <
Set(di, -14);
3760 const auto biased_exp16 =
3762 const auto sub_exp =
BitCast(du,
Set(di, -14) - exp);
3763 const auto sub_m = (
Set(du, 1) << (
Set(du, 10) - sub_exp)) +
3764 (mantissa32 >> (
Set(du, 13) + sub_exp));
3766 ShiftRight<13>(mantissa32));
3768 const auto sign16 = ShiftLeft<15>(sign);
3769 const auto normal16 = sign16 | ShiftLeft<10>(biased_exp16) | mantissa16;
3774 return Vec128<float16_t>{_mm256_cvtps_ph(
v.raw, _MM_FROUND_NO_EXC)};
3781 const Vec256<float>
v) {
3783 const Rebind<int32_t, decltype(dbf16)> di32;
3784 const Rebind<uint32_t, decltype(dbf16)> du32;
3785 const Rebind<uint16_t, decltype(dbf16)> du16;
3786 const auto bits_in_32 =
BitCast(di32, ShiftRight<16>(
BitCast(du32,
v)));
3794 const Repartition<uint32_t, decltype(dbf16)> du32;
3805 const Vec256<double>
v) {
3807 return Vec128<int32_t>{_mm256_cvttpd_epi32(clamped.raw)};
3812 const Full256<uint32_t> d32;
3813 alignas(32)
static constexpr uint32_t k8From32[8] = {
3814 0x0C080400u, ~0
u, ~0
u, ~0
u, ~0
u, 0x0C080400u, ~0
u, ~0
u};
3819 const auto hi =
UpperHalf(Full128<uint32_t>(), quad);
3821 return BitCast(Full64<uint8_t>(), pair);
3827 const Vec256<int32_t>
v) {
3828 return Vec256<float>{_mm256_cvtepi32_ps(
v.raw)};
3832 #if HWY_TARGET <= HWY_AVX3
3841 const auto k84_63 =
Set(d64, 0x4530000080000000ULL);
3842 const auto v_upper =
BitCast(dd, ShiftRight<32>(
BitCast(d64,
v)) ^ k84_63);
3845 const auto k52 =
Set(d32, 0x43300000);
3848 const auto k84_63_52 =
BitCast(dd,
Set(d64, 0x4530000080100000ULL));
3849 return (v_upper - k84_63_52) + v_lower;
3859 #if HWY_TARGET <= HWY_AVX3
3862 using VI = decltype(
Zero(di));
3863 const VI k0 =
Zero(di);
3864 const VI k1 =
Set(di, 1);
3865 const VI k51 =
Set(di, 51);
3868 const VI biased_exp = ShiftRight<52>(
BitCast(di,
v)) &
Set(di, 0x7FF);
3869 const VI exp = biased_exp -
Set(di, 0x3FF);
3870 const auto in_range = exp <
Set(di, 63);
3878 const VI shift_mnt =
Max(k51 - exp, k0);
3879 const VI shift_int =
Max(exp - k51, k0);
3880 const VI mantissa =
BitCast(di,
v) &
Set(di, (1ULL << 52) - 1);
3882 const VI int52 = (mantissa |
Set(di, 1ULL << 52)) >> (shift_mnt + k1);
3884 const VI shifted = int52 << shift_int;
3886 const VI restored = shifted | ((mantissa & k1) << (shift_int - k1));
3890 const VI limit =
Set(di, LimitsMax<int64_t>()) - sign_mask;
3891 const VI magnitude =
IfThenElse(in_range, restored, limit);
3894 return (magnitude ^ sign_mask) - sign_mask;
3899 const Full256<int32_t> di;
3905 const Vec128<float16_t>
v) {
3906 #ifdef HWY_DISABLE_F16C
3910 const auto bits16 =
PromoteTo(du32, Vec128<uint16_t>{
v.raw});
3911 const auto sign = ShiftRight<15>(bits16);
3912 const auto biased_exp = ShiftRight<10>(bits16) &
Set(du32, 0x1F);
3913 const auto mantissa = bits16 &
Set(du32, 0x3FF);
3914 const auto subnormal =
3916 Set(df32, 1.0f / 16384 / 1024));
3918 const auto biased_exp32 = biased_exp +
Set(du32, 127 - 15);
3919 const auto mantissa32 =
ShiftLeft<23 - 10>(mantissa);
3920 const auto normal = ShiftLeft<23>(biased_exp32) | mantissa32;
3921 const auto bits32 =
IfThenElse(biased_exp ==
Zero(du32), subnormal, normal);
3922 return BitCast(df32, ShiftLeft<31>(sign) | bits32);
3925 return Vec256<float>{_mm256_cvtph_ps(
v.raw)};
3930 const Vec128<bfloat16_t>
v) {
3931 const Rebind<uint16_t, decltype(df32)> du16;
3938 #if !defined(HWY_DISABLE_PCLMUL_AES)
3941 #ifdef HWY_NATIVE_AES
3942 #undef HWY_NATIVE_AES
3944 #define HWY_NATIVE_AES
3949 #if HWY_TARGET == HWY_AVX3_DL
3953 const Half<decltype(
d)> d2;
3961 #if HWY_TARGET == HWY_AVX3_DL
3965 const Half<decltype(
d)> d2;
3973 #if HWY_TARGET == HWY_AVX3_DL
3977 const Half<decltype(
d)> d2;
3984 #if HWY_TARGET == HWY_AVX3_DL
3988 const Half<decltype(
d)> d2;
3999 template <
typename T,
typename T2>
4002 for (
size_t i = 0; i < 32 /
sizeof(T); ++i) {
4003 lanes[i] =
static_cast<T
>(first +
static_cast<T2
>(i));
4005 return Load(
d, lanes);
4008 #if HWY_TARGET <= HWY_AVX3
4013 template <
typename T>
4016 constexpr
size_t N = 32 /
sizeof(T);
4017 constexpr
size_t kNumBytes = (
N + 7) / 8;
4019 uint64_t mask_bits = 0;
4020 CopyBytes<kNumBytes>(bits, &mask_bits);
4023 mask_bits &= (1ull <<
N) - 1;
4032 template <
typename T>
4035 constexpr
size_t N = 32 /
sizeof(T);
4036 constexpr
size_t kNumBytes = (
N + 7) / 8;
4038 CopyBytes<kNumBytes>(&mask.raw, bits);
4042 const int mask =
static_cast<int>((1ull <<
N) - 1);
4043 bits[0] =
static_cast<uint8_t
>(bits[0] & mask);
4050 template <
typename T>
4055 template <
typename T>
4057 const Mask256<T> mask) {
4065 template <
typename T>
4067 #if HWY_COMPILER_HAS_MASK_INTRINSICS
4068 return _kortestz_mask32_u8(mask.
raw, mask.
raw);
4070 return mask.
raw == 0;
4073 template <
typename T>
4075 #if HWY_COMPILER_HAS_MASK_INTRINSICS
4076 return _kortestz_mask16_u8(mask.
raw, mask.
raw);
4078 return mask.
raw == 0;
4081 template <
typename T>
4083 #if HWY_COMPILER_HAS_MASK_INTRINSICS
4084 return _kortestz_mask8_u8(mask.
raw, mask.
raw);
4086 return mask.
raw == 0;
4089 template <
typename T>
4091 return (uint64_t{mask.
raw} & 0xF) == 0;
4096 template <
typename T>
4103 template <
typename T>
4105 #if HWY_COMPILER_HAS_MASK_INTRINSICS
4106 return _kortestc_mask32_u8(mask.
raw, mask.
raw);
4108 return mask.
raw == 0xFFFFFFFFu;
4111 template <
typename T>
4113 #if HWY_COMPILER_HAS_MASK_INTRINSICS
4114 return _kortestc_mask16_u8(mask.
raw, mask.
raw);
4116 return mask.
raw == 0xFFFFu;
4119 template <
typename T>
4121 #if HWY_COMPILER_HAS_MASK_INTRINSICS
4122 return _kortestc_mask8_u8(mask.
raw, mask.
raw);
4124 return mask.
raw == 0xFFu;
4127 template <
typename T>
4130 return mask.
raw == 0xFu;
4135 template <
typename T>
4144 template <
typename T, HWY_IF_LANE_SIZE(T, 4)>
4146 return Vec256<T>{_mm256_maskz_compress_epi32(mask.
raw,
v.raw)};
4153 template <
typename T, HWY_IF_LANE_SIZE(T, 8)>
4156 alignas(16) constexpr uint64_t packed_array[16] = {
4157 0x3210, 0x3210, 0x3201, 0x3210, 0x3102, 0x3120, 0x3021, 0x3210,
4158 0x2103, 0x2130, 0x2031, 0x2310, 0x1032, 0x1320, 0x0321, 0x3210};
4164 const auto packed =
Set(du64, packed_array[mask.raw]);
4165 alignas(64) constexpr uint64_t shifts[4] = {0, 4, 8, 12};
4166 const auto indices = Indices256<T>{(packed >>
Load(du64, shifts)).raw};
template <typename T>

// CompressStore, 16-bit lanes:
template <typename T, HWY_IF_LANE_SIZE(T, 2)>
HWY_API size_t CompressStore(Vec256<T> v, Mask256<T> mask, Full256<T> d,
                             T* HWY_RESTRICT unaligned) {
  const Rebind<uint16_t, decltype(d)> du;
  const auto vu = BitCast(du, v);
  const uint64_t mask_bits{mask.raw};
#if HWY_TARGET == HWY_AVX3_DL  // VBMI2 provides a native 16-bit compress-store.
  _mm256_mask_compressstoreu_epi16(unaligned, mask.raw, vu.raw);
  // Without VBMI2, compress each 128-bit half via 16-bit index tables:
  const Half<decltype(du)> duh;
  const uint64_t mask_bitsL = mask_bits & 0xFF;
  const uint64_t mask_bitsH = mask_bits >> 8;
  const auto idxL = detail::IndicesForCompress16(mask_bitsL);
  const auto idxH = detail::IndicesForCompress16(mask_bitsH);
  const Half<decltype(d)> dh;

// CompressStore, 32-bit lanes:
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
  _mm256_mask_compressstoreu_epi32(unaligned, mask.raw, v.raw);
  const size_t count = PopCount(uint64_t{mask.raw});
  __msan_unpoison(unaligned, count * sizeof(T));

// CompressStore, 64-bit lanes:
template <typename T, HWY_IF_LANE_SIZE(T, 8)>
  _mm256_mask_compressstoreu_epi64(unaligned, mask.raw, v.raw);
  const size_t count = PopCount(uint64_t{mask.raw} & 0xFull);
  __msan_unpoison(unaligned, count * sizeof(T));

// CompressStore, float:
  _mm256_mask_compressstoreu_ps(unaligned, mask.raw, v.raw);
  const size_t count = PopCount(uint64_t{mask.raw});
  __msan_unpoison(unaligned, count * sizeof(float));

// CompressStore, double:
  _mm256_mask_compressstoreu_pd(unaligned, mask.raw, v.raw);
  const size_t count = PopCount(uint64_t{mask.raw} & 0xFull);
  __msan_unpoison(unaligned, count * sizeof(double));
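// --- Illustrative usage (not part of this header) ---------------------------
// CompressStore packs the selected lanes to the front and stores them; the
// AVX-512 paths above map this to one compress-store per vector. This filter
// sketch and its names are ours. Note that CompressStore may write a full
// vector, so `out` needs room for `n` elements here.
#include <cstddef>
#include <cstdint>
#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

// Copies the non-negative elements of `in` to `out`; n must be a multiple of
// Lanes(d). Returns the number of elements written.
size_t CopyNonNegative(const int32_t* in, size_t n, int32_t* out) {
  const hn::ScalableTag<int32_t> d;
  const size_t N = hn::Lanes(d);
  size_t written = 0;
  for (size_t i = 0; i < n; i += N) {
    const auto v = hn::LoadU(d, in + i);
    const auto keep = v > hn::Set(d, -1);  // i.e. v >= 0
    written += hn::CompressStore(v, keep, d, out + written);
  }
  return written;
}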
// ------------------------------ CompressBlendedStore

#if HWY_TARGET <= HWY_AVX3

// Non-16-bit lanes: the AVX-512 compress-store path handles blending natively.
template <typename T, HWY_IF_NOT_LANE_SIZE(T, 2)>

// 16-bit lanes:
template <typename T, HWY_IF_LANE_SIZE(T, 2)>
#if HWY_TARGET <= HWY_AVX3_DL
  __msan_unpoison(unaligned, count * sizeof(T));

// Alternative path without AVX-512 masked stores:
template <typename T, HWY_IF_NOT_LANE_SIZE(T, 2)>

template <typename T, HWY_IF_LANE_SIZE(T, 2)>
  const Vec256<T> compressed = Compress(v, m);
#if HWY_MEM_OPS_MIGHT_FAULT
  // Avoid writing past the valid lanes: store to a buffer, then copy `count`.
  alignas(32) T buf[16];
  Store(compressed, d, buf);
  memcpy(unaligned, buf, count * sizeof(T));
template <typename T>

// AVX2 path: build a mask by broadcasting the mask bits and testing, in each
// lane, the bit that corresponds to it.
template <typename T, HWY_IF_LANE_SIZE(T, 1)>
HWY_INLINE Mask256<T> LoadMaskBits256(Full256<T> d, uint64_t mask_bits) {
  const auto vbits = BitCast(du, Set(du32, static_cast<uint32_t>(mask_bits)));

  // Replicate each byte 8x so every lane holds the byte containing its bit.
  alignas(32) constexpr uint64_t kRep8[4] = {
      0x0000000000000000ull, 0x0101010101010101ull, 0x0202020202020202ull,
      0x0303030303030303ull};

  alignas(32) constexpr uint8_t kBit[16] = {1, 2, 4, 8, 16, 32, 64, 128,
                                            1, 2, 4, 8, 16, 32, 64, 128};
template <typename T, HWY_IF_LANE_SIZE(T, 2)>
HWY_INLINE Mask256<T> LoadMaskBits256(Full256<T> d, uint64_t mask_bits) {
  const RebindToUnsigned<decltype(d)> du;
  alignas(32) constexpr uint16_t kBit[16] = {
      1,     2,     4,     8,     16,     32,     64,     128,
      0x100, 0x200, 0x400, 0x800, 0x1000, 0x2000, 0x4000, 0x8000};
  const auto vmask_bits = Set(du, static_cast<uint16_t>(mask_bits));
  return RebindMask(d, TestBit(vmask_bits, Load(du, kBit)));
}

template <typename T, HWY_IF_LANE_SIZE(T, 4)>
HWY_INLINE Mask256<T> LoadMaskBits256(Full256<T> d, uint64_t mask_bits) {
  const RebindToUnsigned<decltype(d)> du;
  alignas(32) constexpr uint32_t kBit[8] = {1, 2, 4, 8, 16, 32, 64, 128};
  const auto vmask_bits = Set(du, static_cast<uint32_t>(mask_bits));
  return RebindMask(d, TestBit(vmask_bits, Load(du, kBit)));
}

template <typename T, HWY_IF_LANE_SIZE(T, 8)>
HWY_INLINE Mask256<T> LoadMaskBits256(Full256<T> d, uint64_t mask_bits) {
  const RebindToUnsigned<decltype(d)> du;
  alignas(32) constexpr uint64_t kBit[8] = {1, 2, 4, 8};
  const auto vmask_bits = Set(du, mask_bits);
  return RebindMask(d, TestBit(vmask_bits, Load(du, kBit)));
}
// `bits` is as returned by StoreMaskBits.
template <typename T>
HWY_API Mask256<T> LoadMaskBits(Full256<T> d, const uint8_t* HWY_RESTRICT bits) {
  constexpr size_t N = 32 / sizeof(T);
  constexpr size_t kNumBytes = (N + 7) / 8;

  uint64_t mask_bits = 0;
  CopyBytes<kNumBytes>(bits, &mask_bits);

  // Only the low N bits are valid.
  mask_bits &= (1ull << N) - 1;

  return detail::LoadMaskBits256(d, mask_bits);
}
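// --- Scalar reference model (ours) -------------------------------------------
// The TestBit-based loaders above implement the same bit-to-lane mapping as
// this scalar model: bit i of the byte stream selects lane i, LSB first.
#include <cstddef>
#include <cstdint>

bool LaneIsSelected(const uint8_t* bits, size_t i) {
  return ((bits[i / 8] >> (i % 8)) & 1) != 0;
}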
// BitsFromMask: extract one bit per lane (the sign bits) from a vector mask.

template <typename T, HWY_IF_LANE_SIZE(T, 1)>
  const Full256<uint8_t> d8;
  return static_cast<uint32_t>(_mm256_movemask_epi8(sign_bits));

template <typename T, HWY_IF_LANE_SIZE(T, 2)>
  const Full256<uint8_t> d8;
  // 64-bit targets: take the byte sign bits and keep every other bit.
  return _pext_u64(sign_bits8, 0xAAAAAAAAull);
  // Without _pext_u64 (32-bit builds): narrow 16-bit lanes to bytes, fix the
  // qword order, then movemask the byte sign bits.
  const auto sign_bits = _mm256_packs_epi16(mask.raw, _mm256_setzero_si256());
  const auto compressed =
      _mm256_permute4x64_epi64(sign_bits, _MM_SHUFFLE(3, 1, 2, 0));
  return static_cast<unsigned>(_mm256_movemask_epi8(compressed));

template <typename T, HWY_IF_LANE_SIZE(T, 4)>
  const Full256<float> df;
  return static_cast<unsigned>(_mm256_movemask_ps(sign_bits));

template <typename T, HWY_IF_LANE_SIZE(T, 8)>
  const Full256<double> df;
  return static_cast<unsigned>(_mm256_movemask_pd(sign_bits));
// Writes (N + 7) / 8 bytes of mask bits to `bits`.
template <typename T>
HWY_API size_t StoreMaskBits(const Full256<T> /* tag */, const Mask256<T> mask,
                             uint8_t* bits) {
  constexpr size_t N = 32 / sizeof(T);
  constexpr size_t kNumBytes = (N + 7) / 8;
  const uint64_t mask_bits = detail::BitsFromMask(mask);
  CopyBytes<kNumBytes>(&mask_bits, bits);
  return kNumBytes;
}
// Mask testing without AVX-512 mask registers. 16-bit lanes get separate
// overloads because their BitsFromMask is more expensive.

template <typename T, HWY_IF_LANE_SIZE(T, 2)>      // AllFalse, 16-bit lanes

template <typename T, HWY_IF_NOT_LANE_SIZE(T, 2)>  // AllFalse, other sizes

template <typename T, HWY_IF_LANE_SIZE(T, 2)>      // AllTrue, 16-bit lanes

template <typename T, HWY_IF_NOT_LANE_SIZE(T, 2)>  // AllTrue, other sizes
  constexpr uint64_t kAllBits = (1ull << (32 / sizeof(T))) - 1;

template <typename T, HWY_IF_LANE_SIZE(T, 2)>      // CountTrue, 16-bit lanes

template <typename T, HWY_IF_NOT_LANE_SIZE(T, 2)>  // CountTrue, other sizes

template <typename T>                               // FindFirstTrue
HWY_API intptr_t FindFirstTrue(const Full256<T> /* tag */,
                               const Mask256<T> mask) {
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
HWY_INLINE Indices256<uint32_t> IndicesFromBits(Full256<T> d,
                                                uint64_t mask_bits) {
  const RebindToUnsigned<decltype(d)> d32;
  // 256 entries, one per possible 8-bit mask; each entry packs eight 4-bit
  // lane indices, with the selected lanes listed first.
  alignas(16) constexpr uint32_t packed_array[256] = {
      0x76543210, 0x76543210, 0x76543201, 0x76543210, 0x76543102, 0x76543120,
      0x76543021, 0x76543210, 0x76542103, 0x76542130, 0x76542031, 0x76542310,
      0x76541032, 0x76541320, 0x76540321, 0x76543210, 0x76532104, 0x76532140,
      0x76532041, 0x76532410, 0x76531042, 0x76531420, 0x76530421, 0x76534210,
      0x76521043, 0x76521430, 0x76520431, 0x76524310, 0x76510432, 0x76514320,
      0x76504321, 0x76543210, 0x76432105, 0x76432150, 0x76432051, 0x76432510,
      0x76431052, 0x76431520, 0x76430521, 0x76435210, 0x76421053, 0x76421530,
      0x76420531, 0x76425310, 0x76410532, 0x76415320, 0x76405321, 0x76453210,
      0x76321054, 0x76321540, 0x76320541, 0x76325410, 0x76310542, 0x76315420,
      0x76305421, 0x76354210, 0x76210543, 0x76215430, 0x76205431, 0x76254310,
      0x76105432, 0x76154320, 0x76054321, 0x76543210, 0x75432106, 0x75432160,
      0x75432061, 0x75432610, 0x75431062, 0x75431620, 0x75430621, 0x75436210,
      0x75421063, 0x75421630, 0x75420631, 0x75426310, 0x75410632, 0x75416320,
      0x75406321, 0x75463210, 0x75321064, 0x75321640, 0x75320641, 0x75326410,
      0x75310642, 0x75316420, 0x75306421, 0x75364210, 0x75210643, 0x75216430,
      0x75206431, 0x75264310, 0x75106432, 0x75164320, 0x75064321, 0x75643210,
      0x74321065, 0x74321650, 0x74320651, 0x74326510, 0x74310652, 0x74316520,
      0x74306521, 0x74365210, 0x74210653, 0x74216530, 0x74206531, 0x74265310,
      0x74106532, 0x74165320, 0x74065321, 0x74653210, 0x73210654, 0x73216540,
      0x73206541, 0x73265410, 0x73106542, 0x73165420, 0x73065421, 0x73654210,
      0x72106543, 0x72165430, 0x72065431, 0x72654310, 0x71065432, 0x71654320,
      0x70654321, 0x76543210, 0x65432107, 0x65432170, 0x65432071, 0x65432710,
      0x65431072, 0x65431720, 0x65430721, 0x65437210, 0x65421073, 0x65421730,
      0x65420731, 0x65427310, 0x65410732, 0x65417320, 0x65407321, 0x65473210,
      0x65321074, 0x65321740, 0x65320741, 0x65327410, 0x65310742, 0x65317420,
      0x65307421, 0x65374210, 0x65210743, 0x65217430, 0x65207431, 0x65274310,
      0x65107432, 0x65174320, 0x65074321, 0x65743210, 0x64321075, 0x64321750,
      0x64320751, 0x64327510, 0x64310752, 0x64317520, 0x64307521, 0x64375210,
      0x64210753, 0x64217530, 0x64207531, 0x64275310, 0x64107532, 0x64175320,
      0x64075321, 0x64753210, 0x63210754, 0x63217540, 0x63207541, 0x63275410,
      0x63107542, 0x63175420, 0x63075421, 0x63754210, 0x62107543, 0x62175430,
      0x62075431, 0x62754310, 0x61075432, 0x61754320, 0x60754321, 0x67543210,
      0x54321076, 0x54321760, 0x54320761, 0x54327610, 0x54310762, 0x54317620,
      0x54307621, 0x54376210, 0x54210763, 0x54217630, 0x54207631, 0x54276310,
      0x54107632, 0x54176320, 0x54076321, 0x54763210, 0x53210764, 0x53217640,
      0x53207641, 0x53276410, 0x53107642, 0x53176420, 0x53076421, 0x53764210,
      0x52107643, 0x52176430, 0x52076431, 0x52764310, 0x51076432, 0x51764320,
      0x50764321, 0x57643210, 0x43210765, 0x43217650, 0x43207651, 0x43276510,
      0x43107652, 0x43176520, 0x43076521, 0x43765210, 0x42107653, 0x42176530,
      0x42076531, 0x42765310, 0x41076532, 0x41765320, 0x40765321, 0x47653210,
      0x32107654, 0x32176540, 0x32076541, 0x32765410, 0x31076542, 0x31765420,
      0x30765421, 0x37654210, 0x21076543, 0x21765430, 0x20765431, 0x27654310,
      0x10765432, 0x17654320, 0x07654321, 0x76543210};
  const auto packed = Set(d32, packed_array[mask_bits]);
  alignas(32) constexpr uint32_t shifts[8] = {0, 4, 8, 12, 16, 20, 24, 28};
  return Indices256<uint32_t>{(packed >> Load(d32, shifts)).raw};
}
template <typename T, HWY_IF_LANE_SIZE(T, 8)>
HWY_INLINE Indices256<uint32_t> IndicesFromBits(Full256<T> d,
                                                uint64_t mask_bits) {
  const Repartition<uint32_t, decltype(d)> d32;
  // 16 possible masks over four 64-bit lanes; each group of eight values is
  // the 32-bit index permutation, selected lanes first.
  alignas(32) constexpr uint32_t packed_array[128] = {
      0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7,
      2, 3, 0, 1, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7,
      4, 5, 0, 1, 2, 3, 6, 7, 0, 1, 4, 5, 2, 3, 6, 7,
      2, 3, 4, 5, 0, 1, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7,
      6, 7, 0, 1, 2, 3, 4, 5, 0, 1, 6, 7, 2, 3, 4, 5,
      2, 3, 6, 7, 0, 1, 4, 5, 0, 1, 2, 3, 6, 7, 4, 5,
      4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 4, 5, 6, 7, 2, 3,
      2, 3, 4, 5, 6, 7, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7};
  return Indices256<uint32_t>{Load(d32, packed_array + 8 * mask_bits).raw};
}
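// --- Note on the 64-bit index table (ours) -----------------------------------
// Lane k occupies 32-bit indices 2k and 2k+1, and selected lanes are listed
// before the rest. A scalar sketch that regenerates one group of the table:
#include <cstdio>

void PrintRow(unsigned mask) {
  unsigned out[8];
  int pos = 0;
  for (int lane = 0; lane < 4; ++lane) {  // selected lanes first
    if (mask & (1u << lane)) {
      out[pos++] = 2 * lane;
      out[pos++] = 2 * lane + 1;
    }
  }
  for (int lane = 0; lane < 4; ++lane) {  // then the remaining lanes
    if (!(mask & (1u << lane))) {
      out[pos++] = 2 * lane;
      out[pos++] = 2 * lane + 1;
    }
  }
  for (int i = 0; i < 8; ++i) printf("%u%c", out[i], i == 7 ? '\n' : ' ');
}

int main() {
  PrintRow(2);  // mask 0b0010 -> 2 3 0 1 4 5 6 7, matching the table above
  return 0;
}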
// Compress for non-16-bit lanes (AVX2): permute using IndicesFromBits.
template <typename T, HWY_IF_NOT_LANE_SIZE(T, 2)>
  HWY_DASSERT(mask_bits < (1ull << (32 / sizeof(T))));
  const auto indices = IndicesFromBits(d, mask_bits);
  // The vector is then permuted with `indices` via TableLookupLanes.
// Compress for 16-bit lanes (AVX2): compress each 128-bit half separately,
// then recombine.
template <typename T, HWY_IF_LANE_SIZE(T, 2)>
  const Half<decltype(du)> duh;
  const auto half0 = LowerHalf(duh, vu16);
  const auto half1 = UpperHalf(duh, vu16);
  const uint64_t mask_bits0 = mask_bits & 0xFF;
  const uint64_t mask_bits1 = mask_bits >> 8;

  // Selected lanes of the lower half, then of the upper half, left to right.
  alignas(32) uint16_t all_true[16] = {};
  const size_t num_true0 = PopCount(mask_bits0);
  Store(compressed0, duh, all_true);
  StoreU(compressed1, duh, all_true + num_true0);

  // Remaining (unselected) lanes, positioned to follow the selected ones.
  alignas(32) uint16_t all_false[16] = {};
  const size_t num_true1 = PopCount(mask_bits1);
  Store(compressed1, duh, all_false + 8);
  StoreU(compressed0, duh, all_false + num_true1);

  // Blend: the first num_true0 + num_true1 lanes come from all_true.
  const auto mask = FirstN(du, num_true0 + num_true1);
// Public wrappers (AVX2 path): convert the mask to bits, then use the
// bit-based Compress helpers above.

template <typename T>    // Compress(v, mask)

template <typename T>    // CompressBits(v, bits)
  constexpr size_t N = 32 / sizeof(T);
  constexpr size_t kNumBytes = (N + 7) / 8;
  uint64_t mask_bits = 0;
  CopyBytes<kNumBytes>(bits, &mask_bits);
  mask_bits &= (1ull << N) - 1;

template <typename T>    // CompressStore(v, mask, d, unaligned)

template <typename T>    // CompressBlendedStore(v, mask, d, unaligned)
  const size_t count = PopCount(mask_bits);

template <typename T>    // CompressBitsStore(v, bits, d, unaligned)
  constexpr size_t N = 32 / sizeof(T);
  constexpr size_t kNumBytes = (N + 7) / 8;
  uint64_t mask_bits = 0;
  CopyBytes<kNumBytes>(bits, &mask_bits);
  mask_bits &= (1ull << N) - 1;
// Interleaves three byte vectors (e.g. RGB planes) into 3 * 32 output bytes.
HWY_API void StoreInterleaved3(const Vec256<uint8_t> v0,
                               const Vec256<uint8_t> v1,
                               const Vec256<uint8_t> v2, Full256<uint8_t> d,
                               uint8_t* HWY_RESTRICT unaligned) {
  const auto k5 = Set(d, 5);
  const auto k6 = Set(d, 6);

  // Shuffle tables; 0x80 entries write zero, so the three shuffled vectors
  // (r0, g0, b0) can simply be OR'd together.
  alignas(16) static constexpr uint8_t tbl_r0[16] = {
      0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80,
      3, 0x80, 0x80, 4, 0x80, 0x80, 5};
  alignas(16) static constexpr uint8_t tbl_g0[16] = {
      0x80, 0, 0x80, 0x80, 1, 0x80,
      0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, 4, 0x80, 0x80};
  const auto shuf_b0 = CombineShiftRightBytes<15>(d, shuf_g0, shuf_g0);
  const auto interleaved_10_00 = r0 | g0 | b0;

  // Second third: derive the shuffle controls from the previous ones.
  const auto shuf_r1 = shuf_b0 + k6;
  const auto shuf_g1 = shuf_r0 + k5;
  const auto shuf_b1 = shuf_g0 + k5;
  const auto interleaved_15_05 = r1 | g1 | b1;
  StoreU(out0, d, unaligned + 0 * 32);

  // Final third.
  const auto shuf_r2 = shuf_b1 + k6;
  const auto shuf_g2 = shuf_r1 + k5;
  const auto shuf_b2 = shuf_g1 + k5;
  const auto interleaved_1A_0A = r2 | g2 | b2;
  StoreU(out1, d, unaligned + 1 * 32);
  StoreU(out2, d, unaligned + 2 * 32);
}
// Interleaves four byte vectors (e.g. RGBA planes) into 4 * 32 output bytes.
HWY_API void StoreInterleaved4(const Vec256<uint8_t> v0,
                               const Vec256<uint8_t> v1,
                               const Vec256<uint8_t> v2,
                               const Vec256<uint8_t> v3, Full256<uint8_t> d8,
                               uint8_t* HWY_RESTRICT unaligned) {
  const RepartitionToWide<decltype(d8)> d16;
  const RepartitionToWide<decltype(d16)> d32;
  // Zip bytes into 16-bit "ba"/"dc" pairs, then into 32-bit "dcba" quads.
  const auto ba0 = ZipLower(d16, v0, v1);
  const auto dc0 = ZipLower(d16, v2, v3);
  const auto ba8 = ZipUpper(d16, v0, v1);
  const auto dc8 = ZipUpper(d16, v2, v3);
  const auto dcba_0 = ZipLower(d32, ba0, dc0);
  const auto dcba_4 = ZipUpper(d32, ba0, dc0);
  const auto dcba_8 = ZipLower(d32, ba8, dc8);
  const auto dcba_C = ZipUpper(d32, ba8, dc8);
  // The four output vectors are assembled from the dcba quads, then stored.
  StoreU(out0, d8, unaligned + 0 * 32);
  StoreU(out1, d8, unaligned + 1 * 32);
  StoreU(out2, d8, unaligned + 2 * 32);
  StoreU(out3, d8, unaligned + 3 * 32);
}
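// --- Illustrative usage (not part of this header) ---------------------------
// Typical use of the interleaved stores: converting planar channels to packed
// RGB. With the 256-bit vectors in this header, each StoreInterleaved3 call
// writes 3 * 32 bytes. The sketch and its names are ours; n must be a multiple
// of Lanes(d) and `rgb` must hold 3 * n bytes.
#include <cstddef>
#include <cstdint>
#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

void PlanarToRGB(const uint8_t* r, const uint8_t* g, const uint8_t* b,
                 size_t n, uint8_t* rgb) {
  const hn::ScalableTag<uint8_t> d;
  const size_t N = hn::Lanes(d);
  for (size_t i = 0; i < n; i += N) {
    hn::StoreInterleaved3(hn::LoadU(d, r + i), hn::LoadU(d, g + i),
                          hn::LoadU(d, b + i), d, rgb + 3 * i);
  }
}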
// ------------------------------ Reductions

// 32-bit lanes: reduce the four lanes within each 128-bit block.
template <typename T>
HWY_INLINE Vec256<T> SumOfLanes(hwy::SizeTag<4> /* tag */,
                                const Vec256<T> v3210) {
  const auto v1032 = Shuffle1032(v3210);
  const auto v31_20_31_20 = v3210 + v1032;
  const auto v20_31_20_31 = Shuffle0321(v31_20_31_20);
  return v20_31_20_31 + v31_20_31_20;
}
template <typename T>
HWY_INLINE Vec256<T> MinOfLanes(hwy::SizeTag<4> /* tag */,
                                const Vec256<T> v3210) {
  const auto v1032 = Shuffle1032(v3210);
  const auto v31_20_31_20 = Min(v3210, v1032);
  const auto v20_31_20_31 = Shuffle0321(v31_20_31_20);
  return Min(v20_31_20_31, v31_20_31_20);
}
template <typename T>
HWY_INLINE Vec256<T> MaxOfLanes(hwy::SizeTag<4> /* tag */,
                                const Vec256<T> v3210) {
  const auto v1032 = Shuffle1032(v3210);
  const auto v31_20_31_20 = Max(v3210, v1032);
  const auto v20_31_20_31 = Shuffle0321(v31_20_31_20);
  return Max(v20_31_20_31, v31_20_31_20);
}

// 64-bit lanes: reduce the two lanes within each 128-bit block.
template <typename T>
HWY_INLINE Vec256<T> SumOfLanes(hwy::SizeTag<8> /* tag */, const Vec256<T> v10) {
  const auto v01 = Shuffle01(v10);
  return v10 + v01;
}
template <typename T>
HWY_INLINE Vec256<T> MinOfLanes(hwy::SizeTag<8> /* tag */, const Vec256<T> v10) {
  const auto v01 = Shuffle01(v10);
  return Min(v10, v01);
}
template <typename T>
HWY_INLINE Vec256<T> MaxOfLanes(hwy::SizeTag<8> /* tag */, const Vec256<T> v10) {
  const auto v01 = Shuffle01(v10);
  return Max(v10, v01);
}
// 16-bit lanes: reduce via 32-bit lanes (even/odd halves), then broadcast the
// result back into both 16-bit halves of each 32-bit lane.
template <typename T, HWY_IF_LANE_SIZE(T, 2)>
  const Repartition<int32_t, Full256<T>> d32;
  const auto odd = ShiftRight<16>(BitCast(d32, v));
  return BitCast(Full256<T>(), Or(min, ShiftLeft<16>(min)));

template <typename T, HWY_IF_LANE_SIZE(T, 2)>
  const Repartition<int32_t, Full256<T>> d32;
  const auto odd = ShiftRight<16>(BitCast(d32, v));
  return BitCast(Full256<T>(), Or(min, ShiftLeft<16>(min)));
// Public reductions, defined in terms of the per-block helpers above:

template <typename T>   // SumOfLanes(d, v)

template <typename T>   // MinOfLanes(d, v)

template <typename T>   // MaxOfLanes(d, v)
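// --- Illustrative usage (not part of this header) ---------------------------
// The public reductions broadcast the result to every lane; callers typically
// extract it with GetLane. This sketch and its names are ours; n must be a
// multiple of Lanes(d).
#include <cstddef>
#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

float SumArray(const float* p, size_t n) {
  const hn::ScalableTag<float> d;
  const size_t N = hn::Lanes(d);
  auto acc = hn::Zero(d);
  for (size_t i = 0; i < n; i += N) {
    acc = acc + hn::LoadU(d, p + i);
  }
  return hn::GetLane(hn::SumOfLanes(d, acc));
}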