#include <immintrin.h>
#include <sanitizer/msan_interface.h>

#if defined(_MSC_VER) && defined(__clang__)
#include <smmintrin.h>
#include <avxintrin.h>
#include <avx2intrin.h>
#include <f16cintrin.h>
#include <fmaintrin.h>
#include <avx512fintrin.h>
#include <avx512vlintrin.h>
#include <avx512bwintrin.h>
#include <avx512dqintrin.h>
#include <avx512vlbwintrin.h>
#include <avx512vldqintrin.h>
#include <avx512bitalgintrin.h>
#include <avx512vlbitalgintrin.h>
#include <avx512vpopcntdqintrin.h>
#include <avx512vpopcntdqvlintrin.h>
template <size_t size>

// class Vec512<T>: compound assignment, defined via the binary operators.
template <typename T>
return *this = (*this * other);  // operator*=
return *this = (*this / other);  // operator/=
return *this = (*this + other);  // operator+=
return *this = (*this - other);  // operator-=
return *this = (*this & other);  // operator&=
return *this = (*this | other);  // operator|=
return *this = (*this ^ other);  // operator^=
template <typename T>
return _mm512_castpd_si512(v);  // BitCastToInteger for double vectors

template <typename T>
template <typename T>
template <typename T>
template <typename T, typename FromT>

// Zero:
template <typename T>
return Vec512<T>{_mm512_setzero_si512()};

// Set for 64-bit lanes:
_mm512_set1_epi64(static_cast<long long>(t))};
_mm512_set1_epi64(static_cast<long long>(t))};

// Undefined:
template <typename T>
return Vec512<T>{_mm512_undefined_epi32()};

// Not: ternary-logic imm8 0x55 computes NOT(a) in a single instruction.
template <typename T>
Vec512<TU>{_mm512_ternarylogic_epi32(vu, vu, vu, 0x55)});
template <typename T>
template <typename T>
template <typename T>
template <typename T>

template <typename T>
using VU = VFromD<decltype(du)>;
const __m512i ret = _mm512_ternarylogic_epi64(

template <typename T>
using VU = VFromD<decltype(du)>;

template <typename T>
template <typename T>
template <typename T>
#if HWY_TARGET == HWY_AVX3_DL

#ifdef HWY_NATIVE_POPCNT
#undef HWY_NATIVE_POPCNT
#else
#define HWY_NATIVE_POPCNT
#endif
template <typename T>
template <typename T>
template <typename T>
template <typename T>

template <typename T>

// CopySign: a single ternary-logic instruction selects the sign bit.
template <typename T>
static_assert(IsFloat<T>(), "Only makes sense for floating-point");
const __m512i out = _mm512_ternarylogic_epi32(
template <typename T>

// FirstN: mask with the lowest n lanes set.
template <typename T, HWY_IF_NOT_LANE_SIZE(T, 1)>
const uint32_t all = ~uint32_t(0);
m.raw = static_cast<decltype(m.raw)>((n > 255) ? all : _bzhi_u32(all, n));

template <typename T, HWY_IF_LANE_SIZE(T, 1)>
const uint64_t bits = n < 64 ? ((1ULL << n) - 1) : ~uint64_t(0);
return Mask512<T>{static_cast<__mmask64>(bits)};

template <typename T>
const uint64_t all = ~uint64_t(0);
m.raw = static_cast<decltype(m.raw)>((n > 255) ? all : _bzhi_u64(all, n));
return detail::FirstN<T>(n);
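// Example (sketch, not part of the original header): FirstN for 16 x 32-bit
// lanes as a standalone helper, assuming AVX-512F plus BMI2 and the
// <immintrin.h> include at the top of this listing. _bzhi_u32 zeroes all
// bits at index n and above; the n > 255 guard matches the code above,
// because BZHI only honors the low 8 bits of its index operand.
#include <cstddef>
#include <cstdint>
static inline __mmask16 FirstN16(size_t n) {
  const uint32_t all = ~uint32_t(0);
  const uint32_t bits =
      (n > 255) ? all : _bzhi_u32(all, static_cast<uint32_t>(n));
  return static_cast<__mmask16>(bits);
}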
template <typename T>
template <typename T>
template <typename T>
template <typename T>
template <typename T>
template <typename T>
template <typename T>
template <typename T>
template <typename T>
template <typename T>
template <typename T>
template <typename T>
template <typename T>
template <typename T>
template <typename T>

template <typename T>
static_assert(IsSigned<T>(), "Only works for signed/float");

template <typename T, HWY_IF_FLOAT(T)>
#if HWY_COMPILER_MSVC

// 8-bit shifts: emulated via wider shifts plus masking, because AVX-512 has
// no native 8-bit shift instructions.
template <int kBits, typename T, HWY_IF_LANE_SIZE(T, 1)>
    : (shifted & Set(d8, static_cast<T>((0xFF << kBits) & 0xFF)));

return shifted & Set(d8, 0xFF >> kBits);

// Arithmetic right shift: fix up the sign after a zero-fill shift.
const auto shifted_sign = BitCast(di, Set(du, 0x80 >> kBits));
return (shifted ^ shifted_sign) - shifted_sign;

static_assert(0 <= kBits && kBits < 32, "Invalid shift count");
static_assert(0 <= kBits && kBits < 64, "Invalid shift count");

// Same, for run-time shift counts:
template <typename T, HWY_IF_LANE_SIZE(T, 1)>
return shifted & Set(d8, static_cast<T>((0xFF << bits) & 0xFF));

return shifted & Set(d8, static_cast<uint8_t>(0xFF >> bits));

const auto shifted_sign =
    BitCast(di, Set(du, static_cast<uint8_t>(0x80 >> bits)));
return (shifted ^ shifted_sign) - shifted_sign;
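// Example (scalar sketch, not from the header) of the sign fixup used above:
// after a zero-fill right shift of an 8-bit lane, the old sign bit sits at
// position (7 - bits). XORing with 0x80 >> bits and then subtracting the
// same constant sign-extends it, turning the logical shift into an
// arithmetic one.
#include <cstdint>
static inline uint8_t ArithmeticShiftRight8(uint8_t lane, int bits) {
  const uint8_t shifted = static_cast<uint8_t>(lane >> bits);       // zero-fill
  const uint8_t shifted_sign = static_cast<uint8_t>(0x80 >> bits);  // sign pos
  return static_cast<uint8_t>((shifted ^ shifted_sign) - shifted_sign);
}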
template <typename T, HWY_IF_SIGNED(T)>
template <typename T, HWY_IF_FLOAT(T)>
template <typename T, HWY_IF_NOT_FLOAT(T)>
return Zero(Full512<T>()) - v;

// Round / Trunc / Ceil / Floor via _mm512_roundscale_*:
v.raw, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)};
v.raw, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)};
_mm512_roundscale_ps(v.raw, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)};
_mm512_roundscale_pd(v.raw, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)};
_mm512_roundscale_ps(v.raw, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)};
_mm512_roundscale_pd(v.raw, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)};
_mm512_roundscale_ps(v.raw, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)};
_mm512_roundscale_pd(v.raw, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)};
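// Example (sketch): the four rounding modes used above, applied to one
// vector. The upper imm8 bits of roundscale select a scale of 2^0, so these
// round to integers; _MM_FROUND_NO_EXC suppresses the precision exception.
static inline void RoundAllModes(__m512 v, __m512 out[4]) {
  out[0] = _mm512_roundscale_ps(v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);  // Round
  out[1] = _mm512_roundscale_ps(v, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);         // Trunc
  out[2] = _mm512_roundscale_ps(v, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);      // Ceil
  out[3] = _mm512_roundscale_ps(v, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);      // Floor
}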
template <typename TFrom, typename TTo>
static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size");

template <typename T>
template <typename T>
template <typename T>
template <typename T>

template <typename T>
static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported");

// Equality: AVX-512 comparisons return mask registers, one bit per lane.
template <typename T, HWY_IF_LANE_SIZE(T, 1)>
template <typename T, HWY_IF_LANE_SIZE(T, 2)>
return Mask512<T>{_mm512_cmpeq_epi16_mask(a.raw, b.raw)};
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
return Mask512<T>{_mm512_cmpeq_epi32_mask(a.raw, b.raw)};
template <typename T, HWY_IF_LANE_SIZE(T, 8)>
return Mask512<T>{_mm512_cmpeq_epi64_mask(a.raw, b.raw)};

// Inequality:
template <typename T, HWY_IF_LANE_SIZE(T, 1)>
template <typename T, HWY_IF_LANE_SIZE(T, 2)>
return Mask512<T>{_mm512_cmpneq_epi16_mask(a.raw, b.raw)};
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
return Mask512<T>{_mm512_cmpneq_epi32_mask(a.raw, b.raw)};
template <typename T, HWY_IF_LANE_SIZE(T, 8)>
return Mask512<T>{_mm512_cmpneq_epi64_mask(a.raw, b.raw)};
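// Example (sketch): unlike SSE/AVX2, an AVX-512 comparison yields a mask
// register rather than an all-ones vector, so the result can be used
// directly as an integer bitmap.
#include <cstdint>
static inline uint32_t CountEqualLanes(__m512i a, __m512i b) {
  const __mmask16 eq = _mm512_cmpeq_epi32_mask(a, b);  // 1 bit per i32 lane
  return static_cast<uint32_t>(_mm_popcnt_u32(static_cast<uint32_t>(eq)));
}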
template <typename T>
template <typename T>
template <typename T>
template <typename T>
template <typename T>
template <typename T>
template <typename T>

// VecFromMask for 32-bit lanes:
return Vec512<float>{_mm512_castsi512_ps(_mm512_movm_epi32(v.raw))};

template <typename T>

// Mask logic (Not, And, AndNot, Or, Xor per mask width). Each overload has
// a _k* mask-intrinsic path and a plain-integer fallback:
template <typename T>
#if HWY_COMPILER_HAS_MASK_INTRINSICS
template <typename T>
#if HWY_COMPILER_HAS_MASK_INTRINSICS
template <typename T>
#if HWY_COMPILER_HAS_MASK_INTRINSICS
template <typename T>
#if HWY_COMPILER_HAS_MASK_INTRINSICS
template <typename T>
#if HWY_COMPILER_HAS_MASK_INTRINSICS
template <typename T>
#if HWY_COMPILER_HAS_MASK_INTRINSICS
template <typename T>
#if HWY_COMPILER_HAS_MASK_INTRINSICS
template <typename T>
#if HWY_COMPILER_HAS_MASK_INTRINSICS
template <typename T>
#if HWY_COMPILER_HAS_MASK_INTRINSICS
template <typename T>
#if HWY_COMPILER_HAS_MASK_INTRINSICS
template <typename T>
#if HWY_COMPILER_HAS_MASK_INTRINSICS
template <typename T>
#if HWY_COMPILER_HAS_MASK_INTRINSICS
template <typename T>
#if HWY_COMPILER_HAS_MASK_INTRINSICS
template <typename T>
#if HWY_COMPILER_HAS_MASK_INTRINSICS
template <typename T>
#if HWY_COMPILER_HAS_MASK_INTRINSICS
template <typename T>
#if HWY_COMPILER_HAS_MASK_INTRINSICS
template <typename T>
#if HWY_COMPILER_HAS_MASK_INTRINSICS
template <typename T>
#if HWY_COMPILER_HAS_MASK_INTRINSICS
template <typename T>
#if HWY_COMPILER_HAS_MASK_INTRINSICS
template <typename T>
#if HWY_COMPILER_HAS_MASK_INTRINSICS
template <typename T>
template <typename T>
template <typename T>
template <typename T>
template <typename T>

// BroadcastSignBit:
return ShiftRight<15>(v);
return ShiftRight<31>(v);

// Load (aligned) and LoadU (unaligned):
template <typename T>
return Vec512<T>{_mm512_load_si512(aligned)};
template <typename T>
return Vec512<T>{_mm512_loadu_si512(p)};

// MaskedLoad: masked-off lanes are zeroed and are not accessed.
template <typename T, HWY_IF_LANE_SIZE(T, 1)>
template <typename T, HWY_IF_LANE_SIZE(T, 2)>
return Vec512<T>{_mm512_maskz_loadu_epi16(m.raw, p)};
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
return Vec512<T>{_mm512_maskz_loadu_epi32(m.raw, p)};
template <typename T, HWY_IF_LANE_SIZE(T, 8)>
return Vec512<T>{_mm512_maskz_loadu_epi64(m.raw, p)};
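// Example (sketch): loading a partial tail of `count` int32 values
// (count <= 16) with the remaining lanes zeroed. Masked-off lanes are not
// accessed, so this cannot fault past the end of the array.
#include <cstddef>
#include <cstdint>
static inline __m512i LoadTail(const int32_t* p, size_t count) {
  const __mmask16 m = static_cast<__mmask16>((1u << count) - 1);  // count <= 16
  return _mm512_maskz_loadu_epi32(m, p);
}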
// LoadDup128: broadcast one 128-bit block to all four blocks. The inline
// asm path, used on some compilers, keeps this to a single broadcast
// instruction instead of a separate load plus insert.
template <typename T>
asm("vbroadcasti128 %1, %[reg]" : [reg] "=x"(out) : "m"(p[0]));
return Vec512<T>{_mm512_broadcast_i32x4(x4.raw)};

asm("vbroadcastf128 %1, %[reg]" : [reg] "=x"(out) : "m"(p[0]));
const __m128 x4 = _mm_loadu_ps(p);

asm("vbroadcastf128 %1, %[reg]" : [reg] "=x"(out) : "m"(p[0]));
const __m128d x2 = _mm_loadu_pd(p);
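// Example (sketch): LoadDup128 semantics via the plain intrinsic; one
// unaligned 16-byte load, broadcast into all four 128-bit blocks.
static inline __m512i BroadcastBlock(const void* p) {
  const __m128i x4 = _mm_loadu_si128(static_cast<const __m128i*>(p));
  return _mm512_broadcast_i32x4(x4);
}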
// Store (aligned):
template <typename T>
_mm512_store_si512(reinterpret_cast<__m512i*>(aligned), v.raw);
_mm512_store_ps(aligned, v.raw);
_mm512_store_pd(aligned, v.raw);

// StoreU (unaligned):
template <typename T>
_mm512_storeu_si512(reinterpret_cast<__m512i*>(p), v.raw);
_mm512_storeu_ps(p, v.raw);
_mm512_storeu_pd(p, v.raw);

// BlendedStore: write only the lanes whose mask bit is set.
template <typename T, HWY_IF_LANE_SIZE(T, 1)>
_mm512_mask_storeu_epi8(p, m.raw, v.raw);
template <typename T, HWY_IF_LANE_SIZE(T, 2)>
_mm512_mask_storeu_epi16(p, m.raw, v.raw);
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
_mm512_mask_storeu_epi32(p, m.raw, v.raw);
template <typename T, HWY_IF_LANE_SIZE(T, 8)>
_mm512_mask_storeu_epi64(p, m.raw, v.raw);
_mm512_mask_storeu_ps(p, m.raw, v.raw);
_mm512_mask_storeu_pd(p, m.raw, v.raw);

// Stream: non-temporal stores that bypass the cache.
template <typename T>
_mm512_stream_si512(reinterpret_cast<__m512i*>(aligned), v.raw);
_mm512_stream_ps(aligned, v.raw);
_mm512_stream_pd(aligned, v.raw);
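// Example (sketch): BlendedStore counterpart of the tail load above; only
// lanes whose mask bit is set are written, leaving trailing memory intact.
#include <cstddef>
#include <cstdint>
static inline void StoreTail(__m512i v, int32_t* p, size_t count) {
  const __mmask16 m = static_cast<__mmask16>((1u << count) - 1);  // count <= 16
  _mm512_mask_storeu_epi32(p, m, v);
}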
// Scatter (offsets are in bytes, indices are in lanes; see the scale arg):
template <typename T>
_mm512_i32scatter_epi32(base, offset.raw, v.raw, 1);
template <typename T>
_mm512_i32scatter_epi32(base, index.raw, v.raw, 4);
template <typename T>
_mm512_i64scatter_epi64(base, offset.raw, v.raw, 1);
template <typename T>
_mm512_i64scatter_epi64(base, index.raw, v.raw, 8);

template <typename T, typename Offset>
static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
template <typename T, typename Index>
static_assert(sizeof(T) == sizeof(Index), "Must match for portability");

_mm512_i32scatter_ps(base, offset.raw, v.raw, 1);
_mm512_i32scatter_ps(base, index.raw, v.raw, 4);
_mm512_i64scatter_pd(base, offset.raw, v.raw, 1);
_mm512_i64scatter_pd(base, index.raw, v.raw, 8);

// Gather:
template <typename T>
return Vec512<T>{_mm512_i32gather_epi32(offset.raw, base, 1)};
template <typename T>
return Vec512<T>{_mm512_i32gather_epi32(index.raw, base, 4)};
template <typename T>
return Vec512<T>{_mm512_i64gather_epi64(offset.raw, base, 1)};
template <typename T>
return Vec512<T>{_mm512_i64gather_epi64(index.raw, base, 8)};
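// Example (sketch): GatherIndex semantics for 32-bit lanes. The scale
// argument multiplies each index to form a byte offset: 4 for lane indices,
// 1 when the index vector already holds byte offsets.
#include <cstdint>
static inline __m512i GatherIndexI32(const int32_t* base, __m512i index) {
  return _mm512_i32gather_epi32(index, base, 4);
}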
template <typename T, typename Offset>
static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
template <typename T, typename Index>
static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
// LowerHalf / UpperHalf:
template <typename T>
return Vec256<T>{_mm512_castsi512_si256(v.raw)};
template <typename T>
template <typename T>
return Vec256<T>{_mm512_extracti32x8_epi32(v.raw, 1)};

// ZeroExtendVector: older GCC lacks the zero-extending cast intrinsic, so
// insert into an explicitly zeroed vector instead.
template <typename T>
template <typename T>
#if !HWY_COMPILER_CLANG && HWY_COMPILER_GCC && (HWY_COMPILER_GCC < 1000)
return Vec512<T>{_mm512_inserti32x8(_mm512_setzero_si512(), lo.raw, 0)};
#if !HWY_COMPILER_CLANG && HWY_COMPILER_GCC && (HWY_COMPILER_GCC < 1000)
#if !HWY_COMPILER_CLANG && HWY_COMPILER_GCC && (HWY_COMPILER_GCC < 1000)

// Combine:
template <typename T>
return Vec512<T>{_mm512_inserti32x8(lo512.raw, hi.raw, 1)};
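// Example (sketch): splitting and recombining halves as LowerHalf /
// UpperHalf / Combine above do. _mm512_extracti32x8_epi32 and
// _mm512_inserti32x8 require AVX-512DQ.
static inline __m512i SwapHalves(__m512i v) {
  const __m256i lo = _mm512_castsi512_si256(v);        // lower 256 bits, free
  const __m256i hi = _mm512_extracti32x8_epi32(v, 1);  // upper 256 bits
  return _mm512_inserti32x8(_mm512_castsi256_si512(hi), lo, 1);
}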
// ShiftLeftBytes / ShiftRightBytes operate independently within each
// 128-bit block.
template <int kBytes, typename T>
static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
return Vec512<T>{_mm512_bslli_epi128(v.raw, kBytes)};

template <int kBytes, typename T>
template <int kLanes, typename T>
template <int kLanes, typename T>

template <int kBytes, typename T>
static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
return Vec512<T>{_mm512_bsrli_epi128(v.raw, kBytes)};

template <int kLanes, typename T>
template <int kBytes, typename T, class V = Vec512<T>>
// Broadcast<kLane>: replicate one lane within each 128-bit block.
template <int kLane>
static_assert(0 <= kLane && kLane < 8, "Invalid lane");
const __m512i lo = _mm512_shufflelo_epi16(v.raw, (0x55 * kLane) & 0xFF);
_mm512_shufflehi_epi16(v.raw, (0x55 * (kLane - 4)) & 0xFF);

template <int kLane>
static_assert(0 <= kLane && kLane < 4, "Invalid lane");
constexpr _MM_PERM_ENUM perm = static_cast<_MM_PERM_ENUM>(0x55 * kLane);

template <int kLane>
static_assert(0 <= kLane && kLane < 2, "Invalid lane");
constexpr _MM_PERM_ENUM perm = kLane ? _MM_PERM_DCDC : _MM_PERM_BABA;

template <int kLane>
static_assert(0 <= kLane && kLane < 8, "Invalid lane");
const __m512i lo = _mm512_shufflelo_epi16(v.raw, (0x55 * kLane) & 0xFF);
_mm512_shufflehi_epi16(v.raw, (0x55 * (kLane - 4)) & 0xFF);

template <int kLane>
static_assert(0 <= kLane && kLane < 4, "Invalid lane");
constexpr _MM_PERM_ENUM perm = static_cast<_MM_PERM_ENUM>(0x55 * kLane);

template <int kLane>
static_assert(0 <= kLane && kLane < 2, "Invalid lane");
constexpr _MM_PERM_ENUM perm = kLane ? _MM_PERM_DCDC : _MM_PERM_BABA;

template <int kLane>
static_assert(0 <= kLane && kLane < 4, "Invalid lane");
constexpr _MM_PERM_ENUM perm = static_cast<_MM_PERM_ENUM>(0x55 * kLane);

template <int kLane>
static_assert(0 <= kLane && kLane < 2, "Invalid lane");
constexpr _MM_PERM_ENUM perm = static_cast<_MM_PERM_ENUM>(0xFF * kLane);
// TableLookupLanes (full-vector variable permute):
template <typename T>

template <typename T, typename TI>
static_assert(sizeof(T) == sizeof(TI), "Index size must match lane");
// Debug builds verify that all indices are within [0, number of lanes):
#if HWY_IS_DEBUG_BUILD
AllTrue(di, Lt(vec, Set(di, static_cast<TI>(64 / sizeof(T))))));

template <typename T, typename TI>
const Rebind<TI, decltype(d)> di;

template <typename T, HWY_IF_LANE_SIZE(T, 4)>
template <typename T, HWY_IF_LANE_SIZE(T, 8)>
return Vec512<T>{_mm512_permutexvar_epi64(idx.raw, v.raw)};
// Reverse (whole vector):
template <typename T, HWY_IF_LANE_SIZE(T, 2)>
alignas(64) constexpr int16_t kReverse[32] = {
    31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16,
    15, 14, 13, 12, 11, 10, 9,  8,  7,  6,  5,  4,  3,  2,  1,  0};
_mm512_permutexvar_epi16(idx.raw, BitCast(di, v).raw)});

template <typename T, HWY_IF_LANE_SIZE(T, 4)>
alignas(64) constexpr int32_t kReverse[16] = {15, 14, 13, 12, 11, 10, 9, 8,
                                              7,  6,  5,  4,  3,  2,  1, 0};

template <typename T, HWY_IF_LANE_SIZE(T, 8)>
alignas(64) constexpr int64_t kReverse[8] = {7, 6, 5, 4, 3, 2, 1, 0};

// Reverse2:
template <typename T, HWY_IF_LANE_SIZE(T, 2)>
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
template <typename T, HWY_IF_LANE_SIZE(T, 8)>

// Reverse4:
template <typename T, HWY_IF_LANE_SIZE(T, 2)>
alignas(64) constexpr int16_t kReverse4[32] = {
    3,  2,  1,  0,  7,  6,  5,  4,  11, 10, 9,  8,  15, 14, 13, 12,
    19, 18, 17, 16, 23, 22, 21, 20, 27, 26, 25, 24, 31, 30, 29, 28};
_mm512_permutexvar_epi16(idx.raw, BitCast(di, v).raw)});

template <typename T, HWY_IF_LANE_SIZE(T, 4)>
template <typename T, HWY_IF_LANE_SIZE(T, 8)>
return Vec512<T>{_mm512_permutex_epi64(v.raw, _MM_SHUFFLE(0, 1, 2, 3))};
return Vec512<double>{_mm512_permutex_pd(v.raw, _MM_SHUFFLE(0, 1, 2, 3))};

// Reverse8:
template <typename T, HWY_IF_LANE_SIZE(T, 2)>
alignas(64) constexpr int16_t kReverse8[32] = {
    7,  6,  5,  4,  3,  2,  1,  0,  15, 14, 13, 12, 11, 10, 9,  8,
    23, 22, 21, 20, 19, 18, 17, 16, 31, 30, 29, 28, 27, 26, 25, 24};
_mm512_permutexvar_epi16(idx.raw, BitCast(di, v).raw)});

template <typename T, HWY_IF_LANE_SIZE(T, 4)>
alignas(64) constexpr int32_t kReverse8[16] = {7,  6,  5,  4,  3,  2,  1, 0,
                                               15, 14, 13, 12, 11, 10, 9, 8};
const Vec512<int32_t> idx = Load(di, kReverse8);
_mm512_permutexvar_epi32(idx.raw, BitCast(di, v).raw)});

template <typename T, HWY_IF_LANE_SIZE(T, 8)>
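// Example (sketch): Reverse for 32-bit lanes, matching the kReverse table
// above. Note _mm512_permutexvar_epi32 takes the index vector first and
// permutes across the whole vector; the epi16 form used by the 16-bit path
// additionally requires AVX-512BW.
static inline __m512i Reverse32(__m512i v) {
  const __m512i idx = _mm512_setr_epi32(15, 14, 13, 12, 11, 10, 9, 8,
                                        7, 6, 5, 4, 3, 2, 1, 0);
  return _mm512_permutexvar_epi32(idx, v);
}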
template <typename T, class V = Vec512<T>>

// ZipLower/ZipUpper return the widened lane type:
template <typename T, typename TW = MakeWide<T>>
template <typename T, typename TW = MakeWide<T>>
template <typename T, typename TW = MakeWide<T>>

// ConcatLowerLower: lower halves of hi and lo.
template <typename T>
return Vec512<T>{_mm512_shuffle_i32x4(lo.raw, hi.raw, _MM_PERM_BABA)};

// ConcatUpperUpper: upper halves of hi and lo.
template <typename T>
return Vec512<T>{_mm512_shuffle_i32x4(lo.raw, hi.raw, _MM_PERM_DCDC)};

// ConcatLowerUpper: upper half of lo, then lower half of hi.
template <typename T>
return Vec512<T>{_mm512_shuffle_i32x4(lo.raw, hi.raw, _MM_PERM_BADC)};

// ConcatUpperLower, via mask blend:
template <typename T>
const __mmask32 mask = (0x0000FFFF);
const __mmask16 mask = (0x00FF);
const __mmask8 mask = (0x0F);
// ConcatOdd: odd lanes of lo, then odd lanes of hi.
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
alignas(64) constexpr uint32_t kIdx[16] = {1,  3,  5,  7,  9,  11, 13, 15,
                                           17, 19, 21, 23, 25, 27, 29, 31};
__mmask16{0xFFFF}, BitCast(du, hi).raw)});

alignas(64) constexpr uint32_t kIdx[16] = {1,  3,  5,  7,  9,  11, 13, 15,
                                           17, 19, 21, 23, 25, 27, 29, 31};
__mmask16{0xFFFF}, hi.raw)};

template <typename T, HWY_IF_LANE_SIZE(T, 8)>
alignas(64) constexpr uint64_t kIdx[8] = {1, 3, 5, 7, 9, 11, 13, 15};
return BitCast(
    d, Vec512<uint64_t>{_mm512_mask2_permutex2var_epi64(
           BitCast(du, lo).raw, Load(du, kIdx).raw, __mmask8{0xFF},

alignas(64) constexpr uint64_t kIdx[8] = {1, 3, 5, 7, 9, 11, 13, 15};
__mmask8{0xFF}, hi.raw)};

// ConcatEven: even lanes of lo, then even lanes of hi.
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
alignas(64) constexpr uint32_t kIdx[16] = {0,  2,  4,  6,  8,  10, 12, 14,
                                           16, 18, 20, 22, 24, 26, 28, 30};
__mmask16{0xFFFF}, BitCast(du, hi).raw)});

alignas(64) constexpr uint32_t kIdx[16] = {0,  2,  4,  6,  8,  10, 12, 14,
                                           16, 18, 20, 22, 24, 26, 28, 30};
__mmask16{0xFFFF}, hi.raw)};

template <typename T, HWY_IF_LANE_SIZE(T, 8)>
alignas(64) constexpr uint64_t kIdx[8] = {0, 2, 4, 6, 8, 10, 12, 14};
return BitCast(
    d, Vec512<uint64_t>{_mm512_mask2_permutex2var_epi64(
           BitCast(du, lo).raw, Load(du, kIdx).raw, __mmask8{0xFF},

alignas(64) constexpr uint64_t kIdx[8] = {0, 2, 4, 6, 8, 10, 12, 14};
__mmask8{0xFF}, hi.raw)};
// DupEven / DupOdd:
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
return Vec512<T>{_mm512_shuffle_epi32(v.raw, _MM_PERM_CCAA)};
template <typename T, HWY_IF_LANE_SIZE(T, 8)>

template <typename T, HWY_IF_LANE_SIZE(T, 4)>
return Vec512<T>{_mm512_shuffle_epi32(v.raw, _MM_PERM_DDBB)};
template <typename T, HWY_IF_LANE_SIZE(T, 8)>

// OddEven: alternating lanes, selected with a lane-size-dependent mask.
template <typename T>
constexpr size_t s = sizeof(T);
constexpr int shift = s == 1 ? 0 : s == 2 ? 32 : s == 4 ? 48 : 56;

// OddEvenBlocks: blend mask 0x33 selects 64-bit lanes {0,1,4,5}, i.e.
// 128-bit blocks 0 and 2, from `even`.
template <typename T>
return Vec512<T>{_mm512_mask_blend_epi64(__mmask8{0x33u}, odd.raw, even.raw)};
_mm512_mask_blend_ps(__mmask16{0x0F0Fu}, odd.raw, even.raw)};
_mm512_mask_blend_pd(__mmask8{0x33u}, odd.raw, even.raw)};
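// Example (sketch): mask_blend takes lanes from the second operand where
// the mask bit is set. 0x33 marks 64-bit lanes {0,1,4,5}, i.e. 128-bit
// blocks 0 and 2, giving exactly the OddEvenBlocks pattern above.
static inline __m512i OddEvenBlocks64(__m512i odd, __m512i even) {
  return _mm512_mask_blend_epi64(static_cast<__mmask8>(0x33), odd, even);
}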
// SwapAdjacentBlocks:
template <typename T>
return Vec512<T>{_mm512_shuffle_i32x4(v.raw, v.raw, _MM_PERM_CDAB)};
return Vec512<float>{_mm512_shuffle_f32x4(v.raw, v.raw, _MM_PERM_CDAB)};

// ReverseBlocks:
template <typename T>
return Vec512<T>{_mm512_shuffle_i32x4(v.raw, v.raw, _MM_PERM_ABCD)};
return Vec512<float>{_mm512_shuffle_f32x4(v.raw, v.raw, _MM_PERM_ABCD)};
// TableLookupBytes: delegate to the 128/256-bit implementations.
template <typename T, typename TI>
template <typename T, typename TI, size_t NI>
const Half<decltype(d512)> d256;
const Half<decltype(d256)> d128;
const auto from_512 =

template <typename T, typename TI>
template <typename T, size_t N, typename TI>
const Half<decltype(d512)> d256;
const Half<decltype(d256)> d128;
const auto bytes_512 =

template <typename T, typename TI>
// Demotions (part w/ narrow lanes):
const Rebind<uint16_t, decltype(df32)> du16;

alignas(64) static constexpr uint64_t kLanes[8] = {0, 2, 4, 6, 0, 2, 4, 6};
alignas(64) static constexpr uint64_t kLanes[8] = {0, 2, 4, 6, 0, 2, 4, 6};
const Vec512<int16_t> even{_mm512_permutexvar_epi64(idx64.raw, i16.raw)};

_mm512_and_si512(u16.raw, _mm512_set1_epi16(0x7FFF))};

alignas(16) static constexpr uint32_t kLanes[4] = {0, 4, 8, 12};
const Vec512<uint8_t> fixed{_mm512_permutexvar_epi32(idx32.raw, u8.raw)};

alignas(64) static constexpr uint64_t kLanes[8] = {0, 2, 4, 6, 0, 2, 4, 6};
const Vec512<uint8_t> even{_mm512_permutexvar_epi64(idx64.raw, u8.raw)};

alignas(16) static constexpr uint32_t kLanes[16] = {0, 4, 8, 12, 0, 4, 8, 12,
                                                    0, 4, 8, 12, 0, 4, 8, 12};
const Vec512<int8_t> fixed{_mm512_permutexvar_epi32(idx32.raw, i8.raw)};

alignas(64) static constexpr uint64_t kLanes[8] = {0, 2, 4, 6, 0, 2, 4, 6};
const Vec512<int8_t> even{_mm512_permutexvar_epi64(idx64.raw, u8.raw)};

// bfloat16: the upper 16 bits of a binary32, so demote by shifting right.
const Rebind<int32_t, decltype(dbf16)> di32;
const Rebind<uint32_t, decltype(dbf16)> du32;
const Rebind<uint16_t, decltype(dbf16)> du16;
const auto bits_in_32 = BitCast(di32, ShiftRight<16>(BitCast(du32, v)));

const Repartition<uint32_t, decltype(dbf16)> du32;

// U8FromU32 (0x80 shuffle index zeroes the output byte):
alignas(16) static constexpr uint32_t k8From32[4] = {0x0C080400u, ~0u, ~0u,
                                                     ~0u};
alignas(16) static constexpr uint32_t kIndex32[4] = {0, 4, 8, 12};
_mm512_permutexvar_epi32(LoadDup128(d32, kIndex32).raw, quads.raw)};
#if !defined(HWY_DISABLE_PCLMUL_AES)

#ifdef HWY_NATIVE_AES
#undef HWY_NATIVE_AES
#else
#define HWY_NATIVE_AES
#endif

// AESRound / AESLastRound: native VAES on AVX3_DL, otherwise two 256-bit
// halves:
#if HWY_TARGET == HWY_AVX3_DL
const Half<decltype(d)> d2;

#if HWY_TARGET == HWY_AVX3_DL
const Half<decltype(d)> d2;

// CLMulLower / CLMulUpper: native VPCLMULQDQ on AVX3_DL, otherwise a
// per-128-bit fallback through memory:
#if HWY_TARGET == HWY_AVX3_DL
alignas(64) uint64_t a[8];
alignas(64) uint64_t b[8];
for (size_t i = 0; i < 8; i += 2) {
  Store(mul, d128, a + i);

#if HWY_TARGET == HWY_AVX3_DL
alignas(64) uint64_t a[8];
alignas(64) uint64_t b[8];
for (size_t i = 0; i < 8; i += 2) {
  Store(mul, d128, a + i);
// Iota: lanes hold first, first + 1, ...
template <typename T, typename T2>
for (size_t i = 0; i < 64 / sizeof(T); ++i) {
  lanes[i] = static_cast<T>(first + static_cast<T2>(i));
}
return Load(d, lanes);
// AllFalse: fast path via kortest where available, plain compare otherwise.
template <typename T>
#if HWY_COMPILER_HAS_MASK_INTRINSICS
return _kortestz_mask64_u8(mask.raw, mask.raw);
return mask.raw == 0;

template <typename T>
#if HWY_COMPILER_HAS_MASK_INTRINSICS
return _kortestz_mask32_u8(mask.raw, mask.raw);
return mask.raw == 0;

template <typename T>
#if HWY_COMPILER_HAS_MASK_INTRINSICS
return _kortestz_mask16_u8(mask.raw, mask.raw);
return mask.raw == 0;

template <typename T>
#if HWY_COMPILER_HAS_MASK_INTRINSICS
return _kortestz_mask8_u8(mask.raw, mask.raw);
return mask.raw == 0;
template <typename T>

// AllTrue: kortestc reports whether the OR of the mask with itself is
// all-ones.
template <typename T>
#if HWY_COMPILER_HAS_MASK_INTRINSICS
return _kortestc_mask64_u8(mask.raw, mask.raw);
return mask.raw == 0xFFFFFFFFFFFFFFFFull;

template <typename T>
#if HWY_COMPILER_HAS_MASK_INTRINSICS
return _kortestc_mask32_u8(mask.raw, mask.raw);
return mask.raw == 0xFFFFFFFFull;

template <typename T>
#if HWY_COMPILER_HAS_MASK_INTRINSICS
return _kortestc_mask16_u8(mask.raw, mask.raw);
return mask.raw == 0xFFFFull;

template <typename T>
#if HWY_COMPILER_HAS_MASK_INTRINSICS
return _kortestc_mask8_u8(mask.raw, mask.raw);
return mask.raw == 0xFFull;
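// Example (sketch): without the _kortest* intrinsics, AVX-512 masks are
// plain integers, so AllTrue/AllFalse reduce to integer compares exactly as
// in the fallback paths above.
static inline bool AllLanesEqual32(__m512i a, __m512i b) {
  return _mm512_cmpeq_epi32_mask(a, b) == static_cast<__mmask16>(0xFFFF);
}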
template <typename T>
template <typename T>

// StoreMaskBits: copy the mask register to memory as a bit array.
template <typename T>
const size_t kNumBytes = 8 / sizeof(T);
CopyBytes<kNumBytes>(&mask.raw, bits);

template <typename T>
template <typename T, HWY_IF_NOT_LANE_SIZE(T, 1)>
template <typename T, HWY_IF_LANE_SIZE(T, 1)>
                      const Mask512<T> mask) {

// Compress: native for 32-bit lanes (VPCOMPRESSD).
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
return Vec512<T>{_mm512_maskz_compress_epi32(mask.raw, v.raw)};

// 64-bit lanes: look up permutation indices instead (see table below).
template <typename T, HWY_IF_LANE_SIZE(T, 8)>
// Each entry packs eight 4-bit lane indices, indexed by the 8-bit mask:
alignas(16) constexpr uint64_t packed_array[256] = {
    0x76543210, 0x76543210, 0x76543201, 0x76543210, 0x76543102, 0x76543120,
    0x76543021, 0x76543210, 0x76542103, 0x76542130, 0x76542031, 0x76542310,
    0x76541032, 0x76541320, 0x76540321, 0x76543210, 0x76532104, 0x76532140,
    0x76532041, 0x76532410, 0x76531042, 0x76531420, 0x76530421, 0x76534210,
    0x76521043, 0x76521430, 0x76520431, 0x76524310, 0x76510432, 0x76514320,
    0x76504321, 0x76543210, 0x76432105, 0x76432150, 0x76432051, 0x76432510,
    0x76431052, 0x76431520, 0x76430521, 0x76435210, 0x76421053, 0x76421530,
    0x76420531, 0x76425310, 0x76410532, 0x76415320, 0x76405321, 0x76453210,
    0x76321054, 0x76321540, 0x76320541, 0x76325410, 0x76310542, 0x76315420,
    0x76305421, 0x76354210, 0x76210543, 0x76215430, 0x76205431, 0x76254310,
    0x76105432, 0x76154320, 0x76054321, 0x76543210, 0x75432106, 0x75432160,
    0x75432061, 0x75432610, 0x75431062, 0x75431620, 0x75430621, 0x75436210,
    0x75421063, 0x75421630, 0x75420631, 0x75426310, 0x75410632, 0x75416320,
    0x75406321, 0x75463210, 0x75321064, 0x75321640, 0x75320641, 0x75326410,
    0x75310642, 0x75316420, 0x75306421, 0x75364210, 0x75210643, 0x75216430,
    0x75206431, 0x75264310, 0x75106432, 0x75164320, 0x75064321, 0x75643210,
    0x74321065, 0x74321650, 0x74320651, 0x74326510, 0x74310652, 0x74316520,
    0x74306521, 0x74365210, 0x74210653, 0x74216530, 0x74206531, 0x74265310,
    0x74106532, 0x74165320, 0x74065321, 0x74653210, 0x73210654, 0x73216540,
    0x73206541, 0x73265410, 0x73106542, 0x73165420, 0x73065421, 0x73654210,
    0x72106543, 0x72165430, 0x72065431, 0x72654310, 0x71065432, 0x71654320,
    0x70654321, 0x76543210, 0x65432107, 0x65432170, 0x65432071, 0x65432710,
    0x65431072, 0x65431720, 0x65430721, 0x65437210, 0x65421073, 0x65421730,
    0x65420731, 0x65427310, 0x65410732, 0x65417320, 0x65407321, 0x65473210,
    0x65321074, 0x65321740, 0x65320741, 0x65327410, 0x65310742, 0x65317420,
    0x65307421, 0x65374210, 0x65210743, 0x65217430, 0x65207431, 0x65274310,
    0x65107432, 0x65174320, 0x65074321, 0x65743210, 0x64321075, 0x64321750,
    0x64320751, 0x64327510, 0x64310752, 0x64317520, 0x64307521, 0x64375210,
    0x64210753, 0x64217530, 0x64207531, 0x64275310, 0x64107532, 0x64175320,
    0x64075321, 0x64753210, 0x63210754, 0x63217540, 0x63207541, 0x63275410,
    0x63107542, 0x63175420, 0x63075421, 0x63754210, 0x62107543, 0x62175430,
    0x62075431, 0x62754310, 0x61075432, 0x61754320, 0x60754321, 0x67543210,
    0x54321076, 0x54321760, 0x54320761, 0x54327610, 0x54310762, 0x54317620,
    0x54307621, 0x54376210, 0x54210763, 0x54217630, 0x54207631, 0x54276310,
    0x54107632, 0x54176320, 0x54076321, 0x54763210, 0x53210764, 0x53217640,
    0x53207641, 0x53276410, 0x53107642, 0x53176420, 0x53076421, 0x53764210,
    0x52107643, 0x52176430, 0x52076431, 0x52764310, 0x51076432, 0x51764320,
    0x50764321, 0x57643210, 0x43210765, 0x43217650, 0x43207651, 0x43276510,
    0x43107652, 0x43176520, 0x43076521, 0x43765210, 0x42107653, 0x42176530,
    0x42076531, 0x42765310, 0x41076532, 0x41765320, 0x40765321, 0x47653210,
    0x32107654, 0x32176540, 0x32076541, 0x32765410, 0x31076542, 0x31765420,
    0x30765421, 0x37654210, 0x21076543, 0x21765430, 0x20765431, 0x27654310,
    0x10765432, 0x17654320, 0x07654321, 0x76543210};

// Replicate the entry and shift each 64-bit lane so its own index is in the
// low bits, ready for a variable permute:
const auto packed = Set(du64, packed_array[mask.raw]);
alignas(64) constexpr uint64_t shifts[8] = {0, 4, 8, 12, 16, 20, 24, 28};
const auto indices = Indices512<T>{(packed >> Load(du64, shifts)).raw};
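// Example (scalar sketch, not from the header): decoding one packed_array
// entry, least-significant nibble first. E.g. for mask 0b00000101 the entry
// is 0x76543120: lanes 0 and 2 come first, the remaining lanes follow in
// order.
#include <cstdint>
static inline void UnpackCompressIndices(uint64_t packed, uint32_t idx[8]) {
  for (int i = 0; i < 8; ++i) {
    idx[i] = static_cast<uint32_t>((packed >> (4 * i)) & 0xF);
  }
}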
// 16-bit Compress: native only on AVX3_DL; otherwise compress two promoted
// 32-bit halves and concatenate them.
template <typename T, HWY_IF_LANE_SIZE(T, 2)>
const Rebind<uint16_t, decltype(d)> du;
#if HWY_TARGET == HWY_AVX3_DL
const Vec256<uint16_t> cu{_mm256_maskz_compress_epi16(mask.raw, vu.raw)};
const Mask512<int32_t> mask32{static_cast<__mmask16>(mask.raw)};

template <typename T, HWY_IF_LANE_SIZE(T, 2)>
const Rebind<uint16_t, decltype(d)> du;
#if HWY_TARGET == HWY_AVX3_DL
const Vec512<uint16_t> cu{_mm512_maskz_compress_epi16(mask.raw, vu.raw)};

const Half<decltype(du)> duh;
const uint32_t mask_bits{mask.raw};
const Mask512<int32_t> mask0{static_cast<__mmask16>(mask_bits & 0xFFFF)};
const Mask512<int32_t> mask1{static_cast<__mmask16>(mask_bits >> 16)};
const auto compressed0 = Compress(promoted0, mask0);
const auto compressed1 = Compress(promoted1, mask1);

// Concatenate: shift the second result so it starts right after the num0
// lanes kept from the first.
const size_t num0 = CountTrue(dw, mask0);
const __mmask32 m_upper = ~((1u << num0) - 1);
alignas(64) uint16_t iota[64] = {
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,
    16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31};
const auto idx = LoadU(du, iota + 32 - num0);
const Vec512<uint16_t> cu{_mm512_mask_permutexvar_epi16(
    demoted0.raw, m_upper, idx.raw, demoted1.raw)};
template <typename T>

// CompressStore:
template <typename T, HWY_IF_LANE_SIZE(T, 2)>
const Rebind<uint16_t, decltype(d)> du;
const uint64_t mask_bits{mask.raw};
#if HWY_TARGET == HWY_AVX3_DL
_mm512_mask_compressstoreu_epi16(unaligned, mask.raw, vu.raw);

const Half<decltype(du)> duh;
const uint64_t maskL = mask_bits & 0xFFFF;
const uint64_t maskH = mask_bits >> 16;
const auto compressed0 = Compress(promoted0, mask0);
const auto compressed1 = Compress(promoted1, mask1);
const Half<decltype(d)> dh;
StoreU(demoted0, dh, unaligned);

// MemorySanitizer does not see the masked store, so mark the bytes that
// were actually written as initialized.
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
_mm512_mask_compressstoreu_epi32(unaligned, mask.raw, v.raw);
const size_t count = PopCount(uint64_t{mask.raw});
__msan_unpoison(unaligned, count * sizeof(T));

template <typename T, HWY_IF_LANE_SIZE(T, 8)>
_mm512_mask_compressstoreu_epi64(unaligned, mask.raw, v.raw);
const size_t count = PopCount(uint64_t{mask.raw});
__msan_unpoison(unaligned, count * sizeof(T));

_mm512_mask_compressstoreu_ps(unaligned, mask.raw, v.raw);
const size_t count = PopCount(uint64_t{mask.raw});
__msan_unpoison(unaligned, count * sizeof(float));

_mm512_mask_compressstoreu_pd(unaligned, mask.raw, v.raw);
const size_t count = PopCount(uint64_t{mask.raw});
__msan_unpoison(unaligned, count * sizeof(double));

// CompressBlendedStore:
template <typename T>
__msan_unpoison(unaligned, count * sizeof(T));
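// Example (sketch): CompressStore for 32-bit lanes via the native
// compress-store; returns the number of lanes written, as the PopCount in
// the code above computes.
#include <cstddef>
#include <cstdint>
static inline size_t CompressStoreI32(__m512i v, __mmask16 m, int32_t* out) {
  _mm512_mask_compressstoreu_epi32(out, m, v);
  return static_cast<size_t>(_mm_popcnt_u32(static_cast<uint32_t>(m)));
}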
// StoreInterleaved3 (e.g. RGB): shuffle each input so r/g/b land in the
// right output byte (index 0x80 zeroes a byte), OR the three together, then
// rearrange 128-bit blocks into sequential output order.
template <typename T>
const auto k5 = Set(d, 5);
const auto k6 = Set(d, 6);

alignas(16) static constexpr uint8_t tbl_r0[16] = {
    0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80,
    3, 0x80, 0x80, 4, 0x80, 0x80, 5};
alignas(16) static constexpr uint8_t tbl_g0[16] = {
    0x80, 0, 0x80, 0x80, 1, 0x80,
    0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, 4, 0x80, 0x80};
const auto shuf_b0 = CombineShiftRightBytes<15>(d, shuf_g0, shuf_g0);
const auto i = (r0 | g0 | b0).raw;

const auto shuf_r1 = shuf_b0 + k6;
const auto shuf_g1 = shuf_r0 + k5;
const auto shuf_b1 = shuf_g0 + k5;
const auto j = (r1 | g1 | b1).raw;

const auto shuf_r2 = shuf_b1 + k6;
const auto shuf_g2 = shuf_r1 + k5;
const auto shuf_b2 = shuf_g1 + k5;
const auto k = (r2 | g2 | b2).raw;

const auto k3_k0_i3_i0 = _mm512_shuffle_i64x2(i, k, _MM_PERM_DADA);
const auto i1_i2_j0_j1 = _mm512_shuffle_i64x2(j, i, _MM_PERM_BCAB);
const auto j2_j3_k1_k2 = _mm512_shuffle_i64x2(k, j, _MM_PERM_CDBC);

const __mmask8 m = 0xCC;
const auto i1_k0_j0_i0 = _mm512_mask_blend_epi64(m, k3_k0_i3_i0, i1_i2_j0_j1);
const auto j2_i2_k1_j1 = _mm512_mask_blend_epi64(m, i1_i2_j0_j1, j2_j3_k1_k2);
const auto k3_j3_i3_k2 = _mm512_mask_blend_epi64(m, j2_j3_k1_k2, k3_k0_i3_i0);
// StoreInterleaved4 (e.g. RGBA): zip pairs to 16-bit, then 32-bit groups,
// then rearrange 128-bit blocks.
const auto ba0 = ZipLower(d16, v0, v1);
const auto dc0 = ZipLower(d16, v2, v3);
const auto ba8 = ZipUpper(d16, v0, v1);
const auto dc8 = ZipUpper(d16, v2, v3);
const auto i = ZipLower(d32, ba0, dc0).raw;
const auto j = ZipUpper(d32, ba0, dc0).raw;
const auto k = ZipLower(d32, ba8, dc8).raw;
const auto l = ZipUpper(d32, ba8, dc8).raw;

const auto j1_j0_i1_i0 = _mm512_shuffle_i64x2(i, j, _MM_PERM_BABA);
const auto l1_l0_k1_k0 = _mm512_shuffle_i64x2(k, l, _MM_PERM_BABA);
const auto j3_j2_i3_i2 = _mm512_shuffle_i64x2(i, j, _MM_PERM_DCDC);
const auto l3_l2_k3_k2 = _mm512_shuffle_i64x2(k, l, _MM_PERM_DCDC);
constexpr _MM_PERM_ENUM k20 = _MM_PERM_CACA;
constexpr _MM_PERM_ENUM k31 = _MM_PERM_DBDB;
const auto l0_k0_j0_i0 = _mm512_shuffle_i64x2(j1_j0_i1_i0, l1_l0_k1_k0, k20);
const auto l1_k1_j1_i1 = _mm512_shuffle_i64x2(j1_j0_i1_i0, l1_l0_k1_k0, k31);
const auto l2_k2_j2_i2 = _mm512_shuffle_i64x2(j3_j2_i3_i2, l3_l2_k3_k2, k20);
const auto l3_k3_j3_i3 = _mm512_shuffle_i64x2(j3_j2_i3_i2, l3_l2_k3_k2, k31);
// MulEven / MulOdd for 64-bit lanes: schoolbook multiplication from four
// 32x32 -> 64-bit partial products (see the scalar sketch below).
const DFromV<decltype(a)> du64;
const auto maskL = Set(du64, 0xFFFFFFFFULL);
const auto a32 = BitCast(du32, a);
const auto b32 = BitCast(du32, b);

const auto aLbL = MulEven(a32, b32);
const auto w3 = aLbL & maskL;

const auto t2 = MulEven(aH, b32) + ShiftRight<32>(aLbL);
const auto w2 = t2 & maskL;
const auto w1 = ShiftRight<32>(t2);

const auto t = MulEven(a32, bH) + w2;
const auto k = ShiftRight<32>(t);

const auto mulH = MulEven(aH, bH) + w1 + k;
const auto mulL = ShiftLeft<32>(t) + w3;

const DFromV<decltype(a)> du64;
const auto maskL = Set(du64, 0xFFFFFFFFULL);
const auto a32 = BitCast(du32, a);
const auto b32 = BitCast(du32, b);

const auto aLbL = MulEven(a32, b32);
const auto w3 = aLbL & maskL;

const auto t2 = MulEven(aH, b32) + ShiftRight<32>(aLbL);
const auto w2 = t2 & maskL;
const auto w1 = ShiftRight<32>(t2);

const auto t = MulEven(a32, bH) + w2;
const auto k = ShiftRight<32>(t);

const auto mulH = MulEven(aH, bH) + w1 + k;
const auto mulL = ShiftLeft<32>(t) + w3;
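// Scalar sketch (not from the header) of the decomposition above: the high
// 64 bits of a 64x64-bit product from four 32x32 partial products, with
// t/t2, w1/w2 and k named as in the vector code.
#include <cstdint>
static inline uint64_t MulHigh64(uint64_t a, uint64_t b) {
  const uint64_t aL = a & 0xFFFFFFFFu, aH = a >> 32;
  const uint64_t bL = b & 0xFFFFFFFFu, bH = b >> 32;
  const uint64_t aLbL = aL * bL;
  const uint64_t t2 = aH * bL + (aLbL >> 32);  // cannot overflow 64 bits
  const uint64_t w2 = t2 & 0xFFFFFFFFu;
  const uint64_t w1 = t2 >> 32;
  const uint64_t t = aL * bH + w2;
  const uint64_t k = t >> 32;
  // The low half would be (t << 32) + (aLbL & 0xFFFFFFFFu).
  return aH * bH + w1 + k;
}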
// SumOfLanes: broadcast the reduction result to all lanes.
return Set(d, _mm512_reduce_add_epi32(v.raw));
return Set(d, _mm512_reduce_add_epi64(v.raw));
return Set(d, static_cast<uint32_t>(_mm512_reduce_add_epi32(v.raw)));
return Set(d, static_cast<uint64_t>(_mm512_reduce_add_epi64(v.raw)));
return Set(d, _mm512_reduce_add_ps(v.raw));
return Set(d, _mm512_reduce_add_pd(v.raw));

// MinOfLanes:
return Set(d, _mm512_reduce_min_epi32(v.raw));
return Set(d, _mm512_reduce_min_epi64(v.raw));
return Set(d, _mm512_reduce_min_epu32(v.raw));
return Set(d, _mm512_reduce_min_epu64(v.raw));
return Set(d, _mm512_reduce_min_ps(v.raw));
return Set(d, _mm512_reduce_min_pd(v.raw));

// 16-bit lanes: reduce the even/odd 16-bit halves of 32-bit lanes, then
// broadcast the result back into both halves.
template <typename T, HWY_IF_LANE_SIZE(T, 2)>
const auto odd = ShiftRight<16>(BitCast(d32, v));
return BitCast(d, Or(min, ShiftLeft<16>(min)));

// MaxOfLanes:
return Set(d, _mm512_reduce_max_epi32(v.raw));
return Set(d, _mm512_reduce_max_epi64(v.raw));
return Set(d, _mm512_reduce_max_epu32(v.raw));
return Set(d, _mm512_reduce_max_epu64(v.raw));
return Set(d, _mm512_reduce_max_ps(v.raw));
return Set(d, _mm512_reduce_max_pd(v.raw));

template <typename T, HWY_IF_LANE_SIZE(T, 2)>
const auto odd = ShiftRight<16>(BitCast(d32, v));
return BitCast(d, Or(min, ShiftLeft<16>(min)));
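// Example (sketch): the reductions above return their result broadcast to
// every lane; the underlying _mm512_reduce_* intrinsics are compiler-
// synthesized sequences (not single instructions) returning a scalar.
static inline __m512i SumOfLanesBroadcast(__m512i v) {
  return _mm512_set1_epi32(_mm512_reduce_add_epi32(v));
}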