21 #include <wasm_simd128.h>
26 #ifdef HWY_WASM_OLD_NAMES
27 #define wasm_i8x16_shuffle wasm_v8x16_shuffle
28 #define wasm_i16x8_shuffle wasm_v16x8_shuffle
29 #define wasm_i32x4_shuffle wasm_v32x4_shuffle
30 #define wasm_i64x2_shuffle wasm_v64x2_shuffle
31 #define wasm_u16x8_extend_low_u8x16 wasm_i16x8_widen_low_u8x16
32 #define wasm_u32x4_extend_low_u16x8 wasm_i32x4_widen_low_u16x8
33 #define wasm_i32x4_extend_low_i16x8 wasm_i32x4_widen_low_i16x8
34 #define wasm_i16x8_extend_low_i8x16 wasm_i16x8_widen_low_i8x16
35 #define wasm_u32x4_extend_high_u16x8 wasm_i32x4_widen_high_u16x8
36 #define wasm_i32x4_extend_high_i16x8 wasm_i32x4_widen_high_i16x8
37 #define wasm_i32x4_trunc_sat_f32x4 wasm_i32x4_trunc_saturate_f32x4
38 #define wasm_u8x16_add_sat wasm_u8x16_add_saturate
39 #define wasm_u8x16_sub_sat wasm_u8x16_sub_saturate
40 #define wasm_u16x8_add_sat wasm_u16x8_add_saturate
41 #define wasm_u16x8_sub_sat wasm_u16x8_sub_saturate
42 #define wasm_i8x16_add_sat wasm_i8x16_add_saturate
43 #define wasm_i8x16_sub_sat wasm_i8x16_sub_saturate
44 #define wasm_i16x8_add_sat wasm_i16x8_add_saturate
45 #define wasm_i16x8_sub_sat wasm_i16x8_sub_saturate
53 using Full128 = Simd<T, 16 /
sizeof(T), 0>;
56 using Full64 = Simd<T, 8 /
sizeof(T), 0>;
71 template <
typename T,
size_t N = 16 /
sizeof(T)>
79 return *
this = (*
this * other);
82 return *
this = (*
this / other);
85 return *
this = (*
this + other);
88 return *
this = (*
this - other);
91 return *
this = (*
this & other);
94 return *
this = (*
this | other);
97 return *
this = (*
this ^ other);
103 template <
typename T>
104 using Vec64 = Vec128<T, 8 /
sizeof(T)>;
107 template <
typename T,
size_t N = 16 /
sizeof(T)>
116 template <
typename T,
size_t N>
125 using DFromV = decltype(detail::DeduceD()(V()));
128 using TFromV = TFromD<DFromV<V>>;
136 return static_cast<__v128_u
>(
v);
139 return static_cast<__v128_u
>(
v);
142 template <
typename T,
size_t N>
148 template <
typename T>
157 template <
typename T,
size_t N>
165 template <
typename T,
size_t N,
typename FromT>
167 Vec128<FromT,
N *
sizeof(T) /
sizeof(FromT)>
v) {
174 template <
typename T,
size_t N, HWY_IF_LE128(T, N)>
178 template <
size_t N, HWY_IF_LE128(
float, N)>
189 template <
size_t N, HWY_IF_LE128(u
int8_t, N)>
193 template <
size_t N, HWY_IF_LE128(u
int16_t, N)>
198 template <
size_t N, HWY_IF_LE128(u
int32_t, N)>
203 template <
size_t N, HWY_IF_LE128(u
int64_t, N)>
209 template <
size_t N, HWY_IF_LE128(
int8_t, N)>
213 template <
size_t N, HWY_IF_LE128(
int16_t, N)>
217 template <
size_t N, HWY_IF_LE128(
int32_t, N)>
221 template <
size_t N, HWY_IF_LE128(
int64_t, N)>
226 template <
size_t N, HWY_IF_LE128(
float, N)>
235 template <
typename T,
size_t N, HWY_IF_LE128(T, N)>
243 template <
typename T,
size_t N,
typename T2>
244 Vec128<T, N>
Iota(
const Simd<T, N, 0>
d,
const T2 first) {
246 for (
size_t i = 0; i < 16 /
sizeof(T); ++i) {
247 lanes[i] =
static_cast<T
>(first +
static_cast<T2
>(i));
249 return Load(
d, lanes);
460 template <
int kBits,
size_t N>
464 template <
int kBits,
size_t N>
468 template <
int kBits,
size_t N>
472 template <
int kBits,
size_t N>
476 template <
int kBits,
size_t N>
480 template <
int kBits,
size_t N>
486 template <
int kBits,
size_t N>
490 template <
int kBits,
size_t N>
494 template <
int kBits,
size_t N>
498 template <
int kBits,
size_t N>
502 template <
int kBits,
size_t N>
506 template <
int kBits,
size_t N>
512 template <
int kBits,
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 1)>
519 : (shifted &
Set(d8,
static_cast<T
>((0xFF << kBits) & 0xFF)));
522 template <
int kBits,
size_t N>
528 return shifted &
Set(d8, 0xFF >> kBits);
531 template <
int kBits,
size_t N>
536 const auto shifted_sign =
BitCast(di,
Set(du, 0x80 >> kBits));
537 return (shifted ^ shifted_sign) - shifted_sign;
541 template <
int kBits,
typename T,
size_t N>
543 constexpr
size_t kSizeInBits =
sizeof(T) * 8;
544 static_assert(0 <= kBits && kBits < kSizeInBits,
"Invalid shift count");
545 if (kBits == 0)
return v;
546 return Or(ShiftRight<kBits>(
v), ShiftLeft<kSizeInBits - kBits>(
v));
620 template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 1)>
626 return shifted &
Set(d8,
static_cast<T
>((0xFF << bits) & 0xFF));
636 return shifted &
Set(d8, 0xFF >> bits);
644 const auto shifted_sign =
BitCast(di,
Set(du, 0x80 >> bits));
645 return (shifted ^ shifted_sign) - shifted_sign;
667 HWY_API Vec128<uint64_t, N>
Min(Vec128<uint64_t, N> a, Vec128<uint64_t, N> b) {
669 const uint64_t a0 =
static_cast<uint64_t
>(wasm_i64x2_extract_lane(a.raw, 0));
670 const uint64_t b0 =
static_cast<uint64_t
>(wasm_i64x2_extract_lane(b.raw, 0));
671 const uint64_t a1 =
static_cast<uint64_t
>(wasm_i64x2_extract_lane(a.raw, 1));
672 const uint64_t b1 =
static_cast<uint64_t
>(wasm_i64x2_extract_lane(b.raw, 1));
674 return Vec128<uint64_t, N>{wasm_v128_load(min)};
691 HWY_API Vec128<int64_t, N>
Min(Vec128<int64_t, N> a, Vec128<int64_t, N> b) {
692 alignas(16) int64_t min[4];
693 min[0] =
HWY_MIN(wasm_i64x2_extract_lane(a.raw, 0),
694 wasm_i64x2_extract_lane(b.raw, 0));
695 min[1] =
HWY_MIN(wasm_i64x2_extract_lane(a.raw, 1),
696 wasm_i64x2_extract_lane(b.raw, 1));
697 return Vec128<int64_t, N>{wasm_v128_load(min)};
722 HWY_API Vec128<uint64_t, N>
Max(Vec128<uint64_t, N> a, Vec128<uint64_t, N> b) {
724 const uint64_t a0 =
static_cast<uint64_t
>(wasm_i64x2_extract_lane(a.raw, 0));
725 const uint64_t b0 =
static_cast<uint64_t
>(wasm_i64x2_extract_lane(b.raw, 0));
726 const uint64_t a1 =
static_cast<uint64_t
>(wasm_i64x2_extract_lane(a.raw, 1));
727 const uint64_t b1 =
static_cast<uint64_t
>(wasm_i64x2_extract_lane(b.raw, 1));
729 return Vec128<uint64_t, N>{wasm_v128_load(max)};
746 HWY_API Vec128<int64_t, N>
Max(Vec128<int64_t, N> a, Vec128<int64_t, N> b) {
747 alignas(16) int64_t max[2];
748 max[0] =
HWY_MAX(wasm_i64x2_extract_lane(a.raw, 0),
749 wasm_i64x2_extract_lane(b.raw, 0));
750 max[1] =
HWY_MAX(wasm_i64x2_extract_lane(a.raw, 1),
751 wasm_i64x2_extract_lane(b.raw, 1));
752 return Vec128<int64_t, N>{wasm_v128_load(max)};
792 const auto al = wasm_u32x4_extend_low_u16x8(a.
raw);
793 const auto ah = wasm_u32x4_extend_high_u16x8(a.
raw);
794 const auto bl = wasm_u32x4_extend_low_u16x8(b.
raw);
795 const auto bh = wasm_u32x4_extend_high_u16x8(b.
raw);
796 const auto l = wasm_i32x4_mul(al, bl);
797 const auto h = wasm_i32x4_mul(ah, bh);
800 wasm_i16x8_shuffle(l, h, 1, 3, 5, 7, 9, 11, 13, 15)};
806 const auto al = wasm_i32x4_extend_low_i16x8(a.
raw);
807 const auto ah = wasm_i32x4_extend_high_i16x8(a.
raw);
808 const auto bl = wasm_i32x4_extend_low_i16x8(b.
raw);
809 const auto bh = wasm_i32x4_extend_high_i16x8(b.
raw);
810 const auto l = wasm_i32x4_mul(al, bl);
811 const auto h = wasm_i32x4_mul(ah, bh);
814 wasm_i16x8_shuffle(l, h, 1, 3, 5, 7, 9, 11, 13, 15)};
836 HWY_API Vec128<int64_t, (
N + 1) / 2>
MulEven(
const Vec128<int32_t, N> a,
837 const Vec128<int32_t, N> b) {
839 const auto kEvenMask = wasm_i32x4_make(-1, 0, -1, 0);
840 const auto ae = wasm_v128_and(a.raw, kEvenMask);
841 const auto be = wasm_v128_and(b.raw, kEvenMask);
842 return Vec128<int64_t, (
N + 1) / 2>{wasm_i64x2_mul(ae, be)};
845 HWY_API Vec128<uint64_t, (
N + 1) / 2>
MulEven(
const Vec128<uint32_t, N> a,
846 const Vec128<uint32_t, N> b) {
848 const auto kEvenMask = wasm_i32x4_make(-1, 0, -1, 0);
849 const auto ae = wasm_v128_and(a.raw, kEvenMask);
850 const auto be = wasm_v128_and(b.raw, kEvenMask);
851 return Vec128<uint64_t, (
N + 1) / 2>{wasm_i64x2_mul(ae, be)};
856 template <
typename T,
size_t N, HWY_IF_FLOAT(T)>
887 const Vec128<float, N> b) {
888 return Vec128<float, N>{wasm_f32x4_div(a.raw, b.raw)};
894 const Vec128<float, N> one = Vec128<float, N>{wasm_f32x4_splat(1.0f)};
910 const Vec128<float, N> x,
911 const Vec128<float, N> add) {
914 return mul * x + add;
920 const Vec128<float, N> x,
921 const Vec128<float, N> add) {
923 return add - mul * x;
929 const Vec128<float, N> x,
930 const Vec128<float, N> sub) {
933 return mul * x - sub;
939 const Vec128<float, N> x,
940 const Vec128<float, N> sub) {
942 return Neg(mul) * x - sub;
949 HWY_API Vec128<float, N>
Sqrt(
const Vec128<float, N>
v) {
950 return Vec128<float, N>{wasm_f32x4_sqrt(
v.raw)};
957 const Vec128<float, N> one = Vec128<float, N>{wasm_f32x4_splat(1.0f)};
958 return one /
Sqrt(
v);
966 return Vec128<float, N>{wasm_f32x4_nearest(
v.raw)};
972 return Vec128<float, N>{wasm_f32x4_trunc(
v.raw)};
977 HWY_API Vec128<float, N>
Ceil(
const Vec128<float, N>
v) {
978 return Vec128<float, N>{wasm_f32x4_ceil(
v.raw)};
984 return Vec128<float, N>{wasm_f32x4_floor(
v.raw)};
991 template <
typename TFrom,
typename TTo,
size_t N>
993 Mask128<TFrom, N> m) {
994 static_assert(
sizeof(TFrom) ==
sizeof(TTo),
"Must have same size");
995 return Mask128<TTo, N>{m.raw};
998 template <
typename T,
size_t N>
1000 static_assert(!hwy::IsFloat<T>(),
"Only integer vectors supported");
1001 return (
v & bit) == bit;
1153 const auto a32 =
BitCast(d32, a);
1154 const auto b32 =
BitCast(d32, b);
1156 const auto m_gt = a32 > b32;
1159 const auto m_eq = a32 == b32;
1160 const auto lo_in_hi = wasm_i32x4_shuffle(m_gt.raw, m_gt.raw, 0, 0, 2, 2);
1163 const auto gt =
Or(lo_gt, m_gt);
1174 template <
typename T,
size_t N>
1195 template <
typename T,
size_t N>
1196 HWY_API Mask128<T, N>
FirstN(
const Simd<T, N, 0>
d,
size_t num) {
1205 template <
typename T,
size_t N>
1212 template <
typename T,
size_t N>
1220 template <
typename T,
size_t N>
1227 template <
typename T,
size_t N>
1234 template <
typename T,
size_t N>
1241 template <
typename T,
size_t N>
1242 HWY_API Vec128<T, N>
OrAnd(Vec128<T, N> o, Vec128<T, N> a1, Vec128<T, N> a2) {
1243 return Or(o,
And(a1, a2));
1248 template <
typename T,
size_t N>
1256 template <
typename T,
size_t N>
1257 HWY_API Vec128<T, N>
operator&(
const Vec128<T, N> a,
const Vec128<T, N> b) {
1261 template <
typename T,
size_t N>
1262 HWY_API Vec128<T, N>
operator|(
const Vec128<T, N> a,
const Vec128<T, N> b) {
1266 template <
typename T,
size_t N>
1267 HWY_API Vec128<T, N>
operator^(
const Vec128<T, N> a,
const Vec128<T, N> b) {
1273 template <
typename T,
size_t N>
1275 const Vec128<T, N> sign) {
1276 static_assert(IsFloat<T>(),
"Only makes sense for floating-point");
1281 template <
typename T,
size_t N>
1283 const Vec128<T, N> sign) {
1284 static_assert(IsFloat<T>(),
"Only makes sense for floating-point");
1290 template <
typename T,
size_t N, HWY_IF_NOT_LANE_SIZE(T, 1)>
1303 template <
typename T,
size_t N>
1305 return Mask128<T, N>{
v.raw};
1308 template <
typename T,
size_t N>
1310 return Vec128<T, N>{
v.raw};
1314 template <
typename T,
size_t N>
1321 template <
typename T,
size_t N>
1327 template <
typename T,
size_t N>
1332 template <
typename T,
size_t N>
1335 static_assert(IsSigned<T>(),
"Only works for signed/float");
1343 template <
typename T,
size_t N, HWY_IF_FLOAT(T)>
1346 const auto zero =
Zero(
d);
1352 template <
typename T,
size_t N>
1353 HWY_API Mask128<T, N>
Not(
const Mask128<T, N> m) {
1357 template <
typename T,
size_t N>
1358 HWY_API Mask128<T, N>
And(
const Mask128<T, N> a, Mask128<T, N> b) {
1359 const Simd<T, N, 0>
d;
1363 template <
typename T,
size_t N>
1364 HWY_API Mask128<T, N>
AndNot(
const Mask128<T, N> a, Mask128<T, N> b) {
1365 const Simd<T, N, 0>
d;
1369 template <
typename T,
size_t N>
1370 HWY_API Mask128<T, N>
Or(
const Mask128<T, N> a, Mask128<T, N> b) {
1371 const Simd<T, N, 0>
d;
1375 template <
typename T,
size_t N>
1376 HWY_API Mask128<T, N>
Xor(
const Mask128<T, N> a, Mask128<T, N> b) {
1377 const Simd<T, N, 0>
d;
1391 template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 2)>
1398 test = ShiftLeft<12>(test);
1401 test = ShiftLeft<1>(test);
1405 test = ShiftLeft<1>(test);
1409 test = ShiftLeft<1>(test);
1416 template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 4)>
1423 test = ShiftLeft<27>(test);
1426 test = ShiftLeft<1>(test);
1430 test = ShiftLeft<1>(test);
1434 test = ShiftLeft<1>(test);
1438 test = ShiftLeft<1>(test);
1445 template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 8)>
1448 alignas(16) T lanes[2];
1449 alignas(16) T bits_lanes[2];
1451 Store(bits,
d, bits_lanes);
1452 lanes[0] <<= bits_lanes[0];
1453 lanes[1] <<= bits_lanes[1];
1454 return Load(
d, lanes);
1459 template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 2)>
1466 test = ShiftLeft<12>(test);
1469 test = ShiftLeft<1>(test);
1473 test = ShiftLeft<1>(test);
1477 test = ShiftLeft<1>(test);
1484 template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 4)>
1491 test = ShiftLeft<27>(test);
1494 test = ShiftLeft<1>(test);
1498 test = ShiftLeft<1>(test);
1502 test = ShiftLeft<1>(test);
1506 test = ShiftLeft<1>(test);
1517 template <
typename T>
1519 return Vec128<T>{wasm_v128_load(aligned)};
1522 template <
typename T,
size_t N>
1529 template <
typename T,
size_t N, HWY_IF_LE64(T, N)>
1532 CopyBytes<sizeof(T) * N>(p, &
v);
1537 template <
typename T,
size_t N>
1543 template <
typename T,
size_t N, HWY_IF_LE128(T, N)>
1550 template <
typename T>
1552 wasm_v128_store(aligned,
v.raw);
1556 template <
typename T,
size_t N, HWY_IF_LE64(T, N)>
1558 CopyBytes<sizeof(T) * N>(&
v, p);
1563 *p = wasm_f32x4_extract_lane(
v.raw, 0);
1567 template <
typename T,
size_t N>
1572 template <
typename T,
size_t N>
1582 template <
typename T,
size_t N>
1585 wasm_v128_store(aligned,
v.raw);
1590 template <
typename T,
size_t N,
typename Offset, HWY_IF_LE128(T, N)>
1593 const Vec128<Offset, N> offset) {
1594 static_assert(
sizeof(T) ==
sizeof(Offset),
"Must match for portability");
1596 alignas(16) T lanes[
N];
1599 alignas(16) Offset offset_lanes[
N];
1600 Store(offset,
Rebind<Offset, decltype(
d)>(), offset_lanes);
1602 uint8_t* base_bytes =
reinterpret_cast<uint8_t*
>(base);
1603 for (
size_t i = 0; i <
N; ++i) {
1604 CopyBytes<sizeof(T)>(&lanes[i], base_bytes + offset_lanes[i]);
1608 template <
typename T,
size_t N,
typename Index, HWY_IF_LE128(T, N)>
1610 const Vec128<Index, N> index) {
1611 static_assert(
sizeof(T) ==
sizeof(Index),
"Must match for portability");
1613 alignas(16) T lanes[
N];
1616 alignas(16) Index index_lanes[
N];
1617 Store(index,
Rebind<Index, decltype(
d)>(), index_lanes);
1619 for (
size_t i = 0; i <
N; ++i) {
1620 base[index_lanes[i]] = lanes[i];
1626 template <
typename T,
size_t N,
typename Offset>
1629 const Vec128<Offset, N> offset) {
1630 static_assert(
sizeof(T) ==
sizeof(Offset),
"Must match for portability");
1632 alignas(16) Offset offset_lanes[
N];
1633 Store(offset,
Rebind<Offset, decltype(
d)>(), offset_lanes);
1635 alignas(16) T lanes[
N];
1636 const uint8_t* base_bytes =
reinterpret_cast<const uint8_t*
>(base);
1637 for (
size_t i = 0; i <
N; ++i) {
1638 CopyBytes<sizeof(T)>(base_bytes + offset_lanes[i], &lanes[i]);
1640 return Load(
d, lanes);
1643 template <
typename T,
size_t N,
typename Index>
1646 const Vec128<Index, N> index) {
1647 static_assert(
sizeof(T) ==
sizeof(Index),
"Must match for portability");
1649 alignas(16) Index index_lanes[
N];
1650 Store(index,
Rebind<Index, decltype(
d)>(), index_lanes);
1652 alignas(16) T lanes[
N];
1653 for (
size_t i = 0; i <
N; ++i) {
1654 lanes[i] = base[index_lanes[i]];
1656 return Load(
d, lanes);
1666 return static_cast<uint8_t
>(wasm_i8x16_extract_lane(
v.raw, 0));
1670 return static_cast<int8_t
>(wasm_i8x16_extract_lane(
v.raw, 0));
1674 return static_cast<uint16_t
>(wasm_i16x8_extract_lane(
v.raw, 0));
1678 return static_cast<int16_t
>(wasm_i16x8_extract_lane(
v.raw, 0));
1682 return static_cast<uint32_t
>(wasm_i32x4_extract_lane(
v.raw, 0));
1686 return static_cast<int32_t
>(wasm_i32x4_extract_lane(
v.raw, 0));
1690 return static_cast<uint64_t
>(wasm_i64x2_extract_lane(
v.raw, 0));
1694 return static_cast<int64_t
>(wasm_i64x2_extract_lane(
v.raw, 0));
1699 return wasm_f32x4_extract_lane(
v.raw, 0);
1704 template <
typename T,
size_t N>
1707 return Vec128<T,
N / 2>{
v.raw};
1710 template <
typename T,
size_t N>
1718 template <
int kBytes,
typename T,
size_t N>
1720 static_assert(0 <= kBytes && kBytes <= 16,
"Invalid kBytes");
1721 const __i8x16 zero = wasm_i8x16_splat(0);
1727 return Vec128<T, N>{wasm_i8x16_shuffle(
v.raw, zero, 16, 0, 1, 2, 3, 4, 5,
1728 6, 7, 8, 9, 10, 11, 12, 13, 14)};
1731 return Vec128<T, N>{wasm_i8x16_shuffle(
v.raw, zero, 16, 16, 0, 1, 2, 3, 4,
1732 5, 6, 7, 8, 9, 10, 11, 12, 13)};
1735 return Vec128<T, N>{wasm_i8x16_shuffle(
v.raw, zero, 16, 16, 16, 0, 1, 2,
1736 3, 4, 5, 6, 7, 8, 9, 10, 11, 12)};
1739 return Vec128<T, N>{wasm_i8x16_shuffle(
v.raw, zero, 16, 16, 16, 16, 0, 1,
1740 2, 3, 4, 5, 6, 7, 8, 9, 10, 11)};
1743 return Vec128<T, N>{wasm_i8x16_shuffle(
v.raw, zero, 16, 16, 16, 16, 16, 0,
1744 1, 2, 3, 4, 5, 6, 7, 8, 9, 10)};
1747 return Vec128<T, N>{wasm_i8x16_shuffle(
v.raw, zero, 16, 16, 16, 16, 16,
1748 16, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9)};
1751 return Vec128<T, N>{wasm_i8x16_shuffle(
1752 v.raw, zero, 16, 16, 16, 16, 16, 16, 16, 0, 1, 2, 3, 4, 5, 6, 7, 8)};
1755 return Vec128<T, N>{wasm_i8x16_shuffle(
1756 v.raw, zero, 16, 16, 16, 16, 16, 16, 16, 16, 0, 1, 2, 3, 4, 5, 6, 7)};
1759 return Vec128<T, N>{wasm_i8x16_shuffle(
v.raw, zero, 16, 16, 16, 16, 16,
1760 16, 16, 16, 16, 0, 1, 2, 3, 4, 5,
1764 return Vec128<T, N>{wasm_i8x16_shuffle(
v.raw, zero, 16, 16, 16, 16, 16,
1765 16, 16, 16, 16, 16, 0, 1, 2, 3, 4,
1769 return Vec128<T, N>{wasm_i8x16_shuffle(
v.raw, zero, 16, 16, 16, 16, 16,
1770 16, 16, 16, 16, 16, 16, 0, 1, 2, 3,
1774 return Vec128<T, N>{wasm_i8x16_shuffle(
v.raw, zero, 16, 16, 16, 16, 16,
1775 16, 16, 16, 16, 16, 16, 16, 0, 1,
1779 return Vec128<T, N>{wasm_i8x16_shuffle(
v.raw, zero, 16, 16, 16, 16, 16,
1780 16, 16, 16, 16, 16, 16, 16, 16, 0,
1784 return Vec128<T, N>{wasm_i8x16_shuffle(
v.raw, zero, 16, 16, 16, 16, 16,
1785 16, 16, 16, 16, 16, 16, 16, 16, 16,
1789 return Vec128<T, N>{wasm_i8x16_shuffle(
v.raw, zero, 16, 16, 16, 16, 16,
1790 16, 16, 16, 16, 16, 16, 16, 16, 16,
1793 return Vec128<T, N>{zero};
1796 template <
int kBytes,
typename T,
size_t N>
1798 return ShiftLeftBytes<kBytes>(Simd<T, N, 0>(),
v);
1803 template <
int kLanes,
typename T,
size_t N>
1809 template <
int kLanes,
typename T,
size_t N>
1811 return ShiftLeftLanes<kLanes>(
DFromV<decltype(
v)>(),
v);
1818 template <
int kBytes,
typename T,
size_t N>
1820 static_assert(0 <= kBytes && kBytes <= 16,
"Invalid kBytes");
1821 const __i8x16 zero = wasm_i8x16_splat(0);
1828 return wasm_i8x16_shuffle(
v.raw, zero, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
1829 12, 13, 14, 15, 16);
1832 return wasm_i8x16_shuffle(
v.raw, zero, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
1833 13, 14, 15, 16, 16);
1836 return wasm_i8x16_shuffle(
v.raw, zero, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
1837 13, 14, 15, 16, 16, 16);
1840 return wasm_i8x16_shuffle(
v.raw, zero, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
1841 14, 15, 16, 16, 16, 16);
1844 return wasm_i8x16_shuffle(
v.raw, zero, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
1845 15, 16, 16, 16, 16, 16);
1848 return wasm_i8x16_shuffle(
v.raw, zero, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
1849 16, 16, 16, 16, 16, 16);
1852 return wasm_i8x16_shuffle(
v.raw, zero, 7, 8, 9, 10, 11, 12, 13, 14, 15,
1853 16, 16, 16, 16, 16, 16, 16);
1856 return wasm_i8x16_shuffle(
v.raw, zero, 8, 9, 10, 11, 12, 13, 14, 15, 16,
1857 16, 16, 16, 16, 16, 16, 16);
1860 return wasm_i8x16_shuffle(
v.raw, zero, 9, 10, 11, 12, 13, 14, 15, 16, 16,
1861 16, 16, 16, 16, 16, 16, 16);
1864 return wasm_i8x16_shuffle(
v.raw, zero, 10, 11, 12, 13, 14, 15, 16, 16, 16,
1865 16, 16, 16, 16, 16, 16, 16);
1868 return wasm_i8x16_shuffle(
v.raw, zero, 11, 12, 13, 14, 15, 16, 16, 16, 16,
1869 16, 16, 16, 16, 16, 16, 16);
1872 return wasm_i8x16_shuffle(
v.raw, zero, 12, 13, 14, 15, 16, 16, 16, 16, 16,
1873 16, 16, 16, 16, 16, 16, 16);
1876 return wasm_i8x16_shuffle(
v.raw, zero, 13, 14, 15, 16, 16, 16, 16, 16, 16,
1877 16, 16, 16, 16, 16, 16, 16);
1880 return wasm_i8x16_shuffle(
v.raw, zero, 14, 15, 16, 16, 16, 16, 16, 16, 16,
1881 16, 16, 16, 16, 16, 16, 16);
1884 return wasm_i8x16_shuffle(
v.raw, zero, 15, 16, 16, 16, 16, 16, 16, 16, 16,
1885 16, 16, 16, 16, 16, 16, 16);
1894 template <
int kBytes,
typename T,
size_t N>
1897 if (
N != 16 /
sizeof(T)) {
1898 const Vec128<T> vfull{
v.raw};
1901 return Vec128<T, N>{detail::ShrBytes<kBytes>(
v)};
1905 template <
int kLanes,
typename T,
size_t N>
1914 template <
typename T>
1916 return Vec64<T>{wasm_i32x4_shuffle(
v.raw,
v.raw, 2, 3, 2, 3)};
1919 return Vec64<float>{wasm_i32x4_shuffle(
v.raw,
v.raw, 2, 3, 2, 3)};
1923 template <
typename T,
size_t N, HWY_IF_LE64(T, N)>
1930 return Vec128<T, (
N + 1) / 2>{upper.raw};
1935 template <
int kBytes,
typename T,
class V = Vec128<T>>
1937 static_assert(0 <= kBytes && kBytes <= 16,
"Invalid kBytes");
1943 return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
1944 11, 12, 13, 14, 15, 16)};
1947 return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 2, 3, 4, 5, 6, 7, 8, 9, 10,
1948 11, 12, 13, 14, 15, 16, 17)};
1951 return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 3, 4, 5, 6, 7, 8, 9, 10, 11,
1952 12, 13, 14, 15, 16, 17, 18)};
1955 return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 4, 5, 6, 7, 8, 9, 10, 11, 12,
1956 13, 14, 15, 16, 17, 18, 19)};
1959 return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 5, 6, 7, 8, 9, 10, 11, 12, 13,
1960 14, 15, 16, 17, 18, 19, 20)};
1963 return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 6, 7, 8, 9, 10, 11, 12, 13,
1964 14, 15, 16, 17, 18, 19, 20, 21)};
1967 return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 7, 8, 9, 10, 11, 12, 13, 14,
1968 15, 16, 17, 18, 19, 20, 21, 22)};
1971 return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 8, 9, 10, 11, 12, 13, 14, 15,
1972 16, 17, 18, 19, 20, 21, 22, 23)};
1975 return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 9, 10, 11, 12, 13, 14, 15, 16,
1976 17, 18, 19, 20, 21, 22, 23, 24)};
1979 return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 10, 11, 12, 13, 14, 15, 16,
1980 17, 18, 19, 20, 21, 22, 23, 24, 25)};
1983 return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 11, 12, 13, 14, 15, 16, 17,
1984 18, 19, 20, 21, 22, 23, 24, 25, 26)};
1987 return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 12, 13, 14, 15, 16, 17, 18,
1988 19, 20, 21, 22, 23, 24, 25, 26, 27)};
1991 return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 13, 14, 15, 16, 17, 18, 19,
1992 20, 21, 22, 23, 24, 25, 26, 27, 28)};
1995 return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 14, 15, 16, 17, 18, 19, 20,
1996 21, 22, 23, 24, 25, 26, 27, 28, 29)};
1999 return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 15, 16, 17, 18, 19, 20, 21,
2000 22, 23, 24, 25, 26, 27, 28, 29, 30)};
2005 template <
int kBytes,
typename T,
size_t N,
HWY_IF_LE64(T,
N),
2006 class V = Vec128<T, N>>
2008 constexpr
size_t kSize =
N *
sizeof(T);
2009 static_assert(0 < kBytes && kBytes < kSize,
"kBytes invalid");
2012 using V8 =
VFromD<decltype(d_full8)>;
2013 const V8 hi8{
BitCast(d8, hi).raw};
2022 template <
int kLane,
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 2)>
2024 static_assert(0 <= kLane && kLane <
N,
"Invalid lane");
2025 return Vec128<T, N>{wasm_i16x8_shuffle(
v.raw,
v.raw, kLane, kLane, kLane,
2026 kLane, kLane, kLane, kLane, kLane)};
2029 template <
int kLane,
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 4)>
2031 static_assert(0 <= kLane && kLane <
N,
"Invalid lane");
2032 return Vec128<T, N>{
2033 wasm_i32x4_shuffle(
v.raw,
v.raw, kLane, kLane, kLane, kLane)};
2036 template <
int kLane,
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 8)>
2038 static_assert(0 <= kLane && kLane <
N,
"Invalid lane");
2039 return Vec128<T, N>{wasm_i64x2_shuffle(
v.raw,
v.raw, kLane, kLane)};
2046 template <
typename T,
size_t N,
typename TI,
size_t NI>
2056 alignas(16) uint8_t control[16];
2057 alignas(16) uint8_t input[16];
2058 alignas(16) uint8_t output[16];
2059 wasm_v128_store(control, from.
raw);
2060 wasm_v128_store(input, bytes.
raw);
2061 for (
size_t i = 0; i < 16; ++i) {
2062 output[i] = control[i] < 16 ? input[control[i]] : 0;
2068 template <
typename T,
size_t N,
typename TI,
size_t NI>
2089 template <
typename T,
size_t N>
2091 static_assert(
sizeof(T) == 4,
"Only for 32-bit lanes");
2092 static_assert(
N == 2 ||
N == 4,
"Does not make sense for N=1");
2093 return Vec128<T, N>{wasm_i32x4_shuffle(
v.raw,
v.raw, 1, 0, 3, 2)};
2097 template <
typename T>
2099 static_assert(
sizeof(T) == 8,
"Only for 64-bit lanes");
2100 return Vec128<T>{wasm_i64x2_shuffle(
v.raw,
v.raw, 1, 0)};
2102 template <
typename T>
2104 static_assert(
sizeof(T) == 4,
"Only for 32-bit lanes");
2105 return Vec128<T>{wasm_i64x2_shuffle(
v.raw,
v.raw, 1, 0)};
2109 template <
typename T>
2111 static_assert(
sizeof(T) == 4,
"Only for 32-bit lanes");
2112 return Vec128<T>{wasm_i32x4_shuffle(
v.raw,
v.raw, 1, 2, 3, 0)};
2116 template <
typename T>
2118 static_assert(
sizeof(T) == 4,
"Only for 32-bit lanes");
2119 return Vec128<T>{wasm_i32x4_shuffle(
v.raw,
v.raw, 3, 0, 1, 2)};
2123 template <
typename T>
2125 static_assert(
sizeof(T) == 4,
"Only for 32-bit lanes");
2126 return Vec128<T>{wasm_i32x4_shuffle(
v.raw,
v.raw, 3, 2, 1, 0)};
2132 template <
typename T,
size_t N>
2137 template <
typename T,
size_t N,
typename TI, HWY_IF_LE128(T, N)>
2139 static_assert(
sizeof(T) ==
sizeof(TI),
"Index size must match lane");
2140 #if HWY_IS_DEBUG_BUILD
2141 const Rebind<TI, decltype(
d)> di;
2147 using V8 =
VFromD<decltype(d8)>;
2151 static_assert(
sizeof(T) == 4 ||
sizeof(T) == 8,
"");
2152 if (
sizeof(T) == 4) {
2153 alignas(16) constexpr uint8_t kBroadcastLaneBytes[16] = {
2154 0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12};
2155 const V8 lane_indices =
2157 const V8 byte_indices =
2159 alignas(16) constexpr uint8_t kByteOffsets[16] = {0, 1, 2, 3, 0, 1, 2, 3,
2160 0, 1, 2, 3, 0, 1, 2, 3};
2163 alignas(16) constexpr uint8_t kBroadcastLaneBytes[16] = {
2164 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8};
2165 const V8 lane_indices =
2167 const V8 byte_indices =
2169 alignas(16) constexpr uint8_t kByteOffsets[16] = {0, 1, 2, 3, 4, 5, 6, 7,
2170 0, 1, 2, 3, 4, 5, 6, 7};
2171 return Indices128<T, N>{
Add(byte_indices,
Load(d8, kByteOffsets)).raw};
2175 template <
typename T,
size_t N,
typename TI, HWY_IF_LE128(T, N)>
2177 const Rebind<TI, decltype(
d)> di;
2181 template <
typename T,
size_t N>
2183 using TI = MakeSigned<T>;
2185 const Rebind<TI, decltype(
d)> di;
2192 template <
typename T>
2198 template <
typename T, HWY_IF_LANE_SIZE(T, 4)>
2200 return Vec128<T, 2>{
Shuffle2301(Vec128<T>{
v.raw}).raw};
2203 template <
typename T, HWY_IF_LANE_SIZE(T, 8)>
2209 template <
typename T, HWY_IF_LANE_SIZE(T, 4)>
2215 template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 2)>
2223 template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 2)>
2229 template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 4)>
2234 template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 8)>
2241 template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 2)>
2243 return BitCast(
d, Vec128<uint16_t, N>{wasm_i16x8_shuffle(
v.raw,
v.raw, 3, 2,
2244 1, 0, 7, 6, 5, 4)});
2247 template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 4)>
2252 template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 8)>
2259 template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 2)>
2264 template <
typename T,
size_t N, HWY_IF_NOT_LANE_SIZE(T, 2)>
2275 a.
raw, b.
raw, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23)};
2281 wasm_i16x8_shuffle(a.
raw, b.
raw, 0, 8, 1, 9, 2, 10, 3, 11)};
2298 a.
raw, b.
raw, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23)};
2304 wasm_i16x8_shuffle(a.
raw, b.
raw, 0, 8, 1, 9, 2, 10, 3, 11)};
2338 26, 11, 27, 12, 28, 13, 29, 14,
2345 wasm_i16x8_shuffle(a.
raw, b.
raw, 4, 12, 5, 13, 6, 14, 7, 15)};
2362 26, 11, 27, 12, 28, 13, 29, 14,
2369 wasm_i16x8_shuffle(a.
raw, b.
raw, 4, 12, 5, 13, 6, 14, 7, 15)};
2391 template <
typename T,
class V = Vec128<T>>
2397 template <
typename T,
size_t N, HWY_IF_LE64(T, N),
class V = Vec128<T, N>>
2399 const Half<decltype(
d)> d2;
2407 template <
class V,
class DW = RepartitionToW
ide<DFromV<V>>>
2411 template <
class V,
class D = DFromV<V>,
class DW = RepartitionToW
ide<D>>
2416 template <
class V,
class D = DFromV<V>,
class DW = RepartitionToW
ide<D>>
2426 template <
typename T,
size_t N>
2429 const Half<decltype(
d)> d2;
2433 const VU lo{
BitCast(du2, lo_half).raw};
2434 const VU hi{
BitCast(du2, hi_half).raw};
2440 template <
typename T,
size_t N>
2448 template <
typename T>
2453 template <
typename T,
size_t N, HWY_IF_LE64(T, N)>
2455 const Vec128<T, N> lo) {
2456 const Half<decltype(
d)> d2;
2462 template <
typename T>
2467 template <
typename T,
size_t N, HWY_IF_LE64(T, N)>
2469 const Vec128<T, N> lo) {
2470 const Half<decltype(
d)> d2;
2476 template <
typename T>
2479 return CombineShiftRightBytes<8>(
d, hi, lo);
2481 template <
typename T,
size_t N, HWY_IF_LE64(T, N)>
2483 const Vec128<T, N> lo) {
2484 const Half<decltype(
d)> d2;
2489 template <
typename T,
size_t N>
2491 const Vec128<T, N> lo) {
2498 template <
typename T, HWY_IF_LANE_SIZE(T, 4)>
2500 return Vec128<T>{wasm_i32x4_shuffle(lo.raw, hi.raw, 1, 3, 5, 7)};
2504 template <
typename T, HWY_IF_LANE_SIZE(T, 4)>
2512 template <
typename T, HWY_IF_LANE_SIZE(T, 8)>
2520 template <
typename T, HWY_IF_LANE_SIZE(T, 4)>
2522 return Vec128<T>{wasm_i32x4_shuffle(lo.raw, hi.raw, 0, 2, 4, 6)};
2526 template <
typename T, HWY_IF_LANE_SIZE(T, 4)>
2534 template <
typename T, HWY_IF_LANE_SIZE(T, 8)>
2541 template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 4)>
2543 return Vec128<T, N>{wasm_i32x4_shuffle(
v.raw,
v.raw, 0, 0, 2, 2)};
2546 template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 8)>
2553 template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 4)>
2555 return Vec128<T, N>{wasm_i32x4_shuffle(
v.raw,
v.raw, 1, 1, 3, 3)};
2558 template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 8)>
2567 template <
typename T,
size_t N>
2572 alignas(16) constexpr uint8_t mask[16] = {0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0,
2573 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0};
2576 template <
typename T,
size_t N>
2580 wasm_i16x8_shuffle(a.
raw, b.
raw, 8, 1, 10, 3, 12, 5, 14, 7)};
2582 template <
typename T,
size_t N>
2587 template <
typename T,
size_t N>
2595 template <
typename T,
size_t N>
2596 HWY_API Vec128<T, N>
OddEven(
const Vec128<T, N> a,
const Vec128<T, N> b) {
2606 template <
typename T,
size_t N>
2613 template <
typename T,
size_t N>
2621 template <
typename T>
2640 wasm_u32x4_extend_low_u16x8(wasm_u16x8_extend_low_u8x16(
v.raw))};
2651 wasm_u32x4_extend_low_u16x8(wasm_u16x8_extend_low_u8x16(
v.raw))};
2655 const Vec128<uint16_t, N>
v) {
2656 return Vec128<uint32_t, N>{wasm_u32x4_extend_low_u16x8(
v.raw)};
2673 const Vec128<int8_t, N>
v) {
2674 return Vec128<int16_t, N>{wasm_i16x8_extend_low_i8x16(
v.raw)};
2678 const Vec128<int8_t, N>
v) {
2679 return Vec128<int32_t, N>{
2680 wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(
v.raw))};
2684 const Vec128<int16_t, N>
v) {
2685 return Vec128<int32_t, N>{wasm_i32x4_extend_low_i16x8(
v.raw)};
2689 const Vec128<int32_t, N>
v) {
2690 return Vec128<int64_t, N>{wasm_i64x2_extend_low_i32x4(
v.raw)};
2701 const Vec128<float16_t, N>
v) {
2705 const auto bits16 =
PromoteTo(du32, Vec128<uint16_t, N>{
v.raw});
2706 const auto sign = ShiftRight<15>(bits16);
2707 const auto biased_exp = ShiftRight<10>(bits16) &
Set(du32, 0x1F);
2708 const auto mantissa = bits16 &
Set(du32, 0x3FF);
2709 const auto subnormal =
2711 Set(df32, 1.0f / 16384 / 1024));
2713 const auto biased_exp32 = biased_exp +
Set(du32, 127 - 15);
2714 const auto mantissa32 =
ShiftLeft<23 - 10>(mantissa);
2715 const auto normal = ShiftLeft<23>(biased_exp32) | mantissa32;
2716 const auto bits32 =
IfThenElse(biased_exp ==
Zero(du32), subnormal, normal);
2717 return BitCast(df32, ShiftLeft<31>(sign) | bits32);
2722 const Vec128<bfloat16_t, N>
v) {
2723 const Rebind<uint16_t, decltype(df32)> du16;
2745 const auto intermediate = wasm_i16x8_narrow_i32x4(
v.raw,
v.raw);
2747 wasm_u8x16_narrow_i16x8(intermediate, intermediate)};
2759 const auto intermediate = wasm_i16x8_narrow_i32x4(
v.raw,
v.raw);
2777 const Vec128<float, N>
v) {
2779 const Rebind<uint32_t, decltype(du16)> du;
2781 const auto bits32 =
BitCast(du,
v);
2782 const auto sign = ShiftRight<31>(bits32);
2783 const auto biased_exp32 = ShiftRight<23>(bits32) &
Set(du, 0xFF);
2784 const auto mantissa32 = bits32 &
Set(du, 0x7FFFFF);
2786 const auto k15 =
Set(di, 15);
2787 const auto exp =
Min(
BitCast(di, biased_exp32) -
Set(di, 127), k15);
2788 const auto is_tiny = exp <
Set(di, -24);
2790 const auto is_subnormal = exp <
Set(di, -14);
2791 const auto biased_exp16 =
2793 const auto sub_exp =
BitCast(du,
Set(di, -14) - exp);
2794 const auto sub_m = (
Set(du, 1) << (
Set(du, 10) - sub_exp)) +
2795 (mantissa32 >> (
Set(du, 13) + sub_exp));
2797 ShiftRight<13>(mantissa32));
2799 const auto sign16 = ShiftLeft<15>(sign);
2800 const auto normal16 = sign16 | ShiftLeft<10>(biased_exp16) | mantissa16;
2802 return Vec128<float16_t, N>{
DemoteTo(du16, bits16).raw};
2807 const Vec128<float, N>
v) {
2808 const Rebind<int32_t, decltype(dbf16)> di32;
2809 const Rebind<uint32_t, decltype(dbf16)> du32;
2810 const Rebind<uint16_t, decltype(dbf16)> du16;
2811 const auto bits_in_32 =
BitCast(di32, ShiftRight<16>(
BitCast(du32,
v)));
2817 Simd<bfloat16_t, 2 * N, 0> dbf16, Vec128<float, N> a, Vec128<float, N> b) {
2819 const Repartition<uint32_t, decltype(dbf16)> du32;
2820 const Vec128<uint32_t, N> b_in_even = ShiftRight<16>(
BitCast(du32, b));
2827 const auto intermediate = wasm_i16x8_narrow_i32x4(
v.raw,
v.raw);
2829 wasm_u8x16_narrow_i16x8(intermediate, intermediate)};
2856 const DFromV<decltype(
v)> du8;
2860 using VU16 =
VFromD<decltype(du16)>;
2862 const VU16 vFDB97531 = ShiftRight<8>(
BitCast(du16,
v));
2864 const VU16 sFE_DC_BA_98_76_54_32_10 =
Add(vFDB97531, vECA86420);
2866 const VU16 szz_FE_zz_BA_zz_76_zz_32 =
2867 BitCast(du16, ShiftRight<16>(
BitCast(du32, sFE_DC_BA_98_76_54_32_10)));
2868 const VU16 sxx_FC_xx_B8_xx_74_xx_30 =
2869 Add(sFE_DC_BA_98_76_54_32_10, szz_FE_zz_BA_zz_76_zz_32);
2870 const VU16 szz_zz_xx_FC_zz_zz_xx_74 =
2871 BitCast(du16, ShiftRight<32>(
BitCast(du64, sxx_FC_xx_B8_xx_74_xx_30)));
2872 const VU16 sxx_xx_xx_F8_xx_xx_xx_70 =
2873 Add(sxx_FC_xx_B8_xx_74_xx_30, szz_zz_xx_FC_zz_zz_xx_74);
2874 return And(
BitCast(du64, sxx_xx_xx_F8_xx_xx_xx_70),
Set(du64, 0xFFFF));
2881 template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 1)>
2886 const Vec128<T, N> vbits{wasm_i32x4_splat(
static_cast<int32_t
>(bits))};
2889 alignas(16) constexpr uint8_t kRep8[16] = {0, 0, 0, 0, 0, 0, 0, 0,
2890 1, 1, 1, 1, 1, 1, 1, 1};
2893 alignas(16) constexpr uint8_t kBit[16] = {1, 2, 4, 8, 16, 32, 64, 128,
2894 1, 2, 4, 8, 16, 32, 64, 128};
2898 template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 2)>
2901 alignas(16) constexpr uint16_t kBit[8] = {1, 2, 4, 8, 16, 32, 64, 128};
2906 template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 4)>
2909 alignas(16) constexpr uint32_t kBit[8] = {1, 2, 4, 8};
2914 template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 8)>
2917 alignas(16) constexpr uint64_t kBit[8] = {1, 2};
2924 template <
typename T,
size_t N, HWY_IF_LE128(T, N)>
2927 uint64_t mask_bits = 0;
2937 template <
typename T>
2939 const Mask128<T> mask) {
2940 alignas(16) uint64_t lanes[2];
2941 wasm_v128_store(lanes, mask.raw);
2943 constexpr uint64_t kMagic = 0x103070F1F3F80ULL;
2944 const uint64_t lo = ((lanes[0] * kMagic) >> 56);
2945 const uint64_t hi = ((lanes[1] * kMagic) >> 48) & 0xFF00;
2950 template <
typename T>
2953 constexpr uint64_t kMagic = 0x103070F1F3F80ULL;
2954 return (
static_cast<uint64_t
>(wasm_i64x2_extract_lane(mask.
raw, 0)) *
2960 template <
typename T,
size_t N, HWY_IF_LE32(T, N)>
2963 uint64_t bytes =
static_cast<uint64_t
>(wasm_i64x2_extract_lane(mask.
raw, 0));
2965 bytes &= (1ULL << (
N * 8)) - 1;
2966 constexpr uint64_t kMagic = 0x103070F1F3F80ULL;
2967 return (bytes * kMagic) >> 56;
2970 template <
typename T,
size_t N>
2974 const __i16x8 zero = wasm_i16x8_splat(0);
2979 template <
typename T,
size_t N>
2982 const __i32x4 mask_i =
static_cast<__i32x4
>(mask.
raw);
2983 const __i32x4 slice = wasm_i32x4_make(1, 2, 4, 8);
2984 const __i32x4 sliced_mask = wasm_v128_and(mask_i, slice);
2985 alignas(16) uint32_t lanes[4];
2986 wasm_v128_store(lanes, sliced_mask);
2987 return lanes[0] | lanes[1] | lanes[2] | lanes[3];
2990 template <
typename T,
size_t N>
2993 const __i64x2 mask_i =
static_cast<__i64x2
>(mask.
raw);
2994 const __i64x2 slice = wasm_i64x2_make(1, 2);
2995 const __i64x2 sliced_mask = wasm_v128_and(mask_i, slice);
2996 alignas(16) uint64_t lanes[2];
2997 wasm_v128_store(lanes, sliced_mask);
2998 return lanes[0] | lanes[1];
3002 template <
typename T,
size_t N>
3003 constexpr uint64_t
OnlyActive(uint64_t bits) {
3004 return ((
N *
sizeof(T)) == 16) ? bits : bits & ((1ull <<
N) - 1);
3011 (
N == 0) ? wasm_i32x4_make(-1, -1, -1, -1)
3012 : (
N == 4) ? wasm_i32x4_make(0, -1, -1, -1)
3013 : (
N == 8) ? wasm_i32x4_make(0, 0, -1, -1)
3014 : (
N == 12) ? wasm_i32x4_make(0, 0, 0, -1)
3015 : (
N == 16) ? wasm_i32x4_make(0, 0, 0, 0)
3016 : (
N == 2) ? wasm_i16x8_make(0, -1, -1, -1, -1, -1, -1, -1)
3017 : (
N == 6) ? wasm_i16x8_make(0, 0, 0, -1, -1, -1, -1, -1)
3018 : (
N == 10) ? wasm_i16x8_make(0, 0, 0, 0, 0, -1, -1, -1)
3019 : (
N == 14) ? wasm_i16x8_make(0, 0, 0, 0, 0, 0, 0, -1)
3020 : (
N == 1) ? wasm_i8x16_make(0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
3022 : (
N == 3) ? wasm_i8x16_make(0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1,
3024 : (
N == 5) ? wasm_i8x16_make(0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1,
3026 : (
N == 7) ? wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1,
3028 : (
N == 9) ? wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1,
3031 ? wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1)
3033 ? wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1)
3034 : wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1);
3037 template <
typename T,
size_t N>
3042 template <
typename T>
3047 template <
typename T>
3052 template <
typename T>
3054 const __i32x4 var_shift = wasm_i32x4_make(1, 2, 4, 8);
3055 const __i32x4 shifted_bits = wasm_v128_and(m.raw, var_shift);
3056 alignas(16) uint64_t lanes[2];
3057 wasm_v128_store(lanes, shifted_bits);
3058 return PopCount(lanes[0] | lanes[1]);
3061 template <
typename T>
3063 alignas(16) int64_t lanes[2];
3064 wasm_v128_store(lanes, m.raw);
3065 return static_cast<size_t>(-(lanes[0] + lanes[1]));
3071 template <
typename T,
size_t N>
3073 const Mask128<T, N> mask, uint8_t* bits) {
3075 const size_t kNumBytes = (
N + 7) / 8;
3076 CopyBytes<kNumBytes>(&mask_bits, bits);
3080 template <
typename T,
size_t N>
3086 template <
typename T,
size_t N, HWY_IF_LE64(T, N)>
3089 const Mask128<T, N> mask{detail::BytesAbove<N * sizeof(T)>()};
3094 template <
typename T>
3100 return !wasm_i8x16_any_true(v8.raw);
3103 return (wasm_i64x2_extract_lane(m.raw, 0) |
3104 wasm_i64x2_extract_lane(m.raw, 1)) == 0;
3110 template <
typename T>
3112 return wasm_i8x16_all_true(m.
raw);
3114 template <
typename T>
3116 return wasm_i16x8_all_true(m.
raw);
3118 template <
typename T>
3120 return wasm_i32x4_all_true(m.
raw);
3122 template <
typename T>
3124 return wasm_i64x2_all_true(m.
raw);
3129 template <
typename T,
size_t N>
3136 template <
typename T,
size_t N, HWY_IF_LE64(T, N)>
3139 const Mask128<T, N> mask{detail::BytesAbove<N * sizeof(T)>()};
3143 template <
typename T,
size_t N, HWY_IF_LE64(T, N)>
3146 const Mask128<T, N> mask{detail::BytesAbove<N * sizeof(T)>()};
3150 template <
typename T,
size_t N>
3152 const Mask128<T, N> mask) {
3161 template <
typename T,
size_t N>
3165 const Rebind<uint8_t, decltype(
d)> d8;
3173 alignas(16) constexpr uint8_t table[256 * 8] = {
3174 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14,
3175 2, 0, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14,
3176 4, 0, 2, 6, 8, 10, 12, 14, 0, 4, 2, 6, 8, 10, 12, 14,
3177 2, 4, 0, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14,
3178 6, 0, 2, 4, 8, 10, 12, 14, 0, 6, 2, 4, 8, 10, 12, 14,
3179 2, 6, 0, 4, 8, 10, 12, 14, 0, 2, 6, 4, 8, 10, 12, 14,
3180 4, 6, 0, 2, 8, 10, 12, 14, 0, 4, 6, 2, 8, 10, 12, 14,
3181 2, 4, 6, 0, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14,
3182 8, 0, 2, 4, 6, 10, 12, 14, 0, 8, 2, 4, 6, 10, 12, 14,
3183 2, 8, 0, 4, 6, 10, 12, 14, 0, 2, 8, 4, 6, 10, 12, 14,
3184 4, 8, 0, 2, 6, 10, 12, 14, 0, 4, 8, 2, 6, 10, 12, 14,
3185 2, 4, 8, 0, 6, 10, 12, 14, 0, 2, 4, 8, 6, 10, 12, 14,
3186 6, 8, 0, 2, 4, 10, 12, 14, 0, 6, 8, 2, 4, 10, 12, 14,
3187 2, 6, 8, 0, 4, 10, 12, 14, 0, 2, 6, 8, 4, 10, 12, 14,
3188 4, 6, 8, 0, 2, 10, 12, 14, 0, 4, 6, 8, 2, 10, 12, 14,
3189 2, 4, 6, 8, 0, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14,
3190 10, 0, 2, 4, 6, 8, 12, 14, 0, 10, 2, 4, 6, 8, 12, 14,
3191 2, 10, 0, 4, 6, 8, 12, 14, 0, 2, 10, 4, 6, 8, 12, 14,
3192 4, 10, 0, 2, 6, 8, 12, 14, 0, 4, 10, 2, 6, 8, 12, 14,
3193 2, 4, 10, 0, 6, 8, 12, 14, 0, 2, 4, 10, 6, 8, 12, 14,
3194 6, 10, 0, 2, 4, 8, 12, 14, 0, 6, 10, 2, 4, 8, 12, 14,
3195 2, 6, 10, 0, 4, 8, 12, 14, 0, 2, 6, 10, 4, 8, 12, 14,
3196 4, 6, 10, 0, 2, 8, 12, 14, 0, 4, 6, 10, 2, 8, 12, 14,
3197 2, 4, 6, 10, 0, 8, 12, 14, 0, 2, 4, 6, 10, 8, 12, 14,
3198 8, 10, 0, 2, 4, 6, 12, 14, 0, 8, 10, 2, 4, 6, 12, 14,
3199 2, 8, 10, 0, 4, 6, 12, 14, 0, 2, 8, 10, 4, 6, 12, 14,
3200 4, 8, 10, 0, 2, 6, 12, 14, 0, 4, 8, 10, 2, 6, 12, 14,
3201 2, 4, 8, 10, 0, 6, 12, 14, 0, 2, 4, 8, 10, 6, 12, 14,
3202 6, 8, 10, 0, 2, 4, 12, 14, 0, 6, 8, 10, 2, 4, 12, 14,
3203 2, 6, 8, 10, 0, 4, 12, 14, 0, 2, 6, 8, 10, 4, 12, 14,
3204 4, 6, 8, 10, 0, 2, 12, 14, 0, 4, 6, 8, 10, 2, 12, 14,
3205 2, 4, 6, 8, 10, 0, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14,
3206 12, 0, 2, 4, 6, 8, 10, 14, 0, 12, 2, 4, 6, 8, 10, 14,
3207 2, 12, 0, 4, 6, 8, 10, 14, 0, 2, 12, 4, 6, 8, 10, 14,
3208 4, 12, 0, 2, 6, 8, 10, 14, 0, 4, 12, 2, 6, 8, 10, 14,
3209 2, 4, 12, 0, 6, 8, 10, 14, 0, 2, 4, 12, 6, 8, 10, 14,
3210 6, 12, 0, 2, 4, 8, 10, 14, 0, 6, 12, 2, 4, 8, 10, 14,
3211 2, 6, 12, 0, 4, 8, 10, 14, 0, 2, 6, 12, 4, 8, 10, 14,
3212 4, 6, 12, 0, 2, 8, 10, 14, 0, 4, 6, 12, 2, 8, 10, 14,
3213 2, 4, 6, 12, 0, 8, 10, 14, 0, 2, 4, 6, 12, 8, 10, 14,
3214 8, 12, 0, 2, 4, 6, 10, 14, 0, 8, 12, 2, 4, 6, 10, 14,
3215 2, 8, 12, 0, 4, 6, 10, 14, 0, 2, 8, 12, 4, 6, 10, 14,
3216 4, 8, 12, 0, 2, 6, 10, 14, 0, 4, 8, 12, 2, 6, 10, 14,
3217 2, 4, 8, 12, 0, 6, 10, 14, 0, 2, 4, 8, 12, 6, 10, 14,
3218 6, 8, 12, 0, 2, 4, 10, 14, 0, 6, 8, 12, 2, 4, 10, 14,
3219 2, 6, 8, 12, 0, 4, 10, 14, 0, 2, 6, 8, 12, 4, 10, 14,
3220 4, 6, 8, 12, 0, 2, 10, 14, 0, 4, 6, 8, 12, 2, 10, 14,
3221 2, 4, 6, 8, 12, 0, 10, 14, 0, 2, 4, 6, 8, 12, 10, 14,
3222 10, 12, 0, 2, 4, 6, 8, 14, 0, 10, 12, 2, 4, 6, 8, 14,
3223 2, 10, 12, 0, 4, 6, 8, 14, 0, 2, 10, 12, 4, 6, 8, 14,
3224 4, 10, 12, 0, 2, 6, 8, 14, 0, 4, 10, 12, 2, 6, 8, 14,
3225 2, 4, 10, 12, 0, 6, 8, 14, 0, 2, 4, 10, 12, 6, 8, 14,
3226 6, 10, 12, 0, 2, 4, 8, 14, 0, 6, 10, 12, 2, 4, 8, 14,
3227 2, 6, 10, 12, 0, 4, 8, 14, 0, 2, 6, 10, 12, 4, 8, 14,
3228 4, 6, 10, 12, 0, 2, 8, 14, 0, 4, 6, 10, 12, 2, 8, 14,
3229 2, 4, 6, 10, 12, 0, 8, 14, 0, 2, 4, 6, 10, 12, 8, 14,
3230 8, 10, 12, 0, 2, 4, 6, 14, 0, 8, 10, 12, 2, 4, 6, 14,
3231 2, 8, 10, 12, 0, 4, 6, 14, 0, 2, 8, 10, 12, 4, 6, 14,
3232 4, 8, 10, 12, 0, 2, 6, 14, 0, 4, 8, 10, 12, 2, 6, 14,
3233 2, 4, 8, 10, 12, 0, 6, 14, 0, 2, 4, 8, 10, 12, 6, 14,
3234 6, 8, 10, 12, 0, 2, 4, 14, 0, 6, 8, 10, 12, 2, 4, 14,
3235 2, 6, 8, 10, 12, 0, 4, 14, 0, 2, 6, 8, 10, 12, 4, 14,
3236 4, 6, 8, 10, 12, 0, 2, 14, 0, 4, 6, 8, 10, 12, 2, 14,
3237 2, 4, 6, 8, 10, 12, 0, 14, 0, 2, 4, 6, 8, 10, 12, 14,
3238 14, 0, 2, 4, 6, 8, 10, 12, 0, 14, 2, 4, 6, 8, 10, 12,
3239 2, 14, 0, 4, 6, 8, 10, 12, 0, 2, 14, 4, 6, 8, 10, 12,
3240 4, 14, 0, 2, 6, 8, 10, 12, 0, 4, 14, 2, 6, 8, 10, 12,
3241 2, 4, 14, 0, 6, 8, 10, 12, 0, 2, 4, 14, 6, 8, 10, 12,
3242 6, 14, 0, 2, 4, 8, 10, 12, 0, 6, 14, 2, 4, 8, 10, 12,
3243 2, 6, 14, 0, 4, 8, 10, 12, 0, 2, 6, 14, 4, 8, 10, 12,
3244 4, 6, 14, 0, 2, 8, 10, 12, 0, 4, 6, 14, 2, 8, 10, 12,
3245 2, 4, 6, 14, 0, 8, 10, 12, 0, 2, 4, 6, 14, 8, 10, 12,
3246 8, 14, 0, 2, 4, 6, 10, 12, 0, 8, 14, 2, 4, 6, 10, 12,
3247 2, 8, 14, 0, 4, 6, 10, 12, 0, 2, 8, 14, 4, 6, 10, 12,
3248 4, 8, 14, 0, 2, 6, 10, 12, 0, 4, 8, 14, 2, 6, 10, 12,
3249 2, 4, 8, 14, 0, 6, 10, 12, 0, 2, 4, 8, 14, 6, 10, 12,
3250 6, 8, 14, 0, 2, 4, 10, 12, 0, 6, 8, 14, 2, 4, 10, 12,
3251 2, 6, 8, 14, 0, 4, 10, 12, 0, 2, 6, 8, 14, 4, 10, 12,
3252 4, 6, 8, 14, 0, 2, 10, 12, 0, 4, 6, 8, 14, 2, 10, 12,
3253 2, 4, 6, 8, 14, 0, 10, 12, 0, 2, 4, 6, 8, 14, 10, 12,
3254 10, 14, 0, 2, 4, 6, 8, 12, 0, 10, 14, 2, 4, 6, 8, 12,
3255 2, 10, 14, 0, 4, 6, 8, 12, 0, 2, 10, 14, 4, 6, 8, 12,
3256 4, 10, 14, 0, 2, 6, 8, 12, 0, 4, 10, 14, 2, 6, 8, 12,
3257 2, 4, 10, 14, 0, 6, 8, 12, 0, 2, 4, 10, 14, 6, 8, 12,
3258 6, 10, 14, 0, 2, 4, 8, 12, 0, 6, 10, 14, 2, 4, 8, 12,
3259 2, 6, 10, 14, 0, 4, 8, 12, 0, 2, 6, 10, 14, 4, 8, 12,
3260 4, 6, 10, 14, 0, 2, 8, 12, 0, 4, 6, 10, 14, 2, 8, 12,
3261 2, 4, 6, 10, 14, 0, 8, 12, 0, 2, 4, 6, 10, 14, 8, 12,
3262 8, 10, 14, 0, 2, 4, 6, 12, 0, 8, 10, 14, 2, 4, 6, 12,
3263 2, 8, 10, 14, 0, 4, 6, 12, 0, 2, 8, 10, 14, 4, 6, 12,
3264 4, 8, 10, 14, 0, 2, 6, 12, 0, 4, 8, 10, 14, 2, 6, 12,
3265 2, 4, 8, 10, 14, 0, 6, 12, 0, 2, 4, 8, 10, 14, 6, 12,
3266 6, 8, 10, 14, 0, 2, 4, 12, 0, 6, 8, 10, 14, 2, 4, 12,
3267 2, 6, 8, 10, 14, 0, 4, 12, 0, 2, 6, 8, 10, 14, 4, 12,
3268 4, 6, 8, 10, 14, 0, 2, 12, 0, 4, 6, 8, 10, 14, 2, 12,
3269 2, 4, 6, 8, 10, 14, 0, 12, 0, 2, 4, 6, 8, 10, 14, 12,
3270 12, 14, 0, 2, 4, 6, 8, 10, 0, 12, 14, 2, 4, 6, 8, 10,
3271 2, 12, 14, 0, 4, 6, 8, 10, 0, 2, 12, 14, 4, 6, 8, 10,
3272 4, 12, 14, 0, 2, 6, 8, 10, 0, 4, 12, 14, 2, 6, 8, 10,
3273 2, 4, 12, 14, 0, 6, 8, 10, 0, 2, 4, 12, 14, 6, 8, 10,
3274 6, 12, 14, 0, 2, 4, 8, 10, 0, 6, 12, 14, 2, 4, 8, 10,
3275 2, 6, 12, 14, 0, 4, 8, 10, 0, 2, 6, 12, 14, 4, 8, 10,
3276 4, 6, 12, 14, 0, 2, 8, 10, 0, 4, 6, 12, 14, 2, 8, 10,
3277 2, 4, 6, 12, 14, 0, 8, 10, 0, 2, 4, 6, 12, 14, 8, 10,
3278 8, 12, 14, 0, 2, 4, 6, 10, 0, 8, 12, 14, 2, 4, 6, 10,
3279 2, 8, 12, 14, 0, 4, 6, 10, 0, 2, 8, 12, 14, 4, 6, 10,
3280 4, 8, 12, 14, 0, 2, 6, 10, 0, 4, 8, 12, 14, 2, 6, 10,
3281 2, 4, 8, 12, 14, 0, 6, 10, 0, 2, 4, 8, 12, 14, 6, 10,
3282 6, 8, 12, 14, 0, 2, 4, 10, 0, 6, 8, 12, 14, 2, 4, 10,
3283 2, 6, 8, 12, 14, 0, 4, 10, 0, 2, 6, 8, 12, 14, 4, 10,
3284 4, 6, 8, 12, 14, 0, 2, 10, 0, 4, 6, 8, 12, 14, 2, 10,
3285 2, 4, 6, 8, 12, 14, 0, 10, 0, 2, 4, 6, 8, 12, 14, 10,
3286 10, 12, 14, 0, 2, 4, 6, 8, 0, 10, 12, 14, 2, 4, 6, 8,
3287 2, 10, 12, 14, 0, 4, 6, 8, 0, 2, 10, 12, 14, 4, 6, 8,
3288 4, 10, 12, 14, 0, 2, 6, 8, 0, 4, 10, 12, 14, 2, 6, 8,
3289 2, 4, 10, 12, 14, 0, 6, 8, 0, 2, 4, 10, 12, 14, 6, 8,
3290 6, 10, 12, 14, 0, 2, 4, 8, 0, 6, 10, 12, 14, 2, 4, 8,
3291 2, 6, 10, 12, 14, 0, 4, 8, 0, 2, 6, 10, 12, 14, 4, 8,
3292 4, 6, 10, 12, 14, 0, 2, 8, 0, 4, 6, 10, 12, 14, 2, 8,
3293 2, 4, 6, 10, 12, 14, 0, 8, 0, 2, 4, 6, 10, 12, 14, 8,
3294 8, 10, 12, 14, 0, 2, 4, 6, 0, 8, 10, 12, 14, 2, 4, 6,
3295 2, 8, 10, 12, 14, 0, 4, 6, 0, 2, 8, 10, 12, 14, 4, 6,
3296 4, 8, 10, 12, 14, 0, 2, 6, 0, 4, 8, 10, 12, 14, 2, 6,
3297 2, 4, 8, 10, 12, 14, 0, 6, 0, 2, 4, 8, 10, 12, 14, 6,
3298 6, 8, 10, 12, 14, 0, 2, 4, 0, 6, 8, 10, 12, 14, 2, 4,
3299 2, 6, 8, 10, 12, 14, 0, 4, 0, 2, 6, 8, 10, 12, 14, 4,
3300 4, 6, 8, 10, 12, 14, 0, 2, 0, 4, 6, 8, 10, 12, 14, 2,
3301 2, 4, 6, 8, 10, 12, 14, 0, 0, 2, 4, 6, 8, 10, 12, 14};
3308 template <
typename T,
size_t N>
3313 alignas(16) constexpr uint8_t packed_array[16 * 16] = {
3314 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
3315 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
3316 4, 5, 6, 7, 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15,
3317 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
3318 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15,
3319 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15,
3320 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15,
3321 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
3322 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
3323 0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11,
3324 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 8, 9, 10, 11,
3325 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 8, 9, 10, 11,
3326 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7,
3327 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7,
3328 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3,
3329 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
3332 return BitCast(
d,
Load(d8, packed_array + 16 * mask_bits));
3335 template <
typename T,
size_t N>
3340 alignas(16) constexpr uint8_t packed_array[4 * 16] = {
3341 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
3342 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
3343 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7,
3344 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
3348 return BitCast(
d,
Load(d8, packed_array + 16 * mask_bits));
3354 template <
typename T,
size_t N>
3356 const uint64_t mask_bits) {
3357 const auto idx = detail::Idx16x8FromBits<T, N>(mask_bits);
3363 template <
typename T,
size_t N>
3365 const uint64_t mask_bits) {
3366 const auto idx = detail::Idx32x4FromBits<T, N>(mask_bits);
3372 template <
typename T,
size_t N>
3374 const uint64_t mask_bits) {
3375 const auto idx = detail::Idx64x2FromBits<T, N>(mask_bits);
3383 template <
typename T>
3384 struct CompressIsPartition {
3388 template <
typename T,
size_t N>
3396 template <
typename T,
size_t N>
3399 uint64_t mask_bits = 0;
3400 constexpr
size_t kNumBytes = (
N + 7) / 8;
3401 CopyBytes<kNumBytes>(bits, &mask_bits);
3403 mask_bits &= (1ull <<
N) - 1;
3410 template <
typename T,
size_t N>
3420 template <
typename T,
size_t N>
3425 using TU =
TFromD<decltype(du)>;
3427 const size_t count =
PopCount(mask_bits);
3428 const Vec128<TU, N> compressed =
3437 template <
typename T,
size_t N>
3441 uint64_t mask_bits = 0;
3442 constexpr
size_t kNumBytes = (
N + 7) / 8;
3443 CopyBytes<kNumBytes>(bits, &mask_bits);
3445 mask_bits &= (1ull <<
N) - 1;
3458 const Vec128<uint8_t> c, Full128<uint8_t>
d,
3460 const auto k5 =
Set(
d, 5);
3461 const auto k6 =
Set(
d, 6);
3465 alignas(16)
static constexpr uint8_t tbl_r0[16] = {
3466 0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80,
3467 3, 0x80, 0x80, 4, 0x80, 0x80, 5};
3468 alignas(16)
static constexpr uint8_t tbl_g0[16] = {
3469 0x80, 0, 0x80, 0x80, 1, 0x80,
3470 0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, 4, 0x80, 0x80};
3471 const auto shuf_r0 =
Load(
d, tbl_r0);
3472 const auto shuf_g0 =
Load(
d, tbl_g0);
3473 const auto shuf_b0 = CombineShiftRightBytes<15>(
d, shuf_g0, shuf_g0);
3477 const auto int0 = r0 | g0 | b0;
3478 StoreU(int0,
d, unaligned + 0 * 16);
3481 const auto shuf_r1 = shuf_b0 + k6;
3482 const auto shuf_g1 = shuf_r0 + k5;
3483 const auto shuf_b1 = shuf_g0 + k5;
3487 const auto int1 = r1 | g1 | b1;
3488 StoreU(int1,
d, unaligned + 1 * 16);
3491 const auto shuf_r2 = shuf_b1 + k6;
3492 const auto shuf_g2 = shuf_r1 + k5;
3493 const auto shuf_b2 = shuf_g1 + k5;
3497 const auto int2 = r2 | g2 | b2;
3498 StoreU(int2,
d, unaligned + 2 * 16);
3508 const auto k5 =
Set(d_full, 5);
3509 const auto k6 =
Set(d_full, 6);
3517 alignas(16)
static constexpr uint8_t tbl_r0[16] = {
3518 0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80,
3519 3, 0x80, 0x80, 4, 0x80, 0x80, 5};
3520 alignas(16)
static constexpr uint8_t tbl_g0[16] = {
3521 0x80, 0, 0x80, 0x80, 1, 0x80,
3522 0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, 4, 0x80, 0x80};
3523 const auto shuf_r0 =
Load(d_full, tbl_r0);
3524 const auto shuf_g0 =
Load(d_full, tbl_g0);
3525 const auto shuf_b0 = CombineShiftRightBytes<15>(d_full, shuf_g0, shuf_g0);
3529 const auto int0 = r0 | g0 | b0;
3530 StoreU(int0, d_full, unaligned + 0 * 16);
3533 const auto shuf_r1 = shuf_b0 + k6;
3534 const auto shuf_g1 = shuf_r0 + k5;
3535 const auto shuf_b1 = shuf_g0 + k5;
3539 const decltype(
Zero(
d)) int1{(r1 | g1 | b1).raw};
3540 StoreU(int1,
d, unaligned + 1 * 16);
3544 template <
size_t N, HWY_IF_LE32(u
int8_t, N)>
3546 const Vec128<uint8_t, N> b,
3547 const Vec128<uint8_t, N> c,
3548 Simd<uint8_t, N, 0> ,
3551 const Full128<uint8_t> d_full;
3553 const Vec128<uint8_t> full_a{a.raw};
3554 const Vec128<uint8_t> full_b{b.raw};
3555 const Vec128<uint8_t> full_c{c.raw};
3559 alignas(16)
static constexpr uint8_t tbl_r0[16] = {
3560 0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80, 3, 0x80, 0x80,
3561 0x80, 0x80, 0x80, 0x80};
3562 const auto shuf_r0 =
Load(d_full, tbl_r0);
3563 const auto shuf_g0 = CombineShiftRightBytes<15>(d_full, shuf_r0, shuf_r0);
3564 const auto shuf_b0 = CombineShiftRightBytes<14>(d_full, shuf_r0, shuf_r0);
3568 const auto int0 = r0 | g0 | b0;
3569 alignas(16) uint8_t buf[16];
3570 StoreU(int0, d_full, buf);
3571 CopyBytes<N * 3>(buf, unaligned);
3578 const Vec128<uint8_t> v1,
3579 const Vec128<uint8_t> v2,
3580 const Vec128<uint8_t> v3, Full128<uint8_t> d8,
3585 const auto ba0 =
ZipLower(d16, v0, v1);
3586 const auto dc0 =
ZipLower(d16, v2, v3);
3587 const auto ba8 =
ZipUpper(d16, v0, v1);
3588 const auto dc8 =
ZipUpper(d16, v2, v3);
3589 const auto dcba_0 =
ZipLower(d32, ba0, dc0);
3590 const auto dcba_4 =
ZipUpper(d32, ba0, dc0);
3591 const auto dcba_8 =
ZipLower(d32, ba8, dc8);
3592 const auto dcba_C =
ZipUpper(d32, ba8, dc8);
3615 const auto ba0 =
ZipLower(d16, v0, v1);
3616 const auto dc0 =
ZipLower(d16, v2, v3);
3617 const auto dcba_0 =
ZipLower(d32, ba0, dc0);
3618 const auto dcba_4 =
ZipUpper(d32, ba0, dc0);
3619 StoreU(
BitCast(d_full8, dcba_0), d_full8, unaligned + 0 * 16);
3620 StoreU(
BitCast(d_full8, dcba_4), d_full8, unaligned + 1 * 16);
3624 template <
size_t N, HWY_IF_LE32(u
int8_t, N)>
3626 const Vec128<uint8_t, N> in1,
3627 const Vec128<uint8_t, N> in2,
3628 const Vec128<uint8_t, N> in3,
3629 Simd<uint8_t, N, 0> ,
3632 const Full128<uint8_t> d_full8;
3635 const Vec128<uint8_t> v0{in0.raw};
3636 const Vec128<uint8_t> v1{in1.raw};
3637 const Vec128<uint8_t> v2{in2.raw};
3638 const Vec128<uint8_t> v3{in3.raw};
3640 const auto ba0 =
ZipLower(d16, v0, v1);
3641 const auto dc0 =
ZipLower(d16, v2, v3);
3642 const auto dcba_0 =
ZipLower(d32, ba0, dc0);
3643 alignas(16) uint8_t buf[16];
3645 CopyBytes<4 * N>(buf, unaligned);
3651 const Vec128<uint64_t> b) {
3652 alignas(16) uint64_t mul[2];
3654 Mul128(
static_cast<uint64_t
>(wasm_i64x2_extract_lane(a.raw, 0)),
3655 static_cast<uint64_t
>(wasm_i64x2_extract_lane(b.raw, 0)), &mul[1]);
3656 return Load(Full128<uint64_t>(), mul);
3660 const Vec128<uint64_t> b) {
3661 alignas(16) uint64_t mul[2];
3663 Mul128(
static_cast<uint64_t
>(wasm_i64x2_extract_lane(a.raw, 1)),
3664 static_cast<uint64_t
>(wasm_i64x2_extract_lane(b.raw, 1)), &mul[1]);
3665 return Load(Full128<uint64_t>(), mul);
3672 Vec128<bfloat16_t, 2 * N> a,
3673 Vec128<bfloat16_t, 2 * N> b,
3674 const Vec128<float, N> sum0,
3675 Vec128<float, N>& sum1) {
3678 const Vec128<uint16_t, 2 * N> zero =
Zero(du16);
3679 const Vec128<uint32_t, N> a0 =
ZipLower(du32, zero,
BitCast(du16, a));
3680 const Vec128<uint32_t, N> a1 =
ZipUpper(du32, zero,
BitCast(du16, a));
3681 const Vec128<uint32_t, N> b0 =
ZipLower(du32, zero,
BitCast(du16, b));
3682 const Vec128<uint32_t, N> b1 =
ZipUpper(du32, zero,
BitCast(du16, b));
3692 template <
typename T>
3697 template <
typename T>
3702 template <
typename T>
3704 const Vec128<T, 1>
v) {
3711 template <
typename T>
3716 template <
typename T>
3721 template <
typename T>
3723 const Vec128<T, 2> v10) {
3724 return Max(v10, Vec128<T, 2>{
Shuffle2301(Vec128<T>{v10.raw}).raw});
3728 template <
typename T>
3732 const Vec128<T> v31_20_31_20 = v3210 + v1032;
3734 return v20_31_20_31 + v31_20_31_20;
3736 template <
typename T>
3742 return Min(v20_31_20_31, v31_20_31_20);
3744 template <
typename T>
3746 const Vec128<T> v3210) {
3748 const Vec128<T> v31_20_31_20 =
Max(v3210, v1032);
3749 const Vec128<T> v20_31_20_31 =
Shuffle0321(v31_20_31_20);
3750 return Max(v20_31_20_31, v31_20_31_20);
3756 template <
typename T>
3762 template <
typename T>
3766 return Min(v10, v01);
3768 template <
typename T>
3770 const Vec128<T> v10) {
3772 return Max(v10, v01);
3776 template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 2), HWY_IF_GE32(T, N)>
3781 const auto odd = ShiftRight<16>(
BitCast(d32,
v));
3784 return BitCast(
d,
Or(min, ShiftLeft<16>(min)));
3786 template <
typename T,
size_t N, HWY_IF_LANE_SIZE(T, 2), HWY_IF_GE32(T, N)>
3791 const auto odd = ShiftRight<16>(
BitCast(d32,
v));
3794 return BitCast(
d,
Or(min, ShiftLeft<16>(min)));
3800 template <
typename T,
size_t N>
3804 template <
typename T,
size_t N>
3808 template <
typename T,
size_t N>
3817 template <
size_t kLanes,
typename T,
size_t N>
3824 template <
typename T,
size_t N, HWY_IF_LE128(T, N)>
3827 static_assert(!IsSigned<T>() &&
sizeof(T) == 8,
"Use u64");
3841 const Mask128<T, N> eqHL =
Eq(a, b);
3842 const Mask128<T, N> ltHL =
Lt(a, b);
3846 const Mask128<T, N> ltLx = detail::ShiftMaskLeft<1>(ltHL);
3847 const Mask128<T, N> outHx =
Or(ltHL,
And(eqHL, ltLx));
3895 HWY_API auto Eq(V a, V b) -> decltype(a == b) {
3899 HWY_API auto Ne(V a, V b) -> decltype(a == b) {
3903 HWY_API auto Lt(V a, V b) -> decltype(a == b) {
3908 HWY_API auto Gt(V a, V b) -> decltype(a == b) {
3912 HWY_API auto Ge(V a, V b) -> decltype(a == b) {
3917 HWY_API auto Le(V a, V b) -> decltype(a == b) {