42 return *this = (*this * other);
45 return *this = (*this / other);
48 return *this = (*this + other);
51 return *this = (*this - other);
54 return *this = (*this & other);
57 return *this = (*this | other);
60 return *this = (*this ^ other);
94 using DFromV = decltype(detail::Deduce1()(V()));
97 using TFromV = TFromD<DFromV<V>>;
101 template <typename T, typename FromT>
103 static_assert(sizeof(T) <= sizeof(FromT), "Promoting is undefined");
105 CopyBytes<sizeof(FromT)>(&v.raw, &to);
111 template <typename T>
116 template <typename T, typename T2>
118 return Vec1<T>(static_cast<T>(t));
121 template <typename T>
126 template <typename T, typename T2>
128 return Vec1<T>(static_cast<T>(first));
135 template <typename T>
144 template <typename T>
150 template <typename T>
157 template <typename T>
167 template <typename T>
173 template <typename T>
180 template <typename T>
186 template <typename T>
193 template <typename T>
195 return Or(o, And(a1, a2));
200 template <typename T>
207 template <typename T>
209 static_assert(IsFloat<T>(), "Only makes sense for floating-point");
214 template <typename T>
216 static_assert(IsFloat<T>(), "Only makes sense for floating-point");
222 template <typename T>
230 #ifdef HWY_NATIVE_POPCNT
231 #undef HWY_NATIVE_POPCNT
233 #define HWY_NATIVE_POPCNT
236 template <typename T>
243 template <typename TFrom, typename TTo>
245 static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size");
250 template <typename T>
257 template <typename T>
264 template <typename T>
271 template <typename T>
277 template <typename T>
280 return mask.bits ? yes : no;
283 template <typename T>
288 template <typename T>
293 template <typename T>
295 return v.raw < 0 ? yes : no;
298 template <typename T>
305 template <typename T>
310 template <typename T>
316 template <typename T>
322 template <typename T>
328 template <typename T>
338 template <int kBits, typename T>
340 static_assert(0 <= kBits && kBits < sizeof(T) * 8, "Invalid shift");
344 template <int kBits, typename T>
346 static_assert(0 <= kBits && kBits < sizeof(T) * 8, "Invalid shift");
347 #if __cplusplus >= 202002L
357 const TU shifted = BitCast(du, v).raw >> kBits;
359 const TU upper = sign << (sizeof(TU) * 8 - 1 - kBits);
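// Lines 357 and 359 belong to the pre-C++20 path (see the #if on line 347):
// before C++20, right-shifting a negative signed value is implementation-
// defined, so the shift is performed on the unsigned bit pattern and the sign
// is re-extended by OR-ing the high bits back in. A standalone sketch of the
// same idea for 32-bit lanes (hypothetical helper, not from this header;
// requires <cstdint> and 0 <= kBits < 32):
int32_t ArithmeticShiftRight32(int32_t x, int kBits) {
  const uint32_t shifted = static_cast<uint32_t>(x) >> kBits;  // logical shift
  const uint32_t sign = x < 0 ? ~0u : 0u;            // all ones if negative
  const uint32_t upper = sign << (32 - 1 - kBits);   // bits to re-set
  // OR restores the sign bits; the cast is value-preserving on
  // two's-complement targets.
  return static_cast<int32_t>(shifted | upper);
}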
369 template <int kBits, typename T>
371 static_assert(0 <= kBits && kBits < sizeof(T) * 8, "Invalid shift");
372 if (kBits == 0) return v;
373 return Or(ShiftRight<kBits>(v), ShiftLeft<sizeof(T) * 8 - kBits>(v));
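// RotateRight is composed from the two shift helpers; kBits == 0 must be
// special-cased on line 372 because the left-shift count would then equal the
// lane width, which the static_assert above (and the language) rejects.
// Equivalent standalone form for 32-bit lanes (hypothetical, requires
// <cstdint> and 0 < k < 32):
uint32_t RotateRight32(uint32_t x, int k) {
  return (x >> k) | (x << (32 - k));
}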
378 template <typename T>
383 template <typename T>
385 #if __cplusplus >= 202002L
395 const TU shifted = BitCast(du, v).raw >> bits;
397 const TU upper = sign << (sizeof(TU) * 8 - 1 - bits);
408 template <typename T>
413 template <typename T>
420 template <typename T>
422 const uint64_t a64 = static_cast<uint64_t>(a.raw);
423 const uint64_t b64 = static_cast<uint64_t>(b.raw);
424 return Vec1<T>(static_cast<T>((a64 + b64) & static_cast<uint64_t>(~T(0))));
433 template <typename T>
435 const uint64_t a64 = static_cast<uint64_t>(a.raw);
436 const uint64_t b64 = static_cast<uint64_t>(b.raw);
437 return Vec1<T>(static_cast<T>((a64 - b64) & static_cast<uint64_t>(~T(0))));
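// Both integer operators above widen the lanes to uint64_t before adding or
// subtracting, so signed overflow wraps instead of being undefined behavior;
// the final cast narrows back to the lane type. A standalone sketch of the
// same pattern (hypothetical helper, requires <cstdint>):
int32_t WrappingAdd32(int32_t a, int32_t b) {
  const uint64_t sum = static_cast<uint64_t>(a) + static_cast<uint64_t>(b);
  // Truncating to 32 bits is modular for the unsigned type; the signed cast
  // preserves the value modulo 2^32 (guaranteed since C++20).
  return static_cast<int32_t>(static_cast<uint32_t>(sum));
}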
521 template <typename T>
524 return (i >= 0 || i == hwy::LimitsMin<T>()) ? a : Vec1<T>(-i);
535 template <typename T, HWY_IF_NOT_FLOAT(T)>
540 template <typename T, HWY_IF_FLOAT(T)>
541 HWY_API Vec1<T> Min(const Vec1<T> a, const Vec1<T> b) {
542 if (std::isnan(a.raw)) return b;
543 if (std::isnan(b.raw)) return a;
544 return Vec1<T>(HWY_MIN(a.raw, b.raw));
547 template <typename T, HWY_IF_NOT_FLOAT(T)>
552 template <typename T, HWY_IF_FLOAT(T)>
553 HWY_API Vec1<T> Max(const Vec1<T> a, const Vec1<T> b) {
554 if (std::isnan(a.raw)) return b;
555 if (std::isnan(b.raw)) return a;
556 return Vec1<T>(HWY_MAX(a.raw, b.raw));
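// Note the NaN handling in the floating-point Min/Max above: if exactly one
// operand is NaN, the other (non-NaN) operand is returned, so for example
// Min(Vec1<float>(NAN), Vec1<float>(3.0f)) yields 3.0f; HWY_MIN/HWY_MAX are
// only reached once neither input is NaN.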
561 template <typename T, HWY_IF_FLOAT(T)>
566 template <typename T, HWY_IF_NOT_FLOAT(T)>
568 return Zero(Sisd<T>()) - v;
573 template <typename T, HWY_IF_FLOAT(T)>
578 template <typename T, HWY_IF_SIGNED(T)>
580 return Vec1<T>(static_cast<T>(int64_t(a.raw) * b.raw));
583 template <typename T, HWY_IF_UNSIGNED(T)>
585 return Vec1<T>(static_cast<T>(uint64_t(a.raw) * b.raw));
588 template <typename T>
602 (static_cast<uint32_t>(a.raw) * static_cast<uint32_t>(b.raw)) >> 16));
611 const int64_t a64 = a.raw;
615 const uint64_t a64 = a.raw;
635 template <typename T>
637 return mul * x + add;
640 template <typename T>
643 return add - mul * x;
646 template <typename T>
648 return mul * x - sub;
651 template <typename T>
654 return Neg(mul) * x - sub;
662 const float half = f * 0.5f;
664 CopyBytes<4>(&f, &bits);
666 bits = 0x5F3759DF - (bits >> 1);
667 CopyBytes<4>(&bits, &f);
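// Lines 662-667 are the classic "fast inverse square root" bit trick: the
// float is reinterpreted as an integer, the magic constant 0x5F3759DF minus
// half the bits gives an initial estimate of 1/sqrt(x), which is then refined
// with a Newton-Raphson step. Standalone sketch (hypothetical helper, not
// from this header; requires <cstdint> and <cstring>):
float FastReciprocalSqrt(float x) {
  const float half = 0.5f * x;
  uint32_t bits;
  std::memcpy(&bits, &x, 4);         // reinterpret the float's bit pattern
  bits = 0x5F3759DF - (bits >> 1);   // initial estimate from exponent/mantissa
  float y;
  std::memcpy(&y, &bits, 4);
  return y * (1.5f - half * y * y);  // one Newton-Raphson refinement
}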
682 template <typename T>
685 if (!(Abs(v).raw < MantissaEnd<T>())) {
688 const T bias = v.raw < T(0.0) ? T(-0.5) : T(0.5);
689 const TI rounded = static_cast<TI>(v.raw + bias);
692 if ((rounded & 1) && std::abs(rounded - v.raw) == T(0.5)) {
693 return Vec1<T>(static_cast<T>(rounded - (v.raw < T(0) ? -1 : 1)));
695 return Vec1<T>(static_cast<T>(rounded));
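// The block above implements round-half-to-even: the biased cast on line 689
// rounds halves away from zero, and lines 692-693 pull exact .5 cases that
// landed on an odd integer back toward the even neighbor, e.g. 2.5 -> 2.0 and
// -2.5 -> -2.0, while 3.5 -> 4.0.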
703 const T abs = Abs(v).raw;
704 const bool signbit = std::signbit(v.raw);
706 if (!(abs < MantissaEnd<T>())) {
708 if (!(abs <= static_cast<T>(LimitsMax<TI>()))) {
709 return Vec1<TI>(signbit ? LimitsMin<TI>() : LimitsMax<TI>());
713 const T bias = v.raw < T(0.0) ? T(-0.5) : T(0.5);
714 const TI rounded = static_cast<TI>(v.raw + bias);
717 if ((rounded & 1) && std::abs(static_cast<T>(rounded) - v.raw) == T(0.5)) {
718 return Vec1<TI>(rounded - (signbit ? -1 : 1));
723 template <typename T>
726 if (!(Abs(v).raw <= MantissaEnd<T>())) {
729 const TI truncated = static_cast<TI>(v.raw);
731 return Vec1<T>(static_cast<T>(truncated));
734 template <typename Float, typename Bits, int kMantissaBits, int kExponentBits,
737 const Bits kExponentMask = (1ull << kExponentBits) - 1;
738 const Bits kMantissaMask = (1ull << kMantissaBits) - 1;
739 const Bits kBias = kExponentMask / 2;
742 const bool positive = f > Float(0.0);
745 CopyBytes<sizeof(Bits)>(&v, &bits);
748 static_cast<int>(((bits >> kMantissaBits) & kExponentMask) - kBias);
750 if (exponent >= kMantissaBits) return v;
752 if (exponent < 0) return positive ? V(1) : V(-0.0);
754 const Bits mantissa_mask = kMantissaMask >> exponent;
756 if ((bits & mantissa_mask) == 0) return v;
759 if (positive) bits += (kMantissaMask + 1) >> exponent;
760 bits &= ~mantissa_mask;
762 CopyBytes<sizeof(Bits)>(&bits, &f);
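// Worked trace of the Ceiling bit manipulation above for f = 2.5f (bits
// 0x40200000): the exponent field (0x40200000 >> 23) & 0xFF = 128 minus the
// bias 127 gives exponent = 1; mantissa_mask = 0x7FFFFF >> 1 = 0x3FFFFF and
// the fraction bits 0x200000 are nonzero, so the value is not yet integral;
// because it is positive, bits += (0x7FFFFF + 1) >> 1 = 0x400000 and the
// fraction is then masked off, yielding 0x40400000, i.e. 3.0f.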
766 template <typename Float, typename Bits, int kMantissaBits, int kExponentBits,
769 const Bits kExponentMask = (1ull << kExponentBits) - 1;
770 const Bits kMantissaMask = (1ull << kMantissaBits) - 1;
771 const Bits kBias = kExponentMask / 2;
774 const bool negative = f < Float(0.0);
777 CopyBytes<sizeof(Bits)>(&v, &bits);
780 static_cast<int>(((bits >> kMantissaBits) & kExponentMask) - kBias);
782 if (exponent >= kMantissaBits) return v;
784 if (exponent < 0) return V(negative ? Float(-1.0) : Float(0.0));
786 const Bits mantissa_mask = kMantissaMask >> exponent;
788 if ((bits & mantissa_mask) == 0) return v;
791 if (negative) bits += (kMantissaMask + 1) >> exponent;
792 bits &= ~mantissa_mask;
794 CopyBytes<sizeof(Bits)>(&bits, &f);
800 return Ceiling<float, uint32_t, 23, 8>(v);
803 return Ceiling<double, uint64_t, 52, 11>(v);
808 return Floor<float, uint32_t, 23, 8>(v);
811 return Floor<double, uint64_t, 52, 11>(v);
816 template <typename T>
821 template <typename T>
826 template <typename T>
828 static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported");
829 return (v & bit) == bit;
832 template <typename T>
836 template <typename T>
841 template <typename T>
845 template <typename T>
854 template <typename T>
857 CopyBytes<sizeof(T)>(aligned, &t);
861 template <typename T>
867 template <typename T>
873 template <typename T>
875 return Load(d, aligned);
880 template <typename T>
883 CopyBytes<sizeof(T)>(&v.raw, aligned);
886 template <typename T>
891 template <typename T>
920 template <typename T>
927 template <typename T, typename Offset>
930 static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
931 uint8_t* const base8 = reinterpret_cast<uint8_t*>(base) + offset.raw;
932 return Store(v, d, reinterpret_cast<T*>(base8));
935 template <typename T, typename Index>
938 static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
944 template <typename T, typename Offset>
947 static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
948 const uintptr_t addr = reinterpret_cast<uintptr_t>(base) + offset.raw;
949 return Load(d, reinterpret_cast<const T*>(addr));
952 template <typename T, typename Index>
955 static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
964 template <typename FromT, typename ToT>
966 static_assert(sizeof(ToT) > sizeof(FromT), "Not promoting");
975 if (std::isinf(from.raw) ||
984 if (std::isinf(from.raw) ||
985 std::fabs(from.raw) > static_cast<double>(HighestValue<int32_t>())) {
987 : HighestValue<int32_t>());
992 template <typename FromT, typename ToT>
994 static_assert(!IsFloat<FromT>(), "FromT=double are handled above");
995 static_assert(sizeof(ToT) < sizeof(FromT), "Not demoting");
1003 #if HWY_NATIVE_FLOAT16
1005 CopyBytes<2>(&v.raw, &bits16);
1007 const uint16_t bits16 = v.raw.bits;
1009 const uint32_t sign = static_cast<uint32_t>(bits16 >> 15);
1010 const uint32_t biased_exp = (bits16 >> 10) & 0x1F;
1011 const uint32_t mantissa = bits16 & 0x3FF;
1014 if (biased_exp == 0) {
1015 const float subnormal =
1016 (1.0f / 16384) * (static_cast<float>(mantissa) * (1.0f / 1024));
1017 return Vec1<float>(sign ? -subnormal : subnormal);
1021 const uint32_t biased_exp32 = biased_exp + (127 - 15);
1022 const uint32_t mantissa32 = mantissa << (23 - 10);
1023 const uint32_t bits32 = (sign << 31) | (biased_exp32 << 23) | mantissa32;
1025 CopyBytes<4>(&bits32, &out);
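// Example of the normal (non-subnormal) path above: the binary16 value 0x3C00
// (1.0) has sign 0, biased_exp 15 and mantissa 0, so biased_exp32 = 15 + 112
// = 127 and bits32 = 127 << 23 = 0x3F800000, which is exactly 1.0f.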
1036 CopyBytes<4>(&v.raw, &bits32);
1037 const uint32_t sign = bits32 >> 31;
1038 const uint32_t biased_exp32 = (bits32 >> 23) & 0xFF;
1039 const uint32_t mantissa32 = bits32 & 0x7FFFFF;
1041 const int32_t exp = HWY_MIN(static_cast<int32_t>(biased_exp32) - 127, 15);
1046 #if HWY_NATIVE_FLOAT16
1047 const uint16_t zero = 0;
1048 CopyBytes<2>(&zero, &out.raw);
1055 uint32_t biased_exp16, mantissa16;
1060 const uint32_t sub_exp = static_cast<uint32_t>(-14 - exp);
1062 mantissa16 = static_cast<uint32_t>((1u << (10 - sub_exp)) +
1063 (mantissa32 >> (13 + sub_exp)));
1066 biased_exp16 = static_cast<uint32_t>(exp + 15);
1067 HWY_DASSERT(1 <= biased_exp16 && biased_exp16 < 31);
1068 mantissa16 = mantissa32 >> 13;
1072 const uint32_t bits16 = (sign << 15) | (biased_exp16 << 10) | mantissa16;
1074 #if HWY_NATIVE_FLOAT16
1075 const uint16_t narrowed = static_cast<uint16_t>(bits16);
1076 CopyBytes<2>(&narrowed, &out.raw);
1078 out.raw.bits = static_cast<uint16_t>(bits16);
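// Example of the normal path above: demoting 1.0f (bits32 = 0x3F800000) gives
// biased_exp32 = 127, exp = HWY_MIN(127 - 127, 15) = 0, hence biased_exp16 =
// 0 + 15 = 15 and mantissa16 = 0, so bits16 = 15 << 10 = 0x3C00, the binary16
// encoding of 1.0.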
1087 template <typename FromT, typename ToT, HWY_IF_FLOAT(FromT)>
1089 static_assert(sizeof(ToT) == sizeof(FromT), "Should have same size");
1092 const double f = static_cast<double>(from.raw);
1093 if (std::isinf(from.raw) ||
1094 std::fabs(f) > static_cast<double>(LimitsMax<ToT>())) {
1095 return Vec1<ToT>(std::signbit(from.raw) ? LimitsMin<ToT>()
1096 : LimitsMax<ToT>());
1101 template <typename FromT, typename ToT, HWY_IF_NOT_FLOAT(FromT)>
1103 static_assert(sizeof(ToT) == sizeof(FromT), "Should have same size");
1105 return Vec1<ToT>(static_cast<ToT>(from.raw));
1115 template <typename T>
1120 template <typename T>
1127 template <typename T>
1132 template <typename T>
1138 template <typename T>
1143 template <typename T>
1150 template <typename T>
1158 template <typename T>
1163 template <typename T, typename TI>
1165 static_assert(sizeof(T) == sizeof(TI), "Index size must match lane size");
1170 template <typename T, typename TI>
1175 template <typename T>
1183 template <typename T>
1190 template <typename T>
1195 template <typename T>
1200 template <typename T>
1205 template <typename T>
1215 template <int kLane, typename T>
1217 static_assert(kLane == 0, "Scalar only has one lane");
1223 template <typename T, typename TI>
1225 uint8_t in_bytes[sizeof(T)];
1226 uint8_t idx_bytes[sizeof(T)];
1227 uint8_t out_bytes[sizeof(T)];
1228 CopyBytes<sizeof(T)>(&in, &in_bytes);
1229 CopyBytes<sizeof(T)>(&indices, &idx_bytes);
1230 for (size_t i = 0; i < sizeof(T); ++i) {
1231 out_bytes[i] = in_bytes[idx_bytes[i]];
1234 CopyBytes<sizeof(TI)>(&out_bytes, &out);
1238 template <typename T, typename TI>
1240 uint8_t in_bytes[sizeof(T)];
1241 uint8_t idx_bytes[sizeof(T)];
1242 uint8_t out_bytes[sizeof(T)];
1243 CopyBytes<sizeof(T)>(&in, &in_bytes);
1244 CopyBytes<sizeof(T)>(&indices, &idx_bytes);
1245 for (size_t i = 0; i < sizeof(T); ++i) {
1246 out_bytes[i] = idx_bytes[i] & 0x80 ? 0 : in_bytes[idx_bytes[i]];
1249 CopyBytes<sizeof(TI)>(&out_bytes, &out);
1276 template <typename T, typename TW = MakeWide<T>, class VW = Vec1<TW>>
1278 return VW(static_cast<TW>((TW{b.raw} << (sizeof(T) * 8)) + a.raw));
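// ZipLower packs the single lane of each input into one lane of twice the
// width, with a forming the lower half: for T = uint8_t, a = 0x12 and
// b = 0x34 produce the uint16_t lane 0x3412.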
1283 template <typename T>
1285 return mask.bits == 0;
1288 template <typename T>
1290 return mask.bits != 0;
1294 template <typename T>
1301 template <typename T>
1307 template <typename T>
1309 return mask.bits == 0 ? 0 : 1;
1312 template <typename T>
1314 return mask.bits == 0 ? -1 : 0;
1319 template <typename T>
1320 struct CompressIsPartition {
1324 template <typename T>
1330 template <typename T>
1337 template <typename T>
1346 template <typename T>
1349 if (!mask.bits) return 0;
1356 template <typename T>
1378 template <typename T>
1382 template <typename T>
1386 template <typename T>
1421 HWY_API auto Eq(V a, V b) -> decltype(a == b) {
1425 HWY_API auto Ne(V a, V b) -> decltype(a == b) {
1429 HWY_API auto Lt(V a, V b) -> decltype(a == b) {
1434 HWY_API auto Gt(V a, V b) -> decltype(a == b) {
1438 HWY_API auto Ge(V a, V b) -> decltype(a == b) {
1443 HWY_API auto Le(V a, V b) -> decltype(a == b) {