// SFINAE shorthands for vector types: each forwards the lane type
// (TFromV<V>) of a vector V to the corresponding scalar-type HWY_IF_* helper,
// so overloads can be constrained directly on the vector type.
#define HWY_IF_UNSIGNED_V(V) HWY_IF_UNSIGNED(TFromV<V>)
#define HWY_IF_SIGNED_V(V) HWY_IF_SIGNED(TFromV<V>)
#define HWY_IF_FLOAT_V(V) HWY_IF_FLOAT(TFromV<V>)
// `bytes` is the required lane size in bytes.
#define HWY_IF_LANE_SIZE_V(V, bytes) HWY_IF_LANE_SIZE(TFromV<V>, bytes)
// Per-lane-type X-macro invokers. Each expands X_MACRO once with the argument
// tuple (BASE, CHAR, BITS, HALF, NAME, OP):
//   BASE: C type stem (uint/int/float), CHAR: intrinsic suffix letter (u/s/f),
//   BITS: lane width in bits, HALF: width of the next-narrower lane type
//   (observed: BITS/2, clamped to 8 for integers and 16 for floats — e.g.
//   U08 passes 8 and F16 passes 16 since no narrower type exists),
//   NAME/OP: forwarded to the generator macro.
#define HWY_SVE_FOREACH_U08(X_MACRO, NAME, OP) X_MACRO(uint, u, 8, 8, NAME, OP)
#define HWY_SVE_FOREACH_U16(X_MACRO, NAME, OP) X_MACRO(uint, u, 16, 8, NAME, OP)
#define HWY_SVE_FOREACH_U32(X_MACRO, NAME, OP) \
  X_MACRO(uint, u, 32, 16, NAME, OP)
#define HWY_SVE_FOREACH_U64(X_MACRO, NAME, OP) \
  X_MACRO(uint, u, 64, 32, NAME, OP)
// Signed integer lane types.
#define HWY_SVE_FOREACH_I08(X_MACRO, NAME, OP) X_MACRO(int, s, 8, 8, NAME, OP)
#define HWY_SVE_FOREACH_I16(X_MACRO, NAME, OP) X_MACRO(int, s, 16, 8, NAME, OP)
#define HWY_SVE_FOREACH_I32(X_MACRO, NAME, OP) X_MACRO(int, s, 32, 16, NAME, OP)
#define HWY_SVE_FOREACH_I64(X_MACRO, NAME, OP) X_MACRO(int, s, 64, 32, NAME, OP)
// Floating-point lane types.
#define HWY_SVE_FOREACH_F16(X_MACRO, NAME, OP) \
  X_MACRO(float, f, 16, 16, NAME, OP)
#define HWY_SVE_FOREACH_F32(X_MACRO, NAME, OP) \
  X_MACRO(float, f, 32, 16, NAME, OP)
#define HWY_SVE_FOREACH_F64(X_MACRO, NAME, OP) \
  X_MACRO(float, f, 64, 32, NAME, OP)
// Aggregate invokers: expand X_MACRO once per lane type in the named group.
// Used to stamp out one function overload per supported lane type.

// All unsigned integer types.
#define HWY_SVE_FOREACH_U(X_MACRO, NAME, OP) \
  HWY_SVE_FOREACH_U08(X_MACRO, NAME, OP)     \
  HWY_SVE_FOREACH_U16(X_MACRO, NAME, OP)     \
  HWY_SVE_FOREACH_U32(X_MACRO, NAME, OP)     \
  HWY_SVE_FOREACH_U64(X_MACRO, NAME, OP)

// All signed integer types.
#define HWY_SVE_FOREACH_I(X_MACRO, NAME, OP) \
  HWY_SVE_FOREACH_I08(X_MACRO, NAME, OP)     \
  HWY_SVE_FOREACH_I16(X_MACRO, NAME, OP)     \
  HWY_SVE_FOREACH_I32(X_MACRO, NAME, OP)     \
  HWY_SVE_FOREACH_I64(X_MACRO, NAME, OP)

// All floating-point types (f16/f32/f64).
#define HWY_SVE_FOREACH_F(X_MACRO, NAME, OP) \
  HWY_SVE_FOREACH_F16(X_MACRO, NAME, OP)     \
  HWY_SVE_FOREACH_F32(X_MACRO, NAME, OP)     \
  HWY_SVE_FOREACH_F64(X_MACRO, NAME, OP)

// Unsigned + signed of a single lane width.
#define HWY_SVE_FOREACH_UI08(X_MACRO, NAME, OP) \
  HWY_SVE_FOREACH_U08(X_MACRO, NAME, OP)        \
  HWY_SVE_FOREACH_I08(X_MACRO, NAME, OP)

#define HWY_SVE_FOREACH_UI16(X_MACRO, NAME, OP) \
  HWY_SVE_FOREACH_U16(X_MACRO, NAME, OP)        \
  HWY_SVE_FOREACH_I16(X_MACRO, NAME, OP)

#define HWY_SVE_FOREACH_UI32(X_MACRO, NAME, OP) \
  HWY_SVE_FOREACH_U32(X_MACRO, NAME, OP)        \
  HWY_SVE_FOREACH_I32(X_MACRO, NAME, OP)

#define HWY_SVE_FOREACH_UI64(X_MACRO, NAME, OP) \
  HWY_SVE_FOREACH_U64(X_MACRO, NAME, OP)        \
  HWY_SVE_FOREACH_I64(X_MACRO, NAME, OP)

// All 32/64-bit types: unsigned, signed and float.
#define HWY_SVE_FOREACH_UIF3264(X_MACRO, NAME, OP) \
  HWY_SVE_FOREACH_UI32(X_MACRO, NAME, OP)          \
  HWY_SVE_FOREACH_UI64(X_MACRO, NAME, OP)          \
  HWY_SVE_FOREACH_F32(X_MACRO, NAME, OP)           \
  HWY_SVE_FOREACH_F64(X_MACRO, NAME, OP)

// All integer types (unsigned + signed), every width.
#define HWY_SVE_FOREACH_UI(X_MACRO, NAME, OP) \
  HWY_SVE_FOREACH_U(X_MACRO, NAME, OP)        \
  HWY_SVE_FOREACH_I(X_MACRO, NAME, OP)

// Signed integer + floating-point types.
#define HWY_SVE_FOREACH_IF(X_MACRO, NAME, OP) \
  HWY_SVE_FOREACH_I(X_MACRO, NAME, OP)        \
  HWY_SVE_FOREACH_F(X_MACRO, NAME, OP)

// Every supported lane type.
#define HWY_SVE_FOREACH(X_MACRO, NAME, OP) \
  HWY_SVE_FOREACH_U(X_MACRO, NAME, OP)     \
  HWY_SVE_FOREACH_I(X_MACRO, NAME, OP)     \
  HWY_SVE_FOREACH_F(X_MACRO, NAME, OP)
// Name builders used inside the generator macros:
//   HWY_SVE_T(uint, 16)      -> uint16_t        (scalar lane type)
//   HWY_SVE_D(uint, 16, N, P)-> Simd<uint16_t, N, P>  (descriptor/tag type)
//   HWY_SVE_V(uint, 16)      -> svuint16_t      (sizeless SVE vector type)
#define HWY_SVE_T(BASE, BITS) BASE##BITS##_t
#define HWY_SVE_D(BASE, BITS, N, POW2) Simd<HWY_SVE_T(BASE, BITS), N, POW2>
#define HWY_SVE_V(BASE, BITS) sv##BASE##BITS##_t
135 #define HWY_SPECIALIZE(BASE, CHAR, BITS, HALF, NAME, OP) \
137 struct DFromV_t<HWY_SVE_V(BASE, BITS)> { \
138 using type = ScalableTag<HWY_SVE_T(BASE, BITS)>; \
142 #undef HWY_SPECIALIZE
148 #define HWY_SVE_RETV_ARGPV(BASE, CHAR, BITS, HALF, NAME, OP) \
149 HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) v) { \
150 return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), v); \
152 #define HWY_SVE_RETV_ARGV(BASE, CHAR, BITS, HALF, NAME, OP) \
153 HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) v) { \
154 return sv##OP##_##CHAR##BITS(v); \
158 #define HWY_SVE_RETV_ARGPVN(BASE, CHAR, BITS, HALF, NAME, OP) \
159 HWY_API HWY_SVE_V(BASE, BITS) \
160 NAME(HWY_SVE_V(BASE, BITS) a, HWY_SVE_T(BASE, BITS) b) { \
161 return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), a, b); \
163 #define HWY_SVE_RETV_ARGVN(BASE, CHAR, BITS, HALF, NAME, OP) \
164 HWY_API HWY_SVE_V(BASE, BITS) \
165 NAME(HWY_SVE_V(BASE, BITS) a, HWY_SVE_T(BASE, BITS) b) { \
166 return sv##OP##_##CHAR##BITS(a, b); \
170 #define HWY_SVE_RETV_ARGPVV(BASE, CHAR, BITS, HALF, NAME, OP) \
171 HWY_API HWY_SVE_V(BASE, BITS) \
172 NAME(HWY_SVE_V(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b) { \
173 return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), a, b); \
175 #define HWY_SVE_RETV_ARGVV(BASE, CHAR, BITS, HALF, NAME, OP) \
176 HWY_API HWY_SVE_V(BASE, BITS) \
177 NAME(HWY_SVE_V(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b) { \
178 return sv##OP##_##CHAR##BITS(a, b); \
187 return svcntb_pat(SV_ALL);
190 return svcnth_pat(SV_ALL);
193 return svcntw_pat(SV_ALL);
196 return svcntd_pat(SV_ALL);
201 return svcntb_pat(SV_POW2);
204 return svcnth_pat(SV_POW2);
207 return svcntw_pat(SV_POW2);
210 return svcntd_pat(SV_POW2);
217 template <
typename T,
size_t N,
int kPow2>
221 if (detail::IsFull(
d))
return actual;
230 #define HWY_SVE_FIRSTN(BASE, CHAR, BITS, HALF, NAME, OP) \
231 template <size_t N, int kPow2> \
232 HWY_API svbool_t NAME(HWY_SVE_D(BASE, BITS, N, kPow2) d, size_t count) { \
233 const size_t limit = detail::IsFull(d) ? count : HWY_MIN(Lanes(d), count); \
234 return sv##OP##_b##BITS##_u32(uint32_t{0}, static_cast<uint32_t>(limit)); \
237 #undef HWY_SVE_FIRSTN
// Predicate with the largest power-of-two number of active lanes for the given
// lane width (SV_POW2 pattern), matching how Lanes() above counts via
// svcnt*_pat(SV_POW2) rather than SV_ALL.
#define HWY_SVE_PTRUE(BITS) svptrue_pat_b##BITS(SV_POW2)
244 #define HWY_SVE_WRAP_PTRUE(BASE, CHAR, BITS, HALF, NAME, OP) \
245 template <size_t N, int kPow2> \
246 HWY_API svbool_t NAME(HWY_SVE_D(BASE, BITS, N, kPow2) ) { \
247 return HWY_SVE_PTRUE(BITS); \
251 #undef HWY_SVE_WRAP_PTRUE
// Returns an all-false SVE predicate (no active lanes, any element width).
HWY_API svbool_t PFalse() { return svpfalse_b(); }
270 #define HWY_SVE_SET(BASE, CHAR, BITS, HALF, NAME, OP) \
271 template <size_t N, int kPow2> \
272 HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_D(BASE, BITS, N, kPow2) , \
273 HWY_SVE_T(BASE, BITS) arg) { \
274 return sv##OP##_##CHAR##BITS(arg); \
281 template <
size_t N,
int kPow2>
287 using VFromD = decltype(
Set(D(), TFromD<D>()));
298 #define HWY_SVE_UNDEFINED(BASE, CHAR, BITS, HALF, NAME, OP) \
299 template <size_t N, int kPow2> \
300 HWY_API HWY_SVE_V(BASE, BITS) \
301 NAME(HWY_SVE_D(BASE, BITS, N, kPow2) ) { \
302 return sv##OP##_##CHAR##BITS(); \
312 #define HWY_SVE_CAST_NOP(BASE, CHAR, BITS, HALF, NAME, OP) \
313 HWY_API HWY_SVE_V(BASE, BITS) BitCastToByte(HWY_SVE_V(BASE, BITS) v) { \
316 template <size_t N, int kPow2> \
317 HWY_API HWY_SVE_V(BASE, BITS) BitCastFromByte( \
318 HWY_SVE_D(BASE, BITS, N, kPow2) , HWY_SVE_V(BASE, BITS) v) { \
323 #define HWY_SVE_CAST(BASE, CHAR, BITS, HALF, NAME, OP) \
324 HWY_INLINE svuint8_t BitCastToByte(HWY_SVE_V(BASE, BITS) v) { \
325 return sv##OP##_u8_##CHAR##BITS(v); \
327 template <size_t N, int kPow2> \
328 HWY_INLINE HWY_SVE_V(BASE, BITS) \
329 BitCastFromByte(HWY_SVE_D(BASE, BITS, N, kPow2) , svuint8_t v) { \
330 return sv##OP##_##CHAR##BITS##_u8(v); \
340 #undef HWY_SVE_CAST_NOP
343 template <
size_t N,
int kPow2>
351 template <
class D,
class FromV>
372 template <
class V, HWY_IF_FLOAT_V(V)>
383 template <
class V, HWY_IF_FLOAT_V(V)>
398 template <
class V, HWY_IF_FLOAT_V(V)>
408 #define HWY_SVE_RETV_ARGPVN_SWAP(BASE, CHAR, BITS, HALF, NAME, OP) \
409 HWY_API HWY_SVE_V(BASE, BITS) \
410 NAME(HWY_SVE_T(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b) { \
411 return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), b, a); \
415 #undef HWY_SVE_RETV_ARGPVN_SWAP
418 #define HWY_SVE_RETV_ARGPVV_SWAP(BASE, CHAR, BITS, HALF, NAME, OP) \
419 HWY_API HWY_SVE_V(BASE, BITS) \
420 NAME(HWY_SVE_V(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b) { \
421 return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), b, a); \
424 #undef HWY_SVE_RETV_ARGPVV_SWAP
426 template <
class V, HWY_IF_FLOAT_V(V)>
437 return Or(o,
And(a1, a2));
442 #ifdef HWY_NATIVE_POPCNT
443 #undef HWY_NATIVE_POPCNT
445 #define HWY_NATIVE_POPCNT
449 #define HWY_SVE_POPCNT(BASE, CHAR, BITS, HALF, NAME, OP) \
450 HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) v) { \
451 return BitCast(DFromV<decltype(v)>(), \
452 sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), v)); \
455 #undef HWY_SVE_POPCNT
476 return Or(abs,
And(msb, sign));
493 #define HWY_SVE_RETV_ARGPVN_MASK(BASE, CHAR, BITS, HALF, NAME, OP) \
494 HWY_API HWY_SVE_V(BASE, BITS) \
495 NAME(svbool_t pg, HWY_SVE_V(BASE, BITS) a, HWY_SVE_T(BASE, BITS) b) { \
496 return sv##OP##_##CHAR##BITS##_z(pg, a, b); \
500 #undef HWY_SVE_RETV_ARGPVN_MASK
509 const svbool_t pg = detail::PTrue(du64);
511 const svuint32_t sums_of_4 = svdot_n_u32(
Zero(du32),
v, 1);
514 const svuint64_t hi = svlsr_n_u64_x(pg,
BitCast(du64, sums_of_4), 32);
516 const svuint64_t lo = svextw_u64_x(pg,
BitCast(du64, sums_of_4));
535 #define HWY_SVE_SHIFT_N(BASE, CHAR, BITS, HALF, NAME, OP) \
536 template <int kBits> \
537 HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) v) { \
538 return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), v, kBits); \
540 HWY_API HWY_SVE_V(BASE, BITS) \
541 NAME##Same(HWY_SVE_V(BASE, BITS) v, HWY_SVE_T(uint, BITS) bits) { \
542 return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), v, bits); \
552 #undef HWY_SVE_SHIFT_N
557 template <
int kBits,
class V>
559 constexpr
size_t kSizeInBits =
sizeof(
TFromV<V>) * 8;
560 static_assert(0 <= kBits && kBits < kSizeInBits,
"Invalid shift count");
561 if (kBits == 0)
return v;
562 return Or(ShiftRight<kBits>(
v), ShiftLeft<kSizeInBits - kBits>(
v));
567 #define HWY_SVE_SHIFT(BASE, CHAR, BITS, HALF, NAME, OP) \
568 HWY_API HWY_SVE_V(BASE, BITS) \
569 NAME(HWY_SVE_V(BASE, BITS) v, HWY_SVE_V(BASE, BITS) bits) { \
570 const RebindToUnsigned<DFromV<decltype(v)>> du; \
571 return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), v, \
572 BitCast(du, bits)); \
607 #if HWY_TARGET == HWY_SVE2
608 return svqrdmulh_s16(a, b);
614 const svint16_t hi =
MulHigh(a, b);
618 const svuint16_t lo_top2 = ShiftRight<14>(lo);
620 const svuint16_t rounding = ShiftRight<1>(detail::AddN(lo_top2, 1));
638 #define HWY_SVE_FMA(BASE, CHAR, BITS, HALF, NAME, OP) \
639 HWY_API HWY_SVE_V(BASE, BITS) \
640 NAME(HWY_SVE_V(BASE, BITS) mul, HWY_SVE_V(BASE, BITS) x, \
641 HWY_SVE_V(BASE, BITS) add) { \
642 return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), x, mul, add); \
668 template <
class D,
typename MFrom>
682 return svand_b_z(b, b, a);
685 return svbic_b_z(b, b, a);
688 return svsel_b(a, a, b);
691 return svsel_b(a, svnand_b_z(a, a, b), b);
696 #define HWY_SVE_COUNT_TRUE(BASE, CHAR, BITS, HALF, NAME, OP) \
697 template <size_t N, int kPow2> \
698 HWY_API size_t NAME(HWY_SVE_D(BASE, BITS, N, kPow2) d, svbool_t m) { \
699 return sv##OP##_b##BITS(detail::MakeMask(d), m); \
703 #undef HWY_SVE_COUNT_TRUE
708 #define HWY_SVE_COUNT_TRUE_FULL(BASE, CHAR, BITS, HALF, NAME, OP) \
709 template <size_t N, int kPow2> \
710 HWY_API size_t NAME(HWY_SVE_D(BASE, BITS, N, kPow2) , svbool_t m) { \
711 return sv##OP##_b##BITS(svptrue_b##BITS(), m); \
715 #undef HWY_SVE_COUNT_TRUE_FULL
735 :
static_cast<intptr_t
>(
740 #define HWY_SVE_IF_THEN_ELSE(BASE, CHAR, BITS, HALF, NAME, OP) \
741 HWY_API HWY_SVE_V(BASE, BITS) \
742 NAME(svbool_t m, HWY_SVE_V(BASE, BITS) yes, HWY_SVE_V(BASE, BITS) no) { \
743 return sv##OP##_##CHAR##BITS(m, yes, no); \
747 #undef HWY_SVE_IF_THEN_ELSE
750 template <
class M,
class V>
756 template <
class M,
class V>
764 #define HWY_SVE_COMPARE(BASE, CHAR, BITS, HALF, NAME, OP) \
765 HWY_API svbool_t NAME(HWY_SVE_V(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b) { \
766 return sv##OP##_##CHAR##BITS(HWY_SVE_PTRUE(BITS), a, b); \
768 #define HWY_SVE_COMPARE_N(BASE, CHAR, BITS, HALF, NAME, OP) \
769 HWY_API svbool_t NAME(HWY_SVE_V(BASE, BITS) a, HWY_SVE_T(BASE, BITS) b) { \
770 return sv##OP##_##CHAR##BITS(HWY_SVE_PTRUE(BITS), a, b); \
794 #undef HWY_SVE_COMPARE
795 #undef HWY_SVE_COMPARE_N
811 return detail::NeN(
And(a, bit), 0);
817 return detail::NeN(
v,
static_cast<TFromV<V>>(0));
822 template <
class D, HWY_IF_NOT_FLOAT_D(D)>
825 return BitCast(
d, detail::SubN(mask, v0, 1));
828 template <
class D, HWY_IF_FLOAT_D(D)>
845 #define HWY_SVE_LOAD(BASE, CHAR, BITS, HALF, NAME, OP) \
846 template <size_t N, int kPow2> \
847 HWY_API HWY_SVE_V(BASE, BITS) \
848 NAME(HWY_SVE_D(BASE, BITS, N, kPow2) d, \
849 const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT p) { \
850 return sv##OP##_##CHAR##BITS(detail::MakeMask(d), p); \
853 #define HWY_SVE_MASKED_LOAD(BASE, CHAR, BITS, HALF, NAME, OP) \
854 template <size_t N, int kPow2> \
855 HWY_API HWY_SVE_V(BASE, BITS) \
856 NAME(svbool_t m, HWY_SVE_D(BASE, BITS, N, kPow2) , \
857 const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT p) { \
858 return sv##OP##_##CHAR##BITS(m, p); \
861 #define HWY_SVE_LOAD_DUP128(BASE, CHAR, BITS, HALF, NAME, OP) \
862 template <size_t N, int kPow2> \
863 HWY_API HWY_SVE_V(BASE, BITS) \
864 NAME(HWY_SVE_D(BASE, BITS, N, kPow2) , \
865 const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT p) { \
867 return sv##OP##_##CHAR##BITS(HWY_SVE_PTRUE(8), p); \
870 #define HWY_SVE_STORE(BASE, CHAR, BITS, HALF, NAME, OP) \
871 template <size_t N, int kPow2> \
872 HWY_API void NAME(HWY_SVE_V(BASE, BITS) v, \
873 HWY_SVE_D(BASE, BITS, N, kPow2) d, \
874 HWY_SVE_T(BASE, BITS) * HWY_RESTRICT p) { \
875 sv##OP##_##CHAR##BITS(detail::MakeMask(d), p, v); \
878 #define HWY_SVE_BLENDED_STORE(BASE, CHAR, BITS, HALF, NAME, OP) \
879 template <size_t N, int kPow2> \
880 HWY_API void NAME(HWY_SVE_V(BASE, BITS) v, svbool_t m, \
881 HWY_SVE_D(BASE, BITS, N, kPow2) , \
882 HWY_SVE_T(BASE, BITS) * HWY_RESTRICT p) { \
883 sv##OP##_##CHAR##BITS(m, p, v); \
894 #undef HWY_SVE_MASKED_LOAD
895 #undef HWY_SVE_LOAD_DUP128
897 #undef HWY_SVE_BLENDED_STORE
900 template <
size_t N,
int kPow2>
907 template <
size_t N,
int kPow2>
923 template <
class V,
class D>
930 #define HWY_SVE_SCATTER_OFFSET(BASE, CHAR, BITS, HALF, NAME, OP) \
931 template <size_t N, int kPow2> \
932 HWY_API void NAME(HWY_SVE_V(BASE, BITS) v, \
933 HWY_SVE_D(BASE, BITS, N, kPow2) d, \
934 HWY_SVE_T(BASE, BITS) * HWY_RESTRICT base, \
935 HWY_SVE_V(int, BITS) offset) { \
936 sv##OP##_s##BITS##offset_##CHAR##BITS(detail::MakeMask(d), base, offset, \
940 #define HWY_SVE_SCATTER_INDEX(BASE, CHAR, BITS, HALF, NAME, OP) \
941 template <size_t N, int kPow2> \
943 HWY_SVE_V(BASE, BITS) v, HWY_SVE_D(BASE, BITS, N, kPow2) d, \
944 HWY_SVE_T(BASE, BITS) * HWY_RESTRICT base, HWY_SVE_V(int, BITS) index) { \
945 sv##OP##_s##BITS##index_##CHAR##BITS(detail::MakeMask(d), base, index, v); \
950 #undef HWY_SVE_SCATTER_OFFSET
951 #undef HWY_SVE_SCATTER_INDEX
955 #define HWY_SVE_GATHER_OFFSET(BASE, CHAR, BITS, HALF, NAME, OP) \
956 template <size_t N, int kPow2> \
957 HWY_API HWY_SVE_V(BASE, BITS) \
958 NAME(HWY_SVE_D(BASE, BITS, N, kPow2) d, \
959 const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT base, \
960 HWY_SVE_V(int, BITS) offset) { \
961 return sv##OP##_s##BITS##offset_##CHAR##BITS(detail::MakeMask(d), base, \
964 #define HWY_SVE_GATHER_INDEX(BASE, CHAR, BITS, HALF, NAME, OP) \
965 template <size_t N, int kPow2> \
966 HWY_API HWY_SVE_V(BASE, BITS) \
967 NAME(HWY_SVE_D(BASE, BITS, N, kPow2) d, \
968 const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT base, \
969 HWY_SVE_V(int, BITS) index) { \
970 return sv##OP##_s##BITS##index_##CHAR##BITS(detail::MakeMask(d), base, \
976 #undef HWY_SVE_GATHER_OFFSET
977 #undef HWY_SVE_GATHER_INDEX
981 #define HWY_SVE_STORE3(BASE, CHAR, BITS, HALF, NAME, OP) \
982 template <size_t N, int kPow2> \
983 HWY_API void NAME(HWY_SVE_V(BASE, BITS) v0, HWY_SVE_V(BASE, BITS) v1, \
984 HWY_SVE_V(BASE, BITS) v2, \
985 HWY_SVE_D(BASE, BITS, N, kPow2) d, \
986 HWY_SVE_T(BASE, BITS) * HWY_RESTRICT unaligned) { \
987 const sv##BASE##BITS##x3_t triple = svcreate3##_##CHAR##BITS(v0, v1, v2); \
988 sv##OP##_##CHAR##BITS(detail::MakeMask(d), unaligned, triple); \
992 #undef HWY_SVE_STORE3
996 #define HWY_SVE_STORE4(BASE, CHAR, BITS, HALF, NAME, OP) \
997 template <size_t N, int kPow2> \
998 HWY_API void NAME(HWY_SVE_V(BASE, BITS) v0, HWY_SVE_V(BASE, BITS) v1, \
999 HWY_SVE_V(BASE, BITS) v2, HWY_SVE_V(BASE, BITS) v3, \
1000 HWY_SVE_D(BASE, BITS, N, kPow2) d, \
1001 HWY_SVE_T(BASE, BITS) * HWY_RESTRICT unaligned) { \
1002 const sv##BASE##BITS##x4_t quad = \
1003 svcreate4##_##CHAR##BITS(v0, v1, v2, v3); \
1004 sv##OP##_##CHAR##BITS(detail::MakeMask(d), unaligned, quad); \
1008 #undef HWY_SVE_STORE4
1015 #define HWY_SVE_PROMOTE_TO(BASE, CHAR, BITS, HALF, NAME, OP) \
1016 template <size_t N, int kPow2> \
1017 HWY_API HWY_SVE_V(BASE, BITS) NAME( \
1018 HWY_SVE_D(BASE, BITS, N, kPow2) , HWY_SVE_V(BASE, HALF) v) { \
1019 return sv##OP##_##CHAR##BITS(v); \
1027 template <
size_t N,
int kPow2>
1032 template <
size_t N,
int kPow2>
1039 template <
size_t N,
int kPow2>
1044 template <
size_t N,
int kPow2>
1049 template <
size_t N,
int kPow2>
1066 template <
size_t N,
int kPow2>
1068 const svfloat16_t
v) {
1073 template <
size_t N,
int kPow2>
1075 const svfloat32_t
v) {
1080 template <
size_t N,
int kPow2>
1082 const svint32_t
v) {
1090 #undef HWY_SVE_PROMOTE_TO
1092 template <
size_t N,
int kPow2>
1106 template <
typename TN,
class VU>
1108 return detail::MinN(
v,
static_cast<TFromV<VU>>(LimitsMax<TN>()));
1112 template <
typename TN,
class VI>
1114 return detail::MinN(detail::MaxN(
v, LimitsMin<TN>()), LimitsMax<TN>());
1119 template <
size_t N,
int kPow2>
1123 using TN =
TFromD<decltype(dn)>;
1125 const svuint16_t clamped =
BitCast(du, detail::MaxN(
v, 0));
1127 const svuint8_t vn =
BitCast(dn, detail::SaturateU<TN>(clamped));
1128 return svuzp1_u8(vn, vn);
1131 template <
size_t N,
int kPow2>
1135 using TN =
TFromD<decltype(dn)>;
1137 const svuint32_t clamped =
BitCast(du, detail::MaxN(
v, 0));
1139 const svuint16_t vn =
BitCast(dn, detail::SaturateU<TN>(clamped));
1140 return svuzp1_u16(vn, vn);
1143 template <
size_t N,
int kPow2>
1148 using TN =
TFromD<decltype(dn)>;
1150 const svuint32_t clamped =
BitCast(du, detail::MaxN(
v, 0));
1152 const svuint16_t cast16 =
BitCast(d2, detail::SaturateU<TN>(clamped));
1153 const svuint8_t x2 =
BitCast(dn, svuzp1_u16(cast16, cast16));
1154 return svuzp1_u8(x2, x2);
1162 const svuint16_t cast16 =
BitCast(du16,
v);
1163 const svuint16_t x2 = svuzp1_u16(cast16, cast16);
1164 const svuint8_t cast8 =
BitCast(du8, x2);
1165 return svuzp1_u8(cast8, cast8);
1170 template <
size_t N,
int kPow2>
1172 #if HWY_TARGET == HWY_SVE2
1173 const svint8_t vn =
BitCast(dn, svqxtnb_s16(
v));
1175 using TN =
TFromD<decltype(dn)>;
1176 const svint8_t vn =
BitCast(dn, detail::SaturateI<TN>(
v));
1178 return svuzp1_s8(vn, vn);
1181 template <
size_t N,
int kPow2>
1183 #if HWY_TARGET == HWY_SVE2
1184 const svint16_t vn =
BitCast(dn, svqxtnb_s32(
v));
1186 using TN =
TFromD<decltype(dn)>;
1187 const svint16_t vn =
BitCast(dn, detail::SaturateI<TN>(
v));
1189 return svuzp1_s16(vn, vn);
1192 template <
size_t N,
int kPow2>
1195 #if HWY_TARGET == HWY_SVE2
1196 const svint16_t cast16 =
BitCast(d2, svqxtnb_s16(svqxtnb_s32(
v)));
1198 using TN =
TFromD<decltype(dn)>;
1199 const svint16_t cast16 =
BitCast(d2, detail::SaturateI<TN>(
v));
1201 const svint8_t v2 =
BitCast(dn, svuzp1_s16(cast16, cast16));
1202 return BitCast(dn, svuzp1_s8(v2, v2));
1211 #define HWY_SVE_CONCAT_EVERY_SECOND(BASE, CHAR, BITS, HALF, NAME, OP) \
1212 HWY_INLINE HWY_SVE_V(BASE, BITS) \
1213 NAME(HWY_SVE_V(BASE, BITS) hi, HWY_SVE_V(BASE, BITS) lo) { \
1214 return sv##OP##_##CHAR##BITS(lo, hi); \
1218 #undef HWY_SVE_CONCAT_EVERY_SECOND
1222 #define HWY_SVE_SPLICE(BASE, CHAR, BITS, HALF, NAME, OP) \
1223 HWY_API HWY_SVE_V(BASE, BITS) NAME( \
1224 HWY_SVE_V(BASE, BITS) hi, HWY_SVE_V(BASE, BITS) lo, svbool_t mask) { \
1225 return sv##OP##_##CHAR##BITS(mask, lo, hi); \
1228 #undef HWY_SVE_SPLICE
1239 return detail::Splice(hi_odd, lo_odd,
FirstN(
d,
Lanes(
d) / 2));
1250 return detail::Splice(hi_odd, lo_odd,
FirstN(
d,
Lanes(
d) / 2));
1256 template <
size_t N,
int kPow2>
1258 const svfloat16_t in_even = svcvt_f16_f32_x(detail::PTrue(
d),
v);
1262 template <
size_t N,
int kPow2>
1268 template <
size_t N,
int kPow2>
1270 const svfloat32_t in_even = svcvt_f32_f64_x(detail::PTrue(
d),
v);
1274 template <
size_t N,
int kPow2>
1276 const svint32_t in_even = svcvt_s32_f64_x(detail::PTrue(
d),
v);
1282 #define HWY_SVE_CONVERT(BASE, CHAR, BITS, HALF, NAME, OP) \
1283 template <size_t N, int kPow2> \
1284 HWY_API HWY_SVE_V(BASE, BITS) \
1285 NAME(HWY_SVE_D(BASE, BITS, N, kPow2) , HWY_SVE_V(int, BITS) v) { \
1286 return sv##OP##_##CHAR##BITS##_s##BITS##_x(HWY_SVE_PTRUE(BITS), v); \
1289 template <size_t N, int kPow2> \
1290 HWY_API HWY_SVE_V(int, BITS) \
1291 NAME(HWY_SVE_D(int, BITS, N, kPow2) , HWY_SVE_V(BASE, BITS) v) { \
1292 return sv##OP##_s##BITS##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), v); \
1297 #undef HWY_SVE_CONVERT
1301 template <
class VF,
class DI = RebindToSigned<DFromV<VF>>>
1309 #define HWY_SVE_IOTA(BASE, CHAR, BITS, HALF, NAME, OP) \
1310 template <size_t N, int kPow2> \
1311 HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_D(BASE, BITS, N, kPow2) , \
1312 HWY_SVE_T(BASE, BITS) first) { \
1313 return sv##OP##_##CHAR##BITS(first, 1); \
1319 template <
class D, HWY_IF_FLOAT_D(D)>
1341 #define HWY_SVE_EXT(BASE, CHAR, BITS, HALF, NAME, OP) \
1342 template <size_t kIndex> \
1343 HWY_API HWY_SVE_V(BASE, BITS) \
1344 NAME(HWY_SVE_V(BASE, BITS) hi, HWY_SVE_V(BASE, BITS) lo) { \
1345 return sv##OP##_##CHAR##BITS(lo, hi, kIndex); \
1353 template <
class D,
class V>
1359 template <
class D,
class V>
1365 template <
class D,
class V>
1371 template <
class D,
class V>
1374 const V lo_upper = detail::Splice(lo, lo, mask_upper);
1379 template <
class D,
class V2>
1386 template <
class D,
class V>
1393 template <
class D2,
class V>
1403 template <
class D2,
class V>
1412 #define HWY_SVE_GET_LANE(BASE, CHAR, BITS, HALF, NAME, OP) \
1413 HWY_API HWY_SVE_T(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) v) { \
1414 return sv##OP##_##CHAR##BITS(detail::PFalse(), v); \
1418 #undef HWY_SVE_GET_LANE
1428 return detail::InterleaveEven(
v,
v);
1439 return detail::InterleaveOdd(
v,
v);
1450 const auto even_in_odd = detail::Insert(even, 0);
1451 return detail::InterleaveOdd(even_in_odd, odd);
1458 using TU =
TFromD<decltype(du)>;
1459 constexpr
size_t kShift =
CeilLog2(16 /
sizeof(TU));
1460 const auto idx_block = ShiftRight<kShift>(
Iota(du, 0));
1461 const auto lsb = detail::AndN(idx_block,
static_cast<TU
>(1));
1462 const svbool_t is_even = detail::EqN(lsb,
static_cast<TU
>(0));
1468 template <
class D,
class VI>
1471 static_assert(
sizeof(
TFromD<D>) ==
sizeof(TI),
"Index/lane size mismatch");
1473 const auto indices =
BitCast(du, vec);
1474 #if HWY_IS_DEBUG_BUILD
1482 template <
class D,
typename TI>
1484 static_assert(
sizeof(
TFromD<D>) ==
sizeof(TI),
"Index size must match lane");
1489 #define HWY_SVE_TABLE(BASE, CHAR, BITS, HALF, NAME, OP) \
1490 HWY_API HWY_SVE_V(BASE, BITS) \
1491 NAME(HWY_SVE_V(BASE, BITS) v, HWY_SVE_V(uint, BITS) idx) { \
1492 return sv##OP##_##CHAR##BITS(v, idx); \
1496 #undef HWY_SVE_TABLE
1502 template <
typename T,
size_t N,
int kPow2>
1514 constexpr
auto kLanesPerBlock =
1516 const VFromD<decltype(du)> idx = detail::XorN(
Iota(du, 0), kLanesPerBlock);
1523 #error "Update macro"
1525 #define HWY_SVE_REVERSE(BASE, CHAR, BITS, HALF, NAME, OP) \
1526 template <size_t N, int kPow2> \
1527 HWY_API HWY_SVE_V(BASE, BITS) \
1528 NAME(HWY_SVE_D(BASE, BITS, N, kPow2) d, HWY_SVE_V(BASE, BITS) v) { \
1529 const auto reversed = sv##OP##_##CHAR##BITS(v); \
1531 const size_t all_lanes = \
1532 detail::AllHardwareLanes(hwy::SizeTag<BITS / 8>()); \
1535 const ScalableTag<HWY_SVE_T(BASE, BITS)> dfull; \
1536 const svbool_t mask = Not(FirstN(dfull, all_lanes - Lanes(d))); \
1537 return detail::Splice(reversed, reversed, mask); \
1541 #undef HWY_SVE_REVERSE
1545 template <
class D, HWY_IF_LANE_SIZE_D(D, 2)>
1552 template <
class D, HWY_IF_LANE_SIZE_D(D, 4)>
1559 template <
class D, HWY_IF_LANE_SIZE_D(D, 8)>
1561 const auto even_in_odd = detail::Insert(
v, 0);
1562 return detail::InterleaveOdd(
v, even_in_odd);
1571 const auto idx = detail::XorN(
Iota(du, 0), 3);
1580 const auto idx = detail::XorN(
Iota(du, 0), 7);
1586 template <
typename T>
1587 struct CompressIsPartition {
1591 #define HWY_SVE_COMPRESS(BASE, CHAR, BITS, HALF, NAME, OP) \
1592 HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) v, svbool_t mask) { \
1593 return sv##OP##_##CHAR##BITS(mask, v); \
1597 #undef HWY_SVE_COMPRESS
1599 template <
class V, HWY_IF_LANE_SIZE_V(V, 2)>
1601 static_assert(!IsSame<V, svfloat16_t>(),
"Must use overload");
1608 const svbool_t mask32L = svunpklo_b(mask16);
1609 const svbool_t mask32H = svunpkhi_b(mask16);
1611 const auto compressedL =
Compress(v32L, mask32L);
1612 const auto compressedH =
Compress(v32H, mask32H);
1615 const V evenL =
BitCast(d16, compressedL);
1616 const V evenH =
BitCast(d16, compressedH);
1623 const size_t countL = detail::CountTrueFull(dw, mask32L);
1624 const auto compressed_maskL =
FirstN(d16, countL);
1625 return detail::Splice(v16H, v16L, compressed_maskL);
1637 template <
class V,
class M,
class D>
1646 template <
class V,
class M,
class D>
1650 const svbool_t store_mask =
FirstN(
d, count);
1663 template <
class D,
class V>
1666 return detail::AndNotN(
static_cast<T
>(
LanesPerBlock(
d) - 1), iota0);
1669 template <
size_t kLanes,
class D>
1673 const auto idx_mod = detail::AndN(
Iota(di, 0), kLanesPerBlock - 1);
1674 return detail::LtN(
BitCast(di, idx_mod), kLanes);
1679 template <
size_t kBytes,
class D,
class V = VFromD<D>>
1682 const auto hi8 =
BitCast(d8, hi);
1683 const auto lo8 =
BitCast(d8, lo);
1684 const auto hi_up = detail::Splice(hi8, hi8,
FirstN(d8, 16 - kBytes));
1685 const auto lo_down = detail::Ext<kBytes>(lo8, lo8);
1695 static_assert(
sizeof(
TFromD<decltype(
d)>) == 4,
"Defined for 32-bit types");
1704 static_assert(
sizeof(
TFromD<decltype(
d)>) == 4,
"Defined for 32-bit types");
1705 const svuint8_t v8 =
BitCast(d8,
v);
1706 return BitCast(
d, CombineShiftRightBytes<12>(d8, v8, v8));
1714 static_assert(
sizeof(
TFromD<decltype(
d)>) == 4,
"Defined for 32-bit types");
1715 const svuint8_t v8 =
BitCast(d8,
v);
1716 return BitCast(
d, CombineShiftRightBytes<4>(d8, v8, v8));
1724 static_assert(
sizeof(
TFromD<decltype(
d)>) == 4,
"Defined for 32-bit types");
1725 const svuint8_t v8 =
BitCast(d8,
v);
1726 return BitCast(
d, CombineShiftRightBytes<8>(d8, v8, v8));
1734 static_assert(
sizeof(
TFromD<decltype(
d)>) == 8,
"Defined for 64-bit types");
1735 const svuint8_t v8 =
BitCast(d8,
v);
1736 return BitCast(
d, CombineShiftRightBytes<8>(d8, v8, v8));
1746 template <
class D,
class V = VFromD<D>>
1754 template <
class V,
class VI>
1759 const auto idx8 =
Add(
BitCast(du8, idx), offsets128);
1763 template <
class V,
class VI>
1769 auto idx8 =
BitCast(di8, idx);
1770 const auto msb = detail::LtN(idx8, 0);
1778 template <
int kLane,
class V>
1783 static_assert(0 <= kLane && kLane < kLanesPerBlock,
"Invalid lane");
1786 idx = detail::AddN(idx, kLane);
1793 template <
size_t kLanes,
class D,
class V = VFromD<D>>
1795 const auto zero =
Zero(
d);
1796 const auto shifted = detail::Splice(
v, zero,
FirstN(
d, kLanes));
1798 return IfThenElse(detail::FirstNPerBlock<kLanes>(
d), zero, shifted);
1801 template <
size_t kLanes,
class V>
1803 return ShiftLeftLanes<kLanes>(
DFromV<V>(),
v);
1807 template <
size_t kLanes,
class D,
class V = VFromD<D>>
1810 if (!detail::IsFull(
d)) {
1814 const auto shifted = detail::Ext<kLanes>(
v,
v);
1823 template <
int kBytes,
class D,
class V = VFromD<D>>
1829 template <
int kBytes,
class V>
1831 return ShiftLeftBytes<kBytes>(
DFromV<V>(),
v);
1835 template <
int kBytes,
class D,
class V = VFromD<D>>
1843 template <
class D,
class V>
1848 const auto a64 =
BitCast(d64, a);
1849 const auto b64 =
BitCast(d64, b);
1864 template <
class D,
class V = VFromD<D>,
1865 hwy::EnableIf<detail::IsFull(D())>* =
nullptr>
1869 const auto a64 =
BitCast(d64, a);
1870 const auto b64 =
BitCast(d64, b);
1877 template <
class D,
class V = VFromD<D>,
1878 hwy::EnableIf<!detail::IsFull(D())>* =
nullptr>
1881 if (
Lanes(
d) *
sizeof(TFromD<D>) < 16) {
1882 const Half<decltype(
d)> d2;
1890 template <
class V,
class DW = RepartitionToW
ide<DFromV<V>>>
1896 template <
class V,
class D = DFromV<V>,
class DW = RepartitionToW
ide<D>>
1902 template <
class V,
class DW = RepartitionToW
ide<DFromV<V>>>
1911 #define HWY_SVE_REDUCE(BASE, CHAR, BITS, HALF, NAME, OP) \
1912 template <size_t N, int kPow2> \
1913 HWY_API HWY_SVE_V(BASE, BITS) \
1914 NAME(HWY_SVE_D(BASE, BITS, N, kPow2) d, HWY_SVE_V(BASE, BITS) v) { \
1915 return Set(d, static_cast<HWY_SVE_T(BASE, BITS)>( \
1916 sv##OP##_##CHAR##BITS(detail::MakeMask(d), v))); \
1926 #undef HWY_SVE_REDUCE
1932 template <
size_t N,
int kPow2>
1934 const svuint16_t
v) {
1940 template <
size_t N,
int kPow2>
1942 svfloat32_t a, svfloat32_t b) {
1944 const Repartition<uint32_t, decltype(dbf16)> du32;
1945 const svuint32_t b_in_even = ShiftRight<16>(
BitCast(du32, b));
1974 #if HWY_TARGET == HWY_SVE2
1980 return ShiftRight<1>(detail::AddN(
Add(a, b), 1));
1987 template <
class D, HWY_IF_LANE_SIZE_D(D, 1)>
1990 const svuint8_t iota =
Iota(du, 0);
1993 const svuint8_t bytes =
BitCast(du, svld1ub_u64(detail::PTrue(
d), bits));
1995 const svuint8_t rep8 = svtbl_u8(bytes, detail::AndNotN(7, iota));
1998 const svuint8_t bit =
Shl(
Set(du, 1), detail::AndN(iota, 7));
2003 template <
class D, HWY_IF_LANE_SIZE_D(D, 2)>
2006 const RebindToUnsigned<D> du;
2007 const Repartition<uint8_t, D> du8;
2010 const svuint8_t bytes = svld1(
FirstN(du8, (
Lanes(du) + 7) / 8), bits);
2013 const svuint8_t rep16 = svtbl_u8(bytes, ShiftRight<4>(
Iota(du8, 0)));
2016 const svuint16_t bit =
Shl(
Set(du, 1), detail::AndN(
Iota(du, 0), 7));
2021 template <
class D, HWY_IF_LANE_SIZE_D(D, 4)>
2024 const RebindToUnsigned<D> du;
2025 const Repartition<uint8_t, D> du8;
2029 const svuint8_t bytes = svld1(
FirstN(du8, 8), bits);
2032 const svuint8_t rep32 = svtbl_u8(bytes, ShiftRight<5>(
Iota(du8, 0)));
2035 const svuint32_t bit =
Shl(
Set(du, 1), detail::AndN(
Iota(du, 0), 7));
2040 template <
class D, HWY_IF_LANE_SIZE_D(D, 8)>
2043 const RebindToUnsigned<D> du;
2048 CopyBytes<4>(bits, &mask_bits);
2049 const auto vbits =
Set(du, mask_bits);
2052 const svuint64_t bit =
Shl(
Set(du, 1),
Iota(du, 0));
2062 template <
class T, HWY_IF_LANE_SIZE(T, 1)>
2064 return svdup_n_u8_z(m, 1);
2066 template <
class T, HWY_IF_LANE_SIZE(T, 2)>
2069 const svuint8_t b16 =
BitCast(d8, svdup_n_u16_z(m, 1));
2072 template <
class T, HWY_IF_LANE_SIZE(T, 4)>
2076 template <
class T, HWY_IF_LANE_SIZE(T, 8)>
2078 const ScalableTag<uint32_t> d32;
2079 const svuint32_t b64 =
BitCast(d32, svdup_n_u64_z(m, 1));
2101 svuint64_t bits_in_u64 =
2104 const size_t num_bits =
Lanes(
d);
2105 const size_t num_bytes = (num_bits + 8 - 1) / 8;
2113 const int mask = (1 << num_bits) - 1;
2114 bits[0] =
static_cast<uint8_t
>(bits[0] & mask);
2136 #if HWY_TARGET == HWY_SVE2
2138 #define HWY_SVE_MUL_EVEN(BASE, CHAR, BITS, HALF, NAME, OP) \
2139 HWY_API HWY_SVE_V(BASE, BITS) \
2140 NAME(HWY_SVE_V(BASE, HALF) a, HWY_SVE_V(BASE, HALF) b) { \
2141 return sv##OP##_##CHAR##BITS(a, b); \
2145 #undef HWY_SVE_MUL_EVEN
2149 template <
class V,
class DW = RepartitionToW
ide<DFromV<V>>>
2151 #if HWY_TARGET == HWY_SVE2
2154 const auto lo =
Mul(a, b);
2156 return BitCast(DW(), detail::InterleaveEven(lo, hi));
2161 const auto lo =
Mul(a, b);
2163 return detail::InterleaveEven(lo, hi);
2167 const auto lo =
Mul(a, b);
2169 return detail::InterleaveOdd(lo, hi);
2174 template <
size_t N,
int kPow2>
2176 svuint16_t a, svuint16_t b,
2177 const svfloat32_t sum0,
2178 svfloat32_t& sum1) {
2182 const svuint16_t zero =
Zero(du16);
2193 #if defined(__ARM_FEATURE_SVE2_AES)
2196 #ifdef HWY_NATIVE_AES
2197 #undef HWY_NATIVE_AES
2199 #define HWY_NATIVE_AES
2204 const svuint8_t zero = svdup_n_u8(0);
2205 return Xor(svaesmc_u8(svaese_u8(state, zero)), round_key);
2209 return Xor(svaese_u8(state, svdup_n_u8(0)), round_key);
2213 return svpmullb_pair(a, b);
2217 return svpmullt_pair(a, b);
2240 const svbool_t eqHL =
Eq(a, b);
2241 const svbool_t ltHL =
Lt(a, b);
2243 const svbool_t cmpLL = svtrn1_b64(ltHL, ltHL);
2244 const svbool_t outHx = svsel_b(eqHL, cmpLL, ltHL);
2245 return svtrn2_b64(outHx, outHx);
2262 #undef HWY_IF_FLOAT_V
2263 #undef HWY_IF_LANE_SIZE_V
2264 #undef HWY_IF_SIGNED_V
2265 #undef HWY_IF_UNSIGNED_V
2267 #undef HWY_SVE_FOREACH
2268 #undef HWY_SVE_FOREACH_F
2269 #undef HWY_SVE_FOREACH_F16
2270 #undef HWY_SVE_FOREACH_F32
2271 #undef HWY_SVE_FOREACH_F64
2272 #undef HWY_SVE_FOREACH_I
2273 #undef HWY_SVE_FOREACH_I08
2274 #undef HWY_SVE_FOREACH_I16
2275 #undef HWY_SVE_FOREACH_I32
2276 #undef HWY_SVE_FOREACH_I64
2277 #undef HWY_SVE_FOREACH_IF
2278 #undef HWY_SVE_FOREACH_U
2279 #undef HWY_SVE_FOREACH_U08
2280 #undef HWY_SVE_FOREACH_U16
2281 #undef HWY_SVE_FOREACH_U32
2282 #undef HWY_SVE_FOREACH_U64
2283 #undef HWY_SVE_FOREACH_UI
2284 #undef HWY_SVE_FOREACH_UI08
2285 #undef HWY_SVE_FOREACH_UI16
2286 #undef HWY_SVE_FOREACH_UI32
2287 #undef HWY_SVE_FOREACH_UI64
2288 #undef HWY_SVE_FOREACH_UIF3264
2289 #undef HWY_SVE_PTRUE
2290 #undef HWY_SVE_RETV_ARGPV
2291 #undef HWY_SVE_RETV_ARGPVN
2292 #undef HWY_SVE_RETV_ARGPVV
2293 #undef HWY_SVE_RETV_ARGV
2294 #undef HWY_SVE_RETV_ARGVN
2295 #undef HWY_SVE_RETV_ARGVV
2297 #undef HWY_SVE_UNDEFINED
#define HWY_SVE_FOREACH_UI32(X_MACRO, NAME, OP)
Definition: arm_sve-inl.h:100
#define HWY_SVE_FOREACH_U64(X_MACRO, NAME, OP)
Definition: arm_sve-inl.h:56
#define HWY_SVE_FOREACH_F32(X_MACRO, NAME, OP)
Definition: arm_sve-inl.h:68
#define HWY_SVE_COUNT_TRUE(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:696
#define HWY_SVE_SPLICE(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:1222
#define HWY_SVE_SCATTER_INDEX(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:940
#define HWY_SVE_CONCAT_EVERY_SECOND(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:1211
#define HWY_SVE_FOREACH_U08(X_MACRO, NAME, OP)
Definition: arm_sve-inl.h:52
#define HWY_SVE_IOTA(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:1309
#define HWY_SVE_CAST(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:323
#define HWY_SVE_FMA(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:638
#define HWY_SVE_CAST_NOP(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:312
#define HWY_SVE_FOREACH(X_MACRO, NAME, OP)
Definition: arm_sve-inl.h:123
#define HWY_SVE_FIRSTN(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:230
#define HWY_SVE_REVERSE(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:1525
#define HWY_SVE_LOAD_DUP128(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:861
#define HWY_SVE_RETV_ARGPVN_MASK(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:493
#define HWY_SVE_GATHER_OFFSET(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:955
#define HWY_SVE_SET(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:270
#define HWY_SVE_MASKED_LOAD(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:853
#define HWY_SVE_FOREACH_UI(X_MACRO, NAME, OP)
Definition: arm_sve-inl.h:115
#define HWY_SVE_FOREACH_UIF3264(X_MACRO, NAME, OP)
Definition: arm_sve-inl.h:108
#define HWY_SVE_RETV_ARGV(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:152
#define HWY_SVE_FOREACH_I08(X_MACRO, NAME, OP)
Definition: arm_sve-inl.h:60
#define HWY_SVE_UNDEFINED(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:298
#define HWY_SVE_PTRUE(BITS)
Definition: arm_sve-inl.h:242
#define HWY_SVE_EXT(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:1341
#define HWY_SVE_STORE4(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:996
#define HWY_SVE_STORE3(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:981
#define HWY_SVE_COUNT_TRUE_FULL(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:708
#define HWY_SVE_GATHER_INDEX(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:964
#define HWY_SVE_RETV_ARGVN(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:163
#define HWY_SVE_FOREACH_F(X_MACRO, NAME, OP)
Definition: arm_sve-inl.h:86
#define HWY_SVE_RETV_ARGPVN(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:158
#define HWY_SVE_STORE(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:870
#define HWY_SVE_TABLE(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:1489
#define HWY_SVE_RETV_ARGVV(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:175
#define HWY_SVE_SHIFT(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:567
#define HWY_SVE_FOREACH_UI08(X_MACRO, NAME, OP)
Definition: arm_sve-inl.h:92
#define HWY_SVE_PROMOTE_TO(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:1015
#define HWY_SVE_BLENDED_STORE(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:878
#define HWY_SVE_COMPRESS(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:1591
#define HWY_SVE_LOAD(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:845
#define HWY_SVE_COMPARE_N(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:768
#define HWY_SVE_POPCNT(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:449
#define HWY_SVE_FOREACH_I(X_MACRO, NAME, OP)
Definition: arm_sve-inl.h:80
#define HWY_SPECIALIZE(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:135
#define HWY_SVE_CONVERT(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:1282
#define HWY_SVE_IF_THEN_ELSE(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:740
#define HWY_SVE_RETV_ARGPVN_SWAP(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:408
#define HWY_SVE_FOREACH_IF(X_MACRO, NAME, OP)
Definition: arm_sve-inl.h:119
#define HWY_SVE_FOREACH_U16(X_MACRO, NAME, OP)
Definition: arm_sve-inl.h:53
#define HWY_SVE_FOREACH_U(X_MACRO, NAME, OP)
Definition: arm_sve-inl.h:74
#define HWY_SVE_COMPARE(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:764
#define HWY_SVE_WRAP_PTRUE(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:244
#define HWY_SVE_MUL_EVEN(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:2138
#define HWY_SVE_FOREACH_UI64(X_MACRO, NAME, OP)
Definition: arm_sve-inl.h:104
#define HWY_SVE_SCATTER_OFFSET(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:930
#define HWY_SVE_SHIFT_N(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:535
#define HWY_SVE_REDUCE(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:1911
#define HWY_SVE_FOREACH_UI16(X_MACRO, NAME, OP)
Definition: arm_sve-inl.h:96
#define HWY_SVE_RETV_ARGPVV(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:170
#define HWY_SVE_RETV_ARGPV(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:148
#define HWY_SVE_RETV_ARGPVV_SWAP(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:418
#define HWY_SVE_GET_LANE(BASE, CHAR, BITS, HALF, NAME, OP)
Definition: arm_sve-inl.h:1412
#define HWY_RESTRICT
Definition: base.h:63
#define HWY_API
Definition: base.h:122
#define HWY_MIN(a, b)
Definition: base.h:127
#define HWY_INLINE
Definition: base.h:64
#define HWY_DASSERT(condition)
Definition: base.h:193
HWY_INLINE svuint8_t BoolFromMask(svbool_t m)
Definition: arm_sve-inl.h:2063
HWY_INLINE Mask128< T, N > Or(hwy::SizeTag< 1 >, const Mask128< T, N > a, const Mask128< T, N > b)
Definition: x86_128-inl.h:899
HWY_INLINE size_t AllHardwareLanes(hwy::SizeTag< 1 >)
Definition: arm_sve-inl.h:186
HWY_INLINE svuint64_t BitsFromBool(svuint8_t x)
Definition: arm_sve-inl.h:2084
HWY_INLINE V OffsetsOf128BitBlocks(const D d, const V iota0)
Definition: arm_sve-inl.h:1664
svbool_t MakeMask(D d)
Definition: arm_sve-inl.h:260
constexpr size_t LanesPerBlock(Simd< T, N, kPow2 >)
Definition: arm_sve-inl.h:1503
VI SaturateI(VI v)
Definition: arm_sve-inl.h:1113
svbool_t FirstNPerBlock(D d)
Definition: arm_sve-inl.h:1670
svbool_t MaskUpperHalf(D d)
Definition: arm_sve-inl.h:1334
VU SaturateU(VU v)
Definition: arm_sve-inl.h:1107
HWY_SVE_FOREACH(HWY_SVE_WRAP_PTRUE, PTrue, ptrue) HWY_API svbool_t PFalse()
Definition: arm_sve-inl.h:250
svbool_t MaskLowerHalf(D d)
Definition: arm_sve-inl.h:1330
HWY_INLINE size_t HardwareLanes(hwy::SizeTag< 1 >)
Definition: arm_sve-inl.h:200
HWY_API svfloat32_t PromoteUpperTo(Simd< float, N, kPow2 > df, svfloat16_t v)
Definition: arm_sve-inl.h:1093
constexpr size_t ScaleByPower(size_t N, int pow2)
Definition: ops/shared-inl.h:115
HWY_INLINE Vec128< uint8_t, N > BitCastFromByte(Simd< uint8_t, N, 0 >, Vec128< uint8_t, N > v)
Definition: arm_neon-inl.h:601
HWY_INLINE Vec128< uint8_t, N > BitCastToByte(Vec128< uint8_t, N > v)
Definition: arm_neon-inl.h:574
HWY_INLINE Mask128< T, N > AndNot(hwy::SizeTag< 1 >, const Mask128< T, N > a, const Mask128< T, N > b)
Definition: x86_128-inl.h:862
d
Definition: rvv-inl.h:1656
HWY_API Vec128< T, N > CopySign(const Vec128< T, N > magn, const Vec128< T, N > sign)
Definition: arm_neon-inl.h:1648
HWY_API Vec128< T, N > OddEvenBlocks(Vec128< T, N >, Vec128< T, N > even)
Definition: arm_neon-inl.h:4038
HWY_API Vec< D > SignBit(D d)
Definition: generic_ops-inl.h:61
HWY_API Vec128< uint8_t > Combine(Full128< uint8_t >, Vec64< uint8_t > hi, Vec64< uint8_t > lo)
Definition: arm_neon-inl.h:3709
HWY_API Mask128< TTo, N > RebindMask(Simd< TTo, N, 0 > dto, Mask128< TFrom, N > m)
Definition: arm_neon-inl.h:1688
HWY_API Vec128< T, N > DupOdd(Vec128< T, N > v)
Definition: arm_neon-inl.h:4003
HWY_API Vec128< float > ApproximateReciprocal(const Vec128< float > v)
Definition: arm_neon-inl.h:1225
HWY_API VFromD< DW > ZipLower(V a, V b)
Definition: arm_neon-inl.h:3672
HWY_API uint8_t GetLane(const Vec128< uint8_t, 16 > v)
Definition: arm_neon-inl.h:767
HWY_API Vec128< T, N > PopulationCount(Vec128< T, N > v)
Definition: arm_neon-inl.h:1595
HWY_API Vec128< T > Shuffle1032(const Vec128< T > v)
Definition: arm_neon-inl.h:3531
HWY_API auto Lt(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:5252
HWY_API Vec128< uint64_t > InterleaveLower(const Vec128< uint64_t > a, const Vec128< uint64_t > b)
Definition: arm_neon-inl.h:3581
HWY_API Vec128< T > Shuffle2103(const Vec128< T > v)
Definition: arm_neon-inl.h:3547
HWY_API Vec128< float, N > Round(const Vec128< float, N > v)
Definition: arm_neon-inl.h:2878
HWY_API Vec128< T, N > ZeroExtendVector(Simd< T, N, 0 > d, Vec128< T, N/2 > lo)
Definition: arm_neon-inl.h:3769
HWY_API auto Eq(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:5244
HWY_API V128 CombineShiftRightBytes(Full128< T > d, V128 hi, V128 lo)
Definition: arm_neon-inl.h:2999
HWY_API auto Gt(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:5257
HWY_API Vec128< T, N > ShiftLeftLanes(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition: arm_neon-inl.h:3102
_
Definition: rvv-inl.h:1405
HWY_API Mask128< T, N > FirstN(const Simd< T, N, 0 > d, size_t num)
Definition: arm_neon-inl.h:1896
HWY_API size_t StoreMaskBits(Simd< T, N, 0 >, const Mask128< T, N > mask, uint8_t *bits)
Definition: arm_neon-inl.h:4761
HWY_API Vec128< float, N > MulAdd(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > add)
Definition: arm_neon-inl.h:1290
HWY_API void Stream(const Vec128< T, N > v, Simd< T, N, 0 > d, T *HWY_RESTRICT aligned)
Definition: arm_neon-inl.h:2416
HWY_API Vec128< int8_t > Abs(const Vec128< int8_t > v)
Definition: arm_neon-inl.h:1604
Repartition< MakeWide< TFromD< D > >, D > RepartitionToWide
Definition: ops/shared-inl.h:210
HWY_API bool AllTrue(const Simd< T, N, 0 > d, const Mask128< T, N > m)
Definition: arm_neon-inl.h:4790
HWY_API Vec128< T, N > SumOfLanes(Simd< T, N, 0 >, const Vec128< T, N > v)
Definition: arm_neon-inl.h:4437
HWY_API Vec128< T, N > BroadcastSignBit(const Vec128< T, N > v)
Definition: arm_neon-inl.h:1665
V Shl(V a, V b)
Definition: arm_neon-inl.h:5235
HWY_API auto Ge(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:5261
HWY_API Vec128< float > AbsDiff(const Vec128< float > a, const Vec128< float > b)
Definition: arm_neon-inl.h:1264
StoreInterleaved3
Definition: rvv-inl.h:1405
HWY_API Vec128< uint32_t > ConcatOdd(Full128< uint32_t >, Vec128< uint32_t > hi, Vec128< uint32_t > lo)
Definition: arm_neon-inl.h:3903
HWY_API Vec128< uint64_t, N > Min(const Vec128< uint64_t, N > a, const Vec128< uint64_t, N > b)
Definition: arm_neon-inl.h:1957
HWY_API Vec256< uint64_t > CLMulLower(Vec256< uint64_t > a, Vec256< uint64_t > b)
Definition: x86_256-inl.h:3972
HWY_API Vec128< uint64_t, N > Max(const Vec128< uint64_t, N > a, const Vec128< uint64_t, N > b)
Definition: arm_neon-inl.h:1995
HWY_API Mask128< T, N > MaskFromVec(const Vec128< T, N > v)
Definition: arm_neon-inl.h:1675
HWY_API Vec128< T, N > ConcatUpperUpper(const Simd< T, N, 0 > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:3842
Rebind< MakeUnsigned< TFromD< D > >, D > RebindToUnsigned
Definition: ops/shared-inl.h:201
HWY_API Vec128< T, N > GatherIndex(const Simd< T, N, 0 > d, const T *HWY_RESTRICT base, const Vec128< Index, N > index)
Definition: arm_neon-inl.h:4284
HWY_INLINE Vec128< uint64_t > MulOdd(Vec128< uint64_t > a, Vec128< uint64_t > b)
Definition: arm_neon-inl.h:4159
HWY_API Vec1< uint8_t > SaturatedAdd(const Vec1< uint8_t > a, const Vec1< uint8_t > b)
Definition: scalar-inl.h:457
sseg3 sseg3 StoreInterleaved4
Definition: rvv-inl.h:1428
HWY_API Vec128< T > Shuffle0321(const Vec128< T > v)
Definition: arm_neon-inl.h:3541
HWY_API Vec128< uint32_t > ConcatEven(Full128< uint32_t >, Vec128< uint32_t > hi, Vec128< uint32_t > lo)
Definition: arm_neon-inl.h:3945
HWY_API Vec128< T, N > Load(Simd< T, N, 0 > d, const T *HWY_RESTRICT p)
Definition: arm_neon-inl.h:2205
HWY_API Vec128< T, N > ConcatLowerUpper(const Simd< T, N, 0 > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:3869
HWY_API Vec128< int64_t > Neg(const Vec128< int64_t > v)
Definition: arm_neon-inl.h:904
HWY_API Vec128< T, N > Zero(Simd< T, N, 0 > d)
Definition: arm_neon-inl.h:733
HWY_API Vec256< uint8_t > AESRound(Vec256< uint8_t > state, Vec256< uint8_t > round_key)
Definition: x86_256-inl.h:3947
HWY_API Vec128< int64_t > MulEven(Vec128< int32_t > a, Vec128< int32_t > b)
Definition: arm_neon-inl.h:4119
HWY_API size_t Lanes(Simd< T, N, kPow2 > d)
Definition: arm_sve-inl.h:218
HWY_API Vec128< bfloat16_t, 2 *N > ReorderDemote2To(Simd< bfloat16_t, 2 *N, 0 > dbf16, Vec128< float, N > a, Vec128< float, N > b)
Definition: arm_neon-inl.h:4060
HWY_API Vec128< T, N > MaskedLoad(Mask128< T, N > m, Simd< T, N, 0 > d, const T *HWY_RESTRICT aligned)
Definition: arm_neon-inl.h:2210
HWY_API Vec128< T, N/2 > LowerHalf(const Vec128< T, N > v)
Definition: arm_neon-inl.h:2952
typename D::Twice Twice
Definition: ops/shared-inl.h:220
Rebind< MakeSigned< TFromD< D > >, D > RebindToSigned
Definition: ops/shared-inl.h:199
HWY_API Vec32< uint8_t > U8FromU32(const Vec128< uint32_t > v)
Definition: arm_neon-inl.h:2748
HWY_API Vec128< float, N > ReorderWidenMulAccumulate(Simd< float, N, 0 > df32, Vec128< bfloat16_t, 2 *N > a, Vec128< bfloat16_t, 2 *N > b, const Vec128< float, N > sum0, Vec128< float, N > &sum1)
Definition: arm_neon-inl.h:3688
HWY_API Vec128< T, N > IfVecThenElse(Vec128< T, N > mask, Vec128< T, N > yes, Vec128< T, N > no)
Definition: arm_neon-inl.h:1505
HWY_API svuint64_t MulEven(const svuint64_t a, const svuint64_t b)
Definition: arm_sve-inl.h:2160
HWY_API void BlendedStore(Vec128< T, N > v, Mask128< T, N > m, Simd< T, N, 0 > d, T *HWY_RESTRICT p)
Definition: arm_neon-inl.h:2402
HWY_API size_t CountTrue(Full128< T >, const Mask128< T > mask)
Definition: arm_neon-inl.h:4742
HWY_API Vec128< T, N > VecFromMask(Simd< T, N, 0 > d, const Mask128< T, N > v)
Definition: arm_neon-inl.h:1681
HWY_API intptr_t FindFirstTrue(const Simd< T, N, 0 >, const Mask128< T, N > mask)
Definition: arm_neon-inl.h:4753
HWY_API Vec128< float > ConvertTo(Full128< float >, const Vec128< int32_t > v)
Definition: arm_neon-inl.h:2788
HWY_API Vec128< T, N > DupEven(Vec128< T, N > v)
Definition: arm_neon-inl.h:3987
HWY_API Vec128< T, N > IfThenElseZero(const Mask128< T, N > mask, const Vec128< T, N > yes)
Definition: arm_neon-inl.h:1711
HWY_API V Add(V a, V b)
Definition: arm_neon-inl.h:5217
HWY_API bool AllFalse(const Full128< T > d, const Mask128< T > m)
Definition: arm_neon-inl.h:4771
HWY_API Vec128< T, N > TableLookupLanes(Vec128< T, N > v, Indices128< T, N > idx)
Definition: arm_neon-inl.h:3419
HWY_API Vec128< T, N > Reverse4(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition: arm_neon-inl.h:3490
HWY_API Vec128< float, N > Floor(const Vec128< float, N > v)
Definition: arm_neon-inl.h:2909
HWY_API VFromD< D > ConcatEven(D d, VFromD< D > hi, VFromD< D > lo)
Definition: arm_sve-inl.h:1244
HWY_API Vec128< float, N > MulSub(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > sub)
Definition: arm_neon-inl.h:1344
HWY_API Vec128< T, N > CopySignToAbs(const Vec128< T, N > abs, const Vec128< T, N > sign)
Definition: arm_neon-inl.h:1656
HWY_API Vec128< T, N > ZeroIfNegative(Vec128< T, N > v)
Definition: arm_neon-inl.h:1735
HWY_API void StoreU(const Vec128< uint8_t > v, Full128< uint8_t >, uint8_t *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:2224
HWY_API Vec128< float, N > Ceil(const Vec128< float, N > v)
Definition: arm_neon-inl.h:2895
Repartition< MakeNarrow< TFromD< D > >, D > RepartitionToNarrow
Definition: ops/shared-inl.h:212
HWY_API Indices128< T, N > IndicesFromVec(Simd< T, N, 0 > d, Vec128< TI, N > vec)
Definition: arm_neon-inl.h:3373
HWY_API Vec128< T, N > SwapAdjacentBlocks(Vec128< T, N > v)
Definition: arm_neon-inl.h:4045
HWY_API Vec128< T, N > ShiftLeftBytes(Simd< T, N, 0 >, Vec128< T, N > v)
Definition: arm_neon-inl.h:3091
HWY_INLINE VFromD< D > Min128(D d, const VFromD< D > a, const VFromD< D > b)
Definition: arm_neon-inl.h:5203
HWY_API Vec128< T, N > Reverse2(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition: arm_neon-inl.h:3461
svuint16_t Set(Simd< bfloat16_t, N, kPow2 > d, bfloat16_t arg)
Definition: arm_sve-inl.h:282
HWY_API Vec128< T, N > Reverse8(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition: arm_neon-inl.h:3513
HWY_API Vec128< T, N > MaxOfLanes(Simd< T, N, 0 >, const Vec128< T, N > v)
Definition: arm_neon-inl.h:4445
Vec128< T, N > Iota(const Simd< T, N, 0 > d, const T2 first)
Definition: arm_neon-inl.h:757
HWY_API Vec1< uint8_t > AverageRound(const Vec1< uint8_t > a, const Vec1< uint8_t > b)
Definition: scalar-inl.h:510
HWY_API Vec1< T > ShiftRight(const Vec1< T > v)
Definition: scalar-inl.h:345
HWY_API Mask128< T, N > LoadMaskBits(Simd< T, N, 0 > d, const uint8_t *HWY_RESTRICT bits)
Definition: arm_neon-inl.h:4510
HWY_API Vec256< uint64_t > CLMulUpper(Vec256< uint64_t > a, Vec256< uint64_t > b)
Definition: x86_256-inl.h:3983
HWY_API Vec128< T > Shuffle01(const Vec128< T > v)
Definition: arm_neon-inl.h:3535
HWY_API Mask128< uint64_t, N > TestBit(Vec128< uint64_t, N > v, Vec128< uint64_t, N > bit)
Definition: arm_neon-inl.h:1917
HWY_API Vec256< uint8_t > AESLastRound(Vec256< uint8_t > state, Vec256< uint8_t > round_key)
Definition: x86_256-inl.h:3959
HWY_API Vec128< uint8_t > LoadU(Full128< uint8_t >, const uint8_t *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:2031
HWY_API Vec128< int16_t > MulHigh(const Vec128< int16_t > a, const Vec128< int16_t > b)
Definition: arm_neon-inl.h:1175
HWY_API Vec1< uint8_t > SaturatedSub(const Vec1< uint8_t > a, const Vec1< uint8_t > b)
Definition: scalar-inl.h:484
HWY_INLINE Mask128< T, N > Lt128(Simd< T, N, 0 > d, Vec128< T, N > a, Vec128< T, N > b)
Definition: arm_neon-inl.h:5172
HWY_API Vec128< T, N > OrAnd(Vec128< T, N > o, Vec128< T, N > a1, Vec128< T, N > a2)
Definition: arm_neon-inl.h:1498
HWY_API Vec128< T, N > IfNegativeThenElse(Vec128< T, N > v, Vec128< T, N > yes, Vec128< T, N > no)
Definition: arm_neon-inl.h:1724
HWY_API Vec128< T, N > ConcatUpperLower(Simd< T, N, 0 > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:3895
HWY_API Vec128< T, N > And(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1440
typename detail::ScalableTagChecker< T, kPow2 >::type ScalableTag
Definition: ops/shared-inl.h:162
HWY_API Vec128< T, N > BitCast(Simd< T, N, 0 > d, Vec128< FromT, N *sizeof(T)/sizeof(FromT)> v)
Definition: arm_neon-inl.h:710
HWY_API Vec128< int16_t > MulFixedPoint15(Vec128< int16_t > a, Vec128< int16_t > b)
Definition: arm_neon-inl.h:1211
HWY_API void ScatterOffset(Vec128< T, N > v, Simd< T, N, 0 > d, T *HWY_RESTRICT base, const Vec128< Offset, N > offset)
Definition: arm_neon-inl.h:4231
HWY_API VFromD< DW > ZipUpper(DW dw, V a, V b)
Definition: arm_neon-inl.h:3681
HWY_API Vec128< T, N > ConcatLowerLower(const Simd< T, N, 0 > d, Vec128< T, N > hi, Vec128< T, N > lo)
Definition: arm_neon-inl.h:3777
HWY_API V Sub(V a, V b)
Definition: arm_neon-inl.h:5221
typename D::template Rebind< T > Rebind
Definition: ops/shared-inl.h:196
HWY_API Vec128< T, N > Undefined(Simd< T, N, 0 >)
Definition: arm_neon-inl.h:747
HWY_API size_t CompressBitsStore(Vec128< T, N > v, const uint8_t *HWY_RESTRICT bits, Simd< T, N, 0 > d, T *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:5077
HWY_API V InterleaveUpper(Simd< T, N, 0 >, V a, V b)
Definition: arm_neon-inl.h:3656
HWY_API Vec128< T, N > GatherOffset(const Simd< T, N, 0 > d, const T *HWY_RESTRICT base, const Vec128< Offset, N > offset)
Definition: arm_neon-inl.h:4267
HWY_API size_t CompressBlendedStore(Vec128< T, N > v, Mask128< T, N > m, Simd< T, N, 0 > d, T *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:5061
HWY_API Vec128< T, N > IfThenZeroElse(const Mask128< T, N > mask, const Vec128< T, N > no)
Definition: arm_neon-inl.h:1718
HWY_API Vec128< T, N > Xor(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1489
HWY_INLINE VFromD< D > Max128(D d, const VFromD< D > a, const VFromD< D > b)
Definition: arm_neon-inl.h:5208
HWY_API Vec1< T > ShiftLeft(const Vec1< T > v)
Definition: scalar-inl.h:339
HWY_API auto Le(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:5266
decltype(detail::DeduceD()(V())) DFromV
Definition: arm_neon-inl.h:555
HWY_API Vec128< int32_t, N > NearestInt(const Vec128< float, N > v)
Definition: arm_neon-inl.h:2939
HWY_API Indices128< T, N > SetTableIndices(Simd< T, N, 0 > d, const TI *idx)
Definition: arm_neon-inl.h:3413
HWY_API void ScatterIndex(Vec128< T, N > v, Simd< T, N, 0 > d, T *HWY_RESTRICT base, const Vec128< Index, N > index)
Definition: arm_neon-inl.h:4249
HWY_API Vec128< T > Not(const Vec128< T > v)
Definition: arm_neon-inl.h:1422
HWY_API Vec128< float, N > NegMulAdd(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > add)
Definition: arm_neon-inl.h:1323
HWY_API Vec64< uint32_t > Shuffle2301(const Vec64< uint32_t > v)
Definition: arm_neon-inl.h:1778
V Shr(V a, V b)
Definition: arm_neon-inl.h:5239
decltype(Zero(D())) VFromD
Definition: arm_neon-inl.h:743
HWY_API VFromD< DW > ZipLower(const V a, const V b)
Definition: arm_sve-inl.h:1897
HWY_API Vec128< T, N > LoadDup128(Simd< T, N, 0 > d, const T *const HWY_RESTRICT p)
Definition: arm_neon-inl.h:2217
HWY_API Vec128< T, N > OddEven(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:4019
HWY_API Vec128< uint16_t > Broadcast(const Vec128< uint16_t > v)
Definition: arm_neon-inl.h:3285
HWY_INLINE Vec128< T, N > CompressBits(Vec128< T, N > v, const uint8_t *HWY_RESTRICT bits)
Definition: arm_neon-inl.h:5038
HWY_API Vec128< T > Shuffle0123(const Vec128< T > v)
Definition: arm_neon-inl.h:3553
HWY_API Vec128< float, N > Trunc(const Vec128< float, N > v)
Definition: arm_neon-inl.h:2867
typename D::Half Half
Definition: ops/shared-inl.h:216
HWY_API Vec128< T, N > MinOfLanes(Simd< T, N, 0 >, const Vec128< T, N > v)
Definition: arm_neon-inl.h:4441
HWY_API Vec128< T, N > ShiftRightBytes(Simd< T, N, 0 >, Vec128< T, N > v)
Definition: arm_neon-inl.h:3114
typename D::template Repartition< T > Repartition
Definition: ops/shared-inl.h:207
HWY_API auto Ne(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:5248
N
Definition: rvv-inl.h:1656
HWY_API Vec128< float, N > Sqrt(const Vec128< float, N > v)
Definition: arm_neon-inl.h:1404
HWY_API Vec64< uint16_t > DemoteTo(Full64< uint16_t >, const Vec128< int32_t > v)
Definition: arm_neon-inl.h:2606
HWY_API Vec128< TI > TableLookupBytes(const Vec128< T > bytes, const Vec128< TI > from)
Definition: arm_neon-inl.h:4169
HWY_API size_t CompressStore(Vec128< T, N > v, const Mask128< T, N > mask, Simd< T, N, 0 > d, T *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:5052
HWY_API Vec128< uint32_t, N > RotateRight(const Vec128< uint32_t, N > v)
Definition: arm_neon-inl.h:935
HWY_API Vec128< T, N > AndNot(const Vec128< T, N > not_mask, const Vec128< T, N > mask)
Definition: arm_neon-inl.h:1455
HWY_API Vec128< T > ReverseBlocks(Full128< T >, const Vec128< T > v)
Definition: arm_neon-inl.h:4053
HWY_API V Div(V a, V b)
Definition: arm_neon-inl.h:5230
HWY_API Vec128< uint64_t > SumsOf8(const Vec128< uint8_t > v)
Definition: arm_neon-inl.h:852
HWY_API V Mul(V a, V b)
Definition: arm_neon-inl.h:5226
HWY_API Vec128< T, 1 > Reverse(Simd< T, 1, 0 >, const Vec128< T, 1 > v)
Definition: arm_neon-inl.h:3430
HWY_API void Store(Vec128< T, N > v, Simd< T, N, 0 > d, T *HWY_RESTRICT aligned)
Definition: arm_neon-inl.h:2397
HWY_API Vec128< uint16_t > PromoteTo(Full128< uint16_t >, const Vec64< uint8_t > v)
Definition: arm_neon-inl.h:2426
TFromD< DFromV< V > > TFromV
Definition: arm_neon-inl.h:558
HWY_API Vec64< uint8_t > UpperHalf(Full64< uint8_t >, const Vec128< uint8_t > v)
Definition: arm_neon-inl.h:3146
HWY_API Vec128< float > ApproximateReciprocalSqrt(const Vec128< float > v)
Definition: arm_neon-inl.h:1376
const vfloat64m1_t v
Definition: rvv-inl.h:1656
HWY_API Vec128< T, N > Compress(Vec128< T, N > v, const Mask128< T, N > mask)
Definition: arm_neon-inl.h:5031
HWY_API Vec128< T, N > ShiftRightLanes(Simd< T, N, 0 > d, const Vec128< T, N > v)
Definition: arm_neon-inl.h:3120
HWY_API VFromD< D > ConcatOdd(D d, VFromD< D > hi, VFromD< D > lo)
Definition: arm_sve-inl.h:1233
typename D::T TFromD
Definition: ops/shared-inl.h:192
HWY_API VI TableLookupBytesOr0(const V bytes, const VI from)
Definition: arm_neon-inl.h:4224
HWY_API Vec128< T, N > Or(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1477
HWY_API Vec128< float, N > NegMulSub(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > sub)
Definition: arm_neon-inl.h:1352
HWY_API Vec1< T > IfThenElse(const Mask1< T > mask, const Vec1< T > yes, const Vec1< T > no)
Definition: scalar-inl.h:278
Definition: aligned_allocator.h:27
constexpr HWY_API bool IsSame()
Definition: base.h:286
constexpr size_t CeilLog2(TI x)
Definition: base.h:700
constexpr HWY_API bool IsSigned()
Definition: base.h:483
typename detail::Relations< T >::Unsigned MakeUnsigned
Definition: base.h:452
#define HWY_NAMESPACE
Definition: set_macros-inl.h:80
@ value
Definition: arm_neon-inl.h:4798
Definition: arm_sve-inl.h:32
Definition: ops/shared-inl.h:40
uint16_t bits
Definition: base.h:254